
adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64
patch by (Aurelien Jacobs <aurel at gnuage dot org>)

Originally committed as revision 3578 to svn://svn.ffmpeg.org/ffmpeg/trunk
Aurelien Jacobs 2004-10-11 02:19:29 +00:00 committed by Michael Niedermayer
parent 3ba1438dec
commit 053dea12f2
15 changed files with 1129 additions and 1060 deletions

configure

@@ -106,6 +106,14 @@ case "$cpu" in
   i386|i486|i586|i686|i86pc|BePC)
     cpu="x86"
   ;;
+  x86_64)
+    if [ "`$cc -dumpmachine | grep x86_64 | cut -d- -f1`" = "x86_64" -a \
+         -z "`echo $CFLAGS | grep -- -m32`" ]; then
+      cpu="x86_64"
+    else
+      cpu="x86"
+    fi
+  ;;
   # armv4l is a subset of armv5tel
   armv4l|armv5tel)
     cpu="armv4l"
@@ -500,7 +508,7 @@ fi
 # compute mmx state
 if test $mmx = "default"; then
-    if test $cpu = "x86"; then
+    if test $cpu = "x86" -o $cpu = "x86_64"; then
         mmx="yes"
     else
         mmx="no"
@@ -827,6 +835,7 @@ done
 # test gcc version to see if vector builtins can be used
 # currently only used on i386 for MMX builtins
 cat > $TMPC << EOF
+#include <xmmintrin.h>
 int main(void) {
 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 2)
 return 0;
@@ -985,7 +994,7 @@ echo "CPU                 $cpu ($tune)"
 echo "Big Endian          $bigendian"
 echo "inttypes.h          $inttypes"
 echo "broken inttypes.h   $emu_fast_int"
-if test $cpu = "x86"; then
+if test $cpu = "x86" -o $cpu = "x86_64"; then
     echo "MMX enabled         $mmx"
     echo "Vector Builtins     $builtin_vector"
 fi
@@ -1074,6 +1083,9 @@ echo "TARGET_OS=$TARGET_OS" >> config.mak
 if test "$cpu" = "x86" ; then
     echo "TARGET_ARCH_X86=yes" >> config.mak
     echo "#define ARCH_X86 1" >> $TMPH
+elif test "$cpu" = "x86_64" ; then
+    echo "TARGET_ARCH_X86_64=yes" >> config.mak
+    echo "#define ARCH_X86_64 1" >> $TMPH
 elif test "$cpu" = "armv4l" ; then
     echo "TARGET_ARCH_ARMV4L=yes" >> config.mak
     echo "#define ARCH_ARMV4L 1" >> $TMPH


@@ -10,17 +10,23 @@
 #include <byteswap.h>
 #else
-#ifdef ARCH_X86
-static inline unsigned short ByteSwap16(unsigned short x)
+#ifdef ARCH_X86_64
+#  define LEGACY_REGS "=Q"
+#else
+#  define LEGACY_REGS "=q"
+#endif
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+static inline uint16_t ByteSwap16(uint16_t x)
 {
   __asm("xchgb %b0,%h0" :
-        "=q" (x) :
+        LEGACY_REGS (x) :
         "0" (x));
   return x;
 }
 #define bswap_16(x) ByteSwap16(x)
-static inline unsigned int ByteSwap32(unsigned int x)
+static inline uint32_t ByteSwap32(uint32_t x)
 {
 #if __CPU__ > 386
   __asm("bswap %0":
@@ -29,21 +35,28 @@ static inline unsigned int ByteSwap32(unsigned int x)
   __asm("xchgb %b0,%h0\n"
        " rorl $16,%0\n"
        " xchgb %b0,%h0":
-        "=q" (x) :
+        LEGACY_REGS (x) :
 #endif
         "0" (x));
   return x;
 }
 #define bswap_32(x) ByteSwap32(x)
-static inline unsigned long long int ByteSwap64(unsigned long long int x)
+static inline uint64_t ByteSwap64(uint64_t x)
 {
+#ifdef ARCH_X86_64
+  __asm("bswap %0":
+        "=r" (x) :
+        "0" (x));
+  return x;
+#else
   register union { __extension__ uint64_t __ll;
                    uint32_t __l[2]; } __x;
   asm("xchgl %0,%1":
       "=r"(__x.__l[0]),"=r"(__x.__l[1]):
-      "0"(bswap_32((unsigned long)x)),"1"(bswap_32((unsigned long)(x>>32))));
+      "0"(bswap_32((uint32_t)x)),"1"(bswap_32((uint32_t)(x>>32))));
   return __x.__ll;
+#endif
 }
 #define bswap_64(x) ByteSwap64(x)
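The interesting part of the bswap change is ByteSwap64: on x86_64 a single bswap on a 64-bit register reverses all eight bytes, while the 32-bit build keeps the old strategy of swapping each half and exchanging the halves. A freestanding sketch of the two paths, both assuming an x86 target like the header above (the compiler's __x86_64__ macro stands in for the ARCH_X86_64 define):

#include <stdint.h>

static inline uint64_t bswap64_sketch(uint64_t x)
{
#if defined(__x86_64__)
    /* one instruction reverses all eight bytes */
    __asm__("bswap %0" : "=r"(x) : "0"(x));
    return x;
#else
    /* 32-bit fallback: byte-swap each half, then swap the halves */
    uint32_t lo = (uint32_t)x;
    uint32_t hi = (uint32_t)(x >> 32);
    __asm__("bswap %0" : "=r"(lo) : "0"(lo));
    __asm__("bswap %0" : "=r"(hi) : "0"(hi));
    return ((uint64_t)lo << 32) | hi;
#endif
}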


@@ -254,7 +254,7 @@ inline void dprintf(const char* fmt,...) {}
 extern const uint32_t inverse[256];
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 #    define FASTDIV(a,b) \
     ({\
         int ret,dmy;\
@@ -271,7 +271,7 @@ extern const uint32_t inverse[256];
 #    define FASTDIV(a,b)   ((a)/(b))
 #endif
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 // avoid +32 for shift optimization (gcc should do that ...)
 static inline int32_t NEG_SSR32( int32_t a, int8_t s){
     asm ("sarl %1, %0\n\t"
@@ -390,7 +390,7 @@ typedef struct RL_VLC_ELEM {
 #endif
 /* used to avoid missaligned exceptions on some archs (alpha, ...) */
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 #    define unaligned32(a) (*(const uint32_t*)(a))
 #else
 #    ifdef __GNUC__
@@ -460,7 +460,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 static inline void put_bits(PutBitContext *s, int n, unsigned int value)
 {
 #    ifdef ALIGNED_BITSTREAM_WRITER
-#        ifdef ARCH_X86
+#        if defined(ARCH_X86) || defined(ARCH_X86_64)
     asm volatile(
         "movl %0, %%ecx                 \n\t"
         "xorl %%eax, %%eax              \n\t"
@@ -491,7 +491,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value)
     s->index= index;
 #        endif
 #    else //ALIGNED_BITSTREAM_WRITER
-#        ifdef ARCH_X86
+#        if defined(ARCH_X86) || defined(ARCH_X86_64)
     asm volatile(
         "movl $7, %%ecx                 \n\t"
         "andl %0, %%ecx                 \n\t"
@@ -738,7 +738,7 @@ static inline int get_bits_count(GetBitContext *s){
         name##_bit_count-= 32;\
     }\
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 #    define SKIP_CACHE(name, gb, num)\
         asm(\
             "shldl %2, %1, %0          \n\t"\
@@ -1218,7 +1218,7 @@ static inline int ff_get_fourcc(const char *s){
 #define MKBETAG(a,b,c,d) (d | (c << 8) | (b << 16) | (a << 24))
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 #define MASK_ABS(mask, level)\
             asm volatile(\
                 "cdq                    \n\t"\
@@ -1252,7 +1252,7 @@ if((y)<(x)){\
 }
 #endif
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static inline long long rdtsc(void)
 {
     long long l;
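common.h only needs its preprocessor guards widened: every #ifdef ARCH_X86 around inline asm that is equally valid in 64-bit mode becomes a test for either define. As a standalone sketch of the pattern, here is the NEG_SSR32 idiom with its portable fallback (shifting by -s is taken modulo 32 by the hardware, so it equals shifting by 32-s; ARCH_X86/ARCH_X86_64 are the macros configure now emits):

#include <stdint.h>

#if defined(ARCH_X86) || defined(ARCH_X86_64)
/* sarl only looks at the low 5 bits of the count, so -s acts as 32-s */
static inline int32_t neg_ssr32_sketch(int32_t a, int8_t s)
{
    __asm__("sarl %1, %0" : "+r"(a) : "ic"((uint8_t)(-s)));
    return a;
}
#else
static inline int32_t neg_ssr32_sketch(int32_t a, int8_t s)
{
    return a >> (32 - s);
}
#endif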


@@ -4,12 +4,20 @@
 #include <stdlib.h>
 #include "../dsputil.h"
+#ifdef ARCH_X86_64
+#  define REG_b "rbx"
+#  define REG_S "rsi"
+#else
+#  define REG_b "ebx"
+#  define REG_S "esi"
+#endif
+
 /* ebx saving is necessary for PIC. gcc seems unable to see it alone */
 #define cpuid(index,eax,ebx,ecx,edx)\
     __asm __volatile\
-        ("movl %%ebx, %%esi\n\t"\
+        ("mov %%"REG_b", %%"REG_S"\n\t"\
          "cpuid\n\t"\
-         "xchgl %%ebx, %%esi"\
+         "xchg %%"REG_b", %%"REG_S\
          : "=a" (eax), "=S" (ebx),\
            "=c" (ecx), "=d" (edx)\
          : "0" (index));
@@ -24,7 +32,7 @@ int mm_support(void)
         /* See if CPUID instruction is supported ... */
         /* ... Get copies of EFLAGS into eax and ecx */
         "pushf\n\t"
-        "popl %0\n\t"
+        "pop %0\n\t"
         "movl %0, %1\n\t"
         /* ... Toggle the ID bit in one copy and store */
@@ -35,7 +43,7 @@ int mm_support(void)
         /* ... Get the (hopefully modified) EFLAGS */
         "pushf\n\t"
-        "popl %0\n\t"
+        "pop %0\n\t"
        : "=a" (eax), "=c" (ecx)
        :
        : "cc"


@ -187,7 +187,7 @@ static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xF
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{ {
asm volatile( asm volatile(
"movl $-128, %%eax \n\t" "mov $-128, %%"REG_a" \n\t"
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
".balign 16 \n\t" ".balign 16 \n\t"
"1: \n\t" "1: \n\t"
@ -199,16 +199,16 @@ static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
"punpckhbw %%mm7, %%mm1 \n\t" "punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" "punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t" "punpckhbw %%mm7, %%mm3 \n\t"
"movq %%mm0, (%1, %%eax)\n\t" "movq %%mm0, (%1, %%"REG_a")\n\t"
"movq %%mm1, 8(%1, %%eax)\n\t" "movq %%mm1, 8(%1, %%"REG_a")\n\t"
"movq %%mm2, 16(%1, %%eax)\n\t" "movq %%mm2, 16(%1, %%"REG_a")\n\t"
"movq %%mm3, 24(%1, %%eax)\n\t" "movq %%mm3, 24(%1, %%"REG_a")\n\t"
"addl %3, %0 \n\t" "add %3, %0 \n\t"
"addl $32, %%eax \n\t" "add $32, %%"REG_a" \n\t"
"js 1b \n\t" "js 1b \n\t"
: "+r" (pixels) : "+r" (pixels)
: "r" (block+64), "r" (line_size), "r" (line_size*2) : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
: "%eax" : "%"REG_a
); );
} }
@ -216,7 +216,7 @@ static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint
{ {
asm volatile( asm volatile(
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
"movl $-128, %%eax \n\t" "mov $-128, %%"REG_a" \n\t"
".balign 16 \n\t" ".balign 16 \n\t"
"1: \n\t" "1: \n\t"
"movq (%0), %%mm0 \n\t" "movq (%0), %%mm0 \n\t"
@ -229,15 +229,15 @@ static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint
"punpckhbw %%mm7, %%mm3 \n\t" "punpckhbw %%mm7, %%mm3 \n\t"
"psubw %%mm2, %%mm0 \n\t" "psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t" "psubw %%mm3, %%mm1 \n\t"
"movq %%mm0, (%2, %%eax)\n\t" "movq %%mm0, (%2, %%"REG_a")\n\t"
"movq %%mm1, 8(%2, %%eax)\n\t" "movq %%mm1, 8(%2, %%"REG_a")\n\t"
"addl %3, %0 \n\t" "add %3, %0 \n\t"
"addl %3, %1 \n\t" "add %3, %1 \n\t"
"addl $16, %%eax \n\t" "add $16, %%"REG_a" \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
: "+r" (s1), "+r" (s2) : "+r" (s1), "+r" (s2)
: "r" (block+64), "r" (stride) : "r" (block+64), "r" ((long)stride)
: "%eax" : "%"REG_a
); );
} }
#endif //CONFIG_ENCODERS #endif //CONFIG_ENCODERS
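Nearly every dsputil hunk that follows applies the same recipe: hard-coded %%eax becomes a spliced-in REG_a style macro so the index register is rax on x86_64, size-suffixed addl/cmpl become plain add/cmp so the assembler picks the width from the operands, and stride arguments are cast to long so they occupy a full register when used in address arithmetic. A reduced sketch of the idiom (REG_a here is a local stand-in; the patch defines the real macro elsewhere, outside this excerpt):

#ifdef ARCH_X86_64
#  define REG_a "rax"
#else
#  define REG_a "eax"
#endif

static void advance_rows_sketch(unsigned char *p, long stride, int rows)
{
    if (rows <= 0)
        return;
    __asm__ volatile(
        "xor    %%" REG_a ", %%" REG_a "    \n\t"  /* byte counter in a full-width register */
        "1:                                 \n\t"
        "add    %2, %0                      \n\t"  /* 'add', not 'addl': width follows the operands */
        "add    %2, %%" REG_a "             \n\t"
        "decl   %1                          \n\t"
        "jnz    1b                          \n\t"
        : "+r"(p), "+r"(rows)
        : "r"(stride)                              /* long, so it fills the register */
        : "%" REG_a);
}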
@ -268,7 +268,7 @@ void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size
"movq %%mm2, (%0, %1)\n\t" "movq %%mm2, (%0, %1)\n\t"
"movq %%mm4, (%0, %1, 2)\n\t" "movq %%mm4, (%0, %1, 2)\n\t"
"movq %%mm6, (%0, %2)\n\t" "movq %%mm6, (%0, %2)\n\t"
::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
:"memory"); :"memory");
pix += line_size*4; pix += line_size*4;
p += 32; p += 32;
@ -293,7 +293,7 @@ void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size
"movq %%mm2, (%0, %1)\n\t" "movq %%mm2, (%0, %1)\n\t"
"movq %%mm4, (%0, %1, 2)\n\t" "movq %%mm4, (%0, %1, 2)\n\t"
"movq %%mm6, (%0, %2)\n\t" "movq %%mm6, (%0, %2)\n\t"
::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
:"memory"); :"memory");
} }
@ -359,59 +359,59 @@ void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{ {
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
".balign 8 \n\t" ".balign 8 \n\t"
"1: \n\t" "1: \n\t"
"movd (%1), %%mm0 \n\t" "movd (%1), %%mm0 \n\t"
"movd (%1, %3), %%mm1 \n\t" "movd (%1, %3), %%mm1 \n\t"
"movd %%mm0, (%2) \n\t" "movd %%mm0, (%2) \n\t"
"movd %%mm1, (%2, %3) \n\t" "movd %%mm1, (%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"movd (%1), %%mm0 \n\t" "movd (%1), %%mm0 \n\t"
"movd (%1, %3), %%mm1 \n\t" "movd (%1, %3), %%mm1 \n\t"
"movd %%mm0, (%2) \n\t" "movd %%mm0, (%2) \n\t"
"movd %%mm1, (%2, %3) \n\t" "movd %%mm1, (%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
: "+g"(h), "+r" (pixels), "+r" (block) : "+g"(h), "+r" (pixels), "+r" (block)
: "r"(line_size) : "r"((long)line_size)
: "%eax", "memory" : "%"REG_a, "memory"
); );
} }
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{ {
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
".balign 8 \n\t" ".balign 8 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"movq %%mm0, (%2) \n\t" "movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t" "movq %%mm1, (%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"movq %%mm0, (%2) \n\t" "movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t" "movq %%mm1, (%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
: "+g"(h), "+r" (pixels), "+r" (block) : "+g"(h), "+r" (pixels), "+r" (block)
: "r"(line_size) : "r"((long)line_size)
: "%eax", "memory" : "%"REG_a, "memory"
); );
} }
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{ {
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
".balign 8 \n\t" ".balign 8 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
@ -422,8 +422,8 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_siz
"movq %%mm4, 8(%2) \n\t" "movq %%mm4, 8(%2) \n\t"
"movq %%mm1, (%2, %3) \n\t" "movq %%mm1, (%2, %3) \n\t"
"movq %%mm5, 8(%2, %3) \n\t" "movq %%mm5, 8(%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm4 \n\t" "movq 8(%1), %%mm4 \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
@ -432,13 +432,13 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_siz
"movq %%mm4, 8(%2) \n\t" "movq %%mm4, 8(%2) \n\t"
"movq %%mm1, (%2, %3) \n\t" "movq %%mm1, (%2, %3) \n\t"
"movq %%mm5, 8(%2, %3) \n\t" "movq %%mm5, 8(%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
: "+g"(h), "+r" (pixels), "+r" (block) : "+g"(h), "+r" (pixels), "+r" (block)
: "r"(line_size) : "r"((long)line_size)
: "%eax", "memory" : "%"REG_a, "memory"
); );
} }
@ -446,16 +446,16 @@ static void clear_blocks_mmx(DCTELEM *blocks)
{ {
__asm __volatile( __asm __volatile(
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
"movl $-128*6, %%eax \n\t" "mov $-128*6, %%"REG_a" \n\t"
"1: \n\t" "1: \n\t"
"movq %%mm7, (%0, %%eax) \n\t" "movq %%mm7, (%0, %%"REG_a") \n\t"
"movq %%mm7, 8(%0, %%eax) \n\t" "movq %%mm7, 8(%0, %%"REG_a") \n\t"
"movq %%mm7, 16(%0, %%eax) \n\t" "movq %%mm7, 16(%0, %%"REG_a") \n\t"
"movq %%mm7, 24(%0, %%eax) \n\t" "movq %%mm7, 24(%0, %%"REG_a") \n\t"
"addl $32, %%eax \n\t" "add $32, %%"REG_a" \n\t"
" js 1b \n\t" " js 1b \n\t"
: : "r" (((int)blocks)+128*6) : : "r" (((uint8_t *)blocks)+128*6)
: "%eax" : "%"REG_a
); );
} }
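clear_blocks_mmx also drops an (int) cast on a pointer: under LP64, int stays 32 bits while pointers grow to 64, so ((int)blocks)+128*6 would truncate the address before the asm ever saw it. Doing the offset arithmetic on a byte pointer keeps full width. A tiny sketch of the equivalence (DCTELEM is a 16-bit type, so six 64-coefficient blocks span exactly 128*6 bytes):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    int16_t blocks[6 * 64];                      /* six DCT blocks of 64 coefficients */
    uint8_t *end = (uint8_t *)blocks + 128 * 6;  /* pointer-width arithmetic */

    assert(end == (uint8_t *)(blocks + 6 * 64)); /* same address, nothing truncated */
    return 0;
}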
@ -463,7 +463,7 @@ static void clear_blocks_mmx(DCTELEM *blocks)
static int pix_sum16_mmx(uint8_t * pix, int line_size){ static int pix_sum16_mmx(uint8_t * pix, int line_size){
const int h=16; const int h=16;
int sum; int sum;
int index= -line_size*h; long index= -line_size*h;
__asm __volatile( __asm __volatile(
"pxor %%mm7, %%mm7 \n\t" "pxor %%mm7, %%mm7 \n\t"
@ -481,7 +481,7 @@ static int pix_sum16_mmx(uint8_t * pix, int line_size){
"paddw %%mm2, %%mm3 \n\t" "paddw %%mm2, %%mm3 \n\t"
"paddw %%mm1, %%mm3 \n\t" "paddw %%mm1, %%mm3 \n\t"
"paddw %%mm3, %%mm6 \n\t" "paddw %%mm3, %%mm6 \n\t"
"addl %3, %1 \n\t" "add %3, %1 \n\t"
" js 1b \n\t" " js 1b \n\t"
"movq %%mm6, %%mm5 \n\t" "movq %%mm6, %%mm5 \n\t"
"psrlq $32, %%mm6 \n\t" "psrlq $32, %%mm6 \n\t"
@ -492,7 +492,7 @@ static int pix_sum16_mmx(uint8_t * pix, int line_size){
"movd %%mm6, %0 \n\t" "movd %%mm6, %0 \n\t"
"andl $0xFFFF, %0 \n\t" "andl $0xFFFF, %0 \n\t"
: "=&r" (sum), "+r" (index) : "=&r" (sum), "+r" (index)
: "r" (pix - index), "r" (line_size) : "r" (pix - index), "r" ((long)line_size)
); );
return sum; return sum;
@ -500,7 +500,7 @@ static int pix_sum16_mmx(uint8_t * pix, int line_size){
#endif //CONFIG_ENCODERS #endif //CONFIG_ENCODERS
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
int i=0; long i=0;
asm volatile( asm volatile(
"1: \n\t" "1: \n\t"
"movq (%1, %0), %%mm0 \n\t" "movq (%1, %0), %%mm0 \n\t"
@ -511,11 +511,11 @@ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
"movq 8(%2, %0), %%mm1 \n\t" "movq 8(%2, %0), %%mm1 \n\t"
"paddb %%mm0, %%mm1 \n\t" "paddb %%mm0, %%mm1 \n\t"
"movq %%mm1, 8(%2, %0) \n\t" "movq %%mm1, 8(%2, %0) \n\t"
"addl $16, %0 \n\t" "add $16, %0 \n\t"
"cmpl %3, %0 \n\t" "cmp %3, %0 \n\t"
" jb 1b \n\t" " jb 1b \n\t"
: "+r" (i) : "+r" (i)
: "r"(src), "r"(dst), "r"(w-15) : "r"(src), "r"(dst), "r"((long)w-15)
); );
for(; i<w; i++) for(; i<w; i++)
dst[i+0] += src[i+0]; dst[i+0] += src[i+0];
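Loop counters that double as the index part of an addressing mode, like i in (%1, %0) above, switch from int to long for the same reason: the register has to be usable as a 64-bit index. A minimal standalone sketch of that indexing shape (not the original add_bytes_mmx, just the idiom):

static void add_bytes_sketch(unsigned char *dst, const unsigned char *src, long n)
{
    long i = 0;                              /* pointer-width index register */
    if (n <= 0)
        return;
    __asm__ volatile(
        "1:                            \n\t"
        "movzbl (%1, %0), %%eax        \n\t"  /* src[i] */
        "addb   %%al, (%2, %0)         \n\t"  /* dst[i] += src[i] */
        "add    $1, %0                 \n\t"
        "cmp    %3, %0                 \n\t"
        "jb     1b                     \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"(n)
        : "%eax", "memory");
}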
@ -726,7 +726,7 @@ static int pix_norm1_mmx(uint8_t *pix, int line_size) {
"paddd %%mm3,%%mm4\n" "paddd %%mm3,%%mm4\n"
"paddd %%mm2,%%mm7\n" "paddd %%mm2,%%mm7\n"
"addl %2, %0\n" "add %2, %0\n"
"paddd %%mm4,%%mm7\n" "paddd %%mm4,%%mm7\n"
"dec %%ecx\n" "dec %%ecx\n"
"jnz 1b\n" "jnz 1b\n"
@ -735,7 +735,7 @@ static int pix_norm1_mmx(uint8_t *pix, int line_size) {
"psrlq $32, %%mm7\n" /* shift hi dword to lo */ "psrlq $32, %%mm7\n" /* shift hi dword to lo */
"paddd %%mm7,%%mm1\n" "paddd %%mm7,%%mm1\n"
"movd %%mm1,%1\n" "movd %%mm1,%1\n"
: "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" ); : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
return tmp; return tmp;
} }
@ -763,8 +763,8 @@ static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
"pmaddwd %%mm2,%%mm2\n" "pmaddwd %%mm2,%%mm2\n"
"pmaddwd %%mm1,%%mm1\n" "pmaddwd %%mm1,%%mm1\n"
"addl %3,%0\n" "add %3,%0\n"
"addl %3,%1\n" "add %3,%1\n"
"paddd %%mm2,%%mm1\n" "paddd %%mm2,%%mm1\n"
"paddd %%mm1,%%mm7\n" "paddd %%mm1,%%mm7\n"
@ -777,7 +777,7 @@ static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
"paddd %%mm7,%%mm1\n" "paddd %%mm7,%%mm1\n"
"movd %%mm1,%2\n" "movd %%mm1,%2\n"
: "+r" (pix1), "+r" (pix2), "=r"(tmp) : "+r" (pix1), "+r" (pix2), "=r"(tmp)
: "r" (line_size) , "m" (h) : "r" ((long)line_size) , "m" (h)
: "%ecx"); : "%ecx");
return tmp; return tmp;
} }
@ -821,8 +821,8 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
"pmaddwd %%mm1,%%mm1\n" "pmaddwd %%mm1,%%mm1\n"
"pmaddwd %%mm3,%%mm3\n" "pmaddwd %%mm3,%%mm3\n"
"addl %3,%0\n" "add %3,%0\n"
"addl %3,%1\n" "add %3,%1\n"
"paddd %%mm2,%%mm1\n" "paddd %%mm2,%%mm1\n"
"paddd %%mm4,%%mm3\n" "paddd %%mm4,%%mm3\n"
@ -837,7 +837,7 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
"paddd %%mm7,%%mm1\n" "paddd %%mm7,%%mm1\n"
"movd %%mm1,%2\n" "movd %%mm1,%2\n"
: "+r" (pix1), "+r" (pix2), "=r"(tmp) : "+r" (pix1), "+r" (pix2), "=r"(tmp)
: "r" (line_size) , "m" (h) : "r" ((long)line_size) , "m" (h)
: "%ecx"); : "%ecx");
return tmp; return tmp;
} }
@ -863,7 +863,7 @@ static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
"psubw %%mm1, %%mm0\n" "psubw %%mm1, %%mm0\n"
"psubw %%mm3, %%mm2\n" "psubw %%mm3, %%mm2\n"
"addl %2,%0\n" "add %2,%0\n"
"movq (%0),%%mm4\n" "movq (%0),%%mm4\n"
"movq %%mm4, %%mm1\n" "movq %%mm4, %%mm1\n"
@ -891,7 +891,7 @@ static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
"paddw %%mm0, %%mm2\n" "paddw %%mm0, %%mm2\n"
"paddw %%mm2, %%mm6\n" "paddw %%mm2, %%mm6\n"
"addl %2,%0\n" "add %2,%0\n"
"1:\n" "1:\n"
"movq (%0),%%mm0\n" "movq (%0),%%mm0\n"
@ -920,7 +920,7 @@ static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
"paddw %%mm4, %%mm5\n" "paddw %%mm4, %%mm5\n"
"paddw %%mm5, %%mm6\n" "paddw %%mm5, %%mm6\n"
"addl %2,%0\n" "add %2,%0\n"
"movq (%0),%%mm4\n" "movq (%0),%%mm4\n"
"movq %%mm4, %%mm1\n" "movq %%mm4, %%mm1\n"
@ -948,7 +948,7 @@ static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
"paddw %%mm0, %%mm2\n" "paddw %%mm0, %%mm2\n"
"paddw %%mm2, %%mm6\n" "paddw %%mm2, %%mm6\n"
"addl %2,%0\n" "add %2,%0\n"
"subl $2, %%ecx\n" "subl $2, %%ecx\n"
" jnz 1b\n" " jnz 1b\n"
@ -962,7 +962,7 @@ static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
"paddd %%mm6,%%mm0\n" "paddd %%mm6,%%mm0\n"
"movd %%mm0,%1\n" "movd %%mm0,%1\n"
: "+r" (pix1), "=r"(tmp) : "+r" (pix1), "=r"(tmp)
: "r" (line_size) , "g" (h-2) : "r" ((long)line_size) , "g" (h-2)
: "%ecx"); : "%ecx");
return tmp; return tmp;
} }
@ -986,7 +986,7 @@ static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
"psubw %%mm1, %%mm0\n" "psubw %%mm1, %%mm0\n"
"psubw %%mm3, %%mm2\n" "psubw %%mm3, %%mm2\n"
"addl %2,%0\n" "add %2,%0\n"
"movq (%0),%%mm4\n" "movq (%0),%%mm4\n"
"movq 1(%0),%%mm1\n" "movq 1(%0),%%mm1\n"
@ -1011,7 +1011,7 @@ static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
"paddw %%mm0, %%mm2\n" "paddw %%mm0, %%mm2\n"
"paddw %%mm2, %%mm6\n" "paddw %%mm2, %%mm6\n"
"addl %2,%0\n" "add %2,%0\n"
"1:\n" "1:\n"
"movq (%0),%%mm0\n" "movq (%0),%%mm0\n"
@ -1037,7 +1037,7 @@ static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
"paddw %%mm4, %%mm5\n" "paddw %%mm4, %%mm5\n"
"paddw %%mm5, %%mm6\n" "paddw %%mm5, %%mm6\n"
"addl %2,%0\n" "add %2,%0\n"
"movq (%0),%%mm4\n" "movq (%0),%%mm4\n"
"movq 1(%0),%%mm1\n" "movq 1(%0),%%mm1\n"
@ -1062,7 +1062,7 @@ static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
"paddw %%mm0, %%mm2\n" "paddw %%mm0, %%mm2\n"
"paddw %%mm2, %%mm6\n" "paddw %%mm2, %%mm6\n"
"addl %2,%0\n" "add %2,%0\n"
"subl $2, %%ecx\n" "subl $2, %%ecx\n"
" jnz 1b\n" " jnz 1b\n"
@ -1076,7 +1076,7 @@ static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
"paddd %%mm6,%%mm0\n" "paddd %%mm6,%%mm0\n"
"movd %%mm0,%1\n" "movd %%mm0,%1\n"
: "+r" (pix1), "=r"(tmp) : "+r" (pix1), "=r"(tmp)
: "r" (line_size) , "g" (h-2) : "r" ((long)line_size) , "g" (h-2)
: "%ecx"); : "%ecx");
return tmp + hf_noise8_mmx(pix+8, line_size, h); return tmp + hf_noise8_mmx(pix+8, line_size, h);
} }
@ -1106,7 +1106,7 @@ static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_si
#define SUM(in0, in1, out0, out1) \ #define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n"\ "movq (%0), %%mm2\n"\
"movq 8(%0), %%mm3\n"\ "movq 8(%0), %%mm3\n"\
"addl %2,%0\n"\ "add %2,%0\n"\
"movq %%mm2, " #out0 "\n"\ "movq %%mm2, " #out0 "\n"\
"movq %%mm3, " #out1 "\n"\ "movq %%mm3, " #out1 "\n"\
"psubusb " #in0 ", %%mm2\n"\ "psubusb " #in0 ", %%mm2\n"\
@ -1133,7 +1133,7 @@ static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_si
"pxor %%mm7,%%mm7\n" "pxor %%mm7,%%mm7\n"
"movq (%0),%%mm0\n" "movq (%0),%%mm0\n"
"movq 8(%0),%%mm1\n" "movq 8(%0),%%mm1\n"
"addl %2,%0\n" "add %2,%0\n"
"subl $2, %%ecx\n" "subl $2, %%ecx\n"
SUM(%%mm0, %%mm1, %%mm4, %%mm5) SUM(%%mm0, %%mm1, %%mm4, %%mm5)
"1:\n" "1:\n"
@ -1153,7 +1153,7 @@ static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_si
"paddw %%mm6,%%mm0\n" "paddw %%mm6,%%mm0\n"
"movd %%mm0,%1\n" "movd %%mm0,%1\n"
: "+r" (pix), "=r"(tmp) : "+r" (pix), "=r"(tmp)
: "r" (line_size) , "m" (h) : "r" ((long)line_size) , "m" (h)
: "%ecx"); : "%ecx");
return tmp & 0xFFFF; return tmp & 0xFFFF;
} }
@ -1168,7 +1168,7 @@ static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_s
#define SUM(in0, in1, out0, out1) \ #define SUM(in0, in1, out0, out1) \
"movq (%0), " #out0 "\n"\ "movq (%0), " #out0 "\n"\
"movq 8(%0), " #out1 "\n"\ "movq 8(%0), " #out1 "\n"\
"addl %2,%0\n"\ "add %2,%0\n"\
"psadbw " #out0 ", " #in0 "\n"\ "psadbw " #out0 ", " #in0 "\n"\
"psadbw " #out1 ", " #in1 "\n"\ "psadbw " #out1 ", " #in1 "\n"\
"paddw " #in1 ", " #in0 "\n"\ "paddw " #in1 ", " #in0 "\n"\
@ -1180,7 +1180,7 @@ static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_s
"pxor %%mm7,%%mm7\n" "pxor %%mm7,%%mm7\n"
"movq (%0),%%mm0\n" "movq (%0),%%mm0\n"
"movq 8(%0),%%mm1\n" "movq 8(%0),%%mm1\n"
"addl %2,%0\n" "add %2,%0\n"
"subl $2, %%ecx\n" "subl $2, %%ecx\n"
SUM(%%mm0, %%mm1, %%mm4, %%mm5) SUM(%%mm0, %%mm1, %%mm4, %%mm5)
"1:\n" "1:\n"
@ -1194,7 +1194,7 @@ static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_s
"movd %%mm6,%1\n" "movd %%mm6,%1\n"
: "+r" (pix), "=r"(tmp) : "+r" (pix), "=r"(tmp)
: "r" (line_size) , "m" (h) : "r" ((long)line_size) , "m" (h)
: "%ecx"); : "%ecx");
return tmp; return tmp;
} }
@ -1212,8 +1212,8 @@ static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in
"movq (%1)," #out0 "\n"\ "movq (%1)," #out0 "\n"\
"movq 8(%0),%%mm3\n"\ "movq 8(%0),%%mm3\n"\
"movq 8(%1)," #out1 "\n"\ "movq 8(%1)," #out1 "\n"\
"addl %3,%0\n"\ "add %3,%0\n"\
"addl %3,%1\n"\ "add %3,%1\n"\
"psubb " #out0 ", %%mm2\n"\ "psubb " #out0 ", %%mm2\n"\
"psubb " #out1 ", %%mm3\n"\ "psubb " #out1 ", %%mm3\n"\
"pxor %%mm7, %%mm2\n"\ "pxor %%mm7, %%mm2\n"\
@ -1248,8 +1248,8 @@ static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in
"movq (%1),%%mm2\n" "movq (%1),%%mm2\n"
"movq 8(%0),%%mm1\n" "movq 8(%0),%%mm1\n"
"movq 8(%1),%%mm3\n" "movq 8(%1),%%mm3\n"
"addl %3,%0\n" "add %3,%0\n"
"addl %3,%1\n" "add %3,%1\n"
"subl $2, %%ecx\n" "subl $2, %%ecx\n"
"psubb %%mm2, %%mm0\n" "psubb %%mm2, %%mm0\n"
"psubb %%mm3, %%mm1\n" "psubb %%mm3, %%mm1\n"
@ -1273,7 +1273,7 @@ static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in
"paddw %%mm6,%%mm0\n" "paddw %%mm6,%%mm0\n"
"movd %%mm0,%2\n" "movd %%mm0,%2\n"
: "+r" (pix1), "+r" (pix2), "=r"(tmp) : "+r" (pix1), "+r" (pix2), "=r"(tmp)
: "r" (line_size) , "m" (h) : "r" ((long)line_size) , "m" (h)
: "%ecx"); : "%ecx");
return tmp & 0x7FFF; return tmp & 0x7FFF;
} }
@ -1291,8 +1291,8 @@ static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, i
"movq (%1),%%mm2\n"\ "movq (%1),%%mm2\n"\
"movq 8(%0)," #out1 "\n"\ "movq 8(%0)," #out1 "\n"\
"movq 8(%1),%%mm3\n"\ "movq 8(%1),%%mm3\n"\
"addl %3,%0\n"\ "add %3,%0\n"\
"addl %3,%1\n"\ "add %3,%1\n"\
"psubb %%mm2, " #out0 "\n"\ "psubb %%mm2, " #out0 "\n"\
"psubb %%mm3, " #out1 "\n"\ "psubb %%mm3, " #out1 "\n"\
"pxor %%mm7, " #out0 "\n"\ "pxor %%mm7, " #out0 "\n"\
@ -1312,8 +1312,8 @@ static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, i
"movq (%1),%%mm2\n" "movq (%1),%%mm2\n"
"movq 8(%0),%%mm1\n" "movq 8(%0),%%mm1\n"
"movq 8(%1),%%mm3\n" "movq 8(%1),%%mm3\n"
"addl %3,%0\n" "add %3,%0\n"
"addl %3,%1\n" "add %3,%1\n"
"subl $2, %%ecx\n" "subl $2, %%ecx\n"
"psubb %%mm2, %%mm0\n" "psubb %%mm2, %%mm0\n"
"psubb %%mm3, %%mm1\n" "psubb %%mm3, %%mm1\n"
@ -1331,14 +1331,14 @@ static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, i
"movd %%mm6,%2\n" "movd %%mm6,%2\n"
: "+r" (pix1), "+r" (pix2), "=r"(tmp) : "+r" (pix1), "+r" (pix2), "=r"(tmp)
: "r" (line_size) , "m" (h) : "r" ((long)line_size) , "m" (h)
: "%ecx"); : "%ecx");
return tmp; return tmp;
} }
#undef SUM #undef SUM
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
int i=0; long i=0;
asm volatile( asm volatile(
"1: \n\t" "1: \n\t"
"movq (%2, %0), %%mm0 \n\t" "movq (%2, %0), %%mm0 \n\t"
@ -1349,18 +1349,18 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
"movq 8(%1, %0), %%mm1 \n\t" "movq 8(%1, %0), %%mm1 \n\t"
"psubb %%mm0, %%mm1 \n\t" "psubb %%mm0, %%mm1 \n\t"
"movq %%mm1, 8(%3, %0) \n\t" "movq %%mm1, 8(%3, %0) \n\t"
"addl $16, %0 \n\t" "add $16, %0 \n\t"
"cmpl %4, %0 \n\t" "cmp %4, %0 \n\t"
" jb 1b \n\t" " jb 1b \n\t"
: "+r" (i) : "+r" (i)
: "r"(src1), "r"(src2), "r"(dst), "r"(w-15) : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
); );
for(; i<w; i++) for(; i<w; i++)
dst[i+0] = src1[i+0]-src2[i+0]; dst[i+0] = src1[i+0]-src2[i+0];
} }
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
int i=0; long i=0;
uint8_t l, lt; uint8_t l, lt;
asm volatile( asm volatile(
@ -1379,11 +1379,11 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t
"pmaxub %%mm1, %%mm4 \n\t" "pmaxub %%mm1, %%mm4 \n\t"
"psubb %%mm4, %%mm3 \n\t" // dst - pred "psubb %%mm4, %%mm3 \n\t" // dst - pred
"movq %%mm3, (%3, %0) \n\t" "movq %%mm3, (%3, %0) \n\t"
"addl $8, %0 \n\t" "add $8, %0 \n\t"
"cmpl %4, %0 \n\t" "cmp %4, %0 \n\t"
" jb 1b \n\t" " jb 1b \n\t"
: "+r" (i) : "+r" (i)
: "r"(src1), "r"(src2), "r"(dst), "r"(w) : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
); );
l= *left; l= *left;
@ -1772,12 +1772,12 @@ static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, in
"packuswb %%mm4, %%mm0 \n\t"\ "packuswb %%mm4, %%mm0 \n\t"\
OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
\ \
"addl %3, %0 \n\t"\ "add %3, %0 \n\t"\
"addl %4, %1 \n\t"\ "add %4, %1 \n\t"\
"decl %2 \n\t"\ "decl %2 \n\t"\
" jnz 1b \n\t"\ " jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+m"(h)\ : "+a"(src), "+c"(dst), "+m"(h)\
: "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
: "memory"\ : "memory"\
);\ );\
}\ }\
@ -1885,12 +1885,12 @@ static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int
"packuswb %%mm3, %%mm0 \n\t"\ "packuswb %%mm3, %%mm0 \n\t"\
OP_MMX2(%%mm0, (%1), %%mm4, q)\ OP_MMX2(%%mm0, (%1), %%mm4, q)\
\ \
"addl %3, %0 \n\t"\ "add %3, %0 \n\t"\
"addl %4, %1 \n\t"\ "add %4, %1 \n\t"\
"decl %2 \n\t"\ "decl %2 \n\t"\
" jnz 1b \n\t"\ " jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+m"(h)\ : "+a"(src), "+c"(dst), "+m"(h)\
: "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
: "memory"\ : "memory"\
);\ );\
}\ }\
@ -1949,12 +1949,12 @@ static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
"movq %%mm1, 17*8(%1) \n\t"\ "movq %%mm1, 17*8(%1) \n\t"\
"movq %%mm2, 2*17*8(%1) \n\t"\ "movq %%mm2, 2*17*8(%1) \n\t"\
"movq %%mm3, 3*17*8(%1) \n\t"\ "movq %%mm3, 3*17*8(%1) \n\t"\
"addl $8, %1 \n\t"\ "add $8, %1 \n\t"\
"addl %3, %0 \n\t"\ "add %3, %0 \n\t"\
"decl %2 \n\t"\ "decl %2 \n\t"\
" jnz 1b \n\t"\ " jnz 1b \n\t"\
: "+r" (src), "+r" (temp_ptr), "+r"(count)\ : "+r" (src), "+r" (temp_ptr), "+r"(count)\
: "r" (srcStride)\ : "r" ((long)srcStride)\
: "memory"\ : "memory"\
);\ );\
\ \
@ -1971,37 +1971,37 @@ static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
"movq 24(%0), %%mm3 \n\t"\ "movq 24(%0), %%mm3 \n\t"\
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
"addl %4, %1 \n\t"\ "add %4, %1 \n\t"\
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
\ \
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
"addl %4, %1 \n\t"\ "add %4, %1 \n\t"\
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
"addl %4, %1 \n\t"\ "add %4, %1 \n\t"\
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
"addl %4, %1 \n\t"\ "add %4, %1 \n\t"\
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
"addl %4, %1 \n\t"\ "add %4, %1 \n\t"\
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
"addl %4, %1 \n\t"\ "add %4, %1 \n\t"\
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
\ \
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
"addl %4, %1 \n\t" \ "add %4, %1 \n\t" \
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
\ \
"addl $136, %0 \n\t"\ "add $136, %0 \n\t"\
"addl %6, %1 \n\t"\ "add %6, %1 \n\t"\
"decl %2 \n\t"\ "decl %2 \n\t"\
" jnz 1b \n\t"\ " jnz 1b \n\t"\
\ \
: "+r"(temp_ptr), "+r"(dst), "+g"(count)\ : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
: "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\ : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
:"memory"\ :"memory"\
);\ );\
}\ }\
@ -2021,12 +2021,12 @@ static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
"punpckhbw %%mm7, %%mm1 \n\t"\ "punpckhbw %%mm7, %%mm1 \n\t"\
"movq %%mm0, (%1) \n\t"\ "movq %%mm0, (%1) \n\t"\
"movq %%mm1, 9*8(%1) \n\t"\ "movq %%mm1, 9*8(%1) \n\t"\
"addl $8, %1 \n\t"\ "add $8, %1 \n\t"\
"addl %3, %0 \n\t"\ "add %3, %0 \n\t"\
"decl %2 \n\t"\ "decl %2 \n\t"\
" jnz 1b \n\t"\ " jnz 1b \n\t"\
: "+r" (src), "+r" (temp_ptr), "+r"(count)\ : "+r" (src), "+r" (temp_ptr), "+r"(count)\
: "r" (srcStride)\ : "r" ((long)srcStride)\
: "memory"\ : "memory"\
);\ );\
\ \
@ -2043,25 +2043,25 @@ static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
"movq 24(%0), %%mm3 \n\t"\ "movq 24(%0), %%mm3 \n\t"\
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
"addl %4, %1 \n\t"\ "add %4, %1 \n\t"\
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
\ \
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
"addl %4, %1 \n\t"\ "add %4, %1 \n\t"\
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
\ \
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
"addl %4, %1 \n\t"\ "add %4, %1 \n\t"\
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
\ \
"addl $72, %0 \n\t"\ "add $72, %0 \n\t"\
"addl %6, %1 \n\t"\ "add %6, %1 \n\t"\
"decl %2 \n\t"\ "decl %2 \n\t"\
" jnz 1b \n\t"\ " jnz 1b \n\t"\
\ \
: "+r"(temp_ptr), "+r"(dst), "+g"(count)\ : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
: "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\ : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
: "memory"\ : "memory"\
);\ );\
}\ }\
@ -2297,7 +2297,7 @@ static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride
"psubw "#B", %%mm6 \n\t"\ "psubw "#B", %%mm6 \n\t"\
"psubw "#E", %%mm6 \n\t"\ "psubw "#E", %%mm6 \n\t"\
"pmullw %4, %%mm6 \n\t"\ "pmullw %4, %%mm6 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\ "punpcklbw %%mm7, "#F" \n\t"\
"paddw %5, "#A" \n\t"\ "paddw %5, "#A" \n\t"\
"paddw "#F", "#A" \n\t"\ "paddw "#F", "#A" \n\t"\
@ -2305,7 +2305,7 @@ static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride
"psraw $5, %%mm6 \n\t"\ "psraw $5, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\ "packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d)\ OP(%%mm6, (%1), A, d)\
"addl %3, %1 \n\t" "add %3, %1 \n\t"
#define QPEL_H264HV(A,B,C,D,E,F,OF)\ #define QPEL_H264HV(A,B,C,D,E,F,OF)\
"movd (%0), "#F" \n\t"\ "movd (%0), "#F" \n\t"\
@ -2315,7 +2315,7 @@ static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride
"psubw "#B", %%mm6 \n\t"\ "psubw "#B", %%mm6 \n\t"\
"psubw "#E", %%mm6 \n\t"\ "psubw "#E", %%mm6 \n\t"\
"pmullw %3, %%mm6 \n\t"\ "pmullw %3, %%mm6 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\ "punpcklbw %%mm7, "#F" \n\t"\
"paddw "#F", "#A" \n\t"\ "paddw "#F", "#A" \n\t"\
"paddw "#A", %%mm6 \n\t"\ "paddw "#A", %%mm6 \n\t"\
@ -2353,12 +2353,12 @@ static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, i
"psraw $5, %%mm0 \n\t"\ "psraw $5, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\ "packuswb %%mm0, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm6, d)\ OP(%%mm0, (%1),%%mm6, d)\
"addl %3, %0 \n\t"\ "add %3, %0 \n\t"\
"addl %4, %1 \n\t"\ "add %4, %1 \n\t"\
"decl %2 \n\t"\ "decl %2 \n\t"\
" jnz 1b \n\t"\ " jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+m"(h)\ : "+a"(src), "+c"(dst), "+m"(h)\
: "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\ : "memory"\
);\ );\
}\ }\
@ -2367,15 +2367,15 @@ static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, i
asm volatile(\ asm volatile(\
"pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\ "movd (%0), %%mm0 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\ "movd (%0), %%mm1 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\ "movd (%0), %%mm2 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\ "movd (%0), %%mm3 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\ "movd (%0), %%mm4 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\ "punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\ "punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\
@ -2387,7 +2387,7 @@ static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, i
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
\ \
: "+a"(src), "+c"(dst)\ : "+a"(src), "+c"(dst)\
: "S"(srcStride), "D"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\ : "memory"\
);\ );\
}\ }\
@ -2399,15 +2399,15 @@ static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
asm volatile(\ asm volatile(\
"pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\ "movd (%0), %%mm0 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\ "movd (%0), %%mm1 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\ "movd (%0), %%mm2 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\ "movd (%0), %%mm3 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\ "movd (%0), %%mm4 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\ "punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\ "punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\
@ -2419,7 +2419,7 @@ static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
\ \
: "+a"(src)\ : "+a"(src)\
: "c"(tmp), "S"(srcStride), "m"(ff_pw_5)\ : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
: "memory"\ : "memory"\
);\ );\
tmp += 4;\ tmp += 4;\
@ -2445,12 +2445,12 @@ static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
"psraw $6, %%mm0 \n\t"\ "psraw $6, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\ "packuswb %%mm0, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, d)\ OP(%%mm0, (%1),%%mm7, d)\
"addl $24, %0 \n\t"\ "add $24, %0 \n\t"\
"addl %3, %1 \n\t"\ "add %3, %1 \n\t"\
"decl %2 \n\t"\ "decl %2 \n\t"\
" jnz 1b \n\t"\ " jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+m"(h)\ : "+a"(tmp), "+c"(dst), "+m"(h)\
: "S"(dstStride), "m"(ff_pw_32)\ : "S"((long)dstStride), "m"(ff_pw_32)\
: "memory"\ : "memory"\
);\ );\
}\ }\
@ -2502,12 +2502,12 @@ static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, i
"psraw $5, %%mm1 \n\t"\ "psraw $5, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\ "packuswb %%mm1, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q)\ OP(%%mm0, (%1),%%mm5, q)\
"addl %3, %0 \n\t"\ "add %3, %0 \n\t"\
"addl %4, %1 \n\t"\ "add %4, %1 \n\t"\
"decl %2 \n\t"\ "decl %2 \n\t"\
" jnz 1b \n\t"\ " jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+m"(h)\ : "+a"(src), "+c"(dst), "+m"(h)\
: "d"(srcStride), "S"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\ : "memory"\
);\ );\
}\ }\
@ -2520,15 +2520,15 @@ static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, i
asm volatile(\ asm volatile(\
"pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\ "movd (%0), %%mm0 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\ "movd (%0), %%mm1 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\ "movd (%0), %%mm2 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\ "movd (%0), %%mm3 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\ "movd (%0), %%mm4 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\ "punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\ "punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\
@ -2544,7 +2544,7 @@ static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, i
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
\ \
: "+a"(src), "+c"(dst)\ : "+a"(src), "+c"(dst)\
: "S"(srcStride), "D"(dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\ : "memory"\
);\ );\
src += 4-13*srcStride;\ src += 4-13*srcStride;\
@ -2559,15 +2559,15 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
asm volatile(\ asm volatile(\
"pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\ "movd (%0), %%mm0 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\ "movd (%0), %%mm1 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\ "movd (%0), %%mm2 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\ "movd (%0), %%mm3 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\ "movd (%0), %%mm4 \n\t"\
"addl %2, %0 \n\t"\ "add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\ "punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\ "punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\
@ -2583,7 +2583,7 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\
\ \
: "+a"(src)\ : "+a"(src)\
: "c"(tmp), "S"(srcStride), "m"(ff_pw_5)\ : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
: "memory"\ : "memory"\
);\ );\
tmp += 4;\ tmp += 4;\
@ -2623,12 +2623,12 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
"psraw $6, %%mm3 \n\t"\ "psraw $6, %%mm3 \n\t"\
"packuswb %%mm3, %%mm0 \n\t"\ "packuswb %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, q)\ OP(%%mm0, (%1),%%mm7, q)\
"addl $32, %0 \n\t"\ "add $32, %0 \n\t"\
"addl %3, %1 \n\t"\ "add %3, %1 \n\t"\
"decl %2 \n\t"\ "decl %2 \n\t"\
" jnz 1b \n\t"\ " jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+m"(h)\ : "+a"(tmp), "+c"(dst), "+m"(h)\
: "S"(dstStride), "m"(ff_pw_32)\ : "S"((long)dstStride), "m"(ff_pw_32)\
: "memory"\ : "memory"\
);\ );\
}\ }\
@ -2831,7 +2831,7 @@ static void just_return() { return; }
c->avg_ ## postfix1 = avg_ ## postfix2; c->avg_ ## postfix1 = avg_ ## postfix2;
static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){ static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
int i=0; long i=0;
assert(ABS(scale) < 256); assert(ABS(scale) < 256);
scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
@ -2863,8 +2863,8 @@ static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[6
"paddd %%mm1, %%mm0 \n\t" "paddd %%mm1, %%mm0 \n\t"
"psrld $4, %%mm0 \n\t" "psrld $4, %%mm0 \n\t"
"paddd %%mm0, %%mm7 \n\t" "paddd %%mm0, %%mm7 \n\t"
"addl $16, %0 \n\t" "add $16, %0 \n\t"
"cmpl $128, %0 \n\t" //FIXME optimize & bench "cmp $128, %0 \n\t" //FIXME optimize & bench
" jb 1b \n\t" " jb 1b \n\t"
"movq %%mm7, %%mm6 \n\t" "movq %%mm7, %%mm6 \n\t"
"psrlq $32, %%mm7 \n\t" "psrlq $32, %%mm7 \n\t"
@ -2879,7 +2879,7 @@ static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[6
} }
static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){ static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
int i=0; long i=0;
if(ABS(scale) < 256){ if(ABS(scale) < 256){
scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
@ -2902,8 +2902,8 @@ static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
"paddw 8(%2, %0), %%mm1 \n\t" "paddw 8(%2, %0), %%mm1 \n\t"
"movq %%mm0, (%2, %0) \n\t" "movq %%mm0, (%2, %0) \n\t"
"movq %%mm1, 8(%2, %0) \n\t" "movq %%mm1, 8(%2, %0) \n\t"
"addl $16, %0 \n\t" "add $16, %0 \n\t"
"cmpl $128, %0 \n\t" //FIXME optimize & bench "cmp $128, %0 \n\t" //FIXME optimize & bench
" jb 1b \n\t" " jb 1b \n\t"
: "+r" (i) : "+r" (i)


@ -28,7 +28,7 @@
static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{ {
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
@ -36,21 +36,21 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_
PAVGB" 1(%1, %3), %%mm1 \n\t" PAVGB" 1(%1, %3), %%mm1 \n\t"
"movq %%mm0, (%2) \n\t" "movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t" "movq %%mm1, (%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
PAVGB" 1(%1), %%mm0 \n\t" PAVGB" 1(%1), %%mm0 \n\t"
PAVGB" 1(%1, %3), %%mm1 \n\t" PAVGB" 1(%1, %3), %%mm1 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"movq %%mm0, (%2) \n\t" "movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t" "movq %%mm1, (%2, %3) \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block) :"+g"(h), "+S"(pixels), "+D"(block)
:"r" (line_size) :"r" ((long)line_size)
:"%eax", "memory"); :"%"REG_a, "memory");
} }
static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
@ -60,34 +60,34 @@ static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
" jz 1f \n\t" " jz 1f \n\t"
"movd (%1), %%mm0 \n\t" "movd (%1), %%mm0 \n\t"
"movd (%2), %%mm1 \n\t" "movd (%2), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"addl $4, %2 \n\t" "add $4, %2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm1, %%mm0 \n\t"
"movd %%mm0, (%3) \n\t" "movd %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"decl %0 \n\t" "decl %0 \n\t"
"1: \n\t" "1: \n\t"
"movd (%1), %%mm0 \n\t" "movd (%1), %%mm0 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movd (%1), %%mm1 \n\t" "movd (%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGB" (%2), %%mm0 \n\t" PAVGB" (%2), %%mm0 \n\t"
PAVGB" 4(%2), %%mm1 \n\t" PAVGB" 4(%2), %%mm1 \n\t"
"movd %%mm0, (%3) \n\t" "movd %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movd %%mm1, (%3) \n\t" "movd %%mm1, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movd (%1), %%mm0 \n\t" "movd (%1), %%mm0 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movd (%1), %%mm1 \n\t" "movd (%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGB" 8(%2), %%mm0 \n\t" PAVGB" 8(%2), %%mm0 \n\t"
PAVGB" 12(%2), %%mm1 \n\t" PAVGB" 12(%2), %%mm1 \n\t"
"movd %%mm0, (%3) \n\t" "movd %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movd %%mm1, (%3) \n\t" "movd %%mm1, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"addl $16, %2 \n\t" "add $16, %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
@ -95,7 +95,7 @@ static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
#else #else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif #endif
:"S"(src1Stride), "D"(dstStride) :"S"((long)src1Stride), "D"((long)dstStride)
:"memory"); :"memory");
} }
@ -107,34 +107,34 @@ static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
" jz 1f \n\t" " jz 1f \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq (%2), %%mm1 \n\t" "movq (%2), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"addl $8, %2 \n\t" "add $8, %2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm1, %%mm0 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"decl %0 \n\t" "decl %0 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movq (%1), %%mm1 \n\t" "movq (%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGB" (%2), %%mm0 \n\t" PAVGB" (%2), %%mm0 \n\t"
PAVGB" 8(%2), %%mm1 \n\t" PAVGB" 8(%2), %%mm1 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movq %%mm1, (%3) \n\t" "movq %%mm1, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movq (%1), %%mm1 \n\t" "movq (%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGB" 16(%2), %%mm0 \n\t" PAVGB" 16(%2), %%mm0 \n\t"
PAVGB" 24(%2), %%mm1 \n\t" PAVGB" 24(%2), %%mm1 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movq %%mm1, (%3) \n\t" "movq %%mm1, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"addl $32, %2 \n\t" "add $32, %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
@ -142,7 +142,7 @@ static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
#else #else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif #endif
:"S"(src1Stride), "D"(dstStride) :"S"((long)src1Stride), "D"((long)dstStride)
:"memory"); :"memory");
//the following should be used, though better not with gcc ... //the following should be used, though better not with gcc ...
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
@ -158,20 +158,20 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src
" jz 1f \n\t" " jz 1f \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq (%2), %%mm1 \n\t" "movq (%2), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"addl $8, %2 \n\t" "add $8, %2 \n\t"
"pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm0 \n\t"
"pxor %%mm6, %%mm1 \n\t" "pxor %%mm6, %%mm1 \n\t"
PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm1, %%mm0 \n\t"
"pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm0 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"decl %0 \n\t" "decl %0 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movq (%1), %%mm1 \n\t" "movq (%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movq (%2), %%mm2 \n\t" "movq (%2), %%mm2 \n\t"
"movq 8(%2), %%mm3 \n\t" "movq 8(%2), %%mm3 \n\t"
"pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm0 \n\t"
@ -183,13 +183,13 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src
"pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm0 \n\t"
"pxor %%mm6, %%mm1 \n\t" "pxor %%mm6, %%mm1 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movq %%mm1, (%3) \n\t" "movq %%mm1, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movq (%1), %%mm1 \n\t" "movq (%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movq 16(%2), %%mm2 \n\t" "movq 16(%2), %%mm2 \n\t"
"movq 24(%2), %%mm3 \n\t" "movq 24(%2), %%mm3 \n\t"
"pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm0 \n\t"
@ -201,10 +201,10 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src
"pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm0 \n\t"
"pxor %%mm6, %%mm1 \n\t" "pxor %%mm6, %%mm1 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movq %%mm1, (%3) \n\t" "movq %%mm1, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"addl $32, %2 \n\t" "add $32, %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
@ -212,7 +212,7 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src
#else #else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif #endif
:"S"(src1Stride), "D"(dstStride) :"S"((long)src1Stride), "D"((long)dstStride)
:"memory"); :"memory");
//the following should be used, though better not with gcc ... //the following should be used, though better not with gcc ...
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
@ -227,39 +227,39 @@ static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
" jz 1f \n\t" " jz 1f \n\t"
"movd (%1), %%mm0 \n\t" "movd (%1), %%mm0 \n\t"
"movd (%2), %%mm1 \n\t" "movd (%2), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"addl $4, %2 \n\t" "add $4, %2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" (%3), %%mm0 \n\t" PAVGB" (%3), %%mm0 \n\t"
"movd %%mm0, (%3) \n\t" "movd %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"decl %0 \n\t" "decl %0 \n\t"
"1: \n\t" "1: \n\t"
"movd (%1), %%mm0 \n\t" "movd (%1), %%mm0 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movd (%1), %%mm1 \n\t" "movd (%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGB" (%2), %%mm0 \n\t" PAVGB" (%2), %%mm0 \n\t"
PAVGB" 4(%2), %%mm1 \n\t" PAVGB" 4(%2), %%mm1 \n\t"
PAVGB" (%3), %%mm0 \n\t" PAVGB" (%3), %%mm0 \n\t"
"movd %%mm0, (%3) \n\t" "movd %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
PAVGB" (%3), %%mm1 \n\t" PAVGB" (%3), %%mm1 \n\t"
"movd %%mm1, (%3) \n\t" "movd %%mm1, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movd (%1), %%mm0 \n\t" "movd (%1), %%mm0 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movd (%1), %%mm1 \n\t" "movd (%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGB" 8(%2), %%mm0 \n\t" PAVGB" 8(%2), %%mm0 \n\t"
PAVGB" 12(%2), %%mm1 \n\t" PAVGB" 12(%2), %%mm1 \n\t"
PAVGB" (%3), %%mm0 \n\t" PAVGB" (%3), %%mm0 \n\t"
"movd %%mm0, (%3) \n\t" "movd %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
PAVGB" (%3), %%mm1 \n\t" PAVGB" (%3), %%mm1 \n\t"
"movd %%mm1, (%3) \n\t" "movd %%mm1, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"addl $16, %2 \n\t" "add $16, %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
@ -267,7 +267,7 @@ static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
#else #else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif #endif
:"S"(src1Stride), "D"(dstStride) :"S"((long)src1Stride), "D"((long)dstStride)
:"memory"); :"memory");
} }
@ -279,39 +279,39 @@ static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
" jz 1f \n\t" " jz 1f \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq (%2), %%mm1 \n\t" "movq (%2), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"addl $8, %2 \n\t" "add $8, %2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" (%3), %%mm0 \n\t" PAVGB" (%3), %%mm0 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"decl %0 \n\t" "decl %0 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movq (%1), %%mm1 \n\t" "movq (%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGB" (%2), %%mm0 \n\t" PAVGB" (%2), %%mm0 \n\t"
PAVGB" 8(%2), %%mm1 \n\t" PAVGB" 8(%2), %%mm1 \n\t"
PAVGB" (%3), %%mm0 \n\t" PAVGB" (%3), %%mm0 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
PAVGB" (%3), %%mm1 \n\t" PAVGB" (%3), %%mm1 \n\t"
"movq %%mm1, (%3) \n\t" "movq %%mm1, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movq (%1), %%mm1 \n\t" "movq (%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGB" 16(%2), %%mm0 \n\t" PAVGB" 16(%2), %%mm0 \n\t"
PAVGB" 24(%2), %%mm1 \n\t" PAVGB" 24(%2), %%mm1 \n\t"
PAVGB" (%3), %%mm0 \n\t" PAVGB" (%3), %%mm0 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
PAVGB" (%3), %%mm1 \n\t" PAVGB" (%3), %%mm1 \n\t"
"movq %%mm1, (%3) \n\t" "movq %%mm1, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"addl $32, %2 \n\t" "add $32, %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
@ -319,7 +319,7 @@ static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
#else #else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif #endif
:"S"(src1Stride), "D"(dstStride) :"S"((long)src1Stride), "D"((long)dstStride)
:"memory"); :"memory");
//the following should be used, though better not with gcc ... //the following should be used, though better not with gcc ...
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
@ -330,7 +330,7 @@ static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{ {
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
@ -344,8 +344,8 @@ static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line
"movq %%mm1, (%2, %3) \n\t" "movq %%mm1, (%2, %3) \n\t"
"movq %%mm2, 8(%2) \n\t" "movq %%mm2, 8(%2) \n\t"
"movq %%mm3, 8(%2, %3) \n\t" "movq %%mm3, 8(%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"movq 8(%1), %%mm2 \n\t" "movq 8(%1), %%mm2 \n\t"
@ -354,17 +354,17 @@ static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line
PAVGB" 1(%1, %3), %%mm1 \n\t" PAVGB" 1(%1, %3), %%mm1 \n\t"
PAVGB" 9(%1), %%mm2 \n\t" PAVGB" 9(%1), %%mm2 \n\t"
PAVGB" 9(%1, %3), %%mm3 \n\t" PAVGB" 9(%1, %3), %%mm3 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"movq %%mm0, (%2) \n\t" "movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t" "movq %%mm1, (%2, %3) \n\t"
"movq %%mm2, 8(%2) \n\t" "movq %%mm2, 8(%2) \n\t"
"movq %%mm3, 8(%2, %3) \n\t" "movq %%mm3, 8(%2, %3) \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block) :"+g"(h), "+S"(pixels), "+D"(block)
:"r" (line_size) :"r" ((long)line_size)
:"%eax", "memory"); :"%"REG_a, "memory");
} }
static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
@ -376,30 +376,30 @@ static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
"movq 8(%1), %%mm1 \n\t" "movq 8(%1), %%mm1 \n\t"
PAVGB" (%2), %%mm0 \n\t" PAVGB" (%2), %%mm0 \n\t"
PAVGB" 8(%2), %%mm1 \n\t" PAVGB" 8(%2), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"addl $16, %2 \n\t" "add $16, %2 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"movq %%mm1, 8(%3) \n\t" "movq %%mm1, 8(%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"decl %0 \n\t" "decl %0 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm1 \n\t" "movq 8(%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGB" (%2), %%mm0 \n\t" PAVGB" (%2), %%mm0 \n\t"
PAVGB" 8(%2), %%mm1 \n\t" PAVGB" 8(%2), %%mm1 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"movq %%mm1, 8(%3) \n\t" "movq %%mm1, 8(%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm1 \n\t" "movq 8(%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGB" 16(%2), %%mm0 \n\t" PAVGB" 16(%2), %%mm0 \n\t"
PAVGB" 24(%2), %%mm1 \n\t" PAVGB" 24(%2), %%mm1 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"movq %%mm1, 8(%3) \n\t" "movq %%mm1, 8(%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"addl $32, %2 \n\t" "add $32, %2 \n\t"
"subl $2, %0 \n\t" "subl $2, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
@ -407,7 +407,7 @@ static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
#else #else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif #endif
:"S"(src1Stride), "D"(dstStride) :"S"((long)src1Stride), "D"((long)dstStride)
:"memory"); :"memory");
//the following should be used, though better not with gcc ... //the following should be used, though better not with gcc ...
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
@ -424,36 +424,36 @@ static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
"movq 8(%1), %%mm1 \n\t" "movq 8(%1), %%mm1 \n\t"
PAVGB" (%2), %%mm0 \n\t" PAVGB" (%2), %%mm0 \n\t"
PAVGB" 8(%2), %%mm1 \n\t" PAVGB" 8(%2), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"addl $16, %2 \n\t" "add $16, %2 \n\t"
PAVGB" (%3), %%mm0 \n\t" PAVGB" (%3), %%mm0 \n\t"
PAVGB" 8(%3), %%mm1 \n\t" PAVGB" 8(%3), %%mm1 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"movq %%mm1, 8(%3) \n\t" "movq %%mm1, 8(%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"decl %0 \n\t" "decl %0 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm1 \n\t" "movq 8(%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGB" (%2), %%mm0 \n\t" PAVGB" (%2), %%mm0 \n\t"
PAVGB" 8(%2), %%mm1 \n\t" PAVGB" 8(%2), %%mm1 \n\t"
PAVGB" (%3), %%mm0 \n\t" PAVGB" (%3), %%mm0 \n\t"
PAVGB" 8(%3), %%mm1 \n\t" PAVGB" 8(%3), %%mm1 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"movq %%mm1, 8(%3) \n\t" "movq %%mm1, 8(%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm1 \n\t" "movq 8(%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGB" 16(%2), %%mm0 \n\t" PAVGB" 16(%2), %%mm0 \n\t"
PAVGB" 24(%2), %%mm1 \n\t" PAVGB" 24(%2), %%mm1 \n\t"
PAVGB" (%3), %%mm0 \n\t" PAVGB" (%3), %%mm0 \n\t"
PAVGB" 8(%3), %%mm1 \n\t" PAVGB" 8(%3), %%mm1 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"movq %%mm1, 8(%3) \n\t" "movq %%mm1, 8(%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"addl $32, %2 \n\t" "add $32, %2 \n\t"
"subl $2, %0 \n\t" "subl $2, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
@ -461,7 +461,7 @@ static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
#else #else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif #endif
:"S"(src1Stride), "D"(dstStride) :"S"((long)src1Stride), "D"((long)dstStride)
:"memory"); :"memory");
//the following should be used, though better not with gcc ... //the following should be used, though better not with gcc ...
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
@ -487,16 +487,16 @@ static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *sr
PAVGB" %%mm3, %%mm1 \n\t" PAVGB" %%mm3, %%mm1 \n\t"
"pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm0 \n\t"
"pxor %%mm6, %%mm1 \n\t" "pxor %%mm6, %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"addl $16, %2 \n\t" "add $16, %2 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"movq %%mm1, 8(%3) \n\t" "movq %%mm1, 8(%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"decl %0 \n\t" "decl %0 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm1 \n\t" "movq 8(%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movq (%2), %%mm2 \n\t" "movq (%2), %%mm2 \n\t"
"movq 8(%2), %%mm3 \n\t" "movq 8(%2), %%mm3 \n\t"
"pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm0 \n\t"
@ -509,10 +509,10 @@ static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *sr
"pxor %%mm6, %%mm1 \n\t" "pxor %%mm6, %%mm1 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"movq %%mm1, 8(%3) \n\t" "movq %%mm1, 8(%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm1 \n\t" "movq 8(%1), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movq 16(%2), %%mm2 \n\t" "movq 16(%2), %%mm2 \n\t"
"movq 24(%2), %%mm3 \n\t" "movq 24(%2), %%mm3 \n\t"
"pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm0 \n\t"
@ -525,8 +525,8 @@ static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *sr
"pxor %%mm6, %%mm1 \n\t" "pxor %%mm6, %%mm1 \n\t"
"movq %%mm0, (%3) \n\t" "movq %%mm0, (%3) \n\t"
"movq %%mm1, 8(%3) \n\t" "movq %%mm1, 8(%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"addl $32, %2 \n\t" "add $32, %2 \n\t"
"subl $2, %0 \n\t" "subl $2, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
@ -534,7 +534,7 @@ static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *sr
#else #else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif #endif
:"S"(src1Stride), "D"(dstStride) :"S"((long)src1Stride), "D"((long)dstStride)
:"memory"); :"memory");
//the following should be used, though better not with gcc ... //the following should be used, though better not with gcc ...
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
@ -547,13 +547,13 @@ static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, in
{ {
MOVQ_BONE(mm6); MOVQ_BONE(mm6);
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm2 \n\t" "movq (%1, %3), %%mm2 \n\t"
"movq 1(%1), %%mm1 \n\t" "movq 1(%1), %%mm1 \n\t"
"movq 1(%1, %3), %%mm3 \n\t" "movq 1(%1, %3), %%mm3 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"psubusb %%mm6, %%mm0 \n\t" "psubusb %%mm6, %%mm0 \n\t"
"psubusb %%mm6, %%mm2 \n\t" "psubusb %%mm6, %%mm2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm1, %%mm0 \n\t"
@ -564,50 +564,50 @@ static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, in
"movq 1(%1), %%mm1 \n\t" "movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t" "movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t" "movq 1(%1, %3), %%mm3 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"psubusb %%mm6, %%mm0 \n\t" "psubusb %%mm6, %%mm0 \n\t"
"psubusb %%mm6, %%mm2 \n\t" "psubusb %%mm6, %%mm2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" %%mm3, %%mm2 \n\t" PAVGB" %%mm3, %%mm2 \n\t"
"movq %%mm0, (%2) \n\t" "movq %%mm0, (%2) \n\t"
"movq %%mm2, (%2, %3) \n\t" "movq %%mm2, (%2, %3) \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block) :"+g"(h), "+S"(pixels), "+D"(block)
:"r" (line_size) :"r" ((long)line_size)
:"%eax", "memory"); :"%"REG_a, "memory");
} }
static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{ {
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"subl %3, %2 \n\t" "sub %3, %2 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%eax), %%mm2 \n\t" "movq (%1, %%"REG_a"), %%mm2 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" %%mm2, %%mm1 \n\t" PAVGB" %%mm2, %%mm1 \n\t"
"movq %%mm0, (%2, %3) \n\t" "movq %%mm0, (%2, %3) \n\t"
"movq %%mm1, (%2, %%eax) \n\t" "movq %%mm1, (%2, %%"REG_a") \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%eax), %%mm0 \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
PAVGB" %%mm1, %%mm2 \n\t" PAVGB" %%mm1, %%mm2 \n\t"
PAVGB" %%mm0, %%mm1 \n\t" PAVGB" %%mm0, %%mm1 \n\t"
"movq %%mm2, (%2, %3) \n\t" "movq %%mm2, (%2, %3) \n\t"
"movq %%mm1, (%2, %%eax) \n\t" "movq %%mm1, (%2, %%"REG_a") \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D" (block) :"+g"(h), "+S"(pixels), "+D" (block)
:"r" (line_size) :"r" ((long)line_size)
:"%eax", "memory"); :"%"REG_a, "memory");
} }
/* GL: this function does incorrect rounding if overflow */ /* GL: this function does incorrect rounding if overflow */
@ -615,39 +615,39 @@ static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, in
{ {
MOVQ_BONE(mm6); MOVQ_BONE(mm6);
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"subl %3, %2 \n\t" "sub %3, %2 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%eax), %%mm2 \n\t" "movq (%1, %%"REG_a"), %%mm2 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"psubusb %%mm6, %%mm1 \n\t" "psubusb %%mm6, %%mm1 \n\t"
PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" %%mm2, %%mm1 \n\t" PAVGB" %%mm2, %%mm1 \n\t"
"movq %%mm0, (%2, %3) \n\t" "movq %%mm0, (%2, %3) \n\t"
"movq %%mm1, (%2, %%eax) \n\t" "movq %%mm1, (%2, %%"REG_a") \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%eax), %%mm0 \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"psubusb %%mm6, %%mm1 \n\t" "psubusb %%mm6, %%mm1 \n\t"
PAVGB" %%mm1, %%mm2 \n\t" PAVGB" %%mm1, %%mm2 \n\t"
PAVGB" %%mm0, %%mm1 \n\t" PAVGB" %%mm0, %%mm1 \n\t"
"movq %%mm2, (%2, %3) \n\t" "movq %%mm2, (%2, %3) \n\t"
"movq %%mm1, (%2, %%eax) \n\t" "movq %%mm1, (%2, %%"REG_a") \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D" (block) :"+g"(h), "+S"(pixels), "+D" (block)
:"r" (line_size) :"r" ((long)line_size)
:"%eax", "memory"); :"%"REG_a, "memory");
} }
static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{ {
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
"1: \n\t" "1: \n\t"
"movq (%2), %%mm0 \n\t" "movq (%2), %%mm0 \n\t"
"movq (%2, %3), %%mm1 \n\t" "movq (%2, %3), %%mm1 \n\t"
@ -655,27 +655,27 @@ static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_siz
PAVGB" (%1, %3), %%mm1 \n\t" PAVGB" (%1, %3), %%mm1 \n\t"
"movq %%mm0, (%2) \n\t" "movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t" "movq %%mm1, (%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"movq (%2), %%mm0 \n\t" "movq (%2), %%mm0 \n\t"
"movq (%2, %3), %%mm1 \n\t" "movq (%2, %3), %%mm1 \n\t"
PAVGB" (%1), %%mm0 \n\t" PAVGB" (%1), %%mm0 \n\t"
PAVGB" (%1, %3), %%mm1 \n\t" PAVGB" (%1, %3), %%mm1 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"movq %%mm0, (%2) \n\t" "movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t" "movq %%mm1, (%2, %3) \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block) :"+g"(h), "+S"(pixels), "+D"(block)
:"r" (line_size) :"r" ((long)line_size)
:"%eax", "memory"); :"%"REG_a, "memory");
} }
static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{ {
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm2 \n\t" "movq (%1, %3), %%mm2 \n\t"
@ -683,63 +683,63 @@ static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_
PAVGB" 1(%1, %3), %%mm2 \n\t" PAVGB" 1(%1, %3), %%mm2 \n\t"
PAVGB" (%2), %%mm0 \n\t" PAVGB" (%2), %%mm0 \n\t"
PAVGB" (%2, %3), %%mm2 \n\t" PAVGB" (%2, %3), %%mm2 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"movq %%mm0, (%2) \n\t" "movq %%mm0, (%2) \n\t"
"movq %%mm2, (%2, %3) \n\t" "movq %%mm2, (%2, %3) \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm2 \n\t" "movq (%1, %3), %%mm2 \n\t"
PAVGB" 1(%1), %%mm0 \n\t" PAVGB" 1(%1), %%mm0 \n\t"
PAVGB" 1(%1, %3), %%mm2 \n\t" PAVGB" 1(%1, %3), %%mm2 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
PAVGB" (%2), %%mm0 \n\t" PAVGB" (%2), %%mm0 \n\t"
PAVGB" (%2, %3), %%mm2 \n\t" PAVGB" (%2, %3), %%mm2 \n\t"
"movq %%mm0, (%2) \n\t" "movq %%mm0, (%2) \n\t"
"movq %%mm2, (%2, %3) \n\t" "movq %%mm2, (%2, %3) \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block) :"+g"(h), "+S"(pixels), "+D"(block)
:"r" (line_size) :"r" ((long)line_size)
:"%eax", "memory"); :"%"REG_a, "memory");
} }
static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{ {
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"subl %3, %2 \n\t" "sub %3, %2 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%eax), %%mm2 \n\t" "movq (%1, %%"REG_a"), %%mm2 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" %%mm2, %%mm1 \n\t" PAVGB" %%mm2, %%mm1 \n\t"
"movq (%2, %3), %%mm3 \n\t" "movq (%2, %3), %%mm3 \n\t"
"movq (%2, %%eax), %%mm4 \n\t" "movq (%2, %%"REG_a"), %%mm4 \n\t"
PAVGB" %%mm3, %%mm0 \n\t" PAVGB" %%mm3, %%mm0 \n\t"
PAVGB" %%mm4, %%mm1 \n\t" PAVGB" %%mm4, %%mm1 \n\t"
"movq %%mm0, (%2, %3) \n\t" "movq %%mm0, (%2, %3) \n\t"
"movq %%mm1, (%2, %%eax) \n\t" "movq %%mm1, (%2, %%"REG_a") \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%eax), %%mm0 \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t"
PAVGB" %%mm1, %%mm2 \n\t" PAVGB" %%mm1, %%mm2 \n\t"
PAVGB" %%mm0, %%mm1 \n\t" PAVGB" %%mm0, %%mm1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"movq (%2, %3), %%mm3 \n\t" "movq (%2, %3), %%mm3 \n\t"
"movq (%2, %%eax), %%mm4 \n\t" "movq (%2, %%"REG_a"), %%mm4 \n\t"
PAVGB" %%mm3, %%mm2 \n\t" PAVGB" %%mm3, %%mm2 \n\t"
PAVGB" %%mm4, %%mm1 \n\t" PAVGB" %%mm4, %%mm1 \n\t"
"movq %%mm2, (%2, %3) \n\t" "movq %%mm2, (%2, %3) \n\t"
"movq %%mm1, (%2, %%eax) \n\t" "movq %%mm1, (%2, %%"REG_a") \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block) :"+g"(h), "+S"(pixels), "+D"(block)
:"r" (line_size) :"r" ((long)line_size)
:"%eax", "memory"); :"%"REG_a, "memory");
} }
// Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter
@ -747,17 +747,17 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line
{ {
MOVQ_BONE(mm6); MOVQ_BONE(mm6);
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
PAVGB" 1(%1), %%mm0 \n\t" PAVGB" 1(%1), %%mm0 \n\t"
".balign 8 \n\t" ".balign 8 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1, %%eax), %%mm2 \n\t" "movq (%1, %%"REG_a"), %%mm2 \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"psubusb %%mm6, %%mm2 \n\t" "psubusb %%mm6, %%mm2 \n\t"
PAVGB" 1(%1, %3), %%mm1 \n\t" PAVGB" 1(%1, %3), %%mm1 \n\t"
PAVGB" 1(%1, %%eax), %%mm2 \n\t" PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" %%mm2, %%mm1 \n\t" PAVGB" %%mm2, %%mm1 \n\t"
PAVGB" (%2), %%mm0 \n\t" PAVGB" (%2), %%mm0 \n\t"
@ -765,23 +765,23 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line
"movq %%mm0, (%2) \n\t" "movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t" "movq %%mm1, (%2, %3) \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%eax), %%mm0 \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t"
PAVGB" 1(%1, %3), %%mm1 \n\t" PAVGB" 1(%1, %3), %%mm1 \n\t"
PAVGB" 1(%1, %%eax), %%mm0 \n\t" PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
PAVGB" %%mm1, %%mm2 \n\t" PAVGB" %%mm1, %%mm2 \n\t"
PAVGB" %%mm0, %%mm1 \n\t" PAVGB" %%mm0, %%mm1 \n\t"
PAVGB" (%2), %%mm2 \n\t" PAVGB" (%2), %%mm2 \n\t"
PAVGB" (%2, %3), %%mm1 \n\t" PAVGB" (%2, %3), %%mm1 \n\t"
"movq %%mm2, (%2) \n\t" "movq %%mm2, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t" "movq %%mm1, (%2, %3) \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block) :"+g"(h), "+S"(pixels), "+D"(block)
:"r" (line_size) :"r" ((long)line_size)
:"%eax", "memory"); :"%"REG_a, "memory");
} }
//FIXME the following could be optimized too ... //FIXME the following could be optimized too ...
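
Every hunk above applies the same three-part recipe: instructions doing pointer arithmetic lose the 32-bit "l" suffix ("addl"/"subl"/"xorl" become plain "add"/"sub"/"xor", letting the assembler take the operand size from the registers), the hard-coded "%%eax" scratch register becomes "%%"REG_a"" (a macro expanding to "eax" or "rax"), and every int stride passed through an "r"/"S"/"D" constraint gains a (long) cast so it fills the whole register read by the addressing modes. A minimal sketch of the pattern, assuming the REG_a macro from the mmx.h hunk further down; copy8x4 is a hypothetical helper, not part of the patch, and like the real functions it leaves emms to the caller:

#include <stdint.h>

#ifdef ARCH_X86_64
#  define REG_a "rax"
#else
#  define REG_a "eax"
#endif

/* copy an 8x4 pixel block; illustration only */
static void copy8x4(uint8_t *dst, const uint8_t *src, int line_size)
{
    int h = 4;
    __asm__ __volatile__(
        "lea (%3, %3), %%"REG_a"    \n\t"  /* REG_a = 2*line_size (eax or rax)  */
        "1:                         \n\t"
        "movq (%1), %%mm0           \n\t"  /* load two rows of 8 pixels         */
        "movq (%1, %3), %%mm1       \n\t"
        "movq %%mm0, (%2)           \n\t"  /* store them                        */
        "movq %%mm1, (%2, %3)       \n\t"
        "add %%"REG_a", %1          \n\t"  /* pointer math: size-agnostic add   */
        "add %%"REG_a", %2          \n\t"
        "subl $2, %0                \n\t"  /* plain loop counter stays 32 bit   */
        "jnz 1b                     \n\t"
        :"+g"(h), "+S"(src), "+D"(dst)
        :"r"((long)line_size)              /* widened so (%1, %3) reads a full register */
        :"%"REG_a, "memory");
}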


@ -27,7 +27,7 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line
{ {
MOVQ_BFE(mm6); MOVQ_BFE(mm6);
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
".balign 8 \n\t" ".balign 8 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
@ -37,8 +37,8 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t" "movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t" "movq %%mm5, (%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t" "movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t" "movq (%1, %3), %%mm2 \n\t"
@ -46,13 +46,13 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t" "movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t" "movq %%mm5, (%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block) :"+g"(h), "+S"(pixels), "+D"(block)
:"r"(line_size) :"r"((long)line_size)
:"eax", "memory"); :REG_a, "memory");
} }
static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
@ -63,37 +63,37 @@ static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
" jz 1f \n\t" " jz 1f \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq (%2), %%mm1 \n\t" "movq (%2), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"addl $8, %2 \n\t" "add $8, %2 \n\t"
PAVGB(%%mm0, %%mm1, %%mm4, %%mm6) PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
"movq %%mm4, (%3) \n\t" "movq %%mm4, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"decl %0 \n\t" "decl %0 \n\t"
".balign 8 \n\t" ".balign 8 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq (%2), %%mm1 \n\t" "movq (%2), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movq (%1), %%mm2 \n\t" "movq (%1), %%mm2 \n\t"
"movq 8(%2), %%mm3 \n\t" "movq 8(%2), %%mm3 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3) \n\t" "movq %%mm4, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movq %%mm5, (%3) \n\t" "movq %%mm5, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq 16(%2), %%mm1 \n\t" "movq 16(%2), %%mm1 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"movq (%1), %%mm2 \n\t" "movq (%1), %%mm2 \n\t"
"movq 24(%2), %%mm3 \n\t" "movq 24(%2), %%mm3 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"addl $32, %2 \n\t" "add $32, %2 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3) \n\t" "movq %%mm4, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movq %%mm5, (%3) \n\t" "movq %%mm5, (%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
@ -101,7 +101,7 @@ static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
#else #else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif #endif
:"S"(src1Stride), "D"(dstStride) :"S"((long)src1Stride), "D"((long)dstStride)
:"memory"); :"memory");
} }
@ -109,7 +109,7 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int lin
{ {
MOVQ_BFE(mm6); MOVQ_BFE(mm6);
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
".balign 8 \n\t" ".balign 8 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
@ -126,8 +126,8 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int lin
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, 8(%2) \n\t" "movq %%mm4, 8(%2) \n\t"
"movq %%mm5, 8(%2, %3) \n\t" "movq %%mm5, 8(%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t" "movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t" "movq (%1, %3), %%mm2 \n\t"
@ -142,13 +142,13 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int lin
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, 8(%2) \n\t" "movq %%mm4, 8(%2) \n\t"
"movq %%mm5, 8(%2, %3) \n\t" "movq %%mm5, 8(%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block) :"+g"(h), "+S"(pixels), "+D"(block)
:"r"(line_size) :"r"((long)line_size)
:"eax", "memory"); :REG_a, "memory");
} }
static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
@ -161,12 +161,12 @@ static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, in
"movq (%2), %%mm1 \n\t" "movq (%2), %%mm1 \n\t"
"movq 8(%1), %%mm2 \n\t" "movq 8(%1), %%mm2 \n\t"
"movq 8(%2), %%mm3 \n\t" "movq 8(%2), %%mm3 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
"addl $16, %2 \n\t" "add $16, %2 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3) \n\t" "movq %%mm4, (%3) \n\t"
"movq %%mm5, 8(%3) \n\t" "movq %%mm5, 8(%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"decl %0 \n\t" "decl %0 \n\t"
".balign 8 \n\t" ".balign 8 \n\t"
"1: \n\t" "1: \n\t"
@ -174,21 +174,21 @@ static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, in
"movq (%2), %%mm1 \n\t" "movq (%2), %%mm1 \n\t"
"movq 8(%1), %%mm2 \n\t" "movq 8(%1), %%mm2 \n\t"
"movq 8(%2), %%mm3 \n\t" "movq 8(%2), %%mm3 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3) \n\t" "movq %%mm4, (%3) \n\t"
"movq %%mm5, 8(%3) \n\t" "movq %%mm5, 8(%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
"movq 16(%2), %%mm1 \n\t" "movq 16(%2), %%mm1 \n\t"
"movq 8(%1), %%mm2 \n\t" "movq 8(%1), %%mm2 \n\t"
"movq 24(%2), %%mm3 \n\t" "movq 24(%2), %%mm3 \n\t"
"addl %4, %1 \n\t" "add %4, %1 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3) \n\t" "movq %%mm4, (%3) \n\t"
"movq %%mm5, 8(%3) \n\t" "movq %%mm5, 8(%3) \n\t"
"addl %5, %3 \n\t" "add %5, %3 \n\t"
"addl $32, %2 \n\t" "add $32, %2 \n\t"
"subl $2, %0 \n\t" "subl $2, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used
@ -196,7 +196,7 @@ static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, in
#else #else
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif #endif
:"S"(src1Stride), "D"(dstStride) :"S"((long)src1Stride), "D"((long)dstStride)
:"memory"); :"memory");
} }
@ -204,29 +204,29 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line
{ {
MOVQ_BFE(mm6); MOVQ_BFE(mm6);
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
".balign 8 \n\t" ".balign 8 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%eax),%%mm2 \n\t" "movq (%1, %%"REG_a"),%%mm2 \n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t" "movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t" "movq %%mm5, (%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%eax),%%mm0 \n\t" "movq (%1, %%"REG_a"),%%mm0 \n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t" "movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t" "movq %%mm5, (%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block) :"+g"(h), "+S"(pixels), "+D"(block)
:"r"(line_size) :"r"((long)line_size)
:"eax", "memory"); :REG_a, "memory");
} }
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ -244,12 +244,12 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
"punpckhbw %%mm7, %%mm5 \n\t" "punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t" "paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t" "paddusw %%mm1, %%mm5 \n\t"
"xorl %%eax, %%eax \n\t" "xor %%"REG_a", %%"REG_a" \n\t"
"addl %3, %1 \n\t" "add %3, %1 \n\t"
".balign 8 \n\t" ".balign 8 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq 1(%1, %%eax), %%mm2 \n\t" "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t" "movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t" "movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t" "punpcklbw %%mm7, %%mm0 \n\t"
@ -265,11 +265,11 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
"psrlw $2, %%mm4 \n\t" "psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm5 \n\t" "psrlw $2, %%mm5 \n\t"
"packuswb %%mm5, %%mm4 \n\t" "packuswb %%mm5, %%mm4 \n\t"
"movq %%mm4, (%2, %%eax) \n\t" "movq %%mm4, (%2, %%"REG_a") \n\t"
"addl %3, %%eax \n\t" "add %3, %%"REG_a" \n\t"
"movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
"movq 1(%1, %%eax), %%mm4 \n\t" "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t" "movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t" "movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" "punpcklbw %%mm7, %%mm2 \n\t"
@ -285,14 +285,14 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
"psrlw $2, %%mm0 \n\t" "psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t" "psrlw $2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t" "packuswb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%2, %%eax) \n\t" "movq %%mm0, (%2, %%"REG_a") \n\t"
"addl %3, %%eax \n\t" "add %3, %%"REG_a" \n\t"
"subl $2, %0 \n\t" "subl $2, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels) :"+g"(h), "+S"(pixels)
:"D"(block), "r"(line_size) :"D"(block), "r"((long)line_size)
:"eax", "memory"); :REG_a, "memory");
} }
// avg_pixels // avg_pixels
@ -456,12 +456,12 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line
{ {
MOVQ_BFE(mm6); MOVQ_BFE(mm6);
__asm __volatile( __asm __volatile(
"lea (%3, %3), %%eax \n\t" "lea (%3, %3), %%"REG_a" \n\t"
"movq (%1), %%mm0 \n\t" "movq (%1), %%mm0 \n\t"
".balign 8 \n\t" ".balign 8 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%eax), %%mm2 \n\t" "movq (%1, %%"REG_a"), %%mm2 \n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t" "movq (%2), %%mm3 \n\t"
PAVGB(%%mm3, %%mm4, %%mm0, %%mm6) PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
@ -469,11 +469,11 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line
PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
"movq %%mm0, (%2) \n\t" "movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t" "movq %%mm1, (%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"movq (%1, %3), %%mm1 \n\t" "movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%eax), %%mm0 \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t" "movq (%2), %%mm3 \n\t"
PAVGB(%%mm3, %%mm4, %%mm2, %%mm6) PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
@ -481,14 +481,14 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line
PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
"movq %%mm2, (%2) \n\t" "movq %%mm2, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t" "movq %%mm1, (%2, %3) \n\t"
"addl %%eax, %1 \n\t" "add %%"REG_a", %1 \n\t"
"addl %%eax, %2 \n\t" "add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t" "subl $4, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block) :"+g"(h), "+S"(pixels), "+D"(block)
:"r"(line_size) :"r"((long)line_size)
:"eax", "memory"); :REG_a, "memory");
} }
// this routine is 'slightly' suboptimal but mostly unused // this routine is 'slightly' suboptimal but mostly unused
@ -507,12 +507,12 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
"punpckhbw %%mm7, %%mm5 \n\t" "punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t" "paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t" "paddusw %%mm1, %%mm5 \n\t"
"xorl %%eax, %%eax \n\t" "xor %%"REG_a", %%"REG_a" \n\t"
"addl %3, %1 \n\t" "add %3, %1 \n\t"
".balign 8 \n\t" ".balign 8 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq 1(%1, %%eax), %%mm2 \n\t" "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t" "movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t" "movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t" "punpcklbw %%mm7, %%mm0 \n\t"
@ -527,16 +527,16 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
"paddusw %%mm1, %%mm5 \n\t" "paddusw %%mm1, %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t" "psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm5 \n\t" "psrlw $2, %%mm5 \n\t"
"movq (%2, %%eax), %%mm3 \n\t" "movq (%2, %%"REG_a"), %%mm3 \n\t"
"packuswb %%mm5, %%mm4 \n\t" "packuswb %%mm5, %%mm4 \n\t"
"pcmpeqd %%mm2, %%mm2 \n\t" "pcmpeqd %%mm2, %%mm2 \n\t"
"paddb %%mm2, %%mm2 \n\t" "paddb %%mm2, %%mm2 \n\t"
PAVGB(%%mm3, %%mm4, %%mm5, %%mm2) PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
"movq %%mm5, (%2, %%eax) \n\t" "movq %%mm5, (%2, %%"REG_a") \n\t"
"addl %3, %%eax \n\t" "add %3, %%"REG_a" \n\t"
"movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
"movq 1(%1, %%eax), %%mm4 \n\t" "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t" "movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t" "movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" "punpcklbw %%mm7, %%mm2 \n\t"
@ -551,19 +551,19 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
"paddusw %%mm5, %%mm1 \n\t" "paddusw %%mm5, %%mm1 \n\t"
"psrlw $2, %%mm0 \n\t" "psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t" "psrlw $2, %%mm1 \n\t"
"movq (%2, %%eax), %%mm3 \n\t" "movq (%2, %%"REG_a"), %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t" "packuswb %%mm1, %%mm0 \n\t"
"pcmpeqd %%mm2, %%mm2 \n\t" "pcmpeqd %%mm2, %%mm2 \n\t"
"paddb %%mm2, %%mm2 \n\t" "paddb %%mm2, %%mm2 \n\t"
PAVGB(%%mm3, %%mm0, %%mm1, %%mm2) PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
"movq %%mm1, (%2, %%eax) \n\t" "movq %%mm1, (%2, %%"REG_a") \n\t"
"addl %3, %%eax \n\t" "add %3, %%"REG_a" \n\t"
"subl $2, %0 \n\t" "subl $2, %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
:"+g"(h), "+S"(pixels) :"+g"(h), "+S"(pixels)
:"D"(block), "r"(line_size) :"D"(block), "r"((long)line_size)
:"eax", "memory"); :REG_a, "memory");
} }
//FIXME optimize //FIXME optimize

View File

@ -47,13 +47,13 @@ static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
23170, 23170, 23170, 23170, //cos * (2<<15) + 0.5 23170, 23170, 23170, 23170, //cos * (2<<15) + 0.5
}; };
static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL; static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW }; static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
struct struct
{ {
const long fdct_r_row_sse2[4] ATTR_ALIGN(16); const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
} fdct_r_row_sse2 ATTR_ALIGN(16)= } fdct_r_row_sse2 ATTR_ALIGN(16)=
{{ {{
RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
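
The only change in this file is to the constant types. On LP64 x86_64 a long is 8 bytes, so an array of long rounders would no longer form one packed 8-byte MMX word, and the 64-bit correction constant is likewise given a fixed-width type; int32_t/int64_t keep the in-memory layout identical on both architectures. A compile-time sketch of that layout assumption (ATTR_ALIGN mirrors the file's macro, RND_FRW_ROW is a placeholder value):

#include <stdint.h>

#define ATTR_ALIGN(n) __attribute__ ((aligned (n)))   /* mirrors the file's macro */
#define RND_FRW_ROW   (1 << 17)                       /* placeholder value only   */

static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = { RND_FRW_ROW, RND_FRW_ROW };

/* Two packed 32-bit rounders must occupy exactly one 8-byte MMX word; with
 * LP64 "long" elements the array would be 16 bytes and the movq loading it
 * would pick up the wrong words. */
typedef char rounder_is_one_mmx_word[sizeof(fdct_r_row) == 8 ? 1 : -1];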


@ -5,6 +5,12 @@
#ifndef AVCODEC_I386MMX_H #ifndef AVCODEC_I386MMX_H
#define AVCODEC_I386MMX_H #define AVCODEC_I386MMX_H
#ifdef ARCH_X86_64
# define REG_a "rax"
#else
# define REG_a "eax"
#endif
/* /*
* The type of an value that fits in an MMX register (note that long * The type of an value that fits in an MMX register (note that long
* long constant values MUST be suffixed by LL and unsigned long long * long constant values MUST be suffixed by LL and unsigned long long
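
REG_a works by plain string-literal pasting: the preprocessor splices "eax" or "rax" into the instruction template before the assembler ever sees it, so one asm source line serves both ABIs, and the same trick covers clobber lists (:"%"REG_a). A tiny self-contained example of the expansion; read_zeroed_counter is hypothetical and not part of the header:

#ifdef ARCH_X86_64
#  define REG_a "rax"
#else
#  define REG_a "eax"
#endif

/* "xor %%"REG_a", %%"REG_a"\n\t" is seen by the compiler as
 *   "xor %%eax, %%eax\n\t"  when REG_a is "eax"
 *   "xor %%rax, %%rax\n\t"  when REG_a is "rax"                */
static inline long read_zeroed_counter(void)
{
    long i;
    __asm__ volatile("xor %%"REG_a", %%"REG_a"\n\t" : "=a"(i));
    return i;
}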


@ -20,6 +20,7 @@
* mostly by Michael Niedermayer <michaelni@gmx.at> * mostly by Michael Niedermayer <michaelni@gmx.at>
*/ */
#include "../dsputil.h" #include "../dsputil.h"
#include "mmx.h"
static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={ static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
0x0000000000000000ULL, 0x0000000000000000ULL,
@ -31,19 +32,19 @@ static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x010101010101
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{ {
int len= -(stride*h); long len= -(stride*h);
asm volatile( asm volatile(
".balign 16 \n\t" ".balign 16 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%eax), %%mm2 \n\t" "movq (%2, %%"REG_a"), %%mm2 \n\t"
"movq (%2, %%eax), %%mm4 \n\t" "movq (%2, %%"REG_a"), %%mm4 \n\t"
"addl %3, %%eax \n\t" "add %3, %%"REG_a" \n\t"
"psubusb %%mm0, %%mm2 \n\t" "psubusb %%mm0, %%mm2 \n\t"
"psubusb %%mm4, %%mm0 \n\t" "psubusb %%mm4, %%mm0 \n\t"
"movq (%1, %%eax), %%mm1 \n\t" "movq (%1, %%"REG_a"), %%mm1 \n\t"
"movq (%2, %%eax), %%mm3 \n\t" "movq (%2, %%"REG_a"), %%mm3 \n\t"
"movq (%2, %%eax), %%mm5 \n\t" "movq (%2, %%"REG_a"), %%mm5 \n\t"
"psubusb %%mm1, %%mm3 \n\t" "psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm5, %%mm1 \n\t" "psubusb %%mm5, %%mm1 \n\t"
"por %%mm2, %%mm0 \n\t" "por %%mm2, %%mm0 \n\t"
@ -58,116 +59,116 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
"paddw %%mm3, %%mm2 \n\t" "paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t" "paddw %%mm2, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t" "paddw %%mm0, %%mm6 \n\t"
"addl %3, %%eax \n\t" "add %3, %%"REG_a" \n\t"
" js 1b \n\t" " js 1b \n\t"
: "+a" (len) : "+a" (len)
: "r" (blk1 - len), "r" (blk2 - len), "r" (stride) : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
); );
} }
static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{ {
int len= -(stride*h); long len= -(stride*h);
asm volatile( asm volatile(
".balign 16 \n\t" ".balign 16 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%eax), %%mm2 \n\t" "movq (%2, %%"REG_a"), %%mm2 \n\t"
"psadbw %%mm2, %%mm0 \n\t" "psadbw %%mm2, %%mm0 \n\t"
"addl %3, %%eax \n\t" "add %3, %%"REG_a" \n\t"
"movq (%1, %%eax), %%mm1 \n\t" "movq (%1, %%"REG_a"), %%mm1 \n\t"
"movq (%2, %%eax), %%mm3 \n\t" "movq (%2, %%"REG_a"), %%mm3 \n\t"
"psadbw %%mm1, %%mm3 \n\t" "psadbw %%mm1, %%mm3 \n\t"
"paddw %%mm3, %%mm0 \n\t" "paddw %%mm3, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t" "paddw %%mm0, %%mm6 \n\t"
"addl %3, %%eax \n\t" "add %3, %%"REG_a" \n\t"
" js 1b \n\t" " js 1b \n\t"
: "+a" (len) : "+a" (len)
: "r" (blk1 - len), "r" (blk2 - len), "r" (stride) : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
); );
} }
static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{ {
int len= -(stride*h); long len= -(stride*h);
asm volatile( asm volatile(
".balign 16 \n\t" ".balign 16 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%eax), %%mm2 \n\t" "movq (%2, %%"REG_a"), %%mm2 \n\t"
"pavgb %%mm2, %%mm0 \n\t" "pavgb %%mm2, %%mm0 \n\t"
"movq (%3, %%eax), %%mm2 \n\t" "movq (%3, %%"REG_a"), %%mm2 \n\t"
"psadbw %%mm2, %%mm0 \n\t" "psadbw %%mm2, %%mm0 \n\t"
"addl %4, %%eax \n\t" "add %4, %%"REG_a" \n\t"
"movq (%1, %%eax), %%mm1 \n\t" "movq (%1, %%"REG_a"), %%mm1 \n\t"
"movq (%2, %%eax), %%mm3 \n\t" "movq (%2, %%"REG_a"), %%mm3 \n\t"
"pavgb %%mm1, %%mm3 \n\t" "pavgb %%mm1, %%mm3 \n\t"
"movq (%3, %%eax), %%mm1 \n\t" "movq (%3, %%"REG_a"), %%mm1 \n\t"
"psadbw %%mm1, %%mm3 \n\t" "psadbw %%mm1, %%mm3 \n\t"
"paddw %%mm3, %%mm0 \n\t" "paddw %%mm3, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t" "paddw %%mm0, %%mm6 \n\t"
"addl %4, %%eax \n\t" "add %4, %%"REG_a" \n\t"
" js 1b \n\t" " js 1b \n\t"
: "+a" (len) : "+a" (len)
: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride) : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
); );
} }
static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{ //FIXME reuse src { //FIXME reuse src
int len= -(stride*h); long len= -(stride*h);
asm volatile( asm volatile(
".balign 16 \n\t" ".balign 16 \n\t"
"movq "MANGLE(bone)", %%mm5 \n\t" "movq "MANGLE(bone)", %%mm5 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%eax), %%mm2 \n\t" "movq (%2, %%"REG_a"), %%mm2 \n\t"
"movq 1(%1, %%eax), %%mm1 \n\t" "movq 1(%1, %%"REG_a"), %%mm1 \n\t"
"movq 1(%2, %%eax), %%mm3 \n\t" "movq 1(%2, %%"REG_a"), %%mm3 \n\t"
"pavgb %%mm2, %%mm0 \n\t" "pavgb %%mm2, %%mm0 \n\t"
"pavgb %%mm1, %%mm3 \n\t" "pavgb %%mm1, %%mm3 \n\t"
"psubusb %%mm5, %%mm3 \n\t" "psubusb %%mm5, %%mm3 \n\t"
"pavgb %%mm3, %%mm0 \n\t" "pavgb %%mm3, %%mm0 \n\t"
"movq (%3, %%eax), %%mm2 \n\t" "movq (%3, %%"REG_a"), %%mm2 \n\t"
"psadbw %%mm2, %%mm0 \n\t" "psadbw %%mm2, %%mm0 \n\t"
"addl %4, %%eax \n\t" "add %4, %%"REG_a" \n\t"
"movq (%1, %%eax), %%mm1 \n\t" "movq (%1, %%"REG_a"), %%mm1 \n\t"
"movq (%2, %%eax), %%mm3 \n\t" "movq (%2, %%"REG_a"), %%mm3 \n\t"
"movq 1(%1, %%eax), %%mm2 \n\t" "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
"movq 1(%2, %%eax), %%mm4 \n\t" "movq 1(%2, %%"REG_a"), %%mm4 \n\t"
"pavgb %%mm3, %%mm1 \n\t" "pavgb %%mm3, %%mm1 \n\t"
"pavgb %%mm4, %%mm2 \n\t" "pavgb %%mm4, %%mm2 \n\t"
"psubusb %%mm5, %%mm2 \n\t" "psubusb %%mm5, %%mm2 \n\t"
"pavgb %%mm1, %%mm2 \n\t" "pavgb %%mm1, %%mm2 \n\t"
"movq (%3, %%eax), %%mm1 \n\t" "movq (%3, %%"REG_a"), %%mm1 \n\t"
"psadbw %%mm1, %%mm2 \n\t" "psadbw %%mm1, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t" "paddw %%mm2, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t" "paddw %%mm0, %%mm6 \n\t"
"addl %4, %%eax \n\t" "add %4, %%"REG_a" \n\t"
" js 1b \n\t" " js 1b \n\t"
: "+a" (len) : "+a" (len)
: "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride) : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride)
); );
} }
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{ {
int len= -(stride*h); long len= -(stride*h);
asm volatile( asm volatile(
".balign 16 \n\t" ".balign 16 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%eax), %%mm1 \n\t" "movq (%2, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%eax), %%mm2 \n\t" "movq (%1, %%"REG_a"), %%mm2 \n\t"
"movq (%2, %%eax), %%mm3 \n\t" "movq (%2, %%"REG_a"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t" "punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t" "punpcklbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm2 \n\t" "punpckhbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t" "punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm0, %%mm1 \n\t" "paddw %%mm0, %%mm1 \n\t"
"paddw %%mm2, %%mm3 \n\t" "paddw %%mm2, %%mm3 \n\t"
"movq (%3, %%eax), %%mm4 \n\t" "movq (%3, %%"REG_a"), %%mm4 \n\t"
"movq (%3, %%eax), %%mm2 \n\t" "movq (%3, %%"REG_a"), %%mm2 \n\t"
"paddw %%mm5, %%mm1 \n\t" "paddw %%mm5, %%mm1 \n\t"
"paddw %%mm5, %%mm3 \n\t" "paddw %%mm5, %%mm3 \n\t"
"psrlw $1, %%mm1 \n\t" "psrlw $1, %%mm1 \n\t"
@ -181,21 +182,21 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int
"punpckhbw %%mm7, %%mm1 \n\t" "punpckhbw %%mm7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t" "paddw %%mm1, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t" "paddw %%mm0, %%mm6 \n\t"
"addl %4, %%eax \n\t" "add %4, %%"REG_a" \n\t"
" js 1b \n\t" " js 1b \n\t"
: "+a" (len) : "+a" (len)
: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride) : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
); );
} }
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{ {
int len= -(stride*h); long len= -(stride*h);
asm volatile( asm volatile(
".balign 16 \n\t" ".balign 16 \n\t"
"1: \n\t" "1: \n\t"
"movq (%1, %%eax), %%mm0 \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%eax), %%mm1 \n\t" "movq (%2, %%"REG_a"), %%mm1 \n\t"
"movq %%mm0, %%mm4 \n\t" "movq %%mm0, %%mm4 \n\t"
"movq %%mm1, %%mm2 \n\t" "movq %%mm1, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t" "punpcklbw %%mm7, %%mm0 \n\t"
@ -204,8 +205,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
"punpckhbw %%mm7, %%mm2 \n\t" "punpckhbw %%mm7, %%mm2 \n\t"
"paddw %%mm1, %%mm0 \n\t" "paddw %%mm1, %%mm0 \n\t"
"paddw %%mm2, %%mm4 \n\t" "paddw %%mm2, %%mm4 \n\t"
"movq 1(%1, %%eax), %%mm2 \n\t" "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
"movq 1(%2, %%eax), %%mm3 \n\t" "movq 1(%2, %%"REG_a"), %%mm3 \n\t"
"movq %%mm2, %%mm1 \n\t" "movq %%mm2, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t" "punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t" "punpckhbw %%mm7, %%mm1 \n\t"
@ -216,8 +217,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
"punpckhbw %%mm7, %%mm4 \n\t" "punpckhbw %%mm7, %%mm4 \n\t"
"paddw %%mm3, %%mm2 \n\t" "paddw %%mm3, %%mm2 \n\t"
"paddw %%mm4, %%mm1 \n\t" "paddw %%mm4, %%mm1 \n\t"
"movq (%3, %%eax), %%mm3 \n\t" "movq (%3, %%"REG_a"), %%mm3 \n\t"
"movq (%3, %%eax), %%mm4 \n\t" "movq (%3, %%"REG_a"), %%mm4 \n\t"
"paddw %%mm5, %%mm2 \n\t" "paddw %%mm5, %%mm2 \n\t"
"paddw %%mm5, %%mm1 \n\t" "paddw %%mm5, %%mm1 \n\t"
"psrlw $2, %%mm2 \n\t" "psrlw $2, %%mm2 \n\t"
@ -231,10 +232,10 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
"punpckhbw %%mm7, %%mm2 \n\t" "punpckhbw %%mm7, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t" "paddw %%mm2, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t" "paddw %%mm0, %%mm6 \n\t"
"addl %4, %%eax \n\t" "add %4, %%"REG_a" \n\t"
" js 1b \n\t" " js 1b \n\t"
: "+a" (len) : "+a" (len)
: "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" (stride) : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((long)stride)
); );
} }
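
Besides the register renaming, every SAD kernel in this file widens its loop variable from int to long. The code keeps len = -(stride*h) in the index register of a (%1, %%REG_a) addressing mode and counts it up toward zero, so on x86_64 it must sit, sign-extended, in the full 64-bit register; a 32-bit int would not carry its sign into the upper half of rax and the computed addresses would land far outside the block. The (long) casts on the stride operands exist for the same reason, since "add %4, %%"REG_a"" needs a source register of matching width. A plain-C sketch of that negative-offset idiom (sad8_c is a hypothetical helper and walks one row per iteration instead of the two the asm does):

#include <stdint.h>

static int sad8_c(const uint8_t *blk1, const uint8_t *blk2, long stride, int h)
{
    long len = -(stride * h);          /* negative byte offset over the block */
    const uint8_t *end1 = blk1 - len;  /* one-past-the-end row pointers       */
    const uint8_t *end2 = blk2 - len;
    int sum = 0;

    while (len < 0) {                  /* mirrors "add stride / js 1b"        */
        for (int x = 0; x < 8; x++) {
            int d = end1[len + x] - end2[len + x];
            sum += d < 0 ? -d : d;
        }
        len += stride;
    }
    return sum;
}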


@ -23,6 +23,7 @@
#include "../dsputil.h" #include "../dsputil.h"
#include "../mpegvideo.h" #include "../mpegvideo.h"
#include "../avcodec.h" #include "../avcodec.h"
#include "mmx.h"
extern uint8_t zigzag_direct_noperm[64]; extern uint8_t zigzag_direct_noperm[64];
extern uint16_t inv_zigzag_direct16[64]; extern uint16_t inv_zigzag_direct16[64];
@ -34,7 +35,7 @@ static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x000
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale) DCTELEM *block, int n, int qscale)
{ {
int level, qmul, qadd, nCoeffs; long level, qmul, qadd, nCoeffs;
qmul = qscale << 1; qmul = qscale << 1;
@ -97,7 +98,7 @@ asm volatile(
"movq %%mm0, (%0, %3) \n\t" "movq %%mm0, (%0, %3) \n\t"
"movq %%mm1, 8(%0, %3) \n\t" "movq %%mm1, 8(%0, %3) \n\t"
"addl $16, %3 \n\t" "add $16, %3 \n\t"
"jng 1b \n\t" "jng 1b \n\t"
::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
: "memory" : "memory"
@ -109,7 +110,7 @@ asm volatile(
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale) DCTELEM *block, int n, int qscale)
{ {
int qmul, qadd, nCoeffs; long qmul, qadd, nCoeffs;
qmul = qscale << 1; qmul = qscale << 1;
qadd = (qscale - 1) | 1; qadd = (qscale - 1) | 1;
@ -160,7 +161,7 @@ asm volatile(
"movq %%mm0, (%0, %3) \n\t" "movq %%mm0, (%0, %3) \n\t"
"movq %%mm1, 8(%0, %3) \n\t" "movq %%mm1, 8(%0, %3) \n\t"
"addl $16, %3 \n\t" "add $16, %3 \n\t"
"jng 1b \n\t" "jng 1b \n\t"
::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
: "memory" : "memory"
@ -200,7 +201,7 @@ asm volatile(
static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale) DCTELEM *block, int n, int qscale)
{ {
int nCoeffs; long nCoeffs;
const uint16_t *quant_matrix; const uint16_t *quant_matrix;
int block0; int block0;
@ -220,13 +221,13 @@ asm volatile(
"movd %2, %%mm6 \n\t" "movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t"
"movl %3, %%eax \n\t" "mov %3, %%"REG_a" \n\t"
".balign 16\n\t" ".balign 16\n\t"
"1: \n\t" "1: \n\t"
"movq (%0, %%eax), %%mm0 \n\t" "movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%eax), %%mm1 \n\t" "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%eax), %%mm4 \n\t" "movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%eax), %%mm5 \n\t" "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t" "pxor %%mm2, %%mm2 \n\t"
@ -241,8 +242,8 @@ asm volatile(
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t" "pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow "pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $3, %%mm0 \n\t" "psraw $3, %%mm0 \n\t"
"psraw $3, %%mm1 \n\t" "psraw $3, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t" "psubw %%mm7, %%mm0 \n\t"
@ -255,13 +256,13 @@ asm volatile(
"psubw %%mm3, %%mm1 \n\t" "psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t" "pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t" "pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%eax) \n\t" "movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%eax) \n\t" "movq %%mm5, 8(%0, %%"REG_a") \n\t"
"addl $16, %%eax \n\t" "add $16, %%"REG_a" \n\t"
"js 1b \n\t" "js 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
: "%eax", "memory" : "%"REG_a, "memory"
); );
block[0]= block0; block[0]= block0;
} }
@ -269,7 +270,7 @@ asm volatile(
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale) DCTELEM *block, int n, int qscale)
{ {
int nCoeffs; long nCoeffs;
const uint16_t *quant_matrix; const uint16_t *quant_matrix;
assert(s->block_last_index[n]>=0); assert(s->block_last_index[n]>=0);
@ -283,13 +284,13 @@ asm volatile(
"movd %2, %%mm6 \n\t" "movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t"
"movl %3, %%eax \n\t" "mov %3, %%"REG_a" \n\t"
".balign 16\n\t" ".balign 16\n\t"
"1: \n\t" "1: \n\t"
"movq (%0, %%eax), %%mm0 \n\t" "movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%eax), %%mm1 \n\t" "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%eax), %%mm4 \n\t" "movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%eax), %%mm5 \n\t" "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t" "pxor %%mm2, %%mm2 \n\t"
@ -308,8 +309,8 @@ asm volatile(
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t" "pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow "pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $4, %%mm0 \n\t" "psraw $4, %%mm0 \n\t"
"psraw $4, %%mm1 \n\t" "psraw $4, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t" "psubw %%mm7, %%mm0 \n\t"
@ -322,20 +323,20 @@ asm volatile(
"psubw %%mm3, %%mm1 \n\t" "psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t" "pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t" "pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%eax) \n\t" "movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%eax) \n\t" "movq %%mm5, 8(%0, %%"REG_a") \n\t"
"addl $16, %%eax \n\t" "add $16, %%"REG_a" \n\t"
"js 1b \n\t" "js 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
: "%eax", "memory" : "%"REG_a, "memory"
); );
} }
static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale) DCTELEM *block, int n, int qscale)
{ {
int nCoeffs; long nCoeffs;
const uint16_t *quant_matrix; const uint16_t *quant_matrix;
int block0; int block0;
@ -355,13 +356,13 @@ asm volatile(
"movd %2, %%mm6 \n\t" "movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t"
"movl %3, %%eax \n\t" "mov %3, %%"REG_a" \n\t"
".balign 16\n\t" ".balign 16\n\t"
"1: \n\t" "1: \n\t"
"movq (%0, %%eax), %%mm0 \n\t" "movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%eax), %%mm1 \n\t" "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%eax), %%mm4 \n\t" "movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%eax), %%mm5 \n\t" "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t" "pxor %%mm2, %%mm2 \n\t"
@ -376,8 +377,8 @@ asm volatile(
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t" "pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow "pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $3, %%mm0 \n\t" "psraw $3, %%mm0 \n\t"
"psraw $3, %%mm1 \n\t" "psraw $3, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t"
@ -386,13 +387,13 @@ asm volatile(
"psubw %%mm3, %%mm1 \n\t" "psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t" "pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t" "pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%eax) \n\t" "movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%eax) \n\t" "movq %%mm5, 8(%0, %%"REG_a") \n\t"
"addl $16, %%eax \n\t" "add $16, %%"REG_a" \n\t"
"jng 1b \n\t" "jng 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
: "%eax", "memory" : "%"REG_a, "memory"
); );
block[0]= block0; block[0]= block0;
//Note, we dont do mismatch control for intra as errors cannot accumulate //Note, we dont do mismatch control for intra as errors cannot accumulate
@ -401,7 +402,7 @@ asm volatile(
static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale) DCTELEM *block, int n, int qscale)
{ {
int nCoeffs; long nCoeffs;
const uint16_t *quant_matrix; const uint16_t *quant_matrix;
assert(s->block_last_index[n]>=0); assert(s->block_last_index[n]>=0);
@ -416,13 +417,13 @@ asm volatile(
"movd %2, %%mm6 \n\t" "movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t"
"movl %3, %%eax \n\t" "mov %3, %%"REG_a" \n\t"
".balign 16\n\t" ".balign 16\n\t"
"1: \n\t" "1: \n\t"
"movq (%0, %%eax), %%mm0 \n\t" "movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%eax), %%mm1 \n\t" "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%eax), %%mm4 \n\t" "movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%eax), %%mm5 \n\t" "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t" "pxor %%mm2, %%mm2 \n\t"
@ -441,8 +442,8 @@ asm volatile(
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t" "pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow "pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psrlw $4, %%mm0 \n\t" "psrlw $4, %%mm0 \n\t"
"psrlw $4, %%mm1 \n\t" "psrlw $4, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t"
@ -453,10 +454,10 @@ asm volatile(
"pandn %%mm1, %%mm5 \n\t" "pandn %%mm1, %%mm5 \n\t"
"pxor %%mm4, %%mm7 \n\t" "pxor %%mm4, %%mm7 \n\t"
"pxor %%mm5, %%mm7 \n\t" "pxor %%mm5, %%mm7 \n\t"
"movq %%mm4, (%0, %%eax) \n\t" "movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%eax) \n\t" "movq %%mm5, 8(%0, %%"REG_a") \n\t"
"addl $16, %%eax \n\t" "add $16, %%"REG_a" \n\t"
"jng 1b \n\t" "jng 1b \n\t"
"movd 124(%0, %3), %%mm0 \n\t" "movd 124(%0, %3), %%mm0 \n\t"
"movq %%mm7, %%mm6 \n\t" "movq %%mm7, %%mm6 \n\t"
@ -471,7 +472,7 @@ asm volatile(
"movd %%mm0, 124(%0, %3) \n\t" "movd %%mm0, 124(%0, %3) \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs)
: "%eax", "memory" : "%"REG_a, "memory"
); );
} }
@ -499,11 +500,11 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
"punpckhwd %%mm1, %%mm1 \n\t" "punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t" "punpckhdq %%mm1, %%mm1 \n\t"
"movq %%mm1, (%0, %2) \n\t" "movq %%mm1, (%0, %2) \n\t"
"addl %1, %0 \n\t" "add %1, %0 \n\t"
"cmpl %3, %0 \n\t" "cmp %3, %0 \n\t"
" jb 1b \n\t" " jb 1b \n\t"
: "+r" (ptr) : "+r" (ptr)
: "r" (wrap), "r" (width), "r" (ptr + wrap*height) : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
); );
} }
else else
@ -522,11 +523,11 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
"punpckhdq %%mm1, %%mm1 \n\t" "punpckhdq %%mm1, %%mm1 \n\t"
"movq %%mm1, (%0, %2) \n\t" "movq %%mm1, (%0, %2) \n\t"
"movq %%mm1, 8(%0, %2) \n\t" "movq %%mm1, 8(%0, %2) \n\t"
"addl %1, %0 \n\t" "add %1, %0 \n\t"
"cmpl %3, %0 \n\t" "cmp %3, %0 \n\t"
" jb 1b \n\t" " jb 1b \n\t"
: "+r" (ptr) : "+r" (ptr)
: "r" (wrap), "r" (width), "r" (ptr + wrap*height) : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
); );
} }
@ -540,11 +541,11 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
"movq %%mm0, (%0, %2) \n\t" "movq %%mm0, (%0, %2) \n\t"
"movq %%mm0, (%0, %2, 2) \n\t" "movq %%mm0, (%0, %2, 2) \n\t"
"movq %%mm0, (%0, %3) \n\t" "movq %%mm0, (%0, %3) \n\t"
"addl $8, %0 \n\t" "add $8, %0 \n\t"
"cmpl %4, %0 \n\t" "cmp %4, %0 \n\t"
" jb 1b \n\t" " jb 1b \n\t"
: "+r" (ptr) : "+r" (ptr)
: "r" ((int)buf - (int)ptr - w), "r" (-wrap), "r" (-wrap*3), "r" (ptr+width+2*w) : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w)
); );
ptr= last_line + (i + 1) * wrap - w; ptr= last_line + (i + 1) * wrap - w;
asm volatile( asm volatile(
@ -554,11 +555,11 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
"movq %%mm0, (%0, %2) \n\t" "movq %%mm0, (%0, %2) \n\t"
"movq %%mm0, (%0, %2, 2) \n\t" "movq %%mm0, (%0, %2, 2) \n\t"
"movq %%mm0, (%0, %3) \n\t" "movq %%mm0, (%0, %3) \n\t"
"addl $8, %0 \n\t" "add $8, %0 \n\t"
"cmpl %4, %0 \n\t" "cmp %4, %0 \n\t"
" jb 1b \n\t" " jb 1b \n\t"
: "+r" (ptr) : "+r" (ptr)
: "r" ((int)last_line - (int)ptr - w), "r" (wrap), "r" (wrap*3), "r" (ptr+width+2*w) : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w)
); );
} }
} }
@ -607,10 +608,10 @@ static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
"movq %%mm2, 8(%1) \n\t" "movq %%mm2, 8(%1) \n\t"
"movq %%mm5, 16(%1) \n\t" "movq %%mm5, 16(%1) \n\t"
"movq %%mm3, 24(%1) \n\t" "movq %%mm3, 24(%1) \n\t"
"addl $16, %0 \n\t" "add $16, %0 \n\t"
"addl $32, %1 \n\t" "add $32, %1 \n\t"
"addl $16, %2 \n\t" "add $16, %2 \n\t"
"cmpl %3, %0 \n\t" "cmp %3, %0 \n\t"
" jb 1b \n\t" " jb 1b \n\t"
: "+r" (block), "+r" (sum), "+r" (offset) : "+r" (block), "+r" (sum), "+r" (offset)
: "r"(block+64) : "r"(block+64)
@ -661,10 +662,10 @@ static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
"movdqa %%xmm6, 16(%1) \n\t" "movdqa %%xmm6, 16(%1) \n\t"
"movdqa %%xmm5, 32(%1) \n\t" "movdqa %%xmm5, 32(%1) \n\t"
"movdqa %%xmm0, 48(%1) \n\t" "movdqa %%xmm0, 48(%1) \n\t"
"addl $32, %0 \n\t" "add $32, %0 \n\t"
"addl $64, %1 \n\t" "add $64, %1 \n\t"
"addl $32, %2 \n\t" "add $32, %2 \n\t"
"cmpl %3, %0 \n\t" "cmp %3, %0 \n\t"
" jb 1b \n\t" " jb 1b \n\t"
: "+r" (block), "+r" (sum), "+r" (offset) : "+r" (block), "+r" (sum), "+r" (offset)
: "r"(block+64) : "r"(block+64)
@ -36,7 +36,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
DCTELEM *block, int n, DCTELEM *block, int n,
int qscale, int *overflow) int qscale, int *overflow)
{ {
int level=0, last_non_zero_p1, q; //=0 is cuz gcc says uninitalized ... long last_non_zero_p1;
int level=0, q; //=0 is cuz gcc says uninitalized ...
const uint16_t *qmat, *bias; const uint16_t *qmat, *bias;
__align8 int16_t temp_block[64]; __align8 int16_t temp_block[64];
@ -90,18 +91,18 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
if(s->out_format == FMT_H263 && s->mpeg_quant==0){ if(s->out_format == FMT_H263 && s->mpeg_quant==0){
asm volatile( asm volatile(
"movd %%eax, %%mm3 \n\t" // last_non_zero_p1 "movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1
SPREADW(%%mm3) SPREADW(%%mm3)
"pxor %%mm7, %%mm7 \n\t" // 0 "pxor %%mm7, %%mm7 \n\t" // 0
"pxor %%mm4, %%mm4 \n\t" // 0 "pxor %%mm4, %%mm4 \n\t" // 0
"movq (%2), %%mm5 \n\t" // qmat[0] "movq (%2), %%mm5 \n\t" // qmat[0]
"pxor %%mm6, %%mm6 \n\t" "pxor %%mm6, %%mm6 \n\t"
"psubw (%3), %%mm6 \n\t" // -bias[0] "psubw (%3), %%mm6 \n\t" // -bias[0]
"movl $-128, %%eax \n\t" "mov $-128, %%"REG_a" \n\t"
".balign 16 \n\t" ".balign 16 \n\t"
"1: \n\t" "1: \n\t"
"pxor %%mm1, %%mm1 \n\t" // 0 "pxor %%mm1, %%mm1 \n\t" // 0
"movq (%1, %%eax), %%mm0 \n\t" // block[i] "movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i]
"pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00
"pxor %%mm1, %%mm0 \n\t" "pxor %%mm1, %%mm0 \n\t"
"psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) "psubw %%mm1, %%mm0 \n\t" // ABS(block[i])
@ -110,13 +111,13 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
"por %%mm0, %%mm4 \n\t" "por %%mm0, %%mm4 \n\t"
"pxor %%mm1, %%mm0 \n\t" "pxor %%mm1, %%mm0 \n\t"
"psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
"movq %%mm0, (%5, %%eax) \n\t" "movq %%mm0, (%5, %%"REG_a") \n\t"
"pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00
"movq (%4, %%eax), %%mm1 \n\t" "movq (%4, %%"REG_a"), %%mm1 \n\t"
"movq %%mm7, (%1, %%eax) \n\t" // 0 "movq %%mm7, (%1, %%"REG_a") \n\t" // 0
"pandn %%mm1, %%mm0 \n\t" "pandn %%mm1, %%mm0 \n\t"
PMAXW(%%mm0, %%mm3) PMAXW(%%mm0, %%mm3)
"addl $8, %%eax \n\t" "add $8, %%"REG_a" \n\t"
" js 1b \n\t" " js 1b \n\t"
"movq %%mm3, %%mm0 \n\t" "movq %%mm3, %%mm0 \n\t"
"psrlq $32, %%mm3 \n\t" "psrlq $32, %%mm3 \n\t"
@ -124,8 +125,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
"movq %%mm3, %%mm0 \n\t" "movq %%mm3, %%mm0 \n\t"
"psrlq $16, %%mm3 \n\t" "psrlq $16, %%mm3 \n\t"
PMAXW(%%mm0, %%mm3) PMAXW(%%mm0, %%mm3)
"movd %%mm3, %%eax \n\t" "movd %%mm3, %%"REG_a" \n\t"
"movzbl %%al, %%eax \n\t" // last_non_zero_p1 "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1) : "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat), "r" (bias), : "r" (block+64), "r" (qmat), "r" (bias),
"r" (inv_zigzag_direct16+64), "r" (temp_block+64) "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
@ -142,32 +143,32 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
); );
}else{ // FMT_H263 }else{ // FMT_H263
asm volatile( asm volatile(
"movd %%eax, %%mm3 \n\t" // last_non_zero_p1 "movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1
SPREADW(%%mm3) SPREADW(%%mm3)
"pxor %%mm7, %%mm7 \n\t" // 0 "pxor %%mm7, %%mm7 \n\t" // 0
"pxor %%mm4, %%mm4 \n\t" // 0 "pxor %%mm4, %%mm4 \n\t" // 0
"movl $-128, %%eax \n\t" "mov $-128, %%"REG_a" \n\t"
".balign 16 \n\t" ".balign 16 \n\t"
"1: \n\t" "1: \n\t"
"pxor %%mm1, %%mm1 \n\t" // 0 "pxor %%mm1, %%mm1 \n\t" // 0
"movq (%1, %%eax), %%mm0 \n\t" // block[i] "movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i]
"pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00
"pxor %%mm1, %%mm0 \n\t" "pxor %%mm1, %%mm0 \n\t"
"psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) "psubw %%mm1, %%mm0 \n\t" // ABS(block[i])
"movq (%3, %%eax), %%mm6 \n\t" // bias[0] "movq (%3, %%"REG_a"), %%mm6 \n\t" // bias[0]
"paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] "paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0]
"movq (%2, %%eax), %%mm5 \n\t" // qmat[i] "movq (%2, %%"REG_a"), %%mm5 \n\t" // qmat[i]
"pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
"por %%mm0, %%mm4 \n\t" "por %%mm0, %%mm4 \n\t"
"pxor %%mm1, %%mm0 \n\t" "pxor %%mm1, %%mm0 \n\t"
"psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
"movq %%mm0, (%5, %%eax) \n\t" "movq %%mm0, (%5, %%"REG_a") \n\t"
"pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00
"movq (%4, %%eax), %%mm1 \n\t" "movq (%4, %%"REG_a"), %%mm1 \n\t"
"movq %%mm7, (%1, %%eax) \n\t" // 0 "movq %%mm7, (%1, %%"REG_a") \n\t" // 0
"pandn %%mm1, %%mm0 \n\t" "pandn %%mm1, %%mm0 \n\t"
PMAXW(%%mm0, %%mm3) PMAXW(%%mm0, %%mm3)
"addl $8, %%eax \n\t" "add $8, %%"REG_a" \n\t"
" js 1b \n\t" " js 1b \n\t"
"movq %%mm3, %%mm0 \n\t" "movq %%mm3, %%mm0 \n\t"
"psrlq $32, %%mm3 \n\t" "psrlq $32, %%mm3 \n\t"
@ -175,8 +176,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
"movq %%mm3, %%mm0 \n\t" "movq %%mm3, %%mm0 \n\t"
"psrlq $16, %%mm3 \n\t" "psrlq $16, %%mm3 \n\t"
PMAXW(%%mm0, %%mm3) PMAXW(%%mm0, %%mm3)
"movd %%mm3, %%eax \n\t" "movd %%mm3, %%"REG_a" \n\t"
"movzbl %%al, %%eax \n\t" // last_non_zero_p1 "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1) : "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat+64), "r" (bias+64), : "r" (block+64), "r" (qmat+64), "r" (bias+64),
"r" (inv_zigzag_direct16+64), "r" (temp_block+64) "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
@ -119,7 +119,7 @@ try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
# define always_inline inline # define always_inline inline
#endif #endif
#ifdef ARCH_X86 #if defined(ARCH_X86) || defined(ARCH_X86_64)
static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL; static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL;
static uint64_t __attribute__((aligned(8))) attribute_used w04= 0x0004000400040004LL; static uint64_t __attribute__((aligned(8))) attribute_used w04= 0x0004000400040004LL;
static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL; static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL;
@ -172,7 +172,7 @@ static char *replaceTable[]=
}; };
#ifdef ARCH_X86 #if defined(ARCH_X86) || defined(ARCH_X86_64)
static inline void prefetchnta(void *p) static inline void prefetchnta(void *p)
{ {
asm volatile( "prefetchnta (%0)\n\t" asm volatile( "prefetchnta (%0)\n\t"
@ -597,7 +597,7 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
#endif //HAVE_ALTIVEC #endif //HAVE_ALTIVEC
#endif //ARCH_POWERPC #endif //ARCH_POWERPC
#ifdef ARCH_X86 #if defined(ARCH_X86) || defined(ARCH_X86_64)
#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX #define COMPILE_MMX
@ -616,13 +616,11 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
#undef HAVE_MMX2 #undef HAVE_MMX2
#undef HAVE_3DNOW #undef HAVE_3DNOW
#undef HAVE_ALTIVEC #undef HAVE_ALTIVEC
#undef ARCH_X86
#ifdef COMPILE_C #ifdef COMPILE_C
#undef HAVE_MMX #undef HAVE_MMX
#undef HAVE_MMX2 #undef HAVE_MMX2
#undef HAVE_3DNOW #undef HAVE_3DNOW
#undef ARCH_X86
#define RENAME(a) a ## _C #define RENAME(a) a ## _C
#include "postprocess_template.c" #include "postprocess_template.c"
#endif #endif
@ -643,7 +641,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
#define HAVE_MMX #define HAVE_MMX
#undef HAVE_MMX2 #undef HAVE_MMX2
#undef HAVE_3DNOW #undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX #define RENAME(a) a ## _MMX
#include "postprocess_template.c" #include "postprocess_template.c"
#endif #endif
@ -654,7 +651,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
#define HAVE_MMX #define HAVE_MMX
#define HAVE_MMX2 #define HAVE_MMX2
#undef HAVE_3DNOW #undef HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _MMX2 #define RENAME(a) a ## _MMX2
#include "postprocess_template.c" #include "postprocess_template.c"
#endif #endif
@ -665,7 +661,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC
#define HAVE_MMX #define HAVE_MMX
#undef HAVE_MMX2 #undef HAVE_MMX2
#define HAVE_3DNOW #define HAVE_3DNOW
#define ARCH_X86
#define RENAME(a) a ## _3DNow #define RENAME(a) a ## _3DNow
#include "postprocess_template.c" #include "postprocess_template.c"
#endif #endif
@ -683,7 +678,7 @@ static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int
// difference wouldnt be messureable here but its much better because // difference wouldnt be messureable here but its much better because
// someone might exchange the cpu whithout restarting mplayer ;) // someone might exchange the cpu whithout restarting mplayer ;)
#ifdef RUNTIME_CPUDETECT #ifdef RUNTIME_CPUDETECT
#ifdef ARCH_X86 #if defined(ARCH_X86) || defined(ARCH_X86_64)
// ordered per speed fasterst first // ordered per speed fasterst first
if(c->cpuCaps & PP_CPU_CAPS_MMX2) if(c->cpuCaps & PP_CPU_CAPS_MMX2)
postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
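
The postprocess changes are purely preprocessor work: every #ifdef ARCH_X86 guard widens to cover ARCH_X86_64 as well, and the MMX/MMX2/3DNow template instantiations stop faking the symbol with #undef/#define ARCH_X86 because the outer test already admits both architectures. Had the twin test been needed in many more places, folding it into one derived symbol would keep it in a single spot; a possible sketch (ARCH_X86_ANY is invented here, not something the patch defines):

/* Illustrative only: collapse the two architecture symbols into one. */
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#  define ARCH_X86_ANY 1
#endif

#ifdef ARCH_X86_ANY
/* ... x86-specific code paths ... */
#endif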
File diff suppressed because it is too large.
@ -716,7 +716,7 @@ static inline int msmpeg4_pred_dc(MpegEncContext * s, int n,
necessitate to modify mpegvideo.c. The problem comes from the necessitate to modify mpegvideo.c. The problem comes from the
fact they decided to store the quantized DC (which would lead fact they decided to store the quantized DC (which would lead
to problems if Q could vary !) */ to problems if Q could vary !) */
#if defined ARCH_X86 && !defined PIC #if (defined(ARCH_X86) || defined(ARCH_X86_64)) && !defined PIC
asm volatile( asm volatile(
"movl %3, %%eax \n\t" "movl %3, %%eax \n\t"
"shrl $1, %%eax \n\t" "shrl $1, %%eax \n\t"