better hadamard8_diff16 in AltiVec, and more patch by (Romain Dolbeau <dolbeau at irisa dot fr>)

Originally committed as revision 3038 to svn://svn.ffmpeg.org/ffmpeg/trunk
2025-11-23 21:54:53 +02:00 · 2004-04-22 13:21:59 +00:00
parent 2750b827b3
commit 9007f51460
6 changed files with 329 additions and 78 deletions
--- a/9
+++ b/9
@@ -516,6 +516,7 @@ fi

 # Add processor-specific flags
 TUNECPU="generic"
+POWERPCMODE="32bits"
 if test $tune != "generic"; then
    case $tune in
 	601|ppc601|PowerPC601)
@@ -561,11 +562,12 @@ if test $tune != "generic"; then
 	    TUNECPU=ppc7400
 	;;
 	G5|g5|970|ppc970|PowerPC970|power4*|Power4*)
-	    CFLAGS="$CFLAGS -mcpu=970 -mtune=970 -mpowerpc64 -force_cpusubtype_ALL "
+	    CFLAGS="$CFLAGS -mcpu=970 -mtune=970 -mpowerpc-gfxopt -mpowerpc64"
 	    if test $altivec = "no"; then
 		echo "WARNING: tuning for PPC970 but altivec disabled !";
 	    fi
 	    TUNECPU=ppc970
+            POWERPCMODE="64bits"
 	;;
 	*)
 	echo "WARNING: unknown CPU "$tune", ignored"
@@ -1027,6 +1029,11 @@ elif test "$cpu" = "sparc64" ; then
 elif test "$cpu" = "powerpc" ; then
  echo "TARGET_ARCH_POWERPC=yes" >> config.mak
  echo "#define ARCH_POWERPC 1" >> $TMPH
+  if test $POWERPCMODE = "32bits"; then
+    echo "#define POWERPC_MODE_32BITS 1" >> $TMPH
+  else
+    echo "#define POWERPC_MODE_64BITS 1" >> $TMPH
+  fi
  if test "$powerpc_perf" = "yes"; then
    echo "#define POWERPC_PERFORMANCE_REPORT 1" >> $TMPH
  fi
--- a/doc/ffmpeg_powerpc_performance_evaluation_howto.txt
+++ b/doc/ffmpeg_powerpc_performance_evaluation_howto.txt
@@ -17,7 +17,7 @@ The firsts are always available, always active, but they're not very accurate :

 The PMC are much more useful : not only they can report cycle-accurate timing, but they can also be used to monitor many other parameters, such as the number of AltiVec stalls for every kind of instructions, or instruction cache misses. The downside is that not all processors support the PMC (all G3, all G4 and the 970 do support them), and they're inactive by default - you need to activate them with a dedicated tool. Also, the number of available PMC depend on the procesor : the various 604 have 2, the various 75x (aka. G3) have 4, anbd the various 74xx (aka G4) have 6.

-*WARNING*: The powerpc 970 is not very well documented, and it seems its PMC registers are 64bits wide. The current implementation in FFMpeg assume the register are 32bits wide, and will *not* work on a powerpc 970 (aka G5).
+*WARNING*: The powerpc 970 is not very well documented, and its PMC registers are 64 bits wide. To make the code aware of this, you *must* tune for the 970 (using --tune=970); otherwise the code will assume 32-bit registers.


 II - Enabling FFmpeg PowerPC performance support
--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@@ -1306,42 +1306,43 @@ int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t
 POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
  int sum;
 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
-  {
-    const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
+  register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
+  register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 #ifdef CONFIG_DARWIN
-    const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
-    const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
-    const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
-    const_vector unsigned char perm1 = (const_vector unsigned char)
+  {
+    register const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
+    register const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
+    register const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
+    register const_vector unsigned char perm1 = (const_vector unsigned char)
      (0x02, 0x03, 0x00, 0x01,
       0x06, 0x07, 0x04, 0x05,
       0x0A, 0x0B, 0x08, 0x09,
       0x0E, 0x0F, 0x0C, 0x0D);
-    const_vector unsigned char perm2 = (const_vector unsigned char)
+    register const_vector unsigned char perm2 = (const_vector unsigned char)
      (0x04, 0x05, 0x06, 0x07,
       0x00, 0x01, 0x02, 0x03,
       0x0C, 0x0D, 0x0E, 0x0F,
       0x08, 0x09, 0x0A, 0x0B);
-    const_vector unsigned char perm3 = (const_vector unsigned char)
+    register const_vector unsigned char perm3 = (const_vector unsigned char)
      (0x08, 0x09, 0x0A, 0x0B,
       0x0C, 0x0D, 0x0E, 0x0F,
       0x00, 0x01, 0x02, 0x03,
       0x04, 0x05, 0x06, 0x07);
 #else
-    const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
-    const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
-    const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
-    const_vector unsigned char perm1 = (const_vector unsigned char)
+    register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
+    register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
+    register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
+    register const_vector unsigned char perm1 = (const_vector unsigned char)
      {0x02, 0x03, 0x00, 0x01,
       0x06, 0x07, 0x04, 0x05,
       0x0A, 0x0B, 0x08, 0x09,
       0x0E, 0x0F, 0x0C, 0x0D};
-    const_vector unsigned char perm2 = (const_vector unsigned char)
+    register const_vector unsigned char perm2 = (const_vector unsigned char)
      {0x04, 0x05, 0x06, 0x07,
       0x00, 0x01, 0x02, 0x03,
       0x0C, 0x0D, 0x0E, 0x0F,
       0x08, 0x09, 0x0A, 0x0B};
-    const_vector unsigned char perm3 = (const_vector unsigned char)
+    register const_vector unsigned char perm3 = (const_vector unsigned char)
      {0x08, 0x09, 0x0A, 0x0B,
       0x0C, 0x0D, 0x0E, 0x0F,
       0x00, 0x01, 0x02, 0x03,
@@ -1350,8 +1351,8 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);

 #define ONEITERBUTTERFLY(i, res)					\
    {									\
-      vector unsigned char src1, src2, srcO;				\
-      vector unsigned char dst1, dst2, dstO;				\
+      register vector unsigned char src1, src2, srcO;		       	\
+      register vector unsigned char dst1, dst2, dstO;		       	\
      src1 = vec_ld(stride * i, src);					\
      if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8)	\
 	src2 = vec_ld((stride * i) + 16, src);				\
@@ -1362,20 +1363,19 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
      dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));		\
      /* promote the unsigned chars to signed shorts */			\
      /* we're in the 8x8 function, we only care for the first 8 */	\
-      vector signed short srcV =					\
+      register vector signed short srcV =			       	\
 	(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
-      vector signed short dstV =					\
+      register vector signed short dstV =			       	\
 	(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
      /* substractions inside the first butterfly */			\
-      vector signed short but0 = vec_sub(srcV, dstV);			\
-      vector signed short op1 = vec_perm(but0, but0, perm1);		\
-      vector signed short but1 = vec_mladd(but0, vprod1, op1);		\
-      vector signed short op2 = vec_perm(but1, but1, perm2);		\
-      vector signed short but2 = vec_mladd(but1, vprod2, op2);		\
-      vector signed short op3 = vec_perm(but2, but2, perm3);		\
+      register vector signed short but0 = vec_sub(srcV, dstV);	       	\
+      register vector signed short op1 = vec_perm(but0, but0, perm1);  	\
+      register vector signed short but1 = vec_mladd(but0, vprod1, op1);	\
+      register vector signed short op2 = vec_perm(but1, but1, perm2);  	\
+      register vector signed short but2 = vec_mladd(but1, vprod2, op2);	\
+      register vector signed short op3 = vec_perm(but2, but2, perm3);  	\
      res = vec_mladd(but2, vprod3, op3);				\
    }
-    vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    ONEITERBUTTERFLY(0, temp0);
    ONEITERBUTTERFLY(1, temp1);
    ONEITERBUTTERFLY(2, temp2);
@@ -1384,53 +1384,275 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
    ONEITERBUTTERFLY(5, temp5);
    ONEITERBUTTERFLY(6, temp6);
    ONEITERBUTTERFLY(7, temp7);
+  }
 #undef ONEITERBUTTERFLY
-    {
-      vector signed int vsum;
-      vector signed short line0 = vec_add(temp0, temp1);
-      vector signed short line1 = vec_sub(temp0, temp1);
-      vector signed short line2 = vec_add(temp2, temp3);
-      vector signed short line3 = vec_sub(temp2, temp3);
-      vector signed short line4 = vec_add(temp4, temp5);
-      vector signed short line5 = vec_sub(temp4, temp5);
-      vector signed short line6 = vec_add(temp6, temp7);
-      vector signed short line7 = vec_sub(temp6, temp7);
-      
-      vector signed short line0B = vec_add(line0, line2);
-      vector signed short line2B = vec_sub(line0, line2);
-      vector signed short line1B = vec_add(line1, line3);
-      vector signed short line3B = vec_sub(line1, line3);
-      vector signed short line4B = vec_add(line4, line6);
-      vector signed short line6B = vec_sub(line4, line6);
-      vector signed short line5B = vec_add(line5, line7);
-      vector signed short line7B = vec_sub(line5, line7);
-      
-      vector signed short line0C = vec_add(line0B, line4B);
-      vector signed short line4C = vec_sub(line0B, line4B);
-      vector signed short line1C = vec_add(line1B, line5B);
-      vector signed short line5C = vec_sub(line1B, line5B);
-      vector signed short line2C = vec_add(line2B, line6B);
-      vector signed short line6C = vec_sub(line2B, line6B);
-      vector signed short line3C = vec_add(line3B, line7B);
-      vector signed short line7C = vec_sub(line3B, line7B);
-      
-      vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
-      vsum = vec_sum4s(vec_abs(line1C), vsum);
-      vsum = vec_sum4s(vec_abs(line2C), vsum);
-      vsum = vec_sum4s(vec_abs(line3C), vsum);
-      vsum = vec_sum4s(vec_abs(line4C), vsum);
-      vsum = vec_sum4s(vec_abs(line5C), vsum);
-      vsum = vec_sum4s(vec_abs(line6C), vsum);
-      vsum = vec_sum4s(vec_abs(line7C), vsum);
-      vsum = vec_sums(vsum, (vector signed int)vzero);
-      vsum = vec_splat(vsum, 3);
-      vec_ste(vsum, 0, &sum);
-    }
+  {
+    register vector signed int vsum;
+    register vector signed short line0 = vec_add(temp0, temp1);
+    register vector signed short line1 = vec_sub(temp0, temp1);
+    register vector signed short line2 = vec_add(temp2, temp3);
+    register vector signed short line3 = vec_sub(temp2, temp3);
+    register vector signed short line4 = vec_add(temp4, temp5);
+    register vector signed short line5 = vec_sub(temp4, temp5);
+    register vector signed short line6 = vec_add(temp6, temp7);
+    register vector signed short line7 = vec_sub(temp6, temp7);
+    
+    register vector signed short line0B = vec_add(line0, line2);
+    register vector signed short line2B = vec_sub(line0, line2);
+    register vector signed short line1B = vec_add(line1, line3);
+    register vector signed short line3B = vec_sub(line1, line3);
+    register vector signed short line4B = vec_add(line4, line6);
+    register vector signed short line6B = vec_sub(line4, line6);
+    register vector signed short line5B = vec_add(line5, line7);
+    register vector signed short line7B = vec_sub(line5, line7);
+    
+    register vector signed short line0C = vec_add(line0B, line4B);
+    register vector signed short line4C = vec_sub(line0B, line4B);
+    register vector signed short line1C = vec_add(line1B, line5B);
+    register vector signed short line5C = vec_sub(line1B, line5B);
+    register vector signed short line2C = vec_add(line2B, line6B);
+    register vector signed short line6C = vec_sub(line2B, line6B);
+    register vector signed short line3C = vec_add(line3B, line7B);
+    register vector signed short line7C = vec_sub(line3B, line7B);
+    
+    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
+    vsum = vec_sum4s(vec_abs(line1C), vsum);
+    vsum = vec_sum4s(vec_abs(line2C), vsum);
+    vsum = vec_sum4s(vec_abs(line3C), vsum);
+    vsum = vec_sum4s(vec_abs(line4C), vsum);
+    vsum = vec_sum4s(vec_abs(line5C), vsum);
+    vsum = vec_sum4s(vec_abs(line6C), vsum);
+    vsum = vec_sum4s(vec_abs(line7C), vsum);
+    vsum = vec_sums(vsum, (vector signed int)vzero);
+    vsum = vec_splat(vsum, 3);
+    vec_ste(vsum, 0, &sum);
  }
 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
  return sum;
 }

+/*
+  16x8 works with 16 elements; this avoids replicating
+  loads and gives the compiler more room for scheduling.
+  It's only used from inside hadamard8_diff16_altivec.
+  
+  Unfortunately, it seems gcc-3.3 is a bit dumb, and
+  the compiled code has a LOT of spill code; it seems
+  gcc (unlike xlc) cannot keep everything in registers
+  by itself. The following code includes hand-made
+  register allocation. It's not clean, but on
+  a 7450 the resulting code is much faster (best case
+  falls from 700+ cycles to 550).
+  
+  xlc doesn't add spill code, but it doesn't know how to
+  schedule for the 7450, and its code isn't much faster than
+  gcc-3.3 on the 7450 (but uses 25% fewer instructions...)
+  
+  On the 970, the hand-made register allocation (RA) is still
+  a win (around 690 vs. around 780), but xlc goes to around
+  660 on the regular C code...
+*/
+
+static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
+  int sum;
+  register vector signed short
+    temp0 asm ("v0"),
+    temp1 asm ("v1"),
+    temp2 asm ("v2"),
+    temp3 asm ("v3"),
+    temp4 asm ("v4"),
+    temp5 asm ("v5"),
+    temp6 asm ("v6"),
+    temp7 asm ("v7");
+  register vector signed short
+    temp0S asm ("v8"),
+    temp1S asm ("v9"),
+    temp2S asm ("v10"),
+    temp3S asm ("v11"),
+    temp4S asm ("v12"),
+    temp5S asm ("v13"),
+    temp6S asm ("v14"),
+    temp7S asm ("v15");
+  register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0);
+  {
+#ifdef CONFIG_DARWIN
+    register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
+    register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
+    register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
+    register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char)
+      (0x02, 0x03, 0x00, 0x01,
+       0x06, 0x07, 0x04, 0x05,
+       0x0A, 0x0B, 0x08, 0x09,
+       0x0E, 0x0F, 0x0C, 0x0D);
+    register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char)
+      (0x04, 0x05, 0x06, 0x07,
+       0x00, 0x01, 0x02, 0x03,
+       0x0C, 0x0D, 0x0E, 0x0F,
+       0x08, 0x09, 0x0A, 0x0B);
+    register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char)
+      (0x08, 0x09, 0x0A, 0x0B,
+       0x0C, 0x0D, 0x0E, 0x0F,
+       0x00, 0x01, 0x02, 0x03,
+       0x04, 0x05, 0x06, 0x07);
+#else
+    register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
+    register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
+    register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
+    register const_vector unsigned char perm1 = (const_vector unsigned char)
+      {0x02, 0x03, 0x00, 0x01,
+       0x06, 0x07, 0x04, 0x05,
+       0x0A, 0x0B, 0x08, 0x09,
+       0x0E, 0x0F, 0x0C, 0x0D};
+    register const_vector unsigned char perm2 = (const_vector unsigned char)
+      {0x04, 0x05, 0x06, 0x07,
+       0x00, 0x01, 0x02, 0x03,
+       0x0C, 0x0D, 0x0E, 0x0F,
+       0x08, 0x09, 0x0A, 0x0B};
+    register const_vector unsigned char perm3 = (const_vector unsigned char)
+      {0x08, 0x09, 0x0A, 0x0B,
+       0x0C, 0x0D, 0x0E, 0x0F,
+       0x00, 0x01, 0x02, 0x03,
+       0x04, 0x05, 0x06, 0x07};
+#endif
+#define ONEITERBUTTERFLY(i, res1, res2)					\
+    {									\
+      register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \
+      register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \
+      src1 = vec_ld(stride * i, src);					\
+      src2 = vec_ld((stride * i) + 16, src);				\
+      register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
+      dst1 = vec_ld(stride * i, dst);					\
+      dst2 = vec_ld((stride * i) + 16, dst);				\
+      register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
+      /* promote the unsigned chars to signed shorts */			\
+      register vector signed short srcV asm ("v24") =                   \
+	(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
+      register vector signed short dstV asm ("v25") =                   \
+	(vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
+      register vector signed short srcW asm ("v26") =                   \
+	(vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
+      register vector signed short dstW asm ("v27") =                   \
+	(vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
+      /* substractions inside the first butterfly */			\
+      register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \
+      register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \
+      register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \
+      register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \
+      register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \
+      register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \
+      register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \
+      register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \
+      register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \
+      register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \
+      register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \
+      res1 = vec_mladd(but2, vprod3, op3);				\
+      register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \
+      res2 = vec_mladd(but2S, vprod3, op3S);				\
+    }
+    ONEITERBUTTERFLY(0, temp0, temp0S);
+    ONEITERBUTTERFLY(1, temp1, temp1S);
+    ONEITERBUTTERFLY(2, temp2, temp2S);
+    ONEITERBUTTERFLY(3, temp3, temp3S);
+    ONEITERBUTTERFLY(4, temp4, temp4S);
+    ONEITERBUTTERFLY(5, temp5, temp5S);
+    ONEITERBUTTERFLY(6, temp6, temp6S);
+    ONEITERBUTTERFLY(7, temp7, temp7S);
+  }
+#undef ONEITERBUTTERFLY
+  {
+    register vector signed int vsum;
+    register vector signed short line0 = vec_add(temp0, temp1);
+    register vector signed short line1 = vec_sub(temp0, temp1);
+    register vector signed short line2 = vec_add(temp2, temp3);
+    register vector signed short line3 = vec_sub(temp2, temp3);
+    register vector signed short line4 = vec_add(temp4, temp5);
+    register vector signed short line5 = vec_sub(temp4, temp5);
+    register vector signed short line6 = vec_add(temp6, temp7);
+    register vector signed short line7 = vec_sub(temp6, temp7);
+      
+    register vector signed short line0B = vec_add(line0, line2);
+    register vector signed short line2B = vec_sub(line0, line2);
+    register vector signed short line1B = vec_add(line1, line3);
+    register vector signed short line3B = vec_sub(line1, line3);
+    register vector signed short line4B = vec_add(line4, line6);
+    register vector signed short line6B = vec_sub(line4, line6);
+    register vector signed short line5B = vec_add(line5, line7);
+    register vector signed short line7B = vec_sub(line5, line7);
+      
+    register vector signed short line0C = vec_add(line0B, line4B);
+    register vector signed short line4C = vec_sub(line0B, line4B);
+    register vector signed short line1C = vec_add(line1B, line5B);
+    register vector signed short line5C = vec_sub(line1B, line5B);
+    register vector signed short line2C = vec_add(line2B, line6B);
+    register vector signed short line6C = vec_sub(line2B, line6B);
+    register vector signed short line3C = vec_add(line3B, line7B);
+    register vector signed short line7C = vec_sub(line3B, line7B);
+      
+    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
+    vsum = vec_sum4s(vec_abs(line1C), vsum);
+    vsum = vec_sum4s(vec_abs(line2C), vsum);
+    vsum = vec_sum4s(vec_abs(line3C), vsum);
+    vsum = vec_sum4s(vec_abs(line4C), vsum);
+    vsum = vec_sum4s(vec_abs(line5C), vsum);
+    vsum = vec_sum4s(vec_abs(line6C), vsum);
+    vsum = vec_sum4s(vec_abs(line7C), vsum);
+
+    register vector signed short line0S = vec_add(temp0S, temp1S);
+    register vector signed short line1S = vec_sub(temp0S, temp1S);
+    register vector signed short line2S = vec_add(temp2S, temp3S);
+    register vector signed short line3S = vec_sub(temp2S, temp3S);
+    register vector signed short line4S = vec_add(temp4S, temp5S);
+    register vector signed short line5S = vec_sub(temp4S, temp5S);
+    register vector signed short line6S = vec_add(temp6S, temp7S);
+    register vector signed short line7S = vec_sub(temp6S, temp7S);
+
+    register vector signed short line0BS = vec_add(line0S, line2S);
+    register vector signed short line2BS = vec_sub(line0S, line2S);
+    register vector signed short line1BS = vec_add(line1S, line3S);
+    register vector signed short line3BS = vec_sub(line1S, line3S);
+    register vector signed short line4BS = vec_add(line4S, line6S);
+    register vector signed short line6BS = vec_sub(line4S, line6S);
+    register vector signed short line5BS = vec_add(line5S, line7S);
+    register vector signed short line7BS = vec_sub(line5S, line7S);
+
+    register vector signed short line0CS = vec_add(line0BS, line4BS);
+    register vector signed short line4CS = vec_sub(line0BS, line4BS);
+    register vector signed short line1CS = vec_add(line1BS, line5BS);
+    register vector signed short line5CS = vec_sub(line1BS, line5BS);
+    register vector signed short line2CS = vec_add(line2BS, line6BS);
+    register vector signed short line6CS = vec_sub(line2BS, line6BS);
+    register vector signed short line3CS = vec_add(line3BS, line7BS);
+    register vector signed short line7CS = vec_sub(line3BS, line7BS);
+
+    vsum = vec_sum4s(vec_abs(line0CS), vsum);
+    vsum = vec_sum4s(vec_abs(line1CS), vsum);
+    vsum = vec_sum4s(vec_abs(line2CS), vsum);
+    vsum = vec_sum4s(vec_abs(line3CS), vsum);
+    vsum = vec_sum4s(vec_abs(line4CS), vsum);
+    vsum = vec_sum4s(vec_abs(line5CS), vsum);
+    vsum = vec_sum4s(vec_abs(line6CS), vsum);
+    vsum = vec_sum4s(vec_abs(line7CS), vsum);
+    vsum = vec_sums(vsum, (vector signed int)vzero);
+    vsum = vec_splat(vsum, 3);
+    vec_ste(vsum, 0, &sum);
+  }
+  return sum;
+}
+
+int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
+POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
+  int score;
+POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
+  score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
+  if (h==16) {
+    dst += 8*stride;
+    src += 8*stride;
+    score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
+  }
+POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
+  return score;
+}
+
 int has_altivec(void)
 {
 #ifdef CONFIG_DARWIN
--- a/libavcodec/ppc/dsputil_altivec.h
+++ b/libavcodec/ppc/dsputil_altivec.h
@@ -47,6 +47,7 @@ extern void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
 extern void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h);
 extern void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h);
 extern int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h);
+extern int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h);

 extern void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder);

--- a/libavcodec/ppc/dsputil_ppc.c
+++ b/libavcodec/ppc/dsputil_ppc.c
@@ -61,6 +61,7 @@ static unsigned char* perfname[] = {
  "put_pixels16_xy2_altivec",
  "put_no_rnd_pixels16_xy2_altivec",
  "hadamard8_diff8x8_altivec",
+  "hadamard8_diff16_altivec",
  "clear_blocks_dcbz32_ppc",
  "clear_blocks_dcbz128_ppc"
 };
@@ -226,12 +227,6 @@ long check_dcbzl_effect(void)
 }
 #endif

-#ifdef HAVE_ALTIVEC
-// can't put that in dsputil_altivec.c,
-// has WARPER8_16_SQ declare the function "static" ...
-WARPER8_16_SQ(hadamard8_diff8x8_altivec, hadamard8_diff16_altivec)
-#endif
-
 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
 {
    // Common optimizations whether Altivec is available or not
--- a/libavcodec/ppc/dsputil_ppc.h
+++ b/libavcodec/ppc/dsputil_ppc.h
@@ -51,6 +51,7 @@ enum powerpc_perf_index {
  altivec_put_pixels16_xy2_num,
  altivec_put_no_rnd_pixels16_xy2_num,
  altivec_hadamard8_diff8x8_num,
+  altivec_hadamard8_diff16_num,
  powerpc_clear_blocks_dcbz32,
  powerpc_clear_blocks_dcbz128,
  powerpc_perf_total
@@ -64,6 +65,8 @@ enum powerpc_data_index {
 };
 extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];

+#ifndef POWERPC_MODE_64BITS
+#define POWERP_PMC_DATATYPE unsigned long
 #define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 937" : "=r" (a))
 #define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 938" : "=r" (a))
 #if (POWERPC_NUM_PMC_ENABLED > 2)
@@ -80,7 +83,30 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
 #define POWERPC_GET_PMC5(a) do {} while (0)
 #define POWERPC_GET_PMC6(a) do {} while (0)
 #endif
-#define POWERPC_PERF_DECLARE(a, cond) unsigned long pmc_start[POWERPC_NUM_PMC_ENABLED], pmc_stop[POWERPC_NUM_PMC_ENABLED], pmc_loop_index;
+#else /* POWERPC_MODE_64BITS */
+#define POWERP_PMC_DATATYPE unsigned long long
+#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 771" : "=r" (a))
+#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 772" : "=r" (a))
+#if (POWERPC_NUM_PMC_ENABLED > 2)
+#define POWERPC_GET_PMC3(a) asm volatile("mfspr %0, 773" : "=r" (a))
+#define POWERPC_GET_PMC4(a) asm volatile("mfspr %0, 774" : "=r" (a))
+#else
+#define POWERPC_GET_PMC3(a) do {} while (0)
+#define POWERPC_GET_PMC4(a) do {} while (0)
+#endif
+#if (POWERPC_NUM_PMC_ENABLED > 4)
+#define POWERPC_GET_PMC5(a) asm volatile("mfspr %0, 775" : "=r" (a))
+#define POWERPC_GET_PMC6(a) asm volatile("mfspr %0, 776" : "=r" (a))
+#else
+#define POWERPC_GET_PMC5(a) do {} while (0)
+#define POWERPC_GET_PMC6(a) do {} while (0)
+#endif
+#endif /* POWERPC_MODE_64BITS */
+#define POWERPC_PERF_DECLARE(a, cond)				\
+  POWERP_PMC_DATATYPE						\
+    pmc_start[POWERPC_NUM_PMC_ENABLED],				\
+    pmc_stop[POWERPC_NUM_PMC_ENABLED],				\
+    pmc_loop_index;
 #define POWERPC_PERF_START_COUNT(a, cond) do { \
  POWERPC_GET_PMC6(pmc_start[5]); \
  POWERPC_GET_PMC5(pmc_start[4]); \
@@ -102,9 +128,9 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][
        pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \
        pmc_loop_index++)         \
    {                             \
-      if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \
-      {                           \
-        unsigned long diff =      \
+      if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index])  \
+	{							  \
+        POWERP_PMC_DATATYPE diff =				  \
          pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index];   \
        if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \
          perfdata[pmc_loop_index][a][powerpc_data_min] = diff;   \