From 4d1f69f2440041b58d5a31bcfcff83ee3c88ac7e Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Mon, 30 Jul 2012 17:04:26 +0200 Subject: [PATCH 1/2] x86: h264_qpel_10bit: port to cpuflags --- libavcodec/x86/h264_qpel_10bit.asm | 314 ++++++++++++++--------------- 1 file changed, 155 insertions(+), 159 deletions(-) diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm index a84b810955..4aea03209c 100644 --- a/libavcodec/x86/h264_qpel_10bit.asm +++ b/libavcodec/x86/h264_qpel_10bit.asm @@ -97,81 +97,73 @@ SECTION .text %macro MC 1 %define OP_MOV mova -INIT_MMX -%1 mmxext, put, 4 -INIT_XMM -%1 sse2 , put, 8 +INIT_MMX mmxext +%1 put, 4 +INIT_XMM sse2 +%1 put, 8 %define OP_MOV AVG_MOV -INIT_MMX -%1 mmxext, avg, 4 -INIT_XMM -%1 sse2 , avg, 8 +INIT_MMX mmxext +%1 avg, 4 +INIT_XMM sse2 +%1 avg, 8 %endmacro -%macro MCAxA 8 -%if ARCH_X86_64 -%ifnidn %1,mmxext -MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8 -%endif -%else -MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8 -%endif -%endmacro - -%macro MCAxA_OP 8 +%macro MCAxA_OP 7 %if ARCH_X86_32 -cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8 - call stub_%2_h264_qpel%4_%3_10_%1 +cglobal %1_h264_qpel%4_%2_10, %5,%6,%7 + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX mov r0, r0m mov r1, r1m - add r0, %4*2 - add r1, %4*2 - call stub_%2_h264_qpel%4_%3_10_%1 + add r0, %3*2 + add r1, %3*2 + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX mov r0, r0m mov r1, r1m - lea r0, [r0+r2*%4] - lea r1, [r1+r2*%4] - call stub_%2_h264_qpel%4_%3_10_%1 + lea r0, [r0+r2*%3] + lea r1, [r1+r2*%3] + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX mov r0, r0m mov r1, r1m - lea r0, [r0+r2*%4+%4*2] - lea r1, [r1+r2*%4+%4*2] - call stub_%2_h264_qpel%4_%3_10_%1 + lea r0, [r0+r2*%3+%3*2] + lea r1, [r1+r2*%3+%3*2] + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX RET %else ; ARCH_X86_64 -cglobal %2_h264_qpel%5_%3_10_%1, %6,%7 + 2,%8 - mov r%7, r0 -%assign p1 %7+1 +cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7 + mov r%6, r0 +%assign p1 %6+1 mov r %+ p1, r1 - call stub_%2_h264_qpel%4_%3_10_%1 - lea r0, [r%7+%4*2] - lea r1, [r %+ p1+%4*2] - call stub_%2_h264_qpel%4_%3_10_%1 - lea r0, [r%7+r2*%4] - lea r1, [r %+ p1+r2*%4] - call stub_%2_h264_qpel%4_%3_10_%1 - lea r0, [r%7+r2*%4+%4*2] - lea r1, [r %+ p1+r2*%4+%4*2] + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX + lea r0, [r%6+%3*2] + lea r1, [r %+ p1+%3*2] + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX + lea r0, [r%6+r2*%3] + lea r1, [r %+ p1+r2*%3] + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX + lea r0, [r%6+r2*%3+%3*2] + lea r1, [r %+ p1+r2*%3+%3*2] %if UNIX64 == 0 ; fall through to function - call stub_%2_h264_qpel%4_%3_10_%1 + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX RET %endif %endif %endmacro ;cpu, put/avg, mc, 4/8, ... 
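For context, a minimal sketch of the x86inc cpuflags convention these hunks switch to; it is not part of the patch, and EXAMPLE_COPY plus its stub label are hypothetical names assuming this tree's x86inc.asm. INIT_MMX/INIT_XMM now take the instruction set as an argument, cglobal appends the cpu name by itself, and SUFFIX expands to e.g. "_sse2", so hand-written labels are built with the %+ token-pasting operator instead of threading the cpu name through every macro as a parameter. Cpu tests likewise become cpuflag(sse2) instead of string-comparing the suffix (%ifnidn %1,mmxext).

%macro EXAMPLE_COPY 1
cglobal example_copy%1_10, 3,3          ; emits example_copy4_10_mmxext etc.
    call stub_example_copy%1_10 %+ SUFFIX
    RET
stub_example_copy%1_10 %+ SUFFIX:
    mova   m0, [r1]                     ; 8 bytes under INIT_MMX, 16 under INIT_XMM
    mova   [r0], m0
    ret
%endmacro

INIT_MMX mmxext                         ; mmsize == 8,  SUFFIX -> _mmxext
EXAMPLE_COPY 4
INIT_XMM sse2                           ; mmsize == 16, SUFFIX -> _sse2
EXAMPLE_COPY 8
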
-%macro cglobal_mc 7 -%assign i %4*2 -MCAxA %1, %2, %3, %4, i, %5,%6,%7 +%macro cglobal_mc 6 +%assign i %3*2 +%if ARCH_X86_32 || cpuflag(sse2) +MCAxA_OP %1, %2, %3, i, %4,%5,%6 +%endif -cglobal %2_h264_qpel%4_%3_10_%1, %5,%6,%7 +cglobal %1_h264_qpel%3_%2_10, %4,%5,%6 %if UNIX64 == 0 ; no prologue or epilogue for UNIX64 - call stub_%2_h264_qpel%4_%3_10_%1 + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX RET %endif -stub_%2_h264_qpel%4_%3_10_%1: +stub_%1_h264_qpel%3_%2_10 %+ SUFFIX: %endmacro ;----------------------------------------------------------------------------- @@ -189,14 +181,14 @@ stub_%2_h264_qpel%4_%3_10_%1: %endmacro %macro MC00 1 -INIT_MMX -cglobal_mc mmxext, %1, mc00, 4, 3,4,0 +INIT_MMX mmxext +cglobal_mc %1, mc00, 4, 3,4,0 lea r3, [r2*3] COPY4 ret -INIT_XMM -cglobal %1_h264_qpel8_mc00_10_sse2, 3,4 +INIT_XMM sse2 +cglobal %1_h264_qpel8_mc00_10, 3,4 lea r3, [r2*3] COPY4 lea r0, [r0+r2*4] @@ -204,7 +196,7 @@ cglobal %1_h264_qpel8_mc00_10_sse2, 3,4 COPY4 RET -cglobal %1_h264_qpel16_mc00_10_sse2, 3,4 +cglobal %1_h264_qpel16_mc00_10, 3,4 mov r3d, 8 .loop: movu m0, [r1 ] @@ -234,28 +226,32 @@ MC00 avg %macro MC_CACHE 1 %define OP_MOV mova %define PALIGNR PALIGNR_MMX -INIT_MMX -%1 mmxext , put, 4 -INIT_XMM -%1 sse2_cache64 , put, 8 +INIT_MMX mmxext +%1 put, 4 +INIT_XMM sse2, cache64 +%1 put, 8 +INIT_XMM ssse3, cache64 %define PALIGNR PALIGNR_SSSE3 -%1 ssse3_cache64, put, 8 -%1 sse2 , put, 8, 0 +%1 put, 8 +INIT_XMM sse2 +%1 put, 8, 0 %define OP_MOV AVG_MOV %define PALIGNR PALIGNR_MMX -INIT_MMX -%1 mmxext , avg, 4 -INIT_XMM -%1 sse2_cache64 , avg, 8 +INIT_MMX mmxext +%1 avg, 4 +INIT_XMM sse2, cache64 +%1 avg, 8 +INIT_XMM ssse3, cache64 %define PALIGNR PALIGNR_SSSE3 -%1 ssse3_cache64, avg, 8 -%1 sse2 , avg, 8, 0 +%1 avg, 8 +INIT_XMM sse2 +%1 avg, 8, 0 %endmacro -%macro MC20 3-4 -cglobal_mc %1, %2, mc20, %3, 3,4,9 - mov r3d, %3 +%macro MC20 2-3 +cglobal_mc %1, mc20, %2, 3,4,9 + mov r3d, %2 mova m1, [pw_pixel_max] %if num_mmregs > 8 mova m8, [pw_16] @@ -315,10 +311,10 @@ MC_CACHE MC20 ;----------------------------------------------------------------------------- ; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC30 3-4 -cglobal_mc %1, %2, mc30, %3, 3,5,9 +%macro MC30 2-3 +cglobal_mc %1, mc30, %2, 3,5,9 lea r4, [r1+2] - jmp stub_%2_h264_qpel%3_mc10_10_%1.body + jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body %endmacro MC_CACHE MC30 @@ -326,11 +322,11 @@ MC_CACHE MC30 ;----------------------------------------------------------------------------- ; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC10 3-4 -cglobal_mc %1, %2, mc10, %3, 3,5,9 +%macro MC10 2-3 +cglobal_mc %1, mc10, %2, 3,5,9 mov r4, r1 .body: - mov r3d, %3 + mov r3d, %2 mova m1, [pw_pixel_max] %if num_mmregs > 8 mova m8, [pw_16] @@ -393,8 +389,8 @@ MC_CACHE MC10 ;----------------------------------------------------------------------------- ; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro V_FILT 11 -v_filt%9_%10_10_%11: +%macro V_FILT 10 +v_filt%9_%10_10 add r4, r2 .no_addr4: FILT_V m0, m1, m2, m3, m4, m5, m6, m7 @@ -403,33 +399,33 @@ v_filt%9_%10_10_%11: ret %endmacro -INIT_MMX +INIT_MMX mmxext RESET_MM_PERMUTATION %assign i 0 %rep 4 -V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext +V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i SWAP 
0,1,2,3,4,5 %assign i i+1 %endrep -INIT_XMM +INIT_XMM sse2 RESET_MM_PERMUTATION %assign i 0 %rep 6 -V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2 +V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i SWAP 0,1,2,3,4,5 %assign i i+1 %endrep -%macro MC02 3 -cglobal_mc %1, %2, mc02, %3, 3,4,8 +%macro MC02 2 +cglobal_mc %1, mc02, %2, 3,4,8 PRELOAD_V sub r0, r2 %assign j 0 -%rep %3 +%rep %2 %assign i (j % 6) - call v_filt%3_ %+ i %+ _10_%1.no_addr4 + call v_filt%2_ %+ i %+ _10.no_addr4 OP_MOV [r0], m0 SWAP 0,1,2,3,4,5 %assign j j+1 @@ -442,8 +438,8 @@ MC MC02 ;----------------------------------------------------------------------------- ; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC01 3 -cglobal_mc %1, %2, mc01, %3, 3,5,8 +%macro MC01 2 +cglobal_mc %1, mc01, %2, 3,5,8 mov r4, r1 .body: PRELOAD_V @@ -451,9 +447,9 @@ cglobal_mc %1, %2, mc01, %3, 3,5,8 sub r4, r2 sub r0, r2 %assign j 0 -%rep %3 +%rep %2 %assign i (j % 6) - call v_filt%3_ %+ i %+ _10_%1 + call v_filt%2_ %+ i %+ _10 movu m7, [r4] pavgw m0, m7 OP_MOV [r0], m0 @@ -468,10 +464,10 @@ MC MC01 ;----------------------------------------------------------------------------- ; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC03 3 -cglobal_mc %1, %2, mc03, %3, 3,5,8 +%macro MC03 2 +cglobal_mc %1, mc03, %2, 3,5,8 lea r4, [r1+r2] - jmp stub_%2_h264_qpel%3_mc01_10_%1.body + jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body %endmacro MC MC03 @@ -479,8 +475,8 @@ MC MC03 ;----------------------------------------------------------------------------- ; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro H_FILT_AVG 3-4 -h_filt%2_%3_10_%1: +%macro H_FILT_AVG 2-3 +h_filt%1_%2_10: ;FILT_H with fewer registers and averaged with the FILT_V result ;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration ;unfortunately I need three registers, so m5 will have to be re-read from memory @@ -507,32 +503,32 @@ h_filt%2_%3_10_%1: ret %endmacro -INIT_MMX +INIT_MMX mmxext RESET_MM_PERMUTATION %assign i 0 %rep 3 -H_FILT_AVG mmxext, 4, i +H_FILT_AVG 4, i SWAP 0,1,2,3,4,5 %assign i i+1 %endrep -H_FILT_AVG mmxext, 4, i, 0 +H_FILT_AVG 4, i, 0 -INIT_XMM +INIT_XMM sse2 RESET_MM_PERMUTATION %assign i 0 %rep 6 %if i==1 -H_FILT_AVG sse2, 8, i, 0 +H_FILT_AVG 8, i, 0 %else -H_FILT_AVG sse2, 8, i +H_FILT_AVG 8, i %endif SWAP 0,1,2,3,4,5 %assign i i+1 %endrep -%macro MC11 3 +%macro MC11 2 ; this REALLY needs x86_64 -cglobal_mc %1, %2, mc11, %3, 3,6,8 +cglobal_mc %1, mc11, %2, 3,6,8 mov r4, r1 .body: PRELOAD_V @@ -542,11 +538,11 @@ cglobal_mc %1, %2, mc11, %3, 3,6,8 mov r5, r2 neg r5 %assign j 0 -%rep %3 +%rep %2 %assign i (j % 6) - call v_filt%3_ %+ i %+ _10_%1 - call h_filt%3_ %+ i %+ _10_%1 -%if %3==8 && i==1 + call v_filt%2_ %+ i %+ _10 + call h_filt%2_ %+ i %+ _10 +%if %2==8 && i==1 movu m5, [r1+r5] %endif OP_MOV [r0], m0 @@ -561,11 +557,11 @@ MC MC11 ;----------------------------------------------------------------------------- ; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC31 3 -cglobal_mc %1, %2, mc31, %3, 3,6,8 +%macro MC31 2 +cglobal_mc %1, mc31, %2, 3,6,8 mov r4, r1 add r1, 2 - jmp stub_%2_h264_qpel%3_mc11_10_%1.body + jmp 
stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body %endmacro MC MC31 @@ -573,10 +569,10 @@ MC MC31 ;----------------------------------------------------------------------------- ; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC13 3 -cglobal_mc %1, %2, mc13, %3, 3,7,12 +%macro MC13 2 +cglobal_mc %1, mc13, %2, 3,7,12 lea r4, [r1+r2] - jmp stub_%2_h264_qpel%3_mc11_10_%1.body + jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body %endmacro MC MC13 @@ -584,11 +580,11 @@ MC MC13 ;----------------------------------------------------------------------------- ; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC33 3 -cglobal_mc %1, %2, mc33, %3, 3,6,8 +%macro MC33 2 +cglobal_mc %1, mc33, %2, 3,6,8 lea r4, [r1+r2] add r1, 2 - jmp stub_%2_h264_qpel%3_mc11_10_%1.body + jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body %endmacro MC MC33 @@ -615,15 +611,15 @@ MC MC33 FILT_H2 %1, %7, %8 %endmacro -%macro HV 2 -%ifidn %1,sse2 +%macro HV 1 +%if mmsize==16 %define PAD 12 %define COUNT 2 %else %define PAD 4 %define COUNT 3 %endif -put_hv%2_10_%1: +put_hv%1_10: neg r2 ; This actually saves instructions lea r1, [r1+r2*2-mmsize+PAD] lea r4, [rsp+PAD+gprsize] @@ -640,7 +636,7 @@ put_hv%2_10_%1: movu m4, [r1] sub r1, r2 %assign i 0 -%rep %2-1 +%rep %1-1 FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7 psubw m0, [pad20] movu [r4+i*mmsize*3], m0 @@ -653,7 +649,7 @@ put_hv%2_10_%1: movu [r4+i*mmsize*3], m0 add r4, mmsize lea r1, [r1+r2*8+mmsize] -%if %2==8 +%if %1==8 lea r1, [r1+r2*4] %endif dec r3d @@ -662,12 +658,12 @@ put_hv%2_10_%1: ret %endmacro -INIT_MMX -HV mmxext, 4 -INIT_XMM -HV sse2 , 8 +INIT_MMX mmxext +HV 4 +INIT_XMM sse2 +HV 8 -%macro H_LOOP 2 +%macro H_LOOP 1 %if num_mmregs > 8 %define s1 m8 %define s2 m9 @@ -679,7 +675,7 @@ HV sse2 , 8 %define s3 [tap3] %define d1 [depad] %endif -h%2_loop_op_%1: +h%1_loop_op: movu m1, [r1+mmsize-4] movu m2, [r1+mmsize-2] mova m3, [r1+mmsize+0] @@ -726,21 +722,21 @@ h%2_loop_op_%1: ret %endmacro -INIT_MMX -H_LOOP mmxext, 4 -INIT_XMM -H_LOOP sse2 , 8 +INIT_MMX mmxext +H_LOOP 4 +INIT_XMM sse2 +H_LOOP 8 -%macro MC22 3 -cglobal_mc %1, %2, mc22, %3, 3,7,12 +%macro MC22 2 +cglobal_mc %1, mc22, %2, 3,7,12 %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel) mov r6, rsp ; backup stack pointer and rsp, ~(mmsize-1) ; align stack sub rsp, PAD - call put_hv%3_10_%1 + call put_hv%2_10 - mov r3d, %3 + mov r3d, %2 mova m7, [pw_pixel_max] %if num_mmregs > 8 pxor m0, m0 @@ -751,7 +747,7 @@ cglobal_mc %1, %2, mc22, %3, 3,7,12 %endif mov r1, rsp .h_loop: - call h%3_loop_op_%1 + call h%2_loop_op OP_MOV [r0], m1 add r0, r2 @@ -767,18 +763,18 @@ MC MC22 ;----------------------------------------------------------------------------- ; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC12 3 -cglobal_mc %1, %2, mc12, %3, 3,7,12 +%macro MC12 2 +cglobal_mc %1, mc12, %2, 3,7,12 %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel) mov r6, rsp ; backup stack pointer and rsp, ~(mmsize-1) ; align stack sub rsp, PAD - call put_hv%3_10_%1 + call put_hv%2_10 xor r4d, r4d .body: - mov r3d, %3 + mov r3d, %2 pxor m0, m0 mova m7, [pw_pixel_max] %if num_mmregs > 8 @@ -789,7 +785,7 @@ cglobal_mc %1, %2, mc12, %3, 3,7,12 %endif mov r1, rsp .h_loop: - call h%3_loop_op_%1 + call h%2_loop_op movu m3, [r1+r4-2*mmsize] ; movu needed for 
mc32, etc paddw m3, [depad2] @@ -812,17 +808,17 @@ MC MC12 ;----------------------------------------------------------------------------- ; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC32 3 -cglobal_mc %1, %2, mc32, %3, 3,7,12 +%macro MC32 2 +cglobal_mc %1, mc32, %2, 3,7,12 %define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) mov r6, rsp ; backup stack pointer and rsp, ~(mmsize-1) ; align stack sub rsp, PAD - call put_hv%3_10_%1 + call put_hv%2_10 mov r4d, 2 ; sizeof(pixel) - jmp stub_%2_h264_qpel%3_mc12_10_%1.body + jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body %endmacro MC MC32 @@ -830,10 +826,10 @@ MC MC32 ;----------------------------------------------------------------------------- ; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro H_NRD 2 -put_h%2_10_%1: +%macro H_NRD 1 +put_h%1_10: add rsp, gprsize - mov r3d, %2 + mov r3d, %1 xor r4d, r4d mova m6, [pad20] .nextrow: @@ -855,13 +851,13 @@ put_h%2_10_%1: ret %endmacro -INIT_MMX -H_NRD mmxext, 4 -INIT_XMM -H_NRD sse2 , 8 +INIT_MMX mmxext +H_NRD 4 +INIT_XMM sse2 +H_NRD 8 -%macro MC21 3 -cglobal_mc %1, %2, mc21, %3, 3,7,12 +%macro MC21 2 +cglobal_mc %1, mc21, %2, 3,7,12 mov r5, r1 .body: %define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) @@ -869,13 +865,13 @@ cglobal_mc %1, %2, mc21, %3, 3,7,12 and rsp, ~(mmsize-1) ; align stack sub rsp, PAD - call put_h%3_10_%1 + call put_h%2_10 sub rsp, PAD - call put_hv%3_10_%1 + call put_hv%2_10 mov r4d, PAD-mmsize ; H buffer - jmp stub_%2_h264_qpel%3_mc12_10_%1.body + jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body %endmacro MC MC21 @@ -883,10 +879,10 @@ MC MC21 ;----------------------------------------------------------------------------- ; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC23 3 -cglobal_mc %1, %2, mc23, %3, 3,7,12 +%macro MC23 2 +cglobal_mc %1, mc23, %2, 3,7,12 lea r5, [r1+r2] - jmp stub_%2_h264_qpel%3_mc21_10_%1.body + jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body %endmacro MC MC23 From 4b60fac4199680957b15b7a08c5df47e47c6e25e Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Sun, 8 Jul 2012 01:30:30 +0200 Subject: [PATCH 2/2] x86: PALIGNR: port to cpuflags --- libavcodec/x86/h264_intrapred.asm | 27 ------------------------- libavcodec/x86/h264_intrapred_10bit.asm | 16 --------------- libavcodec/x86/h264_qpel_10bit.asm | 4 ---- libavresample/x86/audio_convert.asm | 6 ------ libavutil/x86/x86util.asm | 16 +++++++-------- libswscale/x86/output.asm | 4 +--- 6 files changed, 9 insertions(+), 64 deletions(-) diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm index dc3d475e44..8faaaf4f06 100644 --- a/libavcodec/x86/h264_intrapred.asm +++ b/libavcodec/x86/h264_intrapred.asm @@ -1063,10 +1063,8 @@ cglobal pred8x8l_top_dc_8, 4,4 %endmacro INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX PRED8x8L_TOP_DC INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_TOP_DC ;----------------------------------------------------------------------------- @@ -1168,10 +1166,8 @@ cglobal pred8x8l_dc_8, 4,5 %endmacro INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX PRED8x8L_DC INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_DC ;----------------------------------------------------------------------------- @@ -1241,10 +1237,8 @@ cglobal pred8x8l_horizontal_8, 4,4 %endmacro 
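The h264_intrapred.asm changes in this patch are all one mechanical pattern: since PALIGNR itself now tests cpuflag(ssse3) (see the x86util.asm hunk later in this patch), every %define PALIGNR PALIGNR_MMX / PALIGNR_SSSE3 pair is redundant, and each instantiation collapses to the INIT_* line alone. A minimal sketch, with PRED_EXAMPLE standing in for any of the prediction macros that use PALIGNR internally:

INIT_MMX mmxext
PRED_EXAMPLE        ; PALIGNR expands to the psllq/psrlq/por fallback
INIT_MMX ssse3
PRED_EXAMPLE        ; same macro body, PALIGNR emits a single palignr
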
INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX PRED8x8L_HORIZONTAL INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_HORIZONTAL ;----------------------------------------------------------------------------- @@ -1293,10 +1287,8 @@ cglobal pred8x8l_vertical_8, 4,4 %endmacro INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX PRED8x8L_VERTICAL INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_VERTICAL ;----------------------------------------------------------------------------- @@ -1304,7 +1296,6 @@ PRED8x8L_VERTICAL ;----------------------------------------------------------------------------- INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX cglobal pred8x8l_down_left_8, 4,5 sub r0, r3 movq mm0, [r0-8] @@ -1496,10 +1487,8 @@ INIT_XMM cpuname %endmacro INIT_MMX sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_DOWN_LEFT INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_DOWN_LEFT ;----------------------------------------------------------------------------- @@ -1507,7 +1496,6 @@ PRED8x8L_DOWN_LEFT ;----------------------------------------------------------------------------- INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX cglobal pred8x8l_down_right_8, 4,5 sub r0, r3 lea r4, [r0+r3*2] @@ -1750,10 +1738,8 @@ INIT_XMM cpuname %endmacro INIT_MMX sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_DOWN_RIGHT INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_DOWN_RIGHT ;----------------------------------------------------------------------------- @@ -1761,7 +1747,6 @@ PRED8x8L_DOWN_RIGHT ;----------------------------------------------------------------------------- INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX cglobal pred8x8l_vertical_right_8, 4,5 sub r0, r3 lea r4, [r0+r3*2] @@ -1980,10 +1965,8 @@ INIT_XMM cpuname %endmacro INIT_MMX sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_VERTICAL_RIGHT INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_VERTICAL_RIGHT ;----------------------------------------------------------------------------- @@ -2071,10 +2054,8 @@ INIT_XMM cpuname %endmacro INIT_MMX sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_VERTICAL_LEFT INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_VERTICAL_LEFT ;----------------------------------------------------------------------------- @@ -2160,10 +2141,8 @@ cglobal pred8x8l_horizontal_up_8, 4,4 %endmacro INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX PRED8x8L_HORIZONTAL_UP INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_HORIZONTAL_UP ;----------------------------------------------------------------------------- @@ -2171,7 +2150,6 @@ PRED8x8L_HORIZONTAL_UP ;----------------------------------------------------------------------------- INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX cglobal pred8x8l_horizontal_down_8, 4,5 sub r0, r3 lea r4, [r0+r3*2] @@ -2411,10 +2389,8 @@ INIT_XMM cpuname %endmacro INIT_MMX sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_HORIZONTAL_DOWN INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_HORIZONTAL_DOWN ;----------------------------------------------------------------------------- @@ -2637,7 +2613,6 @@ cglobal pred4x4_horizontal_up_8, 3,3 ;----------------------------------------------------------------------------- INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX cglobal pred4x4_horizontal_down_8, 3,3 sub r0, r2 lea r1, [r0+r2*2] @@ -2673,7 +2648,6 @@ cglobal pred4x4_horizontal_down_8, 3,3 ;----------------------------------------------------------------------------- INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX cglobal pred4x4_vertical_right_8, 3,3 sub r0, r2 lea r1, [r0+r2*2] @@ -2704,7 +2678,6 @@ cglobal 
pred4x4_vertical_right_8, 3,3 ;----------------------------------------------------------------------------- INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX cglobal pred4x4_down_right_8, 3,3 sub r0, r2 lea r1, [r0+r2*2] diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm index 50ebaa78c3..039af6d712 100644 --- a/libavcodec/x86/h264_intrapred_10bit.asm +++ b/libavcodec/x86/h264_intrapred_10bit.asm @@ -79,10 +79,8 @@ cglobal pred4x4_down_right_10, 3, 3 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED4x4_DR INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED4x4_DR %if HAVE_AVX_EXTERNAL INIT_XMM avx @@ -120,10 +118,8 @@ cglobal pred4x4_vertical_right_10, 3, 3, 6 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED4x4_VR INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED4x4_VR %if HAVE_AVX_EXTERNAL INIT_XMM avx @@ -164,10 +160,8 @@ cglobal pred4x4_horizontal_down_10, 3, 3 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED4x4_HD INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED4x4_HD %if HAVE_AVX_EXTERNAL INIT_XMM avx @@ -726,10 +720,8 @@ cglobal pred8x8l_horizontal_10, 4, 4, 5 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_HORIZONTAL INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_HORIZONTAL %if HAVE_AVX_EXTERNAL INIT_XMM avx @@ -796,10 +788,8 @@ cglobal pred8x8l_down_left_10, 4, 4, 7 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_DOWN_LEFT INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_DOWN_LEFT %if HAVE_AVX_EXTERNAL INIT_XMM avx @@ -872,10 +862,8 @@ cglobal pred8x8l_down_right_10, 4, 5, 8 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_DOWN_RIGHT INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_DOWN_RIGHT %if HAVE_AVX_EXTERNAL INIT_XMM avx @@ -944,10 +932,8 @@ cglobal pred8x8l_vertical_right_10, 4, 5, 7 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_VERTICAL_RIGHT INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_VERTICAL_RIGHT %if HAVE_AVX_EXTERNAL INIT_XMM avx @@ -1007,10 +993,8 @@ cglobal pred8x8l_horizontal_up_10, 4, 4, 6 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_HORIZONTAL_UP INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_HORIZONTAL_UP %if HAVE_AVX_EXTERNAL INIT_XMM avx diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm index 4aea03209c..c05c7a64d8 100644 --- a/libavcodec/x86/h264_qpel_10bit.asm +++ b/libavcodec/x86/h264_qpel_10bit.asm @@ -225,25 +225,21 @@ MC00 avg ;----------------------------------------------------------------------------- %macro MC_CACHE 1 %define OP_MOV mova -%define PALIGNR PALIGNR_MMX INIT_MMX mmxext %1 put, 4 INIT_XMM sse2, cache64 %1 put, 8 INIT_XMM ssse3, cache64 -%define PALIGNR PALIGNR_SSSE3 %1 put, 8 INIT_XMM sse2 %1 put, 8, 0 %define OP_MOV AVG_MOV -%define PALIGNR PALIGNR_MMX INIT_MMX mmxext %1 avg, 4 INIT_XMM sse2, cache64 %1 avg, 8 INIT_XMM ssse3, cache64 -%define PALIGNR PALIGNR_SSSE3 %1 avg, 8 INIT_XMM sse2 %1 avg, 8, 0 diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm index 567a916bc8..1d125c2b50 100644 --- a/libavresample/x86/audio_convert.asm +++ b/libavresample/x86/audio_convert.asm @@ -919,10 +919,8 @@ cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5 REP_RET %endmacro -%define PALIGNR PALIGNR_MMX INIT_XMM sse2 CONV_S16_TO_S16P_6CH -%define PALIGNR PALIGNR_SSSE3 INIT_XMM ssse3 CONV_S16_TO_S16P_6CH %if HAVE_AVX_EXTERNAL @@ -1038,10 +1036,8 @@ cglobal conv_s16_to_fltp_6ch, 2,7,7, 
dst, src, dst1, dst2, dst3, dst4, dst5 REP_RET %endmacro -%define PALIGNR PALIGNR_MMX INIT_XMM sse2 CONV_S16_TO_FLTP_6CH -%define PALIGNR PALIGNR_SSSE3 INIT_XMM ssse3 CONV_S16_TO_FLTP_6CH INIT_XMM sse4 @@ -1160,10 +1156,8 @@ cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5 REP_RET %endmacro -%define PALIGNR PALIGNR_MMX INIT_XMM sse2 CONV_FLT_TO_S16P_6CH -%define PALIGNR PALIGNR_SSSE3 INIT_XMM ssse3 CONV_FLT_TO_S16P_6CH %if HAVE_AVX_EXTERNAL diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm index b35d5945d8..31163eec7b 100644 --- a/libavutil/x86/x86util.asm +++ b/libavutil/x86/x86util.asm @@ -280,7 +280,14 @@ %endif %endmacro -%macro PALIGNR_MMX 4-5 ; [dst,] src1, src2, imm, tmp +%macro PALIGNR 4-5 +%if cpuflag(ssse3) +%if %0==5 + palignr %1, %2, %3, %4 +%else + palignr %1, %2, %3 +%endif +%elif cpuflag(mmx) ; [dst,] src1, src2, imm, tmp %define %%dst %1 %if %0==5 %ifnidn %1, %2 @@ -299,13 +306,6 @@ psrldq %4, %3 %endif por %%dst, %4 -%endmacro - -%macro PALIGNR_SSSE3 4-5 -%if %0==5 - palignr %1, %2, %3, %4 -%else - palignr %1, %2, %3 %endif %endmacro diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm index cf0dec3843..23508b8d82 100644 --- a/libswscale/x86/output.asm +++ b/libswscale/x86/output.asm @@ -246,7 +246,6 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset %endif ; %1 == 8/9/10/16 %endmacro -%define PALIGNR PALIGNR_MMX %if ARCH_X86_32 INIT_MMX mmx2 yuv2planeX_fn 8, 0, 7 @@ -259,7 +258,6 @@ yuv2planeX_fn 8, 10, 7 yuv2planeX_fn 9, 7, 5 yuv2planeX_fn 10, 7, 5 -%define PALIGNR PALIGNR_SSSE3 INIT_XMM sse4 yuv2planeX_fn 8, 10, 7 yuv2planeX_fn 9, 7, 5 @@ -344,7 +342,7 @@ cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset %if mmsize == 16 punpcklqdq m3, m3 %endif ; mmsize == 16 - PALIGNR_MMX m3, m3, 3, m2 + PALIGNR m3, m3, 3, m2 .no_rot: %if mmsize == 8 mova m2, m3
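
For reference, a hedged usage sketch of the merged PALIGNR macro (register choices illustrative): with four arguments the first operand is both source and destination, with five a separate dst comes first, and under ssse3 the trailing tmp register is simply unused.

INIT_XMM sse2
    PALIGNR m0, m1, 6, m2   ; pre-ssse3: emulated as pslldq/psrldq/por, m2 is clobbered
INIT_XMM ssse3
    PALIGNR m0, m1, 6, m2   ; single "palignr m0, m1, 6", m2 is left untouched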