From 4d1f69f2440041b58d5a31bcfcff83ee3c88ac7e Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Mon, 30 Jul 2012 17:04:26 +0200 Subject: [PATCH 1/2] x86: h264_qpel_10bit: port to cpuflags --- libavcodec/x86/h264_qpel_10bit.asm | 314 ++++++++++++++--------------- 1 file changed, 155 insertions(+), 159 deletions(-) diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm index a84b810955..4aea03209c 100644 --- a/libavcodec/x86/h264_qpel_10bit.asm +++ b/libavcodec/x86/h264_qpel_10bit.asm @@ -97,81 +97,73 @@ SECTION .text %macro MC 1 %define OP_MOV mova -INIT_MMX -%1 mmxext, put, 4 -INIT_XMM -%1 sse2 , put, 8 +INIT_MMX mmxext +%1 put, 4 +INIT_XMM sse2 +%1 put, 8 %define OP_MOV AVG_MOV -INIT_MMX -%1 mmxext, avg, 4 -INIT_XMM -%1 sse2 , avg, 8 +INIT_MMX mmxext +%1 avg, 4 +INIT_XMM sse2 +%1 avg, 8 %endmacro -%macro MCAxA 8 -%if ARCH_X86_64 -%ifnidn %1,mmxext -MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8 -%endif -%else -MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8 -%endif -%endmacro - -%macro MCAxA_OP 8 +%macro MCAxA_OP 7 %if ARCH_X86_32 -cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8 - call stub_%2_h264_qpel%4_%3_10_%1 +cglobal %1_h264_qpel%4_%2_10, %5,%6,%7 + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX mov r0, r0m mov r1, r1m - add r0, %4*2 - add r1, %4*2 - call stub_%2_h264_qpel%4_%3_10_%1 + add r0, %3*2 + add r1, %3*2 + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX mov r0, r0m mov r1, r1m - lea r0, [r0+r2*%4] - lea r1, [r1+r2*%4] - call stub_%2_h264_qpel%4_%3_10_%1 + lea r0, [r0+r2*%3] + lea r1, [r1+r2*%3] + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX mov r0, r0m mov r1, r1m - lea r0, [r0+r2*%4+%4*2] - lea r1, [r1+r2*%4+%4*2] - call stub_%2_h264_qpel%4_%3_10_%1 + lea r0, [r0+r2*%3+%3*2] + lea r1, [r1+r2*%3+%3*2] + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX RET %else ; ARCH_X86_64 -cglobal %2_h264_qpel%5_%3_10_%1, %6,%7 + 2,%8 - mov r%7, r0 -%assign p1 %7+1 +cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7 + mov r%6, r0 +%assign p1 %6+1 mov r %+ p1, r1 - call stub_%2_h264_qpel%4_%3_10_%1 - lea r0, [r%7+%4*2] - lea r1, [r %+ p1+%4*2] - call stub_%2_h264_qpel%4_%3_10_%1 - lea r0, [r%7+r2*%4] - lea r1, [r %+ p1+r2*%4] - call stub_%2_h264_qpel%4_%3_10_%1 - lea r0, [r%7+r2*%4+%4*2] - lea r1, [r %+ p1+r2*%4+%4*2] + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX + lea r0, [r%6+%3*2] + lea r1, [r %+ p1+%3*2] + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX + lea r0, [r%6+r2*%3] + lea r1, [r %+ p1+r2*%3] + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX + lea r0, [r%6+r2*%3+%3*2] + lea r1, [r %+ p1+r2*%3+%3*2] %if UNIX64 == 0 ; fall through to function - call stub_%2_h264_qpel%4_%3_10_%1 + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX RET %endif %endif %endmacro ;cpu, put/avg, mc, 4/8, ... 
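For context, a minimal sketch of the x86inc cpuflags convention these hunks switch to; it is not part of the patch, and EXAMPLE_COPY plus its stub label are hypothetical names assuming this tree's x86inc.asm. INIT_MMX/INIT_XMM now take the instruction set as an argument, cglobal appends the cpu name by itself, and SUFFIX expands to e.g. "_sse2", so hand-written labels are built with the %+ token-pasting operator instead of threading the cpu name through every macro as a parameter. Cpu tests likewise become cpuflag(sse2) instead of string-comparing the suffix (%ifnidn %1,mmxext).

%macro EXAMPLE_COPY 1
cglobal example_copy%1_10, 3,3          ; emits example_copy4_10_mmxext etc.
    call stub_example_copy%1_10 %+ SUFFIX
    RET
stub_example_copy%1_10 %+ SUFFIX:
    mova   m0, [r1]                     ; 8 bytes under INIT_MMX, 16 under INIT_XMM
    mova   [r0], m0
    ret
%endmacro

INIT_MMX mmxext                         ; mmsize == 8,  SUFFIX -> _mmxext
EXAMPLE_COPY 4
INIT_XMM sse2                           ; mmsize == 16, SUFFIX -> _sse2
EXAMPLE_COPY 8
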
-%macro cglobal_mc 7 -%assign i %4*2 -MCAxA %1, %2, %3, %4, i, %5,%6,%7 +%macro cglobal_mc 6 +%assign i %3*2 +%if ARCH_X86_32 || cpuflag(sse2) +MCAxA_OP %1, %2, %3, i, %4,%5,%6 +%endif -cglobal %2_h264_qpel%4_%3_10_%1, %5,%6,%7 +cglobal %1_h264_qpel%3_%2_10, %4,%5,%6 %if UNIX64 == 0 ; no prologue or epilogue for UNIX64 - call stub_%2_h264_qpel%4_%3_10_%1 + call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX RET %endif -stub_%2_h264_qpel%4_%3_10_%1: +stub_%1_h264_qpel%3_%2_10 %+ SUFFIX: %endmacro ;----------------------------------------------------------------------------- @@ -189,14 +181,14 @@ stub_%2_h264_qpel%4_%3_10_%1: %endmacro %macro MC00 1 -INIT_MMX -cglobal_mc mmxext, %1, mc00, 4, 3,4,0 +INIT_MMX mmxext +cglobal_mc %1, mc00, 4, 3,4,0 lea r3, [r2*3] COPY4 ret -INIT_XMM -cglobal %1_h264_qpel8_mc00_10_sse2, 3,4 +INIT_XMM sse2 +cglobal %1_h264_qpel8_mc00_10, 3,4 lea r3, [r2*3] COPY4 lea r0, [r0+r2*4] @@ -204,7 +196,7 @@ cglobal %1_h264_qpel8_mc00_10_sse2, 3,4 COPY4 RET -cglobal %1_h264_qpel16_mc00_10_sse2, 3,4 +cglobal %1_h264_qpel16_mc00_10, 3,4 mov r3d, 8 .loop: movu m0, [r1 ] @@ -234,28 +226,32 @@ MC00 avg %macro MC_CACHE 1 %define OP_MOV mova %define PALIGNR PALIGNR_MMX -INIT_MMX -%1 mmxext , put, 4 -INIT_XMM -%1 sse2_cache64 , put, 8 +INIT_MMX mmxext +%1 put, 4 +INIT_XMM sse2, cache64 +%1 put, 8 +INIT_XMM ssse3, cache64 %define PALIGNR PALIGNR_SSSE3 -%1 ssse3_cache64, put, 8 -%1 sse2 , put, 8, 0 +%1 put, 8 +INIT_XMM sse2 +%1 put, 8, 0 %define OP_MOV AVG_MOV %define PALIGNR PALIGNR_MMX -INIT_MMX -%1 mmxext , avg, 4 -INIT_XMM -%1 sse2_cache64 , avg, 8 +INIT_MMX mmxext +%1 avg, 4 +INIT_XMM sse2, cache64 +%1 avg, 8 +INIT_XMM ssse3, cache64 %define PALIGNR PALIGNR_SSSE3 -%1 ssse3_cache64, avg, 8 -%1 sse2 , avg, 8, 0 +%1 avg, 8 +INIT_XMM sse2 +%1 avg, 8, 0 %endmacro -%macro MC20 3-4 -cglobal_mc %1, %2, mc20, %3, 3,4,9 - mov r3d, %3 +%macro MC20 2-3 +cglobal_mc %1, mc20, %2, 3,4,9 + mov r3d, %2 mova m1, [pw_pixel_max] %if num_mmregs > 8 mova m8, [pw_16] @@ -315,10 +311,10 @@ MC_CACHE MC20 ;----------------------------------------------------------------------------- ; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC30 3-4 -cglobal_mc %1, %2, mc30, %3, 3,5,9 +%macro MC30 2-3 +cglobal_mc %1, mc30, %2, 3,5,9 lea r4, [r1+2] - jmp stub_%2_h264_qpel%3_mc10_10_%1.body + jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body %endmacro MC_CACHE MC30 @@ -326,11 +322,11 @@ MC_CACHE MC30 ;----------------------------------------------------------------------------- ; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC10 3-4 -cglobal_mc %1, %2, mc10, %3, 3,5,9 +%macro MC10 2-3 +cglobal_mc %1, mc10, %2, 3,5,9 mov r4, r1 .body: - mov r3d, %3 + mov r3d, %2 mova m1, [pw_pixel_max] %if num_mmregs > 8 mova m8, [pw_16] @@ -393,8 +389,8 @@ MC_CACHE MC10 ;----------------------------------------------------------------------------- ; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro V_FILT 11 -v_filt%9_%10_10_%11: +%macro V_FILT 10 +v_filt%9_%10_10 add r4, r2 .no_addr4: FILT_V m0, m1, m2, m3, m4, m5, m6, m7 @@ -403,33 +399,33 @@ v_filt%9_%10_10_%11: ret %endmacro -INIT_MMX +INIT_MMX mmxext RESET_MM_PERMUTATION %assign i 0 %rep 4 -V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext +V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i SWAP 
0,1,2,3,4,5 %assign i i+1 %endrep -INIT_XMM +INIT_XMM sse2 RESET_MM_PERMUTATION %assign i 0 %rep 6 -V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2 +V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i SWAP 0,1,2,3,4,5 %assign i i+1 %endrep -%macro MC02 3 -cglobal_mc %1, %2, mc02, %3, 3,4,8 +%macro MC02 2 +cglobal_mc %1, mc02, %2, 3,4,8 PRELOAD_V sub r0, r2 %assign j 0 -%rep %3 +%rep %2 %assign i (j % 6) - call v_filt%3_ %+ i %+ _10_%1.no_addr4 + call v_filt%2_ %+ i %+ _10.no_addr4 OP_MOV [r0], m0 SWAP 0,1,2,3,4,5 %assign j j+1 @@ -442,8 +438,8 @@ MC MC02 ;----------------------------------------------------------------------------- ; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC01 3 -cglobal_mc %1, %2, mc01, %3, 3,5,8 +%macro MC01 2 +cglobal_mc %1, mc01, %2, 3,5,8 mov r4, r1 .body: PRELOAD_V @@ -451,9 +447,9 @@ cglobal_mc %1, %2, mc01, %3, 3,5,8 sub r4, r2 sub r0, r2 %assign j 0 -%rep %3 +%rep %2 %assign i (j % 6) - call v_filt%3_ %+ i %+ _10_%1 + call v_filt%2_ %+ i %+ _10 movu m7, [r4] pavgw m0, m7 OP_MOV [r0], m0 @@ -468,10 +464,10 @@ MC MC01 ;----------------------------------------------------------------------------- ; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC03 3 -cglobal_mc %1, %2, mc03, %3, 3,5,8 +%macro MC03 2 +cglobal_mc %1, mc03, %2, 3,5,8 lea r4, [r1+r2] - jmp stub_%2_h264_qpel%3_mc01_10_%1.body + jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body %endmacro MC MC03 @@ -479,8 +475,8 @@ MC MC03 ;----------------------------------------------------------------------------- ; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro H_FILT_AVG 3-4 -h_filt%2_%3_10_%1: +%macro H_FILT_AVG 2-3 +h_filt%1_%2_10: ;FILT_H with fewer registers and averaged with the FILT_V result ;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration ;unfortunately I need three registers, so m5 will have to be re-read from memory @@ -507,32 +503,32 @@ h_filt%2_%3_10_%1: ret %endmacro -INIT_MMX +INIT_MMX mmxext RESET_MM_PERMUTATION %assign i 0 %rep 3 -H_FILT_AVG mmxext, 4, i +H_FILT_AVG 4, i SWAP 0,1,2,3,4,5 %assign i i+1 %endrep -H_FILT_AVG mmxext, 4, i, 0 +H_FILT_AVG 4, i, 0 -INIT_XMM +INIT_XMM sse2 RESET_MM_PERMUTATION %assign i 0 %rep 6 %if i==1 -H_FILT_AVG sse2, 8, i, 0 +H_FILT_AVG 8, i, 0 %else -H_FILT_AVG sse2, 8, i +H_FILT_AVG 8, i %endif SWAP 0,1,2,3,4,5 %assign i i+1 %endrep -%macro MC11 3 +%macro MC11 2 ; this REALLY needs x86_64 -cglobal_mc %1, %2, mc11, %3, 3,6,8 +cglobal_mc %1, mc11, %2, 3,6,8 mov r4, r1 .body: PRELOAD_V @@ -542,11 +538,11 @@ cglobal_mc %1, %2, mc11, %3, 3,6,8 mov r5, r2 neg r5 %assign j 0 -%rep %3 +%rep %2 %assign i (j % 6) - call v_filt%3_ %+ i %+ _10_%1 - call h_filt%3_ %+ i %+ _10_%1 -%if %3==8 && i==1 + call v_filt%2_ %+ i %+ _10 + call h_filt%2_ %+ i %+ _10 +%if %2==8 && i==1 movu m5, [r1+r5] %endif OP_MOV [r0], m0 @@ -561,11 +557,11 @@ MC MC11 ;----------------------------------------------------------------------------- ; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC31 3 -cglobal_mc %1, %2, mc31, %3, 3,6,8 +%macro MC31 2 +cglobal_mc %1, mc31, %2, 3,6,8 mov r4, r1 add r1, 2 - jmp stub_%2_h264_qpel%3_mc11_10_%1.body + jmp 
stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body %endmacro MC MC31 @@ -573,10 +569,10 @@ MC MC31 ;----------------------------------------------------------------------------- ; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC13 3 -cglobal_mc %1, %2, mc13, %3, 3,7,12 +%macro MC13 2 +cglobal_mc %1, mc13, %2, 3,7,12 lea r4, [r1+r2] - jmp stub_%2_h264_qpel%3_mc11_10_%1.body + jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body %endmacro MC MC13 @@ -584,11 +580,11 @@ MC MC13 ;----------------------------------------------------------------------------- ; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC33 3 -cglobal_mc %1, %2, mc33, %3, 3,6,8 +%macro MC33 2 +cglobal_mc %1, mc33, %2, 3,6,8 lea r4, [r1+r2] add r1, 2 - jmp stub_%2_h264_qpel%3_mc11_10_%1.body + jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body %endmacro MC MC33 @@ -615,15 +611,15 @@ MC MC33 FILT_H2 %1, %7, %8 %endmacro -%macro HV 2 -%ifidn %1,sse2 +%macro HV 1 +%if mmsize==16 %define PAD 12 %define COUNT 2 %else %define PAD 4 %define COUNT 3 %endif -put_hv%2_10_%1: +put_hv%1_10: neg r2 ; This actually saves instructions lea r1, [r1+r2*2-mmsize+PAD] lea r4, [rsp+PAD+gprsize] @@ -640,7 +636,7 @@ put_hv%2_10_%1: movu m4, [r1] sub r1, r2 %assign i 0 -%rep %2-1 +%rep %1-1 FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7 psubw m0, [pad20] movu [r4+i*mmsize*3], m0 @@ -653,7 +649,7 @@ put_hv%2_10_%1: movu [r4+i*mmsize*3], m0 add r4, mmsize lea r1, [r1+r2*8+mmsize] -%if %2==8 +%if %1==8 lea r1, [r1+r2*4] %endif dec r3d @@ -662,12 +658,12 @@ put_hv%2_10_%1: ret %endmacro -INIT_MMX -HV mmxext, 4 -INIT_XMM -HV sse2 , 8 +INIT_MMX mmxext +HV 4 +INIT_XMM sse2 +HV 8 -%macro H_LOOP 2 +%macro H_LOOP 1 %if num_mmregs > 8 %define s1 m8 %define s2 m9 @@ -679,7 +675,7 @@ HV sse2 , 8 %define s3 [tap3] %define d1 [depad] %endif -h%2_loop_op_%1: +h%1_loop_op: movu m1, [r1+mmsize-4] movu m2, [r1+mmsize-2] mova m3, [r1+mmsize+0] @@ -726,21 +722,21 @@ h%2_loop_op_%1: ret %endmacro -INIT_MMX -H_LOOP mmxext, 4 -INIT_XMM -H_LOOP sse2 , 8 +INIT_MMX mmxext +H_LOOP 4 +INIT_XMM sse2 +H_LOOP 8 -%macro MC22 3 -cglobal_mc %1, %2, mc22, %3, 3,7,12 +%macro MC22 2 +cglobal_mc %1, mc22, %2, 3,7,12 %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel) mov r6, rsp ; backup stack pointer and rsp, ~(mmsize-1) ; align stack sub rsp, PAD - call put_hv%3_10_%1 + call put_hv%2_10 - mov r3d, %3 + mov r3d, %2 mova m7, [pw_pixel_max] %if num_mmregs > 8 pxor m0, m0 @@ -751,7 +747,7 @@ cglobal_mc %1, %2, mc22, %3, 3,7,12 %endif mov r1, rsp .h_loop: - call h%3_loop_op_%1 + call h%2_loop_op OP_MOV [r0], m1 add r0, r2 @@ -767,18 +763,18 @@ MC MC22 ;----------------------------------------------------------------------------- ; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC12 3 -cglobal_mc %1, %2, mc12, %3, 3,7,12 +%macro MC12 2 +cglobal_mc %1, mc12, %2, 3,7,12 %define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel) mov r6, rsp ; backup stack pointer and rsp, ~(mmsize-1) ; align stack sub rsp, PAD - call put_hv%3_10_%1 + call put_hv%2_10 xor r4d, r4d .body: - mov r3d, %3 + mov r3d, %2 pxor m0, m0 mova m7, [pw_pixel_max] %if num_mmregs > 8 @@ -789,7 +785,7 @@ cglobal_mc %1, %2, mc12, %3, 3,7,12 %endif mov r1, rsp .h_loop: - call h%3_loop_op_%1 + call h%2_loop_op movu m3, [r1+r4-2*mmsize] ; movu needed for 
mc32, etc paddw m3, [depad2] @@ -812,17 +808,17 @@ MC MC12 ;----------------------------------------------------------------------------- ; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC32 3 -cglobal_mc %1, %2, mc32, %3, 3,7,12 +%macro MC32 2 +cglobal_mc %1, mc32, %2, 3,7,12 %define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) mov r6, rsp ; backup stack pointer and rsp, ~(mmsize-1) ; align stack sub rsp, PAD - call put_hv%3_10_%1 + call put_hv%2_10 mov r4d, 2 ; sizeof(pixel) - jmp stub_%2_h264_qpel%3_mc12_10_%1.body + jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body %endmacro MC MC32 @@ -830,10 +826,10 @@ MC MC32 ;----------------------------------------------------------------------------- ; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro H_NRD 2 -put_h%2_10_%1: +%macro H_NRD 1 +put_h%1_10: add rsp, gprsize - mov r3d, %2 + mov r3d, %1 xor r4d, r4d mova m6, [pad20] .nextrow: @@ -855,13 +851,13 @@ put_h%2_10_%1: ret %endmacro -INIT_MMX -H_NRD mmxext, 4 -INIT_XMM -H_NRD sse2 , 8 +INIT_MMX mmxext +H_NRD 4 +INIT_XMM sse2 +H_NRD 8 -%macro MC21 3 -cglobal_mc %1, %2, mc21, %3, 3,7,12 +%macro MC21 2 +cglobal_mc %1, mc21, %2, 3,7,12 mov r5, r1 .body: %define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel) @@ -869,13 +865,13 @@ cglobal_mc %1, %2, mc21, %3, 3,7,12 and rsp, ~(mmsize-1) ; align stack sub rsp, PAD - call put_h%3_10_%1 + call put_h%2_10 sub rsp, PAD - call put_hv%3_10_%1 + call put_hv%2_10 mov r4d, PAD-mmsize ; H buffer - jmp stub_%2_h264_qpel%3_mc12_10_%1.body + jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body %endmacro MC MC21 @@ -883,10 +879,10 @@ MC MC21 ;----------------------------------------------------------------------------- ; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride) ;----------------------------------------------------------------------------- -%macro MC23 3 -cglobal_mc %1, %2, mc23, %3, 3,7,12 +%macro MC23 2 +cglobal_mc %1, mc23, %2, 3,7,12 lea r5, [r1+r2] - jmp stub_%2_h264_qpel%3_mc21_10_%1.body + jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body %endmacro MC MC23 From 4b60fac4199680957b15b7a08c5df47e47c6e25e Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Sun, 8 Jul 2012 01:30:30 +0200 Subject: [PATCH 2/2] x86: PALIGNR: port to cpuflags --- libavcodec/x86/h264_intrapred.asm | 27 ------------------------- libavcodec/x86/h264_intrapred_10bit.asm | 16 --------------- libavcodec/x86/h264_qpel_10bit.asm | 4 ---- libavresample/x86/audio_convert.asm | 6 ------ libavutil/x86/x86util.asm | 16 +++++++-------- libswscale/x86/output.asm | 4 +--- 6 files changed, 9 insertions(+), 64 deletions(-) diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm index dc3d475e44..8faaaf4f06 100644 --- a/libavcodec/x86/h264_intrapred.asm +++ b/libavcodec/x86/h264_intrapred.asm @@ -1063,10 +1063,8 @@ cglobal pred8x8l_top_dc_8, 4,4 %endmacro INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX PRED8x8L_TOP_DC INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_TOP_DC ;----------------------------------------------------------------------------- @@ -1168,10 +1166,8 @@ cglobal pred8x8l_dc_8, 4,5 %endmacro INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX PRED8x8L_DC INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_DC ;----------------------------------------------------------------------------- @@ -1241,10 +1237,8 @@ cglobal pred8x8l_horizontal_8, 4,4 %endmacro 
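The h264_intrapred.asm changes in this patch are all one mechanical pattern: since PALIGNR itself now tests cpuflag(ssse3) (see the x86util.asm hunk later in this patch), every %define PALIGNR PALIGNR_MMX / PALIGNR_SSSE3 pair is redundant, and each instantiation collapses to the INIT_* line alone. A minimal sketch, with PRED_EXAMPLE standing in for any of the prediction macros that use PALIGNR internally:

INIT_MMX mmxext
PRED_EXAMPLE        ; PALIGNR expands to the psllq/psrlq/por fallback
INIT_MMX ssse3
PRED_EXAMPLE        ; same macro body, PALIGNR emits a single palignr
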
INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX PRED8x8L_HORIZONTAL INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_HORIZONTAL ;----------------------------------------------------------------------------- @@ -1293,10 +1287,8 @@ cglobal pred8x8l_vertical_8, 4,4 %endmacro INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX PRED8x8L_VERTICAL INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_VERTICAL ;----------------------------------------------------------------------------- @@ -1304,7 +1296,6 @@ PRED8x8L_VERTICAL ;----------------------------------------------------------------------------- INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX cglobal pred8x8l_down_left_8, 4,5 sub r0, r3 movq mm0, [r0-8] @@ -1496,10 +1487,8 @@ INIT_XMM cpuname %endmacro INIT_MMX sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_DOWN_LEFT INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_DOWN_LEFT ;----------------------------------------------------------------------------- @@ -1507,7 +1496,6 @@ PRED8x8L_DOWN_LEFT ;----------------------------------------------------------------------------- INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX cglobal pred8x8l_down_right_8, 4,5 sub r0, r3 lea r4, [r0+r3*2] @@ -1750,10 +1738,8 @@ INIT_XMM cpuname %endmacro INIT_MMX sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_DOWN_RIGHT INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_DOWN_RIGHT ;----------------------------------------------------------------------------- @@ -1761,7 +1747,6 @@ PRED8x8L_DOWN_RIGHT ;----------------------------------------------------------------------------- INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX cglobal pred8x8l_vertical_right_8, 4,5 sub r0, r3 lea r4, [r0+r3*2] @@ -1980,10 +1965,8 @@ INIT_XMM cpuname %endmacro INIT_MMX sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_VERTICAL_RIGHT INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_VERTICAL_RIGHT ;----------------------------------------------------------------------------- @@ -2071,10 +2054,8 @@ INIT_XMM cpuname %endmacro INIT_MMX sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_VERTICAL_LEFT INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_VERTICAL_LEFT ;----------------------------------------------------------------------------- @@ -2160,10 +2141,8 @@ cglobal pred8x8l_horizontal_up_8, 4,4 %endmacro INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX PRED8x8L_HORIZONTAL_UP INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_HORIZONTAL_UP ;----------------------------------------------------------------------------- @@ -2171,7 +2150,6 @@ PRED8x8L_HORIZONTAL_UP ;----------------------------------------------------------------------------- INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX cglobal pred8x8l_horizontal_down_8, 4,5 sub r0, r3 lea r4, [r0+r3*2] @@ -2411,10 +2389,8 @@ INIT_XMM cpuname %endmacro INIT_MMX sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_HORIZONTAL_DOWN INIT_MMX ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_HORIZONTAL_DOWN ;----------------------------------------------------------------------------- @@ -2637,7 +2613,6 @@ cglobal pred4x4_horizontal_up_8, 3,3 ;----------------------------------------------------------------------------- INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX cglobal pred4x4_horizontal_down_8, 3,3 sub r0, r2 lea r1, [r0+r2*2] @@ -2673,7 +2648,6 @@ cglobal pred4x4_horizontal_down_8, 3,3 ;----------------------------------------------------------------------------- INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX cglobal pred4x4_vertical_right_8, 3,3 sub r0, r2 lea r1, [r0+r2*2] @@ -2704,7 +2678,6 @@ cglobal 
pred4x4_vertical_right_8, 3,3 ;----------------------------------------------------------------------------- INIT_MMX mmxext -%define PALIGNR PALIGNR_MMX cglobal pred4x4_down_right_8, 3,3 sub r0, r2 lea r1, [r0+r2*2] diff --git a/libavcodec/x86/h264_intrapred_10bit.asm b/libavcodec/x86/h264_intrapred_10bit.asm index 50ebaa78c3..039af6d712 100644 --- a/libavcodec/x86/h264_intrapred_10bit.asm +++ b/libavcodec/x86/h264_intrapred_10bit.asm @@ -79,10 +79,8 @@ cglobal pred4x4_down_right_10, 3, 3 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED4x4_DR INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED4x4_DR %if HAVE_AVX_EXTERNAL INIT_XMM avx @@ -120,10 +118,8 @@ cglobal pred4x4_vertical_right_10, 3, 3, 6 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED4x4_VR INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED4x4_VR %if HAVE_AVX_EXTERNAL INIT_XMM avx @@ -164,10 +160,8 @@ cglobal pred4x4_horizontal_down_10, 3, 3 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED4x4_HD INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED4x4_HD %if HAVE_AVX_EXTERNAL INIT_XMM avx @@ -726,10 +720,8 @@ cglobal pred8x8l_horizontal_10, 4, 4, 5 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_HORIZONTAL INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_HORIZONTAL %if HAVE_AVX_EXTERNAL INIT_XMM avx @@ -796,10 +788,8 @@ cglobal pred8x8l_down_left_10, 4, 4, 7 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_DOWN_LEFT INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_DOWN_LEFT %if HAVE_AVX_EXTERNAL INIT_XMM avx @@ -872,10 +862,8 @@ cglobal pred8x8l_down_right_10, 4, 5, 8 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_DOWN_RIGHT INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_DOWN_RIGHT %if HAVE_AVX_EXTERNAL INIT_XMM avx @@ -944,10 +932,8 @@ cglobal pred8x8l_vertical_right_10, 4, 5, 7 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_VERTICAL_RIGHT INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_VERTICAL_RIGHT %if HAVE_AVX_EXTERNAL INIT_XMM avx @@ -1007,10 +993,8 @@ cglobal pred8x8l_horizontal_up_10, 4, 4, 6 %endmacro INIT_XMM sse2 -%define PALIGNR PALIGNR_MMX PRED8x8L_HORIZONTAL_UP INIT_XMM ssse3 -%define PALIGNR PALIGNR_SSSE3 PRED8x8L_HORIZONTAL_UP %if HAVE_AVX_EXTERNAL INIT_XMM avx diff --git a/libavcodec/x86/h264_qpel_10bit.asm b/libavcodec/x86/h264_qpel_10bit.asm index 4aea03209c..c05c7a64d8 100644 --- a/libavcodec/x86/h264_qpel_10bit.asm +++ b/libavcodec/x86/h264_qpel_10bit.asm @@ -225,25 +225,21 @@ MC00 avg ;----------------------------------------------------------------------------- %macro MC_CACHE 1 %define OP_MOV mova -%define PALIGNR PALIGNR_MMX INIT_MMX mmxext %1 put, 4 INIT_XMM sse2, cache64 %1 put, 8 INIT_XMM ssse3, cache64 -%define PALIGNR PALIGNR_SSSE3 %1 put, 8 INIT_XMM sse2 %1 put, 8, 0 %define OP_MOV AVG_MOV -%define PALIGNR PALIGNR_MMX INIT_MMX mmxext %1 avg, 4 INIT_XMM sse2, cache64 %1 avg, 8 INIT_XMM ssse3, cache64 -%define PALIGNR PALIGNR_SSSE3 %1 avg, 8 INIT_XMM sse2 %1 avg, 8, 0 diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm index 567a916bc8..1d125c2b50 100644 --- a/libavresample/x86/audio_convert.asm +++ b/libavresample/x86/audio_convert.asm @@ -919,10 +919,8 @@ cglobal conv_s16_to_s16p_6ch, 2,7,5, dst, src, dst1, dst2, dst3, dst4, dst5 REP_RET %endmacro -%define PALIGNR PALIGNR_MMX INIT_XMM sse2 CONV_S16_TO_S16P_6CH -%define PALIGNR PALIGNR_SSSE3 INIT_XMM ssse3 CONV_S16_TO_S16P_6CH %if HAVE_AVX_EXTERNAL @@ -1038,10 +1036,8 @@ cglobal conv_s16_to_fltp_6ch, 2,7,7, 
dst, src, dst1, dst2, dst3, dst4, dst5 REP_RET %endmacro -%define PALIGNR PALIGNR_MMX INIT_XMM sse2 CONV_S16_TO_FLTP_6CH -%define PALIGNR PALIGNR_SSSE3 INIT_XMM ssse3 CONV_S16_TO_FLTP_6CH INIT_XMM sse4 @@ -1160,10 +1156,8 @@ cglobal conv_flt_to_s16p_6ch, 2,7,7, dst, src, dst1, dst2, dst3, dst4, dst5 REP_RET %endmacro -%define PALIGNR PALIGNR_MMX INIT_XMM sse2 CONV_FLT_TO_S16P_6CH -%define PALIGNR PALIGNR_SSSE3 INIT_XMM ssse3 CONV_FLT_TO_S16P_6CH %if HAVE_AVX_EXTERNAL diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm index b35d5945d8..31163eec7b 100644 --- a/libavutil/x86/x86util.asm +++ b/libavutil/x86/x86util.asm @@ -280,7 +280,14 @@ %endif %endmacro -%macro PALIGNR_MMX 4-5 ; [dst,] src1, src2, imm, tmp +%macro PALIGNR 4-5 +%if cpuflag(ssse3) +%if %0==5 + palignr %1, %2, %3, %4 +%else + palignr %1, %2, %3 +%endif +%elif cpuflag(mmx) ; [dst,] src1, src2, imm, tmp %define %%dst %1 %if %0==5 %ifnidn %1, %2 @@ -299,13 +306,6 @@ psrldq %4, %3 %endif por %%dst, %4 -%endmacro - -%macro PALIGNR_SSSE3 4-5 -%if %0==5 - palignr %1, %2, %3, %4 -%else - palignr %1, %2, %3 %endif %endmacro diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm index cf0dec3843..23508b8d82 100644 --- a/libswscale/x86/output.asm +++ b/libswscale/x86/output.asm @@ -246,7 +246,6 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset %endif ; %1 == 8/9/10/16 %endmacro -%define PALIGNR PALIGNR_MMX %if ARCH_X86_32 INIT_MMX mmx2 yuv2planeX_fn 8, 0, 7 @@ -259,7 +258,6 @@ yuv2planeX_fn 8, 10, 7 yuv2planeX_fn 9, 7, 5 yuv2planeX_fn 10, 7, 5 -%define PALIGNR PALIGNR_SSSE3 INIT_XMM sse4 yuv2planeX_fn 8, 10, 7 yuv2planeX_fn 9, 7, 5 @@ -344,7 +342,7 @@ cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset %if mmsize == 16 punpcklqdq m3, m3 %endif ; mmsize == 16 - PALIGNR_MMX m3, m3, 3, m2 + PALIGNR m3, m3, 3, m2 .no_rot: %if mmsize == 8 mova m2, m3
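
For reference, a hedged usage sketch of the merged PALIGNR macro (register choices illustrative): with four arguments the first operand is both source and destination, with five a separate dst comes first, and under ssse3 the trailing tmp register is simply unused.

INIT_XMM sse2
    PALIGNR m0, m1, 6, m2   ; pre-ssse3: emulated as pslldq/psrldq/por, m2 is clobbered
INIT_XMM ssse3
    PALIGNR m0, m1, 6, m2   ; single "palignr m0, m1, 6", m2 is left untouched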