From 4618f36a2424a3a4d5760afabc2e9dd18d73f0a4 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt Date: Sat, 11 Jun 2022 16:24:23 +0200 Subject: [PATCH] avcodec/x86/h264dsp_init: Remove obsolete MMX(EXT) functions x64 always has MMX, MMXEXT, SSE and SSE2 and this means that some functions for MMX, MMXEXT and 3dnow are always overridden by other functions (unless one e.g. explicitly disables SSE2) for x64. So given that the only systems that benefit from these functions are truly ancient 32bit x86s they are removed. Signed-off-by: Andreas Rheinhardt --- libavcodec/x86/h264_deblock.asm | 196 ------------- libavcodec/x86/h264_deblock_10bit.asm | 42 +-- libavcodec/x86/h264_idct.asm | 382 -------------------------- libavcodec/x86/h264_weight.asm | 36 --- libavcodec/x86/h264dsp_init.c | 95 +------ 5 files changed, 9 insertions(+), 742 deletions(-) diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index a2e745cd8e..479e6c3460 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -581,8 +581,6 @@ cglobal deblock_h_luma_8, 0,5,8,0x60+12 RET %endmacro ; DEBLOCK_LUMA -INIT_MMX mmxext -DEBLOCK_LUMA v8, 8 INIT_XMM sse2 DEBLOCK_LUMA v, 16 %if HAVE_AVX_EXTERNAL @@ -864,200 +862,6 @@ DEBLOCK_LUMA_INTRA v INIT_XMM avx DEBLOCK_LUMA_INTRA v %endif -%if ARCH_X86_64 == 0 -INIT_MMX mmxext -DEBLOCK_LUMA_INTRA v8 -%endif - -INIT_MMX mmxext - -%macro CHROMA_V_START 0 - dec r2d ; alpha-1 - dec r3d ; beta-1 - mov t5, r0 - sub t5, r1 - sub t5, r1 -%endmacro - -%macro CHROMA_H_START 0 - dec r2d - dec r3d - sub r0, 2 - lea t6, [r1*3] - mov t5, r0 - add r0, t6 -%endmacro - -%define t5 r5 -%define t6 r6 - -;----------------------------------------------------------------------------- -; void ff_deblock_v_chroma(uint8_t *pix, int stride, int alpha, int beta, -; int8_t *tc0) -;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_8, 5,6 - CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - 
movq m2, [r0] - movq m3, [r0+r1] - call ff_chroma_inter_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 - RET - -;----------------------------------------------------------------------------- -; void ff_deblock_h_chroma(uint8_t *pix, int stride, int alpha, int beta, -; int8_t *tc0) -;----------------------------------------------------------------------------- -cglobal deblock_h_chroma_8, 5,7 -%if ARCH_X86_64 - ; This could use the red zone on 64 bit unix to avoid the stack pointer - ; readjustment, but valgrind assumes the red zone is clobbered on - ; function calls and returns. - sub rsp, 16 - %define buf0 [rsp] - %define buf1 [rsp+8] -%else - %define buf0 r0m - %define buf1 r2m -%endif - CHROMA_H_START - TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) - movq buf0, m0 - movq buf1, m3 - LOAD_MASK r2d, r3d - movd m6, [r4] ; tc0 - punpcklbw m6, m6 - pand m7, m6 - DEBLOCK_P0_Q0 - movq m0, buf0 - movq m3, buf1 - TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) -%if ARCH_X86_64 - add rsp, 16 -%endif - RET - -ALIGN 16 -ff_chroma_inter_body_mmxext: - LOAD_MASK r2d, r3d - movd m6, [r4] ; tc0 - punpcklbw m6, m6 - pand m7, m6 - DEBLOCK_P0_Q0 - ret - -%define t5 r4 -%define t6 r5 - -cglobal deblock_h_chroma422_8, 5, 6 - SUB rsp, (1+ARCH_X86_64*2)*mmsize - %if ARCH_X86_64 - %define buf0 [rsp+16] - %define buf1 [rsp+8] - %else - %define buf0 r0m - %define buf1 r2m - %endif - - movd m6, [r4] - punpcklbw m6, m6 - movq [rsp], m6 - CHROMA_H_START - - TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6) - movq buf0, m0 - movq buf1, m3 - LOAD_MASK r2d, r3d - movd m6, [rsp] - punpcklwd m6, m6 - pand m7, m6 - DEBLOCK_P0_Q0 - movq m0, buf0 - movq m3, buf1 - TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) - - lea r0, [r0+r1*8] - lea t5, [t5+r1*8] - - TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6) - movq buf0, m0 - movq buf1, m3 - LOAD_MASK r2d, r3d - movd m6, [rsp+4] - punpcklwd m6, m6 - pand m7, m6 - DEBLOCK_P0_Q0 - movq m0, buf0 - movq m3, buf1 - TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) 
- ADD rsp, (1+ARCH_X86_64*2)*mmsize -RET - -; in: %1=p0 %2=p1 %3=q1 -; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 -%macro CHROMA_INTRA_P0 3 - movq m4, %1 - pxor m4, %3 - pand m4, [pb_1] ; m4 = (p0^q1)&1 - pavgb %1, %3 - psubusb %1, m4 - pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) -%endmacro - -;------------------------------------------------------------------------------ -; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta) -;------------------------------------------------------------------------------ -cglobal deblock_v_chroma_intra_8, 4,5 - CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] - call ff_chroma_intra_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 - RET - -;------------------------------------------------------------------------------ -; void ff_deblock_h_chroma_intra(uint8_t *pix, int stride, int alpha, int beta) -;------------------------------------------------------------------------------ -cglobal deblock_h_chroma_intra_8, 4,6 - CHROMA_H_START - TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) - call ff_chroma_intra_body_mmxext - TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) - RET - -cglobal deblock_h_chroma422_intra_8, 4, 6 - CHROMA_H_START - TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) - call ff_chroma_intra_body_mmxext - TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) - - lea r0, [r0+r1*8] - lea t5, [t5+r1*8] - - TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) - call ff_chroma_intra_body_mmxext - TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) -RET - -ALIGN 16 -ff_chroma_intra_body_mmxext: - LOAD_MASK r2d, r3d - movq m5, m1 - movq m6, m2 - CHROMA_INTRA_P0 m1, m0, m3 - CHROMA_INTRA_P0 m2, m3, m0 - psubb m1, m5 - psubb m2, m6 - pand m1, m7 - pand m2, m7 - paddb m1, m5 - paddb m2, m6 - ret %macro LOAD_8_ROWS 8 movd m0, %1 diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm index 1af3257a67..23971b5cb5 100644 --- 
a/libavcodec/x86/h264_deblock_10bit.asm +++ b/libavcodec/x86/h264_deblock_10bit.asm @@ -798,9 +798,11 @@ cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16) %endmacro %if ARCH_X86_64 == 0 +%if HAVE_ALIGNED_STACK == 0 INIT_MMX mmxext DEBLOCK_LUMA DEBLOCK_LUMA_INTRA +%endif INIT_XMM sse2 DEBLOCK_LUMA DEBLOCK_LUMA_INTRA @@ -938,10 +940,6 @@ cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16) sub r0, r1 shl r2d, 2 shl r3d, 2 -%if mmsize < 16 - mov r6, 16/mmsize -.loop: -%endif CHROMA_V_LOAD r5 LOAD_AB m4, m5, r2d, r3d LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 @@ -952,16 +950,7 @@ cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16) pand m7, m6 DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 CHROMA_V_STORE -%if mmsize < 16 - add r0, mmsize - add r5, mmsize - add r4, mmsize/4 - dec r6 - jg .loop - REP_RET -%else RET -%endif ;----------------------------------------------------------------------------- ; void ff_deblock_v_chroma_intra_10(uint16_t *pix, int stride, int alpha, @@ -973,24 +962,12 @@ cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16) sub r0, r1 shl r2d, 2 shl r3d, 2 -%if mmsize < 16 - mov r5, 16/mmsize -.loop: -%endif CHROMA_V_LOAD r4 LOAD_AB m4, m5, r2d, r3d LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 CHROMA_V_STORE -%if mmsize < 16 - add r0, mmsize - add r4, mmsize - dec r5 - jg .loop - REP_RET -%else RET -%endif ;----------------------------------------------------------------------------- ; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta, @@ -1002,10 +979,6 @@ cglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_, mov r5, pix_q lea r6, [3*stride_q] add r5, r6 -%if mmsize == 8 - mov r6d, 2 - .loop: -%endif CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize] LOAD_AB m4, m5, alpha_d, beta_d @@ -1018,13 +991,6 @@ cglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_, DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 
CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize] -%if mmsize == 8 - lea pix_q, [pix_q + 4*stride_q] - lea r5, [r5 + 4*stride_q] - add tc0_q, 2 - dec r6d - jg .loop -%endif RET ;----------------------------------------------------------------------------- @@ -1068,10 +1034,6 @@ RET %endmacro -%if ARCH_X86_64 == 0 -INIT_MMX mmxext -DEBLOCK_CHROMA -%endif INIT_XMM sse2 DEBLOCK_CHROMA %if HAVE_AVX_EXTERNAL diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index c54f9f1a68..9b5920d3b0 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -87,13 +87,6 @@ SECTION .text STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3 %endmacro -INIT_MMX mmx -; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride) -cglobal h264_idct_add_8, 3, 3, 0 - movsxdifnidn r2, r2d - IDCT4_ADD r0, r1, r2 - RET - %macro IDCT8_1D 2 psraw m0, m1, 1 SWAP 0, 1 @@ -207,23 +200,6 @@ cglobal h264_idct_add_8, 3, 3, 0 STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3 %endmacro -INIT_MMX mmx -; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride) -cglobal h264_idct8_add_8, 3, 4, 0 - movsxdifnidn r2, r2d - %assign pad 128+4-(stack_offset&7) - SUB rsp, pad - - add word [r1], 32 - IDCT8_ADD_MMX_START r1 , rsp - IDCT8_ADD_MMX_START r1+8, rsp+64 - lea r3, [r0+4] - IDCT8_ADD_MMX_END r0 , rsp, r2, r1 - IDCT8_ADD_MMX_END r3 , rsp+8, r2 - - ADD rsp, pad - RET - ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride %macro IDCT8_ADD_SSE 4 IDCT8_1D_FULL %2 @@ -315,16 +291,7 @@ cglobal h264_idct8_add_8, 3, 4, 10 %endmacro INIT_MMX mmxext -; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) %if ARCH_X86_64 -cglobal h264_idct_dc_add_8, 3, 4, 0 - movsxd r2, r2d - movsx r3, word [r1] - mov dword [r1], 0 - DC_ADD_MMXEXT_INIT r3, r2 - DC_ADD_MMXEXT_OP movh, r0, r2, r3 - RET - ; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) cglobal h264_idct8_dc_add_8, 3, 4, 0 movsxd r2, r2d @@ -336,15 +303,6 @@ cglobal 
h264_idct8_dc_add_8, 3, 4, 0 DC_ADD_MMXEXT_OP mova, r0, r2, r3 RET %else -; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) -cglobal h264_idct_dc_add_8, 2, 3, 0 - movsx r2, word [r1] - mov dword [r1], 0 - mov r1, r2m - DC_ADD_MMXEXT_INIT r2, r1 - DC_ADD_MMXEXT_OP movh, r0, r1, r2 - RET - ; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) cglobal h264_idct8_dc_add_8, 2, 3, 0 movsx r2, word [r1] @@ -357,247 +315,6 @@ cglobal h264_idct8_dc_add_8, 2, 3, 0 RET %endif -INIT_MMX mmx -; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg - movsxdifnidn r3, r3d - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .skipblock - mov r6d, dword [r1+r5*4] - lea r6, [r0+r6] - IDCT4_ADD r6, r2, r3 -.skipblock: - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET - -; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg - movsxdifnidn r3, r3d - %assign pad 128+4-(stack_offset&7) - SUB rsp, pad - - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .skipblock - mov r6d, dword [r1+r5*4] - add r6, r0 - add word [r2], 32 - IDCT8_ADD_MMX_START r2 , rsp - IDCT8_ADD_MMX_START r2+8, rsp+64 - IDCT8_ADD_MMX_END r6 , rsp, r3, r2 - mov r6d, dword [r1+r5*4] - lea r6, [r0+r6+4] - IDCT8_ADD_MMX_END r6 , rsp+8, r3 -.skipblock: - add r5, 4 - add r2, 128 - cmp r5, 16 - jl .nextblock - ADD rsp, pad - RET - -INIT_MMX mmxext -; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int 
*block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - movsxdifnidn r3, r3d - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .skipblock - cmp r6, 1 - jnz .no_dc - movsx r6, word [r2] - test r6, r6 - jz .no_dc - mov word [r2], 0 - DC_ADD_MMXEXT_INIT r6, r3 -%if ARCH_X86_64 == 0 -%define dst2q r1 -%define dst2d r1d -%endif - mov dst2d, dword [r1+r5*4] - lea dst2q, [r0+dst2q] - DC_ADD_MMXEXT_OP movh, dst2q, r3, r6 -%if ARCH_X86_64 == 0 - mov r1, r1m -%endif - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET -.no_dc: - mov r6d, dword [r1+r5*4] - add r6, r0 - IDCT4_ADD r6, r2, r3 -.skipblock: - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET - -INIT_MMX mmx -; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg - movsxdifnidn r3, r3d - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - or r6w, word [r2] - test r6, r6 - jz .skipblock - mov r6d, dword [r1+r5*4] - add r6, r0 - IDCT4_ADD r6, r2, r3 -.skipblock: - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET - -INIT_MMX mmxext -; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - movsxdifnidn r3, r3d - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .try_dc - mov r6d, dword [r1+r5*4] - lea r6, 
[r0+r6] - IDCT4_ADD r6, r2, r3 - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET -.try_dc: - movsx r6, word [r2] - test r6, r6 - jz .skipblock - mov word [r2], 0 - DC_ADD_MMXEXT_INIT r6, r3 -%if ARCH_X86_64 == 0 -%define dst2q r1 -%define dst2d r1d -%endif - mov dst2d, dword [r1+r5*4] - add dst2q, r0 - DC_ADD_MMXEXT_OP movh, dst2q, r3, r6 -%if ARCH_X86_64 == 0 - mov r1, r1m -%endif -.skipblock: - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET - -; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - movsxdifnidn r3, r3d - %assign pad 128+4-(stack_offset&7) - SUB rsp, pad - - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .skipblock - cmp r6, 1 - jnz .no_dc - movsx r6, word [r2] - test r6, r6 - jz .no_dc - mov word [r2], 0 - DC_ADD_MMXEXT_INIT r6, r3 -%if ARCH_X86_64 == 0 -%define dst2q r1 -%define dst2d r1d -%endif - mov dst2d, dword [r1+r5*4] - lea dst2q, [r0+dst2q] - DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 - lea dst2q, [dst2q+r3*4] - DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 -%if ARCH_X86_64 == 0 - mov r1, r1m -%endif - add r5, 4 - add r2, 128 - cmp r5, 16 - jl .nextblock - - ADD rsp, pad - RET -.no_dc: - mov r6d, dword [r1+r5*4] - add r6, r0 - add word [r2], 32 - IDCT8_ADD_MMX_START r2 , rsp - IDCT8_ADD_MMX_START r2+8, rsp+64 - IDCT8_ADD_MMX_END r6 , rsp, r3, r2 - mov r6d, dword [r1+r5*4] - lea r6, [r0+r6+4] - IDCT8_ADD_MMX_END r6 , rsp+8, r3 -.skipblock: - add r5, 4 - add r2, 128 - cmp r5, 16 - jl .nextblock - - ADD rsp, pad - RET - INIT_XMM sse2 ; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset, ; int16_t *block, int stride, @@ -678,30 +395,6 @@ h264_idct_add8_mmx_plane: jnz .nextblock rep ret -; void ff_h264_idct_add8_8_mmx(uint8_t 
**dest, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - movsxdifnidn r3, r3d - mov r5, 16 - add r2, 512 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -%if ARCH_X86_64 - mov dst2q, r0 -%endif - call h264_idct_add8_mmx_plane - mov r5, 32 - add r2, 384 -%if ARCH_X86_64 - add dst2q, gprsize -%else - add r0mp, gprsize -%endif - call h264_idct_add8_mmx_plane - RET ; TODO: check rep ret after a function call - cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg ; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg movsxdifnidn r3, r3d @@ -734,74 +427,6 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, str RET ; TODO: check rep ret after a function call -h264_idct_add8_mmxext_plane: - movsxdifnidn r3, r3d -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .try_dc -%if ARCH_X86_64 - mov r0d, dword [r1+r5*4] - add r0, [dst2q] -%else - mov r0, r1m ; XXX r1m here is actually r0m of the calling func - mov r0, [r0] - add r0, dword [r1+r5*4] -%endif - IDCT4_ADD r0, r2, r3 - inc r5 - add r2, 32 - test r5, 3 - jnz .nextblock - rep ret -.try_dc: - movsx r6, word [r2] - test r6, r6 - jz .skipblock - mov word [r2], 0 - DC_ADD_MMXEXT_INIT r6, r3 -%if ARCH_X86_64 - mov r0d, dword [r1+r5*4] - add r0, [dst2q] -%else - mov r0, r1m ; XXX r1m here is actually r0m of the calling func - mov r0, [r0] - add r0, dword [r1+r5*4] -%endif - DC_ADD_MMXEXT_OP movh, r0, r3, r6 -.skipblock: - inc r5 - add r2, 32 - test r5, 3 - jnz .nextblock - rep ret - -INIT_MMX mmxext -; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, 
picreg - movsxdifnidn r3, r3d - mov r5, 16 - add r2, 512 -%if ARCH_X86_64 - mov dst2q, r0 -%endif -%ifdef PIC - lea picregq, [scan8_mem] -%endif - call h264_idct_add8_mmxext_plane - mov r5, 32 - add r2, 384 -%if ARCH_X86_64 - add dst2q, gprsize -%else - add r0mp, gprsize -%endif - call h264_idct_add8_mmxext_plane - RET ; TODO: check rep ret after a function call - ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered h264_idct_dc_add8_mmxext: movsxdifnidn r3, r3d @@ -1129,18 +754,11 @@ cglobal h264_luma_dc_dequant_idct, 3, 4, %1 inc t1d shr t3d, t0b sub t1d, t0d -%if cpuflag(sse2) movd xmm6, t1d DEQUANT_STORE xmm6 -%else - movd m6, t1d - DEQUANT_STORE m6 -%endif RET %endmacro -INIT_MMX mmx -IDCT_DC_DEQUANT 0 INIT_MMX sse2 IDCT_DC_DEQUANT 7 diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm index 0975d74fcf..6076e64ae0 100644 --- a/libavcodec/x86/h264_weight.asm +++ b/libavcodec/x86/h264_weight.asm @@ -70,19 +70,6 @@ SECTION .text packuswb m0, m1 %endmacro -INIT_MMX mmxext -cglobal h264_weight_16, 6, 6, 0 - WEIGHT_SETUP -.nextrow: - WEIGHT_OP 0, 4 - mova [r0 ], m0 - WEIGHT_OP 8, 12 - mova [r0+8], m0 - add r0, r1 - dec r2d - jnz .nextrow - REP_RET - %macro WEIGHT_FUNC_MM 2 cglobal h264_weight_%1, 6, 6, %2 WEIGHT_SETUP @@ -95,8 +82,6 @@ cglobal h264_weight_%1, 6, 6, %2 REP_RET %endmacro -INIT_MMX mmxext -WEIGHT_FUNC_MM 8, 0 INIT_XMM sse2 WEIGHT_FUNC_MM 16, 8 @@ -198,25 +183,6 @@ WEIGHT_FUNC_HALF_MM 8, 8 packuswb m0, m1 %endmacro -INIT_MMX mmxext -cglobal h264_biweight_16, 7, 8, 0 - BIWEIGHT_SETUP - movifnidn r3d, r3m -.nextrow: - BIWEIGHT_STEPA 0, 1, 0 - BIWEIGHT_STEPA 1, 2, 4 - BIWEIGHT_STEPB - mova [r0], m0 - BIWEIGHT_STEPA 0, 1, 8 - BIWEIGHT_STEPA 1, 2, 12 - BIWEIGHT_STEPB - mova [r0+8], m0 - add r0, r2 - add r1, r2 - dec r3d - jnz .nextrow - REP_RET - %macro BIWEIGHT_FUNC_MM 2 cglobal h264_biweight_%1, 7, 8, %2 BIWEIGHT_SETUP @@ -233,8 +199,6 @@ cglobal h264_biweight_%1, 7, 8, %2 REP_RET %endmacro -INIT_MMX mmxext 
-BIWEIGHT_FUNC_MM 8, 0 INIT_XMM sse2 BIWEIGHT_FUNC_MM 16, 8 diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c index c9a96c7dca..dc8fc4f720 100644 --- a/libavcodec/x86/h264dsp_init.c +++ b/libavcodec/x86/h264dsp_init.c @@ -31,17 +31,14 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \ int16_t *block, \ int stride); -IDCT_ADD_FUNC(, 8, mmx) IDCT_ADD_FUNC(, 8, sse2) IDCT_ADD_FUNC(, 8, avx) IDCT_ADD_FUNC(, 10, sse2) -IDCT_ADD_FUNC(_dc, 8, mmxext) IDCT_ADD_FUNC(_dc, 8, sse2) IDCT_ADD_FUNC(_dc, 8, avx) IDCT_ADD_FUNC(_dc, 10, mmxext) IDCT_ADD_FUNC(8_dc, 8, mmxext) IDCT_ADD_FUNC(8_dc, 10, sse2) -IDCT_ADD_FUNC(8, 8, mmx) IDCT_ADD_FUNC(8, 8, sse2) IDCT_ADD_FUNC(8, 10, sse2) IDCT_ADD_FUNC(, 10, avx) @@ -54,17 +51,11 @@ void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \ (uint8_t *dst, const int *block_offset, \ int16_t *block, int stride, const uint8_t nnzc[5 * 8]); -IDCT_ADD_REP_FUNC(8, 4, 8, mmx) -IDCT_ADD_REP_FUNC(8, 4, 8, mmxext) IDCT_ADD_REP_FUNC(8, 4, 8, sse2) IDCT_ADD_REP_FUNC(8, 4, 10, sse2) IDCT_ADD_REP_FUNC(8, 4, 10, avx) -IDCT_ADD_REP_FUNC(, 16, 8, mmx) -IDCT_ADD_REP_FUNC(, 16, 8, mmxext) IDCT_ADD_REP_FUNC(, 16, 8, sse2) IDCT_ADD_REP_FUNC(, 16, 10, sse2) -IDCT_ADD_REP_FUNC(, 16intra, 8, mmx) -IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext) IDCT_ADD_REP_FUNC(, 16intra, 8, sse2) IDCT_ADD_REP_FUNC(, 16intra, 10, sse2) IDCT_ADD_REP_FUNC(, 16, 10, avx) @@ -76,8 +67,6 @@ void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \ (uint8_t **dst, const int *block_offset, \ int16_t *block, int stride, const uint8_t nnzc[15 * 8]); -IDCT_ADD_REP_FUNC2(, 8, 8, mmx) -IDCT_ADD_REP_FUNC2(, 8, 8, mmxext) IDCT_ADD_REP_FUNC2(, 8, 8, sse2) IDCT_ADD_REP_FUNC2(, 8, 10, sse2) IDCT_ADD_REP_FUNC2(, 8, 10, avx) @@ -87,7 +76,6 @@ IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx) IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2) IDCT_ADD_REP_FUNC2(, 8_422, 10, avx) -void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul); void 
ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul); /***********************************/ @@ -112,14 +100,6 @@ void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \ int beta); #define LF_FUNCS(type, depth) \ -LF_FUNC(h, chroma, depth, mmxext) \ -LF_IFUNC(h, chroma_intra, depth, mmxext) \ -LF_FUNC(h, chroma422, depth, mmxext) \ -LF_IFUNC(h, chroma422_intra, depth, mmxext) \ -LF_FUNC(v, chroma, depth, mmxext) \ -LF_IFUNC(v, chroma_intra, depth, mmxext) \ -LF_FUNC(h, luma, depth, mmxext) \ -LF_IFUNC(h, luma_intra, depth, mmxext) \ LF_FUNC(h, luma, depth, sse2) \ LF_IFUNC(h, luma_intra, depth, sse2) \ LF_FUNC(v, luma, depth, sse2) \ @@ -147,27 +127,10 @@ LF_FUNC(h, luma_mbaff, 8, avx) LF_FUNCS(uint8_t, 8) LF_FUNCS(uint16_t, 10) -#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL -LF_FUNC(v8, luma, 8, mmxext) -static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0) -{ - if ((tc0[0] & tc0[1]) >= 0) - ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0); - if ((tc0[2] & tc0[3]) >= 0) - ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2); -} -LF_IFUNC(v8, luma_intra, 8, mmxext) -static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, - int alpha, int beta) -{ - ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta); - ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta); -} -#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */ - LF_FUNC(v, luma, 10, mmxext) +LF_FUNC(h, luma, 10, mmxext) LF_IFUNC(v, luma_intra, 10, mmxext) +LF_IFUNC(h, luma_intra, 10, mmxext) /***********************************/ /* weighted prediction */ @@ -187,14 +150,13 @@ void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \ H264_WEIGHT(W, mmxext) \ H264_BIWEIGHT(W, mmxext) -#define H264_BIWEIGHT_MMX_SSE(W) \ - H264_BIWEIGHT_MMX(W) \ +#define H264_BIWEIGHT_SSE(W) \ H264_WEIGHT(W, sse2) \ H264_BIWEIGHT(W, sse2) \ H264_BIWEIGHT(W, ssse3) 
-H264_BIWEIGHT_MMX_SSE(16) -H264_BIWEIGHT_MMX_SSE(8) +H264_BIWEIGHT_SSE(16) +H264_BIWEIGHT_SSE(8) H264_BIWEIGHT_MMX(4) #define H264_WEIGHT_10(W, DEPTH, OPT) \ @@ -236,52 +198,16 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, if (bit_depth == 8) { if (EXTERNAL_MMX(cpu_flags)) { - c->h264_idct_dc_add = - c->h264_idct_add = ff_h264_idct_add_8_mmx; - c->h264_idct8_dc_add = - c->h264_idct8_add = ff_h264_idct8_add_8_mmx; - - c->h264_idct_add16 = ff_h264_idct_add16_8_mmx; - c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx; if (chroma_format_idc <= 1) { - c->h264_idct_add8 = ff_h264_idct_add8_8_mmx; } else { c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx; } - c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx; - if (cpu_flags & AV_CPU_FLAG_CMOV) - c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx; } if (EXTERNAL_MMXEXT(cpu_flags)) { - c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext; c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext; - c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext; - c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext; - if (chroma_format_idc <= 1) - c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext; - c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext; - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext; - if (chroma_format_idc <= 1) { - c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext; - c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext; - } else { - c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext; - c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext; - } -#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL - c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext; - c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; 
-#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */ - c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext; - c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext; c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext; - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext; c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext; } if (EXTERNAL_SSE2(cpu_flags)) { @@ -350,19 +276,12 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, } } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(cpu_flags)) { -#if ARCH_X86_32 - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext; - if (chroma_format_idc <= 1) { - c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext; - } else { - c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext; - } +#if ARCH_X86_32 && !HAVE_ALIGNED_STACK c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext; c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext; c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext; c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext; -#endif /* ARCH_X86_32 */ +#endif /* ARCH_X86_32 && !HAVE_ALIGNED_STACK */ c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext; } if (EXTERNAL_SSE2(cpu_flags)) {