diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index a2e745cd8e..479e6c3460 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -581,8 +581,6 @@ cglobal deblock_h_luma_8, 0,5,8,0x60+12 RET %endmacro ; DEBLOCK_LUMA -INIT_MMX mmxext -DEBLOCK_LUMA v8, 8 INIT_XMM sse2 DEBLOCK_LUMA v, 16 %if HAVE_AVX_EXTERNAL @@ -864,200 +862,6 @@ DEBLOCK_LUMA_INTRA v INIT_XMM avx DEBLOCK_LUMA_INTRA v %endif -%if ARCH_X86_64 == 0 -INIT_MMX mmxext -DEBLOCK_LUMA_INTRA v8 -%endif - -INIT_MMX mmxext - -%macro CHROMA_V_START 0 - dec r2d ; alpha-1 - dec r3d ; beta-1 - mov t5, r0 - sub t5, r1 - sub t5, r1 -%endmacro - -%macro CHROMA_H_START 0 - dec r2d - dec r3d - sub r0, 2 - lea t6, [r1*3] - mov t5, r0 - add r0, t6 -%endmacro - -%define t5 r5 -%define t6 r6 - -;----------------------------------------------------------------------------- -; void ff_deblock_v_chroma(uint8_t *pix, int stride, int alpha, int beta, -; int8_t *tc0) -;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_8, 5,6 - CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] - call ff_chroma_inter_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 - RET - -;----------------------------------------------------------------------------- -; void ff_deblock_h_chroma(uint8_t *pix, int stride, int alpha, int beta, -; int8_t *tc0) -;----------------------------------------------------------------------------- -cglobal deblock_h_chroma_8, 5,7 -%if ARCH_X86_64 - ; This could use the red zone on 64 bit unix to avoid the stack pointer - ; readjustment, but valgrind assumes the red zone is clobbered on - ; function calls and returns. - sub rsp, 16 - %define buf0 [rsp] - %define buf1 [rsp+8] -%else - %define buf0 r0m - %define buf1 r2m -%endif - CHROMA_H_START - TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) - movq buf0, m0 - movq buf1, m3 - LOAD_MASK r2d, r3d - movd m6, [r4] ; tc0 - punpcklbw m6, m6 - pand m7, m6 - DEBLOCK_P0_Q0 - movq m0, buf0 - movq m3, buf1 - TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) -%if ARCH_X86_64 - add rsp, 16 -%endif - RET - -ALIGN 16 -ff_chroma_inter_body_mmxext: - LOAD_MASK r2d, r3d - movd m6, [r4] ; tc0 - punpcklbw m6, m6 - pand m7, m6 - DEBLOCK_P0_Q0 - ret - -%define t5 r4 -%define t6 r5 - -cglobal deblock_h_chroma422_8, 5, 6 - SUB rsp, (1+ARCH_X86_64*2)*mmsize - %if ARCH_X86_64 - %define buf0 [rsp+16] - %define buf1 [rsp+8] - %else - %define buf0 r0m - %define buf1 r2m - %endif - - movd m6, [r4] - punpcklbw m6, m6 - movq [rsp], m6 - CHROMA_H_START - - TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6) - movq buf0, m0 - movq buf1, m3 - LOAD_MASK r2d, r3d - movd m6, [rsp] - punpcklwd m6, m6 - pand m7, m6 - DEBLOCK_P0_Q0 - movq m0, buf0 - movq m3, buf1 - TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) - - lea r0, [r0+r1*8] - lea t5, [t5+r1*8] - - TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6) - movq buf0, m0 - movq buf1, m3 - LOAD_MASK r2d, r3d - movd m6, [rsp+4] - punpcklwd m6, m6 - pand m7, m6 - DEBLOCK_P0_Q0 - movq m0, buf0 - movq m3, buf1 - TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) - ADD rsp, (1+ARCH_X86_64*2)*mmsize -RET - -; in: %1=p0 %2=p1 %3=q1 -; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 -%macro CHROMA_INTRA_P0 3 - movq m4, %1 - pxor m4, %3 - pand m4, [pb_1] ; m4 = (p0^q1)&1 - pavgb %1, %3 - psubusb %1, m4 - pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) -%endmacro - -;------------------------------------------------------------------------------ -; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta) -;------------------------------------------------------------------------------ -cglobal deblock_v_chroma_intra_8, 4,5 - CHROMA_V_START - movq m0, [t5] - movq m1, [t5+r1] - movq m2, [r0] - movq m3, [r0+r1] - call ff_chroma_intra_body_mmxext - movq [t5+r1], m1 - movq [r0], m2 - RET - -;------------------------------------------------------------------------------ -; void ff_deblock_h_chroma_intra(uint8_t *pix, int stride, int alpha, int beta) -;------------------------------------------------------------------------------ -cglobal deblock_h_chroma_intra_8, 4,6 - CHROMA_H_START - TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) - call ff_chroma_intra_body_mmxext - TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) - RET - -cglobal deblock_h_chroma422_intra_8, 4, 6 - CHROMA_H_START - TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) - call ff_chroma_intra_body_mmxext - TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) - - lea r0, [r0+r1*8] - lea t5, [t5+r1*8] - - TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) - call ff_chroma_intra_body_mmxext - TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) -RET - -ALIGN 16 -ff_chroma_intra_body_mmxext: - LOAD_MASK r2d, r3d - movq m5, m1 - movq m6, m2 - CHROMA_INTRA_P0 m1, m0, m3 - CHROMA_INTRA_P0 m2, m3, m0 - psubb m1, m5 - psubb m2, m6 - pand m1, m7 - pand m2, m7 - paddb m1, m5 - paddb m2, m6 - ret %macro LOAD_8_ROWS 8 movd m0, %1 diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm index 1af3257a67..23971b5cb5 100644 --- a/libavcodec/x86/h264_deblock_10bit.asm +++ b/libavcodec/x86/h264_deblock_10bit.asm @@ -798,9 +798,11 @@ cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16) %endmacro %if ARCH_X86_64 == 0 +%if HAVE_ALIGNED_STACK == 0 INIT_MMX mmxext DEBLOCK_LUMA DEBLOCK_LUMA_INTRA +%endif INIT_XMM sse2 DEBLOCK_LUMA DEBLOCK_LUMA_INTRA @@ -938,10 +940,6 @@ cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16) sub r0, r1 shl r2d, 2 shl r3d, 2 -%if mmsize < 16 - mov r6, 16/mmsize -.loop: -%endif CHROMA_V_LOAD r5 LOAD_AB m4, m5, r2d, r3d LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 @@ -952,16 +950,7 @@ cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16) pand m7, m6 DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 CHROMA_V_STORE -%if mmsize < 16 - add r0, mmsize - add r5, mmsize - add r4, mmsize/4 - dec r6 - jg .loop - REP_RET -%else RET -%endif ;----------------------------------------------------------------------------- ; void ff_deblock_v_chroma_intra_10(uint16_t *pix, int stride, int alpha, @@ -973,24 +962,12 @@ cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16) sub r0, r1 shl r2d, 2 shl r3d, 2 -%if mmsize < 16 - mov r5, 16/mmsize -.loop: -%endif CHROMA_V_LOAD r4 LOAD_AB m4, m5, r2d, r3d LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 CHROMA_V_STORE -%if mmsize < 16 - add r0, mmsize - add r4, mmsize - dec r5 - jg .loop - REP_RET -%else RET -%endif ;----------------------------------------------------------------------------- ; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta, @@ -1002,10 +979,6 @@ cglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_, mov r5, pix_q lea r6, [3*stride_q] add r5, r6 -%if mmsize == 8 - mov r6d, 2 - .loop: -%endif CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize] LOAD_AB m4, m5, alpha_d, beta_d @@ -1018,13 +991,6 @@ cglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_, DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize] -%if mmsize == 8 - lea pix_q, [pix_q + 4*stride_q] - lea r5, [r5 + 4*stride_q] - add tc0_q, 2 - dec r6d - jg .loop -%endif RET ;----------------------------------------------------------------------------- @@ -1068,10 +1034,6 @@ RET %endmacro -%if ARCH_X86_64 == 0 -INIT_MMX mmxext -DEBLOCK_CHROMA -%endif INIT_XMM sse2 DEBLOCK_CHROMA %if HAVE_AVX_EXTERNAL diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index c54f9f1a68..9b5920d3b0 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -87,13 +87,6 @@ SECTION .text STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3 %endmacro -INIT_MMX mmx -; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride) -cglobal h264_idct_add_8, 3, 3, 0 - movsxdifnidn r2, r2d - IDCT4_ADD r0, r1, r2 - RET - %macro IDCT8_1D 2 psraw m0, m1, 1 SWAP 0, 1 @@ -207,23 +200,6 @@ cglobal h264_idct_add_8, 3, 3, 0 STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3 %endmacro -INIT_MMX mmx -; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride) -cglobal h264_idct8_add_8, 3, 4, 0 - movsxdifnidn r2, r2d - %assign pad 128+4-(stack_offset&7) - SUB rsp, pad - - add word [r1], 32 - IDCT8_ADD_MMX_START r1 , rsp - IDCT8_ADD_MMX_START r1+8, rsp+64 - lea r3, [r0+4] - IDCT8_ADD_MMX_END r0 , rsp, r2, r1 - IDCT8_ADD_MMX_END r3 , rsp+8, r2 - - ADD rsp, pad - RET - ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride %macro IDCT8_ADD_SSE 4 IDCT8_1D_FULL %2 @@ -315,16 +291,7 @@ cglobal h264_idct8_add_8, 3, 4, 10 %endmacro INIT_MMX mmxext -; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) %if ARCH_X86_64 -cglobal h264_idct_dc_add_8, 3, 4, 0 - movsxd r2, r2d - movsx r3, word [r1] - mov dword [r1], 0 - DC_ADD_MMXEXT_INIT r3, r2 - DC_ADD_MMXEXT_OP movh, r0, r2, r3 - RET - ; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) cglobal h264_idct8_dc_add_8, 3, 4, 0 movsxd r2, r2d @@ -336,15 +303,6 @@ cglobal h264_idct8_dc_add_8, 3, 4, 0 DC_ADD_MMXEXT_OP mova, r0, r2, r3 RET %else -; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) -cglobal h264_idct_dc_add_8, 2, 3, 0 - movsx r2, word [r1] - mov dword [r1], 0 - mov r1, r2m - DC_ADD_MMXEXT_INIT r2, r1 - DC_ADD_MMXEXT_OP movh, r0, r1, r2 - RET - ; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) cglobal h264_idct8_dc_add_8, 2, 3, 0 movsx r2, word [r1] @@ -357,247 +315,6 @@ cglobal h264_idct8_dc_add_8, 2, 3, 0 RET %endif -INIT_MMX mmx -; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg - movsxdifnidn r3, r3d - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .skipblock - mov r6d, dword [r1+r5*4] - lea r6, [r0+r6] - IDCT4_ADD r6, r2, r3 -.skipblock: - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET - -; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg - movsxdifnidn r3, r3d - %assign pad 128+4-(stack_offset&7) - SUB rsp, pad - - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .skipblock - mov r6d, dword [r1+r5*4] - add r6, r0 - add word [r2], 32 - IDCT8_ADD_MMX_START r2 , rsp - IDCT8_ADD_MMX_START r2+8, rsp+64 - IDCT8_ADD_MMX_END r6 , rsp, r3, r2 - mov r6d, dword [r1+r5*4] - lea r6, [r0+r6+4] - IDCT8_ADD_MMX_END r6 , rsp+8, r3 -.skipblock: - add r5, 4 - add r2, 128 - cmp r5, 16 - jl .nextblock - ADD rsp, pad - RET - -INIT_MMX mmxext -; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - movsxdifnidn r3, r3d - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .skipblock - cmp r6, 1 - jnz .no_dc - movsx r6, word [r2] - test r6, r6 - jz .no_dc - mov word [r2], 0 - DC_ADD_MMXEXT_INIT r6, r3 -%if ARCH_X86_64 == 0 -%define dst2q r1 -%define dst2d r1d -%endif - mov dst2d, dword [r1+r5*4] - lea dst2q, [r0+dst2q] - DC_ADD_MMXEXT_OP movh, dst2q, r3, r6 -%if ARCH_X86_64 == 0 - mov r1, r1m -%endif - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET -.no_dc: - mov r6d, dword [r1+r5*4] - add r6, r0 - IDCT4_ADD r6, r2, r3 -.skipblock: - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET - -INIT_MMX mmx -; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg - movsxdifnidn r3, r3d - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - or r6w, word [r2] - test r6, r6 - jz .skipblock - mov r6d, dword [r1+r5*4] - add r6, r0 - IDCT4_ADD r6, r2, r3 -.skipblock: - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET - -INIT_MMX mmxext -; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - movsxdifnidn r3, r3d - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .try_dc - mov r6d, dword [r1+r5*4] - lea r6, [r0+r6] - IDCT4_ADD r6, r2, r3 - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET -.try_dc: - movsx r6, word [r2] - test r6, r6 - jz .skipblock - mov word [r2], 0 - DC_ADD_MMXEXT_INIT r6, r3 -%if ARCH_X86_64 == 0 -%define dst2q r1 -%define dst2d r1d -%endif - mov dst2d, dword [r1+r5*4] - add dst2q, r0 - DC_ADD_MMXEXT_OP movh, dst2q, r3, r6 -%if ARCH_X86_64 == 0 - mov r1, r1m -%endif -.skipblock: - inc r5 - add r2, 32 - cmp r5, 16 - jl .nextblock - REP_RET - -; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - movsxdifnidn r3, r3d - %assign pad 128+4-(stack_offset&7) - SUB rsp, pad - - xor r5, r5 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .skipblock - cmp r6, 1 - jnz .no_dc - movsx r6, word [r2] - test r6, r6 - jz .no_dc - mov word [r2], 0 - DC_ADD_MMXEXT_INIT r6, r3 -%if ARCH_X86_64 == 0 -%define dst2q r1 -%define dst2d r1d -%endif - mov dst2d, dword [r1+r5*4] - lea dst2q, [r0+dst2q] - DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 - lea dst2q, [dst2q+r3*4] - DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 -%if ARCH_X86_64 == 0 - mov r1, r1m -%endif - add r5, 4 - add r2, 128 - cmp r5, 16 - jl .nextblock - - ADD rsp, pad - RET -.no_dc: - mov r6d, dword [r1+r5*4] - add r6, r0 - add word [r2], 32 - IDCT8_ADD_MMX_START r2 , rsp - IDCT8_ADD_MMX_START r2+8, rsp+64 - IDCT8_ADD_MMX_END r6 , rsp, r3, r2 - mov r6d, dword [r1+r5*4] - lea r6, [r0+r6+4] - IDCT8_ADD_MMX_END r6 , rsp+8, r3 -.skipblock: - add r5, 4 - add r2, 128 - cmp r5, 16 - jl .nextblock - - ADD rsp, pad - RET - INIT_XMM sse2 ; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset, ; int16_t *block, int stride, @@ -678,30 +395,6 @@ h264_idct_add8_mmx_plane: jnz .nextblock rep ret -; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - movsxdifnidn r3, r3d - mov r5, 16 - add r2, 512 -%ifdef PIC - lea picregq, [scan8_mem] -%endif -%if ARCH_X86_64 - mov dst2q, r0 -%endif - call h264_idct_add8_mmx_plane - mov r5, 32 - add r2, 384 -%if ARCH_X86_64 - add dst2q, gprsize -%else - add r0mp, gprsize -%endif - call h264_idct_add8_mmx_plane - RET ; TODO: check rep ret after a function call - cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg ; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg movsxdifnidn r3, r3d @@ -734,74 +427,6 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, str RET ; TODO: check rep ret after a function call -h264_idct_add8_mmxext_plane: - movsxdifnidn r3, r3d -.nextblock: - movzx r6, byte [scan8+r5] - movzx r6, byte [r4+r6] - test r6, r6 - jz .try_dc -%if ARCH_X86_64 - mov r0d, dword [r1+r5*4] - add r0, [dst2q] -%else - mov r0, r1m ; XXX r1m here is actually r0m of the calling func - mov r0, [r0] - add r0, dword [r1+r5*4] -%endif - IDCT4_ADD r0, r2, r3 - inc r5 - add r2, 32 - test r5, 3 - jnz .nextblock - rep ret -.try_dc: - movsx r6, word [r2] - test r6, r6 - jz .skipblock - mov word [r2], 0 - DC_ADD_MMXEXT_INIT r6, r3 -%if ARCH_X86_64 - mov r0d, dword [r1+r5*4] - add r0, [dst2q] -%else - mov r0, r1m ; XXX r1m here is actually r0m of the calling func - mov r0, [r0] - add r0, dword [r1+r5*4] -%endif - DC_ADD_MMXEXT_OP movh, r0, r3, r6 -.skipblock: - inc r5 - add r2, 32 - test r5, 3 - jnz .nextblock - rep ret - -INIT_MMX mmxext -; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset, -; int16_t *block, int stride, -; const uint8_t nnzc[6 * 8]) -cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg - movsxdifnidn r3, r3d - mov r5, 16 - add r2, 512 -%if ARCH_X86_64 - mov dst2q, r0 -%endif -%ifdef PIC - lea picregq, [scan8_mem] -%endif - call h264_idct_add8_mmxext_plane - mov r5, 32 - add r2, 384 -%if ARCH_X86_64 - add dst2q, gprsize -%else - add r0mp, gprsize -%endif - call h264_idct_add8_mmxext_plane - RET ; TODO: check rep ret after a function call - ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered h264_idct_dc_add8_mmxext: movsxdifnidn r3, r3d @@ -1129,18 +754,11 @@ cglobal h264_luma_dc_dequant_idct, 3, 4, %1 inc t1d shr t3d, t0b sub t1d, t0d -%if cpuflag(sse2) movd xmm6, t1d DEQUANT_STORE xmm6 -%else - movd m6, t1d - DEQUANT_STORE m6 -%endif RET %endmacro -INIT_MMX mmx -IDCT_DC_DEQUANT 0 INIT_MMX sse2 IDCT_DC_DEQUANT 7 diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm index 0975d74fcf..6076e64ae0 100644 --- a/libavcodec/x86/h264_weight.asm +++ b/libavcodec/x86/h264_weight.asm @@ -70,19 +70,6 @@ SECTION .text packuswb m0, m1 %endmacro -INIT_MMX mmxext -cglobal h264_weight_16, 6, 6, 0 - WEIGHT_SETUP -.nextrow: - WEIGHT_OP 0, 4 - mova [r0 ], m0 - WEIGHT_OP 8, 12 - mova [r0+8], m0 - add r0, r1 - dec r2d - jnz .nextrow - REP_RET - %macro WEIGHT_FUNC_MM 2 cglobal h264_weight_%1, 6, 6, %2 WEIGHT_SETUP @@ -95,8 +82,6 @@ cglobal h264_weight_%1, 6, 6, %2 REP_RET %endmacro -INIT_MMX mmxext -WEIGHT_FUNC_MM 8, 0 INIT_XMM sse2 WEIGHT_FUNC_MM 16, 8 @@ -198,25 +183,6 @@ WEIGHT_FUNC_HALF_MM 8, 8 packuswb m0, m1 %endmacro -INIT_MMX mmxext -cglobal h264_biweight_16, 7, 8, 0 - BIWEIGHT_SETUP - movifnidn r3d, r3m -.nextrow: - BIWEIGHT_STEPA 0, 1, 0 - BIWEIGHT_STEPA 1, 2, 4 - BIWEIGHT_STEPB - mova [r0], m0 - BIWEIGHT_STEPA 0, 1, 8 - BIWEIGHT_STEPA 1, 2, 12 - BIWEIGHT_STEPB - mova [r0+8], m0 - add r0, r2 - add r1, r2 - dec r3d - jnz .nextrow - REP_RET - %macro BIWEIGHT_FUNC_MM 2 cglobal h264_biweight_%1, 7, 8, %2 BIWEIGHT_SETUP @@ -233,8 +199,6 @@ cglobal h264_biweight_%1, 7, 8, %2 REP_RET %endmacro -INIT_MMX mmxext -BIWEIGHT_FUNC_MM 8, 0 INIT_XMM sse2 BIWEIGHT_FUNC_MM 16, 8 diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c index c9a96c7dca..dc8fc4f720 100644 --- a/libavcodec/x86/h264dsp_init.c +++ b/libavcodec/x86/h264dsp_init.c @@ -31,17 +31,14 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \ int16_t *block, \ int stride); -IDCT_ADD_FUNC(, 8, mmx) IDCT_ADD_FUNC(, 8, sse2) IDCT_ADD_FUNC(, 8, avx) IDCT_ADD_FUNC(, 10, sse2) -IDCT_ADD_FUNC(_dc, 8, mmxext) IDCT_ADD_FUNC(_dc, 8, sse2) IDCT_ADD_FUNC(_dc, 8, avx) IDCT_ADD_FUNC(_dc, 10, mmxext) IDCT_ADD_FUNC(8_dc, 8, mmxext) IDCT_ADD_FUNC(8_dc, 10, sse2) -IDCT_ADD_FUNC(8, 8, mmx) IDCT_ADD_FUNC(8, 8, sse2) IDCT_ADD_FUNC(8, 10, sse2) IDCT_ADD_FUNC(, 10, avx) @@ -54,17 +51,11 @@ void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \ (uint8_t *dst, const int *block_offset, \ int16_t *block, int stride, const uint8_t nnzc[5 * 8]); -IDCT_ADD_REP_FUNC(8, 4, 8, mmx) -IDCT_ADD_REP_FUNC(8, 4, 8, mmxext) IDCT_ADD_REP_FUNC(8, 4, 8, sse2) IDCT_ADD_REP_FUNC(8, 4, 10, sse2) IDCT_ADD_REP_FUNC(8, 4, 10, avx) -IDCT_ADD_REP_FUNC(, 16, 8, mmx) -IDCT_ADD_REP_FUNC(, 16, 8, mmxext) IDCT_ADD_REP_FUNC(, 16, 8, sse2) IDCT_ADD_REP_FUNC(, 16, 10, sse2) -IDCT_ADD_REP_FUNC(, 16intra, 8, mmx) -IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext) IDCT_ADD_REP_FUNC(, 16intra, 8, sse2) IDCT_ADD_REP_FUNC(, 16intra, 10, sse2) IDCT_ADD_REP_FUNC(, 16, 10, avx) @@ -76,8 +67,6 @@ void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \ (uint8_t **dst, const int *block_offset, \ int16_t *block, int stride, const uint8_t nnzc[15 * 8]); -IDCT_ADD_REP_FUNC2(, 8, 8, mmx) -IDCT_ADD_REP_FUNC2(, 8, 8, mmxext) IDCT_ADD_REP_FUNC2(, 8, 8, sse2) IDCT_ADD_REP_FUNC2(, 8, 10, sse2) IDCT_ADD_REP_FUNC2(, 8, 10, avx) @@ -87,7 +76,6 @@ IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx) IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2) IDCT_ADD_REP_FUNC2(, 8_422, 10, avx) -void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul); void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul); /***********************************/ @@ -112,14 +100,6 @@ void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \ int beta); #define LF_FUNCS(type, depth) \ -LF_FUNC(h, chroma, depth, mmxext) \ -LF_IFUNC(h, chroma_intra, depth, mmxext) \ -LF_FUNC(h, chroma422, depth, mmxext) \ -LF_IFUNC(h, chroma422_intra, depth, mmxext) \ -LF_FUNC(v, chroma, depth, mmxext) \ -LF_IFUNC(v, chroma_intra, depth, mmxext) \ -LF_FUNC(h, luma, depth, mmxext) \ -LF_IFUNC(h, luma_intra, depth, mmxext) \ LF_FUNC(h, luma, depth, sse2) \ LF_IFUNC(h, luma_intra, depth, sse2) \ LF_FUNC(v, luma, depth, sse2) \ @@ -147,27 +127,10 @@ LF_FUNC(h, luma_mbaff, 8, avx) LF_FUNCS(uint8_t, 8) LF_FUNCS(uint16_t, 10) -#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL -LF_FUNC(v8, luma, 8, mmxext) -static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0) -{ - if ((tc0[0] & tc0[1]) >= 0) - ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0); - if ((tc0[2] & tc0[3]) >= 0) - ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2); -} -LF_IFUNC(v8, luma_intra, 8, mmxext) -static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, - int alpha, int beta) -{ - ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta); - ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta); -} -#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */ - LF_FUNC(v, luma, 10, mmxext) +LF_FUNC(h, luma, 10, mmxext) LF_IFUNC(v, luma_intra, 10, mmxext) +LF_IFUNC(h, luma_intra, 10, mmxext) /***********************************/ /* weighted prediction */ @@ -187,14 +150,13 @@ void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \ H264_WEIGHT(W, mmxext) \ H264_BIWEIGHT(W, mmxext) -#define H264_BIWEIGHT_MMX_SSE(W) \ - H264_BIWEIGHT_MMX(W) \ +#define H264_BIWEIGHT_SSE(W) \ H264_WEIGHT(W, sse2) \ H264_BIWEIGHT(W, sse2) \ H264_BIWEIGHT(W, ssse3) -H264_BIWEIGHT_MMX_SSE(16) -H264_BIWEIGHT_MMX_SSE(8) +H264_BIWEIGHT_SSE(16) +H264_BIWEIGHT_SSE(8) H264_BIWEIGHT_MMX(4) #define H264_WEIGHT_10(W, DEPTH, OPT) \ @@ -236,52 +198,16 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, if (bit_depth == 8) { if (EXTERNAL_MMX(cpu_flags)) { - c->h264_idct_dc_add = - c->h264_idct_add = ff_h264_idct_add_8_mmx; - c->h264_idct8_dc_add = - c->h264_idct8_add = ff_h264_idct8_add_8_mmx; - - c->h264_idct_add16 = ff_h264_idct_add16_8_mmx; - c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx; if (chroma_format_idc <= 1) { - c->h264_idct_add8 = ff_h264_idct_add8_8_mmx; } else { c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx; } - c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx; - if (cpu_flags & AV_CPU_FLAG_CMOV) - c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx; } if (EXTERNAL_MMXEXT(cpu_flags)) { - c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext; c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext; - c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext; - c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext; - if (chroma_format_idc <= 1) - c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext; - c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext; - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext; - if (chroma_format_idc <= 1) { - c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext; - c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext; - } else { - c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext; - c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext; - } -#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL - c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext; - c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; -#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */ - c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext; - c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext; c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext; - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext; c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext; } if (EXTERNAL_SSE2(cpu_flags)) { @@ -350,19 +276,12 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, } } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(cpu_flags)) { -#if ARCH_X86_32 - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext; - if (chroma_format_idc <= 1) { - c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext; - } else { - c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext; - } +#if ARCH_X86_32 && !HAVE_ALIGNED_STACK c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext; c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext; c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext; c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext; -#endif /* ARCH_X86_32 */ +#endif /* ARCH_X86_32 && !HAVE_ALIGNED_STACK */ c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext; } if (EXTERNAL_SSE2(cpu_flags)) {