1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-23 12:43:46 +02:00

avcodec/x86/h264dsp_init: Remove obsolete MMX(EXT) functions

x64 always has MMX, MMXEXT, SSE and SSE2 and this means
that some functions for MMX, MMXEXT and 3dnow are always
overridden by other functions (unless one e.g. explicitly
disables SSE2) for x64. So given that the only systems that
benefit from these functions are truely ancient 32bit x86s
they are removed.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt 2022-06-11 16:24:23 +02:00
parent 55d8618a47
commit 4618f36a24
5 changed files with 9 additions and 742 deletions

View File

@ -581,8 +581,6 @@ cglobal deblock_h_luma_8, 0,5,8,0x60+12
RET
%endmacro ; DEBLOCK_LUMA
INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
%if HAVE_AVX_EXTERNAL
@ -864,200 +862,6 @@ DEBLOCK_LUMA_INTRA v
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%endif
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
%endif
INIT_MMX mmxext
%macro CHROMA_V_START 0
dec r2d ; alpha-1
dec r3d ; beta-1
mov t5, r0
sub t5, r1
sub t5, r1
%endmacro
%macro CHROMA_H_START 0
dec r2d
dec r3d
sub r0, 2
lea t6, [r1*3]
mov t5, r0
add r0, t6
%endmacro
%define t5 r5
%define t6 r6
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma(uint8_t *pix, int stride, int alpha, int beta,
; int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_8, 5,6
CHROMA_V_START
movq m0, [t5]
movq m1, [t5+r1]
movq m2, [r0]
movq m3, [r0+r1]
call ff_chroma_inter_body_mmxext
movq [t5+r1], m1
movq [r0], m2
RET
;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma(uint8_t *pix, int stride, int alpha, int beta,
; int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_8, 5,7
%if ARCH_X86_64
; This could use the red zone on 64 bit unix to avoid the stack pointer
; readjustment, but valgrind assumes the red zone is clobbered on
; function calls and returns.
sub rsp, 16
%define buf0 [rsp]
%define buf1 [rsp+8]
%else
%define buf0 r0m
%define buf1 r2m
%endif
CHROMA_H_START
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
movq buf0, m0
movq buf1, m3
LOAD_MASK r2d, r3d
movd m6, [r4] ; tc0
punpcklbw m6, m6
pand m7, m6
DEBLOCK_P0_Q0
movq m0, buf0
movq m3, buf1
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
%if ARCH_X86_64
add rsp, 16
%endif
RET
ALIGN 16
ff_chroma_inter_body_mmxext:
LOAD_MASK r2d, r3d
movd m6, [r4] ; tc0
punpcklbw m6, m6
pand m7, m6
DEBLOCK_P0_Q0
ret
%define t5 r4
%define t6 r5
cglobal deblock_h_chroma422_8, 5, 6
SUB rsp, (1+ARCH_X86_64*2)*mmsize
%if ARCH_X86_64
%define buf0 [rsp+16]
%define buf1 [rsp+8]
%else
%define buf0 r0m
%define buf1 r2m
%endif
movd m6, [r4]
punpcklbw m6, m6
movq [rsp], m6
CHROMA_H_START
TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
movq buf0, m0
movq buf1, m3
LOAD_MASK r2d, r3d
movd m6, [rsp]
punpcklwd m6, m6
pand m7, m6
DEBLOCK_P0_Q0
movq m0, buf0
movq m3, buf1
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
lea r0, [r0+r1*8]
lea t5, [t5+r1*8]
TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
movq buf0, m0
movq buf1, m3
LOAD_MASK r2d, r3d
movd m6, [rsp+4]
punpcklwd m6, m6
pand m7, m6
DEBLOCK_P0_Q0
movq m0, buf0
movq m3, buf1
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
ADD rsp, (1+ARCH_X86_64*2)*mmsize
RET
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
movq m4, %1
pxor m4, %3
pand m4, [pb_1] ; m4 = (p0^q1)&1
pavgb %1, %3
psubusb %1, m4
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
;------------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_8, 4,5
CHROMA_V_START
movq m0, [t5]
movq m1, [t5+r1]
movq m2, [r0]
movq m3, [r0+r1]
call ff_chroma_intra_body_mmxext
movq [t5+r1], m1
movq [r0], m2
RET
;------------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_8, 4,6
CHROMA_H_START
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
call ff_chroma_intra_body_mmxext
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
RET
cglobal deblock_h_chroma422_intra_8, 4, 6
CHROMA_H_START
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
call ff_chroma_intra_body_mmxext
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
lea r0, [r0+r1*8]
lea t5, [t5+r1*8]
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
call ff_chroma_intra_body_mmxext
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
RET
ALIGN 16
ff_chroma_intra_body_mmxext:
LOAD_MASK r2d, r3d
movq m5, m1
movq m6, m2
CHROMA_INTRA_P0 m1, m0, m3
CHROMA_INTRA_P0 m2, m3, m0
psubb m1, m5
psubb m2, m6
pand m1, m7
pand m2, m7
paddb m1, m5
paddb m2, m6
ret
%macro LOAD_8_ROWS 8
movd m0, %1

View File

@ -798,9 +798,11 @@ cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
%endmacro
%if ARCH_X86_64 == 0
%if HAVE_ALIGNED_STACK == 0
INIT_MMX mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
@ -938,10 +940,6 @@ cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
sub r0, r1
shl r2d, 2
shl r3d, 2
%if mmsize < 16
mov r6, 16/mmsize
.loop:
%endif
CHROMA_V_LOAD r5
LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
@ -952,16 +950,7 @@ cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
pand m7, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
CHROMA_V_STORE
%if mmsize < 16
add r0, mmsize
add r5, mmsize
add r4, mmsize/4
dec r6
jg .loop
REP_RET
%else
RET
%endif
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra_10(uint16_t *pix, int stride, int alpha,
@ -973,24 +962,12 @@ cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
sub r0, r1
shl r2d, 2
shl r3d, 2
%if mmsize < 16
mov r5, 16/mmsize
.loop:
%endif
CHROMA_V_LOAD r4
LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
CHROMA_V_STORE
%if mmsize < 16
add r0, mmsize
add r4, mmsize
dec r5
jg .loop
REP_RET
%else
RET
%endif
;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
@ -1002,10 +979,6 @@ cglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_,
mov r5, pix_q
lea r6, [3*stride_q]
add r5, r6
%if mmsize == 8
mov r6d, 2
.loop:
%endif
CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize]
LOAD_AB m4, m5, alpha_d, beta_d
@ -1018,13 +991,6 @@ cglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_,
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize]
%if mmsize == 8
lea pix_q, [pix_q + 4*stride_q]
lea r5, [r5 + 4*stride_q]
add tc0_q, 2
dec r6d
jg .loop
%endif
RET
;-----------------------------------------------------------------------------
@ -1068,10 +1034,6 @@ RET
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
%if HAVE_AVX_EXTERNAL

View File

@ -87,13 +87,6 @@ SECTION .text
STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
INIT_MMX mmx
; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
movsxdifnidn r2, r2d
IDCT4_ADD r0, r1, r2
RET
%macro IDCT8_1D 2
psraw m0, m1, 1
SWAP 0, 1
@ -207,23 +200,6 @@ cglobal h264_idct_add_8, 3, 3, 0
STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro
INIT_MMX mmx
; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
movsxdifnidn r2, r2d
%assign pad 128+4-(stack_offset&7)
SUB rsp, pad
add word [r1], 32
IDCT8_ADD_MMX_START r1 , rsp
IDCT8_ADD_MMX_START r1+8, rsp+64
lea r3, [r0+4]
IDCT8_ADD_MMX_END r0 , rsp, r2, r1
IDCT8_ADD_MMX_END r3 , rsp+8, r2
ADD rsp, pad
RET
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
IDCT8_1D_FULL %2
@ -315,16 +291,7 @@ cglobal h264_idct8_add_8, 3, 4, 10
%endmacro
INIT_MMX mmxext
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
movsxd r2, r2d
movsx r3, word [r1]
mov dword [r1], 0
DC_ADD_MMXEXT_INIT r3, r2
DC_ADD_MMXEXT_OP movh, r0, r2, r3
RET
; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
movsxd r2, r2d
@ -336,15 +303,6 @@ cglobal h264_idct8_dc_add_8, 3, 4, 0
DC_ADD_MMXEXT_OP mova, r0, r2, r3
RET
%else
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 2, 3, 0
movsx r2, word [r1]
mov dword [r1], 0
mov r1, r2m
DC_ADD_MMXEXT_INIT r2, r1
DC_ADD_MMXEXT_OP movh, r0, r1, r2
RET
; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
movsx r2, word [r1]
@ -357,247 +315,6 @@ cglobal h264_idct8_dc_add_8, 2, 3, 0
RET
%endif
INIT_MMX mmx
; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
movsxdifnidn r3, r3d
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .skipblock
mov r6d, dword [r1+r5*4]
lea r6, [r0+r6]
IDCT4_ADD r6, r2, r3
.skipblock:
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
movsxdifnidn r3, r3d
%assign pad 128+4-(stack_offset&7)
SUB rsp, pad
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .skipblock
mov r6d, dword [r1+r5*4]
add r6, r0
add word [r2], 32
IDCT8_ADD_MMX_START r2 , rsp
IDCT8_ADD_MMX_START r2+8, rsp+64
IDCT8_ADD_MMX_END r6 , rsp, r3, r2
mov r6d, dword [r1+r5*4]
lea r6, [r0+r6+4]
IDCT8_ADD_MMX_END r6 , rsp+8, r3
.skipblock:
add r5, 4
add r2, 128
cmp r5, 16
jl .nextblock
ADD rsp, pad
RET
INIT_MMX mmxext
; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
movsxdifnidn r3, r3d
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .skipblock
cmp r6, 1
jnz .no_dc
movsx r6, word [r2]
test r6, r6
jz .no_dc
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
mov dst2d, dword [r1+r5*4]
lea dst2q, [r0+dst2q]
DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
.no_dc:
mov r6d, dword [r1+r5*4]
add r6, r0
IDCT4_ADD r6, r2, r3
.skipblock:
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
INIT_MMX mmx
; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
movsxdifnidn r3, r3d
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
or r6w, word [r2]
test r6, r6
jz .skipblock
mov r6d, dword [r1+r5*4]
add r6, r0
IDCT4_ADD r6, r2, r3
.skipblock:
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
INIT_MMX mmxext
; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
movsxdifnidn r3, r3d
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .try_dc
mov r6d, dword [r1+r5*4]
lea r6, [r0+r6]
IDCT4_ADD r6, r2, r3
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
.try_dc:
movsx r6, word [r2]
test r6, r6
jz .skipblock
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
mov dst2d, dword [r1+r5*4]
add dst2q, r0
DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
.skipblock:
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
movsxdifnidn r3, r3d
%assign pad 128+4-(stack_offset&7)
SUB rsp, pad
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .skipblock
cmp r6, 1
jnz .no_dc
movsx r6, word [r2]
test r6, r6
jz .no_dc
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
mov dst2d, dword [r1+r5*4]
lea dst2q, [r0+dst2q]
DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
lea dst2q, [dst2q+r3*4]
DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
add r5, 4
add r2, 128
cmp r5, 16
jl .nextblock
ADD rsp, pad
RET
.no_dc:
mov r6d, dword [r1+r5*4]
add r6, r0
add word [r2], 32
IDCT8_ADD_MMX_START r2 , rsp
IDCT8_ADD_MMX_START r2+8, rsp+64
IDCT8_ADD_MMX_END r6 , rsp, r3, r2
mov r6d, dword [r1+r5*4]
lea r6, [r0+r6+4]
IDCT8_ADD_MMX_END r6 , rsp+8, r3
.skipblock:
add r5, 4
add r2, 128
cmp r5, 16
jl .nextblock
ADD rsp, pad
RET
INIT_XMM sse2
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
@ -678,30 +395,6 @@ h264_idct_add8_mmx_plane:
jnz .nextblock
rep ret
; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
movsxdifnidn r3, r3d
mov r5, 16
add r2, 512
%ifdef PIC
lea picregq, [scan8_mem]
%endif
%if ARCH_X86_64
mov dst2q, r0
%endif
call h264_idct_add8_mmx_plane
mov r5, 32
add r2, 384
%if ARCH_X86_64
add dst2q, gprsize
%else
add r0mp, gprsize
%endif
call h264_idct_add8_mmx_plane
RET ; TODO: check rep ret after a function call
cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
movsxdifnidn r3, r3d
@ -734,74 +427,6 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, str
RET ; TODO: check rep ret after a function call
h264_idct_add8_mmxext_plane:
movsxdifnidn r3, r3d
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .try_dc
%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
add r0, [dst2q]
%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
add r0, dword [r1+r5*4]
%endif
IDCT4_ADD r0, r2, r3
inc r5
add r2, 32
test r5, 3
jnz .nextblock
rep ret
.try_dc:
movsx r6, word [r2]
test r6, r6
jz .skipblock
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
add r0, [dst2q]
%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
add r0, dword [r1+r5*4]
%endif
DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
inc r5
add r2, 32
test r5, 3
jnz .nextblock
rep ret
INIT_MMX mmxext
; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
movsxdifnidn r3, r3d
mov r5, 16
add r2, 512
%if ARCH_X86_64
mov dst2q, r0
%endif
%ifdef PIC
lea picregq, [scan8_mem]
%endif
call h264_idct_add8_mmxext_plane
mov r5, 32
add r2, 384
%if ARCH_X86_64
add dst2q, gprsize
%else
add r0mp, gprsize
%endif
call h264_idct_add8_mmxext_plane
RET ; TODO: check rep ret after a function call
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext:
movsxdifnidn r3, r3d
@ -1129,18 +754,11 @@ cglobal h264_luma_dc_dequant_idct, 3, 4, %1
inc t1d
shr t3d, t0b
sub t1d, t0d
%if cpuflag(sse2)
movd xmm6, t1d
DEQUANT_STORE xmm6
%else
movd m6, t1d
DEQUANT_STORE m6
%endif
RET
%endmacro
INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7

View File

@ -70,19 +70,6 @@ SECTION .text
packuswb m0, m1
%endmacro
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
WEIGHT_SETUP
.nextrow:
WEIGHT_OP 0, 4
mova [r0 ], m0
WEIGHT_OP 8, 12
mova [r0+8], m0
add r0, r1
dec r2d
jnz .nextrow
REP_RET
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
WEIGHT_SETUP
@ -95,8 +82,6 @@ cglobal h264_weight_%1, 6, 6, %2
REP_RET
%endmacro
INIT_MMX mmxext
WEIGHT_FUNC_MM 8, 0
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8
@ -198,25 +183,6 @@ WEIGHT_FUNC_HALF_MM 8, 8
packuswb m0, m1
%endmacro
INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
BIWEIGHT_SETUP
movifnidn r3d, r3m
.nextrow:
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, 4
BIWEIGHT_STEPB
mova [r0], m0
BIWEIGHT_STEPA 0, 1, 8
BIWEIGHT_STEPA 1, 2, 12
BIWEIGHT_STEPB
mova [r0+8], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
BIWEIGHT_SETUP
@ -233,8 +199,6 @@ cglobal h264_biweight_%1, 7, 8, %2
REP_RET
%endmacro
INIT_MMX mmxext
BIWEIGHT_FUNC_MM 8, 0
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8

View File

@ -31,17 +31,14 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
int16_t *block, \
int stride);
IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 8, sse2)
IDCT_ADD_FUNC(, 8, avx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmxext)
IDCT_ADD_FUNC(_dc, 8, sse2)
IDCT_ADD_FUNC(_dc, 8, avx)
IDCT_ADD_FUNC(_dc, 10, mmxext)
IDCT_ADD_FUNC(8_dc, 8, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
IDCT_ADD_FUNC(, 10, avx)
@ -54,17 +51,11 @@ void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, const int *block_offset, \
int16_t *block, int stride, const uint8_t nnzc[5 * 8]);
IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, avx)
@ -76,8 +67,6 @@ void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
(uint8_t **dst, const int *block_offset, \
int16_t *block, int stride, const uint8_t nnzc[15 * 8]);
IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)
@ -87,7 +76,6 @@ IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8_422, 10, avx)
void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
/***********************************/
@ -112,14 +100,6 @@ void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
int beta);
#define LF_FUNCS(type, depth) \
LF_FUNC(h, chroma, depth, mmxext) \
LF_IFUNC(h, chroma_intra, depth, mmxext) \
LF_FUNC(h, chroma422, depth, mmxext) \
LF_IFUNC(h, chroma422_intra, depth, mmxext) \
LF_FUNC(v, chroma, depth, mmxext) \
LF_IFUNC(v, chroma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, mmxext) \
LF_IFUNC(h, luma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, sse2) \
LF_IFUNC(h, luma_intra, depth, sse2) \
LF_FUNC(v, luma, depth, sse2) \
@ -147,27 +127,10 @@ LF_FUNC(h, luma_mbaff, 8, avx)
LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
LF_FUNC(v8, luma, 8, mmxext)
static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0)
{
if ((tc0[0] & tc0[1]) >= 0)
ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
if ((tc0[2] & tc0[3]) >= 0)
ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
}
LF_IFUNC(v8, luma_intra, 8, mmxext)
static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride,
int alpha, int beta)
{
ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
}
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
LF_FUNC(v, luma, 10, mmxext)
LF_FUNC(h, luma, 10, mmxext)
LF_IFUNC(v, luma_intra, 10, mmxext)
LF_IFUNC(h, luma_intra, 10, mmxext)
/***********************************/
/* weighted prediction */
@ -187,14 +150,13 @@ void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
H264_WEIGHT(W, mmxext) \
H264_BIWEIGHT(W, mmxext)
#define H264_BIWEIGHT_MMX_SSE(W) \
H264_BIWEIGHT_MMX(W) \
#define H264_BIWEIGHT_SSE(W) \
H264_WEIGHT(W, sse2) \
H264_BIWEIGHT(W, sse2) \
H264_BIWEIGHT(W, ssse3)
H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE(8)
H264_BIWEIGHT_SSE(16)
H264_BIWEIGHT_SSE(8)
H264_BIWEIGHT_MMX(4)
#define H264_WEIGHT_10(W, DEPTH, OPT) \
@ -236,52 +198,16 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
if (bit_depth == 8) {
if (EXTERNAL_MMX(cpu_flags)) {
c->h264_idct_dc_add =
c->h264_idct_add = ff_h264_idct_add_8_mmx;
c->h264_idct8_dc_add =
c->h264_idct8_add = ff_h264_idct8_add_8_mmx;
c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
if (chroma_format_idc <= 1) {
c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
} else {
c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx;
}
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
if (cpu_flags & AV_CPU_FLAG_CMOV)
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext;
if (chroma_format_idc <= 1)
c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext;
}
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext;
c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
@ -350,19 +276,12 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
if (chroma_format_idc <= 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
} else {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext;
}
#if ARCH_X86_32 && !HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
#endif /* ARCH_X86_32 */
#endif /* ARCH_X86_32 && !HAVE_ALIGNED_STACK */
c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {