mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-28 20:53:54 +02:00
vp9/x86: 4x4 iadst SIMD (ssse3) variants.
Cycle measurements for intra itxfm_4x4_add on ped1080p.webm: idct_idct: 66 -> 67 cycles (measurement noise); idct_iadst: 199 -> 79 cycles; iadst_idct: 165 -> 70 cycles; iadst_iadst: 183 -> 82 cycles.
This commit is contained in:
parent
baf47020cd
commit
d43efa68bd
@ -166,7 +166,7 @@ itxfm_func(iadst, idct, size, opt); \
|
||||
itxfm_func(idct, iadst, size, opt); \
|
||||
itxfm_func(iadst, iadst, size, opt)
|
||||
|
||||
itxfm_func(idct, idct, 4, ssse3);
|
||||
itxfm_funcs(4, ssse3);
|
||||
itxfm_funcs(8, ssse3);
|
||||
itxfm_funcs(8, avx);
|
||||
itxfm_funcs(16, ssse3);
|
||||
@ -250,6 +250,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
|
||||
init_subpel3(0, put, ssse3);
|
||||
init_subpel3(1, avg, ssse3);
|
||||
dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
|
||||
dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3;
|
||||
dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3;
|
||||
dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3;
|
||||
if (ARCH_X86_64) {
|
||||
dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
|
||||
dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3;
|
||||
|
@ -58,6 +58,13 @@ VP9_IDCT_COEFFS 8423, 14053
|
||||
VP9_IDCT_COEFFS 13160, 9760
|
||||
VP9_IDCT_COEFFS 2404, 16207
|
||||
|
||||
; Constant tables. Each pw_A_B table repeats the signed 16-bit pair (A, B)
; four times (8 words), so it can be used as the memory operand of pmaddwd
; against interleaved word pairs; VP9_IADST4_1D uses them this way.
pw_5283_13377: times 4 dw 5283, 13377 ; pairs (5283, 13377)
|
||||
pw_9929_13377: times 4 dw 9929, 13377 ; pairs (9929, 13377)
|
||||
pw_15212_m13377: times 4 dw 15212, -13377 ; pairs (15212, -13377); "m" marks a negated value
|
||||
pw_15212_9929: times 4 dw 15212, 9929 ; pairs (15212, 9929)
|
||||
pw_m5283_m15212: times 4 dw -5283, -15212 ; pairs (-5283, -15212)
|
||||
pw_13377x2: times 8 dw 13377*2 ; 8 words of 2*13377; pmulhrsw operand in VP9_IADST4_1D
|
||||
|
||||
pd_8192: times 4 dd 8192 ; 4 dwords of 8192 (1 << 13): rounding term added before a >> 14
|
||||
pw_2048: times 8 dw 2048 ; 8 words of 2048
|
||||
pw_1024: times 8 dw 1024 ; 8 words of 1024
|
||||
@ -239,6 +246,68 @@ cglobal vp9_idct_idct_4x4_add, 4,4,0, dst, stride, block, eob
|
||||
VP9_IDCT4_WRITEOUT
|
||||
RET
|
||||
|
||||
;-------------------------------------------------------------------------------------------
|
||||
; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
|
||||
;-------------------------------------------------------------------------------------------
|
||||
|
||||
; VP9_IADST4_1D: one 1-D pass of the 4-point inverse ADST, applied to the four
; signed 16-bit lanes of each input register in parallel.
; In:    m0..m3 = in0..in3 (MMX registers, 4 words each).
; Out:   m0..m3 = out0..out3 (after the closing SWAP).
; Clobb: xmm0-xmm4.
; out0, out1 and out3 are accumulated as 32-bit sums in XMM registers and
; rounded with (x + 8192) >> 14; out2 is computed on 16-bit words in MMX.
%macro VP9_IADST4_1D 0
|
||||
movq2dq xmm0, m0 ; xmm0 (low qword) = in0
|
||||
movq2dq xmm1, m1 ; xmm1 (low qword) = in1
|
||||
movq2dq xmm2, m2 ; xmm2 (low qword) = in2
|
||||
movq2dq xmm3, m3 ; xmm3 (low qword) = in3, copied before m3 is modified below
|
||||
paddw m3, m0 ; m3 = in3 + in0
|
||||
punpcklwd xmm0, xmm1 ; xmm0 = interleaved word pairs (in0, in1)
|
||||
punpcklwd xmm2, xmm3 ; xmm2 = interleaved word pairs (in2, in3)
|
||||
pmaddwd xmm1, xmm0, [pw_5283_13377] ; xmm1 = 5283*in0 + 13377*in1 (3-operand form; presumably provided by x86inc)
|
||||
pmaddwd xmm4, xmm0, [pw_9929_13377] ; xmm4 = 9929*in0 + 13377*in1
|
||||
pmaddwd xmm0, [pw_15212_m13377] ; xmm0 = 15212*in0 - 13377*in1
|
||||
pmaddwd xmm3, xmm2, [pw_15212_9929] ; xmm3 = 15212*in2 + 9929*in3
|
||||
pmaddwd xmm2, [pw_m5283_m15212] ; xmm2 = -5283*in2 - 15212*in3
|
||||
psubw m3, m2 ; m3 = in0 + in3 - in2
|
||||
paddd xmm0, xmm2 ; xmm0 = 15212*in0 - 13377*in1 - 5283*in2 - 15212*in3
|
||||
paddd xmm3, [pd_8192] ; fold the rounding term into xmm3 (reaches out0 and out3)
|
||||
paddd xmm2, [pd_8192] ; fold the rounding term into xmm2 (reaches out1)
|
||||
paddd xmm1, xmm3 ; xmm1 = 5283*in0 + 13377*in1 + 15212*in2 + 9929*in3 + 8192
|
||||
paddd xmm0, xmm3 ; xmm0 = 15212*in0 - 13377*in1 + 9929*in2 - 5283*in3 + 8192
|
||||
paddd xmm4, xmm2 ; xmm4 = 9929*in0 + 13377*in1 - 5283*in2 - 15212*in3 + 8192
|
||||
psrad xmm1, 14 ; out0 as 32-bit values
|
||||
psrad xmm0, 14 ; out3 as 32-bit values
|
||||
psrad xmm4, 14 ; out1 as 32-bit values
|
||||
pmulhrsw m3, [pw_13377x2] ; out2 = (13377*(in0 - in2 + in3) + 8192) >> 14
|
||||
packssdw xmm0, xmm0 ; saturate the dwords to signed 16-bit words
|
||||
packssdw xmm1, xmm1 ; saturate the dwords to signed 16-bit words
|
||||
packssdw xmm4, xmm4 ; saturate the dwords to signed 16-bit words
|
||||
movdq2q m0, xmm0 ; m0 = out3 (back to MMX)
|
||||
movdq2q m1, xmm1 ; m1 = out0
|
||||
movdq2q m2, xmm4 ; m2 = out1
|
||||
SWAP 0, 1, 2, 3 ; re-label registers so m0..m3 = out0..out3 (SWAP is defined outside this view; presumably an assemble-time rename)
|
||||
%endmacro
|
||||
|
||||
; IADST4_FN: emits one 4x4 inverse-transform-and-add function.
;   %1, %3 = transform names spliced into the symbol vp9_%1_%3_4x4_add
;   %2     = suffix of the 1-D macro run first  (VP9_%2_1D)
;   %4     = suffix of the 1-D macro run second (VP9_%4_1D)
;   %5     = instruction-set name handed to INIT_MMX
; The function loads the 4x4 int16 block, runs the two 1-D passes with a
; transpose in between, clears the block and hands the result to
; VP9_IDCT4_WRITEOUT (defined outside this view).
; NOTE(review): cglobal declares "3, 3, 8" while the code visible here touches
; only xmm0-xmm4 (via VP9_IADST4_1D) and eob is named but never read; confirm
; the intended register counts against the x86inc cglobal convention.
%macro IADST4_FN 5
|
||||
INIT_MMX %5
|
||||
cglobal vp9_%1_%3_4x4_add, 3, 3, 8, dst, stride, block, eob
|
||||
mova m0, [blockq+ 0] ; first 8 bytes of the block (4 coefficients)
|
||||
mova m1, [blockq+ 8] ; next 4 coefficients
|
||||
mova m2, [blockq+16] ; next 4 coefficients
|
||||
mova m3, [blockq+24] ; last 4 coefficients
|
||||
mova m6, [pw_11585x2] ; constant kept in m6; presumably consumed by VP9_IDCT4_1D (not shown here)
|
||||
mova m7, [pd_8192] ; rounding
|
||||
VP9_%2_1D ; first 1-D pass on m0..m3
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 4 ; transpose the 4x4 words in m0..m3 (m4 passed as the fifth argument, presumably scratch)
|
||||
VP9_%4_1D ; second 1-D pass on the transposed data
|
||||
pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
|
||||
mova [blockq+ 0], m4 ; zero all 32 bytes of the coefficient block
|
||||
mova [blockq+ 8], m4
|
||||
mova [blockq+16], m4
|
||||
mova [blockq+24], m4
|
||||
VP9_IDCT4_WRITEOUT ; final output step shared with vp9_idct_idct_4x4_add (macro body not shown here)
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
; Instantiate the three 4x4 variants that involve at least one ADST pass.
IADST4_FN idct, IDCT4, iadst, IADST4, ssse3 ; vp9_idct_iadst_4x4_add: VP9_IDCT4_1D, transpose, VP9_IADST4_1D
|
||||
IADST4_FN iadst, IADST4, idct, IDCT4, ssse3 ; vp9_iadst_idct_4x4_add: VP9_IADST4_1D, transpose, VP9_IDCT4_1D
|
||||
IADST4_FN iadst, IADST4, iadst, IADST4, ssse3 ; vp9_iadst_iadst_4x4_add: VP9_IADST4_1D in both passes
|
||||
|
||||
%if ARCH_X86_64 ; TODO: 32-bit? (32-bit limited to 8 xmm reg, we use more)
|
||||
|
||||
;-------------------------------------------------------------------------------------------
|
||||
|
Loading…
Reference in New Issue
Block a user