mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-26 19:01:44 +02:00
x86/hevc_res_add: add ff_hevc_transform_add32_8_avx2
~20% faster than AVX.

Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
parent
467a55a4ee
commit
c3d2426cca
@ -89,8 +89,12 @@ cglobal hevc_transform_add4_8, 3, 4, 6
|
||||
%endmacro
|
||||
|
||||
%macro TR_ADD_SSE_16_32_8 3
|
||||
mova m2, [r1+%1 ]
|
||||
mova m6, [r1+%1+16]
|
||||
mova xm2, [r1+%1 ]
|
||||
mova xm6, [r1+%1+16]
|
||||
%if cpuflag(avx2)
|
||||
vinserti128 m2, m2, [r1+%1+32], 1
|
||||
vinserti128 m6, m6, [r1+%1+48], 1
|
||||
%endif
|
||||
%if cpuflag(avx)
|
||||
psubw m1, m0, m2
|
||||
psubw m5, m0, m6
|
||||
@ -103,8 +107,12 @@ cglobal hevc_transform_add4_8, 3, 4, 6
|
||||
packuswb m2, m6
|
||||
packuswb m1, m5
|
||||
|
||||
mova m4, [r1+%1+32]
|
||||
mova m6, [r1+%1+48]
|
||||
mova xm4, [r1+%1+mmsize*2 ]
|
||||
mova xm6, [r1+%1+mmsize*2+16]
|
||||
%if cpuflag(avx2)
|
||||
vinserti128 m4, m4, [r1+%1+96 ], 1
|
||||
vinserti128 m6, m6, [r1+%1+112], 1
|
||||
%endif
|
||||
%if cpuflag(avx)
|
||||
psubw m3, m0, m4
|
||||
psubw m5, m0, m6
|
||||
@ -169,6 +177,21 @@ TRANSFORM_ADD_8
|
||||
INIT_XMM avx
|
||||
TRANSFORM_ADD_8
|
||||
|
||||
; AVX2 build of the 32-wide, 8-bit transform-add routine.
; NOTE(review): INIT_YMM is presumably the x86inc switch that makes mN refer to
; 256-bit ymm registers (mmsize == 32) for the code below - confirm against x86inc.
INIT_YMM avx2
|
||||
; void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
|
||||
; Register use as seen in this body: r0 = dst row pointer, r1 = coeffs pointer,
; r2 = stride, r3 = scratch holding 3*stride.
; NOTE(review): "3, 4, 7" is presumably x86inc's (args, GPRs, vector regs) triple - confirm.
cglobal hevc_transform_add32_8, 3, 4, 7
|
||||
; m0 = 0; the macro subtracts coefficients from it (psubw mX, m0, mY) to get -coeff.
pxor m0, m0
|
||||
; r3 = 3 * stride, so the fourth row of each group is addressable as r0+r3.
lea r3, [r2*3]
|
||||
; First group of four rows. Each macro call takes a byte offset into coeffs
; (0, then 128) and two row addresses; 128 bytes = 2 rows x 32 int16_t.
; Rows 0 and 1:
TR_ADD_SSE_16_32_8 0, r0, r0+r2
|
||||
; Rows 2 and 3:
TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
|
||||
; Seven more unrolled groups of four rows: 8 groups x 4 rows = 32 rows in total.
%rep 7
|
||||
; Advance coeffs by four rows' worth of data (4 x 32 x 2 bytes = 256).
add r1, 256
|
||||
; Advance dst by four rows.
lea r0, [r0+r2*4]
|
||||
; Rows 0 and 1 of this group:
TR_ADD_SSE_16_32_8 0, r0, r0+r2
|
||||
; Rows 2 and 3 of this group:
TR_ADD_SSE_16_32_8 128, r0+r2*2, r0+r3
|
||||
%endrep
|
||||
RET
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_hevc_transform_add_10(pixel *dst, int16_t *block, int stride)
|
||||
;-----------------------------------------------------------------------------
|
||||
|
@ -143,6 +143,8 @@ void ff_hevc_transform_add8_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t strid
|
||||
void ff_hevc_transform_add16_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
|
||||
void ff_hevc_transform_add32_8_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
|
||||
|
||||
void ff_hevc_transform_add32_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
|
||||
|
||||
void ff_hevc_transform_add4_10_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
|
||||
void ff_hevc_transform_add8_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
|
||||
void ff_hevc_transform_add16_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
|
||||
|
@ -555,6 +555,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
||||
if (EXTERNAL_AVX2(cpu_flags)) {
|
||||
c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
|
||||
c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
|
||||
|
||||
c->transform_add[3] = ff_hevc_transform_add32_8_avx2;
|
||||
}
|
||||
} else if (bit_depth == 10) {
|
||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||
|
Loading…
Reference in New Issue
Block a user