From e40ea9f34b196176d80f68c0925de7dc785a5df6 Mon Sep 17 00:00:00 2001
From: James Almer
Date: Wed, 22 Nov 2023 16:25:54 -0300
Subject: [PATCH] x86/ac3dsp: add ff_float_to_fixed24_avx()

Signed-off-by: James Almer
---
Reviewer notes (commentary only; not part of the commit message or the diff):
 * Adds a YMM (INIT_YMM avx) build of float_to_fixed24 after the existing
   SSE2 one and installs it in ff_ac3dsp_init_x86() when
   EXTERNAL_AVX_FAST(cpu_flags) holds.
 * Each source vector is multiplied by the pf_1_24 constant (broadcast into
   m0), converted with cvtps2dq and written with mova. pf_1_24 is defined
   outside this patch; presumably 2^24, matching the documented output
   range [-(1<<24),(1<<24)] - TODO confirm.
 * len is turned into a byte count (shl lenq, 2), both pointers are moved to
   the end of their buffers and lenq is negated, so the loop indexes with a
   negative offset that grows by mmsize*4 until it is no longer negative.
   Four vectors are handled per iteration (32 floats with 32-byte YMM
   registers), which fits the documented "multiple of 32" constraint on len.
 * The alignment requirement documented in ac3dsp.h and the LOCAL_ALIGNED
   buffers in apply_channel_coupling() are raised from 16 to 32 bytes.
   Other callers of float_to_fixed24 are not visible in this patch; verify
   that they also pass 32-byte aligned buffers.

 libavcodec/ac3dsp.h          |  4 ++--
 libavcodec/ac3enc_template.c |  4 ++--
 libavcodec/x86/ac3dsp.asm    | 24 ++++++++++++++++++++++++
 libavcodec/x86/ac3dsp_init.c |  4 ++++
 4 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
index 9996ef19ec..ec2f598451 100644
--- a/libavcodec/ac3dsp.h
+++ b/libavcodec/ac3dsp.h
@@ -47,9 +47,9 @@ typedef struct AC3DSPContext {
      * [-(1<<24),(1<<24)]
      *
      * @param dst destination array of int32_t.
-     *            constraints: 16-byte aligned
+     *            constraints: 32-byte aligned
      * @param src source array of float.
-     *            constraints: 16-byte aligned
+     *            constraints: 32-byte aligned
      * @param len number of elements to convert.
      *            constraints: multiple of 32 greater than zero
      */
diff --git a/libavcodec/ac3enc_template.c b/libavcodec/ac3enc_template.c
index be4ecebc9c..ce9ef58a33 100644
--- a/libavcodec/ac3enc_template.c
+++ b/libavcodec/ac3enc_template.c
@@ -110,9 +110,9 @@ static void apply_mdct(AC3EncodeContext *s)
  */
 static void apply_channel_coupling(AC3EncodeContext *s)
 {
-    LOCAL_ALIGNED_16(CoefType, cpl_coords, [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]);
+    LOCAL_ALIGNED_32(CoefType, cpl_coords, [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]);
 #if AC3ENC_FLOAT
-    LOCAL_ALIGNED_16(int32_t, fixed_cpl_coords, [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]);
+    LOCAL_ALIGNED_32(int32_t, fixed_cpl_coords, [AC3_MAX_BLOCKS], [AC3_MAX_CHANNELS][16]);
 #else
     int32_t (*fixed_cpl_coords)[AC3_MAX_CHANNELS][16] = cpl_coords;
 #endif
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 42c8310462..0ba980aa7b 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -128,6 +128,30 @@ cglobal float_to_fixed24, 3, 3, 9, dst, src, len
     jl .loop
     RET
 
+INIT_YMM avx
+cglobal float_to_fixed24, 3, 3, 5, dst, src, len
+    vbroadcastf128  m0, [pf_1_24]
+    shl     lenq, 2
+    add     srcq, lenq
+    add     dstq, lenq
+    neg     lenq
+.loop:
+    mulps    m1, m0, [srcq+lenq+mmsize*0]
+    mulps    m2, m0, [srcq+lenq+mmsize*1]
+    mulps    m3, m0, [srcq+lenq+mmsize*2]
+    mulps    m4, m0, [srcq+lenq+mmsize*3]
+    cvtps2dq m1, m1
+    cvtps2dq m2, m2
+    cvtps2dq m3, m3
+    cvtps2dq m4, m4
+    mova [dstq+lenq+mmsize*0], m1
+    mova [dstq+lenq+mmsize*1], m2
+    mova [dstq+lenq+mmsize*2], m3
+    mova [dstq+lenq+mmsize*3], m4
+    add     lenq, mmsize*4
+    jl .loop
+    RET
+
 ;------------------------------------------------------------------------------
 ; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
 ;------------------------------------------------------------------------------
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index 472d39fa5e..353cf38f86 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -27,6 +27,7 @@
 
 void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 
 void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, size_t len);
+void ff_float_to_fixed24_avx (int32_t *dst, const float *src, size_t len);
 
 int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
@@ -48,6 +49,9 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
         if (!(cpu_flags & AV_CPU_FLAG_ATOM))
             c->extract_exponents = ff_ac3_extract_exponents_ssse3;
     }
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        c->float_to_fixed24 = ff_float_to_fixed24_avx;
+    }
 }
 
 #define DOWNMIX_FUNC_OPT(ch, opt) \