diff --git a/libavfilter/x86/af_volume.asm b/libavfilter/x86/af_volume.asm
index 922ad5dcc4..4e5ad2258c 100644
--- a/libavfilter/x86/af_volume.asm
+++ b/libavfilter/x86/af_volume.asm
@@ -19,12 +19,15 @@
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 ;******************************************************************************
 
-%include "libavutil/x86/x86inc.asm"
+%include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA 32
 
-pw_1:   times 8 dw 1
-pw_128: times 8 dw 128
+pd_1_256:     times 4 dq 0x3F70000000000000 ; 1.0/256 encoded as a double
+pd_int32_max: times 4 dq 0x41DFFFFFFFC00000 ; 2147483647.0 (INT32_MAX) encoded as a double
+pw_1:         times 8 dw 1
+pw_128:       times 8 dw 128
+pq_128:       times 2 dq 128                ; per-qword rounding bias added before the shift in the ssse3/atom loop
 
 SECTION_TEXT
 
@@ -54,3 +57,82 @@ cglobal scale_samples_s16, 4,4,4, dst, src, len, volume
     sub       lenq, mmsize
     jge .loop
     REP_RET
+
+;------------------------------------------------------------------------------
+; void ff_scale_samples_s32(uint8_t *dst, const uint8_t *src, int len,
+;                           int volume)
+;------------------------------------------------------------------------------
+
+%macro SCALE_SAMPLES_S32 0
+cglobal scale_samples_s32, 4,4,4, dst, src, len, volume
+%if ARCH_X86_32 && cpuflag(avx)
+    vbroadcastss   xmm2, volumem          ; splat volume to every dword straight from its memory operand
+%else
+    movd           xmm2, volumed          ; volume -> dword 0
+    pshufd         xmm2, xmm2, 0          ; replicate dword 0 into all four dwords
+%endif
+    CVTDQ2PD         m2, xmm2             ; volume as packed doubles
+    mulpd            m2, m2, [pd_1_256]   ; m2 = volume / 256.0 (the scale factor)
+    mova             m3, [pd_int32_max]   ; m3 = upper clipping bound
+    lea            lenq, [lend*4-mmsize]  ; len*4 bytes minus one group: byte offset of the last mmsize-byte group
+.loop:
+    CVTDQ2PD         m0, [srcq+lenq          ] ; first half of the group  -> doubles
+    CVTDQ2PD         m1, [srcq+lenq+mmsize/2] ; second half of the group -> doubles
+    mulpd            m0, m0, m2           ; sample * volume / 256
+    mulpd            m1, m1, m2
+    minpd            m0, m0, m3           ; clamp against the upper bound only (there is no maxpd)
+    minpd            m1, m1, m3           ; NOTE(review): negative overflow presumably relies on cvtpd2dq yielding 0x80000000 - confirm
+    cvtpd2dq       xmm0, m0               ; doubles -> packed int32
+    cvtpd2dq       xmm1, m1
+%if cpuflag(avx)
+    vmovdqa [dstq+lenq          ], xmm0   ; full xmm store for each half of the group
+    vmovdqa [dstq+lenq+mmsize/2], xmm1
+%else
+    movq    [dstq+lenq          ], xmm0   ; only the low 8 bytes of each result are stored
+    movq    [dstq+lenq+mmsize/2], xmm1
+%endif
+    sub            lenq, mmsize           ; step back one group
+    jge .loop                             ; keep going while the offset is still >= 0
+    REP_RET
+%endmacro
+
+INIT_XMM sse2
+%define CVTDQ2PD cvtdq2pd
+SCALE_SAMPLES_S32
+%define CVTDQ2PD vcvtdq2pd
+INIT_YMM avx
+SCALE_SAMPLES_S32
+%undef CVTDQ2PD
+
+; NOTE: The ssse3/atom version below is not bit-identical with the C version
+;       because it clips to [-INT_MAX, INT_MAX] instead of [INT_MIN, INT_MAX]
+
+INIT_XMM ssse3, atom
+cglobal scale_samples_s32, 4,4,8, dst, src, len, volume
+    movd        m4, volumem               ; volume -> dword 0
+    pshufd      m4, m4, 0                 ; volume in every dword (pmuludq reads dwords 0 and 2)
+    mova        m5, [pq_128]              ; rounding bias, one per qword
+    pxor        m6, m6                    ; zero, compared against below to detect overflow
+    lea       lenq, [lend*4-mmsize]       ; byte offset of the last mmsize-byte group
+.loop:
+    ; dst[i] = av_clipl_int32((src[i] * volume + 128) >> 8);
+    mova        m7, [srcq+lenq]           ; signed samples, kept so psignd can restore the sign
+    pabsd       m3, m7                    ; |sample|: the multiply below is unsigned
+    pshufd      m0, m3, q0100             ; samples 0 and 1 moved to dwords 0 and 2
+    pshufd      m1, m3, q0302             ; samples 2 and 3 moved to dwords 0 and 2
+    pmuludq     m0, m4                    ; 64-bit products |sample| * volume; NOTE(review): presumably volume >= 0 - confirm
+    pmuludq     m1, m4
+    paddq       m0, m5                    ; + 128
+    paddq       m1, m5
+    psrlq       m0, 7                     ; >> 7 here; the remaining >> 1 happens after saturation
+    psrlq       m1, 7
+    shufps      m2, m0, m1, q3131         ; gather the high dword of each 64-bit result
+    shufps      m0, m0, m1, q2020         ; gather the low dword of each 64-bit result
+    pcmpgtd     m2, m6                    ; all-ones where the high dword is > 0, i.e. the result overflowed 32 bits
+    por         m0, m2                    ; overflowed lanes become 0xFFFFFFFF
+    psrld       m0, 1                     ; final >> 1 (total shift of 8); saturated lanes become 0x7FFFFFFF
+    psignd      m0, m7                    ; reapply the sign of the source sample
+    mova  [dstq+lenq], m0
+    sub       lenq, mmsize                ; step back one group
+    jge .loop                             ; keep going while the offset is still >= 0
+    REP_RET
diff --git a/libavfilter/x86/af_volume_init.c b/libavfilter/x86/af_volume_init.c
index 00103df216..02bedd2c57 100644
--- a/libavfilter/x86/af_volume_init.c
+++ b/libavfilter/x86/af_volume_init.c
@@ -25,6 +25,13 @@
 void ff_scale_samples_s16_sse2(uint8_t *dst, const uint8_t *src, int len,
                                int volume);
 
+void ff_scale_samples_s32_sse2(uint8_t *dst, const uint8_t *src, int len,
+                               int volume); /* XMM build of the SCALE_SAMPLES_S32 asm macro */
+void ff_scale_samples_s32_ssse3_atom(uint8_t *dst, const uint8_t *src, int len,
+                                     int volume); /* integer-only variant; clips to [-INT_MAX, INT_MAX] */
+void ff_scale_samples_s32_avx(uint8_t *dst, const uint8_t *src, int len,
+                              int volume); /* YMM build of the SCALE_SAMPLES_S32 asm macro */
+
 void ff_volume_init_x86(VolumeContext *vol)
 {
     int mm_flags = av_get_cpu_flags();
@@ -35,5 +42,18 @@ void ff_volume_init_x86(VolumeContext *vol)
             vol->scale_samples = ff_scale_samples_s16_sse2;
             vol->samples_align = 8;
         }
+    } else if (sample_fmt == AV_SAMPLE_FMT_S32) { /* the checks below are not exclusive: the last match wins */
+        if (EXTERNAL_SSE2(mm_flags)) {
+            vol->scale_samples = ff_scale_samples_s32_sse2;
+            vol->samples_align = 4; /* the asm loop consumes 16 bytes = 4 int32 samples per pass */
+        }
+        if (EXTERNAL_SSSE3(mm_flags) && mm_flags & AV_CPU_FLAG_ATOM) { /* & binds tighter than &&: SSSE3 && (flags & ATOM) */
+            vol->scale_samples = ff_scale_samples_s32_ssse3_atom;
+            vol->samples_align = 4; /* the asm loop consumes 16 bytes = 4 int32 samples per pass */
+        }
+        if (EXTERNAL_AVX(mm_flags)) {
+            vol->scale_samples = ff_scale_samples_s32_avx;
+            vol->samples_align = 8; /* the asm loop consumes 32 bytes = 8 int32 samples per pass */
+        }
     }
 }
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 52ee46ab76..8a30689f00 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -956,6 +956,7 @@ AVX_INSTR cmpps, 1, 0, 0
 AVX_INSTR cmpsd, 1, 0, 0
 AVX_INSTR cmpss, 1, 0, 0
 AVX_INSTR cvtdq2ps, 1, 0, 0
+AVX_INSTR cvtpd2dq, 1, 0, 0 ; used by scale_samples_s32 in libavfilter/x86/af_volume.asm
 AVX_INSTR cvtps2dq, 1, 0, 0
 AVX_INSTR divpd, 1, 0, 0
 AVX_INSTR divps, 1, 0, 0