swr: implement stereo S16/S32/FLT->S16/S32/FLT planar->packed in SSE/SSE2

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
2024-12-23 12:43:46 +02:00 · 2012-05-05 15:31:06 +02:00 · 2012-05-05 15:31:06 +02:00 · 47055b8913
commit 47055b8913
parent fec3700dcd
2 changed files with 182 additions and 0 deletions
--- a/libswresample/x86/audio_convert.asm
+++ b/libswresample/x86/audio_convert.asm
@ -227,6 +227,135 @@ int32_to_int16_u_int %+ SUFFIX
    REP_RET
 %endmacro

+;------------------------------------------------------------------------------
+; PACK_2CH to, from, a/u, log2_outsize, log2_insize [, convert [, init]]
+;
+; Defines pack_2ch_<from>_to_<to>_<a/u>(dst, src, len): interleaves the two
+; planes src[0] and src[1] into the packed buffer dst[0], optionally
+; converting the sample format on the way.
+;
+;   to, from      sample type names, only used to build the symbol name
+;   a/u           a = aligned vector loads/stores, u = unaligned
+;   log2_outsize  log2 of the size in bytes of one output sample
+;   log2_insize   log2 of the size in bytes of one input sample
+;   convert       optional macro expanded once per iteration, after the
+;                 channels have been interleaved and before the stores
+;   init          optional macro expanded once before the loop
+;
+; len is a per-channel sample count. The loop always runs at least once and
+; advances by a whole batch of vectors with no tail handling, so the caller
+; has to pass a positive multiple of the batch size.
+;
+; Six vector registers are declared because the convert/init macros used
+; with this macro touch m0-m5.
+;------------------------------------------------------------------------------
+%macro PACK_2CH 5-7
+cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2
+    ; dst and src arrive as arrays of plane pointers; fetch the planes used.
+    mov src2q   , [srcq+gprsize]
+    mov srcq    , [srcq]
+    mov dstq    , [dstq]
+%ifidn %3, a
+    ; Aligned variant: if any of the three pointers is not a multiple of
+    ; mmsize, continue in the body of the matching unaligned variant.
+    test dstq, mmsize-1
+        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
+    test srcq, mmsize-1
+        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
+    test src2q, mmsize-1
+        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
+%else
+; Start of the unaligned body; also the fallback target of the aligned variant.
+pack_2ch_%2_to_%1_u_int %+ SUFFIX
+%endif
+    ; Move every pointer to the end of its buffer and let len run from
+    ; -len up to 0, so a single register serves as index and loop counter.
+    lea     srcq , [srcq  + (1<<%5)*lenq]
+    lea     src2q, [src2q + (1<<%5)*lenq]
+    lea     dstq , [dstq  + (2<<%4)*lenq]
+    neg     lenq
+    ; One-time setup (expands to nothing when no init macro was passed).
+    %7
+.next:
+    ; Interleave one vector of each channel: m0 = low half, m1 = high half.
+    mov%3     m0, [         srcq +(1<<%5)*lenq]
+    mova      m1, m0
+    mov%3     m2, [         src2q+(1<<%5)*lenq]
+%if %5 == 1
+    punpcklwd m0, m2
+    punpckhwd m1, m2
+%else
+    punpckldq m0, m2
+    punpckhdq m1, m2
+%endif
+%if %4 < %5
+    ; Output samples are narrower than input samples: interleave a second
+    ; vector per channel into m2/m3 so the convert macro can pack m0-m3
+    ; down into the two vectors stored below.
+    mov%3     m2, [mmsize + srcq +(1<<%5)*lenq]
+    mova      m3, m2
+    mov%3     m4, [mmsize + src2q+(1<<%5)*lenq]
+    punpckldq m2, m4
+    punpckhdq m3, m4
+%endif
+    ; Per-iteration sample conversion (expands to nothing for plain copies).
+    %6
+    mov%3 [           dstq+(2<<%4)*lenq], m0
+    mov%3 [  mmsize + dstq+(2<<%4)*lenq], m1
+%if %4 > %5
+    ; Output samples are wider than input samples: the convert macro has
+    ; expanded m0/m1 into m0-m3, so four vectors are written per iteration.
+    mov%3 [2*mmsize + dstq+(2<<%4)*lenq], m2
+    mov%3 [3*mmsize + dstq+(2<<%4)*lenq], m3
+    add lenq, 4*mmsize/(2<<%4)
+%else
+    add lenq, 2*mmsize/(2<<%4)
+%endif
+        jl .next
+    REP_RET
+%endmacro
+
+; Widens the int16 samples held in m0 and m1 to 32-bit lanes in m0-m3.
+; Each 16-bit value is unpacked above a zero word, i.e. it ends up in the
+; upper half of its 32-bit lane. Old m0 expands into m0/m1, old m1 into
+; m2/m3. m4 is used as scratch.
+%macro INT16_TO_INT32_N 0
+    ; old m1 -> m2 (low half), m3 (high half)
+    pxor      m2, m2
+    punpcklwd m2, m1
+    pxor      m3, m3
+    punpckhwd m3, m1
+    ; keep old m0 reachable as m4 while m0/m1 are rebuilt
+    SWAP 4,0
+    pxor      m0, m0
+    punpcklwd m0, m4
+    pxor      m1, m1
+    punpckhwd m1, m4
+%endmacro
+
+; Narrows the int32 samples in m0-m3 to int16 in m0/m1: every lane keeps its
+; upper 16 bits (arithmetic shift right by 16) and pairs of registers are
+; packed with signed saturation.
+%macro INT32_TO_INT16_N 0
+    ; m0:m1 -> m0
+    psrad     m0, 16
+    psrad     m1, 16
+    packssdw  m0, m1
+    ; m2:m3 -> m2, which is then renamed so the result is visible as m1
+    psrad     m2, 16
+    psrad     m3, 16
+    packssdw  m2, m3
+    SWAP 1,2
+%endmacro
+
+; int32 -> float helpers for PACK_2CH.
+; INIT loads the scale factor flt2pm31 into m3 once; the constant is defined
+; elsewhere in the file (presumably 2^-31, going by its name).
+%macro INT32_TO_FLOAT_INIT 0
+    mova      m3, [flt2pm31]
+%endmacro
+; Converts the int32 lanes of m0 and m1 to float and multiplies them by m3.
+%macro INT32_TO_FLOAT_N 0
+    cvtdq2ps  m0, m0
+    mulps m0, m0, m3
+    cvtdq2ps  m1, m1
+    mulps m1, m1, m3
+%endmacro
+
+; float -> int32 helpers for PACK_2CH.
+; INIT loads the scale factor flt2p31 into m3 once; the constant is defined
+; elsewhere in the file (presumably 2^31, going by its name).
+%macro FLOAT_TO_INT32_INIT 0
+    mova      m3, [flt2p31]
+%endmacro
+; Scales the floats in m0 and m1 by m3 and converts them to int32.
+; cmpnltps leaves an all-ones lane (-1 as an integer) wherever the scaled
+; value is not below m3; adding that mask to the converted integer lowers
+; the result by one in exactly those lanes. m2 and m4 are scratch.
+%macro FLOAT_TO_INT32_N 0
+    ; first register
+    mulps m0, m3
+    cvtps2dq  m2, m0
+    cmpnltps m0, m3
+    paddd m0, m2
+    ; second register
+    mulps m1, m3
+    cvtps2dq  m4, m1
+    cmpnltps m1, m3
+    paddd m1, m4
+%endmacro
+
+; int16 -> float helpers for PACK_2CH.
+; INIT loads the scale factor flt2pm31 into m5 once; m5 is used because
+; INT16_TO_INT32_N already occupies m0-m4.
+%macro INT16_TO_FLOAT_INIT 0
+    mova      m5, [flt2pm31]
+%endmacro
+; Widens the int16 samples in m0/m1 to 32-bit lanes in m0-m3, then converts
+; every register to float and multiplies it by m5.
+%macro INT16_TO_FLOAT_N 0
+    INT16_TO_INT32_N
+    cvtdq2ps  m0, m0
+    mulps m0, m0, m5
+    cvtdq2ps  m1, m1
+    mulps m1, m1, m5
+    cvtdq2ps  m2, m2
+    mulps m2, m2, m5
+    cvtdq2ps  m3, m3
+    mulps m3, m3, m5
+%endmacro
+
+; float -> int16 helpers for PACK_2CH.
+; INIT loads the scale factor flt2p15 into m5 once; the constant is defined
+; elsewhere in the file (presumably 2^15, going by its name).
+%macro FLOAT_TO_INT16_INIT 0
+    mova      m5, [flt2p15]
+%endmacro
+; Scales the floats in m0-m3 by m5, converts them to int32 and packs them
+; with signed saturation into int16: m0:m1 -> m0 and m2:m3 -> m1.
+%macro FLOAT_TO_INT16_N 0
+    ; first output register
+    mulps m0, m5
+    cvtps2dq  m0, m0
+    mulps m1, m5
+    cvtps2dq  m1, m1
+    packssdw  m0, m1
+    ; second output register (m1 is free again after the pack above)
+    mulps m2, m5
+    cvtps2dq  m1, m2
+    mulps m3, m5
+    cvtps2dq  m3, m3
+    packssdw  m1, m3
+%endmacro

 INIT_MMX mmx
 INT16_TO_INT32 u
@ -240,6 +369,15 @@ INT16_TO_INT32 a
 INT32_TO_INT16 u
 INT32_TO_INT16 a

+; Stereo planar -> packed functions for integer formats. Arguments are
+; to, from, a/u, log2 output sample size, log2 input sample size and, for
+; the converting variants, the per-iteration conversion macro.
+; Each "a" function jumps into the body of its "u" twin when a pointer is
+; misaligned, so both variants of a pair have to be instantiated.
+PACK_2CH int16, int16, u, 1, 1
+PACK_2CH int16, int16, a, 1, 1
+PACK_2CH int32, int32, u, 2, 2
+PACK_2CH int32, int32, a, 2, 2
+PACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N
+PACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N
+PACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N
+PACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N
+
 INIT_XMM sse2
 INT32_TO_FLOAT u
 INT32_TO_FLOAT a
@ -250,6 +388,16 @@ FLOAT_TO_INT32 a
 FLOAT_TO_INT16 u
 FLOAT_TO_INT16 a

+; Stereo planar -> packed functions that convert to or from float. The
+; sixth argument is the per-iteration conversion macro, the seventh the
+; macro that loads its scale constant once before the loop.
+PACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
+PACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+PACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
+PACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
+PACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
+PACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
+PACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
+
+
 %if HAVE_AVX
 INIT_YMM avx
 INT32_TO_FLOAT u
--- a/libswresample/x86/swresample_x86.c
+++ b/libswresample/x86/swresample_x86.c
@ -35,6 +35,16 @@ void ff_float_to_int16_a_sse2(uint8_t **dst, const uint8_t **src, int len);

 void ff_int32_to_float_a_avx(uint8_t **dst, const uint8_t **src, int len);

+/* Stereo planar -> packed converters implemented in audio_convert.asm.
+ * Names read ff_pack_2ch_<input>_to_<output>_a_<cpu>. Only the "a" entry
+ * points are declared; they fall back to the unaligned code path themselves
+ * when a pointer is not aligned. src holds the two input plane pointers,
+ * dst[0] the packed output, len is a per-channel sample count. */
+void ff_pack_2ch_int16_to_int16_a_sse(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_int32_to_int32_a_sse(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_int16_to_int32_a_sse(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_int32_to_int16_a_sse(uint8_t **dst, const uint8_t **src, int len);
+
+void ff_pack_2ch_int32_to_float_a_sse2(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_float_to_int32_a_sse2(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_int16_to_float_a_sse2(uint8_t **dst, const uint8_t **src, int len);
+void ff_pack_2ch_float_to_int16_a_sse2(uint8_t **dst, const uint8_t **src, int len);
+
 void swri_audio_convert_init_x86(struct AudioConvert *ac,
                                 enum AVSampleFormat out_fmt,
                                 enum AVSampleFormat in_fmt,
@ -56,6 +66,19 @@ void swri_audio_convert_init_x86(struct AudioConvert *ac,
 MULTI_CAPS_FUNC(AV_CPU_FLAG_MMX, mmx)
 MULTI_CAPS_FUNC(AV_CPU_FLAG_SSE, sse)

+    /* Stereo planar -> packed, integer paths. FLTP -> FLT reuses the 32-bit
+     * copy routine because no sample conversion is involved.
+     * NOTE(review): the integer packers use punpck, psrad and pxor on vector
+     * registers; if they are assembled for XMM registers those are SSE2
+     * instructions and AV_CPU_FLAG_SSE would be too weak a check here.
+     * Confirm against the INIT_* line that governs them in audio_convert.asm. */
+    if(mm_flags & AV_CPU_FLAG_SSE) {
+        if(channels == 2) {
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_2ch_int32_to_int32_a_sse;
+            if(   out_fmt == AV_SAMPLE_FMT_S16  && in_fmt == AV_SAMPLE_FMT_S16P)
+                ac->simd_f =  ff_pack_2ch_int16_to_int16_a_sse;
+            if(   out_fmt == AV_SAMPLE_FMT_S32  && in_fmt == AV_SAMPLE_FMT_S16P)
+                ac->simd_f =  ff_pack_2ch_int16_to_int32_a_sse;
+            if(   out_fmt == AV_SAMPLE_FMT_S16  && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_2ch_int32_to_int16_a_sse;
+        }
+    }
+
    if(mm_flags & AV_CPU_FLAG_SSE2) {
        if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P)
            ac->simd_f =  ff_int32_to_float_a_sse2;
@ -65,6 +88,17 @@ MULTI_CAPS_FUNC(AV_CPU_FLAG_SSE, sse)
            ac->simd_f =  ff_float_to_int32_a_sse2;
        if(   out_fmt == AV_SAMPLE_FMT_S16  && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_FLTP)
            ac->simd_f =  ff_float_to_int16_a_sse2;
+
+        /* Stereo planar -> packed with an int <-> float conversion. The
+         * function names read <input>_to_<output>, the conditions test the
+         * output format first. */
+        if(channels == 2) {
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S32P)
+                ac->simd_f =  ff_pack_2ch_int32_to_float_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_S32  && in_fmt == AV_SAMPLE_FMT_FLTP)
+                ac->simd_f =  ff_pack_2ch_float_to_int32_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S16P)
+                ac->simd_f =  ff_pack_2ch_int16_to_float_a_sse2;
+            if(   out_fmt == AV_SAMPLE_FMT_S16  && in_fmt == AV_SAMPLE_FMT_FLTP)
+                ac->simd_f =  ff_pack_2ch_float_to_int16_a_sse2;
+        }
    }
    if(HAVE_AVX && mm_flags & AV_CPU_FLAG_AVX) {
        if(   out_fmt == AV_SAMPLE_FMT_FLT  && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P)