Add apply_window_int16() to DSPContext with x86-optimized versions and use it
in the ac3_fixed encoder.
2025-07-16 22:42:38 +02:00 · 2011-03-20 13:31:36 -04:00
parent e971d81364
commit e6e9823488
11 changed files with 210 additions and 36 deletions
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@ -167,7 +167,7 @@ static av_cold int mdct_init(AVCodecContext *avctx, AC3MDCTContext *mdct,
 static void mdct512(AC3MDCTContext *mdct, CoefType *out, SampleType *in);
 static void apply_window(DSPContext *dsp, SampleType *output, const SampleType *input,
-                         const SampleType *window, int n);
+                         const SampleType *window, unsigned int len);
 static int normalize_samples(AC3EncodeContext *s);
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@ -252,15 +252,9 @@ static void mdct512(AC3MDCTContext *mdct, int32_t *out, int16_t *in)
 * Apply KBD window to input samples prior to MDCT.
 */
 static void apply_window(DSPContext *dsp, int16_t *output, const int16_t *input,
-                         const int16_t *window, int n)
+                         const int16_t *window, unsigned int len)
 {
-    int i;
+    dsp->apply_window_int16(output, input, window, len);
-    int n2 = n >> 1;
-    for (i = 0; i < n2; i++) {
-        output[i]     = MUL16(input[i],     window[i]) >> 15;
-        output[n-i-1] = MUL16(input[n-i-1], window[i]) >> 15;
-    }
 }
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@ -83,9 +83,9 @@ static void mdct512(AC3MDCTContext *mdct, float *out, float *in)
 * Apply KBD window to input samples prior to MDCT.
 */
 static void apply_window(DSPContext *dsp, float *output, const float *input,
-                         const float *window, int n)
+                         const float *window, unsigned int len)
 {
-    dsp->vector_fmul(output, input, window, n);
+    dsp->vector_fmul(output, input, window, len);
 }
--- a/libavcodec/ac3tab.c
+++ b/libavcodec/ac3tab.c
@ -141,7 +141,7 @@ const uint8_t ff_ac3_rematrix_band_tab[5] = { 13, 25, 37, 61, 253 };
 /* AC-3 MDCT window */
 /* MDCT window */
-const int16_t ff_ac3_window[AC3_WINDOW_SIZE/2] = {
+DECLARE_ALIGNED(16, const int16_t, ff_ac3_window)[AC3_WINDOW_SIZE/2] = {
    4,    7,   12,   16,   21,   28,   34,   42,
   51,   61,   72,   84,   97,  111,  127,  145,
  164,  184,  207,  231,  257,  285,  315,  347,
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@ -3890,6 +3890,19 @@ static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, co
    return res;
 }
 /**
  * C reference implementation of DSPContext.apply_window_int16.
  *
  * Only the first half of the symmetric window is stored in window[]; each
  * coefficient scales one sample from the front of the buffer and its mirror
  * image from the back. Products are rounded to nearest and shifted down by
  * 15 bits.
  */
 static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len)
 {
    const int half = len >> 1;
    unsigned int tail = len - 1; /* mirror index, walks down from the end */
    int head;

    for (head = 0; head < half; head++, tail--) {
        const int16_t coef = window[head];

        output[head] = (MUL16(input[head], coef) + (1 << 14)) >> 15;
        output[tail] = (MUL16(input[tail], coef) + (1 << 14)) >> 15;
    }
 }
 #define W0 2048
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@ -4364,6 +4377,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@ -524,6 +524,20 @@ typedef struct DSPContext {
     */
    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, const int16_t *v2, const int16_t *v3, int len, int mul);
    /**
     * Apply symmetric window in 16-bit fixed-point.
     *
     * Only the first half of the window is stored: coefficient window[i]
     * scales both input[i] and input[len-1-i], and each product is shifted
     * right by 15 bits. The C version rounds to nearest; the non-bit-exact
     * x86 MMXEXT/SSE2 versions do not round, so their output can differ from
     * the C version.
     *
     * @param output destination array
     *               constraints: 16-byte aligned
     * @param input  source array
     *               constraints: 16-byte aligned
     * @param window window array
     *               constraints: 16-byte aligned, at least len/2 elements
     * @param len    full window length
     *               constraints: greater than zero and a multiple of the
     *               SIMD block size. NOTE(review): the x86 XMM loops consume
     *               16 samples per iteration, so 16 looks like the required
     *               multiple -- confirm for other architectures.
     */
    void (*apply_window_int16)(int16_t *output, const int16_t *input,
                               const int16_t *window, unsigned int len);
    /* rv30 functions */
    qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
    qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@ -2388,6 +2388,20 @@ int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int or
 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
 /* x86 versions of DSPContext.apply_window_int16, selected at run time in
  * dsputil_init_mmx(): the "_ba" variants are installed when
  * CODEC_FLAG_BITEXACT is set, and the "_atom" variant when
  * AV_CPU_FLAG_ATOM is set. */
 void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
 void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
 void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
 void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
 void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
 int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
@ -2749,6 +2763,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 #if HAVE_YASM
            c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
            if (avctx->flags & CODEC_FLAG_BITEXACT) {
                c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
            } else {
                c->apply_window_int16 = ff_apply_window_int16_mmxext;
            }
 #endif
        }
        if(mm_flags & AV_CPU_FLAG_SSE){
@ -2771,13 +2790,30 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 #if HAVE_YASM
            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
            if (avctx->flags & CODEC_FLAG_BITEXACT) {
                c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
            } else {
                if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
                    c->apply_window_int16 = ff_apply_window_int16_sse2;
                }
            }
            c->emulated_edge_mc = emulated_edge_mc_sse;
            c->gmc= gmc_sse;
 #endif
        }
-        if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM) // cachesplit
+        if (mm_flags & AV_CPU_FLAG_SSSE3) {
-            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+#if HAVE_YASM
            if (mm_flags & AV_CPU_FLAG_ATOM) {
                c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
            } else {
                c->apply_window_int16 = ff_apply_window_int16_ssse3;
            }
            if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
                c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
            }
 #endif
        }
    }
    if (CONFIG_ENCODERS)
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@ -27,6 +27,8 @@ pb_zzzzzzzz77777777: times 8 db -1
 pb_7: times 8 db 7
 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
 pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
 pd_16384: times 4 dd 16384
 section .text align=16
@ -202,6 +204,130 @@ SCALARPRODUCT_LOOP 0
    RET
 ;-----------------------------------------------------------------------------
 ; void ff_apply_window_int16(int16_t *output, const int16_t *input,
 ;                            const int16_t *window, unsigned int len)
 ;-----------------------------------------------------------------------------
 ; Reverse the order of the four 16-bit words in register %1.
 ; The pshufw selector 0x1B picks source words 3,2,1,0 for destination words
 ; 0,1,2,3. An optional second argument is accepted but unused.
 %macro REVERSE_WORDS_MMXEXT 1-2
    pshufw   %1, %1, 0x1B
 %endmacro
 ; Reverse the order of the eight 16-bit words in register %1.
 ; The words inside each 64-bit half are reversed first, then the two halves
 ; are exchanged. An optional second argument is accepted but unused.
 %macro REVERSE_WORDS_SSE2 1-2
    pshuflw  %1, %1, 0x1B ; reverse the four low words
    pshufhw  %1, %1, 0x1B ; reverse the four high words
    pshufd   %1, %1, 0x4E ; swap the low and high 64-bit halves
 %endmacro
 ; Reverse the 16-bit words of %1 with a single byte shuffle.
 ; %2 must hold the shuffle control bytes; APPLY_WINDOW_INT16 loads
 ; pb_revwords into m5 and passes it here.
 %macro REVERSE_WORDS_SSSE3 2
    pshufb  %1, %2
 %endmacro
 ; dst = (dst * src) >> 15, truncated (no rounding term is added)
 ; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
 ; in from the pmullw result.
 %macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
    mova    %3, %1 ; keep a copy of dst for the low-half multiply
    pmulhw  %1, %2 ; bits 31..16 of each signed product
    pmullw  %3, %2 ; bits 15..0 of each product
    psrlw   %3, 15 ; isolate bit 15 of each product
    psllw   %1, 1  ; make room for that bit
    por     %1, %3 ; dst = bits 30..15 of each product
 %endmacro
 ; dst = ((dst * src) + (1<<14)) >> 15
 ; pmulhrsw performs the rounded multiply in a single instruction, so no
 ; temporary register is needed; the third argument only keeps the parameter
 ; list the same as MUL16FIXED_MMXEXT.
 %macro MUL16FIXED_SSSE3 3 ; dst, src, unused
    pmulhrsw   %1, %2
 %endmacro
 ; Generate one apply_window_int16_<%1> function.
 ;   %1 = suffix of the generated function name; the literal value "ssse3"
 ;        additionally makes the function load pb_revwords into m5
 ;   %2 = 1 for the bit-exact mmxext/sse2 version (32-bit intermediates with
 ;        rounding)
 ;   %3 = 1 to multiply with pmulhrsw
 ; REVERSE_WORDS (and MUL16FIXED for the non-rounding path) must be %defined
 ; before the macro is invoked.
 ;
 ; The len argument arrives in "offset". Samples are 2 bytes wide and the
 ; window holds len/2 coefficients, so len is at the same time the byte offset
 ; of the midpoint of input/output and the byte size of the window. offset2
 ; starts at the last register-sized chunk of the first half and walks down,
 ; while offset walks up through the second half; every iteration pairs one
 ; chunk of window coefficients with both of its mirror-image sample chunks.
 %macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
 cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
    lea     offset2q, [offsetq-mmsize]
 %if %2
    ; rounding constant added before the >> 15
    mova          m5, [pd_16384]
 %elifidn %1, ssse3
    ; shuffle mask consumed by REVERSE_WORDS_SSSE3
    mova          m5, [pb_revwords]
    ; note: the loop entry is only aligned in this (ssse3) variant
    ALIGN 16
 %endif
 .loop:
 %if %2
    ; This version expands 16-bit to 32-bit, multiplies by the window,
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
    ; save to the output. The window is reversed for the second half.
    ;
    ; Expansion trick: the window words are interleaved with zeros (m0/m2 are
    ; cleared first), and the input words are interleaved with whatever m1
    ; currently holds. pmaddwd then computes 0*stale + window*input for each
    ; dword, so the stale contents of m1 never affect the result.
    mova          m3, [windowq+offset2q]
    mova          m4, [ inputq+offset2q]
    pxor          m0, m0
    punpcklwd     m0, m3
    punpcklwd     m1, m4
    pmaddwd       m0, m1
    paddd         m0, m5
    psrad         m0, 15
    pxor          m2, m2
    punpckhwd     m2, m3
    punpckhwd     m1, m4
    pmaddwd       m2, m1
    paddd         m2, m5
    psrad         m2, 15
    ; repack the two dword halves to words (signed saturation)
    packssdw      m0, m2
    mova  [outputq+offset2q], m0
    ; same computation for the mirrored chunk, using the reversed window
    REVERSE_WORDS m3
    mova          m4, [ inputq+offsetq]
    pxor          m0, m0
    punpcklwd     m0, m3
    punpcklwd     m1, m4
    pmaddwd       m0, m1
    paddd         m0, m5
    psrad         m0, 15
    pxor          m2, m2
    punpckhwd     m2, m3
    punpckhwd     m1, m4
    pmaddwd       m2, m1
    paddd         m2, m5
    psrad         m2, 15
    packssdw      m0, m2
    mova  [outputq+offsetq], m0
 %elif %3
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The ssse3 version is bit-identical.
    mova          m0, [windowq+offset2q]
    mova          m1, [ inputq+offset2q]
    pmulhrsw      m1, m0
    ; m5 is only consumed when REVERSE_WORDS is the SSSE3 variant; the other
    ; variants ignore their second argument
    REVERSE_WORDS m0, m5
    pmulhrsw      m0, [ inputq+offsetq ]
    mova  [outputq+offset2q], m1
    mova  [outputq+offsetq ], m0
 %else
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
    ; therefore are not bit-identical to the C version.
    mova          m0, [windowq+offset2q]
    mova          m1, [ inputq+offset2q]
    mova          m2, [ inputq+offsetq ]
    MUL16FIXED    m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED    m2, m0, m3
    mova  [outputq+offset2q], m1
    mova  [outputq+offsetq ], m2
 %endif
    add      offsetd, mmsize
    sub     offset2d, mmsize
    ; keep looping until the subtraction borrows, i.e. offset2 went below zero
    jae .loop
    REP_RET
 %endmacro
 INIT_MMX
 %define REVERSE_WORDS REVERSE_WORDS_MMXEXT
 %define MUL16FIXED MUL16FIXED_MMXEXT
 APPLY_WINDOW_INT16 mmxext,     0, 0
 APPLY_WINDOW_INT16 mmxext_ba,  1, 0
 INIT_XMM
 %define REVERSE_WORDS REVERSE_WORDS_SSE2
 APPLY_WINDOW_INT16 sse2,       0, 0
 APPLY_WINDOW_INT16 sse2_ba,    1, 0
 APPLY_WINDOW_INT16 ssse3_atom, 0, 1
 %define REVERSE_WORDS REVERSE_WORDS_SSSE3
 APPLY_WINDOW_INT16 ssse3,      0, 1
 ; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
 cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
--- a/tests/ref/acodec/ac3_fixed
+++ b/tests/ref/acodec/ac3_fixed
@ -1,2 +1,2 @@
-b3a8f0a8809a58b2ece90744f06fff96 *./tests/data/acodec/ac3.rm
+346073c97eada69330f61e103a170ca1 *./tests/data/acodec/ac3.rm
 98751 ./tests/data/acodec/ac3.rm
--- a/tests/ref/lavf/rm
+++ b/tests/ref/lavf/rm
@ -1,2 +1,2 @@
-7da378131db880bcf2e58305d54418ec *./tests/data/lavf/lavf.rm
+7b7ede9548a09346675edad36acfbf19 *./tests/data/lavf/lavf.rm
 346706 ./tests/data/lavf/lavf.rm
--- a/tests/ref/seek/ac3_rm
+++ b/tests/ref/seek/ac3_rm
@ -1,45 +1,35 @@
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret: 0         st:-1 flags:0  ts:-1.000000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
-ret: 0         st:-1 flags:1  ts: 1.894167
+ret:-1         st:-1 flags:1  ts: 1.894167
-ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
+ret:-1         st: 0 flags:0  ts: 0.788000
 ret: 0         st: 0 flags:0  ts: 0.788000
 ret: 0         st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos:   5822 size:   916
 ret: 0         st: 0 flags:1  ts:-0.317000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
-ret: 0         st:-1 flags:0  ts: 2.576668
+ret:-1         st:-1 flags:0  ts: 2.576668
 ret: 0         st: 0 flags:1 dts:524.800000 pts:524.800000 pos:   6155 size:   244
 ret:-1         st:-1 flags:1  ts: 1.470835
-ret: 0         st: 0 flags:0  ts: 0.365000
+ret:-1         st: 0 flags:0  ts: 0.365000
 ret: 0         st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos:   5822 size:   916
 ret: 0         st: 0 flags:1  ts:-0.741000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret:-1         st:-1 flags:0  ts: 2.153336
-ret: 0         st:-1 flags:1  ts: 1.047503
+ret:-1         st:-1 flags:1  ts: 1.047503
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret: 0         st: 0 flags:0  ts:-0.058000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
-ret: 0         st: 0 flags:1  ts: 2.836000
+ret:-1         st: 0 flags:1  ts: 2.836000
 ret: 0         st: 0 flags:1 dts: 2.681000 pts: 2.681000 pos:  44105 size:   558
 ret:-1         st:-1 flags:0  ts: 1.730004
-ret: 0         st:-1 flags:1  ts: 0.624171
+ret:-1         st:-1 flags:1  ts: 0.624171
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret: 0         st: 0 flags:0  ts:-0.482000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret:-1         st: 0 flags:1  ts: 2.413000
 ret:-1         st:-1 flags:0  ts: 1.306672
-ret: 0         st:-1 flags:1  ts: 0.200839
+ret:-1         st:-1 flags:1  ts: 0.200839
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret: 0         st: 0 flags:0  ts:-0.905000
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret:-1         st: 0 flags:1  ts: 1.989000
-ret: 0         st:-1 flags:0  ts: 0.883340
+ret:-1         st:-1 flags:0  ts: 0.883340
 ret: 0         st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos:   5822 size:   916
 ret: 0         st:-1 flags:1  ts:-0.222493
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
 ret:-1         st: 0 flags:0  ts: 2.672000
 ret:-1         st: 0 flags:1  ts: 1.566000
-ret: 0         st:-1 flags:0  ts: 0.460008
+ret:-1         st:-1 flags:0  ts: 0.460008
 ret: 0         st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos:   5822 size:   916
 ret: 0         st:-1 flags:1  ts:-0.645825
 ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
`@ -1,2 +1,2 @@`
	`b3a8f0a8809a58b2ece90744f06fff96 *./tests/data/acodec/ac3.rm`	`346073c97eada69330f61e103a170ca1 *./tests/data/acodec/ac3.rm`
	`98751 ./tests/data/acodec/ac3.rm`	`98751 ./tests/data/acodec/ac3.rm`
`@ -1,2 +1,2 @@`
	`7da378131db880bcf2e58305d54418ec *./tests/data/lavf/lavf.rm`	`7b7ede9548a09346675edad36acfbf19 *./tests/data/lavf/lavf.rm`
	`346706 ./tests/data/lavf/lavf.rm`	`346706 ./tests/data/lavf/lavf.rm`