x86/swr: add ff_resample_{common, linear}_float_{fma3, fma4}

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
2025-08-10 06:10:52 +02:00 · 2014-06-30 13:06:01 -03:00
parent a441a2437b
commit 1a69224f44
2 changed files with 51 additions and 35 deletions
--- a/libswresample/x86/resample.asm
+++ b/libswresample/x86/resample.asm
@@ -179,17 +179,16 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
    pmaddwd                       m1, [filterq+min_filter_count_x4q*1]
    paddd                         m0, m1
 %else ; float/double
+%if cpuflag(fma4) || cpuflag(fma3)
+    fmaddp%4                      m0, m1, [filterq+min_filter_count_x4q*1], m0
+%else
    mulp%4                        m1, m1, [filterq+min_filter_count_x4q*1]
    addp%4                        m0, m0, m1
+%endif ; cpuflag
 %endif
    add         min_filter_count_x4q, mmsize
    js .inner_loop

-%if cpuflag(avx)
-    vextractf128                 xm1, m0, 0x1
-    addps                        xm0, xm1
-%endif
-
 %ifidn %1, int16
 %if mmsize == 16
    pshufd                        m1, m0, q0032
@@ -206,6 +205,10 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
    movd                      [dstq], m0
 %else ; float/double
    ; horizontal sum & store
+%if mmsize == 32
+    vextractf128                 xm1, m0, 0x1
+    addps                        xm0, xm1
+%endif
    movhlps                      xm1, xm0
 %ifidn %1, float
    addps                        xm0, xm1
@@ -429,21 +432,19 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
    paddd                         m2, m3
    paddd                         m0, m1
 %else ; float/double
+%if cpuflag(fma4) || cpuflag(fma3)
+    fmaddp%4                      m2, m1, [filter2q+min_filter_count_x4q*1], m2
+    fmaddp%4                      m0, m1, [filter1q+min_filter_count_x4q*1], m0
+%else
    mulp%4                        m3, m1, [filter2q+min_filter_count_x4q*1]
    mulp%4                        m1, m1, [filter1q+min_filter_count_x4q*1]
    addp%4                        m2, m2, m3
    addp%4                        m0, m0, m1
+%endif ; cpuflag
 %endif
    add         min_filter_count_x4q, mmsize
    js .inner_loop

-%if cpuflag(avx)
-    vextractf128                 xm1, m0, 0x1
-    vextractf128                 xm3, m2, 0x1
-    addps                        xm0, xm1
-    addps                        xm2, xm3
-%endif
-
 %ifidn %1, int16
 %if mmsize == 16
    pshufd                        m3, m2, q0032
@@ -479,12 +480,22 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
    ; - unix64: eax=r6[filter1], edx=r2[todo]
 %else ; float/double
    ; val += (v2 - val) * (FELEML) frac / c->src_incr;
+%if mmsize == 32
+    vextractf128                 xm1, m0, 0x1
+    vextractf128                 xm3, m2, 0x1
+    addps                        xm0, xm1
+    addps                        xm2, xm3
+%endif
    cvtsi2s%4                    xm1, fracd
    subp%4                       xm2, xm0
    mulp%4                       xm1, xm4
    shufp%4                      xm1, xm1, q0000
+%if cpuflag(fma4) || cpuflag(fma3)
+    fmaddp%4                     xm0, xm2, xm1, xm0
+%else
    mulp%4                       xm2, xm1
    addp%4                       xm0, xm2
+%endif ; cpuflag

    ; horizontal sum & store
    movhlps                      xm1, xm0
@@ -564,6 +575,14 @@ RESAMPLE_FNS float, 4, 2, s, pf_1
 INIT_YMM avx
 RESAMPLE_FNS float, 4, 2, s, pf_1
 %endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+RESAMPLE_FNS float, 4, 2, s, pf_1
+%endif
+%if HAVE_FMA4_EXTERNAL
+INIT_XMM fma4
+RESAMPLE_FNS float, 4, 2, s, pf_1
+%endif

 %if ARCH_X86_32
 INIT_MMX mmxext
--- a/libswresample/x86/resample_x86_dsp.c
+++ b/libswresample/x86/resample_x86_dsp.c
@@ -27,30 +27,19 @@

 #include "libswresample/resample.h"

-int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst,
-                                    const uint8_t *src, int sz, int upd);
-int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst,
-                                    const uint8_t *src, int sz, int upd);
+#define RESAMPLE_FUNCS(type, opt) \
+int ff_resample_common_##type##_##opt(ResampleContext *c, uint8_t *dst, \
+                                      const uint8_t *src, int sz, int upd); \
+int ff_resample_linear_##type##_##opt(ResampleContext *c, uint8_t *dst, \
+                                      const uint8_t *src, int sz, int upd)

-int ff_resample_common_int16_sse2(ResampleContext *c, uint8_t *dst,
-                                  const uint8_t *src, int sz, int upd);
-int ff_resample_linear_int16_sse2(ResampleContext *c, uint8_t *dst,
-                                  const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_float_sse(ResampleContext *c, uint8_t *dst,
-                                 const uint8_t *src, int sz, int upd);
-int ff_resample_linear_float_sse(ResampleContext *c, uint8_t *dst,
-                                 const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst,
-                                 const uint8_t *src, int sz, int upd);
-int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst,
-                                 const uint8_t *src, int sz, int upd);
-
-int ff_resample_common_double_sse2(ResampleContext *c, uint8_t *dst,
-                                   const uint8_t *src, int sz, int upd);
-int ff_resample_linear_double_sse2(ResampleContext *c, uint8_t *dst,
-                                   const uint8_t *src, int sz, int upd);
+RESAMPLE_FUNCS(int16,  mmxext);
+RESAMPLE_FUNCS(int16,  sse2);
+RESAMPLE_FUNCS(float,  sse);
+RESAMPLE_FUNCS(float,  avx);
+RESAMPLE_FUNCS(float,  fma3);
+RESAMPLE_FUNCS(float,  fma4);
+RESAMPLE_FUNCS(double, sse2);

 void swresample_dsp_x86_init(ResampleContext *c)
 {
@@ -76,4 +65,12 @@ void swresample_dsp_x86_init(ResampleContext *c)
        c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;
        c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_avx;
    }
+    if (HAVE_FMA3_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA3) {
+        c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma3;
+        c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma3;
+    }
+    if (HAVE_FMA4_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA4) {
+        c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma4;
+        c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma4;
+    }
 }