Mirror of https://github.com/FFmpeg/FFmpeg.git (synced 2024-11-26 19:01:44 +02:00)
x86/swr: add ff_resample_{common, linear}_float_fma
Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
a441a2437b
commit
1a69224f44
@@ -179,17 +179,16 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
|
|||||||
pmaddwd m1, [filterq+min_filter_count_x4q*1]
|
pmaddwd m1, [filterq+min_filter_count_x4q*1]
|
||||||
paddd m0, m1
|
paddd m0, m1
|
||||||
%else ; float/double
|
%else ; float/double
|
||||||
|
%if cpuflag(fma4) || cpuflag(fma3)
|
||||||
|
fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0
|
||||||
|
%else
|
||||||
mulp%4 m1, m1, [filterq+min_filter_count_x4q*1]
|
mulp%4 m1, m1, [filterq+min_filter_count_x4q*1]
|
||||||
addp%4 m0, m0, m1
|
addp%4 m0, m0, m1
|
||||||
|
%endif ; cpuflag
|
||||||
%endif
|
%endif
|
||||||
add min_filter_count_x4q, mmsize
|
add min_filter_count_x4q, mmsize
|
||||||
js .inner_loop
|
js .inner_loop
|
||||||
|
|
||||||
%if cpuflag(avx)
|
|
||||||
vextractf128 xm1, m0, 0x1
|
|
||||||
addps xm0, xm1
|
|
||||||
%endif
|
|
||||||
|
|
||||||
%ifidn %1, int16
|
%ifidn %1, int16
|
||||||
%if mmsize == 16
|
%if mmsize == 16
|
||||||
pshufd m1, m0, q0032
|
pshufd m1, m0, q0032
|
||||||
@@ -206,6 +205,10 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
|
|||||||
movd [dstq], m0
|
movd [dstq], m0
|
||||||
%else ; float/double
|
%else ; float/double
|
||||||
; horizontal sum & store
|
; horizontal sum & store
|
||||||
|
%if mmsize == 32
|
||||||
|
vextractf128 xm1, m0, 0x1
|
||||||
|
addps xm0, xm1
|
||||||
|
%endif
|
||||||
movhlps xm1, xm0
|
movhlps xm1, xm0
|
||||||
%ifidn %1, float
|
%ifidn %1, float
|
||||||
addps xm0, xm1
|
addps xm0, xm1
|
||||||
@@ -429,21 +432,19 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
|
|||||||
paddd m2, m3
|
paddd m2, m3
|
||||||
paddd m0, m1
|
paddd m0, m1
|
||||||
%else ; float/double
|
%else ; float/double
|
||||||
|
%if cpuflag(fma4) || cpuflag(fma3)
|
||||||
|
fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2
|
||||||
|
fmaddp%4 m0, m1, [filter1q+min_filter_count_x4q*1], m0
|
||||||
|
%else
|
||||||
mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1]
|
mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1]
|
||||||
mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1]
|
mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1]
|
||||||
addp%4 m2, m2, m3
|
addp%4 m2, m2, m3
|
||||||
addp%4 m0, m0, m1
|
addp%4 m0, m0, m1
|
||||||
|
%endif ; cpuflag
|
||||||
%endif
|
%endif
|
||||||
add min_filter_count_x4q, mmsize
|
add min_filter_count_x4q, mmsize
|
||||||
js .inner_loop
|
js .inner_loop
|
||||||
|
|
||||||
%if cpuflag(avx)
|
|
||||||
vextractf128 xm1, m0, 0x1
|
|
||||||
vextractf128 xm3, m2, 0x1
|
|
||||||
addps xm0, xm1
|
|
||||||
addps xm2, xm3
|
|
||||||
%endif
|
|
||||||
|
|
||||||
%ifidn %1, int16
|
%ifidn %1, int16
|
||||||
%if mmsize == 16
|
%if mmsize == 16
|
||||||
pshufd m3, m2, q0032
|
pshufd m3, m2, q0032
|
||||||
@@ -479,12 +480,22 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
|
|||||||
; - unix64: eax=r6[filter1], edx=r2[todo]
|
; - unix64: eax=r6[filter1], edx=r2[todo]
|
||||||
%else ; float/double
|
%else ; float/double
|
||||||
; val += (v2 - val) * (FELEML) frac / c->src_incr;
|
; val += (v2 - val) * (FELEML) frac / c->src_incr;
|
||||||
|
%if mmsize == 32
|
||||||
|
vextractf128 xm1, m0, 0x1
|
||||||
|
vextractf128 xm3, m2, 0x1
|
||||||
|
addps xm0, xm1
|
||||||
|
addps xm2, xm3
|
||||||
|
%endif
|
||||||
cvtsi2s%4 xm1, fracd
|
cvtsi2s%4 xm1, fracd
|
||||||
subp%4 xm2, xm0
|
subp%4 xm2, xm0
|
||||||
mulp%4 xm1, xm4
|
mulp%4 xm1, xm4
|
||||||
shufp%4 xm1, xm1, q0000
|
shufp%4 xm1, xm1, q0000
|
||||||
|
%if cpuflag(fma4) || cpuflag(fma3)
|
||||||
|
fmaddp%4 xm0, xm2, xm1, xm0
|
||||||
|
%else
|
||||||
mulp%4 xm2, xm1
|
mulp%4 xm2, xm1
|
||||||
addp%4 xm0, xm2
|
addp%4 xm0, xm2
|
||||||
|
%endif ; cpuflag
|
||||||
|
|
||||||
; horizontal sum & store
|
; horizontal sum & store
|
||||||
movhlps xm1, xm0
|
movhlps xm1, xm0
|
||||||
@@ -564,6 +575,14 @@ RESAMPLE_FNS float, 4, 2, s, pf_1
|
|||||||
INIT_YMM avx
|
INIT_YMM avx
|
||||||
RESAMPLE_FNS float, 4, 2, s, pf_1
|
RESAMPLE_FNS float, 4, 2, s, pf_1
|
||||||
%endif
|
%endif
|
||||||
|
%if HAVE_FMA3_EXTERNAL
|
||||||
|
INIT_YMM fma3
|
||||||
|
RESAMPLE_FNS float, 4, 2, s, pf_1
|
||||||
|
%endif
|
||||||
|
%if HAVE_FMA4_EXTERNAL
|
||||||
|
INIT_XMM fma4
|
||||||
|
RESAMPLE_FNS float, 4, 2, s, pf_1
|
||||||
|
%endif
|
||||||
|
|
||||||
%if ARCH_X86_32
|
%if ARCH_X86_32
|
||||||
INIT_MMX mmxext
|
INIT_MMX mmxext
|
||||||
|
@@ -27,30 +27,19 @@
|
|||||||
|
|
||||||
#include "libswresample/resample.h"
|
#include "libswresample/resample.h"
|
||||||
|
|
||||||
int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst,
|
#define RESAMPLE_FUNCS(type, opt) \
|
||||||
const uint8_t *src, int sz, int upd);
|
int ff_resample_common_##type##_##opt(ResampleContext *c, uint8_t *dst, \
|
||||||
int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst,
|
const uint8_t *src, int sz, int upd); \
|
||||||
const uint8_t *src, int sz, int upd);
|
int ff_resample_linear_##type##_##opt(ResampleContext *c, uint8_t *dst, \
|
||||||
|
const uint8_t *src, int sz, int upd)
|
||||||
|
|
||||||
int ff_resample_common_int16_sse2(ResampleContext *c, uint8_t *dst,
|
RESAMPLE_FUNCS(int16, mmxext);
|
||||||
const uint8_t *src, int sz, int upd);
|
RESAMPLE_FUNCS(int16, sse2);
|
||||||
int ff_resample_linear_int16_sse2(ResampleContext *c, uint8_t *dst,
|
RESAMPLE_FUNCS(float, sse);
|
||||||
const uint8_t *src, int sz, int upd);
|
RESAMPLE_FUNCS(float, avx);
|
||||||
|
RESAMPLE_FUNCS(float, fma3);
|
||||||
int ff_resample_common_float_sse(ResampleContext *c, uint8_t *dst,
|
RESAMPLE_FUNCS(float, fma4);
|
||||||
const uint8_t *src, int sz, int upd);
|
RESAMPLE_FUNCS(double, sse2);
|
||||||
int ff_resample_linear_float_sse(ResampleContext *c, uint8_t *dst,
|
|
||||||
const uint8_t *src, int sz, int upd);
|
|
||||||
|
|
||||||
int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst,
|
|
||||||
const uint8_t *src, int sz, int upd);
|
|
||||||
int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst,
|
|
||||||
const uint8_t *src, int sz, int upd);
|
|
||||||
|
|
||||||
int ff_resample_common_double_sse2(ResampleContext *c, uint8_t *dst,
|
|
||||||
const uint8_t *src, int sz, int upd);
|
|
||||||
int ff_resample_linear_double_sse2(ResampleContext *c, uint8_t *dst,
|
|
||||||
const uint8_t *src, int sz, int upd);
|
|
||||||
|
|
||||||
void swresample_dsp_x86_init(ResampleContext *c)
|
void swresample_dsp_x86_init(ResampleContext *c)
|
||||||
{
|
{
|
||||||
@ -76,4 +65,12 @@ void swresample_dsp_x86_init(ResampleContext *c)
|
|||||||
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;
|
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;
|
||||||
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_avx;
|
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_avx;
|
||||||
}
|
}
|
||||||
|
if (HAVE_FMA3_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA3) {
|
||||||
|
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma3;
|
||||||
|
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma3;
|
||||||
|
}
|
||||||
|
if (HAVE_FMA4_EXTERNAL && mm_flags & AV_CPU_FLAG_FMA4) {
|
||||||
|
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_fma4;
|
||||||
|
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_fma4;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user