mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
x86/aacpsdsp: add ps_hybrid_analysis_fma3
This replaces the sse3 version, which was not really faster than the sse one. Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
parent
2bcf86d53d
commit
48615f0a78
@ -403,10 +403,8 @@ HYBRID_SYNTHESIS_DEINT
|
||||
%macro PS_HYBRID_ANALYSIS_IN 1
|
||||
movu m0, [inq+mmsize*%1]
|
||||
movu m1, [inq+mmsize*(5-%1)+8]
|
||||
mova m3, m0
|
||||
mova m4, m1
|
||||
shufps m3, m3, q2301
|
||||
shufps m4, m4, q0123
|
||||
shufps m3, m0, m0, q2301
|
||||
shufps m4, m1, m1, q0123
|
||||
shufps m1, m1, q1032
|
||||
%if cpuflag(sse3)
|
||||
addsubps m3, m4
|
||||
@ -424,6 +422,15 @@ HYBRID_SYNTHESIS_DEINT
|
||||
%macro PS_HYBRID_ANALYSIS_LOOP 3
|
||||
mova m2, [filterq+nq+mmsize*%3]
|
||||
shufps m2, m2, q2301
|
||||
%if cpuflag(fma3)
|
||||
%if %3
|
||||
fmaddps m3, m2, [rsp+mmsize*%3*2], m3
|
||||
fmaddps m0, m2, [rsp+mmsize+mmsize*%3*2], m0
|
||||
%else
|
||||
mulps m3, m2, [rsp]
|
||||
mulps m0, m2, [rsp+mmsize]
|
||||
%endif
|
||||
%else ; cpuflag(sse)
|
||||
mova %2, [rsp+mmsize*%3*2]
|
||||
mova %1, [rsp+mmsize+mmsize*%3*2]
|
||||
mulps %2, m2
|
||||
@ -432,20 +439,21 @@ HYBRID_SYNTHESIS_DEINT
|
||||
addps m3, %2
|
||||
addps m0, %1
|
||||
%endif
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro PS_HYBRID_ANALYSIS 0
|
||||
cglobal ps_hybrid_analysis, 5, 5, 8, 24 * 4, out, in, filter, stride, n
|
||||
cglobal ps_hybrid_analysis, 5, 5, 5 + notcpuflag(fma3) * 3, 24 * 4, out, in, filter, stride, n
|
||||
%if cpuflag(sse3)
|
||||
%define MOVH movsd
|
||||
%else
|
||||
%define MOVH movlps
|
||||
mova m7, [ps_p1m1p1m1]
|
||||
%endif
|
||||
shl strideq, 3
|
||||
shl nd, 6
|
||||
add filterq, nq
|
||||
neg nq
|
||||
mova m7, [ps_p1m1p1m1]
|
||||
PS_HYBRID_ANALYSIS_IN 0
|
||||
PS_HYBRID_ANALYSIS_IN 1
|
||||
PS_HYBRID_ANALYSIS_IN 2
|
||||
@ -456,30 +464,22 @@ align 16
|
||||
PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
|
||||
PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
|
||||
|
||||
%if cpuflag(sse3)
|
||||
pshufd m3, m3, q2301
|
||||
xorps m0, m7
|
||||
hsubps m3, m0
|
||||
pshufd m1, m3, q0020
|
||||
pshufd m3, m3, q0031
|
||||
addps m1, m3
|
||||
movsd m2, [inq+6*8]
|
||||
%else
|
||||
mova m1, m3
|
||||
mova m2, m0
|
||||
shufps m1, m1, q2301
|
||||
shufps m2, m2, q2301
|
||||
shufps m1, m3, m3, q2301
|
||||
shufps m2, m0, m0, q2301
|
||||
subps m1, m3
|
||||
addps m2, m0
|
||||
unpcklps m3, m1, m2
|
||||
unpckhps m1, m2
|
||||
addps m1, m3
|
||||
movu m2, [inq+6*8] ; faster than movlps and no risk of overread
|
||||
%endif
|
||||
movss m3, [filterq+nq+8*6]
|
||||
SPLATD m3
|
||||
%if cpuflag(fma3)
|
||||
fmaddps m1, m2, m3, m1
|
||||
%else
|
||||
mulps m2, m3
|
||||
addps m1, m2
|
||||
%endif
|
||||
MOVH [outq], m1
|
||||
add outq, strideq
|
||||
add nq, 64
|
||||
@ -489,5 +489,5 @@ align 16
|
||||
|
||||
INIT_XMM sse
|
||||
PS_HYBRID_ANALYSIS
|
||||
INIT_XMM sse3
|
||||
INIT_XMM fma3
|
||||
PS_HYBRID_ANALYSIS
|
||||
|
@ -33,7 +33,7 @@ void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
|
||||
void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
|
||||
const float (*filter)[8][2],
|
||||
ptrdiff_t stride, int n);
|
||||
void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
|
||||
void ff_ps_hybrid_analysis_fma3(float (*out)[2], float (*in)[2],
|
||||
const float (*filter)[8][2],
|
||||
ptrdiff_t stride, int n);
|
||||
void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
|
||||
@ -64,9 +64,11 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
|
||||
s->add_squares = ff_ps_add_squares_sse3;
|
||||
s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
|
||||
s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
|
||||
s->hybrid_analysis = ff_ps_hybrid_analysis_sse3;
|
||||
}
|
||||
if (EXTERNAL_SSE4(cpu_flags)) {
|
||||
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4;
|
||||
}
|
||||
if (EXTERNAL_FMA3(cpu_flags)) {
|
||||
s->hybrid_analysis = ff_ps_hybrid_analysis_fma3;
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user