mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
x86: lpc: simd av_evaluate_lls
1.5x-1.8x faster on Sandy Bridge.

Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
This commit is contained in:
parent
502ab21af0
commit
b545179fdf
@ -194,3 +194,41 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
|
|||||||
jle .loop2x1
|
jle .loop2x1
|
||||||
.ret:
|
.ret:
|
||||||
REP_RET
|
REP_RET
|
||||||
|
|
||||||
|
|
||||||
|
INIT_XMM sse2
;-----------------------------------------------------------------------------
; double ff_evaluate_lls_sse2(LLSModel *m, double *var, int order)
;
; Returns the dot product of var[0..order] with the coefficient row selected
; by `order` (order + 1 terms in total).
;
; In:    ctx   = model pointer (reused as the coefficient row pointer)
;        var   = input vector, read with 8-byte loads (no alignment assumed)
;        order = row index / index of the last term
; Out:   xmm0 on x86-64, x87 st0 on x86-32
; Clobb: m0, m1, i, flags; the ctx/var/order registers are overwritten
;
; The vector loop is do-while shaped, so terms 0..3 are always accumulated:
; the result is only correct for order >= 3.
;
; All three arguments must be declared to cglobal: on x86-32 only the declared
; arguments are loaded from the stack, and `order` is read right away.
;-----------------------------------------------------------------------------
cglobal evaluate_lls, 3,4,2, ctx, var, order, i
    ; This function is often called on the same buffer as update_lls, but with
    ; an offset. They can't both be aligned.
    ; Load halves rather than movu to avoid store-forwarding stalls, since the
    ; input was initialized immediately prior to this function using scalar math.
    %define coefsq ctxq
    mov     id, orderd                  ; i = order (32-bit write zero-extends)
    imul    orderd, MAX_VARS            ; row offset in elements
    ; coefs = &ctx->coeff + order*MAX_VARS doubles
    ; NOTE(review): presumably coeff is a [..][MAX_VARS] array of doubles - confirm
    lea     coefsq, [ctxq + LLSModel.coeff + orderq*8]

    ; terms 0 and 1 seed the two-lane accumulator
    movsd   m0, [varq]
    movhpd  m0, [varq + 8]
    mulpd   m0, [coefsq]

    ; point both bases at element `order` and count i upwards from 2 - order,
    ; so [base + i*8] walks elements 2, 4, ... and the flags left by `add`
    ; tell how many terms remain
    lea     coefsq, [coefsq + iq*8]
    lea     varq, [varq + iq*8]
    neg     iq
    add     iq, 2
.loop:
    movsd   m1, [varq + iq*8]
    movhpd  m1, [varq + iq*8 + 8]
    mulpd   m1, [coefsq + iq*8]
    addpd   m0, m1
    add     iq, 2
    jl .loop                            ; i <  0: at least two terms left
    jg .skip1                           ; i >  0: order was odd, all terms done
    ; i == 0: exactly one term (element `order`) left
    movsd   m1, [varq + iq*8]           ; load from memory zeroes the high lane
    mulsd   m1, [coefsq + iq*8]
    addpd   m0, m1                      ; high lane adds 0.0
.skip1:
    ; horizontal add of the two lanes into the low lane
    movhlps m1, m0
    addsd   m0, m1
%if ARCH_X86_32
    ; 32-bit return value goes in st0: bounce through the argument stack
    ; slots (r0m, 8 bytes), which are no longer needed
    movsd  r0m, m0
    fld   qword r0m
%endif
    RET
|
||||||
|
@ -25,12 +25,15 @@
|
|||||||
|
|
||||||
void ff_update_lls_sse2(LLSModel *m, double *var);
|
void ff_update_lls_sse2(LLSModel *m, double *var);
|
||||||
void ff_update_lls_avx(LLSModel *m, double *var);
|
void ff_update_lls_avx(LLSModel *m, double *var);
|
||||||
|
double ff_evaluate_lls_sse2(LLSModel *m, double *var, int order);
|
||||||
|
|
||||||
av_cold void ff_init_lls_x86(LLSModel *m)
|
av_cold void ff_init_lls_x86(LLSModel *m)
|
||||||
{
|
{
|
||||||
int cpu_flags = av_get_cpu_flags();
|
int cpu_flags = av_get_cpu_flags();
|
||||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||||
m->update_lls = ff_update_lls_sse2;
|
m->update_lls = ff_update_lls_sse2;
|
||||||
|
if (m->indep_count >= 4)
|
||||||
|
m->evaluate_lls = ff_evaluate_lls_sse2;
|
||||||
}
|
}
|
||||||
if (EXTERNAL_AVX(cpu_flags)) {
|
if (EXTERNAL_AVX(cpu_flags)) {
|
||||||
m->update_lls = ff_update_lls_avx;
|
m->update_lls = ff_update_lls_avx;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user