mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
x86/float_dsp: add ff_vector_dmul_{sse2,avx}
~3x to 5x faster.

Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
parent
93bf1dcaec
commit
9d002d7818
@ -58,6 +58,39 @@ INIT_YMM avx
|
||||
VECTOR_FMUL
|
||||
%endif
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro VECTOR_DMUL 0
cglobal vector_dmul, 4,4,4, dst, src0, src1, len
    ; Element-wise product of two double arrays:
    ;     dst[i] = src0[i] * src1[i]
    ; Every pass of the loop consumes four vector registers' worth of data
    ; (4*mmsize bytes) from each source, starting at the highest offset and
    ; stepping down towards offset 0. There is no tail handling.
    ; NOTE(review): this presumably relies on callers passing a positive len
    ; that is a multiple of 4*mmsize/8 elements, and buffers aligned for
    ; movaps -- confirm against the C-side contract.

    ; Convert the element count (8 bytes per double) into the byte offset of
    ; the last 4*mmsize-sized chunk.
    lea       lend, [lenq*8 - mmsize*4]
ALIGN 16
.loop:
    ; Fetch each src0 vector and multiply it by the matching src1 vector.
    ; All memory reads are issued before the first store below.
    movaps    m0, [src0q + lenq + 0*mmsize]
    mulpd     m0, m0, [src1q + lenq + 0*mmsize]
    movaps    m1, [src0q + lenq + 1*mmsize]
    mulpd     m1, m1, [src1q + lenq + 1*mmsize]
    movaps    m2, [src0q + lenq + 2*mmsize]
    mulpd     m2, m2, [src1q + lenq + 2*mmsize]
    movaps    m3, [src0q + lenq + 3*mmsize]
    mulpd     m3, m3, [src1q + lenq + 3*mmsize]

    ; Write the four products back to the destination.
    movaps    [dstq + lenq + 0*mmsize], m0
    movaps    [dstq + lenq + 1*mmsize], m1
    movaps    [dstq + lenq + 2*mmsize], m2
    movaps    [dstq + lenq + 3*mmsize], m3

    ; Move down by one chunk; iterate again while the offset is still >= 0.
    sub       lenq, mmsize*4
    jge       .loop
    RET
%endmacro

; XMM-register build of the macro, tagged sse2.
INIT_XMM sse2
VECTOR_DMUL

; YMM-register build of the macro, tagged avx; only assembled when
; HAVE_AVX_EXTERNAL is set.
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL
%endif
|
||||
|
||||
;------------------------------------------------------------------------------
|
||||
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
|
||||
;------------------------------------------------------------------------------
|
||||
|
@ -29,6 +29,11 @@ void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
|
||||
void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
|
||||
int len);
|
||||
|
||||
void ff_vector_dmul_sse2(double *dst, const double *src0, const double *src1,
|
||||
int len);
|
||||
void ff_vector_dmul_avx(double *dst, const double *src0, const double *src1,
|
||||
int len);
|
||||
|
||||
void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
|
||||
int len);
|
||||
void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
|
||||
@ -92,11 +97,13 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
|
||||
fdsp->butterflies_float = ff_butterflies_float_sse;
|
||||
}
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
fdsp->vector_dmul = ff_vector_dmul_sse2;
|
||||
fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_sse2;
|
||||
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
|
||||
}
|
||||
if (EXTERNAL_AVX_FAST(cpu_flags)) {
|
||||
fdsp->vector_fmul = ff_vector_fmul_avx;
|
||||
fdsp->vector_dmul = ff_vector_dmul_avx;
|
||||
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
|
||||
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;
|
||||
fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx;
|
||||
|
Loading…
x
Reference in New Issue
Block a user