mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
SIMD opus pvq_search implementation
Explanation on the workings and methods used by the Pyramid Vector Quantization Search function could be found in the following Work-In-Progress mail threads: http://ffmpeg.org/pipermail/ffmpeg-devel/2017-June/212146.html http://ffmpeg.org/pipermail/ffmpeg-devel/2017-June/212816.html http://ffmpeg.org/pipermail/ffmpeg-devel/2017-July/213030.html http://ffmpeg.org/pipermail/ffmpeg-devel/2017-July/213436.html Signed-off-by: Ivan Kalvachev <ikalvachev@gmail.com>
This commit is contained in:
parent
30ae07d7ef
commit
7205513f8f
@ -947,6 +947,9 @@ int av_cold ff_celt_pvq_init(CeltPVQ **pvq)
|
||||
s->encode_band = pvq_encode_band;
|
||||
s->band_cost = pvq_band_cost;
|
||||
|
||||
if (ARCH_X86)
|
||||
ff_opus_dsp_init_x86(s);
|
||||
|
||||
*pvq = s;
|
||||
|
||||
return 0;
|
||||
|
@ -33,8 +33,8 @@
|
||||
float *lowband_scratch, int fill)
|
||||
|
||||
struct CeltPVQ {
|
||||
DECLARE_ALIGNED(32, int, qcoeff )[176];
|
||||
DECLARE_ALIGNED(32, float, hadamard_tmp)[176];
|
||||
DECLARE_ALIGNED(32, int, qcoeff )[256];
|
||||
DECLARE_ALIGNED(32, float, hadamard_tmp)[256];
|
||||
|
||||
float (*pvq_search)(float *X, int *y, int K, int N);
|
||||
|
||||
@ -45,6 +45,7 @@ struct CeltPVQ {
|
||||
};
|
||||
|
||||
int ff_celt_pvq_init (struct CeltPVQ **pvq);
|
||||
void ff_opus_dsp_init_x86(struct CeltPVQ *s);
|
||||
void ff_celt_pvq_uninit(struct CeltPVQ **pvq);
|
||||
|
||||
#endif /* AVCODEC_OPUS_PVQ_H */
|
||||
|
@ -52,6 +52,8 @@ OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
|
||||
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
|
||||
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
|
||||
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
|
||||
OBJS-$(CONFIG_OPUS_DECODER) += x86/opus_dsp_init.o
|
||||
OBJS-$(CONFIG_OPUS_ENCODER) += x86/opus_dsp_init.o
|
||||
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
|
||||
OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o
|
||||
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o
|
||||
@ -123,6 +125,7 @@ X86ASM-OBJS-$(CONFIG_MDCT15) += x86/mdct15.o
|
||||
X86ASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o
|
||||
X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
|
||||
X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_OPUS_ENCODER) += x86/opus_pvq_search.o
|
||||
X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp.o
|
||||
X86ASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \
|
||||
x86/fpel.o \
|
||||
|
45
libavcodec/x86/opus_dsp_init.c
Normal file
45
libavcodec/x86/opus_dsp_init.c
Normal file
@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Opus encoder assembly optimizations
|
||||
* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavcodec/opus_pvq.h"
|
||||
|
||||
extern float ff_pvq_search_sse2(float *X, int *y, int K, int N);
|
||||
extern float ff_pvq_search_sse4(float *X, int *y, int K, int N);
|
||||
extern float ff_pvq_search_avx (float *X, int *y, int K, int N);
|
||||
|
||||
av_cold void ff_opus_dsp_init_x86(CeltPVQ *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
#if CONFIG_OPUS_ENCODER
|
||||
if (EXTERNAL_SSE2(cpu_flags))
|
||||
s->pvq_search = ff_pvq_search_sse2;
|
||||
|
||||
if (EXTERNAL_SSE4(cpu_flags))
|
||||
s->pvq_search = ff_pvq_search_sse4;
|
||||
|
||||
if (EXTERNAL_AVX(cpu_flags))
|
||||
s->pvq_search = ff_pvq_search_avx;
|
||||
#endif
|
||||
}
|
395
libavcodec/x86/opus_pvq_search.asm
Normal file
395
libavcodec/x86/opus_pvq_search.asm
Normal file
@ -0,0 +1,395 @@
|
||||
;******************************************************************************
|
||||
;* SIMD optimized Opus encoder DSP function
|
||||
;*
|
||||
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "config.asm"
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
%ifdef __NASM_VER__
|
||||
%use "smartalign"
|
||||
ALIGNMODE p6
|
||||
%endif
|
||||
|
||||
; Use float op that give half precision but execute for around 3 cycles.
|
||||
; On Skylake & Ryzen the division is much faster (around 11c/3),
|
||||
; that makes the full precision code about 2% slower.
|
||||
; Opus also does use rsqrt approximation in their intrinsics code.
|
||||
%define USE_APPROXIMATION 1
|
||||
|
||||
; Presearch tries to quantize by using the property Sum( abs(x[i]*K)/Sx ) = K.
|
||||
; If we use truncation for the division result then the integer Sum would be <= K.
|
||||
; By using nearest rounding we get closer approximation, but
|
||||
; we could also get more than K pulses and we have to remove the extra ones.
|
||||
; This method is the main improvement of the ffmpeg C function over the Opus original.
|
||||
%define PRESEARCH_ROUNDING 1
|
||||
|
||||
SECTION_RODATA 64
|
||||
|
||||
const_float_abs_mask: times 8 dd 0x7fffffff
|
||||
const_align_abs_edge: times 8 dd 0
|
||||
|
||||
const_float_0_5: times 8 dd 0.5
|
||||
const_float_1: times 8 dd 1.0
|
||||
const_float_sign_mask: times 8 dd 0x80000000
|
||||
|
||||
const_int32_offsets:
|
||||
%rep 8
|
||||
dd $-const_int32_offsets
|
||||
%endrep
|
||||
SECTION .text
|
||||
|
||||
;
|
||||
; Setup High Register to be used
|
||||
; for holding memory constants
|
||||
;
|
||||
; %1 - the register to be used, assmues it is >= mm8
|
||||
; %2 - name of the constant.
|
||||
;
|
||||
; Subsequent opcodes are going to use the constant in the form
|
||||
; "addps m0, mm_const_name" and it would be turned into:
|
||||
; "addps m0, [const_name]" on 32 bit arch or
|
||||
; "addps m0, m8" on 64 bit arch
|
||||
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
|
||||
%if num_mmregs > 8
|
||||
%define mm_%3 %2
|
||||
%{1} %2, [%3] ; movaps m8, [const_name]
|
||||
%else
|
||||
%define mm_%3 [%3]
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
;
|
||||
; Set Position Independent Code
|
||||
; Base address of a constant
|
||||
; %1 - the register to be used, if PIC is set
|
||||
; %2 - name of the constant.
|
||||
;
|
||||
; Subsequent opcode are going to use the base address in the form
|
||||
; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
|
||||
; "movaps m0, [r5 + r4]" if PIC is enabled
|
||||
; "movaps m0, [constant_name + r4]" if texrel are used
|
||||
%macro SET_PIC_BASE 3; reg, const_label
|
||||
%ifdef PIC
|
||||
%{1} %2, [%3] ; lea r5, [rip+const]
|
||||
%define pic_base_%3 %2
|
||||
%else
|
||||
%define pic_base_%3 %3
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro PULSES_SEARCH 1
|
||||
; m6 Syy_norm
|
||||
; m7 Sxy_norm
|
||||
addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
|
||||
pxor m1, m1 ; max_idx
|
||||
xorps m3, m3 ; p_max
|
||||
xor r4d, r4d
|
||||
align 16
|
||||
%%distortion_search:
|
||||
movd xm2, dword r4d ; movd zero extends
|
||||
%ifidn %1,add
|
||||
movaps m4, [tmpY + r4] ; y[i]
|
||||
movaps m5, [tmpX + r4] ; X[i]
|
||||
|
||||
%if USE_APPROXIMATION == 1
|
||||
xorps m0, m0
|
||||
cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
|
||||
%endif
|
||||
|
||||
addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
|
||||
addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
|
||||
|
||||
%if USE_APPROXIMATION == 1
|
||||
andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
|
||||
%endif
|
||||
|
||||
%else
|
||||
movaps m5, [tmpY + r4] ; m5 = y[i]
|
||||
|
||||
xorps m0, m0 ; m0 = 0;
|
||||
cmpps m0, m0, m5, 1 ; m0 = (0<y)
|
||||
|
||||
subps m4, m6, m5 ; m4 = Syy_new = Syy_norm - y[i]
|
||||
subps m5, m7, [tmpX + r4] ; m5 = Sxy_new = Sxy_norm - X[i]
|
||||
andps m5, m0 ; (0<y)?m5:0
|
||||
%endif
|
||||
|
||||
%if USE_APPROXIMATION == 1
|
||||
rsqrtps m4, m4
|
||||
mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
|
||||
%else
|
||||
mulps m5, m5
|
||||
divps m5, m4 ; m5 = p = Sxy_new*Sxy_new/Syy
|
||||
%endif
|
||||
VPBROADCASTD m2, xm2 ; m2=i (all lanes get same values, we add the offset-per-lane, later)
|
||||
|
||||
cmpps m0, m3, m5, 1 ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
|
||||
maxps m3, m5 ; m3=max(p_max,p)
|
||||
; maxps here is faster than blendvps, despite blend having lower latency.
|
||||
|
||||
pand m2, m0 ; This version seems faster than sse41 pblendvb
|
||||
pmaxsw m1, m2 ; SSE2 signed word, so it would work for N < 32768/4
|
||||
|
||||
add r4d, mmsize
|
||||
cmp r4d, Nd
|
||||
jb %%distortion_search
|
||||
|
||||
por m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop)
|
||||
movdqa m4, m1 ; needed for the aligned y[max_idx]+=1; processing
|
||||
|
||||
%if mmsize >= 32
|
||||
; Merge parallel maximums round 8 (4 vs 4)
|
||||
|
||||
vextractf128 xm5, ym3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b]
|
||||
cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )
|
||||
|
||||
vextracti128 xm2, ym1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b]
|
||||
BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
|
||||
PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[1x128] : p[0x128]
|
||||
%endif
|
||||
|
||||
; Merge parallel maximums round 4 (2 vs 2)
|
||||
; m3=p[3210]
|
||||
movhlps xm5, xm3 ; m5=p[xx32]
|
||||
cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )
|
||||
|
||||
pshufd xm2, xm1, q3232
|
||||
BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
|
||||
PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[3,2] : p[1,0]
|
||||
|
||||
; Merge parallel maximums final round (1 vs 1)
|
||||
shufps xm0, xm3, xm3, q1111 ; m0 = m3[1] = p[1]
|
||||
cmpss xm0, xm3, 5 ; m0 = !(m0 >= m3) = !( p[1] >= p[0] )
|
||||
|
||||
pshufd xm2, xm1, q1111
|
||||
PBLENDVB xm1, xm2, xm0
|
||||
|
||||
movd dword r4d, xm1 ; zero extends to the rest of r4q
|
||||
|
||||
VBROADCASTSS m3, [tmpX + r4]
|
||||
%{1}ps m7, m3 ; Sxy += X[max_idx]
|
||||
|
||||
VBROADCASTSS m5, [tmpY + r4]
|
||||
%{1}ps m6, m5 ; Syy += Y[max_idx]
|
||||
|
||||
; We have to update a single element in Y[i]
|
||||
; However writing 4 bytes and then doing 16 byte load in the inner loop
|
||||
; could cause a stall due to breaking write forwarding.
|
||||
VPBROADCASTD m1, xm1
|
||||
pcmpeqd m1, m1, m4 ; exactly 1 element matches max_idx and this finds it
|
||||
|
||||
and r4d, ~(mmsize-1) ; align address down, so the value pointed by max_idx is inside a mmsize load
|
||||
movaps m5, [tmpY + r4] ; m5 = Y[y3...ym...y0]
|
||||
andps m1, mm_const_float_1 ; m1 = [ 0...1.0...0]
|
||||
%{1}ps m5, m1 ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
|
||||
movaps [tmpY + r4], m5 ; Y[max_idx] +-= 1.0;
|
||||
%endmacro
|
||||
|
||||
;
|
||||
; We need one more register for
|
||||
; PIC relative addressing. Use this
|
||||
; to count it in cglobal
|
||||
;
|
||||
%ifdef PIC
|
||||
%define num_pic_regs 1
|
||||
%else
|
||||
%define num_pic_regs 0
|
||||
%endif
|
||||
|
||||
;
|
||||
; Pyramid Vector Quantization Search implementation
|
||||
;
|
||||
; float * inX - Unaligned (SIMD) access, it will be overread,
|
||||
; but extra data is masked away.
|
||||
; int32 * outY - Should be aligned and padded buffer.
|
||||
; It is used as temp buffer.
|
||||
; uint32 K - Number of pulses to have after quantizations.
|
||||
; uint32 N - Number of vector elements. Must be 0 < N < 256
|
||||
;
|
||||
%macro PVQ_FAST_SEARCH 0
|
||||
cglobal pvq_search, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
|
||||
%define tmpX rsp
|
||||
%define tmpY outYq
|
||||
|
||||
movaps m0, [const_float_abs_mask]
|
||||
shl Nd, 2 ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
|
||||
mov r4d, Nd
|
||||
|
||||
neg r4d
|
||||
and r4d, mmsize-1
|
||||
|
||||
SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const
|
||||
movups m2, [pic_base_const_align_abs_edge + r4 - mmsize]
|
||||
|
||||
add Nd, r4d ; N = align(N, mmsize)
|
||||
|
||||
lea r4d, [Nd - mmsize] ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
|
||||
movups m1, [inXq + r4]
|
||||
andps m1, m2
|
||||
movaps [tmpX + r4], m1 ; Sx = abs( X[N-1] )
|
||||
|
||||
align 16
|
||||
%%loop_abs_sum:
|
||||
sub r4d, mmsize
|
||||
jc %%end_loop_abs_sum
|
||||
|
||||
movups m2, [inXq + r4]
|
||||
andps m2, m0
|
||||
|
||||
movaps [tmpX + r4], m2 ; tmpX[i]=abs(X[i])
|
||||
addps m1, m2 ; Sx += abs(X[i])
|
||||
jmp %%loop_abs_sum
|
||||
|
||||
align 16
|
||||
%%end_loop_abs_sum:
|
||||
|
||||
HSUMPS m1, m2 ; m1 = Sx
|
||||
|
||||
xorps m0, m0
|
||||
comiss xm0, xm1 ;
|
||||
jz %%zero_input ; if (Sx==0) goto zero_input
|
||||
|
||||
cvtsi2ss xm0, dword Kd ; m0 = K
|
||||
%if USE_APPROXIMATION == 1
|
||||
rcpss xm1, xm1 ; m1 = approx(1/Sx)
|
||||
mulss xm0, xm1 ; m0 = K*(1/Sx)
|
||||
%else
|
||||
divss xm0, xm1 ; b = K/Sx
|
||||
; b = K/max_x
|
||||
%endif
|
||||
|
||||
VBROADCASTSS m0, xm0
|
||||
|
||||
lea r4d, [Nd - mmsize]
|
||||
pxor m5, m5 ; Sy ( Sum of abs( y[i]) )
|
||||
xorps m6, m6 ; Syy ( Sum of y[i]*y[i] )
|
||||
xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] )
|
||||
align 16
|
||||
%%loop_guess:
|
||||
movaps m1, [tmpX + r4] ; m1 = X[i]
|
||||
mulps m2, m0, m1 ; m2 = res*X[i]
|
||||
%if PRESEARCH_ROUNDING == 0
|
||||
cvttps2dq m2, m2 ; yt = (int)truncf( res*X[i] )
|
||||
%else
|
||||
cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] )
|
||||
%endif
|
||||
paddd m5, m2 ; Sy += yt
|
||||
cvtdq2ps m2, m2 ; yt = (float)yt
|
||||
mulps m1, m2 ; m1 = X[i]*yt
|
||||
movaps [tmpY + r4], m2 ; y[i] = m2
|
||||
addps m7, m1 ; Sxy += m1;
|
||||
mulps m2, m2 ; m2 = yt*yt
|
||||
addps m6, m2 ; Syy += m2
|
||||
|
||||
sub r4d, mmsize
|
||||
jnc %%loop_guess
|
||||
|
||||
HSUMPS m6, m1 ; Syy_norm
|
||||
HADDD m5, m4 ; pulses
|
||||
|
||||
movd dword r4d, xm5 ; zero extends to the rest of r4q
|
||||
|
||||
sub Kd, r4d ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
|
||||
jz %%finish ; K - pulses == 0
|
||||
|
||||
SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5
|
||||
SET_HI_REG_MM_CONSTANT movaps, m9, const_float_1
|
||||
SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
|
||||
; Use Syy/2 in distortion parameter calculations.
|
||||
; Saves pre and post-caclulation to correct Y[] values.
|
||||
; Same precision, since float mantisa is normalized.
|
||||
; The SQRT approximation does differ.
|
||||
HSUMPS m7, m0 ; Sxy_norm
|
||||
mulps m6, mm_const_float_0_5
|
||||
|
||||
jc %%remove_pulses_loop ; K - pulses < 0
|
||||
|
||||
align 16 ; K - pulses > 0
|
||||
%%add_pulses_loop:
|
||||
|
||||
PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm
|
||||
|
||||
sub Kd, 1
|
||||
jnz %%add_pulses_loop
|
||||
|
||||
addps m6, m6 ; Syy*=2
|
||||
|
||||
jmp %%finish
|
||||
|
||||
align 16
|
||||
%%remove_pulses_loop:
|
||||
|
||||
PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm
|
||||
|
||||
add Kd, 1
|
||||
jnz %%remove_pulses_loop
|
||||
|
||||
addps m6, m6 ; Syy*=2
|
||||
|
||||
align 16
|
||||
%%finish:
|
||||
lea r4d, [Nd - mmsize]
|
||||
movaps m2, [const_float_sign_mask]
|
||||
|
||||
align 16
|
||||
%%restore_sign_loop:
|
||||
movaps m0, [tmpY + r4] ; m0 = Y[i]
|
||||
movups m1, [inXq + r4] ; m1 = X[i]
|
||||
andps m1, m2 ; m1 = sign(X[i])
|
||||
orps m0, m1 ; m0 = Y[i]*sign
|
||||
cvtps2dq m3, m0 ; m3 = (int)m0
|
||||
movaps [outYq + r4], m3
|
||||
|
||||
sub r4d, mmsize
|
||||
jnc %%restore_sign_loop
|
||||
%%return:
|
||||
|
||||
%if ARCH_X86_64 == 0 ; sbrdsp
|
||||
movss r0m, xm6 ; return (float)Syy_norm
|
||||
fld dword r0m
|
||||
%else
|
||||
movaps m0, m6 ; return (float)Syy_norm
|
||||
%endif
|
||||
|
||||
RET
|
||||
|
||||
align 16
|
||||
%%zero_input:
|
||||
lea r4d, [Nd - mmsize]
|
||||
xorps m0, m0
|
||||
%%zero_loop:
|
||||
movaps [outYq + r4], m0
|
||||
sub r4d, mmsize
|
||||
jnc %%zero_loop
|
||||
|
||||
movaps m6, [const_float_1]
|
||||
jmp %%return
|
||||
%endmacro
|
||||
|
||||
|
||||
INIT_XMM sse2
|
||||
PVQ_FAST_SEARCH
|
||||
|
||||
INIT_XMM sse4
|
||||
PVQ_FAST_SEARCH
|
||||
|
||||
INIT_XMM avx
|
||||
PVQ_FAST_SEARCH
|
Loading…
Reference in New Issue
Block a user