You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-06-30 22:24:04 +02:00
x86: fmtconvert: add special asm for float_to_int16_interleave_misc_*
This gets rid of a variable-length array and a for loop in the C code.

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
committed by
Martin Storsjö
parent
4f2c846d96
commit
66a02159ea
@ -115,6 +115,84 @@ FLOAT_TO_INT16 sse, 0
|
|||||||
FLOAT_TO_INT16 3dnow, 0
|
FLOAT_TO_INT16 3dnow, 0
|
||||||
%undef cvtps2pi
|
%undef cvtps2pi
|
||||||
|
|
||||||
|
;------------------------------------------------------------------------------
|
||||||
|
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
;
; Converts floats read contiguously from src into 16-bit integers (packed with
; signed saturation) and writes them to dst with a stride of `step` int16
; elements, i.e. sample i lands at dst[i*step].
;
; Macro parameters:
;   %1 = instruction-set suffix appended to the function name (sse2 selects the
;        XMM code path; anything else selects the MMX-register code path)
;   %2 = forwarded as the 4th cglobal argument (presumably the number of XMM
;        registers used, per x86inc conventions -- confirm against x86inc.asm)
;
; Each loop iteration consumes 8 floats (32 bytes) and the loop body runs
; before the first termination test, so it always executes at least once.
; Presumably callers pass len > 0 and a multiple of 8 -- TODO confirm.
; The sse2 path uses cvtps2dq with a memory operand, which requires src to be
; 16-byte aligned.
;
; NOTE(review): len and step are declared `long` in the C prototype but are used
; here through the full-width lenq/stepq registers; confirm the upper bits are
; well-defined on ABIs where `long` is narrower than a general-purpose register.
|
||||||
|
;------------------------------------------------------------------------------
|
||||||
|
%macro FLOAT_TO_INT16_STEP 2
|
||||||
|
; Names the four arguments dst, src, len, step and three extra scratch
; registers step3, v1, v2 (4 args / 7 GPRs, per x86inc's cglobal convention).
cglobal float_to_int16_step_%1, 4,7,%2, dst, src, len, step, step3, v1, v2
|
||||||
|
; lenq = 2*len
add lenq, lenq
|
||||||
|
; srcq = src + 4*len bytes, i.e. one past the last input float
lea srcq, [srcq+2*lenq]
|
||||||
|
; step3q = 3*step, so dst[3*step] can be addressed as [dstq+step3q*2]
lea step3q, [stepq*3]
|
||||||
|
; lenq = -2*len: [srcq+2*lenq] now addresses the first float and lenq counts
; upwards towards zero
neg lenq
|
||||||
|
.loop:
|
||||||
|
%ifidn %1, sse2
|
||||||
|
; SSE2 path: convert 2 x 4 floats to int32 and pack with signed saturation,
; leaving eight int16 samples s0..s7 in m0
cvtps2dq m0, [srcq+2*lenq ]
|
||||||
|
cvtps2dq m1, [srcq+2*lenq+16]
|
||||||
|
packssdw m0, m1
|
||||||
|
; v1d = s1:s0
movd v1d, m0
|
||||||
|
psrldq m0, 4
|
||||||
|
; v2d = s3:s2
movd v2d, m0
|
||||||
|
; shift again so that s4 sits in the low word of m0 for the second half
psrldq m0, 4
|
||||||
|
; dst[0] = s0
mov [dstq], v1w
|
||||||
|
; dst[2*step] = s2
mov [dstq+stepq*4], v2w
|
||||||
|
; bring the odd samples (s1, s3) down into the low 16 bits
shr v1d, 16
|
||||||
|
shr v2d, 16
|
||||||
|
; dst[step] = s1
mov [dstq+stepq*2], v1w
|
||||||
|
; dst[3*step] = s3
mov [dstq+step3q*2], v2w
|
||||||
|
; dst += 4*step elements (8*step bytes)
lea dstq, [dstq+stepq*8]
|
||||||
|
; second half: same store pattern for s4..s7
movd v1d, m0
|
||||||
|
psrldq m0, 4
|
||||||
|
movd v2d, m0
|
||||||
|
mov [dstq], v1w
|
||||||
|
mov [dstq+stepq*4], v2w
|
||||||
|
shr v1d, 16
|
||||||
|
shr v2d, 16
|
||||||
|
mov [dstq+stepq*2], v1w
|
||||||
|
mov [dstq+step3q*2], v2w
|
||||||
|
lea dstq, [dstq+stepq*8]
|
||||||
|
%else
|
||||||
|
; MMX-register path: each cvtps2pi converts 2 floats, so four conversions cover
; the same 8 floats; after packing, m0 holds s0..s3 and m2 holds s4..s7
cvtps2pi m0, [srcq+2*lenq ]
|
||||||
|
cvtps2pi m1, [srcq+2*lenq+ 8]
|
||||||
|
cvtps2pi m2, [srcq+2*lenq+16]
|
||||||
|
cvtps2pi m3, [srcq+2*lenq+24]
|
||||||
|
packssdw m0, m1
|
||||||
|
packssdw m2, m3
|
||||||
|
; v1d = s1:s0
movd v1d, m0
|
||||||
|
psrlq m0, 32
|
||||||
|
; v2d = s3:s2
movd v2d, m0
|
||||||
|
; dst[0] = s0
mov [dstq], v1w
|
||||||
|
; dst[2*step] = s2
mov [dstq+stepq*4], v2w
|
||||||
|
; bring the odd samples (s1, s3) down into the low 16 bits
shr v1d, 16
|
||||||
|
shr v2d, 16
|
||||||
|
; dst[step] = s1
mov [dstq+stepq*2], v1w
|
||||||
|
; dst[3*step] = s3
mov [dstq+step3q*2], v2w
|
||||||
|
; dst += 4*step elements (8*step bytes)
lea dstq, [dstq+stepq*8]
|
||||||
|
; second half: same store pattern for s4..s7, taken from m2
movd v1d, m2
|
||||||
|
psrlq m2, 32
|
||||||
|
movd v2d, m2
|
||||||
|
mov [dstq], v1w
|
||||||
|
mov [dstq+stepq*4], v2w
|
||||||
|
shr v1d, 16
|
||||||
|
shr v2d, 16
|
||||||
|
mov [dstq+stepq*2], v1w
|
||||||
|
mov [dstq+step3q*2], v2w
|
||||||
|
lea dstq, [dstq+stepq*8]
|
||||||
|
%endif
|
||||||
|
; lenq is in 2-byte units of output, so +16 advances by 8 samples (32 input bytes)
add lenq, 16
|
||||||
|
; keep looping while the (negative) counter has not yet reached zero
js .loop
|
||||||
|
%ifnidn %1, sse2
|
||||||
|
; the non-sse2 variants used MMX registers; reset the MMX/x87 state before returning
emms
|
||||||
|
%endif
|
||||||
|
REP_RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; Instantiate float_to_int16_step for three instruction sets.
; sse2: XMM registers; second macro argument is 2 (the sse2 path uses m0 and m1).
INIT_XMM
|
||||||
|
FLOAT_TO_INT16_STEP sse2, 2
|
||||||
|
; sse: MMX registers, converting with cvtps2pi.
INIT_MMX
|
||||||
|
FLOAT_TO_INT16_STEP sse, 0
|
||||||
|
; 3dnow: same macro body, with cvtps2pi remapped to pf2id for the duration of
; this expansion only; the %undef afterwards restores the real mnemonic.
%define cvtps2pi pf2id
|
||||||
|
FLOAT_TO_INT16_STEP 3dnow, 0
|
||||||
|
%undef cvtps2pi
|
||||||
|
|
||||||
;-------------------------------------------------------------------------------
|
;-------------------------------------------------------------------------------
|
||||||
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
|
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
|
||||||
|
@ -25,6 +25,7 @@
|
|||||||
#include "libavutil/cpu.h"
|
#include "libavutil/cpu.h"
|
||||||
#include "libavutil/x86_cpu.h"
|
#include "libavutil/x86_cpu.h"
|
||||||
#include "libavcodec/fmtconvert.h"
|
#include "libavcodec/fmtconvert.h"
|
||||||
|
#include "libavcodec/dsputil.h"
|
||||||
|
|
||||||
#if HAVE_YASM
|
#if HAVE_YASM
|
||||||
|
|
||||||
@ -35,6 +36,10 @@ void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
|
|||||||
void ff_float_to_int16_sse (int16_t *dst, const float *src, long len);
|
void ff_float_to_int16_sse (int16_t *dst, const float *src, long len);
|
||||||
void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
|
void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
|
||||||
|
|
||||||
|
void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step);
|
||||||
|
void ff_float_to_int16_step_sse (int16_t *dst, const float *src, long len, long step);
|
||||||
|
void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step);
|
||||||
|
|
||||||
void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
|
void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
|
||||||
void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len);
|
void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len);
|
||||||
void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
|
void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
|
||||||
@ -48,12 +53,9 @@ void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len
|
|||||||
#define FLOAT_TO_INT16_INTERLEAVE(cpu) \
|
#define FLOAT_TO_INT16_INTERLEAVE(cpu) \
|
||||||
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
|
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
|
||||||
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
|
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
|
||||||
DECLARE_ALIGNED(16, int16_t, tmp)[len];\
|
int c;\
|
||||||
int i,j,c;\
|
|
||||||
for(c=0; c<channels; c++){\
|
for(c=0; c<channels; c++){\
|
||||||
ff_float_to_int16_##cpu(tmp, src[c], len);\
|
ff_float_to_int16_step_##cpu(dst+c, src[c], len, channels);\
|
||||||
for(i=0, j=c; i<len; i++, j+=channels)\
|
|
||||||
dst[j] = tmp[i];\
|
|
||||||
}\
|
}\
|
||||||
}\
|
}\
|
||||||
\
|
\
|
||||||
|
Reference in New Issue
Block a user