mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
Separate format conversion DSP functions from DSPContext.
This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil. Signed-off-by: Mans Rullgard <mans@mansr.com>
This commit is contained in:
parent
770c410fbb
commit
c73d99e672
@ -12,6 +12,7 @@ OBJS = allcodecs.o \
|
||||
bitstream_filter.o \
|
||||
dsputil.o \
|
||||
faanidct.o \
|
||||
fmtconvert.o \
|
||||
imgconvert.o \
|
||||
jrevdct.o \
|
||||
opt.o \
|
||||
|
@ -35,6 +35,7 @@
|
||||
#include "fft.h"
|
||||
#include "mpeg4audio.h"
|
||||
#include "sbr.h"
|
||||
#include "fmtconvert.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
@ -268,6 +269,7 @@ typedef struct {
|
||||
FFTContext mdct;
|
||||
FFTContext mdct_small;
|
||||
DSPContext dsp;
|
||||
FmtConvertContext fmt_conv;
|
||||
int random_state;
|
||||
/** @} */
|
||||
|
||||
|
@ -85,6 +85,7 @@
|
||||
#include "get_bits.h"
|
||||
#include "dsputil.h"
|
||||
#include "fft.h"
|
||||
#include "fmtconvert.h"
|
||||
#include "lpc.h"
|
||||
|
||||
#include "aac.h"
|
||||
@ -562,6 +563,7 @@ static av_cold int aac_decode_init(AVCodecContext *avctx)
|
||||
ff_aac_sbr_init();
|
||||
|
||||
dsputil_init(&ac->dsp, avctx);
|
||||
ff_fmt_convert_init(&ac->fmt_conv, avctx);
|
||||
|
||||
ac->random_state = 0x1f2e3d4c;
|
||||
|
||||
@ -2032,7 +2034,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
|
||||
*data_size = data_size_tmp;
|
||||
|
||||
if (samples)
|
||||
ac->dsp.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
|
||||
ac->fmt_conv.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
|
||||
|
||||
if (ac->output_configured)
|
||||
ac->output_configured = OC_LOCKED;
|
||||
|
@ -193,6 +193,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
|
||||
ff_mdct_init(&s->imdct_512, 9, 1, 1.0);
|
||||
ff_kbd_window_init(s->window, 5.0, 256);
|
||||
dsputil_init(&s->dsp, avctx);
|
||||
ff_fmt_convert_init(&s->fmt_conv, avctx);
|
||||
av_lfg_init(&s->dith_state, 0);
|
||||
|
||||
/* set scale value for float to int16 conversion */
|
||||
@ -1255,7 +1256,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
|
||||
} else {
|
||||
gain *= s->dynamic_range[0];
|
||||
}
|
||||
s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
|
||||
s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
|
||||
}
|
||||
|
||||
/* apply spectral extension to high frequency bins */
|
||||
@ -1407,7 +1408,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
|
||||
av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n");
|
||||
err = 1;
|
||||
}
|
||||
s->dsp.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
|
||||
s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
|
||||
out_samples += 256 * s->out_channels;
|
||||
}
|
||||
*data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t);
|
||||
|
@ -55,6 +55,7 @@
|
||||
#include "get_bits.h"
|
||||
#include "dsputil.h"
|
||||
#include "fft.h"
|
||||
#include "fmtconvert.h"
|
||||
|
||||
/* override ac3.h to include coupling channel */
|
||||
#undef AC3_MAX_CHANNELS
|
||||
@ -190,6 +191,7 @@ typedef struct {
|
||||
|
||||
///@defgroup opt optimization
|
||||
DSPContext dsp; ///< for optimization
|
||||
FmtConvertContext fmt_conv; ///< optimized conversion functions
|
||||
float mul_bias; ///< scaling for float_to_int16 conversion
|
||||
///@}
|
||||
|
||||
|
@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
|
||||
OBJS += arm/dsputil_init_arm.o \
|
||||
arm/dsputil_arm.o \
|
||||
arm/fft_init_arm.o \
|
||||
arm/fmtconvert_init_arm.o \
|
||||
arm/jrevdct_arm.o \
|
||||
arm/mpegvideo_arm.o \
|
||||
arm/simple_idct_arm.o \
|
||||
@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6) += arm/dsputil_init_armv6.o \
|
||||
arm/dsputil_armv6.o \
|
||||
arm/simple_idct_armv6.o \
|
||||
|
||||
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \
|
||||
|
||||
OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \
|
||||
arm/dsputil_init_vfp.o \
|
||||
$(VFP-OBJS-yes)
|
||||
|
||||
OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \
|
||||
arm/mpegvideo_iwmmxt.o \
|
||||
@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \
|
||||
|
||||
OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \
|
||||
arm/dsputil_neon.o \
|
||||
arm/fmtconvert_neon.o \
|
||||
arm/int_neon.o \
|
||||
arm/mpegvideo_neon.o \
|
||||
arm/simple_idct_neon.o \
|
||||
|
@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
|
||||
int len);
|
||||
void ff_butterflies_float_neon(float *v1, float *v2, int len);
|
||||
float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
|
||||
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
|
||||
float mul, int len);
|
||||
void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
|
||||
const float *src1, int len);
|
||||
void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
|
||||
@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
|
||||
|
||||
void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
|
||||
int len);
|
||||
void ff_float_to_int16_neon(int16_t *, const float *, long);
|
||||
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
|
||||
|
||||
void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
|
||||
|
||||
@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
|
||||
c->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
|
||||
c->butterflies_float = ff_butterflies_float_neon;
|
||||
c->scalarproduct_float = ff_scalarproduct_float_neon;
|
||||
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
|
||||
c->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
|
||||
c->vector_fmul_add = ff_vector_fmul_add_neon;
|
||||
c->vector_clipf = ff_vector_clipf_neon;
|
||||
@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
|
||||
c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
|
||||
c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;
|
||||
|
||||
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
|
||||
c->float_to_int16 = ff_float_to_int16_neon;
|
||||
c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
|
||||
}
|
||||
|
||||
if (CONFIG_VORBIS_DECODER)
|
||||
c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
|
||||
|
||||
|
@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0,
|
||||
const float *src1, int len);
|
||||
void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
|
||||
const float *src1, int len);
|
||||
void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
|
||||
|
||||
void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx)
|
||||
{
|
||||
c->vector_fmul = ff_vector_fmul_vfp;
|
||||
c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
|
||||
#if HAVE_ARMV6
|
||||
c->float_to_int16 = ff_float_to_int16_vfp;
|
||||
#endif
|
||||
}
|
||||
|
@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_float_to_int16_neon, export=1
|
||||
subs r2, r2, #8
|
||||
vld1.64 {d0-d1}, [r1,:128]!
|
||||
vcvt.s32.f32 q8, q0, #16
|
||||
vld1.64 {d2-d3}, [r1,:128]!
|
||||
vcvt.s32.f32 q9, q1, #16
|
||||
beq 3f
|
||||
bics ip, r2, #15
|
||||
beq 2f
|
||||
1: subs ip, ip, #16
|
||||
vshrn.s32 d4, q8, #16
|
||||
vld1.64 {d0-d1}, [r1,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vshrn.s32 d5, q9, #16
|
||||
vld1.64 {d2-d3}, [r1,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
vshrn.s32 d6, q0, #16
|
||||
vst1.64 {d4-d5}, [r0,:128]!
|
||||
vshrn.s32 d7, q1, #16
|
||||
vld1.64 {d16-d17},[r1,:128]!
|
||||
vcvt.s32.f32 q8, q8, #16
|
||||
vld1.64 {d18-d19},[r1,:128]!
|
||||
vcvt.s32.f32 q9, q9, #16
|
||||
vst1.64 {d6-d7}, [r0,:128]!
|
||||
bne 1b
|
||||
ands r2, r2, #15
|
||||
beq 3f
|
||||
2: vld1.64 {d0-d1}, [r1,:128]!
|
||||
vshrn.s32 d4, q8, #16
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vld1.64 {d2-d3}, [r1,:128]!
|
||||
vshrn.s32 d5, q9, #16
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
vshrn.s32 d6, q0, #16
|
||||
vst1.64 {d4-d5}, [r0,:128]!
|
||||
vshrn.s32 d7, q1, #16
|
||||
vst1.64 {d6-d7}, [r0,:128]!
|
||||
bx lr
|
||||
3: vshrn.s32 d4, q8, #16
|
||||
vshrn.s32 d5, q9, #16
|
||||
vst1.64 {d4-d5}, [r0,:128]!
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_float_to_int16_interleave_neon, export=1
|
||||
cmp r3, #2
|
||||
ldrlt r1, [r1]
|
||||
blt ff_float_to_int16_neon
|
||||
bne 4f
|
||||
|
||||
ldr r3, [r1]
|
||||
ldr r1, [r1, #4]
|
||||
|
||||
subs r2, r2, #8
|
||||
vld1.64 {d0-d1}, [r3,:128]!
|
||||
vcvt.s32.f32 q8, q0, #16
|
||||
vld1.64 {d2-d3}, [r3,:128]!
|
||||
vcvt.s32.f32 q9, q1, #16
|
||||
vld1.64 {d20-d21},[r1,:128]!
|
||||
vcvt.s32.f32 q10, q10, #16
|
||||
vld1.64 {d22-d23},[r1,:128]!
|
||||
vcvt.s32.f32 q11, q11, #16
|
||||
beq 3f
|
||||
bics ip, r2, #15
|
||||
beq 2f
|
||||
1: subs ip, ip, #16
|
||||
vld1.64 {d0-d1}, [r3,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vsri.32 q10, q8, #16
|
||||
vld1.64 {d2-d3}, [r3,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
vld1.64 {d24-d25},[r1,:128]!
|
||||
vcvt.s32.f32 q12, q12, #16
|
||||
vld1.64 {d26-d27},[r1,:128]!
|
||||
vsri.32 q11, q9, #16
|
||||
vst1.64 {d20-d21},[r0,:128]!
|
||||
vcvt.s32.f32 q13, q13, #16
|
||||
vst1.64 {d22-d23},[r0,:128]!
|
||||
vsri.32 q12, q0, #16
|
||||
vld1.64 {d16-d17},[r3,:128]!
|
||||
vsri.32 q13, q1, #16
|
||||
vst1.64 {d24-d25},[r0,:128]!
|
||||
vcvt.s32.f32 q8, q8, #16
|
||||
vld1.64 {d18-d19},[r3,:128]!
|
||||
vcvt.s32.f32 q9, q9, #16
|
||||
vld1.64 {d20-d21},[r1,:128]!
|
||||
vcvt.s32.f32 q10, q10, #16
|
||||
vld1.64 {d22-d23},[r1,:128]!
|
||||
vcvt.s32.f32 q11, q11, #16
|
||||
vst1.64 {d26-d27},[r0,:128]!
|
||||
bne 1b
|
||||
ands r2, r2, #15
|
||||
beq 3f
|
||||
2: vsri.32 q10, q8, #16
|
||||
vld1.64 {d0-d1}, [r3,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vld1.64 {d2-d3}, [r3,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
vld1.64 {d24-d25},[r1,:128]!
|
||||
vcvt.s32.f32 q12, q12, #16
|
||||
vsri.32 q11, q9, #16
|
||||
vld1.64 {d26-d27},[r1,:128]!
|
||||
vcvt.s32.f32 q13, q13, #16
|
||||
vst1.64 {d20-d21},[r0,:128]!
|
||||
vsri.32 q12, q0, #16
|
||||
vst1.64 {d22-d23},[r0,:128]!
|
||||
vsri.32 q13, q1, #16
|
||||
vst1.64 {d24-d27},[r0,:128]!
|
||||
bx lr
|
||||
3: vsri.32 q10, q8, #16
|
||||
vsri.32 q11, q9, #16
|
||||
vst1.64 {d20-d23},[r0,:128]!
|
||||
bx lr
|
||||
|
||||
4: push {r4-r8,lr}
|
||||
cmp r3, #4
|
||||
lsl ip, r3, #1
|
||||
blt 4f
|
||||
|
||||
@ 4 channels
|
||||
5: ldmia r1!, {r4-r7}
|
||||
mov lr, r2
|
||||
mov r8, r0
|
||||
vld1.64 {d16-d17},[r4,:128]!
|
||||
vcvt.s32.f32 q8, q8, #16
|
||||
vld1.64 {d18-d19},[r5,:128]!
|
||||
vcvt.s32.f32 q9, q9, #16
|
||||
vld1.64 {d20-d21},[r6,:128]!
|
||||
vcvt.s32.f32 q10, q10, #16
|
||||
vld1.64 {d22-d23},[r7,:128]!
|
||||
vcvt.s32.f32 q11, q11, #16
|
||||
6: subs lr, lr, #8
|
||||
vld1.64 {d0-d1}, [r4,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vsri.32 q9, q8, #16
|
||||
vld1.64 {d2-d3}, [r5,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
vsri.32 q11, q10, #16
|
||||
vld1.64 {d4-d5}, [r6,:128]!
|
||||
vcvt.s32.f32 q2, q2, #16
|
||||
vzip.32 d18, d22
|
||||
vld1.64 {d6-d7}, [r7,:128]!
|
||||
vcvt.s32.f32 q3, q3, #16
|
||||
vzip.32 d19, d23
|
||||
vst1.64 {d18}, [r8], ip
|
||||
vsri.32 q1, q0, #16
|
||||
vst1.64 {d22}, [r8], ip
|
||||
vsri.32 q3, q2, #16
|
||||
vst1.64 {d19}, [r8], ip
|
||||
vzip.32 d2, d6
|
||||
vst1.64 {d23}, [r8], ip
|
||||
vzip.32 d3, d7
|
||||
beq 7f
|
||||
vld1.64 {d16-d17},[r4,:128]!
|
||||
vcvt.s32.f32 q8, q8, #16
|
||||
vst1.64 {d2}, [r8], ip
|
||||
vld1.64 {d18-d19},[r5,:128]!
|
||||
vcvt.s32.f32 q9, q9, #16
|
||||
vst1.64 {d6}, [r8], ip
|
||||
vld1.64 {d20-d21},[r6,:128]!
|
||||
vcvt.s32.f32 q10, q10, #16
|
||||
vst1.64 {d3}, [r8], ip
|
||||
vld1.64 {d22-d23},[r7,:128]!
|
||||
vcvt.s32.f32 q11, q11, #16
|
||||
vst1.64 {d7}, [r8], ip
|
||||
b 6b
|
||||
7: vst1.64 {d2}, [r8], ip
|
||||
vst1.64 {d6}, [r8], ip
|
||||
vst1.64 {d3}, [r8], ip
|
||||
vst1.64 {d7}, [r8], ip
|
||||
subs r3, r3, #4
|
||||
popeq {r4-r8,pc}
|
||||
cmp r3, #4
|
||||
add r0, r0, #8
|
||||
bge 5b
|
||||
|
||||
@ 2 channels
|
||||
4: cmp r3, #2
|
||||
blt 4f
|
||||
ldmia r1!, {r4-r5}
|
||||
mov lr, r2
|
||||
mov r8, r0
|
||||
tst lr, #8
|
||||
vld1.64 {d16-d17},[r4,:128]!
|
||||
vcvt.s32.f32 q8, q8, #16
|
||||
vld1.64 {d18-d19},[r5,:128]!
|
||||
vcvt.s32.f32 q9, q9, #16
|
||||
vld1.64 {d20-d21},[r4,:128]!
|
||||
vcvt.s32.f32 q10, q10, #16
|
||||
vld1.64 {d22-d23},[r5,:128]!
|
||||
vcvt.s32.f32 q11, q11, #16
|
||||
beq 6f
|
||||
subs lr, lr, #8
|
||||
beq 7f
|
||||
vsri.32 d18, d16, #16
|
||||
vsri.32 d19, d17, #16
|
||||
vld1.64 {d16-d17},[r4,:128]!
|
||||
vcvt.s32.f32 q8, q8, #16
|
||||
vst1.32 {d18[0]}, [r8], ip
|
||||
vsri.32 d22, d20, #16
|
||||
vst1.32 {d18[1]}, [r8], ip
|
||||
vsri.32 d23, d21, #16
|
||||
vst1.32 {d19[0]}, [r8], ip
|
||||
vst1.32 {d19[1]}, [r8], ip
|
||||
vld1.64 {d18-d19},[r5,:128]!
|
||||
vcvt.s32.f32 q9, q9, #16
|
||||
vst1.32 {d22[0]}, [r8], ip
|
||||
vst1.32 {d22[1]}, [r8], ip
|
||||
vld1.64 {d20-d21},[r4,:128]!
|
||||
vcvt.s32.f32 q10, q10, #16
|
||||
vst1.32 {d23[0]}, [r8], ip
|
||||
vst1.32 {d23[1]}, [r8], ip
|
||||
vld1.64 {d22-d23},[r5,:128]!
|
||||
vcvt.s32.f32 q11, q11, #16
|
||||
6: subs lr, lr, #16
|
||||
vld1.64 {d0-d1}, [r4,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vsri.32 d18, d16, #16
|
||||
vld1.64 {d2-d3}, [r5,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
vsri.32 d19, d17, #16
|
||||
vld1.64 {d4-d5}, [r4,:128]!
|
||||
vcvt.s32.f32 q2, q2, #16
|
||||
vld1.64 {d6-d7}, [r5,:128]!
|
||||
vcvt.s32.f32 q3, q3, #16
|
||||
vst1.32 {d18[0]}, [r8], ip
|
||||
vsri.32 d22, d20, #16
|
||||
vst1.32 {d18[1]}, [r8], ip
|
||||
vsri.32 d23, d21, #16
|
||||
vst1.32 {d19[0]}, [r8], ip
|
||||
vsri.32 d2, d0, #16
|
||||
vst1.32 {d19[1]}, [r8], ip
|
||||
vsri.32 d3, d1, #16
|
||||
vst1.32 {d22[0]}, [r8], ip
|
||||
vsri.32 d6, d4, #16
|
||||
vst1.32 {d22[1]}, [r8], ip
|
||||
vsri.32 d7, d5, #16
|
||||
vst1.32 {d23[0]}, [r8], ip
|
||||
vst1.32 {d23[1]}, [r8], ip
|
||||
beq 6f
|
||||
vld1.64 {d16-d17},[r4,:128]!
|
||||
vcvt.s32.f32 q8, q8, #16
|
||||
vst1.32 {d2[0]}, [r8], ip
|
||||
vst1.32 {d2[1]}, [r8], ip
|
||||
vld1.64 {d18-d19},[r5,:128]!
|
||||
vcvt.s32.f32 q9, q9, #16
|
||||
vst1.32 {d3[0]}, [r8], ip
|
||||
vst1.32 {d3[1]}, [r8], ip
|
||||
vld1.64 {d20-d21},[r4,:128]!
|
||||
vcvt.s32.f32 q10, q10, #16
|
||||
vst1.32 {d6[0]}, [r8], ip
|
||||
vst1.32 {d6[1]}, [r8], ip
|
||||
vld1.64 {d22-d23},[r5,:128]!
|
||||
vcvt.s32.f32 q11, q11, #16
|
||||
vst1.32 {d7[0]}, [r8], ip
|
||||
vst1.32 {d7[1]}, [r8], ip
|
||||
bgt 6b
|
||||
6: vst1.32 {d2[0]}, [r8], ip
|
||||
vst1.32 {d2[1]}, [r8], ip
|
||||
vst1.32 {d3[0]}, [r8], ip
|
||||
vst1.32 {d3[1]}, [r8], ip
|
||||
vst1.32 {d6[0]}, [r8], ip
|
||||
vst1.32 {d6[1]}, [r8], ip
|
||||
vst1.32 {d7[0]}, [r8], ip
|
||||
vst1.32 {d7[1]}, [r8], ip
|
||||
b 8f
|
||||
7: vsri.32 d18, d16, #16
|
||||
vsri.32 d19, d17, #16
|
||||
vst1.32 {d18[0]}, [r8], ip
|
||||
vsri.32 d22, d20, #16
|
||||
vst1.32 {d18[1]}, [r8], ip
|
||||
vsri.32 d23, d21, #16
|
||||
vst1.32 {d19[0]}, [r8], ip
|
||||
vst1.32 {d19[1]}, [r8], ip
|
||||
vst1.32 {d22[0]}, [r8], ip
|
||||
vst1.32 {d22[1]}, [r8], ip
|
||||
vst1.32 {d23[0]}, [r8], ip
|
||||
vst1.32 {d23[1]}, [r8], ip
|
||||
8: subs r3, r3, #2
|
||||
add r0, r0, #4
|
||||
popeq {r4-r8,pc}
|
||||
|
||||
@ 1 channel
|
||||
4: ldr r4, [r1],#4
|
||||
tst r2, #8
|
||||
mov lr, r2
|
||||
mov r5, r0
|
||||
vld1.64 {d0-d1}, [r4,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vld1.64 {d2-d3}, [r4,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
bne 8f
|
||||
6: subs lr, lr, #16
|
||||
vld1.64 {d4-d5}, [r4,:128]!
|
||||
vcvt.s32.f32 q2, q2, #16
|
||||
vld1.64 {d6-d7}, [r4,:128]!
|
||||
vcvt.s32.f32 q3, q3, #16
|
||||
vst1.16 {d0[1]}, [r5,:16], ip
|
||||
vst1.16 {d0[3]}, [r5,:16], ip
|
||||
vst1.16 {d1[1]}, [r5,:16], ip
|
||||
vst1.16 {d1[3]}, [r5,:16], ip
|
||||
vst1.16 {d2[1]}, [r5,:16], ip
|
||||
vst1.16 {d2[3]}, [r5,:16], ip
|
||||
vst1.16 {d3[1]}, [r5,:16], ip
|
||||
vst1.16 {d3[3]}, [r5,:16], ip
|
||||
beq 7f
|
||||
vld1.64 {d0-d1}, [r4,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vld1.64 {d2-d3}, [r4,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
7: vst1.16 {d4[1]}, [r5,:16], ip
|
||||
vst1.16 {d4[3]}, [r5,:16], ip
|
||||
vst1.16 {d5[1]}, [r5,:16], ip
|
||||
vst1.16 {d5[3]}, [r5,:16], ip
|
||||
vst1.16 {d6[1]}, [r5,:16], ip
|
||||
vst1.16 {d6[3]}, [r5,:16], ip
|
||||
vst1.16 {d7[1]}, [r5,:16], ip
|
||||
vst1.16 {d7[3]}, [r5,:16], ip
|
||||
bgt 6b
|
||||
pop {r4-r8,pc}
|
||||
8: subs lr, lr, #8
|
||||
vst1.16 {d0[1]}, [r5,:16], ip
|
||||
vst1.16 {d0[3]}, [r5,:16], ip
|
||||
vst1.16 {d1[1]}, [r5,:16], ip
|
||||
vst1.16 {d1[3]}, [r5,:16], ip
|
||||
vst1.16 {d2[1]}, [r5,:16], ip
|
||||
vst1.16 {d2[3]}, [r5,:16], ip
|
||||
vst1.16 {d3[1]}, [r5,:16], ip
|
||||
vst1.16 {d3[3]}, [r5,:16], ip
|
||||
popeq {r4-r8,pc}
|
||||
vld1.64 {d0-d1}, [r4,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vld1.64 {d2-d3}, [r4,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
b 6b
|
||||
endfunc
|
||||
|
||||
function ff_vector_fmul_neon, export=1
|
||||
subs r3, r3, #8
|
||||
vld1.64 {d0-d3}, [r1,:128]!
|
||||
@ -1050,34 +713,6 @@ NOVFP vmov.32 r0, d0[0]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_int32_to_float_fmul_scalar_neon, export=1
|
||||
VFP vdup.32 q0, d0[0]
|
||||
VFP len .req r2
|
||||
NOVFP vdup.32 q0, r2
|
||||
NOVFP len .req r3
|
||||
|
||||
vld1.32 {q1},[r1,:128]!
|
||||
vcvt.f32.s32 q3, q1
|
||||
vld1.32 {q2},[r1,:128]!
|
||||
vcvt.f32.s32 q8, q2
|
||||
1: subs len, len, #8
|
||||
pld [r1, #16]
|
||||
vmul.f32 q9, q3, q0
|
||||
vmul.f32 q10, q8, q0
|
||||
beq 2f
|
||||
vld1.32 {q1},[r1,:128]!
|
||||
vcvt.f32.s32 q3, q1
|
||||
vld1.32 {q2},[r1,:128]!
|
||||
vcvt.f32.s32 q8, q2
|
||||
vst1.32 {q9}, [r0,:128]!
|
||||
vst1.32 {q10},[r0,:128]!
|
||||
b 1b
|
||||
2: vst1.32 {q9}, [r0,:128]!
|
||||
vst1.32 {q10},[r0,:128]!
|
||||
bx lr
|
||||
.unreq len
|
||||
endfunc
|
||||
|
||||
function ff_vector_fmul_reverse_neon, export=1
|
||||
add r2, r2, r3, lsl #2
|
||||
sub r2, r2, #32
|
||||
|
@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
#if HAVE_ARMV6
|
||||
/**
|
||||
* ARM VFP optimized float to int16 conversion.
|
||||
* Assume that len is a positive number and is a multiple of 8, destination
|
||||
* buffer is at least 4 bytes aligned (8 bytes alignment is better for
|
||||
* performance), little endian byte order
|
||||
*/
|
||||
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len)
|
||||
function ff_float_to_int16_vfp, export=1
|
||||
push {r4-r8,lr}
|
||||
vpush {d8-d11}
|
||||
vldmia r1!, {s16-s23}
|
||||
vcvt.s32.f32 s0, s16
|
||||
vcvt.s32.f32 s1, s17
|
||||
vcvt.s32.f32 s2, s18
|
||||
vcvt.s32.f32 s3, s19
|
||||
vcvt.s32.f32 s4, s20
|
||||
vcvt.s32.f32 s5, s21
|
||||
vcvt.s32.f32 s6, s22
|
||||
vcvt.s32.f32 s7, s23
|
||||
1:
|
||||
subs r2, r2, #8
|
||||
vmov r3, r4, s0, s1
|
||||
vmov r5, r6, s2, s3
|
||||
vmov r7, r8, s4, s5
|
||||
vmov ip, lr, s6, s7
|
||||
vldmiagt r1!, {s16-s23}
|
||||
ssat r4, #16, r4
|
||||
ssat r3, #16, r3
|
||||
ssat r6, #16, r6
|
||||
ssat r5, #16, r5
|
||||
pkhbt r3, r3, r4, lsl #16
|
||||
pkhbt r4, r5, r6, lsl #16
|
||||
vcvtgt.s32.f32 s0, s16
|
||||
vcvtgt.s32.f32 s1, s17
|
||||
vcvtgt.s32.f32 s2, s18
|
||||
vcvtgt.s32.f32 s3, s19
|
||||
vcvtgt.s32.f32 s4, s20
|
||||
vcvtgt.s32.f32 s5, s21
|
||||
vcvtgt.s32.f32 s6, s22
|
||||
vcvtgt.s32.f32 s7, s23
|
||||
ssat r8, #16, r8
|
||||
ssat r7, #16, r7
|
||||
ssat lr, #16, lr
|
||||
ssat ip, #16, ip
|
||||
pkhbt r5, r7, r8, lsl #16
|
||||
pkhbt r6, ip, lr, lsl #16
|
||||
stmia r0!, {r3-r6}
|
||||
bgt 1b
|
||||
|
||||
vpop {d8-d11}
|
||||
pop {r4-r8,pc}
|
||||
endfunc
|
||||
#endif
|
||||
|
48
libavcodec/arm/fmtconvert_init_arm.c
Normal file
48
libavcodec/arm/fmtconvert_init_arm.c
Normal file
@ -0,0 +1,48 @@
|
||||
/*
|
||||
* ARM optimized Format Conversion Utils
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/fmtconvert.h"
|
||||
|
||||
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
|
||||
float mul, int len);
|
||||
|
||||
void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
|
||||
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
|
||||
|
||||
void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
|
||||
|
||||
/**
 * Install the ARM-specific format conversion routines into a
 * FmtConvertContext.
 *
 * Only the function pointers named below are written; every other member
 * of *c is left as the caller set it.
 *
 * @param c     context whose function pointers are overridden
 * @param avctx codec context; only avctx->flags is read, and only inside
 *              the HAVE_NEON branch, to test CODEC_FLAG_BITEXACT
 */
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
|
||||
{
|
||||
/* The VFP float_to_int16 routine is selected only when both VFP and
 * ARMv6 are available. NOTE(review): HAVE_* are presumably build-time
 * configuration constants -- confirm against the build configuration. */
if (HAVE_ARMVFP && HAVE_ARMV6) {
|
||||
c->float_to_int16 = ff_float_to_int16_vfp;
|
||||
}
|
||||
|
||||
/* This branch runs after the VFP one, so any NEON assignment to
 * float_to_int16 below replaces the VFP pointer set above. */
if (HAVE_NEON) {
|
||||
/* Installed regardless of the bit-exact flag. */
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
|
||||
|
||||
/* The NEON float->int16 converters are skipped when the caller asks
 * for bit-exact output. NOTE(review): presumably their results can
 * differ from the reference implementation -- confirm. */
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
|
||||
c->float_to_int16 = ff_float_to_int16_neon;
|
||||
c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
|
||||
}
|
||||
}
|
||||
}
|
391
libavcodec/arm/fmtconvert_neon.S
Normal file
391
libavcodec/arm/fmtconvert_neon.S
Normal file
@ -0,0 +1,391 @@
|
||||
/*
|
||||
* ARM NEON optimised Format Conversion Utils
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "asm.S"
|
||||
|
||||
preserve8
|
||||
.text
|
||||
|
||||
function ff_float_to_int16_neon, export=1
|
||||
subs r2, r2, #8
|
||||
vld1.64 {d0-d1}, [r1,:128]!
|
||||
vcvt.s32.f32 q8, q0, #16
|
||||
vld1.64 {d2-d3}, [r1,:128]!
|
||||
vcvt.s32.f32 q9, q1, #16
|
||||
beq 3f
|
||||
bics ip, r2, #15
|
||||
beq 2f
|
||||
1: subs ip, ip, #16
|
||||
vshrn.s32 d4, q8, #16
|
||||
vld1.64 {d0-d1}, [r1,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vshrn.s32 d5, q9, #16
|
||||
vld1.64 {d2-d3}, [r1,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
vshrn.s32 d6, q0, #16
|
||||
vst1.64 {d4-d5}, [r0,:128]!
|
||||
vshrn.s32 d7, q1, #16
|
||||
vld1.64 {d16-d17},[r1,:128]!
|
||||
vcvt.s32.f32 q8, q8, #16
|
||||
vld1.64 {d18-d19},[r1,:128]!
|
||||
vcvt.s32.f32 q9, q9, #16
|
||||
vst1.64 {d6-d7}, [r0,:128]!
|
||||
bne 1b
|
||||
ands r2, r2, #15
|
||||
beq 3f
|
||||
2: vld1.64 {d0-d1}, [r1,:128]!
|
||||
vshrn.s32 d4, q8, #16
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vld1.64 {d2-d3}, [r1,:128]!
|
||||
vshrn.s32 d5, q9, #16
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
vshrn.s32 d6, q0, #16
|
||||
vst1.64 {d4-d5}, [r0,:128]!
|
||||
vshrn.s32 d7, q1, #16
|
||||
vst1.64 {d6-d7}, [r0,:128]!
|
||||
bx lr
|
||||
3: vshrn.s32 d4, q8, #16
|
||||
vshrn.s32 d5, q9, #16
|
||||
vst1.64 {d4-d5}, [r0,:128]!
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_float_to_int16_interleave_neon, export=1
|
||||
cmp r3, #2
|
||||
ldrlt r1, [r1]
|
||||
blt ff_float_to_int16_neon
|
||||
bne 4f
|
||||
|
||||
ldr r3, [r1]
|
||||
ldr r1, [r1, #4]
|
||||
|
||||
subs r2, r2, #8
|
||||
vld1.64 {d0-d1}, [r3,:128]!
|
||||
vcvt.s32.f32 q8, q0, #16
|
||||
vld1.64 {d2-d3}, [r3,:128]!
|
||||
vcvt.s32.f32 q9, q1, #16
|
||||
vld1.64 {d20-d21},[r1,:128]!
|
||||
vcvt.s32.f32 q10, q10, #16
|
||||
vld1.64 {d22-d23},[r1,:128]!
|
||||
vcvt.s32.f32 q11, q11, #16
|
||||
beq 3f
|
||||
bics ip, r2, #15
|
||||
beq 2f
|
||||
1: subs ip, ip, #16
|
||||
vld1.64 {d0-d1}, [r3,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vsri.32 q10, q8, #16
|
||||
vld1.64 {d2-d3}, [r3,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
vld1.64 {d24-d25},[r1,:128]!
|
||||
vcvt.s32.f32 q12, q12, #16
|
||||
vld1.64 {d26-d27},[r1,:128]!
|
||||
vsri.32 q11, q9, #16
|
||||
vst1.64 {d20-d21},[r0,:128]!
|
||||
vcvt.s32.f32 q13, q13, #16
|
||||
vst1.64 {d22-d23},[r0,:128]!
|
||||
vsri.32 q12, q0, #16
|
||||
vld1.64 {d16-d17},[r3,:128]!
|
||||
vsri.32 q13, q1, #16
|
||||
vst1.64 {d24-d25},[r0,:128]!
|
||||
vcvt.s32.f32 q8, q8, #16
|
||||
vld1.64 {d18-d19},[r3,:128]!
|
||||
vcvt.s32.f32 q9, q9, #16
|
||||
vld1.64 {d20-d21},[r1,:128]!
|
||||
vcvt.s32.f32 q10, q10, #16
|
||||
vld1.64 {d22-d23},[r1,:128]!
|
||||
vcvt.s32.f32 q11, q11, #16
|
||||
vst1.64 {d26-d27},[r0,:128]!
|
||||
bne 1b
|
||||
ands r2, r2, #15
|
||||
beq 3f
|
||||
2: vsri.32 q10, q8, #16
|
||||
vld1.64 {d0-d1}, [r3,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vld1.64 {d2-d3}, [r3,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
vld1.64 {d24-d25},[r1,:128]!
|
||||
vcvt.s32.f32 q12, q12, #16
|
||||
vsri.32 q11, q9, #16
|
||||
vld1.64 {d26-d27},[r1,:128]!
|
||||
vcvt.s32.f32 q13, q13, #16
|
||||
vst1.64 {d20-d21},[r0,:128]!
|
||||
vsri.32 q12, q0, #16
|
||||
vst1.64 {d22-d23},[r0,:128]!
|
||||
vsri.32 q13, q1, #16
|
||||
vst1.64 {d24-d27},[r0,:128]!
|
||||
bx lr
|
||||
3: vsri.32 q10, q8, #16
|
||||
vsri.32 q11, q9, #16
|
||||
vst1.64 {d20-d23},[r0,:128]!
|
||||
bx lr
|
||||
|
||||
4: push {r4-r8,lr}
|
||||
cmp r3, #4
|
||||
lsl ip, r3, #1
|
||||
blt 4f
|
||||
|
||||
@ 4 channels
|
||||
5: ldmia r1!, {r4-r7}
|
||||
mov lr, r2
|
||||
mov r8, r0
|
||||
vld1.64 {d16-d17},[r4,:128]!
|
||||
vcvt.s32.f32 q8, q8, #16
|
||||
vld1.64 {d18-d19},[r5,:128]!
|
||||
vcvt.s32.f32 q9, q9, #16
|
||||
vld1.64 {d20-d21},[r6,:128]!
|
||||
vcvt.s32.f32 q10, q10, #16
|
||||
vld1.64 {d22-d23},[r7,:128]!
|
||||
vcvt.s32.f32 q11, q11, #16
|
||||
6: subs lr, lr, #8
|
||||
vld1.64 {d0-d1}, [r4,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vsri.32 q9, q8, #16
|
||||
vld1.64 {d2-d3}, [r5,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
vsri.32 q11, q10, #16
|
||||
vld1.64 {d4-d5}, [r6,:128]!
|
||||
vcvt.s32.f32 q2, q2, #16
|
||||
vzip.32 d18, d22
|
||||
vld1.64 {d6-d7}, [r7,:128]!
|
||||
vcvt.s32.f32 q3, q3, #16
|
||||
vzip.32 d19, d23
|
||||
vst1.64 {d18}, [r8], ip
|
||||
vsri.32 q1, q0, #16
|
||||
vst1.64 {d22}, [r8], ip
|
||||
vsri.32 q3, q2, #16
|
||||
vst1.64 {d19}, [r8], ip
|
||||
vzip.32 d2, d6
|
||||
vst1.64 {d23}, [r8], ip
|
||||
vzip.32 d3, d7
|
||||
beq 7f
|
||||
vld1.64 {d16-d17},[r4,:128]!
|
||||
vcvt.s32.f32 q8, q8, #16
|
||||
vst1.64 {d2}, [r8], ip
|
||||
vld1.64 {d18-d19},[r5,:128]!
|
||||
vcvt.s32.f32 q9, q9, #16
|
||||
vst1.64 {d6}, [r8], ip
|
||||
vld1.64 {d20-d21},[r6,:128]!
|
||||
vcvt.s32.f32 q10, q10, #16
|
||||
vst1.64 {d3}, [r8], ip
|
||||
vld1.64 {d22-d23},[r7,:128]!
|
||||
vcvt.s32.f32 q11, q11, #16
|
||||
vst1.64 {d7}, [r8], ip
|
||||
b 6b
|
||||
7: vst1.64 {d2}, [r8], ip
|
||||
vst1.64 {d6}, [r8], ip
|
||||
vst1.64 {d3}, [r8], ip
|
||||
vst1.64 {d7}, [r8], ip
|
||||
subs r3, r3, #4
|
||||
popeq {r4-r8,pc}
|
||||
cmp r3, #4
|
||||
add r0, r0, #8
|
||||
bge 5b
|
||||
|
||||
@ 2 channels
|
||||
4: cmp r3, #2
|
||||
blt 4f
|
||||
ldmia r1!, {r4-r5}
|
||||
mov lr, r2
|
||||
mov r8, r0
|
||||
tst lr, #8
|
||||
vld1.64 {d16-d17},[r4,:128]!
|
||||
vcvt.s32.f32 q8, q8, #16
|
||||
vld1.64 {d18-d19},[r5,:128]!
|
||||
vcvt.s32.f32 q9, q9, #16
|
||||
vld1.64 {d20-d21},[r4,:128]!
|
||||
vcvt.s32.f32 q10, q10, #16
|
||||
vld1.64 {d22-d23},[r5,:128]!
|
||||
vcvt.s32.f32 q11, q11, #16
|
||||
beq 6f
|
||||
subs lr, lr, #8
|
||||
beq 7f
|
||||
vsri.32 d18, d16, #16
|
||||
vsri.32 d19, d17, #16
|
||||
vld1.64 {d16-d17},[r4,:128]!
|
||||
vcvt.s32.f32 q8, q8, #16
|
||||
vst1.32 {d18[0]}, [r8], ip
|
||||
vsri.32 d22, d20, #16
|
||||
vst1.32 {d18[1]}, [r8], ip
|
||||
vsri.32 d23, d21, #16
|
||||
vst1.32 {d19[0]}, [r8], ip
|
||||
vst1.32 {d19[1]}, [r8], ip
|
||||
vld1.64 {d18-d19},[r5,:128]!
|
||||
vcvt.s32.f32 q9, q9, #16
|
||||
vst1.32 {d22[0]}, [r8], ip
|
||||
vst1.32 {d22[1]}, [r8], ip
|
||||
vld1.64 {d20-d21},[r4,:128]!
|
||||
vcvt.s32.f32 q10, q10, #16
|
||||
vst1.32 {d23[0]}, [r8], ip
|
||||
vst1.32 {d23[1]}, [r8], ip
|
||||
vld1.64 {d22-d23},[r5,:128]!
|
||||
vcvt.s32.f32 q11, q11, #16
|
||||
6: subs lr, lr, #16
|
||||
vld1.64 {d0-d1}, [r4,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vsri.32 d18, d16, #16
|
||||
vld1.64 {d2-d3}, [r5,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
vsri.32 d19, d17, #16
|
||||
vld1.64 {d4-d5}, [r4,:128]!
|
||||
vcvt.s32.f32 q2, q2, #16
|
||||
vld1.64 {d6-d7}, [r5,:128]!
|
||||
vcvt.s32.f32 q3, q3, #16
|
||||
vst1.32 {d18[0]}, [r8], ip
|
||||
vsri.32 d22, d20, #16
|
||||
vst1.32 {d18[1]}, [r8], ip
|
||||
vsri.32 d23, d21, #16
|
||||
vst1.32 {d19[0]}, [r8], ip
|
||||
vsri.32 d2, d0, #16
|
||||
vst1.32 {d19[1]}, [r8], ip
|
||||
vsri.32 d3, d1, #16
|
||||
vst1.32 {d22[0]}, [r8], ip
|
||||
vsri.32 d6, d4, #16
|
||||
vst1.32 {d22[1]}, [r8], ip
|
||||
vsri.32 d7, d5, #16
|
||||
vst1.32 {d23[0]}, [r8], ip
|
||||
vst1.32 {d23[1]}, [r8], ip
|
||||
beq 6f
|
||||
vld1.64 {d16-d17},[r4,:128]!
|
||||
vcvt.s32.f32 q8, q8, #16
|
||||
vst1.32 {d2[0]}, [r8], ip
|
||||
vst1.32 {d2[1]}, [r8], ip
|
||||
vld1.64 {d18-d19},[r5,:128]!
|
||||
vcvt.s32.f32 q9, q9, #16
|
||||
vst1.32 {d3[0]}, [r8], ip
|
||||
vst1.32 {d3[1]}, [r8], ip
|
||||
vld1.64 {d20-d21},[r4,:128]!
|
||||
vcvt.s32.f32 q10, q10, #16
|
||||
vst1.32 {d6[0]}, [r8], ip
|
||||
vst1.32 {d6[1]}, [r8], ip
|
||||
vld1.64 {d22-d23},[r5,:128]!
|
||||
vcvt.s32.f32 q11, q11, #16
|
||||
vst1.32 {d7[0]}, [r8], ip
|
||||
vst1.32 {d7[1]}, [r8], ip
|
||||
bgt 6b
|
||||
6: vst1.32 {d2[0]}, [r8], ip
|
||||
vst1.32 {d2[1]}, [r8], ip
|
||||
vst1.32 {d3[0]}, [r8], ip
|
||||
vst1.32 {d3[1]}, [r8], ip
|
||||
vst1.32 {d6[0]}, [r8], ip
|
||||
vst1.32 {d6[1]}, [r8], ip
|
||||
vst1.32 {d7[0]}, [r8], ip
|
||||
vst1.32 {d7[1]}, [r8], ip
|
||||
b 8f
|
||||
7: vsri.32 d18, d16, #16
|
||||
vsri.32 d19, d17, #16
|
||||
vst1.32 {d18[0]}, [r8], ip
|
||||
vsri.32 d22, d20, #16
|
||||
vst1.32 {d18[1]}, [r8], ip
|
||||
vsri.32 d23, d21, #16
|
||||
vst1.32 {d19[0]}, [r8], ip
|
||||
vst1.32 {d19[1]}, [r8], ip
|
||||
vst1.32 {d22[0]}, [r8], ip
|
||||
vst1.32 {d22[1]}, [r8], ip
|
||||
vst1.32 {d23[0]}, [r8], ip
|
||||
vst1.32 {d23[1]}, [r8], ip
|
||||
8: subs r3, r3, #2
|
||||
add r0, r0, #4
|
||||
popeq {r4-r8,pc}
|
||||
|
||||
@ 1 channel
|
||||
4: ldr r4, [r1],#4
|
||||
tst r2, #8
|
||||
mov lr, r2
|
||||
mov r5, r0
|
||||
vld1.64 {d0-d1}, [r4,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vld1.64 {d2-d3}, [r4,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
bne 8f
|
||||
6: subs lr, lr, #16
|
||||
vld1.64 {d4-d5}, [r4,:128]!
|
||||
vcvt.s32.f32 q2, q2, #16
|
||||
vld1.64 {d6-d7}, [r4,:128]!
|
||||
vcvt.s32.f32 q3, q3, #16
|
||||
vst1.16 {d0[1]}, [r5,:16], ip
|
||||
vst1.16 {d0[3]}, [r5,:16], ip
|
||||
vst1.16 {d1[1]}, [r5,:16], ip
|
||||
vst1.16 {d1[3]}, [r5,:16], ip
|
||||
vst1.16 {d2[1]}, [r5,:16], ip
|
||||
vst1.16 {d2[3]}, [r5,:16], ip
|
||||
vst1.16 {d3[1]}, [r5,:16], ip
|
||||
vst1.16 {d3[3]}, [r5,:16], ip
|
||||
beq 7f
|
||||
vld1.64 {d0-d1}, [r4,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vld1.64 {d2-d3}, [r4,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
7: vst1.16 {d4[1]}, [r5,:16], ip
|
||||
vst1.16 {d4[3]}, [r5,:16], ip
|
||||
vst1.16 {d5[1]}, [r5,:16], ip
|
||||
vst1.16 {d5[3]}, [r5,:16], ip
|
||||
vst1.16 {d6[1]}, [r5,:16], ip
|
||||
vst1.16 {d6[3]}, [r5,:16], ip
|
||||
vst1.16 {d7[1]}, [r5,:16], ip
|
||||
vst1.16 {d7[3]}, [r5,:16], ip
|
||||
bgt 6b
|
||||
pop {r4-r8,pc}
|
||||
8: subs lr, lr, #8
|
||||
vst1.16 {d0[1]}, [r5,:16], ip
|
||||
vst1.16 {d0[3]}, [r5,:16], ip
|
||||
vst1.16 {d1[1]}, [r5,:16], ip
|
||||
vst1.16 {d1[3]}, [r5,:16], ip
|
||||
vst1.16 {d2[1]}, [r5,:16], ip
|
||||
vst1.16 {d2[3]}, [r5,:16], ip
|
||||
vst1.16 {d3[1]}, [r5,:16], ip
|
||||
vst1.16 {d3[3]}, [r5,:16], ip
|
||||
popeq {r4-r8,pc}
|
||||
vld1.64 {d0-d1}, [r4,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
vld1.64 {d2-d3}, [r4,:128]!
|
||||
vcvt.s32.f32 q1, q1, #16
|
||||
b 6b
|
||||
endfunc
|
||||
|
||||
/*
 * ff_int32_to_float_fmul_scalar_neon: dst[i] = (float)src[i] * mul.
 *
 * r0 = dst (stored through, post-incremented)
 * r1 = src (loaded from, post-incremented)
 * The VFP/NOVFP prefixes select where mul and len are read from:
 * mul in d0[0] with len in r2, or mul in r2 with len in r3.
 * NOTE(review): presumably hard-float vs. soft-float argument passing;
 * the prefixes are defined outside this chunk -- confirm in asm.S.
 *
 * Eight elements are processed per iteration and the loop only exits
 * when len reaches exactly zero, so len must be a positive multiple of 8.
 * Both pointers are accessed with a 128-bit alignment qualifier.
 */
function ff_int32_to_float_fmul_scalar_neon, export=1
|
||||
/* broadcast the scale factor to all four lanes of q0 */
VFP vdup.32 q0, d0[0]
|
||||
VFP len .req r2
|
||||
NOVFP vdup.32 q0, r2
|
||||
NOVFP len .req r3
|
||||
|
||||
/* prologue: load the first 8 integers and convert them to float (q3, q8) */
vld1.32 {q1},[r1,:128]!
|
||||
vcvt.f32.s32 q3, q1
|
||||
vld1.32 {q2},[r1,:128]!
|
||||
vcvt.f32.s32 q8, q2
|
||||
/* loop: scale the group already converted; the flags set by subs tell
   whether this was the last group */
1: subs len, len, #8
|
||||
pld [r1, #16]
|
||||
vmul.f32 q9, q3, q0
|
||||
vmul.f32 q10, q8, q0
|
||||
/* last group: skip the look-ahead load so nothing is read past the input */
beq 2f
|
||||
/* load and convert the next group before storing the current products */
vld1.32 {q1},[r1,:128]!
|
||||
vcvt.f32.s32 q3, q1
|
||||
vld1.32 {q2},[r1,:128]!
|
||||
vcvt.f32.s32 q8, q2
|
||||
vst1.32 {q9}, [r0,:128]!
|
||||
vst1.32 {q10},[r0,:128]!
|
||||
b 1b
|
||||
/* epilogue: store the final 8 products and return */
2: vst1.32 {q9}, [r0,:128]!
|
||||
vst1.32 {q10},[r0,:128]!
|
||||
bx lr
|
||||
.unreq len
|
||||
endfunc
|
77
libavcodec/arm/fmtconvert_vfp.S
Normal file
77
libavcodec/arm/fmtconvert_vfp.S
Normal file
@ -0,0 +1,77 @@
|
||||
/*
|
||||
* Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "asm.S"
|
||||
|
||||
.syntax unified
|
||||
|
||||
/**
|
||||
* ARM VFP optimized float to int16 conversion.
|
||||
* Assume that len is a positive number and is multiple of 8, destination
|
||||
* buffer is at least 4 bytes aligned (8 bytes alignment is better for
|
||||
* performance), little endian byte sex
|
||||
*/
|
||||
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
|
||||
/*
 * Register use: r0 = dst (written, post-incremented), r1 = src (read,
 * post-incremented), r2 = remaining sample count.
 * Eight samples are converted per iteration; the conversion of the next
 * group is interleaved with the saturation/packing of the current one.
 */
function ff_float_to_int16_vfp, export=1
|
||||
push {r4-r8,lr}
|
||||
/* d8-d11 are the same registers as s16-s23, used below as the load buffer */
vpush {d8-d11}
|
||||
/* prologue: load the first 8 floats and convert them to 32-bit integers */
vldmia r1!, {s16-s23}
|
||||
vcvt.s32.f32 s0, s16
|
||||
vcvt.s32.f32 s1, s17
|
||||
vcvt.s32.f32 s2, s18
|
||||
vcvt.s32.f32 s3, s19
|
||||
vcvt.s32.f32 s4, s20
|
||||
vcvt.s32.f32 s5, s21
|
||||
vcvt.s32.f32 s6, s22
|
||||
vcvt.s32.f32 s7, s23
|
||||
1:
|
||||
/* The flags set here stay untouched until the bgt at the bottom, so every
   "gt" suffix in the loop body means "more samples remain after this group". */
subs r2, r2, #8
|
||||
/* move the 8 converted integers into core registers */
vmov r3, r4, s0, s1
|
||||
vmov r5, r6, s2, s3
|
||||
vmov r7, r8, s4, s5
|
||||
vmov ip, lr, s6, s7
|
||||
/* fetch the next 8 floats only if another iteration will follow */
vldmiagt r1!, {s16-s23}
|
||||
/* saturate each value to the signed 16-bit range */
ssat r4, #16, r4
|
||||
ssat r3, #16, r3
|
||||
ssat r6, #16, r6
|
||||
ssat r5, #16, r5
|
||||
/* pack pairs: even sample in the low halfword, odd sample in the high one */
pkhbt r3, r3, r4, lsl #16
|
||||
pkhbt r4, r5, r6, lsl #16
|
||||
/* convert the next group (if any) while the current one is being packed */
vcvtgt.s32.f32 s0, s16
|
||||
vcvtgt.s32.f32 s1, s17
|
||||
vcvtgt.s32.f32 s2, s18
|
||||
vcvtgt.s32.f32 s3, s19
|
||||
vcvtgt.s32.f32 s4, s20
|
||||
vcvtgt.s32.f32 s5, s21
|
||||
vcvtgt.s32.f32 s6, s22
|
||||
vcvtgt.s32.f32 s7, s23
|
||||
ssat r8, #16, r8
|
||||
ssat r7, #16, r7
|
||||
ssat lr, #16, lr
|
||||
ssat ip, #16, ip
|
||||
pkhbt r5, r7, r8, lsl #16
|
||||
pkhbt r6, ip, lr, lsl #16
|
||||
/* write 8 packed int16 samples as four 32-bit words */
stmia r0!, {r3-r6}
|
||||
bgt 1b
|
||||
|
||||
/* restore the saved registers and return (pc is popped in place of lr) */
vpop {d8-d11}
|
||||
pop {r4-r8,pc}
|
||||
endfunc
|
@ -33,6 +33,7 @@
|
||||
#include "get_bits.h"
|
||||
#include "dsputil.h"
|
||||
#include "fft.h"
|
||||
#include "fmtconvert.h"
|
||||
|
||||
extern const uint16_t ff_wma_critical_freqs[25];
|
||||
|
||||
@ -43,6 +44,7 @@ typedef struct {
|
||||
AVCodecContext *avctx;
|
||||
GetBitContext gb;
|
||||
DSPContext dsp;
|
||||
FmtConvertContext fmt_conv;
|
||||
int first;
|
||||
int channels;
|
||||
int frame_len; ///< transform size (samples)
|
||||
@ -71,6 +73,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
|
||||
|
||||
s->avctx = avctx;
|
||||
dsputil_init(&s->dsp, avctx);
|
||||
ff_fmt_convert_init(&s->fmt_conv, avctx);
|
||||
|
||||
/* determine frame length */
|
||||
if (avctx->sample_rate < 22050) {
|
||||
@ -222,7 +225,8 @@ static void decode_block(BinkAudioContext *s, short *out, int use_dct)
|
||||
ff_rdft_calc(&s->trans.rdft, coeffs);
|
||||
}
|
||||
|
||||
s->dsp.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, s->frame_len, s->channels);
|
||||
s->fmt_conv.float_to_int16_interleave(out, (const float **)s->coeffs_ptr,
|
||||
s->frame_len, s->channels);
|
||||
|
||||
if (!s->first) {
|
||||
int count = s->overlap_len * s->channels;
|
||||
|
@ -40,6 +40,7 @@
|
||||
#include "dca.h"
|
||||
#include "synth_filter.h"
|
||||
#include "dcadsp.h"
|
||||
#include "fmtconvert.h"
|
||||
|
||||
//#define TRACE
|
||||
|
||||
@ -347,6 +348,7 @@ typedef struct {
|
||||
FFTContext imdct;
|
||||
SynthFilterContext synth;
|
||||
DCADSPContext dcadsp;
|
||||
FmtConvertContext fmt_conv;
|
||||
} DCAContext;
|
||||
|
||||
static const uint16_t dca_vlc_offs[] = {
|
||||
@ -1115,7 +1117,7 @@ static int dca_subsubframe(DCAContext * s, int base_channel, int block_index)
|
||||
block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel);
|
||||
}
|
||||
|
||||
s->dsp.int32_to_float_fmul_scalar(subband_samples[k][l],
|
||||
s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l],
|
||||
block, rscale, 8);
|
||||
}
|
||||
|
||||
@ -1802,7 +1804,7 @@ static int dca_decode_frame(AVCodecContext * avctx,
|
||||
}
|
||||
}
|
||||
|
||||
s->dsp.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
|
||||
s->fmt_conv.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
|
||||
samples += 256 * channels;
|
||||
}
|
||||
|
||||
@ -1835,6 +1837,7 @@ static av_cold int dca_decode_init(AVCodecContext * avctx)
|
||||
ff_mdct_init(&s->imdct, 6, 1, 1.0);
|
||||
ff_synth_filter_init(&s->synth);
|
||||
ff_dcadsp_init(&s->dcadsp);
|
||||
ff_fmt_convert_init(&s->fmt_conv, avctx);
|
||||
|
||||
for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++)
|
||||
s->samples_chanptr[i] = s->samples + i * 256;
|
||||
|
@ -3867,12 +3867,6 @@ static float scalarproduct_float_c(const float *v1, const float *v2, int len)
|
||||
return p;
|
||||
}
|
||||
|
||||
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
|
||||
int i;
|
||||
for(i=0; i<len; i++)
|
||||
dst[i] = src[i] * mul;
|
||||
}
|
||||
|
||||
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
|
||||
uint32_t maxi, uint32_t maxisign)
|
||||
{
|
||||
@ -3918,30 +3912,6 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i
|
||||
}
|
||||
}
|
||||
|
||||
static av_always_inline int float_to_int16_one(const float *src){
|
||||
return av_clip_int16(lrintf(*src));
|
||||
}
|
||||
|
||||
static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
|
||||
int i;
|
||||
for(i=0; i<len; i++)
|
||||
dst[i] = float_to_int16_one(src+i);
|
||||
}
|
||||
|
||||
static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
|
||||
int i,j,c;
|
||||
if(channels==2){
|
||||
for(i=0; i<len; i++){
|
||||
dst[2*i] = float_to_int16_one(src[0]+i);
|
||||
dst[2*i+1] = float_to_int16_one(src[1]+i);
|
||||
}
|
||||
}else{
|
||||
for(c=0; c<channels; c++)
|
||||
for(i=0, j=c; i<len; i++, j+=channels)
|
||||
dst[j] = float_to_int16_one(src[c]+i);
|
||||
}
|
||||
}
|
||||
|
||||
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
|
||||
{
|
||||
int res = 0;
|
||||
@ -4437,10 +4407,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
|
||||
c->vector_fmul_reverse = vector_fmul_reverse_c;
|
||||
c->vector_fmul_add = vector_fmul_add_c;
|
||||
c->vector_fmul_window = vector_fmul_window_c;
|
||||
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
|
||||
c->vector_clipf = vector_clipf_c;
|
||||
c->float_to_int16 = ff_float_to_int16_c;
|
||||
c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
|
||||
c->scalarproduct_int16 = scalarproduct_int16_c;
|
||||
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
|
||||
c->scalarproduct_float = scalarproduct_float_c;
|
||||
|
@ -392,7 +392,6 @@ typedef struct DSPContext {
|
||||
/* assume len is a multiple of 4, and arrays are 16-byte aligned */
|
||||
void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
|
||||
/* assume len is a multiple of 8, and arrays are 16-byte aligned */
|
||||
void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
|
||||
void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
|
||||
/**
|
||||
* Multiply a vector of floats by a scalar float. Source and
|
||||
@ -445,10 +444,6 @@ typedef struct DSPContext {
|
||||
*/
|
||||
void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
|
||||
|
||||
/* convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
|
||||
void (*float_to_int16)(int16_t *dst, const float *src, long len);
|
||||
void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);
|
||||
|
||||
/* (I)DCT */
|
||||
void (*fdct)(DCTELEM *block/* align 16*/);
|
||||
void (*fdct248)(DCTELEM *block/* align 16*/);
|
||||
|
68
libavcodec/fmtconvert.c
Normal file
68
libavcodec/fmtconvert.c
Normal file
@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Format Conversion Utils
|
||||
* Copyright (c) 2000, 2001 Fabrice Bellard
|
||||
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "avcodec.h"
|
||||
#include "fmtconvert.h"
|
||||
|
||||
/**
 * Portable C version of FmtConvertContext.int32_to_float_fmul_scalar:
 * each source integer is converted to float and multiplied by mul.
 * Nothing is written when len is zero or negative.
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int remaining = len;

    while (remaining-- > 0)
        *dst++ = *src++ * mul;
}
|
||||
|
||||
/**
 * Convert one float sample: round it to an integer with lrintf() and
 * pass the result through av_clip_int16().
 */
static av_always_inline int float_to_int16_one(const float *src){
    const long rounded = lrintf(*src);

    return av_clip_int16(rounded);
}
|
||||
|
||||
/**
 * Portable C version of FmtConvertContext.float_to_int16.
 *
 * @param dst destination array of int16_t
 * @param src source array of float
 * @param len number of samples to convert; nothing is done if len <= 0
 */
static void float_to_int16_c(int16_t *dst, const float *src, long len)
{
    /* The index has the same type as len so that it cannot overflow
     * before reaching len when len exceeds INT_MAX. */
    long i;

    for (i = 0; i < len; i++)
        dst[i] = float_to_int16_one(src + i);
}
|
||||
|
||||
/**
 * Portable C version of FmtConvertContext.float_to_int16_interleave.
 *
 * Sample i of channel c is written to dst[i * channels + c].
 *
 * @param dst      destination array of interleaved int16_t
 * @param src      one float array per channel
 * @param len      number of samples per channel
 * @param channels number of channels
 */
static void float_to_int16_interleave_c(int16_t *dst, const float **src,
                                        long len, int channels)
{
    /* i and j are long: i is compared against the long len, and j reaches
     * roughly len * channels, which can exceed INT_MAX before i does. */
    long i, j;
    int c;

    if (channels == 2) {
        /* stereo: both channels handled in a single pass */
        for (i = 0; i < len; i++) {
            dst[2*i]     = float_to_int16_one(src[0] + i);
            dst[2*i + 1] = float_to_int16_one(src[1] + i);
        }
    } else {
        /* generic: one pass per channel, writing with a stride of channels */
        for (c = 0; c < channels; c++)
            for (i = 0, j = c; i < len; i++, j += channels)
                dst[j] = float_to_int16_one(src[c] + i);
    }
}
|
||||
|
||||
/**
 * Fill a FmtConvertContext with the portable C implementations, then let
 * the architecture-specific initializer replace individual pointers.
 *
 * @param c     context to initialize
 * @param avctx codec context, forwarded to the architecture initializers
 */
av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
{
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
    c->float_to_int16             = float_to_int16_c;
    c->float_to_int16_interleave  = float_to_int16_interleave_c;

    if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
    /* ppc/fmtconvert_altivec.o, which defines ff_fmt_convert_init_ppc(), is
     * only built when HAVE_ALTIVEC is set, so the call must be guarded by
     * the same symbol; ARCH_PPC alone leaves an unresolved reference on
     * PowerPC builds without AltiVec.
     * NOTE(review): no run-time AltiVec detection is visible here -- confirm
     * whether one is needed for builds with run-time CPU detection. */
    if (HAVE_ALTIVEC) ff_fmt_convert_init_ppc(c, avctx);
    if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx);
}
|
79
libavcodec/fmtconvert.h
Normal file
79
libavcodec/fmtconvert.h
Normal file
@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Format Conversion Utils
|
||||
* Copyright (c) 2000, 2001 Fabrice Bellard
|
||||
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_FMTCONVERT_H
|
||||
#define AVCODEC_FMTCONVERT_H
|
||||
|
||||
#include "avcodec.h"
|
||||
|
||||
/**
 * Table of sample-format conversion routines.
 * The pointers are set by ff_fmt_convert_init().
 */
typedef struct FmtConvertContext {
|
||||
/**
|
||||
* Convert an array of int32_t to float and multiply by a float value.
|
||||
* @param dst destination array of float.
|
||||
* constraints: 16-byte aligned
|
||||
* @param src source array of int32_t.
|
||||
* constraints: 16-byte aligned
|
||||
* @param mul scale factor each converted value is multiplied by.
* @param len number of elements to convert.
|
||||
* constraints: multiple of 8
|
||||
*/
|
||||
void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
|
||||
|
||||
/**
|
||||
* Convert an array of float to an array of int16_t.
|
||||
*
|
||||
* Convert floats in the range [-32768.0,32767.0] to ints
|
||||
* without rescaling
|
||||
*
|
||||
* @param dst destination array of int16_t.
|
||||
* constraints: 16-byte aligned
|
||||
* @param src source array of float.
|
||||
* constraints: 16-byte aligned
|
||||
* @param len number of elements to convert.
|
||||
* constraints: multiple of 8
|
||||
*/
|
||||
void (*float_to_int16)(int16_t *dst, const float *src, long len);
|
||||
|
||||
/**
|
||||
* Convert multiple arrays of float to an interleaved array of int16_t.
|
||||
*
|
||||
* Convert floats in the range [-32768.0,32767.0] to ints
|
||||
* without rescaling
|
||||
*
|
||||
* @param dst destination array of interleaved int16_t.
|
||||
* constraints: 16-byte aligned
|
||||
* @param src source array of float arrays, one for each channel.
|
||||
* constraints: 16-byte aligned
|
||||
* @param len number of elements to convert.
|
||||
* constraints: multiple of 8
|
||||
* @param channels number of channels
|
||||
*/
|
||||
void (*float_to_int16_interleave)(int16_t *dst, const float **src,
|
||||
long len, int channels);
|
||||
} FmtConvertContext;
|
||||
|
||||
void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
|
||||
|
||||
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
|
||||
void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx);
|
||||
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
|
||||
|
||||
#endif /* AVCODEC_FMTCONVERT_H */
|
@ -38,6 +38,7 @@
|
||||
#include "avcodec.h"
|
||||
#include "dsputil.h"
|
||||
#include "fft.h"
|
||||
#include "fmtconvert.h"
|
||||
|
||||
#define ALT_BITSTREAM_READER_LE
|
||||
#include "get_bits.h"
|
||||
@ -52,6 +53,7 @@ typedef struct NellyMoserDecodeContext {
|
||||
float scale_bias;
|
||||
DSPContext dsp;
|
||||
FFTContext imdct_ctx;
|
||||
FmtConvertContext fmt_conv;
|
||||
DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
|
||||
} NellyMoserDecodeContext;
|
||||
|
||||
@ -134,6 +136,7 @@ static av_cold int decode_init(AVCodecContext * avctx) {
|
||||
ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0);
|
||||
|
||||
dsputil_init(&s->dsp, avctx);
|
||||
ff_fmt_convert_init(&s->fmt_conv, avctx);
|
||||
|
||||
s->scale_bias = 1.0/(1*8);
|
||||
|
||||
@ -175,7 +178,7 @@ static int decode_tag(AVCodecContext * avctx,
|
||||
|
||||
for (i=0 ; i<blocks ; i++) {
|
||||
nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf);
|
||||
s->dsp.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
|
||||
s->fmt_conv.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
|
||||
*data_size += NELLY_SAMPLES*sizeof(int16_t);
|
||||
}
|
||||
|
||||
|
@ -21,6 +21,7 @@ ALTIVEC-OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o \
|
||||
OBJS-$(HAVE_ALTIVEC) += ppc/dsputil_altivec.o \
|
||||
ppc/fdct_altivec.o \
|
||||
ppc/float_altivec.o \
|
||||
ppc/fmtconvert_altivec.o \
|
||||
ppc/gmc_altivec.o \
|
||||
ppc/idct_altivec.o \
|
||||
ppc/int_altivec.o \
|
||||
|
@ -122,124 +122,12 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa
|
||||
}
|
||||
}
|
||||
|
||||
static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
|
||||
{
|
||||
union {
|
||||
vector float v;
|
||||
float s[4];
|
||||
} mul_u;
|
||||
int i;
|
||||
vector float src1, src2, dst1, dst2, mul_v, zero;
|
||||
|
||||
zero = (vector float)vec_splat_u32(0);
|
||||
mul_u.s[0] = mul;
|
||||
mul_v = vec_splat(mul_u.v, 0);
|
||||
|
||||
for(i=0; i<len; i+=8) {
|
||||
src1 = vec_ctf(vec_ld(0, src+i), 0);
|
||||
src2 = vec_ctf(vec_ld(16, src+i), 0);
|
||||
dst1 = vec_madd(src1, mul_v, zero);
|
||||
dst2 = vec_madd(src2, mul_v, zero);
|
||||
vec_st(dst1, 0, dst+i);
|
||||
vec_st(dst2, 16, dst+i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static vector signed short
|
||||
float_to_int16_one_altivec(const float *src)
|
||||
{
|
||||
vector float s0 = vec_ld(0, src);
|
||||
vector float s1 = vec_ld(16, src);
|
||||
vector signed int t0 = vec_cts(s0, 0);
|
||||
vector signed int t1 = vec_cts(s1, 0);
|
||||
return vec_packs(t0,t1);
|
||||
}
|
||||
|
||||
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
|
||||
{
|
||||
int i;
|
||||
vector signed short d0, d1, d;
|
||||
vector unsigned char align;
|
||||
if(((long)dst)&15) //FIXME
|
||||
for(i=0; i<len-7; i+=8) {
|
||||
d0 = vec_ld(0, dst+i);
|
||||
d = float_to_int16_one_altivec(src+i);
|
||||
d1 = vec_ld(15, dst+i);
|
||||
d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
|
||||
align = vec_lvsr(0, dst+i);
|
||||
d0 = vec_perm(d1, d, align);
|
||||
d1 = vec_perm(d, d1, align);
|
||||
vec_st(d0, 0, dst+i);
|
||||
vec_st(d1,15, dst+i);
|
||||
}
|
||||
else
|
||||
for(i=0; i<len-7; i+=8) {
|
||||
d = float_to_int16_one_altivec(src+i);
|
||||
vec_st(d, 0, dst+i);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
float_to_int16_interleave_altivec(int16_t *dst, const float **src,
|
||||
long len, int channels)
|
||||
{
|
||||
int i;
|
||||
vector signed short d0, d1, d2, c0, c1, t0, t1;
|
||||
vector unsigned char align;
|
||||
if(channels == 1)
|
||||
float_to_int16_altivec(dst, src[0], len);
|
||||
else
|
||||
if (channels == 2) {
|
||||
if(((long)dst)&15)
|
||||
for(i=0; i<len-7; i+=8) {
|
||||
d0 = vec_ld(0, dst + i);
|
||||
t0 = float_to_int16_one_altivec(src[0] + i);
|
||||
d1 = vec_ld(31, dst + i);
|
||||
t1 = float_to_int16_one_altivec(src[1] + i);
|
||||
c0 = vec_mergeh(t0, t1);
|
||||
c1 = vec_mergel(t0, t1);
|
||||
d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
|
||||
align = vec_lvsr(0, dst + i);
|
||||
d0 = vec_perm(d2, c0, align);
|
||||
d1 = vec_perm(c0, c1, align);
|
||||
vec_st(d0, 0, dst + i);
|
||||
d0 = vec_perm(c1, d2, align);
|
||||
vec_st(d1, 15, dst + i);
|
||||
vec_st(d0, 31, dst + i);
|
||||
dst+=8;
|
||||
}
|
||||
else
|
||||
for(i=0; i<len-7; i+=8) {
|
||||
t0 = float_to_int16_one_altivec(src[0] + i);
|
||||
t1 = float_to_int16_one_altivec(src[1] + i);
|
||||
d0 = vec_mergeh(t0, t1);
|
||||
d1 = vec_mergel(t0, t1);
|
||||
vec_st(d0, 0, dst + i);
|
||||
vec_st(d1, 16, dst + i);
|
||||
dst+=8;
|
||||
}
|
||||
} else {
|
||||
DECLARE_ALIGNED(16, int16_t, tmp)[len];
|
||||
int c, j;
|
||||
for (c = 0; c < channels; c++) {
|
||||
float_to_int16_altivec(tmp, src[c], len);
|
||||
for (i = 0, j = c; i < len; i++, j+=channels) {
|
||||
dst[j] = tmp[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
|
||||
{
|
||||
c->vector_fmul = vector_fmul_altivec;
|
||||
c->vector_fmul_reverse = vector_fmul_reverse_altivec;
|
||||
c->vector_fmul_add = vector_fmul_add_altivec;
|
||||
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
|
||||
if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
|
||||
c->vector_fmul_window = vector_fmul_window_altivec;
|
||||
c->float_to_int16 = float_to_int16_altivec;
|
||||
c->float_to_int16_interleave = float_to_int16_interleave_altivec;
|
||||
}
|
||||
}
|
||||
|
142
libavcodec/ppc/fmtconvert_altivec.c
Normal file
142
libavcodec/ppc/fmtconvert_altivec.c
Normal file
@ -0,0 +1,142 @@
|
||||
/*
|
||||
* Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavcodec/fmtconvert.h"
|
||||
|
||||
#include "dsputil_altivec.h"
|
||||
#include "util_altivec.h"
|
||||
|
||||
static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
|
||||
{
|
||||
union {
|
||||
vector float v;
|
||||
float s[4];
|
||||
} mul_u;
|
||||
int i;
|
||||
vector float src1, src2, dst1, dst2, mul_v, zero;
|
||||
|
||||
zero = (vector float)vec_splat_u32(0);
|
||||
mul_u.s[0] = mul;
|
||||
mul_v = vec_splat(mul_u.v, 0);
|
||||
|
||||
for(i=0; i<len; i+=8) {
|
||||
src1 = vec_ctf(vec_ld(0, src+i), 0);
|
||||
src2 = vec_ctf(vec_ld(16, src+i), 0);
|
||||
dst1 = vec_madd(src1, mul_v, zero);
|
||||
dst2 = vec_madd(src2, mul_v, zero);
|
||||
vec_st(dst1, 0, dst+i);
|
||||
vec_st(dst2, 16, dst+i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static vector signed short
|
||||
float_to_int16_one_altivec(const float *src)
|
||||
{
|
||||
vector float s0 = vec_ld(0, src);
|
||||
vector float s1 = vec_ld(16, src);
|
||||
vector signed int t0 = vec_cts(s0, 0);
|
||||
vector signed int t1 = vec_cts(s1, 0);
|
||||
return vec_packs(t0,t1);
|
||||
}
|
||||
|
||||
/**
 * AltiVec version of FmtConvertContext.float_to_int16.
 *
 * Converts 8 samples per iteration with float_to_int16_one_altivec().
 * Only whole groups of 8 are processed (loop bound is len-7), so a
 * trailing len % 8 samples would be left unconverted.
 * A dst that is not 16-byte aligned is handled by a separate, slower path.
 */
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
|
||||
{
|
||||
/* NOTE(review): i is int while len is long -- confirm callers never pass
 * a length above INT_MAX. */
int i;
|
||||
vector signed short d0, d1, d;
|
||||
vector unsigned char align;
|
||||
/* Unaligned destination: the 16 output bytes straddle two aligned vectors,
 * so both are read, merged with the new samples and written back. */
if(((long)dst)&15) //FIXME
|
||||
for(i=0; i<len-7; i+=8) {
|
||||
/* existing contents of the aligned vector holding the first output byte */
d0 = vec_ld(0, dst+i);
|
||||
d = float_to_int16_one_altivec(src+i);
|
||||
/* existing contents of the aligned vector holding the last output byte */
d1 = vec_ld(15, dst+i);
|
||||
d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
|
||||
align = vec_lvsr(0, dst+i);
|
||||
/* build the two aligned vectors to store from the old data and the new samples */
d0 = vec_perm(d1, d, align);
|
||||
d1 = vec_perm(d, d1, align);
|
||||
vec_st(d0, 0, dst+i);
|
||||
vec_st(d1,15, dst+i);
|
||||
}
|
||||
else
|
||||
/* Aligned destination: one vector store per group of 8 samples. */
for(i=0; i<len-7; i+=8) {
|
||||
d = float_to_int16_one_altivec(src+i);
|
||||
vec_st(d, 0, dst+i);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * AltiVec version of FmtConvertContext.float_to_int16_interleave.
 *
 * - 1 channel:  forwarded to float_to_int16_altivec().
 * - 2 channels: both channels are converted 8 samples at a time and
 *               interleaved with vec_mergeh()/vec_mergel(); aligned and
 *               unaligned destinations have separate loops.
 * - otherwise:  each channel is converted into a temporary buffer and
 *               then copied to dst with a stride of channels.
 */
static void
|
||||
float_to_int16_interleave_altivec(int16_t *dst, const float **src,
|
||||
long len, int channels)
|
||||
{
|
||||
int i;
|
||||
vector signed short d0, d1, d2, c0, c1, t0, t1;
|
||||
vector unsigned char align;
|
||||
if(channels == 1)
|
||||
float_to_int16_altivec(dst, src[0], len);
|
||||
else
|
||||
if (channels == 2) {
|
||||
/* Unaligned destination: the 32 output bytes touch three aligned vectors;
 * the first and last are read so their bytes outside the output can be
 * merged back in before storing. */
if(((long)dst)&15)
|
||||
for(i=0; i<len-7; i+=8) {
|
||||
d0 = vec_ld(0, dst + i);
|
||||
t0 = float_to_int16_one_altivec(src[0] + i);
|
||||
d1 = vec_ld(31, dst + i);
|
||||
t1 = float_to_int16_one_altivec(src[1] + i);
|
||||
/* interleave the two channels: c0 holds the first 4 sample pairs, c1 the last 4 */
c0 = vec_mergeh(t0, t1);
|
||||
c1 = vec_mergel(t0, t1);
|
||||
d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
|
||||
align = vec_lvsr(0, dst + i);
|
||||
d0 = vec_perm(d2, c0, align);
|
||||
d1 = vec_perm(c0, c1, align);
|
||||
vec_st(d0, 0, dst + i);
|
||||
d0 = vec_perm(c1, d2, align);
|
||||
vec_st(d1, 15, dst + i);
|
||||
vec_st(d0, 31, dst + i);
|
||||
/* i advances by 8 and dst by 8 more, so the output position moves by
 * 16 int16 values for every 8 samples per channel */
dst+=8;
|
||||
}
|
||||
else
|
||||
/* Aligned destination: two vector stores per 8 samples per channel. */
for(i=0; i<len-7; i+=8) {
|
||||
t0 = float_to_int16_one_altivec(src[0] + i);
|
||||
t1 = float_to_int16_one_altivec(src[1] + i);
|
||||
d0 = vec_mergeh(t0, t1);
|
||||
d1 = vec_mergel(t0, t1);
|
||||
vec_st(d0, 0, dst + i);
|
||||
vec_st(d1, 16, dst + i);
|
||||
/* same combined advance as in the unaligned loop */
dst+=8;
|
||||
}
|
||||
} else {
|
||||
/* NOTE(review): variable-length array sized by len -- its size is
 * bounded only by what callers pass. */
DECLARE_ALIGNED(16, int16_t, tmp)[len];
|
||||
int c, j;
|
||||
for (c = 0; c < channels; c++) {
|
||||
/* convert one whole channel, then scatter it into its interleaved slots */
float_to_int16_altivec(tmp, src[c], len);
|
||||
for (i = 0, j = c; i < len; i++, j+=channels) {
|
||||
dst[j] = tmp[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Install the AltiVec conversion routines into c.
 *
 * The int32-to-float routine is always installed; the float-to-int16
 * routines are left untouched when CODEC_FLAG_BITEXACT is set in
 * avctx->flags.
 */
void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx)
{
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;

    if (avctx->flags & CODEC_FLAG_BITEXACT)
        return;

    c->float_to_int16_interleave = float_to_int16_interleave_altivec;
    c->float_to_int16            = float_to_int16_altivec;
}
|
@ -31,6 +31,7 @@
|
||||
#include "get_bits.h"
|
||||
#include "dsputil.h"
|
||||
#include "fft.h"
|
||||
#include "fmtconvert.h"
|
||||
|
||||
#include "vorbis.h"
|
||||
#include "xiph.h"
|
||||
@ -127,6 +128,7 @@ typedef struct vorbis_context_s {
|
||||
AVCodecContext *avccontext;
|
||||
GetBitContext gb;
|
||||
DSPContext dsp;
|
||||
FmtConvertContext fmt_conv;
|
||||
|
||||
FFTContext mdct[2];
|
||||
uint_fast8_t first_frame;
|
||||
@ -961,6 +963,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext)
|
||||
|
||||
vc->avccontext = avccontext;
|
||||
dsputil_init(&vc->dsp, avccontext);
|
||||
ff_fmt_convert_init(&vc->fmt_conv, avccontext);
|
||||
|
||||
vc->scale_bias = 32768.0f;
|
||||
|
||||
@ -1636,7 +1639,8 @@ static int vorbis_decode_frame(AVCodecContext *avccontext,
|
||||
len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i];
|
||||
}
|
||||
|
||||
vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels);
|
||||
vc->fmt_conv.float_to_int16_interleave(data, channel_ptrs, len,
|
||||
vc->audio_channels);
|
||||
*data_size = len * 2 * vc->audio_channels;
|
||||
|
||||
return buf_size ;
|
||||
|
@ -126,6 +126,7 @@ int ff_wma_init(AVCodecContext *avctx, int flags2)
|
||||
s->block_align = avctx->block_align;
|
||||
|
||||
dsputil_init(&s->dsp, avctx);
|
||||
ff_fmt_convert_init(&s->fmt_conv, avctx);
|
||||
|
||||
if (avctx->codec->id == CODEC_ID_WMAV1) {
|
||||
s->version = 1;
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include "put_bits.h"
|
||||
#include "dsputil.h"
|
||||
#include "fft.h"
|
||||
#include "fmtconvert.h"
|
||||
|
||||
/* size of blocks */
|
||||
#define BLOCK_MIN_BITS 7
|
||||
@ -134,6 +135,7 @@ typedef struct WMACodecContext {
|
||||
float lsp_pow_m_table1[(1 << LSP_POW_BITS)];
|
||||
float lsp_pow_m_table2[(1 << LSP_POW_BITS)];
|
||||
DSPContext dsp;
|
||||
FmtConvertContext fmt_conv;
|
||||
|
||||
#ifdef TRACE
|
||||
int frame_count;
|
||||
|
@ -791,7 +791,7 @@ static int wma_decode_frame(WMACodecContext *s, int16_t *samples)
|
||||
incr = s->nb_channels;
|
||||
for (ch = 0; ch < MAX_CHANNELS; ch++)
|
||||
output[ch] = s->frame_out[ch];
|
||||
s->dsp.float_to_int16_interleave(samples, output, n, incr);
|
||||
s->fmt_conv.float_to_int16_interleave(samples, output, n, incr);
|
||||
for (ch = 0; ch < incr; ch++) {
|
||||
/* prepare for next block */
|
||||
memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float));
|
||||
|
@ -39,6 +39,7 @@ YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o
|
||||
MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o
|
||||
MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \
|
||||
x86/deinterlace.o \
|
||||
x86/fmtconvert.o \
|
||||
x86/h264_chromamc.o \
|
||||
$(YASM-OBJS-yes)
|
||||
|
||||
@ -47,6 +48,7 @@ MMX-OBJS-$(CONFIG_FFT) += x86/fft.o
|
||||
OBJS-$(HAVE_MMX) += x86/dnxhd_mmx.o \
|
||||
x86/dsputil_mmx.o \
|
||||
x86/fdct_mmx.o \
|
||||
x86/fmtconvert_mmx.o \
|
||||
x86/idct_mmx_xvid.o \
|
||||
x86/idct_sse2_xvid.o \
|
||||
x86/motion_est_mmx.o \
|
||||
|
@ -2349,50 +2349,6 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s
|
||||
}
|
||||
#endif /* HAVE_6REGS */
|
||||
|
||||
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
|
||||
{
|
||||
x86_reg i = -4*len;
|
||||
__asm__ volatile(
|
||||
"movss %3, %%xmm4 \n"
|
||||
"shufps $0, %%xmm4, %%xmm4 \n"
|
||||
"1: \n"
|
||||
"cvtpi2ps (%2,%0), %%xmm0 \n"
|
||||
"cvtpi2ps 8(%2,%0), %%xmm1 \n"
|
||||
"cvtpi2ps 16(%2,%0), %%xmm2 \n"
|
||||
"cvtpi2ps 24(%2,%0), %%xmm3 \n"
|
||||
"movlhps %%xmm1, %%xmm0 \n"
|
||||
"movlhps %%xmm3, %%xmm2 \n"
|
||||
"mulps %%xmm4, %%xmm0 \n"
|
||||
"mulps %%xmm4, %%xmm2 \n"
|
||||
"movaps %%xmm0, (%1,%0) \n"
|
||||
"movaps %%xmm2, 16(%1,%0) \n"
|
||||
"add $32, %0 \n"
|
||||
"jl 1b \n"
|
||||
:"+r"(i)
|
||||
:"r"(dst+len), "r"(src+len), "m"(mul)
|
||||
);
|
||||
}
|
||||
|
||||
static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
|
||||
{
|
||||
x86_reg i = -4*len;
|
||||
__asm__ volatile(
|
||||
"movss %3, %%xmm4 \n"
|
||||
"shufps $0, %%xmm4, %%xmm4 \n"
|
||||
"1: \n"
|
||||
"cvtdq2ps (%2,%0), %%xmm0 \n"
|
||||
"cvtdq2ps 16(%2,%0), %%xmm1 \n"
|
||||
"mulps %%xmm4, %%xmm0 \n"
|
||||
"mulps %%xmm4, %%xmm1 \n"
|
||||
"movaps %%xmm0, (%1,%0) \n"
|
||||
"movaps %%xmm1, 16(%1,%0) \n"
|
||||
"add $32, %0 \n"
|
||||
"jl 1b \n"
|
||||
:"+r"(i)
|
||||
:"r"(dst+len), "r"(src+len), "m"(mul)
|
||||
);
|
||||
}
|
||||
|
||||
static void vector_clipf_sse(float *dst, const float *src, float min, float max,
|
||||
int len)
|
||||
{
|
||||
@ -2427,70 +2383,6 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max,
|
||||
);
|
||||
}
|
||||
|
||||
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
|
||||
x86_reg reglen = len;
|
||||
// not bit-exact: pf2id uses different rounding than C and SSE
|
||||
__asm__ volatile(
|
||||
"add %0 , %0 \n\t"
|
||||
"lea (%2,%0,2) , %2 \n\t"
|
||||
"add %0 , %1 \n\t"
|
||||
"neg %0 \n\t"
|
||||
"1: \n\t"
|
||||
"pf2id (%2,%0,2) , %%mm0 \n\t"
|
||||
"pf2id 8(%2,%0,2) , %%mm1 \n\t"
|
||||
"pf2id 16(%2,%0,2) , %%mm2 \n\t"
|
||||
"pf2id 24(%2,%0,2) , %%mm3 \n\t"
|
||||
"packssdw %%mm1 , %%mm0 \n\t"
|
||||
"packssdw %%mm3 , %%mm2 \n\t"
|
||||
"movq %%mm0 , (%1,%0) \n\t"
|
||||
"movq %%mm2 , 8(%1,%0) \n\t"
|
||||
"add $16 , %0 \n\t"
|
||||
" js 1b \n\t"
|
||||
"femms \n\t"
|
||||
:"+r"(reglen), "+r"(dst), "+r"(src)
|
||||
);
|
||||
}
|
||||
static void float_to_int16_sse(int16_t *dst, const float *src, long len){
|
||||
x86_reg reglen = len;
|
||||
__asm__ volatile(
|
||||
"add %0 , %0 \n\t"
|
||||
"lea (%2,%0,2) , %2 \n\t"
|
||||
"add %0 , %1 \n\t"
|
||||
"neg %0 \n\t"
|
||||
"1: \n\t"
|
||||
"cvtps2pi (%2,%0,2) , %%mm0 \n\t"
|
||||
"cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
|
||||
"cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
|
||||
"cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
|
||||
"packssdw %%mm1 , %%mm0 \n\t"
|
||||
"packssdw %%mm3 , %%mm2 \n\t"
|
||||
"movq %%mm0 , (%1,%0) \n\t"
|
||||
"movq %%mm2 , 8(%1,%0) \n\t"
|
||||
"add $16 , %0 \n\t"
|
||||
" js 1b \n\t"
|
||||
"emms \n\t"
|
||||
:"+r"(reglen), "+r"(dst), "+r"(src)
|
||||
);
|
||||
}
|
||||
|
||||
static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
|
||||
x86_reg reglen = len;
|
||||
__asm__ volatile(
|
||||
"add %0 , %0 \n\t"
|
||||
"lea (%2,%0,2) , %2 \n\t"
|
||||
"add %0 , %1 \n\t"
|
||||
"neg %0 \n\t"
|
||||
"1: \n\t"
|
||||
"cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
|
||||
"cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
|
||||
"packssdw %%xmm1 , %%xmm0 \n\t"
|
||||
"movdqa %%xmm0 , (%1,%0) \n\t"
|
||||
"add $16 , %0 \n\t"
|
||||
" js 1b \n\t"
|
||||
:"+r"(reglen), "+r"(dst), "+r"(src)
|
||||
);
|
||||
}
|
||||
|
||||
void ff_vp3_idct_mmx(int16_t *input_data);
|
||||
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
@ -2504,9 +2396,6 @@ void ff_vp3_idct_sse2(int16_t *input_data);
|
||||
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
|
||||
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
|
||||
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
|
||||
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
|
||||
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
|
||||
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
|
||||
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
|
||||
@ -2516,102 +2405,6 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const
|
||||
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
|
||||
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
|
||||
|
||||
#if !HAVE_YASM
|
||||
#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
|
||||
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
|
||||
#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
|
||||
#endif
|
||||
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
|
||||
|
||||
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
|
||||
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
|
||||
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
|
||||
DECLARE_ALIGNED(16, int16_t, tmp)[len];\
|
||||
int i,j,c;\
|
||||
for(c=0; c<channels; c++){\
|
||||
float_to_int16_##cpu(tmp, src[c], len);\
|
||||
for(i=0, j=c; i<len; i++, j+=channels)\
|
||||
dst[j] = tmp[i];\
|
||||
}\
|
||||
}\
|
||||
\
|
||||
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
|
||||
if(channels==1)\
|
||||
float_to_int16_##cpu(dst, src[0], len);\
|
||||
else if(channels==2){\
|
||||
x86_reg reglen = len; \
|
||||
const float *src0 = src[0];\
|
||||
const float *src1 = src[1];\
|
||||
__asm__ volatile(\
|
||||
"shl $2, %0 \n"\
|
||||
"add %0, %1 \n"\
|
||||
"add %0, %2 \n"\
|
||||
"add %0, %3 \n"\
|
||||
"neg %0 \n"\
|
||||
body\
|
||||
:"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
|
||||
);\
|
||||
}else if(channels==6){\
|
||||
ff_float_to_int16_interleave6_##cpu(dst, src, len);\
|
||||
}else\
|
||||
float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
|
||||
}
|
||||
|
||||
FLOAT_TO_INT16_INTERLEAVE(3dnow,
|
||||
"1: \n"
|
||||
"pf2id (%2,%0), %%mm0 \n"
|
||||
"pf2id 8(%2,%0), %%mm1 \n"
|
||||
"pf2id (%3,%0), %%mm2 \n"
|
||||
"pf2id 8(%3,%0), %%mm3 \n"
|
||||
"packssdw %%mm1, %%mm0 \n"
|
||||
"packssdw %%mm3, %%mm2 \n"
|
||||
"movq %%mm0, %%mm1 \n"
|
||||
"punpcklwd %%mm2, %%mm0 \n"
|
||||
"punpckhwd %%mm2, %%mm1 \n"
|
||||
"movq %%mm0, (%1,%0)\n"
|
||||
"movq %%mm1, 8(%1,%0)\n"
|
||||
"add $16, %0 \n"
|
||||
"js 1b \n"
|
||||
"femms \n"
|
||||
)
|
||||
|
||||
FLOAT_TO_INT16_INTERLEAVE(sse,
|
||||
"1: \n"
|
||||
"cvtps2pi (%2,%0), %%mm0 \n"
|
||||
"cvtps2pi 8(%2,%0), %%mm1 \n"
|
||||
"cvtps2pi (%3,%0), %%mm2 \n"
|
||||
"cvtps2pi 8(%3,%0), %%mm3 \n"
|
||||
"packssdw %%mm1, %%mm0 \n"
|
||||
"packssdw %%mm3, %%mm2 \n"
|
||||
"movq %%mm0, %%mm1 \n"
|
||||
"punpcklwd %%mm2, %%mm0 \n"
|
||||
"punpckhwd %%mm2, %%mm1 \n"
|
||||
"movq %%mm0, (%1,%0)\n"
|
||||
"movq %%mm1, 8(%1,%0)\n"
|
||||
"add $16, %0 \n"
|
||||
"js 1b \n"
|
||||
"emms \n"
|
||||
)
|
||||
|
||||
FLOAT_TO_INT16_INTERLEAVE(sse2,
|
||||
"1: \n"
|
||||
"cvtps2dq (%2,%0), %%xmm0 \n"
|
||||
"cvtps2dq (%3,%0), %%xmm1 \n"
|
||||
"packssdw %%xmm1, %%xmm0 \n"
|
||||
"movhlps %%xmm0, %%xmm1 \n"
|
||||
"punpcklwd %%xmm1, %%xmm0 \n"
|
||||
"movdqa %%xmm0, (%1,%0) \n"
|
||||
"add $16, %0 \n"
|
||||
"js 1b \n"
|
||||
)
|
||||
|
||||
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
|
||||
if(channels==6)
|
||||
ff_float_to_int16_interleave6_3dn2(dst, src, len);
|
||||
else
|
||||
float_to_int16_interleave_3dnow(dst, src, len, channels);
|
||||
}
|
||||
|
||||
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
|
||||
|
||||
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
@ -2968,19 +2761,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
if(mm_flags & AV_CPU_FLAG_3DNOW){
|
||||
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
|
||||
c->vector_fmul = vector_fmul_3dnow;
|
||||
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
|
||||
c->float_to_int16 = float_to_int16_3dnow;
|
||||
c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
|
||||
}
|
||||
}
|
||||
if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
|
||||
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
|
||||
#if HAVE_6REGS
|
||||
c->vector_fmul_window = vector_fmul_window_3dnow2;
|
||||
#endif
|
||||
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
|
||||
c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
|
||||
}
|
||||
}
|
||||
if(mm_flags & AV_CPU_FLAG_MMX2){
|
||||
#if HAVE_YASM
|
||||
@ -2997,10 +2783,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
#if HAVE_6REGS
|
||||
c->vector_fmul_window = vector_fmul_window_sse;
|
||||
#endif
|
||||
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
|
||||
c->vector_clipf = vector_clipf_sse;
|
||||
c->float_to_int16 = float_to_int16_sse;
|
||||
c->float_to_int16_interleave = float_to_int16_interleave_sse;
|
||||
#if HAVE_YASM
|
||||
c->scalarproduct_float = ff_scalarproduct_float_sse;
|
||||
#endif
|
||||
@ -3008,9 +2791,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
if(mm_flags & AV_CPU_FLAG_3DNOW)
|
||||
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
|
||||
if(mm_flags & AV_CPU_FLAG_SSE2){
|
||||
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
|
||||
c->float_to_int16 = float_to_int16_sse2;
|
||||
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
|
||||
#if HAVE_YASM
|
||||
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
|
||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
|
||||
|
@ -30,75 +30,6 @@ pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
|
||||
|
||||
section .text align=16
|
||||
|
||||
%macro PSWAPD_SSE 2
|
||||
pshufw %1, %2, 0x4e
|
||||
%endmacro
|
||||
%macro PSWAPD_3DN1 2
|
||||
movq %1, %2
|
||||
psrlq %1, 32
|
||||
punpckldq %1, %2
|
||||
%endmacro
|
||||
|
||||
%macro FLOAT_TO_INT16_INTERLEAVE6 1
|
||||
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
|
||||
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
|
||||
%ifdef ARCH_X86_64
|
||||
%define lend r10d
|
||||
mov lend, r2d
|
||||
%else
|
||||
%define lend dword r2m
|
||||
%endif
|
||||
mov src1q, [srcq+1*gprsize]
|
||||
mov src2q, [srcq+2*gprsize]
|
||||
mov src3q, [srcq+3*gprsize]
|
||||
mov src4q, [srcq+4*gprsize]
|
||||
mov src5q, [srcq+5*gprsize]
|
||||
mov srcq, [srcq]
|
||||
sub src1q, srcq
|
||||
sub src2q, srcq
|
||||
sub src3q, srcq
|
||||
sub src4q, srcq
|
||||
sub src5q, srcq
|
||||
.loop:
|
||||
cvtps2pi mm0, [srcq]
|
||||
cvtps2pi mm1, [srcq+src1q]
|
||||
cvtps2pi mm2, [srcq+src2q]
|
||||
cvtps2pi mm3, [srcq+src3q]
|
||||
cvtps2pi mm4, [srcq+src4q]
|
||||
cvtps2pi mm5, [srcq+src5q]
|
||||
packssdw mm0, mm3
|
||||
packssdw mm1, mm4
|
||||
packssdw mm2, mm5
|
||||
pswapd mm3, mm0
|
||||
punpcklwd mm0, mm1
|
||||
punpckhwd mm1, mm2
|
||||
punpcklwd mm2, mm3
|
||||
pswapd mm3, mm0
|
||||
punpckldq mm0, mm2
|
||||
punpckhdq mm2, mm1
|
||||
punpckldq mm1, mm3
|
||||
movq [dstq ], mm0
|
||||
movq [dstq+16], mm2
|
||||
movq [dstq+ 8], mm1
|
||||
add srcq, 8
|
||||
add dstq, 24
|
||||
sub lend, 2
|
||||
jg .loop
|
||||
emms
|
||||
RET
|
||||
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
|
||||
|
||||
%define pswapd PSWAPD_SSE
|
||||
FLOAT_TO_INT16_INTERLEAVE6 sse
|
||||
%define cvtps2pi pf2id
|
||||
%define pswapd PSWAPD_3DN1
|
||||
FLOAT_TO_INT16_INTERLEAVE6 3dnow
|
||||
%undef pswapd
|
||||
FLOAT_TO_INT16_INTERLEAVE6 3dn2
|
||||
%undef cvtps2pi
|
||||
|
||||
|
||||
|
||||
%macro SCALARPRODUCT 1
|
||||
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
|
||||
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
|
||||
|
91
libavcodec/x86/fmtconvert.asm
Normal file
91
libavcodec/x86/fmtconvert.asm
Normal file
@ -0,0 +1,91 @@
|
||||
;******************************************************************************
|
||||
;* x86 optimized Format Conversion Utils
|
||||
;* Copyright (c) 2008 Loren Merritt
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "x86inc.asm"
|
||||
|
||||
section .text align=16
|
||||
|
||||
; PSWAPD_SSE dst, src
; Writes src with its two 32-bit halves exchanged into dst, using pshufw
; with word order 0x4e (words 2,3,0,1).
%macro PSWAPD_SSE 2
|
||||
pshufw %1, %2, 0x4e
|
||||
%endmacro
|
||||
; PSWAPD_3DN1 dst, src
; Same result built from movq/psrlq/punpckldq: dst first receives src, is
; shifted so its low dword holds the old high dword, then src's low dword
; is merged in as the new high dword. dst and src must be different registers.
%macro PSWAPD_3DN1 2
|
||||
movq %1, %2
|
||||
psrlq %1, 32
|
||||
punpckldq %1, %2
|
||||
%endmacro
|
||||
|
||||
; FLOAT_TO_INT16_INTERLEAVE6 suffix
; Emits float_to_int16_interleave6_<suffix>: converts six planar float
; channels to int16 and writes them interleaved to dst. Each loop iteration
; handles two samples per channel (12 int16 values, 24 output bytes), and the
; loop body runs before len is tested, so len is expected to be a positive
; even number. The conversion and swap instructions are the cvtps2pi and
; pswapd tokens, which the includer may %define before instantiating.
%macro FLOAT_TO_INT16_INTERLEAVE6 1
|
||||
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
|
||||
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
|
||||
; only two arguments are requested from cglobal; len (third argument) is
; reached through lend: copied into r10d on x86_64, read in place otherwise
%ifdef ARCH_X86_64
|
||||
%define lend r10d
|
||||
mov lend, r2d
|
||||
%else
|
||||
%define lend dword r2m
|
||||
%endif
|
||||
; load the channel pointers src[1]..src[5], then src[0] into srcq itself
mov src1q, [srcq+1*gprsize]
|
||||
mov src2q, [srcq+2*gprsize]
|
||||
mov src3q, [srcq+3*gprsize]
|
||||
mov src4q, [srcq+4*gprsize]
|
||||
mov src5q, [srcq+5*gprsize]
|
||||
mov srcq, [srcq]
|
||||
; turn src1..src5 into offsets relative to channel 0 so that a single
; pointer increment advances all six channels
sub src1q, srcq
|
||||
sub src2q, srcq
|
||||
sub src3q, srcq
|
||||
sub src4q, srcq
|
||||
sub src5q, srcq
|
||||
.loop:
|
||||
; convert two floats from each of the six channels
cvtps2pi mm0, [srcq]
|
||||
cvtps2pi mm1, [srcq+src1q]
|
||||
cvtps2pi mm2, [srcq+src2q]
|
||||
cvtps2pi mm3, [srcq+src3q]
|
||||
cvtps2pi mm4, [srcq+src4q]
|
||||
cvtps2pi mm5, [srcq+src5q]
|
||||
; narrow to int16 with signed saturation, pairing channels 0/3, 1/4, 2/5
packssdw mm0, mm3
|
||||
packssdw mm1, mm4
|
||||
packssdw mm2, mm5
|
||||
; shuffle the packed words into channel-interleaved order
pswapd mm3, mm0
|
||||
punpcklwd mm0, mm1
|
||||
punpckhwd mm1, mm2
|
||||
punpcklwd mm2, mm3
|
||||
pswapd mm3, mm0
|
||||
punpckldq mm0, mm2
|
||||
punpckhdq mm2, mm1
|
||||
punpckldq mm1, mm3
|
||||
; store the 24 output bytes (mm0 -> +0, mm1 -> +8, mm2 -> +16)
movq [dstq ], mm0
|
||||
movq [dstq+16], mm2
|
||||
movq [dstq+ 8], mm1
|
||||
; advance by two input floats per channel and two output frames
add srcq, 8
|
||||
add dstq, 24
|
||||
sub lend, 2
|
||||
jg .loop
|
||||
emms
|
||||
RET
|
||||
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
|
||||
|
||||
; sse variant: real cvtps2pi, pswapd provided by the pshufw-based macro
%define pswapd PSWAPD_SSE
|
||||
FLOAT_TO_INT16_INTERLEAVE6 sse
|
||||
; 3dnow variant: conversion replaced by pf2id, pswapd emulated by PSWAPD_3DN1
%define cvtps2pi pf2id
|
||||
%define pswapd PSWAPD_3DN1
|
||||
FLOAT_TO_INT16_INTERLEAVE6 3dnow
|
||||
; 3dn2 variant: pswapd is no longer a macro, so the instruction itself is
; assembled; conversion is still pf2id
%undef pswapd
|
||||
FLOAT_TO_INT16_INTERLEAVE6 3dn2
|
||||
%undef cvtps2pi
|
266
libavcodec/x86/fmtconvert_mmx.c
Normal file
266
libavcodec/x86/fmtconvert_mmx.c
Normal file
@ -0,0 +1,266 @@
|
||||
/*
|
||||
* Format Conversion Utils
|
||||
* Copyright (c) 2000, 2001 Fabrice Bellard
|
||||
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
|
||||
*/
|
||||
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/x86_cpu.h"
|
||||
#include "libavcodec/fmtconvert.h"
|
||||
|
||||
/**
 * Convert 32-bit integers to float and scale them by a constant (SSE).
 *
 * Stores (float)src[k] * mul into dst[k]. Each loop iteration consumes
 * 32 bytes of input (8 elements) and the loop body runs before the counter
 * is tested, so len is expected to be a positive multiple of 8. dst is
 * written with movaps and therefore has to be 16-byte aligned.
 *
 * NOTE(review): the asm statement declares no clobbers (xmm0-xmm4, memory).
 *
 * @param dst destination float buffer
 * @param src source integer buffer
 * @param mul factor every converted value is multiplied by
 * @param len number of elements
 */
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
|
||||
{
|
||||
/* negative byte offset relative to the end of both buffers; counts up to 0 */
x86_reg i = -4*len;
|
||||
__asm__ volatile(
|
||||
/* broadcast mul into all four lanes of xmm4 */
"movss %3, %%xmm4 \n"
|
||||
"shufps $0, %%xmm4, %%xmm4 \n"
|
||||
"1: \n"
|
||||
/* each cvtpi2ps converts two integers into the low half of its register */
"cvtpi2ps (%2,%0), %%xmm0 \n"
|
||||
"cvtpi2ps 8(%2,%0), %%xmm1 \n"
|
||||
"cvtpi2ps 16(%2,%0), %%xmm2 \n"
|
||||
"cvtpi2ps 24(%2,%0), %%xmm3 \n"
|
||||
/* merge the pairs into two full 4-float vectors */
"movlhps %%xmm1, %%xmm0 \n"
|
||||
"movlhps %%xmm3, %%xmm2 \n"
|
||||
"mulps %%xmm4, %%xmm0 \n"
|
||||
"mulps %%xmm4, %%xmm2 \n"
|
||||
"movaps %%xmm0, (%1,%0) \n"
|
||||
"movaps %%xmm2, 16(%1,%0) \n"
|
||||
"add $32, %0 \n"
|
||||
"jl 1b \n"
|
||||
:"+r"(i)
|
||||
:"r"(dst+len), "r"(src+len), "m"(mul)
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Convert 32-bit integers to float and scale them by a constant (SSE2).
 *
 * Same contract as the SSE version, but converts four integers at a time
 * with cvtdq2ps. Each iteration handles 32 bytes (8 elements) and the loop
 * body runs before the counter is tested, so len is expected to be a
 * positive multiple of 8. src is used as a direct memory operand of
 * cvtdq2ps and dst is stored with movaps, so both need 16-byte alignment.
 *
 * NOTE(review): the asm statement declares no clobbers (xmm0, xmm1, xmm4,
 * memory).
 *
 * @param dst destination float buffer
 * @param src source integer buffer
 * @param mul factor every converted value is multiplied by
 * @param len number of elements
 */
static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
|
||||
{
|
||||
/* negative byte offset relative to the end of both buffers; counts up to 0 */
x86_reg i = -4*len;
|
||||
__asm__ volatile(
|
||||
/* broadcast mul into all four lanes of xmm4 */
"movss %3, %%xmm4 \n"
|
||||
"shufps $0, %%xmm4, %%xmm4 \n"
|
||||
"1: \n"
|
||||
"cvtdq2ps (%2,%0), %%xmm0 \n"
|
||||
"cvtdq2ps 16(%2,%0), %%xmm1 \n"
|
||||
"mulps %%xmm4, %%xmm0 \n"
|
||||
"mulps %%xmm4, %%xmm1 \n"
|
||||
"movaps %%xmm0, (%1,%0) \n"
|
||||
"movaps %%xmm1, 16(%1,%0) \n"
|
||||
"add $32, %0 \n"
|
||||
"jl 1b \n"
|
||||
:"+r"(i)
|
||||
:"r"(dst+len), "r"(src+len), "m"(mul)
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Convert floats to int16 using 3DNow! (pf2id).
 *
 * Values are converted to 32-bit integers and narrowed with packssdw
 * (signed saturation). Each iteration converts 8 samples and the loop body
 * runs before the counter is tested, so len is expected to be a positive
 * multiple of 8. MMX state is released with femms before returning.
 *
 * @param dst destination int16 buffer
 * @param src source float buffer
 * @param len number of samples
 */
static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
|
||||
x86_reg reglen = len;
|
||||
// not bit-exact: pf2id uses different rounding than C and SSE
|
||||
__asm__ volatile(
|
||||
/* reglen becomes 2*len (output bytes); src and dst are moved to their
   ends and reglen is negated so it can count up to zero */
"add %0 , %0 \n\t"
|
||||
"lea (%2,%0,2) , %2 \n\t"
|
||||
"add %0 , %1 \n\t"
|
||||
"neg %0 \n\t"
|
||||
"1: \n\t"
|
||||
"pf2id (%2,%0,2) , %%mm0 \n\t"
|
||||
"pf2id 8(%2,%0,2) , %%mm1 \n\t"
|
||||
"pf2id 16(%2,%0,2) , %%mm2 \n\t"
|
||||
"pf2id 24(%2,%0,2) , %%mm3 \n\t"
|
||||
"packssdw %%mm1 , %%mm0 \n\t"
|
||||
"packssdw %%mm3 , %%mm2 \n\t"
|
||||
"movq %%mm0 , (%1,%0) \n\t"
|
||||
"movq %%mm2 , 8(%1,%0) \n\t"
|
||||
"add $16 , %0 \n\t"
|
||||
" js 1b \n\t"
|
||||
"femms \n\t"
|
||||
:"+r"(reglen), "+r"(dst), "+r"(src)
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Convert floats to int16 using SSE (cvtps2pi into MMX registers).
 *
 * Values are converted to 32-bit integers and narrowed with packssdw
 * (signed saturation). Each iteration converts 8 samples and the loop body
 * runs before the counter is tested, so len is expected to be a positive
 * multiple of 8. MMX state is released with emms before returning.
 *
 * @param dst destination int16 buffer
 * @param src source float buffer
 * @param len number of samples
 */
static void float_to_int16_sse(int16_t *dst, const float *src, long len){
|
||||
x86_reg reglen = len;
|
||||
__asm__ volatile(
|
||||
/* reglen becomes 2*len (output bytes); src and dst are moved to their
   ends and reglen is negated so it can count up to zero */
"add %0 , %0 \n\t"
|
||||
"lea (%2,%0,2) , %2 \n\t"
|
||||
"add %0 , %1 \n\t"
|
||||
"neg %0 \n\t"
|
||||
"1: \n\t"
|
||||
"cvtps2pi (%2,%0,2) , %%mm0 \n\t"
|
||||
"cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
|
||||
"cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
|
||||
"cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
|
||||
"packssdw %%mm1 , %%mm0 \n\t"
|
||||
"packssdw %%mm3 , %%mm2 \n\t"
|
||||
"movq %%mm0 , (%1,%0) \n\t"
|
||||
"movq %%mm2 , 8(%1,%0) \n\t"
|
||||
"add $16 , %0 \n\t"
|
||||
" js 1b \n\t"
|
||||
"emms \n\t"
|
||||
:"+r"(reglen), "+r"(dst), "+r"(src)
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Convert floats to int16 using SSE2 (cvtps2dq, XMM registers only).
 *
 * Each iteration converts 8 samples: two cvtps2dq conversions, one
 * saturating packssdw and one 16-byte movdqa store. The loop body runs
 * before the counter is tested, so len is expected to be a positive
 * multiple of 8. src is used as a direct memory operand of cvtps2dq and
 * dst is stored with movdqa, so both need 16-byte alignment. No MMX
 * registers are used, hence no emms.
 *
 * @param dst destination int16 buffer
 * @param src source float buffer
 * @param len number of samples
 */
static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
|
||||
x86_reg reglen = len;
|
||||
__asm__ volatile(
|
||||
/* reglen becomes 2*len (output bytes); src and dst are moved to their
   ends and reglen is negated so it can count up to zero */
"add %0 , %0 \n\t"
|
||||
"lea (%2,%0,2) , %2 \n\t"
|
||||
"add %0 , %1 \n\t"
|
||||
"neg %0 \n\t"
|
||||
"1: \n\t"
|
||||
"cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
|
||||
"cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
|
||||
"packssdw %%xmm1 , %%xmm0 \n\t"
|
||||
"movdqa %%xmm0 , (%1,%0) \n\t"
|
||||
"add $16 , %0 \n\t"
|
||||
" js 1b \n\t"
|
||||
:"+r"(reglen), "+r"(dst), "+r"(src)
|
||||
);
|
||||
}
|
||||
|
||||
/* Six-channel interleaving converters defined outside this file
 * (presumably the yasm routines from x86/fmtconvert.asm -- confirm). */
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
|
||||
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
|
||||
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
|
||||
|
||||
/* Without yasm, route the six-channel case to the generic per-channel
 * fallback generated by FLOAT_TO_INT16_INTERLEAVE; the 3dn2 name reuses the
 * plain 3dnow fallback. */
#if !HAVE_YASM
|
||||
#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
|
||||
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
|
||||
#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
|
||||
#endif
|
||||
/* There is no dedicated SSE2 six-channel routine; the SSE one is reused. */
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
|
||||
|
||||
/**
 * Generate the interleaving float -> int16 converters for one CPU flavour.
 *
 * Expands to two functions:
 *  - float_to_int16_interleave_misc_<cpu>(): generic fallback. Converts one
 *    channel at a time with float_to_int16_<cpu>() into a temporary array of
 *    len int16 values on the stack, then copies it into dst with a stride of
 *    `channels`.
 *  - float_to_int16_interleave_<cpu>(): dispatcher. One channel goes straight
 *    to float_to_int16_<cpu>(), two channels run the inline asm given as
 *    `body`, six channels call ff_float_to_int16_interleave6_<cpu>(), and
 *    every other count uses the generic fallback.
 *
 * For the two-channel path the prologue turns reglen into 4*len bytes, moves
 * dst, src0 and src1 to their ends and negates reglen; `body` must supply
 * the loop (starting at label 1) that counts reglen up to zero.
 *
 * @param cpu  suffix selecting float_to_int16_<cpu> and naming the results
 * @param body asm string literals implementing the stereo loop
 */
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
|
||||
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
|
||||
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
|
||||
DECLARE_ALIGNED(16, int16_t, tmp)[len];\
|
||||
int i,j,c;\
|
||||
for(c=0; c<channels; c++){\
|
||||
float_to_int16_##cpu(tmp, src[c], len);\
|
||||
for(i=0, j=c; i<len; i++, j+=channels)\
|
||||
dst[j] = tmp[i];\
|
||||
}\
|
||||
}\
|
||||
\
|
||||
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
|
||||
if(channels==1)\
|
||||
float_to_int16_##cpu(dst, src[0], len);\
|
||||
else if(channels==2){\
|
||||
x86_reg reglen = len; \
|
||||
const float *src0 = src[0];\
|
||||
const float *src1 = src[1];\
|
||||
__asm__ volatile(\
|
||||
"shl $2, %0 \n"\
|
||||
"add %0, %1 \n"\
|
||||
"add %0, %2 \n"\
|
||||
"add %0, %3 \n"\
|
||||
"neg %0 \n"\
|
||||
body\
|
||||
:"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
|
||||
);\
|
||||
}else if(channels==6){\
|
||||
ff_float_to_int16_interleave6_##cpu(dst, src, len);\
|
||||
}else\
|
||||
float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
|
||||
}
|
||||
|
||||
/* 3DNow! instantiation. Stereo loop: converts four floats per channel with
 * pf2id, packs them to int16 with saturation, interleaves the two channels
 * with punpcklwd/punpckhwd and stores 16 bytes (four stereo frames) per
 * iteration; femms runs once after the loop. */
FLOAT_TO_INT16_INTERLEAVE(3dnow,
|
||||
"1: \n"
|
||||
"pf2id (%2,%0), %%mm0 \n"
|
||||
"pf2id 8(%2,%0), %%mm1 \n"
|
||||
"pf2id (%3,%0), %%mm2 \n"
|
||||
"pf2id 8(%3,%0), %%mm3 \n"
|
||||
"packssdw %%mm1, %%mm0 \n"
|
||||
"packssdw %%mm3, %%mm2 \n"
|
||||
"movq %%mm0, %%mm1 \n"
|
||||
"punpcklwd %%mm2, %%mm0 \n"
|
||||
"punpckhwd %%mm2, %%mm1 \n"
|
||||
"movq %%mm0, (%1,%0)\n"
|
||||
"movq %%mm1, 8(%1,%0)\n"
|
||||
"add $16, %0 \n"
|
||||
"js 1b \n"
|
||||
"femms \n"
|
||||
)
|
||||
|
||||
/* SSE instantiation. Same stereo loop shape as the 3DNow! one, with
 * cvtps2pi doing the conversion and emms releasing MMX state: four stereo
 * frames (16 output bytes) per iteration. */
FLOAT_TO_INT16_INTERLEAVE(sse,
|
||||
"1: \n"
|
||||
"cvtps2pi (%2,%0), %%mm0 \n"
|
||||
"cvtps2pi 8(%2,%0), %%mm1 \n"
|
||||
"cvtps2pi (%3,%0), %%mm2 \n"
|
||||
"cvtps2pi 8(%3,%0), %%mm3 \n"
|
||||
"packssdw %%mm1, %%mm0 \n"
|
||||
"packssdw %%mm3, %%mm2 \n"
|
||||
"movq %%mm0, %%mm1 \n"
|
||||
"punpcklwd %%mm2, %%mm0 \n"
|
||||
"punpckhwd %%mm2, %%mm1 \n"
|
||||
"movq %%mm0, (%1,%0)\n"
|
||||
"movq %%mm1, 8(%1,%0)\n"
|
||||
"add $16, %0 \n"
|
||||
"js 1b \n"
|
||||
"emms \n"
|
||||
)
|
||||
|
||||
/* SSE2 instantiation. Stereo loop: converts four floats from each channel
 * with cvtps2dq, packs both into one register (first channel in the low
 * half, second in the high half), copies the high half down with movhlps and
 * interleaves with punpcklwd. One aligned 16-byte store (four stereo frames)
 * per iteration; no MMX registers are touched. */
FLOAT_TO_INT16_INTERLEAVE(sse2,
|
||||
"1: \n"
|
||||
"cvtps2dq (%2,%0), %%xmm0 \n"
|
||||
"cvtps2dq (%3,%0), %%xmm1 \n"
|
||||
"packssdw %%xmm1, %%xmm0 \n"
|
||||
"movhlps %%xmm0, %%xmm1 \n"
|
||||
"punpcklwd %%xmm1, %%xmm0 \n"
|
||||
"movdqa %%xmm0, (%1,%0) \n"
|
||||
"add $16, %0 \n"
|
||||
"js 1b \n"
|
||||
)
|
||||
|
||||
/**
 * Interleaving float -> int16 conversion for CPUs with extended 3DNow!.
 *
 * Only the six-channel layout has a dedicated 3dn2 routine; every other
 * channel count is handed to the plain 3DNow! dispatcher.
 *
 * @param dst      interleaved int16 output
 * @param src      array of per-channel float input pointers
 * @param len      number of samples per channel
 * @param channels number of channels
 */
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels)
{
    if (channels != 6) {
        float_to_int16_interleave_3dnow(dst, src, len, channels);
        return;
    }

    ff_float_to_int16_interleave6_3dn2(dst, src, len);
}
|
||||
|
||||
/**
 * Install the x86-optimized format conversion routines into c.
 *
 * Nothing is changed unless the CPU reports MMX. The assignments run from
 * least to most preferred, so a later match overrides an earlier one:
 * 3DNow!, extended 3DNow!, SSE, then SSE2. The 3DNow! routines are not
 * bit-exact and are skipped when CODEC_FLAG_BITEXACT is set.
 *
 * @param c     conversion context whose function pointers are filled in
 * @param avctx codec context; only its flags are read
 */
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
    const int cpu_flags = av_get_cpu_flags();

    if (!(cpu_flags & AV_CPU_FLAG_MMX))
        return;

    /* avctx->flags is only consulted once a 3DNow! capability bit is set */
    if ((cpu_flags & AV_CPU_FLAG_3DNOW) &&
        !(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->float_to_int16            = float_to_int16_3dnow;
        c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
    }

    if ((cpu_flags & AV_CPU_FLAG_3DNOWEXT) &&
        !(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
    }

    if (cpu_flags & AV_CPU_FLAG_SSE) {
        c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
        c->float_to_int16             = float_to_int16_sse;
        c->float_to_int16_interleave  = float_to_int16_interleave_sse;
    }

    if (cpu_flags & AV_CPU_FLAG_SSE2) {
        c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
        c->float_to_int16             = float_to_int16_sse2;
        c->float_to_int16_interleave  = float_to_int16_interleave_sse2;
    }
}
|
Loading…
Reference in New Issue
Block a user