1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-23 12:43:46 +02:00

Merge commit 'd74a8cb7e42f703be5796eeb485f06af710ae8ca'

* commit 'd74a8cb7e42f703be5796eeb485f06af710ae8ca':
  fmtconvert: drop unused functions

Conflicts:
	libavcodec/arm/fmtconvert_vfp_armv6.S
	libavcodec/x86/fmtconvert.asm
	libavcodec/x86/fmtconvert_init.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Michael Niedermayer 2015-02-28 23:56:45 +01:00
commit 5c17377e28
10 changed files with 0 additions and 1300 deletions

View File

@ -92,7 +92,6 @@ VFP-OBJS += arm/fmtconvert_vfp.o
# subsystems
VFP-OBJS-$(CONFIG_FFT) += arm/fft_vfp.o
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp_armv6.o
# decoders/encoders
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \

View File

@ -34,11 +34,6 @@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst,
const int32_t *src, const float *mul,
int len);
void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
@ -48,18 +43,9 @@ av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_vfp;
c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_vfp;
}
if (have_armv6(cpu_flags)) {
c->float_to_int16 = ff_float_to_int16_vfp;
}
}
if (have_neon(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->float_to_int16 = ff_float_to_int16_neon;
c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
}
}
}

View File

@ -22,347 +22,6 @@
#include "config.h"
#include "libavutil/arm/asm.S"
@ void ff_float_to_int16_neon(int16_t *dst, const float *src, long len)
@ Register use in this routine: r0 = store pointer (dst), r1 = load pointer
@ (src), r2 = remaining sample count, ip = main-loop counter.
@
@ Each float is converted to signed fixed point with 16 fractional bits
@ (vcvt.s32.f32 ..., #16); the narrowing shift (vshrn.s32 ..., #16) then keeps
@ the upper halfword, i.e. the integer part, as one int16 sample.
@ Every load and store carries a :128 alignment qualifier, so both buffers
@ have to be 16-byte aligned. Eight samples are loaded unconditionally and
@ all paths work in groups of 8, so len is expected to be a non-zero
@ multiple of 8.
function ff_float_to_int16_neon, export=1
@ Prologue: load and convert the first 8 floats into q8/q9.
subs r2, r2, #8
vld1.64 {d0-d1}, [r1,:128]!
vcvt.s32.f32 q8, q0, #16
vld1.64 {d2-d3}, [r1,:128]!
vcvt.s32.f32 q9, q1, #16
@ len was exactly 8: only the pending 8 samples remain to be stored.
beq 3f
@ ip = remaining count rounded down to a multiple of 16; zero means only
@ one more group of 8 is left, handled at label 2.
bics ip, r2, #15
beq 2f
@ Main loop: stores 16 samples per iteration (the 8 pending in q8/q9 plus 8
@ freshly loaded ones) and leaves the next 8 converted in q8/q9.
1: subs ip, ip, #16
vshrn.s32 d4, q8, #16
vld1.64 {d0-d1}, [r1,:128]!
vcvt.s32.f32 q0, q0, #16
vshrn.s32 d5, q9, #16
vld1.64 {d2-d3}, [r1,:128]!
vcvt.s32.f32 q1, q1, #16
vshrn.s32 d6, q0, #16
vst1.64 {d4-d5}, [r0,:128]!
vshrn.s32 d7, q1, #16
vld1.64 {d16-d17},[r1,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r1,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.64 {d6-d7}, [r0,:128]!
bne 1b
@ Anything left besides the pending 8 samples? If not, flush them at 3.
ands r2, r2, #15
beq 3f
@ Tail for 16 outstanding samples: the pending 8 plus 8 still to be loaded.
2: vld1.64 {d0-d1}, [r1,:128]!
vshrn.s32 d4, q8, #16
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r1,:128]!
vshrn.s32 d5, q9, #16
vcvt.s32.f32 q1, q1, #16
vshrn.s32 d6, q0, #16
vst1.64 {d4-d5}, [r0,:128]!
vshrn.s32 d7, q1, #16
vst1.64 {d6-d7}, [r0,:128]!
bx lr
@ Tail for the 8 pending samples held in q8/q9.
3: vshrn.s32 d4, q8, #16
vshrn.s32 d5, q9, #16
vst1.64 {d4-d5}, [r0,:128]!
bx lr
endfunc
@ void ff_float_to_int16_interleave_neon(int16_t *dst, const float **src,
@                                        long len, int channels)
@ On entry r0 = dst, r1 = array of per-channel source pointers, r2 = len,
@ r3 = channels.
@
@ Samples are converted with vcvt.s32.f32 ..., #16 (fixed point, 16 fractional
@ bits); the upper halfword of each 32-bit result is the int16 output sample.
@ Dispatch:
@   channels <  2 : tail-call ff_float_to_int16_neon on src[0]
@   channels == 2 : dedicated stereo path writing contiguous output
@   channels >  2 : generic path that consumes the channels in groups of
@                   4, then 2, then 1, writing with a stride of
@                   2*channels bytes (one interleaved frame)
@ All source loads use a :128 alignment qualifier (16-byte aligned inputs);
@ every path works on groups of 8 samples per channel.
function ff_float_to_int16_interleave_neon, export=1
cmp r3, #2
itt lt
ldrlt r1, [r1]
blt X(ff_float_to_int16_neon)
bne 4f
@ ---- exactly 2 channels: r3 = src[0], r1 = src[1] ----
ldr r3, [r1]
ldr r1, [r1, #4]
subs r2, r2, #8
vld1.64 {d0-d1}, [r3,:128]!
vcvt.s32.f32 q8, q0, #16
vld1.64 {d2-d3}, [r3,:128]!
vcvt.s32.f32 q9, q1, #16
vld1.64 {d20-d21},[r1,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r1,:128]!
vcvt.s32.f32 q11, q11, #16
@ len was exactly 8: flush the pending samples at label 3.
beq 3f
bics ip, r2, #15
beq 2f
@ Stereo main loop, 16 frames per iteration. vsri.32 qB, qA, #16 keeps the
@ upper halfword of qB (second channel) and inserts the upper halfword of qA
@ (first channel) below it, producing one packed pair per 32-bit lane.
1: subs ip, ip, #16
vld1.64 {d0-d1}, [r3,:128]!
vcvt.s32.f32 q0, q0, #16
vsri.32 q10, q8, #16
vld1.64 {d2-d3}, [r3,:128]!
vcvt.s32.f32 q1, q1, #16
vld1.64 {d24-d25},[r1,:128]!
vcvt.s32.f32 q12, q12, #16
vld1.64 {d26-d27},[r1,:128]!
vsri.32 q11, q9, #16
vst1.64 {d20-d21},[r0,:128]!
vcvt.s32.f32 q13, q13, #16
vst1.64 {d22-d23},[r0,:128]!
vsri.32 q12, q0, #16
vld1.64 {d16-d17},[r3,:128]!
vsri.32 q13, q1, #16
vst1.64 {d24-d25},[r0,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r3,:128]!
vcvt.s32.f32 q9, q9, #16
vld1.64 {d20-d21},[r1,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r1,:128]!
vcvt.s32.f32 q11, q11, #16
vst1.64 {d26-d27},[r0,:128]!
bne 1b
ands r2, r2, #15
beq 3f
@ Stereo tail: 8 pending frames plus 8 more to load.
2: vsri.32 q10, q8, #16
vld1.64 {d0-d1}, [r3,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r3,:128]!
vcvt.s32.f32 q1, q1, #16
vld1.64 {d24-d25},[r1,:128]!
vcvt.s32.f32 q12, q12, #16
vsri.32 q11, q9, #16
vld1.64 {d26-d27},[r1,:128]!
vcvt.s32.f32 q13, q13, #16
vst1.64 {d20-d21},[r0,:128]!
vsri.32 q12, q0, #16
vst1.64 {d22-d23},[r0,:128]!
vsri.32 q13, q1, #16
vst1.64 {d24-d27},[r0,:128]!
bx lr
@ Stereo tail: only the 8 pending frames remain.
3: vsri.32 q10, q8, #16
vsri.32 q11, q9, #16
vst1.64 {d20-d23},[r0,:128]!
bx lr
@ ---- more than 2 channels ----
@ ip = 2*channels, the byte distance between consecutive output frames.
4: push {r4-r8,lr}
cmp r3, #4
lsl ip, r3, #1
blt 4f
@ 4 channels
@ r4-r7 = the next four source pointers (r1 advances past them),
@ lr = sample counter, r8 = running output pointer for this channel group.
5: ldmia r1!, {r4-r7}
mov lr, r2
mov r8, r0
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vld1.64 {d20-d21},[r6,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r7,:128]!
vcvt.s32.f32 q11, q11, #16
@ Inner loop: 8 frames per iteration; each vst1.64 {dN} writes the four
@ int16 samples of one frame and then steps r8 by one frame (ip bytes).
6: subs lr, lr, #8
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vsri.32 q9, q8, #16
vld1.64 {d2-d3}, [r5,:128]!
vcvt.s32.f32 q1, q1, #16
vsri.32 q11, q10, #16
vld1.64 {d4-d5}, [r6,:128]!
vcvt.s32.f32 q2, q2, #16
vzip.32 d18, d22
vld1.64 {d6-d7}, [r7,:128]!
vcvt.s32.f32 q3, q3, #16
vzip.32 d19, d23
vst1.64 {d18}, [r8], ip
vsri.32 q1, q0, #16
vst1.64 {d22}, [r8], ip
vsri.32 q3, q2, #16
vst1.64 {d19}, [r8], ip
vzip.32 d2, d6
vst1.64 {d23}, [r8], ip
vzip.32 d3, d7
beq 7f
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vst1.64 {d2}, [r8], ip
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.64 {d6}, [r8], ip
vld1.64 {d20-d21},[r6,:128]!
vcvt.s32.f32 q10, q10, #16
vst1.64 {d3}, [r8], ip
vld1.64 {d22-d23},[r7,:128]!
vcvt.s32.f32 q11, q11, #16
vst1.64 {d7}, [r8], ip
b 6b
@ Last iteration of this group: store the final four frames, then account
@ for the four channels just written (return when none are left, otherwise
@ move dst 8 bytes on, past those four int16 columns).
7: vst1.64 {d2}, [r8], ip
vst1.64 {d6}, [r8], ip
vst1.64 {d3}, [r8], ip
vst1.64 {d7}, [r8], ip
subs r3, r3, #4
it eq
popeq {r4-r8,pc}
cmp r3, #4
add r0, r0, #8
bge 5b
@ 2 channels
@ Skipped when fewer than two channels remain. tst lr, #8 checks whether the
@ sample count has bit 3 set; if so one block of 8 frames is handled first
@ so that the loop at label 6 can run 16 frames per iteration.
4: cmp r3, #2
blt 4f
ldmia r1!, {r4-r5}
mov lr, r2
mov r8, r0
tst lr, #8
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vld1.64 {d20-d21},[r4,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r5,:128]!
vcvt.s32.f32 q11, q11, #16
beq 6f
subs lr, lr, #8
beq 7f
@ Leading block of 8 frames; each vst1.32 lane store writes one packed pair
@ of int16 samples and steps r8 by one frame.
vsri.32 d18, d16, #16
vsri.32 d19, d17, #16
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vst1.32 {d18[0]}, [r8], ip
vsri.32 d22, d20, #16
vst1.32 {d18[1]}, [r8], ip
vsri.32 d23, d21, #16
vst1.32 {d19[0]}, [r8], ip
vst1.32 {d19[1]}, [r8], ip
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.32 {d22[0]}, [r8], ip
vst1.32 {d22[1]}, [r8], ip
vld1.64 {d20-d21},[r4,:128]!
vcvt.s32.f32 q10, q10, #16
vst1.32 {d23[0]}, [r8], ip
vst1.32 {d23[1]}, [r8], ip
vld1.64 {d22-d23},[r5,:128]!
vcvt.s32.f32 q11, q11, #16
@ Two-channel main loop: 16 frames per iteration.
6: subs lr, lr, #16
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vsri.32 d18, d16, #16
vld1.64 {d2-d3}, [r5,:128]!
vcvt.s32.f32 q1, q1, #16
vsri.32 d19, d17, #16
vld1.64 {d4-d5}, [r4,:128]!
vcvt.s32.f32 q2, q2, #16
vld1.64 {d6-d7}, [r5,:128]!
vcvt.s32.f32 q3, q3, #16
vst1.32 {d18[0]}, [r8], ip
vsri.32 d22, d20, #16
vst1.32 {d18[1]}, [r8], ip
vsri.32 d23, d21, #16
vst1.32 {d19[0]}, [r8], ip
vsri.32 d2, d0, #16
vst1.32 {d19[1]}, [r8], ip
vsri.32 d3, d1, #16
vst1.32 {d22[0]}, [r8], ip
vsri.32 d6, d4, #16
vst1.32 {d22[1]}, [r8], ip
vsri.32 d7, d5, #16
vst1.32 {d23[0]}, [r8], ip
vst1.32 {d23[1]}, [r8], ip
beq 6f
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vst1.32 {d2[0]}, [r8], ip
vst1.32 {d2[1]}, [r8], ip
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.32 {d3[0]}, [r8], ip
vst1.32 {d3[1]}, [r8], ip
vld1.64 {d20-d21},[r4,:128]!
vcvt.s32.f32 q10, q10, #16
vst1.32 {d6[0]}, [r8], ip
vst1.32 {d6[1]}, [r8], ip
vld1.64 {d22-d23},[r5,:128]!
vcvt.s32.f32 q11, q11, #16
vst1.32 {d7[0]}, [r8], ip
vst1.32 {d7[1]}, [r8], ip
bgt 6b
@ Final 8 frames of the two-channel loop (no further loads).
6: vst1.32 {d2[0]}, [r8], ip
vst1.32 {d2[1]}, [r8], ip
vst1.32 {d3[0]}, [r8], ip
vst1.32 {d3[1]}, [r8], ip
vst1.32 {d6[0]}, [r8], ip
vst1.32 {d6[1]}, [r8], ip
vst1.32 {d7[0]}, [r8], ip
vst1.32 {d7[1]}, [r8], ip
b 8f
@ Sample count was exactly 8: store the preloaded block and finish.
7: vsri.32 d18, d16, #16
vsri.32 d19, d17, #16
vst1.32 {d18[0]}, [r8], ip
vsri.32 d22, d20, #16
vst1.32 {d18[1]}, [r8], ip
vsri.32 d23, d21, #16
vst1.32 {d19[0]}, [r8], ip
vst1.32 {d19[1]}, [r8], ip
vst1.32 {d22[0]}, [r8], ip
vst1.32 {d22[1]}, [r8], ip
vst1.32 {d23[0]}, [r8], ip
vst1.32 {d23[1]}, [r8], ip
@ Two channels done: return if none remain, else dst moves 4 bytes on and
@ execution falls through to the single-channel code.
8: subs r3, r3, #2
add r0, r0, #4
it eq
popeq {r4-r8,pc}
@ 1 channel
@ r4 = source pointer, r5 = output pointer, lr = sample counter. The lane
@ stores vst1.16 {dN[1]} / {dN[3]} pick the upper halfword of each 32-bit
@ fixed-point value and step r5 by one frame.
4: ldr r4, [r1],#4
tst r2, #8
mov lr, r2
mov r5, r0
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r4,:128]!
vcvt.s32.f32 q1, q1, #16
@ Bit 3 of the count set: handle one block of 8 at label 8 first.
bne 8f
@ Single-channel main loop: 16 samples per iteration.
6: subs lr, lr, #16
vld1.64 {d4-d5}, [r4,:128]!
vcvt.s32.f32 q2, q2, #16
vld1.64 {d6-d7}, [r4,:128]!
vcvt.s32.f32 q3, q3, #16
vst1.16 {d0[1]}, [r5,:16], ip
vst1.16 {d0[3]}, [r5,:16], ip
vst1.16 {d1[1]}, [r5,:16], ip
vst1.16 {d1[3]}, [r5,:16], ip
vst1.16 {d2[1]}, [r5,:16], ip
vst1.16 {d2[3]}, [r5,:16], ip
vst1.16 {d3[1]}, [r5,:16], ip
vst1.16 {d3[3]}, [r5,:16], ip
beq 7f
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r4,:128]!
vcvt.s32.f32 q1, q1, #16
7: vst1.16 {d4[1]}, [r5,:16], ip
vst1.16 {d4[3]}, [r5,:16], ip
vst1.16 {d5[1]}, [r5,:16], ip
vst1.16 {d5[3]}, [r5,:16], ip
vst1.16 {d6[1]}, [r5,:16], ip
vst1.16 {d6[3]}, [r5,:16], ip
vst1.16 {d7[1]}, [r5,:16], ip
vst1.16 {d7[3]}, [r5,:16], ip
bgt 6b
pop {r4-r8,pc}
@ Leading block of 8 samples; returns if that was all, otherwise preloads
@ the next 8 and joins the main loop.
8: subs lr, lr, #8
vst1.16 {d0[1]}, [r5,:16], ip
vst1.16 {d0[3]}, [r5,:16], ip
vst1.16 {d1[1]}, [r5,:16], ip
vst1.16 {d1[3]}, [r5,:16], ip
vst1.16 {d2[1]}, [r5,:16], ip
vst1.16 {d2[3]}, [r5,:16], ip
vst1.16 {d3[1]}, [r5,:16], ip
vst1.16 {d3[3]}, [r5,:16], ip
it eq
popeq {r4-r8,pc}
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r4,:128]!
vcvt.s32.f32 q1, q1, #16
b 6b
endfunc
function ff_int32_to_float_fmul_scalar_neon, export=1
VFP vdup.32 q0, d0[0]
VFP len .req r2

View File

@ -1,78 +0,0 @@
/*
* Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/arm/asm.S"
/**
* ARM VFP optimized float to int16 conversion.
* Assumes that len is a positive multiple of 8, that the destination
* buffer is at least 4-byte aligned (8-byte alignment is better for
* performance), and little-endian byte order.
*
* The loop is software pipelined: while the eight integers of the current
* group are saturated (ssat) and packed pairwise into words (pkhbt), the
* next eight floats are loaded and converted, but only when more input
* remains (the "gt" condition left by the subs at the top of the loop).
*/
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len)
@ r0 = dst, r1 = src, r2 = len
function ff_float_to_int16_vfp, export=1
@ r4-r8/lr and s16-s23 (which alias d8-d11) are used as scratch below, so
@ they are saved here and restored before returning.
push {r4-r8,lr}
vpush {d8-d11}
@ Prologue: load the first 8 floats and convert them to int32 in s0-s7.
vldmia r1!, {s16-s23}
vcvt.s32.f32 s0, s16
vcvt.s32.f32 s1, s17
vcvt.s32.f32 s2, s18
vcvt.s32.f32 s3, s19
vcvt.s32.f32 s4, s20
vcvt.s32.f32 s5, s21
vcvt.s32.f32 s6, s22
vcvt.s32.f32 s7, s23
1:
@ The flags set here drive every conditional instruction in this iteration
@ and the loop branch; no instruction in between updates them.
subs r2, r2, #8
@ Move the 8 converted integers to core registers.
vmov r3, r4, s0, s1
vmov r5, r6, s2, s3
vmov r7, r8, s4, s5
vmov ip, lr, s6, s7
it gt
vldmiagt r1!, {s16-s23}
@ Clamp samples 0-3 to the signed 16-bit range and pack two per word
@ (first sample of each pair in the low halfword).
ssat r4, #16, r4
ssat r3, #16, r3
ssat r6, #16, r6
ssat r5, #16, r5
pkhbt r3, r3, r4, lsl #16
pkhbt r4, r5, r6, lsl #16
@ Convert the next group while the current one is being packed.
itttt gt
vcvtgt.s32.f32 s0, s16
vcvtgt.s32.f32 s1, s17
vcvtgt.s32.f32 s2, s18
vcvtgt.s32.f32 s3, s19
itttt gt
vcvtgt.s32.f32 s4, s20
vcvtgt.s32.f32 s5, s21
vcvtgt.s32.f32 s6, s22
vcvtgt.s32.f32 s7, s23
@ Clamp and pack samples 4-7.
ssat r8, #16, r8
ssat r7, #16, r7
ssat lr, #16, lr
ssat ip, #16, ip
pkhbt r5, r7, r8, lsl #16
pkhbt r6, ip, lr, lsl #16
@ Store 4 words = 8 int16 samples and advance dst.
stmia r0!, {r3-r6}
bgt 1b
vpop {d8-d11}
pop {r4-r8,pc}
endfunc

View File

@ -41,59 +41,10 @@ static void int32_to_float_fmul_array8_c(FmtConvertContext *c, float *dst,
c->int32_to_float_fmul_scalar(&dst[i], &src[i], *mul++, 8);
}
/**
 * Convert a single float sample to an integer in the int16_t range.
 *
 * The value is rounded with lrintf() (i.e. according to the current
 * floating-point rounding mode) and then clamped by av_clip_int16().
 *
 * @param src pointer to the sample to convert
 * @return the rounded, clipped sample
 */
static av_always_inline int float_to_int16_one(const float *src)
{
    const long rounded = lrintf(*src);

    return av_clip_int16(rounded);
}
/**
 * Reference C implementation of FmtConvertContext.float_to_int16.
 *
 * Converts len floats from src to int16_t in dst, one sample at a time,
 * via float_to_int16_one() (round, then clip to the int16_t range).
 *
 * @param dst destination array of int16_t
 * @param src source array of float
 * @param len number of elements to convert; nothing is written if <= 0
 */
static void float_to_int16_c(int16_t *dst, const float *src, long len)
{
    /* The index has the same type as len: an int counter would overflow
     * (undefined behaviour) before reaching a len above INT_MAX on
     * platforms where long is wider than int. */
    long i;

    for (i = 0; i < len; i++)
        dst[i] = float_to_int16_one(src + i);
}
/**
 * Reference C implementation of FmtConvertContext.float_to_int16_interleave.
 *
 * Converts `channels` planar float arrays of `len` samples each into one
 * interleaved int16_t array: dst[i * channels + c] receives the converted
 * src[c][i]. Stereo input has a dedicated loop; every other channel count
 * is handled one channel at a time with a stride of `channels`.
 *
 * @param dst      destination array of interleaved int16_t
 * @param src      array of `channels` pointers to the source float arrays
 * @param len      number of samples per channel; nothing is written if <= 0
 * @param channels number of channels; nothing is written if <= 0
 */
static void float_to_int16_interleave_c(int16_t *dst, const float **src,
                                        long len, int channels)
{
    /* Sample and output indices are long, like len: with int indices the
     * expressions 2*i and j += channels overflow (undefined behaviour)
     * once len * channels exceeds INT_MAX. */
    long i, j;
    int c;

    if (channels == 2) {
        for (i = 0; i < len; i++) {
            dst[2 * i]     = float_to_int16_one(src[0] + i);
            dst[2 * i + 1] = float_to_int16_one(src[1] + i);
        }
    } else {
        for (c = 0; c < channels; c++)
            for (i = 0, j = c; i < len; i++, j += channels)
                dst[j] = float_to_int16_one(src[c] + i);
    }
}
/**
 * Interleave planar float channels into a single float array.
 *
 * dst[i * channels + c] receives src[c][i]. A single channel is copied
 * with memcpy() when the length is below INT_MAX / sizeof(float); stereo
 * has a dedicated loop; everything else goes through a generic strided
 * loop.
 *
 * @param dst      destination array of interleaved float
 * @param src      array of `channels` pointers to the source float arrays
 * @param len      number of samples per channel
 * @param channels number of channels; nothing is written if <= 0
 */
void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
                           int channels)
{
    /* Output positions are computed in size_t: the previous int / unsigned
     * int arithmetic overflowed or wrapped once len * channels no longer
     * fit in 32 bits. */
    size_t i, j;
    int c;

    if (channels == 2) {
        for (i = 0; i < len; i++) {
            dst[2 * i]     = src[0][i];
            dst[2 * i + 1] = src[1][i];
        }
    } else if (channels == 1 && len < INT_MAX / sizeof(float)) {
        memcpy(dst, src[0], len * sizeof(float));
    } else {
        for (c = 0; c < channels; c++)
            for (i = 0, j = (size_t)c; i < len; i++, j += (size_t)channels)
                dst[j] = src[c][i];
    }
}
av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
{
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
c->int32_to_float_fmul_array8 = int32_to_float_fmul_array8_c;
c->float_to_int16 = float_to_int16_c;
c->float_to_int16_interleave = float_to_int16_interleave_c;
c->float_interleave = ff_float_interleave_c;
if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
if (ARCH_PPC) ff_fmt_convert_init_ppc(c, avctx);

View File

@ -54,56 +54,8 @@ typedef struct FmtConvertContext {
float *dst, const int32_t *src,
const float *mul, int len);
/**
* Convert an array of float to an array of int16_t.
*
* Convert floats in the range [-32768.0,32767.0] to ints
* without rescaling
*
* @param dst destination array of int16_t.
* constraints: 16-byte aligned
* @param src source array of float.
* constraints: 16-byte aligned
* @param len number of elements to convert.
* constraints: multiple of 8
*/
void (*float_to_int16)(int16_t *dst, const float *src, long len);
/**
* Convert multiple arrays of float to an interleaved array of int16_t.
*
* Convert floats in the range [-32768.0,32767.0] to ints
* without rescaling
*
* @param dst destination array of interleaved int16_t.
* constraints: 16-byte aligned
* @param src source array of float arrays, one for each channel.
* constraints: 16-byte aligned
* @param len number of elements to convert.
* constraints: multiple of 8
* @param channels number of channels
*/
void (*float_to_int16_interleave)(int16_t *dst, const float **src,
long len, int channels);
/**
* Convert multiple arrays of float to an array of interleaved float.
*
* @param dst destination array of interleaved float.
* constraints: 16-byte aligned
* @param src source array of float arrays, one for each channel.
* constraints: 16-byte aligned
* @param len number of elements to convert.
* constraints: multiple of 8
* @param channels number of channels
*/
void (*float_interleave)(float *dst, const float **src, unsigned int len,
int channels);
} FmtConvertContext;
void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
int channels);
void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);

View File

@ -52,203 +52,6 @@
#include "libavcodec/fmtconvert.h"
#if HAVE_INLINE_ASM
#if HAVE_MIPSDSPR1
/**
 * Convert an array of floats to int16_t using MIPS FPU and DSP ASE
 * instructions.
 *
 * Each sample is converted to a 32-bit integer with cvt.w.s, moved to a
 * general-purpose register (mfc1), shifted left by 16 with saturation
 * (shll_s.w) and shifted back right by 16, so that the halfword written by
 * sh holds the value clamped to the int16_t range.
 *
 * @param dst destination array of int16_t
 * @param src source array of float
 * @param len number of elements to convert. The asm returns immediately for
 *            len == 0; otherwise it loops until src reaches src + len in
 *            steps of 8 samples, so len has to be a multiple of 8.
 */
static void float_to_int16_mips(int16_t *dst, const float *src, long len)
{
const float *src_end = src + len;
int ret0, ret1, ret2, ret3, ret4, ret5, ret6, ret7;
float src0, src1, src2, src3, src4, src5, src6, src7;
/*
* loop is 8 times unrolled in assembler in order to achieve better performance
*/
/* "%=" expands to a number unique to this asm statement, which keeps the
 * local labels distinct if the code is emitted more than once. */
__asm__ volatile(
"beq %[len], $zero, fti16_end%= \n\t"
"fti16_lp%=: \n\t"
"lwc1 %[src0], 0(%[src]) \n\t"
"lwc1 %[src1], 4(%[src]) \n\t"
"lwc1 %[src2], 8(%[src]) \n\t"
"lwc1 %[src3], 12(%[src]) \n\t"
"cvt.w.s %[src0], %[src0] \n\t"
"cvt.w.s %[src1], %[src1] \n\t"
"cvt.w.s %[src2], %[src2] \n\t"
"cvt.w.s %[src3], %[src3] \n\t"
"mfc1 %[ret0], %[src0] \n\t"
"mfc1 %[ret1], %[src1] \n\t"
"mfc1 %[ret2], %[src2] \n\t"
"mfc1 %[ret3], %[src3] \n\t"
"lwc1 %[src4], 16(%[src]) \n\t"
"lwc1 %[src5], 20(%[src]) \n\t"
"lwc1 %[src6], 24(%[src]) \n\t"
"lwc1 %[src7], 28(%[src]) \n\t"
"cvt.w.s %[src4], %[src4] \n\t"
"cvt.w.s %[src5], %[src5] \n\t"
"cvt.w.s %[src6], %[src6] \n\t"
"cvt.w.s %[src7], %[src7] \n\t"
"addiu %[src], 32 \n\t"
"shll_s.w %[ret0], %[ret0], 16 \n\t"
"shll_s.w %[ret1], %[ret1], 16 \n\t"
"shll_s.w %[ret2], %[ret2], 16 \n\t"
"shll_s.w %[ret3], %[ret3], 16 \n\t"
"srl %[ret0], %[ret0], 16 \n\t"
"srl %[ret1], %[ret1], 16 \n\t"
"srl %[ret2], %[ret2], 16 \n\t"
"srl %[ret3], %[ret3], 16 \n\t"
"sh %[ret0], 0(%[dst]) \n\t"
"sh %[ret1], 2(%[dst]) \n\t"
"sh %[ret2], 4(%[dst]) \n\t"
"sh %[ret3], 6(%[dst]) \n\t"
"mfc1 %[ret4], %[src4] \n\t"
"mfc1 %[ret5], %[src5] \n\t"
"mfc1 %[ret6], %[src6] \n\t"
"mfc1 %[ret7], %[src7] \n\t"
"shll_s.w %[ret4], %[ret4], 16 \n\t"
"shll_s.w %[ret5], %[ret5], 16 \n\t"
"shll_s.w %[ret6], %[ret6], 16 \n\t"
"shll_s.w %[ret7], %[ret7], 16 \n\t"
"srl %[ret4], %[ret4], 16 \n\t"
"srl %[ret5], %[ret5], 16 \n\t"
"srl %[ret6], %[ret6], 16 \n\t"
"srl %[ret7], %[ret7], 16 \n\t"
"sh %[ret4], 8(%[dst]) \n\t"
"sh %[ret5], 10(%[dst]) \n\t"
"sh %[ret6], 12(%[dst]) \n\t"
"sh %[ret7], 14(%[dst]) \n\t"
"addiu %[dst], 16 \n\t"
"bne %[src], %[src_end], fti16_lp%= \n\t"
"fti16_end%=: \n\t"
: [ret0]"=&r"(ret0), [ret1]"=&r"(ret1), [ret2]"=&r"(ret2), [ret3]"=&r"(ret3),
[ret4]"=&r"(ret4), [ret5]"=&r"(ret5), [ret6]"=&r"(ret6), [ret7]"=&r"(ret7),
[src0]"=&f"(src0), [src1]"=&f"(src1), [src2]"=&f"(src2), [src3]"=&f"(src3),
[src4]"=&f"(src4), [src5]"=&f"(src5), [src6]"=&f"(src6), [src7]"=&f"(src7),
[src]"+r"(src), [dst]"+r"(dst)
: [src_end]"r"(src_end), [len]"r"(len)
: "memory"
);
}
static void float_to_int16_interleave_mips(int16_t *dst, const float **src, long len,
int channels)
{
int c, ch2 = channels <<1;
int ret0, ret1, ret2, ret3, ret4, ret5, ret6, ret7;
float src0, src1, src2, src3, src4, src5, src6, src7;
int16_t *dst_ptr0, *dst_ptr1, *dst_ptr2, *dst_ptr3;
int16_t *dst_ptr4, *dst_ptr5, *dst_ptr6, *dst_ptr7;
const float *src_ptr, *src_ptr2, *src_end;
if (channels == 2) {
src_ptr = &src[0][0];
src_ptr2 = &src[1][0];
src_end = src_ptr + len;
__asm__ volatile (
"fti16i2_lp%=: \n\t"
"lwc1 %[src0], 0(%[src_ptr]) \n\t"
"lwc1 %[src1], 0(%[src_ptr2]) \n\t"
"addiu %[src_ptr], 4 \n\t"
"cvt.w.s $f9, %[src0] \n\t"
"cvt.w.s $f10, %[src1] \n\t"
"mfc1 %[ret0], $f9 \n\t"
"mfc1 %[ret1], $f10 \n\t"
"shll_s.w %[ret0], %[ret0], 16 \n\t"
"shll_s.w %[ret1], %[ret1], 16 \n\t"
"addiu %[src_ptr2], 4 \n\t"
"srl %[ret0], %[ret0], 16 \n\t"
"srl %[ret1], %[ret1], 16 \n\t"
"sh %[ret0], 0(%[dst]) \n\t"
"sh %[ret1], 2(%[dst]) \n\t"
"addiu %[dst], 4 \n\t"
"bne %[src_ptr], %[src_end], fti16i2_lp%= \n\t"
: [ret0]"=&r"(ret0), [ret1]"=&r"(ret1),
[src0]"=&f"(src0), [src1]"=&f"(src1),
[src_ptr]"+r"(src_ptr), [src_ptr2]"+r"(src_ptr2),
[dst]"+r"(dst)
: [src_end]"r"(src_end)
: "memory"
);
} else {
for (c = 0; c < channels; c++) {
src_ptr = &src[c][0];
dst_ptr0 = &dst[c];
src_end = src_ptr + len;
/*
* loop is 8 times unrolled in assembler in order to achieve better performance
*/
__asm__ volatile(
"fti16i_lp%=: \n\t"
"lwc1 %[src0], 0(%[src_ptr]) \n\t"
"lwc1 %[src1], 4(%[src_ptr]) \n\t"
"lwc1 %[src2], 8(%[src_ptr]) \n\t"
"lwc1 %[src3], 12(%[src_ptr]) \n\t"
"cvt.w.s %[src0], %[src0] \n\t"
"cvt.w.s %[src1], %[src1] \n\t"
"cvt.w.s %[src2], %[src2] \n\t"
"cvt.w.s %[src3], %[src3] \n\t"
"mfc1 %[ret0], %[src0] \n\t"
"mfc1 %[ret1], %[src1] \n\t"
"mfc1 %[ret2], %[src2] \n\t"
"mfc1 %[ret3], %[src3] \n\t"
"lwc1 %[src4], 16(%[src_ptr]) \n\t"
"lwc1 %[src5], 20(%[src_ptr]) \n\t"
"lwc1 %[src6], 24(%[src_ptr]) \n\t"
"lwc1 %[src7], 28(%[src_ptr]) \n\t"
"addu %[dst_ptr1], %[dst_ptr0], %[ch2] \n\t"
"addu %[dst_ptr2], %[dst_ptr1], %[ch2] \n\t"
"addu %[dst_ptr3], %[dst_ptr2], %[ch2] \n\t"
"addu %[dst_ptr4], %[dst_ptr3], %[ch2] \n\t"
"addu %[dst_ptr5], %[dst_ptr4], %[ch2] \n\t"
"addu %[dst_ptr6], %[dst_ptr5], %[ch2] \n\t"
"addu %[dst_ptr7], %[dst_ptr6], %[ch2] \n\t"
"addiu %[src_ptr], 32 \n\t"
"cvt.w.s %[src4], %[src4] \n\t"
"cvt.w.s %[src5], %[src5] \n\t"
"cvt.w.s %[src6], %[src6] \n\t"
"cvt.w.s %[src7], %[src7] \n\t"
"shll_s.w %[ret0], %[ret0], 16 \n\t"
"shll_s.w %[ret1], %[ret1], 16 \n\t"
"shll_s.w %[ret2], %[ret2], 16 \n\t"
"shll_s.w %[ret3], %[ret3], 16 \n\t"
"srl %[ret0], %[ret0], 16 \n\t"
"srl %[ret1], %[ret1], 16 \n\t"
"srl %[ret2], %[ret2], 16 \n\t"
"srl %[ret3], %[ret3], 16 \n\t"
"sh %[ret0], 0(%[dst_ptr0]) \n\t"
"sh %[ret1], 0(%[dst_ptr1]) \n\t"
"sh %[ret2], 0(%[dst_ptr2]) \n\t"
"sh %[ret3], 0(%[dst_ptr3]) \n\t"
"mfc1 %[ret4], %[src4] \n\t"
"mfc1 %[ret5], %[src5] \n\t"
"mfc1 %[ret6], %[src6] \n\t"
"mfc1 %[ret7], %[src7] \n\t"
"shll_s.w %[ret4], %[ret4], 16 \n\t"
"shll_s.w %[ret5], %[ret5], 16 \n\t"
"shll_s.w %[ret6], %[ret6], 16 \n\t"
"shll_s.w %[ret7], %[ret7], 16 \n\t"
"srl %[ret4], %[ret4], 16 \n\t"
"srl %[ret5], %[ret5], 16 \n\t"
"srl %[ret6], %[ret6], 16 \n\t"
"srl %[ret7], %[ret7], 16 \n\t"
"sh %[ret4], 0(%[dst_ptr4]) \n\t"
"sh %[ret5], 0(%[dst_ptr5]) \n\t"
"sh %[ret6], 0(%[dst_ptr6]) \n\t"
"sh %[ret7], 0(%[dst_ptr7]) \n\t"
"addu %[dst_ptr0], %[dst_ptr7], %[ch2] \n\t"
"bne %[src_ptr], %[src_end], fti16i_lp%= \n\t"
: [ret0]"=&r"(ret0), [ret1]"=&r"(ret1), [ret2]"=&r"(ret2), [ret3]"=&r"(ret3),
[ret4]"=&r"(ret4), [ret5]"=&r"(ret5), [ret6]"=&r"(ret6), [ret7]"=&r"(ret7),
[src0]"=&f"(src0), [src1]"=&f"(src1), [src2]"=&f"(src2), [src3]"=&f"(src3),
[src4]"=&f"(src4), [src5]"=&f"(src5), [src6]"=&f"(src6), [src7]"=&f"(src7),
[dst_ptr1]"=&r"(dst_ptr1), [dst_ptr2]"=&r"(dst_ptr2), [dst_ptr3]"=&r"(dst_ptr3),
[dst_ptr4]"=&r"(dst_ptr4), [dst_ptr5]"=&r"(dst_ptr5), [dst_ptr6]"=&r"(dst_ptr6),
[dst_ptr7]"=&r"(dst_ptr7), [dst_ptr0]"+r"(dst_ptr0), [src_ptr]"+r"(src_ptr)
: [ch2]"r"(ch2), [src_end]"r"(src_end)
: "memory"
);
}
}
}
#endif /* HAVE_MIPSDSPR1 */
static void int32_to_float_fmul_scalar_mips(float *dst, const int *src,
float mul, int len)
@ -333,10 +136,6 @@ static void int32_to_float_fmul_scalar_mips(float *dst, const int *src,
av_cold void ff_fmt_convert_init_mips(FmtConvertContext *c)
{
#if HAVE_INLINE_ASM
#if HAVE_MIPSDSPR1
c->float_to_int16_interleave = float_to_int16_interleave_mips;
c->float_to_int16 = float_to_int16_mips;
#endif
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_mips;
#endif
}

View File

@ -52,113 +52,6 @@ static void int32_to_float_fmul_scalar_altivec(float *dst, const int32_t *src,
}
}
/**
 * Convert 8 consecutive floats to one vector of 8 signed 16-bit integers.
 *
 * Two vectors of four floats are loaded from src and src + 16 bytes with
 * vec_ld, converted to 32-bit integers with vec_cts (scale 0) and packed
 * into a single 16-bit vector with vec_packs.
 *
 * @param src source floats
 * @return the 8 converted samples
 */
static vector signed short float_to_int16_one_altivec(const float *src)
{
    vector signed int lo = vec_cts(vec_ld(0,  src), 0);
    vector signed int hi = vec_cts(vec_ld(16, src), 0);

    return vec_packs(lo, hi);
}
/**
 * Convert an array of floats to int16_t with AltiVec, 8 samples per step.
 *
 * Both loops run while i < len - 7, so a trailing group of fewer than 8
 * samples is not converted.
 *
 * @param dst destination array of int16_t
 * @param src source array of float, read with vec_ld
 * @param len number of elements to convert
 */
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
{
int i;
vector signed short d0, d1, d;
vector unsigned char align;
/* dst not 16-byte aligned: the two vectors that the 16 output bytes
 * straddle are loaded (offsets 0 and 15), merged with the converted
 * samples using permutes built from vec_lvsl / vec_lvsr, and both stored
 * back. */
if (((long)dst) & 15) { //FIXME
for (i = 0; i < len - 7; i += 8) {
d0 = vec_ld(0, dst+i);
d = float_to_int16_one_altivec(src + i);
d1 = vec_ld(15, dst+i);
d1 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
align = vec_lvsr(0, dst + i);
d0 = vec_perm(d1, d, align);
d1 = vec_perm(d, d1, align);
vec_st(d0, 0, dst + i);
vec_st(d1, 15, dst + i);
}
} else {
/* Aligned dst: one vector store per 8 samples. */
for (i = 0; i < len - 7; i += 8) {
d = float_to_int16_one_altivec(src + i);
vec_st(d, 0, dst + i);
}
}
}
/* Store element `elem` of vector `v` at `dst`, then advance `dst` by `inc`
 * elements. The element is first replicated across a temporary vector with
 * vec_splat, and that vector is handed to vec_ste for the single-element
 * store. `dst` is modified, so it must be an lvalue. */
#define VSTE_INC(dst, v, elem, inc) do { \
vector signed short s = vec_splat(v, elem); \
vec_ste(s, 0, dst); \
dst += inc; \
} while (0)
/**
 * Convert floats to int16_t and write them with a stride.
 *
 * Eight samples are converted per step with float_to_int16_one_altivec()
 * and then stored one element at a time through VSTE_INC, which advances
 * dst by `stride` int16_t elements after every store. A trailing group of
 * fewer than 8 samples is not converted.
 *
 * @param dst    first output position
 * @param src    source array of float
 * @param len    number of elements to convert
 * @param stride distance between consecutive outputs, in int16_t elements
 */
static void float_to_int16_stride_altivec(int16_t *dst, const float *src,
                                          long len, int stride)
{
    vector signed short packed;
    int pos = 0;

    while (pos < len - 7) {
        packed = float_to_int16_one_altivec(src + pos);
        /* vec_splat needs a literal element index, hence no inner loop. */
        VSTE_INC(dst, packed, 0, stride);
        VSTE_INC(dst, packed, 1, stride);
        VSTE_INC(dst, packed, 2, stride);
        VSTE_INC(dst, packed, 3, stride);
        VSTE_INC(dst, packed, 4, stride);
        VSTE_INC(dst, packed, 5, stride);
        VSTE_INC(dst, packed, 6, stride);
        VSTE_INC(dst, packed, 7, stride);
        pos += 8;
    }
}
/**
 * Convert planar float channels to interleaved int16_t with AltiVec.
 *
 * channels == 1 is forwarded to float_to_int16_altivec(); channels == 2 has
 * dedicated aligned and unaligned loops; any other count is handled one
 * channel at a time by float_to_int16_stride_altivec().
 *
 * @param dst      destination array of interleaved int16_t
 * @param src      array of pointers to the source float arrays
 * @param len      number of samples per channel (processed 8 at a time)
 * @param channels number of channels
 */
static void float_to_int16_interleave_altivec(int16_t *dst, const float **src,
long len, int channels)
{
int i;
vector signed short d0, d1, d2, c0, c1, t0, t1;
vector unsigned char align;
if (channels == 1)
float_to_int16_altivec(dst, src[0], len);
else {
if (channels == 2) {
/* In both stereo loops dst is advanced by 8 in addition to i, so the
 * effective output position dst + i moves by 16 int16_t (8 stereo
 * frames) per iteration while src[n] + i moves by 8 floats. */
if (((long)dst) & 15) {
/* Unaligned dst: the 32 output bytes span three aligned vectors; the
 * first and last are loaded and merged with the interleaved samples
 * before all three are written. */
for (i = 0; i < len - 7; i += 8) {
d0 = vec_ld(0, dst + i);
t0 = float_to_int16_one_altivec(src[0] + i);
d1 = vec_ld(31, dst + i);
t1 = float_to_int16_one_altivec(src[1] + i);
c0 = vec_mergeh(t0, t1);
c1 = vec_mergel(t0, t1);
d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
align = vec_lvsr(0, dst + i);
d0 = vec_perm(d2, c0, align);
d1 = vec_perm(c0, c1, align);
vec_st(d0, 0, dst + i);
d0 = vec_perm(c1, d2, align);
vec_st(d1, 15, dst + i);
vec_st(d0, 31, dst + i);
dst += 8;
}
} else {
/* Aligned dst: vec_mergeh / vec_mergel interleave the two channels
 * into two output vectors. */
for (i = 0; i < len - 7; i += 8) {
t0 = float_to_int16_one_altivec(src[0] + i);
t1 = float_to_int16_one_altivec(src[1] + i);
d0 = vec_mergeh(t0, t1);
d1 = vec_mergel(t0, t1);
vec_st(d0, 0, dst + i);
vec_st(d1, 16, dst + i);
dst += 8;
}
}
} else {
/* Generic case: channel i is written to dst + i with a stride of
 * `channels` elements. */
for (i = 0; i < channels; i++)
float_to_int16_stride_altivec(dst + i, src[i], len, channels);
}
}
}
#endif /* HAVE_ALTIVEC */
av_cold void ff_fmt_convert_init_ppc(FmtConvertContext *c,
@ -169,9 +62,5 @@ av_cold void ff_fmt_convert_init_ppc(FmtConvertContext *c,
return;
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->float_to_int16 = float_to_int16_altivec;
c->float_to_int16_interleave = float_to_int16_interleave_altivec;
}
#endif /* HAVE_ALTIVEC */
}

View File

@ -23,14 +23,6 @@
SECTION_TEXT
; CVTPS2PI dst, src
; Float-to-int32 conversion into an MMX register, selected by the cpuflags of
; the current INIT_*: cvtps2pi when sse is available, otherwise pf2id when
; 3dnow is available. Nothing is emitted if neither flag is set.
%macro CVTPS2PI 2
%if cpuflag(sse)
cvtps2pi %1, %2
%elif cpuflag(3dnow)
pf2id %1, %2
%endif
%endmacro
;------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
; int len);
@ -116,355 +108,3 @@ INT32_TO_FLOAT_FMUL_ARRAY8
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_ARRAY8
;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------
; FLOAT_TO_INT16 xmm_regs
; Emits float_to_int16 for the instruction set selected by the preceding
; INIT_*; %1 is forwarded to cglobal as the number of XMM registers used.
; Every iteration converts 8 floats (32 source bytes) into 8 int16
; (16 destination bytes), saturating through packssdw.
%macro FLOAT_TO_INT16 1
cglobal float_to_int16, 3, 3, %1, dst, src, len
; lenq becomes a negative byte offset into dst that counts up to zero:
; lenq = 2*len, src and dst are moved to their ends, then lenq is negated.
; The source offset is 2*lenq because a float is twice the size of an int16.
add lenq, lenq
lea srcq, [srcq+2*lenq]
add dstq, lenq
neg lenq
.loop:
%if cpuflag(sse2)
cvtps2dq m0, [srcq+2*lenq ]
cvtps2dq m1, [srcq+2*lenq+16]
packssdw m0, m1
mova [dstq+lenq], m0
%else
; MMX-register variant: four conversions of two floats each.
CVTPS2PI m0, [srcq+2*lenq ]
CVTPS2PI m1, [srcq+2*lenq+ 8]
CVTPS2PI m2, [srcq+2*lenq+16]
CVTPS2PI m3, [srcq+2*lenq+24]
packssdw m0, m1
packssdw m2, m3
mova [dstq+lenq ], m0
mova [dstq+lenq+8], m2
%endif
add lenq, 16
js .loop
; MMX registers were used: emms is issued before returning.
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
; Instantiations: sse2 (XMM registers), sse and 3dnow (MMX registers).
INIT_XMM sse2
FLOAT_TO_INT16 2
INIT_MMX sse
FLOAT_TO_INT16 0
INIT_MMX 3dnow
FLOAT_TO_INT16 0
;------------------------------------------------------------------------------
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
;------------------------------------------------------------------------------
; FLOAT_TO_INT16_STEP xmm_regs
; Emits float_to_int16_step: converts len floats to int16 and writes them
; `step` int16 elements apart (output sample k goes to dst + 2*step*k bytes).
; Eight samples are handled per iteration; %1 is the XMM register count
; forwarded to cglobal.
%macro FLOAT_TO_INT16_STEP 1
cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
; lenq becomes a negative counter (2 bytes per sample) and src is moved to
; its end; dst is advanced explicitly inside the loop instead.
add lenq, lenq
lea srcq, [srcq+2*lenq]
lea step3q, [stepq*3]
neg lenq
.loop:
%if cpuflag(sse2)
cvtps2dq m0, [srcq+2*lenq ]
cvtps2dq m1, [srcq+2*lenq+16]
packssdw m0, m1
; Each movd extracts two packed samples; the low word is stored first and,
; after the shift by 16, the high word. The offsets 0, 2*step, 4*step and
; 6*step bytes address four consecutive output slots.
movd v1d, m0
psrldq m0, 4
movd v2d, m0
psrldq m0, 4
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
movd v1d, m0
psrldq m0, 4
movd v2d, m0
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
%else
; MMX-register variant: same store pattern, four samples per register.
CVTPS2PI m0, [srcq+2*lenq ]
CVTPS2PI m1, [srcq+2*lenq+ 8]
CVTPS2PI m2, [srcq+2*lenq+16]
CVTPS2PI m3, [srcq+2*lenq+24]
packssdw m0, m1
packssdw m2, m3
movd v1d, m0
psrlq m0, 32
movd v2d, m0
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
movd v1d, m2
psrlq m2, 32
movd v2d, m2
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
%endif
add lenq, 16
js .loop
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
; Instantiations: sse2 (XMM registers), sse and 3dnow (MMX registers).
INIT_XMM sse2
FLOAT_TO_INT16_STEP 2
INIT_MMX sse
FLOAT_TO_INT16_STEP 0
INIT_MMX 3dnow
FLOAT_TO_INT16_STEP 0
;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
;-------------------------------------------------------------------------------
; FLOAT_TO_INT16_INTERLEAVE2
; Emits float_to_int16_interleave2: converts two planar float channels to
; interleaved int16, four stereo frames (16 output bytes) per iteration.
%macro FLOAT_TO_INT16_INTERLEAVE2 0
cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
; Three arguments are loaded (dst, src, len), but the registers are named
; dst, src0, src1, len: the incoming length therefore sits in r2 (named
; src1) and is read through r2q before that register is reloaded with
; src[1]. lenq = 4*len is the byte size of one input channel and also of
; the stereo int16 output.
lea lenq, [4*r2q]
mov src1q, [src0q+gprsize]
mov src0q, [src0q]
; All three pointers are moved to their ends; lenq counts up from -4*len.
add dstq, lenq
add src0q, lenq
add src1q, lenq
neg lenq
.loop:
%if cpuflag(sse2)
cvtps2dq m0, [src0q+lenq]
cvtps2dq m1, [src1q+lenq]
packssdw m0, m1
; m0 now holds 4 samples of channel 0 in its low half and 4 of channel 1
; in its high half; movhlps + punpcklwd interleave them word by word.
movhlps m1, m0
punpcklwd m0, m1
mova [dstq+lenq], m0
%else
CVTPS2PI m0, [src0q+lenq ]
CVTPS2PI m1, [src0q+lenq+8]
CVTPS2PI m2, [src1q+lenq ]
CVTPS2PI m3, [src1q+lenq+8]
packssdw m0, m1
packssdw m2, m3
mova m1, m0
punpcklwd m0, m2
punpckhwd m1, m2
mova [dstq+lenq ], m0
mova [dstq+lenq+8], m1
%endif
add lenq, 16
js .loop
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
; Instantiations: 3dnow and sse (MMX registers), sse2 (XMM registers).
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE2
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE2
INIT_XMM sse2
FLOAT_TO_INT16_INTERLEAVE2
;-----------------------------------------------------------------------------
; void ff_float_to_int16_interleave6(int16_t *dst, const float **src, int len)
;-----------------------------------------------------------------------------
; FLOAT_TO_INT16_INTERLEAVE6
; Emits float_to_int16_interleave6: converts six planar float channels to
; interleaved int16 using MMX registers. One iteration consumes 2 samples
; per channel (8 source bytes each) and writes 2 frames (24 bytes).
%macro FLOAT_TO_INT16_INTERLEAVE6 0
cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
; Only dst and src are loaded by cglobal. On x86-64 the length is copied
; from r2d into the register named len; otherwise lend is defined as the
; dword memory operand r2m.
%if ARCH_X86_64
mov lend, r2d
%else
%define lend dword r2m
%endif
; src1..src5 are turned into offsets relative to channel 0, so advancing
; srcq alone moves through all six channels.
mov src1q, [srcq+1*gprsize]
mov src2q, [srcq+2*gprsize]
mov src3q, [srcq+3*gprsize]
mov src4q, [srcq+4*gprsize]
mov src5q, [srcq+5*gprsize]
mov srcq, [srcq]
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
sub src4q, srcq
sub src5q, srcq
.loop:
CVTPS2PI mm0, [srcq]
CVTPS2PI mm1, [srcq+src1q]
CVTPS2PI mm2, [srcq+src2q]
CVTPS2PI mm3, [srcq+src3q]
CVTPS2PI mm4, [srcq+src4q]
CVTPS2PI mm5, [srcq+src5q]
; Pack to int16 with saturation, then shuffle the twelve samples into
; frame order before the three 8-byte stores.
packssdw mm0, mm3
packssdw mm1, mm4
packssdw mm2, mm5
PSWAPD mm3, mm0
punpcklwd mm0, mm1
punpckhwd mm1, mm2
punpcklwd mm2, mm3
PSWAPD mm3, mm0
punpckldq mm0, mm2
punpckhdq mm2, mm1
punpckldq mm1, mm3
movq [dstq ], mm0
movq [dstq+16], mm2
movq [dstq+ 8], mm1
add srcq, 8
add dstq, 24
sub lend, 2
jg .loop
emms
RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
; Instantiations, all on MMX registers: sse, 3dnow and 3dnowext.
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnowext
FLOAT_TO_INT16_INTERLEAVE6
;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
; FLOAT_INTERLEAVE6 xmm_regs
; Emits float_interleave6: interleaves six planar float channels into one
; float array. One iteration consumes mmsize bytes (mmsize/4 samples) from
; every channel and writes 6*mmsize bytes. %1 is the XMM register count
; forwarded to cglobal.
%macro FLOAT_INTERLEAVE6 1
cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
; Only dst and src are loaded by cglobal. On x86-64 the length is copied
; from r2d into the register named len; otherwise lend is defined as the
; dword memory operand r2m.
%if ARCH_X86_64
mov lend, r2d
%else
%define lend dword r2m
%endif
; src1..src5 become offsets relative to channel 0, so only srcq has to be
; advanced in the loop.
mov src1q, [srcq+1*gprsize]
mov src2q, [srcq+2*gprsize]
mov src3q, [srcq+3*gprsize]
mov src4q, [srcq+4*gprsize]
mov src5q, [srcq+5*gprsize]
mov srcq, [srcq]
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
sub src4q, srcq
sub src5q, srcq
.loop:
%if cpuflag(sse)
; SSE: 4 samples per channel; movaps is used for all loads and stores.
movaps m0, [srcq]
movaps m1, [srcq+src1q]
movaps m2, [srcq+src2q]
movaps m3, [srcq+src3q]
movaps m4, [srcq+src4q]
movaps m5, [srcq+src5q]
SBUTTERFLYPS 0, 1, 6
SBUTTERFLYPS 2, 3, 6
SBUTTERFLYPS 4, 5, 6
movaps m6, m4
shufps m4, m0, 0xe4
movlhps m0, m2
movhlps m6, m2
movaps [dstq ], m0
movaps [dstq+16], m4
movaps [dstq+32], m6
movaps m6, m5
shufps m5, m1, 0xe4
movlhps m1, m3
movhlps m6, m3
movaps [dstq+48], m1
movaps [dstq+64], m5
movaps [dstq+80], m6
%else ; mmx
; MMX: 2 samples per channel.
movq m0, [srcq]
movq m1, [srcq+src1q]
movq m2, [srcq+src2q]
movq m3, [srcq+src3q]
movq m4, [srcq+src4q]
movq m5, [srcq+src5q]
SBUTTERFLY dq, 0, 1, 6
SBUTTERFLY dq, 2, 3, 6
SBUTTERFLY dq, 4, 5, 6
movq [dstq ], m0
movq [dstq+ 8], m2
movq [dstq+16], m4
movq [dstq+24], m1
movq [dstq+32], m3
movq [dstq+40], m5
%endif
add srcq, mmsize
add dstq, mmsize*6
sub lend, mmsize/4
jg .loop
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
; Instantiations: mmx (MMX registers) and sse (XMM registers, 7 used).
INIT_MMX mmx
FLOAT_INTERLEAVE6 0
INIT_XMM sse
FLOAT_INTERLEAVE6 7
;-----------------------------------------------------------------------------
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
; FLOAT_INTERLEAVE2 xmm_regs
; Emits float_interleave2: interleaves two planar float channels. One
; iteration reads 2*mmsize bytes (mmsize/2 samples) from each channel and
; writes 4*mmsize bytes. PUNPCKLDQ / PUNPCKHDQ must be defined by the
; instantiation to the unpack instructions matching the register type.
%macro FLOAT_INTERLEAVE2 1
cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
; src1q becomes the offset of channel 1 relative to channel 0.
mov src1q, [srcq+gprsize]
mov srcq, [srcq ]
sub src1q, srcq
.loop:
mova m0, [srcq ]
mova m1, [srcq+src1q ]
mova m3, [srcq +mmsize]
mova m4, [srcq+src1q+mmsize]
; Low/high unpack of each register pair yields the interleaved output.
mova m2, m0
PUNPCKLDQ m0, m1
PUNPCKHDQ m2, m1
mova m1, m3
PUNPCKLDQ m3, m4
PUNPCKHDQ m1, m4
mova [dstq ], m0
mova [dstq+1*mmsize], m2
mova [dstq+2*mmsize], m3
mova [dstq+3*mmsize], m1
add srcq, mmsize*2
add dstq, mmsize*4
sub lend, mmsize/2
jg .loop
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
; mmx instantiation: integer dword unpacks on MMX registers.
INIT_MMX mmx
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 0
; sse instantiation: single-precision unpacks on XMM registers.
INIT_XMM sse
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 5

View File

@ -37,84 +37,6 @@ void ff_int32_to_float_fmul_array8_sse (FmtConvertContext *c, float *dst, const
/* SSE2 flavour of the int32 -> float "array8" routine; only declared here.
 * NOTE(review): presumably implemented in the accompanying assembly - confirm. */
void ff_int32_to_float_fmul_array8_sse2(FmtConvertContext *c, float *dst, const int32_t *src,
const float *mul, int len);
/* Float -> int16 conversion of a single buffer, one prototype per CPU flavour. */
void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse (int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
/* Same conversion with an extra step argument. The generic interleaver below
 * calls these once per channel with dst + channel index and step = channels. */
void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step);
void ff_float_to_int16_step_sse (int16_t *dst, const float *src, long len, long step);
void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step);
/* Dedicated two-channel interleaving converters; src is an array of channel pointers. */
void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len);
void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
/* Dedicated six-channel interleaving converters.
 * NOTE(review): these take int len while the wrappers below pass a long -
 * confirm which width the implementations actually expect. */
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnowext(int16_t *dst, const float **src, int len);
/* No separate SSE2 six-channel routine is declared; alias the name to the SSE
 * one so that FLOAT_TO_INT16_INTERLEAVE(sse2) resolves. */
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
/*
 * FLOAT_TO_INT16_INTERLEAVE(cpu) emits two static functions for one CPU
 * flavour:
 *   float_to_int16_interleave_misc_<cpu>() - generic path: one strided
 *       conversion pass per channel (dst offset = channel index,
 *       step = channel count);
 *   float_to_int16_interleave_<cpu>()      - dispatcher that picks the
 *       dedicated 1-, 2- or 6-channel routine and falls back to the generic
 *       path for every other channel count.
 */
#define FLOAT_TO_INT16_INTERLEAVE(cpu) \
/* Kept out of line: gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2 */\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    int ch = 0;\
    while (ch < channels) {\
        ff_float_to_int16_step_##cpu(dst + ch, src[ch], len, channels);\
        ch++;\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    switch (channels) {\
    case 1:\
        ff_float_to_int16_##cpu(dst, src[0], len);\
        break;\
    case 2:\
        ff_float_to_int16_interleave2_##cpu(dst, src, len);\
        break;\
    case 6:\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
        break;\
    default:\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
        break;\
    }\
}

FLOAT_TO_INT16_INTERLEAVE(3dnow)
FLOAT_TO_INT16_INTERLEAVE(sse)
FLOAT_TO_INT16_INTERLEAVE(sse2)
/**
 * Channel dispatcher for the 3DNow!ext flavour.
 *
 * Only the six-channel layout has a dedicated 3DNow!ext routine; every other
 * channel count is forwarded to the plain 3DNow! dispatcher.
 */
static void float_to_int16_interleave_3dnowext(int16_t *dst, const float **src,
                                               long len, int channels)
{
    if (channels != 6) {
        float_to_int16_interleave_3dnow(dst, src, len, channels);
        return;
    }

    ff_float_to_int16_interleave6_3dnowext(dst, src, len);
}
/* Float -> float channel interleavers for exactly two and exactly six
 * channels, in MMX and SSE flavours. src is an array of per-channel pointers.
 * Only declared here.
 * NOTE(review): presumably implemented in the accompanying assembly - confirm. */
void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);
/**
 * MMX float interleaver: picks the dedicated two- or six-channel routine and
 * hands every other channel count to the C implementation.
 */
static void float_interleave_mmx(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    switch (channels) {
    case 2:
        ff_float_interleave2_mmx(dst, src, len);
        return;
    case 6:
        ff_float_interleave6_mmx(dst, src, len);
        return;
    default:
        ff_float_interleave_c(dst, src, len, channels);
        return;
    }
}
/**
 * SSE float interleaver: picks the dedicated two- or six-channel routine and
 * hands every other channel count to the C implementation.
 */
static void float_interleave_sse(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    switch (channels) {
    case 2:
        ff_float_interleave2_sse(dst, src, len);
        return;
    case 6:
        ff_float_interleave6_sse(dst, src, len);
        return;
    default:
        ff_float_interleave_c(dst, src, len, channels);
        return;
    }
}
#endif /* HAVE_YASM */
av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
@ -122,32 +44,13 @@ av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx
/* The whole body is compiled only when HAVE_YASM is set; otherwise the
 * function leaves *c untouched. */
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
/* The checks below are independent ifs rather than an else-chain: every
 * matching block runs in order, so a later block's assignment replaces an
 * earlier one for the same field of *c. */
if (EXTERNAL_MMX(cpu_flags)) {
c->float_interleave = float_interleave_mmx;
}
/* The 3DNow! float -> int16 routines are installed only when
 * CODEC_FLAG_BITEXACT is not set.
 * NOTE(review): presumably their output is not bit-identical to the C
 * version - confirm. */
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->float_to_int16 = ff_float_to_int16_3dnow;
c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
}
}
/* 3DNow!ext replaces only the interleaving hook, under the same
 * bit-exactness condition. */
if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->float_to_int16_interleave = float_to_int16_interleave_3dnowext;
}
}
/* SSE installs all of its routines regardless of CODEC_FLAG_BITEXACT. */
if (EXTERNAL_SSE(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse;
c->float_to_int16 = ff_float_to_int16_sse;
c->float_to_int16_interleave = float_to_int16_interleave_sse;
c->float_interleave = float_interleave_sse;
}
/* SSE2 replaces the int32 -> float and float -> int16 hooks; float_interleave
 * is not assigned here and keeps whatever an earlier block stored. */
if (EXTERNAL_SSE2(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_sse2;
c->float_to_int16 = ff_float_to_int16_sse2;
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
}
#endif /* HAVE_YASM */
}