mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-26 19:01:44 +02:00
dsputil: Move APE-specific bits into apedsp
This commit is contained in:
parent
256da0770e
commit
054013a0fc
@ -25,6 +25,7 @@
|
||||
#include "libavutil/avassert.h"
|
||||
#include "libavutil/channel_layout.h"
|
||||
#include "libavutil/opt.h"
|
||||
#include "apedsp.h"
|
||||
#include "avcodec.h"
|
||||
#include "dsputil.h"
|
||||
#include "bytestream.h"
|
||||
@ -136,6 +137,7 @@ typedef struct APEContext {
|
||||
AVClass *class; ///< class for AVOptions
|
||||
AVCodecContext *avctx;
|
||||
DSPContext dsp;
|
||||
APEDSPContext adsp;
|
||||
int channels;
|
||||
int samples; ///< samples left to decode in current frame
|
||||
int bps;
|
||||
@ -195,8 +197,6 @@ static void predictor_decode_stereo_3930(APEContext *ctx, int count);
|
||||
static void predictor_decode_mono_3950(APEContext *ctx, int count);
|
||||
static void predictor_decode_stereo_3950(APEContext *ctx, int count);
|
||||
|
||||
// TODO: dsputilize
|
||||
|
||||
static av_cold int ape_decode_close(AVCodecContext *avctx)
|
||||
{
|
||||
APEContext *s = avctx->priv_data;
|
||||
@ -212,6 +212,19 @@ static av_cold int ape_decode_close(AVCodecContext *avctx)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Reference C implementation of APEDSPContext.scalarproduct_and_madd_int16.
 *
 * Computes the scalar product of v1 and v2 and, in the same pass, updates
 * v1 in place with v1[i] += mul * v3[i]. Each product uses the value of
 * v1[i] from before its update.
 *
 * @param v1    first input vector, read and then updated in place
 * @param v2    second input vector (read only)
 * @param v3    vector scaled by mul and added to v1 (read only)
 * @param order number of elements to process; values <= 0 process nothing
 * @param mul   factor applied to each element of v3
 * @return the scalar product, reduced modulo 2^32
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    /* Accumulate in unsigned arithmetic: the sum of int16 products can
     * exceed INT_MAX and signed overflow is undefined behaviour, whereas
     * unsigned arithmetic wraps. */
    unsigned res = 0;

    /* "> 0" keeps a negative order from running off the buffers. */
    while (order-- > 0) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return (int32_t)res;
}
|
||||
|
||||
static av_cold int ape_decode_init(AVCodecContext *avctx)
|
||||
{
|
||||
APEContext *s = avctx->priv_data;
|
||||
@ -292,6 +305,15 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
|
||||
s->predictor_decode_stereo = predictor_decode_stereo_3950;
|
||||
}
|
||||
|
||||
s->adsp.scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
|
||||
|
||||
if (ARCH_ARM)
|
||||
ff_apedsp_init_arm(&s->adsp);
|
||||
if (ARCH_PPC)
|
||||
ff_apedsp_init_ppc(&s->adsp);
|
||||
if (ARCH_X86)
|
||||
ff_apedsp_init_x86(&s->adsp);
|
||||
|
||||
ff_dsputil_init(&s->dsp, avctx);
|
||||
avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
|
||||
|
||||
@ -1263,9 +1285,10 @@ static void do_apply_filter(APEContext *ctx, int version, APEFilter *f,
|
||||
|
||||
while (count--) {
|
||||
/* round fixedpoint scalar product */
|
||||
res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order,
|
||||
f->adaptcoeffs - order,
|
||||
order, APESIGN(*data));
|
||||
res = ctx->adsp.scalarproduct_and_madd_int16(f->coeffs,
|
||||
f->delay - order,
|
||||
f->adaptcoeffs - order,
|
||||
order, APESIGN(*data));
|
||||
res = (res + (1 << (fracbits - 1))) >> fracbits;
|
||||
res += *data;
|
||||
*data++ = res;
|
||||
|
44
libavcodec/apedsp.h
Normal file
44
libavcodec/apedsp.h
Normal file
@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Monkey's Audio lossless audio decoder
|
||||
* Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
|
||||
* based upon libdemac from Dave Chapman.
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_APEDSP_H
|
||||
#define AVCODEC_APEDSP_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
typedef struct APEDSPContext {
|
||||
/**
 * Calculate the scalar product of v1 and v2 and, in the same pass,
 * update v1 in place: v1[i] += v3[i] * mul.
 *
 * @param v1  first vector, read and then overwritten (align 16)
 * @param v2  second vector, read only
 * @param v3  vector scaled by mul and added to v1, read only
 * @param len length of vectors, should be multiple of 16
 * @param mul factor applied to each element of v3
 * @return the scalar product of v2 with the values v1 held before the update
 */
|
||||
int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
|
||||
const int16_t *v2,
|
||||
const int16_t *v3,
|
||||
int len, int mul);
|
||||
} APEDSPContext;
|
||||
|
||||
void ff_apedsp_init_arm(APEDSPContext *c);
|
||||
void ff_apedsp_init_ppc(APEDSPContext *c);
|
||||
void ff_apedsp_init_x86(APEDSPContext *c);
|
||||
|
||||
#endif /* AVCODEC_APEDSP_H */
|
@ -24,6 +24,7 @@ OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o
|
||||
|
||||
OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
|
||||
arm/sbrdsp_init_arm.o
|
||||
OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_init_arm.o
|
||||
OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o
|
||||
OBJS-$(CONFIG_FLAC_DECODER) += arm/flacdsp_init_arm.o \
|
||||
arm/flacdsp_arm.o
|
||||
@ -97,6 +98,7 @@ NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o
|
||||
|
||||
NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
|
||||
arm/sbrdsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
|
||||
arm/synth_filter_neon.o
|
||||
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
|
||||
|
38
libavcodec/arm/apedsp_init_arm.c
Normal file
38
libavcodec/arm/apedsp_init_arm.c
Normal file
@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/apedsp.h"
|
||||
|
||||
int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
|
||||
const int16_t *v3, int len, int mul);
|
||||
|
||||
/**
 * Install the ARM-optimised APE DSP routines into c.
 *
 * The NEON version replaces the current function pointer only when
 * have_neon() accepts the CPU flags; otherwise c is left unchanged.
 */
av_cold void ff_apedsp_init_arm(APEDSPContext *c)
{
    const int flags = av_get_cpu_flags();

    if (!have_neon(flags))
        return;

    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
}
|
62
libavcodec/arm/apedsp_neon.S
Normal file
62
libavcodec/arm/apedsp_neon.S
Normal file
@ -0,0 +1,62 @@
|
||||
/*
|
||||
* ARM NEON optimised integer operations
|
||||
* Copyright (c) 2009 Kostya Shishkov
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1 /*aligned*/,
@         const int16_t *v2, const int16_t *v3, int len, int mul)
@ Returns the scalar product of v1 and v2 and updates v1[i] += mul * v3[i].
@ Register use below: r0 = v1 (read pointer), r12 = v1 (write pointer),
@ r1 = v2, r2 = v3, r3 = len; mul is loaded from [sp].
@ 16 elements are handled per iteration and the loop only exits when r3
@ reaches exactly zero, so len must be a non-zero multiple of 16.
|
||||
function ff_scalarproduct_and_madd_int16_neon, export=1
|
||||
vld1.16 {d28[],d29[]}, [sp]    @ q14 = 16-bit mul replicated into all eight lanes
|
||||
vmov.i16 q0, #0                @ q0-q3: four independent 32-bit accumulators
|
||||
vmov.i16 q1, #0
|
||||
vmov.i16 q2, #0
|
||||
vmov.i16 q3, #0
|
||||
mov r12, r0                    @ separate store pointer for v1
|
||||
|
||||
1: vld1.16 {d16-d17}, [r0,:128]!    @ q8  = v1[0..7]  (128-bit aligned access)
|
||||
vld1.16 {d18-d19}, [r1]!       @ q9  = v2[0..7]
|
||||
vld1.16 {d20-d21}, [r2]!       @ q10 = v3[0..7]
|
||||
vld1.16 {d22-d23}, [r0,:128]!  @ q11 = v1[8..15]
|
||||
vld1.16 {d24-d25}, [r1]!       @ q12 = v2[8..15]
|
||||
vld1.16 {d26-d27}, [r2]!       @ q13 = v3[8..15]
|
||||
vmul.s16 q10, q10, q14         @ v3 * mul, low 16 bits of each product
|
||||
vmul.s16 q13, q13, q14
|
||||
vmlal.s16 q0, d16, d18         @ accumulate widened v1 * v2 products
|
||||
vmlal.s16 q1, d17, d19
|
||||
vadd.s16 q10, q8, q10          @ new v1 = old v1 + v3 * mul (16-bit wrap)
|
||||
vadd.s16 q13, q11, q13
|
||||
vmlal.s16 q2, d22, d24
|
||||
vmlal.s16 q3, d23, d25
|
||||
vst1.16 {q10}, [r12,:128]!     @ write back v1[0..7]
|
||||
subs r3, r3, #16               @ 16 elements consumed; sets flags for bne
|
||||
vst1.16 {q13}, [r12,:128]!     @ write back v1[8..15]
|
||||
bne 1b
|
||||
|
||||
vpadd.s32 d16, d0, d1          @ pairwise-add the 16 partial sums down to one
|
||||
vpadd.s32 d17, d2, d3
|
||||
vpadd.s32 d18, d4, d5
|
||||
vpadd.s32 d19, d6, d7
|
||||
vpadd.s32 d0, d16, d17
|
||||
vpadd.s32 d1, d18, d19
|
||||
vpadd.s32 d2, d0, d1
|
||||
vpaddl.s32 d3, d2              @ final pair summed into a 64-bit value
|
||||
vmov.32 r0, d3[0]              @ return its low 32 bits
|
||||
bx lr
|
||||
endfunc
|
@ -44,9 +44,6 @@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
|
||||
|
||||
int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
|
||||
|
||||
int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
|
||||
const int16_t *v3, int len, int mul);
|
||||
|
||||
av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
|
||||
unsigned high_bit_depth)
|
||||
{
|
||||
@ -73,6 +70,4 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
|
||||
c->vector_clip_int32 = ff_vector_clip_int32_neon;
|
||||
|
||||
c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
|
||||
|
||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
|
||||
}
|
||||
|
@ -48,43 +48,3 @@ function ff_scalarproduct_int16_neon, export=1
|
||||
vmov.32 r0, d3[0]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
|
||||
function ff_scalarproduct_and_madd_int16_neon, export=1
|
||||
vld1.16 {d28[],d29[]}, [sp]
|
||||
vmov.i16 q0, #0
|
||||
vmov.i16 q1, #0
|
||||
vmov.i16 q2, #0
|
||||
vmov.i16 q3, #0
|
||||
mov r12, r0
|
||||
|
||||
1: vld1.16 {d16-d17}, [r0,:128]!
|
||||
vld1.16 {d18-d19}, [r1]!
|
||||
vld1.16 {d20-d21}, [r2]!
|
||||
vld1.16 {d22-d23}, [r0,:128]!
|
||||
vld1.16 {d24-d25}, [r1]!
|
||||
vld1.16 {d26-d27}, [r2]!
|
||||
vmul.s16 q10, q10, q14
|
||||
vmul.s16 q13, q13, q14
|
||||
vmlal.s16 q0, d16, d18
|
||||
vmlal.s16 q1, d17, d19
|
||||
vadd.s16 q10, q8, q10
|
||||
vadd.s16 q13, q11, q13
|
||||
vmlal.s16 q2, d22, d24
|
||||
vmlal.s16 q3, d23, d25
|
||||
vst1.16 {q10}, [r12,:128]!
|
||||
subs r3, r3, #16
|
||||
vst1.16 {q13}, [r12,:128]!
|
||||
bne 1b
|
||||
|
||||
vpadd.s32 d16, d0, d1
|
||||
vpadd.s32 d17, d2, d3
|
||||
vpadd.s32 d18, d4, d5
|
||||
vpadd.s32 d19, d6, d7
|
||||
vpadd.s32 d0, d16, d17
|
||||
vpadd.s32 d1, d18, d19
|
||||
vpadd.s32 d2, d0, d1
|
||||
vpaddl.s32 d3, d2
|
||||
vmov.32 r0, d3[0]
|
||||
bx lr
|
||||
endfunc
|
||||
|
@ -2069,19 +2069,6 @@ static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
|
||||
return res;
|
||||
}
|
||||
|
||||
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
|
||||
const int16_t *v3,
|
||||
int order, int mul)
|
||||
{
|
||||
int res = 0;
|
||||
|
||||
while (order--) {
|
||||
res += *v1 * *v2++;
|
||||
*v1++ += mul * *v3++;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
|
||||
int32_t max, unsigned int len)
|
||||
{
|
||||
@ -2294,8 +2281,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
|
||||
c->try_8x8basis = try_8x8basis_c;
|
||||
c->add_8x8basis = add_8x8basis_c;
|
||||
|
||||
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
|
||||
|
||||
c->scalarproduct_int16 = scalarproduct_int16_c;
|
||||
c->vector_clip_int32 = vector_clip_int32_c;
|
||||
c->vector_clipf = vector_clipf_c;
|
||||
|
@ -255,16 +255,6 @@ typedef struct DSPContext {
|
||||
*/
|
||||
int32_t (*scalarproduct_int16)(const int16_t *v1,
|
||||
const int16_t *v2 /* align 16 */, int len);
|
||||
/* ape functions */
|
||||
/**
|
||||
* Calculate scalar product of v1 and v2,
|
||||
* and v1[i] += v3[i] * mul
|
||||
* @param len length of vectors, should be multiple of 16
|
||||
*/
|
||||
int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
|
||||
const int16_t *v2,
|
||||
const int16_t *v3,
|
||||
int len, int mul);
|
||||
|
||||
/**
|
||||
* Clip each element in an array of int32_t to a given minimum and
|
||||
|
@ -12,6 +12,7 @@ OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o
|
||||
OBJS-$(CONFIG_VIDEODSP) += ppc/videodsp_ppc.o
|
||||
OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o
|
||||
|
||||
OBJS-$(CONFIG_APE_DECODER) += ppc/apedsp_altivec.o
|
||||
OBJS-$(CONFIG_SVQ1_ENCODER) += ppc/svq1enc_altivec.o
|
||||
OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o
|
||||
OBJS-$(CONFIG_VORBIS_DECODER) += ppc/vorbisdsp_altivec.o
|
||||
|
77
libavcodec/ppc/apedsp_altivec.c
Normal file
77
libavcodec/ppc/apedsp_altivec.c
Normal file
@ -0,0 +1,77 @@
|
||||
/*
|
||||
* Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavcodec/apedsp.h"
|
||||
|
||||
#if HAVE_ALTIVEC
|
||||
/*
 * AltiVec version of scalarproduct_and_madd_int16: accumulates the scalar
 * product of v1 and v2 while updating v1[i] += mul * v3[i] in place.
 * Each loop iteration handles 16 elements (two vectors of eight int16).
 *
 * NOTE(review): v1 is accessed through a vec_s16 pointer, so it presumably
 * must be 16-byte aligned. v2 and v3 are realigned with vec_perm, but the
 * single permute mask is derived from v2 only, so v3 is assumed to share
 * v2's misalignment - confirm against callers.
 * NOTE(review): the do/while body runs before order is tested, so order is
 * expected to be a non-zero multiple of 16.
 */
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
|
||||
const int16_t *v2,
|
||||
const int16_t *v3,
|
||||
int order, int mul)
|
||||
{
|
||||
LOAD_ZERO; /* presumably provides zero_s32v used below - confirm in types/util header */
|
||||
vec_s16 *pv1 = (vec_s16 *) v1;
|
||||
register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul }; /* mul in every 16-bit lane */
|
||||
register vec_s16 t0, t1, i0, i1, i4;
|
||||
register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3); /* preload the first vectors of v2 and v3 */
|
||||
register vec_s32 res = zero_s32v; /* 32-bit partial sums */
|
||||
register vec_u8 align = vec_lvsl(0, v2); /* permute mask built from v2's address */
|
||||
int32_t ires;
|
||||
|
||||
order >>= 4; /* convert element count to 16-element iterations */
|
||||
do {
|
||||
/* gather 16 elements of v2 into t0/t1; i2 carries over to the next iteration */
i1 = vec_ld(16, v2);
|
||||
t0 = vec_perm(i2, i1, align);
|
||||
i2 = vec_ld(32, v2);
|
||||
t1 = vec_perm(i1, i2, align);
|
||||
/* current v1 values, used for both the product and the update */
i0 = pv1[0];
|
||||
i1 = pv1[1];
|
||||
res = vec_msum(t0, i0, res);
|
||||
res = vec_msum(t1, i1, res);
|
||||
/* gather 16 elements of v3 the same way, reusing the v2-derived mask */
i4 = vec_ld(16, v3);
|
||||
t0 = vec_perm(i3, i4, align);
|
||||
i3 = vec_ld(32, v3);
|
||||
t1 = vec_perm(i4, i3, align);
|
||||
/* v1 = v3 * mul + v1 */
pv1[0] = vec_mladd(t0, muls, i0);
|
||||
pv1[1] = vec_mladd(t1, muls, i1);
|
||||
pv1 += 2;
|
||||
v2 += 16;
|
||||
v3 += 16;
|
||||
} while (--order);
|
||||
/* reduce the partial sums with vec_sums, take element 3 and store it as a scalar */
res = vec_splat(vec_sums(res, zero_s32v), 3);
|
||||
vec_ste(res, 0, &ires);
|
||||
|
||||
return ires;
|
||||
}
|
||||
#endif /* HAVE_ALTIVEC */
|
||||
|
||||
/**
 * Install the PowerPC-optimised APE DSP routines into c.
 *
 * The AltiVec version is selected only when it was compiled in
 * (HAVE_ALTIVEC) and the CPU flags reported at run time include
 * AV_CPU_FLAG_ALTIVEC, matching the run-time checks done by the ARM and
 * x86 init functions. Otherwise c is left unchanged.
 */
av_cold void ff_apedsp_init_ppc(APEDSPContext *c)
{
#if HAVE_ALTIVEC
    /* Being compiled with AltiVec support does not guarantee that the CPU
     * executing the code has it, so check before installing the routine. */
    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
        return;

    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
#endif /* HAVE_ALTIVEC */
}
|
@ -56,49 +56,7 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
|
||||
return ires;
|
||||
}
|
||||
|
||||
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
|
||||
const int16_t *v2,
|
||||
const int16_t *v3,
|
||||
int order, int mul)
|
||||
{
|
||||
LOAD_ZERO;
|
||||
vec_s16 *pv1 = (vec_s16 *) v1;
|
||||
register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
|
||||
register vec_s16 t0, t1, i0, i1, i4;
|
||||
register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
|
||||
register vec_s32 res = zero_s32v;
|
||||
register vec_u8 align = vec_lvsl(0, v2);
|
||||
int32_t ires;
|
||||
|
||||
order >>= 4;
|
||||
do {
|
||||
i1 = vec_ld(16, v2);
|
||||
t0 = vec_perm(i2, i1, align);
|
||||
i2 = vec_ld(32, v2);
|
||||
t1 = vec_perm(i1, i2, align);
|
||||
i0 = pv1[0];
|
||||
i1 = pv1[1];
|
||||
res = vec_msum(t0, i0, res);
|
||||
res = vec_msum(t1, i1, res);
|
||||
i4 = vec_ld(16, v3);
|
||||
t0 = vec_perm(i3, i4, align);
|
||||
i3 = vec_ld(32, v3);
|
||||
t1 = vec_perm(i4, i3, align);
|
||||
pv1[0] = vec_mladd(t0, muls, i0);
|
||||
pv1[1] = vec_mladd(t1, muls, i1);
|
||||
pv1 += 2;
|
||||
v2 += 16;
|
||||
v3 += 16;
|
||||
} while (--order);
|
||||
res = vec_splat(vec_sums(res, zero_s32v), 3);
|
||||
vec_ste(res, 0, &ires);
|
||||
|
||||
return ires;
|
||||
}
|
||||
|
||||
av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
|
||||
{
|
||||
c->scalarproduct_int16 = scalarproduct_int16_altivec;
|
||||
|
||||
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
|
||||
}
|
||||
|
@ -25,6 +25,7 @@ OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
|
||||
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
|
||||
|
||||
OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
|
||||
OBJS-$(CONFIG_APE_DECODER) += x86/apedsp_init.o
|
||||
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
|
||||
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o
|
||||
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
|
||||
@ -89,6 +90,7 @@ YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
|
||||
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
|
||||
|
||||
YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
|
||||
YASM-OBJS-$(CONFIG_APE_DECODER) += x86/apedsp.o
|
||||
YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
|
||||
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
|
||||
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
|
||||
|
167
libavcodec/x86/apedsp.asm
Normal file
167
libavcodec/x86/apedsp.asm
Normal file
@ -0,0 +1,167 @@
|
||||
;******************************************************************************
|
||||
;* Copyright (c) 2008 Loren Merritt
|
||||
;*
|
||||
;* This file is part of Libav.
|
||||
;*
|
||||
;* Libav is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* Libav is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with Libav; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_TEXT
|
||||
|
||||
; Emits scalarproduct_and_madd_int16 for the currently selected instruction
; set (mmsize is 8 or 16). v1 is accessed with aligned moves, v2 and v3 with
; unaligned moves. Each iteration handles 2*mmsize bytes, i.e. mmsize int16
; elements, and the loop is entered unconditionally.
%macro SCALARPRODUCT 0
|
||||
; int32_t ff_scalarproduct_and_madd_int16(int16_t *v1, const int16_t *v2,
|
||||
;                                         const int16_t *v3, int order, int mul)
|
||||
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
|
||||
shl orderq, 1                  ; element count -> byte count
|
||||
movd m7, mulm
|
||||
; broadcast the low 16 bits of mul to every word of m7
%if mmsize == 16
|
||||
pshuflw m7, m7, 0
|
||||
punpcklqdq m7, m7
|
||||
%else
|
||||
pshufw m7, m7, 0
|
||||
%endif
|
||||
pxor m6, m6                    ; m6 = 32-bit partial sums
|
||||
; point all three vectors at their ends and index them with a negative
; byte offset that counts up to zero
add v1q, orderq
|
||||
add v2q, orderq
|
||||
add v3q, orderq
|
||||
neg orderq
|
||||
.loop:
|
||||
movu m0, [v2q + orderq]
|
||||
movu m1, [v2q + orderq + mmsize]
|
||||
mova m4, [v1q + orderq]
|
||||
mova m5, [v1q + orderq + mmsize]
|
||||
movu m2, [v3q + orderq]
|
||||
movu m3, [v3q + orderq + mmsize]
|
||||
pmaddwd m0, m4                 ; v2 * v1, adjacent products summed to dwords
|
||||
pmaddwd m1, m5
|
||||
pmullw m2, m7                  ; v3 * mul, low 16 bits
|
||||
pmullw m3, m7
|
||||
paddd m6, m0
|
||||
paddd m6, m1
|
||||
paddw m2, m4                   ; new v1 = v3 * mul + old v1
|
||||
paddw m3, m5
|
||||
mova [v1q + orderq], m2
|
||||
mova [v1q + orderq + mmsize], m3
|
||||
add orderq, mmsize*2
|
||||
jl .loop                       ; continue while the offset is still negative
|
||||
; horizontal add of the dwords in m6 down to a single value
%if mmsize == 16
|
||||
movhlps m0, m6
|
||||
paddd m6, m0
|
||||
pshuflw m0, m6, 0x4e
|
||||
%else
|
||||
pshufw m0, m6, 0x4e
|
||||
%endif
|
||||
paddd m6, m0
|
||||
movd eax, m6                   ; return value
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
; instantiate the macro once for MMXEXT and once for SSE2
INIT_MMX mmxext
|
||||
SCALARPRODUCT
|
||||
INIT_XMM sse2
|
||||
SCALARPRODUCT
|
||||
|
||||
; Loop body of the SSSE3 scalarproduct_and_madd_int16, specialised for one
; byte offset %1 passed to palignr (0 means v2/v3 are read directly).
; Expects on entry: orderq = remaining byte count (counts down to zero),
; m7 = mul in every word, m6 = accumulator, and for %1 != 0, m4/m5 = the
; aligned vectors of v2/v3 at [v2q + orderq] / [v3q + orderq].
; Vectors are processed from the end towards the start.
%macro SCALARPRODUCT_LOOP 1
|
||||
align 16
|
||||
.loop%1:
|
||||
sub orderq, mmsize*2           ; sets the flags tested by jg at the bottom
|
||||
%if %1
|
||||
; combine each pair of neighbouring aligned loads with palignr to obtain
; the data at byte offset %1; m4 keeps the lowest vector for the next pass
mova m1, m4
|
||||
mova m4, [v2q + orderq]
|
||||
mova m0, [v2q + orderq + mmsize]
|
||||
palignr m1, m0, %1
|
||||
palignr m0, m4, %1
|
||||
; same scheme for v3, carried in m5
mova m3, m5
|
||||
mova m5, [v3q + orderq]
|
||||
mova m2, [v3q + orderq + mmsize]
|
||||
palignr m3, m2, %1
|
||||
palignr m2, m5, %1
|
||||
%else
|
||||
mova m0, [v2q + orderq]
|
||||
mova m1, [v2q + orderq + mmsize]
|
||||
mova m2, [v3q + orderq]
|
||||
mova m3, [v3q + orderq + mmsize]
|
||||
%endif
|
||||
; t0/t1 name the two v1 vectors: memory operands by default, or copies in
; m8/m9 when ARCH_X86_64 is set
%define t0 [v1q + orderq]
|
||||
%define t1 [v1q + orderq + mmsize]
|
||||
%if ARCH_X86_64
|
||||
mova m8, t0
|
||||
mova m9, t1
|
||||
%define t0 m8
|
||||
%define t1 m9
|
||||
%endif
|
||||
pmaddwd m0, t0                 ; v2 * v1, adjacent products summed to dwords
|
||||
pmaddwd m1, t1
|
||||
pmullw m2, m7                  ; v3 * mul, low 16 bits
|
||||
pmullw m3, m7
|
||||
paddw m2, t0                   ; new v1 = v3 * mul + old v1
|
||||
paddw m3, t1
|
||||
paddd m6, m0
|
||||
paddd m6, m1
|
||||
mova [v1q + orderq], m2
|
||||
mova [v1q + orderq + mmsize], m3
|
||||
jg .loop%1                     ; loop while bytes remain
|
||||
; variants with a non-zero offset jump to the shared epilogue at .end
%if %1
|
||||
jmp .end
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; int32_t ff_scalarproduct_and_madd_int16(int16_t *v1, const int16_t *v2,
|
||||
;                                         const int16_t *v3, int order, int mul)
; SSSE3 version: v2 and v3 are rounded down to 16-byte boundaries and read
; with aligned loads only; the byte misalignment of v2 selects one of eight
; SCALARPRODUCT_LOOP variants that realign the data with palignr.
; NOTE(review): the offset is computed from v2 only and applied to v3 as
; well, so v3 is assumed to have the same misalignment as v2 - confirm
; against callers.
|
||||
INIT_XMM ssse3
|
||||
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
|
||||
shl orderq, 1                  ; element count -> byte count
|
||||
movd m7, mulm
|
||||
pshuflw m7, m7, 0              ; broadcast the low 16 bits of mul to all words
|
||||
punpcklqdq m7, m7
|
||||
pxor m6, m6                    ; m6 = 32-bit partial sums
|
||||
mov r4d, v2d
|
||||
and r4d, 15                    ; r4d = byte misalignment of v2
|
||||
and v2q, ~15                   ; round v2 and v3 down to 16 bytes
|
||||
and v3q, ~15
|
||||
mova m4, [v2q + orderq]        ; preload the aligned vectors at the end of v2/v3
|
||||
mova m5, [v3q + orderq]
|
||||
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
|
||||
cmp r4d, 0
|
||||
je .loop0
|
||||
cmp r4d, 2
|
||||
je .loop2
|
||||
cmp r4d, 4
|
||||
je .loop4
|
||||
cmp r4d, 6
|
||||
je .loop6
|
||||
cmp r4d, 8
|
||||
je .loop8
|
||||
cmp r4d, 10
|
||||
je .loop10
|
||||
cmp r4d, 12
|
||||
je .loop12
|
||||
; any other value falls through into the offset-14 variant
SCALARPRODUCT_LOOP 14
|
||||
SCALARPRODUCT_LOOP 12
|
||||
SCALARPRODUCT_LOOP 10
|
||||
SCALARPRODUCT_LOOP 8
|
||||
SCALARPRODUCT_LOOP 6
|
||||
SCALARPRODUCT_LOOP 4
|
||||
SCALARPRODUCT_LOOP 2
|
||||
SCALARPRODUCT_LOOP 0
|
||||
.end:
|
||||
; horizontal add of the four dwords in m6
movhlps m0, m6
|
||||
paddd m6, m0
|
||||
pshuflw m0, m6, 0x4e
|
||||
paddd m6, m0
|
||||
movd eax, m6                   ; return value
|
||||
RET
|
47
libavcodec/x86/apedsp_init.c
Normal file
47
libavcodec/x86/apedsp_init.c
Normal file
@ -0,0 +1,47 @@
|
||||
/*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavcodec/apedsp.h"
|
||||
|
||||
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
|
||||
const int16_t *v3,
|
||||
int order, int mul);
|
||||
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
|
||||
const int16_t *v3,
|
||||
int order, int mul);
|
||||
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
|
||||
const int16_t *v3,
|
||||
int order, int mul);
|
||||
|
||||
/**
 * Install the x86-optimised APE DSP routines into c.
 *
 * The checks run from the oldest to the newest instruction set and each
 * match overwrites the previous choice, so the last matching version wins.
 */
av_cold void ff_apedsp_init_x86(APEDSPContext *c)
{
    const int flags = av_get_cpu_flags();
    /* The SSSE3 version is skipped when SSE4.2 or 3DNow! is reported
     * (the original code tags this condition "cachesplit"). */
    const int ssse3_excluded = flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW);

    if (EXTERNAL_MMXEXT(flags)) {
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
    }

    if (EXTERNAL_SSE2(flags)) {
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    }

    if (EXTERNAL_SSSE3(flags) && !ssse3_excluded) {
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    }
}
|
@ -53,52 +53,6 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
|
||||
paddd m2, m0
|
||||
movd eax, m2
|
||||
RET
|
||||
|
||||
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
|
||||
; int order, int mul)
|
||||
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
|
||||
shl orderq, 1
|
||||
movd m7, mulm
|
||||
%if mmsize == 16
|
||||
pshuflw m7, m7, 0
|
||||
punpcklqdq m7, m7
|
||||
%else
|
||||
pshufw m7, m7, 0
|
||||
%endif
|
||||
pxor m6, m6
|
||||
add v1q, orderq
|
||||
add v2q, orderq
|
||||
add v3q, orderq
|
||||
neg orderq
|
||||
.loop:
|
||||
movu m0, [v2q + orderq]
|
||||
movu m1, [v2q + orderq + mmsize]
|
||||
mova m4, [v1q + orderq]
|
||||
mova m5, [v1q + orderq + mmsize]
|
||||
movu m2, [v3q + orderq]
|
||||
movu m3, [v3q + orderq + mmsize]
|
||||
pmaddwd m0, m4
|
||||
pmaddwd m1, m5
|
||||
pmullw m2, m7
|
||||
pmullw m3, m7
|
||||
paddd m6, m0
|
||||
paddd m6, m1
|
||||
paddw m2, m4
|
||||
paddw m3, m5
|
||||
mova [v1q + orderq], m2
|
||||
mova [v1q + orderq + mmsize], m3
|
||||
add orderq, mmsize*2
|
||||
jl .loop
|
||||
%if mmsize == 16
|
||||
movhlps m0, m6
|
||||
paddd m6, m0
|
||||
pshuflw m0, m6, 0x4e
|
||||
%else
|
||||
pshufw m0, m6, 0x4e
|
||||
%endif
|
||||
paddd m6, m0
|
||||
movd eax, m6
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
@ -106,97 +60,6 @@ SCALARPRODUCT
|
||||
INIT_XMM sse2
|
||||
SCALARPRODUCT
|
||||
|
||||
%macro SCALARPRODUCT_LOOP 1
|
||||
align 16
|
||||
.loop%1:
|
||||
sub orderq, mmsize*2
|
||||
%if %1
|
||||
mova m1, m4
|
||||
mova m4, [v2q + orderq]
|
||||
mova m0, [v2q + orderq + mmsize]
|
||||
palignr m1, m0, %1
|
||||
palignr m0, m4, %1
|
||||
mova m3, m5
|
||||
mova m5, [v3q + orderq]
|
||||
mova m2, [v3q + orderq + mmsize]
|
||||
palignr m3, m2, %1
|
||||
palignr m2, m5, %1
|
||||
%else
|
||||
mova m0, [v2q + orderq]
|
||||
mova m1, [v2q + orderq + mmsize]
|
||||
mova m2, [v3q + orderq]
|
||||
mova m3, [v3q + orderq + mmsize]
|
||||
%endif
|
||||
%define t0 [v1q + orderq]
|
||||
%define t1 [v1q + orderq + mmsize]
|
||||
%if ARCH_X86_64
|
||||
mova m8, t0
|
||||
mova m9, t1
|
||||
%define t0 m8
|
||||
%define t1 m9
|
||||
%endif
|
||||
pmaddwd m0, t0
|
||||
pmaddwd m1, t1
|
||||
pmullw m2, m7
|
||||
pmullw m3, m7
|
||||
paddw m2, t0
|
||||
paddw m3, t1
|
||||
paddd m6, m0
|
||||
paddd m6, m1
|
||||
mova [v1q + orderq], m2
|
||||
mova [v1q + orderq + mmsize], m3
|
||||
jg .loop%1
|
||||
%if %1
|
||||
jmp .end
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
|
||||
; int order, int mul)
|
||||
INIT_XMM ssse3
|
||||
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
|
||||
shl orderq, 1
|
||||
movd m7, mulm
|
||||
pshuflw m7, m7, 0
|
||||
punpcklqdq m7, m7
|
||||
pxor m6, m6
|
||||
mov r4d, v2d
|
||||
and r4d, 15
|
||||
and v2q, ~15
|
||||
and v3q, ~15
|
||||
mova m4, [v2q + orderq]
|
||||
mova m5, [v3q + orderq]
|
||||
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
|
||||
cmp r4d, 0
|
||||
je .loop0
|
||||
cmp r4d, 2
|
||||
je .loop2
|
||||
cmp r4d, 4
|
||||
je .loop4
|
||||
cmp r4d, 6
|
||||
je .loop6
|
||||
cmp r4d, 8
|
||||
je .loop8
|
||||
cmp r4d, 10
|
||||
je .loop10
|
||||
cmp r4d, 12
|
||||
je .loop12
|
||||
SCALARPRODUCT_LOOP 14
|
||||
SCALARPRODUCT_LOOP 12
|
||||
SCALARPRODUCT_LOOP 10
|
||||
SCALARPRODUCT_LOOP 8
|
||||
SCALARPRODUCT_LOOP 6
|
||||
SCALARPRODUCT_LOOP 4
|
||||
SCALARPRODUCT_LOOP 2
|
||||
SCALARPRODUCT_LOOP 0
|
||||
.end:
|
||||
movhlps m0, m6
|
||||
paddd m6, m0
|
||||
pshuflw m0, m6, 0x4e
|
||||
paddd m6, m0
|
||||
movd eax, m6
|
||||
RET
|
||||
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
|
||||
|
@ -76,15 +76,6 @@ int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
|
||||
int order);
|
||||
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
|
||||
int order);
|
||||
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
|
||||
const int16_t *v3,
|
||||
int order, int mul);
|
||||
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
|
||||
const int16_t *v3,
|
||||
int order, int mul);
|
||||
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
|
||||
const int16_t *v3,
|
||||
int order, int mul);
|
||||
|
||||
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
|
||||
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
|
||||
@ -568,7 +559,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
|
||||
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
|
||||
|
||||
c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
|
||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
|
||||
#endif /* HAVE_MMXEXT_EXTERNAL */
|
||||
}
|
||||
|
||||
@ -607,7 +597,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
|
||||
|
||||
#if HAVE_SSE2_EXTERNAL
|
||||
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
|
||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
|
||||
if (cpu_flags & AV_CPU_FLAG_ATOM) {
|
||||
c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
|
||||
} else {
|
||||
@ -621,8 +610,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
|
||||
int cpu_flags, unsigned high_bit_depth)
|
||||
{
|
||||
#if HAVE_SSSE3_EXTERNAL
|
||||
if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
|
||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
|
||||
c->bswap_buf = ff_bswap32_buf_ssse3;
|
||||
#endif /* HAVE_SSSE3_EXTERNAL */
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user