mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
Merge commit '054013a0fc6f2b52c60cee3e051be8cc7f82cef3'
* commit '054013a0fc6f2b52c60cee3e051be8cc7f82cef3': dsputil: Move APE-specific bits into apedsp Conflicts: libavcodec/arm/int_neon.S libavcodec/x86/dsputil.asm Merged-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
commit
40f3a87c10
@ -25,6 +25,7 @@
|
|||||||
#include "libavutil/avassert.h"
|
#include "libavutil/avassert.h"
|
||||||
#include "libavutil/channel_layout.h"
|
#include "libavutil/channel_layout.h"
|
||||||
#include "libavutil/opt.h"
|
#include "libavutil/opt.h"
|
||||||
|
#include "apedsp.h"
|
||||||
#include "avcodec.h"
|
#include "avcodec.h"
|
||||||
#include "dsputil.h"
|
#include "dsputil.h"
|
||||||
#include "bytestream.h"
|
#include "bytestream.h"
|
||||||
@ -136,6 +137,7 @@ typedef struct APEContext {
|
|||||||
AVClass *class; ///< class for AVOptions
|
AVClass *class; ///< class for AVOptions
|
||||||
AVCodecContext *avctx;
|
AVCodecContext *avctx;
|
||||||
DSPContext dsp;
|
DSPContext dsp;
|
||||||
|
APEDSPContext adsp;
|
||||||
int channels;
|
int channels;
|
||||||
int samples; ///< samples left to decode in current frame
|
int samples; ///< samples left to decode in current frame
|
||||||
int bps;
|
int bps;
|
||||||
@ -195,8 +197,6 @@ static void predictor_decode_stereo_3930(APEContext *ctx, int count);
|
|||||||
static void predictor_decode_mono_3950(APEContext *ctx, int count);
|
static void predictor_decode_mono_3950(APEContext *ctx, int count);
|
||||||
static void predictor_decode_stereo_3950(APEContext *ctx, int count);
|
static void predictor_decode_stereo_3950(APEContext *ctx, int count);
|
||||||
|
|
||||||
// TODO: dsputilize
|
|
||||||
|
|
||||||
static av_cold int ape_decode_close(AVCodecContext *avctx)
|
static av_cold int ape_decode_close(AVCodecContext *avctx)
|
||||||
{
|
{
|
||||||
APEContext *s = avctx->priv_data;
|
APEContext *s = avctx->priv_data;
|
||||||
@ -212,6 +212,19 @@ static av_cold int ape_decode_close(AVCodecContext *avctx)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Portable C implementation of APEDSPContext.scalarproduct_and_madd_int16.
 *
 * Computes the scalar product of v1 and v2 and, in the same pass, updates
 * v1 in place with v1[i] += mul * v3[i]. Each product uses the value of
 * v1[i] from before its update.
 *
 * @param v1    vector read for the product and then updated in place
 * @param v2    second operand of the scalar product
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements to process; values <= 0 process nothing
 * @param mul   factor applied to every element of v3
 * @return the scalar product of the original v1 and v2, reduced modulo 2^32
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    /* Accumulate in unsigned arithmetic: a sum of 16x16-bit products can
     * exceed INT32_MAX, and signed overflow is undefined behaviour. Unsigned
     * arithmetic wraps modulo 2^32 instead. */
    uint32_t res = 0;
    int i;

    /* "i < order" (instead of "while (order--)") keeps a negative order
     * from running off the end of the vectors. */
    for (i = 0; i < order; i++) {
        /* A single int16 * int16 product always fits in an int. */
        res += (uint32_t)(v1[i] * v2[i]);
        /* Do the multiply-add in unsigned arithmetic as well; only the low
         * 16 bits are kept by the narrowing store. */
        v1[i] = (int16_t)(v1[i] + (unsigned)mul * (unsigned)v3[i]);
    }
    return (int32_t)res;
}
|
||||||
|
|
||||||
static av_cold int ape_decode_init(AVCodecContext *avctx)
|
static av_cold int ape_decode_init(AVCodecContext *avctx)
|
||||||
{
|
{
|
||||||
APEContext *s = avctx->priv_data;
|
APEContext *s = avctx->priv_data;
|
||||||
@ -293,6 +306,15 @@ static av_cold int ape_decode_init(AVCodecContext *avctx)
|
|||||||
s->predictor_decode_stereo = predictor_decode_stereo_3950;
|
s->predictor_decode_stereo = predictor_decode_stereo_3950;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s->adsp.scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
|
||||||
|
|
||||||
|
if (ARCH_ARM)
|
||||||
|
ff_apedsp_init_arm(&s->adsp);
|
||||||
|
if (ARCH_PPC)
|
||||||
|
ff_apedsp_init_ppc(&s->adsp);
|
||||||
|
if (ARCH_X86)
|
||||||
|
ff_apedsp_init_x86(&s->adsp);
|
||||||
|
|
||||||
ff_dsputil_init(&s->dsp, avctx);
|
ff_dsputil_init(&s->dsp, avctx);
|
||||||
avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
|
avctx->channel_layout = (avctx->channels==2) ? AV_CH_LAYOUT_STEREO : AV_CH_LAYOUT_MONO;
|
||||||
|
|
||||||
@ -1275,9 +1297,10 @@ static void do_apply_filter(APEContext *ctx, int version, APEFilter *f,
|
|||||||
|
|
||||||
while (count--) {
|
while (count--) {
|
||||||
/* round fixedpoint scalar product */
|
/* round fixedpoint scalar product */
|
||||||
res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order,
|
res = ctx->adsp.scalarproduct_and_madd_int16(f->coeffs,
|
||||||
f->adaptcoeffs - order,
|
f->delay - order,
|
||||||
order, APESIGN(*data));
|
f->adaptcoeffs - order,
|
||||||
|
order, APESIGN(*data));
|
||||||
res = (res + (1 << (fracbits - 1))) >> fracbits;
|
res = (res + (1 << (fracbits - 1))) >> fracbits;
|
||||||
res += *data;
|
res += *data;
|
||||||
*data++ = res;
|
*data++ = res;
|
||||||
|
44
libavcodec/apedsp.h
Normal file
44
libavcodec/apedsp.h
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
/*
|
||||||
|
* Monkey's Audio lossless audio decoder
|
||||||
|
* Copyright (c) 2007 Benjamin Zores <ben@geexbox.org>
|
||||||
|
* based upon libdemac from Dave Chapman.
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef AVCODEC_APEDSP_H
|
||||||
|
#define AVCODEC_APEDSP_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
typedef struct APEDSPContext {
|
||||||
|
/**
 * Calculate the scalar product of v1 and v2, and update v1 in place
 * with v1[i] += v3[i] * mul.
 *
 * @param v1  vector used for the product and then updated; must be
 *            16-byte aligned
 * @param v2  second operand of the scalar product
 * @param v3  vector that is multiplied by mul and added to v1
 * @param len length of vectors, should be multiple of 16
 * @param mul factor applied to each element of v3
 * @return the scalar product of v1 and v2
 */
|
||||||
|
int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
|
||||||
|
const int16_t *v2,
|
||||||
|
const int16_t *v3,
|
||||||
|
int len, int mul);
|
||||||
|
} APEDSPContext;
|
||||||
|
|
||||||
|
void ff_apedsp_init_arm(APEDSPContext *c);
|
||||||
|
void ff_apedsp_init_ppc(APEDSPContext *c);
|
||||||
|
void ff_apedsp_init_x86(APEDSPContext *c);
|
||||||
|
|
||||||
|
#endif /* AVCODEC_APEDSP_H */
|
@ -24,6 +24,7 @@ OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o
|
|||||||
|
|
||||||
OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
|
OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
|
||||||
arm/sbrdsp_init_arm.o
|
arm/sbrdsp_init_arm.o
|
||||||
|
OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_init_arm.o
|
||||||
OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o
|
OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o
|
||||||
OBJS-$(CONFIG_FLAC_DECODER) += arm/flacdsp_init_arm.o \
|
OBJS-$(CONFIG_FLAC_DECODER) += arm/flacdsp_init_arm.o \
|
||||||
arm/flacdsp_arm.o
|
arm/flacdsp_arm.o
|
||||||
@ -100,6 +101,7 @@ NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o
|
|||||||
|
|
||||||
NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
|
NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
|
||||||
arm/sbrdsp_neon.o
|
arm/sbrdsp_neon.o
|
||||||
|
NEON-OBJS-$(CONFIG_APE_DECODER) += arm/apedsp_neon.o
|
||||||
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
|
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
|
||||||
arm/synth_filter_neon.o
|
arm/synth_filter_neon.o
|
||||||
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
|
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
|
||||||
|
38
libavcodec/arm/apedsp_init_arm.c
Normal file
38
libavcodec/arm/apedsp_init_arm.c
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include "libavutil/attributes.h"
|
||||||
|
#include "libavutil/cpu.h"
|
||||||
|
#include "libavutil/arm/cpu.h"
|
||||||
|
#include "libavcodec/apedsp.h"
|
||||||
|
|
||||||
|
int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
|
||||||
|
const int16_t *v3, int len, int mul);
|
||||||
|
|
||||||
|
/**
 * Install the ARM-optimised APE DSP routines into c.
 *
 * The NEON routine replaces whatever is already set in the context only
 * when the running CPU reports NEON support; otherwise c is left untouched.
 */
av_cold void ff_apedsp_init_arm(APEDSPContext *c)
{
    const int flags = av_get_cpu_flags();

    /* Nothing to install without NEON. */
    if (!have_neon(flags))
        return;

    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
}
|
62
libavcodec/arm/apedsp_neon.S
Normal file
62
libavcodec/arm/apedsp_neon.S
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
/*
|
||||||
|
* ARM NEON optimised integer operations
|
||||||
|
* Copyright (c) 2009 Kostya Shishkov
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "libavutil/arm/asm.S"
|
||||||
|
|
||||||
|
@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
|
||||||
|
@ Returns the scalar product of v0 (r0) and v1 (r1) while updating v0 in
@ place with v0[i] += v2[i] * mul (v2 in r2). The element count is in r3 and
@ 16 elements are handled per loop iteration.
function ff_scalarproduct_and_madd_int16_neon, export=1
|
||||||
|
@ fifth argument (mul) is read from the stack and replicated into every
@ 16-bit lane of q14 (d28/d29)
vld1.16 {d28[],d29[]}, [sp]
|
||||||
|
@ q0-q3: four independent vectors of 32-bit partial sums
vmov.i16 q0, #0
|
||||||
|
vmov.i16 q1, #0
|
||||||
|
vmov.i16 q2, #0
|
||||||
|
vmov.i16 q3, #0
|
||||||
|
@ r12 is the write pointer into v0; r0 stays the read pointer
mov r12, r0
|
||||||
|
|
||||||
|
@ load 16 elements each of v0 (q8, q11), v1 (q9, q12) and v2 (q10, q13);
@ only the v0 accesses carry the 128-bit alignment qualifier
1: vld1.16 {d16-d17}, [r0,:128]!
|
||||||
|
vld1.16 {d18-d19}, [r1]!
|
||||||
|
vld1.16 {d20-d21}, [r2]!
|
||||||
|
vld1.16 {d22-d23}, [r0,:128]!
|
||||||
|
vld1.16 {d24-d25}, [r1]!
|
||||||
|
vld1.16 {d26-d27}, [r2]!
|
||||||
|
@ v2 * mul, kept as 16-bit lanes
vmul.s16 q10, q10, q14
|
||||||
|
vmul.s16 q13, q13, q14
|
||||||
|
@ widening multiply-accumulate of the not yet updated v0 with v1
vmlal.s16 q0, d16, d18
|
||||||
|
vmlal.s16 q1, d17, d19
|
||||||
|
@ new v0 = old v0 + v2 * mul
vadd.s16 q10, q8, q10
|
||||||
|
vadd.s16 q13, q11, q13
|
||||||
|
vmlal.s16 q2, d22, d24
|
||||||
|
vmlal.s16 q3, d23, d25
|
||||||
|
vst1.16 {q10}, [r12,:128]!
|
||||||
|
@ 16 elements consumed; the flags set here drive the bgt below
subs r3, r3, #16
|
||||||
|
vst1.16 {q13}, [r12,:128]!
|
||||||
|
bgt 1b
|
||||||
|
|
||||||
|
@ horizontal reduction of the sixteen 32-bit partial sums in q0-q3
vpadd.s32 d16, d0, d1
|
||||||
|
vpadd.s32 d17, d2, d3
|
||||||
|
vpadd.s32 d18, d4, d5
|
||||||
|
vpadd.s32 d19, d6, d7
|
||||||
|
vpadd.s32 d0, d16, d17
|
||||||
|
vpadd.s32 d1, d18, d19
|
||||||
|
vpadd.s32 d2, d0, d1
|
||||||
|
@ add the last two lanes; the low 32 bits of the sum are returned in r0
vpaddl.s32 d3, d2
|
||||||
|
vmov.32 r0, d3[0]
|
||||||
|
bx lr
|
||||||
|
endfunc
|
@ -44,9 +44,6 @@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
|
|||||||
|
|
||||||
int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
|
int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
|
||||||
|
|
||||||
int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
|
|
||||||
const int16_t *v3, int len, int mul);
|
|
||||||
|
|
||||||
av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
|
av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
|
||||||
unsigned high_bit_depth)
|
unsigned high_bit_depth)
|
||||||
{
|
{
|
||||||
@ -73,6 +70,4 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx,
|
|||||||
c->vector_clip_int32 = ff_vector_clip_int32_neon;
|
c->vector_clip_int32 = ff_vector_clip_int32_neon;
|
||||||
|
|
||||||
c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
|
c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
|
||||||
|
|
||||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
|
|
||||||
}
|
}
|
||||||
|
@ -49,42 +49,3 @@ function ff_scalarproduct_int16_neon, export=1
|
|||||||
bx lr
|
bx lr
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
|
|
||||||
function ff_scalarproduct_and_madd_int16_neon, export=1
|
|
||||||
vld1.16 {d28[],d29[]}, [sp]
|
|
||||||
vmov.i16 q0, #0
|
|
||||||
vmov.i16 q1, #0
|
|
||||||
vmov.i16 q2, #0
|
|
||||||
vmov.i16 q3, #0
|
|
||||||
mov r12, r0
|
|
||||||
|
|
||||||
1: vld1.16 {d16-d17}, [r0,:128]!
|
|
||||||
vld1.16 {d18-d19}, [r1]!
|
|
||||||
vld1.16 {d20-d21}, [r2]!
|
|
||||||
vld1.16 {d22-d23}, [r0,:128]!
|
|
||||||
vld1.16 {d24-d25}, [r1]!
|
|
||||||
vld1.16 {d26-d27}, [r2]!
|
|
||||||
vmul.s16 q10, q10, q14
|
|
||||||
vmul.s16 q13, q13, q14
|
|
||||||
vmlal.s16 q0, d16, d18
|
|
||||||
vmlal.s16 q1, d17, d19
|
|
||||||
vadd.s16 q10, q8, q10
|
|
||||||
vadd.s16 q13, q11, q13
|
|
||||||
vmlal.s16 q2, d22, d24
|
|
||||||
vmlal.s16 q3, d23, d25
|
|
||||||
vst1.16 {q10}, [r12,:128]!
|
|
||||||
subs r3, r3, #16
|
|
||||||
vst1.16 {q13}, [r12,:128]!
|
|
||||||
bgt 1b
|
|
||||||
|
|
||||||
vpadd.s32 d16, d0, d1
|
|
||||||
vpadd.s32 d17, d2, d3
|
|
||||||
vpadd.s32 d18, d4, d5
|
|
||||||
vpadd.s32 d19, d6, d7
|
|
||||||
vpadd.s32 d0, d16, d17
|
|
||||||
vpadd.s32 d1, d18, d19
|
|
||||||
vpadd.s32 d2, d0, d1
|
|
||||||
vpaddl.s32 d3, d2
|
|
||||||
vmov.32 r0, d3[0]
|
|
||||||
bx lr
|
|
||||||
endfunc
|
|
||||||
|
@ -2186,19 +2186,6 @@ static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
|
|
||||||
const int16_t *v3,
|
|
||||||
int order, int mul)
|
|
||||||
{
|
|
||||||
int res = 0;
|
|
||||||
|
|
||||||
while (order--) {
|
|
||||||
res += *v1 * *v2++;
|
|
||||||
*v1++ += mul * *v3++;
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
|
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
|
||||||
int32_t max, unsigned int len)
|
int32_t max, unsigned int len)
|
||||||
{
|
{
|
||||||
@ -2490,8 +2477,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
|
|||||||
c->try_8x8basis = try_8x8basis_c;
|
c->try_8x8basis = try_8x8basis_c;
|
||||||
c->add_8x8basis = add_8x8basis_c;
|
c->add_8x8basis = add_8x8basis_c;
|
||||||
|
|
||||||
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
|
|
||||||
|
|
||||||
c->scalarproduct_int16 = scalarproduct_int16_c;
|
c->scalarproduct_int16 = scalarproduct_int16_c;
|
||||||
c->vector_clip_int32 = vector_clip_int32_c;
|
c->vector_clip_int32 = vector_clip_int32_c;
|
||||||
c->vector_clipf = vector_clipf_c;
|
c->vector_clipf = vector_clipf_c;
|
||||||
|
@ -257,16 +257,6 @@ typedef struct DSPContext {
|
|||||||
*/
|
*/
|
||||||
int32_t (*scalarproduct_int16)(const int16_t *v1,
|
int32_t (*scalarproduct_int16)(const int16_t *v1,
|
||||||
const int16_t *v2 /* align 16 */, int len);
|
const int16_t *v2 /* align 16 */, int len);
|
||||||
/* ape functions */
|
|
||||||
/**
|
|
||||||
* Calculate scalar product of v1 and v2,
|
|
||||||
* and v1[i] += v3[i] * mul
|
|
||||||
* @param len length of vectors, should be multiple of 16
|
|
||||||
*/
|
|
||||||
int32_t (*scalarproduct_and_madd_int16)(int16_t *v1 /* align 16 */,
|
|
||||||
const int16_t *v2,
|
|
||||||
const int16_t *v3,
|
|
||||||
int len, int mul);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Clip each element in an array of int32_t to a given minimum and
|
* Clip each element in an array of int32_t to a given minimum and
|
||||||
|
@ -12,6 +12,7 @@ OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o
|
|||||||
OBJS-$(CONFIG_VIDEODSP) += ppc/videodsp_ppc.o
|
OBJS-$(CONFIG_VIDEODSP) += ppc/videodsp_ppc.o
|
||||||
OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o
|
OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o
|
||||||
|
|
||||||
|
OBJS-$(CONFIG_APE_DECODER) += ppc/apedsp_altivec.o
|
||||||
OBJS-$(CONFIG_SVQ1_ENCODER) += ppc/svq1enc_altivec.o
|
OBJS-$(CONFIG_SVQ1_ENCODER) += ppc/svq1enc_altivec.o
|
||||||
OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o
|
OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o
|
||||||
OBJS-$(CONFIG_VORBIS_DECODER) += ppc/vorbisdsp_altivec.o
|
OBJS-$(CONFIG_VORBIS_DECODER) += ppc/vorbisdsp_altivec.o
|
||||||
|
77
libavcodec/ppc/apedsp_altivec.c
Normal file
77
libavcodec/ppc/apedsp_altivec.c
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "config.h"
|
||||||
|
#if HAVE_ALTIVEC_H
|
||||||
|
#include <altivec.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "libavutil/attributes.h"
|
||||||
|
#include "libavutil/cpu.h"
|
||||||
|
#include "libavutil/ppc/types_altivec.h"
|
||||||
|
#include "libavcodec/apedsp.h"
|
||||||
|
|
||||||
|
#if HAVE_ALTIVEC
|
||||||
|
/**
 * AltiVec version of the scalarproduct_and_madd_int16 routine.
 *
 * Accumulates the products of v1 and v2 and stores v1[i] + v3[i] * mul
 * back into v1, 16 elements (two vectors) per loop iteration.
 *
 * v1 is accessed directly through a vector pointer, so it presumably has
 * to be 16-byte aligned (TODO confirm against callers); v2 and v3 are
 * read with vec_ld + vec_perm so they may be misaligned.
 *
 * @param order number of elements; it is shifted right by 4 to get the
 *              iteration count and the do/while body runs before the count
 *              is tested, so values below 16 are not handled
 * @param mul   factor applied to every element of v3
 * @return the sum produced by vec_sums over the four partial sums
 */
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
|
||||||
|
const int16_t *v2,
|
||||||
|
const int16_t *v3,
|
||||||
|
int order, int mul)
|
||||||
|
{
|
||||||
|
LOAD_ZERO;
|
||||||
|
vec_s16 *pv1 = (vec_s16 *) v1;
|
||||||
|
/* mul replicated into all eight 16-bit lanes */
register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
|
||||||
|
register vec_s16 t0, t1, i0, i1, i4;
|
||||||
|
/* first aligned blocks of v2 and v3; refreshed at the end of each pass */
register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
|
||||||
|
register vec_s32 res = zero_s32v;
|
||||||
|
/* NOTE(review): the permute mask is derived from v2 only but is also
 * applied to the v3 loads below, which looks like it assumes v2 and v3
 * share the same misalignment -- confirm against callers. */
register vec_u8 align = vec_lvsl(0, v2);
|
||||||
|
int32_t ires;
|
||||||
|
|
||||||
|
/* 16 elements are consumed per iteration */
order >>= 4;
|
||||||
|
do {
|
||||||
|
/* rebuild two unaligned v2 vectors from three aligned loads; the
 * load at offset 32 is carried over in i2 for the next iteration */
i1 = vec_ld(16, v2);
|
||||||
|
t0 = vec_perm(i2, i1, align);
|
||||||
|
i2 = vec_ld(32, v2);
|
||||||
|
t1 = vec_perm(i1, i2, align);
|
||||||
|
i0 = pv1[0];
|
||||||
|
i1 = pv1[1];
|
||||||
|
/* accumulate v2 * v1 using the not yet updated v1 values */
res = vec_msum(t0, i0, res);
|
||||||
|
res = vec_msum(t1, i1, res);
|
||||||
|
/* same realignment scheme for v3, carried over in i3 */
i4 = vec_ld(16, v3);
|
||||||
|
t0 = vec_perm(i3, i4, align);
|
||||||
|
i3 = vec_ld(32, v3);
|
||||||
|
t1 = vec_perm(i4, i3, align);
|
||||||
|
/* v1 = v3 * mul + v1 */
pv1[0] = vec_mladd(t0, muls, i0);
|
||||||
|
pv1[1] = vec_mladd(t1, muls, i1);
|
||||||
|
pv1 += 2;
|
||||||
|
v2 += 16;
|
||||||
|
v3 += 16;
|
||||||
|
} while (--order);
|
||||||
|
/* fold the four partial sums, broadcast the total and store one lane */
res = vec_splat(vec_sums(res, zero_s32v), 3);
|
||||||
|
vec_ste(res, 0, &ires);
|
||||||
|
|
||||||
|
return ires;
|
||||||
|
}
|
||||||
|
#endif /* HAVE_ALTIVEC */
|
||||||
|
|
||||||
|
/**
 * Install the PowerPC-optimised APE DSP routines into c.
 *
 * The AltiVec routine is only installed when the code was built with
 * AltiVec support and the running CPU reports AltiVec; otherwise c is left
 * untouched and keeps the routine that was set before the call.
 */
av_cold void ff_apedsp_init_ppc(APEDSPContext *c)
{
#if HAVE_ALTIVEC
    /* ape_decode_init() calls this whenever ARCH_PPC is set, so the runtime
     * check has to happen here: being built with AltiVec support does not
     * mean the CPU we are running on implements it. */
    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
        return;

    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
#endif /* HAVE_ALTIVEC */
}
|
@ -56,49 +56,7 @@ static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
|
|||||||
return ires;
|
return ires;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
|
|
||||||
const int16_t *v2,
|
|
||||||
const int16_t *v3,
|
|
||||||
int order, int mul)
|
|
||||||
{
|
|
||||||
LOAD_ZERO;
|
|
||||||
vec_s16 *pv1 = (vec_s16 *) v1;
|
|
||||||
register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
|
|
||||||
register vec_s16 t0, t1, i0, i1, i4;
|
|
||||||
register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
|
|
||||||
register vec_s32 res = zero_s32v;
|
|
||||||
register vec_u8 align = vec_lvsl(0, v2);
|
|
||||||
int32_t ires;
|
|
||||||
|
|
||||||
order >>= 4;
|
|
||||||
do {
|
|
||||||
i1 = vec_ld(16, v2);
|
|
||||||
t0 = vec_perm(i2, i1, align);
|
|
||||||
i2 = vec_ld(32, v2);
|
|
||||||
t1 = vec_perm(i1, i2, align);
|
|
||||||
i0 = pv1[0];
|
|
||||||
i1 = pv1[1];
|
|
||||||
res = vec_msum(t0, i0, res);
|
|
||||||
res = vec_msum(t1, i1, res);
|
|
||||||
i4 = vec_ld(16, v3);
|
|
||||||
t0 = vec_perm(i3, i4, align);
|
|
||||||
i3 = vec_ld(32, v3);
|
|
||||||
t1 = vec_perm(i4, i3, align);
|
|
||||||
pv1[0] = vec_mladd(t0, muls, i0);
|
|
||||||
pv1[1] = vec_mladd(t1, muls, i1);
|
|
||||||
pv1 += 2;
|
|
||||||
v2 += 16;
|
|
||||||
v3 += 16;
|
|
||||||
} while (--order);
|
|
||||||
res = vec_splat(vec_sums(res, zero_s32v), 3);
|
|
||||||
vec_ste(res, 0, &ires);
|
|
||||||
|
|
||||||
return ires;
|
|
||||||
}
|
|
||||||
|
|
||||||
av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
|
av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
|
||||||
{
|
{
|
||||||
c->scalarproduct_int16 = scalarproduct_int16_altivec;
|
c->scalarproduct_int16 = scalarproduct_int16_altivec;
|
||||||
|
|
||||||
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
|
|
||||||
}
|
}
|
||||||
|
@ -29,6 +29,7 @@ OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
|
|||||||
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
|
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
|
||||||
|
|
||||||
OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
|
OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
|
||||||
|
OBJS-$(CONFIG_APE_DECODER) += x86/apedsp_init.o
|
||||||
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
|
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
|
||||||
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o
|
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o
|
||||||
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
|
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
|
||||||
@ -103,6 +104,7 @@ YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
|
|||||||
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
|
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
|
||||||
|
|
||||||
YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
|
YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
|
||||||
|
YASM-OBJS-$(CONFIG_APE_DECODER) += x86/apedsp.o
|
||||||
YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
|
YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
|
||||||
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
|
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
|
||||||
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
|
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
|
||||||
|
157
libavcodec/x86/apedsp.asm
Normal file
157
libavcodec/x86/apedsp.asm
Normal file
@ -0,0 +1,157 @@
|
|||||||
|
;******************************************************************************
|
||||||
|
;* Copyright (c) 2008 Loren Merritt
|
||||||
|
;*
|
||||||
|
;* This file is part of FFmpeg.
|
||||||
|
;*
|
||||||
|
;* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
;* modify it under the terms of the GNU Lesser General Public
|
||||||
|
;* License as published by the Free Software Foundation; either
|
||||||
|
;* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
;*
|
||||||
|
;* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
;* Lesser General Public License for more details.
|
||||||
|
;*
|
||||||
|
;* You should have received a copy of the GNU Lesser General Public
|
||||||
|
;* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
;******************************************************************************
|
||||||
|
|
||||||
|
%include "libavutil/x86/x86util.asm"
|
||||||
|
|
||||||
|
SECTION_TEXT
|
||||||
|
|
||||||
|
; Emits scalarproduct_and_madd_int16 for the instruction set selected by the
; preceding INIT_MMX / INIT_XMM. The function returns the scalar product of
; v1 and v2 and updates v1 in place with v1[i] += v3[i] * mul.
%macro SCALARPRODUCT 0
|
||||||
|
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
|
||||||
|
; int order, int mul)
|
||||||
|
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
|
||||||
|
; element count -> byte count (2 bytes per int16)
shl orderq, 1
|
||||||
|
; replicate mul into every 16-bit lane of m7
movd m7, mulm
|
||||||
|
%if mmsize == 16
|
||||||
|
pshuflw m7, m7, 0
|
||||||
|
punpcklqdq m7, m7
|
||||||
|
%else
|
||||||
|
pshufw m7, m7, 0
|
||||||
|
%endif
|
||||||
|
; m6 holds the 32-bit partial sums
pxor m6, m6
|
||||||
|
; point all three vectors at their end and count a negative byte offset
; up towards zero
add v1q, orderq
|
||||||
|
add v2q, orderq
|
||||||
|
add v3q, orderq
|
||||||
|
neg orderq
|
||||||
|
.loop:
|
||||||
|
; v2 and v3 use unaligned loads, v1 uses aligned loads and stores
movu m0, [v2q + orderq]
|
||||||
|
movu m1, [v2q + orderq + mmsize]
|
||||||
|
mova m4, [v1q + orderq]
|
||||||
|
mova m5, [v1q + orderq + mmsize]
|
||||||
|
movu m2, [v3q + orderq]
|
||||||
|
movu m3, [v3q + orderq + mmsize]
|
||||||
|
; v2 * v1 products, pairwise added into 32-bit lanes
pmaddwd m0, m4
|
||||||
|
pmaddwd m1, m5
|
||||||
|
; v3 * mul, low 16 bits of each product
pmullw m2, m7
|
||||||
|
pmullw m3, m7
|
||||||
|
paddd m6, m0
|
||||||
|
paddd m6, m1
|
||||||
|
; new v1 = old v1 + v3 * mul
paddw m2, m4
|
||||||
|
paddw m3, m5
|
||||||
|
mova [v1q + orderq], m2
|
||||||
|
mova [v1q + orderq + mmsize], m3
|
||||||
|
; two registers' worth of data consumed; loop while the offset is negative
add orderq, mmsize*2
|
||||||
|
jl .loop
|
||||||
|
; horizontal add of the partial sums, result returned in eax
HADDD m6, m0
|
||||||
|
movd eax, m6
|
||||||
|
RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
INIT_MMX mmxext
|
||||||
|
SCALARPRODUCT
|
||||||
|
INIT_XMM sse2
|
||||||
|
SCALARPRODUCT
|
||||||
|
|
||||||
|
; One loop body of the SSSE3 scalarproduct_and_madd_int16, specialised for a
; byte misalignment of %1 of the v2/v3 pointers. Expects orderq to hold the
; remaining byte count, m7 the replicated mul, m6 the running sum and, for a
; non-zero %1, m4/m5 the aligned v2/v3 blocks loaded at the current orderq.
; The data is walked from the end of the vectors towards their start.
%macro SCALARPRODUCT_LOOP 1
|
||||||
|
align 16
|
||||||
|
.loop%1:
|
||||||
|
; the flags set by this sub are what the jg at the bottom tests; the
; instructions in between are SIMD/mov operations that leave them alone
sub orderq, mmsize*2
|
||||||
|
%if %1
|
||||||
|
; misaligned case: do aligned loads and stitch the wanted vectors together
; with palignr, reusing the block loaded for the previous (higher) position
mova m1, m4
|
||||||
|
mova m4, [v2q + orderq]
|
||||||
|
mova m0, [v2q + orderq + mmsize]
|
||||||
|
palignr m1, m0, %1
|
||||||
|
palignr m0, m4, %1
|
||||||
|
mova m3, m5
|
||||||
|
mova m5, [v3q + orderq]
|
||||||
|
mova m2, [v3q + orderq + mmsize]
|
||||||
|
palignr m3, m2, %1
|
||||||
|
palignr m2, m5, %1
|
||||||
|
%else
|
||||||
|
; aligned case: plain aligned loads
mova m0, [v2q + orderq]
|
||||||
|
mova m1, [v2q + orderq + mmsize]
|
||||||
|
mova m2, [v3q + orderq]
|
||||||
|
mova m3, [v3q + orderq + mmsize]
|
||||||
|
%endif
|
||||||
|
; t0/t1 name the two v1 vectors: memory operands by default, registers
; m8/m9 when building for x86-64
%define t0 [v1q + orderq]
|
||||||
|
%define t1 [v1q + orderq + mmsize]
|
||||||
|
%if ARCH_X86_64
|
||||||
|
mova m8, t0
|
||||||
|
mova m9, t1
|
||||||
|
%define t0 m8
|
||||||
|
%define t1 m9
|
||||||
|
%endif
|
||||||
|
; v2 * v1 products, pairwise added into 32-bit lanes
pmaddwd m0, t0
|
||||||
|
pmaddwd m1, t1
|
||||||
|
; v3 * mul, low 16 bits of each product
pmullw m2, m7
|
||||||
|
pmullw m3, m7
|
||||||
|
; new v1 = v3 * mul + old v1
paddw m2, t0
|
||||||
|
paddw m3, t1
|
||||||
|
paddd m6, m0
|
||||||
|
paddd m6, m1
|
||||||
|
mova [v1q + orderq], m2
|
||||||
|
mova [v1q + orderq + mmsize], m3
|
||||||
|
jg .loop%1
|
||||||
|
%if %1
|
||||||
|
; every variant except %1 == 0 has to jump to the shared epilogue
jmp .end
|
||||||
|
%endif
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
|
||||||
|
; int order, int mul)
|
||||||
|
INIT_XMM ssse3
|
||||||
|
; SSSE3 version: v2 and v3 are only read with aligned loads; the code
; dispatches on the misalignment of v2 to one of eight SCALARPRODUCT_LOOP
; variants.
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
|
||||||
|
; element count -> byte count (2 bytes per int16)
shl orderq, 1
|
||||||
|
; replicate mul into every 16-bit lane of m7
movd m7, mulm
|
||||||
|
pshuflw m7, m7, 0
|
||||||
|
punpcklqdq m7, m7
|
||||||
|
; m6 holds the 32-bit partial sums
pxor m6, m6
|
||||||
|
; r4d = misalignment of v2 in bytes; both v2 and v3 are then rounded down
; to a 16-byte boundary.
; NOTE(review): only v2's misalignment is inspected, so this looks like it
; assumes v3 is misaligned by the same amount -- confirm against callers.
mov r4d, v2d
|
||||||
|
and r4d, 15
|
||||||
|
and v2q, ~15
|
||||||
|
and v3q, ~15
|
||||||
|
; prime m4/m5 with the aligned blocks at the end of v2/v3, as expected by
; the misaligned loop variants
mova m4, [v2q + orderq]
|
||||||
|
mova m5, [v3q + orderq]
|
||||||
|
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
|
||||||
|
cmp r4d, 0
|
||||||
|
je .loop0
|
||||||
|
cmp r4d, 2
|
||||||
|
je .loop2
|
||||||
|
cmp r4d, 4
|
||||||
|
je .loop4
|
||||||
|
cmp r4d, 6
|
||||||
|
je .loop6
|
||||||
|
cmp r4d, 8
|
||||||
|
je .loop8
|
||||||
|
cmp r4d, 10
|
||||||
|
je .loop10
|
||||||
|
cmp r4d, 12
|
||||||
|
je .loop12
|
||||||
|
; any other misalignment value falls through into the 14-byte variant
SCALARPRODUCT_LOOP 14
|
||||||
|
SCALARPRODUCT_LOOP 12
|
||||||
|
SCALARPRODUCT_LOOP 10
|
||||||
|
SCALARPRODUCT_LOOP 8
|
||||||
|
SCALARPRODUCT_LOOP 6
|
||||||
|
SCALARPRODUCT_LOOP 4
|
||||||
|
SCALARPRODUCT_LOOP 2
|
||||||
|
; the aligned variant comes last and falls straight through into .end
SCALARPRODUCT_LOOP 0
|
||||||
|
.end:
|
||||||
|
; horizontal add of the partial sums, result returned in eax
HADDD m6, m0
|
||||||
|
movd eax, m6
|
||||||
|
RET
|
47
libavcodec/x86/apedsp_init.c
Normal file
47
libavcodec/x86/apedsp_init.c
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
/*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "libavutil/attributes.h"
|
||||||
|
#include "libavutil/cpu.h"
|
||||||
|
#include "libavutil/x86/cpu.h"
|
||||||
|
#include "libavcodec/apedsp.h"
|
||||||
|
|
||||||
|
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
|
||||||
|
const int16_t *v3,
|
||||||
|
int order, int mul);
|
||||||
|
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
|
||||||
|
const int16_t *v3,
|
||||||
|
int order, int mul);
|
||||||
|
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
|
||||||
|
const int16_t *v3,
|
||||||
|
int order, int mul);
|
||||||
|
|
||||||
|
/**
 * Install the x86-optimised APE DSP routines into c.
 *
 * The checks run in order and each later match overrides the earlier one,
 * so the last routine whose condition holds is the one left in the context.
 */
av_cold void ff_apedsp_init_x86(APEDSPContext *c)
{
    const int flags = av_get_cpu_flags();
    /* The original condition is annotated "cachesplit": the SSSE3 routine
     * is not used on CPUs that report SSE4.2 or 3DNow!. */
    const int skip_ssse3 = flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW);

    if (EXTERNAL_MMXEXT(flags)) {
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
    }

    if (EXTERNAL_SSE2(flags)) {
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    }

    if (EXTERNAL_SSSE3(flags) && !skip_ssse3) {
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    }
}
|
@ -53,45 +53,6 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
|
|||||||
emms
|
emms
|
||||||
%endif
|
%endif
|
||||||
RET
|
RET
|
||||||
|
|
||||||
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
|
|
||||||
; int order, int mul)
|
|
||||||
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
|
|
||||||
shl orderq, 1
|
|
||||||
movd m7, mulm
|
|
||||||
%if mmsize == 16
|
|
||||||
pshuflw m7, m7, 0
|
|
||||||
punpcklqdq m7, m7
|
|
||||||
%else
|
|
||||||
pshufw m7, m7, 0
|
|
||||||
%endif
|
|
||||||
pxor m6, m6
|
|
||||||
add v1q, orderq
|
|
||||||
add v2q, orderq
|
|
||||||
add v3q, orderq
|
|
||||||
neg orderq
|
|
||||||
.loop:
|
|
||||||
movu m0, [v2q + orderq]
|
|
||||||
movu m1, [v2q + orderq + mmsize]
|
|
||||||
mova m4, [v1q + orderq]
|
|
||||||
mova m5, [v1q + orderq + mmsize]
|
|
||||||
movu m2, [v3q + orderq]
|
|
||||||
movu m3, [v3q + orderq + mmsize]
|
|
||||||
pmaddwd m0, m4
|
|
||||||
pmaddwd m1, m5
|
|
||||||
pmullw m2, m7
|
|
||||||
pmullw m3, m7
|
|
||||||
paddd m6, m0
|
|
||||||
paddd m6, m1
|
|
||||||
paddw m2, m4
|
|
||||||
paddw m3, m5
|
|
||||||
mova [v1q + orderq], m2
|
|
||||||
mova [v1q + orderq + mmsize], m3
|
|
||||||
add orderq, mmsize*2
|
|
||||||
jl .loop
|
|
||||||
HADDD m6, m0
|
|
||||||
movd eax, m6
|
|
||||||
RET
|
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
INIT_MMX mmxext
|
INIT_MMX mmxext
|
||||||
@ -99,94 +60,6 @@ SCALARPRODUCT
|
|||||||
INIT_XMM sse2
|
INIT_XMM sse2
|
||||||
SCALARPRODUCT
|
SCALARPRODUCT
|
||||||
|
|
||||||
; SCALARPRODUCT_LOOP %1: main loop specialised for a source offset of %1 bytes
; from 16-byte alignment (%1 == 0 means plain aligned loads, no palignr).
; orderq is a byte offset that is decremented by 2*mmsize per iteration.
; Each iteration adds the v1*v2 products (pmaddwd) into the dword accumulator
; m6 and stores v1 + v3*m7 (16-bit arithmetic) back to v1.
; For %1 != 0 the first iteration reads m4/m5 before writing them, so they must
; already hold the aligned vectors at [v2q + orderq] and [v3q + orderq].
; Non-zero variants end with "jmp .end"; the %1 == 0 variant falls through.
%macro SCALARPRODUCT_LOOP 1
|
|
||||||
align 16
|
|
||||||
.loop%1:
|
|
||||||
; sets the flags tested by the jg at the bottom of the loop; the SIMD
; instructions in between do not modify EFLAGS
sub orderq, mmsize*2
|
|
||||||
%if %1
|
|
||||||
; m1 = aligned v2 vector loaded by the previous iteration (one block higher)
mova m1, m4
|
|
||||||
mova m4, [v2q + orderq]
|
|
||||||
mova m0, [v2q + orderq + mmsize]
|
|
||||||
; rebuild the two v2 vectors starting %1 bytes into the aligned blocks
palignr m1, m0, %1
|
|
||||||
palignr m0, m4, %1
|
|
||||||
; same scheme for v3, with m5 carrying the previous aligned vector
mova m3, m5
|
|
||||||
mova m5, [v3q + orderq]
|
|
||||||
mova m2, [v3q + orderq + mmsize]
|
|
||||||
palignr m3, m2, %1
|
|
||||||
palignr m2, m5, %1
|
|
||||||
%else
|
|
||||||
mova m0, [v2q + orderq]
|
|
||||||
mova m1, [v2q + orderq + mmsize]
|
|
||||||
mova m2, [v3q + orderq]
|
|
||||||
mova m3, [v3q + orderq + mmsize]
|
|
||||||
%endif
|
|
||||||
; t0/t1 = the two v1 vectors; by default they are memory operands
%define t0 [v1q + orderq]
|
|
||||||
%define t1 [v1q + orderq + mmsize]
|
|
||||||
%if ARCH_X86_64
|
|
||||||
; on x86-64 load v1 once into m8/m9 and reuse the registers for both
; pmaddwd and paddw
mova m8, t0
|
|
||||||
mova m9, t1
|
|
||||||
%define t0 m8
|
|
||||||
%define t1 m9
|
|
||||||
%endif
|
|
||||||
; m0/m1 = pairwise sums of v2*v1 products (dwords)
pmaddwd m0, t0
|
|
||||||
pmaddwd m1, t1
|
|
||||||
; m2/m3 = low 16 bits of v3*m7, then + v1
pmullw m2, m7
|
|
||||||
pmullw m3, m7
|
|
||||||
paddw m2, t0
|
|
||||||
paddw m3, t1
|
|
||||||
paddd m6, m0
|
|
||||||
paddd m6, m1
|
|
||||||
mova [v1q + orderq], m2
|
|
||||||
mova [v1q + orderq + mmsize], m3
|
|
||||||
; continue while the byte offset left by the sub above is still positive
jg .loop%1
|
|
||||||
%if %1
|
|
||||||
; skip over the loop variants expanded after this one
jmp .end
|
|
||||||
%endif
|
|
||||||
%endmacro
|
|
||||||
|
|
||||||
; SSSE3 version: returns the dot product of v1 and v2 and updates
; v1[i] += mul * v3[i]. v2/v3 are read with aligned loads only; unaligned
; data is reassembled with palignr inside SCALARPRODUCT_LOOP.
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
|
|
||||||
; int order, int mul)
|
|
||||||
INIT_XMM ssse3
|
|
||||||
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
|
|
||||||
; order counts int16 elements; turn it into a byte count
shl orderq, 1
|
|
||||||
; broadcast the low 16 bits of mul to all eight words of m7
movd m7, mulm
|
|
||||||
pshuflw m7, m7, 0
|
|
||||||
punpcklqdq m7, m7
|
|
||||||
; m6 = dword accumulator for the dot product
pxor m6, m6
|
|
||||||
; r4d = byte offset of v2 from 16-byte alignment
mov r4d, v2d
|
|
||||||
and r4d, 15
|
|
||||||
; round both source pointers down to a 16-byte boundary
; NOTE(review): the offset is taken from v2 only but is applied to v3 as well,
; so v3 is presumably expected to share v2's misalignment - confirm with callers
and v2q, ~15
|
|
||||||
and v3q, ~15
|
|
||||||
; preload the aligned vectors at the end offset; the palignr loop variants
; consume m4/m5 in their first iteration
mova m4, [v2q + orderq]
|
|
||||||
mova m5, [v3q + orderq]
|
|
||||||
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
|
|
||||||
; only even offsets 0..12 are tested; any other value falls through into the
; 14-byte variant expanded first below
cmp r4d, 0
|
|
||||||
je .loop0
|
|
||||||
cmp r4d, 2
|
|
||||||
je .loop2
|
|
||||||
cmp r4d, 4
|
|
||||||
je .loop4
|
|
||||||
cmp r4d, 6
|
|
||||||
je .loop6
|
|
||||||
cmp r4d, 8
|
|
||||||
je .loop8
|
|
||||||
cmp r4d, 10
|
|
||||||
je .loop10
|
|
||||||
cmp r4d, 12
|
|
||||||
je .loop12
|
|
||||||
; one loop body per offset; each non-zero variant jumps to .end when done,
; and the 0 variant (expanded last) falls through to .end
SCALARPRODUCT_LOOP 14
|
|
||||||
SCALARPRODUCT_LOOP 12
|
|
||||||
SCALARPRODUCT_LOOP 10
|
|
||||||
SCALARPRODUCT_LOOP 8
|
|
||||||
SCALARPRODUCT_LOOP 6
|
|
||||||
SCALARPRODUCT_LOOP 4
|
|
||||||
SCALARPRODUCT_LOOP 2
|
|
||||||
SCALARPRODUCT_LOOP 0
|
|
||||||
.end:
|
|
||||||
; HADDD is defined outside this view; presumably it sums the dwords of m6
; using m0 as scratch - confirm in the x86 utility macros
HADDD m6, m0
|
|
||||||
; the low dword of m6 is returned in eax
movd eax, m6
|
|
||||||
RET
|
|
||||||
|
|
||||||
|
|
||||||
;-----------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------
|
||||||
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
|
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
|
||||||
|
@ -79,15 +79,6 @@ int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
|
|||||||
int order);
|
int order);
|
||||||
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
|
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
|
||||||
int order);
|
int order);
|
||||||
/*
 * Prototypes of the x86 assembly versions of scalarproduct_and_madd_int16,
 * one per instruction-set level (MMXEXT, SSE2, SSSE3). All three take the
 * same arguments: v1 is read and written, v2 and v3 are read-only, order is
 * the element count and mul the multiplier applied to v3.
 */
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
|
|
||||||
const int16_t *v3,
|
|
||||||
int order, int mul);
|
|
||||||
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
|
|
||||||
const int16_t *v3,
|
|
||||||
int order, int mul);
|
|
||||||
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
|
|
||||||
const int16_t *v3,
|
|
||||||
int order, int mul);
|
|
||||||
|
|
||||||
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
|
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
|
||||||
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
|
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
|
||||||
@ -560,7 +551,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
|
|||||||
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
|
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
|
||||||
|
|
||||||
c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
|
c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
|
||||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
|
|
||||||
#endif /* HAVE_MMXEXT_EXTERNAL */
|
#endif /* HAVE_MMXEXT_EXTERNAL */
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -600,7 +590,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
|
|||||||
|
|
||||||
#if HAVE_SSE2_EXTERNAL
|
#if HAVE_SSE2_EXTERNAL
|
||||||
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
|
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
|
||||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
|
|
||||||
if (cpu_flags & AV_CPU_FLAG_ATOM) {
|
if (cpu_flags & AV_CPU_FLAG_ATOM) {
|
||||||
c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
|
c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
|
||||||
} else {
|
} else {
|
||||||
@ -615,8 +604,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
|
|||||||
int cpu_flags, unsigned high_bit_depth)
|
int cpu_flags, unsigned high_bit_depth)
|
||||||
{
|
{
|
||||||
#if HAVE_SSSE3_EXTERNAL
|
#if HAVE_SSSE3_EXTERNAL
|
||||||
if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
|
|
||||||
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
|
|
||||||
c->bswap_buf = ff_bswap32_buf_ssse3;
|
c->bswap_buf = ff_bswap32_buf_ssse3;
|
||||||
#endif /* HAVE_SSSE3_EXTERNAL */
|
#endif /* HAVE_SSSE3_EXTERNAL */
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user