From adf4ee40e7376d63d9c557e0e3e0e440e1144405 Mon Sep 17 00:00:00 2001 From: Christophe Gisquet Date: Sat, 24 Nov 2012 15:55:49 +0100 Subject: [PATCH] wma lossless: reuse scalarproduct_and_madd_int16 This is done by padding the coefficient buffer with 0s, because the order may be only a multiple of 4, and the DSP function requires batches of 8. However, no sample with such a case was found, so request one if it uses that kind of order. Approximate relative speedup depending on instruction set: plain C: -6% mmxext: 51% sse2: 54% Signed-off-by: Michael Niedermayer --- libavcodec/wmalosslessdec.c | 60 ++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/libavcodec/wmalosslessdec.c b/libavcodec/wmalosslessdec.c index 99139eba78..ca62dc1cf9 100644 --- a/libavcodec/wmalosslessdec.c +++ b/libavcodec/wmalosslessdec.c @@ -31,6 +31,7 @@ #include "internal.h" #include "get_bits.h" #include "put_bits.h" +#include "lossless_audiodsp.h" #include "wma.h" #include "wma_common.h" @@ -46,6 +47,7 @@ #define WMALL_BLOCK_MAX_SIZE (1 << WMALL_BLOCK_MAX_BITS) ///< maximum block size #define WMALL_BLOCK_SIZES (WMALL_BLOCK_MAX_BITS - WMALL_BLOCK_MIN_BITS + 1) ///< possible block sizes +#define WMALL_COEFF_PAD_SIZE 16 ///< pad coef buffers with 0 for use with SIMD /** * @brief frame-specific decoder context for a single channel @@ -69,6 +71,7 @@ typedef struct WmallDecodeCtx { /* generic decoder variables */ AVCodecContext *avctx; AVFrame *frame; + LLAudDSPContext dsp; ///< accelerated DSP functions uint8_t frame_data[MAX_FRAMESIZE + FF_INPUT_BUFFER_PADDING_SIZE]; ///< compressed frame data PutBitContext pb; ///< context for filling the frame_data buffer @@ -143,9 +146,9 @@ typedef struct WmallDecodeCtx { int scaling; int coefsend; int bitsend; - int16_t coefs[MAX_ORDER]; - int16_t lms_prevvalues[MAX_ORDER * 2]; - int16_t lms_updates[MAX_ORDER * 2]; + DECLARE_ALIGNED(16, int16_t, coefs)[MAX_ORDER + WMALL_COEFF_PAD_SIZE/sizeof(int16_t)]; + DECLARE_ALIGNED(16, int16_t, lms_prevvalues)[MAX_ORDER * 2]; + DECLARE_ALIGNED(16, int16_t, lms_updates)[MAX_ORDER * 2]; int recent; } cdlms[WMALL_MAX_CHANNELS][9]; @@ -186,6 +189,7 @@ static av_cold int decode_init(AVCodecContext *avctx) } s->avctx = avctx; + ff_llauddsp_init(&s->dsp); init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE); if (avctx->extradata_size >= 18) { @@ -459,6 +463,13 @@ static int decode_cdlms(WmallDecodeCtx *s) s->cdlms[0][0].order = 0; return AVERROR_INVALIDDATA; } + if(s->cdlms[c][i].order & 8) { + static int warned; + if(!warned) + avpriv_request_sample(s->avctx, "CDLMS of order %d", + s->cdlms[c][i].order); + warned = 1; + } } for (i = 0; i < s->cdlms_ttl[c]; i++) @@ -484,6 +495,10 @@ static int decode_cdlms(WmallDecodeCtx *s) (get_bits(&s->gb, s->cdlms[c][i].bitsend) << shift_l) >> shift_r; } } + + for (i = 0; i < s->cdlms_ttl[c]; i++) + memset(s->cdlms[c][i].coefs + s->cdlms[c][i].order, + 0, WMALL_COEFF_PAD_SIZE); } return 0; @@ -693,35 +708,11 @@ static void revert_mclms(WmallDecodeCtx *s, int tile_size) } } -static int lms_predict(WmallDecodeCtx *s, int ich, int ilms) +static void lms_update(WmallDecodeCtx *s, int ich, int ilms, int input) { - int pred = 0, icoef; - int recent = s->cdlms[ich][ilms].recent; - - for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++) - pred += s->cdlms[ich][ilms].coefs[icoef] * - s->cdlms[ich][ilms].lms_prevvalues[icoef + recent]; - - return pred; -} - -static void lms_update(WmallDecodeCtx *s, int ich, int ilms, - int input, int residue) -{ - int icoef; int recent = s->cdlms[ich][ilms].recent; int range = 1 << s->bits_per_sample - 1; - if (residue < 0) { - for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++) - s->cdlms[ich][ilms].coefs[icoef] -= - s->cdlms[ich][ilms].lms_updates[icoef + recent]; - } else if (residue > 0) { - for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++) - s->cdlms[ich][ilms].coefs[icoef] += - s->cdlms[ich][ilms].lms_updates[icoef + recent]; - } - if (recent) recent--; else { @@ -782,6 +773,9 @@ static void use_normal_update_speed(WmallDecodeCtx *s, int ich) s->update_speed[ich] = 8; } +/** Get sign of integer (1 for positive, -1 for negative and 0 for zero) */ +#define WMASIGN(x) ((x > 0) - (x < 0)) + static void revert_cdlms(WmallDecodeCtx *s, int ch, int coef_begin, int coef_end) { @@ -792,9 +786,15 @@ static void revert_cdlms(WmallDecodeCtx *s, int ch, for (icoef = coef_begin; icoef < coef_end; icoef++) { pred = 1 << (s->cdlms[ch][ilms].scaling - 1); residue = s->channel_residues[ch][icoef]; - pred += lms_predict(s, ch, ilms); + pred += s->dsp.scalarproduct_and_madd_int16(s->cdlms[ch][ilms].coefs, + s->cdlms[ch][ilms].lms_prevvalues + + s->cdlms[ch][ilms].recent, + s->cdlms[ch][ilms].lms_updates + + s->cdlms[ch][ilms].recent, + s->cdlms[ch][ilms].order, + WMASIGN(residue)); input = residue + (pred >> s->cdlms[ch][ilms].scaling); - lms_update(s, ch, ilms, input, residue); + lms_update(s, ch, ilms, input); s->channel_residues[ch][icoef] = input; } }