mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
wma lossless: reuse scalarproduct_and_madd_int16
This is done by padding the coefficient buffer with 0s, because the order may be only a multiple of 4, and the DSP function requires batches of 8. However, no sample with such a case was found, so request one if it uses that kind of order. Approximate relative speedup depending on instruction set: plain C: -6% mmxext: 51% sse2: 54% Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
c2e2b29b73
commit
adf4ee40e7
@ -31,6 +31,7 @@
|
||||
#include "internal.h"
|
||||
#include "get_bits.h"
|
||||
#include "put_bits.h"
|
||||
#include "lossless_audiodsp.h"
|
||||
#include "wma.h"
|
||||
#include "wma_common.h"
|
||||
|
||||
@ -46,6 +47,7 @@
|
||||
#define WMALL_BLOCK_MAX_SIZE (1 << WMALL_BLOCK_MAX_BITS) ///< maximum block size
|
||||
#define WMALL_BLOCK_SIZES (WMALL_BLOCK_MAX_BITS - WMALL_BLOCK_MIN_BITS + 1) ///< possible block sizes
|
||||
|
||||
#define WMALL_COEFF_PAD_SIZE 16 ///< pad coef buffers with 0 for use with SIMD
|
||||
|
||||
/**
|
||||
* @brief frame-specific decoder context for a single channel
|
||||
@ -69,6 +71,7 @@ typedef struct WmallDecodeCtx {
|
||||
/* generic decoder variables */
|
||||
AVCodecContext *avctx;
|
||||
AVFrame *frame;
|
||||
LLAudDSPContext dsp; ///< accelerated DSP functions
|
||||
uint8_t frame_data[MAX_FRAMESIZE + FF_INPUT_BUFFER_PADDING_SIZE]; ///< compressed frame data
|
||||
PutBitContext pb; ///< context for filling the frame_data buffer
|
||||
|
||||
@ -143,9 +146,9 @@ typedef struct WmallDecodeCtx {
|
||||
int scaling;
|
||||
int coefsend;
|
||||
int bitsend;
|
||||
int16_t coefs[MAX_ORDER];
|
||||
int16_t lms_prevvalues[MAX_ORDER * 2];
|
||||
int16_t lms_updates[MAX_ORDER * 2];
|
||||
DECLARE_ALIGNED(16, int16_t, coefs)[MAX_ORDER + WMALL_COEFF_PAD_SIZE/sizeof(int16_t)];
|
||||
DECLARE_ALIGNED(16, int16_t, lms_prevvalues)[MAX_ORDER * 2];
|
||||
DECLARE_ALIGNED(16, int16_t, lms_updates)[MAX_ORDER * 2];
|
||||
int recent;
|
||||
} cdlms[WMALL_MAX_CHANNELS][9];
|
||||
|
||||
@ -186,6 +189,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
|
||||
}
|
||||
|
||||
s->avctx = avctx;
|
||||
ff_llauddsp_init(&s->dsp);
|
||||
init_put_bits(&s->pb, s->frame_data, MAX_FRAMESIZE);
|
||||
|
||||
if (avctx->extradata_size >= 18) {
|
||||
@ -459,6 +463,13 @@ static int decode_cdlms(WmallDecodeCtx *s)
|
||||
s->cdlms[0][0].order = 0;
|
||||
return AVERROR_INVALIDDATA;
|
||||
}
|
||||
if(s->cdlms[c][i].order & 8) {
|
||||
static int warned;
|
||||
if(!warned)
|
||||
avpriv_request_sample(s->avctx, "CDLMS of order %d",
|
||||
s->cdlms[c][i].order);
|
||||
warned = 1;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < s->cdlms_ttl[c]; i++)
|
||||
@ -484,6 +495,10 @@ static int decode_cdlms(WmallDecodeCtx *s)
|
||||
(get_bits(&s->gb, s->cdlms[c][i].bitsend) << shift_l) >> shift_r;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < s->cdlms_ttl[c]; i++)
|
||||
memset(s->cdlms[c][i].coefs + s->cdlms[c][i].order,
|
||||
0, WMALL_COEFF_PAD_SIZE);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -693,35 +708,11 @@ static void revert_mclms(WmallDecodeCtx *s, int tile_size)
|
||||
}
|
||||
}
|
||||
|
||||
static int lms_predict(WmallDecodeCtx *s, int ich, int ilms)
|
||||
static void lms_update(WmallDecodeCtx *s, int ich, int ilms, int input)
|
||||
{
|
||||
int pred = 0, icoef;
|
||||
int recent = s->cdlms[ich][ilms].recent;
|
||||
|
||||
for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
|
||||
pred += s->cdlms[ich][ilms].coefs[icoef] *
|
||||
s->cdlms[ich][ilms].lms_prevvalues[icoef + recent];
|
||||
|
||||
return pred;
|
||||
}
|
||||
|
||||
static void lms_update(WmallDecodeCtx *s, int ich, int ilms,
|
||||
int input, int residue)
|
||||
{
|
||||
int icoef;
|
||||
int recent = s->cdlms[ich][ilms].recent;
|
||||
int range = 1 << s->bits_per_sample - 1;
|
||||
|
||||
if (residue < 0) {
|
||||
for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
|
||||
s->cdlms[ich][ilms].coefs[icoef] -=
|
||||
s->cdlms[ich][ilms].lms_updates[icoef + recent];
|
||||
} else if (residue > 0) {
|
||||
for (icoef = 0; icoef < s->cdlms[ich][ilms].order; icoef++)
|
||||
s->cdlms[ich][ilms].coefs[icoef] +=
|
||||
s->cdlms[ich][ilms].lms_updates[icoef + recent];
|
||||
}
|
||||
|
||||
if (recent)
|
||||
recent--;
|
||||
else {
|
||||
@ -782,6 +773,9 @@ static void use_normal_update_speed(WmallDecodeCtx *s, int ich)
|
||||
s->update_speed[ich] = 8;
|
||||
}
|
||||
|
||||
/** Get sign of integer (1 for positive, -1 for negative and 0 for zero) */
|
||||
#define WMASIGN(x) ((x > 0) - (x < 0))
|
||||
|
||||
static void revert_cdlms(WmallDecodeCtx *s, int ch,
|
||||
int coef_begin, int coef_end)
|
||||
{
|
||||
@ -792,9 +786,15 @@ static void revert_cdlms(WmallDecodeCtx *s, int ch,
|
||||
for (icoef = coef_begin; icoef < coef_end; icoef++) {
|
||||
pred = 1 << (s->cdlms[ch][ilms].scaling - 1);
|
||||
residue = s->channel_residues[ch][icoef];
|
||||
pred += lms_predict(s, ch, ilms);
|
||||
pred += s->dsp.scalarproduct_and_madd_int16(s->cdlms[ch][ilms].coefs,
|
||||
s->cdlms[ch][ilms].lms_prevvalues
|
||||
+ s->cdlms[ch][ilms].recent,
|
||||
s->cdlms[ch][ilms].lms_updates
|
||||
+ s->cdlms[ch][ilms].recent,
|
||||
s->cdlms[ch][ilms].order,
|
||||
WMASIGN(residue));
|
||||
input = residue + (pred >> s->cdlms[ch][ilms].scaling);
|
||||
lms_update(s, ch, ilms, input, residue);
|
||||
lms_update(s, ch, ilms, input);
|
||||
s->channel_residues[ch][icoef] = input;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user