mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
vp3: DC-only IDCT
2-4% faster overall decode. Originally committed as revision 22896 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
f32f7d8b24
commit
eb6a6cd788
@ -32,6 +32,7 @@ void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
|
|||||||
void ff_vp3_idct_neon(DCTELEM *data);
|
void ff_vp3_idct_neon(DCTELEM *data);
|
||||||
void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
|
void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
|
||||||
void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
|
void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
|
||||||
|
void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const DCTELEM *data);
|
||||||
|
|
||||||
void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
|
void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
|
||||||
void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
|
void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
|
||||||
@ -294,6 +295,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
|
|||||||
if (CONFIG_VP3_DECODER) {
|
if (CONFIG_VP3_DECODER) {
|
||||||
c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon;
|
c->vp3_v_loop_filter = ff_vp3_v_loop_filter_neon;
|
||||||
c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon;
|
c->vp3_h_loop_filter = ff_vp3_h_loop_filter_neon;
|
||||||
|
c->vp3_idct_dc_add = ff_vp3_idct_dc_add_neon;
|
||||||
}
|
}
|
||||||
|
|
||||||
c->vector_fmul = ff_vector_fmul_neon;
|
c->vector_fmul = ff_vector_fmul_neon;
|
||||||
|
@ -374,3 +374,47 @@ function ff_vp3_idct_add_neon, export=1
|
|||||||
vst1.64 {d7}, [r2,:64], r1
|
vst1.64 {d7}, [r2,:64], r1
|
||||||
bx lr
|
bx lr
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
|
/*
 * ff_vp3_idct_dc_add_neon: add a single scaled DC value to an 8x8 block
 * of bytes, saturating each result to the unsigned 8-bit range.
 *
 * Register use as written below:
 *   r0  row pointer used for the loads, post-incremented by r1 per row
 *   r1  row stride in bytes
 *   r2  on entry a pointer to the signed 16-bit DC coefficient; afterwards
 *       the scaled DC value
 *   r3  first the constant 46341, then a copy of r0 used for the stores
 * NOTE(review): r0/r1/r2 presumably carry dest/line_size/data from the C
 * prototype under the standard ARM calling convention - confirm.
 *
 * The loads, widening adds, narrowing moves and stores of the eight rows
 * are interleaved by hand; the order of the instructions is kept as is.
 */
function ff_vp3_idct_dc_add_neon, export=1
|
||||||
|
/* Fetch the signed 16-bit coefficient that r2 points at. */
ldrsh r2, [r2]
|
||||||
|
movw r3, #46341
|
||||||
|
/* r2 = 46341 * dc (46341/65536 is about 0.7071). */
mul r2, r3, r2
|
||||||
|
/* r2 = (46341 * top halfword of r2) >> 16, i.e. the second scaling step. */
smulwt r2, r3, r2
|
||||||
|
/* Keep an unmodified copy of the start pointer for the stores. */
mov r3, r0
|
||||||
|
/* Broadcast the low 16 bits of r2 to all eight lanes of q15. */
vdup.16 q15, r2
|
||||||
|
/* Rounding shift right by 4 of every lane. */
vrshr.s16 q15, q15, #4
|
||||||
|
|
||||||
|
/* Each vld1.8 reads one 8-byte row and advances r0 by the stride in r1. */
vld1.8 {d0}, [r0,:64], r1
|
||||||
|
vld1.8 {d1}, [r0,:64], r1
|
||||||
|
vld1.8 {d2}, [r0,:64], r1
|
||||||
|
/* Each vaddw.u8 widens a row of bytes to 16 bits and adds the DC lanes. */
vaddw.u8 q8, q15, d0
|
||||||
|
vld1.8 {d3}, [r0,:64], r1
|
||||||
|
vaddw.u8 q9, q15, d1
|
||||||
|
vld1.8 {d4}, [r0,:64], r1
|
||||||
|
vaddw.u8 q10, q15, d2
|
||||||
|
vld1.8 {d5}, [r0,:64], r1
|
||||||
|
vaddw.u8 q11, q15, d3
|
||||||
|
vld1.8 {d6}, [r0,:64], r1
|
||||||
|
vaddw.u8 q12, q15, d4
|
||||||
|
vld1.8 {d7}, [r0,:64], r1
|
||||||
|
vaddw.u8 q13, q15, d5
|
||||||
|
/* Each vqmovun.s16 narrows a signed 16-bit row to unsigned bytes with saturation. */
vqmovun.s16 d0, q8
|
||||||
|
vaddw.u8 q14, q15, d6
|
||||||
|
vqmovun.s16 d1, q9
|
||||||
|
/* Last use of the DC lanes: the sum for the eighth row overwrites q15. */
vaddw.u8 q15, q15, d7
|
||||||
|
vqmovun.s16 d2, q10
|
||||||
|
/* Each vst1.8 writes one 8-byte row through r3 and advances r3 by r1. */
vst1.8 {d0}, [r3,:64], r1
|
||||||
|
vqmovun.s16 d3, q11
|
||||||
|
vst1.8 {d1}, [r3,:64], r1
|
||||||
|
vqmovun.s16 d4, q12
|
||||||
|
vst1.8 {d2}, [r3,:64], r1
|
||||||
|
vqmovun.s16 d5, q13
|
||||||
|
vst1.8 {d3}, [r3,:64], r1
|
||||||
|
vqmovun.s16 d6, q14
|
||||||
|
vst1.8 {d4}, [r3,:64], r1
|
||||||
|
vqmovun.s16 d7, q15
|
||||||
|
vst1.8 {d5}, [r3,:64], r1
|
||||||
|
vst1.8 {d6}, [r3,:64], r1
|
||||||
|
vst1.8 {d7}, [r3,:64], r1
|
||||||
|
bx lr
|
||||||
|
endfunc
|
||||||
|
@ -4467,6 +4467,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
|
|||||||
if (CONFIG_VP3_DECODER) {
|
if (CONFIG_VP3_DECODER) {
|
||||||
c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
|
c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
|
||||||
c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
|
c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
|
||||||
|
c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
|
||||||
}
|
}
|
||||||
if (CONFIG_VP6_DECODER) {
|
if (CONFIG_VP6_DECODER) {
|
||||||
c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
|
c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
|
||||||
|
@ -86,6 +86,7 @@ extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
|
|||||||
void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
|
void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
|
||||||
void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
|
void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
|
||||||
void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
|
void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
|
||||||
|
void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);
|
||||||
|
|
||||||
void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
|
void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
|
||||||
void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
|
void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
|
||||||
@ -373,6 +374,7 @@ typedef struct DSPContext {
|
|||||||
void (*x8_v_loop_filter)(uint8_t *src, int stride, int qscale);
|
void (*x8_v_loop_filter)(uint8_t *src, int stride, int qscale);
|
||||||
void (*x8_h_loop_filter)(uint8_t *src, int stride, int qscale);
|
void (*x8_h_loop_filter)(uint8_t *src, int stride, int qscale);
|
||||||
|
|
||||||
|
void (*vp3_idct_dc_add)(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);
|
||||||
void (*vp3_v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
|
void (*vp3_v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
|
||||||
void (*vp3_h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
|
void (*vp3_h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
|
||||||
|
|
||||||
|
@ -1395,8 +1395,6 @@ static void render_slice(Vp3DecodeContext *s, int slice)
|
|||||||
|
|
||||||
/* transform if this block was coded */
|
/* transform if this block was coded */
|
||||||
if (s->all_fragments[i].coding_method != MODE_COPY) {
|
if (s->all_fragments[i].coding_method != MODE_COPY) {
|
||||||
int intra = s->all_fragments[i].coding_method == MODE_INTRA;
|
|
||||||
|
|
||||||
if ((s->all_fragments[i].coding_method == MODE_USING_GOLDEN) ||
|
if ((s->all_fragments[i].coding_method == MODE_USING_GOLDEN) ||
|
||||||
(s->all_fragments[i].coding_method == MODE_GOLDEN_MV))
|
(s->all_fragments[i].coding_method == MODE_GOLDEN_MV))
|
||||||
motion_source= golden_plane;
|
motion_source= golden_plane;
|
||||||
@ -1456,11 +1454,11 @@ static void render_slice(Vp3DecodeContext *s, int slice)
|
|||||||
}
|
}
|
||||||
|
|
||||||
s->dsp.clear_block(block);
|
s->dsp.clear_block(block);
|
||||||
vp3_dequant(s, s->all_fragments + i, plane, !intra, block);
|
|
||||||
|
|
||||||
/* invert DCT and place (or add) in final output */
|
/* invert DCT and place (or add) in final output */
|
||||||
|
|
||||||
if (s->all_fragments[i].coding_method == MODE_INTRA) {
|
if (s->all_fragments[i].coding_method == MODE_INTRA) {
|
||||||
|
vp3_dequant(s, s->all_fragments + i, plane, 0, block);
|
||||||
if(s->avctx->idct_algo!=FF_IDCT_VP3)
|
if(s->avctx->idct_algo!=FF_IDCT_VP3)
|
||||||
block[0] += 128<<3;
|
block[0] += 128<<3;
|
||||||
s->dsp.idct_put(
|
s->dsp.idct_put(
|
||||||
@ -1468,10 +1466,14 @@ static void render_slice(Vp3DecodeContext *s, int slice)
|
|||||||
stride,
|
stride,
|
||||||
block);
|
block);
|
||||||
} else {
|
} else {
|
||||||
|
if (vp3_dequant(s, s->all_fragments + i, plane, 1, block)) {
|
||||||
s->dsp.idct_add(
|
s->dsp.idct_add(
|
||||||
output_plane + first_pixel,
|
output_plane + first_pixel,
|
||||||
stride,
|
stride,
|
||||||
block);
|
block);
|
||||||
|
} else {
|
||||||
|
s->dsp.vp3_idct_dc_add(output_plane + first_pixel, stride, block);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
|
@ -223,6 +223,25 @@ void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*
|
|||||||
idct(dest, line_size, block, 2);
|
idct(dest, line_size, block, 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/){
|
||||||
|
const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
|
||||||
|
int i, dc = block[0];
|
||||||
|
dc = (46341*dc)>>16;
|
||||||
|
dc = (46341*dc + (8<<16))>>20;
|
||||||
|
|
||||||
|
for(i = 0; i < 8; i++){
|
||||||
|
dest[0] = cm[dest[0]+dc];
|
||||||
|
dest[1] = cm[dest[1]+dc];
|
||||||
|
dest[2] = cm[dest[2]+dc];
|
||||||
|
dest[3] = cm[dest[3]+dc];
|
||||||
|
dest[4] = cm[dest[4]+dc];
|
||||||
|
dest[5] = cm[dest[5]+dc];
|
||||||
|
dest[6] = cm[dest[6]+dc];
|
||||||
|
dest[7] = cm[dest[7]+dc];
|
||||||
|
dest += line_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void ff_vp3_v_loop_filter_c(uint8_t *first_pixel, int stride, int *bounding_values)
|
void ff_vp3_v_loop_filter_c(uint8_t *first_pixel, int stride, int *bounding_values)
|
||||||
{
|
{
|
||||||
unsigned char *end;
|
unsigned char *end;
|
||||||
|
@ -2653,6 +2653,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
|||||||
c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
|
c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (CONFIG_VP3_DECODER) {
|
||||||
|
c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
|
||||||
|
}
|
||||||
|
|
||||||
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
|
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
|
||||||
c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
|
c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
|
||||||
|
@ -395,3 +395,44 @@ void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
|
|||||||
ff_vp3_idct_mmx(block);
|
ff_vp3_idct_mmx(block);
|
||||||
add_pixels_clamped_mmx(block, dest, line_size);
|
add_pixels_clamped_mmx(block, dest, line_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block)
|
||||||
|
{
|
||||||
|
int dc = block[0];
|
||||||
|
dc = (46341*dc)>>16;
|
||||||
|
dc = (46341*dc + (8<<16))>>20;
|
||||||
|
|
||||||
|
__asm__ volatile(
|
||||||
|
"movd %3, %%mm0 \n\t"
|
||||||
|
"pshufw $0, %%mm0, %%mm0 \n\t"
|
||||||
|
"pxor %%mm1, %%mm1 \n\t"
|
||||||
|
"psubw %%mm0, %%mm1 \n\t"
|
||||||
|
"packuswb %%mm0, %%mm0 \n\t"
|
||||||
|
"packuswb %%mm1, %%mm1 \n\t"
|
||||||
|
|
||||||
|
#define DC_ADD \
|
||||||
|
"movq (%0), %%mm2 \n\t" \
|
||||||
|
"movq (%0,%1), %%mm3 \n\t" \
|
||||||
|
"paddusb %%mm0, %%mm2 \n\t" \
|
||||||
|
"movq (%0,%1,2), %%mm4 \n\t" \
|
||||||
|
"paddusb %%mm0, %%mm3 \n\t" \
|
||||||
|
"movq (%0,%2), %%mm5 \n\t" \
|
||||||
|
"paddusb %%mm0, %%mm4 \n\t" \
|
||||||
|
"paddusb %%mm0, %%mm5 \n\t" \
|
||||||
|
"psubusb %%mm1, %%mm2 \n\t" \
|
||||||
|
"psubusb %%mm1, %%mm3 \n\t" \
|
||||||
|
"movq %%mm2, (%0) \n\t" \
|
||||||
|
"psubusb %%mm1, %%mm4 \n\t" \
|
||||||
|
"movq %%mm3, (%0,%1) \n\t" \
|
||||||
|
"psubusb %%mm1, %%mm5 \n\t" \
|
||||||
|
"movq %%mm4, (%0,%1,2) \n\t" \
|
||||||
|
"movq %%mm5, (%0,%2) \n\t"
|
||||||
|
|
||||||
|
DC_ADD
|
||||||
|
"lea (%0,%1,4), %0 \n\t"
|
||||||
|
DC_ADD
|
||||||
|
|
||||||
|
: "+r"(dest)
|
||||||
|
: "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
@ -28,6 +28,7 @@
|
|||||||
void ff_vp3_idct_mmx(int16_t *data);
|
void ff_vp3_idct_mmx(int16_t *data);
|
||||||
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
|
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
|
||||||
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
|
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
|
||||||
|
void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
|
||||||
|
|
||||||
void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
|
void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
|
||||||
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
|
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
|
||||||
|
Loading…
Reference in New Issue
Block a user