mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
vp3: integrate clear_blocks with idct of previous block.
This is the same approach that e.g. vp8 uses: each IDCT function clears its coefficient block once it is done with it, which avoids the overhead of a separate clear_block function call (plus the dependency on dsputil for this particular function). ARM asm updated by Janne Grunau <janne-libav@jannau.net>. Signed-off-by: Janne Grunau <janne-libav@jannau.net>
This commit is contained in:
parent
992b031838
commit
aeaf268e52
@ -108,14 +108,20 @@ endfunc
|
||||
|
||||
function vp3_idct_start_neon
|
||||
vpush {d8-d15}
|
||||
vmov.i16 q4, #0
|
||||
vmov.i16 q5, #0
|
||||
movrel r3, vp3_idct_constants
|
||||
vld1.64 {d0-d1}, [r3,:128]
|
||||
vld1.64 {d16-d19}, [r2,:128]!
|
||||
vld1.64 {d20-d23}, [r2,:128]!
|
||||
vld1.64 {d24-d27}, [r2,:128]!
|
||||
vld1.64 {d16-d19}, [r2,:128]
|
||||
vst1.64 {q4-q5}, [r2,:128]!
|
||||
vld1.64 {d20-d23}, [r2,:128]
|
||||
vst1.64 {q4-q5}, [r2,:128]!
|
||||
vld1.64 {d24-d27}, [r2,:128]
|
||||
vst1.64 {q4-q5}, [r2,:128]!
|
||||
vadd.s16 q1, q8, q12
|
||||
vsub.s16 q8, q8, q12
|
||||
vld1.64 {d28-d31}, [r2,:128]!
|
||||
vld1.64 {d28-d31}, [r2,:128]
|
||||
vst1.64 {q4-q5}, [r2,:128]!
|
||||
|
||||
vp3_idct_core_neon:
|
||||
vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16
|
||||
@ -345,10 +351,12 @@ function ff_vp3_idct_add_neon, export=1
|
||||
endfunc
|
||||
|
||||
function ff_vp3_idct_dc_add_neon, export=1
|
||||
ldrsh r2, [r2]
|
||||
ldrsh r12, [r2]
|
||||
mov r3, r0
|
||||
add r2, r2, #15
|
||||
vdup.16 q15, r2
|
||||
add r12, r12, #15
|
||||
vdup.16 q15, r12
|
||||
mov r12, 0
|
||||
strh r12, [r2]
|
||||
vshr.s16 q15, q15, #5
|
||||
|
||||
vld1.8 {d0}, [r0,:64], r1
|
||||
|
@ -140,6 +140,7 @@ static void vp3_idct_put_altivec(uint8_t *dst, int stride, DCTELEM block[64])
|
||||
PUT(b5) dst += stride;
|
||||
PUT(b6) dst += stride;
|
||||
PUT(b7)
|
||||
memset(block, 0, sizeof(*block) * 64);
|
||||
}
|
||||
|
||||
static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64])
|
||||
@ -171,6 +172,7 @@ static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64])
|
||||
ADD(b5) dst += stride;
|
||||
ADD(b6) dst += stride;
|
||||
ADD(b7)
|
||||
memset(block, 0, sizeof(*block) * 64);
|
||||
}
|
||||
|
||||
#endif /* HAVE_ALTIVEC */
|
||||
|
@ -138,6 +138,7 @@ typedef struct Vp3DecodeContext {
|
||||
DSPContext dsp;
|
||||
VideoDSPContext vdsp;
|
||||
VP3DSPContext vp3dsp;
|
||||
DECLARE_ALIGNED(16, DCTELEM, block)[64];
|
||||
int flipped_image;
|
||||
int last_slice_end;
|
||||
int skip_loop_filter;
|
||||
@ -1458,7 +1459,7 @@ static void await_reference_row(Vp3DecodeContext *s, Vp3Fragment *fragment, int
|
||||
static void render_slice(Vp3DecodeContext *s, int slice)
|
||||
{
|
||||
int x, y, i, j, fragment;
|
||||
LOCAL_ALIGNED_16(DCTELEM, block, [64]);
|
||||
DCTELEM *block = s->block;
|
||||
int motion_x = 0xdeadbeef, motion_y = 0xdeadbeef;
|
||||
int motion_halfpel_index;
|
||||
uint8_t *motion_source;
|
||||
@ -1571,8 +1572,6 @@ static void render_slice(Vp3DecodeContext *s, int slice)
|
||||
}
|
||||
}
|
||||
|
||||
s->dsp.clear_block(block);
|
||||
|
||||
/* invert DCT and place (or add) in final output */
|
||||
|
||||
if (s->all_fragments[i].coding_method == MODE_INTRA) {
|
||||
|
@ -215,14 +215,16 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
|
||||
|
||||
/**
 * C version of the VP3 inverse DCT "put" entry point.
 *
 * Forwards dest, line_size and block to the shared idct() helper with
 * mode argument 1 (presumably the variant that stores the result rather
 * than adding it to dest -- confirm against idct()), then zeroes the
 * coefficient block before returning.
 *
 * @param dest      output pixels (annotated as 8-byte aligned)
 * @param line_size passed to idct() as its stride argument
 * @param block     64 coefficients (annotated as 16-byte aligned);
 *                  all zero when this function returns
 */
static void vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/){
|
||||
idct(dest, line_size, block, 1);
|
||||
/* Clear all 64 coefficients here so the block is handed back zeroed. */
memset(block, 0, sizeof(*block) * 64);
|
||||
}
|
||||
|
||||
/**
 * C version of the VP3 inverse DCT "add" entry point.
 *
 * Forwards dest, line_size and block to the shared idct() helper with
 * mode argument 2 (presumably the variant that adds the result to the
 * pixels already in dest -- confirm against idct()), then zeroes the
 * coefficient block before returning.
 *
 * @param dest      pixels to update (annotated as 8-byte aligned)
 * @param line_size passed to idct() as its stride argument
 * @param block     64 coefficients (annotated as 16-byte aligned);
 *                  all zero when this function returns
 */
static void vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/){
|
||||
idct(dest, line_size, block, 2);
|
||||
/* Clear all 64 coefficients here so the block is handed back zeroed. */
memset(block, 0, sizeof(*block) * 64);
|
||||
}
|
||||
|
||||
static void vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size,
|
||||
const DCTELEM *block/*align 16*/){
|
||||
DCTELEM *block/*align 16*/){
|
||||
int i, dc = (block[0] + 15) >> 5;
|
||||
|
||||
for(i = 0; i < 8; i++){
|
||||
@ -236,6 +238,7 @@ static void vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size,
|
||||
dest[7] = av_clip_uint8(dest[7] + dc);
|
||||
dest += line_size;
|
||||
}
|
||||
block[0] = 0;
|
||||
}
|
||||
|
||||
static void vp3_v_loop_filter_c(uint8_t *first_pixel, int stride,
|
||||
|
@ -25,7 +25,7 @@
|
||||
typedef struct VP3DSPContext {
|
||||
void (*idct_put)(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
void (*idct_add)(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
void (*idct_dc_add)(uint8_t *dest, int line_size, const DCTELEM *block);
|
||||
void (*idct_dc_add)(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
void (*v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
|
||||
void (*h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
|
||||
|
||||
|
@ -561,6 +561,13 @@ cglobal vp3_idct_put, 3, 4, 9
|
||||
movhps [r0+r3 ], m3
|
||||
%endif
|
||||
%assign %%i %%i+64
|
||||
%endrep
|
||||
|
||||
pxor m0, m0
|
||||
%assign %%offset 0
|
||||
%rep 128/mmsize
|
||||
mova [r2+%%offset], m0
|
||||
%assign %%offset %%offset+mmsize
|
||||
%endrep
|
||||
RET
|
||||
|
||||
@ -600,6 +607,11 @@ cglobal vp3_idct_add, 3, 4, 9
|
||||
movhps [r0+r1], m0
|
||||
%endif
|
||||
lea r0, [r0+r1*2]
|
||||
%assign %%offset 0
|
||||
%rep 32/mmsize
|
||||
mova [r2+%%offset], m4
|
||||
%assign %%offset %%offset+mmsize
|
||||
%endrep
|
||||
add r2, 32
|
||||
dec r3
|
||||
jg .loop
|
||||
@ -620,7 +632,7 @@ vp3_idct_funcs
|
||||
paddusb m2, m0
|
||||
movq m4, [r0+r1*2]
|
||||
paddusb m3, m0
|
||||
movq m5, [r0+r3 ]
|
||||
movq m5, [r0+r2 ]
|
||||
paddusb m4, m0
|
||||
paddusb m5, m0
|
||||
psubusb m2, m1
|
||||
@ -630,7 +642,7 @@ vp3_idct_funcs
|
||||
movq [r0+r1 ], m3
|
||||
psubusb m5, m1
|
||||
movq [r0+r1*2], m4
|
||||
movq [r0+r3 ], m5
|
||||
movq [r0+r2 ], m5
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
@ -638,11 +650,12 @@ cglobal vp3_idct_dc_add, 3, 4
|
||||
%if ARCH_X86_64
|
||||
movsxd r1, r1d
|
||||
%endif
|
||||
lea r3, [r1*3]
|
||||
movsx r2, word [r2]
|
||||
add r2, 15
|
||||
sar r2, 5
|
||||
movd m0, r2d
|
||||
movsx r3, word [r2]
|
||||
mov word [r2], 0
|
||||
lea r2, [r1*3]
|
||||
add r3, 15
|
||||
sar r3, 5
|
||||
movd m0, r3d
|
||||
pshufw m0, m0, 0x0
|
||||
pxor m1, m1
|
||||
psubw m1, m0
|
||||
|
@ -32,7 +32,7 @@ void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
|
||||
|
||||
void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size,
|
||||
const DCTELEM *block);
|
||||
DCTELEM *block);
|
||||
|
||||
void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
|
||||
int *bounding_values);
|
||||
|
Loading…
x
Reference in New Issue
Block a user