mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-21 10:55:51 +02:00
Add an assembly version of put_pixels. This is currently the function that
takes the most time, and writing it in assembly allows for more efficient unaligned access and better control over memory latencies. Originally committed as revision 711 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
d13c54cdc0
commit
494e409255
@ -22,6 +22,8 @@
|
||||
|
||||
void simple_idct_axp(DCTELEM *block);
|
||||
|
||||
void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
|
||||
int line_size, int h);
|
||||
void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
|
||||
int line_size);
|
||||
void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
|
||||
@ -232,12 +234,12 @@ static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
|
||||
|
||||
void dsputil_init_alpha(void)
|
||||
{
|
||||
put_pixels_tab[0] = put_pixels_axp;
|
||||
put_pixels_tab[0] = put_pixels_axp_asm;
|
||||
put_pixels_tab[1] = put_pixels_x2_axp;
|
||||
put_pixels_tab[2] = put_pixels_y2_axp;
|
||||
put_pixels_tab[3] = put_pixels_xy2_axp;
|
||||
|
||||
put_no_rnd_pixels_tab[0] = put_pixels_axp;
|
||||
put_no_rnd_pixels_tab[0] = put_pixels_axp_asm;
|
||||
put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
|
||||
put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
|
||||
put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
|
||||
|
@ -43,6 +43,123 @@
|
||||
.arch pca56
|
||||
.text
|
||||
|
||||
/************************************************************************
 * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
 *                         int line_size, int h)
 *
 * Copies h rows of 8 bytes each from pixels to block. Both pointers
 * advance by line_size bytes per row.
 *
 * In:    a0 = block      destination; every row is written with a
 *                        single stq (aligned quadword store)
 *        a1 = pixels     source; any byte alignment
 *        a2 = line_size  row stride in bytes, shared by source and
 *                        destination
 *        a3 = h          row count
 * Out:   nothing
 * Clobb: t0-t9, ta, a0, a1, a3 (plus AT when HAVE_GPROF is defined)
 * Stack: none (leaf, frame size 0)
 *
 * Preconditions:
 *   - h is a positive multiple of 4. Both loops copy four rows per
 *     iteration and exit only when the counter reaches exactly zero.
 *   - line_size is a multiple of 8. The unaligned path takes the byte
 *     shift for extql/extqh from the low bits of a1 *after* a1 has
 *     been advanced by several strides, so the misalignment must be
 *     the same on every row.
 */
        .align 6
        .globl put_pixels_axp_asm
        .ent put_pixels_axp_asm
put_pixels_axp_asm:
        .frame sp, 0, ra
        .prologue 0

#ifdef HAVE_GPROF
        lda     AT, _mcount
        jsr     AT, (AT), _mcount
#endif

        /* The extql/extqh merge below is only correct for a nonzero
           byte offset, so 8-byte aligned sources get their own loop. */
        and     a1, 7, t0
        beq     t0, $aligned

        .align 4
$unaligned:
        /* Fetch the two aligned quadwords that straddle each of the
           next four source rows: (t0,t1) (t2,t3) (t4,t5) (t6,t7).
           The nops are padding kept from the original schedule. */
        ldq_u   t0, 0(a1)
        ldq_u   t1, 8(a1)
        addq    a1, a2, a1
        nop

        ldq_u   t2, 0(a1)
        ldq_u   t3, 8(a1)
        addq    a1, a2, a1
        nop

        ldq_u   t4, 0(a1)
        ldq_u   t5, 8(a1)
        addq    a1, a2, a1
        nop

        ldq_u   t6, 0(a1)
        ldq_u   t7, 8(a1)
        extql   t0, a1, t0              /* shift count = a1 & 7 */
        addq    a1, a2, a1

        /* Merge the low/high parts of every row while computing the
           destination row addresses t8, t9, ta. */
        extqh   t1, a1, t1
        addq    a0, a2, t8              /* t8 = block + 1 row  */
        extql   t2, a1, t2
        addq    t8, a2, t9              /* t9 = block + 2 rows */

        extqh   t3, a1, t3
        addq    t9, a2, ta              /* ta = block + 3 rows */
        extql   t4, a1, t4
        or      t0, t1, t0              /* row 0 */

        extqh   t5, a1, t5
        or      t2, t3, t2              /* row 1 */
        extql   t6, a1, t6
        or      t4, t5, t4              /* row 2 */

        extqh   t7, a1, t7
        or      t6, t7, t6              /* row 3 */
        stq     t0, 0(a0)
        stq     t2, 0(t8)

        stq     t4, 0(t9)
        subq    a3, 4, a3               /* four rows done */
        stq     t6, 0(ta)
        addq    ta, a2, a0              /* block += 4 rows */

        bne     a3, $unaligned
        ret

        .align 4
$aligned:
        /* Four rows per iteration, same granularity as the unaligned
           loop, so any h that is a multiple of 4 terminates here too.
           (An 8-row unroll would overrun the block and never see the
           counter hit zero for h = 4 or h = 12.) */
        ldq     t0, 0(a1)
        addq    a1, a2, a1
        ldq     t1, 0(a1)
        addq    a1, a2, a1

        ldq     t2, 0(a1)
        addq    a1, a2, a1
        ldq     t3, 0(a1)
        addq    a1, a2, a1

        addq    a0, a2, t8              /* t8 = block + 1 row  */
        stq     t0, 0(a0)
        addq    t8, a2, t9              /* t9 = block + 2 rows */
        stq     t1, 0(t8)

        addq    t9, a2, ta              /* ta = block + 3 rows */
        stq     t2, 0(t9)
        addq    ta, a2, a0              /* block += 4 rows */
        stq     t3, 0(ta)

        subq    a3, 4, a3               /* four rows done */
        bne     a3, $aligned

        ret
        .end put_pixels_axp_asm
|
||||
|
||||
/************************************************************************
|
||||
* void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
|
||||
* int line_size)
|
||||
|
Loading…
Reference in New Issue
Block a user