mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-08 13:22:53 +02:00
pngdsp x86: use unaligned access
For test images manually generated to contain only up prediction, timing results:

             8380x3032   255x185
    before:  138635      1992
    after:   139232      1996

Actually jumping to the proper version depending on the alignment:

    8380x3032: 138767

A 0.5% speed improvement for gigantic images is not worth the code duplication.

Fixes ticket #4148

Signed-off-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Tested-by: Benoit Fouet <benoit.fouet@free.fr>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
242f1152bf
commit
9fa056ba75
@@ -25,9 +25,9 @@
|
||||
#include <stdint.h>
|
||||
|
||||
typedef struct PNGDSPContext {
|
||||
void (*add_bytes_l2)(uint8_t *dst /* align 16 */,
|
||||
void (*add_bytes_l2)(uint8_t *dst,
|
||||
uint8_t *src1 /* align 16 */,
|
||||
uint8_t *src2 /* align 16 */, int w);
|
||||
uint8_t *src2, int w);
|
||||
|
||||
/* this might write to dst[w] */
|
||||
void (*add_paeth_prediction)(uint8_t *dst, uint8_t *src,
|
||||
|
@@ -42,12 +42,12 @@ cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
|
||||
and waq, ~(mmsize*2-1)
|
||||
jmp .end_v
|
||||
.loop_v:
|
||||
mova m0, [src1q+iq]
|
||||
mova m1, [src1q+iq+mmsize]
|
||||
paddb m0, [src2q+iq]
|
||||
paddb m1, [src2q+iq+mmsize]
|
||||
mova [dstq+iq ], m0
|
||||
mova [dstq+iq+mmsize], m1
|
||||
movu m0, [src2q+iq]
|
||||
movu m1, [src2q+iq+mmsize]
|
||||
paddb m0, [src1q+iq]
|
||||
paddb m1, [src1q+iq+mmsize]
|
||||
movu [dstq+iq ], m0
|
||||
movu [dstq+iq+mmsize], m1
|
||||
add iq, mmsize*2
|
||||
.end_v:
|
||||
cmp iq, waq
|
||||
|
Loading…
Reference in New Issue
Block a user