You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-08-04 22:03:09 +02:00
swscale/aarch64/rgb2rgb_neon: Implemented {yuyv, uyvy}toyuv{420, 422}
A78: uyvytoyuv420_neon: 6112.5 ( 6.96x) uyvytoyuv422_neon: 6696.0 ( 6.32x) yuyvtoyuv420_neon: 6113.0 ( 6.95x) yuyvtoyuv422_neon: 6695.2 ( 6.31x) A72: uyvytoyuv420_neon: 9512.1 ( 6.09x) uyvytoyuv422_neon: 9766.8 ( 6.32x) yuyvtoyuv420_neon: 9639.1 ( 6.00x) yuyvtoyuv422_neon: 9779.0 ( 6.03x) A53: uyvytoyuv420_neon: 12720.1 ( 9.10x) uyvytoyuv422_neon: 14282.9 ( 6.71x) yuyvtoyuv420_neon: 12637.4 ( 9.15x) yuyvtoyuv422_neon: 14127.6 ( 6.77x) Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
committed by
Martin Storsjö
parent
64107e22f5
commit
b92577405b
@ -67,6 +67,18 @@ void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size);
|
||||
void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size);
|
||||
void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size);
|
||||
|
||||
void ff_uyvytoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
|
||||
const uint8_t *src, int width, int height,
|
||||
int lumStride, int chromStride, int srcStride);
|
||||
void ff_uyvytoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
|
||||
const uint8_t *src, int width, int height,
|
||||
int lumStride, int chromStride, int srcStride);
|
||||
void ff_yuyvtoyuv420_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
|
||||
const uint8_t *src, int width, int height,
|
||||
int lumStride, int chromStride, int srcStride);
|
||||
void ff_yuyvtoyuv422_neon(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
|
||||
const uint8_t *src, int width, int height,
|
||||
int lumStride, int chromStride, int srcStride);
|
||||
av_cold void rgb2rgb_init_aarch64(void)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
@ -84,5 +96,9 @@ av_cold void rgb2rgb_init_aarch64(void)
|
||||
shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon;
|
||||
shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon;
|
||||
shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon;
|
||||
uyvytoyuv422 = ff_uyvytoyuv422_neon;
|
||||
uyvytoyuv420 = ff_uyvytoyuv420_neon;
|
||||
yuyvtoyuv422 = ff_yuyvtoyuv422_neon;
|
||||
yuyvtoyuv420 = ff_yuyvtoyuv420_neon;
|
||||
}
|
||||
}
|
||||
|
@ -425,3 +425,265 @@ neon_shuf 2013
|
||||
neon_shuf 1203
|
||||
neon_shuf 2130
|
||||
neon_shuf 3210
|
||||
|
||||
/*
|
||||
v0-v7 - two consecutive lines
|
||||
x0 - upper Y destination
|
||||
x1 - U destination
|
||||
x2 - V destination
|
||||
x3 - upper src line
|
||||
w5 - height/iteration counter - count of line pairs for yuv420, of single lines for yuv422
|
||||
x6 - lum padding
|
||||
x7 - chrom padding
|
||||
x8 - src padding
|
||||
w9 - number of bytes remaining in the tail
|
||||
x10 - lower Y destination
|
||||
w12 - tmp
|
||||
x13 - lower src line
|
||||
w14 - tmp
|
||||
w17 - set to 1 if last line has to be handled separately (odd height)
|
||||
*/
|
||||
|
||||
// Convert one 64-byte chunk of packed 4:2:2 input (16 four-byte tuples, i.e.
// 32 pixels) read from [x3] into planar output.
//   src_fmt             - uyvy or yuyv, selects which of the de-interleaved
//                         vectors hold luma and which hold chroma
//   dst_fmt             - for yuv420 the matching chunk of the lower line is
//                         loaded from [x13], chroma of both lines is averaged
//                         (uhadd, truncating) and the lower luma goes to [x10]
//   is_line_tail        - 0: w14 is decremented by 32 with flags set, so the
//                         caller can loop with b.ne right after the macro
//                         1: w14 and the condition flags are left untouched
//   skip_storing_chroma - 1: only the luma of the line at [x3] is written to
//                         [x0]; x1, x2, x10 and x13 are not accessed
// Every pointer that is used is post-incremented.
// Clobbers v0-v3, and v4-v7 when the lower line is loaded.
.macro fastpath_iteration src_fmt, dst_fmt, is_line_tail, skip_storing_chroma
        ld4             {v0.16b - v3.16b}, [x3], #64     // de-interleave the upper line
.if ! \is_line_tail
        subs            w14, w14, #32                    // 32 pixels consumed
.endif

.if ! \skip_storing_chroma
.ifc \dst_fmt, yuv420
        ld4             {v4.16b - v7.16b}, [x13], #64    // de-interleave the lower line
.ifc \src_fmt, uyvy
        uhadd           v0.16b, v0.16b, v4.16b           // U = (U_upper + U_lower) >> 1
        uhadd           v2.16b, v2.16b, v6.16b           // V = (V_upper + V_lower) >> 1
.else
        uhadd           v1.16b, v1.16b, v5.16b           // U = (U_upper + U_lower) >> 1
        uhadd           v3.16b, v3.16b, v7.16b           // V = (V_upper + V_lower) >> 1
.endif
.endif

.ifc \src_fmt, uyvy                                      // chroma planes
        st1             {v0.16b}, [x1], #16
        st1             {v2.16b}, [x2], #16
.else
        st1             {v1.16b}, [x1], #16
        st1             {v3.16b}, [x2], #16
.endif

.ifc \dst_fmt, yuv420                                    // lower luma line
.ifc \src_fmt, uyvy
        mov             v6.16b, v5.16b                   // st2 needs consecutive registers
        st2             {v6.16b,v7.16b}, [x10], #32
.else
        mov             v5.16b, v4.16b                   // st2 needs consecutive registers
        st2             {v5.16b,v6.16b}, [x10], #32
.endif
.endif
.endif // ! \skip_storing_chroma

.ifc \src_fmt, uyvy                                      // upper luma line
        mov             v2.16b, v1.16b                   // st2 needs consecutive registers
        st2             {v2.16b,v3.16b}, [x0], #32
.else
        mov             v1.16b, v0.16b                   // st2 needs consecutive registers
        st2             {v1.16b,v2.16b}, [x0], #32
.endif
.endm
|
||||
|
||||
// Reposition the pointers so that one more full-size fast path iteration ends
// exactly at the end of the line: each pointer is advanced by the tail length
// (w9 pixels) and then rewound by one full chunk (64 source bytes, 32 luma
// bytes, 16 bytes per chroma plane).
//   is_final_odd_line - 1: only x3 and x0 are adjusted (luma-only last line)
//                       0: x1 and x2 are adjusted as well, plus x13 and x10
//                          for yuv420; w14 is overwritten with w9 >> 1
// src_fmt is not used. The condition flags are not modified.
.macro fastpath_shift_back_pointers src_fmt, dst_fmt, is_final_odd_line
        sub             x3, x3, #64                      // upper src line, 2 bytes per pixel
        add             x3, x3, w9, sxtw #1
        sub             x0, x0, #32                      // upper luma line, 1 byte per pixel
        add             x0, x0, w9, sxtw
.if ! \is_final_odd_line
.ifc \dst_fmt, yuv420
        sub             x13, x13, #64                    // lower src line
        add             x13, x13, w9, sxtw #1
        sub             x10, x10, #32                    // lower luma line
        add             x10, x10, w9, sxtw
.endif
        asr             w14, w9, #1                      // chroma bytes in the tail
        sub             x1, x1, #16
        add             x1, x1, w14, sxtw
        sub             x2, x2, #16
        add             x2, x2, w14, sxtw
.endif
.endm
|
||||
|
||||
// Scalar conversion of two pixels (one four-byte tuple) per invocation.
//   src_fmt             - uyvy or yuyv byte order of the tuple at [x3]
//   dst_fmt             - yuv422: chroma is copied as is
//                         yuv420: the tuple of the lower line is read from
//                         [x13], its luma goes to [x10] and chroma of both
//                         lines is averaged as (a + b) >> 1
//   skip_storing_chroma - only evaluated for yuv420; when 1 just two luma
//                         bytes are copied from [x3] (stepping by 2) to [x0].
//                         The caller has to point x3 at the first luma byte.
// Pointers that are used are post-incremented. Clobbers w12 and w14; the
// condition flags are not modified.
.macro slowpath_iteration src_fmt, dst_fmt, skip_storing_chroma
.ifc \dst_fmt, yuv422
.ifc \src_fmt, uyvy
        ldrb            w12, [x3], #1                    // U
        ldrb            w14, [x3], #1                    // Y0
        strb            w12, [x1], #1
        strb            w14, [x0], #1
        ldrb            w12, [x3], #1                    // V
        ldrb            w14, [x3], #1                    // Y1
        strb            w12, [x2], #1
        strb            w14, [x0], #1
.else
        ldrb            w12, [x3], #1                    // Y0
        ldrb            w14, [x3], #1                    // U
        strb            w12, [x0], #1
        strb            w14, [x1], #1
        ldrb            w12, [x3], #1                    // Y1
        ldrb            w14, [x3], #1                    // V
        strb            w12, [x0], #1
        strb            w14, [x2], #1
.endif
.endif

.ifc \dst_fmt, yuv420
.if \skip_storing_chroma
        // luma-only line: the sequence is the same for both source formats
        ldrb            w12, [x3], #2                    // Y0
        ldrb            w14, [x3], #2                    // Y1
        strb            w12, [x0], #1
        strb            w14, [x0], #1
.else
.ifc \src_fmt, uyvy
        ldrb            w12, [x3], #1                    // U, upper line
        ldrb            w14, [x13], #1                   // U, lower line
        add             w12, w14, w12
        lsr             w12, w12, #1
        strb            w12, [x1], #1
        ldrb            w14, [x3], #1                    // Y0, upper line
        ldrb            w12, [x13], #1                   // Y0, lower line
        strb            w14, [x0], #1
        strb            w12, [x10], #1
        ldrb            w12, [x3], #1                    // V, upper line
        ldrb            w14, [x13], #1                   // V, lower line
        add             w12, w14, w12
        lsr             w12, w12, #1
        strb            w12, [x2], #1
        ldrb            w14, [x3], #1                    // Y1, upper line
        ldrb            w12, [x13], #1                   // Y1, lower line
        strb            w14, [x0], #1
        strb            w12, [x10], #1
.else
        ldrb            w12, [x3], #1                    // Y0, upper line
        ldrb            w14, [x13], #1                   // Y0, lower line
        strb            w12, [x0], #1
        strb            w14, [x10], #1
        ldrb            w12, [x3], #1                    // U, upper line
        ldrb            w14, [x13], #1                   // U, lower line
        add             w12, w14, w12
        lsr             w12, w12, #1
        strb            w12, [x1], #1
        ldrb            w14, [x3], #1                    // Y1, upper line
        ldrb            w12, [x13], #1                   // Y1, lower line
        strb            w14, [x0], #1
        strb            w12, [x10], #1
        ldrb            w12, [x3], #1                    // V, upper line
        ldrb            w14, [x13], #1                   // V, lower line
        add             w12, w14, w12
        lsr             w12, w12, #1
        strb            w12, [x2], #1
.endif
.endif
.endif
.endm
|
||||
|
||||
// Advance every plane pointer by its per-line padding once a line (or a line
// pair for yuv420) has been written:
//   x6 is added to the luma pointers, x7 to both chroma pointers and x8 to
//   the source pointers.
// src_fmt and is_final_odd_line are accepted but not used. Only plain adds
// are emitted, so condition flags set before the macro are still valid after
// it.
.macro move_pointers_to_next_line src_fmt, dst_fmt, is_final_odd_line
        add             x0, x0, x6                       // upper luma
        add             x1, x1, x7                       // U
        add             x2, x2, x7                       // V
        add             x3, x3, x8                       // upper src
.ifc \dst_fmt, yuv420
        add             x10, x10, x6                     // lower luma
        add             x13, x13, x8                     // lower src
.endif
.endm
|
||||
|
||||
// Emits ff_<src_fmt>to<dst_fmt>_neon, which splits packed 4:2:2 input into
// separate Y, U and V planes.
//
// Arguments on entry (order taken from the C prototypes):
//   x0 ydst, x1 udst, x2 vdst, x3 src, w4 width, w5 height,
//   w6 lumStride, w7 chromStride, [sp] srcStride
//
// For yuv420 two source lines are consumed per iteration and their chroma is
// averaged; a single trailing line (odd height) contributes luma only.
//
// Width handling: lines of at least 32 pixels use the NEON fast path, whose
// final iteration is shifted back so that it ends exactly at the end of the
// line. Narrower lines use the scalar slow path, which counts the width down
// by 2 until it reaches exactly zero, so the width must be even and non-zero
// there.
//
// Height handling: both per-line loops test their counter only after the loop
// body has run, so the case of zero iterations (height 0, or height 1 for
// yuv420) is branched around explicitly before entering them.
.macro interleaved_yuv_to_planar src_fmt, dst_fmt
function ff_\src_fmt\()to\dst_fmt\()_neon, export=1
        sxtw            x6, w6
        sxtw            x7, w7
        ldrsw           x8, [sp]
        ands            w11, w4, #~31                    // choose between fast and slow path

.ifc \dst_fmt, yuv420
        add             x10, x0, x6                      // lower Y destination
        add             x13, x3, x8                      // lower src line
        add             x8, x8, x8                       // two src lines per iteration
        add             x6, x6, x6                       // two luma lines per iteration
        and             w17, w5, #1                      // 1 if a single line is left over
        asr             w5, w5, #1                       // number of line pairs
.endif
        asr             w9, w4, #1
        sub             x8, x8, w4, sxtw #1              // src offset
        sub             x6, x6, w4, sxtw                 // lum offset
        sub             x7, x7, x9                       // chr offset

        b.eq            6f                               // flags still come from the ands above

        // fast path - the width is at least 32
        and             w9, w4, #31                      // w9 holds the remaining width, 0 to 31
.ifc \dst_fmt, yuv420
        cbz             w5, 9f                           // no complete line pair
.else
        cbz             w5, 3f                           // no lines at all
.endif
1:
        and             w14, w4, #~31                    // w14 is the main loop counter
2:
        fastpath_iteration \src_fmt, \dst_fmt, 0, 0
        b.ne            2b
        fastpath_shift_back_pointers \src_fmt, \dst_fmt, 0
        fastpath_iteration \src_fmt, \dst_fmt, 1, 0
        subs            w5, w5, #1
        move_pointers_to_next_line \src_fmt, \dst_fmt    // keeps the flags of the subs
        b.ne            1b

.ifc \dst_fmt, yuv420                                    // handle the last line in case the height is odd
9:
        cbz             w17, 3f
        and             w14, w4, #~31
4:
        fastpath_iteration \src_fmt, \dst_fmt, 0, 1
        b.ne            4b
        fastpath_shift_back_pointers \src_fmt, \dst_fmt, 1
        fastpath_iteration \src_fmt, \dst_fmt, 1, 1
.endif
3:
        ret

6:      // slow path - width is at most 31
.ifc \dst_fmt, yuv420
        cbz             w5, 10f                          // no complete line pair
.else
        cbz             w5, 8f                           // no lines at all
.endif
11:
        and             w9, w4, #31
7:
        subs            w9, w9, #2
        slowpath_iteration \src_fmt, \dst_fmt, 0         // keeps the flags of the subs
        b.ne            7b
        subs            w5, w5, #1
        move_pointers_to_next_line \src_fmt, \dst_fmt    // keeps the flags of the subs
        b.ne            11b

.ifc \dst_fmt, yuv420                                    // handle the last line in case the height is odd
10:
        cbz             w17, 8f
        and             w9, w4, #31
.ifc \src_fmt, uyvy
        add             x3, x3, #1                       // luma is at the odd byte offsets
.endif
5:
        subs            w9, w9, #2
        slowpath_iteration \src_fmt, \dst_fmt, 1
        b.ne            5b
.endif
8:
        ret
endfunc
.endm
|
||||
|
||||
interleaved_yuv_to_planar uyvy, yuv422
|
||||
interleaved_yuv_to_planar uyvy, yuv420
|
||||
interleaved_yuv_to_planar yuyv, yuv422
|
||||
interleaved_yuv_to_planar yuyv, yuv420
|
||||
|
Reference in New Issue
Block a user