1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-08-04 22:03:09 +02:00

swscale/aarch64/rgb2rgb: Implemented NEON shuf routines

The key idea is to pass the pre-generated tables to the TBL instruction
and churn through the data 16 bytes at a time. A remaining 8-byte group is
shuffled with a 64-bit TBL, and the final 4 bytes are handled with a
specialized block located at the end of the routine.

The 3210 variant can also be implemented using rev32. Surprisingly, that is
slower than the generic TBL on A78 and x13s, but much faster on A72.

There may be some room for improvement. Possibly, instead of handling the
last 8 and then the last 4 bytes separately, we could load these 4 bytes
into {v0.s}[2] and process them along with the last 8 bytes.

Speeds measured with checkasm --test=sw_rgb --bench --runs=10 | grep shuf

- A78
shuffle_bytes_0321_c:                                   75.5 ( 1.00x)
shuffle_bytes_0321_neon:                                26.5 ( 2.85x)
shuffle_bytes_1203_c:                                  136.2 ( 1.00x)
shuffle_bytes_1203_neon:                                27.2 ( 5.00x)
shuffle_bytes_1230_c:                                  135.5 ( 1.00x)
shuffle_bytes_1230_neon:                                28.0 ( 4.84x)
shuffle_bytes_2013_c:                                  138.8 ( 1.00x)
shuffle_bytes_2013_neon:                                22.0 ( 6.31x)
shuffle_bytes_2103_c:                                   76.5 ( 1.00x)
shuffle_bytes_2103_neon:                                20.5 ( 3.73x)
shuffle_bytes_2130_c:                                  137.5 ( 1.00x)
shuffle_bytes_2130_neon:                                28.0 ( 4.91x)
shuffle_bytes_3012_c:                                  138.2 ( 1.00x)
shuffle_bytes_3012_neon:                                21.5 ( 6.43x)
shuffle_bytes_3102_c:                                  138.2 ( 1.00x)
shuffle_bytes_3102_neon:                                27.2 ( 5.07x)
shuffle_bytes_3210_c:                                  138.0 ( 1.00x)
shuffle_bytes_3210_neon:                                22.0 ( 6.27x)

shuf3210 using rev32
shuffle_bytes_3210_c:                                  139.0 ( 1.00x)
shuffle_bytes_3210_neon:                                28.5 ( 4.88x)

- A72
shuffle_bytes_0321_c:                                  120.0 ( 1.00x)
shuffle_bytes_0321_neon:                                36.0 ( 3.33x)
shuffle_bytes_1203_c:                                  188.2 ( 1.00x)
shuffle_bytes_1203_neon:                                37.8 ( 4.99x)
shuffle_bytes_1230_c:                                  195.0 ( 1.00x)
shuffle_bytes_1230_neon:                                36.0 ( 5.42x)
shuffle_bytes_2013_c:                                  195.8 ( 1.00x)
shuffle_bytes_2013_neon:                                43.5 ( 4.50x)
shuffle_bytes_2103_c:                                  117.2 ( 1.00x)
shuffle_bytes_2103_neon:                                53.5 ( 2.19x)
shuffle_bytes_2130_c:                                  203.2 ( 1.00x)
shuffle_bytes_2130_neon:                                37.8 ( 5.38x)
shuffle_bytes_3012_c:                                  183.8 ( 1.00x)
shuffle_bytes_3012_neon:                                46.8 ( 3.93x)
shuffle_bytes_3102_c:                                  180.8 ( 1.00x)
shuffle_bytes_3102_neon:                                37.8 ( 4.79x)
shuffle_bytes_3210_c:                                  195.8 ( 1.00x)
shuffle_bytes_3210_neon:                                37.8 ( 5.19x)

shuf3210 using rev32
shuffle_bytes_3210_c:                                  194.8 ( 1.00x)
shuffle_bytes_3210_neon:                                30.8 ( 6.33x)

- x13s:
shuffle_bytes_0321_c:                                   49.4 ( 1.00x)
shuffle_bytes_0321_neon:                                18.1 ( 2.72x)
shuffle_bytes_1203_c:                                   98.4 ( 1.00x)
shuffle_bytes_1203_neon:                                18.4 ( 5.35x)
shuffle_bytes_1230_c:                                   97.4 ( 1.00x)
shuffle_bytes_1230_neon:                                19.1 ( 5.09x)
shuffle_bytes_2013_c:                                  101.4 ( 1.00x)
shuffle_bytes_2013_neon:                                16.9 ( 6.01x)
shuffle_bytes_2103_c:                                   53.9 ( 1.00x)
shuffle_bytes_2103_neon:                                13.9 ( 3.88x)
shuffle_bytes_2130_c:                                  100.9 ( 1.00x)
shuffle_bytes_2130_neon:                                19.1 ( 5.27x)
shuffle_bytes_3012_c:                                   97.4 ( 1.00x)
shuffle_bytes_3012_neon:                                17.1 ( 5.69x)
shuffle_bytes_3102_c:                                  100.9 ( 1.00x)
shuffle_bytes_3102_neon:                                19.1 ( 5.27x)
shuffle_bytes_3210_c:                                  100.6 ( 1.00x)
shuffle_bytes_3210_neon:                                16.9 ( 5.96x)

shuf3210 using rev32
shuffle_bytes_3210_c:                                  100.6 ( 1.00x)
shuffle_bytes_3210_neon:                                18.6 ( 5.40x)

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Krzysztof Pyrkosz
2025-01-28 19:01:33 +01:00
committed by Martin Storsjö
parent e25a19fc7c
commit c85a748979
3 changed files with 161 additions and 0 deletions

View File

@ -57,6 +57,15 @@ void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
void ff_deinterleave_bytes_neon(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
int width, int height, int srcStride,
int dst1Stride, int dst2Stride);
/*
 * NEON byte-shuffle routines, implemented in assembly.
 *
 * Each routine reads src_size bytes from src in groups of four and writes
 * the permuted groups to dst. The four digits in the name give, for each
 * output byte of a group, the index of the input byte copied there, e.g.
 * 0321: dst[0] = src[0], dst[1] = src[3], dst[2] = src[2], dst[3] = src[1].
 *
 * The assembly only looks at bits 2 and above of src_size, so a trailing
 * partial group (src_size % 4 bytes) is not processed.
 */
void ff_shuffle_bytes_0321_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_2103_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1230_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3012_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3210_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3102_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_2013_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_2130_neon(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1203_neon(const uint8_t *src, uint8_t *dst, int src_size);
av_cold void rgb2rgb_init_aarch64(void)
{
@ -66,5 +75,14 @@ av_cold void rgb2rgb_init_aarch64(void)
ff_rgb24toyv12 = rgb24toyv12;
interleaveBytes = ff_interleave_bytes_neon;
deinterleaveBytes = ff_deinterleave_bytes_neon;
shuffle_bytes_0321 = ff_shuffle_bytes_0321_neon;
shuffle_bytes_1230 = ff_shuffle_bytes_1230_neon;
shuffle_bytes_2103 = ff_shuffle_bytes_2103_neon;
shuffle_bytes_3012 = ff_shuffle_bytes_3012_neon;
shuffle_bytes_3210 = ff_shuffle_bytes_3210_neon;
shuffle_bytes_3102 = ff_shuffle_bytes_3102_neon;
shuffle_bytes_2013 = ff_shuffle_bytes_2013_neon;
shuffle_bytes_2130 = ff_shuffle_bytes_2130_neon;
shuffle_bytes_1203 = ff_shuffle_bytes_1203_neon;
}
}

View File

@ -34,6 +34,69 @@
#define Y_OFFSET v22
#define UV_OFFSET v23
// TBL index vectors for the byte-shuffle routines, one per permutation.
// Entry i holds the source byte index that lands in destination byte i,
// so every table is the 4-byte pattern from its name repeated with
// offsets 0, 4, 8 and 12.
// First row: indices 0-7, the only ones read by the 64-bit TBL forms
// that take the table as a .8b operand. Second row: indices 8-15,
// additionally used by the 128-bit main loop.
const shuf_0321_tbl, align=4
        .byte            0,  3,  2,  1,    4,  7,  6,  5
        .byte            8, 11, 10,  9,   12, 15, 14, 13
endconst

const shuf_1230_tbl, align=4
        .byte            1,  2,  3,  0,    5,  6,  7,  4
        .byte            9, 10, 11,  8,   13, 14, 15, 12
endconst

const shuf_2103_tbl, align=4
        .byte            2,  1,  0,  3,    6,  5,  4,  7
        .byte           10,  9,  8, 11,   14, 13, 12, 15
endconst

const shuf_3012_tbl, align=4
        .byte            3,  0,  1,  2,    7,  4,  5,  6
        .byte           11,  8,  9, 10,   15, 12, 13, 14
endconst

const shuf_3210_tbl, align=4
        .byte            3,  2,  1,  0,    7,  6,  5,  4
        .byte           11, 10,  9,  8,   15, 14, 13, 12
endconst

const shuf_3102_tbl, align=4
        .byte            3,  1,  0,  2,    7,  5,  4,  6
        .byte           11,  9,  8, 10,   15, 13, 12, 14
endconst

const shuf_2013_tbl, align=4
        .byte            2,  0,  1,  3,    6,  4,  5,  7
        .byte           10,  8,  9, 11,   14, 12, 13, 15
endconst

const shuf_1203_tbl, align=4
        .byte            1,  2,  0,  3,    5,  6,  4,  7
        .byte            9, 10,  8, 11,   13, 14, 12, 15
endconst

const shuf_2130_tbl, align=4
        .byte            2,  1,  3,  0,    6,  5,  7,  4
        .byte           10,  9, 11,  8,   14, 13, 15, 12
endconst
// convert rgb to 16-bit y, u, or v
// uses v3 and v4
.macro rgbconv16 dst, b, g, r, bc, gc, rc
@ -296,3 +359,71 @@ function ff_deinterleave_bytes_neon, export=1
0:
ret
endfunc
// neon_shuf <shuf>
//
// Emits the exported function
//   void ff_shuffle_bytes_<shuf>_neon(const uint8_t *src, uint8_t *dst,
//                                     int src_size);
// which permutes every 4-byte group of src into dst using the index
// table shuf_<shuf>_tbl.
//
// In:    x0 = src, x1 = dst, w2 = src_size (byte count)
// Uses:  x9  table address
//        w5  bytes left for the 16-byte loop, later scalar scratch
//        w3  src_size & 8 (an 8-byte tail is present)
//        w4  src_size & 4 (a 4-byte tail is present)
//        v0  data, v1 = shuffle indices
// x0 and x1 are advanced past the 16-byte and 8-byte parts; flags are
// clobbered. Bits 0-1 of src_size are ignored.
//
// NOTE: a leading zero makes 0321 an octal literal for the assembler.
// That is harmless here: the argument and the literal it is compared
// against are spelled the same, and no other variant has the same value.
.macro neon_shuf shuf
function ff_shuffle_bytes_\shuf\()_neon, export=1
        movrel          x9, shuf_\shuf\()_tbl
        ld1             {v1.16b}, [x9]
        and             w5, w2, #~15            // bytes handled 16 at a time
        and             w3, w2, #8
        and             w4, w2, #4
        cbz             w5, 2f                  // fewer than 16 bytes: tails only

1:      // main loop, four 4-byte groups per iteration
        ld1             {v0.16b}, [x0], #16
        subs            w5, w5, #16             // flags stay live until b.gt
        tbl             v0.16b, {v0.16b}, v1.16b
        st1             {v0.16b}, [x1], #16
        b.gt            1b

2:      // optional 8-byte tail: only the low 8 table indices are needed
        cbz             w3, 3f
        ld1             {v0.8b}, [x0], #8
        tbl             v0.8b, {v0.16b}, v1.8b
        st1             {v0.8b}, [x1], #8

3:      // optional 4-byte tail
        cbz             w4, 4f
.if \shuf == 0321 || \shuf == 1230 || \shuf == 2103 || \shuf == 3012 || \shuf == 3210
        // These permutations are a byte reversal and/or a byte rotation of
        // one 32-bit word, so they are done in a general-purpose register.
        ldr             w5, [x0]
  .if \shuf == 0321 || \shuf == 2103 || \shuf == 3210
        rev             w5, w5
  .endif
  .if \shuf == 0321 || \shuf == 3012
        ror             w5, w5, #24
  .endif
  .if \shuf == 1230 || \shuf == 2103
        ror             w5, w5, #8
  .endif
        str             w5, [x1]
.endif
.if \shuf == 3102 || \shuf == 2013 || \shuf == 1203 || \shuf == 2130
        // The remaining permutations go through TBL again. Only lane
        // s[0] is loaded and stored; the other result bytes are unused.
        ld1             {v0.s}[0], [x0]
        tbl             v0.8b, {v0.16b}, v1.8b
        st1             {v0.s}[0], [x1]
.endif

4:
        ret
endfunc
.endm
// Instantiate one exported ff_shuffle_bytes_<order>_neon function per byte order.
neon_shuf 0321
neon_shuf 1230
neon_shuf 2103
neon_shuf 3012
neon_shuf 3102
neon_shuf 2013
neon_shuf 1203
neon_shuf 2130
neon_shuf 3210

View File

@ -822,6 +822,18 @@ void checkasm_check_sw_rgb(void)
check_shuffle_bytes(shuffle_bytes_3210, "shuffle_bytes_3210");
report("shuffle_bytes_3210");
check_shuffle_bytes(shuffle_bytes_3102, "shuffle_bytes_3102");
report("shuffle_bytes_3102");
check_shuffle_bytes(shuffle_bytes_2013, "shuffle_bytes_2013");
report("shuffle_bytes_2013");
check_shuffle_bytes(shuffle_bytes_1203, "shuffle_bytes_1203");
report("shuffle_bytes_1203");
check_shuffle_bytes(shuffle_bytes_2130, "shuffle_bytes_2130");
report("shuffle_bytes_2130");
check_uyvy_to_422p();
report("uyvytoyuv422");