1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-08-04 22:03:09 +02:00

swscale/aarch64: dotprod implementation of rgba32_to_Y

The idea is to split each 16-bit coefficient into its lower and upper byte,
run udot with the lower bytes, shift the accumulator right by 8, and then
accumulate a second udot with the upper bytes.

Benchmark on A78:
bgra_to_y_128_c:                                       682.0 ( 1.00x)
bgra_to_y_128_neon:                                    181.2 ( 3.76x)
bgra_to_y_128_dotprod:                                 117.8 ( 5.79x)
bgra_to_y_1080_c:                                     5742.5 ( 1.00x)
bgra_to_y_1080_neon:                                  1472.5 ( 3.90x)
bgra_to_y_1080_dotprod:                                906.5 ( 6.33x)
bgra_to_y_1920_c:                                    10194.0 ( 1.00x)
bgra_to_y_1920_neon:                                  2589.8 ( 3.94x)
bgra_to_y_1920_dotprod:                               1573.8 ( 6.48x)

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Krzysztof Pyrkosz
2025-03-03 22:00:23 +01:00
committed by Martin Storsjö
parent 081c865867
commit d765e5f043
2 changed files with 105 additions and 0 deletions

View File

@ -313,3 +313,91 @@ rgbToUV_neon bgr24, rgb24, element=3
rgbToUV_neon bgra32, rgba32, element=4
rgbToUV_neon abgr32, argb32, element=4, alpha_first=1
#if HAVE_DOTPROD
ENABLE_DOTPROD
// void ff_bgra32ToY_neon_dotprod(uint8_t *dst, const uint8_t *src, const uint8_t *,
//                                const uint8_t *, int w, uint32_t *coeffs, void *)
//
// Entry point for BGRA input. It only loads the three coefficients from
// [x5] and then continues in the shared body at label 4, which lives inside
// ff_rgba32ToY_neon_dotprod. Compared with that entry point the first and
// third coefficient are swapped (coeffs[0] -> w12, coeffs[2] -> w10), so the
// shared body multiplies pixel byte 0 by "by" and pixel byte 2 by "ry".
//
// In:  w4 = w, x5 = coeffs (x0 = dst and x1 = src are consumed by the shared body)
// Out: returns immediately, writing nothing, when w <= 0
function ff_bgra32ToY_neon_dotprod, export=1
cmp w4, #0 // check width > 0
ldp w12, w11, [x5] // w12: ry, w11: gy
ldr w10, [x5, #8] // w10: by
b.gt 4f // width > 0: uses the flags from cmp; the two loads in between do not set flags
ret // width <= 0: nothing to convert
endfunc
// void ff_rgba32ToY_neon_dotprod(uint8_t *dst, const uint8_t *src, const uint8_t *,
//                                const uint8_t *, int w, uint32_t *coeffs, void *)
//
// Converts w pixels of 4 bytes each at src into one 16-bit value per pixel
// at dst. The scalar tail computes, with c0..c2 being the first three 32-bit
// words at coeffs and const_offset = 0x80100:
//   dst[i] = (c0 * src[4i] + c1 * src[4i+1] + c2 * src[4i+2] + const_offset) >> 9
//
// The vector loop handles 16 pixels per iteration and obtains the same value
// in two udot steps: accumulate (low byte of c) * pixel on top of
// const_offset, shift right by 8, accumulate (high byte of c) * pixel, then
// narrow with a final shift right by 1.
// NOTE(review): the vector loop only uses bits 0..15 of each coefficient and
// narrows with signed saturation, whereas the scalar tail uses the full
// 32-bit coefficient and truncates on store. The two paths agree only for
// coefficients in 0..65535 -- confirm that callers guarantee this.
//
// In:    x0 = dst, x1 = src, w4 = w, x5 = coeffs (x2, x3 and x6 are not read)
// Clobb: x0, x1, w4, w6-w15, v0-v6, v16-v19, flags
//
// Label 4 is the shared body: ff_bgra32ToY_neon_dotprod branches to it with
// the contents of w10 and w12 swapped.
function ff_rgba32ToY_neon_dotprod, export=1
cmp w4, #0 // check width > 0
ldp w10, w11, [x5] // w10: ry, w11: gy
ldr w12, [x5, #8] // w12: by
b.le 3f // width <= 0: nothing to convert
4:
// Build const_offset = 0x80100 and replicate it into every 32-bit lane of v6;
// it seeds the accumulators of the vector loop and is added once per pixel
// in the scalar loop (through x9).
mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT - 7)
movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT - 1)
dup v6.4s, w9 // w9: const_offset
cmp w4, #16 // tested by b.lt below; none of the instructions in between set flags
// v0: per-pixel byte mask { lo(w10), lo(w11), lo(w12), bits 24..31 of w10 }.
// NOTE(review): the 4th mask byte multiplies the 4th pixel byte, so it is
// only neutral when bits 24..31 of w10 are zero -- presumably always true
// for these coefficients; confirm.
mov w7, w10
bfi w7, w11, 8, 8 // the bfi instructions are used to assemble
bfi w7, w12, 16, 8 // 4 byte r,g,b,0 mask to be then used by udot.
dup v0.4s, w7 // v0 holds the lower byte of each coefficient
// v1: same layout built from bits 8..15 of each coefficient. The 4th byte is
// bits 24..31 of (w10 >> 8), which lsr leaves as zero.
lsr w6, w10, #8
lsr w7, w11, #8
lsr w8, w12, #8
bfi w6, w7, 8, 8
bfi w6, w8, 16, 8
dup v1.4s, w6 // v1 holds the upper byte of each coefficient
b.lt 2f // fewer than 16 pixels: go straight to the scalar loop
1:
// Vector loop: 16 pixels (64 source bytes) in, 16 halfwords (32 bytes) out.
ld1 { v16.16b, v17.16b, v18.16b, v19.16b }, [x1], #64
sub w4, w4, #16 // width -= 16
mov v2.16b, v6.16b // each accumulator starts at const_offset
mov v3.16b, v6.16b
mov v4.16b, v6.16b
mov v5.16b, v6.16b
cmp w4, #16 // width >= 16 ?
// Step 1: per pixel, acc += sum(pixel byte * low coefficient byte).
udot v2.4s, v16.16b, v0.16b
udot v3.4s, v17.16b, v0.16b
udot v4.4s, v18.16b, v0.16b
udot v5.4s, v19.16b, v0.16b
// Step 2: drop the low 8 bits so the high-byte products line up.
ushr v2.4s, v2.4s, #8
ushr v3.4s, v3.4s, #8
ushr v4.4s, v4.4s, #8
ushr v5.4s, v5.4s, #8
// Step 3: per pixel, acc += sum(pixel byte * high coefficient byte).
udot v2.4s, v16.16b, v1.16b
udot v3.4s, v17.16b, v1.16b
udot v4.4s, v18.16b, v1.16b
udot v5.4s, v19.16b, v1.16b
// Step 4: remaining shift by 1 (8 + 1 = 9 in total) with signed saturating
// narrowing to 16 bits; the source registers v16/v17 are reused for output.
sqshrn v16.4h, v2.4s, #1
sqshrn2 v16.8h, v3.4s, #1
sqshrn v17.4h, v4.4s, #1
sqshrn2 v17.8h, v5.4s, #1
stp q16, q17, [x0], #32 // store to dst
b.ge 1b // at least 16 pixels left (flags from the cmp at the top of the loop)
cbz x4, 3f // no leftover pixels; the upper half of x4 is zero after the sub on w4
2:
// Scalar loop: one pixel per iteration, 1..15 iterations.
ldrb w13, [x1] // w13: r
ldrb w14, [x1, #1] // w14: g
ldrb w15, [x1, #2] // w15: b
smaddl x13, w13, w10, x9 // x13 = ry * r + const_offset
smaddl x13, w14, w11, x13 // x13 += gy * g
smaddl x13, w15, w12, x13 // x13 += by * b
asr w13, w13, #9 // x13 >>= 9
sub w4, w4, #1 // width--
add x1, x1, #4 // advance src by one 4-byte pixel; byte 3 is never read here
strh w13, [x0], #2 // store to dst
cbnz w4, 2b
3:
ret
endfunc
DISABLE_DOTPROD
#endif

View File

@ -210,6 +210,9 @@ void ff_##name##ToUV_neon(uint8_t *, uint8_t *, const uint8_t *, \
void ff_##name##ToUV_half_neon(uint8_t *, uint8_t *, const uint8_t *, \
const uint8_t *, const uint8_t *, int w, \
uint32_t *coeffs, void *)
/*
 * Declares the prototype of the dotprod luma converter
 * ff_<name>ToY_neon_dotprod(dst, src, unused, unused, w, coeffs, opaque).
 *
 * The expansion deliberately carries no trailing semicolon; the invocation
 * supplies it. This way NEON_INPUT_DOTPROD(x); expands to exactly one
 * declaration instead of a declaration followed by a stray empty one at
 * file scope.
 */
#define NEON_INPUT_DOTPROD(name) \
void ff_##name##ToY_neon_dotprod(uint8_t *dst, const uint8_t *src, const uint8_t *, \
                                 const uint8_t *, int w, uint32_t *coeffs, void *)
NEON_INPUT(abgr32);
NEON_INPUT(argb32);
@ -217,6 +220,8 @@ NEON_INPUT(bgr24);
NEON_INPUT(bgra32);
NEON_INPUT(rgb24);
NEON_INPUT(rgba32);
NEON_INPUT_DOTPROD(bgra32);
NEON_INPUT_DOTPROD(rgba32);
void ff_lumRangeFromJpeg8_neon(int16_t *dst, int width,
uint32_t coeff, int64_t offset);
@ -295,6 +300,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
c->chrToYV12 = ff_bgr24ToUV_neon;
break;
case AV_PIX_FMT_BGRA:
#if HAVE_DOTPROD
if (have_dotprod(cpu_flags)) {
c->lumToYV12 = ff_bgra32ToY_neon_dotprod;
}
else
#endif
c->lumToYV12 = ff_bgra32ToY_neon;
if (c->chrSrcHSubSample)
c->chrToYV12 = ff_bgra32ToUV_half_neon;
@ -309,6 +320,12 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
c->chrToYV12 = ff_rgb24ToUV_neon;
break;
case AV_PIX_FMT_RGBA:
#if HAVE_DOTPROD
if (have_dotprod(cpu_flags)) {
c->lumToYV12 = ff_rgba32ToY_neon_dotprod;
}
else
#endif
c->lumToYV12 = ff_rgba32ToY_neon;
if (c->chrSrcHSubSample)
c->chrToYV12 = ff_rgba32ToUV_half_neon;