mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-26 19:01:44 +02:00
aarch64/vvc: Add w_avg
w_avg_8_2x2_c: 0.0 ( 0.00x)
w_avg_8_2x2_neon: 0.0 ( 0.00x)
w_avg_8_4x4_c: 0.2 ( 1.00x)
w_avg_8_4x4_neon: 0.0 ( 0.00x)
w_avg_8_8x8_c: 1.2 ( 1.00x)
w_avg_8_8x8_neon: 0.2 ( 5.00x)
w_avg_8_16x16_c: 4.2 ( 1.00x)
w_avg_8_16x16_neon: 0.8 ( 5.67x)
w_avg_8_32x32_c: 16.2 ( 1.00x)
w_avg_8_32x32_neon: 2.5 ( 6.50x)
w_avg_8_64x64_c: 64.5 ( 1.00x)
w_avg_8_64x64_neon: 9.0 ( 7.17x)
w_avg_8_128x128_c: 269.5 ( 1.00x)
w_avg_8_128x128_neon: 35.5 ( 7.59x)
w_avg_10_2x2_c: 0.2 ( 1.00x)
w_avg_10_2x2_neon: 0.2 ( 1.00x)
w_avg_10_4x4_c: 0.2 ( 1.00x)
w_avg_10_4x4_neon: 0.2 ( 1.00x)
w_avg_10_8x8_c: 1.0 ( 1.00x)
w_avg_10_8x8_neon: 0.2 ( 4.00x)
w_avg_10_16x16_c: 4.2 ( 1.00x)
w_avg_10_16x16_neon: 0.8 ( 5.67x)
w_avg_10_32x32_c: 16.2 ( 1.00x)
w_avg_10_32x32_neon: 2.5 ( 6.50x)
w_avg_10_64x64_c: 66.2 ( 1.00x)
w_avg_10_64x64_neon: 10.0 ( 6.62x)
w_avg_10_128x128_c: 277.8 ( 1.00x)
w_avg_10_128x128_neon: 39.8 ( 6.99x)
w_avg_12_2x2_c: 0.0 ( 0.00x)
w_avg_12_2x2_neon: 0.2 ( 0.00x)
w_avg_12_4x4_c: 0.2 ( 1.00x)
w_avg_12_4x4_neon: 0.0 ( 0.00x)
w_avg_12_8x8_c: 1.2 ( 1.00x)
w_avg_12_8x8_neon: 0.5 ( 2.50x)
w_avg_12_16x16_c: 4.8 ( 1.00x)
w_avg_12_16x16_neon: 0.8 ( 6.33x)
w_avg_12_32x32_c: 17.0 ( 1.00x)
w_avg_12_32x32_neon: 2.8 ( 6.18x)
w_avg_12_64x64_c: 64.0 ( 1.00x)
w_avg_12_64x64_neon: 10.0 ( 6.40x)
w_avg_12_128x128_c: 269.2 ( 1.00x)
w_avg_12_128x128_neon: 42.0 ( 6.41x)

Signed-off-by: Zhao Zhili <zhilizhao@tencent.com>
This commit is contained in:
parent
76eb3e5ff3
commit
0ba9e8d0d4
@ -52,6 +52,39 @@ void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride,
|
||||
const int16_t *src0, const int16_t *src1, int width,
|
||||
int height);
|
||||
|
||||
/*
 * NEON weighted-average kernels, one per supported bit depth (8, 10, 12).
 *
 * All three share the same register-only signature.  The two trailing
 * arguments each carry a pair of 32-bit values packed into one register:
 *   w0_w1        - weight 0 in the upper 32 bits, weight 1 in the lower 32
 *   offset_shift - rounding offset in the upper 32 bits, right-shift amount
 *                  in the lower 32
 * The assembly side splits them again with a 32-bit logical shift right
 * (upper half) and a 32-bit register move (lower half).
 */
void ff_vvc_w_avg_8_neon(
    uint8_t *_dst,
    ptrdiff_t _dst_stride,
    const int16_t *src0,
    const int16_t *src1,
    int width,
    int height,
    uintptr_t w0_w1,
    uintptr_t offset_shift);

void ff_vvc_w_avg_10_neon(
    uint8_t *_dst,
    ptrdiff_t _dst_stride,
    const int16_t *src0,
    const int16_t *src1,
    int width,
    int height,
    uintptr_t w0_w1,
    uintptr_t offset_shift);

void ff_vvc_w_avg_12_neon(
    uint8_t *_dst,
    ptrdiff_t _dst_stride,
    const int16_t *src0,
    const int16_t *src1,
    int width,
    int height,
    uintptr_t w0_w1,
    uintptr_t offset_shift);
|
||||
/*
 * The generic w_avg callback takes eleven arguments, so some of them would
 * have to travel on the stack.  Apple platforms lay out stack-passed
 * arguments differently from the standard ARM64 ABI, which would force the
 * assembly to handle two calling conventions.  To keep the assembly
 * interface portable, the C shim below precomputes the shift and rounding
 * offset and folds the four remaining scalars into two register-sized
 * values, so the NEON routine receives everything in registers.
 *
 * W_AVG_FUN(bit_depth) defines vvc_w_avg_<bit_depth>(), which:
 *   - derives the right-shift as denom + max(3, 15 - bit_depth),
 *   - derives the rounding offset from o0 + o1 scaled to the bit depth,
 *   - packs (w0, w1) and (offset, shift) as high/low 32-bit halves,
 *   - forwards to ff_vvc_w_avg_<bit_depth>_neon().
 *
 * NOTE: the packing shifts a uintptr_t left by 32, so it relies on
 * uintptr_t being 64 bits wide.
 */
#define W_AVG_FUN(bit_depth)                                                      \
static void vvc_w_avg_ ## bit_depth(uint8_t *dst, ptrdiff_t dst_stride,          \
    const int16_t *src0, const int16_t *src1, int width, int height,             \
    int denom, int w0, int w1, int o0, int o1)                                   \
{                                                                                 \
    const int log2_wd  = denom + FFMAX(3, 15 - bit_depth);                        \
    const int o_scaled = (o0 + o1) * (1 << (bit_depth - 8));                      \
    const int rounding = (o_scaled + 1) * (1 << (log2_wd - 1));                   \
    const uintptr_t packed_weights = (uint32_t)w1 | ((uintptr_t)w0 << 32);        \
    const uintptr_t packed_round   = (uint32_t)log2_wd |                          \
                                     ((uintptr_t)rounding << 32);                 \
                                                                                  \
    ff_vvc_w_avg_ ## bit_depth ## _neon(dst, dst_stride, src0, src1,             \
                                        width, height,                            \
                                        packed_weights, packed_round);            \
}

W_AVG_FUN(8)
W_AVG_FUN(10)
W_AVG_FUN(12)
|
||||
|
||||
void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
@ -123,6 +156,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon;
|
||||
|
||||
c->inter.avg = ff_vvc_avg_8_neon;
|
||||
c->inter.w_avg = vvc_w_avg_8;
|
||||
|
||||
for (int i = 0; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
|
||||
c->sao.band_filter[i] = ff_h26x_sao_band_filter_8x8_8_neon;
|
||||
@ -163,11 +197,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
||||
}
|
||||
} else if (bd == 10) {
|
||||
c->inter.avg = ff_vvc_avg_10_neon;
|
||||
c->inter.w_avg = vvc_w_avg_10;
|
||||
|
||||
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
|
||||
c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
|
||||
} else if (bd == 12) {
|
||||
c->inter.avg = ff_vvc_avg_12_neon;
|
||||
c->inter.w_avg = vvc_w_avg_12;
|
||||
|
||||
c->alf.filter[LUMA] = alf_filter_luma_12_neon;
|
||||
c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
|
||||
|
@ -22,9 +22,9 @@
|
||||
|
||||
#define VVC_MAX_PB_SIZE 128
|
||||
|
||||
.macro vvc_avg, bit_depth
|
||||
.macro vvc_avg type, bit_depth
|
||||
|
||||
.macro vvc_avg_\bit_depth\()_2_4, tap
|
||||
.macro vvc_\type\()_\bit_depth\()_2_4 tap
|
||||
.if \tap == 2
|
||||
ldr s0, [src0]
|
||||
ldr s2, [src1]
|
||||
@ -32,9 +32,19 @@
|
||||
ldr d0, [src0]
|
||||
ldr d2, [src1]
|
||||
.endif
|
||||
|
||||
.ifc \type, avg
|
||||
saddl v4.4s, v0.4h, v2.4h
|
||||
add v4.4s, v4.4s, v16.4s
|
||||
sqshrn v4.4h, v4.4s, #(15 - \bit_depth)
|
||||
.else
|
||||
mov v4.16b, v16.16b
|
||||
smlal v4.4s, v0.4h, v19.4h
|
||||
smlal v4.4s, v2.4h, v20.4h
|
||||
sqshl v4.4s, v4.4s, v22.4s
|
||||
sqxtn v4.4h, v4.4s
|
||||
.endif
|
||||
|
||||
.if \bit_depth == 8
|
||||
sqxtun v4.8b, v4.8h
|
||||
.if \tap == 2
|
||||
@ -57,7 +67,7 @@
|
||||
add dst, dst, dst_stride
|
||||
.endm
|
||||
|
||||
function ff_vvc_avg_\bit_depth\()_neon, export=1
|
||||
function ff_vvc_\type\()_\bit_depth\()_neon, export=1
|
||||
dst .req x0
|
||||
dst_stride .req x1
|
||||
src0 .req x2
|
||||
@ -67,42 +77,64 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
|
||||
|
||||
mov x10, #(VVC_MAX_PB_SIZE * 2)
|
||||
cmp width, #8
|
||||
.if \bit_depth == 8
|
||||
movi v16.4s, #64
|
||||
.ifc \type, avg
|
||||
movi v16.4s, #(1 << (14 - \bit_depth))
|
||||
.else
|
||||
.if \bit_depth == 10
|
||||
mov w6, #1023
|
||||
movi v16.4s, #16
|
||||
.else
|
||||
mov w6, #4095
|
||||
movi v16.4s, #4
|
||||
.endif
|
||||
lsr x11, x6, #32 // weight0
|
||||
mov w12, w6 // weight1
|
||||
lsr x13, x7, #32 // offset
|
||||
mov w14, w7 // shift
|
||||
|
||||
dup v19.8h, w11
|
||||
neg w14, w14 // so we can use sqshl
|
||||
dup v20.8h, w12
|
||||
dup v16.4s, w13
|
||||
dup v22.4s, w14
|
||||
.endif // avg
|
||||
|
||||
.if \bit_depth >= 10
|
||||
// clip pixel
|
||||
mov w6, #((1 << \bit_depth) - 1)
|
||||
movi v18.8h, #0
|
||||
dup v17.8h, w6
|
||||
.endif
|
||||
|
||||
b.eq 8f
|
||||
b.hi 16f
|
||||
cmp width, #4
|
||||
b.eq 4f
|
||||
2: // width == 2
|
||||
subs height, height, #1
|
||||
vvc_avg_\bit_depth\()_2_4 2
|
||||
vvc_\type\()_\bit_depth\()_2_4 2
|
||||
b.ne 2b
|
||||
b 32f
|
||||
4: // width == 4
|
||||
subs height, height, #1
|
||||
vvc_avg_\bit_depth\()_2_4 4
|
||||
vvc_\type\()_\bit_depth\()_2_4 4
|
||||
b.ne 4b
|
||||
b 32f
|
||||
8: // width == 8
|
||||
ld1 {v0.8h}, [src0], x10
|
||||
ld1 {v2.8h}, [src1], x10
|
||||
.ifc \type, avg
|
||||
saddl v4.4s, v0.4h, v2.4h
|
||||
saddl2 v5.4s, v0.8h, v2.8h
|
||||
add v4.4s, v4.4s, v16.4s
|
||||
add v5.4s, v5.4s, v16.4s
|
||||
sqshrn v4.4h, v4.4s, #(15 - \bit_depth)
|
||||
sqshrn2 v4.8h, v5.4s, #(15 - \bit_depth)
|
||||
.else
|
||||
mov v4.16b, v16.16b
|
||||
mov v5.16b, v16.16b
|
||||
smlal v4.4s, v0.4h, v19.4h
|
||||
smlal v4.4s, v2.4h, v20.4h
|
||||
smlal2 v5.4s, v0.8h, v19.8h
|
||||
smlal2 v5.4s, v2.8h, v20.8h
|
||||
sqshl v4.4s, v4.4s, v22.4s
|
||||
sqshl v5.4s, v5.4s, v22.4s
|
||||
sqxtn v4.4h, v4.4s
|
||||
sqxtn2 v4.8h, v5.4s
|
||||
.endif
|
||||
subs height, height, #1
|
||||
.if \bit_depth == 8
|
||||
sqxtun v4.8b, v4.8h
|
||||
@ -122,6 +154,7 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
|
||||
17:
|
||||
ldp q0, q1, [x7], #32
|
||||
ldp q2, q3, [x8], #32
|
||||
.ifc \type, avg
|
||||
saddl v4.4s, v0.4h, v2.4h
|
||||
saddl2 v5.4s, v0.8h, v2.8h
|
||||
saddl v6.4s, v1.4h, v3.4h
|
||||
@ -134,6 +167,28 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
|
||||
sqshrn2 v4.8h, v5.4s, #(15 - \bit_depth)
|
||||
sqshrn v6.4h, v6.4s, #(15 - \bit_depth)
|
||||
sqshrn2 v6.8h, v7.4s, #(15 - \bit_depth)
|
||||
.else // avg
|
||||
mov v4.16b, v16.16b
|
||||
mov v5.16b, v16.16b
|
||||
mov v6.16b, v16.16b
|
||||
mov v7.16b, v16.16b
|
||||
smlal v4.4s, v0.4h, v19.4h
|
||||
smlal v4.4s, v2.4h, v20.4h
|
||||
smlal2 v5.4s, v0.8h, v19.8h
|
||||
smlal2 v5.4s, v2.8h, v20.8h
|
||||
smlal v6.4s, v1.4h, v19.4h
|
||||
smlal v6.4s, v3.4h, v20.4h
|
||||
smlal2 v7.4s, v1.8h, v19.8h
|
||||
smlal2 v7.4s, v3.8h, v20.8h
|
||||
sqshl v4.4s, v4.4s, v22.4s
|
||||
sqshl v5.4s, v5.4s, v22.4s
|
||||
sqshl v6.4s, v6.4s, v22.4s
|
||||
sqshl v7.4s, v7.4s, v22.4s
|
||||
sqxtn v4.4h, v4.4s
|
||||
sqxtn v6.4h, v6.4s
|
||||
sqxtn2 v4.8h, v5.4s
|
||||
sqxtn2 v6.8h, v7.4s
|
||||
.endif // w_avg
|
||||
subs w6, w6, #16
|
||||
.if \bit_depth == 8
|
||||
sqxtun v4.8b, v4.8h
|
||||
@ -155,9 +210,19 @@ function ff_vvc_avg_\bit_depth\()_neon, export=1
|
||||
b.ne 16b
|
||||
32:
|
||||
ret
|
||||
|
||||
.unreq dst
|
||||
.unreq dst_stride
|
||||
.unreq src0
|
||||
.unreq src1
|
||||
.unreq width
|
||||
.unreq height
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
vvc_avg 8
|
||||
vvc_avg 10
|
||||
vvc_avg 12
|
||||
vvc_avg avg, 8
|
||||
vvc_avg avg, 10
|
||||
vvc_avg avg, 12
|
||||
vvc_avg w_avg, 8
|
||||
vvc_avg w_avg, 10
|
||||
vvc_avg w_avg, 12
|
||||
|
Loading…
Reference in New Issue
Block a user