1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-09-16 08:36:51 +02:00

avcodec/aarch64/vvc: Optimised version of classify function.

MacBook Air (M2):
    vvc_alf_classify_8x8_8_c:                                2.6 ( 1.00x)
    vvc_alf_classify_8x8_8_neon:                             1.0 ( 2.47x)
    vvc_alf_classify_8x8_10_c:                               2.7 ( 1.00x)
    vvc_alf_classify_8x8_10_neon:                            0.9 ( 2.98x)
    vvc_alf_classify_8x8_12_c:                               2.7 ( 1.00x)
    vvc_alf_classify_8x8_12_neon:                            0.9 ( 2.97x)
    vvc_alf_classify_16x16_8_c:                              7.3 ( 1.00x)
    vvc_alf_classify_16x16_8_neon:                           3.4 ( 2.12x)
    vvc_alf_classify_16x16_10_c:                             4.3 ( 1.00x)
    vvc_alf_classify_16x16_10_neon:                          2.9 ( 1.47x)
    vvc_alf_classify_16x16_12_c:                             4.3 ( 1.00x)
    vvc_alf_classify_16x16_12_neon:                          3.0 ( 1.44x)
    vvc_alf_classify_32x32_8_c:                             13.7 ( 1.00x)
    vvc_alf_classify_32x32_8_neon:                          10.7 ( 1.29x)
    vvc_alf_classify_32x32_10_c:                            12.3 ( 1.00x)
    vvc_alf_classify_32x32_10_neon:                          8.7 ( 1.42x)
    vvc_alf_classify_32x32_12_c:                            12.2 ( 1.00x)
    vvc_alf_classify_32x32_12_neon:                          8.7 ( 1.40x)
    vvc_alf_classify_64x64_8_c:                             45.8 ( 1.00x)
    vvc_alf_classify_64x64_8_neon:                          37.1 ( 1.23x)
    vvc_alf_classify_64x64_10_c:                            41.3 ( 1.00x)
    vvc_alf_classify_64x64_10_neon:                         32.8 ( 1.26x)
    vvc_alf_classify_64x64_12_c:                            41.4 ( 1.00x)
    vvc_alf_classify_64x64_12_neon:                         32.4 ( 1.28x)
    vvc_alf_classify_128x128_8_c:                          163.7 ( 1.00x)
    vvc_alf_classify_128x128_8_neon:                       138.3 ( 1.18x)
    vvc_alf_classify_128x128_10_c:                         149.1 ( 1.00x)
    vvc_alf_classify_128x128_10_neon:                      120.3 ( 1.24x)
    vvc_alf_classify_128x128_12_c:                         148.7 ( 1.00x)
    vvc_alf_classify_128x128_12_neon:                      119.4 ( 1.25x)

    RPi4 (Cortex-A72):
    vvc_alf_classify_8x8_8_c:                             1251.6 ( 1.00x)
    vvc_alf_classify_8x8_8_neon:                           700.7 ( 1.79x)
    vvc_alf_classify_8x8_10_c:                            1141.9 ( 1.00x)
    vvc_alf_classify_8x8_10_neon:                          659.7 ( 1.73x)
    vvc_alf_classify_8x8_12_c:                            1075.8 ( 1.00x)
    vvc_alf_classify_8x8_12_neon:                          658.7 ( 1.63x)
    vvc_alf_classify_16x16_8_c:                           3574.1 ( 1.00x)
    vvc_alf_classify_16x16_8_neon:                        1849.8 ( 1.93x)
    vvc_alf_classify_16x16_10_c:                          3270.0 ( 1.00x)
    vvc_alf_classify_16x16_10_neon:                       1786.1 ( 1.83x)
    vvc_alf_classify_16x16_12_c:                          3271.7 ( 1.00x)
    vvc_alf_classify_16x16_12_neon:                       1785.5 ( 1.83x)
    vvc_alf_classify_32x32_8_c:                          12451.9 ( 1.00x)
    vvc_alf_classify_32x32_8_neon:                        5984.3 ( 2.08x)
    vvc_alf_classify_32x32_10_c:                         11428.9 ( 1.00x)
    vvc_alf_classify_32x32_10_neon:                       5756.3 ( 1.99x)
    vvc_alf_classify_32x32_12_c:                         11252.8 ( 1.00x)
    vvc_alf_classify_32x32_12_neon:                       5755.7 ( 1.96x)
    vvc_alf_classify_64x64_8_c:                          47625.5 ( 1.00x)
    vvc_alf_classify_64x64_8_neon:                       21071.9 ( 2.26x)
    vvc_alf_classify_64x64_10_c:                         44576.3 ( 1.00x)
    vvc_alf_classify_64x64_10_neon:                      21544.7 ( 2.07x)
    vvc_alf_classify_64x64_12_c:                         44600.5 ( 1.00x)
    vvc_alf_classify_64x64_12_neon:                      21491.2 ( 2.08x)
    vvc_alf_classify_128x128_8_c:                       192143.3 ( 1.00x)
    vvc_alf_classify_128x128_8_neon:                     82387.6 ( 2.33x)
    vvc_alf_classify_128x128_10_c:                      177583.1 ( 1.00x)
    vvc_alf_classify_128x128_10_neon:                    81628.8 ( 2.18x)
    vvc_alf_classify_128x128_12_c:                      177582.2 ( 1.00x)
    vvc_alf_classify_128x128_12_neon:                    81625.1 ( 2.18x)
This commit is contained in:
Georgii Zagoruiko
2025-09-09 22:10:54 +01:00
parent de25cb4603
commit 4fbacb3944
3 changed files with 297 additions and 0 deletions

View File

@@ -291,3 +291,208 @@ function ff_alf_filter_chroma_kernel_10_neon, export=1
1:
alf_filter_chroma_kernel 2
endfunc
// Constants for the ALF classification code below.
// Side length, in samples, of one classified block.
#define ALF_BLOCK_SIZE 4
// Gradients are evaluated on every ALF_GRADIENT_STEP-th row and column.
#define ALF_GRADIENT_STEP 2
// Extra samples taken on each side of the block area for the gradients.
#define ALF_GRADIENT_BORDER 2
// Number of gradient directions stored per gradient position.
#define ALF_NUM_DIR 4
// Border on both sides; also the byte size of one border for 16-bit samples.
#define ALF_GRAD_BORDER_X2 (ALF_GRADIENT_BORDER * 2)
// Number of rows the source pointer is moved up before the gradient pass.
#define ALF_STRIDE_MUL (ALF_GRADIENT_BORDER + 1)
// NOTE(review): the remaining defines are not referenced by the code visible
// in this part of the file - confirm whether they are still needed.
#define ALF_GRAD_X_VSTEP (ALF_GRADIENT_STEP * 8)
#define ALF_GSTRIDE_MUL (ALF_NUM_DIR / ALF_GRADIENT_STEP)
// Shift right: equal to division by 2 (see ALF_GRADIENT_STEP)
#define ALF_GSTRIDE_XG_BYTES (2 * ALF_NUM_DIR / ALF_GRADIENT_STEP)
#define ALF_GSTRIDE_SUB_BYTES (2 * ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP) * ALF_NUM_DIR)
#define ALF_CLASS_INC (ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP)
#define ALF_CLASS_END ((ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP)
// Gradient stage of the ALF classifier. The macro expands to a complete
// function body (it ends with ret).
//
// pix_size: bytes per sample. 1 selects the 8-bit loads (widened to 16 bits
//           with uxtl); any other value selects the 16-bit loads.
//
// Starting ALF_STRIDE_MUL rows above and ALF_GRADIENT_BORDER samples to the
// left of _src, the code walks a window of (width + 4) x (height + 4) samples
// in steps of two rows and two columns. For every step it writes one group of
// four 16-bit values to gradient_tmp, in the order
//     vertical, horizontal, diagonal 0, diagonal 1
// (st2 interleaves the V/D0 vector with the H/D1 vector). Each value is the
// sum of two absolute second differences: one centred on s1[x] and one
// centred on s2[x + 1].
//
// One x iteration consumes 8 samples and produces 4 groups. The last
// iteration of a row stores only 2 groups.
// NOTE(review): this looks like it relies on width being a multiple of 8 -
// confirm against the callers.
//
// Scratch registers: x8-x17, v0-v5, v16-v31. x0 and x1 are not touched.
.macro ff_alf_classify_grad pix_size
// class_idx .req x0
// transpose_idx .req x1
// _src .req x2
// _src_stride .req x3
// width .req w4
// height .req w5
// vb_pos .req w6
// gradient_tmp .req x7
mov w16, #ALF_STRIDE_MUL
add w5, w5, #ALF_GRAD_BORDER_X2 // h = height + ALF_GRAD_BORDER_X2
mul x16, x3, x16 // ALF_STRIDE_MUL * stride
add w4, w4, #ALF_GRAD_BORDER_X2 // w = width + ALF_GRAD_BORDER_X2
sub x15, x2, x16 // src -= (ALF_STRIDE_MUL * stride)
mov x17, x7 // x17 = write cursor into gradient_tmp
// src -= ALF_GRADIENT_BORDER samples (2 bytes for 8-bit, 4 bytes for 16-bit)
.if \pix_size == 1
sub x15, x15, #ALF_GRADIENT_BORDER
.else
sub x15, x15, #ALF_GRAD_BORDER_X2
.endif
mov w8, #0 // y loop: y = 0
1:
// Row pointers for this step: sN = src + (y + N) * stride, N = 0..3
add x16, x8, #1
mul x16, x16, x3 // (y + 1) * stride
madd x10, x8, x3, x15 // s0 = src + y * stride
add x14, x16, x3 // (y + 2) * stride
add x11, x15, x16 // s1
add x16, x14, x3 // (y + 3) * stride
add x12, x15, x14 // s2
add x13, x15, x16 // s3
// if (y == vb_pos): s3 = s2
cmp w8, w6
add w16, w6, #ALF_GRADIENT_BORDER // does not alter the flags of the cmp above
csel x13, x12, x13, eq
// if (y == vb_pos + 2): s0 = s1
cmp w8, w16
csel x10, x11, x10, eq
// Pre-bias the row pointers so that the first load of each row already
// covers the left neighbours (offsets are in bytes).
.if \pix_size == 1
sub x10, x10, #1 // s0-1
sub x11, x11, #2 // s1-2
sub x12, x12, #2 // s2-2
.else
sub x10, x10, #2 // s0-1
sub x11, x11, #4 // s1-2
sub x12, x12, #4 // s2-2
.endif
// x loop
mov w9, #0
b 11f // nothing to store before the first iteration
2:
// Store operation starts from the second cycle
// (writes the 4 gradient groups computed by the previous iteration)
st2 {v4.8h, v5.8h}, [x17], #32
11:
.if \pix_size == 1
// Load 8 samples at each required row offset and widen them to 16 bits.
// The post-increments of each pointer add up to 8 bytes per iteration.
mov x16, #1
mov x14, #7
ld1 {v0.8b}, [x10], x16 // s0-1
ld1 {v2.8b}, [x13], x16 // s3
ld1 {v1.8b}, [x10], x14 // s0
ld1 {v3.8b}, [x13], x14 // s3+1
uxtl v16.8h, v0.8b
uxtl v20.8h, v1.8b
uxtl v28.8h, v2.8b
uxtl v19.8h, v3.8b
mov x16, #2
mov x14, #4
ld1 {v0.8b}, [x11], x16 // s1-2
ld1 {v3.8b}, [x12], x16 // s2-2
ld1 {v1.8b}, [x11], x16 // s1
ld1 {v4.8b}, [x12], x16 // s2
ld1 {v2.8b}, [x11], x14 // s1+2
ld1 {v5.8b}, [x12], x14 // s2+2
uxtl v17.8h, v0.8b
uxtl v22.8h, v1.8b
uxtl v26.8h, v2.8b
uxtl v18.8h, v3.8b
uxtl v24.8h, v4.8b
uxtl v27.8h, v5.8b
.else
// Same loads for 16-bit samples; the post-increments add up to 16 bytes.
mov x16, #2
mov x14, #14
ld1 {v16.8h}, [x10], x16 // s0-1
ld1 {v28.8h}, [x13], x16 // s3
ld1 {v20.8h}, [x10], x14 // s0
ld1 {v19.8h}, [x13], x14 // s3+1
mov x16, #4
mov x14, #8
ld1 {v17.8h}, [x11], x16 // s1-2
ld1 {v18.8h}, [x12], x16 // s2-2
ld1 {v22.8h}, [x11], x16 // s1
ld1 {v24.8h}, [x12], x16 // s2
ld1 {v26.8h}, [x11], x14 // s1+2
ld1 {v27.8h}, [x12], x14 // s2+2
.endif
// Grad: Vertical & D0 (interleaved)
// In every pair of lanes the even lane holds the vertical term and the odd
// lane the diagonal-0 term of the same gradient position.
trn1 v21.8h, v20.8h, v16.8h // first abs: operand 1
rev32 v23.8h, v22.8h // second abs: operand 1
trn2 v29.8h, v28.8h, v19.8h // second abs: operand 2
trn1 v30.8h, v22.8h, v22.8h // first centre sample s1[x] in both lanes
trn2 v31.8h, v24.8h, v24.8h // second centre sample s2[x+1] in both lanes
add v30.8h, v30.8h, v30.8h // 2 * first centre
add v31.8h, v31.8h, v31.8h // 2 * second centre
sub v0.8h, v30.8h, v21.8h
sub v1.8h, v31.8h, v23.8h
sabd v4.8h, v0.8h, v24.8h // first abs; v24 (s2) supplies operand 2
// Grad: Horizontal & D1 (interleaved)
// Even lane: horizontal term, odd lane: diagonal-1 term.
trn2 v21.8h, v17.8h, v20.8h // first abs: operand 1
saba v4.8h, v1.8h, v29.8h // accumulate second abs of V / D0
trn2 v23.8h, v22.8h, v18.8h // first abs: operand 2
trn1 v25.8h, v24.8h, v26.8h // second abs: operand 1
trn1 v29.8h, v27.8h, v28.8h // second abs: operand 2
sub v0.8h, v30.8h, v21.8h
sub v1.8h, v31.8h, v25.8h
add w9, w9, #8 // x += 8
sabd v5.8h, v0.8h, v23.8h
cmp w9, w4
saba v5.8h, v1.8h, v29.8h // NEON op, leaves the cmp flags intact
b.lt 2b
add w8, w8, #ALF_GRADIENT_STEP // y += ALF_GRADIENT_STEP
// 8 pixels -> 4 cycles of generic
// 4 pixels -> paddings => half needs to be saved
// (only the first 2 gradient groups of the last iteration are stored)
st2 {v4.4h, v5.4h}, [x17], #16
cmp w8, w5
b.lt 1b
ret
.endm
// Accumulate one row of gradients.
// Loads 24 halfwords (three q registers) from [x2], advances x2 by x3 bytes,
// and adds them, zero-extended to 32 bits, into v16, v17 and v18. The low and
// the high half of each loaded register are added into the same four 32-bit
// lanes. Overwrites v0-v2.
.macro ff_alf_classify_sum
ld1 {v0.8h, v1.8h, v2.8h}, [x2], x3
uaddw v16.4s, v16.4s, v0.4h
uaddw v17.4s, v17.4s, v1.4h
uaddw v18.4s, v18.4s, v2.4h
uaddw2 v16.4s, v16.4s, v0.8h
uaddw2 v17.4s, v17.4s, v1.8h
uaddw2 v18.4s, v18.4s, v2.8h
.endm
// Sum the gradients of two horizontally adjacent blocks in one pass.
//
// Every row contributes 24 halfwords, i.e. six groups of four direction
// values. sum0 receives the per-direction total of groups 0-3 and sum1 that
// of groups 2-5, so the two middle groups are shared by both results.
// Three rows are always accumulated; a fourth one is added when steps >= 4.
// Each result is four 32-bit integers.
function ff_alf_classify_sum_neon, export=1
// sum0 .req x0
// sum1 .req x1
// grad .req x2
// gshift .req w3
// steps .req w4
// Row pitch in bytes: 2 * gshift + 32 (gshift is given in 16-bit elements)
lsl w3, w3, #1
cmp w4, #4 // flags are consumed by the blt below
add w3, w3, #32
// First row initialises the accumulators instead of adding to them.
ld1 {v0.8h, v1.8h, v2.8h}, [x2], x3
uxtl v16.4s, v0.4h
uxtl v17.4s, v1.4h
uxtl v18.4s, v2.4h
uaddw2 v16.4s, v16.4s, v0.8h
uaddw2 v17.4s, v17.4s, v1.8h
uaddw2 v18.4s, v18.4s, v2.8h
ff_alf_classify_sum
ff_alf_classify_sum
// Skip the fourth row when steps < 4 (nothing since the cmp changed the flags)
blt 60f
ff_alf_classify_sum
60:
// v17 holds the groups shared by both blocks
add v16.4s, v16.4s, v17.4s
add v18.4s, v18.4s, v17.4s
st1 {v16.4s}, [x0]
st1 {v18.4s}, [x1]
ret
endfunc
// 8-bit entry point of the gradient pass: one byte per sample.
function ff_alf_classify_grad_8_neon, export=1
ff_alf_classify_grad 1
endfunc
// 10-bit entry point of the gradient pass. It expands the two-bytes-per-sample
// variant of ff_alf_classify_grad, exactly like the 12-bit entry point; the
// macro supplies the whole body including the final ret.
function ff_alf_classify_grad_10_neon, export=1
ff_alf_classify_grad 2
endfunc
// 12-bit entry point of the gradient pass: two bytes per sample.
function ff_alf_classify_grad_12_neon, export=1
ff_alf_classify_grad 2
endfunc

View File

@@ -155,3 +155,89 @@ static void FUNC2(alf_filter_chroma, BIT_DEPTH, _neon)(uint8_t *_dst,
}
}
}
/* Indices of the four gradient directions inside a sum[] array. */
#define ALF_DIR_VERT 0
#define ALF_DIR_HORZ 1
#define ALF_DIR_DIGA0 2
#define ALF_DIR_DIGA1 3
/**
 * Derive the class index and the transpose index of one block from its
 * four directional gradient sums.
 *
 * @param class_idx     receives the class index
 * @param transpose_idx receives the transpose index
 * @param sum           gradient sums, indexed by ALF_DIR_*
 * @param ac            scale factor applied to the activity before it is
 *                      shifted by (BIT_DEPTH - 1) and clipped to 4 bits
 */
static void FUNC(alf_get_idx)(int *class_idx, int *transpose_idx, const int *sum, const int ac)
{
    /* Maps the clipped 4-bit activity onto the activity part of the class. */
    static const int arg_var[] = {0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 };

    const int vert  = sum[ALF_DIR_VERT];
    const int horz  = sum[ALF_DIR_HORZ];
    const int diag0 = sum[ALF_DIR_DIGA0];
    const int diag1 = sum[ALF_DIR_DIGA1];

    /* Larger / smaller gradient of each pair, plus which one was larger. */
    const int horz_ge_vert   = vert <= horz;
    const int hv_max         = FFMAX(vert, horz);
    const int hv_min         = FFMIN(vert, horz);
    const int diag1_ge_diag0 = diag0 <= diag1;
    const int d_max          = FFMAX(diag0, diag1);
    const int d_min          = FFMIN(diag0, diag1);

    /* The cross products are evaluated in 64 bits so they cannot overflow. */
    const int use_hv = (uint64_t)d_max * hv_min <= (uint64_t)hv_max * d_min;
    const int strong = use_hv ? hv_max : d_max;
    const int weak   = use_hv ? hv_min : d_min;

    const int activity = (horz + vert) * ac >> (BIT_DEPTH - 1);
    int cls            = arg_var[av_clip_uintp2(activity, 4)];
    int strength       = 0;

    /* Directionality strength: 2 = strong, 1 = weak, 0 = none. */
    if (strong * 2 > 9 * weak)
        strength = 2;
    else if (strong > 2 * weak)
        strength = 1;

    if (strength)
        cls += ((use_hv << 1) + strength) * 5;

    *class_idx     = cls;
    *transpose_idx = diag1_ge_diag0 * 2 + horz_ge_vert;
}
/**
 * Second stage of the classification: turn the precomputed 16-bit gradients
 * into one class index and one transpose index per ALF_BLOCK_SIZE block.
 *
 * Two horizontally adjacent blocks are handled per iteration, because
 * ff_alf_classify_sum_neon() produces both of their gradient sums at once.
 * _src and _src_stride are not used by this stage.
 *
 * @param class_idx     output, one entry per block, advanced in raster order
 * @param transpose_idx output, one entry per block, advanced in raster order
 * @param width         width of the area; stepped in 2 * ALF_BLOCK_SIZE
 * @param height        height of the area; stepped in ALF_BLOCK_SIZE
 * @param vb_pos        row compared against each block row; a block row that
 *                      ends or starts there sums fewer gradient rows and
 *                      uses ac = 3 instead of 2
 * @param gradient_tmp  gradients, ALF_NUM_DIR values per gradient position
 */
static void FUNC(alf_classify)(int *class_idx, int *transpose_idx,
    const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
    const int vb_pos, int16_t *gradient_tmp)
{
    const int rows_full  = (ALF_BLOCK_SIZE + ALF_GRADIENT_BORDER * 2) / ALF_GRADIENT_STEP;
    const int rows_at_vb = ALF_GRADIENT_BORDER / ALF_GRADIENT_STEP;
    const int padded_w   = width + ALF_GRADIENT_BORDER * 2;
    /* Length of one gradient row and the remainder after one block's span. */
    const int row_len    = (padded_w / ALF_GRADIENT_STEP) * ALF_NUM_DIR;
    const int row_skip   = row_len - rows_full * ALF_NUM_DIR;

    for (int by = 0; by < height; by += ALF_BLOCK_SIZE) {
        const int ends_at_vb   = (by + ALF_BLOCK_SIZE == vb_pos);
        const int starts_at_vb = !ends_at_vb && (by == vb_pos);
        const int first        = starts_at_vb ? rows_at_vb : 0;
        const int last         = ends_at_vb ? rows_full - rows_at_vb : rows_full;
        const int ac           = (ends_at_vb || starts_at_vb) ? 3 : 2;
        const int row0         = by / ALF_GRADIENT_STEP + first;

        for (int bx = 0; bx < width; bx += 2 * ALF_BLOCK_SIZE) {
            int left[ALF_NUM_DIR];
            int right[ALF_NUM_DIR];
            int16_t *pair_grad = gradient_tmp + row0 * row_len
                                 + (bx / ALF_GRADIENT_STEP) * ALF_NUM_DIR;

            ff_alf_classify_sum_neon(left, right, pair_grad, row_skip, last - first);

            FUNC(alf_get_idx)(class_idx++, transpose_idx++, left, ac);
            FUNC(alf_get_idx)(class_idx++, transpose_idx++, right, ac);
        }
    }
}
/* Gradient pass for the current BIT_DEPTH; the name is assembled by FUNC2
 * from ff_alf_classify_grad, BIT_DEPTH and _neon. It takes a 16-bit
 * gradient buffer.
 * NOTE(review): presumably implemented in the aarch64 assembly file - confirm. */
void FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
const int vb_pos, int16_t *gradient_tmp);
/**
 * NEON classification entry point for the current BIT_DEPTH.
 *
 * Runs the gradient pass and then the per-block index derivation. Both
 * stages receive the same arguments and use the caller's int scratch
 * buffer through an int16_t pointer.
 */
static void FUNC2(alf_classify, BIT_DEPTH, _neon)(int *class_idx, int *transpose_idx,
    const uint8_t *_src, const ptrdiff_t _src_stride, const int width, const int height,
    const int vb_pos, int *gradient_tmp)
{
    int16_t *const grad16 = (int16_t*)gradient_tmp;

    /* Stage 1: fill grad16 with the directional gradients. */
    FUNC2(ff_alf_classify_grad, BIT_DEPTH, _neon)(class_idx, transpose_idx, _src, _src_stride,
                                                  width, height, vb_pos, grad16);
    /* Stage 2: sum the gradients per block and derive the indices. */
    FUNC(alf_classify)(class_idx, transpose_idx, _src, _src_stride,
                       width, height, vb_pos, grad16);
}

View File

@@ -30,6 +30,9 @@
#define BDOF_BLOCK_SIZE 16
#define BDOF_MIN_BLOCK_SIZE 4
/**
 * Sum the gradients of two horizontally adjacent blocks (assembly routine).
 *
 * @param sum0   receives four per-direction sums for the first block
 * @param sum1   receives four per-direction sums for the second block
 * @param grad   first gradient row to read; 24 values are read per row
 * @param gshift distance between rows, in elements, minus 16
 * @param steps  number of rows to sum; three rows are always summed and a
 *               fourth one is added when steps >= 4
 */
void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps);
#define BIT_DEPTH 8
#include "alf_template.c"
#undef BIT_DEPTH
@@ -203,6 +206,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->sao.edge_filter[i] = ff_vvc_sao_edge_filter_16x16_8_neon;
c->alf.filter[LUMA] = alf_filter_luma_8_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_8_neon;
c->alf.classify = alf_classify_8_neon;
if (have_i8mm(cpu_flags)) {
c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon_i8mm;
@@ -242,6 +246,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
c->alf.classify = alf_classify_10_neon;
} else if (bd == 12) {
c->inter.avg = ff_vvc_avg_12_neon;
c->inter.w_avg = vvc_w_avg_12;
@@ -252,6 +257,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->alf.filter[LUMA] = alf_filter_luma_12_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
c->alf.classify = alf_classify_12_neon;
}
c->inter.sad = ff_vvc_sad_neon;