diff --git a/libavcodec/x86/vvc/vvc_alf.asm b/libavcodec/x86/vvc/vvc_alf.asm
index d20dba4151..b3d118962f 100644
--- a/libavcodec/x86/vvc/vvc_alf.asm
+++ b/libavcodec/x86/vvc/vvc_alf.asm
@@ -40,8 +40,16 @@ PARAM_SHUFFE 1
 PARAM_SHUFFE 2
 PARAM_SHUFFE 3
 
+CLASSIFY_SHUFFE: times 2 db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+TRANSPOSE_PERMUTE: dd 0, 1, 4, 5, 2, 3, 6, 7
+ARG_VAR_SHUFFE: times 2 db 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4
+
 dd448: times 8 dd 512 - 64
 dw64: times 8 dd 64
+dd2: times 8 dd 2
+dw3: times 8 dd 3
+dw5: times 8 dd 5
+dd15: times 8 dd 15
 
 SECTION .text
 
@@ -432,10 +440,371 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x28, dst, dst_stride, src, src_s
 ALF_FILTER %1, chroma
 %endmacro
 
+%define ALF_GRADIENT_BORDER 2
+%define ALF_BORDER_LUMA 3
+
+; ******************************
+; void ff_vvc_alf_classify_grad(int *gradient_sum, const uint8_t *src,
+;     ptrdiff_t src_stride, intptr_t width, intptr_t height, intptr_t vb_pos);
+; ******************************
+%macro ALF_CLASSIFY_GRAD 1
+cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, width, height, vb_pos, \
+    x, y, s0, s1, s2, s3, vb_pos_below, src_stride3
+
+    lea             src_stride3q, [src_strideq * 2 + src_strideq]
+
+    lea             vb_pos_belowd, [vb_posd + ALF_GRADIENT_BORDER]
+
+    ; src = src - ALF_BORDER_LUMA * src_stride - ALF_BORDER_LUMA
+    sub             srcq, src_stride3q
+    sub             srcq, ALF_BORDER_LUMA * ps
+
+    add             widthd, ALF_GRADIENT_BORDER * 2
+    add             heightd, ALF_GRADIENT_BORDER * 2
+
+    xor             yd, yd
+
+.loop_h:
+    xor             xd, xd
+    pxor            m15, m15                    ; prev
+    .loop_w:
+        lea             s0q, [srcq + xq * ps]
+        lea             s1q, [s0q + src_strideq]
+        lea             s2q, [s0q + 2 * src_strideq]
+        lea             s3q, [s0q + src_stride3q]
+
+        cmp             yd, vb_pos_belowd
+        cmove           s0q, s1q
+
+        cmp             yd, vb_posd
+        cmove           s3q, s2q
+
+        LOAD_PIXELS     m0, [s0q]
+        LOAD_PIXELS     m1, [s1q]
+        LOAD_PIXELS     m2, [s2q]
+        LOAD_PIXELS     m3, [s3q]
+
+        LOAD_PIXELS     m4, [s0q + 2 * ps]
+        LOAD_PIXELS     m5, [s1q + 2 * ps]
+        LOAD_PIXELS     m6, [s2q + 2 * ps]
+        LOAD_PIXELS     m7, [s3q + 2 * ps]
+
+        vpblendw        m8, m0, m1, 0xaa        ; nw
+        vpblendw        m9, m0, m5, 0x55        ; n
+        vpblendw        m10, m4, m5, 0xaa       ; ne
+        vpblendw        m11, m1, m2, 0xaa       ; w
+        vpblendw        m12, m5, m6, 0xaa       ; e
+        vpblendw        m13, m2, m3, 0xaa       ; sw
+        vpblendw        m14, m2, m7, 0x55       ; s
+
+        vpblendw        m0, m1, m6, 0x55
+        vpaddw          m0, m0                  ; c
+
+        movu            m1, [CLASSIFY_SHUFFE]
+        pshufb          m1, m0, m1              ; d
+
+        vpaddw          m9, m14                 ; n + s
+        vpsubw          m9, m0                  ; (n + s) - c
+        vpabsw          m9, m9                  ; ver
+
+        vpaddw          m11, m12                ; w + e
+        vpsubw          m11, m1                 ; (w + e) - d
+        vpabsw          m11, m11                ; hor
+
+        vpblendw        m14, m6, m7, 0xaa       ; se
+        vpaddw          m8, m14                 ; nw + se
+        vpsubw          m8, m1                  ; (nw + se) - d
+        vpabsw          m8, m8                  ; di0
+
+        vpaddw          m10, m13                ; ne + sw
+        vpsubw          m10, m1                 ; (ne + sw) - d
+        vpabsw          m10, m10                ; di1
+
+        phaddw          m9, m11                 ; vh, each word represent 2x2 pixels
+        phaddw          m8, m10                 ; di, each word represent 2x2 pixels
+        phaddw          m0, m9, m8              ; all = each word represent 4x2 pixels, order is v_h_d0_d1 x 4
+
+        vinserti128     m15, m15, xm0, 1
+        vpblendw        m1, m0, m15, 0xaa       ; t
+
+        phaddw          m1, m0                  ; each word represent 8x2 pixels, adjacent word share 4x2 pixels
+
+        vextracti128    xm15, m0, 1             ; prev
+
+        movu            [gradient_sumq], m1
+
+        add             gradient_sumq, 32
+        add             xd, 16
+        cmp             xd, widthd
+        jl              .loop_w
+
+    lea             srcq, [srcq + 2 * src_strideq]
+    add             yd, 2
+    cmp             yd, heightd
+    jl              .loop_h
+    RET
+%endmacro
+
+; SAVE_CLASSIFY_PARAM_W16(dest, src)
+%macro SAVE_CLASSIFY_PARAM_W16 2
+    lea             tempq, [%1q + xq]
+    movu            [tempq], xm%2
+    vperm2i128      m%2, m%2, m%2, 1
+    movu            [tempq + widthq], xm%2
+%endmacro
+
+; SAVE_CLASSIFY_PARAM_W8
+%macro SAVE_CLASSIFY_PARAM_W8 2
+    movq            [%1], xm%2
+    vperm2i128      m%2, m%2, m%2, 1
+    movq            [%1 + widthq], xm%2
+%endmacro
+
+; SAVE_CLASSIFY_PARAM_W4
+%macro SAVE_CLASSIFY_PARAM_W4 2
+    movd            [%1], xm%2
+    vperm2i128      m%2, m%2, m%2, 1
+    movd            [%1 + widthq], xm%2
+%endmacro
+
+; SAVE_CLASSIFY_PARAM_W(dest, src)
+%macro SAVE_CLASSIFY_PARAM_W 2
+    lea             tempq, [%1q + xq]
+    cmp             wd, 8
+    jl              %%w4
+    SAVE_CLASSIFY_PARAM_W8 tempq, %2
+    vpermq          m%2, m%2, 00010011b
+    add             tempq, 8
+    cmp             wd, 8
+    je              %%end
+%%w4:
+    SAVE_CLASSIFY_PARAM_W4 tempq, %2
+%%end:
+%endmacro
+
+%macro ALF_CLASSIFY_H8 0
+    ; first line, sum of 16x4 pixels (includes borders)
+    lea             gradq, [gradient_sumq + 2 * xq]
+    movu            m0, [gradq]
+    movu            m1, [gradq + sum_strideq]
+    movu            m2, [gradq + 2 * sum_strideq]
+
+    pcmpeqb         m11, m11
+    movd            xm13, yd
+    vpbroadcastd    m13, xm13
+    movd            xm12, vb_posd
+    vpbroadcastd    m12, xm12
+    vpcmpeqd        m13, m12                    ; y == vb_pos
+    pandn           m13, m11                    ; y != vb_pos
+
+    vpbroadcastd    m14, [dw3]
+    pblendvb        m14, m14, [dd2], m13        ; ac
+
+    pblendvb        m3, m15, [gradq + sum_stride3q], m13
+
+    ; extend to dword to avoid overflow
+    vpunpcklwd      m4, m0, m15
+    vpunpckhwd      m5, m0, m15
+    vpunpcklwd      m6, m1, m15
+    vpunpckhwd      m7, m1, m15
+    vpunpcklwd      m8, m2, m15
+    vpunpckhwd      m9, m2, m15
+    vpunpcklwd      m10, m3, m15
+    vpunpckhwd      m11, m3, m15
+
+    vpaddd          m0, m4, m6
+    vpaddd          m1, m5, m7
+    vpaddd          m2, m8, m10
+    vpaddd          m3, m9, m11
+
+    ; sum of the first row
+    vpaddd          m0, m2                      ; low
+    vpaddd          m1, m3                      ; high
+
+    lea             gradq, [gradq + 2 * sum_strideq]
+
+    pblendvb        m10, m15, [gradq], m13
+
+    movu            m11, [gradq + sum_strideq]
+    movu            m12, [gradq + 2 * sum_strideq]
+    movu            m13, [gradq + sum_stride3q]
+
+    vpunpcklwd      m4, m10, m15
+    vpunpckhwd      m5, m10, m15
+    vpunpcklwd      m6, m11, m15
+    vpunpckhwd      m7, m11, m15
+    vpunpcklwd      m8, m12, m15
+    vpunpckhwd      m9, m12, m15
+    vpunpcklwd      m10, m13, m15
+    vpunpckhwd      m11, m13, m15
+
+    vpaddd          m2, m4, m6
+    vpaddd          m3, m5, m7
+    vpaddd          m4, m8, m10
+    vpaddd          m5, m9, m11
+
+    ; sum of the second row
+    vpaddd          m2, m4                      ; low
+    vpaddd          m3, m5                      ; high
+
+    vpunpckldq      m4, m0, m2
+    vpunpckhdq      m5, m0, m2
+    vpunpckldq      m6, m1, m3
+    vpunpckhdq      m7, m1, m3
+
+    ; each dword represent 4x2 alf blocks
+    ; the order is 01452367
+    vpunpckldq      m0, m4, m6                  ; sum_v
+    vpunpckhdq      m1, m4, m6                  ; sum_h
+    vpunpckldq      m2, m5, m7                  ; sum_d0
+    vpunpckhdq      m3, m5, m7                  ; sum_d1
+
+    vpcmpgtd        m4, m0, m1                  ; dir_hv - 1
+    vpmaxsd         m5, m0, m1                  ; hv1
+    vpminsd         m6, m0, m1                  ; hv0
+
+    vpaddd          m0, m1                      ; sum_hv
+
+    vpcmpgtd        m7, m2, m3                  ; dir_d - 1
+    vpmaxsd         m8, m2, m3                  ; d1
+    vpminsd         m9, m2, m3                  ; d0
+
+    ; *transpose_idx = dir_d * 2 + dir_hv;
+    vpbroadcastd    m10, [dw3]
+    vpaddd          m11, m7, m7
+    vpaddd          m11, m4
+    vpaddd          m10, m11
+    vpermq          m10, m10, 11011000b
+    SAVE_CLASSIFY_PARAM transpose_idx, 10
+
+    vpsrlq          m10, m8, 32
+    vpsrlq          m11, m6, 32
+    pmuldq          m12, m10, m11               ; d1 * hv0 high
+    vpsrlq          m1, m9, 32
+    vpsrlq          m2, m5, 32
+    pmuldq          m3, m1, m2                  ; d0 * hv1 high
+    vpcmpgtq        m10, m12, m3                ; dir1 - 1 high
+
+    pmuldq          m1, m8, m6                  ; d1 * hv0 low
+    pmuldq          m2, m9, m5                  ; d0 * hv1 low
+    vpcmpgtq        m1, m2                      ; dir1 - 1 low
+
+    vpblendd        m1, m1, m10, 0xaa           ; dir1 - 1
+
+    pblendvb        m2, m5, m8, m1              ; hvd1
+    pblendvb        m3, m6, m9, m1              ; hvd0
+
+    movd            xm5, bit_depthd
+    vpbroadcastd    m5, xm5
+
+    ; *class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)];
+    vpmulld         m0, m14                     ; sum_hv * ac
+    vpsrlvd         m0, m0, m5
+    vpminsd         m0, [dd15]
+    movu            m6, [ARG_VAR_SHUFFE]
+    pshufb          m6, m0                      ; class_idx
+
+    vpbroadcastd    m10, [dw5]
+
+    ; if (hvd1 * 2 > 9 * hvd0)
+    ;     *class_idx += ((dir1 << 1) + 2) * 5;
+    ; else if (hvd1 > 2 * hvd0)
+    ;     *class_idx += ((dir1 << 1) + 1) * 5;
+    paddd           m7, m3, m3
+    pcmpgtd         m7, m2, m7                  ; hvd1 > 2 * hvd0
+    pand            m7, m10
+    paddd           m6, m7                      ; class_idx
+
+    paddd           m8, m2, m2
+    vpslld          m9, m3, 3
+    paddd           m9, m3
+    pcmpgtd         m8, m9                      ; hvd1 * 2 > 9 * hvd0
+    pand            m8, m10
+    paddd           m6, m8                      ; class_idx
+
+    pandn           m1, m7
+    paddd           m1, m1                      ; dir1 << 1
+    paddd           m6, m1                      ; class_idx
+    vpermq          m6, m6, 11011000b
+
+    SAVE_CLASSIFY_PARAM class_idx, 6
+%endmacro
+
+%macro ALF_CLASSIFY_16x8 0
+%define SAVE_CLASSIFY_PARAM SAVE_CLASSIFY_PARAM_W16
+    ALF_CLASSIFY_H8
+%undef SAVE_CLASSIFY_PARAM
+%endmacro
+
+%macro ALF_CLASSIFY_Wx8 0
+%define SAVE_CLASSIFY_PARAM SAVE_CLASSIFY_PARAM_W
+    ALF_CLASSIFY_H8
+%undef SAVE_CLASSIFY_PARAM
+%endmacro
+
+; ******************************
+; void ff_vvc_alf_classify(int *class_idx, int *transpose_idx, const int *gradient_sum,
+;     intptr_t width, intptr_t height, intptr_t vb_pos, intptr_t bit_depth);
+; ******************************
+%macro ALF_CLASSIFY 1
+%define ps (%1 / 8)
+ALF_CLASSIFY_GRAD %1
+cglobal vvc_alf_classify_%1bpc, 7, 15, 16, class_idx, transpose_idx, gradient_sum, width, height, vb_pos, bit_depth, \
+    x, y, grad, sum_stride, sum_stride3, temp, w
+
+    sub             bit_depthq, 1
+
+    ; now we can use gradient to get class idx and transpose idx
+    lea             sum_strideq, [widthd + ALF_GRADIENT_BORDER * 2]
+    add             sum_strideq, 15
+    and             sum_strideq, ~15            ; align to 16
+    add             sum_strideq, sum_strideq    ; two rows a time
+
+    add             gradient_sumq, 8            ; first 4 words are garbage
+
+    lea             sum_stride3q, [3 * sum_strideq]
+
+    xor             yd, yd
+    and             vb_posd, ~7                 ; floor align to 8
+    pxor            m15, m15
+
+.loop_sum_h:
+    xor             xd, xd
+    .loop_sum_w16:
+        lea             wd, [widthd]
+        sub             wd, xd
+        cmp             wd, 16
+        jl              .loop_sum_w16_end
+
+        ALF_CLASSIFY_16x8
+
+        add             xd, 16
+        jmp             .loop_sum_w16
+    .loop_sum_w16_end:
+
+    cmp             wd, 0
+    je              .loop_sum_w_end
+
+    ALF_CLASSIFY_Wx8
+
+.loop_sum_w_end:
+    lea             gradient_sumq, [gradient_sumq + 4 * sum_strideq]
+    lea             transpose_idxq, [transpose_idxq + 2 * widthq]
+    lea             class_idxq, [class_idxq + 2 * widthq]
+
+    add             yd, 8
+    cmp             yd, heightd
+    jl              .loop_sum_h
+
+    RET
+%endmacro
+
 %if ARCH_X86_64
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 ALF_FILTER 16
 ALF_FILTER 8
+ALF_CLASSIFY 16
+ALF_CLASSIFY 8
 %endif
 %endif
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index e672409cd7..ce4660d933 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -94,12 +94,18 @@ void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,
 void BF(ff_vvc_alf_filter_chroma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,              \
     const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height,             \
     const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \
+void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum,                               \
+    const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, intptr_t vb_pos); \
+void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, const int *gradient_sum, \
+    intptr_t width, intptr_t height, intptr_t vb_pos, intptr_t bit_depth);                   \
 
 #define ALF_PROTOTYPES(bpc, bd, opt)                                                         \
 void bf(ff_vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
     int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos);    \
 void bf(ff_vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
     int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos);    \
+void bf(ff_vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx,                    \
+    const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp); \
 
 ALF_BPC_PROTOTYPES(8, avx2)
 ALF_BPC_PROTOTYPES(16, avx2)
@@ -217,6 +223,12 @@ void bf(ff_vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, c
     BF(ff_vvc_alf_filter_chroma, bpc, opt)(dst, dst_stride, src, src_stride, width, height,  \
         filter, clip, 0, vb_pos,(1 << bd) - 1);                                              \
 }                                                                                            \
+void bf(ff_vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx,                    \
+    const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp) \
+{                                                                                            \
+    BF(ff_vvc_alf_classify_grad, bpc, opt)(gradient_tmp, src, src_stride, width, height, vb_pos); \
+    BF(ff_vvc_alf_classify, bpc, opt)(class_idx, transpose_idx, gradient_tmp, width, height, vb_pos, bd); \
+}                                                                                            \
 
 ALF_FUNCS(8, 8, avx2)
 ALF_FUNCS(16, 10, avx2)
@@ -297,6 +309,7 @@ ALF_FUNCS(16, 12, avx2)
 
 #define ALF_INIT(bd) do {                                                                    \
     c->alf.filter[LUMA] = ff_vvc_alf_filter_luma_##bd##_avx2;                                \
    c->alf.filter[CHROMA] = ff_vvc_alf_filter_chroma_##bd##_avx2;                             \
+    c->alf.classify = ff_vvc_alf_classify_##bd##_avx2;                                       \
 } while (0)
 #endif