mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-28 20:53:54 +02:00
x86/hevc: add 12bits support for deblocking filter
cherry picked from commit 97d46afe320c7d61d7b9525e5f5588355cde4bb0 Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
a06fac353c
commit
7bdcf5c934
@ -26,10 +26,12 @@
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
pw_pixel_max: times 8 dw ((1 << 10)-1)
|
||||
pw_m1: times 8 dw -1
|
||||
pw_m2: times 8 dw -2
|
||||
pd_1 : times 4 dd 1
|
||||
pw_pixel_max_12: times 8 dw ((1 << 12)-1)
|
||||
pw_pixel_max_10: times 8 dw ((1 << 10)-1)
|
||||
pw_pixel_max: times 8 dw ((1 << 10)-1)
|
||||
pw_m1: times 8 dw -1
|
||||
pw_m2: times 8 dw -2
|
||||
pd_1 : times 4 dd 1
|
||||
|
||||
cextern pw_4
|
||||
cextern pw_8
|
||||
@ -136,12 +138,12 @@ INIT_XMM sse2
|
||||
|
||||
; in: 4 rows of 8 words in m0..m3
|
||||
; out: 8 rows of 4 words in %1..%8
|
||||
%macro TRANSPOSE8x4W_STORE 8
|
||||
%macro TRANSPOSE8x4W_STORE 9
|
||||
pxor m5, m5; zeros reg
|
||||
CLIPW m0, m5, [pw_pixel_max]
|
||||
CLIPW m1, m5, [pw_pixel_max]
|
||||
CLIPW m2, m5, [pw_pixel_max]
|
||||
CLIPW m3, m5, [pw_pixel_max]
|
||||
CLIPW m0, m5, %9
|
||||
CLIPW m1, m5, %9
|
||||
CLIPW m2, m5, %9
|
||||
CLIPW m3, m5, %9
|
||||
|
||||
punpckhwd m4, m0, m1
|
||||
punpcklwd m0, m1
|
||||
@ -264,18 +266,18 @@ INIT_XMM sse2
|
||||
|
||||
; in: 8 rows of 8 words in m0..m7 (m8 is used as a scratch register)
|
||||
; out: 8 rows of 8 words in %1..%8
|
||||
%macro TRANSPOSE8x8W_STORE 8
|
||||
%macro TRANSPOSE8x8W_STORE 9
|
||||
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
|
||||
|
||||
pxor m8, m8
|
||||
CLIPW m0, m8, [pw_pixel_max]
|
||||
CLIPW m1, m8, [pw_pixel_max]
|
||||
CLIPW m2, m8, [pw_pixel_max]
|
||||
CLIPW m3, m8, [pw_pixel_max]
|
||||
CLIPW m4, m8, [pw_pixel_max]
|
||||
CLIPW m5, m8, [pw_pixel_max]
|
||||
CLIPW m6, m8, [pw_pixel_max]
|
||||
CLIPW m7, m8, [pw_pixel_max]
|
||||
CLIPW m0, m8, %9
|
||||
CLIPW m1, m8, %9
|
||||
CLIPW m2, m8, %9
|
||||
CLIPW m3, m8, %9
|
||||
CLIPW m4, m8, %9
|
||||
CLIPW m5, m8, %9
|
||||
CLIPW m6, m8, %9
|
||||
CLIPW m7, m8, %9
|
||||
|
||||
movdqu %1, m0
|
||||
movdqu %2, m1
|
||||
@ -678,7 +680,17 @@ cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
|
||||
add pixq, r3strideq
|
||||
TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
|
||||
CHROMA_DEBLOCK_BODY 10
|
||||
TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
|
||||
TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
|
||||
RET
|
||||
|
||||
; hevc_v_loop_filter_chroma_12: chroma loop filter, "v" direction, 12-bit variant (per its name).
; cglobal declares 3 arguments (pix, stride, tc), 5 general-purpose registers
; (pix0 and r3stride name the two extra ones) and 7 XMM registers.
; Flow: load rows through TRANSPOSE4x8W_LOAD, run CHROMA_DEBLOCK_BODY with
; parameter 12, then write back through TRANSPOSE8x4W_STORE, which receives
; [pw_pixel_max_12] as the upper bound for its CLIPW calls.
cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
|
||||
sub pixq, 4 ; move pix back 4 bytes; NOTE(review): presumably two 16-bit samples before the edge -- confirm
|
||||
lea r3strideq, [3*strideq] ; r3stride = 3 * stride
|
||||
mov pix0q, pixq ; pix0 = adjusted start address (first row)
|
||||
add pixq, r3strideq ; pix = pix0 + 3 * stride
|
||||
TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq) ; load the rows addressed from pix0/pix
|
||||
CHROMA_DEBLOCK_BODY 12 ; filter body, instantiated with 12
|
||||
TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12] ; clip m0..m3 against pw_pixel_max_12, then store to the same rows
|
||||
RET
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
@ -713,8 +725,24 @@ cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
|
||||
movu m3, [pixq+strideq]; q1
|
||||
CHROMA_DEBLOCK_BODY 10
|
||||
pxor m5, m5; zeros reg
|
||||
CLIPW m1, m5, [pw_pixel_max]
|
||||
CLIPW m2, m5, [pw_pixel_max]
|
||||
CLIPW m1, m5, [pw_pixel_max_10]
|
||||
CLIPW m2, m5, [pw_pixel_max_10]
|
||||
movu [pix0q+strideq], m1
|
||||
movu [pixq], m2
|
||||
RET
|
||||
|
||||
; hevc_h_loop_filter_chroma_12: chroma loop filter, "h" direction, 12-bit variant (per its name).
; cglobal declares 3 arguments (pix, stride, tc), 4 general-purpose registers
; (pix0 names the extra one) and 7 XMM registers.
; Reads the rows at pix - 2*stride, pix - stride, pix and pix + stride,
; runs CHROMA_DEBLOCK_BODY with parameter 12, then clips and writes back
; only the two middle rows (m1 and m2).
cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
|
||||
mov pix0q, pixq ; pix0 = pix
|
||||
sub pix0q, strideq ; pix0 = pix - stride
|
||||
sub pix0q, strideq ; pix0 = pix - 2 * stride
|
||||
movu m0, [pix0q]; p1 (row pix - 2 * stride)
|
||||
movu m1, [pix0q+strideq]; p0 (row pix - stride)
|
||||
movu m2, [pixq]; q0 (row pix)
|
||||
movu m3, [pixq+strideq]; q1 (row pix + stride)
|
||||
CHROMA_DEBLOCK_BODY 12 ; filter body, instantiated with 12
|
||||
pxor m5, m5; zeros reg (lower bound for CLIPW)
|
||||
CLIPW m1, m5, [pw_pixel_max_12] ; clip p0 between m5 (zero) and pw_pixel_max_12
|
||||
CLIPW m2, m5, [pw_pixel_max_12] ; clip q0 between m5 (zero) and pw_pixel_max_12
|
||||
movu [pix0q+strideq], m1 ; store p0 back to row pix - stride
|
||||
movu [pixq], m2 ; store q0 back to row pix
|
||||
RET
|
||||
@ -744,7 +772,19 @@ cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc
|
||||
TRANSPOSE8x8W_LOAD PASS8ROWS(r6, pixq, strideq, r5)
|
||||
LUMA_DEBLOCK_BODY 10, v
|
||||
.store:
|
||||
TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5)
|
||||
TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5), [pw_pixel_max_10]
|
||||
.bypassluma:
|
||||
RET
|
||||
|
||||
; hevc_v_loop_filter_luma_12: luma loop filter, "v" direction, 12-bit variant (per its name).
; cglobal declares 4 arguments (pix, stride, beta, tc), 15 general-purpose
; registers and 16 XMM registers.
; Flow: load rows through TRANSPOSE8x8W_LOAD, run LUMA_DEBLOCK_BODY with
; parameters (12, v), then write back through TRANSPOSE8x8W_STORE, which
; receives [pw_pixel_max_12] as the upper bound for its CLIPW calls.
cglobal hevc_v_loop_filter_luma_12, 4, 15, 16, pix, stride, beta, tc
|
||||
sub pixq, 8 ; move pix back 8 bytes; NOTE(review): presumably four 16-bit samples before the edge -- confirm
|
||||
lea r5, [3 * strideq] ; r5 = 3 * stride
|
||||
mov r6, pixq ; r6 = adjusted start address (first row)
|
||||
add pixq, r5 ; pix = r6 + 3 * stride
|
||||
TRANSPOSE8x8W_LOAD PASS8ROWS(r6, pixq, strideq, r5) ; load the rows addressed from r6/pix
|
||||
LUMA_DEBLOCK_BODY 12, v ; filter body, instantiated with (12, v)
|
||||
.store:
|
||||
TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5), [pw_pixel_max_12] ; NOTE(review): r0/r1 are presumably the registers behind pixq/strideq (first two cglobal args) -- confirm
|
||||
.bypassluma: ; NOTE(review): not referenced in the visible code; presumably a skip target used inside LUMA_DEBLOCK_BODY -- confirm
|
||||
RET
|
||||
|
||||
@ -803,12 +843,12 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
|
||||
LUMA_DEBLOCK_BODY 10, h
|
||||
.store:
|
||||
pxor m8, m8; zeros reg
|
||||
CLIPW m1, m8, [pw_pixel_max]
|
||||
CLIPW m2, m8, [pw_pixel_max]
|
||||
CLIPW m3, m8, [pw_pixel_max]
|
||||
CLIPW m4, m8, [pw_pixel_max]
|
||||
CLIPW m5, m8, [pw_pixel_max]
|
||||
CLIPW m6, m8, [pw_pixel_max]
|
||||
CLIPW m1, m8, [pw_pixel_max_10]
|
||||
CLIPW m2, m8, [pw_pixel_max_10]
|
||||
CLIPW m3, m8, [pw_pixel_max_10]
|
||||
CLIPW m4, m8, [pw_pixel_max_10]
|
||||
CLIPW m5, m8, [pw_pixel_max_10]
|
||||
CLIPW m6, m8, [pw_pixel_max_10]
|
||||
movdqu [pix0q + strideq], m1; p2
|
||||
movdqu [pix0q + 2 * strideq], m2; p1
|
||||
movdqu [pix0q + src3strideq], m3; p0
|
||||
@ -817,6 +857,38 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
|
||||
movdqu [pixq + 2 * strideq], m6; q2
|
||||
.bypassluma:
|
||||
RET
|
||||
|
||||
; hevc_h_loop_filter_luma_12: luma loop filter, "h" direction, 12-bit variant (per its name).
; cglobal declares 4 arguments (pix, stride, beta, tc), 15 general-purpose
; registers and 16 XMM registers; count, pix0 and src3stride name registers
; beyond the 4 declared arguments.
; Reads the eight rows from pix - 4*stride to pix + 3*stride into m0..m7,
; runs LUMA_DEBLOCK_BODY with parameters (12, h), then clips and writes back
; the six inner rows (m1..m6). The outer rows m0 and m7 are not stored.
cglobal hevc_h_loop_filter_luma_12, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
|
||||
lea src3strideq, [3 * strideq] ; src3stride = 3 * stride
|
||||
mov pix0q, pixq ; pix0 = pix
|
||||
sub pix0q, src3strideq ; pix0 = pix - 3 * stride
|
||||
sub pix0q, strideq ; pix0 = pix - 4 * stride
|
||||
movdqu m0, [pix0q]; p3 (row pix - 4 * stride)
|
||||
movdqu m1, [pix0q + strideq]; p2 (row pix - 3 * stride)
|
||||
movdqu m2, [pix0q + 2 * strideq]; p1 (row pix - 2 * stride)
|
||||
movdqu m3, [pix0q + src3strideq]; p0 (row pix - stride)
|
||||
movdqu m4, [pixq]; q0 (row pix)
|
||||
movdqu m5, [pixq + strideq]; q1 (row pix + stride)
|
||||
movdqu m6, [pixq + 2 * strideq]; q2 (row pix + 2 * stride)
|
||||
movdqu m7, [pixq + src3strideq]; q3 (row pix + 3 * stride)
|
||||
LUMA_DEBLOCK_BODY 12, h ; filter body, instantiated with (12, h)
|
||||
.store:
|
||||
pxor m8, m8; zeros reg (lower bound for CLIPW)
|
||||
CLIPW m1, m8, [pw_pixel_max_12] ; clip p2 between m8 (zero) and pw_pixel_max_12
|
||||
CLIPW m2, m8, [pw_pixel_max_12] ; clip p1
|
||||
CLIPW m3, m8, [pw_pixel_max_12] ; clip p0
|
||||
CLIPW m4, m8, [pw_pixel_max_12] ; clip q0
|
||||
CLIPW m5, m8, [pw_pixel_max_12] ; clip q1
|
||||
CLIPW m6, m8, [pw_pixel_max_12] ; clip q2
|
||||
movdqu [pix0q + strideq], m1; p2
|
||||
movdqu [pix0q + 2 * strideq], m2; p1
|
||||
movdqu [pix0q + src3strideq], m3; p0
|
||||
movdqu [pixq ], m4; q0
|
||||
movdqu [pixq + strideq], m5; q1
|
||||
movdqu [pixq + 2 * strideq], m6; q2
|
||||
.bypassluma: ; NOTE(review): not referenced in the visible code; presumably a skip target used inside LUMA_DEBLOCK_BODY -- confirm
|
||||
RET
|
||||
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
|
@ -46,10 +46,13 @@ LFL_FUNC(v, depth, opt)
|
||||
|
||||
LFC_FUNCS(uint8_t, 8, sse2)
|
||||
LFC_FUNCS(uint8_t, 10, sse2)
|
||||
LFC_FUNCS(uint8_t, 12, sse2)
|
||||
LFL_FUNCS(uint8_t, 8, sse2)
|
||||
LFL_FUNCS(uint8_t, 10, sse2)
|
||||
LFL_FUNCS(uint8_t, 12, sse2)
|
||||
LFL_FUNCS(uint8_t, 8, ssse3)
|
||||
LFL_FUNCS(uint8_t, 10, ssse3)
|
||||
LFL_FUNCS(uint8_t, 12, ssse3)
|
||||
|
||||
#if HAVE_SSE2_EXTERNAL
|
||||
void ff_hevc_idct32_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
|
||||
@ -499,5 +502,18 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
|
||||
c->transform_dc_add[3] = ff_hevc_idct32_dc_add_10_avx2;
|
||||
|
||||
}
|
||||
} else if (bit_depth == 12) {
|
||||
if (EXTERNAL_SSE2(mm_flags)) {
|
||||
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
|
||||
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
|
||||
if (ARCH_X86_64) {
|
||||
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
|
||||
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
|
||||
}
|
||||
}
|
||||
if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
|
||||
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
|
||||
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user