1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-28 20:53:54 +02:00

x86/hevc: add 12-bit support for the deblocking filter

cherry picked from commit 97d46afe320c7d61d7b9525e5f5588355cde4bb0

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Mickaël Raulet 2014-07-25 17:55:40 +02:00 committed by Michael Niedermayer
parent a06fac353c
commit 7bdcf5c934
2 changed files with 116 additions and 28 deletions

View File

@ -26,10 +26,12 @@
SECTION_RODATA
pw_pixel_max: times 8 dw ((1 << 10)-1)
pw_m1: times 8 dw -1
pw_m2: times 8 dw -2
pd_1 : times 4 dd 1
pw_pixel_max_12: times 8 dw ((1 << 12)-1)
pw_pixel_max_10: times 8 dw ((1 << 10)-1)
pw_pixel_max: times 8 dw ((1 << 10)-1)
pw_m1: times 8 dw -1
pw_m2: times 8 dw -2
pd_1 : times 4 dd 1
cextern pw_4
cextern pw_8
@ -136,12 +138,12 @@ INIT_XMM sse2
; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8
%macro TRANSPOSE8x4W_STORE 8
%macro TRANSPOSE8x4W_STORE 9
pxor m5, m5; zeros reg
CLIPW m0, m5, [pw_pixel_max]
CLIPW m1, m5, [pw_pixel_max]
CLIPW m2, m5, [pw_pixel_max]
CLIPW m3, m5, [pw_pixel_max]
CLIPW m0, m5, %9
CLIPW m1, m5, %9
CLIPW m2, m5, %9
CLIPW m3, m5, %9
punpckhwd m4, m0, m1
punpcklwd m0, m1
@ -264,18 +266,18 @@ INIT_XMM sse2
; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 words in %1..%8
%macro TRANSPOSE8x8W_STORE 8
%macro TRANSPOSE8x8W_STORE 9
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
pxor m8, m8
CLIPW m0, m8, [pw_pixel_max]
CLIPW m1, m8, [pw_pixel_max]
CLIPW m2, m8, [pw_pixel_max]
CLIPW m3, m8, [pw_pixel_max]
CLIPW m4, m8, [pw_pixel_max]
CLIPW m5, m8, [pw_pixel_max]
CLIPW m6, m8, [pw_pixel_max]
CLIPW m7, m8, [pw_pixel_max]
CLIPW m0, m8, %9
CLIPW m1, m8, %9
CLIPW m2, m8, %9
CLIPW m3, m8, %9
CLIPW m4, m8, %9
CLIPW m5, m8, %9
CLIPW m6, m8, %9
CLIPW m7, m8, %9
movdqu %1, m0
movdqu %2, m1
@ -678,7 +680,17 @@ cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
add pixq, r3strideq
TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
CHROMA_DEBLOCK_BODY 10
TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
RET
; 12-bit variant of the vertical-edge chroma loop filter.
; Named arguments: pix, stride, tc; pix0 and r3stride are extra scratch
; registers. Following the x86inc cglobal convention the numeric operands
; are: 3 arguments, 5 general-purpose registers, 7 vector registers.
; The filtering itself is done by CHROMA_DEBLOCK_BODY (defined outside this
; view); this wrapper only sets up row pointers, loads/transposes the
; samples and stores the clamped result.
cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
; move the base pointer 4 bytes to the left of the incoming pix
; (presumably two 16-bit samples before the edge - confirm against the loader)
sub pixq, 4
; r3stride = 3 * stride
lea r3strideq, [3*strideq]
; pix0 = first row; pix = pix0 + 3 * stride (second base pointer for PASS8ROWS)
mov pix0q, pixq
add pixq, r3strideq
; load the rows addressed by PASS8ROWS and transpose them into registers
; (both macros are defined outside this view)
TRANSPOSE4x8W_LOAD PASS8ROWS(pix0q, pixq, strideq, r3strideq)
; run the chroma deblocking body with bit depth 12
CHROMA_DEBLOCK_BODY 12
; transpose back and store; the extra operand is the upper clip bound,
; pw_pixel_max_12 = (1 << 12) - 1 in every word lane
TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
RET
;-----------------------------------------------------------------------------
@ -713,8 +725,24 @@ cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
movu m3, [pixq+strideq]; q1
CHROMA_DEBLOCK_BODY 10
pxor m5, m5; zeros reg
CLIPW m1, m5, [pw_pixel_max]
CLIPW m2, m5, [pw_pixel_max]
CLIPW m1, m5, [pw_pixel_max_10]
CLIPW m2, m5, [pw_pixel_max_10]
movu [pix0q+strideq], m1
movu [pixq], m2
RET
; 12-bit variant of the horizontal-edge chroma loop filter.
; Named arguments: pix, stride, tc; pix0 is an extra scratch register.
; Following the x86inc cglobal convention the numeric operands are:
; 3 arguments, 4 general-purpose registers, 7 vector registers.
; Four rows around pix are loaded (p1, p0 before it; q0, q1 from it on),
; filtered by CHROMA_DEBLOCK_BODY (defined outside this view), and only
; p0 and q0 are written back.
cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
; pix0 = pix - 2 * stride (row holding p1)
mov pix0q, pixq
sub pix0q, strideq
sub pix0q, strideq
movu m0, [pix0q]; p1
movu m1, [pix0q+strideq]; p0
movu m2, [pixq]; q0
movu m3, [pixq+strideq]; q1
; run the chroma deblocking body with bit depth 12
CHROMA_DEBLOCK_BODY 12
pxor m5, m5; zeros reg
; CLIPW is given m5 (zero) and pw_pixel_max_12 = (1 << 12) - 1 as its
; bounds for the two rows that get stored
CLIPW m1, m5, [pw_pixel_max_12]
CLIPW m2, m5, [pw_pixel_max_12]
; write back p0 and q0; the p1 and q1 rows are not stored
movu [pix0q+strideq], m1
movu [pixq], m2
RET
@ -744,7 +772,19 @@ cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc
TRANSPOSE8x8W_LOAD PASS8ROWS(r6, pixq, strideq, r5)
LUMA_DEBLOCK_BODY 10, v
.store:
TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5)
TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5), [pw_pixel_max_10]
.bypassluma:
RET
; 12-bit variant of the vertical-edge luma loop filter.
; Named arguments: pix, stride, beta, tc. Following the x86inc cglobal
; convention the numeric operands are: 4 arguments, 15 general-purpose
; registers, 16 vector registers. r5 and r6 are used as unnamed scratch.
; The filtering itself is done by LUMA_DEBLOCK_BODY (defined outside this
; view).
cglobal hevc_v_loop_filter_luma_12, 4, 15, 16, pix, stride, beta, tc
; move the base pointer 8 bytes to the left of the incoming pix
; (presumably four 16-bit samples before the edge - confirm against the loader)
sub pixq, 8
; r5 = 3 * stride
lea r5, [3 * strideq]
; r6 = first row; pix = r6 + 3 * stride (second base pointer for PASS8ROWS)
mov r6, pixq
add pixq, r5
; load the rows addressed by PASS8ROWS and transpose them into registers
TRANSPOSE8x8W_LOAD PASS8ROWS(r6, pixq, strideq, r5)
; run the luma deblocking body with bit depth 12, "v" variant
LUMA_DEBLOCK_BODY 12, v
; NOTE(review): .store and .bypassluma are not referenced in this function's
; visible code; presumably LUMA_DEBLOCK_BODY jumps to them - confirm.
.store:
; transpose back, clamp and store; r0/r1 are presumably the numbered x86inc
; names of pix/stride (first two cglobal arguments). The last operand is the
; upper clip bound, pw_pixel_max_12 = (1 << 12) - 1 in every word lane.
TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5), [pw_pixel_max_12]
.bypassluma:
RET
@ -803,12 +843,12 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
LUMA_DEBLOCK_BODY 10, h
.store:
pxor m8, m8; zeros reg
CLIPW m1, m8, [pw_pixel_max]
CLIPW m2, m8, [pw_pixel_max]
CLIPW m3, m8, [pw_pixel_max]
CLIPW m4, m8, [pw_pixel_max]
CLIPW m5, m8, [pw_pixel_max]
CLIPW m6, m8, [pw_pixel_max]
CLIPW m1, m8, [pw_pixel_max_10]
CLIPW m2, m8, [pw_pixel_max_10]
CLIPW m3, m8, [pw_pixel_max_10]
CLIPW m4, m8, [pw_pixel_max_10]
CLIPW m5, m8, [pw_pixel_max_10]
CLIPW m6, m8, [pw_pixel_max_10]
movdqu [pix0q + strideq], m1; p2
movdqu [pix0q + 2 * strideq], m2; p1
movdqu [pix0q + src3strideq], m3; p0
@ -817,6 +857,38 @@ cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix
movdqu [pixq + 2 * strideq], m6; q2
.bypassluma:
RET
; 12-bit variant of the horizontal-edge luma loop filter.
; Named arguments/registers: pix, stride, beta, tc, count, pix0, src3stride.
; Following the x86inc cglobal convention the numeric operands are:
; 4 arguments, 15 general-purpose registers, 16 vector registers.
; Eight rows around pix are loaded (p3..p0 before it, q0..q3 from it on),
; filtered by LUMA_DEBLOCK_BODY (defined outside this view), and the six
; rows p2..q2 are written back.
cglobal hevc_h_loop_filter_luma_12, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
; src3stride = 3 * stride
lea src3strideq, [3 * strideq]
; pix0 = pix - 4 * stride (row holding p3)
mov pix0q, pixq
sub pix0q, src3strideq
sub pix0q, strideq
movdqu m0, [pix0q]; p3
movdqu m1, [pix0q + strideq]; p2
movdqu m2, [pix0q + 2 * strideq]; p1
movdqu m3, [pix0q + src3strideq]; p0
movdqu m4, [pixq]; q0
movdqu m5, [pixq + strideq]; q1
movdqu m6, [pixq + 2 * strideq]; q2
movdqu m7, [pixq + src3strideq]; q3
; run the luma deblocking body with bit depth 12, "h" variant
LUMA_DEBLOCK_BODY 12, h
; NOTE(review): .store and .bypassluma are not referenced in this function's
; visible code; presumably LUMA_DEBLOCK_BODY jumps to them - confirm.
.store:
pxor m8, m8; zeros reg
; CLIPW is given m8 (zero) and pw_pixel_max_12 = (1 << 12) - 1 as its
; bounds for each of the six rows that get stored
CLIPW m1, m8, [pw_pixel_max_12]
CLIPW m2, m8, [pw_pixel_max_12]
CLIPW m3, m8, [pw_pixel_max_12]
CLIPW m4, m8, [pw_pixel_max_12]
CLIPW m5, m8, [pw_pixel_max_12]
CLIPW m6, m8, [pw_pixel_max_12]
; write back p2..q2; the outermost rows p3 (m0) and q3 (m7) are not stored
movdqu [pix0q + strideq], m1; p2
movdqu [pix0q + 2 * strideq], m2; p1
movdqu [pix0q + src3strideq], m3; p0
movdqu [pixq ], m4; q0
movdqu [pixq + strideq], m5; q1
movdqu [pixq + 2 * strideq], m6; q2
.bypassluma:
RET
%endmacro
INIT_XMM sse2

View File

@ -46,10 +46,13 @@ LFL_FUNC(v, depth, opt)
LFC_FUNCS(uint8_t, 8, sse2)
LFC_FUNCS(uint8_t, 10, sse2)
LFC_FUNCS(uint8_t, 12, sse2)
LFL_FUNCS(uint8_t, 8, sse2)
LFL_FUNCS(uint8_t, 10, sse2)
LFL_FUNCS(uint8_t, 12, sse2)
LFL_FUNCS(uint8_t, 8, ssse3)
LFL_FUNCS(uint8_t, 10, ssse3)
LFL_FUNCS(uint8_t, 12, ssse3)
#if HAVE_SSE2_EXTERNAL
void ff_hevc_idct32_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
@ -499,5 +502,18 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->transform_dc_add[3] = ff_hevc_idct32_dc_add_10_avx2;
}
} else if (bit_depth == 12) {
if (EXTERNAL_SSE2(mm_flags)) {
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
if (ARCH_X86_64) {
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
}
}
if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
}
}
}