mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
avcodec/h264: mmx2, sse2, avx 10-bit h chroma deblock/loop filter
Yorkfield: - mmx2: 2.45x (279 vs. 114 cycles) - sse2: 3.36x (279 vs. 83 cycles) Nehalem: - mmx2: 2.10x (192 vs. 92 cycles) - sse2: 2.84x (192 vs. 68 cycles) Skylake: - mmx2: 1.75x (170 vs. 97 cycles) - sse2: 2.47x (170 vs. 69 cycles) - avx: 2.47x (170 vs. 69 cycles)
This commit is contained in:
parent
58ca2ef62e
commit
add21d0bb3
@ -843,6 +843,88 @@ DEBLOCK_LUMA_INTRA
|
||||
mova [r0+2*r1], m2
|
||||
%endmacro
|
||||
|
||||
; PASS8ROWS(base, base3, stride, stride3)
; Expands to a comma-separated list of eight memory operands, suitable for
; passing as the eight arguments of a macro:
;   [base], [base+stride], [base+stride*2], [base3],
;   [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
; When the caller passes base3 = base + 3*stride and stride3 = 3*stride this
; is [base],...,[base+7*stride], i.e. eight consecutive rows.
; The second pointer and the pre-multiplied stride exist because an x86
; effective address can only scale an index by 1, 2, 4 or 8.
|
||||
%define PASS8ROWS(base, base3, stride, stride3) \
|
||||
[base], [base+stride], [base+stride*2], [base3], \
|
||||
[base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
|
||||
|
||||
; TRANSPOSE4x8W_LOAD: load an 8x4 block of words and transpose it.
; in: 8 rows of 4 words in %1..%8 (each one is read with a 64-bit movq)
|
||||
; out: 4 rows of 8 words in m0..m3; m<i> holds word i of every input row,
;      ordered by input row (row 0 in the lowest word)
; clobbers: m4, m5, m6
; NOTE(review): the three-operand punpck* forms below are not plain SSE2
; syntax; they presumably rely on instruction wrappers defined outside this
; view (dst = src1, then dst op= src2) - confirm.
|
||||
%macro TRANSPOSE4x8W_LOAD 8
|
||||
; Rows 0..3 go to m0, m2, m1, m3 (in that order).
movq m0, %1
|
||||
movq m2, %2
|
||||
movq m1, %3
|
||||
movq m3, %4
|
||||
|
||||
; Interleave the words of rows 0/1 and of rows 2/3, then interleave the
; resulting dwords:
;   m0 = word 0 of rows 0..3 (low half), word 1 of rows 0..3 (high half)
;   m2 = word 2 of rows 0..3 (low half), word 3 of rows 0..3 (high half)
punpcklwd m0, m2
|
||||
punpcklwd m1, m3
|
||||
punpckhdq m2, m0, m1
|
||||
punpckldq m0, m1
|
||||
|
||||
; Same again for rows 4..7, using m4/m6 as results and m5/m3 as temporaries.
movq m4, %5
|
||||
movq m6, %6
|
||||
movq m5, %7
|
||||
movq m3, %8
|
||||
|
||||
;   m4 = word 0 of rows 4..7 (low half), word 1 of rows 4..7 (high half)
;   m6 = word 2 of rows 4..7 (low half), word 3 of rows 4..7 (high half)
punpcklwd m4, m6
|
||||
punpcklwd m5, m3
|
||||
punpckhdq m6, m4, m5
|
||||
punpckldq m4, m5
|
||||
|
||||
; Join the matching 64-bit halves of the two groups so that each register
; holds one word position for all eight rows:
;   m1 = word 1, m0 = word 0, m3 = word 3, m2 = word 2
punpckhqdq m1, m0, m4
|
||||
punpcklqdq m0, m4
|
||||
punpckhqdq m3, m2, m6
|
||||
punpcklqdq m2, m6
|
||||
%endmacro
|
||||
|
||||
; TRANSPOSE8x4W_STORE: transpose m0..m3 and write them out as eight rows.
; in: 4 rows of 8 words in m0..m3
|
||||
; out: 8 rows of 4 words in %1..%8 (each one is written as 64 bits)
; NOTE(review): TRANSPOSE4x4W is defined outside this view. Judging by the
; stores below, it is expected to leave output row 2k in the low half and
; output row 2k+1 in the high half of m<k>; its fifth argument (4) is
; presumably a scratch register index, which would make m4 clobbered -
; confirm against its definition.
|
||||
%macro TRANSPOSE8x4W_STORE 8
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||
; movq writes the low 64 bits of a register, movhps the high 64 bits, so
; every register supplies two consecutive destination rows.
movq %1, m0
|
||||
movhps %2, m0
|
||||
movq %3, m1
|
||||
movhps %4, m1
|
||||
movq %5, m2
|
||||
movhps %6, m2
|
||||
movq %7, m3
|
||||
movhps %8, m3
|
||||
%endmacro
|
||||
|
||||
; CHROMA_H_LOAD: read the pixels on both sides of a vertical edge and
; transpose them into m0..m3, then save m0 and m3 to memory.
; Every row is read starting 4 bytes before pix_q. pix_q and stride_q are
; not macro parameters; they are taken from the invoking function.
; %1 = base + 3*stride
|
||||
; %2 = 3*stride (unused on mmx)
|
||||
; %3, %4 = place to store p1 and q1 values
;          (m0 is written to %3 and m3 to %4 after the transpose)
|
||||
%macro CHROMA_H_LOAD 4
|
||||
%if mmsize == 8
|
||||
; 8-byte registers: only four rows are handled per invocation
; (pix_q + 0, 1, 2 strides, and %1).
movq m0, [pix_q - 4]
|
||||
movq m1, [pix_q + stride_q - 4]
|
||||
movq m2, [pix_q + 2*stride_q - 4]
|
||||
movq m3, [%1 - 4]
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||
%else
|
||||
; Wider registers: eight rows are loaded and transposed in one go.
TRANSPOSE4x8W_LOAD PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
|
||||
%endif
|
||||
; Keep copies of m0 and m3 so that CHROMA_H_STORE can reload them later.
mova %3, m0
|
||||
mova %4, m3
|
||||
%endmacro
|
||||
|
||||
; CHROMA_H_STORE: counterpart of CHROMA_H_LOAD. Reloads m0 and m3 from
; memory, transposes m0..m3 back and writes the rows to the same addresses
; CHROMA_H_LOAD read from (4 bytes before pix_q on each row).
; m1 and m2 are taken as they are on entry. pix_q and stride_q come from the
; invoking function.
; %1 = base + 3*stride
|
||||
; %2 = 3*stride (unused on mmx)
|
||||
; %3, %4 = place to load p1 and q1 values
;          (%3 is loaded into m0 and %4 into m3 before the transpose)
|
||||
%macro CHROMA_H_STORE 4
|
||||
mova m0, %3
|
||||
mova m3, %4
|
||||
%if mmsize == 8
|
||||
; 8-byte registers: four rows per invocation.
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||
movq [pix_q - 4], m0
|
||||
movq [pix_q + stride_q - 4], m1
|
||||
movq [pix_q + 2*stride_q - 4], m2
|
||||
movq [%1 - 4], m3
|
||||
%else
|
||||
; Wider registers: eight rows are transposed and stored in one go.
TRANSPOSE8x4W_STORE PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro CHROMA_V_LOAD_TC 2
|
||||
movd %1, [%2]
|
||||
punpcklbw %1, %1
|
||||
@ -914,6 +996,42 @@ cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
|
||||
%else
|
||||
RET
|
||||
%endif
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
|
||||
; int8_t *tc0)
;
; Filters across a vertical edge: rows are loaded from pix - 4 bytes,
; transposed, filtered with the same helper macros the register-wise code
; uses (LOAD_AB, LOAD_MASK, CHROMA_V_LOAD_TC, DEBLOCK_P0_Q0 - all defined
; outside this view) and transposed back on store.
;
; Register use visible in this function:
;   r5          = pix + 3*stride
;   r6          = 3*stride (mmsize != 8), or loop counter r6d (mmsize == 8)
;   [rsp]       = saved copy of m0 (p1 per CHROMA_H_LOAD's comment)
;   [rsp+mmsize]= saved copy of m3 (q1 per CHROMA_H_LOAD's comment)
; NOTE(review): the cglobal operands are presumably, in x86inc order, 5 args,
; 7 GPRs, 8 vector registers and 2*mmsize bytes of stack, followed by the
; argument names - confirm against x86inc.
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal deblock_h_chroma_10, 5, 7, 8, 2*mmsize, pix_, stride_, alpha_, beta_, tc0_
|
||||
; alpha and beta are multiplied by 4; presumably this rescales 8-bit
; thresholds to the 10-bit sample range - confirm.
shl alpha_d, 2
|
||||
shl beta_d, 2
|
||||
mov r5, pix_q
|
||||
lea r6, [3*stride_q]
|
||||
add r5, r6
|
||||
%if mmsize == 8
|
||||
; With 8-byte registers each pass covers four rows, so the body runs twice.
; r6d is reused as the pass counter; the 3*stride value it held is not
; needed by the mmsize == 8 paths of CHROMA_H_LOAD/STORE.
mov r6d, 2
|
||||
.loop:
|
||||
%endif
|
||||
|
||||
; After the load m0..m3 hold the four transposed word columns; m0 and m3 are
; also spilled to the two stack slots.
CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize]
|
||||
LOAD_AB m4, m5, alpha_d, beta_d
|
||||
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
|
||||
; m6 = max(loaded tc value - 3, 0) per word, then m7 &= m6.
pxor m4, m4
|
||||
CHROMA_V_LOAD_TC m6, tc0_q
|
||||
psubw m6, [pw_3]
|
||||
pmaxsw m6, m4
|
||||
pand m7, m6
|
||||
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
|
||||
; The store reloads m0/m3 from the stack slots (they were handed to the
; macros above and are presumably modified there) and writes m0..m3 back.
CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize]
|
||||
|
||||
%if mmsize == 8
|
||||
; Advance both row pointers by four rows and tc0 by two bytes for the
; second pass.
lea pix_q, [pix_q + 4*stride_q]
|
||||
lea r5, [r5 + 4*stride_q]
|
||||
add tc0_q, 2
|
||||
dec r6d
|
||||
jg .loop
|
||||
%endif
|
||||
RET
|
||||
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_64 == 0
|
||||
|
@ -313,6 +313,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
|
||||
#if ARCH_X86_32
|
||||
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext;
|
||||
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
|
||||
if (chroma_format_idc <= 1) {
|
||||
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
|
||||
}
|
||||
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
|
||||
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext;
|
||||
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
|
||||
@ -346,6 +349,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
|
||||
|
||||
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2;
|
||||
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
|
||||
if (chroma_format_idc <= 1) {
|
||||
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_sse2;
|
||||
}
|
||||
#if HAVE_ALIGNED_STACK
|
||||
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
|
||||
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
|
||||
@ -381,6 +387,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
|
||||
|
||||
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx;
|
||||
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
|
||||
if (chroma_format_idc <= 1) {
|
||||
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_avx;
|
||||
}
|
||||
#if HAVE_ALIGNED_STACK
|
||||
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
|
||||
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
|
||||
|
Loading…
x
Reference in New Issue
Block a user