mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-03-17 20:17:55 +02:00
H.264: Add more x86 assembly for 10-bit H.264 predict functions
Mainly ported from 8-bit H.264 predict. Some code ported from x264. LGPL ok by author. Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
This commit is contained in:
parent
e358f7ee90
commit
ac4a85f476
@ -29,11 +29,19 @@ SECTION_RODATA
|
|||||||
|
|
||||||
SECTION .text
|
SECTION .text
|
||||||
|
|
||||||
|
cextern pw_16
|
||||||
cextern pw_8
|
cextern pw_8
|
||||||
cextern pw_4
|
cextern pw_4
|
||||||
cextern pw_2
|
cextern pw_2
|
||||||
cextern pw_1
|
cextern pw_1
|
||||||
|
|
||||||
|
pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 ; per-column word weights; pred8x8_plane uses them for the H sum (pmaddwd) and the b term (pmullw)
|
||||||
|
pw_m3: times 8 dw -3 ; eight words of -3; multiplied with c in pred8x8_plane
|
||||||
|
pw_pixel_max: times 8 dw ((1 << 10)-1) ; eight words of 1023; upper clip bound passed to CLIPW in pred8x8_plane
|
||||||
|
pw_512: times 8 dw 512 ; eight words of 512 (1 << 9); fill value stored by the *_128_dc functions
|
||||||
|
pd_17: times 4 dd 17 ; dword 17 = words (17, 0); pmaddwd operand that scales H and V in pred8x8_plane
|
||||||
|
pd_16: times 4 dd 16 ; dword rounding term added before the arithmetic >> 5 in pred8x8_plane
|
||||||
|
|
||||||
; dest, left, right, src
|
; dest, left, right, src
|
||||||
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
|
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
|
||||||
%macro PRED4x4_LOWPASS 4
|
%macro PRED4x4_LOWPASS 4
|
||||||
@ -464,7 +472,92 @@ PRED8x8_TOP_DC mmxext, pshufw
|
|||||||
INIT_XMM
|
INIT_XMM
|
||||||
PRED8x8_TOP_DC sse2 , pshuflw
|
PRED8x8_TOP_DC sse2 , pshuflw
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; void pred8x8_plane(pixel *src, int stride)
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
INIT_XMM
|
||||||
|
; 8x8 plane prediction on 16-bit samples (result clipped to [0, pw_pixel_max]).
; Per the signature comment, r0 = src and r1 = stride; stride is added directly
; to the byte pointer below, so it is used as a byte stride.
; Outline: H comes from the row above the block, V from the column to its left,
; b = (17*H + 16) >> 5, c = (17*V + 16) >> 5, and every output row y is
; clip((a + b*(x-3) + c*(y-3) + 16) >> 5) for x = 0..7, where
; a = 16*(src[7*stride-1] + src[-stride+7]).
; cglobal, HADDD, SPLATW, CLIPW and REP_RET are macros defined outside this view.
; NOTE(review): stride is declared int but r1 is used full-width with no sign
; extension visible here - confirm the macro layer or the callers guarantee it.
cglobal pred8x8_plane_10_sse2, 2,7,7 ; presumably 2 args, 7 GPRs, 7 vector regs (r0-r6 and m0-m6 are used below)
|
||||||
|
sub r0, r1 ; r0 = src - stride (row above the block)
|
||||||
|
lea r2, [r1+r1*2] ; r2 = 3*stride
|
||||||
|
lea r3, [r0+r1*4] ; r3 = r0 + 4*stride = src + 3*stride
|
||||||
|
mova m2, [r0] ; the 8 samples of the row above
|
||||||
|
pmaddwd m2, [pw_m32101234] ; weight them by -3..4 and add adjacent products into dwords
|
||||||
|
HADDD m2, m1 ; presumably sums the dwords of m2 (m1 as scratch); the total is read from dword 0 below
|
||||||
|
movd m0, [r0-4] ; words src[-stride-2] (low) and src[-stride-1] (high)
|
||||||
|
psrld m0, 14 ; leaves 4*src[-stride-1], provided the low word is below 1 << 14
|
||||||
|
psubw m2, m0 ; H (the top-left sample enters with weight -4)
|
||||||
|
movd m0, [r3+r1*4-4] ; words src[7*stride-2], src[7*stride-1]
|
||||||
|
movd m1, [r0+12] ; words src[-stride+6], src[-stride+7]
|
||||||
|
paddw m0, m1 ; word 1 = src[7*stride-1] + src[-stride+7]
|
||||||
|
psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
|
||||||
|
movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
|
||||||
|
movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
|
||||||
|
sub r4d, r5d ; r4d = 1*(src[4*stride-1] - src[2*stride-1])
|
||||||
|
movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
|
||||||
|
movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
|
||||||
|
sub r6d, r5d ; r6d = src[5*stride-1] - src[1*stride-1]
|
||||||
|
lea r4d, [r4+r6*2] ; accumulate that difference with weight 2
|
||||||
|
movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
|
||||||
|
movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
|
||||||
|
sub r5d, r6d ; r5d = src[6*stride-1] - src[0*stride-1]
|
||||||
|
lea r5d, [r5+r5*2] ; times 3
|
||||||
|
add r4d, r5d ; accumulate the weight-3 term
|
||||||
|
movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
|
||||||
|
movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
|
||||||
|
sub r6d, r5d ; r6d = src[7*stride-1] - src[-stride-1]
|
||||||
|
lea r4d, [r4+r6*4] ; accumulate the weight-4 term; r4d now holds V
|
||||||
|
movd m3, r4d ; V
|
||||||
|
punpckldq m2, m3 ; dword 0 = H, dword 1 = V
|
||||||
|
pmaddwd m2, [pd_17] ; 17*H and 17*V; pd_17's high words are 0, so only the low (signed) word of each dword is used
|
||||||
|
paddd m2, [pd_16] ; rounding
|
||||||
|
psrad m2, 5 ; b, c
|
||||||
|
|
||||||
|
|
||||||
|
mova m3, [pw_pixel_max] ; upper clip bound
|
||||||
|
pxor m1, m1 ; zero, the lower clip bound
|
||||||
|
SPLATW m0, m0, 1 ; presumably broadcasts word 1 of m0 (the 16*(...) term) to all lanes
|
||||||
|
SPLATW m4, m2, 2 ; presumably broadcasts word 2 of m2, i.e. the low word of c
|
||||||
|
SPLATW m2, m2, 0 ; presumably broadcasts word 0 of m2, i.e. the low word of b
|
||||||
|
pmullw m2, [pw_m32101234] ; b
|
||||||
|
pmullw m5, m4, [pw_m3] ; c
|
||||||
|
paddw m5, [pw_16] ; m5 = -3*c plus pw_16 (cextern; presumably eight words of 16, the rounding term)
|
||||||
|
mov r2d, 8 ; row counter
|
||||||
|
add r0, r1 ; r0 = src, first row of the block
|
||||||
|
.loop: ; one 8-sample row per iteration
|
||||||
|
paddsw m6, m2, m5 ; column term + row term, saturating
|
||||||
|
paddsw m6, m0 ; + the 16*(...) term
|
||||||
|
psraw m6, 5 ; arithmetic >> 5
|
||||||
|
CLIPW m6, m1, m3 ; presumably clamps each word of m6 to [m1, m3] = [0, pw_pixel_max]
|
||||||
|
mova [r0], m6 ; store the row
|
||||||
|
paddw m5, m4 ; advance the row term by c
|
||||||
|
add r0, r1 ; next row
|
||||||
|
dec r2d
|
||||||
|
jg .loop ; repeat until 8 rows are written
|
||||||
|
REP_RET
|
||||||
|
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; Emits pred8x8l_128_dc_10_<%1>: stores the constant pw_512 to eight rows.
; Per the signature comment the arguments are (src, has_topleft, has_topright,
; stride), so r0 = src and r3 = stride; r1 and r2 are overwritten as scratch
; before being read, i.e. has_topleft/has_topright are ignored.
; cglobal, MOV8 and RET are macros defined outside this view.
; NOTE(review): stride is declared int but r3 is used full-width with no sign
; extension visible here - confirm the macro layer or the callers guarantee it.
%macro PRED8x8L_128_DC 1 ; %1 = instruction-set suffix of the emitted symbol
|
||||||
|
cglobal pred8x8l_128_dc_10_%1, 4,4 ; presumably 4 args, 4 GPRs
|
||||||
|
mova m0, [pw_512] ; fill value: words of 512
|
||||||
|
lea r1, [r3+r3*2] ; r1 = 3*stride
|
||||||
|
lea r2, [r0+r3*4] ; r2 = src + 4*stride
|
||||||
|
MOV8 r0+r3*0, m0, m0 ; row 0 (MOV8 presumably stores one 8-sample row from the given registers)
|
||||||
|
MOV8 r0+r3*1, m0, m0 ; row 1
|
||||||
|
MOV8 r0+r3*2, m0, m0 ; row 2
|
||||||
|
MOV8 r0+r1*1, m0, m0 ; row 3
|
||||||
|
MOV8 r2+r3*0, m0, m0 ; row 4
|
||||||
|
MOV8 r2+r3*1, m0, m0 ; row 5
|
||||||
|
MOV8 r2+r3*2, m0, m0 ; row 6
|
||||||
|
MOV8 r2+r1*1, m0, m0 ; row 7
|
||||||
|
RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
INIT_MMX
|
||||||
|
PRED8x8L_128_DC mmxext
|
||||||
|
INIT_XMM
|
||||||
|
PRED8x8L_128_DC sse2
|
||||||
|
|
||||||
;-----------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------
|
||||||
; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
|
; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
|
||||||
@ -1258,7 +1351,7 @@ cglobal pred16x16_horizontal_10_%1, 2,3
|
|||||||
MOV16 r0+r1*1, m1, m1, m1, m1
|
MOV16 r0+r1*1, m1, m1, m1, m1
|
||||||
lea r0, [r0+r1*2]
|
lea r0, [r0+r1*2]
|
||||||
dec r2
|
dec r2
|
||||||
jge .vloop
|
jg .vloop
|
||||||
REP_RET
|
REP_RET
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
@ -1266,3 +1359,139 @@ INIT_MMX
|
|||||||
PRED16x16_HORIZONTAL mmxext
|
PRED16x16_HORIZONTAL mmxext
|
||||||
INIT_XMM
|
INIT_XMM
|
||||||
PRED16x16_HORIZONTAL sse2
|
PRED16x16_HORIZONTAL sse2
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; void pred16x16_dc(pixel *src, int stride)
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; Emits pred16x16_dc_10_<%1>: fills 16 rows of 16 samples with
; (sum of the 16 samples above + sum of the 16 samples to the left + 16) >> 5.
; Per the signature comment r0 = src and r1 = stride (used as a byte stride).
; cglobal, HADDW, SPLATW, MOV16 and REP_RET are macros defined outside this view.
; NOTE(review): 7 GPRs are requested but only r0-r5 are referenced in the
; visible body - confirm whether a helper macro needs r6 before trimming.
%macro PRED16x16_DC 1 ; %1 = instruction-set suffix of the emitted symbol
|
||||||
|
cglobal pred16x16_dc_10_%1, 2,7 ; presumably 2 args, 7 GPRs
|
||||||
|
mov r4, r0 ; keep src for the store loop
|
||||||
|
sub r0, r1 ; r0 = src - stride (row above)
|
||||||
|
mova m0, [r0+0] ; first mmsize bytes of the top row
|
||||||
|
paddw m0, [r0+mmsize] ; + next mmsize bytes
|
||||||
|
%if mmsize==8 ; 8-byte registers: two more loads cover the rest of the 32-byte row
|
||||||
|
paddw m0, [r0+16]
|
||||||
|
|
||||||
|
paddw m0, [r0+24]
|
||||||
|
|
||||||
|
%endif
|
||||||
|
HADDW m0, m2 ; presumably horizontal word sum of m0 (m2 as scratch); the total is read from the low word below
|
||||||
|
|
||||||
|
|
||||||
|
sub r0, 2 ; step one 2-byte sample left: r0 = &src[-stride-1]
|
||||||
|
movzx r3d, word [r0+r1*1] ; src[0*stride-1]
|
||||||
|
movzx r5d, word [r0+r1*2] ; src[1*stride-1]
|
||||||
|
%rep 7 ; remaining 14 left samples, two per repetition, kept in two partial sums
|
||||||
|
lea r0, [r0+r1*2] ; advance two rows
|
||||||
|
movzx r2d, word [r0+r1*1]
|
||||||
|
|
||||||
|
add r3d, r2d ; partial sum of even rows
|
||||||
|
movzx r2d, word [r0+r1*2]
|
||||||
|
|
||||||
|
add r5d, r2d ; partial sum of odd rows
|
||||||
|
%endrep
|
||||||
|
lea r3d, [r3+r5+16] ; left sum + rounding term 16
|
||||||
|
|
||||||
|
|
||||||
|
movd m1, r3d
|
||||||
|
|
||||||
|
paddw m0, m1 ; low word = top sum + left sum + 16
|
||||||
|
psrlw m0, 5 ; >> 5 (32 samples)
|
||||||
|
SPLATW m0, m0 ; presumably broadcasts the low word to every lane
|
||||||
|
mov r3d, 8 ; 8 iterations, 2 rows each
|
||||||
|
.loop:
|
||||||
|
MOV16 r4+r1*0, m0, m0, m0, m0 ; MOV16 presumably stores one 16-sample row from the given registers
|
||||||
|
MOV16 r4+r1*1, m0, m0, m0, m0
|
||||||
|
|
||||||
|
lea r4, [r4+r1*2] ; advance two rows
|
||||||
|
dec r3d
|
||||||
|
|
||||||
|
jg .loop
|
||||||
|
|
||||||
|
REP_RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
INIT_MMX
|
||||||
|
PRED16x16_DC mmxext
|
||||||
|
INIT_XMM
|
||||||
|
PRED16x16_DC sse2
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; void pred16x16_top_dc(pixel *src, int stride)
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; Emits pred16x16_top_dc_10_<%1>: fills 16 rows of 16 samples with
; (sum of the 16 samples above + pw_8) >> 4.
; Per the signature comment r0 = src and r1 = stride (used as a byte stride).
; cglobal, HADDW, SPLATW, MOV16 and REP_RET are macros defined outside this view.
%macro PRED16x16_TOP_DC 1 ; %1 = instruction-set suffix of the emitted symbol
|
||||||
|
cglobal pred16x16_top_dc_10_%1, 2,3 ; presumably 2 args, 3 GPRs
|
||||||
|
sub r0, r1 ; r0 = src - stride; it stays one row behind for the whole function
|
||||||
|
mova m0, [r0+0] ; first mmsize bytes of the top row
|
||||||
|
paddw m0, [r0+mmsize] ; + next mmsize bytes
|
||||||
|
%if mmsize==8 ; 8-byte registers: two more loads cover the rest of the 32-byte row
|
||||||
|
paddw m0, [r0+16]
|
||||||
|
|
||||||
|
paddw m0, [r0+24]
|
||||||
|
|
||||||
|
%endif
|
||||||
|
HADDW m0, m2 ; presumably horizontal word sum of m0 (m2 as scratch)
|
||||||
|
|
||||||
|
|
||||||
|
SPLATW m0, m0 ; presumably broadcasts the low word (the sum) to every lane
|
||||||
|
paddw m0, [pw_8] ; rounding (pw_8 is cextern; presumably words of 8)
|
||||||
|
psrlw m0, 4 ; >> 4 (16 samples)
|
||||||
|
mov r2d, 8 ; 8 iterations, 2 rows each
|
||||||
|
.loop:
|
||||||
|
MOV16 r0+r1*1, m0, m0, m0, m0 ; r0 is one row behind, so +1*stride is the current row
|
||||||
|
MOV16 r0+r1*2, m0, m0, m0, m0 ; and +2*stride the row after it
|
||||||
|
lea r0, [r0+r1*2] ; advance two rows
|
||||||
|
dec r2d
|
||||||
|
|
||||||
|
jg .loop
|
||||||
|
|
||||||
|
REP_RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
INIT_MMX
|
||||||
|
PRED16x16_TOP_DC mmxext
|
||||||
|
INIT_XMM
|
||||||
|
PRED16x16_TOP_DC sse2
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; void pred16x16_left_dc(pixel *src, int stride)
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; Emits pred16x16_left_dc_10_<%1>: fills 16 rows of 16 samples with
; (sum of the 16 samples in the column left of the block + 8) >> 4.
; Per the signature comment r0 = src and r1 = stride (used as a byte stride).
; cglobal, SPLATW, MOV16 and REP_RET are macros defined outside this view.
%macro PRED16x16_LEFT_DC 1 ; %1 = instruction-set suffix of the emitted symbol
|
||||||
|
cglobal pred16x16_left_dc_10_%1, 2,7 ; presumably 2 args, 7 GPRs (r0-r6 are all used below)
|
||||||
|
mov r4, r0 ; keep src for the store loop
|
||||||
|
|
||||||
|
|
||||||
|
sub r0, 2 ; step one 2-byte sample left: r0 = &src[-1]
|
||||||
|
movzx r5d, word [r0+r1*0] ; src[0*stride-1]
|
||||||
|
movzx r6d, word [r0+r1*1] ; src[1*stride-1]
|
||||||
|
%rep 7 ; remaining 14 left samples, two per repetition
|
||||||
|
lea r0, [r0+r1*2] ; advance two rows
|
||||||
|
movzx r2d, word [r0+r1*0]
|
||||||
|
|
||||||
|
movzx r3d, word [r0+r1*1]
|
||||||
|
|
||||||
|
add r5d, r2d ; partial sum of even rows
|
||||||
|
add r6d, r3d ; partial sum of odd rows
|
||||||
|
%endrep
|
||||||
|
lea r2d, [r5+r6+8] ; total + rounding term 8
|
||||||
|
shr r2d, 4 ; >> 4 (16 samples)
|
||||||
|
|
||||||
|
|
||||||
|
movd m0, r2d
|
||||||
|
|
||||||
|
SPLATW m0, m0 ; presumably broadcasts the low word to every lane
|
||||||
|
mov r3d, 8 ; 8 iterations, 2 rows each
|
||||||
|
.loop:
|
||||||
|
MOV16 r4+r1*0, m0, m0, m0, m0 ; MOV16 presumably stores one 16-sample row from the given registers
|
||||||
|
MOV16 r4+r1*1, m0, m0, m0, m0
|
||||||
|
|
||||||
|
lea r4, [r4+r1*2] ; advance two rows
|
||||||
|
dec r3d
|
||||||
|
|
||||||
|
jg .loop
|
||||||
|
|
||||||
|
REP_RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
INIT_MMX
|
||||||
|
PRED16x16_LEFT_DC mmxext
|
||||||
|
INIT_XMM
|
||||||
|
PRED16x16_LEFT_DC sse2
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; void pred16x16_128_dc(pixel *src, int stride)
|
||||||
|
;-----------------------------------------------------------------------------
|
||||||
|
; Emits pred16x16_128_dc_10_<%1>: fills 16 rows of 16 samples with the constant
; pw_512 (words of 512); no neighbouring samples are read.
; Per the signature comment r0 = src and r1 = stride (used as a byte stride).
; cglobal, MOV16 and REP_RET are macros defined outside this view.
%macro PRED16x16_128_DC 1 ; %1 = instruction-set suffix of the emitted symbol
|
||||||
|
cglobal pred16x16_128_dc_10_%1, 2,3 ; presumably 2 args, 3 GPRs
|
||||||
|
mova m0, [pw_512] ; fill value
|
||||||
|
mov r2d, 8 ; 8 iterations, 2 rows each
|
||||||
|
.loop:
|
||||||
|
MOV16 r0+r1*0, m0, m0, m0, m0 ; MOV16 presumably stores one 16-sample row from the given registers
|
||||||
|
MOV16 r0+r1*1, m0, m0, m0, m0
|
||||||
|
|
||||||
|
lea r0, [r0+r1*2] ; advance two rows
|
||||||
|
dec r2d
|
||||||
|
|
||||||
|
jg .loop
|
||||||
|
|
||||||
|
REP_RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
INIT_MMX
|
||||||
|
PRED16x16_128_DC mmxext
|
||||||
|
INIT_XMM
|
||||||
|
PRED16x16_128_DC sse2
|
||||||
|
@ -47,6 +47,7 @@ PRED8x8(dc, 10, mmxext)
|
|||||||
PRED8x8(dc, 10, sse2)
|
PRED8x8(dc, 10, sse2)
|
||||||
PRED8x8(top_dc, 10, mmxext)
|
PRED8x8(top_dc, 10, mmxext)
|
||||||
PRED8x8(top_dc, 10, sse2)
|
PRED8x8(top_dc, 10, sse2)
|
||||||
|
PRED8x8(plane, 10, sse2)
|
||||||
PRED8x8(vertical, 10, sse2)
|
PRED8x8(vertical, 10, sse2)
|
||||||
PRED8x8(horizontal, 10, sse2)
|
PRED8x8(horizontal, 10, sse2)
|
||||||
|
|
||||||
@ -55,6 +56,8 @@ void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int has_tople
|
|||||||
|
|
||||||
PRED8x8L(dc, 10, sse2)
|
PRED8x8L(dc, 10, sse2)
|
||||||
PRED8x8L(dc, 10, ssse3)
|
PRED8x8L(dc, 10, ssse3)
|
||||||
|
PRED8x8L(128_dc, 10, mmxext)
|
||||||
|
PRED8x8L(128_dc, 10, sse2)
|
||||||
PRED8x8L(top_dc, 10, sse2)
|
PRED8x8L(top_dc, 10, sse2)
|
||||||
PRED8x8L(top_dc, 10, ssse3)
|
PRED8x8L(top_dc, 10, ssse3)
|
||||||
PRED8x8L(vertical, 10, sse2)
|
PRED8x8L(vertical, 10, sse2)
|
||||||
@ -73,6 +76,14 @@ PRED8x8L(horizontal_up, 10, ssse3)
|
|||||||
#define PRED16x16(TYPE, DEPTH, OPT)\
|
#define PRED16x16(TYPE, DEPTH, OPT)\
|
||||||
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride);
|
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride);
|
||||||
|
|
||||||
|
PRED16x16(dc, 10, mmxext)
|
||||||
|
PRED16x16(dc, 10, sse2)
|
||||||
|
PRED16x16(top_dc, 10, mmxext)
|
||||||
|
PRED16x16(top_dc, 10, sse2)
|
||||||
|
PRED16x16(128_dc, 10, mmxext)
|
||||||
|
PRED16x16(128_dc, 10, sse2)
|
||||||
|
PRED16x16(left_dc, 10, mmxext)
|
||||||
|
PRED16x16(left_dc, 10, sse2)
|
||||||
PRED16x16(vertical, 10, mmxext)
|
PRED16x16(vertical, 10, mmxext)
|
||||||
PRED16x16(vertical, 10, sse2)
|
PRED16x16(vertical, 10, sse2)
|
||||||
PRED16x16(horizontal, 10, mmxext)
|
PRED16x16(horizontal, 10, mmxext)
|
||||||
@ -289,6 +300,12 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
|
|||||||
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext;
|
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext;
|
||||||
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_mmxext;
|
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_mmxext;
|
||||||
|
|
||||||
|
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext;
|
||||||
|
|
||||||
|
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_mmxext;
|
||||||
|
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_mmxext;
|
||||||
|
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_mmxext;
|
||||||
|
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext;
|
||||||
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext;
|
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext;
|
||||||
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext;
|
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext;
|
||||||
}
|
}
|
||||||
@ -301,18 +318,24 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
|
|||||||
|
|
||||||
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2;
|
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2;
|
||||||
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2;
|
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2;
|
||||||
|
h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2;
|
||||||
h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2;
|
h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2;
|
||||||
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2;
|
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2;
|
||||||
|
|
||||||
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2;
|
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2;
|
||||||
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2;
|
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2;
|
||||||
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2;
|
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2;
|
||||||
|
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_sse2;
|
||||||
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2;
|
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2;
|
||||||
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
|
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
|
||||||
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
|
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
|
||||||
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2;
|
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2;
|
||||||
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2;
|
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2;
|
||||||
|
|
||||||
|
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_sse2;
|
||||||
|
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_sse2;
|
||||||
|
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_sse2;
|
||||||
|
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2;
|
||||||
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2;
|
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2;
|
||||||
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2;
|
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user