
x86/hevc_mc: use fewer instructions in hevc_put_hevc_{uni, bi}_w[24]_{8, 10, 12}

Signed-off-by: James Almer <jamrial@gmail.com>
Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
commit b7863c972c
parent b1a44e6bf5
Author: James Almer <jamrial@gmail.com>
Date:   2014-08-04 01:18:46 -03:00
Committer: Michael Niedermayer <michaelni@gmx.at>


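Editor's note (my summary, not part of the commit message): for block widths of 4 or fewer, the four 32-bit weighted products fit in a single XMM register, so they can be formed with one pmaddwd on samples interleaved with zero words, replacing the pmullw/pmulhw pair plus the unpack steps the wider path needs. A minimal C intrinsics sketch of the two multiply strategies, assuming signed 16-bit HEVC intermediates (function names are mine, not from the commit):

#include <emmintrin.h>  /* SSE2 */
#include <stdio.h>

/* Wide path: rebuild signed 32-bit products from the low and high
 * halves of the 16-bit multiplies (pmullw + pmulhw + punpcklwd). */
static __m128i mul_w_general(__m128i x, __m128i w)
{
    __m128i lo = _mm_mullo_epi16(x, w);  /* pmullw */
    __m128i hi = _mm_mulhi_epi16(x, w);  /* pmulhw */
    return _mm_unpacklo_epi16(lo, hi);   /* punpcklwd: low 4 products */
}

/* Width <= 4 path from this commit: pair each sample word with a zero
 * word; pmaddwd still treats the sample word as signed, so each dword
 * becomes sample*wx + 0*0, i.e. the signed 32-bit product directly. */
static __m128i mul_w_pmaddwd(__m128i x, __m128i w_pairs)
{
    __m128i xz = _mm_unpacklo_epi16(x, _mm_setzero_si128());
    return _mm_madd_epi16(xz, w_pairs);  /* pmaddwd */
}

int main(void)
{
    short src[8] = { -100, 200, -300, 400, 0, 0, 0, 0 };
    short wx = 37;
    __m128i x  = _mm_loadu_si128((const __m128i *)src);
    __m128i w  = _mm_set1_epi16(wx);                  /* wx in every word */
    __m128i wp = _mm_set1_epi32((unsigned short)wx);  /* (wx, 0) word pairs */
    int a[4], b[4];
    _mm_storeu_si128((__m128i *)a, mul_w_general(x, w));
    _mm_storeu_si128((__m128i *)b, mul_w_pmaddwd(x, wp));
    for (int i = 0; i < 4; i++)
        printf("%d %d\n", a[i], b[i]);  /* pairs match, e.g. -3700 -3700 */
    return 0;
}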
@@ -1157,9 +1157,16 @@ cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
 %define SHIFT denomd
 %endif
 lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom
+%if %1 <= 4
+pxor m1, m1
+%endif
 movd m2, wxm ; WX
 movd m4, SHIFT ; shift
+%if %1 <= 4
+punpcklwd m2, m1
+%else
 punpcklwd m2, m2
+%endif
 dec SHIFT
 movdqu m5, [one_per_32]
 movd m6, SHIFT
@@ -1176,6 +1183,13 @@ cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
 %endif
 .loop
 SIMPLE_LOAD %1, 10, srcq, m0
+%if %1 <= 4
+punpcklwd m0, m1
+pmaddwd m0, m2
+paddd m0, m5
+psrad m0, m4
+paddd m0, m3
+%else
 pmulhw m6, m0, m2
 pmullw m0, m2
 punpckhwd m1, m0, m6
@@ -1186,6 +1200,7 @@ cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
 psrad m1, m4
 paddd m0, m3
 paddd m1, m3
+%endif
 packusdw m0, m1
 %if %2 == 8
 packuswb m0, m0
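A hedged C intrinsics rendering of the new %1 <= 4 uni loop body above (register roles are my reading of the asm: m2 = (wx, 0) dword pairs, m5 = rounding constant, m4 = shift count, m3 = offset); five instructions per four pixels versus the longer pmulhw/pmullw/unpack sequence in the %else branch:

#include <emmintrin.h>

/* Sketch of the width <= 4 uni path:
 * dst = ((src * wx + round) >> shift) + offset, per 32-bit lane.
 * The packusdw/packuswb store steps are unchanged and omitted here. */
static __m128i uni_w4_body(__m128i src, __m128i wx_pairs, __m128i round,
                           __m128i shift, __m128i offset)
{
    __m128i v = _mm_unpacklo_epi16(src, _mm_setzero_si128()); /* punpcklwd m0, m1 */
    v = _mm_madd_epi16(v, wx_pairs);   /* pmaddwd m0, m2 */
    v = _mm_add_epi32(v, round);       /* paddd   m0, m5 */
    v = _mm_sra_epi32(v, shift);       /* psrad   m0, m4 */
    return _mm_add_epi32(v, offset);   /* paddd   m0, m3 */
}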
@@ -1201,13 +1216,21 @@ cglobal hevc_put_hevc_bi_w%1_%2, 6, 7, 10, dst, dststride, src, srcstride, src2, src2stride, height, denom, wx0, wx1, ox0, ox1
 cglobal hevc_put_hevc_bi_w%1_%2, 6, 7, 10, dst, dststride, src, srcstride, src2, src2stride, height, denom, wx0, wx1, ox0, ox1
 mov r6d, denomm
+%if %1 <= 4
+pxor m1, m1
+%endif
 movd m2, wx0m ; WX0
 lea r6d, [r6d+14-%2] ; shift = 14 - bitd + denom
 movd m3, wx1m ; WX1
 movd m0, r6d ; shift
+%if %1 <= 4
+punpcklwd m2, m1
+punpcklwd m3, m1
+%else
 punpcklwd m2, m2
-inc r6d
 punpcklwd m3, m3
+%endif
+inc r6d
 movd m5, r6d ; shift+1
 pshufd m2, m2, 0
 mov r6d, ox0m
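In the <= 4 case the weights are paired with zeros rather than duplicated, so after the broadcast each dword holds (wx, 0) ready for pmaddwd, and the inc r6d moves below the branch so both paths share it. A small sketch of the weight setup as I read it (not generated from the commit):

#include <emmintrin.h>

/* movd m2, wx0m; punpcklwd m2, m1 (m1 = 0); pshufd m2, m2, 0:
 * every dword of the result is the word pair (wx0, 0). */
static __m128i weight_pairs(short wx)
{
    __m128i w = _mm_cvtsi32_si128((unsigned short)wx); /* movd */
    w = _mm_unpacklo_epi16(w, _mm_setzero_si128());    /* punpcklwd */
    return _mm_shuffle_epi32(w, 0);                    /* pshufd, imm 0 */
}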
@@ -1225,6 +1248,15 @@ cglobal hevc_put_hevc_bi_w%1_%2, 6, 7, 10, dst, dststride, src, srcstride, src2, src2stride, height, denom, wx0, wx1, ox0, ox1
 .loop
 SIMPLE_LOAD %1, 10, srcq, m0
 SIMPLE_LOAD %1, 10, src2q, m8
+%if %1 <= 4
+punpcklwd m0, m1
+punpcklwd m8, m1
+pmaddwd m0, m3
+pmaddwd m8, m2
+paddd m0, m4
+paddd m0, m8
+psrad m0, m5
+%else
 pmulhw m6, m0, m3
 pmullw m0, m3
 pmulhw m7, m8, m2
@@ -1239,6 +1271,7 @@ cglobal hevc_put_hevc_bi_w%1_%2, 6, 7, 10, dst, dststride, src, srcstride, src2, src2stride, height, denom, wx0, wx1, ox0, ox1
 paddd m1, m4
 psrad m0, m5
 psrad m1, m5
+%endif
 packusdw m0, m1
 %if %2 == 8
 packuswb m0, m0
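The bi variant applies the same idea with two pmaddwd products folded into one accumulator before a single shift. A hedged C rendering of the new %1 <= 4 loop body (register roles again my reading: m3/m2 = (wx1, 0)/(wx0, 0) pairs, m4 = rounding/offset term, m5 = shift+1):

#include <emmintrin.h>

/* Sketch of the width <= 4 bi path:
 * dst = (src * wx1 + src2 * wx0 + round) >> (shift + 1), per 32-bit lane. */
static __m128i bi_w4_body(__m128i src, __m128i src2, __m128i wx1_pairs,
                          __m128i wx0_pairs, __m128i round, __m128i shift1)
{
    __m128i zero = _mm_setzero_si128();
    __m128i a = _mm_unpacklo_epi16(src,  zero);  /* punpcklwd m0, m1 */
    __m128i b = _mm_unpacklo_epi16(src2, zero);  /* punpcklwd m8, m1 */
    a = _mm_madd_epi16(a, wx1_pairs);            /* pmaddwd m0, m3 */
    b = _mm_madd_epi16(b, wx0_pairs);            /* pmaddwd m8, m2 */
    a = _mm_add_epi32(a, round);                 /* paddd m0, m4 */
    a = _mm_add_epi32(a, b);                     /* paddd m0, m8 */
    return _mm_sra_epi32(a, shift1);             /* psrad m0, m5 */
}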