1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-01-24 13:56:33 +02:00

VP8: Much faster SSE2 MC

5-10% faster or more on Phenom, Athlon 64, and some others.
Helps some on pre-SSSE3 Intel chips as well, but not as much.

Originally committed as revision 24513 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Jason Garrett-Glaser 2010-07-26 19:34:00 +00:00
parent 9dd9d67bd0
commit e25dee602f

View File

@ -438,48 +438,43 @@ cglobal put_vp8_epel4_h6_mmxext, 6, 6
jg .nextrow
REP_RET
; 8-pixel-wide block, H-only 4-tap filter
INIT_XMM
cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
shl r5d, 4
cglobal put_vp8_epel8_h4_sse2, 6, 6, 10
shl r5d, 5
%ifdef PIC
lea r11, [fourtap_filter_hw_m]
lea r11, [fourtap_filter_v_m]
%endif
mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
mova m6, [fourtap_filter_hw+r5]
lea r5, [fourtap_filter_v+r5-32]
pxor m7, m7
mova m4, [pw_64]
mova m5, [r5+ 0]
mova m6, [r5+16]
%ifdef m8
mova m8, [r5+32]
mova m9, [r5+48]
%endif
.nextrow
movh m0, [r2-1]
punpcklbw m0, m7 ; ABCDEFGH
mova m1, m0
mova m2, m0
mova m3, m0
psrldq m1, 2 ; BCDEFGH
psrldq m2, 4 ; CDEFGH
psrldq m3, 6 ; DEFGH
punpcklwd m0, m1 ; ABBCCDDE
punpcklwd m2, m3 ; CDDEEFFG
pmaddwd m0, m5
pmaddwd m2, m6
paddd m0, m2
movh m1, [r2+3]
punpcklbw m1, m7 ; ABCDEFGH
mova m2, m1
mova m3, m1
mova m4, m1
psrldq m2, 2 ; BCDEFGH
psrldq m3, 4 ; CDEFGH
psrldq m4, 6 ; DEFGH
punpcklwd m1, m2 ; ABBCCDDE
punpcklwd m3, m4 ; CDDEEFFG
pmaddwd m1, m5
pmaddwd m3, m6
paddd m1, m3
packssdw m0, m1
paddsw m0, [pw_64]
movq m0, [r2-1]
movq m1, [r2-0]
movq m2, [r2+1]
movq m3, [r2+2]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
pmullw m0, m5
pmullw m1, m6
%ifdef m8
pmullw m2, m8
pmullw m3, m9
%else
pmullw m2, [r5+32]
pmullw m3, [r5+48]
%endif
paddsw m0, m1
paddsw m2, m3
paddsw m0, m2
paddsw m0, m4
psraw m0, 7
packuswb m0, m7
movh [r0], m0 ; store
@ -491,62 +486,57 @@ cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
jg .nextrow
REP_RET
cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
cglobal put_vp8_epel8_h6_sse2, 6, 6, 14
lea r5d, [r5*3]
shl r5d, 4
%ifdef PIC
lea r11, [sixtap_filter_hw_m]
lea r11, [sixtap_filter_v_m]
%endif
lea r5, [sixtap_filter_hw+r5*8]
lea r5, [sixtap_filter_v+r5-96]
pxor m7, m7
mova m6, [pw_64]
%ifdef m8
mova m8, [r5+ 0]
mova m9, [r5+16]
mova m10, [r5+32]
mova m11, [r5+48]
mova m12, [r5+64]
mova m13, [r5+80]
%endif
.nextrow
movu m0, [r2-2]
mova m6, m0
mova m4, m0
punpcklbw m0, m7 ; ABCDEFGHI
mova m1, m0
mova m2, m0
mova m3, m0
psrldq m1, 2 ; BCDEFGH
psrldq m2, 4 ; CDEFGH
psrldq m3, 6 ; DEFGH
psrldq m4, 4
punpcklbw m4, m7 ; EFGH
mova m5, m4
psrldq m5, 2 ; FGH
punpcklwd m0, m1 ; ABBCCDDE
punpcklwd m2, m3 ; CDDEEFFG
punpcklwd m4, m5 ; EFFGGHHI
pmaddwd m0, [r5-48]
pmaddwd m2, [r5-32]
pmaddwd m4, [r5-16]
paddd m0, m2
paddd m0, m4
psrldq m6, 4
mova m4, m6
punpcklbw m6, m7 ; ABCDEFGHI
mova m1, m6
mova m2, m6
mova m3, m6
psrldq m1, 2 ; BCDEFGH
psrldq m2, 4 ; CDEFGH
psrldq m3, 6 ; DEFGH
psrldq m4, 4
punpcklbw m4, m7 ; EFGH
mova m5, m4
psrldq m5, 2 ; FGH
punpcklwd m6, m1 ; ABBCCDDE
punpcklwd m2, m3 ; CDDEEFFG
punpcklwd m4, m5 ; EFFGGHHI
pmaddwd m6, [r5-48]
pmaddwd m2, [r5-32]
pmaddwd m4, [r5-16]
paddd m6, m2
paddd m6, m4
packssdw m0, m6
paddsw m0, [pw_64]
movq m0, [r2-2]
movq m1, [r2-1]
movq m2, [r2-0]
movq m3, [r2+1]
movq m4, [r2+2]
movq m5, [r2+3]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
punpcklbw m5, m7
%ifdef m8
pmullw m0, m8
pmullw m1, m9
pmullw m2, m10
pmullw m3, m11
pmullw m4, m12
pmullw m5, m13
%else
pmullw m0, [r5+ 0]
pmullw m1, [r5+16]
pmullw m2, [r5+32]
pmullw m3, [r5+48]
pmullw m4, [r5+64]
pmullw m5, [r5+80]
%endif
paddsw m1, m4
paddsw m0, m5
paddsw m1, m2
paddsw m0, m3
paddsw m0, m1
paddsw m0, m6
psraw m0, 7
packuswb m0, m7
movh [r0], m0 ; store