mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
Add macros to x86util.asm.
Improved version of VBROADCASTSS that works like the avx2 instruction. Emulation of vpbroadcastd. Horizontal sum HSUMPS that places the result in all elements. Emulation of blendvps and pblendvb. Signed-off-by: Ivan Kalvachev <ikalvachev@gmail.com>
This commit is contained in:
parent
cadab5a2a7
commit
30ae07d7ef
@ -832,15 +832,26 @@
|
||||
pmaxsd %1, %2
|
||||
%endmacro
|
||||
|
||||
; VBROADCASTSS: copy one single-precision float from src (arg 2) into the
; elements of dst (arg 1).
; NOTE(review): this span comes from a rendered diff and appears to interleave
; removed and added lines (two macro headers, an avx check next to an avx2
; check, an else branch ahead of an elif branch), so it is not assemblable as
; shown. Confirm against the actual x86util.asm before relying on it.
%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
|
||||
%if cpuflag(avx)
|
||||
%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
|
||||
%if cpuflag(avx2)
|
||||
vbroadcastss %1, %2
|
||||
%else ; sse
|
||||
%ifnidn %1, %2
|
||||
movss %1, %2
|
||||
%elif cpuflag(avx)
|
||||
%ifnum sizeof%2 ; avx1 register
|
||||
shufps xmm%1, xmm%2, xmm%2, q0000
|
||||
%if sizeof%1 >= 32 ; mmsize>=32
|
||||
vinsertf128 %1, %1, xmm%1, 1
|
||||
%endif
|
||||
%else ; avx1 memory
|
||||
vbroadcastss %1, %2
|
||||
%endif
|
||||
%else
|
||||
%ifnum sizeof%2 ; sse register
|
||||
shufps %1, %2, %2, q0000
|
||||
%else ; sse memory
|
||||
movss %1, %2
|
||||
shufps %1, %1, 0
|
||||
%endif
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64
|
||||
@ -854,6 +865,21 @@
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; VPBROADCASTD - replicate one 32-bit value into every dword of dst.
;   arg 1: dst, xmm or ymm register
;   arg 2: src, 32-bit memory operand or xmm register
; With avx2 the native instruction is used. A ymm destination on avx1 cannot
; be handled and stops assembly with an error. Everything else is built from
; pshufd, preceded by a movd load when the source is not a register.
%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm
%if cpuflag(avx2)
    vpbroadcastd %1, %2
%elif cpuflag(avx) && sizeof%1 >= 32
    %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss
%else
    %ifnnum sizeof%2                ; no known register size: memory source
        movd      %1, %2            ; load the dword into element 0
        pshufd    %1, %1, q0000     ; then copy element 0 to all four
    %else                           ; register source
        pshufd    %1, %2, q0000
    %endif
%endif
%endmacro
|
||||
|
||||
%macro SHUFFLE_MASK_W 8
|
||||
%rep 8
|
||||
%if %1>=0x80
|
||||
@ -918,3 +944,67 @@
|
||||
movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; HSUMPS - horizontal sum of packed single-precision floats.
;   arg 1: dst/src, vector whose elements are added together; on exit the
;          total is replicated into all of its elements
;   arg 2: tmp, scratch register of the same size (overwritten)
%macro HSUMPS 2 ; dst/src, tmp
%if notcpuflag(avx)
    ; Two-operand form with explicit copies; the original author notes this
    ; is a bit faster than emulating the shorter avx-like sequence.
    movaps      %2, %1
    shufps      %1, %1, q1032       ; swap the 64-bit halves
    addps       %1, %2
    movaps      %2, %1
    shufps      %1, %1, q0321       ; rotate by one element
    addps       %1, %2
    ; all elements of dst are equal as long as float a+b == b+a
%else
    %if sizeof%1 >= 32
        ; ymm: first fold the two 128-bit lanes onto each other
        vperm2f128  %2, %1, %1, (0)*16+(1)  ; tmp = dst with lanes swapped
        addps       %1, %2
    %endif
    shufps      %2, %1, %1, q1032   ; tmp = dst with 64-bit halves swapped
    addps       %1, %2
    shufps      %2, %1, %1, q0321   ; tmp = dst rotated by one element
    addps       %1, %2
%endif
%endmacro
|
||||
|
||||
; BLENDVPS - variable blend of packed single-precision floats, emulated when
; blendvps is not available.
;   arg 1: dst/src_a, kept where the mask element is clear
;   arg 2: src_b, taken where the mask element is set
;   arg 3: mask
;
; Constraints per instruction set:
;   avx    - native instruction, any register may hold the mask.
;   sse4.1 - blendvps is hard coded to use xmm0 as the mask; any other mask
;            operand stops assembly with an error.
;   older  - emulated with xor/and/xor. src_b is destroyed, and the selection
;            is done bit by bit, so each mask element should be all ones or
;            all zeros to match the native instruction (which only looks at
;            the sign bit of each dword).
%macro BLENDVPS 3 ; dst/src_a, src_b, mask
%if cpuflag(avx)
    blendvps %1, %1, %2, %3
%elif cpuflag(sse4)
    %ifnidn %3,xmm0
        %error sse41 blendvps uses xmm0 as default 3rd operand, you used %3
    %endif
    blendvps %1, %2, %3
%else
    ; dst = a ^ ((b ^ a) & mask)
    xorps    %2, %1
    andps    %2, %3
    xorps    %1, %2
%endif
%endmacro
|
||||
|
||||
; PBLENDVB - variable blend of packed bytes, emulated when pblendvb is not
; available.
;   arg 1: dst/src_a, kept where the mask element is clear
;   arg 2: src_b, taken where the mask element is set
;   arg 3: mask
;
; Constraints per instruction set:
;   avx2   - native instruction on xmm or ymm.
;   avx1   - native instruction on xmm only; a ymm destination stops assembly
;            with an error.
;   sse4.1 - pblendvb is hard coded to use xmm0 as the mask; any other mask
;            operand stops assembly with an error.
;   older  - emulated with pxor/pand/pxor. src_b is destroyed, and the
;            selection is done bit by bit, so each mask byte should be all
;            ones or all zeros to match the native instruction (which only
;            looks at the top bit of each byte).
%macro PBLENDVB 3 ; dst/src_a, src_b, mask
%if cpuflag(avx)
    %if notcpuflag(avx2) && sizeof%1 >= 32
        %error pblendvb not possible with ymm on avx1, try blendvps.
    %endif
    pblendvb %1, %1, %2, %3
%elif cpuflag(sse4)
    %ifnidn %3,xmm0
        %error sse41 pblendvb uses xmm0 as default 3rd operand, you used %3
    %endif
    pblendvb %1, %2, %3
%else
    ; dst = a ^ ((b ^ a) & mask)
    pxor     %2, %1
    pand     %2, %3
    pxor     %1, %2
%endif
%endmacro
|
||||
|
Loading…
Reference in New Issue
Block a user