Add macros to x86util.asm.

Improved version of VBROADCASTSS that works like the avx2 instruction. Emulation of vpbroadcastd. Horizontal sum HSUMPS that places the result in all elements. Emulation of blendvps and pblendvb. Signed-off-by: Ivan Kalvachev <ikalvachev@gmail.com>
2024-12-23 12:43:46 +02:00 · 2017-08-05 20:18:50 +03:00 · 2017-08-05 20:18:50 +03:00 · 30ae07d7ef
commit 30ae07d7ef
parent cadab5a2a7
1 changed files with 98 additions and 8 deletions
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@ -832,14 +832,25 @@
    pmaxsd  %1, %2
 %endmacro
-%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
+%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
 ; Broadcast one 32-bit float into every element of dst.
 ;   %1 = destination register (xmm or ymm)
 ;   %2 = source: a 32-bit memory operand or a register (element 0 is used)
 ; "%ifnum sizeof%2" is used below to tell a register source from a memory
 ; source. NOTE(review): this relies on sizeof<reg> being defined as a number
 ; for registers only; those definitions are outside this view - confirm.
-%if cpuflag(avx)
+%if cpuflag(avx2)
-    vbroadcastss %1, %2
+    vbroadcastss  %1, %2
-%else ; sse
+%elif cpuflag(avx)
-%ifnidn %1, %2
+    %ifnum sizeof%2         ; avx1 register
-    movss        %1, %2
+        shufps  xmm%1, xmm%2, xmm%2, q0000
-%endif
+        %if sizeof%1 >= 32  ; mmsize>=32
-    shufps       %1, %1, 0
+            vinsertf128  %1, %1, xmm%1, 1
        %endif
    ; The two instructions above replace the register-source form of
    ; vbroadcastss, which this macro only emits directly under avx2: fill the
    ; low 128 bits with shufps, then copy them into the upper half of a ymm dst.
    %else                   ; avx1 memory
        vbroadcastss  %1, %2
    %endif
 %else
    %ifnum sizeof%2         ; sse register
        ; three-operand form; NOTE(review): presumably turned into mov+shufps
        ; by the instruction wrappers defined outside this view - confirm.
        shufps  %1, %2, %2, q0000
    %else                   ; sse memory
        ; load the scalar into element 0, then replicate element 0
        movss   %1, %2
        shufps  %1, %1, 0
    %endif
 %endif
 %endmacro
@ -854,6 +865,21 @@
 %endif
 %endmacro
 ; Broadcast one 32-bit integer into every element of dst, using the native
 ; vpbroadcastd when avx2 is enabled and a movd/pshufd emulation otherwise.
 ;   %1 = destination register (xmm or ymm)
 ;   %2 = source: a 32-bit memory operand or a register (element 0 is used)
 ; A ymm destination without avx2 is rejected at assemble time.
 %macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm
 %if cpuflag(avx2)
    vpbroadcastd  %1, %2
 %elif cpuflag(avx) && sizeof%1 >= 32
    ; no emulation is provided for 256-bit destinations; stop with an error
    %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss
 %else
    ; NOTE(review): "%ifnum sizeof%2" relies on sizeof<reg> being numeric for
    ; registers only; those definitions are outside this view - confirm.
    %ifnum sizeof%2         ; register source: replicate element 0 directly
        pshufd  %1, %2, q0000
    %else                   ; memory source: load the dword, then replicate it
        movd    %1, %2
        pshufd  %1, %1, 0
    %endif
 %endif
 %endmacro
 %macro SHUFFLE_MASK_W 8
    %rep 8
        %if %1>=0x80
@ -918,3 +944,67 @@
    movhlps    %1, %2        ; may cause an int/float domain transition and has a dependency on dst
 %endif
 %endmacro
 ; Horizontal Sum of Packed Single precision floats
 ; The resulting sum is in all elements.
 ;   %1 = dst/src: the floats to add up; overwritten with the sum
 ;   %2 = tmp: scratch register, clobbered
 ; NOTE(review): q1032 and q0321 are shuffle constants defined outside this
 ; view; the comments below assume q1032 swaps the two 64-bit halves and q0321
 ; rotates the four elements by one position - confirm.
 %macro HSUMPS 2 ; dst/src, tmp
 %if cpuflag(avx)
    %if sizeof%1>=32  ; 256-bit register: fold the two 128-bit lanes first
        vperm2f128  %2, %1, %1, (0)*16+(1)   ; tmp = src with its lanes swapped
        addps       %1, %2                   ; each lane = low lane + high lane
    %endif
    shufps      %2, %1, %1, q1032
    addps       %1, %2                       ; pairwise partial sums
    shufps      %2, %1, %1, q0321
    addps       %1, %2                       ; full sum in every element
 %else  ; this form is a bit faster than the short avx-like emulation.
    ; Same two shuffle+add steps, but dst is shuffled in place and tmp keeps
    ; the unshuffled copy, so the adds see their operands in the other order.
    movaps      %2, %1
    shufps      %1, %1, q1032
    addps       %1, %2
    movaps      %2, %1
    shufps      %1, %1, q0321
    addps       %1, %2
    ; all %1 members should be equal for as long as float a+b==b+a
 %endif
 %endmacro
 ; Emulate blendvps if not available
 ;
 ;   %1 = dst/src_a: kept where the mask does not select, receives the result
 ;   %2 = src_b:     taken where the mask selects
 ;   %3 = mask
 ;
 ; src_b is destroyed when using emulation with logical operands
 ; The emulation computes dst ^= (src_b ^ dst) & mask, i.e. it selects bit by
 ; bit. NOTE(review): the blendvps instruction looks only at the sign bit of
 ; each mask element, so the mask presumably has to be all-ones or all-zeros
 ; per element for both paths to agree - confirm against callers.
 ; SSE41 blendv instruction is hard coded to use xmm0 as mask
 %macro BLENDVPS 3 ; dst/src_a, src_b, mask
 %if cpuflag(avx)
    ; four-operand form: any register may hold the mask
    blendvps  %1, %1, %2, %3
 %elif cpuflag(sse4)
    ; refuse to assemble when the caller's mask is not xmm0
    %ifnidn %3,xmm0
        %error sse41 blendvps uses xmm0 as default 3d operand, you used %3
    %endif
    blendvps  %1, %2, %3
 %else
    xorps  %2, %1   ; src_b = src_a ^ src_b  (src_b is overwritten here)
    andps  %2, %3   ; keep only the differing bits selected by the mask
    xorps  %1, %2   ; flip those bits in dst, turning them into src_b's bits
 %endif
 %endmacro
 ; Emulate pblendvb if not available
 ;
 ;   %1 = dst/src_a: kept where the mask does not select, receives the result
 ;   %2 = src_b:     taken where the mask selects
 ;   %3 = mask
 ;
 ; src_b is destroyed when using emulation with logical operands
 ; The emulation computes dst ^= (src_b ^ dst) & mask, i.e. it selects bit by
 ; bit. NOTE(review): the pblendvb instruction looks only at the top bit of
 ; each mask byte, so the mask presumably has to be all-ones or all-zeros per
 ; byte for both paths to agree - confirm against callers.
 ; SSE41 blendv instruction is hard coded to use xmm0 as mask
 %macro PBLENDVB 3 ; dst/src_a, src_b, mask
 %if cpuflag(avx)
    ; a ymm destination is only accepted when avx2 is enabled
    %if notcpuflag(avx2) && sizeof%1 >= 32
        %error pblendvb not possible with ymm on avx1, try blendvps.
    %endif
    ; four-operand form: any register may hold the mask
    pblendvb  %1, %1, %2, %3
 %elif cpuflag(sse4)
    ; refuse to assemble when the caller's mask is not xmm0
    %ifnidn %3,xmm0
        %error sse41 pblendvb uses xmm0 as default 3d operand, you used %3
    %endif
    pblendvb  %1, %2, %3
 %else
    pxor  %2, %1    ; src_b = src_a ^ src_b  (src_b is overwritten here)
    pand  %2, %3    ; keep only the differing bits selected by the mask
    pxor  %1, %2    ; flip those bits in dst, turning them into src_b's bits
 %endif
 %endmacro