From 30ae07d7ef3555ec45fa53098849223fff204475 Mon Sep 17 00:00:00 2001
From: Ivan Kalvachev <ikalvachev@gmail.com>
Date: Sat, 5 Aug 2017 20:18:50 +0300
Subject: [PATCH] Add macros to x86util.asm .

Improved version of VBROADCASTSS that works like the avx2 instruction.
Emulation of vpbroadcastd.
Horizontal sum HSUMPS that places the result in all elements.
Emulation of blendvps and pblendvb.

Signed-off-by: Ivan Kalvachev <ikalvachev@gmail.com>
---
 libavutil/x86/x86util.asm | 106 +++++++++++++++++++++++++++++++++++---
 1 file changed, 98 insertions(+), 8 deletions(-)

diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index cc7d272cad..e1220dfc1a 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -832,14 +832,25 @@
     pmaxsd  %1, %2
 %endmacro
 
-%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
-%if cpuflag(avx)
-    vbroadcastss %1, %2
-%else ; sse
-%ifnidn %1, %2
-    movss        %1, %2
-%endif
-    shufps       %1, %1, 0
+%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+    vbroadcastss  %1, %2
+%elif cpuflag(avx)
+    %ifnum sizeof%2         ; avx1 register
+        shufps  xmm%1, xmm%2, xmm%2, q0000
+        %if sizeof%1 >= 32  ; mmsize>=32
+            vinsertf128  %1, %1, xmm%1, 1
+        %endif
+    %else                   ; avx1 memory
+        vbroadcastss  %1, %2
+    %endif
+%else
+    %ifnum sizeof%2         ; sse register
+        shufps  %1, %2, %2, q0000
+    %else                   ; sse memory
+        movss   %1, %2
+        shufps  %1, %1, 0
+    %endif
 %endif
 %endmacro
 
@@ -854,6 +865,21 @@
 %endif
 %endmacro
 
+%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+    vpbroadcastd  %1, %2
+%elif cpuflag(avx) && sizeof%1 >= 32
+    %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss
+%else
+    %ifnum sizeof%2         ; sse2 register
+        pshufd  %1, %2, q0000
+    %else                   ; sse memory
+        movd    %1, %2
+        pshufd  %1, %1, 0
+    %endif
+%endif
+%endmacro
+
 %macro SHUFFLE_MASK_W 8
     %rep 8
         %if %1>=0x80
@@ -918,3 +944,67 @@
     movhlps    %1, %2        ; may cause an int/float domain transition and has a dependency on dst
 %endif
 %endmacro
+
+; Horizontal Sum of Packed Single precision floats
+; The resulting sum is in all elements.
+%macro HSUMPS 2 ; dst/src, tmp
+%if cpuflag(avx)
+    %if sizeof%1>=32  ; avx
+        vperm2f128  %2, %1, %1, (0)*16+(1)
+        addps       %1, %2
+    %endif
+    shufps      %2, %1, %1, q1032
+    addps       %1, %2
+    shufps      %2, %1, %1, q0321
+    addps       %1, %2
+%else  ; this form is a bit faster than the short avx-like emulation.
+    movaps      %2, %1
+    shufps      %1, %1, q1032
+    addps       %1, %2
+    movaps      %2, %1
+    shufps      %1, %1, q0321
+    addps       %1, %2
+    ; all %1 members should be equal for as long as float a+b==b+a
+%endif
+%endmacro
+
+; Emulate blendvps if not available
+;
+; src_b is destroyed when using emulation with logical operands
+; SSE41 blendv instruction is hard coded to use xmm0 as mask
+%macro BLENDVPS 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+    blendvps  %1, %1, %2, %3
+%elif cpuflag(sse4)
+    %ifnidn %3,xmm0
+        %error sse41 blendvps uses xmm0 as default 3d operand, you used %3
+    %endif
+    blendvps  %1, %2, %3
+%else
+    xorps  %2, %1
+    andps  %2, %3
+    xorps  %1, %2
+%endif
+%endmacro
+
+; Emulate pblendvb if not available
+;
+; src_b is destroyed when using emulation with logical operands
+; SSE41 blendv instruction is hard coded to use xmm0 as mask
+%macro PBLENDVB 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+    %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32
+        %error pblendb not possible with ymm on avx1, try blendvps.
+    %endif
+    pblendvb  %1, %1, %2, %3
+%elif cpuflag(sse4)
+    %ifnidn %3,xmm0
+        %error sse41 pblendvd uses xmm0 as default 3d operand, you used %3
+    %endif
+    pblendvb  %1, %2, %3
+%else
+    pxor  %2, %1
+    pand  %2, %3
+    pxor  %1, %2
+%endif
+%endmacro