mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
avutil/mips: optimize UNPCK&SAD macros with MSA2.0 instruction.
Loongson 3A4000 and 2k1000 support MSA2.0. This patch optimizes SAD_UB2_UH, UNPCK_R_SH_SW, UNPCK_SB_SH and UNPCK_SH_SW with MSA2.0 instructions. Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
parent
24f7a8a168
commit
a45e8ade2d
5
configure
vendored
5
configure
vendored
@ -441,6 +441,7 @@ Optimization options (experts only):
|
|||||||
--disable-mipsdsp disable MIPS DSP ASE R1 optimizations
|
--disable-mipsdsp disable MIPS DSP ASE R1 optimizations
|
||||||
--disable-mipsdspr2 disable MIPS DSP ASE R2 optimizations
|
--disable-mipsdspr2 disable MIPS DSP ASE R2 optimizations
|
||||||
--disable-msa disable MSA optimizations
|
--disable-msa disable MSA optimizations
|
||||||
|
--disable-msa2 disable MSA2 optimizations
|
||||||
--disable-mipsfpu disable floating point MIPS optimizations
|
--disable-mipsfpu disable floating point MIPS optimizations
|
||||||
--disable-mmi disable Loongson SIMD optimizations
|
--disable-mmi disable Loongson SIMD optimizations
|
||||||
--disable-fast-unaligned consider unaligned accesses slow
|
--disable-fast-unaligned consider unaligned accesses slow
|
||||||
@ -1999,6 +2000,7 @@ ARCH_EXT_LIST_MIPS="
|
|||||||
mipsdsp
|
mipsdsp
|
||||||
mipsdspr2
|
mipsdspr2
|
||||||
msa
|
msa
|
||||||
|
msa2
|
||||||
"
|
"
|
||||||
|
|
||||||
ARCH_EXT_LIST_LOONGSON="
|
ARCH_EXT_LIST_LOONGSON="
|
||||||
@ -2527,6 +2529,7 @@ mipsdsp_deps="mips"
|
|||||||
mipsdspr2_deps="mips"
|
mipsdspr2_deps="mips"
|
||||||
mmi_deps="mips"
|
mmi_deps="mips"
|
||||||
msa_deps="mipsfpu"
|
msa_deps="mipsfpu"
|
||||||
|
msa2_deps="msa"
|
||||||
|
|
||||||
cpunop_deps="i686"
|
cpunop_deps="i686"
|
||||||
x86_64_select="i686"
|
x86_64_select="i686"
|
||||||
@ -5753,6 +5756,7 @@ elif enabled mips; then
|
|||||||
enabled mipsfpu && enabled msa && check_inline_asm_flags msa '"addvi.b $w0, $w1, 1"' '-mmsa' && check_headers msa.h || disable msa
|
enabled mipsfpu && enabled msa && check_inline_asm_flags msa '"addvi.b $w0, $w1, 1"' '-mmsa' && check_headers msa.h || disable msa
|
||||||
enabled mipsdsp && check_inline_asm_flags mipsdsp '"addu.qb $t0, $t1, $t2"' '-mdsp'
|
enabled mipsdsp && check_inline_asm_flags mipsdsp '"addu.qb $t0, $t1, $t2"' '-mdsp'
|
||||||
enabled mipsdspr2 && check_inline_asm_flags mipsdspr2 '"absq_s.qb $t0, $t1"' '-mdspr2'
|
enabled mipsdspr2 && check_inline_asm_flags mipsdspr2 '"absq_s.qb $t0, $t1"' '-mdspr2'
|
||||||
|
enabled msa && enabled msa2 && check_inline_asm_flags msa2 '"nxbits.any.b $w0, $w0"' '-mmsa2' && check_headers msa2.h || disable msa2
|
||||||
|
|
||||||
if enabled bigendian && enabled msa; then
|
if enabled bigendian && enabled msa; then
|
||||||
disable msa
|
disable msa
|
||||||
@ -7128,6 +7132,7 @@ if enabled mips; then
|
|||||||
echo "MIPS DSP R1 enabled ${mipsdsp-no}"
|
echo "MIPS DSP R1 enabled ${mipsdsp-no}"
|
||||||
echo "MIPS DSP R2 enabled ${mipsdspr2-no}"
|
echo "MIPS DSP R2 enabled ${mipsdspr2-no}"
|
||||||
echo "MIPS MSA enabled ${msa-no}"
|
echo "MIPS MSA enabled ${msa-no}"
|
||||||
|
echo "MIPS MSA2 enabled ${msa2-no}"
|
||||||
echo "LOONGSON MMI enabled ${mmi-no}"
|
echo "LOONGSON MMI enabled ${mmi-no}"
|
||||||
fi
|
fi
|
||||||
if enabled ppc; then
|
if enabled ppc; then
|
||||||
|
@ -23,6 +23,11 @@
|
|||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <msa.h>
|
#include <msa.h>
|
||||||
|
#include <config.h>
|
||||||
|
|
||||||
|
#if HAVE_MSA2
|
||||||
|
#include <msa2.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#define ALIGNMENT 16
|
#define ALIGNMENT 16
|
||||||
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
|
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
|
||||||
@ -1234,6 +1239,15 @@
|
|||||||
unsigned absolute diff values, even-odd pairs are added
|
unsigned absolute diff values, even-odd pairs are added
|
||||||
together to generate 8 halfword results.
|
together to generate 8 halfword results.
|
||||||
*/
|
*/
|
||||||
|
#if HAVE_MSA2
|
||||||
|
/* MSA2 variant of SAD_UB2_UH.
   Starts from a zero-initialised v8u16 accumulator, adds the result of
   __builtin_msa2_sad_adj2_u_w2x_b() for the pair (in0, ref0) and then for
   the pair (in1, ref1), and yields the accumulator as the value of the
   statement expression.
   NOTE(review): the builtin presumably performs the unsigned absolute
   difference plus adjacent-pair add in one instruction - confirm against
   the MSA2 documentation.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1) \
|
||||||
|
( { \
|
||||||
|
v8u16 sad_m = { 0 }; \
|
||||||
|
sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in0, (v16u8) ref0); \
|
||||||
|
sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in1, (v16u8) ref1); \
|
||||||
|
sad_m; \
|
||||||
|
} )
|
||||||
|
#else
|
||||||
#define SAD_UB2_UH(in0, in1, ref0, ref1) \
|
#define SAD_UB2_UH(in0, in1, ref0, ref1) \
|
||||||
( { \
|
( { \
|
||||||
v16u8 diff0_m, diff1_m; \
|
v16u8 diff0_m, diff1_m; \
|
||||||
@ -1247,6 +1261,7 @@
|
|||||||
\
|
\
|
||||||
sad_m; \
|
sad_m; \
|
||||||
} )
|
} )
|
||||||
|
#endif // #if HAVE_MSA2
|
||||||
|
|
||||||
/* Description : Insert specified word elements from input vectors to 1
|
/* Description : Insert specified word elements from input vectors to 1
|
||||||
destination vector
|
destination vector
|
||||||
@ -2287,6 +2302,12 @@
|
|||||||
extracted and interleaved with same vector 'in0' to generate
|
extracted and interleaved with same vector 'in0' to generate
|
||||||
4 word elements keeping sign intact
|
4 word elements keeping sign intact
|
||||||
*/
|
*/
|
||||||
|
#if HAVE_MSA2
|
||||||
|
/* MSA2 variant of UNPCK_R_SH_SW.
   Passes 'in' (cast to v8i16) to __builtin_msa2_w2x_lo_s_h() and stores the
   result, cast to v4i32, in 'out'.
   NOTE(review): the builtin presumably sign-extends the right (low) half of
   the halfword elements to words - confirm against the MSA2 documentation.
*/
#define UNPCK_R_SH_SW(in, out) \
|
||||||
|
{ \
|
||||||
|
out = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \
|
||||||
|
}
|
||||||
|
#else
|
||||||
#define UNPCK_R_SH_SW(in, out) \
|
#define UNPCK_R_SH_SW(in, out) \
|
||||||
{ \
|
{ \
|
||||||
v8i16 sign_m; \
|
v8i16 sign_m; \
|
||||||
@ -2294,6 +2315,7 @@
|
|||||||
sign_m = __msa_clti_s_h((v8i16) in, 0); \
|
sign_m = __msa_clti_s_h((v8i16) in, 0); \
|
||||||
out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in); \
|
out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in); \
|
||||||
}
|
}
|
||||||
|
#endif // #if HAVE_MSA2
|
||||||
|
|
||||||
/* Description : Sign extend byte elements from input vector and return
|
/* Description : Sign extend byte elements from input vector and return
|
||||||
halfword results in pair of vectors
|
halfword results in pair of vectors
|
||||||
@ -2306,6 +2328,13 @@
|
|||||||
Then interleaved left with same vector 'in0' to
|
Then interleaved left with same vector 'in0' to
|
||||||
generate 8 signed halfword elements in 'out1'
|
generate 8 signed halfword elements in 'out1'
|
||||||
*/
|
*/
|
||||||
|
#if HAVE_MSA2
|
||||||
|
/* MSA2 variant of UNPCK_SB_SH.
   Arguments : Input   - in          (signed byte vector)
               Outputs - out0, out1  (halfword vectors)
   Details   : 'out0' receives the result of __builtin_msa2_w2x_lo_s_b(in)
               and 'out1' the result of __builtin_msa2_w2x_hi_s_b(in).
               Both results are cast to v8i16 because this macro produces
               halfword elements; a v4i32 cast here would only be accepted
               through lax vector conversions when the outputs are v8i16.
               NOTE(review): the builtins presumably sign-extend the low and
               high byte halves respectively - confirm against the MSA2
               documentation.
*/
#define UNPCK_SB_SH(in, out0, out1) \
{ \
out0 = (v8i16) __builtin_msa2_w2x_lo_s_b((v16i8) in); \
out1 = (v8i16) __builtin_msa2_w2x_hi_s_b((v16i8) in); \
}
|
||||||
|
#else
|
||||||
#define UNPCK_SB_SH(in, out0, out1) \
|
#define UNPCK_SB_SH(in, out0, out1) \
|
||||||
{ \
|
{ \
|
||||||
v16i8 tmp_m; \
|
v16i8 tmp_m; \
|
||||||
@ -2313,6 +2342,7 @@
|
|||||||
tmp_m = __msa_clti_s_b((v16i8) in, 0); \
|
tmp_m = __msa_clti_s_b((v16i8) in, 0); \
|
||||||
ILVRL_B2_SH(tmp_m, in, out0, out1); \
|
ILVRL_B2_SH(tmp_m, in, out0, out1); \
|
||||||
}
|
}
|
||||||
|
#endif // #if HAVE_MSA2
|
||||||
|
|
||||||
/* Description : Zero extend unsigned byte elements to halfword elements
|
/* Description : Zero extend unsigned byte elements to halfword elements
|
||||||
Arguments : Inputs - in (1 input unsigned byte vector)
|
Arguments : Inputs - in (1 input unsigned byte vector)
|
||||||
@ -2339,6 +2369,13 @@
|
|||||||
Then interleaved left with same vector 'in0' to
|
Then interleaved left with same vector 'in0' to
|
||||||
generate 4 signed word elements in 'out1'
|
generate 4 signed word elements in 'out1'
|
||||||
*/
|
*/
|
||||||
|
#if HAVE_MSA2
|
||||||
|
/* MSA2 variant of UNPCK_SH_SW.
   Passes 'in' (cast to v8i16) to __builtin_msa2_w2x_lo_s_h() for 'out0' and
   to __builtin_msa2_w2x_hi_s_h() for 'out1'; both results are cast to v4i32.
   NOTE(review): the builtins presumably sign-extend the low and high
   halfword halves to words respectively - confirm against the MSA2
   documentation.
*/
#define UNPCK_SH_SW(in, out0, out1) \
|
||||||
|
{ \
|
||||||
|
out0 = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \
|
||||||
|
out1 = (v4i32) __builtin_msa2_w2x_hi_s_h((v8i16) in); \
|
||||||
|
}
|
||||||
|
#else
|
||||||
#define UNPCK_SH_SW(in, out0, out1) \
|
#define UNPCK_SH_SW(in, out0, out1) \
|
||||||
{ \
|
{ \
|
||||||
v8i16 tmp_m; \
|
v8i16 tmp_m; \
|
||||||
@ -2346,6 +2383,7 @@
|
|||||||
tmp_m = __msa_clti_s_h((v8i16) in, 0); \
|
tmp_m = __msa_clti_s_h((v8i16) in, 0); \
|
||||||
ILVRL_H2_SW(tmp_m, in, out0, out1); \
|
ILVRL_H2_SW(tmp_m, in, out0, out1); \
|
||||||
}
|
}
|
||||||
|
#endif // #if HAVE_MSA2
|
||||||
|
|
||||||
/* Description : Swap two variables
|
/* Description : Swap two variables
|
||||||
Arguments : Inputs - in0, in1
|
Arguments : Inputs - in0, in1
|
||||||
@ -2850,13 +2888,11 @@
|
|||||||
*/
|
*/
|
||||||
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
|
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
|
||||||
( { \
|
( { \
|
||||||
v8i16 tmp1_m; \
|
|
||||||
v8i16 out0_m; \
|
v8i16 out0_m; \
|
||||||
\
|
\
|
||||||
out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
|
out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
|
||||||
out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
|
out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
|
||||||
tmp1_m = __msa_dotp_s_h((v16i8) in2, (v16i8) coeff2); \
|
out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \
|
||||||
out0_m = __msa_adds_s_h(out0_m, tmp1_m); \
|
|
||||||
\
|
\
|
||||||
out0_m; \
|
out0_m; \
|
||||||
} )
|
} )
|
||||||
|
Loading…
x
Reference in New Issue
Block a user