mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for mpegvideo functions
This patch adds MSA (MIPS-SIMD-Arch) optimizations for mpegvideo functions in new file mpegvideo_msa.c Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
53fd70579b
commit
2eb28e889d
@ -29,6 +29,7 @@ OBJS-$(CONFIG_QPELDSP) += mips/qpeldsp_init_mips.o
|
||||
OBJS-$(CONFIG_HPELDSP) += mips/hpeldsp_init_mips.o
|
||||
OBJS-$(CONFIG_BLOCKDSP) += mips/blockdsp_init_mips.o
|
||||
OBJS-$(CONFIG_PIXBLOCKDSP) += mips/pixblockdsp_init_mips.o
|
||||
OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_init_mips.o
|
||||
MSA-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_msa.o \
|
||||
mips/hevc_mc_uni_msa.o \
|
||||
mips/hevc_mc_uniw_msa.o \
|
||||
@ -47,5 +48,6 @@ MSA-OBJS-$(CONFIG_QPELDSP) += mips/qpeldsp_msa.o
|
||||
MSA-OBJS-$(CONFIG_HPELDSP) += mips/hpeldsp_msa.o
|
||||
MSA-OBJS-$(CONFIG_BLOCKDSP) += mips/blockdsp_msa.o
|
||||
MSA-OBJS-$(CONFIG_PIXBLOCKDSP) += mips/pixblockdsp_msa.o
|
||||
MSA-OBJS-$(CONFIG_MPEGVIDEO) += mips/mpegvideo_msa.o
|
||||
LOONGSON3-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_mmi.o
|
||||
LOONGSON3-OBJS-$(CONFIG_H264CHROMA) += mips/h264chroma_mmi.o
|
||||
|
37
libavcodec/mips/mpegvideo_init_mips.c
Normal file
37
libavcodec/mips/mpegvideo_init_mips.c
Normal file
@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "h263dsp_mips.h"
|
||||
|
||||
#if HAVE_MSA
|
||||
/**
 * Install the MSA dequantization routines into an MpegEncContext.
 *
 * Three hooks are overwritten: the H.263 intra and inter dequantizers
 * and the MPEG-2 inter dequantizer.
 *
 * @param s context whose dct_unquantize_* function pointers are set
 */
static av_cold void dct_unquantize_init_msa(MpegEncContext *s)
{
    /* MPEG-2 inter hook */
    s->dct_unquantize_mpeg2_inter = ff_dct_unquantize_mpeg2_inter_msa;

    /* H.263 hooks */
    s->dct_unquantize_h263_inter  = ff_dct_unquantize_h263_inter_msa;
    s->dct_unquantize_h263_intra  = ff_dct_unquantize_h263_intra_msa;
}
|
||||
#endif // #if HAVE_MSA
|
||||
|
||||
/**
 * MIPS-specific initialization of the mpegvideo function pointers.
 *
 * When the build defines HAVE_MSA to a non-zero value the MSA
 * dequantizers are installed; otherwise the context is left untouched.
 *
 * @param s context to initialize
 */
av_cold void ff_mpv_common_init_mips(MpegEncContext *s)
{
#if HAVE_MSA
    dct_unquantize_init_msa(s);
#endif /* HAVE_MSA */
}
|
250
libavcodec/mips/mpegvideo_msa.c
Normal file
250
libavcodec/mips/mpegvideo_msa.c
Normal file
@ -0,0 +1,250 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/mips/generic_macros_msa.h"
|
||||
#include "h263dsp_mips.h"
|
||||
|
||||
static void h263_dct_unquantize_msa(int16_t *block, int16_t qmul,
|
||||
int16_t qadd, int8_t n_coeffs,
|
||||
uint8_t loop_start)
|
||||
{
|
||||
int16_t *block_dup = block;
|
||||
int32_t level, cnt;
|
||||
v8i16 block_vec, qmul_vec, qadd_vec, sub;
|
||||
v8i16 add, mask, mul, zero_mask;
|
||||
|
||||
qmul_vec = __msa_fill_h(qmul);
|
||||
qadd_vec = __msa_fill_h(qadd);
|
||||
for (cnt = 0; cnt < (n_coeffs >> 3); cnt++) {
|
||||
block_vec = LD_SH(block_dup + loop_start);
|
||||
mask = __msa_clti_s_h(block_vec, 0);
|
||||
zero_mask = __msa_ceqi_h(block_vec, 0);
|
||||
mul = block_vec * qmul_vec;
|
||||
sub = mul - qadd_vec;
|
||||
add = mul + qadd_vec;
|
||||
add = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) sub, (v16u8) mask);
|
||||
block_vec = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) block_vec,
|
||||
(v16u8) zero_mask);
|
||||
ST_SH(block_vec, block_dup + loop_start);
|
||||
block_dup += 8;
|
||||
}
|
||||
|
||||
cnt = ((n_coeffs >> 3) * 8) + loop_start;
|
||||
|
||||
for (; cnt <= n_coeffs; cnt++) {
|
||||
level = block[cnt];
|
||||
if (level) {
|
||||
if (level < 0) {
|
||||
level = level * qmul - qadd;
|
||||
} else {
|
||||
level = level * qmul + qadd;
|
||||
}
|
||||
block[cnt] = level;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block,
|
||||
int32_t qscale,
|
||||
const int16_t *quant_matrix)
|
||||
{
|
||||
int32_t cnt, sum_res = -1;
|
||||
v8i16 block_vec, block_neg, qscale_vec, mask;
|
||||
v8i16 block_org0, block_org1, block_org2, block_org3;
|
||||
v8i16 quant_m0, quant_m1, quant_m2, quant_m3;
|
||||
v8i16 sum, mul, zero_mask;
|
||||
v4i32 mul_vec, qscale_l, qscale_r, quant_m_r, quant_m_l;
|
||||
v4i32 block_l, block_r, sad;
|
||||
|
||||
qscale_vec = __msa_fill_h(qscale);
|
||||
for (cnt = 0; cnt < 2; cnt++) {
|
||||
LD_SH4(block, 8, block_org0, block_org1, block_org2, block_org3);
|
||||
LD_SH4(quant_matrix, 8, quant_m0, quant_m1, quant_m2, quant_m3);
|
||||
mask = __msa_clti_s_h(block_org0, 0);
|
||||
zero_mask = __msa_ceqi_h(block_org0, 0);
|
||||
block_neg = -block_org0;
|
||||
block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org0, (v16u8) block_neg,
|
||||
(v16u8) mask);
|
||||
block_vec <<= 1;
|
||||
block_vec += 1;
|
||||
UNPCK_SH_SW(block_vec, block_r, block_l);
|
||||
UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
|
||||
UNPCK_SH_SW(quant_m0, quant_m_r, quant_m_l);
|
||||
mul_vec = block_l * qscale_l;
|
||||
mul_vec *= quant_m_l;
|
||||
block_l = mul_vec >> 4;
|
||||
mul_vec = block_r * qscale_r;
|
||||
mul_vec *= quant_m_r;
|
||||
block_r = mul_vec >> 4;
|
||||
mul = (v8i16) __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
|
||||
block_neg = - mul;
|
||||
sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
|
||||
(v16u8) mask);
|
||||
sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org0,
|
||||
(v16u8) zero_mask);
|
||||
ST_SH(sum, block);
|
||||
block += 8;
|
||||
quant_matrix += 8;
|
||||
sad = __msa_hadd_s_w(sum, sum);
|
||||
sum_res += HADD_SW_S32(sad);
|
||||
mask = __msa_clti_s_h(block_org1, 0);
|
||||
zero_mask = __msa_ceqi_h(block_org1, 0);
|
||||
block_neg = - block_org1;
|
||||
block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org1, (v16u8) block_neg,
|
||||
(v16u8) mask);
|
||||
block_vec <<= 1;
|
||||
block_vec += 1;
|
||||
UNPCK_SH_SW(block_vec, block_r, block_l);
|
||||
UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
|
||||
UNPCK_SH_SW(quant_m1, quant_m_r, quant_m_l);
|
||||
mul_vec = block_l * qscale_l;
|
||||
mul_vec *= quant_m_l;
|
||||
block_l = mul_vec >> 4;
|
||||
mul_vec = block_r * qscale_r;
|
||||
mul_vec *= quant_m_r;
|
||||
block_r = mul_vec >> 4;
|
||||
mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
|
||||
block_neg = - mul;
|
||||
sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
|
||||
(v16u8) mask);
|
||||
sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org1,
|
||||
(v16u8) zero_mask);
|
||||
ST_SH(sum, block);
|
||||
|
||||
block += 8;
|
||||
quant_matrix += 8;
|
||||
sad = __msa_hadd_s_w(sum, sum);
|
||||
sum_res += HADD_SW_S32(sad);
|
||||
mask = __msa_clti_s_h(block_org2, 0);
|
||||
zero_mask = __msa_ceqi_h(block_org2, 0);
|
||||
block_neg = - block_org2;
|
||||
block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org2, (v16u8) block_neg,
|
||||
(v16u8) mask);
|
||||
block_vec <<= 1;
|
||||
block_vec += 1;
|
||||
UNPCK_SH_SW(block_vec, block_r, block_l);
|
||||
UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
|
||||
UNPCK_SH_SW(quant_m2, quant_m_r, quant_m_l);
|
||||
mul_vec = block_l * qscale_l;
|
||||
mul_vec *= quant_m_l;
|
||||
block_l = mul_vec >> 4;
|
||||
mul_vec = block_r * qscale_r;
|
||||
mul_vec *= quant_m_r;
|
||||
block_r = mul_vec >> 4;
|
||||
mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
|
||||
block_neg = - mul;
|
||||
sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
|
||||
(v16u8) mask);
|
||||
sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org2,
|
||||
(v16u8) zero_mask);
|
||||
ST_SH(sum, block);
|
||||
|
||||
block += 8;
|
||||
quant_matrix += 8;
|
||||
sad = __msa_hadd_s_w(sum, sum);
|
||||
sum_res += HADD_SW_S32(sad);
|
||||
mask = __msa_clti_s_h(block_org3, 0);
|
||||
zero_mask = __msa_ceqi_h(block_org3, 0);
|
||||
block_neg = - block_org3;
|
||||
block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org3, (v16u8) block_neg,
|
||||
(v16u8) mask);
|
||||
block_vec <<= 1;
|
||||
block_vec += 1;
|
||||
UNPCK_SH_SW(block_vec, block_r, block_l);
|
||||
UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
|
||||
UNPCK_SH_SW(quant_m3, quant_m_r, quant_m_l);
|
||||
mul_vec = block_l * qscale_l;
|
||||
mul_vec *= quant_m_l;
|
||||
block_l = mul_vec >> 4;
|
||||
mul_vec = block_r * qscale_r;
|
||||
mul_vec *= quant_m_r;
|
||||
block_r = mul_vec >> 4;
|
||||
mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
|
||||
block_neg = - mul;
|
||||
sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
|
||||
(v16u8) mask);
|
||||
sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org3,
|
||||
(v16u8) zero_mask);
|
||||
ST_SH(sum, block);
|
||||
|
||||
block += 8;
|
||||
quant_matrix += 8;
|
||||
sad = __msa_hadd_s_w(sum, sum);
|
||||
sum_res += HADD_SW_S32(sad);
|
||||
}
|
||||
|
||||
return sum_res;
|
||||
}
|
||||
|
||||
/**
 * H.263 intra dequantization hook (MSA).
 *
 * When s->h263_aic is not set, the DC coefficient block[0] is scaled by
 * the luma (index < 4) or chroma DC scale and qadd is (qscale - 1) | 1;
 * when it is set, the DC coefficient is left alone and qadd is 0.
 * Coefficients 1..last are then dequantized, where last is 63 if
 * s->ac_pred is set and otherwise comes from the inter scan table's
 * raster_end entry for this block's last index.
 *
 * @param s      codec context
 * @param block  coefficients, modified in place
 * @param index  block number within the macroblock
 * @param qscale quantizer scale
 */
void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
                                      int16_t *block, int32_t index,
                                      int32_t qscale)
{
    const int32_t qmul = qscale << 1;
    int32_t qadd    = 0;
    int32_t nCoeffs = 63;

    av_assert2(s->block_last_index[index] >= 0 || s->h263_aic);

    if (!s->h263_aic) {
        block[0] *= index < 4 ? s->y_dc_scale : s->c_dc_scale;
        qadd = (qscale - 1) | 1;
    }

    if (!s->ac_pred)
        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];

    /* start at index 1: the DC coefficient is handled above */
    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 1);
}
|
||||
|
||||
/**
 * H.263 inter dequantization hook (MSA).
 *
 * Dequantizes coefficients 0..last with qmul = 2 * qscale and
 * qadd = (qscale - 1) | 1, where last is the inter scan table's
 * raster_end entry for this block's last index.
 *
 * @param s      codec context
 * @param block  coefficients, modified in place
 * @param index  block number within the macroblock
 * @param qscale quantizer scale
 */
void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
                                      int16_t *block, int32_t index,
                                      int32_t qscale)
{
    int32_t last_coeff;

    av_assert2(s->block_last_index[index] >= 0);

    last_coeff = s->inter_scantable.raster_end[s->block_last_index[index]];

    h263_dct_unquantize_msa(block, qscale << 1, (qscale - 1) | 1,
                            last_coeff, 0);
}
|
||||
|
||||
/**
 * MPEG-2 inter dequantization hook (MSA).
 *
 * Dequantizes the block with s->inter_matrix and then toggles the least
 * significant bit of block[63] according to the parity of the value
 * returned by the helper.
 *
 * @param s      codec context; only s->inter_matrix is read
 * @param block  coefficients, modified in place
 * @param index  block number; unused here, kept for the hook signature
 * @param qscale quantizer scale
 */
void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s,
                                       int16_t *block, int32_t index,
                                       int32_t qscale)
{
    /*
     * The helper takes the matrix as signed halfwords, while the
     * original code held it in a const uint16_t pointer. Convert
     * explicitly instead of relying on an implicit conversion between
     * pointers of different signedness.
     */
    const int16_t *quant_matrix = (const int16_t *) s->inter_matrix;
    int32_t sum = mpeg2_dct_unquantize_inter_msa(block, qscale, quant_matrix);

    block[63] ^= sum & 1;
}
|
@ -312,6 +312,8 @@ static av_cold int dct_init(MpegEncContext *s)
|
||||
ff_mpv_common_init_ppc(s);
|
||||
if (ARCH_X86)
|
||||
ff_mpv_common_init_x86(s);
|
||||
if (ARCH_MIPS)
|
||||
ff_mpv_common_init_mips(s);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -611,6 +611,7 @@ void ff_mpv_common_init_axp(MpegEncContext *s);
|
||||
void ff_mpv_common_init_neon(MpegEncContext *s);
|
||||
void ff_mpv_common_init_ppc(MpegEncContext *s);
|
||||
void ff_mpv_common_init_x86(MpegEncContext *s);
|
||||
void ff_mpv_common_init_mips(MpegEncContext *s);
|
||||
|
||||
int ff_mpv_common_frame_size_change(MpegEncContext *s);
|
||||
void ff_mpv_common_end(MpegEncContext *s);
|
||||
|
@ -333,6 +333,7 @@
|
||||
LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
|
||||
LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \
|
||||
}
|
||||
#define LD_UB6(...) LD_B6(v16u8, __VA_ARGS__)
|
||||
#define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)
|
||||
|
||||
#define LD_B7(RTYPE, psrc, stride, \
|
||||
@ -341,6 +342,7 @@
|
||||
LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
|
||||
LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
|
||||
}
|
||||
#define LD_UB7(...) LD_B7(v16u8, __VA_ARGS__)
|
||||
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
|
||||
|
||||
#define LD_B8(RTYPE, psrc, stride, \
|
||||
@ -839,6 +841,14 @@
|
||||
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
|
||||
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
|
||||
|
||||
/* Three-vector form of SLDI_B2: the first two input pairs are forwarded
   to SLDI_B2, the third pair is passed to __msa_sldi_b directly.
   The third pair is parenthesized so that the (v16i8) cast applies to
   the whole argument even when an expression is passed.
*/
#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,              \
                out0, out1, out2, slide_val)                                  \
{                                                                             \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)         \
    out2 = (RTYPE) __msa_sldi_b((v16i8) (in0_2), (v16i8) (in1_2),             \
                                (slide_val));                                 \
}
|
||||
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
|
||||
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
|
||||
|
||||
/* Description : Shuffle byte vector elements as per mask vector
|
||||
Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
|
||||
@ -1086,6 +1096,28 @@
|
||||
}
|
||||
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1, (in place)
                 Return Type - unsigned halfword
   Details     : Minimum of unsigned halfword element values from 'in0' and
                 'min_vec' are written to output vector 'in0'; likewise
                 the minimum of 'in1' and 'min_vec' is written to 'in1'.
                 Arguments are parenthesized so that the casts apply to
                 the whole argument expression.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)                      \
{                                                              \
    in0 = (RTYPE) __msa_min_u_h((v8u16) (in0), (min_vec));     \
    in1 = (RTYPE) __msa_min_u_h((v8u16) (in1), (min_vec));     \
}
|
||||
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
|
||||
|
||||
/* Four-vector form of MIN_UH2: each of in0..in3 is replaced in place by
   the element-wise unsigned halfword minimum of itself and 'min_vec'.
*/
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)         \
{                                                           \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);      \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);      \
    in2 = (RTYPE) __msa_min_u_h((v8u16) in2, min_vec);      \
    in3 = (RTYPE) __msa_min_u_h((v8u16) in3, min_vec);      \
}
|
||||
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
|
||||
|
||||
/* Description : Clips all halfword elements of input vector between min & max
|
||||
out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
|
||||
Arguments : Inputs - in (input vector)
|
||||
@ -1145,6 +1177,46 @@
|
||||
out_m; \
|
||||
} )
|
||||
|
||||
/* Description : Addition of 4 signed word elements
                 4 signed word elements of input vector are added together and
                 resulted integer sum is returned
   Arguments   : Inputs  - in       (signed word vector)
                 Outputs - sum_m    (i32 sum)
                 Return Type - signed word
   Details     : 'in' is horizontally added with itself into doubleword
                 elements, the two doublewords are added, and word
                 element 0 of that result is returned.
                 'in' is parenthesized so that the cast applies to the
                 whole argument expression.
*/
#define HADD_SW_S32(in)                                   \
( {                                                       \
    v2i64 res0_m, res1_m;                                 \
    int32_t sum_m;                                        \
                                                          \
    res0_m = __msa_hadd_s_d((v4i32) (in), (v4i32) (in));  \
    res1_m = __msa_splati_d(res0_m, 1);                   \
    res0_m = res0_m + res1_m;                             \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);            \
    sum_m;                                                \
} )
|
||||
|
||||
/* Description : Addition of 8 unsigned halfword elements
                 8 unsigned halfword elements of input vector are added
                 together and resulted integer sum is returned
   Arguments   : Inputs  - in      (unsigned halfword vector)
                 Outputs - sum_m   (u32 sum)
                 Return Type - unsigned word
   Details     : 'in' is horizontally added with itself into word
                 elements, those into doubleword elements, the two
                 doublewords are added, and word element 0 of that
                 result is returned.
                 'in' is parenthesized so that the cast applies to the
                 whole argument expression.
*/
#define HADD_UH_U32(in)                                      \
( {                                                          \
    v4u32 res_m;                                             \
    v2u64 res0_m, res1_m;                                    \
    uint32_t sum_m;                                          \
                                                             \
    res_m = __msa_hadd_u_w((v8u16) (in), (v8u16) (in));      \
    res0_m = __msa_hadd_u_d(res_m, res_m);                   \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);      \
    res0_m = res0_m + res1_m;                                \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);               \
    sum_m;                                                   \
} )
|
||||
|
||||
/* Description : Horizontal addition of signed byte vector elements
|
||||
Arguments : Inputs - in0, in1
|
||||
Outputs - out0, out1
|
||||
@ -1305,7 +1377,10 @@
|
||||
out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \
|
||||
out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \
|
||||
}
|
||||
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
|
||||
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
|
||||
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
|
||||
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave even double word elements from vectors
|
||||
Arguments : Inputs - in0, in1, in2, in3
|
||||
@ -1339,7 +1414,9 @@
|
||||
out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
|
||||
out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3); \
|
||||
}
|
||||
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
|
||||
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
|
||||
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
|
||||
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
|
||||
|
||||
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
|
||||
@ -1348,6 +1425,7 @@
|
||||
ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
|
||||
ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
|
||||
}
|
||||
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
|
||||
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
|
||||
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
|
||||
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
|
||||
@ -1376,6 +1454,7 @@
|
||||
ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
|
||||
}
|
||||
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
|
||||
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave left half of word elements from vectors
|
||||
Arguments : Inputs - in0, in1, in2, in3
|
||||
@ -1391,7 +1470,9 @@
|
||||
out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
|
||||
out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3); \
|
||||
}
|
||||
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
|
||||
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
|
||||
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave right half of byte elements from vectors
|
||||
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
|
||||
@ -1478,6 +1559,7 @@
|
||||
ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
|
||||
}
|
||||
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
|
||||
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
|
||||
|
||||
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
|
||||
{ \
|
||||
@ -1486,6 +1568,7 @@
|
||||
}
|
||||
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
|
||||
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
|
||||
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
|
||||
|
||||
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
|
||||
out0, out1, out2, out3) \
|
||||
@ -1494,6 +1577,7 @@
|
||||
ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
|
||||
}
|
||||
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
|
||||
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave right half of double word elements from vectors
|
||||
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
|
||||
@ -1527,6 +1611,7 @@
|
||||
ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
|
||||
}
|
||||
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
|
||||
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave both left and right half of input vectors
|
||||
Arguments : Inputs - in0, in1
|
||||
@ -1579,6 +1664,7 @@
|
||||
in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, (max_val)); \
|
||||
in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, (max_val)); \
|
||||
}
|
||||
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
|
||||
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
|
||||
|
||||
#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val) \
|
||||
@ -1604,6 +1690,7 @@
|
||||
in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val); \
|
||||
}
|
||||
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
|
||||
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
|
||||
|
||||
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
|
||||
{ \
|
||||
@ -2133,6 +2220,13 @@
|
||||
out0 = in0 - in1; \
|
||||
out1 = in2 - in3; \
|
||||
}
|
||||
/* Four independent subtractions: outN = in(2N) - in(2N + 1).
   Every operand is parenthesized so that an expression argument such as
   'a + b' is subtracted as a whole rather than being split by operator
   precedence.
*/
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    (out0) = (in0) - (in1);                                                   \
    (out1) = (in2) - (in3);                                                   \
    (out2) = (in4) - (in5);                                                   \
    (out3) = (in6) - (in7);                                                   \
}
|
||||
|
||||
/* Description : Sign extend halfword elements from right half of the vector
|
||||
Arguments : Inputs - in (input halfword vector)
|
||||
|
Loading…
Reference in New Issue
Block a user