mirror of https://github.com/FFmpeg/FFmpeg.git (synced 2025-03-03 14:32:16 +02:00)

commit e1b6ecd20a (parent 90fba27743)

avcodec/la: add LSX optimization for h264 idct.

loongson_asm.S is the LoongArch asm optimization helper.

Add functions:
    ff_h264_idct_add_8_lsx
    ff_h264_idct8_add_8_lsx
    ff_h264_idct_dc_add_8_lsx
    ff_h264_idct8_dc_add_8_lsx
    ff_h264_idct_add16_8_lsx
    ff_h264_idct8_add4_8_lsx
    ff_h264_idct_add8_8_lsx
    ff_h264_idct_add8_422_8_lsx
    ff_h264_idct_add16_intra_8_lsx
    ff_h264_luma_dc_dequant_idct_8_lsx

Replaced functions (LSX is sufficient for these):
    ff_h264_idct_add_lasx
    ff_h264_idct4x4_addblk_dc_lasx
    ff_h264_idct_add16_lasx
    ff_h264_idct8_add4_lasx
    ff_h264_idct_add8_lasx
    ff_h264_idct_add8_422_lasx
    ff_h264_idct_add16_intra_lasx
    ff_h264_deq_idct_luma_dc_lasx

Renamed functions:
    ff_h264_idct8_addblk_lasx    ==> ff_h264_idct8_add_8_lasx
    ff_h264_idct8_dc_addblk_lasx ==> ff_h264_idct8_dc_add_8_lasx

Benchmark (./configure --disable-lasx;
ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an):
    before: 155fps
    after:  161fps

Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
@@ -12,7 +12,6 @@ OBJS-$(CONFIG_HEVC_DECODER)           += loongarch/hevcdsp_init_loongarch.o
LASX-OBJS-$(CONFIG_H264CHROMA)        += loongarch/h264chroma_lasx.o
LASX-OBJS-$(CONFIG_H264QPEL)          += loongarch/h264qpel_lasx.o
LASX-OBJS-$(CONFIG_H264DSP)           += loongarch/h264dsp_lasx.o \
                                         loongarch/h264idct_lasx.o \
                                         loongarch/h264_deblock_lasx.o
LASX-OBJS-$(CONFIG_H264PRED)          += loongarch/h264_intrapred_lasx.o
LASX-OBJS-$(CONFIG_VC1_DECODER)       += loongarch/vc1dsp_lasx.o
@@ -31,3 +30,5 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
                                         loongarch/hevc_mc_bi_lsx.o \
                                         loongarch/hevc_mc_uni_lsx.o \
                                         loongarch/hevc_mc_uniw_lsx.o
LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
                                         loongarch/h264idct_loongarch.o
@@ -20,7 +20,7 @@
 */

#include "libavcodec/bit_depth_template.c"
#include "h264dsp_lasx.h"
#include "h264dsp_loongarch.h"
#include "libavutil/loongarch/loongson_intrinsics.h"

#define H264_LOOP_FILTER_STRENGTH_ITERATION_LASX(edges, step, mask_mv, dir, \
@@ -21,13 +21,32 @@
 */

#include "libavutil/loongarch/cpu.h"
#include "h264dsp_lasx.h"
#include "h264dsp_loongarch.h"

av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
                                       const int chroma_format_idc)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_lsx(cpu_flags)) {
        if (bit_depth == 8) {
            c->h264_idct_add = ff_h264_idct_add_8_lsx;
            c->h264_idct8_add = ff_h264_idct8_add_8_lsx;
            c->h264_idct_dc_add = ff_h264_idct_dc_add_8_lsx;
            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lsx;

            if (chroma_format_idc <= 1)
                c->h264_idct_add8 = ff_h264_idct_add8_8_lsx;
            else
                c->h264_idct_add8 = ff_h264_idct_add8_422_8_lsx;

            c->h264_idct_add16 = ff_h264_idct_add16_8_lsx;
            c->h264_idct8_add4 = ff_h264_idct8_add4_8_lsx;
            c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_8_lsx;
            c->h264_idct_add16intra = ff_h264_idct_add16_intra_8_lsx;
        }
    }
#if HAVE_LASX
    if (have_lasx(cpu_flags)) {
        if (chroma_format_idc <= 1)
            c->h264_loop_filter_strength = ff_h264_loop_filter_strength_lasx;
@@ -56,20 +75,10 @@ av_cold void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth,
            c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_8_lasx;
            c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels4_8_lasx;

            c->h264_idct_add = ff_h264_idct_add_lasx;
            c->h264_idct8_add = ff_h264_idct8_addblk_lasx;
            c->h264_idct_dc_add = ff_h264_idct4x4_addblk_dc_lasx;
            c->h264_idct8_dc_add = ff_h264_idct8_dc_addblk_lasx;
            c->h264_idct_add16 = ff_h264_idct_add16_lasx;
            c->h264_idct8_add4 = ff_h264_idct8_add4_lasx;

            if (chroma_format_idc <= 1)
                c->h264_idct_add8 = ff_h264_idct_add8_lasx;
            else
                c->h264_idct_add8 = ff_h264_idct_add8_422_lasx;

            c->h264_idct_add16intra = ff_h264_idct_add16_intra_lasx;
            c->h264_luma_dc_dequant_idct = ff_h264_deq_idct_luma_dc_lasx;
            c->h264_idct8_add = ff_h264_idct8_add_8_lasx;
            c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_lasx;
            c->h264_idct8_add4 = ff_h264_idct8_add4_8_lasx;
        }
    }
#endif // #if HAVE_LASX
}
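For context, the LoongArch initializer above is not called directly by the decoder; the generic H.264 DSP init hands off to it at startup. A minimal sketch, assuming the usual per-arch dispatch in libavcodec/h264dsp.c (guard macro and surrounding code abridged, not the literal source):

    av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
                                 const int chroma_format_idc)
    {
        /* install the plain C functions for this bit depth first ... */
    #if ARCH_LOONGARCH
        /* ... then let the LoongArch init override the pointers with LSX/LASX
         * versions when av_get_cpu_flags() reports support */
        ff_h264dsp_init_loongarch(c, bit_depth, chroma_format_idc);
    #endif
    }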
@@ -23,7 +23,7 @@
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "h264dsp_lasx.h"
#include "h264dsp_loongarch.h"

#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in, \
                         p1_or_q1_org_in, p2_or_q2_org_in, \
@@ -1,5 +1,5 @@
/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *                Xiwei Gu <guxiwei-hf@loongson.cn>
 *
@@ -20,11 +20,34 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_LOONGARCH_H264DSP_LASX_H
#define AVCODEC_LOONGARCH_H264DSP_LASX_H
#ifndef AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
#define AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H

#include "libavcodec/h264dec.h"
#include "config.h"

void ff_h264_idct_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
void ff_h264_idct8_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
void ff_h264_idct_dc_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
void ff_h264_idct8_dc_add_8_lsx(uint8_t *dst, int16_t *src, int dst_stride);
void ff_h264_luma_dc_dequant_idct_8_lsx(int16_t *_output, int16_t *_input, int qmul);
void ff_h264_idct_add16_8_lsx(uint8_t *dst, const int32_t *blk_offset,
                              int16_t *block, int32_t dst_stride,
                              const uint8_t nzc[15 * 8]);
void ff_h264_idct8_add4_8_lsx(uint8_t *dst, const int32_t *blk_offset,
                              int16_t *block, int32_t dst_stride,
                              const uint8_t nzc[15 * 8]);
void ff_h264_idct_add8_8_lsx(uint8_t **dst, const int32_t *blk_offset,
                             int16_t *block, int32_t dst_stride,
                             const uint8_t nzc[15 * 8]);
void ff_h264_idct_add8_422_8_lsx(uint8_t **dst, const int32_t *blk_offset,
                                 int16_t *block, int32_t dst_stride,
                                 const uint8_t nzc[15 * 8]);
void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
                                    int16_t *block, int32_t dst_stride,
                                    const uint8_t nzc[15 * 8]);

#if HAVE_LASX
void ff_h264_h_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
                               int alpha, int beta, int8_t *tc0);
void ff_h264_v_lpf_luma_8_lasx(uint8_t *src, ptrdiff_t stride,
@@ -65,33 +88,16 @@ void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride);

void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride);
void ff_h264_idct_add_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
void ff_h264_idct8_addblk_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
void ff_h264_idct4x4_addblk_dc_lasx(uint8_t *dst, int16_t *src,
void ff_h264_idct8_add_8_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride);
void ff_h264_idct8_dc_add_8_lasx(uint8_t *dst, int16_t *src,
                                 int32_t dst_stride);
void ff_h264_idct8_dc_addblk_lasx(uint8_t *dst, int16_t *src,
                                  int32_t dst_stride);
void ff_h264_idct_add16_lasx(uint8_t *dst, const int32_t *blk_offset,
void ff_h264_idct8_add4_8_lasx(uint8_t *dst, const int32_t *blk_offset,
                               int16_t *block, int32_t dst_stride,
                               const uint8_t nzc[15 * 8]);
void ff_h264_idct8_add4_lasx(uint8_t *dst, const int32_t *blk_offset,
                             int16_t *block, int32_t dst_stride,
                             const uint8_t nzc[15 * 8]);
void ff_h264_idct_add8_lasx(uint8_t **dst, const int32_t *blk_offset,
                            int16_t *block, int32_t dst_stride,
                            const uint8_t nzc[15 * 8]);
void ff_h264_idct_add8_422_lasx(uint8_t **dst, const int32_t *blk_offset,
                                int16_t *block, int32_t dst_stride,
                                const uint8_t nzc[15 * 8]);
void ff_h264_idct_add16_intra_lasx(uint8_t *dst, const int32_t *blk_offset,
                                   int16_t *block, int32_t dst_stride,
                                   const uint8_t nzc[15 * 8]);
void ff_h264_deq_idct_luma_dc_lasx(int16_t *dst, int16_t *src,
                                   int32_t de_qval);

void ff_h264_loop_filter_strength_lasx(int16_t bS[2][4][4], uint8_t nnz[40],
                                       int8_t ref[2][40], int16_t mv[2][40][2],
                                       int bidir, int edges, int step,
                                       int mask_mv0, int mask_mv1, int field);
#endif // #if HAVE_LASX

#endif // #ifndef AVCODEC_LOONGARCH_H264DSP_LASX_H
#endif // #ifndef AVCODEC_LOONGARCH_H264DSP_LOONGARCH_H
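As a reading aid for the LSX prototypes above, here is a hedged scalar sketch of what the 8-bit DC-only path (ff_h264_idct_dc_add_8_lsx) computes, modelled on the C template in h264idct_template.c; the helper clip_u8 and the function name are illustrative, not FFmpeg's own:

    #include <stdint.h>

    static inline uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    /* 4x4 block where only the DC coefficient is non-zero:
     * round/descale it once and add it to every pixel of the block. */
    static void idct_dc_add_8_ref(uint8_t *dst, int16_t *block, int stride)
    {
        int dc = (block[0] + 32) >> 6;
        block[0] = 0;               /* the SIMD versions also clear the coefficient */
        for (int i = 0; i < 4; i++, dst += stride)
            for (int j = 0; j < 4; j++)
                dst[j] = clip_u8(dst[j] + dc);
    }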
libavcodec/loongarch/h264idct.S (new file, 658 lines)
@@ -0,0 +1,658 @@
/*
 * Loongson LASX optimized h264idct
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "loongson_asm.S"

/*
 * #define FUNC2(a, b, c) FUNC3(a, b, c)
 * #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
 * void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
 * LSX optimization is enough for this function.
 */
function ff_h264_idct_add_8_lsx
    fld.d         f0,   a1,   0
    fld.d         f1,   a1,   8
    fld.d         f2,   a1,   16
    fld.d         f3,   a1,   24
    vxor.v        vr7,  vr7,  vr7
    add.d         t2,   a2,   a2
    add.d         t3,   t2,   a2
    vst           vr7,  a1,   0
    vst           vr7,  a1,   16

    vadd.h        vr4,  vr0,  vr2
    vsub.h        vr5,  vr0,  vr2
    vsrai.h       vr6,  vr1,  1
    vsrai.h       vr7,  vr3,  1
    vsub.h        vr6,  vr6,  vr3
    vadd.h        vr7,  vr1,  vr7
    LSX_BUTTERFLY_4_H  vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3
    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5
    vadd.h        vr4,  vr0,  vr2
    vsub.h        vr5,  vr0,  vr2
    vsrai.h       vr6,  vr1,  1
    vsrai.h       vr7,  vr3,  1
    vsub.h        vr6,  vr6,  vr3
    vadd.h        vr7,  vr1,  vr7
    LSX_BUTTERFLY_4_H  vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3

    fld.s         f4,   a0,   0
    fldx.s        f5,   a0,   a2
    fldx.s        f6,   a0,   t2
    fldx.s        f7,   a0,   t3

    vsrari.h      vr0,  vr0,  6
    vsrari.h      vr1,  vr1,  6
    vsrari.h      vr2,  vr2,  6
    vsrari.h      vr3,  vr3,  6

    vsllwil.hu.bu vr4,  vr4,  0
    vsllwil.hu.bu vr5,  vr5,  0
    vsllwil.hu.bu vr6,  vr6,  0
    vsllwil.hu.bu vr7,  vr7,  0
    vadd.h        vr0,  vr0,  vr4
    vadd.h        vr1,  vr1,  vr5
    vadd.h        vr2,  vr2,  vr6
    vadd.h        vr3,  vr3,  vr7
    vssrarni.bu.h vr1,  vr0,  0
    vssrarni.bu.h vr3,  vr2,  0

    vbsrl.v       vr0,  vr1,  8
    vbsrl.v       vr2,  vr3,  8
    fst.s         f1,   a0,   0
    fstx.s        f0,   a0,   a2
    fstx.s        f3,   a0,   t2
    fstx.s        f2,   a0,   t3
endfunc
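/*
 * Hedged scalar reference for the routine above (not the literal FFmpeg C
 * template): each pass applies the 4x4 H.264 inverse-transform butterfly
 *     z0 = a + c;  z1 = a - c;  z2 = (b >> 1) - d;  z3 = b + (d >> 1);
 *     out = { z0 + z3, z1 + z2, z1 - z2, z0 - z3 };
 * first on the rows, then (after the transpose) on the columns; the result
 * is descaled with a rounded shift by 6 (vsrari.h) and added to the four
 * 4-byte rows of dst with unsigned saturation (vssrarni.bu.h).
 */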
|
||||
|
||||
/*
|
||||
* #define FUNC2(a, b, c) FUNC3(a, b, c)
|
||||
* #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
|
||||
* void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride)
|
||||
*/
|
||||
function ff_h264_idct8_add_8_lsx
|
||||
ld.h t0, a1, 0
|
||||
add.d t2, a2, a2
|
||||
add.d t3, t2, a2
|
||||
add.d t4, t3, a2
|
||||
add.d t5, t4, a2
|
||||
add.d t6, t5, a2
|
||||
add.d t7, t6, a2
|
||||
addi.w t0, t0, 32
|
||||
st.h t0, a1, 0
|
||||
|
||||
vld vr0, a1, 0
|
||||
vld vr1, a1, 16
|
||||
vld vr2, a1, 32
|
||||
vld vr3, a1, 48
|
||||
vld vr4, a1, 64
|
||||
vld vr5, a1, 80
|
||||
vld vr6, a1, 96
|
||||
vld vr7, a1, 112
|
||||
vxor.v vr8, vr8, vr8
|
||||
vst vr8, a1, 0
|
||||
vst vr8, a1, 16
|
||||
vst vr8, a1, 32
|
||||
vst vr8, a1, 48
|
||||
vst vr8, a1, 64
|
||||
vst vr8, a1, 80
|
||||
vst vr8, a1, 96
|
||||
vst vr8, a1, 112
|
||||
|
||||
vadd.h vr18, vr0, vr4
|
||||
vsub.h vr19, vr0, vr4
|
||||
vsrai.h vr20, vr2, 1
|
||||
vsrai.h vr21, vr6, 1
|
||||
vsub.h vr20, vr20, vr6
|
||||
vadd.h vr21, vr21, vr2
|
||||
LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21, vr10, vr12, vr14, vr16
|
||||
vsrai.h vr11, vr7, 1
|
||||
vsrai.h vr13, vr3, 1
|
||||
vsrai.h vr15, vr5, 1
|
||||
vsrai.h vr17, vr1, 1
|
||||
vsub.h vr11, vr5, vr11
|
||||
vsub.h vr13, vr7, vr13
|
||||
vadd.h vr15, vr7, vr15
|
||||
vadd.h vr17, vr5, vr17
|
||||
vsub.h vr11, vr11, vr7
|
||||
vsub.h vr13, vr13, vr3
|
||||
vadd.h vr15, vr15, vr5
|
||||
vadd.h vr17, vr17, vr1
|
||||
vsub.h vr11, vr11, vr3
|
||||
vadd.h vr13, vr13, vr1
|
||||
vsub.h vr15, vr15, vr1
|
||||
vadd.h vr17, vr17, vr3
|
||||
vsrai.h vr18, vr11, 2
|
||||
vsrai.h vr19, vr13, 2
|
||||
vsrai.h vr20, vr15, 2
|
||||
vsrai.h vr21, vr17, 2
|
||||
vadd.h vr11, vr11, vr21
|
||||
vadd.h vr13, vr13, vr20
|
||||
vsub.h vr15, vr19, vr15
|
||||
vsub.h vr17, vr17, vr18
|
||||
LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \
|
||||
vr0, vr3, vr1, vr2, vr5, vr6, vr4, vr7
|
||||
|
||||
LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
|
||||
vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
|
||||
vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17
|
||||
vexth.w.h vr20, vr0
|
||||
vexth.w.h vr21, vr1
|
||||
vexth.w.h vr22, vr2
|
||||
vexth.w.h vr23, vr3
|
||||
vexth.w.h vr8, vr4
|
||||
vexth.w.h vr9, vr5
|
||||
vexth.w.h vr18, vr6
|
||||
vexth.w.h vr19, vr7
|
||||
vsllwil.w.h vr0, vr0, 0
|
||||
vsllwil.w.h vr1, vr1, 0
|
||||
vsllwil.w.h vr2, vr2, 0
|
||||
vsllwil.w.h vr3, vr3, 0
|
||||
vsllwil.w.h vr4, vr4, 0
|
||||
vsllwil.w.h vr5, vr5, 0
|
||||
vsllwil.w.h vr6, vr6, 0
|
||||
vsllwil.w.h vr7, vr7, 0
|
||||
|
||||
vadd.w vr11, vr0, vr4
|
||||
vsub.w vr13, vr0, vr4
|
||||
vsrai.w vr15, vr2, 1
|
||||
vsrai.w vr17, vr6, 1
|
||||
vsub.w vr15, vr15, vr6
|
||||
vadd.w vr17, vr17, vr2
|
||||
LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17, vr10, vr12, vr14, vr16
|
||||
vsrai.w vr11, vr7, 1
|
||||
vsrai.w vr13, vr3, 1
|
||||
vsrai.w vr15, vr5, 1
|
||||
vsrai.w vr17, vr1, 1
|
||||
vsub.w vr11, vr5, vr11
|
||||
vsub.w vr13, vr7, vr13
|
||||
vadd.w vr15, vr7, vr15
|
||||
vadd.w vr17, vr5, vr17
|
||||
vsub.w vr11, vr11, vr7
|
||||
vsub.w vr13, vr13, vr3
|
||||
vadd.w vr15, vr15, vr5
|
||||
vadd.w vr17, vr17, vr1
|
||||
vsub.w vr11, vr11, vr3
|
||||
vadd.w vr13, vr13, vr1
|
||||
vsub.w vr15, vr15, vr1
|
||||
vadd.w vr17, vr17, vr3
|
||||
vsrai.w vr0, vr11, 2
|
||||
vsrai.w vr1, vr13, 2
|
||||
vsrai.w vr2, vr15, 2
|
||||
vsrai.w vr3, vr17, 2
|
||||
vadd.w vr11, vr11, vr3
|
||||
vadd.w vr13, vr13, vr2
|
||||
vsub.w vr15, vr1, vr15
|
||||
vsub.w vr17, vr17, vr0
|
||||
LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \
|
||||
vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
|
||||
|
||||
vadd.w vr11, vr20, vr8
|
||||
vsub.w vr13, vr20, vr8
|
||||
vsrai.w vr15, vr22, 1
|
||||
vsrai.w vr17, vr18, 1
|
||||
vsub.w vr15, vr15, vr18
|
||||
vadd.w vr17, vr17, vr22
|
||||
LSX_BUTTERFLY_4_W vr11, vr13, vr15, vr17, vr10, vr12, vr14, vr16
|
||||
vsrai.w vr11, vr19, 1
|
||||
vsrai.w vr13, vr23, 1
|
||||
vsrai.w vr15, vr9, 1
|
||||
vsrai.w vr17, vr21, 1
|
||||
vsub.w vr11, vr9, vr11
|
||||
vsub.w vr13, vr19, vr13
|
||||
vadd.w vr15, vr19, vr15
|
||||
vadd.w vr17, vr9, vr17
|
||||
vsub.w vr11, vr11, vr19
|
||||
vsub.w vr13, vr13, vr23
|
||||
vadd.w vr15, vr15, vr9
|
||||
vadd.w vr17, vr17, vr21
|
||||
vsub.w vr11, vr11, vr23
|
||||
vadd.w vr13, vr13, vr21
|
||||
vsub.w vr15, vr15, vr21
|
||||
vadd.w vr17, vr17, vr23
|
||||
vsrai.w vr20, vr11, 2
|
||||
vsrai.w vr21, vr13, 2
|
||||
vsrai.w vr22, vr15, 2
|
||||
vsrai.w vr23, vr17, 2
|
||||
vadd.w vr11, vr11, vr23
|
||||
vadd.w vr13, vr13, vr22
|
||||
vsub.w vr15, vr21, vr15
|
||||
vsub.w vr17, vr17, vr20
|
||||
LSX_BUTTERFLY_8_W vr10, vr12, vr14, vr16, vr11, vr13, vr15, vr17, \
|
||||
vr20, vr21, vr22, vr23, vr8, vr9, vr18, vr19
|
||||
|
||||
vld vr10, a0, 0
|
||||
vldx vr11, a0, a2
|
||||
vldx vr12, a0, t2
|
||||
vldx vr13, a0, t3
|
||||
vldx vr14, a0, t4
|
||||
vldx vr15, a0, t5
|
||||
vldx vr16, a0, t6
|
||||
vldx vr17, a0, t7
|
||||
vsrani.h.w vr20, vr0, 6
|
||||
vsrani.h.w vr21, vr1, 6
|
||||
vsrani.h.w vr22, vr2, 6
|
||||
vsrani.h.w vr23, vr3, 6
|
||||
vsrani.h.w vr8, vr4, 6
|
||||
vsrani.h.w vr9, vr5, 6
|
||||
vsrani.h.w vr18, vr6, 6
|
||||
vsrani.h.w vr19, vr7, 6
|
||||
vsllwil.hu.bu vr10, vr10, 0
|
||||
vsllwil.hu.bu vr11, vr11, 0
|
||||
vsllwil.hu.bu vr12, vr12, 0
|
||||
vsllwil.hu.bu vr13, vr13, 0
|
||||
vsllwil.hu.bu vr14, vr14, 0
|
||||
vsllwil.hu.bu vr15, vr15, 0
|
||||
vsllwil.hu.bu vr16, vr16, 0
|
||||
vsllwil.hu.bu vr17, vr17, 0
|
||||
|
||||
vadd.h vr0, vr20, vr10
|
||||
vadd.h vr1, vr21, vr11
|
||||
vadd.h vr2, vr22, vr12
|
||||
vadd.h vr3, vr23, vr13
|
||||
vadd.h vr4, vr8, vr14
|
||||
vadd.h vr5, vr9, vr15
|
||||
vadd.h vr6, vr18, vr16
|
||||
vadd.h vr7, vr19, vr17
|
||||
vssrarni.bu.h vr1, vr0, 0
|
||||
vssrarni.bu.h vr3, vr2, 0
|
||||
vssrarni.bu.h vr5, vr4, 0
|
||||
vssrarni.bu.h vr7, vr6, 0
|
||||
vbsrl.v vr0, vr1, 8
|
||||
vbsrl.v vr2, vr3, 8
|
||||
vbsrl.v vr4, vr5, 8
|
||||
vbsrl.v vr6, vr7, 8
|
||||
fst.d f1, a0, 0
|
||||
fstx.d f0, a0, a2
|
||||
fstx.d f3, a0, t2
|
||||
fstx.d f2, a0, t3
|
||||
fstx.d f5, a0, t4
|
||||
fstx.d f4, a0, t5
|
||||
fstx.d f7, a0, t6
|
||||
fstx.d f6, a0, t7
|
||||
endfunc
|
||||
|
||||
/*
|
||||
* #define FUNC2(a, b, c) FUNC3(a, b, c)
|
||||
* #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
|
||||
* void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride)
|
||||
*/
|
||||
function ff_h264_idct8_add_8_lasx
|
||||
ld.h t0, a1, 0
|
||||
add.d t2, a2, a2
|
||||
add.d t3, t2, a2
|
||||
add.d t4, t3, a2
|
||||
add.d t5, t4, a2
|
||||
add.d t6, t5, a2
|
||||
add.d t7, t6, a2
|
||||
addi.w t0, t0, 32
|
||||
st.h t0, a1, 0
|
||||
|
||||
vld vr0, a1, 0
|
||||
vld vr1, a1, 16
|
||||
vld vr2, a1, 32
|
||||
vld vr3, a1, 48
|
||||
vld vr4, a1, 64
|
||||
vld vr5, a1, 80
|
||||
vld vr6, a1, 96
|
||||
vld vr7, a1, 112
|
||||
xvxor.v xr8, xr8, xr8
|
||||
xvst xr8, a1, 0
|
||||
xvst xr8, a1, 32
|
||||
xvst xr8, a1, 64
|
||||
xvst xr8, a1, 96
|
||||
|
||||
vadd.h vr18, vr0, vr4
|
||||
vsub.h vr19, vr0, vr4
|
||||
vsrai.h vr20, vr2, 1
|
||||
vsrai.h vr21, vr6, 1
|
||||
vsub.h vr20, vr20, vr6
|
||||
vadd.h vr21, vr21, vr2
|
||||
LSX_BUTTERFLY_4_H vr18, vr19, vr20, vr21, vr10, vr12, vr14, vr16
|
||||
vsrai.h vr11, vr7, 1
|
||||
vsrai.h vr13, vr3, 1
|
||||
vsrai.h vr15, vr5, 1
|
||||
vsrai.h vr17, vr1, 1
|
||||
vsub.h vr11, vr5, vr11
|
||||
vsub.h vr13, vr7, vr13
|
||||
vadd.h vr15, vr7, vr15
|
||||
vadd.h vr17, vr5, vr17
|
||||
vsub.h vr11, vr11, vr7
|
||||
vsub.h vr13, vr13, vr3
|
||||
vadd.h vr15, vr15, vr5
|
||||
vadd.h vr17, vr17, vr1
|
||||
vsub.h vr11, vr11, vr3
|
||||
vadd.h vr13, vr13, vr1
|
||||
vsub.h vr15, vr15, vr1
|
||||
vadd.h vr17, vr17, vr3
|
||||
vsrai.h vr18, vr11, 2
|
||||
vsrai.h vr19, vr13, 2
|
||||
vsrai.h vr20, vr15, 2
|
||||
vsrai.h vr21, vr17, 2
|
||||
vadd.h vr11, vr11, vr21
|
||||
vadd.h vr13, vr13, vr20
|
||||
vsub.h vr15, vr19, vr15
|
||||
vsub.h vr17, vr17, vr18
|
||||
LSX_BUTTERFLY_8_H vr10, vr16, vr12, vr14, vr13, vr15, vr11, vr17, \
|
||||
vr0, vr3, vr1, vr2, vr5, vr6, vr4, vr7
|
||||
|
||||
LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
|
||||
vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
|
||||
vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17
|
||||
vext2xv.w.h xr0, xr0
|
||||
vext2xv.w.h xr1, xr1
|
||||
vext2xv.w.h xr2, xr2
|
||||
vext2xv.w.h xr3, xr3
|
||||
vext2xv.w.h xr4, xr4
|
||||
vext2xv.w.h xr5, xr5
|
||||
vext2xv.w.h xr6, xr6
|
||||
vext2xv.w.h xr7, xr7
|
||||
|
||||
xvadd.w xr11, xr0, xr4
|
||||
xvsub.w xr13, xr0, xr4
|
||||
xvsrai.w xr15, xr2, 1
|
||||
xvsrai.w xr17, xr6, 1
|
||||
xvsub.w xr15, xr15, xr6
|
||||
xvadd.w xr17, xr17, xr2
|
||||
LASX_BUTTERFLY_4_W xr11, xr13, xr15, xr17, xr10, xr12, xr14, xr16
|
||||
xvsrai.w xr11, xr7, 1
|
||||
xvsrai.w xr13, xr3, 1
|
||||
xvsrai.w xr15, xr5, 1
|
||||
xvsrai.w xr17, xr1, 1
|
||||
xvsub.w xr11, xr5, xr11
|
||||
xvsub.w xr13, xr7, xr13
|
||||
xvadd.w xr15, xr7, xr15
|
||||
xvadd.w xr17, xr5, xr17
|
||||
xvsub.w xr11, xr11, xr7
|
||||
xvsub.w xr13, xr13, xr3
|
||||
xvadd.w xr15, xr15, xr5
|
||||
xvadd.w xr17, xr17, xr1
|
||||
xvsub.w xr11, xr11, xr3
|
||||
xvadd.w xr13, xr13, xr1
|
||||
xvsub.w xr15, xr15, xr1
|
||||
xvadd.w xr17, xr17, xr3
|
||||
xvsrai.w xr0, xr11, 2
|
||||
xvsrai.w xr1, xr13, 2
|
||||
xvsrai.w xr2, xr15, 2
|
||||
xvsrai.w xr3, xr17, 2
|
||||
xvadd.w xr11, xr11, xr3
|
||||
xvadd.w xr13, xr13, xr2
|
||||
xvsub.w xr15, xr1, xr15
|
||||
xvsub.w xr17, xr17, xr0
|
||||
LASX_BUTTERFLY_8_W xr10, xr12, xr14, xr16, xr11, xr13, xr15, xr17, \
|
||||
xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7
|
||||
|
||||
vld vr10, a0, 0
|
||||
vldx vr11, a0, a2
|
||||
vldx vr12, a0, t2
|
||||
vldx vr13, a0, t3
|
||||
vldx vr14, a0, t4
|
||||
vldx vr15, a0, t5
|
||||
vldx vr16, a0, t6
|
||||
vldx vr17, a0, t7
|
||||
xvldi xr8, 0x806 //"xvldi.w xr8 6"
|
||||
xvsran.h.w xr0, xr0, xr8
|
||||
xvsran.h.w xr1, xr1, xr8
|
||||
xvsran.h.w xr2, xr2, xr8
|
||||
xvsran.h.w xr3, xr3, xr8
|
||||
xvsran.h.w xr4, xr4, xr8
|
||||
xvsran.h.w xr5, xr5, xr8
|
||||
xvsran.h.w xr6, xr6, xr8
|
||||
xvsran.h.w xr7, xr7, xr8
|
||||
xvpermi.d xr0, xr0, 0x08
|
||||
xvpermi.d xr1, xr1, 0x08
|
||||
xvpermi.d xr2, xr2, 0x08
|
||||
xvpermi.d xr3, xr3, 0x08
|
||||
xvpermi.d xr4, xr4, 0x08
|
||||
xvpermi.d xr5, xr5, 0x08
|
||||
xvpermi.d xr6, xr6, 0x08
|
||||
xvpermi.d xr7, xr7, 0x08
|
||||
|
||||
vsllwil.hu.bu vr10, vr10, 0
|
||||
vsllwil.hu.bu vr11, vr11, 0
|
||||
vsllwil.hu.bu vr12, vr12, 0
|
||||
vsllwil.hu.bu vr13, vr13, 0
|
||||
vsllwil.hu.bu vr14, vr14, 0
|
||||
vsllwil.hu.bu vr15, vr15, 0
|
||||
vsllwil.hu.bu vr16, vr16, 0
|
||||
vsllwil.hu.bu vr17, vr17, 0
|
||||
|
||||
vadd.h vr0, vr0, vr10
|
||||
vadd.h vr1, vr1, vr11
|
||||
vadd.h vr2, vr2, vr12
|
||||
vadd.h vr3, vr3, vr13
|
||||
vadd.h vr4, vr4, vr14
|
||||
vadd.h vr5, vr5, vr15
|
||||
vadd.h vr6, vr6, vr16
|
||||
vadd.h vr7, vr7, vr17
|
||||
vssrarni.bu.h vr1, vr0, 0
|
||||
vssrarni.bu.h vr3, vr2, 0
|
||||
vssrarni.bu.h vr5, vr4, 0
|
||||
vssrarni.bu.h vr7, vr6, 0
|
||||
vbsrl.v vr0, vr1, 8
|
||||
vbsrl.v vr2, vr3, 8
|
||||
vbsrl.v vr4, vr5, 8
|
||||
vbsrl.v vr6, vr7, 8
|
||||
fst.d f1, a0, 0
|
||||
fstx.d f0, a0, a2
|
||||
fstx.d f3, a0, t2
|
||||
fstx.d f2, a0, t3
|
||||
fstx.d f5, a0, t4
|
||||
fstx.d f4, a0, t5
|
||||
fstx.d f7, a0, t6
|
||||
fstx.d f6, a0, t7
|
||||
endfunc
|
||||
|
||||
/*
|
||||
* #define FUNC2(a, b, c) FUNC3(a, b, c)
|
||||
* #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
|
||||
* void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *_block, int stride)
|
||||
* LSX optimization is enough for this function.
|
||||
*/
|
||||
function ff_h264_idct_dc_add_8_lsx
|
||||
vldrepl.h vr4, a1, 0
|
||||
add.d t2, a2, a2
|
||||
add.d t3, t2, a2
|
||||
fld.s f0, a0, 0
|
||||
fldx.s f1, a0, a2
|
||||
fldx.s f2, a0, t2
|
||||
fldx.s f3, a0, t3
|
||||
st.h zero, a1, 0
|
||||
|
||||
vsrari.h vr4, vr4, 6
|
||||
vilvl.w vr0, vr1, vr0
|
||||
vilvl.w vr1, vr3, vr2
|
||||
vsllwil.hu.bu vr0, vr0, 0
|
||||
vsllwil.hu.bu vr1, vr1, 0
|
||||
vadd.h vr0, vr0, vr4
|
||||
vadd.h vr1, vr1, vr4
|
||||
vssrarni.bu.h vr1, vr0, 0
|
||||
|
||||
vbsrl.v vr2, vr1, 4
|
||||
vbsrl.v vr3, vr1, 8
|
||||
vbsrl.v vr4, vr1, 12
|
||||
fst.s f1, a0, 0
|
||||
fstx.s f2, a0, a2
|
||||
fstx.s f3, a0, t2
|
||||
fstx.s f4, a0, t3
|
||||
endfunc
|
||||
|
||||
/*
|
||||
* #define FUNC2(a, b, c) FUNC3(a, b, c)
|
||||
* #define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
|
||||
* void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *_block, int stride)
|
||||
*/
|
||||
function ff_h264_idct8_dc_add_8_lsx
|
||||
vldrepl.h vr8, a1, 0
|
||||
add.d t2, a2, a2
|
||||
add.d t3, t2, a2
|
||||
add.d t4, t3, a2
|
||||
add.d t5, t4, a2
|
||||
add.d t6, t5, a2
|
||||
add.d t7, t6, a2
|
||||
|
||||
fld.d f0, a0, 0
|
||||
fldx.d f1, a0, a2
|
||||
fldx.d f2, a0, t2
|
||||
fldx.d f3, a0, t3
|
||||
fldx.d f4, a0, t4
|
||||
fldx.d f5, a0, t5
|
||||
fldx.d f6, a0, t6
|
||||
fldx.d f7, a0, t7
|
||||
st.h zero, a1, 0
|
||||
|
||||
vsrari.h vr8, vr8, 6
|
||||
vsllwil.hu.bu vr0, vr0, 0
|
||||
vsllwil.hu.bu vr1, vr1, 0
|
||||
vsllwil.hu.bu vr2, vr2, 0
|
||||
vsllwil.hu.bu vr3, vr3, 0
|
||||
vsllwil.hu.bu vr4, vr4, 0
|
||||
vsllwil.hu.bu vr5, vr5, 0
|
||||
vsllwil.hu.bu vr6, vr6, 0
|
||||
vsllwil.hu.bu vr7, vr7, 0
|
||||
vadd.h vr0, vr0, vr8
|
||||
vadd.h vr1, vr1, vr8
|
||||
vadd.h vr2, vr2, vr8
|
||||
vadd.h vr3, vr3, vr8
|
||||
vadd.h vr4, vr4, vr8
|
||||
vadd.h vr5, vr5, vr8
|
||||
vadd.h vr6, vr6, vr8
|
||||
vadd.h vr7, vr7, vr8
|
||||
vssrarni.bu.h vr1, vr0, 0
|
||||
vssrarni.bu.h vr3, vr2, 0
|
||||
vssrarni.bu.h vr5, vr4, 0
|
||||
vssrarni.bu.h vr7, vr6, 0
|
||||
|
||||
vbsrl.v vr0, vr1, 8
|
||||
vbsrl.v vr2, vr3, 8
|
||||
vbsrl.v vr4, vr5, 8
|
||||
vbsrl.v vr6, vr7, 8
|
||||
fst.d f1, a0, 0
|
||||
fstx.d f0, a0, a2
|
||||
fstx.d f3, a0, t2
|
||||
fstx.d f2, a0, t3
|
||||
fstx.d f5, a0, t4
|
||||
fstx.d f4, a0, t5
|
||||
fstx.d f7, a0, t6
|
||||
fstx.d f6, a0, t7
|
||||
endfunc
|
||||
function ff_h264_idct8_dc_add_8_lasx
|
||||
xvldrepl.h xr8, a1, 0
|
||||
add.d t2, a2, a2
|
||||
add.d t3, t2, a2
|
||||
add.d t4, t3, a2
|
||||
add.d t5, t4, a2
|
||||
add.d t6, t5, a2
|
||||
add.d t7, t6, a2
|
||||
|
||||
fld.d f0, a0, 0
|
||||
fldx.d f1, a0, a2
|
||||
fldx.d f2, a0, t2
|
||||
fldx.d f3, a0, t3
|
||||
fldx.d f4, a0, t4
|
||||
fldx.d f5, a0, t5
|
||||
fldx.d f6, a0, t6
|
||||
fldx.d f7, a0, t7
|
||||
st.h zero, a1, 0
|
||||
|
||||
xvsrari.h xr8, xr8, 6
|
||||
xvpermi.q xr1, xr0, 0x20
|
||||
xvpermi.q xr3, xr2, 0x20
|
||||
xvpermi.q xr5, xr4, 0x20
|
||||
xvpermi.q xr7, xr6, 0x20
|
||||
xvsllwil.hu.bu xr1, xr1, 0
|
||||
xvsllwil.hu.bu xr3, xr3, 0
|
||||
xvsllwil.hu.bu xr5, xr5, 0
|
||||
xvsllwil.hu.bu xr7, xr7, 0
|
||||
xvadd.h xr1, xr1, xr8
|
||||
xvadd.h xr3, xr3, xr8
|
||||
xvadd.h xr5, xr5, xr8
|
||||
xvadd.h xr7, xr7, xr8
|
||||
|
||||
xvssrarni.bu.h xr3, xr1, 0
|
||||
xvssrarni.bu.h xr7, xr5, 0
|
||||
|
||||
xvpermi.q xr1, xr3, 0x11
|
||||
xvpermi.q xr5, xr7, 0x11
|
||||
xvbsrl.v xr0, xr1, 8
|
||||
xvbsrl.v xr2, xr3, 8
|
||||
xvbsrl.v xr4, xr5, 8
|
||||
xvbsrl.v xr6, xr7, 8
|
||||
|
||||
fst.d f3, a0, 0
|
||||
fstx.d f1, a0, a2
|
||||
fstx.d f2, a0, t2
|
||||
fstx.d f0, a0, t3
|
||||
fstx.d f7, a0, t4
|
||||
fstx.d f5, a0, t5
|
||||
fstx.d f6, a0, t6
|
||||
fstx.d f4, a0, t7
|
||||
endfunc
|
||||
|
||||
/**
|
||||
* IDCT transforms the 16 dc values and dequantizes them.
|
||||
* @param qmul quantization parameter
|
||||
* void FUNCC(ff_h264_luma_dc_dequant_idct)(int16_t *_output, int16_t *_input, int qmul){
|
||||
* LSX optimization is enough for this function.
|
||||
*/
|
||||
function ff_h264_luma_dc_dequant_idct_8_lsx
|
||||
vld vr0, a1, 0
|
||||
vld vr1, a1, 8
|
||||
vld vr2, a1, 16
|
||||
vld vr3, a1, 24
|
||||
vreplgr2vr.w vr8, a2
|
||||
LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr9, vr10
|
||||
LSX_BUTTERFLY_4_H vr4, vr6, vr7, vr5, vr0, vr3, vr2, vr1
|
||||
LSX_BUTTERFLY_4_H vr0, vr1, vr2, vr3, vr4, vr7, vr6, vr5
|
||||
LSX_TRANSPOSE4x4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3, vr9, vr10
|
||||
LSX_BUTTERFLY_4_H vr0, vr1, vr3, vr2, vr4, vr7, vr6, vr5
|
||||
LSX_BUTTERFLY_4_H vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3
|
||||
vsllwil.w.h vr0, vr0, 0
|
||||
vsllwil.w.h vr1, vr1, 0
|
||||
vsllwil.w.h vr2, vr2, 0
|
||||
vsllwil.w.h vr3, vr3, 0
|
||||
vmul.w vr0, vr0, vr8
|
||||
vmul.w vr1, vr1, vr8
|
||||
vmul.w vr2, vr2, vr8
|
||||
vmul.w vr3, vr3, vr8
|
||||
vsrarni.h.w vr1, vr0, 8
|
||||
vsrarni.h.w vr3, vr2, 8
|
||||
|
||||
vstelm.h vr1, a0, 0, 0
|
||||
vstelm.h vr1, a0, 32, 4
|
||||
vstelm.h vr1, a0, 64, 1
|
||||
vstelm.h vr1, a0, 96, 5
|
||||
vstelm.h vr3, a0, 128, 0
|
||||
vstelm.h vr3, a0, 160, 4
|
||||
vstelm.h vr3, a0, 192, 1
|
||||
vstelm.h vr3, a0, 224, 5
|
||||
addi.d a0, a0, 256
|
||||
vstelm.h vr1, a0, 0, 2
|
||||
vstelm.h vr1, a0, 32, 6
|
||||
vstelm.h vr1, a0, 64, 3
|
||||
vstelm.h vr1, a0, 96, 7
|
||||
vstelm.h vr3, a0, 128, 2
|
||||
vstelm.h vr3, a0, 160, 6
|
||||
vstelm.h vr3, a0, 192, 3
|
||||
vstelm.h vr3, a0, 224, 7
|
||||
endfunc
|
@@ -1,498 +0,0 @@
|
||||
/*
|
||||
* Loongson LASX optimized h264dsp
|
||||
*
|
||||
* Copyright (c) 2021 Loongson Technology Corporation Limited
|
||||
* Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
|
||||
* Xiwei Gu <guxiwei-hf@loongson.cn>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/loongarch/loongson_intrinsics.h"
|
||||
#include "h264dsp_lasx.h"
|
||||
#include "libavcodec/bit_depth_template.c"
|
||||
|
||||
#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3) \
|
||||
{ \
|
||||
__m256i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
|
||||
\
|
||||
tmp0_m = __lasx_xvadd_h(in0, in2); \
|
||||
tmp1_m = __lasx_xvsub_h(in0, in2); \
|
||||
tmp2_m = __lasx_xvsrai_h(in1, 1); \
|
||||
tmp2_m = __lasx_xvsub_h(tmp2_m, in3); \
|
||||
tmp3_m = __lasx_xvsrai_h(in3, 1); \
|
||||
tmp3_m = __lasx_xvadd_h(in1, tmp3_m); \
|
||||
\
|
||||
LASX_BUTTERFLY_4_H(tmp0_m, tmp1_m, tmp2_m, tmp3_m, \
|
||||
out0, out1, out2, out3); \
|
||||
}
|
||||
|
||||
void ff_h264_idct_add_lasx(uint8_t *dst, int16_t *src, int32_t dst_stride)
|
||||
{
|
||||
__m256i src0_m, src1_m, src2_m, src3_m;
|
||||
__m256i dst0_m, dst1_m;
|
||||
__m256i hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
|
||||
__m256i inp0_m, inp1_m, res0_m, src1, src3;
|
||||
__m256i src0 = __lasx_xvld(src, 0);
|
||||
__m256i src2 = __lasx_xvld(src, 16);
|
||||
__m256i zero = __lasx_xvldi(0);
|
||||
int32_t dst_stride_2x = dst_stride << 1;
|
||||
int32_t dst_stride_3x = dst_stride_2x + dst_stride;
|
||||
|
||||
__lasx_xvst(zero, src, 0);
|
||||
DUP2_ARG2(__lasx_xvilvh_d, src0, src0, src2, src2, src1, src3);
|
||||
AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
|
||||
LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
|
||||
AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
|
||||
DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
|
||||
dst, dst_stride_3x, src0_m, src1_m, src2_m, src3_m);
|
||||
DUP4_ARG2(__lasx_xvld, dst, 0, dst + dst_stride, 0, dst + dst_stride_2x,
|
||||
0, dst + dst_stride_3x, 0, src0_m, src1_m, src2_m, src3_m);
|
||||
DUP2_ARG2(__lasx_xvilvl_d, vres1, vres0, vres3, vres2, inp0_m, inp1_m);
|
||||
inp0_m = __lasx_xvpermi_q(inp1_m, inp0_m, 0x20);
|
||||
inp0_m = __lasx_xvsrari_h(inp0_m, 6);
|
||||
DUP2_ARG2(__lasx_xvilvl_w, src1_m, src0_m, src3_m, src2_m, dst0_m, dst1_m);
|
||||
dst0_m = __lasx_xvilvl_d(dst1_m, dst0_m);
|
||||
res0_m = __lasx_vext2xv_hu_bu(dst0_m);
|
||||
res0_m = __lasx_xvadd_h(res0_m, inp0_m);
|
||||
res0_m = __lasx_xvclip255_h(res0_m);
|
||||
dst0_m = __lasx_xvpickev_b(res0_m, res0_m);
|
||||
__lasx_xvstelm_w(dst0_m, dst, 0, 0);
|
||||
__lasx_xvstelm_w(dst0_m, dst + dst_stride, 0, 1);
|
||||
__lasx_xvstelm_w(dst0_m, dst + dst_stride_2x, 0, 4);
|
||||
__lasx_xvstelm_w(dst0_m, dst + dst_stride_3x, 0, 5);
|
||||
}
|
||||
|
||||
void ff_h264_idct8_addblk_lasx(uint8_t *dst, int16_t *src,
|
||||
int32_t dst_stride)
|
||||
{
|
||||
__m256i src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
__m256i vec0, vec1, vec2, vec3;
|
||||
__m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
__m256i res0, res1, res2, res3, res4, res5, res6, res7;
|
||||
__m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
|
||||
__m256i zero = __lasx_xvldi(0);
|
||||
int32_t dst_stride_2x = dst_stride << 1;
|
||||
int32_t dst_stride_4x = dst_stride << 2;
|
||||
int32_t dst_stride_3x = dst_stride_2x + dst_stride;
|
||||
|
||||
src[0] += 32;
|
||||
DUP4_ARG2(__lasx_xvld, src, 0, src, 16, src, 32, src, 48,
|
||||
src0, src1, src2, src3);
|
||||
DUP4_ARG2(__lasx_xvld, src, 64, src, 80, src, 96, src, 112,
|
||||
src4, src5, src6, src7);
|
||||
__lasx_xvst(zero, src, 0);
|
||||
__lasx_xvst(zero, src, 32);
|
||||
__lasx_xvst(zero, src, 64);
|
||||
__lasx_xvst(zero, src, 96);
|
||||
|
||||
vec0 = __lasx_xvadd_h(src0, src4);
|
||||
vec1 = __lasx_xvsub_h(src0, src4);
|
||||
vec2 = __lasx_xvsrai_h(src2, 1);
|
||||
vec2 = __lasx_xvsub_h(vec2, src6);
|
||||
vec3 = __lasx_xvsrai_h(src6, 1);
|
||||
vec3 = __lasx_xvadd_h(src2, vec3);
|
||||
|
||||
LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3);
|
||||
|
||||
vec0 = __lasx_xvsrai_h(src7, 1);
|
||||
vec0 = __lasx_xvsub_h(src5, vec0);
|
||||
vec0 = __lasx_xvsub_h(vec0, src3);
|
||||
vec0 = __lasx_xvsub_h(vec0, src7);
|
||||
|
||||
vec1 = __lasx_xvsrai_h(src3, 1);
|
||||
vec1 = __lasx_xvsub_h(src1, vec1);
|
||||
vec1 = __lasx_xvadd_h(vec1, src7);
|
||||
vec1 = __lasx_xvsub_h(vec1, src3);
|
||||
|
||||
vec2 = __lasx_xvsrai_h(src5, 1);
|
||||
vec2 = __lasx_xvsub_h(vec2, src1);
|
||||
vec2 = __lasx_xvadd_h(vec2, src7);
|
||||
vec2 = __lasx_xvadd_h(vec2, src5);
|
||||
|
||||
vec3 = __lasx_xvsrai_h(src1, 1);
|
||||
vec3 = __lasx_xvadd_h(src3, vec3);
|
||||
vec3 = __lasx_xvadd_h(vec3, src5);
|
||||
vec3 = __lasx_xvadd_h(vec3, src1);
|
||||
|
||||
tmp4 = __lasx_xvsrai_h(vec3, 2);
|
||||
tmp4 = __lasx_xvadd_h(tmp4, vec0);
|
||||
tmp5 = __lasx_xvsrai_h(vec2, 2);
|
||||
tmp5 = __lasx_xvadd_h(tmp5, vec1);
|
||||
tmp6 = __lasx_xvsrai_h(vec1, 2);
|
||||
tmp6 = __lasx_xvsub_h(tmp6, vec2);
|
||||
tmp7 = __lasx_xvsrai_h(vec0, 2);
|
||||
tmp7 = __lasx_xvsub_h(vec3, tmp7);
|
||||
|
||||
LASX_BUTTERFLY_8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
|
||||
res0, res1, res2, res3, res4, res5, res6, res7);
|
||||
LASX_TRANSPOSE8x8_H(res0, res1, res2, res3, res4, res5, res6, res7,
|
||||
res0, res1, res2, res3, res4, res5, res6, res7);
|
||||
|
||||
DUP4_ARG1(__lasx_vext2xv_w_h, res0, res1, res2, res3,
|
||||
tmp0, tmp1, tmp2, tmp3);
|
||||
DUP4_ARG1(__lasx_vext2xv_w_h, res4, res5, res6, res7,
|
||||
tmp4, tmp5, tmp6, tmp7);
|
||||
vec0 = __lasx_xvadd_w(tmp0, tmp4);
|
||||
vec1 = __lasx_xvsub_w(tmp0, tmp4);
|
||||
|
||||
vec2 = __lasx_xvsrai_w(tmp2, 1);
|
||||
vec2 = __lasx_xvsub_w(vec2, tmp6);
|
||||
vec3 = __lasx_xvsrai_w(tmp6, 1);
|
||||
vec3 = __lasx_xvadd_w(vec3, tmp2);
|
||||
|
||||
tmp0 = __lasx_xvadd_w(vec0, vec3);
|
||||
tmp2 = __lasx_xvadd_w(vec1, vec2);
|
||||
tmp4 = __lasx_xvsub_w(vec1, vec2);
|
||||
tmp6 = __lasx_xvsub_w(vec0, vec3);
|
||||
|
||||
vec0 = __lasx_xvsrai_w(tmp7, 1);
|
||||
vec0 = __lasx_xvsub_w(tmp5, vec0);
|
||||
vec0 = __lasx_xvsub_w(vec0, tmp3);
|
||||
vec0 = __lasx_xvsub_w(vec0, tmp7);
|
||||
|
||||
vec1 = __lasx_xvsrai_w(tmp3, 1);
|
||||
vec1 = __lasx_xvsub_w(tmp1, vec1);
|
||||
vec1 = __lasx_xvadd_w(vec1, tmp7);
|
||||
vec1 = __lasx_xvsub_w(vec1, tmp3);
|
||||
|
||||
vec2 = __lasx_xvsrai_w(tmp5, 1);
|
||||
vec2 = __lasx_xvsub_w(vec2, tmp1);
|
||||
vec2 = __lasx_xvadd_w(vec2, tmp7);
|
||||
vec2 = __lasx_xvadd_w(vec2, tmp5);
|
||||
|
||||
vec3 = __lasx_xvsrai_w(tmp1, 1);
|
||||
vec3 = __lasx_xvadd_w(tmp3, vec3);
|
||||
vec3 = __lasx_xvadd_w(vec3, tmp5);
|
||||
vec3 = __lasx_xvadd_w(vec3, tmp1);
|
||||
|
||||
tmp1 = __lasx_xvsrai_w(vec3, 2);
|
||||
tmp1 = __lasx_xvadd_w(tmp1, vec0);
|
||||
tmp3 = __lasx_xvsrai_w(vec2, 2);
|
||||
tmp3 = __lasx_xvadd_w(tmp3, vec1);
|
||||
tmp5 = __lasx_xvsrai_w(vec1, 2);
|
||||
tmp5 = __lasx_xvsub_w(tmp5, vec2);
|
||||
tmp7 = __lasx_xvsrai_w(vec0, 2);
|
||||
tmp7 = __lasx_xvsub_w(vec3, tmp7);
|
||||
|
||||
LASX_BUTTERFLY_4_W(tmp0, tmp2, tmp5, tmp7, res0, res1, res6, res7);
|
||||
LASX_BUTTERFLY_4_W(tmp4, tmp6, tmp1, tmp3, res2, res3, res4, res5);
|
||||
|
||||
DUP4_ARG2(__lasx_xvsrai_w, res0, 6, res1, 6, res2, 6, res3, 6,
|
||||
res0, res1, res2, res3);
|
||||
DUP4_ARG2(__lasx_xvsrai_w, res4, 6, res5, 6, res6, 6, res7, 6,
|
||||
res4, res5, res6, res7);
|
||||
DUP4_ARG2(__lasx_xvpickev_h, res1, res0, res3, res2, res5, res4, res7,
|
||||
res6, res0, res1, res2, res3);
|
||||
DUP4_ARG2(__lasx_xvpermi_d, res0, 0xd8, res1, 0xd8, res2, 0xd8, res3, 0xd8,
|
||||
res0, res1, res2, res3);
|
||||
|
||||
DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
|
||||
dst, dst_stride_3x, dst0, dst1, dst2, dst3);
|
||||
dst += dst_stride_4x;
|
||||
DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
|
||||
dst, dst_stride_3x, dst4, dst5, dst6, dst7);
|
||||
dst -= dst_stride_4x;
|
||||
DUP4_ARG2(__lasx_xvilvl_b, zero, dst0, zero, dst1, zero, dst2, zero, dst3,
|
||||
dst0, dst1, dst2, dst3);
|
||||
DUP4_ARG2(__lasx_xvilvl_b, zero, dst4, zero, dst5, zero, dst6, zero, dst7,
|
||||
dst4, dst5, dst6, dst7);
|
||||
DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5,
|
||||
dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3);
|
||||
res0 = __lasx_xvadd_h(res0, dst0);
|
||||
res1 = __lasx_xvadd_h(res1, dst1);
|
||||
res2 = __lasx_xvadd_h(res2, dst2);
|
||||
res3 = __lasx_xvadd_h(res3, dst3);
|
||||
DUP4_ARG1(__lasx_xvclip255_h, res0, res1, res2, res3, res0, res1,
|
||||
res2, res3);
|
||||
DUP2_ARG2(__lasx_xvpickev_b, res1, res0, res3, res2, res0, res1);
|
||||
__lasx_xvstelm_d(res0, dst, 0, 0);
|
||||
__lasx_xvstelm_d(res0, dst + dst_stride, 0, 2);
|
||||
__lasx_xvstelm_d(res0, dst + dst_stride_2x, 0, 1);
|
||||
__lasx_xvstelm_d(res0, dst + dst_stride_3x, 0, 3);
|
||||
dst += dst_stride_4x;
|
||||
__lasx_xvstelm_d(res1, dst, 0, 0);
|
||||
__lasx_xvstelm_d(res1, dst + dst_stride, 0, 2);
|
||||
__lasx_xvstelm_d(res1, dst + dst_stride_2x, 0, 1);
|
||||
__lasx_xvstelm_d(res1, dst + dst_stride_3x, 0, 3);
|
||||
}
|
||||
|
||||
void ff_h264_idct4x4_addblk_dc_lasx(uint8_t *dst, int16_t *src,
|
||||
int32_t dst_stride)
|
||||
{
|
||||
const int16_t dc = (src[0] + 32) >> 6;
|
||||
int32_t dst_stride_2x = dst_stride << 1;
|
||||
int32_t dst_stride_3x = dst_stride_2x + dst_stride;
|
||||
__m256i pred, out;
|
||||
__m256i src0, src1, src2, src3;
|
||||
__m256i input_dc = __lasx_xvreplgr2vr_h(dc);
|
||||
|
||||
src[0] = 0;
|
||||
DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
|
||||
dst, dst_stride_3x, src0, src1, src2, src3);
|
||||
DUP2_ARG2(__lasx_xvilvl_w, src1, src0, src3, src2, src0, src1);
|
||||
|
||||
pred = __lasx_xvpermi_q(src0, src1, 0x02);
|
||||
pred = __lasx_xvaddw_h_h_bu(input_dc, pred);
|
||||
pred = __lasx_xvclip255_h(pred);
|
||||
out = __lasx_xvpickev_b(pred, pred);
|
||||
__lasx_xvstelm_w(out, dst, 0, 0);
|
||||
__lasx_xvstelm_w(out, dst + dst_stride, 0, 1);
|
||||
__lasx_xvstelm_w(out, dst + dst_stride_2x, 0, 4);
|
||||
__lasx_xvstelm_w(out, dst + dst_stride_3x, 0, 5);
|
||||
}
|
||||
|
||||
void ff_h264_idct8_dc_addblk_lasx(uint8_t *dst, int16_t *src,
|
||||
int32_t dst_stride)
|
||||
{
|
||||
int32_t dc_val;
|
||||
int32_t dst_stride_2x = dst_stride << 1;
|
||||
int32_t dst_stride_4x = dst_stride << 2;
|
||||
int32_t dst_stride_3x = dst_stride_2x + dst_stride;
|
||||
__m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
|
||||
__m256i dc;
|
||||
|
||||
dc_val = (src[0] + 32) >> 6;
|
||||
dc = __lasx_xvreplgr2vr_h(dc_val);
|
||||
|
||||
src[0] = 0;
|
||||
|
||||
DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
|
||||
dst, dst_stride_3x, dst0, dst1, dst2, dst3);
|
||||
dst += dst_stride_4x;
|
||||
DUP4_ARG2(__lasx_xvldx, dst, 0, dst, dst_stride, dst, dst_stride_2x,
|
||||
dst, dst_stride_3x, dst4, dst5, dst6, dst7);
|
||||
dst -= dst_stride_4x;
|
||||
DUP4_ARG1(__lasx_vext2xv_hu_bu, dst0, dst1, dst2, dst3,
|
||||
dst0, dst1, dst2, dst3);
|
||||
DUP4_ARG1(__lasx_vext2xv_hu_bu, dst4, dst5, dst6, dst7,
|
||||
dst4, dst5, dst6, dst7);
|
||||
DUP4_ARG3(__lasx_xvpermi_q, dst1, dst0, 0x20, dst3, dst2, 0x20, dst5,
|
||||
dst4, 0x20, dst7, dst6, 0x20, dst0, dst1, dst2, dst3);
|
||||
dst0 = __lasx_xvadd_h(dst0, dc);
|
||||
dst1 = __lasx_xvadd_h(dst1, dc);
|
||||
dst2 = __lasx_xvadd_h(dst2, dc);
|
||||
dst3 = __lasx_xvadd_h(dst3, dc);
|
||||
DUP4_ARG1(__lasx_xvclip255_h, dst0, dst1, dst2, dst3,
|
||||
dst0, dst1, dst2, dst3);
|
||||
DUP2_ARG2(__lasx_xvpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
|
||||
__lasx_xvstelm_d(dst0, dst, 0, 0);
|
||||
__lasx_xvstelm_d(dst0, dst + dst_stride, 0, 2);
|
||||
__lasx_xvstelm_d(dst0, dst + dst_stride_2x, 0, 1);
|
||||
__lasx_xvstelm_d(dst0, dst + dst_stride_3x, 0, 3);
|
||||
dst += dst_stride_4x;
|
||||
__lasx_xvstelm_d(dst1, dst, 0, 0);
|
||||
__lasx_xvstelm_d(dst1, dst + dst_stride, 0, 2);
|
||||
__lasx_xvstelm_d(dst1, dst + dst_stride_2x, 0, 1);
|
||||
__lasx_xvstelm_d(dst1, dst + dst_stride_3x, 0, 3);
|
||||
}
|
||||
|
||||
void ff_h264_idct_add16_lasx(uint8_t *dst,
|
||||
const int32_t *blk_offset,
|
||||
int16_t *block, int32_t dst_stride,
|
||||
const uint8_t nzc[15 * 8])
|
||||
{
|
||||
int32_t i;
|
||||
|
||||
for (i = 0; i < 16; i++) {
|
||||
int32_t nnz = nzc[scan8[i]];
|
||||
|
||||
if (nnz) {
|
||||
if (nnz == 1 && ((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
else
|
||||
ff_h264_idct_add_lasx(dst + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ff_h264_idct8_add4_lasx(uint8_t *dst, const int32_t *blk_offset,
|
||||
int16_t *block, int32_t dst_stride,
|
||||
const uint8_t nzc[15 * 8])
|
||||
{
|
||||
int32_t cnt;
|
||||
|
||||
for (cnt = 0; cnt < 16; cnt += 4) {
|
||||
int32_t nnz = nzc[scan8[cnt]];
|
||||
|
||||
if (nnz) {
|
||||
if (nnz == 1 && ((dctcoef *) block)[cnt * 16])
|
||||
ff_h264_idct8_dc_addblk_lasx(dst + blk_offset[cnt],
|
||||
block + cnt * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
else
|
||||
ff_h264_idct8_addblk_lasx(dst + blk_offset[cnt],
|
||||
block + cnt * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ff_h264_idct_add8_lasx(uint8_t **dst,
|
||||
const int32_t *blk_offset,
|
||||
int16_t *block, int32_t dst_stride,
|
||||
const uint8_t nzc[15 * 8])
|
||||
{
|
||||
int32_t i;
|
||||
|
||||
for (i = 16; i < 20; i++) {
|
||||
if (nzc[scan8[i]])
|
||||
ff_h264_idct_add_lasx(dst[0] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
else if (((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
for (i = 32; i < 36; i++) {
|
||||
if (nzc[scan8[i]])
|
||||
ff_h264_idct_add_lasx(dst[1] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
else if (((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
}
|
||||
|
||||
void ff_h264_idct_add8_422_lasx(uint8_t **dst,
|
||||
const int32_t *blk_offset,
|
||||
int16_t *block, int32_t dst_stride,
|
||||
const uint8_t nzc[15 * 8])
|
||||
{
|
||||
int32_t i;
|
||||
|
||||
for (i = 16; i < 20; i++) {
|
||||
if (nzc[scan8[i]])
|
||||
ff_h264_idct_add_lasx(dst[0] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
else if (((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
for (i = 32; i < 36; i++) {
|
||||
if (nzc[scan8[i]])
|
||||
ff_h264_idct_add_lasx(dst[1] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
else if (((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
for (i = 20; i < 24; i++) {
|
||||
if (nzc[scan8[i + 4]])
|
||||
ff_h264_idct_add_lasx(dst[0] + blk_offset[i + 4],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
else if (((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct4x4_addblk_dc_lasx(dst[0] + blk_offset[i + 4],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
for (i = 36; i < 40; i++) {
|
||||
if (nzc[scan8[i + 4]])
|
||||
ff_h264_idct_add_lasx(dst[1] + blk_offset[i + 4],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
else if (((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct4x4_addblk_dc_lasx(dst[1] + blk_offset[i + 4],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
}
|
||||
|
||||
void ff_h264_idct_add16_intra_lasx(uint8_t *dst,
|
||||
const int32_t *blk_offset,
|
||||
int16_t *block,
|
||||
int32_t dst_stride,
|
||||
const uint8_t nzc[15 * 8])
|
||||
{
|
||||
int32_t i;
|
||||
|
||||
for (i = 0; i < 16; i++) {
|
||||
if (nzc[scan8[i]])
|
||||
ff_h264_idct_add_lasx(dst + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel), dst_stride);
|
||||
else if (((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct4x4_addblk_dc_lasx(dst + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
}
|
||||
|
||||
void ff_h264_deq_idct_luma_dc_lasx(int16_t *dst, int16_t *src,
|
||||
int32_t de_qval)
|
||||
{
|
||||
#define DC_DEST_STRIDE 16
|
||||
|
||||
__m256i src0, src1, src2, src3;
|
||||
__m256i vec0, vec1, vec2, vec3;
|
||||
__m256i tmp0, tmp1, tmp2, tmp3;
|
||||
__m256i hres0, hres1, hres2, hres3;
|
||||
__m256i vres0, vres1, vres2, vres3;
|
||||
__m256i de_q_vec = __lasx_xvreplgr2vr_w(de_qval);
|
||||
|
||||
DUP4_ARG2(__lasx_xvld, src, 0, src, 8, src, 16, src, 24,
|
||||
src0, src1, src2, src3);
|
||||
LASX_TRANSPOSE4x4_H(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
|
||||
LASX_BUTTERFLY_4_H(tmp0, tmp2, tmp3, tmp1, vec0, vec3, vec2, vec1);
|
||||
LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
|
||||
LASX_TRANSPOSE4x4_H(hres0, hres1, hres2, hres3,
|
||||
hres0, hres1, hres2, hres3);
|
||||
LASX_BUTTERFLY_4_H(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
|
||||
LASX_BUTTERFLY_4_H(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3);
|
||||
DUP4_ARG1(__lasx_vext2xv_w_h, vres0, vres1, vres2, vres3,
|
||||
vres0, vres1, vres2, vres3);
|
||||
DUP2_ARG3(__lasx_xvpermi_q, vres1, vres0, 0x20, vres3, vres2, 0x20,
|
||||
vres0, vres1);
|
||||
|
||||
vres0 = __lasx_xvmul_w(vres0, de_q_vec);
|
||||
vres1 = __lasx_xvmul_w(vres1, de_q_vec);
|
||||
|
||||
vres0 = __lasx_xvsrari_w(vres0, 8);
|
||||
vres1 = __lasx_xvsrari_w(vres1, 8);
|
||||
vec0 = __lasx_xvpickev_h(vres1, vres0);
|
||||
vec0 = __lasx_xvpermi_d(vec0, 0xd8);
|
||||
__lasx_xvstelm_h(vec0, dst + 0 * DC_DEST_STRIDE, 0, 0);
|
||||
__lasx_xvstelm_h(vec0, dst + 2 * DC_DEST_STRIDE, 0, 1);
|
||||
__lasx_xvstelm_h(vec0, dst + 8 * DC_DEST_STRIDE, 0, 2);
|
||||
__lasx_xvstelm_h(vec0, dst + 10 * DC_DEST_STRIDE, 0, 3);
|
||||
__lasx_xvstelm_h(vec0, dst + 1 * DC_DEST_STRIDE, 0, 4);
|
||||
__lasx_xvstelm_h(vec0, dst + 3 * DC_DEST_STRIDE, 0, 5);
|
||||
__lasx_xvstelm_h(vec0, dst + 9 * DC_DEST_STRIDE, 0, 6);
|
||||
__lasx_xvstelm_h(vec0, dst + 11 * DC_DEST_STRIDE, 0, 7);
|
||||
__lasx_xvstelm_h(vec0, dst + 4 * DC_DEST_STRIDE, 0, 8);
|
||||
__lasx_xvstelm_h(vec0, dst + 6 * DC_DEST_STRIDE, 0, 9);
|
||||
__lasx_xvstelm_h(vec0, dst + 12 * DC_DEST_STRIDE, 0, 10);
|
||||
__lasx_xvstelm_h(vec0, dst + 14 * DC_DEST_STRIDE, 0, 11);
|
||||
__lasx_xvstelm_h(vec0, dst + 5 * DC_DEST_STRIDE, 0, 12);
|
||||
__lasx_xvstelm_h(vec0, dst + 7 * DC_DEST_STRIDE, 0, 13);
|
||||
__lasx_xvstelm_h(vec0, dst + 13 * DC_DEST_STRIDE, 0, 14);
|
||||
__lasx_xvstelm_h(vec0, dst + 15 * DC_DEST_STRIDE, 0, 15);
|
||||
|
||||
#undef DC_DEST_STRIDE
|
||||
}
|
libavcodec/loongarch/h264idct_loongarch.c (new file, 184 lines)
@@ -0,0 +1,184 @@
|
||||
/*
|
||||
* Loongson LSX/LASX optimized h264idct
|
||||
*
|
||||
* Copyright (c) 2023 Loongson Technology Corporation Limited
|
||||
* Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
|
||||
* Xiwei Gu <guxiwei-hf@loongson.cn>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "h264dsp_loongarch.h"
|
||||
#include "libavcodec/bit_depth_template.c"
|
||||
|
||||
void ff_h264_idct_add16_8_lsx(uint8_t *dst, const int32_t *blk_offset,
|
||||
int16_t *block, int32_t dst_stride,
|
||||
const uint8_t nzc[15 * 8])
|
||||
{
|
||||
int32_t i;
|
||||
|
||||
for (i = 0; i < 16; i++) {
|
||||
int32_t nnz = nzc[scan8[i]];
|
||||
|
||||
if (nnz == 1 && ((dctcoef *) block)[i * 16]) {
|
||||
ff_h264_idct_dc_add_8_lsx(dst + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
} else if (nnz) {
|
||||
ff_h264_idct_add_8_lsx(dst + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ff_h264_idct8_add4_8_lsx(uint8_t *dst, const int32_t *blk_offset,
|
||||
int16_t *block, int32_t dst_stride,
|
||||
const uint8_t nzc[15 * 8])
|
||||
{
|
||||
int32_t cnt;
|
||||
|
||||
for (cnt = 0; cnt < 16; cnt += 4) {
|
||||
int32_t nnz = nzc[scan8[cnt]];
|
||||
|
||||
if (nnz == 1 && ((dctcoef *) block)[cnt * 16]) {
|
||||
ff_h264_idct8_dc_add_8_lsx(dst + blk_offset[cnt],
|
||||
block + cnt * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
} else if (nnz) {
|
||||
ff_h264_idct8_add_8_lsx(dst + blk_offset[cnt],
|
||||
block + cnt * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if HAVE_LASX
|
||||
void ff_h264_idct8_add4_8_lasx(uint8_t *dst, const int32_t *blk_offset,
|
||||
int16_t *block, int32_t dst_stride,
|
||||
const uint8_t nzc[15 * 8])
|
||||
{
|
||||
int32_t cnt;
|
||||
|
||||
for (cnt = 0; cnt < 16; cnt += 4) {
|
||||
int32_t nnz = nzc[scan8[cnt]];
|
||||
|
||||
if (nnz == 1 && ((dctcoef *) block)[cnt * 16]) {
|
||||
ff_h264_idct8_dc_add_8_lasx(dst + blk_offset[cnt],
|
||||
block + cnt * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
} else if (nnz) {
|
||||
ff_h264_idct8_add_8_lasx(dst + blk_offset[cnt],
|
||||
block + cnt * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // #if HAVE_LASX
|
||||
|
||||
void ff_h264_idct_add8_8_lsx(uint8_t **dst, const int32_t *blk_offset,
|
||||
int16_t *block, int32_t dst_stride,
|
||||
const uint8_t nzc[15 * 8])
|
||||
{
|
||||
int32_t i;
|
||||
|
||||
for (i = 16; i < 20; i++) {
|
||||
if (nzc[scan8[i]])
|
||||
ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
else if (((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
for (i = 32; i < 36; i++) {
|
||||
if (nzc[scan8[i]])
|
||||
ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
else if (((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
}
|
||||
|
||||
void ff_h264_idct_add8_422_8_lsx(uint8_t **dst, const int32_t *blk_offset,
|
||||
int16_t *block, int32_t dst_stride,
|
||||
const uint8_t nzc[15 * 8])
|
||||
{
|
||||
int32_t i;
|
||||
|
||||
for (i = 16; i < 20; i++) {
|
||||
if (nzc[scan8[i]])
|
||||
ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
else if (((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
for (i = 20; i < 24; i++) {
|
||||
if (nzc[scan8[i + 4]])
|
||||
ff_h264_idct_add_8_lsx(dst[0] + blk_offset[i + 4],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
else if (((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct_dc_add_8_lsx(dst[0] + blk_offset[i + 4],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
for (i = 32; i < 36; i++) {
|
||||
if (nzc[scan8[i]])
|
||||
ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
else if (((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
for (i = 36; i < 40; i++) {
|
||||
if (nzc[scan8[i + 4]])
|
||||
ff_h264_idct_add_8_lsx(dst[1] + blk_offset[i + 4],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
else if (((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct_dc_add_8_lsx(dst[1] + blk_offset[i + 4],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
}
|
||||
|
||||
void ff_h264_idct_add16_intra_8_lsx(uint8_t *dst, const int32_t *blk_offset,
|
||||
int16_t *block, int32_t dst_stride,
|
||||
const uint8_t nzc[15 * 8])
|
||||
{
|
||||
int32_t i;
|
||||
|
||||
for (i = 0; i < 16; i++) {
|
||||
if (nzc[scan8[i]])
|
||||
ff_h264_idct_add_8_lsx(dst + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel), dst_stride);
|
||||
else if (((dctcoef *) block)[i * 16])
|
||||
ff_h264_idct_dc_add_8_lsx(dst + blk_offset[i],
|
||||
block + i * 16 * sizeof(pixel),
|
||||
dst_stride);
|
||||
}
|
||||
}
|
libavcodec/loongarch/loongson_asm.S (new file, 945 lines)
@@ -0,0 +1,945 @@
|
||||
/*
|
||||
* Loongson asm helper.
|
||||
*
|
||||
* Copyright (c) 2022 Loongson Technology Corporation Limited
|
||||
* Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
|
||||
* Shiyou Yin(yinshiyou-hf@loongson.cn)
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* MAJOR version: Macro usage changes.
|
||||
* MINOR version: Add new functions, or bug fixes.
|
||||
* MICRO version: Comment changes or implementation changes.
|
||||
*/
|
||||
#define LML_VERSION_MAJOR 0
|
||||
#define LML_VERSION_MINOR 2
|
||||
#define LML_VERSION_MICRO 0
|
||||
|
||||
/*
|
||||
*============================================================================
|
||||
 * macros for specific project, set them as needed.
 * The following LoongML macros are for reference.
|
||||
*============================================================================
|
||||
*/
|
||||
#define ASM_PREF
|
||||
#define DEFAULT_ALIGN 5
|
||||
|
||||
.macro function name, align=DEFAULT_ALIGN
|
||||
.macro endfunc
|
||||
jirl $r0, $r1, 0x0
|
||||
.size ASM_PREF\name, . - ASM_PREF\name
|
||||
.purgem endfunc
|
||||
.endm
|
||||
.text ;
|
||||
.align \align ;
|
||||
.globl ASM_PREF\name ;
|
||||
.type ASM_PREF\name, @function ;
|
||||
ASM_PREF\name: ;
|
||||
.endm
|
||||
|
||||
/**
|
||||
* Attention: If align is not zero, the macro will use
|
||||
* t7 until the end of function
|
||||
*/
|
||||
.macro alloc_stack size, align=0
|
||||
.if \align
|
||||
.macro clean_stack
|
||||
add.d sp, sp, t7
|
||||
.endm
|
||||
addi.d sp, sp, - \size
|
||||
andi.d t7, sp, \align - 1
|
||||
sub.d sp, sp, t7
|
||||
addi.d t7, t7, \size
|
||||
.else
|
||||
.macro clean_stack
|
||||
addi.d sp, sp, \size
|
||||
.endm
|
||||
addi.d sp, sp, - \size
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro const name, align=DEFAULT_ALIGN
|
||||
.macro endconst
|
||||
.size \name, . - \name
|
||||
.purgem endconst
|
||||
.endm
|
||||
.section .rodata
|
||||
.align \align
|
||||
\name:
|
||||
.endm
|
||||
|
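/*
 * Usage sketch (illustrative only, not part of the committed helper; the
 * routine name is hypothetical): a leaf routine written with the macros
 * above would look like
 *
 *     function ff_example_lsx
 *         vld    vr0, a0, 0
 *         vst    vr0, a1, 0
 *     endfunc
 *
 * "function" emits the .text/.align/.globl/.type boilerplate and the entry
 * label; "endfunc" emits the return (jirl $r0, $r1, 0x0) and the matching
 * .size directive, then removes itself with .purgem.
 */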
/*
 *============================================================================
 * LoongArch register alias
 *============================================================================
 */

#define a0 $a0
#define a1 $a1
#define a2 $a2
#define a3 $a3
#define a4 $a4
#define a5 $a5
#define a6 $a6
#define a7 $a7

#define t0 $t0
#define t1 $t1
#define t2 $t2
#define t3 $t3
#define t4 $t4
#define t5 $t5
#define t6 $t6
#define t7 $t7
#define t8 $t8

#define s0 $s0
#define s1 $s1
#define s2 $s2
#define s3 $s3
#define s4 $s4
#define s5 $s5
#define s6 $s6
#define s7 $s7
#define s8 $s8

#define zero $zero
#define sp $sp
#define ra $ra

#define f0 $f0
#define f1 $f1
#define f2 $f2
#define f3 $f3
#define f4 $f4
#define f5 $f5
#define f6 $f6
#define f7 $f7
#define f8 $f8
#define f9 $f9
#define f10 $f10
#define f11 $f11
#define f12 $f12
#define f13 $f13
#define f14 $f14
#define f15 $f15
#define f16 $f16
#define f17 $f17
#define f18 $f18
#define f19 $f19
#define f20 $f20
#define f21 $f21
#define f22 $f22
#define f23 $f23
#define f24 $f24
#define f25 $f25
#define f26 $f26
#define f27 $f27
#define f28 $f28
#define f29 $f29
#define f30 $f30
#define f31 $f31

#define vr0 $vr0
#define vr1 $vr1
#define vr2 $vr2
#define vr3 $vr3
#define vr4 $vr4
#define vr5 $vr5
#define vr6 $vr6
#define vr7 $vr7
#define vr8 $vr8
#define vr9 $vr9
#define vr10 $vr10
#define vr11 $vr11
#define vr12 $vr12
#define vr13 $vr13
#define vr14 $vr14
#define vr15 $vr15
#define vr16 $vr16
#define vr17 $vr17
#define vr18 $vr18
#define vr19 $vr19
#define vr20 $vr20
#define vr21 $vr21
#define vr22 $vr22
#define vr23 $vr23
#define vr24 $vr24
#define vr25 $vr25
#define vr26 $vr26
#define vr27 $vr27
#define vr28 $vr28
#define vr29 $vr29
#define vr30 $vr30
#define vr31 $vr31

#define xr0 $xr0
#define xr1 $xr1
#define xr2 $xr2
#define xr3 $xr3
#define xr4 $xr4
#define xr5 $xr5
#define xr6 $xr6
#define xr7 $xr7
#define xr8 $xr8
#define xr9 $xr9
#define xr10 $xr10
#define xr11 $xr11
#define xr12 $xr12
#define xr13 $xr13
#define xr14 $xr14
#define xr15 $xr15
#define xr16 $xr16
#define xr17 $xr17
#define xr18 $xr18
#define xr19 $xr19
#define xr20 $xr20
#define xr21 $xr21
#define xr22 $xr22
#define xr23 $xr23
#define xr24 $xr24
#define xr25 $xr25
#define xr26 $xr26
#define xr27 $xr27
#define xr28 $xr28
#define xr29 $xr29
#define xr30 $xr30
#define xr31 $xr31

/*
 *============================================================================
 * LSX/LASX synthesize instructions
 *============================================================================
 */

/*
 * Description : Dot product of byte vector elements
 * Arguments   : Inputs  - vj, vk
 *               Outputs - vd
 *               Return Type - halfword
 */
.macro vdp2.h.bu vd, vj, vk
    vmulwev.h.bu \vd, \vj, \vk
    vmaddwod.h.bu \vd, \vj, \vk
.endm

.macro vdp2.h.bu.b vd, vj, vk
    vmulwev.h.bu.b \vd, \vj, \vk
    vmaddwod.h.bu.b \vd, \vj, \vk
.endm

.macro vdp2.w.h vd, vj, vk
    vmulwev.w.h \vd, \vj, \vk
    vmaddwod.w.h \vd, \vj, \vk
.endm

.macro xvdp2.h.bu xd, xj, xk
    xvmulwev.h.bu \xd, \xj, \xk
    xvmaddwod.h.bu \xd, \xj, \xk
.endm

.macro xvdp2.h.bu.b xd, xj, xk
    xvmulwev.h.bu.b \xd, \xj, \xk
    xvmaddwod.h.bu.b \xd, \xj, \xk
.endm

.macro xvdp2.w.h xd, xj, xk
    xvmulwev.w.h \xd, \xj, \xk
    xvmaddwod.w.h \xd, \xj, \xk
.endm

/*
 * Description : Dot product & addition of halfword vector elements
 * Arguments   : Inputs  - vj, vk
 *               Outputs - vd
 *               Return Type - twice size of input
 */
.macro vdp2add.h.bu vd, vj, vk
    vmaddwev.h.bu \vd, \vj, \vk
    vmaddwod.h.bu \vd, \vj, \vk
.endm

.macro vdp2add.h.bu.b vd, vj, vk
    vmaddwev.h.bu.b \vd, \vj, \vk
    vmaddwod.h.bu.b \vd, \vj, \vk
.endm

.macro vdp2add.w.h vd, vj, vk
    vmaddwev.w.h \vd, \vj, \vk
    vmaddwod.w.h \vd, \vj, \vk
.endm

.macro xvdp2add.h.bu.b xd, xj, xk
    xvmaddwev.h.bu.b \xd, \xj, \xk
    xvmaddwod.h.bu.b \xd, \xj, \xk
.endm

.macro xvdp2add.w.h xd, xj, xk
    xvmaddwev.w.h \xd, \xj, \xk
    xvmaddwod.w.h \xd, \xj, \xk
.endm
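/*
 * Illustrative note (not from the committed file): for each halfword lane i,
 *     vdp2.h.bu.b vd, vj, vk   computes   vd[i] = vj[2i]*vk[2i] + vj[2i+1]*vk[2i+1]
 * with vj read as unsigned bytes and vk as signed bytes, i.e. an even-lane
 * widening multiply followed by an odd-lane widening multiply-add. The
 * vdp2add variants accumulate the same products into vd instead of
 * overwriting it.
 */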
/*
 * Description : Range each element of vector
 *               clip: vj > vk ? vj : vk && vj < va ? vj : va
 *               clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0
 */
.macro vclip.h vd, vj, vk, va
    vmax.h \vd, \vj, \vk
    vmin.h \vd, \vd, \va
.endm

.macro vclip255.w vd, vj
    vmaxi.w \vd, \vj, 0
    vsat.wu \vd, \vd, 7
.endm

.macro vclip255.h vd, vj
    vmaxi.h \vd, \vj, 0
    vsat.hu \vd, \vd, 7
.endm

.macro xvclip.h xd, xj, xk, xa
    xvmax.h \xd, \xj, \xk
    xvmin.h \xd, \xd, \xa
.endm

.macro xvclip255.h xd, xj
    xvmaxi.h \xd, \xj, 0
    xvsat.hu \xd, \xd, 7
.endm

.macro xvclip255.w xd, xj
    xvmaxi.w \xd, \xj, 0
    xvsat.wu \xd, \xd, 7
.endm
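/*
 * Illustrative note: vclip255.h clamps each lane to [0, 255] - vmaxi.h
 * forces negative values to 0 and vsat.hu with immediate 7 saturates to
 * (1 << 8) - 1 = 255 - which is exactly the clipping needed before packing
 * 16-bit IDCT results back into 8-bit pixels.
 */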
/*
 * Description : Store elements of vector
 *               vd : Data vector to be stored
 *               rk : Address of data storage
 *               ra : Offset of address
 *               si : Index of data in vd
 */
.macro vstelmx.b vd, rk, ra, si
    add.d \rk, \rk, \ra
    vstelm.b \vd, \rk, 0, \si
.endm

.macro vstelmx.h vd, rk, ra, si
    add.d \rk, \rk, \ra
    vstelm.h \vd, \rk, 0, \si
.endm

.macro vstelmx.w vd, rk, ra, si
    add.d \rk, \rk, \ra
    vstelm.w \vd, \rk, 0, \si
.endm

.macro vstelmx.d vd, rk, ra, si
    add.d \rk, \rk, \ra
    vstelm.d \vd, \rk, 0, \si
.endm

.macro vmov xd, xj
    vor.v \xd, \xj, \xj
.endm

.macro xmov xd, xj
    xvor.v \xd, \xj, \xj
.endm

.macro xvstelmx.d xd, rk, ra, si
    add.d \rk, \rk, \ra
    xvstelm.d \xd, \rk, 0, \si
.endm
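/*
 * Illustrative note: the vstelmx.* macros first advance the address
 * register (rk += ra) and then store element si of vd at the new address,
 * so a row stride can be applied and the updated pointer reused for the
 * next row within a single macro invocation.
 */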
/*
 *============================================================================
 * LSX/LASX custom macros
 *============================================================================
 */

/*
 * Load 4 float, double, V128, v256 elements with stride.
 */
.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.s \out0, \src, 0
    fldx.s \out1, \src, \stride
    fldx.s \out2, \src, \stride2
    fldx.s \out3, \src, \stride3
.endm

.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.d \out0, \src, 0
    fldx.d \out1, \src, \stride
    fldx.d \out2, \src, \stride2
    fldx.d \out3, \src, \stride3
.endm

.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    vld \out0, \src, 0
    vldx \out1, \src, \stride
    vldx \out2, \src, \stride2
    vldx \out3, \src, \stride3
.endm

.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    xvld \out0, \src, 0
    xvldx \out1, \src, \stride
    xvldx \out2, \src, \stride2
    xvldx \out3, \src, \stride3
.endm
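/*
 * Usage sketch (illustrative, not from the committed file): the *_LOADX_4
 * macros expect the caller to precompute the stride multiples, e.g.
 *
 *     slli.d t0, a2, 1        // t0 = 2 * stride
 *     add.d  t1, t0, a2       // t1 = 3 * stride
 *     LSX_LOADX_4 a0, a2, t0, t1, vr0, vr1, vr2, vr3
 *
 * which loads four consecutive 16-byte rows starting at a0.
 */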
/*
 * Description : Transpose 4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                          tmp0, tmp1
    vilvl.h \tmp0, \in1, \in0
    vilvl.h \tmp1, \in3, \in2
    vilvl.w \out0, \tmp1, \tmp0
    vilvh.w \out2, \tmp1, \tmp0
    vilvh.d \out1, \out0, \out0
    vilvh.d \out3, \out0, \out2
.endm

/*
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     :
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 */
.macro LSX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
                          _tmp0, _tmp1

    vilvl.w \_tmp0, \_in1, \_in0
    vilvh.w \_out1, \_in1, \_in0
    vilvl.w \_tmp1, \_in3, \_in2
    vilvh.w \_out3, \_in3, \_in2

    vilvl.d \_out0, \_tmp1, \_tmp0
    vilvl.d \_out2, \_out3, \_out1
    vilvh.d \_out3, \_out3, \_out1
    vilvh.d \_out1, \_tmp1, \_tmp0
.endm

/*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                          out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
                          tmp3, tmp4, tmp5, tmp6, tmp7
    vilvl.h \tmp0, \in6, \in4
    vilvl.h \tmp1, \in7, \in5
    vilvl.h \tmp2, \in2, \in0
    vilvl.h \tmp3, \in3, \in1

    vilvl.h \tmp4, \tmp1, \tmp0
    vilvh.h \tmp5, \tmp1, \tmp0
    vilvl.h \tmp6, \tmp3, \tmp2
    vilvh.h \tmp7, \tmp3, \tmp2

    vilvh.h \tmp0, \in6, \in4
    vilvh.h \tmp1, \in7, \in5
    vilvh.h \tmp2, \in2, \in0
    vilvh.h \tmp3, \in3, \in1

    vpickev.d \out0, \tmp4, \tmp6
    vpickod.d \out1, \tmp4, \tmp6
    vpickev.d \out2, \tmp5, \tmp7
    vpickod.d \out3, \tmp5, \tmp7

    vilvl.h \tmp4, \tmp1, \tmp0
    vilvh.h \tmp5, \tmp1, \tmp0
    vilvl.h \tmp6, \tmp3, \tmp2
    vilvh.h \tmp7, \tmp3, \tmp2

    vpickev.d \out4, \tmp4, \tmp6
    vpickod.d \out5, \tmp4, \tmp6
    vpickev.d \out6, \tmp5, \tmp7
    vpickod.d \out7, \tmp5, \tmp7
.endm

/*
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - in0 ~ in15
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
                            in8, in9, in10, in11, in12, in13, in14, in15, \
                            out0, out1, out2, out3, out4, out5, out6, out7,\
                            tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    xvilvl.b \tmp0, \in2, \in0
    xvilvl.b \tmp1, \in3, \in1
    xvilvl.b \tmp2, \in6, \in4
    xvilvl.b \tmp3, \in7, \in5
    xvilvl.b \tmp4, \in10, \in8
    xvilvl.b \tmp5, \in11, \in9
    xvilvl.b \tmp6, \in14, \in12
    xvilvl.b \tmp7, \in15, \in13
    xvilvl.b \out0, \tmp1, \tmp0
    xvilvh.b \out1, \tmp1, \tmp0
    xvilvl.b \out2, \tmp3, \tmp2
    xvilvh.b \out3, \tmp3, \tmp2
    xvilvl.b \out4, \tmp5, \tmp4
    xvilvh.b \out5, \tmp5, \tmp4
    xvilvl.b \out6, \tmp7, \tmp6
    xvilvh.b \out7, \tmp7, \tmp6
    xvilvl.w \tmp0, \out2, \out0
    xvilvh.w \tmp2, \out2, \out0
    xvilvl.w \tmp4, \out3, \out1
    xvilvh.w \tmp6, \out3, \out1
    xvilvl.w \tmp1, \out6, \out4
    xvilvh.w \tmp3, \out6, \out4
    xvilvl.w \tmp5, \out7, \out5
    xvilvh.w \tmp7, \out7, \out5
    xvilvl.d \out0, \tmp1, \tmp0
    xvilvh.d \out1, \tmp1, \tmp0
    xvilvl.d \out2, \tmp3, \tmp2
    xvilvh.d \out3, \tmp3, \tmp2
    xvilvl.d \out4, \tmp5, \tmp4
    xvilvh.d \out5, \tmp5, \tmp4
    xvilvl.d \out6, \tmp7, \tmp6
    xvilvh.d \out7, \tmp7, \tmp6
.endm

/*
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - in0 ~ in15
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LSX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
                           in8, in9, in10, in11, in12, in13, in14, in15, \
                           out0, out1, out2, out3, out4, out5, out6, out7,\
                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    vilvl.b \tmp0, \in2, \in0
    vilvl.b \tmp1, \in3, \in1
    vilvl.b \tmp2, \in6, \in4
    vilvl.b \tmp3, \in7, \in5
    vilvl.b \tmp4, \in10, \in8
    vilvl.b \tmp5, \in11, \in9
    vilvl.b \tmp6, \in14, \in12
    vilvl.b \tmp7, \in15, \in13

    vilvl.b \out0, \tmp1, \tmp0
    vilvh.b \out1, \tmp1, \tmp0
    vilvl.b \out2, \tmp3, \tmp2
    vilvh.b \out3, \tmp3, \tmp2
    vilvl.b \out4, \tmp5, \tmp4
    vilvh.b \out5, \tmp5, \tmp4
    vilvl.b \out6, \tmp7, \tmp6
    vilvh.b \out7, \tmp7, \tmp6
    vilvl.w \tmp0, \out2, \out0
    vilvh.w \tmp2, \out2, \out0
    vilvl.w \tmp4, \out3, \out1
    vilvh.w \tmp6, \out3, \out1
    vilvl.w \tmp1, \out6, \out4
    vilvh.w \tmp3, \out6, \out4
    vilvl.w \tmp5, \out7, \out5
    vilvh.w \tmp7, \out7, \out5
    vilvl.d \out0, \tmp1, \tmp0
    vilvh.d \out1, \tmp1, \tmp0
    vilvl.d \out2, \tmp3, \tmp2
    vilvh.d \out3, \tmp3, \tmp2
    vilvl.d \out4, \tmp5, \tmp4
    vilvh.d \out5, \tmp5, \tmp4
    vilvl.d \out6, \tmp7, \tmp6
    vilvh.d \out7, \tmp7, \tmp6
.endm

/*
 * Description : Transpose 4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.h \tmp0, \in1, \in0
    xvilvl.h \tmp1, \in3, \in2
    xvilvl.w \out0, \tmp1, \tmp0
    xvilvh.w \out2, \tmp1, \tmp0
    xvilvh.d \out1, \out0, \out0
    xvilvh.d \out3, \out0, \out2
.endm

/*
 * Description : Transpose 4x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.h \tmp0, \in2, \in0
    xvilvl.h \tmp1, \in3, \in1
    xvilvl.h \out2, \tmp1, \tmp0
    xvilvh.h \out3, \tmp1, \tmp0

    xvilvl.d \out0, \out2, \out2
    xvilvh.d \out1, \out2, \out2
    xvilvl.d \out2, \out3, \out3
    xvilvh.d \out3, \out3, \out3
.endm

/*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \
                           out0, out1, out2, out3, out4, out5, out6, out7, \
                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    xvilvl.h \tmp0, \in6, \in4
    xvilvl.h \tmp1, \in7, \in5
    xvilvl.h \tmp2, \in2, \in0
    xvilvl.h \tmp3, \in3, \in1

    xvilvl.h \tmp4, \tmp1, \tmp0
    xvilvh.h \tmp5, \tmp1, \tmp0
    xvilvl.h \tmp6, \tmp3, \tmp2
    xvilvh.h \tmp7, \tmp3, \tmp2

    xvilvh.h \tmp0, \in6, \in4
    xvilvh.h \tmp1, \in7, \in5
    xvilvh.h \tmp2, \in2, \in0
    xvilvh.h \tmp3, \in3, \in1

    xvpickev.d \out0, \tmp4, \tmp6
    xvpickod.d \out1, \tmp4, \tmp6
    xvpickev.d \out2, \tmp5, \tmp7
    xvpickod.d \out3, \tmp5, \tmp7

    xvilvl.h \tmp4, \tmp1, \tmp0
    xvilvh.h \tmp5, \tmp1, \tmp0
    xvilvl.h \tmp6, \tmp3, \tmp2
    xvilvh.h \tmp7, \tmp3, \tmp2

    xvpickev.d \out4, \tmp4, \tmp6
    xvpickod.d \out5, \tmp4, \tmp6
    xvpickev.d \out6, \tmp5, \tmp7
    xvpickod.d \out7, \tmp5, \tmp7
.endm

/*
 * Description : Transpose 2x4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                             tmp0, tmp1, tmp2
    xvilvh.h \tmp1, \in0, \in1
    xvilvl.h \out1, \in0, \in1
    xvilvh.h \tmp0, \in2, \in3
    xvilvl.h \out3, \in2, \in3

    xvilvh.w \tmp2, \out3, \out1
    xvilvl.w \out3, \out3, \out1

    xvilvl.w \out2, \tmp0, \tmp1
    xvilvh.w \tmp1, \tmp0, \tmp1

    xvilvh.d \out0, \out2, \out3
    xvilvl.d \out2, \out2, \out3
    xvilvh.d \out1, \tmp1, \tmp2
    xvilvl.d \out3, \tmp1, \tmp2
.endm

/*
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     :
 * Example     :
 *               1, 2, 3, 4,  1, 2, 3, 4           1,5, 9,13, 1,5, 9,13
 *               5, 6, 7, 8,  5, 6, 7, 8    to     2,6,10,14, 2,6,10,14
 *               9,10,11,12,  9,10,11,12  =====>   3,7,11,15, 3,7,11,15
 *              13,14,15,16, 13,14,15,16           4,8,12,16, 4,8,12,16
 */
.macro LASX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
                           _tmp0, _tmp1

    xvilvl.w \_tmp0, \_in1, \_in0
    xvilvh.w \_out1, \_in1, \_in0
    xvilvl.w \_tmp1, \_in3, \_in2
    xvilvh.w \_out3, \_in3, \_in2

    xvilvl.d \_out0, \_tmp1, \_tmp0
    xvilvl.d \_out2, \_out3, \_out1
    xvilvh.d \_out3, \_out3, \_out1
    xvilvh.d \_out1, \_tmp1, \_tmp0
.endm

/*
 * Description : Transpose 8x8 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Example     : LASX_TRANSPOSE8x8_W
 *               _in0 : 1,2,3,4,5,6,7,8
 *               _in1 : 2,2,3,4,5,6,7,8
 *               _in2 : 3,2,3,4,5,6,7,8
 *               _in3 : 4,2,3,4,5,6,7,8
 *               _in4 : 5,2,3,4,5,6,7,8
 *               _in5 : 6,2,3,4,5,6,7,8
 *               _in6 : 7,2,3,4,5,6,7,8
 *               _in7 : 8,2,3,4,5,6,7,8
 *
 *               _out0 : 1,2,3,4,5,6,7,8
 *               _out1 : 2,2,2,2,2,2,2,2
 *               _out2 : 3,3,3,3,3,3,3,3
 *               _out3 : 4,4,4,4,4,4,4,4
 *               _out4 : 5,5,5,5,5,5,5,5
 *               _out5 : 6,6,6,6,6,6,6,6
 *               _out6 : 7,7,7,7,7,7,7,7
 *               _out7 : 8,8,8,8,8,8,8,8
 */
.macro LASX_TRANSPOSE8x8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,\
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7,\
                           _tmp0, _tmp1, _tmp2, _tmp3
    xvilvl.w \_tmp0, \_in2, \_in0
    xvilvl.w \_tmp1, \_in3, \_in1
    xvilvh.w \_tmp2, \_in2, \_in0
    xvilvh.w \_tmp3, \_in3, \_in1
    xvilvl.w \_out0, \_tmp1, \_tmp0
    xvilvh.w \_out1, \_tmp1, \_tmp0
    xvilvl.w \_out2, \_tmp3, \_tmp2
    xvilvh.w \_out3, \_tmp3, \_tmp2

    xvilvl.w \_tmp0, \_in6, \_in4
    xvilvl.w \_tmp1, \_in7, \_in5
    xvilvh.w \_tmp2, \_in6, \_in4
    xvilvh.w \_tmp3, \_in7, \_in5
    xvilvl.w \_out4, \_tmp1, \_tmp0
    xvilvh.w \_out5, \_tmp1, \_tmp0
    xvilvl.w \_out6, \_tmp3, \_tmp2
    xvilvh.w \_out7, \_tmp3, \_tmp2

    xmov \_tmp0, \_out0
    xmov \_tmp1, \_out1
    xmov \_tmp2, \_out2
    xmov \_tmp3, \_out3
    xvpermi.q \_out0, \_out4, 0x02
    xvpermi.q \_out1, \_out5, 0x02
    xvpermi.q \_out2, \_out6, 0x02
    xvpermi.q \_out3, \_out7, 0x02
    xvpermi.q \_out4, \_tmp0, 0x31
    xvpermi.q \_out5, \_tmp1, 0x31
    xvpermi.q \_out6, \_tmp2, 0x31
    xvpermi.q \_out7, \_tmp3, 0x31
.endm

/*
 * Description : Transpose 4x4 block with double-word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Example     : LASX_TRANSPOSE4x4_D
 *               _in0 : 1,2,3,4
 *               _in1 : 1,2,3,4
 *               _in2 : 1,2,3,4
 *               _in3 : 1,2,3,4
 *
 *               _out0 : 1,1,1,1
 *               _out1 : 2,2,2,2
 *               _out2 : 3,3,3,3
 *               _out3 : 4,4,4,4
 */
.macro LASX_TRANSPOSE4x4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
                           _tmp0, _tmp1
    xvilvl.d \_tmp0, \_in1, \_in0
    xvilvh.d \_out1, \_in1, \_in0
    xvilvh.d \_tmp1, \_in3, \_in2
    xvilvl.d \_out2, \_in3, \_in2

    xvor.v \_out0, \_tmp0, \_tmp0
    xvor.v \_out3, \_tmp1, \_tmp1

    xvpermi.q \_out0, \_out2, 0x02
    xvpermi.q \_out2, \_tmp0, 0x31
    xvpermi.q \_out3, \_out1, 0x31
    xvpermi.q \_out1, \_tmp1, 0x02
.endm

/*
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation
 * Example     : LSX_BUTTERFLY_4
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 */
.macro LSX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    vadd.b \_out0, \_in0, \_in3
    vadd.b \_out1, \_in1, \_in2
    vsub.b \_out2, \_in1, \_in2
    vsub.b \_out3, \_in0, \_in3
.endm
.macro LSX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    vadd.h \_out0, \_in0, \_in3
    vadd.h \_out1, \_in1, \_in2
    vsub.h \_out2, \_in1, \_in2
    vsub.h \_out3, \_in0, \_in3
.endm
.macro LSX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    vadd.w \_out0, \_in0, \_in3
    vadd.w \_out1, \_in1, \_in2
    vsub.w \_out2, \_in1, \_in2
    vsub.w \_out3, \_in0, \_in3
.endm
.macro LSX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    vadd.d \_out0, \_in0, \_in3
    vadd.d \_out1, \_in1, \_in2
    vsub.d \_out2, \_in1, \_in2
    vsub.d \_out3, \_in0, \_in3
.endm

.macro LASX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    xvadd.b \_out0, \_in0, \_in3
    xvadd.b \_out1, \_in1, \_in2
    xvsub.b \_out2, \_in1, \_in2
    xvsub.b \_out3, \_in0, \_in3
.endm
.macro LASX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    xvadd.h \_out0, \_in0, \_in3
    xvadd.h \_out1, \_in1, \_in2
    xvsub.h \_out2, \_in1, \_in2
    xvsub.h \_out3, \_in0, \_in3
.endm
.macro LASX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    xvadd.w \_out0, \_in0, \_in3
    xvadd.w \_out1, \_in1, \_in2
    xvsub.w \_out2, \_in1, \_in2
    xvsub.w \_out3, \_in0, \_in3
.endm
.macro LASX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    xvadd.d \_out0, \_in0, \_in3
    xvadd.d \_out1, \_in1, \_in2
    xvsub.d \_out2, \_in1, \_in2
    xvsub.d \_out3, \_in0, \_in3
.endm

/*
 * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
 *               Outputs - _out0, _out1, _out2, _out3, ~
 * Details     : Butterfly operation
 * Example     : LASX_BUTTERFLY_8
 *               _out0 = _in0 + _in7;
 *               _out1 = _in1 + _in6;
 *               _out2 = _in2 + _in5;
 *               _out3 = _in3 + _in4;
 *               _out4 = _in3 - _in4;
 *               _out5 = _in2 - _in5;
 *               _out6 = _in1 - _in6;
 *               _out7 = _in0 - _in7;
 */
.macro LSX_BUTTERFLY_8_B _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
    vadd.b \_out0, \_in0, \_in7
    vadd.b \_out1, \_in1, \_in6
    vadd.b \_out2, \_in2, \_in5
    vadd.b \_out3, \_in3, \_in4
    vsub.b \_out4, \_in3, \_in4
    vsub.b \_out5, \_in2, \_in5
    vsub.b \_out6, \_in1, \_in6
    vsub.b \_out7, \_in0, \_in7
.endm

.macro LSX_BUTTERFLY_8_H _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
    vadd.h \_out0, \_in0, \_in7
    vadd.h \_out1, \_in1, \_in6
    vadd.h \_out2, \_in2, \_in5
    vadd.h \_out3, \_in3, \_in4
    vsub.h \_out4, \_in3, \_in4
    vsub.h \_out5, \_in2, \_in5
    vsub.h \_out6, \_in1, \_in6
    vsub.h \_out7, \_in0, \_in7
.endm

.macro LSX_BUTTERFLY_8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
    vadd.w \_out0, \_in0, \_in7
    vadd.w \_out1, \_in1, \_in6
    vadd.w \_out2, \_in2, \_in5
    vadd.w \_out3, \_in3, \_in4
    vsub.w \_out4, \_in3, \_in4
    vsub.w \_out5, \_in2, \_in5
    vsub.w \_out6, \_in1, \_in6
    vsub.w \_out7, \_in0, \_in7
.endm

.macro LSX_BUTTERFLY_8_D _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
    vadd.d \_out0, \_in0, \_in7
    vadd.d \_out1, \_in1, \_in6
    vadd.d \_out2, \_in2, \_in5
    vadd.d \_out3, \_in3, \_in4
    vsub.d \_out4, \_in3, \_in4
    vsub.d \_out5, \_in2, \_in5
    vsub.d \_out6, \_in1, \_in6
    vsub.d \_out7, \_in0, \_in7
.endm

.macro LASX_BUTTERFLY_8_B _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
    xvadd.b \_out0, \_in0, \_in7
    xvadd.b \_out1, \_in1, \_in6
    xvadd.b \_out2, \_in2, \_in5
    xvadd.b \_out3, \_in3, \_in4
    xvsub.b \_out4, \_in3, \_in4
    xvsub.b \_out5, \_in2, \_in5
    xvsub.b \_out6, \_in1, \_in6
    xvsub.b \_out7, \_in0, \_in7
.endm

.macro LASX_BUTTERFLY_8_H _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
    xvadd.h \_out0, \_in0, \_in7
    xvadd.h \_out1, \_in1, \_in6
    xvadd.h \_out2, \_in2, \_in5
    xvadd.h \_out3, \_in3, \_in4
    xvsub.h \_out4, \_in3, \_in4
    xvsub.h \_out5, \_in2, \_in5
    xvsub.h \_out6, \_in1, \_in6
    xvsub.h \_out7, \_in0, \_in7
.endm

.macro LASX_BUTTERFLY_8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
    xvadd.w \_out0, \_in0, \_in7
    xvadd.w \_out1, \_in1, \_in6
    xvadd.w \_out2, \_in2, \_in5
    xvadd.w \_out3, \_in3, \_in4
    xvsub.w \_out4, \_in3, \_in4
    xvsub.w \_out5, \_in2, \_in5
    xvsub.w \_out6, \_in1, \_in6
    xvsub.w \_out7, \_in0, \_in7
.endm
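The TRANSPOSE and BUTTERFLY helpers above are the building blocks of the IDCT routines this commit adds. As a point of reference, one 4-point butterfly stage corresponds to the following scalar sketch (illustrative only, not the committed implementation):

    #include <stdint.h>

    /* Scalar equivalent of LSX_BUTTERFLY_4_H: sums of mirrored inputs in the
     * first half of the outputs, differences in the second half. */
    static inline void butterfly4(const int16_t in[4], int16_t out[4])
    {
        out[0] = in[0] + in[3];
        out[1] = in[1] + in[2];
        out[2] = in[1] - in[2];
        out[3] = in[0] - in[3];
    }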