mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-03 05:10:03 +02:00
7845b5ecd6
Replaced function(LSX is sufficient for these functions): ff_h264_v_lpf_chroma_8_lasx ff_h264_h_lpf_chroma_8_lasx ff_h264_v_lpf_chroma_intra_8_lasx ff_h264_h_lpf_chroma_intra_8_lasx ff_weight_h264_pixels4_8_lasx ff_biweight_h264_pixels4_8_lasx ./configure --disable-lasx ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an before: 161fps after: 199fps Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
1978 lines
74 KiB
ArmAsm
1978 lines
74 KiB
ArmAsm
/*
|
|
* Loongson LSX/LASX optimized h264dsp
|
|
*
|
|
* Copyright (c) 2023 Loongson Technology Corporation Limited
|
|
* Contributed by Hao Chen <chenhao@loongson.cn>
|
|
*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#include "loongson_asm.S"
|
|
|
|
/*
 * Byte-shuffle indices used by the luma filters with vshuf.b: each of the
 * four source bytes (0..3) is selected four times in a row.
 * The table holds the 16-byte pattern twice (32 bytes); the LSX code in this
 * file only loads the first 16 bytes with vld.
 */
const vec_shuf
    .byte   0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
    .byte   0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
endconst
|
|
|
|
/*
 * Compute a filtered p1 (or q1) value on 16-bit lanes.
 *   _in0, _in1 : the two samples that are rounding-averaged (callers pass
 *                p0 and q0)
 *   _in2       : the sample being corrected (p1 or q1)
 *   _in3       : the outer sample (p2 or q2)
 *   _in4, _in5 : lower / upper clip bounds (callers pass -tc and tc)
 *   _out       : result = _in2 + clip((_in3 + (avgr(_in0, _in1) - 2 * _in2)) >> 1)
 *   _tmp0/_tmp1: scratch vectors, clobbered
 * vclip.h is not defined in this file (presumably provided by
 * loongson_asm.S).
 */
.macro AVC_LPF_P1_OR_Q1 _in0, _in1, _in2, _in3, _in4, _in5, _out, _tmp0, _tmp1
    vavgr.hu        \_tmp0,   \_in0,    \_in1             // rounding average of _in0/_in1
    vslli.h         \_tmp1,   \_in2,    1                 // 2 * _in2
    vsub.h          \_tmp0,   \_tmp0,   \_tmp1
    vavg.h          \_tmp0,   \_in3,    \_tmp0            // (_in3 + tmp0) >> 1, signed
    vclip.h         \_tmp0,   \_tmp0,   \_in4,    \_in5   // limit the correction
    vadd.h          \_out,    \_in2,    \_tmp0
.endm
|
|
|
|
/*
 * Compute the filtered pair of samples adjacent to the edge on 16-bit lanes.
 *   delta = clip((((_in0 - _in1) << 2) + (_in2 - _in3) + 4) >> 3, _in4, _in5)
 *   _out0 = clip255(_in1 + delta)
 *   _out1 = clip255(_in0 - delta)
 * Callers in this file pass _in0 = q0, _in1 = p0, _in2 = p1, _in3 = q1,
 * _in4 = -tc and _in5 = tc.  _tmp0/_tmp1 are scratch and are clobbered.
 * vclip.h / vclip255.h are not defined in this file (presumably provided by
 * loongson_asm.S).
 */
.macro AVC_LPF_P0Q0 _in0, _in1, _in2, _in3, _in4, _in5, _out0, \
                    _out1, _tmp0, _tmp1
    vsub.h          \_tmp0,   \_in0,    \_in1
    vsub.h          \_tmp1,   \_in2,    \_in3
    vslli.h         \_tmp0,   \_tmp0,   2
    vaddi.hu        \_tmp1,   \_tmp1,   4                 // rounding term
    vadd.h          \_tmp0,   \_tmp0,   \_tmp1
    vsrai.h         \_tmp0,   \_tmp0,   3
    vclip.h         \_tmp0,   \_tmp0,   \_in4,    \_in5   // delta
    vadd.h          \_out0,   \_in1,    \_tmp0
    vsub.h          \_out1,   \_in0,    \_tmp0
    vclip255.h      \_out0,   \_out0
    vclip255.h      \_out1,   \_out1
.endm
|
|
|
|
/*
 * Push f24..f31 (the low 64 bits of vr24..vr31) onto the stack.
 * Reserves 64 bytes; must be paired with RESTORE_REG.
 */
.macro SAVE_REG
    addi.d          sp,       sp,       -64
    fst.d           f24,      sp,       0
    fst.d           f25,      sp,       8
    fst.d           f26,      sp,       16
    fst.d           f27,      sp,       24
    fst.d           f28,      sp,       32
    fst.d           f29,      sp,       40
    fst.d           f30,      sp,       48
    fst.d           f31,      sp,       56
.endm
|
|
|
|
/*
 * Reload f24..f31 from the 64-byte area written by SAVE_REG and release it.
 */
.macro RESTORE_REG
    fld.d           f24,      sp,       0
    fld.d           f25,      sp,       8
    fld.d           f26,      sp,       16
    fld.d           f27,      sp,       24
    fld.d           f28,      sp,       32
    fld.d           f29,      sp,       40
    fld.d           f30,      sp,       48
    fld.d           f31,      sp,       56
    addi.d          sp,       sp,       64
.endm
|
|
|
|
/*
 * Load four 64-bit values into FP registers _in0.._in3 from
 * _src, _src + _str0, _src + _str1 and _src + _str2 (register offsets).
 */
.macro load_double _in0, _in1, _in2, _in3, _src, _str0, _str1, _str2
    fld.d           \_in0,    \_src,    0
    fldx.d          \_in1,    \_src,    \_str0
    fldx.d          \_in2,    \_src,    \_str1
    fldx.d          \_in3,    \_src,    \_str2
.endm
|
|
|
|
/*
 * Store the 64-bit FP registers _in0.._in3 to
 * _dst, _dst + _str0, _dst + _str1 and _dst + _str2 (register offsets).
 */
.macro store_double _in0, _in1, _in2, _in3, _dst, _str0, _str1, _str2
    fst.d           \_in0,    \_dst,    0
    fstx.d          \_in1,    \_dst,    \_str0
    fstx.d          \_in2,    \_dst,    \_str1
    fstx.d          \_in3,    \_dst,    \_str2
.endm
|
|
|
|
/*
 * Luma loop filter across a vertical edge (16 rows), LSX.
 *
 * Register use as seen in this function:
 *   a0 = address of the first sample to the right of the edge (q0 of row 0)
 *   a1 = row stride in bytes
 *   a2 = alpha threshold (replicated as a byte)
 *   a3 = beta threshold (replicated as a byte)
 *   a4 = address of four tc bytes; each one is applied to four rows
 *
 * 16 rows are loaded starting at a0 - 4, transposed so that p3..q3 each sit
 * in their own vector, filtered, transposed back and written out as 8 bytes
 * per row.  vr24..vr31 are used, so f24..f31 are saved with SAVE_REG and
 * restored with RESTORE_REG on every exit path.
 */
function ff_h264_h_lpf_luma_8_lsx
    slli.d          t0,       a1,       1                 // stride * 2
    slli.d          t1,       a1,       2                 // stride * 4
    slli.d          t2,       a1,       3                 // stride * 8
    SAVE_REG
    la.local        t4,       vec_shuf
    add.d           t3,       t0,       a1                // stride * 3
    vldrepl.w       vr0,      a4,       0                 // the 4 tc bytes, replicated
    vld             vr1,      t4,       0                 // shuffle indices
    vshuf.b         vr1,      vr0,      vr0,      vr1     // tc: each byte repeated 4 times
    vslti.b         vr2,      vr1,      0                 // 0xff where tc < 0
    vxori.b         vr2,      vr2,      255               // 0xff where tc >= 0
    vandi.b         vr2,      vr2,      1                 // 1 where tc >= 0
    vsetnez.v       $fcc0,    vr2
    bceqz           $fcc0,    .END_LUMA_8                 // every tc negative: nothing to do
    vldi            vr0,      0                           // zero
    addi.d          t4,       a0,       -4                // row 0, starting at p3
    vslt.bu         vr3,      vr0,      vr2               // lane mask: tc >= 0
    add.d           t5,       t4,       t2                // row 8
    vld             vr4,      t4,       0                 // row 0
    vldx            vr5,      t4,       a1                // row 1
    vldx            vr6,      t4,       t0                // row 2
    vldx            vr7,      t4,       t3                // row 3
    add.d           t6,       t4,       t1                // row 4
    vld             vr8,      t6,       0                 // row 4
    vldx            vr9,      t6,       a1                // row 5
    vldx            vr10,     t6,       t0                // row 6
    vldx            vr11,     t6,       t3                // row 7
    vld             vr12,     t5,       0                 // row 8
    vldx            vr13,     t5,       a1                // row 9
    vldx            vr14,     t5,       t0                // row 10
    vldx            vr15,     t5,       t3                // row 11
    add.d           t6,       t5,       t1                // row 12
    vld             vr16,     t6,       0                 // row 12
    vldx            vr17,     t6,       a1                // row 13
    vldx            vr18,     t6,       t0                // row 14
    vldx            vr19,     t6,       t3                // row 15
    LSX_TRANSPOSE16X8_B vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11,        \
                        vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19,  \
                        vr10, vr11, vr12, vr13, vr14, vr15, vr16, vr17,  \
                        vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
    // After the transpose:
    //   vr10 = p3, vr11 = p2, vr12 = p1, vr13 = p0
    //   vr14 = q0, vr15 = q1, vr16 = q2, vr17 = q3
    vabsd.bu        vr20,     vr13,     vr14              // |p0 - q0|
    vabsd.bu        vr21,     vr12,     vr13              // |p1 - p0|
    vabsd.bu        vr22,     vr15,     vr14              // |q1 - q0|

    vreplgr2vr.b    vr4,      a2                          // alpha
    vreplgr2vr.b    vr5,      a3                          // beta

    vslt.bu         vr6,      vr20,     vr4               // |p0 - q0| < alpha
    vslt.bu         vr7,      vr21,     vr5               // |p1 - p0| < beta
    vand.v          vr8,      vr6,      vr7
    vslt.bu         vr7,      vr22,     vr5               // |q1 - q0| < beta
    vand.v          vr8,      vr7,      vr8
    vand.v          vr8,      vr8,      vr3               // vr8 = lanes that get filtered
    vsetnez.v       $fcc0,    vr8
    bceqz           $fcc0,    .END_LUMA_8
    vneg.b          vr9,      vr1                         // -tc (bytes)
    vsllwil.hu.bu   vr18,     vr1,      0                 // tc, low half widened
    vexth.hu.bu     vr19,     vr1                         // tc, high half widened
    vexth.h.b       vr2,      vr9                         // -tc, high half widened
    vsllwil.h.b     vr9,      vr9,      0                 // -tc, low half widened

    vsllwil.hu.bu   vr23,     vr12,     0                 // p1 low
    vexth.hu.bu     vr3,      vr12                        // p1 high
    vsllwil.hu.bu   vr24,     vr13,     0                 // p0 low
    vexth.hu.bu     vr4,      vr13                        // p0 high
    vsllwil.hu.bu   vr25,     vr14,     0                 // q0 low
    vexth.hu.bu     vr6,      vr14                        // q0 high

    vabsd.bu        vr0,      vr11,     vr13              // |p2 - p0|
    vslt.bu         vr7,      vr0,      vr5
    vand.v          vr7,      vr8,      vr7               // lanes where p1 is also corrected
    vsetnez.v       $fcc0,    vr7
    bceqz           $fcc0,    .END_LUMA_BETA
    vsllwil.hu.bu   vr26,     vr11,     0                 // p2 low
    vexth.hu.bu     vr0,      vr11                        // p2 high
    AVC_LPF_P1_OR_Q1 vr24, vr25, vr23, vr26, vr9, vr18, vr27, vr28, vr29
    AVC_LPF_P1_OR_Q1 vr4, vr6, vr3, vr0, vr2, vr19, vr28, vr29, vr30
    vpickev.b       vr27,     vr28,     vr27              // narrow back to bytes
    vbitsel.v       vr12,     vr12,     vr27,     vr7     // new p1 in selected lanes
    vandi.b         vr7,      vr7,      1
    vadd.b          vr1,      vr1,      vr7               // tc += 1 in those lanes
.END_LUMA_BETA:
    vabsd.bu        vr26,     vr16,     vr14              // |q2 - q0|
    vslt.bu         vr7,      vr26,     vr5
    vand.v          vr7,      vr7,      vr8               // lanes where q1 is also corrected
    vsllwil.hu.bu   vr27,     vr15,     0                 // q1 low
    vexth.hu.bu     vr26,     vr15                        // q1 high
    vsetnez.v       $fcc0,    vr7
    bceqz           $fcc0,    .END_LUMA_BETA_SEC
    vsllwil.hu.bu   vr28,     vr16,     0                 // q2 low
    vexth.hu.bu     vr0,      vr16                        // q2 high
    AVC_LPF_P1_OR_Q1 vr24, vr25, vr27, vr28, vr9, vr18, vr29, vr30, vr31
    AVC_LPF_P1_OR_Q1 vr4, vr6, vr26, vr0, vr2, vr19, vr22, vr30, vr31
    vpickev.b       vr29,     vr22,     vr29
    vbitsel.v       vr15,     vr15,     vr29,     vr7     // new q1 in selected lanes
    vandi.b         vr7,      vr7,      1
    vadd.b          vr1,      vr1,      vr7               // tc += 1 in those lanes
.END_LUMA_BETA_SEC:
    vneg.b          vr22,     vr1                         // -tc (updated)
    vsllwil.h.b     vr28,     vr22,     0                 // -tc low
    vexth.h.b       vr29,     vr22                        // -tc high
    vsllwil.hu.bu   vr18,     vr1,      0                 // tc low
    vexth.hu.bu     vr1,      vr1                         // tc high
    AVC_LPF_P0Q0 vr25, vr24, vr23, vr27, vr28, vr18, vr30, vr31, vr0, vr2
    AVC_LPF_P0Q0 vr6, vr4, vr3, vr26, vr29, vr1, vr20, vr21, vr0, vr2
    vpickev.b       vr30,     vr20,     vr30              // filtered p0
    vpickev.b       vr31,     vr21,     vr31              // filtered q0
    vbitsel.v       vr13,     vr13,     vr30,     vr8     // p0
    vbitsel.v       vr14,     vr14,     vr31,     vr8     // q0

    // Transpose the eight column vectors (vr10..vr17) back into rows.
    vilvl.b         vr4,      vr12,     vr10
    vilvl.b         vr5,      vr16,     vr14
    vilvl.b         vr6,      vr13,     vr11
    vilvl.b         vr7,      vr17,     vr15

    vilvh.b         vr8,      vr12,     vr10
    vilvh.b         vr9,      vr16,     vr14
    vilvh.b         vr10,     vr13,     vr11
    vilvh.b         vr11,     vr17,     vr15

    vilvl.b         vr12,     vr6,      vr4
    vilvl.b         vr13,     vr7,      vr5
    vilvl.b         vr14,     vr10,     vr8
    vilvl.b         vr15,     vr11,     vr9

    vilvh.b         vr16,     vr6,      vr4
    vilvh.b         vr17,     vr7,      vr5
    vilvh.b         vr18,     vr10,     vr8
    vilvh.b         vr19,     vr11,     vr9

    vilvl.w         vr4,      vr13,     vr12              // rows 0, 1
    vilvh.w         vr5,      vr13,     vr12              // rows 2, 3
    vilvl.w         vr6,      vr17,     vr16              // rows 4, 5
    vilvh.w         vr7,      vr17,     vr16              // rows 6, 7

    vilvl.w         vr8,      vr15,     vr14              // rows 8, 9
    vilvh.w         vr9,      vr15,     vr14              // rows 10, 11
    vilvl.w         vr10,     vr19,     vr18              // rows 12, 13
    vilvh.w         vr11,     vr19,     vr18              // rows 14, 15

    // Move each vector's upper 8 bytes down so fst.d can store them.
    vbsrl.v         vr20,     vr4,      8
    vbsrl.v         vr21,     vr5,      8
    vbsrl.v         vr22,     vr6,      8
    vbsrl.v         vr23,     vr7,      8

    vbsrl.v         vr24,     vr8,      8
    vbsrl.v         vr25,     vr9,      8
    vbsrl.v         vr26,     vr10,     8
    vbsrl.v         vr27,     vr11,     8

    store_double f4, f20, f5, f21, t4, a1, t0, t3         // rows 0..3
    add.d           t4,       t4,       t1
    store_double f6, f22, f7, f23, t4, a1, t0, t3         // rows 4..7
    add.d           t4,       t4,       t1
    store_double f8, f24, f9, f25, t4, a1, t0, t3         // rows 8..11
    add.d           t4,       t4,       t1
    store_double f10, f26, f11, f27, t4, a1, t0, t3       // rows 12..15
.END_LUMA_8:
    RESTORE_REG
endfunc
|
|
|
|
/*
 * Luma loop filter across a horizontal edge (16 columns), LSX.
 *
 * Register use as seen in this function:
 *   a0 = address of the first row below the edge (q0)
 *   a1 = row stride in bytes
 *   a2 = alpha threshold (replicated as a byte)
 *   a3 = beta threshold (replicated as a byte)
 *   a4 = address of four tc bytes; each one is applied to four columns
 *
 * Rows p2, p1, p0 (above a0) and q0, q1, q2 (from a0 down) are loaded as
 * 16-byte vectors; p1, q1, p0 and q0 are written back where the masks allow.
 * vr24..vr26 are used, so f24..f26 are spilled to the stack.  The frame is
 * 32 bytes (not 24) so that sp stays 16-byte aligned while it is lowered.
 */
function ff_h264_v_lpf_luma_8_lsx
    slli.d          t0,       a1,       1                 // stride * 2
    la.local        t4,       vec_shuf
    vldrepl.w       vr0,      a4,       0                 // the 4 tc bytes, replicated
    vld             vr1,      t4,       0                 // shuffle indices
    add.d           t1,       t0,       a1                // stride * 3
    vshuf.b         vr1,      vr0,      vr0,      vr1     // tc: each byte repeated 4 times
    addi.d          sp,       sp,       -32               // 24 bytes used, padded to 32
    fst.d           f24,      sp,       0
    fst.d           f25,      sp,       8
    fst.d           f26,      sp,       16
    vslti.b         vr2,      vr1,      0                 // 0xff where tc < 0
    vxori.b         vr2,      vr2,      255               // 0xff where tc >= 0
    vandi.b         vr2,      vr2,      1                 // 1 where tc >= 0
    vsetnez.v       $fcc0,    vr2
    bceqz           $fcc0,    .END_V_LUMA_8               // every tc negative: nothing to do
    sub.d           t2,       a0,       t1                // address of row p2
    vreplgr2vr.b    vr4,      a2                          // alpha
    vreplgr2vr.b    vr5,      a3                          // beta
    vldi            vr0,      0                           // zero
    vld             vr10,     t2,       0                 // p2
    vldx            vr11,     t2,       a1                // p1
    vldx            vr12,     t2,       t0                // p0
    vld             vr13,     a0,       0                 // q0
    vldx            vr14,     a0,       a1                // q1

    vslt.bu         vr0,      vr0,      vr2               // lane mask: tc >= 0
    vabsd.bu        vr16,     vr11,     vr12              // |p1 - p0|
    vabsd.bu        vr15,     vr12,     vr13              // |p0 - q0|
    vabsd.bu        vr17,     vr14,     vr13              // |q1 - q0|

    vslt.bu         vr6,      vr15,     vr4               // |p0 - q0| < alpha
    vslt.bu         vr7,      vr16,     vr5               // |p1 - p0| < beta
    vand.v          vr8,      vr6,      vr7
    vslt.bu         vr7,      vr17,     vr5               // |q1 - q0| < beta
    vand.v          vr8,      vr7,      vr8
    vand.v          vr8,      vr8,      vr0               // vr8 = lanes that get filtered

    vsetnez.v       $fcc0,    vr8
    bceqz           $fcc0,    .END_V_LUMA_8
    vldx            vr15,     a0,       t0                // q2
    vneg.b          vr0,      vr1                         // -tc (bytes)
    vsllwil.h.b     vr18,     vr1,      0                 // tc low
    vexth.h.b       vr19,     vr1                         // tc high
    vsllwil.h.b     vr9,      vr0,      0                 // -tc low
    vexth.h.b       vr2,      vr0                         // -tc high

    vsllwil.hu.bu   vr16,     vr11,     0                 // p1 low
    vexth.hu.bu     vr17,     vr11                        // p1 high
    vsllwil.hu.bu   vr20,     vr12,     0                 // p0 low
    vexth.hu.bu     vr21,     vr12                        // p0 high
    vsllwil.hu.bu   vr22,     vr13,     0                 // q0 low
    vexth.hu.bu     vr23,     vr13                        // q0 high

    vabsd.bu        vr0,      vr10,     vr12              // |p2 - p0|
    vslt.bu         vr7,      vr0,      vr5
    vand.v          vr7,      vr7,      vr8               // lanes where p1 is also corrected

    // Test the p1 mask (vr7), not vr8: vr8 is already known to be non-zero
    // here, so testing it would never skip the p1 update.
    vsetnez.v       $fcc0,    vr7
    bceqz           $fcc0,    .END_V_LESS_BETA
    vsllwil.hu.bu   vr3,      vr10,     0                 // p2 low
    vexth.hu.bu     vr4,      vr10                        // p2 high
    AVC_LPF_P1_OR_Q1 vr20, vr22, vr16, vr3, vr9, vr18, vr24, vr0, vr26
    AVC_LPF_P1_OR_Q1 vr21, vr23, vr17, vr4, vr2, vr19, vr25, vr0, vr26
    vpickev.b       vr24,     vr25,     vr24              // narrow back to bytes
    vbitsel.v       vr24,     vr11,     vr24,     vr7     // new p1 in selected lanes
    vstx            vr24,     t2,       a1                // write p1
    vandi.b         vr7,      vr7,      1
    vadd.b          vr1,      vr7,      vr1               // tc += 1 in those lanes
.END_V_LESS_BETA:
    vabsd.bu        vr0,      vr15,     vr13              // |q2 - q0|
    vslt.bu         vr7,      vr0,      vr5
    vand.v          vr7,      vr7,      vr8               // lanes where q1 is also corrected
    vsllwil.hu.bu   vr3,      vr14,     0                 // q1 low
    vexth.hu.bu     vr4,      vr14                        // q1 high

    vsetnez.v       $fcc0,    vr7
    bceqz           $fcc0,    .END_V_LESS_BETA_SEC
    vsllwil.hu.bu   vr11,     vr15,     0                 // q2 low
    vexth.hu.bu     vr15,     vr15                        // q2 high
    AVC_LPF_P1_OR_Q1 vr20, vr22, vr3, vr11, vr9, vr18, vr24, vr0, vr26
    AVC_LPF_P1_OR_Q1 vr21, vr23, vr4, vr15, vr2, vr19, vr25, vr0, vr26
    vpickev.b       vr24,     vr25,     vr24
    vbitsel.v       vr24,     vr14,     vr24,     vr7     // new q1 in selected lanes
    vstx            vr24,     a0,       a1                // write q1
    vandi.b         vr7,      vr7,      1
    vadd.b          vr1,      vr1,      vr7               // tc += 1 in those lanes
.END_V_LESS_BETA_SEC:
    vneg.b          vr0,      vr1                         // -tc (updated)
    vsllwil.h.b     vr9,      vr0,      0                 // -tc low
    vexth.h.b       vr2,      vr0                         // -tc high
    vsllwil.hu.bu   vr18,     vr1,      0                 // tc low
    vexth.hu.bu     vr19,     vr1                         // tc high
    AVC_LPF_P0Q0 vr22, vr20, vr16, vr3, vr9, vr18, vr11, vr15, vr0, vr26
    AVC_LPF_P0Q0 vr23, vr21, vr17, vr4, vr2, vr19, vr10, vr14, vr0, vr26
    vpickev.b       vr11,     vr10,     vr11              // filtered p0
    vpickev.b       vr15,     vr14,     vr15              // filtered q0
    vbitsel.v       vr11,     vr12,     vr11,     vr8     // p0
    vbitsel.v       vr15,     vr13,     vr15,     vr8     // q0
    vstx            vr11,     t2,       t0                // write p0
    vst             vr15,     a0,       0                 // write q0
.END_V_LUMA_8:
    fld.d           f24,      sp,       0
    fld.d           f25,      sp,       8
    fld.d           f26,      sp,       16
    addi.d          sp,       sp,       32
endfunc
|
|
|
|
/*
 * Byte-shuffle indices used by the chroma filters with vshuf.b: each of the
 * four source bytes (0..3) is selected twice in a row, and the resulting
 * 8-byte pattern appears in both halves of the vector.
 */
const chroma_shuf
    .byte   0, 0, 1, 1, 2, 2, 3, 3
    .byte   0, 0, 1, 1, 2, 2, 3, 3
endconst
|
|
|
|
/*
 * Chroma loop filter across a vertical edge (8 rows), LSX.
 *
 * Register use as seen in this function:
 *   a0 = address of the first sample to the right of the edge (q0 of row 0)
 *   a1 = row stride in bytes
 *   a2 = alpha threshold, a3 = beta threshold (replicated as bytes)
 *   a4 = address of four tc bytes; each one is applied to two rows
 *
 * Eight rows are loaded starting at a0 - 2 and transposed into p1/p0/q0/q1.
 * Each of those vectors holds the same 8 samples in both halves, and the
 * filter works on the upper half.  Two bytes (p0, q0) are written back per
 * row at a0 - 1.
 */
function ff_h264_h_lpf_chroma_8_lsx
    slli.d          t0,       a1,       1                 // stride * 2
    slli.d          t1,       a1,       2                 // stride * 4
    la.local        t4,       chroma_shuf
    add.d           t2,       t0,       a1                // stride * 3
    vldrepl.w       vr0,      a4,       0                 // the 4 tc bytes, replicated
    vld             vr1,      t4,       0                 // shuffle indices
    vshuf.b         vr1,      vr0,      vr0,      vr1     // tc: each byte repeated twice
    vslti.b         vr2,      vr1,      0                 // 0xff where tc < 0
    vxori.b         vr2,      vr2,      255               // 0xff where tc >= 0
    vandi.b         vr2,      vr2,      1                 // 1 where tc >= 0
    vsetnez.v       $fcc0,    vr2
    bceqz           $fcc0,    .END_CHROMA_8               // every tc negative: nothing to do
    vldi            vr0,      0                           // zero
    addi.d          t4,       a0,       -2                // row 0, starting at p1
    vslt.bu         vr3,      vr0,      vr2               // lane mask: tc >= 0
    add.d           t5,       t4,       t1                // row 4
    vld             vr4,      t4,       0                 // row 0
    vldx            vr5,      t4,       a1                // row 1
    vldx            vr6,      t4,       t0                // row 2
    vldx            vr7,      t4,       t2                // row 3
    vld             vr8,      t5,       0                 // row 4
    vldx            vr9,      t5,       a1                // row 5
    vldx            vr10,     t5,       t0                // row 6
    vldx            vr11,     t5,       t2                // row 7
    // Transpose the leading 4 bytes of the 8 rows.
    vilvl.b         vr12,     vr6,      vr4
    vilvl.b         vr13,     vr7,      vr5
    vilvl.b         vr14,     vr10,     vr8
    vilvl.b         vr15,     vr11,     vr9
    vilvl.b         vr4,      vr13,     vr12
    vilvl.b         vr5,      vr15,     vr14
    vilvl.w         vr6,      vr5,      vr4
    vilvh.w         vr7,      vr5,      vr4
    vilvl.d         vr12,     vr6,      vr6               // p1 (duplicated in both halves)
    vilvh.d         vr13,     vr6,      vr6               // p0
    vilvl.d         vr14,     vr7,      vr7               // q0
    vilvh.d         vr15,     vr7,      vr7               // q1

    vabsd.bu        vr20,     vr13,     vr14              // |p0 - q0|
    vabsd.bu        vr21,     vr12,     vr13              // |p1 - p0|
    vabsd.bu        vr22,     vr15,     vr14              // |q1 - q0|

    vreplgr2vr.b    vr4,      a2                          // alpha
    vreplgr2vr.b    vr5,      a3                          // beta

    vslt.bu         vr6,      vr20,     vr4               // |p0 - q0| < alpha
    vslt.bu         vr7,      vr21,     vr5               // |p1 - p0| < beta
    vand.v          vr8,      vr6,      vr7
    vslt.bu         vr7,      vr22,     vr5               // |q1 - q0| < beta
    vand.v          vr8,      vr7,      vr8
    vand.v          vr8,      vr8,      vr3               // vr8 = lanes that get filtered
    vsetnez.v       $fcc0,    vr8
    bceqz           $fcc0,    .END_CHROMA_8

    vneg.b          vr9,      vr1                         // -tc (bytes)
    vexth.hu.bu     vr3,      vr12                        // p1 widened
    vexth.hu.bu     vr4,      vr13                        // p0 widened
    vexth.hu.bu     vr5,      vr14                        // q0 widened
    vexth.hu.bu     vr6,      vr15                        // q1 widened

    vexth.hu.bu     vr18,     vr1                         // tc widened
    vexth.h.b       vr2,      vr9                         // -tc widened

    AVC_LPF_P0Q0 vr5, vr4, vr3, vr6, vr2, vr18, vr10, vr11, vr16, vr17
    vpickev.b       vr10,     vr10,     vr10              // filtered p0 as bytes
    vpickev.b       vr11,     vr11,     vr11              // filtered q0 as bytes
    vbitsel.v       vr13,     vr13,     vr10,     vr8
    vbitsel.v       vr14,     vr14,     vr11,     vr8
    vilvl.b         vr15,     vr14,     vr13              // (p0, q0) byte pairs, one per row
    addi.d          t4,       t4,       1                 // a0 - 1: address of p0 in row 0
    add.d           t5,       t4,       a1
    add.d           t6,       t4,       t0
    add.d           t7,       t4,       t2
    vstelm.h        vr15,     t4,       0,        0       // row 0
    vstelm.h        vr15,     t5,       0,        1       // row 1
    vstelm.h        vr15,     t6,       0,        2       // row 2
    vstelm.h        vr15,     t7,       0,        3       // row 3
    add.d           t4,       t4,       t1                // advance 4 rows
    add.d           t5,       t4,       a1
    add.d           t6,       t4,       t0
    add.d           t7,       t4,       t2
    vstelm.h        vr15,     t4,       0,        4       // row 4
    vstelm.h        vr15,     t5,       0,        5       // row 5
    vstelm.h        vr15,     t6,       0,        6       // row 6
    vstelm.h        vr15,     t7,       0,        7       // row 7
.END_CHROMA_8:
endfunc
|
|
|
|
/*
 * Chroma loop filter across a horizontal edge (8 columns), LSX.
 *
 * Register use as seen in this function:
 *   a0 = address of the first row below the edge (q0)
 *   a1 = row stride in bytes
 *   a2 = alpha threshold, a3 = beta threshold (replicated as bytes)
 *   a4 = address of four tc bytes; each one is applied to two columns
 *
 * Rows p1, p0, q0, q1 are loaded as 16-byte vectors but only the low 8
 * bytes are filtered; 8 bytes of p0 and q0 are written back.
 */
function ff_h264_v_lpf_chroma_8_lsx
    slli.d          t0,       a1,       1                 // stride * 2
    la.local        t4,       chroma_shuf
    vldrepl.w       vr0,      a4,       0                 // the 4 tc bytes, replicated
    vld             vr1,      t4,       0                 // shuffle indices
    vshuf.b         vr1,      vr0,      vr0,      vr1     // tc: each byte repeated twice
    vslti.b         vr2,      vr1,      0                 // 0xff where tc < 0
    vxori.b         vr2,      vr2,      255               // 0xff where tc >= 0
    vandi.b         vr2,      vr2,      1                 // 1 where tc >= 0
    vsetnez.v       $fcc0,    vr2
    bceqz           $fcc0,    .END_CHROMA_V_8             // every tc negative: nothing to do
    vldi            vr0,      0                           // zero
    sub.d           t4,       a0,       t0                // address of row p1
    vslt.bu         vr3,      vr0,      vr2               // lane mask: tc >= 0
    vld             vr12,     t4,       0                 // p1
    vldx            vr13,     t4,       a1                // p0
    vld             vr14,     a0,       0                 // q0
    vldx            vr15,     a0,       a1                // q1

    vabsd.bu        vr20,     vr13,     vr14              // |p0 - q0|
    vabsd.bu        vr21,     vr12,     vr13              // |p1 - p0|
    vabsd.bu        vr22,     vr15,     vr14              // |q1 - q0|

    vreplgr2vr.b    vr4,      a2                          // alpha
    vreplgr2vr.b    vr5,      a3                          // beta

    vslt.bu         vr6,      vr20,     vr4               // |p0 - q0| < alpha
    vslt.bu         vr7,      vr21,     vr5               // |p1 - p0| < beta
    vand.v          vr8,      vr6,      vr7
    vslt.bu         vr7,      vr22,     vr5               // |q1 - q0| < beta
    vand.v          vr8,      vr7,      vr8
    vand.v          vr8,      vr8,      vr3               // vr8 = lanes that get filtered
    vsetnez.v       $fcc0,    vr8
    bceqz           $fcc0,    .END_CHROMA_V_8

    vneg.b          vr9,      vr1                         // -tc (bytes)
    vsllwil.hu.bu   vr3,      vr12,     0                 // p1 widened (low 8 bytes)
    vsllwil.hu.bu   vr4,      vr13,     0                 // p0 widened
    vsllwil.hu.bu   vr5,      vr14,     0                 // q0 widened
    vsllwil.hu.bu   vr6,      vr15,     0                 // q1 widened

    // chroma_shuf repeats its pattern in both halves, so the high half of
    // the tc vector carries the same values as the low half.
    vexth.hu.bu     vr18,     vr1                         // tc widened
    vexth.h.b       vr2,      vr9                         // -tc widened

    AVC_LPF_P0Q0 vr5, vr4, vr3, vr6, vr2, vr18, vr10, vr11, vr16, vr17
    vpickev.b       vr10,     vr10,     vr10              // filtered p0 as bytes
    vpickev.b       vr11,     vr11,     vr11              // filtered q0 as bytes
    vbitsel.v       vr10,     vr13,     vr10,     vr8
    vbitsel.v       vr11,     vr14,     vr11,     vr8
    fstx.d          f10,      t4,       a1                // write 8 bytes of p0
    fst.d           f11,      a0,       0                 // write 8 bytes of q0
.END_CHROMA_V_8:
endfunc
|
|
|
|
/*
 * Strong (intra) filter for the three samples on one side of the edge,
 * computed on 16-bit lanes.  With s = _in1 + _in2 + _in3:
 *   _out0 = (2 * s + _in4 + _in5 + rnd) >> _const3
 *   _out1 = (s + _in4 + 2) >> 2
 *   _out2 = (2 * _in0 + _const3 * _in4 + s + rnd) >> _const3
 * where ">> _const3" is the rounding shift vsrar.h.
 * _const3 is used both as a multiplier and as a shift count; callers in
 * this file load it with "vldi ..., 0x403".
 * For the p side callers pass _in0 = p3, _in1 = p0, _in2 = q0, _in3 = p1,
 * _in4 = p2, _in5 = q1 (mirrored for the q side).  _tmp0 is clobbered.
 */
.macro AVC_LPF_P0P1P2_OR_Q0Q1Q2 _in0, _in1, _in2, _in3, _in4, _in5, \
                                _out0, _out1, _out2, _tmp0, _const3
    vadd.h          \_tmp0,   \_in1,    \_in2
    vadd.h          \_tmp0,   \_tmp0,   \_in3             // s
    vslli.h         \_out2,   \_in0,    1                 // 2 * _in0
    vslli.h         \_out0,   \_tmp0,   1                 // 2 * s
    vadd.h          \_out0,   \_out0,   \_in4
    vadd.h          \_out1,   \_in4,    \_tmp0            // s + _in4
    vadd.h          \_out0,   \_out0,   \_in5
    vmadd.h         \_out2,   \_in4,    \_const3          // += _const3 * _in4
    vsrar.h         \_out0,   \_out0,   \_const3
    vadd.h          \_out2,   \_out2,   \_tmp0
    vsrari.h        \_out1,   \_out1,   2
    vsrar.h         \_out2,   \_out2,   \_const3
.endm
|
|
|
|
/*
 * Weak intra filter for the sample next to the edge, on 16-bit lanes:
 *   _out0 = (_in0 + _in1 + 2 * _in2 + 2) >> 2
 * Callers pass (p0, q1, p1) for the p side and (q0, p1, q1) for the q side.
 * _tmp0 is clobbered.
 */
.macro AVC_LPF_P0_OR_Q0 _in0, _in1, _in2, _out0, _tmp0
    vslli.h         \_tmp0,   \_in2,    1                 // 2 * _in2
    vadd.h          \_out0,   \_in0,    \_in1
    vadd.h          \_out0,   \_out0,   \_tmp0
    vsrari.h        \_out0,   \_out0,   2                 // rounding shift
.endm
|
|
|
|
//LSX optimization is sufficient for this function.
|
|
/*
 * Intra luma loop filter across a vertical edge (16 rows), LSX.
 *
 * Register use as seen in this function:
 *   a0 = address of the first sample to the right of the edge (q0 of row 0)
 *   a1 = row stride in bytes
 *   a2 = alpha threshold, a3 = beta threshold (replicated as bytes)
 *
 * 16 rows are loaded starting at a0 - 4, transposed into p3..q3, filtered
 * with the strong or weak intra filter per lane, transposed back and stored
 * as 8 bytes per row.  vr24..vr31 are used, hence SAVE_REG / RESTORE_REG.
 */
function ff_h264_h_lpf_luma_intra_8_lsx
    slli.d          t0,       a1,       1                 // stride * 2
    slli.d          t1,       a1,       2                 // stride * 4
    addi.d          t4,       a0,       -4                // row 0, starting at p3
    SAVE_REG
    add.d           t2,       t0,       a1                // stride * 3
    add.d           t5,       t4,       t1                // row 4
    vld             vr0,      t4,       0                 // row 0
    vldx            vr1,      t4,       a1                // row 1
    vldx            vr2,      t4,       t0                // row 2
    vldx            vr3,      t4,       t2                // row 3
    add.d           t6,       t5,       t1                // row 8
    vld             vr4,      t5,       0                 // row 4
    vldx            vr5,      t5,       a1                // row 5
    vldx            vr6,      t5,       t0                // row 6
    vldx            vr7,      t5,       t2                // row 7
    add.d           t7,       t6,       t1                // row 12
    vld             vr8,      t6,       0                 // row 8
    vldx            vr9,      t6,       a1                // row 9
    vldx            vr10,     t6,       t0                // row 10
    vldx            vr11,     t6,       t2                // row 11
    vld             vr12,     t7,       0                 // row 12
    vldx            vr13,     t7,       a1                // row 13
    vldx            vr14,     t7,       t0                // row 14
    vldx            vr15,     t7,       t2                // row 15
    LSX_TRANSPOSE16X8_B vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,          \
                        vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15,    \
                        vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,          \
                        vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    // After the transpose:
    //   vr0 = p3, vr1 = p2, vr2 = p1, vr3 = p0
    //   vr4 = q0, vr5 = q1, vr6 = q2, vr7 = q3

    vreplgr2vr.b    vr16,     a2                          // alpha
    vreplgr2vr.b    vr17,     a3                          // beta
    vabsd.bu        vr10,     vr3,      vr4               // |p0 - q0|
    vabsd.bu        vr11,     vr2,      vr3               // |p1 - p0|
    vabsd.bu        vr12,     vr5,      vr4               // |q1 - q0|

    vslt.bu         vr8,      vr10,     vr16              // |p0 - q0| < alpha
    vslt.bu         vr9,      vr11,     vr17              // |p1 - p0| < beta
    vand.v          vr18,     vr8,      vr9
    vslt.bu         vr9,      vr12,     vr17              // |q1 - q0| < beta
    vand.v          vr18,     vr18,     vr9               // vr18 = lanes that get filtered

    vsetnez.v       $fcc0,    vr18
    bceqz           $fcc0,    .END_H_INTRA_8
    vsrli.b         vr16,     vr16,     2                 // alpha >> 2
    vaddi.bu        vr16,     vr16,     2                 // (alpha >> 2) + 2
    vslt.bu         vr16,     vr10,     vr16              // |p0 - q0| < (alpha >> 2) + 2
    vsllwil.hu.bu   vr10,     vr2,      0                 // p1 low
    vexth.hu.bu     vr11,     vr2                         // p1 high
    vsllwil.hu.bu   vr12,     vr3,      0                 // p0 low
    vexth.hu.bu     vr13,     vr3                         // p0 high

    vsllwil.hu.bu   vr14,     vr4,      0                 // q0 low
    vexth.hu.bu     vr15,     vr4                         // q0 high
    vsllwil.hu.bu   vr19,     vr5,      0                 // q1 low
    vexth.hu.bu     vr20,     vr5                         // q1 high

    vabsd.bu        vr21,     vr1,      vr3               // |p2 - p0|
    vslt.bu         vr9,      vr21,     vr17              // < beta
    vand.v          vr9,      vr9,      vr16
    vxori.b         vr22,     vr9,      0xff              // complement: weak-filter lanes
    vand.v          vr9,      vr9,      vr18              // vr9  = strong-filter lanes (p side)
    vand.v          vr22,     vr22,     vr18              // vr22 = weak-filter lanes (p side)

    vsetnez.v       $fcc0,    vr9
    bceqz           $fcc0,    .END_H_INTRA_LESS_BETA
    vsllwil.hu.bu   vr23,     vr1,      0                 // p2 low
    vexth.hu.bu     vr24,     vr1                         // p2 high
    vsllwil.hu.bu   vr25,     vr0,      0                 // p3 low
    vexth.hu.bu     vr26,     vr0                         // p3 high
    vldi            vr27,     0x403                       // multiplier / shift operand for the macro

    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr25, vr12, vr14, vr10, vr23, vr19, vr28, vr29, vr30, vr31, vr27
    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr26, vr13, vr15, vr11, vr24, vr20, vr23, vr25, vr21, vr31, vr27
    vpickev.b       vr28,     vr23,     vr28              // strong p0
    vpickev.b       vr29,     vr25,     vr29              // strong p1
    vpickev.b       vr30,     vr21,     vr30              // strong p2
    vbitsel.v       vr3,      vr3,      vr28,     vr9
    vbitsel.v       vr2,      vr2,      vr29,     vr9
    vbitsel.v       vr1,      vr1,      vr30,     vr9
.END_H_INTRA_LESS_BETA:
    AVC_LPF_P0_OR_Q0 vr12, vr19, vr10, vr23, vr25
    AVC_LPF_P0_OR_Q0 vr13, vr20, vr11, vr24, vr25
    // vr23 / vr24: weak p0, low / high half
    vpickev.b       vr23,     vr24,     vr23
    vbitsel.v       vr3,      vr3,      vr23,     vr22

    vabsd.bu        vr21,     vr6,      vr4               // |q2 - q0|
    vslt.bu         vr9,      vr21,     vr17              // < beta
    vand.v          vr9,      vr9,      vr16
    vxori.b         vr22,     vr9,      0xff              // complement: weak-filter lanes
    vand.v          vr9,      vr9,      vr18              // vr9  = strong-filter lanes (q side)
    vand.v          vr22,     vr22,     vr18              // vr22 = weak-filter lanes (q side)

    vsetnez.v       $fcc0,    vr9
    bceqz           $fcc0,    .END_H_INTRA_LESS_BETA_SEC
    vsllwil.hu.bu   vr23,     vr6,      0                 // q2 low
    vexth.hu.bu     vr24,     vr6                         // q2 high
    vsllwil.hu.bu   vr25,     vr7,      0                 // q3 low
    vexth.hu.bu     vr26,     vr7                         // q3 high
    vldi            vr27,     0x403

    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr25, vr14, vr12, vr19, vr23, vr10, vr28, vr29, vr30, vr31, vr27
    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr26, vr15, vr13, vr20, vr24, vr11, vr23, vr25, vr21, vr31, vr27
    vpickev.b       vr28,     vr23,     vr28              // strong q0
    vpickev.b       vr29,     vr25,     vr29              // strong q1
    vpickev.b       vr30,     vr21,     vr30              // strong q2
    vbitsel.v       vr4,      vr4,      vr28,     vr9
    vbitsel.v       vr5,      vr5,      vr29,     vr9
    vbitsel.v       vr6,      vr6,      vr30,     vr9
.END_H_INTRA_LESS_BETA_SEC:
    AVC_LPF_P0_OR_Q0 vr14, vr10, vr19, vr23, vr25
    AVC_LPF_P0_OR_Q0 vr15, vr11, vr20, vr24, vr25
    vpickev.b       vr23,     vr24,     vr23              // weak q0
    vbitsel.v       vr4,      vr4,      vr23,     vr22

    // Transpose the eight column vectors (vr0..vr7) back into rows.
    vilvl.b         vr14,     vr2,      vr0
    vilvl.b         vr15,     vr6,      vr4
    vilvl.b         vr16,     vr3,      vr1
    vilvl.b         vr17,     vr7,      vr5

    vilvh.b         vr18,     vr2,      vr0
    vilvh.b         vr19,     vr6,      vr4
    vilvh.b         vr20,     vr3,      vr1
    vilvh.b         vr21,     vr7,      vr5

    vilvl.b         vr2,      vr16,     vr14
    vilvl.b         vr3,      vr17,     vr15
    vilvl.b         vr4,      vr20,     vr18
    vilvl.b         vr5,      vr21,     vr19

    vilvh.b         vr6,      vr16,     vr14
    vilvh.b         vr7,      vr17,     vr15
    vilvh.b         vr8,      vr20,     vr18
    vilvh.b         vr9,      vr21,     vr19

    vilvl.w         vr14,     vr3,      vr2               // rows 0, 1
    vilvh.w         vr15,     vr3,      vr2               // rows 2, 3
    vilvl.w         vr16,     vr7,      vr6               // rows 4, 5
    vilvh.w         vr17,     vr7,      vr6               // rows 6, 7

    vilvl.w         vr18,     vr5,      vr4               // rows 8, 9
    vilvh.w         vr19,     vr5,      vr4               // rows 10, 11
    vilvl.w         vr20,     vr9,      vr8               // rows 12, 13
    vilvh.w         vr21,     vr9,      vr8               // rows 14, 15

    // Move each vector's upper 8 bytes down so fst.d can store them.
    vbsrl.v         vr0,      vr14,     8
    vbsrl.v         vr1,      vr15,     8
    vbsrl.v         vr2,      vr16,     8
    vbsrl.v         vr3,      vr17,     8

    vbsrl.v         vr4,      vr18,     8
    vbsrl.v         vr5,      vr19,     8
    vbsrl.v         vr6,      vr20,     8
    vbsrl.v         vr7,      vr21,     8

    store_double f14, f0, f15, f1, t4, a1, t0, t2         // rows 0..3
    store_double f16, f2, f17, f3, t5, a1, t0, t2         // rows 4..7
    store_double f18, f4, f19, f5, t6, a1, t0, t2         // rows 8..11
    store_double f20, f6, f21, f7, t7, a1, t0, t2         // rows 12..15
.END_H_INTRA_8:
    RESTORE_REG
endfunc
|
|
|
|
//LSX optimization is sufficient for this function.
|
|
/*
 * Intra luma loop filter across a horizontal edge (16 columns), LSX.
 *
 * Register use as seen in this function:
 *   a0 = address of the first row below the edge (q0)
 *   a1 = row stride in bytes
 *   a2 = alpha threshold, a3 = beta threshold (replicated as bytes)
 *
 * Rows p3..p0 (above a0) and q0..q3 are read as 16-byte vectors; p2, p1,
 * p0, q0, q1 and q2 are written back where the masks allow.
 * vr24..vr31 are used, hence SAVE_REG / RESTORE_REG.
 */
function ff_h264_v_lpf_luma_intra_8_lsx
    slli.d          t0,       a1,       1                 // stride * 2
    add.d           t1,       t0,       a1                // stride * 3
    SAVE_REG
    sub.d           t4,       a0,       t1                // address of row p2

    vld             vr0,      a0,       0                 // q0
    vldx            vr1,      a0,       a1                // q1
    vldx            vr2,      t4,       a1                // p1
    vldx            vr3,      t4,       t0                // p0

    vreplgr2vr.b    vr4,      a2                          // alpha
    vreplgr2vr.b    vr5,      a3                          // beta

    vabsd.bu        vr6,      vr3,      vr0               // |p0 - q0|
    vabsd.bu        vr7,      vr2,      vr3               // |p1 - p0|
    vabsd.bu        vr8,      vr1,      vr0               // |q1 - q0|

    vslt.bu         vr9,      vr6,      vr4               // |p0 - q0| < alpha
    vslt.bu         vr10,     vr7,      vr5               // |p1 - p0| < beta
    vand.v          vr11,     vr9,      vr10
    vslt.bu         vr10,     vr8,      vr5               // |q1 - q0| < beta
    vand.v          vr11,     vr10,     vr11              // vr11 = lanes that get filtered

    vsetnez.v       $fcc0,    vr11
    bceqz           $fcc0,    .END_V_INTRA_8

    vld             vr12,     t4,       0                 // p2
    vldx            vr13,     a0,       t0                // q2
    vsrli.b         vr14,     vr4,      2                 // alpha >> 2
    vsllwil.hu.bu   vr15,     vr2,      0                 // p1 low
    vexth.hu.bu     vr16,     vr2                         // p1 high
    vaddi.bu        vr14,     vr14,     2                 // (alpha >> 2) + 2
    vsllwil.hu.bu   vr17,     vr3,      0                 // p0 low
    vexth.hu.bu     vr18,     vr3                         // p0 high
    vslt.bu         vr14,     vr6,      vr14              // |p0 - q0| < (alpha >> 2) + 2
    vsllwil.hu.bu   vr19,     vr0,      0                 // q0 low
    vexth.hu.bu     vr20,     vr0                         // q0 high
    vsllwil.hu.bu   vr21,     vr1,      0                 // q1 low
    vexth.hu.bu     vr22,     vr1                         // q1 high

    vabsd.bu        vr23,     vr12,     vr3               // |p2 - p0|
    vslt.bu         vr10,     vr23,     vr5               // < beta
    vand.v          vr10,     vr10,     vr14
    vxori.b         vr23,     vr10,     0xff              // complement: weak-filter lanes
    vand.v          vr10,     vr10,     vr11              // vr10 = strong-filter lanes (p side)
    vand.v          vr23,     vr23,     vr11              // vr23 = weak-filter lanes (p side)

    vsetnez.v       $fcc0,    vr10
    bceqz           $fcc0,    .END_V_INTRA_LESS_BETA
    sub.d           t5,       t4,       a1                // address of row p3
    vld             vr24,     t5,       0                 // p3
    vsllwil.hu.bu   vr26,     vr12,     0                 // p2 low
    vexth.hu.bu     vr27,     vr12                        // p2 high
    vsllwil.hu.bu   vr28,     vr24,     0                 // p3 low
    vexth.hu.bu     vr29,     vr24                        // p3 high
    vldi            vr4,      0x403                       // multiplier / shift operand (alpha no longer needed)

    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr28, vr17, vr19, vr15, vr26, vr21, vr25, vr30, vr31, vr24, vr4
    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr29, vr18, vr20, vr16, vr27, vr22, vr6, vr7, vr8, vr24, vr4

    vpickev.b       vr25,     vr6,      vr25              // strong p0
    vpickev.b       vr30,     vr7,      vr30              // strong p1
    vpickev.b       vr31,     vr8,      vr31              // strong p2

    vbitsel.v       vr3,      vr3,      vr25,     vr10
    vbitsel.v       vr2,      vr2,      vr30,     vr10
    vbitsel.v       vr12,     vr12,     vr31,     vr10

    vstx            vr2,      t4,       a1                // write p1
    vst             vr12,     t4,       0                 // write p2
.END_V_INTRA_LESS_BETA:
    AVC_LPF_P0_OR_Q0 vr17, vr21, vr15, vr24, vr30
    AVC_LPF_P0_OR_Q0 vr18, vr22, vr16, vr25, vr30
    vpickev.b       vr24,     vr25,     vr24              // weak p0
    vbitsel.v       vr3,      vr3,      vr24,     vr23
    vstx            vr3,      t4,       t0                // write p0

    vabsd.bu        vr23,     vr13,     vr0               // |q2 - q0|
    vslt.bu         vr10,     vr23,     vr5               // < beta
    vand.v          vr10,     vr10,     vr14
    vxori.b         vr23,     vr10,     0xff              // complement: weak-filter lanes
    vand.v          vr10,     vr10,     vr11              // vr10 = strong-filter lanes (q side)
    vand.v          vr23,     vr23,     vr11              // vr23 = weak-filter lanes (q side)

    vsetnez.v       $fcc0,    vr10
    bceqz           $fcc0,    .END_V_INTRA_LESS_BETA_SEC
    vldx            vr24,     a0,       t1                // q3

    vsllwil.hu.bu   vr26,     vr13,     0                 // q2 low
    vexth.hu.bu     vr27,     vr13                        // q2 high
    vsllwil.hu.bu   vr28,     vr24,     0                 // q3 low
    vexth.hu.bu     vr29,     vr24                        // q3 high
    vldi            vr4,      0x403

    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr28, vr19, vr17, vr21, vr26, vr15, vr25, vr30, vr31, vr24, vr4
    AVC_LPF_P0P1P2_OR_Q0Q1Q2 vr29, vr20, vr18, vr22, vr27, vr16, vr6, vr7, vr8, vr24, vr4

    vpickev.b       vr25,     vr6,      vr25              // strong q0
    vpickev.b       vr30,     vr7,      vr30              // strong q1
    vpickev.b       vr31,     vr8,      vr31              // strong q2

    vbitsel.v       vr0,      vr0,      vr25,     vr10
    vbitsel.v       vr1,      vr1,      vr30,     vr10
    vbitsel.v       vr13,     vr13,     vr31,     vr10
    vstx            vr1,      a0,       a1                // write q1
    vstx            vr13,     a0,       t0                // write q2
.END_V_INTRA_LESS_BETA_SEC:
    AVC_LPF_P0_OR_Q0 vr19, vr15, vr21, vr24, vr30
    AVC_LPF_P0_OR_Q0 vr20, vr16, vr22, vr25, vr30
    vpickev.b       vr24,     vr25,     vr24              // weak q0
    vbitsel.v       vr0,      vr0,      vr24,     vr23
    vst             vr0,      a0,       0                 // write q0
.END_V_INTRA_8:
    RESTORE_REG
endfunc
|
|
|
|
/*
 * Intra chroma loop filter across a vertical edge (8 rows), LSX.
 *
 * Register use as seen in this function:
 *   a0 = address of the first sample to the right of the edge (q0 of row 0)
 *   a1 = row stride in bytes
 *   a2 = alpha threshold, a3 = beta threshold (replicated as bytes)
 *
 * Four bytes per row are loaded from a0 - 2 and transposed into
 * p1/p0/q0/q1 (each duplicated in both vector halves).  Two bytes (p0, q0)
 * are written back per row at a0 - 1.
 *
 * When no lane passes the alpha/beta tests the function returns without
 * storing: the write-back would only rewrite the unmodified pixels.
 */
function ff_h264_h_lpf_chroma_intra_8_lsx
    addi.d          t4,       a0,       -2                // row 0, starting at p1
    slli.d          t0,       a1,       1                 // stride * 2
    slli.d          t2,       a1,       2                 // stride * 4
    add.d           t1,       t0,       a1                // stride * 3

    add.d           t5,       t4,       t2                // row 4
    fld.s           f0,       t4,       0                 // row 0
    fldx.s          f1,       t4,       a1                // row 1
    fldx.s          f2,       t4,       t0                // row 2
    fldx.s          f3,       t4,       t1                // row 3
    fld.s           f4,       t5,       0                 // row 4
    fldx.s          f5,       t5,       a1                // row 5
    fldx.s          f6,       t5,       t0                // row 6
    fldx.s          f7,       t5,       t1                // row 7

    // Transpose the 8 rows x 4 bytes.
    vilvl.b         vr8,      vr2,      vr0
    vilvl.b         vr9,      vr3,      vr1
    vilvl.b         vr10,     vr6,      vr4
    vilvl.b         vr11,     vr7,      vr5

    vilvl.b         vr0,      vr9,      vr8
    vilvl.b         vr1,      vr11,     vr10
    vilvl.w         vr2,      vr1,      vr0
    vilvh.w         vr3,      vr1,      vr0

    vilvl.d         vr8,      vr2,      vr2               // p1 (duplicated in both halves)
    vilvh.d         vr9,      vr2,      vr2               // p0
    vilvl.d         vr10,     vr3,      vr3               // q0
    vilvh.d         vr11,     vr3,      vr3               // q1

    vreplgr2vr.b    vr0,      a2                          // alpha
    vreplgr2vr.b    vr1,      a3                          // beta

    vabsd.bu        vr2,      vr9,      vr10              // |p0 - q0|
    vabsd.bu        vr3,      vr8,      vr9               // |p1 - p0|
    vabsd.bu        vr4,      vr11,     vr10              // |q1 - q0|

    vslt.bu         vr5,      vr2,      vr0               // |p0 - q0| < alpha
    vslt.bu         vr6,      vr3,      vr1               // |p1 - p0| < beta
    vand.v          vr7,      vr5,      vr6
    vslt.bu         vr6,      vr4,      vr1               // |q1 - q0| < beta
    vand.v          vr7,      vr7,      vr6               // vr7 = lanes that get filtered

    vsetnez.v       $fcc0,    vr7
    bceqz           $fcc0,    .END_H_CHROMA_INTRA_8       // nothing to filter, nothing to store

    vexth.hu.bu     vr12,     vr8                         // p1 widened
    vexth.hu.bu     vr13,     vr9                         // p0 widened
    vexth.hu.bu     vr14,     vr10                        // q0 widened
    vexth.hu.bu     vr15,     vr11                        // q1 widened

    AVC_LPF_P0_OR_Q0 vr13, vr15, vr12, vr16, vr18         // new p0
    AVC_LPF_P0_OR_Q0 vr14, vr12, vr15, vr17, vr18         // new q0

    vpickev.b       vr18,     vr16,     vr16
    vpickev.b       vr19,     vr17,     vr17
    vbitsel.v       vr9,      vr9,      vr18,     vr7
    vbitsel.v       vr10,     vr10,     vr19,     vr7

    vilvl.b         vr11,     vr10,     vr9               // (p0, q0) byte pairs, one per row
    addi.d          t4,       t4,       1                 // a0 - 1: address of p0 in row 0
    vstelm.h        vr11,     t4,       0,        0       // row 0
    add.d           t4,       t4,       a1
    vstelm.h        vr11,     t4,       0,        1       // row 1
    add.d           t4,       t4,       a1
    vstelm.h        vr11,     t4,       0,        2       // row 2
    add.d           t4,       t4,       a1
    vstelm.h        vr11,     t4,       0,        3       // row 3
    add.d           t4,       t4,       a1
    vstelm.h        vr11,     t4,       0,        4       // row 4
    add.d           t4,       t4,       a1
    vstelm.h        vr11,     t4,       0,        5       // row 5
    add.d           t4,       t4,       a1
    vstelm.h        vr11,     t4,       0,        6       // row 6
    add.d           t4,       t4,       a1
    vstelm.h        vr11,     t4,       0,        7       // row 7
.END_H_CHROMA_INTRA_8:
endfunc
|
|
|
|
function ff_h264_v_lpf_chroma_intra_8_lsx
    // Intra chroma deblocking across a horizontal edge.
    //   a0 = pointer to the q0 row (first row below the edge)
    //   a1 = row stride in bytes
    //   a2 = alpha threshold, a3 = beta threshold (replicated per byte)
    // Rows p1, p0 (above) and q0, q1 (below) are read; only the low
    // 8 bytes of p0 and q0 are written back.
    slli.d          t0, a1, 1                   // t0 = 2 * stride
    sub.d           t1, a0, t0                  // t1 -> p1 row
    sub.d           t2, a0, a1                  // t2 -> p0 row

    vreplgr2vr.b    vr0, a2                     // alpha in every byte
    vreplgr2vr.b    vr1, a3                     // beta in every byte

    vld             vr2, t1, 0                  // p1
    vldx            vr3, t1, a1                 // p0
    vld             vr4, a0, 0                  // q0
    vldx            vr5, a0, a1                 // q1

    vabsd.bu        vr6, vr3, vr4               // |p0 - q0|
    vabsd.bu        vr7, vr2, vr3               // |p1 - p0|
    vabsd.bu        vr8, vr5, vr4               // |q1 - q0|

    // Filter mask: |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
    vslt.bu         vr9, vr6, vr0
    vslt.bu         vr10, vr7, vr1
    vand.v          vr11, vr9, vr10
    vslt.bu         vr10, vr8, vr1
    vand.v          vr11, vr10, vr11

    // Nothing to do when the mask is entirely zero.
    vsetnez.v       $fcc0, vr11
    bceqz           $fcc0, .END_V_CHROMA_INTRA_8

    // Zero-extend the low 8 pixels of every row to 16 bits.
    vsllwil.hu.bu   vr6, vr2, 0                 // p1 (halfwords)
    vsllwil.hu.bu   vr8, vr3, 0                 // p0 (halfwords)
    vsllwil.hu.bu   vr13, vr4, 0                // q0 (halfwords)
    vsllwil.hu.bu   vr15, vr5, 0                // q1 (halfwords)

    // AVC_LPF_P0_OR_Q0 is defined outside this block; results land in
    // vr17 (new p0) and vr18 (new q0), vr23 is handed over as scratch.
    AVC_LPF_P0_OR_Q0 vr8, vr15, vr6, vr17, vr23
    AVC_LPF_P0_OR_Q0 vr13, vr6, vr15, vr18, vr23

    // Narrow back to bytes and keep the filtered value only where the
    // mask is set.
    vpickev.b       vr19, vr17, vr17
    vpickev.b       vr20, vr18, vr18
    vbitsel.v       vr3, vr3, vr19, vr11
    vbitsel.v       vr4, vr4, vr20, vr11

    vstelm.d        vr3, t2, 0, 0               // store 8 bytes of p0
    vstelm.d        vr4, a0, 0, 0               // store 8 bytes of q0
.END_V_CHROMA_INTRA_8:
endfunc
|
|
|
|
.macro biweight_calc _in0, _in1, _in2, _in3, _reg0, _reg1, _reg2, \
                     _out0, _out1, _out2, _out3
    // Four independent bi-weighted sums, interleaved for throughput.
    //   _in0.._in3   : byte-interleaved pixel pairs (unsigned bytes)
    //   _reg0        : 16-bit accumulator start value (offset)
    //   _reg1        : byte-interleaved weights (signed bytes)
    //   _reg2        : per-halfword right-shift amount
    //   _out0.._out3 : results, 8 unsigned bytes each
    // The outputs must not alias the inputs: they are overwritten first.
    vmov            \_out0, \_reg0
    vmov            \_out1, \_reg0
    vmov            \_out2, \_reg0
    vmov            \_out3, \_reg0

    // acc += even_byte(in) * even_byte(weight)
    vmaddwev.h.bu.b \_out0, \_in0, \_reg1
    vmaddwev.h.bu.b \_out1, \_in1, \_reg1
    vmaddwev.h.bu.b \_out2, \_in2, \_reg1
    vmaddwev.h.bu.b \_out3, \_in3, \_reg1

    // acc += odd_byte(in) * odd_byte(weight)
    vmaddwod.h.bu.b \_out0, \_in0, \_reg1
    vmaddwod.h.bu.b \_out1, \_in1, \_reg1
    vmaddwod.h.bu.b \_out2, \_in2, \_reg1
    vmaddwod.h.bu.b \_out3, \_in3, \_reg1

    // Arithmetic shift right, saturate to unsigned 8 bit and narrow.
    vssran.bu.h     \_out0, \_out0, \_reg2
    vssran.bu.h     \_out1, \_out1, \_reg2
    vssran.bu.h     \_out2, \_out2, \_reg2
    vssran.bu.h     \_out3, \_out3, \_reg2
.endm
|
|
|
|
.macro biweight_load_8
    // Fetch four 8-pixel rows from a1 (src) and a0 (dst) and interleave
    // them byte-wise for biweight_calc.
    //   in : a0, a1 = row pointers, a2/t0/t1 = stride * 1/2/3
    //   out: vr1, vr3, vr5, vr7 = interleaved rows 0..3
    //   clobbers vr0, vr2, vr10-vr13
    // load_double comes from loongson_asm.S; presumably it loads four
    // 8-byte rows at offsets 0, a2, t0 and t1.
    load_double     f0, f1, f2, f3, a1, a2, t0, t1
    load_double     f10, f11, f12, f13, a0, a2, t0, t1

    vilvl.d         vr0, vr1, vr0               // src rows 0 and 1
    vilvl.d         vr2, vr3, vr2               // src rows 2 and 3
    vilvl.d         vr10, vr11, vr10            // dst rows 0 and 1
    vilvl.d         vr12, vr13, vr12            // dst rows 2 and 3

    vilvl.b         vr1, vr10, vr0              // row 0, src/dst interleaved
    vilvh.b         vr3, vr10, vr0              // row 1
    vilvl.b         vr5, vr12, vr2              // row 2
    vilvh.b         vr7, vr12, vr2              // row 3
.endm
|
|
|
|
.macro biweight_8
    // Weight the four rows prepared by biweight_load_8 (vr1/vr3/vr5/vr7)
    // and write the 8-byte results back to the rows at a0.
    // Uses vr8 = offset, vr20 = weights, vr9 = shift.
    biweight_calc   vr1, vr3, vr5, vr7, vr8, vr20, vr9, vr0, vr2, vr4, vr6
    vilvl.d         vr0, vr2, vr0               // rows 0 and 1 packed
    vilvl.d         vr2, vr6, vr4               // rows 2 and 3 packed

    vbsrl.v         vr1, vr0, 8                 // row 1 into the low half
    vbsrl.v         vr3, vr2, 8                 // row 3 into the low half

    store_double    f0, f1, f2, f3, a0, a2, t0, t1
.endm
|
|
|
|
.macro biweight_load2_8
    // Load eight 8-pixel rows: rows 0..3 through biweight_load_8 (from
    // a1 / a0), rows 4..7 from t4 (src) and t5 (dst).
    //   out: vr1/vr3/vr5/vr7     = interleaved rows 0..3
    //        vr11/vr13/vr15/vr17 = interleaved rows 4..7
    biweight_load_8
    load_double     f0, f2, f4, f6, t4, a2, t0, t1
    load_double     f14, f15, f16, f17, t5, a2, t0, t1

    vilvl.d         vr0, vr2, vr0               // src rows 4 and 5
    vilvl.d         vr4, vr6, vr4               // src rows 6 and 7
    vilvl.d         vr14, vr15, vr14            // dst rows 4 and 5
    vilvl.d         vr16, vr17, vr16            // dst rows 6 and 7

    vilvl.b         vr11, vr14, vr0             // row 4, src/dst interleaved
    vilvh.b         vr13, vr14, vr0             // row 5
    vilvl.b         vr15, vr16, vr4             // row 6
    vilvh.b         vr17, vr16, vr4             // row 7
.endm
|
|
|
|
.macro biweight2_8
    // Weight and store the eight rows prepared by biweight_load2_8:
    // rows 0..3 go to a0 (via biweight_8), rows 4..7 go to t5.
    biweight_8
    biweight_calc   vr11, vr13, vr15, vr17, vr8, vr20, vr9, \
                    vr10, vr12, vr14, vr16
    vilvl.d         vr10, vr12, vr10            // rows 4 and 5 packed
    vilvl.d         vr12, vr16, vr14            // rows 6 and 7 packed

    vbsrl.v         vr11, vr10, 8               // row 5 into the low half
    vbsrl.v         vr13, vr12, 8               // row 7 into the low half

    store_double    f10, f11, f12, f13, t5, a2, t0, t1
.endm
|
|
|
|
.macro biweight_load_16
    // Load eight 16-pixel rows from a1 (src) and a0 (dst) and interleave
    // them byte-wise.
    //   in : a2/t0/t1 = stride * 1/2/3, t2 = stride * 4
    //   out: t4 = a1 + t2, t5 = a0 + t2 (pointers to row 4)
    //        rows 0..3: low halves vr18, vr19, vr21, vr22; high halves vr0-vr3
    //        rows 4..7: low halves vr10-vr13; high halves vr14-vr17
    add.d           t4, a1, t2
    vld             vr0, a1, 0
    vldx            vr1, a1, a2
    vldx            vr2, a1, t0
    vldx            vr3, a1, t1
    vld             vr4, t4, 0
    vldx            vr5, t4, a2
    vldx            vr6, t4, t0
    vldx            vr7, t4, t1

    add.d           t5, a0, t2
    vld             vr10, a0, 0
    vldx            vr11, a0, a2
    vldx            vr12, a0, t0
    vldx            vr13, a0, t1
    vld             vr14, t5, 0
    vldx            vr15, t5, a2
    vldx            vr16, t5, t0
    vldx            vr17, t5, t1

    // Rows 0..3: low halves first, then high halves (overwrites src).
    vilvl.b         vr18, vr10, vr0
    vilvl.b         vr19, vr11, vr1
    vilvl.b         vr21, vr12, vr2
    vilvl.b         vr22, vr13, vr3
    vilvh.b         vr0, vr10, vr0
    vilvh.b         vr1, vr11, vr1
    vilvh.b         vr2, vr12, vr2
    vilvh.b         vr3, vr13, vr3

    // Rows 4..7: low halves reuse vr10-vr13, high halves overwrite dst.
    vilvl.b         vr10, vr14, vr4
    vilvl.b         vr11, vr15, vr5
    vilvl.b         vr12, vr16, vr6
    vilvl.b         vr13, vr17, vr7
    vilvh.b         vr14, vr14, vr4
    vilvh.b         vr15, vr15, vr5
    vilvh.b         vr16, vr16, vr6
    vilvh.b         vr17, vr17, vr7
.endm
|
|
|
|
.macro biweight_16
    // Weight the eight 16-pixel rows prepared by biweight_load_16 and
    // store them: rows 0..3 at a0, rows 4..7 at t5.
    // Every call writes into registers freed by an earlier step, so the
    // order of the four biweight_calc invocations matters.
    biweight_calc   vr18, vr19, vr21, vr22, vr8, vr20, vr9, vr4, vr5, vr6, vr7
    biweight_calc   vr0, vr1, vr2, vr3, vr8, vr20, vr9, vr18, vr19, vr21, vr22
    biweight_calc   vr10, vr11, vr12, vr13, vr8, vr20, vr9, vr0, vr1, vr2, vr3
    biweight_calc   vr14, vr15, vr16, vr17, vr8, vr20, vr9, vr10, vr11, vr12, vr13

    // Join the low and high 8-byte halves of every row.
    vilvl.d         vr4, vr18, vr4              // row 0
    vilvl.d         vr5, vr19, vr5              // row 1
    vilvl.d         vr6, vr21, vr6              // row 2
    vilvl.d         vr7, vr22, vr7              // row 3
    vilvl.d         vr0, vr10, vr0              // row 4
    vilvl.d         vr1, vr11, vr1              // row 5
    vilvl.d         vr2, vr12, vr2              // row 6
    vilvl.d         vr3, vr13, vr3              // row 7

    vst             vr4, a0, 0
    vstx            vr5, a0, a2
    vstx            vr6, a0, t0
    vstx            vr7, a0, t1
    vst             vr0, t5, 0
    vstx            vr1, t5, a2
    vstx            vr2, t5, t0
    vstx            vr3, t5, t1
.endm
|
|
|
|
.macro biweight_func w
// Opens ff_biweight_h264_pixels<w>_8_lsx and emits the common prologue.
// The function body and the closing endfunc follow the macro invocation.
//   a0 = dst rows, a1 = src rows, a2 = stride, a3 = row count,
//   a4 = shift (denom), a5/a6 = the two byte weights, a7 = offset
// Leaves: t0/t1/t2 = stride * 2/3/4, vr8 = offset, vr9 = shift,
//         vr20 = interleaved weights.
function ff_biweight_h264_pixels\w\()_8_lsx
    slli.d          t0, a2, 1                   // t0 = 2 * stride
    add.d           t1, t0, a2                  // t1 = 3 * stride
    slli.d          t2, a2, 2                   // t2 = 4 * stride

    // offset = ((offset + 1) | 1) << denom, then shift by denom + 1
    addi.d          a7, a7, 1
    ori             a7, a7, 1
    sll.d           a7, a7, a4
    addi.d          a4, a4, 1

    vreplgr2vr.b    vr0, a6                     // weight taken from a6
    vreplgr2vr.b    vr1, a5                     // weight taken from a5
    vreplgr2vr.h    vr8, a7                     // offset
    vreplgr2vr.h    vr9, a4                     // shift amount
    vilvh.b         vr20, vr1, vr0              // a6/a5 weights, byte-interleaved
.endm
|
|
|
|
biweight_func 8
    // 8-pixel-wide bi-weighted prediction. Four rows are always
    // processed; more follow when a3 (row count) is at least 8, and
    // again when it is at least 16.
    addi.d          t3, zero, 8
    biweight_load_8
    biweight_8                                  // rows 0..3
    blt             a3, t3, .END_BIWEIGHT_H264_PIXELS8

    addi.d          t3, zero, 16
    add.d           a1, a1, t2                  // advance src by four rows
    add.d           a0, a0, t2                  // advance dst by four rows
    biweight_load_8
    biweight_8                                  // rows 4..7
    blt             a3, t3, .END_BIWEIGHT_H264_PIXELS8

    add.d           a1, a1, t2                  // src row 8
    add.d           a0, a0, t2                  // dst row 8
    add.d           t4, a1, t2                  // src row 12
    add.d           t5, a0, t2                  // dst row 12
    biweight_load2_8
    biweight2_8                                 // rows 8..15
.END_BIWEIGHT_H264_PIXELS8:
endfunc
|
|
|
|
biweight_func 16
    // 16-pixel-wide bi-weighted prediction. Eight rows are always
    // processed; a second batch of eight only when a3 equals 16.
    addi.d          t6, zero, 16
    biweight_load_16
    biweight_16                                 // rows 0..7

    bne             a3, t6, .END_BIWEIGHT_PIXELS16
    // t4 / t5 were left pointing at row 4 by biweight_load_16.
    add.d           a1, t4, t2                  // src row 8
    add.d           a0, t5, t2                  // dst row 8
    biweight_load_16
    biweight_16                                 // rows 8..15
.END_BIWEIGHT_PIXELS16:
endfunc
|
|
|
|
.macro biweight_calc_4 _in0, _out0
    // Single-register bi-weight step with the fixed operands set up by
    // biweight_func: vr8 = offset, vr20 = weights, vr9 = shift.
    // _out0 must differ from _in0 because it is overwritten first.
    vmov            \_out0, vr8                 // acc = offset
    vmaddwev.h.bu.b \_out0, \_in0, vr20         // acc += even bytes * weights
    vmaddwod.h.bu.b \_out0, \_in0, vr20         // acc += odd bytes * weights
    vssran.bu.h     \_out0, \_out0, vr9         // shift, saturate, narrow
.endm
|
|
|
|
// An LSX implementation is sufficient for this function.
biweight_func 4
    // 4-pixel-wide bi-weighted prediction. Rows 0..1 are always
    // processed, rows 2..3 when a3 >= 4, rows 4..7 when a3 >= 8.
    addi.d          t3, zero, 4

    // ---- rows 0 and 1 ----
    fld.s           f0, a1, 0
    fldx.s          f1, a1, a2
    fld.s           f10, a0, 0
    fldx.s          f11, a0, a2
    vilvl.w         vr2, vr1, vr0               // src rows 0 and 1
    vilvl.w         vr12, vr11, vr10            // dst rows 0 and 1
    vilvl.b         vr0, vr12, vr2              // src/dst interleaved

    biweight_calc_4 vr0, vr1
    vbsrl.v         vr2, vr1, 4                 // row 1 into the low word
    fst.s           f1, a0, 0
    fstx.s          f2, a0, a2

    blt             a3, t3, .END_BIWEIGHT_H264_PIXELS4

    // ---- rows 2 and 3 ----
    addi.d          t3, zero, 8
    fldx.s          f0, a1, t0
    fldx.s          f1, a1, t1
    fldx.s          f10, a0, t0
    fldx.s          f11, a0, t1
    vilvl.w         vr2, vr1, vr0               // src rows 2 and 3
    vilvl.w         vr12, vr11, vr10            // dst rows 2 and 3
    vilvl.b         vr0, vr12, vr2              // src/dst interleaved

    biweight_calc_4 vr0, vr1
    vbsrl.v         vr2, vr1, 4                 // row 3 into the low word
    fstx.s          f1, a0, t0
    fstx.s          f2, a0, t1
    blt             a3, t3, .END_BIWEIGHT_H264_PIXELS4

    // ---- rows 4 to 7 ----
    add.d           a1, a1, t2
    add.d           a0, a0, t2
    fld.s           f0, a1, 0
    fldx.s          f1, a1, a2
    fldx.s          f2, a1, t0
    fldx.s          f3, a1, t1
    fld.s           f10, a0, 0
    fldx.s          f11, a0, a2
    fldx.s          f12, a0, t0
    fldx.s          f13, a0, t1
    vilvl.w         vr4, vr1, vr0               // src rows 4 and 5
    vilvl.w         vr5, vr3, vr2               // src rows 6 and 7
    vilvl.w         vr14, vr11, vr10            // dst rows 4 and 5
    vilvl.w         vr15, vr13, vr12            // dst rows 6 and 7

    vilvl.b         vr0, vr14, vr4              // rows 4/5 interleaved
    vilvl.b         vr10, vr15, vr5             // rows 6/7 interleaved

    vmov            vr1, vr8
    vmov            vr11, vr8
    vmaddwev.h.bu.b vr1, vr0, vr20
    vmaddwev.h.bu.b vr11, vr10, vr20
    vmaddwod.h.bu.b vr1, vr0, vr20
    vmaddwod.h.bu.b vr11, vr10, vr20

    vssran.bu.h     vr0, vr1, vr9               // rows 4 and 5
    vssran.bu.h     vr10, vr11, vr9             // rows 6 and 7
    vbsrl.v         vr2, vr0, 4                 // row 5 into the low word
    vbsrl.v         vr12, vr10, 4               // row 7 into the low word

    fst.s           f0, a0, 0
    fstx.s          f2, a0, a2
    fstx.s          f10, a0, t0
    fstx.s          f12, a0, t1
.END_BIWEIGHT_H264_PIXELS4:
endfunc
|
|
|
|
.macro biweight_func_lasx w
// Opens ff_biweight_h264_pixels<w>_8_lasx and emits the common prologue
// (256-bit counterpart of biweight_func). The body and the closing
// endfunc follow the macro invocation.
//   a0 = dst rows, a1 = src rows, a2 = stride, a3 = row count,
//   a4 = shift (denom), a5/a6 = the two byte weights, a7 = offset
// Leaves: t0/t1/t2 = stride * 2/3/4, xr8 = offset, xr9 = shift,
//         xr20 = interleaved weights.
function ff_biweight_h264_pixels\w\()_8_lasx
    slli.d          t0, a2, 1                   // t0 = 2 * stride
    add.d           t1, t0, a2                  // t1 = 3 * stride
    slli.d          t2, a2, 2                   // t2 = 4 * stride

    // offset = ((offset + 1) | 1) << denom, then shift by denom + 1
    addi.d          a7, a7, 1
    ori             a7, a7, 1
    sll.d           a7, a7, a4
    addi.d          a4, a4, 1

    xvreplgr2vr.b   xr0, a6                     // weight taken from a6
    xvreplgr2vr.b   xr1, a5                     // weight taken from a5
    xvreplgr2vr.h   xr8, a7                     // offset
    xvreplgr2vr.h   xr9, a4                     // shift amount
    xvilvh.b        xr20, xr1, xr0              // a6/a5 weights, byte-interleaved
.endm
|
|
|
|
.macro biweight_calc_lasx _in0, _in1, _reg0, _reg1, _reg2, _out0, _out1
    // Two independent 256-bit bi-weighted sums.
    //   _in0/_in1   : byte-interleaved pixel pairs (unsigned bytes)
    //   _reg0       : 16-bit accumulator start value (offset)
    //   _reg1       : byte-interleaved weights (signed bytes)
    //   _reg2       : per-halfword right-shift amount
    //   _out0/_out1 : narrowed unsigned-byte results
    // Outputs must not alias the inputs: they are overwritten first.
    xmov            \_out0, \_reg0
    xmov            \_out1, \_reg0
    xvmaddwev.h.bu.b \_out0, \_in0, \_reg1
    xvmaddwev.h.bu.b \_out1, \_in1, \_reg1
    xvmaddwod.h.bu.b \_out0, \_in0, \_reg1
    xvmaddwod.h.bu.b \_out1, \_in1, \_reg1

    // Arithmetic shift right, saturate to unsigned 8 bit and narrow.
    xvssran.bu.h    \_out0, \_out0, \_reg2
    xvssran.bu.h    \_out1, \_out1, \_reg2
.endm
|
|
|
|
.macro biweight_load_lasx_8
    // Load four 8-pixel rows from a1 (src) and a0 (dst), gather each set
    // into one 256-bit register and interleave src with dst.
    //   out: xr0 = low interleave, xr1 = high interleave
    //   clobbers vr2, vr3, vr10-vr13
    load_double     f0, f1, f2, f3, a1, a2, t0, t1
    load_double     f10, f11, f12, f13, a0, a2, t0, t1

    vilvl.d         vr0, vr1, vr0               // src rows 0 and 1
    vilvl.d         vr2, vr3, vr2               // src rows 2 and 3
    vilvl.d         vr10, vr11, vr10            // dst rows 0 and 1
    vilvl.d         vr12, vr13, vr12            // dst rows 2 and 3

    // Merge the two 128-bit row pairs into a single 256-bit register.
    xvpermi.q       xr2, xr0, 0x20              // all four src rows
    xvpermi.q       xr12, xr10, 0x20            // all four dst rows

    xvilvl.b        xr0, xr12, xr2
    xvilvh.b        xr1, xr12, xr2
.endm
|
|
|
|
.macro biweight_lasx_8
    // Weight the data prepared by biweight_load_lasx_8 (xr0/xr1) and
    // store four 8-byte rows at a0.
    biweight_calc_lasx xr0, xr1, xr8, xr20, xr9, xr2, xr3
    xvilvl.d        xr0, xr3, xr2               // join both results per lane
    xvpermi.d       xr2, xr0, 0x4E              // swap the 128-bit halves
    vbsrl.v         vr1, vr0, 8                 // upper 8 bytes of vr0
    vbsrl.v         vr3, vr2, 8                 // upper 8 bytes of vr2

    store_double    f0, f1, f2, f3, a0, a2, t0, t1
.endm
|
|
|
|
biweight_func_lasx 8
    // 8-pixel-wide bi-weighted prediction (LASX). Four rows are always
    // processed; more follow when a3 (row count) is at least 8, and
    // again when it is at least 16.
    addi.d          t3, zero, 8
    biweight_load_lasx_8
    biweight_lasx_8                             // rows 0..3
    blt             a3, t3, .END_BIWEIGHT_H264_PIXELS8_LASX

    addi.d          t3, zero, 16
    add.d           a1, a1, t2                  // advance src by four rows
    add.d           a0, a0, t2                  // advance dst by four rows
    biweight_load_lasx_8
    biweight_lasx_8                             // rows 4..7
    blt             a3, t3, .END_BIWEIGHT_H264_PIXELS8_LASX

    add.d           a1, a1, t2                  // src row 8
    add.d           a0, a0, t2                  // dst row 8
    add.d           t4, a1, t2                  // src row 12
    add.d           t5, a0, t2                  // dst row 12

    // Rows 8..11 end up in xr0/xr1; rows 12..15 are prepared the same
    // way by hand into xr10/xr11.
    biweight_load_lasx_8
    load_double     f4, f5, f6, f7, t4, a2, t0, t1
    load_double     f14, f15, f16, f17, t5, a2, t0, t1
    vilvl.d         vr4, vr5, vr4               // src rows 12 and 13
    vilvl.d         vr6, vr7, vr6               // src rows 14 and 15
    vilvl.d         vr14, vr15, vr14            // dst rows 12 and 13
    vilvl.d         vr16, vr17, vr16            // dst rows 14 and 15
    xvpermi.q       xr6, xr4, 0x20
    xvpermi.q       xr16, xr14, 0x20
    xvilvl.b        xr10, xr16, xr6
    xvilvh.b        xr11, xr16, xr6

    biweight_lasx_8                             // rows 8..11 -> a0
    biweight_calc_lasx xr10, xr11, xr8, xr20, xr9, xr12, xr13
    xvilvl.d        xr10, xr13, xr12
    xvpermi.d       xr12, xr10, 0x4E
    vbsrl.v         vr11, vr10, 8
    vbsrl.v         vr13, vr12, 8
    store_double    f10, f11, f12, f13, t5, a2, t0, t1  // rows 12..15 -> t5
.END_BIWEIGHT_H264_PIXELS8_LASX:
endfunc
|
|
|
|
.macro biweight_load_lasx_16
    // Load eight 16-pixel rows from a1 (src) and a0 (dst), pair
    // consecutive rows into 256-bit registers and interleave src with
    // dst byte-wise.
    //   in : a2/t0/t1 = stride * 1/2/3, t2 = stride * 4
    //   out: t4 = a1 + t2, t5 = a0 + t2 (pointers to row 4)
    //        xr0, xr2, xr4, xr6     = low interleaves
    //        xr10, xr12, xr14, xr16 = high interleaves
    add.d           t4, a1, t2
    vld             vr0, a1, 0
    vldx            vr1, a1, a2
    vldx            vr2, a1, t0
    vldx            vr3, a1, t1
    vld             vr4, t4, 0
    vldx            vr5, t4, a2
    vldx            vr6, t4, t0
    vldx            vr7, t4, t1

    add.d           t5, a0, t2
    vld             vr10, a0, 0
    vldx            vr11, a0, a2
    vldx            vr12, a0, t0
    vldx            vr13, a0, t1
    vld             vr14, t5, 0
    vldx            vr15, t5, a2
    vldx            vr16, t5, t0
    vldx            vr17, t5, t1

    // Pair up consecutive src rows ...
    xvpermi.q       xr1, xr0, 0x20
    xvpermi.q       xr3, xr2, 0x20
    xvpermi.q       xr5, xr4, 0x20
    xvpermi.q       xr7, xr6, 0x20

    // ... and consecutive dst rows.
    xvpermi.q       xr11, xr10, 0x20
    xvpermi.q       xr13, xr12, 0x20
    xvpermi.q       xr15, xr14, 0x20
    xvpermi.q       xr17, xr16, 0x20

    xvilvl.b        xr0, xr11, xr1              // vec0
    xvilvl.b        xr2, xr13, xr3              // vec2
    xvilvl.b        xr4, xr15, xr5              // vec4
    xvilvl.b        xr6, xr17, xr7              // vec6

    xvilvh.b        xr10, xr11, xr1             // vec1
    xvilvh.b        xr12, xr13, xr3             // vec3
    xvilvh.b        xr14, xr15, xr5             // vec5
    xvilvh.b        xr16, xr17, xr7             // vec7
.endm
|
|
|
|
.macro biweight_lasx_16
    // Weight the data prepared by biweight_load_lasx_16 and store eight
    // 16-byte rows: four at a0 and four at t5.
    biweight_calc_lasx xr0, xr2, xr8, xr20, xr9, xr1, xr3
    biweight_calc_lasx xr4, xr6, xr8, xr20, xr9, xr5, xr7
    biweight_calc_lasx xr10, xr12, xr8, xr20, xr9, xr11, xr13
    biweight_calc_lasx xr14, xr16, xr8, xr20, xr9, xr15, xr17

    // Join the results of the low and high interleaves in every lane.
    xvilvl.d        xr0, xr11, xr1
    xvilvl.d        xr2, xr13, xr3
    xvilvl.d        xr4, xr15, xr5
    xvilvl.d        xr6, xr17, xr7

    // Swap 128-bit halves so the second row of each pair can be stored
    // with a 128-bit store.
    xvpermi.d       xr1, xr0, 0x4E
    xvpermi.d       xr3, xr2, 0x4E
    xvpermi.d       xr5, xr4, 0x4E
    xvpermi.d       xr7, xr6, 0x4E

    vst             vr0, a0, 0
    vstx            vr1, a0, a2
    vstx            vr2, a0, t0
    vstx            vr3, a0, t1
    vst             vr4, t5, 0
    vstx            vr5, t5, a2
    vstx            vr6, t5, t0
    vstx            vr7, t5, t1
.endm
|
|
|
|
biweight_func_lasx 16
    // 16-pixel-wide bi-weighted prediction (LASX). Eight rows are always
    // processed; a second batch of eight only when a3 equals 16.
    addi.d          t6, zero, 16
    biweight_load_lasx_16
    biweight_lasx_16                            // rows 0..7
    bne             a3, t6, .END_BIWEIGHT_PIXELS16_LASX

    // t4 / t5 were left pointing at row 4 by biweight_load_lasx_16.
    add.d           a1, t4, t2                  // src row 8
    add.d           a0, t5, t2                  // dst row 8
    biweight_load_lasx_16
    biweight_lasx_16                            // rows 8..15
.END_BIWEIGHT_PIXELS16_LASX:
endfunc
|
|
|
|
.macro weight_func w
// Opens ff_weight_h264_pixels<w>_8_lsx and emits the common prologue.
// The function body and the closing endfunc follow the macro invocation.
//   a0 = pixel rows (updated in place), a1 = stride, a2 = row count,
//   a3 = log2_denom, a4 = weight, a5 = offset
// Leaves: t0/t1/t2 = stride * 2/3/4, vr20 = weight,
//         vr8 = offset << log2_denom, vr9 = log2_denom.
function ff_weight_h264_pixels\w\()_8_lsx
    slli.d          t0, a1, 1                   // t0 = 2 * stride
    add.d           t1, t0, a1                  // t1 = 3 * stride
    slli.d          t2, a1, 2                   // t2 = 4 * stride

    sll.d           a5, a5, a3                  // offset <<= log2_denom
    vreplgr2vr.h    vr20, a4                    // weight
    vreplgr2vr.h    vr8, a5                     // offset
    vreplgr2vr.h    vr9, a3                     // log2_denom
.endm
|
|
|
|
.macro weight_load_16
    // Load eight consecutive 16-byte rows starting at a0 into vr0-vr7.
    //   in : a1/t0/t1 = stride * 1/2/3, t2 = stride * 4
    //   out: t4 = a0 + t2 (pointer to row 4), vr0-vr7 = rows 0..7
    // Widening to 16 bit is left entirely to weight_extend_16, which
    // must follow this macro. The zero-extension that used to be done
    // here as well was repeated verbatim by weight_extend_16 and was
    // therefore redundant.
    add.d           t4, a0, t2
    vld             vr0, a0, 0
    vldx            vr1, a0, a1
    vldx            vr2, a0, t0
    vldx            vr3, a0, t1
    vld             vr4, t4, 0
    vldx            vr5, t4, a1
    vldx            vr6, t4, t0
    vldx            vr7, t4, t1
.endm
|
|
|
|
.macro weight_extend_16
    // Zero-extend the eight rows in vr0-vr7 from bytes to halfwords by
    // interleaving with vr23, which the caller must have set to zero.
    //   out: low halves  -> vr10-vr17 (rows 0..7)
    //        high halves -> vr18, vr19, vr21, vr22 (rows 0..3)
    //                       vr0-vr3 (rows 4..7, overwriting rows 0..3)
    vilvl.b         vr10, vr23, vr0
    vilvl.b         vr11, vr23, vr1
    vilvl.b         vr12, vr23, vr2
    vilvl.b         vr13, vr23, vr3
    vilvl.b         vr14, vr23, vr4
    vilvl.b         vr15, vr23, vr5
    vilvl.b         vr16, vr23, vr6
    vilvl.b         vr17, vr23, vr7

    // Rows 0..3 must be consumed before vr0-vr3 are reused below.
    vilvh.b         vr18, vr23, vr0
    vilvh.b         vr19, vr23, vr1
    vilvh.b         vr21, vr23, vr2
    vilvh.b         vr22, vr23, vr3
    vilvh.b         vr0, vr23, vr4
    vilvh.b         vr1, vr23, vr5
    vilvh.b         vr2, vr23, vr6
    vilvh.b         vr3, vr23, vr7
.endm
|
|
|
|
.macro weight_calc _in0, _in1, _in2, _in3, _reg0, _reg1, _reg2, \
                   _out0, _out1, _out2, _out3
    // Four independent weighted-prediction steps on 16-bit pixels:
    //   out = sat_u8(round_shift(sat_add(reg0, in * reg1), reg2))
    //   _in0.._in3   : 16-bit pixels, overwritten with the products
    //   _reg0        : offset, _reg1 : weight, _reg2 : shift amount
    //   _out0.._out3 : 8 unsigned bytes each
    vmul.h          \_in0, \_in0, \_reg1
    vmul.h          \_in1, \_in1, \_reg1
    vmul.h          \_in2, \_in2, \_reg1
    vmul.h          \_in3, \_in3, \_reg1

    // Signed saturating add of the offset.
    vsadd.h         \_out0, \_reg0, \_in0
    vsadd.h         \_out1, \_reg0, \_in1
    vsadd.h         \_out2, \_reg0, \_in2
    vsadd.h         \_out3, \_reg0, \_in3

    // Rounding shift right, saturate to unsigned 8 bit and narrow.
    vssrarn.bu.h    \_out0, \_out0, \_reg2
    vssrarn.bu.h    \_out1, \_out1, \_reg2
    vssrarn.bu.h    \_out2, \_out2, \_reg2
    vssrarn.bu.h    \_out3, \_out3, \_reg2
.endm
|
|
|
|
.macro weight_16
    // Weight the sixteen half-rows produced by weight_extend_16 and
    // store eight 16-byte rows: four at a0 and four at t4.
    weight_calc     vr10, vr11, vr12, vr13, vr8, vr20, vr9, vr10, vr11, vr12, vr13
    weight_calc     vr14, vr15, vr16, vr17, vr8, vr20, vr9, vr14, vr15, vr16, vr17
    weight_calc     vr18, vr19, vr21, vr22, vr8, vr20, vr9, vr4, vr5, vr6, vr7
    weight_calc     vr0, vr1, vr2, vr3, vr8, vr20, vr9, vr0, vr1, vr2, vr3

    // Join the low and high 8-byte halves of every row.
    vilvl.d         vr10, vr4, vr10             // row 0
    vilvl.d         vr11, vr5, vr11             // row 1
    vilvl.d         vr12, vr6, vr12             // row 2
    vilvl.d         vr13, vr7, vr13             // row 3
    vilvl.d         vr14, vr0, vr14             // row 4
    vilvl.d         vr15, vr1, vr15             // row 5
    vilvl.d         vr16, vr2, vr16             // row 6
    vilvl.d         vr17, vr3, vr17             // row 7

    vst             vr10, a0, 0
    vstx            vr11, a0, a1
    vstx            vr12, a0, t0
    vstx            vr13, a0, t1
    vst             vr14, t4, 0
    vstx            vr15, t4, a1
    vstx            vr16, t4, t0
    vstx            vr17, t4, t1
.endm
|
|
|
|
weight_func 16
    // 16-pixel-wide weighted prediction, in place. Eight rows are always
    // processed; a second batch of eight only when a2 equals 16.
    vldi            vr23, 0                     // zero, used for widening
    addi.d          t3, zero, 16
    weight_load_16
    weight_extend_16
    weight_16                                   // rows 0..7
    bne             a2, t3, .END_WEIGHT_H264_PIXELS16_8

    add.d           a0, t4, t2                  // t4 = row 4, so a0 = row 8
    weight_load_16
    weight_extend_16
    weight_16                                   // rows 8..15
.END_WEIGHT_H264_PIXELS16_8:
endfunc
|
|
|
|
.macro weight_load_8
    // Fetch four 8-pixel rows from a0 into f0-f3 (offsets 0, a1, t0, t1
    // as passed to load_double, which is defined in loongson_asm.S).
    load_double     f0, f1, f2, f3, a0, a1, t0, t1
.endm
|
|
|
|
.macro weight_extend_8
    // Zero-extend the four 8-pixel rows in vr0-vr3 to halfwords in
    // vr10-vr13. vr21 must have been set to zero by the caller.
    vilvl.b         vr10, vr21, vr0
    vilvl.b         vr11, vr21, vr1
    vilvl.b         vr12, vr21, vr2
    vilvl.b         vr13, vr21, vr3
.endm
|
|
|
|
.macro weight_8
    // Weight the four widened rows in vr10-vr13 and store the 8-byte
    // results back to the rows at a0 (results pass through vr0-vr3).
    weight_calc     vr10, vr11, vr12, vr13, vr8, vr20, vr9, vr0, vr1, vr2, vr3
    store_double    f0, f1, f2, f3, a0, a1, t0, t1
.endm
|
|
|
|
weight_func 8
    // 8-pixel-wide weighted prediction, in place. Four rows are always
    // processed; more follow when a2 (row count) is at least 8, and
    // again when it is at least 16.
    vldi            vr21, 0                     // zero, used for widening
    addi.d          t3, zero, 8
    weight_load_8
    weight_extend_8
    weight_8                                    // rows 0..3
    blt             a2, t3, .END_WEIGHT_H264_PIXELS8

    add.d           a0, a0, t2                  // advance by four rows
    addi.d          t3, zero, 16
    weight_load_8
    weight_extend_8
    weight_8                                    // rows 4..7
    blt             a2, t3, .END_WEIGHT_H264_PIXELS8

    add.d           a0, a0, t2                  // row 8
    add.d           t4, a0, t2                  // row 12
    weight_load_8                               // rows 8..11  -> f0-f3
    load_double     f4, f5, f6, f7, t4, a1, t0, t1  // rows 12..15 -> f4-f7
    weight_extend_8
    vilvl.b         vr14, vr21, vr4
    vilvl.b         vr15, vr21, vr5
    vilvl.b         vr16, vr21, vr6
    vilvl.b         vr17, vr21, vr7
    weight_8                                    // rows 8..11
    weight_calc     vr14, vr15, vr16, vr17, vr8, vr20, vr9, vr4, vr5, vr6, vr7
    store_double    f4, f5, f6, f7, t4, a1, t0, t1  // rows 12..15
.END_WEIGHT_H264_PIXELS8:
endfunc
|
|
|
|
.macro weight_func_lasx w
// Opens ff_weight_h264_pixels<w>_8_lasx and emits the common prologue
// (256-bit counterpart of weight_func). The body and the closing
// endfunc follow the macro invocation.
//   a0 = pixel rows (updated in place), a1 = stride, a2 = row count,
//   a3 = log2_denom, a4 = weight, a5 = offset
// Leaves: t0/t1/t2 = stride * 2/3/4, xr20 = weight,
//         xr8 = offset << log2_denom, xr9 = log2_denom.
function ff_weight_h264_pixels\w\()_8_lasx
    slli.d          t0, a1, 1                   // t0 = 2 * stride
    add.d           t1, t0, a1                  // t1 = 3 * stride
    slli.d          t2, a1, 2                   // t2 = 4 * stride

    sll.d           a5, a5, a3                  // offset <<= log2_denom
    xvreplgr2vr.h   xr20, a4                    // weight
    xvreplgr2vr.h   xr8, a5                     // offset
    xvreplgr2vr.h   xr9, a3                     // log2_denom
.endm
|
|
|
|
.macro weight_calc_lasx _in0, _in1, _reg0, _reg1, _reg2, _out0, _out1
    // Two independent 256-bit weighted-prediction steps:
    //   out = sat_u8(round_shift(sat_add(reg0, in * reg1), reg2))
    //   _in0/_in1 : 16-bit pixels (left untouched unless aliased by out)
    //   _reg0     : offset, _reg1 : weight, _reg2 : shift amount
    xvmul.h         \_out0, \_in0, \_reg1
    xvmul.h         \_out1, \_in1, \_reg1
    xvsadd.h        \_out0, \_reg0, \_out0
    xvsadd.h        \_out1, \_reg0, \_out1
    xvssrarn.bu.h   \_out0, \_out0, \_reg2
    xvssrarn.bu.h   \_out1, \_out1, \_reg2
.endm
|
|
|
|
.macro weight_load_lasx_8
    // Load four 8-pixel rows from a0 and widen them to halfwords.
    //   out: xr6 = rows 0 and 1, xr7 = rows 2 and 3 (16 bit per pixel)
    //   clobbers vr0-vr5
    load_double     f0, f1, f2, f3, a0, a1, t0, t1
    vilvl.d         vr4, vr1, vr0               // rows 0 and 1 as bytes
    vilvl.d         vr5, vr3, vr2               // rows 2 and 3 as bytes
    vext2xv.hu.bu   xr6, xr4
    vext2xv.hu.bu   xr7, xr5
.endm
|
|
|
|
.macro weight_lasx_8
    // Weight the rows prepared by weight_load_lasx_8 (xr6/xr7) and store
    // four 8-byte rows at a0.
    weight_calc_lasx xr6, xr7, xr8, xr20, xr9, xr1, xr3
    // Bring 64-bit element 2 of each result down to element 0 so it can
    // be stored through the matching f register.
    xvpermi.d       xr2, xr1, 0x2
    xvpermi.d       xr4, xr3, 0x2
    store_double    f1, f2, f3, f4, a0, a1, t0, t1
.endm
|
|
|
|
weight_func_lasx 8
    // 8-pixel-wide weighted prediction (LASX), in place. Four rows are
    // always processed; more follow when a2 (row count) is at least 8,
    // and again when it is at least 16.
    addi.d          t3, zero, 8
    weight_load_lasx_8
    weight_lasx_8                               // rows 0..3
    blt             a2, t3, .END_WEIGHT_H264_PIXELS8_LASX

    add.d           a0, a0, t2                  // advance by four rows
    addi.d          t3, zero, 16
    weight_load_lasx_8
    weight_lasx_8                               // rows 4..7
    blt             a2, t3, .END_WEIGHT_H264_PIXELS8_LASX

    add.d           a0, a0, t2                  // row 8
    add.d           t4, a0, t2                  // row 12
    weight_load_lasx_8                          // rows 8..11 -> xr6/xr7
    load_double     f14, f15, f16, f17, t4, a1, t0, t1
    vilvl.d         vr4, vr15, vr14             // rows 12 and 13 as bytes
    vilvl.d         vr5, vr17, vr16             // rows 14 and 15 as bytes
    vext2xv.hu.bu   xr10, xr4
    vext2xv.hu.bu   xr11, xr5
    weight_lasx_8                               // rows 8..11 -> a0
    weight_calc_lasx xr10, xr11, xr8, xr20, xr9, xr4, xr6
    xvpermi.d       xr5, xr4, 0x2
    xvpermi.d       xr7, xr6, 0x2
    store_double    f4, f5, f6, f7, t4, a1, t0, t1  // rows 12..15 -> t4
.END_WEIGHT_H264_PIXELS8_LASX:
endfunc
|
|
|
|
.macro weight_load_lasx_16
    // Load eight 16-pixel rows starting at a0 and widen each one to
    // sixteen halfwords in place (xr0-xr7).
    //   in : a1/t0/t1 = stride * 1/2/3, t2 = stride * 4
    //   out: t4 = a0 + t2 (pointer to row 4), xr0-xr7 = rows 0..7
    add.d           t4, a0, t2
    vld             vr0, a0, 0
    vldx            vr1, a0, a1
    vldx            vr2, a0, t0
    vldx            vr3, a0, t1
    vld             vr4, t4, 0
    vldx            vr5, t4, a1
    vldx            vr6, t4, t0
    vldx            vr7, t4, t1

    vext2xv.hu.bu   xr0, xr0
    vext2xv.hu.bu   xr1, xr1
    vext2xv.hu.bu   xr2, xr2
    vext2xv.hu.bu   xr3, xr3
    vext2xv.hu.bu   xr4, xr4
    vext2xv.hu.bu   xr5, xr5
    vext2xv.hu.bu   xr6, xr6
    vext2xv.hu.bu   xr7, xr7
.endm
|
|
|
|
.macro weight_lasx_16
    // Weight the eight widened rows in xr0-xr7 and store them as
    // 16-byte rows: four at a0 and four at t4.
    weight_calc_lasx xr0, xr1, xr8, xr20, xr9, xr10, xr11
    weight_calc_lasx xr2, xr3, xr8, xr20, xr9, xr12, xr13
    weight_calc_lasx xr4, xr5, xr8, xr20, xr9, xr14, xr15
    weight_calc_lasx xr6, xr7, xr8, xr20, xr9, xr16, xr17

    // 0xD8 reorders the 64-bit elements to 0, 2, 1, 3 so that the two
    // narrowed halves of a row become adjacent in the low 128 bits.
    xvpermi.d       xr10, xr10, 0xD8
    xvpermi.d       xr11, xr11, 0xD8
    xvpermi.d       xr12, xr12, 0xD8
    xvpermi.d       xr13, xr13, 0xD8
    xvpermi.d       xr14, xr14, 0xD8
    xvpermi.d       xr15, xr15, 0xD8
    xvpermi.d       xr16, xr16, 0xD8
    xvpermi.d       xr17, xr17, 0xD8

    vst             vr10, a0, 0
    vstx            vr11, a0, a1
    vstx            vr12, a0, t0
    vstx            vr13, a0, t1
    vst             vr14, t4, 0
    vstx            vr15, t4, a1
    vstx            vr16, t4, t0
    vstx            vr17, t4, t1
.endm
|
|
|
|
weight_func_lasx 16
    // 16-pixel-wide weighted prediction (LASX), in place. Eight rows are
    // always processed; a second batch of eight only when a2 equals 16.
    addi.d          t3, zero, 16
    weight_load_lasx_16
    weight_lasx_16                              // rows 0..7
    bne             a2, t3, .END_WEIGHT_H264_PIXELS16_8_LASX

    add.d           a0, t4, t2                  // t4 = row 4, so a0 = row 8
    weight_load_lasx_16
    weight_lasx_16                              // rows 8..15
.END_WEIGHT_H264_PIXELS16_8_LASX:
endfunc
|
|
|
|
// An LSX implementation is sufficient for this function.
function ff_weight_h264_pixels4_8_lsx
    // 4-pixel-wide weighted prediction, in place.
    //   a0 = pixel rows, a1 = stride, a2 = row count,
    //   a3 = log2_denom, a4 = weight, a5 = offset
    // Rows 0..1 are always processed, rows 2..3 when a2 >= 4 and
    // rows 4..7 when a2 >= 8.
    add.d           t0, a0, a1                  // t0 -> row 1
    addi.d          t3, zero, 4

    sll.d           a5, a5, a3                  // offset <<= log2_denom
    vreplgr2vr.h    vr20, a4                    // weight
    vreplgr2vr.h    vr8, a5                     // offset
    vreplgr2vr.h    vr9, a3                     // log2_denom
    vldi            vr21, 0                     // zero, used for widening

    // ---- rows 0 and 1 ----
    fld.s           f0, a0, 0
    fldx.s          f1, a0, a1
    vilvl.w         vr4, vr1, vr0               // both rows in 8 bytes
    vilvl.b         vr5, vr21, vr4              // widen to halfwords
    vmul.h          vr10, vr5, vr20
    vsadd.h         vr0, vr8, vr10
    vssrarn.bu.h    vr0, vr0, vr9

    fst.s           f0, a0, 0                   // row 0
    vstelm.w        vr0, t0, 0, 1               // row 1
    blt             a2, t3, .END_WEIGHT_H264_PIXELS4

    // ---- rows 2 and 3 ----
    add.d           a0, t0, a1                  // a0 -> row 2
    addi.d          t3, zero, 8
    fld.s           f0, a0, 0
    fldx.s          f1, a0, a1
    add.d           t0, a0, a1                  // t0 -> row 3
    vilvl.w         vr4, vr1, vr0
    vilvl.b         vr5, vr21, vr4

    vmul.h          vr10, vr5, vr20
    vsadd.h         vr0, vr8, vr10
    vssrarn.bu.h    vr0, vr0, vr9

    fst.s           f0, a0, 0                   // row 2
    vstelm.w        vr0, t0, 0, 1               // row 3
    blt             a2, t3, .END_WEIGHT_H264_PIXELS4

    // ---- rows 4 to 7 ----
    add.d           a0, t0, a1                  // a0 -> row 4
    add.d           t0, a0, a1                  // t0 -> row 5
    add.d           t1, t0, a1                  // t1 -> row 6
    add.d           t2, t1, a1                  // t2 -> row 7

    fld.s           f0, a0, 0
    fld.s           f1, t0, 0
    fld.s           f2, t1, 0
    fld.s           f3, t2, 0

    vilvl.w         vr4, vr1, vr0               // rows 4 and 5
    vilvl.w         vr5, vr3, vr2               // rows 6 and 7
    vilvl.b         vr6, vr21, vr4
    vilvl.b         vr7, vr21, vr5

    vmul.h          vr10, vr6, vr20
    vmul.h          vr11, vr7, vr20
    vsadd.h         vr0, vr8, vr10
    vsadd.h         vr1, vr8, vr11
    vssrarn.bu.h    vr10, vr0, vr9
    vssrarn.bu.h    vr11, vr1, vr9

    fst.s           f10, a0, 0                  // row 4
    vstelm.w        vr10, t0, 0, 1              // row 5
    fst.s           f11, t1, 0                  // row 6
    vstelm.w        vr11, t2, 0, 1              // row 7
.END_WEIGHT_H264_PIXELS4:
endfunc
|
|
|
|
function ff_h264_add_pixels4_8_lsx
    // Add a 4x4 block of 16-bit values to 4x4 pixels and clear the block.
    //   a0 = pixel rows (4 bytes each, updated in place)
    //   a1 = sixteen 16-bit values (32 bytes, zeroed on return)
    //   a2 = pixel row stride
    // The sum is truncated to its low byte (no saturation).
    slli.d          t0, a2, 1                   // t0 = 2 * stride
    add.d           t1, t0, a2                  // t1 = 3 * stride
    vld             vr0, a1, 0                  // values for rows 0 and 1
    vld             vr1, a1, 16                 // values for rows 2 and 3
    vldi            vr2, 0                      // zero vector

    fld.s           f3, a0, 0
    fldx.s          f4, a0, a2
    fldx.s          f5, a0, t0
    fldx.s          f6, a0, t1
    vilvl.w         vr7, vr4, vr3               // pixel rows 0 and 1
    vilvl.w         vr8, vr6, vr5               // pixel rows 2 and 3
    vilvl.b         vr9, vr2, vr7               // widen to halfwords
    vilvl.b         vr10, vr2, vr8

    vadd.h          vr11, vr0, vr9
    vadd.h          vr12, vr1, vr10
    vpickev.b       vr0, vr12, vr11             // low byte of every sum

    // Move rows 1..3 into the low word of their own register.
    vbsrl.v         vr3, vr0, 4
    vbsrl.v         vr4, vr0, 8
    vbsrl.v         vr5, vr0, 12
    fst.s           f0, a0, 0
    fstx.s          f3, a0, a2
    fstx.s          f4, a0, t0
    fstx.s          f5, a0, t1

    // Clear the 16-bit block.
    vst             vr2, a1, 0
    vst             vr2, a1, 16
endfunc
|
|
|
|
function ff_h264_add_pixels8_8_lsx
    // Add an 8x8 block of 16-bit values to 8x8 pixels and clear the block.
    //   a0 = pixel rows (8 bytes each, updated in place)
    //   a1 = sixty-four 16-bit values (128 bytes, zeroed on return)
    //   a2 = pixel row stride
    // The sum is truncated to its low byte (no saturation).
    slli.d          t0, a2, 1                   // t0 = 2 * stride
    slli.d          t2, a2, 2                   // t2 = 4 * stride
    add.d           t1, t0, a2                  // t1 = 3 * stride
    add.d           t3, a0, t2                  // t3 -> pixel row 4
    vldi            vr0, 0                      // zero vector

    // One vector of eight 16-bit values per row.
    vld             vr1, a1, 0
    vld             vr2, a1, 16
    vld             vr3, a1, 32
    vld             vr4, a1, 48
    vld             vr5, a1, 64
    vld             vr6, a1, 80
    vld             vr7, a1, 96
    vld             vr8, a1, 112

    load_double     f10, f11, f12, f13, a0, a2, t0, t1  // pixel rows 0..3
    load_double     f14, f15, f16, f17, t3, a2, t0, t1  // pixel rows 4..7

    // Widen the pixels to halfwords.
    vilvl.b         vr10, vr0, vr10
    vilvl.b         vr11, vr0, vr11
    vilvl.b         vr12, vr0, vr12
    vilvl.b         vr13, vr0, vr13
    vilvl.b         vr14, vr0, vr14
    vilvl.b         vr15, vr0, vr15
    vilvl.b         vr16, vr0, vr16
    vilvl.b         vr17, vr0, vr17

    vadd.h          vr1, vr1, vr10
    vadd.h          vr2, vr2, vr11
    vadd.h          vr3, vr3, vr12
    vadd.h          vr4, vr4, vr13
    vadd.h          vr5, vr5, vr14
    vadd.h          vr6, vr6, vr15
    vadd.h          vr7, vr7, vr16
    vadd.h          vr8, vr8, vr17

    // Keep the low byte of every sum, two rows per register.
    vpickev.b       vr10, vr2, vr1              // rows 0 and 1
    vpickev.b       vr12, vr4, vr3              // rows 2 and 3
    vpickev.b       vr14, vr6, vr5              // rows 4 and 5
    vpickev.b       vr16, vr8, vr7              // rows 6 and 7
    vbsrl.v         vr11, vr10, 8               // row 1
    vbsrl.v         vr13, vr12, 8               // row 3
    vbsrl.v         vr15, vr14, 8               // row 5
    vbsrl.v         vr17, vr16, 8               // row 7

    // Clear the 16-bit block.
    vst             vr0, a1, 0
    vst             vr0, a1, 16
    vst             vr0, a1, 32
    vst             vr0, a1, 48
    vst             vr0, a1, 64
    vst             vr0, a1, 80
    vst             vr0, a1, 96
    vst             vr0, a1, 112

    store_double    f10, f11, f12, f13, a0, a2, t0, t1
    store_double    f14, f15, f16, f17, t3, a2, t0, t1
endfunc
|
|
|
|
const cnst_value
// Two 16-byte vectors read by ff_h264_loop_filter_strength_lsx when its
// "field" argument is non-zero: the first is loaded into vr1 (bytes
// 6, 2 repeated), the second into vr2 (bytes 3, 1 repeated).
.byte   6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2, 6, 2
.byte   3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1
endconst
|
|
|
|
function ff_h264_loop_filter_strength_lsx
    // Computes deblocking boundary strengths in two passes and stores
    // them as 16-bit values at a0; the first 32 bytes are transposed at
    // the end.
    //   a0 = output (pass 1 writes at +32, pass 2 at +0)
    //   a1 = byte table read at +12 and at a neighbour offset
    //   a2 = byte table read around +12 and +52
    //   a3 = 16-bit table read around +48 and +208
    //   a4 = selects the short path (zero) or the long path (non-zero)
    //   a5 = edges, a6 = step (pass 1 loop bound and increment)
    //   a7 = mask for pass 2
    //   stack +0 = mask_mv1 (mask for pass 1), stack +8 = field
    // NOTE(review): a1/a2/a3 presumably are the nnz, ref and mv tables
    // and a4 the bidir flag - confirm against the C prototype.
    vldi            vr0, 0                      // zero vector
    ldptr.w         t0, sp, 0                   // mask_mv1
    ldptr.w         t1, sp, 8                   // field

    // vr1 / vr2 = limit and bias bytes used by the 16-bit difference
    // test. Note that .FIELD is the branch taken when field == 0.
    beqz            t1, .FIELD
    la.local        t2, cnst_value
    vld             vr1, t2, 0                  // bytes 6, 2, 6, 2, ...
    vld             vr2, t2, 16                 // bytes 3, 1, 3, 1, ...
    b               .END_FIELD
.FIELD:
    vldi            vr1, 0x06
    vldi            vr2, 0x03
.END_FIELD:
    vldi            vr3, 0x01
    slli.d          a6, a6, 3                   // step <<= 3
    slli.d          a5, a5, 3                   // edges <<= 3
    move            t3, zero                    // loop counter
    slli.d          t4, a6, 2                   // stride for the a3 table
    move            t5, a2
    move            t6, a3
    move            t7, a1
    move            t8, a0
    slli.d          t0, t0, 3                   // mask_mv1 <<= 3

    // ------------------------------------------------------------------
    // Pass 1: neighbour 8 bytes back in the byte tables, results stored
    // at output + 32.
    // ------------------------------------------------------------------
.ITERATION_FIR:
    bge             t3, a5, .END_ITERATION_FIR
    vand.v          vr20, vr20, vr0             // vr20 = 0
    and             t2, t0, t3
    bnez            t2, .MASK_MV_FIR            // masked: skip the checks
    beqz            a4, .BIDIR_FIR              // a4 == 0: short path

    // Long path (a4 != 0): both halves of the a2 and a3 tables.
    vld             vr4, t5, 4
    vld             vr5, t5, 44
    vld             vr6, t5, 12
    vld             vr7, t5, 52
    vilvl.w         vr4, vr5, vr4
    vilvl.w         vr6, vr6, vr6
    vilvl.w         vr7, vr7, vr7
    vshuf4i.h       vr5, vr4, 0x4e
    vsub.b          vr6, vr6, vr4
    vsub.b          vr7, vr7, vr5
    vor.v           vr6, vr6, vr7
    vld             vr10, t6, 16
    vld             vr11, t6, 48
    vld             vr12, t6, 208
    vld             vr8, t6, 176
    vsub.h          vr13, vr10, vr11
    vsub.h          vr14, vr10, vr12
    vsub.h          vr15, vr8, vr11
    vsub.h          vr16, vr8, vr12
    vssrarni.b.h    vr14, vr13, 0
    vssrarni.b.h    vr16, vr15, 0
    vadd.b          vr14, vr2, vr14
    vadd.b          vr16, vr2, vr16
    vssub.bu        vr14, vr14, vr1
    vssub.bu        vr16, vr16, vr1
    vssrarni.b.h    vr14, vr14, 0
    vssrarni.b.h    vr16, vr16, 0
    vor.v           vr20, vr6, vr14
    vshuf4i.h       vr16, vr16, 0x4e
    vor.v           vr20, vr20, vr16
    vshuf4i.h       vr21, vr20, 0x4e
    vmin.bu         vr20, vr20, vr21
    b               .MASK_MV_FIR

.BIDIR_FIR:
    // Short path (a4 == 0): a single comparison per table.
    vld             vr4, t5, 4
    vld             vr5, t5, 12
    vld             vr10, t6, 16
    vld             vr11, t6, 48
    vsub.h          vr12, vr11, vr10
    vssrarni.b.h    vr12, vr12, 0
    vadd.b          vr13, vr12, vr2
    vssub.bu        vr14, vr13, vr1
    vsat.h          vr15, vr14, 7
    vpickev.b       vr20, vr15, vr15
    vsub.b          vr6, vr5, vr4
    vor.v           vr20, vr20, vr6

.MASK_MV_FIR:
    // Combine with the a1 table: 2 where either entry is non-zero,
    // otherwise 1 where vr20 is non-zero, otherwise 0.
    vld             vr4, t7, 12
    vld             vr5, t7, 4
    vor.v           vr6, vr4, vr5
    vmin.bu         vr6, vr6, vr3
    vmin.bu         vr20, vr20, vr3
    vslli.h         vr6, vr6, 1
    vmax.bu         vr6, vr20, vr6
    vilvl.b         vr7, vr0, vr6               // widen to 16 bit
    add.d           t3, t3, a6
    fst.d           f7, t8, 32
    add.d           t5, t5, a6
    add.d           t6, t6, t4
    add.d           t7, t7, a6
    add.d           t8, t8, a6
    b               .ITERATION_FIR

    // ------------------------------------------------------------------
    // Pass 2: neighbour 1 byte back in the byte tables, fixed bound 32
    // and step 8, results stored at output + 0.
    // ------------------------------------------------------------------
.END_ITERATION_FIR:
    move            t3, zero
    addi.d          a5, zero, 32
    vldi            vr21, 0xff
    move            t5, a2
    move            t6, a3
    move            t7, a1
    move            t8, a0
    slli.d          a7, a7, 3                   // pass 2 mask <<= 3
.ITERATION_SEC:
    bge             t3, a5, .END_ITERATION_SEC
    // Unlike pass 1 the AND operand is vr21 (loaded with 0xff, which
    // presumably sets every byte), so vr20 is carried over from the
    // previous iteration when the checks below are skipped.
    vand.v          vr20, vr20, vr21
    and             t2, a7, t3
    bnez            t2, .MASK_MV_SEC            // masked: skip the checks
    beqz            a4, .BIDIR_SEC              // a4 == 0: short path

    // Long path (a4 != 0).
    vld             vr4, t5, 11
    vld             vr5, t5, 51
    vld             vr6, t5, 12
    vld             vr7, t5, 52
    vilvl.w         vr4, vr5, vr4
    vilvl.w         vr6, vr6, vr6
    vilvl.w         vr7, vr7, vr7
    vshuf4i.h       vr5, vr4, 0x4e
    vsub.b          vr6, vr6, vr4
    vsub.b          vr7, vr7, vr5
    vor.v           vr6, vr6, vr7
    vld             vr10, t6, 44
    vld             vr11, t6, 48
    vld             vr12, t6, 208
    vld             vr8, t6, 204
    vsub.h          vr13, vr10, vr11
    vsub.h          vr14, vr10, vr12
    vsub.h          vr15, vr8, vr11
    vsub.h          vr16, vr8, vr12
    vssrarni.b.h    vr14, vr13, 0
    vssrarni.b.h    vr16, vr15, 0
    vadd.b          vr14, vr2, vr14
    vadd.b          vr16, vr2, vr16
    vssub.bu        vr14, vr14, vr1
    vssub.bu        vr16, vr16, vr1
    vssrarni.b.h    vr14, vr14, 0
    vssrarni.b.h    vr16, vr16, 0
    vor.v           vr20, vr6, vr14
    vshuf4i.h       vr16, vr16, 0x4e
    vor.v           vr20, vr20, vr16
    vshuf4i.h       vr22, vr20, 0x4e
    vmin.bu         vr20, vr20, vr22
    b               .MASK_MV_SEC

.BIDIR_SEC:
    // Short path (a4 == 0).
    vld             vr4, t5, 11
    vld             vr5, t5, 12
    vld             vr10, t6, 44
    vld             vr11, t6, 48
    vsub.h          vr12, vr11, vr10
    vssrarni.b.h    vr12, vr12, 0
    vadd.b          vr13, vr12, vr2
    vssub.bu        vr14, vr13, vr1
    vssrarni.b.h    vr14, vr14, 0
    vsub.b          vr6, vr5, vr4
    vor.v           vr20, vr14, vr6

.MASK_MV_SEC:
    // Same combination with the a1 table as in pass 1.
    vld             vr4, t7, 12
    vld             vr5, t7, 11
    vor.v           vr6, vr4, vr5
    vmin.bu         vr6, vr6, vr3
    vmin.bu         vr20, vr20, vr3
    vslli.h         vr6, vr6, 1
    vmax.bu         vr6, vr20, vr6
    vilvl.b         vr7, vr0, vr6               // widen to 16 bit
    addi.d          t3, t3, 8
    fst.d           f7, t8, 0
    addi.d          t5, t5, 8
    addi.d          t6, t6, 32
    addi.d          t7, t7, 8
    addi.d          t8, t8, 8
    b               .ITERATION_SEC

    // ------------------------------------------------------------------
    // Transpose the 4x4 block of 16-bit results written by pass 2.
    // ------------------------------------------------------------------
.END_ITERATION_SEC:
    vld             vr4, a0, 0
    vld             vr5, a0, 16
    vilvh.d         vr6, vr4, vr4
    vilvh.d         vr7, vr5, vr5
    LSX_TRANSPOSE4x4_H vr4, vr6, vr5, vr7, vr6, vr7, vr8, vr9, vr10, vr11
    vilvl.d         vr4, vr7, vr6
    vilvl.d         vr5, vr9, vr8
    vst             vr4, a0, 0
    vst             vr5, a0, 16
endfunc
|