mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-03 05:10:03 +02:00
4501b1dfd7
./configure --disable-lasx ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 -pix_fmt bgra -y /dev/null -an before: 91fps after: 160fps Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
1869 lines
70 KiB
ArmAsm
1869 lines
70 KiB
ArmAsm
/*
 * Loongson LSX optimized swscale
 *
 * Copyright (c) 2023 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
|
|
|
|
#include "libavcodec/loongarch/loongson_asm.S"
|
|
|
|
/*
 * void ff_hscale_8_to_15_lsx(SwsContext *c, int16_t *dst, int dstW,
 *                            const uint8_t *src, const int16_t *filter,
 *                            const int32_t *filterPos, int filterSize)
 *
 * For each output index i in [0, dstW):
 *     sum    = SUM(j = 0 .. filterSize - 1)
 *                  src[filterPos[i] + j] * filter[i * filterSize + j]
 *     dst[i] = min(sum >> 7, 32767)
 * src is read as unsigned bytes, filter as signed 16-bit values, and each
 * result is stored as a 16-bit value.
 *
 * Registers as used below: a1 = dst, a2 = dstW, a3 = src, a4 = filter,
 * a5 = filterPos, a6 = filterSize. a0 (c) is never read.
 *
 * Code paths:
 *   filterSize == 4 : 8 outputs per iteration             (.LOOP_DSTW4)
 *   filterSize == 8 : 8 outputs per iteration             (.LOOP_DSTW8)
 *   filterSize  > 8 : 4 outputs per iteration, 8 taps per inner step plus
 *                     a scalar tail for the leftover taps (.LOOP_START)
 *   otherwise       : one output at a time                (.END_DSTW4)
 * Every vector loop is followed by a scalar loop for the leftover outputs.
 *
 * The vector loops execute their first iteration before testing the
 * remaining count, so the dispatch only enters them when dstW covers one
 * full iteration. Shorter rows take the scalar path, which keeps every
 * filterPos/filter read and every dst store inside the first dstW entries.
 */
function ff_hscale_8_to_15_lsx
    /* Save the callee-saved registers s0-s8 that are used as scratch. */
    addi.d          sp, sp, -72
    st.d            s0, sp, 0
    st.d            s1, sp, 8
    st.d            s2, sp, 16
    st.d            s3, sp, 24
    st.d            s4, sp, 32
    st.d            s5, sp, 40
    st.d            s6, sp, 48
    st.d            s7, sp, 56
    st.d            s8, sp, 64
    li.w            t0, 32767               /* upper clamp of every output */
    li.w            t8, 8
    li.w            t7, 4
    vldi            vr0, 0                  /* zero vector, widens bytes below */
    vreplgr2vr.w    vr20, t0                /* clamp value in all word lanes */

    /* Dispatch on filterSize, but only onto a vector loop whose first
     * (unconditional) iteration fits into dstW. */
    bge             zero, a2, .END_LOOP     /* dstW <= 0: nothing to produce */
    blt             a2, t7, .END_DSTW4      /* dstW < 4: scalar only */
    blt             t8, a6, .LOOP_START     /* filterSize > 8, dstW >= 4 */
    blt             a2, t8, .END_DSTW4      /* 8-wide loops need dstW >= 8 */
    beq             a6, t7, .LOOP_DSTW4
    beq             a6, t8, .LOOP_DSTW8
    b               .END_DSTW4

/* ---- filterSize > 8: four outputs per pass --------------------------- */
.LOOP_START:
    li.w            t1, 0                   /* row index for the scalar tap tail */
    li.w            s1, 0                   /* taps consumed by the vector loop */
    li.w            s2, 0                   /* scalar partial sums, outputs 0..3 */
    li.w            s3, 0
    li.w            s4, 0
    li.w            s5, 0
    vldi            vr22, 0                 /* vector partial sums, outputs 0..3 */
    addi.w          s0, a6, -7              /* vector loop runs while s1 < filterSize - 7 */
    slli.w          s7, a6, 1               /* byte offset of filter row 1 */
    slli.w          s8, a6, 2               /* byte offset of filter row 2 */
    add.w           t6, s7, s8              /* byte offset of filter row 3 */
.LOOP_DSTW:
    ld.w            t2, a5, 0
    ld.w            t3, a5, 4
    ld.w            t4, a5, 8
    ld.w            t5, a5, 12
    /* 8 source bytes for each of the four outputs */
    fldx.d          f1, a3, t2
    fldx.d          f2, a3, t3
    fldx.d          f3, a3, t4
    fldx.d          f4, a3, t5
    /* 8 coefficients from each of the four filter rows */
    vld             vr9, a4, 0
    vldx            vr10, a4, s7
    vldx            vr11, a4, s8
    vldx            vr12, a4, t6
    /* interleave with zero: bytes become 16-bit lanes */
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    /* vdp2.w.h is presumably a helper from loongson_asm.S (TODO confirm);
     * the reduction below treats its result as four 32-bit partial sums. */
    vdp2.w.h        vr17, vr1, vr9
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    /* horizontal add: words -> doublewords -> one total per register */
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    /* gather the four totals into the word lanes of vr1 and accumulate */
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.d         vr1, vr3, vr1
    vadd.w          vr22, vr22, vr1
    addi.w          s1, s1, 8
    addi.d          a3, a3, 8               /* src advances with the taps */
    addi.d          a4, a4, 16
    blt             s1, s0, .LOOP_DSTW
    blt             s1, a6, .DSTWA          /* leftover taps: finish them scalar */
    b               .END_FILTER
    /* Scalar tap tail. a3/a4 already point at tap s1, so t3 restarts at 0;
     * s6 counts taps up to filterSize and t1 selects the filter row. */
.DSTWA:
    ld.w            t2, a5, 0
    li.w            t3, 0
    move            s6, s1
.FILTERSIZEA:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s2, s2, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZEA

    ld.w            t2, a5, 4
    li.w            t3, 0
    move            s6, s1
    addi.w          t1, t1, 1
.FILTERSIZEB:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s3, s3, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZEB
    ld.w            t2, a5, 8
    addi.w          t1, t1, 1
    li.w            t3, 0
    move            s6, s1
.FILTERSIZEC:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s4, s4, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZEC
    ld.w            t2, a5, 12
    addi.w          t1, t1, 1
    move            s6, s1
    li.w            t3, 0
.FILTERSIZED:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s5, s5, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZED
.END_FILTER:
    /* vector partial sums + scalar partial sums */
    vpickve2gr.w    t1, vr22, 0
    vpickve2gr.w    t2, vr22, 1
    vpickve2gr.w    t3, vr22, 2
    vpickve2gr.w    t4, vr22, 3
    add.w           s2, s2, t1
    add.w           s3, s3, t2
    add.w           s4, s4, t3
    add.w           s5, s5, t4
    srai.w          s2, s2, 7
    srai.w          s3, s3, 7
    srai.w          s4, s4, 7
    srai.w          s5, s5, 7
    /* branch-free min(value, t0): keep value if value < t0, else take t0 */
    slt             t1, s2, t0
    slt             t2, s3, t0
    slt             t3, s4, t0
    slt             t4, s5, t0
    maskeqz         s2, s2, t1
    maskeqz         s3, s3, t2
    maskeqz         s4, s4, t3
    maskeqz         s5, s5, t4
    masknez         t1, t0, t1
    masknez         t2, t0, t2
    masknez         t3, t0, t3
    masknez         t4, t0, t4
    or              s2, s2, t1
    or              s3, s3, t2
    or              s4, s4, t3
    or              s5, s5, t4
    st.h            s2, a1, 0
    st.h            s3, a1, 2
    st.h            s4, a1, 4
    st.h            s5, a1, 6

    addi.d          a1, a1, 8
    sub.d           a3, a3, s1              /* undo the src advance of the tap loop */
    addi.d          a5, a5, 16
    /* filter: forward 4 rows (8 * filterSize bytes), minus the 2 * s1
     * bytes the tap loop already advanced */
    slli.d          t3, a6, 3
    add.d           a4, a4, t3
    sub.d           a4, a4, s1
    sub.d           a4, a4, s1
    addi.d          a2, a2, -4
    bge             a2, t7, .LOOP_START
    blt             zero, a2, .RES
    b               .END_LOOP
    /* Leftover outputs (fewer than 4), one at a time. t7/t8 are reused as
     * temporaries here; the vector loop is not re-entered. */
.RES:
    li.w            t1, 0
.DSTW:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2              /* filterPos[t1] */
    li.w            t3, 0
    li.w            t8, 0                   /* accumulator */
.FILTERSIZE:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTERSIZE
    srai.w          t8, t8, 7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DSTW
    b               .END_LOOP

/* ---- filterSize == 8: eight outputs per pass ------------------------- */
.LOOP_DSTW8:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    fldx.d          f1, a3, t1
    fldx.d          f2, a3, t2
    fldx.d          f3, a3, t3
    fldx.d          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.d          f5, a3, t1
    fldx.d          f6, a3, t2
    fldx.d          f7, a3, t3
    fldx.d          f8, a3, t4
    /* eight consecutive filter rows of 8 coefficients each */
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vld             vr13, a4, 64
    vld             vr14, a4, 80
    vld             vr15, a4, 96
    vld             vr16, a4, 112
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr6, vr0, vr6
    vilvl.b         vr7, vr0, vr7
    vilvl.b         vr8, vr0, vr8

    vdp2.w.h        vr17, vr1, vr9
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    vdp2.w.h        vr1, vr5, vr13
    vdp2.w.h        vr2, vr6, vr14
    vdp2.w.h        vr3, vr7, vr15
    vdp2.w.h        vr4, vr8, vr16
    /* outputs 4..7 land in vr5..vr8, outputs 0..3 in vr1..vr4 */
    vhaddw.d.w      vr5, vr1, vr1
    vhaddw.d.w      vr6, vr2, vr2
    vhaddw.d.w      vr7, vr3, vr3
    vhaddw.d.w      vr8, vr4, vr4
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vhaddw.q.d      vr5, vr5, vr5
    vhaddw.q.d      vr6, vr6, vr6
    vhaddw.q.d      vr7, vr7, vr7
    vhaddw.q.d      vr8, vr8, vr8
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.d         vr1, vr3, vr1
    vilvl.d         vr5, vr7, vr5
    vsrai.w         vr1, vr1, 7
    vsrai.w         vr5, vr5, 7
    vmin.w          vr1, vr1, vr20
    vmin.w          vr5, vr5, vr20

    /* keep the low halfword of each word: eight 16-bit results */
    vpickev.h       vr1, vr5, vr1
    vst             vr1, a1, 0
    addi.d          a1, a1, 16
    addi.d          a5, a5, 32
    addi.d          a4, a4, 128
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DSTW8
    blt             zero, a2, .RES8
    b               .END_LOOP
    /* Leftover outputs (fewer than 8), one at a time. */
.RES8:
    li.w            t1, 0
.DSTW8:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTERSIZE8:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTERSIZE8
    srai.w          t8, t8, 7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DSTW8
    b               .END_LOOP

/* ---- filterSize == 4: eight outputs per pass ------------------------- */
.LOOP_DSTW4:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    /* 4 source bytes per output */
    fldx.s          f1, a3, t1
    fldx.s          f2, a3, t2
    fldx.s          f3, a3, t3
    fldx.s          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.s          f5, a3, t1
    fldx.s          f6, a3, t2
    fldx.s          f7, a3, t3
    fldx.s          f8, a3, t4
    /* each 16-byte load holds two filter rows of 4 coefficients */
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    /* pair up the sources of two outputs, then widen bytes to halfwords */
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr7, vr0, vr7

    vdp2.w.h        vr13, vr1, vr9
    vdp2.w.h        vr14, vr3, vr10
    vdp2.w.h        vr15, vr5, vr11
    vdp2.w.h        vr16, vr7, vr12
    /* one doubleword total per output, two outputs per register */
    vhaddw.d.w      vr13, vr13, vr13
    vhaddw.d.w      vr14, vr14, vr14
    vhaddw.d.w      vr15, vr15, vr15
    vhaddw.d.w      vr16, vr16, vr16
    vpickev.w       vr13, vr14, vr13
    vpickev.w       vr15, vr16, vr15
    vsrai.w         vr13, vr13, 7
    vsrai.w         vr15, vr15, 7
    vmin.w          vr13, vr13, vr20
    vmin.w          vr15, vr15, vr20

    vpickev.h       vr13, vr15, vr13
    vst             vr13, a1, 0
    addi.d          a1, a1, 16
    addi.d          a5, a5, 32
    addi.d          a4, a4, 64
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DSTW4
    blt             zero, a2, .RES4
    b               .END_LOOP
    /* Leftover outputs (fewer than 8), one at a time. */
.RES4:
    li.w            t1, 0
.DSTW4:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTERSIZE4:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTERSIZE4
    srai.w          t8, t8, 7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DSTW4
    b               .END_LOOP

/* ---- generic scalar path: any filterSize, any dstW >= 1 -------------- */
.END_DSTW4:
    li.w            t1, 0
.LOOP_DSTW1:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTERSIZE1:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTERSIZE1
    srai.w          t8, t8, 7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .LOOP_DSTW1
    b               .END_LOOP
.END_LOOP:

    /* Restore s0-s8 and release the frame. */
    ld.d            s0, sp, 0
    ld.d            s1, sp, 8
    ld.d            s2, sp, 16
    ld.d            s3, sp, 24
    ld.d            s4, sp, 32
    ld.d            s5, sp, 40
    ld.d            s6, sp, 48
    ld.d            s7, sp, 56
    ld.d            s8, sp, 64
    addi.d          sp, sp, 72
endfunc
|
|
|
|
/*
 * void ff_hscale_8_to_19_lsx(SwsContext *c, int16_t *dst, int dstW,
 *                            const uint8_t *src, const int16_t *filter,
 *                            const int32_t *filterPos, int filterSize)
 *
 * For each output index i in [0, dstW):
 *     sum    = SUM(j = 0 .. filterSize - 1)
 *                  src[filterPos[i] + j] * filter[i * filterSize + j]
 *     dst[i] = min(sum >> 3, 524287)
 * src is read as unsigned bytes and filter as signed 16-bit values.
 * Although the prototype types dst as int16_t *, every result is stored as
 * a 32-bit word (st.w / stx.w / full-vector vst of word lanes).
 *
 * Registers as used below: a1 = dst, a2 = dstW, a3 = src, a4 = filter,
 * a5 = filterPos, a6 = filterSize. a0 (c) is never read.
 *
 * Code paths:
 *   filterSize == 4 : 8 outputs per iteration             (.LOOP_DST4)
 *   filterSize == 8 : 8 outputs per iteration             (.LOOP_DST8)
 *   filterSize  > 8 : 4 outputs per iteration, 8 taps per inner step plus
 *                     a scalar tail for the leftover taps (.LOOP)
 *   otherwise       : one output at a time                (.END_DST4)
 * Every vector loop is followed by a scalar loop for the leftover outputs.
 *
 * The vector loops execute their first iteration before testing the
 * remaining count, so the dispatch only enters them when dstW covers one
 * full iteration. Shorter rows take the scalar path, which keeps every
 * filterPos/filter read and every dst store inside the first dstW entries.
 */
function ff_hscale_8_to_19_lsx
    /* Save the callee-saved registers s0-s8 that are used as scratch. */
    addi.d          sp, sp, -72
    st.d            s0, sp, 0
    st.d            s1, sp, 8
    st.d            s2, sp, 16
    st.d            s3, sp, 24
    st.d            s4, sp, 32
    st.d            s5, sp, 40
    st.d            s6, sp, 48
    st.d            s7, sp, 56
    st.d            s8, sp, 64
    li.w            t0, 524287              /* upper clamp of every output */
    li.w            t8, 8
    li.w            t7, 4
    vldi            vr0, 0                  /* zero vector, widens bytes below */
    vreplgr2vr.w    vr20, t0                /* clamp value in all word lanes */

    /* Dispatch on filterSize, but only onto a vector loop whose first
     * (unconditional) iteration fits into dstW. */
    bge             zero, a2, .END          /* dstW <= 0: nothing to produce */
    blt             a2, t7, .END_DST4       /* dstW < 4: scalar only */
    blt             t8, a6, .LOOP           /* filterSize > 8, dstW >= 4 */
    blt             a2, t8, .END_DST4       /* 8-wide loops need dstW >= 8 */
    beq             a6, t7, .LOOP_DST4
    beq             a6, t8, .LOOP_DST8
    b               .END_DST4

/* ---- filterSize > 8: four outputs per pass --------------------------- */
.LOOP:
    li.w            t1, 0                   /* row index for the scalar tap tail */
    li.w            s1, 0                   /* taps consumed by the vector loop */
    li.w            s2, 0                   /* scalar partial sums, outputs 0..3 */
    li.w            s3, 0
    li.w            s4, 0
    li.w            s5, 0
    vldi            vr22, 0                 /* vector partial sums, outputs 0..3 */
    addi.w          s0, a6, -7              /* vector loop runs while s1 < filterSize - 7 */
    slli.w          s7, a6, 1               /* byte offset of filter row 1 */
    slli.w          s8, a6, 2               /* byte offset of filter row 2 */
    add.w           t6, s7, s8              /* byte offset of filter row 3 */
.LOOP_DST:
    ld.w            t2, a5, 0
    ld.w            t3, a5, 4
    ld.w            t4, a5, 8
    ld.w            t5, a5, 12
    /* 8 source bytes for each of the four outputs */
    fldx.d          f1, a3, t2
    fldx.d          f2, a3, t3
    fldx.d          f3, a3, t4
    fldx.d          f4, a3, t5
    /* 8 coefficients from each of the four filter rows */
    vld             vr9, a4, 0
    vldx            vr10, a4, s7
    vldx            vr11, a4, s8
    vldx            vr12, a4, t6
    /* interleave with zero: bytes become 16-bit lanes */
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    /* vdp2.w.h is presumably a helper from loongson_asm.S (TODO confirm);
     * the reduction below treats its result as four 32-bit partial sums. */
    vdp2.w.h        vr17, vr1, vr9
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    /* horizontal add: words -> doublewords -> one total per register */
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    /* gather the four totals into the word lanes of vr1 and accumulate */
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.d         vr1, vr3, vr1
    vadd.w          vr22, vr22, vr1
    addi.w          s1, s1, 8
    addi.d          a3, a3, 8               /* src advances with the taps */
    addi.d          a4, a4, 16
    blt             s1, s0, .LOOP_DST
    blt             s1, a6, .DSTA           /* leftover taps: finish them scalar */
    b               .END_FILTERA
    /* Scalar tap tail. a3/a4 already point at tap s1, so t3 restarts at 0;
     * s6 counts taps up to filterSize and t1 selects the filter row. */
.DSTA:
    ld.w            t2, a5, 0
    li.w            t3, 0
    move            s6, s1
.FILTERA:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s2, s2, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERA

    ld.w            t2, a5, 4
    li.w            t3, 0
    move            s6, s1
    addi.w          t1, t1, 1
.FILTERB:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s3, s3, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERB
    ld.w            t2, a5, 8
    addi.w          t1, t1, 1
    li.w            t3, 0
    move            s6, s1
.FILTERC:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s4, s4, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERC
    ld.w            t2, a5, 12
    addi.w          t1, t1, 1
    move            s6, s1
    li.w            t3, 0
.FILTERD:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s5, s5, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERD
.END_FILTERA:
    /* vector partial sums + scalar partial sums */
    vpickve2gr.w    t1, vr22, 0
    vpickve2gr.w    t2, vr22, 1
    vpickve2gr.w    t3, vr22, 2
    vpickve2gr.w    t4, vr22, 3
    add.w           s2, s2, t1
    add.w           s3, s3, t2
    add.w           s4, s4, t3
    add.w           s5, s5, t4
    srai.w          s2, s2, 3
    srai.w          s3, s3, 3
    srai.w          s4, s4, 3
    srai.w          s5, s5, 3
    /* branch-free min(value, t0): keep value if value < t0, else take t0 */
    slt             t1, s2, t0
    slt             t2, s3, t0
    slt             t3, s4, t0
    slt             t4, s5, t0
    maskeqz         s2, s2, t1
    maskeqz         s3, s3, t2
    maskeqz         s4, s4, t3
    maskeqz         s5, s5, t4
    masknez         t1, t0, t1
    masknez         t2, t0, t2
    masknez         t3, t0, t3
    masknez         t4, t0, t4
    or              s2, s2, t1
    or              s3, s3, t2
    or              s4, s4, t3
    or              s5, s5, t4
    st.w            s2, a1, 0
    st.w            s3, a1, 4
    st.w            s4, a1, 8
    st.w            s5, a1, 12

    addi.d          a1, a1, 16
    sub.d           a3, a3, s1              /* undo the src advance of the tap loop */
    addi.d          a5, a5, 16
    /* filter: forward 4 rows (8 * filterSize bytes), minus the 2 * s1
     * bytes the tap loop already advanced */
    slli.d          t3, a6, 3
    add.d           a4, a4, t3
    sub.d           a4, a4, s1
    sub.d           a4, a4, s1
    addi.d          a2, a2, -4
    bge             a2, t7, .LOOP
    blt             zero, a2, .RESA
    b               .END
    /* Leftover outputs (fewer than 4), one at a time. t7/t8 are reused as
     * temporaries here; the vector loop is not re-entered. */
.RESA:
    li.w            t1, 0
.DST:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2              /* filterPos[t1] */
    li.w            t3, 0
    li.w            t8, 0                   /* accumulator */
.FILTER:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTER
    srai.w          t8, t8, 3
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DST
    b               .END

/* ---- filterSize == 8: eight outputs per pass ------------------------- */
.LOOP_DST8:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    fldx.d          f1, a3, t1
    fldx.d          f2, a3, t2
    fldx.d          f3, a3, t3
    fldx.d          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.d          f5, a3, t1
    fldx.d          f6, a3, t2
    fldx.d          f7, a3, t3
    fldx.d          f8, a3, t4
    /* eight consecutive filter rows of 8 coefficients each */
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vld             vr13, a4, 64
    vld             vr14, a4, 80
    vld             vr15, a4, 96
    vld             vr16, a4, 112
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr6, vr0, vr6
    vilvl.b         vr7, vr0, vr7
    vilvl.b         vr8, vr0, vr8

    vdp2.w.h        vr17, vr1, vr9
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    vdp2.w.h        vr1, vr5, vr13
    vdp2.w.h        vr2, vr6, vr14
    vdp2.w.h        vr3, vr7, vr15
    vdp2.w.h        vr4, vr8, vr16
    /* outputs 4..7 land in vr5..vr8, outputs 0..3 in vr1..vr4 */
    vhaddw.d.w      vr5, vr1, vr1
    vhaddw.d.w      vr6, vr2, vr2
    vhaddw.d.w      vr7, vr3, vr3
    vhaddw.d.w      vr8, vr4, vr4
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vhaddw.q.d      vr5, vr5, vr5
    vhaddw.q.d      vr6, vr6, vr6
    vhaddw.q.d      vr7, vr7, vr7
    vhaddw.q.d      vr8, vr8, vr8
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.d         vr1, vr3, vr1
    vilvl.d         vr5, vr7, vr5
    vsrai.w         vr1, vr1, 3
    vsrai.w         vr5, vr5, 3
    vmin.w          vr1, vr1, vr20
    vmin.w          vr5, vr5, vr20

    /* eight 32-bit results: two full vectors */
    vst             vr1, a1, 0
    vst             vr5, a1, 16
    addi.d          a1, a1, 32
    addi.d          a5, a5, 32
    addi.d          a4, a4, 128
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DST8
    blt             zero, a2, .REST8
    b               .END
    /* Leftover outputs (fewer than 8), one at a time. */
.REST8:
    li.w            t1, 0
.DST8:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTER8:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTER8
    srai.w          t8, t8, 3
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DST8
    b               .END

/* ---- filterSize == 4: eight outputs per pass ------------------------- */
.LOOP_DST4:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    /* 4 source bytes per output */
    fldx.s          f1, a3, t1
    fldx.s          f2, a3, t2
    fldx.s          f3, a3, t3
    fldx.s          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.s          f5, a3, t1
    fldx.s          f6, a3, t2
    fldx.s          f7, a3, t3
    fldx.s          f8, a3, t4
    /* each 16-byte load holds two filter rows of 4 coefficients */
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    /* pair up the sources of two outputs, then widen bytes to halfwords */
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr7, vr0, vr7

    vdp2.w.h        vr13, vr1, vr9
    vdp2.w.h        vr14, vr3, vr10
    vdp2.w.h        vr15, vr5, vr11
    vdp2.w.h        vr16, vr7, vr12
    /* one doubleword total per output, two outputs per register */
    vhaddw.d.w      vr13, vr13, vr13
    vhaddw.d.w      vr14, vr14, vr14
    vhaddw.d.w      vr15, vr15, vr15
    vhaddw.d.w      vr16, vr16, vr16
    vpickev.w       vr13, vr14, vr13
    vpickev.w       vr15, vr16, vr15
    vsrai.w         vr13, vr13, 3
    vsrai.w         vr15, vr15, 3
    vmin.w          vr13, vr13, vr20
    vmin.w          vr15, vr15, vr20

    /* eight 32-bit results: two full vectors */
    vst             vr13, a1, 0
    vst             vr15, a1, 16
    addi.d          a1, a1, 32
    addi.d          a5, a5, 32
    addi.d          a4, a4, 64
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DST4
    blt             zero, a2, .REST4
    b               .END
    /* Leftover outputs (fewer than 8), one at a time. */
.REST4:
    li.w            t1, 0
.DST4:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTER4:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTER4
    srai.w          t8, t8, 3
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DST4
    b               .END

/* ---- generic scalar path: any filterSize, any dstW >= 1 -------------- */
.END_DST4:
    li.w            t1, 0
.LOOP_DST1:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTER1:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTER1
    srai.w          t8, t8, 3
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .LOOP_DST1
    b               .END
.END:

    /* Restore s0-s8 and release the frame. */
    ld.d            s0, sp, 0
    ld.d            s1, sp, 8
    ld.d            s2, sp, 16
    ld.d            s3, sp, 24
    ld.d            s4, sp, 32
    ld.d            s5, sp, 40
    ld.d            s6, sp, 48
    ld.d            s7, sp, 56
    ld.d            s8, sp, 64
    addi.d          sp, sp, 72
endfunc
|
|
|
|
/*
 * void ff_hscale_16_to_15_sub_lsx(SwsContext *c, int16_t *dst, int dstW,
 *                                 const uint8_t *src, const int16_t *filter,
 *                                 const int32_t *filterPos, int filterSize,
 *                                 int sh)
 *
 * For each output index i in [0, dstW):
 *     sum    = SUM(j = 0 .. filterSize - 1)
 *                  src16[filterPos[i] + j] * filter[i * filterSize + j]
 *     dst[i] = min(sum >> sh, 32767)
 * Although the prototype types src as uint8_t *, it is read as unsigned
 * 16-bit elements (src16 above): every filterPos value is doubled to form
 * a byte offset. filter is read as signed 16-bit values and each result is
 * stored as a 16-bit value.
 *
 * Registers as used below: a1 = dst, a2 = dstW, a3 = src, a4 = filter,
 * a5 = filterPos, a6 = filterSize, a7 = sh. a0 (c) is never read.
 *
 * Code paths:
 *   filterSize == 4 : 8 outputs per iteration             (.LOOP_HS15_DST4)
 *   filterSize == 8 : 8 outputs per iteration             (.LOOP_HS15_DST8)
 *   filterSize  > 8 : 4 outputs per iteration, 8 taps per inner step plus
 *                     a scalar tail for the leftover taps (.LOOP_HS15)
 *   otherwise       : one output at a time                (.END_HS15_DST4)
 * Every vector loop is followed by a scalar loop for the leftover outputs.
 *
 * The vector loops execute their first iteration before testing the
 * remaining count, so the dispatch only enters them when dstW covers one
 * full iteration. Shorter rows take the scalar path, which keeps every
 * filterPos/filter read and every dst store inside the first dstW entries.
 */
function ff_hscale_16_to_15_sub_lsx
    /* Save the callee-saved registers s0-s8 that are used as scratch. */
    addi.d          sp, sp, -72
    st.d            s0, sp, 0
    st.d            s1, sp, 8
    st.d            s2, sp, 16
    st.d            s3, sp, 24
    st.d            s4, sp, 32
    st.d            s5, sp, 40
    st.d            s6, sp, 48
    st.d            s7, sp, 56
    st.d            s8, sp, 64
    li.w            t0, 32767               /* upper clamp of every output */
    li.w            t8, 8
    li.w            t7, 4
    vreplgr2vr.w    vr20, t0                /* clamp value in all word lanes */
    vreplgr2vr.w    vr0, a7                 /* shift count in all word lanes */

    /* Dispatch on filterSize, but only onto a vector loop whose first
     * (unconditional) iteration fits into dstW. */
    bge             zero, a2, .HS15_END     /* dstW <= 0: nothing to produce */
    blt             a2, t7, .END_HS15_DST4  /* dstW < 4: scalar only */
    blt             t8, a6, .LOOP_HS15      /* filterSize > 8, dstW >= 4 */
    blt             a2, t8, .END_HS15_DST4  /* 8-wide loops need dstW >= 8 */
    beq             a6, t7, .LOOP_HS15_DST4
    beq             a6, t8, .LOOP_HS15_DST8
    b               .END_HS15_DST4

/* ---- filterSize > 8: four outputs per pass --------------------------- */
.LOOP_HS15:
    li.w            t1, 0                   /* row index for the scalar tap tail */
    li.w            s1, 0                   /* taps consumed by the vector loop */
    li.w            s2, 0                   /* scalar partial sums, outputs 0..3 */
    li.w            s3, 0
    li.w            s4, 0
    li.w            s5, 0
    vldi            vr22, 0                 /* vector partial sums, outputs 0..3 */
    addi.w          s0, a6, -7              /* vector loop runs while s1 < filterSize - 7 */
    slli.w          s7, a6, 1               /* byte offset of filter row 1 */
    slli.w          s8, a6, 2               /* byte offset of filter row 2 */
    add.w           t6, s7, s8              /* byte offset of filter row 3 */
.LOOP_HS15_DST:
    ld.w            t2, a5, 0
    ld.w            t3, a5, 4
    ld.w            t4, a5, 8
    ld.w            t5, a5, 12
    /* element index -> byte offset (2 bytes per source sample) */
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    slli.w          t5, t5, 1
    /* 8 source samples for each of the four outputs */
    vldx            vr1, a3, t2
    vldx            vr2, a3, t3
    vldx            vr3, a3, t4
    vldx            vr4, a3, t5
    /* 8 coefficients from each of the four filter rows */
    vld             vr9, a4, 0
    vldx            vr10, a4, s7
    vldx            vr11, a4, s8
    vldx            vr12, a4, t6
    /* unsigned sample * signed coefficient: even lanes, then add odd lanes */
    vmulwev.w.hu.h  vr17, vr1, vr9
    vmulwev.w.hu.h  vr18, vr2, vr10
    vmulwev.w.hu.h  vr19, vr3, vr11
    vmulwev.w.hu.h  vr21, vr4, vr12
    vmaddwod.w.hu.h vr17, vr1, vr9
    vmaddwod.w.hu.h vr18, vr2, vr10
    vmaddwod.w.hu.h vr19, vr3, vr11
    vmaddwod.w.hu.h vr21, vr4, vr12
    /* horizontal add: words -> doublewords -> one total per register */
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    /* gather the four totals into the word lanes of vr1 and accumulate */
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.d         vr1, vr3, vr1
    vadd.w          vr22, vr22, vr1
    addi.w          s1, s1, 8
    addi.d          a3, a3, 16              /* src advances with the taps */
    addi.d          a4, a4, 16
    blt             s1, s0, .LOOP_HS15_DST
    blt             s1, a6, .HS15_DSTA      /* leftover taps: finish them scalar */
    b               .END_HS15_FILTERA
    /* Scalar tap tail. a3/a4 already point at tap s1, so t3 restarts at 0;
     * s6 counts taps up to filterSize and t1 selects the filter row. */
.HS15_DSTA:
    ld.w            t2, a5, 0
    li.w            t3, 0
    move            s6, s1
.HS15_FILTERA:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s2, s2, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS15_FILTERA

    ld.w            t2, a5, 4
    li.w            t3, 0
    move            s6, s1
    addi.w          t1, t1, 1
.HS15_FILTERB:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s3, s3, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS15_FILTERB
    ld.w            t2, a5, 8
    addi.w          t1, t1, 1
    li.w            t3, 0
    move            s6, s1
.HS15_FILTERC:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s4, s4, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS15_FILTERC
    ld.w            t2, a5, 12
    addi.w          t1, t1, 1
    move            s6, s1
    li.w            t3, 0
.HS15_FILTERD:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s5, s5, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .HS15_FILTERD
.END_HS15_FILTERA:
    /* vector partial sums + scalar partial sums */
    vpickve2gr.w    t1, vr22, 0
    vpickve2gr.w    t2, vr22, 1
    vpickve2gr.w    t3, vr22, 2
    vpickve2gr.w    t4, vr22, 3
    add.w           s2, s2, t1
    add.w           s3, s3, t2
    add.w           s4, s4, t3
    add.w           s5, s5, t4
    sra.w           s2, s2, a7
    sra.w           s3, s3, a7
    sra.w           s4, s4, a7
    sra.w           s5, s5, a7
    /* branch-free min(value, t0): keep value if value < t0, else take t0 */
    slt             t1, s2, t0
    slt             t2, s3, t0
    slt             t3, s4, t0
    slt             t4, s5, t0
    maskeqz         s2, s2, t1
    maskeqz         s3, s3, t2
    maskeqz         s4, s4, t3
    maskeqz         s5, s5, t4
    masknez         t1, t0, t1
    masknez         t2, t0, t2
    masknez         t3, t0, t3
    masknez         t4, t0, t4
    or              s2, s2, t1
    or              s3, s3, t2
    or              s4, s4, t3
    or              s5, s5, t4
    st.h            s2, a1, 0
    st.h            s3, a1, 2
    st.h            s4, a1, 4
    st.h            s5, a1, 6

    addi.d          a1, a1, 8
    /* undo the src advance of the tap loop (2 * s1 bytes) */
    sub.d           a3, a3, s1
    sub.d           a3, a3, s1
    addi.d          a5, a5, 16
    /* filter: forward 4 rows (8 * filterSize bytes), minus the 2 * s1
     * bytes the tap loop already advanced */
    slli.d          t3, a6, 3
    add.d           a4, a4, t3
    sub.d           a4, a4, s1
    sub.d           a4, a4, s1
    addi.d          a2, a2, -4
    bge             a2, t7, .LOOP_HS15
    blt             zero, a2, .HS15_RESA
    b               .HS15_END
    /* Leftover outputs (fewer than 4), one at a time. t7/t8 are reused as
     * temporaries here; the vector loop is not re-entered. */
.HS15_RESA:
    li.w            t1, 0
.HS15_DST:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2              /* filterPos[t1] */
    li.w            t3, 0
    li.w            t8, 0                   /* accumulator */
.HS15_FILTER:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .HS15_FILTER
    sra.w           t8, t8, a7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .HS15_DST
    b               .HS15_END

/* ---- filterSize == 8: eight outputs per pass ------------------------- */
.LOOP_HS15_DST8:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    slli.w          t1, t1, 1
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    vldx            vr1, a3, t1
    vldx            vr2, a3, t2
    vldx            vr3, a3, t3
    vldx            vr4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    slli.w          t1, t1, 1
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    vldx            vr5, a3, t1
    vldx            vr6, a3, t2
    vldx            vr7, a3, t3
    vldx            vr8, a3, t4
    /* eight consecutive filter rows of 8 coefficients each */
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vld             vr13, a4, 64
    vld             vr14, a4, 80
    vld             vr15, a4, 96
    vld             vr16, a4, 112

    vmulwev.w.hu.h  vr17, vr1, vr9
    vmulwev.w.hu.h  vr18, vr2, vr10
    vmulwev.w.hu.h  vr19, vr3, vr11
    vmulwev.w.hu.h  vr21, vr4, vr12
    vmaddwod.w.hu.h vr17, vr1, vr9
    vmaddwod.w.hu.h vr18, vr2, vr10
    vmaddwod.w.hu.h vr19, vr3, vr11
    vmaddwod.w.hu.h vr21, vr4, vr12
    vmulwev.w.hu.h  vr1, vr5, vr13
    vmulwev.w.hu.h  vr2, vr6, vr14
    vmulwev.w.hu.h  vr3, vr7, vr15
    vmulwev.w.hu.h  vr4, vr8, vr16
    vmaddwod.w.hu.h vr1, vr5, vr13
    vmaddwod.w.hu.h vr2, vr6, vr14
    vmaddwod.w.hu.h vr3, vr7, vr15
    vmaddwod.w.hu.h vr4, vr8, vr16
    /* outputs 4..7 land in vr5..vr8, outputs 0..3 in vr1..vr4 */
    vhaddw.d.w      vr5, vr1, vr1
    vhaddw.d.w      vr6, vr2, vr2
    vhaddw.d.w      vr7, vr3, vr3
    vhaddw.d.w      vr8, vr4, vr4
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vhaddw.q.d      vr5, vr5, vr5
    vhaddw.q.d      vr6, vr6, vr6
    vhaddw.q.d      vr7, vr7, vr7
    vhaddw.q.d      vr8, vr8, vr8
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.d         vr1, vr3, vr1
    vilvl.d         vr5, vr7, vr5
    vsra.w          vr1, vr1, vr0
    vsra.w          vr5, vr5, vr0
    vmin.w          vr1, vr1, vr20
    vmin.w          vr5, vr5, vr20

    /* keep the low halfword of each word: eight 16-bit results */
    vpickev.h       vr1, vr5, vr1
    vst             vr1, a1, 0
    addi.d          a1, a1, 16
    addi.d          a5, a5, 32
    addi.d          a4, a4, 128
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_HS15_DST8
    blt             zero, a2, .HS15_REST8
    b               .HS15_END
    /* Leftover outputs (fewer than 8), one at a time. */
.HS15_REST8:
    li.w            t1, 0
.HS15_DST8:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.HS15_FILTER8:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .HS15_FILTER8
    sra.w           t8, t8, a7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .HS15_DST8
    b               .HS15_END

/* ---- filterSize == 4: eight outputs per pass ------------------------- */
.LOOP_HS15_DST4:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    slli.w          t1, t1, 1
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    /* 4 source samples (8 bytes) per output */
    fldx.d          f1, a3, t1
    fldx.d          f2, a3, t2
    fldx.d          f3, a3, t3
    fldx.d          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    slli.w          t1, t1, 1
    slli.w          t2, t2, 1
    slli.w          t3, t3, 1
    slli.w          t4, t4, 1
    fldx.d          f5, a3, t1
    fldx.d          f6, a3, t2
    fldx.d          f7, a3, t3
    fldx.d          f8, a3, t4
    /* each 16-byte load holds two filter rows of 4 coefficients */
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    /* pair up the sources of two outputs per register */
    vilvl.d         vr1, vr2, vr1
    vilvl.d         vr3, vr4, vr3
    vilvl.d         vr5, vr6, vr5
    vilvl.d         vr7, vr8, vr7
    vmulwev.w.hu.h  vr13, vr1, vr9
    vmulwev.w.hu.h  vr14, vr3, vr10
    vmulwev.w.hu.h  vr15, vr5, vr11
    vmulwev.w.hu.h  vr16, vr7, vr12
    vmaddwod.w.hu.h vr13, vr1, vr9
    vmaddwod.w.hu.h vr14, vr3, vr10
    vmaddwod.w.hu.h vr15, vr5, vr11
    vmaddwod.w.hu.h vr16, vr7, vr12
    /* one doubleword total per output, two outputs per register */
    vhaddw.d.w      vr13, vr13, vr13
    vhaddw.d.w      vr14, vr14, vr14
    vhaddw.d.w      vr15, vr15, vr15
    vhaddw.d.w      vr16, vr16, vr16
    vpickev.w       vr13, vr14, vr13
    vpickev.w       vr15, vr16, vr15
    vsra.w          vr13, vr13, vr0
    vsra.w          vr15, vr15, vr0
    vmin.w          vr13, vr13, vr20
    vmin.w          vr15, vr15, vr20

    vpickev.h       vr13, vr15, vr13
    vst             vr13, a1, 0
    addi.d          a1, a1, 16
    addi.d          a5, a5, 32
    addi.d          a4, a4, 64
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_HS15_DST4
    blt             zero, a2, .HS15_REST4
    b               .HS15_END
    /* Leftover outputs (fewer than 8), one at a time. */
.HS15_REST4:
    li.w            t1, 0
.HS15_DST4:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.HS15_FILTER4:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .HS15_FILTER4
    sra.w           t8, t8, a7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .HS15_DST4
    b               .HS15_END

/* ---- generic scalar path: any filterSize, any dstW >= 1 -------------- */
.END_HS15_DST4:
    li.w            t1, 0
.LOOP_HS15_DST1:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.HS15_FILTER1:
    add.w           t4, t2, t3
    slli.w          t4, t4, 1
    ldx.hu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .HS15_FILTER1
    sra.w           t8, t8, a7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .LOOP_HS15_DST1
    b               .HS15_END
.HS15_END:

    /* Restore s0-s8 and release the frame. */
    ld.d            s0, sp, 0
    ld.d            s1, sp, 8
    ld.d            s2, sp, 16
    ld.d            s3, sp, 24
    ld.d            s4, sp, 32
    ld.d            s5, sp, 40
    ld.d            s6, sp, 48
    ld.d            s7, sp, 56
    ld.d            s8, sp, 64
    addi.d          sp, sp, 72
endfunc
|
|
|
|
/* void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *dst, int dstW,
 *                                 const uint8_t *src, const int16_t *filter,
 *                                 const int32_t *filterPos, int filterSize, int sh)
 *
 * Horizontal scaler. For every output i in [0, dstW) the code multiplies
 * filterSize unsigned 16-bit source samples, starting at element
 * filterPos[i] of src, with the signed 16-bit coefficients
 * filter[i * filterSize + j], accumulates the products in 32 bits, shifts
 * the sum right arithmetically by sh, limits it to at most 524287
 * (2^19 - 1) and stores it as a 32-bit word (all stores below are word
 * sized, so dst is written as an array of 32-bit values).
 *
 * Registers on entry, following the prototype order (a0 is never read):
 *   a1 = dst        a2 = dstW      a3 = src     a4 = filter
 *   a5 = filterPos  a6 = filterSize             a7 = sh
 *
 * Paths, selected by filterSize:
 *   == 4 : .LOOP_HS19_DST4, 8 outputs per iteration
 *   == 8 : .LOOP_HS19_DST8, 8 outputs per iteration
 *   >  8 : .LOOP_HS19,      4 outputs per iteration, 8 taps per vector step
 *   else : .END_HS19_DST4,  plain scalar loop
 * Outputs left over after the vector loops are finished by scalar loops.
 *
 * NOTE(review): every vector path processes its first group before dstW is
 * compared against the group size; presumably callers provide padded
 * buffers or a large enough dstW - confirm against the C caller.
 */
|
|
function ff_hscale_16_to_19_sub_lsx
|
|
/* Prologue: reserve 72 bytes of stack and save s0-s8 (s0-s8 are used as
 * scratch by the filterSize > 8 path). */
addi.d sp, sp, -72
|
|
st.d s0, sp, 0
|
|
st.d s1, sp, 8
|
|
st.d s2, sp, 16
|
|
st.d s3, sp, 24
|
|
st.d s4, sp, 32
|
|
st.d s5, sp, 40
|
|
st.d s6, sp, 48
|
|
st.d s7, sp, 56
|
|
st.d s8, sp, 64
|
|
|
|
/* t0 = upper clamp value, (1 << 19) - 1 */
li.w t0, 524287
|
|
/* t8 = 8 and t7 = 4: filterSize dispatch values, also reused as the
 * "outputs per iteration" thresholds of the vector loops. Both are
 * overwritten once a scalar loop starts. */
li.w t8, 8
|
|
li.w t7, 4
|
|
/* vr20 = clamp value in every word lane, vr0 = shift count in every lane */
vreplgr2vr.w vr20, t0
|
|
vreplgr2vr.w vr0, a7
|
|
/* Dispatch on filterSize: 4, 8, greater than 8, otherwise scalar */
beq a6, t7, .LOOP_HS19_DST4
|
|
beq a6, t8, .LOOP_HS19_DST8
|
|
blt t8, a6, .LOOP_HS19
|
|
b .END_HS19_DST4
|
|
|
|
/* ---- filterSize > 8: one pass of this loop produces 4 outputs ---- */
.LOOP_HS19:
|
|
/* t1 = filter row index used by the scalar tap tail (0..3) */
li.w t1, 0
|
|
/* s1 = number of taps already consumed by the vector loop */
li.w s1, 0
|
|
/* s2..s5 = scalar tail sums for outputs 0..3 */
li.w s2, 0
|
|
li.w s3, 0
|
|
li.w s4, 0
|
|
li.w s5, 0
|
|
/* vr22 = vector partial sums, one word lane per output */
vldi vr22, 0
|
|
/* s0 = filterSize - 7: the vector loop continues while s1 < s0,
 * i.e. while at least 8 taps remain */
addi.w s0, a6, -7
|
|
/* s7, s8, t6 = byte offsets of filter rows 1, 2, 3 relative to row 0
 * (filterSize * 2, * 4 and * 6 bytes) */
slli.w s7, a6, 1
|
|
slli.w s8, a6, 2
|
|
add.w t6, s7, s8
|
|
/* Vector step: 8 taps for each of the 4 outputs */
.LOOP_HS19_DST:
|
|
/* t2..t5 = filterPos[0..3], converted to byte offsets into src */
ld.w t2, a5, 0
|
|
ld.w t3, a5, 4
|
|
ld.w t4, a5, 8
|
|
ld.w t5, a5, 12
|
|
slli.w t2, t2, 1
|
|
slli.w t3, t3, 1
|
|
slli.w t4, t4, 1
|
|
slli.w t5, t5, 1
|
|
/* vr1..vr4 = 8 source samples for outputs 0..3 */
vldx vr1, a3, t2
|
|
vldx vr2, a3, t3
|
|
vldx vr3, a3, t4
|
|
vldx vr4, a3, t5
|
|
/* vr9..vr12 = 8 coefficients from filter rows 0..3 */
vld vr9, a4, 0
|
|
vldx vr10, a4, s7
|
|
vldx vr11, a4, s8
|
|
vldx vr12, a4, t6
|
|
/* Even-indexed halfwords: unsigned sample * signed coefficient, widened
 * to 32-bit words */
vmulwev.w.hu.h vr17, vr1, vr9
|
|
vmulwev.w.hu.h vr18, vr2, vr10
|
|
vmulwev.w.hu.h vr19, vr3, vr11
|
|
vmulwev.w.hu.h vr21, vr4, vr12
|
|
/* Odd-indexed halfwords: multiply and add onto the even products */
vmaddwod.w.hu.h vr17, vr1, vr9
|
|
vmaddwod.w.hu.h vr18, vr2, vr10
|
|
vmaddwod.w.hu.h vr19, vr3, vr11
|
|
vmaddwod.w.hu.h vr21, vr4, vr12
|
|
/* Horizontal reduction: word pairs -> doublewords -> one 128-bit sum per
 * register; only the low word of each result is used afterwards */
vhaddw.d.w vr1, vr17, vr17
|
|
vhaddw.d.w vr2, vr18, vr18
|
|
vhaddw.d.w vr3, vr19, vr19
|
|
vhaddw.d.w vr4, vr21, vr21
|
|
vhaddw.q.d vr1, vr1, vr1
|
|
vhaddw.q.d vr2, vr2, vr2
|
|
vhaddw.q.d vr3, vr3, vr3
|
|
vhaddw.q.d vr4, vr4, vr4
|
|
/* Pack the four low words into vr1 = { out0, out1, out2, out3 } */
vilvl.w vr1, vr2, vr1
|
|
vilvl.w vr3, vr4, vr3
|
|
vilvl.d vr1, vr3, vr1
|
|
/* Accumulate into the running per-output sums */
vadd.w vr22, vr22, vr1
|
|
/* 8 taps done: advance the tap counter and move src and filter forward
 * by 8 halfwords (16 bytes) */
addi.w s1, s1, 8
|
|
addi.d a3, a3, 16
|
|
addi.d a4, a4, 16
|
|
blt s1, s0, .LOOP_HS19_DST
|
|
/* Fewer than 8 taps remain: run the scalar tail if any are left */
blt s1, a6, .HS19_DSTA
|
|
b .END_HS19_FILTERA
|
|
/* Scalar tap tail for output 0: s6 counts taps from s1 up to filterSize,
 * t3 is the tap offset relative to the already advanced a3/a4 */
.HS19_DSTA:
|
|
ld.w t2, a5, 0
|
|
li.w t3, 0
|
|
move s6, s1
|
|
.HS19_FILTERA:
|
|
/* t5 = src[filterPos[0] + t3] (unsigned 16-bit, relative to advanced a3) */
add.w t4, t2, t3
|
|
slli.w t4, t4, 1
|
|
ldx.hu t5, a3, t4
|
|
/* t6 = filter[filterSize * t1 + t3] (signed 16-bit, relative to advanced a4) */
mul.w t6, a6, t1
|
|
add.w t6, t6, t3
|
|
slli.w t6, t6, 1
|
|
ldx.h t6, a4, t6
|
|
mul.w t6, t5, t6
|
|
add.w s2, s2, t6
|
|
addi.w t3, t3, 1
|
|
addi.w s6, s6, 1
|
|
blt s6, a6, .HS19_FILTERA
|
|
|
|
/* Scalar tap tail for output 1 (filter row t1 = 1), summed into s3 */
ld.w t2, a5, 4
|
|
li.w t3, 0
|
|
move s6, s1
|
|
addi.w t1, t1, 1
|
|
.HS19_FILTERB:
|
|
add.w t4, t2, t3
|
|
slli.w t4, t4, 1
|
|
ldx.hu t5, a3, t4
|
|
mul.w t6, a6, t1
|
|
add.w t6, t6, t3
|
|
slli.w t6, t6, 1
|
|
ldx.h t6, a4, t6
|
|
mul.w t6, t5, t6
|
|
add.w s3, s3, t6
|
|
addi.w t3, t3, 1
|
|
addi.w s6, s6, 1
|
|
blt s6, a6, .HS19_FILTERB
|
|
/* Scalar tap tail for output 2 (filter row t1 = 2), summed into s4 */
ld.w t2, a5, 8
|
|
addi.w t1, t1, 1
|
|
li.w t3, 0
|
|
move s6, s1
|
|
.HS19_FILTERC:
|
|
add.w t4, t2, t3
|
|
slli.w t4, t4, 1
|
|
ldx.hu t5, a3, t4
|
|
mul.w t6, a6, t1
|
|
add.w t6, t6, t3
|
|
slli.w t6, t6, 1
|
|
ldx.h t6, a4, t6
|
|
mul.w t6, t5, t6
|
|
add.w s4, s4, t6
|
|
addi.w t3, t3, 1
|
|
addi.w s6, s6, 1
|
|
blt s6, a6, .HS19_FILTERC
|
|
/* Scalar tap tail for output 3 (filter row t1 = 3), summed into s5 */
ld.w t2, a5, 12
|
|
addi.w t1, t1, 1
|
|
move s6, s1
|
|
li.w t3, 0
|
|
.HS19_FILTERD:
|
|
add.w t4, t2, t3
|
|
slli.w t4, t4, 1
|
|
ldx.hu t5, a3, t4
|
|
mul.w t6, a6, t1
|
|
add.w t6, t6, t3
|
|
slli.w t6, t6, 1
|
|
ldx.h t6, a4, t6
|
|
mul.w t6, t5, t6
|
|
add.w s5, s5, t6
|
|
addi.w t3, t3, 1
|
|
addi.w s6, s6, 1
|
|
blt s6, a6, .HS19_FILTERD
|
|
/* Combine vector and scalar partial sums, shift, clamp and store 4 outputs */
.END_HS19_FILTERA:
|
|
/* t1..t4 = word lanes 0..3 of the vector accumulator */
vpickve2gr.w t1, vr22, 0
|
|
vpickve2gr.w t2, vr22, 1
|
|
vpickve2gr.w t3, vr22, 2
|
|
vpickve2gr.w t4, vr22, 3
|
|
add.w s2, s2, t1
|
|
add.w s3, s3, t2
|
|
add.w s4, s4, t3
|
|
add.w s5, s5, t4
|
|
/* Arithmetic right shift by sh */
sra.w s2, s2, a7
|
|
sra.w s3, s3, a7
|
|
sra.w s4, s4, a7
|
|
sra.w s5, s5, a7
|
|
/* Branch-free min(value, t0): tN = (value < t0); keep the value where tN
 * is set, substitute t0 where it is clear, then OR the two halves */
slt t1, s2, t0
|
|
slt t2, s3, t0
|
|
slt t3, s4, t0
|
|
slt t4, s5, t0
|
|
maskeqz s2, s2, t1
|
|
maskeqz s3, s3, t2
|
|
maskeqz s4, s4, t3
|
|
maskeqz s5, s5, t4
|
|
masknez t1, t0, t1
|
|
masknez t2, t0, t2
|
|
masknez t3, t0, t3
|
|
masknez t4, t0, t4
|
|
or s2, s2, t1
|
|
or s3, s3, t2
|
|
or s4, s4, t3
|
|
or s5, s5, t4
|
|
/* Store the 4 results as 32-bit words */
st.w s2, a1, 0
|
|
st.w s3, a1, 4
|
|
st.w s4, a1, 8
|
|
st.w s5, a1, 12
|
|
|
|
/* Advance dst by 4 words */
addi.d a1, a1, 16
|
|
/* Rewind src by the s1 halfwords (2 * s1 bytes) consumed by the vector loop */
sub.d a3, a3, s1
|
|
sub.d a3, a3, s1
|
|
/* Advance filterPos by 4 entries */
addi.d a5, a5, 16
|
|
/* Advance filter by 4 rows (filterSize * 8 bytes) and undo the 2 * s1
 * byte advance made by the vector loop */
slli.d t3, a6, 3
|
|
add.d a4, a4, t3
|
|
sub.d a4, a4, s1
|
|
sub.d a4, a4, s1
|
|
/* 4 outputs done; repeat while at least 4 (t7) remain */
addi.d a2, a2, -4
|
|
bge a2, t7, .LOOP_HS19
|
|
blt zero, a2, .HS19_RESA
|
|
b .HS19_END
|
|
/* Scalar loop for the 1..3 outputs left over by the filterSize > 8 path */
.HS19_RESA:
|
|
/* t1 = output index */
li.w t1, 0
|
|
.HS19_DST:
|
|
/* t2 = filterPos[t1] */
slli.w t2, t1, 2
|
|
ldx.w t2, a5, t2
|
|
/* t3 = tap index, t8 = running sum (t7/t8 are scratch from here on) */
li.w t3, 0
|
|
li.w t8, 0
|
|
.HS19_FILTER:
|
|
/* t5 = src[filterPos[t1] + t3], unsigned 16-bit */
add.w t4, t2, t3
|
|
slli.w t4, t4, 1
|
|
ldx.hu t5, a3, t4
|
|
/* t7 = filter[filterSize * t1 + t3], signed 16-bit */
mul.w t6, a6, t1
|
|
add.w t6, t6, t3
|
|
slli.w t7, t6, 1
|
|
ldx.h t7, a4, t7
|
|
mul.w t7, t5, t7
|
|
add.w t8, t8, t7
|
|
addi.w t3, t3, 1
|
|
blt t3, a6, .HS19_FILTER
|
|
/* Shift, clamp to t0 (same branch-free min as above), store dst word t1 */
sra.w t8, t8, a7
|
|
slt t5, t8, t0
|
|
maskeqz t8, t8, t5
|
|
masknez t5, t0, t5
|
|
or t8, t8, t5
|
|
slli.w t4, t1, 2
|
|
stx.w t8, a1, t4
|
|
addi.w t1, t1, 1
|
|
blt t1, a2, .HS19_DST
|
|
b .HS19_END
|
|
|
|
/* ---- filterSize == 8: one pass of this loop produces 8 outputs ---- */
.LOOP_HS19_DST8:
|
|
/* filterPos[0..3] as byte offsets, then 8 samples each into vr1..vr4 */
ld.w t1, a5, 0
|
|
ld.w t2, a5, 4
|
|
ld.w t3, a5, 8
|
|
ld.w t4, a5, 12
|
|
slli.w t1, t1, 1
|
|
slli.w t2, t2, 1
|
|
slli.w t3, t3, 1
|
|
slli.w t4, t4, 1
|
|
vldx vr1, a3, t1
|
|
vldx vr2, a3, t2
|
|
vldx vr3, a3, t3
|
|
vldx vr4, a3, t4
|
|
/* filterPos[4..7] as byte offsets, then 8 samples each into vr5..vr8 */
ld.w t1, a5, 16
|
|
ld.w t2, a5, 20
|
|
ld.w t3, a5, 24
|
|
ld.w t4, a5, 28
|
|
slli.w t1, t1, 1
|
|
slli.w t2, t2, 1
|
|
slli.w t3, t3, 1
|
|
slli.w t4, t4, 1
|
|
vldx vr5, a3, t1
|
|
vldx vr6, a3, t2
|
|
vldx vr7, a3, t3
|
|
vldx vr8, a3, t4
|
|
/* vr9..vr16 = the 8 filter rows (8 coefficients = 16 bytes per row) */
vld vr9, a4, 0
|
|
vld vr10, a4, 16
|
|
vld vr11, a4, 32
|
|
vld vr12, a4, 48
|
|
vld vr13, a4, 64
|
|
vld vr14, a4, 80
|
|
vld vr15, a4, 96
|
|
vld vr16, a4, 112
|
|
/* Outputs 0..3: even-lane products, then odd-lane multiply-add */
vmulwev.w.hu.h vr17, vr1, vr9
|
|
vmulwev.w.hu.h vr18, vr2, vr10
|
|
vmulwev.w.hu.h vr19, vr3, vr11
|
|
vmulwev.w.hu.h vr21, vr4, vr12
|
|
vmaddwod.w.hu.h vr17, vr1, vr9
|
|
vmaddwod.w.hu.h vr18, vr2, vr10
|
|
vmaddwod.w.hu.h vr19, vr3, vr11
|
|
vmaddwod.w.hu.h vr21, vr4, vr12
|
|
/* Outputs 4..7: same, results overwrite vr1..vr4 */
vmulwev.w.hu.h vr1, vr5, vr13
|
|
vmulwev.w.hu.h vr2, vr6, vr14
|
|
vmulwev.w.hu.h vr3, vr7, vr15
|
|
vmulwev.w.hu.h vr4, vr8, vr16
|
|
vmaddwod.w.hu.h vr1, vr5, vr13
|
|
vmaddwod.w.hu.h vr2, vr6, vr14
|
|
vmaddwod.w.hu.h vr3, vr7, vr15
|
|
vmaddwod.w.hu.h vr4, vr8, vr16
|
|
/* Horizontal reduction; afterwards vr1..vr4 hold outputs 0..3 and
 * vr5..vr8 hold outputs 4..7 (low word of each register) */
vhaddw.d.w vr5, vr1, vr1
|
|
vhaddw.d.w vr6, vr2, vr2
|
|
vhaddw.d.w vr7, vr3, vr3
|
|
vhaddw.d.w vr8, vr4, vr4
|
|
vhaddw.d.w vr1, vr17, vr17
|
|
vhaddw.d.w vr2, vr18, vr18
|
|
vhaddw.d.w vr3, vr19, vr19
|
|
vhaddw.d.w vr4, vr21, vr21
|
|
vhaddw.q.d vr1, vr1, vr1
|
|
vhaddw.q.d vr2, vr2, vr2
|
|
vhaddw.q.d vr3, vr3, vr3
|
|
vhaddw.q.d vr4, vr4, vr4
|
|
vhaddw.q.d vr5, vr5, vr5
|
|
vhaddw.q.d vr6, vr6, vr6
|
|
vhaddw.q.d vr7, vr7, vr7
|
|
vhaddw.q.d vr8, vr8, vr8
|
|
/* Pack: vr1 = { out0..out3 }, vr5 = { out4..out7 } */
vilvl.w vr1, vr2, vr1
|
|
vilvl.w vr3, vr4, vr3
|
|
vilvl.w vr5, vr6, vr5
|
|
vilvl.w vr7, vr8, vr7
|
|
vilvl.d vr1, vr3, vr1
|
|
vilvl.d vr5, vr7, vr5
|
|
/* Shift right by sh and clamp to the maximum in vr20 */
vsra.w vr1, vr1, vr0
|
|
vsra.w vr5, vr5, vr0
|
|
vmin.w vr1, vr1, vr20
|
|
vmin.w vr5, vr5, vr20
|
|
|
|
/* Store 8 words, then advance dst (32 bytes), filterPos (8 entries) and
 * filter (8 rows of 16 bytes) */
vst vr1, a1, 0
|
|
vst vr5, a1, 16
|
|
addi.d a1, a1, 32
|
|
addi.d a5, a5, 32
|
|
addi.d a4, a4, 128
|
|
/* 8 outputs done; repeat while at least 8 (t8) remain */
addi.d a2, a2, -8
|
|
bge a2, t8, .LOOP_HS19_DST8
|
|
blt zero, a2, .HS19_REST8
|
|
b .HS19_END
|
|
/* Scalar loop for the 1..7 outputs left over by the filterSize == 8 path */
.HS19_REST8:
|
|
/* t1 = output index */
li.w t1, 0
|
|
.HS19_DST8:
|
|
/* t2 = filterPos[t1]; t3 = tap index; t8 = running sum */
slli.w t2, t1, 2
|
|
ldx.w t2, a5, t2
|
|
li.w t3, 0
|
|
li.w t8, 0
|
|
.HS19_FILTER8:
|
|
/* t8 += src[filterPos[t1] + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
|
|
slli.w t4, t4, 1
|
|
ldx.hu t5, a3, t4
|
|
mul.w t6, a6, t1
|
|
add.w t6, t6, t3
|
|
slli.w t7, t6, 1
|
|
ldx.h t7, a4, t7
|
|
mul.w t7, t5, t7
|
|
add.w t8, t8, t7
|
|
addi.w t3, t3, 1
|
|
blt t3, a6, .HS19_FILTER8
|
|
/* Shift, branch-free min with t0, store dst word t1 */
sra.w t8, t8, a7
|
|
slt t5, t8, t0
|
|
maskeqz t8, t8, t5
|
|
masknez t5, t0, t5
|
|
or t8, t8, t5
|
|
slli.w t4, t1, 2
|
|
stx.w t8, a1, t4
|
|
addi.w t1, t1, 1
|
|
blt t1, a2, .HS19_DST8
|
|
b .HS19_END
|
|
|
|
/* ---- filterSize == 4: one pass of this loop produces 8 outputs ---- */
.LOOP_HS19_DST4:
|
|
/* filterPos[0..3] as byte offsets */
ld.w t1, a5, 0
|
|
ld.w t2, a5, 4
|
|
ld.w t3, a5, 8
|
|
ld.w t4, a5, 12
|
|
slli.w t1, t1, 1
|
|
slli.w t2, t2, 1
|
|
slli.w t3, t3, 1
|
|
slli.w t4, t4, 1
|
|
/* 8-byte loads: 4 samples per output, read through f1..f4 and used below
 * as vr1..vr4 (f<n> being the low 64 bits of vr<n>) */
fldx.d f1, a3, t1
|
|
fldx.d f2, a3, t2
|
|
fldx.d f3, a3, t3
|
|
fldx.d f4, a3, t4
|
|
/* filterPos[4..7] as byte offsets, samples for outputs 4..7 into f5..f8 */
ld.w t1, a5, 16
|
|
ld.w t2, a5, 20
|
|
ld.w t3, a5, 24
|
|
ld.w t4, a5, 28
|
|
slli.w t1, t1, 1
|
|
slli.w t2, t2, 1
|
|
slli.w t3, t3, 1
|
|
slli.w t4, t4, 1
|
|
fldx.d f5, a3, t1
|
|
fldx.d f6, a3, t2
|
|
fldx.d f7, a3, t3
|
|
fldx.d f8, a3, t4
|
|
/* Each 16-byte load covers two consecutive 4-tap filter rows */
vld vr9, a4, 0
|
|
vld vr10, a4, 16
|
|
vld vr11, a4, 32
|
|
vld vr12, a4, 48
|
|
/* Pair up samples so each register holds two outputs: low half = even
 * output, high half = odd output */
vilvl.d vr1, vr2, vr1
|
|
vilvl.d vr3, vr4, vr3
|
|
vilvl.d vr5, vr6, vr5
|
|
vilvl.d vr7, vr8, vr7
|
|
/* Even-lane products, then odd-lane multiply-add: 2 partial words per output */
vmulwev.w.hu.h vr13, vr1, vr9
|
|
vmulwev.w.hu.h vr14, vr3, vr10
|
|
vmulwev.w.hu.h vr15, vr5, vr11
|
|
vmulwev.w.hu.h vr16, vr7, vr12
|
|
vmaddwod.w.hu.h vr13, vr1, vr9
|
|
vmaddwod.w.hu.h vr14, vr3, vr10
|
|
vmaddwod.w.hu.h vr15, vr5, vr11
|
|
vmaddwod.w.hu.h vr16, vr7, vr12
|
|
/* Add each word pair: one doubleword sum per output */
vhaddw.d.w vr13, vr13, vr13
|
|
vhaddw.d.w vr14, vr14, vr14
|
|
vhaddw.d.w vr15, vr15, vr15
|
|
vhaddw.d.w vr16, vr16, vr16
|
|
/* Keep the low word of every sum: vr13 = { out0..out3 }, vr15 = { out4..out7 } */
vpickev.w vr13, vr14, vr13
|
|
vpickev.w vr15, vr16, vr15
|
|
/* Shift right by sh and clamp to the maximum in vr20 */
vsra.w vr13, vr13, vr0
|
|
vsra.w vr15, vr15, vr0
|
|
vmin.w vr13, vr13, vr20
|
|
vmin.w vr15, vr15, vr20
|
|
|
|
/* Store 8 words, then advance dst (32 bytes), filterPos (8 entries) and
 * filter (8 rows of 8 bytes) */
vst vr13, a1, 0
|
|
vst vr15, a1, 16
|
|
addi.d a1, a1, 32
|
|
addi.d a5, a5, 32
|
|
addi.d a4, a4, 64
|
|
/* 8 outputs done; repeat while at least 8 (t8) remain */
addi.d a2, a2, -8
|
|
bge a2, t8, .LOOP_HS19_DST4
|
|
blt zero, a2, .HS19_REST4
|
|
b .HS19_END
|
|
/* Scalar loop for the 1..7 outputs left over by the filterSize == 4 path */
.HS19_REST4:
|
|
/* t1 = output index */
li.w t1, 0
|
|
.HS19_DST4:
|
|
/* t2 = filterPos[t1]; t3 = tap index; t8 = running sum */
slli.w t2, t1, 2
|
|
ldx.w t2, a5, t2
|
|
li.w t3, 0
|
|
li.w t8, 0
|
|
.HS19_FILTER4:
|
|
/* t8 += src[filterPos[t1] + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
|
|
slli.w t4, t4, 1
|
|
ldx.hu t5, a3, t4
|
|
mul.w t6, a6, t1
|
|
add.w t6, t6, t3
|
|
slli.w t7, t6, 1
|
|
ldx.h t7, a4, t7
|
|
mul.w t7, t5, t7
|
|
add.w t8, t8, t7
|
|
addi.w t3, t3, 1
|
|
blt t3, a6, .HS19_FILTER4
|
|
/* Shift, branch-free min with t0, store dst word t1 */
sra.w t8, t8, a7
|
|
slt t5, t8, t0
|
|
maskeqz t8, t8, t5
|
|
masknez t5, t0, t5
|
|
or t8, t8, t5
|
|
slli.w t4, t1, 2
|
|
stx.w t8, a1, t4
|
|
addi.w t1, t1, 1
|
|
blt t1, a2, .HS19_DST4
|
|
b .HS19_END
|
|
/* ---- Generic scalar path: reached when filterSize is neither 4, 8 nor
 * greater than 8 ---- */
.END_HS19_DST4:
|
|
|
|
/* t1 = output index */
li.w t1, 0
|
|
.LOOP_HS19_DST1:
|
|
/* t2 = filterPos[t1]; t3 = tap index; t8 = running sum */
slli.w t2, t1, 2
|
|
ldx.w t2, a5, t2
|
|
li.w t3, 0
|
|
li.w t8, 0
|
|
.HS19_FILTER1:
|
|
/* t8 += src[filterPos[t1] + t3] * filter[filterSize * t1 + t3] */
add.w t4, t2, t3
|
|
slli.w t4, t4, 1
|
|
ldx.hu t5, a3, t4
|
|
mul.w t6, a6, t1
|
|
add.w t6, t6, t3
|
|
slli.w t7, t6, 1
|
|
ldx.h t7, a4, t7
|
|
mul.w t7, t5, t7
|
|
add.w t8, t8, t7
|
|
addi.w t3, t3, 1
|
|
blt t3, a6, .HS19_FILTER1
|
|
/* Shift, branch-free min with t0, store dst word t1 */
sra.w t8, t8, a7
|
|
slt t5, t8, t0
|
|
maskeqz t8, t8, t5
|
|
masknez t5, t0, t5
|
|
or t8, t8, t5
|
|
slli.w t4, t1, 2
|
|
stx.w t8, a1, t4
|
|
addi.w t1, t1, 1
|
|
blt t1, a2, .LOOP_HS19_DST1
|
|
/* NOTE(review): this branch targets the label that immediately follows */
b .HS19_END
|
|
/* Epilogue: restore s0-s8 and release the 72-byte frame */
.HS19_END:
|
|
|
|
ld.d s0, sp, 0
|
|
ld.d s1, sp, 8
|
|
ld.d s2, sp, 16
|
|
ld.d s3, sp, 24
|
|
ld.d s4, sp, 32
|
|
ld.d s5, sp, 40
|
|
ld.d s6, sp, 48
|
|
ld.d s7, sp, 56
|
|
ld.d s8, sp, 64
|
|
addi.d sp, sp, 72
|
|
/* No explicit return instruction is visible here; presumably the endfunc
 * macro from the included loongson_asm.S emits it - confirm. */
endfunc
|