2023-05-25 09:24:30 +02:00
|
|
|
/*
|
|
|
|
* Loongson LSX optimized swscale
|
|
|
|
*
|
|
|
|
* Copyright (c) 2023 Loongson Technology Corporation Limited
|
|
|
|
* Contributed by Lu Wang <wanglu@loongson.cn>
|
|
|
|
*
|
|
|
|
* This file is part of FFmpeg.
|
|
|
|
*
|
|
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "libavcodec/loongarch/loongson_asm.S"
|
|
|
|
|
swscale: rename SwsContext to SwsInternal
And preserve the public SwsContext as separate name. The motivation here
is that I want to turn SwsContext into a public struct, while keeping the
internal implementation hidden. Additionally, I also want to be able to
use multiple internal implementations, e.g. for GPU devices.
This commit does not include any functional changes. For the most part, it is
a simple rename. The only complications arise from the public facing API
functions, which preserve their current type (and hence require an additional
unwrapping step internally), and the checkasm test framework, which directly
accesses SwsInternal.
For consistency, the affected functions that need to maintain a distinction
have generally been changed to refer to the SwsContext as *sws, and the
SwsInternal as *c.
In an upcoming commit, I will provide a backing definition for the public
SwsContext, and update `sws_internal()` to dereference the internal struct
instead of merely casting it.
Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
2024-10-09 19:44:33 +02:00
|
|
|
/* void ff_hscale_8_to_15_lsx(SwsInternal *c, int16_t *dst, int dstW,
|
2023-05-25 09:24:30 +02:00
|
|
|
* const uint8_t *src, const int16_t *filter,
|
|
|
|
* const int32_t *filterPos, int filterSize)
|
|
|
|
*/
|
|
|
|
function ff_hscale_8_to_15_lsx
    /*
     * Horizontal scaling of unsigned 8-bit samples into 16-bit outputs.
     *
     * Registers as used by the code below:
     *   a1 = dst        (stored as 16-bit values)
     *   a2 = dstW       (number of outputs to produce)
     *   a3 = src        (read as unsigned bytes)
     *   a4 = filter     (read as signed 16-bit coefficients,
     *                    filterSize coefficients per output)
     *   a5 = filterPos  (read as 32-bit source offsets, one per output)
     *   a6 = filterSize
     * a0 (c) is never read.
     *
     * For every output i:
     *   sum    = SUM_j src[filterPos[i] + j] * filter[filterSize * i + j]
     *   dst[i] = min(sum >> 7, 32767)
     *
     * Three vector paths exist (filterSize == 4, filterSize == 8 and
     * filterSize > 8); every other filterSize and every leftover output is
     * handled by a scalar loop.
     *
     * The vector loops are "do ... while" loops that produce 8 (filterSize
     * 4 or 8) or 4 (filterSize > 8) outputs per iteration.  Rows shorter
     * than one iteration are therefore routed straight to the scalar
     * remainder loops so that nothing is read or written past dstW.
     */

    /* Save s0-s8.  Only 72 bytes are needed; 80 keeps sp 16-byte aligned. */
    addi.d          sp, sp, -80
    st.d            s0, sp, 0
    st.d            s1, sp, 8
    st.d            s2, sp, 16
    st.d            s3, sp, 24
    st.d            s4, sp, 32
    st.d            s5, sp, 40
    st.d            s6, sp, 48
    st.d            s7, sp, 56
    st.d            s8, sp, 64

    li.w            t0, 32767           /* upper clamp */
    li.w            t8, 8
    li.w            t7, 4
    vldi            vr0, 0              /* zero vector for byte widening */
    vreplgr2vr.w    vr20, t0            /* upper clamp in every word lane */

    /* Nothing to do for an empty row. */
    bge             zero, a2, .END_LOOP

    /* Dispatch on filterSize; short rows take the scalar loops directly. */
    bne             a6, t7, 1f
    blt             a2, t8, .RES4       /* filterSize == 4, dstW < 8 */
    b               .LOOP_DSTW4
1:
    bne             a6, t8, 2f
    blt             a2, t8, .RES8       /* filterSize == 8, dstW < 8 */
    b               .LOOP_DSTW8
2:
    bge             t8, a6, .END_DSTW4  /* filterSize < 8: scalar only */
    blt             a2, t7, .RES        /* filterSize > 8, dstW < 4 */

    /* ---- filterSize > 8: four outputs per outer iteration ---- */
.LOOP_START:
    li.w            t1, 0               /* filter row index for scalar tail */
    li.w            s1, 0               /* taps consumed by the vector loop */
    li.w            s2, 0               /* scalar tail sums, outputs 0..3 */
    li.w            s3, 0
    li.w            s4, 0
    li.w            s5, 0
    vldi            vr22, 0             /* vector sums, one word per output */
    addi.w          s0, a6, -7          /* vector loop runs while s1 < s0 */
    slli.w          s7, a6, 1           /* byte offset of filter row 1 */
    slli.w          s8, a6, 2           /* byte offset of filter row 2 */
    add.w           t6, s7, s8          /* byte offset of filter row 3 */
.LOOP_DSTW:
    /* Eight taps for each of the four outputs. */
    ld.w            t2, a5, 0
    ld.w            t3, a5, 4
    ld.w            t4, a5, 8
    ld.w            t5, a5, 12
    fldx.d          f1, a3, t2          /* f1-f4 are the low halves of vr1-vr4 */
    fldx.d          f2, a3, t3
    fldx.d          f3, a3, t4
    fldx.d          f4, a3, t5
    vld             vr9, a4, 0
    vldx            vr10, a4, s7
    vldx            vr11, a4, s8
    vldx            vr12, a4, t6
    /* Widen the 8 source bytes to halfwords by interleaving with zero. */
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    /* vdp2.w.h comes from the included loongson_asm.S. */
    vdp2.w.h        vr17, vr1, vr9
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    /* Horizontal reduction of each product vector. */
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    /* Gather the four sums into one vector and accumulate. */
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.d         vr1, vr3, vr1
    vadd.w          vr22, vr22, vr1
    addi.w          s1, s1, 8
    addi.d          a3, a3, 8
    addi.d          a4, a4, 16
    blt             s1, s0, .LOOP_DSTW
    blt             s1, a6, .DSTWA      /* fewer than 8 taps left */
    b               .END_FILTER

    /*
     * Scalar tail for the remaining (filterSize - s1) taps.  a3 and a4 were
     * already advanced past the s1 taps handled above, so indices restart
     * at 0; t1 selects the filter row, s6 counts taps up to filterSize.
     */
.DSTWA:
    ld.w            t2, a5, 0
    li.w            t3, 0
    move            s6, s1
.FILTERSIZEA:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s2, s2, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZEA

    ld.w            t2, a5, 4
    li.w            t3, 0
    move            s6, s1
    addi.w          t1, t1, 1
.FILTERSIZEB:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s3, s3, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZEB
    ld.w            t2, a5, 8
    addi.w          t1, t1, 1
    li.w            t3, 0
    move            s6, s1
.FILTERSIZEC:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s4, s4, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZEC
    ld.w            t2, a5, 12
    addi.w          t1, t1, 1
    move            s6, s1
    li.w            t3, 0
.FILTERSIZED:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s5, s5, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERSIZED
.END_FILTER:
    /* Add vector and scalar partial sums, scale, clamp and store. */
    vpickve2gr.w    t1, vr22, 0
    vpickve2gr.w    t2, vr22, 1
    vpickve2gr.w    t3, vr22, 2
    vpickve2gr.w    t4, vr22, 3
    add.w           s2, s2, t1
    add.w           s3, s3, t2
    add.w           s4, s4, t3
    add.w           s5, s5, t4
    srai.w          s2, s2, 7
    srai.w          s3, s3, 7
    srai.w          s4, s4, 7
    srai.w          s5, s5, 7
    /* sN = (sN < t0) ? sN : t0 */
    slt             t1, s2, t0
    slt             t2, s3, t0
    slt             t3, s4, t0
    slt             t4, s5, t0
    maskeqz         s2, s2, t1
    maskeqz         s3, s3, t2
    maskeqz         s4, s4, t3
    maskeqz         s5, s5, t4
    masknez         t1, t0, t1
    masknez         t2, t0, t2
    masknez         t3, t0, t3
    masknez         t4, t0, t4
    or              s2, s2, t1
    or              s3, s3, t2
    or              s4, s4, t3
    or              s5, s5, t4
    st.h            s2, a1, 0
    st.h            s3, a1, 2
    st.h            s4, a1, 4
    st.h            s5, a1, 6

    addi.d          a1, a1, 8
    sub.d           a3, a3, s1          /* undo the src advance of the tap loop */
    addi.d          a5, a5, 16
    /* filter += 4 rows (8 * filterSize bytes) minus the 2 * s1 already added */
    slli.d          t3, a6, 3
    add.d           a4, a4, t3
    sub.d           a4, a4, s1
    sub.d           a4, a4, s1
    addi.d          a2, a2, -4
    bge             a2, t7, .LOOP_START
    blt             zero, a2, .RES
    b               .END_LOOP
    /* Scalar loop for the last a2 (< 4) outputs; t7/t8 become scratch. */
.RES:
    li.w            t1, 0
.DSTW:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTERSIZE:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTERSIZE
    srai.w          t8, t8, 7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DSTW
    b               .END_LOOP

    /* ---- filterSize == 8: eight outputs per iteration ---- */
.LOOP_DSTW8:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    fldx.d          f1, a3, t1
    fldx.d          f2, a3, t2
    fldx.d          f3, a3, t3
    fldx.d          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.d          f5, a3, t1
    fldx.d          f6, a3, t2
    fldx.d          f7, a3, t3
    fldx.d          f8, a3, t4
    /* Eight filter rows of 8 coefficients (16 bytes) each. */
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vld             vr13, a4, 64
    vld             vr14, a4, 80
    vld             vr15, a4, 96
    vld             vr16, a4, 112
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr6, vr0, vr6
    vilvl.b         vr7, vr0, vr7
    vilvl.b         vr8, vr0, vr8

    vdp2.w.h        vr17, vr1, vr9
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    vdp2.w.h        vr1, vr5, vr13
    vdp2.w.h        vr2, vr6, vr14
    vdp2.w.h        vr3, vr7, vr15
    vdp2.w.h        vr4, vr8, vr16
    /* Outputs 4..7 end up in vr5-vr8, outputs 0..3 in vr1-vr4. */
    vhaddw.d.w      vr5, vr1, vr1
    vhaddw.d.w      vr6, vr2, vr2
    vhaddw.d.w      vr7, vr3, vr3
    vhaddw.d.w      vr8, vr4, vr4
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vhaddw.q.d      vr5, vr5, vr5
    vhaddw.q.d      vr6, vr6, vr6
    vhaddw.q.d      vr7, vr7, vr7
    vhaddw.q.d      vr8, vr8, vr8
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.d         vr1, vr3, vr1
    vilvl.d         vr5, vr7, vr5
    vsrai.w         vr1, vr1, 7
    vsrai.w         vr5, vr5, 7
    vmin.w          vr1, vr1, vr20
    vmin.w          vr5, vr5, vr20

    /* Narrow the eight words to halfwords and store 16 bytes. */
    vpickev.h       vr1, vr5, vr1
    vst             vr1, a1, 0
    addi.d          a1, a1, 16
    addi.d          a5, a5, 32
    addi.d          a4, a4, 128
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DSTW8
    blt             zero, a2, .RES8
    b               .END_LOOP
    /* Scalar loop for the last a2 (< 8) outputs. */
.RES8:
    li.w            t1, 0
.DSTW8:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTERSIZE8:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTERSIZE8
    srai.w          t8, t8, 7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DSTW8
    b               .END_LOOP

    /* ---- filterSize == 4: eight outputs per iteration ---- */
.LOOP_DSTW4:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    fldx.s          f1, a3, t1          /* 4 source bytes per output */
    fldx.s          f2, a3, t2
    fldx.s          f3, a3, t3
    fldx.s          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.s          f5, a3, t1
    fldx.s          f6, a3, t2
    fldx.s          f7, a3, t3
    fldx.s          f8, a3, t4
    /* Each 16-byte filter vector holds the rows of two outputs. */
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    /* Pair up the sources of two neighbouring outputs, then widen. */
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr7, vr0, vr7

    vdp2.w.h        vr13, vr1, vr9
    vdp2.w.h        vr14, vr3, vr10
    vdp2.w.h        vr15, vr5, vr11
    vdp2.w.h        vr16, vr7, vr12
    vhaddw.d.w      vr13, vr13, vr13
    vhaddw.d.w      vr14, vr14, vr14
    vhaddw.d.w      vr15, vr15, vr15
    vhaddw.d.w      vr16, vr16, vr16
    vpickev.w       vr13, vr14, vr13
    vpickev.w       vr15, vr16, vr15
    vsrai.w         vr13, vr13, 7
    vsrai.w         vr15, vr15, 7
    vmin.w          vr13, vr13, vr20
    vmin.w          vr15, vr15, vr20

    vpickev.h       vr13, vr15, vr13
    vst             vr13, a1, 0
    addi.d          a1, a1, 16
    addi.d          a5, a5, 32
    addi.d          a4, a4, 64
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DSTW4
    blt             zero, a2, .RES4
    b               .END_LOOP
    /* Scalar loop for the last a2 (< 8) outputs. */
.RES4:
    li.w            t1, 0
.DSTW4:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTERSIZE4:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTERSIZE4
    srai.w          t8, t8, 7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DSTW4
    b               .END_LOOP
    /* ---- any other filterSize (< 8, != 4): fully scalar ---- */
.END_DSTW4:

    li.w            t1, 0
.LOOP_DSTW1:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTERSIZE1:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTERSIZE1
    srai.w          t8, t8, 7
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 1
    stx.h           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .LOOP_DSTW1
    b               .END_LOOP
.END_LOOP:

    /* Restore s0-s8 and release the frame. */
    ld.d            s0, sp, 0
    ld.d            s1, sp, 8
    ld.d            s2, sp, 16
    ld.d            s3, sp, 24
    ld.d            s4, sp, 32
    ld.d            s5, sp, 40
    ld.d            s6, sp, 48
    ld.d            s7, sp, 56
    ld.d            s8, sp, 64
    addi.d          sp, sp, 80
endfunc
|
|
|
|
|
swscale: rename SwsContext to SwsInternal
And preserve the public SwsContext as separate name. The motivation here
is that I want to turn SwsContext into a public struct, while keeping the
internal implementation hidden. Additionally, I also want to be able to
use multiple internal implementations, e.g. for GPU devices.
This commit does not include any functional changes. For the most part, it is
a simple rename. The only complications arise from the public facing API
functions, which preserve their current type (and hence require an additional
unwrapping step internally), and the checkasm test framework, which directly
accesses SwsInternal.
For consistency, the affected functions that need to maintain a distinction
have generally been changed to refer to the SwsContext as *sws, and the
SwsInternal as *c.
In an upcoming commit, I will provide a backing definition for the public
SwsContext, and update `sws_internal()` to dereference the internal struct
instead of merely casting it.
Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
2024-10-09 19:44:33 +02:00
|
|
|
/* void ff_hscale_8_to_19_lsx(SwsInternal *c, int16_t *dst, int dstW,
|
2023-05-25 09:24:30 +02:00
|
|
|
* const uint8_t *src, const int16_t *filter,
|
|
|
|
* const int32_t *filterPos, int filterSize)
|
|
|
|
*/
|
|
|
|
function ff_hscale_8_to_19_lsx
    /*
     * Horizontal scaling of unsigned 8-bit samples into 32-bit outputs.
     *
     * Registers as used by the code below:
     *   a1 = dst        (stored as 32-bit words)
     *   a2 = dstW       (number of outputs to produce)
     *   a3 = src        (read as unsigned bytes)
     *   a4 = filter     (read as signed 16-bit coefficients,
     *                    filterSize coefficients per output)
     *   a5 = filterPos  (read as 32-bit source offsets, one per output)
     *   a6 = filterSize
     * a0 (c) is never read.
     *
     * For every output i:
     *   sum    = SUM_j src[filterPos[i] + j] * filter[filterSize * i + j]
     *   dst[i] = min(sum >> 3, 524287)
     *
     * Three vector paths exist (filterSize == 4, filterSize == 8 and
     * filterSize > 8); every other filterSize and every leftover output is
     * handled by a scalar loop.
     *
     * The vector loops are "do ... while" loops that produce 8 (filterSize
     * 4 or 8) or 4 (filterSize > 8) outputs per iteration.  Rows shorter
     * than one iteration are therefore routed straight to the scalar
     * remainder loops so that nothing is read or written past dstW.
     */

    /* Save s0-s8.  Only 72 bytes are needed; 80 keeps sp 16-byte aligned. */
    addi.d          sp, sp, -80
    st.d            s0, sp, 0
    st.d            s1, sp, 8
    st.d            s2, sp, 16
    st.d            s3, sp, 24
    st.d            s4, sp, 32
    st.d            s5, sp, 40
    st.d            s6, sp, 48
    st.d            s7, sp, 56
    st.d            s8, sp, 64

    li.w            t0, 524287          /* upper clamp */
    li.w            t8, 8
    li.w            t7, 4
    vldi            vr0, 0              /* zero vector for byte widening */
    vreplgr2vr.w    vr20, t0            /* upper clamp in every word lane */

    /* Nothing to do for an empty row. */
    bge             zero, a2, .END

    /* Dispatch on filterSize; short rows take the scalar loops directly. */
    bne             a6, t7, 1f
    blt             a2, t8, .REST4      /* filterSize == 4, dstW < 8 */
    b               .LOOP_DST4
1:
    bne             a6, t8, 2f
    blt             a2, t8, .REST8      /* filterSize == 8, dstW < 8 */
    b               .LOOP_DST8
2:
    bge             t8, a6, .END_DST4   /* filterSize < 8: scalar only */
    blt             a2, t7, .RESA       /* filterSize > 8, dstW < 4 */

    /* ---- filterSize > 8: four outputs per outer iteration ---- */
.LOOP:
    li.w            t1, 0               /* filter row index for scalar tail */
    li.w            s1, 0               /* taps consumed by the vector loop */
    li.w            s2, 0               /* scalar tail sums, outputs 0..3 */
    li.w            s3, 0
    li.w            s4, 0
    li.w            s5, 0
    vldi            vr22, 0             /* vector sums, one word per output */
    addi.w          s0, a6, -7          /* vector loop runs while s1 < s0 */
    slli.w          s7, a6, 1           /* byte offset of filter row 1 */
    slli.w          s8, a6, 2           /* byte offset of filter row 2 */
    add.w           t6, s7, s8          /* byte offset of filter row 3 */
.LOOP_DST:
    /* Eight taps for each of the four outputs. */
    ld.w            t2, a5, 0
    ld.w            t3, a5, 4
    ld.w            t4, a5, 8
    ld.w            t5, a5, 12
    fldx.d          f1, a3, t2          /* f1-f4 are the low halves of vr1-vr4 */
    fldx.d          f2, a3, t3
    fldx.d          f3, a3, t4
    fldx.d          f4, a3, t5
    vld             vr9, a4, 0
    vldx            vr10, a4, s7
    vldx            vr11, a4, s8
    vldx            vr12, a4, t6
    /* Widen the 8 source bytes to halfwords by interleaving with zero. */
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    /* vdp2.w.h comes from the included loongson_asm.S. */
    vdp2.w.h        vr17, vr1, vr9
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    /* Horizontal reduction of each product vector. */
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    /* Gather the four sums into one vector and accumulate. */
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.d         vr1, vr3, vr1
    vadd.w          vr22, vr22, vr1
    addi.w          s1, s1, 8
    addi.d          a3, a3, 8
    addi.d          a4, a4, 16
    blt             s1, s0, .LOOP_DST
    blt             s1, a6, .DSTA       /* fewer than 8 taps left */
    b               .END_FILTERA

    /*
     * Scalar tail for the remaining (filterSize - s1) taps.  a3 and a4 were
     * already advanced past the s1 taps handled above, so indices restart
     * at 0; t1 selects the filter row, s6 counts taps up to filterSize.
     */
.DSTA:
    ld.w            t2, a5, 0
    li.w            t3, 0
    move            s6, s1
.FILTERA:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s2, s2, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERA

    ld.w            t2, a5, 4
    li.w            t3, 0
    move            s6, s1
    addi.w          t1, t1, 1
.FILTERB:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s3, s3, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERB
    ld.w            t2, a5, 8
    addi.w          t1, t1, 1
    li.w            t3, 0
    move            s6, s1
.FILTERC:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s4, s4, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERC
    ld.w            t2, a5, 12
    addi.w          t1, t1, 1
    move            s6, s1
    li.w            t3, 0
.FILTERD:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t6, t6, 1
    ldx.h           t6, a4, t6
    mul.w           t6, t5, t6
    add.w           s5, s5, t6
    addi.w          t3, t3, 1
    addi.w          s6, s6, 1
    blt             s6, a6, .FILTERD
.END_FILTERA:
    /* Add vector and scalar partial sums, scale, clamp and store. */
    vpickve2gr.w    t1, vr22, 0
    vpickve2gr.w    t2, vr22, 1
    vpickve2gr.w    t3, vr22, 2
    vpickve2gr.w    t4, vr22, 3
    add.w           s2, s2, t1
    add.w           s3, s3, t2
    add.w           s4, s4, t3
    add.w           s5, s5, t4
    srai.w          s2, s2, 3
    srai.w          s3, s3, 3
    srai.w          s4, s4, 3
    srai.w          s5, s5, 3
    /* sN = (sN < t0) ? sN : t0 */
    slt             t1, s2, t0
    slt             t2, s3, t0
    slt             t3, s4, t0
    slt             t4, s5, t0
    maskeqz         s2, s2, t1
    maskeqz         s3, s3, t2
    maskeqz         s4, s4, t3
    maskeqz         s5, s5, t4
    masknez         t1, t0, t1
    masknez         t2, t0, t2
    masknez         t3, t0, t3
    masknez         t4, t0, t4
    or              s2, s2, t1
    or              s3, s3, t2
    or              s4, s4, t3
    or              s5, s5, t4
    st.w            s2, a1, 0
    st.w            s3, a1, 4
    st.w            s4, a1, 8
    st.w            s5, a1, 12

    addi.d          a1, a1, 16
    sub.d           a3, a3, s1          /* undo the src advance of the tap loop */
    addi.d          a5, a5, 16
    /* filter += 4 rows (8 * filterSize bytes) minus the 2 * s1 already added */
    slli.d          t3, a6, 3
    add.d           a4, a4, t3
    sub.d           a4, a4, s1
    sub.d           a4, a4, s1
    addi.d          a2, a2, -4
    bge             a2, t7, .LOOP
    blt             zero, a2, .RESA
    b               .END
    /* Scalar loop for the last a2 (< 4) outputs; t7/t8 become scratch. */
.RESA:
    li.w            t1, 0
.DST:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTER:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTER
    srai.w          t8, t8, 3
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DST
    b               .END

    /* ---- filterSize == 8: eight outputs per iteration ---- */
.LOOP_DST8:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    fldx.d          f1, a3, t1
    fldx.d          f2, a3, t2
    fldx.d          f3, a3, t3
    fldx.d          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.d          f5, a3, t1
    fldx.d          f6, a3, t2
    fldx.d          f7, a3, t3
    fldx.d          f8, a3, t4
    /* Eight filter rows of 8 coefficients (16 bytes) each. */
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    vld             vr13, a4, 64
    vld             vr14, a4, 80
    vld             vr15, a4, 96
    vld             vr16, a4, 112
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr2, vr0, vr2
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr4, vr0, vr4
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr6, vr0, vr6
    vilvl.b         vr7, vr0, vr7
    vilvl.b         vr8, vr0, vr8

    vdp2.w.h        vr17, vr1, vr9
    vdp2.w.h        vr18, vr2, vr10
    vdp2.w.h        vr19, vr3, vr11
    vdp2.w.h        vr21, vr4, vr12
    vdp2.w.h        vr1, vr5, vr13
    vdp2.w.h        vr2, vr6, vr14
    vdp2.w.h        vr3, vr7, vr15
    vdp2.w.h        vr4, vr8, vr16
    /* Outputs 4..7 end up in vr5-vr8, outputs 0..3 in vr1-vr4. */
    vhaddw.d.w      vr5, vr1, vr1
    vhaddw.d.w      vr6, vr2, vr2
    vhaddw.d.w      vr7, vr3, vr3
    vhaddw.d.w      vr8, vr4, vr4
    vhaddw.d.w      vr1, vr17, vr17
    vhaddw.d.w      vr2, vr18, vr18
    vhaddw.d.w      vr3, vr19, vr19
    vhaddw.d.w      vr4, vr21, vr21
    vhaddw.q.d      vr1, vr1, vr1
    vhaddw.q.d      vr2, vr2, vr2
    vhaddw.q.d      vr3, vr3, vr3
    vhaddw.q.d      vr4, vr4, vr4
    vhaddw.q.d      vr5, vr5, vr5
    vhaddw.q.d      vr6, vr6, vr6
    vhaddw.q.d      vr7, vr7, vr7
    vhaddw.q.d      vr8, vr8, vr8
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.d         vr1, vr3, vr1
    vilvl.d         vr5, vr7, vr5
    vsrai.w         vr1, vr1, 3
    vsrai.w         vr5, vr5, 3
    vmin.w          vr1, vr1, vr20
    vmin.w          vr5, vr5, vr20

    /* Store the eight results as words (2 x 16 bytes). */
    vst             vr1, a1, 0
    vst             vr5, a1, 16
    addi.d          a1, a1, 32
    addi.d          a5, a5, 32
    addi.d          a4, a4, 128
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DST8
    blt             zero, a2, .REST8
    b               .END
    /* Scalar loop for the last a2 (< 8) outputs. */
.REST8:
    li.w            t1, 0
.DST8:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTER8:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTER8
    srai.w          t8, t8, 3
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DST8
    b               .END

    /* ---- filterSize == 4: eight outputs per iteration ---- */
.LOOP_DST4:
    ld.w            t1, a5, 0
    ld.w            t2, a5, 4
    ld.w            t3, a5, 8
    ld.w            t4, a5, 12
    fldx.s          f1, a3, t1          /* 4 source bytes per output */
    fldx.s          f2, a3, t2
    fldx.s          f3, a3, t3
    fldx.s          f4, a3, t4
    ld.w            t1, a5, 16
    ld.w            t2, a5, 20
    ld.w            t3, a5, 24
    ld.w            t4, a5, 28
    fldx.s          f5, a3, t1
    fldx.s          f6, a3, t2
    fldx.s          f7, a3, t3
    fldx.s          f8, a3, t4
    /* Each 16-byte filter vector holds the rows of two outputs. */
    vld             vr9, a4, 0
    vld             vr10, a4, 16
    vld             vr11, a4, 32
    vld             vr12, a4, 48
    /* Pair up the sources of two neighbouring outputs, then widen. */
    vilvl.w         vr1, vr2, vr1
    vilvl.w         vr3, vr4, vr3
    vilvl.w         vr5, vr6, vr5
    vilvl.w         vr7, vr8, vr7
    vilvl.b         vr1, vr0, vr1
    vilvl.b         vr3, vr0, vr3
    vilvl.b         vr5, vr0, vr5
    vilvl.b         vr7, vr0, vr7

    vdp2.w.h        vr13, vr1, vr9
    vdp2.w.h        vr14, vr3, vr10
    vdp2.w.h        vr15, vr5, vr11
    vdp2.w.h        vr16, vr7, vr12
    vhaddw.d.w      vr13, vr13, vr13
    vhaddw.d.w      vr14, vr14, vr14
    vhaddw.d.w      vr15, vr15, vr15
    vhaddw.d.w      vr16, vr16, vr16
    vpickev.w       vr13, vr14, vr13
    vpickev.w       vr15, vr16, vr15
    vsrai.w         vr13, vr13, 3
    vsrai.w         vr15, vr15, 3
    vmin.w          vr13, vr13, vr20
    vmin.w          vr15, vr15, vr20

    /* Store the eight results as words (2 x 16 bytes). */
    vst             vr13, a1, 0
    vst             vr15, a1, 16
    addi.d          a1, a1, 32
    addi.d          a5, a5, 32
    addi.d          a4, a4, 64
    addi.d          a2, a2, -8
    bge             a2, t8, .LOOP_DST4
    blt             zero, a2, .REST4
    b               .END
    /* Scalar loop for the last a2 (< 8) outputs. */
.REST4:
    li.w            t1, 0
.DST4:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTER4:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTER4
    srai.w          t8, t8, 3
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .DST4
    b               .END
    /* ---- any other filterSize (< 8, != 4): fully scalar ---- */
.END_DST4:

    li.w            t1, 0
.LOOP_DST1:
    slli.w          t2, t1, 2
    ldx.w           t2, a5, t2
    li.w            t3, 0
    li.w            t8, 0
.FILTER1:
    add.w           t4, t2, t3
    ldx.bu          t5, a3, t4
    mul.w           t6, a6, t1
    add.w           t6, t6, t3
    slli.w          t7, t6, 1
    ldx.h           t7, a4, t7
    mul.w           t7, t5, t7
    add.w           t8, t8, t7
    addi.w          t3, t3, 1
    blt             t3, a6, .FILTER1
    srai.w          t8, t8, 3
    slt             t5, t8, t0
    maskeqz         t8, t8, t5
    masknez         t5, t0, t5
    or              t8, t8, t5
    slli.w          t4, t1, 2
    stx.w           t8, a1, t4
    addi.w          t1, t1, 1
    blt             t1, a2, .LOOP_DST1
    b               .END
.END:

    /* Restore s0-s8 and release the frame. */
    ld.d            s0, sp, 0
    ld.d            s1, sp, 8
    ld.d            s2, sp, 16
    ld.d            s3, sp, 24
    ld.d            s4, sp, 32
    ld.d            s5, sp, 40
    ld.d            s6, sp, 48
    ld.d            s7, sp, 56
    ld.d            s8, sp, 64
    addi.d          sp, sp, 80
endfunc
|
|
|
|
|
swscale: rename SwsContext to SwsInternal
And preserve the public SwsContext as separate name. The motivation here
is that I want to turn SwsContext into a public struct, while keeping the
internal implementation hidden. Additionally, I also want to be able to
use multiple internal implementations, e.g. for GPU devices.
This commit does not include any functional changes. For the most part, it is
a simple rename. The only complications arise from the public facing API
functions, which preserve their current type (and hence require an additional
unwrapping step internally), and the checkasm test framework, which directly
accesses SwsInternal.
For consistency, the affected functions that need to maintain a distinction
have generally been changed to refer to the SwsContext as *sws, and the
SwsInternal as *c.
In an upcoming commit, I will provide a backing definition for the public
SwsContext, and update `sws_internal()` to dereference the internal struct
instead of merely casting it.
Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
2024-10-09 19:44:33 +02:00
|
|
|
/* void ff_hscale_16_to_15_sub_lsx(SwsInternal *c, int16_t *dst, int dstW,
 *                                 const uint8_t *src, const int16_t *filter,
 *                                 const int32_t *filterPos, int filterSize, int sh)
 *
 * Register use on entry: a1 = dst, a2 = dstW, a3 = src, a4 = filter,
 * a5 = filterPos, a6 = filterSize, a7 = sh (a0 is not read here).
 *
 * For every output i:
 *     val    = sum over j < filterSize of
 *              (unsigned 16-bit) src[filterPos[i] + j] *
 *              (signed 16-bit)   filter[filterSize * i + j]
 *     dst[i] = min(val >> sh, 32767), stored as a 16-bit value
 *
 * Four code paths are selected by filterSize:
 *     == 4 : 8 outputs per LSX iteration
 *     == 8 : 8 outputs per LSX iteration
 *     >  8 : 4 outputs per LSX iteration, filter taps consumed 8 at a time
 *     else : one output at a time, scalar
 * Outputs left over after the SIMD loops are finished by scalar loops.
 *
 * The SIMD loops check the remaining count only after their body, so the
 * dispatch below sends short rows (dstW below the SIMD step) directly to the
 * scalar loops and returns immediately when dstW <= 0; this avoids loading
 * filterPos entries and storing dst samples beyond dstW.
 */
function ff_hscale_16_to_15_sub_lsx
    addi.d            sp, sp, -72
    st.d              s0, sp, 0
    st.d              s1, sp, 8
    st.d              s2, sp, 16
    st.d              s3, sp, 24
    st.d              s4, sp, 32
    st.d              s5, sp, 40
    st.d              s6, sp, 48
    st.d              s7, sp, 56
    st.d              s8, sp, 64
    li.w              t0, 32767                 /* upper clamp for the result */
    li.w              t8, 8
    li.w              t7, 4
    vreplgr2vr.w      vr20, t0                  /* clamp value in every word lane */
    vreplgr2vr.w      vr0, a7                   /* shift count in every word lane */
    bge               zero, a2, .HS15_END       /* dstW <= 0: nothing to write */
    beq               a6, t7, .HS15_PRE4
    beq               a6, t8, .HS15_PRE8
    blt               t8, a6, .HS15_PREWIDE
    b                 .END_HS15_DST4

.HS15_PRE4:                                     /* filterSize == 4 */
    bge               a2, t8, .LOOP_HS15_DST4
    b                 .HS15_REST4               /* 1..7 outputs: scalar only */
.HS15_PRE8:                                     /* filterSize == 8 */
    bge               a2, t8, .LOOP_HS15_DST8
    b                 .HS15_REST8               /* 1..7 outputs: scalar only */
.HS15_PREWIDE:                                  /* filterSize > 8 */
    blt               a2, t7, .HS15_RESA        /* 1..3 outputs: scalar only */

/* filterSize > 8: four outputs per pass */
.LOOP_HS15:
    li.w              t1, 0                     /* filter row index for the tap tail */
    li.w              s1, 0                     /* taps consumed so far */
    li.w              s2, 0                     /* scalar tail sums, outputs 0..3 */
    li.w              s3, 0
    li.w              s4, 0
    li.w              s5, 0
    vldi              vr22, 0                   /* vector sums, outputs 0..3 */
    addi.w            s0, a6, -7                /* vector tap loop runs while s1 < filterSize - 7 */
    slli.w            s7, a6, 1                 /* byte offset of filter row 1 */
    slli.w            s8, a6, 2                 /* byte offset of filter row 2 */
    add.w             t6, s7, s8                /* byte offset of filter row 3 */
.LOOP_HS15_DST:
    ld.w              t2, a5, 0
    ld.w              t3, a5, 4
    ld.w              t4, a5, 8
    ld.w              t5, a5, 12
    slli.w            t2, t2, 1                 /* filterPos -> byte offset into src */
    slli.w            t3, t3, 1
    slli.w            t4, t4, 1
    slli.w            t5, t5, 1
    vldx              vr1, a3, t2               /* 8 source samples per output */
    vldx              vr2, a3, t3
    vldx              vr3, a3, t4
    vldx              vr4, a3, t5
    vld               vr9, a4, 0                /* 8 coefficients per output */
    vldx              vr10, a4, s7
    vldx              vr11, a4, s8
    vldx              vr12, a4, t6
    vmulwev.w.hu.h    vr17, vr1, vr9            /* even lanes: unsigned src * signed coeff */
    vmulwev.w.hu.h    vr18, vr2, vr10
    vmulwev.w.hu.h    vr19, vr3, vr11
    vmulwev.w.hu.h    vr21, vr4, vr12
    vmaddwod.w.hu.h   vr17, vr1, vr9            /* add the odd-lane products */
    vmaddwod.w.hu.h   vr18, vr2, vr10
    vmaddwod.w.hu.h   vr19, vr3, vr11
    vmaddwod.w.hu.h   vr21, vr4, vr12
    vhaddw.d.w        vr1, vr17, vr17           /* horizontal reduction to one sum each */
    vhaddw.d.w        vr2, vr18, vr18
    vhaddw.d.w        vr3, vr19, vr19
    vhaddw.d.w        vr4, vr21, vr21
    vhaddw.q.d        vr1, vr1, vr1
    vhaddw.q.d        vr2, vr2, vr2
    vhaddw.q.d        vr3, vr3, vr3
    vhaddw.q.d        vr4, vr4, vr4
    vilvl.w           vr1, vr2, vr1             /* gather the four sums into one vector */
    vilvl.w           vr3, vr4, vr3
    vilvl.d           vr1, vr3, vr1
    vadd.w            vr22, vr22, vr1
    addi.w            s1, s1, 8
    addi.d            a3, a3, 16                /* src and filter advance by 8 taps */
    addi.d            a4, a4, 16
    blt               s1, s0, .LOOP_HS15_DST
    blt               s1, a6, .HS15_DSTA        /* fewer than 8 taps left: scalar tail */
    b                 .END_HS15_FILTERA
/* scalar tail over the remaining taps, one output row at a time */
.HS15_DSTA:
    ld.w              t2, a5, 0
    li.w              t3, 0
    move              s6, s1
.HS15_FILTERA:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t6, t6, 1
    ldx.h             t6, a4, t6
    mul.w             t6, t5, t6
    add.w             s2, s2, t6
    addi.w            t3, t3, 1
    addi.w            s6, s6, 1
    blt               s6, a6, .HS15_FILTERA

    ld.w              t2, a5, 4
    li.w              t3, 0
    move              s6, s1
    addi.w            t1, t1, 1
.HS15_FILTERB:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t6, t6, 1
    ldx.h             t6, a4, t6
    mul.w             t6, t5, t6
    add.w             s3, s3, t6
    addi.w            t3, t3, 1
    addi.w            s6, s6, 1
    blt               s6, a6, .HS15_FILTERB
    ld.w              t2, a5, 8
    addi.w            t1, t1, 1
    li.w              t3, 0
    move              s6, s1
.HS15_FILTERC:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t6, t6, 1
    ldx.h             t6, a4, t6
    mul.w             t6, t5, t6
    add.w             s4, s4, t6
    addi.w            t3, t3, 1
    addi.w            s6, s6, 1
    blt               s6, a6, .HS15_FILTERC
    ld.w              t2, a5, 12
    addi.w            t1, t1, 1
    move              s6, s1
    li.w              t3, 0
.HS15_FILTERD:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t6, t6, 1
    ldx.h             t6, a4, t6
    mul.w             t6, t5, t6
    add.w             s5, s5, t6
    addi.w            t3, t3, 1
    addi.w            s6, s6, 1
    blt               s6, a6, .HS15_FILTERD
.END_HS15_FILTERA:
    vpickve2gr.w      t1, vr22, 0               /* combine vector and scalar partial sums */
    vpickve2gr.w      t2, vr22, 1
    vpickve2gr.w      t3, vr22, 2
    vpickve2gr.w      t4, vr22, 3
    add.w             s2, s2, t1
    add.w             s3, s3, t2
    add.w             s4, s4, t3
    add.w             s5, s5, t4
    sra.w             s2, s2, a7
    sra.w             s3, s3, a7
    sra.w             s4, s4, a7
    sra.w             s5, s5, a7
    slt               t1, s2, t0                /* 1 when the value is below the clamp */
    slt               t2, s3, t0
    slt               t3, s4, t0
    slt               t4, s5, t0
    maskeqz           s2, s2, t1                /* keep the value, or ... */
    maskeqz           s3, s3, t2
    maskeqz           s4, s4, t3
    maskeqz           s5, s5, t4
    masknez           t1, t0, t1                /* ... substitute the clamp */
    masknez           t2, t0, t2
    masknez           t3, t0, t3
    masknez           t4, t0, t4
    or                s2, s2, t1
    or                s3, s3, t2
    or                s4, s4, t3
    or                s5, s5, t4
    st.h              s2, a1, 0
    st.h              s3, a1, 2
    st.h              s4, a1, 4
    st.h              s5, a1, 6

    addi.d            a1, a1, 8
    sub.d             a3, a3, s1                /* undo the 2 * s1 byte advance of src */
    sub.d             a3, a3, s1
    addi.d            a5, a5, 16
    slli.d            t3, a6, 3                 /* four filter rows = 8 * filterSize bytes */
    add.d             a4, a4, t3
    sub.d             a4, a4, s1                /* undo the 2 * s1 byte advance of filter */
    sub.d             a4, a4, s1
    addi.d            a2, a2, -4
    bge               a2, t7, .LOOP_HS15
    blt               zero, a2, .HS15_RESA
    b                 .HS15_END
/* scalar loop for the outputs left by the filterSize > 8 path */
.HS15_RESA:
    li.w              t1, 0
.HS15_DST:
    slli.w            t2, t1, 2
    ldx.w             t2, a5, t2                /* t2 = filterPos[i] */
    li.w              t3, 0                     /* j */
    li.w              t8, 0                     /* val */
.HS15_FILTER:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4                /* src[filterPos[i] + j] */
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t7, t6, 1
    ldx.h             t7, a4, t7                /* filter[filterSize * i + j] */
    mul.w             t7, t5, t7
    add.w             t8, t8, t7
    addi.w            t3, t3, 1
    blt               t3, a6, .HS15_FILTER
    sra.w             t8, t8, a7
    slt               t5, t8, t0
    maskeqz           t8, t8, t5
    masknez           t5, t0, t5
    or                t8, t8, t5                /* min(val >> sh, clamp) */
    slli.w            t4, t1, 1
    stx.h             t8, a1, t4
    addi.w            t1, t1, 1
    blt               t1, a2, .HS15_DST
    b                 .HS15_END

/* filterSize == 8: eight outputs per pass */
.LOOP_HS15_DST8:
    ld.w              t1, a5, 0
    ld.w              t2, a5, 4
    ld.w              t3, a5, 8
    ld.w              t4, a5, 12
    slli.w            t1, t1, 1
    slli.w            t2, t2, 1
    slli.w            t3, t3, 1
    slli.w            t4, t4, 1
    vldx              vr1, a3, t1
    vldx              vr2, a3, t2
    vldx              vr3, a3, t3
    vldx              vr4, a3, t4
    ld.w              t1, a5, 16
    ld.w              t2, a5, 20
    ld.w              t3, a5, 24
    ld.w              t4, a5, 28
    slli.w            t1, t1, 1
    slli.w            t2, t2, 1
    slli.w            t3, t3, 1
    slli.w            t4, t4, 1
    vldx              vr5, a3, t1
    vldx              vr6, a3, t2
    vldx              vr7, a3, t3
    vldx              vr8, a3, t4
    vld               vr9, a4, 0                /* one 8-tap filter row per register */
    vld               vr10, a4, 16
    vld               vr11, a4, 32
    vld               vr12, a4, 48
    vld               vr13, a4, 64
    vld               vr14, a4, 80
    vld               vr15, a4, 96
    vld               vr16, a4, 112

    vmulwev.w.hu.h    vr17, vr1, vr9
    vmulwev.w.hu.h    vr18, vr2, vr10
    vmulwev.w.hu.h    vr19, vr3, vr11
    vmulwev.w.hu.h    vr21, vr4, vr12
    vmaddwod.w.hu.h   vr17, vr1, vr9
    vmaddwod.w.hu.h   vr18, vr2, vr10
    vmaddwod.w.hu.h   vr19, vr3, vr11
    vmaddwod.w.hu.h   vr21, vr4, vr12
    vmulwev.w.hu.h    vr1, vr5, vr13
    vmulwev.w.hu.h    vr2, vr6, vr14
    vmulwev.w.hu.h    vr3, vr7, vr15
    vmulwev.w.hu.h    vr4, vr8, vr16
    vmaddwod.w.hu.h   vr1, vr5, vr13
    vmaddwod.w.hu.h   vr2, vr6, vr14
    vmaddwod.w.hu.h   vr3, vr7, vr15
    vmaddwod.w.hu.h   vr4, vr8, vr16
    vhaddw.d.w        vr5, vr1, vr1             /* outputs 4..7 */
    vhaddw.d.w        vr6, vr2, vr2
    vhaddw.d.w        vr7, vr3, vr3
    vhaddw.d.w        vr8, vr4, vr4
    vhaddw.d.w        vr1, vr17, vr17           /* outputs 0..3 */
    vhaddw.d.w        vr2, vr18, vr18
    vhaddw.d.w        vr3, vr19, vr19
    vhaddw.d.w        vr4, vr21, vr21
    vhaddw.q.d        vr1, vr1, vr1
    vhaddw.q.d        vr2, vr2, vr2
    vhaddw.q.d        vr3, vr3, vr3
    vhaddw.q.d        vr4, vr4, vr4
    vhaddw.q.d        vr5, vr5, vr5
    vhaddw.q.d        vr6, vr6, vr6
    vhaddw.q.d        vr7, vr7, vr7
    vhaddw.q.d        vr8, vr8, vr8
    vilvl.w           vr1, vr2, vr1
    vilvl.w           vr3, vr4, vr3
    vilvl.w           vr5, vr6, vr5
    vilvl.w           vr7, vr8, vr7
    vilvl.d           vr1, vr3, vr1
    vilvl.d           vr5, vr7, vr5
    vsra.w            vr1, vr1, vr0
    vsra.w            vr5, vr5, vr0
    vmin.w            vr1, vr1, vr20
    vmin.w            vr5, vr5, vr20

    vpickev.h         vr1, vr5, vr1             /* narrow 8 words to 8 halfwords */
    vst               vr1, a1, 0
    addi.d            a1, a1, 16
    addi.d            a5, a5, 32
    addi.d            a4, a4, 128
    addi.d            a2, a2, -8
    bge               a2, t8, .LOOP_HS15_DST8
    blt               zero, a2, .HS15_REST8
    b                 .HS15_END
/* scalar loop for the outputs left by the filterSize == 8 path */
.HS15_REST8:
    li.w              t1, 0
.HS15_DST8:
    slli.w            t2, t1, 2
    ldx.w             t2, a5, t2
    li.w              t3, 0
    li.w              t8, 0
.HS15_FILTER8:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t7, t6, 1
    ldx.h             t7, a4, t7
    mul.w             t7, t5, t7
    add.w             t8, t8, t7
    addi.w            t3, t3, 1
    blt               t3, a6, .HS15_FILTER8
    sra.w             t8, t8, a7
    slt               t5, t8, t0
    maskeqz           t8, t8, t5
    masknez           t5, t0, t5
    or                t8, t8, t5
    slli.w            t4, t1, 1
    stx.h             t8, a1, t4
    addi.w            t1, t1, 1
    blt               t1, a2, .HS15_DST8
    b                 .HS15_END

/* filterSize == 4: eight outputs per pass, two outputs share one register */
.LOOP_HS15_DST4:
    ld.w              t1, a5, 0
    ld.w              t2, a5, 4
    ld.w              t3, a5, 8
    ld.w              t4, a5, 12
    slli.w            t1, t1, 1
    slli.w            t2, t2, 1
    slli.w            t3, t3, 1
    slli.w            t4, t4, 1
    fldx.d            f1, a3, t1                /* 4 source samples (8 bytes) per output */
    fldx.d            f2, a3, t2
    fldx.d            f3, a3, t3
    fldx.d            f4, a3, t4
    ld.w              t1, a5, 16
    ld.w              t2, a5, 20
    ld.w              t3, a5, 24
    ld.w              t4, a5, 28
    slli.w            t1, t1, 1
    slli.w            t2, t2, 1
    slli.w            t3, t3, 1
    slli.w            t4, t4, 1
    fldx.d            f5, a3, t1
    fldx.d            f6, a3, t2
    fldx.d            f7, a3, t3
    fldx.d            f8, a3, t4
    vld               vr9, a4, 0                /* two 4-tap filter rows per register */
    vld               vr10, a4, 16
    vld               vr11, a4, 32
    vld               vr12, a4, 48
    vilvl.d           vr1, vr2, vr1             /* pair the samples of two outputs */
    vilvl.d           vr3, vr4, vr3
    vilvl.d           vr5, vr6, vr5
    vilvl.d           vr7, vr8, vr7
    vmulwev.w.hu.h    vr13, vr1, vr9
    vmulwev.w.hu.h    vr14, vr3, vr10
    vmulwev.w.hu.h    vr15, vr5, vr11
    vmulwev.w.hu.h    vr16, vr7, vr12
    vmaddwod.w.hu.h   vr13, vr1, vr9
    vmaddwod.w.hu.h   vr14, vr3, vr10
    vmaddwod.w.hu.h   vr15, vr5, vr11
    vmaddwod.w.hu.h   vr16, vr7, vr12
    vhaddw.d.w        vr13, vr13, vr13
    vhaddw.d.w        vr14, vr14, vr14
    vhaddw.d.w        vr15, vr15, vr15
    vhaddw.d.w        vr16, vr16, vr16
    vpickev.w         vr13, vr14, vr13          /* low word of each 64-bit sum */
    vpickev.w         vr15, vr16, vr15
    vsra.w            vr13, vr13, vr0
    vsra.w            vr15, vr15, vr0
    vmin.w            vr13, vr13, vr20
    vmin.w            vr15, vr15, vr20

    vpickev.h         vr13, vr15, vr13          /* narrow 8 words to 8 halfwords */
    vst               vr13, a1, 0
    addi.d            a1, a1, 16
    addi.d            a5, a5, 32
    addi.d            a4, a4, 64
    addi.d            a2, a2, -8
    bge               a2, t8, .LOOP_HS15_DST4
    blt               zero, a2, .HS15_REST4
    b                 .HS15_END
/* scalar loop for the outputs left by the filterSize == 4 path */
.HS15_REST4:
    li.w              t1, 0
.HS15_DST4:
    slli.w            t2, t1, 2
    ldx.w             t2, a5, t2
    li.w              t3, 0
    li.w              t8, 0
.HS15_FILTER4:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t7, t6, 1
    ldx.h             t7, a4, t7
    mul.w             t7, t5, t7
    add.w             t8, t8, t7
    addi.w            t3, t3, 1
    blt               t3, a6, .HS15_FILTER4
    sra.w             t8, t8, a7
    slt               t5, t8, t0
    maskeqz           t8, t8, t5
    masknez           t5, t0, t5
    or                t8, t8, t5
    slli.w            t4, t1, 1
    stx.h             t8, a1, t4
    addi.w            t1, t1, 1
    blt               t1, a2, .HS15_DST4
    b                 .HS15_END
/* any other filterSize: fully scalar */
.END_HS15_DST4:

    li.w              t1, 0
.LOOP_HS15_DST1:
    slli.w            t2, t1, 2
    ldx.w             t2, a5, t2
    li.w              t3, 0
    li.w              t8, 0
.HS15_FILTER1:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t7, t6, 1
    ldx.h             t7, a4, t7
    mul.w             t7, t5, t7
    add.w             t8, t8, t7
    addi.w            t3, t3, 1
    blt               t3, a6, .HS15_FILTER1
    sra.w             t8, t8, a7
    slt               t5, t8, t0
    maskeqz           t8, t8, t5
    masknez           t5, t0, t5
    or                t8, t8, t5
    slli.w            t4, t1, 1
    stx.h             t8, a1, t4
    addi.w            t1, t1, 1
    blt               t1, a2, .LOOP_HS15_DST1
    b                 .HS15_END
.HS15_END:

    ld.d              s0, sp, 0
    ld.d              s1, sp, 8
    ld.d              s2, sp, 16
    ld.d              s3, sp, 24
    ld.d              s4, sp, 32
    ld.d              s5, sp, 40
    ld.d              s6, sp, 48
    ld.d              s7, sp, 56
    ld.d              s8, sp, 64
    addi.d            sp, sp, 72
endfunc
|
|
|
|
|
swscale: rename SwsContext to SwsInternal
And preserve the public SwsContext as separate name. The motivation here
is that I want to turn SwsContext into a public struct, while keeping the
internal implementation hidden. Additionally, I also want to be able to
use multiple internal implementations, e.g. for GPU devices.
This commit does not include any functional changes. For the most part, it is
a simple rename. The only complications arise from the public facing API
functions, which preserve their current type (and hence require an additional
unwrapping step internally), and the checkasm test framework, which directly
accesses SwsInternal.
For consistency, the affected functions that need to maintain a distinction
have generally been changed to refer to the SwsContext as *sws, and the
SwsInternal as *c.
In an upcoming commit, I will provide a backing definition for the public
SwsContext, and update `sws_internal()` to dereference the internal struct
instead of merely casting it.
Sponsored-by: Sovereign Tech Fund
Signed-off-by: Niklas Haas <git@haasn.dev>
2024-10-09 19:44:33 +02:00
|
|
|
/* void ff_hscale_16_to_19_sub_lsx(SwsInternal *c, int16_t *dst, int dstW,
 *                                 const uint8_t *src, const int16_t *filter,
 *                                 const int32_t *filterPos, int filterSize, int sh)
 *
 * Register use on entry: a1 = dst, a2 = dstW, a3 = src, a4 = filter,
 * a5 = filterPos, a6 = filterSize, a7 = sh (a0 is not read here).
 *
 * For every output i:
 *     val    = sum over j < filterSize of
 *              (unsigned 16-bit) src[filterPos[i] + j] *
 *              (signed 16-bit)   filter[filterSize * i + j]
 *     dst[i] = min(val >> sh, 524287)
 * Note that every store to dst in this function is a 32-bit word
 * (4 bytes per output), even though the prototype above names int16_t.
 *
 * Four code paths are selected by filterSize:
 *     == 4 : 8 outputs per LSX iteration
 *     == 8 : 8 outputs per LSX iteration
 *     >  8 : 4 outputs per LSX iteration, filter taps consumed 8 at a time
 *     else : one output at a time, scalar
 * Outputs left over after the SIMD loops are finished by scalar loops.
 *
 * The SIMD loops check the remaining count only after their body, so the
 * dispatch below sends short rows (dstW below the SIMD step) directly to the
 * scalar loops and returns immediately when dstW <= 0; this avoids loading
 * filterPos entries and storing dst samples beyond dstW.
 */
function ff_hscale_16_to_19_sub_lsx
    addi.d            sp, sp, -72
    st.d              s0, sp, 0
    st.d              s1, sp, 8
    st.d              s2, sp, 16
    st.d              s3, sp, 24
    st.d              s4, sp, 32
    st.d              s5, sp, 40
    st.d              s6, sp, 48
    st.d              s7, sp, 56
    st.d              s8, sp, 64

    li.w              t0, 524287                /* upper clamp for the result */
    li.w              t8, 8
    li.w              t7, 4
    vreplgr2vr.w      vr20, t0                  /* clamp value in every word lane */
    vreplgr2vr.w      vr0, a7                   /* shift count in every word lane */
    bge               zero, a2, .HS19_END       /* dstW <= 0: nothing to write */
    beq               a6, t7, .HS19_PRE4
    beq               a6, t8, .HS19_PRE8
    blt               t8, a6, .HS19_PREWIDE
    b                 .END_HS19_DST4

.HS19_PRE4:                                     /* filterSize == 4 */
    bge               a2, t8, .LOOP_HS19_DST4
    b                 .HS19_REST4               /* 1..7 outputs: scalar only */
.HS19_PRE8:                                     /* filterSize == 8 */
    bge               a2, t8, .LOOP_HS19_DST8
    b                 .HS19_REST8               /* 1..7 outputs: scalar only */
.HS19_PREWIDE:                                  /* filterSize > 8 */
    blt               a2, t7, .HS19_RESA        /* 1..3 outputs: scalar only */

/* filterSize > 8: four outputs per pass */
.LOOP_HS19:
    li.w              t1, 0                     /* filter row index for the tap tail */
    li.w              s1, 0                     /* taps consumed so far */
    li.w              s2, 0                     /* scalar tail sums, outputs 0..3 */
    li.w              s3, 0
    li.w              s4, 0
    li.w              s5, 0
    vldi              vr22, 0                   /* vector sums, outputs 0..3 */
    addi.w            s0, a6, -7                /* vector tap loop runs while s1 < filterSize - 7 */
    slli.w            s7, a6, 1                 /* byte offset of filter row 1 */
    slli.w            s8, a6, 2                 /* byte offset of filter row 2 */
    add.w             t6, s7, s8                /* byte offset of filter row 3 */
.LOOP_HS19_DST:
    ld.w              t2, a5, 0
    ld.w              t3, a5, 4
    ld.w              t4, a5, 8
    ld.w              t5, a5, 12
    slli.w            t2, t2, 1                 /* filterPos -> byte offset into src */
    slli.w            t3, t3, 1
    slli.w            t4, t4, 1
    slli.w            t5, t5, 1
    vldx              vr1, a3, t2               /* 8 source samples per output */
    vldx              vr2, a3, t3
    vldx              vr3, a3, t4
    vldx              vr4, a3, t5
    vld               vr9, a4, 0                /* 8 coefficients per output */
    vldx              vr10, a4, s7
    vldx              vr11, a4, s8
    vldx              vr12, a4, t6
    vmulwev.w.hu.h    vr17, vr1, vr9            /* even lanes: unsigned src * signed coeff */
    vmulwev.w.hu.h    vr18, vr2, vr10
    vmulwev.w.hu.h    vr19, vr3, vr11
    vmulwev.w.hu.h    vr21, vr4, vr12
    vmaddwod.w.hu.h   vr17, vr1, vr9            /* add the odd-lane products */
    vmaddwod.w.hu.h   vr18, vr2, vr10
    vmaddwod.w.hu.h   vr19, vr3, vr11
    vmaddwod.w.hu.h   vr21, vr4, vr12
    vhaddw.d.w        vr1, vr17, vr17           /* horizontal reduction to one sum each */
    vhaddw.d.w        vr2, vr18, vr18
    vhaddw.d.w        vr3, vr19, vr19
    vhaddw.d.w        vr4, vr21, vr21
    vhaddw.q.d        vr1, vr1, vr1
    vhaddw.q.d        vr2, vr2, vr2
    vhaddw.q.d        vr3, vr3, vr3
    vhaddw.q.d        vr4, vr4, vr4
    vilvl.w           vr1, vr2, vr1             /* gather the four sums into one vector */
    vilvl.w           vr3, vr4, vr3
    vilvl.d           vr1, vr3, vr1
    vadd.w            vr22, vr22, vr1
    addi.w            s1, s1, 8
    addi.d            a3, a3, 16                /* src and filter advance by 8 taps */
    addi.d            a4, a4, 16
    blt               s1, s0, .LOOP_HS19_DST
    blt               s1, a6, .HS19_DSTA        /* fewer than 8 taps left: scalar tail */
    b                 .END_HS19_FILTERA
/* scalar tail over the remaining taps, one output row at a time */
.HS19_DSTA:
    ld.w              t2, a5, 0
    li.w              t3, 0
    move              s6, s1
.HS19_FILTERA:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t6, t6, 1
    ldx.h             t6, a4, t6
    mul.w             t6, t5, t6
    add.w             s2, s2, t6
    addi.w            t3, t3, 1
    addi.w            s6, s6, 1
    blt               s6, a6, .HS19_FILTERA

    ld.w              t2, a5, 4
    li.w              t3, 0
    move              s6, s1
    addi.w            t1, t1, 1
.HS19_FILTERB:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t6, t6, 1
    ldx.h             t6, a4, t6
    mul.w             t6, t5, t6
    add.w             s3, s3, t6
    addi.w            t3, t3, 1
    addi.w            s6, s6, 1
    blt               s6, a6, .HS19_FILTERB
    ld.w              t2, a5, 8
    addi.w            t1, t1, 1
    li.w              t3, 0
    move              s6, s1
.HS19_FILTERC:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t6, t6, 1
    ldx.h             t6, a4, t6
    mul.w             t6, t5, t6
    add.w             s4, s4, t6
    addi.w            t3, t3, 1
    addi.w            s6, s6, 1
    blt               s6, a6, .HS19_FILTERC
    ld.w              t2, a5, 12
    addi.w            t1, t1, 1
    move              s6, s1
    li.w              t3, 0
.HS19_FILTERD:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t6, t6, 1
    ldx.h             t6, a4, t6
    mul.w             t6, t5, t6
    add.w             s5, s5, t6
    addi.w            t3, t3, 1
    addi.w            s6, s6, 1
    blt               s6, a6, .HS19_FILTERD
.END_HS19_FILTERA:
    vpickve2gr.w      t1, vr22, 0               /* combine vector and scalar partial sums */
    vpickve2gr.w      t2, vr22, 1
    vpickve2gr.w      t3, vr22, 2
    vpickve2gr.w      t4, vr22, 3
    add.w             s2, s2, t1
    add.w             s3, s3, t2
    add.w             s4, s4, t3
    add.w             s5, s5, t4
    sra.w             s2, s2, a7
    sra.w             s3, s3, a7
    sra.w             s4, s4, a7
    sra.w             s5, s5, a7
    slt               t1, s2, t0                /* 1 when the value is below the clamp */
    slt               t2, s3, t0
    slt               t3, s4, t0
    slt               t4, s5, t0
    maskeqz           s2, s2, t1                /* keep the value, or ... */
    maskeqz           s3, s3, t2
    maskeqz           s4, s4, t3
    maskeqz           s5, s5, t4
    masknez           t1, t0, t1                /* ... substitute the clamp */
    masknez           t2, t0, t2
    masknez           t3, t0, t3
    masknez           t4, t0, t4
    or                s2, s2, t1
    or                s3, s3, t2
    or                s4, s4, t3
    or                s5, s5, t4
    st.w              s2, a1, 0
    st.w              s3, a1, 4
    st.w              s4, a1, 8
    st.w              s5, a1, 12

    addi.d            a1, a1, 16
    sub.d             a3, a3, s1                /* undo the 2 * s1 byte advance of src */
    sub.d             a3, a3, s1
    addi.d            a5, a5, 16
    slli.d            t3, a6, 3                 /* four filter rows = 8 * filterSize bytes */
    add.d             a4, a4, t3
    sub.d             a4, a4, s1                /* undo the 2 * s1 byte advance of filter */
    sub.d             a4, a4, s1
    addi.d            a2, a2, -4
    bge               a2, t7, .LOOP_HS19
    blt               zero, a2, .HS19_RESA
    b                 .HS19_END
/* scalar loop for the outputs left by the filterSize > 8 path */
.HS19_RESA:
    li.w              t1, 0
.HS19_DST:
    slli.w            t2, t1, 2
    ldx.w             t2, a5, t2                /* t2 = filterPos[i] */
    li.w              t3, 0                     /* j */
    li.w              t8, 0                     /* val */
.HS19_FILTER:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4                /* src[filterPos[i] + j] */
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t7, t6, 1
    ldx.h             t7, a4, t7                /* filter[filterSize * i + j] */
    mul.w             t7, t5, t7
    add.w             t8, t8, t7
    addi.w            t3, t3, 1
    blt               t3, a6, .HS19_FILTER
    sra.w             t8, t8, a7
    slt               t5, t8, t0
    maskeqz           t8, t8, t5
    masknez           t5, t0, t5
    or                t8, t8, t5                /* min(val >> sh, clamp) */
    slli.w            t4, t1, 2
    stx.w             t8, a1, t4
    addi.w            t1, t1, 1
    blt               t1, a2, .HS19_DST
    b                 .HS19_END

/* filterSize == 8: eight outputs per pass */
.LOOP_HS19_DST8:
    ld.w              t1, a5, 0
    ld.w              t2, a5, 4
    ld.w              t3, a5, 8
    ld.w              t4, a5, 12
    slli.w            t1, t1, 1
    slli.w            t2, t2, 1
    slli.w            t3, t3, 1
    slli.w            t4, t4, 1
    vldx              vr1, a3, t1
    vldx              vr2, a3, t2
    vldx              vr3, a3, t3
    vldx              vr4, a3, t4
    ld.w              t1, a5, 16
    ld.w              t2, a5, 20
    ld.w              t3, a5, 24
    ld.w              t4, a5, 28
    slli.w            t1, t1, 1
    slli.w            t2, t2, 1
    slli.w            t3, t3, 1
    slli.w            t4, t4, 1
    vldx              vr5, a3, t1
    vldx              vr6, a3, t2
    vldx              vr7, a3, t3
    vldx              vr8, a3, t4
    vld               vr9, a4, 0                /* one 8-tap filter row per register */
    vld               vr10, a4, 16
    vld               vr11, a4, 32
    vld               vr12, a4, 48
    vld               vr13, a4, 64
    vld               vr14, a4, 80
    vld               vr15, a4, 96
    vld               vr16, a4, 112
    vmulwev.w.hu.h    vr17, vr1, vr9
    vmulwev.w.hu.h    vr18, vr2, vr10
    vmulwev.w.hu.h    vr19, vr3, vr11
    vmulwev.w.hu.h    vr21, vr4, vr12
    vmaddwod.w.hu.h   vr17, vr1, vr9
    vmaddwod.w.hu.h   vr18, vr2, vr10
    vmaddwod.w.hu.h   vr19, vr3, vr11
    vmaddwod.w.hu.h   vr21, vr4, vr12
    vmulwev.w.hu.h    vr1, vr5, vr13
    vmulwev.w.hu.h    vr2, vr6, vr14
    vmulwev.w.hu.h    vr3, vr7, vr15
    vmulwev.w.hu.h    vr4, vr8, vr16
    vmaddwod.w.hu.h   vr1, vr5, vr13
    vmaddwod.w.hu.h   vr2, vr6, vr14
    vmaddwod.w.hu.h   vr3, vr7, vr15
    vmaddwod.w.hu.h   vr4, vr8, vr16
    vhaddw.d.w        vr5, vr1, vr1             /* outputs 4..7 */
    vhaddw.d.w        vr6, vr2, vr2
    vhaddw.d.w        vr7, vr3, vr3
    vhaddw.d.w        vr8, vr4, vr4
    vhaddw.d.w        vr1, vr17, vr17           /* outputs 0..3 */
    vhaddw.d.w        vr2, vr18, vr18
    vhaddw.d.w        vr3, vr19, vr19
    vhaddw.d.w        vr4, vr21, vr21
    vhaddw.q.d        vr1, vr1, vr1
    vhaddw.q.d        vr2, vr2, vr2
    vhaddw.q.d        vr3, vr3, vr3
    vhaddw.q.d        vr4, vr4, vr4
    vhaddw.q.d        vr5, vr5, vr5
    vhaddw.q.d        vr6, vr6, vr6
    vhaddw.q.d        vr7, vr7, vr7
    vhaddw.q.d        vr8, vr8, vr8
    vilvl.w           vr1, vr2, vr1
    vilvl.w           vr3, vr4, vr3
    vilvl.w           vr5, vr6, vr5
    vilvl.w           vr7, vr8, vr7
    vilvl.d           vr1, vr3, vr1
    vilvl.d           vr5, vr7, vr5
    vsra.w            vr1, vr1, vr0
    vsra.w            vr5, vr5, vr0
    vmin.w            vr1, vr1, vr20
    vmin.w            vr5, vr5, vr20

    vst               vr1, a1, 0                /* 8 outputs stored as 32-bit words */
    vst               vr5, a1, 16
    addi.d            a1, a1, 32
    addi.d            a5, a5, 32
    addi.d            a4, a4, 128
    addi.d            a2, a2, -8
    bge               a2, t8, .LOOP_HS19_DST8
    blt               zero, a2, .HS19_REST8
    b                 .HS19_END
/* scalar loop for the outputs left by the filterSize == 8 path */
.HS19_REST8:
    li.w              t1, 0
.HS19_DST8:
    slli.w            t2, t1, 2
    ldx.w             t2, a5, t2
    li.w              t3, 0
    li.w              t8, 0
.HS19_FILTER8:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t7, t6, 1
    ldx.h             t7, a4, t7
    mul.w             t7, t5, t7
    add.w             t8, t8, t7
    addi.w            t3, t3, 1
    blt               t3, a6, .HS19_FILTER8
    sra.w             t8, t8, a7
    slt               t5, t8, t0
    maskeqz           t8, t8, t5
    masknez           t5, t0, t5
    or                t8, t8, t5
    slli.w            t4, t1, 2
    stx.w             t8, a1, t4
    addi.w            t1, t1, 1
    blt               t1, a2, .HS19_DST8
    b                 .HS19_END

/* filterSize == 4: eight outputs per pass, two outputs share one register */
.LOOP_HS19_DST4:
    ld.w              t1, a5, 0
    ld.w              t2, a5, 4
    ld.w              t3, a5, 8
    ld.w              t4, a5, 12
    slli.w            t1, t1, 1
    slli.w            t2, t2, 1
    slli.w            t3, t3, 1
    slli.w            t4, t4, 1
    fldx.d            f1, a3, t1                /* 4 source samples (8 bytes) per output */
    fldx.d            f2, a3, t2
    fldx.d            f3, a3, t3
    fldx.d            f4, a3, t4
    ld.w              t1, a5, 16
    ld.w              t2, a5, 20
    ld.w              t3, a5, 24
    ld.w              t4, a5, 28
    slli.w            t1, t1, 1
    slli.w            t2, t2, 1
    slli.w            t3, t3, 1
    slli.w            t4, t4, 1
    fldx.d            f5, a3, t1
    fldx.d            f6, a3, t2
    fldx.d            f7, a3, t3
    fldx.d            f8, a3, t4
    vld               vr9, a4, 0                /* two 4-tap filter rows per register */
    vld               vr10, a4, 16
    vld               vr11, a4, 32
    vld               vr12, a4, 48
    vilvl.d           vr1, vr2, vr1             /* pair the samples of two outputs */
    vilvl.d           vr3, vr4, vr3
    vilvl.d           vr5, vr6, vr5
    vilvl.d           vr7, vr8, vr7
    vmulwev.w.hu.h    vr13, vr1, vr9
    vmulwev.w.hu.h    vr14, vr3, vr10
    vmulwev.w.hu.h    vr15, vr5, vr11
    vmulwev.w.hu.h    vr16, vr7, vr12
    vmaddwod.w.hu.h   vr13, vr1, vr9
    vmaddwod.w.hu.h   vr14, vr3, vr10
    vmaddwod.w.hu.h   vr15, vr5, vr11
    vmaddwod.w.hu.h   vr16, vr7, vr12
    vhaddw.d.w        vr13, vr13, vr13
    vhaddw.d.w        vr14, vr14, vr14
    vhaddw.d.w        vr15, vr15, vr15
    vhaddw.d.w        vr16, vr16, vr16
    vpickev.w         vr13, vr14, vr13          /* low word of each 64-bit sum */
    vpickev.w         vr15, vr16, vr15
    vsra.w            vr13, vr13, vr0
    vsra.w            vr15, vr15, vr0
    vmin.w            vr13, vr13, vr20
    vmin.w            vr15, vr15, vr20

    vst               vr13, a1, 0               /* 8 outputs stored as 32-bit words */
    vst               vr15, a1, 16
    addi.d            a1, a1, 32
    addi.d            a5, a5, 32
    addi.d            a4, a4, 64
    addi.d            a2, a2, -8
    bge               a2, t8, .LOOP_HS19_DST4
    blt               zero, a2, .HS19_REST4
    b                 .HS19_END
/* scalar loop for the outputs left by the filterSize == 4 path */
.HS19_REST4:
    li.w              t1, 0
.HS19_DST4:
    slli.w            t2, t1, 2
    ldx.w             t2, a5, t2
    li.w              t3, 0
    li.w              t8, 0
.HS19_FILTER4:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t7, t6, 1
    ldx.h             t7, a4, t7
    mul.w             t7, t5, t7
    add.w             t8, t8, t7
    addi.w            t3, t3, 1
    blt               t3, a6, .HS19_FILTER4
    sra.w             t8, t8, a7
    slt               t5, t8, t0
    maskeqz           t8, t8, t5
    masknez           t5, t0, t5
    or                t8, t8, t5
    slli.w            t4, t1, 2
    stx.w             t8, a1, t4
    addi.w            t1, t1, 1
    blt               t1, a2, .HS19_DST4
    b                 .HS19_END
/* any other filterSize: fully scalar */
.END_HS19_DST4:

    li.w              t1, 0
.LOOP_HS19_DST1:
    slli.w            t2, t1, 2
    ldx.w             t2, a5, t2
    li.w              t3, 0
    li.w              t8, 0
.HS19_FILTER1:
    add.w             t4, t2, t3
    slli.w            t4, t4, 1
    ldx.hu            t5, a3, t4
    mul.w             t6, a6, t1
    add.w             t6, t6, t3
    slli.w            t7, t6, 1
    ldx.h             t7, a4, t7
    mul.w             t7, t5, t7
    add.w             t8, t8, t7
    addi.w            t3, t3, 1
    blt               t3, a6, .HS19_FILTER1
    sra.w             t8, t8, a7
    slt               t5, t8, t0
    maskeqz           t8, t8, t5
    masknez           t5, t0, t5
    or                t8, t8, t5
    slli.w            t4, t1, 2
    stx.w             t8, a1, t4
    addi.w            t1, t1, 1
    blt               t1, a2, .LOOP_HS19_DST1
    b                 .HS19_END
.HS19_END:

    ld.d              s0, sp, 0
    ld.d              s1, sp, 8
    ld.d              s2, sp, 16
    ld.d              s3, sp, 24
    ld.d              s4, sp, 32
    ld.d              s5, sp, 40
    ld.d              s6, sp, 48
    ld.d              s7, sp, 56
    ld.d              s8, sp, 64
    addi.d            sp, sp, 72
endfunc
|
2024-03-16 05:03:31 +02:00
|
|
|
|
|
|
|
/*
 * lumRangeFromJpeg_lsx
 *   a0: pointer to signed 16-bit samples, rewritten in place
 *   a1: number of samples
 *
 * Every sample x is replaced by (x * 14071 + 33561947) >> 14, keeping the
 * low 16 bits of the result. Eight samples are processed per LSX
 * iteration; the remaining (count & 7) samples use the scalar loop.
 */
function lumRangeFromJpeg_lsx
    li.w              t0, 14071                 /* multiplier */
    li.w              t1, 33561947              /* additive offset */
    vreplgr2vr.h      vr0, t0
    srli.w            t2, a1, 3                 /* t2 = number of 8-sample vectors */
    andi              t3, a1, 7                 /* t3 = leftover samples */
    beqz              t2, 2f
1:
    vld               vr1, a0, 0
    vreplgr2vr.w      vr2, t1                   /* accumulators start at the offset */
    vreplgr2vr.w      vr3, t1
    vmaddwev.w.h      vr2, vr0, vr1             /* even samples: offset + coef * x */
    vmaddwod.w.h      vr3, vr0, vr1             /* odd samples */
    vsrai.w           vr2, vr2, 14
    vsrai.w           vr3, vr3, 14
    vpackev.h         vr1, vr3, vr2             /* re-interleave the low halfwords */
    vst               vr1, a0, 0
    addi.d            a0, a0, 16
    addi.d            t2, t2, -1
    bnez              t2, 1b
2:
    beqz              t3, 4f
3:
    ld.h              t4, a0, 0
    mul.w             t4, t4, t0
    add.w             t4, t4, t1
    srai.w            t4, t4, 14
    st.h              t4, a0, 0
    addi.d            a0, a0, 2
    addi.d            t3, t3, -1
    bnez              t3, 3b
4:
endfunc
|
|
|
|
|
|
|
|
/*
 * lumRangeFromJpeg_lasx
 *   a0: pointer to signed 16-bit samples, rewritten in place
 *   a1: number of samples
 *
 * Every sample x is replaced by (x * 14071 + 33561947) >> 14, keeping the
 * low 16 bits of the result. Sixteen samples are processed per LASX
 * iteration; the remaining (count & 15) samples use the scalar loop.
 */
function lumRangeFromJpeg_lasx
    li.w              t0, 14071                 /* multiplier */
    li.w              t1, 33561947              /* additive offset */
    xvreplgr2vr.h     xr0, t0
    srli.w            t2, a1, 4                 /* t2 = number of 16-sample vectors */
    andi              t3, a1, 15                /* t3 = leftover samples */
    beqz              t2, 2f
1:
    xvld              xr1, a0, 0
    xvreplgr2vr.w     xr2, t1                   /* accumulators start at the offset */
    xvreplgr2vr.w     xr3, t1
    xvmaddwev.w.h     xr2, xr0, xr1             /* even samples: offset + coef * x */
    xvmaddwod.w.h     xr3, xr0, xr1             /* odd samples */
    xvsrai.w          xr2, xr2, 14
    xvsrai.w          xr3, xr3, 14
    xvpackev.h        xr1, xr3, xr2             /* re-interleave the low halfwords */
    xvst              xr1, a0, 0
    addi.d            a0, a0, 32
    addi.d            t2, t2, -1
    bnez              t2, 1b
2:
    beqz              t3, 4f
3:
    ld.h              t4, a0, 0
    mul.w             t4, t4, t0
    add.w             t4, t4, t1
    srai.w            t4, t4, 14
    st.h              t4, a0, 0
    addi.d            a0, a0, 2
    addi.d            t3, t3, -1
    bnez              t3, 3b
4:
endfunc
|
|
|
|
|
|
|
|
/*
 * lumRangeToJpeg_lsx
 *   a0: pointer to signed 16-bit samples, rewritten in place
 *   a1: number of samples
 *
 * Every sample x is replaced by (min(x, 30189) * 19077 - 39057361) >> 14,
 * keeping the low 16 bits of the result. Eight samples are processed per
 * LSX iteration; the remaining (count & 7) samples use the scalar loop.
 */
function lumRangeToJpeg_lsx
    li.w              t0, 19077                 /* multiplier */
    li.w              t1, -39057361             /* additive offset */
    li.w              t2, 30189                 /* input clamp */
    vreplgr2vr.h      vr0, t0
    vreplgr2vr.h      vr4, t2
    srli.w            t2, a1, 3                 /* t2 reused: number of 8-sample vectors */
    andi              t3, a1, 7                 /* t3 = leftover samples */
    beqz              t2, 2f
1:
    vld               vr1, a0, 0
    vreplgr2vr.w      vr2, t1                   /* accumulators start at the offset */
    vreplgr2vr.w      vr3, t1
    vmin.h            vr1, vr1, vr4             /* clamp the input first */
    vmaddwev.w.h      vr2, vr0, vr1             /* even samples: offset + coef * x */
    vmaddwod.w.h      vr3, vr0, vr1             /* odd samples */
    vsrai.w           vr2, vr2, 14
    vsrai.w           vr3, vr3, 14
    vpackev.h         vr1, vr3, vr2             /* re-interleave the low halfwords */
    vst               vr1, a0, 0
    addi.d            a0, a0, 16
    addi.d            t2, t2, -1
    bnez              t2, 1b
2:
    beqz              t3, 4f
3:
    ld.h              t4, a0, 0
    vreplgr2vr.h      vr1, t4                   /* clamp through the vector unit, */
    vmin.h            vr1, vr1, vr4             /* since the clamp constant lives in vr4 */
    vpickve2gr.h      t4, vr1, 0
    mul.w             t4, t4, t0
    add.w             t4, t4, t1
    srai.w            t4, t4, 14
    st.h              t4, a0, 0
    addi.d            a0, a0, 2
    addi.d            t3, t3, -1
    bnez              t3, 3b
4:
endfunc
|
|
|
|
|
|
|
|
/*
 * lumRangeToJpeg_lasx
 *   a0: pointer to signed 16-bit samples, rewritten in place
 *   a1: number of samples
 *
 * Every sample x is replaced by (min(x, 30189) * 19077 - 39057361) >> 14,
 * keeping the low 16 bits of the result. Sixteen samples are processed per
 * LASX iteration; the remaining (count & 15) samples use the scalar loop.
 */
function lumRangeToJpeg_lasx
    li.w              t0, 19077                 /* multiplier */
    li.w              t1, -39057361             /* additive offset */
    li.w              t2, 30189                 /* input clamp */
    xvreplgr2vr.h     xr0, t0
    xvreplgr2vr.h     xr4, t2
    srli.w            t2, a1, 4                 /* t2 reused: number of 16-sample vectors */
    andi              t3, a1, 15                /* t3 = leftover samples */
    beqz              t2, 2f
1:
    xvld              xr1, a0, 0
    xvreplgr2vr.w     xr2, t1                   /* accumulators start at the offset */
    xvreplgr2vr.w     xr3, t1
    xvmin.h           xr1, xr1, xr4             /* clamp the input first */
    xvmaddwev.w.h     xr2, xr0, xr1             /* even samples: offset + coef * x */
    xvmaddwod.w.h     xr3, xr0, xr1             /* odd samples */
    xvsrai.w          xr2, xr2, 14
    xvsrai.w          xr3, xr3, 14
    xvpackev.h        xr1, xr3, xr2             /* re-interleave the low halfwords */
    xvst              xr1, a0, 0
    addi.d            a0, a0, 32
    addi.d            t2, t2, -1
    bnez              t2, 1b
2:
    beqz              t3, 4f
3:
    ld.h              t4, a0, 0
    /* The tail clamps through 128-bit LSX ops; vr4 is relied on to still
     * hold the clamp written via xr4 above (LSX registers overlay the low
     * half of the LASX registers). */
    vreplgr2vr.h      vr1, t4
    vmin.h            vr1, vr1, vr4
    vpickve2gr.h      t4, vr1, 0
    mul.w             t4, t4, t0
    add.w             t4, t4, t1
    srai.w            t4, t4, 14
    st.h              t4, a0, 0
    addi.d            a0, a0, 2
    addi.d            t3, t3, -1
    bnez              t3, 3b
4:
endfunc
|
|
|
|
|
|
|
|
/*
 * chrRangeFromJpeg_lsx
 *   a0: pointer to the first plane of signed 16-bit samples, rewritten in place
 *   a1: pointer to the second plane of signed 16-bit samples, rewritten in place
 *   a2: number of samples in each plane
 *
 * Every sample x of both planes is replaced by (x * 1799 + 4081085) >> 11,
 * keeping the low 16 bits of the result. Eight samples per plane are
 * processed per LSX iteration; the remaining (count & 7) samples use the
 * scalar loop.
 */
function chrRangeFromJpeg_lsx
    li.w              t0, 1799                  /* multiplier */
    li.w              t1, 4081085               /* additive offset */
    vreplgr2vr.h      vr0, t0
    srli.w            t2, a2, 3                 /* t2 = number of 8-sample vectors */
    andi              t3, a2, 7                 /* t3 = leftover samples */
    beqz              t2, 2f
1:
    vld               vr1, a0, 0
    vld               vr2, a1, 0
    vreplgr2vr.w      vr3, t1                   /* accumulators start at the offset */
    vreplgr2vr.w      vr4, t1
    vreplgr2vr.w      vr5, t1
    vreplgr2vr.w      vr6, t1
    vmaddwev.w.h      vr3, vr0, vr1             /* plane 0, even samples */
    vmaddwod.w.h      vr4, vr0, vr1             /* plane 0, odd samples */
    vmaddwev.w.h      vr5, vr0, vr2             /* plane 1, even samples */
    vmaddwod.w.h      vr6, vr0, vr2             /* plane 1, odd samples */
    vsrai.w           vr3, vr3, 11
    vsrai.w           vr4, vr4, 11
    vsrai.w           vr5, vr5, 11
    vsrai.w           vr6, vr6, 11
    vpackev.h         vr1, vr4, vr3             /* re-interleave the low halfwords */
    vpackev.h         vr2, vr6, vr5
    vst               vr1, a0, 0
    vst               vr2, a1, 0
    addi.d            a0, a0, 16
    addi.d            a1, a1, 16
    addi.d            t2, t2, -1
    bnez              t2, 1b
2:
    beqz              t3, 4f
3:
    ld.h              t4, a0, 0
    ld.h              t5, a1, 0
    mul.w             t4, t4, t0
    mul.w             t5, t5, t0
    add.w             t4, t4, t1
    add.w             t5, t5, t1
    srai.w            t4, t4, 11
    srai.w            t5, t5, 11
    st.h              t4, a0, 0
    st.h              t5, a1, 0
    addi.d            a0, a0, 2
    addi.d            a1, a1, 2
    addi.d            t3, t3, -1
    bnez              t3, 3b
4:
endfunc
|
|
|
|
|
|
|
|
/*
 * chrRangeFromJpeg_lasx
 *   a0: pointer to the first plane of signed 16-bit samples, rewritten in place
 *   a1: pointer to the second plane of signed 16-bit samples, rewritten in place
 *   a2: number of samples in each plane
 *
 * Every sample x of both planes is replaced by (x * 1799 + 4081085) >> 11,
 * keeping the low 16 bits of the result. Sixteen samples per plane are
 * processed per LASX iteration; the remaining (count & 15) samples use the
 * scalar loop.
 */
function chrRangeFromJpeg_lasx
    li.w              t0, 1799                  /* multiplier */
    li.w              t1, 4081085               /* additive offset */
    xvreplgr2vr.h     xr0, t0
    srli.w            t2, a2, 4                 /* t2 = number of 16-sample vectors */
    andi              t3, a2, 15                /* t3 = leftover samples */
    beqz              t2, 2f
1:
    xvld              xr1, a0, 0
    xvld              xr2, a1, 0
    xvreplgr2vr.w     xr3, t1                   /* accumulators start at the offset */
    xvreplgr2vr.w     xr4, t1
    xvreplgr2vr.w     xr5, t1
    xvreplgr2vr.w     xr6, t1
    xvmaddwev.w.h     xr3, xr0, xr1             /* plane 0, even samples */
    xvmaddwod.w.h     xr4, xr0, xr1             /* plane 0, odd samples */
    xvmaddwev.w.h     xr5, xr0, xr2             /* plane 1, even samples */
    xvmaddwod.w.h     xr6, xr0, xr2             /* plane 1, odd samples */
    xvsrai.w          xr3, xr3, 11
    xvsrai.w          xr4, xr4, 11
    xvsrai.w          xr5, xr5, 11
    xvsrai.w          xr6, xr6, 11
    xvpackev.h        xr1, xr4, xr3             /* re-interleave the low halfwords */
    xvpackev.h        xr2, xr6, xr5
    xvst              xr1, a0, 0
    xvst              xr2, a1, 0
    addi.d            a0, a0, 32
    addi.d            a1, a1, 32
    addi.d            t2, t2, -1
    bnez              t2, 1b
2:
    beqz              t3, 4f
3:
    ld.h              t4, a0, 0
    ld.h              t5, a1, 0
    mul.w             t4, t4, t0
    mul.w             t5, t5, t0
    add.w             t4, t4, t1
    add.w             t5, t5, t1
    srai.w            t4, t4, 11
    srai.w            t5, t5, 11
    st.h              t4, a0, 0
    st.h              t5, a1, 0
    addi.d            a0, a0, 2
    addi.d            a1, a1, 2
    addi.d            t3, t3, -1
    bnez              t3, 3b
4:
endfunc
|
|
|
|
|
|
|
|
/*
 * chrRangeToJpeg_lsx
 *
 * Rewrites two arrays of signed 16-bit samples in place, applying
 *     x = (min(x, 30775) * 4663 - 9289992) >> 12
 * to every element, with the arithmetic carried out in 32 bits.
 *
 *   a0, a1 - base addresses of the two arrays (presumably the U and V
 *            chroma planes, going by the function name; confirm with caller)
 *   a2     - number of samples in each array
 *
 * Blocks of 8 samples per array go through the 128-bit LSX loop; the
 * remaining (a2 & 7) samples are handled one at a time.
 */
function chrRangeToJpeg_lsx
    li.w            t0,     4663                /* multiplier              */
    li.w            t1,     -9289992            /* additive bias           */
    li.w            t2,     30775               /* upper clamp on input    */
    andi            t7,     a2,     7           /* leftover sample count   */
    srli.w          t6,     a2,     3           /* 8-sample block count    */
    vreplgr2vr.h    vr0,    t0                  /* multiplier in each lane */
    vreplgr2vr.h    vr7,    t2                  /* clamp in each lane      */
    beqz            t6,     2f
1:
    /* Fetch both inputs before anything is written back. */
    vld             vr1,    a0,     0
    vld             vr2,    a1,     0
    addi.d          t6,     t6,     -1

    /* First array: clamp, then bias + coef * x on even/odd halfwords. */
    vmin.h          vr1,    vr1,    vr7
    vreplgr2vr.w    vr3,    t1
    vreplgr2vr.w    vr4,    t1
    vmaddwev.w.h    vr3,    vr0,    vr1
    vmaddwod.w.h    vr4,    vr0,    vr1
    vsrai.w         vr3,    vr3,    12
    vsrai.w         vr4,    vr4,    12
    /* Low halfword of each word, even/odd re-interleaved in order. */
    vpackev.h       vr1,    vr4,    vr3

    /* Second array: identical sequence. */
    vmin.h          vr2,    vr2,    vr7
    vreplgr2vr.w    vr5,    t1
    vreplgr2vr.w    vr6,    t1
    vmaddwev.w.h    vr5,    vr0,    vr2
    vmaddwod.w.h    vr6,    vr0,    vr2
    vsrai.w         vr5,    vr5,    12
    vsrai.w         vr6,    vr6,    12
    vpackev.h       vr2,    vr6,    vr5

    vst             vr1,    a0,     0
    vst             vr2,    a1,     0
    addi.d          a0,     a0,     16
    addi.d          a1,     a1,     16
    bnez            t6,     1b
2:
    beqz            t7,     4f
3:
    /* Scalar tail: one sample from each array per iteration. */
    ld.h            t4,     a0,     0
    ld.h            t5,     a1,     0
    addi.d          t7,     t7,     -1

    /* The clamp is done in vector registers against vr7, lane 0 read back. */
    vreplgr2vr.h    vr1,    t4
    vmin.h          vr1,    vr1,    vr7
    vpickve2gr.h    t4,     vr1,    0
    mul.w           t4,     t4,     t0
    add.w           t4,     t4,     t1
    srai.w          t4,     t4,     12

    vreplgr2vr.h    vr2,    t5
    vmin.h          vr2,    vr2,    vr7
    vpickve2gr.h    t5,     vr2,    0
    mul.w           t5,     t5,     t0
    add.w           t5,     t5,     t1
    srai.w          t5,     t5,     12

    st.h            t4,     a0,     0
    st.h            t5,     a1,     0
    addi.d          a0,     a0,     2
    addi.d          a1,     a1,     2
    bnez            t7,     3b
4:
endfunc
|
|
|
|
|
|
|
|
/*
 * chrRangeToJpeg_lasx
 *
 * Rewrites two arrays of signed 16-bit samples in place, applying
 *     x = (min(x, 30775) * 4663 - 9289992) >> 12
 * to every element, with the arithmetic carried out in 32 bits.
 *
 *   a0, a1 - base addresses of the two arrays (presumably the U and V
 *            chroma planes, going by the function name; confirm with caller)
 *   a2     - number of samples in each array
 *
 * Blocks of 16 samples per array go through the 256-bit LASX loop; the
 * remaining (a2 & 15) samples are handled one at a time.
 */
function chrRangeToJpeg_lasx
    li.w            t0,     4663                /* multiplier              */
    li.w            t1,     -9289992            /* additive bias           */
    li.w            t2,     30775               /* upper clamp on input    */
    andi            t7,     a2,     15          /* leftover sample count   */
    srli.w          t6,     a2,     4           /* 16-sample block count   */
    xvreplgr2vr.h   xr0,    t0                  /* multiplier in each lane */
    xvreplgr2vr.h   xr7,    t2                  /* clamp in each lane      */
    beqz            t6,     2f
1:
    /* Fetch both inputs before anything is written back. */
    xvld            xr1,    a0,     0
    xvld            xr2,    a1,     0
    addi.d          t6,     t6,     -1

    /* First array: clamp, then bias + coef * x on even/odd halfwords. */
    xvmin.h         xr1,    xr1,    xr7
    xvreplgr2vr.w   xr3,    t1
    xvreplgr2vr.w   xr4,    t1
    xvmaddwev.w.h   xr3,    xr0,    xr1
    xvmaddwod.w.h   xr4,    xr0,    xr1
    xvsrai.w        xr3,    xr3,    12
    xvsrai.w        xr4,    xr4,    12
    /* Low halfword of each word, even/odd re-interleaved in order. */
    xvpackev.h      xr1,    xr4,    xr3

    /* Second array: identical sequence. */
    xvmin.h         xr2,    xr2,    xr7
    xvreplgr2vr.w   xr5,    t1
    xvreplgr2vr.w   xr6,    t1
    xvmaddwev.w.h   xr5,    xr0,    xr2
    xvmaddwod.w.h   xr6,    xr0,    xr2
    xvsrai.w        xr5,    xr5,    12
    xvsrai.w        xr6,    xr6,    12
    xvpackev.h      xr2,    xr6,    xr5

    xvst            xr1,    a0,     0
    xvst            xr2,    a1,     0
    addi.d          a0,     a0,     32
    addi.d          a1,     a1,     32
    bnez            t6,     1b
2:
    beqz            t7,     4f
3:
    /* Scalar tail: one sample from each array per iteration. */
    ld.h            t4,     a0,     0
    ld.h            t5,     a1,     0
    addi.d          t7,     t7,     -1

    /*
     * The clamp uses 128-bit LSX instructions; vr7 is the low half of xr7,
     * so it already holds the replicated clamp value.
     */
    vreplgr2vr.h    vr1,    t4
    vmin.h          vr1,    vr1,    vr7
    vpickve2gr.h    t4,     vr1,    0
    mul.w           t4,     t4,     t0
    add.w           t4,     t4,     t1
    srai.w          t4,     t4,     12

    vreplgr2vr.h    vr2,    t5
    vmin.h          vr2,    vr2,    vr7
    vpickve2gr.h    t5,     vr2,    0
    mul.w           t5,     t5,     t0
    add.w           t5,     t5,     t1
    srai.w          t5,     t5,     12

    st.h            t4,     a0,     0
    st.h            t5,     a1,     0
    addi.d          a0,     a0,     2
    addi.d          a1,     a1,     2
    bnez            t7,     3b
4:
endfunc
|