mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-08 13:22:53 +02:00
9239081db3
tests/checkasm/checkasm: C LSX LASX put_hevc_qpel_uni_h4_8_c: 5.7 1.2 put_hevc_qpel_uni_h6_8_c: 12.2 2.7 put_hevc_qpel_uni_h8_8_c: 21.5 3.2 put_hevc_qpel_uni_h12_8_c: 47.2 9.2 7.2 put_hevc_qpel_uni_h16_8_c: 87.0 11.7 9.0 put_hevc_qpel_uni_h24_8_c: 188.2 27.5 21.0 put_hevc_qpel_uni_h32_8_c: 335.2 46.7 28.5 put_hevc_qpel_uni_h48_8_c: 772.5 104.5 65.2 put_hevc_qpel_uni_h64_8_c: 1383.2 142.2 109.0 put_hevc_epel_uni_w_v4_8_c: 5.0 1.5 put_hevc_epel_uni_w_v6_8_c: 10.7 3.5 2.5 put_hevc_epel_uni_w_v8_8_c: 18.2 3.7 3.0 put_hevc_epel_uni_w_v12_8_c: 40.2 10.7 7.5 put_hevc_epel_uni_w_v16_8_c: 70.2 13.0 9.2 put_hevc_epel_uni_w_v24_8_c: 158.2 30.2 22.5 put_hevc_epel_uni_w_v32_8_c: 281.0 52.0 36.5 put_hevc_epel_uni_w_v48_8_c: 631.7 116.7 82.7 put_hevc_epel_uni_w_v64_8_c: 1108.2 207.5 142.2 put_hevc_epel_uni_w_h4_8_c: 4.7 1.2 put_hevc_epel_uni_w_h6_8_c: 9.7 3.5 2.7 put_hevc_epel_uni_w_h8_8_c: 17.2 4.2 3.5 put_hevc_epel_uni_w_h12_8_c: 38.0 11.5 7.2 put_hevc_epel_uni_w_h16_8_c: 69.2 14.5 9.2 put_hevc_epel_uni_w_h24_8_c: 152.0 34.7 22.5 put_hevc_epel_uni_w_h32_8_c: 271.0 58.0 40.0 put_hevc_epel_uni_w_h48_8_c: 597.5 136.7 95.0 put_hevc_epel_uni_w_h64_8_c: 1074.0 252.2 168.0 put_hevc_epel_bi_h4_8_c: 4.5 0.7 put_hevc_epel_bi_h6_8_c: 9.0 1.5 put_hevc_epel_bi_h8_8_c: 15.2 1.7 put_hevc_epel_bi_h12_8_c: 33.5 4.2 3.7 put_hevc_epel_bi_h16_8_c: 59.7 5.2 4.7 put_hevc_epel_bi_h24_8_c: 132.2 11.0 put_hevc_epel_bi_h32_8_c: 232.7 20.2 13.2 put_hevc_epel_bi_h48_8_c: 521.7 45.2 31.2 put_hevc_epel_bi_h64_8_c: 949.0 71.5 51.0 After this patch, the performance of decoding H265 4K 30FPS 30Mbps on 3A6000 with 8 threads improves by 1 fps (55fps-->56fps). Change-Id: I8cc1e41daa63ca478039bc55d1ee8934a7423f51 Reviewed-by: yinshiyou-hf@loongson.cn Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
4574 lines
170 KiB
ArmAsm
4574 lines
170 KiB
ArmAsm
/*
|
|
* Copyright (c) 2023 Loongson Technology Corporation Limited
|
|
* Contributed by jinbo <jinbo@loongson.cn>
|
|
*
|
|
* This file is part of FFmpeg.
|
|
*
|
|
* FFmpeg is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
*
|
|
* FFmpeg is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with FFmpeg; if not, write to the Free Software
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
|
|
#include "loongson_asm.S"
|
|
|
|
.extern ff_hevc_qpel_filters
|
|
.extern ff_hevc_epel_filters
|
|
|
|
/*
 * LOAD_VAR bit
 *
 * Build the weighting constants used by the uni_w kernels and broadcast each
 * of them to every 32-bit lane:
 *   vr1/xr1 = a6              (wx)
 *   vr2/xr2 = 1 << (a5 + 5)   (rounding offset = 1 << (shift - 1))
 *   vr3/xr3 = a5 + 6          (shift; a5 is denom per the C prototypes)
 *   vr4/xr4 = a7              (ox)
 * bit == 128 fills the LSX registers vr1..vr4, any other value fills the
 * LASX registers xr1..xr4.
 * Clobbers: t1, t3, t4.
 */
.macro LOAD_VAR bit
    addi.w          t1, a5, 6               // t1 = shift
    addi.w          t3, zero, 1             // t3 = 1
    sub.w           t4, t1, t3              // t4 = shift - 1
    sll.w           t3, t3, t4              // t3 = offset
.if \bit == 128
    vreplgr2vr.w    vr3, t1                 // shift
    vreplgr2vr.w    vr2, t3                 // offset
    vreplgr2vr.w    vr1, a6                 // wx
    vreplgr2vr.w    vr4, a7                 // ox
.else
    xvreplgr2vr.w   xr3, t1                 // shift
    xvreplgr2vr.w   xr2, t3                 // offset
    xvreplgr2vr.w   xr1, a6                 // wx
    xvreplgr2vr.w   xr4, a7                 // ox
.endif
.endm
|
|
|
|
/*
 * HEVC_PEL_UNI_W_PIXELS8_LSX src0, dst0, w
 *
 * Weight one row of 8 unsigned bytes read from src0 and store it at dst0.
 * Per pixel p: ((((p << 6) * vr1) + vr2) >> vr3) + vr4, narrowed with
 * saturation to a byte. With w == 6 only the first 6 bytes are stored,
 * otherwise all 8.
 * Expects vr1..vr4 as prepared by LOAD_VAR 128. Clobbers vr0 and vr5.
 */
.macro HEVC_PEL_UNI_W_PIXELS8_LSX src0, dst0, w
    /* load 8 bytes and widen them to two vectors of four 32-bit values */
    vldrepl.d       vr0, \src0, 0
    vsllwil.hu.bu   vr0, vr0, 0             // bytes -> halfwords
    vexth.wu.hu     vr5, vr0                // pixels 4..7 -> words
    vsllwil.wu.hu   vr0, vr0, 0             // pixels 0..3 -> words

    /* weighting arithmetic */
    vslli.w         vr0, vr0, 6             // p << 6
    vslli.w         vr5, vr5, 6
    vmul.w          vr0, vr0, vr1           // * wx
    vmul.w          vr5, vr5, vr1
    vadd.w          vr0, vr0, vr2           // + offset
    vadd.w          vr5, vr5, vr2
    vsra.w          vr0, vr0, vr3           // >> shift
    vsra.w          vr5, vr5, vr3
    vadd.w          vr0, vr0, vr4           // + ox
    vadd.w          vr5, vr5, vr4

    /* saturating narrow: words -> halfwords -> bytes */
    vssrani.h.w     vr5, vr0, 0
    vssrani.bu.h    vr5, vr5, 0
.if \w == 6
    fst.s           f5, \dst0, 0            // bytes 0..3
    vstelm.h        vr5, \dst0, 4, 2        // bytes 4..5
.else
    fst.d           f5, \dst0, 0            // bytes 0..7
.endif
.endm
|
|
|
|
/*
 * HEVC_PEL_UNI_W_PIXELS8x2_LASX src0, dst0, w
 *
 * LASX variant that weights two consecutive rows of 8 bytes at once: the row
 * at src0 and the row at src0 + a3. Results go to dst0 and dst0 + a1.
 * With w == 6 only 6 bytes per row are stored, otherwise 8.
 * Expects xr1..xr4 as prepared by LOAD_VAR 256.
 * Clobbers: t2, t3, xr0, xr5.
 */
.macro HEVC_PEL_UNI_W_PIXELS8x2_LASX src0, dst0, w
    /* row 0 into the low 128-bit lane, row 1 into the high lane */
    vldrepl.d       vr0, \src0, 0
    add.d           t2, \src0, a3
    vldrepl.d       vr5, t2, 0
    xvpermi.q       xr0, xr5, 0x02

    /* widen bytes to 32-bit words (xr0: pixels 0..3, xr5: pixels 4..7) */
    xvsllwil.hu.bu  xr0, xr0, 0
    xvexth.wu.hu    xr5, xr0
    xvsllwil.wu.hu  xr0, xr0, 0

    /* weighting arithmetic */
    xvslli.w        xr0, xr0, 6             // p << 6
    xvslli.w        xr5, xr5, 6
    xvmul.w         xr0, xr0, xr1           // * wx
    xvmul.w         xr5, xr5, xr1
    xvadd.w         xr0, xr0, xr2           // + offset
    xvadd.w         xr5, xr5, xr2
    xvsra.w         xr0, xr0, xr3           // >> shift
    xvsra.w         xr5, xr5, xr3
    xvadd.w         xr0, xr0, xr4           // + ox
    xvadd.w         xr5, xr5, xr4

    /* saturating narrow; both rows end up in the low 128 bits of xr0 */
    xvssrani.h.w    xr5, xr0, 0
    xvpermi.q       xr0, xr5, 0x01
    xvssrani.bu.h   xr0, xr5, 0             // vr0 = row 0 bytes | row 1 bytes

    add.d           t3, \dst0, a1           // address of the second output row
.if \w == 6
    vstelm.w        vr0, \dst0, 0, 0        // row 0, bytes 0..3
    vstelm.h        vr0, \dst0, 4, 2        // row 0, bytes 4..5
    vstelm.w        vr0, t3, 0, 2           // row 1, bytes 0..3
    vstelm.h        vr0, t3, 4, 6           // row 1, bytes 4..5
.else
    vstelm.d        vr0, \dst0, 0, 0        // row 0
    vstelm.d        vr0, t3, 0, 1           // row 1
.endif
.endm
|
|
|
|
/*
 * HEVC_PEL_UNI_W_PIXELS16_LSX src0, dst0
 *
 * Weight one row of 16 unsigned bytes read from src0 and store the 16 result
 * bytes at dst0. Per pixel p: ((((p << 6) * vr1) + vr2) >> vr3) + vr4,
 * narrowed with saturation to a byte.
 * Expects vr1..vr4 as prepared by LOAD_VAR 128.
 * Clobbers: vr0, vr5, vr6, vr7, vr8.
 */
.macro HEVC_PEL_UNI_W_PIXELS16_LSX src0, dst0
    vld             vr0, \src0, 0

    /* widen: vr5 = pixels 0..3, vr6 = 4..7, vr7 = 8..11, vr8 = 12..15 */
    vsllwil.hu.bu   vr5, vr0, 0
    vexth.wu.hu     vr6, vr5
    vsllwil.wu.hu   vr5, vr5, 0
    vexth.hu.bu     vr7, vr0
    vexth.wu.hu     vr8, vr7
    vsllwil.wu.hu   vr7, vr7, 0

    /* p << 6 */
    vslli.w         vr5, vr5, 6
    vslli.w         vr6, vr6, 6
    vslli.w         vr7, vr7, 6
    vslli.w         vr8, vr8, 6
    /* * wx */
    vmul.w          vr5, vr5, vr1
    vmul.w          vr6, vr6, vr1
    vmul.w          vr7, vr7, vr1
    vmul.w          vr8, vr8, vr1
    /* + offset */
    vadd.w          vr5, vr5, vr2
    vadd.w          vr6, vr6, vr2
    vadd.w          vr7, vr7, vr2
    vadd.w          vr8, vr8, vr2
    /* >> shift */
    vsra.w          vr5, vr5, vr3
    vsra.w          vr6, vr6, vr3
    vsra.w          vr7, vr7, vr3
    vsra.w          vr8, vr8, vr3
    /* + ox */
    vadd.w          vr5, vr5, vr4
    vadd.w          vr6, vr6, vr4
    vadd.w          vr7, vr7, vr4
    vadd.w          vr8, vr8, vr4

    /* saturating narrow: words -> halfwords -> bytes */
    vssrani.h.w     vr6, vr5, 0             // pixels 0..7
    vssrani.h.w     vr8, vr7, 0             // pixels 8..15
    vssrani.bu.h    vr8, vr6, 0
    vst             vr8, \dst0, 0
.endm
|
|
|
|
/*
 * HEVC_PEL_UNI_W_PIXELS16_LASX src0, dst0
 *
 * LASX variant of the 16-pixel row kernel: 16 bytes are loaded from src0,
 * weighted with xr1..xr4 (see LOAD_VAR 256) and 16 bytes are stored at dst0.
 * Clobbers: xr0, xr5, xr6, xr7.
 */
.macro HEVC_PEL_UNI_W_PIXELS16_LASX src0, dst0
    vld             vr0, \src0, 0
    xvpermi.d       xr0, xr0, 0xd8          // pixels 0..7 -> lane 0, 8..15 -> lane 1
    xvsllwil.hu.bu  xr0, xr0, 0             // bytes -> halfwords
    xvsllwil.wu.hu  xr5, xr0, 0             // low four halfwords of each lane
    xvexth.wu.hu    xr6, xr0                // high four halfwords of each lane

    xvslli.w        xr5, xr5, 6             // p << 6
    xvslli.w        xr6, xr6, 6
    xvmul.w         xr5, xr5, xr1           // * wx
    xvmul.w         xr6, xr6, xr1
    xvadd.w         xr5, xr5, xr2           // + offset
    xvadd.w         xr6, xr6, xr2
    xvsra.w         xr5, xr5, xr3           // >> shift
    xvsra.w         xr6, xr6, xr3
    xvadd.w         xr5, xr5, xr4           // + ox
    xvadd.w         xr6, xr6, xr4

    /* saturating narrow; all 16 bytes end up in the low 128 bits of xr7 */
    xvssrani.h.w    xr6, xr5, 0
    xvpermi.q       xr7, xr6, 0x01
    xvssrani.bu.h   xr7, xr6, 0
    vst             vr7, \dst0, 0
.endm
|
|
|
|
/*
 * HEVC_PEL_UNI_W_PIXELS32_LASX src0, dst0, w
 *
 * Weight 32 bytes with xr1..xr4 (see LOAD_VAR 256). What the 32 bytes are
 * depends on w:
 *   w == 16: two rows of 16 bytes (src0 and src0 + a3), stored to dst0 and
 *            dst0 + a1;
 *   w == 24: one 32-byte load from src0, the first 24 result bytes are stored;
 *   other  : one 32-byte load from src0, all 32 result bytes are stored.
 * Clobbers: xr0, xr5..xr8, and t2 when w == 16.
 */
.macro HEVC_PEL_UNI_W_PIXELS32_LASX src0, dst0, w
.if \w == 16
    vld             vr0, \src0, 0           // row 0
    add.d           t2, \src0, a3
    vld             vr5, t2, 0              // row 1
    xvpermi.q       xr0, xr5, 0x02          // row 0 | row 1
.else //w=24/32
    xvld            xr0, \src0, 0
.endif
    /* widen each lane: xr5/xr6 = bytes 0..7, xr7/xr8 = bytes 8..15 */
    xvsllwil.hu.bu  xr5, xr0, 0
    xvexth.wu.hu    xr6, xr5
    xvsllwil.wu.hu  xr5, xr5, 0
    xvexth.hu.bu    xr7, xr0
    xvexth.wu.hu    xr8, xr7
    xvsllwil.wu.hu  xr7, xr7, 0

    /* p << 6 */
    xvslli.w        xr5, xr5, 6
    xvslli.w        xr6, xr6, 6
    xvslli.w        xr7, xr7, 6
    xvslli.w        xr8, xr8, 6
    /* * wx */
    xvmul.w         xr5, xr5, xr1
    xvmul.w         xr6, xr6, xr1
    xvmul.w         xr7, xr7, xr1
    xvmul.w         xr8, xr8, xr1
    /* + offset */
    xvadd.w         xr5, xr5, xr2
    xvadd.w         xr6, xr6, xr2
    xvadd.w         xr7, xr7, xr2
    xvadd.w         xr8, xr8, xr2
    /* >> shift */
    xvsra.w         xr5, xr5, xr3
    xvsra.w         xr6, xr6, xr3
    xvsra.w         xr7, xr7, xr3
    xvsra.w         xr8, xr8, xr3
    /* + ox */
    xvadd.w         xr5, xr5, xr4
    xvadd.w         xr6, xr6, xr4
    xvadd.w         xr7, xr7, xr4
    xvadd.w         xr8, xr8, xr4

    /* saturating narrow: words -> halfwords -> bytes */
    xvssrani.h.w    xr6, xr5, 0
    xvssrani.h.w    xr8, xr7, 0
    xvssrani.bu.h   xr8, xr6, 0
.if \w == 16
    vst             vr8, \dst0, 0           // row 0
    add.d           t2, \dst0, a1
    xvpermi.q       xr8, xr8, 0x01          // bring row 1 into the low lane
    vst             vr8, t2, 0              // row 1
.elseif \w == 24
    vst             vr8, \dst0, 0           // bytes 0..15
    xvstelm.d       xr8, \dst0, 16, 2       // bytes 16..23
.else
    xvst            xr8, \dst0, 0
.endif
.endm
|
|
|
|
/*
|
|
* void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride,
|
|
* const uint8_t *_src, ptrdiff_t _srcstride,
|
|
* int height, int denom, int wx, int ox,
|
|
* intptr_t mx, intptr_t my, int width)
|
|
*/
|
|
/*
 * Weighted uni-prediction copy, 4 pixels wide (LSX).
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height.
 * Two rows are processed per iteration, so the loop runs height / 2 times.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx
    LOAD_VAR 128
    srli.w          t0, a4, 1               // t0 = number of row pairs
.LOOP_PIXELS4:
    /* load 4 bytes from each of the two rows */
    vldrepl.w       vr0, a2, 0
    add.d           t1, a2, a3
    vldrepl.w       vr5, t1, 0

    /* bytes -> halfwords -> words */
    vsllwil.hu.bu   vr0, vr0, 0
    vsllwil.wu.hu   vr0, vr0, 0
    vsllwil.hu.bu   vr5, vr5, 0
    vsllwil.wu.hu   vr5, vr5, 0

    vslli.w         vr0, vr0, 6             // p << 6
    vslli.w         vr5, vr5, 6
    vmul.w          vr0, vr0, vr1           // * wx
    vmul.w          vr5, vr5, vr1
    vadd.w          vr0, vr0, vr2           // + offset
    vadd.w          vr5, vr5, vr2
    vsra.w          vr0, vr0, vr3           // >> shift
    vsra.w          vr5, vr5, vr3
    vadd.w          vr0, vr0, vr4           // + ox
    vadd.w          vr5, vr5, vr4

    /* saturating narrow; word 0 = row 0, word 1 = row 1 */
    vssrani.h.w     vr5, vr0, 0
    vssrani.bu.h    vr5, vr5, 0
    fst.s           f5, a0, 0
    add.d           t2, a0, a1
    vstelm.w        vr5, t2, 0, 1

    alsl.d          a2, a3, a2, 1           // src += 2 * src stride
    alsl.d          a0, a1, a0, 1           // dst += 2 * dst stride
    addi.w          t0, t0, -1
    bnez            t0, .LOOP_PIXELS4
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 6 pixels wide (LSX): one row per iteration,
 * a4 (height) iterations.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS6:
    HEVC_PEL_UNI_W_PIXELS8_LSX a2, a0, 6
    add.d           a0, a0, a1              // next dst row
    add.d           a2, a2, a3              // next src row
    addi.w          a4, a4, -1
    bnez            a4, .LOOP_PIXELS6
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 6 pixels wide (LASX): two rows per
 * iteration, a4 / 2 iterations.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx
    LOAD_VAR 256
    srli.w          t0, a4, 1               // t0 = number of row pairs
.LOOP_PIXELS6_LASX:
    HEVC_PEL_UNI_W_PIXELS8x2_LASX a2, a0, 6
    alsl.d          a0, a1, a0, 1           // dst += 2 * dst stride
    alsl.d          a2, a3, a2, 1           // src += 2 * src stride
    addi.w          t0, t0, -1
    bnez            t0, .LOOP_PIXELS6_LASX
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 8 pixels wide (LSX): one row per iteration,
 * a4 (height) iterations.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS8:
    HEVC_PEL_UNI_W_PIXELS8_LSX a2, a0, 8
    add.d           a0, a0, a1              // next dst row
    add.d           a2, a2, a3              // next src row
    addi.w          a4, a4, -1
    bnez            a4, .LOOP_PIXELS8
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 8 pixels wide (LASX): two rows per
 * iteration, a4 / 2 iterations.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx
    LOAD_VAR 256
    srli.w          t0, a4, 1               // t0 = number of row pairs
.LOOP_PIXELS8_LASX:
    HEVC_PEL_UNI_W_PIXELS8x2_LASX a2, a0, 8
    alsl.d          a0, a1, a0, 1           // dst += 2 * dst stride
    alsl.d          a2, a3, a2, 1           // src += 2 * src stride
    addi.w          t0, t0, -1
    bnez            t0, .LOOP_PIXELS8_LASX
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 12 pixels wide (LSX): one row per iteration.
 * 16 bytes are loaded per row, 12 bytes are stored.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS12:
    vld             vr0, a2, 0

    /* widen: vr5 = pixels 0..3, vr6 = 4..7, vr7 = 8..11 */
    vsllwil.hu.bu   vr5, vr0, 0
    vexth.wu.hu     vr6, vr5
    vsllwil.wu.hu   vr5, vr5, 0
    vexth.hu.bu     vr7, vr0
    vsllwil.wu.hu   vr7, vr7, 0

    /* p << 6 */
    vslli.w         vr5, vr5, 6
    vslli.w         vr6, vr6, 6
    vslli.w         vr7, vr7, 6
    /* * wx */
    vmul.w          vr5, vr5, vr1
    vmul.w          vr6, vr6, vr1
    vmul.w          vr7, vr7, vr1
    /* + offset */
    vadd.w          vr5, vr5, vr2
    vadd.w          vr6, vr6, vr2
    vadd.w          vr7, vr7, vr2
    /* >> shift */
    vsra.w          vr5, vr5, vr3
    vsra.w          vr6, vr6, vr3
    vsra.w          vr7, vr7, vr3
    /* + ox */
    vadd.w          vr5, vr5, vr4
    vadd.w          vr6, vr6, vr4
    vadd.w          vr7, vr7, vr4

    /* saturating narrow: words -> halfwords -> bytes */
    vssrani.h.w     vr6, vr5, 0             // pixels 0..7
    vssrani.h.w     vr7, vr7, 0             // pixels 8..11
    vssrani.bu.h    vr7, vr6, 0
    fst.d           f7, a0, 0               // bytes 0..7
    vstelm.w        vr7, a0, 8, 2           // bytes 8..11

    add.d           a0, a0, a1              // next dst row
    add.d           a2, a2, a3              // next src row
    addi.w          a4, a4, -1
    bnez            a4, .LOOP_PIXELS12
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 12 pixels wide (LASX): one row per
 * iteration. 16 bytes are loaded per row, 12 bytes are stored.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS12_LASX:
    vld             vr0, a2, 0
    xvpermi.d       xr0, xr0, 0xd8          // pixels 0..7 -> lane 0, 8..15 -> lane 1
    xvsllwil.hu.bu  xr0, xr0, 0             // bytes -> halfwords
    xvsllwil.wu.hu  xr5, xr0, 0             // low four halfwords of each lane
    xvexth.wu.hu    xr6, xr0                // high four halfwords of each lane

    xvslli.w        xr5, xr5, 6             // p << 6
    xvslli.w        xr6, xr6, 6
    xvmul.w         xr5, xr5, xr1           // * wx
    xvmul.w         xr6, xr6, xr1
    xvadd.w         xr5, xr5, xr2           // + offset
    xvadd.w         xr6, xr6, xr2
    xvsra.w         xr5, xr5, xr3           // >> shift
    xvsra.w         xr6, xr6, xr3
    xvadd.w         xr5, xr5, xr4           // + ox
    xvadd.w         xr6, xr6, xr4

    /* saturating narrow; result bytes end up in the low 128 bits of xr7 */
    xvssrani.h.w    xr6, xr5, 0
    xvpermi.q       xr7, xr6, 0x01
    xvssrani.bu.h   xr7, xr6, 0
    fst.d           f7, a0, 0               // bytes 0..7
    vstelm.w        vr7, a0, 8, 2           // bytes 8..11

    add.d           a0, a0, a1              // next dst row
    add.d           a2, a2, a3              // next src row
    addi.w          a4, a4, -1
    bnez            a4, .LOOP_PIXELS12_LASX
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 16 pixels wide (LSX): one row per iteration,
 * a4 (height) iterations.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS16:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
    add.d           a0, a0, a1              // next dst row
    add.d           a2, a2, a3              // next src row
    addi.w          a4, a4, -1
    bnez            a4, .LOOP_PIXELS16
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 16 pixels wide (LASX): two rows per
 * iteration (one per 128-bit lane), a4 / 2 iterations.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx
    LOAD_VAR 256
    srli.w          t0, a4, 1               // t0 = number of row pairs
.LOOP_PIXELS16_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 16
    alsl.d          a0, a1, a0, 1           // dst += 2 * dst stride
    alsl.d          a2, a3, a2, 1           // src += 2 * src stride
    addi.w          t0, t0, -1
    bnez            t0, .LOOP_PIXELS16_LASX
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 24 pixels wide (LSX): each row is handled as
 * a 16-pixel piece followed by an 8-pixel piece at byte offset 16.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS24:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0      // columns 0..15
    addi.d          t1, a0, 16
    addi.d          t0, a2, 16
    HEVC_PEL_UNI_W_PIXELS8_LSX t0, t1, 8    // columns 16..23
    add.d           a0, a0, a1              // next dst row
    add.d           a2, a2, a3              // next src row
    addi.w          a4, a4, -1
    bnez            a4, .LOOP_PIXELS24
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 24 pixels wide (LASX): one row per
 * iteration. The row kernel loads 32 bytes and stores 24.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS24_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 24
    add.d           a0, a0, a1              // next dst row
    add.d           a2, a2, a3              // next src row
    addi.w          a4, a4, -1
    bnez            a4, .LOOP_PIXELS24_LASX
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 32 pixels wide (LSX): each row is handled as
 * two 16-pixel pieces.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS32:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0      // columns 0..15
    addi.d          t1, a0, 16
    addi.d          t0, a2, 16
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1      // columns 16..31
    add.d           a0, a0, a1              // next dst row
    add.d           a2, a2, a3              // next src row
    addi.w          a4, a4, -1
    bnez            a4, .LOOP_PIXELS32
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 32 pixels wide (LASX): one full 256-bit
 * vector per row, a4 (height) iterations.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS32_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
    add.d           a0, a0, a1              // next dst row
    add.d           a2, a2, a3              // next src row
    addi.w          a4, a4, -1
    bnez            a4, .LOOP_PIXELS32_LASX
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 48 pixels wide (LSX): each row is handled as
 * three 16-pixel pieces.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS48:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0      // columns 0..15
    addi.d          t1, a0, 16
    addi.d          t0, a2, 16
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1      // columns 16..31
    addi.d          t1, a0, 32
    addi.d          t0, a2, 32
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1      // columns 32..47
    add.d           a0, a0, a1              // next dst row
    add.d           a2, a2, a3              // next src row
    addi.w          a4, a4, -1
    bnez            a4, .LOOP_PIXELS48
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 48 pixels wide (LASX): each row is handled
 * as a 32-pixel piece followed by a 16-pixel piece at byte offset 32.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS48_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32 // columns 0..31
    addi.d          t1, a0, 32
    addi.d          t0, a2, 32
    HEVC_PEL_UNI_W_PIXELS16_LASX t0, t1     // columns 32..47
    add.d           a0, a0, a1              // next dst row
    add.d           a2, a2, a3              // next src row
    addi.w          a4, a4, -1
    bnez            a4, .LOOP_PIXELS48_LASX
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 64 pixels wide (LSX): each row is handled as
 * four 16-pixel pieces.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx
    LOAD_VAR 128
.LOOP_PIXELS64:
    HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0      // columns 0..15
    addi.d          t1, a0, 16
    addi.d          t0, a2, 16
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1      // columns 16..31
    addi.d          t1, a0, 32
    addi.d          t0, a2, 32
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1      // columns 32..47
    addi.d          t1, a0, 48
    addi.d          t0, a2, 48
    HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1      // columns 48..63
    add.d           a0, a0, a1              // next dst row
    add.d           a2, a2, a3              // next src row
    addi.w          a4, a4, -1
    bnez            a4, .LOOP_PIXELS64
endfunc
|
|
|
|
/*
 * Weighted uni-prediction copy, 64 pixels wide (LASX): each row is handled
 * as two 32-pixel pieces.
 */
function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx
    LOAD_VAR 256
.LOOP_PIXELS64_LASX:
    HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32 // columns 0..31
    addi.d          t1, a0, 32
    addi.d          t0, a2, 32
    HEVC_PEL_UNI_W_PIXELS32_LASX t0, t1, 32 // columns 32..63
    add.d           a0, a0, a1              // next dst row
    add.d           a2, a2, a3              // next src row
    addi.w          a4, a4, -1
    bnez            a4, .LOOP_PIXELS64_LASX
endfunc
|
|
|
|
/*
 * vhaddw.d.h in0
 *
 * In-place horizontal add of halfwords, done in two widening steps:
 * adjacent halfwords are summed into words, then adjacent words into
 * doublewords. Each 64-bit element of in0 ends up holding the sum of the
 * four halfwords it contained.
 */
.macro vhaddw.d.h in0
    vhaddw.w.h      \in0, \in0, \in0        // halfword pairs -> words
    vhaddw.d.w      \in0, \in0, \in0        // word pairs -> doublewords
.endm
|
|
|
|
/*
 * xvhaddw.d.h in0
 *
 * LASX counterpart of vhaddw.d.h: in-place horizontal add of halfwords in
 * two widening steps, leaving in each 64-bit element of in0 the sum of the
 * four halfwords it contained.
 */
.macro xvhaddw.d.h in0
    xvhaddw.w.h     \in0, \in0, \in0        // halfword pairs -> words
    xvhaddw.d.w     \in0, \in0, \in0        // word pairs -> doublewords
.endm
|
|
|
|
/*
|
|
* void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
|
|
* const uint8_t *_src, ptrdiff_t _srcstride,
|
|
* int height, int denom, int wx, int ox,
|
|
* intptr_t mx, intptr_t my, int width)
|
|
*/
|
|
/*
 * Vertical 8-tap qpel filter with weighting, 4 pixels wide (LSX).
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
 * my is read from the stack at sp + 8.
 *
 * The first seven source rows are transposed once so that every pixel column
 * sits in 8 consecutive bytes (vr8 = columns 0/1, vr9 = columns 2/3). Each
 * iteration loads two more rows, produces two output rows and slides the
 * column windows down by two rows.
 */
function ff_hevc_put_hevc_qpel_uni_w_v4_8_lsx
    LOAD_VAR 128
    la.local        t1, ff_hevc_qpel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4               // (my - 1) * 16 bytes per filter
    vldx            vr5, t1, t0             // filter taps
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    add.d           t2, t1, a3              // stride * 4
    sub.d           a2, a2, t1              // src -= stride * 3

    /* rows 0..6, 4 bytes each */
    fld.s           f6, a2, 0               // row 0
    fldx.s          f7, a2, a3              // row 1
    fldx.s          f8, a2, t0              // row 2
    add.d           a2, a2, t1
    fld.s           f9, a2, 0               // row 3
    fldx.s          f10, a2, a3             // row 4
    fldx.s          f11, a2, t0             // row 5
    fldx.s          f12, a2, t1             // row 6
    add.d           a2, a2, t2              // a2 -> row 7

    /* transpose rows to columns; the slot of row 7 is filled in the loop */
    vilvl.b         vr6, vr7, vr6
    vilvl.b         vr7, vr9, vr8
    vilvl.b         vr8, vr11, vr10
    vilvl.b         vr9, vr13, vr12
    vilvl.h         vr6, vr7, vr6
    vilvl.h         vr7, vr9, vr8
    vilvl.w         vr8, vr7, vr6           // columns 0 and 1
    vilvh.w         vr9, vr7, vr6           // columns 2 and 3
.LOOP_V4:
    fld.s           f13, a2, 0              // row 7 of the current window
    fldx.s          f14, a2, a3             // row 8, used by the second output row
    add.d           a2, a2, t0              // src += 2 rows

    /* window for output row n: append the new row as the 8th tap input */
    vextrins.b      vr8, vr13, 0x70
    vextrins.b      vr8, vr13, 0xf1
    vextrins.b      vr9, vr13, 0x72
    vextrins.b      vr9, vr13, 0xf3
    /* window for output row n + 1: shift by one row, append the next row */
    vbsrl.v         vr10, vr8, 1
    vbsrl.v         vr11, vr9, 1
    vextrins.b      vr10, vr14, 0x70
    vextrins.b      vr10, vr14, 0xf1
    vextrins.b      vr11, vr14, 0x72
    vextrins.b      vr11, vr14, 0xf3

    /* QPEL_FILTER(src, stride): multiply by the taps, then sum per column */
    vdp2.h.bu.b     vr6, vr8, vr5
    vdp2.h.bu.b     vr7, vr9, vr5
    vdp2.h.bu.b     vr12, vr10, vr5
    vdp2.h.bu.b     vr13, vr11, vr5
    /* keep the last seven rows for the next iteration */
    vbsrl.v         vr8, vr10, 1
    vbsrl.v         vr9, vr11, 1
    vhaddw.d.h      vr6
    vhaddw.d.h      vr7
    vhaddw.d.h      vr12
    vhaddw.d.h      vr13
    vpickev.w       vr6, vr7, vr6           // output row n, 4 sums
    vpickev.w       vr12, vr13, vr12        // output row n + 1, 4 sums

    /* weighting: ((sum * wx + offset) >> shift) + ox */
    vmulwev.w.h     vr6, vr6, vr1
    vmulwev.w.h     vr12, vr12, vr1
    vadd.w          vr6, vr6, vr2
    vsra.w          vr6, vr6, vr3
    vadd.w          vr6, vr6, vr4
    vadd.w          vr12, vr12, vr2
    vsra.w          vr12, vr12, vr3
    vadd.w          vr12, vr12, vr4

    /* saturating narrow and store two rows of 4 bytes */
    vssrani.h.w     vr12, vr6, 0
    vssrani.bu.h    vr12, vr12, 0
    fst.s           f12, a0, 0
    add.d           a0, a0, a1
    vstelm.w        vr12, a0, 0, 1
    add.d           a0, a0, a1

    addi.d          a4, a4, -2              // two rows done
    bnez            a4, .LOOP_V4
endfunc
|
|
|
|
/*
 * Vertical 8-tap qpel filter with weighting, 6 pixels wide (LSX).
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
 * my is read from the stack at sp + 8.
 *
 * Seven rows of 8 bytes are transposed once into column form (vr6 = columns
 * 0/1, vr7 = columns 2/3, vr8 = columns 4/5). Each iteration loads one new
 * row, produces one output row of 6 bytes and slides the windows by one row.
 */
function ff_hevc_put_hevc_qpel_uni_w_v6_8_lsx
    LOAD_VAR 128
    la.local        t1, ff_hevc_qpel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4               // (my - 1) * 16 bytes per filter
    vldx            vr5, t1, t0             // filter taps
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    add.d           t2, t1, a3              // stride * 4
    sub.d           a2, a2, t1              // src -= stride * 3

    /* rows 0..6, 8 bytes each */
    fld.d           f6, a2, 0
    fldx.d          f7, a2, a3
    fldx.d          f8, a2, t0
    add.d           a2, a2, t1
    fld.d           f9, a2, 0
    fldx.d          f10, a2, a3
    fldx.d          f11, a2, t0
    fldx.d          f12, a2, t1
    add.d           a2, a2, t2              // a2 -> row 7

    /* transpose 8x6 to 3x16; the slot of row 7 is filled in the loop */
    vilvl.b         vr6, vr7, vr6
    vilvl.b         vr7, vr9, vr8
    vilvl.b         vr8, vr11, vr10
    vilvl.b         vr9, vr13, vr12
    vilvl.h         vr10, vr7, vr6
    vilvh.h         vr11, vr7, vr6
    vilvl.h         vr12, vr9, vr8
    vilvh.h         vr13, vr9, vr8
    vilvl.w         vr6, vr12, vr10         // columns 0 and 1
    vilvh.w         vr7, vr12, vr10         // columns 2 and 3
    vilvl.w         vr8, vr13, vr11         // columns 4 and 5
.LOOP_V6:
    fld.d           f13, a2, 0              // newest row
    add.d           a2, a2, a3

    /* append the newest row as the 8th input of every column */
    vextrins.b      vr6, vr13, 0x70
    vextrins.b      vr6, vr13, 0xf1
    vextrins.b      vr7, vr13, 0x72
    vextrins.b      vr7, vr13, 0xf3
    vextrins.b      vr8, vr13, 0x74
    vextrins.b      vr8, vr13, 0xf5

    /* QPEL_FILTER(src, stride) */
    vdp2.h.bu.b     vr10, vr6, vr5
    vdp2.h.bu.b     vr11, vr7, vr5
    vdp2.h.bu.b     vr12, vr8, vr5
    /* drop the oldest row so the next iteration only inserts one row */
    vbsrl.v         vr6, vr6, 1
    vbsrl.v         vr7, vr7, 1
    vbsrl.v         vr8, vr8, 1
    vhaddw.d.h      vr10
    vhaddw.d.h      vr11
    vhaddw.d.h      vr12
    vpickev.w       vr10, vr11, vr10        // sums of columns 0..3
    vpickev.w       vr11, vr13, vr12        // low two words: sums of columns 4..5

    /* weighting: ((sum * wx + offset) >> shift) + ox */
    vmulwev.w.h     vr10, vr10, vr1
    vmulwev.w.h     vr11, vr11, vr1
    vadd.w          vr10, vr10, vr2
    vadd.w          vr11, vr11, vr2
    vsra.w          vr10, vr10, vr3
    vsra.w          vr11, vr11, vr3
    vadd.w          vr10, vr10, vr4
    vadd.w          vr11, vr11, vr4

    /* saturating narrow and store 6 bytes */
    vssrani.h.w     vr11, vr10, 0
    vssrani.bu.h    vr11, vr11, 0
    fst.s           f11, a0, 0              // bytes 0..3
    vstelm.h        vr11, a0, 4, 2          // bytes 4..5
    add.d           a0, a0, a1

    addi.d          a4, a4, -1
    bnez            a4, .LOOP_V6
endfunc
|
|
|
|
/*
 * TRANSPOSE8X8B_LSX in0..in7, out0..out3
 *
 * Transpose an 8x8 byte block into 4 vectors of 16 bytes. The low 8 bytes of
 * in0..in7 are rows 0..7; on return out0 holds columns 0 and 1 (8 bytes
 * each), out1 columns 2 and 3, out2 columns 4 and 5, out3 columns 6 and 7.
 * All of in0..in7 are overwritten. The callers in this file pass in0..in3
 * as out0..out3, which works because the outputs are written last.
 */
.macro TRANSPOSE8X8B_LSX in0, in1, in2, in3, in4, in5, in6, in7, \
                         out0, out1, out2, out3
    /* interleave bytes of row pairs */
    vilvl.b         \in0, \in1, \in0        // rows 0/1
    vilvl.b         \in1, \in3, \in2        // rows 2/3
    vilvl.b         \in2, \in5, \in4        // rows 4/5
    vilvl.b         \in3, \in7, \in6        // rows 6/7
    /* interleave halfwords: four rows per column group */
    vilvl.h         \in4, \in1, \in0        // rows 0..3, columns 0..3
    vilvh.h         \in5, \in1, \in0        // rows 0..3, columns 4..7
    vilvl.h         \in6, \in3, \in2        // rows 4..7, columns 0..3
    vilvh.h         \in7, \in3, \in2        // rows 4..7, columns 4..7
    /* interleave words: complete 8-byte columns */
    vilvl.w         \out0, \in6, \in4
    vilvh.w         \out1, \in6, \in4
    vilvl.w         \out2, \in7, \in5
    vilvh.w         \out3, \in7, \in5
.endm
|
|
|
|
/*
 * PUT_HEVC_QPEL_UNI_W_V8_LSX in0, in1, in2, in3, out0, out1, pos
 *
 * One output row of the weighted vertical qpel filter for 8 pixel columns.
 * in0..in3 hold the column windows (two columns of 8 bytes per register, as
 * produced by TRANSPOSE8X8B_LSX) with the last slot of each column free.
 * vr13 holds the newly loaded row; pos == 0 takes its bytes 0..7, any other
 * value its bytes 8..15.
 * On return out0 / out1 hold the weighted 32-bit results for pixels 0..3 /
 * 4..7 (not yet narrowed), and in0..in3 have been shifted by one byte so the
 * next call only needs a new row in vr13.
 * Uses vr5 (filter taps) and vr1..vr4 (see LOAD_VAR 128).
 * Clobbers: vr12, vr20.
 */
.macro PUT_HEVC_QPEL_UNI_W_V8_LSX in0, in1, in2, in3, out0, out1, pos
.if \pos == 0
    /* insert the 8th row, taken from bytes 0..7 of vr13 */
    vextrins.b      \in0, vr13, 0x70
    vextrins.b      \in0, vr13, 0xf1
    vextrins.b      \in1, vr13, 0x72
    vextrins.b      \in1, vr13, 0xf3
    vextrins.b      \in2, vr13, 0x74
    vextrins.b      \in2, vr13, 0xf5
    vextrins.b      \in3, vr13, 0x76
    vextrins.b      \in3, vr13, 0xf7
.else// \pos == 8
    /* insert the 8th row, taken from bytes 8..15 of vr13 */
    vextrins.b      \in0, vr13, 0x78
    vextrins.b      \in0, vr13, 0xf9
    vextrins.b      \in1, vr13, 0x7a
    vextrins.b      \in1, vr13, 0xfb
    vextrins.b      \in2, vr13, 0x7c
    vextrins.b      \in2, vr13, 0xfd
    vextrins.b      \in3, vr13, 0x7e
    vextrins.b      \in3, vr13, 0xff
.endif
    /* QPEL_FILTER(src, stride) */
    vdp2.h.bu.b     \out0, \in0, vr5
    vdp2.h.bu.b     \out1, \in1, vr5
    vdp2.h.bu.b     vr12, \in2, vr5
    vdp2.h.bu.b     vr20, \in3, vr5
    /* keep the previous 7 rows; the next call only inserts the 8th */
    vbsrl.v         \in0, \in0, 1
    vbsrl.v         \in1, \in1, 1
    vbsrl.v         \in2, \in2, 1
    vbsrl.v         \in3, \in3, 1
    /* sum the partial products of every column */
    vhaddw.d.h      \out0
    vhaddw.d.h      \out1
    vhaddw.d.h      vr12
    vhaddw.d.h      vr20
    vpickev.w       \out0, \out1, \out0     // pixels 0..3
    vpickev.w       \out1, vr20, vr12       // pixels 4..7
    /* weighting: ((sum * wx + offset) >> shift) + ox */
    vmulwev.w.h     \out0, \out0, vr1
    vmulwev.w.h     \out1, \out1, vr1
    vadd.w          \out0, \out0, vr2
    vadd.w          \out1, \out1, vr2
    vsra.w          \out0, \out0, vr3
    vsra.w          \out1, \out1, vr3
    vadd.w          \out0, \out0, vr4
    vadd.w          \out1, \out1, vr4
.endm
|
|
|
|
/*
 * Vertical 8-tap qpel filter with weighting, 8 pixels wide (LSX).
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
 * my is read from the stack at sp + 8.
 * Seven rows are transposed up front; each iteration loads one row and
 * stores one output row of 8 bytes.
 */
function ff_hevc_put_hevc_qpel_uni_w_v8_8_lsx
    LOAD_VAR 128
    la.local        t1, ff_hevc_qpel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4               // (my - 1) * 16 bytes per filter
    vldx            vr5, t1, t0             // filter taps
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    add.d           t2, t1, a3              // stride * 4
    sub.d           a2, a2, t1              // src -= stride * 3

    /* rows 0..6, 8 bytes each */
    fld.d           f6, a2, 0
    fldx.d          f7, a2, a3
    fldx.d          f8, a2, t0
    add.d           a2, a2, t1
    fld.d           f9, a2, 0
    fldx.d          f10, a2, a3
    fldx.d          f11, a2, t0
    fldx.d          f12, a2, t1
    add.d           a2, a2, t2              // a2 -> row 7
    TRANSPOSE8X8B_LSX vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13, \
                      vr6, vr7, vr8, vr9
.LOOP_V8:
    fld.d           f13, a2, 0              // the 8th row of the window
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_V8_LSX vr6, vr7, vr8, vr9, vr10, vr11, 0
    /* saturating narrow and store 8 bytes */
    vssrani.h.w     vr11, vr10, 0
    vssrani.bu.h    vr11, vr11, 0
    fst.d           f11, a0, 0
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_V8
endfunc
|
|
|
|
/*
 * PUT_HEVC_UNI_W_V8_LASX w
 *
 * Complete column pass of the weighted vertical qpel filter for 8 pixel
 * columns (LASX). Expects on entry:
 *   a2 = src - 3 * stride, a3 = stride, t0/t1/t2 = stride * 2/3/4,
 *   a0 = dst, a1 = dst stride, a4 = row count,
 *   xr5 = filter taps in both 128-bit lanes, xr1..xr4 from LOAD_VAR 256.
 * w is only used to make the loop label unique per expansion.
 * Clobbers: a0, a2, a4, xr6..xr14, xr20, xr21.
 */
.macro PUT_HEVC_UNI_W_V8_LASX w
    /* rows 0..6, 8 bytes each */
    fld.d           f6, a2, 0
    fldx.d          f7, a2, a3
    fldx.d          f8, a2, t0
    add.d           a2, a2, t1
    fld.d           f9, a2, 0
    fldx.d          f10, a2, a3
    fldx.d          f11, a2, t0
    fldx.d          f12, a2, t1
    add.d           a2, a2, t2              // a2 -> row 7
    TRANSPOSE8X8B_LSX vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13, \
                      vr6, vr7, vr8, vr9
    /* pack the four column registers into two 256-bit registers */
    xvpermi.q       xr6, xr7, 0x02          // columns 0 1 | 2 3
    xvpermi.q       xr8, xr9, 0x02          // columns 4 5 | 6 7
.LOOP_V8_LASX_\w:
    fld.d           f13, a2, 0              // 0 1 2 3 4 5 6 7, the 8th row
    add.d           a2, a2, a3
    /* arrange the new row to match the lane layout of xr6 / xr8 */
    vshuf4i.h       vr13, vr13, 0xd8
    vbsrl.v         vr14, vr13, 4
    xvpermi.q       xr13, xr14, 0x02        // 0 1 4 5 * * * * 2 3 6 7 * * * *
    /* insert the 8th row */
    xvextrins.b     xr6, xr13, 0x70
    xvextrins.b     xr6, xr13, 0xf1
    xvextrins.b     xr8, xr13, 0x72
    xvextrins.b     xr8, xr13, 0xf3
    /* QPEL_FILTER(src, stride) */
    xvdp2.h.bu.b    xr20, xr6, xr5
    xvdp2.h.bu.b    xr21, xr8, xr5
    /* drop the oldest row for the next iteration */
    xvbsrl.v        xr6, xr6, 1
    xvbsrl.v        xr8, xr8, 1
    xvhaddw.d.h     xr20
    xvhaddw.d.h     xr21
    xvpickev.w      xr20, xr21, xr20
    xvpermi.d       xr20, xr20, 0xd8        // restore pixel order 0..7
    /* weighting: ((sum * wx + offset) >> shift) + ox */
    xvmulwev.w.h    xr20, xr20, xr1
    xvadd.w         xr20, xr20, xr2
    xvsra.w         xr20, xr20, xr3
    xvadd.w         xr10, xr20, xr4
    /* saturating narrow and store 8 bytes */
    xvpermi.q       xr11, xr10, 0x01
    vssrani.h.w     vr11, vr10, 0
    vssrani.bu.h    vr11, vr11, 0
    fst.d           f11, a0, 0
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_V8_LASX_\w
.endm
|
|
|
|
/*
 * Vertical 8-tap qpel filter with weighting, 8 pixels wide (LASX).
 * Sets up the constants, the filter and the stride multiples, then runs a
 * single 8-column pass.
 */
function ff_hevc_put_hevc_qpel_uni_w_v8_8_lasx
    LOAD_VAR 256
    la.local        t1, ff_hevc_qpel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4               // (my - 1) * 16 bytes per filter
    vldx            vr5, t1, t0             // filter taps
    xvreplve0.q     xr5, xr5                // copy the taps to both lanes
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    add.d           t2, t1, a3              // stride * 4
    sub.d           a2, a2, t1              // src -= stride * 3
    PUT_HEVC_UNI_W_V8_LASX 8
endfunc
|
|
|
|
/*
 * PUT_HEVC_QPEL_UNI_W_V16_LSX w
 *
 * Complete column pass of the weighted vertical qpel filter for a strip of
 * up to 16 pixel columns (LSX). Expects on entry:
 *   a2 = src - 3 * stride, a3 = stride, t0/t1/t2 = stride * 2/3/4,
 *   a0 = dst, a1 = dst stride, a4 = row count,
 *   vr5 = filter taps, vr1..vr4 from LOAD_VAR 128.
 * w selects the work per row: w == 8 filters and stores 8 bytes, w == 12
 * filters 16 columns and stores 12 bytes, any other value filters and stores
 * 16 bytes. w also makes the loop label unique per expansion. Rows are
 * always read with 16-byte loads, including for w == 8.
 * Clobbers: a0, a2, a4 and vr6..vr21.
 */
.macro PUT_HEVC_QPEL_UNI_W_V16_LSX w
    /* rows 0..6, 16 bytes each */
    vld             vr6, a2, 0
    vldx            vr7, a2, a3
    vldx            vr8, a2, t0
    add.d           a2, a2, t1
    vld             vr9, a2, 0
    vldx            vr10, a2, a3
    vldx            vr11, a2, t0
    vldx            vr12, a2, t1
    add.d           a2, a2, t2              // a2 -> row 7
.if \w > 8
    /* move bytes 8..15 of every row into the low half of vr14..vr20 */
    vilvh.d         vr14, vr14, vr6
    vilvh.d         vr15, vr15, vr7
    vilvh.d         vr16, vr16, vr8
    vilvh.d         vr17, vr17, vr9
    vilvh.d         vr18, vr18, vr10
    vilvh.d         vr19, vr19, vr11
    vilvh.d         vr20, vr20, vr12
.endif
    /* columns 0..7 -> vr6..vr9 */
    TRANSPOSE8X8B_LSX vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13, \
                      vr6, vr7, vr8, vr9
.if \w > 8
    /* columns 8..15 -> vr14..vr17 */
    TRANSPOSE8X8B_LSX vr14, vr15, vr16, vr17, vr18, vr19, vr20, vr21, \
                      vr14, vr15, vr16, vr17
.endif
.LOOP_HORI_16_\w:
    vld             vr13, a2, 0             // the 8th row of the window
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_V8_LSX vr6, vr7, vr8, vr9, vr10, vr11, 0
.if \w > 8
    PUT_HEVC_QPEL_UNI_W_V8_LSX vr14, vr15, vr16, vr17, vr18, vr19, 8
.endif
    /* saturating narrow: words -> halfwords -> bytes */
    vssrani.h.w     vr11, vr10, 0
.if \w > 8
    vssrani.h.w     vr19, vr18, 0
    vssrani.bu.h    vr19, vr11, 0           // 16 result bytes in vr19
.else
    vssrani.bu.h    vr11, vr11, 0           // 8 result bytes in vr11
.endif
.if \w == 8
    fst.d           f11, a0, 0
.elseif \w == 12
    fst.d           f19, a0, 0              // bytes 0..7
    vstelm.w        vr19, a0, 8, 2          // bytes 8..11
.else
    vst             vr19, a0, 0
.endif
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_HORI_16_\w
.endm
|
|
|
|
/*
 * Vertical 8-tap qpel filter with weighting, 16 pixels wide (LSX).
 * Sets up the constants, the filter and the stride multiples, then runs a
 * single 16-column pass.
 */
function ff_hevc_put_hevc_qpel_uni_w_v16_8_lsx
    LOAD_VAR 128
    la.local        t1, ff_hevc_qpel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4               // (my - 1) * 16 bytes per filter
    vldx            vr5, t1, t0             // filter taps
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    add.d           t2, t1, a3              // stride * 4
    sub.d           a2, a2, t1              // src -= stride * 3
    PUT_HEVC_QPEL_UNI_W_V16_LSX 16
endfunc
|
|
|
|
/*
 * PUT_HEVC_QPEL_UNI_W_V16_LASX w
 *
 * Complete column pass of the weighted vertical qpel filter for a strip of
 * 16 pixel columns (LASX). Expects on entry:
 *   a2 = src - 3 * stride, a3 = stride, t0/t1/t2 = stride * 2/3/4,
 *   a0 = dst, a1 = dst stride, a4 = row count,
 *   xr5 = filter taps in both 128-bit lanes, xr1..xr4 from LOAD_VAR 256.
 * w == 12 stores 12 bytes per row, any other value stores 16. w also makes
 * the loop label unique per expansion.
 * Clobbers: a0, a2, a4 and xr6..xr17.
 */
.macro PUT_HEVC_QPEL_UNI_W_V16_LASX w
    /* rows 0..6, 16 bytes each */
    vld             vr6, a2, 0
    vldx            vr7, a2, a3
    vldx            vr8, a2, t0
    add.d           a2, a2, t1
    vld             vr9, a2, 0
    vldx            vr10, a2, a3
    vldx            vr11, a2, t0
    vldx            vr12, a2, t1
    add.d           a2, a2, t2              // a2 -> row 7

    /* pack and transpose the 8x16 block into 4x32 */
    xvpermi.q       xr6, xr10, 0x02         // row 0 | row 4
    xvpermi.q       xr7, xr11, 0x02         // row 1 | row 5
    xvpermi.q       xr8, xr12, 0x02         // row 2 | row 6
    xvpermi.q       xr9, xr13, 0x02         // row 3 | row 7 slot (filled per iteration)
    xvilvl.b        xr14, xr7, xr6          //0 2
    xvilvh.b        xr15, xr7, xr6          //1 3
    xvilvl.b        xr16, xr9, xr8          //0 2
    xvilvh.b        xr17, xr9, xr8          //1 3
    xvpermi.d       xr14, xr14, 0xd8
    xvpermi.d       xr15, xr15, 0xd8
    xvpermi.d       xr16, xr16, 0xd8
    xvpermi.d       xr17, xr17, 0xd8
    xvilvl.h        xr6, xr16, xr14
    xvilvh.h        xr7, xr16, xr14
    xvilvl.h        xr8, xr17, xr15
    xvilvh.h        xr9, xr17, xr15
    xvilvl.w        xr14, xr7, xr6          //0 1 4 5
    xvilvh.w        xr15, xr7, xr6          //2 3 6 7
    xvilvl.w        xr16, xr9, xr8          //8 9 12 13
    xvilvh.w        xr17, xr9, xr8          //10 11 14 15
.LOOP_HORI_16_LASX_\w:
    vld             vr13, a2, 0             // the 8th row of the window
    add.d           a2, a2, a3
    /* arrange the new row to match the lane layout of xr14..xr17 */
    vshuf4i.w       vr13, vr13, 0xd8
    vbsrl.v         vr12, vr13, 8
    xvpermi.q       xr13, xr12, 0x02
    /* insert the 8th row */
    xvextrins.b     xr14, xr13, 0x70
    xvextrins.b     xr14, xr13, 0xf1
    xvextrins.b     xr15, xr13, 0x72
    xvextrins.b     xr15, xr13, 0xf3
    xvextrins.b     xr16, xr13, 0x74
    xvextrins.b     xr16, xr13, 0xf5
    xvextrins.b     xr17, xr13, 0x76
    xvextrins.b     xr17, xr13, 0xf7
    /* QPEL_FILTER(src, stride) */
    xvdp2.h.bu.b    xr6, xr14, xr5
    xvdp2.h.bu.b    xr7, xr15, xr5
    xvdp2.h.bu.b    xr8, xr16, xr5
    xvdp2.h.bu.b    xr9, xr17, xr5
    xvhaddw.d.h     xr6
    xvhaddw.d.h     xr7
    xvhaddw.d.h     xr8
    xvhaddw.d.h     xr9
    /* keep the previous 7 rows; the next iteration only inserts the 8th */
    xvbsrl.v        xr14, xr14, 1
    xvbsrl.v        xr15, xr15, 1
    xvbsrl.v        xr16, xr16, 1
    xvbsrl.v        xr17, xr17, 1
    xvpickev.w      xr6, xr7, xr6           //0 1 2 3 4 5 6 7
    xvpickev.w      xr7, xr9, xr8           //8 9 10 11 12 13 14 15
    /* weighting: ((sum * wx + offset) >> shift) + ox */
    xvmulwev.w.h    xr6, xr6, xr1
    xvmulwev.w.h    xr7, xr7, xr1
    xvadd.w         xr6, xr6, xr2
    xvadd.w         xr7, xr7, xr2
    xvsra.w         xr6, xr6, xr3
    xvsra.w         xr7, xr7, xr3
    xvadd.w         xr6, xr6, xr4
    xvadd.w         xr7, xr7, xr4
    /* saturating narrow, then restore pixel order in the low 128 bits */
    xvssrani.h.w    xr7, xr6, 0             //0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15
    xvpermi.q       xr6, xr7, 0x01
    vssrani.bu.h    vr6, vr7, 0
    vshuf4i.w       vr6, vr6, 0xd8
.if \w == 12
    fst.d           f6, a0, 0               // bytes 0..7
    vstelm.w        vr6, a0, 8, 2           // bytes 8..11
.else
    vst             vr6, a0, 0
.endif
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_HORI_16_LASX_\w
.endm
|
|
|
|
/*
 * Vertical 8-tap qpel filter with weighting, 16 pixels wide (LASX).
 * Sets up the constants, the filter and the stride multiples, then runs a
 * single 16-column pass.
 */
function ff_hevc_put_hevc_qpel_uni_w_v16_8_lasx
    LOAD_VAR 256
    la.local        t1, ff_hevc_qpel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4               // (my - 1) * 16 bytes per filter
    vldx            vr5, t1, t0             // filter taps
    xvreplve0.q     xr5, xr5                // copy the taps to both lanes
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    add.d           t2, t1, a3              // stride * 4
    sub.d           a2, a2, t1              // src -= stride * 3
    PUT_HEVC_QPEL_UNI_W_V16_LASX 16
endfunc
|
|
|
|
/*
 * Vertical 8-tap qpel filter with weighting, 12 pixels wide (LSX).
 * Runs the 16-column pass in its 12-byte store mode.
 */
function ff_hevc_put_hevc_qpel_uni_w_v12_8_lsx
    LOAD_VAR 128
    la.local        t1, ff_hevc_qpel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4               // (my - 1) * 16 bytes per filter
    vldx            vr5, t1, t0             // filter taps
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    add.d           t2, t1, a3              // stride * 4
    sub.d           a2, a2, t1              // src -= stride * 3
    PUT_HEVC_QPEL_UNI_W_V16_LSX 12
endfunc
|
|
|
|
/*
 * Vertical 8-tap qpel filter with weighting, 12 pixels wide (LASX).
 * Runs the 16-column pass in its 12-byte store mode.
 */
function ff_hevc_put_hevc_qpel_uni_w_v12_8_lasx
    LOAD_VAR 256
    la.local        t1, ff_hevc_qpel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4               // (my - 1) * 16 bytes per filter
    vldx            vr5, t1, t0             // filter taps
    xvreplve0.q     xr5, xr5                // copy the taps to both lanes
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    add.d           t2, t1, a3              // stride * 4
    sub.d           a2, a2, t1              // src -= stride * 3
    PUT_HEVC_QPEL_UNI_W_V16_LASX 12
endfunc
|
|
|
|
/*
 * Vertical 8-tap qpel filter with weighting, 24 pixels wide (LSX).
 * Two column passes: columns 0..15, then columns 16..23. dst, src and the
 * row count are saved in t4..t6 because each pass consumes a0, a2 and a4.
 */
function ff_hevc_put_hevc_qpel_uni_w_v24_8_lsx
    LOAD_VAR 128
    la.local        t1, ff_hevc_qpel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4               // (my - 1) * 16 bytes per filter
    vldx            vr5, t1, t0             // filter taps
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    add.d           t2, t1, a3              // stride * 4
    sub.d           a2, a2, t1              // src -= stride * 3
    addi.d          t4, a0, 0               // save dst
    addi.d          t5, a2, 0               // save src
    addi.d          t6, a4, 0               // save row count
    PUT_HEVC_QPEL_UNI_W_V16_LSX 24          // columns 0..15
    /* restart at the top of the block, 16 bytes to the right */
    addi.d          a0, t4, 16
    addi.d          a2, t5, 16
    addi.d          a4, t6, 0
    PUT_HEVC_QPEL_UNI_W_V16_LSX 8           // columns 16..23
endfunc
|
|
|
|
/*
 * Vertical 8-tap qpel filter with weighting, 24 pixels wide (LASX).
 * Two column passes: columns 0..15 with the 16-column macro, then columns
 * 16..23 with the 8-column macro. dst, src and the row count are saved in
 * t4..t6 because each pass consumes a0, a2 and a4.
 */
function ff_hevc_put_hevc_qpel_uni_w_v24_8_lasx
    LOAD_VAR 256
    la.local        t1, ff_hevc_qpel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4               // (my - 1) * 16 bytes per filter
    vldx            vr5, t1, t0             // filter taps
    xvreplve0.q     xr5, xr5                // copy the taps to both lanes
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    add.d           t2, t1, a3              // stride * 4
    sub.d           a2, a2, t1              // src -= stride * 3
    addi.d          t4, a0, 0               // save dst
    addi.d          t5, a2, 0               // save src
    addi.d          t6, a4, 0               // save row count
    PUT_HEVC_QPEL_UNI_W_V16_LASX 24         // columns 0..15
    /* restart at the top of the block, 16 bytes to the right */
    addi.d          a0, t4, 16
    addi.d          a2, t5, 16
    addi.d          a4, t6, 0
    PUT_HEVC_UNI_W_V8_LASX 24               // columns 16..23
endfunc
|
|
|
|
/*
 * Vertical 8-tap qpel filter with weighting, 32 pixels wide (LSX).
 * Runs the 16-column pass twice (t3 counts the passes). The saved dst/src in
 * t4/t5 are not advanced between passes; this is sufficient here because
 * only the second pass starts from them (at +16).
 */
function ff_hevc_put_hevc_qpel_uni_w_v32_8_lsx
    LOAD_VAR 128
    la.local        t1, ff_hevc_qpel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4               // (my - 1) * 16 bytes per filter
    vldx            vr5, t1, t0             // filter taps
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    add.d           t2, t1, a3              // stride * 4
    sub.d           a2, a2, t1              // src -= stride * 3
    addi.d          t3, zero, 2             // number of 16-column passes
    addi.d          t4, a0, 0               // save dst
    addi.d          t5, a2, 0               // save src
    addi.d          t6, a4, 0               // save row count
.LOOP_V32:
    PUT_HEVC_QPEL_UNI_W_V16_LSX 32
    addi.d          t3, t3, -1
    /* set up the next strip: top of the block, 16 bytes to the right */
    addi.d          a0, t4, 16
    addi.d          a2, t5, 16
    addi.d          a4, t6, 0
    bnez            t3, .LOOP_V32
endfunc
|
|
|
|
/*
 * Vertical 8-tap qpel filter with weighting, 32 pixels wide (LASX).
 * Runs the 16-column pass twice (t3 counts the passes). The saved dst/src in
 * t4/t5 are not advanced between passes; this is sufficient here because
 * only the second pass starts from them (at +16).
 */
function ff_hevc_put_hevc_qpel_uni_w_v32_8_lasx
    LOAD_VAR 256
    la.local        t1, ff_hevc_qpel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4               // (my - 1) * 16 bytes per filter
    vldx            vr5, t1, t0             // filter taps
    xvreplve0.q     xr5, xr5                // copy the taps to both lanes
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    add.d           t2, t1, a3              // stride * 4
    sub.d           a2, a2, t1              // src -= stride * 3
    addi.d          t3, zero, 2             // number of 16-column passes
    addi.d          t4, a0, 0               // save dst
    addi.d          t5, a2, 0               // save src
    addi.d          t6, a4, 0               // save row count
.LOOP_V32_LASX:
    PUT_HEVC_QPEL_UNI_W_V16_LASX 32
    addi.d          t3, t3, -1
    /* set up the next strip: top of the block, 16 bytes to the right */
    addi.d          a0, t4, 16
    addi.d          a2, t5, 16
    addi.d          a4, t6, 0
    bnez            t3, .LOOP_V32_LASX
endfunc
|
|
|
|
/*
 * Vertical 8-tap qpel filter with weighting, 48 pixels wide (LSX).
 * Runs the 16-column pass three times; t4/t5 track the top of the current
 * strip in dst/src and advance by 16 bytes per pass, t6 keeps the row count.
 */
function ff_hevc_put_hevc_qpel_uni_w_v48_8_lsx
    LOAD_VAR 128
    la.local        t1, ff_hevc_qpel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4               // (my - 1) * 16 bytes per filter
    vldx            vr5, t1, t0             // filter taps
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    add.d           t2, t1, a3              // stride * 4
    sub.d           a2, a2, t1              // src -= stride * 3
    addi.d          t3, zero, 3             // number of 16-column passes
    addi.d          t4, a0, 0               // save dst
    addi.d          t5, a2, 0               // save src
    addi.d          t6, a4, 0               // save row count
.LOOP_V48:
    PUT_HEVC_QPEL_UNI_W_V16_LSX 48
    addi.d          t3, t3, -1
    /* move to the next strip: top of the block, 16 bytes further right */
    addi.d          a0, t4, 16
    addi.d          t4, t4, 16
    addi.d          a2, t5, 16
    addi.d          t5, t5, 16
    addi.d          a4, t6, 0
    bnez            t3, .LOOP_V48
endfunc
|
|
|
|
/*
 * Vertical weighted-uni qpel, width 48, LASX.
 * Register use visible here: a0 = dst, a2 = src, a3 = src stride,
 * a4 = row counter, my is read from sp + 8.
 * Three 16-pixel-wide column passes of PUT_HEVC_QPEL_UNI_W_V16_LASX (defined
 * outside this chunk, as is LOAD_VAR). t4/t5 track the origin of the
 * current column and move right by 16 after every pass.
 */
function ff_hevc_put_hevc_qpel_uni_w_v48_8_lasx
    LOAD_VAR 256
    ld.d            t0, sp, 8                   // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (my - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    xvreplve0.q     xr5, xr5                    // same filter in both 128-bit lanes
    slli.d          t0, a3, 1                   // stride * 2
    add.d           t1, t0, a3                  // stride * 3
    add.d           t2, t1, a3                  // stride * 4
    sub.d           a2, a2, t1                  // src -= stride * 3
    addi.d          t3, zero, 3                 // number of 16-wide passes
    addi.d          t4, a0, 0                   // save dst
    addi.d          t5, a2, 0                   // save src
    addi.d          t6, a4, 0                   // save a4
.LOOP_V48_LASX:
    PUT_HEVC_QPEL_UNI_W_V16_LASX 48
    addi.d          t3, t3, -1
    addi.d          a0, t4, 16                  // dst of the next column
    addi.d          t4, t4, 16
    addi.d          a2, t5, 16                  // src of the next column
    addi.d          t5, t5, 16
    addi.d          a4, t6, 0                   // restore a4
    bnez            t3, .LOOP_V48_LASX
endfunc
|
|
|
|
/*
 * Vertical weighted-uni qpel, width 64, LSX.
 * Register use visible here: a0 = dst, a2 = src, a3 = src stride,
 * a4 = row counter, my is read from sp + 8.
 * Four 16-pixel-wide column passes of PUT_HEVC_QPEL_UNI_W_V16_LSX (defined
 * outside this chunk, as is LOAD_VAR). t4/t5 track the origin of the
 * current column and move right by 16 after every pass.
 */
function ff_hevc_put_hevc_qpel_uni_w_v64_8_lsx
    LOAD_VAR 128
    ld.d            t0, sp, 8                   // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (my - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    slli.d          t0, a3, 1                   // stride * 2
    add.d           t1, t0, a3                  // stride * 3
    add.d           t2, t1, a3                  // stride * 4
    sub.d           a2, a2, t1                  // src -= stride * 3
    addi.d          t3, zero, 4                 // number of 16-wide passes
    addi.d          t4, a0, 0                   // save dst
    addi.d          t5, a2, 0                   // save src
    addi.d          t6, a4, 0                   // save a4
.LOOP_V64:
    PUT_HEVC_QPEL_UNI_W_V16_LSX 64
    addi.d          t3, t3, -1
    addi.d          a0, t4, 16                  // dst of the next column
    addi.d          t4, t4, 16
    addi.d          a2, t5, 16                  // src of the next column
    addi.d          t5, t5, 16
    addi.d          a4, t6, 0                   // restore a4
    bnez            t3, .LOOP_V64
endfunc
|
|
|
|
/*
 * Vertical weighted-uni qpel, width 64, LASX.
 * Register use visible here: a0 = dst, a2 = src, a3 = src stride,
 * a4 = row counter, my is read from sp + 8.
 * Four 16-pixel-wide column passes of PUT_HEVC_QPEL_UNI_W_V16_LASX (defined
 * outside this chunk, as is LOAD_VAR). t4/t5 track the origin of the
 * current column and move right by 16 after every pass.
 */
function ff_hevc_put_hevc_qpel_uni_w_v64_8_lasx
    LOAD_VAR 256
    ld.d            t0, sp, 8                   // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (my - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    xvreplve0.q     xr5, xr5                    // same filter in both 128-bit lanes
    slli.d          t0, a3, 1                   // stride * 2
    add.d           t1, t0, a3                  // stride * 3
    add.d           t2, t1, a3                  // stride * 4
    sub.d           a2, a2, t1                  // src -= stride * 3
    addi.d          t3, zero, 4                 // number of 16-wide passes
    addi.d          t4, a0, 0                   // save dst
    addi.d          t5, a2, 0                   // save src
    addi.d          t6, a4, 0                   // save a4
.LOOP_V64_LASX:
    PUT_HEVC_QPEL_UNI_W_V16_LASX 64
    addi.d          t3, t3, -1
    addi.d          a0, t4, 16                  // dst of the next column
    addi.d          t4, t4, 16
    addi.d          a2, t5, 16                  // src of the next column
    addi.d          t5, t5, 16
    addi.d          a4, t6, 0                   // restore a4
    bnez            t3, .LOOP_V64_LASX
endfunc
|
|
|
|
/*
 * Horizontal 8-tap filter + weighting for 8 output pixels, LSX.
 *   in0  : 16 source bytes; output pixel i uses bytes i .. i + 7
 *   out0 : results for pixels 0..3 as 32-bit words (not yet clipped)
 *   out1 : results for pixels 4..7 as 32-bit words (not yet clipped)
 * Reads vr5 (filter) and vr1/vr2/vr3/vr4, which the callers set up through
 * LOAD_VAR (presumably wx / offset / shift / ox, matching the comments in
 * the epel_uni_w_hv macros of this file).
 * Clobbers vr6..vr13.
 * vdp2.h.bu.b and the one-operand vhaddw.d.h are helper macros defined
 * outside this chunk; together they are used here to reduce every 8-byte
 * window to one sum.
 */
.macro PUT_HEVC_QPEL_UNI_W_H8_LSX in0, out0, out1
    vbsrl.v         vr7,  \in0, 1               // source shifted by 1..7 bytes
    vbsrl.v         vr8,  \in0, 2
    vbsrl.v         vr9,  \in0, 3
    vbsrl.v         vr10, \in0, 4
    vbsrl.v         vr11, \in0, 5
    vbsrl.v         vr12, \in0, 6
    vbsrl.v         vr13, \in0, 7
    vilvl.d         vr6,  vr7,  \in0            // windows for pixels 0 | 1
    vilvl.d         vr7,  vr9,  vr8             // windows for pixels 2 | 3
    vilvl.d         vr8,  vr11, vr10            // windows for pixels 4 | 5
    vilvl.d         vr9,  vr13, vr12            // windows for pixels 6 | 7
    vdp2.h.bu.b     vr10, vr6, vr5              // unsigned pixel * signed tap
    vdp2.h.bu.b     vr11, vr7, vr5
    vdp2.h.bu.b     vr12, vr8, vr5
    vdp2.h.bu.b     vr13, vr9, vr5
    vhaddw.d.h      vr10                        // one sum per 8-byte window
    vhaddw.d.h      vr11
    vhaddw.d.h      vr12
    vhaddw.d.h      vr13
    vpickev.w       vr10, vr11, vr10            // sums of pixels 0..3
    vpickev.w       vr11, vr13, vr12            // sums of pixels 4..7
    vmulwev.w.h     vr10, vr10, vr1             // low halfword of each sum * vr1
    vmulwev.w.h     vr11, vr11, vr1
    vadd.w          vr10, vr10, vr2
    vadd.w          vr11, vr11, vr2
    vsra.w          vr10, vr10, vr3
    vsra.w          vr11, vr11, vr3
    vadd.w          \out0, vr10, vr4
    vadd.w          \out1, vr11, vr4
.endm
|
|
|
|
/*
 * Horizontal 8-tap filter + weighting for 8 output pixels, LASX.
 *   in0  : source bytes in the low 128-bit lane; output pixel i uses
 *          bytes i .. i + 7
 *   out0 : 8 results as 32-bit words in pixel order (pixels 0..3 in the low
 *          lane, 4..7 in the high lane), not yet clipped
 * Reads xr5 (filter in both lanes) and xr1/xr2/xr3/xr4, which the callers
 * set up through LOAD_VAR (presumably wx / offset / shift / ox).
 * Clobbers xr7..xr11.
 * xvdp2.h.bu.b and the one-operand xvhaddw.d.h are helper macros defined
 * outside this chunk.
 */
.macro PUT_HEVC_QPEL_UNI_W_H8_LASX in0, out0
    xvbsrl.v        xr7,  \in0, 4
    xvpermi.q       xr7,  \in0, 0x20            // low lane: in0, high lane: in0 >> 4 bytes
    xvbsrl.v        xr8,  xr7, 1
    xvbsrl.v        xr9,  xr7, 2
    xvbsrl.v        xr10, xr7, 3
    xvpackev.d      xr7,  xr8,  xr7             // windows 0 | 1 (low), 4 | 5 (high)
    xvpackev.d      xr8,  xr10, xr9             // windows 2 | 3 (low), 6 | 7 (high)
    xvdp2.h.bu.b    xr10, xr7, xr5              // unsigned pixel * signed tap
    xvdp2.h.bu.b    xr11, xr8, xr5
    xvhaddw.d.h     xr10                        // one sum per 8-byte window
    xvhaddw.d.h     xr11
    xvpickev.w      xr10, xr11, xr10            // sums 0..3 (low), 4..7 (high)
    xvmulwev.w.h    xr10, xr10, xr1             // low halfword of each sum * xr1
    xvadd.w         xr10, xr10, xr2
    xvsra.w         xr10, xr10, xr3
    xvadd.w         \out0, xr10, xr4
.endm
|
|
|
|
/*
 * Horizontal 8-tap filter + weighting + clip for 16 output pixels, LASX.
 *   in0  : 32 source bytes (only the first 24 are used); output pixel i
 *          uses bytes i .. i + 7
 *   out0 : the 16 clipped 8-bit results, in pixel order, in the low
 *          128 bits
 * Reads xr5 (filter in both lanes) and xr1/xr2/xr3/xr4, which the callers
 * set up through LOAD_VAR (presumably wx / offset / shift / ox).
 * Clobbers xr6..xr13.
 * xvdp2.h.bu.b and the one-operand xvhaddw.d.h are helper macros defined
 * outside this chunk.
 */
.macro PUT_HEVC_QPEL_UNI_W_H16_LASX in0, out0
    xvpermi.d       xr6,  \in0, 0x94            // low lane: bytes 0..15, high lane: bytes 8..23
    xvbsrl.v        xr7,  xr6, 1                // source shifted by 1..7 bytes
    xvbsrl.v        xr8,  xr6, 2
    xvbsrl.v        xr9,  xr6, 3
    xvbsrl.v        xr10, xr6, 4
    xvbsrl.v        xr11, xr6, 5
    xvbsrl.v        xr12, xr6, 6
    xvbsrl.v        xr13, xr6, 7
    xvpackev.d      xr6,  xr7,  xr6             // windows 0 | 1 (low),  8 | 9  (high)
    xvpackev.d      xr7,  xr9,  xr8             // windows 2 | 3 (low), 10 | 11 (high)
    xvpackev.d      xr8,  xr11, xr10            // windows 4 | 5 (low), 12 | 13 (high)
    xvpackev.d      xr9,  xr13, xr12            // windows 6 | 7 (low), 14 | 15 (high)
    xvdp2.h.bu.b    xr10, xr6, xr5              // unsigned pixel * signed tap
    xvdp2.h.bu.b    xr11, xr7, xr5
    xvdp2.h.bu.b    xr12, xr8, xr5
    xvdp2.h.bu.b    xr13, xr9, xr5
    xvhaddw.d.h     xr10                        // one sum per 8-byte window
    xvhaddw.d.h     xr11
    xvhaddw.d.h     xr12
    xvhaddw.d.h     xr13
    xvpickev.w      xr10, xr11, xr10            // sums 0..3 (low),  8..11 (high)
    xvpickev.w      xr11, xr13, xr12            // sums 4..7 (low), 12..15 (high)
    xvmulwev.w.h    xr10, xr10, xr1             // low halfword of each sum * xr1
    xvmulwev.w.h    xr11, xr11, xr1
    xvadd.w         xr10, xr10, xr2
    xvadd.w         xr11, xr11, xr2
    xvsra.w         xr10, xr10, xr3
    xvsra.w         xr11, xr11, xr3
    xvadd.w         xr10, xr10, xr4
    xvadd.w         xr11, xr11, xr4
    xvssrani.h.w    xr11, xr10, 0               // s16: pixels 0..7 (low), 8..15 (high)
    xvpermi.q       \out0, xr11, 0x01           // out0 low lane = pixels 8..15
    xvssrani.bu.h   \out0, xr11, 0              // u8: pixels 0..15 in the low lane
.endm
|
|
|
|
/*
 * void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
 *                                  const uint8_t *_src, ptrdiff_t _srcstride,
 *                                  int height, int denom, int wx, int ox,
 *                                  intptr_t mx, intptr_t my, int width)
 *
 * Register use in the functions below: a0 = dst, a1 = dst stride, a2 = src,
 * a3 = src stride, a4 = height; mx is read from sp + 0.
 * LOAD_VAR (defined outside this chunk) is expected to leave the weighting
 * values in vr1..vr4 (presumably wx / offset / shift / ox).
 */

/*
 * Width 4, LSX: two rows per iteration (a4 is decremented by 2).
 * The 8-tap reduction is the same sequence as PUT_HEVC_QPEL_UNI_W_H8_LSX,
 * applied to four windows of row 0 and four windows of row 1.
 */
function ff_hevc_put_hevc_qpel_uni_w_h4_8_lsx
    LOAD_VAR 128
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H4:
    vld             vr18, a2, 0                 // row 0
    vldx            vr19, a2, a3                // row 1
    alsl.d          a2, a3, a2, 1               // src += 2 * stride
    vbsrl.v         vr6,  vr18, 1
    vbsrl.v         vr7,  vr18, 2
    vbsrl.v         vr8,  vr18, 3
    vbsrl.v         vr9,  vr19, 1
    vbsrl.v         vr10, vr19, 2
    vbsrl.v         vr11, vr19, 3
    vilvl.d         vr6,  vr6,  vr18            // row 0, windows 0 | 1
    vilvl.d         vr7,  vr8,  vr7             // row 0, windows 2 | 3
    vilvl.d         vr8,  vr9,  vr19            // row 1, windows 0 | 1
    vilvl.d         vr9,  vr11, vr10            // row 1, windows 2 | 3
    vdp2.h.bu.b     vr10, vr6, vr5
    vdp2.h.bu.b     vr11, vr7, vr5
    vdp2.h.bu.b     vr12, vr8, vr5
    vdp2.h.bu.b     vr13, vr9, vr5
    vhaddw.d.h      vr10
    vhaddw.d.h      vr11
    vhaddw.d.h      vr12
    vhaddw.d.h      vr13
    vpickev.w       vr10, vr11, vr10            // row 0 sums
    vpickev.w       vr11, vr13, vr12            // row 1 sums
    vmulwev.w.h     vr10, vr10, vr1
    vmulwev.w.h     vr11, vr11, vr1
    vadd.w          vr10, vr10, vr2
    vadd.w          vr11, vr11, vr2
    vsra.w          vr10, vr10, vr3
    vsra.w          vr11, vr11, vr3
    vadd.w          vr10, vr10, vr4
    vadd.w          vr11, vr11, vr4
    vssrani.h.w     vr11, vr10, 0               // saturate to s16: row 0 | row 1
    vssrani.bu.h    vr11, vr11, 0               // saturate to u8
    fst.s           f11, a0, 0                  // 4 pixels of row 0
    vbsrl.v         vr11, vr11, 4
    fstx.s          f11, a0, a1                 // 4 pixels of row 1
    alsl.d          a0, a1, a0, 1               // dst += 2 * dststride
    addi.d          a4, a4, -2
    bnez            a4, .LOOP_H4
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 4, LASX.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. Two rows per iteration: row 0 lives in the low 128-bit
 * lane and row 1 in the high lane. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h4_8_lasx
    LOAD_VAR 256
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    xvreplve0.q     xr5, xr5                    // same filter in both 128-bit lanes
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H4_LASX:
    vld             vr18, a2, 0                 // row 0
    vldx            vr19, a2, a3                // row 1
    alsl.d          a2, a3, a2, 1               // src += 2 * stride
    xvpermi.q       xr18, xr19, 0x02            // low lane: row 0, high lane: row 1
    xvbsrl.v        xr6,  xr18, 1
    xvbsrl.v        xr7,  xr18, 2
    xvbsrl.v        xr8,  xr18, 3
    xvpackev.d      xr6,  xr6, xr18             // windows 0 | 1 of each row
    xvpackev.d      xr7,  xr8, xr7              // windows 2 | 3 of each row
    xvdp2.h.bu.b    xr10, xr6, xr5
    xvdp2.h.bu.b    xr11, xr7, xr5
    xvhaddw.d.h     xr10
    xvhaddw.d.h     xr11
    xvpickev.w      xr10, xr11, xr10            // 4 sums per row
    xvmulwev.w.h    xr10, xr10, xr1
    xvadd.w         xr10, xr10, xr2
    xvsra.w         xr10, xr10, xr3
    xvadd.w         xr10, xr10, xr4
    xvpermi.q       xr11, xr10, 0x01            // vr11 = row 1 results
    vssrani.h.w     vr11, vr10, 0               // saturate to s16: row 0 | row 1
    vssrani.bu.h    vr11, vr11, 0               // saturate to u8
    fst.s           f11, a0, 0                  // 4 pixels of row 0
    vbsrl.v         vr11, vr11, 4
    fstx.s          f11, a0, a1                 // 4 pixels of row 1
    alsl.d          a0, a1, a0, 1               // dst += 2 * dststride
    addi.d          a4, a4, -2
    bnez            a4, .LOOP_H4_LASX
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 6, LSX. One row per iteration.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. Eight pixels are computed, six are stored.
 * LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h6_8_lsx
    LOAD_VAR 128
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H6:
    vld             vr6, a2, 0
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr6, vr10, vr11
    vssrani.h.w     vr11, vr10, 0               // saturate to s16
    vssrani.bu.h    vr11, vr11, 0               // saturate to u8
    fst.s           f11, a0, 0                  // pixels 0..3
    vstelm.h        vr11, a0, 4, 2              // pixels 4..5
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H6
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 6, LASX. One row per iteration.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. Eight pixels are computed, six are stored.
 * LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h6_8_lasx
    LOAD_VAR 256
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    xvreplve0.q     xr5, xr5                    // same filter in both 128-bit lanes
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H6_LASX:
    vld             vr6, a2, 0
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LASX xr6, xr10
    xvpermi.q       xr11, xr10, 0x01            // vr11 = results for pixels 4..7
    vssrani.h.w     vr11, vr10, 0               // saturate to s16, pixels 0..7
    vssrani.bu.h    vr11, vr11, 0               // saturate to u8
    fst.s           f11, a0, 0                  // pixels 0..3
    vstelm.h        vr11, a0, 4, 2              // pixels 4..5
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H6_LASX
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 8, LSX. One row per iteration.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h8_8_lsx
    LOAD_VAR 128
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H8:
    vld             vr6, a2, 0
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr6, vr10, vr11
    vssrani.h.w     vr11, vr10, 0               // saturate to s16
    vssrani.bu.h    vr11, vr11, 0               // saturate to u8
    fst.d           f11, a0, 0                  // pixels 0..7
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H8
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 8, LASX. One row per iteration.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h8_8_lasx
    LOAD_VAR 256
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    xvreplve0.q     xr5, xr5                    // same filter in both 128-bit lanes
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H8_LASX:
    vld             vr6, a2, 0
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LASX xr6, xr10
    xvpermi.q       xr11, xr10, 0x01            // vr11 = results for pixels 4..7
    vssrani.h.w     vr11, vr10, 0               // saturate to s16, pixels 0..7
    vssrani.bu.h    vr11, vr11, 0               // saturate to u8
    fst.d           f11, a0, 0                  // pixels 0..7
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H8_LASX
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 12, LSX. One row per iteration.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. Sixteen pixels are computed (two 8-pixel halves), twelve
 * are stored. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h12_8_lsx
    LOAD_VAR 128
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H12:
    vld             vr6, a2, 0
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr6, vr14, vr15  // pixels 0..7
    vld             vr6, a2, 8
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr6, vr16, vr17  // pixels 8..15
    add.d           a2, a2, a3
    vssrani.h.w     vr15, vr14, 0               // saturate to s16
    vssrani.h.w     vr17, vr16, 0
    vssrani.bu.h    vr17, vr15, 0               // saturate to u8, pixels 0..15
    fst.d           f17, a0, 0                  // pixels 0..7
    vbsrl.v         vr17, vr17, 8
    fst.s           f17, a0, 8                  // pixels 8..11
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H12
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 12, LASX. One row per iteration.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. Sixteen pixels are computed, twelve are stored.
 * Note that xvld reads 32 source bytes per row.
 * LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h12_8_lasx
    LOAD_VAR 256
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    xvreplve0.q     xr5, xr5                    // same filter in both 128-bit lanes
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H12_LASX:
    xvld            xr6, a2, 0
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr6, xr14
    fst.d           f14, a0, 0                  // pixels 0..7
    vstelm.w        vr14, a0, 8, 2              // pixels 8..11
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H12_LASX
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 16, LSX. One row per iteration.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h16_8_lsx
    LOAD_VAR 128
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H16:
    vld             vr6, a2, 0
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr6, vr14, vr15  // pixels 0..7
    vld             vr6, a2, 8
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr6, vr16, vr17  // pixels 8..15
    add.d           a2, a2, a3
    vssrani.h.w     vr15, vr14, 0               // saturate to s16
    vssrani.h.w     vr17, vr16, 0
    vssrani.bu.h    vr17, vr15, 0               // saturate to u8, pixels 0..15
    vst             vr17, a0, 0
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H16
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 16, LASX. One row per iteration.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. Note that xvld reads 32 source bytes per row.
 * LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h16_8_lasx
    LOAD_VAR 256
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    xvreplve0.q     xr5, xr5                    // same filter in both 128-bit lanes
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H16_LASX:
    xvld            xr6, a2, 0
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr6, xr10
    vst             vr10, a0, 0                 // pixels 0..15
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H16_LASX
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 24, LSX. One row per iteration,
 * processed as three 8-pixel groups.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h24_8_lsx
    LOAD_VAR 128
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H24:
    vld             vr18, a2, 0                 // src bytes 0..15
    vld             vr19, a2, 16                // src bytes 16..31
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr14, vr15 // pixels 0..7
    vshuf4i.d       vr18, vr19, 0x09            // vr18 = src bytes 8..23
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr16, vr17 // pixels 8..15
    vssrani.h.w     vr15, vr14, 0               // saturate to s16
    vssrani.h.w     vr17, vr16, 0
    vssrani.bu.h    vr17, vr15, 0               // saturate to u8, pixels 0..15
    vst             vr17, a0, 0
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr19, vr14, vr15 // pixels 16..23
    vssrani.h.w     vr15, vr14, 0
    vssrani.bu.h    vr15, vr15, 0
    fst.d           f15, a0, 16
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H24
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 24, LASX. One row per iteration:
 * a 16-pixel group followed by an 8-pixel group.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h24_8_lasx
    LOAD_VAR 256
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    xvreplve0.q     xr5, xr5                    // same filter in both 128-bit lanes
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H24_LASX:
    xvld            xr18, a2, 0                 // src bytes 0..31
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr18, xr20     // pixels 0..15
    xvpermi.q       xr19, xr18, 0x01            // xr19 low lane = src bytes 16..31
    vst             vr20, a0, 0
    PUT_HEVC_QPEL_UNI_W_H8_LASX xr19, xr20      // pixels 16..23
    xvpermi.q       xr21, xr20, 0x01            // vr21 = results for pixels 20..23
    vssrani.h.w     vr21, vr20, 0               // saturate to s16, pixels 16..23
    vssrani.bu.h    vr21, vr21, 0               // saturate to u8
    fst.d           f21, a0, 16
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H24_LASX
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 32, LSX. One row per iteration,
 * processed as two 16-pixel groups (each made of two 8-pixel halves).
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h32_8_lsx
    LOAD_VAR 128
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H32:
    vld             vr18, a2, 0                 // src bytes 0..15
    vld             vr19, a2, 16                // src bytes 16..31
    vld             vr20, a2, 32                // src bytes 32..47
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr14, vr15 // pixels 0..7
    vshuf4i.d       vr18, vr19, 0x09            // vr18 = src bytes 8..23
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr16, vr17 // pixels 8..15
    vssrani.h.w     vr15, vr14, 0
    vssrani.h.w     vr17, vr16, 0
    vssrani.bu.h    vr17, vr15, 0               // saturate to u8, pixels 0..15
    vst             vr17, a0, 0
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr19, vr14, vr15 // pixels 16..23
    vshuf4i.d       vr19, vr20, 0x09            // vr19 = src bytes 24..39
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr19, vr16, vr17 // pixels 24..31
    vssrani.h.w     vr15, vr14, 0
    vssrani.h.w     vr17, vr16, 0
    vssrani.bu.h    vr17, vr15, 0               // saturate to u8, pixels 16..31
    vst             vr17, a0, 16
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H32
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 32, LASX. One row per iteration,
 * processed as two 16-pixel groups that are merged into one 32-byte store.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h32_8_lasx
    LOAD_VAR 256
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    xvreplve0.q     xr5, xr5                    // same filter in both 128-bit lanes
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H32_LASX:
    xvld            xr18, a2, 0                 // src bytes 0..31
    xvld            xr19, a2, 16                // src bytes 16..47
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr18, xr20     // pixels 0..15
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr19, xr21     // pixels 16..31
    xvpermi.q       xr20, xr21, 0x02            // low lane: 0..15, high lane: 16..31
    xvst            xr20, a0, 0
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H32_LASX
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 48, LSX. One row per iteration,
 * processed as three 16-pixel groups (each made of two 8-pixel halves).
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h48_8_lsx
    LOAD_VAR 128
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H48:
    vld             vr18, a2, 0                 // src bytes 0..15
    vld             vr19, a2, 16                // src bytes 16..31
    vld             vr20, a2, 32                // src bytes 32..47
    vld             vr21, a2, 48                // src bytes 48..63
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr14, vr15 // pixels 0..7
    vshuf4i.d       vr18, vr19, 0x09            // vr18 = src bytes 8..23
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr16, vr17 // pixels 8..15
    vssrani.h.w     vr15, vr14, 0
    vssrani.h.w     vr17, vr16, 0
    vssrani.bu.h    vr17, vr15, 0               // saturate to u8, pixels 0..15
    vst             vr17, a0, 0
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr19, vr14, vr15 // pixels 16..23
    vshuf4i.d       vr19, vr20, 0x09            // vr19 = src bytes 24..39
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr19, vr16, vr17 // pixels 24..31
    vssrani.h.w     vr15, vr14, 0
    vssrani.h.w     vr17, vr16, 0
    vssrani.bu.h    vr17, vr15, 0               // saturate to u8, pixels 16..31
    vst             vr17, a0, 16
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr20, vr14, vr15 // pixels 32..39
    vshuf4i.d       vr20, vr21, 0x09            // vr20 = src bytes 40..55
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr20, vr16, vr17 // pixels 40..47
    vssrani.h.w     vr15, vr14, 0
    vssrani.h.w     vr17, vr16, 0
    vssrani.bu.h    vr17, vr15, 0               // saturate to u8, pixels 32..47
    vst             vr17, a0, 32
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H48
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 48, LASX. One row per iteration,
 * processed as three 16-pixel groups.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h48_8_lasx
    LOAD_VAR 256
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    xvreplve0.q     xr5, xr5                    // same filter in both 128-bit lanes
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H48_LASX:
    xvld            xr18, a2, 0                 // src bytes 0..31
    xvld            xr19, a2, 32                // src bytes 32..63
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr18, xr20     // pixels 0..15
    xvpermi.q       xr18, xr19, 0x03            // xr18 = src bytes 16..47
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr18, xr21     // pixels 16..31
    xvpermi.q       xr20, xr21, 0x02            // low lane: 0..15, high lane: 16..31
    xvst            xr20, a0, 0
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr19, xr20     // pixels 32..47
    vst             vr20, a0, 32
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H48_LASX
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 64, LSX. One row per iteration,
 * processed as four 16-pixel groups (each made of two 8-pixel halves).
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h64_8_lsx
    LOAD_VAR 128
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H64:
    vld             vr18, a2, 0                 // src bytes 0..15
    vld             vr19, a2, 16                // src bytes 16..31
    vld             vr20, a2, 32                // src bytes 32..47
    vld             vr21, a2, 48                // src bytes 48..63
    vld             vr22, a2, 64                // src bytes 64..79
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr14, vr15 // pixels 0..7
    vshuf4i.d       vr18, vr19, 0x09            // vr18 = src bytes 8..23
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr18, vr16, vr17 // pixels 8..15
    vssrani.h.w     vr15, vr14, 0
    vssrani.h.w     vr17, vr16, 0
    vssrani.bu.h    vr17, vr15, 0               // saturate to u8, pixels 0..15
    vst             vr17, a0, 0
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr19, vr14, vr15 // pixels 16..23
    vshuf4i.d       vr19, vr20, 0x09            // vr19 = src bytes 24..39
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr19, vr16, vr17 // pixels 24..31
    vssrani.h.w     vr15, vr14, 0
    vssrani.h.w     vr17, vr16, 0
    vssrani.bu.h    vr17, vr15, 0               // saturate to u8, pixels 16..31
    vst             vr17, a0, 16
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr20, vr14, vr15 // pixels 32..39
    vshuf4i.d       vr20, vr21, 0x09            // vr20 = src bytes 40..55
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr20, vr16, vr17 // pixels 40..47
    vssrani.h.w     vr15, vr14, 0
    vssrani.h.w     vr17, vr16, 0
    vssrani.bu.h    vr17, vr15, 0               // saturate to u8, pixels 32..47
    vst             vr17, a0, 32
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr21, vr14, vr15 // pixels 48..55
    vshuf4i.d       vr21, vr22, 0x09            // vr21 = src bytes 56..71
    PUT_HEVC_QPEL_UNI_W_H8_LSX vr21, vr16, vr17 // pixels 56..63
    vssrani.h.w     vr15, vr14, 0
    vssrani.h.w     vr17, vr16, 0
    vssrani.bu.h    vr17, vr15, 0               // saturate to u8, pixels 48..63
    vst             vr17, a0, 48
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H64
endfunc
|
|
|
|
/*
 * Horizontal weighted-uni qpel, width 64, LASX. One row per iteration,
 * processed as four 16-pixel groups, stored 32 bytes at a time.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_qpel_uni_w_h64_8_lasx
    LOAD_VAR 256
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 4                   // (mx - 1) * 16: byte offset into the table
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0                 // filter
    xvreplve0.q     xr5, xr5                    // same filter in both 128-bit lanes
    addi.d          a2, a2, -3                  // src -= 3
.LOOP_H64_LASX:
    xvld            xr18, a2, 0                 // src bytes 0..31
    xvld            xr19, a2, 32                // src bytes 32..63
    xvld            xr20, a2, 64                // src bytes 64..95
    add.d           a2, a2, a3
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr18, xr21     // pixels 0..15
    xvpermi.q       xr18, xr19, 0x03            // xr18 = src bytes 16..47
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr18, xr22     // pixels 16..31
    xvpermi.q       xr21, xr22, 0x02            // low lane: 0..15, high lane: 16..31
    xvst            xr21, a0, 0
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr19, xr21     // pixels 32..47
    xvpermi.q       xr19, xr20, 0x03            // xr19 = src bytes 48..79
    PUT_HEVC_QPEL_UNI_W_H16_LASX xr19, xr22     // pixels 48..63
    xvpermi.q       xr21, xr22, 0x02            // low lane: 32..47, high lane: 48..63
    xvst            xr21, a0, 32
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_H64_LASX
endfunc
|
|
|
|
/*
 * Byte-shuffle index tables. Byte offsets of the rows inside shufb:
 *   0, 16, 32, 48 (16 bytes each), 64 and 96 (32 bytes each).
 * The code in this chunk only loads offset 0: 16 bytes with vld for LSX,
 * 32 bytes (rows at offsets 0 and 16) with xvld for LASX.
 */
const shufb
    .byte 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6    //mask for epel_uni_w(128-bit)
    .byte 4,5,6,7, 5,6,7,8, 6,7,8,9, 7,8,9,10   //mask for epel_uni_w(256-bit)
    .byte 0,1,2,3, 4,5,6,7, 1,2,3,4, 5,6,7,8    //mask for qpel_uni_h4
    .byte 0,1,1,2, 2,3,3,4, 4,5,5,6, 6,7,7,8    //mask for qpel_uni_h/v6/8...
    .byte 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7, 5,6,7,8, 6,7,8,9, 7,8,9,10 //epel_uni_w_h16/24/32/48/64
    .byte 0,1,1,2, 2,3,3,4, 4,5,5,6, 6,7,7,8, 0,1,1,2, 2,3,3,4, 4,5,5,6, 6,7,7,8  //mask for bi_epel_h16/24/32/48/64
endconst
|
|
|
|
/*
 * 4-pixel-wide column of the 2-D (h + v) weighted-uni epel filter, LSX.
 *   w : only used to make the loop label unique per expansion
 * Expects: a0 = dst, a1 = dst stride, a2 = first source row to load,
 * a3 = src stride, a4 = number of output rows; vr0 = window shuffle mask,
 * vr5 = horizontal taps, vr16..vr19 = vertical taps 0..3 (one per
 * register), vr1/vr2/vr3/vr4 = wx / offset / shift / ox.
 * The three rows above the loop are filtered horizontally once; each
 * iteration filters one new row and combines it with the previous three.
 * Clobbers vr7..vr14 and advances a0, a2; a4 reaches 0.
 */
.macro PUT_HEVC_EPEL_UNI_W_HV4_LSX w
    fld.d           f7, a2, 0                   // start to load src
    fldx.d          f8, a2, a3
    alsl.d          a2, a3, a2, 1               // src += 2 * stride
    fld.d           f9, a2, 0
    vshuf.b         vr7, vr7, vr7, vr0          // 0123 1234 2345 3456
    vshuf.b         vr8, vr8, vr8, vr0
    vshuf.b         vr9, vr9, vr9, vr0
    vdp2.h.bu.b     vr10, vr7, vr5              // EPEL_FILTER(src, 1)
    vdp2.h.bu.b     vr11, vr8, vr5
    vdp2.h.bu.b     vr12, vr9, vr5
    vhaddw.w.h      vr10, vr10, vr10            // tmp[0/1/2/3]
    vhaddw.w.h      vr11, vr11, vr11            // vr10,vr11,vr12 corresponding to EPEL_EXTRA
    vhaddw.w.h      vr12, vr12, vr12
.LOOP_HV4_\w:
    add.d           a2, a2, a3
    fld.d           f14, a2, 0                  // height loop begin
    vshuf.b         vr14, vr14, vr14, vr0
    vdp2.h.bu.b     vr13, vr14, vr5
    vhaddw.w.h      vr13, vr13, vr13            // newest horizontally filtered row
    vmul.w          vr14, vr10, vr16            // EPEL_FILTER(tmp, MAX_PB_SIZE)
    vmadd.w         vr14, vr11, vr17
    vmadd.w         vr14, vr12, vr18
    vmadd.w         vr14, vr13, vr19
    vaddi.wu        vr10, vr11, 0               // back up previous value
    vaddi.wu        vr11, vr12, 0
    vaddi.wu        vr12, vr13, 0
    vsrai.w         vr14, vr14, 6               // >> 6
    vmul.w          vr14, vr14, vr1             // * wx
    vadd.w          vr14, vr14, vr2             // + offset
    vsra.w          vr14, vr14, vr3             // >> shift
    vadd.w          vr14, vr14, vr4             // + ox
    vssrani.h.w     vr14, vr14, 0
    vssrani.bu.h    vr14, vr14, 0               // clip
    fst.s           f14, a0, 0
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_HV4_\w
.endm
|
|
|
|
/*
 * void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
 *                                   const uint8_t *_src, ptrdiff_t _srcstride,
 *                                   int height, int denom, int wx, int ox,
 *                                   intptr_t mx, intptr_t my, int width)
 *
 * Register use in the functions below: a0 = dst, a1 = dst stride, a2 = src,
 * a3 = src stride, a4 = height; mx is read from sp + 0 and my from sp + 8.
 * LOAD_VAR (defined outside this chunk) is expected to leave
 * wx / offset / shift / ox in vr1..vr4.
 */

/* Width 4, LSX: a single PUT_HEVC_EPEL_UNI_W_HV4_LSX column. */
function ff_hevc_put_hevc_epel_uni_w_hv4_8_lsx
    LOAD_VAR 128
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2                   // (mx - 1) * 4: byte offset into the table
    la.local        t1, ff_hevc_epel_filters
    vldx            vr5, t1, t0                 // ff_hevc_epel_filters[mx - 1];
    vreplvei.w      vr5, vr5, 0                 // the 4 taps repeated in every word
    ld.d            t0, sp, 8                   // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0                 // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6, vr6, 0                 // widen the taps: s8 -> s16
    vsllwil.w.h     vr6, vr6, 0                 //                 s16 -> s32
    vreplvei.w      vr16, vr6, 0                // vr16..vr19 = vertical tap 0..3
    vreplvei.w      vr17, vr6, 1
    vreplvei.w      vr18, vr6, 2
    vreplvei.w      vr19, vr6, 3
    la.local        t1, shufb
    vld             vr0, t1, 0                  // mask 0123 1234 2345 3456
    sub.d           a2, a2, a3                  // src -= srcstride
    addi.d          a2, a2, -1                  // src -= 1
    PUT_HEVC_EPEL_UNI_W_HV4_LSX 4
endfunc
|
|
|
|
/*
 * 8-pixel-wide column of the 2-D (h + v) weighted-uni epel filter, LSX.
 *   w : makes the loop label unique; w > 6 stores 8 pixels per row,
 *       otherwise 6 pixels
 * Expects: a0 = dst, a1 = dst stride, a2 = first source row to load,
 * a3 = src stride, a4 = number of output rows; vr0 = window mask for
 * pixels 0..3, vr22 = window mask for pixels 4..7, vr5 = horizontal taps,
 * vr16..vr19 = vertical taps 0..3, vr1/vr2/vr3/vr4 = wx / offset / shift /
 * ox.
 * Row history: vr7/vr8/vr9 hold the three previous rows for pixels 0..3,
 * vr10/vr11/vr12 the same for pixels 4..7.
 * Clobbers vr7..vr15, vr20, vr21, vr23 and advances a0, a2; a4 reaches 0.
 */
.macro PUT_HEVC_EPEL_UNI_W_HV8_LSX w
    vld             vr7, a2, 0                  // start to load src
    vldx            vr8, a2, a3
    alsl.d          a2, a3, a2, 1               // src += 2 * stride
    vld             vr9, a2, 0
    vshuf.b         vr10, vr7, vr7, vr0         // 0123 1234 2345 3456
    vshuf.b         vr11, vr8, vr8, vr0
    vshuf.b         vr12, vr9, vr9, vr0
    vshuf.b         vr7,  vr7, vr7, vr22        // 4567 5678 6789 78910
    vshuf.b         vr8,  vr8, vr8, vr22
    vshuf.b         vr9,  vr9, vr9, vr22
    vdp2.h.bu.b     vr13, vr10, vr5             // EPEL_FILTER(src, 1)
    vdp2.h.bu.b     vr14, vr11, vr5
    vdp2.h.bu.b     vr15, vr12, vr5
    vdp2.h.bu.b     vr23, vr7,  vr5
    vdp2.h.bu.b     vr20, vr8,  vr5
    vdp2.h.bu.b     vr21, vr9,  vr5
    vhaddw.w.h      vr7,  vr13, vr13            // rows 0..2, pixels 0..3
    vhaddw.w.h      vr8,  vr14, vr14
    vhaddw.w.h      vr9,  vr15, vr15
    vhaddw.w.h      vr10, vr23, vr23            // rows 0..2, pixels 4..7
    vhaddw.w.h      vr11, vr20, vr20
    vhaddw.w.h      vr12, vr21, vr21
.LOOP_HV8_HORI_\w:
    add.d           a2, a2, a3
    vld             vr15, a2, 0
    vshuf.b         vr23, vr15, vr15, vr0
    vshuf.b         vr15, vr15, vr15, vr22
    vdp2.h.bu.b     vr13, vr23, vr5
    vdp2.h.bu.b     vr14, vr15, vr5
    vhaddw.w.h      vr13, vr13, vr13            // new row, pixels 0..3 (joins vr7/vr8/vr9)
    vhaddw.w.h      vr14, vr14, vr14            // new row, pixels 4..7 (joins vr10/vr11/vr12)
    vmul.w          vr15, vr7,  vr16            // EPEL_FILTER(tmp, MAX_PB_SIZE)
    vmadd.w         vr15, vr8,  vr17
    vmadd.w         vr15, vr9,  vr18
    vmadd.w         vr15, vr13, vr19
    vmul.w          vr20, vr10, vr16
    vmadd.w         vr20, vr11, vr17
    vmadd.w         vr20, vr12, vr18
    vmadd.w         vr20, vr14, vr19
    vaddi.wu        vr7,  vr8,  0               // back up previous value
    vaddi.wu        vr8,  vr9,  0
    vaddi.wu        vr9,  vr13, 0
    vaddi.wu        vr10, vr11, 0
    vaddi.wu        vr11, vr12, 0
    vaddi.wu        vr12, vr14, 0
    vsrai.w         vr15, vr15, 6               // >> 6
    vsrai.w         vr20, vr20, 6
    vmul.w          vr15, vr15, vr1             // * wx
    vmul.w          vr20, vr20, vr1
    vadd.w          vr15, vr15, vr2             // + offset
    vadd.w          vr20, vr20, vr2
    vsra.w          vr15, vr15, vr3             // >> shift
    vsra.w          vr20, vr20, vr3
    vadd.w          vr15, vr15, vr4             // + ox
    vadd.w          vr20, vr20, vr4
    vssrani.h.w     vr20, vr15, 0               // saturate to s16, pixels 0..7
    vssrani.bu.h    vr20, vr20, 0               // saturate to u8
.if \w > 6
    fst.d           f20, a0, 0                  // 8 pixels
.else
    fst.s           f20, a0, 0                  // pixels 0..3
    vstelm.h        vr20, a0, 4, 2              // pixels 4..5
.endif
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_HV8_HORI_\w
.endm
|
|
|
|
/*
 * 8-pixel-wide column of the 2-D (h + v) weighted-uni epel filter, LASX.
 *   w : makes the loop label unique; w > 6 stores 8 pixels per row,
 *       otherwise 6 pixels
 * Expects: a0 = dst, a1 = dst stride, a2 = first source row to load,
 * a3 = src stride, a4 = number of output rows; xr0 = window masks (low
 * lane for pixels 0..3, high lane for pixels 4..7), xr5 = horizontal taps,
 * xr16..xr19 = vertical taps 0..3, xr1/xr2/xr3/xr4 = wx / offset / shift /
 * ox.
 * xr7/xr8/xr9 hold the three previous horizontally filtered rows.
 * Clobbers xr7..xr15, xr20, xr23 and advances a0, a2; a4 reaches 0.
 */
.macro PUT_HEVC_EPEL_UNI_W_HV8_LASX w
    vld             vr7, a2, 0                  // start to load src
    vldx            vr8, a2, a3
    alsl.d          a2, a3, a2, 1               // src += 2 * stride
    vld             vr9, a2, 0
    xvreplve0.q     xr7, xr7                    // same 16 source bytes in both lanes
    xvreplve0.q     xr8, xr8
    xvreplve0.q     xr9, xr9
    xvshuf.b        xr10, xr7, xr7, xr0         // 0123 1234 2345 3456
    xvshuf.b        xr11, xr8, xr8, xr0
    xvshuf.b        xr12, xr9, xr9, xr0
    xvdp2.h.bu.b    xr13, xr10, xr5             // EPEL_FILTER(src, 1)
    xvdp2.h.bu.b    xr14, xr11, xr5
    xvdp2.h.bu.b    xr15, xr12, xr5
    xvhaddw.w.h     xr7, xr13, xr13
    xvhaddw.w.h     xr8, xr14, xr14
    xvhaddw.w.h     xr9, xr15, xr15
.LOOP_HV8_HORI_LASX_\w:
    add.d           a2, a2, a3
    vld             vr15, a2, 0
    xvreplve0.q     xr15, xr15
    xvshuf.b        xr23, xr15, xr15, xr0
    xvdp2.h.bu.b    xr10, xr23, xr5
    xvhaddw.w.h     xr10, xr10, xr10            // newest horizontally filtered row
    xvmul.w         xr15, xr7,  xr16            // EPEL_FILTER(tmp, MAX_PB_SIZE)
    xvmadd.w        xr15, xr8,  xr17
    xvmadd.w        xr15, xr9,  xr18
    xvmadd.w        xr15, xr10, xr19
    xvaddi.wu       xr7, xr8,  0                // back up previous value
    xvaddi.wu       xr8, xr9,  0
    xvaddi.wu       xr9, xr10, 0
    xvsrai.w        xr15, xr15, 6               // >> 6
    xvmul.w         xr15, xr15, xr1             // * wx
    xvadd.w         xr15, xr15, xr2             // + offset
    xvsra.w         xr15, xr15, xr3             // >> shift
    xvadd.w         xr15, xr15, xr4             // + ox
    xvpermi.q       xr20, xr15, 0x01            // vr20 = results for pixels 4..7
    vssrani.h.w     vr20, vr15, 0               // saturate to s16, pixels 0..7
    vssrani.bu.h    vr20, vr20, 0               // saturate to u8
.if \w > 6
    fst.d           f20, a0, 0                  // 8 pixels
.else
    fst.s           f20, a0, 0                  // pixels 0..3
    vstelm.h        vr20, a0, 4, 2              // pixels 4..5
.endif
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_HV8_HORI_LASX_\w
.endm
|
|
|
|
/*
 * 16-pixel-wide column of the 2-D (h + v) weighted-uni epel filter, LASX.
 *   w : makes the loop label unique; w < 16 stores 12 pixels per row,
 *       otherwise 16 pixels
 * Expects: a0 = dst, a1 = dst stride, a2 = first source row to load,
 * a3 = src stride, a4 = number of output rows; xr0 = window masks (low
 * lane: windows 0..3, high lane: windows 4..7 of an 8-pixel group),
 * xr5 = horizontal taps, xr16..xr19 = vertical taps 0..3,
 * xr1/xr2/xr3/xr4 = wx / offset / shift / ox.
 * Row history: xr7/xr8/xr9 hold the three previous rows for pixels 0..7,
 * xr10/xr11/xr12 the same for pixels 8..15.
 * Clobbers xr7..xr15, xr20..xr22 and advances a0, a2; a4 reaches 0.
 */
.macro PUT_HEVC_EPEL_UNI_W_HV16_LASX w
    xvld            xr7, a2, 0                  // start to load src
    xvldx           xr8, a2, a3
    alsl.d          a2, a3, a2, 1               // src += 2 * stride
    xvld            xr9, a2, 0
    xvpermi.d       xr10, xr7, 0x09             // 8..18
    xvpermi.d       xr11, xr8, 0x09
    xvpermi.d       xr12, xr9, 0x09
    xvreplve0.q     xr7, xr7                    // same 16 source bytes in both lanes
    xvreplve0.q     xr8, xr8
    xvreplve0.q     xr9, xr9
    xvshuf.b        xr13, xr7, xr7, xr0         // 0123 1234 2345 3456
    xvshuf.b        xr14, xr8, xr8, xr0
    xvshuf.b        xr15, xr9, xr9, xr0
    xvdp2.h.bu.b    xr20, xr13, xr5             // EPEL_FILTER(src, 1)
    xvdp2.h.bu.b    xr21, xr14, xr5
    xvdp2.h.bu.b    xr22, xr15, xr5
    xvhaddw.w.h     xr7, xr20, xr20             // rows 0..2, pixels 0..7
    xvhaddw.w.h     xr8, xr21, xr21
    xvhaddw.w.h     xr9, xr22, xr22
    xvreplve0.q     xr10, xr10
    xvreplve0.q     xr11, xr11
    xvreplve0.q     xr12, xr12
    xvshuf.b        xr13, xr10, xr10, xr0
    xvshuf.b        xr14, xr11, xr11, xr0
    xvshuf.b        xr15, xr12, xr12, xr0
    xvdp2.h.bu.b    xr20, xr13, xr5
    xvdp2.h.bu.b    xr21, xr14, xr5
    xvdp2.h.bu.b    xr22, xr15, xr5
    xvhaddw.w.h     xr10, xr20, xr20            // rows 0..2, pixels 8..15
    xvhaddw.w.h     xr11, xr21, xr21
    xvhaddw.w.h     xr12, xr22, xr22
.LOOP_HV16_HORI_LASX_\w:
    add.d           a2, a2, a3
    xvld            xr15, a2, 0
    xvpermi.d       xr20, xr15, 0x09            // 8...18
    xvreplve0.q     xr15, xr15
    xvreplve0.q     xr20, xr20
    xvshuf.b        xr21, xr15, xr15, xr0
    xvshuf.b        xr22, xr20, xr20, xr0
    xvdp2.h.bu.b    xr13, xr21, xr5
    xvdp2.h.bu.b    xr14, xr22, xr5
    xvhaddw.w.h     xr13, xr13, xr13            // new row, pixels 0..7
    xvhaddw.w.h     xr14, xr14, xr14            // new row, pixels 8..15
    xvmul.w         xr15, xr7,  xr16            // EPEL_FILTER(tmp, MAX_PB_SIZE)
    xvmadd.w        xr15, xr8,  xr17
    xvmadd.w        xr15, xr9,  xr18
    xvmadd.w        xr15, xr13, xr19
    xvmul.w         xr20, xr10, xr16
    xvmadd.w        xr20, xr11, xr17
    xvmadd.w        xr20, xr12, xr18
    xvmadd.w        xr20, xr14, xr19
    xvaddi.wu       xr7,  xr8,  0               // back up previous value
    xvaddi.wu       xr8,  xr9,  0
    xvaddi.wu       xr9,  xr13, 0
    xvaddi.wu       xr10, xr11, 0
    xvaddi.wu       xr11, xr12, 0
    xvaddi.wu       xr12, xr14, 0
    xvsrai.w        xr15, xr15, 6               // >> 6
    xvsrai.w        xr20, xr20, 6               // >> 6
    xvmul.w         xr15, xr15, xr1             // * wx
    xvmul.w         xr20, xr20, xr1             // * wx
    xvadd.w         xr15, xr15, xr2             // + offset
    xvadd.w         xr20, xr20, xr2             // + offset
    xvsra.w         xr15, xr15, xr3             // >> shift
    xvsra.w         xr20, xr20, xr3             // >> shift
    xvadd.w         xr15, xr15, xr4             // + ox
    xvadd.w         xr20, xr20, xr4             // + ox
    xvssrani.h.w    xr20, xr15, 0               // s16: pixels 0..3 | 8..11 (low), 4..7 | 12..15 (high)
    xvpermi.q       xr21, xr20, 0x01            // vr21 = high lane of xr20
    vssrani.bu.h    vr21, vr20, 0               // u8 words: 0..3, 8..11, 4..7, 12..15
    vpermi.w        vr21, vr21, 0xd8            // swap the middle words: pixels 0..15 in order
.if \w < 16
    fst.d           f21, a0, 0                  // pixels 0..7
    vstelm.w        vr21, a0, 8, 2              // pixels 8..11
.else
    vst             vr21, a0, 0                 // pixels 0..15
.endif
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_HV16_HORI_LASX_\w
.endm
|
|
|
|
/*
 * 2-D (h + v) weighted-uni epel, width 6, LSX.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0, my at sp + 8. Sets up the taps and shuffle masks, then
 * runs one PUT_HEVC_EPEL_UNI_W_HV8_LSX column that stores 6 pixels per row.
 * LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_epel_uni_w_hv6_8_lsx
    LOAD_VAR 128
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2                   // (mx - 1) * 4: byte offset into the table
    la.local        t1, ff_hevc_epel_filters
    vldx            vr5, t1, t0                 // ff_hevc_epel_filters[mx - 1];
    vreplvei.w      vr5, vr5, 0                 // the 4 taps repeated in every word
    ld.d            t0, sp, 8                   // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0                 // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6, vr6, 0                 // widen the taps: s8 -> s16
    vsllwil.w.h     vr6, vr6, 0                 //                 s16 -> s32
    vreplvei.w      vr16, vr6, 0                // vr16..vr19 = vertical tap 0..3
    vreplvei.w      vr17, vr6, 1
    vreplvei.w      vr18, vr6, 2
    vreplvei.w      vr19, vr6, 3
    la.local        t1, shufb
    vld             vr0, t1, 0                  // mask 0123 1234 2345 3456
    vaddi.bu        vr22, vr0, 4                // update shufb to get high part
    sub.d           a2, a2, a3                  // src -= srcstride
    addi.d          a2, a2, -1                  // src -= 1
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 6
endfunc
|
|
|
|
/*
 * 2-D (h + v) weighted-uni epel, width 6, LASX.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0, my at sp + 8. Sets up the taps and shuffle masks, then
 * runs one PUT_HEVC_EPEL_UNI_W_HV8_LASX column that stores 6 pixels per
 * row. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_epel_uni_w_hv6_8_lasx
    LOAD_VAR 256
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2                   // (mx - 1) * 4: byte offset into the table
    la.local        t1, ff_hevc_epel_filters
    vldx            vr5, t1, t0                 // ff_hevc_epel_filters[mx - 1];
    xvreplve0.w     xr5, xr5                    // the 4 taps repeated in every word
    ld.d            t0, sp, 8                   // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0                 // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6, vr6, 0                 // widen the taps: s8 -> s16
    vsllwil.w.h     vr6, vr6, 0                 //                 s16 -> s32
    xvreplve0.q     xr6, xr6                    // taps in both 128-bit lanes
    xvrepl128vei.w  xr16, xr6, 0                // xr16..xr19 = vertical tap 0..3
    xvrepl128vei.w  xr17, xr6, 1
    xvrepl128vei.w  xr18, xr6, 2
    xvrepl128vei.w  xr19, xr6, 3
    la.local        t1, shufb
    xvld            xr0, t1, 0                  // first two shufb rows: low / high window masks
    sub.d           a2, a2, a3                  // src -= srcstride
    addi.d          a2, a2, -1                  // src -= 1
    PUT_HEVC_EPEL_UNI_W_HV8_LASX 6
endfunc
|
|
|
|
/*
 * 2-D (h + v) weighted-uni epel, width 8, LSX.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0, my at sp + 8. Sets up the taps and shuffle masks, then
 * runs one PUT_HEVC_EPEL_UNI_W_HV8_LSX column.
 * LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_epel_uni_w_hv8_8_lsx
    LOAD_VAR 128
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2                   // (mx - 1) * 4: byte offset into the table
    la.local        t1, ff_hevc_epel_filters
    vldx            vr5, t1, t0                 // ff_hevc_epel_filters[mx - 1];
    vreplvei.w      vr5, vr5, 0                 // the 4 taps repeated in every word
    ld.d            t0, sp, 8                   // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0                 // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6, vr6, 0                 // widen the taps: s8 -> s16
    vsllwil.w.h     vr6, vr6, 0                 //                 s16 -> s32
    vreplvei.w      vr16, vr6, 0                // vr16..vr19 = vertical tap 0..3
    vreplvei.w      vr17, vr6, 1
    vreplvei.w      vr18, vr6, 2
    vreplvei.w      vr19, vr6, 3
    la.local        t1, shufb
    vld             vr0, t1, 0                  // mask 0123 1234 2345 3456
    vaddi.bu        vr22, vr0, 4                // update shufb to get high part
    sub.d           a2, a2, a3                  // src -= srcstride
    addi.d          a2, a2, -1                  // src -= 1
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 8
endfunc
|
|
|
|
/*
 * 2-D (h + v) weighted-uni epel, width 8, LASX.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0, my at sp + 8. Sets up the taps and shuffle masks, then
 * runs one PUT_HEVC_EPEL_UNI_W_HV8_LASX column.
 * LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_epel_uni_w_hv8_8_lasx
    LOAD_VAR 256
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2                   // (mx - 1) * 4: byte offset into the table
    la.local        t1, ff_hevc_epel_filters
    vldx            vr5, t1, t0                 // ff_hevc_epel_filters[mx - 1];
    xvreplve0.w     xr5, xr5                    // the 4 taps repeated in every word
    ld.d            t0, sp, 8                   // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0                 // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6, vr6, 0                 // widen the taps: s8 -> s16
    vsllwil.w.h     vr6, vr6, 0                 //                 s16 -> s32
    xvreplve0.q     xr6, xr6                    // taps in both 128-bit lanes
    xvrepl128vei.w  xr16, xr6, 0                // xr16..xr19 = vertical tap 0..3
    xvrepl128vei.w  xr17, xr6, 1
    xvrepl128vei.w  xr18, xr6, 2
    xvrepl128vei.w  xr19, xr6, 3
    la.local        t1, shufb
    xvld            xr0, t1, 0                  // first two shufb rows: low / high window masks
    sub.d           a2, a2, a3                  // src -= srcstride
    addi.d          a2, a2, -1                  // src -= 1
    PUT_HEVC_EPEL_UNI_W_HV8_LASX 8
endfunc
|
|
|
|
/*
 * 2-D (h + v) weighted-uni epel, width 12, LSX.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0, my at sp + 8. The block is split into an 8-wide column
 * (PUT_HEVC_EPEL_UNI_W_HV8_LSX) followed by a 4-wide column at +8
 * (PUT_HEVC_EPEL_UNI_W_HV4_LSX); t2/t3/t4 keep the original dst, src and
 * height because the first column consumes a0/a2/a4.
 * LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_epel_uni_w_hv12_8_lsx
    LOAD_VAR 128
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2                   // (mx - 1) * 4: byte offset into the table
    la.local        t1, ff_hevc_epel_filters
    vldx            vr5, t1, t0                 // ff_hevc_epel_filters[mx - 1];
    vreplvei.w      vr5, vr5, 0                 // the 4 taps repeated in every word
    ld.d            t0, sp, 8                   // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0                 // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6, vr6, 0                 // widen the taps: s8 -> s16
    vsllwil.w.h     vr6, vr6, 0                 //                 s16 -> s32
    vreplvei.w      vr16, vr6, 0                // vr16..vr19 = vertical tap 0..3
    vreplvei.w      vr17, vr6, 1
    vreplvei.w      vr18, vr6, 2
    vreplvei.w      vr19, vr6, 3
    la.local        t1, shufb
    vld             vr0, t1, 0                  // mask 0123 1234 2345 3456
    vaddi.bu        vr22, vr0, 4                // update shufb to get high part
    sub.d           a2, a2, a3                  // src -= srcstride
    addi.d          a2, a2, -1                  // src -= 1
    addi.d          t2, a0, 0                   // save dst
    addi.d          t3, a2, 0                   // save src
    addi.d          t4, a4, 0                   // save height
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 12              // columns 0..7
    addi.d          a0, t2, 8
    addi.d          a2, t3, 8
    addi.d          a4, t4, 0
    PUT_HEVC_EPEL_UNI_W_HV4_LSX 12              // columns 8..11
endfunc
|
|
|
|
/*
 * 2-D (h + v) weighted-uni epel, width 12, LASX.
 * a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height,
 * mx at sp + 0, my at sp + 8. Sets up the taps and shuffle masks, then
 * runs one PUT_HEVC_EPEL_UNI_W_HV16_LASX column that stores 12 pixels per
 * row. LOAD_VAR is defined outside this chunk.
 */
function ff_hevc_put_hevc_epel_uni_w_hv12_8_lasx
    LOAD_VAR 256
    ld.d            t0, sp, 0                   // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2                   // (mx - 1) * 4: byte offset into the table
    la.local        t1, ff_hevc_epel_filters
    vldx            vr5, t1, t0                 // ff_hevc_epel_filters[mx - 1];
    xvreplve0.w     xr5, xr5                    // the 4 taps repeated in every word
    ld.d            t0, sp, 8                   // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0                 // ff_hevc_epel_filters[my - 1];
    vsllwil.h.b     vr6, vr6, 0                 // widen the taps: s8 -> s16
    vsllwil.w.h     vr6, vr6, 0                 //                 s16 -> s32
    xvreplve0.q     xr6, xr6                    // taps in both 128-bit lanes
    xvrepl128vei.w  xr16, xr6, 0                // xr16..xr19 = vertical tap 0..3
    xvrepl128vei.w  xr17, xr6, 1
    xvrepl128vei.w  xr18, xr6, 2
    xvrepl128vei.w  xr19, xr6, 3
    la.local        t1, shufb
    xvld            xr0, t1, 0                  // first two shufb rows: low / high window masks
    sub.d           a2, a2, a3                  // src -= srcstride
    addi.d          a2, a2, -1                  // src -= 1
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 12
endfunc
|
|
|
|
// HEVC 8-bit EPEL uni-weighted H+V filter, LSX: two 8-column passes.
// Stack arguments: mx at sp+0, my at sp+8.
// Registers prepared for PUT_HEVC_EPEL_UNI_W_HV8_LSX (defined elsewhere):
//   vr5        = ff_hevc_epel_filters[mx - 1], repeated in every 32-bit word
//   vr16..vr19 = taps 0..3 of ff_hevc_epel_filters[my - 1], widened to 32 bit
//   vr0 / vr22 = shufb mask / same mask + 4
//   a2         = src - srcstride - 1
//   t2/t3/t4   = dst / src / height of the current 8-column strip
//   t5         = strips left
function ff_hevc_put_hevc_epel_uni_w_hv16_8_lsx
    LOAD_VAR        128
    ld.d            t0, sp, 0               // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    la.local        t1, ff_hevc_epel_filters
    vldx            vr5, t1, t0             // ff_hevc_epel_filters[mx - 1]
    vreplvei.w      vr5, vr5, 0
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0             // ff_hevc_epel_filters[my - 1]
    vsllwil.h.b     vr6, vr6, 0
    vsllwil.w.h     vr6, vr6, 0
    vreplvei.w      vr16, vr6, 0
    vreplvei.w      vr17, vr6, 1
    vreplvei.w      vr18, vr6, 2
    vreplvei.w      vr19, vr6, 3
    la.local        t1, shufb
    vld             vr0, t1, 0
    vaddi.bu        vr22, vr0, 4            // update shufb to get high part
    sub.d           a2, a2, a3              // src -= srcstride
    addi.d          a2, a2, -1
    addi.d          t2, a0, 0
    addi.d          t3, a2, 0
    addi.d          t4, a4, 0
    addi.d          t5, zero, 2             // two 8-column strips
.LOOP_HV16:
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 16
    // Advance the saved strip pointers as the hv24/32/48/64 variants do, so
    // the loop stays correct for any strip count and not only for two.
    addi.d          a0, t2, 8
    addi.d          t2, t2, 8
    addi.d          a2, t3, 8
    addi.d          t3, t3, 8
    addi.d          a4, t4, 0               // restore height
    addi.d          t5, t5, -1
    bnez            t5, .LOOP_HV16
endfunc
|
|
|
|
// HEVC 8-bit EPEL uni-weighted H+V filter, LASX: one
// PUT_HEVC_EPEL_UNI_W_HV16_LASX expansion with width argument 16.
// Stack arguments: mx at sp+0, my at sp+8.
//   xr5        = ff_hevc_epel_filters[mx - 1] in every 32-bit word
//   xr16..xr19 = taps 0..3 of ff_hevc_epel_filters[my - 1] as 32-bit values
//   xr0        = 32 bytes loaded from shufb
//   a2         = src - srcstride - 1
function ff_hevc_put_hevc_epel_uni_w_hv16_8_lasx
    LOAD_VAR        256
    sub.d           a2, a2, a3              // src -= srcstride
    addi.d          a2, a2, -1              // src -= 1
    la.local        t1, ff_hevc_epel_filters
    ld.d            t0, sp, 0               // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr5, t1, t0             // ff_hevc_epel_filters[mx - 1]
    xvreplve0.w     xr5, xr5
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0             // ff_hevc_epel_filters[my - 1]
    vsllwil.h.b     vr6, vr6, 0             // taps: s8 -> s16
    vsllwil.w.h     vr6, vr6, 0             // taps: s16 -> s32
    xvreplve0.q     xr6, xr6
    xvrepl128vei.w  xr16, xr6, 0
    xvrepl128vei.w  xr17, xr6, 1
    xvrepl128vei.w  xr18, xr6, 2
    xvrepl128vei.w  xr19, xr6, 3
    la.local        t1, shufb
    xvld            xr0, t1, 0
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 16
endfunc
|
|
|
|
// HEVC 8-bit EPEL uni-weighted H+V filter, LSX: three 8-column strips.
// Stack arguments: mx at sp+0, my at sp+8.
//   vr5        = ff_hevc_epel_filters[mx - 1], repeated in every 32-bit word
//   vr16..vr19 = taps 0..3 of ff_hevc_epel_filters[my - 1], widened to 32 bit
//   vr0 / vr22 = shufb mask / same mask + 4
//   t2/t3/t4   = dst / src / height of the current strip, t5 = strips left
function ff_hevc_put_hevc_epel_uni_w_hv24_8_lsx
    LOAD_VAR        128
    sub.d           a2, a2, a3              // src -= srcstride
    addi.d          a2, a2, -1              // src -= 1
    la.local        t1, ff_hevc_epel_filters
    ld.d            t0, sp, 0               // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr5, t1, t0             // ff_hevc_epel_filters[mx - 1]
    vreplvei.w      vr5, vr5, 0
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0             // ff_hevc_epel_filters[my - 1]
    vsllwil.h.b     vr6, vr6, 0
    vsllwil.w.h     vr6, vr6, 0
    vreplvei.w      vr16, vr6, 0
    vreplvei.w      vr17, vr6, 1
    vreplvei.w      vr18, vr6, 2
    vreplvei.w      vr19, vr6, 3
    la.local        t1, shufb
    vld             vr0, t1, 0
    vaddi.bu        vr22, vr0, 4            // mask for the high part
    addi.d          t2, a0, 0
    addi.d          t3, a2, 0
    addi.d          t4, a4, 0
    addi.d          t5, zero, 3             // strip count
.LOOP_HV24:
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 24
    addi.d          t2, t2, 8               // next strip: dst += 8
    addi.d          t3, t3, 8               //              src += 8
    addi.d          t5, t5, -1
    addi.d          a0, t2, 0
    addi.d          a2, t3, 0
    addi.d          a4, t4, 0               // restore height
    bnez            t5, .LOOP_HV24
endfunc
|
|
|
|
// HEVC 8-bit EPEL uni-weighted H+V filter, LASX: a 16-column pass followed by
// an 8-column pass starting 16 bytes further right.
// Stack arguments: mx at sp+0, my at sp+8.
//   xr5        = ff_hevc_epel_filters[mx - 1] in every 32-bit word
//   xr16..xr19 = taps 0..3 of ff_hevc_epel_filters[my - 1] as 32-bit values
//   xr0        = 32 bytes loaded from shufb
//   t2/t3/t4   = saved dst / adjusted src / height
function ff_hevc_put_hevc_epel_uni_w_hv24_8_lasx
    LOAD_VAR        256
    sub.d           a2, a2, a3              // src -= srcstride
    addi.d          a2, a2, -1              // src -= 1
    la.local        t1, ff_hevc_epel_filters
    ld.d            t0, sp, 0               // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr5, t1, t0             // ff_hevc_epel_filters[mx - 1]
    xvreplve0.w     xr5, xr5
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0             // ff_hevc_epel_filters[my - 1]
    vsllwil.h.b     vr6, vr6, 0
    vsllwil.w.h     vr6, vr6, 0
    xvreplve0.q     xr6, xr6
    xvrepl128vei.w  xr16, xr6, 0
    xvrepl128vei.w  xr17, xr6, 1
    xvrepl128vei.w  xr18, xr6, 2
    xvrepl128vei.w  xr19, xr6, 3
    la.local        t1, shufb
    xvld            xr0, t1, 0
    addi.d          t2, a0, 0
    addi.d          t3, a2, 0
    addi.d          t4, a4, 0
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 24
    addi.d          a0, t2, 16              // dst + 16
    addi.d          a2, t3, 16              // src + 16
    addi.d          a4, t4, 0               // restore height
    PUT_HEVC_EPEL_UNI_W_HV8_LASX 24
endfunc
|
|
|
|
// HEVC 8-bit EPEL uni-weighted H+V filter, LSX: four 8-column strips.
// Stack arguments: mx at sp+0, my at sp+8.
//   vr5        = ff_hevc_epel_filters[mx - 1], repeated in every 32-bit word
//   vr16..vr19 = taps 0..3 of ff_hevc_epel_filters[my - 1], widened to 32 bit
//   vr0 / vr22 = shufb mask / same mask + 4
//   t2/t3/t4   = dst / src / height of the current strip, t5 = strips left
function ff_hevc_put_hevc_epel_uni_w_hv32_8_lsx
    LOAD_VAR        128
    sub.d           a2, a2, a3              // src -= srcstride
    addi.d          a2, a2, -1              // src -= 1
    la.local        t1, ff_hevc_epel_filters
    ld.d            t0, sp, 0               // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr5, t1, t0             // ff_hevc_epel_filters[mx - 1]
    vreplvei.w      vr5, vr5, 0
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0             // ff_hevc_epel_filters[my - 1]
    vsllwil.h.b     vr6, vr6, 0
    vsllwil.w.h     vr6, vr6, 0
    vreplvei.w      vr16, vr6, 0
    vreplvei.w      vr17, vr6, 1
    vreplvei.w      vr18, vr6, 2
    vreplvei.w      vr19, vr6, 3
    la.local        t1, shufb
    vld             vr0, t1, 0
    vaddi.bu        vr22, vr0, 4            // mask for the high part
    addi.d          t2, a0, 0
    addi.d          t3, a2, 0
    addi.d          t4, a4, 0
    addi.d          t5, zero, 4             // strip count
.LOOP_HV32:
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 32
    addi.d          t2, t2, 8               // next strip: dst += 8
    addi.d          t3, t3, 8               //              src += 8
    addi.d          t5, t5, -1
    addi.d          a0, t2, 0
    addi.d          a2, t3, 0
    addi.d          a4, t4, 0               // restore height
    bnez            t5, .LOOP_HV32
endfunc
|
|
|
|
// HEVC 8-bit EPEL uni-weighted H+V filter, LASX: two 16-column strips.
// Stack arguments: mx at sp+0, my at sp+8.
//   xr5        = ff_hevc_epel_filters[mx - 1] in every 32-bit word
//   xr16..xr19 = taps 0..3 of ff_hevc_epel_filters[my - 1] as 32-bit values
//   xr0        = 32 bytes loaded from shufb
//   t2/t3/t4   = dst / src / height of the current strip, t5 = strips left
function ff_hevc_put_hevc_epel_uni_w_hv32_8_lasx
    LOAD_VAR        256
    sub.d           a2, a2, a3              // src -= srcstride
    addi.d          a2, a2, -1              // src -= 1
    la.local        t1, ff_hevc_epel_filters
    ld.d            t0, sp, 0               // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr5, t1, t0             // ff_hevc_epel_filters[mx - 1]
    xvreplve0.w     xr5, xr5
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0             // ff_hevc_epel_filters[my - 1]
    vsllwil.h.b     vr6, vr6, 0
    vsllwil.w.h     vr6, vr6, 0
    xvreplve0.q     xr6, xr6
    xvrepl128vei.w  xr16, xr6, 0
    xvrepl128vei.w  xr17, xr6, 1
    xvrepl128vei.w  xr18, xr6, 2
    xvrepl128vei.w  xr19, xr6, 3
    la.local        t1, shufb
    xvld            xr0, t1, 0
    addi.d          t2, a0, 0
    addi.d          t3, a2, 0
    addi.d          t4, a4, 0
    addi.d          t5, zero, 2             // strip count
.LOOP_HV32_LASX:
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 32
    addi.d          t2, t2, 16              // next strip: dst += 16
    addi.d          t3, t3, 16              //              src += 16
    addi.d          t5, t5, -1
    addi.d          a0, t2, 0
    addi.d          a2, t3, 0
    addi.d          a4, t4, 0               // restore height
    bnez            t5, .LOOP_HV32_LASX
endfunc
|
|
|
|
// HEVC 8-bit EPEL uni-weighted H+V filter, LSX: six 8-column strips.
// Stack arguments: mx at sp+0, my at sp+8.
//   vr5        = ff_hevc_epel_filters[mx - 1], repeated in every 32-bit word
//   vr16..vr19 = taps 0..3 of ff_hevc_epel_filters[my - 1], widened to 32 bit
//   vr0 / vr22 = shufb mask / same mask + 4
//   t2/t3/t4   = dst / src / height of the current strip, t5 = strips left
function ff_hevc_put_hevc_epel_uni_w_hv48_8_lsx
    LOAD_VAR        128
    sub.d           a2, a2, a3              // src -= srcstride
    addi.d          a2, a2, -1              // src -= 1
    la.local        t1, ff_hevc_epel_filters
    ld.d            t0, sp, 0               // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr5, t1, t0             // ff_hevc_epel_filters[mx - 1]
    vreplvei.w      vr5, vr5, 0
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0             // ff_hevc_epel_filters[my - 1]
    vsllwil.h.b     vr6, vr6, 0
    vsllwil.w.h     vr6, vr6, 0
    vreplvei.w      vr16, vr6, 0
    vreplvei.w      vr17, vr6, 1
    vreplvei.w      vr18, vr6, 2
    vreplvei.w      vr19, vr6, 3
    la.local        t1, shufb
    vld             vr0, t1, 0
    vaddi.bu        vr22, vr0, 4            // mask for the high part
    addi.d          t2, a0, 0
    addi.d          t3, a2, 0
    addi.d          t4, a4, 0
    addi.d          t5, zero, 6             // strip count
.LOOP_HV48:
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 48
    addi.d          t2, t2, 8               // next strip: dst += 8
    addi.d          t3, t3, 8               //              src += 8
    addi.d          t5, t5, -1
    addi.d          a0, t2, 0
    addi.d          a2, t3, 0
    addi.d          a4, t4, 0               // restore height
    bnez            t5, .LOOP_HV48
endfunc
|
|
|
|
// HEVC 8-bit EPEL uni-weighted H+V filter, LASX: three 16-column strips.
// Stack arguments: mx at sp+0, my at sp+8.
//   xr5        = ff_hevc_epel_filters[mx - 1] in every 32-bit word
//   xr16..xr19 = taps 0..3 of ff_hevc_epel_filters[my - 1] as 32-bit values
//   xr0        = 32 bytes loaded from shufb
//   t2/t3/t4   = dst / src / height of the current strip, t5 = strips left
function ff_hevc_put_hevc_epel_uni_w_hv48_8_lasx
    LOAD_VAR        256
    sub.d           a2, a2, a3              // src -= srcstride
    addi.d          a2, a2, -1              // src -= 1
    la.local        t1, ff_hevc_epel_filters
    ld.d            t0, sp, 0               // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr5, t1, t0             // ff_hevc_epel_filters[mx - 1]
    xvreplve0.w     xr5, xr5
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0             // ff_hevc_epel_filters[my - 1]
    vsllwil.h.b     vr6, vr6, 0
    vsllwil.w.h     vr6, vr6, 0
    xvreplve0.q     xr6, xr6
    xvrepl128vei.w  xr16, xr6, 0
    xvrepl128vei.w  xr17, xr6, 1
    xvrepl128vei.w  xr18, xr6, 2
    xvrepl128vei.w  xr19, xr6, 3
    la.local        t1, shufb
    xvld            xr0, t1, 0
    addi.d          t2, a0, 0
    addi.d          t3, a2, 0
    addi.d          t4, a4, 0
    addi.d          t5, zero, 3             // strip count
.LOOP_HV48_LASX:
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 48
    addi.d          t2, t2, 16              // next strip: dst += 16
    addi.d          t3, t3, 16              //              src += 16
    addi.d          t5, t5, -1
    addi.d          a0, t2, 0
    addi.d          a2, t3, 0
    addi.d          a4, t4, 0               // restore height
    bnez            t5, .LOOP_HV48_LASX
endfunc
|
|
|
|
// HEVC 8-bit EPEL uni-weighted H+V filter, LSX: eight 8-column strips.
// Stack arguments: mx at sp+0, my at sp+8.
//   vr5        = ff_hevc_epel_filters[mx - 1], repeated in every 32-bit word
//   vr16..vr19 = taps 0..3 of ff_hevc_epel_filters[my - 1], widened to 32 bit
//   vr0 / vr22 = shufb mask / same mask + 4
//   t2/t3/t4   = dst / src / height of the current strip, t5 = strips left
function ff_hevc_put_hevc_epel_uni_w_hv64_8_lsx
    LOAD_VAR        128
    sub.d           a2, a2, a3              // src -= srcstride
    addi.d          a2, a2, -1              // src -= 1
    la.local        t1, ff_hevc_epel_filters
    ld.d            t0, sp, 0               // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr5, t1, t0             // ff_hevc_epel_filters[mx - 1]
    vreplvei.w      vr5, vr5, 0
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0             // ff_hevc_epel_filters[my - 1]
    vsllwil.h.b     vr6, vr6, 0
    vsllwil.w.h     vr6, vr6, 0
    vreplvei.w      vr16, vr6, 0
    vreplvei.w      vr17, vr6, 1
    vreplvei.w      vr18, vr6, 2
    vreplvei.w      vr19, vr6, 3
    la.local        t1, shufb
    vld             vr0, t1, 0
    vaddi.bu        vr22, vr0, 4            // mask for the high part
    addi.d          t2, a0, 0
    addi.d          t3, a2, 0
    addi.d          t4, a4, 0
    addi.d          t5, zero, 8             // strip count
.LOOP_HV64:
    PUT_HEVC_EPEL_UNI_W_HV8_LSX 64
    addi.d          t2, t2, 8               // next strip: dst += 8
    addi.d          t3, t3, 8               //              src += 8
    addi.d          t5, t5, -1
    addi.d          a0, t2, 0
    addi.d          a2, t3, 0
    addi.d          a4, t4, 0               // restore height
    bnez            t5, .LOOP_HV64
endfunc
|
|
|
|
// HEVC 8-bit EPEL uni-weighted H+V filter, LASX: four 16-column strips.
// Stack arguments: mx at sp+0, my at sp+8.
//   xr5        = ff_hevc_epel_filters[mx - 1] in every 32-bit word
//   xr16..xr19 = taps 0..3 of ff_hevc_epel_filters[my - 1] as 32-bit values
//   xr0        = 32 bytes loaded from shufb
//   t2/t3/t4   = dst / src / height of the current strip, t5 = strips left
function ff_hevc_put_hevc_epel_uni_w_hv64_8_lasx
    LOAD_VAR        256
    sub.d           a2, a2, a3              // src -= srcstride
    addi.d          a2, a2, -1              // src -= 1
    la.local        t1, ff_hevc_epel_filters
    ld.d            t0, sp, 0               // mx
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr5, t1, t0             // ff_hevc_epel_filters[mx - 1]
    xvreplve0.w     xr5, xr5
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr6, t1, t0             // ff_hevc_epel_filters[my - 1]
    vsllwil.h.b     vr6, vr6, 0
    vsllwil.w.h     vr6, vr6, 0
    xvreplve0.q     xr6, xr6
    xvrepl128vei.w  xr16, xr6, 0
    xvrepl128vei.w  xr17, xr6, 1
    xvrepl128vei.w  xr18, xr6, 2
    xvrepl128vei.w  xr19, xr6, 3
    la.local        t1, shufb
    xvld            xr0, t1, 0
    addi.d          t2, a0, 0
    addi.d          t3, a2, 0
    addi.d          t4, a4, 0
    addi.d          t5, zero, 4             // strip count
.LOOP_HV64_LASX:
    PUT_HEVC_EPEL_UNI_W_HV16_LASX 64
    addi.d          t2, t2, 16              // next strip: dst += 16
    addi.d          t3, t3, 16              //              src += 16
    addi.d          t5, t5, -1
    addi.d          a0, t2, 0
    addi.d          a2, t3, 0
    addi.d          a4, t4, 0               // restore height
    bnez            t5, .LOOP_HV64_LASX
endfunc
|
|
|
|
/*
|
|
* void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride,
|
|
* const uint8_t *_src, ptrdiff_t _srcstride,
|
|
* int height, intptr_t mx, intptr_t my,
|
|
* int width)
|
|
*/
|
|
// HEVC 8-bit QPEL horizontal filter, 4 pixels per row, two rows per iteration.
// a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height, a5 = mx.
//   vr5 = 16 bytes at ff_hevc_qpel_filters + (mx - 1) * 16
//   vr1 = 32 in every halfword (rounding term for the >> 6)
//   vr2 = shufb + 32 (mask0), vr3 = mask0 + 2 (mask1)
// The loop counter drops by 2 and exits on zero, so height must be even.
function ff_hevc_put_hevc_uni_qpel_h4_8_lsx
    addi.d          a2, a2, -3              // src -= 3
    addi.d          t0, a5, -1
    slli.w          t0, t0, 4               // 16 bytes per filter entry
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr5, t1, t0             // filter
    addi.w          t1, zero, 32
    vreplgr2vr.h    vr1, t1
    la.local        t1, shufb
    vld             vr2, t1, 32             // mask0 0 1
    vaddi.bu        vr3, vr2, 2             // mask1 2 3
.LOOP_UNI_H4:
    vld             vr18, a2, 0             // row 0
    vldx            vr19, a2, a3            // row 1
    alsl.d          a2, a3, a2, 1           // src += 2 * srcstride
    addi.d          a4, a4, -2
    vshuf.b         vr6, vr18, vr18, vr2
    vshuf.b         vr7, vr18, vr18, vr3
    vshuf.b         vr8, vr19, vr19, vr2
    vshuf.b         vr9, vr19, vr19, vr3
    vdp2.h.bu.b     vr10, vr6, vr5          // pairwise u8 * s8 products
    vdp2.h.bu.b     vr11, vr7, vr5
    vdp2.h.bu.b     vr12, vr8, vr5
    vdp2.h.bu.b     vr13, vr9, vr5
    vhaddw.d.h      vr10                    // fold partial sums
    vhaddw.d.h      vr11
    vhaddw.d.h      vr12
    vhaddw.d.h      vr13
    vpickev.w       vr10, vr11, vr10
    vpickev.w       vr11, vr13, vr12
    vpickev.h       vr10, vr11, vr10        // 8 results as halfwords
    vadd.h          vr10, vr10, vr1         // + 32
    vsrai.h         vr10, vr10, 6           // >> 6
    vssrani.bu.h    vr10, vr10, 0           // saturate to u8
    fst.s           f10, a0, 0              // row 0
    vbsrl.v         vr10, vr10, 4
    fstx.s          f10, a0, a1             // row 1
    alsl.d          a0, a1, a0, 1           // dst += 2 * dststride
    bnez            a4, .LOOP_UNI_H4
endfunc
|
|
|
|
// Filter one 128-bit source vector \in0 into eight 16-bit results in \out0:
//   \out0 = (sum of four byte-pair dot products + vr4) >> 6
// Inputs set up by the caller: vr5..vr8 shuffle masks, vr0..vr3 tap pairs,
// vr4 rounding term. Scratch: vr10..vr13.
.macro HEVC_UNI_QPEL_H8_LSX in0, out0
    vshuf.b         vr10, \in0, \in0, vr5
    vshuf.b         vr11, \in0, \in0, vr6
    vshuf.b         vr12, \in0, \in0, vr7
    vshuf.b         vr13, \in0, \in0, vr8
    vdp2.h.bu.b     \out0, vr10, vr0        // QPEL_FILTER(src, 1)
    vdp2add.h.bu.b  \out0, vr11, vr1
    vdp2add.h.bu.b  \out0, vr12, vr2
    vdp2add.h.bu.b  \out0, vr13, vr3
    vadd.h          \out0, \out0, vr4       // + rounding term
    vsrai.h         \out0, \out0, 6         // >> 6
.endm
|
|
|
|
// 256-bit counterpart of HEVC_UNI_QPEL_H8_LSX: filters \in0 into sixteen
// 16-bit results in \out0 (eight per 128-bit lane).
// Inputs set up by the caller: xr5..xr8 shuffle masks, xr0..xr3 tap pairs,
// xr4 rounding term. Scratch: xr10..xr13.
.macro HEVC_UNI_QPEL_H16_LASX in0, out0
    xvshuf.b        xr10, \in0, \in0, xr5
    xvshuf.b        xr11, \in0, \in0, xr6
    xvshuf.b        xr12, \in0, \in0, xr7
    xvshuf.b        xr13, \in0, \in0, xr8
    xvdp2.h.bu.b    \out0, xr10, xr0        // QPEL_FILTER(src, 1)
    xvdp2add.h.bu.b \out0, xr11, xr1
    xvdp2add.h.bu.b \out0, xr12, xr2
    xvdp2add.h.bu.b \out0, xr13, xr3
    xvadd.h         \out0, \out0, xr4       // + rounding term
    xvsrai.h        \out0, \out0, 6         // >> 6
.endm
|
|
|
|
// HEVC 8-bit QPEL horizontal filter, 6 pixels per row, one row per iteration.
// a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height, a5 = mx.
//   vr0..vr3 = tap pairs ab / cd / ef / gh broadcast to every halfword
//   vr4      = 32 in every halfword, vr5..vr8 = shufb+48 mask and +2/+4/+6
function ff_hevc_put_hevc_uni_qpel_h6_8_lsx
    addi.d          a2, a2, -3              // src -= 3
    addi.d          t0, a5, -1
    slli.w          t0, t0, 4
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr0, t1, t0             // filter abcdefgh
    vreplvei.h      vr1, vr0, 1             // cd...
    vreplvei.h      vr2, vr0, 2             // ef...
    vreplvei.h      vr3, vr0, 3             // gh...
    vreplvei.h      vr0, vr0, 0             // ab...
    addi.w          t1, zero, 32
    vreplgr2vr.h    vr4, t1
    la.local        t1, shufb
    vld             vr5, t1, 48
    vaddi.bu        vr6, vr5, 2
    vaddi.bu        vr7, vr5, 4
    vaddi.bu        vr8, vr5, 6
.LOOP_UNI_H6:
    vld             vr9, a2, 0
    add.d           a2, a2, a3              // next source row
    addi.d          a4, a4, -1
    HEVC_UNI_QPEL_H8_LSX vr9, vr14
    vssrani.bu.h    vr14, vr14, 0           // saturate to u8
    fst.s           f14, a0, 0              // pixels 0..3
    vstelm.h        vr14, a0, 4, 2          // pixels 4..5
    add.d           a0, a0, a1              // next destination row
    bnez            a4, .LOOP_UNI_H6
endfunc
|
|
|
|
// HEVC 8-bit QPEL horizontal filter, 8 pixels per row, one row per iteration.
// a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height, a5 = mx.
//   vr0..vr3 = tap pairs ab / cd / ef / gh broadcast to every halfword
//   vr4      = 32 in every halfword, vr5..vr8 = shufb+48 mask and +2/+4/+6
function ff_hevc_put_hevc_uni_qpel_h8_8_lsx
    addi.d          a2, a2, -3              // src -= 3
    addi.d          t0, a5, -1
    slli.w          t0, t0, 4
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr0, t1, t0             // filter abcdefgh
    vreplvei.h      vr1, vr0, 1             // cd...
    vreplvei.h      vr2, vr0, 2             // ef...
    vreplvei.h      vr3, vr0, 3             // gh...
    vreplvei.h      vr0, vr0, 0             // ab...
    addi.w          t1, zero, 32
    vreplgr2vr.h    vr4, t1
    la.local        t1, shufb
    vld             vr5, t1, 48
    vaddi.bu        vr6, vr5, 2
    vaddi.bu        vr7, vr5, 4
    vaddi.bu        vr8, vr5, 6
.LOOP_UNI_H8:
    vld             vr9, a2, 0
    add.d           a2, a2, a3              // next source row
    addi.d          a4, a4, -1
    HEVC_UNI_QPEL_H8_LSX vr9, vr14
    vssrani.bu.h    vr14, vr14, 0           // saturate to u8
    fst.d           f14, a0, 0              // pixels 0..7
    add.d           a0, a0, a1              // next destination row
    bnez            a4, .LOOP_UNI_H8
endfunc
|
|
|
|
// HEVC 8-bit QPEL horizontal filter, 12 pixels per row (two 8-wide filter
// passes, 12 bytes stored), LSX.
// a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height, a5 = mx.
//   vr0..vr3 = tap pairs ab / cd / ef / gh broadcast to every halfword
//   vr4      = 32 in every halfword, vr5..vr8 = shufb+48 mask and +2/+4/+6
function ff_hevc_put_hevc_uni_qpel_h12_8_lsx
    addi.d          a2, a2, -3              // src -= 3
    addi.d          t0, a5, -1
    slli.w          t0, t0, 4
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr0, t1, t0             // filter abcdefgh
    vreplvei.h      vr1, vr0, 1             // cd...
    vreplvei.h      vr2, vr0, 2             // ef...
    vreplvei.h      vr3, vr0, 3             // gh...
    vreplvei.h      vr0, vr0, 0             // ab...
    addi.w          t1, zero, 32
    vreplgr2vr.h    vr4, t1
    la.local        t1, shufb
    vld             vr5, t1, 48
    vaddi.bu        vr6, vr5, 2
    vaddi.bu        vr7, vr5, 4
    vaddi.bu        vr8, vr5, 6
.LOOP_UNI_H12:
    vld             vr9, a2, 0
    HEVC_UNI_QPEL_H8_LSX vr9, vr14          // pixels 0..7
    vld             vr9, a2, 8
    add.d           a2, a2, a3              // next source row
    addi.d          a4, a4, -1
    HEVC_UNI_QPEL_H8_LSX vr9, vr15          // pixels 8..15
    vssrani.bu.h    vr15, vr14, 0           // pack both halves to u8
    fst.d           f15, a0, 0              // pixels 0..7
    vstelm.w        vr15, a0, 8, 2          // pixels 8..11
    add.d           a0, a0, a1              // next destination row
    bnez            a4, .LOOP_UNI_H12
endfunc
|
|
|
|
// HEVC 8-bit QPEL horizontal filter, 12 pixels per row, LASX: one 16-wide
// filter pass per row, 12 bytes stored.
// a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height, a5 = mx.
//   xr0..xr3 = tap pairs ab / cd / ef / gh broadcast to every halfword
//   xr4      = 32 in every halfword, xr5..xr8 = shufb+48 mask and +2/+4/+6
function ff_hevc_put_hevc_uni_qpel_h12_8_lasx
    addi.d          a2, a2, -3              // src -= 3
    addi.d          t0, a5, -1
    slli.w          t0, t0, 4
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr0, t1, t0             // filter abcdefgh
    xvreplve0.q     xr0, xr0
    xvrepl128vei.h  xr1, xr0, 1             // cd...
    xvrepl128vei.h  xr2, xr0, 2             // ef...
    xvrepl128vei.h  xr3, xr0, 3             // gh...
    xvrepl128vei.h  xr0, xr0, 0             // ab...
    addi.w          t1, zero, 32
    xvreplgr2vr.h   xr4, t1
    la.local        t1, shufb
    vld             vr5, t1, 48
    xvreplve0.q     xr5, xr5
    xvaddi.bu       xr6, xr5, 2
    xvaddi.bu       xr7, xr5, 4
    xvaddi.bu       xr8, xr5, 6
.LOOP_UNI_H12_LASX:
    xvld            xr9, a2, 0
    add.d           a2, a2, a3              // next source row
    addi.d          a4, a4, -1
    xvpermi.d       xr9, xr9, 0x94          // lanes = bytes 0..15 | bytes 8..23
    HEVC_UNI_QPEL_H16_LASX xr9, xr14
    xvpermi.q       xr15, xr14, 0x01        // bring the high lane down
    vssrani.bu.h    vr15, vr14, 0           // pack 16 results to u8
    fst.d           f15, a0, 0              // pixels 0..7
    vstelm.w        vr15, a0, 8, 2          // pixels 8..11
    add.d           a0, a0, a1              // next destination row
    bnez            a4, .LOOP_UNI_H12_LASX
endfunc
|
|
|
|
// HEVC 8-bit QPEL horizontal filter, 16 pixels per row, LSX (two 8-wide
// filter passes per row).
// a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height, a5 = mx.
//   vr0..vr3 = tap pairs ab / cd / ef / gh broadcast to every halfword
//   vr4      = 32 in every halfword, vr5..vr8 = shufb+48 mask and +2/+4/+6
function ff_hevc_put_hevc_uni_qpel_h16_8_lsx
    addi.d          a2, a2, -3              // src -= 3
    addi.d          t0, a5, -1
    slli.w          t0, t0, 4
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr0, t1, t0             // filter abcdefgh
    vreplvei.h      vr1, vr0, 1             // cd...
    vreplvei.h      vr2, vr0, 2             // ef...
    vreplvei.h      vr3, vr0, 3             // gh...
    vreplvei.h      vr0, vr0, 0             // ab...
    addi.w          t1, zero, 32
    vreplgr2vr.h    vr4, t1
    la.local        t1, shufb
    vld             vr5, t1, 48
    vaddi.bu        vr6, vr5, 2
    vaddi.bu        vr7, vr5, 4
    vaddi.bu        vr8, vr5, 6
.LOOP_UNI_H16:
    vld             vr9, a2, 0
    HEVC_UNI_QPEL_H8_LSX vr9, vr14          // pixels 0..7
    vld             vr9, a2, 8
    add.d           a2, a2, a3              // next source row
    addi.d          a4, a4, -1
    HEVC_UNI_QPEL_H8_LSX vr9, vr15          // pixels 8..15
    vssrani.bu.h    vr15, vr14, 0           // pack both halves to u8
    vst             vr15, a0, 0
    add.d           a0, a0, a1              // next destination row
    bnez            a4, .LOOP_UNI_H16
endfunc
|
|
|
|
// HEVC 8-bit QPEL horizontal filter, 16 pixels per row, LASX (one 16-wide
// filter pass per row).
// a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height, a5 = mx.
//   xr0..xr3 = tap pairs ab / cd / ef / gh broadcast to every halfword
//   xr4      = 32 in every halfword, xr5..xr8 = shufb+48 mask and +2/+4/+6
function ff_hevc_put_hevc_uni_qpel_h16_8_lasx
    addi.d          a2, a2, -3              // src -= 3
    addi.d          t0, a5, -1
    slli.w          t0, t0, 4
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr0, t1, t0             // filter abcdefgh
    xvreplve0.q     xr0, xr0
    xvrepl128vei.h  xr1, xr0, 1             // cd...
    xvrepl128vei.h  xr2, xr0, 2             // ef...
    xvrepl128vei.h  xr3, xr0, 3             // gh...
    xvrepl128vei.h  xr0, xr0, 0             // ab...
    addi.w          t1, zero, 32
    xvreplgr2vr.h   xr4, t1
    la.local        t1, shufb
    vld             vr5, t1, 48
    xvreplve0.q     xr5, xr5
    xvaddi.bu       xr6, xr5, 2
    xvaddi.bu       xr7, xr5, 4
    xvaddi.bu       xr8, xr5, 6
.LOOP_UNI_H16_LASX:
    xvld            xr9, a2, 0
    add.d           a2, a2, a3              // next source row
    addi.d          a4, a4, -1
    xvpermi.d       xr9, xr9, 0x94          // lanes = bytes 0..15 | bytes 8..23
    HEVC_UNI_QPEL_H16_LASX xr9, xr14
    xvpermi.q       xr15, xr14, 0x01        // bring the high lane down
    vssrani.bu.h    vr15, vr14, 0           // pack 16 results to u8
    vst             vr15, a0, 0
    add.d           a0, a0, a1              // next destination row
    bnez            a4, .LOOP_UNI_H16_LASX
endfunc
|
|
|
|
// HEVC 8-bit QPEL horizontal filter, 24 pixels per row, LSX (three 8-wide
// filter passes per row).
// a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height, a5 = mx.
//   vr0..vr3 = tap pairs ab / cd / ef / gh broadcast to every halfword
//   vr4      = 32 in every halfword, vr5..vr8 = shufb+48 mask and +2/+4/+6
function ff_hevc_put_hevc_uni_qpel_h24_8_lsx
    addi.d          a2, a2, -3              // src -= 3
    addi.d          t0, a5, -1
    slli.w          t0, t0, 4
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr0, t1, t0             // filter abcdefgh
    vreplvei.h      vr1, vr0, 1             // cd...
    vreplvei.h      vr2, vr0, 2             // ef...
    vreplvei.h      vr3, vr0, 3             // gh...
    vreplvei.h      vr0, vr0, 0             // ab...
    addi.w          t1, zero, 32
    vreplgr2vr.h    vr4, t1
    la.local        t1, shufb
    vld             vr5, t1, 48
    vaddi.bu        vr6, vr5, 2
    vaddi.bu        vr7, vr5, 4
    vaddi.bu        vr8, vr5, 6
.LOOP_UNI_H24:
    vld             vr9, a2, 0
    HEVC_UNI_QPEL_H8_LSX vr9, vr14          // pixels 0..7
    vld             vr9, a2, 8
    HEVC_UNI_QPEL_H8_LSX vr9, vr15          // pixels 8..15
    vld             vr9, a2, 16
    add.d           a2, a2, a3              // next source row
    addi.d          a4, a4, -1
    HEVC_UNI_QPEL_H8_LSX vr9, vr16          // pixels 16..23
    vssrani.bu.h    vr15, vr14, 0           // pack pixels 0..15 to u8
    vssrani.bu.h    vr16, vr16, 0           // pack pixels 16..23 to u8
    vst             vr15, a0, 0
    fst.d           f16, a0, 16
    add.d           a0, a0, a1              // next destination row
    bnez            a4, .LOOP_UNI_H24
endfunc
|
|
|
|
// HEVC 8-bit QPEL horizontal filter, 24 pixels per row, LASX: a 16-wide pass
// for pixels 0..15 plus an 8-wide LSX pass for pixels 16..23.
// a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height, a5 = mx.
//   xr0..xr3 = tap pairs ab / cd / ef / gh broadcast to every halfword
//   xr4      = 32 in every halfword, xr5..xr8 = shufb+48 mask and +2/+4/+6
// The LSX macro reads the low 128 bits of those same registers.
function ff_hevc_put_hevc_uni_qpel_h24_8_lasx
    addi.d          a2, a2, -3              // src -= 3
    addi.d          t0, a5, -1
    slli.w          t0, t0, 4
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr0, t1, t0             // filter abcdefgh
    xvreplve0.q     xr0, xr0
    xvrepl128vei.h  xr1, xr0, 1             // cd...
    xvrepl128vei.h  xr2, xr0, 2             // ef...
    xvrepl128vei.h  xr3, xr0, 3             // gh...
    xvrepl128vei.h  xr0, xr0, 0             // ab...
    addi.w          t1, zero, 32
    xvreplgr2vr.h   xr4, t1
    la.local        t1, shufb
    vld             vr5, t1, 48
    xvreplve0.q     xr5, xr5
    xvaddi.bu       xr6, xr5, 2
    xvaddi.bu       xr7, xr5, 4
    xvaddi.bu       xr8, xr5, 6
.LOOP_UNI_H24_LASX:
    xvld            xr9, a2, 0
    xvpermi.q       xr19, xr9, 0x01         // 16...23
    add.d           a2, a2, a3              // next source row
    addi.d          a4, a4, -1
    xvpermi.d       xr9, xr9, 0x94          // lanes = bytes 0..15 | bytes 8..23
    HEVC_UNI_QPEL_H16_LASX xr9, xr14
    xvpermi.q       xr15, xr14, 0x01        // bring the high lane down
    vssrani.bu.h    vr15, vr14, 0           // pack pixels 0..15 to u8
    vst             vr15, a0, 0
    HEVC_UNI_QPEL_H8_LSX vr19, vr16
    vssrani.bu.h    vr16, vr16, 0           // pack pixels 16..23 to u8
    fst.d           f16, a0, 16
    add.d           a0, a0, a1              // next destination row
    bnez            a4, .LOOP_UNI_H24_LASX
endfunc
|
|
|
|
// HEVC 8-bit QPEL horizontal filter, 32 pixels per row, LSX (four 8-wide
// filter passes per row).
// a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height, a5 = mx.
//   vr0..vr3 = tap pairs ab / cd / ef / gh broadcast to every halfword
//   vr4      = 32 in every halfword, vr5..vr8 = shufb+48 mask and +2/+4/+6
function ff_hevc_put_hevc_uni_qpel_h32_8_lsx
    addi.d          a2, a2, -3              // src -= 3
    addi.d          t0, a5, -1
    slli.w          t0, t0, 4
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr0, t1, t0             // filter abcdefgh
    vreplvei.h      vr1, vr0, 1             // cd...
    vreplvei.h      vr2, vr0, 2             // ef...
    vreplvei.h      vr3, vr0, 3             // gh...
    vreplvei.h      vr0, vr0, 0             // ab...
    addi.w          t1, zero, 32
    vreplgr2vr.h    vr4, t1
    la.local        t1, shufb
    vld             vr5, t1, 48
    vaddi.bu        vr6, vr5, 2
    vaddi.bu        vr7, vr5, 4
    vaddi.bu        vr8, vr5, 6
.LOOP_UNI_H32:
    vld             vr9, a2, 0
    HEVC_UNI_QPEL_H8_LSX vr9, vr14          // pixels 0..7
    vld             vr9, a2, 8
    HEVC_UNI_QPEL_H8_LSX vr9, vr15          // pixels 8..15
    vld             vr9, a2, 16
    HEVC_UNI_QPEL_H8_LSX vr9, vr16          // pixels 16..23
    vld             vr9, a2, 24
    add.d           a2, a2, a3              // next source row
    addi.d          a4, a4, -1
    HEVC_UNI_QPEL_H8_LSX vr9, vr17          // pixels 24..31
    vssrani.bu.h    vr15, vr14, 0           // pack pixels 0..15 to u8
    vssrani.bu.h    vr17, vr16, 0           // pack pixels 16..31 to u8
    vst             vr15, a0, 0
    vst             vr17, a0, 16
    add.d           a0, a0, a1              // next destination row
    bnez            a4, .LOOP_UNI_H32
endfunc
|
|
|
|
// HEVC 8-bit QPEL horizontal filter, 32 pixels per row, LASX (two 16-wide
// filter passes per row).
// a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height, a5 = mx.
//   xr0..xr3 = tap pairs ab / cd / ef / gh broadcast to every halfword
//   xr4      = 32 in every halfword, xr5..xr8 = shufb+48 mask and +2/+4/+6
function ff_hevc_put_hevc_uni_qpel_h32_8_lasx
    addi.d          a2, a2, -3              // src -= 3
    addi.d          t0, a5, -1
    slli.w          t0, t0, 4
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr0, t1, t0             // filter abcdefgh
    xvreplve0.q     xr0, xr0
    xvrepl128vei.h  xr1, xr0, 1             // cd...
    xvrepl128vei.h  xr2, xr0, 2             // ef...
    xvrepl128vei.h  xr3, xr0, 3             // gh...
    xvrepl128vei.h  xr0, xr0, 0             // ab...
    addi.w          t1, zero, 32
    xvreplgr2vr.h   xr4, t1
    la.local        t1, shufb
    vld             vr5, t1, 48
    xvreplve0.q     xr5, xr5
    xvaddi.bu       xr6, xr5, 2
    xvaddi.bu       xr7, xr5, 4
    xvaddi.bu       xr8, xr5, 6
.LOOP_UNI_H32_LASX:
    xvld            xr9, a2, 0
    xvpermi.d       xr9, xr9, 0x94          // lanes = bytes 0..15 | bytes 8..23
    HEVC_UNI_QPEL_H16_LASX xr9, xr14        // pixels 0..15
    xvld            xr9, a2, 16
    xvpermi.d       xr9, xr9, 0x94
    HEVC_UNI_QPEL_H16_LASX xr9, xr15        // pixels 16..31
    add.d           a2, a2, a3              // next source row
    addi.d          a4, a4, -1
    xvssrani.bu.h   xr15, xr14, 0           // pack to u8, per 128-bit lane
    xvpermi.d       xr15, xr15, 0xd8        // restore pixel order across lanes
    xvst            xr15, a0, 0
    add.d           a0, a0, a1              // next destination row
    bnez            a4, .LOOP_UNI_H32_LASX
endfunc
|
|
|
|
// HEVC 8-bit QPEL horizontal filter, 48 pixels per row, LSX (six 8-wide
// filter passes per row).
// a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height, a5 = mx.
//   vr0..vr3 = tap pairs ab / cd / ef / gh broadcast to every halfword
//   vr4      = 32 in every halfword, vr5..vr8 = shufb+48 mask and +2/+4/+6
function ff_hevc_put_hevc_uni_qpel_h48_8_lsx
    addi.d          a2, a2, -3              // src -= 3
    addi.d          t0, a5, -1
    slli.w          t0, t0, 4
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr0, t1, t0             // filter abcdefgh
    vreplvei.h      vr1, vr0, 1             // cd...
    vreplvei.h      vr2, vr0, 2             // ef...
    vreplvei.h      vr3, vr0, 3             // gh...
    vreplvei.h      vr0, vr0, 0             // ab...
    addi.w          t1, zero, 32
    vreplgr2vr.h    vr4, t1
    la.local        t1, shufb
    vld             vr5, t1, 48
    vaddi.bu        vr6, vr5, 2
    vaddi.bu        vr7, vr5, 4
    vaddi.bu        vr8, vr5, 6
.LOOP_UNI_H48:
    vld             vr9, a2, 0
    HEVC_UNI_QPEL_H8_LSX vr9, vr14          // pixels 0..7
    vld             vr9, a2, 8
    HEVC_UNI_QPEL_H8_LSX vr9, vr15          // pixels 8..15
    vld             vr9, a2, 16
    HEVC_UNI_QPEL_H8_LSX vr9, vr16          // pixels 16..23
    vld             vr9, a2, 24
    HEVC_UNI_QPEL_H8_LSX vr9, vr17          // pixels 24..31
    vld             vr9, a2, 32
    HEVC_UNI_QPEL_H8_LSX vr9, vr18          // pixels 32..39
    vld             vr9, a2, 40
    add.d           a2, a2, a3              // next source row
    addi.d          a4, a4, -1
    HEVC_UNI_QPEL_H8_LSX vr9, vr19          // pixels 40..47
    vssrani.bu.h    vr15, vr14, 0           // pack pixels 0..15 to u8
    vssrani.bu.h    vr17, vr16, 0           // pack pixels 16..31 to u8
    vssrani.bu.h    vr19, vr18, 0           // pack pixels 32..47 to u8
    vst             vr15, a0, 0
    vst             vr17, a0, 16
    vst             vr19, a0, 32
    add.d           a0, a0, a1              // next destination row
    bnez            a4, .LOOP_UNI_H48
endfunc
|
|
|
|
// HEVC 8-bit QPEL horizontal filter, 48 pixels per row, LASX (three 16-wide
// filter passes per row).
// a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height, a5 = mx.
//   xr0..xr3 = tap pairs ab / cd / ef / gh broadcast to every halfword
//   xr4      = 32 in every halfword, xr5..xr8 = shufb+48 mask and +2/+4/+6
function ff_hevc_put_hevc_uni_qpel_h48_8_lasx
    addi.d          a2, a2, -3              // src -= 3
    addi.d          t0, a5, -1
    slli.w          t0, t0, 4
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr0, t1, t0             // filter abcdefgh
    xvreplve0.q     xr0, xr0
    xvrepl128vei.h  xr1, xr0, 1             // cd...
    xvrepl128vei.h  xr2, xr0, 2             // ef...
    xvrepl128vei.h  xr3, xr0, 3             // gh...
    xvrepl128vei.h  xr0, xr0, 0             // ab...
    addi.w          t1, zero, 32
    xvreplgr2vr.h   xr4, t1
    la.local        t1, shufb
    vld             vr5, t1, 48
    xvreplve0.q     xr5, xr5
    xvaddi.bu       xr6, xr5, 2
    xvaddi.bu       xr7, xr5, 4
    xvaddi.bu       xr8, xr5, 6
.LOOP_UNI_H48_LASX:
    xvld            xr9, a2, 0
    xvpermi.d       xr9, xr9, 0x94          // lanes = bytes 0..15 | bytes 8..23
    HEVC_UNI_QPEL_H16_LASX xr9, xr14        // pixels 0..15
    xvld            xr9, a2, 16
    xvpermi.d       xr9, xr9, 0x94
    HEVC_UNI_QPEL_H16_LASX xr9, xr15        // pixels 16..31
    xvld            xr9, a2, 32
    xvpermi.d       xr9, xr9, 0x94
    HEVC_UNI_QPEL_H16_LASX xr9, xr16        // pixels 32..47
    add.d           a2, a2, a3              // next source row
    addi.d          a4, a4, -1
    xvssrani.bu.h   xr15, xr14, 0           // pack to u8, per 128-bit lane
    xvpermi.d       xr15, xr15, 0xd8        // restore pixel order across lanes
    xvst            xr15, a0, 0
    xvpermi.q       xr17, xr16, 0x01        // bring the high lane down
    vssrani.bu.h    vr17, vr16, 0           // pack pixels 32..47 to u8
    vst             vr17, a0, 32
    add.d           a0, a0, a1              // next destination row
    bnez            a4, .LOOP_UNI_H48_LASX
endfunc
|
|
|
|
// HEVC 8-bit QPEL horizontal filter, 64 pixels per row, LASX (four 16-wide
// filter passes per row).
// a0 = dst, a1 = dststride, a2 = src, a3 = srcstride, a4 = height, a5 = mx.
//   xr0..xr3 = tap pairs ab / cd / ef / gh broadcast to every halfword
//   xr4      = 32 in every halfword, xr5..xr8 = shufb+48 mask and +2/+4/+6
function ff_hevc_put_hevc_uni_qpel_h64_8_lasx
    addi.d          a2, a2, -3              // src -= 3
    addi.d          t0, a5, -1
    slli.w          t0, t0, 4
    la.local        t1, ff_hevc_qpel_filters
    vldx            vr0, t1, t0             // filter abcdefgh
    xvreplve0.q     xr0, xr0
    xvrepl128vei.h  xr1, xr0, 1             // cd...
    xvrepl128vei.h  xr2, xr0, 2             // ef...
    xvrepl128vei.h  xr3, xr0, 3             // gh...
    xvrepl128vei.h  xr0, xr0, 0             // ab...
    addi.w          t1, zero, 32
    xvreplgr2vr.h   xr4, t1
    la.local        t1, shufb
    vld             vr5, t1, 48
    xvreplve0.q     xr5, xr5
    xvaddi.bu       xr6, xr5, 2
    xvaddi.bu       xr7, xr5, 4
    xvaddi.bu       xr8, xr5, 6
.LOOP_UNI_H64_LASX:
    xvld            xr9, a2, 0
    xvpermi.d       xr9, xr9, 0x94          // lanes = bytes 0..15 | bytes 8..23
    HEVC_UNI_QPEL_H16_LASX xr9, xr14        // pixels 0..15
    xvld            xr9, a2, 16
    xvpermi.d       xr9, xr9, 0x94
    HEVC_UNI_QPEL_H16_LASX xr9, xr15        // pixels 16..31
    xvld            xr9, a2, 32
    xvpermi.d       xr9, xr9, 0x94
    HEVC_UNI_QPEL_H16_LASX xr9, xr16        // pixels 32..47
    xvld            xr9, a2, 48
    xvpermi.d       xr9, xr9, 0x94
    HEVC_UNI_QPEL_H16_LASX xr9, xr17        // pixels 48..63
    add.d           a2, a2, a3              // next source row
    addi.d          a4, a4, -1
    xvssrani.bu.h   xr15, xr14, 0           // pack to u8, per 128-bit lane
    xvpermi.d       xr15, xr15, 0xd8        // restore pixel order across lanes
    xvst            xr15, a0, 0
    xvssrani.bu.h   xr17, xr16, 0
    xvpermi.d       xr17, xr17, 0xd8
    xvst            xr17, a0, 32
    add.d           a0, a0, a1              // next destination row
    bnez            a4, .LOOP_UNI_H64_LASX
endfunc
|
|
|
|
/*
|
|
* void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
|
|
* const uint8_t *_src, ptrdiff_t _srcstride,
|
|
* int height, int denom, int wx, int ox,
|
|
* intptr_t mx, intptr_t my, int width)
|
|
*/
|
|
// HEVC 8-bit EPEL uni-weighted vertical filter, 4 pixels wide, two rows per
// iteration. Stack argument: my at sp+8.
//   vr0      = ff_hevc_epel_filters[my - 1] repeated in every 32-bit word
//   vr1..vr4 = wx / offset / shift / ox (per the comments on their uses below;
//              presumably loaded by LOAD_VAR, which is defined elsewhere)
//   t0, t1   = 2 * srcstride, 3 * srcstride
//   vr6      = one 4-byte column (rows n..n+3) per output pixel
// The loop counter drops by 2 and exits on zero, so height must be even.
function ff_hevc_put_hevc_epel_uni_w_v4_8_lsx
    LOAD_VAR        128
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    la.local        t1, ff_hevc_epel_filters
    vldx            vr0, t1, t0             // filter
    vreplvei.w      vr0, vr0, 0
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    sub.d           a2, a2, a3              // src -= stride
    fld.s           f6, a2, 0               // row 0
    fldx.s          f7, a2, a3              // row 1
    fldx.s          f8, a2, t0              // row 2
    add.d           a2, a2, t1
    vilvl.b         vr6, vr7, vr6           // r0 r1 pairs
    vilvl.b         vr7, vr8, vr8           // r2 r2 pairs (4th byte filled below)
    vilvl.h         vr6, vr7, vr6           // r0 r1 r2 -- per pixel
.LOOP_UNI_V4:
    fld.s           f9, a2, 0               // row 3
    fldx.s          f10, a2, a3             // row 4
    add.d           a2, a2, t0
    vextrins.b      vr6, vr9, 0x30          // put row 3 into byte 3 of each word
    vextrins.b      vr6, vr9, 0x71
    vextrins.b      vr6, vr9, 0xb2
    vextrins.b      vr6, vr9, 0xf3
    vbsrl.v         vr7, vr6, 1             // slide the columns down one row
    vextrins.b      vr7, vr10, 0x30         // put row 4 into byte 3 of each word
    vextrins.b      vr7, vr10, 0x71
    vextrins.b      vr7, vr10, 0xb2
    vextrins.b      vr7, vr10, 0xf3
    vdp2.h.bu.b     vr8, vr6, vr0           // EPEL_FILTER(src, stride)
    vdp2.h.bu.b     vr9, vr7, vr0
    vhaddw.w.h      vr10, vr8, vr8
    vhaddw.w.h      vr11, vr9, vr9
    vmulwev.w.h     vr10, vr10, vr1         // EPEL_FILTER(src, stride) * wx
    vmulwev.w.h     vr11, vr11, vr1
    vadd.w          vr10, vr10, vr2         // + offset
    vadd.w          vr11, vr11, vr2
    vsra.w          vr10, vr10, vr3         // >> shift
    vsra.w          vr11, vr11, vr3
    vadd.w          vr10, vr10, vr4         // + ox
    vadd.w          vr11, vr11, vr4
    vssrani.h.w     vr11, vr10, 0           // saturate to s16
    vssrani.bu.h    vr10, vr11, 0           // saturate to u8
    vbsrl.v         vr6, vr7, 1             // columns for the next iteration
    fst.s           f10, a0, 0              // output row 0
    vbsrl.v         vr10, vr10, 4
    fstx.s          f10, a0, a1             // output row 1
    alsl.d          a0, a1, a0, 1           // dst += 2 * dststride
    addi.d          a4, a4, -2
    bnez            a4, .LOOP_UNI_V4
endfunc
|
|
|
|
// Vertical EPEL filter plus weighting for 8 pixels (LSX).
// In:  vr10 / vr11 = byte-interleaved row pairs, vr0 / vr5 = matching tap
//      pairs, vr1 = wx, vr2 = offset, vr3 = shift, vr4 = ox.
// Out: \out0 = 32-bit results for the low 4 pixels, \out1 = high 4 pixels.
// Scratch: vr12, vr13.
.macro CALC_EPEL_FILTER_LSX out0, out1
    vdp2.h.bu.b     vr12, vr10, vr0         // EPEL_FILTER(src, stride)
    vdp2add.h.bu.b  vr12, vr11, vr5
    vexth.w.h       vr13, vr12              // high 4 sums -> s32
    vsllwil.w.h     vr12, vr12, 0           // low 4 sums  -> s32
    vmulwev.w.h     vr12, vr12, vr1         // * wx
    vmulwev.w.h     vr13, vr13, vr1
    vadd.w          vr12, vr12, vr2         // + offset
    vadd.w          vr13, vr13, vr2
    vsra.w          vr12, vr12, vr3         // >> shift
    vsra.w          vr13, vr13, vr3
    vadd.w          \out0, vr12, vr4        // + ox
    vadd.w          \out1, vr13, vr4
.endm
|
|
|
|
// Vertical EPEL filter plus weighting for 8 pixels (LASX).
// In:  xr12 = one 4-byte column per pixel, xr0 = the 4 taps in every word,
//      xr1 = wx, xr2 = offset, xr3 = shift, xr4 = ox.
// Out: \out0 = eight 32-bit results. Scratch: xr11, xr12.
.macro CALC_EPEL_FILTER_LASX out0
    xvdp2.h.bu.b    xr11, xr12, xr0         // EPEL_FILTER(src, stride)
    xvhaddw.w.h     xr12, xr11, xr11        // add the two partial sums
    xvmulwev.w.h    xr12, xr12, xr1         // * wx
    xvadd.w         xr12, xr12, xr2         // + offset
    xvsra.w         xr12, xr12, xr3         // >> shift
    xvadd.w         \out0, xr12, xr4        // + ox
.endm
|
|
|
|
// \w is appended to the loop label and is also the width tested by the ".if" below.
|
|
// Vertical EPEL uni-weighted filter over an 8-byte-wide column, LSX.
// Expects a2 = src - stride, a3 = stride, t0 = 2 * stride, t1 = 3 * stride,
// a0 / a1 = dst / dststride, a4 = height, plus the registers listed for
// CALC_EPEL_FILTER_LSX. Stores 6 bytes per row when \w < 8, else 8 bytes.
.macro PUT_HEVC_EPEL_UNI_W_V8_LSX w
    fld.d           f6, a2, 0               // row 0
    fldx.d          f7, a2, a3              // row 1
    fldx.d          f8, a2, t0              // row 2
    add.d           a2, a2, t1
.LOOP_UNI_V8_\w:
    fld.d           f9, a2, 0               // row 3
    add.d           a2, a2, a3
    vilvl.b         vr10, vr7, vr6          // rows 0/1 interleaved
    vilvl.b         vr11, vr9, vr8          // rows 2/3 interleaved
    vaddi.bu        vr6, vr7, 0             // shift the row window by one
    vaddi.bu        vr7, vr8, 0
    vaddi.bu        vr8, vr9, 0
    CALC_EPEL_FILTER_LSX vr12, vr13
    vssrani.h.w     vr13, vr12, 0           // saturate to s16
    vssrani.bu.h    vr13, vr13, 0           // saturate to u8
.if \w < 8
    fst.s           f13, a0, 0              // pixels 0..3
    vstelm.h        vr13, a0, 4, 2          // pixels 4..5
.else
    fst.d           f13, a0, 0              // pixels 0..7
.endif
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_UNI_V8_\w
.endm
|
|
|
|
// \w is appended to the loop label and is also the width tested by the ".if" below.
|
|
// Vertical EPEL uni-weighted filter over an 8-byte-wide column, LASX.
// Expects a2 = src - stride, a3 = stride, t0 = 2 * stride, t1 = 3 * stride,
// a0 / a1 = dst / dststride, a4 = height, plus the registers listed for
// CALC_EPEL_FILTER_LASX. Stores 6 bytes per row when \w < 8, else 8 bytes.
.macro PUT_HEVC_EPEL_UNI_W_V8_LASX w
    fld.d           f6, a2, 0               // row 0
    fldx.d          f7, a2, a3              // row 1
    fldx.d          f8, a2, t0              // row 2
    add.d           a2, a2, t1
.LOOP_UNI_V8_LASX_\w:
    fld.d           f9, a2, 0               // row 3
    add.d           a2, a2, a3
    vilvl.b         vr10, vr7, vr6          // rows 0/1 interleaved
    vilvl.b         vr11, vr9, vr8          // rows 2/3 interleaved
    xvilvl.h        xr12, xr11, xr10        // 4-byte columns, pixels 0..3
    xvilvh.h        xr13, xr11, xr10        // 4-byte columns, pixels 4..7
    xvpermi.q       xr12, xr13, 0x02        // pixels 0..3 | pixels 4..7
    vaddi.bu        vr6, vr7, 0             // shift the row window by one
    vaddi.bu        vr7, vr8, 0
    vaddi.bu        vr8, vr9, 0
    CALC_EPEL_FILTER_LASX xr12
    xvpermi.q       xr13, xr12, 0x01        // bring the high lane down
    vssrani.h.w     vr13, vr12, 0           // saturate to s16
    vssrani.bu.h    vr13, vr13, 0           // saturate to u8
.if \w < 8
    fst.s           f13, a0, 0              // pixels 0..3
    vstelm.h        vr13, a0, 4, 2          // pixels 4..5
.else
    fst.d           f13, a0, 0              // pixels 0..7
.endif
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_UNI_V8_LASX_\w
.endm
|
|
|
|
// HEVC 8-bit EPEL uni-weighted vertical filter, width 6, LSX.
// Stack argument: my at sp+8.
//   vr0 / vr5 = first / second tap pair of ff_hevc_epel_filters[my - 1]
//   t0, t1    = 2 * srcstride, 3 * srcstride;  a2 = src - srcstride
function ff_hevc_put_hevc_epel_uni_w_v6_8_lsx
    LOAD_VAR        128
    la.local        t1, ff_hevc_epel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr0, t1, t0             // filter
    vreplvei.h      vr5, vr0, 1
    vreplvei.h      vr0, vr0, 0
    sub.d           a2, a2, a3              // src -= stride
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    PUT_HEVC_EPEL_UNI_W_V8_LSX 6
endfunc
|
|
|
|
// HEVC 8-bit EPEL uni-weighted vertical filter, width 6, LASX.
// Stack argument: my at sp+8.
//   xr0    = ff_hevc_epel_filters[my - 1] repeated in every 32-bit word
//   t0, t1 = 2 * srcstride, 3 * srcstride;  a2 = src - srcstride
function ff_hevc_put_hevc_epel_uni_w_v6_8_lasx
    LOAD_VAR        256
    la.local        t1, ff_hevc_epel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr0, t1, t0             // filter
    xvreplve0.w     xr0, xr0
    sub.d           a2, a2, a3              // src -= stride
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    PUT_HEVC_EPEL_UNI_W_V8_LASX 6
endfunc
|
|
|
|
// HEVC 8-bit EPEL uni-weighted vertical filter, width 8, LSX.
// Stack argument: my at sp+8.
//   vr0 / vr5 = first / second tap pair of ff_hevc_epel_filters[my - 1]
//   t0, t1    = 2 * srcstride, 3 * srcstride;  a2 = src - srcstride
function ff_hevc_put_hevc_epel_uni_w_v8_8_lsx
    LOAD_VAR        128
    la.local        t1, ff_hevc_epel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr0, t1, t0             // filter
    vreplvei.h      vr5, vr0, 1
    vreplvei.h      vr0, vr0, 0
    sub.d           a2, a2, a3              // src -= stride
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    PUT_HEVC_EPEL_UNI_W_V8_LSX 8
endfunc
|
|
|
|
// HEVC 8-bit EPEL uni-weighted vertical filter, width 8, LASX.
// Stack argument: my at sp+8.
//   xr0    = ff_hevc_epel_filters[my - 1] repeated in every 32-bit word
//   t0, t1 = 2 * srcstride, 3 * srcstride;  a2 = src - srcstride
function ff_hevc_put_hevc_epel_uni_w_v8_8_lasx
    LOAD_VAR        256
    la.local        t1, ff_hevc_epel_filters
    ld.d            t0, sp, 8               // my
    addi.d          t0, t0, -1
    slli.w          t0, t0, 2
    vldx            vr0, t1, t0             // filter
    xvreplve0.w     xr0, xr0
    sub.d           a2, a2, a3              // src -= stride
    slli.d          t0, a3, 1               // stride * 2
    add.d           t1, t0, a3              // stride * 3
    PUT_HEVC_EPEL_UNI_W_V8_LASX 8
endfunc
|
|
|
|
// \w is appended to the loop label and is also the width tested by the ".if" below.
|
|
// Vertical EPEL uni-weighted filter over a 16-byte-wide column, LSX.
// Expects a2 = src - stride, a3 = stride, t0 = 2 * stride, t1 = 3 * stride,
// a0 / a1 = dst / dststride, a4 = height, plus the registers listed for
// CALC_EPEL_FILTER_LSX. Stores 12 bytes per row when \w < 16, else 16 bytes.
.macro PUT_HEVC_EPEL_UNI_W_V16_LSX w
    vld             vr6, a2, 0              // row 0
    vldx            vr7, a2, a3             // row 1
    vldx            vr8, a2, t0             // row 2
    add.d           a2, a2, t1
.LOOP_UNI_V16_\w:
    vld             vr9, a2, 0              // row 3
    add.d           a2, a2, a3
    vilvl.b         vr10, vr7, vr6          // low 8 pixels: rows 0/1
    vilvl.b         vr11, vr9, vr8          //               rows 2/3
    CALC_EPEL_FILTER_LSX vr14, vr15
    vilvh.b         vr10, vr7, vr6          // high 8 pixels: rows 0/1
    vilvh.b         vr11, vr9, vr8          //                rows 2/3
    CALC_EPEL_FILTER_LSX vr16, vr17
    vssrani.h.w     vr15, vr14, 0           // pixels 0..7  -> s16
    vssrani.h.w     vr17, vr16, 0           // pixels 8..15 -> s16
    vssrani.bu.h    vr17, vr15, 0           // all 16 -> u8
    vaddi.bu        vr6, vr7, 0             // shift the row window by one
    vaddi.bu        vr7, vr8, 0
    vaddi.bu        vr8, vr9, 0
.if \w < 16
    fst.d           f17, a0, 0              // pixels 0..7
    vstelm.w        vr17, a0, 8, 2          // pixels 8..11
.else
    vst             vr17, a0, 0
.endif
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_UNI_V16_\w
.endm
|
|
|
|
// \w is appended to the loop label and is also the width tested by the ".if" below.
|
|
// Vertical EPEL uni-weighted filter over a 16-byte-wide column, LASX.
// Expects a2 = src - stride, a3 = stride, t0 = 2 * stride, t1 = 3 * stride,
// a0 / a1 = dst / dststride, a4 = height, xr0 / xr5 = tap pairs,
// xr1 = wx, xr2 = offset, xr3 = shift, xr4 = ox.
// Stores 12 bytes per row when \w < 16, else 16 bytes.
.macro PUT_HEVC_EPEL_UNI_W_V16_LASX w
    vld             vr6, a2, 0              // row 0
    vldx            vr7, a2, a3             // row 1
    vldx            vr8, a2, t0             // row 2
    add.d           a2, a2, t1
.LOOP_UNI_V16_LASX_\w:
    vld             vr9, a2, 0              // row 3
    add.d           a2, a2, a3
    xvilvl.b        xr10, xr7, xr6          // rows 0/1, pixels 0..7
    xvilvh.b        xr11, xr7, xr6          // rows 0/1, pixels 8..15
    xvpermi.q       xr11, xr10, 0x20        // pixels 0..7 | pixels 8..15
    xvilvl.b        xr12, xr9, xr8          // rows 2/3, pixels 0..7
    xvilvh.b        xr13, xr9, xr8          // rows 2/3, pixels 8..15
    xvpermi.q       xr13, xr12, 0x20
    xvdp2.h.bu.b    xr10, xr11, xr0         // EPEL_FILTER(src, stride)
    xvdp2add.h.bu.b xr10, xr13, xr5
    xvexth.w.h      xr11, xr10              // upper 4 sums of each lane -> s32
    xvsllwil.w.h    xr10, xr10, 0           // lower 4 sums of each lane -> s32
    xvmulwev.w.h    xr10, xr10, xr1         // * wx
    xvmulwev.w.h    xr11, xr11, xr1
    xvadd.w         xr10, xr10, xr2         // + offset
    xvadd.w         xr11, xr11, xr2
    xvsra.w         xr10, xr10, xr3         // >> shift
    xvsra.w         xr11, xr11, xr3
    xvadd.w         xr10, xr10, xr4         // + ox
    xvadd.w         xr11, xr11, xr4
    xvssrani.h.w    xr11, xr10, 0           // saturate to s16
    xvpermi.q       xr10, xr11, 0x01        // bring the high lane down
    vssrani.bu.h    vr10, vr11, 0           // all 16 -> u8
    vaddi.bu        vr6, vr7, 0             // shift the row window by one
    vaddi.bu        vr7, vr8, 0
    vaddi.bu        vr8, vr9, 0
.if \w < 16
    fst.d           f10, a0, 0              // pixels 0..7
    vstelm.w        vr10, a0, 8, 2          // pixels 8..11
.else
    vst             vr10, a0, 0
.endif
    add.d           a0, a0, a1
    addi.d          a4, a4, -1
    bnez            a4, .LOOP_UNI_V16_LASX_\w
.endm
|
|
|
|
// Vertical EPEL filter with uni-directional weighting, 8-bit, width 12 (LSX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// my is read from the stack at sp + 8.
// LOAD_VAR is defined outside this view; presumably it prepares the weighting
// registers from denom/wx/ox -- confirm.
function ff_hevc_put_hevc_epel_uni_w_v12_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 8 //my
|
|
addi.d t0, t0, -1 // t0 = my - 1
|
|
slli.w t0, t0, 2 // table byte offset = (my - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
sub.d a2, a2, a3 //src -= stride
|
|
vreplvei.h vr5, vr0, 1 // splat halfword 1 of the entry (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 of the entry (taps 0,1)
|
|
// Argument 12 is below 16, so the macro stores 12 bytes per row.
PUT_HEVC_EPEL_UNI_W_V16_LSX 12
|
|
endfunc
|
|
|
|
// Vertical EPEL filter with uni-directional weighting, 8-bit, width 12 (LASX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// my is read from the stack at sp + 8.
// LOAD_VAR is defined outside this view; presumably it prepares the weighting
// registers from denom/wx/ox -- confirm.
function ff_hevc_put_hevc_epel_uni_w_v12_8_lasx
|
|
LOAD_VAR 256
|
|
ld.d t0, sp, 8 //my
|
|
addi.d t0, t0, -1 // t0 = my - 1
|
|
slli.w t0, t0, 2 // table byte offset = (my - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
sub.d a2, a2, a3 //src -= stride
|
|
xvrepl128vei.h xr5, xr0, 1 // splat halfword 1 per lane (taps 2,3)
|
|
xvrepl128vei.h xr0, xr0, 0 // splat halfword 0 per lane (taps 0,1)
|
|
// Argument 12 is below 16, so the macro stores 12 bytes per row.
PUT_HEVC_EPEL_UNI_W_V16_LASX 12
|
|
endfunc
|
|
|
|
// Vertical EPEL filter with uni-directional weighting, 8-bit, width 16 (LSX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// my is read from the stack at sp + 8.
// LOAD_VAR is defined outside this view; presumably it prepares the weighting
// registers from denom/wx/ox -- confirm.
function ff_hevc_put_hevc_epel_uni_w_v16_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 8 //my
|
|
addi.d t0, t0, -1 // t0 = my - 1
|
|
slli.w t0, t0, 2 // table byte offset = (my - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
sub.d a2, a2, a3 //src -= stride
|
|
vreplvei.h vr5, vr0, 1 // splat halfword 1 of the entry (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 of the entry (taps 0,1)
|
|
// One 16-byte column covers the whole width.
PUT_HEVC_EPEL_UNI_W_V16_LSX 16
|
|
endfunc
|
|
|
|
// Vertical EPEL filter with uni-directional weighting, 8-bit, width 16 (LASX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// my is read from the stack at sp + 8.
// LOAD_VAR is defined outside this view; presumably it prepares the weighting
// registers from denom/wx/ox -- confirm.
function ff_hevc_put_hevc_epel_uni_w_v16_8_lasx
|
|
LOAD_VAR 256
|
|
ld.d t0, sp, 8 //my
|
|
addi.d t0, t0, -1 // t0 = my - 1
|
|
slli.w t0, t0, 2 // table byte offset = (my - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
sub.d a2, a2, a3 //src -= stride
|
|
xvrepl128vei.h xr5, xr0, 1 // splat halfword 1 per lane (taps 2,3)
|
|
xvrepl128vei.h xr0, xr0, 0 // splat halfword 0 per lane (taps 0,1)
|
|
// One 16-byte column covers the whole width.
PUT_HEVC_EPEL_UNI_W_V16_LASX 16
|
|
endfunc
|
|
|
|
// Vertical EPEL filter with uni-directional weighting, 8-bit, width 24 (LSX).
// Processed as a 16-byte column followed by an 8-byte column at offset 16.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// my is read from the stack at sp + 8. t2/t3/t4 keep the initial dst, src and
// height so the second column can restart from the top.
// LOAD_VAR and PUT_HEVC_EPEL_UNI_W_V8_LSX are defined outside this view.
function ff_hevc_put_hevc_epel_uni_w_v24_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 8 //my
|
|
addi.d t0, t0, -1 // t0 = my - 1
|
|
slli.w t0, t0, 2 // table byte offset = (my - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
sub.d a2, a2, a3 //src -= stride
|
|
vreplvei.h vr5, vr0, 1 // splat halfword 1 of the entry (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 of the entry (taps 0,1)
|
|
addi.d t2, a0, 0 //save init
|
|
addi.d t3, a2, 0
|
|
addi.d t4, a4, 0
|
|
PUT_HEVC_EPEL_UNI_W_V16_LSX 24
|
|
addi.d a0, t2, 16 //increase step
|
|
addi.d a2, t3, 16
|
|
addi.d a4, t4, 0 // restore the row counter
|
|
// Remaining 8 columns; presumably this macro consumes vr0/vr5 in the same
// form as the 16-wide macro -- confirm against its definition.
PUT_HEVC_EPEL_UNI_W_V8_LSX 24
|
|
endfunc
|
|
|
|
// Vertical EPEL filter with uni-directional weighting, 8-bit, width 24 (LASX).
// Processed as a 16-byte column followed by an 8-byte column at offset 16.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// my is read from the stack at sp + 8. t2/t3/t4 keep the initial dst, src and
// height so the second column can restart from the top.
// LOAD_VAR and PUT_HEVC_EPEL_UNI_W_V8_LASX are defined outside this view.
function ff_hevc_put_hevc_epel_uni_w_v24_8_lasx
|
|
LOAD_VAR 256
|
|
ld.d t0, sp, 8 //my
|
|
addi.d t0, t0, -1 // t0 = my - 1
|
|
slli.w t0, t0, 2 // table byte offset = (my - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
xvreplve0.w xr20, xr0 //save xr0 (word 0 of the entry splatted to every word; copied back into xr0 before the 8-wide macro)
|
|
xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
sub.d a2, a2, a3 //src -= stride
|
|
xvrepl128vei.h xr5, xr0, 1 // splat halfword 1 per lane (taps 2,3)
|
|
xvrepl128vei.h xr0, xr0, 0 // splat halfword 0 per lane (taps 0,1)
|
|
addi.d t2, a0, 0 //save init
|
|
addi.d t3, a2, 0
|
|
addi.d t4, a4, 0
|
|
PUT_HEVC_EPEL_UNI_W_V16_LASX 24
|
|
addi.d a0, t2, 16 //increase step
|
|
addi.d a2, t3, 16
|
|
addi.d a4, t4, 0 // restore the row counter
|
|
xvaddi.bu xr0, xr20, 0 // xr0 = saved word-splatted filter (add of 0 acts as a move)
|
|
// Remaining 8 columns; presumably this macro expects the word-splatted
// filter in xr0 -- confirm against its definition.
PUT_HEVC_EPEL_UNI_W_V8_LASX 24
|
|
endfunc
|
|
|
|
// Vertical EPEL filter with uni-directional weighting, 8-bit, width 32 (LSX).
// Processed as two 16-byte columns. t2/t3/t4 keep the initial dst, src and
// height so that each column restarts from the top row.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// my is read from the stack at sp + 8. LOAD_VAR is defined outside this view.
function ff_hevc_put_hevc_epel_uni_w_v32_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 8 //my
|
|
addi.d t0, t0, -1 // t0 = my - 1
|
|
slli.w t0, t0, 2 // table byte offset = (my - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
sub.d a2, a2, a3 //src -= stride
|
|
vreplvei.h vr5, vr0, 1 // splat halfword 1 of the entry (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 of the entry (taps 0,1)
|
|
addi.d t2, a0, 0 // save initial dst
|
|
addi.d t3, a2, 0 // save initial src
|
|
addi.d t4, a4, 0 // save height
|
|
PUT_HEVC_EPEL_UNI_W_V16_LSX 32
|
|
addi.d a0, t2, 16 // second column: dst + 16
|
|
addi.d a2, t3, 16 // second column: src + 16
|
|
addi.d a4, t4, 0 // restore the row counter
|
|
// The argument only has to be unique (label suffix) and not below 16.
PUT_HEVC_EPEL_UNI_W_V16_LSX 33
|
|
endfunc
|
|
|
|
// Vertical EPEL filter with uni-directional weighting, 8-bit, width 32 (LASX).
// Processed as two 16-byte columns. t2/t3/t4 keep the initial dst, src and
// height so that each column restarts from the top row.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// my is read from the stack at sp + 8. LOAD_VAR is defined outside this view.
function ff_hevc_put_hevc_epel_uni_w_v32_8_lasx
|
|
LOAD_VAR 256
|
|
ld.d t0, sp, 8 //my
|
|
addi.d t0, t0, -1 // t0 = my - 1
|
|
slli.w t0, t0, 2 // table byte offset = (my - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
sub.d a2, a2, a3 //src -= stride
|
|
xvrepl128vei.h xr5, xr0, 1 // splat halfword 1 per lane (taps 2,3)
|
|
xvrepl128vei.h xr0, xr0, 0 // splat halfword 0 per lane (taps 0,1)
|
|
addi.d t2, a0, 0 // save initial dst
|
|
addi.d t3, a2, 0 // save initial src
|
|
addi.d t4, a4, 0 // save height
|
|
PUT_HEVC_EPEL_UNI_W_V16_LASX 32
|
|
addi.d a0, t2, 16 // second column: dst + 16
|
|
addi.d a2, t3, 16 // second column: src + 16
|
|
addi.d a4, t4, 0 // restore the row counter
|
|
// The argument only has to be unique (label suffix) and not below 16.
PUT_HEVC_EPEL_UNI_W_V16_LASX 33
|
|
endfunc
|
|
|
|
// Vertical EPEL filter with uni-directional weighting, 8-bit, width 48 (LSX).
// Processed as three 16-byte columns. t2/t3/t4 keep the initial dst, src and
// height so that each column restarts from the top row.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// my is read from the stack at sp + 8. LOAD_VAR is defined outside this view.
function ff_hevc_put_hevc_epel_uni_w_v48_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 8 //my
|
|
addi.d t0, t0, -1 // t0 = my - 1
|
|
slli.w t0, t0, 2 // table byte offset = (my - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
sub.d a2, a2, a3 //src -= stride
|
|
vreplvei.h vr5, vr0, 1 // splat halfword 1 of the entry (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 of the entry (taps 0,1)
|
|
addi.d t2, a0, 0 // save initial dst
|
|
addi.d t3, a2, 0 // save initial src
|
|
addi.d t4, a4, 0 // save height
|
|
PUT_HEVC_EPEL_UNI_W_V16_LSX 48
|
|
addi.d a0, t2, 16 // column 1: dst + 16
|
|
addi.d a2, t3, 16 // column 1: src + 16
|
|
addi.d a4, t4, 0 // restore the row counter
|
|
// Arguments 49 and 50 only provide unique label suffixes (not below 16).
PUT_HEVC_EPEL_UNI_W_V16_LSX 49
|
|
addi.d a0, t2, 32 // column 2: dst + 32
|
|
addi.d a2, t3, 32 // column 2: src + 32
|
|
addi.d a4, t4, 0 // restore the row counter
|
|
PUT_HEVC_EPEL_UNI_W_V16_LSX 50
|
|
endfunc
|
|
|
|
// Vertical EPEL filter with uni-directional weighting, 8-bit, width 48 (LASX).
// Processed as three 16-byte columns. t2/t3/t4 keep the initial dst, src and
// height so that each column restarts from the top row.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// my is read from the stack at sp + 8. LOAD_VAR is defined outside this view.
function ff_hevc_put_hevc_epel_uni_w_v48_8_lasx
|
|
LOAD_VAR 256
|
|
ld.d t0, sp, 8 //my
|
|
addi.d t0, t0, -1 // t0 = my - 1
|
|
slli.w t0, t0, 2 // table byte offset = (my - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
sub.d a2, a2, a3 //src -= stride
|
|
xvrepl128vei.h xr5, xr0, 1 // splat halfword 1 per lane (taps 2,3)
|
|
xvrepl128vei.h xr0, xr0, 0 // splat halfword 0 per lane (taps 0,1)
|
|
addi.d t2, a0, 0 // save initial dst
|
|
addi.d t3, a2, 0 // save initial src
|
|
addi.d t4, a4, 0 // save height
|
|
PUT_HEVC_EPEL_UNI_W_V16_LASX 48
|
|
addi.d a0, t2, 16 // column 1: dst + 16
|
|
addi.d a2, t3, 16 // column 1: src + 16
|
|
addi.d a4, t4, 0 // restore the row counter
|
|
// Arguments 49 and 50 only provide unique label suffixes (not below 16).
PUT_HEVC_EPEL_UNI_W_V16_LASX 49
|
|
addi.d a0, t2, 32 // column 2: dst + 32
|
|
addi.d a2, t3, 32 // column 2: src + 32
|
|
addi.d a4, t4, 0 // restore the row counter
|
|
PUT_HEVC_EPEL_UNI_W_V16_LASX 50
|
|
endfunc
|
|
|
|
// Vertical EPEL filter with uni-directional weighting, 8-bit, width 64 (LSX).
// Processed as four 16-byte columns. t2/t3/t4 keep the initial dst, src and
// height so that each column restarts from the top row.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// my is read from the stack at sp + 8. LOAD_VAR is defined outside this view.
function ff_hevc_put_hevc_epel_uni_w_v64_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 8 //my
|
|
addi.d t0, t0, -1 // t0 = my - 1
|
|
slli.w t0, t0, 2 // table byte offset = (my - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
sub.d a2, a2, a3 //src -= stride
|
|
vreplvei.h vr5, vr0, 1 // splat halfword 1 of the entry (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 of the entry (taps 0,1)
|
|
addi.d t2, a0, 0 // save initial dst
|
|
addi.d t3, a2, 0 // save initial src
|
|
addi.d t4, a4, 0 // save height
|
|
PUT_HEVC_EPEL_UNI_W_V16_LSX 64
|
|
addi.d a0, t2, 16 // column 1: dst + 16
|
|
addi.d a2, t3, 16 // column 1: src + 16
|
|
addi.d a4, t4, 0 // restore the row counter
|
|
// Arguments 65..67 only provide unique label suffixes (not below 16).
PUT_HEVC_EPEL_UNI_W_V16_LSX 65
|
|
addi.d a0, t2, 32 // column 2: dst + 32
|
|
addi.d a2, t3, 32 // column 2: src + 32
|
|
addi.d a4, t4, 0 // restore the row counter
|
|
PUT_HEVC_EPEL_UNI_W_V16_LSX 66
|
|
addi.d a0, t2, 48 // column 3: dst + 48
|
|
addi.d a2, t3, 48 // column 3: src + 48
|
|
addi.d a4, t4, 0 // restore the row counter
|
|
PUT_HEVC_EPEL_UNI_W_V16_LSX 67
|
|
endfunc
|
|
|
|
// Vertical EPEL filter with uni-directional weighting, 8-bit, width 64 (LASX).
// Processed as four 16-byte columns. t2/t3/t4 keep the initial dst, src and
// height so that each column restarts from the top row.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// my is read from the stack at sp + 8. LOAD_VAR is defined outside this view.
function ff_hevc_put_hevc_epel_uni_w_v64_8_lasx
|
|
LOAD_VAR 256
|
|
ld.d t0, sp, 8 //my
|
|
addi.d t0, t0, -1 // t0 = my - 1
|
|
slli.w t0, t0, 2 // table byte offset = (my - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
sub.d a2, a2, a3 //src -= stride
|
|
xvrepl128vei.h xr5, xr0, 1 // splat halfword 1 per lane (taps 2,3)
|
|
xvrepl128vei.h xr0, xr0, 0 // splat halfword 0 per lane (taps 0,1)
|
|
addi.d t2, a0, 0 // save initial dst
|
|
addi.d t3, a2, 0 // save initial src
|
|
addi.d t4, a4, 0 // save height
|
|
PUT_HEVC_EPEL_UNI_W_V16_LASX 64
|
|
addi.d a0, t2, 16 // column 1: dst + 16
|
|
addi.d a2, t3, 16 // column 1: src + 16
|
|
addi.d a4, t4, 0 // restore the row counter
|
|
// Arguments 65..67 only provide unique label suffixes (not below 16).
PUT_HEVC_EPEL_UNI_W_V16_LASX 65
|
|
addi.d a0, t2, 32 // column 2: dst + 32
|
|
addi.d a2, t3, 32 // column 2: src + 32
|
|
addi.d a4, t4, 0 // restore the row counter
|
|
PUT_HEVC_EPEL_UNI_W_V16_LASX 66
|
|
addi.d a0, t2, 48 // column 3: dst + 48
|
|
addi.d a2, t3, 48 // column 3: src + 48
|
|
addi.d a4, t4, 0 // restore the row counter
|
|
PUT_HEVC_EPEL_UNI_W_V16_LASX 67
|
|
endfunc
|
|
|
|
/*
|
|
* void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
|
|
* const uint8_t *_src, ptrdiff_t _srcstride,
|
|
* int height, int denom, int wx, int ox,
|
|
* intptr_t mx, intptr_t my, int width)
|
|
*/
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 4 (LSX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0.
// LOAD_VAR and the shufb table are defined outside this view; vr1..vr4 are
// presumably wx, rounding offset, shift and ox -- confirm against LOAD_VAR.
// NOTE(review): t0 (stride * 2) and t1 (stride * 3) are computed but not
// referenced anywhere in this function body.
function ff_hevc_put_hevc_epel_uni_w_h4_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
vreplvei.w vr0, vr0, 0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
vld vr5, t1, 0 // byte-gather mask from shufb + 0
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
.LOOP_UNI_W_H4:
|
|
fld.d f6, a2, 0 // load 8 source bytes
|
|
add.d a2, a2, a3
|
|
vshuf.b vr6, vr6, vr6, vr5 // gather the bytes each output pixel needs
|
|
vdp2.h.bu.b vr7, vr6, vr0 // unsigned pixel * signed tap, pairwise sums
|
|
vhaddw.w.h vr7, vr7, vr7 // add adjacent 16-bit sums into 32-bit lanes
|
|
vmulwev.w.h vr7, vr7, vr1 // presumably * wx
|
|
vadd.w vr7, vr7, vr2 // presumably + rounding offset
|
|
vsra.w vr7, vr7, vr3 // presumably >> shift
|
|
vadd.w vr7, vr7, vr4 // presumably + ox
|
|
vssrani.h.w vr7, vr7, 0 // saturating narrow 32 -> 16 bit
|
|
vssrani.bu.h vr7, vr7, 0 // saturating narrow 16 -> unsigned 8 bit
|
|
fst.s f7, a0, 0 // store 4 pixels
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H4
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 6 (LSX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0.
// LOAD_VAR, CALC_EPEL_FILTER_LSX and shufb are defined outside this view;
// the filter macro presumably consumes vr10/vr11 together with vr0/vr5 and
// the weighting registers -- confirm.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless the filter macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h6_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
vreplvei.w vr0, vr0, 0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
vld vr6, t1, 48 // gather mask for the first tap pair
|
|
vaddi.bu vr7, vr6, 2 // same mask shifted by two bytes for the second pair
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
vreplvei.h vr5, vr0, 1 // splat halfword 1 (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 (taps 0,1)
|
|
.LOOP_UNI_W_H6:
|
|
vld vr8, a2, 0
|
|
add.d a2, a2, a3
|
|
vshuf.b vr10, vr8, vr8, vr6
|
|
vshuf.b vr11, vr8, vr8, vr7
|
|
CALC_EPEL_FILTER_LSX vr14, vr15
|
|
vssrani.h.w vr15, vr14, 0 // saturating narrow 32 -> 16 bit
|
|
vssrani.bu.h vr15, vr15, 0 // saturating narrow 16 -> unsigned 8 bit
|
|
fst.s f15, a0, 0 // pixels 0..3
|
|
vstelm.h vr15, a0, 4, 2 // pixels 4..5 (halfword element 2)
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H6
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 6 (LASX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0.
// LOAD_VAR, CALC_EPEL_FILTER_LASX and shufb are defined outside this view;
// the filter macro presumably consumes xr12 together with xr0 and the
// weighting registers -- confirm.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless the filter macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h6_8_lasx
|
|
LOAD_VAR 256
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
xvreplve0.w xr0, xr0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
xvld xr6, t1, 64 // 32-byte gather mask from shufb + 64
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
.LOOP_UNI_W_H6_LASX:
|
|
vld vr8, a2, 0
|
|
xvreplve0.q xr8, xr8 // same 16 source bytes in both lanes
|
|
add.d a2, a2, a3
|
|
xvshuf.b xr12, xr8, xr8, xr6
|
|
CALC_EPEL_FILTER_LASX xr14
|
|
xvpermi.q xr15, xr14, 0x01 // low lane of xr15 = high lane of xr14
|
|
vssrani.h.w vr15, vr14, 0 // saturating narrow 32 -> 16 bit
|
|
vssrani.bu.h vr15, vr15, 0 // saturating narrow 16 -> unsigned 8 bit
|
|
fst.s f15, a0, 0 // pixels 0..3
|
|
vstelm.h vr15, a0, 4, 2 // pixels 4..5 (halfword element 2)
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H6_LASX
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 8 (LSX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0.
// LOAD_VAR, CALC_EPEL_FILTER_LSX and shufb are defined outside this view;
// the filter macro presumably consumes vr10/vr11 together with vr0/vr5 and
// the weighting registers -- confirm.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless the filter macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h8_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
vreplvei.w vr0, vr0, 0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
vld vr6, t1, 48 // gather mask for the first tap pair
|
|
vaddi.bu vr7, vr6, 2 // same mask shifted by two bytes for the second pair
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
vreplvei.h vr5, vr0, 1 // splat halfword 1 (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 (taps 0,1)
|
|
.LOOP_UNI_W_H8:
|
|
vld vr8, a2, 0
|
|
add.d a2, a2, a3
|
|
vshuf.b vr10, vr8, vr8, vr6
|
|
vshuf.b vr11, vr8, vr8, vr7
|
|
CALC_EPEL_FILTER_LSX vr14, vr15
|
|
vssrani.h.w vr15, vr14, 0 // saturating narrow 32 -> 16 bit
|
|
vssrani.bu.h vr15, vr15, 0 // saturating narrow 16 -> unsigned 8 bit
|
|
fst.d f15, a0, 0 // store 8 pixels
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H8
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 8 (LASX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0.
// LOAD_VAR, CALC_EPEL_FILTER_LASX and shufb are defined outside this view;
// the filter macro presumably consumes xr12 together with xr0 and the
// weighting registers -- confirm.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless the filter macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h8_8_lasx
|
|
LOAD_VAR 256
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
xvreplve0.w xr0, xr0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
xvld xr6, t1, 64 // 32-byte gather mask from shufb + 64
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
.LOOP_UNI_W_H8_LASX:
|
|
vld vr8, a2, 0
|
|
xvreplve0.q xr8, xr8 // same 16 source bytes in both lanes
|
|
add.d a2, a2, a3
|
|
xvshuf.b xr12, xr8, xr8, xr6
|
|
CALC_EPEL_FILTER_LASX xr14
|
|
xvpermi.q xr15, xr14, 0x01 // low lane of xr15 = high lane of xr14
|
|
vssrani.h.w vr15, vr14, 0 // saturating narrow 32 -> 16 bit
|
|
vssrani.bu.h vr15, vr15, 0 // saturating narrow 16 -> unsigned 8 bit
|
|
fst.d f15, a0, 0 // store 8 pixels
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H8_LASX
|
|
endfunc
|
|
|
|
// Filters 16 output pixels of one row as two 8-pixel halves (LSX).
//   idx0 / idx1 = byte offsets from a2 for the two source loads,
//   idx2        = byte offset from a0 for the 16-byte store.
// Uses vr6/vr7 as gather masks and clobbers vr8, vr10, vr11, vr14-vr17.
// CALC_EPEL_FILTER_LSX is defined outside this view; presumably it consumes
// vr10/vr11 and yields two vectors of 32-bit results -- confirm.
.macro EPEL_UNI_W_H16_LOOP_LSX idx0, idx1, idx2
|
|
vld vr8, a2, \idx0
|
|
vshuf.b vr10, vr8, vr8, vr6
|
|
vshuf.b vr11, vr8, vr8, vr7
|
|
CALC_EPEL_FILTER_LSX vr14, vr15
|
|
vld vr8, a2, \idx1
|
|
vshuf.b vr10, vr8, vr8, vr6
|
|
vshuf.b vr11, vr8, vr8, vr7
|
|
CALC_EPEL_FILTER_LSX vr16, vr17
|
|
vssrani.h.w vr15, vr14, 0 // first half: saturating narrow 32 -> 16 bit
|
|
vssrani.h.w vr17, vr16, 0 // second half: saturating narrow 32 -> 16 bit
|
|
vssrani.bu.h vr17, vr15, 0 // both halves: saturating narrow to unsigned 8 bit
|
|
vst vr17, a0, \idx2
|
|
.endm
|
|
|
|
// Filters 16 output pixels of one row as two 8-pixel halves (LASX).
//   idx0 = byte offset from a2 for the 32-byte source load,
//   idx2 = byte offset from a0 for the store (ignored when w == 12),
//   w    = block width; 12 selects a 12-byte store at offset 0.
// Uses xr6 as gather mask and clobbers xr8, xr9, xr12, xr14, xr16, xr17.
// CALC_EPEL_FILTER_LASX is defined outside this view; presumably it consumes
// xr12 and yields eight 32-bit results -- confirm.
.macro EPEL_UNI_W_H16_LOOP_LASX idx0, idx2, w
|
|
xvld xr8, a2, \idx0
|
|
xvpermi.d xr9, xr8, 0x09 // low lane of xr9 = source bytes 8..23
|
|
xvreplve0.q xr8, xr8 // source bytes 0..15 in both lanes
|
|
xvshuf.b xr12, xr8, xr8, xr6
|
|
CALC_EPEL_FILTER_LASX xr14
|
|
xvreplve0.q xr8, xr9 // source bytes 8..23 in both lanes
|
|
xvshuf.b xr12, xr8, xr8, xr6
|
|
CALC_EPEL_FILTER_LASX xr16
|
|
xvssrani.h.w xr16, xr14, 0 // saturating narrow 32 -> 16 bit, both halves
|
|
xvpermi.q xr17, xr16, 0x01 // low lane of xr17 = high lane of xr16
|
|
vssrani.bu.h vr17, vr16, 0 // saturating narrow 16 -> unsigned 8 bit
|
|
vpermi.w vr17, vr17, 0xd8 // swap the two middle words to restore pixel order
|
|
.if \w == 12
|
|
fst.d f17, a0, 0 // pixels 0..7
|
|
vstelm.w vr17, a0, 8, 2 // pixels 8..11 (word element 2)
|
|
.else
|
|
vst vr17, a0, \idx2
|
|
.endif
|
|
.endm
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 12 (LSX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0.
// LOAD_VAR, CALC_EPEL_FILTER_LSX and shufb are defined outside this view;
// the filter macro presumably consumes vr10/vr11 together with vr0/vr5 and
// the weighting registers -- confirm.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless the filter macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h12_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
vreplvei.w vr0, vr0, 0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
vld vr6, t1, 48 // gather mask for the first tap pair
|
|
vaddi.bu vr7, vr6, 2 // same mask shifted by two bytes for the second pair
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
vreplvei.h vr5, vr0, 1 // splat halfword 1 (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 (taps 0,1)
|
|
.LOOP_UNI_W_H12:
|
|
vld vr8, a2, 0 // source for pixels 0..7
|
|
vshuf.b vr10, vr8, vr8, vr6
|
|
vshuf.b vr11, vr8, vr8, vr7
|
|
CALC_EPEL_FILTER_LSX vr14, vr15
|
|
vld vr8, a2, 8 // source for pixels 8..15 (only 8..11 are stored)
|
|
vshuf.b vr10, vr8, vr8, vr6
|
|
vshuf.b vr11, vr8, vr8, vr7
|
|
CALC_EPEL_FILTER_LSX vr16, vr17
|
|
vssrani.h.w vr15, vr14, 0 // saturating narrow 32 -> 16 bit
|
|
vssrani.h.w vr17, vr16, 0
|
|
vssrani.bu.h vr17, vr15, 0 // saturating narrow 16 -> unsigned 8 bit
|
|
fst.d f17, a0, 0 // pixels 0..7
|
|
vstelm.w vr17, a0, 8, 2 // pixels 8..11 (word element 2)
|
|
add.d a2, a2, a3
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H12
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 12 (LASX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0. LOAD_VAR and shufb are defined
// outside this view.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless a nested macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h12_8_lasx
|
|
LOAD_VAR 256
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
xvreplve0.w xr0, xr0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
xvld xr6, t1, 64 // 32-byte gather mask from shufb + 64
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
.LOOP_UNI_W_H12_LASX:
|
|
// Third argument 12 selects the 12-byte store path of the macro.
EPEL_UNI_W_H16_LOOP_LASX 0, 0, 12
|
|
add.d a2, a2, a3
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H12_LASX
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 16 (LSX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0. LOAD_VAR and shufb are defined
// outside this view.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless a nested macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h16_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
vreplvei.w vr0, vr0, 0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
vld vr6, t1, 48 // gather mask for the first tap pair
|
|
vaddi.bu vr7, vr6, 2 // same mask shifted by two bytes for the second pair
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
vreplvei.h vr5, vr0, 1 // splat halfword 1 (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 (taps 0,1)
|
|
.LOOP_UNI_W_H16:
|
|
// Source offsets 0 and 8, destination offset 0.
EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0
|
|
add.d a2, a2, a3
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H16
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 16 (LASX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0. LOAD_VAR and shufb are defined
// outside this view.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless a nested macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h16_8_lasx
|
|
LOAD_VAR 256
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
xvreplve0.w xr0, xr0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
xvld xr6, t1, 64 // 32-byte gather mask from shufb + 64
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
.LOOP_UNI_W_H16_LASX:
|
|
// Source offset 0, destination offset 0, full 16-byte store.
EPEL_UNI_W_H16_LOOP_LASX 0, 0, 16
|
|
add.d a2, a2, a3
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H16_LASX
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 24 (LSX).
// Each row is produced as 16 pixels via the loop macro plus 8 more inline.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0. LOAD_VAR, CALC_EPEL_FILTER_LSX and
// shufb are defined outside this view.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless a nested macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h24_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
vreplvei.w vr0, vr0, 0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
vld vr6, t1, 48 // gather mask for the first tap pair
|
|
vaddi.bu vr7, vr6, 2 // same mask shifted by two bytes for the second pair
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
vreplvei.h vr5, vr0, 1 // splat halfword 1 (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 (taps 0,1)
|
|
.LOOP_UNI_W_H24:
|
|
// Pixels 0..15.
EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0
|
|
vld vr8, a2, 16 // source for pixels 16..23
|
|
add.d a2, a2, a3
|
|
vshuf.b vr10, vr8, vr8, vr6
|
|
vshuf.b vr11, vr8, vr8, vr7
|
|
CALC_EPEL_FILTER_LSX vr18, vr19
|
|
vssrani.h.w vr19, vr18, 0 // saturating narrow 32 -> 16 bit
|
|
vssrani.bu.h vr19, vr19, 0 // saturating narrow 16 -> unsigned 8 bit
|
|
fst.d f19, a0, 16 // pixels 16..23
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H24
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 24 (LASX).
// Each row is produced as 16 pixels via the loop macro plus 8 more inline.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0. LOAD_VAR, CALC_EPEL_FILTER_LASX and
// shufb are defined outside this view.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless a nested macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h24_8_lasx
|
|
LOAD_VAR 256
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
xvreplve0.w xr0, xr0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
xvld xr6, t1, 64 // 32-byte gather mask from shufb + 64
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
.LOOP_UNI_W_H24_LASX:
|
|
// Pixels 0..15 (third argument 24 is not 12, so a full 16-byte store).
EPEL_UNI_W_H16_LOOP_LASX 0, 0, 24
|
|
vld vr8, a2, 16 // source for pixels 16..23
|
|
add.d a2, a2, a3
|
|
xvreplve0.q xr8, xr8 // same 16 source bytes in both lanes
|
|
xvshuf.b xr12, xr8, xr8, xr6
|
|
CALC_EPEL_FILTER_LASX xr14
|
|
xvpermi.q xr15, xr14, 0x01 // low lane of xr15 = high lane of xr14
|
|
vssrani.h.w vr15, vr14, 0 // saturating narrow 32 -> 16 bit
|
|
vssrani.bu.h vr15, vr15, 0 // saturating narrow 16 -> unsigned 8 bit
|
|
fst.d f15, a0, 16 // pixels 16..23
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H24_LASX
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 32 (LSX).
// Each row is produced by two expansions of the 16-pixel loop macro.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0. LOAD_VAR and shufb are defined
// outside this view.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless a nested macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h32_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
vreplvei.w vr0, vr0, 0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
vld vr6, t1, 48 // gather mask for the first tap pair
|
|
vaddi.bu vr7, vr6, 2 // same mask shifted by two bytes for the second pair
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
vreplvei.h vr5, vr0, 1 // splat halfword 1 (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 (taps 0,1)
|
|
.LOOP_UNI_W_H32:
|
|
// Arguments: source offset A, source offset B, destination offset.
EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0
|
|
EPEL_UNI_W_H16_LOOP_LSX 16, 24, 16
|
|
add.d a2, a2, a3
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H32
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 32 (LASX).
// Each row is produced by two expansions of the 16-pixel loop macro.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0. LOAD_VAR and shufb are defined
// outside this view.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless a nested macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h32_8_lasx
|
|
LOAD_VAR 256
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
xvreplve0.w xr0, xr0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
xvld xr6, t1, 64 // 32-byte gather mask from shufb + 64
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
.LOOP_UNI_W_H32_LASX:
|
|
// Arguments: source offset, destination offset, block width.
EPEL_UNI_W_H16_LOOP_LASX 0, 0, 32
|
|
EPEL_UNI_W_H16_LOOP_LASX 16, 16, 32
|
|
add.d a2, a2, a3
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H32_LASX
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 48 (LSX).
// Each row is produced by three expansions of the 16-pixel loop macro.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0. LOAD_VAR and shufb are defined
// outside this view.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless a nested macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h48_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
vreplvei.w vr0, vr0, 0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
vld vr6, t1, 48 // gather mask for the first tap pair
|
|
vaddi.bu vr7, vr6, 2 // same mask shifted by two bytes for the second pair
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
vreplvei.h vr5, vr0, 1 // splat halfword 1 (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 (taps 0,1)
|
|
.LOOP_UNI_W_H48:
|
|
// Arguments: source offset A, source offset B, destination offset.
EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0
|
|
EPEL_UNI_W_H16_LOOP_LSX 16, 24, 16
|
|
EPEL_UNI_W_H16_LOOP_LSX 32, 40, 32
|
|
add.d a2, a2, a3
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H48
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 48 (LASX).
// Each row is produced by three expansions of the 16-pixel loop macro.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0. LOAD_VAR and shufb are defined
// outside this view.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless a nested macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h48_8_lasx
|
|
LOAD_VAR 256
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
xvreplve0.w xr0, xr0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
xvld xr6, t1, 64 // 32-byte gather mask from shufb + 64
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
.LOOP_UNI_W_H48_LASX:
|
|
// Arguments: source offset, destination offset, block width.
EPEL_UNI_W_H16_LOOP_LASX 0, 0, 48
|
|
EPEL_UNI_W_H16_LOOP_LASX 16, 16, 48
|
|
EPEL_UNI_W_H16_LOOP_LASX 32, 32, 48
|
|
add.d a2, a2, a3
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H48_LASX
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 64 (LSX).
// Each row is produced by four expansions of the 16-pixel loop macro.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0. LOAD_VAR and shufb are defined
// outside this view.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless a nested macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h64_8_lsx
|
|
LOAD_VAR 128
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
vreplvei.w vr0, vr0, 0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
vld vr6, t1, 48 // gather mask for the first tap pair
|
|
vaddi.bu vr7, vr6, 2 // same mask shifted by two bytes for the second pair
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
vreplvei.h vr5, vr0, 1 // splat halfword 1 (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 (taps 0,1)
|
|
.LOOP_UNI_W_H64:
|
|
// Arguments: source offset A, source offset B, destination offset.
EPEL_UNI_W_H16_LOOP_LSX 0, 8, 0
|
|
EPEL_UNI_W_H16_LOOP_LSX 16, 24, 16
|
|
EPEL_UNI_W_H16_LOOP_LSX 32, 40, 32
|
|
EPEL_UNI_W_H16_LOOP_LSX 48, 56, 48
|
|
add.d a2, a2, a3
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H64
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter with uni-directional weighting, 8-bit, width 64 (LASX).
// Each row is produced by four expansions of the 16-pixel loop macro.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = height;
// mx is read from the stack at sp + 0. LOAD_VAR and shufb are defined
// outside this view.
// NOTE(review): t0/t1 (stride * 2, stride * 3) are not referenced in the
// visible body; unused unless a nested macro reads them.
function ff_hevc_put_hevc_epel_uni_w_h64_8_lasx
|
|
LOAD_VAR 256
|
|
ld.d t0, sp, 0 //mx
|
|
addi.d t0, t0, -1 // t0 = mx - 1
|
|
slli.w t0, t0, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t1, ff_hevc_epel_filters
|
|
vldx vr0, t1, t0 //filter
|
|
xvreplve0.w xr0, xr0 // splat the 4-byte entry to every word
|
|
la.local t1, shufb
|
|
xvld xr6, t1, 64 // 32-byte gather mask from shufb + 64
|
|
slli.d t0, a3, 1 //stride * 2
|
|
add.d t1, t0, a3 //stride * 3
|
|
addi.d a2, a2, -1 //src -= 1
|
|
.LOOP_UNI_W_H64_LASX:
|
|
// Arguments: source offset, destination offset, block width.
EPEL_UNI_W_H16_LOOP_LASX 0, 0, 64
|
|
EPEL_UNI_W_H16_LOOP_LASX 16, 16, 64
|
|
EPEL_UNI_W_H16_LOOP_LASX 32, 32, 64
|
|
EPEL_UNI_W_H16_LOOP_LASX 48, 48, 64
|
|
add.d a2, a2, a3
|
|
add.d a0, a0, a1
|
|
addi.d a4, a4, -1
|
|
bnez a4, .LOOP_UNI_W_H64_LASX
|
|
endfunc
|
|
|
|
/*
|
|
* void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride,
|
|
* const uint8_t *_src, ptrdiff_t _srcstride,
|
|
* const int16_t *src2, int height, intptr_t mx,
|
|
* intptr_t my, int width)
|
|
*/
|
|
// Horizontal EPEL filter for bi-prediction, 8-bit, width 4 (LSX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = src2 (int16),
// a5 = height, a6 = mx. Per row: filter, add src2, round-shift by 7 and
// saturate to unsigned 8 bit. The shufb table is defined outside this view.
function ff_hevc_put_hevc_bi_epel_h4_8_lsx
|
|
addi.d a6, a6, -1 // a6 = mx - 1
|
|
slli.w a6, a6, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t0, ff_hevc_epel_filters
|
|
vldx vr0, t0, a6 // filter
|
|
vreplvei.w vr0, vr0, 0 // splat the 4-byte entry to every word
|
|
la.local t0, shufb
|
|
vld vr1, t0, 0 // mask
|
|
addi.d a2, a2, -1 // src -= 1
|
|
.LOOP_BI_EPEL_H4:
|
|
vld vr4, a4, 0 // src2
|
|
vld vr5, a2, 0
|
|
add.d a2, a2, a3
|
|
addi.d a4, a4, 128 // next src2 row (128 bytes; presumably MAX_PB_SIZE int16 entries -- confirm)
|
|
vshuf.b vr5, vr5, vr5, vr1
|
|
vdp2.h.bu.b vr6, vr5, vr0 // EPEL_FILTER(src, 1)
|
|
vsllwil.w.h vr4, vr4, 0 // sign-extend the low four src2 values to 32 bit
|
|
vhaddw.w.h vr6, vr6, vr6 // add adjacent 16-bit sums into 32-bit lanes
|
|
vadd.w vr6, vr6, vr4 // src2[x]
|
|
vssrani.h.w vr6, vr6, 0 // saturating narrow 32 -> 16 bit
|
|
vssrarni.bu.h vr6, vr6, 7 // rounding >> 7, saturate to unsigned 8 bit
|
|
fst.s f6, a0, 0 // store 4 pixels
|
|
add.d a0, a0, a1
|
|
addi.d a5, a5, -1
|
|
bnez a5, .LOOP_BI_EPEL_H4
|
|
endfunc
|
|
|
|
// Filters 8 pixels horizontally and adds the src2 row held in vr4 (LSX).
//   in0 / in1 = source vectors forming the 32-byte gather pool
//               (in0 supplies indices 0..15, in1 indices 16..31),
//   in2 / in3 = gather masks for the first and second tap pair,
//   out0      = eight 16-bit results (filter sum + src2, saturating add).
// Reads vr0/vr1 (tap pairs) and vr4 (src2); clobbers vr6, vr7, vr8.
.macro PUT_HEVC_BI_EPEL_H8_LSX in0, in1, in2, in3, out0
|
|
vshuf.b vr6, \in1, \in0, \in2
|
|
vshuf.b vr7, \in1, \in0, \in3
|
|
vdp2.h.bu.b vr8, vr6, vr0 // EPEL_FILTER(src, 1)
|
|
vdp2add.h.bu.b vr8, vr7, vr1 // EPEL_FILTER(src, 1)
|
|
vsadd.h \out0, vr8, vr4 // src2[x]
|
|
.endm
|
|
|
|
// Filters 16 pixels horizontally and adds the src2 row held in xr4 (LASX).
//   in0 / in1 = source vectors forming the per-lane gather pool,
//   in2 / in3 = gather masks for the first and second tap pair,
//   out0      = sixteen 16-bit results (filter sum + src2, saturating add).
// Reads xr0/xr1 (tap pairs) and xr4 (src2); clobbers xr6, xr7, xr8.
.macro PUT_HEVC_BI_EPEL_H16_LASX in0, in1, in2, in3, out0
|
|
xvshuf.b xr6, \in1, \in0, \in2
|
|
xvshuf.b xr7, \in1, \in0, \in3
|
|
xvdp2.h.bu.b xr8, xr6, xr0 // EPEL_FILTER(src, 1)
|
|
xvdp2add.h.bu.b xr8, xr7, xr1 // EPEL_FILTER(src, 1)
|
|
xvsadd.h \out0, xr8, xr4 // src2[x]
|
|
.endm
|
|
|
|
// Horizontal EPEL filter for bi-prediction, 8-bit, width 6 (LSX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = src2 (int16),
// a5 = height, a6 = mx. The shufb table is defined outside this view.
function ff_hevc_put_hevc_bi_epel_h6_8_lsx
|
|
addi.d a6, a6, -1 // a6 = mx - 1
|
|
slli.w a6, a6, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t0, ff_hevc_epel_filters
|
|
vldx vr0, t0, a6 // filter
|
|
vreplvei.h vr1, vr0, 1 // splat halfword 1 (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 (taps 0,1)
|
|
la.local t0, shufb
|
|
vld vr2, t0, 48// mask
|
|
vaddi.bu vr3, vr2, 2 // mask shifted by two bytes for the second tap pair
|
|
addi.d a2, a2, -1 // src -= 1
|
|
.LOOP_BI_EPEL_H6:
|
|
vld vr4, a4, 0 // src2
|
|
vld vr5, a2, 0
|
|
add.d a2, a2, a3
|
|
addi.d a4, a4, 128 // next src2 row (128 bytes; presumably MAX_PB_SIZE int16 entries -- confirm)
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr7
|
|
vssrarni.bu.h vr7, vr7, 7 // rounding >> 7, saturate to unsigned 8 bit
|
|
fst.s f7, a0, 0 // pixels 0..3
|
|
vstelm.h vr7, a0, 4, 2 // pixels 4..5 (halfword element 2)
|
|
add.d a0, a0, a1
|
|
addi.d a5, a5, -1
|
|
bnez a5, .LOOP_BI_EPEL_H6
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter for bi-prediction, 8-bit, width 8 (LSX).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = src2 (int16),
// a5 = height, a6 = mx. The shufb table is defined outside this view.
function ff_hevc_put_hevc_bi_epel_h8_8_lsx
|
|
addi.d a6, a6, -1 // a6 = mx - 1
|
|
slli.w a6, a6, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t0, ff_hevc_epel_filters
|
|
vldx vr0, t0, a6 // filter
|
|
vreplvei.h vr1, vr0, 1 // splat halfword 1 (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 (taps 0,1)
|
|
la.local t0, shufb
|
|
vld vr2, t0, 48// mask
|
|
vaddi.bu vr3, vr2, 2 // mask shifted by two bytes for the second tap pair
|
|
addi.d a2, a2, -1 // src -= 1
|
|
.LOOP_BI_EPEL_H8:
|
|
vld vr4, a4, 0 // src2
|
|
vld vr5, a2, 0
|
|
add.d a2, a2, a3
|
|
addi.d a4, a4, 128 // next src2 row (128 bytes; presumably MAX_PB_SIZE int16 entries -- confirm)
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr7
|
|
vssrarni.bu.h vr7, vr7, 7 // rounding >> 7, saturate to unsigned 8 bit
|
|
fst.d f7, a0, 0 // store 8 pixels
|
|
add.d a0, a0, a1
|
|
addi.d a5, a5, -1
|
|
bnez a5, .LOOP_BI_EPEL_H8
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter for bi-prediction, 8-bit, width 12 (LSX).
// Computes 16 results per row as two 8-pixel halves and stores 12 bytes.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = src2 (int16),
// a5 = height, a6 = mx. The shufb table is defined outside this view.
function ff_hevc_put_hevc_bi_epel_h12_8_lsx
|
|
addi.d a6, a6, -1 // a6 = mx - 1
|
|
slli.w a6, a6, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t0, ff_hevc_epel_filters
|
|
vldx vr0, t0, a6 // filter
|
|
vreplvei.h vr1, vr0, 1 // splat halfword 1 (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 (taps 0,1)
|
|
la.local t0, shufb
|
|
vld vr2, t0, 48// mask
|
|
vaddi.bu vr3, vr2, 2 // mask shifted by two bytes for the second tap pair
|
|
addi.d a2, a2, -1 // src -= 1
|
|
.LOOP_BI_EPEL_H12:
|
|
vld vr4, a4, 0 // src2
|
|
vld vr5, a2, 0
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr11
|
|
vld vr5, a2, 8 // source for pixels 8..15
|
|
vld vr4, a4, 16 // src2[8..15]
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr12
|
|
vssrarni.bu.h vr12, vr11, 7 // rounding >> 7, saturate to unsigned 8 bit
|
|
fst.d f12, a0, 0 // pixels 0..7
|
|
vstelm.w vr12, a0, 8, 2 // pixels 8..11 (word element 2)
|
|
add.d a2, a2, a3
|
|
addi.d a4, a4, 128 // next src2 row (128 bytes; presumably MAX_PB_SIZE int16 entries -- confirm)
|
|
add.d a0, a0, a1
|
|
addi.d a5, a5, -1
|
|
bnez a5, .LOOP_BI_EPEL_H12
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter for bi-prediction, 8-bit, width 12 (LASX).
// Computes 16 results per row in one 256-bit pass and stores 12 bytes.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = src2 (int16),
// a5 = height, a6 = mx. The shufb table is defined outside this view.
function ff_hevc_put_hevc_bi_epel_h12_8_lasx
|
|
addi.d a6, a6, -1 // a6 = mx - 1
|
|
slli.w a6, a6, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t0, ff_hevc_epel_filters
|
|
vldx vr0, t0, a6 // filter
|
|
xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
|
|
xvrepl128vei.h xr1, xr0, 1 // splat halfword 1 per lane (taps 2,3)
|
|
xvrepl128vei.h xr0, xr0, 0 // splat halfword 0 per lane (taps 0,1)
|
|
la.local t0, shufb
|
|
xvld xr2, t0, 96// mask
|
|
xvaddi.bu xr3, xr2, 2 // mask shifted by two bytes for the second tap pair
|
|
addi.d a2, a2, -1 // src -= 1
|
|
.LOOP_BI_EPEL_H12_LASX:
|
|
xvld xr4, a4, 0 // src2
|
|
xvld xr5, a2, 0
|
|
xvpermi.d xr5, xr5, 0x94 // low lane = bytes 0..15, high lane = bytes 8..23
|
|
PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr9
|
|
xvpermi.q xr10, xr9, 0x01 // low lane of xr10 = high lane of xr9
|
|
vssrarni.bu.h vr10, vr9, 7 // rounding >> 7, saturate to unsigned 8 bit
|
|
fst.d f10, a0, 0 // pixels 0..7
|
|
vstelm.w vr10, a0, 8, 2 // pixels 8..11 (word element 2)
|
|
add.d a2, a2, a3
|
|
addi.d a4, a4, 128 // next src2 row (128 bytes; presumably MAX_PB_SIZE int16 entries -- confirm)
|
|
add.d a0, a0, a1
|
|
addi.d a5, a5, -1
|
|
bnez a5, .LOOP_BI_EPEL_H12_LASX
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter for bi-prediction, 8-bit, width 16 (LSX).
// Each row is computed as two 8-pixel halves.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = src2 (int16),
// a5 = height, a6 = mx. The shufb table is defined outside this view.
function ff_hevc_put_hevc_bi_epel_h16_8_lsx
|
|
addi.d a6, a6, -1 // a6 = mx - 1
|
|
slli.w a6, a6, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t0, ff_hevc_epel_filters
|
|
vldx vr0, t0, a6 // filter
|
|
vreplvei.h vr1, vr0, 1 // splat halfword 1 (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 (taps 0,1)
|
|
la.local t0, shufb
|
|
vld vr2, t0, 48// mask
|
|
vaddi.bu vr3, vr2, 2 // mask shifted by two bytes for the second tap pair
|
|
addi.d a2, a2, -1 // src -= 1
|
|
.LOOP_BI_EPEL_H16:
|
|
vld vr4, a4, 0 // src2
|
|
vld vr5, a2, 0
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr11
|
|
vld vr5, a2, 8 // source for pixels 8..15
|
|
vld vr4, a4, 16 // src2[8..15]
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr12
|
|
vssrarni.bu.h vr12, vr11, 7 // rounding >> 7, saturate to unsigned 8 bit
|
|
vst vr12, a0, 0 // store 16 pixels
|
|
add.d a2, a2, a3
|
|
addi.d a4, a4, 128 // next src2 row (128 bytes; presumably MAX_PB_SIZE int16 entries -- confirm)
|
|
add.d a0, a0, a1
|
|
addi.d a5, a5, -1
|
|
bnez a5, .LOOP_BI_EPEL_H16
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter for bi-prediction, 8-bit, width 16 (LASX).
// Each row is computed in one 256-bit pass.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = src2 (int16),
// a5 = height, a6 = mx. The shufb table is defined outside this view.
function ff_hevc_put_hevc_bi_epel_h16_8_lasx
|
|
addi.d a6, a6, -1 // a6 = mx - 1
|
|
slli.w a6, a6, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t0, ff_hevc_epel_filters
|
|
vldx vr0, t0, a6 // filter
|
|
xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
|
|
xvrepl128vei.h xr1, xr0, 1 // splat halfword 1 per lane (taps 2,3)
|
|
xvrepl128vei.h xr0, xr0, 0 // splat halfword 0 per lane (taps 0,1)
|
|
la.local t0, shufb
|
|
xvld xr2, t0, 96// mask
|
|
xvaddi.bu xr3, xr2, 2 // mask shifted by two bytes for the second tap pair
|
|
addi.d a2, a2, -1 // src -= 1
|
|
.LOOP_BI_EPEL_H16_LASX:
|
|
xvld xr4, a4, 0 // src2
|
|
xvld xr5, a2, 0
|
|
xvpermi.d xr5, xr5, 0x94 // low lane = bytes 0..15, high lane = bytes 8..23
|
|
PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr9
|
|
xvpermi.q xr10, xr9, 0x01 // low lane of xr10 = high lane of xr9
|
|
vssrarni.bu.h vr10, vr9, 7 // rounding >> 7, saturate to unsigned 8 bit
|
|
vst vr10, a0, 0 // store 16 pixels
|
|
add.d a2, a2, a3
|
|
addi.d a4, a4, 128 // next src2 row (128 bytes; presumably MAX_PB_SIZE int16 entries -- confirm)
|
|
add.d a0, a0, a1
|
|
addi.d a5, a5, -1
|
|
bnez a5, .LOOP_BI_EPEL_H16_LASX
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter for bi-prediction, 8-bit, width 32 (LASX).
// Each row is computed as two 16-pixel passes that are packed and stored
// with a single 32-byte store.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = src2 (int16),
// a5 = height, a6 = mx. The shufb table is defined outside this view.
// Change from the previous revision: a redundant "xvpermi.q xr15, xr5, 0x01"
// was dropped from the loop; its result was overwritten by the xvld into
// xr15 before any instruction read it.
function ff_hevc_put_hevc_bi_epel_h32_8_lasx
|
|
addi.d a6, a6, -1 // a6 = mx - 1
|
|
slli.w a6, a6, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t0, ff_hevc_epel_filters
|
|
vldx vr0, t0, a6 // filter
|
|
xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
|
|
xvrepl128vei.h xr1, xr0, 1 // splat halfword 1 per lane (taps 2,3)
|
|
xvrepl128vei.h xr0, xr0, 0 // splat halfword 0 per lane (taps 0,1)
|
|
la.local t0, shufb
|
|
xvld xr2, t0, 96// mask
|
|
xvaddi.bu xr3, xr2, 2 // mask shifted by two bytes for the second tap pair
|
|
addi.d a2, a2, -1 // src -= 1
|
|
.LOOP_BI_EPEL_H32_LASX:
|
|
xvld xr4, a4, 0 // src2
|
|
xvld xr5, a2, 0
|
|
xvpermi.d xr5, xr5, 0x94 // low lane = bytes 0..15, high lane = bytes 8..23
|
|
PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr9
|
|
xvld xr4, a4, 32 // src2[16..31]
|
|
xvld xr15, a2, 16 // source for pixels 16..31
|
|
xvpermi.d xr15, xr15, 0x94 // low lane = bytes 0..15, high lane = bytes 8..23 of this load
|
|
PUT_HEVC_BI_EPEL_H16_LASX xr15, xr15, xr2, xr3, xr11
|
|
xvssrarni.bu.h xr11, xr9, 7 // rounding >> 7, saturate to unsigned 8 bit
|
|
xvpermi.d xr11, xr11, 0xd8 // swap the two middle doublewords to restore pixel order
|
|
xvst xr11, a0, 0 // store 32 pixels
|
|
add.d a2, a2, a3
|
|
addi.d a4, a4, 128 // next src2 row (128 bytes; presumably MAX_PB_SIZE int16 entries -- confirm)
|
|
add.d a0, a0, a1
|
|
addi.d a5, a5, -1
|
|
bnez a5, .LOOP_BI_EPEL_H32_LASX
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter for bi-prediction, 8-bit, width 48 (LSX).
// Four 16-byte source vectors are loaded per row; each 8-pixel group is
// gathered either from a single vector (masks vr2/vr3) or across two
// neighbouring vectors (masks vr21/vr22, the same masks advanced by 8).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = src2 (int16),
// a5 = height, a6 = mx. The shufb table is defined outside this view.
function ff_hevc_put_hevc_bi_epel_h48_8_lsx
|
|
addi.d a6, a6, -1 // a6 = mx - 1
|
|
slli.w a6, a6, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t0, ff_hevc_epel_filters
|
|
vldx vr0, t0, a6// filter
|
|
vreplvei.h vr1, vr0, 1 // splat halfword 1 (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 (taps 0,1)
|
|
la.local t0, shufb
|
|
vld vr2, t0, 48// mask
|
|
vaddi.bu vr3, vr2, 2 // first-pair mask + 2 (second tap pair)
|
|
vaddi.bu vr21, vr2, 8 // first-pair mask + 8 (pixels 8..15 of a vector pair)
|
|
vaddi.bu vr22, vr2, 10 // second-pair mask + 8
|
|
addi.d a2, a2, -1 // src -= 1
|
|
.LOOP_BI_EPEL_H48:
|
|
vld vr4, a4, 0 // src2
|
|
vld vr5, a2, 0
|
|
vld vr9, a2, 16
|
|
vld vr10, a2, 32
|
|
vld vr11, a2, 48
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr12
|
|
vld vr4, a4, 16 // src2[8..15]
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr5, vr9, vr21, vr22, vr13
|
|
vld vr4, a4, 32 // src2[16..23]
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr9, vr9, vr2, vr3, vr14
|
|
vld vr4, a4, 48 // src2[24..31]
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr9, vr10, vr21, vr22, vr15
|
|
vld vr4, a4, 64 // src2[32..39]
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr10, vr10, vr2, vr3, vr16
|
|
vld vr4, a4, 80 // src2[40..47]
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr10, vr11, vr21, vr22, vr17
|
|
// Rounding >> 7 and saturating pack of each pair of 8-pixel groups.
vssrarni.bu.h vr13, vr12, 7
|
|
vssrarni.bu.h vr15, vr14, 7
|
|
vssrarni.bu.h vr17, vr16, 7
|
|
vst vr13, a0, 0
|
|
vst vr15, a0, 16
|
|
vst vr17, a0, 32
|
|
add.d a2, a2, a3
|
|
addi.d a4, a4, 128 // next src2 row (128 bytes; presumably MAX_PB_SIZE int16 entries -- confirm)
|
|
add.d a0, a0, a1
|
|
addi.d a5, a5, -1
|
|
bnez a5, .LOOP_BI_EPEL_H48
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter for bi-prediction, 8-bit, width 48 (LASX).
// Each row is computed as three 16-pixel passes; the first two are packed
// into one 32-byte store and the third into a 16-byte store.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = src2 (int16),
// a5 = height, a6 = mx. The shufb table is defined outside this view.
function ff_hevc_put_hevc_bi_epel_h48_8_lasx
|
|
addi.d a6, a6, -1 // a6 = mx - 1
|
|
slli.w a6, a6, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t0, ff_hevc_epel_filters
|
|
vldx vr0, t0, a6 // filter
|
|
xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
|
|
xvrepl128vei.h xr1, xr0, 1 // splat halfword 1 per lane (taps 2,3)
|
|
xvrepl128vei.h xr0, xr0, 0 // splat halfword 0 per lane (taps 0,1)
|
|
la.local t0, shufb
|
|
xvld xr2, t0, 96// mask
|
|
xvaddi.bu xr3, xr2, 2 // mask shifted by two bytes for the second tap pair
|
|
addi.d a2, a2, -1 // src -= 1
|
|
.LOOP_BI_EPEL_H48_LASX:
|
|
xvld xr4, a4, 0 // src2
|
|
xvld xr5, a2, 0 // source bytes 0..31
|
|
xvld xr9, a2, 32 // source bytes 32..63
|
|
xvpermi.d xr10, xr9, 0x94 // lanes = bytes 32..47 / 40..55 (pixels 32..47)
|
|
xvpermi.q xr9, xr5, 0x21 // xr9 = source bytes 16..47
|
|
xvpermi.d xr9, xr9, 0x94 // lanes = bytes 16..31 / 24..39 (pixels 16..31)
|
|
xvpermi.d xr5, xr5, 0x94 // lanes = bytes 0..15 / 8..23 (pixels 0..15)
|
|
PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr11
|
|
xvld xr4, a4, 32 // src2[16..31]
|
|
PUT_HEVC_BI_EPEL_H16_LASX xr9, xr9, xr2, xr3, xr12
|
|
xvld xr4, a4, 64 // src2[32..47]
|
|
PUT_HEVC_BI_EPEL_H16_LASX xr10, xr10, xr2, xr3, xr13
|
|
xvssrarni.bu.h xr12, xr11, 7 // rounding >> 7, saturate to unsigned 8 bit
|
|
xvpermi.d xr12, xr12, 0xd8 // swap the two middle doublewords to restore pixel order
|
|
xvpermi.q xr14, xr13, 0x01 // low lane of xr14 = high lane of xr13
|
|
vssrarni.bu.h vr14, vr13, 7 // rounding >> 7, saturate to unsigned 8 bit
|
|
xvst xr12, a0, 0 // pixels 0..31
|
|
vst vr14, a0, 32 // pixels 32..47
|
|
add.d a2, a2, a3
|
|
addi.d a4, a4, 128 // next src2 row (128 bytes; presumably MAX_PB_SIZE int16 entries -- confirm)
|
|
add.d a0, a0, a1
|
|
addi.d a5, a5, -1
|
|
bnez a5, .LOOP_BI_EPEL_H48_LASX
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter for bi-prediction, 8-bit, width 64 (LSX).
// Five 16-byte source vectors are loaded per row; each 8-pixel group is
// gathered either from a single vector (masks vr2/vr3) or across two
// neighbouring vectors (masks vr21/vr22, the same masks advanced by 8).
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = src2 (int16),
// a5 = height, a6 = mx. The shufb table is defined outside this view.
function ff_hevc_put_hevc_bi_epel_h64_8_lsx
|
|
addi.d a6, a6, -1 // a6 = mx - 1
|
|
slli.w a6, a6, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t0, ff_hevc_epel_filters
|
|
vldx vr0, t0, a6// filter
|
|
vreplvei.h vr1, vr0, 1 // splat halfword 1 (taps 2,3)
|
|
vreplvei.h vr0, vr0, 0 // splat halfword 0 (taps 0,1)
|
|
la.local t0, shufb
|
|
vld vr2, t0, 48// mask
|
|
vaddi.bu vr3, vr2, 2 // first-pair mask + 2 (second tap pair)
|
|
vaddi.bu vr21, vr2, 8 // first-pair mask + 8 (pixels 8..15 of a vector pair)
|
|
vaddi.bu vr22, vr2, 10 // second-pair mask + 8
|
|
addi.d a2, a2, -1 // src -= 1
|
|
.LOOP_BI_EPEL_H64:
|
|
vld vr4, a4, 0 // src2
|
|
vld vr5, a2, 0
|
|
vld vr9, a2, 16
|
|
vld vr10, a2, 32
|
|
vld vr11, a2, 48
|
|
vld vr12, a2, 64
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr5, vr5, vr2, vr3, vr13
|
|
vld vr4, a4, 16 // src2[8..15]
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr5, vr9, vr21, vr22, vr14
|
|
vld vr4, a4, 32 // src2[16..23]
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr9, vr9, vr2, vr3, vr15
|
|
vld vr4, a4, 48 // src2[24..31]
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr9, vr10, vr21, vr22, vr16
|
|
vld vr4, a4, 64 // src2[32..39]
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr10, vr10, vr2, vr3, vr17
|
|
vld vr4, a4, 80 // src2[40..47]
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr10, vr11, vr21, vr22, vr18
|
|
vld vr4, a4, 96 // src2[48..55]
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr11, vr11, vr2, vr3, vr19
|
|
vld vr4, a4, 112 // src2[56..63]
|
|
PUT_HEVC_BI_EPEL_H8_LSX vr11, vr12, vr21, vr22, vr20
|
|
// Rounding >> 7 and saturating pack of each pair of 8-pixel groups.
vssrarni.bu.h vr14, vr13, 7
|
|
vssrarni.bu.h vr16, vr15, 7
|
|
vssrarni.bu.h vr18, vr17, 7
|
|
vssrarni.bu.h vr20, vr19, 7
|
|
vst vr14, a0, 0
|
|
vst vr16, a0, 16
|
|
vst vr18, a0, 32
|
|
vst vr20, a0, 48
|
|
add.d a2, a2, a3
|
|
addi.d a4, a4, 128 // next src2 row (128 bytes; presumably MAX_PB_SIZE int16 entries -- confirm)
|
|
add.d a0, a0, a1
|
|
addi.d a5, a5, -1
|
|
bnez a5, .LOOP_BI_EPEL_H64
|
|
endfunc
|
|
|
|
// Horizontal EPEL filter for bi-prediction, 8-bit, width 64 (LASX).
// Each row is computed as four 16-pixel passes, packed into two 32-byte
// stores.
// a0 = dst, a1 = dst stride, a2 = src, a3 = src stride, a4 = src2 (int16),
// a5 = height, a6 = mx. The shufb table is defined outside this view.
function ff_hevc_put_hevc_bi_epel_h64_8_lasx
|
|
addi.d a6, a6, -1 // a6 = mx - 1
|
|
slli.w a6, a6, 2 // table byte offset = (mx - 1) * 4
|
|
la.local t0, ff_hevc_epel_filters
|
|
vldx vr0, t0, a6 // filter
|
|
xvreplve0.q xr0, xr0 // copy the low 128 bits into both lanes
|
|
xvrepl128vei.h xr1, xr0, 1 // splat halfword 1 per lane (taps 2,3)
|
|
xvrepl128vei.h xr0, xr0, 0 // splat halfword 0 per lane (taps 0,1)
|
|
la.local t0, shufb
|
|
xvld xr2, t0, 96// mask
|
|
xvaddi.bu xr3, xr2, 2 // mask shifted by two bytes for the second tap pair
|
|
addi.d a2, a2, -1 // src -= 1
|
|
.LOOP_BI_EPEL_H64_LASX:
|
|
xvld xr4, a4, 0 // src2
|
|
xvld xr5, a2, 0 // source bytes 0..31
|
|
xvld xr9, a2, 32 // source bytes 32..63
|
|
xvld xr11, a2, 48 // source bytes 48..79
|
|
xvpermi.d xr11, xr11, 0x94 // lanes = bytes 48..63 / 56..71 (pixels 48..63)
|
|
xvpermi.d xr10, xr9, 0x94 // lanes = bytes 32..47 / 40..55 (pixels 32..47)
|
|
xvpermi.q xr9, xr5, 0x21 // xr9 = source bytes 16..47
|
|
xvpermi.d xr9, xr9, 0x94 // lanes = bytes 16..31 / 24..39 (pixels 16..31)
|
|
xvpermi.d xr5, xr5, 0x94 // lanes = bytes 0..15 / 8..23 (pixels 0..15)
|
|
PUT_HEVC_BI_EPEL_H16_LASX xr5, xr5, xr2, xr3, xr12
|
|
xvld xr4, a4, 32 // src2[16..31]
|
|
PUT_HEVC_BI_EPEL_H16_LASX xr9, xr9, xr2, xr3, xr13
|
|
xvld xr4, a4, 64 // src2[32..47]
|
|
PUT_HEVC_BI_EPEL_H16_LASX xr10, xr10, xr2, xr3, xr14
|
|
xvld xr4, a4, 96 // src2[48..63]
|
|
PUT_HEVC_BI_EPEL_H16_LASX xr11, xr11, xr2, xr3, xr15
|
|
xvssrarni.bu.h xr13, xr12, 7 // rounding >> 7, saturate to unsigned 8 bit
|
|
xvssrarni.bu.h xr15, xr14, 7
|
|
xvpermi.d xr13, xr13, 0xd8 // swap the two middle doublewords to restore pixel order
|
|
xvpermi.d xr15, xr15, 0xd8
|
|
xvst xr13, a0, 0 // pixels 0..31
|
|
xvst xr15, a0, 32 // pixels 32..63
|
|
add.d a2, a2, a3
|
|
addi.d a4, a4, 128 // next src2 row (128 bytes; presumably MAX_PB_SIZE int16 entries -- confirm)
|
|
add.d a0, a0, a1
|
|
addi.d a5, a5, -1
|
|
bnez a5, .LOOP_BI_EPEL_H64_LASX
|
|
endfunc
|