mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-28 20:53:54 +02:00
lavc/aarch64: new optimization for 8-bit hevc_qpel_uni_w_h
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
0b7356c1b4
commit
15972cce8c
@ -145,6 +145,7 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
|
||||
void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
|
||||
void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
|
||||
|
||||
|
||||
NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
|
||||
const uint8_t *_src, ptrdiff_t _srcstride,
|
||||
int height, int denom, int wx, int ox,
|
||||
@ -155,6 +156,12 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
|
||||
int height, int denom, int wx, int ox,
|
||||
intptr_t mx, intptr_t my, int width),);
|
||||
|
||||
/* Prototypes for the 8-bit qpel_uni_w_h functions carrying the "_i8mm" name
 * suffix. Judging by the macro body lines visible elsewhere in this file, the
 * macro pastes fn, a block width and the suffix into names of the form
 * ff_hevc_put_hevc_qpel_uni_w_h<width>_8_neon_i8mm (NOTE(review): full macro
 * definition is not visible here - confirm the list of widths). */
NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
|
||||
const uint8_t *_src, ptrdiff_t _srcstride,
|
||||
int height, int denom, int wx, int ox,
|
||||
intptr_t mx, intptr_t my, int width), _i8mm);
|
||||
|
||||
|
||||
#define NEON8_FNASSIGN(member, v, h, fn, ext) \
|
||||
member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \
|
||||
member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext; \
|
||||
@ -174,9 +181,11 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
|
||||
member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
|
||||
member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
|
||||
|
||||
|
||||
av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
|
||||
{
|
||||
if (!have_neon(av_get_cpu_flags())) return;
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
if (!have_neon(cpu_flags)) return;
|
||||
|
||||
if (bit_depth == 8) {
|
||||
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_neon;
|
||||
@ -236,6 +245,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
|
||||
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
|
||||
NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
|
||||
|
||||
if (have_i8mm(cpu_flags)) {
|
||||
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
|
||||
}
|
||||
|
||||
}
|
||||
if (bit_depth == 10) {
|
||||
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_neon;
|
||||
|
@ -1192,3 +1192,437 @@ function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
|
||||
b.hi 3b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
#if HAVE_I8MM
|
||||
// QPEL_UNI_W_H_HEADER - common prologue of the qpel_uni_w_h*_8_neon_i8mm
// functions below.
// Reads:    x2 (src), w5 (denom), w6 (wx), w7 (ox) and the 64-bit value at
//           [sp] (mx in the C prototype: the 9th argument, i.e. the first one
//           that no longer fits in x0-x7).
// Produces: x2  = src - 3
//           v28 = the 8 bytes at qpel_filters + mx * 8, repeated in both
//                 64-bit halves
//           v29 = ox, v30 = wx, v31 = -(denom + 6), each in all 32-bit lanes
// Clobbers: x9, w10, x11, x12
.macro QPEL_UNI_W_H_HEADER
|
||||
ldr x12, [sp] // x12 = first stack argument (mx)
|
||||
sub x2, x2, #3 // start reading 3 bytes before src
|
||||
movrel x9, qpel_filters
|
||||
add x9, x9, x12, lsl #3 // x9 = qpel_filters + mx * 8
|
||||
ldr x11, [x9] // x11 = 8 filter bytes
|
||||
dup v28.2d, x11 // same 8 taps in both halves of v28
|
||||
mov w10, #-6
|
||||
sub w10, w10, w5 // w10 = -(6 + denom)
|
||||
dup v30.4s, w6 // wx
|
||||
dup v31.4s, w10 // shift (negative: sqrshl then shifts right with rounding)
|
||||
dup v29.4s, w7 // ox
|
||||
.endm
|
||||
|
||||
// Horizontal 8-tap filter with weighting, 4 output bytes per row.
// In:  x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height,
//      w5 = denom, w6 = wx, w7 = ox, [sp] = mx (see QPEL_UNI_W_H_HEADER).
// Per row: out[i] = sat_u8(rshift(sum_k src[i - 3 + k] * tap[k] * wx,
//                                 denom + 6) + ox), i = 0..3.
// Each row reads 16 source bytes starting at src - 3.
// Uses v0-v3, v16, v17 as temporaries in addition to the header's registers.
function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_i8mm, export=1
|
||||
QPEL_UNI_W_H_HEADER
|
||||
1:
|
||||
ld1 {v0.16b}, [x2], x3 // load 16 source bytes, src += srcstride
|
||||
ext v1.16b, v0.16b, v0.16b, #1 // v1..v3 = row rotated by 1..3 bytes
|
||||
ext v2.16b, v0.16b, v0.16b, #2
|
||||
ext v3.16b, v0.16b, v0.16b, #3
|
||||
zip1 v0.2d, v0.2d, v1.2d // bytes 0-7 | bytes 1-8  (inputs of outputs 0, 1)
|
||||
zip1 v2.2d, v2.2d, v3.2d // bytes 2-9 | bytes 3-10 (inputs of outputs 2, 3)
|
||||
movi v16.2d, #0 // usdot accumulates, so clear the accumulators
|
||||
movi v17.2d, #0
|
||||
usdot v16.4s, v0.16b, v28.16b // per 32-bit lane: 4 unsigned src bytes . 4 signed taps
|
||||
usdot v17.4s, v2.16b, v28.16b
|
||||
addp v16.4s, v16.4s, v17.4s // add both 4-tap halves -> outputs 0, 1, 2, 3
|
||||
mul v16.4s, v16.4s, v30.4s // * wx
|
||||
sqrshl v16.4s, v16.4s, v31.4s // rounding shift right by denom + 6
|
||||
sqadd v16.4s, v16.4s, v29.4s // + ox (saturating)
|
||||
sqxtn v16.4h, v16.4s // saturate to signed 16 bit
|
||||
sqxtun v16.8b, v16.8h // saturate to unsigned 8 bit
|
||||
str s16, [x0] // store 4 output bytes
|
||||
add x0, x0, x1 // dst += dststride
|
||||
subs w4, w4, #1 // one row done
|
||||
b.hi 1b // loop while rows remain
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// Horizontal 8-tap filter with weighting, 6 output bytes per row.
// Same arguments and arithmetic as the h4 variant; outputs 0-3 are computed
// in v16 and outputs 4-5 in the low two lanes of v18.
// x1 is reduced by 4 up front because the first store of each row already
// advances x0 by 4; the second store adds the remainder of dststride.
// Each row reads 16 source bytes starting at src - 3.
function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_i8mm, export=1
|
||||
QPEL_UNI_W_H_HEADER
|
||||
sub x1, x1, #4 // compensate for the "#4" post-increment below
|
||||
1:
|
||||
ld1 {v0.16b}, [x2], x3 // load 16 source bytes, src += srcstride
|
||||
ext v1.16b, v0.16b, v0.16b, #1 // v1..v5 = row rotated by 1..5 bytes
|
||||
ext v2.16b, v0.16b, v0.16b, #2
|
||||
ext v3.16b, v0.16b, v0.16b, #3
|
||||
ext v4.16b, v0.16b, v0.16b, #4
|
||||
ext v5.16b, v0.16b, v0.16b, #5
|
||||
zip1 v0.2d, v0.2d, v1.2d // inputs of outputs 0, 1
|
||||
zip1 v2.2d, v2.2d, v3.2d // inputs of outputs 2, 3
|
||||
zip1 v4.2d, v4.2d, v5.2d // inputs of outputs 4, 5
|
||||
movi v16.2d, #0 // clear the usdot accumulators
|
||||
movi v17.2d, #0
|
||||
movi v18.2d, #0
|
||||
usdot v16.4s, v0.16b, v28.16b
|
||||
usdot v17.4s, v2.16b, v28.16b
|
||||
usdot v18.4s, v4.16b, v28.16b
|
||||
addp v16.4s, v16.4s, v17.4s // outputs 0, 1, 2, 3
|
||||
addp v18.4s, v18.4s, v18.4s // outputs 4, 5 (duplicated in the upper lanes)
|
||||
mul v16.4s, v16.4s, v30.4s // * wx
|
||||
mul v18.2s, v18.2s, v30.2s
|
||||
sqrshl v16.4s, v16.4s, v31.4s // rounding shift right by denom + 6
|
||||
sqrshl v18.2s, v18.2s, v31.2s
|
||||
sqadd v16.4s, v16.4s, v29.4s // + ox (saturating)
|
||||
sqadd v18.2s, v18.2s, v29.2s
|
||||
sqxtn v16.4h, v16.4s // halfwords 0-3 = outputs 0-3
|
||||
sqxtn2 v16.8h, v18.4s // halfwords 4-5 = outputs 4-5
|
||||
sqxtun v16.8b, v16.8h // saturate to unsigned 8 bit
|
||||
str s16, [x0], #4 // store outputs 0-3, dst += 4
|
||||
st1 {v16.h}[2], [x0], x1 // store outputs 4-5, dst += dststride - 4
|
||||
subs w4, w4, #1 // one row done
|
||||
b.hi 1b // loop while rows remain
|
||||
ret
|
||||
endfunc
|
||||
|
||||
|
||||
// QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
// Filters and weights four 16-byte source vectors.
// In:  \s0..\s3 = source bytes; v28 = taps, v29 = ox, v30 = wx, v31 = shift
//      (as set up by QPEL_UNI_W_H_HEADER).
// Out: \d0 = weighted results from \s0 (lanes 0, 1) and \s1 (lanes 2, 3)
//      \d2 = weighted results from \s2 (lanes 0, 1) and \s3 (lanes 2, 3)
//      Within a pair, the first lane is built from source bytes 0-7 and the
//      second from source bytes 8-15 of the respective vector.
// \d1 and \d3 are scratch and hold no useful value afterwards.
// The \d registers are zeroed before any \s register is read, so they must
// not alias the \s registers.
.macro QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
|
||||
movi \d0\().2d, #0 // usdot accumulates, so clear all four accumulators
|
||||
movi \d1\().2d, #0
|
||||
movi \d2\().2d, #0
|
||||
movi \d3\().2d, #0
|
||||
usdot \d0\().4s, \s0\().16b, v28.16b // per lane: 4 unsigned src bytes . 4 signed taps
|
||||
usdot \d1\().4s, \s1\().16b, v28.16b
|
||||
usdot \d2\().4s, \s2\().16b, v28.16b
|
||||
usdot \d3\().4s, \s3\().16b, v28.16b
|
||||
addp \d0\().4s, \d0\().4s, \d1\().4s // add adjacent 4-tap halves into 8-tap sums
|
||||
addp \d2\().4s, \d2\().4s, \d3\().4s
|
||||
mul \d0\().4s, \d0\().4s, v30.4s // * wx
|
||||
mul \d2\().4s, \d2\().4s, v30.4s
|
||||
sqrshl \d0\().4s, \d0\().4s, v31.4s // rounding shift by v31 (negative = right shift)
|
||||
sqrshl \d2\().4s, \d2\().4s, v31.4s
|
||||
sqadd \d0\().4s, \d0\().4s, v29.4s // + ox (saturating)
|
||||
sqadd \d2\().4s, \d2\().4s, v29.4s
|
||||
.endm
|
||||
|
||||
// QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
// Two-source version of QPEL_UNI_W_H_CALC.
// In:  \s0, \s1 = source bytes; v28 = taps, v29 = ox, v30 = wx, v31 = shift.
// Out: \d0 = weighted results from \s0 (lanes 0, 1) and \s1 (lanes 2, 3).
// \d1 is scratch. \d0/\d1 are zeroed before the sources are read, so they
// must not alias \s0/\s1.
.macro QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
|
||||
movi \d0\().2d, #0 // clear the usdot accumulators
|
||||
movi \d1\().2d, #0
|
||||
usdot \d0\().4s, \s0\().16b, v28.16b
|
||||
usdot \d1\().4s, \s1\().16b, v28.16b
|
||||
addp \d0\().4s, \d0\().4s, \d1\().4s // add adjacent 4-tap halves into 8-tap sums
|
||||
mul \d0\().4s, \d0\().4s, v30.4s // * wx
|
||||
sqrshl \d0\().4s, \d0\().4s, v31.4s // rounding shift by v31 (negative = right shift)
|
||||
sqadd \d0\().4s, \d0\().4s, v29.4s // + ox (saturating)
|
||||
.endm
|
||||
|
||||
|
||||
// Horizontal 8-tap filter with weighting, 8 output bytes per row.
// In:  x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height,
//      w5 = denom, w6 = wx, w7 = ox, [sp] = mx (see QPEL_UNI_W_H_HEADER).
// Each row loads 32 source bytes starting at src - 3. Only the low 8 bytes of
// v16 and of each shifted copy feed the filter, because zip1 picks the low
// 64-bit halves.
function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_i8mm, export=1
|
||||
QPEL_UNI_W_H_HEADER
|
||||
1:
|
||||
ld1 {v16.16b, v17.16b}, [x2], x3 // load 32 source bytes, src += srcstride
|
||||
ext v1.16b, v16.16b, v17.16b, #1 // v1..v7 = source shifted by 1..7 bytes
|
||||
ext v2.16b, v16.16b, v17.16b, #2
|
||||
ext v3.16b, v16.16b, v17.16b, #3
|
||||
ext v4.16b, v16.16b, v17.16b, #4
|
||||
ext v5.16b, v16.16b, v17.16b, #5
|
||||
ext v6.16b, v16.16b, v17.16b, #6
|
||||
ext v7.16b, v16.16b, v17.16b, #7
|
||||
zip1 v0.2d, v16.2d, v1.2d // inputs of outputs 0, 1
|
||||
zip1 v2.2d, v2.2d, v3.2d // inputs of outputs 2, 3
|
||||
zip1 v4.2d, v4.2d, v5.2d // inputs of outputs 4, 5
|
||||
zip1 v6.2d, v6.2d, v7.2d // inputs of outputs 6, 7
|
||||
QPEL_UNI_W_H_CALC v0, v2, v4, v6, v18, v19, v20, v21 // v18: 0-3, v20: 4-7
|
||||
sqxtn v18.4h, v18.4s // saturate to signed 16 bit
|
||||
sqxtn2 v18.8h, v20.4s
|
||||
sqxtun v18.8b, v18.8h // saturate to unsigned 8 bit
|
||||
str d18, [x0] // store 8 output bytes
|
||||
add x0, x0, x1 // dst += dststride
|
||||
subs w4, w4, #1 // one row done
|
||||
b.hi 1b // loop while rows remain
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// Horizontal 8-tap filter with weighting, 12 output bytes per row.
// In:  x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height,
//      w5 = denom, w6 = wx, w7 = ox, [sp] = mx (see QPEL_UNI_W_H_HEADER).
// Outputs 0-7 are built from the low halves of the shifted source copies
// (zip1), outputs 8-11 from the high halves of the copies shifted by 0..3
// (zip2). x13 tracks dst + 8 for the second store of each row.
// Each row loads 32 source bytes starting at src - 3.
function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_i8mm, export=1
|
||||
QPEL_UNI_W_H_HEADER
|
||||
add x13, x0, #8 // x13 = dst + 8
|
||||
1:
|
||||
ld1 {v16.16b, v17.16b}, [x2], x3 // load 32 source bytes, src += srcstride
|
||||
ext v1.16b, v16.16b, v17.16b, #1 // v1..v7 = source shifted by 1..7 bytes
|
||||
ext v2.16b, v16.16b, v17.16b, #2
|
||||
ext v3.16b, v16.16b, v17.16b, #3
|
||||
ext v4.16b, v16.16b, v17.16b, #4
|
||||
ext v5.16b, v16.16b, v17.16b, #5
|
||||
ext v6.16b, v16.16b, v17.16b, #6
|
||||
ext v7.16b, v16.16b, v17.16b, #7
|
||||
zip1 v18.2d, v16.2d, v1.2d // inputs of outputs 0, 1
|
||||
zip1 v19.2d, v2.2d, v3.2d // inputs of outputs 2, 3
|
||||
zip1 v20.2d, v4.2d, v5.2d // inputs of outputs 4, 5
|
||||
zip1 v21.2d, v6.2d, v7.2d // inputs of outputs 6, 7
|
||||
zip2 v22.2d, v16.2d, v1.2d // inputs of outputs 8, 9
|
||||
zip2 v23.2d, v2.2d, v3.2d // inputs of outputs 10, 11
|
||||
QPEL_UNI_W_H_CALC v18, v19, v20, v21, v0, v2, v4, v6 // v0: 0-3, v4: 4-7
|
||||
QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25 // v24: 8-11
|
||||
sqxtn v0.4h, v0.4s // saturate to signed 16 bit
|
||||
sqxtn2 v0.8h, v4.4s
|
||||
sqxtn v1.4h, v24.4s
|
||||
sqxtun v0.8b, v0.8h // saturate to unsigned 8 bit
|
||||
sqxtun v1.8b, v1.8h
|
||||
|
||||
str d0, [x0] // store outputs 0-7
|
||||
str s1, [x13] // store outputs 8-11
|
||||
add x0, x0, x1 // advance both destination pointers by dststride
|
||||
add x13, x13, x1
|
||||
subs w4, w4, #1 // one row done
|
||||
b.hi 1b // loop while rows remain
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// Horizontal 8-tap filter with weighting, 16 output bytes per row.
// In:  x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height,
//      w5 = denom, w6 = wx, w7 = ox, [sp] = mx (see QPEL_UNI_W_H_HEADER).
// The full 16-byte shifted copies are fed to the filter macro, so each copy
// shifted by k yields outputs k and k + 8. The results therefore come out
// interleaved and are put back in order with trn1/trn2 before the final
// narrowing.
// Each row loads 32 source bytes starting at src - 3.
function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_i8mm, export=1
|
||||
QPEL_UNI_W_H_HEADER
|
||||
1:
|
||||
ld1 {v16.16b, v17.16b}, [x2], x3 // load 32 source bytes, src += srcstride
|
||||
ext v1.16b, v16.16b, v17.16b, #1 // v1..v7 = source shifted by 1..7 bytes
|
||||
ext v2.16b, v16.16b, v17.16b, #2
|
||||
ext v3.16b, v16.16b, v17.16b, #3
|
||||
ext v4.16b, v16.16b, v17.16b, #4
|
||||
ext v5.16b, v16.16b, v17.16b, #5
|
||||
ext v6.16b, v16.16b, v17.16b, #6
|
||||
ext v7.16b, v16.16b, v17.16b, #7
|
||||
QPEL_UNI_W_H_CALC v16, v2, v1, v3, v18, v19, v20, v21 // v18: 0, 8, 2, 10 v20: 1, 9, 3, 11
|
||||
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25 // v22: 4, 12, 6, 14 v24: 5, 13, 7, 15
|
||||
sqxtn v0.4h, v18.4s // v0.8h: 0, 8, 2, 10, 4, 12, 6, 14
|
||||
sqxtn2 v0.8h, v22.4s
|
||||
sqxtn v1.4h, v20.4s // v1.8h: 1, 9, 3, 11, 5, 13, 7, 15
|
||||
sqxtn2 v1.8h, v24.4s
|
||||
trn1 v2.8h, v0.8h, v1.8h // v2.8h: 0-7
|
||||
trn2 v3.8h, v0.8h, v1.8h // v3.8h: 8-15
|
||||
sqxtun v0.8b, v2.8h // saturate to unsigned 8 bit
|
||||
sqxtun2 v0.16b, v3.8h
|
||||
st1 {v0.16b}, [x0], x1 // store 16 output bytes, dst += dststride
|
||||
subs w4, w4, #1 // one row done
|
||||
b.hi 1b // loop while rows remain
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// Horizontal 8-tap filter with weighting, 24 output bytes per row.
// In:  x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height,
//      w5 = denom, w6 = wx, w7 = ox, [sp] = mx (see QPEL_UNI_W_H_HEADER).
// Outputs 0-15 are computed from the full shifted copies of v16:v17
// (interleaved results, reordered with trn1/trn2) and collected in v26.
// Outputs 16-23 are computed from v17 alone, using only the low 8 bytes of
// each shifted copy, and collected in v27.
// x1 is reduced by 16 up front because the first store of each row already
// advances x0 by 16.
// Each row loads 32 source bytes starting at src - 3.
function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_i8mm, export=1
|
||||
QPEL_UNI_W_H_HEADER
|
||||
sub x1, x1, #16 // compensate for the "#16" post-increment below
|
||||
1:
|
||||
ld1 {v16.16b, v17.16b}, [x2], x3 // load 32 source bytes, src += srcstride
|
||||
ext v1.16b, v16.16b, v17.16b, #1 // v1..v7 = source shifted by 1..7 bytes
|
||||
ext v2.16b, v16.16b, v17.16b, #2
|
||||
ext v3.16b, v16.16b, v17.16b, #3
|
||||
ext v4.16b, v16.16b, v17.16b, #4
|
||||
ext v5.16b, v16.16b, v17.16b, #5
|
||||
ext v6.16b, v16.16b, v17.16b, #6
|
||||
ext v7.16b, v16.16b, v17.16b, #7
|
||||
QPEL_UNI_W_H_CALC v16, v2, v1, v3, v18, v19, v20, v21 // v18: 0, 8, 2, 10 v20: 1, 9, 3, 11
|
||||
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25 // v22: 4, 12, 6, 14 v24: 5, 13, 7, 15
|
||||
sqxtn v18.4h, v18.4s // saturate to signed 16 bit
|
||||
sqxtn2 v18.8h, v22.4s
|
||||
sqxtn v19.4h, v20.4s
|
||||
sqxtn2 v19.8h, v24.4s
|
||||
trn1 v20.8h, v18.8h, v19.8h // v20.8h: 0-7
|
||||
trn2 v21.8h, v18.8h, v19.8h // v21.8h: 8-15
|
||||
sqxtun v26.8b, v20.8h
|
||||
sqxtun2 v26.16b, v21.8h // 0-15
|
||||
ext v1.16b, v17.16b, v17.16b, #1 // v1..v7 = v17 rotated by 1..7 bytes
|
||||
ext v2.16b, v17.16b, v17.16b, #2
|
||||
ext v3.16b, v17.16b, v17.16b, #3
|
||||
ext v4.16b, v17.16b, v17.16b, #4
|
||||
ext v5.16b, v17.16b, v17.16b, #5
|
||||
ext v6.16b, v17.16b, v17.16b, #6
|
||||
ext v7.16b, v17.16b, v17.16b, #7
|
||||
zip1 v0.2d, v17.2d, v1.2d // inputs of outputs 16, 17
|
||||
zip1 v2.2d, v2.2d, v3.2d // inputs of outputs 18, 19
|
||||
zip1 v4.2d, v4.2d, v5.2d // inputs of outputs 20, 21
|
||||
zip1 v6.2d, v6.2d, v7.2d // inputs of outputs 22, 23
|
||||
QPEL_UNI_W_H_CALC v0, v2, v4, v6, v18, v19, v20, v21 // v18: 16-19, v20: 20-23
|
||||
sqxtn v18.4h, v18.4s
|
||||
sqxtn2 v18.8h, v20.4s
|
||||
sqxtun v27.8b, v18.8h // 16-23
|
||||
|
||||
st1 {v26.16b}, [x0], #16 // store outputs 0-15, dst += 16
|
||||
st1 {v27.8b}, [x0], x1 // store outputs 16-23, dst += dststride - 16
|
||||
subs w4, w4, #1 // one row done
|
||||
b.hi 1b // loop while rows remain
|
||||
ret
|
||||
endfunc
|
||||
|
||||
|
||||
// Horizontal 8-tap filter with weighting, 32 output bytes per row.
// In:  x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height,
//      w5 = denom, w6 = wx, w7 = ox, [sp] = mx (see QPEL_UNI_W_H_HEADER).
// Two 16-output passes per row: v16:v17 -> v26 (outputs 0-15) and
// v17:v18 -> v27 (outputs 16-31). In each pass the copy shifted by k yields
// outputs k and k + 8, and trn1/trn2 restore the order.
// Each row loads 48 source bytes starting at src - 3.
function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_i8mm, export=1
|
||||
QPEL_UNI_W_H_HEADER
|
||||
1:
|
||||
ld1 {v16.16b, v17.16b, v18.16b}, [x2], x3 // load 48 source bytes, src += srcstride
|
||||
ext v1.16b, v16.16b, v17.16b, #1 // v1..v7 = v16:v17 shifted by 1..7 bytes
|
||||
ext v2.16b, v16.16b, v17.16b, #2
|
||||
ext v3.16b, v16.16b, v17.16b, #3
|
||||
ext v4.16b, v16.16b, v17.16b, #4
|
||||
ext v5.16b, v16.16b, v17.16b, #5
|
||||
ext v6.16b, v16.16b, v17.16b, #6
|
||||
ext v7.16b, v16.16b, v17.16b, #7
|
||||
QPEL_UNI_W_H_CALC v16, v2, v1, v3, v0, v19, v20, v21 // v0: 0, 8, 2, 10 v20: 1, 9, 3, 11
|
||||
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25 // v22: 4, 12, 6, 14 v24: 5, 13, 7, 15
|
||||
sqxtn v0.4h, v0.4s // saturate to signed 16 bit
|
||||
sqxtn2 v0.8h, v22.4s
|
||||
sqxtn v19.4h, v20.4s
|
||||
sqxtn2 v19.8h, v24.4s
|
||||
trn1 v20.8h, v0.8h, v19.8h // restore ascending output order
|
||||
trn2 v21.8h, v0.8h, v19.8h
|
||||
sqxtun v26.8b, v20.8h
|
||||
sqxtun2 v26.16b, v21.8h // 0-15
|
||||
ext v1.16b, v17.16b, v18.16b, #1 // v1..v7 = v17:v18 shifted by 1..7 bytes
|
||||
ext v2.16b, v17.16b, v18.16b, #2
|
||||
ext v3.16b, v17.16b, v18.16b, #3
|
||||
ext v4.16b, v17.16b, v18.16b, #4
|
||||
ext v5.16b, v17.16b, v18.16b, #5
|
||||
ext v6.16b, v17.16b, v18.16b, #6
|
||||
ext v7.16b, v17.16b, v18.16b, #7
|
||||
QPEL_UNI_W_H_CALC v17, v2, v1, v3, v0, v19, v20, v21
|
||||
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
|
||||
sqxtn v0.4h, v0.4s
|
||||
sqxtn2 v0.8h, v22.4s
|
||||
sqxtn v19.4h, v20.4s
|
||||
sqxtn2 v19.8h, v24.4s
|
||||
trn1 v20.8h, v0.8h, v19.8h
|
||||
trn2 v21.8h, v0.8h, v19.8h
|
||||
sqxtun v27.8b, v20.8h
|
||||
sqxtun2 v27.16b, v21.8h // 16-31
|
||||
st1 {v26.16b, v27.16b}, [x0], x1 // store 32 output bytes, dst += dststride
|
||||
subs w4, w4, #1 // one row done
|
||||
b.hi 1b // loop while rows remain
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// Horizontal 8-tap filter with weighting, 48 output bytes per row.
// In:  x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height,
//      w5 = denom, w6 = wx, w7 = ox, [sp] = mx (see QPEL_UNI_W_H_HEADER).
// Three 16-output passes per row: v16:v17 -> v25 (0-15), v17:v18 -> v26
// (16-31), v18:v19 -> v27 (32-47). In each pass the filter macro leaves its
// results in v20/v21 and v22/v23; v24 and v0 are only scratch accumulators.
// Each row loads 64 source bytes starting at src - 3.
function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_i8mm, export=1
|
||||
QPEL_UNI_W_H_HEADER
|
||||
1:
|
||||
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3 // load 64 source bytes, src += srcstride
|
||||
ext v1.16b, v16.16b, v17.16b, #1 // v1..v7 = v16:v17 shifted by 1..7 bytes
|
||||
ext v2.16b, v16.16b, v17.16b, #2
|
||||
ext v3.16b, v16.16b, v17.16b, #3
|
||||
ext v4.16b, v16.16b, v17.16b, #4
|
||||
ext v5.16b, v16.16b, v17.16b, #5
|
||||
ext v6.16b, v16.16b, v17.16b, #6
|
||||
ext v7.16b, v16.16b, v17.16b, #7
|
||||
QPEL_UNI_W_H_CALC v16, v2, v1, v3, v20, v24, v21, v0 // v20: 0, 8, 2, 10 v21: 1, 9, 3, 11
|
||||
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0 // v22: 4, 12, 6, 14 v23: 5, 13, 7, 15
|
||||
sqxtn v20.4h, v20.4s // saturate to signed 16 bit
|
||||
sqxtn2 v20.8h, v22.4s
|
||||
sqxtn v21.4h, v21.4s
|
||||
sqxtn2 v21.8h, v23.4s
|
||||
trn1 v22.8h, v20.8h, v21.8h // restore ascending output order
|
||||
trn2 v23.8h, v20.8h, v21.8h
|
||||
sqxtun v25.8b, v22.8h
|
||||
sqxtun2 v25.16b, v23.8h // 0-15
|
||||
ext v1.16b, v17.16b, v18.16b, #1 // v1..v7 = v17:v18 shifted by 1..7 bytes
|
||||
ext v2.16b, v17.16b, v18.16b, #2
|
||||
ext v3.16b, v17.16b, v18.16b, #3
|
||||
ext v4.16b, v17.16b, v18.16b, #4
|
||||
ext v5.16b, v17.16b, v18.16b, #5
|
||||
ext v6.16b, v17.16b, v18.16b, #6
|
||||
ext v7.16b, v17.16b, v18.16b, #7
|
||||
QPEL_UNI_W_H_CALC v17, v2, v1, v3, v20, v24, v21, v0
|
||||
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
|
||||
sqxtn v20.4h, v20.4s
|
||||
sqxtn2 v20.8h, v22.4s
|
||||
sqxtn v21.4h, v21.4s
|
||||
sqxtn2 v21.8h, v23.4s
|
||||
trn1 v22.8h, v20.8h, v21.8h
|
||||
trn2 v23.8h, v20.8h, v21.8h
|
||||
sqxtun v26.8b, v22.8h
|
||||
sqxtun2 v26.16b, v23.8h // 16-31
|
||||
ext v1.16b, v18.16b, v19.16b, #1 // v1..v7 = v18:v19 shifted by 1..7 bytes
|
||||
ext v2.16b, v18.16b, v19.16b, #2
|
||||
ext v3.16b, v18.16b, v19.16b, #3
|
||||
ext v4.16b, v18.16b, v19.16b, #4
|
||||
ext v5.16b, v18.16b, v19.16b, #5
|
||||
ext v6.16b, v18.16b, v19.16b, #6
|
||||
ext v7.16b, v18.16b, v19.16b, #7
|
||||
QPEL_UNI_W_H_CALC v18, v2, v1, v3, v20, v24, v21, v0
|
||||
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
|
||||
sqxtn v20.4h, v20.4s
|
||||
sqxtn2 v20.8h, v22.4s
|
||||
sqxtn v21.4h, v21.4s
|
||||
sqxtn2 v21.8h, v23.4s
|
||||
trn1 v22.8h, v20.8h, v21.8h
|
||||
trn2 v23.8h, v20.8h, v21.8h
|
||||
sqxtun v27.8b, v22.8h
|
||||
sqxtun2 v27.16b, v23.8h // 32-47
|
||||
st1 {v25.16b, v26.16b, v27.16b}, [x0], x1 // store 48 output bytes, dst += dststride
|
||||
subs w4, w4, #1 // one row done
|
||||
b.hi 1b // loop while rows remain
|
||||
ret
|
||||
endfunc
|
||||
|
||||
|
||||
|
||||
// Horizontal 8-tap filter with weighting, 64 output bytes per row.
// In:  x0 = dst, x1 = dststride, x2 = src, x3 = srcstride, w4 = height,
//      w5 = denom, w6 = wx, w7 = ox, [sp] = mx (see QPEL_UNI_W_H_HEADER).
// Four 16-output passes per row. Each pass writes its packed result back
// into the oldest source register, which is no longer needed at that point
// (v16, then v17, v18, v19), so the row can be stored with a single st1.
// The source is read in two steps: 64 bytes with a fixed post-increment of
// 64, then 16 more bytes into v0 with post-increment x3. x3 is reduced by 64
// up front so that both increments add up to srcstride.
// In each pass v24 and v0 are scratch accumulators of the filter macro; the
// extra source vector in v0 is consumed by the ext instructions before the
// last pass overwrites it.
function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_i8mm, export=1
|
||||
QPEL_UNI_W_H_HEADER
|
||||
sub x3, x3, #64 // compensate for the "#64" post-increment below
|
||||
1:
|
||||
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 // load source bytes 0-63, src += 64
|
||||
ext v1.16b, v16.16b, v17.16b, #1 // v1..v7 = v16:v17 shifted by 1..7 bytes
|
||||
ext v2.16b, v16.16b, v17.16b, #2
|
||||
ext v3.16b, v16.16b, v17.16b, #3
|
||||
ext v4.16b, v16.16b, v17.16b, #4
|
||||
ext v5.16b, v16.16b, v17.16b, #5
|
||||
ext v6.16b, v16.16b, v17.16b, #6
|
||||
ext v7.16b, v16.16b, v17.16b, #7
|
||||
QPEL_UNI_W_H_CALC v16, v2, v1, v3, v20, v24, v21, v0 // v20: 0, 8, 2, 10 v21: 1, 9, 3, 11
|
||||
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0 // v22: 4, 12, 6, 14 v23: 5, 13, 7, 15
|
||||
sqxtn v20.4h, v20.4s // saturate to signed 16 bit
|
||||
sqxtn2 v20.8h, v22.4s
|
||||
sqxtn v21.4h, v21.4s
|
||||
sqxtn2 v21.8h, v23.4s
|
||||
trn1 v22.8h, v20.8h, v21.8h // restore ascending output order
|
||||
trn2 v23.8h, v20.8h, v21.8h
|
||||
sqxtun v16.8b, v22.8h
|
||||
sqxtun2 v16.16b, v23.8h // 0-15
|
||||
ext v1.16b, v17.16b, v18.16b, #1 // v1..v7 = v17:v18 shifted by 1..7 bytes
|
||||
ext v2.16b, v17.16b, v18.16b, #2
|
||||
ext v3.16b, v17.16b, v18.16b, #3
|
||||
ext v4.16b, v17.16b, v18.16b, #4
|
||||
ext v5.16b, v17.16b, v18.16b, #5
|
||||
ext v6.16b, v17.16b, v18.16b, #6
|
||||
ext v7.16b, v17.16b, v18.16b, #7
|
||||
QPEL_UNI_W_H_CALC v17, v2, v1, v3, v20, v24, v21, v0
|
||||
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
|
||||
sqxtn v20.4h, v20.4s
|
||||
sqxtn2 v20.8h, v22.4s
|
||||
sqxtn v21.4h, v21.4s
|
||||
sqxtn2 v21.8h, v23.4s
|
||||
trn1 v22.8h, v20.8h, v21.8h
|
||||
trn2 v23.8h, v20.8h, v21.8h
|
||||
sqxtun v17.8b, v22.8h
|
||||
sqxtun2 v17.16b, v23.8h // 16-31
|
||||
ext v1.16b, v18.16b, v19.16b, #1 // v1..v7 = v18:v19 shifted by 1..7 bytes
|
||||
ext v2.16b, v18.16b, v19.16b, #2
|
||||
ext v3.16b, v18.16b, v19.16b, #3
|
||||
ext v4.16b, v18.16b, v19.16b, #4
|
||||
ext v5.16b, v18.16b, v19.16b, #5
|
||||
ext v6.16b, v18.16b, v19.16b, #6
|
||||
ext v7.16b, v18.16b, v19.16b, #7
|
||||
QPEL_UNI_W_H_CALC v18, v2, v1, v3, v20, v24, v21, v0
|
||||
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
|
||||
ld1 {v0.16b}, [x2], x3 // load source bytes 64-79, src += srcstride - 64
|
||||
sqxtn v20.4h, v20.4s
|
||||
sqxtn2 v20.8h, v22.4s
|
||||
sqxtn v21.4h, v21.4s
|
||||
sqxtn2 v21.8h, v23.4s
|
||||
trn1 v22.8h, v20.8h, v21.8h
|
||||
trn2 v23.8h, v20.8h, v21.8h
|
||||
sqxtun v18.8b, v22.8h
|
||||
sqxtun2 v18.16b, v23.8h // 32-47
|
||||
ext v1.16b, v19.16b, v0.16b, #1 // v1..v7 = v19:v0 shifted by 1..7 bytes
|
||||
ext v2.16b, v19.16b, v0.16b, #2
|
||||
ext v3.16b, v19.16b, v0.16b, #3
|
||||
ext v4.16b, v19.16b, v0.16b, #4
|
||||
ext v5.16b, v19.16b, v0.16b, #5
|
||||
ext v6.16b, v19.16b, v0.16b, #6
|
||||
ext v7.16b, v19.16b, v0.16b, #7
|
||||
QPEL_UNI_W_H_CALC v19, v2, v1, v3, v20, v24, v21, v0
|
||||
QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
|
||||
sqxtn v20.4h, v20.4s
|
||||
sqxtn2 v20.8h, v22.4s
|
||||
sqxtn v21.4h, v21.4s
|
||||
sqxtn2 v21.8h, v23.4s
|
||||
trn1 v22.8h, v20.8h, v21.8h
|
||||
trn2 v23.8h, v20.8h, v21.8h
|
||||
sqxtun v19.8b, v22.8h
|
||||
sqxtun2 v19.16b, v23.8h // 48-63
|
||||
|
||||
st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 // store 64 output bytes, dst += dststride
|
||||
subs w4, w4, #1 // one row done
|
||||
b.hi 1b // loop while rows remain
|
||||
ret
|
||||
endfunc
|
||||
|
||||
#endif // HAVE_I8MM
|
||||
|
Loading…
Reference in New Issue
Block a user