1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-08-10 06:10:52 +02:00

avcodec/vc1: Arm 64-bit NEON inverse transform fast paths

checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows.

vc1dsp.vc1_inv_trans_4x4_c: 158.2
vc1dsp.vc1_inv_trans_4x4_neon: 65.7
vc1dsp.vc1_inv_trans_4x4_dc_c: 86.5
vc1dsp.vc1_inv_trans_4x4_dc_neon: 26.5
vc1dsp.vc1_inv_trans_4x8_c: 335.2
vc1dsp.vc1_inv_trans_4x8_neon: 106.2
vc1dsp.vc1_inv_trans_4x8_dc_c: 151.2
vc1dsp.vc1_inv_trans_4x8_dc_neon: 25.5
vc1dsp.vc1_inv_trans_8x4_c: 365.7
vc1dsp.vc1_inv_trans_8x4_neon: 97.2
vc1dsp.vc1_inv_trans_8x4_dc_c: 139.7
vc1dsp.vc1_inv_trans_8x4_dc_neon: 16.5
vc1dsp.vc1_inv_trans_8x8_c: 547.7
vc1dsp.vc1_inv_trans_8x8_neon: 137.0
vc1dsp.vc1_inv_trans_8x8_dc_c: 268.2
vc1dsp.vc1_inv_trans_8x8_dc_neon: 30.5

Signed-off-by: Ben Avison <bavison@riscosopen.org>
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Ben Avison
2022-03-31 18:23:48 +01:00
committed by Martin Storsjö
parent c07de58a72
commit 501fdc017d
2 changed files with 697 additions and 0 deletions

View File

@@ -25,6 +25,16 @@
#include "config.h"
void ff_vc1_inv_trans_8x8_neon(int16_t *block);
void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
@@ -46,6 +56,15 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon;
dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon;
dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon;
dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon;
dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon;
dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon;
dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon;
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon;

View File

@@ -22,7 +22,685 @@
#include "libavutil/aarch64/asm.S"
// VC-1 8x8 inverse transform
// On entry:
// x0 -> array of 16-bit inverse transform coefficients, in column-major order
// On exit:
// array at x0 updated to hold transformed block; also now held in row-major order
function ff_vc1_inv_trans_8x8_neon, export=1
ld1 {v1.16b, v2.16b}, [x0], #32
ld1 {v3.16b, v4.16b}, [x0], #32
ld1 {v5.16b, v6.16b}, [x0], #32
shl v1.8h, v1.8h, #2 // 8/2 * src[0]
sub x1, x0, #3*32
ld1 {v16.16b, v17.16b}, [x0]
shl v7.8h, v2.8h, #4 // 16 * src[8]
shl v18.8h, v2.8h, #2 // 4 * src[8]
shl v19.8h, v4.8h, #4 // 16 * src[24]
ldr d0, .Lcoeffs_it8
shl v5.8h, v5.8h, #2 // 8/2 * src[32]
shl v20.8h, v6.8h, #4 // 16 * src[40]
shl v21.8h, v6.8h, #2 // 4 * src[40]
shl v22.8h, v17.8h, #4 // 16 * src[56]
ssra v20.8h, v19.8h, #2 // 4 * src[24] + 16 * src[40]
mul v23.8h, v3.8h, v0.h[0] // 6/2 * src[16]
sub v19.8h, v19.8h, v21.8h // 16 * src[24] - 4 * src[40]
ssra v7.8h, v22.8h, #2 // 16 * src[8] + 4 * src[56]
sub v18.8h, v22.8h, v18.8h // - 4 * src[8] + 16 * src[56]
shl v3.8h, v3.8h, #3 // 16/2 * src[16]
mls v20.8h, v2.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
ssra v1.8h, v1.8h, #1 // 12/2 * src[0]
ssra v5.8h, v5.8h, #1 // 12/2 * src[32]
mla v7.8h, v4.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
shl v21.8h, v16.8h, #3 // 16/2 * src[48]
mls v19.8h, v2.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
sub v2.8h, v23.8h, v21.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
mla v18.8h, v4.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
add v4.8h, v1.8h, v5.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
sub v1.8h, v1.8h, v5.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
mla v3.8h, v16.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
mla v7.8h, v6.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
add v5.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2
sub v16.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2
mla v20.8h, v17.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
add v21.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2
add v22.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2
mls v19.8h, v17.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
sub v17.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2
add v23.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2
mls v18.8h, v6.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
sub v1.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2
sub v2.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2
neg v3.8h, v7.8h // -t1
neg v4.8h, v20.8h // +t2
neg v6.8h, v19.8h // +t3
ssra v22.8h, v7.8h, #1 // (t5 + t1) >> 1
ssra v1.8h, v19.8h, #1 // (t7 - t3) >> 1
neg v7.8h, v18.8h // +t4
ssra v5.8h, v4.8h, #1 // (t6 + t2) >> 1
ssra v16.8h, v6.8h, #1 // (t7 + t3) >> 1
ssra v2.8h, v18.8h, #1 // (t8 - t4) >> 1
ssra v17.8h, v7.8h, #1 // (t8 + t4) >> 1
ssra v21.8h, v20.8h, #1 // (t6 - t2) >> 1
ssra v23.8h, v3.8h, #1 // (t5 - t1) >> 1
srshr v3.8h, v22.8h, #2 // (t5 + t1 + 4) >> 3
srshr v4.8h, v5.8h, #2 // (t6 + t2 + 4) >> 3
srshr v5.8h, v16.8h, #2 // (t7 + t3 + 4) >> 3
srshr v6.8h, v17.8h, #2 // (t8 + t4 + 4) >> 3
srshr v2.8h, v2.8h, #2 // (t8 - t4 + 4) >> 3
srshr v1.8h, v1.8h, #2 // (t7 - t3 + 4) >> 3
srshr v7.8h, v21.8h, #2 // (t6 - t2 + 4) >> 3
srshr v16.8h, v23.8h, #2 // (t5 - t1 + 4) >> 3
trn2 v17.8h, v3.8h, v4.8h
trn2 v18.8h, v5.8h, v6.8h
trn2 v19.8h, v2.8h, v1.8h
trn2 v20.8h, v7.8h, v16.8h
trn1 v21.4s, v17.4s, v18.4s
trn2 v17.4s, v17.4s, v18.4s
trn1 v18.4s, v19.4s, v20.4s
trn2 v19.4s, v19.4s, v20.4s
trn1 v3.8h, v3.8h, v4.8h
trn2 v4.2d, v21.2d, v18.2d
trn1 v20.2d, v17.2d, v19.2d
trn1 v5.8h, v5.8h, v6.8h
trn1 v1.8h, v2.8h, v1.8h
trn1 v2.8h, v7.8h, v16.8h
trn1 v6.2d, v21.2d, v18.2d
trn2 v7.2d, v17.2d, v19.2d
shl v16.8h, v20.8h, #4 // 16 * src[24]
shl v17.8h, v4.8h, #4 // 16 * src[40]
trn1 v18.4s, v3.4s, v5.4s
trn1 v19.4s, v1.4s, v2.4s
shl v21.8h, v7.8h, #4 // 16 * src[56]
shl v22.8h, v6.8h, #2 // 4 * src[8]
shl v23.8h, v4.8h, #2 // 4 * src[40]
trn2 v3.4s, v3.4s, v5.4s
trn2 v1.4s, v1.4s, v2.4s
shl v2.8h, v6.8h, #4 // 16 * src[8]
sub v5.8h, v16.8h, v23.8h // 16 * src[24] - 4 * src[40]
ssra v17.8h, v16.8h, #2 // 4 * src[24] + 16 * src[40]
sub v16.8h, v21.8h, v22.8h // - 4 * src[8] + 16 * src[56]
trn1 v22.2d, v18.2d, v19.2d
trn2 v18.2d, v18.2d, v19.2d
trn1 v19.2d, v3.2d, v1.2d
ssra v2.8h, v21.8h, #2 // 16 * src[8] + 4 * src[56]
mls v17.8h, v6.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
shl v21.8h, v22.8h, #2 // 8/2 * src[0]
shl v18.8h, v18.8h, #2 // 8/2 * src[32]
mls v5.8h, v6.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
shl v6.8h, v19.8h, #3 // 16/2 * src[16]
trn2 v1.2d, v3.2d, v1.2d
mla v16.8h, v20.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
ssra v21.8h, v21.8h, #1 // 12/2 * src[0]
ssra v18.8h, v18.8h, #1 // 12/2 * src[32]
mul v3.8h, v19.8h, v0.h[0] // 6/2 * src[16]
shl v19.8h, v1.8h, #3 // 16/2 * src[48]
mla v2.8h, v20.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
add v20.8h, v21.8h, v18.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
mla v6.8h, v1.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
sub v1.8h, v21.8h, v18.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
sub v3.8h, v3.8h, v19.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
mla v17.8h, v7.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
mls v5.8h, v7.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
add v7.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2
add v18.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2
mls v16.8h, v4.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
sub v19.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2
neg v21.8h, v17.8h // +t2
mla v2.8h, v4.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
sub v0.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2
neg v4.8h, v5.8h // +t3
sub v22.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2
sub v23.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2
neg v24.8h, v16.8h // +t4
add v6.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2
add v1.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2
ssra v7.8h, v21.8h, #1 // (t6 + t2) >> 1
neg v3.8h, v2.8h // -t1
ssra v18.8h, v2.8h, #1 // (t5 + t1) >> 1
ssra v19.8h, v4.8h, #1 // (t7 + t3) >> 1
ssra v0.8h, v24.8h, #1 // (t8 + t4) >> 1
srsra v23.8h, v16.8h, #1 // (t8 - t4 + 1) >> 1
srsra v22.8h, v5.8h, #1 // (t7 - t3 + 1) >> 1
srsra v1.8h, v17.8h, #1 // (t6 - t2 + 1) >> 1
srsra v6.8h, v3.8h, #1 // (t5 - t1 + 1) >> 1
srshr v2.8h, v18.8h, #6 // (t5 + t1 + 64) >> 7
srshr v3.8h, v7.8h, #6 // (t6 + t2 + 64) >> 7
srshr v4.8h, v19.8h, #6 // (t7 + t3 + 64) >> 7
srshr v5.8h, v0.8h, #6 // (t8 + t4 + 64) >> 7
srshr v16.8h, v23.8h, #6 // (t8 - t4 + 65) >> 7
srshr v17.8h, v22.8h, #6 // (t7 - t3 + 65) >> 7
st1 {v2.16b, v3.16b}, [x1], #32
srshr v0.8h, v1.8h, #6 // (t6 - t2 + 65) >> 7
srshr v1.8h, v6.8h, #6 // (t5 - t1 + 65) >> 7
st1 {v4.16b, v5.16b}, [x1], #32
st1 {v16.16b, v17.16b}, [x1], #32
st1 {v0.16b, v1.16b}, [x1]
ret
endfunc
// VC-1 8x4 inverse transform
// On entry:
// x0 -> array of 8-bit samples, in row-major order
// x1 = row stride for 8-bit sample array
// x2 -> array of 16-bit inverse transform coefficients, in row-major order
// On exit:
// array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_8x4_neon, export=1
ld1 {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32
mov x3, x0
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector
ld1 {v5.8b}, [x0], x1
trn2 v6.4h, v1.4h, v3.4h
trn2 v7.4h, v2.4h, v4.4h
trn1 v1.4h, v1.4h, v3.4h
trn1 v2.4h, v2.4h, v4.4h
trn2 v3.4h, v16.4h, v18.4h
trn2 v4.4h, v17.4h, v19.4h
trn1 v16.4h, v16.4h, v18.4h
trn1 v17.4h, v17.4h, v19.4h
ld1 {v18.8b}, [x0], x1
trn1 v19.2s, v6.2s, v3.2s
trn2 v3.2s, v6.2s, v3.2s
trn1 v6.2s, v7.2s, v4.2s
trn2 v4.2s, v7.2s, v4.2s
trn1 v7.2s, v1.2s, v16.2s
trn1 v20.2s, v2.2s, v17.2s
shl v21.4h, v19.4h, #4 // 16 * src[1]
trn2 v1.2s, v1.2s, v16.2s
shl v16.4h, v3.4h, #4 // 16 * src[3]
trn2 v2.2s, v2.2s, v17.2s
shl v17.4h, v6.4h, #4 // 16 * src[5]
ld1 {v22.8b}, [x0], x1
shl v23.4h, v4.4h, #4 // 16 * src[7]
mul v24.4h, v1.4h, v0.h[0] // 6/2 * src[2]
ld1 {v25.8b}, [x0]
shl v26.4h, v19.4h, #2 // 4 * src[1]
shl v27.4h, v6.4h, #2 // 4 * src[5]
ssra v21.4h, v23.4h, #2 // 16 * src[1] + 4 * src[7]
ssra v17.4h, v16.4h, #2 // 4 * src[3] + 16 * src[5]
sub v23.4h, v23.4h, v26.4h // - 4 * src[1] + 16 * src[7]
sub v16.4h, v16.4h, v27.4h // 16 * src[3] - 4 * src[5]
shl v7.4h, v7.4h, #2 // 8/2 * src[0]
shl v20.4h, v20.4h, #2 // 8/2 * src[4]
mla v21.4h, v3.4h, v0.h[2] // 16 * src[1] + 15 * src[3] + 4 * src[7]
shl v1.4h, v1.4h, #3 // 16/2 * src[2]
mls v17.4h, v19.4h, v0.h[2] // - 15 * src[1] + 4 * src[3] + 16 * src[5]
ssra v7.4h, v7.4h, #1 // 12/2 * src[0]
mls v16.4h, v19.4h, v0.h[1] // - 9 * src[1] + 16 * src[3] - 4 * src[5]
ssra v20.4h, v20.4h, #1 // 12/2 * src[4]
mla v23.4h, v3.4h, v0.h[1] // - 4 * src[1] + 9 * src[3] + 16 * src[7]
shl v3.4h, v2.4h, #3 // 16/2 * src[6]
mla v1.4h, v2.4h, v0.h[0] // t3/2 = 16/2 * src[2] + 6/2 * src[6]
mla v21.4h, v6.4h, v0.h[1] // t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7]
mla v17.4h, v4.4h, v0.h[1] // -t2 = - 15 * src[1] + 4 * src[3] + 16 * src[5] + 9 * src[7]
sub v2.4h, v24.4h, v3.4h // t4/2 = 6/2 * src[2] - 16/2 * src[6]
mls v16.4h, v4.4h, v0.h[2] // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7]
add v3.4h, v7.4h, v20.4h // t1/2 = 12/2 * src[0] + 12/2 * src[4]
mls v23.4h, v6.4h, v0.h[2] // -t4 = - 4 * src[1] + 9 * src[3] - 15 * src[5] + 16 * src[7]
sub v4.4h, v7.4h, v20.4h // t2/2 = 12/2 * src[0] - 12/2 * src[4]
neg v6.4h, v21.4h // -t1
add v7.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2
sub v19.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2
add v20.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2
sub v24.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2
add v26.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2
add v27.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2
sub v2.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2
sub v1.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2
neg v3.4h, v17.4h // +t2
neg v4.4h, v16.4h // +t3
neg v28.4h, v23.4h // +t4
ssra v7.4h, v21.4h, #1 // (t5 + t1) >> 1
ssra v1.4h, v23.4h, #1 // (t8 - t4) >> 1
ssra v20.4h, v3.4h, #1 // (t6 + t2) >> 1
ssra v24.4h, v4.4h, #1 // (t7 + t3) >> 1
ssra v19.4h, v28.4h, #1 // (t8 + t4) >> 1
ssra v2.4h, v16.4h, #1 // (t7 - t3) >> 1
ssra v27.4h, v17.4h, #1 // (t6 - t2) >> 1
ssra v26.4h, v6.4h, #1 // (t5 - t1) >> 1
trn1 v1.2d, v7.2d, v1.2d
trn1 v2.2d, v20.2d, v2.2d
trn1 v3.2d, v24.2d, v27.2d
trn1 v4.2d, v19.2d, v26.2d
srshr v1.8h, v1.8h, #2 // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
srshr v2.8h, v2.8h, #2 // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
srshr v3.8h, v3.8h, #2 // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
srshr v4.8h, v4.8h, #2 // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
trn2 v6.8h, v1.8h, v2.8h
trn1 v1.8h, v1.8h, v2.8h
trn2 v2.8h, v3.8h, v4.8h
trn1 v3.8h, v3.8h, v4.8h
trn2 v4.4s, v6.4s, v2.4s
trn1 v7.4s, v1.4s, v3.4s
trn2 v1.4s, v1.4s, v3.4s
mul v3.8h, v4.8h, v0.h[5] // 22/2 * src[24]
trn1 v2.4s, v6.4s, v2.4s
mul v4.8h, v4.8h, v0.h[4] // 10/2 * src[24]
mul v6.8h, v7.8h, v0.h[6] // 17 * src[0]
mul v1.8h, v1.8h, v0.h[6] // 17 * src[16]
mls v3.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
mla v4.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[8] + 10/2 * src[24]
add v0.8h, v6.8h, v1.8h // t1 = 17 * src[0] + 17 * src[16]
sub v1.8h, v6.8h, v1.8h // t2 = 17 * src[0] - 17 * src[16]
neg v2.8h, v3.8h // -t4/2
neg v6.8h, v4.8h // -t3/2
ssra v4.8h, v0.8h, #1 // (t1 + t3) >> 1
ssra v2.8h, v1.8h, #1 // (t2 - t4) >> 1
ssra v3.8h, v1.8h, #1 // (t2 + t4) >> 1
ssra v6.8h, v0.8h, #1 // (t1 - t3) >> 1
srshr v0.8h, v4.8h, #6 // (t1 + t3 + 64) >> 7
srshr v1.8h, v2.8h, #6 // (t2 - t4 + 64) >> 7
srshr v2.8h, v3.8h, #6 // (t2 + t4 + 64) >> 7
srshr v3.8h, v6.8h, #6 // (t1 - t3 + 64) >> 7
uaddw v0.8h, v0.8h, v5.8b
uaddw v1.8h, v1.8h, v18.8b
uaddw v2.8h, v2.8h, v22.8b
uaddw v3.8h, v3.8h, v25.8b
sqxtun v0.8b, v0.8h
sqxtun v1.8b, v1.8h
sqxtun v2.8b, v2.8h
sqxtun v3.8b, v3.8h
st1 {v0.8b}, [x3], x1
st1 {v1.8b}, [x3], x1
st1 {v2.8b}, [x3], x1
st1 {v3.8b}, [x3]
ret
endfunc
// VC-1 4x8 inverse transform
// On entry:
// x0 -> array of 8-bit samples, in row-major order
// x1 = row stride for 8-bit sample array
// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
// On exit:
// array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_4x8_neon, export=1
mov x3, #16
ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector
mov x4, x0
ld1 {v1.d}[0], [x2], x3 // 00 01 02 03
ld1 {v2.d}[0], [x2], x3 // 10 11 12 13
ld1 {v3.d}[0], [x2], x3 // 20 21 22 23
ld1 {v4.d}[0], [x2], x3 // 30 31 32 33
ld1 {v1.d}[1], [x2], x3 // 40 41 42 43
ld1 {v2.d}[1], [x2], x3 // 50 51 52 53
ld1 {v3.d}[1], [x2], x3 // 60 61 62 63
ld1 {v4.d}[1], [x2] // 70 71 72 73
ld1 {v5.s}[0], [x0], x1
ld1 {v6.s}[0], [x0], x1
ld1 {v7.s}[0], [x0], x1
trn2 v16.8h, v1.8h, v2.8h // 01 11 03 13 41 51 43 53
trn1 v1.8h, v1.8h, v2.8h // 00 10 02 12 40 50 42 52
trn2 v2.8h, v3.8h, v4.8h // 21 31 23 33 61 71 63 73
trn1 v3.8h, v3.8h, v4.8h // 20 30 22 32 60 70 62 72
ld1 {v4.s}[0], [x0], x1
trn2 v17.4s, v16.4s, v2.4s // 03 13 23 33 43 53 63 73
trn1 v18.4s, v1.4s, v3.4s // 00 10 20 30 40 50 60 70
trn1 v2.4s, v16.4s, v2.4s // 01 11 21 31 41 51 61 71
mul v16.8h, v17.8h, v0.h[4] // 10/2 * src[3]
ld1 {v5.s}[1], [x0], x1
mul v17.8h, v17.8h, v0.h[5] // 22/2 * src[3]
ld1 {v6.s}[1], [x0], x1
trn2 v1.4s, v1.4s, v3.4s // 02 12 22 32 42 52 62 72
mul v3.8h, v18.8h, v0.h[6] // 17 * src[0]
ld1 {v7.s}[1], [x0], x1
mul v1.8h, v1.8h, v0.h[6] // 17 * src[2]
ld1 {v4.s}[1], [x0]
mla v16.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[1] + 10/2 * src[3]
mls v17.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[1] + 22/2 * src[3]
add v2.8h, v3.8h, v1.8h // t1 = 17 * src[0] + 17 * src[2]
sub v1.8h, v3.8h, v1.8h // t2 = 17 * src[0] - 17 * src[2]
neg v3.8h, v16.8h // -t3/2
ssra v16.8h, v2.8h, #1 // (t1 + t3) >> 1
neg v18.8h, v17.8h // -t4/2
ssra v17.8h, v1.8h, #1 // (t2 + t4) >> 1
ssra v3.8h, v2.8h, #1 // (t1 - t3) >> 1
ssra v18.8h, v1.8h, #1 // (t2 - t4) >> 1
srshr v1.8h, v16.8h, #2 // (t1 + t3 + 64) >> 3
srshr v2.8h, v17.8h, #2 // (t2 + t4 + 64) >> 3
srshr v3.8h, v3.8h, #2 // (t1 - t3 + 64) >> 3
srshr v16.8h, v18.8h, #2 // (t2 - t4 + 64) >> 3
trn2 v17.8h, v2.8h, v3.8h // 12 13 32 33 52 53 72 73
trn2 v18.8h, v1.8h, v16.8h // 10 11 30 31 50 51 70 71
trn1 v1.8h, v1.8h, v16.8h // 00 01 20 21 40 41 60 61
trn1 v2.8h, v2.8h, v3.8h // 02 03 22 23 42 43 62 63
trn1 v3.4s, v18.4s, v17.4s // 10 11 12 13 50 51 52 53
trn2 v16.4s, v18.4s, v17.4s // 30 31 32 33 70 71 72 73
trn1 v17.4s, v1.4s, v2.4s // 00 01 02 03 40 41 42 43
mov d18, v3.d[1] // 50 51 52 53
shl v19.4h, v3.4h, #4 // 16 * src[8]
mov d20, v16.d[1] // 70 71 72 73
shl v21.4h, v16.4h, #4 // 16 * src[24]
mov d22, v17.d[1] // 40 41 42 43
shl v23.4h, v3.4h, #2 // 4 * src[8]
shl v24.4h, v18.4h, #4 // 16 * src[40]
shl v25.4h, v20.4h, #4 // 16 * src[56]
shl v26.4h, v18.4h, #2 // 4 * src[40]
trn2 v1.4s, v1.4s, v2.4s // 20 21 22 23 60 61 62 63
ssra v24.4h, v21.4h, #2 // 4 * src[24] + 16 * src[40]
sub v2.4h, v25.4h, v23.4h // - 4 * src[8] + 16 * src[56]
shl v17.4h, v17.4h, #2 // 8/2 * src[0]
sub v21.4h, v21.4h, v26.4h // 16 * src[24] - 4 * src[40]
shl v22.4h, v22.4h, #2 // 8/2 * src[32]
mov d23, v1.d[1] // 60 61 62 63
ssra v19.4h, v25.4h, #2 // 16 * src[8] + 4 * src[56]
mul v25.4h, v1.4h, v0.h[0] // 6/2 * src[16]
shl v1.4h, v1.4h, #3 // 16/2 * src[16]
mls v24.4h, v3.4h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
ssra v17.4h, v17.4h, #1 // 12/2 * src[0]
mls v21.4h, v3.4h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
ssra v22.4h, v22.4h, #1 // 12/2 * src[32]
mla v2.4h, v16.4h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
shl v3.4h, v23.4h, #3 // 16/2 * src[48]
mla v19.4h, v16.4h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
mla v1.4h, v23.4h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
mla v24.4h, v20.4h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
add v16.4h, v17.4h, v22.4h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
sub v3.4h, v25.4h, v3.4h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
sub v17.4h, v17.4h, v22.4h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
mls v21.4h, v20.4h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
mla v19.4h, v18.4h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
add v20.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2
mls v2.4h, v18.4h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
sub v0.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2
add v18.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2
sub v22.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2
neg v23.4h, v24.4h // +t2
sub v25.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2
add v3.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2
neg v17.4h, v21.4h // +t3
sub v26.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2
add v1.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2
neg v16.4h, v19.4h // -t1
neg v27.4h, v2.4h // +t4
ssra v20.4h, v19.4h, #1 // (t5 + t1) >> 1
srsra v0.4h, v2.4h, #1 // (t8 - t4 + 1) >> 1
ssra v18.4h, v23.4h, #1 // (t6 + t2) >> 1
srsra v22.4h, v21.4h, #1 // (t7 - t3 + 1) >> 1
ssra v25.4h, v17.4h, #1 // (t7 + t3) >> 1
srsra v3.4h, v24.4h, #1 // (t6 - t2 + 1) >> 1
ssra v26.4h, v27.4h, #1 // (t8 + t4) >> 1
srsra v1.4h, v16.4h, #1 // (t5 - t1 + 1) >> 1
trn1 v0.2d, v20.2d, v0.2d
trn1 v2.2d, v18.2d, v22.2d
trn1 v3.2d, v25.2d, v3.2d
trn1 v1.2d, v26.2d, v1.2d
srshr v0.8h, v0.8h, #6 // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
srshr v2.8h, v2.8h, #6 // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
srshr v3.8h, v3.8h, #6 // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
srshr v1.8h, v1.8h, #6 // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
uaddw v0.8h, v0.8h, v5.8b
uaddw v2.8h, v2.8h, v6.8b
uaddw v3.8h, v3.8h, v7.8b
uaddw v1.8h, v1.8h, v4.8b
sqxtun v0.8b, v0.8h
sqxtun v2.8b, v2.8h
sqxtun v3.8b, v3.8h
sqxtun v1.8b, v1.8h
st1 {v0.s}[0], [x4], x1
st1 {v2.s}[0], [x4], x1
st1 {v3.s}[0], [x4], x1
st1 {v1.s}[0], [x4], x1
st1 {v0.s}[1], [x4], x1
st1 {v2.s}[1], [x4], x1
st1 {v3.s}[1], [x4], x1
st1 {v1.s}[1], [x4]
ret
endfunc
// VC-1 4x4 inverse transform
// On entry:
// x0 -> array of 8-bit samples, in row-major order
// x1 = row stride for 8-bit sample array
// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
// On exit:
// array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_4x4_neon, export=1
mov x3, #16
ldr d0, .Lcoeffs_it4
mov x4, x0
ld1 {v1.d}[0], [x2], x3 // 00 01 02 03
ld1 {v2.d}[0], [x2], x3 // 10 11 12 13
ld1 {v3.d}[0], [x2], x3 // 20 21 22 23
ld1 {v4.d}[0], [x2] // 30 31 32 33
ld1 {v5.s}[0], [x0], x1
ld1 {v5.s}[1], [x0], x1
ld1 {v6.s}[0], [x0], x1
trn2 v7.4h, v1.4h, v2.4h // 01 11 03 13
trn1 v1.4h, v1.4h, v2.4h // 00 10 02 12
ld1 {v6.s}[1], [x0]
trn2 v2.4h, v3.4h, v4.4h // 21 31 23 33
trn1 v3.4h, v3.4h, v4.4h // 20 30 22 32
trn2 v4.2s, v7.2s, v2.2s // 03 13 23 33
trn1 v16.2s, v1.2s, v3.2s // 00 10 20 30
trn1 v2.2s, v7.2s, v2.2s // 01 11 21 31
trn2 v1.2s, v1.2s, v3.2s // 02 12 22 32
mul v3.4h, v4.4h, v0.h[0] // 10/2 * src[3]
mul v4.4h, v4.4h, v0.h[1] // 22/2 * src[3]
mul v7.4h, v16.4h, v0.h[2] // 17 * src[0]
mul v1.4h, v1.4h, v0.h[2] // 17 * src[2]
mla v3.4h, v2.4h, v0.h[1] // t3/2 = 22/2 * src[1] + 10/2 * src[3]
mls v4.4h, v2.4h, v0.h[0] // t4/2 = - 10/2 * src[1] + 22/2 * src[3]
add v2.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[2]
sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[2]
neg v7.4h, v3.4h // -t3/2
neg v16.4h, v4.4h // -t4/2
ssra v3.4h, v2.4h, #1 // (t1 + t3) >> 1
ssra v4.4h, v1.4h, #1 // (t2 + t4) >> 1
ssra v16.4h, v1.4h, #1 // (t2 - t4) >> 1
ssra v7.4h, v2.4h, #1 // (t1 - t3) >> 1
srshr v1.4h, v3.4h, #2 // (t1 + t3 + 64) >> 3
srshr v2.4h, v4.4h, #2 // (t2 + t4 + 64) >> 3
srshr v3.4h, v16.4h, #2 // (t2 - t4 + 64) >> 3
srshr v4.4h, v7.4h, #2 // (t1 - t3 + 64) >> 3
trn2 v7.4h, v1.4h, v3.4h // 10 11 30 31
trn1 v1.4h, v1.4h, v3.4h // 00 01 20 21
trn2 v3.4h, v2.4h, v4.4h // 12 13 32 33
trn1 v2.4h, v2.4h, v4.4h // 02 03 22 23
trn2 v4.2s, v7.2s, v3.2s // 30 31 32 33
trn1 v16.2s, v1.2s, v2.2s // 00 01 02 03
trn1 v3.2s, v7.2s, v3.2s // 10 11 12 13
trn2 v1.2s, v1.2s, v2.2s // 20 21 22 23
mul v2.4h, v4.4h, v0.h[1] // 22/2 * src[24]
mul v4.4h, v4.4h, v0.h[0] // 10/2 * src[24]
mul v7.4h, v16.4h, v0.h[2] // 17 * src[0]
mul v1.4h, v1.4h, v0.h[2] // 17 * src[16]
mls v2.4h, v3.4h, v0.h[0] // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
mla v4.4h, v3.4h, v0.h[1] // t3/2 = 22/2 * src[8] + 10/2 * src[24]
add v0.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[16]
sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[16]
neg v3.4h, v2.4h // -t4/2
neg v7.4h, v4.4h // -t3/2
ssra v4.4h, v0.4h, #1 // (t1 + t3) >> 1
ssra v3.4h, v1.4h, #1 // (t2 - t4) >> 1
ssra v2.4h, v1.4h, #1 // (t2 + t4) >> 1
ssra v7.4h, v0.4h, #1 // (t1 - t3) >> 1
trn1 v0.2d, v4.2d, v3.2d
trn1 v1.2d, v2.2d, v7.2d
srshr v0.8h, v0.8h, #6 // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
srshr v1.8h, v1.8h, #6 // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
uaddw v0.8h, v0.8h, v5.8b
uaddw v1.8h, v1.8h, v6.8b
sqxtun v0.8b, v0.8h
sqxtun v1.8b, v1.8h
st1 {v0.s}[0], [x4], x1
st1 {v0.s}[1], [x4], x1
st1 {v1.s}[0], [x4], x1
st1 {v1.s}[1], [x4]
ret
endfunc
// VC-1 8x8 inverse transform, DC case
// On entry:
// x0 -> array of 8-bit samples, in row-major order
// x1 = row stride for 8-bit sample array
// x2 -> 16-bit inverse transform DC coefficient
// On exit:
// array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_8x8_dc_neon, export=1
ldrsh w2, [x2]
mov x3, x0
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x0], x1
ld1 {v2.8b}, [x0], x1
add w2, w2, w2, lsl #1
ld1 {v3.8b}, [x0], x1
ld1 {v4.8b}, [x0], x1
add w2, w2, #1
ld1 {v5.8b}, [x0], x1
asr w2, w2, #1
ld1 {v6.8b}, [x0], x1
add w2, w2, w2, lsl #1
ld1 {v7.8b}, [x0]
add w0, w2, #16
asr w0, w0, #5
dup v16.8h, w0
uaddw v0.8h, v16.8h, v0.8b
uaddw v1.8h, v16.8h, v1.8b
uaddw v2.8h, v16.8h, v2.8b
uaddw v3.8h, v16.8h, v3.8b
uaddw v4.8h, v16.8h, v4.8b
uaddw v5.8h, v16.8h, v5.8b
sqxtun v0.8b, v0.8h
uaddw v6.8h, v16.8h, v6.8b
sqxtun v1.8b, v1.8h
uaddw v7.8h, v16.8h, v7.8b
sqxtun v2.8b, v2.8h
sqxtun v3.8b, v3.8h
sqxtun v4.8b, v4.8h
st1 {v0.8b}, [x3], x1
sqxtun v0.8b, v5.8h
st1 {v1.8b}, [x3], x1
sqxtun v1.8b, v6.8h
st1 {v2.8b}, [x3], x1
sqxtun v2.8b, v7.8h
st1 {v3.8b}, [x3], x1
st1 {v4.8b}, [x3], x1
st1 {v0.8b}, [x3], x1
st1 {v1.8b}, [x3], x1
st1 {v2.8b}, [x3]
ret
endfunc
// VC-1 8x4 inverse transform, DC case
// On entry:
// x0 -> array of 8-bit samples, in row-major order
// x1 = row stride for 8-bit sample array
// x2 -> 16-bit inverse transform DC coefficient
// On exit:
// array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_8x4_dc_neon, export=1
ldrsh w2, [x2]
mov x3, x0
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x0], x1
ld1 {v2.8b}, [x0], x1
add w2, w2, w2, lsl #1
ld1 {v3.8b}, [x0]
add w0, w2, #1
asr w0, w0, #1
add w0, w0, w0, lsl #4
add w0, w0, #64
asr w0, w0, #7
dup v4.8h, w0
uaddw v0.8h, v4.8h, v0.8b
uaddw v1.8h, v4.8h, v1.8b
uaddw v2.8h, v4.8h, v2.8b
uaddw v3.8h, v4.8h, v3.8b
sqxtun v0.8b, v0.8h
sqxtun v1.8b, v1.8h
sqxtun v2.8b, v2.8h
sqxtun v3.8b, v3.8h
st1 {v0.8b}, [x3], x1
st1 {v1.8b}, [x3], x1
st1 {v2.8b}, [x3], x1
st1 {v3.8b}, [x3]
ret
endfunc
// VC-1 4x8 inverse transform, DC case
// On entry:
// x0 -> array of 8-bit samples, in row-major order
// x1 = row stride for 8-bit sample array
// x2 -> 16-bit inverse transform DC coefficient
// On exit:
// array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_4x8_dc_neon, export=1
ldrsh w2, [x2]
mov x3, x0
ld1 {v0.s}[0], [x0], x1
ld1 {v1.s}[0], [x0], x1
ld1 {v2.s}[0], [x0], x1
add w2, w2, w2, lsl #4
ld1 {v3.s}[0], [x0], x1
add w2, w2, #4
asr w2, w2, #3
add w2, w2, w2, lsl #1
ld1 {v0.s}[1], [x0], x1
add w2, w2, #16
asr w2, w2, #5
dup v4.8h, w2
ld1 {v1.s}[1], [x0], x1
ld1 {v2.s}[1], [x0], x1
ld1 {v3.s}[1], [x0]
uaddw v0.8h, v4.8h, v0.8b
uaddw v1.8h, v4.8h, v1.8b
uaddw v2.8h, v4.8h, v2.8b
uaddw v3.8h, v4.8h, v3.8b
sqxtun v0.8b, v0.8h
sqxtun v1.8b, v1.8h
sqxtun v2.8b, v2.8h
sqxtun v3.8b, v3.8h
st1 {v0.s}[0], [x3], x1
st1 {v1.s}[0], [x3], x1
st1 {v2.s}[0], [x3], x1
st1 {v3.s}[0], [x3], x1
st1 {v0.s}[1], [x3], x1
st1 {v1.s}[1], [x3], x1
st1 {v2.s}[1], [x3], x1
st1 {v3.s}[1], [x3]
ret
endfunc
// VC-1 4x4 inverse transform, DC case
// On entry:
// x0 -> array of 8-bit samples, in row-major order
// x1 = row stride for 8-bit sample array
// x2 -> 16-bit inverse transform DC coefficient
// On exit:
// array at x0 updated by saturated addition of (narrowed) transformed block
function ff_vc1_inv_trans_4x4_dc_neon, export=1
ldrsh w2, [x2]
mov x3, x0
ld1 {v0.s}[0], [x0], x1
ld1 {v1.s}[0], [x0], x1
ld1 {v0.s}[1], [x0], x1
add w2, w2, w2, lsl #4
ld1 {v1.s}[1], [x0]
add w0, w2, #4
asr w0, w0, #3
add w0, w0, w0, lsl #4
add w0, w0, #64
asr w0, w0, #7
dup v2.8h, w0
uaddw v0.8h, v2.8h, v0.8b
uaddw v1.8h, v2.8h, v1.8b
sqxtun v0.8b, v0.8h
sqxtun v1.8b, v1.8h
st1 {v0.s}[0], [x3], x1
st1 {v1.s}[0], [x3], x1
st1 {v0.s}[1], [x3], x1
st1 {v1.s}[1], [x3]
ret
endfunc
.align 5
.Lcoeffs_it8:
.quad 0x000F00090003
.Lcoeffs_it4:
.quad 0x0011000B0005
.Lcoeffs:
.quad 0x00050002