mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-13 21:28:01 +02:00
arm/aarch64: vp9: Fix vertical alignment
Align the second/third operands as they usually are. Due to the wildly varying sizes of the written-out operands in aarch64 assembly, the column alignment is usually not as clear as in arm assembly. Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
827a05eaa9
commit
7995ebfad1
@ -380,7 +380,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
|
|||||||
.ifc \txfm1\()_\txfm2,idct_idct
|
.ifc \txfm1\()_\txfm2,idct_idct
|
||||||
movrel x4, idct_coeffs
|
movrel x4, idct_coeffs
|
||||||
.else
|
.else
|
||||||
movrel x4, iadst8_coeffs
|
movrel x4, iadst8_coeffs
|
||||||
ld1 {v1.8h}, [x4], #16
|
ld1 {v1.8h}, [x4], #16
|
||||||
.endif
|
.endif
|
||||||
ld1 {v0.8h}, [x4]
|
ld1 {v0.8h}, [x4]
|
||||||
@ -480,23 +480,23 @@ itxfm_func8x8 iadst, iadst
|
|||||||
|
|
||||||
|
|
||||||
function idct16x16_dc_add_neon
|
function idct16x16_dc_add_neon
|
||||||
movrel x4, idct_coeffs
|
movrel x4, idct_coeffs
|
||||||
ld1 {v0.4h}, [x4]
|
ld1 {v0.4h}, [x4]
|
||||||
|
|
||||||
movi v1.4h, #0
|
movi v1.4h, #0
|
||||||
|
|
||||||
ld1 {v2.h}[0], [x2]
|
ld1 {v2.h}[0], [x2]
|
||||||
smull v2.4s, v2.4h, v0.h[0]
|
smull v2.4s, v2.4h, v0.h[0]
|
||||||
rshrn v2.4h, v2.4s, #14
|
rshrn v2.4h, v2.4s, #14
|
||||||
smull v2.4s, v2.4h, v0.h[0]
|
smull v2.4s, v2.4h, v0.h[0]
|
||||||
rshrn v2.4h, v2.4s, #14
|
rshrn v2.4h, v2.4s, #14
|
||||||
dup v2.8h, v2.h[0]
|
dup v2.8h, v2.h[0]
|
||||||
st1 {v1.h}[0], [x2]
|
st1 {v1.h}[0], [x2]
|
||||||
|
|
||||||
srshr v2.8h, v2.8h, #6
|
srshr v2.8h, v2.8h, #6
|
||||||
|
|
||||||
mov x3, x0
|
mov x3, x0
|
||||||
mov x4, #16
|
mov x4, #16
|
||||||
1:
|
1:
|
||||||
// Loop to add the constant from v2 into all 16x16 outputs
|
// Loop to add the constant from v2 into all 16x16 outputs
|
||||||
subs x4, x4, #2
|
subs x4, x4, #2
|
||||||
@ -869,7 +869,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
|
|||||||
.ifc \txfm1,idct
|
.ifc \txfm1,idct
|
||||||
ld1 {v0.8h,v1.8h}, [x10]
|
ld1 {v0.8h,v1.8h}, [x10]
|
||||||
.endif
|
.endif
|
||||||
mov x9, #32
|
mov x9, #32
|
||||||
|
|
||||||
.ifc \txfm1\()_\txfm2,idct_idct
|
.ifc \txfm1\()_\txfm2,idct_idct
|
||||||
cmp w3, #10
|
cmp w3, #10
|
||||||
@ -1046,10 +1046,10 @@ idct16_partial quarter
|
|||||||
idct16_partial half
|
idct16_partial half
|
||||||
|
|
||||||
function idct32x32_dc_add_neon
|
function idct32x32_dc_add_neon
|
||||||
movrel x4, idct_coeffs
|
movrel x4, idct_coeffs
|
||||||
ld1 {v0.4h}, [x4]
|
ld1 {v0.4h}, [x4]
|
||||||
|
|
||||||
movi v1.4h, #0
|
movi v1.4h, #0
|
||||||
|
|
||||||
ld1 {v2.h}[0], [x2]
|
ld1 {v2.h}[0], [x2]
|
||||||
smull v2.4s, v2.4h, v0.h[0]
|
smull v2.4s, v2.4h, v0.h[0]
|
||||||
@ -1059,10 +1059,10 @@ function idct32x32_dc_add_neon
|
|||||||
dup v2.8h, v2.h[0]
|
dup v2.8h, v2.h[0]
|
||||||
st1 {v1.h}[0], [x2]
|
st1 {v1.h}[0], [x2]
|
||||||
|
|
||||||
srshr v0.8h, v2.8h, #6
|
srshr v0.8h, v2.8h, #6
|
||||||
|
|
||||||
mov x3, x0
|
mov x3, x0
|
||||||
mov x4, #32
|
mov x4, #32
|
||||||
1:
|
1:
|
||||||
// Loop to add the constant v0 into all 32x32 outputs
|
// Loop to add the constant v0 into all 32x32 outputs
|
||||||
subs x4, x4, #2
|
subs x4, x4, #2
|
||||||
@ -1230,7 +1230,7 @@ endfunc
|
|||||||
// x9 = double input stride
|
// x9 = double input stride
|
||||||
function idct32_1d_8x32_pass1\suffix\()_neon
|
function idct32_1d_8x32_pass1\suffix\()_neon
|
||||||
mov x14, x30
|
mov x14, x30
|
||||||
movi v2.8h, #0
|
movi v2.8h, #0
|
||||||
|
|
||||||
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
|
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
|
||||||
.ifb \suffix
|
.ifb \suffix
|
||||||
@ -1295,7 +1295,7 @@ function idct32_1d_8x32_pass1\suffix\()_neon
|
|||||||
.endif
|
.endif
|
||||||
add x2, x2, #64
|
add x2, x2, #64
|
||||||
|
|
||||||
movi v2.8h, #0
|
movi v2.8h, #0
|
||||||
// v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
|
// v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
|
||||||
.ifb \suffix
|
.ifb \suffix
|
||||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||||
|
@ -530,7 +530,7 @@ function idct16x16_dc_add_neon
|
|||||||
movrel r12, idct_coeffs
|
movrel r12, idct_coeffs
|
||||||
vld1.16 {d0}, [r12,:64]
|
vld1.16 {d0}, [r12,:64]
|
||||||
|
|
||||||
vmov.i16 q2, #0
|
vmov.i16 q2, #0
|
||||||
|
|
||||||
vld1.16 {d16[]}, [r2,:16]
|
vld1.16 {d16[]}, [r2,:16]
|
||||||
vmull.s16 q8, d16, d0[0]
|
vmull.s16 q8, d16, d0[0]
|
||||||
@ -793,7 +793,7 @@ function \txfm\()16_1d_4x16_pass1_neon
|
|||||||
push {lr}
|
push {lr}
|
||||||
|
|
||||||
mov r12, #32
|
mov r12, #32
|
||||||
vmov.s16 q2, #0
|
vmov.s16 q2, #0
|
||||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||||
vld1.16 {d\i}, [r2,:64]
|
vld1.16 {d\i}, [r2,:64]
|
||||||
vst1.16 {d4}, [r2,:64], r12
|
vst1.16 {d4}, [r2,:64], r12
|
||||||
@ -1142,7 +1142,7 @@ function idct32x32_dc_add_neon
|
|||||||
movrel r12, idct_coeffs
|
movrel r12, idct_coeffs
|
||||||
vld1.16 {d0}, [r12,:64]
|
vld1.16 {d0}, [r12,:64]
|
||||||
|
|
||||||
vmov.i16 q2, #0
|
vmov.i16 q2, #0
|
||||||
|
|
||||||
vld1.16 {d16[]}, [r2,:16]
|
vld1.16 {d16[]}, [r2,:16]
|
||||||
vmull.s16 q8, d16, d0[0]
|
vmull.s16 q8, d16, d0[0]
|
||||||
@ -1330,7 +1330,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon
|
|||||||
|
|
||||||
@ Double stride of the input, since we only read every other line
|
@ Double stride of the input, since we only read every other line
|
||||||
mov r12, #128
|
mov r12, #128
|
||||||
vmov.s16 d4, #0
|
vmov.s16 d4, #0
|
||||||
|
|
||||||
@ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
|
@ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
|
||||||
.ifb \suffix
|
.ifb \suffix
|
||||||
@ -1394,7 +1394,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon
|
|||||||
.endif
|
.endif
|
||||||
add r2, r2, #64
|
add r2, r2, #64
|
||||||
|
|
||||||
vmov.s16 d8, #0
|
vmov.s16 d8, #0
|
||||||
@ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
|
@ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
|
||||||
.ifb \suffix
|
.ifb \suffix
|
||||||
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
|
||||||
@ -1533,9 +1533,9 @@ function idct32_1d_4x32_pass2\suffix\()_neon
|
|||||||
.endif
|
.endif
|
||||||
vld1.32 {d12[]}, [r0,:32], r1
|
vld1.32 {d12[]}, [r0,:32], r1
|
||||||
vld1.32 {d12[1]}, [r0,:32], r1
|
vld1.32 {d12[1]}, [r0,:32], r1
|
||||||
vrshr.s16 q4, q4, #6
|
vrshr.s16 q4, q4, #6
|
||||||
vld1.32 {d13[]}, [r0,:32], r1
|
vld1.32 {d13[]}, [r0,:32], r1
|
||||||
vrshr.s16 q5, q5, #6
|
vrshr.s16 q5, q5, #6
|
||||||
vld1.32 {d13[1]}, [r0,:32], r1
|
vld1.32 {d13[1]}, [r0,:32], r1
|
||||||
sub r0, r0, r1, lsl #2
|
sub r0, r0, r1, lsl #2
|
||||||
vaddw.u8 q4, q4, d12
|
vaddw.u8 q4, q4, d12
|
||||||
|
@ -828,7 +828,7 @@ function ff_vp9_loop_filter_v_16_16_neon, export=1
|
|||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
function vp9_loop_filter_h_16_neon
|
function vp9_loop_filter_h_16_neon
|
||||||
sub r12, r0, #8
|
sub r12, r0, #8
|
||||||
vld1.8 {d16}, [r12,:64], r1
|
vld1.8 {d16}, [r12,:64], r1
|
||||||
vld1.8 {d24}, [r0, :64], r1
|
vld1.8 {d24}, [r0, :64], r1
|
||||||
vld1.8 {d17}, [r12,:64], r1
|
vld1.8 {d17}, [r12,:64], r1
|
||||||
|
Loading…
Reference in New Issue
Block a user