mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
aarch64: Use ret x<n> instead of br x<n> where possible
Change AArch64 assembly code to use:
    ret x<n>
instead of:
    br x<n>

"ret x<n>" is already used in a lot of places, so this patch makes it consistent across the code base. This does not change behavior or performance.

In addition, this change reduces the number of landing pads needed in a subsequent patch to support the Armv8.5-A Branch Target Identification (BTI) security feature.

Signed-off-by: Jonathan Wright <jonathan.wright@arm.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
20c66fe2f9
commit
6f04cf54f5
@ -58,7 +58,7 @@ endconst
|
||||
.endm
|
||||
|
||||
.macro idct_end
|
||||
br x10
|
||||
ret x10
|
||||
.endm
|
||||
|
||||
.macro smull1 a, b, c
|
||||
|
@ -1040,7 +1040,7 @@ function \txfm\()16_1d_4x16_pass1_neon
|
||||
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
|
||||
store \i, x0, #16
|
||||
.endr
|
||||
br x14
|
||||
ret x14
|
||||
1:
|
||||
// Special case: For the last input column (x1 == 12),
|
||||
// which would be stored as the last row in the temp buffer,
|
||||
@ -1068,7 +1068,7 @@ function \txfm\()16_1d_4x16_pass1_neon
|
||||
mov v29.16b, v17.16b
|
||||
mov v30.16b, v18.16b
|
||||
mov v31.16b, v19.16b
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
|
||||
// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
|
||||
@ -1098,7 +1098,7 @@ function \txfm\()16_1d_4x16_pass2_neon
|
||||
load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
|
||||
load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
|
||||
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
@ -1208,7 +1208,7 @@ function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
|
||||
ldp d12, d13, [sp], 0x10
|
||||
ldp d14, d15, [sp], 0x10
|
||||
.endif
|
||||
br x15
|
||||
ret x15
|
||||
endfunc
|
||||
|
||||
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
|
||||
@ -1264,7 +1264,7 @@ function idct16_1d_4x16_pass1_quarter_neon
|
||||
st1 {v23.4s}, [x0], #16
|
||||
st1 {v27.4s}, [x0], #16
|
||||
st1 {v31.4s}, [x0], #16
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
|
||||
function idct16_1d_4x16_pass2_quarter_neon
|
||||
@ -1286,7 +1286,7 @@ function idct16_1d_4x16_pass2_quarter_neon
|
||||
load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
|
||||
load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
|
||||
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
|
||||
function idct16_1d_4x16_pass1_half_neon
|
||||
@ -1313,7 +1313,7 @@ function idct16_1d_4x16_pass1_half_neon
|
||||
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
|
||||
store \i, x0, #16
|
||||
.endr
|
||||
br x14
|
||||
ret x14
|
||||
1:
|
||||
// Special case: For the second input column (r1 == 4),
|
||||
// which would be stored as the second row in the temp buffer,
|
||||
@ -1341,7 +1341,7 @@ function idct16_1d_4x16_pass1_half_neon
|
||||
mov v21.16b, v17.16b
|
||||
mov v22.16b, v18.16b
|
||||
mov v23.16b, v19.16b
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
|
||||
function idct16_1d_4x16_pass2_half_neon
|
||||
@ -1364,7 +1364,7 @@ function idct16_1d_4x16_pass2_half_neon
|
||||
load_add_store v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
|
||||
load_add_store v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
|
||||
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
|
||||
.macro idct16_partial size
|
||||
@ -1390,7 +1390,7 @@ function idct16x16_\size\()_add_16_neon
|
||||
|
||||
add sp, sp, #1024
|
||||
ldp d8, d9, [sp], 0x10
|
||||
br x15
|
||||
ret x15
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
@ -1729,7 +1729,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon
|
||||
store_rev v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
|
||||
store_rev v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
|
||||
.purgem store_rev
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
|
||||
// This is mostly the same as 4x32_pass1, but without the transpose,
|
||||
@ -1849,7 +1849,7 @@ function idct32_1d_4x32_pass2\suffix\()_neon
|
||||
load_acc_store v24.4s, v25.4s, v26.4s, v27.4s, 1
|
||||
load_acc_store v28.4s, v29.4s, v30.4s, v31.4s, 1
|
||||
.purgem load_acc_store
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
@ -1943,7 +1943,7 @@ function vp9_idct_idct_32x32_add_16_neon
|
||||
ldp d10, d11, [sp], 0x10
|
||||
ldp d8, d9, [sp], 0x10
|
||||
|
||||
br x15
|
||||
ret x15
|
||||
endfunc
|
||||
|
||||
function ff_vp9_idct_idct_32x32_add_10_neon, export=1
|
||||
@ -2009,7 +2009,7 @@ function idct32x32_\size\()_add_16_neon
|
||||
ldp d10, d11, [sp], 0x10
|
||||
ldp d8, d9, [sp], 0x10
|
||||
|
||||
br x15
|
||||
ret x15
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
@ -787,7 +787,7 @@ function \txfm\()16_1d_8x16_pass1_neon
|
||||
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
|
||||
store \i, x0, #16
|
||||
.endr
|
||||
br x14
|
||||
ret x14
|
||||
1:
|
||||
// Special case: For the last input column (x1 == 8),
|
||||
// which would be stored as the last row in the temp buffer,
|
||||
@ -806,7 +806,7 @@ function \txfm\()16_1d_8x16_pass1_neon
|
||||
mov v29.16b, v21.16b
|
||||
mov v30.16b, v22.16b
|
||||
mov v31.16b, v23.16b
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
|
||||
// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
|
||||
@ -834,7 +834,7 @@ function \txfm\()16_1d_8x16_pass2_neon
|
||||
load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
|
||||
load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
|
||||
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
@ -925,7 +925,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
|
||||
ldp d12, d13, [sp], 0x10
|
||||
ldp d14, d15, [sp], 0x10
|
||||
.endif
|
||||
br x15
|
||||
ret x15
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
@ -960,7 +960,7 @@ function idct16_1d_8x16_pass1_quarter_neon
|
||||
.irp i, 24, 25, 26, 27
|
||||
store \i, x0, x9
|
||||
.endr
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
|
||||
function idct16_1d_8x16_pass2_quarter_neon
|
||||
@ -978,7 +978,7 @@ function idct16_1d_8x16_pass2_quarter_neon
|
||||
load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
|
||||
load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
|
||||
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
|
||||
function idct16_1d_8x16_pass1_half_neon
|
||||
@ -1003,7 +1003,7 @@ function idct16_1d_8x16_pass1_half_neon
|
||||
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
|
||||
store \i, x0, x9
|
||||
.endr
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
|
||||
function idct16_1d_8x16_pass2_half_neon
|
||||
@ -1021,7 +1021,7 @@ function idct16_1d_8x16_pass2_half_neon
|
||||
load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
|
||||
load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
|
||||
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
|
||||
.macro idct16_partial size
|
||||
@ -1038,7 +1038,7 @@ function idct16x16_\size\()_add_neon
|
||||
.endr
|
||||
|
||||
add sp, sp, #512
|
||||
br x15
|
||||
ret x15
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
@ -1349,7 +1349,7 @@ function idct32_1d_8x32_pass1\suffix\()_neon
|
||||
store_rev v25.8h, v17.8h
|
||||
store_rev v24.8h, v16.8h
|
||||
.purgem store_rev
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
|
||||
// This is mostly the same as 8x32_pass1, but without the transpose,
|
||||
@ -1466,7 +1466,7 @@ function idct32_1d_8x32_pass2\suffix\()_neon
|
||||
load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1
|
||||
load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1
|
||||
.purgem load_acc_store
|
||||
br x14
|
||||
ret x14
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
@ -1547,7 +1547,7 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
|
||||
ldp d8, d9, [sp], 0x10
|
||||
ldp d10, d11, [sp], 0x10
|
||||
|
||||
br x15
|
||||
ret x15
|
||||
endfunc
|
||||
|
||||
.macro idct32_partial size
|
||||
@ -1572,7 +1572,7 @@ function idct32x32_\size\()_add_neon
|
||||
ldp d8, d9, [sp], 0x10
|
||||
ldp d10, d11, [sp], 0x10
|
||||
|
||||
br x15
|
||||
ret x15
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
@ -57,7 +57,7 @@
|
||||
mov x12, v4.d[1]
|
||||
adds x11, x11, x12
|
||||
b.ne 1f
|
||||
br x10
|
||||
ret x10
|
||||
1:
|
||||
|
||||
.if \wd >= 8
|
||||
@ -193,7 +193,7 @@
|
||||
b.eq 6f
|
||||
.else
|
||||
b.ne 1f
|
||||
br x13
|
||||
ret x13
|
||||
1:
|
||||
.endif
|
||||
|
||||
@ -252,7 +252,7 @@
|
||||
b.ne 1f
|
||||
// If no pixels needed flat8in nor flat8out, jump to a
|
||||
// writeout of the inner 4 pixels
|
||||
br x14
|
||||
ret x14
|
||||
1:
|
||||
|
||||
mov x11, v7.d[0]
|
||||
@ -260,7 +260,7 @@
|
||||
adds x11, x11, x12
|
||||
b.ne 1f
|
||||
// If no pixels need flat8out, jump to a writeout of the inner 6 pixels
|
||||
br x15
|
||||
ret x15
|
||||
|
||||
1:
|
||||
// flat8out
|
||||
@ -434,7 +434,7 @@ function ff_\func\()_\bpp\()_neon, export=1
|
||||
ldp d10, d11, [sp], 0x10
|
||||
ldp d12, d13, [sp], 0x10
|
||||
ldp d14, d15, [sp], 0x10
|
||||
br x16
|
||||
ret x16
|
||||
.else
|
||||
b \func\()_16_neon
|
||||
.endif
|
||||
@ -474,7 +474,7 @@ function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
|
||||
ldp d12, d13, [sp], 0x10
|
||||
ldp d14, d15, [sp], 0x10
|
||||
.endif
|
||||
br x16
|
||||
ret x16
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
@ -508,7 +508,7 @@ function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
|
||||
lsl w3, w14, #\bpp - 8
|
||||
lsl w4, w15, #\bpp - 8
|
||||
bl vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
|
||||
br x16
|
||||
ret x16
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
@ -541,7 +541,7 @@ function vp9_loop_filter_v_4_8_16_neon
|
||||
st1 {v25.8h}, [x0], x1
|
||||
sub x0, x0, x1, lsl #1
|
||||
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
bpp_frontends vp9_loop_filter_v_4_8
|
||||
@ -589,7 +589,7 @@ function vp9_loop_filter_h_4_8_16_neon
|
||||
sub x0, x0, x1, lsl #3
|
||||
add x0, x0, #4
|
||||
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
bpp_frontends vp9_loop_filter_h_4_8
|
||||
@ -620,7 +620,7 @@ function vp9_loop_filter_v_8_8_16_neon
|
||||
sub x0, x0, x1, lsl #1
|
||||
sub x0, x0, x1
|
||||
|
||||
br x10
|
||||
ret x10
|
||||
6:
|
||||
sub x9, x0, x1, lsl #1
|
||||
st1 {v22.8h}, [x9], x1
|
||||
@ -628,7 +628,7 @@ function vp9_loop_filter_v_8_8_16_neon
|
||||
st1 {v23.8h}, [x9], x1
|
||||
st1 {v25.8h}, [x0], x1
|
||||
sub x0, x0, x1, lsl #1
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
bpp_frontends vp9_loop_filter_v_8_8
|
||||
@ -671,7 +671,7 @@ function vp9_loop_filter_h_8_8_16_neon
|
||||
sub x0, x0, x1, lsl #3
|
||||
add x0, x0, #8
|
||||
|
||||
br x10
|
||||
ret x10
|
||||
6:
|
||||
// If we didn't need to do the flat8in part, we use the same writeback
|
||||
// as in loop_filter_h_4_8.
|
||||
@ -688,7 +688,7 @@ function vp9_loop_filter_h_8_8_16_neon
|
||||
st1 {v25.d}[1], [x0], x1
|
||||
sub x0, x0, x1, lsl #3
|
||||
add x0, x0, #4
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
bpp_frontends vp9_loop_filter_h_8_8
|
||||
@ -743,7 +743,7 @@ function vp9_loop_filter_v_16_8_16_neon
|
||||
sub x0, x0, x1, lsl #3
|
||||
add x0, x0, x1
|
||||
|
||||
br x10
|
||||
ret x10
|
||||
8:
|
||||
add x9, x9, x1, lsl #2
|
||||
// If we didn't do the flat8out part, the output is left in the
|
||||
@ -756,7 +756,7 @@ function vp9_loop_filter_v_16_8_16_neon
|
||||
st1 {v26.8h}, [x0], x1
|
||||
sub x0, x0, x1, lsl #1
|
||||
sub x0, x0, x1
|
||||
br x10
|
||||
ret x10
|
||||
7:
|
||||
sub x9, x0, x1, lsl #1
|
||||
st1 {v22.8h}, [x9], x1
|
||||
@ -764,7 +764,7 @@ function vp9_loop_filter_v_16_8_16_neon
|
||||
st1 {v23.8h}, [x9], x1
|
||||
st1 {v25.8h}, [x0], x1
|
||||
sub x0, x0, x1, lsl #1
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
bpp_frontends vp9_loop_filter_v_16_8, push=1
|
||||
@ -821,7 +821,7 @@ function vp9_loop_filter_h_16_8_16_neon
|
||||
st1 {v31.8h}, [x0], x1
|
||||
sub x0, x0, x1, lsl #3
|
||||
|
||||
br x10
|
||||
ret x10
|
||||
8:
|
||||
// The same writeback as in loop_filter_h_8_8
|
||||
sub x9, x0, #8
|
||||
@ -838,7 +838,7 @@ function vp9_loop_filter_h_16_8_16_neon
|
||||
st1 {v27.8h}, [x0], x1
|
||||
sub x0, x0, x1, lsl #3
|
||||
add x0, x0, #8
|
||||
br x10
|
||||
ret x10
|
||||
7:
|
||||
// The same writeback as in loop_filter_h_4_8
|
||||
sub x9, x0, #4
|
||||
@ -854,7 +854,7 @@ function vp9_loop_filter_h_16_8_16_neon
|
||||
st1 {v25.d}[1], [x0], x1
|
||||
sub x0, x0, x1, lsl #3
|
||||
add x0, x0, #4
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
bpp_frontends vp9_loop_filter_h_16_8, push=1
|
||||
|
@ -399,7 +399,7 @@
|
||||
.endif
|
||||
// If no pixels needed flat8in nor flat8out, jump to a
|
||||
// writeout of the inner 4 pixels
|
||||
br x14
|
||||
ret x14
|
||||
1:
|
||||
|
||||
mov x5, v7.d[0]
|
||||
@ -411,7 +411,7 @@
|
||||
cbnz x5, 1f
|
||||
.endif
|
||||
// If no pixels need flat8out, jump to a writeout of the inner 6 pixels
|
||||
br x15
|
||||
ret x15
|
||||
|
||||
1:
|
||||
// flat8out
|
||||
@ -532,32 +532,32 @@ function vp9_loop_filter_4
|
||||
loop_filter 4, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31
|
||||
ret
|
||||
9:
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
function vp9_loop_filter_4_16b_mix_44
|
||||
loop_filter 4, .16b, 44, v16, v17, v18, v19, v28, v29, v30, v31
|
||||
ret
|
||||
9:
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
function vp9_loop_filter_8
|
||||
loop_filter 8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31
|
||||
ret
|
||||
6:
|
||||
br x13
|
||||
ret x13
|
||||
9:
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
function vp9_loop_filter_8_16b_mix
|
||||
loop_filter 8, .16b, 88, v16, v17, v18, v19, v28, v29, v30, v31
|
||||
ret
|
||||
6:
|
||||
br x13
|
||||
ret x13
|
||||
9:
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
function vp9_loop_filter_16
|
||||
@ -568,7 +568,7 @@ function vp9_loop_filter_16
|
||||
ldp d10, d11, [sp], 0x10
|
||||
ldp d12, d13, [sp], 0x10
|
||||
ldp d14, d15, [sp], 0x10
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
function vp9_loop_filter_16_16b
|
||||
@ -579,7 +579,7 @@ function vp9_loop_filter_16_16b
|
||||
ldp d10, d11, [sp], 0x10
|
||||
ldp d12, d13, [sp], 0x10
|
||||
ldp d14, d15, [sp], 0x10
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
.macro loop_filter_4
|
||||
@ -648,7 +648,7 @@ function ff_vp9_loop_filter_v_4_8_neon, export=1
|
||||
st1 {v23.8b}, [x9], x1
|
||||
st1 {v25.8b}, [x0], x1
|
||||
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
function ff_vp9_loop_filter_v_44_16_neon, export=1
|
||||
@ -672,7 +672,7 @@ function ff_vp9_loop_filter_v_44_16_neon, export=1
|
||||
st1 {v23.16b}, [x9], x1
|
||||
st1 {v25.16b}, [x0], x1
|
||||
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
function ff_vp9_loop_filter_h_4_8_neon, export=1
|
||||
@ -714,7 +714,7 @@ function ff_vp9_loop_filter_h_4_8_neon, export=1
|
||||
st1 {v25.s}[0], [x9], x1
|
||||
st1 {v25.s}[1], [x0], x1
|
||||
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
function ff_vp9_loop_filter_h_44_16_neon, export=1
|
||||
@ -766,7 +766,7 @@ function ff_vp9_loop_filter_h_44_16_neon, export=1
|
||||
st1 {v25.s}[1], [x9], x1
|
||||
st1 {v25.s}[3], [x0], x1
|
||||
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
function ff_vp9_loop_filter_v_8_8_neon, export=1
|
||||
@ -793,14 +793,14 @@ function ff_vp9_loop_filter_v_8_8_neon, export=1
|
||||
st1 {v23.8b}, [x9], x1
|
||||
st1 {v26.8b}, [x0], x1
|
||||
|
||||
br x10
|
||||
ret x10
|
||||
6:
|
||||
sub x9, x0, x1, lsl #1
|
||||
st1 {v22.8b}, [x9], x1
|
||||
st1 {v24.8b}, [x0], x1
|
||||
st1 {v23.8b}, [x9], x1
|
||||
st1 {v25.8b}, [x0], x1
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
.macro mix_v_16 mix
|
||||
@ -828,14 +828,14 @@ function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1
|
||||
st1 {v23.16b}, [x9], x1
|
||||
st1 {v26.16b}, [x0], x1
|
||||
|
||||
br x10
|
||||
ret x10
|
||||
6:
|
||||
sub x9, x0, x1, lsl #1
|
||||
st1 {v22.16b}, [x9], x1
|
||||
st1 {v24.16b}, [x0], x1
|
||||
st1 {v23.16b}, [x9], x1
|
||||
st1 {v25.16b}, [x0], x1
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
@ -876,7 +876,7 @@ function ff_vp9_loop_filter_h_8_8_neon, export=1
|
||||
st1 {v23.8b}, [x9], x1
|
||||
st1 {v27.8b}, [x0], x1
|
||||
|
||||
br x10
|
||||
ret x10
|
||||
6:
|
||||
// If we didn't need to do the flat8in part, we use the same writeback
|
||||
// as in loop_filter_h_4_8.
|
||||
@ -891,7 +891,7 @@ function ff_vp9_loop_filter_h_8_8_neon, export=1
|
||||
st1 {v24.s}[1], [x0], x1
|
||||
st1 {v25.s}[0], [x9], x1
|
||||
st1 {v25.s}[1], [x0], x1
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
|
||||
.macro mix_h_16 mix
|
||||
@ -942,7 +942,7 @@ function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
|
||||
st1 {v27.8b}, [x9], x1
|
||||
st1 {v27.d}[1], [x0], x1
|
||||
|
||||
br x10
|
||||
ret x10
|
||||
6:
|
||||
add x9, x9, #2
|
||||
add x0, x0, #2
|
||||
@ -963,7 +963,7 @@ function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
|
||||
st1 {v24.s}[3], [x0], x1
|
||||
st1 {v25.s}[1], [x9], x1
|
||||
st1 {v25.s}[3], [x0], x1
|
||||
br x10
|
||||
ret x10
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
@ -1022,7 +1022,7 @@ function ff_vp9_loop_filter_v_16_8_neon, export=1
|
||||
ldp d10, d11, [sp], 0x10
|
||||
ldp d12, d13, [sp], 0x10
|
||||
ldp d14, d15, [sp], 0x10
|
||||
br x10
|
||||
ret x10
|
||||
8:
|
||||
add x9, x9, x1, lsl #2
|
||||
// If we didn't do the flat8out part, the output is left in the
|
||||
@ -1091,7 +1091,7 @@ function ff_vp9_loop_filter_v_16_16_neon, export=1
|
||||
ldp d10, d11, [sp], 0x10
|
||||
ldp d12, d13, [sp], 0x10
|
||||
ldp d14, d15, [sp], 0x10
|
||||
br x10
|
||||
ret x10
|
||||
8:
|
||||
add x9, x9, x1, lsl #2
|
||||
st1 {v21.16b}, [x9], x1
|
||||
@ -1168,7 +1168,7 @@ function ff_vp9_loop_filter_h_16_8_neon, export=1
|
||||
ldp d10, d11, [sp], 0x10
|
||||
ldp d12, d13, [sp], 0x10
|
||||
ldp d14, d15, [sp], 0x10
|
||||
br x10
|
||||
ret x10
|
||||
8:
|
||||
// The same writeback as in loop_filter_h_8_8
|
||||
sub x9, x0, #4
|
||||
@ -1287,7 +1287,7 @@ function ff_vp9_loop_filter_h_16_16_neon, export=1
|
||||
ldp d10, d11, [sp], 0x10
|
||||
ldp d12, d13, [sp], 0x10
|
||||
ldp d14, d15, [sp], 0x10
|
||||
br x10
|
||||
ret x10
|
||||
8:
|
||||
sub x9, x0, #4
|
||||
add x0, x9, x1, lsl #3
|
||||
|
Loading…
Reference in New Issue
Block a user