mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-21 10:55:51 +02:00
ARM: allow building in Thumb2 mode
Signed-off-by: Mans Rullgard <mans@mansr.com>
This commit is contained in:
parent
9cd7b8549b
commit
8986fddc2b
3
configure
vendored
3
configure
vendored
@ -967,6 +967,7 @@ CONFIG_LIST="
|
||||
static
|
||||
swscale
|
||||
swscale_alpha
|
||||
thumb
|
||||
vaapi
|
||||
vdpau
|
||||
version3
|
||||
@ -2607,7 +2608,7 @@ if enabled alpha; then
|
||||
|
||||
elif enabled arm; then
|
||||
|
||||
check_cflags -marm
|
||||
enabled thumb && check_cflags -mthumb || check_cflags -marm
|
||||
nogas=die
|
||||
|
||||
if check_cpp_condition stddef.h "defined __ARM_PCS_VFP"; then
|
||||
|
@ -114,12 +114,15 @@ static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
|
||||
"vmov d1, %2, %3 \n\t"
|
||||
"lsls %6, %6, #1 \n\t"
|
||||
"and %0, %5, #1<<31 \n\t"
|
||||
"it cs \n\t"
|
||||
"lslcs %5, %5, #1 \n\t"
|
||||
"lsls %6, %6, #1 \n\t"
|
||||
"and %1, %5, #1<<31 \n\t"
|
||||
"it cs \n\t"
|
||||
"lslcs %5, %5, #1 \n\t"
|
||||
"lsls %6, %6, #1 \n\t"
|
||||
"and %2, %5, #1<<31 \n\t"
|
||||
"it cs \n\t"
|
||||
"lslcs %5, %5, #1 \n\t"
|
||||
"vmov d4, %0, %1 \n\t"
|
||||
"and %3, %5, #1<<31 \n\t"
|
||||
|
@ -27,6 +27,7 @@ function ff_ac3_update_bap_counts_arm, export=1
|
||||
lsl r3, lr, #1
|
||||
ldrh r12, [r0, r3]
|
||||
subs r2, r2, #1
|
||||
it gt
|
||||
ldrbgt lr, [r1], #1
|
||||
add r12, r12, #1
|
||||
strh r12, [r0, r3]
|
||||
|
@ -42,9 +42,11 @@ function ff_ac3_bit_alloc_calc_bap_armv6, export=1
|
||||
mov r11, r10
|
||||
ldrb r10, [r4], #1 @ band_start_tab[band++]
|
||||
subs r9, r9, r5 @ - floor
|
||||
it lt
|
||||
movlt r9, #0
|
||||
cmp r10, r3 @ - end
|
||||
and r9, r9, r8 @ & 0x1fe0
|
||||
ite gt
|
||||
subgt r8, r3, r11
|
||||
suble r8, r10, r11
|
||||
add r9, r9, r5 @ + floor => m
|
||||
|
@ -41,6 +41,7 @@ endfunc
|
||||
|
||||
function ff_ac3_exponent_min_neon, export=1
|
||||
cmp r1, #0
|
||||
it eq
|
||||
bxeq lr
|
||||
push {lr}
|
||||
mov r12, #256
|
||||
|
@ -24,9 +24,18 @@
|
||||
# define ELF
|
||||
#else
|
||||
# define ELF @
|
||||
#endif
|
||||
|
||||
#if CONFIG_THUMB
|
||||
# define A @
|
||||
# define T
|
||||
#else
|
||||
# define A
|
||||
# define T @
|
||||
#endif
|
||||
|
||||
.syntax unified
|
||||
T .thumb
|
||||
|
||||
.macro require8 val=1
|
||||
ELF .eabi_attribute 24, \val
|
||||
@ -82,6 +91,90 @@ ELF .size \name, . - \name
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*
 * ldr_pre rt, rn, rm
 * Word load with register pre-index and writeback:
 *     rn += rm;  rt = [rn]
 * ARM build (A lines): a single pre-indexed ldr with writeback.
 * Thumb build (T lines): split into an explicit add followed by a plain ldr.
 * rm is declared vararg, so it may be a multi-token operand containing commas.
 */
.macro ldr_pre rt, rn, rm:vararg
|
||||
A ldr \rt, [\rn, \rm]!
|
||||
T add \rn, \rn, \rm
|
||||
T ldr \rt, [\rn]
|
||||
.endm
|
||||
|
||||
/*
 * ldr_post rt, rn, rm
 * Word load with register post-index:
 *     rt = [rn];  rn += rm
 * ARM build (A lines): a single post-indexed ldr.
 * Thumb build (T lines): plain ldr followed by an explicit add.
 * rm is declared vararg, so it may be a multi-token operand containing commas.
 */
.macro ldr_post rt, rn, rm:vararg
|
||||
A ldr \rt, [\rn], \rm
|
||||
T ldr \rt, [\rn]
|
||||
T add \rn, \rn, \rm
|
||||
.endm
|
||||
|
||||
/*
 * ldrd_reg rt, rt2, rn, rm
 * Doubleword load from [rn + rm] into rt:rt2, with no writeback to rn.
 * ARM build (A lines): a single register-offset ldrd.
 * Thumb build (T lines): the address is first formed in rt, which is then
 * used as the base of the ldrd and overwritten by the loaded value; rn and
 * rm are left unchanged.
 */
.macro ldrd_reg rt, rt2, rn, rm
|
||||
A ldrd \rt, \rt2, [\rn, \rm]
|
||||
T add \rt, \rn, \rm
|
||||
T ldrd \rt, \rt2, [\rt]
|
||||
.endm
|
||||
|
||||
/*
 * ldrd_post rt, rt2, rn, rm
 * Doubleword load with register post-index:
 *     rt:rt2 = [rn];  rn += rm
 * ARM build (A lines): a single post-indexed ldrd.
 * Thumb build (T lines): plain ldrd followed by an explicit add.
 */
.macro ldrd_post rt, rt2, rn, rm
|
||||
A ldrd \rt, \rt2, [\rn], \rm
|
||||
T ldrd \rt, \rt2, [\rn]
|
||||
T add \rn, \rn, \rm
|
||||
.endm
|
||||
|
||||
/*
 * ldrh_pre rt, rn, rm
 * Halfword load with register pre-index and writeback:
 *     rn += rm;  rt = halfword [rn]
 * ARM build (A lines): a single pre-indexed ldrh with writeback.
 * Thumb build (T lines): explicit add followed by a plain ldrh.
 */
.macro ldrh_pre rt, rn, rm
|
||||
A ldrh \rt, [\rn, \rm]!
|
||||
T add \rn, \rn, \rm
|
||||
T ldrh \rt, [\rn]
|
||||
.endm
|
||||
|
||||
/*
 * ldrh_dpre rt, rn, rm
 * Halfword load with decrementing register pre-index and writeback:
 *     rn -= rm;  rt = halfword [rn]
 * ARM build (A lines): a single ldrh using the negated-register pre-index.
 * Thumb build (T lines): explicit sub followed by a plain ldrh.
 */
.macro ldrh_dpre rt, rn, rm
|
||||
A ldrh \rt, [\rn, -\rm]!
|
||||
T sub \rn, \rn, \rm
|
||||
T ldrh \rt, [\rn]
|
||||
.endm
|
||||
|
||||
/*
 * ldrh_post rt, rn, rm
 * Halfword load with register post-index:
 *     rt = halfword [rn];  rn += rm
 * ARM build (A lines): a single post-indexed ldrh.
 * Thumb build (T lines): plain ldrh followed by an explicit add.
 */
.macro ldrh_post rt, rn, rm
|
||||
A ldrh \rt, [\rn], \rm
|
||||
T ldrh \rt, [\rn]
|
||||
T add \rn, \rn, \rm
|
||||
.endm
|
||||
|
||||
/*
 * str_post rt, rn, rm
 * Word store with register post-index:
 *     [rn] = rt;  rn += rm
 * ARM build (A lines): a single post-indexed str.
 * Thumb build (T lines): plain str followed by an explicit add.
 * rm is declared vararg, so it may be a multi-token operand containing commas.
 */
.macro str_post rt, rn, rm:vararg
|
||||
A str \rt, [\rn], \rm
|
||||
T str \rt, [\rn]
|
||||
T add \rn, \rn, \rm
|
||||
.endm
|
||||
|
||||
/*
 * strb_post rt, rn, rm
 * Byte store with register post-index:
 *     byte [rn] = rt;  rn += rm
 * ARM build (A lines): a single post-indexed strb.
 * Thumb build (T lines): plain strb followed by an explicit add.
 * rm is declared vararg, so it may be a multi-token operand containing commas.
 */
.macro strb_post rt, rn, rm:vararg
|
||||
A strb \rt, [\rn], \rm
|
||||
T strb \rt, [\rn]
|
||||
T add \rn, \rn, \rm
|
||||
.endm
|
||||
|
||||
/*
 * strd_post rt, rt2, rn, rm
 * Doubleword store with register post-index:
 *     [rn] = rt:rt2;  rn += rm
 * ARM build (A lines): a single post-indexed strd.
 * Thumb build (T lines): plain strd followed by an explicit add.
 */
.macro strd_post rt, rt2, rn, rm
|
||||
A strd \rt, \rt2, [\rn], \rm
|
||||
T strd \rt, \rt2, [\rn]
|
||||
T add \rn, \rn, \rm
|
||||
.endm
|
||||
|
||||
/*
 * strh_pre rt, rn, rm
 * Halfword store with register pre-index and writeback:
 *     rn += rm;  halfword [rn] = rt
 * ARM build (A lines): a single pre-indexed strh with writeback.
 * Thumb build (T lines): explicit add followed by a plain strh.
 */
.macro strh_pre rt, rn, rm
|
||||
A strh \rt, [\rn, \rm]!
|
||||
T add \rn, \rn, \rm
|
||||
T strh \rt, [\rn]
|
||||
.endm
|
||||
|
||||
/*
 * strh_dpre rt, rn, rm
 * Halfword store with decrementing register pre-index and writeback:
 *     rn -= rm;  halfword [rn] = rt
 * ARM build (A lines): a single strh using the negated-register pre-index.
 * Thumb build (T lines): explicit sub followed by a plain strh.
 */
.macro strh_dpre rt, rn, rm
|
||||
A strh \rt, [\rn, -\rm]!
|
||||
T sub \rn, \rn, \rm
|
||||
T strh \rt, [\rn]
|
||||
.endm
|
||||
|
||||
/*
 * strh_post rt, rn, rm
 * Halfword store with register post-index:
 *     halfword [rn] = rt;  rn += rm
 * ARM build (A lines): a single post-indexed strh.
 * Thumb build (T lines): plain strh followed by an explicit add.
 */
.macro strh_post rt, rn, rm
|
||||
A strh \rt, [\rn], \rm
|
||||
T strh \rt, [\rn]
|
||||
T add \rn, \rn, \rm
|
||||
.endm
|
||||
|
||||
/*
 * strh_dpost rt, rn, rm
 * Halfword store with decrementing register post-index:
 *     halfword [rn] = rt;  rn -= rm
 * ARM build (A lines): a single strh using the negated-register post-index.
 * Thumb build (T lines): plain strh followed by an explicit sub.
 */
.macro strh_dpost rt, rn, rm
|
||||
A strh \rt, [\rn], -\rm
|
||||
T strh \rt, [\rn]
|
||||
T sub \rn, \rn, \rm
|
||||
.endm
|
||||
|
||||
#if HAVE_VFP_ARGS
|
||||
.eabi_attribute 28, 1
|
||||
# define VFP
|
||||
|
@ -27,6 +27,7 @@ function ff_dca_lfe_fir_neon, export=1
|
||||
add r5, r2, #256*4-16 @ cf1
|
||||
sub r1, r1, #12
|
||||
cmp r3, #32
|
||||
ite eq
|
||||
moveq r6, #256/32
|
||||
movne r6, #256/64
|
||||
NOVFP vldr s0, [sp, #16] @ scale
|
||||
|
@ -554,10 +554,12 @@ endfunc
|
||||
and r9, r5, r14
|
||||
and r10, r6, r14
|
||||
and r11, r7, r14
|
||||
it eq
|
||||
andeq r14, r14, r14, \rnd #1
|
||||
add r8, r8, r10
|
||||
add r9, r9, r11
|
||||
ldr r12, =0xfcfcfcfc >> 2
|
||||
itt eq
|
||||
addeq r8, r8, r14
|
||||
addeq r9, r9, r14
|
||||
and r4, r12, r4, lsr #2
|
||||
@ -638,8 +640,10 @@ function ff_add_pixels_clamped_arm, export=1
|
||||
mvn r5, r5
|
||||
mvn r7, r7
|
||||
tst r6, #0x100
|
||||
it ne
|
||||
movne r6, r5, lsr #24
|
||||
tst r8, #0x100
|
||||
it ne
|
||||
movne r8, r7, lsr #24
|
||||
mov r9, r6
|
||||
ldrsh r5, [r0, #4] /* moved from [A] */
|
||||
@ -654,8 +658,10 @@ function ff_add_pixels_clamped_arm, export=1
|
||||
mvn r5, r5
|
||||
mvn r7, r7
|
||||
tst r6, #0x100
|
||||
it ne
|
||||
movne r6, r5, lsr #24
|
||||
tst r8, #0x100
|
||||
it ne
|
||||
movne r8, r7, lsr #24
|
||||
orr r9, r9, r6, lsl #16
|
||||
ldr r4, [r1, #4] /* moved from [B] */
|
||||
@ -676,8 +682,10 @@ function ff_add_pixels_clamped_arm, export=1
|
||||
mvn r5, r5
|
||||
mvn r7, r7
|
||||
tst r6, #0x100
|
||||
it ne
|
||||
movne r6, r5, lsr #24
|
||||
tst r8, #0x100
|
||||
it ne
|
||||
movne r8, r7, lsr #24
|
||||
mov r9, r6
|
||||
ldrsh r5, [r0, #12] /* moved from [D] */
|
||||
@ -692,8 +700,10 @@ function ff_add_pixels_clamped_arm, export=1
|
||||
mvn r5, r5
|
||||
mvn r7, r7
|
||||
tst r6, #0x100
|
||||
it ne
|
||||
movne r6, r5, lsr #24
|
||||
tst r8, #0x100
|
||||
it ne
|
||||
movne r8, r7, lsr #24
|
||||
orr r9, r9, r6, lsl #16
|
||||
add r0, r0, #16 /* moved from [E] */
|
||||
|
@ -47,16 +47,16 @@ function ff_put_pixels16_armv6, export=1
|
||||
ldr r5, [r1, #4]
|
||||
ldr r6, [r1, #8]
|
||||
ldr r7, [r1, #12]
|
||||
ldr r4, [r1], r2
|
||||
ldr_post r4, r1, r2
|
||||
strd r6, r7, [r0, #8]
|
||||
ldr r9, [r1, #4]
|
||||
strd r4, r5, [r0], r2
|
||||
strd_post r4, r5, r0, r2
|
||||
ldr r10, [r1, #8]
|
||||
ldr r11, [r1, #12]
|
||||
ldr r8, [r1], r2
|
||||
ldr_post r8, r1, r2
|
||||
strd r10, r11, [r0, #8]
|
||||
subs r3, r3, #2
|
||||
strd r8, r9, [r0], r2
|
||||
strd_post r8, r9, r0, r2
|
||||
bne 1b
|
||||
|
||||
pop {r4-r11}
|
||||
@ -67,12 +67,12 @@ function ff_put_pixels8_armv6, export=1
|
||||
push {r4-r7}
|
||||
1:
|
||||
ldr r5, [r1, #4]
|
||||
ldr r4, [r1], r2
|
||||
ldr_post r4, r1, r2
|
||||
ldr r7, [r1, #4]
|
||||
strd r4, r5, [r0], r2
|
||||
ldr r6, [r1], r2
|
||||
strd_post r4, r5, r0, r2
|
||||
ldr_post r6, r1, r2
|
||||
subs r3, r3, #2
|
||||
strd r6, r7, [r0], r2
|
||||
strd_post r6, r7, r0, r2
|
||||
bne 1b
|
||||
|
||||
pop {r4-r7}
|
||||
@ -90,7 +90,7 @@ function ff_put_pixels8_x2_armv6, export=1
|
||||
ldr r5, [r1, #4]
|
||||
ldr r7, [r1, #5]
|
||||
lsr r6, r4, #8
|
||||
ldr r8, [r1, r2]!
|
||||
ldr_pre r8, r1, r2
|
||||
orr r6, r6, r5, lsl #24
|
||||
ldr r9, [r1, #4]
|
||||
ldr r11, [r1, #5]
|
||||
@ -112,9 +112,9 @@ function ff_put_pixels8_x2_armv6, export=1
|
||||
uhadd8 r9, r9, r11
|
||||
and r6, r6, r12
|
||||
uadd8 r8, r8, r14
|
||||
strd r4, r5, [r0], r2
|
||||
strd_post r4, r5, r0, r2
|
||||
uadd8 r9, r9, r6
|
||||
strd r8, r9, [r0], r2
|
||||
strd_post r8, r9, r0, r2
|
||||
bne 1b
|
||||
|
||||
pop {r4-r11, pc}
|
||||
@ -127,7 +127,7 @@ function ff_put_pixels8_y2_armv6, export=1
|
||||
orr r12, r12, r12, lsl #16
|
||||
ldr r4, [r1]
|
||||
ldr r5, [r1, #4]
|
||||
ldr r6, [r1, r2]!
|
||||
ldr_pre r6, r1, r2
|
||||
ldr r7, [r1, #4]
|
||||
1:
|
||||
subs r3, r3, #2
|
||||
@ -136,7 +136,7 @@ function ff_put_pixels8_y2_armv6, export=1
|
||||
uhadd8 r9, r5, r7
|
||||
eor r11, r5, r7
|
||||
and r10, r10, r12
|
||||
ldr r4, [r1, r2]!
|
||||
ldr_pre r4, r1, r2
|
||||
uadd8 r8, r8, r10
|
||||
and r11, r11, r12
|
||||
uadd8 r9, r9, r11
|
||||
@ -148,11 +148,11 @@ function ff_put_pixels8_y2_armv6, export=1
|
||||
eor r7, r5, r7
|
||||
uadd8 r10, r10, r6
|
||||
and r7, r7, r12
|
||||
ldr r6, [r1, r2]!
|
||||
ldr_pre r6, r1, r2
|
||||
uadd8 r11, r11, r7
|
||||
strd r8, r9, [r0], r2
|
||||
strd_post r8, r9, r0, r2
|
||||
ldr r7, [r1, #4]
|
||||
strd r10, r11, [r0], r2
|
||||
strd_post r10, r11, r0, r2
|
||||
bne 1b
|
||||
|
||||
pop {r4-r11}
|
||||
@ -166,7 +166,7 @@ function ff_put_pixels8_x2_no_rnd_armv6, export=1
|
||||
ldr r4, [r1]
|
||||
ldr r5, [r1, #4]
|
||||
ldr r7, [r1, #5]
|
||||
ldr r8, [r1, r2]!
|
||||
ldr_pre r8, r1, r2
|
||||
ldr r9, [r1, #4]
|
||||
ldr r14, [r1, #5]
|
||||
add r1, r1, r2
|
||||
@ -191,16 +191,16 @@ function ff_put_pixels8_y2_no_rnd_armv6, export=1
|
||||
push {r4-r9, lr}
|
||||
ldr r4, [r1]
|
||||
ldr r5, [r1, #4]
|
||||
ldr r6, [r1, r2]!
|
||||
ldr_pre r6, r1, r2
|
||||
ldr r7, [r1, #4]
|
||||
1:
|
||||
subs r3, r3, #2
|
||||
uhadd8 r8, r4, r6
|
||||
ldr r4, [r1, r2]!
|
||||
ldr_pre r4, r1, r2
|
||||
uhadd8 r9, r5, r7
|
||||
ldr r5, [r1, #4]
|
||||
uhadd8 r12, r4, r6
|
||||
ldr r6, [r1, r2]!
|
||||
ldr_pre r6, r1, r2
|
||||
uhadd8 r14, r5, r7
|
||||
ldr r7, [r1, #4]
|
||||
stm r0, {r8,r9}
|
||||
@ -220,44 +220,44 @@ function ff_avg_pixels8_armv6, export=1
|
||||
orr lr, lr, lr, lsl #16
|
||||
ldrd r4, r5, [r0]
|
||||
ldr r10, [r1, #4]
|
||||
ldr r9, [r1], r2
|
||||
ldr_post r9, r1, r2
|
||||
subs r3, r3, #2
|
||||
1:
|
||||
pld [r1, r2]
|
||||
eor r8, r4, r9
|
||||
uhadd8 r4, r4, r9
|
||||
eor r12, r5, r10
|
||||
ldrd r6, r7, [r0, r2]
|
||||
ldrd_reg r6, r7, r0, r2
|
||||
uhadd8 r5, r5, r10
|
||||
and r8, r8, lr
|
||||
ldr r10, [r1, #4]
|
||||
and r12, r12, lr
|
||||
uadd8 r4, r4, r8
|
||||
ldr r9, [r1], r2
|
||||
ldr_post r9, r1, r2
|
||||
eor r8, r6, r9
|
||||
uadd8 r5, r5, r12
|
||||
pld [r1, r2, lsl #1]
|
||||
eor r12, r7, r10
|
||||
uhadd8 r6, r6, r9
|
||||
strd r4, r5, [r0], r2
|
||||
strd_post r4, r5, r0, r2
|
||||
uhadd8 r7, r7, r10
|
||||
beq 2f
|
||||
and r8, r8, lr
|
||||
ldrd r4, r5, [r0, r2]
|
||||
ldrd_reg r4, r5, r0, r2
|
||||
uadd8 r6, r6, r8
|
||||
ldr r10, [r1, #4]
|
||||
and r12, r12, lr
|
||||
subs r3, r3, #2
|
||||
uadd8 r7, r7, r12
|
||||
ldr r9, [r1], r2
|
||||
strd r6, r7, [r0], r2
|
||||
ldr_post r9, r1, r2
|
||||
strd_post r6, r7, r0, r2
|
||||
b 1b
|
||||
2:
|
||||
and r8, r8, lr
|
||||
and r12, r12, lr
|
||||
uadd8 r6, r6, r8
|
||||
uadd8 r7, r7, r12
|
||||
strd r6, r7, [r0], r2
|
||||
strd_post r6, r7, r0, r2
|
||||
|
||||
pop {r4-r10, pc}
|
||||
endfunc
|
||||
@ -284,7 +284,7 @@ function ff_add_pixels_clamped_armv6, export=1
|
||||
orr r6, r8, r5, lsl #8
|
||||
orr r7, r4, lr, lsl #8
|
||||
subs r3, r3, #1
|
||||
strd r6, r7, [r1], r2
|
||||
strd_post r6, r7, r1, r2
|
||||
bgt 1b
|
||||
pop {r4-r8,pc}
|
||||
endfunc
|
||||
@ -294,7 +294,7 @@ function ff_get_pixels_armv6, export=1
|
||||
push {r4-r8, lr}
|
||||
mov lr, #8
|
||||
1:
|
||||
ldrd r4, r5, [r1], r2
|
||||
ldrd_post r4, r5, r1, r2
|
||||
subs lr, lr, #1
|
||||
uxtb16 r6, r4
|
||||
uxtb16 r4, r4, ror #8
|
||||
@ -317,8 +317,8 @@ function ff_diff_pixels_armv6, export=1
|
||||
push {r4-r9, lr}
|
||||
mov lr, #8
|
||||
1:
|
||||
ldrd r4, r5, [r1], r3
|
||||
ldrd r6, r7, [r2], r3
|
||||
ldrd_post r4, r5, r1, r3
|
||||
ldrd_post r6, r7, r2, r3
|
||||
uxtb16 r8, r4
|
||||
uxtb16 r4, r4, ror #8
|
||||
uxtb16 r9, r6
|
||||
@ -492,19 +492,19 @@ function ff_pix_abs8_armv6, export=1
|
||||
push {r4-r9, lr}
|
||||
mov r0, #0
|
||||
mov lr, #0
|
||||
ldrd r4, r5, [r1], r3
|
||||
ldrd_post r4, r5, r1, r3
|
||||
1:
|
||||
subs r12, r12, #2
|
||||
ldr r7, [r2, #4]
|
||||
ldr r6, [r2], r3
|
||||
ldrd r8, r9, [r1], r3
|
||||
ldr_post r6, r2, r3
|
||||
ldrd_post r8, r9, r1, r3
|
||||
usada8 r0, r4, r6, r0
|
||||
pld [r2, r3]
|
||||
usada8 lr, r5, r7, lr
|
||||
ldr r7, [r2, #4]
|
||||
ldr r6, [r2], r3
|
||||
ldr_post r6, r2, r3
|
||||
beq 2f
|
||||
ldrd r4, r5, [r1], r3
|
||||
ldrd_post r4, r5, r1, r3
|
||||
usada8 r0, r8, r6, r0
|
||||
pld [r2, r3]
|
||||
usada8 lr, r9, r7, lr
|
||||
@ -613,7 +613,7 @@ function ff_pix_sum_armv6, export=1
|
||||
ldr r7, [r0, #12]
|
||||
usada8 r2, r6, lr, r2
|
||||
beq 2f
|
||||
ldr r4, [r0, r1]!
|
||||
ldr_pre r4, r0, r1
|
||||
usada8 r3, r7, lr, r3
|
||||
bgt 1b
|
||||
2:
|
||||
|
@ -531,6 +531,7 @@ function ff_vorbis_inverse_coupling_neon, export=1
|
||||
|
||||
2: vst1.32 {d2-d3}, [r3, :128]!
|
||||
vst1.32 {d0-d1}, [r12,:128]!
|
||||
it lt
|
||||
bxlt lr
|
||||
|
||||
3: vld1.32 {d2-d3}, [r1,:128]
|
||||
@ -575,6 +576,7 @@ NOVFP vdup.32 q8, r2
|
||||
2: vst1.32 {q2},[r0,:128]!
|
||||
vst1.32 {q3},[r0,:128]!
|
||||
ands len, len, #15
|
||||
it eq
|
||||
bxeq lr
|
||||
3: vld1.32 {q0},[r1,:128]!
|
||||
vmul.f32 q0, q0, q8
|
||||
@ -638,6 +640,7 @@ NOVFP ldr r3, [sp]
|
||||
2: vst1.32 {q8},[r0,:128]!
|
||||
vst1.32 {q9},[r0,:128]!
|
||||
ands r3, r3, #7
|
||||
it eq
|
||||
popeq {pc}
|
||||
3: vld1.32 {q0},[r1,:128]!
|
||||
ldr r12, [r2], #4
|
||||
|
@ -55,18 +55,23 @@ function ff_vector_fmul_vfp, export=1
|
||||
1:
|
||||
subs r3, r3, #16
|
||||
vmul.f32 s12, s4, s12
|
||||
itttt ge
|
||||
vldmiage r1!, {s16-s19}
|
||||
vldmiage r2!, {s24-s27}
|
||||
vldmiage r1!, {s20-s23}
|
||||
vldmiage r2!, {s28-s31}
|
||||
it ge
|
||||
vmulge.f32 s24, s16, s24
|
||||
vstmia r0!, {s8-s11}
|
||||
vstmia r0!, {s12-s15}
|
||||
it ge
|
||||
vmulge.f32 s28, s20, s28
|
||||
itttt gt
|
||||
vldmiagt r1!, {s0-s3}
|
||||
vldmiagt r2!, {s8-s11}
|
||||
vldmiagt r1!, {s4-s7}
|
||||
vldmiagt r2!, {s12-s15}
|
||||
ittt ge
|
||||
vmulge.f32 s8, s0, s8
|
||||
vstmiage r0!, {s24-s27}
|
||||
vstmiage r0!, {s28-s31}
|
||||
@ -97,33 +102,49 @@ function ff_vector_fmul_reverse_vfp, export=1
|
||||
vmul.f32 s11, s0, s11
|
||||
1:
|
||||
subs r3, r3, #16
|
||||
it ge
|
||||
vldmdbge r2!, {s16-s19}
|
||||
vmul.f32 s12, s7, s12
|
||||
it ge
|
||||
vldmiage r1!, {s24-s27}
|
||||
vmul.f32 s13, s6, s13
|
||||
it ge
|
||||
vldmdbge r2!, {s20-s23}
|
||||
vmul.f32 s14, s5, s14
|
||||
it ge
|
||||
vldmiage r1!, {s28-s31}
|
||||
vmul.f32 s15, s4, s15
|
||||
it ge
|
||||
vmulge.f32 s24, s19, s24
|
||||
it gt
|
||||
vldmdbgt r2!, {s0-s3}
|
||||
it ge
|
||||
vmulge.f32 s25, s18, s25
|
||||
vstmia r0!, {s8-s13}
|
||||
it ge
|
||||
vmulge.f32 s26, s17, s26
|
||||
it gt
|
||||
vldmiagt r1!, {s8-s11}
|
||||
itt ge
|
||||
vmulge.f32 s27, s16, s27
|
||||
vmulge.f32 s28, s23, s28
|
||||
it gt
|
||||
vldmdbgt r2!, {s4-s7}
|
||||
it ge
|
||||
vmulge.f32 s29, s22, s29
|
||||
vstmia r0!, {s14-s15}
|
||||
ittt ge
|
||||
vmulge.f32 s30, s21, s30
|
||||
vmulge.f32 s31, s20, s31
|
||||
vmulge.f32 s8, s3, s8
|
||||
it gt
|
||||
vldmiagt r1!, {s12-s15}
|
||||
itttt ge
|
||||
vmulge.f32 s9, s2, s9
|
||||
vmulge.f32 s10, s1, s10
|
||||
vstmiage r0!, {s24-s27}
|
||||
vmulge.f32 s11, s0, s11
|
||||
it ge
|
||||
vstmiage r0!, {s28-s31}
|
||||
bgt 1b
|
||||
|
||||
|
@ -71,6 +71,7 @@ endfunc
|
||||
|
||||
function ff_float_to_int16_interleave_neon, export=1
|
||||
cmp r3, #2
|
||||
itt lt
|
||||
ldrlt r1, [r1]
|
||||
blt ff_float_to_int16_neon
|
||||
bne 4f
|
||||
@ -196,6 +197,7 @@ function ff_float_to_int16_interleave_neon, export=1
|
||||
vst1.64 {d3}, [r8], ip
|
||||
vst1.64 {d7}, [r8], ip
|
||||
subs r3, r3, #4
|
||||
it eq
|
||||
popeq {r4-r8,pc}
|
||||
cmp r3, #4
|
||||
add r0, r0, #8
|
||||
@ -305,6 +307,7 @@ function ff_float_to_int16_interleave_neon, export=1
|
||||
vst1.32 {d23[1]}, [r8], ip
|
||||
8: subs r3, r3, #2
|
||||
add r0, r0, #4
|
||||
it eq
|
||||
popeq {r4-r8,pc}
|
||||
|
||||
@ 1 channel
|
||||
@ -354,6 +357,7 @@ function ff_float_to_int16_interleave_neon, export=1
|
||||
vst1.16 {d2[3]}, [r5,:16], ip
|
||||
vst1.16 {d3[1]}, [r5,:16], ip
|
||||
vst1.16 {d3[3]}, [r5,:16], ip
|
||||
it eq
|
||||
popeq {r4-r8,pc}
|
||||
vld1.64 {d0-d1}, [r4,:128]!
|
||||
vcvt.s32.f32 q0, q0, #16
|
||||
|
@ -46,6 +46,7 @@ function ff_float_to_int16_vfp, export=1
|
||||
vmov r5, r6, s2, s3
|
||||
vmov r7, r8, s4, s5
|
||||
vmov ip, lr, s6, s7
|
||||
it gt
|
||||
vldmiagt r1!, {s16-s23}
|
||||
ssat r4, #16, r4
|
||||
ssat r3, #16, r3
|
||||
@ -53,10 +54,12 @@ function ff_float_to_int16_vfp, export=1
|
||||
ssat r5, #16, r5
|
||||
pkhbt r3, r3, r4, lsl #16
|
||||
pkhbt r4, r5, r6, lsl #16
|
||||
itttt gt
|
||||
vcvtgt.s32.f32 s0, s16
|
||||
vcvtgt.s32.f32 s1, s17
|
||||
vcvtgt.s32.f32 s2, s18
|
||||
vcvtgt.s32.f32 s3, s19
|
||||
itttt gt
|
||||
vcvtgt.s32.f32 s4, s20
|
||||
vcvtgt.s32.f32 s5, s21
|
||||
vcvtgt.s32.f32 s6, s22
|
||||
|
@ -71,7 +71,9 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
|
||||
pld [r1]
|
||||
pld [r1, r2]
|
||||
|
||||
muls r7, r4, r5
|
||||
A muls r7, r4, r5
|
||||
T mul r7, r4, r5
|
||||
T cmp r7, #0
|
||||
rsb r6, r7, r5, lsl #3
|
||||
rsb ip, r7, r4, lsl #3
|
||||
sub r4, r7, r4, lsl #3
|
||||
@ -197,7 +199,9 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1
|
||||
pld [r1]
|
||||
pld [r1, r2]
|
||||
|
||||
muls r7, r4, r5
|
||||
A muls r7, r4, r5
|
||||
T mul r7, r4, r5
|
||||
T cmp r7, #0
|
||||
rsb r6, r7, r5, lsl #3
|
||||
rsb ip, r7, r4, lsl #3
|
||||
sub r4, r7, r4, lsl #3
|
||||
@ -368,10 +372,10 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
|
||||
pop {r4-r6, pc}
|
||||
2:
|
||||
.ifc \type,put
|
||||
ldrh r5, [r1], r2
|
||||
strh r5, [r0], r2
|
||||
ldrh r6, [r1], r2
|
||||
strh r6, [r0], r2
|
||||
ldrh_post r5, r1, r2
|
||||
strh_post r5, r0, r2
|
||||
ldrh_post r6, r1, r2
|
||||
strh_post r6, r0, r2
|
||||
.else
|
||||
vld1.16 {d16[0]}, [r1], r2
|
||||
vld1.16 {d16[1]}, [r1], r2
|
||||
@ -404,28 +408,17 @@ endfunc
|
||||
ldr ip, [sp]
|
||||
tst r2, r2
|
||||
ldr ip, [ip]
|
||||
it ne
|
||||
tstne r3, r3
|
||||
vmov.32 d24[0], ip
|
||||
and ip, ip, ip, lsl #16
|
||||
it eq
|
||||
bxeq lr
|
||||
ands ip, ip, ip, lsl #8
|
||||
it lt
|
||||
bxlt lr
|
||||
.endm
|
||||
|
||||
.macro align_push_regs
|
||||
and ip, sp, #15
|
||||
add ip, ip, #32
|
||||
sub sp, sp, ip
|
||||
vst1.64 {d12-d15}, [sp,:128]
|
||||
sub sp, sp, #32
|
||||
vst1.64 {d8-d11}, [sp,:128]
|
||||
.endm
|
||||
|
||||
.macro align_pop_regs
|
||||
vld1.64 {d8-d11}, [sp,:128]!
|
||||
vld1.64 {d12-d15}, [sp,:128], ip
|
||||
.endm
|
||||
|
||||
.macro h264_loop_filter_luma
|
||||
vdup.8 q11, r2 @ alpha
|
||||
vmovl.u8 q12, d24
|
||||
@ -506,7 +499,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
|
||||
vld1.64 {d18,d19}, [r0,:128], r1
|
||||
vld1.64 {d16,d17}, [r0,:128], r1
|
||||
|
||||
align_push_regs
|
||||
vpush {d8-d15}
|
||||
|
||||
h264_loop_filter_luma
|
||||
|
||||
@ -516,7 +509,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
|
||||
vst1.64 {d0, d1}, [r0,:128], r1
|
||||
vst1.64 {d10,d11}, [r0,:128]
|
||||
|
||||
align_pop_regs
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
@ -543,7 +536,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1
|
||||
|
||||
transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
|
||||
|
||||
align_push_regs
|
||||
vpush {d8-d15}
|
||||
|
||||
h264_loop_filter_luma
|
||||
|
||||
@ -568,7 +561,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1
|
||||
vst1.32 {d1[1]}, [r0], r1
|
||||
vst1.32 {d11[1]}, [r0], r1
|
||||
|
||||
align_pop_regs
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
@ -1116,6 +1109,7 @@ function \type\()_h264_qpel8_hv_lowpass_neon
|
||||
vrhadd.u8 d11, d11, d7
|
||||
sub r0, r0, r2, lsl #3
|
||||
.endif
|
||||
|
||||
vst1.64 {d12}, [r0,:64], r2
|
||||
vst1.64 {d13}, [r0,:64], r2
|
||||
vst1.64 {d14}, [r0,:64], r2
|
||||
@ -1263,7 +1257,9 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1
|
||||
\type\()_h264_qpel8_mc11:
|
||||
lowpass_const r3
|
||||
mov r11, sp
|
||||
bic sp, sp, #15
|
||||
A bic sp, sp, #15
|
||||
T bic r0, r11, #15
|
||||
T mov sp, r0
|
||||
sub sp, sp, #64
|
||||
mov r0, sp
|
||||
sub r1, r1, #2
|
||||
@ -1271,14 +1267,14 @@ function ff_\type\()_h264_qpel8_mc11_neon, export=1
|
||||
mov ip, #8
|
||||
vpush {d8-d15}
|
||||
bl put_h264_qpel8_h_lowpass_neon
|
||||
ldrd r0, [r11]
|
||||
ldrd r0, [r11], #8
|
||||
mov r3, r2
|
||||
add ip, sp, #64
|
||||
sub r1, r1, r2, lsl #1
|
||||
mov r2, #8
|
||||
bl \type\()_h264_qpel8_v_lowpass_l2_neon
|
||||
vpop {d8-d15}
|
||||
add sp, r11, #8
|
||||
mov sp, r11
|
||||
pop {r11, pc}
|
||||
endfunc
|
||||
|
||||
@ -1287,7 +1283,9 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1
|
||||
\type\()_h264_qpel8_mc21:
|
||||
lowpass_const r3
|
||||
mov r11, sp
|
||||
bic sp, sp, #15
|
||||
A bic sp, sp, #15
|
||||
T bic r0, r11, #15
|
||||
T mov sp, r0
|
||||
sub sp, sp, #(8*8+16*12)
|
||||
sub r1, r1, #2
|
||||
mov r3, #8
|
||||
@ -1296,14 +1294,14 @@ function ff_\type\()_h264_qpel8_mc21_neon, export=1
|
||||
vpush {d8-d15}
|
||||
bl put_h264_qpel8_h_lowpass_neon
|
||||
mov r4, r0
|
||||
ldrd r0, [r11]
|
||||
ldrd r0, [r11], #8
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, r2
|
||||
sub r2, r4, #64
|
||||
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
|
||||
vpop {d8-d15}
|
||||
add sp, r11, #8
|
||||
mov sp, r11
|
||||
pop {r4, r10, r11, pc}
|
||||
endfunc
|
||||
|
||||
@ -1330,7 +1328,9 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1
|
||||
\type\()_h264_qpel8_mc12:
|
||||
lowpass_const r3
|
||||
mov r11, sp
|
||||
bic sp, sp, #15
|
||||
A bic sp, sp, #15
|
||||
T bic r0, r11, #15
|
||||
T mov sp, r0
|
||||
sub sp, sp, #(8*8+16*12)
|
||||
sub r1, r1, r2, lsl #1
|
||||
mov r3, r2
|
||||
@ -1339,20 +1339,22 @@ function ff_\type\()_h264_qpel8_mc12_neon, export=1
|
||||
vpush {d8-d15}
|
||||
bl put_h264_qpel8_v_lowpass_neon
|
||||
mov r4, r0
|
||||
ldrd r0, [r11]
|
||||
ldrd r0, [r11], #8
|
||||
sub r1, r1, r3, lsl #1
|
||||
sub r1, r1, #2
|
||||
sub r2, r4, #64
|
||||
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
|
||||
vpop {d8-d15}
|
||||
add sp, r11, #8
|
||||
mov sp, r11
|
||||
pop {r4, r10, r11, pc}
|
||||
endfunc
|
||||
|
||||
function ff_\type\()_h264_qpel8_mc22_neon, export=1
|
||||
push {r4, r10, r11, lr}
|
||||
mov r11, sp
|
||||
bic sp, sp, #15
|
||||
A bic sp, sp, #15
|
||||
T bic r4, r11, #15
|
||||
T mov sp, r4
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, r2
|
||||
@ -1441,21 +1443,23 @@ function ff_\type\()_h264_qpel16_mc11_neon, export=1
|
||||
\type\()_h264_qpel16_mc11:
|
||||
lowpass_const r3
|
||||
mov r11, sp
|
||||
bic sp, sp, #15
|
||||
A bic sp, sp, #15
|
||||
T bic r0, r11, #15
|
||||
T mov sp, r0
|
||||
sub sp, sp, #256
|
||||
mov r0, sp
|
||||
sub r1, r1, #2
|
||||
mov r3, #16
|
||||
vpush {d8-d15}
|
||||
bl put_h264_qpel16_h_lowpass_neon
|
||||
ldrd r0, [r11]
|
||||
ldrd r0, [r11], #8
|
||||
mov r3, r2
|
||||
add ip, sp, #64
|
||||
sub r1, r1, r2, lsl #1
|
||||
mov r2, #16
|
||||
bl \type\()_h264_qpel16_v_lowpass_l2_neon
|
||||
vpop {d8-d15}
|
||||
add sp, r11, #8
|
||||
mov sp, r11
|
||||
pop {r4, r11, pc}
|
||||
endfunc
|
||||
|
||||
@ -1464,20 +1468,22 @@ function ff_\type\()_h264_qpel16_mc21_neon, export=1
|
||||
\type\()_h264_qpel16_mc21:
|
||||
lowpass_const r3
|
||||
mov r11, sp
|
||||
bic sp, sp, #15
|
||||
A bic sp, sp, #15
|
||||
T bic r0, r11, #15
|
||||
T mov sp, r0
|
||||
sub sp, sp, #(16*16+16*12)
|
||||
sub r1, r1, #2
|
||||
mov r0, sp
|
||||
vpush {d8-d15}
|
||||
bl put_h264_qpel16_h_lowpass_neon_packed
|
||||
mov r4, r0
|
||||
ldrd r0, [r11]
|
||||
ldrd r0, [r11], #8
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, r2
|
||||
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
|
||||
vpop {d8-d15}
|
||||
add sp, r11, #8
|
||||
mov sp, r11
|
||||
pop {r4-r5, r9-r11, pc}
|
||||
endfunc
|
||||
|
||||
@ -1504,7 +1510,9 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1
|
||||
\type\()_h264_qpel16_mc12:
|
||||
lowpass_const r3
|
||||
mov r11, sp
|
||||
bic sp, sp, #15
|
||||
A bic sp, sp, #15
|
||||
T bic r0, r11, #15
|
||||
T mov sp, r0
|
||||
sub sp, sp, #(16*16+16*12)
|
||||
sub r1, r1, r2, lsl #1
|
||||
mov r0, sp
|
||||
@ -1512,13 +1520,13 @@ function ff_\type\()_h264_qpel16_mc12_neon, export=1
|
||||
vpush {d8-d15}
|
||||
bl put_h264_qpel16_v_lowpass_neon_packed
|
||||
mov r4, r0
|
||||
ldrd r0, [r11]
|
||||
ldrd r0, [r11], #8
|
||||
sub r1, r1, r3, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r2, r3
|
||||
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
|
||||
vpop {d8-d15}
|
||||
add sp, r11, #8
|
||||
mov sp, r11
|
||||
pop {r4-r5, r9-r11, pc}
|
||||
endfunc
|
||||
|
||||
@ -1526,7 +1534,9 @@ function ff_\type\()_h264_qpel16_mc22_neon, export=1
|
||||
push {r4, r9-r11, lr}
|
||||
lowpass_const r3
|
||||
mov r11, sp
|
||||
bic sp, sp, #15
|
||||
A bic sp, sp, #15
|
||||
T bic r4, r11, #15
|
||||
T mov sp, r4
|
||||
sub r1, r1, r2, lsl #1
|
||||
sub r1, r1, #2
|
||||
mov r3, r2
|
||||
|
@ -106,10 +106,12 @@ function ff_h264_idct_add16_neon, export=1
|
||||
blt 2f
|
||||
ldrsh lr, [r1]
|
||||
add r0, r0, r4
|
||||
it ne
|
||||
movne lr, #0
|
||||
cmp lr, #0
|
||||
adrne lr, ff_h264_idct_dc_add_neon
|
||||
adreq lr, ff_h264_idct_add_neon
|
||||
ite ne
|
||||
adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
|
||||
adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB
|
||||
blx lr
|
||||
2: subs ip, ip, #1
|
||||
add r1, r1, #32
|
||||
@ -132,8 +134,9 @@ function ff_h264_idct_add16intra_neon, export=1
|
||||
add r0, r0, r4
|
||||
cmp r8, #0
|
||||
ldrsh r8, [r1]
|
||||
adrne lr, ff_h264_idct_add_neon
|
||||
adreq lr, ff_h264_idct_dc_add_neon
|
||||
iteet ne
|
||||
adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
|
||||
adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
|
||||
cmpeq r8, #0
|
||||
blxne lr
|
||||
subs ip, ip, #1
|
||||
@ -159,12 +162,14 @@ function ff_h264_idct_add8_neon, export=1
|
||||
add r1, r3, r12, lsl #5
|
||||
cmp r8, #0
|
||||
ldrsh r8, [r1]
|
||||
adrne lr, ff_h264_idct_add_neon
|
||||
adreq lr, ff_h264_idct_dc_add_neon
|
||||
iteet ne
|
||||
adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
|
||||
adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
|
||||
cmpeq r8, #0
|
||||
blxne lr
|
||||
add r12, r12, #1
|
||||
cmp r12, #4
|
||||
itt eq
|
||||
moveq r12, #16
|
||||
moveq r4, r9
|
||||
cmp r12, #20
|
||||
@ -365,10 +370,12 @@ function ff_h264_idct8_add4_neon, export=1
|
||||
blt 2f
|
||||
ldrsh lr, [r1]
|
||||
add r0, r0, r4
|
||||
it ne
|
||||
movne lr, #0
|
||||
cmp lr, #0
|
||||
adrne lr, ff_h264_idct8_dc_add_neon
|
||||
adreq lr, ff_h264_idct8_add_neon
|
||||
ite ne
|
||||
adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB
|
||||
adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB
|
||||
blx lr
|
||||
2: subs r12, r12, #4
|
||||
add r1, r1, #128
|
||||
|
@ -64,11 +64,14 @@ static inline av_const int mid_pred(int a, int b, int c)
|
||||
__asm__ (
|
||||
"mov %0, %2 \n\t"
|
||||
"cmp %1, %2 \n\t"
|
||||
"itt gt \n\t"
|
||||
"movgt %0, %1 \n\t"
|
||||
"movgt %1, %2 \n\t"
|
||||
"cmp %1, %3 \n\t"
|
||||
"it le \n\t"
|
||||
"movle %1, %3 \n\t"
|
||||
"cmp %0, %1 \n\t"
|
||||
"it gt \n\t"
|
||||
"movgt %0, %1 \n\t"
|
||||
: "=&r"(m), "+r"(a)
|
||||
: "r"(b), "r"(c)
|
||||
|
@ -191,7 +191,9 @@ function ff_mdct_calc_neon, export=1
|
||||
vadd.f32 d17, d17, d3 @ in2u+in1d -I
|
||||
1:
|
||||
vmul.f32 d7, d0, d21 @ I*s
|
||||
ldr r10, [r3, lr, lsr #1]
|
||||
A ldr r10, [r3, lr, lsr #1]
|
||||
T lsr r10, lr, #1
|
||||
T ldr r10, [r3, r10]
|
||||
vmul.f32 d6, d1, d20 @ -R*c
|
||||
ldr r6, [r3, #4]!
|
||||
vmul.f32 d4, d1, d21 @ -R*s
|
||||
|
@ -75,7 +75,7 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1
|
||||
sum8 r8, r9, r1, r0, r10, r11, r12, lr
|
||||
sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32
|
||||
round r10, r8, r9
|
||||
strh r10, [r3], r4
|
||||
strh_post r10, r3, r4
|
||||
|
||||
mov lr, #15
|
||||
1:
|
||||
@ -127,10 +127,10 @@ function ff_mpadsp_apply_window_fixed_armv6, export=1
|
||||
round r10, r8, r9
|
||||
adds r8, r8, r4
|
||||
adc r9, r9, r7
|
||||
strh r10, [r3], r12
|
||||
strh_post r10, r3, r12
|
||||
round r11, r8, r9
|
||||
subs lr, lr, #1
|
||||
strh r11, [r5], -r12
|
||||
strh_dpost r11, r5, r12
|
||||
bgt 1b
|
||||
|
||||
sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33
|
||||
|
@ -38,15 +38,21 @@
|
||||
|
||||
/*
 * dequant_t dst, src, mul, add, tmp
 * Conditionally dequantizes the TOP halfword of \src.
 *   tmp = (src asr 16) - ip, setting the flags
 *   if the result is > 0:  tmp = add
 *   if the result is < 0:  tmp = 0 - add
 *   if the result is != 0: dst = top16(src) * bottom16(mul) + tmp   (smlatb)
 * When the result is zero, \dst is not written and \tmp is left at 0.
 * Each conditional instruction is preceded by its own single-instruction IT
 * so the sequence also assembles for Thumb-2.
 * NOTE(review): ip is set up outside this macro; presumably it holds 0 so the
 * rsbs acts as a sign test of the coefficient - confirm at the call sites.
 */
.macro dequant_t dst, src, mul, add, tmp
|
||||
rsbs \tmp, ip, \src, asr #16
|
||||
it gt
|
||||
addgt \tmp, \add, #0
|
||||
it lt
|
||||
rsblt \tmp, \add, #0
|
||||
it ne
|
||||
smlatbne \dst, \src, \mul, \tmp
|
||||
.endm
|
||||
|
||||
/*
 * dequant_b dst, src, mul, add, tmp
 * Conditionally dequantizes the BOTTOM halfword of \src.
 *   tmp = (src lsl 16) - ip, setting the flags (the shift moves the bottom
 *         halfword into the top, so its sign bit drives the comparison)
 *   if the result is > 0:  tmp = add
 *   if the result is < 0:  tmp = 0 - add
 *   if the result is != 0: dst = bottom16(src) * bottom16(mul) + tmp   (smlabb)
 * When the result is zero, \dst is not written and \tmp is left at 0.
 * Each conditional instruction is preceded by its own single-instruction IT
 * so the sequence also assembles for Thumb-2.
 * NOTE(review): ip is set up outside this macro; presumably it holds 0 so the
 * rsbs acts as a sign test of the coefficient - confirm at the call sites.
 */
.macro dequant_b dst, src, mul, add, tmp
|
||||
rsbs \tmp, ip, \src, lsl #16
|
||||
it gt
|
||||
addgt \tmp, \add, #0
|
||||
it lt
|
||||
rsblt \tmp, \add, #0
|
||||
it ne
|
||||
smlabbne \dst, \src, \mul, \tmp
|
||||
.endm
|
||||
|
||||
@ -80,21 +86,27 @@ function ff_dct_unquantize_h263_armv5te, export=1
|
||||
strh lr, [r0], #2
|
||||
|
||||
subs r3, r3, #8
|
||||
it gt
|
||||
ldrdgt r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
|
||||
bgt 1b
|
||||
|
||||
adds r3, r3, #2
|
||||
it le
|
||||
pople {r4-r9,pc}
|
||||
2:
|
||||
ldrsh r9, [r0, #0]
|
||||
ldrsh lr, [r0, #2]
|
||||
mov r8, r2
|
||||
cmp r9, #0
|
||||
it lt
|
||||
rsblt r8, r2, #0
|
||||
it ne
|
||||
smlabbne r9, r9, r1, r8
|
||||
mov r8, r2
|
||||
cmp lr, #0
|
||||
it lt
|
||||
rsblt r8, r2, #0
|
||||
it ne
|
||||
smlabbne lr, lr, r1, r8
|
||||
strh r9, [r0], #2
|
||||
strh lr, [r0], #2
|
||||
|
@ -57,6 +57,7 @@ function ff_dct_unquantize_h263_neon, export=1
|
||||
subs r3, r3, #16
|
||||
vst1.16 {q0}, [r1,:128]!
|
||||
vst1.16 {q8}, [r1,:128]!
|
||||
it le
|
||||
bxle lr
|
||||
cmp r3, #8
|
||||
bgt 1b
|
||||
@ -78,6 +79,7 @@ function ff_dct_unquantize_h263_intra_neon, export=1
|
||||
ldr r6, [r0, #AC_PRED]
|
||||
add lr, r0, #INTER_SCANTAB_RASTER_END
|
||||
cmp r6, #0
|
||||
it ne
|
||||
movne r12, #63
|
||||
bne 1f
|
||||
ldr r12, [r12, r2, lsl #2]
|
||||
@ -86,9 +88,11 @@ function ff_dct_unquantize_h263_intra_neon, export=1
|
||||
ldrsh r4, [r1]
|
||||
cmp r5, #0
|
||||
mov r5, r1
|
||||
it ne
|
||||
movne r2, #0
|
||||
bne 2f
|
||||
cmp r2, #4
|
||||
it ge
|
||||
addge r0, r0, #4
|
||||
sub r2, r3, #1
|
||||
ldr r6, [r0, #Y_DC_SCALE]
|
||||
|
@ -137,6 +137,7 @@ function ff_rdft_calc_neon, export=1
|
||||
vst1.32 {d22}, [r5,:64]
|
||||
|
||||
cmp r6, #0
|
||||
it eq
|
||||
popeq {r4-r8,pc}
|
||||
|
||||
vmul.f32 d22, d22, d18
|
||||
|
@ -121,11 +121,13 @@ __b_evaluation:
|
||||
ldr r11, [r12, #offW7] @ R11=W7
|
||||
mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
|
||||
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
|
||||
teq r2, #0 @ if null avoid muls
|
||||
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
|
||||
teq r2, #0 @ if null avoid muls
|
||||
itttt ne
|
||||
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
|
||||
rsbne r2, r2, #0 @ R2=-ROWr16[3]
|
||||
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
|
||||
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
|
||||
it ne
|
||||
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
|
||||
|
||||
@@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
|
||||
@ -148,19 +150,23 @@ __b_evaluation:
|
||||
@@ MAC16(b3, -W1, row[7]);
|
||||
@@ MAC16(b1, -W5, row[7]);
|
||||
mov r3, r3, asr #16 @ R3=ROWr16[5]
|
||||
teq r3, #0 @ if null avoid muls
|
||||
teq r3, #0 @ if null avoid muls
|
||||
it ne
|
||||
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
|
||||
mov r4, r4, asr #16 @ R4=ROWr16[7]
|
||||
itttt ne
|
||||
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
|
||||
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
|
||||
rsbne r3, r3, #0 @ R3=-ROWr16[5]
|
||||
mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1
|
||||
@@ R3 is free now
|
||||
teq r4, #0 @ if null avoid muls
|
||||
teq r4, #0 @ if null avoid muls
|
||||
itttt ne
|
||||
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
|
||||
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
|
||||
rsbne r4, r4, #0 @ R4=-ROWr16[7]
|
||||
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
|
||||
it ne
|
||||
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
|
||||
@@ R4 is free now
|
||||
__end_b_evaluation:
|
||||
@ -204,16 +210,19 @@ __a_evaluation:
|
||||
@@ a2 -= W4*row[4]
|
||||
@@ a3 += W4*row[4]
|
||||
ldrsh r11, [r14, #8] @ R11=ROWr16[4]
|
||||
teq r11, #0 @ if null avoid muls
|
||||
teq r11, #0 @ if null avoid muls
|
||||
it ne
|
||||
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
|
||||
@@ R9 is free now
|
||||
ldrsh r9, [r14, #12] @ R9=ROWr16[6]
|
||||
itttt ne
|
||||
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
|
||||
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
|
||||
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
|
||||
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
|
||||
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
|
||||
teq r9, #0 @ if null avoid muls
|
||||
teq r9, #0 @ if null avoid muls
|
||||
itttt ne
|
||||
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
|
||||
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
|
||||
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
|
||||
@ -222,6 +231,7 @@ __a_evaluation:
|
||||
@@ a1 -= W2*row[6];
|
||||
@@ a2 += W2*row[6];
|
||||
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
|
||||
itt ne
|
||||
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
|
||||
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
|
||||
|
||||
@ -323,10 +333,12 @@ __b_evaluation2:
|
||||
ldrsh r2, [r14, #48]
|
||||
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
|
||||
teq r2, #0 @ if 0, then avoid muls
|
||||
itttt ne
|
||||
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
|
||||
rsbne r2, r2, #0 @ R2=-ROWr16[3]
|
||||
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
|
||||
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
|
||||
it ne
|
||||
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
|
||||
|
||||
@@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
|
||||
@ -342,18 +354,22 @@ __b_evaluation2:
|
||||
@@ MAC16(b1, -W5, col[7x8]);
|
||||
ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
|
||||
teq r3, #0 @ if 0 then avoid muls
|
||||
itttt ne
|
||||
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
|
||||
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
|
||||
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
|
||||
rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
|
||||
ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
|
||||
it ne
|
||||
mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1 (R3 was negated above, so the multiply-accumulate subtracts)
|
||||
@@ R3 is free now
|
||||
teq r4, #0 @ if 0 then avoid muls
|
||||
itttt ne
|
||||
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
|
||||
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
|
||||
rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
|
||||
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
|
||||
it ne
|
||||
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
|
||||
@@ R4 is free now
|
||||
__end_b_evaluation2:
|
||||
@ -390,15 +406,18 @@ __a_evaluation2:
|
||||
@@ a3 += W4*row[4]
|
||||
ldrsh r11, [r14, #64] @ R11=ROWr16[4]
|
||||
teq r11, #0 @ if null avoid muls
|
||||
itttt ne
|
||||
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
|
||||
@@ R9 is free now
|
||||
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
|
||||
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
|
||||
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
|
||||
ldrsh r9, [r14, #96] @ R9=ROWr16[6]
|
||||
it ne
|
||||
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
|
||||
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
|
||||
teq r9, #0 @ if null avoid muls
|
||||
itttt ne
|
||||
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
|
||||
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
|
||||
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
|
||||
@ -407,6 +426,7 @@ __a_evaluation2:
|
||||
@@ a1 -= W2*row[6];
|
||||
@@ a2 += W2*row[6];
|
||||
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
|
||||
itt ne
|
||||
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
|
||||
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
|
||||
__end_a_evaluation2:
|
||||
|
@ -49,6 +49,7 @@ function idct_row_armv5te
|
||||
ldrd v1, [a1, #8]
|
||||
ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */
|
||||
orrs v1, v1, v2
|
||||
itt eq
|
||||
cmpeq v1, a4
|
||||
cmpeq v1, a3, lsr #16
|
||||
beq row_dc_only
|
||||
@ -269,6 +270,7 @@ function idct_col_armv5te
|
||||
ldmfd sp!, {a3, a4}
|
||||
adds a2, a3, v1
|
||||
mov a2, a2, lsr #20
|
||||
it mi
|
||||
orrmi a2, a2, #0xf000
|
||||
add ip, a4, v2
|
||||
mov ip, ip, asr #20
|
||||
@ -276,6 +278,7 @@ function idct_col_armv5te
|
||||
str a2, [a1]
|
||||
subs a3, a3, v1
|
||||
mov a2, a3, lsr #20
|
||||
it mi
|
||||
orrmi a2, a2, #0xf000
|
||||
sub a4, a4, v2
|
||||
mov a4, a4, asr #20
|
||||
@ -285,6 +288,7 @@ function idct_col_armv5te
|
||||
|
||||
subs a2, a3, v3
|
||||
mov a2, a2, lsr #20
|
||||
it mi
|
||||
orrmi a2, a2, #0xf000
|
||||
sub ip, a4, v4
|
||||
mov ip, ip, asr #20
|
||||
@ -292,6 +296,7 @@ function idct_col_armv5te
|
||||
str a2, [a1, #(16*1)]
|
||||
adds a3, a3, v3
|
||||
mov a2, a3, lsr #20
|
||||
it mi
|
||||
orrmi a2, a2, #0xf000
|
||||
add a4, a4, v4
|
||||
mov a4, a4, asr #20
|
||||
@ -301,6 +306,7 @@ function idct_col_armv5te
|
||||
|
||||
adds a2, a3, v5
|
||||
mov a2, a2, lsr #20
|
||||
it mi
|
||||
orrmi a2, a2, #0xf000
|
||||
add ip, a4, v6
|
||||
mov ip, ip, asr #20
|
||||
@ -308,6 +314,7 @@ function idct_col_armv5te
|
||||
str a2, [a1, #(16*2)]
|
||||
subs a3, a3, v5
|
||||
mov a2, a3, lsr #20
|
||||
it mi
|
||||
orrmi a2, a2, #0xf000
|
||||
sub a4, a4, v6
|
||||
mov a4, a4, asr #20
|
||||
@ -317,6 +324,7 @@ function idct_col_armv5te
|
||||
|
||||
adds a2, a3, v7
|
||||
mov a2, a2, lsr #20
|
||||
it mi
|
||||
orrmi a2, a2, #0xf000
|
||||
add ip, a4, fp
|
||||
mov ip, ip, asr #20
|
||||
@ -324,6 +332,7 @@ function idct_col_armv5te
|
||||
str a2, [a1, #(16*3)]
|
||||
subs a3, a3, v7
|
||||
mov a2, a3, lsr #20
|
||||
it mi
|
||||
orrmi a2, a2, #0xf000
|
||||
sub a4, a4, fp
|
||||
mov a4, a4, asr #20
|
||||
@ -335,15 +344,19 @@ endfunc
|
||||
|
||||
/*
 * clip dst, src...
 *
 * Move the operand(s) given in src (a register, optionally with a shift,
 * e.g. "a2, asr #20") into dst and clamp the result to the range 0..255.
 * The condition flags are overwritten (movs and cmp both set them).
 * Each conditional instruction is preceded by an IT instruction so that
 * the macro can also be assembled as Thumb-2.
 */
.macro clip dst, src:vararg
|
||||
movs \dst, \src /* dst = src, setting N so a negative value can be detected */
|
||||
it mi
|
||||
movmi \dst, #0 /* negative -> clamp to 0 */
|
||||
cmp \dst, #255
|
||||
it gt
|
||||
movgt \dst, #255 /* above 255 -> clamp to 255 */
|
||||
.endm
|
||||
|
||||
/*
 * aclip dst, src...
 *
 * Add the operands given in src (e.g. "a2, a3, asr #20"), store the sum
 * in dst and clamp it to the range 0..255.
 * The condition flags are overwritten (adds and cmp both set them).
 * Each conditional instruction is preceded by an IT instruction so that
 * the macro can also be assembled as Thumb-2.
 */
.macro aclip dst, src:vararg
|
||||
adds \dst, \src /* dst = sum of the src operands, setting N for the sign test */
|
||||
it mi
|
||||
movmi \dst, #0 /* negative sum -> clamp to 0 */
|
||||
cmp \dst, #255
|
||||
it gt
|
||||
movgt \dst, #255 /* above 255 -> clamp to 255 */
|
||||
.endm
|
||||
|
||||
@ -370,35 +383,35 @@ function idct_col_put_armv5te
|
||||
orr a2, a3, a4, lsl #8
|
||||
rsb v2, lr, lr, lsl #3
|
||||
ldmfd sp!, {a3, a4}
|
||||
strh a2, [v2, v1]!
|
||||
strh_pre a2, v2, v1
|
||||
|
||||
sub a2, a3, v3
|
||||
clip a2, a2, asr #20
|
||||
sub ip, a4, v4
|
||||
clip ip, ip, asr #20
|
||||
orr a2, a2, ip, lsl #8
|
||||
strh a2, [v1, lr]!
|
||||
strh_pre a2, v1, lr
|
||||
add a3, a3, v3
|
||||
clip a2, a3, asr #20
|
||||
add a4, a4, v4
|
||||
clip a4, a4, asr #20
|
||||
orr a2, a2, a4, lsl #8
|
||||
ldmfd sp!, {a3, a4}
|
||||
strh a2, [v2, -lr]!
|
||||
strh_dpre a2, v2, lr
|
||||
|
||||
add a2, a3, v5
|
||||
clip a2, a2, asr #20
|
||||
add ip, a4, v6
|
||||
clip ip, ip, asr #20
|
||||
orr a2, a2, ip, lsl #8
|
||||
strh a2, [v1, lr]!
|
||||
strh_pre a2, v1, lr
|
||||
sub a3, a3, v5
|
||||
clip a2, a3, asr #20
|
||||
sub a4, a4, v6
|
||||
clip a4, a4, asr #20
|
||||
orr a2, a2, a4, lsl #8
|
||||
ldmfd sp!, {a3, a4}
|
||||
strh a2, [v2, -lr]!
|
||||
strh_dpre a2, v2, lr
|
||||
|
||||
add a2, a3, v7
|
||||
clip a2, a2, asr #20
|
||||
@ -411,7 +424,7 @@ function idct_col_put_armv5te
|
||||
sub a4, a4, fp
|
||||
clip a4, a4, asr #20
|
||||
orr a2, a2, a4, lsl #8
|
||||
strh a2, [v2, -lr]
|
||||
strh_dpre a2, v2, lr
|
||||
|
||||
ldr pc, [sp], #4
|
||||
endfunc
|
||||
@ -436,7 +449,7 @@ function idct_col_add_armv5te
|
||||
ldr v1, [sp, #32]
|
||||
sub a4, a4, v2
|
||||
rsb v2, v1, v1, lsl #3
|
||||
ldrh ip, [v2, lr]!
|
||||
ldrh_pre ip, v2, lr
|
||||
strh a2, [lr]
|
||||
and a2, ip, #255
|
||||
aclip a3, a2, a3, asr #20
|
||||
@ -448,7 +461,7 @@ function idct_col_add_armv5te
|
||||
strh a2, [v2]
|
||||
|
||||
ldmfd sp!, {a3, a4}
|
||||
ldrh ip, [lr, v1]!
|
||||
ldrh_pre ip, lr, v1
|
||||
sub a2, a3, v3
|
||||
add a3, a3, v3
|
||||
and v3, ip, #255
|
||||
@ -458,7 +471,7 @@ function idct_col_add_armv5te
|
||||
aclip v3, v3, ip, lsr #8
|
||||
orr a2, a2, v3, lsl #8
|
||||
add a4, a4, v4
|
||||
ldrh ip, [v2, -v1]!
|
||||
ldrh_dpre ip, v2, v1
|
||||
strh a2, [lr]
|
||||
and a2, ip, #255
|
||||
aclip a3, a2, a3, asr #20
|
||||
@ -468,7 +481,7 @@ function idct_col_add_armv5te
|
||||
strh a2, [v2]
|
||||
|
||||
ldmfd sp!, {a3, a4}
|
||||
ldrh ip, [lr, v1]!
|
||||
ldrh_pre ip, lr, v1
|
||||
add a2, a3, v5
|
||||
sub a3, a3, v5
|
||||
and v3, ip, #255
|
||||
@ -478,7 +491,7 @@ function idct_col_add_armv5te
|
||||
aclip v3, v3, ip, lsr #8
|
||||
orr a2, a2, v3, lsl #8
|
||||
sub a4, a4, v6
|
||||
ldrh ip, [v2, -v1]!
|
||||
ldrh_dpre ip, v2, v1
|
||||
strh a2, [lr]
|
||||
and a2, ip, #255
|
||||
aclip a3, a2, a3, asr #20
|
||||
@ -488,7 +501,7 @@ function idct_col_add_armv5te
|
||||
strh a2, [v2]
|
||||
|
||||
ldmfd sp!, {a3, a4}
|
||||
ldrh ip, [lr, v1]!
|
||||
ldrh_pre ip, lr, v1
|
||||
add a2, a3, v7
|
||||
sub a3, a3, v7
|
||||
and v3, ip, #255
|
||||
@ -498,7 +511,7 @@ function idct_col_add_armv5te
|
||||
aclip v3, v3, ip, lsr #8
|
||||
orr a2, a2, v3, lsl #8
|
||||
sub a4, a4, fp
|
||||
ldrh ip, [v2, -v1]!
|
||||
ldrh_dpre ip, v2, v1
|
||||
strh a2, [lr]
|
||||
and a2, ip, #255
|
||||
aclip a3, a2, a3, asr #20
|
||||
|
@ -200,6 +200,7 @@ function idct_row_armv6
|
||||
ldr r3, [r0, #8] /* r3 = row[3,1] */
|
||||
ldr r2, [r0] /* r2 = row[2,0] */
|
||||
orrs lr, lr, ip
|
||||
itt eq
|
||||
cmpeq lr, r3
|
||||
cmpeq lr, r2, lsr #16
|
||||
beq 1f
|
||||
@ -282,14 +283,14 @@ function idct_col_put_armv6
|
||||
pop {r1, r2}
|
||||
idct_finish_shift_sat COL_SHIFT
|
||||
|
||||
strb r4, [r1], r2
|
||||
strb r5, [r1], r2
|
||||
strb r6, [r1], r2
|
||||
strb r7, [r1], r2
|
||||
strb r11,[r1], r2
|
||||
strb r10,[r1], r2
|
||||
strb r9, [r1], r2
|
||||
strb r8, [r1], r2
|
||||
strb_post r4, r1, r2
|
||||
strb_post r5, r1, r2
|
||||
strb_post r6, r1, r2
|
||||
strb_post r7, r1, r2
|
||||
strb_post r11,r1, r2
|
||||
strb_post r10,r1, r2
|
||||
strb_post r9, r1, r2
|
||||
strb_post r8, r1, r2
|
||||
|
||||
sub r1, r1, r2, lsl #3
|
||||
|
||||
@ -318,16 +319,16 @@ function idct_col_add_armv6
|
||||
add ip, r3, ip, asr #COL_SHIFT
|
||||
usat ip, #8, ip
|
||||
add r4, r7, r4, asr #COL_SHIFT
|
||||
strb ip, [r1], r2
|
||||
strb_post ip, r1, r2
|
||||
ldrb ip, [r1, r2]
|
||||
usat r4, #8, r4
|
||||
ldrb r11,[r1, r2, lsl #2]
|
||||
add r5, ip, r5, asr #COL_SHIFT
|
||||
usat r5, #8, r5
|
||||
strb r4, [r1], r2
|
||||
strb_post r4, r1, r2
|
||||
ldrb r3, [r1, r2]
|
||||
ldrb ip, [r1, r2, lsl #2]
|
||||
strb r5, [r1], r2
|
||||
strb_post r5, r1, r2
|
||||
ldrb r7, [r1, r2]
|
||||
ldrb r4, [r1, r2, lsl #2]
|
||||
add r6, r3, r6, asr #COL_SHIFT
|
||||
@ -340,11 +341,11 @@ function idct_col_add_armv6
|
||||
usat r8, #8, r8
|
||||
add lr, r4, lr, asr #COL_SHIFT
|
||||
usat lr, #8, lr
|
||||
strb r6, [r1], r2
|
||||
strb r10,[r1], r2
|
||||
strb r9, [r1], r2
|
||||
strb r8, [r1], r2
|
||||
strb lr, [r1], r2
|
||||
strb_post r6, r1, r2
|
||||
strb_post r10,r1, r2
|
||||
strb_post r9, r1, r2
|
||||
strb_post r8, r1, r2
|
||||
strb_post lr, r1, r2
|
||||
|
||||
sub r1, r1, r2, lsl #3
|
||||
|
||||
|
@ -71,7 +71,7 @@ function idct_row4_pld_neon
|
||||
add r3, r0, r1, lsl #2
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
pld [r3, -r1]
|
||||
A pld [r3, -r1]
|
||||
pld [r3]
|
||||
pld [r3, r1]
|
||||
add r3, r3, r1, lsl #1
|
||||
@ -164,6 +164,7 @@ function idct_col4_neon
|
||||
orrs r4, r4, r5
|
||||
|
||||
idct_col4_top
|
||||
it eq
|
||||
addeq r2, r2, #16
|
||||
beq 1f
|
||||
|
||||
@ -176,6 +177,7 @@ function idct_col4_neon
|
||||
|
||||
1: orrs r6, r6, r7
|
||||
ldrd r4, [r2, #16]
|
||||
it eq
|
||||
addeq r2, r2, #16
|
||||
beq 2f
|
||||
|
||||
@ -187,6 +189,7 @@ function idct_col4_neon
|
||||
|
||||
2: orrs r4, r4, r5
|
||||
ldrd r4, [r2, #16]
|
||||
it eq
|
||||
addeq r2, r2, #16
|
||||
beq 3f
|
||||
|
||||
@ -199,6 +202,7 @@ function idct_col4_neon
|
||||
vadd.i32 q13, q13, q8
|
||||
|
||||
3: orrs r4, r4, r5
|
||||
it eq
|
||||
addeq r2, r2, #16
|
||||
beq 4f
|
||||
|
||||
|
@ -100,9 +100,11 @@ NOVFP vldr s0, [sp, #12*4] @ scale
|
||||
vst1.32 {q9}, [r2,:128]
|
||||
|
||||
subs r1, r1, #1
|
||||
it eq
|
||||
popeq {r4-r11,pc}
|
||||
|
||||
cmp r4, #0
|
||||
itt eq
|
||||
subeq r8, r8, #512*4
|
||||
subeq r9, r9, #512*4
|
||||
sub r5, r5, #512*4
|
||||
|
@ -21,6 +21,14 @@
|
||||
#ifndef AVCODEC_ARM_VP56_ARITH_H
|
||||
#define AVCODEC_ARM_VP56_ARITH_H
|
||||
|
||||
/*
 * Instruction-set selectors for the inline asm in this header:
 *   A(x) expands to x only when CONFIG_THUMB is off (ARM-only text),
 *   T(x) expands to x only when CONFIG_THUMB is on (Thumb-only text).
 * The unselected variant expands to nothing, so both forms of an
 * instruction can be listed next to each other in one asm string.
 */
#if CONFIG_THUMB
|
||||
# define A(x)
|
||||
# define T(x) x
|
||||
#else
|
||||
# define A(x) x
|
||||
# define T(x)
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV6 && HAVE_INLINE_ASM
|
||||
|
||||
#define vp56_rac_get_prob vp56_rac_get_prob_armv6
|
||||
@ -32,15 +40,21 @@ static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr)
|
||||
unsigned bit;
|
||||
|
||||
__asm__ ("adds %3, %3, %0 \n"
|
||||
"itt cs \n"
|
||||
"cmpcs %7, %4 \n"
|
||||
"ldrcsh %2, [%4], #2 \n"
|
||||
A("ldrcsh %2, [%4], #2 \n")
|
||||
T("ldrhcs %2, [%4], #2 \n")
|
||||
"rsb %0, %6, #256 \n"
|
||||
"smlabb %0, %5, %6, %0 \n"
|
||||
T("itttt cs \n")
|
||||
"rev16cs %2, %2 \n"
|
||||
"orrcs %1, %1, %2, lsl %3 \n"
|
||||
T("lslcs %2, %2, %3 \n")
|
||||
T("orrcs %1, %1, %2 \n")
|
||||
A("orrcs %1, %1, %2, lsl %3 \n")
|
||||
"subcs %3, %3, #16 \n"
|
||||
"lsr %0, %0, #8 \n"
|
||||
"cmp %1, %0, lsl #16 \n"
|
||||
"ittte ge \n"
|
||||
"subge %1, %1, %0, lsl #16 \n"
|
||||
"subge %0, %5, %0 \n"
|
||||
"movge %2, #1 \n"
|
||||
@ -64,12 +78,17 @@ static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr)
|
||||
unsigned tmp;
|
||||
|
||||
__asm__ ("adds %3, %3, %0 \n"
|
||||
"itt cs \n"
|
||||
"cmpcs %7, %4 \n"
|
||||
"ldrcsh %2, [%4], #2 \n"
|
||||
A("ldrcsh %2, [%4], #2 \n")
|
||||
T("ldrhcs %2, [%4], #2 \n")
|
||||
"rsb %0, %6, #256 \n"
|
||||
"smlabb %0, %5, %6, %0 \n"
|
||||
T("itttt cs \n")
|
||||
"rev16cs %2, %2 \n"
|
||||
"orrcs %1, %1, %2, lsl %3 \n"
|
||||
T("lslcs %2, %2, %3 \n")
|
||||
T("orrcs %1, %1, %2 \n")
|
||||
A("orrcs %1, %1, %2, lsl %3 \n")
|
||||
"subcs %3, %3, #16 \n"
|
||||
"lsr %0, %0, #8 \n"
|
||||
"lsl %2, %0, #16 \n"
|
||||
|
@ -25,13 +25,18 @@
|
||||
lsl \cw, \cw, \t0
|
||||
lsl \t0, \h, \t0
|
||||
rsb \h, \pr, #256
|
||||
it cs
|
||||
ldrhcs \t1, [\buf], #2
|
||||
smlabb \h, \t0, \pr, \h
|
||||
T itttt cs
|
||||
rev16cs \t1, \t1
|
||||
orrcs \cw, \cw, \t1, lsl \bs
|
||||
A orrcs \cw, \cw, \t1, lsl \bs
|
||||
T lslcs \t1, \t1, \bs
|
||||
T orrcs \cw, \cw, \t1
|
||||
subcs \bs, \bs, #16
|
||||
lsr \h, \h, #8
|
||||
cmp \cw, \h, lsl #16
|
||||
itt ge
|
||||
subge \cw, \cw, \h, lsl #16
|
||||
subge \h, \t0, \h
|
||||
.endm
|
||||
@ -40,14 +45,20 @@
|
||||
adds \bs, \bs, \t0
|
||||
lsl \cw, \cw, \t0
|
||||
lsl \t0, \h, \t0
|
||||
it cs
|
||||
ldrhcs \t1, [\buf], #2
|
||||
mov \h, #128
|
||||
it cs
|
||||
rev16cs \t1, \t1
|
||||
add \h, \h, \t0, lsl #7
|
||||
orrcs \cw, \cw, \t1, lsl \bs
|
||||
A orrcs \cw, \cw, \t1, lsl \bs
|
||||
T ittt cs
|
||||
T lslcs \t1, \t1, \bs
|
||||
T orrcs \cw, \cw, \t1
|
||||
subcs \bs, \bs, #16
|
||||
lsr \h, \h, #8
|
||||
cmp \cw, \h, lsl #16
|
||||
itt ge
|
||||
subge \cw, \cw, \h, lsl #16
|
||||
subge \h, \t0, \h
|
||||
.endm
|
||||
@ -59,6 +70,7 @@ function ff_decode_block_coeffs_armv6, export=1
|
||||
cmp r3, #0
|
||||
ldr r11, [r5]
|
||||
ldm r0, {r5-r7} @ high, bits, buf
|
||||
it ne
|
||||
pkhtbne r11, r11, r11, asr #16
|
||||
ldr r8, [r0, #16] @ code_word
|
||||
0:
|
||||
@ -80,19 +92,26 @@ function ff_decode_block_coeffs_armv6, export=1
|
||||
adds r6, r6, r9
|
||||
add r4, r4, #11
|
||||
lsl r8, r8, r9
|
||||
it cs
|
||||
ldrhcs r10, [r7], #2
|
||||
lsl r9, r5, r9
|
||||
mov r5, #128
|
||||
it cs
|
||||
rev16cs r10, r10
|
||||
add r5, r5, r9, lsl #7
|
||||
orrcs r8, r8, r10, lsl r6
|
||||
T ittt cs
|
||||
T lslcs r10, r10, r6
|
||||
T orrcs r8, r8, r10
|
||||
A orrcs r8, r8, r10, lsl r6
|
||||
subcs r6, r6, #16
|
||||
lsr r5, r5, #8
|
||||
cmp r8, r5, lsl #16
|
||||
movrel r10, zigzag_scan-1
|
||||
itt ge
|
||||
subge r8, r8, r5, lsl #16
|
||||
subge r5, r9, r5
|
||||
ldrb r10, [r10, r3]
|
||||
it ge
|
||||
rsbge r12, r12, #0
|
||||
cmp r3, #16
|
||||
strh r12, [r1, r10]
|
||||
@ -108,6 +127,7 @@ function ff_decode_block_coeffs_armv6, export=1
|
||||
ldr r0, [sp]
|
||||
ldr r9, [r0, #12]
|
||||
cmp r7, r9
|
||||
it hi
|
||||
movhi r7, r9
|
||||
stm r0, {r5-r7} @ high, bits, buf
|
||||
str r8, [r0, #16] @ code_word
|
||||
@ -131,11 +151,13 @@ function ff_decode_block_coeffs_armv6, export=1
|
||||
mov r12, #2
|
||||
ldrb r0, [r4, #4]
|
||||
rac_get_prob r5, r6, r7, r8, r0, r9, r10
|
||||
it ge
|
||||
addge r12, #1
|
||||
ldrb r9, [lr, r5]
|
||||
blt 4f
|
||||
ldrb r0, [r4, #5]
|
||||
rac_get_prob r5, r6, r7, r8, r0, r9, r10
|
||||
it ge
|
||||
addge r12, #1
|
||||
ldrb r9, [lr, r5]
|
||||
b 4f
|
||||
@ -153,6 +175,7 @@ function ff_decode_block_coeffs_armv6, export=1
|
||||
mov r12, #5
|
||||
mov r0, #159
|
||||
rac_get_prob r5, r6, r7, r8, r0, r9, r10
|
||||
it ge
|
||||
addge r12, r12, #1
|
||||
ldrb r9, [lr, r5]
|
||||
b 4f
|
||||
@ -160,23 +183,28 @@ function ff_decode_block_coeffs_armv6, export=1
|
||||
mov r12, #7
|
||||
mov r0, #165
|
||||
rac_get_prob r5, r6, r7, r8, r0, r9, r10
|
||||
it ge
|
||||
addge r12, r12, #2
|
||||
ldrb r9, [lr, r5]
|
||||
mov r0, #145
|
||||
rac_get_prob r5, r6, r7, r8, r0, r9, r10
|
||||
it ge
|
||||
addge r12, r12, #1
|
||||
ldrb r9, [lr, r5]
|
||||
b 4f
|
||||
3:
|
||||
ldrb r0, [r4, #8]
|
||||
rac_get_prob r5, r6, r7, r8, r0, r9, r10
|
||||
it ge
|
||||
addge r4, r4, #1
|
||||
ldrb r9, [lr, r5]
|
||||
ite ge
|
||||
movge r12, #2
|
||||
movlt r12, #0
|
||||
ldrb r0, [r4, #9]
|
||||
rac_get_prob r5, r6, r7, r8, r0, r9, r10
|
||||
mov r9, #8
|
||||
it ge
|
||||
addge r12, r12, #1
|
||||
movrel r4, X(ff_vp8_dct_cat_prob)
|
||||
lsl r9, r9, r12
|
||||
@ -189,6 +217,7 @@ function ff_decode_block_coeffs_armv6, export=1
|
||||
lsl r1, r1, #1
|
||||
rac_get_prob r5, r6, r7, r8, r0, r9, r10
|
||||
ldrb r0, [r4], #1
|
||||
it ge
|
||||
addge r1, r1, #1
|
||||
cmp r0, #0
|
||||
bne 1b
|
||||
@ -200,6 +229,7 @@ function ff_decode_block_coeffs_armv6, export=1
|
||||
add r4, r2, r4
|
||||
add r4, r4, #22
|
||||
rac_get_128 r5, r6, r7, r8, r9, r10
|
||||
it ge
|
||||
rsbge r12, r12, #0
|
||||
smulbb r12, r12, r11
|
||||
movrel r9, zigzag_scan-1
|
||||
|
@ -746,14 +746,14 @@ function ff_put_vp8_pixels4_neon, export=1
|
||||
push {r4-r6,lr}
|
||||
1:
|
||||
subs r12, r12, #4
|
||||
ldr r4, [r2], r3
|
||||
ldr r5, [r2], r3
|
||||
ldr r6, [r2], r3
|
||||
ldr lr, [r2], r3
|
||||
str r4, [r0], r1
|
||||
str r5, [r0], r1
|
||||
str r6, [r0], r1
|
||||
str lr, [r0], r1
|
||||
ldr_post r4, r2, r3
|
||||
ldr_post r5, r2, r3
|
||||
ldr_post r6, r2, r3
|
||||
ldr_post lr, r2, r3
|
||||
str_post r4, r0, r1
|
||||
str_post r5, r0, r1
|
||||
str_post r6, r0, r1
|
||||
str_post lr, r0, r1
|
||||
bgt 1b
|
||||
pop {r4-r6,pc}
|
||||
endfunc
|
||||
|
@ -36,6 +36,7 @@ static av_always_inline av_const int FASTDIV(int a, int b)
|
||||
int r;
|
||||
__asm__ ("cmp %2, #2 \n\t"
|
||||
"ldr %0, [%3, %2, lsl #2] \n\t"
|
||||
"ite le \n\t"
|
||||
"lsrle %0, %1, #1 \n\t"
|
||||
"smmulgt %0, %0, %1 \n\t"
|
||||
: "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc");
|
||||
@ -101,6 +102,7 @@ static av_always_inline av_const int32_t av_clipl_int32_arm(int64_t a)
|
||||
{
|
||||
int x, y;
|
||||
__asm__ ("adds %1, %R2, %Q2, lsr #31 \n\t"
|
||||
"itet ne \n\t"
|
||||
"mvnne %1, #1<<31 \n\t"
|
||||
"moveq %0, %Q2 \n\t"
|
||||
"eorne %0, %1, %R2, asr #31 \n\t"
|
||||
|
Loading…
Reference in New Issue
Block a user