From e4a27e2f2dea60fb0cce6e555a6a8296e50edc54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20B=C5=93sch?= <cboesch@gopro.com>
Date: Thu, 22 Jun 2017 11:04:26 +0200
Subject: [PATCH] lavc/arm: fix lack of precision in
 ff_ps_stereo_interpolate_neon

The code originally pre-multiplied the steps by 2, causing the running
sum of the h factors to drift away due to the lack of precision. It
quickly causes an inaccuracy > 0.01.

I tried diverse approaches such as multiplying by 2.0 (instead of adding
the value itself) without success.

I'm unable to bench the impact of this change, feel free to compare.

This commit fixes the incoming aacpsdsp tests.

Following is an alternative simplified function (matching the incoming
AArch64 code) that may be used:

@ Simplified stereo interpolation: each iteration advances the running h
@ factors by one step and mixes one 2-float pair from [r0] with one from [r1].
@ NOTE(review): presumably r0 = l, r1 = r, r2 = h, r3 = h_step and the word
@ at [sp] is len (fifth argument under AAPCS) -- confirm against the caller.
function ff_ps_stereo_interpolate_neon, export=1
        @ q0 = four h factors {a, b, c, d}; q1 = their four per-iteration steps
        vld1.32         {q0}, [r2]
        vld1.32         {q1}, [r3]
        @ r12 = loop counter, read from the word at the top of the stack
        ldr             r12, [sp]
        @ Duplicate each lane so one factor scales both floats of a pair:
        @ q8 = {a, a, b, b}, q0 = {c, c, d, d}; same layout for steps in q9/q1
        vmov.f32        q8, q0
        vmov.f32        q9, q1
        vzip.32         q8, q0
        vzip.32         q9, q1
1:
        @ d4 = pair from [r0], d6 = pair from [r1] (pointers not advanced yet)
        vld1.32         {d4}, [r0,:64]
        vld1.32         {d6}, [r1,:64]
        @ Advance the running factors by exactly one step per iteration,
        @ before they are used (no pre-multiplied step)
        vadd.f32        q8, q8, q9
        vadd.f32        q0, q0, q1
        @ Mirror each pair into the high half: q2 = {d4, d4}, q3 = {d6, d6}
        vmov.f32        d5, d4
        vmov.f32        d7, d6
        @ q2 = q2 * {a, a, b, b} + q3 * {c, c, d, d}
        vmul.f32        q2, q2, q8
        vmla.f32        q2, q3, q0
        @ Low half is written back to [r0], high half to [r1]; the
        @ post-increment stores are what advance both pointers
        vst1.32         {d4}, [r0,:64]!
        vst1.32         {d5}, [r1,:64]!
        @ Body always runs at least once; loop while the decremented count > 0
        subs            r12, r12, #1
        bgt             1b
        bx              lr
endfunc
---
 libavcodec/arm/aacpsdsp_neon.S | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/libavcodec/arm/aacpsdsp_neon.S b/libavcodec/arm/aacpsdsp_neon.S
index a93bbfea9c..3b1bed2aa7 100644
--- a/libavcodec/arm/aacpsdsp_neon.S
+++ b/libavcodec/arm/aacpsdsp_neon.S
@@ -232,12 +232,11 @@ endfunc
 function ff_ps_stereo_interpolate_neon, export=1
         vld1.32         {q0},     [r2]
         vld1.32         {q14},    [r3]
-        vadd.f32        q15, q14, q14
         mov             r2,  r0
         mov             r3,  r1
         ldr             r12, [sp]
         vadd.f32        q1,  q0,  q14
-        vadd.f32        q0,  q0,  q15
+        vadd.f32        q0,  q1,  q14
         vld1.32         {q2},     [r0,:64]!
         vld1.32         {q3},     [r1,:64]!
         subs            r12, r12, #1
@@ -251,8 +250,10 @@ function ff_ps_stereo_interpolate_neon, export=1
         vmla.f32        d17, d7,  d1[0]
         vmla.f32        d18, d6,  d3[1]
         vmla.f32        d19, d7,  d1[1]
-        vadd.f32        q1,  q1,  q15
-        vadd.f32        q0,  q0,  q15
+        vadd.f32        q1,  q1,  q14
+        vadd.f32        q0,  q0,  q14
+        vadd.f32        q1,  q1,  q14
+        vadd.f32        q0,  q0,  q14
         vld1.32         {q2},     [r0,:64]!
         vld1.32         {q3},     [r1,:64]!
         vst1.32         {q8},     [r2,:64]!