arm: vp9itxfm: Move the load_add_store macro out from the itxfm16 pass2 function

This allows reusing the macro for a separate implementation of the pass2 function.

Signed-off-by: Martin Storsjö <martin@martin.st>
2025-08-10 06:10:52 +02:00 · 2017-02-05 22:55:20 +02:00
parent 115476018d
commit 47b3c2c18d
1 changed files with 36 additions and 36 deletions
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -657,6 +657,42 @@ function iadst16
        bx              lr
 endfunc

+.macro load_add_store coef0, coef1, coef2, coef3  @ round-shift 4 coef regs by 6, add to bytes at [r0]/[r3], saturate to u8, store back; clobbers d4-d7
+        vrshr.s16       \coef0, \coef0, #6        @ rounding right shift by 6 on each signed 16-bit lane
+        vrshr.s16       \coef1, \coef1, #6
+
+        vld1.32         {d4[]},   [r0,:32], r1    @ 32 bits from [r0] replicated into both lanes of d4; r0 += r1
+        vld1.32         {d4[1]},  [r3,:32], r1    @ upper lane of d4 replaced by 32 bits from [r3]; r3 += r1
+        vrshr.s16       \coef2, \coef2, #6        @ shifts are interleaved with the loads (presumably for scheduling)
+        vrshr.s16       \coef3, \coef3, #6
+        vld1.32         {d5[]},   [r0,:32], r1    @ same r0/r3 load pair, next step of r1, into d5
+        vld1.32         {d5[1]},  [r3,:32], r1
+        vaddw.u8        \coef0, \coef0, d4        @ widen the 8 unsigned bytes of d4 to 16 bits and add to coef0
+        vld1.32         {d6[]},   [r0,:32], r1
+        vld1.32         {d6[1]},  [r3,:32], r1
+        vaddw.u8        \coef1, \coef1, d5
+        vld1.32         {d7[]},   [r0,:32], r1
+        vld1.32         {d7[1]},  [r3,:32], r1
+
+        vqmovun.s16     d4,  \coef0               @ saturating narrow: signed 16-bit -> unsigned 8-bit, reusing d4
+        vqmovun.s16     d5,  \coef1
+        sub             r0,  r0,  r1, lsl #2      @ rewind r0 by 4*r1, undoing the four post-incremented loads
+        sub             r3,  r3,  r1, lsl #2      @ likewise for r3, so the stores hit the addresses that were loaded
+        vaddw.u8        \coef2, \coef2, d6
+        vaddw.u8        \coef3, \coef3, d7
+        vst1.32         {d4[0]},  [r0,:32], r1    @ lower 32 bits of the result back to [r0]; r0 += r1
+        vst1.32         {d4[1]},  [r3,:32], r1    @ upper 32 bits of the result back to [r3]; r3 += r1
+        vqmovun.s16     d6,  \coef2
+        vst1.32         {d5[0]},  [r0,:32], r1
+        vst1.32         {d5[1]},  [r3,:32], r1
+        vqmovun.s16     d7,  \coef3
+
+        vst1.32         {d6[0]},  [r0,:32], r1
+        vst1.32         {d6[1]},  [r3,:32], r1
+        vst1.32         {d7[0]},  [r0,:32], r1
+        vst1.32         {d7[1]},  [r3,:32], r1    @ on exit r0 and r3 are each 4*r1 past their entry values
+.endm
+
 .macro itxfm16_1d_funcs txfm
@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@ transpose into a horizontal 16x4 slice and store.
@@ -739,44 +775,8 @@ function \txfm\()16_1d_4x16_pass2_neon
        lsl             r1,  r1,  #1
        bl              \txfm\()16

-.macro load_add_store coef0, coef1, coef2, coef3
-        vrshr.s16       \coef0, \coef0, #6
-        vrshr.s16       \coef1, \coef1, #6
-
-        vld1.32         {d4[]},   [r0,:32], r1
-        vld1.32         {d4[1]},  [r3,:32], r1
-        vrshr.s16       \coef2, \coef2, #6
-        vrshr.s16       \coef3, \coef3, #6
-        vld1.32         {d5[]},   [r0,:32], r1
-        vld1.32         {d5[1]},  [r3,:32], r1
-        vaddw.u8        \coef0, \coef0, d4
-        vld1.32         {d6[]},   [r0,:32], r1
-        vld1.32         {d6[1]},  [r3,:32], r1
-        vaddw.u8        \coef1, \coef1, d5
-        vld1.32         {d7[]},   [r0,:32], r1
-        vld1.32         {d7[1]},  [r3,:32], r1
-
-        vqmovun.s16     d4,  \coef0
-        vqmovun.s16     d5,  \coef1
-        sub             r0,  r0,  r1, lsl #2
-        sub             r3,  r3,  r1, lsl #2
-        vaddw.u8        \coef2, \coef2, d6
-        vaddw.u8        \coef3, \coef3, d7
-        vst1.32         {d4[0]},  [r0,:32], r1
-        vst1.32         {d4[1]},  [r3,:32], r1
-        vqmovun.s16     d6,  \coef2
-        vst1.32         {d5[0]},  [r0,:32], r1
-        vst1.32         {d5[1]},  [r3,:32], r1
-        vqmovun.s16     d7,  \coef3
-
-        vst1.32         {d6[0]},  [r0,:32], r1
-        vst1.32         {d6[1]},  [r3,:32], r1
-        vst1.32         {d7[0]},  [r0,:32], r1
-        vst1.32         {d7[1]},  [r3,:32], r1
-.endm
        load_add_store  q8,  q9,  q10, q11
        load_add_store  q12, q13, q14, q15
-.purgem load_add_store

        pop             {pc}
 endfunc