x86/tx_float: adjust internal ASM call ABI again

There are many ways to design the internal ASM call ABI; this one seems optimal for both MDCTs and PFA FFTs, since it requires neither excessive instructions nor excessive stack usage.
2025-08-10 06:10:52 +02:00 · 2022-09-23 10:34:08 +02:00
parent 7e7baf8ab8
commit 3241e9225c
1 changed files with 8 additions and 20 deletions
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -22,11 +22,10 @@
 ; based upon and compare.

 ; Intra-asm call convention:
-;       272 bytes of stack available
-;       First 10 GPRs available
+;       320 bytes of stack available
+;       14 GPRs available (last 4 must not be clobbered)
+;       Additionally, don't clobber ctx, in, out, len, lut
 ;       All vector regs available
-;       Don't clobber ctx, len, lut
-;       in and out must point to the end

 ; TODO:
 ;       carry over registers from smaller transforms to save on ~8 loads/stores
@@ -686,8 +685,6 @@ cglobal fft2_asm_float, 0, 0, 0, ctx, out, in, stride
    movaps m0, [inq]
    FFT2 m0, m1
    movaps [outq], m0
-    add inq, mmsize*1
-    add outq, mmsize*1
    ret

 cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
@@ -721,8 +718,6 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
    movaps [outq + 1*mmsize], m0

 %if %3
-    add inq, mmsize*2
-    add outq, mmsize*2
    ret
 %else
    RET
@@ -764,8 +759,6 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
    movups [outq + 3*mmsize], m1

 %if %1
-    add inq, mmsize*4
-    add outq, mmsize*4
    ret
 %else
    RET
@@ -806,8 +799,6 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
    vextractf128 [outq + 16*3], m0, 1

 %if %1
-    add inq, mmsize*2
-    add outq, mmsize*2
    ret
 %else
    RET
@@ -857,8 +848,6 @@ cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
    vextractf128 [outq + 16*7], m1, 1

 %if %2
-    add inq, mmsize*4
-    add outq, mmsize*4
    ret
 %else
    RET
@@ -943,8 +932,6 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
    vextractf128 [outq + 16*15],  m5, 1

 %if %2
-    add inq, mmsize*8
-    add outq, mmsize*8
    ret
 %else
    RET
@@ -1282,12 +1269,13 @@ FFT_SPLIT_RADIX_DEF 131072
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
-    sub lenq, 4*mmsize
+    sub tgtq, 4*mmsize
    jg .synth_deinterleave

 %if %2
-    mov lenq, tgtq
-    add outq, offq
+    sub outq, tmpq
+    neg tmpq
+    lea inq, [inq + tmpq*4]
    ret
 %else
    RET
@@ -1369,7 +1357,7 @@ FFT_SPLIT_RADIX_DEF 131072
    vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1

 %if %2
-    add outq, 16*mmsize
+    sub inq, 16*mmsize
    ret
 %else
    RET