mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-26 19:01:44 +02:00
x86/tx_float: adjust internal ASM call ABI again
There are many ways to define the intra-asm call ABI, and this one seems optimal for both MDCTs and PFA FFTs, as it requires neither excessive instructions nor excessive stack usage.
This commit is contained in:
parent
7e7baf8ab8
commit
3241e9225c
@ -22,11 +22,10 @@
|
||||
; based upon and compare.
|
||||
|
||||
; Intra-asm call convention:
|
||||
- ; 272 bytes of stack available
|
||||
- ; First 10 GPRs available
|
||||
+ ; 320 bytes of stack available
|
||||
+ ; 14 GPRs available (last 4 must not be clobbered)
|
||||
+ ; Additionally, don't clobber ctx, in, out, len, lut
|
||||
; All vector regs available
|
||||
- ; Don't clobber ctx, len, lut
|
||||
- ; in and out must point to the end
|
||||
|
||||
; TODO:
|
||||
; carry over registers from smaller transforms to save on ~8 loads/stores
|
||||
@ -686,8 +685,6 @@ cglobal fft2_asm_float, 0, 0, 0, ctx, out, in, stride
|
||||
movaps m0, [inq]
|
||||
FFT2 m0, m1
|
||||
movaps [outq], m0
|
||||
add inq, mmsize*1
|
||||
add outq, mmsize*1
|
||||
ret
|
||||
|
||||
cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
|
||||
@ -721,8 +718,6 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
|
||||
movaps [outq + 1*mmsize], m0
|
||||
|
||||
%if %3
|
||||
add inq, mmsize*2
|
||||
add outq, mmsize*2
|
||||
ret
|
||||
%else
|
||||
RET
|
||||
@ -764,8 +759,6 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
|
||||
movups [outq + 3*mmsize], m1
|
||||
|
||||
%if %1
|
||||
add inq, mmsize*4
|
||||
add outq, mmsize*4
|
||||
ret
|
||||
%else
|
||||
RET
|
||||
@ -806,8 +799,6 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
|
||||
vextractf128 [outq + 16*3], m0, 1
|
||||
|
||||
%if %1
|
||||
add inq, mmsize*2
|
||||
add outq, mmsize*2
|
||||
ret
|
||||
%else
|
||||
RET
|
||||
@ -857,8 +848,6 @@ cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
|
||||
vextractf128 [outq + 16*7], m1, 1
|
||||
|
||||
%if %2
|
||||
add inq, mmsize*4
|
||||
add outq, mmsize*4
|
||||
ret
|
||||
%else
|
||||
RET
|
||||
@ -943,8 +932,6 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
|
||||
vextractf128 [outq + 16*15], m5, 1
|
||||
|
||||
%if %2
|
||||
add inq, mmsize*8
|
||||
add outq, mmsize*8
|
||||
ret
|
||||
%else
|
||||
RET
|
||||
@ -1282,12 +1269,13 @@ FFT_SPLIT_RADIX_DEF 131072
|
||||
add outq, 8*mmsize
|
||||
add rtabq, 4*mmsize
|
||||
sub itabq, 4*mmsize
|
||||
sub lenq, 4*mmsize
|
||||
sub tgtq, 4*mmsize
|
||||
jg .synth_deinterleave
|
||||
|
||||
%if %2
|
||||
mov lenq, tgtq
|
||||
add outq, offq
|
||||
sub outq, tmpq
|
||||
neg tmpq
|
||||
lea inq, [inq + tmpq*4]
|
||||
ret
|
||||
%else
|
||||
RET
|
||||
@ -1369,7 +1357,7 @@ FFT_SPLIT_RADIX_DEF 131072
|
||||
vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1
|
||||
|
||||
%if %2
|
||||
add outq, 16*mmsize
|
||||
sub inq, 16*mmsize
|
||||
ret
|
||||
%else
|
||||
RET
|
||||
|
Loading…
Reference in New Issue
Block a user