x86/tx_float: add support for calling assembly functions from assembly
Needed for the next patch. We get this at the extremely small cost of one extra branch in the _ns functions, which wouldn't be used anyway when calling from assembly.
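The pattern is the same for every codelet touched below. As a rough sketch only (the names here are made up, and it assumes FFmpeg's x86inc conventions plus this file's ff_tx_ symbol prefix): the _asm entry point is declared without a prologue and ends in a bare `ret`, so other assembly in the same file can `call` it directly, while the C-callable _ns codelet shrinks to a thin wrapper whose only overhead is that one call:

%macro EXAMPLE_FN 1 ; %1 != 0 emits the asm->asm entry point and its _ns wrapper
INIT_XMM sse3
%if %1
cglobal example_asm_float, 0, 0, 0, ctx, out, in, tmp ; no prologue is emitted
    movaps m0, [inq]                  ; stand-in for the real transform body
    movaps [outq], m0
    ret                               ; bare ret: the caller is other assembly
%else
cglobal example_float, 4, 4, 1, ctx, out, in, tmp
    movaps m0, [inq]
    movaps [outq], m0
    RET                               ; x86inc epilogue: the caller is C
%endif

%if %1
cglobal example_ns_float, 4, 4, 1, ctx, out, in, tmp
    ; the prologue has just loaded ctx/out/in/tmp, so the registers are
    ; already set up exactly as the asm entry point expects them
    call ff_tx_example_asm_float_sse3 ; the "extremely small cost of a branch"
    RET
%endif
%endmacro

EXAMPLE_FN 0 ; C-callable codelet only
EXAMPLE_FN 1 ; asm entry point plus its C-callable _ns wrapper

The new FF_TX_ASM_CALL flag then marks the _asm codelets so the generic init code only ever selects them when a caller requests the flag explicitly, which is what adding it to inv_req_mask below achieves.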
@@ -313,6 +313,8 @@ static void print_flags(AVBPrint *bp, uint64_t f)
         av_bprintf(bp, "%spreshuf", prev > 1 ? sep : "");
     if ((f & AV_TX_FULL_IMDCT) && ++prev)
         av_bprintf(bp, "%simdct_full", prev > 1 ? sep : "");
+    if ((f & FF_TX_ASM_CALL) && ++prev)
+        av_bprintf(bp, "%sasm_call", prev > 1 ? sep : "");
     av_bprintf(bp, "]");
 }
 
@@ -484,7 +486,7 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
     uint64_t req_flags = flags;
 
     /* Flags the codelet may require to be present */
-    uint64_t inv_req_mask = AV_TX_FULL_IMDCT | FF_TX_PRESHUFFLE;
+    uint64_t inv_req_mask = AV_TX_FULL_IMDCT | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL;
 
     /* Unaligned codelets are compatible with the aligned flag */
     if (req_flags & FF_TX_ALIGNED)
@@ -145,6 +145,7 @@ typedef void TXComplex;
 #define FF_TX_PRESHUFFLE (1ULL << 61) /* Codelet expects permuted coeffs */
 #define FF_TX_INVERSE_ONLY (1ULL << 60) /* For non-orthogonal inverse-only transforms */
 #define FF_TX_FORWARD_ONLY (1ULL << 59) /* For non-orthogonal forward-only transforms */
+#define FF_TX_ASM_CALL (1ULL << 58) /* For asm->asm functions only */
 
 typedef enum FFTXCodeletPriority {
     FF_TX_PRIO_BASE = 0, /* Baseline priority */
@@ -707,20 +707,21 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
 FFT4 fwd, 0
 FFT4 inv, 1
 
-%macro FFT8_SSE_FN 2
+%macro FFT8_SSE_FN 1
 INIT_XMM sse3
-cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp
-%if %2
+%if %1
+cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, tmp
+    movaps m0, [inq + 0*mmsize]
+    movaps m1, [inq + 1*mmsize]
+    movaps m2, [inq + 2*mmsize]
+    movaps m3, [inq + 3*mmsize]
+%else
+cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
     mov ctxq, [ctxq + AVTXContext.map]
     LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
     LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
     LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
     LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
-%else
-    movaps m0, [inq + 0*mmsize]
-    movaps m1, [inq + 1*mmsize]
-    movaps m2, [inq + 2*mmsize]
-    movaps m3, [inq + 3*mmsize]
 %endif
 
     FFT8 m0, m1, m2, m3, m4, m5
@@ -735,22 +736,33 @@ cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp
     movups [outq + 2*mmsize], m5
     movups [outq + 3*mmsize], m1
 
+%if %1
+    ret
+%else
     RET
+%endif
+
+%if %1
+cglobal fft8_ns_float, 4, 4, 6, ctx, out, in, tmp
+    call ff_tx_fft8_asm_float_sse3
+    RET
+%endif
 %endmacro
 
-FFT8_SSE_FN float, 1
-FFT8_SSE_FN ns_float, 0
+FFT8_SSE_FN 0
+FFT8_SSE_FN 1
 
-%macro FFT8_AVX_FN 2
+%macro FFT8_AVX_FN 1
 INIT_YMM avx
-cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp
-%if %2
+%if %1
+cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, tmp
+    movaps m0, [inq + 0*mmsize]
+    movaps m1, [inq + 1*mmsize]
+%else
+cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
     mov ctxq, [ctxq + AVTXContext.map]
     LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
     LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
-%else
-    movaps m0, [inq + 0*mmsize]
-    movaps m1, [inq + 1*mmsize]
 %endif
 
     FFT8_AVX m0, m1, m2, m3
@@ -764,21 +776,32 @@ cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp
     vextractf128 [outq + 16*2], m2, 1
     vextractf128 [outq + 16*3], m0, 1
 
+%if %1
+    ret
+%else
     RET
+%endif
+
+%if %1
+cglobal fft8_ns_float, 4, 4, 4, ctx, out, in, tmp
+    call ff_tx_fft8_asm_float_avx
+    RET
+%endif
 %endmacro
 
-FFT8_AVX_FN float, 1
-FFT8_AVX_FN ns_float, 0
+FFT8_AVX_FN 0
+FFT8_AVX_FN 1
 
-%macro FFT16_FN 3
+%macro FFT16_FN 2
 INIT_YMM %1
-cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp
-%if %3
+%if %2
+cglobal fft16_asm_float, 0, 0, 0, ctx, out, in, tmp
     movaps m0, [inq + 0*mmsize]
     movaps m1, [inq + 1*mmsize]
     movaps m2, [inq + 2*mmsize]
     movaps m3, [inq + 3*mmsize]
 %else
+cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
     mov ctxq, [ctxq + AVTXContext.map]
     LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
     LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
@@ -802,23 +825,34 @@ cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp
     vextractf128 [outq + 16*6], m5, 1
     vextractf128 [outq + 16*7], m1, 1
 
+%if %2
+    ret
+%else
     RET
+%endif
+
+%if %2
+cglobal fft16_ns_float, 4, 4, 8, ctx, out, in, tmp
+    call ff_tx_fft16_asm_float_ %+ %1
+    RET
+%endif
 %endmacro
 
-FFT16_FN avx, float, 0
-FFT16_FN avx, ns_float, 1
-FFT16_FN fma3, float, 0
-FFT16_FN fma3, ns_float, 1
+FFT16_FN avx, 0
+FFT16_FN avx, 1
+FFT16_FN fma3, 0
+FFT16_FN fma3, 1
 
-%macro FFT32_FN 3
+%macro FFT32_FN 2
 INIT_YMM %1
-cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
-%if %3
+%if %2
+cglobal fft32_asm_float, 0, 0, 0, ctx, out, in, tmp
     movaps m4, [inq + 4*mmsize]
     movaps m5, [inq + 5*mmsize]
     movaps m6, [inq + 6*mmsize]
     movaps m7, [inq + 7*mmsize]
 %else
+cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
     mov ctxq, [ctxq + AVTXContext.map]
     LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m12
     LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m9, m13
@@ -828,7 +862,7 @@ cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
 
     FFT8 m4, m5, m6, m7, m8, m9
 
-%if %3
+%if %2
     movaps m0, [inq + 0*mmsize]
     movaps m1, [inq + 1*mmsize]
     movaps m2, [inq + 2*mmsize]
@@ -875,14 +909,24 @@ cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
     vextractf128 [outq + 16*14], m10, 1
     vextractf128 [outq + 16*15], m5, 1
 
+%if %2
+    ret
+%else
     RET
+%endif
+
+%if %2
+cglobal fft32_ns_float, 4, 4, 16, ctx, out, in, tmp
+    call ff_tx_fft32_asm_float_ %+ %1
+    RET
+%endif
 %endmacro
 
 %if ARCH_X86_64
-FFT32_FN avx, float, 0
-FFT32_FN avx, ns_float, 1
-FFT32_FN fma3, float, 0
-FFT32_FN fma3, ns_float, 1
+FFT32_FN avx, 0
+FFT32_FN avx, 1
+FFT32_FN fma3, 0
+FFT32_FN fma3, 1
 %endif
 
 %macro FFT_SPLIT_RADIX_DEF 1-2
@@ -923,17 +967,21 @@ ALIGN 16
 %endif
 %endmacro
 
-%macro FFT_SPLIT_RADIX_FN 3
+%macro FFT_SPLIT_RADIX_FN 2
 INIT_YMM %1
-cglobal fft_sr_ %+ %2, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
-    movsxd lenq, dword [lutq + AVTXContext.len]
-    mov lutq, [lutq + AVTXContext.map]
+%if %2
+cglobal fft_sr_asm_float, 0, 0, 0, ctx, out, in, tmp, len, lut, itab, rtab, tgt
+%else
+cglobal fft_sr_float, 4, 9, 16, 272, ctx, out, in, tmp, len, lut, itab, rtab, tgt
+    movsxd lenq, dword [ctxq + AVTXContext.len]
+    mov lutq, [ctxq + AVTXContext.map]
     mov tgtq, lenq
+%endif
 
 ; Bottom-most/32-point transform ===============================================
 ALIGN 16
 .32pt:
-%if %3
+%if %2
     movaps m4, [inq + 4*mmsize]
     movaps m5, [inq + 5*mmsize]
     movaps m6, [inq + 6*mmsize]
@@ -947,7 +995,7 @@ ALIGN 16
 
     FFT8 m4, m5, m6, m7, m8, m9
 
-%if %3
+%if %2
     movaps m0, [inq + 0*mmsize]
     movaps m1, [inq + 1*mmsize]
     movaps m2, [inq + 2*mmsize]
@@ -972,7 +1020,7 @@ ALIGN 16
     movaps [outq + 5*mmsize], m5
     movaps [outq + 7*mmsize], m7
 
-%if %3
+%if %2
     add inq, 8*mmsize
 %else
     add lutq, (mmsize/2)*8
@@ -1007,7 +1055,7 @@ ALIGN 16
     SWAP m4, m1
     SWAP m6, m3
 
-%if %3
+%if %2
     movaps tx1_e0, [inq + 0*mmsize]
     movaps tx1_e1, [inq + 1*mmsize]
     movaps tx1_o0, [inq + 2*mmsize]
@@ -1021,7 +1069,7 @@ ALIGN 16
 
     FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1
 
-%if %3
+%if %2
     movaps tx2_e0, [inq + 4*mmsize]
     movaps tx2_e1, [inq + 5*mmsize]
     movaps tx2_o0, [inq + 6*mmsize]
@@ -1038,7 +1086,7 @@ ALIGN 16
     movaps tw_e, [tab_64_float]
     vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23
 
-%if %3
+%if %2
     add inq, 8*mmsize
 %else
     add lutq, (mmsize/2)*8
@@ -1201,7 +1249,11 @@ FFT_SPLIT_RADIX_DEF 131072
     sub lenq, 4*mmsize
     jg .synth_deinterleave
 
+%if %2
+    ret
+%else
     RET
+%endif
 
 ; 64-point deinterleave which only has to load 4 registers =====================
 .64pt_deint:
@@ -1278,14 +1330,28 @@ FFT_SPLIT_RADIX_DEF 131072
     vextractf128 [outq + 15*mmsize + 0], tw_o, 1
     vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1
 
+%if %2
+    ret
+%else
     RET
+%endif
+
+%if %2
+cglobal fft_sr_ns_float, 4, 9, 16, 272, ctx, out, in, tmp, len, lut, itab, rtab, tgt
+    movsxd lenq, dword [ctxq + AVTXContext.len]
+    mov lutq, [ctxq + AVTXContext.map]
+    mov tgtq, lenq
+
+    call ff_tx_fft_sr_asm_float_ %+ %1
+    RET
+%endif
 %endmacro
 
 %if ARCH_X86_64
-FFT_SPLIT_RADIX_FN fma3, float, 0
-FFT_SPLIT_RADIX_FN fma3, ns_float, 1
+FFT_SPLIT_RADIX_FN fma3, 0
+FFT_SPLIT_RADIX_FN fma3, 1
 %if HAVE_AVX2_EXTERNAL
-FFT_SPLIT_RADIX_FN avx2, float, 0
-FFT_SPLIT_RADIX_FN avx2, ns_float, 1
+FFT_SPLIT_RADIX_FN avx2, 0
+FFT_SPLIT_RADIX_FN avx2, 1
 %endif
 %endif
@@ -43,6 +43,15 @@ TX_DECL_FN(fft_sr_ns, fma3)
 TX_DECL_FN(fft_sr, avx2)
 TX_DECL_FN(fft_sr_ns, avx2)
 
+TX_DECL_FN(fft8_asm, sse3)
+TX_DECL_FN(fft8_asm, avx)
+TX_DECL_FN(fft16_asm, avx)
+TX_DECL_FN(fft16_asm, fma3)
+TX_DECL_FN(fft32_asm, avx)
+TX_DECL_FN(fft32_asm, fma3)
+TX_DECL_FN(fft_sr_asm, fma3)
+TX_DECL_FN(fft_sr_asm, avx2)
+
 #define DECL_INIT_FN(basis, interleave) \
 static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \
                                                const FFTXCodelet *cd, \
@@ -70,30 +79,46 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
     TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
     TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0),
     TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0),
+    TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3,
+           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0),
     TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
     TX_DEF(fft8, FFT, 8, 8, 2, 0, 256, b8_i0, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
+    TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX,
+           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
     TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
           AV_CPU_FLAG_AVXSLOW),
     TX_DEF(fft16, FFT, 16, 16, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
+    TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX,
+           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
     TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
           AV_CPU_FLAG_AVXSLOW),
     TX_DEF(fft16, FFT, 16, 16, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
+    TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3,
+           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
     TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
           AV_CPU_FLAG_AVXSLOW),
 
 #if ARCH_X86_64
     TX_DEF(fft32, FFT, 32, 32, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
+    TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX,
+           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
     TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
           AV_CPU_FLAG_AVXSLOW),
     TX_DEF(fft32, FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
+    TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3,
+           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
     TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
           AV_CPU_FLAG_AVXSLOW),
     TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 288, b8_i2, fma3, FMA3, 0, AV_CPU_FLAG_AVXSLOW),
+    TX_DEF(fft_sr_asm, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3,
+           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
     TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
           AV_CPU_FLAG_AVXSLOW),
 #if HAVE_AVX2_EXTERNAL
     TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 320, b8_i2, avx2, AVX2, 0,
           AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
+    TX_DEF(fft_sr_asm, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2,
+           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
     TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
           AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
 #endif