mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-28 20:53:54 +02:00
lavu/tx: refactor and separate codelet list and prio code
This commit is contained in:
parent
958b3760b5
commit
1c8d77a2bf
125
libavutil/tx.c
125
libavutil/tx.c
@ -300,6 +300,67 @@ static const FFTXCodelet * const ff_tx_null_list[] = {
|
|||||||
NULL,
|
NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* Array of all compiled codelet lists. Order is irrelevant. */
|
||||||
|
static const FFTXCodelet * const * const codelet_list[] = {
|
||||||
|
ff_tx_codelet_list_float_c,
|
||||||
|
ff_tx_codelet_list_double_c,
|
||||||
|
ff_tx_codelet_list_int32_c,
|
||||||
|
ff_tx_null_list,
|
||||||
|
#if HAVE_X86ASM
|
||||||
|
ff_tx_codelet_list_float_x86,
|
||||||
|
#endif
|
||||||
|
#if ARCH_AARCH64
|
||||||
|
ff_tx_codelet_list_float_aarch64,
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
static const int codelet_list_num = FF_ARRAY_ELEMS(codelet_list);
|
||||||
|
|
||||||
|
static const int cpu_slow_mask = AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE3SLOW |
|
||||||
|
AV_CPU_FLAG_ATOM | AV_CPU_FLAG_SSSE3SLOW |
|
||||||
|
AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER;
|
||||||
|
|
||||||
|
static const int cpu_slow_penalties[][2] = {
|
||||||
|
{ AV_CPU_FLAG_SSE2SLOW, 1 + 64 },
|
||||||
|
{ AV_CPU_FLAG_SSE3SLOW, 1 + 64 },
|
||||||
|
{ AV_CPU_FLAG_SSSE3SLOW, 1 + 64 },
|
||||||
|
{ AV_CPU_FLAG_ATOM, 1 + 128 },
|
||||||
|
{ AV_CPU_FLAG_AVXSLOW, 1 + 128 },
|
||||||
|
{ AV_CPU_FLAG_SLOW_GATHER, 1 + 32 },
|
||||||
|
};
|
||||||
|
|
||||||
|
static int get_codelet_prio(const FFTXCodelet *cd, int cpu_flags, int len)
|
||||||
|
{
|
||||||
|
int prio = cd->prio;
|
||||||
|
int max_factor = 0;
|
||||||
|
|
||||||
|
/* If the CPU has a SLOW flag, and the instruction is also flagged
|
||||||
|
* as being slow for such, reduce its priority */
|
||||||
|
for (int i = 0; i < FF_ARRAY_ELEMS(cpu_slow_penalties); i++) {
|
||||||
|
if ((cpu_flags & cd->cpu_flags) & cpu_slow_penalties[i][0])
|
||||||
|
prio -= cpu_slow_penalties[i][1];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Prioritize aligned-only codelets */
|
||||||
|
if ((cd->flags & FF_TX_ALIGNED) && !(cd->flags & AV_TX_UNALIGNED))
|
||||||
|
prio += 64;
|
||||||
|
|
||||||
|
/* Codelets for specific lengths are generally faster */
|
||||||
|
if ((len == cd->min_len) && (len == cd->max_len))
|
||||||
|
prio += 64;
|
||||||
|
|
||||||
|
/* Forward-only or inverse-only transforms are generally better */
|
||||||
|
if ((cd->flags & (FF_TX_FORWARD_ONLY | FF_TX_INVERSE_ONLY)))
|
||||||
|
prio += 64;
|
||||||
|
|
||||||
|
/* Larger factors are generally better */
|
||||||
|
for (int i = 0; i < TX_MAX_SUB; i++)
|
||||||
|
max_factor = FFMAX(cd->factors[i], max_factor);
|
||||||
|
if (max_factor)
|
||||||
|
prio += 16*max_factor;
|
||||||
|
|
||||||
|
return prio;
|
||||||
|
}
|
||||||
|
|
||||||
#if !CONFIG_SMALL
|
#if !CONFIG_SMALL
|
||||||
static void print_flags(AVBPrint *bp, uint64_t f)
|
static void print_flags(AVBPrint *bp, uint64_t f)
|
||||||
{
|
{
|
||||||
@ -465,41 +526,15 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
|
|||||||
AVTXContext *sub = NULL;
|
AVTXContext *sub = NULL;
|
||||||
TXCodeletMatch *cd_tmp, *cd_matches = NULL;
|
TXCodeletMatch *cd_tmp, *cd_matches = NULL;
|
||||||
unsigned int cd_matches_size = 0;
|
unsigned int cd_matches_size = 0;
|
||||||
|
int codelet_list_idx = codelet_list_num;
|
||||||
int nb_cd_matches = 0;
|
int nb_cd_matches = 0;
|
||||||
#if !CONFIG_SMALL
|
#if !CONFIG_SMALL
|
||||||
AVBPrint bp = { 0 };
|
AVBPrint bp = { 0 };
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Array of all compiled codelet lists. Order is irrelevant. */
|
|
||||||
const FFTXCodelet * const * const codelet_list[] = {
|
|
||||||
ff_tx_codelet_list_float_c,
|
|
||||||
ff_tx_codelet_list_double_c,
|
|
||||||
ff_tx_codelet_list_int32_c,
|
|
||||||
ff_tx_null_list,
|
|
||||||
#if HAVE_X86ASM
|
|
||||||
ff_tx_codelet_list_float_x86,
|
|
||||||
#endif
|
|
||||||
#if ARCH_AARCH64
|
|
||||||
ff_tx_codelet_list_float_aarch64,
|
|
||||||
#endif
|
|
||||||
};
|
|
||||||
int codelet_list_num = FF_ARRAY_ELEMS(codelet_list);
|
|
||||||
|
|
||||||
/* We still accept functions marked with SLOW, even if the CPU is
|
/* We still accept functions marked with SLOW, even if the CPU is
|
||||||
* marked with the same flag, but we give them lower priority. */
|
* marked with the same flag, but we give them lower priority. */
|
||||||
const int cpu_flags = av_get_cpu_flags();
|
const int cpu_flags = av_get_cpu_flags();
|
||||||
const int slow_mask = AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE3SLOW |
|
|
||||||
AV_CPU_FLAG_ATOM | AV_CPU_FLAG_SSSE3SLOW |
|
|
||||||
AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER;
|
|
||||||
|
|
||||||
static const int slow_penalties[][2] = {
|
|
||||||
{ AV_CPU_FLAG_SSE2SLOW, 1 + 64 },
|
|
||||||
{ AV_CPU_FLAG_SSE3SLOW, 1 + 64 },
|
|
||||||
{ AV_CPU_FLAG_SSSE3SLOW, 1 + 64 },
|
|
||||||
{ AV_CPU_FLAG_ATOM, 1 + 128 },
|
|
||||||
{ AV_CPU_FLAG_AVXSLOW, 1 + 128 },
|
|
||||||
{ AV_CPU_FLAG_SLOW_GATHER, 1 + 32 },
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Flags the transform wants */
|
/* Flags the transform wants */
|
||||||
uint64_t req_flags = flags;
|
uint64_t req_flags = flags;
|
||||||
@ -519,13 +554,11 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
|
|||||||
|
|
||||||
/* Loop through all codelets in all codelet lists to find matches
|
/* Loop through all codelets in all codelet lists to find matches
|
||||||
* to the requirements */
|
* to the requirements */
|
||||||
while (codelet_list_num--) {
|
while (codelet_list_idx--) {
|
||||||
const FFTXCodelet * const * list = codelet_list[codelet_list_num];
|
const FFTXCodelet * const * list = codelet_list[codelet_list_idx];
|
||||||
const FFTXCodelet *cd = NULL;
|
const FFTXCodelet *cd = NULL;
|
||||||
|
|
||||||
while ((cd = *list++)) {
|
while ((cd = *list++)) {
|
||||||
int max_factor = 0;
|
|
||||||
|
|
||||||
/* Check if the type matches */
|
/* Check if the type matches */
|
||||||
if (cd->type != TX_TYPE_ANY && type != cd->type)
|
if (cd->type != TX_TYPE_ANY && type != cd->type)
|
||||||
continue;
|
continue;
|
||||||
@ -546,7 +579,7 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
|
|||||||
|
|
||||||
/* Check if the CPU supports the required ISA */
|
/* Check if the CPU supports the required ISA */
|
||||||
if (cd->cpu_flags != FF_TX_CPU_FLAGS_ALL &&
|
if (cd->cpu_flags != FF_TX_CPU_FLAGS_ALL &&
|
||||||
!(cpu_flags & (cd->cpu_flags & ~slow_mask)))
|
!(cpu_flags & (cd->cpu_flags & ~cpu_slow_mask)))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
/* Check for factors */
|
/* Check for factors */
|
||||||
@ -563,33 +596,7 @@ av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
|
|||||||
|
|
||||||
cd_matches = cd_tmp;
|
cd_matches = cd_tmp;
|
||||||
cd_matches[nb_cd_matches].cd = cd;
|
cd_matches[nb_cd_matches].cd = cd;
|
||||||
cd_matches[nb_cd_matches].prio = cd->prio;
|
cd_matches[nb_cd_matches].prio = get_codelet_prio(cd, cpu_flags, len);
|
||||||
|
|
||||||
/* If the CPU has a SLOW flag, and the instruction is also flagged
|
|
||||||
* as being slow for such, reduce its priority */
|
|
||||||
for (int i = 0; i < FF_ARRAY_ELEMS(slow_penalties); i++) {
|
|
||||||
if ((cpu_flags & cd->cpu_flags) & slow_penalties[i][0])
|
|
||||||
cd_matches[nb_cd_matches].prio -= slow_penalties[i][1];
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Prioritize aligned-only codelets */
|
|
||||||
if ((cd->flags & FF_TX_ALIGNED) && !(cd->flags & AV_TX_UNALIGNED))
|
|
||||||
cd_matches[nb_cd_matches].prio += 64;
|
|
||||||
|
|
||||||
/* Codelets for specific lengths are generally faster */
|
|
||||||
if ((len == cd->min_len) && (len == cd->max_len))
|
|
||||||
cd_matches[nb_cd_matches].prio += 64;
|
|
||||||
|
|
||||||
/* Forward-only or inverse-only transforms are generally better */
|
|
||||||
if ((cd->flags & (FF_TX_FORWARD_ONLY | FF_TX_INVERSE_ONLY)))
|
|
||||||
cd_matches[nb_cd_matches].prio += 64;
|
|
||||||
|
|
||||||
/* Larger factors are generally better */
|
|
||||||
for (int i = 0; i < TX_MAX_SUB; i++)
|
|
||||||
max_factor = FFMAX(cd->factors[i], max_factor);
|
|
||||||
if (max_factor)
|
|
||||||
cd_matches[nb_cd_matches].prio += 16*max_factor;
|
|
||||||
|
|
||||||
nb_cd_matches++;
|
nb_cd_matches++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user