From 1b932eb1508f550fac9e911923a0383efda53aa3 Mon Sep 17 00:00:00 2001 From: James Almer Date: Sat, 22 Feb 2014 04:54:01 +0000 Subject: [PATCH] x86: add detection for FMA3 instruction set Based on x264 code Signed-off-by: James Almer --- configure | 5 +++++ libavutil/cpu.c | 3 +++ libavutil/cpu.h | 1 + libavutil/x86/cpu.c | 5 ++++- libavutil/x86/cpu.h | 3 +++ 5 files changed, 16 insertions(+), 1 deletion(-) diff --git a/configure b/configure index a0d991d893..53afde0ce2 100755 --- a/configure +++ b/configure @@ -271,6 +271,7 @@ Optimization options (experts only): --disable-sse42 disable SSE4.2 optimizations --disable-avx disable AVX optimizations --disable-xop disable XOP optimizations + --disable-fma3 disable FMA3 optimizations --disable-fma4 disable FMA4 optimizations --disable-avx2 disable AVX2 optimizations --disable-armv5te disable armv5te optimizations @@ -1254,6 +1255,7 @@ ARCH_EXT_LIST_X86=' avx2 cpunop xop + fma3 fma4 i686 mmx @@ -1578,6 +1580,7 @@ sse4_deps="ssse3" sse42_deps="sse4" avx_deps="sse42" xop_deps="avx" +fma3_deps="avx" fma4_deps="avx" avx2_deps="avx" @@ -3761,6 +3764,7 @@ EOF check_yasm "movbe ecx, [5]" && enable yasm || die "yasm/nasm not found or too old. Use --disable-yasm for a crippled build." check_yasm "vpmacsdd xmm0, xmm1, xmm2, xmm3" || disable xop_external + check_yasm "vfmadd132ps ymm0, ymm1, ymm2" || disable fma3_external check_yasm "vfmaddps ymm0, ymm1, ymm2, ymm3" || disable fma4_external check_yasm "CPU amdnop" || disable cpunop fi @@ -4294,6 +4298,7 @@ if enabled x86; then echo "SSSE3 enabled ${ssse3-no}" echo "AVX enabled ${avx-no}" echo "XOP enabled ${xop-no}" + echo "FMA3 enabled ${fma3-no}" echo "FMA4 enabled ${fma4-no}" echo "i686 features enabled ${i686-no}" echo "CMOV is fast ${fast_cmov-no}" diff --git a/libavutil/cpu.c b/libavutil/cpu.c index 8c2cfb87cc..972e4eb79f 100644 --- a/libavutil/cpu.c +++ b/libavutil/cpu.c @@ -87,6 +87,7 @@ int av_parse_cpu_flags(const char *s) #define CPUFLAG_SSE42 (AV_CPU_FLAG_SSE42 | CPUFLAG_SSE4) #define CPUFLAG_AVX (AV_CPU_FLAG_AVX | CPUFLAG_SSE42) #define CPUFLAG_XOP (AV_CPU_FLAG_XOP | CPUFLAG_AVX) +#define CPUFLAG_FMA3 (AV_CPU_FLAG_FMA3 | CPUFLAG_AVX) #define CPUFLAG_FMA4 (AV_CPU_FLAG_FMA4 | CPUFLAG_AVX) #define CPUFLAG_AVX2 (AV_CPU_FLAG_AVX2 | CPUFLAG_AVX) static const AVOption cpuflags_opts[] = { @@ -107,6 +108,7 @@ int av_parse_cpu_flags(const char *s) { "sse4.2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_SSE42 }, .unit = "flags" }, { "avx" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVX }, .unit = "flags" }, { "xop" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_XOP }, .unit = "flags" }, + { "fma3" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA3 }, .unit = "flags" }, { "fma4" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA4 }, .unit = "flags" }, { "avx2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVX2 }, .unit = "flags" }, { "3dnow" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_3DNOW }, .unit = "flags" }, @@ -204,6 +206,7 @@ static const struct { { AV_CPU_FLAG_SSE42, "sse4.2" }, { AV_CPU_FLAG_AVX, "avx" }, { AV_CPU_FLAG_XOP, "xop" }, + { AV_CPU_FLAG_FMA3, "fma3" }, { AV_CPU_FLAG_FMA4, "fma4" }, { AV_CPU_FLAG_3DNOW, "3dnow" }, { AV_CPU_FLAG_3DNOWEXT, "3dnowext" }, diff --git a/libavutil/cpu.h b/libavutil/cpu.h index 29036e3941..934b3be6a2 100644 --- a/libavutil/cpu.h +++ b/libavutil/cpu.h @@ -49,6 +49,7 @@ #define AV_CPU_FLAG_FMA4 0x0800 ///< Bulldozer FMA4 functions #define AV_CPU_FLAG_CMOV 0x1000 ///< i686 cmov #define AV_CPU_FLAG_AVX2 0x8000 ///< AVX2 functions: requires OS support even if YMM registers aren't used +#define AV_CPU_FLAG_FMA3 0x10000 ///< Haswell FMA3 functions #define AV_CPU_FLAG_ALTIVEC 0x0001 ///< standard diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c index 0e06d5deb2..95359eeb7b 100644 --- a/libavutil/x86/cpu.c +++ b/libavutil/x86/cpu.c @@ -131,8 +131,11 @@ int ff_get_cpu_flags_x86(void) if ((ecx & 0x18000000) == 0x18000000) { /* Check for OS support */ xgetbv(0, eax, edx); - if ((eax & 0x6) == 0x6) + if ((eax & 0x6) == 0x6) { rval |= AV_CPU_FLAG_AVX; + if (ecx & 0x00001000) + rval |= AV_CPU_FLAG_FMA3; + } } #if HAVE_AVX2 if (max_std_level >= 7) { diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h index 40daf445c7..50da30e389 100644 --- a/libavutil/x86/cpu.h +++ b/libavutil/x86/cpu.h @@ -38,6 +38,7 @@ #define X86_SSE42(flags) CPUEXT(flags, SSE42) #define X86_AVX(flags) CPUEXT(flags, AVX) #define X86_XOP(flags) CPUEXT(flags, XOP) +#define X86_FMA3(flags) CPUEXT(flags, FMA3) #define X86_FMA4(flags) CPUEXT(flags, FMA4) #define X86_AVX2(flags) CPUEXT(flags, AVX2) @@ -53,6 +54,7 @@ #define EXTERNAL_SSE42(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, SSE42) #define EXTERNAL_AVX(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX) #define EXTERNAL_XOP(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, XOP) +#define EXTERNAL_FMA3(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA3) #define EXTERNAL_FMA4(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, FMA4) #define EXTERNAL_AVX2(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2) @@ -68,6 +70,7 @@ #define INLINE_SSE42(flags) CPUEXT_SUFFIX(flags, _INLINE, SSE42) #define INLINE_AVX(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX) #define INLINE_XOP(flags) CPUEXT_SUFFIX(flags, _INLINE, XOP) +#define INLINE_FMA3(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA3) #define INLINE_FMA4(flags) CPUEXT_SUFFIX(flags, _INLINE, FMA4) #define INLINE_AVX2(flags) CPUEXT_SUFFIX(flags, _INLINE, AVX2)