diff --git a/common.mak b/common.mak index 5e43c7dd1b..4dee65d86e 100644 --- a/common.mak +++ b/common.mak @@ -11,7 +11,7 @@ ifndef V Q = @ ECHO = printf "$(1)\t%s\n" $(2) BRIEF = CC CXX AS YASM AR LD HOSTCC STRIP CP -SILENT = DEPCC YASMDEP RM RANLIB +SILENT = DEPCC DEPAS DEPHOSTCC YASMDEP RM RANLIB MSG = $@ M = @$(call ECHO,$(TAG),$@); $(foreach VAR,$(BRIEF), \ @@ -26,15 +26,16 @@ ALLFFLIBS = avcodec avdevice avfilter avformat avresample avutil postproc swscal IFLAGS := -I. -I$(SRC_PATH)/ CPPFLAGS := $(IFLAGS) $(CPPFLAGS) CFLAGS += $(ECFLAGS) -CCFLAGS = $(CFLAGS) +CCFLAGS = $(CPPFLAGS) $(CFLAGS) +ASFLAGS := $(CPPFLAGS) $(ASFLAGS) CXXFLAGS := $(CFLAGS) $(CXXFLAGS) YASMFLAGS += $(IFLAGS) -I$(SRC_PATH)/libavutil/x86/ -Pconfig.asm -HOSTCFLAGS += $(IFLAGS) +HOSTCCFLAGS = $(IFLAGS) $(HOSTCFLAGS) LDFLAGS := $(ALLFFLIBS:%=-Llib%) $(LDFLAGS) define COMPILE - $($(1)DEP) - $($(1)) $(CPPFLAGS) $($(1)FLAGS) $($(1)_DEPFLAGS) -c $($(1)_O) $< + $(call $(1)DEP,$(1)) + $($(1)) $($(1)FLAGS) $($(1)_DEPFLAGS) -c $($(1)_O) $< endef COMPILE_C = $(call COMPILE,CC) @@ -101,7 +102,7 @@ checkheaders: $(filter-out $(SKIPHEADERS:.h=.ho),$(ALLHEADERS:.h=.ho)) alltools: $(TOOLS) $(HOSTOBJS): %.o: %.c - $(HOSTCC) $(HOSTCFLAGS) -c -o $@ $< + $(call COMPILE,HOSTCC) $(HOSTPROGS): %$(HOSTEXESUF): %.o $(HOSTCC) $(HOSTLDFLAGS) -o $@ $< $(HOSTLIBS) @@ -117,4 +118,4 @@ CLEANSUFFIXES = *.d *.o *~ *.ho *.map *.ver *.gcno *.gcda DISTCLEANSUFFIXES = *.pc LIBSUFFIXES = *.a *.lib *.so *.so.* *.dylib *.dll *.def *.dll.a --include $(wildcard $(OBJS:.o=.d) $(TESTOBJS:.o=.d)) +-include $(wildcard $(OBJS:.o=.d) $(HOSTOBJS:.o=.d) $(TESTOBJS:.o=.d)) diff --git a/configure b/configure index 7363f9acdb..818daa6d98 100755 --- a/configure +++ b/configure @@ -265,6 +265,7 @@ Optimization options (experts only): --disable-sse disable SSE optimizations --disable-ssse3 disable SSSE3 optimizations --disable-avx disable AVX optimizations + --disable-fma4 disable FMA4 optimizations --disable-armv5te disable armv5te optimizations --disable-armv6 disable armv6 optimizations --disable-armv6t2 disable armv6t2 optimizations @@ -1173,6 +1174,7 @@ ARCH_EXT_LIST=' armv6t2 armvfp avx + fma4 mmi mmx mmx2 @@ -1336,7 +1338,7 @@ HAVE_LIST=" # options emitted with CONFIG_ prefix but not available on command line CONFIG_EXTRA=" - aandct + aandcttables avutil golomb gplv3 @@ -1450,6 +1452,7 @@ mmx2_deps="mmx" sse_deps="mmx" ssse3_deps="sse" avx_deps="ssse3" +fma4_deps="avx" aligned_stack_if_any="ppc x86" fast_64bit_if_any="alpha ia64 mips64 parisc64 ppc64 sparc64 x86_64" @@ -1477,7 +1480,7 @@ ac3_fixed_encoder_select="mdct ac3dsp" alac_encoder_select="lpc" amrnb_decoder_select="lsp" amrwb_decoder_select="lsp" -amv_encoder_select="aandct" +amv_encoder_select="aandcttables" atrac1_decoder_select="mdct sinewin" atrac3_decoder_select="mdct" binkaudio_dct_decoder_select="mdct rdft dct sinewin" @@ -1487,13 +1490,13 @@ cook_decoder_select="mdct sinewin" cscd_decoder_suggest="zlib" dca_decoder_select="mdct" dirac_decoder_select="dwt golomb" -dnxhd_encoder_select="aandct" +dnxhd_encoder_select="aandcttables" dxa_decoder_select="zlib" eac3_decoder_select="ac3_decoder" eac3_encoder_select="mdct ac3dsp" -eamad_decoder_select="aandct" -eatgq_decoder_select="aandct" -eatqi_decoder_select="aandct" +eamad_decoder_select="aandcttables" +eatgq_decoder_select="aandcttables" +eatqi_decoder_select="aandcttables" exr_decoder_select="zlib" ffv1_decoder_select="golomb" flac_decoder_select="golomb" @@ -1505,9 +1508,9 @@ flashsv2_decoder_select="zlib" flv_decoder_select="h263_decoder" flv_encoder_select="h263_encoder" fraps_decoder_select="huffman" -h261_encoder_select="aandct" +h261_encoder_select="aandcttables" h263_decoder_select="h263_parser" -h263_encoder_select="aandct" +h263_encoder_select="aandcttables" h263_vaapi_hwaccel_select="vaapi h263_decoder" h263i_decoder_select="h263_decoder" h263p_encoder_select="h263_encoder" @@ -1523,9 +1526,9 @@ iac_decoder_select="fft mdct sinewin" imc_decoder_select="fft mdct sinewin" jpegls_decoder_select="golomb" jpegls_encoder_select="golomb" -ljpeg_encoder_select="aandct" +ljpeg_encoder_select="aandcttables" loco_decoder_select="golomb" -mjpeg_encoder_select="aandct" +mjpeg_encoder_select="aandcttables" mlp_decoder_select="mlp_parser" mp1_decoder_select="mpegaudiodsp" mp1float_decoder_select="mpegaudiodsp" @@ -1544,13 +1547,13 @@ mpeg_xvmc_decoder_deps="X11_extensions_XvMClib_h" mpeg_xvmc_decoder_select="mpegvideo_decoder" mpeg1_vdpau_decoder_select="vdpau mpeg1video_decoder" mpeg1_vdpau_hwaccel_select="vdpau mpeg1video_decoder" -mpeg1video_encoder_select="aandct" +mpeg1video_encoder_select="aandcttables" mpeg2_crystalhd_decoder_select="crystalhd" mpeg2_dxva2_hwaccel_deps="dxva2api_h" mpeg2_dxva2_hwaccel_select="dxva2 mpeg2video_decoder" mpeg2_vdpau_hwaccel_select="vdpau mpeg2video_decoder" mpeg2_vaapi_hwaccel_select="vaapi mpeg2video_decoder" -mpeg2video_encoder_select="aandct" +mpeg2video_encoder_select="aandcttables" mpeg4_crystalhd_decoder_select="crystalhd" mpeg4_decoder_select="h263_decoder mpeg4video_parser" mpeg4_encoder_select="h263_encoder" @@ -1580,11 +1583,11 @@ rv40_decoder_select="golomb h264chroma h264pred h264qpel" shorten_decoder_select="golomb" sipr_decoder_select="lsp" snow_decoder_select="dwt" -snow_encoder_select="aandct dwt" +snow_encoder_select="aandcttables dwt" sonic_decoder_select="golomb" sonic_encoder_select="golomb" sonic_ls_encoder_select="golomb" -svq1_encoder_select="aandct" +svq1_encoder_select="aandcttables" svq3_decoder_select="golomb h264chroma h264dsp h264pred h264qpel" svq3_decoder_suggest="zlib" theora_decoder_select="vp3_decoder" @@ -1965,6 +1968,8 @@ ldflags_filter=echo AS_O='-o $@' CC_O='-o $@' CXX_O='-o $@' +LD_O='-o $@' +HOSTCC_O='-o $@' host_cflags='-D_ISOC99_SOURCE -D_XOPEN_SOURCE=600 -O3 -g' host_libs='-lm' @@ -1975,8 +1980,8 @@ target_path='$(CURDIR)' # since the object filename is not given with the -MM flag, the compiler # is only able to print the basename, and we must add the path ourselves -DEPEND_CMD='$(DEPCC) $(DEPFLAGS) $< | sed -e "/^\#.*/d" -e "s,^[[:space:]]*$(*F)\\.o,$(@D)/$(*F).o," > $(@:.o=.d)' -DEPFLAGS='$(CPPFLAGS) $(CFLAGS) -MM' +DEPCMD='$(DEP$(1)) $(DEP$(1)FLAGS) $($(1)DEP_FLAGS) $< | sed -e "/^\#.*/d" -e "s,^[[:space:]]*$(*F)\\.o,$(@D)/$(*F).o," > $(@:.o=.d)' +DEPFLAGS='-MM' # find source path if test -f configure; then @@ -2319,120 +2324,150 @@ tms470_flags(){ done } -if $cc -v 2>&1 | grep -q '^gcc.*LLVM'; then - cc_type=llvm_gcc - gcc_extra_ver=$(expr "$($cc --version | head -n1)" : '.*\((.*)\)') - cc_ident="llvm-gcc $($cc -dumpversion) $gcc_extra_ver" - CC_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@' - AS_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@' - cflags_speed='-O3' - cflags_size='-Os' -elif $cc -v 2>&1 | grep -qi ^gcc; then - cc_type=gcc - gcc_version=$($cc --version | head -n1) - gcc_basever=$($cc -dumpversion) - gcc_pkg_ver=$(expr "$gcc_version" : '[^ ]* \(([^)]*)\)') - gcc_ext_ver=$(expr "$gcc_version" : ".*$gcc_pkg_ver $gcc_basever \\(.*\\)") - cc_ident=$(cleanws "gcc $gcc_basever $gcc_pkg_ver $gcc_ext_ver") - if ! $cc -dumpversion | grep -q '^2\.'; then - CC_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@' - AS_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@' +probe_cc(){ + pfx=$1 + _cc=$2 + + unset _type _ident _cc_o _flags _cflags _ldflags _depflags _DEPCMD _DEPFLAGS + _flags_filter=echo + + if $_cc -v 2>&1 | grep -q '^gcc.*LLVM'; then + _type=llvm_gcc + gcc_extra_ver=$(expr "$($_cc --version | head -n1)" : '.*\((.*)\)') + _ident="llvm-gcc $($_cc -dumpversion) $gcc_extra_ver" + _depflags='-MMD -MF $(@:.o=.d) -MT $@' + _cflags_speed='-O3' + _cflags_size='-Os' + elif $_cc -v 2>&1 | grep -qi ^gcc; then + _type=gcc + gcc_version=$($_cc --version | head -n1) + gcc_basever=$($_cc -dumpversion) + gcc_pkg_ver=$(expr "$gcc_version" : '[^ ]* \(([^)]*)\)') + gcc_ext_ver=$(expr "$gcc_version" : ".*$gcc_pkg_ver $gcc_basever \\(.*\\)") + _ident=$(cleanws "gcc $gcc_basever $gcc_pkg_ver $gcc_ext_ver") + if ! $_cc -dumpversion | grep -q '^2\.'; then + _depflags='-MMD -MF $(@:.o=.d) -MT $@' + fi + _cflags_speed='-O3' + _cflags_size='-Os' + elif $_cc --version 2>/dev/null | grep -q Intel; then + _type=icc + _ident=$($_cc --version | head -n1) + _depflags='-MMD' + _cflags_speed='-O3' + _cflags_size='-Os' + _cflags_noopt='-O1' + elif $_cc -v 2>&1 | grep -q xlc; then + _type=xlc + _ident=$($_cc -qversion 2>/dev/null | head -n1) + _cflags_speed='-O5' + _cflags_size='-O5 -qcompact' + elif $_cc -V 2>/dev/null | grep -q Compaq; then + _type=ccc + _ident=$($_cc -V | head -n1 | cut -d' ' -f1-3) + _DEPFLAGS='-M' + debuglevel=3 + _ldflags='-Wl,-z,now' # calls to libots crash without this + _cflags_speed='-fast' + _cflags_size='-O1' + elif $_cc --vsn 2>/dev/null | grep -q "ARM C/C++ Compiler"; then + test -d "$sysroot" || die "No valid sysroot specified." + _type=armcc + _ident=$($_cc --vsn | head -n1) + armcc_conf="$PWD/armcc.conf" + $_cc --arm_linux_configure \ + --arm_linux_config_file="$armcc_conf" \ + --configure_sysroot="$sysroot" \ + --configure_cpp_headers="$sysinclude" >>$logfile 2>&1 || + die "Error creating armcc configuration file." + $_cc --vsn | grep -q RVCT && armcc_opt=rvct || armcc_opt=armcc + _flags="--arm_linux_config_file=$armcc_conf --translate_gcc" + as_default="${cross_prefix}gcc" + _depflags='-MMD' + _cflags_speed='-O3' + _cflags_size='-Os' + elif $_cc -version 2>/dev/null | grep -q TMS470; then + _type=tms470 + _ident=$($_cc -version | head -n1 | tr -s ' ') + _flags='--gcc --abi=eabi -me' + _cflags='-D__gnuc_va_list=va_list -D__USER_LABEL_PREFIX__=' + _cc_o='-fe=$@' + as_default="${cross_prefix}gcc" + ld_default="${cross_prefix}gcc" + _depflags='-ppa -ppd=$(@:.o=.d)' + _cflags_speed='-O3 -mf=5' + _cflags_size='-O3 -mf=2' + _flags_filter=tms470_flags + elif $_cc -v 2>&1 | grep -q clang; then + _type=clang + _ident=$($_cc --version | head -n1) + _depflags='-MMD' + _cflags_speed='-O3' + _cflags_size='-Os' + elif $_cc -V 2>&1 | grep -q Sun; then + _type=suncc + _ident=$($_cc -V 2>&1 | head -n1 | cut -d' ' -f 2-) + _DEPCMD='$(DEP$(1)) $(DEP$(1)FLAGS) $($(1)DEP_FLAGS) $< | sed -e "1s,^.*: ,$@: ," -e "\$$!s,\$$, \\\," -e "1!s,^.*: , ," > $(@:.o=.d)' + _DEPFLAGS='-xM1' + _ldflags='-std=c99' + _cflags_speed='-O5' + _cflags_size='-O5 -xspace' + _flags_filter=suncc_flags + elif $_cc -v 2>&1 | grep -q 'PathScale\|Path64'; then + _type=pathscale + _ident=$($_cc -v 2>&1 | head -n1 | tr -d :) + _depflags='-MMD -MF $(@:.o=.d) -MT $@' + _cflags_speed='-O2' + _cflags_size='-Os' + _flags_filter='filter_out -Wdisabled-optimization' + elif $_cc -v 2>&1 | grep -q Open64; then + _type=open64 + _ident=$($_cc -v 2>&1 | head -n1 | tr -d :) + _depflags='-MMD -MF $(@:.o=.d) -MT $@' + _cflags_speed='-O2' + _cflags_size='-Os' + _flags_filter='filter_out -Wdisabled-optimization|-Wtype-limits|-fno-signed-zeros' + elif $_cc -V 2>&1 | grep -q Portland; then + _type=pgi + _ident="PGI $($_cc -V 2>&1 | awk '/^pgcc/ { print $2; exit }')" + opt_common='-alias=ansi -Mlre -Mpre' + _cflags_speed="-O3 -Mautoinline -Munroll=c:4 $opt_common" + _cflags_size="-O2 -Munroll=c:1 $opt_common" + _cflags_noopt="-O1" + _flags_filter=pgi_flags fi - cflags_speed='-O3' - cflags_size='-Os' -elif $cc --version 2>/dev/null | grep -q Intel; then - cc_type=icc - cc_ident=$($cc --version | head -n1) - CC_DEPFLAGS='-MMD' - AS_DEPFLAGS='-MMD' - cflags_speed='-O3' - cflags_size='-Os' - cflags_noopt='-O1' -elif $cc -v 2>&1 | grep -q xlc; then - cc_type=xlc - cc_ident=$($cc -qversion 2>/dev/null | head -n1) - cflags_speed='-O5' - cflags_size='-O5 -qcompact' -elif $cc -V 2>/dev/null | grep -q Compaq; then - cc_type=ccc - cc_ident=$($cc -V | head -n1 | cut -d' ' -f1-3) - DEPFLAGS='$(CPPFLAGS) $(CFLAGS) -M' - debuglevel=3 - add_ldflags -Wl,-z,now # calls to libots crash without this - cflags_speed='-fast' - cflags_size='-O1' -elif $cc --vsn 2>/dev/null | grep -q "ARM C/C++ Compiler"; then - test -d "$sysroot" || die "No valid sysroot specified." - cc_type=armcc - cc_ident=$($cc --vsn | head -n1) - armcc_conf="$PWD/armcc.conf" - $cc --arm_linux_configure \ - --arm_linux_config_file="$armcc_conf" \ - --configure_sysroot="$sysroot" \ - --configure_cpp_headers="$sysinclude" >>$logfile 2>&1 || - die "Error creating armcc configuration file." - $cc --vsn | grep -q RVCT && armcc_opt=rvct || armcc_opt=armcc - cc="$cc --arm_linux_config_file=$armcc_conf --translate_gcc" - as_default="${cross_prefix}gcc" - CC_DEPFLAGS='-MMD' - AS_DEPFLAGS='-MMD' - cflags_speed='-O3' - cflags_size='-Os' - asflags_filter="filter_out -W${armcc_opt}*" -elif $cc -version 2>/dev/null | grep -q TMS470; then - cc_type=tms470 - cc_ident=$($cc -version | head -n1 | tr -s ' ') - cc="$cc --gcc --abi=eabi -me" - CC_O='-fe=$@' - as_default="${cross_prefix}gcc" - ld_default="${cross_prefix}gcc" - add_cflags -D__gnuc_va_list=va_list -D__USER_LABEL_PREFIX__= - CC_DEPFLAGS='-ppa -ppd=$(@:.o=.d)' - AS_DEPFLAGS='-MMD' - cflags_speed='-O3 -mf=5' - cflags_size='-O3 -mf=2' - cflags_filter=tms470_flags -elif $cc -v 2>&1 | grep -q clang; then - cc_type=clang - cc_ident=$($cc --version | head -n1) - CC_DEPFLAGS='-MMD' - AS_DEPFLAGS='-MMD' - cflags_speed='-O3' - cflags_size='-Os' -elif $cc -V 2>&1 | grep -q Sun; then - cc_type=suncc - cc_ident=$($cc -V 2>&1 | head -n1 | cut -d' ' -f 2-) - DEPEND_CMD='$(DEPCC) $(DEPFLAGS) $< | sed -e "1s,^.*: ,$@: ," -e "\$$!s,\$$, \\\," -e "1!s,^.*: , ," > $(@:.o=.d)' - DEPFLAGS='$(CPPFLAGS) $(CFLAGS) -xM1' - add_ldflags -xc99 - cflags_speed='-O5' - cflags_size='-O5 -xspace' - cflags_filter=suncc_flags -elif $cc -v 2>&1 | grep -q 'PathScale\|Path64'; then - cc_type=pathscale - cc_ident=$($cc -v 2>&1 | head -n1 | tr -d :) - CC_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@' - AS_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@' - cflags_speed='-O2' - cflags_size='-Os' - cflags_filter='filter_out -Wdisabled-optimization' -elif $cc -v 2>&1 | grep -q Open64; then - cc_type=open64 - cc_ident=$($cc -v 2>&1 | head -n1 | tr -d :) - CC_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@' - AS_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@' - cflags_speed='-O2' - cflags_size='-Os' - cflags_filter='filter_out -Wdisabled-optimization|-Wtype-limits|-fno-signed-zeros' -elif $cc -V 2>&1 | grep -q Portland; then - cc_type=pgi - cc_ident="PGI $($cc -V 2>&1 | awk '/^pgcc/ { print $2; exit }')" - opt_common='-alias=ansi -Mlre -Mpre' - cflags_speed="-O3 -Mautoinline -Munroll=c:4 $opt_common" - cflags_size="-O2 -Munroll=c:1 $opt_common" - cflags_noopt="-O1" - cflags_filter=pgi_flags -fi + + eval ${pfx}_type=\$_type + eval ${pfx}_ident=\$_ident +} + +set_ccvars(){ + eval ${1}_O=\${_cc_o-\${${1}_O}} + + if [ -n "$_depflags" ]; then + eval ${1}_DEPFLAGS=\$_depflags + else + eval ${1}DEP=\${_DEPCMD:-\$DEPCMD} + eval ${1}DEP_FLAGS=\${_DEPFLAGS:-\$DEPFLAGS} + eval DEP${1}FLAGS=\$_flags + fi +} + +probe_cc cc "$cc" +cflags_filter=$_flags_filter +cflags_speed=$_cflags_speed +cflags_size=$_cflags_size +cflags_noopt=$_cflags_noopt +add_cflags $_flags $_cflags +cc_ldflags=$_ldflags +set_ccvars CC + +probe_cc hostcc "$host_cc" +host_cflags_filter=$_flags_filter +host_ldflags_filter=$_flags_filter +add_host_cflags $_flags $_cflags +add_host_ldflags $_flags $_ldflags +set_ccvars HOSTCC test -n "$cc_type" && enable $cc_type || warn "Unknown C compiler $cc, unable to select optimal CFLAGS" @@ -2442,9 +2477,23 @@ test -n "$cc_type" && enable $cc_type || : ${ld_default:=$cc} set_default ar as dep_cc ld -test -n "$CC_DEPFLAGS" || CCDEP=$DEPEND_CMD -test -n "$CXX_DEPFLAGS" || CXXDEP=$DEPEND_CMD -test -n "$AS_DEPFLAGS" || ASDEP=$DEPEND_CMD +probe_cc as "$as" +asflags_filter=$_flags_filter +add_asflags $_flags $_cflags +set_ccvars AS + +probe_cc ld "$ld" +ldflags_filter=$_flags_filter +add_ldflags $_flags $_ldflags +test "$cc_type" != "$ld_type" && add_ldflags $cc_ldflags +LD_O=${_cc_o-$LD_O} + +if [ -z "$CC_DEPFLAGS" ] && [ "$dep_cc" != "$cc" ]; then + probe_cc depcc "$dep_cc" + CCDEP=${_DEPCMD:-$DEPCMD} + CCDEP_FLAGS=${_DEPFLAGS:=$DEPFLAGS} + DEPCCFLAGS=$_flags +fi add_cflags $extra_cflags add_cxxflags $extra_cxxflags @@ -3140,6 +3189,7 @@ EOF check_yasm "pextrd [eax], xmm0, 1" && enable yasm || die "yasm not found, use --disable-yasm for a crippled build" check_yasm "vextractf128 xmm0, ymm0, 0" || disable avx + check_yasm "vfmaddps ymm0, ymm1, ymm2, ymm3" || disable fma4 fi case "$cpu" in @@ -3673,6 +3723,7 @@ if enabled x86; then echo "SSE enabled ${sse-no}" echo "SSSE3 enabled ${ssse3-no}" echo "AVX enabled ${avx-no}" + echo "FMA4 enabled ${fma4-no}" echo "CMOV enabled ${cmov-no}" echo "CMOV is fast ${fast_cmov-no}" echo "EBX available ${ebx_available-no}" @@ -3814,6 +3865,9 @@ CXX=$cxx AS=$as LD=$ld DEPCC=$dep_cc +DEPCCFLAGS=$DEPCCFLAGS \$(CPPFLAGS) +DEPAS=$as +DEPASFLAGS=$DEPASFLAGS \$(CPPFLAGS) YASM=$yasmexe YASMDEP=$yasmexe AR=$ar @@ -3825,9 +3879,10 @@ CPPFLAGS=$CPPFLAGS CFLAGS=$CFLAGS CXXFLAGS=$CXXFLAGS ASFLAGS=$ASFLAGS -AS_O=$CC_O +AS_O=$AS_O CC_O=$CC_O CXX_O=$CXX_O +LD_O=$LD_O LDFLAGS=$LDFLAGS FFSERVERLDFLAGS=$FFSERVERLDFLAGS SHFLAGS=$SHFLAGS @@ -3842,10 +3897,11 @@ SLIBPREF=$SLIBPREF SLIBSUF=$SLIBSUF EXESUF=$EXESUF EXTRA_VERSION=$extra_version -DEPFLAGS=$DEPFLAGS CCDEP=$CCDEP CXXDEP=$CXXDEP +CCDEP_FLAGS=$CCDEP_FLAGS ASDEP=$ASDEP +ASDEP_FLAGS=$ASDEP_FLAGS CC_DEPFLAGS=$CC_DEPFLAGS AS_DEPFLAGS=$AS_DEPFLAGS HOSTCC=$host_cc @@ -3853,6 +3909,12 @@ HOSTCFLAGS=$host_cflags HOSTEXESUF=$HOSTEXESUF HOSTLDFLAGS=$host_ldflags HOSTLIBS=$host_libs +DEPHOSTCC=$host_cc +DEPHOSTCCFLAGS=$DEPHOSTCCFLAGS \$(HOSTCCFLAGS) +HOSTCCDEP=$HOSTCCDEP +HOSTCCDEP_FLAGS=$HOSTCCDEP_FLAGS +HOSTCC_DEPFLAGS=$HOSTCC_DEPFLAGS +HOSTCC_O=$HOSTCC_O TARGET_EXEC=$target_exec TARGET_PATH=$target_path SDL_LIBS=$sdl_libs diff --git a/doc/Makefile b/doc/Makefile index 0ee812176e..845f5f77c2 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -28,8 +28,6 @@ doc/%.txt: doc/%.texi $(Q)$(TEXIDEP) $(M)makeinfo --force --no-headers -o $@ $< 2>/dev/null -doc/print_options.o: libavformat/options_table.h libavcodec/options_table.h - GENTEXI = format codec GENTEXI := $(GENTEXI:%=doc/avoptions_%.texi) diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 041df92e33..b4138e88d9 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -32,7 +32,7 @@ OBJS = allcodecs.o \ utils.o \ # parts needed for many different codecs -OBJS-$(CONFIG_AANDCT) += aandcttab.o +OBJS-$(CONFIG_AANDCTTABLES) += aandcttab.o OBJS-$(CONFIG_AC3DSP) += ac3dsp.o OBJS-$(CONFIG_CRYSTALHD) += crystalhd.o OBJS-$(CONFIG_ENCODERS) += faandct.o jfdctfst.o jfdctint.o diff --git a/libavcodec/alac.c b/libavcodec/alac.c index 4fa328539c..2d98456f8b 100644 --- a/libavcodec/alac.c +++ b/libavcodec/alac.c @@ -200,6 +200,7 @@ static void lpc_prediction(int32_t *error_buffer, int32_t *buffer_out, int lpc_order, int lpc_quant) { int i; + int32_t *pred = buffer_out; /* first sample always copies */ *buffer_out = *error_buffer; @@ -223,37 +224,35 @@ static void lpc_prediction(int32_t *error_buffer, int32_t *buffer_out, } /* read warm-up samples */ - for (i = 0; i < lpc_order; i++) { - buffer_out[i + 1] = sign_extend(buffer_out[i] + error_buffer[i + 1], - bps); - } + for (i = 1; i <= lpc_order; i++) + buffer_out[i] = sign_extend(buffer_out[i - 1] + error_buffer[i], bps); /* NOTE: 4 and 8 are very common cases that could be optimized. */ - for (i = lpc_order; i < nb_samples - 1; i++) { + for (; i < nb_samples; i++) { int j; int val = 0; - int error_val = error_buffer[i + 1]; + int error_val = error_buffer[i]; int error_sign; - int d = buffer_out[i - lpc_order]; + int d = *pred++; /* LPC prediction */ for (j = 0; j < lpc_order; j++) - val += (buffer_out[i - j] - d) * lpc_coefs[j]; + val += (pred[j] - d) * lpc_coefs[j]; val = (val + (1 << (lpc_quant - 1))) >> lpc_quant; val += d + error_val; - buffer_out[i + 1] = sign_extend(val, bps); + buffer_out[i] = sign_extend(val, bps); /* adapt LPC coefficients */ error_sign = sign_only(error_val); if (error_sign) { - for (j = lpc_order - 1; j >= 0 && error_val * error_sign > 0; j--) { + for (j = 0; j < lpc_order && error_val * error_sign > 0; j++) { int sign; - val = d - buffer_out[i - j]; + val = d - pred[j]; sign = sign_only(val) * error_sign; lpc_coefs[j] -= sign; val *= sign; - error_val -= (val >> lpc_quant) * (lpc_order - j); + error_val -= (val >> lpc_quant) * (j + 1); } } } @@ -356,7 +355,7 @@ static int decode_element(AVCodecContext *avctx, void *data, int ch_index, lpc_order[ch] = get_bits(&alac->gb, 5); /* read the predictor table */ - for (i = 0; i < lpc_order[ch]; i++) + for (i = lpc_order[ch] - 1; i >= 0; i--) lpc_coefs[ch][i] = get_sbits(&alac->gb, 16); } @@ -477,16 +476,19 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data, ALACContext *alac = avctx->priv_data; enum RawDataBlockType element; int channels; - int ch, ret; + int ch, ret, got_end; init_get_bits(&alac->gb, avpkt->data, avpkt->size * 8); + got_end = 0; alac->nb_samples = 0; ch = 0; - while (get_bits_left(&alac->gb)) { + while (get_bits_left(&alac->gb) >= 3) { element = get_bits(&alac->gb, 3); - if (element == TYPE_END) + if (element == TYPE_END) { + got_end = 1; break; + } if (element > TYPE_CPE && element != TYPE_LFE) { av_log(avctx, AV_LOG_ERROR, "syntax element unsupported: %d", element); return AVERROR_PATCHWELCOME; @@ -501,11 +503,15 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data, ret = decode_element(avctx, data, alac_channel_layout_offsets[alac->channels - 1][ch], channels); - if (ret < 0) + if (ret < 0 && get_bits_left(&alac->gb)) return ret; ch += channels; } + if (!got_end) { + av_log(avctx, AV_LOG_ERROR, "no end tag found. incomplete packet.\n"); + return AVERROR_INVALIDDATA; + } if (avpkt->size * 8 - get_bits_count(&alac->gb) > 8) { av_log(avctx, AV_LOG_ERROR, "Error : %d bits left\n", diff --git a/libavcodec/tscc2.c b/libavcodec/tscc2.c index a8fd652e59..5e2fd02745 100644 --- a/libavcodec/tscc2.c +++ b/libavcodec/tscc2.c @@ -298,8 +298,8 @@ static int tscc2_decode_frame(AVCodecContext *avctx, void *data, if (!size) { int skip_row = 1, j, off = i * c->mb_width; for (j = 0; j < c->mb_width; j++) { - if (c->slice_quants[off + i] == 1 || - c->slice_quants[off + i] == 2) { + if (c->slice_quants[off + j] == 1 || + c->slice_quants[off + j] == 2) { skip_row = 0; break; } diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index d2e5439e61..5a6c3d1eae 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -1158,12 +1158,7 @@ ALIGN 16 add src1q, 2*mmsize sub lenq, 2*mmsize jge .loop -%if mmsize == 32 - vzeroupper - RET -%else REP_RET -%endif %endmacro INIT_XMM sse @@ -1193,12 +1188,7 @@ ALIGN 16 sub lenq, 2*mmsize jge .loop -%if mmsize == 32 - vzeroupper - RET -%else REP_RET -%endif %endmacro INIT_XMM sse @@ -1243,10 +1233,6 @@ cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len %endif add lenq, mmsize jl .loop -%if mmsize == 32 - vzeroupper - RET -%endif .end: REP_RET %endmacro diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm index 5bf0e2f259..5e5004ca97 100644 --- a/libavcodec/x86/fft_mmx.asm +++ b/libavcodec/x86/fft_mmx.asm @@ -750,9 +750,6 @@ section .text ; The others pass args in registers and don't spill anything. cglobal fft_dispatch%2, 2,5,8, z, nbits FFT_DISPATCH fullsuffix, nbits -%if mmsize == 32 - vzeroupper -%endif RET %endmacro ; DECL_FFT @@ -957,9 +954,6 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i %1 r0, r1, r6, rtcos, rtsin %if ARCH_X86_64 == 0 add esp, 12 -%endif -%if mmsize == 32 - vzeroupper %endif RET %endmacro diff --git a/libavcodec/x86/mpegaudiodec_mmx.c b/libavcodec/x86/mpegaudiodec_mmx.c index 939b441277..0d6cc08305 100644 --- a/libavcodec/x86/mpegaudiodec_mmx.c +++ b/libavcodec/x86/mpegaudiodec_mmx.c @@ -36,6 +36,8 @@ void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; +#if HAVE_INLINE_ASM + #define MACS(rt, ra, rb) rt+=(ra)*(rb) #define MLSS(rt, ra, rb) rt-=(ra)*(rb) @@ -178,6 +180,7 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out, *out = sum; } +#endif /* HAVE_INLINE_ASM */ #define DECL_IMDCT_BLOCKS(CPU1, CPU2) \ static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ @@ -241,9 +244,11 @@ void ff_mpadsp_init_mmx(MPADSPContext *s) } } +#if HAVE_INLINE_ASM if (mm_flags & AV_CPU_FLAG_SSE2) { s->apply_window_float = apply_window_mp3; } +#endif /* HAVE_INLINE_ASM */ #if HAVE_YASM if (0) { #if HAVE_AVX diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm index a09d871fda..01e5deec93 100644 --- a/libavcodec/x86/proresdsp.asm +++ b/libavcodec/x86/proresdsp.asm @@ -83,8 +83,7 @@ section .text align=16 ; %1 = row or col (for rounding variable) ; %2 = number of bits to shift at the end -; %3 = optimization -%macro IDCT_1D 3 +%macro IDCT_1D 2 ; a0 = (W4 * row[0]) + (1 << (15 - 1)); ; a1 = a0; ; a2 = a0; @@ -235,8 +234,8 @@ section .text align=16 ; void prores_idct_put_10_(uint8_t *pixels, int stride, ; DCTELEM *block, const int16_t *qmat); -%macro idct_put_fn 2 -cglobal prores_idct_put_10_%1, 4, 4, %2 +%macro idct_put_fn 1 +cglobal prores_idct_put_10, 4, 4, %1 movsxd r1, r1d pxor m15, m15 ; zero @@ -252,7 +251,7 @@ cglobal prores_idct_put_10_%1, 4, 4, %2 pmullw m13,[r3+64] pmullw m12,[r3+96] - IDCT_1D row, 15, %1 + IDCT_1D row, 15 ; transpose for second part of IDCT TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3 @@ -267,7 +266,7 @@ cglobal prores_idct_put_10_%1, 4, 4, %2 ; for (i = 0; i < 8; i++) ; idctSparseColAdd(dest + i, line_size, block + i); - IDCT_1D col, 18, %1 + IDCT_1D col, 18 ; clip/store mova m3, [pw_4] @@ -302,13 +301,27 @@ cglobal prores_idct_put_10_%1, 4, 4, %2 RET %endmacro -INIT_XMM -idct_put_fn sse2, 16 -INIT_XMM -idct_put_fn sse4, 16 +%macro SIGNEXTEND 2-3 ; dstlow, dsthigh, tmp +%if cpuflag(sse4) + movhlps %2, %1 + pmovsxwd %1, %1 + pmovsxwd %2, %2 +%else ; sse2 + pxor %3, %3 + pcmpgtw %3, %1 + mova %2, %1 + punpcklwd %1, %3 + punpckhwd %2, %3 +%endif +%endmacro + +INIT_XMM sse2 +idct_put_fn 16 +INIT_XMM sse4 +idct_put_fn 16 %if HAVE_AVX -INIT_AVX -idct_put_fn avx, 16 +INIT_XMM avx +idct_put_fn 16 %endif %endif diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c index 4b41529e3a..a5517fc5e0 100644 --- a/libavfilter/avfiltergraph.c +++ b/libavfilter/avfiltergraph.c @@ -578,11 +578,44 @@ static void swap_samplerates(AVFilterGraph *graph) swap_samplerates_on_filter(graph->filters[i]); } +#define CH_CENTER_PAIR (AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER) +#define CH_FRONT_PAIR (AV_CH_FRONT_LEFT | AV_CH_FRONT_RIGHT) +#define CH_STEREO_PAIR (AV_CH_STEREO_LEFT | AV_CH_STEREO_RIGHT) +#define CH_WIDE_PAIR (AV_CH_WIDE_LEFT | AV_CH_WIDE_RIGHT) +#define CH_SIDE_PAIR (AV_CH_SIDE_LEFT | AV_CH_SIDE_RIGHT) +#define CH_DIRECT_PAIR (AV_CH_SURROUND_DIRECT_LEFT | AV_CH_SURROUND_DIRECT_RIGHT) +#define CH_BACK_PAIR (AV_CH_BACK_LEFT | AV_CH_BACK_RIGHT) + +/* allowable substitutions for channel pairs when comparing layouts, + * ordered by priority for both values */ +static const uint64_t ch_subst[][2] = { + { CH_FRONT_PAIR, CH_CENTER_PAIR }, + { CH_FRONT_PAIR, CH_WIDE_PAIR }, + { CH_FRONT_PAIR, AV_CH_FRONT_CENTER }, + { CH_CENTER_PAIR, CH_FRONT_PAIR }, + { CH_CENTER_PAIR, CH_WIDE_PAIR }, + { CH_CENTER_PAIR, AV_CH_FRONT_CENTER }, + { CH_WIDE_PAIR, CH_FRONT_PAIR }, + { CH_WIDE_PAIR, CH_CENTER_PAIR }, + { CH_WIDE_PAIR, AV_CH_FRONT_CENTER }, + { AV_CH_FRONT_CENTER, CH_FRONT_PAIR }, + { AV_CH_FRONT_CENTER, CH_CENTER_PAIR }, + { AV_CH_FRONT_CENTER, CH_WIDE_PAIR }, + { CH_SIDE_PAIR, CH_DIRECT_PAIR }, + { CH_SIDE_PAIR, CH_BACK_PAIR }, + { CH_SIDE_PAIR, AV_CH_BACK_CENTER }, + { CH_BACK_PAIR, CH_DIRECT_PAIR }, + { CH_BACK_PAIR, CH_SIDE_PAIR }, + { CH_BACK_PAIR, AV_CH_BACK_CENTER }, + { AV_CH_BACK_CENTER, CH_BACK_PAIR }, + { AV_CH_BACK_CENTER, CH_DIRECT_PAIR }, + { AV_CH_BACK_CENTER, CH_SIDE_PAIR }, +}; + static void swap_channel_layouts_on_filter(AVFilterContext *filter) { AVFilterLink *link = NULL; - uint64_t chlayout; - int i, j; + int i, j, k; for (i = 0; i < filter->nb_inputs; i++) { link = filter->inputs[i]; @@ -594,27 +627,55 @@ static void swap_channel_layouts_on_filter(AVFilterContext *filter) if (i == filter->nb_inputs) return; - chlayout = link->out_channel_layouts->channel_layouts[0]; - for (i = 0; i < filter->nb_outputs; i++) { AVFilterLink *outlink = filter->outputs[i]; - int best_idx, best_score = INT_MIN; + int best_idx, best_score = INT_MIN, best_count_diff = INT_MAX; if (outlink->type != AVMEDIA_TYPE_AUDIO || outlink->in_channel_layouts->nb_channel_layouts < 2) continue; for (j = 0; j < outlink->in_channel_layouts->nb_channel_layouts; j++) { + uint64_t in_chlayout = link->out_channel_layouts->channel_layouts[0]; uint64_t out_chlayout = outlink->in_channel_layouts->channel_layouts[j]; - int matched_channels = av_get_channel_layout_nb_channels(chlayout & - out_chlayout); - int extra_channels = av_get_channel_layout_nb_channels(out_chlayout & - (~chlayout)); - int score = matched_channels - extra_channels; + int in_channels = av_get_channel_layout_nb_channels(in_chlayout); + int out_channels = av_get_channel_layout_nb_channels(out_chlayout); + int count_diff = out_channels - in_channels; + int matched_channels, extra_channels; + int score = 0; - if (score > best_score) { + /* channel substitution */ + for (k = 0; k < FF_ARRAY_ELEMS(ch_subst); k++) { + uint64_t cmp0 = ch_subst[k][0]; + uint64_t cmp1 = ch_subst[k][1]; + if (( in_chlayout & cmp0) && (!(out_chlayout & cmp0)) && + (out_chlayout & cmp1) && (!( in_chlayout & cmp1))) { + in_chlayout &= ~cmp0; + out_chlayout &= ~cmp1; + /* add score for channel match, minus a deduction for + having to do the substitution */ + score += 10 * av_get_channel_layout_nb_channels(cmp1) - 2; + } + } + + /* no penalty for LFE channel mismatch */ + if ( (in_chlayout & AV_CH_LOW_FREQUENCY) && + (out_chlayout & AV_CH_LOW_FREQUENCY)) + score += 10; + in_chlayout &= ~AV_CH_LOW_FREQUENCY; + out_chlayout &= ~AV_CH_LOW_FREQUENCY; + + matched_channels = av_get_channel_layout_nb_channels(in_chlayout & + out_chlayout); + extra_channels = av_get_channel_layout_nb_channels(out_chlayout & + (~in_chlayout)); + score += 10 * matched_channels - 5 * extra_channels; + + if (score > best_score || + (count_diff < best_count_diff && score == best_score)) { best_score = score; best_idx = j; + best_count_diff = count_diff; } } FFSWAP(uint64_t, outlink->in_channel_layouts->channel_layouts[0], diff --git a/libavformat/rtmpproto.c b/libavformat/rtmpproto.c index 31a8639215..f68b7e65a1 100644 --- a/libavformat/rtmpproto.c +++ b/libavformat/rtmpproto.c @@ -515,6 +515,12 @@ static int gen_pong(URLContext *s, RTMPContext *rt, RTMPPacket *ppkt) uint8_t *p; int ret; + if (ppkt->data_size < 6) { + av_log(s, AV_LOG_ERROR, "Too short ping packet (%d)\n", + ppkt->data_size); + return AVERROR_INVALIDDATA; + } + if ((ret = ff_rtmp_packet_create(&pkt, RTMP_NETWORK_CHANNEL, RTMP_PT_PING, ppkt->timestamp + 1, 6)) < 0) return ret; @@ -885,9 +891,9 @@ static int handle_chunk_size(URLContext *s, RTMPPacket *pkt) RTMPContext *rt = s->priv_data; int ret; - if (pkt->data_size != 4) { + if (pkt->data_size < 4) { av_log(s, AV_LOG_ERROR, - "Chunk size change packet is not 4 bytes long (%d)\n", + "Too short chunk size change packet (%d)\n", pkt->data_size); return AVERROR_INVALIDDATA; } @@ -913,6 +919,12 @@ static int handle_ping(URLContext *s, RTMPPacket *pkt) RTMPContext *rt = s->priv_data; int t, ret; + if (pkt->data_size < 2) { + av_log(s, AV_LOG_ERROR, "Too short ping packet (%d)\n", + pkt->data_size); + return AVERROR_INVALIDDATA; + } + t = AV_RB16(pkt->data); if (t == 6) { if ((ret = gen_pong(s, rt, pkt)) < 0) @@ -950,6 +962,13 @@ static int handle_server_bw(URLContext *s, RTMPPacket *pkt) { RTMPContext *rt = s->priv_data; + if (pkt->data_size < 4) { + av_log(s, AV_LOG_ERROR, + "Too short server bandwidth report packet (%d)\n", + pkt->data_size); + return AVERROR_INVALIDDATA; + } + rt->server_bw = AV_RB32(pkt->data); if (rt->server_bw <= 0) { av_log(s, AV_LOG_ERROR, "Incorrect server bandwidth %d\n", diff --git a/libavresample/utils.c b/libavresample/utils.c index caf9081e5d..05ee65c68d 100644 --- a/libavresample/utils.c +++ b/libavresample/utils.c @@ -246,9 +246,10 @@ static int handle_buffered_output(AVAudioResampleContext *avr, return 0; } -int avresample_convert(AVAudioResampleContext *avr, void **output, - int out_plane_size, int out_samples, void **input, - int in_plane_size, int in_samples) +int attribute_align_arg avresample_convert(AVAudioResampleContext *avr, + void **output, int out_plane_size, + int out_samples, void **input, + int in_plane_size, int in_samples) { AudioData input_buffer; AudioData output_buffer; diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm index 7b3cc223c7..244c4d1b08 100644 --- a/libavresample/x86/audio_convert.asm +++ b/libavresample/x86/audio_convert.asm @@ -145,12 +145,7 @@ cglobal conv_s32_to_flt, 3,3,3, dst, src, len mova [dstq+lenq+mmsize], m2 add lenq, mmsize*2 jl .loop -%if mmsize == 32 - vzeroupper - RET -%else REP_RET -%endif %endmacro INIT_XMM sse2 @@ -218,12 +213,7 @@ cglobal conv_flt_to_s32, 3,3,5, dst, src, len mova [dstq+lenq+3*mmsize], m3 add lenq, mmsize*4 jl .loop -%if mmsize == 32 - vzeroupper - RET -%else REP_RET -%endif %endmacro INIT_XMM sse2 diff --git a/libavresample/x86/audio_mix.asm b/libavresample/x86/audio_mix.asm index 58a4ded8c6..8aeac8b242 100644 --- a/libavresample/x86/audio_mix.asm +++ b/libavresample/x86/audio_mix.asm @@ -51,12 +51,7 @@ cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1 add srcq, mmsize*2 sub lend, mmsize*2/4 jg .loop -%if mmsize == 32 - vzeroupper - RET -%else REP_RET -%endif %endmacro INIT_XMM sse @@ -175,12 +170,7 @@ cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1 add src0q, mmsize sub lend, mmsize/4 jg .loop -%if mmsize == 32 - vzeroupper - RET -%else REP_RET -%endif %endmacro INIT_XMM sse @@ -236,3 +226,296 @@ MIX_1_TO_2_S16P_FLT INIT_XMM avx MIX_1_TO_2_S16P_FLT %endif + +;----------------------------------------------------------------------------- +; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix, +; int len, int out_ch, int in_ch); +;----------------------------------------------------------------------------- + +%macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp +; define some names to make the code clearer +%assign in_channels %1 +%assign out_channels %2 +%assign stereo out_channels - 1 +%ifidn %3, s16p + %assign is_s16 1 +%else + %assign is_s16 0 +%endif + +; determine how many matrix elements must go on the stack vs. mmregs +%assign matrix_elements in_channels * out_channels +%if is_s16 + %if stereo + %assign needed_mmregs 7 + %else + %assign needed_mmregs 5 + %endif +%else + %if stereo + %assign needed_mmregs 4 + %else + %assign needed_mmregs 3 + %endif +%endif +%assign matrix_elements_mm num_mmregs - needed_mmregs +%if matrix_elements < matrix_elements_mm + %assign matrix_elements_mm matrix_elements +%endif +%if matrix_elements_mm < matrix_elements + %assign matrix_elements_stack matrix_elements - matrix_elements_mm +%else + %assign matrix_elements_stack 0 +%endif + +cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, src0, src1, len, src2, src3, src4, src5, src6, src7 + +; get aligned stack space if needed +%if matrix_elements_stack > 0 + %if mmsize == 32 + %assign bkpreg %1 + 1 + %define bkpq r %+ bkpreg %+ q + mov bkpq, rsp + and rsp, ~(mmsize-1) + sub rsp, matrix_elements_stack * mmsize + %else + %assign pad matrix_elements_stack * mmsize + (mmsize - gprsize) - (stack_offset & (mmsize - gprsize)) + SUB rsp, pad + %endif +%endif + +; load matrix pointers +%define matrix0q r1q +%define matrix1q r3q +%if stereo + mov matrix1q, [matrix0q+gprsize] +%endif + mov matrix0q, [matrix0q] + +; define matrix coeff names +%assign %%i 0 +%assign %%j needed_mmregs +%rep in_channels + %if %%i >= matrix_elements_mm + CAT_XDEFINE mx_stack_0_, %%i, 1 + CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize] + %else + CAT_XDEFINE mx_stack_0_, %%i, 0 + CAT_XDEFINE mx_0_, %%i, m %+ %%j + %assign %%j %%j+1 + %endif + %assign %%i %%i+1 +%endrep +%if stereo +%assign %%i 0 +%rep in_channels + %if in_channels + %%i >= matrix_elements_mm + CAT_XDEFINE mx_stack_1_, %%i, 1 + CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize] + %else + CAT_XDEFINE mx_stack_1_, %%i, 0 + CAT_XDEFINE mx_1_, %%i, m %+ %%j + %assign %%j %%j+1 + %endif + %assign %%i %%i+1 +%endrep +%endif + +; load/splat matrix coeffs +%assign %%i 0 +%rep in_channels + %if mx_stack_0_ %+ %%i + VBROADCASTSS m0, [matrix0q+4*%%i] + mova mx_0_ %+ %%i, m0 + %else + VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i] + %endif + %if stereo + %if mx_stack_1_ %+ %%i + VBROADCASTSS m0, [matrix1q+4*%%i] + mova mx_1_ %+ %%i, m0 + %else + VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i] + %endif + %endif + %assign %%i %%i+1 +%endrep + +; load channel pointers to registers as offsets from the first channel pointer +%if ARCH_X86_64 + movsxd lenq, r2d +%endif + shl lenq, 2-is_s16 +%assign %%i 1 +%rep (in_channels - 1) + %if ARCH_X86_32 && in_channels >= 7 && %%i >= 5 + mov src5q, [src0q+%%i*gprsize] + add src5q, lenq + mov src %+ %%i %+ m, src5q + %else + mov src %+ %%i %+ q, [src0q+%%i*gprsize] + add src %+ %%i %+ q, lenq + %endif + %assign %%i %%i+1 +%endrep + mov src0q, [src0q] + add src0q, lenq + neg lenq +.loop +; for x86-32 with 7-8 channels we do not have enough gp registers for all src +; pointers, so we have to load some of them from the stack each time +%define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5 +%if is_s16 + ; mix with s16p input + mova m0, [src0q+lenq] + S16_TO_S32_SX 0, 1 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + %if stereo + mulps m2, m0, mx_1_0 + mulps m3, m1, mx_1_0 + %endif + mulps m0, m0, mx_0_0 + mulps m1, m1, mx_0_0 +%assign %%i 1 +%rep (in_channels - 1) + %if copy_src_from_stack + %define src_ptr src5q + %else + %define src_ptr src %+ %%i %+ q + %endif + %if stereo + %if copy_src_from_stack + mov src_ptr, src %+ %%i %+ m + %endif + mova m4, [src_ptr+lenq] + S16_TO_S32_SX 4, 5 + cvtdq2ps m4, m4 + cvtdq2ps m5, m5 + fmaddps m2, m4, mx_1_ %+ %%i, m2, m6 + fmaddps m3, m5, mx_1_ %+ %%i, m3, m6 + fmaddps m0, m4, mx_0_ %+ %%i, m0, m4 + fmaddps m1, m5, mx_0_ %+ %%i, m1, m5 + %else + %if copy_src_from_stack + mov src_ptr, src %+ %%i %+ m + %endif + mova m2, [src_ptr+lenq] + S16_TO_S32_SX 2, 3 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + fmaddps m0, m2, mx_0_ %+ %%i, m0, m4 + fmaddps m1, m3, mx_0_ %+ %%i, m1, m4 + %endif + %assign %%i %%i+1 +%endrep + %if stereo + cvtps2dq m2, m2 + cvtps2dq m3, m3 + packssdw m2, m3 + mova [src1q+lenq], m2 + %endif + cvtps2dq m0, m0 + cvtps2dq m1, m1 + packssdw m0, m1 + mova [src0q+lenq], m0 +%else + ; mix with fltp input + %if stereo || mx_stack_0_0 + mova m0, [src0q+lenq] + %endif + %if stereo + mulps m1, m0, mx_1_0 + %endif + %if stereo || mx_stack_0_0 + mulps m0, m0, mx_0_0 + %else + mulps m0, [src0q+lenq], mx_0_0 + %endif +%assign %%i 1 +%rep (in_channels - 1) + %if copy_src_from_stack + %define src_ptr src5q + mov src_ptr, src %+ %%i %+ m + %else + %define src_ptr src %+ %%i %+ q + %endif + ; avoid extra load for mono if matrix is in a mm register + %if stereo || mx_stack_0_ %+ %%i + mova m2, [src_ptr+lenq] + %endif + %if stereo + fmaddps m1, m2, mx_1_ %+ %%i, m1, m3 + %endif + %if stereo || mx_stack_0_ %+ %%i + fmaddps m0, m2, mx_0_ %+ %%i, m0, m2 + %else + fmaddps m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1 + %endif + %assign %%i %%i+1 +%endrep + mova [src0q+lenq], m0 + %if stereo + mova [src1q+lenq], m1 + %endif +%endif + + add lenq, mmsize + jl .loop +; restore stack pointer +%if matrix_elements_stack > 0 + %if mmsize == 32 + mov rsp, bkpq + %else + ADD rsp, pad + %endif +%endif +; zero ymm high halves +%if mmsize == 32 + vzeroupper +%endif + RET +%endmacro + +%macro MIX_3_8_TO_1_2_FLT_FUNCS 0 +%assign %%i 3 +%rep 6 + INIT_XMM sse + MIX_3_8_TO_1_2_FLT %%i, 1, fltp + MIX_3_8_TO_1_2_FLT %%i, 2, fltp + INIT_XMM sse2 + MIX_3_8_TO_1_2_FLT %%i, 1, s16p + MIX_3_8_TO_1_2_FLT %%i, 2, s16p + INIT_XMM sse4 + MIX_3_8_TO_1_2_FLT %%i, 1, s16p + MIX_3_8_TO_1_2_FLT %%i, 2, s16p + ; do not use ymm AVX or FMA4 in x86-32 for 6 or more channels due to stack alignment issues + %if HAVE_AVX + %if ARCH_X86_64 || %%i < 6 + INIT_YMM avx + %else + INIT_XMM avx + %endif + MIX_3_8_TO_1_2_FLT %%i, 1, fltp + MIX_3_8_TO_1_2_FLT %%i, 2, fltp + INIT_XMM avx + MIX_3_8_TO_1_2_FLT %%i, 1, s16p + MIX_3_8_TO_1_2_FLT %%i, 2, s16p + %endif + %if HAVE_FMA4 + %if ARCH_X86_64 || %%i < 6 + INIT_YMM fma4 + %else + INIT_XMM fma4 + %endif + MIX_3_8_TO_1_2_FLT %%i, 1, fltp + MIX_3_8_TO_1_2_FLT %%i, 2, fltp + INIT_XMM fma4 + MIX_3_8_TO_1_2_FLT %%i, 1, s16p + MIX_3_8_TO_1_2_FLT %%i, 2, s16p + %endif + %assign %%i %%i+1 +%endrep +%endmacro + +MIX_3_8_TO_1_2_FLT_FUNCS diff --git a/libavresample/x86/audio_mix_init.c b/libavresample/x86/audio_mix_init.c index b8f3a90eef..de4c148170 100644 --- a/libavresample/x86/audio_mix_init.c +++ b/libavresample/x86/audio_mix_init.c @@ -47,6 +47,129 @@ extern void ff_mix_1_to_2_s16p_flt_sse4(int16_t **src, float **matrix, int len, extern void ff_mix_1_to_2_s16p_flt_avx (int16_t **src, float **matrix, int len, int out_ch, int in_ch); +#define DEFINE_MIX_3_8_TO_1_2(chan) \ +extern void ff_mix_ ## chan ## _to_1_fltp_flt_sse(float **src, \ + float **matrix, int len, \ + int out_ch, int in_ch); \ +extern void ff_mix_ ## chan ## _to_2_fltp_flt_sse(float **src, \ + float **matrix, int len, \ + int out_ch, int in_ch); \ + \ +extern void ff_mix_ ## chan ## _to_1_s16p_flt_sse2(int16_t **src, \ + float **matrix, int len, \ + int out_ch, int in_ch); \ +extern void ff_mix_ ## chan ## _to_2_s16p_flt_sse2(int16_t **src, \ + float **matrix, int len, \ + int out_ch, int in_ch); \ + \ +extern void ff_mix_ ## chan ## _to_1_s16p_flt_sse4(int16_t **src, \ + float **matrix, int len, \ + int out_ch, int in_ch); \ +extern void ff_mix_ ## chan ## _to_2_s16p_flt_sse4(int16_t **src, \ + float **matrix, int len, \ + int out_ch, int in_ch); \ + \ +extern void ff_mix_ ## chan ## _to_1_fltp_flt_avx(float **src, \ + float **matrix, int len, \ + int out_ch, int in_ch); \ +extern void ff_mix_ ## chan ## _to_2_fltp_flt_avx(float **src, \ + float **matrix, int len, \ + int out_ch, int in_ch); \ + \ +extern void ff_mix_ ## chan ## _to_1_s16p_flt_avx(int16_t **src, \ + float **matrix, int len, \ + int out_ch, int in_ch); \ +extern void ff_mix_ ## chan ## _to_2_s16p_flt_avx(int16_t **src, \ + float **matrix, int len, \ + int out_ch, int in_ch); \ + \ +extern void ff_mix_ ## chan ## _to_1_fltp_flt_fma4(float **src, \ + float **matrix, int len, \ + int out_ch, int in_ch); \ +extern void ff_mix_ ## chan ## _to_2_fltp_flt_fma4(float **src, \ + float **matrix, int len, \ + int out_ch, int in_ch); \ + \ +extern void ff_mix_ ## chan ## _to_1_s16p_flt_fma4(int16_t **src, \ + float **matrix, int len, \ + int out_ch, int in_ch); \ +extern void ff_mix_ ## chan ## _to_2_s16p_flt_fma4(int16_t **src, \ + float **matrix, int len, \ + int out_ch, int in_ch); + +DEFINE_MIX_3_8_TO_1_2(3) +DEFINE_MIX_3_8_TO_1_2(4) +DEFINE_MIX_3_8_TO_1_2(5) +DEFINE_MIX_3_8_TO_1_2(6) +DEFINE_MIX_3_8_TO_1_2(7) +DEFINE_MIX_3_8_TO_1_2(8) + +#define SET_MIX_3_8_TO_1_2(chan) \ + if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) { \ + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\ + chan, 1, 16, 4, "SSE", \ + ff_mix_ ## chan ## _to_1_fltp_flt_sse); \ + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\ + chan, 2, 16, 4, "SSE", \ + ff_mix_## chan ##_to_2_fltp_flt_sse); \ + } \ + if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) { \ + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\ + chan, 1, 16, 8, "SSE2", \ + ff_mix_ ## chan ## _to_1_s16p_flt_sse2); \ + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\ + chan, 2, 16, 8, "SSE2", \ + ff_mix_ ## chan ## _to_2_s16p_flt_sse2); \ + } \ + if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) { \ + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\ + chan, 1, 16, 8, "SSE4", \ + ff_mix_ ## chan ## _to_1_s16p_flt_sse4); \ + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\ + chan, 2, 16, 8, "SSE4", \ + ff_mix_ ## chan ## _to_2_s16p_flt_sse4); \ + } \ + if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) { \ + int ptr_align = 32; \ + int smp_align = 8; \ + if (ARCH_X86_32 || chan >= 6) { \ + ptr_align = 16; \ + smp_align = 4; \ + } \ + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\ + chan, 1, ptr_align, smp_align, "AVX", \ + ff_mix_ ## chan ## _to_1_fltp_flt_avx); \ + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\ + chan, 2, ptr_align, smp_align, "AVX", \ + ff_mix_ ## chan ## _to_2_fltp_flt_avx); \ + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\ + chan, 1, 16, 8, "AVX", \ + ff_mix_ ## chan ## _to_1_s16p_flt_avx); \ + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\ + chan, 2, 16, 8, "AVX", \ + ff_mix_ ## chan ## _to_2_s16p_flt_avx); \ + } \ + if (mm_flags & AV_CPU_FLAG_FMA4 && HAVE_FMA4) { \ + int ptr_align = 32; \ + int smp_align = 8; \ + if (ARCH_X86_32 || chan >= 6) { \ + ptr_align = 16; \ + smp_align = 4; \ + } \ + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\ + chan, 1, ptr_align, smp_align, "FMA4", \ + ff_mix_ ## chan ## _to_1_fltp_flt_fma4); \ + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\ + chan, 2, ptr_align, smp_align, "FMA4", \ + ff_mix_ ## chan ## _to_2_fltp_flt_fma4); \ + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\ + chan, 1, 16, 8, "FMA4", \ + ff_mix_ ## chan ## _to_1_s16p_flt_fma4); \ + ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\ + chan, 2, 16, 8, "FMA4", \ + ff_mix_ ## chan ## _to_2_s16p_flt_fma4); \ + } + av_cold void ff_audio_mix_init_x86(AudioMix *am) { #if HAVE_YASM @@ -80,5 +203,12 @@ av_cold void ff_audio_mix_init_x86(AudioMix *am) ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT, 1, 2, 16, 8, "AVX", ff_mix_1_to_2_s16p_flt_avx); } + + SET_MIX_3_8_TO_1_2(3) + SET_MIX_3_8_TO_1_2(4) + SET_MIX_3_8_TO_1_2(5) + SET_MIX_3_8_TO_1_2(6) + SET_MIX_3_8_TO_1_2(7) + SET_MIX_3_8_TO_1_2(8) #endif } diff --git a/libavresample/x86/util.asm b/libavresample/x86/util.asm index 501f662d43..ca7fde513a 100644 --- a/libavresample/x86/util.asm +++ b/libavresample/x86/util.asm @@ -26,7 +26,8 @@ pmovsxwd m%1, m%1 SWAP %1, %2 %else - punpckhwd m%2, m%1 + mova m%2, m%1 + punpckhwd m%2, m%2 punpcklwd m%1, m%1 psrad m%2, 16 psrad m%1, 16 diff --git a/libavutil/eval.c b/libavutil/eval.c index fa76c6c949..6aa257efc4 100644 --- a/libavutil/eval.c +++ b/libavutil/eval.c @@ -797,11 +797,10 @@ int main(int argc, char **argv) av_expr_parse_and_eval(&d, *expr, const_names, const_values, NULL, NULL, NULL, NULL, NULL, 0, NULL); - if(isnan(d)){ + if (isnan(d)) printf("'%s' -> nan\n\n", *expr); - }else{ + else printf("'%s' -> %f\n\n", *expr, d); - } } av_expr_parse_and_eval(&d, "1+(5-2)^(3-1)+1/2+sin(PI)-max(-2.2,-3.1)", diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index f68e0bfe2d..7a18a20aca 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -42,12 +42,7 @@ ALIGN 16 sub lenq, 2*mmsize jge .loop -%if mmsize == 32 - vzeroupper - RET -%else REP_RET -%endif %endmacro INIT_XMM sse @@ -88,12 +83,7 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len mova [dstq+lenq+mmsize], m2 sub lenq, 2*mmsize jge .loop -%if mmsize == 32 - vzeroupper - RET -%else REP_RET -%endif %endmacro INIT_XMM sse diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index 9a39df6ec8..c80e0a1c1a 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -392,11 +392,14 @@ DECLARE_REG 14, R15, R15D, R15W, R15B, 120 %macro RET 0 WIN64_RESTORE_XMM_INTERNAL rsp POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 +%if mmsize == 32 + vzeroupper +%endif ret %endmacro %macro REP_RET 0 - %if regs_used > 7 || xmm_regs_used > 6 + %if regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 RET %else rep ret @@ -433,11 +436,14 @@ DECLARE_REG 14, R15, R15D, R15W, R15B, 72 %macro RET 0 POP_IF_USED 14, 13, 12, 11, 10, 9 +%if mmsize == 32 + vzeroupper +%endif ret %endmacro %macro REP_RET 0 - %if regs_used > 9 + %if regs_used > 9 || mmsize == 32 RET %else rep ret @@ -479,11 +485,14 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %macro RET 0 POP_IF_USED 6, 5, 4, 3 +%if mmsize == 32 + vzeroupper +%endif ret %endmacro %macro REP_RET 0 - %if regs_used > 3 + %if regs_used > 3 || mmsize == 32 RET %else rep ret @@ -1126,16 +1135,22 @@ AVX_INSTR pfmul, 1, 0, 1 %undef j %macro FMA_INSTR 3 - %macro %1 4-7 %1, %2, %3 - %if cpuflag(xop) - v%5 %1, %2, %3, %4 + %macro %1 5-8 %1, %2, %3 + %if cpuflag(xop) || cpuflag(fma4) + v%6 %1, %2, %3, %4 %else - %6 %1, %2, %3 - %7 %1, %4 + %ifidn %1, %4 + %7 %5, %2, %3 + %8 %1, %4, %5 + %else + %7 %1, %2, %3 + %8 %1, %4 + %endif %endif %endmacro %endmacro +FMA_INSTR fmaddps, mulps, addps FMA_INSTR pmacsdd, pmulld, paddd FMA_INSTR pmacsww, pmullw, paddw FMA_INSTR pmadcswd, pmaddwd, paddd diff --git a/tests/Makefile b/tests/Makefile index 558e52331a..fbac76a0e3 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -15,9 +15,6 @@ ffservertest: ffserver$(EXESUF) tests/vsynth1/00.pgm tests/data/asynth1.sw OBJDIRS += tests/data tests/vsynth1 -# Required due to missing automatic dependency tracking for HOSTOBJS. -tests/rotozoom.o tests/videogen.o: tests/utils.c - tests/vsynth1/00.pgm: tests/videogen$(HOSTEXESUF) | tests/vsynth1 $(M)./$< 'tests/vsynth1/'