From faa1471ffcc1bbffe3a9d3d1f4b5fe3adbef647a Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Sat, 28 Jun 2014 11:05:52 -0400
Subject: [PATCH] swr: rewrite resample_common/linear_float_sse/avx in yasm.

Linear interpolation goes from 63 (llvm) or 58 (gcc) to 48 (yasm)
cycles/sample on 64bit, or from 66 (llvm/gcc) to 52 (yasm) cycles/
sample on 32bit. Non-linear goes from 43 (llvm) or 38 (gcc) to 32
(yasm) cycles/sample on 64bit, or from 46 (llvm) or 44 (gcc) to 38
(yasm) cycles/sample on 32bit (all testing on OSX 10.9.2, llvm 5.1
and gcc 4.8/9).

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
---
 configure                            |   3 +-
 libswresample/resample_template.c    |  12 +-
 libswresample/x86/Makefile           |   1 +
 libswresample/x86/resample.asm       | 462 +++++++++++++++++++++++++++
 libswresample/x86/resample_mmx.h     | 118 -------
 libswresample/x86/resample_x86_dsp.c |  34 +-
 6 files changed, 481 insertions(+), 149 deletions(-)
 create mode 100644 libswresample/x86/resample.asm

diff --git a/configure b/configure
index 296ca2bbe4..6c5ae3ec8c 100755
--- a/configure
+++ b/configure
@@ -4460,8 +4460,7 @@ EOF
 check_inline_asm inline_asm_direct_symbol_refs '"movl '$extern_prefix'test, %eax"' ||
 check_inline_asm inline_asm_direct_symbol_refs '"movl '$extern_prefix'test(%rip), %eax"'
 
-# check whether binutils is new enough to compile AVX/SSSE3/MMXEXT
-enabled avx    && check_inline_asm avx_inline    '"vextractf128 $1, %ymm0, %xmm1"'
+# check whether binutils is new enough to compile SSSE3/MMXEXT
 enabled ssse3  && check_inline_asm ssse3_inline  '"pabsw %xmm0, %xmm0"'
 enabled mmxext && check_inline_asm mmxext_inline '"pmaxub %mm0, %mm1"'
 
diff --git a/libswresample/resample_template.c b/libswresample/resample_template.c
index 0fc9770b28..db208e3414 100644
--- a/libswresample/resample_template.c
+++ b/libswresample/resample_template.c
@@ -43,9 +43,7 @@
 # define RENAME(N) N ## _double_sse2
 # endif
 
-#elif defined(TEMPLATE_RESAMPLE_FLT) \
-      || defined(TEMPLATE_RESAMPLE_FLT_SSE) \
-      || defined(TEMPLATE_RESAMPLE_FLT_AVX)
+#elif defined(TEMPLATE_RESAMPLE_FLT)
 
 # define FILTER_SHIFT 0
 # define DELEM float
@@ -56,14 +54,6 @@
 
 # if defined(TEMPLATE_RESAMPLE_FLT)
 # define RENAME(N) N ## _float
-# elif defined(TEMPLATE_RESAMPLE_FLT_SSE)
-# define COMMON_CORE COMMON_CORE_FLT_SSE
-# define LINEAR_CORE LINEAR_CORE_FLT_SSE
-# define RENAME(N) N ## _float_sse
-# elif defined(TEMPLATE_RESAMPLE_FLT_AVX)
-# define COMMON_CORE COMMON_CORE_FLT_AVX
-# define LINEAR_CORE LINEAR_CORE_FLT_AVX
-# define RENAME(N) N ## _float_avx
 # endif
 
 #elif defined(TEMPLATE_RESAMPLE_S32)
diff --git a/libswresample/x86/Makefile b/libswresample/x86/Makefile
index cc3e65f26f..cb6371ae1f 100644
--- a/libswresample/x86/Makefile
+++ b/libswresample/x86/Makefile
@@ -1,6 +1,7 @@
 YASM-OBJS += x86/swresample_x86.o\
              x86/audio_convert.o\
              x86/rematrix.o\
+             x86/resample.o\
 
 OBJS += x86/resample_x86_dsp.o\
 
diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm
new file mode 100644
index 0000000000..bb63fc7e8e
--- /dev/null
+++ b/libswresample/x86/resample.asm
@@ -0,0 +1,462 @@
+;******************************************************************************
+;* Copyright (c) 2012 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 +%define pointer resq +%else +%define pointer resd +%endif + +struc ResampleContext + .av_class: pointer 1 + .filter_bank: pointer 1 + .filter_length: resd 1 + .filter_alloc: resd 1 + .ideal_dst_incr: resd 1 + .dst_incr: resd 1 + .dst_incr_div: resd 1 + .dst_incr_mod: resd 1 + .index: resd 1 + .frac: resd 1 + .src_incr: resd 1 + .compensation_distance: resd 1 + .phase_shift: resd 1 + .phase_mask: resd 1 + + ; there's a few more here but we only care about the first few +endstruc + +SECTION_RODATA + +pf_1: dd 1.0 + +SECTION .text + +%macro RESAMPLE_FLOAT_FNS 0 +; int resample_common_float(ResampleContext *ctx, float *dst, +; const float *src, int size, int update_ctx) +%if ARCH_X86_64 ; unix64 and win64 +cglobal resample_common_float, 0, 15, 2, ctx, dst, src, phase_shift, index, frac, \ + dst_incr_mod, size, min_filter_count_x4, \ + min_filter_len_x4, dst_incr_div, src_incr, \ + phase_mask, dst_end, filter_bank + + ; use red-zone for variable storage +%define ctx_stackq [rsp-0x8] +%define src_stackq [rsp-0x10] +%if WIN64 +%define update_context_stackd r4m +%else ; unix64 +%define update_context_stackd [rsp-0x14] +%endif + + ; load as many variables in registers as possible; for the rest, store + ; on stack so that we have 'ctx' available as one extra register + mov sized, r3d + mov phase_maskd, [ctxq+ResampleContext.phase_mask] +%if UNIX64 + mov update_context_stackd, r4d +%endif + mov indexd, [ctxq+ResampleContext.index] + mov fracd, [ctxq+ResampleContext.frac] + mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod] + mov filter_bankq, [ctxq+ResampleContext.filter_bank] + mov src_incrd, [ctxq+ResampleContext.src_incr] + mov ctx_stackq, ctxq + mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length] + mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div] + shl min_filter_len_x4d, 2 + lea dst_endq, [dstq+sizeq*4] + +%if UNIX64 + mov ecx, [ctxq+ResampleContext.phase_shift] + mov edi, [ctxq+ResampleContext.filter_alloc] + + DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \ + filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ + src_incr, phase_mask, dst_end, filter_bank +%elif WIN64 + mov R9d, [ctxq+ResampleContext.filter_alloc] + mov ecx, [ctxq+ResampleContext.phase_shift] + + DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \ + filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ + src_incr, phase_mask, dst_end, filter_bank +%endif + + neg min_filter_len_x4q + sub filter_bankq, min_filter_len_x4q + sub srcq, min_filter_len_x4q + mov src_stackq, srcq +%else ; x86-32 +cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \ + index, min_filter_length_x4, filter_bank + + ; push temp variables to stack +%define ctx_stackq r0mp +%define src_stackq r2mp +%define update_context_stackd r4m + + mov dstq, r1mp + mov r3, r3mp + lea r3, [dstq+r3*4] + PUSH dword [ctxq+ResampleContext.dst_incr_div] + PUSH 
dword [ctxq+ResampleContext.dst_incr_mod] + PUSH dword [ctxq+ResampleContext.filter_alloc] + PUSH r3 + PUSH dword [ctxq+ResampleContext.phase_mask] + PUSH dword [ctxq+ResampleContext.src_incr] + mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length] + mov indexd, [ctxq+ResampleContext.index] + shl min_filter_length_x4d, 2 + mov fracd, [ctxq+ResampleContext.frac] + neg min_filter_length_x4q + mov filter_bankq, [ctxq+ResampleContext.filter_bank] + sub r2mp, min_filter_length_x4q + sub filter_bankq, min_filter_length_x4q + PUSH min_filter_length_x4q + PUSH filter_bankq + mov phase_shiftd, [ctxq+ResampleContext.phase_shift] + + DEFINE_ARGS src, phase_shift, dst, frac, index, min_filter_count_x4, filter + +%define filter_bankq dword [rsp+0x0] +%define min_filter_length_x4q dword [rsp+0x4] +%define src_incrd dword [rsp+0x8] +%define phase_maskd dword [rsp+0xc] +%define dst_endq dword [rsp+0x10] +%define filter_allocd dword [rsp+0x14] +%define dst_incr_modd dword [rsp+0x18] +%define dst_incr_divd dword [rsp+0x1c] + + mov srcq, r2mp +%endif + +.loop: + mov filterd, filter_allocd + imul filterd, indexd +%if ARCH_X86_64 + mov min_filter_count_x4q, min_filter_len_x4q + lea filterq, [filter_bankq+filterq*4] +%else ; x86-32 + mov min_filter_count_x4q, filter_bankq + lea filterq, [min_filter_count_x4q+filterq*4] + mov min_filter_count_x4q, min_filter_length_x4q +%endif + xorps m0, m0, m0 + + align 16 +.inner_loop: + movups m1, [srcq+min_filter_count_x4q*1] + mulps m1, m1, [filterq+min_filter_count_x4q*1] + addps m0, m0, m1 + add min_filter_count_x4q, mmsize + js .inner_loop + +%if cpuflag(avx) + vextractf128 xm1, m0, 0x1 + addps xm0, xm1 +%endif + + ; horizontal sum & store + movhlps xm1, xm0 + addps xm0, xm1 + shufps xm1, xm0, xm0, q0001 + add fracd, dst_incr_modd + addps xm0, xm1 + add indexd, dst_incr_divd + movss [dstq], xm0 + cmp fracd, src_incrd + jl .skip + sub fracd, src_incrd + inc indexd + +%if UNIX64 + DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \ + index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ + src_incr, phase_mask, dst_end, filter_bank +%elif WIN64 + DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \ + index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ + src_incr, phase_mask, dst_end, filter_bank +%else ; x86-32 + DEFINE_ARGS src, phase_shift, dst, frac, index, index_incr +%endif + +.skip: + mov index_incrd, indexd + add dstq, 4 + and indexd, phase_maskd + sar index_incrd, phase_shiftb + lea srcq, [srcq+index_incrq*4] + cmp dstq, dst_endq + jne .loop + +%if ARCH_X86_64 + DEFINE_ARGS ctx, dst, src, phase_shift, index, frac +%else ; x86-32 + DEFINE_ARGS src, ctx, update_context, frac, index +%endif + + cmp dword update_context_stackd, 0 + jz .skip_store + ; strictly speaking, the function should always return the consumed + ; number of bytes; however, we only use the value if update_context + ; is true, so let's just leave it uninitialized otherwise + mov ctxq, ctx_stackq + movifnidn rax, srcq + mov [ctxq+ResampleContext.frac ], fracd + sub rax, src_stackq + mov [ctxq+ResampleContext.index], indexd + shr rax, 2 + +.skip_store: +%if ARCH_X86_32 + ADD rsp, 0x20 +%endif + RET + +; int resample_linear_float(ResampleContext *ctx, float *dst, +; const float *src, int size, int update_ctx) +%if ARCH_X86_64 ; unix64 and win64 +cglobal resample_linear_float, 0, 15, 5, ctx, dst, src, phase_shift, index, frac, \ + dst_incr_mod, size, min_filter_count_x4, \ + min_filter_len_x4, dst_incr_div, 
src_incr, \ + phase_mask, dst_end, filter_bank + + ; use red-zone for variable storage +%define ctx_stackq [rsp-0x8] +%define src_stackq [rsp-0x10] +%define phase_mask_stackd [rsp-0x14] +%if WIN64 +%define update_context_stackd r4m +%else ; unix64 +%define update_context_stackd [rsp-0x18] +%endif + + ; load as many variables in registers as possible; for the rest, store + ; on stack so that we have 'ctx' available as one extra register + mov sized, r3d + mov phase_maskd, [ctxq+ResampleContext.phase_mask] +%if UNIX64 + mov update_context_stackd, r4d +%endif + mov indexd, [ctxq+ResampleContext.index] + mov fracd, [ctxq+ResampleContext.frac] + mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod] + mov filter_bankq, [ctxq+ResampleContext.filter_bank] + mov src_incrd, [ctxq+ResampleContext.src_incr] + mov ctx_stackq, ctxq + mov phase_mask_stackd, phase_maskd + mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length] + cvtsi2ss xm0, src_incrd + movss xm4, [pf_1] + divss xm4, xm0 + mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div] + shl min_filter_len_x4d, 2 + lea dst_endq, [dstq+sizeq*4] + +%if UNIX64 + mov ecx, [ctxq+ResampleContext.phase_shift] + mov edi, [ctxq+ResampleContext.filter_alloc] + + DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \ + filter1, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ + src_incr, filter2, dst_end, filter_bank +%elif WIN64 + mov R9d, [ctxq+ResampleContext.filter_alloc] + mov ecx, [ctxq+ResampleContext.phase_shift] + + DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \ + filter1, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ + src_incr, filter2, dst_end, filter_bank +%endif + + neg min_filter_len_x4q + sub filter_bankq, min_filter_len_x4q + sub srcq, min_filter_len_x4q + mov src_stackq, srcq +%else ; x86-32 +cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \ + index, min_filter_length_x4, filter_bank + + ; push temp variables to stack +%define ctx_stackq r0mp +%define src_stackq r2mp +%define update_context_stackd r4m + + mov dstq, r1mp + mov r3, r3mp + lea r3, [dstq+r3*4] + PUSH dword [ctxq+ResampleContext.dst_incr_div] + PUSH r3 + mov r3, dword [ctxq+ResampleContext.filter_alloc] + PUSH dword [ctxq+ResampleContext.dst_incr_mod] + PUSH r3 + shl r3, 2 + PUSH r3 + mov r3, dword [ctxq+ResampleContext.src_incr] + PUSH dword [ctxq+ResampleContext.phase_mask] + PUSH r3d + cvtsi2ss xm0, r3d + movss xm4, [pf_1] + divss xm4, xm0 + mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length] + mov indexd, [ctxq+ResampleContext.index] + shl min_filter_length_x4d, 2 + mov fracd, [ctxq+ResampleContext.frac] + neg min_filter_length_x4q + mov filter_bankq, [ctxq+ResampleContext.filter_bank] + sub r2mp, min_filter_length_x4q + sub filter_bankq, min_filter_length_x4q + PUSH min_filter_length_x4q + PUSH filter_bankq + PUSH dword [ctxq+ResampleContext.phase_shift] + + DEFINE_ARGS src, filter1, dst, frac, index, min_filter_count_x4, filter2 + +%define phase_shift_stackd dword [rsp+0x0] +%define filter_bankq dword [rsp+0x4] +%define min_filter_length_x4q dword [rsp+0x8] +%define src_incrd dword [rsp+0xc] +%define phase_mask_stackd dword [rsp+0x10] +%define filter_alloc_x4q dword [rsp+0x14] +%define filter_allocd dword [rsp+0x18] +%define dst_incr_modd dword [rsp+0x1c] +%define dst_endq dword [rsp+0x20] +%define dst_incr_divd dword [rsp+0x24] + + mov srcq, r2mp +%endif + +.loop: + mov filter1d, filter_allocd + imul filter1d, indexd +%if ARCH_X86_64 + mov min_filter_count_x4q, 
min_filter_len_x4q + lea filter1q, [filter_bankq+filter1q*4] + lea filter2q, [filter1q+filter_allocq*4] +%else ; x86-32 + mov min_filter_count_x4q, filter_bankq + lea filter1q, [min_filter_count_x4q+filter1q*4] + mov min_filter_count_x4q, min_filter_length_x4q + mov filter2q, filter1q + add filter2q, filter_alloc_x4q +%endif + xorps m0, m0, m0 + xorps m2, m2, m2 + + align 16 +.inner_loop: + movups m1, [srcq+min_filter_count_x4q*1] + mulps m3, m1, [filter2q+min_filter_count_x4q*1] + mulps m1, m1, [filter1q+min_filter_count_x4q*1] + addps m2, m2, m3 + addps m0, m0, m1 + add min_filter_count_x4q, mmsize + js .inner_loop + +%if cpuflag(avx) + vextractf128 xm1, m0, 0x1 + vextractf128 xm3, m2, 0x1 + addps xm0, xm1 + addps xm2, xm3 +%endif + + ; val += (v2 - val) * (FELEML) frac / c->src_incr; + cvtsi2ss xm1, fracd + subps xm2, xm0 + mulps xm1, xm4 + shufps xm1, xm1, q0000 + mulps xm2, xm1 + addps xm0, xm2 + + ; horizontal sum & store + movhlps xm1, xm0 + addps xm0, xm1 + shufps xm1, xm0, xm0, q0001 + add fracd, dst_incr_modd + addps xm0, xm1 + add indexd, dst_incr_divd + movss [dstq], xm0 + cmp fracd, src_incrd + jl .skip + sub fracd, src_incrd + inc indexd + +%if UNIX64 + DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \ + index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ + src_incr, filter2, dst_end, filter_bank +%elif WIN64 + DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \ + index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \ + src_incr, filter2, dst_end, filter_bank +%else ; x86-32 + DEFINE_ARGS src, phase_shift, dst, frac, index, index_incr +%endif + +.skip: +%if ARCH_X86_32 + mov phase_shiftd, phase_shift_stackd +%endif + mov index_incrd, indexd + add dstq, 4 + and indexd, phase_mask_stackd + sar index_incrd, phase_shiftb + lea srcq, [srcq+index_incrq*4] + cmp dstq, dst_endq + jne .loop + +%if ARCH_X86_64 + DEFINE_ARGS ctx, dst, src, phase_shift, index, frac +%else ; x86-32 + DEFINE_ARGS src, ctx, update_context, frac, index +%endif + + cmp dword update_context_stackd, 0 + jz .skip_store + ; strictly speaking, the function should always return the consumed + ; number of bytes; however, we only use the value if update_context + ; is true, so let's just leave it uninitialized otherwise + mov ctxq, ctx_stackq + movifnidn rax, srcq + mov [ctxq+ResampleContext.frac ], fracd + sub rax, src_stackq + mov [ctxq+ResampleContext.index], indexd + shr rax, 2 + +.skip_store: +%if ARCH_X86_32 + ADD rsp, 0x28 +%endif + RET +%endmacro + +INIT_XMM sse +RESAMPLE_FLOAT_FNS + +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +RESAMPLE_FLOAT_FNS +%endif diff --git a/libswresample/x86/resample_mmx.h b/libswresample/x86/resample_mmx.h index a4da1e9d1d..94237b0507 100644 --- a/libswresample/x86/resample_mmx.h +++ b/libswresample/x86/resample_mmx.h @@ -132,124 +132,6 @@ __asm__ volatile(\ XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\ ); -#define COMMON_CORE_FLT_SSE \ - x86_reg len= -4*c->filter_length;\ -__asm__ volatile(\ - "xorps %%xmm0, %%xmm0 \n\t"\ - "1: \n\t"\ - "movups (%1, %0), %%xmm1 \n\t"\ - "mulps (%2, %0), %%xmm1 \n\t"\ - "addps %%xmm1, %%xmm0 \n\t"\ - "add $16, %0 \n\t"\ - " js 1b \n\t"\ - "movhlps %%xmm0, %%xmm1 \n\t"\ - "addps %%xmm1, %%xmm0 \n\t"\ - "movss %%xmm0, %%xmm1 \n\t"\ - "shufps $1, %%xmm0, %%xmm0 \n\t"\ - "addps %%xmm1, %%xmm0 \n\t"\ - "movss %%xmm0, (%3) \n\t"\ - : "+r" (len)\ - : "r" (((uint8_t*)(src+sample_index))-len),\ - "r" (((uint8_t*)filter)-len),\ - "r" (dst+dst_index)\ - 
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1")\ -); - -#define LINEAR_CORE_FLT_SSE \ - x86_reg len= -4*c->filter_length;\ -__asm__ volatile(\ - "xorps %%xmm0, %%xmm0 \n\t"\ - "xorps %%xmm2, %%xmm2 \n\t"\ - "1: \n\t"\ - "movups (%3, %0), %%xmm1 \n\t"\ - "movaps %%xmm1, %%xmm3 \n\t"\ - "mulps (%4, %0), %%xmm1 \n\t"\ - "mulps (%5, %0), %%xmm3 \n\t"\ - "addps %%xmm1, %%xmm0 \n\t"\ - "addps %%xmm3, %%xmm2 \n\t"\ - "add $16, %0 \n\t"\ - " js 1b \n\t"\ - "movhlps %%xmm0, %%xmm1 \n\t"\ - "movhlps %%xmm2, %%xmm3 \n\t"\ - "addps %%xmm1, %%xmm0 \n\t"\ - "addps %%xmm3, %%xmm2 \n\t"\ - "movss %%xmm0, %%xmm1 \n\t"\ - "movss %%xmm2, %%xmm3 \n\t"\ - "shufps $1, %%xmm0, %%xmm0 \n\t"\ - "shufps $1, %%xmm2, %%xmm2 \n\t"\ - "addps %%xmm1, %%xmm0 \n\t"\ - "addps %%xmm3, %%xmm2 \n\t"\ - "movss %%xmm0, %1 \n\t"\ - "movss %%xmm2, %2 \n\t"\ - : "+r" (len),\ - "=m" (val),\ - "=m" (v2)\ - : "r" (((uint8_t*)(src+sample_index))-len),\ - "r" (((uint8_t*)filter)-len),\ - "r" (((uint8_t*)(filter+c->filter_alloc))-len)\ - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\ -); - -#define COMMON_CORE_FLT_AVX \ - x86_reg len= -4*c->filter_length;\ -__asm__ volatile(\ - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t"\ - "1: \n\t"\ - "vmovups (%1, %0), %%ymm1 \n\t"\ - "vmulps (%2, %0), %%ymm1, %%ymm1 \n\t"\ - "vaddps %%ymm1, %%ymm0, %%ymm0 \n\t"\ - "add $32, %0 \n\t"\ - " js 1b \n\t"\ - "vextractf128 $1, %%ymm0, %%xmm1 \n\t"\ - "vaddps %%xmm1, %%xmm0, %%xmm0 \n\t"\ - "vmovhlps %%xmm0, %%xmm1, %%xmm1 \n\t"\ - "vaddps %%xmm1, %%xmm0, %%xmm0 \n\t"\ - "vshufps $1, %%xmm0, %%xmm0, %%xmm1 \n\t"\ - "vaddss %%xmm1, %%xmm0, %%xmm0 \n\t"\ - "vmovss %%xmm0, (%3) \n\t"\ - : "+r" (len)\ - : "r" (((uint8_t*)(src+sample_index))-len),\ - "r" (((uint8_t*)filter)-len),\ - "r" (dst+dst_index)\ - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1")\ -); - -#define LINEAR_CORE_FLT_AVX \ - x86_reg len= -4*c->filter_length;\ -__asm__ volatile(\ - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t"\ - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t"\ - "1: \n\t"\ - "vmovups (%3, %0), %%ymm1 \n\t"\ - "vmulps (%5, %0), %%ymm1, %%ymm3 \n\t"\ - "vmulps (%4, %0), %%ymm1, %%ymm1 \n\t"\ - "vaddps %%ymm1, %%ymm0, %%ymm0 \n\t"\ - "vaddps %%ymm3, %%ymm2, %%ymm2 \n\t"\ - "add $32, %0 \n\t"\ - " js 1b \n\t"\ - "vextractf128 $1, %%ymm0, %%xmm1 \n\t"\ - "vextractf128 $1, %%ymm2, %%xmm3 \n\t"\ - "vaddps %%xmm1, %%xmm0, %%xmm0 \n\t"\ - "vaddps %%xmm3, %%xmm2, %%xmm2 \n\t"\ - "vmovhlps %%xmm0, %%xmm1, %%xmm1 \n\t"\ - "vmovhlps %%xmm2, %%xmm3, %%xmm3 \n\t"\ - "vaddps %%xmm1, %%xmm0, %%xmm0 \n\t"\ - "vaddps %%xmm3, %%xmm2, %%xmm2 \n\t"\ - "vshufps $1, %%xmm0, %%xmm0, %%xmm1 \n\t"\ - "vshufps $1, %%xmm2, %%xmm2, %%xmm3 \n\t"\ - "vaddss %%xmm1, %%xmm0, %%xmm0 \n\t"\ - "vaddss %%xmm3, %%xmm2, %%xmm2 \n\t"\ - "vmovss %%xmm0, %1 \n\t"\ - "vmovss %%xmm2, %2 \n\t"\ - : "+r" (len),\ - "=m" (val),\ - "=m" (v2)\ - : "r" (((uint8_t*)(src+sample_index))-len),\ - "r" (((uint8_t*)filter)-len),\ - "r" (((uint8_t*)(filter+c->filter_alloc))-len)\ - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\ -); - #define COMMON_CORE_DBL_SSE2 \ x86_reg len= -8*c->filter_length;\ __asm__ volatile(\ diff --git a/libswresample/x86/resample_x86_dsp.c b/libswresample/x86/resample_x86_dsp.c index 63493af0de..c7d2054f9c 100644 --- a/libswresample/x86/resample_x86_dsp.c +++ b/libswresample/x86/resample_x86_dsp.c @@ -50,12 +50,6 @@ int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const do #undef TEMPLATE_RESAMPLE_S16_MMX2 #endif -#if HAVE_SSE_INLINE -#define TEMPLATE_RESAMPLE_FLT_SSE -#include "libswresample/resample_template.c" -#undef 
TEMPLATE_RESAMPLE_FLT_SSE -#endif - #if HAVE_SSE2_INLINE #define TEMPLATE_RESAMPLE_S16_SSE2 #include "libswresample/resample_template.c" @@ -66,16 +60,20 @@ int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const do #undef TEMPLATE_RESAMPLE_DBL_SSE2 #endif -#if HAVE_AVX_INLINE -#define TEMPLATE_RESAMPLE_FLT_AVX -#include "libswresample/resample_template.c" -#undef TEMPLATE_RESAMPLE_FLT_AVX -#endif - #undef DO_RESAMPLE_ONE #endif // HAVE_MMXEXT_INLINE +int ff_resample_common_float_sse(ResampleContext *c, uint8_t *dst, + const uint8_t *src, int sz, int upd); +int ff_resample_linear_float_sse(ResampleContext *c, uint8_t *dst, + const uint8_t *src, int sz, int upd); + +int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst, + const uint8_t *src, int sz, int upd); +int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst, + const uint8_t *src, int sz, int upd); + void swresample_dsp_x86_init(ResampleContext *c) { int av_unused mm_flags = av_get_cpu_flags(); @@ -85,9 +83,9 @@ void swresample_dsp_x86_init(ResampleContext *c) c->dsp.resample_common[FNIDX(S16P)] = (resample_fn) swri_resample_common_int16_mmx2; c->dsp.resample_linear[FNIDX(S16P)] = (resample_fn) swri_resample_linear_int16_mmx2; } - if (HAVE_SSE_INLINE && mm_flags & AV_CPU_FLAG_SSE) { - c->dsp.resample_common[FNIDX(FLTP)] = (resample_fn) swri_resample_common_float_sse; - c->dsp.resample_linear[FNIDX(FLTP)] = (resample_fn) swri_resample_linear_float_sse; + if (HAVE_SSE_EXTERNAL && mm_flags & AV_CPU_FLAG_SSE) { + c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_sse; + c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_sse; } if (HAVE_SSE2_INLINE && mm_flags & AV_CPU_FLAG_SSE2) { c->dsp.resample_common[FNIDX(S16P)] = (resample_fn) swri_resample_common_int16_sse2; @@ -95,8 +93,8 @@ void swresample_dsp_x86_init(ResampleContext *c) c->dsp.resample_common[FNIDX(DBLP)] = (resample_fn) swri_resample_common_double_sse2; c->dsp.resample_linear[FNIDX(DBLP)] = (resample_fn) swri_resample_linear_double_sse2; } - if (HAVE_AVX_INLINE && mm_flags & AV_CPU_FLAG_AVX) { - c->dsp.resample_common[FNIDX(FLTP)] = (resample_fn) swri_resample_common_float_avx; - c->dsp.resample_linear[FNIDX(FLTP)] = (resample_fn) swri_resample_linear_float_avx; + if (HAVE_AVX_EXTERNAL && mm_flags & AV_CPU_FLAG_AVX) { + c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx; + c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_avx; } }
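
For readers comparing the new yasm code against the C reference in
resample_template.c, here is a rough scalar sketch (plain C, illustrative
names only, not part of the patch) of the per-output-sample work the SSE/AVX
inner loops vectorize: a dot product of the source window with the selected
polyphase filter, and, in the linear variant, a blend of two adjacent filter
phases weighted by frac/src_incr -- the same
"val += (v2 - val) * (FELEML) frac / c->src_incr" formula quoted as a comment
in the asm.

/* Rough scalar model of the vectorized inner loops above; names are
 * illustrative and do not exactly match the swresample internals. */
static float dot_product(const float *src, const float *filter, int filter_length)
{
    float sum = 0.0f;
    for (int i = 0; i < filter_length; i++)
        sum += src[i] * filter[i];   /* mulps/addps in the loop, then a horizontal add */
    return sum;
}

/* One output sample of the "common" variant: a plain polyphase FIR tap sum. */
static float common_sample(const float *src, const float *filter, int filter_length)
{
    return dot_product(src, filter, filter_length);
}

/* One output sample of the "linear" variant: filter2 is the next phase of the
 * filter bank; frac/src_incr picks the blend point between the two phases
 * (the asm precomputes 1.0/src_incr in xm4 once, outside the loop). */
static float linear_sample(const float *src, const float *filter1, const float *filter2,
                           int filter_length, int frac, int src_incr)
{
    float val = dot_product(src, filter1, filter_length);
    float v2  = dot_product(src, filter2, filter_length);
    return val + (v2 - val) * ((float)frac / src_incr);
}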