swr: convert resample_common/linear_int16_mmx2/sse2 to yasm.

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>

parent e5c806fd67
commit 847bb638c0
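For orientation before the per-file diffs: the functions being ported compute one polyphase FIR result per output sample. Below is a rough scalar sketch of the int16 "common" path, pieced together from the template defines and the ResampleContext fields referenced in this diff (FILTER_SHIFT 15, the OUT() rounding/clip, the frac/index stepping). The ToyResampleContext struct and all names here are illustrative only, not the library's actual definitions.

```c
#include <stdint.h>

/* Simplified stand-in for the ResampleContext fields the diff references;
 * the real struct lives in libswresample/resample.h. */
typedef struct ToyResampleContext {
    int      index, frac;
    int      src_incr, dst_incr_div, dst_incr_mod;
    int      filter_length, filter_alloc;
    int      phase_shift, phase_mask;
    int16_t *filter_bank;
} ToyResampleContext;

int toy_resample_common_int16(ToyResampleContext *c, int16_t *dst,
                              const int16_t *src, int n)
{
    int index = c->index, frac = c->frac, sample_index = 0;

    for (int dst_index = 0; dst_index < n; dst_index++) {
        const int16_t *filter = c->filter_bank + c->filter_alloc * index;
        int v = 0;

        /* FIR dot product over one polyphase filter (pmaddwd/paddd in the asm) */
        for (int i = 0; i < c->filter_length; i++)
            v += src[sample_index + i] * filter[i];

        /* round, shift by FILTER_SHIFT (15) and saturate to int16 (the OUT() macro) */
        v = (v + (1 << 14)) >> 15;
        dst[dst_index] = (unsigned)(v + 32768) > 65535 ? (v >> 31) ^ 32767 : v;

        /* fractional stepping of the read position */
        frac  += c->dst_incr_mod;
        index += c->dst_incr_div;
        if (frac >= c->src_incr) {
            frac -= c->src_incr;
            index++;
        }
        sample_index += index >> c->phase_shift;
        index        &= c->phase_mask;
    }

    c->index = index;
    c->frac  = frac;
    return sample_index;
}
```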
@@ -44,17 +44,15 @@
#elif defined(TEMPLATE_RESAMPLE_FLT)
# define RENAME(N) N ## _float
# define FILTER_SHIFT 0
# define DELEM float
# define FELEM float
# define FELEM2 float
# define OUT(d, v) d = v
# if defined(TEMPLATE_RESAMPLE_FLT)
# define RENAME(N) N ## _float
# endif
#elif defined(TEMPLATE_RESAMPLE_S32)
# define RENAME(N) N ## _int32
# define FILTER_SHIFT 30
# define DELEM int32_t
@@ -65,10 +63,9 @@
# define OUT(d, v) v = (v + (1<<(FILTER_SHIFT-1)))>>FILTER_SHIFT;\
d = (uint64_t)(v + 0x80000000) > 0xFFFFFFFF ? (v>>63) ^ 0x7FFFFFFF : v
#elif defined(TEMPLATE_RESAMPLE_S16) \
|| defined(TEMPLATE_RESAMPLE_S16_MMX2) \
|| defined(TEMPLATE_RESAMPLE_S16_SSE2)
#elif defined(TEMPLATE_RESAMPLE_S16)
# define RENAME(N) N ## _int16
# define FILTER_SHIFT 15
# define DELEM int16_t
# define FELEM int16_t
@@ -79,18 +76,6 @@
# define OUT(d, v) v = (v + (1<<(FILTER_SHIFT-1)))>>FILTER_SHIFT;\
d = (unsigned)(v + 32768) > 65535 ? (v>>31) ^ 32767 : v
# if defined(TEMPLATE_RESAMPLE_S16)
# define RENAME(N) N ## _int16
# elif defined(TEMPLATE_RESAMPLE_S16_MMX2)
# define COMMON_CORE COMMON_CORE_INT16_MMX2
# define LINEAR_CORE LINEAR_CORE_INT16_MMX2
# define RENAME(N) N ## _int16_mmx2
# elif defined(TEMPLATE_RESAMPLE_S16_SSE2)
# define COMMON_CORE COMMON_CORE_INT16_SSE2
# define LINEAR_CORE LINEAR_CORE_INT16_SSE2
# define RENAME(N) N ## _int16_sse2
# endif
#endif
#if DO_RESAMPLE_ONE
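The two OUT() definitions above do a round, an arithmetic shift by FILTER_SHIFT, and a branch-free clip to the destination range. A small self-contained C check of the int16 clip expression (the helper name is illustrative):

```c
#include <assert.h>
#include <stdint.h>

/* Same expression as the int16 OUT() macro above, applied after the rounding shift. */
static int16_t clip_int16_branchless(int32_t v)
{
    /* If v + 32768 does not fit in 16 unsigned bits, v is out of range:
     * (v >> 31) is 0 for positive overflow and -1 for negative overflow,
     * so XOR with 32767 yields 32767 or -32768 respectively. */
    return (unsigned)(v + 32768) > 65535 ? (v >> 31) ^ 32767 : v;
}

int main(void)
{
    assert(clip_int16_branchless( 40000) ==  32767);
    assert(clip_int16_branchless(-40000) == -32768);
    assert(clip_int16_branchless(  1234) ==   1234);
    return 0;
}
```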
@@ -1,6 +1,7 @@
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com>
;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
@@ -48,18 +49,19 @@ endstruc
SECTION_RODATA
pf_1: dd 1.0
pf_1: dd 1.0
pd_0x4000: dd 0x4000
SECTION .text
%macro RESAMPLE_FLOAT_FNS 0
; int resample_common_float(ResampleContext *ctx, float *dst,
; const float *src, int size, int update_ctx)
%macro RESAMPLE_FNS 3 ; format [float or int16], bps, log2_bps
; int resample_common_$format(ResampleContext *ctx, $format *dst,
; const $format *src, int size, int update_ctx)
%if ARCH_X86_64 ; unix64 and win64
cglobal resample_common_float, 0, 15, 2, ctx, dst, src, phase_shift, index, frac, \
dst_incr_mod, size, min_filter_count_x4, \
min_filter_len_x4, dst_incr_div, src_incr, \
phase_mask, dst_end, filter_bank
cglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_shift, index, frac, \
dst_incr_mod, size, min_filter_count_x4, \
min_filter_len_x4, dst_incr_div, src_incr, \
phase_mask, dst_end, filter_bank
; use red-zone for variable storage
%define ctx_stackq [rsp-0x8]
@@ -85,8 +87,8 @@ cglobal resample_common_float, 0, 15, 2, ctx, dst, src, phase_shift, index, frac
mov ctx_stackq, ctxq
mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
shl min_filter_len_x4d, 2
lea dst_endq, [dstq+sizeq*4]
shl min_filter_len_x4d, %3
lea dst_endq, [dstq+sizeq*%2]
%if UNIX64
mov ecx, [ctxq+ResampleContext.phase_shift]
@@ -109,8 +111,8 @@ cglobal resample_common_float, 0, 15, 2, ctx, dst, src, phase_shift, index, frac
sub srcq, min_filter_len_x4q
mov src_stackq, srcq
%else ; x86-32
cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
index, min_filter_length_x4, filter_bank
cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
index, min_filter_length_x4, filter_bank
; push temp variables to stack
%define ctx_stackq r0mp
@@ -119,7 +121,7 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
mov dstq, r1mp
mov r3, r3mp
lea r3, [dstq+r3*4]
lea r3, [dstq+r3*%2]
PUSH dword [ctxq+ResampleContext.dst_incr_div]
PUSH dword [ctxq+ResampleContext.dst_incr_mod]
PUSH dword [ctxq+ResampleContext.filter_alloc]
@@ -128,7 +130,7 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
PUSH dword [ctxq+ResampleContext.src_incr]
mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
mov indexd, [ctxq+ResampleContext.index]
shl min_filter_length_x4d, 2
shl min_filter_length_x4d, %3
mov fracd, [ctxq+ResampleContext.frac]
neg min_filter_length_x4q
mov filter_bankq, [ctxq+ResampleContext.filter_bank]
@@ -157,19 +159,28 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
imul filterd, indexd
%if ARCH_X86_64
mov min_filter_count_x4q, min_filter_len_x4q
lea filterq, [filter_bankq+filterq*4]
lea filterq, [filter_bankq+filterq*%2]
%else ; x86-32
mov min_filter_count_x4q, filter_bankq
lea filterq, [min_filter_count_x4q+filterq*4]
lea filterq, [min_filter_count_x4q+filterq*%2]
mov min_filter_count_x4q, min_filter_length_x4q
%endif
%ifidn %1, float
xorps m0, m0, m0
%else ; int16
movd m0, [pd_0x4000]
%endif
align 16
.inner_loop:
movups m1, [srcq+min_filter_count_x4q*1]
movu m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, float
mulps m1, m1, [filterq+min_filter_count_x4q*1]
addps m0, m0, m1
%else ; int16
pmaddwd m1, [filterq+min_filter_count_x4q*1]
paddd m0, m1
%endif
add min_filter_count_x4q, mmsize
js .inner_loop
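The inner loop above is the FIR dot product: the float path multiplies and accumulates one lane per element (mulps/addps), while the int16 path relies on pmaddwd, which multiplies pairs of signed 16-bit values and sums each adjacent pair into a 32-bit lane that paddd then accumulates. A scalar C model of one 128-bit step of each path (names are illustrative):

```c
#include <stdint.h>

/* Float path: one mulps/addps step over 4 lanes, written per element. */
void acc_step_float(float acc[4], const float *src, const float *filter)
{
    for (int lane = 0; lane < 4; lane++)
        acc[lane] += src[lane] * filter[lane];
}

/* Int16 path: one pmaddwd/paddd step over a 128-bit register.  pmaddwd forms
 * src[2k]*flt[2k] + src[2k+1]*flt[2k+1] as a 32-bit value per lane. */
void acc_step_int16(int32_t acc[4], const int16_t *src, const int16_t *flt)
{
    for (int lane = 0; lane < 4; lane++)
        acc[lane] += (int32_t)src[2 * lane]     * flt[2 * lane] +
                     (int32_t)src[2 * lane + 1] * flt[2 * lane + 1];
}
```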
@@ -179,6 +190,7 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
%endif
; horizontal sum & store
%ifidn %1, float
movhlps xm1, xm0
addps xm0, xm1
shufps xm1, xm0, xm0, q0001
@@ -186,6 +198,21 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
addps xm0, xm1
add indexd, dst_incr_divd
movss [dstq], xm0
%else ; int16
%if mmsize == 16
pshufd m1, m0, q0032
paddd m0, m1
pshufd m1, m0, q0001
%else ; mmsize == 8
pshufw m1, m0, q0032
%endif
paddd m0, m1
psrad m0, 15
add fracd, dst_incr_modd
packssdw m0, m0
add indexd, dst_incr_divd
movd [dstq], m0
%endif
cmp fracd, src_incrd
jl .skip
sub fracd, src_incrd
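After the loop, each lane holds a partial sum, which the shuffles above fold into a single value (movhlps/shufps for float, pshufd/pshufw for int16) before rounding, narrowing and storing. A scalar model of the 4-lane int32 reduction, with an illustrative helper name:

```c
#include <stdint.h>

/* Fold 4 partial sums into one, mirroring pshufd q0032 + paddd (pairs
 * lanes 0/2 and 1/3) followed by pshufd q0001 + paddd (combines the pair). */
int32_t hsum4_int32(const int32_t lane[4])
{
    int32_t a = lane[0] + lane[2];
    int32_t b = lane[1] + lane[3];
    return a + b;
}
```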
@@ -205,10 +232,10 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
.skip:
mov index_incrd, indexd
add dstq, 4
add dstq, %2
and indexd, phase_maskd
sar index_incrd, phase_shiftb
lea srcq, [srcq+index_incrq*4]
lea srcq, [srcq+index_incrq*%2]
cmp dstq, dst_endq
jne .loop
@@ -228,7 +255,7 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
mov [ctxq+ResampleContext.frac ], fracd
sub rax, src_stackq
mov [ctxq+ResampleContext.index], indexd
shr rax, 2
shr rax, %3
.skip_store:
%if ARCH_X86_32
@@ -236,13 +263,24 @@ cglobal resample_common_float, 1, 7, 2, ctx, phase_shift, dst, frac, \
%endif
RET
; int resample_linear_float(ResampleContext *ctx, float *dst,
; const float *src, int size, int update_ctx)
; int resample_linear_$format(ResampleContext *ctx, float *dst,
; const float *src, int size, int update_ctx)
%if ARCH_X86_64 ; unix64 and win64
cglobal resample_linear_float, 0, 15, 5, ctx, dst, src, phase_shift, index, frac, \
dst_incr_mod, size, min_filter_count_x4, \
min_filter_len_x4, dst_incr_div, src_incr, \
phase_mask, dst_end, filter_bank
%if UNIX64
cglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_shift, index, frac, \
size, dst_incr_mod, min_filter_count_x4, \
min_filter_len_x4, dst_incr_div, src_incr, \
src, dst_end, filter_bank
mov srcq, r2mp
%else ; win64
cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_shift, index, frac, \
size, dst_incr_mod, min_filter_count_x4, \
min_filter_len_x4, dst_incr_div, src_incr, \
dst, dst_end, filter_bank
mov dstq, r1mp
%endif
; use red-zone for variable storage
%define ctx_stackq [rsp-0x8]
@@ -269,27 +307,31 @@ cglobal resample_linear_float, 0, 15, 5, ctx, dst, src, phase_shift, index, frac
mov ctx_stackq, ctxq
mov phase_mask_stackd, phase_maskd
mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
%ifidn %1, float
cvtsi2ss xm0, src_incrd
movss xm4, [pf_1]
divss xm4, xm0
%else ; int16
movd m4, [pd_0x4000]
%endif
mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
shl min_filter_len_x4d, 2
lea dst_endq, [dstq+sizeq*4]
shl min_filter_len_x4d, %3
lea dst_endq, [dstq+sizeq*%2]
%if UNIX64
mov ecx, [ctxq+ResampleContext.phase_shift]
mov edi, [ctxq+ResampleContext.filter_alloc]
DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
filter1, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, filter2, dst_end, filter_bank
DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, filter1, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
mov R9d, [ctxq+ResampleContext.filter_alloc]
mov ecx, [ctxq+ResampleContext.phase_shift]
DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
filter1, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, filter2, dst_end, filter_bank
DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, filter1, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, dst, dst_end, filter_bank
%endif
neg min_filter_len_x4q
@@ -297,8 +339,8 @@ cglobal resample_linear_float, 0, 15, 5, ctx, dst, src, phase_shift, index, frac
sub srcq, min_filter_len_x4q
mov src_stackq, srcq
%else ; x86-32
cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
index, min_filter_length_x4, filter_bank
cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
frac, index, dst, filter_bank
; push temp variables to stack
%define ctx_stackq r0mp
@@ -307,23 +349,27 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
mov dstq, r1mp
mov r3, r3mp
lea r3, [dstq+r3*4]
lea r3, [dstq+r3*%2]
PUSH dword [ctxq+ResampleContext.dst_incr_div]
PUSH r3
mov r3, dword [ctxq+ResampleContext.filter_alloc]
PUSH dword [ctxq+ResampleContext.dst_incr_mod]
PUSH r3
shl r3, 2
shl r3, %3
PUSH r3
mov r3, dword [ctxq+ResampleContext.src_incr]
PUSH dword [ctxq+ResampleContext.phase_mask]
PUSH r3d
%ifidn %1, float
cvtsi2ss xm0, r3d
movss xm4, [pf_1]
divss xm4, xm0
%else ; int16
movd m4, [pd_0x4000]
%endif
mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
mov indexd, [ctxq+ResampleContext.index]
shl min_filter_length_x4d, 2
shl min_filter_length_x4d, %3
mov fracd, [ctxq+ResampleContext.frac]
neg min_filter_length_x4q
mov filter_bankq, [ctxq+ResampleContext.filter_bank]
@@ -333,7 +379,7 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
PUSH filter_bankq
PUSH dword [ctxq+ResampleContext.phase_shift]
DEFINE_ARGS src, filter1, dst, frac, index, min_filter_count_x4, filter2
DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src
%define phase_shift_stackd dword [rsp+0x0]
%define filter_bankq dword [rsp+0x4]
@@ -354,25 +400,37 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
imul filter1d, indexd
%if ARCH_X86_64
mov min_filter_count_x4q, min_filter_len_x4q
lea filter1q, [filter_bankq+filter1q*4]
lea filter2q, [filter1q+filter_allocq*4]
lea filter1q, [filter_bankq+filter1q*%2]
lea filter2q, [filter1q+filter_allocq*%2]
%else ; x86-32
mov min_filter_count_x4q, filter_bankq
lea filter1q, [min_filter_count_x4q+filter1q*4]
lea filter1q, [min_filter_count_x4q+filter1q*%2]
mov min_filter_count_x4q, min_filter_length_x4q
mov filter2q, filter1q
add filter2q, filter_alloc_x4q
%endif
%ifidn %1, float
xorps m0, m0, m0
xorps m2, m2, m2
%else ; int16
mova m0, m4
mova m2, m4
%endif
align 16
.inner_loop:
movups m1, [srcq+min_filter_count_x4q*1]
movu m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, float
mulps m3, m1, [filter2q+min_filter_count_x4q*1]
mulps m1, m1, [filter1q+min_filter_count_x4q*1]
addps m2, m2, m3
addps m0, m0, m1
%else ; int16
pmaddwd m3, m1, [filter2q+min_filter_count_x4q*1]
pmaddwd m1, [filter1q+min_filter_count_x4q*1]
paddd m2, m3
paddd m0, m1
%endif
add min_filter_count_x4q, mmsize
js .inner_loop
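Unlike the common loop, the linear inner loop keeps two accumulators and reuses one source load for both filter phases (filter2 sits filter_alloc elements past filter1). A scalar C equivalent with illustrative names:

```c
#include <stdint.h>

/* One pass over the source window accumulating both filter phases,
 * as the paired pmaddwd/mulps on filter1q and filter2q do. */
void dual_dot_int16(const int16_t *src, const int16_t *filter1,
                    const int16_t *filter2, int filter_length,
                    int32_t *val, int32_t *v2)
{
    int32_t a = 0, b = 0;
    for (int i = 0; i < filter_length; i++) {
        a += (int32_t)src[i] * filter1[i];
        b += (int32_t)src[i] * filter2[i];
    }
    *val = a;
    *v2  = b;
}
```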
@@ -383,6 +441,7 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
addps xm2, xm3
%endif
%ifidn %1, float
; val += (v2 - val) * (FELEML) frac / c->src_incr;
cvtsi2ss xm1, fracd
subps xm2, xm0
@@ -399,21 +458,55 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
addps xm0, xm1
add indexd, dst_incr_divd
movss [dstq], xm0
%else ; int16
%if mmsize == 16
pshufd m3, m2, q0032
pshufd m1, m0, q0032
paddd m2, m3
paddd m0, m1
pshufd m3, m2, q0001
pshufd m1, m0, q0001
%else ; mmsize == 8
pshufw m3, m2, q0032
pshufw m1, m0, q0032
%endif
paddd m2, m3
paddd m0, m1
psubd m2, m0
; This is probably a really bad idea on atom and other machines with a
; long transfer latency between GPRs and XMMs (atom). However, it does
; make the clip a lot simpler...
movd eax, m2
add indexd, dst_incr_divd
imul fracd
idiv src_incrd
movd m1, eax
add fracd, dst_incr_modd
paddd m0, m1
psrad m0, 15
packssdw m0, m0
movd [dstq], m0
; note that for imul/idiv, I need to move filter to edx/eax for each:
; - 32bit: eax=r0[filter1], edx=r2[filter2]
; - win64: eax=r6[filter1], edx=r1[todo]
; - unix64: eax=r6[filter1], edx=r2[todo]
%endif
cmp fracd, src_incrd
jl .skip
sub fracd, src_incrd
inc indexd
%if UNIX64
DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, filter2, dst_end, filter_bank
DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, filter2, dst_end, filter_bank
DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, dst, dst_end, filter_bank
%else ; x86-32
DEFINE_ARGS src, phase_shift, dst, frac, index, index_incr
DEFINE_ARGS filter1, phase_shift, index_incr, frac, index, dst, src
%endif
.skip:
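The linear variant then blends the two filtered values by the fractional position, as the commented formula above states: the float path multiplies by a precomputed 1.0/src_incr, while the int16 path moves (v2 - val) into a GPR for a widening imul and an idiv before adding the correction back, ahead of the final shift and saturation (the "clip is simpler" comment). A rough scalar sketch of both blends, with illustrative names:

```c
#include <stdint.h>

/* Float blend: val += (v2 - val) * frac / src_incr
 * (the asm multiplies by a precomputed 1.0f / src_incr instead of dividing). */
float blend_float(float val, float v2, int frac, int src_incr)
{
    return val + (v2 - val) * ((float)frac / src_incr);
}

/* Int16 blend before the final >> 15 and saturation: the (v2 - val) * frac
 * product needs 64 bits, hence the imul/idiv round trip through eax/edx. */
int32_t blend_int16_acc(int32_t val, int32_t v2, int frac, int src_incr)
{
    return val + (int32_t)(((int64_t)(v2 - val) * frac) / src_incr);
}
```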
@@ -421,17 +514,23 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
mov phase_shiftd, phase_shift_stackd
%endif
mov index_incrd, indexd
add dstq, 4
add dstq, %2
and indexd, phase_mask_stackd
sar index_incrd, phase_shiftb
lea srcq, [srcq+index_incrq*4]
lea srcq, [srcq+index_incrq*%2]
cmp dstq, dst_endq
jne .loop
%if ARCH_X86_64
DEFINE_ARGS ctx, dst, src, phase_shift, index, frac
%if UNIX64
DEFINE_ARGS ctx, dst, filter2, phase_shift, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
DEFINE_ARGS ctx, filter2, src, phase_shift, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, dst, dst_end, filter_bank
%else ; x86-32
DEFINE_ARGS src, ctx, update_context, frac, index
DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src
%endif
cmp dword update_context_stackd, 0
@@ -444,7 +543,7 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
mov [ctxq+ResampleContext.frac ], fracd
sub rax, src_stackq
mov [ctxq+ResampleContext.index], indexd
shr rax, 2
shr rax, %3
.skip_store:
%if ARCH_X86_32
@@ -454,9 +553,17 @@ cglobal resample_linear_float, 1, 7, 5, ctx, filter1, dst, frac, \
%endmacro
INIT_XMM sse
RESAMPLE_FLOAT_FNS
RESAMPLE_FNS float, 4, 2
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
RESAMPLE_FLOAT_FNS
RESAMPLE_FNS float, 4, 2
%endif
%if ARCH_X86_32
INIT_MMX mmxext
RESAMPLE_FNS int16, 2, 1
%endif
INIT_XMM sse2
RESAMPLE_FNS int16, 2, 1
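The macro is instantiated with the sample size in bytes and its log2 (float: 4, 2; int16: 2, 1), so addressing that previously hard-coded *4 and shifts by 2 now scales by %2 and %3. A trivial C illustration of the equivalence those two parameters rely on (names are illustrative):

```c
#include <stddef.h>
#include <stdint.h>

/* dst_end = dst + size * bps, computed both ways; log2_bps must be the
 * base-2 log of bps, mirroring the RESAMPLE_FNS arguments. */
uint8_t *dst_end_mul(uint8_t *dst, int size, int bps)
{
    return dst + (size_t)size * bps;
}

uint8_t *dst_end_shift(uint8_t *dst, int size, int log2_bps)
{
    return dst + ((size_t)size << log2_bps);
}
```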
@@ -22,116 +22,6 @@
#include "libavutil/cpu.h"
#include "libswresample/swresample_internal.h"
DECLARE_ALIGNED(16, const uint64_t, ff_resample_int16_rounder)[2] = { 0x0000000000004000ULL, 0x0000000000000000ULL};
#define COMMON_CORE_INT16_MMX2 \
x86_reg len= -2*c->filter_length;\
__asm__ volatile(\
"movq "MANGLE(ff_resample_int16_rounder)", %%mm0 \n\t"\
"1: \n\t"\
"movq (%1, %0), %%mm1 \n\t"\
"pmaddwd (%2, %0), %%mm1 \n\t"\
"paddd %%mm1, %%mm0 \n\t"\
"add $8, %0 \n\t"\
" js 1b \n\t"\
"pshufw $0x0E, %%mm0, %%mm1 \n\t"\
"paddd %%mm1, %%mm0 \n\t"\
"psrad $15, %%mm0 \n\t"\
"packssdw %%mm0, %%mm0 \n\t"\
"movd %%mm0, (%3) \n\t"\
: "+r" (len)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (dst+dst_index)\
NAMED_CONSTRAINTS_ARRAY_ADD(ff_resample_int16_rounder)\
);
#define LINEAR_CORE_INT16_MMX2 \
x86_reg len= -2*c->filter_length;\
__asm__ volatile(\
"pxor %%mm0, %%mm0 \n\t"\
"pxor %%mm2, %%mm2 \n\t"\
"1: \n\t"\
"movq (%3, %0), %%mm1 \n\t"\
"movq %%mm1, %%mm3 \n\t"\
"pmaddwd (%4, %0), %%mm1 \n\t"\
"pmaddwd (%5, %0), %%mm3 \n\t"\
"paddd %%mm1, %%mm0 \n\t"\
"paddd %%mm3, %%mm2 \n\t"\
"add $8, %0 \n\t"\
" js 1b \n\t"\
"pshufw $0x0E, %%mm0, %%mm1 \n\t"\
"pshufw $0x0E, %%mm2, %%mm3 \n\t"\
"paddd %%mm1, %%mm0 \n\t"\
"paddd %%mm3, %%mm2 \n\t"\
"movd %%mm0, %1 \n\t"\
"movd %%mm2, %2 \n\t"\
: "+r" (len),\
"=r" (val),\
"=r" (v2)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (((uint8_t*)(filter+c->filter_alloc))-len)\
);
#define COMMON_CORE_INT16_SSE2 \
x86_reg len= -2*c->filter_length;\
__asm__ volatile(\
"movdqa "MANGLE(ff_resample_int16_rounder)", %%xmm0 \n\t"\
"1: \n\t"\
"movdqu (%1, %0), %%xmm1 \n\t"\
"pmaddwd (%2, %0), %%xmm1 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"add $16, %0 \n\t"\
" js 1b \n\t"\
"pshufd $0x0E, %%xmm0, %%xmm1 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"pshufd $0x01, %%xmm0, %%xmm1 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"psrad $15, %%xmm0 \n\t"\
"packssdw %%xmm0, %%xmm0 \n\t"\
"movd %%xmm0, (%3) \n\t"\
: "+r" (len)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (dst+dst_index)\
NAMED_CONSTRAINTS_ARRAY_ADD(ff_resample_int16_rounder)\
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1")\
);
#define LINEAR_CORE_INT16_SSE2 \
x86_reg len= -2*c->filter_length;\
__asm__ volatile(\
"pxor %%xmm0, %%xmm0 \n\t"\
"pxor %%xmm2, %%xmm2 \n\t"\
"1: \n\t"\
"movdqu (%3, %0), %%xmm1 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"pmaddwd (%4, %0), %%xmm1 \n\t"\
"pmaddwd (%5, %0), %%xmm3 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"paddd %%xmm3, %%xmm2 \n\t"\
"add $16, %0 \n\t"\
" js 1b \n\t"\
"pshufd $0x0E, %%xmm0, %%xmm1 \n\t"\
"pshufd $0x0E, %%xmm2, %%xmm3 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"paddd %%xmm3, %%xmm2 \n\t"\
"pshufd $0x01, %%xmm0, %%xmm1 \n\t"\
"pshufd $0x01, %%xmm2, %%xmm3 \n\t"\
"paddd %%xmm1, %%xmm0 \n\t"\
"paddd %%xmm3, %%xmm2 \n\t"\
"movd %%xmm0, %1 \n\t"\
"movd %%xmm2, %2 \n\t"\
: "+r" (len),\
"=r" (val),\
"=r" (v2)\
: "r" (((uint8_t*)(src+sample_index))-len),\
"r" (((uint8_t*)filter)-len),\
"r" (((uint8_t*)(filter+c->filter_alloc))-len)\
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
);
#define COMMON_CORE_DBL_SSE2 \
x86_reg len= -8*c->filter_length;\
__asm__ volatile(\
@@ -27,34 +27,14 @@
#include "libswresample/resample.h"
int swri_resample_common_int16_mmx2 (ResampleContext *c, int16_t *dst, const int16_t *src, int n, int update_ctx);
int swri_resample_linear_int16_mmx2 (ResampleContext *c, int16_t *dst, const int16_t *src, int n, int update_ctx);
int swri_resample_common_int16_sse2 (ResampleContext *c, int16_t *dst, const int16_t *src, int n, int update_ctx);
int swri_resample_linear_int16_sse2 (ResampleContext *c, int16_t *dst, const int16_t *src, int n, int update_ctx);
int swri_resample_common_float_sse (ResampleContext *c, float *dst, const float *src, int n, int update_ctx);
int swri_resample_linear_float_sse (ResampleContext *c, float *dst, const float *src, int n, int update_ctx);
int swri_resample_common_float_avx (ResampleContext *c, float *dst, const float *src, int n, int update_ctx);
int swri_resample_linear_float_avx (ResampleContext *c, float *dst, const float *src, int n, int update_ctx);
int swri_resample_common_double_sse2(ResampleContext *c, double *dst, const double *src, int n, int update_ctx);
int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const double *src, int n, int update_ctx);
#if HAVE_MMXEXT_INLINE
#if HAVE_SSE2_INLINE
#define DO_RESAMPLE_ONE 0
#include "resample_mmx.h"
#if ARCH_X86_32
#define TEMPLATE_RESAMPLE_S16_MMX2
#include "libswresample/resample_template.c"
#undef TEMPLATE_RESAMPLE_S16_MMX2
#endif
#if HAVE_SSE2_INLINE
#define TEMPLATE_RESAMPLE_S16_SSE2
#include "libswresample/resample_template.c"
#undef TEMPLATE_RESAMPLE_S16_SSE2
#define TEMPLATE_RESAMPLE_DBL_SSE2
#include "libswresample/resample_template.c"
#undef TEMPLATE_RESAMPLE_DBL_SSE2
@@ -62,7 +42,15 @@ int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const do
#undef DO_RESAMPLE_ONE
#endif // HAVE_MMXEXT_INLINE
int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_common_int16_sse2(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_linear_int16_sse2(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_common_float_sse(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
@@ -79,17 +67,19 @@ void swresample_dsp_x86_init(ResampleContext *c)
int av_unused mm_flags = av_get_cpu_flags();
#define FNIDX(fmt) (AV_SAMPLE_FMT_##fmt - AV_SAMPLE_FMT_S16P)
if (ARCH_X86_32 && HAVE_MMXEXT_INLINE && mm_flags & AV_CPU_FLAG_MMX2) {
c->dsp.resample_common[FNIDX(S16P)] = (resample_fn) swri_resample_common_int16_mmx2;
c->dsp.resample_linear[FNIDX(S16P)] = (resample_fn) swri_resample_linear_int16_mmx2;
if (ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL && mm_flags & AV_CPU_FLAG_MMX2) {
c->dsp.resample_common[FNIDX(S16P)] = ff_resample_common_int16_mmxext;
c->dsp.resample_linear[FNIDX(S16P)] = ff_resample_linear_int16_mmxext;
}
if (HAVE_SSE_EXTERNAL && mm_flags & AV_CPU_FLAG_SSE) {
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_sse;
c->dsp.resample_linear[FNIDX(FLTP)] = ff_resample_linear_float_sse;
}
if (HAVE_SSE2_EXTERNAL && mm_flags & AV_CPU_FLAG_SSE2) {
c->dsp.resample_common[FNIDX(S16P)] = ff_resample_common_int16_sse2;
c->dsp.resample_linear[FNIDX(S16P)] = ff_resample_linear_int16_sse2;
}
if (HAVE_SSE2_INLINE && mm_flags & AV_CPU_FLAG_SSE2) {
c->dsp.resample_common[FNIDX(S16P)] = (resample_fn) swri_resample_common_int16_sse2;
c->dsp.resample_linear[FNIDX(S16P)] = (resample_fn) swri_resample_linear_int16_sse2;
c->dsp.resample_common[FNIDX(DBLP)] = (resample_fn) swri_resample_common_double_sse2;
c->dsp.resample_linear[FNIDX(DBLP)] = (resample_fn) swri_resample_linear_double_sse2;
}
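With the yasm functions registered above, callers reach them through the same function-pointer table. A hypothetical call site for planar int16, relying on the ResampleContext, FNIDX and prototypes declared earlier in this file (the casts match the uint8_t * signatures); this is illustrative, not the library's actual invocation code:

```c
/* Hypothetical call site; FNIDX and the dsp table are as defined above.
 * The last argument requests a context update, per the function comments. */
static int run_s16_resample(ResampleContext *c, int16_t *dst,
                            const int16_t *src, int n)
{
    return c->dsp.resample_common[FNIDX(S16P)](c, (uint8_t *)dst,
                                               (const uint8_t *)src, n, 1);
}
```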