1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-23 12:43:46 +02:00
FFmpeg/libavcodec/x86/mlpdsp.asm
James Almer 0de1d6287e x86/mlpdec: add ff_mlp_rematrix_channel_{sse4,avx2}
2x to 2.5x faster than the C version.

Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
2014-10-02 22:11:55 -03:00

197 lines
7.0 KiB
NASM

;******************************************************************************
;* SIMD-optimized MLP DSP functions
;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_TEXT
%if ARCH_X86_64
; SHLX reg, count
;   %1 = full register name that is shifted left in place
;   %2 = register name stem (no size suffix) holding the shift count
; Without BMI2 the count is taken from the byte form of %2, which the shl
; encoding requires to be cl; with BMI2 any 64-bit register may hold the count.
%macro SHLX 2
%if notcpuflag(bmi2)
    shl           %1, %2b
%else
    shlx          %1, %1, %2q
%endif
%endmacro
; REMATRIX
;   Multiplies the eight 32-bit values at [samplesq] by the eight 32-bit
;   values at [coeffsq] (signed, 64-bit products) and leaves two 64-bit
;   partial sums in xm0. Both pointers are read with aligned loads.
;   Clobbers m0-m3, plus m4 in the non-AVX2 build.
%macro REMATRIX 0
    movdqa        m0, [samplesq]
    movdqa        m1, [coeffsq]
    pshufd        m2, m0, q2301         ; swap dwords within each pair
    pshufd        m3, m1, q2301
    pmuldq        m0, m1                ; products of the even dwords
    pmuldq        m3, m2                ; products of the odd dwords
    paddq         m0, m3
%if cpuflag(avx2)
    ; m0 already covers all 32 bytes; add its upper 128 bits to the lower.
    vextracti128 xm1, m0, 1
    paddq        xm0, xm1
%else
    ; 128-bit registers: repeat the multiply for the second 16 bytes.
    movdqa        m1, [samplesq + 16]
    movdqa        m2, [coeffsq  + 16]
    pshufd        m3, m1, q2301
    pshufd        m4, m2, q2301
    pmuldq        m1, m2
    pmuldq        m4, m3
    paddq         m0, m1
    paddq         m0, m4
%endif
%endmacro
; LOOP_END
;   Tail of one iteration when no noise is added: folds the two 64-bit
;   partial sums in xm0, stores the result at [samplesq + dest_chq] (dest_chq
;   is used as a byte offset), advances both row pointers and compares
;   blsbs_ptrq with cntq. The invoking code branches on the resulting flags,
;   so the cmp must stay the last instruction.
;   Clobbers xm1, accum and blsbs.
%macro LOOP_END 0
    pshufd       xm1, xm0, q0032                    ; bring the high qword down
    paddq        xm0, xm1                           ; low qword = full sum
    movq      accumq, xm0
    sar       accumq, 14                            ; accum >>= 14
    movzx     blsbsd, byte [blsbs_ptrq]             ; blsbs = *bypassed_lsbs
    and       accumd, maskd                         ; accum &= mask
    add       accumd, blsbsd                        ; accum += *bypassed_lsbs
    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
    add     samplesq, 32                            ; samples += MAX_CHANNELS;
    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
    cmp   blsbs_ptrq, cntq                          ; end pointer reached?
%endmacro
; LOOP_SHIFT_END
;   Tail of one iteration when noise is added: folds the two 64-bit partial
;   sums in xm0, adds the sign-extended noise_buffer[index] shifted left by
;   the count held in mns, stores the result at [samplesq + dest_chq],
;   advances index and both row pointers, then compares blsbs_ptrq with cntq.
;   The invoking code branches on the resulting flags, so the cmp must stay
;   the last instruction.
;   Clobbers xm1, accum, noise and index.
%macro LOOP_SHIFT_END 0
    pshufd       xm1, xm0, q0032                    ; bring the high qword down
    paddq        xm0, xm1                           ; low qword = full sum
    movq      accumq, xm0
    and       indexd, auspd                         ; index &= access_unit_size_pow2;
    movsx     noiseq, byte [noise_bufferq + indexq] ; noise = noise_buffer[index] (signed)
    add       indexd, index2d                       ; index += index2
    SHLX      noiseq, mns                           ; noise <<= shift count in mns
    add       accumq, noiseq                        ; accum += noise
    sar       accumq, 14                            ; accum >>= 14
    ; The noise value has been consumed, so its register doubles as the
    ; temporary for *bypassed_lsbs.
    movzx     noised, byte [blsbs_ptrq]
    and       accumd, maskd                         ; accum &= mask
    add       accumd, noised                        ; accum += *bypassed_lsbs
    mov   [samplesq + dest_chq], accumd             ; samples[dest_ch] = accum
    add     samplesq, 32                            ; samples += MAX_CHANNELS;
    add   blsbs_ptrq, 8                             ; bypassed_lsbs += MAX_CHANNELS;
    cmp   blsbs_ptrq, cntq                          ; end pointer reached?
%endmacro
;-----------------------------------------------------------------------------
; void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
;                              const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
;                              int index, unsigned int dest_ch, uint16_t blockpos,
;                              unsigned int maxchan, int matrix_noise_shift,
;                              int access_unit_size_pow2, int32_t mask)
;
; Runs blockpos iterations. Each iteration:
;   accum  = sum of samples[i] * coeffs[i] as signed 64-bit products, over the
;            first 4 entries when maxchan < 4 and over 8 entries otherwise
;   if matrix_noise_shift != 0:
;       index &= access_unit_size_pow2 - 1
;       accum += noise_buffer[index] << (matrix_noise_shift + 7)
;       index += 2 * (initial index) + 1
;   samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs
;   samples += 8 (32 bytes); bypassed_lsbs += 8
;
; Requirements visible in the code:
;   - samples and coeffs are read with aligned vector loads (16 bytes, or
;     32 bytes in the 8-entry loop of the ymm build).
;   - The vector loops always read a full group of 4 or 8 entries regardless
;     of the exact maxchan value.
;     NOTE(review): presumably the caller zero-pads coeffs beyond maxchan;
;     confirm against the C-side init code.
;   - blockpos == 0 returns immediately without touching memory.
;
; x86-64 only (UNIX64 and WIN64 argument layouts are both handled).
;-----------------------------------------------------------------------------
%macro MLP_REMATRIX_CHANNEL 0
cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
                                        index, dest_ch, blockpos, maxchan, mns, \
                                        accum, mask, cnt
    mov         mnsd, mnsm                          ; load matrix_noise_shift
    movzx  blockposq, word blockposm                ; load and zero extend blockpos (16bit)
    ; Every loop below runs its body before comparing blsbs_ptr against cnt,
    ; and exits only on equality. With blockpos == 0 the end pointer equals
    ; the start pointer, so the first increment would step past it and the
    ; loop would keep writing out of bounds. Leave early instead.
    test   blockposd, blockposd
    jz .done
    mov     maxchand, maxchanm                      ; load maxchan
    mov        maskd, maskm                         ; load mask
%if WIN64
    mov     dest_chd, dest_chm                      ; load dest_chd (not needed on UNIX64)
%endif
    shl     dest_chd, 2                             ; dest_ch *= sizeof(int32_t) -> byte offset
    lea         cntq, [blsbs_ptrq + blockposq*8]    ; cnt = end value of bypassed_lsbs
    test        mnsd, mnsd                          ; is matrix_noise_shift != 0?
    jne .shift                                      ; jump if true
    cmp     maxchand, 4                             ; is maxchan < 4?
    jb .loop4                                       ; jump if true (maxchan is unsigned)

align 16
.loop8:
    ; Process 5 or more channels
    REMATRIX
    LOOP_END
    jne .loop8
.done:
    RET

align 16
.loop4:
    ; Process up to 4 channels
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301
    pshufd       xm3, xm1, q2301
    pmuldq       xm0, xm1
    pmuldq       xm3, xm2
    paddq        xm0, xm3
    LOOP_END
    jne .loop4
    RET

.shift:
%if WIN64
    mov       indexd, indexm                        ; load index (not needed on UNIX64)
%endif
    mov          r9d, r9m                           ; load access_unit_size_pow2
%if cpuflag(bmi2)
    ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
    DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
                index, dest_ch, accum, index2, mns, \
                ausp, mask, cnt, noise
    add         mnsd, 7                             ; matrix_noise_shift += 7
%else ; sse4
    ; r6 held blockpos, which is dead once cnt has been computed.
    mov           r6, rcx                           ; move rcx elsewhere so we can use cl for matrix_noise_shift
%if WIN64
    ; r0 = rcx
    DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
                index2, accum, ausp, mask, cnt, noise
%else ; UNIX64
    ; r3 = rcx
    DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
                index2, accum, ausp, mask, cnt, noise
%endif
    lea         mnsd, [r8 + 7]                      ; rcx = matrix_noise_shift + 7
%endif ; cpuflag
    sub        auspd, 1                             ; access_unit_size_pow2 -= 1
    ; r7 still contains maxchan here; it has to be tested before the lea
    ; below reuses r7 as index2. lea does not modify flags, so the branch
    ; after it still sees the result of this cmp.
    cmp          r7d, 4                             ; is maxchan < 4?
    lea      index2q, [indexq*2 + 1]                ; index2 = 2 * index + 1;
    jb .loop4_shift                                 ; jump if maxchan < 4 (unsigned)

align 16
.loop8_shift:
    ; Process 5 or more channels
    REMATRIX
    LOOP_SHIFT_END
    jne .loop8_shift
    RET

align 16
.loop4_shift:
    ; Process up to 4 channels
    movdqa       xm0, [samplesq]
    movdqa       xm1, [coeffsq ]
    pshufd       xm2, xm0, q2301
    pshufd       xm3, xm1, q2301
    pmuldq       xm0, xm1
    pmuldq       xm3, xm2
    paddq        xm0, xm3
    LOOP_SHIFT_END
    jne .loop4_shift
    RET
%endmacro
; SSE4 build: 128-bit registers; the noise shift uses shl with the count in cl.
INIT_XMM sse4
MLP_REMATRIX_CHANNEL

; AVX2 + BMI2 build: 256-bit registers in the 8-entry loops and shlx for the
; noise shift. Only assembled when HAVE_AVX2_EXTERNAL is set
; (NOTE(review): presumably a configure-time assembler capability flag).
%if HAVE_AVX2_EXTERNAL
    INIT_YMM avx2, bmi2
    MLP_REMATRIX_CHANNEL
%endif
%endif ; ARCH_X86_64