mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
Merge commit '2e4bb99f4df7052b3e147ee898fcb4013a34d904'
* commit '2e4bb99f4df7052b3e147ee898fcb4013a34d904': vorbisdsp: convert x86 simd functions from inline asm to yasm. Merged-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
commit
b90ab2b993
@ -72,6 +72,7 @@ YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \
|
|||||||
YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
|
YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
|
||||||
YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
|
YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
|
||||||
YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
|
YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
|
||||||
|
YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
|
||||||
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
|
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
|
||||||
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o
|
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o
|
||||||
YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o
|
YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o
|
||||||
|
@ -39,9 +39,6 @@
|
|||||||
DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
|
DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
|
||||||
DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
|
DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
|
||||||
|
|
||||||
DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
|
|
||||||
{ 0x8000000080000000ULL, 0x8000000080000000ULL };
|
|
||||||
|
|
||||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
|
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
|
||||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
|
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
|
||||||
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
|
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
|
||||||
|
@ -31,8 +31,6 @@ typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
|
|||||||
extern const uint64_t ff_bone;
|
extern const uint64_t ff_bone;
|
||||||
extern const uint64_t ff_wtwo;
|
extern const uint64_t ff_wtwo;
|
||||||
|
|
||||||
extern const uint64_t ff_pdw_80000000[2];
|
|
||||||
|
|
||||||
extern const xmm_reg ff_pw_3;
|
extern const xmm_reg ff_pw_3;
|
||||||
extern const xmm_reg ff_pw_4;
|
extern const xmm_reg ff_pw_4;
|
||||||
extern const xmm_reg ff_pw_5;
|
extern const xmm_reg ff_pw_5;
|
||||||
|
83
libavcodec/x86/vorbisdsp.asm
Normal file
83
libavcodec/x86/vorbisdsp.asm
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
;******************************************************************************
|
||||||
|
;* Vorbis x86 optimizations
|
||||||
|
;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
|
||||||
|
;*
|
||||||
|
;* This file is part of FFmpeg.
|
||||||
|
;*
|
||||||
|
;* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
;* modify it under the terms of the GNU Lesser General Public
|
||||||
|
;* License as published by the Free Software Foundation; either
|
||||||
|
;* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
;*
|
||||||
|
;* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
;* Lesser General Public License for more details.
|
||||||
|
;*
|
||||||
|
;* You should have received a copy of the GNU Lesser General Public
|
||||||
|
;* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
;******************************************************************************
|
||||||
|
|
||||||
|
%include "libavutil/x86/x86util.asm"
|
||||||
|
|
||||||
|
SECTION_RODATA
|
||||||
|
|
||||||
|
pdw_80000000: times 4 dd 0x80000000
|
||||||
|
|
||||||
|
SECTION .text
|
||||||
|
|
||||||
|
%if ARCH_X86_32
|
||||||
|
INIT_MMX 3dnow
|
||||||
|
; vorbis_inverse_coupling (3DNow! variant), arguments: mag, ang, block_size.
;
; Rewrites the float arrays mag[] and ang[] in place, two 4-byte elements per
; loop iteration. Per element, with m = mag[i], a = ang[i] and
; a' = (m >= 0.0) ? a ^ 0x80000000 : a   (i.e. a with its sign bit flipped
; when m is non-negative):
;     ang[i] = m + ((a >= 0.0) ? a' : 0)
;     mag[i] = m - ((a >= 0.0) ? 0  : a')
;
; The loop body executes before the exit test, so it runs at least once.
; NOTE(review): this presumably requires block_size to be a positive
; multiple of 2 -- confirm against the callers.
; NOTE(review): the numeric cglobal arguments follow x86inc.asm conventions
; (presumably argument count, GPR count, vector register count); m7 is used
; below although the count given is 6 -- presumably harmless for MMX
; registers, confirm against x86inc.asm.
cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
|
||||||
|
; m7 = 0, used as the 0.0 comparand for both pfcmpge below
pxor m7, m7
|
||||||
|
; point mag/ang at the end of the arrays and negate the count, so that
; block_sizeq is a negative index that counts up towards zero
lea magq, [magq+block_sizeq*4]
|
||||||
|
lea angq, [angq+block_sizeq*4]
|
||||||
|
neg block_sizeq
|
||||||
|
.loop:
|
||||||
|
mova m0, [magq+block_sizeq*4]
|
||||||
|
mova m1, [angq+block_sizeq*4]
|
||||||
|
mova m2, m0
|
||||||
|
mova m3, m1
|
||||||
|
pfcmpge m2, m7 ; m2 = all-ones mask where m >= 0.0
|
||||||
|
pfcmpge m3, m7 ; m3 = all-ones mask where a >= 0.0
|
||||||
|
pslld m2, 31 ; keep only the sign bit: 0x80000000 where m >= 0.0, else 0
|
||||||
|
pxor m1, m2 ; m1 = a' (a with its sign flipped where m >= 0.0)
|
||||||
|
mova m4, m3
|
||||||
|
||||||
|
pand m3, m1 ; m3 = a' where a >= 0.0, else 0
|
||||||
|
pandn m4, m1 ; m4 = a' where not (a >= 0.0), else 0
|
||||||
|
pfadd m3, m0 ; new a = m + ((a >= 0.0) & a')
|
||||||
|
pfsub m0, m4 ; new m = m - (~(a >= 0.0) & a')
|
||||||
|
mova [angq+block_sizeq*4], m3
|
||||||
|
mova [magq+block_sizeq*4], m0
|
||||||
|
add block_sizeq, 2
|
||||||
|
; add sets the flags: keep looping while the index is still negative
jl .loop
|
||||||
|
; leave MMX/3DNow! state before returning to the caller
femms
|
||||||
|
RET
|
||||||
|
%endif
|
||||||
|
|
||||||
|
INIT_XMM sse
|
||||||
|
; vorbis_inverse_coupling (SSE variant), arguments: mag, ang, block_size;
; cntr is a scratch register used as the element index.
;
; Rewrites the float arrays mag[] and ang[] in place, four 4-byte elements
; per loop iteration. Per element, with m = mag[i], a = ang[i] and
; a' = (0.0 <= m) ? a ^ 0x80000000 : a   (i.e. a with its sign bit flipped
; when m is non-negative):
;     ang[i] = m + ((0.0 <= a) ? a' : 0)
;     mag[i] = m - ((0.0 <= a) ? 0  : a')
;
; The loop body executes before the exit test, so it runs at least once.
; NOTE(review): this presumably requires block_size to be a positive
; multiple of 4, and (if mova expands to an aligned move under INIT_XMM)
; 16-byte aligned mag/ang -- confirm against x86inc.asm and the callers.
cglobal vorbis_inverse_coupling, 3, 4, 6, mag, ang, block_size, cntr
|
||||||
|
; m5 = 0x80000000 in every dword (sign-bit mask, see pdw_80000000)
mova m5, [pdw_80000000]
|
||||||
|
; element index starts at 0
xor cntrq, cntrq
|
||||||
|
align 16
|
||||||
|
.loop:
|
||||||
|
mova m0, [magq+cntrq*4]
|
||||||
|
mova m1, [angq+cntrq*4]
|
||||||
|
; m2 = m3 = 0.0, the left-hand operand of the two compares below
xorps m2, m2
|
||||||
|
xorps m3, m3
|
||||||
|
cmpleps m2, m0 ; m2 = all-ones mask where 0.0 <= m
|
||||||
|
cmpleps m3, m1 ; m3 = all-ones mask where 0.0 <= a
|
||||||
|
andps m2, m5 ; keep only the sign bit: 0x80000000 where 0.0 <= m, else 0
|
||||||
|
xorps m1, m2 ; m1 = a' (a with its sign flipped where 0.0 <= m)
|
||||||
|
mova m4, m3
|
||||||
|
andps m3, m1 ; m3 = a' where 0.0 <= a, else 0
|
||||||
|
andnps m4, m1 ; m4 = a' where not (0.0 <= a), else 0
|
||||||
|
addps m3, m0 ; new a = m + ((0.0 <= a) & a')
|
||||||
|
subps m0, m4 ; new m = m - (~(0.0 <= a) & a')
|
||||||
|
mova [angq+cntrq*4], m3
|
||||||
|
mova [magq+cntrq*4], m0
|
||||||
|
add cntrq, 4
|
||||||
|
; signed compare of the index against block_size; loop while index < block_size
cmp cntrq, block_sizeq
|
||||||
|
jl .loop
|
||||||
|
RET
|
@ -21,83 +21,22 @@
|
|||||||
#include "config.h"
|
#include "config.h"
|
||||||
#include "libavutil/cpu.h"
|
#include "libavutil/cpu.h"
|
||||||
#include "libavcodec/vorbisdsp.h"
|
#include "libavcodec/vorbisdsp.h"
|
||||||
#include "dsputil_mmx.h" // for ff_pdw_80000000
|
|
||||||
|
|
||||||
#if HAVE_INLINE_ASM
|
void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang,
|
||||||
#if ARCH_X86_32
|
intptr_t blocksize);
|
||||||
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang,
|
void ff_vorbis_inverse_coupling_sse(float *mag, float *ang,
|
||||||
intptr_t blocksize)
|
intptr_t blocksize);
|
||||||
{
|
|
||||||
int i;
|
|
||||||
__asm__ volatile ("pxor %%mm7, %%mm7":);
|
|
||||||
for (i = 0; i < blocksize; i += 2) {
|
|
||||||
__asm__ volatile (
|
|
||||||
"movq %0, %%mm0 \n\t"
|
|
||||||
"movq %1, %%mm1 \n\t"
|
|
||||||
"movq %%mm0, %%mm2 \n\t"
|
|
||||||
"movq %%mm1, %%mm3 \n\t"
|
|
||||||
"pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
|
|
||||||
"pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
|
|
||||||
"pslld $31, %%mm2 \n\t" // keep only the sign bit
|
|
||||||
"pxor %%mm2, %%mm1 \n\t"
|
|
||||||
"movq %%mm3, %%mm4 \n\t"
|
|
||||||
"pand %%mm1, %%mm3 \n\t"
|
|
||||||
"pandn %%mm1, %%mm4 \n\t"
|
|
||||||
"pfadd %%mm0, %%mm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
|
|
||||||
"pfsub %%mm4, %%mm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
|
|
||||||
"movq %%mm3, %1 \n\t"
|
|
||||||
"movq %%mm0, %0 \n\t"
|
|
||||||
: "+m"(mag[i]), "+m"(ang[i])
|
|
||||||
:: "memory"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
__asm__ volatile ("femms");
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static void vorbis_inverse_coupling_sse(float *mag, float *ang,
|
|
||||||
intptr_t blocksize)
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
|
|
||||||
__asm__ volatile (
|
|
||||||
"movaps %0, %%xmm5 \n\t"
|
|
||||||
:: "m"(ff_pdw_80000000[0])
|
|
||||||
);
|
|
||||||
for (i = 0; i < blocksize; i += 4) {
|
|
||||||
__asm__ volatile (
|
|
||||||
"movaps %0, %%xmm0 \n\t"
|
|
||||||
"movaps %1, %%xmm1 \n\t"
|
|
||||||
"xorps %%xmm2, %%xmm2 \n\t"
|
|
||||||
"xorps %%xmm3, %%xmm3 \n\t"
|
|
||||||
"cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
|
|
||||||
"cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
|
|
||||||
"andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
|
|
||||||
"xorps %%xmm2, %%xmm1 \n\t"
|
|
||||||
"movaps %%xmm3, %%xmm4 \n\t"
|
|
||||||
"andps %%xmm1, %%xmm3 \n\t"
|
|
||||||
"andnps %%xmm1, %%xmm4 \n\t"
|
|
||||||
"addps %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
|
|
||||||
"subps %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
|
|
||||||
"movaps %%xmm3, %1 \n\t"
|
|
||||||
"movaps %%xmm0, %0 \n\t"
|
|
||||||
: "+m"(mag[i]), "+m"(ang[i])
|
|
||||||
:: "memory"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp)
|
void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp)
|
||||||
{
|
{
|
||||||
#if HAVE_INLINE_ASM
|
#if HAVE_YASM
|
||||||
int mm_flags = av_get_cpu_flags();
|
int mm_flags = av_get_cpu_flags();
|
||||||
|
|
||||||
#if ARCH_X86_32
|
#if ARCH_X86_32
|
||||||
if (mm_flags & AV_CPU_FLAG_3DNOW)
|
if (mm_flags & AV_CPU_FLAG_3DNOW)
|
||||||
dsp->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
|
dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow;
|
||||||
#endif /* ARCH_X86_32 */
|
#endif /* ARCH_X86_32 */
|
||||||
if (mm_flags & AV_CPU_FLAG_SSE)
|
if (mm_flags & AV_CPU_FLAG_SSE)
|
||||||
dsp->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
|
dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse;
|
||||||
#endif /* HAVE_INLINE_ASM */
|
#endif /* HAVE_YASM */
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user