Mirror of https://github.com/FFmpeg/FFmpeg.git (synced 2024-11-26 19:01:44 +02:00)

avcodec: Remove DCT, FFT, MDCT and RDFT

They were replaced by TX from libavutil; the tremendous work to get to this
point (both creating TX as well as porting the users of the components removed
in this commit) was completely performed by Lynne alone.

Removing the subsystems from configure may break some command lines, because
the --disable-fft etc. options are no longer recognized.

Co-authored-by: Lynne <dev@lynne.ee>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in: parent d9464f3e34, commit 6f7bf64dbc

configure: 11 lines changed (vendored)
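For orientation while reading the diff below: the replacement API is libavutil's AVTX (libavutil/tx.h). A minimal, hedged sketch of a forward FFT done through AVTX instead of the removed libavcodec FFT; the buffer names and the fixed length are illustrative, not taken from this commit:

```c
#include <libavutil/mem.h>
#include <libavutil/tx.h>

/* Minimal sketch (not from this commit): a 1024-point forward complex
 * float FFT via libavutil's AVTX, which replaces the removed libavcodec
 * FFT subsystem. */
int run_fft_1024(void)
{
    AVTXContext *tx = NULL;
    av_tx_fn tx_fn  = NULL;
    const int len   = 1024;
    float scale     = 1.0f;              /* keep the old unscaled behaviour */
    AVComplexFloat *in, *out;
    int i, ret;

    ret = av_tx_init(&tx, &tx_fn, AV_TX_FLOAT_FFT, 0 /* forward */, len, &scale, 0);
    if (ret < 0)
        return ret;

    in  = av_calloc(len, sizeof(*in));
    out = av_calloc(len, sizeof(*out));
    if (!in || !out) {
        ret = -1;                        /* allocation failure */
        goto end;
    }

    for (i = 0; i < len; i++)            /* unit impulse as sample input */
        in[i].re = (i == 0);

    /* AVTX works out-of-place by default; the stride is given in bytes. */
    tx_fn(tx, out, in, sizeof(AVComplexFloat));

end:
    av_free(in);
    av_free(out);
    av_tx_uninit(&tx);
    return ret;
}
```

The same av_tx_init()/av_tx_fn pair covers the DCT, MDCT and RDFT variants removed here, selected via the AVTXType argument.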
@@ -136,13 +136,9 @@ Component options:
  --disable-w32threads    disable Win32 threads [autodetect]
  --disable-os2threads    disable OS/2 threads [autodetect]
  --disable-network       disable network support [no]
  --disable-dct           disable DCT code
  --disable-dwt           disable DWT code
  --disable-error-resilience disable error resilience code
  --disable-lsp           disable LSP code
  --disable-mdct          disable MDCT code
  --disable-rdft          disable RDFT code
  --disable-fft           disable FFT code
  --disable-faan          disable floating point AAN (I)DCT code
  --disable-pixelutils    disable pixel utils in libavutil

@@ -2004,17 +2000,13 @@ PROGRAM_LIST="
"

SUBSYSTEM_LIST="
    dct
    dwt
    error_resilience
    faan
    fast_unaligned
    fft
    lsp
    mdct
    pixelutils
    network
    rdft
"

# COMPONENT_LIST needs to come last to ensure correct dependency checking

@@ -2766,7 +2758,6 @@ cbs_h266_select="cbs"
cbs_jpeg_select="cbs"
cbs_mpeg2_select="cbs"
cbs_vp9_select="cbs"
dct_select="rdft"
deflate_wrapper_deps="zlib"
dirac_parse_select="golomb"
dovi_rpu_select="golomb"

@@ -2786,7 +2777,6 @@ frame_thread_encoder_deps="encoders threads"
inflate_wrapper_deps="zlib"
intrax8_select="blockdsp wmv2dsp"
iso_media_select="mpeg4audio"
mdct_select="fft"
me_cmp_select="idctdsp"
mpeg_er_select="error_resilience"
mpegaudio_select="mpegaudiodsp mpegaudioheader"

@@ -2796,7 +2786,6 @@ mpegvideoenc_select="aandcttables fdctdsp me_cmp mpegvideo pixblockdsp"
msmpeg4dec_select="h263_decoder"
msmpeg4enc_select="h263_encoder"
vc1dsp_select="h264chroma qpeldsp startcode"
rdft_select="fft"

# decoders / encoders
aac_decoder_select="adts_header mpeg4audio sinewin"
@@ -48,11 +48,6 @@ Files that have MIPS copyright notice in them:
  float_dsp_mips.c
  libm_mips.h
  softfloat_tables.h
* libavcodec/
  fft_fixed_32.c
  fft_init_table.c
  fft_table.h
  mdct_fixed_32.c
* libavcodec/mips/
  aacdec_fixed.c
  aacsbr_fixed.c

@@ -70,9 +65,6 @@ Files that have MIPS copyright notice in them:
  compute_antialias_float.h
  lsp_mips.h
  dsputil_mips.c
  fft_mips.c
  fft_table.h
  fft_init_table.c
  fmtconvert_mips.c
  iirfilter_mips.c
  mpegaudiodsp_mips_fixed.c
@@ -32,6 +32,7 @@ OBJS = ac3_parser.o \
       allcodecs.o \
       avcodec.o \
       avdct.o \
       avfft.o \
       avpacket.o \
       bitstream.o \
       bitstream_filters.o \

@@ -81,7 +82,6 @@ OBJS-$(CONFIG_CBS_JPEG) += cbs_jpeg.o
OBJS-$(CONFIG_CBS_MPEG2) += cbs_mpeg2.o
OBJS-$(CONFIG_CBS_VP9) += cbs_vp9.o
OBJS-$(CONFIG_CRYSTALHD) += crystalhd.o
OBJS-$(CONFIG_DCT) += dct.o dct32_fixed.o dct32_float.o
OBJS-$(CONFIG_DEFLATE_WRAPPER) += zlib_wrapper.o
OBJS-$(CONFIG_DOVI_RPU) += dovi_rpu.o
OBJS-$(CONFIG_ERROR_RESILIENCE) += error_resilience.o

@@ -90,9 +90,6 @@ OBJS-$(CONFIG_EXIF) += exif.o tiff_common.o
OBJS-$(CONFIG_FAANDCT) += faandct.o
OBJS-$(CONFIG_FAANIDCT) += faanidct.o
OBJS-$(CONFIG_FDCTDSP) += fdctdsp.o jfdctfst.o jfdctint.o
FFT-OBJS-$(CONFIG_HARDCODED_TABLES) += cos_tables.o
OBJS-$(CONFIG_FFT) += avfft.o fft_float.o fft_fixed_32.o \
                      fft_init_table.o $(FFT-OBJS-yes)
OBJS-$(CONFIG_FMTCONVERT) += fmtconvert.o
OBJS-$(CONFIG_GOLOMB) += golomb.o
OBJS-$(CONFIG_H263DSP) += h263dsp.o

@@ -125,7 +122,6 @@ OBJS-$(CONFIG_LLVIDENCDSP) += lossless_videoencdsp.o
OBJS-$(CONFIG_LPC) += lpc.o
OBJS-$(CONFIG_LSP) += lsp.o
OBJS-$(CONFIG_LZF) += lzf.o
OBJS-$(CONFIG_MDCT) += mdct_float.o mdct_fixed_32.o
OBJS-$(CONFIG_ME_CMP) += me_cmp.o
OBJS-$(CONFIG_MEDIACODEC) += mediacodecdec_common.o mediacodec_surface.o mediacodec_wrapper.o mediacodec_sw_buffer.o
OBJS-$(CONFIG_MPEG_ER) += mpeg_er.o

@@ -157,7 +153,6 @@ OBJS-$(CONFIG_QSV) += qsv.o
OBJS-$(CONFIG_QSVDEC) += qsvdec.o
OBJS-$(CONFIG_QSVENC) += qsvenc.o
OBJS-$(CONFIG_RANGECODER) += rangecoder.o
OBJS-$(CONFIG_RDFT) += rdft.o
OBJS-$(CONFIG_RV34DSP) += rv34dsp.o
OBJS-$(CONFIG_SINEWIN) += sinewin.o
OBJS-$(CONFIG_SNAPPY) += snappy.o

@@ -1326,8 +1321,6 @@ TESTPROGS = avcodec \

TESTPROGS-$(CONFIG_AV1_VAAPI_ENCODER) += av1_levels
TESTPROGS-$(CONFIG_CABAC) += cabac
TESTPROGS-$(CONFIG_DCT) += avfft
TESTPROGS-$(CONFIG_FFT) += fft fft-fixed32
TESTPROGS-$(CONFIG_GOLOMB) += golomb
TESTPROGS-$(CONFIG_IDCTDSP) += dct
TESTPROGS-$(CONFIG_IIRFILTER) += iirfilter

@@ -1347,7 +1340,6 @@ HOSTPROGS = aacps_tablegen \
            aacps_fixed_tablegen \
            cbrt_tablegen \
            cbrt_fixed_tablegen \
            cos_tablegen \
            dv_tablegen \
            motionpixels_tablegen \
            mpegaudio_tablegen \

@@ -1362,12 +1354,6 @@ CLEANFILES = *_tables.c *_tables.h *_tablegen$(HOSTEXESUF)
$(SUBDIR)tests/dct$(EXESUF): $(SUBDIR)dctref.o $(SUBDIR)aandcttab.o
$(SUBDIR)dv_tablegen$(HOSTEXESUF): $(SUBDIR)dvdata_host.o

TRIG_TABLES = cos cos_fixed sin
TRIG_TABLES := $(TRIG_TABLES:%=$(SUBDIR)%_tables.c)

$(TRIG_TABLES): $(SUBDIR)%_tables.c: $(SUBDIR)cos_tablegen$(HOSTEXESUF)
	$(M)./$< $* > $@

ifdef CONFIG_SMALL
$(SUBDIR)%_tablegen$(HOSTEXESUF): HOSTCFLAGS += -DCONFIG_SMALL=1
else
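The first hunk above moves avfft.o into the unconditional object list while the CONFIG_FFT object list disappears, which suggests the public avfft.h wrapper stays available (now backed by the TX code) even though the internal subsystem is gone. For reference, a small sketch of that wrapper API; new code is expected to use AVTX directly instead:

```c
#include <libavcodec/avfft.h>

/* Sketch: the public avfft.h wrapper, which this commit keeps building
 * unconditionally (avfft.o). A 1 << 9 = 512-point in-place forward FFT. */
void fft512_inplace(FFTComplex z[512])
{
    FFTContext *ctx = av_fft_init(9, 0 /* 0 = forward, 1 = inverse */);
    if (!ctx)
        return;

    av_fft_permute(ctx, z);   /* reorder input into the layout av_fft_calc() expects */
    av_fft_calc(ctx, z);      /* transform in place */

    av_fft_end(ctx);
}
```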
@@ -1,5 +1,4 @@
# subsystems
OBJS-$(CONFIG_FFT) += aarch64/fft_init_aarch64.o
OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o
OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o
OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o

@@ -36,7 +35,6 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o

# subsystems
NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o
NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o
NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o
NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \

@@ -47,7 +45,6 @@ NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \
                               aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
@ -1,25 +0,0 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_AARCH64_ASM_OFFSETS_H
|
||||
#define AVCODEC_AARCH64_ASM_OFFSETS_H
|
||||
|
||||
/* FFTContext */
|
||||
#define IMDCT_HALF 0x48
|
||||
|
||||
#endif /* AVCODEC_AARCH64_ASM_OFFSETS_H */
|
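The deleted asm-offsets.h above carried a single hard-coded byte offset (IMDCT_HALF) so that hand-written NEON assembly could load the imdct_half function pointer straight out of FFTContext; the C side checked the constant with AV_CHECK_OFFSET (see the synth_filter hunk further down). A generic C11 sketch of that verification idea, using a hypothetical struct rather than FFmpeg's macro:

```c
#include <stddef.h>

/* Generic sketch (C11, not FFmpeg's AV_CHECK_OFFSET macro): assembly
 * hard-codes the byte offset of a struct member, and the C side
 * statically checks that the constant still matches the real layout.
 * The struct and offset below are hypothetical. */
struct HypotheticalFFTContext {
    double pad[2];                                   /* stand-in for earlier members */
    void (*imdct_half)(struct HypotheticalFFTContext *s,
                       float *dst, const float *src);
};

#define HYP_IMDCT_HALF_OFFSET 16                     /* value the .S file would hard-code */

_Static_assert(offsetof(struct HypotheticalFFTContext, imdct_half) == HYP_IMDCT_HALF_OFFSET,
               "hard-coded asm offset is out of sync with the struct layout");
```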
@ -1,52 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/aarch64/cpu.h"
|
||||
|
||||
#include "libavcodec/fft.h"
|
||||
|
||||
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
|
||||
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
|
||||
|
||||
void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
|
||||
av_cold void ff_fft_init_aarch64(FFTContext *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
if (s->nbits < 17) {
|
||||
s->fft_permute = ff_fft_permute_neon;
|
||||
s->fft_calc = ff_fft_calc_neon;
|
||||
}
|
||||
#if CONFIG_MDCT
|
||||
s->imdct_calc = ff_imdct_calc_neon;
|
||||
s->imdct_half = ff_imdct_half_neon;
|
||||
s->mdct_calc = ff_mdct_calc_neon;
|
||||
s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
|
||||
#endif
|
||||
}
|
||||
}
|
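The deleted fft_init_aarch64.c above shows the pattern the removed subsystem relied on: generic code fills the context with portable defaults, and an arch-specific init overrides the function pointers at runtime from the CPU flags (the NEON kernels only covered sizes below 1 << 17). A condensed sketch of that dispatch pattern with hypothetical names:

```c
#include <libavutil/cpu.h>

/* Condensed sketch of the runtime-dispatch pattern used by the deleted
 * init file: scalar defaults first, then CPU-flag-guarded overrides.
 * Context and kernel names are hypothetical stand-ins. */
typedef struct MyFFTContext {
    int nbits;                                  /* transform length is 1 << nbits */
    void (*fft_calc)(struct MyFFTContext *s, float *z);
} MyFFTContext;

static void my_fft_calc_c(MyFFTContext *s, float *z)    { /* portable C kernel stub */ }
static void my_fft_calc_neon(MyFFTContext *s, float *z) { /* NEON kernel stub */ }

static void my_fft_init(MyFFTContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    s->fft_calc = my_fft_calc_c;                /* always-valid fallback */

    /* The removed aarch64 code only overrode sizes below 1 << 17. */
    if ((cpu_flags & AV_CPU_FLAG_NEON) && s->nbits < 17)
        s->fft_calc = my_fft_calc_neon;
}
```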
@ -1,447 +0,0 @@
|
||||
/*
|
||||
* ARM NEON optimised FFT
|
||||
*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
* Copyright (c) 2009 Naotoshi Nojiri
|
||||
* Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
|
||||
*
|
||||
* This algorithm (though not any of the implementation details) is
|
||||
* based on libdjbfft by D. J. Bernstein.
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
#define M_SQRT1_2 0.70710678118654752440
|
||||
|
||||
.macro transpose d0, d1, s0, s1
|
||||
trn1 \d0, \s0, \s1
|
||||
trn2 \d1, \s0, \s1
|
||||
.endm
|
||||
|
||||
|
||||
function fft4_neon
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
ld1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
|
||||
|
||||
fadd v4.2s, v0.2s, v1.2s // r0+r1,i0+i1
|
||||
fsub v6.2s, v0.2s, v1.2s // r0-r1,i0-i1
|
||||
|
||||
ext v16.8b, v2.8b, v3.8b, #4
|
||||
ext v17.8b, v3.8b, v2.8b, #4
|
||||
|
||||
fadd v5.2s, v2.2s, v3.2s // i2+i3,r2+r3
|
||||
fsub v7.2s, v16.2s, v17.2s // r3-r2,i2-i3
|
||||
|
||||
fadd v0.2s, v4.2s, v5.2s
|
||||
fsub v2.2s, v4.2s, v5.2s
|
||||
fadd v1.2s, v6.2s, v7.2s
|
||||
fsub v3.2s, v6.2s, v7.2s
|
||||
|
||||
st1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function fft8_neon
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
mov x1, x0
|
||||
ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32
|
||||
ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
|
||||
ext v22.8b, v2.8b, v3.8b, #4
|
||||
ext v23.8b, v3.8b, v2.8b, #4
|
||||
fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5
|
||||
fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7
|
||||
fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
|
||||
fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
|
||||
rev64 v27.2s, v28.2s // ???
|
||||
fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1
|
||||
fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3
|
||||
fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
|
||||
ext v6.8b, v4.8b, v5.8b, #4
|
||||
ext v7.8b, v5.8b, v4.8b, #4
|
||||
fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
|
||||
fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
|
||||
fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1
|
||||
fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
|
||||
fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
|
||||
fadd v0.2s, v20.2s, v21.2s
|
||||
fsub v2.2s, v20.2s, v21.2s
|
||||
fadd v1.2s, v22.2s, v23.2s
|
||||
rev64 v26.2s, v26.2s
|
||||
rev64 v27.2s, v27.2s
|
||||
fsub v3.2s, v22.2s, v23.2s
|
||||
fsub v6.2s, v6.2s, v7.2s
|
||||
fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
|
||||
fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
|
||||
fadd v7.2s, v4.2s, v5.2s
|
||||
fsub v18.2s, v2.2s, v6.2s
|
||||
ext v26.8b, v24.8b, v25.8b, #4
|
||||
ext v27.8b, v25.8b, v24.8b, #4
|
||||
fadd v2.2s, v2.2s, v6.2s
|
||||
fsub v16.2s, v0.2s, v7.2s
|
||||
fadd v5.2s, v25.2s, v24.2s
|
||||
fsub v4.2s, v26.2s, v27.2s
|
||||
fadd v0.2s, v0.2s, v7.2s
|
||||
fsub v17.2s, v1.2s, v5.2s
|
||||
fsub v19.2s, v3.2s, v4.2s
|
||||
fadd v3.2s, v3.2s, v4.2s
|
||||
fadd v1.2s, v1.2s, v5.2s
|
||||
|
||||
st1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
|
||||
st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x1]
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function fft16_neon
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
mov x1, x0
|
||||
ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32
|
||||
ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
|
||||
ext v22.8b, v2.8b, v3.8b, #4
|
||||
ext v23.8b, v3.8b, v2.8b, #4
|
||||
fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5
|
||||
fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7
|
||||
fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
|
||||
fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
|
||||
rev64 v27.2s, v28.2s // ???
|
||||
fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1
|
||||
fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3
|
||||
fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
|
||||
ext v6.8b, v4.8b, v5.8b, #4
|
||||
ext v7.8b, v5.8b, v4.8b, #4
|
||||
fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
|
||||
fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
|
||||
fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1
|
||||
fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
|
||||
fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
|
||||
fadd v0.2s, v20.2s, v21.2s
|
||||
fsub v2.2s, v20.2s, v21.2s
|
||||
fadd v1.2s, v22.2s, v23.2s
|
||||
rev64 v26.2s, v26.2s
|
||||
rev64 v27.2s, v27.2s
|
||||
fsub v3.2s, v22.2s, v23.2s
|
||||
fsub v6.2s, v6.2s, v7.2s
|
||||
fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
|
||||
fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
|
||||
fadd v7.2s, v4.2s, v5.2s
|
||||
fsub v18.2s, v2.2s, v6.2s
|
||||
ld1 {v20.4s,v21.4s}, [x0], #32
|
||||
ld1 {v22.4s,v23.4s}, [x0], #32
|
||||
ext v26.8b, v24.8b, v25.8b, #4
|
||||
ext v27.8b, v25.8b, v24.8b, #4
|
||||
fadd v2.2s, v2.2s, v6.2s
|
||||
fsub v16.2s, v0.2s, v7.2s
|
||||
fadd v5.2s, v25.2s, v24.2s
|
||||
fsub v4.2s, v26.2s, v27.2s
|
||||
transpose v24.2d, v25.2d, v20.2d, v22.2d
|
||||
transpose v26.2d, v27.2d, v21.2d, v23.2d
|
||||
fadd v0.2s, v0.2s, v7.2s
|
||||
fsub v17.2s, v1.2s, v5.2s
|
||||
fsub v19.2s, v3.2s, v4.2s
|
||||
fadd v3.2s, v3.2s, v4.2s
|
||||
fadd v1.2s, v1.2s, v5.2s
|
||||
ext v20.16b, v21.16b, v21.16b, #4
|
||||
ext v21.16b, v23.16b, v23.16b, #4
|
||||
|
||||
zip1 v0.2d, v0.2d, v1.2d // {z[0], z[1]}
|
||||
zip1 v1.2d, v2.2d, v3.2d // {z[2], z[3]}
|
||||
zip1 v2.2d, v16.2d, v17.2d // {z[o1], z[o1+1]}
|
||||
zip1 v3.2d, v18.2d, v19.2d // {z[o1+2],z[o1+3]}
|
||||
|
||||
// 2 x fft4
|
||||
transpose v22.2d, v23.2d, v20.2d, v21.2d
|
||||
|
||||
fadd v4.4s, v24.4s, v25.4s
|
||||
fadd v5.4s, v26.4s, v27.4s
|
||||
fsub v6.4s, v24.4s, v25.4s
|
||||
fsub v7.4s, v22.4s, v23.4s
|
||||
|
||||
ld1 {v23.4s}, [x14]
|
||||
|
||||
fadd v24.4s, v4.4s, v5.4s // {z[o2+0],z[o2+1]}
|
||||
fsub v26.4s, v4.4s, v5.4s // {z[o2+2],z[o2+3]}
|
||||
fadd v25.4s, v6.4s, v7.4s // {z[o3+0],z[o3+1]}
|
||||
fsub v27.4s, v6.4s, v7.4s // {z[o3+2],z[o3+3]}
|
||||
|
||||
//fft_pass_neon_16
|
||||
rev64 v7.4s, v25.4s
|
||||
fmul v25.4s, v25.4s, v23.s[1]
|
||||
fmul v7.4s, v7.4s, v29.4s
|
||||
fmla v25.4s, v7.4s, v23.s[3] // {t1a,t2a,t5a,t6a}
|
||||
|
||||
zip1 v20.4s, v24.4s, v25.4s
|
||||
zip2 v21.4s, v24.4s, v25.4s
|
||||
fneg v22.4s, v20.4s
|
||||
fadd v4.4s, v21.4s, v20.4s
|
||||
fsub v6.4s, v20.4s, v21.4s // just the second half
|
||||
fadd v5.4s, v21.4s, v22.4s // just the first half
|
||||
|
||||
tbl v4.16b, {v4.16b}, v30.16b // trans4_float
|
||||
tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
|
||||
|
||||
fsub v20.4s, v0.4s, v4.4s // {z[o2],z[o2+1]}
|
||||
fadd v16.4s, v0.4s, v4.4s // {z[0], z[1]}
|
||||
fsub v22.4s, v2.4s, v5.4s // {z[o3],z[o3+1]}
|
||||
fadd v18.4s, v2.4s, v5.4s // {z[o1],z[o1+1]}
|
||||
|
||||
//second half
|
||||
rev64 v6.4s, v26.4s
|
||||
fmul v26.4s, v26.4s, v23.s[2]
|
||||
rev64 v7.4s, v27.4s
|
||||
fmul v27.4s, v27.4s, v23.s[3]
|
||||
fmul v6.4s, v6.4s, v29.4s
|
||||
fmul v7.4s, v7.4s, v29.4s
|
||||
fmla v26.4s, v6.4s, v23.s[2] // {t1,t2,t5,t6}
|
||||
fmla v27.4s, v7.4s, v23.s[1] // {t1a,t2a,t5a,t6a}
|
||||
|
||||
zip1 v24.4s, v26.4s, v27.4s
|
||||
zip2 v25.4s, v26.4s, v27.4s
|
||||
fneg v26.4s, v24.4s
|
||||
fadd v4.4s, v25.4s, v24.4s
|
||||
fsub v6.4s, v24.4s, v25.4s // just the second half
|
||||
fadd v5.4s, v25.4s, v26.4s // just the first half
|
||||
|
||||
tbl v4.16b, {v4.16b}, v30.16b // trans4_float
|
||||
tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
|
||||
|
||||
fadd v17.4s, v1.4s, v4.4s // {z[2], z[3]}
|
||||
fsub v21.4s, v1.4s, v4.4s // {z[o2+2],z[o2+3]}
|
||||
fadd v19.4s, v3.4s, v5.4s // {z[o1+2],z[o1+3]}
|
||||
fsub v23.4s, v3.4s, v5.4s // {z[o3+2],z[o3+3]}
|
||||
|
||||
st1 {v16.4s,v17.4s}, [x1], #32
|
||||
st1 {v18.4s,v19.4s}, [x1], #32
|
||||
st1 {v20.4s,v21.4s}, [x1], #32
|
||||
st1 {v22.4s,v23.4s}, [x1], #32
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
|
||||
const trans4_float, align=4
|
||||
.byte 0, 1, 2, 3
|
||||
.byte 8, 9, 10, 11
|
||||
.byte 4, 5, 6, 7
|
||||
.byte 12, 13, 14, 15
|
||||
endconst
|
||||
|
||||
const trans8_float, align=4
|
||||
.byte 24, 25, 26, 27
|
||||
.byte 0, 1, 2, 3
|
||||
.byte 28, 29, 30, 31
|
||||
.byte 4, 5, 6, 7
|
||||
endconst
|
||||
|
||||
function fft_pass_neon
|
||||
sub x6, x2, #1 // n - 1, loop counter
|
||||
lsl x5, x2, #3 // 2 * n * sizeof FFTSample
|
||||
lsl x1, x2, #4 // 2 * n * sizeof FFTComplex
|
||||
add x5, x4, x5 // wim
|
||||
add x3, x1, x2, lsl #5 // 4 * n * sizeof FFTComplex
|
||||
add x2, x0, x2, lsl #5 // &z[o2]
|
||||
add x3, x0, x3 // &z[o3]
|
||||
add x1, x0, x1 // &z[o1]
|
||||
ld1 {v20.4s},[x2] // {z[o2],z[o2+1]}
|
||||
ld1 {v22.4s},[x3] // {z[o3],z[o3+1]}
|
||||
ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]}
|
||||
trn2 v25.2d, v20.2d, v22.2d
|
||||
sub x5, x5, #4 // wim--
|
||||
trn1 v24.2d, v20.2d, v22.2d
|
||||
ld1 {v5.s}[0], [x5], x7 // d5[0] = wim[-1]
|
||||
rev64 v7.4s, v25.4s
|
||||
fmul v25.4s, v25.4s, v4.s[1]
|
||||
ld1 {v16.4s}, [x0] // {z[0],z[1]}
|
||||
fmul v7.4s, v7.4s, v29.4s
|
||||
ld1 {v17.4s}, [x1] // {z[o1],z[o1+1]}
|
||||
prfm pldl1keep, [x2, #16]
|
||||
prfm pldl1keep, [x3, #16]
|
||||
fmla v25.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
|
||||
prfm pldl1keep, [x0, #16]
|
||||
prfm pldl1keep, [x1, #16]
|
||||
|
||||
zip1 v20.4s, v24.4s, v25.4s
|
||||
zip2 v21.4s, v24.4s, v25.4s
|
||||
fneg v22.4s, v20.4s
|
||||
fadd v4.4s, v21.4s, v20.4s
|
||||
fsub v6.4s, v20.4s, v21.4s // just the second half
|
||||
fadd v5.4s, v21.4s, v22.4s // just the first half
|
||||
|
||||
tbl v4.16b, {v4.16b}, v30.16b // trans4_float
|
||||
tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
|
||||
|
||||
fadd v20.4s, v16.4s, v4.4s
|
||||
fsub v22.4s, v16.4s, v4.4s
|
||||
fadd v21.4s, v17.4s, v5.4s
|
||||
st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
|
||||
fsub v23.4s, v17.4s, v5.4s
|
||||
|
||||
st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
|
||||
st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
|
||||
st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
|
||||
1:
|
||||
ld1 {v20.4s},[x2] // {z[o2],z[o2+1]}
|
||||
ld1 {v22.4s},[x3] // {z[o3],z[o3+1]}
|
||||
ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]}
|
||||
transpose v26.2d, v27.2d, v20.2d, v22.2d
|
||||
ld1 {v5.2s}, [x5], x7 // {wim[-1],wim[0]}
|
||||
rev64 v6.4s, v26.4s
|
||||
fmul v26.4s, v26.4s, v4.s[0]
|
||||
rev64 v7.4s, v27.4s
|
||||
fmul v27.4s, v27.4s, v4.s[1]
|
||||
fmul v6.4s, v6.4s, v29.4s
|
||||
fmul v7.4s, v7.4s, v29.4s
|
||||
ld1 {v16.4s},[x0] // {z[0],z[1]}
|
||||
fmla v26.4s, v6.4s, v5.s[1] // {t1,t2,t5,t6}
|
||||
fmla v27.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
|
||||
ld1 {v17.4s},[x1] // {z[o1],z[o1+1]}
|
||||
|
||||
subs x6, x6, #1 // n--
|
||||
|
||||
zip1 v20.4s, v26.4s, v27.4s
|
||||
zip2 v21.4s, v26.4s, v27.4s
|
||||
fneg v22.4s, v20.4s
|
||||
fadd v4.4s, v21.4s, v20.4s
|
||||
fsub v6.4s, v20.4s, v21.4s // just the second half
|
||||
fadd v5.4s, v21.4s, v22.4s // just the first half
|
||||
|
||||
tbl v4.16b, {v4.16b}, v30.16b // trans4_float
|
||||
tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
|
||||
|
||||
fadd v20.4s, v16.4s, v4.4s
|
||||
fsub v22.4s, v16.4s, v4.4s
|
||||
fadd v21.4s, v17.4s, v5.4s
|
||||
st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
|
||||
fsub v23.4s, v17.4s, v5.4s
|
||||
|
||||
st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
|
||||
st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
|
||||
st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
|
||||
b.ne 1b
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
.macro def_fft n, n2, n4
|
||||
function fft\n\()_neon, align=6
|
||||
AARCH64_VALID_JUMP_TARGET
|
||||
AARCH64_SIGN_LINK_REGISTER
|
||||
stp x28, x30, [sp, #-16]!
|
||||
add x28, x0, #\n4*2*8
|
||||
bl fft\n2\()_neon
|
||||
mov x0, x28
|
||||
bl fft\n4\()_neon
|
||||
add x0, x28, #\n4*1*8
|
||||
bl fft\n4\()_neon
|
||||
sub x0, x28, #\n4*2*8
|
||||
ldp x28, x30, [sp], #16
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
movrel x4, X(ff_cos_\n)
|
||||
mov x2, #\n4>>1
|
||||
b fft_pass_neon
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
def_fft 32, 16, 8
|
||||
def_fft 64, 32, 16
|
||||
def_fft 128, 64, 32
|
||||
def_fft 256, 128, 64
|
||||
def_fft 512, 256, 128
|
||||
def_fft 1024, 512, 256
|
||||
def_fft 2048, 1024, 512
|
||||
def_fft 4096, 2048, 1024
|
||||
def_fft 8192, 4096, 2048
|
||||
def_fft 16384, 8192, 4096
|
||||
def_fft 32768, 16384, 8192
|
||||
def_fft 65536, 32768, 16384
|
||||
|
||||
function ff_fft_calc_neon, export=1
|
||||
prfm pldl1keep, [x1]
|
||||
movrel x10, trans4_float
|
||||
ldr w2, [x0]
|
||||
movrel x11, trans8_float
|
||||
sub w2, w2, #2
|
||||
movrel x3, fft_tab_neon
|
||||
ld1 {v30.16b}, [x10]
|
||||
mov x7, #-8
|
||||
movrel x12, pmmp
|
||||
ldr x3, [x3, x2, lsl #3]
|
||||
movrel x13, mppm
|
||||
movrel x14, X(ff_cos_16)
|
||||
ld1 {v31.16b}, [x11]
|
||||
mov x0, x1
|
||||
ld1 {v29.4s}, [x12] // pmmp
|
||||
ld1 {v28.4s}, [x13]
|
||||
br x3
|
||||
endfunc
|
||||
|
||||
function ff_fft_permute_neon, export=1
|
||||
mov x6, #1
|
||||
ldr w2, [x0] // nbits
|
||||
ldr x3, [x0, #16] // tmp_buf
|
||||
ldr x0, [x0, #8] // revtab
|
||||
lsl x6, x6, x2
|
||||
mov x2, x6
|
||||
1:
|
||||
ld1 {v0.2s,v1.2s}, [x1], #16
|
||||
ldr w4, [x0], #4
|
||||
uxth w5, w4
|
||||
lsr w4, w4, #16
|
||||
add x5, x3, x5, lsl #3
|
||||
add x4, x3, x4, lsl #3
|
||||
st1 {v0.2s}, [x5]
|
||||
st1 {v1.2s}, [x4]
|
||||
subs x6, x6, #2
|
||||
b.gt 1b
|
||||
|
||||
sub x1, x1, x2, lsl #3
|
||||
1:
|
||||
ld1 {v0.4s,v1.4s}, [x3], #32
|
||||
st1 {v0.4s,v1.4s}, [x1], #32
|
||||
subs x2, x2, #4
|
||||
b.gt 1b
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
const fft_tab_neon, relocate=1
|
||||
.quad fft4_neon
|
||||
.quad fft8_neon
|
||||
.quad fft16_neon
|
||||
.quad fft32_neon
|
||||
.quad fft64_neon
|
||||
.quad fft128_neon
|
||||
.quad fft256_neon
|
||||
.quad fft512_neon
|
||||
.quad fft1024_neon
|
||||
.quad fft2048_neon
|
||||
.quad fft4096_neon
|
||||
.quad fft8192_neon
|
||||
.quad fft16384_neon
|
||||
.quad fft32768_neon
|
||||
.quad fft65536_neon
|
||||
endconst
|
||||
|
||||
const pmmp, align=4
|
||||
.float +1.0, -1.0, -1.0, +1.0
|
||||
endconst
|
||||
|
||||
const mppm, align=4
|
||||
.float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
|
||||
endconst
|
@ -1,326 +0,0 @@
|
||||
/*
|
||||
* AArch64 NEON optimised MDCT
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
* Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
function ff_imdct_half_neon, export=1
|
||||
stp x19, x20, [sp, #-32]!
|
||||
AARCH64_SIGN_LINK_REGISTER
|
||||
str x30, [sp, #16]
|
||||
mov x12, #1
|
||||
ldr w14, [x0, #28] // mdct_bits
|
||||
ldr x4, [x0, #32] // tcos
|
||||
ldr x3, [x0, #8] // revtab
|
||||
lsl x12, x12, x14 // n = 1 << nbits
|
||||
lsr x14, x12, #2 // n4 = n >> 2
|
||||
add x7, x2, x12, lsl #1
|
||||
mov x12, #-16
|
||||
sub x7, x7, #16
|
||||
|
||||
ld2 {v16.2s,v17.2s}, [x7], x12 // d16=x,n1 d17=x,n0
|
||||
ld2 {v0.2s,v1.2s}, [x2], #16 // d0 =m0,x d1 =m1,x
|
||||
rev64 v17.2s, v17.2s
|
||||
ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s2
|
||||
fmul v6.2s, v17.2s, v2.2s
|
||||
fmul v7.2s, v0.2s, v2.2s
|
||||
1:
|
||||
subs x14, x14, #2
|
||||
ldr w6, [x3], #4
|
||||
fmul v4.2s, v0.2s, v3.2s
|
||||
fmul v5.2s, v17.2s, v3.2s
|
||||
fsub v4.2s, v6.2s, v4.2s
|
||||
fadd v5.2s, v5.2s, v7.2s
|
||||
ubfm x8, x6, #16, #31
|
||||
ubfm x6, x6, #0, #15
|
||||
add x8, x1, x8, lsl #3
|
||||
add x6, x1, x6, lsl #3
|
||||
b.eq 2f
|
||||
ld2 {v16.2s,v17.2s}, [x7], x12
|
||||
ld2 {v0.2s,v1.2s}, [x2], #16
|
||||
rev64 v17.2s, v17.2s
|
||||
ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s2
|
||||
fmul v6.2s, v17.2s, v2.2s
|
||||
fmul v7.2s, v0.2s, v2.2s
|
||||
st2 {v4.s,v5.s}[0], [x6]
|
||||
st2 {v4.s,v5.s}[1], [x8]
|
||||
b 1b
|
||||
2:
|
||||
st2 {v4.s,v5.s}[0], [x6]
|
||||
st2 {v4.s,v5.s}[1], [x8]
|
||||
|
||||
mov x19, x0
|
||||
mov x20, x1
|
||||
bl X(ff_fft_calc_neon)
|
||||
|
||||
mov x12, #1
|
||||
ldr w14, [x19, #28] // mdct_bits
|
||||
ldr x4, [x19, #32] // tcos
|
||||
lsl x12, x12, x14 // n = 1 << nbits
|
||||
lsr x14, x12, #3 // n8 = n >> 3
|
||||
|
||||
add x4, x4, x14, lsl #3
|
||||
add x6, x20, x14, lsl #3
|
||||
sub x1, x4, #16
|
||||
sub x3, x6, #16
|
||||
|
||||
mov x7, #-16
|
||||
mov x8, x6
|
||||
mov x0, x3
|
||||
|
||||
ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =i1,r1 d1 =i0,r0
|
||||
ld2 {v20.2s,v21.2s},[x6], #16 // d20=i2,r2 d21=i3,r3
|
||||
ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0
|
||||
3:
|
||||
subs x14, x14, #2
|
||||
fmul v7.2s, v0.2s, v17.2s
|
||||
ld2 {v18.2s,v19.2s},[x4], #16 // d17=c2,c3 d19=s2,s3
|
||||
fmul v4.2s, v1.2s, v17.2s
|
||||
fmul v6.2s, v21.2s, v19.2s
|
||||
fmul v5.2s, v20.2s, v19.2s
|
||||
fmul v22.2s, v1.2s, v16.2s
|
||||
fmul v23.2s, v21.2s, v18.2s
|
||||
fmul v24.2s, v0.2s, v16.2s
|
||||
fmul v25.2s, v20.2s, v18.2s
|
||||
fadd v7.2s, v7.2s, v22.2s
|
||||
fadd v5.2s, v5.2s, v23.2s
|
||||
fsub v4.2s, v4.2s, v24.2s
|
||||
fsub v6.2s, v6.2s, v25.2s
|
||||
b.eq 4f
|
||||
ld2 {v0.2s,v1.2s}, [x3], x7
|
||||
ld2 {v20.2s,v21.2s},[x6], #16
|
||||
ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0
|
||||
rev64 v5.2s, v5.2s
|
||||
rev64 v7.2s, v7.2s
|
||||
st2 {v4.2s,v5.2s}, [x0], x7
|
||||
st2 {v6.2s,v7.2s}, [x8], #16
|
||||
b 3b
|
||||
4:
|
||||
rev64 v5.2s, v5.2s
|
||||
rev64 v7.2s, v7.2s
|
||||
st2 {v4.2s,v5.2s}, [x0]
|
||||
st2 {v6.2s,v7.2s}, [x8]
|
||||
|
||||
ldr x30, [sp, #16]
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ldp x19, x20, [sp], #32
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_imdct_calc_neon, export=1
|
||||
stp x19, x20, [sp, #-32]!
|
||||
AARCH64_SIGN_LINK_REGISTER
|
||||
str x30, [sp, #16]
|
||||
ldr w3, [x0, #28] // mdct_bits
|
||||
mov x19, #1
|
||||
mov x20, x1
|
||||
lsl x19, x19, x3
|
||||
add x1, x1, x19
|
||||
|
||||
bl X(ff_imdct_half_neon)
|
||||
|
||||
add x0, x20, x19, lsl #2
|
||||
add x1, x20, x19, lsl #1
|
||||
sub x0, x0, #8
|
||||
sub x2, x1, #16
|
||||
mov x3, #-16
|
||||
mov x6, #-8
|
||||
1:
|
||||
ld1 {v0.4s}, [x2], x3
|
||||
prfum pldl1keep, [x0, #-16]
|
||||
rev64 v0.4s, v0.4s
|
||||
ld1 {v2.2s,v3.2s}, [x1], #16
|
||||
fneg v4.4s, v0.4s
|
||||
prfum pldl1keep, [x2, #-16]
|
||||
rev64 v2.2s, v2.2s
|
||||
rev64 v3.2s, v3.2s
|
||||
ext v4.16b, v4.16b, v4.16b, #8
|
||||
st1 {v2.2s}, [x0], x6
|
||||
st1 {v3.2s}, [x0], x6
|
||||
st1 {v4.4s}, [x20], #16
|
||||
subs x19, x19, #16
|
||||
b.gt 1b
|
||||
|
||||
ldr x30, [sp, #16]
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ldp x19, x20, [sp], #32
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
|
||||
function ff_mdct_calc_neon, export=1
|
||||
stp x19, x20, [sp, #-32]!
|
||||
AARCH64_SIGN_LINK_REGISTER
|
||||
str x30, [sp, #16]
|
||||
|
||||
mov x12, #1
|
||||
ldr w14, [x0, #28] // mdct_bits
|
||||
ldr x4, [x0, #32] // tcos
|
||||
ldr x3, [x0, #8] // revtab
|
||||
lsl x14, x12, x14 // n = 1 << nbits
|
||||
add x7, x2, x14 // in4u
|
||||
sub x9, x7, #16 // in4d
|
||||
add x2, x7, x14, lsl #1 // in3u
|
||||
add x8, x9, x14, lsl #1 // in3d
|
||||
add x5, x4, x14, lsl #1
|
||||
sub x5, x5, #16
|
||||
sub x3, x3, #4
|
||||
mov x12, #-16
|
||||
lsr x13, x14, #1
|
||||
|
||||
ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0
|
||||
ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0
|
||||
ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0
|
||||
rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1
|
||||
rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1
|
||||
ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0
|
||||
fsub v0.2s, v17.2s, v0.2s // in4d-in4u I
|
||||
ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1
|
||||
rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1
|
||||
rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1
|
||||
ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3
|
||||
fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R
|
||||
fsub v16.2s, v16.2s, v1.2s // in0u-in2d R
|
||||
fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I
|
||||
1:
|
||||
fmul v7.2s, v0.2s, v21.2s // I*s
|
||||
ldr w10, [x3, x13]
|
||||
fmul v6.2s, v2.2s, v20.2s // -R*c
|
||||
ldr w6, [x3, #4]!
|
||||
fmul v4.2s, v2.2s, v21.2s // -R*s
|
||||
fmul v5.2s, v0.2s, v20.2s // I*c
|
||||
fmul v24.2s, v16.2s, v30.2s // R*c
|
||||
fmul v25.2s, v18.2s, v31.2s // -I*s
|
||||
fmul v22.2s, v16.2s, v31.2s // R*s
|
||||
fmul v23.2s, v18.2s, v30.2s // I*c
|
||||
subs x14, x14, #16
|
||||
subs x13, x13, #8
|
||||
fsub v6.2s, v6.2s, v7.2s // -R*c-I*s
|
||||
fadd v7.2s, v4.2s, v5.2s // -R*s+I*c
|
||||
fsub v24.2s, v25.2s, v24.2s // I*s-R*c
|
||||
fadd v25.2s, v22.2s, v23.2s // R*s-I*c
|
||||
b.eq 1f
|
||||
mov x12, #-16
|
||||
ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0
|
||||
ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0
|
||||
fneg v7.2s, v7.2s // R*s-I*c
|
||||
ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0
|
||||
rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1
|
||||
rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1
|
||||
ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0
|
||||
fsub v0.2s, v17.2s, v0.2s // in4d-in4u I
|
||||
ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1
|
||||
rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1
|
||||
rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1
|
||||
ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3
|
||||
fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R
|
||||
fsub v16.2s, v16.2s, v1.2s // in0u-in2d R
|
||||
fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I
|
||||
ubfm x12, x6, #16, #31
|
||||
ubfm x6, x6, #0, #15
|
||||
add x12, x1, x12, lsl #3
|
||||
add x6, x1, x6, lsl #3
|
||||
st2 {v6.s,v7.s}[0], [x6]
|
||||
st2 {v6.s,v7.s}[1], [x12]
|
||||
ubfm x6, x10, #16, #31
|
||||
ubfm x10, x10, #0, #15
|
||||
add x6 , x1, x6, lsl #3
|
||||
add x10, x1, x10, lsl #3
|
||||
st2 {v24.s,v25.s}[0], [x10]
|
||||
st2 {v24.s,v25.s}[1], [x6]
|
||||
b 1b
|
||||
1:
|
||||
fneg v7.2s, v7.2s // R*s-I*c
|
||||
ubfm x12, x6, #16, #31
|
||||
ubfm x6, x6, #0, #15
|
||||
add x12, x1, x12, lsl #3
|
||||
add x6, x1, x6, lsl #3
|
||||
st2 {v6.s,v7.s}[0], [x6]
|
||||
st2 {v6.s,v7.s}[1], [x12]
|
||||
ubfm x6, x10, #16, #31
|
||||
ubfm x10, x10, #0, #15
|
||||
add x6 , x1, x6, lsl #3
|
||||
add x10, x1, x10, lsl #3
|
||||
st2 {v24.s,v25.s}[0], [x10]
|
||||
st2 {v24.s,v25.s}[1], [x6]
|
||||
|
||||
mov x19, x0
|
||||
mov x20, x1
|
||||
bl X(ff_fft_calc_neon)
|
||||
|
||||
mov x12, #1
|
||||
ldr w14, [x19, #28] // mdct_bits
|
||||
ldr x4, [x19, #32] // tcos
|
||||
lsl x12, x12, x14 // n = 1 << nbits
|
||||
lsr x14, x12, #3 // n8 = n >> 3
|
||||
|
||||
add x4, x4, x14, lsl #3
|
||||
add x6, x20, x14, lsl #3
|
||||
sub x1, x4, #16
|
||||
sub x3, x6, #16
|
||||
|
||||
mov x7, #-16
|
||||
mov x8, x6
|
||||
mov x0, x3
|
||||
|
||||
ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =r1,i1 d1 =r0,i0
|
||||
ld2 {v20.2s,v21.2s}, [x6], #16 // d20=r2,i2 d21=r3,i3
|
||||
ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0
|
||||
1:
|
||||
subs x14, x14, #2
|
||||
fmul v7.2s, v0.2s, v17.2s // r1*s1,r0*s0
|
||||
ld2 {v18.2s,v19.2s}, [x4], #16 // c2,c3 s2,s3
|
||||
fmul v4.2s, v1.2s, v17.2s // i1*s1,i0*s0
|
||||
fmul v6.2s, v21.2s, v19.2s // i2*s2,i3*s3
|
||||
fmul v5.2s, v20.2s, v19.2s // r2*s2,r3*s3
|
||||
fmul v24.2s, v0.2s, v16.2s // r1*c1,r0*c0
|
||||
fmul v25.2s, v20.2s, v18.2s // r2*c2,r3*c3
|
||||
fmul v22.2s, v21.2s, v18.2s // i2*c2,i3*c3
|
||||
fmul v23.2s, v1.2s, v16.2s // i1*c1,i0*c0
|
||||
fadd v4.2s, v4.2s, v24.2s // i1*s1+r1*c1,i0*s0+r0*c0
|
||||
fadd v6.2s, v6.2s, v25.2s // i2*s2+r2*c2,i3*s3+r3*c3
|
||||
fsub v5.2s, v22.2s, v5.2s // i2*c2-r2*s2,i3*c3-r3*s3
|
||||
fsub v7.2s, v23.2s, v7.2s // i1*c1-r1*s1,i0*c0-r0*s0
|
||||
fneg v4.2s, v4.2s
|
||||
fneg v6.2s, v6.2s
|
||||
b.eq 1f
|
||||
ld2 {v0.2s, v1.2s}, [x3], x7
|
||||
ld2 {v20.2s,v21.2s}, [x6], #16
|
||||
ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0
|
||||
rev64 v5.2s, v5.2s
|
||||
rev64 v7.2s, v7.2s
|
||||
st2 {v4.2s,v5.2s}, [x0], x7
|
||||
st2 {v6.2s,v7.2s}, [x8], #16
|
||||
b 1b
|
||||
1:
|
||||
rev64 v5.2s, v5.2s
|
||||
rev64 v7.2s, v7.2s
|
||||
st2 {v4.2s,v5.2s}, [x0]
|
||||
st2 {v6.2s,v7.2s}, [x8]
|
||||
|
||||
ldr x30, [sp, #16]
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ldp x19, x20, [sp], #32
|
||||
|
||||
ret
|
||||
endfunc
|
@@ -23,15 +23,8 @@
#include "libavutil/aarch64/cpu.h"
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavcodec/fft.h"
#include "libavcodec/synth_filter.h"

#include "asm-offsets.h"

#if HAVE_NEON
AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
#endif

void ff_synth_filter_float_neon(AVTXContext *imdct,
                                float *synth_buf_ptr, int *synth_buf_offset,
                                float synth_buf2[32], const float window[512],

@@ -19,8 +19,6 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm-offsets.h"

#include "libavutil/aarch64/asm.S"

.macro inner_loop
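The synth_filter hunk above illustrates the caller-side port: functions that used to take an FFTContext and call s->imdct_half(s, dst, src) now take an AVTXContext plus an av_tx_fn obtained from av_tx_init(). A hedged sketch of that call shape; the length, scale and stride here are assumptions for illustration, not values from this commit:

```c
#include <libavutil/tx.h>

/* Sketch: obtain a half-output inverse MDCT through AVTX and call it the
 * way ported code does in place of the removed s->imdct_half(). */
int imdct_half_via_tx(float *dst, float *src, int len)
{
    AVTXContext *imdct = NULL;
    av_tx_fn imdct_fn  = NULL;
    float scale        = 1.0f;           /* illustrative; codecs pass their own scaling */
    int ret;

    /* inv = 1 requests the inverse MDCT; by default only the half-length
     * output (len samples) is produced, matching the old imdct_half(). */
    ret = av_tx_init(&imdct, &imdct_fn, AV_TX_FLOAT_MDCT, 1, len, &scale, 0);
    if (ret < 0)
        return ret;

    imdct_fn(imdct, dst, src, sizeof(float));   /* stride is in bytes */

    av_tx_uninit(&imdct);
    return 0;
}
```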
@@ -5,7 +5,6 @@ OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \
                          arm/ac3dsp_arm.o
OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_arm.o
OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_arm.o
OBJS-$(CONFIG_FFT) += arm/fft_init_arm.o
OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_init_arm.o
OBJS-$(CONFIG_G722DSP) += arm/g722dsp_init_arm.o
OBJS-$(CONFIG_H264CHROMA) += arm/h264chroma_init_arm.o

@@ -25,7 +24,6 @@ OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_init_arm.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_init_arm.o
OBJS-$(CONFIG_RDFT) += arm/rdft_init_arm.o
OBJS-$(CONFIG_RV34DSP) += arm/rv34dsp_init_arm.o
OBJS-$(CONFIG_VC1DSP) += arm/vc1dsp_init_arm.o
OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o

@@ -90,9 +88,7 @@ ARMV6-OBJS-$(CONFIG_TRUEHD_DECODER) += arm/mlpdsp_armv6.o
# VFP optimizations

# subsystems
VFP-OBJS-$(CONFIG_FFT) += arm/fft_vfp.o
VFP-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_vfp.o
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o

# decoders/encoders
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o

@@ -107,7 +103,6 @@ NEON-OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_neon.o \
                                 arm/int_neon.o
NEON-OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_neon.o \
                                arm/blockdsp_neon.o
NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o
NEON-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_neon.o
NEON-OBJS-$(CONFIG_G722DSP) += arm/g722dsp_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA) += arm/h264cmc_neon.o

@@ -121,10 +116,8 @@ NEON-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_neon.o \
NEON-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_neon.o \
                               arm/idctdsp_neon.o \
                               arm/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += arm/mdct_neon.o
NEON-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_neon.o
NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_neon.o
NEON-OBJS-$(CONFIG_RDFT) += arm/rdft_neon.o
NEON-OBJS-$(CONFIG_VC1DSP) += arm/vc1dsp_init_neon.o \
                              arm/vc1dsp_neon.o
NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o
@ -1,63 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/arm/cpu.h"
|
||||
|
||||
#include "libavcodec/fft.h"
|
||||
|
||||
void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
|
||||
|
||||
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
|
||||
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
|
||||
|
||||
void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
|
||||
void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
|
||||
av_cold void ff_fft_init_arm(FFTContext *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_vfp_vm(cpu_flags)) {
|
||||
s->fft_calc = ff_fft_calc_vfp;
|
||||
#if CONFIG_MDCT
|
||||
s->imdct_half = ff_imdct_half_vfp;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
#if CONFIG_FFT
|
||||
if (s->nbits < 17) {
|
||||
s->fft_permute = ff_fft_permute_neon;
|
||||
s->fft_calc = ff_fft_calc_neon;
|
||||
}
|
||||
#endif
|
||||
#if CONFIG_MDCT
|
||||
s->imdct_calc = ff_imdct_calc_neon;
|
||||
s->imdct_half = ff_imdct_half_neon;
|
||||
s->mdct_calc = ff_mdct_calc_neon;
|
||||
s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
|
||||
#endif
|
||||
}
|
||||
}
|
@ -1,375 +0,0 @@
|
||||
/*
|
||||
* ARM NEON optimised FFT
|
||||
*
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
* Copyright (c) 2009 Naotoshi Nojiri
|
||||
*
|
||||
* This algorithm (though not any of the implementation details) is
|
||||
* based on libdjbfft by D. J. Bernstein.
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
#define M_SQRT1_2 0.70710678118654752440
|
||||
|
||||
|
||||
function fft4_neon
|
||||
vld1.32 {d0-d3}, [r0,:128]
|
||||
|
||||
vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2
|
||||
vsub.f32 d6, d0, d1 @ r0-r1,i0-i1
|
||||
vsub.f32 d7, d16, d17 @ r3-r2,i2-i3
|
||||
vadd.f32 d4, d0, d1 @ r0+r1,i0+i1
|
||||
vadd.f32 d5, d2, d3 @ i2+i3,r2+r3
|
||||
vadd.f32 d1, d6, d7
|
||||
vsub.f32 d3, d6, d7
|
||||
vadd.f32 d0, d4, d5
|
||||
vsub.f32 d2, d4, d5
|
||||
|
||||
vst1.32 {d0-d3}, [r0,:128]
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function fft8_neon
|
||||
mov r1, r0
|
||||
vld1.32 {d0-d3}, [r1,:128]!
|
||||
vld1.32 {d16-d19}, [r1,:128]
|
||||
|
||||
movw r2, #0x04f3 @ sqrt(1/2)
|
||||
movt r2, #0x3f35
|
||||
eor r3, r2, #1<<31
|
||||
vdup.32 d31, r2
|
||||
|
||||
vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2
|
||||
vadd.f32 d4, d16, d17 @ r4+r5,i4+i5
|
||||
vmov d28, r3, r2
|
||||
vadd.f32 d5, d18, d19 @ r6+r7,i6+i7
|
||||
vsub.f32 d17, d16, d17 @ r4-r5,i4-i5
|
||||
vsub.f32 d19, d18, d19 @ r6-r7,i6-i7
|
||||
vrev64.32 d29, d28
|
||||
vadd.f32 d20, d0, d1 @ r0+r1,i0+i1
|
||||
vadd.f32 d21, d2, d3 @ r2+r3,i2+i3
|
||||
vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w
|
||||
vext.32 q3, q2, q2, #1
|
||||
vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w
|
||||
vsub.f32 d23, d22, d23 @ i2-i3,r3-r2
|
||||
vsub.f32 d22, d0, d1 @ r0-r1,i0-i1
|
||||
vmul.f32 d24, d17, d31 @ a2r*w,a2i*w
|
||||
vmul.f32 d25, d19, d31 @ a3r*w,a3i*w
|
||||
vadd.f32 d0, d20, d21
|
||||
vsub.f32 d2, d20, d21
|
||||
vadd.f32 d1, d22, d23
|
||||
vrev64.32 q13, q13
|
||||
vsub.f32 d3, d22, d23
|
||||
vsub.f32 d6, d6, d7
|
||||
vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2
|
||||
vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6
|
||||
vadd.f32 d7, d4, d5
|
||||
vsub.f32 d18, d2, d6
|
||||
vext.32 q13, q12, q12, #1
|
||||
vadd.f32 d2, d2, d6
|
||||
vsub.f32 d16, d0, d7
|
||||
vadd.f32 d5, d25, d24
|
||||
vsub.f32 d4, d26, d27
|
||||
vadd.f32 d0, d0, d7
|
||||
vsub.f32 d17, d1, d5
|
||||
vsub.f32 d19, d3, d4
|
||||
vadd.f32 d3, d3, d4
|
||||
vadd.f32 d1, d1, d5
|
||||
|
||||
vst1.32 {d16-d19}, [r1,:128]
|
||||
vst1.32 {d0-d3}, [r0,:128]
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function fft16_neon
|
||||
movrel r1, mppm
|
||||
vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
|
||||
pld [r0, #32]
|
||||
vld1.32 {d2-d3}, [r1,:128]
|
||||
vext.32 q13, q9, q9, #1
|
||||
vld1.32 {d22-d25}, [r0,:128]! @ q11{r4,i4,r5,i5} q12{r6,i5,r7,i7}
|
||||
vadd.f32 d4, d16, d17
|
||||
vsub.f32 d5, d16, d17
|
||||
vadd.f32 d18, d18, d19
|
||||
vsub.f32 d19, d26, d27
|
||||
|
||||
vadd.f32 d20, d22, d23
|
||||
vsub.f32 d22, d22, d23
|
||||
vsub.f32 d23, d24, d25
|
||||
vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1}
|
||||
vadd.f32 d21, d24, d25
|
||||
vmul.f32 d24, d22, d2
|
||||
vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3}
|
||||
vmul.f32 d25, d23, d3
|
||||
vuzp.32 d16, d17 @ {r0,r1,i0,i1}
|
||||
vmul.f32 q1, q11, d2[1]
|
||||
vuzp.32 d18, d19 @ {r2,r3,i2,i3}
|
||||
vrev64.32 q12, q12
|
||||
vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6}
|
||||
vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
|
||||
vzip.32 q10, q11
|
||||
vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
|
||||
vadd.f32 d0, d22, d20
|
||||
vadd.f32 d1, d21, d23
|
||||
vsub.f32 d2, d21, d23
|
||||
vsub.f32 d3, d22, d20
|
||||
sub r0, r0, #96
|
||||
vext.32 q13, q13, q13, #1
|
||||
vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5}
|
||||
vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
|
||||
vext.32 q15, q15, q15, #1
|
||||
vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7}
|
||||
vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
|
||||
vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3}
|
||||
vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
|
||||
vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6}
|
||||
vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a}
|
||||
movrelx r2, X(ff_cos_16)
|
||||
vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8}
|
||||
vrev64.32 d1, d1
|
||||
vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a}
|
||||
vrev64.32 d3, d3
|
||||
movrel r3, pmmp
|
||||
vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
|
||||
vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
|
||||
vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9}
|
||||
vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13}
|
||||
vld1.32 {d4-d5}, [r2,:64]
|
||||
vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11}
|
||||
vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15}
|
||||
vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
|
||||
vld1.32 {d6-d7}, [r3,:128]
|
||||
vrev64.32 q1, q14
|
||||
vmul.f32 q14, q14, d4[1]
|
||||
vmul.f32 q1, q1, q3
|
||||
vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a}
|
||||
vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
|
||||
vzip.32 q12, q14
|
||||
vadd.f32 d0, d28, d24
|
||||
vadd.f32 d1, d25, d29
|
||||
vsub.f32 d2, d25, d29
|
||||
vsub.f32 d3, d28, d24
|
||||
vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9}
|
||||
vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
|
||||
vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13}
|
||||
mov r1, #32
|
||||
vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5}
|
||||
vrev64.32 q0, q13
|
||||
vmul.f32 q13, q13, d5[0]
|
||||
vrev64.32 q1, q15
|
||||
vmul.f32 q15, q15, d5[1]
|
||||
vst2.32 {d16-d17},[r0,:128], r1
|
||||
vmul.f32 q0, q0, q3
|
||||
vst2.32 {d20-d21},[r0,:128], r1
|
||||
vmul.f32 q1, q1, q3
|
||||
vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6}
|
||||
vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a}
|
||||
vst2.32 {d24-d25},[r0,:128], r1
|
||||
vst2.32 {d28-d29},[r0,:128]
|
||||
vzip.32 q13, q15
|
||||
sub r0, r0, #80
|
||||
vadd.f32 d0, d30, d26
|
||||
vadd.f32 d1, d27, d31
|
||||
vsub.f32 d2, d27, d31
|
||||
vsub.f32 d3, d30, d26
|
||||
vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11}
|
||||
vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3}
|
||||
vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15}
|
||||
vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7}
|
||||
vst2.32 {d18-d19},[r0,:128], r1
|
||||
vst2.32 {d22-d23},[r0,:128], r1
|
||||
vst2.32 {d26-d27},[r0,:128], r1
|
||||
vst2.32 {d30-d31},[r0,:128]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function fft_pass_neon
|
||||
push {r4-r6,lr}
|
||||
mov r6, r2 @ n
|
||||
lsl r5, r2, #3 @ 2 * n * sizeof FFTSample
|
||||
lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex
|
||||
lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex
|
||||
add r3, r2, r4
|
||||
add r4, r4, r0 @ &z[o1]
|
||||
add r2, r2, r0 @ &z[o2]
|
||||
add r3, r3, r0 @ &z[o3]
|
||||
vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
|
||||
movrel r12, pmmp
|
||||
vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
|
||||
add r5, r5, r1 @ wim
|
||||
vld1.32 {d6-d7}, [r12,:128] @ pmmp
|
||||
vswp d21, d22
|
||||
vld1.32 {d4}, [r1,:64]! @ {wre[0],wre[1]}
|
||||
sub r5, r5, #4 @ wim--
|
||||
vrev64.32 q1, q11
|
||||
vmul.f32 q11, q11, d4[1]
|
||||
vmul.f32 q1, q1, q3
|
||||
vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1]
|
||||
vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
|
||||
vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
|
||||
sub r6, r6, #1 @ n--
|
||||
vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
|
||||
vzip.32 q10, q11
|
||||
vadd.f32 d0, d22, d20
|
||||
vadd.f32 d1, d21, d23
|
||||
vsub.f32 d2, d21, d23
|
||||
vsub.f32 d3, d22, d20
|
||||
vsub.f32 q10, q8, q0
|
||||
vadd.f32 q8, q8, q0
|
||||
vsub.f32 q11, q9, q1
|
||||
vadd.f32 q9, q9, q1
|
||||
vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]}
|
||||
vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]}
|
||||
vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]}
|
||||
vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]}
|
||||
sub r5, r5, #8 @ wim -= 2
|
||||
1:
|
||||
vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
|
||||
vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
|
||||
vswp d21, d22
|
||||
vld1.32 {d4}, [r1]! @ {wre[0],wre[1]}
|
||||
vrev64.32 q0, q10
|
||||
vmul.f32 q10, q10, d4[0]
|
||||
vrev64.32 q1, q11
|
||||
vmul.f32 q11, q11, d4[1]
|
||||
vld1.32 {d5}, [r5] @ {wim[-1],wim[0]}
|
||||
vmul.f32 q0, q0, q3
|
||||
sub r5, r5, #8 @ wim -= 2
|
||||
vmul.f32 q1, q1, q3
|
||||
vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6}
|
||||
vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
|
||||
vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
|
||||
subs r6, r6, #1 @ n--
|
||||
vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
|
||||
vzip.32 q10, q11
|
||||
vadd.f32 d0, d22, d20
|
||||
vadd.f32 d1, d21, d23
|
||||
vsub.f32 d2, d21, d23
|
||||
vsub.f32 d3, d22, d20
|
||||
vsub.f32 q10, q8, q0
|
||||
vadd.f32 q8, q8, q0
|
||||
vsub.f32 q11, q9, q1
|
||||
vadd.f32 q9, q9, q1
|
||||
vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]}
|
||||
vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]}
|
||||
vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]}
|
||||
vst2.32 {d18-d19}, [r4,:128]! @ {z[o1],z[o1+1]}
|
||||
bne 1b
|
||||
|
||||
pop {r4-r6,pc}
|
||||
endfunc
|
||||
|
||||
.macro def_fft n, n2, n4
|
||||
.align 6
|
||||
function fft\n\()_neon
|
||||
push {r4, lr}
|
||||
mov r4, r0
|
||||
bl fft\n2\()_neon
|
||||
add r0, r4, #\n4*2*8
|
||||
bl fft\n4\()_neon
|
||||
add r0, r4, #\n4*3*8
|
||||
bl fft\n4\()_neon
|
||||
mov r0, r4
|
||||
pop {r4, lr}
|
||||
movrelx r1, X(ff_cos_\n)
|
||||
mov r2, #\n4/2
|
||||
b fft_pass_neon
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
def_fft 32, 16, 8
|
||||
def_fft 64, 32, 16
|
||||
def_fft 128, 64, 32
|
||||
def_fft 256, 128, 64
|
||||
def_fft 512, 256, 128
|
||||
def_fft 1024, 512, 256
|
||||
def_fft 2048, 1024, 512
|
||||
def_fft 4096, 2048, 1024
|
||||
def_fft 8192, 4096, 2048
|
||||
def_fft 16384, 8192, 4096
|
||||
def_fft 32768, 16384, 8192
|
||||
def_fft 65536, 32768, 16384
|
||||
|
||||
function ff_fft_calc_neon, export=1
|
||||
ldr r2, [r0]
|
||||
sub r2, r2, #2
|
||||
movrel r3, fft_tab_neon
|
||||
ldr r3, [r3, r2, lsl #2]
|
||||
mov r0, r1
|
||||
bx r3
|
||||
endfunc
|
||||
|
||||
function ff_fft_permute_neon, export=1
|
||||
push {r4,lr}
|
||||
mov r12, #1
|
||||
ldr r2, [r0] @ nbits
|
||||
ldr r3, [r0, #12] @ tmp_buf
|
||||
ldr r0, [r0, #8] @ revtab
|
||||
lsl r12, r12, r2
|
||||
mov r2, r12
|
||||
1:
|
||||
vld1.32 {d0-d1}, [r1,:128]!
|
||||
ldr r4, [r0], #4
|
||||
uxth lr, r4
|
||||
uxth r4, r4, ror #16
|
||||
add lr, r3, lr, lsl #3
|
||||
add r4, r3, r4, lsl #3
|
||||
vst1.32 {d0}, [lr,:64]
|
||||
vst1.32 {d1}, [r4,:64]
|
||||
subs r12, r12, #2
|
||||
bgt 1b
|
||||
|
||||
sub r1, r1, r2, lsl #3
|
||||
1:
|
||||
vld1.32 {d0-d3}, [r3,:128]!
|
||||
vst1.32 {d0-d3}, [r1,:128]!
|
||||
subs r2, r2, #4
|
||||
bgt 1b
|
||||
|
||||
pop {r4,pc}
|
||||
endfunc
|
||||
|
||||
const fft_tab_neon, relocate=1
|
||||
.word fft4_neon
|
||||
.word fft8_neon
|
||||
.word fft16_neon
|
||||
.word fft32_neon
|
||||
.word fft64_neon
|
||||
.word fft128_neon
|
||||
.word fft256_neon
|
||||
.word fft512_neon
|
||||
.word fft1024_neon
|
||||
.word fft2048_neon
|
||||
.word fft4096_neon
|
||||
.word fft8192_neon
|
||||
.word fft16384_neon
|
||||
.word fft32768_neon
|
||||
.word fft65536_neon
|
||||
endconst
|
||||
|
||||
const pmmp, align=4
|
||||
.float +1.0, -1.0, -1.0, +1.0
|
||||
endconst
|
||||
|
||||
const mppm, align=4
|
||||
.float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
|
||||
endconst
|
@ -1,530 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013 RISC OS Open Ltd
|
||||
* Author: Ben Avison <bavison@riscosopen.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
|
||||
@ VFP is in RunFast mode, vector length 4, stride 1 thoroughout, and
|
||||
@ all single-precision VFP registers may be corrupted on exit. The a2
|
||||
@ register may not be clobbered in these functions, as it holds the
|
||||
@ stored original FPSCR.
|
||||
|
||||
function ff_fft_calc_vfp, export=1
|
||||
ldr ip, [a1, #0] @ nbits
|
||||
mov a1, a2
|
||||
movrel a2, (fft_tab_vfp - 8)
|
||||
ldr pc, [a2, ip, lsl #2]
|
||||
endfunc
|
||||
const fft_tab_vfp, relocate=1
|
||||
.word fft4_vfp
|
||||
.word fft8_vfp
|
||||
.word X(ff_fft16_vfp) @ this one alone is exported
|
||||
.word fft32_vfp
|
||||
.word fft64_vfp
|
||||
.word fft128_vfp
|
||||
.word fft256_vfp
|
||||
.word fft512_vfp
|
||||
.word fft1024_vfp
|
||||
.word fft2048_vfp
|
||||
.word fft4096_vfp
|
||||
.word fft8192_vfp
|
||||
.word fft16384_vfp
|
||||
.word fft32768_vfp
|
||||
.word fft65536_vfp
|
||||
endconst
|
||||
|
||||
function fft4_vfp
|
||||
vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
|
||||
vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
|
||||
vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
|
||||
vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
|
||||
@ stall
|
||||
vadd.f s12, s0, s8 @ i0
|
||||
vadd.f s13, s1, s9 @ i1
|
||||
vadd.f s14, s2, s10 @ i2
|
||||
vadd.f s15, s3, s11 @ i3
|
||||
vsub.f s8, s0, s8 @ i4
|
||||
vsub.f s9, s1, s9 @ i5
|
||||
vsub.f s10, s2, s10 @ i6
|
||||
vsub.f s11, s3, s11 @ i7
|
||||
@ stall
|
||||
@ stall
|
||||
vadd.f s0, s12, s14 @ z[0].re
|
||||
vsub.f s4, s12, s14 @ z[2].re
|
||||
vadd.f s1, s13, s15 @ z[0].im
|
||||
vsub.f s5, s13, s15 @ z[2].im
|
||||
vadd.f s7, s9, s10 @ z[3].im
|
||||
vsub.f s3, s9, s10 @ z[1].im
|
||||
vadd.f s2, s8, s11 @ z[1].re
|
||||
vsub.f s6, s8, s11 @ z[3].re
|
||||
@ stall
|
||||
@ stall
|
||||
vstr d0, [a1, #0*2*4]
|
||||
vstr d2, [a1, #2*2*4]
|
||||
@ stall
|
||||
@ stall
|
||||
vstr d1, [a1, #1*2*4]
|
||||
vstr d3, [a1, #3*2*4]
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
.macro macro_fft8_head
|
||||
@ FFT4
|
||||
vldr d4, [a1, #0 * 2*4]
|
||||
vldr d6, [a1, #1 * 2*4]
|
||||
vldr d5, [a1, #2 * 2*4]
|
||||
vldr d7, [a1, #3 * 2*4]
|
||||
@ BF
|
||||
vldr d12, [a1, #4 * 2*4]
|
||||
vadd.f s16, s8, s12 @ vector op
|
||||
vldr d14, [a1, #5 * 2*4]
|
||||
vldr d13, [a1, #6 * 2*4]
|
||||
vldr d15, [a1, #7 * 2*4]
|
||||
vsub.f s20, s8, s12 @ vector op
|
||||
vadd.f s0, s16, s18
|
||||
vsub.f s2, s16, s18
|
||||
vadd.f s1, s17, s19
|
||||
vsub.f s3, s17, s19
|
||||
vadd.f s7, s21, s22
|
||||
vsub.f s5, s21, s22
|
||||
vadd.f s4, s20, s23
|
||||
vsub.f s6, s20, s23
|
||||
vsub.f s20, s24, s28 @ vector op
|
||||
vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
|
||||
vstr d1, [a1, #1 * 2*4]
|
||||
vldr s0, cos1pi4
|
||||
vadd.f s16, s24, s28 @ vector op
|
||||
vstr d2, [a1, #2 * 2*4]
|
||||
vstr d3, [a1, #3 * 2*4]
|
||||
vldr d12, [a1, #0 * 2*4]
|
||||
@ TRANSFORM
|
||||
vmul.f s20, s20, s0 @ vector x scalar op
|
||||
vldr d13, [a1, #1 * 2*4]
|
||||
vldr d14, [a1, #2 * 2*4]
|
||||
vldr d15, [a1, #3 * 2*4]
|
||||
@ BUTTERFLIES
|
||||
vadd.f s0, s18, s16
|
||||
vadd.f s1, s17, s19
|
||||
vsub.f s2, s17, s19
|
||||
vsub.f s3, s18, s16
|
||||
vadd.f s4, s21, s20
|
||||
vsub.f s5, s21, s20
|
||||
vadd.f s6, s22, s23
|
||||
vsub.f s7, s22, s23
|
||||
vadd.f s8, s0, s24 @ vector op
|
||||
vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
|
||||
vstr d1, [a1, #1 * 2*4]
|
||||
vldr d6, [a1, #0 * 2*4]
|
||||
vldr d7, [a1, #1 * 2*4]
|
||||
vadd.f s1, s5, s6
|
||||
vadd.f s0, s7, s4
|
||||
vsub.f s2, s5, s6
|
||||
vsub.f s3, s7, s4
|
||||
vsub.f s12, s24, s12 @ vector op
|
||||
vsub.f s5, s29, s1
|
||||
vsub.f s4, s28, s0
|
||||
vsub.f s6, s30, s2
|
||||
vsub.f s7, s31, s3
|
||||
vadd.f s16, s0, s28 @ vector op
|
||||
vstr d6, [a1, #4 * 2*4]
|
||||
vstr d7, [a1, #6 * 2*4]
|
||||
vstr d4, [a1, #0 * 2*4]
|
||||
vstr d5, [a1, #2 * 2*4]
|
||||
vstr d2, [a1, #5 * 2*4]
|
||||
vstr d3, [a1, #7 * 2*4]
|
||||
.endm
|
||||
|
||||
.macro macro_fft8_tail
|
||||
vstr d8, [a1, #1 * 2*4]
|
||||
vstr d9, [a1, #3 * 2*4]
|
||||
.endm
|
||||
|
||||
function .Lfft8_internal_vfp
|
||||
macro_fft8_head
|
||||
macro_fft8_tail
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function fft8_vfp
|
||||
ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
|
||||
fmrx a2, FPSCR
|
||||
fmxr FPSCR, a3
|
||||
vpush {s16-s31}
|
||||
mov ip, lr
|
||||
bl .Lfft8_internal_vfp
|
||||
vpop {s16-s31}
|
||||
fmxr FPSCR, a2
|
||||
bx ip
|
||||
endfunc
|
||||
|
||||
.align 3
|
||||
cos1pi4: @ cos(1*pi/4) = sqrt(2)
|
||||
.float 0.707106769084930419921875
|
||||
cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
|
||||
.float 0.92387950420379638671875
|
||||
cos3pi8: @ cos(2*pi/8) = sqrt(2-sqrt(2))/2
|
||||
.float 0.3826834261417388916015625
|
||||
|
||||
function .Lfft16_internal_vfp
|
||||
macro_fft8_head
|
||||
@ FFT4(z+8)
|
||||
vldr d10, [a1, #8 * 2*4]
|
||||
vldr d12, [a1, #9 * 2*4]
|
||||
vldr d11, [a1, #10 * 2*4]
|
||||
vldr d13, [a1, #11 * 2*4]
|
||||
macro_fft8_tail
|
||||
vadd.f s16, s20, s24 @ vector op
|
||||
@ FFT4(z+12)
|
||||
vldr d4, [a1, #12 * 2*4]
|
||||
vldr d6, [a1, #13 * 2*4]
|
||||
vldr d5, [a1, #14 * 2*4]
|
||||
vsub.f s20, s20, s24 @ vector op
|
||||
vldr d7, [a1, #15 * 2*4]
|
||||
vadd.f s0, s16, s18
|
||||
vsub.f s4, s16, s18
|
||||
vadd.f s1, s17, s19
|
||||
vsub.f s5, s17, s19
|
||||
vadd.f s7, s21, s22
|
||||
vsub.f s3, s21, s22
|
||||
vadd.f s2, s20, s23
|
||||
vsub.f s6, s20, s23
|
||||
vadd.f s16, s8, s12 @ vector op
|
||||
vstr d0, [a1, #8 * 2*4]
|
||||
vstr d2, [a1, #10 * 2*4]
|
||||
vstr d1, [a1, #9 * 2*4]
|
||||
vsub.f s20, s8, s12
|
||||
vstr d3, [a1, #11 * 2*4]
|
||||
@ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
|
||||
vldr d12, [a1, #10 * 2*4]
|
||||
vadd.f s0, s16, s18
|
||||
vadd.f s1, s17, s19
|
||||
vsub.f s6, s16, s18
|
||||
vsub.f s7, s17, s19
|
||||
vsub.f s3, s21, s22
|
||||
vadd.f s2, s20, s23
|
||||
vadd.f s5, s21, s22
|
||||
vsub.f s4, s20, s23
|
||||
vstr d0, [a1, #12 * 2*4]
|
||||
vmov s0, s6
|
||||
@ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
|
||||
vldr d6, [a1, #9 * 2*4]
|
||||
vstr d1, [a1, #13 * 2*4]
|
||||
vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
|
||||
vstr d2, [a1, #15 * 2*4]
|
||||
vldr d7, [a1, #13 * 2*4]
|
||||
vadd.f s4, s25, s24
|
||||
vsub.f s5, s25, s24
|
||||
vsub.f s6, s0, s7
|
||||
vadd.f s7, s0, s7
|
||||
vmul.f s20, s12, s3 @ vector op
|
||||
@ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
|
||||
vldr d4, [a1, #11 * 2*4]
|
||||
vldr d5, [a1, #15 * 2*4]
|
||||
vldr s1, cos3pi8
|
||||
vmul.f s24, s4, s2 @ vector * scalar op
|
||||
vmul.f s28, s12, s1 @ vector * scalar op
|
||||
vmul.f s12, s8, s1 @ vector * scalar op
|
||||
vadd.f s4, s20, s29
|
||||
vsub.f s5, s21, s28
|
||||
vsub.f s6, s22, s31
|
||||
vadd.f s7, s23, s30
|
||||
vmul.f s8, s8, s3 @ vector * scalar op
|
||||
vldr d8, [a1, #1 * 2*4]
|
||||
vldr d9, [a1, #5 * 2*4]
|
||||
vldr d10, [a1, #3 * 2*4]
|
||||
vldr d11, [a1, #7 * 2*4]
|
||||
vldr d14, [a1, #2 * 2*4]
|
||||
vadd.f s0, s6, s4
|
||||
vadd.f s1, s5, s7
|
||||
vsub.f s2, s5, s7
|
||||
vsub.f s3, s6, s4
|
||||
vadd.f s4, s12, s9
|
||||
vsub.f s5, s13, s8
|
||||
vsub.f s6, s14, s11
|
||||
vadd.f s7, s15, s10
|
||||
vadd.f s12, s0, s16 @ vector op
|
||||
vstr d0, [a1, #1 * 2*4]
|
||||
vstr d1, [a1, #5 * 2*4]
|
||||
vldr d4, [a1, #1 * 2*4]
|
||||
vldr d5, [a1, #5 * 2*4]
|
||||
vadd.f s0, s6, s4
|
||||
vadd.f s1, s5, s7
|
||||
vsub.f s2, s5, s7
|
||||
vsub.f s3, s6, s4
|
||||
vsub.f s8, s16, s8 @ vector op
|
||||
vstr d6, [a1, #1 * 2*4]
|
||||
vstr d7, [a1, #5 * 2*4]
|
||||
vldr d15, [a1, #6 * 2*4]
|
||||
vsub.f s4, s20, s0
|
||||
vsub.f s5, s21, s1
|
||||
vsub.f s6, s22, s2
|
||||
vsub.f s7, s23, s3
|
||||
vadd.f s20, s0, s20 @ vector op
|
||||
vstr d4, [a1, #9 * 2*4]
|
||||
@ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
|
||||
vldr d6, [a1, #8 * 2*4]
|
||||
vstr d5, [a1, #13 * 2*4]
|
||||
vldr d7, [a1, #12 * 2*4]
|
||||
vstr d2, [a1, #11 * 2*4]
|
||||
vldr d8, [a1, #0 * 2*4]
|
||||
vstr d3, [a1, #15 * 2*4]
|
||||
vldr d9, [a1, #4 * 2*4]
|
||||
vadd.f s0, s26, s24
|
||||
vadd.f s1, s25, s27
|
||||
vsub.f s2, s25, s27
|
||||
vsub.f s3, s26, s24
|
||||
vadd.f s4, s14, s12
|
||||
vadd.f s5, s13, s15
|
||||
vsub.f s6, s13, s15
|
||||
vsub.f s7, s14, s12
|
||||
vadd.f s8, s0, s28 @ vector op
|
||||
vstr d0, [a1, #3 * 2*4]
|
||||
vstr d1, [a1, #7 * 2*4]
|
||||
vldr d6, [a1, #3 * 2*4]
|
||||
vldr d7, [a1, #7 * 2*4]
|
||||
vsub.f s0, s16, s4
|
||||
vsub.f s1, s17, s5
|
||||
vsub.f s2, s18, s6
|
||||
vsub.f s3, s19, s7
|
||||
vsub.f s12, s28, s12 @ vector op
|
||||
vadd.f s16, s4, s16 @ vector op
|
||||
vstr d10, [a1, #3 * 2*4]
|
||||
vstr d11, [a1, #7 * 2*4]
|
||||
vstr d4, [a1, #2 * 2*4]
|
||||
vstr d5, [a1, #6 * 2*4]
|
||||
vstr d0, [a1, #8 * 2*4]
|
||||
vstr d1, [a1, #12 * 2*4]
|
||||
vstr d6, [a1, #10 * 2*4]
|
||||
vstr d7, [a1, #14 * 2*4]
|
||||
vstr d8, [a1, #0 * 2*4]
|
||||
vstr d9, [a1, #4 * 2*4]
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
function ff_fft16_vfp, export=1
|
||||
ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
|
||||
fmrx a2, FPSCR
|
||||
fmxr FPSCR, a3
|
||||
vpush {s16-s31}
|
||||
mov ip, lr
|
||||
bl .Lfft16_internal_vfp
|
||||
vpop {s16-s31}
|
||||
fmxr FPSCR, a2
|
||||
bx ip
|
||||
endfunc
|
||||
|
||||
.macro pass n, z0, z1, z2, z3
|
||||
add v6, v5, #4*2*\n
|
||||
@ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
|
||||
@ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
|
||||
@ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
|
||||
@ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
|
||||
vldr d8, [\z2, #8*(o2+1)] @ s16,s17
|
||||
vldmdb v6!, {s2}
|
||||
vldr d9, [\z3, #8*(o3+1)] @ s18,s19
|
||||
vldmia v5!, {s0,s1} @ s0 is unused
|
||||
vldr s7, [\z2, #8*o2] @ t1
|
||||
vmul.f s20, s16, s2 @ vector * scalar
|
||||
vldr s0, [\z3, #8*o3] @ t5
|
||||
vldr s6, [\z2, #8*o2+4] @ t2
|
||||
vldr s3, [\z3, #8*o3+4] @ t6
|
||||
vmul.f s16, s16, s1 @ vector * scalar
|
||||
ldr a4, =\n-1
|
||||
1: add \z0, \z0, #8*2
|
||||
.if \n*4*2 >= 512
|
||||
add \z1, \z1, #8*2
|
||||
.endif
|
||||
.if \n*4*2 >= 256
|
||||
add \z2, \z2, #8*2
|
||||
.endif
|
||||
.if \n*4*2 >= 512
|
||||
add \z3, \z3, #8*2
|
||||
.endif
|
||||
@ up to 2 stalls (VFP vector issuing / waiting for s0)
|
||||
@ depending upon whether this is the first iteration and
|
||||
@ how many add instructions are inserted above
|
||||
vadd.f s4, s0, s7 @ t5
|
||||
vadd.f s5, s6, s3 @ t6
|
||||
vsub.f s6, s6, s3 @ t4
|
||||
vsub.f s7, s0, s7 @ t3
|
||||
vldr d6, [\z0, #8*0-8*2] @ s12,s13
|
||||
vadd.f s0, s16, s21 @ t1
|
||||
vldr d7, [\z1, #8*o1-8*2] @ s14,s15
|
||||
vsub.f s1, s18, s23 @ t5
|
||||
vadd.f s8, s4, s12 @ vector + vector
|
||||
@ stall (VFP vector issuing)
|
||||
@ stall (VFP vector issuing)
|
||||
@ stall (VFP vector issuing)
|
||||
vsub.f s4, s12, s4
|
||||
vsub.f s5, s13, s5
|
||||
vsub.f s6, s14, s6
|
||||
vsub.f s7, s15, s7
|
||||
vsub.f s2, s17, s20 @ t2
|
||||
vadd.f s3, s19, s22 @ t6
|
||||
vstr d4, [\z0, #8*0-8*2] @ s8,s9
|
||||
vstr d5, [\z1, #8*o1-8*2] @ s10,s11
|
||||
@ stall (waiting for s5)
|
||||
vstr d2, [\z2, #8*o2-8*2] @ s4,s5
|
||||
vadd.f s4, s1, s0 @ t5
|
||||
vstr d3, [\z3, #8*o3-8*2] @ s6,s7
|
||||
vsub.f s7, s1, s0 @ t3
|
||||
vadd.f s5, s2, s3 @ t6
|
||||
vsub.f s6, s2, s3 @ t4
|
||||
vldr d6, [\z0, #8*1-8*2] @ s12,s13
|
||||
vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15
|
||||
vldr d4, [\z2, #8*o2] @ s8,s9
|
||||
vldmdb v6!, {s2,s3}
|
||||
vldr d5, [\z3, #8*o3] @ s10,s11
|
||||
vadd.f s20, s4, s12 @ vector + vector
|
||||
vldmia v5!, {s0,s1}
|
||||
vldr d8, [\z2, #8*(o2+1)] @ s16,s17
|
||||
@ stall (VFP vector issuing)
|
||||
vsub.f s4, s12, s4
|
||||
vsub.f s5, s13, s5
|
||||
vsub.f s6, s14, s6
|
||||
vsub.f s7, s15, s7
|
||||
vmul.f s12, s8, s3 @ vector * scalar
|
||||
vstr d10, [\z0, #8*1-8*2] @ s20,s21
|
||||
vldr d9, [\z3, #8*(o3+1)] @ s18,s19
|
||||
vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23
|
||||
vmul.f s8, s8, s0 @ vector * scalar
|
||||
vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5
|
||||
@ stall (waiting for s7)
|
||||
vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7
|
||||
vmul.f s20, s16, s2 @ vector * scalar
|
||||
@ stall (VFP vector issuing)
|
||||
@ stall (VFP vector issuing)
|
||||
@ stall (VFP vector issuing)
|
||||
vadd.f s7, s8, s13 @ t1
|
||||
vsub.f s6, s9, s12 @ t2
|
||||
vsub.f s0, s10, s15 @ t5
|
||||
vadd.f s3, s11, s14 @ t6
|
||||
vmul.f s16, s16, s1 @ vector * scalar
|
||||
subs a4, a4, #1
|
||||
bne 1b
|
||||
@ What remains is identical to the first two indentations of
|
||||
@ the above, but without the increment of z
|
||||
vadd.f s4, s0, s7 @ t5
|
||||
vadd.f s5, s6, s3 @ t6
|
||||
vsub.f s6, s6, s3 @ t4
|
||||
vsub.f s7, s0, s7 @ t3
|
||||
vldr d6, [\z0, #8*0] @ s12,s13
|
||||
vadd.f s0, s16, s21 @ t1
|
||||
vldr d7, [\z1, #8*o1] @ s14,s15
|
||||
vsub.f s1, s18, s23 @ t5
|
||||
vadd.f s8, s4, s12 @ vector + vector
|
||||
vsub.f s4, s12, s4
|
||||
vsub.f s5, s13, s5
|
||||
vsub.f s6, s14, s6
|
||||
vsub.f s7, s15, s7
|
||||
vsub.f s2, s17, s20 @ t2
|
||||
vadd.f s3, s19, s22 @ t6
|
||||
vstr d4, [\z0, #8*0] @ s8,s9
|
||||
vstr d5, [\z1, #8*o1] @ s10,s11
|
||||
vstr d2, [\z2, #8*o2] @ s4,s5
|
||||
vadd.f s4, s1, s0 @ t5
|
||||
vstr d3, [\z3, #8*o3] @ s6,s7
|
||||
vsub.f s7, s1, s0 @ t3
|
||||
vadd.f s5, s2, s3 @ t6
|
||||
vsub.f s6, s2, s3 @ t4
|
||||
vldr d6, [\z0, #8*1] @ s12,s13
|
||||
vldr d7, [\z1, #8*(o1+1)] @ s14,s15
|
||||
vadd.f s20, s4, s12 @ vector + vector
|
||||
vsub.f s4, s12, s4
|
||||
vsub.f s5, s13, s5
|
||||
vsub.f s6, s14, s6
|
||||
vsub.f s7, s15, s7
|
||||
vstr d10, [\z0, #8*1] @ s20,s21
|
||||
vstr d11, [\z1, #8*(o1+1)] @ s22,s23
|
||||
vstr d2, [\z2, #8*(o2+1)] @ s4,s5
|
||||
vstr d3, [\z3, #8*(o3+1)] @ s6,s7
|
||||
.endm
|
||||
|
||||
.macro def_fft n, n2, n4
|
||||
function .Lfft\n\()_internal_vfp
|
||||
.if \n >= 512
|
||||
push {v1-v6,lr}
|
||||
.elseif \n >= 256
|
||||
push {v1-v2,v5-v6,lr}
|
||||
.else
|
||||
push {v1,v5-v6,lr}
|
||||
.endif
|
||||
mov v1, a1
|
||||
bl .Lfft\n2\()_internal_vfp
|
||||
add a1, v1, #8*(\n/4)*2
|
||||
bl .Lfft\n4\()_internal_vfp
|
||||
movrelx v5, X(ff_cos_\n), a1
|
||||
add a1, v1, #8*(\n/4)*3
|
||||
bl .Lfft\n4\()_internal_vfp
|
||||
.if \n >= 512
|
||||
.set o1, 0*(\n/4/2)
|
||||
.set o2, 0*(\n/4/2)
|
||||
.set o3, 0*(\n/4/2)
|
||||
add v2, v1, #8*2*(\n/4/2)
|
||||
add v3, v1, #8*4*(\n/4/2)
|
||||
add v4, v1, #8*6*(\n/4/2)
|
||||
pass (\n/4/2), v1, v2, v3, v4
|
||||
pop {v1-v6,pc}
|
||||
.elseif \n >= 256
|
||||
.set o1, 2*(\n/4/2)
|
||||
.set o2, 0*(\n/4/2)
|
||||
.set o3, 2*(\n/4/2)
|
||||
add v2, v1, #8*4*(\n/4/2)
|
||||
pass (\n/4/2), v1, v1, v2, v2
|
||||
pop {v1-v2,v5-v6,pc}
|
||||
.else
|
||||
.set o1, 2*(\n/4/2)
|
||||
.set o2, 4*(\n/4/2)
|
||||
.set o3, 6*(\n/4/2)
|
||||
pass (\n/4/2), v1, v1, v1, v1
|
||||
pop {v1,v5-v6,pc}
|
||||
.endif
|
||||
endfunc
|
||||
|
||||
function fft\n\()_vfp
|
||||
ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */
|
||||
fmrx a2, FPSCR
|
||||
fmxr FPSCR, a3
|
||||
vpush {s16-s31}
|
||||
mov ip, lr
|
||||
bl .Lfft\n\()_internal_vfp
|
||||
vpop {s16-s31}
|
||||
fmxr FPSCR, a2
|
||||
bx ip
|
||||
endfunc
|
||||
|
||||
.ltorg
|
||||
.endm
|
||||
|
||||
def_fft 32, 16, 8
|
||||
def_fft 64, 32, 16
|
||||
def_fft 128, 64, 32
|
||||
def_fft 256, 128, 64
|
||||
def_fft 512, 256, 128
|
||||
def_fft 1024, 512, 256
|
||||
def_fft 2048, 1024, 512
|
||||
def_fft 4096, 2048, 1024
|
||||
def_fft 8192, 4096, 2048
|
||||
def_fft 16384, 8192, 4096
|
||||
def_fft 32768, 16384, 8192
|
||||
def_fft 65536, 32768, 16384
|
@ -1,301 +0,0 @@
|
||||
/*
|
||||
* ARM NEON optimised MDCT
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
#define ff_fft_calc_neon X(ff_fft_calc_neon)
|
||||
|
||||
function ff_imdct_half_neon, export=1
|
||||
push {r4-r8,lr}
|
||||
|
||||
mov r12, #1
|
||||
ldr lr, [r0, #20] @ mdct_bits
|
||||
ldr r4, [r0, #24] @ tcos
|
||||
ldr r3, [r0, #8] @ revtab
|
||||
lsl r12, r12, lr @ n = 1 << nbits
|
||||
lsr lr, r12, #2 @ n4 = n >> 2
|
||||
add r7, r2, r12, lsl #1
|
||||
mov r12, #-16
|
||||
sub r7, r7, #16
|
||||
|
||||
vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
|
||||
vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
|
||||
vrev64.32 d17, d17
|
||||
vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
|
||||
vmul.f32 d6, d17, d2
|
||||
vmul.f32 d7, d0, d2
|
||||
1:
|
||||
subs lr, lr, #2
|
||||
ldr r6, [r3], #4
|
||||
vmul.f32 d4, d0, d3
|
||||
vmul.f32 d5, d17, d3
|
||||
vsub.f32 d4, d6, d4
|
||||
vadd.f32 d5, d5, d7
|
||||
uxth r8, r6, ror #16
|
||||
uxth r6, r6
|
||||
add r8, r1, r8, lsl #3
|
||||
add r6, r1, r6, lsl #3
|
||||
beq 1f
|
||||
vld2.32 {d16-d17},[r7,:128],r12
|
||||
vld2.32 {d0-d1}, [r2,:128]!
|
||||
vrev64.32 d17, d17
|
||||
vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
|
||||
vmul.f32 d6, d17, d2
|
||||
vmul.f32 d7, d0, d2
|
||||
vst2.32 {d4[0],d5[0]}, [r6,:64]
|
||||
vst2.32 {d4[1],d5[1]}, [r8,:64]
|
||||
b 1b
|
||||
1:
|
||||
vst2.32 {d4[0],d5[0]}, [r6,:64]
|
||||
vst2.32 {d4[1],d5[1]}, [r8,:64]
|
||||
|
||||
mov r4, r0
|
||||
mov r6, r1
|
||||
bl ff_fft_calc_neon
|
||||
|
||||
mov r12, #1
|
||||
ldr lr, [r4, #20] @ mdct_bits
|
||||
ldr r4, [r4, #24] @ tcos
|
||||
lsl r12, r12, lr @ n = 1 << nbits
|
||||
lsr lr, r12, #3 @ n8 = n >> 3
|
||||
|
||||
add r4, r4, lr, lsl #3
|
||||
add r6, r6, lr, lsl #3
|
||||
sub r1, r4, #16
|
||||
sub r3, r6, #16
|
||||
|
||||
mov r7, #-16
|
||||
mov r8, r6
|
||||
mov r0, r3
|
||||
|
||||
vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
|
||||
vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
|
||||
vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
|
||||
1:
|
||||
subs lr, lr, #2
|
||||
vmul.f32 d7, d0, d18
|
||||
vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3
|
||||
vmul.f32 d4, d1, d18
|
||||
vmul.f32 d5, d21, d19
|
||||
vmul.f32 d6, d20, d19
|
||||
vmul.f32 d22, d1, d16
|
||||
vmul.f32 d23, d21, d17
|
||||
vmul.f32 d24, d0, d16
|
||||
vmul.f32 d25, d20, d17
|
||||
vadd.f32 d7, d7, d22
|
||||
vadd.f32 d6, d6, d23
|
||||
vsub.f32 d4, d4, d24
|
||||
vsub.f32 d5, d5, d25
|
||||
beq 1f
|
||||
vld2.32 {d0-d1}, [r3,:128], r7
|
||||
vld2.32 {d20-d21},[r6,:128]!
|
||||
vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
|
||||
vrev64.32 q3, q3
|
||||
vst2.32 {d4,d6}, [r0,:128], r7
|
||||
vst2.32 {d5,d7}, [r8,:128]!
|
||||
b 1b
|
||||
1:
|
||||
vrev64.32 q3, q3
|
||||
vst2.32 {d4,d6}, [r0,:128]
|
||||
vst2.32 {d5,d7}, [r8,:128]
|
||||
|
||||
pop {r4-r8,pc}
|
||||
endfunc
|
||||
|
||||
function ff_imdct_calc_neon, export=1
|
||||
push {r4-r6,lr}
|
||||
|
||||
ldr r3, [r0, #20]
|
||||
mov r4, #1
|
||||
mov r5, r1
|
||||
lsl r4, r4, r3
|
||||
add r1, r1, r4
|
||||
|
||||
bl X(ff_imdct_half_neon)
|
||||
|
||||
add r0, r5, r4, lsl #2
|
||||
add r1, r5, r4, lsl #1
|
||||
sub r0, r0, #8
|
||||
sub r2, r1, #16
|
||||
mov r3, #-16
|
||||
mov r6, #-8
|
||||
vmov.i32 d30, #1<<31
|
||||
1:
|
||||
vld1.32 {d0-d1}, [r2,:128], r3
|
||||
pld [r0, #-16]
|
||||
vrev64.32 q0, q0
|
||||
vld1.32 {d2-d3}, [r1,:128]!
|
||||
veor d4, d1, d30
|
||||
pld [r2, #-16]
|
||||
vrev64.32 q1, q1
|
||||
veor d5, d0, d30
|
||||
vst1.32 {d2}, [r0,:64], r6
|
||||
vst1.32 {d3}, [r0,:64], r6
|
||||
vst1.32 {d4-d5}, [r5,:128]!
|
||||
subs r4, r4, #16
|
||||
bgt 1b
|
||||
|
||||
pop {r4-r6,pc}
|
||||
endfunc
|
||||
|
||||
function ff_mdct_calc_neon, export=1
|
||||
push {r4-r10,lr}
|
||||
|
||||
mov r12, #1
|
||||
ldr lr, [r0, #20] @ mdct_bits
|
||||
ldr r4, [r0, #24] @ tcos
|
||||
ldr r3, [r0, #8] @ revtab
|
||||
lsl lr, r12, lr @ n = 1 << nbits
|
||||
add r7, r2, lr @ in4u
|
||||
sub r9, r7, #16 @ in4d
|
||||
add r2, r7, lr, lsl #1 @ in3u
|
||||
add r8, r9, lr, lsl #1 @ in3d
|
||||
add r5, r4, lr, lsl #1
|
||||
sub r5, r5, #16
|
||||
sub r3, r3, #4
|
||||
mov r12, #-16
|
||||
|
||||
vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
|
||||
vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
|
||||
vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
|
||||
vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
|
||||
vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
|
||||
vsub.f32 d0, d18, d0 @ in4d-in4u I
|
||||
vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
|
||||
vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
|
||||
vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
|
||||
vadd.f32 d1, d1, d19 @ in3u+in3d -R
|
||||
vsub.f32 d16, d16, d2 @ in0u-in2d R
|
||||
vadd.f32 d17, d17, d3 @ in2u+in1d -I
|
||||
1:
|
||||
vmul.f32 d7, d0, d21 @ I*s
|
||||
A ldr r10, [r3, lr, lsr #1]
|
||||
T lsr r10, lr, #1
|
||||
T ldr r10, [r3, r10]
|
||||
vmul.f32 d6, d1, d20 @ -R*c
|
||||
ldr r6, [r3, #4]!
|
||||
vmul.f32 d4, d1, d21 @ -R*s
|
||||
vmul.f32 d5, d0, d20 @ I*c
|
||||
vmul.f32 d24, d16, d30 @ R*c
|
||||
vmul.f32 d25, d17, d31 @ -I*s
|
||||
vmul.f32 d22, d16, d31 @ R*s
|
||||
vmul.f32 d23, d17, d30 @ I*c
|
||||
subs lr, lr, #16
|
||||
vsub.f32 d6, d6, d7 @ -R*c-I*s
|
||||
vadd.f32 d7, d4, d5 @ -R*s+I*c
|
||||
vsub.f32 d24, d25, d24 @ I*s-R*c
|
||||
vadd.f32 d25, d22, d23 @ R*s-I*c
|
||||
beq 1f
|
||||
mov r12, #-16
|
||||
vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
|
||||
vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
|
||||
vneg.f32 d7, d7 @ R*s-I*c
|
||||
vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
|
||||
vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
|
||||
vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
|
||||
vsub.f32 d0, d18, d0 @ in4d-in4u I
|
||||
vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
|
||||
vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
|
||||
vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
|
||||
vadd.f32 d1, d1, d19 @ in3u+in3d -R
|
||||
vsub.f32 d16, d16, d2 @ in0u-in2d R
|
||||
vadd.f32 d17, d17, d3 @ in2u+in1d -I
|
||||
uxth r12, r6, ror #16
|
||||
uxth r6, r6
|
||||
add r12, r1, r12, lsl #3
|
||||
add r6, r1, r6, lsl #3
|
||||
vst2.32 {d6[0],d7[0]}, [r6,:64]
|
||||
vst2.32 {d6[1],d7[1]}, [r12,:64]
|
||||
uxth r6, r10, ror #16
|
||||
uxth r10, r10
|
||||
add r6 , r1, r6, lsl #3
|
||||
add r10, r1, r10, lsl #3
|
||||
vst2.32 {d24[0],d25[0]},[r10,:64]
|
||||
vst2.32 {d24[1],d25[1]},[r6,:64]
|
||||
b 1b
|
||||
1:
|
||||
vneg.f32 d7, d7 @ R*s-I*c
|
||||
uxth r12, r6, ror #16
|
||||
uxth r6, r6
|
||||
add r12, r1, r12, lsl #3
|
||||
add r6, r1, r6, lsl #3
|
||||
vst2.32 {d6[0],d7[0]}, [r6,:64]
|
||||
vst2.32 {d6[1],d7[1]}, [r12,:64]
|
||||
uxth r6, r10, ror #16
|
||||
uxth r10, r10
|
||||
add r6 , r1, r6, lsl #3
|
||||
add r10, r1, r10, lsl #3
|
||||
vst2.32 {d24[0],d25[0]},[r10,:64]
|
||||
vst2.32 {d24[1],d25[1]},[r6,:64]
|
||||
|
||||
mov r4, r0
|
||||
mov r6, r1
|
||||
bl ff_fft_calc_neon
|
||||
|
||||
mov r12, #1
|
||||
ldr lr, [r4, #20] @ mdct_bits
|
||||
ldr r4, [r4, #24] @ tcos
|
||||
lsl r12, r12, lr @ n = 1 << nbits
|
||||
lsr lr, r12, #3 @ n8 = n >> 3
|
||||
|
||||
add r4, r4, lr, lsl #3
|
||||
add r6, r6, lr, lsl #3
|
||||
sub r1, r4, #16
|
||||
sub r3, r6, #16
|
||||
|
||||
mov r7, #-16
|
||||
mov r8, r6
|
||||
mov r0, r3
|
||||
|
||||
vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
|
||||
vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3
|
||||
vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
|
||||
1:
|
||||
subs lr, lr, #2
|
||||
vmul.f32 d7, d0, d18 @ r1*s1,r0*s0
|
||||
vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3
|
||||
vmul.f32 d4, d1, d18 @ i1*s1,i0*s0
|
||||
vmul.f32 d5, d21, d19 @ i2*s2,i3*s3
|
||||
vmul.f32 d6, d20, d19 @ r2*s2,r3*s3
|
||||
vmul.f32 d24, d0, d16 @ r1*c1,r0*c0
|
||||
vmul.f32 d25, d20, d17 @ r2*c2,r3*c3
|
||||
vmul.f32 d22, d21, d17 @ i2*c2,i3*c3
|
||||
vmul.f32 d23, d1, d16 @ i1*c1,i0*c0
|
||||
vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0
|
||||
vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3
|
||||
vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3
|
||||
vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0
|
||||
vneg.f32 q2, q2
|
||||
beq 1f
|
||||
vld2.32 {d0-d1}, [r3,:128], r7
|
||||
vld2.32 {d20-d21},[r6,:128]!
|
||||
vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
|
||||
vrev64.32 q3, q3
|
||||
vst2.32 {d4,d6}, [r0,:128], r7
|
||||
vst2.32 {d5,d7}, [r8,:128]!
|
||||
b 1b
|
||||
1:
|
||||
vrev64.32 q3, q3
|
||||
vst2.32 {d4,d6}, [r0,:128]
|
||||
vst2.32 {d5,d7}, [r8,:128]
|
||||
|
||||
pop {r4-r10,pc}
|
||||
endfunc
|
@ -1,347 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2013 RISC OS Open Ltd
|
||||
* Author: Ben Avison <bavison@riscosopen.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
CONTEXT .req a1
|
||||
ORIGOUT .req a2
|
||||
IN .req a3
|
||||
OUT .req v1
|
||||
REVTAB .req v2
|
||||
TCOS .req v3
|
||||
TSIN .req v4
|
||||
OLDFPSCR .req v5
|
||||
J0 .req a2
|
||||
J1 .req a4
|
||||
J2 .req ip
|
||||
J3 .req lr
|
||||
REVTAB_HI .req v5
|
||||
IN_HI .req v6
|
||||
OUT_HI .req v6
|
||||
TCOS_HI .req sl
|
||||
TSIN_HI .req fp
|
||||
|
||||
.macro prerotation_innerloop
|
||||
.set trig_lo, k
|
||||
.set trig_hi, n4 - k - 2
|
||||
.set in_lo, trig_lo * 2
|
||||
.set in_hi, trig_hi * 2
|
||||
vldr d8, [TCOS, #trig_lo*4] @ s16,s17
|
||||
vldr d9, [TCOS, #trig_hi*4] @ s18,s19
|
||||
vldr s0, [IN, #in_hi*4 + 12]
|
||||
vldr s1, [IN, #in_hi*4 + 4]
|
||||
vldr s2, [IN, #in_lo*4 + 12]
|
||||
vldr s3, [IN, #in_lo*4 + 4]
|
||||
vmul.f s8, s0, s16 @ vector operation
|
||||
vldr d10, [TSIN, #trig_lo*4] @ s20,s21
|
||||
vldr d11, [TSIN, #trig_hi*4] @ s22,s23
|
||||
vldr s4, [IN, #in_lo*4]
|
||||
vldr s5, [IN, #in_lo*4 + 8]
|
||||
vldr s6, [IN, #in_hi*4]
|
||||
vldr s7, [IN, #in_hi*4 + 8]
|
||||
ldr J0, [REVTAB, #trig_lo*2]
|
||||
vmul.f s12, s0, s20 @ vector operation
|
||||
ldr J2, [REVTAB, #trig_hi*2]
|
||||
mov J1, J0, lsr #16
|
||||
and J0, J0, #255 @ halfword value will be < n4
|
||||
vmls.f s8, s4, s20 @ vector operation
|
||||
mov J3, J2, lsr #16
|
||||
and J2, J2, #255 @ halfword value will be < n4
|
||||
add J0, OUT, J0, lsl #3
|
||||
vmla.f s12, s4, s16 @ vector operation
|
||||
add J1, OUT, J1, lsl #3
|
||||
add J2, OUT, J2, lsl #3
|
||||
add J3, OUT, J3, lsl #3
|
||||
vstr s8, [J0]
|
||||
vstr s9, [J1]
|
||||
vstr s10, [J2]
|
||||
vstr s11, [J3]
|
||||
vstr s12, [J0, #4]
|
||||
vstr s13, [J1, #4]
|
||||
vstr s14, [J2, #4]
|
||||
vstr s15, [J3, #4]
|
||||
.set k, k + 2
|
||||
.endm
|
||||
|
||||
.macro prerotation_innerloop_rolled
|
||||
vldmia TCOS!, {s16,s17}
|
||||
vldmdb TCOS_HI!, {s18,s19}
|
||||
vldr s0, [IN_HI, #-4]
|
||||
vldr s1, [IN_HI, #-12]
|
||||
vldr s2, [IN, #12]
|
||||
vldr s3, [IN, #4]
|
||||
vmul.f s8, s0, s16 @ vector operation
|
||||
vldmia TSIN!, {s20,s21}
|
||||
vldmdb TSIN_HI!, {s22,s23}
|
||||
vldr s4, [IN]
|
||||
vldr s5, [IN, #8]
|
||||
vldr s6, [IN_HI, #-16]
|
||||
vldr s7, [IN_HI, #-8]
|
||||
vmul.f s12, s0, s20 @ vector operation
|
||||
add IN, IN, #16
|
||||
sub IN_HI, IN_HI, #16
|
||||
ldrh J0, [REVTAB], #2
|
||||
ldrh J1, [REVTAB], #2
|
||||
vmls.f s8, s4, s20 @ vector operation
|
||||
ldrh J3, [REVTAB_HI, #-2]!
|
||||
ldrh J2, [REVTAB_HI, #-2]!
|
||||
add J0, OUT, J0, lsl #3
|
||||
vmla.f s12, s4, s16 @ vector operation
|
||||
add J1, OUT, J1, lsl #3
|
||||
add J2, OUT, J2, lsl #3
|
||||
add J3, OUT, J3, lsl #3
|
||||
vstr s8, [J0]
|
||||
vstr s9, [J1]
|
||||
vstr s10, [J2]
|
||||
vstr s11, [J3]
|
||||
vstr s12, [J0, #4]
|
||||
vstr s13, [J1, #4]
|
||||
vstr s14, [J2, #4]
|
||||
vstr s15, [J3, #4]
|
||||
.endm
|
||||
|
||||
.macro postrotation_innerloop tail, head
|
||||
.set trig_lo_head, n8 - k - 2
|
||||
.set trig_hi_head, n8 + k
|
||||
.set out_lo_head, trig_lo_head * 2
|
||||
.set out_hi_head, trig_hi_head * 2
|
||||
.set trig_lo_tail, n8 - (k - 2) - 2
|
||||
.set trig_hi_tail, n8 + (k - 2)
|
||||
.set out_lo_tail, trig_lo_tail * 2
|
||||
.set out_hi_tail, trig_hi_tail * 2
|
||||
.if (k & 2) == 0
|
||||
TCOS_D0_HEAD .req d10 @ s20,s21
|
||||
TCOS_D1_HEAD .req d11 @ s22,s23
|
||||
TCOS_S0_TAIL .req s24
|
||||
.else
|
||||
TCOS_D0_HEAD .req d12 @ s24,s25
|
||||
TCOS_D1_HEAD .req d13 @ s26,s27
|
||||
TCOS_S0_TAIL .req s20
|
||||
.endif
|
||||
.ifnc "\tail",""
|
||||
vmls.f s8, s0, TCOS_S0_TAIL @ vector operation
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17
|
||||
vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19
|
||||
vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
|
||||
.endif
|
||||
.ifnc "\tail",""
|
||||
vmla.f s12, s4, TCOS_S0_TAIL @ vector operation
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vldr s0, [OUT, #out_lo_head*4]
|
||||
vldr s1, [OUT, #out_lo_head*4 + 8]
|
||||
vldr s2, [OUT, #out_hi_head*4]
|
||||
vldr s3, [OUT, #out_hi_head*4 + 8]
|
||||
vldr s4, [OUT, #out_lo_head*4 + 4]
|
||||
vldr s5, [OUT, #out_lo_head*4 + 12]
|
||||
vldr s6, [OUT, #out_hi_head*4 + 4]
|
||||
vldr s7, [OUT, #out_hi_head*4 + 12]
|
||||
.endif
|
||||
.ifnc "\tail",""
|
||||
vstr s8, [OUT, #out_lo_tail*4]
|
||||
vstr s9, [OUT, #out_lo_tail*4 + 8]
|
||||
vstr s10, [OUT, #out_hi_tail*4]
|
||||
vstr s11, [OUT, #out_hi_tail*4 + 8]
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vmul.f s8, s4, s16 @ vector operation
|
||||
.endif
|
||||
.ifnc "\tail",""
|
||||
vstr s12, [OUT, #out_hi_tail*4 + 12]
|
||||
vstr s13, [OUT, #out_hi_tail*4 + 4]
|
||||
vstr s14, [OUT, #out_lo_tail*4 + 12]
|
||||
vstr s15, [OUT, #out_lo_tail*4 + 4]
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vmul.f s12, s0, s16 @ vector operation
|
||||
vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
|
||||
.endif
|
||||
.unreq TCOS_D0_HEAD
|
||||
.unreq TCOS_D1_HEAD
|
||||
.unreq TCOS_S0_TAIL
|
||||
.ifnc "\head",""
|
||||
.set k, k + 2
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
|
||||
.ifnc "\tail",""
|
||||
vmls.f s8, s0, \tcos_s0_tail @ vector operation
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vldmia TSIN!, {s16,s17}
|
||||
vldmdb TSIN_HI!, {s18,s19}
|
||||
vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head}
|
||||
.endif
|
||||
.ifnc "\tail",""
|
||||
vmla.f s12, s4, \tcos_s0_tail @ vector operation
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vldr s0, [OUT, #+\out_offset_head+0]
|
||||
vldr s1, [OUT, #+\out_offset_head+8]
|
||||
vldr s2, [OUT_HI, #-\out_offset_head-16]
|
||||
vldr s3, [OUT_HI, #-\out_offset_head-8]
|
||||
vldr s4, [OUT, #+\out_offset_head+4]
|
||||
vldr s5, [OUT, #+\out_offset_head+12]
|
||||
vldr s6, [OUT_HI, #-\out_offset_head-12]
|
||||
vldr s7, [OUT_HI, #-\out_offset_head-4]
|
||||
.endif
|
||||
.ifnc "\tail",""
|
||||
vstr s8, [OUT, #+\out_offset_tail+0]
|
||||
vstr s9, [OUT, #+\out_offset_tail+8]
|
||||
vstr s10, [OUT_HI, #-\out_offset_tail-16]
|
||||
vstr s11, [OUT_HI, #-\out_offset_tail-8]
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vmul.f s8, s4, s16 @ vector operation
|
||||
.endif
|
||||
.ifnc "\tail",""
|
||||
vstr s12, [OUT_HI, #-\out_offset_tail-4]
|
||||
vstr s13, [OUT_HI, #-\out_offset_tail-12]
|
||||
vstr s14, [OUT, #+\out_offset_tail+12]
|
||||
vstr s15, [OUT, #+\out_offset_tail+4]
|
||||
.endif
|
||||
.ifnc "\head",""
|
||||
vmul.f s12, s0, s16 @ vector operation
|
||||
vldmdb TCOS_HI!, {\tcos_s2_head,\tcos_s3_head}
|
||||
.endif
|
||||
.endm
|
||||
|
||||
|
||||
/* void ff_imdct_half_vfp(FFTContext *s,
|
||||
* FFTSample *output,
|
||||
* const FFTSample *input)
|
||||
*/
|
||||
function ff_imdct_half_vfp, export=1
|
||||
ldr ip, [CONTEXT, #5*4] @ mdct_bits
|
||||
teq ip, #6
|
||||
bne 10f
|
||||
|
||||
.set n, 1<<6
|
||||
.set n2, n/2
|
||||
.set n4, n/4
|
||||
.set n8, n/8
|
||||
|
||||
push {v1-v5,lr}
|
||||
vpush {s16-s27}
|
||||
fmrx OLDFPSCR, FPSCR
|
||||
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
fmxr FPSCR, lr
|
||||
mov OUT, ORIGOUT
|
||||
ldr REVTAB, [CONTEXT, #2*4]
|
||||
ldr TCOS, [CONTEXT, #6*4]
|
||||
ldr TSIN, [CONTEXT, #7*4]
|
||||
|
||||
.set k, 0
|
||||
.rept n8/2
|
||||
prerotation_innerloop
|
||||
.endr
|
||||
|
||||
fmxr FPSCR, OLDFPSCR
|
||||
mov a1, OUT
|
||||
bl X(ff_fft16_vfp)
|
||||
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
fmxr FPSCR, lr
|
||||
|
||||
.set k, 0
|
||||
postrotation_innerloop , head
|
||||
.rept n8/2 - 1
|
||||
postrotation_innerloop tail, head
|
||||
.endr
|
||||
postrotation_innerloop tail
|
||||
|
||||
fmxr FPSCR, OLDFPSCR
|
||||
vpop {s16-s27}
|
||||
pop {v1-v5,pc}
|
||||
|
||||
10:
|
||||
push {v1-v6,sl,fp,lr}
|
||||
vpush {s16-s27}
|
||||
fmrx OLDFPSCR, FPSCR
|
||||
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
fmxr FPSCR, lr
|
||||
mov lr, #1
|
||||
mov OUT, ORIGOUT
|
||||
ldr REVTAB, [CONTEXT, #2*4]
|
||||
ldr TCOS, [CONTEXT, #6*4]
|
||||
ldr TSIN, [CONTEXT, #7*4]
|
||||
mov lr, lr, lsl ip
|
||||
|
||||
push {CONTEXT,OLDFPSCR}
|
||||
add IN_HI, IN, lr, lsl #1
|
||||
add REVTAB_HI, REVTAB, lr, lsr #1
|
||||
add TCOS_HI, TCOS, lr
|
||||
add TSIN_HI, TSIN, lr
|
||||
0: prerotation_innerloop_rolled
|
||||
teq IN, IN_HI
|
||||
bne 0b
|
||||
ldmia sp, {CONTEXT,OLDFPSCR}
|
||||
|
||||
mov ORIGOUT, OUT
|
||||
fmxr FPSCR, OLDFPSCR
|
||||
ldr ip, [CONTEXT, #9*4]
|
||||
blx ip @ s->fft_calc(s, output)
|
||||
|
||||
pop {CONTEXT,OLDFPSCR}
|
||||
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
ldr ip, [CONTEXT, #5*4] @ mdct_bits
|
||||
fmxr FPSCR, lr
|
||||
mov lr, #1
|
||||
mov lr, lr, lsl ip
|
||||
sub TCOS, TCOS, lr, lsr #1
|
||||
sub TSIN, TSIN, lr, lsr #1
|
||||
add OUT_HI, OUT, lr, lsl #1
|
||||
add TCOS_HI, TCOS, lr
|
||||
add TSIN_HI, TSIN, lr
|
||||
postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0
|
||||
b 1f
|
||||
0: add OUT, OUT, #32
|
||||
sub OUT_HI, OUT_HI, #32
|
||||
postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
|
||||
1: postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
|
||||
teq TSIN, TSIN_HI
|
||||
bne 0b
|
||||
postrotation_innerloop_rolled tail,,,,,, s24,, 16
|
||||
|
||||
fmxr FPSCR, OLDFPSCR
|
||||
vpop {s16-s27}
|
||||
pop {v1-v6,sl,fp,pc}
|
||||
endfunc
|
||||
|
||||
.unreq CONTEXT
|
||||
.unreq ORIGOUT
|
||||
.unreq IN
|
||||
.unreq OUT
|
||||
.unreq REVTAB
|
||||
.unreq TCOS
|
||||
.unreq TSIN
|
||||
.unreq OLDFPSCR
|
||||
.unreq J0
|
||||
.unreq J1
|
||||
.unreq J2
|
||||
.unreq J3
|
||||
.unreq REVTAB_HI
|
||||
.unreq IN_HI
|
||||
.unreq OUT_HI
|
||||
.unreq TCOS_HI
|
||||
.unreq TSIN_HI
|
@ -1,33 +0,0 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"

#include "libavcodec/rdft.h"

void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);

av_cold void ff_rdft_init_arm(RDFTContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags))
        s->rdft_calc = ff_rdft_calc_neon;
}
@ -1,155 +0,0 @@
|
||||
/*
|
||||
* ARM NEON optimised RDFT
|
||||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/arm/asm.S"
|
||||
|
||||
function ff_rdft_calc_neon, export=1
|
||||
push {r4-r8,lr}
|
||||
|
||||
ldr r6, [r0, #4] @ inverse
|
||||
mov r4, r0
|
||||
mov r5, r1
|
||||
|
||||
lsls r6, r6, #31
|
||||
bne 1f
|
||||
add r0, r4, #24
|
||||
bl X(ff_fft_permute_neon)
|
||||
add r0, r4, #24
|
||||
mov r1, r5
|
||||
bl X(ff_fft_calc_neon)
|
||||
1:
|
||||
ldr r12, [r4, #0] @ nbits
|
||||
mov r2, #1
|
||||
ldr r8, [r4, #20] @ negative_sin
|
||||
lsl r12, r2, r12
|
||||
add r0, r5, #8
|
||||
lsl r8, r8, #31
|
||||
add r1, r5, r12, lsl #2
|
||||
lsr r12, r12, #2
|
||||
vdup.32 d26, r8
|
||||
ldr r2, [r4, #12] @ tcos
|
||||
sub r12, r12, #2
|
||||
ldr r3, [r4, #16] @ tsin
|
||||
mov r7, r0
|
||||
sub r1, r1, #8
|
||||
mov lr, r1
|
||||
mov r8, #-8
|
||||
vld1.32 {d0}, [r0,:64]! @ d1[0,1]
|
||||
vld1.32 {d1}, [r1,:64], r8 @ d2[0,1]
|
||||
vld1.32 {d4}, [r2,:64]! @ tcos[i]
|
||||
vld1.32 {d5}, [r3,:64]! @ tsin[i]
|
||||
vmov.f32 d18, #0.5 @ k1
|
||||
vdup.32 d19, r6
|
||||
veor d5, d26, d5
|
||||
pld [r0, #32]
|
||||
veor d19, d18, d19 @ k2
|
||||
vmov.i32 d16, #0
|
||||
vmov.i32 d17, #1<<31
|
||||
pld [r1, #-32]
|
||||
vtrn.32 d16, d17
|
||||
pld [r2, #32]
|
||||
vrev64.32 d16, d16 @ d16=1,0 d17=0,1
|
||||
pld [r3, #32]
|
||||
2:
|
||||
veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1]
|
||||
vld1.32 {d24}, [r0,:64]! @ d1[0,1]
|
||||
vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1]
|
||||
vld1.32 {d25}, [r1,:64], r8 @ d2[0,1]
|
||||
vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1]
|
||||
veor q3, q12, q8 @ -d1[0],d1[1], d2[0],-d2[1]
|
||||
pld [r0, #32]
|
||||
vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re
|
||||
pld [r1, #-32]
|
||||
vadd.f32 d0, d24, d7 @ d1[0]+d2[0], d1[1]-d2[1]
|
||||
vadd.f32 d1, d6, d25 @ -d1[0]+d2[0], d1[1]+d2[1]
|
||||
vmul.f32 q11, q0, q9 @ ev.re, ev.im, od.im, od.re
|
||||
veor d7, d21, d16 @ -od.im, od.re
|
||||
vrev64.32 d3, d21 @ od.re, od.im
|
||||
veor d6, d20, d17 @ ev.re,-ev.im
|
||||
veor d2, d3, d16 @ -od.re, od.im
|
||||
vmla.f32 d20, d3, d4[1]
|
||||
vmla.f32 d20, d7, d5[1]
|
||||
vmla.f32 d6, d2, d4[1]
|
||||
vmla.f32 d6, d21, d5[1]
|
||||
vld1.32 {d4}, [r2,:64]! @ tcos[i]
|
||||
veor d7, d23, d16 @ -od.im, od.re
|
||||
vld1.32 {d5}, [r3,:64]! @ tsin[i]
|
||||
veor d24, d22, d17 @ ev.re,-ev.im
|
||||
vrev64.32 d3, d23 @ od.re, od.im
|
||||
veor d5, d26, d5
|
||||
pld [r2, #32]
|
||||
veor d2, d3, d16 @ -od.re, od.im
|
||||
pld [r3, #32]
|
||||
vmla.f32 d22, d3, d4[0]
|
||||
vmla.f32 d22, d7, d5[0]
|
||||
vmla.f32 d24, d2, d4[0]
|
||||
vmla.f32 d24, d23, d5[0]
|
||||
vld1.32 {d0}, [r0,:64]! @ d1[0,1]
|
||||
vld1.32 {d1}, [r1,:64], r8 @ d2[0,1]
|
||||
vst1.32 {d20}, [r7,:64]!
|
||||
vst1.32 {d6}, [lr,:64], r8
|
||||
vst1.32 {d22}, [r7,:64]!
|
||||
vst1.32 {d24}, [lr,:64], r8
|
||||
subs r12, r12, #2
|
||||
bgt 2b
|
||||
|
||||
veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1]
|
||||
vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1]
|
||||
vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1]
|
||||
ldr r2, [r4, #8] @ sign_convention
|
||||
vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re
|
||||
add r0, r0, #4
|
||||
bfc r2, #0, #31
|
||||
vld1.32 {d0[0]}, [r0,:32]
|
||||
veor d7, d21, d16 @ -od.im, od.re
|
||||
vrev64.32 d3, d21 @ od.re, od.im
|
||||
veor d6, d20, d17 @ ev.re,-ev.im
|
||||
vld1.32 {d22}, [r5,:64]
|
||||
vdup.32 d1, r2
|
||||
vmov d23, d22
|
||||
veor d2, d3, d16 @ -od.re, od.im
|
||||
vtrn.32 d22, d23
|
||||
veor d0, d0, d1
|
||||
veor d23, d23, d17
|
||||
vmla.f32 d20, d3, d4[1]
|
||||
vmla.f32 d20, d7, d5[1]
|
||||
vmla.f32 d6, d2, d4[1]
|
||||
vmla.f32 d6, d21, d5[1]
|
||||
vadd.f32 d22, d22, d23
|
||||
vst1.32 {d20}, [r7,:64]
|
||||
vst1.32 {d6}, [lr,:64]
|
||||
vst1.32 {d0[0]}, [r0,:32]
|
||||
vst1.32 {d22}, [r5,:64]
|
||||
|
||||
cmp r6, #0
|
||||
it eq
|
||||
popeq {r4-r8,pc}
|
||||
|
||||
vmul.f32 d22, d22, d18
|
||||
vst1.32 {d22}, [r5,:64]
|
||||
add r0, r4, #24
|
||||
mov r1, r5
|
||||
bl X(ff_fft_permute_neon)
|
||||
add r0, r4, #24
|
||||
mov r1, r5
|
||||
pop {r4-r8,lr}
|
||||
b X(ff_fft_calc_neon)
|
||||
endfunc
|
@ -23,7 +23,6 @@
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavcodec/fft.h"
#include "libavcodec/synth_filter.h"

void ff_synth_filter_float_vfp(AVTXContext *imdct,
@ -1,80 +0,0 @@
|
||||
/*
|
||||
* Generate a header file for hardcoded ff_cos_* tables
|
||||
*
|
||||
* Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "libavutil/mathematics.h"
|
||||
|
||||
#define BITS 17
|
||||
#define FLOATFMT "%.18e"
|
||||
#define FIXEDFMT "%6d"
|
||||
|
||||
static int clip_f15(int v)
|
||||
{
|
||||
return v < -32767 ? -32767 :
|
||||
v > 32767 ? 32767 :
|
||||
v;
|
||||
}
|
||||
|
||||
static void printval(double val, int fixed)
|
||||
{
|
||||
if (fixed) {
|
||||
/* lrint() isn't always available, so round and cast manually. */
|
||||
double new_val = val * (double) (1 << 15);
|
||||
|
||||
new_val = new_val >= 0 ? floor(new_val + 0.5) : ceil(new_val - 0.5);
|
||||
|
||||
printf(" "FIXEDFMT",", clip_f15((long int) new_val));
|
||||
} else {
|
||||
printf(" "FLOATFMT",", val);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i, j;
|
||||
int do_sin = argc > 1 && !strcmp(argv[1], "sin");
|
||||
int fixed = argc > 1 && strstr(argv[1], "fixed");
|
||||
double (*func)(double) = do_sin ? sin : cos;
|
||||
|
||||
printf("/* This file was automatically generated. */\n");
|
||||
printf("#define FFT_FLOAT %d\n", !fixed);
|
||||
printf("#include \"libavcodec/%s\"\n", do_sin ? "rdft.h" : "fft.h");
|
||||
for (i = 4; i <= BITS; i++) {
|
||||
int m = 1 << i;
|
||||
double freq = 2*M_PI/m;
|
||||
printf("%s(%i) = {\n ", do_sin ? "SINTABLE" : "COSTABLE", m);
|
||||
for (j = 0; j < m/2 - 1; j++) {
|
||||
int idx = j > m/4 ? m/2 - j : j;
|
||||
if (do_sin && j >= m/4)
|
||||
idx = m/4 - j;
|
||||
printval(func(idx*freq), fixed);
|
||||
if ((j & 3) == 3)
|
||||
printf("\n ");
|
||||
}
|
||||
printval(func(do_sin ? -(m/4 - 1)*freq : freq), fixed);
|
||||
printf("\n};\n");
|
||||
}
|
||||
return 0;
|
||||
}
|
228
libavcodec/dct.c
@ -1,228 +0,0 @@
|
||||
/*
|
||||
* (I)DCT Transforms
|
||||
* Copyright (c) 2009 Peter Ross <pross@xvid.org>
|
||||
* Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
|
||||
* Copyright (c) 2010 Vitor Sessak
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* (Inverse) Discrete Cosine Transforms. These are also known as the
|
||||
* type II and type III DCTs respectively.
|
||||
*/
|
||||
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "libavutil/error.h"
|
||||
#include "libavutil/mathematics.h"
|
||||
#include "libavutil/mem.h"
|
||||
#include "dct.h"
|
||||
#include "dct32.h"
|
||||
|
||||
/* sin((M_PI * x / (2 * n)) */
|
||||
#define SIN(s, n, x) (s->costab[(n) - (x)])
|
||||
|
||||
/* cos((M_PI * x / (2 * n)) */
|
||||
#define COS(s, n, x) (s->costab[x])
|
||||
|
||||
static void dst_calc_I_c(DCTContext *ctx, FFTSample *data)
|
||||
{
|
||||
int n = 1 << ctx->nbits;
|
||||
int i;
|
||||
|
||||
data[0] = 0;
|
||||
for (i = 1; i < n / 2; i++) {
|
||||
float tmp1 = data[i ];
|
||||
float tmp2 = data[n - i];
|
||||
float s = SIN(ctx, n, 2 * i);
|
||||
|
||||
s *= tmp1 + tmp2;
|
||||
tmp1 = (tmp1 - tmp2) * 0.5f;
|
||||
data[i] = s + tmp1;
|
||||
data[n - i] = s - tmp1;
|
||||
}
|
||||
|
||||
data[n / 2] *= 2;
|
||||
ctx->rdft.rdft_calc(&ctx->rdft, data);
|
||||
|
||||
data[0] *= 0.5f;
|
||||
|
||||
for (i = 1; i < n - 2; i += 2) {
|
||||
data[i + 1] += data[i - 1];
|
||||
data[i] = -data[i + 2];
|
||||
}
|
||||
|
||||
data[n - 1] = 0;
|
||||
}
|
||||
|
||||
static void dct_calc_I_c(DCTContext *ctx, FFTSample *data)
|
||||
{
|
||||
int n = 1 << ctx->nbits;
|
||||
int i;
|
||||
float next = -0.5f * (data[0] - data[n]);
|
||||
|
||||
for (i = 0; i < n / 2; i++) {
|
||||
float tmp1 = data[i];
|
||||
float tmp2 = data[n - i];
|
||||
float s = SIN(ctx, n, 2 * i);
|
||||
float c = COS(ctx, n, 2 * i);
|
||||
|
||||
c *= tmp1 - tmp2;
|
||||
s *= tmp1 - tmp2;
|
||||
|
||||
next += c;
|
||||
|
||||
tmp1 = (tmp1 + tmp2) * 0.5f;
|
||||
data[i] = tmp1 - s;
|
||||
data[n - i] = tmp1 + s;
|
||||
}
|
||||
|
||||
ctx->rdft.rdft_calc(&ctx->rdft, data);
|
||||
data[n] = data[1];
|
||||
data[1] = next;
|
||||
|
||||
for (i = 3; i <= n; i += 2)
|
||||
data[i] = data[i - 2] - data[i];
|
||||
}
|
||||
|
||||
static void dct_calc_III_c(DCTContext *ctx, FFTSample *data)
|
||||
{
|
||||
int n = 1 << ctx->nbits;
|
||||
int i;
|
||||
|
||||
float next = data[n - 1];
|
||||
float inv_n = 1.0f / n;
|
||||
|
||||
for (i = n - 2; i >= 2; i -= 2) {
|
||||
float val1 = data[i];
|
||||
float val2 = data[i - 1] - data[i + 1];
|
||||
float c = COS(ctx, n, i);
|
||||
float s = SIN(ctx, n, i);
|
||||
|
||||
data[i] = c * val1 + s * val2;
|
||||
data[i + 1] = s * val1 - c * val2;
|
||||
}
|
||||
|
||||
data[1] = 2 * next;
|
||||
|
||||
ctx->rdft.rdft_calc(&ctx->rdft, data);
|
||||
|
||||
for (i = 0; i < n / 2; i++) {
|
||||
float tmp1 = data[i] * inv_n;
|
||||
float tmp2 = data[n - i - 1] * inv_n;
|
||||
float csc = ctx->csc2[i] * (tmp1 - tmp2);
|
||||
|
||||
tmp1 += tmp2;
|
||||
data[i] = tmp1 + csc;
|
||||
data[n - i - 1] = tmp1 - csc;
|
||||
}
|
||||
}
|
||||
|
||||
static void dct_calc_II_c(DCTContext *ctx, FFTSample *data)
|
||||
{
|
||||
int n = 1 << ctx->nbits;
|
||||
int i;
|
||||
float next;
|
||||
|
||||
for (i = 0; i < n / 2; i++) {
|
||||
float tmp1 = data[i];
|
||||
float tmp2 = data[n - i - 1];
|
||||
float s = SIN(ctx, n, 2 * i + 1);
|
||||
|
||||
s *= tmp1 - tmp2;
|
||||
tmp1 = (tmp1 + tmp2) * 0.5f;
|
||||
|
||||
data[i] = tmp1 + s;
|
||||
data[n-i-1] = tmp1 - s;
|
||||
}
|
||||
|
||||
ctx->rdft.rdft_calc(&ctx->rdft, data);
|
||||
|
||||
next = data[1] * 0.5;
|
||||
data[1] *= -1;
|
||||
|
||||
for (i = n - 2; i >= 0; i -= 2) {
|
||||
float inr = data[i ];
|
||||
float ini = data[i + 1];
|
||||
float c = COS(ctx, n, i);
|
||||
float s = SIN(ctx, n, i);
|
||||
|
||||
data[i] = c * inr + s * ini;
|
||||
data[i + 1] = next;
|
||||
|
||||
next += s * inr - c * ini;
|
||||
}
|
||||
}
|
||||
|
||||
static void dct32_func(DCTContext *ctx, FFTSample *data)
|
||||
{
|
||||
ctx->dct32(data, data);
|
||||
}
|
||||
|
||||
av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
|
||||
{
|
||||
int n = 1 << nbits;
|
||||
int i;
|
||||
int ret;
|
||||
|
||||
memset(s, 0, sizeof(*s));
|
||||
|
||||
s->nbits = nbits;
|
||||
s->inverse = inverse;
|
||||
|
||||
if (inverse == DCT_II && nbits == 5) {
|
||||
s->dct_calc = dct32_func;
|
||||
} else {
|
||||
ff_init_ff_cos_tabs(nbits + 2);
|
||||
|
||||
s->costab = ff_cos_tabs[nbits + 2];
|
||||
s->csc2 = av_malloc_array(n / 2, sizeof(FFTSample));
|
||||
if (!s->csc2)
|
||||
return AVERROR(ENOMEM);
|
||||
|
||||
if ((ret = ff_rdft_init(&s->rdft, nbits, inverse == DCT_III)) < 0) {
|
||||
av_freep(&s->csc2);
|
||||
return ret;
|
||||
}
|
||||
|
||||
for (i = 0; i < n / 2; i++)
|
||||
s->csc2[i] = 0.5 / sin((M_PI / (2 * n) * (2 * i + 1)));
|
||||
|
||||
switch (inverse) {
|
||||
case DCT_I : s->dct_calc = dct_calc_I_c; break;
|
||||
case DCT_II : s->dct_calc = dct_calc_II_c; break;
|
||||
case DCT_III: s->dct_calc = dct_calc_III_c; break;
|
||||
case DST_I : s->dct_calc = dst_calc_I_c; break;
|
||||
}
|
||||
}
|
||||
|
||||
s->dct32 = ff_dct32_float;
|
||||
#if ARCH_X86
|
||||
ff_dct_init_x86(s);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
av_cold void ff_dct_end(DCTContext *s)
|
||||
{
|
||||
ff_rdft_end(&s->rdft);
|
||||
av_freep(&s->csc2);
|
||||
}
|
@ -21,37 +21,12 @@
|
||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#if !defined(AVCODEC_DCT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT)
|
||||
#ifndef AVCODEC_DCT_H
|
||||
#define AVCODEC_DCT_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "rdft.h"
|
||||
|
||||
struct DCTContext {
|
||||
int nbits;
|
||||
int inverse;
|
||||
RDFTContext rdft;
|
||||
const float *costab;
|
||||
FFTSample *csc2;
|
||||
void (*dct_calc)(struct DCTContext *s, FFTSample *data);
|
||||
void (*dct32)(FFTSample *out, const FFTSample *in);
|
||||
};
|
||||
|
||||
/**
|
||||
* Set up DCT.
|
||||
* @param nbits size of the input array:
|
||||
* (1 << nbits) for DCT-II, DCT-III and DST-I
|
||||
* (1 << nbits) + 1 for DCT-I
|
||||
*
|
||||
* @note the first element of the input of DST-I is ignored
|
||||
*/
|
||||
int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType type);
|
||||
void ff_dct_end (DCTContext *s);
|
||||
|
||||
void ff_dct_init_x86(DCTContext *s);
|
||||
|
||||
void ff_j_rev_dct(int16_t *data);
|
||||
void ff_j_rev_dct4(int16_t *data);
|
||||
void ff_j_rev_dct2(int16_t *data);
|
||||
|
@ -1,62 +0,0 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_FFT_INTERNAL_H
#define AVCODEC_FFT_INTERNAL_H

#include "libavutil/mathematics.h"
#include "fft.h"

#if FFT_FLOAT

#define FIX15(v)  (v)
#define sqrthalf  (float)M_SQRT1_2

#define BF(x, y, a, b) do { \
    x = a - b;              \
    y = a + b;              \
} while (0)

#define CMUL(dre, dim, are, aim, bre, bim) do { \
    (dre) = (are) * (bre) - (aim) * (bim);      \
    (dim) = (are) * (bim) + (aim) * (bre);      \
} while (0)

#else /* FFT_FLOAT */

#define CMUL(dre, dim, are, aim, bre, bim) do {   \
    int64_t accu;                                 \
    (accu)  = (int64_t)(bre) * (are);             \
    (accu) -= (int64_t)(bim) * (aim);             \
    (dre)   = (int)(((accu) + 0x40000000) >> 31); \
    (accu)  = (int64_t)(bre) * (aim);             \
    (accu) += (int64_t)(bim) * (are);             \
    (dim)   = (int)(((accu) + 0x40000000) >> 31); \
} while (0)

#endif /* FFT_FLOAT */

#define ff_imdct_calc_c FFT_NAME(ff_imdct_calc_c)
#define ff_imdct_half_c FFT_NAME(ff_imdct_half_c)
#define ff_mdct_calc_c  FFT_NAME(ff_mdct_calc_c)

void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_mdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);

#endif /* AVCODEC_FFT_INTERNAL_H */
160
libavcodec/fft.h
@ -1,160 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2000, 2001, 2002 Fabrice Bellard
|
||||
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_FFT_H
#define AVCODEC_FFT_H

#ifndef FFT_FLOAT
#define FFT_FLOAT 1
#endif

#include <stdint.h>
#include "config.h"

#include "libavutil/attributes_internal.h"
#include "libavutil/mem_internal.h"

#if FFT_FLOAT

#include "avfft.h"

#define FFT_NAME(x) x

typedef float FFTDouble;

#else

#define Q31(x) (int)((x)*2147483648.0 + 0.5)
#define FFT_NAME(x) x ## _fixed_32

typedef int32_t FFTSample;

typedef struct FFTComplex {
    FFTSample re, im;
} FFTComplex;

typedef int FFTDouble;
typedef struct FFTContext FFTContext;

#endif /* FFT_FLOAT */

typedef struct FFTDComplex {
    FFTDouble re, im;
} FFTDComplex;

/* FFT computation */

enum fft_permutation_type {
    FF_FFT_PERM_DEFAULT,
    FF_FFT_PERM_SWAP_LSBS,
    FF_FFT_PERM_AVX,
};

enum mdct_permutation_type {
    FF_MDCT_PERM_NONE,
    FF_MDCT_PERM_INTERLEAVE,
};

struct FFTContext {
    int nbits;
    int inverse;
    uint16_t *revtab;
    FFTComplex *tmp_buf;
    int mdct_size; /* size of MDCT (i.e. number of input data * 2) */
    int mdct_bits; /* n = 2^nbits */
    /* pre/post rotation tables */
    FFTSample *tcos;
    FFTSample *tsin;
    /**
     * Do the permutation needed BEFORE calling fft_calc().
     */
    void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
    /**
     * Do a complex FFT with the parameters defined in ff_fft_init(). The
     * input data must be permuted before. No 1.0/sqrt(n) normalization is done.
     */
    void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
    void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
    void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
    void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
    enum fft_permutation_type fft_permutation;
    enum mdct_permutation_type mdct_permutation;
    uint32_t *revtab32;
};

#if CONFIG_HARDCODED_TABLES
#define COSTABLE_CONST const
#define ff_init_ff_cos_tabs(index)
#else
#define COSTABLE_CONST
#define ff_init_ff_cos_tabs FFT_NAME(ff_init_ff_cos_tabs)

/**
 * Initialize the cosine table in ff_cos_tabs[index]
 * @param index index in ff_cos_tabs array of the table to initialize
 */
void ff_init_ff_cos_tabs(int index);
#endif

#define COSTABLE(size) \
    COSTABLE_CONST attribute_visibility_hidden DECLARE_ALIGNED(32, FFTSample, FFT_NAME(ff_cos_##size))[size/2]

extern COSTABLE(16);
extern COSTABLE(32);
extern COSTABLE(64);
extern COSTABLE(128);
extern COSTABLE(256);
extern COSTABLE(512);
extern COSTABLE(1024);
extern COSTABLE(2048);
extern COSTABLE(4096);
extern COSTABLE(8192);
extern COSTABLE(16384);
extern COSTABLE(32768);
extern COSTABLE(65536);
extern COSTABLE(131072);
extern COSTABLE_CONST FFTSample* const FFT_NAME(ff_cos_tabs)[18];

#define ff_fft_init FFT_NAME(ff_fft_init)
#define ff_fft_end  FFT_NAME(ff_fft_end)

/**
 * Set up a complex FFT.
 * @param nbits log2 of the length of the input array
 * @param inverse if 0 perform the forward transform, if 1 perform the inverse
 */
int ff_fft_init(FFTContext *s, int nbits, int inverse);

void ff_fft_init_aarch64(FFTContext *s);
void ff_fft_init_x86(FFTContext *s);
void ff_fft_init_arm(FFTContext *s);
void ff_fft_init_mips(FFTContext *s);
void ff_fft_init_ppc(FFTContext *s);

void ff_fft_end(FFTContext *s);

#define ff_mdct_init FFT_NAME(ff_mdct_init)
#define ff_mdct_end  FFT_NAME(ff_mdct_end)

int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale);
void ff_mdct_end(FFTContext *s);

#endif /* AVCODEC_FFT_H */
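For code that previously included this header, the same transforms are available from the AVTX API in libavutil/tx.h. Below is a minimal sketch of the float complex-FFT case, assuming unit scale and distinct input/output buffers; the helper name fft_with_avtx and its simplified error handling are illustrative, not something from the tree:

#include "libavutil/tx.h"

/* Hypothetical helper, not part of the tree: forward FFT of len complex
 * floats via AVTX. out and in are kept distinct because AV_TX_INPLACE is
 * not requested here. */
static int fft_with_avtx(AVComplexFloat *out, AVComplexFloat *in, int len)
{
    AVTXContext *ctx = NULL;
    av_tx_fn tx;
    float scale = 1.0f;
    int ret = av_tx_init(&ctx, &tx, AV_TX_FLOAT_FFT, 0 /* forward */, len, &scale, 0);
    if (ret < 0)
        return ret;
    /* Unlike the fft_permute()/fft_calc() pair above, no separate
     * permutation pass is needed: input and output are in natural order. */
    tx(ctx, out, in, sizeof(AVComplexFloat));
    av_tx_uninit(&ctx);
    return 0;
}

The MDCT entry points declared above have an AV_TX_FLOAT_MDCT counterpart that is set up through the same av_tx_init() call.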
@ -1,51 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2012
|
||||
* MIPS Technologies, Inc., California.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* Authors: Stanislav Ocovaj (socovaj@mips.com)
|
||||
* Goran Cordasic (goran@mips.com)
|
||||
* Djordje Pesut (djordje@mips.com)
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#define FFT_FLOAT 0
|
||||
#include "fft_template.c"
|
@ -1,20 +0,0 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#define FFT_FLOAT 1
|
||||
#include "fft_template.c"
|
@ -1,344 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2012
|
||||
* MIPS Technologies, Inc., California.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* Authors: Stanislav Ocovaj (socovaj@mips.com)
|
||||
* Goran Cordasic (goran@mips.com)
|
||||
* Djordje Pesut (djordje@mips.com)
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
 * definitions and initialization of the FFT look-up tables
|
||||
*/
|
||||
#include "libavutil/thread.h"
|
||||
|
||||
#include "libavcodec/fft_table.h"
|
||||
|
||||
const int32_t ff_w_tab_sr[MAX_FFT_SIZE/(4*16)] = {
|
||||
2147483647, 2147483016, 2147481121, 2147477963, 2147473542, 2147467857, 2147460908, 2147452697,
|
||||
2147443222, 2147432484, 2147420483, 2147407218, 2147392690, 2147376899, 2147359845, 2147341527,
|
||||
2147321946, 2147301102, 2147278995, 2147255625, 2147230991, 2147205094, 2147177934, 2147149511,
|
||||
2147119825, 2147088876, 2147056664, 2147023188, 2146988450, 2146952448, 2146915184, 2146876656,
|
||||
2146836866, 2146795813, 2146753497, 2146709917, 2146665076, 2146618971, 2146571603, 2146522973,
|
||||
2146473080, 2146421924, 2146369505, 2146315824, 2146260881, 2146204674, 2146147205, 2146088474,
|
||||
2146028480, 2145967224, 2145904705, 2145840924, 2145775880, 2145709574, 2145642006, 2145573176,
|
||||
2145503083, 2145431729, 2145359112, 2145285233, 2145210092, 2145133690, 2145056025, 2144977098,
|
||||
2144896910, 2144815460, 2144732748, 2144648774, 2144563539, 2144477042, 2144389283, 2144300264,
|
||||
2144209982, 2144118439, 2144025635, 2143931570, 2143836244, 2143739656, 2143641807, 2143542697,
|
||||
2143442326, 2143340694, 2143237802, 2143133648, 2143028234, 2142921559, 2142813624, 2142704427,
|
||||
2142593971, 2142482254, 2142369276, 2142255039, 2142139541, 2142022783, 2141904764, 2141785486,
|
||||
2141664948, 2141543150, 2141420092, 2141295774, 2141170197, 2141043360, 2140915264, 2140785908,
|
||||
2140655293, 2140523418, 2140390284, 2140255892, 2140120240, 2139983329, 2139845159, 2139705730,
|
||||
2139565043, 2139423097, 2139279892, 2139135429, 2138989708, 2138842728, 2138694490, 2138544994,
|
||||
2138394240, 2138242228, 2138088958, 2137934430, 2137778644, 2137621601, 2137463301, 2137303743,
|
||||
2137142927, 2136980855, 2136817525, 2136652938, 2136487095, 2136319994, 2136151637, 2135982023,
|
||||
2135811153, 2135639026, 2135465642, 2135291003, 2135115107, 2134937956, 2134759548, 2134579885,
|
||||
2134398966, 2134216791, 2134033361, 2133848675, 2133662734, 2133475538, 2133287087, 2133097381,
|
||||
2132906420, 2132714204, 2132520734, 2132326009, 2132130030, 2131932796, 2131734309, 2131534567,
|
||||
2131333572, 2131131322, 2130927819, 2130723062, 2130517052, 2130309789, 2130101272, 2129891502,
|
||||
2129680480, 2129468204, 2129254676, 2129039895, 2128823862, 2128606576, 2128388038, 2128168248,
|
||||
2127947206, 2127724913, 2127501367, 2127276570, 2127050522, 2126823222, 2126594672, 2126364870,
|
||||
2126133817, 2125901514, 2125667960, 2125433155, 2125197100, 2124959795, 2124721240, 2124481435,
|
||||
2124240380, 2123998076, 2123754522, 2123509718, 2123263666, 2123016364, 2122767814, 2122518015,
|
||||
2122266967, 2122014670, 2121761126, 2121506333, 2121250292, 2120993003, 2120734467, 2120474683,
|
||||
2120213651, 2119951372, 2119687847, 2119423074, 2119157054, 2118889788, 2118621275, 2118351516,
|
||||
2118080511, 2117808259, 2117534762, 2117260020, 2116984031, 2116706797, 2116428319, 2116148595,
|
||||
2115867626, 2115585412, 2115301954, 2115017252, 2114731305, 2114444114, 2114155680, 2113866001,
|
||||
2113575080, 2113282914, 2112989506, 2112694855, 2112398960, 2112101824, 2111803444, 2111503822,
|
||||
2111202959, 2110900853, 2110597505, 2110292916, 2109987085, 2109680013, 2109371700, 2109062146,
|
||||
2108751352, 2108439317, 2108126041, 2107811526, 2107495770, 2107178775, 2106860540, 2106541065,
|
||||
2106220352, 2105898399, 2105575208, 2105250778, 2104925109, 2104598202, 2104270057, 2103940674,
|
||||
2103610054, 2103278196, 2102945101, 2102610768, 2102275199, 2101938393, 2101600350, 2101261071,
|
||||
2100920556, 2100578805, 2100235819, 2099891596, 2099546139, 2099199446, 2098851519, 2098502357,
|
||||
2098151960, 2097800329, 2097447464, 2097093365, 2096738032, 2096381466, 2096023667, 2095664635,
|
||||
2095304370, 2094942872, 2094580142, 2094216179, 2093850985, 2093484559, 2093116901, 2092748012,
|
||||
2092377892, 2092006541, 2091633960, 2091260147, 2090885105, 2090508833, 2090131331, 2089752599,
|
||||
2089372638, 2088991448, 2088609029, 2088225381, 2087840505, 2087454400, 2087067068, 2086678508,
|
||||
2086288720, 2085897705, 2085505463, 2085111994, 2084717298, 2084321376, 2083924228, 2083525854,
|
||||
2083126254, 2082725429, 2082323379, 2081920103, 2081515603, 2081109879, 2080702930, 2080294757,
|
||||
2079885360, 2079474740, 2079062896, 2078649830, 2078235540, 2077820028, 2077403294, 2076985338,
|
||||
2076566160, 2076145760, 2075724139, 2075301296, 2074877233, 2074451950, 2074025446, 2073597721,
|
||||
2073168777, 2072738614, 2072307231, 2071874629, 2071440808, 2071005769, 2070569511, 2070132035,
|
||||
2069693342, 2069253430, 2068812302, 2068369957, 2067926394, 2067481616, 2067035621, 2066588410,
|
||||
2066139983, 2065690341, 2065239484, 2064787411, 2064334124, 2063879623, 2063423908, 2062966978,
|
||||
2062508835, 2062049479, 2061588910, 2061127128, 2060664133, 2060199927, 2059734508, 2059267877,
|
||||
2058800036, 2058330983, 2057860719, 2057389244, 2056916560, 2056442665, 2055967560, 2055491246,
|
||||
2055013723, 2054534991, 2054055050, 2053573901, 2053091544, 2052607979, 2052123207, 2051637227,
|
||||
2051150040, 2050661647, 2050172048, 2049681242, 2049189231, 2048696014, 2048201592, 2047705965,
|
||||
2047209133, 2046711097, 2046211857, 2045711414, 2045209767, 2044706916, 2044202863, 2043697608,
|
||||
2043191150, 2042683490, 2042174628, 2041664565, 2041153301, 2040640837, 2040127172, 2039612306,
|
||||
2039096241, 2038578976, 2038060512, 2037540850, 2037019988, 2036497928, 2035974670, 2035450215,
|
||||
2034924562, 2034397712, 2033869665, 2033340422, 2032809982, 2032278347, 2031745516, 2031211490,
|
||||
2030676269, 2030139853, 2029602243, 2029063439, 2028523442, 2027982251, 2027439867, 2026896291,
|
||||
2026351522, 2025805561, 2025258408, 2024710064, 2024160529, 2023609803, 2023057887, 2022504780,
|
||||
2021950484, 2021394998, 2020838323, 2020280460, 2019721407, 2019161167, 2018599739, 2018037123,
|
||||
2017473321, 2016908331, 2016342155, 2015774793, 2015206245, 2014636511, 2014065592, 2013493489,
|
||||
2012920201, 2012345729, 2011770073, 2011193233, 2010615210, 2010036005, 2009455617, 2008874047,
|
||||
2008291295, 2007707362, 2007122248, 2006535953, 2005948478, 2005359822, 2004769987, 2004178973,
|
||||
2003586779, 2002993407, 2002398857, 2001803128, 2001206222, 2000608139, 2000008879, 1999408442,
|
||||
1998806829, 1998204040, 1997600076, 1996994937, 1996388622, 1995781134, 1995172471, 1994562635,
|
||||
1993951625, 1993339442, 1992726087, 1992111559, 1991495860, 1990878989, 1990260946, 1989641733,
|
||||
1989021350, 1988399796, 1987777073, 1987153180, 1986528118, 1985901888, 1985274489, 1984645923,
|
||||
1984016189, 1983385288, 1982753220, 1982119985, 1981485585, 1980850019, 1980213288, 1979575392,
|
||||
1978936331, 1978296106, 1977654717, 1977012165, 1976368450, 1975723572, 1975077532, 1974430331,
|
||||
1973781967, 1973132443, 1972481757, 1971829912, 1971176906, 1970522741, 1969867417, 1969210933,
|
||||
1968553292, 1967894492, 1967234535, 1966573420, 1965911148, 1965247720, 1964583136, 1963917396,
|
||||
1963250501, 1962582451, 1961913246, 1961242888, 1960571375, 1959898709, 1959224890, 1958549919,
|
||||
1957873796, 1957196520, 1956518093, 1955838516, 1955157788, 1954475909, 1953792881, 1953108703,
|
||||
1952423377, 1951736902, 1951049279, 1950360508, 1949670589, 1948979524, 1948287312, 1947593954,
|
||||
1946899451, 1946203802, 1945507008, 1944809070, 1944109987, 1943409761, 1942708392, 1942005880,
|
||||
1941302225, 1940597428, 1939891490, 1939184411, 1938476190, 1937766830, 1937056329, 1936344689,
|
||||
1935631910, 1934917992, 1934202936, 1933486742, 1932769411, 1932050943, 1931331338, 1930610597,
|
||||
1929888720, 1929165708, 1928441561, 1927716279, 1926989864, 1926262315, 1925533633, 1924803818,
|
||||
1924072871, 1923340791, 1922607581, 1921873239, 1921137767, 1920401165, 1919663432, 1918924571,
|
||||
1918184581, 1917443462, 1916701216, 1915957841, 1915213340, 1914467712, 1913720958, 1912973078,
|
||||
1912224073, 1911473942, 1910722688, 1909970309, 1909216806, 1908462181, 1907706433, 1906949562,
|
||||
1906191570, 1905432457, 1904672222, 1903910867, 1903148392, 1902384797, 1901620084, 1900854251,
|
||||
1900087301, 1899319232, 1898550047, 1897779744, 1897008325, 1896235790, 1895462140, 1894687374,
|
||||
1893911494, 1893134500, 1892356392, 1891577171, 1890796837, 1890015391, 1889232832, 1888449163,
|
||||
1887664383, 1886878492, 1886091491, 1885303381, 1884514161, 1883723833, 1882932397, 1882139853,
|
||||
1881346202, 1880551444, 1879755580, 1878958610, 1878160535, 1877361354, 1876561070, 1875759681,
|
||||
1874957189, 1874153594, 1873348897, 1872543097, 1871736196, 1870928194, 1870119091, 1869308888,
|
||||
1868497586, 1867685184, 1866871683, 1866057085, 1865241388, 1864424594, 1863606704, 1862787717,
|
||||
1861967634, 1861146456, 1860324183, 1859500816, 1858676355, 1857850800, 1857024153, 1856196413,
|
||||
1855367581, 1854537657, 1853706643, 1852874538, 1852041343, 1851207059, 1850371686, 1849535224,
|
||||
1848697674, 1847859036, 1847019312, 1846178501, 1845336604, 1844493621, 1843649553, 1842804401,
|
||||
1841958164, 1841110844, 1840262441, 1839412956, 1838562388, 1837710739, 1836858008, 1836004197,
|
||||
1835149306, 1834293336, 1833436286, 1832578158, 1831718951, 1830858668, 1829997307, 1829134869,
|
||||
1828271356, 1827406767, 1826541103, 1825674364, 1824806552, 1823937666, 1823067707, 1822196675,
|
||||
1821324572, 1820451397, 1819577151, 1818701835, 1817825449, 1816947994, 1816069469, 1815189877,
|
||||
1814309216, 1813427489, 1812544694, 1811660833, 1810775906, 1809889915, 1809002858, 1808114737,
|
||||
1807225553, 1806335305, 1805443995, 1804551623, 1803658189, 1802763694, 1801868139, 1800971523,
|
||||
1800073849, 1799175115, 1798275323, 1797374472, 1796472565, 1795569601, 1794665580, 1793760504,
|
||||
1792854372, 1791947186, 1791038946, 1790129652, 1789219305, 1788307905, 1787395453, 1786481950,
|
||||
1785567396, 1784651792, 1783735137, 1782817434, 1781898681, 1780978881, 1780058032, 1779136137,
|
||||
1778213194, 1777289206, 1776364172, 1775438094, 1774510970, 1773582803, 1772653593, 1771723340,
|
||||
1770792044, 1769859707, 1768926328, 1767991909, 1767056450, 1766119952, 1765182414, 1764243838,
|
||||
1763304224, 1762363573, 1761421885, 1760479161, 1759535401, 1758590607, 1757644777, 1756697914,
|
||||
1755750017, 1754801087, 1753851126, 1752900132, 1751948107, 1750995052, 1750040966, 1749085851,
|
||||
1748129707, 1747172535, 1746214334, 1745255107, 1744294853, 1743333573, 1742371267, 1741407936,
|
||||
1740443581, 1739478202, 1738511799, 1737544374, 1736575927, 1735606458, 1734635968, 1733664458,
|
||||
1732691928, 1731718378, 1730743810, 1729768224, 1728791620, 1727813999, 1726835361, 1725855708,
|
||||
1724875040, 1723893357, 1722910659, 1721926948, 1720942225, 1719956488, 1718969740, 1717981981,
|
||||
1716993211, 1716003431, 1715012642, 1714020844, 1713028037, 1712034223, 1711039401, 1710043573,
|
||||
1709046739, 1708048900, 1707050055, 1706050207, 1705049355, 1704047500, 1703044642, 1702040783,
|
||||
1701035922, 1700030061, 1699023199, 1698015339, 1697006479, 1695996621, 1694985765, 1693973912,
|
||||
1692961062, 1691947217, 1690932376, 1689916541, 1688899711, 1687881888, 1686863072, 1685843263,
|
||||
1684822463, 1683800672, 1682777890, 1681754118, 1680729357, 1679703608, 1678676870, 1677649144,
|
||||
1676620432, 1675590733, 1674560049, 1673528379, 1672495725, 1671462087, 1670427466, 1669391862,
|
||||
1668355276, 1667317709, 1666279161, 1665239632, 1664199124, 1663157637, 1662115172, 1661071729,
|
||||
1660027308, 1658981911, 1657935539, 1656888190, 1655839867, 1654790570, 1653740300, 1652689057,
|
||||
1651636841, 1650583654, 1649529496, 1648474367, 1647418269, 1646361202, 1645303166, 1644244162,
|
||||
1643184191, 1642123253, 1641061349, 1639998480, 1638934646, 1637869848, 1636804087, 1635737362,
|
||||
1634669676, 1633601027, 1632531418, 1631460848, 1630389319, 1629316830, 1628243383, 1627168978,
|
||||
1626093616, 1625017297, 1623940023, 1622861793, 1621782608, 1620702469, 1619621377, 1618539332,
|
||||
1617456335, 1616372386, 1615287487, 1614201637, 1613114838, 1612027089, 1610938393, 1609848749,
|
||||
1608758157, 1607666620, 1606574136, 1605480708, 1604386335, 1603291018, 1602194758, 1601097555,
|
||||
1599999411, 1598900325, 1597800299, 1596699333, 1595597428, 1594494583, 1593390801, 1592286082,
|
||||
1591180426, 1590073833, 1588966306, 1587857843, 1586748447, 1585638117, 1584526854, 1583414660,
|
||||
1582301533, 1581187476, 1580072489, 1578956572, 1577839726, 1576721952, 1575603251, 1574483623,
|
||||
1573363068, 1572241588, 1571119183, 1569995854, 1568871601, 1567746425, 1566620327, 1565493307,
|
||||
1564365367, 1563236506, 1562106725, 1560976026, 1559844408, 1558711873, 1557578421, 1556444052,
|
||||
1555308768, 1554172569, 1553035455, 1551897428, 1550758488, 1549618636, 1548477872, 1547336197,
|
||||
1546193612, 1545050118, 1543905714, 1542760402, 1541614183, 1540467057, 1539319024, 1538170087,
|
||||
1537020244, 1535869497, 1534717846, 1533565293, 1532411837, 1531257480, 1530102222, 1528946064,
|
||||
1527789007, 1526631051, 1525472197, 1524312445, 1523151797, 1521990252, 1520827813, 1519664478,
|
||||
1518500250, 1517335128, 1516169114, 1515002208, 1513834411, 1512665723, 1511496145, 1510325678,
|
||||
1509154322, 1507982079, 1506808949, 1505634932, 1504460029, 1503284242, 1502107570, 1500930014,
|
||||
1499751576, 1498572255, 1497392053, 1496210969, 1495029006, 1493846163, 1492662441, 1491477842,
|
||||
1490292364, 1489106011, 1487918781, 1486730675, 1485541696, 1484351842, 1483161115, 1481969516,
|
||||
1480777044, 1479583702, 1478389489, 1477194407, 1475998456, 1474801636, 1473603949, 1472405394,
|
||||
1471205974, 1470005688, 1468804538, 1467602523, 1466399645, 1465195904, 1463991302, 1462785838,
|
||||
1461579514, 1460372329, 1459164286, 1457955385, 1456745625, 1455535009, 1454323536, 1453111208,
|
||||
1451898025, 1450683988, 1449469098, 1448253355, 1447036760, 1445819314, 1444601017, 1443381870,
|
||||
1442161874, 1440941030, 1439719338, 1438496799, 1437273414, 1436049184, 1434824109, 1433598189,
|
||||
1432371426, 1431143821, 1429915374, 1428686085, 1427455956, 1426224988, 1424993180, 1423760534,
|
||||
1422527051, 1421292730, 1420057574, 1418821582, 1417584755, 1416347095, 1415108601, 1413869275,
|
||||
1412629117, 1411388129, 1410146309, 1408903661, 1407660183, 1406415878, 1405170745, 1403924785,
|
||||
1402678000, 1401430389, 1400181954, 1398932695, 1397682613, 1396431709, 1395179984, 1393927438,
|
||||
1392674072, 1391419886, 1390164882, 1388909060, 1387652422, 1386394966, 1385136696, 1383877610,
|
||||
1382617710, 1381356997, 1380095472, 1378833134, 1377569986, 1376306026, 1375041258, 1373775680,
|
||||
1372509294, 1371242101, 1369974101, 1368705296, 1367435685, 1366165269, 1364894050, 1363622028,
|
||||
1362349204, 1361075579, 1359801152, 1358525926, 1357249901, 1355973077, 1354695455, 1353417037,
|
||||
1352137822, 1350857812, 1349577007, 1348295409, 1347013017, 1345729833, 1344445857, 1343161090,
|
||||
1341875533, 1340589187, 1339302052, 1338014129, 1336725419, 1335435923, 1334145641, 1332854574,
|
||||
1331562723, 1330270089, 1328976672, 1327682474, 1326387494, 1325091734, 1323795195, 1322497877,
|
||||
1321199781, 1319900907, 1318601257, 1317300832, 1315999631, 1314697657, 1313394909, 1312091388,
|
||||
1310787095, 1309482032, 1308176198, 1306869594, 1305562222, 1304254082, 1302945174, 1301635500,
|
||||
1300325060, 1299013855, 1297701886, 1296389154, 1295075659, 1293761402, 1292446384, 1291130606,
|
||||
1289814068, 1288496772, 1287178717, 1285859905, 1284540337, 1283220013, 1281898935, 1280577102,
|
||||
1279254516, 1277931177, 1276607086, 1275282245, 1273956653, 1272630312, 1271303222, 1269975384,
|
||||
1268646800, 1267317469, 1265987392, 1264656571, 1263325005, 1261992697, 1260659646, 1259325853,
|
||||
1257991320, 1256656047, 1255320034, 1253983283, 1252645794, 1251307568, 1249968606, 1248628909,
|
||||
1247288478, 1245947312, 1244605414, 1243262783, 1241919421, 1240575329, 1239230506, 1237884955,
|
||||
1236538675, 1235191668, 1233843935, 1232495475, 1231146291, 1229796382, 1228445750, 1227094395,
|
||||
1225742318, 1224389521, 1223036002, 1221681765, 1220326809, 1218971135, 1217614743, 1216257636,
|
||||
1214899813, 1213541275, 1212182024, 1210822059, 1209461382, 1208099993, 1206737894, 1205375085,
|
||||
1204011567, 1202647340, 1201282407, 1199916766, 1198550419, 1197183368, 1195815612, 1194447153,
|
||||
1193077991, 1191708127, 1190337562, 1188966297, 1187594332, 1186221669, 1184848308, 1183474250,
|
||||
1182099496, 1180724046, 1179347902, 1177971064, 1176593533, 1175215310, 1173836395, 1172456790,
|
||||
1171076495, 1169695512, 1168313840, 1166931481, 1165548435, 1164164704, 1162780288, 1161395188,
|
||||
1160009405, 1158622939, 1157235792, 1155847964, 1154459456, 1153070269, 1151680403, 1150289860,
|
||||
1148898640, 1147506745, 1146114174, 1144720929, 1143327011, 1141932420, 1140537158, 1139141224,
|
||||
1137744621, 1136347348, 1134949406, 1133550797, 1132151521, 1130751579, 1129350972, 1127949701,
|
||||
1126547765, 1125145168, 1123741908, 1122337987, 1120933406, 1119528166, 1118122267, 1116715710,
|
||||
1115308496, 1113900627, 1112492101, 1111082922, 1109673089, 1108262603, 1106851465, 1105439676,
|
||||
1104027237, 1102614148, 1101200410, 1099786025, 1098370993, 1096955314, 1095538991, 1094122023,
|
||||
1092704411, 1091286156, 1089867259, 1088447722, 1087027544, 1085606726, 1084185270, 1082763176,
|
||||
1081340445, 1079917078, 1078493076, 1077068439, 1075643169, 1074217266, 1072790730, 1071363564,
|
||||
1069935768, 1068507342, 1067078288, 1065648605, 1064218296, 1062787361, 1061355801, 1059923616,
|
||||
1058490808, 1057057377, 1055623324, 1054188651, 1052753357, 1051317443, 1049880912, 1048443763,
|
||||
1047005996, 1045567615, 1044128617, 1042689006, 1041248781, 1039807944, 1038366495, 1036924436,
|
||||
1035481766, 1034038487, 1032594600, 1031150105, 1029705004, 1028259297, 1026812985, 1025366069,
|
||||
1023918550, 1022470428, 1021021705, 1019572382, 1018122458, 1016671936, 1015220816, 1013769098,
|
||||
1012316784, 1010863875, 1009410370, 1007956272, 1006501581, 1005046298, 1003590424, 1002133959,
|
||||
1000676905, 999219262, 997761031, 996302214, 994842810, 993382821, 991922248, 990461091,
|
||||
988999351, 987537030, 986074127, 984610645, 983146583, 981681943, 980216726, 978750932,
|
||||
977284562, 975817617, 974350098, 972882006, 971413342, 969944106, 968474300, 967003923,
|
||||
965532978, 964061465, 962589385, 961116739, 959643527, 958169751, 956695411, 955220508,
|
||||
953745043, 952269017, 950792431, 949315286, 947837582, 946359321, 944880503, 943401129,
|
||||
941921200, 940440717, 938959681, 937478092, 935995952, 934513261, 933030021, 931546231,
|
||||
930061894, 928577010, 927091579, 925605603, 924119082, 922632018, 921144411, 919656262,
|
||||
918167572, 916678342, 915188572, 913698265, 912207419, 910716038, 909224120, 907731667,
|
||||
906238681, 904745161, 903251110, 901756526, 900261413, 898765769, 897269597, 895772898,
|
||||
894275671, 892777918, 891279640, 889780838, 888281512, 886781663, 885281293, 883780402,
|
||||
882278992, 880777062, 879274614, 877771649, 876268167, 874764170, 873259659, 871754633,
|
||||
870249095, 868743045, 867236484, 865729413, 864221832, 862713743, 861205147, 859696043,
|
||||
858186435, 856676321, 855165703, 853654582, 852142959, 850630835, 849118210, 847605086,
|
||||
846091463, 844577343, 843062726, 841547612, 840032004, 838515901, 836999305, 835482217,
|
||||
833964638, 832446567, 830928007, 829408958, 827889422, 826369398, 824848888, 823327893,
|
||||
821806413, 820284450, 818762005, 817239078, 815715670, 814191782, 812667415, 811142571,
|
||||
809617249, 808091450, 806565177, 805038429, 803511207, 801983513, 800455346, 798926709,
|
||||
797397602, 795868026, 794337982, 792807470, 791276492, 789745049, 788213141, 786680769,
|
||||
785147934, 783614638, 782080880, 780546663, 779011986, 777476851, 775941259, 774405210,
|
||||
772868706, 771331747, 769794334, 768256469, 766718151, 765179382, 763640164, 762100496,
|
||||
760560380, 759019816, 757478806, 755937350, 754395449, 752853105, 751310318, 749767089,
|
||||
748223418, 746679308, 745134758, 743589770, 742044345, 740498483, 738952186, 737405453,
|
||||
735858287, 734310688, 732762657, 731214195, 729665303, 728115982, 726566232, 725016055,
|
||||
723465451, 721914422, 720362968, 718811090, 717258790, 715706067, 714152924, 712599360,
|
||||
711045377, 709490976, 707936158, 706380923, 704825272, 703269207, 701712728, 700155836,
|
||||
698598533, 697040818, 695482694, 693924160, 692365218, 690805869, 689246113, 687685952,
|
||||
686125387, 684564417, 683003045, 681441272, 679879097, 678316522, 676753549, 675190177,
|
||||
673626408, 672062243, 670497682, 668932727, 667367379, 665801638, 664235505, 662668981,
|
||||
661102068, 659534766, 657967075, 656398998, 654830535, 653261686, 651692453, 650122837,
|
||||
648552838, 646982457, 645411696, 643840556, 642269036, 640697139, 639124865, 637552215,
|
||||
635979190, 634405791, 632832018, 631257873, 629683357, 628108471, 626533215, 624957590,
|
||||
623381598, 621805239, 620228514, 618651424, 617073971, 615496154, 613917975, 612339436,
|
||||
610760536, 609181276, 607601658, 606021683, 604441352, 602860664, 601279623, 599698227,
|
||||
598116479, 596534378, 594951927, 593369126, 591785976, 590202477, 588618632, 587034440,
|
||||
585449903, 583865021, 582279796, 580694229, 579108320, 577522070, 575935480, 574348552,
|
||||
572761285, 571173682, 569585743, 567997469, 566408860, 564819919, 563230645, 561641039,
|
||||
560051104, 558460839, 556870245, 555279324, 553688076, 552096502, 550504604, 548912382,
|
||||
547319836, 545726969, 544133781, 542540273, 540946445, 539352300, 537757837, 536163058,
|
||||
534567963, 532972554, 531376831, 529780796, 528184449, 526587791, 524990824, 523393547,
|
||||
521795963, 520198072, 518599875, 517001373, 515402566, 513803457, 512204045, 510604332,
|
||||
509004318, 507404005, 505803394, 504202485, 502601279, 500999778, 499397982, 497795892,
|
||||
496193509, 494590835, 492987869, 491384614, 489781069, 488177236, 486573117, 484968710,
|
||||
483364019, 481759043, 480153784, 478548243, 476942419, 475336316, 473729932, 472123270,
|
||||
470516330, 468909114, 467301622, 465693854, 464085813, 462477499, 460868912, 459260055,
|
||||
457650927, 456041530, 454431865, 452821933, 451211734, 449601270, 447990541, 446379549,
|
||||
444768294, 443156777, 441545000, 439932963, 438320667, 436708113, 435095303, 433482236,
|
||||
431868915, 430255339, 428641511, 427027430, 425413098, 423798515, 422183684, 420568604,
|
||||
418953276, 417337703, 415721883, 414105819, 412489512, 410872962, 409256170, 407639137,
|
||||
406021865, 404404353, 402786604, 401168618, 399550396, 397931939, 396313247, 394694323,
|
||||
393075166, 391455778, 389836160, 388216313, 386596237, 384975934, 383355404, 381734649,
|
||||
380113669, 378492466, 376871039, 375249392, 373627523, 372005435, 370383128, 368760603,
|
||||
367137861, 365514903, 363891730, 362268343, 360644742, 359020930, 357396906, 355772673,
|
||||
354148230, 352523578, 350898719, 349273654, 347648383, 346022908, 344397230, 342771348,
|
||||
341145265, 339518981, 337892498, 336265816, 334638936, 333011859, 331384586, 329757119,
|
||||
328129457, 326501602, 324873555, 323245317, 321616889, 319988272, 318359466, 316730474,
|
||||
315101295, 313471930, 311842381, 310212649, 308582734, 306952638, 305322361, 303691904,
|
||||
302061269, 300430456, 298799466, 297168301, 295536961, 293905447, 292273760, 290641901,
|
||||
289009871, 287377671, 285745302, 284112765, 282480061, 280847190, 279214155, 277580955,
|
||||
275947592, 274314066, 272680379, 271046532, 269412525, 267778360, 266144038, 264509558,
|
||||
262874923, 261240134, 259605191, 257970095, 256334847, 254699448, 253063900, 251428203,
|
||||
249792358, 248156366, 246520228, 244883945, 243247518, 241610947, 239974235, 238337382,
|
||||
236700388, 235063255, 233425984, 231788575, 230151030, 228513350, 226875535, 225237587,
|
||||
223599506, 221961294, 220322951, 218684479, 217045878, 215407149, 213768293, 212129312,
|
||||
210490206, 208850976, 207211624, 205572149, 203932553, 202292838, 200653003, 199013051,
|
||||
197372981, 195732795, 194092495, 192452080, 190811551, 189170911, 187530159, 185889297,
|
||||
184248325, 182607245, 180966058, 179324764, 177683365, 176041861, 174400254, 172758544,
|
||||
171116733, 169474820, 167832808, 166190698, 164548489, 162906184, 161263783, 159621287,
|
||||
157978697, 156336015, 154693240, 153050374, 151407418, 149764374, 148121241, 146478021,
|
||||
144834714, 143191323, 141547847, 139904288, 138260647, 136616925, 134973122, 133329239,
|
||||
131685278, 130041240, 128397125, 126752935, 125108670, 123464332, 121819921, 120175438,
|
||||
118530885, 116886262, 115241570, 113596810, 111951983, 110307091, 108662134, 107017112,
|
||||
105372028, 103726882, 102081675, 100436408, 98791081, 97145697, 95500255, 93854758,
|
||||
92209205, 90563597, 88917937, 87272224, 85626460, 83980645, 82334782, 80688869,
|
||||
79042909, 77396903, 75750851, 74104755, 72458615, 70812432, 69166208, 67519943,
|
||||
65873638, 64227295, 62580914, 60934496, 59288042, 57641553, 55995030, 54348475,
|
||||
52701887, 51055268, 49408620, 47761942, 46115236, 44468503, 42821744, 41174960,
|
||||
39528151, 37881320, 36234466, 34587590, 32940695, 31293780, 29646846, 27999895,
|
||||
26352928, 24705945, 23058947, 21411936, 19764913, 18117878, 16470832, 14823776,
|
||||
13176712, 11529640, 9882561, 8235476, 6588387, 4941294, 3294197, 1647099
|
||||
};

uint16_t ff_fft_offsets_lut[21845];

static void fft_lut_init(uint16_t *table, int off, int size, int *index)
{
    if (size < 16) {
        table[*index] = off >> 2;
        (*index)++;
    }
    else {
        fft_lut_init(table, off, size >> 1, index);
        fft_lut_init(table, off + (size >> 1), size >> 2, index);
        fft_lut_init(table, off + 3 * (size >> 2), size >> 2, index);
    }
}

static void fft_lut_init_start(void)
{
    int n = 0;

    fft_lut_init(ff_fft_offsets_lut, 0, 1 << 17, &n);
}

void ff_fft_lut_init(void)
{
    static AVOnce init_once = AV_ONCE_INIT;

    ff_thread_once(&init_once, fft_lut_init_start);
}
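Two observations about the data above, stated here because they are easy to verify: the 2048 entries of ff_w_tab_sr appear to be a Q31 quarter-wave cosine table, entry k holding round(2^31 * cos(k*pi/4096)) with entry 0 saturated to 2147483647, and the 21845 slots of ff_fft_offsets_lut match exactly the number of leaves the recursion in fft_lut_init() visits for a 2^17-point transform (f(n) = f(n/2) + 2*f(n/4), with f(n) = 1 for n < 16). A throwaway generator, not part of the original file, that reproduces the table up to the original rounding convention:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch: regenerate the 2048-entry Q31 quarter-wave cosine
 * table printed above. */
int main(void)
{
    for (int k = 0; k < 2048; k++) {
        long long v = llrint(2147483648.0 * cos(k * M_PI / 4096.0));
        if (v > INT32_MAX)
            v = INT32_MAX; /* entry 0 saturates to 2147483647 */
        printf("%lld%s", v, (k & 7) == 7 ? ",\n" : ", ");
    }
    return 0;
}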
@ -1,66 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2012
|
||||
* MIPS Technologies, Inc., California.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* Authors: Stanislav Ocovaj (socovaj@mips.com)
|
||||
* Goran Cordasic (goran@mips.com)
|
||||
* Djordje Pesut (djordje@mips.com)
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
 * @file
 * definitions and tables for FFT
 */
#ifndef AVCODEC_FFT_TABLE_H
#define AVCODEC_FFT_TABLE_H

#include "libavcodec/fft.h"

#define MAX_LOG2_NFFT 17 //!< Specifies maximum allowed fft size
#define MAX_FFT_SIZE (1 << MAX_LOG2_NFFT)

extern const int32_t ff_w_tab_sr[];
extern uint16_t ff_fft_offsets_lut[];
void ff_fft_lut_init(void);

#endif /* AVCODEC_FFT_TABLE_H */
@ -1,628 +0,0 @@
|
||||
/*
|
||||
* FFT/IFFT transforms
|
||||
* Copyright (c) 2008 Loren Merritt
|
||||
* Copyright (c) 2002 Fabrice Bellard
|
||||
* Partly based on libdjbfft by D. J. Bernstein
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* FFT/IFFT transforms.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "libavutil/mathematics.h"
|
||||
#include "libavutil/thread.h"
|
||||
#include "fft.h"
|
||||
#include "fft-internal.h"
|
||||
|
||||
#if !FFT_FLOAT
|
||||
#include "fft_table.h"
|
||||
#else /* !FFT_FLOAT */
|
||||
|
||||
/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
|
||||
#if !CONFIG_HARDCODED_TABLES
|
||||
COSTABLE(16);
|
||||
COSTABLE(32);
|
||||
COSTABLE(64);
|
||||
COSTABLE(128);
|
||||
COSTABLE(256);
|
||||
COSTABLE(512);
|
||||
COSTABLE(1024);
|
||||
COSTABLE(2048);
|
||||
COSTABLE(4096);
|
||||
COSTABLE(8192);
|
||||
COSTABLE(16384);
|
||||
COSTABLE(32768);
|
||||
COSTABLE(65536);
|
||||
COSTABLE(131072);

static av_cold void init_ff_cos_tabs(int index)
{
    int i;
    int m = 1<<index;
    double freq = 2*M_PI/m;
    FFTSample *tab = FFT_NAME(ff_cos_tabs)[index];
    for(i=0; i<=m/4; i++)
        tab[i] = FIX15(cos(i*freq));
    for(i=1; i<m/4; i++)
        tab[m/2-i] = tab[i];
}
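A worked example, not in the original source: for index 4 the table ff_cos_16 has m = 16 and m/2 = 8 entries, and the two loops above fill it with {1.0, 0.92388, 0.70711, 0.38268, 0.0, 0.38268, 0.70711, 0.92388}, i.e. cos(2*pi*i/16) for i = 0..4 followed by the interior values mirrored, matching the quarter-period-plus-reverse layout noted above the table declarations.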
|
||||
|
||||
typedef struct CosTabsInitOnce {
|
||||
void (*func)(void);
|
||||
AVOnce control;
|
||||
} CosTabsInitOnce;
|
||||
|
||||
#define INIT_FF_COS_TABS_FUNC(index, size) \
|
||||
static av_cold void init_ff_cos_tabs_ ## size (void)\
|
||||
{ \
|
||||
init_ff_cos_tabs(index); \
|
||||
}
|
||||
|
||||
INIT_FF_COS_TABS_FUNC(4, 16)
|
||||
INIT_FF_COS_TABS_FUNC(5, 32)
|
||||
INIT_FF_COS_TABS_FUNC(6, 64)
|
||||
INIT_FF_COS_TABS_FUNC(7, 128)
|
||||
INIT_FF_COS_TABS_FUNC(8, 256)
|
||||
INIT_FF_COS_TABS_FUNC(9, 512)
|
||||
INIT_FF_COS_TABS_FUNC(10, 1024)
|
||||
INIT_FF_COS_TABS_FUNC(11, 2048)
|
||||
INIT_FF_COS_TABS_FUNC(12, 4096)
|
||||
INIT_FF_COS_TABS_FUNC(13, 8192)
|
||||
INIT_FF_COS_TABS_FUNC(14, 16384)
|
||||
INIT_FF_COS_TABS_FUNC(15, 32768)
|
||||
INIT_FF_COS_TABS_FUNC(16, 65536)
|
||||
INIT_FF_COS_TABS_FUNC(17, 131072)
|
||||
|
||||
static CosTabsInitOnce cos_tabs_init_once[] = {
|
||||
{ NULL },
|
||||
{ NULL },
|
||||
{ NULL },
|
||||
{ NULL },
|
||||
{ init_ff_cos_tabs_16, AV_ONCE_INIT },
|
||||
{ init_ff_cos_tabs_32, AV_ONCE_INIT },
|
||||
{ init_ff_cos_tabs_64, AV_ONCE_INIT },
|
||||
{ init_ff_cos_tabs_128, AV_ONCE_INIT },
|
||||
{ init_ff_cos_tabs_256, AV_ONCE_INIT },
|
||||
{ init_ff_cos_tabs_512, AV_ONCE_INIT },
|
||||
{ init_ff_cos_tabs_1024, AV_ONCE_INIT },
|
||||
{ init_ff_cos_tabs_2048, AV_ONCE_INIT },
|
||||
{ init_ff_cos_tabs_4096, AV_ONCE_INIT },
|
||||
{ init_ff_cos_tabs_8192, AV_ONCE_INIT },
|
||||
{ init_ff_cos_tabs_16384, AV_ONCE_INIT },
|
||||
{ init_ff_cos_tabs_32768, AV_ONCE_INIT },
|
||||
{ init_ff_cos_tabs_65536, AV_ONCE_INIT },
|
||||
{ init_ff_cos_tabs_131072, AV_ONCE_INIT },
|
||||
};
|
||||
|
||||
av_cold void ff_init_ff_cos_tabs(int index)
|
||||
{
|
||||
ff_thread_once(&cos_tabs_init_once[index].control, cos_tabs_init_once[index].func);
|
||||
}
|
||||
#endif
|
||||
COSTABLE_CONST FFTSample * const FFT_NAME(ff_cos_tabs)[] = {
|
||||
NULL, NULL, NULL, NULL,
|
||||
FFT_NAME(ff_cos_16),
|
||||
FFT_NAME(ff_cos_32),
|
||||
FFT_NAME(ff_cos_64),
|
||||
FFT_NAME(ff_cos_128),
|
||||
FFT_NAME(ff_cos_256),
|
||||
FFT_NAME(ff_cos_512),
|
||||
FFT_NAME(ff_cos_1024),
|
||||
FFT_NAME(ff_cos_2048),
|
||||
FFT_NAME(ff_cos_4096),
|
||||
FFT_NAME(ff_cos_8192),
|
||||
FFT_NAME(ff_cos_16384),
|
||||
FFT_NAME(ff_cos_32768),
|
||||
FFT_NAME(ff_cos_65536),
|
||||
FFT_NAME(ff_cos_131072),
|
||||
};
|
||||
|
||||
#endif /* FFT_FLOAT */

static void fft_permute_c(FFTContext *s, FFTComplex *z);
static void fft_calc_c(FFTContext *s, FFTComplex *z);

static int split_radix_permutation(int i, int n, int inverse)
{
    int m;
    if(n <= 2) return i&1;
    m = n >> 1;
    if(!(i&m)) return split_radix_permutation(i, m, inverse)*2;
    m >>= 1;
    if(inverse == !(i&m)) return split_radix_permutation(i, m, inverse)*4 + 1;
    else                  return split_radix_permutation(i, m, inverse)*4 - 1;
}
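As an aside, this permutation is what ff_fft_init() further down turns into the revtab lookup used by fft_permute_c(); a tiny sketch, illustrative and not part of the file, of how the default table would be filled for a hypothetical 16-point transform:

/* Illustrative sketch: mirrors the PROCESS_FFT_PERM_DEFAULT loop below. */
static void build_revtab16(uint16_t revtab[16], int inverse)
{
    const int n = 16;
    for (int i = 0; i < n; i++)
        revtab[-split_radix_permutation(i, n, inverse) & (n - 1)] = i;
}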
|
||||
|
||||
|
||||
static const int avx_tab[] = {
|
||||
0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15
|
||||
};
|
||||
|
||||
static int is_second_half_of_fft32(int i, int n)
|
||||
{
|
||||
if (n <= 32)
|
||||
return i >= 16;
|
||||
else if (i < n/2)
|
||||
return is_second_half_of_fft32(i, n/2);
|
||||
else if (i < 3*n/4)
|
||||
return is_second_half_of_fft32(i - n/2, n/4);
|
||||
else
|
||||
return is_second_half_of_fft32(i - 3*n/4, n/4);
|
||||
}
|
||||
|
||||
static av_cold void fft_perm_avx(FFTContext *s)
|
||||
{
|
||||
int i;
|
||||
int n = 1 << s->nbits;
|
||||
|
||||
for (i = 0; i < n; i += 16) {
|
||||
int k;
|
||||
if (is_second_half_of_fft32(i, n)) {
|
||||
for (k = 0; k < 16; k++)
|
||||
s->revtab[-split_radix_permutation(i + k, n, s->inverse) & (n - 1)] =
|
||||
i + avx_tab[k];
|
||||
|
||||
} else {
|
||||
for (k = 0; k < 16; k++) {
|
||||
int j = i + k;
|
||||
j = (j & ~7) | ((j >> 1) & 3) | ((j << 2) & 4);
|
||||
s->revtab[-split_radix_permutation(i + k, n, s->inverse) & (n - 1)] = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
|
||||
{
|
||||
int i, j, n;
|
||||
|
||||
s->revtab = NULL;
|
||||
s->revtab32 = NULL;
|
||||
|
||||
if (nbits < 2 || nbits > 17)
|
||||
goto fail;
|
||||
s->nbits = nbits;
|
||||
n = 1 << nbits;
|
||||
|
||||
if (nbits <= 16) {
|
||||
s->revtab = av_malloc(n * sizeof(uint16_t));
|
||||
if (!s->revtab)
|
||||
goto fail;
|
||||
} else {
|
||||
s->revtab32 = av_malloc(n * sizeof(uint32_t));
|
||||
if (!s->revtab32)
|
||||
goto fail;
|
||||
}
|
||||
s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
|
||||
if (!s->tmp_buf)
|
||||
goto fail;
|
||||
s->inverse = inverse;
|
||||
s->fft_permutation = FF_FFT_PERM_DEFAULT;
|
||||
|
||||
s->fft_permute = fft_permute_c;
|
||||
s->fft_calc = fft_calc_c;
|
||||
#if CONFIG_MDCT
|
||||
s->imdct_calc = ff_imdct_calc_c;
|
||||
s->imdct_half = ff_imdct_half_c;
|
||||
s->mdct_calc = ff_mdct_calc_c;
|
||||
#endif
|
||||
|
||||
#if FFT_FLOAT
|
||||
#if ARCH_AARCH64
|
||||
ff_fft_init_aarch64(s);
|
||||
#elif ARCH_ARM
|
||||
ff_fft_init_arm(s);
|
||||
#elif ARCH_PPC
|
||||
ff_fft_init_ppc(s);
|
||||
#elif ARCH_X86
|
||||
ff_fft_init_x86(s);
|
||||
#endif
|
||||
#if HAVE_MIPSFPU
|
||||
ff_fft_init_mips(s);
|
||||
#endif
|
||||
for(j=4; j<=nbits; j++) {
|
||||
ff_init_ff_cos_tabs(j);
|
||||
}
|
||||
#else /* FFT_FLOAT */
|
||||
ff_fft_lut_init();
|
||||
#endif
|
||||
|
||||
|
||||
if (ARCH_X86 && FFT_FLOAT && s->fft_permutation == FF_FFT_PERM_AVX) {
|
||||
fft_perm_avx(s);
|
||||
} else {
|
||||
#define PROCESS_FFT_PERM_SWAP_LSBS(num) do {\
|
||||
for(i = 0; i < n; i++) {\
|
||||
int k;\
|
||||
j = i;\
|
||||
j = (j & ~3) | ((j >> 1) & 1) | ((j << 1) & 2);\
|
||||
k = -split_radix_permutation(i, n, s->inverse) & (n - 1);\
|
||||
s->revtab##num[k] = j;\
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
#define PROCESS_FFT_PERM_DEFAULT(num) do {\
|
||||
for(i = 0; i < n; i++) {\
|
||||
int k;\
|
||||
j = i;\
|
||||
k = -split_radix_permutation(i, n, s->inverse) & (n - 1);\
|
||||
s->revtab##num[k] = j;\
|
||||
} \
|
||||
} while(0);
|
||||
|
||||
#define SPLIT_RADIX_PERMUTATION(num) do { \
|
||||
if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS) {\
|
||||
PROCESS_FFT_PERM_SWAP_LSBS(num) \
|
||||
} else {\
|
||||
PROCESS_FFT_PERM_DEFAULT(num) \
|
||||
}\
|
||||
} while(0);
|
||||
|
||||
if (s->revtab)
|
||||
SPLIT_RADIX_PERMUTATION()
|
||||
if (s->revtab32)
|
||||
SPLIT_RADIX_PERMUTATION(32)
|
||||
|
||||
#undef PROCESS_FFT_PERM_DEFAULT
|
||||
#undef PROCESS_FFT_PERM_SWAP_LSBS
|
||||
#undef SPLIT_RADIX_PERMUTATION
|
||||
}
|
||||
|
||||
return 0;
|
||||
fail:
|
||||
av_freep(&s->revtab);
|
||||
av_freep(&s->revtab32);
|
||||
av_freep(&s->tmp_buf);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static void fft_permute_c(FFTContext *s, FFTComplex *z)
|
||||
{
|
||||
int j, np;
|
||||
const uint16_t *revtab = s->revtab;
|
||||
const uint32_t *revtab32 = s->revtab32;
|
||||
np = 1 << s->nbits;
|
||||
/* TODO: handle split-radix permute in a more optimal way, probably in-place */
|
||||
if (revtab) {
|
||||
for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j];
|
||||
} else
|
||||
for(j=0;j<np;j++) s->tmp_buf[revtab32[j]] = z[j];
|
||||
|
||||
memcpy(z, s->tmp_buf, np * sizeof(FFTComplex));
|
||||
}
|
||||
|
||||
av_cold void ff_fft_end(FFTContext *s)
|
||||
{
|
||||
av_freep(&s->revtab);
|
||||
av_freep(&s->revtab32);
|
||||
av_freep(&s->tmp_buf);
|
||||
}
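Putting ff_fft_init(), the permute/calc function pointers and ff_fft_end() together, the caller-side pattern for this API looked roughly like the sketch below; run_fft and its error handling are illustrative only:

/* Illustrative only: drive the removed API on a buffer of 1 << nbits
 * complex samples, in place. */
static int run_fft(FFTComplex *buf, int nbits, int inverse)
{
    FFTContext fft;

    if (ff_fft_init(&fft, nbits, inverse) < 0)
        return -1;
    fft.fft_permute(&fft, buf); /* reorder first, as required by fft_calc() */
    fft.fft_calc(&fft, buf);    /* unnormalized transform, done in place */
    ff_fft_end(&fft);
    return 0;
}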
|
||||
|
||||
#if !FFT_FLOAT
|
||||
|
||||
static void fft_calc_c(FFTContext *s, FFTComplex *z) {
|
||||
|
||||
int nbits, i, n, num_transforms, offset, step;
|
||||
int n4, n2, n34;
|
||||
unsigned tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
|
||||
FFTComplex *tmpz;
|
||||
const int fft_size = (1 << s->nbits);
|
||||
int64_t accu;
|
||||
|
||||
num_transforms = (0x2aab >> (16 - s->nbits)) | 1;
|
||||
|
||||
for (n=0; n<num_transforms; n++){
|
||||
offset = ff_fft_offsets_lut[n] << 2;
|
||||
tmpz = z + offset;
|
||||
|
||||
tmp1 = tmpz[0].re + (unsigned)tmpz[1].re;
|
||||
tmp5 = tmpz[2].re + (unsigned)tmpz[3].re;
|
||||
tmp2 = tmpz[0].im + (unsigned)tmpz[1].im;
|
||||
tmp6 = tmpz[2].im + (unsigned)tmpz[3].im;
|
||||
tmp3 = tmpz[0].re - (unsigned)tmpz[1].re;
|
||||
tmp8 = tmpz[2].im - (unsigned)tmpz[3].im;
|
||||
tmp4 = tmpz[0].im - (unsigned)tmpz[1].im;
|
||||
tmp7 = tmpz[2].re - (unsigned)tmpz[3].re;
|
||||
|
||||
tmpz[0].re = tmp1 + tmp5;
|
||||
tmpz[2].re = tmp1 - tmp5;
|
||||
tmpz[0].im = tmp2 + tmp6;
|
||||
tmpz[2].im = tmp2 - tmp6;
|
||||
tmpz[1].re = tmp3 + tmp8;
|
||||
tmpz[3].re = tmp3 - tmp8;
|
||||
tmpz[1].im = tmp4 - tmp7;
|
||||
tmpz[3].im = tmp4 + tmp7;
|
||||
}
|
||||
|
||||
if (fft_size < 8)
|
||||
return;
|
||||
|
||||
num_transforms = (num_transforms >> 1) | 1;
|
||||
|
||||
for (n=0; n<num_transforms; n++){
|
||||
offset = ff_fft_offsets_lut[n] << 3;
|
||||
tmpz = z + offset;
|
||||
|
||||
tmp1 = tmpz[4].re + (unsigned)tmpz[5].re;
|
||||
tmp3 = tmpz[6].re + (unsigned)tmpz[7].re;
|
||||
tmp2 = tmpz[4].im + (unsigned)tmpz[5].im;
|
||||
tmp4 = tmpz[6].im + (unsigned)tmpz[7].im;
|
||||
tmp5 = tmp1 + tmp3;
|
||||
tmp7 = tmp1 - tmp3;
|
||||
tmp6 = tmp2 + tmp4;
|
||||
tmp8 = tmp2 - tmp4;
|
||||
|
||||
tmp1 = tmpz[4].re - (unsigned)tmpz[5].re;
|
||||
tmp2 = tmpz[4].im - (unsigned)tmpz[5].im;
|
||||
tmp3 = tmpz[6].re - (unsigned)tmpz[7].re;
|
||||
tmp4 = tmpz[6].im - (unsigned)tmpz[7].im;
|
||||
|
||||
tmpz[4].re = tmpz[0].re - tmp5;
|
||||
tmpz[0].re = tmpz[0].re + tmp5;
|
||||
tmpz[4].im = tmpz[0].im - tmp6;
|
||||
tmpz[0].im = tmpz[0].im + tmp6;
|
||||
tmpz[6].re = tmpz[2].re - tmp8;
|
||||
tmpz[2].re = tmpz[2].re + tmp8;
|
||||
tmpz[6].im = tmpz[2].im + tmp7;
|
||||
tmpz[2].im = tmpz[2].im - tmp7;
|
||||
|
||||
accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp1 + tmp2);
|
||||
tmp5 = (int32_t)((accu + 0x40000000) >> 31);
|
||||
accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp3 - tmp4);
|
||||
tmp7 = (int32_t)((accu + 0x40000000) >> 31);
|
||||
accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp2 - tmp1);
|
||||
tmp6 = (int32_t)((accu + 0x40000000) >> 31);
|
||||
accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp3 + tmp4);
|
||||
tmp8 = (int32_t)((accu + 0x40000000) >> 31);
|
||||
tmp1 = tmp5 + tmp7;
|
||||
tmp3 = tmp5 - tmp7;
|
||||
tmp2 = tmp6 + tmp8;
|
||||
tmp4 = tmp6 - tmp8;
|
||||
|
||||
tmpz[5].re = tmpz[1].re - tmp1;
|
||||
tmpz[1].re = tmpz[1].re + tmp1;
|
||||
tmpz[5].im = tmpz[1].im - tmp2;
|
||||
tmpz[1].im = tmpz[1].im + tmp2;
|
||||
tmpz[7].re = tmpz[3].re - tmp4;
|
||||
tmpz[3].re = tmpz[3].re + tmp4;
|
||||
tmpz[7].im = tmpz[3].im + tmp3;
|
||||
tmpz[3].im = tmpz[3].im - tmp3;
|
||||
}
|
||||
|
||||
step = 1 << ((MAX_LOG2_NFFT-4) - 4);
|
||||
n4 = 4;
|
||||
|
||||
for (nbits=4; nbits<=s->nbits; nbits++){
|
||||
n2 = 2*n4;
|
||||
n34 = 3*n4;
|
||||
num_transforms = (num_transforms >> 1) | 1;
|
||||
|
||||
for (n=0; n<num_transforms; n++){
|
||||
const FFTSample *w_re_ptr = ff_w_tab_sr + step;
|
||||
const FFTSample *w_im_ptr = ff_w_tab_sr + MAX_FFT_SIZE/(4*16) - step;
|
||||
offset = ff_fft_offsets_lut[n] << nbits;
|
||||
tmpz = z + offset;
|
||||
|
||||
tmp5 = tmpz[ n2].re + (unsigned)tmpz[n34].re;
|
||||
tmp1 = tmpz[ n2].re - (unsigned)tmpz[n34].re;
|
||||
tmp6 = tmpz[ n2].im + (unsigned)tmpz[n34].im;
|
||||
tmp2 = tmpz[ n2].im - (unsigned)tmpz[n34].im;
|
||||
|
||||
tmpz[ n2].re = tmpz[ 0].re - tmp5;
|
||||
tmpz[ 0].re = tmpz[ 0].re + tmp5;
|
||||
tmpz[ n2].im = tmpz[ 0].im - tmp6;
|
||||
tmpz[ 0].im = tmpz[ 0].im + tmp6;
|
||||
tmpz[n34].re = tmpz[n4].re - tmp2;
|
||||
tmpz[ n4].re = tmpz[n4].re + tmp2;
|
||||
tmpz[n34].im = tmpz[n4].im + tmp1;
|
||||
tmpz[ n4].im = tmpz[n4].im - tmp1;
|
||||
|
||||
for (i=1; i<n4; i++){
|
||||
FFTSample w_re = w_re_ptr[0];
|
||||
FFTSample w_im = w_im_ptr[0];
|
||||
accu = (int64_t)w_re*tmpz[ n2+i].re;
|
||||
accu += (int64_t)w_im*tmpz[ n2+i].im;
|
||||
tmp1 = (int32_t)((accu + 0x40000000) >> 31);
|
||||
accu = (int64_t)w_re*tmpz[ n2+i].im;
|
||||
accu -= (int64_t)w_im*tmpz[ n2+i].re;
|
||||
tmp2 = (int32_t)((accu + 0x40000000) >> 31);
|
||||
accu = (int64_t)w_re*tmpz[n34+i].re;
|
||||
accu -= (int64_t)w_im*tmpz[n34+i].im;
|
||||
tmp3 = (int32_t)((accu + 0x40000000) >> 31);
|
||||
accu = (int64_t)w_re*tmpz[n34+i].im;
|
||||
accu += (int64_t)w_im*tmpz[n34+i].re;
|
||||
tmp4 = (int32_t)((accu + 0x40000000) >> 31);
|
||||
|
||||
tmp5 = tmp1 + tmp3;
|
||||
tmp1 = tmp1 - tmp3;
|
||||
tmp6 = tmp2 + tmp4;
|
||||
tmp2 = tmp2 - tmp4;
|
||||
|
||||
tmpz[ n2+i].re = tmpz[ i].re - tmp5;
|
||||
tmpz[ i].re = tmpz[ i].re + tmp5;
|
||||
tmpz[ n2+i].im = tmpz[ i].im - tmp6;
|
||||
tmpz[ i].im = tmpz[ i].im + tmp6;
|
||||
tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
|
||||
tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
|
||||
tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
|
||||
tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
|
||||
|
||||
w_re_ptr += step;
|
||||
w_im_ptr -= step;
|
||||
}
|
||||
}
|
||||
step >>= 1;
|
||||
n4 <<= 1;
|
||||
}
|
||||
}
|
||||
|
||||
#else /* !FFT_FLOAT */
|
||||
|
||||
#define BUTTERFLIES(a0,a1,a2,a3) {\
|
||||
BF(t3, t5, t5, t1);\
|
||||
BF(a2.re, a0.re, a0.re, t5);\
|
||||
BF(a3.im, a1.im, a1.im, t3);\
|
||||
BF(t4, t6, t2, t6);\
|
||||
BF(a3.re, a1.re, a1.re, t4);\
|
||||
BF(a2.im, a0.im, a0.im, t6);\
|
||||
}
|
||||
|
||||
// force loading all the inputs before storing any.
|
||||
// this is slightly slower for small data, but avoids store->load aliasing
|
||||
// for addresses separated by large powers of 2.
|
||||
#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\
    FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\
    BF(t3, t5, t5, t1);\
    BF(a2.re, a0.re, r0, t5);\
    BF(a3.im, a1.im, i1, t3);\
    BF(t4, t6, t2, t6);\
    BF(a3.re, a1.re, r1, t4);\
    BF(a2.im, a0.im, i0, t6);\
}

#define TRANSFORM(a0,a1,a2,a3,wre,wim) {\
    CMUL(t1, t2, a2.re, a2.im, wre, -wim);\
    CMUL(t5, t6, a3.re, a3.im, wre, wim);\
    BUTTERFLIES(a0,a1,a2,a3)\
}

#define TRANSFORM_ZERO(a0,a1,a2,a3) {\
    t1 = a2.re;\
    t2 = a2.im;\
    t5 = a3.re;\
    t6 = a3.im;\
    BUTTERFLIES(a0,a1,a2,a3)\
}

/* z[0...8n-1], w[1...2n-1] */
#define PASS(name)\
static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\
{\
    FFTDouble t1, t2, t3, t4, t5, t6;\
    int o1 = 2*n;\
    int o2 = 4*n;\
    int o3 = 6*n;\
    const FFTSample *wim = wre+o1;\
    n--;\
\
    TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\
    TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
    do {\
        z += 2;\
        wre += 2;\
        wim -= 2;\
        TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\
        TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
    } while(--n);\
}

PASS(pass)
#if !CONFIG_SMALL
#undef BUTTERFLIES
#define BUTTERFLIES BUTTERFLIES_BIG
PASS(pass_big)
#endif

#define DECL_FFT(n,n2,n4)\
static void fft##n(FFTComplex *z)\
{\
    fft##n2(z);\
    fft##n4(z+n4*2);\
    fft##n4(z+n4*3);\
    pass(z,FFT_NAME(ff_cos_##n),n4/2);\
}

static void fft4(FFTComplex *z)
{
    FFTDouble t1, t2, t3, t4, t5, t6, t7, t8;

    BF(t3, t1, z[0].re, z[1].re);
    BF(t8, t6, z[3].re, z[2].re);
    BF(z[2].re, z[0].re, t1, t6);
    BF(t4, t2, z[0].im, z[1].im);
    BF(t7, t5, z[2].im, z[3].im);
    BF(z[3].im, z[1].im, t4, t8);
    BF(z[3].re, z[1].re, t3, t7);
    BF(z[2].im, z[0].im, t2, t5);
}

static void fft8(FFTComplex *z)
{
    FFTDouble t1, t2, t3, t4, t5, t6;

    fft4(z);

    BF(t1, z[5].re, z[4].re, -z[5].re);
    BF(t2, z[5].im, z[4].im, -z[5].im);
    BF(t5, z[7].re, z[6].re, -z[7].re);
    BF(t6, z[7].im, z[6].im, -z[7].im);

    BUTTERFLIES(z[0],z[2],z[4],z[6]);
    TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf);
}

#if !CONFIG_SMALL
static void fft16(FFTComplex *z)
{
    FFTDouble t1, t2, t3, t4, t5, t6;
    FFTSample cos_16_1 = FFT_NAME(ff_cos_16)[1];
    FFTSample cos_16_3 = FFT_NAME(ff_cos_16)[3];

    fft8(z);
    fft4(z+8);
    fft4(z+12);

    TRANSFORM_ZERO(z[0],z[4],z[8],z[12]);
    TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf);
    TRANSFORM(z[1],z[5],z[9],z[13],cos_16_1,cos_16_3);
    TRANSFORM(z[3],z[7],z[11],z[15],cos_16_3,cos_16_1);
}
#else
DECL_FFT(16,8,4)
#endif
DECL_FFT(32,16,8)
DECL_FFT(64,32,16)
DECL_FFT(128,64,32)
DECL_FFT(256,128,64)
DECL_FFT(512,256,128)
#if !CONFIG_SMALL
#define pass pass_big
#endif
DECL_FFT(1024,512,256)
DECL_FFT(2048,1024,512)
DECL_FFT(4096,2048,1024)
DECL_FFT(8192,4096,2048)
DECL_FFT(16384,8192,4096)
DECL_FFT(32768,16384,8192)
DECL_FFT(65536,32768,16384)
DECL_FFT(131072,65536,32768)

static void (* const fft_dispatch[])(FFTComplex*) = {
    fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024,
    fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, fft131072
};

static void fft_calc_c(FFTContext *s, FFTComplex *z)
{
    fft_dispatch[s->nbits-2](z);
}
#endif /* !FFT_FLOAT */
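/*
 * Illustrative sketch (not verbatim from the file above): DECL_FFT builds each
 * power-of-two transform out of one half-size and two quarter-size transforms
 * plus one combining pass, and fft_calc_c() picks the entry point from
 * fft_dispatch[] with s->nbits - 2, so nbits 2..17 covers the 4..131072-point
 * transforms declared above.  DECL_FFT(32,16,8), for example, expands to
 * roughly the following:
 */
#if 0   /* expansion sketch only */
static void fft32(FFTComplex *z)
{
    fft16(z);                           /* first half,     n/2 points   */
    fft8(z + 16);                       /* third quarter,  n/4 points   */
    fft8(z + 24);                       /* fourth quarter, n/4 points   */
    pass(z, FFT_NAME(ff_cos_32), 4);    /* merge with the twiddle table */
}
/* fft_calc_c() with s->nbits == 5 then ends up in fft_dispatch[3], i.e. fft32. */
#endif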
@ -1,51 +0,0 @@
/*
 * Copyright (c) 2012
 * MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Authors:  Stanislav Ocovaj (socovaj@mips.com)
 *           Goran Cordasic   (goran@mips.com)
 *           Djordje Pesut    (djordje@mips.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define FFT_FLOAT 0
#include "mdct_template.c"
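/*
 * The FFT_FLOAT value chosen above decides which sample type the shared
 * mdct_template.c is compiled for.  A sketch of the switch in fft.h (assumed
 * here, since that header is not part of this excerpt):
 */
#if 0   /* illustrative assumption about fft.h, not verbatim */
#if FFT_FLOAT
typedef float   FFTSample;   /* float build (the FFT_FLOAT 1 companion below) */
#else
typedef int32_t FFTSample;   /* 32-bit fixed-point build of this file         */
#endif
#endif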
@ -1,20 +0,0 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define FFT_FLOAT 1
#include "mdct_template.c"
@ -1,209 +0,0 @@
/*
 * MDCT/IMDCT transforms
 * Copyright (c) 2002 Fabrice Bellard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdlib.h>
#include <string.h>
#include "libavutil/common.h"
#include "libavutil/libm.h"
#include "libavutil/mathematics.h"
#include "fft.h"
#include "fft-internal.h"

/**
 * @file
 * MDCT/IMDCT transforms.
 */

#if FFT_FLOAT
# define RSCALE(x, y) ((x) + (y))
#else
# define RSCALE(x, y) ((int)((x) + (unsigned)(y) + 32) >> 6)
#endif
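/*
 * Worked example of the fixed-point RSCALE branch above: the two operands are
 * summed and then scaled down by 64 with rounding, the +32 being half of that
 * divisor.  For instance
 *
 *     RSCALE(100, 27) = (100 + 27 + 32) >> 6 = 159 >> 6 = 2    (159/64 = 2.48...)
 *
 * whereas the float branch is a plain addition with no rescaling.
 */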
/**
 * init MDCT or IMDCT computation.
 */
av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale)
{
    int n, n4, i;
    double alpha, theta;
    int tstep;

    memset(s, 0, sizeof(*s));
    n = 1 << nbits;
    s->mdct_bits = nbits;
    s->mdct_size = n;
    n4 = n >> 2;
    s->mdct_permutation = FF_MDCT_PERM_NONE;

    if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0)
        goto fail;

    s->tcos = av_malloc_array(n/2, sizeof(FFTSample));
    if (!s->tcos)
        goto fail;

    switch (s->mdct_permutation) {
    case FF_MDCT_PERM_NONE:
        s->tsin = s->tcos + n4;
        tstep = 1;
        break;
    case FF_MDCT_PERM_INTERLEAVE:
        s->tsin = s->tcos + 1;
        tstep = 2;
        break;
    default:
        goto fail;
    }

    theta = 1.0 / 8.0 + (scale < 0 ? n4 : 0);
    scale = sqrt(fabs(scale));
    for(i=0;i<n4;i++) {
        alpha = 2 * M_PI * (i + theta) / n;
#if !FFT_FLOAT
        s->tcos[i*tstep] = lrint(-cos(alpha) * 2147483648.0);
        s->tsin[i*tstep] = lrint(-sin(alpha) * 2147483648.0);
#else
        s->tcos[i*tstep] = FIX15(-cos(alpha) * scale);
        s->tsin[i*tstep] = FIX15(-sin(alpha) * scale);
#endif
    }
    return 0;
fail:
    ff_mdct_end(s);
    return -1;
}
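/*
 * Minimal usage sketch for the functions in this file (illustrative only: the
 * buffer size and scale value are made-up, everything else follows the
 * declarations in this file):
 */
#if 0
static void imdct_half_example(void)
{
    FFTContext mdct;
    FFTSample in[16], out[16];              /* N/2 samples each, N = 32  */

    if (ff_mdct_init(&mdct, 5, 1, 1.0) < 0) /* nbits = 5  ->  N = 1 << 5 */
        return;
    /* ... fill in[] with N/2 MDCT coefficients ... */
    ff_imdct_half_c(&mdct, out, in);        /* middle half of the IMDCT  */
    ff_mdct_end(&mdct);
}
#endif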
/**
 * Compute the middle half of the inverse MDCT of size N = 2^nbits,
 * thus excluding the parts that can be derived by symmetry
 * @param output N/2 samples
 * @param input N/2 samples
 */
void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    int k, n8, n4, n2, n, j;
    const uint16_t *revtab = s->revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    const FFTSample *in1, *in2;
    FFTComplex *z = (FFTComplex *)output;

    n = 1 << s->mdct_bits;
    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;

    /* pre rotation */
    in1 = input;
    in2 = input + n2 - 1;
    for(k = 0; k < n4; k++) {
        j=revtab[k];
        CMUL(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]);
        in1 += 2;
        in2 -= 2;
    }
    s->fft_calc(s, z);

    /* post rotation + reordering */
    for(k = 0; k < n8; k++) {
        FFTSample r0, i0, r1, i1;
        CMUL(r0, i1, z[n8-k-1].im, z[n8-k-1].re, tsin[n8-k-1], tcos[n8-k-1]);
        CMUL(r1, i0, z[n8+k  ].im, z[n8+k  ].re, tsin[n8+k  ], tcos[n8+k  ]);
        z[n8-k-1].re = r0;
        z[n8-k-1].im = i0;
        z[n8+k  ].re = r1;
        z[n8+k  ].im = i1;
    }
}
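/*
 * Reading aid for the pre-rotation loop above (a sketch; CMUL lives in
 * fft-internal.h, which is not part of this excerpt): in the float build it
 * behaves like a complex multiply,
 *
 *     CMUL(dre, dim, are, aim, bre, bim)  ~  dre = are*bre - aim*bim,
 *                                            dim = are*bim + aim*bre,
 *
 * so each iteration stores (in2 + i*in1) * (tcos[k] + i*tsin[k]) into the
 * bit-reversed slot z[revtab[k]] before the in-place FFT runs.
 */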
/**
 * Compute inverse MDCT of size N = 2^nbits
 * @param output N samples
 * @param input N/2 samples
 */
void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    int k;
    int n = 1 << s->mdct_bits;
    int n2 = n >> 1;
    int n4 = n >> 2;

    ff_imdct_half_c(s, output+n4, input);

    for(k = 0; k < n4; k++) {
        output[k] = -output[n2-k-1];
        output[n-k-1] = output[n2+k];
    }
}

/**
 * Compute MDCT of size N = 2^nbits
 * @param input N samples
 * @param out N/2 samples
 */
void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const FFTSample *input)
{
    int i, j, n, n8, n4, n2, n3;
    FFTDouble re, im;
    const uint16_t *revtab = s->revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    FFTComplex *x = (FFTComplex *)out;

    n = 1 << s->mdct_bits;
    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;
    n3 = 3 * n4;

    /* pre rotation */
    for(i=0;i<n8;i++) {
        re = RSCALE(-input[2*i+n3], - input[n3-1-2*i]);
        im = RSCALE(-input[n4+2*i], + input[n4-1-2*i]);
        j = revtab[i];
        CMUL(x[j].re, x[j].im, re, im, -tcos[i], tsin[i]);

        re = RSCALE( input[2*i]   , - input[n2-1-2*i]);
        im = RSCALE(-input[n2+2*i], - input[ n-1-2*i]);
        j = revtab[n8 + i];
        CMUL(x[j].re, x[j].im, re, im, -tcos[n8 + i], tsin[n8 + i]);
    }

    s->fft_calc(s, x);

    /* post rotation */
    for(i=0;i<n8;i++) {
        FFTSample r0, i0, r1, i1;
        CMUL(i1, r0, x[n8-i-1].re, x[n8-i-1].im, -tsin[n8-i-1], -tcos[n8-i-1]);
        CMUL(i0, r1, x[n8+i  ].re, x[n8+i  ].im, -tsin[n8+i  ], -tcos[n8+i  ]);
        x[n8-i-1].re = r0;
        x[n8-i-1].im = i0;
        x[n8+i  ].re = r1;
        x[n8+i  ].im = i1;
    }
}

av_cold void ff_mdct_end(FFTContext *s)
{
    av_freep(&s->tcos);
    ff_fft_end(s);
}
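/*
 * Worked example of the symmetry step in ff_imdct_calc_c() above for a tiny
 * transform with N = 8 (n2 = 4, n4 = 2): ff_imdct_half_c() writes the middle
 * half into output[2..5], and the loop mirrors it out to the full window as
 *
 *     k = 0:  output[0] = -output[3];   output[7] = output[4];
 *     k = 1:  output[1] = -output[2];   output[6] = output[5];
 */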
@ -13,7 +13,6 @@ MIPSFPU-OBJS-$(CONFIG_AMRWB_DECODER) += mips/acelp_filters_mips.o \
                                        mips/acelp_vectors_mips.o
MIPSFPU-OBJS-$(CONFIG_MPEGAUDIODSP) += mips/mpegaudiodsp_mips_float.o
MIPSDSP-OBJS-$(CONFIG_MPEGAUDIODSP) += mips/mpegaudiodsp_mips_fixed.o
MIPSFPU-OBJS-$(CONFIG_FFT)          += mips/fft_mips.o
MIPSFPU-OBJS-$(CONFIG_FMTCONVERT)   += mips/fmtconvert_mips.o
OBJS-$(CONFIG_AC3DSP)               += mips/ac3dsp_mips.o
OBJS-$(CONFIG_AAC_DECODER)          += mips/aacdec_mips.o \
@ -1,516 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2012
|
||||
* MIPS Technologies, Inc., California.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* Author: Stanislav Ocovaj (socovaj@mips.com)
|
||||
* Author: Zoran Lukic (zoranl@mips.com)
|
||||
*
|
||||
* Optimized MDCT/IMDCT and FFT transforms
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
#include "config.h"
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/fft.h"
|
||||
#include "libavcodec/fft_table.h"
|
||||
#include "libavutil/mips/asmdefs.h"
|
||||
|
||||
/**
|
||||
* FFT transform
|
||||
*/
|
||||
|
||||
#if HAVE_INLINE_ASM
|
||||
#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
|
||||
static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z)
|
||||
{
|
||||
int nbits, i, n, num_transforms, offset, step;
|
||||
int n4, n2, n34;
|
||||
FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
|
||||
FFTComplex *tmpz;
|
||||
float w_re, w_im;
|
||||
float *w_re_ptr, *w_im_ptr;
|
||||
const int fft_size = (1 << s->nbits);
|
||||
float pom, pom1, pom2, pom3;
|
||||
float temp, temp1, temp3, temp4;
|
||||
FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
|
||||
FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
|
||||
float f1 = 0.7071067812;
|
||||
|
||||
num_transforms = (21845 >> (17 - s->nbits)) | 1;
|
||||
|
||||
for (n=0; n<num_transforms; n++) {
|
||||
offset = ff_fft_offsets_lut[n] << 2;
|
||||
tmpz = z + offset;
|
||||
|
||||
tmp1 = tmpz[0].re + tmpz[1].re;
|
||||
tmp5 = tmpz[2].re + tmpz[3].re;
|
||||
tmp2 = tmpz[0].im + tmpz[1].im;
|
||||
tmp6 = tmpz[2].im + tmpz[3].im;
|
||||
tmp3 = tmpz[0].re - tmpz[1].re;
|
||||
tmp8 = tmpz[2].im - tmpz[3].im;
|
||||
tmp4 = tmpz[0].im - tmpz[1].im;
|
||||
tmp7 = tmpz[2].re - tmpz[3].re;
|
||||
|
||||
tmpz[0].re = tmp1 + tmp5;
|
||||
tmpz[2].re = tmp1 - tmp5;
|
||||
tmpz[0].im = tmp2 + tmp6;
|
||||
tmpz[2].im = tmp2 - tmp6;
|
||||
tmpz[1].re = tmp3 + tmp8;
|
||||
tmpz[3].re = tmp3 - tmp8;
|
||||
tmpz[1].im = tmp4 - tmp7;
|
||||
tmpz[3].im = tmp4 + tmp7;
|
||||
|
||||
}
|
||||
|
||||
if (fft_size < 8)
|
||||
return;
|
||||
|
||||
num_transforms = (num_transforms >> 1) | 1;
|
||||
|
||||
for (n=0; n<num_transforms; n++) {
|
||||
offset = ff_fft_offsets_lut[n] << 3;
|
||||
tmpz = z + offset;
|
||||
|
||||
__asm__ volatile (
|
||||
"lwc1 %[tmp1], 32(%[tmpz]) \n\t"
|
||||
"lwc1 %[pom], 40(%[tmpz]) \n\t"
|
||||
"lwc1 %[tmp3], 48(%[tmpz]) \n\t"
|
||||
"lwc1 %[pom1], 56(%[tmpz]) \n\t"
|
||||
"lwc1 %[tmp2], 36(%[tmpz]) \n\t"
|
||||
"lwc1 %[pom2], 44(%[tmpz]) \n\t"
|
||||
"lwc1 %[pom3], 60(%[tmpz]) \n\t"
|
||||
"lwc1 %[tmp4], 52(%[tmpz]) \n\t"
|
||||
"add.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re + tmpz[5].re;
|
||||
"add.s %[tmp3], %[tmp3], %[pom1] \n\t" // tmp3 = tmpz[6].re + tmpz[7].re;
|
||||
"add.s %[tmp2], %[tmp2], %[pom2] \n\t" // tmp2 = tmpz[4].im + tmpz[5].im;
|
||||
"lwc1 %[pom], 40(%[tmpz]) \n\t"
|
||||
"add.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im + tmpz[7].im;
|
||||
"add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
|
||||
"sub.s %[tmp7], %[tmp1], %[tmp3] \n\t" // tmp7 = tmp1 - tmp3;
|
||||
"lwc1 %[tmp1], 32(%[tmpz]) \n\t"
|
||||
"lwc1 %[pom1], 44(%[tmpz]) \n\t"
|
||||
"add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
|
||||
"sub.s %[tmp8], %[tmp2], %[tmp4] \n\t" // tmp8 = tmp2 - tmp4;
|
||||
"lwc1 %[tmp2], 36(%[tmpz]) \n\t"
|
||||
"lwc1 %[pom2], 56(%[tmpz]) \n\t"
|
||||
"lwc1 %[pom3], 60(%[tmpz]) \n\t"
|
||||
"lwc1 %[tmp3], 48(%[tmpz]) \n\t"
|
||||
"lwc1 %[tmp4], 52(%[tmpz]) \n\t"
|
||||
"sub.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re - tmpz[5].re;
|
||||
"lwc1 %[pom], 0(%[tmpz]) \n\t"
|
||||
"sub.s %[tmp2], %[tmp2], %[pom1] \n\t" // tmp2 = tmpz[4].im - tmpz[5].im;
|
||||
"sub.s %[tmp3], %[tmp3], %[pom2] \n\t" // tmp3 = tmpz[6].re - tmpz[7].re;
|
||||
"lwc1 %[pom2], 4(%[tmpz]) \n\t"
|
||||
"sub.s %[pom1], %[pom], %[tmp5] \n\t"
|
||||
"sub.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im - tmpz[7].im;
|
||||
"add.s %[pom3], %[pom], %[tmp5] \n\t"
|
||||
"sub.s %[pom], %[pom2], %[tmp6] \n\t"
|
||||
"add.s %[pom2], %[pom2], %[tmp6] \n\t"
|
||||
"swc1 %[pom1], 32(%[tmpz]) \n\t" // tmpz[4].re = tmpz[0].re - tmp5;
|
||||
"swc1 %[pom3], 0(%[tmpz]) \n\t" // tmpz[0].re = tmpz[0].re + tmp5;
|
||||
"swc1 %[pom], 36(%[tmpz]) \n\t" // tmpz[4].im = tmpz[0].im - tmp6;
|
||||
"swc1 %[pom2], 4(%[tmpz]) \n\t" // tmpz[0].im = tmpz[0].im + tmp6;
|
||||
"lwc1 %[pom1], 16(%[tmpz]) \n\t"
|
||||
"lwc1 %[pom3], 20(%[tmpz]) \n\t"
|
||||
"add.s %[temp1],%[tmp1], %[tmp2] \n\t"
|
||||
"sub.s %[temp], %[pom1], %[tmp8] \n\t"
|
||||
"add.s %[pom2], %[pom3], %[tmp7] \n\t"
|
||||
"sub.s %[temp3],%[tmp3], %[tmp4] \n\t"
|
||||
"sub.s %[temp4],%[tmp2], %[tmp1] \n\t"
|
||||
"swc1 %[temp], 48(%[tmpz]) \n\t" // tmpz[6].re = tmpz[2].re - tmp8;
|
||||
"swc1 %[pom2], 52(%[tmpz]) \n\t" // tmpz[6].im = tmpz[2].im + tmp7;
|
||||
"add.s %[pom1], %[pom1], %[tmp8] \n\t"
|
||||
"sub.s %[pom3], %[pom3], %[tmp7] \n\t"
|
||||
"add.s %[tmp3], %[tmp3], %[tmp4] \n\t"
|
||||
"mul.s %[tmp5], %[f1], %[temp1] \n\t" // tmp5 = pom * (tmp1 + tmp2);
|
||||
"mul.s %[tmp7], %[f1], %[temp3] \n\t" // tmp7 = pom * (tmp3 - tmp4);
|
||||
"mul.s %[tmp6], %[f1], %[temp4] \n\t" // tmp6 = pom * (tmp2 - tmp1);
|
||||
"mul.s %[tmp8], %[f1], %[tmp3] \n\t" // tmp8 = pom * (tmp3 + tmp4);
|
||||
"swc1 %[pom1], 16(%[tmpz]) \n\t" // tmpz[2].re = tmpz[2].re + tmp8;
|
||||
"swc1 %[pom3], 20(%[tmpz]) \n\t" // tmpz[2].im = tmpz[2].im - tmp7;
|
||||
"add.s %[tmp1], %[tmp5], %[tmp7] \n\t" // tmp1 = tmp5 + tmp7;
|
||||
"sub.s %[tmp3], %[tmp5], %[tmp7] \n\t" // tmp3 = tmp5 - tmp7;
|
||||
"add.s %[tmp2], %[tmp6], %[tmp8] \n\t" // tmp2 = tmp6 + tmp8;
|
||||
"sub.s %[tmp4], %[tmp6], %[tmp8] \n\t" // tmp4 = tmp6 - tmp8;
|
||||
"lwc1 %[temp], 8(%[tmpz]) \n\t"
|
||||
"lwc1 %[temp1],12(%[tmpz]) \n\t"
|
||||
"lwc1 %[pom], 24(%[tmpz]) \n\t"
|
||||
"lwc1 %[pom2], 28(%[tmpz]) \n\t"
|
||||
"sub.s %[temp4],%[temp], %[tmp1] \n\t"
|
||||
"sub.s %[temp3],%[temp1], %[tmp2] \n\t"
|
||||
"add.s %[temp], %[temp], %[tmp1] \n\t"
|
||||
"add.s %[temp1],%[temp1], %[tmp2] \n\t"
|
||||
"sub.s %[pom1], %[pom], %[tmp4] \n\t"
|
||||
"add.s %[pom3], %[pom2], %[tmp3] \n\t"
|
||||
"add.s %[pom], %[pom], %[tmp4] \n\t"
|
||||
"sub.s %[pom2], %[pom2], %[tmp3] \n\t"
|
||||
"swc1 %[temp4],40(%[tmpz]) \n\t" // tmpz[5].re = tmpz[1].re - tmp1;
|
||||
"swc1 %[temp3],44(%[tmpz]) \n\t" // tmpz[5].im = tmpz[1].im - tmp2;
|
||||
"swc1 %[temp], 8(%[tmpz]) \n\t" // tmpz[1].re = tmpz[1].re + tmp1;
|
||||
"swc1 %[temp1],12(%[tmpz]) \n\t" // tmpz[1].im = tmpz[1].im + tmp2;
|
||||
"swc1 %[pom1], 56(%[tmpz]) \n\t" // tmpz[7].re = tmpz[3].re - tmp4;
|
||||
"swc1 %[pom3], 60(%[tmpz]) \n\t" // tmpz[7].im = tmpz[3].im + tmp3;
|
||||
"swc1 %[pom], 24(%[tmpz]) \n\t" // tmpz[3].re = tmpz[3].re + tmp4;
|
||||
"swc1 %[pom2], 28(%[tmpz]) \n\t" // tmpz[3].im = tmpz[3].im - tmp3;
|
||||
: [tmp1]"=&f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
|
||||
[tmp3]"=&f"(tmp3), [tmp2]"=&f"(tmp2), [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp7]"=&f"(tmp7),
|
||||
[tmp6]"=&f"(tmp6), [tmp8]"=&f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
|
||||
[temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
|
||||
: [tmpz]"r"(tmpz), [f1]"f"(f1)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
step = 1 << (MAX_LOG2_NFFT - 4);
|
||||
n4 = 4;
|
||||
|
||||
for (nbits=4; nbits<=s->nbits; nbits++) {
|
||||
num_transforms = (num_transforms >> 1) | 1;
|
||||
n2 = 2 * n4;
|
||||
n34 = 3 * n4;
|
||||
|
||||
for (n=0; n<num_transforms; n++) {
|
||||
offset = ff_fft_offsets_lut[n] << nbits;
|
||||
tmpz = z + offset;
|
||||
|
||||
tmpz_n2 = tmpz + n2;
|
||||
tmpz_n4 = tmpz + n4;
|
||||
tmpz_n34 = tmpz + n34;
|
||||
|
||||
__asm__ volatile (
|
||||
"lwc1 %[pom1], 0(%[tmpz_n2]) \n\t"
|
||||
"lwc1 %[pom], 0(%[tmpz_n34]) \n\t"
|
||||
"lwc1 %[pom2], 4(%[tmpz_n2]) \n\t"
|
||||
"lwc1 %[pom3], 4(%[tmpz_n34]) \n\t"
|
||||
"lwc1 %[temp1],0(%[tmpz]) \n\t"
|
||||
"lwc1 %[temp3],4(%[tmpz]) \n\t"
|
||||
"add.s %[tmp5], %[pom1], %[pom] \n\t" // tmp5 = tmpz[ n2].re + tmpz[n34].re;
|
||||
"sub.s %[tmp1], %[pom1], %[pom] \n\t" // tmp1 = tmpz[ n2].re - tmpz[n34].re;
|
||||
"add.s %[tmp6], %[pom2], %[pom3] \n\t" // tmp6 = tmpz[ n2].im + tmpz[n34].im;
|
||||
"sub.s %[tmp2], %[pom2], %[pom3] \n\t" // tmp2 = tmpz[ n2].im - tmpz[n34].im;
|
||||
"sub.s %[temp], %[temp1], %[tmp5] \n\t"
|
||||
"add.s %[temp1],%[temp1], %[tmp5] \n\t"
|
||||
"sub.s %[temp4],%[temp3], %[tmp6] \n\t"
|
||||
"add.s %[temp3],%[temp3], %[tmp6] \n\t"
|
||||
"swc1 %[temp], 0(%[tmpz_n2]) \n\t" // tmpz[ n2].re = tmpz[ 0].re - tmp5;
|
||||
"swc1 %[temp1],0(%[tmpz]) \n\t" // tmpz[ 0].re = tmpz[ 0].re + tmp5;
|
||||
"lwc1 %[pom1], 0(%[tmpz_n4]) \n\t"
|
||||
"swc1 %[temp4],4(%[tmpz_n2]) \n\t" // tmpz[ n2].im = tmpz[ 0].im - tmp6;
|
||||
"lwc1 %[temp], 4(%[tmpz_n4]) \n\t"
|
||||
"swc1 %[temp3],4(%[tmpz]) \n\t" // tmpz[ 0].im = tmpz[ 0].im + tmp6;
|
||||
"sub.s %[pom], %[pom1], %[tmp2] \n\t"
|
||||
"add.s %[pom1], %[pom1], %[tmp2] \n\t"
|
||||
"add.s %[temp1],%[temp], %[tmp1] \n\t"
|
||||
"sub.s %[temp], %[temp], %[tmp1] \n\t"
|
||||
"swc1 %[pom], 0(%[tmpz_n34]) \n\t" // tmpz[n34].re = tmpz[n4].re - tmp2;
|
||||
"swc1 %[pom1], 0(%[tmpz_n4]) \n\t" // tmpz[ n4].re = tmpz[n4].re + tmp2;
|
||||
"swc1 %[temp1],4(%[tmpz_n34]) \n\t" // tmpz[n34].im = tmpz[n4].im + tmp1;
|
||||
"swc1 %[temp], 4(%[tmpz_n4]) \n\t" // tmpz[ n4].im = tmpz[n4].im - tmp1;
|
||||
: [tmp5]"=&f"(tmp5),
|
||||
[tmp1]"=&f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
|
||||
[tmp2]"=&f"(tmp2), [tmp6]"=&f"(tmp6), [pom3]"=&f"(pom3),
|
||||
[temp]"=&f"(temp), [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
|
||||
: [tmpz]"r"(tmpz), [tmpz_n2]"r"(tmpz_n2), [tmpz_n34]"r"(tmpz_n34), [tmpz_n4]"r"(tmpz_n4)
|
||||
: "memory"
|
||||
);
|
||||
|
||||
w_re_ptr = (float*)(ff_cos_131072 + step);
|
||||
w_im_ptr = (float*)(ff_cos_131072 + MAX_FFT_SIZE/4 - step);
|
||||
|
||||
for (i=1; i<n4; i++) {
|
||||
w_re = w_re_ptr[0];
|
||||
w_im = w_im_ptr[0];
|
||||
tmpz_n2_i = tmpz_n2 + i;
|
||||
tmpz_n4_i = tmpz_n4 + i;
|
||||
tmpz_n34_i= tmpz_n34 + i;
|
||||
tmpz_i = tmpz + i;
|
||||
|
||||
__asm__ volatile (
|
||||
"lwc1 %[temp], 0(%[tmpz_n2_i]) \n\t"
|
||||
"lwc1 %[temp1], 4(%[tmpz_n2_i]) \n\t"
|
||||
"lwc1 %[pom], 0(%[tmpz_n34_i]) \n\t"
|
||||
"lwc1 %[pom1], 4(%[tmpz_n34_i]) \n\t"
|
||||
"mul.s %[temp3], %[w_im], %[temp] \n\t"
|
||||
"mul.s %[temp4], %[w_im], %[temp1] \n\t"
|
||||
"mul.s %[pom2], %[w_im], %[pom1] \n\t"
|
||||
"mul.s %[pom3], %[w_im], %[pom] \n\t"
|
||||
"msub.s %[tmp2], %[temp3], %[w_re], %[temp1] \n\t" // tmp2 = w_re * tmpz[ n2+i].im - w_im * tmpz[ n2+i].re;
|
||||
"madd.s %[tmp1], %[temp4], %[w_re], %[temp] \n\t" // tmp1 = w_re * tmpz[ n2+i].re + w_im * tmpz[ n2+i].im;
|
||||
"msub.s %[tmp3], %[pom2], %[w_re], %[pom] \n\t" // tmp3 = w_re * tmpz[n34+i].re - w_im * tmpz[n34+i].im;
|
||||
"madd.s %[tmp4], %[pom3], %[w_re], %[pom1] \n\t" // tmp4 = w_re * tmpz[n34+i].im + w_im * tmpz[n34+i].re;
|
||||
"lwc1 %[temp], 0(%[tmpz_i]) \n\t"
|
||||
"lwc1 %[pom], 4(%[tmpz_i]) \n\t"
|
||||
"add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
|
||||
"sub.s %[tmp1], %[tmp1], %[tmp3] \n\t" // tmp1 = tmp1 - tmp3;
|
||||
"add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
|
||||
"sub.s %[tmp2], %[tmp2], %[tmp4] \n\t" // tmp2 = tmp2 - tmp4;
|
||||
"sub.s %[temp1], %[temp], %[tmp5] \n\t"
|
||||
"add.s %[temp], %[temp], %[tmp5] \n\t"
|
||||
"sub.s %[pom1], %[pom], %[tmp6] \n\t"
|
||||
"add.s %[pom], %[pom], %[tmp6] \n\t"
|
||||
"lwc1 %[temp3], 0(%[tmpz_n4_i]) \n\t"
|
||||
"lwc1 %[pom2], 4(%[tmpz_n4_i]) \n\t"
|
||||
"swc1 %[temp1], 0(%[tmpz_n2_i]) \n\t" // tmpz[ n2+i].re = tmpz[ i].re - tmp5;
|
||||
"swc1 %[temp], 0(%[tmpz_i]) \n\t" // tmpz[ i].re = tmpz[ i].re + tmp5;
|
||||
"swc1 %[pom1], 4(%[tmpz_n2_i]) \n\t" // tmpz[ n2+i].im = tmpz[ i].im - tmp6;
|
||||
"swc1 %[pom] , 4(%[tmpz_i]) \n\t" // tmpz[ i].im = tmpz[ i].im + tmp6;
|
||||
"sub.s %[temp4], %[temp3], %[tmp2] \n\t"
|
||||
"add.s %[pom3], %[pom2], %[tmp1] \n\t"
|
||||
"add.s %[temp3], %[temp3], %[tmp2] \n\t"
|
||||
"sub.s %[pom2], %[pom2], %[tmp1] \n\t"
|
||||
"swc1 %[temp4], 0(%[tmpz_n34_i]) \n\t" // tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
|
||||
"swc1 %[pom3], 4(%[tmpz_n34_i]) \n\t" // tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
|
||||
"swc1 %[temp3], 0(%[tmpz_n4_i]) \n\t" // tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
|
||||
"swc1 %[pom2], 4(%[tmpz_n4_i]) \n\t" // tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
|
||||
: [tmp1]"=&f"(tmp1), [tmp2]"=&f" (tmp2), [temp]"=&f"(temp), [tmp3]"=&f"(tmp3),
|
||||
[tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp6]"=&f"(tmp6),
|
||||
[temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
|
||||
[pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2), [pom3]"=&f"(pom3)
|
||||
: [w_re]"f"(w_re), [w_im]"f"(w_im),
|
||||
[tmpz_i]"r"(tmpz_i),[tmpz_n2_i]"r"(tmpz_n2_i),
|
||||
[tmpz_n34_i]"r"(tmpz_n34_i), [tmpz_n4_i]"r"(tmpz_n4_i)
|
||||
: "memory"
|
||||
);
|
||||
w_re_ptr += step;
|
||||
w_im_ptr -= step;
|
||||
}
|
||||
}
|
||||
step >>= 1;
|
||||
n4 <<= 1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* MDCT/IMDCT transforms.
|
||||
*/
|
||||
|
||||
static void ff_imdct_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||
{
|
||||
int k, n8, n4, n2, n, j;
|
||||
const uint16_t *revtab = s->revtab;
|
||||
const FFTSample *tcos = s->tcos;
|
||||
const FFTSample *tsin = s->tsin;
|
||||
const FFTSample *in1, *in2, *in3, *in4;
|
||||
FFTComplex *z = (FFTComplex *)output;
|
||||
|
||||
int j1;
|
||||
const float *tcos1, *tsin1, *tcos2, *tsin2;
|
||||
float temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
|
||||
temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
|
||||
FFTComplex *z1, *z2;
|
||||
|
||||
n = 1 << s->mdct_bits;
|
||||
n2 = n >> 1;
|
||||
n4 = n >> 2;
|
||||
n8 = n >> 3;
|
||||
|
||||
/* pre rotation */
|
||||
in1 = input;
|
||||
in2 = input + n2 - 1;
|
||||
in3 = input + 2;
|
||||
in4 = input + n2 - 3;
|
||||
|
||||
tcos1 = tcos;
|
||||
tsin1 = tsin;
|
||||
|
||||
/* n4 = 64 or 128 */
|
||||
for(k = 0; k < n4; k += 2) {
|
||||
j = revtab[k ];
|
||||
j1 = revtab[k + 1];
|
||||
|
||||
__asm__ volatile (
|
||||
"lwc1 %[temp1], 0(%[in2]) \t\n"
|
||||
"lwc1 %[temp2], 0(%[tcos1]) \t\n"
|
||||
"lwc1 %[temp3], 0(%[tsin1]) \t\n"
|
||||
"lwc1 %[temp4], 0(%[in1]) \t\n"
|
||||
"lwc1 %[temp5], 0(%[in4]) \t\n"
|
||||
"mul.s %[temp9], %[temp1], %[temp2] \t\n"
|
||||
"mul.s %[temp10], %[temp1], %[temp3] \t\n"
|
||||
"lwc1 %[temp6], 4(%[tcos1]) \t\n"
|
||||
"lwc1 %[temp7], 4(%[tsin1]) \t\n"
|
||||
"nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n"
|
||||
"madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n"
|
||||
"mul.s %[temp11], %[temp5], %[temp6] \t\n"
|
||||
"mul.s %[temp12], %[temp5], %[temp7] \t\n"
|
||||
"lwc1 %[temp8], 0(%[in3]) \t\n"
|
||||
PTR_ADDIU " %[tcos1], %[tcos1], 8 \t\n"
|
||||
PTR_ADDIU " %[tsin1], %[tsin1], 8 \t\n"
|
||||
PTR_ADDIU " %[in1], %[in1], 16 \t\n"
|
||||
"nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n"
|
||||
"madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n"
|
||||
PTR_ADDIU " %[in2], %[in2], -16 \t\n"
|
||||
PTR_ADDIU " %[in3], %[in3], 16 \t\n"
|
||||
PTR_ADDIU " %[in4], %[in4], -16 \t\n"
|
||||
|
||||
: [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
|
||||
[temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
|
||||
[temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
|
||||
[temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
|
||||
[temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
|
||||
[temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
|
||||
[tsin1]"+r"(tsin1), [tcos1]"+r"(tcos1),
|
||||
[in1]"+r"(in1), [in2]"+r"(in2),
|
||||
[in3]"+r"(in3), [in4]"+r"(in4)
|
||||
:
|
||||
: "memory"
|
||||
);
|
||||
|
||||
z[j ].re = temp9;
|
||||
z[j ].im = temp10;
|
||||
z[j1].re = temp11;
|
||||
z[j1].im = temp12;
|
||||
}
|
||||
|
||||
s->fft_calc(s, z);
|
||||
|
||||
/* post rotation + reordering */
|
||||
/* n8 = 32 or 64 */
|
||||
for(k = 0; k < n8; k += 2) {
|
||||
tcos1 = &tcos[n8 - k - 2];
|
||||
tsin1 = &tsin[n8 - k - 2];
|
||||
tcos2 = &tcos[n8 + k];
|
||||
tsin2 = &tsin[n8 + k];
|
||||
z1 = &z[n8 - k - 2];
|
||||
z2 = &z[n8 + k ];
|
||||
|
||||
__asm__ volatile (
|
||||
"lwc1 %[temp1], 12(%[z1]) \t\n"
|
||||
"lwc1 %[temp2], 4(%[tsin1]) \t\n"
|
||||
"lwc1 %[temp3], 4(%[tcos1]) \t\n"
|
||||
"lwc1 %[temp4], 8(%[z1]) \t\n"
|
||||
"lwc1 %[temp5], 4(%[z1]) \t\n"
|
||||
"mul.s %[temp9], %[temp1], %[temp2] \t\n"
|
||||
"mul.s %[temp10], %[temp1], %[temp3] \t\n"
|
||||
"lwc1 %[temp6], 0(%[tsin1]) \t\n"
|
||||
"lwc1 %[temp7], 0(%[tcos1]) \t\n"
|
||||
"nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n"
|
||||
"madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n"
|
||||
"mul.s %[temp11], %[temp5], %[temp6] \t\n"
|
||||
"mul.s %[temp12], %[temp5], %[temp7] \t\n"
|
||||
"lwc1 %[temp8], 0(%[z1]) \t\n"
|
||||
"lwc1 %[temp1], 4(%[z2]) \t\n"
|
||||
"lwc1 %[temp2], 0(%[tsin2]) \t\n"
|
||||
"lwc1 %[temp3], 0(%[tcos2]) \t\n"
|
||||
"nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n"
|
||||
"madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n"
|
||||
"mul.s %[temp13], %[temp1], %[temp2] \t\n"
|
||||
"mul.s %[temp14], %[temp1], %[temp3] \t\n"
|
||||
"lwc1 %[temp4], 0(%[z2]) \t\n"
|
||||
"lwc1 %[temp5], 12(%[z2]) \t\n"
|
||||
"lwc1 %[temp6], 4(%[tsin2]) \t\n"
|
||||
"lwc1 %[temp7], 4(%[tcos2]) \t\n"
|
||||
"nmsub.s %[temp13], %[temp13], %[temp4], %[temp3] \t\n"
|
||||
"madd.s %[temp14], %[temp14], %[temp4], %[temp2] \t\n"
|
||||
"mul.s %[temp15], %[temp5], %[temp6] \t\n"
|
||||
"mul.s %[temp16], %[temp5], %[temp7] \t\n"
|
||||
"lwc1 %[temp8], 8(%[z2]) \t\n"
|
||||
"nmsub.s %[temp15], %[temp15], %[temp8], %[temp7] \t\n"
|
||||
"madd.s %[temp16], %[temp16], %[temp8], %[temp6] \t\n"
|
||||
: [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
|
||||
[temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
|
||||
[temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
|
||||
[temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
|
||||
[temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
|
||||
[temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
|
||||
[temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
|
||||
[temp15]"=&f"(temp15), [temp16]"=&f"(temp16)
|
||||
: [z1]"r"(z1), [z2]"r"(z2),
|
||||
[tsin1]"r"(tsin1), [tcos1]"r"(tcos1),
|
||||
[tsin2]"r"(tsin2), [tcos2]"r"(tcos2)
|
||||
: "memory"
|
||||
);
|
||||
|
||||
z1[1].re = temp9;
|
||||
z1[1].im = temp14;
|
||||
z2[0].re = temp13;
|
||||
z2[0].im = temp10;
|
||||
|
||||
z1[0].re = temp11;
|
||||
z1[0].im = temp16;
|
||||
z2[1].re = temp15;
|
||||
z2[1].im = temp12;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute inverse MDCT of size N = 2^nbits
|
||||
* @param output N samples
|
||||
* @param input N/2 samples
|
||||
*/
|
||||
static void ff_imdct_calc_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||
{
|
||||
int k;
|
||||
int n = 1 << s->mdct_bits;
|
||||
int n2 = n >> 1;
|
||||
int n4 = n >> 2;
|
||||
|
||||
ff_imdct_half_mips(s, output+n4, input);
|
||||
|
||||
for(k = 0; k < n4; k+=4) {
|
||||
output[k] = -output[n2-k-1];
|
||||
output[k+1] = -output[n2-k-2];
|
||||
output[k+2] = -output[n2-k-3];
|
||||
output[k+3] = -output[n2-k-4];
|
||||
|
||||
output[n-k-1] = output[n2+k];
|
||||
output[n-k-2] = output[n2+k+1];
|
||||
output[n-k-3] = output[n2+k+2];
|
||||
output[n-k-4] = output[n2+k+3];
|
||||
}
|
||||
}
|
||||
#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
|
||||
#endif /* HAVE_INLINE_ASM */
|
||||
|
||||
av_cold void ff_fft_init_mips(FFTContext *s)
|
||||
{
|
||||
ff_fft_lut_init();
|
||||
ff_init_ff_cos_tabs(17);
|
||||
|
||||
#if HAVE_INLINE_ASM
|
||||
#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
|
||||
s->fft_calc = ff_fft_calc_mips;
|
||||
#if CONFIG_MDCT
|
||||
s->imdct_calc = ff_imdct_calc_mips;
|
||||
s->imdct_half = ff_imdct_half_mips;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
@ -1,9 +1,6 @@
# subsystems
OBJS-$(CONFIG_AUDIODSP)   += ppc/audiodsp.o
OBJS-$(CONFIG_BLOCKDSP)   += ppc/blockdsp.o
OBJS-$(CONFIG_FFT)        += ppc/fft_init.o    \
                             ppc/fft_altivec.o \
                             ppc/fft_vsx.o
OBJS-$(CONFIG_FDCTDSP)    += ppc/fdctdsp.o
OBJS-$(CONFIG_FMTCONVERT) += ppc/fmtconvert_altivec.o
OBJS-$(CONFIG_H264CHROMA) += ppc/h264chroma_init.o
@ -1,458 +0,0 @@
|
||||
/*
|
||||
* FFT transform with Altivec optimizations
|
||||
* Copyright (c) 2009 Loren Merritt
|
||||
*
|
||||
* This algorithm (though not any of the implementation details) is
|
||||
* based on libdjbfft by D. J. Bernstein.
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/*
|
||||
* These functions are not individually interchangeable with the C versions.
|
||||
* While C takes arrays of FFTComplex, Altivec leaves intermediate results
|
||||
* in blocks as convenient to the vector size.
|
||||
* i.e. {4x real, 4x imaginary, 4x real, ...}
|
||||
*
|
||||
* I ignore standard calling convention.
|
||||
* Instead, the following registers are treated as global constants:
|
||||
* v14: zero
|
||||
* v15..v18: cosines
|
||||
* v19..v29: permutations
|
||||
* r9: 16
|
||||
* r12: ff_cos_tabs
|
||||
* and the rest are free for local use.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN
|
||||
|
||||
#include "asm.S"
|
||||
|
||||
.text
|
||||
|
||||
.macro addi2 ra, imm // add 32-bit immediate
|
||||
.if \imm & 0xffff
|
||||
addi \ra, \ra, \imm@l
|
||||
.endif
|
||||
.if (\imm+0x8000)>>16
|
||||
addis \ra, \ra, \imm@ha
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
|
||||
vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
|
||||
vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
|
||||
vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
|
||||
vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7}
|
||||
vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
|
||||
vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
|
||||
vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1}
|
||||
vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3}
|
||||
vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
|
||||
vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
|
||||
.endm
|
||||
|
||||
.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
|
||||
vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
|
||||
vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
|
||||
vperm \b2,\b0,\b1,v20
|
||||
vperm \b3,\b0,\b1,v21
|
||||
vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
|
||||
vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7}
|
||||
vaddfp \b0,\b2,\b3
|
||||
vsubfp \b1,\b2,\b3
|
||||
vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
|
||||
vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
|
||||
vmrghw \b2,\b0,\b1
|
||||
vperm \b3,\b0,\b1,v22
|
||||
vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1}
|
||||
vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3}
|
||||
vaddfp \b0,\b2,\b3
|
||||
vsubfp \b1,\b2,\b3
|
||||
vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
|
||||
vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
|
||||
vperm \b2,\b0,\b1,v23
|
||||
vperm \b3,\b0,\b1,v24
|
||||
.endm
|
||||
|
||||
.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
|
||||
vmrghw \b2,\b0,\b1 // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
|
||||
vmrglw \b3,\b0,\b1 // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
|
||||
vperm \a2,\a0,\a1,v20 // FFT4 ...
|
||||
vperm \a3,\a0,\a1,v21
|
||||
vaddfp \b0,\b2,\b3 // {t1,t3,t2,t4}
|
||||
vsubfp \b1,\b2,\b3 // {r5,r7,i5,i7}
|
||||
vperm \b4,\b1,\b1,v25 // vcprm(2,3,0,1) // {i5,i7,r5,r7}
|
||||
vaddfp \a0,\a2,\a3
|
||||
vsubfp \a1,\a2,\a3
|
||||
vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
|
||||
vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
|
||||
vmrghw \a2,\a0,\a1
|
||||
vperm \a3,\a0,\a1,v22
|
||||
vperm \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
|
||||
vperm \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
|
||||
vaddfp \a0,\a2,\a3
|
||||
vsubfp \a1,\a2,\a3
|
||||
vaddfp \b0,\b2,\b3 // {t1,t2,t9,ta}
|
||||
vsubfp \b1,\b2,\b3 // {t6,t5,tc,tb}
|
||||
vperm \a2,\a0,\a1,v23
|
||||
vperm \a3,\a0,\a1,v24
|
||||
vperm \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
|
||||
vperm \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
|
||||
vsubfp \b0,\a2,\b2 // {r4,r5,r6,r7}
|
||||
vsubfp \b1,\a3,\b3 // {i4,i5,i6,i7}
|
||||
vaddfp \a0,\a2,\b2 // {r0,r1,r2,r3}
|
||||
vaddfp \a1,\a3,\b3 // {i0,i1,i2,i3}
|
||||
.endm
|
||||
|
||||
.macro BF d0,d1,s0,s1
|
||||
vsubfp \d1,\s0,\s1
|
||||
vaddfp \d0,\s0,\s1
|
||||
.endm
|
||||
|
||||
.macro zip d0,d1,s0,s1
|
||||
vmrghw \d0,\s0,\s1
|
||||
vmrglw \d1,\s0,\s1
|
||||
.endm
|
||||
|
||||
.macro def_fft4 interleave
|
||||
fft4\interleave\()_altivec:
|
||||
lvx v0, 0,r3
|
||||
lvx v1,r9,r3
|
||||
FFT4 v0,v1,v2,v3
|
||||
.ifnb \interleave
|
||||
zip v0,v1,v2,v3
|
||||
stvx v0, 0,r3
|
||||
stvx v1,r9,r3
|
||||
.else
|
||||
stvx v2, 0,r3
|
||||
stvx v3,r9,r3
|
||||
.endif
|
||||
blr
|
||||
.endm
|
||||
|
||||
.macro def_fft8 interleave
|
||||
fft8\interleave\()_altivec:
|
||||
addi r4,r3,32
|
||||
lvx v0, 0,r3
|
||||
lvx v1,r9,r3
|
||||
lvx v2, 0,r4
|
||||
lvx v3,r9,r4
|
||||
FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8
|
||||
.ifnb \interleave
|
||||
zip v4,v5,v0,v1
|
||||
zip v6,v7,v2,v3
|
||||
stvx v4, 0,r3
|
||||
stvx v5,r9,r3
|
||||
stvx v6, 0,r4
|
||||
stvx v7,r9,r4
|
||||
.else
|
||||
stvx v0, 0,r3
|
||||
stvx v1,r9,r3
|
||||
stvx v2, 0,r4
|
||||
stvx v3,r9,r4
|
||||
.endif
|
||||
blr
|
||||
.endm
|
||||
|
||||
.macro def_fft16 interleave
|
||||
fft16\interleave\()_altivec:
|
||||
addi r5,r3,64
|
||||
addi r6,r3,96
|
||||
addi r4,r3,32
|
||||
lvx v0, 0,r5
|
||||
lvx v1,r9,r5
|
||||
lvx v2, 0,r6
|
||||
lvx v3,r9,r6
|
||||
FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
|
||||
lvx v0, 0,r3
|
||||
lvx v1,r9,r3
|
||||
lvx v2, 0,r4
|
||||
lvx v3,r9,r4
|
||||
FFT8 v0,v1,v2,v3,v8,v9,v10,v11,v12
|
||||
vmaddfp v8,v4,v15,v14 // r2*wre
|
||||
vmaddfp v9,v5,v15,v14 // i2*wre
|
||||
vmaddfp v10,v6,v15,v14 // r3*wre
|
||||
vmaddfp v11,v7,v15,v14 // i3*wre
|
||||
vmaddfp v8,v5,v16,v8 // i2*wim
|
||||
vnmsubfp v9,v4,v16,v9 // r2*wim
|
||||
vnmsubfp v10,v7,v16,v10 // i3*wim
|
||||
vmaddfp v11,v6,v16,v11 // r3*wim
|
||||
BF v10,v12,v10,v8
|
||||
BF v11,v13,v9,v11
|
||||
BF v0,v4,v0,v10
|
||||
BF v3,v7,v3,v12
|
||||
BF v1,v5,v1,v11
|
||||
BF v2,v6,v2,v13
|
||||
.ifnb \interleave
|
||||
zip v8, v9,v0,v1
|
||||
zip v10,v11,v2,v3
|
||||
zip v12,v13,v4,v5
|
||||
zip v14,v15,v6,v7
|
||||
stvx v8, 0,r3
|
||||
stvx v9,r9,r3
|
||||
stvx v10, 0,r4
|
||||
stvx v11,r9,r4
|
||||
stvx v12, 0,r5
|
||||
stvx v13,r9,r5
|
||||
stvx v14, 0,r6
|
||||
stvx v15,r9,r6
|
||||
.else
|
||||
stvx v0, 0,r3
|
||||
stvx v4, 0,r5
|
||||
stvx v3,r9,r4
|
||||
stvx v7,r9,r6
|
||||
stvx v1,r9,r3
|
||||
stvx v5,r9,r5
|
||||
stvx v2, 0,r4
|
||||
stvx v6, 0,r6
|
||||
.endif
|
||||
blr
|
||||
.endm
|
||||
|
||||
// void pass(float *z, float *wre, int n)
|
||||
.macro PASS interleave, suffix
|
||||
fft_pass\suffix\()_altivec:
|
||||
mtctr r5
|
||||
slwi r0,r5,4
|
||||
slwi r7,r5,6 // o2
|
||||
slwi r5,r5,5 // o1
|
||||
add r10,r5,r7 // o3
|
||||
add r0,r4,r0 // wim
|
||||
addi r6,r5,16 // o1+16
|
||||
addi r8,r7,16 // o2+16
|
||||
addi r11,r10,16 // o3+16
|
||||
1:
|
||||
lvx v8, 0,r4 // wre
|
||||
lvx v10, 0,r0 // wim
|
||||
sub r0,r0,r9
|
||||
lvx v9, 0,r0
|
||||
vperm v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. -3]
|
||||
lvx v4,r3,r7 // r2 = z[o2]
|
||||
lvx v5,r3,r8 // i2 = z[o2+16]
|
||||
lvx v6,r3,r10 // r3 = z[o3]
|
||||
lvx v7,r3,r11 // i3 = z[o3+16]
|
||||
vmaddfp v10,v4,v8,v14 // r2*wre
|
||||
vmaddfp v11,v5,v8,v14 // i2*wre
|
||||
vmaddfp v12,v6,v8,v14 // r3*wre
|
||||
vmaddfp v13,v7,v8,v14 // i3*wre
|
||||
lvx v0, 0,r3 // r0 = z[0]
|
||||
lvx v3,r3,r6 // i1 = z[o1+16]
|
||||
vmaddfp v10,v5,v9,v10 // i2*wim
|
||||
vnmsubfp v11,v4,v9,v11 // r2*wim
|
||||
vnmsubfp v12,v7,v9,v12 // i3*wim
|
||||
vmaddfp v13,v6,v9,v13 // r3*wim
|
||||
lvx v1,r3,r9 // i0 = z[16]
|
||||
lvx v2,r3,r5 // r1 = z[o1]
|
||||
BF v12,v8,v12,v10
|
||||
BF v13,v9,v11,v13
|
||||
BF v0,v4,v0,v12
|
||||
BF v3,v7,v3,v8
|
||||
.if !\interleave
|
||||
stvx v0, 0,r3
|
||||
stvx v4,r3,r7
|
||||
stvx v3,r3,r6
|
||||
stvx v7,r3,r11
|
||||
.endif
|
||||
BF v1,v5,v1,v13
|
||||
BF v2,v6,v2,v9
|
||||
.if !\interleave
|
||||
stvx v1,r3,r9
|
||||
stvx v2,r3,r5
|
||||
stvx v5,r3,r8
|
||||
stvx v6,r3,r10
|
||||
.else
|
||||
vmrghw v8,v0,v1
|
||||
vmrglw v9,v0,v1
|
||||
stvx v8, 0,r3
|
||||
stvx v9,r3,r9
|
||||
vmrghw v8,v2,v3
|
||||
vmrglw v9,v2,v3
|
||||
stvx v8,r3,r5
|
||||
stvx v9,r3,r6
|
||||
vmrghw v8,v4,v5
|
||||
vmrglw v9,v4,v5
|
||||
stvx v8,r3,r7
|
||||
stvx v9,r3,r8
|
||||
vmrghw v8,v6,v7
|
||||
vmrglw v9,v6,v7
|
||||
stvx v8,r3,r10
|
||||
stvx v9,r3,r11
|
||||
.endif
|
||||
addi r3,r3,32
|
||||
addi r4,r4,16
|
||||
bdnz 1b
|
||||
sub r3,r3,r5
|
||||
blr
|
||||
.endm
|
||||
|
||||
#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */
|
||||
|
||||
#define WORD_0 0x00,0x01,0x02,0x03
|
||||
#define WORD_1 0x04,0x05,0x06,0x07
|
||||
#define WORD_2 0x08,0x09,0x0a,0x0b
|
||||
#define WORD_3 0x0c,0x0d,0x0e,0x0f
|
||||
#define WORD_s0 0x10,0x11,0x12,0x13
|
||||
#define WORD_s1 0x14,0x15,0x16,0x17
|
||||
#define WORD_s2 0x18,0x19,0x1a,0x1b
|
||||
#define WORD_s3 0x1c,0x1d,0x1e,0x1f
|
||||
|
||||
#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d
|
||||
|
||||
.rodata
|
||||
.align 4
|
||||
fft_data:
|
||||
.float 0, 0, 0, 0
|
||||
.float 1, 0.92387953, M_SQRT1_2, 0.38268343
|
||||
.float 0, 0.38268343, M_SQRT1_2, 0.92387953
|
||||
.float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
|
||||
.float M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
|
||||
vcprm(s0,3,2,1)
|
||||
vcprm(0,1,s2,s1)
|
||||
vcprm(2,3,s0,s3)
|
||||
vcprm(2,s3,3,s2)
|
||||
vcprm(0,1,s0,s1)
|
||||
vcprm(2,3,s2,s3)
|
||||
vcprm(2,3,0,1)
|
||||
vcprm(1,2,s3,s0)
|
||||
vcprm(0,3,s2,s1)
|
||||
vcprm(0,2,s1,s3)
|
||||
vcprm(1,3,s0,s2)
|
||||
|
||||
.macro lvm b, r, regs:vararg
|
||||
lvx \r, 0, \b
|
||||
addi \b, \b, 16
|
||||
.ifnb \regs
|
||||
lvm \b, \regs
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro stvm b, r, regs:vararg
|
||||
stvx \r, 0, \b
|
||||
addi \b, \b, 16
|
||||
.ifnb \regs
|
||||
stvm \b, \regs
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro fft_calc interleave
|
||||
extfunc ff_fft_calc\interleave\()_altivec
|
||||
mflr r0
|
||||
stp r0, 2*PS(R(1))
|
||||
stpu r1, -(160+16*PS)(R(1))
|
||||
get_got r11
|
||||
addi r6, r1, 16*PS
|
||||
stvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
||||
mfvrsave r0
|
||||
stw r0, 15*PS(R(1))
|
||||
#if __APPLE__
|
||||
li r6, 0xfffffffc
|
||||
#else
|
||||
li r6, -4
|
||||
#endif
|
||||
mtvrsave r6
|
||||
|
||||
movrel r6, fft_data, r11
|
||||
lvm r6, v14, v15, v16, v17, v18, v19, v20, v21
|
||||
lvm r6, v22, v23, v24, v25, v26, v27, v28, v29
|
||||
|
||||
li r9, 16
|
||||
movrel r12, X(ff_cos_tabs), r11
|
||||
|
||||
movrel r6, fft_dispatch_tab\interleave\()_altivec, r11
|
||||
lwz r3, 0(R(3))
|
||||
subi r3, r3, 2
|
||||
slwi r3, r3, 2+ARCH_PPC64
|
||||
lpx r3, r3, r6
|
||||
mtctr r3
|
||||
mr r3, r4
|
||||
bctrl
|
||||
|
||||
addi r6, r1, 16*PS
|
||||
lvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
|
||||
lwz r6, 15*PS(R(1))
|
||||
mtvrsave r6
|
||||
lp r1, 0(R(1))
|
||||
lp r0, 2*PS(R(1))
|
||||
mtlr r0
|
||||
blr
|
||||
.endm
|
||||
|
||||
.macro DECL_FFT suffix, bits, n, n2, n4
|
||||
fft\n\suffix\()_altivec:
|
||||
mflr r0
|
||||
stp r0,PS*(\bits-3)(R(1))
|
||||
bl fft\n2\()_altivec
|
||||
addi2 r3,\n*4
|
||||
bl fft\n4\()_altivec
|
||||
addi2 r3,\n*2
|
||||
bl fft\n4\()_altivec
|
||||
addi2 r3,\n*-6
|
||||
lp r0,PS*(\bits-3)(R(1))
|
||||
lp r4,\bits*PS(R(12))
|
||||
mtlr r0
|
||||
li r5,\n/16
|
||||
b fft_pass\suffix\()_altivec
|
||||
.endm
|
||||
|
||||
.macro DECL_FFTS interleave, suffix
|
||||
.text
|
||||
def_fft4 \suffix
|
||||
def_fft8 \suffix
|
||||
def_fft16 \suffix
|
||||
PASS \interleave, \suffix
|
||||
DECL_FFT \suffix, 5, 32, 16, 8
|
||||
DECL_FFT \suffix, 6, 64, 32, 16
|
||||
DECL_FFT \suffix, 7, 128, 64, 32
|
||||
DECL_FFT \suffix, 8, 256, 128, 64
|
||||
DECL_FFT \suffix, 9, 512, 256, 128
|
||||
DECL_FFT \suffix,10, 1024, 512, 256
|
||||
DECL_FFT \suffix,11, 2048, 1024, 512
|
||||
DECL_FFT \suffix,12, 4096, 2048, 1024
|
||||
DECL_FFT \suffix,13, 8192, 4096, 2048
|
||||
DECL_FFT \suffix,14,16384, 8192, 4096
|
||||
DECL_FFT \suffix,15,32768,16384, 8192
|
||||
DECL_FFT \suffix,16,65536,32768,16384
|
||||
|
||||
fft_calc \suffix
|
||||
|
||||
.rodata
|
||||
.align 3
|
||||
fft_dispatch_tab\suffix\()_altivec:
|
||||
PTR fft4\suffix\()_altivec
|
||||
PTR fft8\suffix\()_altivec
|
||||
PTR fft16\suffix\()_altivec
|
||||
PTR fft32\suffix\()_altivec
|
||||
PTR fft64\suffix\()_altivec
|
||||
PTR fft128\suffix\()_altivec
|
||||
PTR fft256\suffix\()_altivec
|
||||
PTR fft512\suffix\()_altivec
|
||||
PTR fft1024\suffix\()_altivec
|
||||
PTR fft2048\suffix\()_altivec
|
||||
PTR fft4096\suffix\()_altivec
|
||||
PTR fft8192\suffix\()_altivec
|
||||
PTR fft16384\suffix\()_altivec
|
||||
PTR fft32768\suffix\()_altivec
|
||||
PTR fft65536\suffix\()_altivec
|
||||
.endm
|
||||
|
||||
DECL_FFTS 0
|
||||
DECL_FFTS 1, _interleave
|
||||
|
||||
#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */
|
@ -1,168 +0,0 @@
|
||||
/*
|
||||
* FFT/IFFT transforms
|
||||
* AltiVec-enabled
|
||||
* Copyright (c) 2009 Loren Merritt
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/ppc/cpu.h"
|
||||
#include "libavutil/ppc/util_altivec.h"
|
||||
#include "libavcodec/fft.h"
|
||||
|
||||
/**
|
||||
* Do a complex FFT with the parameters defined in ff_fft_init().
|
||||
* The input data must be permuted before with s->revtab table.
|
||||
* No 1.0 / sqrt(n) normalization is done.
|
||||
* AltiVec-enabled:
|
||||
* This code assumes that the 'z' pointer is 16 bytes-aligned.
|
||||
* It also assumes all FFTComplex are 8 bytes-aligned pairs of floats.
|
||||
*/
|
||||
|
||||
#if HAVE_VSX
|
||||
#include "fft_vsx.h"
|
||||
#else
|
||||
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
|
||||
void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z);
|
||||
#endif
|
||||
|
||||
#if HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX)
|
||||
static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||
{
|
||||
int j, k;
|
||||
int n = 1 << s->mdct_bits;
|
||||
int n4 = n >> 2;
|
||||
int n8 = n >> 3;
|
||||
int n32 = n >> 5;
|
||||
const uint16_t *revtabj = s->revtab;
|
||||
const uint16_t *revtabk = s->revtab+n4;
|
||||
const vec_f *tcos = (const vec_f*)(s->tcos+n8);
|
||||
const vec_f *tsin = (const vec_f*)(s->tsin+n8);
|
||||
const vec_f *pin = (const vec_f*)(input+n4);
|
||||
vec_f *pout = (vec_f*)(output+n4);
|
||||
|
||||
/* pre rotation */
|
||||
k = n32-1;
|
||||
do {
|
||||
vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d;
|
||||
#define CMULA(p,o0,o1,o2,o3)\
|
||||
a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\
|
||||
b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\
|
||||
re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\
|
||||
im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\
|
||||
cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\
|
||||
sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\
|
||||
r##p = im*cos - re*sin;\
|
||||
i##p = re*cos + im*sin;
|
||||
#define STORE2(v,dst)\
|
||||
j = dst;\
|
||||
vec_ste(v, 0, output+j*2);\
|
||||
vec_ste(v, 4, output+j*2);
|
||||
#define STORE8(p)\
|
||||
a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\
|
||||
b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\
|
||||
c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\
|
||||
d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\
|
||||
STORE2(a, revtabk[ p*2-4]);\
|
||||
STORE2(b, revtabk[ p*2-3]);\
|
||||
STORE2(c, revtabj[-p*2+2]);\
|
||||
STORE2(d, revtabj[-p*2+3]);
|
||||
|
||||
cos0 = tcos[k];
|
||||
sin0 = tsin[k];
|
||||
cos1 = tcos[-k-1];
|
||||
sin1 = tsin[-k-1];
|
||||
CMULA(0, 0,1,2,3);
|
||||
CMULA(1, 2,3,0,1);
|
||||
STORE8(0);
|
||||
STORE8(1);
|
||||
revtabj += 4;
|
||||
revtabk -= 4;
|
||||
k--;
|
||||
} while(k >= 0);
|
||||
|
||||
#if HAVE_VSX
|
||||
ff_fft_calc_vsx(s, (FFTComplex*)output);
|
||||
#else
|
||||
ff_fft_calc_altivec(s, (FFTComplex*)output);
|
||||
#endif
|
||||
|
||||
/* post rotation + reordering */
|
||||
j = -n32;
|
||||
k = n32-1;
|
||||
do {
|
||||
vec_f cos,sin,re,im,a,b,c,d;
|
||||
#define CMULB(d0,d1,o)\
|
||||
re = pout[o*2];\
|
||||
im = pout[o*2+1];\
|
||||
cos = tcos[o];\
|
||||
sin = tsin[o];\
|
||||
d0 = im*sin - re*cos;\
|
||||
d1 = re*sin + im*cos;
|
||||
|
||||
CMULB(a,b,j);
|
||||
CMULB(c,d,k);
|
||||
pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2));
|
||||
pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0));
|
||||
pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2));
|
||||
pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0));
|
||||
j++;
|
||||
k--;
|
||||
} while(k >= 0);
|
||||
}
|
||||
|
||||
static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||
{
|
||||
int k;
|
||||
int n = 1 << s->mdct_bits;
|
||||
int n4 = n >> 2;
|
||||
int n16 = n >> 4;
|
||||
vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31};
|
||||
vec_u32 *p0 = (vec_u32*)(output+n4);
|
||||
vec_u32 *p1 = (vec_u32*)(output+n4*3);
|
||||
|
||||
imdct_half_altivec(s, output + n4, input);
|
||||
|
||||
for (k = 0; k < n16; k++) {
|
||||
vec_u32 a = p0[k] ^ sign;
|
||||
vec_u32 b = p1[-k-1];
|
||||
p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0));
|
||||
p1[k] = vec_perm(b, b, vcprm(3,2,1,0));
|
||||
}
|
||||
}
|
||||
#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX) */
|
||||
|
||||
av_cold void ff_fft_init_ppc(FFTContext *s)
|
||||
{
|
||||
#if HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX)
|
||||
if (!PPC_ALTIVEC(av_get_cpu_flags()))
|
||||
return;
|
||||
|
||||
#if HAVE_VSX
|
||||
s->fft_calc = ff_fft_calc_interleave_vsx;
|
||||
#else
|
||||
s->fft_calc = ff_fft_calc_interleave_altivec;
|
||||
#endif
|
||||
if (s->mdct_bits >= 5) {
|
||||
s->imdct_calc = imdct_calc_altivec;
|
||||
s->imdct_half = imdct_half_altivec;
|
||||
}
|
||||
#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */
|
||||
}
|
@ -1,226 +0,0 @@
|
||||
/*
|
||||
* FFT transform, optimized with VSX built-in functions
|
||||
* Copyright (c) 2014 Rong Yan
|
||||
*
|
||||
* This algorithm (though not any of the implementation details) is
|
||||
* based on libdjbfft by D. J. Bernstein.
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
|
||||
#include "config.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/ppc/util_altivec.h"
|
||||
#include "libavcodec/fft.h"
|
||||
#include "libavcodec/fft-internal.h"
|
||||
#include "fft_vsx.h"
|
||||
|
||||
#if HAVE_VSX
|
||||
|
||||
static void fft32_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
fft16_vsx_interleave(z);
|
||||
fft8_vsx_interleave(z+16);
|
||||
fft8_vsx_interleave(z+24);
|
||||
pass_vsx_interleave(z,ff_cos_32,4);
|
||||
}
|
||||
|
||||
static void fft64_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
fft32_vsx_interleave(z);
|
||||
fft16_vsx_interleave(z+32);
|
||||
fft16_vsx_interleave(z+48);
|
||||
pass_vsx_interleave(z,ff_cos_64, 8);
|
||||
}
|
||||
static void fft128_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
fft64_vsx_interleave(z);
|
||||
fft32_vsx_interleave(z+64);
|
||||
fft32_vsx_interleave(z+96);
|
||||
pass_vsx_interleave(z,ff_cos_128,16);
|
||||
}
|
||||
static void fft256_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
fft128_vsx_interleave(z);
|
||||
fft64_vsx_interleave(z+128);
|
||||
fft64_vsx_interleave(z+192);
|
||||
pass_vsx_interleave(z,ff_cos_256,32);
|
||||
}
|
||||
static void fft512_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
fft256_vsx_interleave(z);
|
||||
fft128_vsx_interleave(z+256);
|
||||
fft128_vsx_interleave(z+384);
|
||||
pass_vsx_interleave(z,ff_cos_512,64);
|
||||
}
|
||||
static void fft1024_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
fft512_vsx_interleave(z);
|
||||
fft256_vsx_interleave(z+512);
|
||||
fft256_vsx_interleave(z+768);
|
||||
pass_vsx_interleave(z,ff_cos_1024,128);
|
||||
|
||||
}
|
||||
static void fft2048_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
fft1024_vsx_interleave(z);
|
||||
fft512_vsx_interleave(z+1024);
|
||||
fft512_vsx_interleave(z+1536);
|
||||
pass_vsx_interleave(z,ff_cos_2048,256);
|
||||
}
|
||||
static void fft4096_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
fft2048_vsx_interleave(z);
|
||||
fft1024_vsx_interleave(z+2048);
|
||||
fft1024_vsx_interleave(z+3072);
|
||||
pass_vsx_interleave(z,ff_cos_4096, 512);
|
||||
}
|
||||
static void fft8192_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
fft4096_vsx_interleave(z);
|
||||
fft2048_vsx_interleave(z+4096);
|
||||
fft2048_vsx_interleave(z+6144);
|
||||
pass_vsx_interleave(z,ff_cos_8192,1024);
|
||||
}
|
||||
static void fft16384_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
fft8192_vsx_interleave(z);
|
||||
fft4096_vsx_interleave(z+8192);
|
||||
fft4096_vsx_interleave(z+12288);
|
||||
pass_vsx_interleave(z,ff_cos_16384,2048);
|
||||
}
|
||||
static void fft32768_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
fft16384_vsx_interleave(z);
|
||||
fft8192_vsx_interleave(z+16384);
|
||||
fft8192_vsx_interleave(z+24576);
|
||||
pass_vsx_interleave(z,ff_cos_32768,4096);
|
||||
}
|
||||
static void fft65536_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
fft32768_vsx_interleave(z);
|
||||
fft16384_vsx_interleave(z+32768);
|
||||
fft16384_vsx_interleave(z+49152);
|
||||
pass_vsx_interleave(z,ff_cos_65536,8192);
|
||||
}
|
||||
|
||||
static void fft32_vsx(FFTComplex *z)
|
||||
{
|
||||
fft16_vsx(z);
|
||||
fft8_vsx(z+16);
|
||||
fft8_vsx(z+24);
|
||||
pass_vsx(z,ff_cos_32,4);
|
||||
}
|
||||
|
||||
static void fft64_vsx(FFTComplex *z)
|
||||
{
|
||||
fft32_vsx(z);
|
||||
fft16_vsx(z+32);
|
||||
fft16_vsx(z+48);
|
||||
pass_vsx(z,ff_cos_64, 8);
|
||||
}
|
||||
static void fft128_vsx(FFTComplex *z)
|
||||
{
|
||||
fft64_vsx(z);
|
||||
fft32_vsx(z+64);
|
||||
fft32_vsx(z+96);
|
||||
pass_vsx(z,ff_cos_128,16);
|
||||
}
|
||||
static void fft256_vsx(FFTComplex *z)
|
||||
{
|
||||
fft128_vsx(z);
|
||||
fft64_vsx(z+128);
|
||||
fft64_vsx(z+192);
|
||||
pass_vsx(z,ff_cos_256,32);
|
||||
}
|
||||
static void fft512_vsx(FFTComplex *z)
|
||||
{
|
||||
fft256_vsx(z);
|
||||
fft128_vsx(z+256);
|
||||
fft128_vsx(z+384);
|
||||
pass_vsx(z,ff_cos_512,64);
|
||||
}
|
||||
static void fft1024_vsx(FFTComplex *z)
|
||||
{
|
||||
fft512_vsx(z);
|
||||
fft256_vsx(z+512);
|
||||
fft256_vsx(z+768);
|
||||
pass_vsx(z,ff_cos_1024,128);
|
||||
|
||||
}
|
||||
static void fft2048_vsx(FFTComplex *z)
|
||||
{
|
||||
fft1024_vsx(z);
|
||||
fft512_vsx(z+1024);
|
||||
fft512_vsx(z+1536);
|
||||
pass_vsx(z,ff_cos_2048,256);
|
||||
}
|
||||
static void fft4096_vsx(FFTComplex *z)
|
||||
{
|
||||
fft2048_vsx(z);
|
||||
fft1024_vsx(z+2048);
|
||||
fft1024_vsx(z+3072);
|
||||
pass_vsx(z,ff_cos_4096, 512);
|
||||
}
|
||||
static void fft8192_vsx(FFTComplex *z)
|
||||
{
|
||||
fft4096_vsx(z);
|
||||
fft2048_vsx(z+4096);
|
||||
fft2048_vsx(z+6144);
|
||||
pass_vsx(z,ff_cos_8192,1024);
|
||||
}
|
||||
static void fft16384_vsx(FFTComplex *z)
|
||||
{
|
||||
fft8192_vsx(z);
|
||||
fft4096_vsx(z+8192);
|
||||
fft4096_vsx(z+12288);
|
||||
pass_vsx(z,ff_cos_16384,2048);
|
||||
}
|
||||
static void fft32768_vsx(FFTComplex *z)
|
||||
{
|
||||
fft16384_vsx(z);
|
||||
fft8192_vsx(z+16384);
|
||||
fft8192_vsx(z+24576);
|
||||
pass_vsx(z,ff_cos_32768,4096);
|
||||
}
|
||||
static void fft65536_vsx(FFTComplex *z)
|
||||
{
|
||||
fft32768_vsx(z);
|
||||
fft16384_vsx(z+32768);
|
||||
fft16384_vsx(z+49152);
|
||||
pass_vsx(z,ff_cos_65536,8192);
|
||||
}
|
||||
|
||||
static void (* const fft_dispatch_vsx[])(FFTComplex*) = {
|
||||
fft4_vsx, fft8_vsx, fft16_vsx, fft32_vsx, fft64_vsx, fft128_vsx, fft256_vsx, fft512_vsx, fft1024_vsx,
|
||||
fft2048_vsx, fft4096_vsx, fft8192_vsx, fft16384_vsx, fft32768_vsx, fft65536_vsx,
|
||||
};
|
||||
static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = {
|
||||
fft4_vsx_interleave, fft8_vsx_interleave, fft16_vsx_interleave, fft32_vsx_interleave, fft64_vsx_interleave,
|
||||
fft128_vsx_interleave, fft256_vsx_interleave, fft512_vsx_interleave, fft1024_vsx_interleave,
|
||||
fft2048_vsx_interleave, fft4096_vsx_interleave, fft8192_vsx_interleave, fft16384_vsx_interleave, fft32768_vsx_interleave, fft65536_vsx_interleave,
|
||||
};
|
||||
void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z)
|
||||
{
|
||||
fft_dispatch_vsx_interleave[s->nbits-2](z);
|
||||
}
|
||||
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z)
|
||||
{
|
||||
fft_dispatch_vsx[s->nbits-2](z);
|
||||
}
|
||||
#endif /* HAVE_VSX */
|
@ -1,829 +0,0 @@
|
||||
#ifndef AVCODEC_PPC_FFT_VSX_H
|
||||
#define AVCODEC_PPC_FFT_VSX_H
|
||||
/*
|
||||
* FFT transform, optimized with VSX built-in functions
|
||||
* Copyright (c) 2014 Rong Yan Copyright (c) 2009 Loren Merritt
|
||||
*
|
||||
* This algorithm (though not any of the implementation details) is
|
||||
* based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S.
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
|
||||
#include "config.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/ppc/util_altivec.h"
|
||||
#include "libavcodec/fft.h"
|
||||
#include "libavcodec/fft-internal.h"
|
||||
|
||||
#if HAVE_VSX
|
||||
|
||||
void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
|
||||
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);
|
||||
|
||||
|
||||
#define byte_2complex (2*sizeof(FFTComplex))
|
||||
#define byte_4complex (4*sizeof(FFTComplex))
|
||||
#define byte_6complex (6*sizeof(FFTComplex))
|
||||
#define byte_8complex (8*sizeof(FFTComplex))
|
||||
#define byte_10complex (10*sizeof(FFTComplex))
|
||||
#define byte_12complex (12*sizeof(FFTComplex))
|
||||
#define byte_14complex (14*sizeof(FFTComplex))
|
||||
|
||||
inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
|
||||
{
|
||||
int o1 = n<<1;
|
||||
int o2 = n<<2;
|
||||
int o3 = o1+o2;
|
||||
int i1, i2, i3;
|
||||
FFTSample* out = (FFTSample*)z;
|
||||
const FFTSample *wim = wre+o1;
|
||||
vec_f vz0, vzo1, vzo2, vzo3;
|
||||
vec_f x0, x1, x2, x3;
|
||||
vec_f x4, x5, x6, x7;
|
||||
vec_f x8, x9, x10, x11;
|
||||
vec_f x12, x13, x14, x15;
|
||||
vec_f x16, x17, x18, x19;
|
||||
vec_f x20, x21, x22, x23;
|
||||
vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
|
||||
vec_f y0, y1, y2, y3;
|
||||
vec_f y4, y5, y8, y9;
|
||||
vec_f y10, y13, y14, y15;
|
||||
vec_f y16, y17, y18, y19;
|
||||
vec_f y20, y21, y22, y23;
|
||||
vec_f wr1, wi1, wr0, wi0;
|
||||
vec_f wr2, wi2, wr3, wi3;
|
||||
vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;
|
||||
|
||||
n = n-2;
|
||||
i1 = o1*sizeof(FFTComplex);
|
||||
i2 = o2*sizeof(FFTComplex);
|
||||
i3 = o3*sizeof(FFTComplex);
|
||||
vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i
|
||||
vzo2plus1 = vec_ld(i2+16, &(out[0]));
|
||||
vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i
|
||||
vzo3plus1 = vec_ld(i3+16, &(out[0]));
|
||||
vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i
|
||||
vz0plus1 = vec_ld(16, &(out[0]));
|
||||
vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i
|
||||
vzo1plus1 = vec_ld(i1+16, &(out[0]));
|
||||
|
||||
x0 = vec_add(vzo2, vzo3);
|
||||
x1 = vec_sub(vzo2, vzo3);
|
||||
y0 = vec_add(vzo2plus1, vzo3plus1);
|
||||
y1 = vec_sub(vzo2plus1, vzo3plus1);
|
||||
|
||||
wr1 = vec_splats(wre[1]);
|
||||
wi1 = vec_splats(wim[-1]);
|
||||
wi2 = vec_splats(wim[-2]);
|
||||
wi3 = vec_splats(wim[-3]);
|
||||
wr2 = vec_splats(wre[2]);
|
||||
wr3 = vec_splats(wre[3]);
|
||||
|
||||
x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
|
||||
x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
|
||||
|
||||
y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
|
||||
y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
|
||||
y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
|
||||
y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
|
||||
|
||||
ymulwi2 = vec_mul(y4, wi2);
|
||||
ymulwi3 = vec_mul(y5, wi3);
|
||||
x4 = vec_mul(x2, wr1);
|
||||
x5 = vec_mul(x3, wi1);
|
||||
y8 = vec_madd(y2, wr2, ymulwi2);
|
||||
y9 = vec_msub(y2, wr2, ymulwi2);
|
||||
x6 = vec_add(x4, x5);
|
||||
x7 = vec_sub(x4, x5);
|
||||
y13 = vec_madd(y3, wr3, ymulwi3);
|
||||
y14 = vec_msub(y3, wr3, ymulwi3);
|
||||
|
||||
x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
|
||||
y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
|
||||
y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
|
||||
|
||||
x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
|
||||
x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));
|
||||
|
||||
y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
|
||||
y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
|
||||
|
||||
x11 = vec_add(vz0, x9);
|
||||
x12 = vec_sub(vz0, x9);
|
||||
x13 = vec_add(vzo1, x10);
|
||||
x14 = vec_sub(vzo1, x10);
|
||||
|
||||
y18 = vec_add(vz0plus1, y16);
|
||||
y19 = vec_sub(vz0plus1, y16);
|
||||
y20 = vec_add(vzo1plus1, y17);
|
||||
y21 = vec_sub(vzo1plus1, y17);
|
||||
|
||||
x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
|
||||
x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
|
||||
y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
|
||||
y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
|
||||
|
||||
|
||||
vec_st(x11, 0, &(out[0]));
|
||||
vec_st(y18, 16, &(out[0]));
|
||||
vec_st(x15, i1, &(out[0]));
|
||||
vec_st(y22, i1+16, &(out[0]));
|
||||
vec_st(x12, i2, &(out[0]));
|
||||
vec_st(y19, i2+16, &(out[0]));
|
||||
vec_st(x16, i3, &(out[0]));
|
||||
vec_st(y23, i3+16, &(out[0]));
|
||||
|
||||
do {
|
||||
out += 8;
|
||||
wre += 4;
|
||||
wim -= 4;
|
||||
wr0 = vec_splats(wre[0]);
|
||||
wr1 = vec_splats(wre[1]);
|
||||
wi0 = vec_splats(wim[0]);
|
||||
wi1 = vec_splats(wim[-1]);
|
||||
|
||||
wr2 = vec_splats(wre[2]);
|
||||
wr3 = vec_splats(wre[3]);
|
||||
wi2 = vec_splats(wim[-2]);
|
||||
wi3 = vec_splats(wim[-3]);
|
||||
|
||||
vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i
|
||||
vzo2plus1 = vec_ld(i2+16, &(out[0]));
|
||||
vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i
|
||||
vzo3plus1 = vec_ld(i3+16, &(out[0]));
|
||||
vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i
|
||||
vz0plus1 = vec_ld(16, &(out[0]));
|
||||
vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i
|
||||
vzo1plus1 = vec_ld(i1+16, &(out[0]));
|
||||
|
||||
x0 = vec_add(vzo2, vzo3);
|
||||
x1 = vec_sub(vzo2, vzo3);
|
||||
|
||||
y0 = vec_add(vzo2plus1, vzo3plus1);
|
||||
y1 = vec_sub(vzo2plus1, vzo3plus1);
|
||||
|
||||
x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
|
||||
x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
|
||||
x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
|
||||
x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
|
||||
|
||||
y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
|
||||
y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
|
||||
xmulwi0 = vec_mul(x4, wi0);
|
||||
xmulwi1 = vec_mul(x5, wi1);
|
||||
|
||||
y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
|
||||
y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
|
||||
|
||||
x8 = vec_madd(x2, wr0, xmulwi0);
|
||||
x9 = vec_msub(x2, wr0, xmulwi0);
|
||||
ymulwi2 = vec_mul(y4, wi2);
|
||||
ymulwi3 = vec_mul(y5, wi3);
|
||||
|
||||
x13 = vec_madd(x3, wr1, xmulwi1);
|
||||
x14 = vec_msub(x3, wr1, xmulwi1);
|
||||
|
||||
y8 = vec_madd(y2, wr2, ymulwi2);
|
||||
y9 = vec_msub(y2, wr2, ymulwi2);
|
||||
y13 = vec_madd(y3, wr3, ymulwi3);
|
||||
y14 = vec_msub(y3, wr3, ymulwi3);
|
||||
|
||||
x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
|
||||
x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));
|
||||
|
||||
y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
|
||||
y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
|
||||
|
||||
x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
|
||||
x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));
|
||||
|
||||
y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
|
||||
y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
|
||||
|
||||
x18 = vec_add(vz0, x16);
|
||||
x19 = vec_sub(vz0, x16);
|
||||
x20 = vec_add(vzo1, x17);
|
||||
x21 = vec_sub(vzo1, x17);
|
||||
|
||||
y18 = vec_add(vz0plus1, y16);
|
||||
y19 = vec_sub(vz0plus1, y16);
|
||||
y20 = vec_add(vzo1plus1, y17);
|
||||
y21 = vec_sub(vzo1plus1, y17);
|
||||
|
||||
x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
|
||||
x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));
|
||||
|
||||
y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
|
||||
y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
|
||||
|
||||
vec_st(x18, 0, &(out[0]));
|
||||
vec_st(y18, 16, &(out[0]));
|
||||
vec_st(x22, i1, &(out[0]));
|
||||
vec_st(y22, i1+16, &(out[0]));
|
||||
vec_st(x19, i2, &(out[0]));
|
||||
vec_st(y19, i2+16, &(out[0]));
|
||||
vec_st(x23, i3, &(out[0]));
|
||||
vec_st(y23, i3+16, &(out[0]));
|
||||
} while (n-=2);
|
||||
}
|
||||
|
||||
inline static void fft2_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
FFTSample r1, i1;
|
||||
|
||||
r1 = z[0].re - z[1].re;
|
||||
z[0].re += z[1].re;
|
||||
z[1].re = r1;
|
||||
|
||||
i1 = z[0].im - z[1].im;
|
||||
z[0].im += z[1].im;
|
||||
z[1].im = i1;
|
||||
}
|
||||
|
||||
inline static void fft4_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
vec_f a, b, c, d;
|
||||
float* out= (float*)z;
|
||||
a = vec_ld(0, &(out[0]));
|
||||
b = vec_ld(byte_2complex, &(out[0]));
|
||||
|
||||
c = vec_perm(a, b, vcprm(0,1,s2,s1));
|
||||
d = vec_perm(a, b, vcprm(2,3,s0,s3));
|
||||
a = vec_add(c, d);
|
||||
b = vec_sub(c, d);
|
||||
|
||||
c = vec_perm(a, b, vcprm(0,1,s0,s1));
|
||||
d = vec_perm(a, b, vcprm(2,3,s3,s2));
|
||||
|
||||
a = vec_add(c, d);
|
||||
b = vec_sub(c, d);
|
||||
vec_st(a, 0, &(out[0]));
|
||||
vec_st(b, byte_2complex, &(out[0]));
|
||||
}
|
||||
|
||||
inline static void fft8_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
vec_f vz0, vz1, vz2, vz3;
|
||||
vec_f x0, x1, x2, x3;
|
||||
vec_f x4, x5, x6, x7;
|
||||
vec_f x8, x9, x10, x11;
|
||||
vec_f x12, x13, x14, x15;
|
||||
vec_f x16, x17, x18, x19;
|
||||
vec_f x20, x21, x22, x23;
|
||||
vec_f x24, x25, x26, x27;
|
||||
vec_f x28, x29, x30, x31;
|
||||
vec_f x32, x33, x34;
|
||||
|
||||
float* out= (float*)z;
|
||||
vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
|
||||
|
||||
vz0 = vec_ld(0, &(out[0]));
|
||||
vz1 = vec_ld(byte_2complex, &(out[0]));
|
||||
vz2 = vec_ld(byte_4complex, &(out[0]));
|
||||
vz3 = vec_ld(byte_6complex, &(out[0]));
|
||||
|
||||
x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
|
||||
x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
|
||||
x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
|
||||
x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));
|
||||
|
||||
x4 = vec_add(x0, x1);
|
||||
x5 = vec_sub(x0, x1);
|
||||
x6 = vec_add(x2, x3);
|
||||
x7 = vec_sub(x2, x3);
|
||||
|
||||
x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
|
||||
x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
|
||||
x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
|
||||
x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));
|
||||
|
||||
x12 = vec_add(x8, x9);
|
||||
x13 = vec_sub(x8, x9);
|
||||
x14 = vec_add(x10, x11);
|
||||
x15 = vec_sub(x10, x11);
|
||||
x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
|
||||
x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
|
||||
x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
|
||||
x19 = vec_add(x16, x18); // z0.r z2.r z0.i z2.i
|
||||
x20 = vec_sub(x16, x18); // z4.r z6.r z4.i z6.i
|
||||
|
||||
x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
|
||||
x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
|
||||
x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
|
||||
x24 = vec_add(x22, x23);
|
||||
x25 = vec_sub(x22, x23);
|
||||
x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);
|
||||
|
||||
x27 = vec_add(x21, x26); // z1.r z7.r z1.i z3.i
|
||||
x28 = vec_sub(x21, x26); //z5.r z3.r z5.i z7.i
|
||||
|
||||
x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r z0.i z1.r z1.i
|
||||
x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r z2.i z7.r z3.i
|
||||
x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r z4.i z5.r z5.i
|
||||
x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r z6.i z3.r z7.i
|
||||
x33 = vec_perm(x30, x32, vcprm(0,1,s2,3)); // z2.r z2.i z3.r z3.i
|
||||
x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r z6.i z7.r z7.i
|
||||
|
||||
vec_st(x29, 0, &(out[0]));
|
||||
vec_st(x33, byte_2complex, &(out[0]));
|
||||
vec_st(x31, byte_4complex, &(out[0]));
|
||||
vec_st(x34, byte_6complex, &(out[0]));
|
||||
}
|
||||
|
||||
inline static void fft16_vsx_interleave(FFTComplex *z)
|
||||
{
|
||||
float* out= (float*)z;
|
||||
vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
|
||||
vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
|
||||
vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
|
||||
vec_f vz0, vz1, vz2, vz3;
|
||||
vec_f vz4, vz5, vz6, vz7;
|
||||
vec_f x0, x1, x2, x3;
|
||||
vec_f x4, x5, x6, x7;
|
||||
vec_f x8, x9, x10, x11;
|
||||
vec_f x12, x13, x14, x15;
|
||||
vec_f x16, x17, x18, x19;
|
||||
vec_f x20, x21, x22, x23;
|
||||
vec_f x24, x25, x26, x27;
|
||||
vec_f x28, x29, x30, x31;
|
||||
vec_f x32, x33, x34, x35;
|
||||
vec_f x36, x37, x38, x39;
|
||||
vec_f x40, x41, x42, x43;
|
||||
vec_f x44, x45, x46, x47;
|
||||
vec_f x48, x49, x50, x51;
|
||||
vec_f x52, x53, x54, x55;
|
||||
vec_f x56, x57, x58, x59;
|
||||
vec_f x60, x61, x62, x63;
|
||||
vec_f x64, x65, x66, x67;
|
||||
vec_f x68, x69, x70, x71;
|
||||
vec_f x72, x73, x74, x75;
|
||||
vec_f x76, x77, x78, x79;
|
||||
vec_f x80, x81, x82, x83;
|
||||
vec_f x84, x85, x86;
|
||||
|
||||
vz0 = vec_ld(0, &(out[0]));
|
||||
vz1 = vec_ld(byte_2complex, &(out[0]));
|
||||
vz2 = vec_ld(byte_4complex, &(out[0]));
|
||||
vz3 = vec_ld(byte_6complex, &(out[0]));
|
||||
vz4 = vec_ld(byte_8complex, &(out[0]));
|
||||
vz5 = vec_ld(byte_10complex, &(out[0]));
|
||||
vz6 = vec_ld(byte_12complex, &(out[0]));
|
||||
vz7 = vec_ld(byte_14complex, &(out[0]));
|
||||
|
||||
x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
|
||||
x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
|
||||
x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
|
||||
x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
|
||||
|
||||
x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
|
||||
x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
|
||||
x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
|
||||
x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));
|
||||
|
||||
x8 = vec_add(x0, x1);
|
||||
x9 = vec_sub(x0, x1);
|
||||
x10 = vec_add(x2, x3);
|
||||
x11 = vec_sub(x2, x3);
|
||||
|
||||
x12 = vec_add(x4, x5);
|
||||
x13 = vec_sub(x4, x5);
|
||||
x14 = vec_add(x6, x7);
|
||||
x15 = vec_sub(x6, x7);
|
||||
|
||||
x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
|
||||
x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
|
||||
x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
|
||||
x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
|
||||
x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1));
|
||||
x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
|
||||
x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
|
||||
x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));
|
||||
|
||||
x24 = vec_add(x16, x17);
|
||||
x25 = vec_sub(x16, x17);
|
||||
x26 = vec_add(x18, x19);
|
||||
x27 = vec_sub(x18, x19);
|
||||
x28 = vec_add(x20, x21);
|
||||
x29 = vec_sub(x20, x21);
|
||||
x30 = vec_add(x22, x23);
|
||||
x31 = vec_sub(x22, x23);
|
||||
|
||||
x32 = vec_add(x24, x26);
|
||||
x33 = vec_sub(x24, x26);
|
||||
x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));
|
||||
|
||||
x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
|
||||
x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
|
||||
x37 = vec_add(x35, x36);
|
||||
x38 = vec_sub(x35, x36);
|
||||
x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));
|
||||
|
||||
x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
|
||||
x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2));
|
||||
x42 = vec_add(x40, x41);
|
||||
x43 = vec_sub(x40, x41);
|
||||
x44 = vec_mul(x42, vc0);
|
||||
x45 = vec_mul(x43, vc0);
|
||||
|
||||
x46 = vec_add(x34, x39); // z0.r z0.i z4.r z4.i
|
||||
x47 = vec_sub(x34, x39); // z8.r z8.i z12.r z12.i
|
||||
|
||||
x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
|
||||
x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
|
||||
x50 = vec_add(x48, x49);
|
||||
x51 = vec_sub(x48, x49);
|
||||
x52 = vec_mul(x50, vc1);
|
||||
x53 = vec_mul(x50, vc2);
|
||||
x54 = vec_mul(x51, vc1);
|
||||
x55 = vec_mul(x51, vc2);
|
||||
|
||||
x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
|
||||
x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
|
||||
x58 = vec_add(x56, x57);
|
||||
x59 = vec_sub(x56, x57);
|
||||
|
||||
x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
|
||||
x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
|
||||
x62 = vec_add(x52, x61);
|
||||
x63 = vec_sub(x52, x61);
|
||||
x64 = vec_add(x60, x53);
|
||||
x65 = vec_sub(x60, x53);
|
||||
x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
|
||||
x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));
|
||||
|
||||
x68 = vec_add(x58, x66); // z1.r z1.i z3.r z3.i
|
||||
x69 = vec_sub(x58, x66); // z9.r z9.i z11.r z11.i
|
||||
x70 = vec_add(x59, x67); // z5.r z5.i z15.r z15.i
|
||||
x71 = vec_sub(x59, x67); // z13.r z13.i z7.r z7.i
|
||||
|
||||
x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
|
||||
x73 = vec_add(x25, x72);
|
||||
x74 = vec_sub(x25, x72);
|
||||
x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
|
||||
x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
|
||||
x77 = vec_add(x75, x76); // z2.r z2.i z6.r z6.i
|
||||
x78 = vec_sub(x75, x76); // z10.r z10.i z14.r z14.i
|
||||
|
||||
x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r z0.i z1.r z1.i
|
||||
x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r z2.i z3.r z3.i
|
||||
x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r z4.i z5.r z5.i
|
||||
x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r z6.i z7.r z7.i
|
||||
vec_st(x79, 0, &(out[0]));
|
||||
vec_st(x80, byte_2complex, &(out[0]));
|
||||
vec_st(x81, byte_4complex, &(out[0]));
|
||||
vec_st(x82, byte_6complex, &(out[0]));
|
||||
x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r z8.i z9.r z9.i
|
||||
x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r z10.i z11.r z11.i
|
||||
x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r z12.i z13.r z13.i
|
||||
x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r z14.i z15.r z15.i
|
||||
vec_st(x83, byte_8complex, &(out[0]));
|
||||
vec_st(x84, byte_10complex, &(out[0]));
|
||||
vec_st(x85, byte_12complex, &(out[0]));
|
||||
vec_st(x86, byte_14complex, &(out[0]));
|
||||
}
|
||||
|
||||
inline static void fft4_vsx(FFTComplex *z)
|
||||
{
|
||||
vec_f a, b, c, d;
|
||||
float* out= (float*)z;
|
||||
a = vec_ld(0, &(out[0]));
|
||||
b = vec_ld(byte_2complex, &(out[0]));
|
||||
|
||||
c = vec_perm(a, b, vcprm(0,1,s2,s1));
|
||||
d = vec_perm(a, b, vcprm(2,3,s0,s3));
|
||||
a = vec_add(c, d);
|
||||
b = vec_sub(c, d);
|
||||
|
||||
c = vec_perm(a,b, vcprm(0,s0,1,s1));
|
||||
d = vec_perm(a, b, vcprm(2,s3,3,s2));
|
||||
|
||||
a = vec_add(c, d);
|
||||
b = vec_sub(c, d);
|
||||
|
||||
c = vec_perm(a, b, vcprm(0,1,s0,s1));
|
||||
d = vec_perm(a, b, vcprm(2,3,s2,s3));
|
||||
|
||||
vec_st(c, 0, &(out[0]));
|
||||
vec_st(d, byte_2complex, &(out[0]));
|
||||
return;
|
||||
}
|
||||
|
||||
inline static void fft8_vsx(FFTComplex *z)
|
||||
{
|
||||
vec_f vz0, vz1, vz2, vz3;
|
||||
vec_f vz4, vz5, vz6, vz7, vz8;
|
||||
|
||||
float* out= (float*)z;
|
||||
vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
|
||||
vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
|
||||
vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
|
||||
|
||||
vz0 = vec_ld(0, &(out[0]));
|
||||
vz1 = vec_ld(byte_2complex, &(out[0]));
|
||||
vz2 = vec_ld(byte_4complex, &(out[0]));
|
||||
vz3 = vec_ld(byte_6complex, &(out[0]));
|
||||
|
||||
vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
|
||||
vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
|
||||
vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
|
||||
vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
|
||||
|
||||
vz2 = vec_add(vz6, vz7);
|
||||
vz3 = vec_sub(vz6, vz7);
|
||||
vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
|
||||
|
||||
vz0 = vec_add(vz4, vz5);
|
||||
vz1 = vec_sub(vz4, vz5);
|
||||
|
||||
vz3 = vec_madd(vz3, vc1, vc0);
|
||||
vz3 = vec_madd(vz8, vc2, vz3);
|
||||
|
||||
vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
|
||||
vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
|
||||
vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
|
||||
vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
|
||||
|
||||
vz0 = vec_add(vz4, vz5);
|
||||
vz1 = vec_sub(vz4, vz5);
|
||||
vz2 = vec_add(vz6, vz7);
|
||||
vz3 = vec_sub(vz6, vz7);
|
||||
|
||||
vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
|
||||
vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
|
||||
vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
|
||||
vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
|
||||
|
||||
|
||||
vz2 = vec_sub(vz4, vz6);
|
||||
vz3 = vec_sub(vz5, vz7);
|
||||
|
||||
vz0 = vec_add(vz4, vz6);
|
||||
vz1 = vec_add(vz5, vz7);
|
||||
|
||||
vec_st(vz0, 0, &(out[0]));
|
||||
vec_st(vz1, byte_2complex, &(out[0]));
|
||||
vec_st(vz2, byte_4complex, &(out[0]));
|
||||
vec_st(vz3, byte_6complex, &(out[0]));
|
||||
return;
|
||||
}
|
||||
|
||||
inline static void fft16_vsx(FFTComplex *z)
|
||||
{
|
||||
float* out= (float*)z;
|
||||
vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
|
||||
vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
|
||||
vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
|
||||
vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
|
||||
vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
|
||||
vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};
|
||||
|
||||
vec_f vz0, vz1, vz2, vz3;
|
||||
vec_f vz4, vz5, vz6, vz7;
|
||||
vec_f vz8, vz9, vz10, vz11;
|
||||
vec_f vz12, vz13;
|
||||
|
||||
vz0 = vec_ld(byte_8complex, &(out[0]));
|
||||
vz1 = vec_ld(byte_10complex, &(out[0]));
|
||||
vz2 = vec_ld(byte_12complex, &(out[0]));
|
||||
vz3 = vec_ld(byte_14complex, &(out[0]));
|
||||
|
||||
vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
|
||||
vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
|
||||
vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
|
||||
vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));
|
||||
|
||||
vz0 = vec_add(vz4, vz5);
|
||||
vz1= vec_sub(vz4, vz5);
|
||||
vz2 = vec_add(vz6, vz7);
|
||||
vz3 = vec_sub(vz6, vz7);
|
||||
|
||||
vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
|
||||
vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
|
||||
vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
|
||||
vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));
|
||||
|
||||
vz0 = vec_add(vz4, vz5);
|
||||
vz1 = vec_sub(vz4, vz5);
|
||||
vz2 = vec_add(vz6, vz7);
|
||||
vz3 = vec_sub(vz6, vz7);
|
||||
|
||||
vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
|
||||
vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
|
||||
|
||||
vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
|
||||
vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
|
||||
|
||||
vz0 = vec_ld(0, &(out[0]));
|
||||
vz1 = vec_ld(byte_2complex, &(out[0]));
|
||||
vz2 = vec_ld(byte_4complex, &(out[0]));
|
||||
vz3 = vec_ld(byte_6complex, &(out[0]));
|
||||
vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
|
||||
vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
|
||||
vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
|
||||
vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
|
||||
|
||||
vz2 = vec_add(vz10, vz11);
|
||||
vz3 = vec_sub(vz10, vz11);
|
||||
vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
|
||||
vz0 = vec_add(vz8, vz9);
|
||||
vz1 = vec_sub(vz8, vz9);
|
||||
|
||||
vz3 = vec_madd(vz3, vc1, vc0);
|
||||
vz3 = vec_madd(vz12, vc2, vz3);
|
||||
vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
|
||||
vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
|
||||
vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
|
||||
vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
|
||||
|
||||
vz0 = vec_add(vz8, vz9);
|
||||
vz1 = vec_sub(vz8, vz9);
|
||||
vz2 = vec_add(vz10, vz11);
|
||||
vz3 = vec_sub(vz10, vz11);
|
||||
|
||||
vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
|
||||
vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
|
||||
vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
|
||||
vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
|
||||
|
||||
vz2 = vec_sub(vz8, vz10);
|
||||
vz3 = vec_sub(vz9, vz11);
|
||||
vz0 = vec_add(vz8, vz10);
|
||||
vz1 = vec_add(vz9, vz11);
|
||||
|
||||
vz8 = vec_madd(vz4, vc3, vc0);
|
||||
vz9 = vec_madd(vz5, vc3, vc0);
|
||||
vz10 = vec_madd(vz6, vc3, vc0);
|
||||
vz11 = vec_madd(vz7, vc3, vc0);
|
||||
|
||||
vz8 = vec_madd(vz5, vc4, vz8);
|
||||
vz9 = vec_madd(vz4, vc5, vz9);
|
||||
vz10 = vec_madd(vz7, vc5, vz10);
|
||||
vz11 = vec_madd(vz6, vc4, vz11);
|
||||
|
||||
vz12 = vec_sub(vz10, vz8);
|
||||
vz10 = vec_add(vz10, vz8);
|
||||
|
||||
vz13 = vec_sub(vz9, vz11);
|
||||
vz11 = vec_add(vz9, vz11);
|
||||
|
||||
vz4 = vec_sub(vz0, vz10);
|
||||
vz0 = vec_add(vz0, vz10);
|
||||
|
||||
vz7= vec_sub(vz3, vz12);
|
||||
vz3= vec_add(vz3, vz12);
|
||||
|
||||
vz5 = vec_sub(vz1, vz11);
|
||||
vz1 = vec_add(vz1, vz11);
|
||||
|
||||
vz6 = vec_sub(vz2, vz13);
|
||||
vz2 = vec_add(vz2, vz13);
|
||||
|
||||
vec_st(vz0, 0, &(out[0]));
|
||||
vec_st(vz1, byte_2complex, &(out[0]));
|
||||
vec_st(vz2, byte_4complex, &(out[0]));
|
||||
vec_st(vz3, byte_6complex, &(out[0]));
|
||||
vec_st(vz4, byte_8complex, &(out[0]));
|
||||
vec_st(vz5, byte_10complex, &(out[0]));
|
||||
vec_st(vz6, byte_12complex, &(out[0]));
|
||||
vec_st(vz7, byte_14complex, &(out[0]));
|
||||
return;
|
||||
|
||||
}
|
||||
inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n)
|
||||
{
|
||||
int o1 = n<<1;
|
||||
int o2 = n<<2;
|
||||
int o3 = o1+o2;
|
||||
int i1, i2, i3;
|
||||
FFTSample* out = (FFTSample*)z;
|
||||
const FFTSample *wim = wre+o1;
|
||||
vec_f v0, v1, v2, v3;
|
||||
vec_f v4, v5, v6, v7;
|
||||
vec_f v8, v9, v10, v11;
|
||||
vec_f v12, v13;
|
||||
|
||||
n = n-2;
|
||||
i1 = o1*sizeof(FFTComplex);
|
||||
i2 = o2*sizeof(FFTComplex);
|
||||
i3 = o3*sizeof(FFTComplex);
|
||||
|
||||
v8 = vec_ld(0, &(wre[0]));
|
||||
v10 = vec_ld(0, &(wim[0]));
|
||||
v9 = vec_ld(0, &(wim[-4]));
|
||||
v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
|
||||
|
||||
v4 = vec_ld(i2, &(out[0]));
|
||||
v5 = vec_ld(i2+16, &(out[0]));
|
||||
v6 = vec_ld(i3, &(out[0]));
|
||||
v7 = vec_ld(i3+16, &(out[0]));
|
||||
v10 = vec_mul(v4, v8); // r2*wre
|
||||
v11 = vec_mul(v5, v8); // i2*wre
|
||||
v12 = vec_mul(v6, v8); // r3*wre
|
||||
v13 = vec_mul(v7, v8); // i3*wre
|
||||
|
||||
v0 = vec_ld(0, &(out[0])); // r0
|
||||
v3 = vec_ld(i1+16, &(out[0])); // i1
|
||||
v10 = vec_madd(v5, v9, v10); // r2*wim
|
||||
v11 = vec_nmsub(v4, v9, v11); // i2*wim
|
||||
v12 = vec_nmsub(v7, v9, v12); // r3*wim
|
||||
v13 = vec_madd(v6, v9, v13); // i3*wim
|
||||
|
||||
v1 = vec_ld(16, &(out[0])); // i0
|
||||
v2 = vec_ld(i1, &(out[0])); // r1
|
||||
v8 = vec_sub(v12, v10);
|
||||
v12 = vec_add(v12, v10);
|
||||
v9 = vec_sub(v11, v13);
|
||||
v13 = vec_add(v11, v13);
|
||||
v4 = vec_sub(v0, v12);
|
||||
v0 = vec_add(v0, v12);
|
||||
v7 = vec_sub(v3, v8);
|
||||
v3 = vec_add(v3, v8);
|
||||
|
||||
vec_st(v0, 0, &(out[0])); // r0
|
||||
vec_st(v3, i1+16, &(out[0])); // i1
|
||||
vec_st(v4, i2, &(out[0])); // r2
|
||||
vec_st(v7, i3+16, &(out[0]));// i3
|
||||
|
||||
v5 = vec_sub(v1, v13);
|
||||
v1 = vec_add(v1, v13);
|
||||
v6 = vec_sub(v2, v9);
|
||||
v2 = vec_add(v2, v9);
|
||||
|
||||
vec_st(v1, 16, &(out[0])); // i0
|
||||
vec_st(v2, i1, &(out[0])); // r1
|
||||
vec_st(v5, i2+16, &(out[0])); // i2
|
||||
vec_st(v6, i3, &(out[0])); // r3
|
||||
|
||||
do {
|
||||
out += 8;
|
||||
wre += 4;
|
||||
wim -= 4;
|
||||
|
||||
v8 = vec_ld(0, &(wre[0]));
|
||||
v10 = vec_ld(0, &(wim[0]));
|
||||
v9 = vec_ld(0, &(wim[-4]));
|
||||
v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
|
||||
|
||||
v4 = vec_ld(i2, &(out[0])); // r2
|
||||
v5 = vec_ld(i2+16, &(out[0])); // i2
|
||||
v6 = vec_ld(i3, &(out[0])); // r3
|
||||
v7 = vec_ld(i3+16, &(out[0]));// i3
|
||||
v10 = vec_mul(v4, v8); // r2*wre
|
||||
v11 = vec_mul(v5, v8); // i2*wre
|
||||
v12 = vec_mul(v6, v8); // r3*wre
|
||||
v13 = vec_mul(v7, v8); // i3*wre
|
||||
|
||||
v0 = vec_ld(0, &(out[0])); // r0
|
||||
v3 = vec_ld(i1+16, &(out[0])); // i1
|
||||
v10 = vec_madd(v5, v9, v10); // r2*wim
|
||||
v11 = vec_nmsub(v4, v9, v11); // i2*wim
|
||||
v12 = vec_nmsub(v7, v9, v12); // r3*wim
|
||||
v13 = vec_madd(v6, v9, v13); // i3*wim
|
||||
|
||||
v1 = vec_ld(16, &(out[0])); // i0
|
||||
v2 = vec_ld(i1, &(out[0])); // r1
|
||||
v8 = vec_sub(v12, v10);
|
||||
v12 = vec_add(v12, v10);
|
||||
v9 = vec_sub(v11, v13);
|
||||
v13 = vec_add(v11, v13);
|
||||
v4 = vec_sub(v0, v12);
|
||||
v0 = vec_add(v0, v12);
|
||||
v7 = vec_sub(v3, v8);
|
||||
v3 = vec_add(v3, v8);
|
||||
|
||||
vec_st(v0, 0, &(out[0])); // r0
|
||||
vec_st(v3, i1+16, &(out[0])); // i1
|
||||
vec_st(v4, i2, &(out[0])); // r2
|
||||
vec_st(v7, i3+16, &(out[0])); // i3
|
||||
|
||||
v5 = vec_sub(v1, v13);
|
||||
v1 = vec_add(v1, v13);
|
||||
v6 = vec_sub(v2, v9);
|
||||
v2 = vec_add(v2, v9);
|
||||
|
||||
vec_st(v1, 16, &(out[0])); // i0
|
||||
vec_st(v2, i1, &(out[0])); // r1
|
||||
vec_st(v5, i2+16, &(out[0])); // i2
|
||||
vec_st(v6, i3, &(out[0])); // r3
|
||||
} while (n-=2);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* AVCODEC_PPC_FFT_VSX_H */
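A caller that previously set up an FFTContext and invoked its fft_calc pointer (as the VSX entry points above do) can be ported to libavutil's AVTXContext. The sketch below is illustrative only; the helper name, the unity scale value and the zero flags are assumptions, not part of this change.

#include "libavutil/tx.h"

/* Illustrative only: forward complex FFT of 1 << nbits points via AVTXContext. */
static int fft_with_tx(AVComplexFloat *out, AVComplexFloat *in, int nbits)
{
    AVTXContext *ctx   = NULL;
    av_tx_fn     tx    = NULL;
    float        scale = 1.0f;                     /* assumed: unity scaling */
    int ret = av_tx_init(&ctx, &tx, AV_TX_FLOAT_FFT, 0 /* forward */,
                         1 << nbits, &scale, 0);
    if (ret < 0)
        return ret;
    tx(ctx, out, in, sizeof(AVComplexFloat));      /* stride is in bytes */
    av_tx_uninit(&ctx);
    return 0;
}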
@ -1,120 +0,0 @@
/*
 * (I)RDFT transforms
 * Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <stdlib.h>
#include <math.h>
#include "libavutil/error.h"
#include "libavutil/mathematics.h"
#include "rdft.h"

/**
 * @file
 * (Inverse) Real Discrete Fourier Transforms.
 */

/** Map one real FFT into two parallel real even and odd FFTs. Then interleave
 * the two real FFTs into one complex FFT. Unmangle the results.
 * ref: http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM
 */
static void rdft_calc_c(RDFTContext *s, FFTSample *data)
{
    int i, i1, i2;
    FFTComplex ev, od, odsum;
    const int n = 1 << s->nbits;
    const float k1 = 0.5;
    const float k2 = 0.5 - s->inverse;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;

    if (!s->inverse) {
        s->fft.fft_permute(&s->fft, (FFTComplex*)data);
        s->fft.fft_calc(&s->fft, (FFTComplex*)data);
    }
    /* i=0 is a special case because of packing, the DC term is real, so we
       are going to throw the N/2 term (also real) in with it. */
    ev.re = data[0];
    data[0] = ev.re+data[1];
    data[1] = ev.re-data[1];

#define RDFT_UNMANGLE(sign0, sign1)                                        \
    for (i = 1; i < (n>>2); i++) {                                         \
        i1 = 2*i;                                                          \
        i2 = n-i1;                                                         \
        /* Separate even and odd FFTs */                                   \
        ev.re = k1*(data[i1  ]+data[i2  ]);                                \
        od.im = k2*(data[i2  ]-data[i1  ]);                                \
        ev.im = k1*(data[i1+1]-data[i2+1]);                                \
        od.re = k2*(data[i1+1]+data[i2+1]);                                \
        /* Apply twiddle factors to the odd FFT and add to the even FFT */ \
        odsum.re = od.re*tcos[i] sign0 od.im*tsin[i];                      \
        odsum.im = od.im*tcos[i] sign1 od.re*tsin[i];                      \
        data[i1  ] = ev.re + odsum.re;                                     \
        data[i1+1] = ev.im + odsum.im;                                     \
        data[i2  ] = ev.re - odsum.re;                                     \
        data[i2+1] = odsum.im - ev.im;                                     \
    }

    if (s->negative_sin) {
        RDFT_UNMANGLE(+,-)
    } else {
        RDFT_UNMANGLE(-,+)
    }

    data[2*i+1] = s->sign_convention*data[2*i+1];
    if (s->inverse) {
        data[0] *= k1;
        data[1] *= k1;
        s->fft.fft_permute(&s->fft, (FFTComplex*)data);
        s->fft.fft_calc(&s->fft, (FFTComplex*)data);
    }
}

av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans)
{
    int n = 1 << nbits;
    int ret;

    s->nbits           = nbits;
    s->inverse         = trans == IDFT_C2R || trans == DFT_C2R;
    s->sign_convention = trans == IDFT_R2C || trans == DFT_C2R ? 1 : -1;
    s->negative_sin    = trans == DFT_C2R || trans == DFT_R2C;

    if (nbits < 4 || nbits > 16)
        return AVERROR(EINVAL);

    if ((ret = ff_fft_init(&s->fft, nbits-1, trans == IDFT_C2R || trans == IDFT_R2C)) < 0)
        return ret;

    ff_init_ff_cos_tabs(nbits);
    s->tcos = ff_cos_tabs[nbits];
    s->tsin = ff_cos_tabs[nbits] + (n >> 2);
    s->rdft_calc = rdft_calc_c;

#if ARCH_ARM
    ff_rdft_init_arm(s);
#endif

    return 0;
}

av_cold void ff_rdft_end(RDFTContext *s)
{
    ff_fft_end(&s->fft);
}
@ -1,52 +0,0 @@
/*
 * (I)RDFT transforms
 * Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#if !defined(AVCODEC_RDFT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT)
#define AVCODEC_RDFT_H

#include "config.h"
#include "fft.h"

struct RDFTContext {
    int nbits;
    int inverse;
    int sign_convention;

    /* pre/post rotation tables */
    const FFTSample *tcos;
    const FFTSample *tsin;
    int negative_sin;
    FFTContext fft;
    void (*rdft_calc)(struct RDFTContext *s, FFTSample *z);
};

/**
 * Set up a real FFT.
 * @param nbits           log2 of the length of the input array
 * @param trans           the type of transform
 */
int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans);
void ff_rdft_end(RDFTContext *s);

void ff_rdft_init_arm(RDFTContext *s);


#endif /* AVCODEC_RDFT_H */
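Code built on the RDFTContext declared here has the AV_TX_FLOAT_RDFT transform type as its nearest counterpart in libavutil. A minimal sketch under stated assumptions: the helper name, the unity scale and the packed layout of (1 << nbits) / 2 + 1 complex output bins are assumptions for illustration, not something this commit defines.

#include "libavutil/tx.h"

/* Illustrative only: forward real-to-complex DFT of 1 << nbits real samples. */
static int rdft_with_tx(AVComplexFloat *out, float *in, int nbits)
{
    AVTXContext *ctx   = NULL;
    av_tx_fn     tx    = NULL;
    float        scale = 1.0f;                 /* assumed scaling */
    int ret = av_tx_init(&ctx, &tx, AV_TX_FLOAT_RDFT, 0 /* forward (R2C) */,
                         1 << nbits, &scale, 0);
    if (ret < 0)
        return ret;
    /* out is assumed to hold (1 << nbits) / 2 + 1 complex bins. */
    tx(ctx, out, in, sizeof(float));
    av_tx_uninit(&ctx);
    return 0;
}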
3
libavcodec/tests/.gitignore
vendored
3
libavcodec/tests/.gitignore
vendored
@ -1,6 +1,5 @@
/av1_levels
/avcodec
/avfft
/avpacket
/bitstream_be
/bitstream_le
@ -8,8 +7,6 @@
/celp_math
/codec_desc
/dct
/fft
/fft-fixed32
/golomb
/h264_levels
/h265_levels
@ -1,25 +0,0 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * This test is similar to fft-fixed.c or fft-fixed32.c
 */

#define AVFFT 1
#define FFT_FLOAT 1
#include "fft.c"
@ -1,21 +0,0 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define FFT_FLOAT 0
#define AVFFT 0
#include "fft.c"
@ -1,683 +0,0 @@
|
||||
/*
|
||||
* (c) 2002 Fabrice Bellard
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/internal.h"
|
||||
|
||||
FF_DISABLE_DEPRECATION_WARNINGS
|
||||
|
||||
/**
|
||||
* @file
|
||||
* FFT and MDCT tests.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#ifndef AVFFT
|
||||
#define AVFFT 0
|
||||
#endif
|
||||
|
||||
#include <math.h>
|
||||
#if HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/error.h"
|
||||
#include "libavutil/lfg.h"
|
||||
#include "libavutil/log.h"
|
||||
#include "libavutil/mathematics.h"
|
||||
#include "libavutil/time.h"
|
||||
|
||||
#if AVFFT
|
||||
#include "libavcodec/avfft.h"
|
||||
#else
|
||||
#include "libavcodec/fft.h"
|
||||
#endif
|
||||
|
||||
#if FFT_FLOAT
|
||||
#include "libavcodec/dct.h"
|
||||
#include "libavcodec/rdft.h"
|
||||
#endif
|
||||
|
||||
/* reference fft */
|
||||
|
||||
#define MUL16(a, b) ((a) * (b))
|
||||
|
||||
#define CMAC(pre, pim, are, aim, bre, bim) \
|
||||
{ \
|
||||
pre += (MUL16(are, bre) - MUL16(aim, bim)); \
|
||||
pim += (MUL16(are, bim) + MUL16(bre, aim)); \
|
||||
}
|
||||
|
||||
#if FFT_FLOAT || AVFFT
|
||||
#define RANGE 1.0
|
||||
#define REF_SCALE(x, bits) (x)
|
||||
#define FMT "%10.6f"
|
||||
#else
|
||||
#define RANGE 8388608
|
||||
#define REF_SCALE(x, bits) (x)
|
||||
#define FMT "%6d"
|
||||
#endif
|
||||
|
||||
static struct {
|
||||
float re, im;
|
||||
} *exptab;
|
||||
|
||||
static int fft_ref_init(int nbits, int inverse)
|
||||
{
|
||||
int i, n = 1 << nbits;
|
||||
|
||||
exptab = av_malloc_array((n / 2), sizeof(*exptab));
|
||||
if (!exptab)
|
||||
return AVERROR(ENOMEM);
|
||||
|
||||
for (i = 0; i < (n / 2); i++) {
|
||||
double alpha = 2 * M_PI * (float) i / (float) n;
|
||||
double c1 = cos(alpha), s1 = sin(alpha);
|
||||
if (!inverse)
|
||||
s1 = -s1;
|
||||
exptab[i].re = c1;
|
||||
exptab[i].im = s1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void fft_ref(FFTComplex *tabr, FFTComplex *tab, int nbits)
|
||||
{
|
||||
int i, j;
|
||||
int n = 1 << nbits;
|
||||
int n2 = n >> 1;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
double tmp_re = 0, tmp_im = 0;
|
||||
FFTComplex *q = tab;
|
||||
for (j = 0; j < n; j++) {
|
||||
double s, c;
|
||||
int k = (i * j) & (n - 1);
|
||||
if (k >= n2) {
|
||||
c = -exptab[k - n2].re;
|
||||
s = -exptab[k - n2].im;
|
||||
} else {
|
||||
c = exptab[k].re;
|
||||
s = exptab[k].im;
|
||||
}
|
||||
CMAC(tmp_re, tmp_im, c, s, q->re, q->im);
|
||||
q++;
|
||||
}
|
||||
tabr[i].re = REF_SCALE(tmp_re, nbits);
|
||||
tabr[i].im = REF_SCALE(tmp_im, nbits);
|
||||
}
|
||||
}
|
||||
|
||||
#if CONFIG_MDCT
|
||||
static void imdct_ref(FFTSample *out, FFTSample *in, int nbits)
|
||||
{
|
||||
int i, k, n = 1 << nbits;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
double sum = 0;
|
||||
for (k = 0; k < n / 2; k++) {
|
||||
int a = (2 * i + 1 + (n / 2)) * (2 * k + 1);
|
||||
double f = cos(M_PI * a / (double) (2 * n));
|
||||
sum += f * in[k];
|
||||
}
|
||||
out[i] = REF_SCALE(-sum, nbits - 2);
|
||||
}
|
||||
}
|
||||
|
||||
/* NOTE: no normalisation by 1 / N is done */
|
||||
static void mdct_ref(FFTSample *output, FFTSample *input, int nbits)
|
||||
{
|
||||
int i, k, n = 1 << nbits;
|
||||
|
||||
/* do it by hand */
|
||||
for (k = 0; k < n / 2; k++) {
|
||||
double s = 0;
|
||||
for (i = 0; i < n; i++) {
|
||||
double a = (2 * M_PI * (2 * i + 1 + n / 2) * (2 * k + 1) / (4 * n));
|
||||
s += input[i] * cos(a);
|
||||
}
|
||||
output[k] = REF_SCALE(s, nbits - 1);
|
||||
}
|
||||
}
|
||||
#endif /* CONFIG_MDCT */
|
||||
|
||||
#if FFT_FLOAT
|
||||
#if CONFIG_DCT
|
||||
static void idct_ref(FFTSample *output, FFTSample *input, int nbits)
|
||||
{
|
||||
int i, k, n = 1 << nbits;
|
||||
|
||||
/* do it by hand */
|
||||
for (i = 0; i < n; i++) {
|
||||
double s = 0.5 * input[0];
|
||||
for (k = 1; k < n; k++) {
|
||||
double a = M_PI * k * (i + 0.5) / n;
|
||||
s += input[k] * cos(a);
|
||||
}
|
||||
output[i] = 2 * s / n;
|
||||
}
|
||||
}
|
||||
|
||||
static void dct_ref(FFTSample *output, FFTSample *input, int nbits)
|
||||
{
|
||||
int i, k, n = 1 << nbits;
|
||||
|
||||
/* do it by hand */
|
||||
for (k = 0; k < n; k++) {
|
||||
double s = 0;
|
||||
for (i = 0; i < n; i++) {
|
||||
double a = M_PI * k * (i + 0.5) / n;
|
||||
s += input[i] * cos(a);
|
||||
}
|
||||
output[k] = s;
|
||||
}
|
||||
}
|
||||
#endif /* CONFIG_DCT */
|
||||
#endif /* FFT_FLOAT */
|
||||
|
||||
static FFTSample frandom(AVLFG *prng)
|
||||
{
|
||||
return (int16_t) av_lfg_get(prng) / 32768.0 * RANGE;
|
||||
}
|
||||
|
||||
static int check_diff(FFTSample *tab1, FFTSample *tab2, int n, double scale)
|
||||
{
|
||||
int i, err = 0;
|
||||
double error = 0, max = 0;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
double e = fabs(tab1[i] - (tab2[i] / scale)) / RANGE;
|
||||
if (e >= 1e-3) {
|
||||
av_log(NULL, AV_LOG_ERROR, "ERROR %5d: "FMT" "FMT"\n",
|
||||
i, tab1[i], tab2[i]);
|
||||
err = 1;
|
||||
}
|
||||
error += e * e;
|
||||
if (e > max)
|
||||
max = e;
|
||||
}
|
||||
av_log(NULL, AV_LOG_INFO, "max:%f e:%g\n", max, sqrt(error / n));
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline void fft_init(FFTContext **s, int nbits, int inverse)
|
||||
{
|
||||
#if AVFFT
|
||||
*s = av_fft_init(nbits, inverse);
|
||||
#else
|
||||
ff_fft_init(*s, nbits, inverse);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if CONFIG_MDCT
|
||||
static inline void mdct_init(FFTContext **s, int nbits, int inverse, double scale)
|
||||
{
|
||||
#if AVFFT
|
||||
*s = av_mdct_init(nbits, inverse, scale);
|
||||
#else
|
||||
ff_mdct_init(*s, nbits, inverse, scale);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||
{
|
||||
#if AVFFT
|
||||
av_mdct_calc(s, output, input);
|
||||
#else
|
||||
s->mdct_calc(s, output, input);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void imdct_calc(struct FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||
{
|
||||
#if AVFFT
|
||||
av_imdct_calc(s, output, input);
|
||||
#else
|
||||
s->imdct_calc(s, output, input);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void fft_permute(FFTContext *s, FFTComplex *z)
|
||||
{
|
||||
#if AVFFT
|
||||
av_fft_permute(s, z);
|
||||
#else
|
||||
s->fft_permute(s, z);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void fft_calc(FFTContext *s, FFTComplex *z)
|
||||
{
|
||||
#if AVFFT
|
||||
av_fft_calc(s, z);
|
||||
#else
|
||||
s->fft_calc(s, z);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void mdct_end(FFTContext *s)
|
||||
{
|
||||
#if AVFFT
|
||||
av_mdct_end(s);
|
||||
#else
|
||||
ff_mdct_end(s);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void fft_end(FFTContext *s)
|
||||
{
|
||||
#if AVFFT
|
||||
av_fft_end(s);
|
||||
#else
|
||||
ff_fft_end(s);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if FFT_FLOAT
|
||||
static inline void rdft_init(RDFTContext **r, int nbits, enum RDFTransformType trans)
|
||||
{
|
||||
#if AVFFT
|
||||
*r = av_rdft_init(nbits, trans);
|
||||
#else
|
||||
ff_rdft_init(*r, nbits, trans);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void dct_init(DCTContext **d, int nbits, enum DCTTransformType trans)
|
||||
{
|
||||
#if AVFFT
|
||||
*d = av_dct_init(nbits, trans);
|
||||
#else
|
||||
ff_dct_init(*d, nbits, trans);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void rdft_calc(RDFTContext *r, FFTSample *tab)
|
||||
{
|
||||
#if AVFFT
|
||||
av_rdft_calc(r, tab);
|
||||
#else
|
||||
r->rdft_calc(r, tab);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void dct_calc(DCTContext *d, FFTSample *data)
|
||||
{
|
||||
#if AVFFT
|
||||
av_dct_calc(d, data);
|
||||
#else
|
||||
d->dct_calc(d, data);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void rdft_end(RDFTContext *r)
|
||||
{
|
||||
#if AVFFT
|
||||
av_rdft_end(r);
|
||||
#else
|
||||
ff_rdft_end(r);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void dct_end(DCTContext *d)
|
||||
{
|
||||
#if AVFFT
|
||||
av_dct_end(d);
|
||||
#else
|
||||
ff_dct_end(d);
|
||||
#endif
|
||||
}
|
||||
#endif /* FFT_FLOAT */
|
||||
|
||||
static void help(void)
|
||||
{
|
||||
av_log(NULL, AV_LOG_INFO,
|
||||
"usage: fft-test [-h] [-s] [-i] [-n b]\n"
|
||||
"-h print this help\n"
|
||||
"-s speed test\n"
|
||||
"-m (I)MDCT test\n"
|
||||
"-d (I)DCT test\n"
|
||||
"-r (I)RDFT test\n"
|
||||
"-i inverse transform test\n"
|
||||
"-n b set the transform size to 2^b\n"
|
||||
"-f x set scale factor for output data of (I)MDCT to x\n");
|
||||
}
|
||||
|
||||
enum tf_transform {
|
||||
TRANSFORM_FFT,
|
||||
TRANSFORM_MDCT,
|
||||
TRANSFORM_RDFT,
|
||||
TRANSFORM_DCT,
|
||||
};
|
||||
|
||||
#if !HAVE_GETOPT
|
||||
#include "compat/getopt.c"
|
||||
#endif
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
FFTComplex *tab, *tab1, *tab_ref;
|
||||
FFTSample *tab2;
|
||||
enum tf_transform transform = TRANSFORM_FFT;
|
||||
FFTContext *m, *s;
|
||||
#if FFT_FLOAT
|
||||
RDFTContext *r;
|
||||
DCTContext *d;
|
||||
#endif /* FFT_FLOAT */
|
||||
int it, i, err = 1;
|
||||
int do_speed = 0, do_inverse = 0;
|
||||
int fft_nbits = 9, fft_size;
|
||||
double scale = 1.0;
|
||||
AVLFG prng;
|
||||
|
||||
#if !AVFFT
|
||||
s = av_mallocz(sizeof(*s));
|
||||
m = av_mallocz(sizeof(*m));
|
||||
#endif
|
||||
|
||||
#if !AVFFT && FFT_FLOAT
|
||||
r = av_mallocz(sizeof(*r));
|
||||
d = av_mallocz(sizeof(*d));
|
||||
#endif
|
||||
|
||||
av_lfg_init(&prng, 1);
|
||||
|
||||
for (;;) {
|
||||
int c = getopt(argc, argv, "hsimrdn:f:c:");
|
||||
if (c == -1)
|
||||
break;
|
||||
switch (c) {
|
||||
case 'h':
|
||||
help();
|
||||
return 1;
|
||||
case 's':
|
||||
do_speed = 1;
|
||||
break;
|
||||
case 'i':
|
||||
do_inverse = 1;
|
||||
break;
|
||||
case 'm':
|
||||
transform = TRANSFORM_MDCT;
|
||||
break;
|
||||
case 'r':
|
||||
transform = TRANSFORM_RDFT;
|
||||
break;
|
||||
case 'd':
|
||||
transform = TRANSFORM_DCT;
|
||||
break;
|
||||
case 'n':
|
||||
fft_nbits = atoi(optarg);
|
||||
break;
|
||||
case 'f':
|
||||
scale = atof(optarg);
|
||||
break;
|
||||
case 'c':
|
||||
{
|
||||
unsigned cpuflags = av_get_cpu_flags();
|
||||
|
||||
if (av_parse_cpu_caps(&cpuflags, optarg) < 0)
|
||||
return 1;
|
||||
|
||||
av_force_cpu_flags(cpuflags);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fft_size = 1 << fft_nbits;
|
||||
tab = av_malloc_array(fft_size, sizeof(FFTComplex));
|
||||
tab1 = av_malloc_array(fft_size, sizeof(FFTComplex));
|
||||
tab_ref = av_malloc_array(fft_size, sizeof(FFTComplex));
|
||||
tab2 = av_malloc_array(fft_size, sizeof(FFTSample));
|
||||
|
||||
if (!(tab && tab1 && tab_ref && tab2))
|
||||
goto cleanup;
|
||||
|
||||
switch (transform) {
|
||||
#if CONFIG_MDCT
|
||||
case TRANSFORM_MDCT:
|
||||
av_log(NULL, AV_LOG_INFO, "Scale factor is set to %f\n", scale);
|
||||
if (do_inverse)
|
||||
av_log(NULL, AV_LOG_INFO, "IMDCT");
|
||||
else
|
||||
av_log(NULL, AV_LOG_INFO, "MDCT");
|
||||
mdct_init(&m, fft_nbits, do_inverse, scale);
|
||||
break;
|
||||
#endif /* CONFIG_MDCT */
|
||||
case TRANSFORM_FFT:
|
||||
if (do_inverse)
|
||||
av_log(NULL, AV_LOG_INFO, "IFFT");
|
||||
else
|
||||
av_log(NULL, AV_LOG_INFO, "FFT");
|
||||
fft_init(&s, fft_nbits, do_inverse);
|
||||
if ((err = fft_ref_init(fft_nbits, do_inverse)) < 0)
|
||||
goto cleanup;
|
||||
break;
|
||||
#if FFT_FLOAT
|
||||
# if CONFIG_RDFT
|
||||
case TRANSFORM_RDFT:
|
||||
if (do_inverse)
|
||||
av_log(NULL, AV_LOG_INFO, "IDFT_C2R");
|
||||
else
|
||||
av_log(NULL, AV_LOG_INFO, "DFT_R2C");
|
||||
rdft_init(&r, fft_nbits, do_inverse ? IDFT_C2R : DFT_R2C);
|
||||
if ((err = fft_ref_init(fft_nbits, do_inverse)) < 0)
|
||||
goto cleanup;
|
||||
break;
|
||||
# endif /* CONFIG_RDFT */
|
||||
# if CONFIG_DCT
|
||||
case TRANSFORM_DCT:
|
||||
if (do_inverse)
|
||||
av_log(NULL, AV_LOG_INFO, "DCT_III");
|
||||
else
|
||||
av_log(NULL, AV_LOG_INFO, "DCT_II");
|
||||
dct_init(&d, fft_nbits, do_inverse ? DCT_III : DCT_II);
|
||||
break;
|
||||
# endif /* CONFIG_DCT */
|
||||
#endif /* FFT_FLOAT */
|
||||
default:
|
||||
av_log(NULL, AV_LOG_ERROR, "Requested transform not supported\n");
|
||||
goto cleanup;
|
||||
}
|
||||
av_log(NULL, AV_LOG_INFO, " %d test\n", fft_size);
|
||||
|
||||
/* generate random data */
|
||||
|
||||
for (i = 0; i < fft_size; i++) {
|
||||
tab1[i].re = frandom(&prng);
|
||||
tab1[i].im = frandom(&prng);
|
||||
}
|
||||
|
||||
/* checking result */
|
||||
av_log(NULL, AV_LOG_INFO, "Checking...\n");
|
||||
|
||||
switch (transform) {
|
||||
#if CONFIG_MDCT
|
||||
case TRANSFORM_MDCT:
|
||||
if (do_inverse) {
|
||||
imdct_ref(&tab_ref->re, &tab1->re, fft_nbits);
|
||||
imdct_calc(m, tab2, &tab1->re);
|
||||
err = check_diff(&tab_ref->re, tab2, fft_size, scale);
|
||||
} else {
|
||||
mdct_ref(&tab_ref->re, &tab1->re, fft_nbits);
|
||||
mdct_calc(m, tab2, &tab1->re);
|
||||
err = check_diff(&tab_ref->re, tab2, fft_size / 2, scale);
|
||||
}
|
||||
break;
|
||||
#endif /* CONFIG_MDCT */
|
||||
case TRANSFORM_FFT:
|
||||
memcpy(tab, tab1, fft_size * sizeof(FFTComplex));
|
||||
fft_permute(s, tab);
|
||||
fft_calc(s, tab);
|
||||
|
||||
fft_ref(tab_ref, tab1, fft_nbits);
|
||||
err = check_diff(&tab_ref->re, &tab->re, fft_size * 2, 1.0);
|
||||
break;
|
||||
#if FFT_FLOAT
|
||||
#if CONFIG_RDFT
|
||||
case TRANSFORM_RDFT:
|
||||
{
|
||||
int fft_size_2 = fft_size >> 1;
|
||||
if (do_inverse) {
|
||||
tab1[0].im = 0;
|
||||
tab1[fft_size_2].im = 0;
|
||||
for (i = 1; i < fft_size_2; i++) {
|
||||
tab1[fft_size_2 + i].re = tab1[fft_size_2 - i].re;
|
||||
tab1[fft_size_2 + i].im = -tab1[fft_size_2 - i].im;
|
||||
}
|
||||
|
||||
memcpy(tab2, tab1, fft_size * sizeof(FFTSample));
|
||||
tab2[1] = tab1[fft_size_2].re;
|
||||
|
||||
rdft_calc(r, tab2);
|
||||
fft_ref(tab_ref, tab1, fft_nbits);
|
||||
for (i = 0; i < fft_size; i++) {
|
||||
tab[i].re = tab2[i];
|
||||
tab[i].im = 0;
|
||||
}
|
||||
err = check_diff(&tab_ref->re, &tab->re, fft_size * 2, 0.5);
|
||||
} else {
|
||||
for (i = 0; i < fft_size; i++) {
|
||||
tab2[i] = tab1[i].re;
|
||||
tab1[i].im = 0;
|
||||
}
|
||||
rdft_calc(r, tab2);
|
||||
fft_ref(tab_ref, tab1, fft_nbits);
|
||||
tab_ref[0].im = tab_ref[fft_size_2].re;
|
||||
err = check_diff(&tab_ref->re, tab2, fft_size, 1.0);
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif /* CONFIG_RDFT */
|
||||
#if CONFIG_DCT
|
||||
case TRANSFORM_DCT:
|
||||
memcpy(tab, tab1, fft_size * sizeof(FFTComplex));
|
||||
dct_calc(d, &tab->re);
|
||||
if (do_inverse)
|
||||
idct_ref(&tab_ref->re, &tab1->re, fft_nbits);
|
||||
else
|
||||
dct_ref(&tab_ref->re, &tab1->re, fft_nbits);
|
||||
err = check_diff(&tab_ref->re, &tab->re, fft_size, 1.0);
|
||||
break;
|
||||
#endif /* CONFIG_DCT */
|
||||
#endif /* FFT_FLOAT */
|
||||
}
|
||||
|
||||
/* do a speed test */
|
||||
|
||||
if (do_speed) {
|
||||
int64_t time_start, duration;
|
||||
int nb_its;
|
||||
|
||||
av_log(NULL, AV_LOG_INFO, "Speed test...\n");
|
||||
/* we measure during about 1 seconds */
|
||||
nb_its = 1;
for (;;) {
time_start = av_gettime_relative();
for (it = 0; it < nb_its; it++) {
switch (transform) {
#if CONFIG_MDCT
case TRANSFORM_MDCT:
if (do_inverse)
imdct_calc(m, &tab->re, &tab1->re);
else
mdct_calc(m, &tab->re, &tab1->re);
break;
#endif
case TRANSFORM_FFT:
memcpy(tab, tab1, fft_size * sizeof(FFTComplex));
fft_calc(s, tab);
break;
#if FFT_FLOAT
case TRANSFORM_RDFT:
memcpy(tab2, tab1, fft_size * sizeof(FFTSample));
rdft_calc(r, tab2);
break;
case TRANSFORM_DCT:
memcpy(tab2, tab1, fft_size * sizeof(FFTSample));
dct_calc(d, tab2);
break;
#endif /* FFT_FLOAT */
}
}
duration = av_gettime_relative() - time_start;
if (duration >= 1000000)
break;
nb_its *= 2;
}
av_log(NULL, AV_LOG_INFO,
"time: %0.1f us/transform [total time=%0.2f s its=%d]\n",
(double) duration / nb_its,
(double) duration / 1000000.0,
nb_its);
}

switch (transform) {
#if CONFIG_MDCT
case TRANSFORM_MDCT:
mdct_end(m);
break;
#endif /* CONFIG_MDCT */
case TRANSFORM_FFT:
fft_end(s);
break;
#if FFT_FLOAT
# if CONFIG_RDFT
case TRANSFORM_RDFT:
rdft_end(r);
break;
# endif /* CONFIG_RDFT */
# if CONFIG_DCT
case TRANSFORM_DCT:
dct_end(d);
break;
# endif /* CONFIG_DCT */
#endif /* FFT_FLOAT */
}

cleanup:
av_free(tab);
av_free(tab1);
av_free(tab2);
av_free(tab_ref);
av_free(exptab);

#if !AVFFT
av_free(s);
av_free(m);
#endif

#if !AVFFT && FFT_FLOAT
av_free(r);
av_free(d);
#endif

if (err)
printf("Error: %d.\n", err);

return !!err;
}

FF_ENABLE_DEPRECATION_WARNINGS
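(Editorial aside, not part of this commit: the transforms exercised by the removed test above are now created through libavutil's TX API. The sketch below is a minimal, hedged example of a forward single-precision FFT of the same 1 << fft_nbits points; it assumes the libavutil/tx.h interface as it exists after this migration, and the helper name run_tx_fft is invented for illustration.)

/* Minimal sketch (assumed AVTX API, not code from this commit): create and
 * run a forward float FFT with the TX code that replaces FFTContext. */
#include "libavutil/mem.h"
#include "libavutil/tx.h"

static int run_tx_fft(int fft_nbits)
{
    const int len = 1 << fft_nbits;
    float scale = 1.0f;                      /* scale factor; 1.0 leaves the output unscaled */
    AVTXContext *tx = NULL;
    av_tx_fn tx_fn = NULL;
    AVComplexFloat *in  = av_calloc(len, sizeof(*in));
    AVComplexFloat *out = av_calloc(len, sizeof(*out));
    int ret = (in && out) ? 0 : -1;

    /* inv = 0 selects the forward transform; AV_TX_FLOAT_MDCT, AV_TX_FLOAT_RDFT
     * and AV_TX_FLOAT_DCT cover the other transform types removed here. */
    if (!ret)
        ret = av_tx_init(&tx, &tx_fn, AV_TX_FLOAT_FFT, 0, len, &scale, 0);
    if (!ret)
        tx_fn(tx, out, in, sizeof(AVComplexFloat)); /* stride is given in bytes */

    av_tx_uninit(&tx);
    av_free(in);
    av_free(out);
    return ret;
}

Unlike the removed FFTContext path, no separate permute step is needed: av_tx works out-of-place and handles ordering internally.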
@ -5,11 +5,9 @@ OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o
OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o
OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o
OBJS-$(CONFIG_DCT) += x86/dct_init.o
OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \
x86/dirac_dwt_init.o
OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert_init.o
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
@ -98,8 +96,6 @@ X86ASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \
X86ASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o
X86ASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o
X86ASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o
X86ASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
X86ASM-OBJS-$(CONFIG_FFT) += x86/fft.o
X86ASM-OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert.o
X86ASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
X86ASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
@ -1,36 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"

void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);

av_cold void ff_dct_init_x86(DCTContext *s)
{
int cpu_flags = av_get_cpu_flags();

if (EXTERNAL_SSE2(cpu_flags))
s->dct32 = ff_dct32_float_sse2;
if (EXTERNAL_AVX_FAST(cpu_flags))
s->dct32 = ff_dct32_float_avx;
}
@ -1,838 +0,0 @@
|
||||
;******************************************************************************
|
||||
;* FFT transform with SSE/AVX optimizations
|
||||
;* Copyright (c) 2008 Loren Merritt
|
||||
;* Copyright (c) 2011 Vitor Sessak
|
||||
;*
|
||||
;* This algorithm (though not any of the implementation details) is
|
||||
;* based on libdjbfft by D. J. Bernstein.
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
; These functions are not individually interchangeable with the C versions.
|
||||
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
|
||||
; in blocks as convenient to the vector size.
|
||||
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
%if ARCH_X86_64
|
||||
%define pointer resq
|
||||
%else
|
||||
%define pointer resd
|
||||
%endif
|
||||
|
||||
struc FFTContext
|
||||
.nbits: resd 1
|
||||
.reverse: resd 1
|
||||
.revtab: pointer 1
|
||||
.tmpbuf: pointer 1
|
||||
.mdctsize: resd 1
|
||||
.mdctbits: resd 1
|
||||
.tcos: pointer 1
|
||||
.tsin: pointer 1
|
||||
.fftperm: pointer 1
|
||||
.fftcalc: pointer 1
|
||||
.imdctcalc:pointer 1
|
||||
.imdcthalf:pointer 1
|
||||
endstruc
|
||||
|
||||
SECTION_RODATA 32
|
||||
|
||||
%define M_SQRT1_2 0.70710678118654752440
|
||||
%define M_COS_PI_1_8 0.923879532511287
|
||||
%define M_COS_PI_3_8 0.38268343236509
|
||||
|
||||
ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
|
||||
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
|
||||
|
||||
ps_root2: times 8 dd M_SQRT1_2
|
||||
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
|
||||
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
|
||||
|
||||
perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
|
||||
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
|
||||
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
|
||||
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
|
||||
ps_m1p1: dd 1<<31, 0
|
||||
|
||||
cextern ps_neg
|
||||
|
||||
%assign i 16
|
||||
%rep 14
|
||||
cextern cos_ %+ i
|
||||
%assign i i<<1
|
||||
%endrep
|
||||
|
||||
%if ARCH_X86_64
|
||||
%define pointer dq
|
||||
%else
|
||||
%define pointer dd
|
||||
%endif
|
||||
|
||||
%macro IF0 1+
|
||||
%endmacro
|
||||
%macro IF1 1+
|
||||
%1
|
||||
%endmacro
|
||||
|
||||
SECTION .text
|
||||
|
||||
; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
|
||||
; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
|
||||
; %3, %4, %5 tmp
|
||||
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
|
||||
; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
|
||||
%macro T8_AVX 5
|
||||
vsubps %5, %1, %2 ; v = %1 - %2
|
||||
vaddps %3, %1, %2 ; w = %1 + %2
|
||||
vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
|
||||
vpermilps %2, %2, [perm1]
|
||||
vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
|
||||
vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
|
||||
vsubps %4, %5, %1 ; s = r - q
|
||||
vaddps %1, %5, %1 ; u = r + q
|
||||
vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
|
||||
vshufps %5, %4, %1, 0xbb
|
||||
vshufps %3, %4, %1, 0xee
|
||||
vperm2f128 %3, %3, %5, 0x13
|
||||
vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
|
||||
vshufps %2, %1, %4, 0xdd
|
||||
vshufps %1, %1, %4, 0x88
|
||||
vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
|
||||
vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
|
||||
vsubps %5, %1, %3
|
||||
vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
|
||||
vsubps %2, %4, %1 ; %2 = v - w
|
||||
vaddps %1, %4, %1 ; %1 = v + w
|
||||
%endmacro
|
||||
|
||||
; In SSE mode do one fft4 transform
|
||||
; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
|
||||
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
|
||||
;
|
||||
; In AVX mode do two fft4 transforms
|
||||
; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
|
||||
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
|
||||
%macro T4_SSE 3
|
||||
subps %3, %1, %2 ; {t3,t4,-t8,t7}
|
||||
addps %1, %1, %2 ; {t1,t2,t6,t5}
|
||||
xorps %3, %3, [ps_p1p1m1p1]
|
||||
shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
|
||||
shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
|
||||
subps %3, %1, %2 ; {r2,i2,r3,i3}
|
||||
addps %1, %1, %2 ; {r0,i0,r1,i1}
|
||||
shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
|
||||
shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
|
||||
%endmacro
|
||||
|
||||
; In SSE mode do one FFT8
|
||||
; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
|
||||
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7}
|
||||
;
|
||||
; In AVX mode do two FFT8
|
||||
; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
|
||||
; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
|
||||
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
|
||||
; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
|
||||
%macro T8_SSE 6
|
||||
addps %6, %3, %4 ; {t1,t2,t3,t4}
|
||||
subps %3, %3, %4 ; {r5,i5,r7,i7}
|
||||
shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
|
||||
mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
|
||||
mulps %4, %4, [ps_root2]
|
||||
addps %3, %3, %4 ; {t8,t7,ta,t9}
|
||||
shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
|
||||
shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
|
||||
subps %3, %6, %4 ; {t6,t5,tc,tb}
|
||||
addps %6, %6, %4 ; {t1,t2,t9,ta}
|
||||
shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
|
||||
shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
|
||||
subps %3, %1, %6 ; {r4,r5,r6,r7}
|
||||
addps %1, %1, %6 ; {r0,r1,r2,r3}
|
||||
subps %4, %2, %5 ; {i4,i5,i6,i7}
|
||||
addps %2, %2, %5 ; {i0,i1,i2,i3}
|
||||
%endmacro
|
||||
|
||||
%macro INTERL 5
|
||||
%if cpuflag(avx)
|
||||
vunpckhps %3, %2, %1
|
||||
vunpcklps %2, %2, %1
|
||||
vextractf128 %4(%5), %2, 0
|
||||
vextractf128 %4 %+ H(%5), %3, 0
|
||||
vextractf128 %4(%5 + 1), %2, 1
|
||||
vextractf128 %4 %+ H(%5 + 1), %3, 1
|
||||
%elif cpuflag(sse)
|
||||
mova %3, %2
|
||||
unpcklps %2, %1
|
||||
unpckhps %3, %1
|
||||
mova %4(%5), %2
|
||||
mova %4(%5+1), %3
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; scheduled for cpu-bound sizes
|
||||
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
|
||||
IF%1 mova m4, Z(4)
|
||||
IF%1 mova m5, Z(5)
|
||||
mova m0, %2 ; wre
|
||||
mova m1, %3 ; wim
|
||||
mulps m2, m4, m0 ; r2*wre
|
||||
IF%1 mova m6, Z2(6)
|
||||
mulps m3, m5, m1 ; i2*wim
|
||||
IF%1 mova m7, Z2(7)
|
||||
mulps m4, m4, m1 ; r2*wim
|
||||
mulps m5, m5, m0 ; i2*wre
|
||||
addps m2, m2, m3 ; r2*wre + i2*wim
|
||||
mulps m3, m1, m7 ; i3*wim
|
||||
subps m5, m5, m4 ; i2*wre - r2*wim
|
||||
mulps m1, m1, m6 ; r3*wim
|
||||
mulps m4, m0, m6 ; r3*wre
|
||||
mulps m0, m0, m7 ; i3*wre
|
||||
subps m4, m4, m3 ; r3*wre - i3*wim
|
||||
mova m3, Z(0)
|
||||
addps m0, m0, m1 ; i3*wre + r3*wim
|
||||
subps m1, m4, m2 ; t3
|
||||
addps m4, m4, m2 ; t5
|
||||
subps m3, m3, m4 ; r2
|
||||
addps m4, m4, Z(0) ; r0
|
||||
mova m6, Z(2)
|
||||
mova Z(4), m3
|
||||
mova Z(0), m4
|
||||
subps m3, m5, m0 ; t4
|
||||
subps m4, m6, m3 ; r3
|
||||
addps m3, m3, m6 ; r1
|
||||
mova Z2(6), m4
|
||||
mova Z(2), m3
|
||||
mova m2, Z(3)
|
||||
addps m3, m5, m0 ; t6
|
||||
subps m2, m2, m1 ; i3
|
||||
mova m7, Z(1)
|
||||
addps m1, m1, Z(3) ; i1
|
||||
mova Z2(7), m2
|
||||
mova Z(3), m1
|
||||
subps m4, m7, m3 ; i2
|
||||
addps m3, m3, m7 ; i0
|
||||
mova Z(5), m4
|
||||
mova Z(1), m3
|
||||
%endmacro
|
||||
|
||||
; scheduled to avoid store->load aliasing
|
||||
%macro PASS_BIG 1 ; (!interleave)
|
||||
mova m4, Z(4) ; r2
|
||||
mova m5, Z(5) ; i2
|
||||
mova m0, [wq] ; wre
|
||||
mova m1, [wq+o1q] ; wim
|
||||
mulps m2, m4, m0 ; r2*wre
|
||||
mova m6, Z2(6) ; r3
|
||||
mulps m3, m5, m1 ; i2*wim
|
||||
mova m7, Z2(7) ; i3
|
||||
mulps m4, m4, m1 ; r2*wim
|
||||
mulps m5, m5, m0 ; i2*wre
|
||||
addps m2, m2, m3 ; r2*wre + i2*wim
|
||||
mulps m3, m1, m7 ; i3*wim
|
||||
mulps m1, m1, m6 ; r3*wim
|
||||
subps m5, m5, m4 ; i2*wre - r2*wim
|
||||
mulps m4, m0, m6 ; r3*wre
|
||||
mulps m0, m0, m7 ; i3*wre
|
||||
subps m4, m4, m3 ; r3*wre - i3*wim
|
||||
mova m3, Z(0)
|
||||
addps m0, m0, m1 ; i3*wre + r3*wim
|
||||
subps m1, m4, m2 ; t3
|
||||
addps m4, m4, m2 ; t5
|
||||
subps m3, m3, m4 ; r2
|
||||
addps m4, m4, Z(0) ; r0
|
||||
mova m6, Z(2)
|
||||
mova Z(4), m3
|
||||
mova Z(0), m4
|
||||
subps m3, m5, m0 ; t4
|
||||
subps m4, m6, m3 ; r3
|
||||
addps m3, m3, m6 ; r1
|
||||
IF%1 mova Z2(6), m4
|
||||
IF%1 mova Z(2), m3
|
||||
mova m2, Z(3)
|
||||
addps m5, m5, m0 ; t6
|
||||
subps m2, m2, m1 ; i3
|
||||
mova m7, Z(1)
|
||||
addps m1, m1, Z(3) ; i1
|
||||
IF%1 mova Z2(7), m2
|
||||
IF%1 mova Z(3), m1
|
||||
subps m6, m7, m5 ; i2
|
||||
addps m5, m5, m7 ; i0
|
||||
IF%1 mova Z(5), m6
|
||||
IF%1 mova Z(1), m5
|
||||
%if %1==0
|
||||
INTERL m1, m3, m7, Z, 2
|
||||
INTERL m2, m4, m0, Z2, 6
|
||||
|
||||
mova m1, Z(0)
|
||||
mova m2, Z(4)
|
||||
|
||||
INTERL m5, m1, m3, Z, 0
|
||||
INTERL m6, m2, m7, Z, 4
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%define Z(x) [r0+mmsize*x]
|
||||
%define Z2(x) [r0+mmsize*x]
|
||||
%define ZH(x) [r0+mmsize*x+mmsize/2]
|
||||
|
||||
INIT_YMM avx
|
||||
|
||||
%if HAVE_AVX_EXTERNAL
|
||||
align 16
|
||||
fft8_avx:
|
||||
mova m0, Z(0)
|
||||
mova m1, Z(1)
|
||||
T8_AVX m0, m1, m2, m3, m4
|
||||
mova Z(0), m0
|
||||
mova Z(1), m1
|
||||
ret
|
||||
|
||||
|
||||
align 16
|
||||
fft16_avx:
|
||||
mova m2, Z(2)
|
||||
mova m3, Z(3)
|
||||
T4_SSE m2, m3, m7
|
||||
|
||||
mova m0, Z(0)
|
||||
mova m1, Z(1)
|
||||
T8_AVX m0, m1, m4, m5, m7
|
||||
|
||||
mova m4, [ps_cos16_1]
|
||||
mova m5, [ps_cos16_2]
|
||||
vmulps m6, m2, m4
|
||||
vmulps m7, m3, m5
|
||||
vaddps m7, m7, m6
|
||||
vmulps m2, m2, m5
|
||||
vmulps m3, m3, m4
|
||||
vsubps m3, m3, m2
|
||||
vblendps m2, m7, m3, 0xf0
|
||||
vperm2f128 m3, m7, m3, 0x21
|
||||
vaddps m4, m2, m3
|
||||
vsubps m2, m3, m2
|
||||
vperm2f128 m2, m2, m2, 0x01
|
||||
vsubps m3, m1, m2
|
||||
vaddps m1, m1, m2
|
||||
vsubps m5, m0, m4
|
||||
vaddps m0, m0, m4
|
||||
vextractf128 Z(0), m0, 0
|
||||
vextractf128 ZH(0), m1, 0
|
||||
vextractf128 Z(1), m0, 1
|
||||
vextractf128 ZH(1), m1, 1
|
||||
vextractf128 Z(2), m5, 0
|
||||
vextractf128 ZH(2), m3, 0
|
||||
vextractf128 Z(3), m5, 1
|
||||
vextractf128 ZH(3), m3, 1
|
||||
ret
|
||||
|
||||
align 16
|
||||
fft32_avx:
|
||||
call fft16_avx
|
||||
|
||||
mova m0, Z(4)
|
||||
mova m1, Z(5)
|
||||
|
||||
T4_SSE m0, m1, m4
|
||||
|
||||
mova m2, Z(6)
|
||||
mova m3, Z(7)
|
||||
|
||||
T8_SSE m0, m1, m2, m3, m4, m6
|
||||
; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
|
||||
; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
|
||||
|
||||
vperm2f128 m4, m0, m2, 0x20
|
||||
vperm2f128 m5, m1, m3, 0x20
|
||||
vperm2f128 m6, m0, m2, 0x31
|
||||
vperm2f128 m7, m1, m3, 0x31
|
||||
|
||||
PASS_SMALL 0, [cos_32], [cos_32+32]
|
||||
|
||||
ret
|
||||
|
||||
fft32_interleave_avx:
|
||||
call fft32_avx
|
||||
mov r2d, 32
|
||||
.deint_loop:
|
||||
mova m2, Z(0)
|
||||
mova m3, Z(1)
|
||||
vunpcklps m0, m2, m3
|
||||
vunpckhps m1, m2, m3
|
||||
vextractf128 Z(0), m0, 0
|
||||
vextractf128 ZH(0), m1, 0
|
||||
vextractf128 Z(1), m0, 1
|
||||
vextractf128 ZH(1), m1, 1
|
||||
add r0, mmsize*2
|
||||
sub r2d, mmsize/4
|
||||
jg .deint_loop
|
||||
ret
|
||||
|
||||
%endif
|
||||
|
||||
INIT_XMM sse
|
||||
|
||||
align 16
|
||||
fft4_avx:
|
||||
fft4_sse:
|
||||
mova m0, Z(0)
|
||||
mova m1, Z(1)
|
||||
T4_SSE m0, m1, m2
|
||||
mova Z(0), m0
|
||||
mova Z(1), m1
|
||||
ret
|
||||
|
||||
align 16
|
||||
fft8_sse:
|
||||
mova m0, Z(0)
|
||||
mova m1, Z(1)
|
||||
T4_SSE m0, m1, m2
|
||||
mova m2, Z(2)
|
||||
mova m3, Z(3)
|
||||
T8_SSE m0, m1, m2, m3, m4, m5
|
||||
mova Z(0), m0
|
||||
mova Z(1), m1
|
||||
mova Z(2), m2
|
||||
mova Z(3), m3
|
||||
ret
|
||||
|
||||
align 16
|
||||
fft16_sse:
|
||||
mova m0, Z(0)
|
||||
mova m1, Z(1)
|
||||
T4_SSE m0, m1, m2
|
||||
mova m2, Z(2)
|
||||
mova m3, Z(3)
|
||||
T8_SSE m0, m1, m2, m3, m4, m5
|
||||
mova m4, Z(4)
|
||||
mova m5, Z(5)
|
||||
mova Z(0), m0
|
||||
mova Z(1), m1
|
||||
mova Z(2), m2
|
||||
mova Z(3), m3
|
||||
T4_SSE m4, m5, m6
|
||||
mova m6, Z2(6)
|
||||
mova m7, Z2(7)
|
||||
T4_SSE m6, m7, m0
|
||||
PASS_SMALL 0, [cos_16], [cos_16+16]
|
||||
ret
|
||||
|
||||
|
||||
%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
|
||||
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
|
||||
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
|
||||
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
|
||||
|
||||
%macro DECL_PASS 2+ ; name, payload
|
||||
align 16
|
||||
%1:
|
||||
DEFINE_ARGS zc, w, n, o1, o3
|
||||
lea o3q, [nq*3]
|
||||
lea o1q, [nq*8]
|
||||
shl o3q, 4
|
||||
.loop:
|
||||
%2
|
||||
add zcq, mmsize*2
|
||||
add wq, mmsize
|
||||
sub nd, mmsize/8
|
||||
jg .loop
|
||||
rep ret
|
||||
%endmacro
|
||||
|
||||
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
|
||||
lea r2, [dispatch_tab%1]
|
||||
mov r2, [r2 + (%2q-2)*gprsize]
|
||||
%ifdef PIC
|
||||
lea r3, [$$]
|
||||
add r2, r3
|
||||
%endif
|
||||
call r2
|
||||
%endmacro ; FFT_DISPATCH
|
||||
|
||||
INIT_YMM avx
|
||||
|
||||
%if HAVE_AVX_EXTERNAL
|
||||
DECL_PASS pass_avx, PASS_BIG 1
|
||||
DECL_PASS pass_interleave_avx, PASS_BIG 0
|
||||
|
||||
cglobal fft_calc, 2,5,8
|
||||
mov r3d, [r0 + FFTContext.nbits]
|
||||
mov r0, r1
|
||||
mov r1, r3
|
||||
FFT_DISPATCH _interleave %+ SUFFIX, r1
|
||||
RET
|
||||
|
||||
%endif
|
||||
|
||||
INIT_XMM sse
|
||||
|
||||
DECL_PASS pass_sse, PASS_BIG 1
|
||||
DECL_PASS pass_interleave_sse, PASS_BIG 0
|
||||
|
||||
INIT_XMM sse
|
||||
cglobal fft_calc, 2,5,8
|
||||
mov r3d, [r0 + FFTContext.nbits]
|
||||
PUSH r1
|
||||
PUSH r3
|
||||
mov r0, r1
|
||||
mov r1, r3
|
||||
FFT_DISPATCH _interleave %+ SUFFIX, r1
|
||||
POP rcx
|
||||
POP r4
|
||||
cmp rcx, 3+(mmsize/16)
|
||||
jg .end
|
||||
mov r2, -1
|
||||
add rcx, 3
|
||||
shl r2, cl
|
||||
sub r4, r2
|
||||
.loop:
|
||||
movaps xmm0, [r4 + r2]
|
||||
movaps xmm1, xmm0
|
||||
unpcklps xmm0, [r4 + r2 + 16]
|
||||
unpckhps xmm1, [r4 + r2 + 16]
|
||||
movaps [r4 + r2], xmm0
|
||||
movaps [r4 + r2 + 16], xmm1
|
||||
add r2, mmsize*2
|
||||
jl .loop
|
||||
.end:
|
||||
RET
|
||||
|
||||
cglobal fft_permute, 2,7,1
|
||||
mov r4, [r0 + FFTContext.revtab]
|
||||
mov r5, [r0 + FFTContext.tmpbuf]
|
||||
mov ecx, [r0 + FFTContext.nbits]
|
||||
mov r2, 1
|
||||
shl r2, cl
|
||||
xor r0, r0
|
||||
%if ARCH_X86_32
|
||||
mov r1, r1m
|
||||
%endif
|
||||
.loop:
|
||||
movaps xmm0, [r1 + 8*r0]
|
||||
movzx r6, word [r4 + 2*r0]
|
||||
movzx r3, word [r4 + 2*r0 + 2]
|
||||
movlps [r5 + 8*r6], xmm0
|
||||
movhps [r5 + 8*r3], xmm0
|
||||
add r0, 2
|
||||
cmp r0, r2
|
||||
jl .loop
|
||||
shl r2, 3
|
||||
add r1, r2
|
||||
add r5, r2
|
||||
neg r2
|
||||
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
|
||||
.loopcopy:
|
||||
movaps xmm0, [r5 + r2]
|
||||
movaps xmm1, [r5 + r2 + 16]
|
||||
movaps [r1 + r2], xmm0
|
||||
movaps [r1 + r2 + 16], xmm1
|
||||
add r2, 32
|
||||
jl .loopcopy
|
||||
RET
|
||||
|
||||
INIT_XMM sse
|
||||
cglobal imdct_calc, 3,5,3
|
||||
mov r3d, [r0 + FFTContext.mdctsize]
|
||||
mov r4, [r0 + FFTContext.imdcthalf]
|
||||
add r1, r3
|
||||
PUSH r3
|
||||
PUSH r1
|
||||
%if ARCH_X86_32
|
||||
push r2
|
||||
push r1
|
||||
push r0
|
||||
%else
|
||||
sub rsp, 8+32*WIN64 ; allocate win64 shadow space
|
||||
%endif
|
||||
call r4
|
||||
%if ARCH_X86_32
|
||||
add esp, 12
|
||||
%else
|
||||
add rsp, 8+32*WIN64
|
||||
%endif
|
||||
POP r1
|
||||
POP r3
|
||||
lea r0, [r1 + 2*r3]
|
||||
mov r2, r3
|
||||
sub r3, mmsize
|
||||
neg r2
|
||||
mova m2, [ps_neg]
|
||||
.loop:
|
||||
mova m0, [r1 + r3]
|
||||
mova m1, [r0 + r2]
|
||||
shufps m0, m0, 0x1b
|
||||
shufps m1, m1, 0x1b
|
||||
xorps m0, m2
|
||||
mova [r0 + r3], m1
|
||||
mova [r1 + r2], m0
|
||||
sub r3, mmsize
|
||||
add r2, mmsize
|
||||
jl .loop
|
||||
RET
|
||||
|
||||
%ifdef PIC
|
||||
%define SECTION_REL - $$
|
||||
%else
|
||||
%define SECTION_REL
|
||||
%endif
|
||||
|
||||
%macro DECL_FFT 1-2 ; nbits, suffix
|
||||
%ifidn %0, 1
|
||||
%xdefine fullsuffix SUFFIX
|
||||
%else
|
||||
%xdefine fullsuffix %2 %+ SUFFIX
|
||||
%endif
|
||||
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
|
||||
%if %1>=5
|
||||
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
|
||||
%endif
|
||||
%if %1>=6
|
||||
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
|
||||
%endif
|
||||
|
||||
%assign n 1<<%1
|
||||
%rep 18-%1
|
||||
%assign n2 n/2
|
||||
%assign n4 n/4
|
||||
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
|
||||
|
||||
align 16
|
||||
fft %+ n %+ fullsuffix:
|
||||
call fft %+ n2 %+ SUFFIX
|
||||
add r0, n*4 - (n&(-2<<%1))
|
||||
call fft %+ n4 %+ SUFFIX
|
||||
add r0, n*2 - (n2&(-2<<%1))
|
||||
call fft %+ n4 %+ SUFFIX
|
||||
sub r0, n*6 + (n2&(-2<<%1))
|
||||
lea r1, [cos_ %+ n]
|
||||
mov r2d, n4/2
|
||||
jmp pass %+ fullsuffix
|
||||
|
||||
%assign n n*2
|
||||
%endrep
|
||||
%undef n
|
||||
|
||||
align 8
|
||||
dispatch_tab %+ fullsuffix: pointer list_of_fft
|
||||
%endmacro ; DECL_FFT
|
||||
|
||||
%if HAVE_AVX_EXTERNAL
|
||||
INIT_YMM avx
|
||||
DECL_FFT 6
|
||||
DECL_FFT 6, _interleave
|
||||
%endif
|
||||
INIT_XMM sse
|
||||
DECL_FFT 5
|
||||
DECL_FFT 5, _interleave
|
||||
|
||||
INIT_XMM sse
|
||||
%undef mulps
|
||||
%undef addps
|
||||
%undef subps
|
||||
%undef unpcklps
|
||||
%undef unpckhps
|
||||
|
||||
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
|
||||
movaps xmm0, [%3+%2*4]
|
||||
movaps xmm1, [%3+%1*4-0x10]
|
||||
movaps xmm2, xmm0
|
||||
shufps xmm0, xmm1, 0x88
|
||||
shufps xmm1, xmm2, 0x77
|
||||
movlps xmm4, [%4+%2*2]
|
||||
movlps xmm5, [%5+%2*2+0x0]
|
||||
movhps xmm4, [%4+%1*2-0x8]
|
||||
movhps xmm5, [%5+%1*2-0x8]
|
||||
movaps xmm2, xmm0
|
||||
movaps xmm3, xmm1
|
||||
mulps xmm0, xmm5
|
||||
mulps xmm1, xmm4
|
||||
mulps xmm2, xmm4
|
||||
mulps xmm3, xmm5
|
||||
subps xmm1, xmm0
|
||||
addps xmm2, xmm3
|
||||
movaps xmm0, xmm1
|
||||
unpcklps xmm1, xmm2
|
||||
unpckhps xmm0, xmm2
|
||||
%endmacro
|
||||
|
||||
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
|
||||
mulps m6, %3, [%5+%1]
|
||||
mulps m7, %2, [%5+%1]
|
||||
mulps %2, %2, [%6+%1]
|
||||
mulps %3, %3, [%6+%1]
|
||||
subps %2, %2, m6
|
||||
addps %3, %3, m7
|
||||
%endmacro
|
||||
|
||||
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
|
||||
.post:
|
||||
%if cpuflag(avx)
|
||||
vmovaps ymm1, [%3+%1*2]
|
||||
vmovaps ymm0, [%3+%1*2+0x20]
|
||||
vmovaps ymm3, [%3+%2*2]
|
||||
vmovaps ymm2, [%3+%2*2+0x20]
|
||||
|
||||
CMUL %1, ymm0, ymm1, %3, %4, %5
|
||||
CMUL %2, ymm2, ymm3, %3, %4, %5
|
||||
vshufps ymm1, ymm1, ymm1, 0x1b
|
||||
vshufps ymm3, ymm3, ymm3, 0x1b
|
||||
vperm2f128 ymm1, ymm1, ymm1, 0x01
|
||||
vperm2f128 ymm3, ymm3, ymm3, 0x01
|
||||
vunpcklps ymm6, ymm2, ymm1
|
||||
vunpckhps ymm4, ymm2, ymm1
|
||||
vunpcklps ymm7, ymm0, ymm3
|
||||
vunpckhps ymm5, ymm0, ymm3
|
||||
|
||||
vextractf128 [%3+%1*2], ymm7, 0
|
||||
vextractf128 [%3+%1*2+0x10], ymm5, 0
|
||||
vextractf128 [%3+%1*2+0x20], ymm7, 1
|
||||
vextractf128 [%3+%1*2+0x30], ymm5, 1
|
||||
|
||||
vextractf128 [%3+%2*2], ymm6, 0
|
||||
vextractf128 [%3+%2*2+0x10], ymm4, 0
|
||||
vextractf128 [%3+%2*2+0x20], ymm6, 1
|
||||
vextractf128 [%3+%2*2+0x30], ymm4, 1
|
||||
sub %2, 0x20
|
||||
add %1, 0x20
|
||||
jl .post
|
||||
%else
|
||||
movaps xmm1, [%3+%1*2]
|
||||
movaps xmm0, [%3+%1*2+0x10]
|
||||
CMUL %1, xmm0, xmm1, %3, %4, %5
|
||||
movaps xmm5, [%3+%2*2]
|
||||
movaps xmm4, [%3+%2*2+0x10]
|
||||
CMUL %2, xmm4, xmm5, %3, %4, %5
|
||||
shufps xmm1, xmm1, 0x1b
|
||||
shufps xmm5, xmm5, 0x1b
|
||||
movaps xmm6, xmm4
|
||||
unpckhps xmm4, xmm1
|
||||
unpcklps xmm6, xmm1
|
||||
movaps xmm2, xmm0
|
||||
unpcklps xmm0, xmm5
|
||||
unpckhps xmm2, xmm5
|
||||
movaps [%3+%2*2], xmm6
|
||||
movaps [%3+%2*2+0x10], xmm4
|
||||
movaps [%3+%1*2], xmm0
|
||||
movaps [%3+%1*2+0x10], xmm2
|
||||
sub %2, 0x10
|
||||
add %1, 0x10
|
||||
jl .post
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro DECL_IMDCT 0
|
||||
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
|
||||
%if ARCH_X86_64
|
||||
%define rrevtab r7
|
||||
%define rtcos r8
|
||||
%define rtsin r9
|
||||
%else
|
||||
%define rrevtab r6
|
||||
%define rtsin r6
|
||||
%define rtcos r5
|
||||
%endif
|
||||
mov r3d, [r0+FFTContext.mdctsize]
|
||||
add r2, r3
|
||||
shr r3, 1
|
||||
mov rtcos, [r0+FFTContext.tcos]
|
||||
mov rtsin, [r0+FFTContext.tsin]
|
||||
add rtcos, r3
|
||||
add rtsin, r3
|
||||
%if ARCH_X86_64 == 0
|
||||
push rtcos
|
||||
push rtsin
|
||||
%endif
|
||||
shr r3, 1
|
||||
mov rrevtab, [r0+FFTContext.revtab]
|
||||
add rrevtab, r3
|
||||
%if ARCH_X86_64 == 0
|
||||
push rrevtab
|
||||
%endif
|
||||
|
||||
sub r3, 4
|
||||
%if ARCH_X86_64
|
||||
xor r4, r4
|
||||
sub r4, r3
|
||||
%endif
|
||||
.pre:
|
||||
%if ARCH_X86_64 == 0
|
||||
;unspill
|
||||
xor r4, r4
|
||||
sub r4, r3
|
||||
mov rtcos, [esp+8]
|
||||
mov rtsin, [esp+4]
|
||||
%endif
|
||||
|
||||
PREROTATER r4, r3, r2, rtcos, rtsin
|
||||
%if ARCH_X86_64
|
||||
movzx r5, word [rrevtab+r4-4]
|
||||
movzx r6, word [rrevtab+r4-2]
|
||||
movzx r10, word [rrevtab+r3]
|
||||
movzx r11, word [rrevtab+r3+2]
|
||||
movlps [r1+r5 *8], xmm0
|
||||
movhps [r1+r6 *8], xmm0
|
||||
movlps [r1+r10*8], xmm1
|
||||
movhps [r1+r11*8], xmm1
|
||||
add r4, 4
|
||||
%else
|
||||
mov r6, [esp]
|
||||
movzx r5, word [r6+r4-4]
|
||||
movzx r4, word [r6+r4-2]
|
||||
movlps [r1+r5*8], xmm0
|
||||
movhps [r1+r4*8], xmm0
|
||||
movzx r5, word [r6+r3]
|
||||
movzx r4, word [r6+r3+2]
|
||||
movlps [r1+r5*8], xmm1
|
||||
movhps [r1+r4*8], xmm1
|
||||
%endif
|
||||
sub r3, 4
|
||||
jns .pre
|
||||
|
||||
mov r5, r0
|
||||
mov r6, r1
|
||||
mov r0, r1
|
||||
mov r1d, [r5+FFTContext.nbits]
|
||||
|
||||
FFT_DISPATCH SUFFIX, r1
|
||||
|
||||
mov r0d, [r5+FFTContext.mdctsize]
|
||||
add r6, r0
|
||||
shr r0, 1
|
||||
%if ARCH_X86_64 == 0
|
||||
%define rtcos r2
|
||||
%define rtsin r3
|
||||
mov rtcos, [esp+8]
|
||||
mov rtsin, [esp+4]
|
||||
%endif
|
||||
neg r0
|
||||
mov r1, -mmsize
|
||||
sub r1, r0
|
||||
POSROTATESHUF r0, r1, r6, rtcos, rtsin
|
||||
%if ARCH_X86_64 == 0
|
||||
add esp, 12
|
||||
%endif
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
DECL_IMDCT
|
||||
|
||||
INIT_YMM avx
|
||||
|
||||
%if HAVE_AVX_EXTERNAL
|
||||
DECL_IMDCT
|
||||
%endif
|
@ -1,32 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#ifndef AVCODEC_X86_FFT_H
#define AVCODEC_X86_FFT_H

#include "libavcodec/fft.h"

void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);

void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);

#endif /* AVCODEC_X86_FFT_H */
@ -1,47 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"

#include "fft.h"

av_cold void ff_fft_init_x86(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();

if (s->nbits > 16)
return;

if (EXTERNAL_SSE(cpu_flags)) {
s->imdct_calc = ff_imdct_calc_sse;
s->imdct_half = ff_imdct_half_sse;
s->fft_permute = ff_fft_permute_sse;
s->fft_calc = ff_fft_calc_sse;
s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
}

if (EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) {
s->imdct_half = ff_imdct_half_avx;
s->fft_calc = ff_fft_calc_avx;
s->fft_permutation = FF_FFT_PERM_AVX;
}
}
@ -181,7 +181,6 @@ include $(SRC_PATH)/tests/fate/enc_external.mak
# Must be included after lavf-video.mak
include $(SRC_PATH)/tests/fate/ffmpeg.mak
include $(SRC_PATH)/tests/fate/ffprobe.mak
include $(SRC_PATH)/tests/fate/fft.mak
include $(SRC_PATH)/tests/fate/fifo-muxer.mak
include $(SRC_PATH)/tests/fate/filter-audio.mak
# Must be included after vcodec.mak
@ -1,83 +0,0 @@
define DEF_FFT
FATE_DCT-$(CONFIG_DCT) += fate-dct1d-$(1) fate-idct1d-$(1)
FATE_FFT-$(CONFIG_FFT) += fate-fft-$(1) fate-ifft-$(1)
FATE_MDCT-$(CONFIG_MDCT) += fate-mdct-$(1) fate-imdct-$(1)
FATE_RDFT-$(CONFIG_RDFT) += fate-rdft-$(1) fate-irdft-$(1)

fate-fft-$(N): ARGS = -n$(1)
fate-ifft-$(N): ARGS = -n$(1) -i
fate-mdct-$(N): ARGS = -n$(1) -m
fate-imdct-$(N): ARGS = -n$(1) -m -i
fate-rdft-$(N): ARGS = -n$(1) -r
fate-irdft-$(N): ARGS = -n$(1) -r -i
fate-dct1d-$(N): ARGS = -n$(1) -d
fate-idct1d-$(N): ARGS = -n$(1) -d -i
endef

$(foreach N, 4 5 6 7 8 9 10 11 12, $(eval $(call DEF_FFT,$(N))))

fate-dct-float: $(FATE_DCT-yes)
fate-fft-float: $(FATE_FFT-yes)
fate-mdct-float: $(FATE_MDCT-yes)
fate-rdft-float: $(FATE_RDFT-yes)

FATE_FFT_ALL = $(FATE_DCT-yes) $(FATE_FFT-yes) $(FATE_MDCT-yes) $(FATE_RDFT-yes)

$(FATE_FFT_ALL): libavcodec/tests/fft$(EXESUF)
$(FATE_FFT_ALL): CMD = run libavcodec/tests/fft$(EXESUF) $(CPUFLAGS:%=-c%) $(ARGS)

$(FATE_FFT_ALL): CMP = null

define DEF_FFT_FIXED32
FATE_FFT_FIXED32 += fate-fft-fixed32-$(1) fate-ifft-fixed32-$(1) \
fate-mdct-fixed32-$(1) fate-imdct-fixed32-$(1)

fate-fft-fixed32-$(1): ARGS = -n$(1)
fate-ifft-fixed32-$(1): ARGS = -n$(1) -i
#fate-mdct-fixed32-$(1): ARGS = -n$(1) -m
fate-imdct-fixed32-$(1): ARGS = -n$(1) -m -i
endef

$(foreach N, 4 5 6 7 8 9 10 11 12, $(eval $(call DEF_FFT_FIXED32,$(N))))

fate-fft-fixed32: $(FATE_FFT_FIXED32)
$(FATE_FFT_FIXED32): libavcodec/tests/fft-fixed32$(EXESUF)
$(FATE_FFT_FIXED32): CMD = run libavcodec/tests/fft-fixed32$(EXESUF) $(CPUFLAGS:%=-c%) $(ARGS)
$(FATE_FFT_FIXED32): CMP = null

define DEF_AV_FFT
FATE_AV_DCT-$(CONFIG_DCT) += fate-av-dct1d-$(1) fate-av-idct1d-$(1)
FATE_AV_FFT-$(CONFIG_FFT) += fate-av-fft-$(1) fate-av-ifft-$(1)
FATE_AV_MDCT-$(CONFIG_MDCT) += fate-av-mdct-$(1) fate-av-imdct-$(1)
FATE_AV_RDFT-$(CONFIG_RDFT) += fate-av-rdft-$(1) fate-av-irdft-$(1)

fate-av-fft-$(N): ARGS = -n$(1)
fate-av-ifft-$(N): ARGS = -n$(1) -i
fate-av-mdct-$(N): ARGS = -n$(1) -m
fate-av-imdct-$(N): ARGS = -n$(1) -m -i
fate-av-rdft-$(N): ARGS = -n$(1) -r
fate-av-irdft-$(N): ARGS = -n$(1) -r -i
fate-av-dct1d-$(N): ARGS = -n$(1) -d
fate-av-idct1d-$(N): ARGS = -n$(1) -d -i
endef

$(foreach N, 4 5 6 7 8 9 10 11 12, $(eval $(call DEF_AV_FFT,$(N))))

fate-av-dct-float: $(FATE_AV_DCT-yes)
fate-av-fft-float: $(FATE_AV_FFT-yes)
fate-av-mdct-float: $(FATE_AV_MDCT-yes)
fate-av-rdft-float: $(FATE_AV_RDFT-yes)

FATE_AV_FFT_ALL = $(FATE_AV_DCT-yes) $(FATE_AV_FFT-yes) $(FATE_AV_MDCT-yes) $(FATE_AV_RDFT-yes)

$(FATE_AV_FFT_ALL): libavcodec/tests/avfft$(EXESUF)
$(FATE_AV_FFT_ALL): CMD = run libavcodec/tests/avfft$(EXESUF) $(CPUFLAGS:%=-c%) $(ARGS)
$(FATE_AV_FFT_ALL): CMP = null

fate-dct: fate-dct-float
fate-fft: fate-fft-float fate-fft-fixed32
fate-mdct: fate-mdct-float
fate-rdft: fate-rdft-float

FATE-$(call ALLYES, AVCODEC FFT MDCT) += $(FATE_FFT_ALL) $(FATE_FFT_FIXED32) $(FATE_AV_FFT_ALL)
fate-fft-all: $(FATE_FFT_ALL) $(FATE_FFT_FIXED32) $(FATE_AV_FFT_ALL)