FFmpeg/libavcodec/ppc/fft_vsx.c

/*
 * FFT  transform, optimized with VSX built-in functions
 * Copyright (c) 2014 Rong Yan
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */


#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"
#include "fft_vsx.h"

#if HAVE_VSX

/*
 * Size-32 stage of the interleaved VSX FFT.
 * Calls fft16_vsx_interleave() on z and fft8_vsx_interleave() on z+16 and
 * z+24, then pass_vsx_interleave() over z with the ff_cos_32 table.
 */
static void fft32_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 16;
    FFTComplex *const fourth_quarter = z + 24;

    fft16_vsx_interleave(z);
    fft8_vsx_interleave(third_quarter);
    fft8_vsx_interleave(fourth_quarter);
    pass_vsx_interleave(z, ff_cos_32, 4);
}

/*
 * Size-64 stage of the interleaved VSX FFT.
 * Calls fft32_vsx_interleave() on z and fft16_vsx_interleave() on z+32 and
 * z+48, then pass_vsx_interleave() over z with the ff_cos_64 table.
 */
static void fft64_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 32;
    FFTComplex *const fourth_quarter = z + 48;

    fft32_vsx_interleave(z);
    fft16_vsx_interleave(third_quarter);
    fft16_vsx_interleave(fourth_quarter);
    pass_vsx_interleave(z, ff_cos_64, 8);
}
/*
 * Size-128 stage of the interleaved VSX FFT.
 * Calls fft64_vsx_interleave() on z and fft32_vsx_interleave() on z+64 and
 * z+96, then pass_vsx_interleave() over z with the ff_cos_128 table.
 */
static void fft128_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 64;
    FFTComplex *const fourth_quarter = z + 96;

    fft64_vsx_interleave(z);
    fft32_vsx_interleave(third_quarter);
    fft32_vsx_interleave(fourth_quarter);
    pass_vsx_interleave(z, ff_cos_128, 16);
}
/*
 * Size-256 stage of the interleaved VSX FFT.
 * Calls fft128_vsx_interleave() on z and fft64_vsx_interleave() on z+128
 * and z+192, then pass_vsx_interleave() over z with the ff_cos_256 table.
 */
static void fft256_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 128;
    FFTComplex *const fourth_quarter = z + 192;

    fft128_vsx_interleave(z);
    fft64_vsx_interleave(third_quarter);
    fft64_vsx_interleave(fourth_quarter);
    pass_vsx_interleave(z, ff_cos_256, 32);
}
/*
 * Size-512 stage of the interleaved VSX FFT.
 * Calls fft256_vsx_interleave() on z and fft128_vsx_interleave() on z+256
 * and z+384, then pass_vsx_interleave() over z with the ff_cos_512 table.
 */
static void fft512_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 256;
    FFTComplex *const fourth_quarter = z + 384;

    fft256_vsx_interleave(z);
    fft128_vsx_interleave(third_quarter);
    fft128_vsx_interleave(fourth_quarter);
    pass_vsx_interleave(z, ff_cos_512, 64);
}
/*
 * Size-1024 stage of the interleaved VSX FFT.
 * Calls fft512_vsx_interleave() on z and fft256_vsx_interleave() on z+512
 * and z+768, then pass_vsx_interleave() over z with the ff_cos_1024 table.
 */
static void fft1024_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 512;
    FFTComplex *const fourth_quarter = z + 768;

    fft512_vsx_interleave(z);
    fft256_vsx_interleave(third_quarter);
    fft256_vsx_interleave(fourth_quarter);
    pass_vsx_interleave(z, ff_cos_1024, 128);
}
/*
 * Size-2048 stage of the interleaved VSX FFT.
 * Calls fft1024_vsx_interleave() on z and fft512_vsx_interleave() on z+1024
 * and z+1536, then pass_vsx_interleave() over z with the ff_cos_2048 table.
 */
static void fft2048_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 1024;
    FFTComplex *const fourth_quarter = z + 1536;

    fft1024_vsx_interleave(z);
    fft512_vsx_interleave(third_quarter);
    fft512_vsx_interleave(fourth_quarter);
    pass_vsx_interleave(z, ff_cos_2048, 256);
}
/*
 * Size-4096 stage of the interleaved VSX FFT.
 * Calls fft2048_vsx_interleave() on z and fft1024_vsx_interleave() on
 * z+2048 and z+3072, then pass_vsx_interleave() over z with the
 * ff_cos_4096 table.
 */
static void fft4096_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 2048;
    FFTComplex *const fourth_quarter = z + 3072;

    fft2048_vsx_interleave(z);
    fft1024_vsx_interleave(third_quarter);
    fft1024_vsx_interleave(fourth_quarter);
    pass_vsx_interleave(z, ff_cos_4096, 512);
}
/*
 * Size-8192 stage of the interleaved VSX FFT.
 * Calls fft4096_vsx_interleave() on z and fft2048_vsx_interleave() on
 * z+4096 and z+6144, then pass_vsx_interleave() over z with the
 * ff_cos_8192 table.
 */
static void fft8192_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 4096;
    FFTComplex *const fourth_quarter = z + 6144;

    fft4096_vsx_interleave(z);
    fft2048_vsx_interleave(third_quarter);
    fft2048_vsx_interleave(fourth_quarter);
    pass_vsx_interleave(z, ff_cos_8192, 1024);
}
/*
 * Size-16384 stage of the interleaved VSX FFT.
 * Calls fft8192_vsx_interleave() on z and fft4096_vsx_interleave() on
 * z+8192 and z+12288, then pass_vsx_interleave() over z with the
 * ff_cos_16384 table.
 */
static void fft16384_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 8192;
    FFTComplex *const fourth_quarter = z + 12288;

    fft8192_vsx_interleave(z);
    fft4096_vsx_interleave(third_quarter);
    fft4096_vsx_interleave(fourth_quarter);
    pass_vsx_interleave(z, ff_cos_16384, 2048);
}
/*
 * Size-32768 stage of the interleaved VSX FFT.
 * Calls fft16384_vsx_interleave() on z and fft8192_vsx_interleave() on
 * z+16384 and z+24576, then pass_vsx_interleave() over z with the
 * ff_cos_32768 table.
 */
static void fft32768_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 16384;
    FFTComplex *const fourth_quarter = z + 24576;

    fft16384_vsx_interleave(z);
    fft8192_vsx_interleave(third_quarter);
    fft8192_vsx_interleave(fourth_quarter);
    pass_vsx_interleave(z, ff_cos_32768, 4096);
}
/*
 * Size-65536 stage of the interleaved VSX FFT.
 * Calls fft32768_vsx_interleave() on z and fft16384_vsx_interleave() on
 * z+32768 and z+49152, then pass_vsx_interleave() over z with the
 * ff_cos_65536 table.
 */
static void fft65536_vsx_interleave(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 32768;
    FFTComplex *const fourth_quarter = z + 49152;

    fft32768_vsx_interleave(z);
    fft16384_vsx_interleave(third_quarter);
    fft16384_vsx_interleave(fourth_quarter);
    pass_vsx_interleave(z, ff_cos_65536, 8192);
}

/*
 * Size-32 stage of the VSX FFT.
 * Calls fft16_vsx() on z and fft8_vsx() on z+16 and z+24, then pass_vsx()
 * over z with the ff_cos_32 table.
 */
static void fft32_vsx(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 16;
    FFTComplex *const fourth_quarter = z + 24;

    fft16_vsx(z);
    fft8_vsx(third_quarter);
    fft8_vsx(fourth_quarter);
    pass_vsx(z, ff_cos_32, 4);
}

/*
 * Size-64 stage of the VSX FFT.
 * Calls fft32_vsx() on z and fft16_vsx() on z+32 and z+48, then pass_vsx()
 * over z with the ff_cos_64 table.
 */
static void fft64_vsx(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 32;
    FFTComplex *const fourth_quarter = z + 48;

    fft32_vsx(z);
    fft16_vsx(third_quarter);
    fft16_vsx(fourth_quarter);
    pass_vsx(z, ff_cos_64, 8);
}
/*
 * Size-128 stage of the VSX FFT.
 * Calls fft64_vsx() on z and fft32_vsx() on z+64 and z+96, then pass_vsx()
 * over z with the ff_cos_128 table.
 */
static void fft128_vsx(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 64;
    FFTComplex *const fourth_quarter = z + 96;

    fft64_vsx(z);
    fft32_vsx(third_quarter);
    fft32_vsx(fourth_quarter);
    pass_vsx(z, ff_cos_128, 16);
}
/*
 * Size-256 stage of the VSX FFT.
 * Calls fft128_vsx() on z and fft64_vsx() on z+128 and z+192, then
 * pass_vsx() over z with the ff_cos_256 table.
 */
static void fft256_vsx(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 128;
    FFTComplex *const fourth_quarter = z + 192;

    fft128_vsx(z);
    fft64_vsx(third_quarter);
    fft64_vsx(fourth_quarter);
    pass_vsx(z, ff_cos_256, 32);
}
/*
 * Size-512 stage of the VSX FFT.
 * Calls fft256_vsx() on z and fft128_vsx() on z+256 and z+384, then
 * pass_vsx() over z with the ff_cos_512 table.
 */
static void fft512_vsx(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 256;
    FFTComplex *const fourth_quarter = z + 384;

    fft256_vsx(z);
    fft128_vsx(third_quarter);
    fft128_vsx(fourth_quarter);
    pass_vsx(z, ff_cos_512, 64);
}
/*
 * Size-1024 stage of the VSX FFT.
 * Calls fft512_vsx() on z and fft256_vsx() on z+512 and z+768, then
 * pass_vsx() over z with the ff_cos_1024 table.
 */
static void fft1024_vsx(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 512;
    FFTComplex *const fourth_quarter = z + 768;

    fft512_vsx(z);
    fft256_vsx(third_quarter);
    fft256_vsx(fourth_quarter);
    pass_vsx(z, ff_cos_1024, 128);
}
/*
 * Size-2048 stage of the VSX FFT.
 * Calls fft1024_vsx() on z and fft512_vsx() on z+1024 and z+1536, then
 * pass_vsx() over z with the ff_cos_2048 table.
 */
static void fft2048_vsx(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 1024;
    FFTComplex *const fourth_quarter = z + 1536;

    fft1024_vsx(z);
    fft512_vsx(third_quarter);
    fft512_vsx(fourth_quarter);
    pass_vsx(z, ff_cos_2048, 256);
}
/*
 * Size-4096 stage of the VSX FFT.
 * Calls fft2048_vsx() on z and fft1024_vsx() on z+2048 and z+3072, then
 * pass_vsx() over z with the ff_cos_4096 table.
 */
static void fft4096_vsx(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 2048;
    FFTComplex *const fourth_quarter = z + 3072;

    fft2048_vsx(z);
    fft1024_vsx(third_quarter);
    fft1024_vsx(fourth_quarter);
    pass_vsx(z, ff_cos_4096, 512);
}
/*
 * Size-8192 stage of the VSX FFT.
 * Calls fft4096_vsx() on z and fft2048_vsx() on z+4096 and z+6144, then
 * pass_vsx() over z with the ff_cos_8192 table.
 */
static void fft8192_vsx(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 4096;
    FFTComplex *const fourth_quarter = z + 6144;

    fft4096_vsx(z);
    fft2048_vsx(third_quarter);
    fft2048_vsx(fourth_quarter);
    pass_vsx(z, ff_cos_8192, 1024);
}
/*
 * Size-16384 stage of the VSX FFT.
 * Calls fft8192_vsx() on z and fft4096_vsx() on z+8192 and z+12288, then
 * pass_vsx() over z with the ff_cos_16384 table.
 */
static void fft16384_vsx(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 8192;
    FFTComplex *const fourth_quarter = z + 12288;

    fft8192_vsx(z);
    fft4096_vsx(third_quarter);
    fft4096_vsx(fourth_quarter);
    pass_vsx(z, ff_cos_16384, 2048);
}
/*
 * Size-32768 stage of the VSX FFT.
 * Calls fft16384_vsx() on z and fft8192_vsx() on z+16384 and z+24576, then
 * pass_vsx() over z with the ff_cos_32768 table.
 */
static void fft32768_vsx(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 16384;
    FFTComplex *const fourth_quarter = z + 24576;

    fft16384_vsx(z);
    fft8192_vsx(third_quarter);
    fft8192_vsx(fourth_quarter);
    pass_vsx(z, ff_cos_32768, 4096);
}
/*
 * Size-65536 stage of the VSX FFT.
 * Calls fft32768_vsx() on z and fft16384_vsx() on z+32768 and z+49152, then
 * pass_vsx() over z with the ff_cos_65536 table.
 */
static void fft65536_vsx(FFTComplex *z)
{
    FFTComplex *const third_quarter  = z + 32768;
    FFTComplex *const fourth_quarter = z + 49152;

    fft32768_vsx(z);
    fft16384_vsx(third_quarter);
    fft16384_vsx(fourth_quarter);
    pass_vsx(z, ff_cos_65536, 8192);
}

/*
 * Table of the non-interleaved VSX transforms, ordered by size.
 * ff_fft_calc_vsx() picks entry (nbits - 2), so index 0 is fft4_vsx and
 * the last entry (index 14) is fft65536_vsx.
 */
static void (* const fft_dispatch_vsx[])(FFTComplex*) = {
    fft4_vsx,       /* index  0 */
    fft8_vsx,       /* index  1 */
    fft16_vsx,      /* index  2 */
    fft32_vsx,      /* index  3 */
    fft64_vsx,      /* index  4 */
    fft128_vsx,     /* index  5 */
    fft256_vsx,     /* index  6 */
    fft512_vsx,     /* index  7 */
    fft1024_vsx,    /* index  8 */
    fft2048_vsx,    /* index  9 */
    fft4096_vsx,    /* index 10 */
    fft8192_vsx,    /* index 11 */
    fft16384_vsx,   /* index 12 */
    fft32768_vsx,   /* index 13 */
    fft65536_vsx,   /* index 14 */
};
/*
 * Table of the interleaved VSX transforms, ordered by size.
 * ff_fft_calc_interleave_vsx() picks entry (nbits - 2), so index 0 is
 * fft4_vsx_interleave and the last entry (index 14) is
 * fft65536_vsx_interleave.
 */
static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = {
    fft4_vsx_interleave,       /* index  0 */
    fft8_vsx_interleave,       /* index  1 */
    fft16_vsx_interleave,      /* index  2 */
    fft32_vsx_interleave,      /* index  3 */
    fft64_vsx_interleave,      /* index  4 */
    fft128_vsx_interleave,     /* index  5 */
    fft256_vsx_interleave,     /* index  6 */
    fft512_vsx_interleave,     /* index  7 */
    fft1024_vsx_interleave,    /* index  8 */
    fft2048_vsx_interleave,    /* index  9 */
    fft4096_vsx_interleave,    /* index 10 */
    fft8192_vsx_interleave,    /* index 11 */
    fft16384_vsx_interleave,   /* index 12 */
    fft32768_vsx_interleave,   /* index 13 */
    fft65536_vsx_interleave,   /* index 14 */
};
/*
 * Run the interleaved VSX transform selected by s->nbits on z.
 *
 * The routine is looked up in fft_dispatch_vsx_interleave at index
 * (s->nbits - 2) and called with z as its only argument.
 *
 * NOTE(review): the index is not range-checked here; presumably nbits is
 * validated when the context is set up -- confirm against the init code.
 */
void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z)
{
    void (* const transform)(FFTComplex*) =
        fft_dispatch_vsx_interleave[s->nbits - 2];

    transform(z);
}
/*
 * Run the VSX transform selected by s->nbits on z.
 *
 * The routine is looked up in fft_dispatch_vsx at index (s->nbits - 2)
 * and called with z as its only argument.
 *
 * NOTE(review): the index is not range-checked here; presumably nbits is
 * validated when the context is set up -- confirm against the init code.
 */
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z)
{
    void (* const transform)(FFTComplex*) = fft_dispatch_vsx[s->nbits - 2];

    transform(z);
}
#endif /* HAVE_VSX */
ppc: Fix the bug of fft for little endian environment on POWER7 and later Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-06-18 11:51:35 +03:00			`/*`
			`* FFT transform, optimized with VSX built-in functions`
			`* Copyright (c) 2014 Rong Yan`
			`*`
			`* This algorithm (though not any of the implementation details) is`
			`* based on libdjbfft by D. J. Bernstein.`
			`*`
			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* FFmpeg is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with FFmpeg; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`


			`#include "config.h"`
			`#include "libavutil/cpu.h"`
			`#include "libavutil/ppc/types_altivec.h"`
			`#include "libavutil/ppc/util_altivec.h"`
			`#include "libavcodec/fft.h"`
			`#include "libavcodec/fft-internal.h"`
			`#include "fft_vsx.h"`

			`#if HAVE_VSX`

			`static void fft32_vsx_interleave(FFTComplex *z)`
			`{`
			`fft16_vsx_interleave(z);`
			`fft8_vsx_interleave(z+16);`
			`fft8_vsx_interleave(z+24);`
			`pass_vsx_interleave(z,ff_cos_32,4);`
			`}`

			`static void fft64_vsx_interleave(FFTComplex *z)`
			`{`
			`fft32_vsx_interleave(z);`
			`fft16_vsx_interleave(z+32);`
			`fft16_vsx_interleave(z+48);`
			`pass_vsx_interleave(z,ff_cos_64, 8);`
			`}`
			`static void fft128_vsx_interleave(FFTComplex *z)`
			`{`
			`fft64_vsx_interleave(z);`
			`fft32_vsx_interleave(z+64);`
			`fft32_vsx_interleave(z+96);`
			`pass_vsx_interleave(z,ff_cos_128,16);`
			`}`
			`static void fft256_vsx_interleave(FFTComplex *z)`
			`{`
			`fft128_vsx_interleave(z);`
			`fft64_vsx_interleave(z+128);`
			`fft64_vsx_interleave(z+192);`
			`pass_vsx_interleave(z,ff_cos_256,32);`
			`}`
			`static void fft512_vsx_interleave(FFTComplex *z)`
			`{`
			`fft256_vsx_interleave(z);`
			`fft128_vsx_interleave(z+256);`
			`fft128_vsx_interleave(z+384);`
			`pass_vsx_interleave(z,ff_cos_512,64);`
			`}`
			`static void fft1024_vsx_interleave(FFTComplex *z)`
			`{`
			`fft512_vsx_interleave(z);`
			`fft256_vsx_interleave(z+512);`
			`fft256_vsx_interleave(z+768);`
			`pass_vsx_interleave(z,ff_cos_1024,128);`

			`}`
			`static void fft2048_vsx_interleave(FFTComplex *z)`
			`{`
			`fft1024_vsx_interleave(z);`
			`fft512_vsx_interleave(z+1024);`
			`fft512_vsx_interleave(z+1536);`
			`pass_vsx_interleave(z,ff_cos_2048,256);`
			`}`
			`static void fft4096_vsx_interleave(FFTComplex *z)`
			`{`
			`fft2048_vsx_interleave(z);`
			`fft1024_vsx_interleave(z+2048);`
			`fft1024_vsx_interleave(z+3072);`
			`pass_vsx_interleave(z,ff_cos_4096, 512);`
			`}`
			`static void fft8192_vsx_interleave(FFTComplex *z)`
			`{`
			`fft4096_vsx_interleave(z);`
			`fft2048_vsx_interleave(z+4096);`
			`fft2048_vsx_interleave(z+6144);`
			`pass_vsx_interleave(z,ff_cos_8192,1024);`
			`}`
			`static void fft16384_vsx_interleave(FFTComplex *z)`
			`{`
			`fft8192_vsx_interleave(z);`
			`fft4096_vsx_interleave(z+8192);`
			`fft4096_vsx_interleave(z+12288);`
			`pass_vsx_interleave(z,ff_cos_16384,2048);`
			`}`
			`static void fft32768_vsx_interleave(FFTComplex *z)`
			`{`
			`fft16384_vsx_interleave(z);`
			`fft8192_vsx_interleave(z+16384);`
			`fft8192_vsx_interleave(z+24576);`
			`pass_vsx_interleave(z,ff_cos_32768,4096);`
			`}`
			`static void fft65536_vsx_interleave(FFTComplex *z)`
			`{`
			`fft32768_vsx_interleave(z);`
			`fft16384_vsx_interleave(z+32768);`
			`fft16384_vsx_interleave(z+49152);`
			`pass_vsx_interleave(z,ff_cos_65536,8192);`
			`}`

			`static void fft32_vsx(FFTComplex *z)`
			`{`
			`fft16_vsx(z);`
			`fft8_vsx(z+16);`
			`fft8_vsx(z+24);`
			`pass_vsx(z,ff_cos_32,4);`
			`}`

			`static void fft64_vsx(FFTComplex *z)`
			`{`
			`fft32_vsx(z);`
			`fft16_vsx(z+32);`
			`fft16_vsx(z+48);`
			`pass_vsx(z,ff_cos_64, 8);`
			`}`
			`static void fft128_vsx(FFTComplex *z)`
			`{`
			`fft64_vsx(z);`
			`fft32_vsx(z+64);`
			`fft32_vsx(z+96);`
			`pass_vsx(z,ff_cos_128,16);`
			`}`
			`static void fft256_vsx(FFTComplex *z)`
			`{`
			`fft128_vsx(z);`
			`fft64_vsx(z+128);`
			`fft64_vsx(z+192);`
			`pass_vsx(z,ff_cos_256,32);`
			`}`
			`static void fft512_vsx(FFTComplex *z)`
			`{`
			`fft256_vsx(z);`
			`fft128_vsx(z+256);`
			`fft128_vsx(z+384);`
			`pass_vsx(z,ff_cos_512,64);`
			`}`
			`static void fft1024_vsx(FFTComplex *z)`
			`{`
			`fft512_vsx(z);`
			`fft256_vsx(z+512);`
			`fft256_vsx(z+768);`
			`pass_vsx(z,ff_cos_1024,128);`

			`}`
			`static void fft2048_vsx(FFTComplex *z)`
			`{`
			`fft1024_vsx(z);`
			`fft512_vsx(z+1024);`
			`fft512_vsx(z+1536);`
			`pass_vsx(z,ff_cos_2048,256);`
			`}`
			`static void fft4096_vsx(FFTComplex *z)`
			`{`
			`fft2048_vsx(z);`
			`fft1024_vsx(z+2048);`
			`fft1024_vsx(z+3072);`
			`pass_vsx(z,ff_cos_4096, 512);`
			`}`
			`static void fft8192_vsx(FFTComplex *z)`
			`{`
			`fft4096_vsx(z);`
			`fft2048_vsx(z+4096);`
			`fft2048_vsx(z+6144);`
			`pass_vsx(z,ff_cos_8192,1024);`
			`}`
			`static void fft16384_vsx(FFTComplex *z)`
			`{`
			`fft8192_vsx(z);`
			`fft4096_vsx(z+8192);`
			`fft4096_vsx(z+12288);`
			`pass_vsx(z,ff_cos_16384,2048);`
			`}`
			`static void fft32768_vsx(FFTComplex *z)`
			`{`
			`fft16384_vsx(z);`
			`fft8192_vsx(z+16384);`
			`fft8192_vsx(z+24576);`
			`pass_vsx(z,ff_cos_32768,4096);`
			`}`
			`static void fft65536_vsx(FFTComplex *z)`
			`{`
			`fft32768_vsx(z);`
			`fft16384_vsx(z+32768);`
			`fft16384_vsx(z+49152);`
			`pass_vsx(z,ff_cos_65536,8192);`
			`}`

			`static void (* const fft_dispatch_vsx[])(FFTComplex*) = {`
			`fft4_vsx, fft8_vsx, fft16_vsx, fft32_vsx, fft64_vsx, fft128_vsx, fft256_vsx, fft512_vsx, fft1024_vsx,`
			`fft2048_vsx, fft4096_vsx, fft8192_vsx, fft16384_vsx, fft32768_vsx, fft65536_vsx,`
			`};`
			`static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = {`
			`fft4_vsx_interleave, fft8_vsx_interleave, fft16_vsx_interleave, fft32_vsx_interleave, fft64_vsx_interleave,`
			`fft128_vsx_interleave, fft256_vsx_interleave, fft512_vsx_interleave, fft1024_vsx_interleave,`
			`fft2048_vsx_interleave, fft4096_vsx_interleave, fft8192_vsx_interleave, fft16384_vsx_interleave, fft32768_vsx_interleave, fft65536_vsx_interleave,`
			`};`
			`void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z)`
			`{`
			`fft_dispatch_vsx_interleave[s->nbits-2](z);`
			`}`
			`void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z)`
			`{`
			`fft_dispatch_vsx[s->nbits-2](z);`
			`}`
			`#endif /* HAVE_VSX */`