FFmpeg/libavcodec/ppc/fft_altivec.c

/*
 * FFT/IFFT transforms
 * AltiVec-enabled
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 * Based on code Copyright (c) 2002 Fabrice Bellard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavcodec/dsputil.h"
#include "dsputil_ppc.h"
#include "util_altivec.h"
/**
 * Do a complex FFT with the parameters defined in ff_fft_init(). The
 * input data must already be permuted with the s->revtab table. No
 * 1.0/sqrt(n) normalization is done.
 *
 * AltiVec-enabled version. The code mirrors the SSE version, except
 * that successive MUL + ADD/SUB have been merged into fused
 * multiply-adds ('vec_madd' / 'vec_nmsub' in AltiVec).
 *
 * Requirements:
 *  - the 'z' pointer must be 16 bytes-aligned (it is accessed with
 *    vec_ld/vec_st);
 *  - every FFTComplex must be an 8 bytes-aligned pair of floats, so that
 *    one vector holds exactly two complex values;
 *  - the first two passes handle four complex values per iteration, so
 *    transforms with s->nbits < 2 are left untouched rather than
 *    overrunning the buffer.
 *
 * @param s FFT context; nbits, inverse and exptab1 are read
 * @param z array of (1 << s->nbits) complex values, transformed in place
 */
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
{
POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6);
    /* all-zero addend, used where vec_madd serves as a plain multiply */
    register const vector float vczero = (const vector float)vec_splat_u32(0);

    int ln = s->nbits;
    int j, np;
    int nblocks, nloops;
    register FFTComplex *p, *q;
    FFTComplex *cptr, *cptr1;
    int k;

POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);

    np = 1 << ln;

    /* pass 0 and pass 1, merged: four complex values (two vectors) per
     * iteration */
    {
        vector float *r, a, b, a1, c1, c2;

        r = (vector float *)&z[0];

        /* sign masks built by vcii() from util_altivec.h; 'p'/'n'
         * presumably select +1.0/-1.0 lanes -- confirm against the macro */
        c1 = vcii(p,p,n,n);

        /* the sign pattern of the pass 1 factor depends on the transform
         * direction */
        if (s->inverse) {
            c2 = vcii(p,p,n,p);
        } else {
            c2 = vcii(p,p,p,n);
        }

        /* entry-guarded so that np < 4 performs no iteration instead of
         * wrapping the counter */
        for (j = np >> 2; j != 0; --j) {
            a  = vec_ld(0, r);
            a1 = vec_ld(sizeof(vector float), r);

            /* do the pass 0 butterfly on the first vector */
            b = vec_perm(a,a,vcprmle(1,0,3,2));
            a = vec_madd(a,c1,b);

            /* do the pass 0 butterfly on the second vector */
            b = vec_perm(a1,a1,vcprmle(1,0,3,2));
            b = vec_madd(a1,c1,b);

            /* multiply third by -i */
            b = vec_perm(b,b,vcprmle(2,3,1,0));

            /* do the pass 1 butterfly: a + b*c2 and a - b*c2 */
            vec_st(vec_madd(b,c2,a), 0, r);
            vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);

            r += 2;
        }
    }

    /* pass 2 .. ln-1 */

    nblocks = np >> 3;
    nloops  = 1 << 2;

    cptr1 = s->exptab1;

    /* entry-guarded: for np < 8 there is no further pass to run (the
     * former do/while would have decremented j from 0 and run away) */
    while (nblocks != 0) {
        p = z;
        q = z + nloops;
        j = nblocks;
        do {
            /* every block of a pass restarts at the same table position */
            cptr = cptr1;
            k = nloops >> 1;
            do {
                vector float a,b,c,t1;

                a = vec_ld(0, (float*)p);
                b = vec_ld(0, (float*)q);

                /* complex mul */
                c = vec_ld(0, (float*)cptr);
                /*  cre*re cim*re */
                t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
                c = vec_ld(sizeof(vector float), (float*)cptr);
                /*  -cim*im cre*im */
                b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);

                /* butterfly */
                vec_st(vec_add(a,b), 0, (float*)p);
                vec_st(vec_sub(a,b), 0, (float*)q);

                /* two complex values consumed from each half, two vectors
                 * (four FFTComplex) consumed from the table */
                p += 2;
                q += 2;
                cptr += 4;
            } while (--k);

            /* skip the half that was just processed through the other
             * pointer */
            p += nloops;
            q += nloops;
        } while (--j);

        /* one pass consumes (nloops / 2) * 4 table entries */
        cptr1 += nloops * 2;
        nblocks >>= 1;
        nloops <<= 1;
    }

POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
}

/**
 * Hook the AltiVec FFT implementation into an FFT context.
 *
 * @param s context to update; its fft_calc callback is pointed at
 *          ff_fft_calc_altivec() and its split_radix field is cleared
 */
av_cold void ff_fft_init_altivec(FFTContext *s)
{
    /* NOTE(review): presumably tells the generic FFT code that this
     * routine is not a split-radix implementation -- confirm against
     * ff_fft_init() */
    s->split_radix = 0;
    s->fft_calc    = ff_fft_calc_altivec;
}
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`/*`
			`* FFT/IFFT transforms`
			`* AltiVec-enabled`
altivec patches by Romain Dolbeau Originally committed as revision 1423 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-08 20:47:49 +02:00			`* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>`
cosmetics: Remove pointless period after copyright statement non-sentences. Originally committed as revision 16684 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-01-19 17:46:40 +02:00			`* Based on code Copyright (c) 2002 Fabrice Bellard`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 18:30:46 +03:00			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 18:30:46 +03:00			`* version 2.1 of the License, or (at your option) any later version.`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 18:30:46 +03:00			`* FFmpeg is distributed in the hope that it will be useful,`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 18:30:46 +03:00			`* License along with FFmpeg; if not, write to the Free Software`
Update licensing information: The FSF changed postal address. Originally committed as revision 4842 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-01-13 00:43:26 +02:00			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`*/`
Use full path for #includes from another directory. Originally committed as revision 13098 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-05-09 14:56:36 +03:00			`#include "libavcodec/dsputil.h"`
Sanitize altivec code so it can be built with runtime check properly Originally committed as revision 10640 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-10-02 14:39:32 +03:00			`#include "dsputil_ppc.h"`
			`#include "util_altivec.h"`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`/**`
fft_() renamed into ff_fft_() patch by (Gildas Bazin <gbazin at altern dot org>) Originally committed as revision 2882 to svn://svn.ffmpeg.org/ffmpeg/trunk 2004-03-13 23:43:24 +02:00			`* Do a complex FFT with the parameters defined in ff_fft_init(). The`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`* input data must be permuted before with s->revtab table. No`
			`* 1.0/sqrt(n) normalization is done.`
			`* AltiVec-enabled`
			`* This code assumes that the 'z' pointer is 16 bytes-aligned`
			`* It also assumes all FFTComplex are 8 bytes-aligned pair of float`
			`* The code is exactly the same as the SSE version, except`
altivec patches by Romain Dolbeau Originally committed as revision 1423 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-08 20:47:49 +02:00			`* that successive MUL + ADD/SUB have been merged into`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`* fused multiply-add ('vec_madd' in altivec)`
			`*/`
fft_() renamed into ff_fft_() patch by (Gildas Bazin <gbazin at altern dot org>) Originally committed as revision 2882 to svn://svn.ffmpeg.org/ffmpeg/trunk 2004-03-13 23:43:24 +02:00			`void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`{`
1) remove TBL support in PPC performance. It's much more useful to use the PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless code around 2) make the PPC perf stuff a configure option 3) make put_pixels16_altivec a bit faster by unrolling the loop by 4 patch by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 2022 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-07-09 23:18:13 +03:00			`POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6);`
Simplify Originally committed as revision 6932 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-11-07 14:45:29 +02:00			`register const vector float vczero = (const vector float)vec_splat_u32(0.);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`int ln = s->nbits;`
COSMETICS: tabs --> spaces, some prettyprinting Originally committed as revision 4764 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-22 03:10:11 +02:00			`int j, np, np2;`
			`int nblocks, nloops;`
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`register FFTComplex *p, *q;`
			`FFTComplex *cptr, *cptr1;`
			`int k;`

1) remove TBL support in PPC performance. It's much more useful to use the PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless code around 2) make the PPC perf stuff a configure option 3) make put_pixels16_altivec a bit faster by unrolling the loop by 4 patch by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 2022 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-07-09 23:18:13 +03:00			`POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);`
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 1464 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-16 23:54:55 +02:00
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`np = 1 << ln;`

			`{`
			`vector float *r, a, b, a1, c1, c2;`

			`r = (vector float *)&z[0];`

			`c1 = vcii(p,p,n,n);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
cosmetics: Reformat PPC code in libavcodec according to style guidelines. This includes indentation changes, comment reformatting, consistent brace placement and some prettyprinting. Originally committed as revision 14316 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-20 21:58:30 +03:00			`if (s->inverse) {`
			`c2 = vcii(p,p,n,p);`
			`} else {`
			`c2 = vcii(p,p,p,n);`
			`}`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`j = (np >> 2);`
			`do {`
			`a = vec_ld(0, r);`
			`a1 = vec_ld(sizeof(vector float), r);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`b = vec_perm(a,a,vcprmle(1,0,3,2));`
			`a = vec_madd(a,c1,b);`
			`/* do the pass 0 butterfly */`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`b = vec_perm(a1,a1,vcprmle(1,0,3,2));`
			`b = vec_madd(a1,c1,b);`
			`/* do the pass 0 butterfly */`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`/* multiply third by -i */`
			`b = vec_perm(b,b,vcprmle(2,3,1,0));`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`/* do the pass 1 butterfly */`
			`vec_st(vec_madd(b,c2,a), 0, r);`
			`vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`r += 2;`
			`} while (--j != 0);`
			`}`
			`/* pass 2 .. ln-1 */`

			`nblocks = np >> 3;`
			`nloops = 1 << 2;`
			`np2 = np >> 1;`

			`cptr1 = s->exptab1;`
			`do {`
			`p = z;`
			`q = z + nloops;`
			`j = nblocks;`
			`do {`
			`cptr = cptr1;`
			`k = nloops >> 1;`
			`do {`
			`vector float a,b,c,t1;`

			`a = vec_ld(0, (float*)p);`
			`b = vec_ld(0, (float*)q);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`/* complex mul */`
			`c = vec_ld(0, (float*)cptr);`
			`/*  cre*re cim*re */`
			`t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);`
			`c = vec_ld(sizeof(vector float), (float*)cptr);`
			`/*  -cim*im cre*im */`
			`b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`/* butterfly */`
			`vec_st(vec_add(a,b), 0, (float*)p);`
			`vec_st(vec_sub(a,b), 0, (float*)q);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`p += 2;`
			`q += 2;`
			`cptr += 4;`
			`} while (--k);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding Originally committed as revision 1417 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-07 20:15:48 +02:00			`p += nloops;`
			`q += nloops;`
			`} while (--j);`
			`cptr1 += nloops * 2;`
			`nblocks = nblocks >> 1;`
			`nloops = nloops << 1;`
			`} while (nblocks != 0);`

1) remove TBL support in PPC performance. It's much more useful to use the PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless code around 2) make the PPC perf stuff a configure option 3) make put_pixels16_altivec a bit faster by unrolling the loop by 4 patch by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 2022 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-07-09 23:18:13 +03:00			`POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);`
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 1464 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-16 23:54:55 +02:00			`}`
Move per-arch fft init bits into the corresponding subdirs Originally committed as revision 19864 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-09-16 00:14:14 +03:00
			`av_cold void ff_fft_init_altivec(FFTContext *s)`
			`{`
			`s->fft_calc = ff_fft_calc_altivec;`
			`s->split_radix = 0;`
			`}`