FFmpeg/libavcodec/ppc/gmc_altivec.c

/*
 * GMC (Global Motion Compensation)
 * AltiVec-enabled
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"
#include "dsputil_ppc.h"
#include "util_altivec.h"
#include "types_altivec.h"
#include "dsputil_altivec.h"

/*
  AltiVec-enhanced gmc1. At the moment this code assumes that stride is a
  multiple of 8, to preserve proper dst alignment.
*/
#define GMC1_PERF_COND (h==8)
/*
 * Bilinear interpolation of an 8-pixel-wide block, h rows high.
 *
 * For every row and for x in 0..7 the result is
 *   dst[x] = (A*src[x] + B*src[x+1] + C*src[x+stride] + D*src[x+stride+1]
 *             + rounder) >> 8
 * with A=(16-x16)*(16-y16), B=x16*(16-y16), C=(16-x16)*y16, D=x16*y16.
 * The four weights always sum to 256, hence the final shift by 8.
 * The sums are accumulated in unsigned 16-bit vector lanes.
 *
 * dst      destination, must be 8-byte aligned; 8 bytes per row are
 *          replaced, the other half of the enclosing 16-byte vector is
 *          loaded and stored back unchanged
 * src      source, any alignment; rows 0..h are read (h+1 rows in total),
 *          each through two 16-byte vector loads
 * stride   distance in bytes between consecutive rows of dst and of src
 * h        number of rows to produce
 * x16, y16 horizontal / vertical interpolation position
 *          (presumably in 1/16 pixel units, 0..16 -- not checked here)
 * rounder  value added to the weighted sum before the shift
 */
void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder)
{
POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
    const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder;
    const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] =
        {
            (16-x16)*(16-y16), /* A */
            (   x16)*(16-y16), /* B */
            (16-x16)*(   y16), /* C */
            (   x16)*(   y16), /* D */
            0, 0, 0, 0         /* padding */
        };
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);
    register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
    register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD;
    int i;
    /* offset of the current source row inside its 16-byte vector */
    unsigned long src_really_odd = (unsigned long)src & 0x0000000F;


POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);

    /* load the four weights without dropping the const qualifier of ABCD,
     * then broadcast each of them over a whole vector */
    tempA = vec_ld(0, (const unsigned short *)ABCD);
    Av = vec_splat(tempA, 0);
    Bv = vec_splat(tempA, 1);
    Cv = vec_splat(tempA, 2);
    Dv = vec_splat(tempA, 3);

    rounderV = vec_splat((vec_u16)vec_lde(0, &rounder_a), 0);

    // we'll be able to pick up our 9 char elements
    // at src from those 32 bytes
    // we load the first batch here, as inside the loop
    // we can re-use 'src+stride' from one iteration
    // as the 'src' of the next.
    src_0 = vec_ld(0, src);
    src_1 = vec_ld(16, src);
    srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));

    if (src_really_odd != 0x0000000F) {
        srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
    } else {
        // if src & 0xF == 0xF, then (src+1) is properly aligned
        // on the second vector.
        srcvB = src_1;
    }
    /* zero-extend the first 8 bytes of each vector to 16-bit lanes */
    srcvA = vec_mergeh(vczero, srcvA);
    srcvB = vec_mergeh(vczero, srcvB);

    for(i=0; i<h; i++) {
        /* non-zero when dst sits in the second half of its 16-byte vector */
        const unsigned long dst_odd = (unsigned long)dst & 0x0000000F;

        src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;

        dstv = vec_ld(0, dst);

        // we'll be able to pick up our 9 char elements
        // at src + stride from those 32 bytes
        // then reuse the resulting 2 vectors srcvC and srcvD
        // as the next srcvA and srcvB
        src_0 = vec_ld(stride + 0, src);
        src_1 = vec_ld(stride + 16, src);
        srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));

        if (src_really_odd != 0x0000000F) {
            srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
        } else {
            // if (src + stride) & 0xF == 0xF, then (src+stride+1) is
            // properly aligned on the second vector.
            srcvD = src_1;
        }

        srcvC = vec_mergeh(vczero, srcvC);
        srcvD = vec_mergeh(vczero, srcvD);


        // OK, now we (finally) do the math :-)
        // those four instructions replace 32 int muls & 32 int adds.
        // isn't AltiVec nice ?
        tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
        tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
        tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
        tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);

        /* the lower row of this iteration is the upper row of the next */
        srcvA = srcvC;
        srcvB = srcvD;

        tempD = vec_sr(tempD, vcsr8);

        /* narrow back to bytes: the 8 results end up in the first 8 bytes */
        dstv2 = vec_pack(tempD, (vector unsigned short)vczero);

        /* merge the 8 result bytes with the untouched half of the
         * destination vector (vcprm presumably builds a word-granular
         * permute mask, sN meaning word N of the second operand) */
        if (dst_odd) {
            dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
        } else {
            dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
        }

        vec_st(dstv2, 0, dst);

        dst += stride;
        src += stride;
    }

POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
}
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00			`/*`
dct_unquantize_h263_altivec by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1455 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-12 15:29:24 +02:00			`* GMC (Global Motion Compensation)`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00			`* AltiVec-enabled`
			`* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>`
			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 18:30:46 +03:00			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 18:30:46 +03:00			`* version 2.1 of the License, or (at your option) any later version.`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00			`*`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 18:30:46 +03:00			`* FFmpeg is distributed in the hope that it will be useful,`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
Change license headers to say 'FFmpeg' instead of 'this program/this library' and fix GPL/LGPL version mismatches. Originally committed as revision 6577 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-10-07 18:30:46 +03:00			`* License along with FFmpeg; if not, write to the Free Software`
Update licensing information: The FSF changed postal address. Originally committed as revision 4842 to svn://svn.ffmpeg.org/ffmpeg/trunk 2006-01-13 00:43:26 +02:00			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00			`*/`

Use full path for #includes from another directory. Originally committed as revision 13098 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-05-09 14:56:36 +03:00			`#include "libavcodec/dsputil.h"`
Sanitize altivec code so it can be built with runtime check properly Originally committed as revision 10640 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-10-02 14:39:32 +03:00			`#include "dsputil_ppc.h"`
			`#include "util_altivec.h"`
PPC: simplify loading some values into altivec registers Instead of filling a local array with the desired value and loading it, load a single element and vec_splat() it to fill the vector. Originally committed as revision 19691 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-08-24 13:36:13 +03:00			`#include "types_altivec.h"`
PPC: move prototypes to headers and make some functions static Originally committed as revision 22267 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-03-07 00:37:14 +02:00			`#include "dsputil_altivec.h"`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00
			`/*`
			`altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8,`
spelling Originally committed as revision 11122 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-12-02 00:21:04 +02:00			`to preserve proper dst alignment.`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00			`*/`
PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 2008 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-07-04 12:39:05 +03:00			`#define GMC1_PERF_COND (h==8)`
* UINTX -> uintx_t INTX -> intx_t Originally committed as revision 1578 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-02-11 18:35:48 +02:00			`void gmc1_altivec(uint8_t dst / align 8 /, uint8_t src /* align1 */, int stride, int h, int x16, int y16, int rounder)`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00			`{`
1) remove TBL support in PPC performance. It's much more useful to use the PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless code around 2) make the PPC perf stuff a configure option 3) make put_pixels16_altivec a bit faster by unrolling the loop by 4 patch by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 2022 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-07-09 23:18:13 +03:00			`POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);`
Remove DECLARE_ALIGNED_{8,16} macros These macros are redundant. All uses are replaced with the generic DECLARE_ALIGNED macro instead. Originally committed as revision 22233 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-03-06 16:24:59 +02:00			`const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder;`
			`const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] =`
cosmetics: Reformat PPC code in libavcodec according to style guidelines. This includes indentation changes, comment reformatting, consistent brace placement and some prettyprinting. Originally committed as revision 14316 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-20 21:58:30 +03:00			`{`
			`(16-x16)(16-y16), / A */`
			`( x16)(16-y16), / B */`
			`(16-x16)( y16), / C */`
			`( x16)( y16), / D */`
			`0, 0, 0, 0 /* padding */`
			`};`
Remove const vector macro indirection that is useless and obfuscating now that the Metrowerks workarounds are gone. Originally committed as revision 10633 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-10-01 17:23:36 +03:00			`register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);`
			`register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00			`register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;`
			`register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD;`
			`int i;`
			`unsigned long dst_odd = (unsigned long)dst & 0x0000000F;`
			`unsigned long src_really_odd = (unsigned long)src & 0x0000000F;`

AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 1464 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-16 23:54:55 +02:00
1) remove TBL support in PPC performance. It's much more useful to use the PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless code around 2) make the PPC perf stuff a configure option 3) make put_pixels16_altivec a bit faster by unrolling the loop by 4 patch by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 2022 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-07-09 23:18:13 +03:00			`POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);`
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 1464 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-16 23:54:55 +02:00
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00			`tempA = vec_ld(0, (unsigned short*)ABCD);`
			`Av = vec_splat(tempA, 0);`
			`Bv = vec_splat(tempA, 1);`
			`Cv = vec_splat(tempA, 2);`
			`Dv = vec_splat(tempA, 3);`

PPC: simplify loading some values into altivec registers Instead of filling a local array with the desired value and loading it, load a single element and vec_splat() it to fill the vector. Originally committed as revision 19691 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-08-24 13:36:13 +03:00			`rounderV = vec_splat((vec_u16)vec_lde(0, &rounder_a), 0);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00			`// we'll be able to pick-up our 9 char elements`
			`// at src from those 32 bytes`
			`// we load the first batch here, as inside the loop`
			`// we can re-use 'src+stride' from one iteration`
			`// as the 'src' of the next.`
			`src_0 = vec_ld(0, src);`
			`src_1 = vec_ld(16, src);`
			`srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
cosmetics: Reformat PPC code in libavcodec according to style guidelines. This includes indentation changes, comment reformatting, consistent brace placement and some prettyprinting. Originally committed as revision 14316 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-20 21:58:30 +03:00			`if (src_really_odd != 0x0000000F) {`
			`// if src & 0xF == 0xF, then (src+1) is properly aligned`
			`// on the second vector.`
			`srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));`
			`} else {`
			`srcvB = src_1;`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00			`}`
			`srcvA = vec_mergeh(vczero, srcvA);`
			`srcvB = vec_mergeh(vczero, srcvB);`
COSMETICS: Remove all trailing whitespace. Originally committed as revision 4749 to svn://svn.ffmpeg.org/ffmpeg/trunk 2005-12-17 20:14:38 +02:00
cosmetics: Reformat PPC code in libavcodec according to style guidelines. This includes indentation changes, comment reformatting, consistent brace placement and some prettyprinting. Originally committed as revision 14316 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-20 21:58:30 +03:00			`for(i=0; i<h; i++) {`
			`dst_odd = (unsigned long)dst & 0x0000000F;`
			`src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;`

			`dstv = vec_ld(0, dst);`

			`// we we'll be able to pick-up our 9 char elements`
			`// at src + stride from those 32 bytes`
			`// then reuse the resulting 2 vectors srvcC and srcvD`
			`// as the next srcvA and srcvB`
			`src_0 = vec_ld(stride + 0, src);`
			`src_1 = vec_ld(stride + 16, src);`
			`srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));`

			`if (src_really_odd != 0x0000000F) {`
			`// if src & 0xF == 0xF, then (src+1) is properly aligned`
			`// on the second vector.`
			`srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));`
			`} else {`
			`srcvD = src_1;`
			`}`

			`srcvC = vec_mergeh(vczero, srcvC);`
			`srcvD = vec_mergeh(vczero, srcvD);`


			`// OK, now we (finally) do the math :-)`
			`// those four instructions replaces 32 int muls & 32 int adds.`
			`// isn't AltiVec nice ?`
			`tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);`
			`tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);`
			`tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);`
			`tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);`

			`srcvA = srcvC;`
			`srcvB = srcvD;`

			`tempD = vec_sr(tempD, vcsr8);`

			`dstv2 = vec_pack(tempD, (vector unsigned short)vczero);`

			`if (dst_odd) {`
			`dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));`
			`} else {`
			`dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));`
			`}`

			`vec_st(dstv2, 0, dst);`

			`dst += stride;`
			`src += stride;`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00			`}`
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 1464 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-16 23:54:55 +02:00
1) remove TBL support in PPC performance. It's much more useful to use the PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless code around 2) make the PPC perf stuff a configure option 3) make put_pixels16_altivec a bit faster by unrolling the loop by 4 patch by (Romain Dolbeau <dolbeau at irisa dot fr>) Originally committed as revision 2022 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-07-09 23:18:13 +03:00			`POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);`
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>) Originally committed as revision 1448 to svn://svn.ffmpeg.org/ffmpeg/trunk 2003-01-11 22:51:03 +02:00			`}`