FFmpeg/libavcodec/ppc/int_altivec.c

/*
 * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 ** @file
 ** integer misc ops.
 **/

#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/ppc/types_altivec.h"
#include "libavcodec/dsputil.h"

#include "dsputil_altivec.h"

/*
 * Load 16 bytes from a possibly unaligned address: fetch the two aligned
 * quadwords that cover bytes [b, b + 15] and merge them with the permute
 * mask produced by vec_lvsl().  Using offset 15 (rather than 16) for the
 * second load keeps it inside the first quadword when b is already aligned.
 *
 * The expansion is a plain expression (no trailing semicolon) and the
 * argument is parenthesized; note that it is still evaluated three times,
 * so it must not have side effects.
 *
 * XXX lazy way, fix it later
 */
#define vec_unaligned_load(b) \
    vec_perm(vec_ld(0, (b)), vec_ld(15, (b)), vec_lvsl(0, (b)))

/**
 * Sum of squared differences between an int8 array and an int16 array.
 *
 * Scalar equivalent:
 *     score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
 *
 * @param pix1 first operand, int8 samples; no alignment requirement
 * @param pix2 second operand, int16 samples; no alignment requirement
 * @param size number of elements to process; values <= 0 yield 0
 * @return the accumulated sum of squared differences
 */
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                     int size) {
    int i, size16;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l, vpix1h;
    /* The union gives scalar access to the lanes of the vector accumulator. */
    union { vector signed int vscore;
            int32_t score[4];
          } u;
    u.vscore = vec_splat_s32(0);

    /* Number of full 16-element groups handled by the vector loop. */
    size16 = size >> 4;
    /* "> 0" rather than "!= 0": a negative size must not start a countdown
     * that walks far outside both buffers. */
    while (size16 > 0) {
        /* load 16 int8 values from pix1 and the first 8 int16 from pix2 */
        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
        /* sign-extend the first 8 bytes of pix1 to 16 bits and subtract */
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);
        /* load the next 8 int16 values from pix2 */
        vpix2 = vec_unaligned_load(pix2);
        /* multiply-sum: squares of the differences added into the 4 lanes */
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff  = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        pix1 += 16;
        pix2 += 8;
        size16--;
    }
    /* Fold the four partial sums; the total ends up in element 3. */
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));

    /* Scalar tail for the remaining (size % 16) elements. */
    size %= 16;
    for (i = 0; i < size; i++) {
        u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    }
    return u.score[3];
}

/**
 * Dot product of two int16 arrays.
 *
 * Elements are consumed eight at a time, so an order that is not a
 * multiple of 8 is effectively rounded up to the next multiple.
 *
 * @param v1    first operand; may be unaligned (loaded via two aligned
 *              vec_ld() fetches merged with a vec_lvsl() permute mask)
 * @param v2    second operand; fetched with a bare vec_ld(), so it is
 *              expected to be 16-byte aligned -- NOTE(review): confirm
 *              against the DSPContext contract
 * @param order number of elements
 * @return sum of v1[i] * v2[i]; the per-iteration accumulation uses
 *         vec_sums(), which per the AltiVec PIM saturates
 */
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    int i;
    LOAD_ZERO; /* project macro; presumably provides zero_s32v used below */
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    int32_t ires;

    for (i = 0; i < order; i += 8) {
        /*
         * Unaligned load of v1 done with explicit vec_ld() calls instead of
         * dereferencing a vector pointer: with VSX enabled the compiler may
         * turn such a dereference into a true unaligned load, and the
         * permute below would then shift the data a second time.
         * Offset 15 keeps the second fetch inside the same quadword when v1
         * is already aligned, so nothing beyond the needed bytes is read.
         */
        vec1 = vec_perm(vec_ld(0, v1), vec_ld(15, v1), vec_lvsl(0, v1));
        /* four partial sums of products, one per 32-bit lane */
        t    = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        /* fold the four lanes plus the running total into element 3 */
        res  = vec_sums(t, res);
        v1  += 8;
        v2  += 8;
    }
    /* Replicate the total into every lane so vec_ste() stores the same
     * value regardless of where ires is located. */
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}

/**
 * Compute the dot product of v1 and v2 and, in the same pass, update v1
 * in place with v1[i] += mul * v3[i].
 *
 * The dot product uses the values of v1 from before the update.
 *
 * @param v1    read and written through a vector pointer, so it is expected
 *              to be 16-byte aligned -- NOTE(review): confirm with callers
 * @param v2    dot-product operand; may be unaligned
 * @param v3    multiply-add operand; permuted with the alignment mask
 *              derived from v2, so it presumably must share v2's
 *              misalignment -- NOTE(review): confirm with callers
 * @param order number of elements; processed in groups of 16 by a do/while
 *              loop, so it is expected to be a non-zero multiple of 16
 * @param mul   multiplier, replicated into a vector of 16-bit lanes
 * @return sum of v1[i] * v2[i] over the original contents of v1
 */
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    LOAD_ZERO; /* project macro; presumably provides zero_s32v used below */
    vec_s16 *pv1 = (vec_s16*)v1;
    register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
    register vec_s16 t0, t1, i0, i1, i4;
    /* Prime the pipeline: i2/i3 always hold the aligned quadword at byte
     * offset 0 of the current v2/v3 position. */
    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
    register vec_s32 res = zero_s32v;
    register vec_u8 align = vec_lvsl(0, v2);
    int32_t ires;
    order >>= 4; /* number of 16-element groups */
    do {
        /* Assemble two unaligned vectors of v2 from three aligned loads;
         * the last load is kept in i2 for the next iteration. */
        i1 = vec_ld(16, v2);
        t0 = vec_perm(i2, i1, align);
        i2 = vec_ld(32, v2);
        t1 = vec_perm(i1, i2, align);
        i0 = pv1[0];
        i1 = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        /* Same scheme for v3, carrying the last load in i3. */
        i4 = vec_ld(16, v3);
        t0 = vec_perm(i3, i4, align);
        i3 = vec_ld(32, v3);
        t1 = vec_perm(i4, i3, align);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        /*
         * Advance by the 16 elements (32 bytes) just consumed.  Advancing
         * by only 8 would leave i2/i3 -- loaded from byte offset 32 --
         * out of step with offset 0 of the new position, and every
         * iteration after the first would combine the wrong data.
         */
        v2  += 16;
        v3  += 16;
    } while(--order);
    /* Fold the four lanes, then replicate the total so vec_ste() stores the
     * same value regardless of where ires is located. */
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);
    return ires;
}

/**
 * Install the AltiVec routines defined in this file into a DSP context.
 *
 * @param c     context whose function pointers are overwritten
 * @param avctx codec context; not used by this function
 */
void ff_int_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    /* vector dot-product helpers */
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
    c->scalarproduct_int16          = scalarproduct_int16_altivec;

    /* sum of squared differences, int8 vs int16 */
    c->ssd_int8_vs_int16            = ssd_int8_vs_int16_altivec;
}
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 12:47:37 +03:00			`/*`
			`* Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>`
			`*`
			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* FFmpeg is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with FFmpeg; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`/**`
Remove explicit filename from Doxygen @file commands. Passing an explicit filename to this command is only necessary if the documentation in the @file block refers to a file different from the one the block resides in. Originally committed as revision 22921 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-04-20 17:45:34 +03:00			`** @file`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 12:47:37 +03:00			`** integer misc ops.`
			`**/`

Remove unnecessary gcc_fixes.h #include. Originally committed as revision 18384 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-04-09 14:54:13 +03:00			`#include "config.h"`
			`#if HAVE_ALTIVEC_H`
			`#include <altivec.h>`
			`#endif`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 12:47:37 +03:00
PPC: Move types_altivec.h and util_altivec.h from libavcodec to libavutil This will allow for easier implementation of Altivec functions in libraries other than libavcodec. 2012-05-21 23:24:42 +03:00			`#include "libavutil/ppc/types_altivec.h"`
Remove unnecessary gcc_fixes.h #include. Originally committed as revision 18384 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-04-09 14:54:13 +03:00			`#include "libavcodec/dsputil.h"`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 12:47:37 +03:00
			`#include "dsputil_altivec.h"`

make arguments to ssd_int8_vs_int16() const Originally committed as revision 9548 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-07-09 02:15:00 +03:00			`static int ssd_int8_vs_int16_altivec(const int8_t pix1, const int16_t pix2,`
			`int size) {`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 12:47:37 +03:00			`int i, size16;`
			`vector signed char vpix1;`
			`vector signed short vpix2, vdiff, vpix1l,vpix1h;`
			`union { vector signed int vscore;`
			`int32_t score[4];`
cosmetics: Reformat PPC code in libavcodec according to style guidelines. This includes indentation changes, comment reformatting, consistent brace placement and some prettyprinting. Originally committed as revision 14316 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-20 21:58:30 +03:00			`} u;`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 12:47:37 +03:00			`u.vscore = vec_splat_s32(0);`
			`//`
			`//XXX lazy way, fix it later`

			`#define vec_unaligned_load(b) \`
			`vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b));`

			`size16 = size >> 4;`
			`while(size16) {`
			`// score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);`
			`//load pix1 and the first batch of pix2`

			`vpix1 = vec_unaligned_load(pix1);`
			`vpix2 = vec_unaligned_load(pix2);`
			`pix2 += 8;`
			`//unpack`
			`vpix1h = vec_unpackh(vpix1);`
			`vdiff = vec_sub(vpix1h, vpix2);`
			`vpix1l = vec_unpackl(vpix1);`
			`// load another batch from pix2`
			`vpix2 = vec_unaligned_load(pix2);`
			`u.vscore = vec_msum(vdiff, vdiff, u.vscore);`
			`vdiff = vec_sub(vpix1l, vpix2);`
			`u.vscore = vec_msum(vdiff, vdiff, u.vscore);`
			`pix1 += 16;`
			`pix2 += 8;`
			`size16--;`
			`}`
			`u.vscore = vec_sums(u.vscore, vec_splat_s32(0));`

			`size %= 16;`
			`for (i = 0; i < size; i++) {`
			`u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);`
			`}`
			`return u.score[3];`
			`}`

ppc: add const where needed in scalarproduct_int16_altivec() Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-04-27 12:43:08 +03:00			`static int32_t scalarproduct_int16_altivec(const int16_t v1, const int16_t v2,`
ppc: remove shift parameter from scalarproduct_int16_altivec() The shift parameter was removed from this interface in 7e1ce6a. This updates the Altivec implementation to match. Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-04-27 12:39:58 +03:00			`int order)`
Altivec implementation of APE vector functions Originally committed as revision 14082 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-06 10:33:09 +03:00			`{`
			`int i;`
			`LOAD_ZERO;`
ppc: add const where needed in scalarproduct_int16_altivec() Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-04-27 12:43:08 +03:00			`const vec_s16 *pv;`
			`register vec_s16 vec1;`
Cleanup _t types in libavcodec/ppc Originally committed as revision 16357 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-12-27 13:21:28 +02:00			`register vec_s32 res = vec_splat_s32(0), t;`
PPC: remove unnecessary alignment on local variables Storing a single element from a vector where all elements have the same value does not require an aligned destination. Which element is stored depends on the alignment of the destination address, but since they all have the same value, the result is the same regardless of the alignment. Originally committed as revision 19696 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-08-25 00:42:22 +03:00			`int32_t ires;`
Altivec implementation of APE vector functions Originally committed as revision 14082 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-06 10:33:09 +03:00
			`for(i = 0; i < order; i += 8){`
ppc: add const where needed in scalarproduct_int16_altivec() Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-04-27 12:43:08 +03:00			`pv = (const vec_s16*)v1;`
Altivec implementation of APE vector functions Originally committed as revision 14082 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-06 10:33:09 +03:00			`vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));`
			`t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);`
			`res = vec_sums(t, res);`
			`v1 += 8;`
			`v2 += 8;`
			`}`
			`res = vec_splat(res, 3);`
			`vec_ste(res, 0, &ires);`
			`return ires;`
			`}`

Add const to some pointer parameters. Patch by Eli Friedman, eli D friedman A gmail Originally committed as revision 23826 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-06-27 18:11:38 +03:00			`static int32_t scalarproduct_and_madd_int16_altivec(int16_t v1, const int16_t v2, const int16_t *v3, int order, int mul)`
refactor and optimize scalarproduct 29-105% faster apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-12-05 17:09:10 +02:00			`{`
			`LOAD_ZERO;`
			`vec_s16 pv1 = (vec_s16)v1;`
			`register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};`
ppc: dsputil: do unaligned block accesses correctly To load unaligned vector data in the usual way, explicit vec_ld() should be used rather than dereferencing a pointer to a vector type. When the VSX extension is enabled, gcc may compile vector pointer dereferences using the VSX lxvw4x instruction instead of the lvx instruction typically used with Altivec/VMX. As the behaviour of these instructions with unaligned addresses differs, it is important that only lvx is used here. Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-04-27 04:46:14 +03:00			`register vec_s16 t0, t1, i0, i1, i4;`
			`register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);`
refactor and optimize scalarproduct 29-105% faster apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-12-05 17:09:10 +02:00			`register vec_s32 res = zero_s32v;`
			`register vec_u8 align = vec_lvsl(0, v2);`
			`int32_t ires;`
			`order >>= 4;`
			`do {`
ppc: dsputil: do unaligned block accesses correctly To load unaligned vector data in the usual way, explicit vec_ld() should be used rather than dereferencing a pointer to a vector type. When the VSX extension is enabled, gcc may compile vector pointer dereferences using the VSX lxvw4x instruction instead of the lvx instruction typically used with Altivec/VMX. As the behaviour of these instructions with unaligned addresses differs, it is important that only lvx is used here. Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-04-27 04:46:14 +03:00			`i1 = vec_ld(16, v2);`
			`t0 = vec_perm(i2, i1, align);`
			`i2 = vec_ld(32, v2);`
			`t1 = vec_perm(i1, i2, align);`
refactor and optimize scalarproduct 29-105% faster apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-12-05 17:09:10 +02:00			`i0 = pv1[0];`
			`i1 = pv1[1];`
			`res = vec_msum(t0, i0, res);`
			`res = vec_msum(t1, i1, res);`
ppc: dsputil: do unaligned block accesses correctly To load unaligned vector data in the usual way, explicit vec_ld() should be used rather than dereferencing a pointer to a vector type. When the VSX extension is enabled, gcc may compile vector pointer dereferences using the VSX lxvw4x instruction instead of the lvx instruction typically used with Altivec/VMX. As the behaviour of these instructions with unaligned addresses differs, it is important that only lvx is used here. Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-04-27 04:46:14 +03:00			`i4 = vec_ld(16, v3);`
			`t0 = vec_perm(i3, i4, align);`
			`i3 = vec_ld(32, v3);`
			`t1 = vec_perm(i4, i3, align);`
refactor and optimize scalarproduct 29-105% faster apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-12-05 17:09:10 +02:00			`pv1[0] = vec_mladd(t0, muls, i0);`
			`pv1[1] = vec_mladd(t1, muls, i1);`
			`pv1 += 2;`
ppc: dsputil: do unaligned block accesses correctly To load unaligned vector data in the usual way, explicit vec_ld() should be used rather than dereferencing a pointer to a vector type. When the VSX extension is enabled, gcc may compile vector pointer dereferences using the VSX lxvw4x instruction instead of the lvx instruction typically used with Altivec/VMX. As the behaviour of these instructions with unaligned addresses differs, it is important that only lvx is used here. Signed-off-by: Mans Rullgard <mans@mansr.com> 2012-04-27 04:46:14 +03:00			`v2 += 8;`
			`v3 += 8;`
refactor and optimize scalarproduct 29-105% faster apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-12-05 17:09:10 +02:00			`} while(--order);`
			`res = vec_splat(vec_sums(res, zero_s32v), 3);`
			`vec_ste(res, 0, &ires);`
			`return ires;`
			`}`

ppc: Add ff_ prefix to nonstatic symbols Signed-off-by: Martin Storsjö <martin@martin.st> 2012-02-15 15:42:56 +03:00			`void ff_int_init_altivec(DSPContext* c, AVCodecContext *avctx)`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 12:47:37 +03:00			`{`
			`c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;`
Altivec implementation of APE vector functions Originally committed as revision 14082 to svn://svn.ffmpeg.org/ffmpeg/trunk 2008-07-06 10:33:09 +03:00			`c->scalarproduct_int16 = scalarproduct_int16_altivec;`
refactor and optimize scalarproduct 29-105% faster apply_filter, 6-90% faster ape decoding on core2 (Any x86 other than core2 probably gets much less, since this is mostly due to ssse3 cachesplit avoidance and I haven't written the full gamut of other cachesplit modes.) 9-123% faster ape decoding on G4. Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk 2009-12-05 17:09:10 +02:00			`c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;`
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1 Originally committed as revision 8706 to svn://svn.ffmpeg.org/ffmpeg/trunk 2007-04-10 12:47:37 +03:00			`}`