Merge commit '65d5d5865845f057cc6530a8d0f34db952d9009c'

* commit '65d5d5865845f057cc6530a8d0f34db952d9009c': dsputil: Move SVQ1 encoding specific bits into svq1enc Conflicts: libavcodec/x86/Makefile Merged-by: Michael Niedermayer <michaelni@gmx.at>
2024-12-23 12:43:46 +02:00 · 2014-05-30 00:01:45 +02:00 · 2014-05-30 00:01:45 +02:00 · ea0931fb96
commit ea0931fb96
parent cb8763bda7 65d5d58658
10 changed files with 252 additions and 134 deletions
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@ -2216,16 +2216,6 @@ static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
    return score;
 }
 /**
 * Sum of squared differences between an int8_t and an int16_t vector.
 *
 * @param pix1 first operand, at least size elements
 * @param pix2 second operand, at least size elements
 * @param size number of elements to compare; nothing is read when <= 0
 * @return sum over i of (pix1[i] - pix2[i])^2, accumulated in an int
 */
 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                                int size)
 {
    int acc = 0;
    int k   = 0;
    while (k < size) {
        const int delta = pix1[k] - pix2[k];
        acc += delta * delta;
        k++;
    }
    return acc;
 }
 #define WRAPPER8_16_SQ(name8, name16)                                   \
 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
                  int stride, int h)                                    \
@ -2626,8 +2616,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
    ff_dsputil_init_dwt(c);
 #endif
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
    c->bswap_buf   = bswap_buf;
    c->bswap16_buf = bswap16_buf;
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@ -177,9 +177,6 @@ typedef struct DSPContext {
    me_cmp_func ildct_cmp[6]; // only width 16 used
    me_cmp_func frame_skip_cmp[6]; // only width 8 used
    int (*ssd_int8_vs_int16)(const int8_t *pix1, const int16_t *pix2,
                             int size);
    qpel_mc_func put_qpel_pixels_tab[2][16];
    qpel_mc_func avg_qpel_pixels_tab[2][16];
    qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@ -12,6 +12,7 @@ OBJS-$(CONFIG_MPEGVIDEO)               += ppc/mpegvideo_altivec.o
 OBJS-$(CONFIG_VIDEODSP)                += ppc/videodsp_ppc.o
 OBJS-$(CONFIG_VP3DSP)                  += ppc/vp3dsp_altivec.o
 OBJS-$(CONFIG_SVQ1_ENCODER)            += ppc/svq1enc_altivec.o
 OBJS-$(CONFIG_VC1_DECODER)             += ppc/vc1dsp_altivec.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += ppc/vorbisdsp_altivec.o
 OBJS-$(CONFIG_VP7_DECODER)             += ppc/vp8dsp_altivec.o
--- a/libavcodec/ppc/int_altivec.c
+++ b/libavcodec/ppc/int_altivec.c
@ -34,48 +34,6 @@
 #include "libavcodec/dsputil.h"
 #include "dsputil_altivec.h"
 /**
 * AltiVec version of the sum of squared differences between an int8_t
 * and an int16_t vector.
 *
 * The main loop consumes 16 elements per iteration (size >> 4 iterations);
 * the remaining size % 16 elements are handled by a scalar tail loop.
 *
 * @param pix1 int8_t operand, size elements
 * @param pix2 int16_t operand, size elements
 * @param size number of elements; NOTE(review): presumably always >= 0,
 *             a negative value would not terminate the vector loop as
 *             intended -- confirm against callers
 * @return sum over i of (pix1[i] - pix2[i])^2
 */
 static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                      int size)
 {
    int i, size16 = size >> 4;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l, vpix1h;
    // the union gives scalar access to the lanes of the vector accumulator
    union {
        vector signed int vscore;
        int32_t score[4];
    } u = { .vscore = vec_splat_s32(0) };
 // XXX lazy way, fix it later
    while (size16) {
        // score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
        // load pix1 and the first batch of pix2
        // (vec_unaligned_load is a helper defined outside this view;
        //  presumably a 16-byte load from a possibly unaligned address)
        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
        // unpack
        // widen the high eight chars to shorts and subtract the first batch
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        // widen the low eight chars for the second batch
        vpix1l = vec_unpackl(vpix1);
        // load another batch from pix2
        vpix2    = vec_unaligned_load(pix2);
        // multiply-sum: squares of the differences are added to the accumulator
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff    = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        pix1    += 16;
        pix2    += 8;
        size16--;
    }
    // fold the four partial sums; the total is read back from lane 3 below
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));
    // scalar tail for the elements that did not fill a whole 16-element step
    size %= 16;
    for (i = 0; i < size; i++)
        u.score[3] += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
    return u.score[3];
 }
 static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
 {
@ -140,8 +98,6 @@ static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
 av_cold void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx)
 {
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
    c->scalarproduct_int16 = scalarproduct_int16_altivec;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
--- a/libavcodec/ppc/svq1enc_altivec.c
+++ b/libavcodec/ppc/svq1enc_altivec.c
@ -0,0 +1,80 @@
 /*
 * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include <stdint.h>
 #include "config.h"
 #if HAVE_ALTIVEC_H
 #include <altivec.h>
 #endif
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/ppc/types_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 #include "libavcodec/svq1enc.h"
 #if HAVE_ALTIVEC
 /**
 * AltiVec version of the sum of squared differences between an int8_t
 * and an int16_t vector.
 *
 * The main loop consumes 16 elements per iteration (size >> 4 iterations);
 * the remaining size % 16 elements are handled by a scalar tail loop.
 *
 * @param pix1 int8_t operand, size elements
 * @param pix2 int16_t operand, size elements
 * @param size number of elements; NOTE(review): presumably always >= 0,
 *             a negative value would not terminate the vector loop as
 *             intended -- confirm against callers
 * @return sum over i of (pix1[i] - pix2[i])^2
 */
 static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                      int size)
 {
    int i, size16 = size >> 4;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l, vpix1h;
    // the union gives scalar access to the lanes of the vector accumulator
    union {
        vector signed int vscore;
        int32_t score[4];
    } u = { .vscore = vec_splat_s32(0) };
    while (size16) {
        // score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
        // load pix1 and the first batch of pix2
        // (vec_unaligned_load is a helper defined outside this view;
        //  presumably a 16-byte load from a possibly unaligned address)
        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
        // unpack
        // widen the high eight chars to shorts and subtract the first batch
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        // widen the low eight chars for the second batch
        vpix1l = vec_unpackl(vpix1);
        // load another batch from pix2
        vpix2    = vec_unaligned_load(pix2);
        // multiply-sum: squares of the differences are added to the accumulator
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff    = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        pix1    += 16;
        pix2    += 8;
        size16--;
    }
    // fold the four partial sums; the total is read back from lane 3 below
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));
    // scalar tail for the elements that did not fill a whole 16-element step
    size %= 16;
    for (i = 0; i < size; i++)
        u.score[3] += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
    return u.score[3];
 }
 #endif /* HAVE_ALTIVEC */
 /**
 * Install PowerPC-optimized routines into an SVQ1 encoder context.
 *
 * The AltiVec implementation is only selected when it was compiled in
 * (HAVE_ALTIVEC) and the running CPU reports AltiVec support; otherwise
 * the function pointers already present in the context are left untouched.
 *
 * @param c encoder context whose ssd_int8_vs_int16 pointer may be replaced
 */
 av_cold void ff_svq1enc_init_ppc(SVQ1EncContext *c)
 {
 #if HAVE_ALTIVEC
    /* Do not hand out AltiVec code on a CPU without the vector unit. */
    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
        return;
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
 #endif /* HAVE_ALTIVEC */
 }
--- a/libavcodec/svq1enc.c
+++ b/libavcodec/svq1enc.c
@ -34,48 +34,11 @@
 #include "internal.h"
 #include "mpegutils.h"
 #include "svq1.h"
 #include "svq1enc.h"
 #include "svq1enc_cb.h"
 #include "libavutil/avassert.h"
 /**
 * Private state of the SVQ1 encoder.
 */
 typedef struct SVQ1EncContext {
    /* FIXME: Needed for motion estimation, should not be used for anything
     * else, the idea is to make the motion estimation eventually independent
     * of MpegEncContext, so this will be removed then. */
    MpegEncContext m;
    AVCodecContext *avctx;
    DSPContext dsp;
    HpelDSPContext hdsp;
    AVFrame *current_picture;
    AVFrame *last_picture;
    PutBitContext pb;
    GetBitContext gb;
    /* why ooh why this sick breadth first order,
     * everything is slower and more complex */
    PutBitContext reorder_pb[6];
    int frame_width;
    int frame_height;
    /* Y plane block dimensions */
    int y_block_width;
    int y_block_height;
    /* U & V plane (C planes) block dimensions */
    int c_block_width;
    int c_block_height;
    uint16_t *mb_type;
    /* zero-initialized at encoder init with room for
     * (y_block_width + 1) * y_block_height 32-bit entries */
    uint32_t *dummy;
    int16_t (*motion_val8[3])[2];
    int16_t (*motion_val16[3])[2];
    int64_t rd_total;
    uint8_t *scratchbuf;
 } SVQ1EncContext;
 static void svq1_write_header(SVQ1EncContext *s, int frame_type)
 {
    int i;
@ -113,6 +76,16 @@ static void svq1_write_header(SVQ1EncContext *s, int frame_type)
 #define QUALITY_THRESHOLD    100
 #define THRESHOLD_MULTIPLIER 0.6
 /**
 * Sum of squared differences between an int8_t and an int16_t vector.
 *
 * @param pix1 first operand, at least size elements
 * @param pix2 second operand, at least size elements
 * @param size number of elements to compare; nothing is read when <= 0
 * @return sum over i of (pix1[i] - pix2[i])^2, accumulated in an int
 */
 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                                int size)
 {
    int acc = 0;
    int k   = 0;
    while (k < size) {
        const int delta = pix1[k] - pix2[k];
        acc += delta * delta;
        k++;
    }
    return acc;
 }
 static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
                        uint8_t *decoded, int stride, int level,
                        int threshold, int lambda, int intra)
@ -174,7 +147,7 @@ static int encode_block(SVQ1EncContext *s, uint8_t *src, uint8_t *ref,
                int sqr, diff, score;
                vector = codebook + stage * size * 16 + i * size;
-                sqr    = s->dsp.ssd_int8_vs_int16(vector, block[stage], size);
+                sqr    = s->ssd_int8_vs_int16(vector, block[stage], size);
                diff   = block_sum[stage] - sum;
                score  = sqr - (diff * (int64_t)diff >> (level + 3)); // FIXME: 64bit slooow
                if (score < best_vector_score) {
@ -580,6 +553,13 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx)
                                        s->y_block_height * sizeof(int16_t));
    s->dummy               = av_mallocz((s->y_block_width + 1) *
                                        s->y_block_height * sizeof(int32_t));
    s->ssd_int8_vs_int16   = ssd_int8_vs_int16_c;
    if (ARCH_PPC)
        ff_svq1enc_init_ppc(s);
    if (ARCH_X86)
        ff_svq1enc_init_x86(s);
    ff_h263_encode_init(&s->m); // mv_penalty
    return 0;
--- a/libavcodec/svq1enc.h
+++ b/libavcodec/svq1enc.h
@ -0,0 +1,78 @@
 /*
 * SVQ1 encoder
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #ifndef AVCODEC_SVQ1ENC_H
 #define AVCODEC_SVQ1ENC_H
 #include <stdint.h>
 #include "libavutil/frame.h"
 #include "avcodec.h"
 #include "dsputil.h"
 #include "get_bits.h"
 #include "hpeldsp.h"
 #include "mpegvideo.h"
 #include "put_bits.h"
 /**
 * Private state of the SVQ1 encoder, shared between the generic encoder
 * and the architecture-specific init functions declared below.
 */
 typedef struct SVQ1EncContext {
    /* FIXME: Needed for motion estimation, should not be used for anything
     * else, the idea is to make the motion estimation eventually independent
     * of MpegEncContext, so this will be removed then. */
    MpegEncContext m;
    AVCodecContext *avctx;
    DSPContext dsp;
    HpelDSPContext hdsp;
    AVFrame *current_picture;
    AVFrame *last_picture;
    PutBitContext pb;
    GetBitContext gb;
    /* why ooh why this sick breadth first order,
     * everything is slower and more complex */
    PutBitContext reorder_pb[6];
    int frame_width;
    int frame_height;
    /* Y plane block dimensions */
    int y_block_width;
    int y_block_height;
    /* U & V plane (C planes) block dimensions */
    int c_block_width;
    int c_block_height;
    uint16_t *mb_type;
    /* zero-initialized at encoder init with room for
     * (y_block_width + 1) * y_block_height 32-bit entries */
    uint32_t *dummy;
    int16_t (*motion_val8[3])[2];
    int16_t (*motion_val16[3])[2];
    int64_t rd_total;
    uint8_t *scratchbuf;
    /* Sum of squared differences between an int8_t vector (pix1) and an
     * int16_t vector (pix2) of size elements. Set to the C implementation
     * at encoder init and possibly replaced by ff_svq1enc_init_ppc() or
     * ff_svq1enc_init_x86(). */
    int (*ssd_int8_vs_int16)(const int8_t *pix1, const int16_t *pix2,
                             int size);
 } SVQ1EncContext;
 void ff_svq1enc_init_ppc(SVQ1EncContext *c);
 void ff_svq1enc_init_x86(SVQ1EncContext *c);
 #endif /* AVCODEC_SVQ1ENC_H */
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@ -59,6 +59,7 @@ MMX-OBJS-$(CONFIG_HUFFYUVDSP)          += x86/huffyuvdsp_mmx.o
 MMX-OBJS-$(CONFIG_SNOW_DECODER)        += x86/snowdsp.o
 MMX-OBJS-$(CONFIG_SNOW_ENCODER)        += x86/snowdsp.o
 MMX-OBJS-$(CONFIG_SVQ1_ENCODER)        += x86/svq1enc_mmx.o
 MMX-OBJS-$(CONFIG_VC1_DECODER)         += x86/vc1dsp_mmx.o
 YASM-OBJS                              += x86/deinterlace.o             \
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@ -703,40 +703,6 @@ static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 #undef SUM
 /**
 * MMX version of the sum of squared differences between an int8_t and an
 * int16_t vector.
 *
 * The vectors are walked backwards, 8 elements per iteration, starting at
 * index size - 8. The loads happen before the loop condition is tested, so
 * size has to be a positive multiple of 8; any other value makes an
 * iteration read in front of pix1/pix2.
 *
 * NOTE(review): no emms is issued and no MMX registers are listed as
 * clobbered; presumably callers deal with the MMX/FPU state -- confirm.
 *
 * @param pix1 int8_t operand, size elements
 * @param pix2 int16_t operand, size elements
 * @param size number of elements, positive multiple of 8
 * @return sum over i of (pix1[i] - pix2[i])^2
 */
 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
                                  int size)
 {
    int sum;
    x86_reg i = size;
    __asm__ volatile (
        "pxor %%mm4, %%mm4 \n"            // mm4 = accumulator (two 32-bit sums)
        "1: \n"
        "sub $8, %0 \n"                   // i -= 8; its flags drive the jg below
        "movq (%2, %0), %%mm2 \n"         // mm2 = pix1[i .. i+7]
        "movq (%3, %0, 2), %%mm0 \n"      // mm0 = pix2[i .. i+3]
        "movq 8(%3, %0, 2), %%mm1 \n"     // mm1 = pix2[i+4 .. i+7]
        "punpckhbw %%mm2, %%mm3 \n"       // pix1[i+4..i+7] into the high byte of each word
        "punpcklbw %%mm2, %%mm2 \n"       // pix1[i..i+3] into the high byte of each word
        "psraw $8, %%mm3 \n"              // arithmetic shift: sign-extend to 16 bit,
        "psraw $8, %%mm2 \n"              // dropping whatever was in the low bytes
        "psubw %%mm3, %%mm1 \n"           // differences, upper four elements
        "psubw %%mm2, %%mm0 \n"           // differences, lower four elements
        "pmaddwd %%mm1, %%mm1 \n"         // square and add adjacent pairs
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"                        // loop while i > 0 (flags from the sub)
        "movq %%mm4, %%mm3 \n"            // horizontal add of the two 32-bit halves
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"               // low 32 bits -> sum
        : "+r" (i), "=r" (sum)
        : "r" (pix1), "r" (pix2));
    return sum;
 }
 #define PHADDD(a, t)                            \
    "movq  " #a ", " #t "               \n\t"   \
    "psrlq    $32, " #a "               \n\t"   \
@ -854,8 +820,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
            c->try_8x8basis = try_8x8basis_mmx;
        }
        c->add_8x8basis = add_8x8basis_mmx;
        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
    }
    if (INLINE_AMD3DNOW(cpu_flags)) {
--- a/libavcodec/x86/svq1enc_mmx.c
+++ b/libavcodec/x86/svq1enc_mmx.c
@ -0,0 +1,73 @@
 /*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/svq1enc.h"
 #if HAVE_INLINE_ASM
 /**
 * MMX version of the sum of squared differences between an int8_t and an
 * int16_t vector.
 *
 * The vectors are walked backwards, 8 elements per iteration, starting at
 * index size - 8. The loads happen before the loop condition is tested, so
 * size has to be a positive multiple of 8; any other value makes an
 * iteration read in front of pix1/pix2.
 *
 * NOTE(review): no emms is issued and no MMX registers are listed as
 * clobbered; presumably callers deal with the MMX/FPU state -- confirm.
 *
 * @param pix1 int8_t operand, size elements
 * @param pix2 int16_t operand, size elements
 * @param size number of elements, positive multiple of 8
 * @return sum over i of (pix1[i] - pix2[i])^2
 */
 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
                                  int size)
 {
    int sum;
    x86_reg i = size;
    __asm__ volatile (
        "pxor %%mm4, %%mm4 \n"            // mm4 = accumulator (two 32-bit sums)
        "1: \n"
        "sub $8, %0 \n"                   // i -= 8; its flags drive the jg below
        "movq (%2, %0), %%mm2 \n"         // mm2 = pix1[i .. i+7]
        "movq (%3, %0, 2), %%mm0 \n"      // mm0 = pix2[i .. i+3]
        "movq 8(%3, %0, 2), %%mm1 \n"     // mm1 = pix2[i+4 .. i+7]
        "punpckhbw %%mm2, %%mm3 \n"       // pix1[i+4..i+7] into the high byte of each word
        "punpcklbw %%mm2, %%mm2 \n"       // pix1[i..i+3] into the high byte of each word
        "psraw $8, %%mm3 \n"              // arithmetic shift: sign-extend to 16 bit,
        "psraw $8, %%mm2 \n"              // dropping whatever was in the low bytes
        "psubw %%mm3, %%mm1 \n"           // differences, upper four elements
        "psubw %%mm2, %%mm0 \n"           // differences, lower four elements
        "pmaddwd %%mm1, %%mm1 \n"         // square and add adjacent pairs
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"                        // loop while i > 0 (flags from the sub)
        "movq %%mm4, %%mm3 \n"            // horizontal add of the two 32-bit halves
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"               // low 32 bits -> sum
        : "+r" (i), "=r" (sum)
        : "r" (pix1), "r" (pix2));
    return sum;
 }
 #endif /* HAVE_INLINE_ASM */
 /**
 * Install x86-optimized routines into an SVQ1 encoder context.
 *
 * When inline assembly is available and the CPU flags pass the INLINE_MMX
 * check, ssd_int8_vs_int16 is pointed at the MMX implementation; in every
 * other case the context is left as it was.
 *
 * @param c encoder context whose ssd_int8_vs_int16 pointer may be replaced
 */
 av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
 {
 #if HAVE_INLINE_ASM
    const int flags = av_get_cpu_flags();
    /* nothing to install without MMX */
    if (!INLINE_MMX(flags))
        return;
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
 #endif /* HAVE_INLINE_ASM */
 }