From ae4c9ddebc32eaacbd62681d776881e59ca6e6f7 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Sun, 12 Jul 2015 06:44:39 -0400
Subject: [PATCH] vf_psnr: sse2 optimizations for sum-squared-error.

The internal line accumulator for 16bit can overflow, so I changed that
from int to uint64_t in the C code. The matching assembly looks a little
weird but output looks correct.

(avx2 should be trivial to add later.)

Reviewed-by: Paul B Mahol <onemda@gmail.com>
Reviewed-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
---
 libavfilter/psnr.h             |  33 ++++++++
 libavfilter/vf_psnr.c          |  76 ++++++++----------
 libavfilter/x86/Makefile       |   2 +
 libavfilter/x86/vf_psnr.asm    | 139 +++++++++++++++++++++++++++++++++
 libavfilter/x86/vf_psnr_init.c |  39 +++++++++
 5 files changed, 247 insertions(+), 42 deletions(-)
 create mode 100644 libavfilter/psnr.h
 create mode 100644 libavfilter/x86/vf_psnr.asm
 create mode 100644 libavfilter/x86/vf_psnr_init.c

diff --git a/libavfilter/psnr.h b/libavfilter/psnr.h
new file mode 100644
index 0000000000..efe94da23c
--- /dev/null
+++ b/libavfilter/psnr.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef LIBAVFILTER_PSNR_H
+#define LIBAVFILTER_PSNR_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct PSNRDSPContext {
+    uint64_t (*sse_line)(const uint8_t *buf, const uint8_t *ref, int w); /* sum of squared differences between buf and ref over one line of w samples */
+} PSNRDSPContext;
+
+void ff_psnr_init_x86(PSNRDSPContext *dsp, int bpp);
+
+#endif /* LIBAVFILTER_PSNR_H */
diff --git a/libavfilter/vf_psnr.c b/libavfilter/vf_psnr.c
index 74afdaa505..9390f7c625 100644
--- a/libavfilter/vf_psnr.c
+++ b/libavfilter/vf_psnr.c
@@ -33,6 +33,7 @@
 #include "drawutils.h"
 #include "formats.h"
 #include "internal.h"
+#include "psnr.h"
 #include "video.h"
 
 typedef struct PSNRContext {
@@ -50,11 +51,7 @@ typedef struct PSNRContext {
     int planewidth[4];
     int planeheight[4];
     double planeweight[4];
-
-    void (*compute_mse)(struct PSNRContext *s,
-                        const uint8_t *m[4], const int ml[4],
-                        const uint8_t *r[4], const int rl[4],
-                        int w, int h, double mse[4]);
+    PSNRDSPContext dsp;
 } PSNRContext;
 
 #define OFFSET(x) offsetof(PSNRContext, x)
@@ -78,13 +75,37 @@ static inline double get_psnr(double mse, uint64_t nb_frames, int max)
     return 10.0 * log(pow2(max) / (mse / nb_frames)) / log(10.0);
 }
 
+static uint64_t sse_line_8bit(const uint8_t *main_line,  const uint8_t *ref_line, int outw)
+{ /* portable C version: sum of squared differences over outw 8-bit samples */
+    int j;
+    unsigned m2 = 0; /* NOTE(review): line sum is kept in unsigned here, not uint64_t - presumably wide enough for 8-bit lines; confirm */
+
+    for (j = 0; j < outw; j++)
+        m2 += pow2(main_line[j] - ref_line[j]);
+
+    return m2; /* widened to uint64_t by the return type */
+}
+
+static uint64_t sse_line_16bit(const uint8_t *_main_line, const uint8_t *_ref_line, int outw)
+{ /* portable C version: sum of squared differences over outw 16-bit samples */
+    int j;
+    uint64_t m2 = 0; /* 64-bit line sum: one squared 16-bit difference can already need close to 32 bits */
+    const uint16_t *main_line = (const uint16_t *) _main_line; /* the byte pointers are read as 16-bit samples */
+    const uint16_t *ref_line = (const uint16_t *) _ref_line;
+
+    for (j = 0; j < outw; j++)
+        m2 += pow2(main_line[j] - ref_line[j]);
+
+    return m2;
+}
+
 static inline
 void compute_images_mse(PSNRContext *s,
                         const uint8_t *main_data[4], const int main_linesizes[4],
                         const uint8_t *ref_data[4], const int ref_linesizes[4],
                         int w, int h, double mse[4])
 {
-    int i, c, j;
+    int i, c;
 
     for (c = 0; c < s->nb_components; c++) {
         const int outw = s->planewidth[c];
@@ -94,39 +115,8 @@ void compute_images_mse(PSNRContext *s,
         const int ref_linesize = ref_linesizes[c];
         const int main_linesize = main_linesizes[c];
         uint64_t m = 0;
-
         for (i = 0; i < outh; i++) {
-            int m2 = 0;
-            for (j = 0; j < outw; j++)
-                m2 += pow2(main_line[j] - ref_line[j]);
-            m += m2;
-            ref_line += ref_linesize;
-            main_line += main_linesize;
-        }
-        mse[c] = m / (double)(outw * outh);
-    }
-}
-
-static inline
-void compute_images_mse_16bit(PSNRContext *s,
-                        const uint8_t *main_data[4], const int main_linesizes[4],
-                        const uint8_t *ref_data[4], const int ref_linesizes[4],
-                        int w, int h, double mse[4])
-{
-    int i, c, j;
-
-    for (c = 0; c < s->nb_components; c++) {
-        const int outw = s->planewidth[c];
-        const int outh = s->planeheight[c];
-        const uint16_t *main_line = (uint16_t *)main_data[c];
-        const uint16_t *ref_line = (uint16_t *)ref_data[c];
-        const int ref_linesize = ref_linesizes[c] / 2;
-        const int main_linesize = main_linesizes[c] / 2;
-        uint64_t m = 0;
-
-        for (i = 0; i < outh; i++) {
-            for (j = 0; j < outw; j++)
-                m += pow2(main_line[j] - ref_line[j]);
+            m += s->dsp.sse_line(main_line, ref_line, outw);
             ref_line += ref_linesize;
             main_line += main_linesize;
         }
@@ -155,9 +145,9 @@ static AVFrame *do_psnr(AVFilterContext *ctx, AVFrame *main,
     int j, c;
     AVDictionary **metadata = avpriv_frame_get_metadatap(main);
 
-    s->compute_mse(s, (const uint8_t **)main->data, main->linesize,
-                      (const uint8_t **)ref->data, ref->linesize,
-                       main->width, main->height, comp_mse);
+    compute_images_mse(s, (const uint8_t **)main->data, main->linesize,
+                          (const uint8_t **)ref->data, ref->linesize,
+                          main->width, main->height, comp_mse);
 
     for (j = 0; j < s->nb_components; j++)
         mse += comp_mse[j] * s->planeweight[j];
@@ -283,7 +273,9 @@ static int config_input_ref(AVFilterLink *inlink)
         s->average_max += s->max[j] * s->planeweight[j];
     }
 
-    s->compute_mse = desc->comp[0].depth_minus1 > 7 ? compute_images_mse_16bit : compute_images_mse;
+    s->dsp.sse_line = desc->comp[0].depth_minus1 > 7 ? sse_line_16bit : sse_line_8bit;
+    if (ARCH_X86)
+        ff_psnr_init_x86(&s->dsp, desc->comp[0].depth_minus1 + 1);
 
     return 0;
 }
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 61be8c6f54..230e879899 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -6,6 +6,7 @@ OBJS-$(CONFIG_IDET_FILTER)                   += x86/vf_idet_init.o
 OBJS-$(CONFIG_INTERLACE_FILTER)              += x86/vf_interlace_init.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += x86/vf_noise.o
 OBJS-$(CONFIG_PP7_FILTER)                    += x86/vf_pp7_init.o
+OBJS-$(CONFIG_PSNR_FILTER)                   += x86/vf_psnr_init.o
 OBJS-$(CONFIG_PULLUP_FILTER)                 += x86/vf_pullup_init.o
 OBJS-$(CONFIG_SPP_FILTER)                    += x86/vf_spp.o
 OBJS-$(CONFIG_SSIM_FILTER)                   += x86/vf_ssim_init.o
@@ -19,6 +20,7 @@ YASM-OBJS-$(CONFIG_HQDN3D_FILTER)            += x86/vf_hqdn3d.o
 YASM-OBJS-$(CONFIG_IDET_FILTER)              += x86/vf_idet.o
 YASM-OBJS-$(CONFIG_INTERLACE_FILTER)         += x86/vf_interlace.o
 YASM-OBJS-$(CONFIG_PP7_FILTER)               += x86/vf_pp7.o
+YASM-OBJS-$(CONFIG_PSNR_FILTER)              += x86/vf_psnr.o
 YASM-OBJS-$(CONFIG_PULLUP_FILTER)            += x86/vf_pullup.o
 YASM-OBJS-$(CONFIG_SSIM_FILTER)              += x86/vf_ssim.o
 YASM-OBJS-$(CONFIG_TINTERLACE_FILTER)        += x86/vf_interlace.o
diff --git a/libavfilter/x86/vf_psnr.asm b/libavfilter/x86/vf_psnr.asm
new file mode 100644
index 0000000000..023ae06efb
--- /dev/null
+++ b/libavfilter/x86/vf_psnr.asm
@@ -0,0 +1,139 @@
+;*****************************************************************************
+;* x86-optimized functions for psnr filter
+;*
+;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro SSE_LINE_FN 2 ; %1 = sample depth (8 or 16), %2 = matching operand size (byte or word)
+INIT_XMM sse2
+%if ARCH_X86_32
+%if %1 == 8
+cglobal sse_line_%1 %+ bit, 0, 6, 8, res, buf, w, px1, px2, ref
+%else
+cglobal sse_line_%1 %+ bit, 0, 7, 8, res, buf, reshigh, w, px1, px2, ref
+%endif
+    mov       bufq, r0mp ; x86-32: the three arguments are fetched explicitly
+    mov       refq, r1mp
+    mov         wd, r2m
+%else
+cglobal sse_line_%1 %+ bit, 3, 5, 8, buf, ref, w, px1, px2
+%endif
+    pxor        m6, m6 ; m6 = 0, supplies the zero half for the punpck* widening below
+    pxor        m7, m7 ; m7 = running sum of squared differences
+    sub         wd, mmsize*2 ; one vector iteration handles mmsize*2 samples
+    jl .end ; line shorter than that: only the scalar tail runs
+
+.loop:
+    movu        m0, [bufq+mmsize*0]
+    movu        m1, [bufq+mmsize*1]
+    movu        m2, [refq+mmsize*0]
+    movu        m3, [refq+mmsize*1]
+%if %1 == 8
+    add       bufq, mmsize*2
+    add       refq, mmsize*2
+    psubusb     m4, m0, m2 ; saturating buf - ref (0 where ref > buf)
+    psubusb     m5, m1, m3
+    psubusb     m2, m0 ; saturating ref - buf (0 where buf > ref)
+    psubusb     m3, m1
+    por         m2, m4 ; one of the two is zero, so the OR is |buf - ref| per byte
+    por         m3, m5
+    punpcklbw   m0, m2, m6 ; widen the absolute differences from bytes to words
+    punpcklbw   m1, m3, m6
+    punpckhbw   m2, m6
+    punpckhbw   m3, m6
+%else
+    psubw       m0, m2 ; word-wise buf - ref
+    psubw       m1, m3
+    movu        m2, [bufq+mmsize*2]
+    movu        m3, [bufq+mmsize*3]
+    movu        m4, [refq+mmsize*2]
+    movu        m5, [refq+mmsize*3]
+    psubw       m2, m4
+    psubw       m3, m5
+    add       bufq, mmsize*4
+    add       refq, mmsize*4
+%endif
+    pmaddwd     m0, m0 ; square each word; adjacent products are summed into one dword
+    pmaddwd     m1, m1
+    pmaddwd     m2, m2
+    pmaddwd     m3, m3
+    paddd       m0, m1
+    paddd       m2, m3
+%if %1 == 8
+    paddd       m7, m0 ; 8-bit: the running sum stays in 32-bit lanes
+    paddd       m7, m2
+%else
+    paddd       m0, m2 ; NOTE(review): dword adds happen before widening - check headroom for the largest sample depth routed here
+    punpckldq   m2, m0, m6 ; 16-bit: widen dwords to qwords so the running sum is 64-bit
+    punpckhdq   m0, m6
+    paddq       m7, m0
+    paddq       m7, m2
+%endif
+    sub         wd, mmsize*2
+    jge .loop
+
+.end:
+    add         wd, mmsize*2 ; undo the last subtraction: wd = samples left for the scalar tail
+    movhlps     m0, m7 ; fold the high half of the accumulator onto the low half
+%if %1 == 8
+    paddd       m7, m0
+    pshufd      m0, m7, 1 ; bring dword 1 down to dword 0 for the final add
+    paddd       m7, m0
+    movd       eax, m7 ; vector part of the result
+%else
+    paddq       m7, m0
+%if ARCH_X86_32
+    movd       eax, m7 ; x86-32: low 32 bits of the 64-bit sum in eax ...
+    psrldq      m7, 4
+    movd       edx, m7 ; ... and the next 32 bits in edx
+%else
+    movq       rax, m7
+%endif
+%endif
+
+    ; deal with cases where w % 32 != 0
+    test        wd, wd
+    jz .end_scalar
+.loop_scalar:
+    movzx     px1d, %2 [bufq+wq*(%1/8)-(%1/8)] ; sample w-1 of the leftover part, so the tail is walked backwards
+    movzx     px2d, %2 [refq+wq*(%1/8)-(%1/8)]
+    sub       px1d, px2d
+    imul      px1d, px1d ; squared difference of this sample
+%if %1 == 8
+    add        eax, px1d
+%elif ARCH_X86_64
+    add        rax, px1q
+%else
+    add        eax, px1d
+    adc        edx, 0 ; carry into the high half of the edx:eax sum
+%endif
+    dec         wd
+    jg .loop_scalar
+
+.end_scalar:
+    ; for %1=8, no need to zero edx on x86-32, since edx=wd, which is zero
+    RET
+%endmacro
+
+INIT_XMM sse2
+SSE_LINE_FN  8, byte
+SSE_LINE_FN 16, word
diff --git a/libavfilter/x86/vf_psnr_init.c b/libavfilter/x86/vf_psnr_init.c
new file mode 100644
index 0000000000..c387812204
--- /dev/null
+++ b/libavfilter/x86/vf_psnr_init.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/cpu.h"
+
+#include "libavfilter/psnr.h"
+
+uint64_t ff_sse_line_8bit_sse2(const uint8_t *buf, const uint8_t *ref, int w);
+uint64_t ff_sse_line_16bit_sse2(const uint8_t *buf, const uint8_t *ref, int w);
+
+void ff_psnr_init_x86(PSNRDSPContext *dsp, int bpp)
+{ /* overrides dsp->sse_line with an SSE2 version when SSE2 is available and bpp is covered */
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        if (bpp <= 8) {
+            dsp->sse_line = ff_sse_line_8bit_sse2;
+        } else if (bpp <= 15) { /* bpp of 16 keeps the function the caller already set; NOTE(review): presumably because the asm squares word differences with signed pmaddwd - confirm */
+            dsp->sse_line = ff_sse_line_16bit_sse2;
+        }
+    }
+}