From ae4c9ddebc32eaacbd62681d776881e59ca6e6f7 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sun, 12 Jul 2015 06:44:39 -0400 Subject: [PATCH] vf_psnr: sse2 optimizations for sum-squared-error. The internal line accumulator for 16bit can overflow, so I changed that from int to uint64_t in the C code. The matching assembly looks a little weird but output looks correct. (avx2 should be trivial to add later.) Reviewed-by: Paul B Mahol Reviewed-by: James Almer Signed-off-by: Michael Niedermayer --- libavfilter/psnr.h | 33 ++++++++ libavfilter/vf_psnr.c | 76 ++++++++---------- libavfilter/x86/Makefile | 2 + libavfilter/x86/vf_psnr.asm | 139 +++++++++++++++++++++++++++++++++ libavfilter/x86/vf_psnr_init.c | 39 +++++++++ 5 files changed, 247 insertions(+), 42 deletions(-) create mode 100644 libavfilter/psnr.h create mode 100644 libavfilter/x86/vf_psnr.asm create mode 100644 libavfilter/x86/vf_psnr_init.c diff --git a/libavfilter/psnr.h b/libavfilter/psnr.h new file mode 100644 index 0000000000..efe94da23c --- /dev/null +++ b/libavfilter/psnr.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2015 Ronald S. Bultje + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef LIBAVFILTER_PSNR_H +#define LIBAVFILTER_PSNR_H + +#include <stddef.h> +#include <stdint.h> + +typedef struct PSNRDSPContext { + uint64_t (*sse_line)(const uint8_t *buf, const uint8_t *ref, int w); +} PSNRDSPContext; + +void ff_psnr_init_x86(PSNRDSPContext *dsp, int bpp); + +#endif /* LIBAVFILTER_PSNR_H */ diff --git a/libavfilter/vf_psnr.c b/libavfilter/vf_psnr.c index 74afdaa505..9390f7c625 100644 --- a/libavfilter/vf_psnr.c +++ b/libavfilter/vf_psnr.c @@ -33,6 +33,7 @@ #include "drawutils.h" #include "formats.h" #include "internal.h" +#include "psnr.h" #include "video.h" typedef struct PSNRContext { @@ -50,11 +51,7 @@ typedef struct PSNRContext { int planewidth[4]; int planeheight[4]; double planeweight[4]; - - void (*compute_mse)(struct PSNRContext *s, - const uint8_t *m[4], const int ml[4], - const uint8_t *r[4], const int rl[4], - int w, int h, double mse[4]); + PSNRDSPContext dsp; } PSNRContext; #define OFFSET(x) offsetof(PSNRContext, x) @@ -78,13 +75,37 @@ static inline double get_psnr(double mse, uint64_t nb_frames, int max) return 10.0 * log(pow2(max) / (mse / nb_frames)) / log(10.0); } +static uint64_t sse_line_8bit(const uint8_t *main_line, const uint8_t *ref_line, int outw) +{ + int j; + unsigned m2 = 0; + + for (j = 0; j < outw; j++) + m2 += pow2(main_line[j] - ref_line[j]); + + return m2; +} + +static uint64_t sse_line_16bit(const uint8_t *_main_line, const uint8_t *_ref_line, int outw) +{ + int j; + uint64_t m2 = 0; + const uint16_t *main_line = (const uint16_t *) _main_line; + const uint16_t *ref_line = (const uint16_t *) _ref_line; + + for (j = 0; j < outw; j++) + m2 += pow2(main_line[j] - ref_line[j]); + + return m2; +} + static inline void compute_images_mse(PSNRContext *s, const uint8_t *main_data[4], const int main_linesizes[4], const 
uint8_t *ref_data[4], const int ref_linesizes[4], int w, int h, double mse[4]) { - int i, c, j; + int i, c; for (c = 0; c < s->nb_components; c++) { const int outw = s->planewidth[c]; @@ -94,39 +115,8 @@ void compute_images_mse(PSNRContext *s, const int ref_linesize = ref_linesizes[c]; const int main_linesize = main_linesizes[c]; uint64_t m = 0; - for (i = 0; i < outh; i++) { - int m2 = 0; - for (j = 0; j < outw; j++) - m2 += pow2(main_line[j] - ref_line[j]); - m += m2; - ref_line += ref_linesize; - main_line += main_linesize; - } - mse[c] = m / (double)(outw * outh); - } -} - -static inline -void compute_images_mse_16bit(PSNRContext *s, - const uint8_t *main_data[4], const int main_linesizes[4], - const uint8_t *ref_data[4], const int ref_linesizes[4], - int w, int h, double mse[4]) -{ - int i, c, j; - - for (c = 0; c < s->nb_components; c++) { - const int outw = s->planewidth[c]; - const int outh = s->planeheight[c]; - const uint16_t *main_line = (uint16_t *)main_data[c]; - const uint16_t *ref_line = (uint16_t *)ref_data[c]; - const int ref_linesize = ref_linesizes[c] / 2; - const int main_linesize = main_linesizes[c] / 2; - uint64_t m = 0; - - for (i = 0; i < outh; i++) { - for (j = 0; j < outw; j++) - m += pow2(main_line[j] - ref_line[j]); + m += s->dsp.sse_line(main_line, ref_line, outw); ref_line += ref_linesize; main_line += main_linesize; } @@ -155,9 +145,9 @@ static AVFrame *do_psnr(AVFilterContext *ctx, AVFrame *main, int j, c; AVDictionary **metadata = avpriv_frame_get_metadatap(main); - s->compute_mse(s, (const uint8_t **)main->data, main->linesize, - (const uint8_t **)ref->data, ref->linesize, - main->width, main->height, comp_mse); + compute_images_mse(s, (const uint8_t **)main->data, main->linesize, + (const uint8_t **)ref->data, ref->linesize, + main->width, main->height, comp_mse); for (j = 0; j < s->nb_components; j++) mse += comp_mse[j] * s->planeweight[j]; @@ -283,7 +273,9 @@ static int config_input_ref(AVFilterLink *inlink) s->average_max += 
s->max[j] * s->planeweight[j]; } - s->compute_mse = desc->comp[0].depth_minus1 > 7 ? compute_images_mse_16bit : compute_images_mse; + s->dsp.sse_line = desc->comp[0].depth_minus1 > 7 ? sse_line_16bit : sse_line_8bit; + if (ARCH_X86) + ff_psnr_init_x86(&s->dsp, desc->comp[0].depth_minus1 + 1); return 0; } diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 61be8c6f54..230e879899 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -6,6 +6,7 @@ OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace_init.o OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o +OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o @@ -19,6 +20,7 @@ YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o YASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o YASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o YASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o +YASM-OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr.o YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o YASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o diff --git a/libavfilter/x86/vf_psnr.asm b/libavfilter/x86/vf_psnr.asm new file mode 100644 index 0000000000..023ae06efb --- /dev/null +++ b/libavfilter/x86/vf_psnr.asm @@ -0,0 +1,139 @@ +;***************************************************************************** +;* x86-optimized functions for psnr filter +;* +;* Copyright (C) 2015 Ronald S. Bultje +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. 
+;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +%macro SSE_LINE_FN 2 ; 8 or 16, byte or word +INIT_XMM sse2 +%if ARCH_X86_32 +%if %1 == 8 +cglobal sse_line_%1 %+ bit, 0, 6, 8, res, buf, w, px1, px2, ref +%else +cglobal sse_line_%1 %+ bit, 0, 7, 8, res, buf, reshigh, w, px1, px2, ref +%endif + mov bufq, r0mp + mov refq, r1mp + mov wd, r2m +%else +cglobal sse_line_%1 %+ bit, 3, 5, 8, buf, ref, w, px1, px2 +%endif + pxor m6, m6 + pxor m7, m7 + sub wd, mmsize*2 + jl .end + +.loop: + movu m0, [bufq+mmsize*0] + movu m1, [bufq+mmsize*1] + movu m2, [refq+mmsize*0] + movu m3, [refq+mmsize*1] +%if %1 == 8 + add bufq, mmsize*2 + add refq, mmsize*2 + psubusb m4, m0, m2 + psubusb m5, m1, m3 + psubusb m2, m0 + psubusb m3, m1 + por m2, m4 + por m3, m5 + punpcklbw m0, m2, m6 + punpcklbw m1, m3, m6 + punpckhbw m2, m6 + punpckhbw m3, m6 +%else + psubw m0, m2 + psubw m1, m3 + movu m2, [bufq+mmsize*2] + movu m3, [bufq+mmsize*3] + movu m4, [refq+mmsize*2] + movu m5, [refq+mmsize*3] + psubw m2, m4 + psubw m3, m5 + add bufq, mmsize*4 + add refq, mmsize*4 +%endif + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + paddd m0, m1 + paddd m2, m3 +%if %1 == 8 + paddd m7, m0 + paddd m7, m2 +%else + paddd m0, m2 + punpckldq m2, m0, m6 + punpckhdq m0, m6 + paddq m7, m0 + paddq m7, m2 +%endif + sub wd, mmsize*2 + jge .loop + +.end: + add wd, mmsize*2 + movhlps m0, m7 +%if %1 == 8 + paddd m7, m0 + pshufd m0, m7, 1 + paddd m7, 
m0 + movd eax, m7 +%else + paddq m7, m0 +%if ARCH_X86_32 + movd eax, m7 + psrldq m7, 4 + movd edx, m7 +%else + movq rax, m7 +%endif +%endif + + ; deal with cases where w % 32 != 0 + test wd, wd + jz .end_scalar +.loop_scalar: + movzx px1d, %2 [bufq+wq*(%1/8)-(%1/8)] + movzx px2d, %2 [refq+wq*(%1/8)-(%1/8)] + sub px1d, px2d + imul px1d, px1d +%if %1 == 8 + add eax, px1d +%elif ARCH_X86_64 + add rax, px1q +%else + add eax, px1d + adc edx, 0 +%endif + dec wd + jg .loop_scalar + +.end_scalar: + ; for %1=8, no need to zero edx on x86-32, since edx=wd, which is zero + RET +%endmacro + +INIT_XMM sse2 +SSE_LINE_FN 8, byte +SSE_LINE_FN 16, word diff --git a/libavfilter/x86/vf_psnr_init.c b/libavfilter/x86/vf_psnr_init.c new file mode 100644 index 0000000000..c387812204 --- /dev/null +++ b/libavfilter/x86/vf_psnr_init.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2015 Ronald S. Bultje + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86/cpu.h" + +#include "libavfilter/psnr.h" + +uint64_t ff_sse_line_8bit_sse2(const uint8_t *buf, const uint8_t *ref, int w); +uint64_t ff_sse_line_16bit_sse2(const uint8_t *buf, const uint8_t *ref, int w); + +void ff_psnr_init_x86(PSNRDSPContext *dsp, int bpp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE2(cpu_flags)) { + if (bpp <= 8) { + dsp->sse_line = ff_sse_line_8bit_sse2; + } else if (bpp <= 15) { + dsp->sse_line = ff_sse_line_16bit_sse2; + } + } +}