mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-26 19:01:44 +02:00
vf_psnr: sse2 optimizations for sum-squared-error.
The internal line accumulator for 16bit can overflow, so I changed that from int to uint64_t in the C code. The matching assembly looks a little weird but output looks correct. (avx2 should be trivial to add later.) Reviewed-by: Paul B Mahol <onemda@gmail.com> Reviewed-by: James Almer <jamrial@gmail.com> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
parent
fcbea93cf8
commit
ae4c9ddebc
33
libavfilter/psnr.h
Normal file
33
libavfilter/psnr.h
Normal file
@ -0,0 +1,33 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef LIBAVFILTER_PSNR_H
|
||||
#define LIBAVFILTER_PSNR_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/**
 * Function table holding the per-line kernel used by the psnr filter.
 * The filter fills it with a C implementation first and then lets the
 * architecture-specific init function override entries.
 */
typedef struct PSNRDSPContext {
|
||||
/**
 * Sum of squared differences between one line of buf and one line of ref.
 * Both lines are passed as byte pointers; w is the number of samples in
 * the line (the filter passes the plane width). The result is returned
 * as a 64-bit sum.
 */
uint64_t (*sse_line)(const uint8_t *buf, const uint8_t *ref, int w);
|
||||
} PSNRDSPContext;
|
||||
|
||||
/**
 * Install x86-optimized kernels into dsp where the running CPU supports them.
 * bpp is the sample bit depth (the filter passes depth_minus1 + 1).
 * Entries are left untouched when no suitable optimized version exists.
 */
void ff_psnr_init_x86(PSNRDSPContext *dsp, int bpp);
|
||||
|
||||
#endif /* LIBAVFILTER_PSNR_H */
|
@ -33,6 +33,7 @@
|
||||
#include "drawutils.h"
|
||||
#include "formats.h"
|
||||
#include "internal.h"
|
||||
#include "psnr.h"
|
||||
#include "video.h"
|
||||
|
||||
typedef struct PSNRContext {
|
||||
@ -50,11 +51,7 @@ typedef struct PSNRContext {
|
||||
int planewidth[4];
|
||||
int planeheight[4];
|
||||
double planeweight[4];
|
||||
|
||||
void (*compute_mse)(struct PSNRContext *s,
|
||||
const uint8_t *m[4], const int ml[4],
|
||||
const uint8_t *r[4], const int rl[4],
|
||||
int w, int h, double mse[4]);
|
||||
PSNRDSPContext dsp;
|
||||
} PSNRContext;
|
||||
|
||||
#define OFFSET(x) offsetof(PSNRContext, x)
|
||||
@ -78,13 +75,37 @@ static inline double get_psnr(double mse, uint64_t nb_frames, int max)
|
||||
return 10.0 * log(pow2(max) / (mse / nb_frames)) / log(10.0);
|
||||
}
|
||||
|
||||
/**
 * Sum of squared errors between two lines of 8-bit samples.
 *
 * @param main_line line from the main input, one byte per sample
 * @param ref_line  line from the reference input, one byte per sample
 * @param outw      number of samples to compare; nothing is read when <= 0
 * @return the accumulated pow2(main_line[j] - ref_line[j]) for j in [0, outw)
 *         (pow2() is defined elsewhere in this file and is expected to
 *         square its argument)
 */
static uint64_t sse_line_8bit(const uint8_t *main_line, const uint8_t *ref_line, int outw)
{
    int j;
    /* Accumulate in 64 bits, like the 16-bit variant and the return type:
     * a single term can reach 255 * 255 = 65025, so a 32-bit sum would wrap
     * silently once a line grows past roughly 66000 samples. */
    uint64_t m2 = 0;

    for (j = 0; j < outw; j++)
        m2 += pow2(main_line[j] - ref_line[j]);

    return m2;
}
|
||||
|
||||
/**
 * Sum of squared errors between two lines of 16-bit samples.
 *
 * The lines arrive as byte pointers (to share a signature with the 8-bit
 * variant) and are reinterpreted as arrays of uint16_t.
 *
 * @param _main_line line from the main input
 * @param _ref_line  line from the reference input
 * @param outw       number of 16-bit samples to compare
 * @return the 64-bit accumulated pow2() of the per-sample differences
 */
static uint64_t sse_line_16bit(const uint8_t *_main_line, const uint8_t *_ref_line, int outw)
{
    const uint16_t *cur = (const uint16_t *) _main_line;
    const uint16_t *ref = (const uint16_t *) _ref_line;
    uint64_t total = 0;
    int remaining;

    /* Walk both lines in lockstep; no pointer is advanced unless a sample
     * is actually consumed, so outw <= 0 touches nothing. */
    for (remaining = outw; remaining > 0; remaining--) {
        total += pow2(*cur - *ref);
        cur++;
        ref++;
    }

    return total;
}
|
||||
|
||||
static inline
|
||||
void compute_images_mse(PSNRContext *s,
|
||||
const uint8_t *main_data[4], const int main_linesizes[4],
|
||||
const uint8_t *ref_data[4], const int ref_linesizes[4],
|
||||
int w, int h, double mse[4])
|
||||
{
|
||||
int i, c, j;
|
||||
int i, c;
|
||||
|
||||
for (c = 0; c < s->nb_components; c++) {
|
||||
const int outw = s->planewidth[c];
|
||||
@ -94,39 +115,8 @@ void compute_images_mse(PSNRContext *s,
|
||||
const int ref_linesize = ref_linesizes[c];
|
||||
const int main_linesize = main_linesizes[c];
|
||||
uint64_t m = 0;
|
||||
|
||||
for (i = 0; i < outh; i++) {
|
||||
int m2 = 0;
|
||||
for (j = 0; j < outw; j++)
|
||||
m2 += pow2(main_line[j] - ref_line[j]);
|
||||
m += m2;
|
||||
ref_line += ref_linesize;
|
||||
main_line += main_linesize;
|
||||
}
|
||||
mse[c] = m / (double)(outw * outh);
|
||||
}
|
||||
}
|
||||
|
||||
static inline
|
||||
void compute_images_mse_16bit(PSNRContext *s,
|
||||
const uint8_t *main_data[4], const int main_linesizes[4],
|
||||
const uint8_t *ref_data[4], const int ref_linesizes[4],
|
||||
int w, int h, double mse[4])
|
||||
{
|
||||
int i, c, j;
|
||||
|
||||
for (c = 0; c < s->nb_components; c++) {
|
||||
const int outw = s->planewidth[c];
|
||||
const int outh = s->planeheight[c];
|
||||
const uint16_t *main_line = (uint16_t *)main_data[c];
|
||||
const uint16_t *ref_line = (uint16_t *)ref_data[c];
|
||||
const int ref_linesize = ref_linesizes[c] / 2;
|
||||
const int main_linesize = main_linesizes[c] / 2;
|
||||
uint64_t m = 0;
|
||||
|
||||
for (i = 0; i < outh; i++) {
|
||||
for (j = 0; j < outw; j++)
|
||||
m += pow2(main_line[j] - ref_line[j]);
|
||||
m += s->dsp.sse_line(main_line, ref_line, outw);
|
||||
ref_line += ref_linesize;
|
||||
main_line += main_linesize;
|
||||
}
|
||||
@ -155,7 +145,7 @@ static AVFrame *do_psnr(AVFilterContext *ctx, AVFrame *main,
|
||||
int j, c;
|
||||
AVDictionary **metadata = avpriv_frame_get_metadatap(main);
|
||||
|
||||
s->compute_mse(s, (const uint8_t **)main->data, main->linesize,
|
||||
compute_images_mse(s, (const uint8_t **)main->data, main->linesize,
|
||||
(const uint8_t **)ref->data, ref->linesize,
|
||||
main->width, main->height, comp_mse);
|
||||
|
||||
@ -283,7 +273,9 @@ static int config_input_ref(AVFilterLink *inlink)
|
||||
s->average_max += s->max[j] * s->planeweight[j];
|
||||
}
|
||||
|
||||
s->compute_mse = desc->comp[0].depth_minus1 > 7 ? compute_images_mse_16bit : compute_images_mse;
|
||||
s->dsp.sse_line = desc->comp[0].depth_minus1 > 7 ? sse_line_16bit : sse_line_8bit;
|
||||
if (ARCH_X86)
|
||||
ff_psnr_init_x86(&s->dsp, desc->comp[0].depth_minus1 + 1);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -6,6 +6,7 @@ OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o
|
||||
OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace_init.o
|
||||
OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o
|
||||
OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
|
||||
OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
|
||||
OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
|
||||
OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o
|
||||
OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o
|
||||
@ -19,6 +20,7 @@ YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
|
||||
YASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
|
||||
YASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
|
||||
YASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o
|
||||
YASM-OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr.o
|
||||
YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o
|
||||
YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o
|
||||
YASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o
|
||||
|
139
libavfilter/x86/vf_psnr.asm
Normal file
139
libavfilter/x86/vf_psnr.asm
Normal file
@ -0,0 +1,139 @@
|
||||
;*****************************************************************************
|
||||
;* x86-optimized functions for psnr filter
|
||||
;*
|
||||
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
; SSE_LINE_FN bits, size
;   Emits sse_line_<bits>bit: the sum of squared differences between the
;   sample lines at buf and ref over w samples.
;   %1 = sample width in bits (8 or 16)
;   %2 = memory operand size keyword matching %1 (byte or word), used by the
;        scalar tail loop.
%macro SSE_LINE_FN 2 ; 8 or 16, byte or word
|
||||
INIT_XMM sse2
|
||||
; x86-32: cglobal is asked to load 0 arguments; buf/ref/w are fetched from
; the stack by hand below.
; NOTE(review): res/reshigh look like placeholders that reserve the
; return-value registers (eax, and edx for the 64-bit result) so they are not
; handed out to other names -- confirm against the x86inc register order.
%if ARCH_X86_32
|
||||
%if %1 == 8
|
||||
cglobal sse_line_%1 %+ bit, 0, 6, 8, res, buf, w, px1, px2, ref
|
||||
%else
|
||||
cglobal sse_line_%1 %+ bit, 0, 7, 8, res, buf, reshigh, w, px1, px2, ref
|
||||
%endif
|
||||
mov bufq, r0mp
|
||||
mov refq, r1mp
|
||||
mov wd, r2m
|
||||
%else
|
||||
cglobal sse_line_%1 %+ bit, 3, 5, 8, buf, ref, w, px1, px2
|
||||
%endif
|
||||
; m6 stays zero (used as the unpack partner), m7 is the running accumulator
pxor m6, m6
|
||||
pxor m7, m7
|
||||
; each vector iteration consumes mmsize*2 samples; skip the loop entirely
; when fewer than that are available
sub wd, mmsize*2
|
||||
jl .end
|
||||
|
||||
.loop:
|
||||
movu m0, [bufq+mmsize*0]
|
||||
movu m1, [bufq+mmsize*1]
|
||||
movu m2, [refq+mmsize*0]
|
||||
movu m3, [refq+mmsize*1]
|
||||
%if %1 == 8
|
||||
add bufq, mmsize*2
|
||||
add refq, mmsize*2
|
||||
; absolute byte difference: saturating subtract in both directions, then OR
; (one of the two results is always zero)
psubusb m4, m0, m2
|
||||
psubusb m5, m1, m3
|
||||
psubusb m2, m0
|
||||
psubusb m3, m1
|
||||
por m2, m4
|
||||
por m3, m5
|
||||
; zero-extend the byte differences to words (m6 is zero)
punpcklbw m0, m2, m6
|
||||
punpcklbw m1, m3, m6
|
||||
punpckhbw m2, m6
|
||||
punpckhbw m3, m6
|
||||
%else
|
||||
; word differences; two more register pairs are loaded so that this path
; also covers mmsize*2 samples (mmsize*4 bytes) per iteration
psubw m0, m2
|
||||
psubw m1, m3
|
||||
movu m2, [bufq+mmsize*2]
|
||||
movu m3, [bufq+mmsize*3]
|
||||
movu m4, [refq+mmsize*2]
|
||||
movu m5, [refq+mmsize*3]
|
||||
psubw m2, m4
|
||||
psubw m3, m5
|
||||
add bufq, mmsize*4
|
||||
add refq, mmsize*4
|
||||
%endif
|
||||
; square every word difference and add adjacent pairs into dwords
pmaddwd m0, m0
|
||||
pmaddwd m1, m1
|
||||
pmaddwd m2, m2
|
||||
pmaddwd m3, m3
|
||||
paddd m0, m1
|
||||
paddd m2, m3
|
||||
%if %1 == 8
|
||||
; 8-bit: keep accumulating in four dword lanes
paddd m7, m0
|
||||
paddd m7, m2
|
||||
%else
|
||||
; 16-bit: zero-extend the dword sums to qwords before adding, so the running
; total in m7 is kept in 64-bit lanes
paddd m0, m2
|
||||
punpckldq m2, m0, m6
|
||||
punpckhdq m0, m6
|
||||
paddq m7, m0
|
||||
paddq m7, m2
|
||||
%endif
|
||||
sub wd, mmsize*2
|
||||
jge .loop
|
||||
|
||||
.end:
|
||||
; undo the bias from the loop counter: w is now the number of leftover samples
add wd, mmsize*2
|
||||
; horizontal reduction of m7 into the return register(s):
; eax for 8-bit, edx:eax (x86-32) or rax (x86-64) for 16-bit
movhlps m0, m7
|
||||
%if %1 == 8
|
||||
paddd m7, m0
|
||||
pshufd m0, m7, 1
|
||||
paddd m7, m0
|
||||
movd eax, m7
|
||||
%else
|
||||
paddq m7, m0
|
||||
%if ARCH_X86_32
|
||||
movd eax, m7
|
||||
psrldq m7, 4
|
||||
movd edx, m7
|
||||
%else
|
||||
movq rax, m7
|
||||
%endif
|
||||
%endif
|
||||
|
||||
; deal with cases where w % 32 != 0
|
||||
test wd, wd
|
||||
jz .end_scalar
|
||||
; scalar tail: walks the leftover samples from the last one down to the
; first, adding each squared difference to the result register(s)
.loop_scalar:
|
||||
movzx px1d, %2 [bufq+wq*(%1/8)-(%1/8)]
|
||||
movzx px2d, %2 [refq+wq*(%1/8)-(%1/8)]
|
||||
sub px1d, px2d
|
||||
imul px1d, px1d
|
||||
%if %1 == 8
|
||||
add eax, px1d
|
||||
%elif ARCH_X86_64
|
||||
add rax, px1q
|
||||
%else
|
||||
; x86-32, 16-bit: propagate the carry into the high half of the 64-bit sum
add eax, px1d
|
||||
adc edx, 0
|
||||
%endif
|
||||
dec wd
|
||||
jg .loop_scalar
|
||||
|
||||
.end_scalar:
|
||||
; for %1=8, no need to zero edx on x86-32, since edx=wd, which is zero
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
SSE_LINE_FN 8, byte
|
||||
SSE_LINE_FN 16, word
|
39
libavfilter/x86/vf_psnr_init.c
Normal file
39
libavfilter/x86/vf_psnr_init.c
Normal file
@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/x86/cpu.h"
|
||||
|
||||
#include "libavfilter/psnr.h"
|
||||
|
||||
uint64_t ff_sse_line_8bit_sse2(const uint8_t *buf, const uint8_t *ref, int w);
|
||||
uint64_t ff_sse_line_16bit_sse2(const uint8_t *buf, const uint8_t *ref, int w);
|
||||
|
||||
/**
 * Override the sse_line kernel in dsp with an SSE2 implementation when the
 * running CPU supports it.
 *
 * @param dsp function table to patch; left untouched when nothing applies
 * @param bpp sample bit depth
 */
void ff_psnr_init_x86(PSNRDSPContext *dsp, int bpp)
{
    const int flags = av_get_cpu_flags();

    /* No external SSE2 support: keep whatever the caller installed. */
    if (!EXTERNAL_SSE2(flags))
        return;

    if (bpp <= 8)
        dsp->sse_line = ff_sse_line_8bit_sse2;
    else if (bpp <= 15)
        dsp->sse_line = ff_sse_line_16bit_sse2;
    /* Depths above 15 bits keep the existing kernel.
     * NOTE(review): presumably because the SSE2 16-bit kernel squares the
     * differences as signed words -- confirm against vf_psnr.asm. */
}
|
Loading…
Reference in New Issue
Block a user