Mirror of https://github.com/FFmpeg/FFmpeg.git
697533e76d
Add an optional filter_line3 to the available optimisations. filter_line3 is equivalent to filter_line, memcpy, filter_line. filter_line shares quite a number of loads and some calculations in common with its next iteration, and testing shows that the aarch64 NEON filter_line3's performance is 30% better than two filter_lines and a memcpy.

Adds a test for vf_bwdif filter_line3 to checkasm.

Rounds job start lines down to a multiple of 4. This means that if filter_line3 exists then filter_line will not sometimes be called once at the end of a slice depending on thread count. The final slice may do up to 3 extra lines, but filter_edge is faster than filter_line so it is unlikely to create any noticeable thread load variation.

Signed-off-by: John Cox <jc@kynesim.co.uk>
Signed-off-by: Martin Storsjö <martin@martin.st>
257 lines · 11 KiB · C
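The commit message above describes filter_line3 as equivalent to filter_line, memcpy, filter_line over three consecutive output lines. Below is a minimal sketch of that composition, written against the argument layouts that the checkasm declarations in this file exercise; the typedef bwdif_filter_line_fn and the helper bwdif_filter_line3_fallback are illustrative names only, not FFmpeg API, and a real optimised filter_line3 (such as the aarch64 NEON one) fuses the two filter_line passes so their shared loads and calculations are only done once.

#include <stdint.h>
#include <string.h>

/* Hypothetical pointer type matching the filter_line arguments used by the
 * BODY() check below. */
typedef void (*bwdif_filter_line_fn)(void *dst, void *prev, void *cur, void *next,
                                     int w, int prefs, int mrefs,
                                     int prefs2, int mrefs2,
                                     int prefs3, int mrefs3,
                                     int prefs4, int mrefs4,
                                     int parity, int clip_max);

/* Illustrative fallback: filter output line 0, copy output line 1 unchanged
 * from the current field, filter output line 2.  prefs is the source stride
 * and d_stride the destination stride, both in bytes for 8-bit data. */
static void bwdif_filter_line3_fallback(bwdif_filter_line_fn filter_line,
                                        uint8_t *dst, int d_stride,
                                        const uint8_t *prev, const uint8_t *cur,
                                        const uint8_t *next, int prefs,
                                        int w, int parity, int clip_max)
{
    filter_line(dst, (void *)prev, (void *)cur, (void *)next, w,
                prefs, -prefs, 2 * prefs, -2 * prefs,
                3 * prefs, -3 * prefs, 4 * prefs, -4 * prefs,
                parity, clip_max);
    /* The middle line belongs to the field being kept, so it is a plain copy. */
    memcpy(dst + d_stride, cur + prefs, w);
    filter_line(dst + 2 * d_stride,
                (void *)(prev + 2 * prefs), (void *)(cur + 2 * prefs),
                (void *)(next + 2 * prefs), w,
                prefs, -prefs, 2 * prefs, -2 * prefs,
                3 * prefs, -3 * prefs, 4 * prefs, -4 * prefs,
                parity, clip_max);
}

The test file follows. It checks filter_line at 8 and 10 bits via BODY(), then filter_line3 with random data for both parities and with a 0/mask pattern intended to provoke overflow, then filter_edge for each spatial/parity combination, and finally filter_intra.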
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <string.h>
#include "checkasm.h"
#include "libavcodec/internal.h"
#include "libavfilter/bwdif.h"
#include "libavutil/mem_internal.h"

#define WIDTH 256

#define randomize_buffers(buf0, buf1, mask, count) \
    for (size_t i = 0; i < count; i++) \
        buf0[i] = buf1[i] = rnd() & mask

#define randomize_overflow_check(buf0, buf1, mask, count) \
    for (size_t i = 0; i < count; i++) \
        buf0[i] = buf1[i] = (rnd() & 1) != 0 ? mask : 0;

#define BODY(type, depth) \
    do { \
        type prev0[9*WIDTH], prev1[9*WIDTH]; \
        type next0[9*WIDTH], next1[9*WIDTH]; \
        type cur0[9*WIDTH], cur1[9*WIDTH]; \
        type dst0[WIDTH], dst1[WIDTH]; \
        const int stride = WIDTH; \
        const int mask = (1<<depth)-1; \
        \
        declare_func(void, void *dst, void *prev, void *cur, void *next, \
                     int w, int prefs, int mrefs, int prefs2, int mrefs2, \
                     int prefs3, int mrefs3, int prefs4, int mrefs4, \
                     int parity, int clip_max); \
        \
        randomize_buffers(prev0, prev1, mask, 9*WIDTH); \
        randomize_buffers(next0, next1, mask, 9*WIDTH); \
        randomize_buffers( cur0, cur1, mask, 9*WIDTH); \
        \
        call_ref(dst0, prev0 + 4*WIDTH, cur0 + 4*WIDTH, next0 + 4*WIDTH, \
                 WIDTH, stride, -stride, 2*stride, -2*stride, \
                 3*stride, -3*stride, 4*stride, -4*stride, \
                 0, mask); \
        call_new(dst1, prev1 + 4*WIDTH, cur1 + 4*WIDTH, next1 + 4*WIDTH, \
                 WIDTH, stride, -stride, 2*stride, -2*stride, \
                 3*stride, -3*stride, 4*stride, -4*stride, \
                 0, mask); \
        \
        if (memcmp(dst0, dst1, sizeof dst0) \
            || memcmp(prev0, prev1, sizeof prev0) \
            || memcmp(next0, next1, sizeof next0) \
            || memcmp( cur0, cur1, sizeof cur0)) \
            fail(); \
        bench_new(dst1, prev1 + 4*WIDTH, cur1 + 4*WIDTH, next1 + 4*WIDTH, \
                  WIDTH, stride, -stride, 2*stride, -2*stride, \
                  3*stride, -3*stride, 4*stride, -4*stride, \
                  0, mask); \
    } while (0)

void checkasm_check_vf_bwdif(void)
{
    BWDIFContext ctx_8, ctx_10;

    ff_bwdif_init_filter_line(&ctx_8, 8);
    ff_bwdif_init_filter_line(&ctx_10, 10);

    if (check_func(ctx_8.filter_line, "bwdif8")) {
        BODY(uint8_t, 8);
        report("bwdif8");
    }

    if (check_func(ctx_10.filter_line, "bwdif10")) {
        BODY(uint16_t, 10);
        report("bwdif10");
    }

    if (!ctx_8.filter_line3)
        ctx_8.filter_line3 = ff_bwdif_filter_line3_c;

    {
        LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]);
        LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]);
        LOCAL_ALIGNED_16(uint8_t, next0, [11*WIDTH]);
        LOCAL_ALIGNED_16(uint8_t, next1, [11*WIDTH]);
        LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]);
        LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]);
        LOCAL_ALIGNED_16(uint8_t, dst0, [WIDTH*3]);
        LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]);
        const int stride = WIDTH;
        const int mask = (1<<8)-1;
        int parity;

        for (parity = 0; parity != 2; ++parity) {
            if (check_func(ctx_8.filter_line3, "bwdif8.line3.rnd.p%d", parity)) {

                declare_func(void, void * dst1, int d_stride,
                             const void * prev1, const void * cur1, const void * next1, int prefs,
                             int w, int parity, int clip_max);

                randomize_buffers(prev0, prev1, mask, 11*WIDTH);
                randomize_buffers(next0, next1, mask, 11*WIDTH);
                randomize_buffers( cur0, cur1, mask, 11*WIDTH);

                call_ref(dst0, stride,
                         prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, stride,
                         WIDTH, parity, mask);
                call_new(dst1, stride,
                         prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
                         WIDTH, parity, mask);

                if (memcmp(dst0, dst1, WIDTH*3)
                    || memcmp(prev0, prev1, WIDTH*11)
                    || memcmp(next0, next1, WIDTH*11)
                    || memcmp( cur0, cur1, WIDTH*11))
                    fail();

                bench_new(dst1, stride,
                          prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
                          WIDTH, parity, mask);
            }
        }

        // Use just 0s and ~0s to try to provoke bad cropping or overflow
        // Parity makes no difference to this test so just test 0
        if (check_func(ctx_8.filter_line3, "bwdif8.line3.overflow")) {

            declare_func(void, void * dst1, int d_stride,
                         const void * prev1, const void * cur1, const void * next1, int prefs,
                         int w, int parity, int clip_max);

            randomize_overflow_check(prev0, prev1, mask, 11*WIDTH);
            randomize_overflow_check(next0, next1, mask, 11*WIDTH);
            randomize_overflow_check( cur0, cur1, mask, 11*WIDTH);

            call_ref(dst0, stride,
                     prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, stride,
                     WIDTH, 0, mask);
            call_new(dst1, stride,
                     prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
                     WIDTH, 0, mask);

            if (memcmp(dst0, dst1, WIDTH*3)
                || memcmp(prev0, prev1, WIDTH*11)
                || memcmp(next0, next1, WIDTH*11)
                || memcmp( cur0, cur1, WIDTH*11))
                fail();

            // No point to benching
        }

        report("bwdif8.line3");
    }

    {
        LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]);
        LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]);
        LOCAL_ALIGNED_16(uint8_t, next0, [11*WIDTH]);
        LOCAL_ALIGNED_16(uint8_t, next1, [11*WIDTH]);
        LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]);
        LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]);
        LOCAL_ALIGNED_16(uint8_t, dst0, [WIDTH*3]);
        LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]);
        const int stride = WIDTH;
        const int mask = (1<<8)-1;
        int spat;
        int parity;

        for (spat = 0; spat != 2; ++spat) {
            for (parity = 0; parity != 2; ++parity) {
                if (check_func(ctx_8.filter_edge, "bwdif8.edge.s%d.p%d", spat, parity)) {

                    declare_func(void, void *dst1, void *prev1, void *cur1, void *next1,
                                 int w, int prefs, int mrefs, int prefs2, int mrefs2,
                                 int parity, int clip_max, int spat);

                    randomize_buffers(prev0, prev1, mask, 11*WIDTH);
                    randomize_buffers(next0, next1, mask, 11*WIDTH);
                    randomize_buffers( cur0, cur1, mask, 11*WIDTH);
                    memset(dst0, 0xba, WIDTH * 3);
                    memset(dst1, 0xba, WIDTH * 3);

                    call_ref(dst0 + stride,
                             prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, WIDTH,
                             stride, -stride, stride * 2, -stride * 2,
                             parity, mask, spat);
                    call_new(dst1 + stride,
                             prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, WIDTH,
                             stride, -stride, stride * 2, -stride * 2,
                             parity, mask, spat);

                    if (memcmp(dst0, dst1, WIDTH*3)
                        || memcmp(prev0, prev1, WIDTH*11)
                        || memcmp(next0, next1, WIDTH*11)
                        || memcmp( cur0, cur1, WIDTH*11))
                        fail();

                    bench_new(dst1 + stride,
                              prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, WIDTH,
                              stride, -stride, stride * 2, -stride * 2,
                              parity, mask, spat);
                }
            }
        }

        report("bwdif8.edge");
    }

    if (check_func(ctx_8.filter_intra, "bwdif8.intra")) {
        LOCAL_ALIGNED_16(uint8_t, cur0, [11*WIDTH]);
        LOCAL_ALIGNED_16(uint8_t, cur1, [11*WIDTH]);
        LOCAL_ALIGNED_16(uint8_t, dst0, [WIDTH*3]);
        LOCAL_ALIGNED_16(uint8_t, dst1, [WIDTH*3]);
        const int stride = WIDTH;
        const int mask = (1<<8)-1;

        declare_func(void, void *dst1, void *cur1, int w, int prefs, int mrefs,
                     int prefs3, int mrefs3, int parity, int clip_max);

        randomize_buffers( cur0, cur1, mask, 11*WIDTH);
        memset(dst0, 0xba, WIDTH * 3);
        memset(dst1, 0xba, WIDTH * 3);

        call_ref(dst0 + stride,
                 cur0 + stride * 4, WIDTH,
                 stride, -stride, stride * 3, -stride * 3,
                 0, mask);
        call_new(dst1 + stride,
                 cur0 + stride * 4, WIDTH,
                 stride, -stride, stride * 3, -stride * 3,
                 0, mask);

        if (memcmp(dst0, dst1, WIDTH*3)
            || memcmp( cur0, cur1, WIDTH*11))
            fail();

        bench_new(dst1 + stride,
                  cur0 + stride * 4, WIDTH,
                  stride, -stride, stride * 3, -stride * 3,
                  0, mask);

        report("bwdif8.intra");
    }
}