1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-11-21 10:55:51 +02:00

avfilter/x86/vf_gblur: add postscale SIMD

This commit is contained in:
Paul B Mahol 2021-02-13 12:09:47 +01:00
parent 058db59e16
commit 44cf3a2b16
3 changed files with 70 additions and 9 deletions

View File

@ -171,13 +171,14 @@ static int filter_postscale(AVFilterContext *ctx, void *arg, int jobnr, int nb_j
const float min = s->flt ? -FLT_MAX : 0.f;
const int height = td->height;
const int width = td->width;
const int64_t numpixels = width * (int64_t)height;
const int slice_start = (numpixels * jobnr ) / nb_jobs;
const int slice_end = (numpixels * (jobnr+1)) / nb_jobs;
const int awidth = FFALIGN(width, 64);
const int slice_start = (height * jobnr ) / nb_jobs;
const int slice_end = (height * (jobnr+1)) / nb_jobs;
const float postscale = s->postscale * s->postscaleV;
float *buffer = s->buffer + slice_start;
const int slice_size = slice_end - slice_start;
s->postscale_slice(buffer, slice_end - slice_start, postscale, min, max);
s->postscale_slice(s->buffer + slice_start * awidth,
slice_size * awidth, postscale, min, max);
return 0;
}
@ -251,7 +252,7 @@ static int config_input(AVFilterLink *inlink)
s->nb_planes = av_pix_fmt_count_planes(inlink->format);
s->buffer = av_malloc_array(FFALIGN(inlink->w, 16), FFALIGN(inlink->h, 16) * sizeof(*s->buffer));
s->buffer = av_malloc_array(FFALIGN(inlink->w, 64), FFALIGN(inlink->h, 64) * sizeof(*s->buffer));
if (!s->buffer)
return AVERROR(ENOMEM);

View File

@ -183,3 +183,52 @@ HORIZ_SLICE
INIT_XMM avx2
HORIZ_SLICE
%endif
; POSTSCALE_SLICE: emits postscale_slice for the currently selected
; instruction set. For every 32-bit float x in the buffer the loop computes,
; in place, x = min(max(x * m0, m1), m2).
; Going by the C prototype (ptr, length, postscale, min, max), m0/m1/m2 are
; presumably postscale/min/max -- confirm against the caller.
%macro POSTSCALE_SLICE 0
%if UNIX64
; Only ptr and length are named for this ABI; the three float arguments are
; presumably already in xmm0-xmm2 on entry -- TODO confirm.
cglobal postscale_slice, 2, 2, 4, ptr, length
%else
; NOTE(review): this branch is taken by every target that is not UNIX64.
; Apart from the WIN64 register renaming below, nothing in this macro moves
; postscale/min/max into m0-m2 -- confirm the function is only used where
; the floats really arrive in those registers.
cglobal postscale_slice, 5, 5, 4, ptr, length, postscale, min, max
%endif
; Convert the element count into a byte count (4 bytes per float).
shl lengthd, 2
; Point ptr at the end of the range and turn length into a negative byte
; offset that counts up towards zero; the add at the bottom of the loop then
; both advances the offset and sets the flags tested by jl.
add ptrq, lengthq
neg lengthq
%if WIN64
; Rename the vector registers so that m0/m1 refer to what arrived in
; xmm2/xmm3.
; NOTE(review): after these three swaps m2 names the register that used to
; be m4, which nothing in this macro loads, and cglobal above requests only
; 4 vector registers -- confirm that max actually ends up in m2 on WIN64.
SWAP 0, 2
SWAP 1, 3
SWAP 2, 4
%endif
; Replicate the scalar in the lowest lane of m0, m1 and m2 across all lanes.
%if cpuflag(avx2)
vbroadcastss m0, xm0
vbroadcastss m1, xm1
vbroadcastss m2, xm2
%else
shufps xm0, xm0, 0
shufps xm1, xm1, 0
shufps xm2, xm2, 0
%endif
; The body runs before the loop test, so one full vector is always
; processed, and every iteration handles mmsize bytes. A length that is zero
; or not a multiple of the vector width therefore touches memory past the
; requested range.
; NOTE(review): presumably the caller pads the buffer to cover this -- confirm.
.loop:
%if cpuflag(avx2)
; Three-operand form: load and multiply in one instruction.
mulps m3, m0, [ptrq + lengthq]
%else
; Unaligned load, then multiply by the broadcast factor in m0.
movu m3, [ptrq + lengthq]
mulps m3, m0
%endif
; Clamp: raise to at least m1, then cap at m2.
maxps m3, m1
minps m3, m2
; Store the result back over the input (unaligned store).
movu [ptrq+lengthq], m3
; Advance by one vector; keep looping while the offset is still negative.
add lengthq, mmsize
jl .loop
RET
%endmacro
; Instantiate the macro once per instruction set. The C side declares the
; resulting symbols as ff_postscale_slice_sse and ff_postscale_slice_avx2.
; SSE build: XMM registers.
INIT_XMM sse
POSTSCALE_SLICE
; AVX2 build: YMM registers, only assembled when the build enables
; HAVE_AVX2_EXTERNAL.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
POSTSCALE_SLICE
%endif

View File

@ -27,14 +27,25 @@
void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min, float max);
void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
av_cold void ff_gblur_init_x86(GBlurContext *s)
{
#if ARCH_X86_64
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE4(cpu_flags))
if (EXTERNAL_SSE(cpu_flags)) {
s->postscale_slice = ff_postscale_slice_sse;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
s->postscale_slice = ff_postscale_slice_avx2;
}
#if ARCH_X86_64
if (EXTERNAL_SSE4(cpu_flags)) {
s->horiz_slice = ff_horiz_slice_sse4;
if (EXTERNAL_AVX2(cpu_flags))
}
if (EXTERNAL_AVX2(cpu_flags)) {
s->horiz_slice = ff_horiz_slice_avx2;
}
#endif
}