mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-08 13:22:53 +02:00
avfilter/vf_overlay: add x86 SIMD
Specifically for the yuv444, yuv422 and yuv420 formats, when the main stream has no alpha and the alpha is straight. Signed-off-by: Paul B Mahol <onemda@gmail.com>
This commit is contained in:
parent
a150b2e3a0
commit
6d7c63588c
@ -39,6 +39,7 @@
|
||||
#include "drawutils.h"
|
||||
#include "framesync.h"
|
||||
#include "video.h"
|
||||
#include "vf_overlay.h"
|
||||
|
||||
typedef struct ThreadData {
|
||||
AVFrame *dst, *src;
|
||||
@ -59,21 +60,6 @@ static const char *const var_names[] = {
|
||||
NULL
|
||||
};
|
||||
|
||||
/**
 * Slots of the expression-variable table.
 *
 * Each enumerator is an index into OverlayContext.var_values[], which is
 * sized by VAR_VARS_NB. Enumerators listed on the same line appear to be a
 * long/short spelling of the same quantity (presumably matching the order of
 * var_names[] -- confirm against that table).
 */
enum var_name {
    VAR_MAIN_W,    VAR_MW,
    VAR_MAIN_H,    VAR_MH,
    VAR_OVERLAY_W, VAR_OW,
    VAR_OVERLAY_H, VAR_OH,
    VAR_HSUB,
    VAR_VSUB,
    VAR_X,
    VAR_Y,
    VAR_N,
    VAR_POS,
    VAR_T,
    VAR_VARS_NB    ///< number of slots, not a variable itself
};
|
||||
|
||||
/* Input indices: 0 is the main (background) stream, 1 is the overlay. */
#define MAIN    0
#define OVERLAY 1
|
||||
|
||||
@ -92,45 +78,6 @@ enum EvalMode {
|
||||
EVAL_MODE_NB
|
||||
};
|
||||
|
||||
/**
 * Pixel-format family the overlay filter works in.
 * Stored in OverlayContext.format.
 */
enum OverlayFormat {
    OVERLAY_FORMAT_YUV420,
    OVERLAY_FORMAT_YUV422,
    OVERLAY_FORMAT_YUV444,
    OVERLAY_FORMAT_RGB,
    OVERLAY_FORMAT_GBRP,
    OVERLAY_FORMAT_AUTO,
    OVERLAY_FORMAT_NB       ///< number of formats, not a format itself
};
|
||||
|
||||
typedef struct OverlayContext {
|
||||
const AVClass *class;
|
||||
int x, y; ///< position of overlaid picture
|
||||
|
||||
uint8_t main_is_packed_rgb;
|
||||
uint8_t main_rgba_map[4];
|
||||
uint8_t main_has_alpha;
|
||||
uint8_t overlay_is_packed_rgb;
|
||||
uint8_t overlay_rgba_map[4];
|
||||
uint8_t overlay_has_alpha;
|
||||
int format; ///< OverlayFormat
|
||||
int alpha_format;
|
||||
int eval_mode; ///< EvalMode
|
||||
|
||||
FFFrameSync fs;
|
||||
|
||||
int main_pix_step[4]; ///< steps per pixel for each plane of the main output
|
||||
int overlay_pix_step[4]; ///< steps per pixel for each plane of the overlay
|
||||
int hsub, vsub; ///< chroma subsampling values
|
||||
const AVPixFmtDescriptor *main_desc; ///< format descriptor for main input
|
||||
|
||||
double var_values[VAR_VARS_NB];
|
||||
char *x_expr, *y_expr;
|
||||
|
||||
AVExpr *x_pexpr, *y_pexpr;
|
||||
|
||||
int (*blend_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
|
||||
} OverlayContext;
|
||||
|
||||
static av_cold void uninit(AVFilterContext *ctx)
|
||||
{
|
||||
OverlayContext *s = ctx->priv;
|
||||
@ -509,6 +456,7 @@ static av_always_inline void blend_plane(AVFilterContext *ctx,
|
||||
int jobnr,
|
||||
int nb_jobs)
|
||||
{
|
||||
OverlayContext *octx = ctx->priv;
|
||||
int src_wp = AV_CEIL_RSHIFT(src_w, hsub);
|
||||
int src_hp = AV_CEIL_RSHIFT(src_h, vsub);
|
||||
int dst_wp = AV_CEIL_RSHIFT(dst_w, hsub);
|
||||
@ -538,8 +486,18 @@ static av_always_inline void blend_plane(AVFilterContext *ctx,
|
||||
s = sp + k;
|
||||
a = ap + (k<<hsub);
|
||||
da = dap + ((xp+k) << hsub);
|
||||
kmax = FFMIN(-xp + dst_wp, src_wp);
|
||||
|
||||
for (kmax = FFMIN(-xp + dst_wp, src_wp); k < kmax; k++) {
|
||||
if (((vsub && j+1 < src_hp) || !vsub) && octx->blend_row[i]) {
|
||||
int c = octx->blend_row[i](d, da, s, a, kmax - k, src->linesize[3]);
|
||||
|
||||
s += c;
|
||||
d += dst_step * c;
|
||||
da += (1 << hsub) * c;
|
||||
a += (1 << hsub) * c;
|
||||
k += c;
|
||||
}
|
||||
for (; k < kmax; k++) {
|
||||
int alpha_v, alpha_h, alpha;
|
||||
|
||||
// average alpha for color components, improve quality
|
||||
@ -916,7 +874,7 @@ static int config_input_main(AVFilterLink *inlink)
|
||||
}
|
||||
|
||||
if (!s->alpha_format)
|
||||
return 0;
|
||||
goto end;
|
||||
|
||||
switch (s->format) {
|
||||
case OVERLAY_FORMAT_YUV420:
|
||||
@ -960,6 +918,11 @@ static int config_input_main(AVFilterLink *inlink)
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
end:
|
||||
if (ARCH_X86)
|
||||
ff_overlay_init_x86(s, s->format, s->alpha_format, s->main_has_alpha);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
85
libavfilter/vf_overlay.h
Normal file
85
libavfilter/vf_overlay.h
Normal file
@ -0,0 +1,85 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVFILTER_OVERLAY_H
|
||||
#define AVFILTER_OVERLAY_H
|
||||
|
||||
#include "libavutil/eval.h"
|
||||
#include "libavutil/pixdesc.h"
|
||||
#include "framesync.h"
|
||||
#include "avfilter.h"
|
||||
|
||||
/**
 * Indices into OverlayContext.var_values[] (the array is sized by
 * VAR_VARS_NB).
 *
 * Two enumerators on one line look like a long and a short name for the same
 * value -- presumably mirroring the filter's variable-name table; verify
 * against var_names[] in vf_overlay.c.
 */
enum var_name {
    VAR_MAIN_W,    VAR_MW,
    VAR_MAIN_H,    VAR_MH,
    VAR_OVERLAY_W, VAR_OW,
    VAR_OVERLAY_H, VAR_OH,
    VAR_HSUB,
    VAR_VSUB,
    VAR_X,
    VAR_Y,
    VAR_N,
    VAR_POS,
    VAR_T,
    VAR_VARS_NB    ///< element count of var_values[]
};
|
||||
|
||||
/**
 * Working pixel-format family, stored in OverlayContext.format and passed to
 * ff_overlay_init_x86() to pick the SIMD row blenders.
 */
enum OverlayFormat {
    OVERLAY_FORMAT_YUV420,
    OVERLAY_FORMAT_YUV422,
    OVERLAY_FORMAT_YUV444,
    OVERLAY_FORMAT_RGB,
    OVERLAY_FORMAT_GBRP,
    OVERLAY_FORMAT_AUTO,
    OVERLAY_FORMAT_NB       ///< count of the entries above
};
|
||||
|
||||
typedef struct OverlayContext {
|
||||
const AVClass *class;
|
||||
int x, y; ///< position of overlaid picture
|
||||
|
||||
uint8_t main_is_packed_rgb;
|
||||
uint8_t main_rgba_map[4];
|
||||
uint8_t main_has_alpha;
|
||||
uint8_t overlay_is_packed_rgb;
|
||||
uint8_t overlay_rgba_map[4];
|
||||
uint8_t overlay_has_alpha;
|
||||
int format; ///< OverlayFormat
|
||||
int alpha_format;
|
||||
int eval_mode; ///< EvalMode
|
||||
|
||||
FFFrameSync fs;
|
||||
|
||||
int main_pix_step[4]; ///< steps per pixel for each plane of the main output
|
||||
int overlay_pix_step[4]; ///< steps per pixel for each plane of the overlay
|
||||
int hsub, vsub; ///< chroma subsampling values
|
||||
const AVPixFmtDescriptor *main_desc; ///< format descriptor for main input
|
||||
|
||||
double var_values[VAR_VARS_NB];
|
||||
char *x_expr, *y_expr;
|
||||
|
||||
AVExpr *x_pexpr, *y_pexpr;
|
||||
|
||||
int (*blend_row[4])(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a, int w,
|
||||
ptrdiff_t alinesize);
|
||||
int (*blend_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
|
||||
} OverlayContext;
|
||||
|
||||
/**
 * x86-specific setup: may install SIMD row blenders in s->blend_row[].
 *
 * @param s              overlay context whose blend_row[] may be filled in
 * @param format         an enum OverlayFormat value
 * @param alpha_format   the context's alpha_format setting
 * @param main_has_alpha nonzero when the main input carries an alpha plane
 */
void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha);
|
||||
|
||||
#endif /* AVFILTER_OVERLAY_H */
|
@ -13,6 +13,7 @@ OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_tinterlace_init.o
|
||||
OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o
|
||||
OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge_init.o
|
||||
OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o
|
||||
OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay_init.o
|
||||
OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
|
||||
OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
|
||||
OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
|
||||
@ -41,6 +42,7 @@ X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
|
||||
X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
|
||||
X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o
|
||||
X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o
|
||||
X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o
|
||||
X86ASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o
|
||||
X86ASM-OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr.o
|
||||
X86ASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o
|
||||
|
144
libavfilter/x86/vf_overlay.asm
Normal file
144
libavfilter/x86/vf_overlay.asm
Normal file
@ -0,0 +1,144 @@
|
||||
;*****************************************************************************
|
||||
;* x86-optimized functions for overlay filter
|
||||
;*
|
||||
;* Copyright (C) 2018 Paul B Mahol
|
||||
;* Copyright (C) 2018 Henrik Gramner
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;*****************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
; Constants shared by the row blenders below.
pb_1:   times 16 db 1      ; pmaddubsw multiplier: sums adjacent alpha bytes (overlay_row_20)
pw_128: times  8 dw 128    ; bias added before the final pmulhuw
pw_255: times  8 dw 255    ; xor mask giving 255 - alpha; low-byte mask in overlay_row_22
pw_257: times  8 dw 257    ; pmulhuw multiplier: (v * 257) >> 16
|
||||
|
||||
SECTION .text
|
||||
|
||||
;------------------------------------------------------------------------------
; int ff_overlay_row_44(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
;                       int w, ptrdiff_t alinesize)
;
; Blends one row where alpha has one sample per pixel:
;     d[i] = ((s[i]*a[i] + d[i]*(255 - a[i]) + 128) * 257) >> 16
; Works on groups of mmsize/2 (8) pixels and returns how many pixels were
; written (a multiple of 8, 0 when w < 8); the caller handles the remainder.
; da and alinesize are not used here.
;------------------------------------------------------------------------------
INIT_XMM sse4
cglobal overlay_row_44, 5, 7, 6, 0, d, da, s, a, w, r, x
    xor           xq, xq                ; x = pixels processed so far
    movsxdifnidn  wq, wd                ; widen the int width argument
    mov           rq, wq
    and           rq, mmsize/2 - 1      ; r = w % 8, left for the caller
    cmp           wq, mmsize/2
    jl .end                             ; fewer than 8 pixels: do nothing
    sub           wq, rq                ; w rounded down to a multiple of 8
    mova          m3, [pw_255]
    mova          m4, [pw_128]
    mova          m5, [pw_257]
    .loop:
        pmovzxbw  m0, [sq+xq]           ; 8 overlay samples -> words
        pmovzxbw  m2, [aq+xq]           ; 8 alpha samples   -> words
        pmovzxbw  m1, [dq+xq]           ; 8 main samples    -> words
        pmullw    m0, m2                ; s * a
        pxor      m2, m3                ; a ^ 255 == 255 - a
        pmullw    m1, m2                ; d * (255 - a)
        paddw     m0, m4                ; + 128
        paddw     m0, m1
        pmulhuw   m0, m5                ; * 257 >> 16, approximates rounded /255
        packuswb  m0, m0
        movq      [dq+xq], m0           ; store 8 blended bytes
        add       xq, mmsize/2
        cmp       xq, wq
        jl .loop

    .end:
    mov    eax, xd                      ; return the processed pixel count
    RET
|
||||
|
||||
;------------------------------------------------------------------------------
; int ff_overlay_row_22(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
;                       int w, ptrdiff_t alinesize)
;
; Blends one row where alpha has two horizontal samples per output pixel.
; The effective alpha is avg(avg(a[2i+1], a[2i]), a[2i]) using pavgw rounding,
; then the same blend as overlay_row_44 is applied.
; Only the first w-1 pixels are considered, in groups of 8; the return value
; is the number of pixels written (0 when w-1 < 8). The caller handles the
; rest. NOTE(review): the last pixel is presumably excluded because it may
; lack a right-hand alpha neighbour -- confirm against the C fallback.
; da and alinesize are not used here.
;------------------------------------------------------------------------------
INIT_XMM sse4
cglobal overlay_row_22, 5, 7, 6, 0, d, da, s, a, w, r, x
    xor           xq, xq                ; x = pixels processed so far
    movsxdifnidn  wq, wd                ; widen the int width argument
    sub           wq, 1                 ; never touch the last pixel
    mov           rq, wq
    and           rq, mmsize/2 - 1      ; r = (w - 1) % 8
    cmp           wq, mmsize/2
    jl .end                             ; fewer than 8 usable pixels
    sub           wq, rq                ; round down to a multiple of 8
    mova          m3, [pw_255]
    mova          m4, [pw_128]
    mova          m5, [pw_257]
    .loop:
        pmovzxbw  m0, [sq+xq]           ; 8 overlay samples -> words
        movu      m1, [aq+2*xq]         ; 16 alpha bytes, two per output pixel
        pandn     m2, m3, m1            ; keep high byte: odd alpha << 8
        psllw     m1, 8                 ; even alpha << 8
        pavgw     m2, m1                ; average of the two neighbours
        pavgw     m2, m1                ; averaged once more with the even sample
        psrlw     m2, 8                 ; back to the 0..255 range
        pmovzxbw  m1, [dq+xq]           ; 8 main samples -> words
        pmullw    m0, m2                ; s * a
        pxor      m2, m3                ; 255 - a
        pmullw    m1, m2                ; d * (255 - a)
        paddw     m0, m4                ; + 128
        paddw     m0, m1
        pmulhuw   m0, m5                ; * 257 >> 16, approximates rounded /255
        packuswb  m0, m0
        movq      [dq+xq], m0           ; store 8 blended bytes
        add       xq, mmsize/2
        cmp       xq, wq
        jl .loop

    .end:
    mov    eax, xd                      ; return the processed pixel count
    RET
|
||||
|
||||
;------------------------------------------------------------------------------
; int ff_overlay_row_20(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
;                       int w, ptrdiff_t alinesize)
;
; Blends one row where alpha has a 2x2 block of samples per output pixel.
; The effective alpha is (a[2i] + a[2i+1] + a2[2i] + a2[2i+1]) >> 2, where
; a2 = a + alinesize is the next alpha line; then the same blend as
; overlay_row_44 is applied.
; Only the first w-1 pixels are considered, in groups of 8; the return value
; is the number of pixels written (0 when w-1 < 8). The caller handles the rest.
; The incoming da argument is not read: its register is reused for a2.
;------------------------------------------------------------------------------
INIT_XMM sse4
cglobal overlay_row_20, 6, 7, 7, 0, d, da, s, a, w, r, x
    mov           daq, aq
    add           daq, rmp              ; da = a + alinesize (6th argument)
    xor           xq, xq                ; x = pixels processed so far
    movsxdifnidn  wq, wd                ; widen the int width argument
    sub           wq, 1                 ; never touch the last pixel
    mov           rq, wq                ; r now reused as the remainder
    and           rq, mmsize/2 - 1      ; r = (w - 1) % 8
    cmp           wq, mmsize/2
    jl .end                             ; fewer than 8 usable pixels
    sub           wq, rq                ; round down to a multiple of 8
    mova          m3, [pw_255]
    mova          m4, [pw_128]
    mova          m5, [pw_257]
    mova          m6, [pb_1]
    .loop:
        pmovzxbw  m0, [sq+xq]           ; 8 overlay samples -> words
        movu      m2, [aq+2*xq]         ; 16 alpha bytes, current line
        movu      m1, [daq+2*xq]        ; 16 alpha bytes, next line
        pmaddubsw m2, m6                ; horizontal pair sums, current line
        pmaddubsw m1, m6                ; horizontal pair sums, next line
        paddw     m2, m1                ; sum of the 2x2 block
        psrlw     m2, 2                 ; / 4
        pmovzxbw  m1, [dq+xq]           ; 8 main samples -> words
        pmullw    m0, m2                ; s * a
        pxor      m2, m3                ; 255 - a
        pmullw    m1, m2                ; d * (255 - a)
        paddw     m0, m4                ; + 128
        paddw     m0, m1
        pmulhuw   m0, m5                ; * 257 >> 16, approximates rounded /255
        packuswb  m0, m0
        movq      [dq+xq], m0           ; store 8 blended bytes
        add       xq, mmsize/2
        cmp       xq, wq
        jl .loop

    .end:
    mov    eax, xd                      ; return the processed pixel count
    RET
|
63
libavfilter/x86/vf_overlay_init.c
Normal file
63
libavfilter/x86/vf_overlay_init.c
Normal file
@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Paul B Mahol
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavfilter/vf_overlay.h"
|
||||
|
||||
/*
 * SSE4 row blenders from x86/vf_overlay.asm. All share the signature of
 * OverlayContext.blend_row[] and return the number of pixels they handled.
 */
int ff_overlay_row_44_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
                           int w, ptrdiff_t alinesize);

int ff_overlay_row_20_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
                           int w, ptrdiff_t alinesize);

int ff_overlay_row_22_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a,
                           int w, ptrdiff_t alinesize);
|
||||
|
||||
/**
 * Install SSE4 row blenders into the overlay context where they apply.
 *
 * Nothing is changed unless the CPU reports SSE4, alpha_format is 0 and the
 * main input has no alpha. When those hold, blend_row[0..2] are set by format:
 *   - YUV444 / GBRP: overlay_row_44 for all three planes
 *   - YUV420:        overlay_row_44 for plane 0, overlay_row_20 for planes 1-2
 *   - YUV422:        overlay_row_44 for plane 0, overlay_row_22 for planes 1-2
 * Other formats leave blend_row[] untouched; blend_row[3] is never set here.
 *
 * @param s              context whose blend_row[] entries may be assigned
 * @param format         an enum OverlayFormat value
 * @param alpha_format   must be 0 for the SIMD path to be used
 * @param main_has_alpha must be 0 for the SIMD path to be used
 */
av_cold void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha)
{
    int cpu_flags = av_get_cpu_flags();

    /* One shared precondition instead of repeating it for every format. */
    if (!EXTERNAL_SSE4(cpu_flags) || alpha_format != 0 || main_has_alpha != 0)
        return;

    switch (format) {
    case OVERLAY_FORMAT_YUV444:
    case OVERLAY_FORMAT_GBRP:
        /* Full-resolution planes: alpha maps 1:1 onto every plane. */
        s->blend_row[0] = ff_overlay_row_44_sse4;
        s->blend_row[1] = ff_overlay_row_44_sse4;
        s->blend_row[2] = ff_overlay_row_44_sse4;
        break;
    case OVERLAY_FORMAT_YUV420:
        s->blend_row[0] = ff_overlay_row_44_sse4;
        s->blend_row[1] = ff_overlay_row_20_sse4;
        s->blend_row[2] = ff_overlay_row_20_sse4;
        break;
    case OVERLAY_FORMAT_YUV422:
        s->blend_row[0] = ff_overlay_row_44_sse4;
        s->blend_row[1] = ff_overlay_row_22_sse4;
        s->blend_row[2] = ff_overlay_row_22_sse4;
        break;
    default:
        /* No SIMD row blender for the remaining formats. */
        break;
    }
}
|
Loading…
Reference in New Issue
Block a user