mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
avfilter/vf_v360: x86 SIMD for interpolations
This commit is contained in:
parent
f0d8005ec5
commit
058bbf48c6
113
libavfilter/v360.h
Normal file
113
libavfilter/v360.h
Normal file
@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Copyright (c) 2019 Eugene Lyapustin
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVFILTER_V360_H
|
||||
#define AVFILTER_V360_H
|
||||
#include "avfilter.h"
|
||||
|
||||
/**
 * Identifiers of the supported projection layouts.
 *
 * NB_PROJECTIONS is not a projection: it is the trailing sentinel and
 * therefore equals the number of enumerators declared before it.
 */
enum Projections {
    EQUIRECTANGULAR = 0,
    CUBEMAP_3_2,
    CUBEMAP_6_1,
    EQUIANGULAR,
    FLAT,
    DUAL_FISHEYE,
    BARREL,
    CUBEMAP_1_6,
    NB_PROJECTIONS,
};
|
||||
|
||||
/**
 * Interpolation methods selectable through V360Context.interp.
 *
 * NB_INTERP_METHODS is the trailing count sentinel, not a method.
 */
enum InterpMethod {
    NEAREST = 0,
    BILINEAR,
    BICUBIC,
    LANCZOS,
    NB_INTERP_METHODS,
};
|
||||
|
||||
/**
 * Positions of the six faces, named by row and column.
 *
 * NB_FACES is the trailing count sentinel.
 */
enum Faces {
    TOP_LEFT = 0,
    TOP_MIDDLE,
    TOP_RIGHT,
    BOTTOM_LEFT,
    BOTTOM_MIDDLE,
    BOTTOM_RIGHT,
    NB_FACES,
};
|
||||
|
||||
/**
 * Axis-aligned viewing directions.
 *
 * NB_DIRECTIONS is the trailing count sentinel.
 */
enum Direction {
    RIGHT = 0,      ///< Axis +X
    LEFT,           ///< Axis -X
    UP,             ///< Axis +Y
    DOWN,           ///< Axis -Y
    FRONT,          ///< Axis -Z
    BACK,           ///< Axis +Z
    NB_DIRECTIONS,
};
|
||||
|
||||
/**
 * Rotation steps in multiples of 90 degrees, as encoded in the names.
 *
 * NB_ROTATIONS is the trailing count sentinel.
 */
enum Rotation {
    ROT_0 = 0,
    ROT_90,
    ROT_180,
    ROT_270,
    NB_ROTATIONS,
};
|
||||
|
||||
typedef struct V360Context {
|
||||
const AVClass *class;
|
||||
int in, out;
|
||||
int interp;
|
||||
int width, height;
|
||||
char* in_forder;
|
||||
char* out_forder;
|
||||
char* in_frot;
|
||||
char* out_frot;
|
||||
|
||||
int in_cubemap_face_order[6];
|
||||
int out_cubemap_direction_order[6];
|
||||
int in_cubemap_face_rotation[6];
|
||||
int out_cubemap_face_rotation[6];
|
||||
|
||||
float in_pad, out_pad;
|
||||
|
||||
float yaw, pitch, roll;
|
||||
|
||||
int h_flip, v_flip, d_flip;
|
||||
|
||||
float h_fov, v_fov;
|
||||
float flat_range[3];
|
||||
|
||||
int planewidth[4], planeheight[4];
|
||||
int inplanewidth[4], inplaneheight[4];
|
||||
int nb_planes;
|
||||
|
||||
uint16_t *u[4], *v[4];
|
||||
int16_t *ker[4];
|
||||
|
||||
int (*remap_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
|
||||
|
||||
void (*remap_line)(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
|
||||
const uint16_t *u, const uint16_t *v, const int16_t *ker);
|
||||
} V360Context;
|
||||
|
||||
/**
 * Set s->remap_line to the C line remapper matching s->interp and depth
 * (the 8-bit variant when depth <= 8, the 16-bit one otherwise), then give
 * ff_v360_init_x86() the chance to override it on x86-64 builds.
 */
void ff_v360_init(V360Context *s, int depth);
|
||||
/**
 * Replace s->remap_line with an AVX2 implementation when the CPU reports
 * fast AVX2 and depth <= 8; otherwise s->remap_line is left untouched.
 */
void ff_v360_init_x86(V360Context *s, int depth);
|
||||
|
||||
#endif /* AVFILTER_V360_H */
|
@ -41,88 +41,7 @@
|
||||
#include "formats.h"
|
||||
#include "internal.h"
|
||||
#include "video.h"
|
||||
|
||||
enum Projections {
|
||||
EQUIRECTANGULAR,
|
||||
CUBEMAP_3_2,
|
||||
CUBEMAP_6_1,
|
||||
EQUIANGULAR,
|
||||
FLAT,
|
||||
DUAL_FISHEYE,
|
||||
BARREL,
|
||||
CUBEMAP_1_6,
|
||||
NB_PROJECTIONS,
|
||||
};
|
||||
|
||||
enum InterpMethod {
|
||||
NEAREST,
|
||||
BILINEAR,
|
||||
BICUBIC,
|
||||
LANCZOS,
|
||||
NB_INTERP_METHODS,
|
||||
};
|
||||
|
||||
enum Faces {
|
||||
TOP_LEFT,
|
||||
TOP_MIDDLE,
|
||||
TOP_RIGHT,
|
||||
BOTTOM_LEFT,
|
||||
BOTTOM_MIDDLE,
|
||||
BOTTOM_RIGHT,
|
||||
NB_FACES,
|
||||
};
|
||||
|
||||
enum Direction {
|
||||
RIGHT, ///< Axis +X
|
||||
LEFT, ///< Axis -X
|
||||
UP, ///< Axis +Y
|
||||
DOWN, ///< Axis -Y
|
||||
FRONT, ///< Axis -Z
|
||||
BACK, ///< Axis +Z
|
||||
NB_DIRECTIONS,
|
||||
};
|
||||
|
||||
enum Rotation {
|
||||
ROT_0,
|
||||
ROT_90,
|
||||
ROT_180,
|
||||
ROT_270,
|
||||
NB_ROTATIONS,
|
||||
};
|
||||
|
||||
typedef struct V360Context {
|
||||
const AVClass *class;
|
||||
int in, out;
|
||||
int interp;
|
||||
int width, height;
|
||||
char* in_forder;
|
||||
char* out_forder;
|
||||
char* in_frot;
|
||||
char* out_frot;
|
||||
|
||||
int in_cubemap_face_order[6];
|
||||
int out_cubemap_direction_order[6];
|
||||
int in_cubemap_face_rotation[6];
|
||||
int out_cubemap_face_rotation[6];
|
||||
|
||||
float in_pad, out_pad;
|
||||
|
||||
float yaw, pitch, roll;
|
||||
|
||||
int h_flip, v_flip, d_flip;
|
||||
|
||||
float h_fov, v_fov;
|
||||
float flat_range[3];
|
||||
|
||||
int planewidth[4], planeheight[4];
|
||||
int inplanewidth[4], inplaneheight[4];
|
||||
int nb_planes;
|
||||
|
||||
uint16_t *u[4], *v[4];
|
||||
int16_t *ker[4];
|
||||
|
||||
int (*remap_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
|
||||
} V360Context;
|
||||
#include "v360.h"
|
||||
|
||||
typedef struct ThreadData {
|
||||
AVFrame *in;
|
||||
@ -251,47 +170,22 @@ static int query_formats(AVFilterContext *ctx)
|
||||
return ff_set_common_formats(ctx, fmts_list);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate no-interpolation remapping function with a given pixel depth.
|
||||
*
|
||||
* @param bits number of bits per pixel
|
||||
* @param div number of bytes per pixel
|
||||
*/
|
||||
#define DEFINE_REMAP1(bits, div) \
|
||||
static int remap1_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) \
|
||||
{ \
|
||||
ThreadData *td = (ThreadData*)arg; \
|
||||
const V360Context *s = ctx->priv; \
|
||||
const AVFrame *in = td->in; \
|
||||
AVFrame *out = td->out; \
|
||||
\
|
||||
int plane, x, y; \
|
||||
\
|
||||
for (plane = 0; plane < s->nb_planes; plane++) { \
|
||||
const int in_linesize = in->linesize[plane] / div; \
|
||||
const int out_linesize = out->linesize[plane] / div; \
|
||||
const uint##bits##_t *src = (const uint##bits##_t *)in->data[plane]; \
|
||||
uint##bits##_t *dst = (uint##bits##_t *)out->data[plane]; \
|
||||
const int width = s->planewidth[plane]; \
|
||||
const int height = s->planeheight[plane]; \
|
||||
\
|
||||
const int slice_start = (height * jobnr ) / nb_jobs; \
|
||||
const int slice_end = (height * (jobnr + 1)) / nb_jobs; \
|
||||
\
|
||||
for (y = slice_start; y < slice_end; y++) { \
|
||||
const uint16_t *u = s->u[plane] + y * width; \
|
||||
const uint16_t *v = s->v[plane] + y * width; \
|
||||
uint##bits##_t *d = dst + y * out_linesize; \
|
||||
for (x = 0; x < width; x++) \
|
||||
*d++ = src[v[x] * in_linesize + u[x]]; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
return 0; \
|
||||
#define DEFINE_REMAP1_LINE(bits, div)                                                           \
static void remap1_##bits##bit_line_c(uint8_t *dst, int width, const uint8_t *src,              \
                                      ptrdiff_t in_linesize,                                    \
                                      const uint16_t *u, const uint16_t *v, const int16_t *ker) \
{                                                                                               \
    /* Both buffers arrive as bytes; address them as bits-wide samples. */                      \
    const uint##bits##_t *in_px  = (const uint##bits##_t *)src;                                 \
    uint##bits##_t       *out_px = (uint##bits##_t *)dst;                                       \
    /* Input stride converted from bytes to samples. */                                         \
    const ptrdiff_t stride = in_linesize / div;                                                 \
    int x = 0;                                                                                  \
                                                                                                \
    /* One lookup per output sample; ker is not used without interpolation. */                  \
    while (x < width) {                                                                         \
        out_px[x] = in_px[v[x] * stride + u[x]];                                                \
        x++;                                                                                    \
    }                                                                                           \
}
|
||||
|
||||
DEFINE_REMAP1( 8, 1)
|
||||
DEFINE_REMAP1(16, 2)
|
||||
DEFINE_REMAP1_LINE( 8, 1)
|
||||
DEFINE_REMAP1_LINE(16, 2)
|
||||
|
||||
typedef struct XYRemap {
|
||||
uint16_t u[4][4];
|
||||
@ -304,9 +198,8 @@ typedef struct XYRemap {
|
||||
*
|
||||
* @param ws size of interpolation window
|
||||
* @param bits number of bits per pixel
|
||||
* @param div number of bytes per pixel
|
||||
*/
|
||||
#define DEFINE_REMAP(ws, bits, div) \
|
||||
#define DEFINE_REMAP(ws, bits) \
|
||||
static int remap##ws##_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) \
|
||||
{ \
|
||||
ThreadData *td = (ThreadData*)arg; \
|
||||
@ -314,48 +207,85 @@ static int remap##ws##_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jo
|
||||
const AVFrame *in = td->in; \
|
||||
AVFrame *out = td->out; \
|
||||
\
|
||||
int plane, x, y, i, j; \
|
||||
\
|
||||
for (plane = 0; plane < s->nb_planes; plane++) { \
|
||||
const int in_linesize = in->linesize[plane] / div; \
|
||||
const int out_linesize = out->linesize[plane] / div; \
|
||||
const uint##bits##_t *src = (const uint##bits##_t *)in->data[plane]; \
|
||||
uint##bits##_t *dst = (uint##bits##_t *)out->data[plane]; \
|
||||
for (int plane = 0; plane < s->nb_planes; plane++) { \
|
||||
const int in_linesize = in->linesize[plane]; \
|
||||
const int out_linesize = out->linesize[plane]; \
|
||||
const uint8_t *src = in->data[plane]; \
|
||||
uint8_t *dst = out->data[plane]; \
|
||||
const int width = s->planewidth[plane]; \
|
||||
const int height = s->planeheight[plane]; \
|
||||
\
|
||||
const int slice_start = (height * jobnr ) / nb_jobs; \
|
||||
const int slice_end = (height * (jobnr + 1)) / nb_jobs; \
|
||||
\
|
||||
for (y = slice_start; y < slice_end; y++) { \
|
||||
uint##bits##_t *d = dst + y * out_linesize; \
|
||||
for (int y = slice_start; y < slice_end; y++) { \
|
||||
const uint16_t *u = s->u[plane] + y * width * ws * ws; \
|
||||
const uint16_t *v = s->v[plane] + y * width * ws * ws; \
|
||||
const int16_t *ker = s->ker[plane] + y * width * ws * ws; \
|
||||
for (x = 0; x < width; x++) { \
|
||||
const uint16_t *uu = u + x * ws * ws; \
|
||||
const uint16_t *vv = v + x * ws * ws; \
|
||||
const int16_t *kker = ker + x * ws * ws; \
|
||||
int tmp = 0; \
|
||||
\
|
||||
for (i = 0; i < ws; i++) { \
|
||||
for (j = 0; j < ws; j++) { \
|
||||
tmp += kker[i * ws + j] * src[vv[i * ws + j] * in_linesize + uu[i * ws + j]]; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
*d++ = av_clip_uint##bits(tmp >> (15 - ws)); \
|
||||
} \
|
||||
s->remap_line(dst + y * out_linesize, width, src, in_linesize, u, v, ker); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
return 0; \
|
||||
}
|
||||
|
||||
DEFINE_REMAP(2, 8, 1)
|
||||
DEFINE_REMAP(4, 8, 1)
|
||||
DEFINE_REMAP(2, 16, 2)
|
||||
DEFINE_REMAP(4, 16, 2)
|
||||
DEFINE_REMAP(1, 8)
|
||||
DEFINE_REMAP(2, 8)
|
||||
DEFINE_REMAP(4, 8)
|
||||
DEFINE_REMAP(1, 16)
|
||||
DEFINE_REMAP(2, 16)
|
||||
DEFINE_REMAP(4, 16)
|
||||
|
||||
/**
 * Generate a C line remapper using a ws x ws interpolation window.
 *
 * For every output sample the generated function accumulates ws*ws source
 * samples, addressed through u (column) and v (row) and weighted by ker,
 * then shifts the sum right by 14 and clips it to the sample range.
 *
 * @param ws   size of interpolation window
 * @param bits number of bits per sample (selects uintN_t and av_clip_uintN)
 * @param div  number of bytes per sample, used to convert the byte stride
 */
#define DEFINE_REMAP_LINE(ws, bits, div)                                                           \
static void remap##ws##_##bits##bit_line_c(uint8_t *dst, int width, const uint8_t *src,            \
                                           ptrdiff_t in_linesize,                                  \
                                           const uint16_t *u, const uint16_t *v, const int16_t *ker) \
{                                                                                                  \
    const uint##bits##_t *in_px  = (const uint##bits##_t *)src;                                    \
    uint##bits##_t       *out_px = (uint##bits##_t *)dst;                                          \
    /* Input stride converted from bytes to samples. */                                            \
    const ptrdiff_t stride = in_linesize / div;                                                    \
    /* Number of table entries consumed per output sample. */                                      \
    const int taps = ws * ws;                                                                      \
                                                                                                   \
    for (int x = 0; x < width; x++) {                                                              \
        const uint16_t *tap_u = u   + x * taps;                                                    \
        const uint16_t *tap_v = v   + x * taps;                                                    \
        const int16_t  *tap_w = ker + x * taps;                                                    \
        int acc = 0;                                                                               \
                                                                                                   \
        /* Row-major walk over the window, same order as a nested i/j loop. */                     \
        for (int k = 0; k < taps; k++)                                                             \
            acc += tap_w[k] * in_px[tap_v[k] * stride + tap_u[k]];                                 \
                                                                                                   \
        out_px[x] = av_clip_uint##bits(acc >> 14);                                                 \
    }                                                                                              \
}
|
||||
|
||||
DEFINE_REMAP_LINE(2, 8, 1)
|
||||
DEFINE_REMAP_LINE(4, 8, 1)
|
||||
DEFINE_REMAP_LINE(2, 16, 2)
|
||||
DEFINE_REMAP_LINE(4, 16, 2)
|
||||
|
||||
/**
 * Select the line-remapping routine for the configured interpolation.
 *
 * The C implementation is chosen from s->interp and the bit depth (8-bit
 * variant when depth <= 8, 16-bit variant otherwise). On x86-64 builds
 * ff_v360_init_x86() is then called and may replace the choice.
 *
 * @param s     filter context; s->interp is read, s->remap_line is written
 * @param depth bit depth of the pixel format
 */
void ff_v360_init(V360Context *s, int depth)
{
    switch (s->interp) {
    case NEAREST:
        s->remap_line = depth <= 8 ? remap1_8bit_line_c : remap1_16bit_line_c;
        break;
    case BILINEAR:
        s->remap_line = depth <= 8 ? remap2_8bit_line_c : remap2_16bit_line_c;
        break;
    case BICUBIC:
    case LANCZOS:
        /* Both methods use a 4x4 window and differ only in the weights. */
        s->remap_line = depth <= 8 ? remap4_8bit_line_c : remap4_16bit_line_c;
        break;
    default:
        /* An unknown method would leave remap_line unset; fail loudly instead. */
        av_assert0(0);
    }

    if (ARCH_X86_64)
        ff_v360_init_x86(s, depth);
}
|
||||
|
||||
/**
|
||||
* Save nearest pixel coordinates for remapping.
|
||||
@ -399,10 +329,10 @@ static void bilinear_kernel(float du, float dv, const XYRemap *r_tmp,
|
||||
}
|
||||
}
|
||||
|
||||
ker[0] = (1.f - du) * (1.f - dv) * 8192;
|
||||
ker[1] = du * (1.f - dv) * 8192;
|
||||
ker[2] = (1.f - du) * dv * 8192;
|
||||
ker[3] = du * dv * 8192;
|
||||
ker[0] = (1.f - du) * (1.f - dv) * 16384;
|
||||
ker[1] = du * (1.f - dv) * 16384;
|
||||
ker[2] = (1.f - du) * dv * 16384;
|
||||
ker[3] = du * dv * 16384;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -446,7 +376,7 @@ static void bicubic_kernel(float du, float dv, const XYRemap *r_tmp,
|
||||
for (j = 0; j < 4; j++) {
|
||||
u[i * 4 + j] = r_tmp->u[i][j];
|
||||
v[i * 4 + j] = r_tmp->v[i][j];
|
||||
ker[i * 4 + j] = du_coeffs[j] * dv_coeffs[i] * 2048;
|
||||
ker[i * 4 + j] = du_coeffs[j] * dv_coeffs[i] * 16384;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -501,7 +431,7 @@ static void lanczos_kernel(float du, float dv, const XYRemap *r_tmp,
|
||||
for (j = 0; j < 4; j++) {
|
||||
u[i * 4 + j] = r_tmp->u[i][j];
|
||||
v[i * 4 + j] = r_tmp->v[i][j];
|
||||
ker[i * 4 + j] = du_coeffs[j] * dv_coeffs[i] * 2048;
|
||||
ker[i * 4 + j] = du_coeffs[j] * dv_coeffs[i] * 16384;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2038,6 +1968,8 @@ static int config_output(AVFilterLink *outlink)
|
||||
av_assert0(0);
|
||||
}
|
||||
|
||||
ff_v360_init(s, depth);
|
||||
|
||||
switch (s->in) {
|
||||
case EQUIRECTANGULAR:
|
||||
in_transform = xyz_to_equirect;
|
||||
|
@ -31,6 +31,7 @@ OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend_init.o
|
||||
OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold_init.o
|
||||
OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o
|
||||
OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o
|
||||
OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360_init.o
|
||||
OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o
|
||||
OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o
|
||||
|
||||
@ -66,5 +67,6 @@ X86ASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o
|
||||
X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold.o
|
||||
X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o
|
||||
X86ASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o
|
||||
X86ASM-OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360.o
|
||||
X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif.o
|
||||
X86ASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
|
||||
|
142
libavfilter/x86/vf_v360.asm
Normal file
142
libavfilter/x86/vf_v360.asm
Normal file
@ -0,0 +1,142 @@
|
||||
;*****************************************************************************
|
||||
;* x86-optimized functions for v360 filter
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
pb_mask: db 0,4,8,12,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
|
||||
pd_255: times 4 dd 255
|
||||
|
||||
SECTION .text
|
||||
|
||||
; void ff_remap1_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
|
||||
; const uint16_t *u, const uint16_t *v, const int16_t *ker);
|
||||
|
||||
; Nearest-neighbour remap of one 8-bit line.
; Each iteration gathers src[v[x] * in_linesize + u[x]] for mmsize / 4
; consecutive output pixels and stores them as two 4-byte groups.
; NOTE(review): whole groups are always loaded and stored, so a width that is
; not a multiple of the group size touches memory past width - confirm the
; buffers are padded accordingly.
INIT_YMM avx2
cglobal remap1_8bit_line, 6, 7, 6, dst, width, src, in_linesize, u, v, x
    movsxdifnidn widthq, widthd
    xor             xq, xq
    movd           xm0, in_linesized
    pcmpeqw         m4, m4                  ; all-ones gather mask template
    VBROADCASTI128  m3, [pb_mask]           ; picks byte 0 of every dword
    vpbroadcastd    m0, xm0                 ; in_linesize in every dword

    .loop:
        ; u and v are uint16_t tables: zero-extend so that coordinates
        ; >= 32768 do not turn into negative offsets.
        pmovzxwd        m1, [vq + xq * 2]
        pmovzxwd        m2, [uq + xq * 2]

        pmulld          m1, m0              ; v * in_linesize
        paddd           m1, m2              ; + u
        mova            m2, m4              ; the gather clears its mask, reload it
        vpgatherdd      m5, [srcq + m1], m2
        pshufb          m1, m5, m3          ; keep the low byte of each gathered dword
        vextracti128   xm2, m1, 1
        movd     [dstq+xq], xm1
        movd   [dstq+xq+4], xm2

        add             xq, mmsize / 4
        cmp             xq, widthq
        jl .loop
    RET
|
||||
|
||||
; 2x2-window remap of one 8-bit line.
; Every output pixel owns four consecutive entries in u, v and ker
; (8 bytes each), so one 16-byte load covers two pixels: one per 128-bit lane.
; The weighted samples are summed per lane and shifted right by 14.
; NOTE(review): two pixels are processed per iteration, so an odd width
; touches one element past width - confirm the buffers allow that.
INIT_YMM avx2
cglobal remap2_8bit_line, 7, 8, 8, dst, width, src, in_linesize, u, v, ker, x
    movsxdifnidn widthq, widthd
    xor             xq, xq
    movd           xm0, in_linesized
    pcmpeqw         m7, m7                  ; all-ones gather mask template
    vpbroadcastd    m0, xm0                 ; in_linesize in every dword
    vpbroadcastd    m6, [pd_255]            ; keeps the low byte of each dword

    .loop:
        pmovsxwd        m1, [kerq + xq * 8] ; weights are signed int16_t
        ; u and v are uint16_t tables: zero-extend so that coordinates
        ; >= 32768 do not turn into negative offsets.
        pmovzxwd        m2, [vq + xq * 8]
        pmovzxwd        m3, [uq + xq * 8]

        pmulld          m4, m2, m0          ; v * in_linesize
        paddd           m4, m3              ; + u
        mova            m3, m7              ; the gather clears its mask, reload it
        vpgatherdd      m2, [srcq + m4], m3
        pand            m2, m6              ; isolate the 8-bit sample
        pmulld          m2, m1              ; sample * weight
        phaddd          m2, m2
        phaddd          m1, m2, m2          ; per-lane sum of the four taps
        psrld           m1, m1, 0xe
        vextracti128   xm2, m1, 1

        pextrb   [dstq+xq], xm1, 0
        pextrb [dstq+xq+1], xm2, 0

        add             xq, mmsize / 16
        cmp             xq, widthq
        jl .loop
    RET
|
||||
|
||||
; 4x4-window remap of one 8-bit line, one output pixel per iteration.
; Every output pixel owns sixteen consecutive entries in u, v and ker
; (32 bytes each); they are loaded as two groups of eight. y is the running
; byte offset into the three tables, x the output pixel index.
INIT_YMM avx2
cglobal remap4_8bit_line, 7, 9, 11, dst, width, src, in_linesize, u, v, ker, x, y
    movsxdifnidn widthq, widthd
    xor             yq, yq
    xor             xq, xq
    movd           xm0, in_linesized
    pcmpeqw         m7, m7                  ; all-ones gather mask template
    vpbroadcastd    m0, xm0                 ; in_linesize in every dword
    vpbroadcastd    m6, [pd_255]            ; keeps the low byte of each dword

    .loop:
        pmovsxwd        m1, [kerq + yq]     ; weights are signed int16_t
        pmovsxwd        m5, [kerq + yq + 16]
        ; u and v are uint16_t tables: zero-extend so that coordinates
        ; >= 32768 do not turn into negative offsets.
        pmovzxwd        m2, [vq + yq]
        pmovzxwd        m8, [vq + yq + 16]
        pmovzxwd        m3, [uq + yq]
        pmovzxwd        m9, [uq + yq + 16]

        pmulld          m4, m2, m0          ; v * in_linesize (taps 0..7)
        pmulld         m10, m8, m0          ; v * in_linesize (taps 8..15)
        paddd           m4, m3              ; + u
        paddd          m10, m9
        mova            m3, m7              ; the gather clears its mask, reload it
        vpgatherdd      m2, [srcq + m4], m3
        mova            m3, m7
        vpgatherdd      m4, [srcq + m10], m3
        pand            m2, m6              ; isolate the 8-bit samples
        pand            m4, m6
        pmulld          m2, m1              ; sample * weight
        pmulld          m4, m5

        paddd           m2, m4              ; fold the two groups of eight taps
        vextracti128   xm1, m2, 1
        paddd           m1, m2              ; fold the upper lane into the lower one
        phaddd          m1, m1
        phaddd          m1, m1              ; element 0 now holds the full sum
        psrld           m1, m1, 0xe
        packuswb        m1, m1              ; saturate to the 8-bit range

        pextrb   [dstq+xq], xm1, 0

        add             xq, 1
        add             yq, 32
        cmp             xq, widthq
        jl .loop
    RET
|
||||
|
||||
%endif
|
50
libavfilter/x86/vf_v360_init.c
Normal file
50
libavfilter/x86/vf_v360_init.c
Normal file
@ -0,0 +1,50 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavfilter/v360.h"
|
||||
|
||||
/*
 * AVX2 line remappers for 8-bit samples. All three share the signature of
 * V360Context.remap_line; the digit in the name is the interpolation window
 * size (1 = nearest, 2 = 2x2, 4 = 4x4).
 */
void ff_remap1_8bit_line_avx2(uint8_t *dst, int width,
                              const uint8_t *src, ptrdiff_t in_linesize,
                              const uint16_t *u, const uint16_t *v,
                              const int16_t *ker);

void ff_remap2_8bit_line_avx2(uint8_t *dst, int width,
                              const uint8_t *src, ptrdiff_t in_linesize,
                              const uint16_t *u, const uint16_t *v,
                              const int16_t *ker);

void ff_remap4_8bit_line_avx2(uint8_t *dst, int width,
                              const uint8_t *src, ptrdiff_t in_linesize,
                              const uint16_t *u, const uint16_t *v,
                              const int16_t *ker);
|
||||
|
||||
/**
 * Install the AVX2 line remapper matching s->interp.
 *
 * Nothing is changed unless the build targets x86-64, the CPU reports fast
 * AVX2 and depth <= 8; in every other case s->remap_line keeps the value
 * the caller already stored.
 *
 * @param s     filter context; s->interp is read, s->remap_line may be written
 * @param depth bit depth of the pixel format
 */
av_cold void ff_v360_init_x86(V360Context *s, int depth)
{
#if ARCH_X86_64
    const int cpu_flags = av_get_cpu_flags();

    /* Only 8-bit AVX2 kernels exist, so a single guard covers all of them. */
    if (EXTERNAL_AVX2_FAST(cpu_flags) && depth <= 8) {
        switch (s->interp) {
        case NEAREST:
            s->remap_line = ff_remap1_8bit_line_avx2;
            break;
        case BILINEAR:
            s->remap_line = ff_remap2_8bit_line_avx2;
            break;
        case BICUBIC:
        case LANCZOS:
            s->remap_line = ff_remap4_8bit_line_avx2;
            break;
        default:
            break;
        }
    }
#endif
}
|
Loading…
Reference in New Issue
Block a user