mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-26 19:01:44 +02:00
av_filter/x86/idet: MMX/SSE2 implementation of 16bits filter_line()
Tested on http://ps-auxw.de/10bit-h264-sample/10bit-eldorado.mkv — MMX: ~30% faster decoding overall; SSE2: ~40% faster. Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
881f96c4c2
commit
e3fd6a3a4e
@ -61,7 +61,7 @@ int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c,
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w)
|
int ff_idet_filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w)
|
||||||
{
|
{
|
||||||
int x;
|
int x;
|
||||||
int ret=0;
|
int ret=0;
|
||||||
@ -169,8 +169,11 @@ static int filter_frame(AVFilterLink *link, AVFrame *picref)
|
|||||||
|
|
||||||
if (!idet->csp)
|
if (!idet->csp)
|
||||||
idet->csp = av_pix_fmt_desc_get(link->format);
|
idet->csp = av_pix_fmt_desc_get(link->format);
|
||||||
if (idet->csp->comp[0].depth_minus1 / 8 == 1)
|
if (idet->csp->comp[0].depth_minus1 / 8 == 1){
|
||||||
idet->filter_line = (void*)filter_line_c_16bit;
|
idet->filter_line = (ff_idet_filter_func)ff_idet_filter_line_c_16bit;
|
||||||
|
if (ARCH_X86)
|
||||||
|
ff_idet_init_x86(idet, 1);
|
||||||
|
}
|
||||||
|
|
||||||
filter(ctx);
|
filter(ctx);
|
||||||
|
|
||||||
@ -245,7 +248,7 @@ static av_cold int init(AVFilterContext *ctx)
|
|||||||
idet->filter_line = ff_idet_filter_line_c;
|
idet->filter_line = ff_idet_filter_line_c;
|
||||||
|
|
||||||
if (ARCH_X86)
|
if (ARCH_X86)
|
||||||
ff_idet_init_x86(idet);
|
ff_idet_init_x86(idet, 0);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -24,6 +24,8 @@
|
|||||||
|
|
||||||
#define HIST_SIZE 4
|
#define HIST_SIZE 4
|
||||||
|
|
||||||
|
/* Signature of the per-line filter callback stored in IDETContext.filter_line:
 * three line pointers plus a width, returning an int.
 * The 16-bit implementations take uint16_t pointers and are cast to this type
 * when they are assigned to filter_line. */
typedef int (*ff_idet_filter_func)(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w);
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
TFF,
|
TFF,
|
||||||
BFF,
|
BFF,
|
||||||
@ -45,14 +47,15 @@ typedef struct {
|
|||||||
AVFrame *cur;
|
AVFrame *cur;
|
||||||
AVFrame *next;
|
AVFrame *next;
|
||||||
AVFrame *prev;
|
AVFrame *prev;
|
||||||
int (*filter_line)(const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w);
|
ff_idet_filter_func filter_line;
|
||||||
|
|
||||||
const AVPixFmtDescriptor *csp;
|
const AVPixFmtDescriptor *csp;
|
||||||
} IDETContext;
|
} IDETContext;
|
||||||
|
|
||||||
void ff_idet_init_x86(IDETContext *idet);
|
void ff_idet_init_x86(IDETContext *idet, int for_16b);
|
||||||
|
|
||||||
/* main fall-back for left-over */
|
/* main fall-back for left-over */
|
||||||
int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w);
|
int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w);
|
||||||
|
int ff_idet_filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -25,8 +25,6 @@
|
|||||||
|
|
||||||
SECTION_TEXT
|
SECTION_TEXT
|
||||||
|
|
||||||
%if ARCH_X86_32
|
|
||||||
|
|
||||||
; Implementation that does 8-bytes at a time using single-word operations.
|
; Implementation that does 8-bytes at a time using single-word operations.
|
||||||
%macro IDET_FILTER_LINE 1
|
%macro IDET_FILTER_LINE 1
|
||||||
INIT_MMX %1
|
INIT_MMX %1
|
||||||
@ -78,11 +76,79 @@ cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index
|
|||||||
RET
|
RET
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
|
%if ARCH_X86_32
|
||||||
IDET_FILTER_LINE mmxext
|
IDET_FILTER_LINE mmxext
|
||||||
IDET_FILTER_LINE mmx
|
IDET_FILTER_LINE mmx
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
|
;******************************************************************************
|
||||||
|
; 16bit implementation that does 4/8-pixels at a time
|
||||||
|
|
||||||
|
; PABS_DIFF_WD: word-wise absolute difference of two unsigned 16-bit vectors,
; widened to dwords and pairwise-added.
; On exit each dword lane of %1 holds the sum of two |a - b| word results
; (one from the low half of the vector, one from the high half).
; Requires m_zero to be defined as an all-zero register by the caller.
%macro PABS_DIFF_WD 3 ; %1=a (in/out), %2=b (clobbered), %3=scratch; output in %1
|
||||||
|
psubusw %3, %2, %1 ; %3 = b - a, unsigned-saturated (3-operand form, presumably expanded by x86inc)
|
||||||
|
psubusw %1, %2 ; %1 = a - b, unsigned-saturated
|
||||||
|
por %1, %3 ; per lane one of the two differences is 0, so the OR is |a - b|
|
||||||
|
|
||||||
|
mova %2, %1 ; copy so low and high halves can be widened separately
|
||||||
|
punpcklwd %1, m_zero ; zero-extend the low-half words to dwords
|
||||||
|
punpckhwd %2, m_zero ; zero-extend the high-half words to dwords
|
||||||
|
paddd %1, %2 ; add the two widened halves
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; IDET_FILTER_LINE_16BIT: emits idet_filter_line_16bit for the register set
; selected by the preceding INIT_MMX / INIT_XMM.
;
; For every 16-bit sample it accumulates |ab - bc| + |ba - cb|, where ab, ba,
; bc and cb are the unsigned-saturated differences A-B, B-A, B-C and C-B.
; Case analysis on the signs shows this equals |A - 2*B + C| per sample.
; The 32-bit total is returned in eax.
;
; %1 is the number of words consumed per iteration (4 with MMX registers,
; 8 with XMM registers, matching the instantiations that follow the macro).
; The loop is bottom-tested and advances by %1, so it always processes at
; least one full vector and only whole vectors: callers are expected to pass
; a positive width that is a multiple of %1.
%macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words)
|
||||||
|
cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
|
||||||
|
xor indexq, indexq ; index = 0, counted in 16-bit words (addresses scale it by 2)
|
||||||
|
%define m_zero m1
|
||||||
|
%define m_sum m0
|
||||||
|
pxor m_sum, m_sum ; running per-lane dword sums = 0
|
||||||
|
pxor m_zero, m_zero ; all-zero register used by PABS_DIFF_WD for word->dword widening
|
||||||
|
|
||||||
|
.loop_16bit:
|
||||||
|
movu m2, [bq + indexq * 2] ; B
|
||||||
|
movu m3, [aq + indexq * 2] ; A
|
||||||
|
mova m6, m2 ; second copy of B, consumed by the B-C subtraction
|
||||||
|
psubusw m5, m2, m3 ; ba
|
||||||
|
|
||||||
|
movu m4, [cq + indexq * 2] ; C
|
||||||
|
add indexq, %1 ; advance by one vector's worth of words
|
||||||
|
psubusw m3, m2 ; ab
|
||||||
|
CMP indexd, widthd ; flags are consumed by the jl at the loop bottom; only SIMD ops (which leave EFLAGS alone) sit in between
|
||||||
|
|
||||||
|
psubusw m6, m4 ; bc
|
||||||
|
psubusw m4, m2 ; cb
|
||||||
|
|
||||||
|
PABS_DIFF_WD m3, m6, m7 ; |ab - bc|
|
||||||
|
PABS_DIFF_WD m5, m4, m7 ; |ba - cb|
|
||||||
|
paddd m_sum, m3 ; accumulate widened |ab - bc|
|
||||||
|
paddd m_sum, m5 ; accumulate widened |ba - cb|
|
||||||
|
jl .loop_16bit ; repeat while index < width (signed 32-bit compare)
|
||||||
|
|
||||||
|
mova m2, m_sum ; horizontal add: fold every dword lane into lane 0
|
||||||
|
%if mmsize == 16
|
||||||
|
psrldq m2, 4 ; lane 1 -> lane 0
|
||||||
|
paddd m_sum, m2
|
||||||
|
psrldq m2, 4 ; lane 2 -> lane 0
|
||||||
|
paddd m_sum, m2
|
||||||
|
psrldq m2, 4 ; lane 3 -> lane 0
|
||||||
|
paddd m_sum, m2
|
||||||
|
%else
|
||||||
|
psrlq m2, 32 ; 64-bit register: upper dword -> lower dword
|
||||||
|
paddd m_sum, m2
|
||||||
|
%endif
|
||||||
|
movd eax, m_sum ; return value = low dword of the folded sum
|
||||||
|
RET
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
INIT_XMM sse2
|
||||||
|
IDET_FILTER_LINE_16BIT 8
|
||||||
|
%if ARCH_X86_32
|
||||||
|
INIT_MMX mmx
|
||||||
|
IDET_FILTER_LINE_16BIT 4
|
||||||
|
%endif
|
||||||
|
|
||||||
|
;******************************************************************************
|
||||||
; SSE2 8-bit implementation that does 16-bytes at a time:
|
; SSE2 8-bit implementation that does 16-bytes at a time:
|
||||||
|
|
||||||
INIT_XMM sse2
|
INIT_XMM sse2
|
||||||
cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
|
cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
|
||||||
xor indexq, indexq
|
xor indexq, indexq
|
||||||
|
@ -23,6 +23,8 @@
|
|||||||
#include "libavutil/x86/cpu.h"
|
#include "libavutil/x86/cpu.h"
|
||||||
#include "libavfilter/vf_idet.h"
|
#include "libavfilter/vf_idet.h"
|
||||||
|
|
||||||
|
#if HAVE_YASM
|
||||||
|
|
||||||
/* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */
|
/* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */
|
||||||
#define FUNC_MAIN_DECL(KIND, SPAN) \
|
#define FUNC_MAIN_DECL(KIND, SPAN) \
|
||||||
int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \
|
int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \
|
||||||
@ -39,32 +41,47 @@ static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \
|
|||||||
return sum; \
|
return sum; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#if HAVE_YASM
|
|
||||||
|
/* Declares the external 16-bit SIMD routine ff_idet_filter_line_16bit_<KIND>()
 * and defines a static wrapper idet_filter_line_16bit_<KIND>() around it.
 *
 * SPAN is the number of 16-bit samples the SIMD routine consumes per
 * iteration; the `w & (SPAN - 1)` remainder computation requires SPAN to be a
 * power of two. The wrapper passes the largest multiple of SPAN (if non-zero)
 * to the SIMD routine and the remaining `left_over` samples, starting at
 * element offset w, to ff_idet_filter_line_c_16bit(); it returns the sum of
 * both results. */
#define FUNC_MAIN_DECL_16bit(KIND, SPAN) \
|
||||||
|
int ff_idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \
|
||||||
|
const uint16_t *c, int w); \
|
||||||
|
static int idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \
|
||||||
|
const uint16_t *c, int w) { \
|
||||||
|
int sum = 0; \
|
||||||
|
const int left_over = w & (SPAN - 1); \
|
||||||
|
w -= left_over; \
|
||||||
|
if (w > 0) \
|
||||||
|
sum += ff_idet_filter_line_16bit_##KIND(a, b, c, w); \
|
||||||
|
if (left_over > 0) \
|
||||||
|
sum += ff_idet_filter_line_c_16bit(a + w, b + w, c + w, left_over); \
|
||||||
|
return sum; \
|
||||||
|
}
|
||||||
|
|
||||||
FUNC_MAIN_DECL(sse2, 16)
|
FUNC_MAIN_DECL(sse2, 16)
|
||||||
|
FUNC_MAIN_DECL_16bit(sse2, 8)
|
||||||
#if ARCH_X86_32
|
#if ARCH_X86_32
|
||||||
FUNC_MAIN_DECL(mmx, 8)
|
FUNC_MAIN_DECL(mmx, 8)
|
||||||
FUNC_MAIN_DECL(mmxext, 8)
|
FUNC_MAIN_DECL(mmxext, 8)
|
||||||
|
FUNC_MAIN_DECL_16bit(mmx, 4)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
/* Installs an x86 SIMD implementation into idet->filter_line according to the
 * runtime CPU flags; does nothing unless HAVE_YASM is set.
 * A non-zero for_16b selects the 16-bit wrappers (cast to ff_idet_filter_func),
 * otherwise the 8-bit ones. The checks run MMX, MMXEXT (both 32-bit x86 only),
 * then SSE2, each overwriting the previous choice, so the last matching
 * extension wins. The MMXEXT branch reuses the MMX 16-bit routine. */
av_cold void ff_idet_init_x86(IDETContext *idet, int for_16b)
|
||||||
av_cold void ff_idet_init_x86(IDETContext *idet)
|
|
||||||
{
|
{
|
||||||
#if HAVE_YASM
|
#if HAVE_YASM
|
||||||
const int cpu_flags = av_get_cpu_flags();
|
const int cpu_flags = av_get_cpu_flags();
|
||||||
|
|
||||||
#if ARCH_X86_32
|
#if ARCH_X86_32
|
||||||
if (EXTERNAL_MMX(cpu_flags)) {
|
if (EXTERNAL_MMX(cpu_flags)) {
|
||||||
idet->filter_line = idet_filter_line_mmx;
|
idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmx;
|
||||||
}
|
}
|
||||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||||
idet->filter_line = idet_filter_line_mmxext;
|
idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmxext;
|
||||||
}
|
}
|
||||||
#endif // ARCH_x86_32
|
#endif // ARCH_x86_32
|
||||||
|
|
||||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||||
idet->filter_line = idet_filter_line_sse2;
|
idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_sse2 : idet_filter_line_sse2;
|
||||||
}
|
}
|
||||||
#endif // HAVE_YASM
|
#endif // HAVE_YASM
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user