1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-11-21 10:55:51 +02:00

avcodec/x86: add an 8-bit simple IDCT function based on the x86-64 high depth functions

Includes add/put functions

Rounding contributed by Ronald S. Bultje
This commit is contained in:
James Darnley 2017-06-02 15:20:19 +02:00
parent 8b19467d07
commit d7246ea9f2
4 changed files with 126 additions and 0 deletions

View File

@ -88,10 +88,12 @@ static const struct algo idct_tab_arch[] = {
#if HAVE_X86ASM
#if ARCH_X86_64
#if HAVE_SSE2_EXTERNAL
{ "SIMPLE8-SSE2", ff_simple_idct8_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
{ "SIMPLE10-SSE2", ff_simple_idct10_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
{ "SIMPLE12-SSE2", ff_simple_idct12_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
#endif
#if HAVE_AVX_EXTERNAL
{ "SIMPLE8-AVX", ff_simple_idct8_avx, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
{ "SIMPLE10-AVX", ff_simple_idct10_avx, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
{ "SIMPLE12-AVX", ff_simple_idct12_avx, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX, 1 },
#endif

View File

@ -94,9 +94,32 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
c->idct_add = ff_simple_idct_add_sse2;
c->perm_type = FF_IDCT_PERM_SIMPLE;
}
if (ARCH_X86_64 &&
!high_bit_depth &&
avctx->lowres == 0 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
c->idct = ff_simple_idct8_sse2;
c->idct_put = ff_simple_idct8_put_sse2;
c->idct_add = ff_simple_idct8_add_sse2;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
}
if (ARCH_X86_64 && avctx->lowres == 0) {
if (EXTERNAL_AVX(cpu_flags) &&
!high_bit_depth &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
c->idct = ff_simple_idct8_avx;
c->idct_put = ff_simple_idct8_put_avx;
c->idct_add = ff_simple_idct8_add_avx;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
if (avctx->bits_per_raw_sample == 10 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||

View File

@ -29,6 +29,15 @@ void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
/*
 * 8-bit simple IDCT entry points, in SSE2 and AVX flavours.
 * The plain variant takes only the coefficient block; the put/add variants
 * additionally take a destination pointer and a line size.
 * NOTE(review): the bodies are not C code in this file; they are presumably
 * the x86 assembly functions of the same names -- confirm against the .asm.
 */
void ff_simple_idct8_sse2(int16_t *block);
void ff_simple_idct8_avx(int16_t *block);
void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct8_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct8_add_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct10_sse2(int16_t *block);
void ff_simple_idct10_avx(int16_t *block);

View File

@ -31,11 +31,14 @@ SECTION_RODATA
cextern pw_2
cextern pw_16
cextern pw_32
cextern pw_1023
cextern pw_4095
; Rounding constants: pd_round_N is four dwords, each equal to 1 << (N-1),
; i.e. half of 1 << N.
; NOTE(review): presumably added before a right shift by N so that the shift
; rounds to nearest; the code that consumes these labels is outside this
; view -- confirm.
pd_round_11: times 4 dd 1<<(11-1)
pd_round_12: times 4 dd 1<<(12-1)
pd_round_15: times 4 dd 1<<(15-1)
pd_round_19: times 4 dd 1<<(19-1)
pd_round_20: times 4 dd 1<<(20-1)
%macro CONST_DEC 3
const %1
@ -77,8 +80,97 @@ CONST_DEC w3_min_w7_lo, W3sh2_lo, -W7sh2
SECTION .text
; STORE_HI_LO dst0..dst7, src0..src3
;   %1-%8  : eight destination operands (the callers pass row addresses)
;   %9-%12 : four source vector registers
; Each source register supplies two destinations: its low 64 bits go to the
; odd-numbered destination (%1, %3, %5, %7) and its high 64 bits go to the
; following even-numbered one (%2, %4, %6, %8).
%macro STORE_HI_LO 12
; low quadword of each source
movq %1, %9
movq %3, %10
movq %5, %11
movq %7, %12
; high quadword of each source
movhps %2, %9
movhps %4, %10
movhps %6, %11
movhps %8, %12
%endmacro
; LOAD_ZXBW_8 dst0..dst7, src0..src7
;   %1-%8  : eight destination vector registers
;   %9-%16 : eight source operands (the caller passes row addresses)
; Loads eight bytes from each source and zero-extends them to eight words in
; the matching destination. pmovzxbw is an SSE4.1 instruction, which is why
; the only use of this macro sits inside an %if cpuflag(sse4) block.
%macro LOAD_ZXBW_8 16
pmovzxbw %1, %9
pmovzxbw %2, %10
pmovzxbw %3, %11
pmovzxbw %4, %12
pmovzxbw %5, %13
pmovzxbw %6, %14
pmovzxbw %7, %15
pmovzxbw %8, %16
%endmacro
; LOAD_ZXBW_4 dst0..dst3, src0..src3, zero
;   %1-%4 : four destination vector registers
;   %5-%8 : four source operands (the callers pass row addresses)
;   %9    : register interleaved into the high byte of every word; the
;           callers clear it with pxor first, which makes the result a
;           byte-to-word zero extension
; This is the fallback used when pmovzxbw is not available.
; NOTE(review): movh is not defined in this view; presumably the x86inc
; alias that loads the low half of a vector register -- confirm.
%macro LOAD_ZXBW_4 9
; load the packed bytes of each row
movh %1, %5
movh %2, %6
movh %3, %7
movh %4, %8
; widen bytes to words by interleaving with %9
punpcklbw %1, %9
punpcklbw %2, %9
punpcklbw %3, %9
punpcklbw %4, %9
%endmacro
; PASS4ROWS(base, stride, stride3) expands to four comma-separated memory
; operands: [base], [base + stride], [base + 2*stride] and [base + stride3].
; The callers in this file pass a register holding 3*stride as stride3, so
; the operands address four consecutive rows.
%define PASS4ROWS(base, stride, stride3) \
[base], [base + stride], [base + 2*stride], [base + stride3]
%macro idct_fn 0
define_constants _lo
; simple_idct8(block): IDCT of a coefficient block with no pixel output.
; NOTE(review): IDCT_FN is defined outside this view. The trailing "store"
; argument is not passed by the put/add variants below, so it presumably
; makes IDCT_FN write its result back to block -- confirm. The meaning of
; the other arguments ("", 11, pw_32, 20) also has to be checked there.
cglobal simple_idct8, 1, 1, 16, 32, block
IDCT_FN "", 11, pw_32, 20, "store"
RET
; simple_idct8_put(pixels, lsize, block): runs the IDCT and overwrites eight
; rows of eight bytes at pixels with the result.
; NOTE(review): IDCT_FN is defined outside this view; the code below relies
; on it leaving the eight result rows as words in m8, m0, m1, m2, m4, m11,
; m9, m10 -- confirm against the macro.
cglobal simple_idct8_put, 3, 4, 16, 32, pixels, lsize, block
IDCT_FN "", 11, pw_32, 20
; r3 = 3*lsize, r2 = pixels + 3*lsize
lea r3, [3*lsizeq]
lea r2, [pixelsq + r3]
; Pack pairs of word rows into bytes with unsigned saturation: the first
; operand ends up in the low half, the second in the high half.
packuswb m8, m0
packuswb m1, m2
packuswb m4, m11
packuswb m9, m10
; NOTE(review): PASS8ROWS is defined outside this view; presumably expands
; to the eight row addresses derived from pixels and lsize -- confirm.
STORE_HI_LO PASS8ROWS(pixelsq, r2, lsizeq, r3), m8, m1, m4, m9
RET
; simple_idct8_add(pixels, lsize, block): runs the IDCT, adds the result to
; the eight rows of eight bytes already at pixels, and stores the sum back
; as bytes with unsigned saturation.
; NOTE(review): IDCT_FN is defined outside this view; the code below relies
; on it leaving the eight result rows as words in m8, m0, m1, m2, m4, m11,
; m9, m10 -- confirm against the macro.
cglobal simple_idct8_add, 3, 4, 16, 32, pixels, lsize, block
IDCT_FN "", 11, pw_32, 20
; r2 = 3*lsize
lea r2, [3*lsizeq]
%if cpuflag(sse4)
; SSE4 path: pmovzxbw loads and widens all eight rows in one go.
; r3 = pixels + 3*lsize
lea r3, [pixelsq + r2]
; NOTE(review): PASS8ROWS is defined outside this view; presumably expands
; to the eight row addresses derived from pixels and lsize -- confirm.
LOAD_ZXBW_8 m3, m5, m6, m7, m12, m13, m14, m15, PASS8ROWS(pixelsq, r3, lsizeq, r2)
; signed saturating add of the existing pixels to the IDCT rows
paddsw m8, m3
paddsw m0, m5
paddsw m1, m6
paddsw m2, m7
paddsw m4, m12
paddsw m11, m13
paddsw m9, m14
paddsw m10, m15
%else
; Fallback path: widen against a zeroed register, four rows at a time.
pxor m12, m12
; rows starting at pixels
LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(pixelsq, lsizeq, r2), m12
paddsw m8, m3
paddsw m0, m5
paddsw m1, m6
paddsw m2, m7
; rows starting at pixels + 4*lsize; m3/m5/m6/m7 are reused as temporaries
lea r3, [pixelsq + 4*lsizeq]
LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(r3, lsizeq, r2), m12
paddsw m4, m3
paddsw m11, m5
paddsw m9, m6
paddsw m10, m7
; r3 = pixels + 3*lsize, the same value the SSE4 path computed up front
lea r3, [pixelsq + r2]
%endif
; Pack pairs of word rows into bytes with unsigned saturation: the first
; operand ends up in the low half, the second in the high half.
packuswb m8, m0
packuswb m1, m2
packuswb m4, m11
packuswb m9, m10
STORE_HI_LO PASS8ROWS(pixelsq, r3, lsizeq, r2), m8, m1, m4, m9
RET
define_constants _hi
cglobal simple_idct10, 1, 1, 16, block