mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-21 10:55:51 +02:00
avcodec/x86: add an 8-bit simple IDCT function based on the x86-64 high depth functions
Includes add/put functions. Rounding contributed by Ronald S. Bultje.
This commit is contained in:
parent
8b19467d07
commit
d7246ea9f2
@ -88,10 +88,12 @@ static const struct algo idct_tab_arch[] = {
|
||||
#if HAVE_X86ASM
|
||||
#if ARCH_X86_64
|
||||
#if HAVE_SSE2_EXTERNAL
|
||||
{ "SIMPLE8-SSE2", ff_simple_idct8_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
|
||||
{ "SIMPLE10-SSE2", ff_simple_idct10_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
|
||||
{ "SIMPLE12-SSE2", ff_simple_idct12_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
|
||||
#endif
|
||||
#if HAVE_AVX_EXTERNAL
|
||||
{ "SIMPLE8-AVX", ff_simple_idct8_avx, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
|
||||
{ "SIMPLE10-AVX", ff_simple_idct10_avx, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
|
||||
{ "SIMPLE12-AVX", ff_simple_idct12_avx, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX, 1 },
|
||||
#endif
|
||||
|
@ -94,9 +94,32 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
|
||||
c->idct_add = ff_simple_idct_add_sse2;
|
||||
c->perm_type = FF_IDCT_PERM_SIMPLE;
|
||||
}
|
||||
|
||||
if (ARCH_X86_64 &&
|
||||
!high_bit_depth &&
|
||||
avctx->lowres == 0 &&
|
||||
(avctx->idct_algo == FF_IDCT_AUTO ||
|
||||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
|
||||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
|
||||
c->idct = ff_simple_idct8_sse2;
|
||||
c->idct_put = ff_simple_idct8_put_sse2;
|
||||
c->idct_add = ff_simple_idct8_add_sse2;
|
||||
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
|
||||
}
|
||||
}
|
||||
|
||||
if (ARCH_X86_64 && avctx->lowres == 0) {
|
||||
if (EXTERNAL_AVX(cpu_flags) &&
|
||||
!high_bit_depth &&
|
||||
(avctx->idct_algo == FF_IDCT_AUTO ||
|
||||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
|
||||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
|
||||
c->idct = ff_simple_idct8_avx;
|
||||
c->idct_put = ff_simple_idct8_put_avx;
|
||||
c->idct_add = ff_simple_idct8_add_avx;
|
||||
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
|
||||
}
|
||||
|
||||
if (avctx->bits_per_raw_sample == 10 &&
|
||||
(avctx->idct_algo == FF_IDCT_AUTO ||
|
||||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
|
||||
|
@ -29,6 +29,15 @@ void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
|
||||
void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
|
||||
void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
|
||||
|
||||
void ff_simple_idct8_sse2(int16_t *block);
|
||||
void ff_simple_idct8_avx(int16_t *block);
|
||||
|
||||
void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
|
||||
void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
|
||||
|
||||
void ff_simple_idct8_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
|
||||
void ff_simple_idct8_add_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
|
||||
|
||||
void ff_simple_idct10_sse2(int16_t *block);
|
||||
void ff_simple_idct10_avx(int16_t *block);
|
||||
|
||||
|
@ -31,11 +31,14 @@ SECTION_RODATA
|
||||
|
||||
cextern pw_2
|
||||
cextern pw_16
|
||||
cextern pw_32
|
||||
cextern pw_1023
|
||||
cextern pw_4095
|
||||
pd_round_11: times 4 dd 1<<(11-1)
|
||||
pd_round_12: times 4 dd 1<<(12-1)
|
||||
pd_round_15: times 4 dd 1<<(15-1)
|
||||
pd_round_19: times 4 dd 1<<(19-1)
|
||||
pd_round_20: times 4 dd 1<<(20-1)
|
||||
|
||||
%macro CONST_DEC 3
|
||||
const %1
|
||||
@ -77,8 +80,97 @@ CONST_DEC w3_min_w7_lo, W3sh2_lo, -W7sh2
|
||||
|
||||
SECTION .text
|
||||
|
||||
; STORE_HI_LO dst0, dst1, ..., dst7, src0, src1, src2, src3
;
; Splits four source registers (%9-%12) across eight destinations (%1-%8):
;   - movq   writes the low  64 bits of src[i] to the even destination %(2*i+1)
;   - movhps writes the high 64 bits of src[i] to the odd  destination %(2*i+2)
; All four low halves are written first, then all four high halves.
; In the visible callers, %1-%8 come from PASS8ROWS(...) and %9-%12 are the
; registers produced by the preceding packuswb instructions.
%macro STORE_HI_LO 12
|
||||
movq %1, %9
|
||||
movq %3, %10
|
||||
movq %5, %11
|
||||
movq %7, %12
|
||||
movhps %2, %9
|
||||
movhps %4, %10
|
||||
movhps %6, %11
|
||||
movhps %8, %12
|
||||
%endmacro
|
||||
|
||||
; LOAD_ZXBW_8 dst0, ..., dst7, src0, ..., src7
;
; Issues eight pmovzxbw instructions: destination %i is loaded from source
; %(i+8), with each unsigned byte zero-extended to a 16-bit word.
; The only visible caller is inside a "%if cpuflag(sse4)" branch and passes
; PASS8ROWS(...) as the eight sources.
%macro LOAD_ZXBW_8 16
|
||||
pmovzxbw %1, %9
|
||||
pmovzxbw %2, %10
|
||||
pmovzxbw %3, %11
|
||||
pmovzxbw %4, %12
|
||||
pmovzxbw %5, %13
|
||||
pmovzxbw %6, %14
|
||||
pmovzxbw %7, %15
|
||||
pmovzxbw %8, %16
|
||||
%endmacro
|
||||
|
||||
; LOAD_ZXBW_4 dst0, dst1, dst2, dst3, src0, src1, src2, src3, interleave
;
; Fallback used by the visible caller when cpuflag(sse4) is not set.
; Loads sources %5-%8 into destinations %1-%4 with movh, then interleaves the
; low bytes of each destination with %9 via punpcklbw.
; The visible caller clears %9 with pxor beforehand, so the interleave widens
; each byte to a zero-extended word.
; NOTE(review): movh is not defined in this view; presumably the x86inc
; helper that loads half a register - confirm.
%macro LOAD_ZXBW_4 9
|
||||
movh %1, %5
|
||||
movh %2, %6
|
||||
movh %3, %7
|
||||
movh %4, %8
|
||||
punpcklbw %1, %9
|
||||
punpcklbw %2, %9
|
||||
punpcklbw %3, %9
|
||||
punpcklbw %4, %9
|
||||
%endmacro
|
||||
|
||||
; PASS4ROWS(base, stride, stride3)
;
; Expands to four comma-separated memory operands:
;   [base], [base + stride], [base + 2*stride], [base + stride3]
; The visible callers pass a register holding 3*lsize as stride3, so the four
; operands address four consecutive rows starting at base.
%define PASS4ROWS(base, stride, stride3) \
|
||||
[base], [base + stride], [base + 2*stride], [base + stride3]
|
||||
|
||||
%macro idct_fn 0
|
||||
|
||||
define_constants _lo
|
||||
|
||||
cglobal simple_idct8, 1, 1, 16, 32, block
|
||||
IDCT_FN "", 11, pw_32, 20, "store"
|
||||
RET
|
||||
|
||||
cglobal simple_idct8_put, 3, 4, 16, 32, pixels, lsize, block
|
||||
IDCT_FN "", 11, pw_32, 20
|
||||
lea r3, [3*lsizeq]
|
||||
lea r2, [pixelsq + r3]
|
||||
packuswb m8, m0
|
||||
packuswb m1, m2
|
||||
packuswb m4, m11
|
||||
packuswb m9, m10
|
||||
STORE_HI_LO PASS8ROWS(pixelsq, r2, lsizeq, r3), m8, m1, m4, m9
|
||||
RET
|
||||
|
||||
cglobal simple_idct8_add, 3, 4, 16, 32, pixels, lsize, block
|
||||
IDCT_FN "", 11, pw_32, 20
|
||||
lea r2, [3*lsizeq]
|
||||
%if cpuflag(sse4)
|
||||
lea r3, [pixelsq + r2]
|
||||
LOAD_ZXBW_8 m3, m5, m6, m7, m12, m13, m14, m15, PASS8ROWS(pixelsq, r3, lsizeq, r2)
|
||||
paddsw m8, m3
|
||||
paddsw m0, m5
|
||||
paddsw m1, m6
|
||||
paddsw m2, m7
|
||||
paddsw m4, m12
|
||||
paddsw m11, m13
|
||||
paddsw m9, m14
|
||||
paddsw m10, m15
|
||||
%else
|
||||
pxor m12, m12
|
||||
LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(pixelsq, lsizeq, r2), m12
|
||||
paddsw m8, m3
|
||||
paddsw m0, m5
|
||||
paddsw m1, m6
|
||||
paddsw m2, m7
|
||||
lea r3, [pixelsq + 4*lsizeq]
|
||||
LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(r3, lsizeq, r2), m12
|
||||
paddsw m4, m3
|
||||
paddsw m11, m5
|
||||
paddsw m9, m6
|
||||
paddsw m10, m7
|
||||
lea r3, [pixelsq + r2]
|
||||
%endif
|
||||
packuswb m8, m0
|
||||
packuswb m1, m2
|
||||
packuswb m4, m11
|
||||
packuswb m9, m10
|
||||
STORE_HI_LO PASS8ROWS(pixelsq, r3, lsizeq, r2), m8, m1, m4, m9
|
||||
RET
|
||||
|
||||
define_constants _hi
|
||||
|
||||
cglobal simple_idct10, 1, 1, 16, block
|
||||
|
Loading…
Reference in New Issue
Block a user