mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents().
This commit is contained in:
parent
8b7b2d6aae
commit
f99a5ef92e
@ -32,6 +32,11 @@ cextern ac3_bap_bits
|
||||
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
|
||||
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
|
||||
|
||||
; Constants used in ff_ac3_extract_exponents()
|
||||
; pd_1: the dword value 1 in four lanes. It is ORed into each doubled absolute
; coefficient before the int->float conversion.
pd_1: times 4 dd 1
|
||||
; pd_151: the dword value 151 in four lanes. The float bits shifted right by 23
; are subtracted from it to form each exponent.
pd_151: times 4 dd 151
|
||||
; pb_shuf_4dwb: byte indices 0, 4, 8, 12, i.e. the low byte of each of four
; dwords. Only 4 bytes are defined here.
pb_shuf_4dwb: db 0, 4, 8, 12
|
||||
|
||||
SECTION .text
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
@ -346,3 +351,100 @@ cglobal ac3_compute_mantissa_size_sse2, 1,2,4, mant_cnt, sum
|
||||
movd eax, m0
|
||||
add eax, sumd
|
||||
RET
|
||||
|
||||
;------------------------------------------------------------------------------
|
||||
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
|
||||
;------------------------------------------------------------------------------
|
||||
|
||||
; PABSD_MMX src/dst, tmp
; Per-dword absolute value for register sets that lack pabsd.
;   %1: in  = packed signed dwords
;       out = their absolute values (INT32_MIN stays INT32_MIN)
;   %2: scratch register; on exit it holds the per-lane sign mask (0 or -1)
%macro PABSD_MMX 2 ; src/dst, tmp
    mova    %2, %1          ; tmp = x
    psrad   %2, 31          ; tmp = (x < 0) ? -1 : 0
    pxor    %1, %2          ; flip all bits of the negative lanes
    psubd   %1, %2          ; subtracting -1 completes the negation
%endmacro
|
||||
|
||||
; PABSD_SSSE3 src/dst [, unused]
; Per-dword absolute value of %1, computed in place with the SSSE3 pabsd
; instruction. The optional second argument is accepted and ignored, so the
; macro can be invoked with the same argument list as PABSD_MMX.
%macro PABSD_SSSE3 1-2 ; src/dst, unused
|
||||
pabsd %1, %1
|
||||
%endmacro
|
||||
|
||||
%ifdef HAVE_AMD3DNOW

; ABS2EXP_3DNOW src/dst, tmp
; Replaces the two absolute dword values in %1 with two dword exponents:
;   %1 = 151 - (float_bits((%1 << 1) | 1) >> 23)
; Expects m3 = pd_1 and m4 = pd_151. Clobbers %2.
%macro ABS2EXP_3DNOW 2
    pslld   %1, 1
    por     %1, m3          ; force the value non-zero before converting
    pi2fd   %2, %1          ; packed int32 -> packed float
    psrld   %2, 23          ; keep the biased exponent field
    movq    %1, m4
    psubd   %1, %2
%endmacro

;------------------------------------------------------------------------------
; 3DNow! version of ac3_extract_exponents (args: exp, coef, len).
; Each iteration reads 4 dword coefficients and writes 4 exponent bytes.
; The loop body runs before the counter is tested, so at least 4 coefficients
; are always processed; presumably callers pass a positive multiple of 4
; (TODO confirm).
; NOTE(review): no emms/femms is issued before returning; presumably the
; caller clears the MMX state -- confirm.
;------------------------------------------------------------------------------
INIT_MMX
cglobal ac3_extract_exponents_3dnow, 3,3,0, exp, coef, len
    movq    m3, [pd_1]
    movq    m4, [pd_151]
    ; point both arrays at their ends and count a negative index up to zero
    lea     coefq, [coefq+4*lenq]
    add     expq, lenq
    neg     lenq
.loop:
    ; two coefficients per MMX register
    movq    m0, [coefq+4*lenq  ]
    movq    m1, [coefq+4*lenq+8]
    PABSD_MMX m0, m2
    PABSD_MMX m1, m2
    ABS2EXP_3DNOW m0, m2
    ABS2EXP_3DNOW m1, m2
    ; narrow dword -> word -> byte with saturation, so negative results clip to 0
    packssdw  m0, m0
    packuswb  m0, m0
    packssdw  m1, m1
    packuswb  m1, m1
    ; low dword becomes: 2 bytes from m0 followed by 2 bytes from m1
    punpcklwd m0, m1
    movd    [expq+lenq], m0
    add     lenq, 4
    jl .loop
    REP_RET
%endif
|
||||
|
||||
; AC3_EXTRACT_EXPONENTS variant
; Emits ac3_extract_exponents_<variant> (args: exp, coef, len) for XMM
; registers. PABSD must be %define'd to the matching absolute-value macro
; before this macro is expanded.
;
; Per coefficient v = abs(coef[i]) the routine computes
;     e = 151 - (float_bits((v << 1) | 1) >> 23)
; and stores e, clipped to 0..255, as one byte in exp[i].
; The "| 1" keeps the converted value non-zero, so v == 0 yields 151-127 = 24.
;
; Each iteration handles 4 coefficients and the loop body runs before the
; counter is tested; presumably callers pass a positive multiple of 4
; (TODO confirm).
; NOTE(review): mova is presumably an aligned load under INIT_XMM, which would
; require coef to be 16-byte aligned -- confirm against the caller.
%macro AC3_EXTRACT_EXPONENTS 1
cglobal ac3_extract_exponents_%1, 3,3,4, exp, coef, len
    ; point both arrays at their ends and count a negative index up to zero
    add       expq, lenq
    lea       coefq, [coefq+4*lenq]
    neg       lenq
    mova      m2, [pd_1]
    mova      m3, [pd_151]
.loop:
    ; move 4 32-bit coefs to xmm0
    mova      m0, [coefq+4*lenq]
    ; absolute value
    PABSD     m0, m1
    ; convert to float and extract exponents
    pslld     m0, 1
    por       m0, m2
    cvtdq2ps  m1, m0
    psrld     m1, 23
    mova      m0, m3
    psubd     m0, m1
    ; move the lowest byte in each of 4 dwords to the low dword
    ; NOTE: the low bytes must not simply be picked out with pshufb. The dword
    ;       result can be negative: for v = 16777215, (v << 1) | 1 = 2^25-1 is
    ;       rounded up to 2^25 by the float conversion, giving 151-152 = -1,
    ;       and larger v give more negative values. Taking the low byte would
    ;       store 255; the saturating packs clip to 0, the correct exponent.
    ;       Every variant therefore uses the same pack sequence.
    packssdw  m0, m0
    packuswb  m0, m0
    movd      [expq+lenq], m0

    add       lenq, 4
    jl .loop
    REP_RET
%endmacro
|
||||
|
||||
; Instantiate the XMM versions of ac3_extract_exponents. PABSD is re-pointed
; before each expansion so the shared macro body uses the absolute-value
; helper that matches the target instruction set.
%ifdef HAVE_SSE
|
||||
INIT_XMM
|
||||
; sse2 variant: pabsd is not used, so PABSD maps to the emulation macro
%define PABSD PABSD_MMX
|
||||
AC3_EXTRACT_EXPONENTS sse2
|
||||
; ssse3 variant is only assembled when HAVE_SSSE3 is defined
%ifdef HAVE_SSSE3
|
||||
; ssse3 variant: PABSD maps to the single-instruction pabsd macro
%define PABSD PABSD_SSSE3
|
||||
AC3_EXTRACT_EXPONENTS ssse3
|
||||
%endif
|
||||
%endif
|
||||
|
@ -44,6 +44,10 @@ extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned i
|
||||
|
||||
extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
|
||||
|
||||
/* External implementations of AC3DSPContext.extract_exponents; one of them is
 * selected in ff_ac3dsp_init_x86() according to the detected CPU flags. */
extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
|
||||
extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
|
||||
extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
|
||||
|
||||
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
|
||||
{
|
||||
int mm_flags = av_get_cpu_flags();
|
||||
@ -56,6 +60,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
|
||||
c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
|
||||
}
|
||||
if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
|
||||
c->extract_exponents = ff_ac3_extract_exponents_3dnow;
|
||||
if (!bit_exact) {
|
||||
c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
|
||||
}
|
||||
@ -72,6 +77,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
|
||||
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
|
||||
c->float_to_fixed24 = ff_float_to_fixed24_sse2;
|
||||
c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
|
||||
c->extract_exponents = ff_ac3_extract_exponents_sse2;
|
||||
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
|
||||
c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
|
||||
c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
|
||||
@ -79,6 +85,9 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
|
||||
}
|
||||
if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
|
||||
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
|
||||
if (!(mm_flags & AV_CPU_FLAG_ATOM)) {
|
||||
c->extract_exponents = ff_ac3_extract_exponents_ssse3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user