1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-01-24 13:56:33 +02:00

ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents().

This commit is contained in:
Justin Ruggles 2011-06-30 17:48:44 -04:00
parent 8b7b2d6aae
commit f99a5ef92e
2 changed files with 111 additions and 0 deletions

View File

@ -32,6 +32,11 @@ cextern ac3_bap_bits
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
; used in ff_ac3_extract_exponents()
; pd_1:         four dwords of 1; OR'ed into each shifted coefficient so the
;               value converted to float is never zero.
; pd_151:       four dwords of 151; the float exponent field is subtracted
;               from this to produce the output exponent.
; pb_shuf_4dwb: pshufb control bytes selecting byte 0 of each of the 4 dwords
;               (only 4 bytes long, hence it is loaded with movd).
pd_1: times 4 dd 1
pd_151: times 4 dd 151
pb_shuf_4dwb: db 0, 4, 8, 12
SECTION .text
;-----------------------------------------------------------------------------
@ -346,3 +351,100 @@ cglobal ac3_compute_mantissa_size_sse2, 1,2,4, mant_cnt, sum
movd eax, m0
add eax, sumd
RET
;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;------------------------------------------------------------------------------
; PABSD_MMX src/dst, tmp
; Per-dword absolute value without the SSSE3 pabsd instruction.
;   %1 = register holding signed dwords; overwritten with the result
;   %2 = scratch register, clobbered (left holding the sign mask)
; Note: 0x80000000 has no positive counterpart and is returned unchanged.
%macro PABSD_MMX 2 ; src/dst, tmp
pxor %2, %2          ; tmp = 0
pcmpgtd %2, %1       ; tmp = all-ones in each dword where 0 > src (negative)
pxor %1, %2          ; one's complement of the negative dwords
psubd %1, %2         ; subtract -1 (i.e. add 1) there: two's-complement negate
%endmacro
; PABSD_SSSE3 src/dst [, unused]
; Per-dword absolute value using the pabsd instruction.
; The optional second argument is accepted and ignored so that call sites can
; pass a scratch register regardless of which PABSD variant is selected.
%macro PABSD_SSSE3 1-2 ; src/dst, unused
pabsd %1, %1
%endmacro
%ifdef HAVE_AMD3DNOW
INIT_MMX
;------------------------------------------------------------------------------
; ac3_extract_exponents_3dnow(exp, coef, len)
; MMX/3DNow! version: handles 4 coefficients per loop iteration as two
; 2-dword MMX registers (m0 = coefs 0-1, m1 = coefs 2-3).
;
; For each coefficient c the stored byte is
;     151 - (float((|c| << 1) | 1) >> 23)
; With the IEEE single-precision exponent bias of 127 this works out to
; 23 - floor(log2(|c|)) for c != 0 and 24 for c == 0.
; NOTE(review): this is exact only while (|c| << 1) | 1 fits in the 24-bit
; float significand and the sign bit stays clear; presumably coefficients are
; limited to 24 bits -- confirm against the caller.
;
; Registers: m3 = pd_1, m4 = pd_151, m2 = scratch.
; The loop is bottom-tested and advances by 4, so it always runs at least once
; and assumes len is a multiple of 4 -- TODO confirm with callers.
; No emms/femms is issued here; presumably the caller handles MMX state.
;------------------------------------------------------------------------------
cglobal ac3_extract_exponents_3dnow, 3,3,0, exp, coef, len
; point both arrays at their ends and index with a negative counter that
; counts up to zero
add expq, lenq
lea coefq, [coefq+4*lenq]
neg lenq
movq m3, [pd_1]
movq m4, [pd_151]
.loop:
; load 4 32-bit coefs (2 per register)
movq m0, [coefq+4*lenq ]
movq m1, [coefq+4*lenq+8]
; absolute value (m2 is scratch)
PABSD_MMX m0, m2
PABSD_MMX m1, m2
; coefs 0-1: v = (|c| << 1) | 1, convert to float, isolate exponent field
pslld m0, 1
por m0, m3
pi2fd m2, m0
psrld m2, 23
movq m0, m4
psubd m0, m2         ; m0 = 151 - exponent field
; coefs 2-3: same sequence, reusing m2 as the temporary
pslld m1, 1
por m1, m3
pi2fd m2, m1
psrld m2, 23
movq m1, m4
psubd m1, m2         ; m1 = 151 - exponent field
; narrow dwords -> words -> bytes with saturation; after this the low word of
; m0 holds bytes e0,e1 and the low word of m1 holds bytes e2,e3
packssdw m0, m0
packuswb m0, m0
packssdw m1, m1
packuswb m1, m1
; interleave the low words so the low dword of m0 is e0,e1,e2,e3
punpcklwd m0, m1
; store 4 exponent bytes
movd [expq+lenq], m0
add lenq, 4
jl .loop             ; continue while the negative index is still below 0
REP_RET
%endif
;------------------------------------------------------------------------------
; AC3_EXTRACT_EXPONENTS suffix
; Emits ac3_extract_exponents_<suffix>(exp, coef, len). The suffix only
; selects the byte-packing strategy (pshufb for "ssse3", saturating packs
; otherwise); the absolute-value step comes from whatever PABSD is %define'd
; to at the point of expansion.
;
; Handles 4 coefficients per iteration. For each coefficient c the stored
; byte is 151 - (float((|c| << 1) | 1) >> 23); with the IEEE single-precision
; exponent bias of 127 this is 23 - floor(log2(|c|)) for c != 0, 24 for c == 0.
; NOTE(review): exact only while (|c| << 1) | 1 fits in the 24-bit float
; significand; presumably coefficients are limited to 24 bits -- confirm.
;
; Registers: m2 = pd_1, m3 = pd_151, m4 = pshufb control (ssse3 only),
;            m0/m1 = working values.
; coef is read with mova, so it is assumed to be aligned for the register
; width -- TODO confirm caller guarantees this.
; The loop is bottom-tested and advances by 4, so it always runs at least once
; and assumes len is a multiple of 4 -- TODO confirm with callers.
;------------------------------------------------------------------------------
%macro AC3_EXTRACT_EXPONENTS 1
cglobal ac3_extract_exponents_%1, 3,3,5, exp, coef, len
; point both arrays at their ends and index with a negative counter that
; counts up to zero
add expq, lenq
lea coefq, [coefq+4*lenq]
neg lenq
mova m2, [pd_1]
mova m3, [pd_151]
%ifidn %1, ssse3 ; the shuffle control is only needed by the pshufb path
movd m4, [pb_shuf_4dwb]
%endif
.loop:
; move 4 32-bit coefs to xmm0
mova m0, [coefq+4*lenq]
; absolute value
PABSD m0, m1
; convert to float and extract exponents
pslld m0, 1
por m0, m2
cvtdq2ps m1, m0
psrld m1, 23
mova m0, m3
psubd m0, m1
; move the lowest byte in each of 4 dwords to the low dword
%ifidn %1, ssse3
; only the low 4 control bytes are meaningful; the remaining lanes of m0 are
; don't-care because only the low dword is stored below
pshufb m0, m4
%else
; saturating narrow: dwords -> words -> bytes
packssdw m0, m0
packuswb m0, m0
%endif
; store 4 exponent bytes
movd [expq+lenq], m0
add lenq, 4
jl .loop             ; continue while the negative index is still below 0
REP_RET
%endmacro
; Instantiate the XMM versions. PABSD must be (re)defined before each
; expansion because the macro body invokes it by that name.
%ifdef HAVE_SSE
INIT_XMM
; sse2 variant: no pabsd available, use the compare/xor/sub emulation
%define PABSD PABSD_MMX
AC3_EXTRACT_EXPONENTS sse2
%ifdef HAVE_SSSE3
; ssse3 variant: native pabsd (and pshufb packing inside the macro)
%define PABSD PABSD_SSSE3
AC3_EXTRACT_EXPONENTS ssse3
%endif
%endif

View File

@ -44,6 +44,10 @@ extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned i
extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
/* x86 versions of extract_exponents(): write one exponent byte to exp[] for
 * each of the nb_coefs 32-bit values in coef[]. Presumably these are the
 * symbols emitted by the cglobal ac3_extract_exponents_* definitions in the
 * accompanying asm file (with the ff_ prefix added by the asm framework). */
extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
int mm_flags = av_get_cpu_flags();
@ -56,6 +60,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
}
if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
c->extract_exponents = ff_ac3_extract_exponents_3dnow;
if (!bit_exact) {
c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
}
@ -72,6 +77,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
c->float_to_fixed24 = ff_float_to_fixed24_sse2;
c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
c->extract_exponents = ff_ac3_extract_exponents_sse2;
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
@ -79,6 +85,9 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
}
if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
if (!(mm_flags & AV_CPU_FLAG_ATOM)) {
c->extract_exponents = ff_ac3_extract_exponents_ssse3;
}
}
#endif
}