From 6204feb160c843320f6001d7e2bb2361c82b90ca Mon Sep 17 00:00:00 2001 From: Vitor Sessak Date: Sat, 14 May 2011 14:17:15 +0200 Subject: [PATCH] dct32: Add AVX implementation of 32-point DCT --- libavcodec/mpegaudiodec.c | 4 +- libavcodec/x86/dct32_sse.asm | 350 ++++++++++++++++++++++------------- libavcodec/x86/fft.c | 4 +- libavcodec/x86/fft.h | 1 + 4 files changed, 232 insertions(+), 127 deletions(-) diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c index 960d13d1e8..ccc93ad78a 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -69,9 +69,9 @@ typedef struct MPADecodeContext { uint32_t free_format_next_header; GetBitContext gb; GetBitContext in_gb; - DECLARE_ALIGNED(16, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2]; + DECLARE_ALIGNED(32, MPA_INT, synth_buf)[MPA_MAX_CHANNELS][512 * 2]; int synth_buf_offset[MPA_MAX_CHANNELS]; - DECLARE_ALIGNED(16, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT]; + DECLARE_ALIGNED(32, INTFLOAT, sb_samples)[MPA_MAX_CHANNELS][36][SBLIMIT]; INTFLOAT mdct_buf[MPA_MAX_CHANNELS][SBLIMIT * 18]; /* previous samples, for layer 3 MDCT */ GranuleDef granules[2][2]; /* Used in Layer 3 */ #ifdef DEBUG diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm index fa0a502acf..2e1176cd84 100644 --- a/libavcodec/x86/dct32_sse.asm +++ b/libavcodec/x86/dct32_sse.asm @@ -20,31 +20,41 @@ ;****************************************************************************** %include "x86inc.asm" +%include "config.asm" SECTION_RODATA 32 align 32 ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 dd 0.553104, 0.582935, 0.622504, 0.674808 - dd -1.169440, -0.972568, -0.839350, -0.744536 dd -10.190008, -3.407609, -2.057781, -1.484165 + dd -1.169440, -0.972568, -0.839350, -0.744536 dd 0.502419, 0.522499, 0.566944, 0.646822 dd 0.788155, 1.060678, 1.722447, 5.101149 dd 0.509796, 0.601345, 0.899976, 2.562916 + dd 0.509796, 0.601345, 0.899976, 2.562916 dd 1.000000, 1.000000, 1.306563, 0.541196 + dd 1.000000, 1.000000, 1.306563, 0.541196 + dd 1.000000, 0.707107, 1.000000, -0.707107 dd 1.000000, 0.707107, 1.000000, -0.707107 -ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 +ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 -%macro BUTTERFLY 4 +%macro BUTTERFLY_SSE 4 movaps %4, %1 subps %1, %2 addps %2, %4 mulps %1, %3 %endmacro -%macro BUTTERFLY0 5 +%macro BUTTERFLY_AVX 4 + vsubps %4, %1, %2 + vaddps %2, %2, %1 + vmulps %1, %4, %3 +%endmacro + +%macro BUTTERFLY0_SSE 5 movaps %4, %1 shufps %1, %1, %5 xorps %4, %2 @@ -52,6 +62,13 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 mulps %1, %3 %endmacro +%macro BUTTERFLY0_AVX 5 + vshufps %4, %1, %1, %5 + vxorps %1, %1, %2 + vaddps %4, %4, %1 + vmulps %1, %4, %3 +%endmacro + %macro BUTTERFLY2 4 BUTTERFLY0 %1, %2, %3, %4, 0x1b %endmacro @@ -60,126 +77,7 @@ ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 BUTTERFLY0 %1, %2, %3, %4, 0xb1 %endmacro -INIT_XMM -section .text align=16 -; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) -cglobal dct32_float_sse, 2,3,8, out, in, tmp - ; pass 1 - - movaps m0, [inq+0] - movaps m1, [inq+112] - shufps m1, m1, 0x1b - BUTTERFLY m0, m1, [ps_cos_vec], m3 - - movaps m7, [inq+64] - movaps m4, [inq+48] - shufps m4, m4, 0x1b - BUTTERFLY m7, m4, [ps_cos_vec+48], m3 - - - ; pass 2 - movaps m2, [ps_cos_vec+64] - BUTTERFLY m1, m4, m2, m3 - movaps [outq+48], m1 - movaps [outq+ 0], m4 - - ; pass 1 - movaps m1, [inq+16] - movaps m6, [inq+96] - shufps m6, m6, 0x1b - BUTTERFLY m1, m6, [ps_cos_vec+16], m3 - - movaps m4, [inq+80] - movaps m5, [inq+32] - shufps m5, m5, 0x1b - BUTTERFLY m4, m5, [ps_cos_vec+32], m3 - - ; pass 2 - BUTTERFLY m0, m7, m2, m3 - - movaps m2, [ps_cos_vec+80] - BUTTERFLY m6, m5, m2, m3 - - BUTTERFLY m1, m4, m2, m3 - - ; pass 3 - movaps m2, [ps_cos_vec+96] - shufps m1, m1, 0x1b - BUTTERFLY m0, m1, m2, m3 - movaps [outq+112], m0 - movaps [outq+ 96], m1 - - movaps m0, [outq+0] - shufps m5, m5, 0x1b - BUTTERFLY m0, m5, m2, m3 - - movaps m1, [outq+48] - shufps m6, m6, 0x1b - BUTTERFLY m1, m6, m2, m3 - movaps [outq+48], m1 - - shufps m4, m4, 0x1b - BUTTERFLY m7, m4, m2, m3 - - ; pass 4 - movaps m3, [ps_p1p1m1m1+0] - movaps m2, [ps_cos_vec+112] - - BUTTERFLY2 m5, m3, m2, m1 - - BUTTERFLY2 m0, m3, m2, m1 - movaps [outq+16], m0 - - BUTTERFLY2 m6, m3, m2, m1 - movaps [outq+32], m6 - - movaps m0, [outq+48] - BUTTERFLY2 m0, m3, m2, m1 - movaps [outq+48], m0 - - BUTTERFLY2 m4, m3, m2, m1 - - BUTTERFLY2 m7, m3, m2, m1 - - movaps m6, [outq+96] - BUTTERFLY2 m6, m3, m2, m1 - - movaps m0, [outq+112] - BUTTERFLY2 m0, m3, m2, m1 - - ; pass 5 - movaps m2, [ps_cos_vec+128] - shufps m3, m3, 0xcc - - BUTTERFLY3 m5, m3, m2, m1 - movaps [outq+0], m5 - - movaps m1, [outq+16] - BUTTERFLY3 m1, m3, m2, m5 - movaps [outq+96], m1 - - BUTTERFLY3 m4, m3, m2, m5 - movaps [outq+64], m4 - - BUTTERFLY3 m7, m3, m2, m5 - movaps [outq+80], m7 - - movaps m5, [outq+32] - BUTTERFLY3 m5, m3, m2, m7 - movaps [outq+32], m5 - - movaps m4, [outq+48] - BUTTERFLY3 m4, m3, m2, m7 - movaps [outq+48], m4 - - BUTTERFLY3 m6, m3, m2, m7 - movaps [outq+16], m6 - - BUTTERFLY3 m0, m3, m2, m7 - movaps [outq+112], m0 - - - ; pass 6, no SIMD... +%macro PASS6_AND_PERMUTE 0 mov tmpd, [outq+4] movss m7, [outq+72] addss m7, [outq+76] @@ -286,4 +184,208 @@ cglobal dct32_float_sse, 2,3,8, out, in, tmp movss [outq+92], m7 addss m6, [outq+124] movss [outq+116], m6 +%endmacro + +%define BUTTERFLY BUTTERFLY_AVX +%define BUTTERFLY0 BUTTERFLY0_AVX + +INIT_YMM +section .text align=16 +%ifdef HAVE_AVX +; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in) +cglobal dct32_float_avx, 2,3,8, out, in, tmp + ; pass 1 + vmovaps m4, [inq+0] + vinsertf128 m5, m5, [inq+96], 1 + vinsertf128 m5, m5, [inq+112], 0 + vshufps m5, m5, m5, 0x1b + BUTTERFLY m4, m5, [ps_cos_vec], m6 + + vmovaps m2, [inq+64] + vinsertf128 m6, m6, [inq+32], 1 + vinsertf128 m6, m6, [inq+48], 0 + vshufps m6, m6, m6, 0x1b + BUTTERFLY m2, m6, [ps_cos_vec+32], m0 + + ; pass 2 + + BUTTERFLY m5, m6, [ps_cos_vec+64], m0 + BUTTERFLY m4, m2, [ps_cos_vec+64], m7 + + + ; pass 3 + vperm2f128 m3, m6, m4, 0x31 + vperm2f128 m1, m6, m4, 0x20 + vshufps m3, m3, m3, 0x1b + + BUTTERFLY m1, m3, [ps_cos_vec+96], m6 + + + vperm2f128 m4, m5, m2, 0x20 + vperm2f128 m5, m5, m2, 0x31 + vshufps m5, m5, m5, 0x1b + + BUTTERFLY m4, m5, [ps_cos_vec+96], m6 + + ; pass 4 + vmovaps m6, [ps_p1p1m1m1+0] + vmovaps m2, [ps_cos_vec+128] + + BUTTERFLY2 m5, m6, m2, m7 + BUTTERFLY2 m4, m6, m2, m7 + BUTTERFLY2 m1, m6, m2, m7 + BUTTERFLY2 m3, m6, m2, m7 + + + ; pass 5 + vshufps m6, m6, m6, 0xcc + vmovaps m2, [ps_cos_vec+160] + + BUTTERFLY3 m5, m6, m2, m7 + BUTTERFLY3 m4, m6, m2, m7 + BUTTERFLY3 m1, m6, m2, m7 + BUTTERFLY3 m3, m6, m2, m7 + + vperm2f128 m6, m3, m3, 0x31 + vmovaps [outq], m3 + + vextractf128 [outq+64], m5, 1 + vextractf128 [outq+32], m5, 0 + + vextractf128 [outq+80], m4, 1 + vextractf128 [outq+48], m4, 0 + + vperm2f128 m0, m1, m1, 0x31 + vmovaps [outq+96], m1 + + vzeroupper + + ; pass 6, no SIMD... +INIT_XMM + PASS6_AND_PERMUTE + RET +%endif + +%define BUTTERFLY BUTTERFLY_SSE +%define BUTTERFLY0 BUTTERFLY0_SSE + +INIT_XMM +; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) +cglobal dct32_float_sse, 2,3,8, out, in, tmp + ; pass 1 + + movaps m0, [inq+0] + movaps m1, [inq+112] + shufps m1, m1, 0x1b + BUTTERFLY m0, m1, [ps_cos_vec], m3 + + movaps m7, [inq+64] + movaps m4, [inq+48] + shufps m4, m4, 0x1b + BUTTERFLY m7, m4, [ps_cos_vec+32], m3 + + ; pass 2 + movaps m2, [ps_cos_vec+64] + BUTTERFLY m1, m4, m2, m3 + movaps [outq+48], m1 + movaps [outq+ 0], m4 + + ; pass 1 + movaps m1, [inq+16] + movaps m6, [inq+96] + shufps m6, m6, 0x1b + BUTTERFLY m1, m6, [ps_cos_vec+16], m3 + + movaps m4, [inq+80] + movaps m5, [inq+32] + shufps m5, m5, 0x1b + BUTTERFLY m4, m5, [ps_cos_vec+48], m3 + + ; pass 2 + BUTTERFLY m0, m7, m2, m3 + + movaps m2, [ps_cos_vec+80] + BUTTERFLY m6, m5, m2, m3 + + BUTTERFLY m1, m4, m2, m3 + + ; pass 3 + movaps m2, [ps_cos_vec+96] + shufps m1, m1, 0x1b + BUTTERFLY m0, m1, m2, m3 + movaps [outq+112], m0 + movaps [outq+ 96], m1 + + movaps m0, [outq+0] + shufps m5, m5, 0x1b + BUTTERFLY m0, m5, m2, m3 + + movaps m1, [outq+48] + shufps m6, m6, 0x1b + BUTTERFLY m1, m6, m2, m3 + movaps [outq+48], m1 + + shufps m4, m4, 0x1b + BUTTERFLY m7, m4, m2, m3 + + ; pass 4 + movaps m3, [ps_p1p1m1m1+0] + movaps m2, [ps_cos_vec+128] + + BUTTERFLY2 m5, m3, m2, m1 + + BUTTERFLY2 m0, m3, m2, m1 + movaps [outq+16], m0 + + BUTTERFLY2 m6, m3, m2, m1 + movaps [outq+32], m6 + + movaps m0, [outq+48] + BUTTERFLY2 m0, m3, m2, m1 + movaps [outq+48], m0 + + BUTTERFLY2 m4, m3, m2, m1 + + BUTTERFLY2 m7, m3, m2, m1 + + movaps m6, [outq+96] + BUTTERFLY2 m6, m3, m2, m1 + + movaps m0, [outq+112] + BUTTERFLY2 m0, m3, m2, m1 + + ; pass 5 + movaps m2, [ps_cos_vec+160] + shufps m3, m3, 0xcc + + BUTTERFLY3 m5, m3, m2, m1 + movaps [outq+0], m5 + + movaps m1, [outq+16] + BUTTERFLY3 m1, m3, m2, m5 + movaps [outq+96], m1 + + BUTTERFLY3 m4, m3, m2, m5 + movaps [outq+64], m4 + + BUTTERFLY3 m7, m3, m2, m5 + movaps [outq+80], m7 + + movaps m5, [outq+32] + BUTTERFLY3 m5, m3, m2, m7 + movaps [outq+32], m5 + + movaps m4, [outq+48] + BUTTERFLY3 m4, m3, m2, m7 + movaps [outq+48], m4 + + BUTTERFLY3 m6, m3, m2, m7 + movaps [outq+16], m6 + + BUTTERFLY3 m0, m3, m2, m7 + movaps [outq+112], m0 + + + ; pass 6, no SIMD... + PASS6_AND_PERMUTE RET diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c index b29412c1dc..8eef4214a2 100644 --- a/libavcodec/x86/fft.c +++ b/libavcodec/x86/fft.c @@ -57,7 +57,9 @@ av_cold void ff_fft_init_mmx(FFTContext *s) av_cold void ff_dct_init_mmx(DCTContext *s) { int has_vectors = av_get_cpu_flags(); - if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) + if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX) + s->dct32 = ff_dct32_float_avx; + else if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) s->dct32 = ff_dct32_float_sse; } #endif diff --git a/libavcodec/x86/fft.h b/libavcodec/x86/fft.h index c6379050d9..0ade2b2e7b 100644 --- a/libavcodec/x86/fft.h +++ b/libavcodec/x86/fft.h @@ -35,5 +35,6 @@ void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input) void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); void ff_dct32_float_sse(FFTSample *out, const FFTSample *in); +void ff_dct32_float_avx(FFTSample *out, const FFTSample *in); #endif /* AVCODEC_X86_FFT_H */