diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index dbf4d25136..9f5da254bf 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -2296,7 +2296,7 @@ static av_always_inline void encode_mb_internal(MPVEncContext *const s,
      * and neither of these encoders currently supports 444. */
 #define INTERLACED_DCT(s) ((chroma_format == CHROMA_420 || chroma_format == CHROMA_422) && \
                            (s)->c.avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT)
-    int16_t weight[12][64];
+    DECLARE_ALIGNED(16, int16_t, weight)[12][64];
     int16_t orig[12][64];
     const int mb_x = s->c.mb_x;
     const int mb_y = s->c.mb_y;
@@ -4293,7 +4293,7 @@ static int dct_quantize_trellis_c(MPVEncContext *const s,
     return last_non_zero;
 }
 
-static int16_t basis[64][64];
+static DECLARE_ALIGNED(16, int16_t, basis)[64][64];
 
 static void build_basis(uint8_t *perm){
     int i, j, x, y;
@@ -4317,7 +4317,7 @@ static void build_basis(uint8_t *perm){
 static int dct_quantize_refine(MPVEncContext *const s, //FIXME breaks denoise?
                                int16_t *block, int16_t *weight, int16_t *orig,
                                int n, int qscale){
-    int16_t rem[64];
+    DECLARE_ALIGNED(16, int16_t, rem)[64];
     LOCAL_ALIGNED_16(int16_t, d1, [64]);
     const uint8_t *scantable;
     const uint8_t *perm_scantable;
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index dc8fcd8833..3cd16fefbf 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -35,13 +35,6 @@ int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
 
 #if HAVE_SSSE3_INLINE
 #define SCALE_OFFSET -1
-/*
- * pmulhrsw:    dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
- */
-#define PMULHRW(x, y, s, o)             \
-    "pmulhrsw " #s ", " #x "    \n\t"   \
-    "pmulhrsw " #s ", " #y "    \n\t"
-
 #define MAX_ABS 512
 
 static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], const int16_t basis[64], int scale)
@@ -52,36 +45,39 @@ static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], c
     scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
 
     __asm__ volatile(
-        "pxor      %%mm7, %%mm7     \n\t"
-        "movd         %4, %%mm5     \n\t"
-        "punpcklwd %%mm5, %%mm5     \n\t"
-        "punpcklwd %%mm5, %%mm5     \n\t"
-        ".p2align      4            \n\t"
-        "1:                         \n\t"
-        "movq    (%1, %0), %%mm0    \n\t"
-        "movq   8(%1, %0), %%mm1    \n\t"
-        PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
-        "paddw   (%2, %0), %%mm0    \n\t"
-        "paddw  8(%2, %0), %%mm1    \n\t"
-        "psraw        $6, %%mm0     \n\t"
-        "psraw        $6, %%mm1     \n\t"
-        "pmullw  (%3, %0), %%mm0    \n\t"
-        "pmullw 8(%3, %0), %%mm1    \n\t"
-        "pmaddwd   %%mm0, %%mm0     \n\t"
-        "pmaddwd   %%mm1, %%mm1     \n\t"
-        "paddd     %%mm1, %%mm0     \n\t"
-        "psrld        $4, %%mm0     \n\t"
-        "paddd     %%mm0, %%mm7     \n\t"
-        "add         $16, %0        \n\t"
-        "cmp        $128, %0        \n\t" //FIXME optimize & bench
-        " jb 1b                     \n\t"
-        "pshufw $0x0E, %%mm7, %%mm6 \n\t"
-        "paddd     %%mm6, %%mm7     \n\t" // faster than phaddd on core2
-        "psrld        $2, %%mm7     \n\t"
-        "movd      %%mm7, %0        \n\t"
-
+        "pxor      %%xmm2, %%xmm2       \n\t"
+        "movd          %4, %%xmm3       \n\t"
+        "punpcklwd %%xmm3, %%xmm3       \n\t"
+        "pshufd $0, %%xmm3, %%xmm3      \n\t"
+        ".p2align       4               \n\t"
+        "1:                             \n\t"
+        "movdqa   (%1, %0), %%xmm0      \n\t"
+        "movdqa 16(%1, %0), %%xmm1      \n\t"
+        "pmulhrsw  %%xmm3, %%xmm0       \n\t"
+        "pmulhrsw  %%xmm3, %%xmm1       \n\t"
+        "paddw    (%2, %0), %%xmm0      \n\t"
+        "paddw  16(%2, %0), %%xmm1      \n\t"
+        "psraw         $6, %%xmm0       \n\t"
+        "psraw         $6, %%xmm1       \n\t"
+        "pmullw   (%3, %0), %%xmm0      \n\t"
+        "pmullw 16(%3, %0), %%xmm1      \n\t"
+        "pmaddwd   %%xmm0, %%xmm0       \n\t"
+        "pmaddwd   %%xmm1, %%xmm1       \n\t"
+        "paddd     %%xmm1, %%xmm0       \n\t"
+        "psrld         $4, %%xmm0       \n\t"
+        "paddd     %%xmm0, %%xmm2       \n\t"
+        "add          $32, %0           \n\t"
+        "cmp         $128, %0           \n\t" //FIXME optimize & bench
+        " jb 1b                         \n\t"
+        "pshufd $0x0E, %%xmm2, %%xmm0   \n\t"
+        "paddd     %%xmm0, %%xmm2       \n\t"
+        "pshufd $0x01, %%xmm2, %%xmm0   \n\t"
+        "paddd     %%xmm0, %%xmm2       \n\t"
+        "psrld         $2, %%xmm2       \n\t"
+        "movd      %%xmm2, %0           \n\t"
         : "+r" (i)
         : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
+          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")
     );
     return i;
 }
@@ -93,24 +89,25 @@ static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int sca
     if (FFABS(scale) < 1024) {
         scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
         __asm__ volatile(
-            "movd         %3, %%mm5     \n\t"
-            "punpcklwd %%mm5, %%mm5     \n\t"
-            "punpcklwd %%mm5, %%mm5     \n\t"
-            ".p2align      4            \n\t"
-            "1:                         \n\t"
-            "movq    (%1, %0), %%mm0    \n\t"
-            "movq   8(%1, %0), %%mm1    \n\t"
-            PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
-            "paddw   (%2, %0), %%mm0    \n\t"
-            "paddw  8(%2, %0), %%mm1    \n\t"
-            "movq   %%mm0,  (%2, %0)    \n\t"
-            "movq   %%mm1, 8(%2, %0)    \n\t"
-            "add         $16, %0        \n\t"
-            "cmp        $128, %0        \n\t" // FIXME optimize & bench
-            " jb 1b                     \n\t"
-
+            "movd          %3, %%xmm2   \n\t"
+            "punpcklwd %%xmm2, %%xmm2   \n\t"
+            "pshufd $0, %%xmm2, %%xmm2  \n\t"
+            ".p2align       4           \n\t"
+            "1:                         \n\t"
+            "movdqa   (%1, %0), %%xmm0  \n\t"
+            "movdqa 16(%1, %0), %%xmm1  \n\t"
+            "pmulhrsw  %%xmm2, %%xmm0   \n\t"
+            "pmulhrsw  %%xmm2, %%xmm1   \n\t"
+            "paddw    (%2, %0), %%xmm0  \n\t"
+            "paddw  16(%2, %0), %%xmm1  \n\t"
+            "movdqa %%xmm0,   (%2, %0)  \n\t"
+            "movdqa %%xmm1, 16(%2, %0)  \n\t"
+            "add          $32, %0       \n\t"
+            "cmp         $128, %0       \n\t" // FIXME optimize & bench
+            " jb 1b                     \n\t"
             : "+r" (i)
             : "r"(basis), "r"(rem), "g"(scale)
+              XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2")
         );
     } else {
         for (i=0; i<8*8; i++) {
diff --git a/tests/checkasm/mpegvideoencdsp.c b/tests/checkasm/mpegvideoencdsp.c
index 281195cd5f..a4a4fa6f5c 100644
--- a/tests/checkasm/mpegvideoencdsp.c
+++ b/tests/checkasm/mpegvideoencdsp.c
@@ -39,13 +39,13 @@
 static void check_add_8x8basis(MpegvideoEncDSPContext *c)
 {
-    declare_func_emms(AV_CPU_FLAG_SSSE3, void, int16_t rem[64], const int16_t basis[64], int scale);
+    declare_func(void, int16_t rem[64], const int16_t basis[64], int scale);
 
     if (check_func(c->add_8x8basis, "add_8x8basis")) {
         // FIXME: What are the actual ranges for these values?
         int scale = sign_extend(rnd(), 12);
-        int16_t rem1[64];
-        int16_t rem2[64];
-        int16_t basis[64];
+        DECLARE_ALIGNED(16, int16_t, rem1)[64];
+        DECLARE_ALIGNED(16, int16_t, rem2)[64];
+        DECLARE_ALIGNED(16, int16_t, basis)[64];
 
         randomize_buffer_clipped(basis, -15760, 15760);
         randomize_buffers(rem1, sizeof(rem1));