diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index dbf4d25136..9f5da254bf 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -2296,7 +2296,7 @@ static av_always_inline void encode_mb_internal(MPVEncContext *const s,
      * and neither of these encoders currently supports 444. */
 #define INTERLACED_DCT(s) ((chroma_format == CHROMA_420 || chroma_format == CHROMA_422) && \
                            (s)->c.avctx->flags & AV_CODEC_FLAG_INTERLACED_DCT)
-    int16_t weight[12][64];
+    DECLARE_ALIGNED(16, int16_t, weight)[12][64];
     int16_t orig[12][64];
     const int mb_x = s->c.mb_x;
     const int mb_y = s->c.mb_y;
@@ -4293,7 +4293,7 @@ static int dct_quantize_trellis_c(MPVEncContext *const s,
     return last_non_zero;
 }
 
-static int16_t basis[64][64];
+static DECLARE_ALIGNED(16, int16_t, basis)[64][64];
 
 static void build_basis(uint8_t *perm){
     int i, j, x, y;
@@ -4317,7 +4317,7 @@ static void build_basis(uint8_t *perm){
 static int dct_quantize_refine(MPVEncContext *const s, //FIXME breaks denoise?
                                int16_t *block, int16_t *weight, int16_t *orig,
                                int n, int qscale){
-    int16_t rem[64];
+    DECLARE_ALIGNED(16, int16_t, rem)[64];
     LOCAL_ALIGNED_16(int16_t, d1, [64]);
     const uint8_t *scantable;
     const uint8_t *perm_scantable;
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index dc8fcd8833..3cd16fefbf 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -35,13 +35,6 @@ int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
 
 #if HAVE_SSSE3_INLINE
 #define SCALE_OFFSET -1
-/*
- * pmulhrsw:    dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
- */
-#define PMULHRW(x, y, s, o)             \
-    "pmulhrsw " #s ", " #x "    \n\t"   \
-    "pmulhrsw " #s ", " #y "    \n\t"
-
 #define MAX_ABS 512
 
 static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], const int16_t basis[64], int scale)
@@ -52,36 +45,39 @@ static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], c
     scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
 
     __asm__ volatile(
-        "pxor      %%mm7, %%mm7     \n\t"
-        "movd         %4, %%mm5     \n\t"
-        "punpcklwd %%mm5, %%mm5     \n\t"
-        "punpcklwd %%mm5, %%mm5     \n\t"
-        ".p2align      4            \n\t"
-        "1:                         \n\t"
-        "movq    (%1, %0), %%mm0    \n\t"
-        "movq   8(%1, %0), %%mm1    \n\t"
-        PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
-        "paddw   (%2, %0), %%mm0    \n\t"
-        "paddw  8(%2, %0), %%mm1    \n\t"
-        "psraw        $6, %%mm0     \n\t"
-        "psraw        $6, %%mm1     \n\t"
-        "pmullw  (%3, %0), %%mm0    \n\t"
-        "pmullw 8(%3, %0), %%mm1    \n\t"
-        "pmaddwd   %%mm0, %%mm0     \n\t"
-        "pmaddwd   %%mm1, %%mm1     \n\t"
-        "paddd     %%mm1, %%mm0     \n\t"
-        "psrld        $4, %%mm0     \n\t"
-        "paddd     %%mm0, %%mm7     \n\t"
-        "add         $16, %0        \n\t"
-        "cmp        $128, %0        \n\t" //FIXME optimize & bench
-        " jb 1b                     \n\t"
-        "pshufw $0x0E, %%mm7, %%mm6 \n\t"
-        "paddd     %%mm6, %%mm7     \n\t" // faster than phaddd on core2
-        "psrld        $2, %%mm7     \n\t"
-        "movd      %%mm7, %0        \n\t"
-
+        "pxor      %%xmm2, %%xmm2       \n\t"
+        "movd          %4, %%xmm3       \n\t"
+        "punpcklwd %%xmm3, %%xmm3       \n\t"
+        "pshufd $0, %%xmm3, %%xmm3      \n\t"
+        ".p2align       4               \n\t"
+        "1:                             \n\t"
+        "movdqa   (%1, %0), %%xmm0      \n\t"
+        "movdqa 16(%1, %0), %%xmm1      \n\t"
+        "pmulhrsw  %%xmm3, %%xmm0       \n\t"
+        "pmulhrsw  %%xmm3, %%xmm1       \n\t"
+        "paddw    (%2, %0), %%xmm0      \n\t"
+        "paddw  16(%2, %0), %%xmm1      \n\t"
+        "psraw         $6, %%xmm0       \n\t"
+        "psraw         $6, %%xmm1       \n\t"
+        "pmullw   (%3, %0), %%xmm0      \n\t"
+        "pmullw 16(%3, %0), %%xmm1      \n\t"
+        "pmaddwd   %%xmm0, %%xmm0       \n\t"
+        "pmaddwd   %%xmm1, %%xmm1       \n\t"
+        "paddd     %%xmm1, %%xmm0       \n\t"
+        "psrld         $4, %%xmm0       \n\t"
+        "paddd     %%xmm0, %%xmm2       \n\t"
+        "add          $32, %0           \n\t"
+        "cmp         $128, %0           \n\t" //FIXME optimize & bench
+        " jb 1b                         \n\t"
+        "pshufd $0x0E, %%xmm2, %%xmm0   \n\t"
+        "paddd     %%xmm0, %%xmm2       \n\t"
+        "pshufd $0x01, %%xmm2, %%xmm0   \n\t"
+        "paddd     %%xmm0, %%xmm2       \n\t"
+        "psrld         $2, %%xmm2       \n\t"
+        "movd      %%xmm2, %0           \n\t"
         : "+r" (i)
         : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
+          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")
     );
     return i;
 }
@@ -93,24 +89,25 @@ static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int sca
     if (FFABS(scale) < 1024) {
         scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
         __asm__ volatile(
-            "movd         %3, %%mm5     \n\t"
-            "punpcklwd %%mm5, %%mm5     \n\t"
-            "punpcklwd %%mm5, %%mm5     \n\t"
-            ".p2align      4            \n\t"
-            "1:                         \n\t"
-            "movq    (%1, %0), %%mm0    \n\t"
-            "movq   8(%1, %0), %%mm1    \n\t"
-            PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
-            "paddw   (%2, %0), %%mm0    \n\t"
-            "paddw  8(%2, %0), %%mm1    \n\t"
-            "movq   %%mm0,  (%2, %0)    \n\t"
-            "movq   %%mm1, 8(%2, %0)    \n\t"
-            "add         $16, %0        \n\t"
-            "cmp        $128, %0        \n\t" // FIXME optimize & bench
-            " jb 1b                     \n\t"
-
+            "movd          %3, %%xmm2   \n\t"
+            "punpcklwd %%xmm2, %%xmm2   \n\t"
+            "pshufd $0, %%xmm2, %%xmm2  \n\t"
+            ".p2align       4           \n\t"
+            "1:                         \n\t"
+            "movdqa   (%1, %0), %%xmm0  \n\t"
+            "movdqa 16(%1, %0), %%xmm1  \n\t"
+            "pmulhrsw  %%xmm2, %%xmm0   \n\t"
+            "pmulhrsw  %%xmm2, %%xmm1   \n\t"
+            "paddw    (%2, %0), %%xmm0  \n\t"
+            "paddw  16(%2, %0), %%xmm1  \n\t"
+            "movdqa %%xmm0,   (%2, %0)  \n\t"
+            "movdqa %%xmm1, 16(%2, %0)  \n\t"
+            "add          $32, %0       \n\t"
+            "cmp         $128, %0       \n\t" // FIXME optimize & bench
+            " jb 1b                     \n\t"
             : "+r" (i)
             : "r"(basis), "r"(rem), "g"(scale)
+              XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2")
         );
     } else {
         for (i=0; i<8*8; i++) {
diff --git a/tests/checkasm/mpegvideoencdsp.c b/tests/checkasm/mpegvideoencdsp.c
index 281195cd5f..a4a4fa6f5c 100644
--- a/tests/checkasm/mpegvideoencdsp.c
+++ b/tests/checkasm/mpegvideoencdsp.c
@@ -39,13 +39,13 @@
 static void check_add_8x8basis(MpegvideoEncDSPContext *c)
 {
-    declare_func_emms(AV_CPU_FLAG_SSSE3, void, int16_t rem[64], const int16_t basis[64], int scale);
+    declare_func(void, int16_t rem[64], const int16_t basis[64], int scale);
 
     if (check_func(c->add_8x8basis, "add_8x8basis")) {
         // FIXME: What are the actual ranges for these values?
         int scale = sign_extend(rnd(), 12);
-        int16_t rem1[64];
-        int16_t rem2[64];
-        int16_t basis[64];
+        DECLARE_ALIGNED(16, int16_t, rem1)[64];
+        DECLARE_ALIGNED(16, int16_t, rem2)[64];
+        DECLARE_ALIGNED(16, int16_t, basis)[64];
 
         randomize_buffer_clipped(basis, -15760, 15760);
         randomize_buffers(rem1, sizeof(rem1));