diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index 9570969cea..133817cb71 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -54,8 +54,8 @@ SECTION .text
 ; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple
 ; of 2. $offset is either 0 or 3. $dither holds 8 values.
 ;-----------------------------------------------------------------------------
-%macro yuv2planeX_mainloop 1
-.pixelloop:
+%macro yuv2planeX_mainloop 2
+.pixelloop_%2:
 %assign %%i 0
     ; the rep here is for the 8bit output mmx case, where dither covers
     ; 8 pixels but we can only handle 2 pixels per register, and thus 4
@@ -82,7 +82,7 @@ SECTION .text
     mova            m2,  m1
 %endif ; %1 == 8/9/10/16
     movsx     cntr_reg,  fltsizem
-.filterloop_ %+ %%i:
+.filterloop_%2_ %+ %%i:
     ; input pixels
     mov             r6, [srcq+gprsize*cntr_reg-2*gprsize]
 %if %1 == 16
@@ -129,7 +129,7 @@ SECTION .text
 %endif ; %1 == 8/9/10/16
 
     sub       cntr_reg,  2
-    jg .filterloop_ %+ %%i
+    jg .filterloop_%2_ %+ %%i
 
 %if %1 == 16
     psrad           m2,  31 - %1
@@ -156,7 +156,7 @@ SECTION .text
 %endif ; mmxext/sse2/sse4/avx
     pminsw          m2, [yuv2yuvX_%1_upper]
 %endif ; %1 == 9/10/16
-    mova   [dstq+r5*2],  m2
+    mov%2   [dstq+r5*2],  m2
 %endif ; %1 == 8/9/10/16
 
     add             r5,  mmsize/2
@@ -164,7 +164,7 @@ SECTION .text
 
 %assign %%i %%i+2
 %endrep
-    jg .pixelloop
+    jg .pixelloop_%2
 %endmacro
 
 %macro yuv2planeX_fn 3
@@ -235,7 +235,16 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
 
     xor             r5,  r5
 
-yuv2planeX_mainloop %1
+%if mmsize == 8 || %1 == 8
+    yuv2planeX_mainloop %1, a
+%else ; mmsize == 16 and %1 != 8
+    test          dstq, 15 ; any of the low 4 address bits set => dst is not 16-byte aligned
+    jnz .unaligned ; misaligned dst: use the loop instance that stores with mov%2 = movu
+    yuv2planeX_mainloop %1, a
+    REP_RET
+.unaligned: ; second expansion of the main loop; its labels carry the _u suffix, so they do not clash with the _a copy
+    yuv2planeX_mainloop %1, u
+%endif ; mmsize == 8/16
 
 %if %1 == 8
 %if ARCH_X86_32