diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm index 9570969cea..133817cb71 100644 --- a/libswscale/x86/output.asm +++ b/libswscale/x86/output.asm @@ -54,8 +54,8 @@ SECTION .text ; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple ; of 2. $offset is either 0 or 3. $dither holds 8 values. ;----------------------------------------------------------------------------- -%macro yuv2planeX_mainloop 1 -.pixelloop: +%macro yuv2planeX_mainloop 2 +.pixelloop_%2: %assign %%i 0 ; the rep here is for the 8bit output mmx case, where dither covers ; 8 pixels but we can only handle 2 pixels per register, and thus 4 @@ -82,7 +82,7 @@ SECTION .text mova m2, m1 %endif ; %1 == 8/9/10/16 movsx cntr_reg, fltsizem -.filterloop_ %+ %%i: +.filterloop_%2_ %+ %%i: ; input pixels mov r6, [srcq+gprsize*cntr_reg-2*gprsize] %if %1 == 16 @@ -129,7 +129,7 @@ SECTION .text %endif ; %1 == 8/9/10/16 sub cntr_reg, 2 - jg .filterloop_ %+ %%i + jg .filterloop_%2_ %+ %%i %if %1 == 16 psrad m2, 31 - %1 @@ -156,7 +156,7 @@ SECTION .text %endif ; mmxext/sse2/sse4/avx pminsw m2, [yuv2yuvX_%1_upper] %endif ; %1 == 9/10/16 - mova [dstq+r5*2], m2 + mov%2 [dstq+r5*2], m2 %endif ; %1 == 8/9/10/16 add r5, mmsize/2 @@ -164,7 +164,7 @@ SECTION .text %assign %%i %%i+2 %endrep - jg .pixelloop + jg .pixelloop_%2 %endmacro %macro yuv2planeX_fn 3 @@ -235,7 +235,16 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset xor r5, r5 -yuv2planeX_mainloop %1 +%if mmsize == 8 || %1 == 8 + yuv2planeX_mainloop %1, a +%else ; mmsize == 16 + test dstq, 15 + jnz .unaligned + yuv2planeX_mainloop %1, a + REP_RET +.unaligned: + yuv2planeX_mainloop %1, u +%endif ; mmsize == 8/16 %if %1 == 8 %if ARCH_X86_32