FFmpeg/libavcodec/x86/vp56dsp.asm

;******************************************************************************
;* MMX/SSE2-optimized functions for the VP6 decoder
;* Copyright (C) 2009  Sebastien Lucas <sebastien.lucas@gmail.com>
;* Copyright (C) 2009  Zuxy Meng <zuxy.meng@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86inc.asm"
%include "libavutil/x86/x86util.asm"

cextern pw_64

SECTION .text

%macro DIAG4_MMX 6
; 4-tap weighted filter over 8 pixels, MMX flavour.
;   %1     = register holding the base address of the input samples
;   %2-%5  = byte offsets of the four taps relative to %1
;   %6     = register holding the address that receives the 8 result bytes
; Expects m7 = 0, m6 = rounding constant (64 per word) and the four weights,
; each replicated across a quadword, in [rsp+8*11] .. [rsp+8*14].
; Each 8-pixel load is split into a low half (m0/m1/m2) and a high half
; (m3/m4/m5) because one mm register only holds four 16-bit words.
; Clobbers m0-m5.

    ; tap 0 * weight[0]
    movq          m0, [%1+%2]
    movq          m3, m0
    punpcklbw     m0, m7         ; low  4 pixels -> words
    punpckhbw     m3, m7         ; high 4 pixels -> words
    pmullw        m0, [rsp+8*11]
    pmullw        m3, [rsp+8*11]

    ; tap 1 * weight[1], accumulated onto tap 0
    movq          m1, [%1+%3]
    movq          m4, m1
    punpcklbw     m1, m7
    punpckhbw     m4, m7
    pmullw        m1, [rsp+8*12]
    pmullw        m4, [rsp+8*12]
    paddw         m0, m1
    paddw         m3, m4

    ; tap 2 * weight[2]
    movq          m1, [%1+%4]
    movq          m4, m1
    punpcklbw     m1, m7
    punpckhbw     m4, m7
    pmullw        m1, [rsp+8*13]
    pmullw        m4, [rsp+8*13]

    ; tap 3 * weight[3], accumulated onto tap 2
    movq          m2, [%1+%5]
    movq          m5, m2
    punpcklbw     m2, m7
    punpckhbw     m5, m7
    pmullw        m2, [rsp+8*14]
    pmullw        m5, [rsp+8*14]
    paddw         m1, m2
    paddw         m4, m5

    ; merge both partial sums (saturating), round, scale and narrow
    paddsw        m0, m1
    paddsw        m3, m4
    paddsw        m0, m6         ; + 64
    paddsw        m3, m6         ; + 64
    psraw         m0, 7          ; >> 7
    psraw         m3, 7
    packuswb      m0, m3         ; words -> unsigned saturated bytes
    movq        [%6], m0
%endmacro

%macro DIAG4_SSE2 6
; 4-tap weighted filter over 8 pixels, SSE2 flavour.
;   %1     = register holding the base address of the input samples
;   %2-%5  = byte offsets of the four taps relative to %1
;   %6     = register holding the address that receives the 8 result bytes
; Expects m7 = 0 and the broadcast weights in m4 (weight[0]), m5 (weight[1]),
; m6 (weight[2]) and m3 (weight[3]), as left behind by SPLAT4REGS_SSE2.
; Clobbers m0-m2.

    ; tap 0 * weight[0]
    movq          m0, [%1+%2]
    punpcklbw     m0, m7         ; 8 pixels -> 8 words
    pmullw        m0, m4

    ; tap 1 * weight[1], accumulated onto tap 0
    movq          m1, [%1+%3]
    punpcklbw     m1, m7
    pmullw        m1, m5
    paddw         m0, m1

    ; tap 2 * weight[2]
    movq          m1, [%1+%4]
    punpcklbw     m1, m7
    pmullw        m1, m6

    ; tap 3 * weight[3], accumulated onto tap 2
    movq          m2, [%1+%5]
    punpcklbw     m2, m7
    pmullw        m2, m3
    paddw         m1, m2

    ; merge both partial sums (saturating), round, scale and narrow
    paddsw        m0, m1
    paddsw        m0, [pw_64]    ; + 64
    psraw         m0, 7          ; >> 7
    packuswb      m0, m0         ; words -> unsigned saturated bytes
    movq        [%6], m0         ; only the low 8 bytes are written
%endmacro

%macro SPLAT4REGS_MMX 0
; Broadcast the four 16-bit weights held in m3 (w0 in the lowest word) and
; spill each replicated quadword to the stack slots read by DIAG4_MMX:
;   [rsp+8*11] = w0 x4, [rsp+8*12] = w1 x4,
;   [rsp+8*13] = w2 x4, [rsp+8*14] = w3 x4
; Clobbers m2-m5.
    movq         m5, m3          ; keep a copy for the two upper weights
    punpcklwd    m3, m3          ; m3 = w0 w0 w1 w1
    punpckhwd    m5, m5          ; m5 = w2 w2 w3 w3
    movq         m4, m3
    movq         m2, m5
    punpckldq    m3, m3          ; m3 = w0 in every word
    movq [rsp+8*11], m3
    punpckhdq    m4, m4          ; m4 = w1 in every word
    movq [rsp+8*12], m4
    punpckldq    m5, m5          ; m5 = w2 in every word
    movq [rsp+8*13], m5
    punpckhdq    m2, m2          ; m2 = w3 in every word
    movq [rsp+8*14], m2
%endmacro

%macro SPLAT4REGS_SSE2 0
; Broadcast the four 16-bit weights held in the low quadword of m3 into
; full-width registers: m4 = w0, m5 = w1, m6 = w2, m3 = w3.
; pshuflw replicates one word across the low quadword; punpcklqdq then
; copies that quadword into the high half.
; m3 is shuffled last because it is also the source of the other three.
    pshuflw      m4, m3, 0x00
    punpcklqdq   m4, m4          ; m4 = w0 in all 8 words
    pshuflw      m5, m3, 0x55
    punpcklqdq   m5, m5          ; m5 = w1 in all 8 words
    pshuflw      m6, m3, 0xAA
    punpcklqdq   m6, m6          ; m6 = w2 in all 8 words
    pshuflw      m3, m3, 0xFF
    punpcklqdq   m3, m3          ; m3 = w3 in all 8 words
%endmacro

%macro vp6_filter_diag4 2
; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,
;                                const int16_t h_weight[4], const int16_t v_weights[4])
;
; Macro parameters:
;   %1 = suffix of the generated function (mmx or sse2)
;   %2 = number of XMM registers handed to cglobal
;
; Register roles (x86inc argument numbering):
;   r0 = dst, r1 = src, r2 = stride, r3 = h_weight, r4 = v_weights
;   r5 = saved stack pointer, r6 = loop counter
;   r3 is reused as the scratch-buffer cursor once h_weight has been loaded.
;
; Scratch area, relative to the realigned rsp:
;   [rsp+8*0  .. rsp+8*10]  11 rows of 8 horizontally filtered pixels
;   [rsp+8*11 .. rsp+8*14]  replicated weights (MMX build only)
;
; The filter is separable: pass 1 filters 11 source rows horizontally into
; the scratch area, pass 2 filters those rows vertically into 8 output rows.
cglobal vp6_filter_diag4_%1, 5, 7, %2
    mov          r5, rsp         ; backup stack pointer
    and         rsp, ~(mmsize-1) ; align stack
%ifidn %1, sse2
    sub         rsp, 8*11        ; 11 intermediate rows; weights stay in registers
%else
    sub         rsp, 8*15        ; 11 intermediate rows + 4 weight slots
    movq         m6, [pw_64]     ; rounding constant used by DIAG4_MMX
%endif
%ifdef ARCH_X86_64
    movsxd       r2, r2d         ; stride is a 32-bit int and may be negative
%endif

    sub          r1, r2          ; start one row above src

    pxor         m7, m7          ; zero register for byte -> word unpacking
    movq         m3, [r3]        ; four horizontal weights
    SPLAT4REGS

    ; pass 1: horizontal filter, taps at x-1 .. x+2, 11 rows into scratch
    mov          r3, rsp
    mov          r6, 11
.nextrow:
    DIAG4        r1, -1, 0, 1, 2, r3
    add          r3, 8           ; next scratch row
    add          r1, r2          ; next source row
    dec          r6
    jnz .nextrow

    movq         m3, [r4]        ; four vertical weights
    SPLAT4REGS

    ; pass 2: vertical filter over scratch rows (8 bytes apart), 8 output rows
    lea          r3, [rsp+8]     ; second scratch row, so the -8 tap is valid
    mov          r6, 8
.nextcol:
    DIAG4        r3, -8, 0, 8, 16, r0
    add          r3, 8           ; next scratch row
    add          r0, r2          ; next destination row
    dec          r6
    jnz .nextcol

    mov         rsp, r5          ; restore stack pointer
    RET
%endmacro

; Instantiate the filter once per instruction set. DIAG4 and SPLAT4REGS are
; rebound before each expansion so the shared body picks up the matching
; helper macros.

; MMX variant: requests no XMM registers from cglobal.
INIT_MMX
%define DIAG4      DIAG4_MMX
%define SPLAT4REGS SPLAT4REGS_MMX
vp6_filter_diag4 mmx,  0

; SSE2 variant: requests 8 XMM registers (m0-m7 are all used).
INIT_XMM
%define DIAG4      DIAG4_SSE2
%define SPLAT4REGS SPLAT4REGS_SSE2
vp6_filter_diag4 sse2, 8
Move vp6_filter_diag4() x86 SIMD code from inline ASM to YASM. This should help in fixing the Win64 fate failures. Originally committed as revision 24922 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-08-25 16:44:16 +03:00			`;******************************************************************************`
			`;* MMX/SSE2-optimized functions for the VP6 decoder`
			`;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>`
			`;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>`
			`;*`
			`;* This file is part of FFmpeg.`
			`;*`
			`;* FFmpeg is free software; you can redistribute it and/or`
			`;* modify it under the terms of the GNU Lesser General Public`
			`;* License as published by the Free Software Foundation; either`
			`;* version 2.1 of the License, or (at your option) any later version.`
			`;*`
			`;* FFmpeg is distributed in the hope that it will be useful,`
			`;* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`;* Lesser General Public License for more details.`
			`;*`
			`;* You should have received a copy of the GNU Lesser General Public`
			`;* License along with FFmpeg; if not, write to the Free Software`
Fix FSF address copy paste error in some license headers. 2011-05-14 22:32:31 +03:00			`;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
Move vp6_filter_diag4() x86 SIMD code from inline ASM to YASM. This should help in fixing the Win64 fate failures. Originally committed as revision 24922 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-08-25 16:44:16 +03:00			`;******************************************************************************`

Move x264asm to libavutil. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-06-05 17:19:16 +03:00			`%include "libavutil/x86/x86inc.asm"`
			`%include "libavutil/x86/x86util.asm"`
Move vp6_filter_diag4() x86 SIMD code from inline ASM to YASM. This should help in fixing the Win64 fate failures. Originally committed as revision 24922 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-08-25 16:44:16 +03:00
			`cextern pw_64`

			`SECTION .text`

			`%macro DIAG4_MMX 6`
			`movq m0, [%1+%2]`
			`movq m1, [%1+%3]`
			`movq m3, m0`
			`movq m4, m1`
			`punpcklbw m0, m7`
			`punpcklbw m1, m7`
			`punpckhbw m3, m7`
			`punpckhbw m4, m7`
			`pmullw m0, [rsp+8*11] ; src[x-8 ] * biweight [0]`
			`pmullw m1, [rsp+8*12] ; src[x ] * biweight [1]`
			`pmullw m3, [rsp+8*11] ; src[x-8 ] * biweight [0]`
			`pmullw m4, [rsp+8*12] ; src[x ] * biweight [1]`
			`paddw m0, m1`
			`paddw m3, m4`
			`movq m1, [%1+%4]`
			`movq m2, [%1+%5]`
			`movq m4, m1`
			`movq m5, m2`
			`punpcklbw m1, m7`
			`punpcklbw m2, m7`
Fix typos when converting inline asm to yasm, fixes MMX-only fate-ea-vp61. Originally committed as revision 24948 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-08-26 17:33:39 +03:00			`punpckhbw m4, m7`
			`punpckhbw m5, m7`
Move vp6_filter_diag4() x86 SIMD code from inline ASM to YASM. This should help in fixing the Win64 fate failures. Originally committed as revision 24922 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-08-25 16:44:16 +03:00			`pmullw m1, [rsp+8*13] ; src[x+8 ] * biweight [2]`
			`pmullw m2, [rsp+8*14] ; src[x+16] * biweight [3]`
			`pmullw m4, [rsp+8*13] ; src[x+8 ] * biweight [2]`
			`pmullw m5, [rsp+8*14] ; src[x+16] * biweight [3]`
			`paddw m1, m2`
			`paddw m4, m5`
			`paddsw m0, m1`
			`paddsw m3, m4`
			`paddsw m0, m6 ; Add 64`
			`paddsw m3, m6 ; Add 64`
			`psraw m0, 7`
			`psraw m3, 7`
			`packuswb m0, m3`
			`movq [%6], m0`
			`%endmacro`

			`%macro DIAG4_SSE2 6`
			`movq m0, [%1+%2]`
			`movq m1, [%1+%3]`
			`punpcklbw m0, m7`
			`punpcklbw m1, m7`
			`pmullw m0, m4 ; src[x-8 ] * biweight [0]`
			`pmullw m1, m5 ; src[x ] * biweight [1]`
			`paddw m0, m1`
			`movq m1, [%1+%4]`
			`movq m2, [%1+%5]`
			`punpcklbw m1, m7`
			`punpcklbw m2, m7`
			`pmullw m1, m6 ; src[x+8 ] * biweight [2]`
			`pmullw m2, m3 ; src[x+16] * biweight [3]`
			`paddw m1, m2`
			`paddsw m0, m1`
			`paddsw m0, [pw_64] ; Add 64`
			`psraw m0, 7`
			`packuswb m0, m0`
			`movq [%6], m0`
			`%endmacro`

			`%macro SPLAT4REGS_MMX 0`
			`movq m5, m3`
			`punpcklwd m3, m3`
			`movq m4, m3`
			`punpckldq m3, m3`
			`punpckhdq m4, m4`
			`punpckhwd m5, m5`
Fix typos when converting inline asm to yasm, fixes MMX-only fate-ea-vp61. Originally committed as revision 24948 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-08-26 17:33:39 +03:00			`movq m2, m5`
			`punpckhdq m2, m2`
Move vp6_filter_diag4() x86 SIMD code from inline ASM to YASM. This should help in fixing the Win64 fate failures. Originally committed as revision 24922 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-08-25 16:44:16 +03:00			`punpckldq m5, m5`
			`movq [rsp+8*11], m3`
			`movq [rsp+8*12], m4`
			`movq [rsp+8*13], m5`
Fix typos when converting inline asm to yasm, fixes MMX-only fate-ea-vp61. Originally committed as revision 24948 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-08-26 17:33:39 +03:00			`movq [rsp+8*14], m2`
Move vp6_filter_diag4() x86 SIMD code from inline ASM to YASM. This should help in fixing the Win64 fate failures. Originally committed as revision 24922 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-08-25 16:44:16 +03:00			`%endmacro`

			`%macro SPLAT4REGS_SSE2 0`
			`pshuflw m4, m3, 0x0`
			`pshuflw m5, m3, 0x55`
			`pshuflw m6, m3, 0xAA`
			`pshuflw m3, m3, 0xFF`
			`punpcklqdq m4, m4`
			`punpcklqdq m5, m5`
			`punpcklqdq m6, m6`
			`punpcklqdq m3, m3`
			`%endmacro`

			`%macro vp6_filter_diag4 2`
			`; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,`
			`; const int16_t h_weight[4], const int16_t v_weights[4])`
			`cglobal vp6_filter_diag4_%1, 5, 7, %2`
			`mov r5, rsp ; backup stack pointer`
			`and rsp, ~(mmsize-1) ; align stack`
			`%ifidn %1, sse2`
			`sub rsp, 8*11`
			`%else`
			`sub rsp, 8*15`
			`movq m6, [pw_64]`
			`%endif`
VP6: fix vp6_filter_diag4_mmx/sse on 64-bit The stride can be negative and must be sign extended before being used in pointer arithmetic. Originally committed as revision 24926 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-08-25 18:41:11 +03:00			`%ifdef ARCH_X86_64`
			`movsxd r2, r2d`
			`%endif`
Move vp6_filter_diag4() x86 SIMD code from inline ASM to YASM. This should help in fixing the Win64 fate failures. Originally committed as revision 24922 to svn://svn.ffmpeg.org/ffmpeg/trunk 2010-08-25 16:44:16 +03:00
			`sub r1, r2`

			`pxor m7, m7`
			`movq m3, [r3]`
			`SPLAT4REGS`

			`mov r3, rsp`
			`mov r6, 11`
			`.nextrow`
			`DIAG4 r1, -1, 0, 1, 2, r3`
			`add r3, 8`
			`add r1, r2`
			`dec r6`
			`jnz .nextrow`

			`movq m3, [r4]`
			`SPLAT4REGS`

			`lea r3, [rsp+8]`
			`mov r6, 8`
			`.nextcol`
			`DIAG4 r3, -8, 0, 8, 16, r0`
			`add r3, 8`
			`add r0, r2`
			`dec r6`
			`jnz .nextcol`

			`mov rsp, r5 ; restore stack pointer`
			`RET`
			`%endmacro`

			`INIT_MMX`
			`%define DIAG4 DIAG4_MMX`
			`%define SPLAT4REGS SPLAT4REGS_MMX`
			`vp6_filter_diag4 mmx, 0`

			`INIT_XMM`
			`%define DIAG4 DIAG4_SSE2`
			`%define SPLAT4REGS SPLAT4REGS_SSE2`
			`vp6_filter_diag4 sse2, 8`