; FFmpeg/libavcodec/x86/dwt_yasm.asm

;******************************************************************************
;* MMX optimized discrete wavelet transform
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

; Constant tables of packed signed 16-bit words. Each table is 16 bytes long
; and is loaded whole into a vector register with mova by the code below.
SECTION_RODATA
; rounding term added before the >>1 shifts
pw_1: times 8 dw 1
; rounding term handed to COMPOSE_53iL0 for its >>2 shift
pw_2: times 8 dw 2
; rounding term folded into the dd97 filter before its >>4 shift
pw_8: times 8 dw 8
; rounding term folded into the dd137 filter before its >>5 shift
pw_16: times 8 dw 16
; alternating (9, -1) word pairs: the pmaddwd multiplier that turns an
; interleaved (a, b) word pair into the 32-bit value 9*a - b
pw_1991: times 4 dw 9,-1

section .text

; COMPOSE_53iL0 dst, a, b, round
; %1 -= (%2 + %3 + 2)>>2     %4 is pw_2
;   %1  destination/accumulator, updated in place
;   %2  first addend; overwritten with the shifted sum, so it must be a
;       register the caller can afford to lose
;   %3  second addend (register or memory operand), read only
;   %4  rounding constant added before the shift; the "+ 2" in the formula
;       above is only true when the caller passes pw_2 here
; All arithmetic is on packed signed 16-bit words (paddw/psraw/psubw).
%macro COMPOSE_53iL0 4
    paddw   %2, %3          ; sum the two neighbours
    paddw   %2, %4          ; add the rounding constant
    psraw   %2, 2           ; arithmetic shift keeps the sign of the sum
    psubw   %1, %2          ; subtract the prediction from the destination
%endm

; COMPOSE_DD97iH0 base, inner, outer [, base_mem]
; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
; if %4 is supplied, %1 is loaded unaligned from there
; m2: clobbered  m3: pw_8  m4: pw_1991
;
; Register contract, as used by the instructions below:
;   in   m0, %3   the two taps that enter with weight -1
;   in   m1, %2   the two taps that enter with weight 9
;   in   m3       rounding constant (the "+ 8" holds when m3 is pw_8)
;   in   m4       (9, -1) multiplier pairs for pmaddwd
;   out  m1       result, packed back to signed words with saturation
;   m0 and m2 are overwritten as temporaries (m0 is modified as well as m2).
; The products are formed in 32-bit lanes via pmaddwd, so the weighted sum
; itself cannot wrap at 16 bits; only the final packssdw narrows it.
%macro COMPOSE_DD97iH0 3-4
    paddw   m0, %3          ; m0 = sum of the two weight -1 taps
    paddw   m1, %2          ; m1 = sum of the two weight  9 taps
    psubw   m0, m3          ; pre-subtract the rounding term: -(m0 - r) = -m0 + r
    mova    m2, m1
    punpcklwd m1, m0        ; interleave (inner, outer) pairs, low half
    punpckhwd m2, m0        ; interleave (inner, outer) pairs, high half
    pmaddwd m1, m4          ; 9*inner - outer, as 32-bit lanes
    pmaddwd m2, m4
%if %0 > 3
    ; Optional late load of the base term. It sits after the last read of m0
    ; above, which is what lets a caller pass m0 itself as the first operand.
    movu    %1, %4
%endif
    psrad   m1, 4
    psrad   m2, 4
    packssdw m1, m2         ; back to 8 (or 4) signed words, saturating
    paddw   m1, %1          ; add the base term
%endm

; COMPOSE_VERTICAL suffix
;
; Emits the five vertical lifting kernels with the given name suffix, for the
; vector width selected by the preceding INIT_MMX / INIT_XMM.
;
; Conventions shared by every kernel in this macro:
;   * Elements are 16-bit words; one iteration handles mmsize/2 of them.
;   * Rows are walked from the highest index down to 0. width is decremented
;     first and then used as the element index, and jg tests the flags left
;     by that sub (the vector instructions in between do not touch EFLAGS).
;   * The loop body always runs at least once, and if width is not a positive
;     multiple of mmsize/2 the final iteration indexes below the row pointers.
;   * Every row access uses mova, so the row pointers have to satisfy that
;     instruction's alignment requirement.
;   * width is a C int: on x86-64 its upper 32 bits are not guaranteed to be
;     clean, hence the explicit 32-bit self-move before it is used as widthq.
;   * The third cglobal number is the count of vector registers actually
;     touched by the function body (m0..m(n-1)).
%macro COMPOSE_VERTICAL 1
; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                  int width)
; b1[i] -= (b0[i] + b2[i] + 2) >> 2
; Uses m0-m2.
cglobal vertical_compose53iL0_%1, 4,4,3, b0, b1, b2, width
    mova    m2, [pw_2]
%if ARCH_X86_64
    mov     widthd, widthd          ; zero-extend the int argument
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m1, [b0q+2*widthq]
    mova    m0, [b1q+2*widthq]
    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
    mova    [b1q+2*widthq], m0
    jg      .loop
    REP_RET

; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                  int width)
; b1[i] += (b0[i] + b2[i] + 1) >> 1
; Uses m0-m1.
cglobal vertical_compose_dirac53iH0_%1, 4,4,2, b0, b1, b2, width
    mova    m1, [pw_1]
%if ARCH_X86_64
    mov     widthd, widthd          ; zero-extend the int argument
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    paddw   m0, [b2q+2*widthq]
    paddw   m0, m1
    psraw   m0, 1
    paddw   m0, [b1q+2*widthq]
    mova    [b1q+2*widthq], m0
    jg      .loop
    REP_RET

; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                               IDWTELEM *b3, IDWTELEM *b4, int width)
; b2[i] += (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] + 8) >> 4
; Uses m0-m4 (m3 = pw_8, m4 = pw_1991 as COMPOSE_DD97iH0 expects).
cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
    mova    m3, [pw_8]
    mova    m4, [pw_1991]
%if ARCH_X86_64
    mov     widthd, widthd          ; zero-extend the int argument
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    mova    m1, [b1q+2*widthq]
    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
    mova    [b2q+2*widthq], m1
    jg      .loop
    REP_RET

; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                IDWTELEM *b3, IDWTELEM *b4, int width)
; b2[i] -= (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] + 16) >> 5
; Same structure as the dd97 filter, but with rounding 16, shift 5, and the
; result subtracted from b2 instead of added. Uses m0-m5.
cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
    mova    m3, [pw_16]
    mova    m4, [pw_1991]
%if ARCH_X86_64
    mov     widthd, widthd          ; zero-extend the int argument
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    mova    m1, [b1q+2*widthq]
    mova    m5, [b2q+2*widthq]
    paddw   m0, [b4q+2*widthq]      ; outer taps, weight -1
    paddw   m1, [b3q+2*widthq]      ; inner taps, weight  9
    psubw   m0, m3                  ; fold the +16 rounding into the -1 term
    mova    m2, m1
    punpcklwd m1, m0
    punpckhwd m2, m0
    pmaddwd m1, m4                  ; 9*inner - outer in 32-bit lanes
    pmaddwd m2, m4
    psrad   m1, 5
    psrad   m2, 5
    packssdw m1, m2
    psubw   m5, m1
    mova    [b2q+2*widthq], m5
    jg      .loop
    REP_RET

; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
; b0[i] -= (b1[i] + 1) >> 1
; b1[i] += b0[i]               (using the b0 value just written)
; Uses m0-m3.
cglobal vertical_compose_haar_%1, 3,4,4, b0, b1, width
    mova    m3, [pw_1]
%if ARCH_X86_64
    mov     widthd, widthd          ; zero-extend the int argument
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m1, [b1q+2*widthq]
    mova    m0, [b0q+2*widthq]
    mova    m2, m1                  ; keep the unshifted b1 for the second step
    paddw   m1, m3
    psraw   m1, 1
    psubw   m0, m1
    mova    [b0q+2*widthq], m0
    paddw   m2, m0
    mova    [b1q+2*widthq], m2
    jg      .loop
    REP_RET
%endmacro

; EDGE_EXTENSION left, right, scratch
; extend the left and right edges of the tmp array by %1 and %2 respectively
;   %1  number of elements to replicate below tmp[0]
;   %2  number of elements to replicate from tmp[w2] upwards
;   %3  scratch general-purpose register; its width sets the copy size, and
;       the 2-byte stride below matches a 16-bit register (the caller in this
;       file passes xw)
; Relies on the enclosing function providing tmpq (base of tmp) and w2q
; (number of valid elements in tmp); both are read, neither is modified.
; After expansion: tmp[-1..-%1] = tmp[0] and tmp[w2..w2+%2-1] = tmp[w2-1].
%macro EDGE_EXTENSION 3
    mov     %3, [tmpq]
%assign %%i 1
%rep %1
    mov     [tmpq-2*%%i], %3
    %assign %%i %%i+1
%endrep
    mov     %3, [tmpq+2*w2q-2]
%assign %%i 0
%rep %2
    mov     [tmpq+2*w2q+2*%%i], %3
    %assign %%i %%i+1
%endrep
%endmacro


; HAAR_HORIZONTAL suffix, shift
;
; Emits horizontal_compose_haar<shift>i_<suffix>: one row of inverse Haar
; lifting followed by interleaving of the two half-rows back into b.
;   suffix  name suffix for the emitted symbol
;   shift   0: store the reconstructed samples as they are
;           1: store (sample + 1) >> 1 for both outputs
;
; Row layout on entry: the first half of b holds one band and the second
; half, starting width bytes into b (element width/2), holds the other.
;
; Pass 1 (.lowpass_loop), for x in [0, width/2) in steps of mmsize/2:
;     tmp[x] = b[x] - ((b[width/2 + x] + 1) >> 1)
;   The loop is not trimmed to a whole number of vectors, so up to
;   mmsize/2 - 1 elements beyond width/2 are read and written.
; Pass 2 (.highpass_loop), only over width/2 rounded down to a multiple of
; mmsize/2 (the remainder is left untouched here):
;     even = tmp[x]
;     odd  = tmp[x] + b[width/2 + x]
;     b[2x] = even, b[2x+1] = odd        (each rounded-shifted when shift == 1)
;
; b and tmp are accessed with mova and therefore need that instruction's
; alignment; the second half of b is read with movu.
%macro HAAR_HORIZONTAL 2
; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
    mov    w2d, wd                  ; 32-bit move: w2q = zero-extended width
    xor     xq, xq
    ; Form the second-half pointer from the zero-extended copy. width is a C
    ; int, so the upper half of wq is not guaranteed to be clean on x86-64
    ; and must not be used in an address.
    lea  b_w2q, [bq+w2q]
    shr    w2d, 1                   ; w2 = width / 2 elements
    mova    m3, [pw_1]
.lowpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [bq    + 2*xq]
    paddw   m1, m3
    psraw   m1, 1
    psubw   m0, m1
    mova    [tmpq + 2*xq], m0
    add     xq, mmsize/2
    cmp     xq, w2q
    jl      .lowpass_loop

    ; second pass only covers whole vectors; skip it if there are none
    xor     xq, xq
    and    w2q, ~(mmsize/2 - 1)
    cmp    w2q, mmsize/2
    jl      .end

.highpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [tmpq  + 2*xq]
    paddw   m1, m0

    ; shift and interleave
%if %2 == 1
    paddw   m0, m3
    paddw   m1, m3
    psraw   m0, 1
    psraw   m1, 1
%endif
    mova    m2, m0
    punpcklwd m0, m1                ; (even, odd) pairs, low half
    punpckhwd m2, m1                ; (even, odd) pairs, high half
    mova    [bq+4*xq], m0
    mova    [bq+4*xq+mmsize], m2

    add     xq, mmsize/2
    cmp     xq, w2q
    jl      .highpass_loop
.end:
    REP_RET
%endmacro


INIT_XMM
; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
;
; One row of the inverse dd97 wavelet, SSSE3 version (needs palignr).
; With w2 = width/2 and h[] = the second half of b (starting width bytes in):
;
; Pass 1 (.lowpass_loop), x in [0, w2) in steps of 8:
;     tmp[x] = b[x] - ((h[x-1] + h[x] + 2) >> 2)      with h[-1] taken as h[0]
;   m4 carries the previous vector of h so that palignr can build the
;   "shifted by one element" view; it is primed with h[0] in its top word.
; Edge extension: tmp[-1] = tmp[0], tmp[w2] = tmp[w2+1] = tmp[w2-1].
; Pass 2 (.highpass_loop), x in [0, w2 & ~7) in steps of 8:
;     odd  = h[x] + ((-tmp[x-1] + 9*tmp[x] + 9*tmp[x+1] - tmp[x+2] + 8) >> 4)
;     b[2x]   = (tmp[x] + 1) >> 1
;     b[2x+1] = (odd    + 1) >> 1
;   Elements past the last whole vector are left for the caller to finish.
;
; Memory requirements visible in the code: b and tmp are accessed with mova;
; 16 bytes below tmp are read (only the top word is used), and pass 2 reads
; one full vector ahead of the one being produced.
cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
    mov    w2d, wd                  ; 32-bit move: w2q = zero-extended width
    xor     xd, xd
    ; Address the second half through the zero-extended copy: width is a C
    ; int, so the upper half of wq is not guaranteed to be clean on x86-64.
    lea  b_w2q, [bq+w2q]
    shr    w2d, 1                   ; w2 = width / 2 elements
    movu    m4, [b_w2q]
    mova    m7, [pw_2]
    pslldq  m4, 14                  ; keep only h[0], moved to the top word
.lowpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [bq    + 2*xq]
    mova    m2, m1
    palignr m1, m4, 14              ; m1 = h[x-1 .. x+6]
    mova    m4, m2                  ; remember h[x .. x+7] for the next round
    COMPOSE_53iL0 m0, m1, m2, m7
    mova    [tmpq + 2*xq], m0
    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .lowpass_loop

    EDGE_EXTENSION 1, 2, xw
    ; leave the last up to 7 (sse) or 3 (mmx) values for C
    xor     xd, xd                  ; also clears what EDGE_EXTENSION left in xw
    and    w2d, ~(mmsize/2 - 1)
    cmp    w2d, mmsize/2
    jl      .end

    mova    m7, [tmpq-mmsize]       ; only its top word, tmp[-1], is consumed
    mova    m0, [tmpq]
    mova    m5, [pw_1]
    mova    m3, [pw_8]
    mova    m4, [pw_1991]
.highpass_loop:
    mova    m6, m0                  ; m6 = tmp[x   .. x+7]
    palignr m0, m7, 14              ; m0 = tmp[x-1 .. x+6]
    mova    m7, [tmpq + 2*xq + mmsize]  ; next vector: tmp[x+8 .. x+15]
    mova    m1, m7
    mova    m2, m7
    palignr m1, m6, 2               ; m1 = tmp[x+1 .. x+8]
    palignr m2, m6, 4               ; m2 = tmp[x+2 .. x+9]
    ; m1 = h[x] + ((-m0 + 9*m1 + 9*m6 - m2 + 8) >> 4); m0 is reloaded with h
    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
    mova    m0, m7                  ; next vector becomes the current one
    mova    m7, m6                  ; current vector becomes the previous one

    ; shift and interleave
    paddw   m6, m5
    paddw   m1, m5
    psraw   m6, 1
    psraw   m1, 1
    mova    m2, m6
    punpcklwd m6, m1
    punpckhwd m2, m1
    mova    [bq+4*xq], m6
    mova    [bq+4*xq+mmsize], m2

    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .highpass_loop
.end:
    REP_RET


; Instantiate the templated kernels once per instruction set.

; MMX variants are only assembled for 32-bit builds.
%if ARCH_X86_64 == 0
INIT_MMX
COMPOSE_VERTICAL mmx
HAAR_HORIZONTAL mmx, 0
HAAR_HORIZONTAL mmx, 1
%endif

; SSE2 variants are assembled for every build. The Haar macro is expanded
; twice: second argument 0 (no final shift) and 1 (final rounding shift).
INIT_XMM
COMPOSE_VERTICAL sse2
HAAR_HORIZONTAL sse2, 0
HAAR_HORIZONTAL sse2, 1
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`;******************************************************************************`
			`;* MMX optimized discrete wavelet transform`
			`;* Copyright (c) 2010 David Conrad`
			`;*`
			`;* This file is part of FFmpeg.`
			`;*`
			`;* FFmpeg is free software; you can redistribute it and/or`
			`;* modify it under the terms of the GNU Lesser General Public`
			`;* License as published by the Free Software Foundation; either`
			`;* version 2.1 of the License, or (at your option) any later version.`
			`;*`
			`;* FFmpeg is distributed in the hope that it will be useful,`
			`;* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`;* Lesser General Public License for more details.`
			`;*`
			`;* You should have received a copy of the GNU Lesser General Public`
			`;* License along with FFmpeg; if not, write to the Free Software`
			`;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`;******************************************************************************`

Merge commit '04581c8c77ce779e4e70684ac45302972766be0f' * commit '04581c8c77ce779e4e70684ac45302972766be0f': x86: yasm: Use complete source path for macro helper %includes Conflicts: Makefile Merged-by: Michael Niedermayer <michaelni@gmx.at> 2012-10-31 15:57:09 +03:00			`%include "libavutil/x86/x86util.asm"`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00
			`SECTION_RODATA`
			`pw_1: times 8 dw 1`
			`pw_2: times 8 dw 2`
			`pw_8: times 8 dw 8`
			`pw_16: times 8 dw 16`
			`pw_1991: times 4 dw 9,-1`

			`section .text`

			`; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2`
			`%macro COMPOSE_53iL0 4`
			`paddw %2, %3`
			`paddw %2, %4`
			`psraw %2, 2`
			`psubw %1, %2`
			`%endm`

			`; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4`
			`; if %4 is supplied, %1 is loaded unaligned from there`
			`; m2: clobbered m3: pw_8 m4: pw_1991`
			`%macro COMPOSE_DD97iH0 3-4`
			`paddw m0, %3`
			`paddw m1, %2`
			`psubw m0, m3`
			`mova m2, m1`
			`punpcklwd m1, m0`
			`punpckhwd m2, m0`
			`pmaddwd m1, m4`
			`pmaddwd m2, m4`
			`%if %0 > 3`
			`movu %1, %4`
			`%endif`
			`psrad m1, 4`
			`psrad m2, 4`
			`packssdw m1, m2`
			`paddw m1, %1`
			`%endm`

			`%macro COMPOSE_VERTICAL 1`
			`; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,`
			`; int width)`
			`cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width`
			`mova m2, [pw_2]`
x86/dirac: fix asm on win64 This could also be fixed by changing the argument type if someone prefers that and wants to change it ... Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-01-15 01:26:33 +03:00			`%if ARCH_X86_64`
			`mov widthd, widthd`
			`%endif`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`.loop:`
dwt_yasm/vertical_compose: fix width witdth argument. Fixes out of array accesses Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2012-11-11 14:40:38 +03:00			`sub widthq, mmsize/2`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`mova m1, [b0q+2*widthq]`
			`mova m0, [b1q+2*widthq]`
			`COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2`
			`mova [b1q+2*widthq], m0`
			`jg .loop`
			`REP_RET`

			`; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,`
			`; int width)`
			`cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width`
			`mova m1, [pw_1]`
x86/dirac: fix asm on win64 This could also be fixed by changing the argument type if someone prefers that and wants to change it ... Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-01-15 01:26:33 +03:00			`%if ARCH_X86_64`
			`mov widthd, widthd`
			`%endif`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`.loop:`
dwt_yasm/vertical_compose: fix width witdth argument. Fixes out of array accesses Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2012-11-11 14:40:38 +03:00			`sub widthq, mmsize/2`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`mova m0, [b0q+2*widthq]`
			`paddw m0, [b2q+2*widthq]`
			`paddw m0, m1`
			`psraw m0, 1`
			`paddw m0, [b1q+2*widthq]`
			`mova [b1q+2*widthq], m0`
			`jg .loop`
			`REP_RET`

			`; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,`
			`; IDWTELEM *b3, IDWTELEM *b4, int width)`
			`cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width`
			`mova m3, [pw_8]`
			`mova m4, [pw_1991]`
x86/dirac: fix asm on win64 This could also be fixed by changing the argument type if someone prefers that and wants to change it ... Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-01-15 01:26:33 +03:00			`%if ARCH_X86_64`
			`mov widthd, widthd`
			`%endif`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`.loop:`
dwt_yasm/vertical_compose: fix width witdth argument. Fixes out of array accesses Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2012-11-11 14:40:38 +03:00			`sub widthq, mmsize/2`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`mova m0, [b0q+2*widthq]`
			`mova m1, [b1q+2*widthq]`
			`COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]`
			`mova [b2q+2*widthq], m1`
			`jg .loop`
			`REP_RET`

			`; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,`
			`; IDWTELEM *b3, IDWTELEM *b4, int width)`
			`cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width`
			`mova m3, [pw_16]`
			`mova m4, [pw_1991]`
x86/dirac: fix asm on win64 This could also be fixed by changing the argument type if someone prefers that and wants to change it ... Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-01-15 01:26:33 +03:00			`%if ARCH_X86_64`
			`mov widthd, widthd`
			`%endif`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`.loop:`
dwt_yasm/vertical_compose: fix width witdth argument. Fixes out of array accesses Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2012-11-11 14:40:38 +03:00			`sub widthq, mmsize/2`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`mova m0, [b0q+2*widthq]`
			`mova m1, [b1q+2*widthq]`
			`mova m5, [b2q+2*widthq]`
			`paddw m0, [b4q+2*widthq]`
			`paddw m1, [b3q+2*widthq]`
			`psubw m0, m3`
			`mova m2, m1`
			`punpcklwd m1, m0`
			`punpckhwd m2, m0`
			`pmaddwd m1, m4`
			`pmaddwd m2, m4`
			`psrad m1, 5`
			`psrad m2, 5`
			`packssdw m1, m2`
			`psubw m5, m1`
			`mova [b2q+2*widthq], m5`
			`jg .loop`
			`REP_RET`

			`; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)`
			`cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width`
			`mova m3, [pw_1]`
x86/dirac: fix asm on win64 This could also be fixed by changing the argument type if someone prefers that and wants to change it ... Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2013-01-15 01:26:33 +03:00			`%if ARCH_X86_64`
			`mov widthd, widthd`
			`%endif`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`.loop:`
dwt_yasm/vertical_compose: fix width witdth argument. Fixes out of array accesses Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2012-11-11 14:40:38 +03:00			`sub widthq, mmsize/2`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`mova m1, [b1q+2*widthq]`
			`mova m0, [b0q+2*widthq]`
			`mova m2, m1`
			`paddw m1, m3`
			`psraw m1, 1`
			`psubw m0, m1`
			`mova [b0q+2*widthq], m0`
			`paddw m2, m0`
			`mova [b1q+2*widthq], m2`
			`jg .loop`
			`REP_RET`
			`%endmacro`

			`; extend the left and right edges of the tmp array by %1 and %2 respectively`
			`%macro EDGE_EXTENSION 3`
			`mov %3, [tmpq]`
			`%assign %%i 1`
			`%rep %1`
			`mov [tmpq-2*%%i], %3`
			`%assign %%i %%i+1`
			`%endrep`
			`mov %3, [tmpq+2*w2q-2]`
			`%assign %%i 0`
			`%rep %2`
			`mov [tmpq+2*w2q+2*%%i], %3`
			`%assign %%i %%i+1`
			`%endrep`
			`%endmacro`


			`%macro HAAR_HORIZONTAL 2`
			`; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)`
			`cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2`
			`mov w2d, wd`
dirac: Fix mmx/sse haar wavelet compose Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-11-01 23:41:01 +03:00			`xor xq, xq`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`shr w2d, 1`
			`lea b_w2q, [bq+wq]`
			`mova m3, [pw_1]`
			`.lowpass_loop:`
			`movu m1, [b_w2q + 2*xq]`
			`mova m0, [bq + 2*xq]`
			`paddw m1, m3`
			`psraw m1, 1`
			`psubw m0, m1`
			`mova [tmpq + 2*xq], m0`
dirac: Fix mmx/sse haar wavelet compose Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-11-01 23:41:01 +03:00			`add xq, mmsize/2`
			`cmp xq, w2q`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`jl .lowpass_loop`

dirac: Fix mmx/sse haar wavelet compose Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-11-01 23:41:01 +03:00			`xor xq, xq`
			`and w2q, ~(mmsize/2 - 1)`
			`cmp w2q, mmsize/2`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`jl .end`

			`.highpass_loop:`
dirac: fix segfault in horizontal_compose_haar Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-31 18:34:26 +03:00			`movu m1, [b_w2q + 2*xq]`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`mova m0, [tmpq + 2*xq]`
			`paddw m1, m0`

			`; shift and interleave`
			`%if %2 == 1`
			`paddw m0, m3`
			`paddw m1, m3`
			`psraw m0, 1`
			`psraw m1, 1`
			`%endif`
			`mova m2, m0`
			`punpcklwd m0, m1`
			`punpckhwd m2, m1`
			`mova [bq+4*xq], m0`
			`mova [bq+4*xq+mmsize], m2`

dirac: Fix mmx/sse haar wavelet compose Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-11-01 23:41:01 +03:00			`add xq, mmsize/2`
			`cmp xq, w2q`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`jl .highpass_loop`
			`.end:`
dirac: Fix mmx/sse haar wavelet compose Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-11-01 23:41:01 +03:00			`REP_RET`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`%endmacro`


			`INIT_XMM`
			`; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)`
			`cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2`
			`mov w2d, wd`
			`xor xd, xd`
			`shr w2d, 1`
			`lea b_w2q, [bq+wq]`
			`movu m4, [bq+wq]`
			`mova m7, [pw_2]`
			`pslldq m4, 14`
			`.lowpass_loop:`
			`movu m1, [b_w2q + 2*xq]`
			`mova m0, [bq + 2*xq]`
			`mova m2, m1`
			`palignr m1, m4, 14`
			`mova m4, m2`
			`COMPOSE_53iL0 m0, m1, m2, m7`
			`mova [tmpq + 2*xq], m0`
			`add xd, mmsize/2`
			`cmp xd, w2d`
			`jl .lowpass_loop`

			`EDGE_EXTENSION 1, 2, xw`
			`; leave the last up to 7 (sse) or 3 (mmx) values for C`
			`xor xd, xd`
			`and w2d, ~(mmsize/2 - 1)`
			`cmp w2d, mmsize/2`
			`jl .end`

			`mova m7, [tmpq-mmsize]`
			`mova m0, [tmpq]`
			`mova m5, [pw_1]`
			`mova m3, [pw_8]`
			`mova m4, [pw_1991]`
			`.highpass_loop:`
			`mova m6, m0`
			`palignr m0, m7, 14`
			`mova m7, [tmpq + 2*xq + 16]`
			`mova m1, m7`
			`mova m2, m7`
			`palignr m1, m6, 2`
			`palignr m2, m6, 4`
			`COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]`
			`mova m0, m7`
			`mova m7, m6`

			`; shift and interleave`
			`paddw m6, m5`
			`paddw m1, m5`
			`psraw m6, 1`
			`psraw m1, 1`
			`mova m2, m6`
			`punpcklwd m6, m1`
			`punpckhwd m2, m1`
			`mova [bq+4*xq], m6`
			`mova [bq+4*xq+mmsize], m2`

			`add xd, mmsize/2`
			`cmp xd, w2d`
			`jl .highpass_loop`
			`.end:`
dirac: fix horizontal_compose_dd97i_ssse3 Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-11-01 23:55:37 +03:00			`REP_RET`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00

dirac_yasm: fix linking failure due to %ifndef Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2012-01-28 11:06:03 +03:00			`%if ARCH_X86_64 == 0`
DIRAC Decoder stable version, MMX support removed. Look for MMX_DISABLED to find the disabled functions. Authors of this code are Marco Gerards <marco@gnu.org> and David Conrad <lessen42@gmail.com> With changes from Jordi Ortiz <nenjordi@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-10-06 18:57:17 +03:00			`INIT_MMX`
			`COMPOSE_VERTICAL mmx`
			`HAAR_HORIZONTAL mmx, 0`
			`HAAR_HORIZONTAL mmx, 1`
			`%endif`

			`;;INIT_XMM`
			`INIT_XMM`
			`COMPOSE_VERTICAL sse2`
			`HAAR_HORIZONTAL sse2, 0`
			`HAAR_HORIZONTAL sse2, 1`