FFmpeg/libavcodec/x86/fmtconvert.asm

;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
;                                    int len);
;
; Computes dst[i] = (float)src[i] * mul for every i in [0, len).
;
; Each loop iteration handles 8 elements (32 bytes) and the counter is only
; tested after the first pass, so len is assumed to be a positive multiple
; of 8. Results are written with mova (aligned store), so dst is assumed to
; be 16-byte aligned; the sse2 path also reads src through 16-byte memory
; operands of cvtdq2ps.
;
; Macro argument %1 is forwarded to cglobal as its fourth argument
; (presumably the number of XMM registers the function uses, per x86inc).
;------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
%if UNIX64
; mul is not declared here: the code below uses m0 directly as the multiplier,
; so only dst, src and len are named integer arguments.
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
%else
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
%endif
%if WIN64
    ; mul is the third argument and arrives in xmm2; rename it to m0.
    SWAP 0, 2
%elif ARCH_X86_32
    ; mul is read from its memory argument slot.
    movss   m0, mulm
%endif
    ; Broadcast mul to all four lanes of m0.
    SPLATD  m0
    ; len is a 32-bit int: on x86-64 the upper half of its register is not
    ; guaranteed to be zero. Shifting the dword form converts the element
    ; count to a byte count AND zero-extends it into lenq, the same way
    ; ff_int32_to_float_fmul_array8 does.
    shl     lend, 2
    ; Point src/dst one past the end and walk a negative byte offset up to 0,
    ; so a single add both advances the index and sets the loop flags.
    add     srcq, lenq
    add     dstq, lenq
    neg     lenq
.loop:
%if cpuflag(sse2)
    ; SSE2: convert four int32 values per instruction.
    cvtdq2ps  m1, [srcq+lenq   ]
    cvtdq2ps  m2, [srcq+lenq+16]
%else
    ; SSE1: cvtpi2ps converts only two int32 values into the low half of the
    ; destination, so build each 4-float vector from two halves.
    cvtpi2ps  m1, [srcq+lenq   ]
    cvtpi2ps  m3, [srcq+lenq+ 8]
    cvtpi2ps  m2, [srcq+lenq+16]
    cvtpi2ps  m4, [srcq+lenq+24]
    movlhps   m1, m3
    movlhps   m2, m4
%endif
    mulps     m1, m0
    mulps     m2, m0
    mova  [dstq+lenq   ], m1
    mova  [dstq+lenq+16], m2
    add     lenq, 32                ; advance 8 elements
    jl .loop                        ; continue while the offset is still negative
    REP_RET
%endmacro

; SSE build: the cvtpi2ps path uses m0-m4, hence 5 is passed as the macro
; argument.
INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
; SSE2 build: the cvtdq2ps path only uses m0-m2, hence 3.
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3

;------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
;                                    const float *mul, int len);
;
; Computes dst[i] = (float)src[i] * mul[i / 8]: each group of 8 consecutive
; elements is scaled by its own multiplier, and mul advances by one float
; per group.
;
; The context pointer c is not referenced by the body. The loop handles 8
; elements (32 bytes) per iteration and tests the counter only after the first
; pass, so len is assumed to be a positive multiple of 8. Stores use mova
; (aligned), so dst is assumed to be 16-byte aligned; the sse2 path also reads
; src through 16-byte memory operands of cvtdq2ps.
;------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
    ; Element count -> byte count. The dword form is used so the result is
    ; zero-extended into lenq before it is used in 64-bit address arithmetic.
    shl     lend, 2
    ; Point src/dst one past the end and walk a negative byte offset up to 0.
    add     srcq, lenq
    add     dstq, lenq
    neg     lenq
.loop:
    ; Load the multiplier for this group of 8 and broadcast it to all lanes.
    movss     m0, [mulq]
    SPLATD    m0
%if cpuflag(sse2)
    ; SSE2: convert four int32 values per instruction.
    cvtdq2ps  m1, [srcq+lenq   ]
    cvtdq2ps  m2, [srcq+lenq+16]
%else
    ; SSE1: cvtpi2ps converts two int32 values into the low half of the
    ; destination; movlhps then merges two halves into one 4-float vector.
    cvtpi2ps  m1, [srcq+lenq   ]
    cvtpi2ps  m3, [srcq+lenq+ 8]
    cvtpi2ps  m2, [srcq+lenq+16]
    cvtpi2ps  m4, [srcq+lenq+24]
    movlhps   m1, m3
    movlhps   m2, m4
%endif
    mulps     m1, m0
    mulps     m2, m0
    mova  [dstq+lenq   ], m1
    mova  [dstq+lenq+16], m2
    ; Next multiplier (one float) and next 8 elements (32 bytes).
    add     mulq, 4
    add     lenq, 32
    ; The flags come from the add to lenq: loop while the offset is negative.
    jl .loop
    REP_RET
%endmacro

; Expand the macro once per instruction set: the sse expansion takes the
; cvtpi2ps branch, the sse2 expansion takes the cvtdq2ps branch.
INIT_XMM sse
INT32_TO_FLOAT_FMUL_ARRAY8
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_ARRAY8
Separate format conversion DSP functions from DSPContext. This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil. Signed-off-by: Mans Rullgard <mans@mansr.com> (cherry picked from commit c73d99e672329c8f2df290736ffc474c360ac4ae) 2011-01-30 17:06:46 +02:00			`;******************************************************************************`
			`;* x86 optimized Format Conversion Utils`
			`;* Copyright (c) 2008 Loren Merritt`
			`;*`
			`;* This file is part of FFmpeg.`
			`;*`
			`;* FFmpeg is free software; you can redistribute it and/or`
			`;* modify it under the terms of the GNU Lesser General Public`
			`;* License as published by the Free Software Foundation; either`
			`;* version 2.1 of the License, or (at your option) any later version.`
			`;*`
			`;* FFmpeg is distributed in the hope that it will be useful,`
			`;* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`;* Lesser General Public License for more details.`
			`;*`
			`;* You should have received a copy of the GNU Lesser General Public`
			`;* License along with FFmpeg; if not, write to the Free Software`
Fix FSF address copy paste error in some license headers. 2011-05-14 22:32:31 +03:00			`;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
Separate format conversion DSP functions from DSPContext. This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil. Signed-off-by: Mans Rullgard <mans@mansr.com> (cherry picked from commit c73d99e672329c8f2df290736ffc474c360ac4ae) 2011-01-30 17:06:46 +02:00			`;******************************************************************************`

Move x264asm to libavutil. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-06-05 17:19:16 +03:00			`%include "libavutil/x86/x86util.asm"`
Separate format conversion DSP functions from DSPContext. This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil. Signed-off-by: Mans Rullgard <mans@mansr.com> (cherry picked from commit c73d99e672329c8f2df290736ffc474c360ac4ae) 2011-01-30 17:06:46 +02:00
x86inc: Drop SECTION_TEXT macro The .text section is already 16-byte aligned by default on all supported platforms so `SECTION_TEXT` isn't any different from `SECTION .text`. 2015-05-27 21:38:14 +02:00			`SECTION .text`
Separate format conversion DSP functions from DSPContext. This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil. Signed-off-by: Mans Rullgard <mans@mansr.com> (cherry picked from commit c73d99e672329c8f2df290736ffc474c360ac4ae) 2011-01-30 17:06:46 +02:00
x86: Make function prototype comments in assembly code consistent This helps grepping for functions, among other things. 2014-01-28 22:35:58 +03:00			`;------------------------------------------------------------------------------`
			`; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,`
			`; int len);`
			`;------------------------------------------------------------------------------`
x86: fmtconvert: port to cpuflags 2012-07-15 16:42:17 +03:00			`%macro INT32_TO_FLOAT_FMUL_SCALAR 1`
config.asm: change %ifdef directives to %if directives. This allows combining multiple conditionals in a single statement. 2012-01-23 12:45:58 +03:00			`%if UNIX64`
x86: fmtconvert: port to cpuflags 2012-07-15 16:42:17 +03:00			`cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len`
fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm 2011-10-10 06:52:03 +03:00			`%else`
x86: fmtconvert: port to cpuflags 2012-07-15 16:42:17 +03:00			`cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len`
fmtconvert: fix int32_to_float_fmul_scalar() for windows x86_64 The calling convention only allows 4 non-stack parameter, with each float or int register being skipped if not used. fixes Bug 64 2011-11-01 23:57:41 +03:00			`%endif`
config.asm: change %ifdef directives to %if directives. This allows combining multiple conditionals in a single statement. 2012-01-23 12:45:58 +03:00			`%if WIN64`
fmtconvert: fix int32_to_float_fmul_scalar() for windows x86_64 The calling convention only allows 4 non-stack parameter, with each float or int register being skipped if not used. fixes Bug 64 2011-11-01 23:57:41 +03:00			`SWAP 0, 2`
config.asm: change %ifdef directives to %if directives. This allows combining multiple conditionals in a single statement. 2012-01-23 12:45:58 +03:00			`%elif ARCH_X86_32`
fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm 2011-10-10 06:52:03 +03:00			`movss m0, mulm`
			`%endif`
			`SPLATD m0`
			`shl lenq, 2`
			`add srcq, lenq`
			`add dstq, lenq`
			`neg lenq`
			`.loop:`
x86: fmtconvert: port to cpuflags 2012-07-15 16:42:17 +03:00			`%if cpuflag(sse2)`
fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm 2011-10-10 06:52:03 +03:00			`cvtdq2ps m1, [srcq+lenq ]`
			`cvtdq2ps m2, [srcq+lenq+16]`
			`%else`
			`cvtpi2ps m1, [srcq+lenq ]`
			`cvtpi2ps m3, [srcq+lenq+ 8]`
			`cvtpi2ps m2, [srcq+lenq+16]`
			`cvtpi2ps m4, [srcq+lenq+24]`
			`movlhps m1, m3`
			`movlhps m2, m4`
			`%endif`
			`mulps m1, m0`
			`mulps m2, m0`
			`mova [dstq+lenq ], m1`
			`mova [dstq+lenq+16], m2`
			`add lenq, 32`
			`jl .loop`
			`REP_RET`
			`%endmacro`

x86: fmtconvert: port to cpuflags 2012-07-15 16:42:17 +03:00			`INIT_XMM sse`
			`INT32_TO_FLOAT_FMUL_SCALAR 5`
			`INIT_XMM sse2`
			`INT32_TO_FLOAT_FMUL_SCALAR 3`
fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm 2011-10-10 06:52:03 +03:00
x86/fmtconvert: add ff_int32_to_float_fmul_array8_{sse,sse2} About two times faster than the c wrapper. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com> 2014-09-27 02:46:41 +03:00			`;------------------------------------------------------------------------------`
			`; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,`
			`; const float *mul, int len);`
			`;------------------------------------------------------------------------------`
			`%macro INT32_TO_FLOAT_FMUL_ARRAY8 0`
			`cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len`
avcodec/x86/fmtconvert: Fix operand size in ff_int32_to_float_fmul_array8_sse* Fixes acodec-dca2 fate failure Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-09-28 19:56:54 +03:00			`shl lend, 2`
x86/fmtconvert: add ff_int32_to_float_fmul_array8_{sse,sse2} About two times faster than the c wrapper. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com> 2014-09-27 02:46:41 +03:00			`add srcq, lenq`
			`add dstq, lenq`
			`neg lenq`
			`.loop:`
			`movss m0, [mulq]`
			`SPLATD m0`
			`%if cpuflag(sse2)`
			`cvtdq2ps m1, [srcq+lenq ]`
			`cvtdq2ps m2, [srcq+lenq+16]`
			`%else`
			`cvtpi2ps m1, [srcq+lenq ]`
			`cvtpi2ps m3, [srcq+lenq+ 8]`
			`cvtpi2ps m2, [srcq+lenq+16]`
			`cvtpi2ps m4, [srcq+lenq+24]`
			`movlhps m1, m3`
			`movlhps m2, m4`
			`%endif`
			`mulps m1, m0`
			`mulps m2, m0`
			`mova [dstq+lenq ], m1`
			`mova [dstq+lenq+16], m2`
			`add mulq, 4`
			`add lenq, 32`
			`jl .loop`
			`REP_RET`
			`%endmacro`

			`INIT_XMM sse`
			`INT32_TO_FLOAT_FMUL_ARRAY8`
			`INIT_XMM sse2`
			`INT32_TO_FLOAT_FMUL_ARRAY8`
fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm 2011-10-10 06:52:03 +03:00