mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-26 19:01:44 +02:00
9630b3fc06
The unique user so far is wmalossless 24bits. The few samples tested show an order of 8, so more unrolling or an avx2 version do not make sense. Timings: 68 -> 49 cycles Reviewed-by: Paul B Mahol <onemda@gmail.com> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
191 lines
4.7 KiB
NASM
191 lines
4.7 KiB
NASM
;******************************************************************************
|
|
;* Copyright (c) 2008 Loren Merritt
|
|
;*
|
|
;* This file is part of FFmpeg.
|
|
;*
|
|
;* FFmpeg is free software; you can redistribute it and/or
|
|
;* modify it under the terms of the GNU Lesser General Public
|
|
;* License as published by the Free Software Foundation; either
|
|
;* version 2.1 of the License, or (at your option) any later version.
|
|
;*
|
|
;* FFmpeg is distributed in the hope that it will be useful,
|
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
;* Lesser General Public License for more details.
|
|
;*
|
|
;* You should have received a copy of the GNU Lesser General Public
|
|
;* License along with FFmpeg; if not, write to the Free Software
|
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
;******************************************************************************
|
|
|
|
%include "libavutil/x86/x86util.asm"
|
|
|
|
SECTION .text
|
|
|
|
%macro SCALARPRODUCT 0
;------------------------------------------------------------------------------
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
;
; Expands to one implementation for the register width selected by the
; preceding INIT_MMX / INIT_XMM. In a single pass it
;   - accumulates the pmaddwd products of v1 and v2 (using the v1 words as
;     loaded, i.e. before they are overwritten) and returns the total in eax;
;   - stores v1 + v3 * mul back to v1 in 16-bit lanes (pmullw keeps the low
;     16 bits of each product, paddw wraps).
;
; v1 is accessed with mova, v2 and v3 with movu. Every iteration consumes
; 2*mmsize bytes of each array and the loop body always runs at least once.
; NOTE(review): this presumably requires order to be a positive multiple of
; mmsize elements and v1 to satisfy mova's alignment -- confirm with callers.
;------------------------------------------------------------------------------
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    ; order is a 32-bit int, so the upper half of its 64-bit register is not
    ; guaranteed to be clean. Scale it with a 32-bit shift: on x86-64 that
    ; zeroes bits 63:32, making orderq safe to use for addressing below.
    shl         orderd, 1               ; order *= sizeof(int16_t)
    movd        m7, mulm
%if mmsize == 16
    pshuflw     m7, m7, 0
    punpcklqdq  m7, m7                  ; low word of mul in all 8 lanes
%else
    pshufw      m7, m7, 0               ; low word of mul in all 4 lanes
%endif
    pxor        m6, m6                  ; m6 = dword accumulator
    ; Point each array at its end and run a negative byte offset up to zero,
    ; so the add at the bottom of the loop is also the loop condition.
    add         v1q, orderq
    add         v2q, orderq
    add         v3q, orderq
    neg         orderq
.loop:
    movu        m0, [v2q + orderq]
    movu        m1, [v2q + orderq + mmsize]
    mova        m4, [v1q + orderq]
    mova        m5, [v1q + orderq + mmsize]
    movu        m2, [v3q + orderq]
    movu        m3, [v3q + orderq + mmsize]
    pmaddwd     m0, m4                  ; v1*v2, adjacent pairs summed
    pmaddwd     m1, m5
    pmullw      m2, m7                  ; low 16 bits of v3*mul
    pmullw      m3, m7
    paddd       m6, m0
    paddd       m6, m1
    paddw       m2, m4                  ; v1 + v3*mul
    paddw       m3, m5
    mova        [v1q + orderq], m2
    mova        [v1q + orderq + mmsize], m3
    add         orderq, mmsize*2
    jl          .loop
    ; HADDD comes from x86util.asm; presumably it folds the dword lanes of m6
    ; into its low dword using m0 as scratch.
    HADDD       m6, m0
    movd        eax, m6
    RET
%endmacro
|
|
|
|
; Instantiate ff_scalarproduct_and_madd_int16 once per instruction set:
; first with the MMX register setup (mmxext) ...
INIT_MMX mmxext
SCALARPRODUCT

; ... then with the XMM register setup (sse2).
INIT_XMM sse2
SCALARPRODUCT
|
|
|
|
INIT_XMM sse4
;------------------------------------------------------------------------------
; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
;                                     int order, int mul)
;
; In a single pass:
;   - sign-extends the v1 words to dwords, multiplies them with the v2 dwords
;     (pmulld, low 32 bits of each product) and accumulates with paddd; the
;     total is returned in eax;
;   - stores v1 + v3 * mul back to v1 in 16-bit lanes (pmullw / paddw).
;
; Every iteration consumes 16 bytes of v1 and v3 (8 words) and 32 bytes of v2
; (8 dwords); the loop body always runs at least once. v1 is accessed with
; mova, v2 and v3 with movu.
; NOTE(review): this presumably requires order to be a positive multiple of 8
; and v1 to be 16-byte aligned -- confirm with callers.
;------------------------------------------------------------------------------
cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
    ; order is a 32-bit int, so the upper half of its 64-bit register is not
    ; guaranteed to be clean. Scale it with a 32-bit shift: on x86-64 that
    ; zeroes bits 63:32, making orderq safe to use for addressing below.
    shl         orderd, 1               ; order *= sizeof(int16_t)
    movd        m7, mulm
    SPLATW      m7, m7                  ; x86util.asm macro; presumably
                                        ; broadcasts the low word of mul
    pxor        m6, m6                  ; m6 = dword accumulator
    ; Point each array at its end and index with a negative byte offset.
    ; v2 holds dwords, so it is advanced and indexed by twice the offset.
    add         v1q, orderq
    lea         v2q, [v2q + 2*orderq]
    add         v3q, orderq
    neg         orderq
.loop:
    mova        m3, [v1q + orderq]              ; 8 words of v1
    movu        m0, [v2q + 2*orderq]            ; matching dwords 0..3 of v2
    pmovsxwd    m4, m3                          ; v1 words 0..3 -> dwords
    movu        m1, [v2q + 2*orderq + mmsize]   ; matching dwords 4..7 of v2
    movhlps     m5, m3                          ; bring v1 words 4..7 down
    movu        m2, [v3q + orderq]
    pmovsxwd    m5, m5                          ; v1 words 4..7 -> dwords
    pmullw      m2, m7                          ; low 16 bits of v3*mul
    pmulld      m0, m4
    pmulld      m1, m5
    paddw       m2, m3                          ; v1 + v3*mul
    paddd       m6, m0
    paddd       m6, m1
    mova        [v1q + orderq], m2
    add         orderq, 16
    jl          .loop
    ; HADDD comes from x86util.asm; presumably it folds the dword lanes of m6
    ; into its low dword using m0 as scratch.
    HADDD       m6, m0
    movd        eax, m6
    RET
|
|
|
|
%macro SCALARPRODUCT_LOOP 1
;------------------------------------------------------------------------------
; SCALARPRODUCT_LOOP N
;
; Emits one variant of the inner loop, labelled .loop<N>. N is the byte shift
; handed to palignr to rebuild the v2/v3 data from 16-byte-aligned loads;
; N == 0 reads v2/v3 directly.
;
; Register usage, as seen in the code below:
;   orderq  byte offset, decremented by 2*mmsize at the top of each iteration;
;           the loop repeats while the result is > 0
;   m4, m5  (N != 0 only) aligned v2/v3 block loaded at the offset used by the
;           previous iteration; must be primed before the loop is entered
;   m6      dword accumulator for the pmaddwd products
;   m7      multiplier applied to the v3 words
;   m0-m3   scratch; m8/m9 as well on x86-64
; Every iteration stores v1 + v3*m7 (16-bit lanes) back to v1.
;
; Variants with N != 0 leave through a jump to .end, which the expansion site
; has to define; the N == 0 variant just falls out of the bottom of the loop.
; Note that t0/t1 stay defined after the macro has been expanded.
;------------------------------------------------------------------------------
align 16
.loop%1:
    sub         orderq, mmsize*2        ; sets the flags tested by jg below;
                                        ; only SIMD instructions in between
%if %1 == 0
    ; No shift: use the aligned loads as they are.
    mova        m0, [v2q + orderq]
    mova        m1, [v2q + orderq + mmsize]
    mova        m2, [v3q + orderq]
    mova        m3, [v3q + orderq + mmsize]
%else
    ; v2: combine the block kept from the previous iteration (m4) with two
    ; freshly loaded aligned blocks and shift each pair right by N bytes.
    mova        m1, m4
    mova        m4, [v2q + orderq]
    mova        m0, [v2q + orderq + mmsize]
    palignr     m1, m0, %1
    palignr     m0, m4, %1
    ; v3: same scheme, carried in m5.
    mova        m3, m5
    mova        m5, [v3q + orderq]
    mova        m2, [v3q + orderq + mmsize]
    palignr     m3, m2, %1
    palignr     m2, m5, %1
%endif
    ; t0/t1 name the two v1 operands. By default they are memory operands;
    ; on x86-64 the words are loaded once into m8/m9 and reused from there.
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova        m8, t0
    mova        m9, t1
%define t0 m8
%define t1 m9
%endif
    pmaddwd     m0, t0                  ; v1*v2, adjacent pairs summed
    pmaddwd     m1, t1
    pmullw      m2, m7                  ; low 16 bits of v3*m7
    pmullw      m3, m7
    paddw       m2, t0                  ; v1 + v3*m7
    paddw       m3, t1
    paddd       m6, m0
    paddd       m6, m1
    mova        [v1q + orderq], m2
    mova        [v1q + orderq + mmsize], m3
    jg          .loop%1
%if %1 != 0
    jmp         .end
%endif
%endmacro
|
|
|
|
;------------------------------------------------------------------------------
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
;
; SSSE3 version. Accumulates the pmaddwd products of v1 and v2 (returned in
; eax) and stores v1 + v3 * mul back to v1 in 16-bit lanes.
;
; v2 and v3 are only ever read with aligned loads: both pointers are rounded
; down to 16 bytes and the original data is rebuilt with palignr inside
; SCALARPRODUCT_LOOP. One copy of the loop is emitted per shift amount and the
; function dispatches on v2's misalignment (r4d = v2 & 15).
;
; The arrays are walked from the end towards the start, 2*mmsize bytes per
; iteration, and the loop body always runs at least once.
; NOTE(review): the shift derived from v2 is applied to v3 as well, so v3
; presumably has to share v2's misalignment; v1 is used as an aligned memory
; operand; order is presumably a positive multiple of 16 -- confirm with
; callers.
;------------------------------------------------------------------------------
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    ; order is a 32-bit int, so the upper half of its 64-bit register is not
    ; guaranteed to be clean. Scale it with a 32-bit shift: on x86-64 that
    ; zeroes bits 63:32, making orderq safe to use for addressing below.
    shl         orderd, 1               ; order *= sizeof(int16_t)
    movd        m7, mulm
    pshuflw     m7, m7, 0
    punpcklqdq  m7, m7                  ; low word of mul in all 8 lanes
    pxor        m6, m6                  ; m6 = dword accumulator
    mov         r4d, v2d
    and         r4d, 15                 ; r4d = byte misalignment of v2
    and         v2q, ~15                ; round both source pointers down
    and         v3q, ~15
    ; Prime m4/m5 with the aligned blocks at the end offset; the shifted loops
    ; consume them on their first iteration.
    ; NOTE(review): when r4d == 0 these blocks lie entirely past the last
    ; element that is processed, so the buffers are presumably padded.
    mova        m4, [v2q + orderq]
    mova        m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches
    ; taken are cyclic (i.e. predictable)
    cmp         r4d, 0
    je          .loop0
    cmp         r4d, 2
    je          .loop2
    cmp         r4d, 4
    je          .loop4
    cmp         r4d, 6
    je          .loop6
    cmp         r4d, 8
    je          .loop8
    cmp         r4d, 10
    je          .loop10
    cmp         r4d, 12
    je          .loop12
    ; Any other value falls through into the shift-14 loop, which is emitted
    ; first. The shift-0 loop is emitted last so that it can fall straight
    ; into .end; all other variants jump there.
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    ; HADDD comes from x86util.asm; presumably it folds the dword lanes of m6
    ; into its low dword using m0 as scratch.
    HADDD       m6, m0
    movd        eax, m6
    RET
|