x86/opusdsp: implement FMA3 accelerated postfilter and deemphasis
58893 decicycles in deemphasis_c, 130548 runs, 524 skips
9475 decicycles in deemphasis_fma3, 130686 runs, 386 skips -> 6.21x speedup
24866 decicycles in postfilter_c, 65386 runs, 150 skips
5268 decicycles in postfilter_fma3, 65505 runs, 31 skips -> 4.72x speedup
Total decoder speedup: ~14%
Deemphasis SIMD based on the following unrolling:
const float c1 = CELT_EMPH_COEFF, c2 = c1*c1, c3 = c2*c1, c4 = c3*c1;
float state = coeff;
for (int i = 0; i < len; i += 4) {
y[0] = x[0] + c1*state;
y[1] = x[1] + c2*state + c1*x[0];
y[2] = x[2] + c3*state + c1*x[1] + c2*x[0];
y[3] = x[3] + c4*state + c1*x[2] + c2*x[1] + c3*x[0];
state = y[3];
y += 4;
x += 4;
}
2019-03-15 16:43:04 +02:00
|
|
|
;******************************************************************************
;* Opus SIMD functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
|
|
|
|
|
|
|
|
; x86inc/x86util supply the macros used below (cglobal, INIT_XMM, fmaddps,
; VBROADCASTSS, RET, SECTION_RODATA, the qNNNN shuffle constants, ...).
%include "libavutil/x86/x86util.asm"

; Read-only data section; no constants are defined in it in this file.
SECTION_RODATA

SECTION .text
|
|
|
|
|
|
|
|
;------------------------------------------------------------------------------
; opus_deemphasis (FMA3)
;
; Named arguments as declared to cglobal:
;   UNIX64 : out, in, weights, len
;   others : out, in, coeff, weights, len
; On UNIX64 the scalar coefficient is not a named (integer) argument; the code
; expects it in lane 0 of m0. On WIN64 it is read from lane 0 of m2, and on
; x86-32 from the argument slot `coeffm`.
;
; The loop handles 4 floats per iteration. `in`, `out` and `weights` are
; accessed with movaps, so they must be 16-byte aligned. The loop test is at
; the bottom, so one vector is always processed, even when len <= 0.
;
; On exit m0 holds the last output sample broadcast to all lanes. On x86-32
; lane 0 is additionally pushed onto the x87 stack (presumably the float
; return value -- confirm against the C prototype).
;
; Per the commit log that introduced this code, the SIMD is based on the
; following scalar unrolling (weights presumably holds c1..c4 -- confirm
; against the caller):
;
;   const float c1 = CELT_EMPH_COEFF, c2 = c1*c1, c3 = c2*c1, c4 = c3*c1;
;   float state = coeff;
;   for (int i = 0; i < len; i += 4) {
;       y[0] = x[0] + c1*state;
;       y[1] = x[1] + c2*state + c1*x[0];
;       y[2] = x[2] + c3*state + c1*x[1] + c2*x[0];
;       y[3] = x[3] + c4*state + c1*x[2] + c2*x[1] + c3*x[0];
;       state = y[3];
;       y += 4;
;       x += 4;
;   }
;
; Register roles inside the loop:
;   m0 = state, broadcast to all lanes
;   m4 = weights[0..3]           (multiplies the state)
;   m5 = weights[0] in all lanes
;   m6 = weights[1] in all lanes
;   m7 = weights[2] in all lanes
;   m1-m3 = scratch
;------------------------------------------------------------------------------
INIT_XMM fma3
%if UNIX64
cglobal opus_deemphasis, 4, 4, 8, out, in, weights, len
%else
cglobal opus_deemphasis, 5, 5, 8, out, in, coeff, weights, len
%endif

    ; Broadcast the initial filter state into every lane of m0.
%if ARCH_X86_32
    VBROADCASTSS m0, coeffm
%elif WIN64
    shufps m0, m2, m2, 0             ; third argument slot -> xmm2 on Win64
%else
    shufps m0, m0, 0                 ; state already in lane 0 of m0
%endif

    movaps m4, [weightsq]            ; w0, w1, w2, w3
    VBROADCASTSS m5, m4              ; w0 in all lanes
    shufps m6, m4, m4, q1111         ; w1 in all lanes
    shufps m7, m4, m4, q2222         ; w2 in all lanes

.loop:
    movaps  m1, [inq]                ; x0, x1, x2, x3

    pslldq  m2, m1, 4                ;  0, x0, x1, x2
    pslldq  m3, m1, 8                ;  0,  0, x0, x1

    fmaddps m2, m2, m5, m1           ; x + c1*x[0-2]
    pslldq  m1, 12                   ;  0,  0,  0, x0

    fmaddps m2, m3, m6, m2           ; x + c1*x[0-2] + c2*x[0-1]
    fmaddps m1, m1, m7, m2           ; x + c1*x[0-2] + c2*x[0-1] + c3*x[0]
    fmaddps m0, m0, m4, m1           ; x + c1*x[0-2] + c2*x[0-1] + c3*x[0] + c*s

    movaps [outq], m0
    shufps m0, m0, q3333             ; new state

    add inq,  mmsize
    add outq, mmsize
    sub lend, mmsize >> 2            ; 4 floats consumed per iteration
    jg .loop

%if ARCH_X86_64 == 0
    ; Spill lane 0 of the final state to the first argument's stack slot and
    ; load it onto the x87 stack.
    movss r0m, m0
    fld dword r0m
%endif
    RET
|
|
|
|
|
|
|
|
|
|
|
|
;------------------------------------------------------------------------------
; opus_postfilter (FMA3)
;
; Named arguments as declared to cglobal: data, period, gains, len
;
; In-place 5-tap filter over `data`, 4 floats per iteration. With T = period
; and g0..g2 = gains[0..2], each output lane is computed as
;
;   data[i] += g0 *  data[i - T]
;            + g1 * (data[i - T - 1] + data[i - T + 1])
;            + g2 * (data[i - T - 2] + data[i - T + 2])
;
; `data` is stored with movaps, so it must be 16-byte aligned; the history
; taps are read with unaligned loads. The loop test is at the bottom, so one
; vector is always processed, even when len <= 0.
;
; Register roles:
;   m0, m1, m2 = g0, g1, g2 broadcast to all lanes
;   m3         = g2 * data[i - T - 2 ...], carried between iterations
;   m4-m7      = scratch (history taps)
;   periodq    = -(4*period + 8): byte offset from data[i] to data[i - T - 2]
;------------------------------------------------------------------------------
INIT_XMM fma3
cglobal opus_postfilter, 4, 4, 8, data, period, gains, len
    VBROADCASTSS m0, [gainsq + 0]
    VBROADCASTSS m1, [gainsq + 4]
    VBROADCASTSS m2, [gainsq + 8]

    shl periodd, 2                   ; period in floats -> bytes
    add periodq, 8                   ; two more floats for the outermost tap
    neg periodq                      ; negative offset: history lies before data

    ; Prime the carried term with g2 * data[-T-2 ...].
    movups  m3, [dataq + periodq]
    mulps   m3, m2

.loop:
    movups  m4, [dataq + periodq +  4]   ; data[i - T - 1 ...]
    movups  m5, [dataq + periodq +  8]   ; data[i - T     ...]
    movups  m6, [dataq + periodq + 12]   ; data[i - T + 1 ...]
    movups  m7, [dataq + periodq + 16]   ; data[i - T + 2 ...]

    fmaddps m3, m7, m2, m3           ; g2 * (x[-T-2] + x[-T+2])
    addps   m6, m4                   ; x[-T+1] + x[-T-1]

    fmaddps m5, m5, m0, [dataq]      ; data + g0 * x[-T]
    fmaddps m6, m6, m1, m3           ; g1 * (...) + g2 * (...)

    addps   m5, m6                   ; full filter output
    mulps   m3, m7, m2               ; this x[-T+2] is the next x[-T-2]

    movaps  [dataq], m5

    add dataq, mmsize
    sub lend,  mmsize >> 2           ; 4 floats consumed per iteration
    jg .loop

    RET
|