; FFmpeg/libavcodec/x86/sbcdsp.asm

;******************************************************************************
;* SIMD optimized SBC encoder DSP functions
;*
;* Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
;* Copyright (C) 2008-2010  Nokia Corporation
;* Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
;* Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
;* Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Two packed dwords of 0x8000, i.e. one 64-bit MMX operand. Used in this file as
;  - the rounding bias that seeds the sbc_analyze_4/8 accumulators before
;    they are shifted right by 16 (SBC_PROTO_FIXED_SCALE), and
;  - the initial OR mask in sbc_calc_scalefactors, where bit 15
;    (SCALE_OUT_BITS) guarantees the bsr input is never zero.
scale_mask: times 2 dd 0x8000    ; 1 << (SBC_PROTO_FIXED_SCALE - 1)

SECTION .text

; NIDN op, dst, src
; Emits "op dst, src" unless dst and src are textually identical, in which
; case nothing is emitted. Callers use this to drop a redundant move or add
; by passing the same operand twice.
%macro NIDN 3 ; op, dst, src
%ifidn %2, %3
    ; identical operands: emit nothing
%else
    %1            %2, %3
%endif
%endmacro

; One multiply-accumulate step on two 64-bit halves.
;   tmp1 = pmaddwd(in1, [constsq+offset])      ; then out1 += add1
;   tmp2 = pmaddwd(in2, [constsq+offset+8])    ; then out2 += add2
; Each move is skipped when tmp and in are the same operand text, and each
; add is skipped when out and add are the same operand text. Passing
; tmp == out == add therefore initialises the accumulator with the product.
; Relies on the enclosing function having a register named "consts".
%macro ANALYZE_MAC 9 ; out1, out2, in1, in2, tmp1, tmp2, add1, add2, offset
    ; fetch the multiplicands unless they already live in the temporaries
%ifnidn %5, %3
    movq          %5, %3
%endif
%ifnidn %6, %4
    movq          %6, %4
%endif
    ; pairwise 16-bit multiply, adjacent products summed into 32-bit lanes
    pmaddwd       %5, [constsq+%9]
    pmaddwd       %6, [constsq+%9+8]
    ; accumulate unless the product was computed in place in the accumulator
%ifnidn %1, %7
    paddd         %1, %7
%endif
%ifnidn %2, %8
    paddd         %2, %8
%endif
%endmacro

; Multiply-accumulate step whose multiplicands are read straight from memory.
; The same byte offset indexes both the input ([inq+offset], [inq+offset+8])
; and the coefficient table ([constsq+offset], [constsq+offset+8]), so each
; invocation consumes 16 bytes of each.
; Relies on the enclosing function naming its registers "in" and "consts".
%macro ANALYZE_MAC_IN 7 ; out1, out2, tmp1, tmp2, add1, add2, offset
    ANALYZE_MAC   %1, %2, [inq+%7], [inq+%7+8], %3, %4, %5, %6, %7
%endmacro

; Multiply-accumulate step whose multiplicand is a register.
; "in" feeds both halves: tmp1 and tmp2 each receive a copy of it (the copy is
; skipped for a tmp that is "in" itself), are multiplied by the coefficients
; at offset and offset+8, and are then added to out1/out2 unless out and tmp
; are the same register (which makes the call initialise the accumulator).
; When the last argument is the literal "pack", "in" is first converted in
; place from two 32-bit sums to 16-bit samples; any other value (callers pass
; "no") assumes "in" has already been converted.
%macro ANALYZE_MAC_REG 7 ; out1, out2, in, tmp1, tmp2, offset, pack
%ifidn %7, pack
    psrad         %3, 16    ; SBC_PROTO_FIXED_SCALE
    packssdw      %3, %3    ; saturate to int16; both halves hold the same pair
%endif
    ANALYZE_MAC   %1, %2, %3, %3, %4, %5, %4, %5, %6
%endmacro

;*******************************************************************
;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
;
; Reads 80 bytes from in and 112 bytes from consts, writes 16 bytes
; (4 x int32) to out.
;   Stage 1: m0/m1 = scale_mask + sum of pmaddwd(in, consts) over byte
;            offsets 0, 16, 32, 48 and 64.
;   Stage 2: m0 and m1 are shifted right by 16 and packed to int16, then
;            multiplied by the coefficients at byte offsets 80 and 96;
;            the two products are summed into m0 (low half of out) and
;            m2 (high half of out).
; No emms is issued here -- presumably the caller clears the MMX state;
; confirm against the call site.
;*******************************************************************
INIT_MMX mmx
cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
    ; Stage 1, first step: computed directly in the accumulators and biased
    ; with the rounding constant.
    ANALYZE_MAC_IN   m0, m1, m0, m1, [scale_mask], [scale_mask], 0

    ; Stage 1, remaining steps at offsets 16, 32, 48, 64: products go through
    ; m2/m3 and are added to m0/m1.
%assign an4_off 16
%rep 4
    ANALYZE_MAC_IN   m0, m1, m2, m3, m2, m3, an4_off
%assign an4_off an4_off+16
%endrep
%undef an4_off

    ; Stage 2: the first call multiplies in place (no add), the second
    ; accumulates into m0/m2.
    ANALYZE_MAC_REG  m0, m2, m0, m0, m2, 80, pack
    ANALYZE_MAC_REG  m0, m2, m1, m1, m3, 96, pack

    movq          [outq+0], m0
    movq          [outq+8], m2

    RET


;*******************************************************************
;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
;
; Reads 160 bytes from in and 288 bytes from consts, writes 32 bytes
; (8 x int32) to out.
;   Stage 1: four accumulators, each seeded with scale_mask:
;              m0/m1 <- byte offsets 0, 32, 64, 96, 128
;              m2/m3 <- byte offsets 16, 48, 80, 112, 144
;   Stage 2: m0..m3 are shifted right by 16 and packed to int16, then
;            multiplied by two coefficient sets:
;              offsets 160, 192, 224, 256 -> out[0..3] (via m4/m5)
;              offsets 176, 208, 240, 272 -> out[4..7] (via m0/m5)
; No emms is issued here -- presumably the caller clears the MMX state;
; confirm against the call site.
;*******************************************************************
INIT_MMX mmx
; The vector register count is 8 because the body uses m0-m7.
cglobal sbc_analyze_8, 3, 3, 8, in, out, consts
    ; Stage 1: first step of each accumulator pair is computed in place and
    ; biased with the rounding constant.
    ANALYZE_MAC_IN   m0, m1, m0, m1, [scale_mask], [scale_mask],  0
    ANALYZE_MAC_IN   m2, m3, m2, m3, [scale_mask], [scale_mask], 16
    ; Remaining steps alternate between the two accumulator pairs, using
    ; m4/m5 and m6/m7 as product temporaries.
    ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5,  32
    ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7,  48
    ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5,  64
    ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7,  80
    ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5,  96
    ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7, 112
    ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5, 128
    ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7, 144

    ; Stage 2a: convert m0..m3 to int16 ("pack") and build out[0..3] in
    ; m4/m5. The first call initialises m4/m5 (out == tmp, so no add); the
    ; sources m0..m3 are left intact for stage 2b.
    ANALYZE_MAC_REG  m4, m5, m0, m4, m5, 160, pack
    ANALYZE_MAC_REG  m4, m5, m1, m6, m7, 192, pack
    ANALYZE_MAC_REG  m4, m5, m2, m6, m7, 224, pack
    ANALYZE_MAC_REG  m4, m5, m3, m6, m7, 256, pack

    movq          [outq  ], m4
    movq          [outq+8], m5

    ; Stage 2b: m0..m3 are already int16 ("no"), so they are multiplied in
    ; place; out[4..7] is built in m0/m5 (m5 is free again after the store).
    ANALYZE_MAC_REG  m0, m5, m0, m0, m5, 176, no
    ANALYZE_MAC_REG  m0, m5, m1, m1, m7, 208, no
    ANALYZE_MAC_REG  m0, m5, m2, m2, m7, 240, no
    ANALYZE_MAC_REG  m0, m5, m3, m3, m7, 272, no

    movq          [outq+16], m0
    movq          [outq+24], m5

    RET


;*******************************************************************
;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
;                              uint32_t scale_factor[2][8],
;                              int blocks, int channels, int subbands)
;
; For each processed column c (a flat index into the [2][8] inner arrays):
;   x = (1 << SCALE_OUT_BITS) | OR over blk of max(|sb_sample_f[blk][c]| - 1, 0)
;   scale_factor[c] = bsr(x) - SCALE_OUT_BITS
; Columns are handled two at a time (one MMX register), from the highest
; byte offset down to 0, so the column count is expected to be even.
;
; Channel 1 always starts 8 entries (32 bytes) after channel 0 in both
; arrays, whatever the value of subbands. With two channels the flat range
; therefore has to cover 8 + subbands entries; using 2 * subbands would
; miss channel 1 entirely when subbands < 8.
; NOTE: with two channels and subbands < 8 this also reads
; sb_sample_f[blk][0][subbands..7] and writes scale_factor[0][subbands..7];
; both stay inside the arrays given in the prototype.
; blocks is assumed to be >= 1 (the block loop runs before it tests).
;*******************************************************************
INIT_MMX mmx
cglobal sbc_calc_scalefactors, 5, 7, 4, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk
    movq          m3, [scale_mask]        ; 1 << SCALE_OUT_BITS in both dwords

    ; subbands = size in bytes of the flat column range to process:
    ;   1 channel : 4 * subbands
    ;   2 channels: 4 * (8 + subbands)
    ; The 32-bit writes also clear the upper half of the register on x86-64,
    ; so the int argument can be used as a 64-bit index below.
    shl           subbandsd, 2
    cmp           channelsd, 2
    jl            .loop_1
    add           subbandsd, 32

.loop_1:
    ; step down to the next pair of columns
    sub           subbandsq, 8
    lea           ptrq, [sb_sample_fq + subbandsq]

    ; blk = (blocks - 1) * 64;   64 = bytes per block (2 * 8 * sizeof(int32_t))
    lea           blkq, [blocksq - 1]
    shl           blkd, 6

    movq          m0, m3                  ; x = 1 << SCALE_OUT_BITS
.loop_2:
    ; m1 = |sample| - 1 for sample != 0, and 0 for sample == 0
    movq          m1, [ptrq+blkq]
    pxor          m2, m2
    pcmpgtd       m1, m2                  ; m1 = (sample > 0) ? -1 : 0
    paddd         m1, [ptrq+blkq]         ; m1 = sample - (sample > 0)
    pcmpgtd       m2, m1                  ; m2 = (m1 < 0) ? -1 : 0
    pxor          m1, m2                  ; m1 = (m1 < 0) ? ~m1 : m1

    por           m0, m1                  ; x |= m1

    sub           blkq, 64
    jns           .loop_2                 ; block 0 is the last one processed

    ; low dword -> first column of the pair
    movd          blkd, m0
    psrlq         m0,   32
    bsr           blkd, blkd              ; input has bit 15 set, never zero
    sub           blkd, 15    ; SCALE_OUT_BITS
    mov           [scale_factorq + subbandsq], blkd

    ; high dword -> second column of the pair
    movd          blkd, m0
    bsr           blkd, blkd
    sub           blkd, 15    ; SCALE_OUT_BITS
    mov           [scale_factorq + subbandsq + 4], blkd

    test          subbandsq, subbandsq
    jg            .loop_1                 ; offset 0 was the last pair

    emms
    RET
sbcenc: add MMX optimizations. This was originally based on libsbc, and was fully integrated into ffmpeg. Rough speed test: C version: speed= 592x; MMX version: speed= 785x. (2017-12-17 21:07:33 +02:00)
			`;******************************************************************************`
			`;* SIMD optimized SBC encoder DSP functions`
			`;*`
			`;* Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org>`
			`;* Copyright (C) 2008-2010 Nokia Corporation`
			`;* Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org>`
			`;* Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch>`
			`;* Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com>`
			`;*`
			`;* This file is part of FFmpeg.`
			`;*`
			`;* FFmpeg is free software; you can redistribute it and/or`
			`;* modify it under the terms of the GNU Lesser General Public`
			`;* License as published by the Free Software Foundation; either`
			`;* version 2.1 of the License, or (at your option) any later version.`
			`;*`
			`;* FFmpeg is distributed in the hope that it will be useful,`
			`;* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`;* Lesser General Public License for more details.`
			`;*`
			`;* You should have received a copy of the GNU Lesser General Public`
			`;* License along with FFmpeg; if not, write to the Free Software`
			`;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`;******************************************************************************`

			`%include "libavutil/x86/x86util.asm"`

			`SECTION_RODATA`

			`scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1)`

			`SECTION .text`

			`%macro NIDN 3`
			`%ifnidn %2, %3`
			`%1 %2, %3`
			`%endif`
			`%endmacro`

			`%macro ANALYZE_MAC 9 ; out1, out2, in1, in2, tmp1, tmp2, add1, add2, offset`
			`NIDN movq, %5, %3`
			`NIDN movq, %6, %4`
			`pmaddwd %5, [constsq+%9]`
			`pmaddwd %6, [constsq+%9+8]`
			`NIDN paddd, %1, %7`
			`NIDN paddd, %2, %8`
			`%endmacro`

			`%macro ANALYZE_MAC_IN 7 ; out1, out2, tmp1, tmp2, add1, add2, offset`
			`ANALYZE_MAC %1, %2, [inq+%7], [inq+%7+8], %3, %4, %5, %6, %7`
			`%endmacro`

			`%macro ANALYZE_MAC_REG 7 ; out1, out2, in, tmp1, tmp2, offset, pack`
			`%ifidn %7, pack`
			`psrad %3, 16 ; SBC_PROTO_FIXED_SCALE`
			`packssdw %3, %3`
			`%endif`
			`ANALYZE_MAC %1, %2, %3, %3, %4, %5, %4, %5, %6`
			`%endmacro`

			`;*******************************************************************`
			`;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);`
			`;*******************************************************************`
			`INIT_MMX mmx`
			`cglobal sbc_analyze_4, 3, 3, 4, in, out, consts`
			`ANALYZE_MAC_IN m0, m1, m0, m1, [scale_mask], [scale_mask], 0`
			`ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 16`
			`ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 32`
			`ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 48`
			`ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 64`

			`ANALYZE_MAC_REG m0, m2, m0, m0, m2, 80, pack`
			`ANALYZE_MAC_REG m0, m2, m1, m1, m3, 96, pack`

			`movq [outq ], m0`
			`movq [outq+8], m2`

			`RET`


			`;*******************************************************************`
			`;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);`
			`;*******************************************************************`
			`INIT_MMX mmx`
			`cglobal sbc_analyze_8, 3, 3, 4, in, out, consts`
			`ANALYZE_MAC_IN m0, m1, m0, m1, [scale_mask], [scale_mask], 0`
			`ANALYZE_MAC_IN m2, m3, m2, m3, [scale_mask], [scale_mask], 16`
			`ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 32`
			`ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 48`
			`ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 64`
			`ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 80`
			`ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 96`
			`ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 112`
			`ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 128`
			`ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 144`

			`ANALYZE_MAC_REG m4, m5, m0, m4, m5, 160, pack`
			`ANALYZE_MAC_REG m4, m5, m1, m6, m7, 192, pack`
			`ANALYZE_MAC_REG m4, m5, m2, m6, m7, 224, pack`
			`ANALYZE_MAC_REG m4, m5, m3, m6, m7, 256, pack`

			`movq [outq ], m4`
			`movq [outq+8], m5`

			`ANALYZE_MAC_REG m0, m5, m0, m0, m5, 176, no`
			`ANALYZE_MAC_REG m0, m5, m1, m1, m7, 208, no`
			`ANALYZE_MAC_REG m0, m5, m2, m2, m7, 240, no`
			`ANALYZE_MAC_REG m0, m5, m3, m3, m7, 272, no`

			`movq [outq+16], m0`
			`movq [outq+24], m5`

			`RET`


			`;*******************************************************************`
			`;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],`
			`; uint32_t scale_factor[2][8],`
			`; int blocks, int channels, int subbands)`
			`;*******************************************************************`
			`INIT_MMX mmx`
			`cglobal sbc_calc_scalefactors, 5, 7, 4, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk`
			`; subbands = 4 * subbands * channels`
			`movq m3, [scale_mask]`
			`shl subbandsd, 2`
			`cmp channelsd, 2`
			`jl .loop_1`
			`shl subbandsd, 1`

			`.loop_1:`
			`sub subbandsq, 8`
			`lea ptrq, [sb_sample_fq + subbandsq]`

			`; blk = (blocks - 1) * 64;`
			`lea blkq, [blocksq - 1]`
			`shl blkd, 6`

			`movq m0, m3`
			`.loop_2:`
			`movq m1, [ptrq+blkq]`
			`pxor m2, m2`
			`pcmpgtd m1, m2`
			`paddd m1, [ptrq+blkq]`
			`pcmpgtd m2, m1`
			`pxor m1, m2`

			`por m0, m1`

			`sub blkq, 64`
			`jns .loop_2`

			`movd blkd, m0`
			`psrlq m0, 32`
			`bsr blkd, blkd`
			`sub blkd, 15 ; SCALE_OUT_BITS`
			`mov [scale_factorq + subbandsq], blkd`

			`movd blkd, m0`
			`bsr blkd, blkd`
			`sub blkd, 15 ; SCALE_OUT_BITS`
			`mov [scale_factorq + subbandsq + 4], blkd`

			`cmp subbandsq, 0`
			`jg .loop_1`

			`emms`
			`RET`