// FFmpeg/libavcodec/aarch64/idctdsp_neon.S

/*
 * IDCT AArch64 NEON optimisations
 *
 * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// Clamp 16-bit signed block coefficients to unsigned 8-bit and store
// them as 8 rows of 8 bytes each.
// On entry:
//   x0 -> array of 64x 16-bit coefficients
//   x1 -> 8-bit results
//   x2 = row stride for results, bytes
// On exit:
//   x0 and x1 have been advanced and v0-v7 are overwritten.
//   No other registers are written and the stack is not used.
function ff_put_pixels_clamped_neon, export=1
        // Load all 64 coefficients (2 x 64 bytes); each of v0-v7 ends up
        // holding one row of eight 16-bit values.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]
        // sqxtun narrows each signed 16-bit lane to 8 bits with unsigned
        // saturation, i.e. it clamps to 0..255. Rows 0-4 are narrowed in place.
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        sqxtun          v4.8b, v4.8h
        // Store rows 0-3. Each of v0-v2 is refilled with a narrowed row
        // (5, 6, 7 respectively) right after its own store, so every
        // store/narrow pair below must keep this order.
        // NOTE(review): the interleaving is presumably there to overlap
        // stores with the remaining narrowing work - confirm before reordering.
        st1             {v0.8b}, [x1], x2
        sqxtun          v0.8b, v5.8h
        st1             {v1.8b}, [x1], x2
        sqxtun          v1.8b, v6.8h
        st1             {v2.8b}, [x1], x2
        sqxtun          v2.8b, v7.8h
        st1             {v3.8b}, [x1], x2
        // Store rows 4-7 (v4, then the refilled v0-v2). The last store has
        // no post-increment, so x1 is left pointing at row 7.
        st1             {v4.8b}, [x1], x2
        st1             {v0.8b}, [x1], x2
        st1             {v1.8b}, [x1], x2
        st1             {v2.8b}, [x1]
        ret
endfunc

// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128)
// Each coefficient is saturated to -128..127 and then 128 is added
// modulo 256, so the bytes stored are in the range 0..255.
// On entry:
//   x0 -> array of 64x 16-bit coefficients
//   x1 -> 8-bit results
//   x2 = row stride for results, bytes
// On exit:
//   x0 and x1 have been advanced; v0-v7 and v16-v19 are overwritten.
//   No other registers are written and the stack is not used.
function ff_put_signed_pixels_clamped_neon, export=1
        // Rows 0-3 of coefficients go to v0-v3.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
        // v4 = 128 in every byte lane: the bias added after narrowing.
        movi            v4.8b, #128
        // Rows 4-7 of coefficients go to v16-v19.
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]
        // sqxtn narrows each signed 16-bit lane to 8 bits with signed
        // saturation (-128..127). Rows 0-3 are narrowed in place.
        sqxtn           v0.8b, v0.8h
        sqxtn           v1.8b, v1.8h
        sqxtn           v2.8b, v2.8h
        sqxtn           v3.8b, v3.8h
        // Rows 4-7 are narrowed into v5, v6, v7 and v16, interleaved with the
        // biasing of rows 0-3. The byte-wise add wraps modulo 256, which maps
        // -128..127 onto 0..255. v16 is safe to use as a destination for
        // row 7 because its row-4 contents were already narrowed into v5.
        sqxtn           v5.8b, v16.8h
        add             v0.8b, v0.8b, v4.8b
        sqxtn           v6.8b, v17.8h
        add             v1.8b, v1.8b, v4.8b
        sqxtn           v7.8b, v18.8h
        add             v2.8b, v2.8b, v4.8b
        sqxtn           v16.8b, v19.8h
        add             v3.8b, v3.8b, v4.8b
        // Store rows 0-3. As soon as each of v0-v3 has been stored it is
        // reused for the biased result of rows 4-7, so every store/add pair
        // below must keep this order.
        st1             {v0.8b}, [x1], x2
        add             v0.8b, v5.8b, v4.8b
        st1             {v1.8b}, [x1], x2
        add             v1.8b, v6.8b, v4.8b
        st1             {v2.8b}, [x1], x2
        add             v2.8b, v7.8b, v4.8b
        st1             {v3.8b}, [x1], x2
        add             v3.8b, v16.8b, v4.8b
        // Store rows 4-7. The last store has no post-increment, so x1 is
        // left pointing at row 7.
        st1             {v0.8b}, [x1], x2
        st1             {v1.8b}, [x1], x2
        st1             {v2.8b}, [x1], x2
        st1             {v3.8b}, [x1]
        ret
endfunc

// Add 16-bit signed block coefficients to unsigned 8-bit
// The 8x8 block of bytes at x1 is read, each coefficient is added to the
// corresponding byte, the sum is clamped to 0..255 and written back to
// the same location.
// On entry:
//   x0 -> array of 64x 16-bit coefficients
//   x1 -> 8-bit input and results
//   x2 = row stride for 8-bit input and results, bytes
// On exit:
//   x0, x1 and x3 have been modified; v0-v7 and v16-v19 are overwritten.
//   No other registers are written and the stack is not used.
// NOTE(review): uaddw performs the addition in 16-bit lanes, so a sum
// above 32767 wraps and is then clamped to 0 instead of 255. Presumably
// callers never pass coefficients that large - confirm against the callers.
function ff_add_pixels_clamped_neon, export=1
        // Rows 0-3 of coefficients go to v0-v3.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64
        // x1 runs ahead as the pixel load pointer; x3 keeps the start of the
        // block and is used as the store pointer.
        mov             x3, x1
        // Load pixel rows 0-3 into v4-v7.
        ld1             {v4.8b}, [x1], x2
        ld1             {v5.8b}, [x1], x2
        ld1             {v6.8b}, [x1], x2
        ld1             {v7.8b}, [x1], x2
        // Rows 4-7 of coefficients go to v16-v19.
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]
        // uaddw zero-extends each pixel byte to 16 bits and adds it to the
        // matching coefficient lane. Rows 0-3 are summed in v0-v3, which
        // frees v4-v7 to be reloaded with pixel rows 4-7.
        uaddw           v0.8h, v0.8h, v4.8b
        uaddw           v1.8h, v1.8h, v5.8b
        uaddw           v2.8h, v2.8h, v6.8b
        ld1             {v4.8b}, [x1], x2
        uaddw           v3.8h, v3.8h, v7.8b
        ld1             {v5.8b}, [x1], x2
        // sqxtun treats each 16-bit sum as signed and narrows it with
        // unsigned saturation, i.e. it clamps to 0..255.
        sqxtun          v0.8b, v0.8h
        ld1             {v6.8b}, [x1], x2
        sqxtun          v1.8b, v1.8h
        // Last pixel load: all eight input rows have been read before the
        // first store below.
        ld1             {v7.8b}, [x1]
        sqxtun          v2.8b, v2.8h
        sqxtun          v3.8b, v3.8h
        // Row 4 is summed in v4. Rows 5-7 are summed in v0-v2, each one only
        // after the store that consumed that register's row 0-2 result, so
        // every store/uaddw pair below must keep this order.
        uaddw           v4.8h, v16.8h, v4.8b
        st1             {v0.8b}, [x3], x2
        uaddw           v0.8h, v17.8h, v5.8b
        st1             {v1.8b}, [x3], x2
        uaddw           v1.8h, v18.8h, v6.8b
        st1             {v2.8b}, [x3], x2
        uaddw           v2.8h, v19.8h, v7.8b
        // Clamp rows 4-7 and store row 3 in between.
        sqxtun          v4.8b, v4.8h
        sqxtun          v0.8b, v0.8h
        st1             {v3.8b}, [x3], x2
        sqxtun          v1.8b, v1.8h
        sqxtun          v2.8b, v2.8h
        // Store rows 4-7. The last store has no post-increment, so x3 is
        // left pointing at row 7.
        st1             {v4.8b}, [x3], x2
        st1             {v0.8b}, [x3], x2
        st1             {v1.8b}, [x3], x2
        st1             {v2.8b}, [x3]
        ret
endfunc
avcodec/idctdsp: Arm 64-bit NEON block add and clamp fast paths

checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows.

idctdsp.add_pixels_clamped_c: 313.3
idctdsp.add_pixels_clamped_neon: 24.3
idctdsp.put_pixels_clamped_c: 220.3
idctdsp.put_pixels_clamped_neon: 15.5
idctdsp.put_signed_pixels_clamped_c: 210.5
idctdsp.put_signed_pixels_clamped_neon: 19.5

Signed-off-by: Ben Avison <bavison@riscosopen.org>
Signed-off-by: Martin Storsjö <martin@martin.st>

2022-03-31 19:23:49 +02:00
			`/*`
			`* IDCT AArch64 NEON optimisations`
			`*`
			`* Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>`
			`*`
			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* FFmpeg is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with FFmpeg; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`#include "libavutil/aarch64/asm.S"`

			`// Clamp 16-bit signed block coefficients to unsigned 8-bit`
			`// On entry:`
			`// x0 -> array of 64x 16-bit coefficients`
			`// x1 -> 8-bit results`
			`// x2 = row stride for results, bytes`
			`function ff_put_pixels_clamped_neon, export=1`
			`ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64`
			`ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]`
			`sqxtun v0.8b, v0.8h`
			`sqxtun v1.8b, v1.8h`
			`sqxtun v2.8b, v2.8h`
			`sqxtun v3.8b, v3.8h`
			`sqxtun v4.8b, v4.8h`
			`st1 {v0.8b}, [x1], x2`
			`sqxtun v0.8b, v5.8h`
			`st1 {v1.8b}, [x1], x2`
			`sqxtun v1.8b, v6.8h`
			`st1 {v2.8b}, [x1], x2`
			`sqxtun v2.8b, v7.8h`
			`st1 {v3.8b}, [x1], x2`
			`st1 {v4.8b}, [x1], x2`
			`st1 {v0.8b}, [x1], x2`
			`st1 {v1.8b}, [x1], x2`
			`st1 {v2.8b}, [x1]`
			`ret`
			`endfunc`

			`// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128)`
			`// On entry:`
			`// x0 -> array of 64x 16-bit coefficients`
			`// x1 -> 8-bit results`
			`// x2 = row stride for results, bytes`
			`function ff_put_signed_pixels_clamped_neon, export=1`
			`ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64`
			`movi v4.8b, #128`
			`ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]`
			`sqxtn v0.8b, v0.8h`
			`sqxtn v1.8b, v1.8h`
			`sqxtn v2.8b, v2.8h`
			`sqxtn v3.8b, v3.8h`
			`sqxtn v5.8b, v16.8h`
			`add v0.8b, v0.8b, v4.8b`
			`sqxtn v6.8b, v17.8h`
			`add v1.8b, v1.8b, v4.8b`
			`sqxtn v7.8b, v18.8h`
			`add v2.8b, v2.8b, v4.8b`
			`sqxtn v16.8b, v19.8h`
			`add v3.8b, v3.8b, v4.8b`
			`st1 {v0.8b}, [x1], x2`
			`add v0.8b, v5.8b, v4.8b`
			`st1 {v1.8b}, [x1], x2`
			`add v1.8b, v6.8b, v4.8b`
			`st1 {v2.8b}, [x1], x2`
			`add v2.8b, v7.8b, v4.8b`
			`st1 {v3.8b}, [x1], x2`
			`add v3.8b, v16.8b, v4.8b`
			`st1 {v0.8b}, [x1], x2`
			`st1 {v1.8b}, [x1], x2`
			`st1 {v2.8b}, [x1], x2`
			`st1 {v3.8b}, [x1]`
			`ret`
			`endfunc`

			`// Add 16-bit signed block coefficients to unsigned 8-bit`
			`// On entry:`
			`// x0 -> array of 64x 16-bit coefficients`
			`// x1 -> 8-bit input and results`
			`// x2 = row stride for 8-bit input and results, bytes`
			`function ff_add_pixels_clamped_neon, export=1`
			`ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64`
			`mov x3, x1`
			`ld1 {v4.8b}, [x1], x2`
			`ld1 {v5.8b}, [x1], x2`
			`ld1 {v6.8b}, [x1], x2`
			`ld1 {v7.8b}, [x1], x2`
			`ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]`
			`uaddw v0.8h, v0.8h, v4.8b`
			`uaddw v1.8h, v1.8h, v5.8b`
			`uaddw v2.8h, v2.8h, v6.8b`
			`ld1 {v4.8b}, [x1], x2`
			`uaddw v3.8h, v3.8h, v7.8b`
			`ld1 {v5.8b}, [x1], x2`
			`sqxtun v0.8b, v0.8h`
			`ld1 {v6.8b}, [x1], x2`
			`sqxtun v1.8b, v1.8h`
			`ld1 {v7.8b}, [x1]`
			`sqxtun v2.8b, v2.8h`
			`sqxtun v3.8b, v3.8h`
			`uaddw v4.8h, v16.8h, v4.8b`
			`st1 {v0.8b}, [x3], x2`
			`uaddw v0.8h, v17.8h, v5.8b`
			`st1 {v1.8b}, [x3], x2`
			`uaddw v1.8h, v18.8h, v6.8b`
			`st1 {v2.8b}, [x3], x2`
			`uaddw v2.8h, v19.8h, v7.8b`
			`sqxtun v4.8b, v4.8h`
			`sqxtun v0.8b, v0.8h`
			`st1 {v3.8b}, [x3], x2`
			`sqxtun v1.8b, v1.8h`
			`sqxtun v2.8b, v2.8h`
			`st1 {v4.8b}, [x3], x2`
			`st1 {v0.8b}, [x3], x2`
			`st1 {v1.8b}, [x3], x2`
			`st1 {v2.8b}, [x3]`
			`ret`
			`endfunc`