mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-08 13:22:53 +02:00
aarch64: h264 loop filter NEON optimizations
Ported from ARMv7 NEON.
This commit is contained in:
parent
c65d67ef50
commit
36e3b1f2fd
@ -6,7 +6,8 @@ OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o
|
||||
OBJS-$(CONFIG_VC1_DECODER) += aarch64/vc1dsp_init_aarch64.o
|
||||
|
||||
NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264idct_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \
|
||||
aarch64/h264idct_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
|
||||
aarch64/hpeldsp_neon.o
|
||||
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
|
||||
|
@ -25,6 +25,15 @@
|
||||
#include "libavutil/aarch64/cpu.h"
|
||||
#include "libavcodec/h264dsp.h"
|
||||
|
||||
void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
|
||||
int beta, int8_t *tc0);
|
||||
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
|
||||
int beta, int8_t *tc0);
|
||||
void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
|
||||
int beta, int8_t *tc0);
|
||||
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
|
||||
int beta, int8_t *tc0);
|
||||
|
||||
void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
|
||||
void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
|
||||
void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
|
||||
@ -49,6 +58,11 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags) && bit_depth == 8) {
|
||||
c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
|
||||
c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
|
||||
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
|
||||
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
|
||||
|
||||
c->h264_idct_add = ff_h264_idct_add_neon;
|
||||
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
|
||||
c->h264_idct_add16 = ff_h264_idct_add16_neon;
|
||||
|
259
libavcodec/aarch64/h264dsp_neon.S
Normal file
259
libavcodec/aarch64/h264dsp_neon.S
Normal file
@ -0,0 +1,259 @@
|
||||
/*
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
#include "neon.S"
|
||||
|
||||
.macro h264_loop_filter_start
|
||||
cmp w2, #0
|
||||
ldr w6, [x4]
|
||||
ccmp w3, #0, #0, ne
|
||||
mov v24.S[0], w6
|
||||
and w6, w6, w6, lsl #16
|
||||
b.eq 1f
|
||||
ands w6, w6, w6, lsl #8
|
||||
b.ge 2f
|
||||
1:
|
||||
ret
|
||||
2:
|
||||
.endm
|
||||
|
||||
.macro h264_loop_filter_luma
|
||||
dup v22.16B, w2 // alpha
|
||||
uxtl v24.8H, v24.8B
|
||||
uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0)
|
||||
uxtl v24.4S, v24.4H
|
||||
uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
|
||||
sli v24.8H, v24.8H, #8
|
||||
uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0)
|
||||
sli v24.4S, v24.4S, #16
|
||||
cmhi v21.16B, v22.16B, v21.16B // < alpha
|
||||
dup v22.16B, w3 // beta
|
||||
cmlt v23.16B, v24.16B, #0
|
||||
cmhi v28.16B, v22.16B, v28.16B // < beta
|
||||
cmhi v30.16B, v22.16B, v30.16B // < beta
|
||||
bic v21.16B, v21.16B, v23.16B
|
||||
uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
|
||||
and v21.16B, v21.16B, v28.16B
|
||||
uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
|
||||
cmhi v17.16B, v22.16B, v17.16B // < beta
|
||||
and v21.16B, v21.16B, v30.16B
|
||||
cmhi v19.16B, v22.16B, v19.16B // < beta
|
||||
and v17.16B, v17.16B, v21.16B
|
||||
and v19.16B, v19.16B, v21.16B
|
||||
and v24.16B, v24.16B, v21.16B
|
||||
urhadd v28.16B, v16.16B, v0.16B
|
||||
sub v21.16B, v24.16B, v17.16B
|
||||
uqadd v23.16B, v18.16B, v24.16B
|
||||
uhadd v20.16B, v20.16B, v28.16B
|
||||
sub v21.16B, v21.16B, v19.16B
|
||||
uhadd v28.16B, v4.16B, v28.16B
|
||||
umin v23.16B, v23.16B, v20.16B
|
||||
uqsub v22.16B, v18.16B, v24.16B
|
||||
uqadd v4.16B, v2.16B, v24.16B
|
||||
umax v23.16B, v23.16B, v22.16B
|
||||
uqsub v22.16B, v2.16B, v24.16B
|
||||
umin v28.16B, v4.16B, v28.16B
|
||||
uxtl v4.8H, v0.8B
|
||||
umax v28.16B, v28.16B, v22.16B
|
||||
uxtl2 v20.8H, v0.16B
|
||||
usubw v4.8H, v4.8H, v16.8B
|
||||
usubw2 v20.8H, v20.8H, v16.16B
|
||||
shl v4.8H, v4.8H, #2
|
||||
shl v20.8H, v20.8H, #2
|
||||
uaddw v4.8H, v4.8H, v18.8B
|
||||
uaddw2 v20.8H, v20.8H, v18.16B
|
||||
usubw v4.8H, v4.8H, v2.8B
|
||||
usubw2 v20.8H, v20.8H, v2.16B
|
||||
rshrn v4.8B, v4.8H, #3
|
||||
rshrn2 v4.16B, v20.8H, #3
|
||||
bsl v17.16B, v23.16B, v18.16B
|
||||
bsl v19.16B, v28.16B, v2.16B
|
||||
neg v23.16B, v21.16B
|
||||
uxtl v28.8H, v16.8B
|
||||
smin v4.16B, v4.16B, v21.16B
|
||||
uxtl2 v21.8H, v16.16B
|
||||
smax v4.16B, v4.16B, v23.16B
|
||||
uxtl v22.8H, v0.8B
|
||||
uxtl2 v24.8H, v0.16B
|
||||
saddw v28.8H, v28.8H, v4.8B
|
||||
saddw2 v21.8H, v21.8H, v4.16B
|
||||
ssubw v22.8H, v22.8H, v4.8B
|
||||
ssubw2 v24.8H, v24.8H, v4.16B
|
||||
sqxtun v16.8B, v28.8H
|
||||
sqxtun2 v16.16B, v21.8H
|
||||
sqxtun v0.8B, v22.8H
|
||||
sqxtun2 v0.16B, v24.8H
|
||||
.endm
|
||||
|
||||
function ff_h264_v_loop_filter_luma_neon, export=1
|
||||
h264_loop_filter_start
|
||||
sxtw x1, w1
|
||||
|
||||
ld1 {v0.16B}, [x0], x1
|
||||
ld1 {v2.16B}, [x0], x1
|
||||
ld1 {v4.16B}, [x0], x1
|
||||
sub x0, x0, x1, lsl #2
|
||||
sub x0, x0, x1, lsl #1
|
||||
ld1 {v20.16B}, [x0], x1
|
||||
ld1 {v18.16B}, [x0], x1
|
||||
ld1 {v16.16B}, [x0], x1
|
||||
|
||||
h264_loop_filter_luma
|
||||
|
||||
sub x0, x0, x1, lsl #1
|
||||
st1 {v17.16B}, [x0], x1
|
||||
st1 {v16.16B}, [x0], x1
|
||||
st1 {v0.16B}, [x0], x1
|
||||
st1 {v19.16B}, [x0]
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_h264_h_loop_filter_luma_neon, export=1
|
||||
h264_loop_filter_start
|
||||
|
||||
sub x0, x0, #4
|
||||
ld1 {v6.8B}, [x0], x1
|
||||
ld1 {v20.8B}, [x0], x1
|
||||
ld1 {v18.8B}, [x0], x1
|
||||
ld1 {v16.8B}, [x0], x1
|
||||
ld1 {v0.8B}, [x0], x1
|
||||
ld1 {v2.8B}, [x0], x1
|
||||
ld1 {v4.8B}, [x0], x1
|
||||
ld1 {v26.8B}, [x0], x1
|
||||
ld1 {v6.D}[1], [x0], x1
|
||||
ld1 {v20.D}[1], [x0], x1
|
||||
ld1 {v18.D}[1], [x0], x1
|
||||
ld1 {v16.D}[1], [x0], x1
|
||||
ld1 {v0.D}[1], [x0], x1
|
||||
ld1 {v2.D}[1], [x0], x1
|
||||
ld1 {v4.D}[1], [x0], x1
|
||||
ld1 {v26.D}[1], [x0], x1
|
||||
|
||||
transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
|
||||
|
||||
h264_loop_filter_luma
|
||||
|
||||
transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
|
||||
|
||||
sub x0, x0, x1, lsl #4
|
||||
add x0, x0, #2
|
||||
st1 {v17.S}[0], [x0], x1
|
||||
st1 {v16.S}[0], [x0], x1
|
||||
st1 {v0.S}[0], [x0], x1
|
||||
st1 {v19.S}[0], [x0], x1
|
||||
st1 {v17.S}[1], [x0], x1
|
||||
st1 {v16.S}[1], [x0], x1
|
||||
st1 {v0.S}[1], [x0], x1
|
||||
st1 {v19.S}[1], [x0], x1
|
||||
st1 {v17.S}[2], [x0], x1
|
||||
st1 {v16.S}[2], [x0], x1
|
||||
st1 {v0.S}[2], [x0], x1
|
||||
st1 {v19.S}[2], [x0], x1
|
||||
st1 {v17.S}[3], [x0], x1
|
||||
st1 {v16.S}[3], [x0], x1
|
||||
st1 {v0.S}[3], [x0], x1
|
||||
st1 {v19.S}[3], [x0], x1
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
.macro h264_loop_filter_chroma
|
||||
dup v22.8B, w2 // alpha
|
||||
uxtl v24.8H, v24.8B
|
||||
uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
|
||||
uxtl v4.8H, v0.8B
|
||||
uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
|
||||
usubw v4.8H, v4.8H, v16.8B
|
||||
sli v24.8H, v24.8H, #8
|
||||
shl v4.8H, v4.8H, #2
|
||||
uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
|
||||
uaddw v4.8H, v4.8H, v18.8B
|
||||
cmhi v26.8B, v22.8B, v26.8B // < alpha
|
||||
usubw v4.8H, v4.8H, v2.8B
|
||||
dup v22.8B, w3 // beta
|
||||
rshrn v4.8B, v4.8H, #3
|
||||
cmhi v28.8B, v22.8B, v28.8B // < beta
|
||||
cmhi v30.8B, v22.8B, v30.8B // < beta
|
||||
smin v4.8B, v4.8B, v24.8B
|
||||
neg v25.8B, v24.8B
|
||||
and v26.8B, v26.8B, v28.8B
|
||||
smax v4.8B, v4.8B, v25.8B
|
||||
and v26.8B, v26.8B, v30.8B
|
||||
uxtl v22.8H, v0.8B
|
||||
and v4.8B, v4.8B, v26.8B
|
||||
uxtl v28.8H, v16.8B
|
||||
saddw v28.8H, v28.8H, v4.8B
|
||||
ssubw v22.8H, v22.8H, v4.8B
|
||||
sqxtun v16.8B, v28.8H
|
||||
sqxtun v0.8B, v22.8H
|
||||
.endm
|
||||
|
||||
function ff_h264_v_loop_filter_chroma_neon, export=1
|
||||
h264_loop_filter_start
|
||||
|
||||
sub x0, x0, x1, lsl #1
|
||||
ld1 {v18.8B}, [x0], x1
|
||||
ld1 {v16.8B}, [x0], x1
|
||||
ld1 {v0.8B}, [x0], x1
|
||||
ld1 {v2.8B}, [x0]
|
||||
|
||||
h264_loop_filter_chroma
|
||||
|
||||
sub x0, x0, x1, lsl #1
|
||||
st1 {v16.8B}, [x0], x1
|
||||
st1 {v0.8B}, [x0], x1
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_h264_h_loop_filter_chroma_neon, export=1
|
||||
h264_loop_filter_start
|
||||
|
||||
sub x0, x0, #2
|
||||
ld1 {v18.S}[0], [x0], x1
|
||||
ld1 {v16.S}[0], [x0], x1
|
||||
ld1 {v0.S}[0], [x0], x1
|
||||
ld1 {v2.S}[0], [x0], x1
|
||||
ld1 {v18.S}[1], [x0], x1
|
||||
ld1 {v16.S}[1], [x0], x1
|
||||
ld1 {v0.S}[1], [x0], x1
|
||||
ld1 {v2.S}[1], [x0], x1
|
||||
|
||||
transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
|
||||
|
||||
h264_loop_filter_chroma
|
||||
|
||||
transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
|
||||
|
||||
sub x0, x0, x1, lsl #3
|
||||
st1 {v18.S}[0], [x0], x1
|
||||
st1 {v16.S}[0], [x0], x1
|
||||
st1 {v0.S}[0], [x0], x1
|
||||
st1 {v2.S}[0], [x0], x1
|
||||
st1 {v18.S}[1], [x0], x1
|
||||
st1 {v16.S}[1], [x0], x1
|
||||
st1 {v0.S}[1], [x0], x1
|
||||
st1 {v2.S}[1], [x0], x1
|
||||
|
||||
ret
|
||||
endfunc
|
@ -80,6 +80,30 @@
|
||||
trn2 \r7\().4S, \t1\().4S, \r7\().4S
|
||||
.endm
|
||||
|
||||
.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
|
||||
trn1 \t4\().16B, \r0\().16B, \r1\().16B
|
||||
trn2 \t5\().16B, \r0\().16B, \r1\().16B
|
||||
trn1 \t6\().16B, \r2\().16B, \r3\().16B
|
||||
trn2 \t7\().16B, \r2\().16B, \r3\().16B
|
||||
|
||||
trn1 \r0\().8H, \t4\().8H, \t6\().8H
|
||||
trn2 \r2\().8H, \t4\().8H, \t6\().8H
|
||||
trn1 \r1\().8H, \t5\().8H, \t7\().8H
|
||||
trn2 \r3\().8H, \t5\().8H, \t7\().8H
|
||||
.endm
|
||||
|
||||
.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
|
||||
trn1 \t4\().8B, \r0\().8B, \r1\().8B
|
||||
trn2 \t5\().8B, \r0\().8B, \r1\().8B
|
||||
trn1 \t6\().8B, \r2\().8B, \r3\().8B
|
||||
trn2 \t7\().8B, \r2\().8B, \r3\().8B
|
||||
|
||||
trn1 \r0\().4H, \t4\().4H, \t6\().4H
|
||||
trn2 \r2\().4H, \t4\().4H, \t6\().4H
|
||||
trn1 \r1\().4H, \t5\().4H, \t7\().4H
|
||||
trn2 \r3\().4H, \t5\().4H, \t7\().4H
|
||||
.endm
|
||||
|
||||
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
|
||||
trn1 \r4\().4H, \r0\().4H, \r1\().4H
|
||||
trn2 \r5\().4H, \r0\().4H, \r1\().4H
|
||||
|
Loading…
Reference in New Issue
Block a user