mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
ARM: NEON optimised simple_idct
Originally committed as revision 16146 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
569f5a756a
commit
7eec43289a
@ -450,6 +450,7 @@ OBJS-$(HAVE_IWMMXT) += armv4l/dsputil_iwmmxt.o \
|
||||
|
||||
OBJS-$(HAVE_NEON) += armv4l/dsputil_neon.o \
|
||||
armv4l/dsputil_neon_s.o \
|
||||
armv4l/simple_idct_neon.o \
|
||||
|
||||
OBJS-$(ARCH_BFIN) += bfin/dsputil_bfin.o \
|
||||
bfin/fdct_bfin.o \
|
||||
|
@ -39,6 +39,10 @@ void ff_simple_idct_armv6(DCTELEM *data);
|
||||
void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);
|
||||
void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);
|
||||
|
||||
void ff_simple_idct_neon(DCTELEM *data);
|
||||
void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
|
||||
void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
|
||||
|
||||
/* XXX: local hack */
|
||||
static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
|
||||
static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
|
||||
@ -128,6 +132,8 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
|
||||
if(idct_algo == FF_IDCT_AUTO){
|
||||
#if defined(HAVE_IPP)
|
||||
idct_algo = FF_IDCT_IPP;
|
||||
#elif defined(HAVE_NEON)
|
||||
idct_algo = FF_IDCT_SIMPLENEON;
|
||||
#elif defined(HAVE_ARMV6)
|
||||
idct_algo = FF_IDCT_SIMPLEARMV6;
|
||||
#elif defined(HAVE_ARMV5TE)
|
||||
@ -167,6 +173,13 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
|
||||
c->idct_add= simple_idct_ipp_add;
|
||||
c->idct = simple_idct_ipp;
|
||||
c->idct_permutation_type= FF_NO_IDCT_PERM;
|
||||
#endif
|
||||
#ifdef HAVE_NEON
|
||||
} else if (idct_algo==FF_IDCT_SIMPLENEON){
|
||||
c->idct_put= ff_simple_idct_put_neon;
|
||||
c->idct_add= ff_simple_idct_add_neon;
|
||||
c->idct = ff_simple_idct_neon;
|
||||
c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
362
libavcodec/armv4l/simple_idct_neon.S
Normal file
362
libavcodec/armv4l/simple_idct_neon.S
Normal file
@ -0,0 +1,362 @@
|
||||
/*
|
||||
* ARM NEON IDCT
|
||||
*
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* Based on Simple IDCT
|
||||
* Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "asm.S"
|
||||
|
||||
/*
 * Fixed-point IDCT coefficients W1..W7, scaled by 1<<14 (formula in the
 * per-line comments, evaluated for i = 1..7).
 * NOTE(review): for i = 4 the formula evaluates to 16384, but W4 is defined
 * as 16383 -- presumably deliberate (matching the C "Simple IDCT" this file
 * is based on); confirm before changing.
 */
#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
||||
#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
||||
#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
||||
#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
||||
#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
||||
#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
||||
#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
|
||||
/*
 * Column-pass rounding bias (1 << (COL_SHIFT-1)) pre-divided by W4, so that
 * idct_col4_neon can add it to col[0] before the multiplication by W4.
 * COL_SHIFT is defined further down; this works because the macro is only
 * expanded at its point of use.
 */
#define W4c ((1<<(COL_SHIFT-1))/W4)
|
||||
/* Total right shift applied to the row-pass / column-pass results. */
#define ROW_SHIFT 11
|
||||
#define COL_SHIFT 20
|
||||
|
||||
/*
 * Scalar-lane aliases for the coefficients. d0/d1 are loaded with the
 * eight 16-bit entries of the `const` table by the idct_start macro, in
 * the order W1, W2, W3, W4, W5, W6, W7, W4c.
 */
#define w1 d0[0]
|
||||
#define w2 d0[1]
|
||||
#define w3 d0[2]
|
||||
#define w4 d0[3]
|
||||
#define w5 d1[0]
|
||||
#define w6 d1[1]
|
||||
#define w7 d1[2]
|
||||
#define w4c d1[3]
|
||||
|
||||
.fpu neon
|
||||
|
||||
/*
 * idct_col4_top: first half of the 1-D IDCT butterfly, 4 lanes at a time,
 * using only inputs 0..3 ("col[n]" below is the n-th input of the
 * transform, one value per lane).
 *
 * In:   q15 = W4 * col[0] + rounding bias (prepared by the caller)
 *       d4  = col[1], d6 = col[2], d8 = col[3]
 *       d0/d1 = coefficient lanes (w1..w7)
 * Out:  q11..q14 = even-part sums  (q15 +/- W2*col[2], q15 +/- W6*col[2])
 *       q9, q10, q5, q6 = odd-part sums built from col[1] and col[3]
 *       q7 = W2 * col[2], q8 = W6 * col[2] (scratch, overwritten by callers)
 *
 * Contains NEON instructions only, so the ARM condition flags set by the
 * caller before the macro are still valid after it.
 */
.macro idct_col4_top
|
||||
vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */
|
||||
vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */
|
||||
vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */
|
||||
vadd.i32 q11, q15, q7 /* q11 = q15 + W2 * col[2] */
|
||||
vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */
|
||||
vadd.i32 q12, q15, q8 /* q12 = q15 + W6 * col[2] */
|
||||
vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */
|
||||
vsub.i32 q13, q15, q8 /* q13 = q15 - W6 * col[2] */
|
||||
vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */
|
||||
vsub.i32 q14, q15, q7 /* q14 = q15 - W2 * col[2] */
|
||||
|
||||
vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */
|
||||
vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */
|
||||
vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */
|
||||
vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */
|
||||
.endm
|
||||
|
||||
.text
|
||||
.align 6
|
||||
|
||||
/*
 * idct_row4_neon: first (row) pass over 64 bytes of 16-bit coefficients.
 *
 * In:   r2    = pointer to the 64 bytes to process; the :128 hints on the
 *               loads/stores require 16-byte alignment
 *       d0/d1 = coefficient lanes (w1..w7)
 * Out:  the 64 bytes at the original r2 are overwritten with the results,
 *       shifted right by ROW_SHIFT and transposed in 4x4 groups;
 *       r2 is left advanced by 64, ready for the next call.
 * Clobbers: r3, r4, q1-q15, condition flags.
 *
 * Register use: d2/d4/d6/d8 feed idct_col4_top as inputs 0..3 and
 * d3/d5/d7/d9 are used as inputs 4..7.
 * NOTE(review): this input arrangement presumably relies on the
 * FF_PARTTRANS_IDCT_PERM coefficient permutation selected by the C
 * init code -- confirm.
 */
function idct_row4_neon
|
||||
vmov.i32 q15, #(1<<(ROW_SHIFT-1)) /* rounding bias for the truncating vshrn below */
|
||||
vld1.64 {d2-d5}, [r2,:128]!
|
||||
vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */
|
||||
vld1.64 {d6,d7}, [r2,:128]!
|
||||
vorr d10, d3, d5
|
||||
vld1.64 {d8,d9}, [r2,:128]!
|
||||
add r2, r2, #-64 /* rewind r2 to the start of the 64 bytes just loaded */
|
||||
|
||||
vorr d11, d7, d9
|
||||
vorr d10, d10, d11 /* d10 = d3 | d5 | d7 | d9 (all of inputs 4..7) */
|
||||
vmov r3, r4, d10 /* move the OR to core registers for the zero test */
|
||||
|
||||
idct_col4_top
|
||||
|
||||
orrs r3, r3, r4 /* Z set if inputs 4..7 are all zero */
|
||||
beq 1f /* ... in which case their contribution is skipped */
|
||||
|
||||
vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
|
||||
vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
|
||||
vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
|
||||
vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
|
||||
vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
|
||||
vadd.i32 q11, q11, q7 /* q11 += W4 * col[4] */
|
||||
vsub.i32 q12, q12, q7 /* q12 -= W4 * col[4] */
|
||||
vsub.i32 q13, q13, q7 /* q13 -= W4 * col[4] */
|
||||
vadd.i32 q14, q14, q7 /* q14 += W4 * col[4] */
|
||||
vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
|
||||
vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
|
||||
vmlal.s16 q9, d9, w7 /* q9 += W7 * col[7] */
|
||||
vmlsl.s16 q10, d9, w5 /* q10 -= W5 * col[7] */
|
||||
vmlal.s16 q5, d9, w3 /* q5 += W3 * col[7] */
|
||||
vmlsl.s16 q6, d9, w1 /* q6 -= W1 * col[7] */
|
||||
vadd.i32 q11, q11, q7 /* q11 += W6 * col[6] */
|
||||
vsub.i32 q12, q12, q8 /* q12 -= W2 * col[6] */
|
||||
vadd.i32 q13, q13, q8 /* q13 += W2 * col[6] */
|
||||
vsub.i32 q14, q14, q7 /* q14 -= W6 * col[6] */
|
||||
|
||||
/*
 * Final butterfly: outputs 0..3 are even+odd sums, outputs 7..4 are the
 * matching even-odd differences. Each is narrowed to 16 bits with a
 * truncating shift by ROW_SHIFT; the vtrn pairs are interleaved with the
 * arithmetic and transpose {d2,d4,d6,d8} as a 4x4 block of 16-bit values.
 */
1: vadd.i32 q3, q11, q9
|
||||
vadd.i32 q4, q12, q10
|
||||
vshrn.i32 d2, q3, #ROW_SHIFT /* d2 = output 0 */
|
||||
vshrn.i32 d4, q4, #ROW_SHIFT /* d4 = output 1 */
|
||||
vadd.i32 q7, q13, q5
|
||||
vadd.i32 q8, q14, q6
|
||||
vtrn.16 d2, d4
|
||||
vshrn.i32 d6, q7, #ROW_SHIFT /* d6 = output 2 */
|
||||
vshrn.i32 d8, q8, #ROW_SHIFT /* d8 = output 3 */
|
||||
vsub.i32 q14, q14, q6
|
||||
vsub.i32 q11, q11, q9
|
||||
vtrn.16 d6, d8
|
||||
vsub.i32 q13, q13, q5
|
||||
vshrn.i32 d3, q14, #ROW_SHIFT /* d3 = output 4 */
|
||||
vtrn.32 d2, d6
|
||||
vsub.i32 q12, q12, q10
|
||||
vtrn.32 d4, d8
|
||||
vshrn.i32 d5, q13, #ROW_SHIFT /* d5 = output 5 */
|
||||
vshrn.i32 d7, q12, #ROW_SHIFT /* d7 = output 6 */
|
||||
vshrn.i32 d9, q11, #ROW_SHIFT /* d9 = output 7 */
|
||||
|
||||
/* 4x4 transpose of {d3,d5,d7,d9} (outputs 4..7) */
vtrn.16 d3, d5
|
||||
vtrn.16 d7, d9
|
||||
vtrn.32 d3, d7
|
||||
vtrn.32 d5, d9
|
||||
|
||||
/* write the 64 result bytes back in place; r2 ends up advanced by 64 */
vst1.64 {d2-d5}, [r2,:128]!
|
||||
vst1.64 {d6-d9}, [r2,:128]!
|
||||
|
||||
bx lr
|
||||
.endfunc
|
||||
|
||||
/*
 * idct_col4_neon: second (column) pass for four adjacent columns.
 *
 * In:   r2    = pointer to the first of eight 8-byte groups, 16 bytes apart
 *               (the :64 hints require 8-byte alignment)
 *       d0/d1 = coefficient lanes (w1..w7, w4c)
 * Out:  d2..d9 = outputs 0..7 for the four columns, as the high 16 bits of
 *               the 32-bit sums (i.e. already shifted right by 16; the
 *               remaining COL_SHIFT-16 bits are shifted by the store helper
 *               that the callers invoke next)
 *       r2 is advanced by 128 on every path.
 * Clobbers: r4-r7, ip, q1-q15, condition flags.
 *
 * Inputs 4..7 are each tested for all-zero through core registers and
 * skipped when zero; in that case r2 is bumped by hand instead of by the
 * post-incrementing load.
 */
function idct_col4_neon
|
||||
mov ip, #16 /* byte stride between successive inputs */
|
||||
vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */
|
||||
vdup.16 d30, w4c
|
||||
vld1.64 {d4}, [r2,:64], ip /* d4 = col[1] */
|
||||
vadd.i16 d30, d30, d2 /* d30 = col[0] + W4c (rounding bias) */
|
||||
vld1.64 {d6}, [r2,:64], ip /* d6 = col[2] */
|
||||
vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<(COL_SHIFT-1))/W4) */
|
||||
vld1.64 {d8}, [r2,:64], ip /* d8 = col[3] */
|
||||
|
||||
ldrd r4, [r2] /* r4:r5 = raw bits of col[4] */
|
||||
ldrd r6, [r2, #16] /* r6:r7 = raw bits of col[5] */
|
||||
orrs r4, r4, r5 /* Z set if col[4] is all zero */
|
||||
|
||||
idct_col4_top /* NEON only: flags from orrs survive */
|
||||
addeq r2, r2, #16 /* col[4] == 0: step over it ... */
|
||||
beq 1f /* ... and skip its contribution */
|
||||
|
||||
vld1.64 {d3}, [r2,:64], ip /* d3 = col[4] */
|
||||
vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
|
||||
vadd.i32 q11, q11, q7
|
||||
vsub.i32 q12, q12, q7
|
||||
vsub.i32 q13, q13, q7
|
||||
vadd.i32 q14, q14, q7
|
||||
|
||||
1: orrs r6, r6, r7 /* Z set if col[5] is all zero */
|
||||
ldrd r4, [r2, #16] /* r4:r5 = raw bits of col[6] (r2 is at col[5]) */
|
||||
addeq r2, r2, #16
|
||||
beq 2f
|
||||
|
||||
vld1.64 {d5}, [r2,:64], ip /* d5 = col[5] */
|
||||
vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
|
||||
vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
|
||||
vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
|
||||
vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
|
||||
|
||||
2: orrs r4, r4, r5 /* Z set if col[6] is all zero */
|
||||
ldrd r4, [r2, #16] /* r4:r5 = raw bits of col[7]; ldrd leaves flags alone */
|
||||
addeq r2, r2, #16
|
||||
beq 3f
|
||||
|
||||
vld1.64 {d7}, [r2,:64], ip /* d7 = col[6] */
|
||||
vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
|
||||
vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
|
||||
vadd.i32 q11, q11, q7
|
||||
vsub.i32 q14, q14, q7
|
||||
vsub.i32 q12, q12, q8
|
||||
vadd.i32 q13, q13, q8
|
||||
|
||||
3: orrs r4, r4, r5 /* Z set if col[7] is all zero */
|
||||
addeq r2, r2, #16
|
||||
beq 4f
|
||||
|
||||
vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */
|
||||
vmlal.s16 q9, d9, w7 /* q9 += W7 * col[7] */
|
||||
vmlsl.s16 q10, d9, w5 /* q10 -= W5 * col[7] */
|
||||
vmlal.s16 q5, d9, w3 /* q5 += W3 * col[7] */
|
||||
vmlsl.s16 q6, d9, w1 /* q6 -= W1 * col[7] */
|
||||
|
||||
/*
 * Final butterfly. vaddhn/vsubhn keep the high 16 bits of each 32-bit
 * sum/difference: d2..d5 = outputs 0..3, d9..d6 = outputs 7..4.
 */
4: vaddhn.i32 d2, q11, q9
|
||||
vaddhn.i32 d3, q12, q10
|
||||
vaddhn.i32 d4, q13, q5
|
||||
vaddhn.i32 d5, q14, q6
|
||||
vsubhn.i32 d9, q11, q9
|
||||
vsubhn.i32 d8, q12, q10
|
||||
vsubhn.i32 d7, q13, q5
|
||||
vsubhn.i32 d6, q14, q6
|
||||
|
||||
bx lr
|
||||
.endfunc
|
||||
|
||||
.align 6
|
||||
|
||||
/*
 * idct_col4_st8_neon: finish the column pass and store 4 pixels x 8 lines.
 *
 * In:   d2..d9 (q1..q4) = 16-bit results left by idct_col4_neon
 *       r0 = destination (the :32 hints require 4-byte alignment)
 *       r1 = byte stride between destination lines
 * Out:  eight 4-byte groups written at r0, r0+r1, ...; r0 is advanced by
 *       8*r1.
 * Clobbers: d2-d5.
 */
function idct_col4_st8_neon
|
||||
/* apply the remaining COL_SHIFT-16 bits of shift and saturate to unsigned 8 bit */
vqshrun.s16 d2, q1, #COL_SHIFT-16
|
||||
vqshrun.s16 d3, q2, #COL_SHIFT-16
|
||||
vqshrun.s16 d4, q3, #COL_SHIFT-16
|
||||
vqshrun.s16 d5, q4, #COL_SHIFT-16
|
||||
/* each d register now holds two lines of 4 bytes; store one 32-bit lane per line */
vst1.32 {d2[0]}, [r0,:32], r1
|
||||
vst1.32 {d2[1]}, [r0,:32], r1
|
||||
vst1.32 {d3[0]}, [r0,:32], r1
|
||||
vst1.32 {d3[1]}, [r0,:32], r1
|
||||
vst1.32 {d4[0]}, [r0,:32], r1
|
||||
vst1.32 {d4[1]}, [r0,:32], r1
|
||||
vst1.32 {d5[0]}, [r0,:32], r1
|
||||
vst1.32 {d5[1]}, [r0,:32], r1
|
||||
|
||||
bx lr
|
||||
.endfunc
|
||||
|
||||
/*
 * Coefficient table: eight 16-bit values loaded into d0/d1 by idct_start
 * and addressed through the w1..w7 / w4c lane aliases, so the order here
 * must match those aliases. The alignment directive backs the :128 hint
 * on that load (on ARM gas, .align takes a power of two -- 4 means
 * 16 bytes).
 */
.section .rodata
|
||||
.align 4
|
||||
const: .short W1, W2, W3, W4, W5, W6, W7, W4c
|
||||
.previous
|
||||
|
||||
/*
 * idct_start: common prologue of the exported entry points.
 *
 * \data is the register holding the coefficient block pointer; it is only
 * used for the two preload hints. Saves r4-r7 and lr (the helpers clobber
 * r4-r7 and the entry points use bl), saves d8-d15 (callee-saved under
 * the ARM procedure call standard, clobbered as q4-q7 by the helpers), and
 * loads the coefficient table into d0/d1. Clobbers r3.
 * Must be paired with idct_end.
 */
.macro idct_start data
|
||||
push {r4-r7, lr}
|
||||
pld [\data] /* preload hints for the first and second 64 bytes of the block */
|
||||
pld [\data, #64]
|
||||
vpush {d8-d15}
|
||||
/* r3 = absolute address of const. NOTE(review): not position-independent */
movw r3, #:lower16:const
|
||||
movt r3, #:upper16:const
|
||||
vld1.64 {d0,d1}, [r3,:128] /* d0/d1 = W1..W7, W4c */
|
||||
.endm
|
||||
|
||||
/*
 * idct_end: common epilogue matching idct_start. Restores d8-d15 and
 * r4-r7, and returns to the caller by popping the saved lr into pc.
 */
.macro idct_end
|
||||
vpop {d8-d15}
|
||||
pop {r4-r7, pc}
|
||||
.endm
|
||||
|
||||
/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */
/*
 * Runs the IDCT on the 128-byte block at data (r2) and stores the result,
 * saturated to unsigned 8 bit, as 8 lines of 8 pixels at dst (r0) with a
 * stride of line_size (r1). The row pass works in place, so data is
 * overwritten with intermediate values.
 * The helpers' alignment hints require data to be 16-byte aligned and each
 * 4-pixel destination group to be 4-byte aligned.
 */
function ff_simple_idct_put_neon, export=1
|
||||
idct_start r2
|
||||
|
||||
bl idct_row4_neon /* row pass, first 64 bytes */
|
||||
bl idct_row4_neon /* row pass, second 64 bytes */
|
||||
add r2, r2, #-128 /* rewind r2 to the start of the block */
|
||||
bl idct_col4_neon /* column pass, columns 0..3 */
|
||||
bl idct_col4_st8_neon
|
||||
sub r0, r0, r1, lsl #3 /* undo the 8 line advances made by the store */
|
||||
add r0, r0, #4 /* ... and step 4 pixels to the right */
|
||||
add r2, r2, #-120 /* r2 = block start + 8 bytes: columns 4..7 */
|
||||
bl idct_col4_neon
|
||||
bl idct_col4_st8_neon
|
||||
|
||||
idct_end
|
||||
.endfunc
|
||||
|
||||
.align 6
|
||||
|
||||
/*
 * idct_col4_add8_neon: finish the column pass and add the result to the
 * existing 4 pixels x 8 lines at the destination.
 *
 * In:   d2..d9 (q1..q4) = 16-bit results left by idct_col4_neon
 *       r0 = destination (the :32 hints require 4-byte alignment)
 *       r1 = byte stride between destination lines
 * Out:  eight 4-byte groups at r0, r0+r1, ... replaced by
 *       saturate_u8(pixel + (result >> (COL_SHIFT-16)));
 *       r0 is advanced by 8*r1.
 * Clobbers: ip, q1-q6.
 *
 * Loads (through r0) and stores (through ip) are interleaved with the
 * arithmetic; all eight lines have been loaded before the first store.
 */
function idct_col4_add8_neon
|
||||
mov ip, r0 /* ip = separate write pointer, r0 keeps reading ahead */
|
||||
|
||||
vld1.32 {d10[0]}, [r0,:32], r1 /* d10 = lines 0,1 */
|
||||
vshr.s16 q1, q1, #COL_SHIFT-16 /* remaining bits of the column shift */
|
||||
vld1.32 {d10[1]}, [r0,:32], r1
|
||||
vshr.s16 q2, q2, #COL_SHIFT-16
|
||||
vld1.32 {d11[0]}, [r0,:32], r1 /* d11 = lines 2,3 */
|
||||
vshr.s16 q3, q3, #COL_SHIFT-16
|
||||
vld1.32 {d11[1]}, [r0,:32], r1
|
||||
vshr.s16 q4, q4, #COL_SHIFT-16
|
||||
vld1.32 {d12[0]}, [r0,:32], r1 /* d12 = lines 4,5 */
|
||||
vaddw.u8 q1, q1, d10 /* add the zero-extended pixels to the 16-bit results */
|
||||
vld1.32 {d12[1]}, [r0,:32], r1
|
||||
vaddw.u8 q2, q2, d11
|
||||
vld1.32 {d13[0]}, [r0,:32], r1 /* d13 = lines 6,7 */
|
||||
vqmovun.s16 d2, q1 /* saturate to unsigned 8 bit */
|
||||
vld1.32 {d13[1]}, [r0,:32], r1
|
||||
vaddw.u8 q3, q3, d12
|
||||
vst1.32 {d2[0]}, [ip,:32], r1
|
||||
vqmovun.s16 d3, q2
|
||||
vst1.32 {d2[1]}, [ip,:32], r1
|
||||
vaddw.u8 q4, q4, d13
|
||||
vst1.32 {d3[0]}, [ip,:32], r1
|
||||
vqmovun.s16 d4, q3
|
||||
vst1.32 {d3[1]}, [ip,:32], r1
|
||||
vqmovun.s16 d5, q4
|
||||
vst1.32 {d4[0]}, [ip,:32], r1
|
||||
vst1.32 {d4[1]}, [ip,:32], r1
|
||||
vst1.32 {d5[0]}, [ip,:32], r1
|
||||
vst1.32 {d5[1]}, [ip,:32], r1
|
||||
|
||||
bx lr
|
||||
.endfunc
|
||||
|
||||
/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */
/*
 * Runs the IDCT on the 128-byte block at data (r2) and adds the result,
 * with saturation to unsigned 8 bit, to the 8 lines of 8 pixels at dst (r0)
 * with a stride of line_size (r1). The row pass works in place, so data is
 * overwritten with intermediate values.
 * The helpers' alignment hints require data to be 16-byte aligned and each
 * 4-pixel destination group to be 4-byte aligned.
 */
function ff_simple_idct_add_neon, export=1
|
||||
idct_start r2
|
||||
|
||||
bl idct_row4_neon /* row pass, first 64 bytes */
|
||||
bl idct_row4_neon /* row pass, second 64 bytes */
|
||||
add r2, r2, #-128 /* rewind r2 to the start of the block */
|
||||
bl idct_col4_neon /* column pass, columns 0..3 */
|
||||
bl idct_col4_add8_neon
|
||||
sub r0, r0, r1, lsl #3 /* undo the 8 line advances made by the helper */
|
||||
add r0, r0, #4 /* ... and step 4 pixels to the right */
|
||||
add r2, r2, #-120 /* r2 = block start + 8 bytes: columns 4..7 */
|
||||
bl idct_col4_neon
|
||||
bl idct_col4_add8_neon
|
||||
|
||||
idct_end
|
||||
.endfunc
|
||||
|
||||
.align 6
|
||||
|
||||
/*
 * idct_col4_st16_neon: finish the column pass and store the four columns
 * back into the coefficient block as 16-bit values.
 *
 * In:   d2..d9 (q1..q4) = 16-bit results left by idct_col4_neon
 *       r2 = address of the first 8-byte group to write (the :64 hints
 *            require 8-byte alignment)
 * Out:  eight 8-byte groups written 16 bytes apart; r2 is advanced by 128.
 * Clobbers: ip, q1-q4.
 */
function idct_col4_st16_neon
|
||||
mov ip, #16 /* byte stride between successive outputs */
|
||||
|
||||
vshr.s16 q1, q1, #COL_SHIFT-16 /* remaining bits of the column shift */
|
||||
vshr.s16 q2, q2, #COL_SHIFT-16
|
||||
vst1.64 {d2}, [r2,:64], ip
|
||||
vshr.s16 q3, q3, #COL_SHIFT-16
|
||||
vst1.64 {d3}, [r2,:64], ip
|
||||
vshr.s16 q4, q4, #COL_SHIFT-16
|
||||
vst1.64 {d4}, [r2,:64], ip
|
||||
vst1.64 {d5}, [r2,:64], ip
|
||||
vst1.64 {d6}, [r2,:64], ip
|
||||
vst1.64 {d7}, [r2,:64], ip
|
||||
vst1.64 {d8}, [r2,:64], ip
|
||||
vst1.64 {d9}, [r2,:64], ip
|
||||
|
||||
bx lr
|
||||
.endfunc
|
||||
|
||||
/* void ff_simple_idct_neon(DCTELEM *data); */
/*
 * Runs the IDCT in place on the 128-byte block at data (r0); the results
 * are written back as 16-bit values. The helpers' alignment hints require
 * data to be 16-byte aligned.
 */
function ff_simple_idct_neon, export=1
|
||||
idct_start r0
|
||||
|
||||
mov r2, r0 /* the helpers expect the block pointer in r2 */
|
||||
bl idct_row4_neon /* row pass, first 64 bytes */
|
||||
bl idct_row4_neon /* row pass, second 64 bytes */
|
||||
add r2, r2, #-128 /* rewind r2 to the start of the block */
|
||||
bl idct_col4_neon /* column pass, columns 0..3 (advances r2 by 128) */
|
||||
add r2, r2, #-128 /* rewind again so the store lands on columns 0..3 */
|
||||
bl idct_col4_st16_neon
|
||||
add r2, r2, #-120 /* r2 = block start + 8 bytes: columns 4..7 */
|
||||
bl idct_col4_neon
|
||||
add r2, r2, #-128 /* back to block start + 8 for the store */
|
||||
bl idct_col4_st16_neon
|
||||
|
||||
idct_end
|
||||
.endfunc
|
@ -1390,6 +1390,7 @@ typedef struct AVCodecContext {
|
||||
#define FF_IDCT_WMV2 19
|
||||
#define FF_IDCT_FAAN 20
|
||||
#define FF_IDCT_EA 21
|
||||
#define FF_IDCT_SIMPLENEON 22
|
||||
|
||||
/**
|
||||
* slice count
|
||||
|
@ -564,6 +564,7 @@ static const AVOption options[]={
|
||||
{"simplearm", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARM, INT_MIN, INT_MAX, V|E|D, "idct"},
|
||||
{"simplearmv5te", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARMV5TE, INT_MIN, INT_MAX, V|E|D, "idct"},
|
||||
{"simplearmv6", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARMV6, INT_MIN, INT_MAX, V|E|D, "idct"},
|
||||
{"simpleneon", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLENEON, INT_MIN, INT_MAX, V|E|D, "idct"},
|
||||
{"h264", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_H264, INT_MIN, INT_MAX, V|E|D, "idct"},
|
||||
{"vp3", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_VP3, INT_MIN, INT_MAX, V|E|D, "idct"},
|
||||
{"ipp", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_IPP, INT_MIN, INT_MAX, V|E|D, "idct"},
|
||||
|
Loading…
x
Reference in New Issue
Block a user