From f5c05b9aa5aeb6079b76f9da452f8ee4050e8955 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Mon, 5 Dec 2011 21:18:05 +0000 Subject: [PATCH] rv40: NEON optimised chroma MC Signed-off-by: Mans Rullgard --- libavcodec/arm/Makefile | 2 + libavcodec/arm/h264cmc_neon.S | 80 ++++++++++++++++++++++++++++-- libavcodec/arm/rv40dsp_init_neon.c | 38 ++++++++++++++ libavcodec/rv34dsp.h | 1 + libavcodec/rv40dsp.c | 2 + 5 files changed, 118 insertions(+), 5 deletions(-) create mode 100644 libavcodec/arm/rv40dsp_init_neon.c diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index c125a59078..a948e6db3f 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -68,6 +68,8 @@ NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_init_neon.o \ NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_init_neon.o \ arm/rv34dsp_neon.o \ + arm/rv40dsp_init_neon.o \ + arm/h264cmc_neon.o \ NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o diff --git a/libavcodec/arm/h264cmc_neon.S b/libavcodec/arm/h264cmc_neon.S index e10adaca10..a6feadd189 100644 --- a/libavcodec/arm/h264cmc_neon.S +++ b/libavcodec/arm/h264cmc_neon.S @@ -21,8 +21,8 @@ #include "asm.S" /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ -.macro h264_chroma_mc8 type -function ff_\type\()_h264_chroma_mc8_neon, export=1 +.macro h264_chroma_mc8 type, codec=h264 +function ff_\type\()_\codec\()_chroma_mc8_neon, export=1 push {r4-r7, lr} ldrd r4, [sp, #20] .ifc \type,avg @@ -31,6 +31,15 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1 pld [r1] pld [r1, r2] + .ifc \codec,rv40 + movrel r6, rv40bias + lsr r7, r5, #1 + add r6, r6, r7, lsl #3 + lsr r7, r4, #1 + add r6, r6, r7, lsl #1 + vld1.16 {d22[],d23[]}, [r6,:16] + .endif + A muls r7, r4, r5 T mul r7, r4, r5 T cmp r7, #0 @@ -67,10 +76,17 @@ T cmp r7, #0 vmlal.u8 q9, d7, d1 vmlal.u8 q9, d4, d2 vmlal.u8 q9, d5, d3 - vrshrn.u16 d16, q8, #6 vld1.8 {d6, d7}, [r5], r4 pld [r1] + .ifc \codec,h264 + vrshrn.u16 d16, q8, #6 vrshrn.u16 d17, q9, 
#6 + .else + vadd.u16 q8, q8, q11 + vadd.u16 q9, q9, q11 + vshrn.u16 d16, q8, #6 + vshrn.u16 d17, q9, #6 + .endif .ifc \type,avg vld1.8 {d20}, [lr,:64], r2 vld1.8 {d21}, [lr,:64], r2 @@ -102,8 +118,15 @@ T cmp r7, #0 vmull.u8 q9, d6, d0 vmlal.u8 q9, d4, d1 vld1.8 {d6}, [r5], r4 + .ifc \codec,h264 vrshrn.u16 d16, q8, #6 vrshrn.u16 d17, q9, #6 + .else + vadd.u16 q8, q8, q11 + vadd.u16 q9, q9, q11 + vshrn.u16 d16, q8, #6 + vshrn.u16 d17, q9, #6 + .endif .ifc \type,avg vld1.8 {d20}, [lr,:64], r2 vld1.8 {d21}, [lr,:64], r2 @@ -131,8 +154,15 @@ T cmp r7, #0 vmlal.u8 q9, d7, d1 pld [r1] vext.8 d5, d4, d5, #1 + .ifc \codec,h264 vrshrn.u16 d16, q8, #6 vrshrn.u16 d17, q9, #6 + .else + vadd.u16 q8, q8, q11 + vadd.u16 q9, q9, q11 + vshrn.u16 d16, q8, #6 + vshrn.u16 d17, q9, #6 + .endif .ifc \type,avg vld1.8 {d20}, [lr,:64], r2 vld1.8 {d21}, [lr,:64], r2 @@ -149,8 +179,8 @@ endfunc .endm /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ -.macro h264_chroma_mc4 type -function ff_\type\()_h264_chroma_mc4_neon, export=1 +.macro h264_chroma_mc4 type, codec=h264 +function ff_\type\()_\codec\()_chroma_mc4_neon, export=1 push {r4-r7, lr} ldrd r4, [sp, #20] .ifc \type,avg @@ -159,6 +189,15 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1 pld [r1] pld [r1, r2] + .ifc \codec,rv40 + movrel r6, rv40bias + lsr r7, r5, #1 + add r6, r6, r7, lsl #3 + lsr r7, r4, #1 + add r6, r6, r7, lsl #1 + vld1.16 {d22[],d23[]}, [r6,:16] + .endif + A muls r7, r4, r5 T mul r7, r4, r5 T cmp r7, #0 @@ -199,7 +238,12 @@ T cmp r7, #0 vld1.8 {d6}, [r5], r4 vadd.i16 d16, d16, d17 vadd.i16 d17, d18, d19 + .ifc \codec,h264 vrshrn.u16 d16, q8, #6 + .else + vadd.u16 q8, q8, q11 + vshrn.u16 d16, q8, #6 + .endif subs r3, r3, #2 pld [r1] .ifc \type,avg @@ -236,7 +280,12 @@ T cmp r7, #0 vld1.32 {d4[1]}, [r5], r4 vadd.i16 d16, d16, d17 vadd.i16 d17, d18, d19 + .ifc \codec,h264 vrshrn.u16 d16, q8, #6 + .else + vadd.u16 q8, q8, q11 + vshrn.u16 d16, q8, #6 + .endif .ifc \type,avg vld1.32 
{d20[0]}, [lr,:32], r2 vld1.32 {d20[1]}, [lr,:32], r2 @@ -266,7 +315,12 @@ T cmp r7, #0 vadd.i16 d16, d16, d17 vadd.i16 d17, d18, d19 pld [r1] + .ifc \codec,h264 vrshrn.u16 d16, q8, #6 + .else + vadd.u16 q8, q8, q11 + vshrn.u16 d16, q8, #6 + .endif .ifc \type,avg vld1.32 {d20[0]}, [lr,:32], r2 vld1.32 {d20[1]}, [lr,:32], r2 @@ -352,9 +406,25 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1 endfunc .endm +#if CONFIG_H264_DECODER h264_chroma_mc8 put h264_chroma_mc8 avg h264_chroma_mc4 put h264_chroma_mc4 avg h264_chroma_mc2 put h264_chroma_mc2 avg +#endif + +#if CONFIG_RV40_DECODER +const rv40bias + .short 0, 16, 32, 16 + .short 32, 28, 32, 28 + .short 0, 32, 16, 32 + .short 32, 28, 32, 28 +endconst + + h264_chroma_mc8 put, rv40 + h264_chroma_mc8 avg, rv40 + h264_chroma_mc4 put, rv40 + h264_chroma_mc4 avg, rv40 +#endif diff --git a/libavcodec/arm/rv40dsp_init_neon.c b/libavcodec/arm/rv40dsp_init_neon.c new file mode 100644 index 0000000000..aa4a88da1a --- /dev/null +++ b/libavcodec/arm/rv40dsp_init_neon.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2011 Janne Grunau + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavcodec/avcodec.h" +#include "libavcodec/rv34dsp.h" + +void ff_put_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_put_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); + +void ff_avg_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); +void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); + +void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp) +{ + c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon; + c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon; + c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon; + c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon; +} diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h index a1636e6eb5..695af06970 100644 --- a/libavcodec/rv34dsp.h +++ b/libavcodec/rv34dsp.h @@ -59,5 +59,6 @@ void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp); void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext *dsp); void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp); +void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext *dsp); #endif /* AVCODEC_RV34DSP_H */ diff --git a/libavcodec/rv40dsp.c b/libavcodec/rv40dsp.c index f193b6050d..06bdf18c42 100644 --- a/libavcodec/rv40dsp.c +++ b/libavcodec/rv40dsp.c @@ -534,4 +534,6 @@ av_cold void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp) { if (HAVE_MMX) ff_rv40dsp_init_x86(c, dsp); + if (HAVE_NEON) + ff_rv40dsp_init_neon(c, dsp); }