From 2957d29f0531ccd8a6f4378293424dfd92db3044 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sun, 10 Mar 2013 16:53:07 -0700 Subject: [PATCH] alpha: hpeldsp: Move half-pel assembly from dsputil to hpeldsp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Martin Storsjö --- libavcodec/alpha/Makefile | 2 + libavcodec/alpha/dsputil_alpha.c | 185 ----------------------- libavcodec/alpha/dsputil_alpha.h | 2 - libavcodec/alpha/dsputil_alpha_asm.S | 103 ------------- libavcodec/alpha/hpeldsp_alpha.c | 213 +++++++++++++++++++++++++++ libavcodec/alpha/hpeldsp_alpha.h | 28 ++++ libavcodec/alpha/hpeldsp_alpha_asm.S | 124 ++++++++++++++++ libavcodec/alpha/regdef.h | 11 ++ libavcodec/hpeldsp.c | 2 + libavcodec/hpeldsp.h | 1 + 10 files changed, 381 insertions(+), 290 deletions(-) create mode 100644 libavcodec/alpha/hpeldsp_alpha.c create mode 100644 libavcodec/alpha/hpeldsp_alpha.h create mode 100644 libavcodec/alpha/hpeldsp_alpha_asm.S diff --git a/libavcodec/alpha/Makefile b/libavcodec/alpha/Makefile index e28200d45a..6f22137167 100644 --- a/libavcodec/alpha/Makefile +++ b/libavcodec/alpha/Makefile @@ -4,4 +4,6 @@ OBJS += alpha/dsputil_alpha.o \ alpha/motion_est_mvi_asm.o \ alpha/simple_idct_alpha.o \ +OBJS-$(CONFIG_HPELDSP) += alpha/hpeldsp_alpha.o \ + alpha/hpeldsp_alpha_asm.o OBJS-$(CONFIG_MPEGVIDEO) += alpha/mpegvideo_alpha.o diff --git a/libavcodec/alpha/dsputil_alpha.c b/libavcodec/alpha/dsputil_alpha.c index 34fe2ebcad..7a41cb8ebb 100644 --- a/libavcodec/alpha/dsputil_alpha.c +++ b/libavcodec/alpha/dsputil_alpha.c @@ -119,196 +119,11 @@ static void clear_blocks_axp(int16_t *blocks) { } while (n); } -static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b) -{ - return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1); -} - -static inline uint64_t avg2(uint64_t a, uint64_t b) -{ - return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1); -} - -#if 0 -/* The XY2 routines basically utilize this scheme, but reuse parts in - each iteration. 
*/ -static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4) -{ - uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) - + ((l2 & ~BYTE_VEC(0x03)) >> 2) - + ((l3 & ~BYTE_VEC(0x03)) >> 2) - + ((l4 & ~BYTE_VEC(0x03)) >> 2); - uint64_t r2 = (( (l1 & BYTE_VEC(0x03)) - + (l2 & BYTE_VEC(0x03)) - + (l3 & BYTE_VEC(0x03)) - + (l4 & BYTE_VEC(0x03)) - + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); - return r1 + r2; -} -#endif - -#define OP(LOAD, STORE) \ - do { \ - STORE(LOAD(pixels), block); \ - pixels += line_size; \ - block += line_size; \ - } while (--h) - -#define OP_X2(LOAD, STORE) \ - do { \ - uint64_t pix1, pix2; \ - \ - pix1 = LOAD(pixels); \ - pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ - STORE(AVG2(pix1, pix2), block); \ - pixels += line_size; \ - block += line_size; \ - } while (--h) - -#define OP_Y2(LOAD, STORE) \ - do { \ - uint64_t pix = LOAD(pixels); \ - do { \ - uint64_t next_pix; \ - \ - pixels += line_size; \ - next_pix = LOAD(pixels); \ - STORE(AVG2(pix, next_pix), block); \ - block += line_size; \ - pix = next_pix; \ - } while (--h); \ - } while (0) - -#define OP_XY2(LOAD, STORE) \ - do { \ - uint64_t pix1 = LOAD(pixels); \ - uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ - uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \ - + (pix2 & BYTE_VEC(0x03)); \ - uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \ - + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \ - \ - do { \ - uint64_t npix1, npix2; \ - uint64_t npix_l, npix_h; \ - uint64_t avg; \ - \ - pixels += line_size; \ - npix1 = LOAD(pixels); \ - npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \ - npix_l = (npix1 & BYTE_VEC(0x03)) \ - + (npix2 & BYTE_VEC(0x03)); \ - npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \ - + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \ - avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \ - + pix_h + npix_h; \ - STORE(avg, block); \ - \ - block += line_size; \ - pix_l = npix_l; \ - pix_h = npix_h; \ - } while (--h); \ - } while (0) - -#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \ -static void OPNAME ## _pixels ## SUFF ## _axp \ - (uint8_t *restrict block, const uint8_t *restrict pixels, \ - ptrdiff_t line_size, int h) \ -{ \ - if ((size_t) pixels & 0x7) { \ - OPKIND(uldq, STORE); \ - } else { \ - OPKIND(ldq, STORE); \ - } \ -} \ - \ -static void OPNAME ## _pixels16 ## SUFF ## _axp \ - (uint8_t *restrict block, const uint8_t *restrict pixels, \ - ptrdiff_t line_size, int h) \ -{ \ - OPNAME ## _pixels ## SUFF ## _axp(block, pixels, line_size, h); \ - OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \ -} - -#define PIXOP(OPNAME, STORE) \ - MAKE_OP(OPNAME, , OP, STORE) \ - MAKE_OP(OPNAME, _x2, OP_X2, STORE) \ - MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \ - MAKE_OP(OPNAME, _xy2, OP_XY2, STORE) - -/* Rounding primitives. */ -#define AVG2 avg2 -#define AVG4 avg4 -#define AVG4_ROUNDER BYTE_VEC(0x02) -#define STORE(l, b) stq(l, b) -PIXOP(put, STORE); - -#undef STORE -#define STORE(l, b) stq(AVG2(l, ldq(b)), b); -PIXOP(avg, STORE); - -/* Not rounding primitives. 
*/ -#undef AVG2 -#undef AVG4 -#undef AVG4_ROUNDER -#undef STORE -#define AVG2 avg2_no_rnd -#define AVG4 avg4_no_rnd -#define AVG4_ROUNDER BYTE_VEC(0x01) -#define STORE(l, b) stq(l, b) -PIXOP(put_no_rnd, STORE); - -#undef STORE -#define STORE(l, b) stq(AVG2(l, ldq(b)), b); -PIXOP(avg_no_rnd, STORE); - -static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - put_pixels_axp_asm(block, pixels, line_size, h); - put_pixels_axp_asm(block + 8, pixels + 8, line_size, h); -} - av_cold void ff_dsputil_init_alpha(DSPContext *c, AVCodecContext *avctx) { const int high_bit_depth = avctx->bits_per_raw_sample > 8; if (!high_bit_depth) { - c->put_pixels_tab[0][0] = put_pixels16_axp_asm; - c->put_pixels_tab[0][1] = put_pixels16_x2_axp; - c->put_pixels_tab[0][2] = put_pixels16_y2_axp; - c->put_pixels_tab[0][3] = put_pixels16_xy2_axp; - - c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm; - c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp; - c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp; - c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp; - - c->avg_pixels_tab[0][0] = avg_pixels16_axp; - c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp; - c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp; - c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp; - - c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp; - c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp; - c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp; - c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp; - - c->put_pixels_tab[1][0] = put_pixels_axp_asm; - c->put_pixels_tab[1][1] = put_pixels_x2_axp; - c->put_pixels_tab[1][2] = put_pixels_y2_axp; - c->put_pixels_tab[1][3] = put_pixels_xy2_axp; - - c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm; - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp; - c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp; - - c->avg_pixels_tab[1][0] = avg_pixels_axp; - c->avg_pixels_tab[1][1] = avg_pixels_x2_axp; - c->avg_pixels_tab[1][2] = avg_pixels_y2_axp; - c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp; - c->clear_blocks = clear_blocks_axp; } diff --git a/libavcodec/alpha/dsputil_alpha.h b/libavcodec/alpha/dsputil_alpha.h index fcea47c665..d976c18e2e 100644 --- a/libavcodec/alpha/dsputil_alpha.h +++ b/libavcodec/alpha/dsputil_alpha.h @@ -26,8 +26,6 @@ void ff_simple_idct_axp(int16_t *block); void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block); void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block); -void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels, int line_size); void add_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels, diff --git a/libavcodec/alpha/dsputil_alpha_asm.S b/libavcodec/alpha/dsputil_alpha_asm.S index 1f589aa0c0..afe02cc391 100644 --- a/libavcodec/alpha/dsputil_alpha_asm.S +++ b/libavcodec/alpha/dsputil_alpha_asm.S @@ -26,114 +26,11 @@ #include "regdef.h" -/* Some nicer register names. 
*/ -#define ta t10 -#define tb t11 -#define tc t12 -#define td AT -/* Danger: these overlap with the argument list and the return value */ -#define te a5 -#define tf a4 -#define tg a3 -#define th v0 - .set noat .set noreorder .arch pca56 .text -/************************************************************************ - * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels, - * int line_size, int h) - */ - .align 6 - .globl put_pixels_axp_asm - .ent put_pixels_axp_asm -put_pixels_axp_asm: - .frame sp, 0, ra - .prologue 0 - - and a1, 7, t0 - beq t0, $aligned - - .align 4 -$unaligned: - ldq_u t0, 0(a1) - ldq_u t1, 8(a1) - addq a1, a2, a1 - nop - - ldq_u t2, 0(a1) - ldq_u t3, 8(a1) - addq a1, a2, a1 - nop - - ldq_u t4, 0(a1) - ldq_u t5, 8(a1) - addq a1, a2, a1 - nop - - ldq_u t6, 0(a1) - ldq_u t7, 8(a1) - extql t0, a1, t0 - addq a1, a2, a1 - - extqh t1, a1, t1 - addq a0, a2, t8 - extql t2, a1, t2 - addq t8, a2, t9 - - extqh t3, a1, t3 - addq t9, a2, ta - extql t4, a1, t4 - or t0, t1, t0 - - extqh t5, a1, t5 - or t2, t3, t2 - extql t6, a1, t6 - or t4, t5, t4 - - extqh t7, a1, t7 - or t6, t7, t6 - stq t0, 0(a0) - stq t2, 0(t8) - - stq t4, 0(t9) - subq a3, 4, a3 - stq t6, 0(ta) - addq ta, a2, a0 - - bne a3, $unaligned - ret - - .align 4 -$aligned: - ldq t0, 0(a1) - addq a1, a2, a1 - ldq t1, 0(a1) - addq a1, a2, a1 - - ldq t2, 0(a1) - addq a1, a2, a1 - ldq t3, 0(a1) - - addq a0, a2, t4 - addq a1, a2, a1 - addq t4, a2, t5 - subq a3, 4, a3 - - stq t0, 0(a0) - addq t5, a2, t6 - stq t1, 0(t4) - addq t6, a2, a0 - - stq t2, 0(t5) - stq t3, 0(t6) - - bne a3, $aligned - ret - .end put_pixels_axp_asm - /************************************************************************ * void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels, * int line_size) diff --git a/libavcodec/alpha/hpeldsp_alpha.c b/libavcodec/alpha/hpeldsp_alpha.c new file mode 100644 index 0000000000..144fa223f9 --- /dev/null +++ b/libavcodec/alpha/hpeldsp_alpha.c @@ -0,0 +1,213 @@ +/* + * Alpha optimized DSP utils + * Copyright (c) 2002 Falk Hueffner + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavcodec/hpeldsp.h" +#include "hpeldsp_alpha.h" +#include "asm.h" + +static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b) +{ + return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1); +} + +static inline uint64_t avg2(uint64_t a, uint64_t b) +{ + return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1); +} + +#if 0 +/* The XY2 routines basically utilize this scheme, but reuse parts in + each iteration. 
*/ +static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4) +{ + uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) + + ((l2 & ~BYTE_VEC(0x03)) >> 2) + + ((l3 & ~BYTE_VEC(0x03)) >> 2) + + ((l4 & ~BYTE_VEC(0x03)) >> 2); + uint64_t r2 = (( (l1 & BYTE_VEC(0x03)) + + (l2 & BYTE_VEC(0x03)) + + (l3 & BYTE_VEC(0x03)) + + (l4 & BYTE_VEC(0x03)) + + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); + return r1 + r2; +} +#endif + +#define OP(LOAD, STORE) \ + do { \ + STORE(LOAD(pixels), block); \ + pixels += line_size; \ + block += line_size; \ + } while (--h) + +#define OP_X2(LOAD, STORE) \ + do { \ + uint64_t pix1, pix2; \ + \ + pix1 = LOAD(pixels); \ + pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ + STORE(AVG2(pix1, pix2), block); \ + pixels += line_size; \ + block += line_size; \ + } while (--h) + +#define OP_Y2(LOAD, STORE) \ + do { \ + uint64_t pix = LOAD(pixels); \ + do { \ + uint64_t next_pix; \ + \ + pixels += line_size; \ + next_pix = LOAD(pixels); \ + STORE(AVG2(pix, next_pix), block); \ + block += line_size; \ + pix = next_pix; \ + } while (--h); \ + } while (0) + +#define OP_XY2(LOAD, STORE) \ + do { \ + uint64_t pix1 = LOAD(pixels); \ + uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ + uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \ + + (pix2 & BYTE_VEC(0x03)); \ + uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \ + + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \ + \ + do { \ + uint64_t npix1, npix2; \ + uint64_t npix_l, npix_h; \ + uint64_t avg; \ + \ + pixels += line_size; \ + npix1 = LOAD(pixels); \ + npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \ + npix_l = (npix1 & BYTE_VEC(0x03)) \ + + (npix2 & BYTE_VEC(0x03)); \ + npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \ + + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \ + avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \ + + pix_h + npix_h; \ + STORE(avg, block); \ + \ + block += line_size; \ + pix_l = npix_l; \ + pix_h = npix_h; \ + } while (--h); \ + } while (0) + +#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \ +static void OPNAME ## _pixels ## SUFF ## _axp \ + (uint8_t *restrict block, const uint8_t *restrict pixels, \ + ptrdiff_t line_size, int h) \ +{ \ + if ((size_t) pixels & 0x7) { \ + OPKIND(uldq, STORE); \ + } else { \ + OPKIND(ldq, STORE); \ + } \ +} \ + \ +static void OPNAME ## _pixels16 ## SUFF ## _axp \ + (uint8_t *restrict block, const uint8_t *restrict pixels, \ + ptrdiff_t line_size, int h) \ +{ \ + OPNAME ## _pixels ## SUFF ## _axp(block, pixels, line_size, h); \ + OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \ +} + +#define PIXOP(OPNAME, STORE) \ + MAKE_OP(OPNAME, , OP, STORE) \ + MAKE_OP(OPNAME, _x2, OP_X2, STORE) \ + MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \ + MAKE_OP(OPNAME, _xy2, OP_XY2, STORE) + +/* Rounding primitives. */ +#define AVG2 avg2 +#define AVG4 avg4 +#define AVG4_ROUNDER BYTE_VEC(0x02) +#define STORE(l, b) stq(l, b) +PIXOP(put, STORE); + +#undef STORE +#define STORE(l, b) stq(AVG2(l, ldq(b)), b); +PIXOP(avg, STORE); + +/* Not rounding primitives. 
*/ +#undef AVG2 +#undef AVG4 +#undef AVG4_ROUNDER +#undef STORE +#define AVG2 avg2_no_rnd +#define AVG4 avg4_no_rnd +#define AVG4_ROUNDER BYTE_VEC(0x01) +#define STORE(l, b) stq(l, b) +PIXOP(put_no_rnd, STORE); + +#undef STORE +#define STORE(l, b) stq(AVG2(l, ldq(b)), b); +PIXOP(avg_no_rnd, STORE); + +static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + put_pixels_axp_asm(block, pixels, line_size, h); + put_pixels_axp_asm(block + 8, pixels + 8, line_size, h); +} + +av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags) +{ + c->put_pixels_tab[0][0] = put_pixels16_axp_asm; + c->put_pixels_tab[0][1] = put_pixels16_x2_axp; + c->put_pixels_tab[0][2] = put_pixels16_y2_axp; + c->put_pixels_tab[0][3] = put_pixels16_xy2_axp; + + c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm; + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp; + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp; + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp; + + c->avg_pixels_tab[0][0] = avg_pixels16_axp; + c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp; + c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp; + c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp; + + c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp; + c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp; + c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp; + c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp; + + c->put_pixels_tab[1][0] = put_pixels_axp_asm; + c->put_pixels_tab[1][1] = put_pixels_x2_axp; + c->put_pixels_tab[1][2] = put_pixels_y2_axp; + c->put_pixels_tab[1][3] = put_pixels_xy2_axp; + + c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm; + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp; + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp; + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp; + + c->avg_pixels_tab[1][0] = avg_pixels_axp; + c->avg_pixels_tab[1][1] = avg_pixels_x2_axp; + c->avg_pixels_tab[1][2] = avg_pixels_y2_axp; + c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp; +} diff --git a/libavcodec/alpha/hpeldsp_alpha.h b/libavcodec/alpha/hpeldsp_alpha.h new file mode 100644 index 0000000000..e44ff503f7 --- /dev/null +++ b/libavcodec/alpha/hpeldsp_alpha.h @@ -0,0 +1,28 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALPHA_HPELDSP_ALPHA_H
+#define AVCODEC_ALPHA_HPELDSP_ALPHA_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
+
+#endif /* AVCODEC_ALPHA_HPELDSP_ALPHA_H */
diff --git a/libavcodec/alpha/hpeldsp_alpha_asm.S b/libavcodec/alpha/hpeldsp_alpha_asm.S
new file mode 100644
index 0000000000..b23d24f806
--- /dev/null
+++ b/libavcodec/alpha/hpeldsp_alpha_asm.S
@@ -0,0 +1,124 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * These functions are scheduled for pca56. They should work
+ * reasonably on ev6, though.
+ */
+
+#include "regdef.h"
+
+        .set noat
+        .set noreorder
+        .arch pca56
+        .text
+
+/************************************************************************
+ * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+ *                         int line_size, int h)
+ */
+        .align 6
+        .globl put_pixels_axp_asm
+        .ent put_pixels_axp_asm
+put_pixels_axp_asm:
+        .frame sp, 0, ra
+        .prologue 0
+
+        and     a1, 7, t0
+        beq     t0, $aligned
+
+        .align 4
+$unaligned:
+        ldq_u   t0, 0(a1)
+        ldq_u   t1, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t2, 0(a1)
+        ldq_u   t3, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t4, 0(a1)
+        ldq_u   t5, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t6, 0(a1)
+        ldq_u   t7, 8(a1)
+        extql   t0, a1, t0
+        addq    a1, a2, a1
+
+        extqh   t1, a1, t1
+        addq    a0, a2, t8
+        extql   t2, a1, t2
+        addq    t8, a2, t9
+
+        extqh   t3, a1, t3
+        addq    t9, a2, ta
+        extql   t4, a1, t4
+        or      t0, t1, t0
+
+        extqh   t5, a1, t5
+        or      t2, t3, t2
+        extql   t6, a1, t6
+        or      t4, t5, t4
+
+        extqh   t7, a1, t7
+        or      t6, t7, t6
+        stq     t0, 0(a0)
+        stq     t2, 0(t8)
+
+        stq     t4, 0(t9)
+        subq    a3, 4, a3
+        stq     t6, 0(ta)
+        addq    ta, a2, a0
+
+        bne     a3, $unaligned
+        ret
+
+        .align 4
+$aligned:
+        ldq     t0, 0(a1)
+        addq    a1, a2, a1
+        ldq     t1, 0(a1)
+        addq    a1, a2, a1
+
+        ldq     t2, 0(a1)
+        addq    a1, a2, a1
+        ldq     t3, 0(a1)
+
+        addq    a0, a2, t4
+        addq    a1, a2, a1
+        addq    t4, a2, t5
+        subq    a3, 4, a3
+
+        stq     t0, 0(a0)
+        addq    t5, a2, t6
+        stq     t1, 0(t4)
+        addq    t6, a2, a0
+
+        stq     t2, 0(t5)
+        stq     t3, 0(t6)
+
+        bne     a3, $aligned
+        ret
+        .end put_pixels_axp_asm
diff --git a/libavcodec/alpha/regdef.h b/libavcodec/alpha/regdef.h
index fa9ad98e5c..100594352d 100644
--- a/libavcodec/alpha/regdef.h
+++ b/libavcodec/alpha/regdef.h
@@ -63,4 +63,15 @@
 #define sp      $30        /* stack pointer */
 #define zero    $31        /* reads as zero, writes are noops */
 
+/* Some nicer register names. */
+#define ta t10
+#define tb t11
+#define tc t12
+#define td AT
+/* Danger: these overlap with the argument list and the return value */
+#define te a5
+#define tf a4
+#define tg a3
+#define th v0
+
 #endif /* AVCODEC_ALPHA_REGDEF_H */
diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c
index aed1cc982a..f00c449b12 100644
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@@ -54,6 +54,8 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
     hpel_funcs(avg, [3], 2);
     hpel_funcs(avg_no_rnd,, 16);
 
+    if (ARCH_ALPHA)
+        ff_hpeldsp_init_alpha(c, flags);
     if (ARCH_ARM)
         ff_hpeldsp_init_arm(c, flags);
     if (ARCH_BFIN)
diff --git a/libavcodec/hpeldsp.h b/libavcodec/hpeldsp.h
index d454453ed7..68190ad8c3 100644
--- a/libavcodec/hpeldsp.h
+++ b/libavcodec/hpeldsp.h
@@ -94,6 +94,7 @@ typedef struct HpelDSPContext {
 
 void ff_hpeldsp_init(HpelDSPContext *c, int flags);
 
+void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_bfin(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
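
[Editorial note - not part of the patch] The put/avg kernels being moved here average eight pixels at a time inside one 64-bit register, using the two SWAR identities in avg2() and avg2_no_rnd(): (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1) rounds each byte average up, while (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1) rounds it down, and the 0xfe mask is what keeps each byte's carry/borrow from spilling into its neighbour. The standalone C sketch below checks those identities against a per-byte reference; it is an illustration only - BYTE_VEC() is redefined locally here (in the tree it comes from alpha/asm.h), and the main()/rand() harness is not taken from the patch.

/* Editorial illustration: verify the byte-wise averaging identities
 * used by avg2() and avg2_no_rnd() against a per-byte reference. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define BYTE_VEC(x) ((x) * 0x0101010101010101ULL)  /* replicate a byte 8x */

static uint64_t avg2(uint64_t a, uint64_t b)        /* per-byte (x+y+1)/2 */
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

static uint64_t avg2_no_rnd(uint64_t a, uint64_t b) /* per-byte (x+y)/2 */
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

int main(void)
{
    for (int i = 0; i < 1000000; i++) {
        uint64_t a = ((uint64_t)rand() << 32) ^ (uint64_t)rand();
        uint64_t b = ((uint64_t)rand() << 32) ^ (uint64_t)rand();
        uint64_t r = avg2(a, b), n = avg2_no_rnd(a, b);
        for (int k = 0; k < 8; k++) {
            unsigned x = (a >> (8 * k)) & 0xff, y = (b >> (8 * k)) & 0xff;
            if (((r >> (8 * k)) & 0xff) != (x + y + 1) / 2 ||
                ((n >> (8 * k)) & 0xff) != (x + y) / 2) {
                printf("mismatch at byte %d\n", k);
                return 1;
            }
        }
    }
    printf("avg2/avg2_no_rnd match the per-byte reference\n");
    return 0;
}

A similar no-carry argument is what lets the OP_XY2 macro accumulate the low two bits of four pixels (pix_l) separately from the top six bits (pix_h) without any byte overflowing into the next one.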