From 2957d29f0531ccd8a6f4378293424dfd92db3044 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sun, 10 Mar 2013 16:53:07 -0700 Subject: [PATCH] alpha: hpeldsp: Move half-pel assembly from dsputil to hpeldsp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Martin Storsjö --- libavcodec/alpha/Makefile | 2 + libavcodec/alpha/dsputil_alpha.c | 185 ----------------------- libavcodec/alpha/dsputil_alpha.h | 2 - libavcodec/alpha/dsputil_alpha_asm.S | 103 ------------- libavcodec/alpha/hpeldsp_alpha.c | 213 +++++++++++++++++++++++++++ libavcodec/alpha/hpeldsp_alpha.h | 28 ++++ libavcodec/alpha/hpeldsp_alpha_asm.S | 124 ++++++++++++++++ libavcodec/alpha/regdef.h | 11 ++ libavcodec/hpeldsp.c | 2 + libavcodec/hpeldsp.h | 1 + 10 files changed, 381 insertions(+), 290 deletions(-) create mode 100644 libavcodec/alpha/hpeldsp_alpha.c create mode 100644 libavcodec/alpha/hpeldsp_alpha.h create mode 100644 libavcodec/alpha/hpeldsp_alpha_asm.S diff --git a/libavcodec/alpha/Makefile b/libavcodec/alpha/Makefile index e28200d45a..6f22137167 100644 --- a/libavcodec/alpha/Makefile +++ b/libavcodec/alpha/Makefile @@ -4,4 +4,6 @@ OBJS += alpha/dsputil_alpha.o \ alpha/motion_est_mvi_asm.o \ alpha/simple_idct_alpha.o \ +OBJS-$(CONFIG_HPELDSP) += alpha/hpeldsp_alpha.o \ + alpha/hpeldsp_alpha_asm.o OBJS-$(CONFIG_MPEGVIDEO) += alpha/mpegvideo_alpha.o diff --git a/libavcodec/alpha/dsputil_alpha.c b/libavcodec/alpha/dsputil_alpha.c index 34fe2ebcad..7a41cb8ebb 100644 --- a/libavcodec/alpha/dsputil_alpha.c +++ b/libavcodec/alpha/dsputil_alpha.c @@ -119,196 +119,11 @@ static void clear_blocks_axp(int16_t *blocks) { } while (n); } -static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b) -{ - return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1); -} - -static inline uint64_t avg2(uint64_t a, uint64_t b) -{ - return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1); -} - -#if 0 -/* The XY2 routines basically utilize this scheme, but reuse parts in - each iteration. 
*/ -static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4) -{ - uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) - + ((l2 & ~BYTE_VEC(0x03)) >> 2) - + ((l3 & ~BYTE_VEC(0x03)) >> 2) - + ((l4 & ~BYTE_VEC(0x03)) >> 2); - uint64_t r2 = (( (l1 & BYTE_VEC(0x03)) - + (l2 & BYTE_VEC(0x03)) - + (l3 & BYTE_VEC(0x03)) - + (l4 & BYTE_VEC(0x03)) - + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); - return r1 + r2; -} -#endif - -#define OP(LOAD, STORE) \ - do { \ - STORE(LOAD(pixels), block); \ - pixels += line_size; \ - block += line_size; \ - } while (--h) - -#define OP_X2(LOAD, STORE) \ - do { \ - uint64_t pix1, pix2; \ - \ - pix1 = LOAD(pixels); \ - pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ - STORE(AVG2(pix1, pix2), block); \ - pixels += line_size; \ - block += line_size; \ - } while (--h) - -#define OP_Y2(LOAD, STORE) \ - do { \ - uint64_t pix = LOAD(pixels); \ - do { \ - uint64_t next_pix; \ - \ - pixels += line_size; \ - next_pix = LOAD(pixels); \ - STORE(AVG2(pix, next_pix), block); \ - block += line_size; \ - pix = next_pix; \ - } while (--h); \ - } while (0) - -#define OP_XY2(LOAD, STORE) \ - do { \ - uint64_t pix1 = LOAD(pixels); \ - uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ - uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \ - + (pix2 & BYTE_VEC(0x03)); \ - uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \ - + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \ - \ - do { \ - uint64_t npix1, npix2; \ - uint64_t npix_l, npix_h; \ - uint64_t avg; \ - \ - pixels += line_size; \ - npix1 = LOAD(pixels); \ - npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \ - npix_l = (npix1 & BYTE_VEC(0x03)) \ - + (npix2 & BYTE_VEC(0x03)); \ - npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \ - + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \ - avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \ - + pix_h + npix_h; \ - STORE(avg, block); \ - \ - block += line_size; \ - pix_l = npix_l; \ - pix_h = npix_h; \ - } while (--h); \ - } while (0) - -#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \ -static void OPNAME ## _pixels ## SUFF ## _axp \ - (uint8_t *restrict block, const uint8_t *restrict pixels, \ - ptrdiff_t line_size, int h) \ -{ \ - if ((size_t) pixels & 0x7) { \ - OPKIND(uldq, STORE); \ - } else { \ - OPKIND(ldq, STORE); \ - } \ -} \ - \ -static void OPNAME ## _pixels16 ## SUFF ## _axp \ - (uint8_t *restrict block, const uint8_t *restrict pixels, \ - ptrdiff_t line_size, int h) \ -{ \ - OPNAME ## _pixels ## SUFF ## _axp(block, pixels, line_size, h); \ - OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \ -} - -#define PIXOP(OPNAME, STORE) \ - MAKE_OP(OPNAME, , OP, STORE) \ - MAKE_OP(OPNAME, _x2, OP_X2, STORE) \ - MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \ - MAKE_OP(OPNAME, _xy2, OP_XY2, STORE) - -/* Rounding primitives. */ -#define AVG2 avg2 -#define AVG4 avg4 -#define AVG4_ROUNDER BYTE_VEC(0x02) -#define STORE(l, b) stq(l, b) -PIXOP(put, STORE); - -#undef STORE -#define STORE(l, b) stq(AVG2(l, ldq(b)), b); -PIXOP(avg, STORE); - -/* Not rounding primitives. 
*/ -#undef AVG2 -#undef AVG4 -#undef AVG4_ROUNDER -#undef STORE -#define AVG2 avg2_no_rnd -#define AVG4 avg4_no_rnd -#define AVG4_ROUNDER BYTE_VEC(0x01) -#define STORE(l, b) stq(l, b) -PIXOP(put_no_rnd, STORE); - -#undef STORE -#define STORE(l, b) stq(AVG2(l, ldq(b)), b); -PIXOP(avg_no_rnd, STORE); - -static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - put_pixels_axp_asm(block, pixels, line_size, h); - put_pixels_axp_asm(block + 8, pixels + 8, line_size, h); -} - av_cold void ff_dsputil_init_alpha(DSPContext *c, AVCodecContext *avctx) { const int high_bit_depth = avctx->bits_per_raw_sample > 8; if (!high_bit_depth) { - c->put_pixels_tab[0][0] = put_pixels16_axp_asm; - c->put_pixels_tab[0][1] = put_pixels16_x2_axp; - c->put_pixels_tab[0][2] = put_pixels16_y2_axp; - c->put_pixels_tab[0][3] = put_pixels16_xy2_axp; - - c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm; - c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp; - c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp; - c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp; - - c->avg_pixels_tab[0][0] = avg_pixels16_axp; - c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp; - c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp; - c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp; - - c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp; - c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp; - c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp; - c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp; - - c->put_pixels_tab[1][0] = put_pixels_axp_asm; - c->put_pixels_tab[1][1] = put_pixels_x2_axp; - c->put_pixels_tab[1][2] = put_pixels_y2_axp; - c->put_pixels_tab[1][3] = put_pixels_xy2_axp; - - c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm; - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp; - c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp; - - c->avg_pixels_tab[1][0] = avg_pixels_axp; - c->avg_pixels_tab[1][1] = avg_pixels_x2_axp; - c->avg_pixels_tab[1][2] = avg_pixels_y2_axp; - c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp; - c->clear_blocks = clear_blocks_axp; } diff --git a/libavcodec/alpha/dsputil_alpha.h b/libavcodec/alpha/dsputil_alpha.h index fcea47c665..d976c18e2e 100644 --- a/libavcodec/alpha/dsputil_alpha.h +++ b/libavcodec/alpha/dsputil_alpha.h @@ -26,8 +26,6 @@ void ff_simple_idct_axp(int16_t *block); void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block); void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block); -void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h); void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels, int line_size); void add_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels, diff --git a/libavcodec/alpha/dsputil_alpha_asm.S b/libavcodec/alpha/dsputil_alpha_asm.S index 1f589aa0c0..afe02cc391 100644 --- a/libavcodec/alpha/dsputil_alpha_asm.S +++ b/libavcodec/alpha/dsputil_alpha_asm.S @@ -26,114 +26,11 @@ #include "regdef.h" -/* Some nicer register names. 
*/ -#define ta t10 -#define tb t11 -#define tc t12 -#define td AT -/* Danger: these overlap with the argument list and the return value */ -#define te a5 -#define tf a4 -#define tg a3 -#define th v0 - .set noat .set noreorder .arch pca56 .text -/************************************************************************ - * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels, - * int line_size, int h) - */ - .align 6 - .globl put_pixels_axp_asm - .ent put_pixels_axp_asm -put_pixels_axp_asm: - .frame sp, 0, ra - .prologue 0 - - and a1, 7, t0 - beq t0, $aligned - - .align 4 -$unaligned: - ldq_u t0, 0(a1) - ldq_u t1, 8(a1) - addq a1, a2, a1 - nop - - ldq_u t2, 0(a1) - ldq_u t3, 8(a1) - addq a1, a2, a1 - nop - - ldq_u t4, 0(a1) - ldq_u t5, 8(a1) - addq a1, a2, a1 - nop - - ldq_u t6, 0(a1) - ldq_u t7, 8(a1) - extql t0, a1, t0 - addq a1, a2, a1 - - extqh t1, a1, t1 - addq a0, a2, t8 - extql t2, a1, t2 - addq t8, a2, t9 - - extqh t3, a1, t3 - addq t9, a2, ta - extql t4, a1, t4 - or t0, t1, t0 - - extqh t5, a1, t5 - or t2, t3, t2 - extql t6, a1, t6 - or t4, t5, t4 - - extqh t7, a1, t7 - or t6, t7, t6 - stq t0, 0(a0) - stq t2, 0(t8) - - stq t4, 0(t9) - subq a3, 4, a3 - stq t6, 0(ta) - addq ta, a2, a0 - - bne a3, $unaligned - ret - - .align 4 -$aligned: - ldq t0, 0(a1) - addq a1, a2, a1 - ldq t1, 0(a1) - addq a1, a2, a1 - - ldq t2, 0(a1) - addq a1, a2, a1 - ldq t3, 0(a1) - - addq a0, a2, t4 - addq a1, a2, a1 - addq t4, a2, t5 - subq a3, 4, a3 - - stq t0, 0(a0) - addq t5, a2, t6 - stq t1, 0(t4) - addq t6, a2, a0 - - stq t2, 0(t5) - stq t3, 0(t6) - - bne a3, $aligned - ret - .end put_pixels_axp_asm - /************************************************************************ * void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels, * int line_size) diff --git a/libavcodec/alpha/hpeldsp_alpha.c b/libavcodec/alpha/hpeldsp_alpha.c new file mode 100644 index 0000000000..144fa223f9 --- /dev/null +++ b/libavcodec/alpha/hpeldsp_alpha.c @@ -0,0 +1,213 @@ +/* + * Alpha optimized DSP utils + * Copyright (c) 2002 Falk Hueffner + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavcodec/hpeldsp.h" +#include "hpeldsp_alpha.h" +#include "asm.h" + +static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b) +{ + return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1); +} + +static inline uint64_t avg2(uint64_t a, uint64_t b) +{ + return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1); +} + +#if 0 +/* The XY2 routines basically utilize this scheme, but reuse parts in + each iteration. 
*/ +static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4) +{ + uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) + + ((l2 & ~BYTE_VEC(0x03)) >> 2) + + ((l3 & ~BYTE_VEC(0x03)) >> 2) + + ((l4 & ~BYTE_VEC(0x03)) >> 2); + uint64_t r2 = (( (l1 & BYTE_VEC(0x03)) + + (l2 & BYTE_VEC(0x03)) + + (l3 & BYTE_VEC(0x03)) + + (l4 & BYTE_VEC(0x03)) + + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); + return r1 + r2; +} +#endif + +#define OP(LOAD, STORE) \ + do { \ + STORE(LOAD(pixels), block); \ + pixels += line_size; \ + block += line_size; \ + } while (--h) + +#define OP_X2(LOAD, STORE) \ + do { \ + uint64_t pix1, pix2; \ + \ + pix1 = LOAD(pixels); \ + pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ + STORE(AVG2(pix1, pix2), block); \ + pixels += line_size; \ + block += line_size; \ + } while (--h) + +#define OP_Y2(LOAD, STORE) \ + do { \ + uint64_t pix = LOAD(pixels); \ + do { \ + uint64_t next_pix; \ + \ + pixels += line_size; \ + next_pix = LOAD(pixels); \ + STORE(AVG2(pix, next_pix), block); \ + block += line_size; \ + pix = next_pix; \ + } while (--h); \ + } while (0) + +#define OP_XY2(LOAD, STORE) \ + do { \ + uint64_t pix1 = LOAD(pixels); \ + uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \ + uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \ + + (pix2 & BYTE_VEC(0x03)); \ + uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \ + + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \ + \ + do { \ + uint64_t npix1, npix2; \ + uint64_t npix_l, npix_h; \ + uint64_t avg; \ + \ + pixels += line_size; \ + npix1 = LOAD(pixels); \ + npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \ + npix_l = (npix1 & BYTE_VEC(0x03)) \ + + (npix2 & BYTE_VEC(0x03)); \ + npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \ + + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \ + avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \ + + pix_h + npix_h; \ + STORE(avg, block); \ + \ + block += line_size; \ + pix_l = npix_l; \ + pix_h = npix_h; \ + } while (--h); \ + } while (0) + +#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \ +static void OPNAME ## _pixels ## SUFF ## _axp \ + (uint8_t *restrict block, const uint8_t *restrict pixels, \ + ptrdiff_t line_size, int h) \ +{ \ + if ((size_t) pixels & 0x7) { \ + OPKIND(uldq, STORE); \ + } else { \ + OPKIND(ldq, STORE); \ + } \ +} \ + \ +static void OPNAME ## _pixels16 ## SUFF ## _axp \ + (uint8_t *restrict block, const uint8_t *restrict pixels, \ + ptrdiff_t line_size, int h) \ +{ \ + OPNAME ## _pixels ## SUFF ## _axp(block, pixels, line_size, h); \ + OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \ +} + +#define PIXOP(OPNAME, STORE) \ + MAKE_OP(OPNAME, , OP, STORE) \ + MAKE_OP(OPNAME, _x2, OP_X2, STORE) \ + MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \ + MAKE_OP(OPNAME, _xy2, OP_XY2, STORE) + +/* Rounding primitives. */ +#define AVG2 avg2 +#define AVG4 avg4 +#define AVG4_ROUNDER BYTE_VEC(0x02) +#define STORE(l, b) stq(l, b) +PIXOP(put, STORE); + +#undef STORE +#define STORE(l, b) stq(AVG2(l, ldq(b)), b); +PIXOP(avg, STORE); + +/* Not rounding primitives. 
*/ +#undef AVG2 +#undef AVG4 +#undef AVG4_ROUNDER +#undef STORE +#define AVG2 avg2_no_rnd +#define AVG4 avg4_no_rnd +#define AVG4_ROUNDER BYTE_VEC(0x01) +#define STORE(l, b) stq(l, b) +PIXOP(put_no_rnd, STORE); + +#undef STORE +#define STORE(l, b) stq(AVG2(l, ldq(b)), b); +PIXOP(avg_no_rnd, STORE); + +static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + put_pixels_axp_asm(block, pixels, line_size, h); + put_pixels_axp_asm(block + 8, pixels + 8, line_size, h); +} + +av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags) +{ + c->put_pixels_tab[0][0] = put_pixels16_axp_asm; + c->put_pixels_tab[0][1] = put_pixels16_x2_axp; + c->put_pixels_tab[0][2] = put_pixels16_y2_axp; + c->put_pixels_tab[0][3] = put_pixels16_xy2_axp; + + c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm; + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp; + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp; + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp; + + c->avg_pixels_tab[0][0] = avg_pixels16_axp; + c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp; + c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp; + c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp; + + c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp; + c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp; + c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp; + c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp; + + c->put_pixels_tab[1][0] = put_pixels_axp_asm; + c->put_pixels_tab[1][1] = put_pixels_x2_axp; + c->put_pixels_tab[1][2] = put_pixels_y2_axp; + c->put_pixels_tab[1][3] = put_pixels_xy2_axp; + + c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm; + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp; + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp; + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp; + + c->avg_pixels_tab[1][0] = avg_pixels_axp; + c->avg_pixels_tab[1][1] = avg_pixels_x2_axp; + c->avg_pixels_tab[1][2] = avg_pixels_y2_axp; + c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp; +} diff --git a/libavcodec/alpha/hpeldsp_alpha.h b/libavcodec/alpha/hpeldsp_alpha.h new file mode 100644 index 0000000000..e44ff503f7 --- /dev/null +++ b/libavcodec/alpha/hpeldsp_alpha.h @@ -0,0 +1,28 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALPHA_HPELDSP_ALPHA_H
+#define AVCODEC_ALPHA_HPELDSP_ALPHA_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
+
+#endif /* AVCODEC_ALPHA_HPELDSP_ALPHA_H */
diff --git a/libavcodec/alpha/hpeldsp_alpha_asm.S b/libavcodec/alpha/hpeldsp_alpha_asm.S
new file mode 100644
index 0000000000..b23d24f806
--- /dev/null
+++ b/libavcodec/alpha/hpeldsp_alpha_asm.S
@@ -0,0 +1,124 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * These functions are scheduled for pca56. They should work
+ * reasonably on ev6, though.
+ */
+
+#include "regdef.h"
+
+        .set noat
+        .set noreorder
+        .arch pca56
+        .text
+
+/************************************************************************
+ * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+ *                         int line_size, int h)
+ */
+        .align 6
+        .globl put_pixels_axp_asm
+        .ent put_pixels_axp_asm
+put_pixels_axp_asm:
+        .frame sp, 0, ra
+        .prologue 0
+
+        and     a1, 7, t0
+        beq     t0, $aligned
+
+        .align 4
+$unaligned:
+        ldq_u   t0, 0(a1)
+        ldq_u   t1, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t2, 0(a1)
+        ldq_u   t3, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t4, 0(a1)
+        ldq_u   t5, 8(a1)
+        addq    a1, a2, a1
+        nop
+
+        ldq_u   t6, 0(a1)
+        ldq_u   t7, 8(a1)
+        extql   t0, a1, t0
+        addq    a1, a2, a1
+
+        extqh   t1, a1, t1
+        addq    a0, a2, t8
+        extql   t2, a1, t2
+        addq    t8, a2, t9
+
+        extqh   t3, a1, t3
+        addq    t9, a2, ta
+        extql   t4, a1, t4
+        or      t0, t1, t0
+
+        extqh   t5, a1, t5
+        or      t2, t3, t2
+        extql   t6, a1, t6
+        or      t4, t5, t4
+
+        extqh   t7, a1, t7
+        or      t6, t7, t6
+        stq     t0, 0(a0)
+        stq     t2, 0(t8)
+
+        stq     t4, 0(t9)
+        subq    a3, 4, a3
+        stq     t6, 0(ta)
+        addq    ta, a2, a0
+
+        bne     a3, $unaligned
+        ret
+
+        .align 4
+$aligned:
+        ldq     t0, 0(a1)
+        addq    a1, a2, a1
+        ldq     t1, 0(a1)
+        addq    a1, a2, a1
+
+        ldq     t2, 0(a1)
+        addq    a1, a2, a1
+        ldq     t3, 0(a1)
+
+        addq    a0, a2, t4
+        addq    a1, a2, a1
+        addq    t4, a2, t5
+        subq    a3, 4, a3
+
+        stq     t0, 0(a0)
+        addq    t5, a2, t6
+        stq     t1, 0(t4)
+        addq    t6, a2, a0
+
+        stq     t2, 0(t5)
+        stq     t3, 0(t6)
+
+        bne     a3, $aligned
+        ret
+        .end put_pixels_axp_asm
diff --git a/libavcodec/alpha/regdef.h b/libavcodec/alpha/regdef.h
index fa9ad98e5c..100594352d 100644
--- a/libavcodec/alpha/regdef.h
+++ b/libavcodec/alpha/regdef.h
@@ -63,4 +63,15 @@
 #define sp      $30        /* stack pointer */
 #define zero    $31        /* reads as zero, writes are noops */
 
+/* Some nicer register names. */
+#define ta t10
+#define tb t11
+#define tc t12
+#define td AT
+/* Danger: these overlap with the argument list and the return value */
+#define te a5
+#define tf a4
+#define tg a3
+#define th v0
+
 #endif /* AVCODEC_ALPHA_REGDEF_H */
diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c
index aed1cc982a..f00c449b12 100644
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@@ -54,6 +54,8 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
     hpel_funcs(avg, [3], 2);
     hpel_funcs(avg_no_rnd,, 16);
 
+    if (ARCH_ALPHA)
+        ff_hpeldsp_init_alpha(c, flags);
     if (ARCH_ARM)
         ff_hpeldsp_init_arm(c, flags);
     if (ARCH_BFIN)
diff --git a/libavcodec/hpeldsp.h b/libavcodec/hpeldsp.h
index d454453ed7..68190ad8c3 100644
--- a/libavcodec/hpeldsp.h
+++ b/libavcodec/hpeldsp.h
@@ -94,6 +94,7 @@ typedef struct HpelDSPContext {
 
 void ff_hpeldsp_init(HpelDSPContext *c, int flags);
 
+void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_bfin(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
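
[Editorial note - not part of the patch] The put/avg kernels being moved here average eight pixels at a time inside one 64-bit register, using the two SWAR identities in avg2() and avg2_no_rnd(): (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1) rounds each byte average up, while (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1) rounds it down, and the 0xfe mask is what keeps each byte's carry/borrow from spilling into its neighbour. The standalone C sketch below checks those identities against a per-byte reference; it is an illustration only - BYTE_VEC() is redefined locally here (in the tree it comes from alpha/asm.h), and the main()/rand() harness is not taken from the patch.

/* Editorial illustration: verify the byte-wise averaging identities
 * used by avg2() and avg2_no_rnd() against a per-byte reference. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define BYTE_VEC(x) ((x) * 0x0101010101010101ULL)  /* replicate a byte 8x */

static uint64_t avg2(uint64_t a, uint64_t b)        /* per-byte (x+y+1)/2 */
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

static uint64_t avg2_no_rnd(uint64_t a, uint64_t b) /* per-byte (x+y)/2 */
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

int main(void)
{
    for (int i = 0; i < 1000000; i++) {
        uint64_t a = ((uint64_t)rand() << 32) ^ (uint64_t)rand();
        uint64_t b = ((uint64_t)rand() << 32) ^ (uint64_t)rand();
        uint64_t r = avg2(a, b), n = avg2_no_rnd(a, b);
        for (int k = 0; k < 8; k++) {
            unsigned x = (a >> (8 * k)) & 0xff, y = (b >> (8 * k)) & 0xff;
            if (((r >> (8 * k)) & 0xff) != (x + y + 1) / 2 ||
                ((n >> (8 * k)) & 0xff) != (x + y) / 2) {
                printf("mismatch at byte %d\n", k);
                return 1;
            }
        }
    }
    printf("avg2/avg2_no_rnd match the per-byte reference\n");
    return 0;
}

A similar no-carry argument is what lets the OP_XY2 macro accumulate the low two bits of four pixels (pix_l) separately from the top six bits (pix_h) without any byte overflowing into the next one.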