mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
Originally committed as revision 274 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
4bdd9157cc
commit
1e98dffb7a
@ -37,6 +37,12 @@ OBJS += mlib/dsputil_mlib.o
|
||||
CFLAGS += $(MLIB_INC)
|
||||
endif
|
||||
|
||||
# alpha specific stuff
|
||||
ifeq ($(TARGET_ARCH_ALPHA),yes)
|
||||
OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o
|
||||
CFLAGS += -Wa,-mpca56
|
||||
endif
|
||||
|
||||
SRCS = $(OBJS:.o=.c) $(ASM_OBJS:.o=.s)
|
||||
|
||||
LIB= libavcodec.a
|
||||
@ -74,6 +80,7 @@ clean:
|
||||
rm -f *.o *~ $(LIB) $(SLIB) *.so i386/*.o i386/*~ \
|
||||
armv4l/*.o armv4l/*~ \
|
||||
mlib/*.o mlib/*~ \
|
||||
alpha/*.o alpha/*~ \
|
||||
libac3/*.o libac3/*~ \
|
||||
apiexample $(TESTS)
|
||||
|
||||
|
141
libavcodec/alpha/asm.h
Normal file
141
libavcodec/alpha/asm.h
Normal file
@ -0,0 +1,141 @@
|
||||
/*
|
||||
* Alpha optimized DSP utils
|
||||
* Copyright (c) 2002 Falk Hueffner <falk@debian.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
|
||||
#ifndef LIBAVCODEC_ALPHA_ASM_H
|
||||
#define LIBAVCODEC_ALPHA_ASM_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#define AMASK_BWX (1 << 0)
|
||||
#define AMASK_FIX (1 << 1)
|
||||
#define AMASK_MVI (1 << 8)
|
||||
|
||||
/* Replicate x into every byte lane of a 64-bit word by successive
   doubling (8 -> 16 -> 32 -> 64 bits).  For x in 0..0xff the result
   holds x in each of the eight bytes. */
static inline uint64_t BYTE_VEC(uint64_t x)
{
    uint64_t pair = x | (x << 8);
    uint64_t quad = pair | (pair << 16);

    return quad | (quad << 32);
}
|
||||
/* Replicate x into every 16-bit lane of a 64-bit word.  For x in
   0..0xffff the result holds x in each of the four words. */
static inline uint64_t WORD_VEC(uint64_t x)
{
    uint64_t pair = x | (x << 16);

    return pair | (pair << 32);
}
|
||||
|
||||
/* Read a signed 32-bit value from p.  The access is a plain
   dereference, so p must be suitably aligned for int32_t. */
static inline int32_t ldl(const void *p)
{
    const int32_t *src = p;

    return *src;
}
|
||||
/* Read an unsigned 64-bit value from p.  The access is a plain
   dereference, so p must be suitably aligned for uint64_t. */
static inline uint64_t ldq(const void *p)
{
    const uint64_t *src = p;

    return *src;
}
|
||||
/* Load the aligned 64-bit word that contains address p: the low three
   address bits are cleared before the access, so the load itself is
   always 8-byte aligned.
   The mask is built in uintptr_t so that no address bits are lost on
   targets where unsigned long is narrower than a pointer.
   FIXME ccc doesn't seem to get it?  Use inline asm? */
static inline uint64_t ldq_u(const void *p)
{
    uintptr_t aligned = (uintptr_t) p & ~(uintptr_t) 7;

    return *(const uint64_t *) aligned;
}
|
||||
/* Write the 32-bit value l to p.  The access is a plain store, so p
   must be suitably aligned for uint32_t. */
static inline void stl(uint32_t l, void *p)
{
    uint32_t *dst = p;

    *dst = l;
}
|
||||
/* Write the 64-bit value l to p.  The access is a plain store, so p
   must be suitably aligned for uint64_t. */
static inline void stq(uint64_t l, void *p)
{
    uint64_t *dst = p;

    *dst = l;
}
|
||||
|
||||
#ifdef __GNUC__
|
||||
/* Define a static inline wrapper `name` around the one-operand
   instruction of the same name: operand l goes in a register and the
   result comes back in a register.
   __asm__ is used instead of asm so the header also compiles in strict
   ISO modes, where plain `asm` is not a keyword. */
#define OPCODE1(name)                                           \
static inline uint64_t name(uint64_t l)                         \
{                                                               \
    uint64_t r;                                                 \
    __asm__ (#name " %1, %0" : "=r" (r) : "r" (l));             \
    return r;                                                   \
}
|
||||
|
||||
/* Define a static inline wrapper `name` around the two-operand
   instruction of the same name.  l1 must be in a register; l2 may be a
   register or a constant accepted by the "I" constraint.
   __asm__ is used instead of asm so the header also compiles in strict
   ISO modes, where plain `asm` is not a keyword. */
#define OPCODE2(name)                                                   \
static inline uint64_t name(uint64_t l1, uint64_t l2)                   \
{                                                                       \
    uint64_t r;                                                         \
    __asm__ (#name " %1, %2, %0" : "=r" (r) : "r" (l1), "rI" (l2));     \
    return r;                                                           \
}
|
||||
|
||||
/* Execute the `rpcc` instruction and return the value it leaves in the
   output register.
   We don't want gcc to move this around or combine it with another
   rpcc, so mark it volatile.  __asm__ is used instead of asm so the
   header also compiles in strict ISO modes. */
static inline uint64_t rpcc(void)
{
    uint64_t r;

    __asm__ volatile ("rpcc %0" : "=r" (r));
    return r;
}
|
||||
|
||||
/* Load a 64-bit value from an address that need not be aligned.
   Reading through a packed struct tells the compiler not to assume
   natural alignment.  The member is uint64_t (not unsigned long) so
   exactly eight bytes are read regardless of the width of long. */
static inline uint64_t uldq(const void *v)
{
    struct unaligned_u64 {
        uint64_t l;
    } __attribute__((packed));

    return ((const struct unaligned_u64 *) v)->l;
}
|
||||
|
||||
#elif defined(__DECC) /* Compaq "ccc" compiler */
|
||||
|
||||
#include <c_asm.h>
|
||||
/* Compaq C variant: define a static inline wrapper `name` around the
   one-operand instruction of the same name, using the asm() intrinsic
   from <c_asm.h>.  Presumably %a0 names the first argument register
   and %v0 the result register -- confirm against the Compaq C docs. */
#define OPCODE1(name) \
|
||||
static inline uint64_t name(uint64_t l) \
|
||||
{ \
|
||||
return asm (#name " %a0, %v0", l); \
|
||||
}
|
||||
|
||||
/* Compaq C variant: define a static inline wrapper `name` around the
   two-operand instruction of the same name, using the asm() intrinsic
   from <c_asm.h>.  Presumably %a0/%a1 name the argument registers and
   %v0 the result register -- confirm against the Compaq C docs. */
#define OPCODE2(name) \
|
||||
static inline uint64_t name(uint64_t l1, uint64_t l2) \
|
||||
{ \
|
||||
return asm (#name " %a0, %a1, %v0", l1, l2); \
|
||||
}
|
||||
|
||||
/* Compaq C variant: execute the `rpcc` instruction via the asm()
   intrinsic and return its result. */
static inline uint64_t rpcc(void)
|
||||
{
|
||||
return asm ("rpcc %v0");
|
||||
}
|
||||
|
||||
/* Compaq C variant: load a 64-bit value through a pointer qualified
   with the compiler's __unaligned keyword (presumably permitting a
   misaligned address -- confirm against the Compaq C docs). */
static inline uint64_t uldq(const void* v)
|
||||
{
|
||||
return *(const __unaligned uint64_t *) v;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
OPCODE1(amask);
|
||||
OPCODE1(unpkbw);
|
||||
OPCODE1(pkwb);
|
||||
OPCODE2(extql);
|
||||
OPCODE2(extqh);
|
||||
OPCODE2(zap);
|
||||
OPCODE2(cmpbge);
|
||||
OPCODE2(minsw4);
|
||||
OPCODE2(minuw4);
|
||||
OPCODE2(minub8);
|
||||
OPCODE2(maxsw4);
|
||||
OPCODE2(maxuw4);
|
||||
OPCODE2(perr);
|
||||
|
||||
#endif /* LIBAVCODEC_ALPHA_ASM_H */
|
223
libavcodec/alpha/dsputil_alpha.c
Normal file
223
libavcodec/alpha/dsputil_alpha.c
Normal file
@ -0,0 +1,223 @@
|
||||
/*
|
||||
* Alpha optimized DSP utils
|
||||
* Copyright (c) 2002 Falk Hueffner <falk@debian.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
|
||||
#include "asm.h"
|
||||
#include "../dsputil.h"
|
||||
|
||||
void simple_idct_axp(DCTELEM *block);
|
||||
|
||||
/* Store an 8x8 block of DCT output as pixels.  Each row is handled as
   two groups of four 16-bit values: every value is limited to the
   range 0..255 with maxsw4/minsw4, the four results are packed with
   pkwb and written as one 32-bit store.  `pixels` advances by
   line_size per row, `block` by 8 elements. */
static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
                                   int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col += 4) {
            UINT64 quad = ldq(block + col);

            quad = maxsw4(quad, 0);                 /* lower bound 0   */
            quad = minsw4(quad, WORD_VEC(0x00ff));  /* upper bound 255 */
            stl(pkwb(quad), pixels + col);
        }
        pixels += line_size;
        block  += 8;
    }
}
|
||||
|
||||
/* Add an 8x8 block of DCT output to the existing pixels, limiting each
   result to 0..255.  Each row is handled as two groups of four 16-bit
   lanes held in one 64-bit word; the sign bit is masked off around the
   addition so that no carry can spill into the neighbouring lane. */
static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
                                   int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col += 4) {
            UINT64 sum = ldq(block + col);

            /* clear highest bit to avoid overflow */
            sum &= ~WORD_VEC(0x8000);
            sum += unpkbw(ldl(pixels + col));
            /* hibit would be set for e. g. -2 + 3 */
            sum &= ~WORD_VEC(0x8000);
            /* lanes that are still "negative" are now >= 0x4000: pin
               them to 0x4000, then clear that bit to leave 0 */
            sum  = minuw4(sum, WORD_VEC(0x4000));
            sum &= ~WORD_VEC(0x4000);
            /* clamp to 255 */
            sum  = minsw4(sum, WORD_VEC(0x00ff));
            stl(pkwb(sum), pixels + col);
        }
        pixels += line_size;
        block  += 8;
    }
}
|
||||
|
||||
/* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
|
||||
Since the immediate result could be greater than 255, we do the
|
||||
shift first. The result is too low by one if the bytes were both
|
||||
odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */
|
||||
static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2)
|
||||
{
|
||||
UINT64 correction = (l1 & l2) & BYTE_VEC(0x01);
|
||||
l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
|
||||
l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
|
||||
return l1 + l2 + correction;
|
||||
}
|
||||
|
||||
/* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1
|
||||
The '1' only has an effect when one byte is even and the other odd,
|
||||
i. e. we also need to add (l1 ^ l2) & BYTE_VEC(0x01).
|
||||
Incidentally, that is equivalent to (l1 | l2) & BYTE_VEC(0x01). */
|
||||
static inline UINT64 avg2(UINT64 l1, UINT64 l2)
|
||||
{
|
||||
UINT64 correction = (l1 | l2) & BYTE_VEC(0x01);
|
||||
l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
|
||||
l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
|
||||
return l1 + l2 + correction;
|
||||
}
|
||||
|
||||
/* Rounding average of four byte vectors: (b1 + b2 + b3 + b4 + 2) >> 2
   per byte lane.  The upper six bits of every byte are quartered and
   summed separately from the lower two bits, so no lane can overflow
   into its neighbour; the low-bit sum carries the rounding term. */
static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
{
    const UINT64 low2 = BYTE_VEC(0x03);
    UINT64 high_part, low_sum;

    high_part = ((l1 & ~low2) >> 2)
              + ((l2 & ~low2) >> 2)
              + ((l3 & ~low2) >> 2)
              + ((l4 & ~low2) >> 2);

    low_sum   = (l1 & low2)
              + (l2 & low2)
              + (l3 & low2)
              + (l4 & low2)
              + BYTE_VEC(0x02);

    return high_part + ((low_sum >> 2) & low2);
}
|
||||
|
||||
/* Average of four byte vectors with the smaller rounding term:
   (b1 + b2 + b3 + b4 + 1) >> 2 per byte lane.  Same lane-safe split
   into upper six / lower two bits as avg4, differing only in the
   constant added to the low-bit sum. */
static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
{
    const UINT64 low2 = BYTE_VEC(0x03);
    UINT64 high_part, low_sum;

    high_part = ((l1 & ~low2) >> 2)
              + ((l2 & ~low2) >> 2)
              + ((l3 & ~low2) >> 2)
              + ((l4 & ~low2) >> 2);

    low_sum   = (l1 & low2)
              + (l2 & low2)
              + (l3 & low2)
              + (l4 & low2)
              + BYTE_VEC(0x01);

    return high_part + ((low_sum >> 2) & low2);
}
|
||||
|
||||
/* Instantiate the pixops.h template for the rounding "put" variants.
   With these definitions the include generates put_pixels_axp,
   put_pixels_x2_axp, put_pixels_y2_axp and put_pixels_xy2_axp, which
   write their result with a plain 64-bit store.  The parameter macros
   are undefined again afterwards so the template can be re-included. */
#define PIXOPNAME(suffix) put ## suffix
|
||||
#define BTYPE UINT8
|
||||
#define AVG2 avg2
|
||||
#define AVG4 avg4
|
||||
#define STORE(l, b) stq(l, b)
|
||||
#include "pixops.h"
|
||||
#undef PIXOPNAME
|
||||
#undef BTYPE
|
||||
#undef AVG2
|
||||
#undef AVG4
|
||||
#undef STORE
|
||||
|
||||
/* Instantiate the pixops.h template for the "put_no_rnd" variants.
   Same as the "put" instantiation except that the averaging helpers
   are avg2_no_rnd / avg4_no_rnd; the generated functions are named
   put_no_rnd_pixels_axp, put_no_rnd_pixels_x2_axp, and so on. */
#define PIXOPNAME(suffix) put_no_rnd ## suffix
|
||||
#define BTYPE UINT8
|
||||
#define AVG2 avg2_no_rnd
|
||||
#define AVG4 avg4_no_rnd
|
||||
#define STORE(l, b) stq(l, b)
|
||||
#include "pixops.h"
|
||||
#undef PIXOPNAME
|
||||
#undef BTYPE
|
||||
#undef AVG2
|
||||
#undef AVG4
|
||||
#undef STORE
|
||||
|
||||
/* The following functions are untested, hence disabled.
   They instantiate pixops.h for "avg" (average the result with the
   existing destination before storing), "avg_no_rnd", and "sub"
   (subtract each result byte from the corresponding DCTELEM).
   STORE deliberately has no trailing semicolon, matching the enabled
   instantiations; the call sites in pixops.h supply it. */
#if 0

#define PIXOPNAME(suffix) avg ## suffix
#define BTYPE UINT8
#define AVG2 avg2
#define AVG4 avg4
#define STORE(l, b) stq(AVG2(l, ldq(b)), b)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE

#define PIXOPNAME(suffix) avg_no_rnd ## suffix
#define BTYPE UINT8
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define STORE(l, b) stq(AVG2(l, ldq(b)), b)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE

#define PIXOPNAME(suffix) sub ## suffix
#define BTYPE DCTELEM
#define AVG2 avg2
#define AVG4 avg4
#define STORE(l, block) do {                \
    UINT64 xxx = l;                         \
    (block)[0] -= (xxx >>  0) & 0xff;       \
    (block)[1] -= (xxx >>  8) & 0xff;       \
    (block)[2] -= (xxx >> 16) & 0xff;       \
    (block)[3] -= (xxx >> 24) & 0xff;       \
    (block)[4] -= (xxx >> 32) & 0xff;       \
    (block)[5] -= (xxx >> 40) & 0xff;       \
    (block)[6] -= (xxx >> 48) & 0xff;       \
    (block)[7] -= (xxx >> 56) & 0xff;       \
} while (0)
#include "pixops.h"
#undef PIXOPNAME
#undef BTYPE
#undef AVG2
#undef AVG4
#undef STORE

#endif
|
||||
|
||||
void dsputil_init_alpha(void)
|
||||
{
|
||||
put_pixels_tab[0] = put_pixels_axp;
|
||||
put_pixels_tab[1] = put_pixels_x2_axp;
|
||||
put_pixels_tab[2] = put_pixels_y2_axp;
|
||||
put_pixels_tab[3] = put_pixels_xy2_axp;
|
||||
|
||||
put_no_rnd_pixels_tab[0] = put_pixels_axp;
|
||||
put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
|
||||
put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
|
||||
put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
|
||||
|
||||
/* amask clears all bits that correspond to present features. */
|
||||
if (amask(AMASK_MVI) == 0) {
|
||||
fprintf(stderr, "MVI extension detected\n");
|
||||
put_pixels_clamped = put_pixels_clamped_axp;
|
||||
add_pixels_clamped = add_pixels_clamped_axp;
|
||||
}
|
||||
}
|
88
libavcodec/alpha/mpegvideo_alpha.c
Normal file
88
libavcodec/alpha/mpegvideo_alpha.c
Normal file
@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Alpha optimized MPEG video routines (H.263 dequantization)
|
||||
* Copyright (c) 2002 Falk Hueffner <falk@debian.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
|
||||
#include "asm.h"
|
||||
#include "../dsputil.h"
|
||||
#include "../mpegvideo.h"
|
||||
|
||||
extern UINT8 zigzag_end[64];
|
||||
|
||||
static void dct_unquantize_h263_axp(MpegEncContext *s,
|
||||
DCTELEM *block, int n, int qscale)
|
||||
{
|
||||
int i, level;
|
||||
UINT64 qmul, qadd;
|
||||
if (s->mb_intra) {
|
||||
if (n < 4)
|
||||
block[0] = block[0] * s->y_dc_scale;
|
||||
else
|
||||
block[0] = block[0] * s->c_dc_scale;
|
||||
/* Catch up to aligned point. */
|
||||
qmul = s->qscale << 1;
|
||||
qadd = (s->qscale - 1) | 1;
|
||||
for (i = 1; i < 4; ++i) {
|
||||
level = block[i];
|
||||
if (level) {
|
||||
if (level < 0) {
|
||||
level = level * qmul - qadd;
|
||||
} else {
|
||||
level = level * qmul + qadd;
|
||||
}
|
||||
block[i] = level;
|
||||
}
|
||||
}
|
||||
block += 4;
|
||||
i = 60 / 4;
|
||||
} else {
|
||||
i = zigzag_end[s->block_last_index[n]] / 4;
|
||||
}
|
||||
qmul = s->qscale << 1;
|
||||
qadd = WORD_VEC((qscale - 1) | 1);
|
||||
do {
|
||||
UINT64 levels, negmask, zeromask, corr;
|
||||
levels = ldq(block);
|
||||
if (levels == 0)
|
||||
continue;
|
||||
zeromask = cmpbge(0, levels);
|
||||
zeromask &= zeromask >> 1;
|
||||
/* Negate all negative words. */
|
||||
negmask = maxsw4(levels, WORD_VEC(0xffff)); /* negative -> ffff (-1) */
|
||||
negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */
|
||||
corr = negmask & WORD_VEC(0x0001); /* twos-complement correction */
|
||||
levels ^= negmask;
|
||||
levels += corr;
|
||||
|
||||
levels = levels * qmul;
|
||||
levels += zap(qadd, zeromask);
|
||||
|
||||
/* Re-negate negative words. */
|
||||
levels -= corr;
|
||||
levels ^= negmask;
|
||||
|
||||
stq(levels, block);
|
||||
} while (block += 4, --i);
|
||||
}
|
||||
|
||||
/* Install the Alpha dequantizer into the context.  It is selected only
   when amask reports the MVI extension as present (result 0) and the
   output format is H.263. */
void MPV_common_init_axp(MpegEncContext *s)
{
    if (amask(AMASK_MVI) == 0 && s->out_format == FMT_H263)
        s->dct_unquantize = dct_unquantize_h263_axp;
}
|
135
libavcodec/alpha/pixops.h
Normal file
135
libavcodec/alpha/pixops.h
Normal file
@ -0,0 +1,135 @@
|
||||
/*
|
||||
* Alpha optimized DSP utils
|
||||
* Copyright (c) 2002 Falk Hueffner <falk@debian.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
|
||||
/* This file is intended to be #included with proper definitions of
|
||||
* PIXOPNAME, BTYPE, AVG2, AVG4 and STORE. */
|
||||
|
||||
/* Template: transfer h rows of 8 pixels from `pixels` to `block` via
   STORE, advancing both pointers by line_size per row.  The source
   alignment is tested once up front; an aligned source uses ldq, any
   other uses uldq.  h must be at least 1 (do/while loop). */
static void PIXOPNAME(_pixels_axp)(BTYPE *block, const UINT8 *pixels,
                                   int line_size, int h)
{
    if (((size_t) pixels & 0x7) == 0) {
        do {
            STORE(ldq(pixels), block);
            block  += line_size;
            pixels += line_size;
        } while (--h);
        return;
    }

    do {
        STORE(uldq(pixels), block);
        block  += line_size;
        pixels += line_size;
    } while (--h);
}
|
||||
|
||||
/* Template: horizontal half-pel interpolation.  For each of h rows the
   8 source bytes are combined through AVG2 with the 8 bytes starting
   one pixel further on.  The second operand is derived from the first
   by shifting out one byte and inserting pixels[8] at the top, so nine
   source bytes are read per row.  h must be at least 1. */
static void PIXOPNAME(_pixels_x2_axp)(BTYPE *block, const UINT8 *pixels,
                                      int line_size, int h)
{
    UINT64 cur, next;

    if (((size_t) pixels & 0x7) == 0) {
        do {
            cur  = ldq(pixels);
            next = cur >> 8 | ((UINT64) pixels[8] << 56);
            STORE(AVG2(cur, next), block);
            block  += line_size;
            pixels += line_size;
        } while (--h);
        return;
    }

    do {
        cur  = uldq(pixels);
        next = cur >> 8 | ((UINT64) pixels[8] << 56);
        STORE(AVG2(cur, next), block);
        block  += line_size;
        pixels += line_size;
    } while (--h);
}
|
||||
|
||||
/* Template: vertical half-pel interpolation.  Each output row is AVG2
   of a source row and the row below it; the lower row is kept and
   reused as the upper row of the next iteration, so h + 1 source rows
   are read in total.  h must be at least 1. */
static void PIXOPNAME(_pixels_y2_axp)(BTYPE *block, const UINT8 *pixels,
                                      int line_size, int h)
{
    UINT64 upper, lower;

    if (((size_t) pixels & 0x7) == 0) {
        upper = ldq(pixels);
        do {
            pixels += line_size;
            lower = ldq(pixels);
            STORE(AVG2(upper, lower), block);
            block += line_size;
            upper = lower;
        } while (--h);
        return;
    }

    upper = uldq(pixels);
    do {
        pixels += line_size;
        lower = uldq(pixels);
        STORE(AVG2(upper, lower), block);
        block += line_size;
        upper = lower;
    } while (--h);
}
|
||||
|
||||
/* This could be further sped up by recycling AVG4 intermediate
|
||||
results from the previous loop pass. */
|
||||
static void PIXOPNAME(_pixels_xy2_axp)(BTYPE *block, const UINT8 *pixels,
|
||||
int line_size, int h)
|
||||
{
|
||||
if ((size_t) pixels & 0x7) {
|
||||
UINT64 pix1 = uldq(pixels);
|
||||
UINT64 pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56);
|
||||
|
||||
do {
|
||||
UINT64 next_pix1, next_pix2;
|
||||
|
||||
pixels += line_size;
|
||||
next_pix1 = uldq(pixels);
|
||||
next_pix2 = next_pix1 >> 8 | ((UINT64) pixels[8] << 56);
|
||||
|
||||
STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block);
|
||||
|
||||
block += line_size;
|
||||
pix1 = next_pix1;
|
||||
pix2 = next_pix2;
|
||||
} while (--h);
|
||||
} else {
|
||||
UINT64 pix1 = ldq(pixels);
|
||||
UINT64 pix2 = pix1 >> 8 | ((UINT64) pixels[8] << 56);
|
||||
|
||||
do {
|
||||
UINT64 next_pix1, next_pix2;
|
||||
|
||||
pixels += line_size;
|
||||
next_pix1 = ldq(pixels);
|
||||
next_pix2 = next_pix1 >> 8 | ((UINT64) pixels[8] << 56);
|
||||
|
||||
STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block);
|
||||
|
||||
block += line_size;
|
||||
pix1 = next_pix1;
|
||||
pix2 = next_pix2;
|
||||
} while (--h);
|
||||
}
|
||||
}
|
@ -497,6 +497,10 @@ void dsputil_init(void)
|
||||
dsputil_init_mlib();
|
||||
use_permuted_idct = 0;
|
||||
#endif
|
||||
#ifdef ARCH_ALPHA
|
||||
dsputil_init_alpha();
|
||||
use_permuted_idct = 0;
|
||||
#endif
|
||||
|
||||
#ifdef SIMPLE_IDCT
|
||||
if(ff_idct == simple_idct) use_permuted_idct=0;
|
||||
|
@ -123,6 +123,13 @@ void dsputil_init_armv4l(void);
|
||||
|
||||
void dsputil_init_mlib(void);
|
||||
|
||||
#elif defined(ARCH_ALPHA)
|
||||
|
||||
#define emms_c()
|
||||
#define __align8 __attribute__ ((aligned (8)))
|
||||
|
||||
void dsputil_init_alpha(void);
|
||||
|
||||
#else
|
||||
|
||||
#define emms_c()
|
||||
|
@ -460,7 +460,19 @@ static int msmpeg4_pred_dc(MpegEncContext * s, int n,
|
||||
: "r" (scale)
|
||||
: "%eax", "%edx"
|
||||
);
|
||||
#else
|
||||
#elif defined (ARCH_ALPHA)
|
||||
/* Divisions are extremely costly on Alpha; optimize the most
|
||||
common case. */
|
||||
if (scale == 8) {
|
||||
a = (a + (8 >> 1)) / 8;
|
||||
b = (b + (8 >> 1)) / 8;
|
||||
c = (c + (8 >> 1)) / 8;
|
||||
} else {
|
||||
a = (a + (scale >> 1)) / scale;
|
||||
b = (b + (scale >> 1)) / scale;
|
||||
c = (c + (scale >> 1)) / scale;
|
||||
}
|
||||
#else
|
||||
a = (a + (scale >> 1)) / scale;
|
||||
b = (b + (scale >> 1)) / scale;
|
||||
c = (c + (scale >> 1)) / scale;
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include <inttypes.h>
|
||||
|
||||
#include "simple_idct.h"
|
||||
#include "../config.h"
|
||||
|
||||
#if 0
|
||||
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
|
||||
@ -102,6 +103,107 @@ static int inline idctRowCondZ (int16_t * row)
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifdef ARCH_ALPHA
|
||||
static int inline idctRowCondDC(int16_t *row)
|
||||
{
|
||||
int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
|
||||
uint64_t *lrow = (uint64_t *) row;
|
||||
|
||||
if (lrow[1] == 0) {
|
||||
if (lrow[0] == 0)
|
||||
return 0;
|
||||
if ((lrow[0] & ~0xffffULL) == 0) {
|
||||
uint64_t v;
|
||||
|
||||
a0 = W4 * row[0];
|
||||
a0 += 1 << (ROW_SHIFT - 1);
|
||||
a0 >>= ROW_SHIFT;
|
||||
v = (uint16_t) a0;
|
||||
v += v << 16;
|
||||
v += v << 32;
|
||||
lrow[0] = v;
|
||||
lrow[1] = v;
|
||||
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
a0 = W4 * row[0];
|
||||
a1 = W4 * row[0];
|
||||
a2 = W4 * row[0];
|
||||
a3 = W4 * row[0];
|
||||
|
||||
if (row[2]) {
|
||||
a0 += W2 * row[2];
|
||||
a1 += W6 * row[2];
|
||||
a2 -= W6 * row[2];
|
||||
a3 -= W2 * row[2];
|
||||
}
|
||||
|
||||
if (row[4]) {
|
||||
a0 += W4 * row[4];
|
||||
a1 -= W4 * row[4];
|
||||
a2 -= W4 * row[4];
|
||||
a3 += W4 * row[4];
|
||||
}
|
||||
|
||||
if (row[6]) {
|
||||
a0 += W6 * row[6];
|
||||
a1 -= W2 * row[6];
|
||||
a2 += W2 * row[6];
|
||||
a3 -= W6 * row[6];
|
||||
}
|
||||
|
||||
a0 += 1 << (ROW_SHIFT - 1);
|
||||
a1 += 1 << (ROW_SHIFT - 1);
|
||||
a2 += 1 << (ROW_SHIFT - 1);
|
||||
a3 += 1 << (ROW_SHIFT - 1);
|
||||
|
||||
if (row[1]) {
|
||||
b0 = W1 * row[1];
|
||||
b1 = W3 * row[1];
|
||||
b2 = W5 * row[1];
|
||||
b3 = W7 * row[1];
|
||||
} else {
|
||||
b0 = 0;
|
||||
b1 = 0;
|
||||
b2 = 0;
|
||||
b3 = 0;
|
||||
}
|
||||
|
||||
if (row[3]) {
|
||||
b0 += W3 * row[3];
|
||||
b1 -= W7 * row[3];
|
||||
b2 -= W1 * row[3];
|
||||
b3 -= W5 * row[3];
|
||||
}
|
||||
|
||||
if (row[5]) {
|
||||
b0 += W5 * row[5];
|
||||
b1 -= W1 * row[5];
|
||||
b2 += W7 * row[5];
|
||||
b3 += W3 * row[5];
|
||||
}
|
||||
|
||||
if (row[7]) {
|
||||
b0 += W7 * row[7];
|
||||
b1 -= W5 * row[7];
|
||||
b2 += W3 * row[7];
|
||||
b3 -= W1 * row[7];
|
||||
}
|
||||
|
||||
row[0] = (a0 + b0) >> ROW_SHIFT;
|
||||
row[1] = (a1 + b1) >> ROW_SHIFT;
|
||||
row[2] = (a2 + b2) >> ROW_SHIFT;
|
||||
row[3] = (a3 + b3) >> ROW_SHIFT;
|
||||
row[4] = (a3 - b3) >> ROW_SHIFT;
|
||||
row[5] = (a2 - b2) >> ROW_SHIFT;
|
||||
row[6] = (a1 - b1) >> ROW_SHIFT;
|
||||
row[7] = (a0 - b0) >> ROW_SHIFT;
|
||||
|
||||
return 1;
|
||||
}
|
||||
#else /* not ARCH_ALPHA */
|
||||
static int inline idctRowCondDC (int16_t * row)
|
||||
{
|
||||
int a0, a1, a2, a3, b0, b1, b2, b3;
|
||||
@ -147,6 +249,7 @@ static int inline idctRowCondDC (int16_t * row)
|
||||
|
||||
return 1;
|
||||
}
|
||||
#endif /* not ARCH_ALPHA */
|
||||
|
||||
static void inline idctCol (int16_t * col)
|
||||
{
|
||||
@ -243,6 +346,7 @@ static void inline idctSparseCol (int16_t * col)
|
||||
b3 += - W1*col[8*7];
|
||||
}
|
||||
|
||||
#ifndef ARCH_ALPHA
|
||||
if(!(b0|b1|b2|b3)){
|
||||
col[8*0] = (a0) >> COL_SHIFT;
|
||||
col[8*7] = (a0) >> COL_SHIFT;
|
||||
@ -253,6 +357,7 @@ static void inline idctSparseCol (int16_t * col)
|
||||
col[8*3] = (a3) >> COL_SHIFT;
|
||||
col[8*4] = (a3) >> COL_SHIFT;
|
||||
}else{
|
||||
#endif
|
||||
col[8*0] = (a0 + b0) >> COL_SHIFT;
|
||||
col[8*7] = (a0 - b0) >> COL_SHIFT;
|
||||
col[8*1] = (a1 + b1) >> COL_SHIFT;
|
||||
@ -261,7 +366,9 @@ static void inline idctSparseCol (int16_t * col)
|
||||
col[8*5] = (a2 - b2) >> COL_SHIFT;
|
||||
col[8*3] = (a3 + b3) >> COL_SHIFT;
|
||||
col[8*4] = (a3 - b3) >> COL_SHIFT;
|
||||
#ifndef ARCH_ALPHA
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void inline idctSparse2Col (int16_t * col)
|
||||
@ -337,6 +444,34 @@ static void inline idctSparse2Col (int16_t * col)
|
||||
col[8*4] = (a3 - b3) >> COL_SHIFT;
|
||||
}
|
||||
|
||||
#ifdef ARCH_ALPHA
|
||||
/* If all rows but the first one are zero after row transformation,
|
||||
all rows will be identical after column transformation. */
|
||||
static inline void idctCol2(int16_t *col)
|
||||
{
|
||||
int i;
|
||||
uint64_t l, r;
|
||||
uint64_t *lcol = (uint64_t *) col;
|
||||
|
||||
for (i = 0; i < 8; ++i) {
|
||||
int a0 = col[0] + (1 << (COL_SHIFT - 1)) / W4;
|
||||
|
||||
a0 *= W4;
|
||||
col[0] = a0 >> COL_SHIFT;
|
||||
++col;
|
||||
}
|
||||
|
||||
l = lcol[0];
|
||||
r = lcol[1];
|
||||
lcol[ 2] = l; lcol[ 3] = r;
|
||||
lcol[ 4] = l; lcol[ 5] = r;
|
||||
lcol[ 6] = l; lcol[ 7] = r;
|
||||
lcol[ 8] = l; lcol[ 9] = r;
|
||||
lcol[10] = l; lcol[11] = r;
|
||||
lcol[12] = l; lcol[13] = r;
|
||||
lcol[14] = l; lcol[15] = r;
|
||||
}
|
||||
#endif
|
||||
|
||||
void simple_idct (short *block)
|
||||
{
|
||||
@ -411,7 +546,22 @@ void simple_idct (short *block)
|
||||
for(i=0; i<8; i++)
|
||||
idctSparse2Col(block + i);
|
||||
}
|
||||
#else
|
||||
#elif defined(ARCH_ALPHA)
|
||||
int shortcut = 1;
|
||||
|
||||
for (i = 0; i < 8; i++) {
|
||||
int anynonzero = idctRowCondDC(block + 8 * i);
|
||||
if (i > 0 && anynonzero)
|
||||
shortcut = 0;
|
||||
}
|
||||
|
||||
if (shortcut) {
|
||||
idctCol2(block);
|
||||
} else {
|
||||
for (i = 0; i < 8; i++)
|
||||
idctSparseCol(block + i);
|
||||
}
|
||||
#else
|
||||
for(i=0; i<8; i++)
|
||||
idctRowCondDC(block + i*8);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user