mirror of https://github.com/FFmpeg/FFmpeg.git (synced 2024-12-23 12:43:46 +02:00)

Sanitize altivec code so it can be built with runtime check properly

Originally committed as revision 10640 to svn://svn.ffmpeg.org/ffmpeg/trunk

parent 298726ba55
commit 89523beea4
@@ -403,7 +403,7 @@ OBJS-$(ARCH_SH4) += sh4/idct_sh4.o \
                                           sh4/dsputil_align.o \
                                           sh4/dsputil_sh4.o \

-OBJS-$(HAVE_ALTIVEC)                   += ppc/dsputil_altivec.o \
+ALTIVEC-OBJS-yes                       += ppc/dsputil_altivec.o \
                                           ppc/fdct_altivec.o \
                                           ppc/fft_altivec.o \
                                           ppc/float_altivec.o \
@@ -413,12 +413,17 @@ OBJS-$(HAVE_ALTIVEC) += ppc/dsputil_altivec.o \
                                           ppc/mpegvideo_altivec.o \
                                           ppc/mpegvideo_ppc.o \

-ifeq ($(HAVE_ALTIVEC),yes)
-OBJS-$(CONFIG_H264_DECODER)            += ppc/h264_altivec.o
-OBJS-$(CONFIG_SNOW_DECODER)            += ppc/snow_altivec.o
-OBJS-$(CONFIG_VC1_DECODER)             += ppc/vc1dsp_altivec.o
-OBJS-$(CONFIG_WMV3_DECODER)            += ppc/vc1dsp_altivec.o
-endif
+ALTIVEC-OBJS-$(CONFIG_H264_DECODER)    += ppc/h264_altivec.o
+ALTIVEC-OBJS-$(CONFIG_SNOW_DECODER)    += ppc/snow_altivec.o
+ALTIVEC-OBJS-$(CONFIG_VC1_DECODER)     += ppc/vc1dsp_altivec.o
+ALTIVEC-OBJS-$(CONFIG_WMV3_DECODER)    += ppc/vc1dsp_altivec.o
+
+# -maltivec is needed in order to build AltiVec code.
+$(ALTIVEC-OBJS-yes): CFLAGS += -maltivec -mabi=altivec
+
+# check_altivec must be built without -maltivec
+OBJS-$(HAVE_ALTIVEC)                   += $(ALTIVEC-OBJS-yes) \
+                                          ppc/check_altivec.o

 OBJS-$(ARCH_BFIN)                      += bfin/dsputil_bfin.o \
                                           bfin/mpegvideo_bfin.o \
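These two Makefile hunks carry the point of the commit: AltiVec objects are now collected in ALTIVEC-OBJS-yes and only those objects are compiled with -maltivec, while the new ppc/check_altivec.o stays out of that list so the translation unit performing the runtime probe never contains AltiVec instructions itself. Below is a minimal sketch, not part of the patch, of how such a runtime check is typically consumed by an init routine; DSPContextSketch, dsputil_init_ppc_sketch and the sad16 kernels are illustrative names only.

    #include <stdint.h>

    int has_altivec(void);   /* the probe added in ppc/check_altivec.c */

    typedef struct {
        int (*sad16)(void *ctx, uint8_t *a, uint8_t *b, int line_size, int h);
    } DSPContextSketch;

    /* Hypothetical kernels: a plain C fallback and an AltiVec version that
       would live in an object compiled with -maltivec. */
    int sad16_c(void *ctx, uint8_t *a, uint8_t *b, int line_size, int h);
    int sad16_altivec(void *ctx, uint8_t *a, uint8_t *b, int line_size, int h);

    void dsputil_init_ppc_sketch(DSPContextSketch *c)
    {
        c->sad16 = sad16_c;            /* safe default on any CPU */
        if (has_altivec())             /* only switch after the probe succeeds */
            c->sad16 = sad16_altivec;
    }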
@@ -557,12 +557,6 @@ extern int mm_flags;

 extern int mm_flags;

-#if defined(HAVE_ALTIVEC) && !defined(__APPLE_CC__)
-#define pixel altivec_pixel
-#include <altivec.h>
-#undef pixel
-#endif
-
 #define DECLARE_ALIGNED_8(t, v)    DECLARE_ALIGNED(16, t, v)
 #define STRIDE_ALIGN 16

@@ -28,6 +28,10 @@
 #include "swscale.h"
 #include "dsputil.h"

+#ifdef HAVE_ALTIVEC
+#include "ppc/imgresample_altivec.h"
+#endif
+
 #define NB_COMPONENTS 3

 #define PHASE_BITS 4
@@ -281,133 +285,6 @@ static void v_resample4_mmx(uint8_t *dst, int dst_width, const uint8_t *src,
 }
 #endif /* HAVE_MMX */

-#ifdef HAVE_ALTIVEC
-typedef union {
-    vector unsigned char v;
-    unsigned char c[16];
-} vec_uc_t;
-
-typedef union {
-    vector signed short v;
-    signed short s[8];
-} vec_ss_t;
-
-void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
-                          int wrap, int16_t *filter)
-{
-    int sum, i;
-    const uint8_t *s;
-    vector unsigned char *tv, tmp, dstv, zero;
-    vec_ss_t srchv[4], srclv[4], fv[4];
-    vector signed short zeros, sumhv, sumlv;
-    s = src;
-
-    for(i=0;i<4;i++)
-    {
-        /*
-           The vec_madds later on does an implicit >>15 on the result.
-           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
-           a signed short, we have just enough bits to pre-shift our
-           filter constants <<7 to compensate for vec_madds.
-        */
-        fv[i].s[0] = filter[i] << (15-FILTER_BITS);
-        fv[i].v = vec_splat(fv[i].v, 0);
-    }
-
-    zero = vec_splat_u8(0);
-    zeros = vec_splat_s16(0);
-
-
-    /*
-       When we're resampling, we'd ideally like both our input buffers,
-       and output buffers to be 16-byte aligned, so we can do both aligned
-       reads and writes. Sadly we can't always have this at the moment, so
-       we opt for aligned writes, as unaligned writes have a huge overhead.
-       To do this, do enough scalar resamples to get dst 16-byte aligned.
-    */
-    i = (-(int)dst) & 0xf;
-    while(i>0) {
-        sum = s[0 * wrap] * filter[0] +
-              s[1 * wrap] * filter[1] +
-              s[2 * wrap] * filter[2] +
-              s[3 * wrap] * filter[3];
-        sum = sum >> FILTER_BITS;
-        if (sum<0) sum = 0; else if (sum>255) sum=255;
-        dst[0] = sum;
-        dst++;
-        s++;
-        dst_width--;
-        i--;
-    }
-
-    /* Do our altivec resampling on 16 pixels at once. */
-    while(dst_width>=16) {
-        /*
-           Read 16 (potentially unaligned) bytes from each of
-           4 lines into 4 vectors, and split them into shorts.
-           Interleave the multipy/accumulate for the resample
-           filter with the loads to hide the 3 cycle latency
-           the vec_madds have.
-        */
-        tv = (vector unsigned char *) &s[0 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
-        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
-        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
-
-        tv = (vector unsigned char *) &s[1 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
-        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
-        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);
-
-        tv = (vector unsigned char *) &s[2 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
-        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
-        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);
-
-        tv = (vector unsigned char *) &s[3 * wrap];
-        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
-        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
-        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
-        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
-        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
-
-        /*
-           Pack the results into our destination vector,
-           and do an aligned write of that back to memory.
-        */
-        dstv = vec_packsu(sumhv, sumlv) ;
-        vec_st(dstv, 0, (vector unsigned char *) dst);
-
-        dst+=16;
-        s+=16;
-        dst_width-=16;
-    }
-
-    /*
-       If there are any leftover pixels, resample them
-       with the slow scalar method.
-    */
-    while(dst_width>0) {
-        sum = s[0 * wrap] * filter[0] +
-              s[1 * wrap] * filter[1] +
-              s[2 * wrap] * filter[2] +
-              s[3 * wrap] * filter[3];
-        sum = sum >> FILTER_BITS;
-        if (sum<0) sum = 0; else if (sum>255) sum=255;
-        dst[0] = sum;
-        dst++;
-        s++;
-        dst_width--;
-    }
-}
-#endif /* HAVE_ALTIVEC */
-
 /* slow version to handle limit cases. Does not need optimisation */
 static void h_resample_slow(uint8_t *dst, int dst_width,
                             const uint8_t *src, int src_width,
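One detail worth noting in the function being moved (it reappears verbatim in ppc/imgresample_altivec.c below) is the line i = (-(int)dst) & 0xf;, which computes how many scalar output pixels must be produced before dst reaches a 16-byte boundary, so that the vector stores can stay aligned. A standalone sketch of that idiom, not taken from the patch; bytes_until_aligned16 is a hypothetical name:

    #include <assert.h>
    #include <stdint.h>

    /* Number of bytes to advance before p becomes 16-byte aligned. */
    static unsigned bytes_until_aligned16(const void *p)
    {
        return (unsigned)(-(uintptr_t)p & 0xf);
    }

    int main(void)
    {
        uint8_t buf[32];
        uint8_t *p = buf + 5;                        /* arbitrary misalignment */
        unsigned skip = bytes_until_aligned16(p);
        assert((((uintptr_t)p + skip) & 0xf) == 0);  /* p + skip is aligned */
        assert(skip < 16);                           /* at most 15 scalar pixels */
        return 0;
    }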
libavcodec/ppc/check_altivec.c  (new file, 95 lines)
@@ -0,0 +1,95 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+/**
+ * @file check_altivec.c
+ * Checks for AltiVec presence.
+ */
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#elif __AMIGAOS4__
+#include <exec/exec.h>
+#include <interfaces/exec.h>
+#include <proto/exec.h>
+#else
+#include <signal.h>
+#include <setjmp.h>
+
+static sigjmp_buf jmpbuf;
+static volatile sig_atomic_t canjump = 0;
+
+static void sigill_handler (int sig)
+{
+    if (!canjump) {
+        signal (sig, SIG_DFL);
+        raise (sig);
+    }
+
+    canjump = 0;
+    siglongjmp (jmpbuf, 1);
+}
+#endif /* __APPLE__ */
+
+/**
+ * This function MAY rely on signal() or fork() in order to make sure altivec
+ * is present
+ */
+
+int has_altivec(void)
+{
+#ifdef __AMIGAOS4__
+    ULONG result = 0;
+    extern struct ExecIFace *IExec;
+
+    IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
+    if (result == VECTORTYPE_ALTIVEC) return 1;
+    return 0;
+#elif __APPLE__
+    int sels[2] = {CTL_HW, HW_VECTORUNIT};
+    int has_vu = 0;
+    size_t len = sizeof(has_vu);
+    int err;
+
+    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
+
+    if (err == 0) return (has_vu != 0);
+    return 0;
+#else
+    /* Do it the brute-force way, borrowed from the libmpeg2 library. */
+    {
+        signal (SIGILL, sigill_handler);
+        if (sigsetjmp (jmpbuf, 1)) {
+            signal (SIGILL, SIG_DFL);
+        } else {
+            canjump = 1;
+
+            asm volatile ("mtspr 256, %0\n\t"
+                          "vand %%v0, %%v0, %%v0"
+                          :
+                          : "r" (-1));
+
+            signal (SIGILL, SIG_DFL);
+            return 1;
+        }
+    }
+    return 0;
+#endif /* __AMIGAOS4__ */
+}
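Because check_altivec.o is built without -maltivec (see the Makefile hunk above), has_altivec() is safe to call from any translation unit before any vector code runs; on CPUs without AltiVec the SIGILL handler and sigsetjmp() make the probe return 0 instead of crashing. A hedged usage sketch follows; the surrounding program is illustrative and not part of FFmpeg:

    #include <stdio.h>

    int has_altivec(void);   /* implemented in libavcodec/ppc/check_altivec.c */

    int main(void)
    {
        /* Probe once at startup, then branch to scalar or AltiVec paths. */
        printf("AltiVec %s\n", has_altivec() ? "present" : "absent");
        return 0;
    }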
@@ -25,31 +25,7 @@
 #include "gcc_fixes.h"

 #include "dsputil_altivec.h"
+#include "util_altivec.h"
-
-#ifdef __APPLE__
-#include <sys/sysctl.h>
-#elif __AMIGAOS4__
-#include <exec/exec.h>
-#include <interfaces/exec.h>
-#include <proto/exec.h>
-#else
-#include <signal.h>
-#include <setjmp.h>
-
-static sigjmp_buf jmpbuf;
-static volatile sig_atomic_t canjump = 0;
-
-static void sigill_handler (int sig)
-{
-    if (!canjump) {
-        signal (sig, SIG_DFL);
-        raise (sig);
-    }
-
-    canjump = 0;
-    siglongjmp (jmpbuf, 1);
-}
-#endif /* __APPLE__ */

 int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 {
@@ -1417,47 +1393,6 @@ POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
     return score;
 }

-int has_altivec(void)
-{
-#ifdef __AMIGAOS4__
-    ULONG result = 0;
-    extern struct ExecIFace *IExec;
-
-    IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
-    if (result == VECTORTYPE_ALTIVEC) return 1;
-    return 0;
-#elif __APPLE__
-    int sels[2] = {CTL_HW, HW_VECTORUNIT};
-    int has_vu = 0;
-    size_t len = sizeof(has_vu);
-    int err;
-
-    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
-
-    if (err == 0) return (has_vu != 0);
-    return 0;
-#else
-    /* Do it the brute-force way, borrowed from the libmpeg2 library. */
-    {
-        signal (SIGILL, sigill_handler);
-        if (sigsetjmp (jmpbuf, 1)) {
-            signal (SIGILL, SIG_DFL);
-        } else {
-            canjump = 1;
-
-            asm volatile ("mtspr 256, %0\n\t"
-                          "vand %%v0, %%v0, %%v0"
-                          :
-                          : "r" (-1));
-
-            signal (SIGILL, SIG_DFL);
-            return 1;
-        }
-    }
-    return 0;
-#endif /* __AMIGAOS4__ */
-}
-
 static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                             int blocksize)
 {
@@ -31,83 +31,4 @@ void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size,

 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);

-// used to build registers permutation vectors (vcprm)
-// the 's' are for words in the _s_econd vector
-#define WORD_0 0x00,0x01,0x02,0x03
-#define WORD_1 0x04,0x05,0x06,0x07
-#define WORD_2 0x08,0x09,0x0a,0x0b
-#define WORD_3 0x0c,0x0d,0x0e,0x0f
-#define WORD_s0 0x10,0x11,0x12,0x13
-#define WORD_s1 0x14,0x15,0x16,0x17
-#define WORD_s2 0x18,0x19,0x1a,0x1b
-#define WORD_s3 0x1c,0x1d,0x1e,0x1f
-
-#ifdef __APPLE_CC__
-#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
-#else
-#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
-#endif
-
-// vcprmle is used to keep the same index as in the SSE version.
-// it's the same as vcprm, with the index inversed
-// ('le' is Little Endian)
-#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
-
-// used to build inverse/identity vectors (vcii)
-// n is _n_egative, p is _p_ositive
-#define FLOAT_n -1.
-#define FLOAT_p 1.
-
-#ifdef __APPLE_CC__
-#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
-#else
-#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
-#endif
-
-// Transpose 8x8 matrix of 16-bit elements (in-place)
-#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
-do { \
-    vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
-    vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
- \
-    A1 = vec_mergeh (a, e); \
-    B1 = vec_mergel (a, e); \
-    C1 = vec_mergeh (b, f); \
-    D1 = vec_mergel (b, f); \
-    E1 = vec_mergeh (c, g); \
-    F1 = vec_mergel (c, g); \
-    G1 = vec_mergeh (d, h); \
-    H1 = vec_mergel (d, h); \
- \
-    A2 = vec_mergeh (A1, E1); \
-    B2 = vec_mergel (A1, E1); \
-    C2 = vec_mergeh (B1, F1); \
-    D2 = vec_mergel (B1, F1); \
-    E2 = vec_mergeh (C1, G1); \
-    F2 = vec_mergel (C1, G1); \
-    G2 = vec_mergeh (D1, H1); \
-    H2 = vec_mergel (D1, H1); \
- \
-    a = vec_mergeh (A2, E2); \
-    b = vec_mergel (A2, E2); \
-    c = vec_mergeh (B2, F2); \
-    d = vec_mergel (B2, F2); \
-    e = vec_mergeh (C2, G2); \
-    f = vec_mergel (C2, G2); \
-    g = vec_mergeh (D2, H2); \
-    h = vec_mergel (D2, H2); \
-} while (0)
-
-
-/** \brief loads unaligned vector \a *src with offset \a offset
-    and returns it */
-static inline vector unsigned char unaligned_load(int offset, uint8_t *src)
-{
-    register vector unsigned char first = vec_ld(offset, src);
-    register vector unsigned char second = vec_ld(offset+15, src);
-    register vector unsigned char mask = vec_lvsl(offset, src);
-    return vec_perm(first, second, mask);
-}
-
 #endif /* DSPUTIL_ALTIVEC_H */
@@ -24,8 +24,8 @@

 #include "gcc_fixes.h"

-#include "dsputil_altivec.h"
-
+#include "dsputil_ppc.h"
+#include "util_altivec.h"
 /*
     those three macros are from libavcodec/fft.c
     and are required for the reference C code
@@ -24,7 +24,8 @@

 #include "gcc_fixes.h"

-#include "dsputil_altivec.h"
+#include "dsputil_ppc.h"
+#include "util_altivec.h"

 /*
   altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8,
@@ -22,7 +22,8 @@

 #include "gcc_fixes.h"

-#include "dsputil_altivec.h"
+#include "dsputil_ppc.h"
+#include "util_altivec.h"
 #include "types_altivec.h"

 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
libavcodec/ppc/imgresample_altivec.c  (new file, 153 lines)
@@ -0,0 +1,153 @@
+/*
+ * High quality image resampling with polyphase filters
+ * Copyright (c) 2001 Fabrice Bellard.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file imgresample_altivec.c
+ * High quality image resampling with polyphase filters - AltiVec bits
+ */
+
+#include "gcc_fixes.h"
+
+typedef union {
+    vector unsigned char v;
+    unsigned char c[16];
+} vec_uc_t;
+
+typedef union {
+    vector signed short v;
+    signed short s[8];
+} vec_ss_t;
+
+void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
+                          int wrap, int16_t *filter)
+{
+    int sum, i;
+    const uint8_t *s;
+    vector unsigned char *tv, tmp, dstv, zero;
+    vec_ss_t srchv[4], srclv[4], fv[4];
+    vector signed short zeros, sumhv, sumlv;
+    s = src;
+
+    for(i=0;i<4;i++)
+    {
+        /*
+           The vec_madds later on does an implicit >>15 on the result.
+           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
+           a signed short, we have just enough bits to pre-shift our
+           filter constants <<7 to compensate for vec_madds.
+        */
+        fv[i].s[0] = filter[i] << (15-FILTER_BITS);
+        fv[i].v = vec_splat(fv[i].v, 0);
+    }
+
+    zero = vec_splat_u8(0);
+    zeros = vec_splat_s16(0);
+
+
+    /*
+       When we're resampling, we'd ideally like both our input buffers,
+       and output buffers to be 16-byte aligned, so we can do both aligned
+       reads and writes. Sadly we can't always have this at the moment, so
+       we opt for aligned writes, as unaligned writes have a huge overhead.
+       To do this, do enough scalar resamples to get dst 16-byte aligned.
+    */
+    i = (-(int)dst) & 0xf;
+    while(i>0) {
+        sum = s[0 * wrap] * filter[0] +
+              s[1 * wrap] * filter[1] +
+              s[2 * wrap] * filter[2] +
+              s[3 * wrap] * filter[3];
+        sum = sum >> FILTER_BITS;
+        if (sum<0) sum = 0; else if (sum>255) sum=255;
+        dst[0] = sum;
+        dst++;
+        s++;
+        dst_width--;
+        i--;
+    }
+
+    /* Do our altivec resampling on 16 pixels at once. */
+    while(dst_width>=16) {
+        /*
+           Read 16 (potentially unaligned) bytes from each of
+           4 lines into 4 vectors, and split them into shorts.
+           Interleave the multipy/accumulate for the resample
+           filter with the loads to hide the 3 cycle latency
+           the vec_madds have.
+        */
+        tv = (vector unsigned char *) &s[0 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
+        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
+        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
+
+        tv = (vector unsigned char *) &s[1 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
+        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
+        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);
+
+        tv = (vector unsigned char *) &s[2 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
+        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
+        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);
+
+        tv = (vector unsigned char *) &s[3 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
+        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
+        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
+
+        /*
+           Pack the results into our destination vector,
+           and do an aligned write of that back to memory.
+        */
+        dstv = vec_packsu(sumhv, sumlv) ;
+        vec_st(dstv, 0, (vector unsigned char *) dst);
+
+        dst+=16;
+        s+=16;
+        dst_width-=16;
+    }
+
+    /*
+       If there are any leftover pixels, resample them
+       with the slow scalar method.
+    */
+    while(dst_width>0) {
+        sum = s[0 * wrap] * filter[0] +
+              s[1 * wrap] * filter[1] +
+              s[2 * wrap] * filter[2] +
+              s[3 * wrap] * filter[3];
+        sum = sum >> FILTER_BITS;
+        if (sum<0) sum = 0; else if (sum>255) sum=255;
+        dst[0] = sum;
+        dst++;
+        s++;
+        dst_width--;
+    }
+}
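The comment inside the coefficient loop above explains why each filter tap is pre-shifted by 15-FILTER_BITS before being handed to vec_madds. A scalar illustration of that identity, assuming FILTER_BITS is 8 as in imgresample.c and ignoring vec_madds' internal rounding and saturation; the sample values are arbitrary:

    #include <assert.h>
    #include <stdint.h>

    #define FILTER_BITS 8

    int main(void)
    {
        int16_t coef = 37;                          /* some filter tap */
        int16_t pix  = 200;                         /* some 8-bit sample */
        int16_t pre  = coef << (15 - FILTER_BITS);  /* pre-shifted tap */

        /* vec_madds computes roughly (a * b) >> 15 per element, so the
           pre-shift recovers the scalar (pix * coef) >> FILTER_BITS. */
        assert(((pix * pre) >> 15) == ((pix * coef) >> FILTER_BITS));
        return 0;
    }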
libavcodec/ppc/imgresample_altivec.h  (new file, 24 lines)
@@ -0,0 +1,24 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef IMGRESAMPLE_ALTIVEC_H
+#define IMGRESAMPLE_ALTIVEC_H
+
+void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
+                          int wrap, int16_t *filter);
+#endif /* IMGRESAMPLE_ALTIVEC_H */
@@ -28,8 +28,8 @@

 #include "gcc_fixes.h"

-#include "dsputil_altivec.h"
-
+#include "dsputil_ppc.h"
+#include "util_altivec.h"
 // Swaps two variables (used for altivec registers)
 #define SWAP(a,b) \
 do { \
libavcodec/ppc/util_altivec.h  (new file, 106 lines)
@@ -0,0 +1,106 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file util_altivec.h
+ * Contains misc utility macros and inline functions
+ */
+
+#ifndef UTIL_ALTIVEC_H
+#define UTIL_ALTIVEC_H
+
+// used to build registers permutation vectors (vcprm)
+// the 's' are for words in the _s_econd vector
+#define WORD_0 0x00,0x01,0x02,0x03
+#define WORD_1 0x04,0x05,0x06,0x07
+#define WORD_2 0x08,0x09,0x0a,0x0b
+#define WORD_3 0x0c,0x0d,0x0e,0x0f
+#define WORD_s0 0x10,0x11,0x12,0x13
+#define WORD_s1 0x14,0x15,0x16,0x17
+#define WORD_s2 0x18,0x19,0x1a,0x1b
+#define WORD_s3 0x1c,0x1d,0x1e,0x1f
+
+#ifdef __APPLE_CC__
+#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
+#else
+#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
+#endif
+
+// vcprmle is used to keep the same index as in the SSE version.
+// it's the same as vcprm, with the index inversed
+// ('le' is Little Endian)
+#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
+
+// used to build inverse/identity vectors (vcii)
+// n is _n_egative, p is _p_ositive
+#define FLOAT_n -1.
+#define FLOAT_p 1.
+
+#ifdef __APPLE_CC__
+#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
+#else
+#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
+#endif
+
+// Transpose 8x8 matrix of 16-bit elements (in-place)
+#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
+do { \
+    vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
+    vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
+ \
+    A1 = vec_mergeh (a, e); \
+    B1 = vec_mergel (a, e); \
+    C1 = vec_mergeh (b, f); \
+    D1 = vec_mergel (b, f); \
+    E1 = vec_mergeh (c, g); \
+    F1 = vec_mergel (c, g); \
+    G1 = vec_mergeh (d, h); \
+    H1 = vec_mergel (d, h); \
+ \
+    A2 = vec_mergeh (A1, E1); \
+    B2 = vec_mergel (A1, E1); \
+    C2 = vec_mergeh (B1, F1); \
+    D2 = vec_mergel (B1, F1); \
+    E2 = vec_mergeh (C1, G1); \
+    F2 = vec_mergel (C1, G1); \
+    G2 = vec_mergeh (D1, H1); \
+    H2 = vec_mergel (D1, H1); \
+ \
+    a = vec_mergeh (A2, E2); \
+    b = vec_mergel (A2, E2); \
+    c = vec_mergeh (B2, F2); \
+    d = vec_mergel (B2, F2); \
+    e = vec_mergeh (C2, G2); \
+    f = vec_mergel (C2, G2); \
+    g = vec_mergeh (D2, H2); \
+    h = vec_mergel (D2, H2); \
+} while (0)
+
+
+/** \brief loads unaligned vector \a *src with offset \a offset
+    and returns it */
+static inline vector unsigned char unaligned_load(int offset, uint8_t *src)
+{
+    register vector unsigned char first = vec_ld(offset, src);
+    register vector unsigned char second = vec_ld(offset+15, src);
+    register vector unsigned char mask = vec_lvsl(offset, src);
+    return vec_perm(first, second, mask);
+}
+
+#endif /* UTIL_ALTIVEC_H */
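The vcprm() macro above assembles a 16-byte permutation constant for vec_perm() out of 32-bit word selectors: 0-3 select words of the first operand and s0-s3 words of the second. A hedged usage sketch, not from the patch; combine_low_halves is a hypothetical helper and the snippet only compiles with -maltivec on a PowerPC target:

    #include <altivec.h>
    #include "util_altivec.h"

    /* Words 0 and 1 of a followed by words 0 and 1 of b, i.e. the
       permutation vcprm(0,1,s0,s1) selects bytes 0..7 of each operand. */
    static vector float combine_low_halves(vector float a, vector float b)
    {
        return vec_perm(a, b, vcprm(0, 1, s0, s1));
    }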
@@ -23,7 +23,7 @@

 #include "gcc_fixes.h"

-#include "dsputil_altivec.h"
+#include "util_altivec.h"

 // main steps of 8x8 transform
 #define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \