From 89523beea45e265d985aace8be79b45e94f21e6b Mon Sep 17 00:00:00 2001
From: Luca Barbato <lu_zero@gentoo.org>
Date: Tue, 2 Oct 2007 11:39:32 +0000
Subject: [PATCH] Sanitize altivec code so it can be built with runtime check properly

Originally committed as revision 10640 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/Makefile                  |  19 ++--
 libavcodec/dsputil.h                 |   6 --
 libavcodec/imgresample.c             | 131 +----------------------
 libavcodec/ppc/check_altivec.c       |  95 +++++++++++++++++
 libavcodec/ppc/dsputil_altivec.c     |  67 +-----------
 libavcodec/ppc/dsputil_altivec.h     |  79 --------------
 libavcodec/ppc/fft_altivec.c         |   4 +-
 libavcodec/ppc/gmc_altivec.c         |   3 +-
 libavcodec/ppc/h264_altivec.c        |   3 +-
 libavcodec/ppc/imgresample_altivec.c | 153 +++++++++++++++++++++++++++
 libavcodec/ppc/imgresample_altivec.h |  24 +++++
 libavcodec/ppc/mpegvideo_altivec.c   |   4 +-
 libavcodec/ppc/util_altivec.h        | 106 +++++++++++++++++++
 libavcodec/ppc/vc1dsp_altivec.c      |   2 +-
 14 files changed, 404 insertions(+), 292 deletions(-)
 create mode 100644 libavcodec/ppc/check_altivec.c
 create mode 100644 libavcodec/ppc/imgresample_altivec.c
 create mode 100644 libavcodec/ppc/imgresample_altivec.h
 create mode 100644 libavcodec/ppc/util_altivec.h

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index a41dd8f53c..e5d6b13b2d 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -403,7 +403,7 @@ OBJS-$(ARCH_SH4)                       += sh4/idct_sh4.o \
                                           sh4/dsputil_align.o \
                                           sh4/dsputil_sh4.o \
 
-OBJS-$(HAVE_ALTIVEC)                   += ppc/dsputil_altivec.o \
+ALTIVEC-OBJS-yes                       += ppc/dsputil_altivec.o \
                                           ppc/fdct_altivec.o \
                                           ppc/fft_altivec.o \
                                           ppc/float_altivec.o \
@@ -413,12 +413,17 @@ OBJS-$(HAVE_ALTIVEC)                   += ppc/dsputil_altivec.o \
                                           ppc/mpegvideo_altivec.o \
                                           ppc/mpegvideo_ppc.o \
 
-ifeq ($(HAVE_ALTIVEC),yes)
-OBJS-$(CONFIG_H264_DECODER)            += ppc/h264_altivec.o
-OBJS-$(CONFIG_SNOW_DECODER)            += ppc/snow_altivec.o
-OBJS-$(CONFIG_VC1_DECODER)             += ppc/vc1dsp_altivec.o
-OBJS-$(CONFIG_WMV3_DECODER)            += ppc/vc1dsp_altivec.o
-endif
+ALTIVEC-OBJS-$(CONFIG_H264_DECODER)    += ppc/h264_altivec.o
+ALTIVEC-OBJS-$(CONFIG_SNOW_DECODER)    += ppc/snow_altivec.o
+ALTIVEC-OBJS-$(CONFIG_VC1_DECODER)     += ppc/vc1dsp_altivec.o
+ALTIVEC-OBJS-$(CONFIG_WMV3_DECODER)    += ppc/vc1dsp_altivec.o
+
+# -maltivec is needed in order to build AltiVec code.
+$(ALTIVEC-OBJS-yes): CFLAGS += -maltivec -mabi=altivec
+
+# check_altivec must be built without -maltivec
+OBJS-$(HAVE_ALTIVEC)                   += $(ALTIVEC-OBJS-yes) \
+                                          ppc/check_altivec.o
 
 OBJS-$(ARCH_BFIN)                      += bfin/dsputil_bfin.o \
                                           bfin/mpegvideo_bfin.o \
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 3c6121c3b6..dcccee991b 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -557,12 +557,6 @@ extern int mm_flags;
 
 extern int mm_flags;
 
-#if defined(HAVE_ALTIVEC) && !defined(__APPLE_CC__)
-#define pixel altivec_pixel
-#include <altivec.h>
-#undef pixel
-#endif
-
 #define DECLARE_ALIGNED_8(t, v)    DECLARE_ALIGNED(16, t, v)
 #define STRIDE_ALIGN 16
diff --git a/libavcodec/imgresample.c b/libavcodec/imgresample.c
index 3e1f3fe110..d1d6757c6c 100644
--- a/libavcodec/imgresample.c
+++ b/libavcodec/imgresample.c
@@ -28,6 +28,10 @@
 #include "swscale.h"
 #include "dsputil.h"
 
+#ifdef HAVE_ALTIVEC
+#include "ppc/imgresample_altivec.h"
+#endif
+
 #define NB_COMPONENTS 3
 
 #define PHASE_BITS 4
@@ -281,133 +285,6 @@ static void v_resample4_mmx(uint8_t *dst, int dst_width, const uint8_t *src,
 }
 #endif /* HAVE_MMX */
 
-#ifdef HAVE_ALTIVEC
-typedef union {
-    vector unsigned char v;
-    unsigned char c[16];
-} vec_uc_t;
-
-typedef union {
-    vector signed short v;
-    signed short s[8];
-} vec_ss_t;
-
-void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
-                          int wrap, int16_t *filter)
-{
-    int sum, i;
-    const uint8_t *s;
-    vector unsigned char *tv, tmp, dstv, zero;
-    vec_ss_t srchv[4], srclv[4], fv[4];
-    vector signed short zeros, sumhv, sumlv;
-    s = src;
-
-    for(i=0;i<4;i++)
-    {
-        /*
-           The vec_madds later on does an implicit >>15 on the result.
-           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
-           a signed short, we have just enough bits to pre-shift our
-           filter constants <<7 to compensate for vec_madds.
-        */
-        fv[i].s[0] = filter[i] << (15-FILTER_BITS);
-        fv[i].v = vec_splat(fv[i].v, 0);
-    }
-
-    zero = vec_splat_u8(0);
-    zeros = vec_splat_s16(0);
-
-
-    /*
-       When we're resampling, we'd ideally like both our input buffers,
-       and output buffers to be 16-byte aligned, so we can do both aligned
-       reads and writes. Sadly we can't always have this at the moment, so
-       we opt for aligned writes, as unaligned writes have a huge overhead.
-       To do this, do enough scalar resamples to get dst 16-byte aligned.
-    */
-    i = (-(int)dst) & 0xf;
-    while(i>0) {
-        sum = s[0 * wrap] * filter[0] +
-              s[1 * wrap] * filter[1] +
-              s[2 * wrap] * filter[2] +
-              s[3 * wrap] * filter[3];
-        sum = sum >> FILTER_BITS;
-        if (sum<0) sum = 0; else if (sum>255) sum=255;
-        dst[0] = sum;
-        dst++;
-        s++;
-        dst_width--;
-        i--;
-    }
-
-    /* Do our altivec resampling on 16 pixels at once. */
-    while(dst_width>=16) {
-        /*
-           Read 16 (potentially unaligned) bytes from each of
-           4 lines into 4 vectors, and split them into shorts.
-           Interleave the multipy/accumulate for the resample
-           filter with the loads to hide the 3 cycle latency
-           the vec_madds have.
- */ - tv = (vector unsigned char *) &s[0 * wrap]; - tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap])); - srchv[0].v = (vector signed short) vec_mergeh(zero, tmp); - srclv[0].v = (vector signed short) vec_mergel(zero, tmp); - sumhv = vec_madds(srchv[0].v, fv[0].v, zeros); - sumlv = vec_madds(srclv[0].v, fv[0].v, zeros); - - tv = (vector unsigned char *) &s[1 * wrap]; - tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap])); - srchv[1].v = (vector signed short) vec_mergeh(zero, tmp); - srclv[1].v = (vector signed short) vec_mergel(zero, tmp); - sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv); - sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv); - - tv = (vector unsigned char *) &s[2 * wrap]; - tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap])); - srchv[2].v = (vector signed short) vec_mergeh(zero, tmp); - srclv[2].v = (vector signed short) vec_mergel(zero, tmp); - sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv); - sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv); - - tv = (vector unsigned char *) &s[3 * wrap]; - tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap])); - srchv[3].v = (vector signed short) vec_mergeh(zero, tmp); - srclv[3].v = (vector signed short) vec_mergel(zero, tmp); - sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv); - sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv); - - /* - Pack the results into our destination vector, - and do an aligned write of that back to memory. - */ - dstv = vec_packsu(sumhv, sumlv) ; - vec_st(dstv, 0, (vector unsigned char *) dst); - - dst+=16; - s+=16; - dst_width-=16; - } - - /* - If there are any leftover pixels, resample them - with the slow scalar method. - */ - while(dst_width>0) { - sum = s[0 * wrap] * filter[0] + - s[1 * wrap] * filter[1] + - s[2 * wrap] * filter[2] + - s[3 * wrap] * filter[3]; - sum = sum >> FILTER_BITS; - if (sum<0) sum = 0; else if (sum>255) sum=255; - dst[0] = sum; - dst++; - s++; - dst_width--; - } -} -#endif /* HAVE_ALTIVEC */ - /* slow version to handle limit cases. Does not need optimisation */ static void h_resample_slow(uint8_t *dst, int dst_width, const uint8_t *src, int src_width, diff --git a/libavcodec/ppc/check_altivec.c b/libavcodec/ppc/check_altivec.c new file mode 100644 index 0000000000..72b4afe48a --- /dev/null +++ b/libavcodec/ppc/check_altivec.c @@ -0,0 +1,95 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +/** + * @file check_altivec.c + * Checks for AltiVec presence. 
+ */
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#elif __AMIGAOS4__
+#include <exec/exec.h>
+#include <interfaces/exec.h>
+#include <proto/exec.h>
+#else
+#include <signal.h>
+#include <setjmp.h>
+
+static sigjmp_buf jmpbuf;
+static volatile sig_atomic_t canjump = 0;
+
+static void sigill_handler (int sig)
+{
+    if (!canjump) {
+        signal (sig, SIG_DFL);
+        raise (sig);
+    }
+
+    canjump = 0;
+    siglongjmp (jmpbuf, 1);
+}
+#endif /* __APPLE__ */
+
+/**
+ * This function MAY rely on signal() or fork() in order to make sure altivec
+ * is present
+ */
+
+int has_altivec(void)
+{
+#ifdef __AMIGAOS4__
+    ULONG result = 0;
+    extern struct ExecIFace *IExec;
+
+    IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
+    if (result == VECTORTYPE_ALTIVEC) return 1;
+    return 0;
+#elif __APPLE__
+    int sels[2] = {CTL_HW, HW_VECTORUNIT};
+    int has_vu = 0;
+    size_t len = sizeof(has_vu);
+    int err;
+
+    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
+
+    if (err == 0) return (has_vu != 0);
+    return 0;
+#else
+/* Do it the brute-force way, borrowed from the libmpeg2 library. */
+    {
+        signal (SIGILL, sigill_handler);
+        if (sigsetjmp (jmpbuf, 1)) {
+            signal (SIGILL, SIG_DFL);
+        } else {
+            canjump = 1;
+
+            asm volatile ("mtspr 256, %0\n\t"
+                          "vand %%v0, %%v0, %%v0"
+                          :
+                          : "r" (-1));
+
+            signal (SIGILL, SIG_DFL);
+            return 1;
+        }
+    }
+    return 0;
+#endif /* __AMIGAOS4__ */
+}
+
diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c
index 0fb13d91eb..da243d989f 100644
--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@@ -25,31 +25,7 @@
 #include "gcc_fixes.h"
 
 #include "dsputil_altivec.h"
-
-#ifdef __APPLE__
-#include <sys/sysctl.h>
-#elif __AMIGAOS4__
-#include <exec/exec.h>
-#include <interfaces/exec.h>
-#include <proto/exec.h>
-#else
-#include <signal.h>
-#include <setjmp.h>
-
-static sigjmp_buf jmpbuf;
-static volatile sig_atomic_t canjump = 0;
-
-static void sigill_handler (int sig)
-{
-    if (!canjump) {
-        signal (sig, SIG_DFL);
-        raise (sig);
-    }
-
-    canjump = 0;
-    siglongjmp (jmpbuf, 1);
-}
-#endif /* __APPLE__ */
+#include "util_altivec.h"
 
 int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 {
@@ -1417,47 +1393,6 @@ POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
     return score;
 }
 
-int has_altivec(void)
-{
-#ifdef __AMIGAOS4__
-    ULONG result = 0;
-    extern struct ExecIFace *IExec;
-
-    IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
-    if (result == VECTORTYPE_ALTIVEC) return 1;
-    return 0;
-#elif __APPLE__
-    int sels[2] = {CTL_HW, HW_VECTORUNIT};
-    int has_vu = 0;
-    size_t len = sizeof(has_vu);
-    int err;
-
-    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
-
-    if (err == 0) return (has_vu != 0);
-    return 0;
-#else
-/* Do it the brute-force way, borrowed from the libmpeg2 library. 
*/ - { - signal (SIGILL, sigill_handler); - if (sigsetjmp (jmpbuf, 1)) { - signal (SIGILL, SIG_DFL); - } else { - canjump = 1; - - asm volatile ("mtspr 256, %0\n\t" - "vand %%v0, %%v0, %%v0" - : - : "r" (-1)); - - signal (SIGILL, SIG_DFL); - return 1; - } - } - return 0; -#endif /* __AMIGAOS4__ */ -} - static void vorbis_inverse_coupling_altivec(float *mag, float *ang, int blocksize) { diff --git a/libavcodec/ppc/dsputil_altivec.h b/libavcodec/ppc/dsputil_altivec.h index 846b4c4d68..539fa04e74 100644 --- a/libavcodec/ppc/dsputil_altivec.h +++ b/libavcodec/ppc/dsputil_altivec.h @@ -31,83 +31,4 @@ void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); -// used to build registers permutation vectors (vcprm) -// the 's' are for words in the _s_econd vector -#define WORD_0 0x00,0x01,0x02,0x03 -#define WORD_1 0x04,0x05,0x06,0x07 -#define WORD_2 0x08,0x09,0x0a,0x0b -#define WORD_3 0x0c,0x0d,0x0e,0x0f -#define WORD_s0 0x10,0x11,0x12,0x13 -#define WORD_s1 0x14,0x15,0x16,0x17 -#define WORD_s2 0x18,0x19,0x1a,0x1b -#define WORD_s3 0x1c,0x1d,0x1e,0x1f - -#ifdef __APPLE_CC__ -#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d) -#else -#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d} -#endif - -// vcprmle is used to keep the same index as in the SSE version. -// it's the same as vcprm, with the index inversed -// ('le' is Little Endian) -#define vcprmle(a,b,c,d) vcprm(d,c,b,a) - -// used to build inverse/identity vectors (vcii) -// n is _n_egative, p is _p_ositive -#define FLOAT_n -1. -#define FLOAT_p 1. - - -#ifdef __APPLE_CC__ -#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) -#else -#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d} -#endif - -// Transpose 8x8 matrix of 16-bit elements (in-place) -#define TRANSPOSE8(a,b,c,d,e,f,g,h) \ -do { \ - vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \ - vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \ - \ - A1 = vec_mergeh (a, e); \ - B1 = vec_mergel (a, e); \ - C1 = vec_mergeh (b, f); \ - D1 = vec_mergel (b, f); \ - E1 = vec_mergeh (c, g); \ - F1 = vec_mergel (c, g); \ - G1 = vec_mergeh (d, h); \ - H1 = vec_mergel (d, h); \ - \ - A2 = vec_mergeh (A1, E1); \ - B2 = vec_mergel (A1, E1); \ - C2 = vec_mergeh (B1, F1); \ - D2 = vec_mergel (B1, F1); \ - E2 = vec_mergeh (C1, G1); \ - F2 = vec_mergel (C1, G1); \ - G2 = vec_mergeh (D1, H1); \ - H2 = vec_mergel (D1, H1); \ - \ - a = vec_mergeh (A2, E2); \ - b = vec_mergel (A2, E2); \ - c = vec_mergeh (B2, F2); \ - d = vec_mergel (B2, F2); \ - e = vec_mergeh (C2, G2); \ - f = vec_mergel (C2, G2); \ - g = vec_mergeh (D2, H2); \ - h = vec_mergel (D2, H2); \ -} while (0) - - -/** \brief loads unaligned vector \a *src with offset \a offset - and returns it */ -static inline vector unsigned char unaligned_load(int offset, uint8_t *src) -{ - register vector unsigned char first = vec_ld(offset, src); - register vector unsigned char second = vec_ld(offset+15, src); - register vector unsigned char mask = vec_lvsl(offset, src); - return vec_perm(first, second, mask); -} - #endif /* DSPUTIL_ALTIVEC_H */ diff --git a/libavcodec/ppc/fft_altivec.c b/libavcodec/ppc/fft_altivec.c index 62de81f5d8..e0b77807f8 100644 --- a/libavcodec/ppc/fft_altivec.c +++ b/libavcodec/ppc/fft_altivec.c @@ -24,8 +24,8 @@ #include "gcc_fixes.h" 
-#include "dsputil_altivec.h" - +#include "dsputil_ppc.h" +#include "util_altivec.h" /* those three macros are from libavcodec/fft.c and are required for the reference C code diff --git a/libavcodec/ppc/gmc_altivec.c b/libavcodec/ppc/gmc_altivec.c index 1126eb89ca..b7a477dd96 100644 --- a/libavcodec/ppc/gmc_altivec.c +++ b/libavcodec/ppc/gmc_altivec.c @@ -24,7 +24,8 @@ #include "gcc_fixes.h" -#include "dsputil_altivec.h" +#include "dsputil_ppc.h" +#include "util_altivec.h" /* altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8, diff --git a/libavcodec/ppc/h264_altivec.c b/libavcodec/ppc/h264_altivec.c index 6e42694feb..8a98435163 100644 --- a/libavcodec/ppc/h264_altivec.c +++ b/libavcodec/ppc/h264_altivec.c @@ -22,7 +22,8 @@ #include "gcc_fixes.h" -#include "dsputil_altivec.h" +#include "dsputil_ppc.h" +#include "util_altivec.h" #include "types_altivec.h" #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s diff --git a/libavcodec/ppc/imgresample_altivec.c b/libavcodec/ppc/imgresample_altivec.c new file mode 100644 index 0000000000..3b161c5a6d --- /dev/null +++ b/libavcodec/ppc/imgresample_altivec.c @@ -0,0 +1,153 @@ +/* + * High quality image resampling with polyphase filters + * Copyright (c) 2001 Fabrice Bellard. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file imgresample_altivec.c + * High quality image resampling with polyphase filters - AltiVec bits + */ + +#include "gcc_fixes.h" + +typedef union { + vector unsigned char v; + unsigned char c[16]; +} vec_uc_t; + +typedef union { + vector signed short v; + signed short s[8]; +} vec_ss_t; + +void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, + int wrap, int16_t *filter) +{ + int sum, i; + const uint8_t *s; + vector unsigned char *tv, tmp, dstv, zero; + vec_ss_t srchv[4], srclv[4], fv[4]; + vector signed short zeros, sumhv, sumlv; + s = src; + + for(i=0;i<4;i++) + { + /* + The vec_madds later on does an implicit >>15 on the result. + Since FILTER_BITS is 8, and we have 15 bits of magnitude in + a signed short, we have just enough bits to pre-shift our + filter constants <<7 to compensate for vec_madds. + */ + fv[i].s[0] = filter[i] << (15-FILTER_BITS); + fv[i].v = vec_splat(fv[i].v, 0); + } + + zero = vec_splat_u8(0); + zeros = vec_splat_s16(0); + + + /* + When we're resampling, we'd ideally like both our input buffers, + and output buffers to be 16-byte aligned, so we can do both aligned + reads and writes. Sadly we can't always have this at the moment, so + we opt for aligned writes, as unaligned writes have a huge overhead. + To do this, do enough scalar resamples to get dst 16-byte aligned. 
+    */
+    i = (-(int)dst) & 0xf;
+    while(i>0) {
+        sum = s[0 * wrap] * filter[0] +
+              s[1 * wrap] * filter[1] +
+              s[2 * wrap] * filter[2] +
+              s[3 * wrap] * filter[3];
+        sum = sum >> FILTER_BITS;
+        if (sum<0) sum = 0; else if (sum>255) sum=255;
+        dst[0] = sum;
+        dst++;
+        s++;
+        dst_width--;
+        i--;
+    }
+
+    /* Do our altivec resampling on 16 pixels at once. */
+    while(dst_width>=16) {
+        /*
+           Read 16 (potentially unaligned) bytes from each of
+           4 lines into 4 vectors, and split them into shorts.
+           Interleave the multiply/accumulate for the resample
+           filter with the loads to hide the 3 cycle latency
+           the vec_madds have.
+        */
+        tv = (vector unsigned char *) &s[0 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
+        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
+        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
+
+        tv = (vector unsigned char *) &s[1 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
+        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
+        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);
+
+        tv = (vector unsigned char *) &s[2 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
+        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
+        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);
+
+        tv = (vector unsigned char *) &s[3 * wrap];
+        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
+        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
+        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
+        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
+        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
+
+        /*
+           Pack the results into our destination vector,
+           and do an aligned write of that back to memory.
+        */
+        dstv = vec_packsu(sumhv, sumlv);
+        vec_st(dstv, 0, (vector unsigned char *) dst);
+
+        dst+=16;
+        s+=16;
+        dst_width-=16;
+    }
+
+    /*
+       If there are any leftover pixels, resample them
+       with the slow scalar method.
+    */
+    while(dst_width>0) {
+        sum = s[0 * wrap] * filter[0] +
+              s[1 * wrap] * filter[1] +
+              s[2 * wrap] * filter[2] +
+              s[3 * wrap] * filter[3];
+        sum = sum >> FILTER_BITS;
+        if (sum<0) sum = 0; else if (sum>255) sum=255;
+        dst[0] = sum;
+        dst++;
+        s++;
+        dst_width--;
+    }
+}
+
diff --git a/libavcodec/ppc/imgresample_altivec.h b/libavcodec/ppc/imgresample_altivec.h
new file mode 100644
index 0000000000..e7a1254591
--- /dev/null
+++ b/libavcodec/ppc/imgresample_altivec.h
@@ -0,0 +1,24 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef IMGRESAMPLE_ALTIVEC_H +#define IMGRESAMPLE_ALTIVEC_H + +void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, + int wrap, int16_t *filter); +#endif /* IMGRESAMPLE_ALTIVEC_H */ diff --git a/libavcodec/ppc/mpegvideo_altivec.c b/libavcodec/ppc/mpegvideo_altivec.c index 2292dbc6fb..12795e49ff 100644 --- a/libavcodec/ppc/mpegvideo_altivec.c +++ b/libavcodec/ppc/mpegvideo_altivec.c @@ -28,8 +28,8 @@ #include "gcc_fixes.h" -#include "dsputil_altivec.h" - +#include "dsputil_ppc.h" +#include "util_altivec.h" // Swaps two variables (used for altivec registers) #define SWAP(a,b) \ do { \ diff --git a/libavcodec/ppc/util_altivec.h b/libavcodec/ppc/util_altivec.h new file mode 100644 index 0000000000..63b7f1a0c4 --- /dev/null +++ b/libavcodec/ppc/util_altivec.h @@ -0,0 +1,106 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file util_altivec.h + * Contains misc utility macros and inline functions + */ + +#ifndef UTIL_ALTIVEC_H +#define UTIL_ALTIVEC_H + +// used to build registers permutation vectors (vcprm) +// the 's' are for words in the _s_econd vector +#define WORD_0 0x00,0x01,0x02,0x03 +#define WORD_1 0x04,0x05,0x06,0x07 +#define WORD_2 0x08,0x09,0x0a,0x0b +#define WORD_3 0x0c,0x0d,0x0e,0x0f +#define WORD_s0 0x10,0x11,0x12,0x13 +#define WORD_s1 0x14,0x15,0x16,0x17 +#define WORD_s2 0x18,0x19,0x1a,0x1b +#define WORD_s3 0x1c,0x1d,0x1e,0x1f + +#ifdef __APPLE_CC__ +#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d) +#else +#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d} +#endif + +// vcprmle is used to keep the same index as in the SSE version. +// it's the same as vcprm, with the index inversed +// ('le' is Little Endian) +#define vcprmle(a,b,c,d) vcprm(d,c,b,a) + +// used to build inverse/identity vectors (vcii) +// n is _n_egative, p is _p_ositive +#define FLOAT_n -1. +#define FLOAT_p 1. 
+ + +#ifdef __APPLE_CC__ +#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) +#else +#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d} +#endif + +// Transpose 8x8 matrix of 16-bit elements (in-place) +#define TRANSPOSE8(a,b,c,d,e,f,g,h) \ +do { \ + vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \ + vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \ + \ + A1 = vec_mergeh (a, e); \ + B1 = vec_mergel (a, e); \ + C1 = vec_mergeh (b, f); \ + D1 = vec_mergel (b, f); \ + E1 = vec_mergeh (c, g); \ + F1 = vec_mergel (c, g); \ + G1 = vec_mergeh (d, h); \ + H1 = vec_mergel (d, h); \ + \ + A2 = vec_mergeh (A1, E1); \ + B2 = vec_mergel (A1, E1); \ + C2 = vec_mergeh (B1, F1); \ + D2 = vec_mergel (B1, F1); \ + E2 = vec_mergeh (C1, G1); \ + F2 = vec_mergel (C1, G1); \ + G2 = vec_mergeh (D1, H1); \ + H2 = vec_mergel (D1, H1); \ + \ + a = vec_mergeh (A2, E2); \ + b = vec_mergel (A2, E2); \ + c = vec_mergeh (B2, F2); \ + d = vec_mergel (B2, F2); \ + e = vec_mergeh (C2, G2); \ + f = vec_mergel (C2, G2); \ + g = vec_mergeh (D2, H2); \ + h = vec_mergel (D2, H2); \ +} while (0) + + +/** \brief loads unaligned vector \a *src with offset \a offset + and returns it */ +static inline vector unsigned char unaligned_load(int offset, uint8_t *src) +{ + register vector unsigned char first = vec_ld(offset, src); + register vector unsigned char second = vec_ld(offset+15, src); + register vector unsigned char mask = vec_lvsl(offset, src); + return vec_perm(first, second, mask); +} + +#endif /* UTIL_ALTIVEC_H */ diff --git a/libavcodec/ppc/vc1dsp_altivec.c b/libavcodec/ppc/vc1dsp_altivec.c index 99f72e1f04..c00f80f1e0 100644 --- a/libavcodec/ppc/vc1dsp_altivec.c +++ b/libavcodec/ppc/vc1dsp_altivec.c @@ -23,7 +23,7 @@ #include "gcc_fixes.h" -#include "dsputil_altivec.h" +#include "util_altivec.h" // main steps of 8x8 transform #define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
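
Note on the build split above: ppc/check_altivec.o is deliberately kept out
of ALTIVEC-OBJS-yes, so it is compiled without -maltivec and the compiler
cannot emit AltiVec instructions in the one translation unit that must run
on any CPU; the objects that do get -maltivec are meant to be entered only
after has_altivec() reports success. A minimal sketch of the call site that
consumes the check (dsputil_init_ppc(), MM_ALTIVEC, and
dsputil_init_altivec() follow the libavcodec naming of this period and are
assumptions here, not part of this patch):

    /* Sketch of runtime dispatch in ppc/dsputil_ppc.c-style code.
     * has_altivec() comes from check_altivec.c, built WITHOUT -maltivec. */
    #include "dsputil.h"   /* DSPContext, AVCodecContext, mm_flags */

    int  has_altivec(void);
    void dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx);

    void dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx)
    {
    #ifdef HAVE_ALTIVEC
        if (has_altivec()) {
            mm_flags |= MM_ALTIVEC;          /* advertise the capability */
            dsputil_init_altivec(c, avctx);  /* AltiVec-compiled code runs
                                                only past this point */
        }
    #endif
    }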