1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-12-23 12:43:46 +02:00

spelling/grammar/wording overhaul

Originally committed as revision 27190 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
This commit is contained in:
Diego Biurrun 2008-07-04 13:49:45 +00:00
parent 4bdc44c7fe
commit 8a3227968c
13 changed files with 178 additions and 176 deletions

View File

@ -2,8 +2,8 @@
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
* April 20, 2007 * April 20, 2007
* *
* Blackfin Video Color Space Converters Operations * Blackfin video color space converter operations
* convert I420 YV12 to RGB in various formats, * convert I420 YV12 to RGB in various formats
* *
* This file is part of FFmpeg. * This file is part of FFmpeg.
* *
@ -24,8 +24,8 @@
/* /*
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.
The following calculation is used for the conversion: The following calculation is used for the conversion:
@ -34,36 +34,36 @@ The following calculation is used for the conversion:
g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
b = clipz((y-oy)*cy + cbu*(u-128)) b = clipz((y-oy)*cy + cbu*(u-128))
y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision. y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
New factorization to eliminate the truncation error which was New factorization to eliminate the truncation error which was
occuring due to the byteop3p. occurring due to the byteop3p.
1) use the bytop16m to subtract quad bytes we use this in U8 this 1) Use the byteop16m to subtract quad bytes we use this in U8 this
then so the offsets need to be renormalized to 8bits. then so the offsets need to be renormalized to 8bits.
2) scale operands up by a factor of 4 not 8 because Blackfin 2) Scale operands up by a factor of 4 not 8 because Blackfin
multiplies include a shift. multiplies include a shift.
3) compute into the accumulators cy*yx0, cy*yx1 3) Compute into the accumulators cy*yx0, cy*yx1.
4) compute each of the linear equations 4) Compute each of the linear equations:
r = clipz((y - oy) * cy + crv * (v - 128)) r = clipz((y - oy) * cy + crv * (v - 128))
g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128)) g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
b = clipz((y - oy) * cy + cbu * (u - 128)) b = clipz((y - oy) * cy + cbu * (u - 128))
reuse of the accumulators requires that we actually multiply Reuse of the accumulators requires that we actually multiply
twice once with addition and the second time with a subtaction. twice once with addition and the second time with a subtraction.
because of this we need to compute the equations in the order R B Because of this we need to compute the equations in the order R B
then G saving the writes for B in the case of 24/32 bit color then G saving the writes for B in the case of 24/32 bit color
formats. formats.
api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
int dW, uint32_t *coeffs); int dW, uint32_t *coeffs);
A B A B
@ -77,13 +77,13 @@ uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv;
coeffs is a pointer to oy. coeffs is a pointer to oy.
the {rgb} masks are only utilized by the 565 packing algorithm. Note the data The {rgb} masks are only utilized by the 565 packing algorithm. Note the data
replication is used to simplify the internal algorithms for the dual mac architecture replication is used to simplify the internal algorithms for the dual MAC
of BlackFin. architecture of BlackFin.
All routines are exported with _ff_bfin_ as a symbol prefix All routines are exported with _ff_bfin_ as a symbol prefix.
rough performance gain compared against -O3: Rough performance gain compared against -O3:
2779809/1484290 187.28% 2779809/1484290 187.28%

View File

@ -1,10 +1,10 @@
/* /*
* rgb2rgb.c, Software RGB to RGB convertor * software RGB to RGB converter
* pluralize by Software PAL8 to RGB convertor * pluralize by software PAL8 to RGB converter
* Software YUV to YUV convertor * software YUV to YUV converter
* Software YUV to RGB convertor * software YUV to RGB converter
* Written by Nick Kurshev. * Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* *
* This file is part of FFmpeg. * This file is part of FFmpeg.
* *
@ -22,8 +22,8 @@
* along with FFmpeg; if not, write to the Free Software * along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
* *
* the C code (not assembly, mmx, ...) of this file can be used * The C code (not assembly, MMX, ...) of this file can be used
* under the LGPL license too * under the LGPL license.
*/ */
#include <inttypes.h> #include <inttypes.h>
#include "config.h" #include "config.h"
@ -33,7 +33,7 @@
#include "swscale.h" #include "swscale.h"
#include "swscale_internal.h" #include "swscale_internal.h"
#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit #define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients
void (*rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size); void (*rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size);
void (*rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size); void (*rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size);
@ -149,8 +149,8 @@ static uint64_t __attribute__((aligned(8))) dither8[2]={
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5)) #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
//Note: we have C, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one //Note: We have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW + MMX2 one.
//Plain C versions //plain C versions
#undef HAVE_MMX #undef HAVE_MMX
#undef HAVE_MMX2 #undef HAVE_MMX2
#undef HAVE_3DNOW #undef HAVE_3DNOW
@ -190,10 +190,10 @@ static uint64_t __attribute__((aligned(8))) dither8[2]={
#endif //ARCH_X86 || ARCH_X86_64 #endif //ARCH_X86 || ARCH_X86_64
/* /*
rgb15->rgb16 Original by Strepto/Astral RGB15->RGB16 original by Strepto/Astral
ported to gcc & bugfixed : A'rpi ported to gcc & bugfixed : A'rpi
MMX2, 3DNOW optimization by Nick Kurshev MMX2, 3DNOW optimization by Nick Kurshev
32bit c version, and and&add trick by Michael Niedermayer 32-bit C version, and and&add trick by Michael Niedermayer
*/ */
void sws_rgb2rgb_init(int flags){ void sws_rgb2rgb_init(int flags){
@ -266,7 +266,7 @@ void palette8torgb24(const uint8_t *src, uint8_t *dst, long num_pixels, const ui
{ {
long i; long i;
/* /*
writes 1 byte o much and might cause alignment issues on some architectures? Writes 1 byte too much and might cause alignment issues on some architectures?
for (i=0; i<num_pixels; i++) for (i=0; i<num_pixels; i++)
((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]]; ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]];
*/ */
@ -284,7 +284,7 @@ void palette8tobgr24(const uint8_t *src, uint8_t *dst, long num_pixels, const ui
{ {
long i; long i;
/* /*
writes 1 byte o much and might cause alignment issues on some architectures? Writes 1 byte too much and might cause alignment issues on some architectures?
for (i=0; i<num_pixels; i++) for (i=0; i<num_pixels; i++)
((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]]; ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]];
*/ */
@ -299,7 +299,7 @@ void palette8tobgr24(const uint8_t *src, uint8_t *dst, long num_pixels, const ui
} }
/** /**
* Palette is assumed to contain bgr16, see rgb32to16 to convert the palette * Palette is assumed to contain BGR16, see rgb32to16 to convert the palette.
*/ */
void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
{ {

View File

@ -1,8 +1,8 @@
/* /*
* rgb2rgb.h, Software RGB to RGB convertor * software RGB to RGB converter
* pluralize by Software PAL8 to RGB convertor * pluralize by software PAL8 to RGB converter
* Software YUV to YUV convertor * software YUV to YUV converter
* Software YUV to RGB convertor * software YUV to RGB converter
* Written by Nick Kurshev. * Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* *
@ -28,7 +28,7 @@
#include <inttypes.h> #include <inttypes.h>
/* A full collection of rgb to rgb(bgr) convertors */ /* A full collection of RGB to RGB(BGR) converters */
extern void (*rgb24to32) (const uint8_t *src, uint8_t *dst, long src_size); extern void (*rgb24to32) (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb24to16) (const uint8_t *src, uint8_t *dst, long src_size); extern void (*rgb24to16) (const uint8_t *src, uint8_t *dst, long src_size);
extern void (*rgb24to15) (const uint8_t *src, uint8_t *dst, long src_size); extern void (*rgb24to15) (const uint8_t *src, uint8_t *dst, long src_size);
@ -71,53 +71,49 @@ extern void palette8torgb15(const uint8_t *src, uint8_t *dst, long num_pixels, c
extern void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette); extern void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
/** /**
* * Height should be a multiple of 2 and width should be a multiple of 16.
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a * (If this is a problem for anyone then tell me, and I will fix it.)
* problem for anyone then tell me, and ill fix it) * Chrominance data is only taken from every second line, others are ignored.
* chrominance data is only taken from every secound line others are ignored FIXME write HQ version * FIXME: Write HQ version.
*/ */
//void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, //void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
/** /**
* * Height should be a multiple of 2 and width should be a multiple of 16.
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a * (If this is a problem for anyone then tell me, and I will fix it.)
* problem for anyone then tell me, and ill fix it)
*/ */
extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
long width, long height, long width, long height,
long lumStride, long chromStride, long dstStride); long lumStride, long chromStride, long dstStride);
/** /**
* * Width should be a multiple of 16.
* width should be a multiple of 16
*/ */
extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
long width, long height, long width, long height,
long lumStride, long chromStride, long dstStride); long lumStride, long chromStride, long dstStride);
/** /**
* * Height should be a multiple of 2 and width should be a multiple of 16.
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a * (If this is a problem for anyone then tell me, and I will fix it.)
* problem for anyone then tell me, and ill fix it)
*/ */
extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
long width, long height, long width, long height,
long lumStride, long chromStride, long srcStride); long lumStride, long chromStride, long srcStride);
/** /**
* * Height should be a multiple of 2 and width should be a multiple of 16.
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a * (If this is a problem for anyone then tell me, and I will fix it.)
* problem for anyone then tell me, and ill fix it)
*/ */
extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
long width, long height, long width, long height,
long lumStride, long chromStride, long dstStride); long lumStride, long chromStride, long dstStride);
/** /**
* * Height should be a multiple of 2 and width should be a multiple of 2.
* height should be a multiple of 2 and width should be a multiple of 2 (if this is a * (If this is a problem for anyone then tell me, and I will fix it.)
* problem for anyone then tell me, and ill fix it) * Chrominance data is only taken from every second line, others are ignored.
* chrominance data is only taken from every secound line others are ignored FIXME write HQ version * FIXME: Write HQ version.
*/ */
extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
long width, long height, long width, long height,

View File

@ -1,11 +1,11 @@
/* /*
* rgb2rgb.c, Software RGB to RGB convertor * software RGB to RGB converter
* pluralize by Software PAL8 to RGB convertor * pluralize by software PAL8 to RGB converter
* Software YUV to YUV convertor * software YUV to YUV converter
* Software YUV to RGB convertor * software YUV to RGB converter
* Written by Nick Kurshev. * Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* lot of big-endian byteorder fixes by Alex Beregszaszi * lot of big-endian byte order fixes by Alex Beregszaszi
* *
* This file is part of FFmpeg. * This file is part of FFmpeg.
* *
@ -23,7 +23,7 @@
* along with FFmpeg; if not, write to the Free Software * along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
* *
* The C code (not assembly, mmx, ...) of this file can be used * The C code (not assembly, MMX, ...) of this file can be used
* under the LGPL license. * under the LGPL license.
*/ */
@ -229,10 +229,10 @@ static inline void RENAME(rgb32to24)(const uint8_t *src, uint8_t *dst, long src_
} }
/* /*
Original by Strepto/Astral original by Strepto/Astral
ported to gcc & bugfixed : A'rpi ported to gcc & bugfixed: A'rpi
MMX2, 3DNOW optimization by Nick Kurshev MMX2, 3DNOW optimization by Nick Kurshev
32 bit C version, and and&add trick by Michael Niedermayer 32-bit C version, and and&add trick by Michael Niedermayer
*/ */
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size) static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{ {
@ -926,9 +926,9 @@ static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long s
---------------- ----------------
1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 0
|=======| |===| |=======| |===|
| Leftmost Bits Repeated to Fill Open Bits | leftmost bits repeated to fill open bits
| |
Original Bits original bits
*/ */
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size) static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
{ {
@ -1006,7 +1006,7 @@ static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_
:"=m"(*d) :"=m"(*d)
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
:"memory"); :"memory");
/* Borrowed 32 to 24 */ /* borrowed 32 to 24 */
asm volatile( asm volatile(
"movq %%mm0, %%mm4 \n\t" "movq %%mm0, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t" "movq %%mm3, %%mm5 \n\t"
@ -1147,7 +1147,7 @@ static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_
:"=m"(*d) :"=m"(*d)
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
:"memory"); :"memory");
/* Borrowed 32 to 24 */ /* borrowed 32 to 24 */
asm volatile( asm volatile(
"movq %%mm0, %%mm4 \n\t" "movq %%mm0, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t" "movq %%mm3, %%mm5 \n\t"
@ -1479,7 +1479,7 @@ static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long s
asm volatile(SFENCE:::"memory"); asm volatile(SFENCE:::"memory");
asm volatile(EMMS:::"memory"); asm volatile(EMMS:::"memory");
if (mmx_size==23) return; //finihsed, was multiple of 8 if (mmx_size==23) return; //finished, was multiple of 8
src+= src_size; src+= src_size;
dst+= src_size; dst+= src_size;
@ -1638,8 +1638,8 @@ asm( EMMS" \n\t"
} }
/** /**
* Height should be a multiple of 2 and width should be a multiple of 16 (if * Height should be a multiple of 2 and width should be a multiple of 16.
* this is a problem for anyone then tell me, and I will fix it). * (If this is a problem for anyone then tell me, and I will fix it.)
*/ */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
long width, long height, long width, long height,
@ -1720,7 +1720,7 @@ static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *u
(vc[0] << 8) + (yc[1] << 0); (vc[0] << 8) + (yc[1] << 0);
#else #else
*idst++ = uc[0] + (yc[0] << 8) + *idst++ = uc[0] + (yc[0] << 8) +
(vc[0] << 16) + (yc[1] << 24); (vc[0] << 16) + (yc[1] << 24);
#endif #endif
yc += 2; yc += 2;
uc++; uc++;
@ -1744,8 +1744,8 @@ asm( EMMS" \n\t"
} }
/** /**
* Height should be a multiple of 2 and width should be a multiple of 16 (if * Height should be a multiple of 2 and width should be a multiple of 16.
* this is a problem for anyone then tell me, and I will fix it). * (If this is a problem for anyone then tell me, and I will fix it.)
*/ */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
long width, long height, long width, long height,
@ -1766,8 +1766,8 @@ static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usr
} }
/** /**
* Height should be a multiple of 2 and width should be a multiple of 16 (if * Height should be a multiple of 2 and width should be a multiple of 16.
* this is a problem for anyone then tell me, and I will fix it). * (If this is a problem for anyone then tell me, and I will fix it.)
*/ */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
long width, long height, long width, long height,
@ -2002,9 +2002,9 @@ asm volatile( EMMS" \n\t"
} }
/** /**
* Height should be a multiple of 2 and width should be a multiple of 16 (if * Height should be a multiple of 2 and width should be a multiple of 16.
* this is a problem for anyone then tell me, and I will fix it). * (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every secound line, others are ignored. * Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write HQ version. * FIXME: Write HQ version.
*/ */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
@ -2128,9 +2128,9 @@ asm volatile( EMMS" \n\t"
} }
/** /**
* Height should be a multiple of 2 and width should be a multiple of 2 (if * Height should be a multiple of 2 and width should be a multiple of 2.
* this is a problem for anyone then tell me, and I will fix it). * (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every secound line, * Chrominance data is only taken from every second line,
* others are ignored in the C version. * others are ignored in the C version.
* FIXME: Write HQ version. * FIXME: Write HQ version.
*/ */

View File

@ -245,12 +245,12 @@ static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int
src_v = vec_mergeh(src_v, (vector signed short)vzero); src_v = vec_mergeh(src_v, (vector signed short)vzero);
filter_v = vec_ld(i << 3, filter); filter_v = vec_ld(i << 3, filter);
// the 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2) // The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
// the neat trick : we only care for half the elements, // The neat trick: We only care for half the elements,
// high or low depending on (i<<3)%16 (it's 0 or 8 here), // high or low depending on (i<<3)%16 (it's 0 or 8 here),
// and we're going to use vec_mule, so we chose // and we're going to use vec_mule, so we choose
// carefully how to "unpack" the elements into the even slots // carefully how to "unpack" the elements into the even slots.
if ((i << 3) % 16) if ((i << 3) % 16)
filter_v = vec_mergel(filter_v, (vector signed short)vzero); filter_v = vec_mergel(filter_v, (vector signed short)vzero);
else else
@ -405,12 +405,12 @@ static inline int yv12toyuy2_unscaled_altivec(SwsContext *c, uint8_t* src[], int
return srcSliceH; return srcSliceH;
} }
/* this code assume: /* This code assumes:
1) dst is 16 bytes-aligned 1) dst is 16 bytes-aligned
2) dstStride is a multiple of 16 2) dstStride is a multiple of 16
3) width is a multiple of 16 3) width is a multiple of 16
4) lum&chrom stride are multiple of 8 4) lum & chrom stride are multiples of 8
*/ */
for (y=0; y<height; y++) { for (y=0; y<height; y++) {
@ -482,12 +482,12 @@ static inline int yv12touyvy_unscaled_altivec(SwsContext *c, uint8_t* src[], int
return srcSliceH; return srcSliceH;
} }
/* this code assume: /* This code assumes:
1) dst is 16 bytes-aligned 1) dst is 16 bytes-aligned
2) dstStride is a multiple of 16 2) dstStride is a multiple of 16
3) width is a multiple of 16 3) width is a multiple of 16
4) lum&chrom stride are multiple of 8 4) lum & chrom stride are multiples of 8
*/ */
for (y=0; y<height; y++) { for (y=0; y<height; y++) {

View File

@ -1,7 +1,7 @@
/* /*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
* *
* Blackfin Software Video SCALER Operations * Blackfin software video scaler operations
* *
* This file is part of FFmpeg. * This file is part of FFmpeg.
* *

View File

@ -37,7 +37,7 @@
typedef int (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY, typedef int (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[]); int srcSliceH, uint8_t* dst[], int dstStride[]);
/* this struct should be aligned on at least 32-byte boundary */ /* This struct should be aligned on at least a 32-byte boundary. */
typedef struct SwsContext{ typedef struct SwsContext{
/** /**
* info on struct for av_log * info on struct for av_log
@ -73,7 +73,7 @@ typedef struct SwsContext{
int16_t *vChrFilter; int16_t *vChrFilter;
int16_t *vChrFilterPos; int16_t *vChrFilterPos;
uint8_t formatConvBuffer[VOF]; //FIXME dynamic alloc, but we have to change a lot of code for this to be useful uint8_t formatConvBuffer[VOF]; //FIXME dynamic allocation, but we have to change a lot of code for this to be useful
int hLumFilterSize; int hLumFilterSize;
int hChrFilterSize; int hChrFilterSize;
@ -122,7 +122,7 @@ typedef struct SwsContext{
#define V_OFFSET "10*8" #define V_OFFSET "10*8"
#define LUM_MMX_FILTER_OFFSET "11*8" #define LUM_MMX_FILTER_OFFSET "11*8"
#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256" #define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the asm #define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM
#define ESP_OFFSET "11*8+4*4*256*2+8" #define ESP_OFFSET "11*8+4*4*256*2+8"
#define VROUNDER_OFFSET "11*8+4*4*256*2+16" #define VROUNDER_OFFSET "11*8+4*4*256*2+16"
#define U_TEMP "11*8+4*4*256*2+24" #define U_TEMP "11*8+4*4*256*2+24"

View File

@ -17,8 +17,8 @@
* along with FFmpeg; if not, write to the Free Software * along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
* *
* the C code (not assembly, mmx, ...) of this file can be used * The C code (not assembly, MMX, ...) of this file can be used
* under the LGPL license too * under the LGPL license.
*/ */
#undef REAL_MOVNTQ #undef REAL_MOVNTQ
@ -30,7 +30,7 @@
#undef SFENCE #undef SFENCE
#ifdef HAVE_3DNOW #ifdef HAVE_3DNOW
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */ /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS "femms" #define EMMS "femms"
#else #else
#define EMMS "emms" #define EMMS "emms"
@ -1503,7 +1503,7 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *
const int yalpha1=0; const int yalpha1=0;
int i; int i;
uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1 uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
const int yalpha= 4096; //FIXME ... const int yalpha= 4096; //FIXME ...
if (flags&SWS_FULL_CHR_H_INT) if (flags&SWS_FULL_CHR_H_INT)
@ -1700,7 +1700,7 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *
} }
} }
//FIXME yuy2* can read upto 7 samples to much //FIXME yuy2* can read up to 7 samples too much
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width) static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
{ {
@ -2297,7 +2297,7 @@ static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1,
} }
} }
// Bilinear / Bicubic scaling // bilinear / bicubic scaling
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
int16_t *filter, int16_t *filterPos, long filterSize) int16_t *filter, int16_t *filterPos, long filterSize)
{ {
@ -2544,7 +2544,7 @@ static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, i
} }
#ifdef HAVE_MMX #ifdef HAVE_MMX
// use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one) // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else #else
if (!(flags&SWS_FAST_BILINEAR)) if (!(flags&SWS_FAST_BILINEAR))
@ -2552,7 +2552,7 @@ static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, i
{ {
RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
} }
else // Fast Bilinear upscale / crap downscale else // fast bilinear upscale / crap downscale
{ {
#if defined(ARCH_X86) #if defined(ARCH_X86)
#ifdef HAVE_MMX2 #ifdef HAVE_MMX2
@ -2761,7 +2761,7 @@ inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1,
} }
#ifdef HAVE_MMX #ifdef HAVE_MMX
// use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one) // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else #else
if (!(flags&SWS_FAST_BILINEAR)) if (!(flags&SWS_FAST_BILINEAR))
@ -2770,7 +2770,7 @@ inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1,
RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
} }
else // Fast Bilinear upscale / crap downscale else // fast bilinear upscale / crap downscale
{ {
#if defined(ARCH_X86) #if defined(ARCH_X86)
#ifdef HAVE_MMX2 #ifdef HAVE_MMX2
@ -2890,8 +2890,8 @@ FUNNY_UV_CODE
"cmp %2, %%"REG_a" \n\t" "cmp %2, %%"REG_a" \n\t"
" jb 1b \n\t" " jb 1b \n\t"
/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here, /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
which is needed to support GCC-4.0 */ which is needed to support GCC 4.0. */
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) #if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
:: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else #else
@ -2963,7 +2963,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
int lastDstY; int lastDstY;
uint8_t *pal=NULL; uint8_t *pal=NULL;
/* vars whch will change and which we need to storw back in the context */ /* vars which will change and which we need to store back in the context */
int dstY= c->dstY; int dstY= c->dstY;
int lumBufIndex= c->lumBufIndex; int lumBufIndex= c->lumBufIndex;
int chrBufIndex= c->chrBufIndex; int chrBufIndex= c->chrBufIndex;
@ -3004,13 +3004,14 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
if (flags & SWS_PRINT_INFO && firstTime) if (flags & SWS_PRINT_INFO && firstTime)
{ {
av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n" av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
" ->cannot do aligned memory acesses anymore\n"); " ->cannot do aligned memory accesses anymore\n");
firstTime=0; firstTime=0;
} }
} }
/* Note the user might start scaling the picture in the middle so this will not get executed /* Note the user might start scaling the picture in the middle so this
this is not really intended but works currently, so ppl might do it */ will not get executed. This is not really intended but works
currently, so people might do it. */
if (srcSliceY ==0){ if (srcSliceY ==0){
lumBufIndex=0; lumBufIndex=0;
chrBufIndex=0; chrBufIndex=0;
@ -3182,7 +3183,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
{ {
const int chrSkipMask= (1<<c->chrDstVSubSample)-1; const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
if (vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
{ {
int16_t *lumBuf = lumPixBuf[0]; int16_t *lumBuf = lumPixBuf[0];
int16_t *chrBuf= chrPixBuf[0]; int16_t *chrBuf= chrPixBuf[0];
@ -3200,13 +3201,13 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
{ {
ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
if (vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
{ {
int chrAlpha= vChrFilter[2*dstY+1]; int chrAlpha= vChrFilter[2*dstY+1];
RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
dest, dstW, chrAlpha, dstFormat, flags, dstY); dest, dstW, chrAlpha, dstFormat, flags, dstY);
} }
else if (vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
{ {
int lumAlpha= vLumFilter[2*dstY+1]; int lumAlpha= vLumFilter[2*dstY+1];
int chrAlpha= vChrFilter[2*dstY+1]; int chrAlpha= vChrFilter[2*dstY+1];
@ -3217,7 +3218,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
dest, dstW, lumAlpha, chrAlpha, dstY); dest, dstW, lumAlpha, chrAlpha, dstY);
} }
else //General RGB else //general RGB
{ {
RENAME(yuv2packedX)(c, RENAME(yuv2packedX)(c,
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,

View File

@ -39,7 +39,7 @@
#include "swscale.h" #include "swscale.h"
#include "swscale_internal.h" #include "swscale_internal.h"
#define DITHER1XBPP // only for mmx #define DITHER1XBPP // only for MMX
const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={ const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={
{ 1, 3, 1, 3, 1, 3, 1, 3, }, { 1, 3, 1, 3, 1, 3, 1, 3, },
@ -155,8 +155,8 @@ DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw) = 0x00ff00ff00ff00ffULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL; DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL; DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
// the volatile is required because gcc otherwise optimizes some writes away not knowing that these // The volatile is required because gcc otherwise optimizes some writes away
// are read in the asm block // not knowing that these are read in the ASM block.
static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither; static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither;
static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither; static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither;
static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither; static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither;
@ -641,7 +641,7 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
} }
#endif #endif
av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found\n"); av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found.\n");
switch(c->dstFormat){ switch(c->dstFormat){
case PIX_FMT_BGR32: case PIX_FMT_BGR32:

View File

@ -21,63 +21,68 @@
*/ */
/* /*
convert I420 YV12 to RGB in various formats, Convert I420 YV12 to RGB in various formats,
it rejects images that are not in 420 formats it rejects images that are not in 420 formats,
it rejects images that don't have widths of multiples of 16 it rejects images that don't have widths of multiples of 16,
it rejects images that don't have heights of multiples of 2 it rejects images that don't have heights of multiples of 2.
reject defers to C simulation codes. Reject defers to C simulation code.
lots of optimizations to be done here Lots of optimizations to be done here.
1. need to fix saturation code, I just couldn't get it to fly with packs and adds. 1. Need to fix saturation code. I just couldn't get it to fly with packs
so we currently use max min to clip and adds, so we currently use max/min to clip.
2. the inefficient use of chroma loading needs a bit of brushing up 2. The inefficient use of chroma loading needs a bit of brushing up.
3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls 3. Analysis of pipeline stalls needs to be done. Use shark to identify
pipeline stalls.
MODIFIED to calculate coeffs from currently selected color space. MODIFIED to calculate coeffs from currently selected color space.
MODIFIED core to be a macro which you spec the output format. MODIFIED core to be a macro where you specify the output format.
ADDED UYVY conversion which is never called due to some thing in SWSCALE. ADDED UYVY conversion which is never called due to something in swscale.
CORRECTED algorithm selection to be strict on input formats. CORRECTED algorithm selection to be strict on input formats.
ADDED runtime detection of altivec. ADDED runtime detection of AltiVec.
ADDED altivec_yuv2packedX vertical scl + RGB converter ADDED altivec_yuv2packedX vertical scl + RGB converter
March 27,2004 March 27,2004
PERFORMANCE ANALYSIS PERFORMANCE ANALYSIS
The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence used as test.
The AltiVec version uses 10% of the processor or ~100Mips for D1 video
same sequence.
720*480*30 ~10MPS 720 * 480 * 30 ~10MPS
so we have roughly 10clocks per pixel this is too high something has to be wrong. so we have roughly 10 clocks per pixel. This is too high, something has
to be wrong.
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min. OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
need for vec_min.
OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
guaranteed to have the input video frame it was just decompressed so the input video frame, it was just decompressed so it probably resides in L1
it probably resides in L1 caches. However we are creating the caches. However, we are creating the output video stream. This needs to use the
output video stream this needs to use the DSTST instruction to DSTST instruction to optimize for the cache. We couple this with the fact that
optimize for the cache. We couple this with the fact that we are we are not going to be visiting the input buffer again so we mark it Least
not going to be visiting the input buffer again so we mark it Least Recently Used. This shaves 25% of the processor cycles off.
Recently Used. This shaves 25% of the processor cycles off.
Now MEMCPY is the largest mips consumer in the system, probably due Now memcpy is the largest mips consumer in the system, probably due
to the inefficient X11 stuff. to the inefficient X11 stuff.
GL libraries seem to be very slow on this machine 1.33Ghz PB running GL libraries seem to be very slow on this machine 1.33Ghz PB running
Jaguar, this is not the case for my 1Ghz PB. I thought it might be Jaguar, this is not the case for my 1Ghz PB. I thought it might be
a versioning issues, however I have libGL.1.2.dylib for both a versioning issue, however I have libGL.1.2.dylib for both
machines. ((We need to figure this out now)) machines. (We need to figure this out now.)
GL2 libraries work now with patch for RGB32 GL2 libraries work now with patch for RGB32.
NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. Integrated luma prescaling adjustment for saturation/contrast/brightness
adjustment.
*/ */
#include <stdio.h> #include <stdio.h>

View File

@ -1,9 +1,8 @@
/* /*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
* April 20, 2007
* *
* Blackfin Video Color Space Converters Operations * Blackfin video color space converter operations
* convert I420 YV12 to RGB in various formats, * convert I420 YV12 to RGB in various formats
* *
* This file is part of FFmpeg. * This file is part of FFmpeg.
* *
@ -200,7 +199,7 @@ SwsFunc ff_bfin_yuv2rgb_get_func_ptr (SwsContext *c)
return 0; return 0;
} }
av_log(c, AV_LOG_INFO, "BlackFin Accelerated Color Space Converter %s\n", av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
sws_format_name (c->dstFormat)); sws_format_name (c->dstFormat));
return f; return f;

View File

@ -1,5 +1,6 @@
/* /*
* yuv2rgb_mlib.c, Software YUV to RGB converter using mediaLib * software YUV to RGB converter using mediaLib
*
* Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at> * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
* *
* This file is part of FFmpeg. * This file is part of FFmpeg.

View File

@ -1,5 +1,5 @@
/* /*
* yuv2rgb_mmx.c, Software YUV to RGB converter with Intel MMX "technology" * yuv2rgb_mmx.c, software YUV to RGB converter with Intel MMX "technology"
* *
* Copyright (C) 2000, Silicon Integrated System Corp. * Copyright (C) 2000, Silicon Integrated System Corp.
* *
@ -31,7 +31,7 @@
#undef SFENCE #undef SFENCE
#ifdef HAVE_3DNOW #ifdef HAVE_3DNOW
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */ /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS "femms" #define EMMS "femms"
#else #else
#define EMMS "emms" #define EMMS "emms"
@ -147,8 +147,8 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStr
g6Dither= ff_dither4[y&1]; g6Dither= ff_dither4[y&1];
g5Dither= ff_dither8[y&1]; g5Dither= ff_dither8[y&1];
r5Dither= ff_dither8[(y+1)&1]; r5Dither= ff_dither8[(y+1)&1];
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 /* This MMX assembly code deals with a SINGLE scan line at a time,
pixels in each iteration */ * it converts 8 pixels in each iteration. */
asm volatile ( asm volatile (
/* load data for start of next scan line */ /* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
@ -156,8 +156,8 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStr
"movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
//".balign 16 \n\t" //".balign 16 \n\t"
"1: \n\t" "1: \n\t"
/* no speed diference on my p3@500 with prefetch, /* No speed difference on my p3@500 with prefetch,
* if it is faster for anyone with -benchmark then tell me * if it is faster for anyone with -benchmark then tell me.
PREFETCH" 64(%0) \n\t" PREFETCH" 64(%0) \n\t"
PREFETCH" 64(%1) \n\t" PREFETCH" 64(%1) \n\t"
PREFETCH" 64(%2) \n\t" PREFETCH" 64(%2) \n\t"
@ -180,7 +180,7 @@ YUV2RGB
"movq %%mm0, %%mm5;" /* Copy B7-B0 */ "movq %%mm0, %%mm5;" /* Copy B7-B0 */
"movq %%mm2, %%mm7;" /* Copy G7-G0 */ "movq %%mm2, %%mm7;" /* Copy G7-G0 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */ /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
"punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
"punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
@ -190,7 +190,7 @@ YUV2RGB
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */ /* convert RGB24 plane to RGB16 pack for pixel 4-7 */
"punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
"punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
@ -242,8 +242,8 @@ static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStr
g6Dither= ff_dither4[y&1]; g6Dither= ff_dither4[y&1];
g5Dither= ff_dither8[y&1]; g5Dither= ff_dither8[y&1];
r5Dither= ff_dither8[(y+1)&1]; r5Dither= ff_dither8[(y+1)&1];
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 /* This MMX assembly code deals with a SINGLE scan line at a time,
pixels in each iteration */ * it converts 8 pixels in each iteration. */
asm volatile ( asm volatile (
/* load data for start of next scan line */ /* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
@ -271,7 +271,7 @@ YUV2RGB
"movq %%mm0, %%mm5;" /* Copy B7-B0 */ "movq %%mm0, %%mm5;" /* Copy B7-B0 */
"movq %%mm2, %%mm7;" /* Copy G7-G0 */ "movq %%mm2, %%mm7;" /* Copy G7-G0 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */ /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
"punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */ "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
"punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
@ -281,7 +281,7 @@ YUV2RGB
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */ /* convert RGB24 plane to RGB16 pack for pixel 4-7 */
"punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */ "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
"punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
@ -326,8 +326,8 @@ static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStr
uint8_t *pv = src[2] + (y>>1)*srcStride[2]; uint8_t *pv = src[2] + (y>>1)*srcStride[2];
long index= -h_size/2; long index= -h_size/2;
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 /* This MMX assembly code deals with a SINGLE scan line at a time,
pixels in each iteration */ * it converts 8 pixels in each iteration. */
asm volatile ( asm volatile (
/* load data for start of next scan line */ /* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
@ -472,8 +472,8 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStr
uint8_t *pv = src[2] + (y>>1)*srcStride[2]; uint8_t *pv = src[2] + (y>>1)*srcStride[2];
long index= -h_size/2; long index= -h_size/2;
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8 /* This MMX assembly code deals with a SINGLE scan line at a time,
pixels in each iteration */ * it converts 8 pixels in each iteration. */
asm volatile ( asm volatile (
/* load data for start of next scan line */ /* load data for start of next scan line */
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */