mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-21 10:55:51 +02:00
whitespace cosmetics
Originally committed as revision 27188 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
This commit is contained in:
parent
7a4d5e174c
commit
4bdc44c7fe
@ -24,74 +24,73 @@
|
||||
|
||||
|
||||
/*
|
||||
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
|
||||
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts
|
||||
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
|
||||
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts
|
||||
|
||||
|
||||
The following calculation is used for the conversion:
|
||||
The following calculation is used for the conversion:
|
||||
|
||||
r = clipz((y-oy)*cy + crv*(v-128))
|
||||
g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
|
||||
b = clipz((y-oy)*cy + cbu*(u-128))
|
||||
r = clipz((y-oy)*cy + crv*(v-128))
|
||||
g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
|
||||
b = clipz((y-oy)*cy + cbu*(u-128))
|
||||
|
||||
y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
|
||||
y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
|
||||
|
||||
|
||||
New factorization to eliminate the truncation error which was
|
||||
occurring due to the byteop3p.
|
||||
New factorization to eliminate the truncation error which was
|
||||
occurring due to the byteop3p.
|
||||
|
||||
|
||||
1) use the byteop16m to subtract quad bytes we use this in U8 this
|
||||
then so the offsets need to be renormalized to 8bits.
|
||||
1) use the byteop16m to subtract quad bytes we use this in U8 this
|
||||
then so the offsets need to be renormalized to 8bits.
|
||||
|
||||
2) scale operands up by a factor of 4 not 8 because Blackfin
|
||||
multiplies include a shift.
|
||||
2) scale operands up by a factor of 4 not 8 because Blackfin
|
||||
multiplies include a shift.
|
||||
|
||||
3) compute into the accumulators cy*yx0, cy*yx1
|
||||
3) compute into the accumulators cy*yx0, cy*yx1
|
||||
|
||||
4) compute each of the linear equations
|
||||
r = clipz((y-oy)*cy + crv*(v-128))
|
||||
4) compute each of the linear equations
|
||||
r = clipz((y - oy) * cy + crv * (v - 128))
|
||||
|
||||
g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
|
||||
g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
|
||||
|
||||
b = clipz((y-oy)*cy + cbu*(u-128))
|
||||
b = clipz((y - oy) * cy + cbu * (u - 128))
|
||||
|
||||
reuse of the accumulators requires that we actually multiply
|
||||
twice once with addition and the second time with a subtraction.
|
||||
reuse of the accumulators requires that we actually multiply
|
||||
twice once with addition and the second time with a subtraction.
|
||||
|
||||
because of this we need to compute the equations in the order R B
|
||||
then G saving the writes for B in the case of 24/32 bit color
|
||||
formats.
|
||||
because of this we need to compute the equations in the order R B
|
||||
then G saving the writes for B in the case of 24/32 bit color
|
||||
formats.
|
||||
|
||||
api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
|
||||
int dW, uint32_t *coeffs);
|
||||
api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
|
||||
int dW, uint32_t *coeffs);
|
||||
|
||||
A B
|
||||
--- ---
|
||||
i2 = cb i3 = cr
|
||||
i1 = coeff i0 = y
|
||||
A B
|
||||
--- ---
|
||||
i2 = cb i3 = cr
|
||||
i1 = coeff i0 = y
|
||||
|
||||
Where coeffs have the following layout in memory.
|
||||
Where coeffs have the following layout in memory.
|
||||
|
||||
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv;
|
||||
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv;
|
||||
|
||||
coeffs is a pointer to oy.
|
||||
coeffs is a pointer to oy.
|
||||
|
||||
the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
|
||||
replication is used to simplify the internal algorithms for the dual mac architecture
|
||||
of BlackFin.
|
||||
the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
|
||||
replication is used to simplify the internal algorithms for the dual mac architecture
|
||||
of BlackFin.
|
||||
|
||||
All routines are exported with _ff_bfin_ as a symbol prefix
|
||||
All routines are exported with _ff_bfin_ as a symbol prefix
|
||||
|
||||
rough performance gain compared against -O3:
|
||||
rough performance gain compared against -O3:
|
||||
|
||||
2779809/1484290 187.28%
|
||||
|
||||
which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
|
||||
c/pel for the optimized implementations. Not sure why there is such a
|
||||
huge variation on the reference codes on Blackfin I guess it must have
|
||||
to do with the memory system.
|
||||
2779809/1484290 187.28%
|
||||
|
||||
which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
|
||||
c/pel for the optimized implementations. Not sure why there is such a
|
||||
huge variation on the reference codes on Blackfin I guess it must have
|
||||
to do with the memory system.
|
||||
*/
|
||||
|
||||
#define mL3 .text
|
||||
|
@ -21,63 +21,63 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
convert I420 YV12 to RGB in various formats,
|
||||
it rejects images that are not in 420 formats
|
||||
it rejects images that don't have widths of multiples of 16
|
||||
it rejects images that don't have heights of multiples of 2
|
||||
reject defers to C simulation codes.
|
||||
convert I420 YV12 to RGB in various formats,
|
||||
it rejects images that are not in 420 formats
|
||||
it rejects images that don't have widths of multiples of 16
|
||||
it rejects images that don't have heights of multiples of 2
|
||||
reject defers to C simulation codes.
|
||||
|
||||
lots of optimizations to be done here
|
||||
lots of optimizations to be done here
|
||||
|
||||
1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
|
||||
so we currently use max min to clip
|
||||
1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
|
||||
so we currently use max min to clip
|
||||
|
||||
2. the inefficient use of chroma loading needs a bit of brushing up
|
||||
2. the inefficient use of chroma loading needs a bit of brushing up
|
||||
|
||||
3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
|
||||
3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
|
||||
|
||||
|
||||
MODIFIED to calculate coeffs from currently selected color space.
|
||||
MODIFIED core to be a macro which you spec the output format.
|
||||
ADDED UYVY conversion which is never called due to some thing in SWSCALE.
|
||||
CORRECTED algorithm selection to be strict on input formats.
|
||||
ADDED runtime detection of altivec.
|
||||
MODIFIED to calculate coeffs from currently selected color space.
|
||||
MODIFIED core to be a macro which you spec the output format.
|
||||
ADDED UYVY conversion which is never called due to some thing in SWSCALE.
|
||||
CORRECTED algorithm selection to be strict on input formats.
|
||||
ADDED runtime detection of altivec.
|
||||
|
||||
ADDED altivec_yuv2packedX vertical scl + RGB converter
|
||||
ADDED altivec_yuv2packedX vertical scl + RGB converter
|
||||
|
||||
March 27,2004
|
||||
PERFORMANCE ANALYSIS
|
||||
March 27,2004
|
||||
PERFORMANCE ANALYSIS
|
||||
|
||||
The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test
|
||||
The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
|
||||
The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test
|
||||
The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
|
||||
|
||||
720*480*30 ~10MPS
|
||||
720*480*30 ~10MPS
|
||||
|
||||
so we have roughly 10clocks per pixel this is too high something has to be wrong.
|
||||
so we have roughly 10clocks per pixel this is too high something has to be wrong.
|
||||
|
||||
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
|
||||
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
|
||||
|
||||
OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
|
||||
guaranteed to have the input video frame it was just decompressed so
|
||||
it probably resides in L1 caches. However we are creating the
|
||||
output video stream this needs to use the DSTST instruction to
|
||||
optimize for the cache. We couple this with the fact that we are
|
||||
not going to be visiting the input buffer again so we mark it Least
|
||||
Recently Used. This shaves 25% of the processor cycles off.
|
||||
OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
|
||||
guaranteed to have the input video frame it was just decompressed so
|
||||
it probably resides in L1 caches. However we are creating the
|
||||
output video stream this needs to use the DSTST instruction to
|
||||
optimize for the cache. We couple this with the fact that we are
|
||||
not going to be visiting the input buffer again so we mark it Least
|
||||
Recently Used. This shaves 25% of the processor cycles off.
|
||||
|
||||
Now MEMCPY is the largest mips consumer in the system, probably due
|
||||
to the inefficient X11 stuff.
|
||||
Now MEMCPY is the largest mips consumer in the system, probably due
|
||||
to the inefficient X11 stuff.
|
||||
|
||||
GL libraries seem to be very slow on this machine 1.33Ghz PB running
|
||||
Jaguar, this is not the case for my 1Ghz PB. I thought it might be
|
||||
a versioning issue, however I have libGL.1.2.dylib for both
|
||||
machines. ((We need to figure this out now))
|
||||
GL libraries seem to be very slow on this machine 1.33Ghz PB running
|
||||
Jaguar, this is not the case for my 1Ghz PB. I thought it might be
|
||||
a versioning issue, however I have libGL.1.2.dylib for both
|
||||
machines. ((We need to figure this out now))
|
||||
|
||||
GL2 libraries work now with patch for RGB32
|
||||
GL2 libraries work now with patch for RGB32
|
||||
|
||||
NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
|
||||
NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
|
||||
|
||||
Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
|
||||
Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
Loading…
Reference in New Issue
Block a user