mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-24 13:56:33 +02:00
Blackfin optimized YUV420 to RGB CSC Color Space Converters.
YUV2 -> RGB BGR for 565, 555 and 888 a.k.a. 24bit color. Speed-up compared to C version compiled with -O3 187.28% Patch by Marc Hoffman %mmh A pleasantst P com% Original thread: Date: May 9, 2007 2:46 AM Subject: [FFmpeg-devel] PATCH BlackFin yuv2rgb color space conversion Originally committed as revision 23307 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
This commit is contained in:
parent
79d4c96a1a
commit
d3f3eea92d
@ -12,6 +12,9 @@ OBJS= swscale.o rgb2rgb.o
|
||||
OBJS-$(TARGET_ALTIVEC) += yuv2rgb_altivec.o
|
||||
OBJS-$(CONFIG_GPL) += yuv2rgb.o
|
||||
|
||||
OBJS-$(TARGET_ARCH_BFIN) += yuv2rgb_bfin.o
|
||||
ASM_OBJS-$(TARGET_ARCH_BFIN) += internal_bfin.o
|
||||
|
||||
HEADERS = swscale.h rgb2rgb.h
|
||||
|
||||
include ../common.mak
|
||||
|
454
libswscale/internal_bfin.S
Normal file
454
libswscale/internal_bfin.S
Normal file
@ -0,0 +1,454 @@
|
||||
/*
|
||||
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
|
||||
* April 20, 2007
|
||||
*
|
||||
* Blackfin Video Color Space Converters Operations
|
||||
* convert I420 YV12 to RGB in various formats,
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
|
||||
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts
|
||||
|
||||
|
||||
The following calculation is used for the conversion:
|
||||
|
||||
r = clipz((y-oy)*cy + crv*(v-128))
|
||||
g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
|
||||
b = clipz((y-oy)*cy + cbu*(u-128))
|
||||
|
||||
y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
|
||||
|
||||
|
||||
New factorization to eliminate the truncation error which was
|
||||
occurring due to the byteop3p.
|
||||
|
||||
|
||||
1) use the byteop16m to subtract quad bytes we use this in U8 this
|
||||
then so the offsets need to be renormalized to 8bits.
|
||||
|
||||
2) scale operands up by a factor of 4 not 8 because Blackfin
|
||||
multiplies include a shift.
|
||||
|
||||
3) compute into the accumulators cy*yx0, cy*yx1
|
||||
|
||||
4) compute each of the linear equations
|
||||
r = clipz((y-oy)*cy + crv*(v-128))
|
||||
|
||||
g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128))
|
||||
|
||||
b = clipz((y-oy)*cy + cbu*(u-128))
|
||||
|
||||
reuse of the accumulators requires that we actually multiply
|
||||
twice once with addition and the second time with a subtraction.
|
||||
|
||||
because of this we need to compute the equations in the order R B
|
||||
then G saving the writes for B in the case of 24/32 bit color
|
||||
formats.
|
||||
|
||||
api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
|
||||
int dW, uint32_t *coeffs);
|
||||
|
||||
A B
|
||||
--- ---
|
||||
i2 = cb i3 = cr
|
||||
i1 = coeff i0 = y
|
||||
|
||||
Where coeffs have the following layout in memory.
|
||||
|
||||
uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
|
||||
|
||||
coeffs is a pointer to oy.
|
||||
|
||||
the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
|
||||
replication is used to simplify the internal algorithms for the dual mac architecture
|
||||
of BlackFin.
|
||||
|
||||
All routines are exported with _ff_bfin_ as a symbol prefix
|
||||
|
||||
rough performance gain compared against -O3:
|
||||
|
||||
2779809/1484290 187.28%
|
||||
|
||||
which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
|
||||
c/pel for the optimized implementations. Not sure why there is such a
|
||||
huge variation on the reference codes on Blackfin I guess it must have
|
||||
to do with the memory system.
|
||||
|
||||
*/
|
||||
|
||||
/* Section names the routines can be placed in; MEM selects the one in use. */
#define mL1 .l1.text
#define mL3 .text
#define MEM mL1

/*
 * DEFUN(fname, where, interface)
 *   Opens an exported function: switches to section `where`, declares the
 *   global symbol _ff_bfin_<fname> as a function, aligns it and emits the
 *   symbol name (the caller supplies the trailing ':').
 *   `interface` is documentation only and does not appear in the expansion.
 */
#define DEFUN(fname,where,interface)            \
        .section where;                         \
        .global _ff_bfin_ ## fname;             \
        .type _ff_bfin_ ## fname, STT_FUNC;     \
        .align 8;                               \
        _ff_bfin_ ## fname

/*
 * DEFUN_END(fname)
 *   Records the size of _ff_bfin_<fname>; fname must match the DEFUN that
 *   opened the function.
 */
#define DEFUN_END(fname)                        \
        .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname

.text

/* Byte length of the coefficient table (11 32-bit words); loaded into l1. */
#define COEFF_LEN        11*4
/* Post-modify step (in bytes) loaded into m0 and applied after the gmask load. */
#define COEFF_REL_CY_OFF 4*4

/* Offsets from fp used to fetch the out, dW and coeffs arguments. */
#define ARG_OUT   20
#define ARG_W     24
#define ARG_COEFF 28
|
||||
|
||||
/*
 * _ff_bfin_yuv2rgb565_line
 *
 * Converts one line of planar YUV to packed 16-bit pixels (5/6/5 bit fields).
 *
 *   r0 = Y pointer, r1 = U pointer, r2 = V pointer   (copied to i0, i2, i3)
 *   [fp+ARG_OUT] = output pointer, [fp+ARG_W] = dW, [fp+ARG_COEFF] = coeffs
 *
 * The loop runs dW>>2 times.  Each pass consumes one 32-bit word of Y
 * (4 samples) plus one 16-bit word each of U and V (2 samples each) and
 * stores two 32-bit words to the output.
 *
 * coeffs is walked through i1 with b1/l1 set to its base and COEFF_LEN, in
 * the order  oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv, gmask.
 *
 * Note: the samples for the next pass are fetched at the bottom of the
 * loop, so the final pass still reads one further group of Y/U/V samples.
 */
DEFUN(yuv2rgb565_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);                // save callee-saved r4-r7
        p1 = [fp+ARG_OUT];              // output pointer
        r3 = [fp+ARG_W];                // dW

        i0 = r0;                        // Y
        i2 = r1;                        // U
        i3 = r2;                        // V

        r0 = [fp+ARG_COEFF];
        i1 = r0;                        // coefficient table cursor
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];                  // first 4 Y samples
        r1.l = w[i2++];                 // first 2 U samples
        r1.h = w[i3++];                 // first 2 V samples
        p0 = p0>>2;                     // loop count = dW / 4

        lsetup (.L0565, .L1565) lc0 = p0;

        /*
           register use inside the loop:
             r0 -- 4 Y samples on entry, then zero
             r1 -- 2 U + 2 V samples on entry, then the current coefficient
             r4 -- y3,y2
             r5 -- y1,y0, later the current mask
             r6 -- u1,u0
             r7 -- v1,v0
         */
        r2 = [i1++];                    // oy
.L0565:
        /*
           per-channel packing as performed below:
             crv channel : >> 3, masked with rmask
             cbu channel : << 8, masked with bmask
             G           : << 3, masked with gmask
           the three results are OR-ed together in r3.
         */
        (r4,r5) = byteop16m (r1:0, r3:2) || r3 = [i1++];       // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                      // y1,y0
        r4 = r4 << 2 (v);                                      // y3,y2
        r6 = r6 << 2 (v) || r0 = [i1++];                       // u1,u0, r0=zero
        r7 = r7 << 2 (v) || r1 = [i1++];                       // v1,v0, r1=cy

        /* ---- pixels 0 and 1 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1 = [i1++];         // crv

        /* R = Y' + crv*(Cr-128); the product is then removed again so the
           accumulators hold Y' for the next channel */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5 = [i1++];       // rmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1 = [i1++];          // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y' + cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
        a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5 = [i1++];       // bmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1 = [i1++];          // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y' + cgu*(Cb-128) + cgv*(Cr-128) */
        a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1 = [i1++];       // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO) || r5 = [i1++m0];        // gmask, then step i1 by m0
        r2 = r2 << 3 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++] = r3 || r1 = [i1++];                            // store pixels 0,1; r1=cy

        /* ---- pixels 2 and 3 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1 = [i1++];         // crv

        /* R = Y' + crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5 = [i1++];       // rmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1 = [i1++];          // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y' + cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
        a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5 = [i1++];       // bmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1 = [i1++];          // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y' + cgu*(Cb-128) + cgv*(Cr-128) */
        a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1 = [i1++];       // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5 = [i1++]; // gmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++];          // next 4 Y samples
        r2 = r2 << 3 (v) || r1.l = w[i2++];                    // next 2 U samples
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++] = r3 || r1.h = w[i3++];                         // store pixels 2,3; next 2 V samples
.L1565: r2 = [i1++];                                           // oy

        l1 = 0;                         // clear the buffer length set above

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb565_line)
|
||||
|
||||
/*
 * _ff_bfin_yuv2rgb555_line
 *
 * Converts one line of planar YUV to packed 16-bit pixels (5/5/5 bit fields).
 *
 *   r0 = Y pointer, r1 = U pointer, r2 = V pointer   (copied to i0, i2, i3)
 *   [fp+ARG_OUT] = output pointer, [fp+ARG_W] = dW, [fp+ARG_COEFF] = coeffs
 *
 * The loop runs dW>>2 times.  Each pass consumes one 32-bit word of Y
 * (4 samples) plus one 16-bit word each of U and V (2 samples each) and
 * stores two 32-bit words to the output.
 *
 * coeffs is walked through i1 with b1/l1 set to its base and COEFF_LEN, in
 * the order  oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv, gmask.
 *
 * Note: the samples for the next pass are fetched at the bottom of the
 * loop, so the final pass still reads one further group of Y/U/V samples.
 */
DEFUN(yuv2rgb555_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);                // save callee-saved r4-r7
        p1 = [fp+ARG_OUT];              // output pointer
        r3 = [fp+ARG_W];                // dW

        i0 = r0;                        // Y
        i2 = r1;                        // U
        i3 = r2;                        // V

        r0 = [fp+ARG_COEFF];
        i1 = r0;                        // coefficient table cursor
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];                  // first 4 Y samples
        r1.l = w[i2++];                 // first 2 U samples
        r1.h = w[i3++];                 // first 2 V samples
        p0 = p0>>2;                     // loop count = dW / 4

        lsetup (.L0555, .L1555) lc0 = p0;

        /*
           register use inside the loop:
             r0 -- 4 Y samples on entry, then zero
             r1 -- 2 U + 2 V samples on entry, then the current coefficient
             r4 -- y3,y2
             r5 -- y1,y0, later the current mask
             r6 -- u1,u0
             r7 -- v1,v0
         */
        r2 = [i1++];                    // oy
.L0555:
        /*
           per-channel packing as performed below:
             crv channel : >> 3, masked with rmask
             cbu channel : << 7, masked with bmask
             G           : << 2, masked with gmask
           the three results are OR-ed together in r3.
         */
        (r4,r5) = byteop16m (r1:0, r3:2) || r3 = [i1++];       // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                      // y1,y0
        r4 = r4 << 2 (v);                                      // y3,y2
        r6 = r6 << 2 (v) || r0 = [i1++];                       // u1,u0, r0=zero
        r7 = r7 << 2 (v) || r1 = [i1++];                       // v1,v0, r1=cy

        /* ---- pixels 0 and 1 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1 = [i1++];         // crv

        /* R = Y' + crv*(Cr-128); the product is then removed again so the
           accumulators hold Y' for the next channel */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5 = [i1++];       // rmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1 = [i1++];          // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y' + cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
        a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5 = [i1++];       // bmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1 = [i1++];          // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y' + cgu*(Cb-128) + cgv*(Cr-128) */
        a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1 = [i1++];       // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO) || r5 = [i1++m0];        // gmask, then step i1 by m0
        r2 = r2 << 2 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++] = r3 || r1 = [i1++];                            // store pixels 0,1; r1=cy

        /* ---- pixels 2 and 3 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1 = [i1++];         // crv

        /* R = Y' + crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5 = [i1++];       // rmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1 = [i1++];          // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y' + cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
        a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5 = [i1++];       // bmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1 = [i1++];          // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y' + cgu*(Cb-128) + cgv*(Cr-128) */
        a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1 = [i1++];       // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5 = [i1++]; // gmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++];          // next 4 Y samples
        r2 = r2 << 2 (v) || r1.l = w[i2++];                    // next 2 U samples
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++] = r3 || r1.h = w[i3++];                         // store pixels 2,3; next 2 V samples

.L1555: r2 = [i1++];                                           // oy

        l1 = 0;                         // clear the buffer length set above

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb555_line)
|
||||
|
||||
/*
 * _ff_bfin_yuv2rgb24_line
 *
 * Converts one line of planar YUV to packed 24-bit pixels (3 bytes each).
 *
 *   r0 = Y pointer, r1 = U pointer, r2 = V pointer   (copied to i0, i2, i3)
 *   [fp+ARG_OUT] = output pointer, [fp+ARG_W] = dW, [fp+ARG_COEFF] = coeffs
 *
 * The loop runs dW>>2 times.  Each pass consumes one 32-bit word of Y
 * (4 samples) plus one 16-bit word each of U and V (2 samples each) and
 * writes 12 output bytes.  Two byte pointers are used: p1 for the even
 * pixel of a pair and p2 = p1 + 3 for the odd one; after a pair has been
 * written both are advanced by another 3 bytes to skip the neighbour.
 * Bytes are written per pixel in the order: crv channel, G, cbu channel.
 *
 * coeffs is walked through i1 with b1/l1 set to its base and COEFF_LEN, in
 * the order  oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv, gmask.
 * The mask words are still loaded (into r5) to keep i1 in step, but they
 * are not used by this routine.
 *
 * Note: the samples for the next pass are fetched at the bottom of the
 * loop, so the final pass still reads one further group of Y/U/V samples.
 */
DEFUN(yuv2rgb24_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);                // save callee-saved r4-r7
        p1 = [fp+ARG_OUT];              // output pointer, even pixel of a pair
        r3 = [fp+ARG_W];                // dW
        p2 = p1;
        p2 += 3;                        // output pointer, odd pixel of a pair

        i0 = r0;                        // Y
        i2 = r1;                        // U
        i3 = r2;                        // V

        r0 = [fp+ARG_COEFF];            // coeff buffer
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];                  // first 4 Y samples
        r1.l = w[i2++];                 // first 2 U samples
        r1.h = w[i3++];                 // first 2 V samples
        p0 = p0>>2;                     // loop count = dW / 4

        lsetup (.L0888, .L1888) lc0 = p0;

        /*
           register use inside the loop:
             r0 -- 4 Y samples on entry, then zero
             r1 -- 2 U + 2 V samples on entry, then the current coefficient
             r4 -- y3,y2
             r5 -- y1,y0, later scratch for the (unused) masks
             r6 -- u1,u0
             r7 -- v1,v0
         */
        r2=[i1++];                      // oy
.L0888:
        (r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++];         // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                      // y1,y0
        r4 = r4 << 2 (v);                                      // y3,y2
        r6 = r6 << 2 (v) || r0=[i1++];                         // u1,u0, r0=zero
        r7 = r7 << 2 (v) || r1=[i1++];                         // v1,v0, r1=cy

        /* ---- pixels 0 and 1 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++];           // crv

        /* R = Y' + crv*(Cr-128); the product is then removed again so the
           accumulators hold Y' for the next channel */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++];         // rmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];            // cbu
        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        /* B = Y' + cbu*(Cb-128); kept in r3 until G has been written */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
        a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++];         // bmask
        r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];            // cgu

        /* G = Y' + cgu*(Cb-128) + cgv*(Cr-128) */
        a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++];         // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0];          // gmask, then step i1 by m0

        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        r3=r3>>16 || B[p1++]=r3;
        B[p2++]=r3 || r1=[i1++];                               // cy

        p1+=3;
        p2+=3;

        /* ---- pixels 2 and 3 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++];           // crv

        /* R = Y' + crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++];         // rmask
        r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];            // cbu
        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        /* B = Y' + cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
        a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++];         // bmask
        r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++];            // cgu

        /* G = Y' + cgu*(Cb-128) + cgv*(Cr-128) */
        a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++];         // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++];            // gmask
        r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];                // next 4 Y samples
        B[p2++]=r2 || r1.l = w[i2++];                          // next 2 U samples
        r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++];             // next 2 V samples
        B[p2++]=r3 || r2=[i1++];                               // oy

        p1+=3;
.L1888: p2+=3;

        l1 = 0;                         // clear the buffer length set above

        (r7:4) = [sp++];
        unlink;
        rts;
/* Must name the same symbol as the DEFUN above; the original closed with
   yuv2rgb888_line, so .size referred to a symbol that is never defined. */
DEFUN_END(yuv2rgb24_line)
|
@ -1992,7 +1992,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
|
||||
#endif
|
||||
|
||||
#if !defined(RUNTIME_CPUDETECT) || !defined (CONFIG_GPL) //ensure that the flags match the compiled variant if cpudetect is off
|
||||
flags &= ~(SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2|SWS_CPU_CAPS_3DNOW|SWS_CPU_CAPS_ALTIVEC);
|
||||
flags &= ~(SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2|SWS_CPU_CAPS_3DNOW|SWS_CPU_CAPS_ALTIVEC|SWS_CPU_CAPS_BFIN);
|
||||
#ifdef HAVE_MMX2
|
||||
flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2;
|
||||
#elif defined (HAVE_3DNOW)
|
||||
@ -2001,6 +2001,8 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
|
||||
flags |= SWS_CPU_CAPS_MMX;
|
||||
#elif defined (HAVE_ALTIVEC)
|
||||
flags |= SWS_CPU_CAPS_ALTIVEC;
|
||||
#elif defined (ARCH_BFIN)
|
||||
flags |= SWS_CPU_CAPS_BFIN;
|
||||
#endif
|
||||
#endif /* RUNTIME_CPUDETECT */
|
||||
if (clip_table[512] != 255) globalInit();
|
||||
|
@ -74,6 +74,7 @@ extern "C" {
|
||||
#define SWS_CPU_CAPS_MMX2 0x20000000
|
||||
#define SWS_CPU_CAPS_3DNOW 0x40000000
|
||||
#define SWS_CPU_CAPS_ALTIVEC 0x10000000
|
||||
#define SWS_CPU_CAPS_BFIN 0x01000000
|
||||
|
||||
#define SWS_MAX_REDUCE_CUTOFF 0.002
|
||||
|
||||
|
@ -162,6 +162,22 @@ typedef struct SwsContext{
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef ARCH_BFIN
|
||||
uint32_t oy __attribute__((aligned(4)));
|
||||
uint32_t oc __attribute__((aligned(4)));
|
||||
uint32_t zero __attribute__((aligned(4)));
|
||||
uint32_t cy __attribute__((aligned(4)));
|
||||
uint32_t crv __attribute__((aligned(4)));
|
||||
uint32_t rmask __attribute__((aligned(4)));
|
||||
uint32_t cbu __attribute__((aligned(4)));
|
||||
uint32_t bmask __attribute__((aligned(4)));
|
||||
uint32_t cgu __attribute__((aligned(4)));
|
||||
uint32_t cgv __attribute__((aligned(4)));
|
||||
uint32_t gmask __attribute__((aligned(4)));
|
||||
#endif
|
||||
|
||||
|
||||
} SwsContext;
|
||||
//FIXME check init (where 0)
|
||||
|
||||
|
@ -611,6 +611,14 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_BFIN
|
||||
if (c->flags & SWS_CPU_CAPS_BFIN)
|
||||
{
|
||||
SwsFunc t = ff_bfin_yuv2rgb_get_func_ptr (c);
|
||||
if (t) return t;
|
||||
}
|
||||
#endif
|
||||
|
||||
av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found\n");
|
||||
|
||||
switch(c->dstFormat){
|
||||
|
205
libswscale/yuv2rgb_bfin.c
Normal file
205
libswscale/yuv2rgb_bfin.c
Normal file
@ -0,0 +1,205 @@
|
||||
/*
|
||||
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
|
||||
* April 20, 2007
|
||||
*
|
||||
* Blackfin Video Color Space Converters Operations
|
||||
* convert I420 YV12 to RGB in various formats,
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
#include <assert.h>
|
||||
#include "config.h"
|
||||
#ifdef HAVE_MALLOC_H
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
#include <unistd.h>
|
||||
#include <bits/bfin_sram.h>
|
||||
#include "rgb2rgb.h"
|
||||
#include "swscale.h"
|
||||
#include "swscale_internal.h"
|
||||
|
||||
|
||||
#define L1CODE __attribute__ ((l1_text))
|
||||
|
||||
extern void ff_bfin_yuv2rgb555_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
|
||||
int w, uint32_t *coeffs) L1CODE;
|
||||
|
||||
extern void ff_bfin_yuv2rgb565_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
|
||||
int w, uint32_t *coeffs) L1CODE;
|
||||
|
||||
extern void ff_bfin_yuv2rgb24_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
|
||||
int w, uint32_t *coeffs) L1CODE;
|
||||
|
||||
typedef void (* ltransform_t)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
|
||||
int w, uint32_t *coeffs);
|
||||
|
||||
|
||||
/**
 * Fill the Blackfin coefficient block of the context (c->oy ... c->gmask)
 * that is handed to the assembly line converters.
 *
 * @param c     scaler context; its yOffset/yCoeff/vrCoeff/ubCoeff/ugCoeff/
 *              vgCoeff members are read, the oy..gmask members are written
 * @param rgb   non-zero: crv/cbu/cgu/cgv come from vr/ub/ug/vg;
 *              zero: the red/blue and u/v roles are exchanged
 * @param masks 555 or 565 selects the packing masks; any other value leaves
 *              rmask/gmask/bmask untouched
 */
static void bfin_prepare_coefficients (SwsContext *c, int rgb, int masks)
{
    /* keep everything U8.0 for offset calculation */
    const int luma_offset = (c->yOffset & 0xffff) >> 3;

    /* replicate the byte offsets into all four byte lanes */
    c->oc   = 128 * 0x01010101U;
    c->oy   = luma_offset * 0x01010101U;
    c->zero = 0;

    /* copy 64bit vector coeffs down to 32bit vector coeffs */
    c->cy  = c->yCoeff;
    c->crv = rgb ? c->vrCoeff : c->ubCoeff;
    c->cbu = rgb ? c->ubCoeff : c->vrCoeff;
    c->cgu = rgb ? c->ugCoeff : c->vgCoeff;
    c->cgv = rgb ? c->vgCoeff : c->ugCoeff;

    /* each 16-bit mask is duplicated into both halves of the word */
    switch (masks) {
    case 555:
        c->rmask = 0x001f * 0x00010001U;
        c->gmask = 0x03e0 * 0x00010001U;
        c->bmask = 0x7c00 * 0x00010001U;
        break;
    case 565:
        c->rmask = 0x001f * 0x00010001U;
        c->gmask = 0x07e0 * 0x00010001U;
        c->bmask = 0xf800 * 0x00010001U;
        break;
    default:
        break;
    }
}
|
||||
|
||||
static int core_yuv420_rgb (SwsContext *c,
|
||||
uint8_t **in, int *instrides,
|
||||
int srcSliceY, int srcSliceH,
|
||||
uint8_t **oplanes, int *outstrides,
|
||||
ltransform_t lcscf, int rgb, int masks)
|
||||
{
|
||||
uint8_t *py,*pu,*pv,*op;
|
||||
int w = instrides[0];
|
||||
int h2 = srcSliceH>>1;
|
||||
int i;
|
||||
|
||||
bfin_prepare_coefficients (c, rgb, masks);
|
||||
|
||||
py = in[0];
|
||||
pu = in[1+(1^rgb)];
|
||||
pv = in[1+(0^rgb)];
|
||||
|
||||
op = oplanes[0] + srcSliceY*outstrides[0];
|
||||
|
||||
for (i=0;i<h2;i++) {
|
||||
|
||||
lcscf (py,pu,pv,op,w,&c->oy);
|
||||
|
||||
py += instrides[0];
|
||||
op += outstrides[0];
|
||||
|
||||
lcscf (py,pu,pv,op,w,&c->oy);
|
||||
|
||||
py += instrides[0];
|
||||
pu += instrides[1];
|
||||
pv += instrides[2];
|
||||
op += outstrides[0];
|
||||
}
|
||||
|
||||
return srcSliceH;
|
||||
}
|
||||
|
||||
|
||||
/** Slice converter using the 555 line routine with rgb=1, masks=555. */
static int bfin_yuv420_rgb555 (SwsContext *c,
                               uint8_t **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               uint8_t **oplanes, int *outstrides)
{
    return core_yuv420_rgb (c, in, instrides,
                            srcSliceY, srcSliceH,
                            oplanes, outstrides,
                            ff_bfin_yuv2rgb555_line, 1, 555);
}
|
||||
|
||||
/** Slice converter using the 555 line routine with rgb=0, masks=555. */
static int bfin_yuv420_bgr555 (SwsContext *c,
                               uint8_t **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               uint8_t **oplanes, int *outstrides)
{
    return core_yuv420_rgb (c, in, instrides,
                            srcSliceY, srcSliceH,
                            oplanes, outstrides,
                            ff_bfin_yuv2rgb555_line, 0, 555);
}
|
||||
|
||||
/** Slice converter using the 24-bit line routine with rgb=1, masks=888. */
static int bfin_yuv420_rgb24 (SwsContext *c,
                              uint8_t **in, int *instrides,
                              int srcSliceY, int srcSliceH,
                              uint8_t **oplanes, int *outstrides)
{
    return core_yuv420_rgb (c, in, instrides,
                            srcSliceY, srcSliceH,
                            oplanes, outstrides,
                            ff_bfin_yuv2rgb24_line, 1, 888);
}
|
||||
|
||||
/** Slice converter using the 24-bit line routine with rgb=0, masks=888. */
static int bfin_yuv420_bgr24 (SwsContext *c,
                              uint8_t **in, int *instrides,
                              int srcSliceY, int srcSliceH,
                              uint8_t **oplanes, int *outstrides)
{
    return core_yuv420_rgb (c, in, instrides,
                            srcSliceY, srcSliceH,
                            oplanes, outstrides,
                            ff_bfin_yuv2rgb24_line, 0, 888);
}
|
||||
|
||||
/** Slice converter using the 565 line routine with rgb=1, masks=565. */
static int bfin_yuv420_rgb565 (SwsContext *c,
                               uint8_t **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               uint8_t **oplanes, int *outstrides)
{
    return core_yuv420_rgb (c, in, instrides,
                            srcSliceY, srcSliceH,
                            oplanes, outstrides,
                            ff_bfin_yuv2rgb565_line, 1, 565);
}
|
||||
|
||||
/** Slice converter using the 565 line routine with rgb=0, masks=565. */
static int bfin_yuv420_bgr565 (SwsContext *c,
                               uint8_t **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               uint8_t **oplanes, int *outstrides)
{
    return core_yuv420_rgb (c, in, instrides,
                            srcSliceY, srcSliceH,
                            oplanes, outstrides,
                            ff_bfin_yuv2rgb565_line, 0, 565);
}
|
||||
|
||||
|
||||
/**
 * Pick the Blackfin slice converter matching c->dstFormat.
 *
 * @param c scaler context; only dstFormat is inspected
 * @return the converter for RGB555/BGR555/RGB565/BGR565/RGB24/BGR24, or a
 *         null pointer for any other destination format.  A message is
 *         logged at AV_LOG_INFO level only when a converter is returned.
 */
SwsFunc ff_bfin_yuv2rgb_get_func_ptr (SwsContext *c)
{
    SwsFunc converter = NULL;

    switch (c->dstFormat) {
    case PIX_FMT_RGB555: converter = bfin_yuv420_rgb555; break;
    case PIX_FMT_BGR555: converter = bfin_yuv420_bgr555; break;
    case PIX_FMT_RGB565: converter = bfin_yuv420_rgb565; break;
    case PIX_FMT_BGR565: converter = bfin_yuv420_bgr565; break;
    case PIX_FMT_RGB24:  converter = bfin_yuv420_rgb24;  break;
    case PIX_FMT_BGR24:  converter = bfin_yuv420_bgr24;  break;
    default:
        break;
    }

    /* unsupported destination format: stay silent, caller falls back */
    if (!converter)
        return NULL;

    av_log(c, AV_LOG_INFO, "BlackFin Accelerated Color Space Converter %s\n",
           sws_format_name (c->dstFormat));

    return converter;
}
|
Loading…
x
Reference in New Issue
Block a user