mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
swscale/ppc: VSX-optimize yuv2422_1
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
    -s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \
    -cpuflags 0 -v error -

15.3x speedup:

yuyv422
  14513 UNITS in yuv2packed1, 32768 runs, 0 skips
    949 UNITS in yuv2packed1, 32767 runs, 1 skips
yvyu422
  14516 UNITS in yuv2packed1, 32767 runs, 1 skips
    943 UNITS in yuv2packed1, 32767 runs, 1 skips
uyvy422
  14530 UNITS in yuv2packed1, 32767 runs, 1 skips
    941 UNITS in yuv2packed1, 32766 runs, 2 skips
This commit is contained in:
parent
4e8cbbf70e
commit
a6a31ca3d9
@ -664,6 +664,143 @@ YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
|
||||
YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
|
||||
YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
|
||||
|
||||
static av_always_inline void
|
||||
write422(const vector int16_t vy1, const vector int16_t vy2,
|
||||
const vector int16_t vu, const vector int16_t vv,
|
||||
uint8_t *dest, const enum AVPixelFormat target)
|
||||
{
|
||||
vector uint8_t vd1, vd2, tmp;
|
||||
const vector uint8_t yuyv1 = (vector uint8_t) {
|
||||
0x0, 0x10, 0x1, 0x18,
|
||||
0x2, 0x11, 0x3, 0x19,
|
||||
0x4, 0x12, 0x5, 0x1a,
|
||||
0x6, 0x13, 0x7, 0x1b };
|
||||
const vector uint8_t yuyv2 = (vector uint8_t) {
|
||||
0x8, 0x14, 0x9, 0x1c,
|
||||
0xa, 0x15, 0xb, 0x1d,
|
||||
0xc, 0x16, 0xd, 0x1e,
|
||||
0xe, 0x17, 0xf, 0x1f };
|
||||
const vector uint8_t yvyu1 = (vector uint8_t) {
|
||||
0x0, 0x18, 0x1, 0x10,
|
||||
0x2, 0x19, 0x3, 0x11,
|
||||
0x4, 0x1a, 0x5, 0x12,
|
||||
0x6, 0x1b, 0x7, 0x13 };
|
||||
const vector uint8_t yvyu2 = (vector uint8_t) {
|
||||
0x8, 0x1c, 0x9, 0x14,
|
||||
0xa, 0x1d, 0xb, 0x15,
|
||||
0xc, 0x1e, 0xd, 0x16,
|
||||
0xe, 0x1f, 0xf, 0x17 };
|
||||
const vector uint8_t uyvy1 = (vector uint8_t) {
|
||||
0x10, 0x0, 0x18, 0x1,
|
||||
0x11, 0x2, 0x19, 0x3,
|
||||
0x12, 0x4, 0x1a, 0x5,
|
||||
0x13, 0x6, 0x1b, 0x7 };
|
||||
const vector uint8_t uyvy2 = (vector uint8_t) {
|
||||
0x14, 0x8, 0x1c, 0x9,
|
||||
0x15, 0xa, 0x1d, 0xb,
|
||||
0x16, 0xc, 0x1e, 0xd,
|
||||
0x17, 0xe, 0x1f, 0xf };
|
||||
|
||||
vd1 = vec_packsu(vy1, vy2);
|
||||
vd2 = vec_packsu(vu, vv);
|
||||
|
||||
switch (target) {
|
||||
case AV_PIX_FMT_YUYV422:
|
||||
tmp = vec_perm(vd1, vd2, yuyv1);
|
||||
vec_st(tmp, 0, dest);
|
||||
tmp = vec_perm(vd1, vd2, yuyv2);
|
||||
vec_st(tmp, 16, dest);
|
||||
break;
|
||||
case AV_PIX_FMT_YVYU422:
|
||||
tmp = vec_perm(vd1, vd2, yvyu1);
|
||||
vec_st(tmp, 0, dest);
|
||||
tmp = vec_perm(vd1, vd2, yvyu2);
|
||||
vec_st(tmp, 16, dest);
|
||||
break;
|
||||
case AV_PIX_FMT_UYVY422:
|
||||
tmp = vec_perm(vd1, vd2, uyvy1);
|
||||
vec_st(tmp, 0, dest);
|
||||
tmp = vec_perm(vd1, vd2, uyvy2);
|
||||
vec_st(tmp, 16, dest);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static av_always_inline void
|
||||
yuv2422_1_vsx_template(SwsContext *c, const int16_t *buf0,
|
||||
const int16_t *ubuf[2], const int16_t *vbuf[2],
|
||||
const int16_t *abuf0, uint8_t *dest, int dstW,
|
||||
int uvalpha, int y, enum AVPixelFormat target)
|
||||
{
|
||||
const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
|
||||
vector int16_t vy1, vy2, vu, vv, tmp;
|
||||
const vector int16_t add64 = vec_splats((int16_t) 64);
|
||||
const vector int16_t add128 = vec_splats((int16_t) 128);
|
||||
const vector uint16_t shift7 = vec_splat_u16(7);
|
||||
const vector uint16_t shift8 = vec_splat_u16(8);
|
||||
int i;
|
||||
|
||||
if (uvalpha < 2048) {
|
||||
for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
|
||||
vy1 = vec_ld(0, &buf0[i * 2]);
|
||||
vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
|
||||
vu = vec_ld(0, &ubuf0[i]);
|
||||
vv = vec_ld(0, &vbuf0[i]);
|
||||
|
||||
vy1 = vec_add(vy1, add64);
|
||||
vy2 = vec_add(vy2, add64);
|
||||
vu = vec_add(vu, add64);
|
||||
vv = vec_add(vv, add64);
|
||||
|
||||
vy1 = vec_sra(vy1, shift7);
|
||||
vy2 = vec_sra(vy2, shift7);
|
||||
vu = vec_sra(vu, shift7);
|
||||
vv = vec_sra(vv, shift7);
|
||||
|
||||
write422(vy1, vy2, vu, vv, &dest[i * 4], target);
|
||||
}
|
||||
} else {
|
||||
const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
|
||||
for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
|
||||
vy1 = vec_ld(0, &buf0[i * 2]);
|
||||
vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
|
||||
vu = vec_ld(0, &ubuf0[i]);
|
||||
tmp = vec_ld(0, &ubuf1[i]);
|
||||
vu = vec_adds(vu, tmp);
|
||||
vv = vec_ld(0, &vbuf0[i]);
|
||||
tmp = vec_ld(0, &vbuf1[i]);
|
||||
vv = vec_adds(vv, tmp);
|
||||
|
||||
vy1 = vec_add(vy1, add64);
|
||||
vy2 = vec_add(vy2, add64);
|
||||
vu = vec_adds(vu, add128);
|
||||
vv = vec_adds(vv, add128);
|
||||
|
||||
vy1 = vec_sra(vy1, shift7);
|
||||
vy2 = vec_sra(vy2, shift7);
|
||||
vu = vec_sra(vu, shift8);
|
||||
vv = vec_sra(vv, shift8);
|
||||
|
||||
write422(vy1, vy2, vu, vv, &dest[i * 4], target);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Define the static function <prefix><suffix>_1_vsx, which forwards all of
 * its arguments to <prefix><kernel>_1_vsx_template and appends pixfmt as
 * the trailing pixel-format argument.
 */
#define YUV2PACKEDWRAPPER(prefix, kernel, suffix, pixfmt)                    \
static void prefix ## suffix ## _1_vsx(SwsContext *c, const int16_t *buf0,   \
                                       const int16_t *ubuf[2],               \
                                       const int16_t *vbuf[2],               \
                                       const int16_t *abuf0, uint8_t *dest,  \
                                       int dstW, int uvalpha, int y)         \
{                                                                            \
    prefix ## kernel ## _1_vsx_template(c, buf0, ubuf, vbuf,                 \
                                        abuf0, dest, dstW, uvalpha,          \
                                        y, pixfmt);                          \
}
|
||||
|
||||
/*
 * Instantiate the wrapper for the three packed 4:2:2 layouts. Through the
 * macro's token pasting these define yuv2yuyv422_1_vsx, yuv2yvyu422_1_vsx
 * and yuv2uyvy422_1_vsx, each calling yuv2422_1_vsx_template with the
 * matching pixel format.
 */
YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422)
|
||||
YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422)
|
||||
YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
|
||||
|
||||
#endif /* !HAVE_BIGENDIAN */
|
||||
|
||||
#endif /* HAVE_VSX */
|
||||
@ -768,6 +905,18 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
|
||||
}
|
||||
break;
|
||||
}
|
||||
} else { /* !SWS_FULL_CHR_H_INT */
|
||||
switch (dstFormat) {
|
||||
case AV_PIX_FMT_YUYV422:
|
||||
c->yuv2packed1 = yuv2yuyv422_1_vsx;
|
||||
break;
|
||||
case AV_PIX_FMT_YVYU422:
|
||||
c->yuv2packed1 = yuv2yvyu422_1_vsx;
|
||||
break;
|
||||
case AV_PIX_FMT_UYVY422:
|
||||
c->yuv2packed1 = yuv2uyvy422_1_vsx;
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif /* !HAVE_BIGENDIAN */
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user