mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-12-23 12:43:46 +02:00
swscale: [LA] Optimize yuv2plane1_8_c.
Reviewed-by: colleague of Shiyou Yin
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
parent
f3fe2cb5f7
commit
8b76df9142
@ -23,11 +23,11 @@
|
||||
|
||||
#include "libavcodec/loongarch/loongson_asm.S"
|
||||
|
||||
/* static void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
|
||||
/* static void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
|
||||
* const int16_t **src, uint8_t *dest, int dstW,
|
||||
* const uint8_t *dither, int offset)
|
||||
*/
|
||||
function ff_yuv2planeX_8_lsx
|
||||
function yuv2planeX_8_lsx
|
||||
addi.w t1, a6, 1
|
||||
addi.w t2, a6, 2
|
||||
addi.w t3, a6, 3
|
||||
@ -136,3 +136,253 @@ function ff_yuv2planeX_8_lsx
|
||||
blt zero, a4, .DEST
|
||||
.END:
|
||||
endfunc
|
||||
|
||||
/*
|
||||
* void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
|
||||
* const uint8_t *dither, int offset)
|
||||
*/
|
||||
/* yuv2plane1_8_lsx: LSX (128-bit vector) routine that, for each of dstW
 * samples, computes dest[i] = clip255((src[i] + dither[(i + offset) & 7]) >> 7).
 * Register use as seen in the body: a0 = src (16-bit samples, read with vld),
 * a1 = dest (bytes, written with fst.d), a2 = dstW, a3 = dither (byte table
 * indexed modulo 8), a4 = offset.  Eight samples are handled per iteration;
 * a remainder of dstW & 7 samples is handled by one extra block that overlaps
 * the samples already written. */
function yuv2plane1_8_lsx
|
||||
/* t1..t7 = offset + 1 .. offset + 7 */
addi.w t1, a4, 1
|
||||
addi.w t2, a4, 2
|
||||
addi.w t3, a4, 3
|
||||
addi.w t4, a4, 4
|
||||
addi.w t5, a4, 5
|
||||
addi.w t6, a4, 6
|
||||
addi.w t7, a4, 7
|
||||
/* Reduce every index modulo 8: tk = (offset + k) & 7 */
andi t0, a4, 7
|
||||
andi t1, t1, 7
|
||||
andi t2, t2, 7
|
||||
andi t3, t3, 7
|
||||
andi t4, t4, 7
|
||||
andi t5, t5, 7
|
||||
andi t6, t6, 7
|
||||
andi t7, t7, 7
|
||||
/* tk = dither[(offset + k) & 7], loaded as an unsigned byte */
ldx.bu t0, a3, t0
|
||||
ldx.bu t1, a3, t1
|
||||
ldx.bu t2, a3, t2
|
||||
ldx.bu t3, a3, t3
|
||||
ldx.bu t4, a3, t4
|
||||
ldx.bu t5, a3, t5
|
||||
ldx.bu t6, a3, t6
|
||||
ldx.bu t7, a3, t7
|
||||
/* Pack the eight dither values into the halfword lanes 0..7 of vr1 */
vinsgr2vr.h vr1, t0, 0
|
||||
vinsgr2vr.h vr1, t1, 1
|
||||
vinsgr2vr.h vr1, t2, 2
|
||||
vinsgr2vr.h vr1, t3, 3
|
||||
vinsgr2vr.h vr1, t4, 4
|
||||
vinsgr2vr.h vr1, t5, 5
|
||||
vinsgr2vr.h vr1, t6, 6
|
||||
vinsgr2vr.h vr1, t7, 7
|
||||
/* vr0 = 0; interleaving with it widens the dither halfwords to 32-bit words:
 * vr2 = dither lanes 0..3, vr3 = dither lanes 4..7 */
vsub.h vr0, vr0, vr0
|
||||
vilvl.h vr2, vr0, vr1
|
||||
vilvh.h vr3, vr0, vr1
|
||||
|
||||
/* t8 = dstW & 7 (leftover samples), a2 = dstW >> 3 (full 8-sample blocks);
 * skip the main loop when there is no full block */
andi t8, a2, 7
|
||||
srli.d a2, a2, 3
|
||||
beqz a2, 2f
|
||||
/* Main loop: one iteration consumes 16 bytes of src and produces 8 bytes of dest */
1:
|
||||
vld vr1, a0, 0
|
||||
addi.d a0, a0, 16
|
||||
/* vexth.w.h widens only the upper four halfwords, so the shuffle is
 * presumably there to move the lower four samples into the upper half of
 * vr0 first: vr4 = samples 0..3, vr5 = samples 4..7 as 32-bit words */
vshuf4i.d vr0, vr1, 8
|
||||
vexth.w.h vr4, vr0
|
||||
vexth.w.h vr5, vr1
|
||||
|
||||
/* Add the dither, arithmetic shift right by 7, clamp to 0..255 */
vadd.w vr4, vr2, vr4
|
||||
vadd.w vr5, vr3, vr5
|
||||
vsrai.w vr4, vr4, 7
|
||||
vsrai.w vr5, vr5, 7
|
||||
vclip255.w vr4, vr4
|
||||
vclip255.w vr5, vr5
|
||||
/* Narrow words -> halfwords -> bytes; the 8 result bytes end up in the low
 * 64 bits of vr1, which fst.d stores through f1 */
vpickev.h vr1, vr5, vr4
|
||||
vpickev.b vr1, vr1, vr1
|
||||
fst.d f1, a1, 0
|
||||
addi.d a1, a1, 8
|
||||
addi.d a2, a2, -1
|
||||
bnez a2, 1b
|
||||
2:
|
||||
/* Nothing left over: done */
beqz t8, 4f
|
||||
/* Tail: process one more full 8-sample block that ends exactly at dstW.
 * The block starts t8 samples after the start of the last block handled,
 * so the dither phase is advanced by t8 and the dither vectors rebuilt. */
3:
|
||||
add.w a4, a4, t8
|
||||
addi.w t1, a4, 1
|
||||
addi.w t2, a4, 2
|
||||
addi.w t3, a4, 3
|
||||
addi.w t4, a4, 4
|
||||
addi.w t5, a4, 5
|
||||
addi.w t6, a4, 6
|
||||
addi.w t7, a4, 7
|
||||
andi t0, a4, 7
|
||||
andi t1, t1, 7
|
||||
andi t2, t2, 7
|
||||
andi t3, t3, 7
|
||||
andi t4, t4, 7
|
||||
andi t5, t5, 7
|
||||
andi t6, t6, 7
|
||||
andi t7, t7, 7
|
||||
ldx.bu t0, a3, t0
|
||||
ldx.bu t1, a3, t1
|
||||
ldx.bu t2, a3, t2
|
||||
ldx.bu t3, a3, t3
|
||||
ldx.bu t4, a3, t4
|
||||
ldx.bu t5, a3, t5
|
||||
ldx.bu t6, a3, t6
|
||||
ldx.bu t7, a3, t7
|
||||
vinsgr2vr.h vr1, t0, 0
|
||||
vinsgr2vr.h vr1, t1, 1
|
||||
vinsgr2vr.h vr1, t2, 2
|
||||
vinsgr2vr.h vr1, t3, 3
|
||||
vinsgr2vr.h vr1, t4, 4
|
||||
vinsgr2vr.h vr1, t5, 5
|
||||
vinsgr2vr.h vr1, t6, 6
|
||||
vinsgr2vr.h vr1, t7, 7
|
||||
vsub.h vr0, vr0, vr0
|
||||
vilvl.h vr2, vr0, vr1
|
||||
vilvh.h vr3, vr0, vr1
|
||||
|
||||
/* Rewind to the overlapping block: src -= 16 bytes then += 2 * t8 bytes,
 * dest -= 8 bytes then += t8 bytes.
 * NOTE(review): when dstW < 8 the main loop never ran, so this moves both
 * pointers before the start of src/dest and the block below reads and
 * writes out of range -- confirm callers guarantee dstW >= 8 or padding. */
addi.d a0, a0, -16
|
||||
add.d a0, a0, t8
|
||||
add.d a0, a0, t8
|
||||
addi.d a1, a1, -8
|
||||
add.d a1, a1, t8
|
||||
|
||||
/* Same widen / add dither / shift / clamp / narrow / store sequence as the
 * main loop, executed once */
vld vr1, a0, 0
|
||||
vshuf4i.d vr0, vr1, 8
|
||||
vexth.w.h vr4, vr0
|
||||
vexth.w.h vr5, vr1
|
||||
|
||||
vadd.w vr4, vr2, vr4
|
||||
vadd.w vr5, vr3, vr5
|
||||
vsrai.w vr4, vr4, 7
|
||||
vsrai.w vr5, vr5, 7
|
||||
vclip255.w vr4, vr4
|
||||
vclip255.w vr5, vr5
|
||||
vpickev.h vr1, vr5, vr4
|
||||
vpickev.b vr1, vr1, vr1
|
||||
fst.d f1, a1, 0
|
||||
/* Common exit; no explicit return instruction is visible here, presumably
 * the endfunc macro from loongson_asm.S emits it -- TODO confirm */
4:
|
||||
endfunc
|
||||
|
||||
/* yuv2plane1_8_lasx: LASX (256-bit vector) counterpart of yuv2plane1_8_lsx.
 * For each of dstW samples it computes
 * dest[i] = clip255((src[i] + dither[(i + offset) & 7]) >> 7).
 * Register use as seen in the body: a0 = src (16-bit samples, read with
 * xvld), a1 = dest (bytes, written with fst.d), a2 = dstW, a3 = dither (byte
 * table indexed modulo 8), a4 = offset.  Sixteen samples are handled per
 * iteration; a remainder of dstW & 15 samples is handled by one extra block
 * that overlaps the samples already written. */
function yuv2plane1_8_lasx
|
||||
/* t1..t7 = offset + 1 .. offset + 7 */
addi.w t1, a4, 1
|
||||
addi.w t2, a4, 2
|
||||
addi.w t3, a4, 3
|
||||
addi.w t4, a4, 4
|
||||
addi.w t5, a4, 5
|
||||
addi.w t6, a4, 6
|
||||
addi.w t7, a4, 7
|
||||
/* Reduce every index modulo 8: tk = (offset + k) & 7 */
andi t0, a4, 7
|
||||
andi t1, t1, 7
|
||||
andi t2, t2, 7
|
||||
andi t3, t3, 7
|
||||
andi t4, t4, 7
|
||||
andi t5, t5, 7
|
||||
andi t6, t6, 7
|
||||
andi t7, t7, 7
|
||||
/* tk = dither[(offset + k) & 7], loaded as an unsigned byte */
ldx.bu t0, a3, t0
|
||||
ldx.bu t1, a3, t1
|
||||
ldx.bu t2, a3, t2
|
||||
ldx.bu t3, a3, t3
|
||||
ldx.bu t4, a3, t4
|
||||
ldx.bu t5, a3, t5
|
||||
ldx.bu t6, a3, t6
|
||||
ldx.bu t7, a3, t7
|
||||
/* Pack the eight dither values into halfword lanes 0..7 of vr1 (the
 * 128-bit view of xr1) */
vinsgr2vr.h vr1, t0, 0
|
||||
vinsgr2vr.h vr1, t1, 1
|
||||
vinsgr2vr.h vr1, t2, 2
|
||||
vinsgr2vr.h vr1, t3, 3
|
||||
vinsgr2vr.h vr1, t4, 4
|
||||
vinsgr2vr.h vr1, t5, 5
|
||||
vinsgr2vr.h vr1, t6, 6
|
||||
vinsgr2vr.h vr1, t7, 7
|
||||
/* Presumably duplicates the low 128-bit half of xr1 into the high half so
 * both halves carry the same eight dither values -- TODO confirm against
 * the xvpermi.q definition */
xvpermi.q xr1, xr1, 0
|
||||
/* xr0 = 0; interleaving with it widens the dither halfwords to 32-bit words:
 * xr2 = dither lanes 0..3, xr3 = dither lanes 4..7 (per 128-bit half) */
xvsub.h xr0, xr0, xr0
|
||||
xvilvl.h xr2, xr0, xr1
|
||||
xvilvh.h xr3, xr0, xr1
|
||||
|
||||
/* t8 = dstW & 15 (leftover samples), a2 = dstW >> 4 (full 16-sample blocks);
 * skip the main loop when there is no full block */
andi t8, a2, 15
|
||||
srli.d a2, a2, 4
|
||||
beqz a2, 2f
|
||||
/* Main loop: one iteration consumes 32 bytes of src and produces 16 bytes of dest */
1:
|
||||
xvld xr1, a0, 0
|
||||
addi.d a0, a0, 32
|
||||
/* xvexth.w.h widens the upper four halfwords of each 128-bit half, so the
 * permute is presumably there to move the lower four samples of each half
 * into its upper part first; xr4/xr5 then hold the samples as 32-bit words */
xvpermi.d xr0, xr1, 0xa0
|
||||
xvexth.w.h xr4, xr0
|
||||
xvexth.w.h xr5, xr1
|
||||
|
||||
/* Add the dither, arithmetic shift right by 7, clamp to 0..255 */
xvadd.w xr4, xr2, xr4
|
||||
xvadd.w xr5, xr3, xr5
|
||||
xvsrai.w xr4, xr4, 7
|
||||
xvsrai.w xr5, xr5, 7
|
||||
xvclip255.w xr4, xr4
|
||||
xvclip255.w xr5, xr5
|
||||
/* Narrow words -> halfwords -> bytes into xr0, then bring the other 128-bit
 * half of xr0 down into xr1 so that f0 and f1 each hold 8 result bytes */
xvpickev.h xr1, xr5, xr4
|
||||
xvpickev.b xr0, xr1, xr1
|
||||
xvpermi.q xr1, xr0, 1
|
||||
/* Two 8-byte stores: dest[0..7] from f0, dest[8..15] from f1 */
fst.d f0, a1, 0
|
||||
fst.d f1, a1, 8
|
||||
addi.d a1, a1, 16
|
||||
addi.d a2, a2, -1
|
||||
bnez a2, 1b
|
||||
2:
|
||||
/* Nothing left over: done */
beqz t8, 4f
|
||||
/* Tail: process one more full 16-sample block that ends exactly at dstW.
 * The block starts t8 samples after the start of the last block handled,
 * so the dither phase is advanced by t8 (indices are masked with 7 below)
 * and the dither vectors are rebuilt. */
3:
|
||||
add.w a4, a4, t8
|
||||
addi.w t1, a4, 1
|
||||
addi.w t2, a4, 2
|
||||
addi.w t3, a4, 3
|
||||
addi.w t4, a4, 4
|
||||
addi.w t5, a4, 5
|
||||
addi.w t6, a4, 6
|
||||
addi.w t7, a4, 7
|
||||
andi t0, a4, 7
|
||||
andi t1, t1, 7
|
||||
andi t2, t2, 7
|
||||
andi t3, t3, 7
|
||||
andi t4, t4, 7
|
||||
andi t5, t5, 7
|
||||
andi t6, t6, 7
|
||||
andi t7, t7, 7
|
||||
ldx.bu t0, a3, t0
|
||||
ldx.bu t1, a3, t1
|
||||
ldx.bu t2, a3, t2
|
||||
ldx.bu t3, a3, t3
|
||||
ldx.bu t4, a3, t4
|
||||
ldx.bu t5, a3, t5
|
||||
ldx.bu t6, a3, t6
|
||||
ldx.bu t7, a3, t7
|
||||
vinsgr2vr.h vr1, t0, 0
|
||||
vinsgr2vr.h vr1, t1, 1
|
||||
vinsgr2vr.h vr1, t2, 2
|
||||
vinsgr2vr.h vr1, t3, 3
|
||||
vinsgr2vr.h vr1, t4, 4
|
||||
vinsgr2vr.h vr1, t5, 5
|
||||
vinsgr2vr.h vr1, t6, 6
|
||||
vinsgr2vr.h vr1, t7, 7
|
||||
xvpermi.q xr1, xr1, 0
|
||||
xvsub.h xr0, xr0, xr0
|
||||
xvilvl.h xr2, xr0, xr1
|
||||
xvilvh.h xr3, xr0, xr1
|
||||
|
||||
/* Rewind to the overlapping block: src -= 32 bytes then += 2 * t8 bytes,
 * dest -= 16 bytes then += t8 bytes.
 * NOTE(review): when dstW < 16 the main loop never ran, so this moves both
 * pointers before the start of src/dest and the block below reads and
 * writes out of range -- confirm callers guarantee dstW >= 16 or padding. */
addi.d a0, a0, -32
|
||||
add.d a0, a0, t8
|
||||
add.d a0, a0, t8
|
||||
addi.d a1, a1, -16
|
||||
add.d a1, a1, t8
|
||||
|
||||
/* Same widen / add dither / shift / clamp / narrow / store sequence as the
 * main loop, executed once */
xvld xr1, a0, 0
|
||||
xvpermi.d xr0, xr1, 0xa0
|
||||
xvexth.w.h xr4, xr0
|
||||
xvexth.w.h xr5, xr1
|
||||
|
||||
xvadd.w xr4, xr2, xr4
|
||||
xvadd.w xr5, xr3, xr5
|
||||
xvsrai.w xr4, xr4, 7
|
||||
xvsrai.w xr5, xr5, 7
|
||||
xvclip255.w xr4, xr4
|
||||
xvclip255.w xr5, xr5
|
||||
xvpickev.h xr1, xr5, xr4
|
||||
xvpickev.b xr0, xr1, xr1
|
||||
xvpermi.q xr1, xr0, 1
|
||||
fst.d f0, a1, 0
|
||||
fst.d f1, a1, 8
|
||||
/* Common exit; no explicit return instruction is visible here, presumably
 * the endfunc macro from loongson_asm.S emits it -- TODO confirm */
4:
|
||||
endfunc
|
||||
|
@ -22,7 +22,7 @@
|
||||
#include "swscale_loongarch.h"
|
||||
#include "libavutil/loongarch/loongson_intrinsics.h"
|
||||
|
||||
void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
|
||||
void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
|
||||
const int16_t **src, uint8_t *dest, int dstW,
|
||||
const uint8_t *dither, int offset)
|
||||
{
|
||||
@ -1775,8 +1775,27 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
|
||||
YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
|
||||
|
||||
|
||||
av_cold void ff_sws_init_output_lasx(SwsContext *c)
|
||||
av_cold void ff_sws_init_output_lasx(SwsContext *c,
|
||||
yuv2planar1_fn *yuv2plane1,
|
||||
yuv2planarX_fn *yuv2planeX,
|
||||
yuv2interleavedX_fn *yuv2nv12cX,
|
||||
yuv2packed1_fn *yuv2packed1,
|
||||
yuv2packed2_fn *yuv2packed2,
|
||||
yuv2packedX_fn *yuv2packedX,
|
||||
yuv2anyX_fn *yuv2anyX)
|
||||
{
|
||||
enum AVPixelFormat dstFormat = c->dstFormat;
|
||||
|
||||
/* Add initialization once optimized */
|
||||
if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
|
||||
} else if (is16BPS(dstFormat)) {
|
||||
} else if (isNBPS(dstFormat)) {
|
||||
} else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
|
||||
} else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
|
||||
} else {
|
||||
*yuv2plane1 = yuv2plane1_8_lasx;
|
||||
*yuv2planeX = yuv2planeX_8_lasx;
|
||||
}
|
||||
|
||||
if(c->flags & SWS_FULL_CHR_H_INT) {
|
||||
switch (c->dstFormat) {
|
||||
|
@ -1624,8 +1624,28 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
|
||||
YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
|
||||
|
||||
|
||||
av_cold void ff_sws_init_output_lsx(SwsContext *c)
|
||||
av_cold void ff_sws_init_output_lsx(SwsContext *c,
|
||||
yuv2planar1_fn *yuv2plane1,
|
||||
yuv2planarX_fn *yuv2planeX,
|
||||
yuv2interleavedX_fn *yuv2nv12cX,
|
||||
yuv2packed1_fn *yuv2packed1,
|
||||
yuv2packed2_fn *yuv2packed2,
|
||||
yuv2packedX_fn *yuv2packedX,
|
||||
yuv2anyX_fn *yuv2anyX)
|
||||
{
|
||||
enum AVPixelFormat dstFormat = c->dstFormat;
|
||||
|
||||
/* Add initialization once optimized */
|
||||
if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
|
||||
} else if (is16BPS(dstFormat)) {
|
||||
} else if (isNBPS(dstFormat)) {
|
||||
} else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
|
||||
} else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
|
||||
} else {
|
||||
*yuv2plane1 = yuv2plane1_8_lsx;
|
||||
*yuv2planeX = yuv2planeX_8_lsx;
|
||||
}
|
||||
|
||||
if(c->flags & SWS_FULL_CHR_H_INT) {
|
||||
switch (c->dstFormat) {
|
||||
case AV_PIX_FMT_RGBA:
|
||||
|
@ -60,7 +60,9 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
if (have_lsx(cpu_flags)) {
|
||||
ff_sws_init_output_lsx(c);
|
||||
ff_sws_init_output_lsx(c, &c->yuv2plane1, &c->yuv2planeX,
|
||||
&c->yuv2nv12cX, &c->yuv2packed1,
|
||||
&c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
|
||||
if (c->srcBpc == 8) {
|
||||
if (c->dstBpc <= 14) {
|
||||
c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
|
||||
@ -80,12 +82,12 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (c->dstBpc == 8)
|
||||
c->yuv2planeX = ff_yuv2planeX_8_lsx;
|
||||
}
|
||||
#if HAVE_LASX
|
||||
if (have_lasx(cpu_flags)) {
|
||||
ff_sws_init_output_lasx(c);
|
||||
ff_sws_init_output_lasx(c, &c->yuv2plane1, &c->yuv2planeX,
|
||||
&c->yuv2nv12cX, &c->yuv2packed1,
|
||||
&c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
|
||||
if (c->srcBpc == 8) {
|
||||
if (c->dstBpc <= 14) {
|
||||
c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
|
||||
@ -105,8 +107,6 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (c->dstBpc == 8)
|
||||
c->yuv2planeX = ff_yuv2planeX_8_lasx;
|
||||
}
|
||||
#endif // #if HAVE_LASX
|
||||
ff_sws_init_range_convert_loongarch(c);
|
||||
|
@ -61,11 +61,21 @@ void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
|
||||
void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4], int width,
|
||||
int32_t *rgb2yuv, void *opq);
|
||||
|
||||
void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
|
||||
void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
|
||||
const int16_t **src, uint8_t *dest, int dstW,
|
||||
const uint8_t *dither, int offset);
|
||||
|
||||
av_cold void ff_sws_init_output_lsx(SwsContext *c);
|
||||
void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
|
||||
const uint8_t *dither, int offset);
|
||||
|
||||
av_cold void ff_sws_init_output_lsx(SwsContext *c,
|
||||
yuv2planar1_fn *yuv2plane1,
|
||||
yuv2planarX_fn *yuv2planeX,
|
||||
yuv2interleavedX_fn *yuv2nv12cX,
|
||||
yuv2packed1_fn *yuv2packed1,
|
||||
yuv2packed2_fn *yuv2packed2,
|
||||
yuv2packedX_fn *yuv2packedX,
|
||||
yuv2anyX_fn *yuv2anyX);
|
||||
|
||||
int yuv420_rgb24_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
|
||||
int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
|
||||
@ -135,12 +145,21 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
|
||||
uint8_t *dest, int width, int height,
|
||||
int src1Stride, int src2Stride, int dstStride);
|
||||
|
||||
void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
|
||||
void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
|
||||
const int16_t **src, uint8_t *dest, int dstW,
|
||||
const uint8_t *dither, int offset);
|
||||
|
||||
av_cold void ff_sws_init_output_lasx(SwsContext *c);
|
||||
void yuv2plane1_8_lasx(const int16_t *src, uint8_t *dest, int dstW,
|
||||
const uint8_t *dither, int offset);
|
||||
|
||||
av_cold void ff_sws_init_output_lasx(SwsContext *c,
|
||||
yuv2planar1_fn *yuv2plane1,
|
||||
yuv2planarX_fn *yuv2planeX,
|
||||
yuv2interleavedX_fn *yuv2nv12cX,
|
||||
yuv2packed1_fn *yuv2packed1,
|
||||
yuv2packed2_fn *yuv2packed2,
|
||||
yuv2packedX_fn *yuv2packedX,
|
||||
yuv2anyX_fn *yuv2anyX);
|
||||
#endif // #if HAVE_LASX
|
||||
|
||||
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
|
||||
|
Loading…
Reference in New Issue
Block a user