mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2024-11-21 10:55:51 +02:00
swscale: [LA] Optimize range convert for yuvj420p.
Reviewed-by: 陈昊 <chenhao@loongson.cn> Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
parent
dd5f665b40
commit
f3fe2cb5f7
@ -1866,3 +1866,371 @@ function ff_hscale_16_to_19_sub_lsx
|
||||
ld.d s8, sp, 64
|
||||
addi.d sp, sp, 72
|
||||
endfunc
|
||||
|
||||
/*
 * void lumRangeFromJpeg_lsx(int16_t *dst, int width)
 *
 * In-place rescale of int16_t samples:
 *     dst[i] = (dst[i] * 14071 + 33561947) >> 14
 * Only the low 16 bits of each result are stored back.
 * a0 = dst, a1 = width (per the C prototype). Eight samples are handled per
 * LSX iteration; the remaining (width & 7) samples go through a scalar loop.
 */
function lumRangeFromJpeg_lsx
    srli.w        t2,    a1,    3           // t2 = number of 8-sample groups
    andi          t3,    a1,    7           // t3 = leftover sample count
    li.w          t0,    14071              // multiplier
    li.w          t1,    33561947           // additive bias
    vreplgr2vr.h  vr0,   t0                 // multiplier in every halfword lane
    beqz          t2,    2f
    // Vector loop: 8 samples (16 bytes) per pass.
1:
    vreplgr2vr.w  vr2,   t1                 // both accumulators start at the bias
    vreplgr2vr.w  vr3,   t1
    vld           vr1,   a0,    0
    vmaddwev.w.h  vr2,   vr0,   vr1         // even samples, widened to 32 bits
    vsrai.w       vr2,   vr2,   14
    vmaddwod.w.h  vr3,   vr0,   vr1         // odd samples, widened to 32 bits
    vsrai.w       vr3,   vr3,   14
    vpackev.h     vr1,   vr3,   vr2         // re-interleave the low halfwords
    vst           vr1,   a0,    0
    addi.d        t2,    t2,    -1
    addi.d        a0,    a0,    16
    bnez          t2,    1b
2:
    beqz          t3,    4f
    // Scalar tail: one sample per pass.
3:
    ld.h          t4,    a0,    0
    mul.w         t4,    t4,    t0
    add.w         t4,    t4,    t1
    srai.w        t4,    t4,    14
    st.h          t4,    a0,    0
    addi.d        t3,    t3,    -1
    addi.d        a0,    a0,    2
    bnez          t3,    3b
4:
endfunc
|
||||
|
||||
/*
 * void lumRangeFromJpeg_lasx(int16_t *dst, int width)
 *
 * In-place rescale of int16_t samples:
 *     dst[i] = (dst[i] * 14071 + 33561947) >> 14
 * Only the low 16 bits of each result are stored back.
 * a0 = dst, a1 = width (per the C prototype). Sixteen samples are handled per
 * LASX iteration; the remaining (width & 15) samples go through a scalar loop.
 */
function lumRangeFromJpeg_lasx
    srli.w        t2,    a1,    4           // t2 = number of 16-sample groups
    andi          t3,    a1,    15          // t3 = leftover sample count
    li.w          t0,    14071              // multiplier
    li.w          t1,    33561947           // additive bias
    xvreplgr2vr.h xr0,   t0                 // multiplier in every halfword lane
    beqz          t2,    2f
    // Vector loop: 16 samples (32 bytes) per pass.
1:
    xvreplgr2vr.w xr2,   t1                 // both accumulators start at the bias
    xvreplgr2vr.w xr3,   t1
    xvld          xr1,   a0,    0
    xvmaddwev.w.h xr2,   xr0,   xr1         // even samples, widened to 32 bits
    xvsrai.w      xr2,   xr2,   14
    xvmaddwod.w.h xr3,   xr0,   xr1         // odd samples, widened to 32 bits
    xvsrai.w      xr3,   xr3,   14
    xvpackev.h    xr1,   xr3,   xr2         // re-interleave the low halfwords
    xvst          xr1,   a0,    0
    addi.d        t2,    t2,    -1
    addi.d        a0,    a0,    32
    bnez          t2,    1b
2:
    beqz          t3,    4f
    // Scalar tail: one sample per pass.
3:
    ld.h          t4,    a0,    0
    mul.w         t4,    t4,    t0
    add.w         t4,    t4,    t1
    srai.w        t4,    t4,    14
    st.h          t4,    a0,    0
    addi.d        t3,    t3,    -1
    addi.d        a0,    a0,    2
    bnez          t3,    3b
4:
endfunc
|
||||
|
||||
/*
 * void lumRangeToJpeg_lsx(int16_t *dst, int width)
 *
 * In-place rescale of int16_t samples with an upper clamp on the input:
 *     dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14
 * Only the low 16 bits of each result are stored back.
 * a0 = dst, a1 = width (per the C prototype). Eight samples are handled per
 * LSX iteration; the remaining (width & 7) samples go through a scalar loop.
 */
function lumRangeToJpeg_lsx
    li.w          t2,    30189              // input clamp (t2 is reused below)
    vreplgr2vr.h  vr4,   t2                 // clamp in every halfword lane
    li.w          t0,    19077              // multiplier
    vreplgr2vr.h  vr0,   t0                 // multiplier in every halfword lane
    li.w          t1,    -39057361          // additive bias
    andi          t3,    a1,    7           // t3 = leftover sample count
    srli.w        t2,    a1,    3           // t2 = number of 8-sample groups
    beqz          t2,    2f
    // Vector loop: 8 samples (16 bytes) per pass.
1:
    vreplgr2vr.w  vr2,   t1                 // both accumulators start at the bias
    vreplgr2vr.w  vr3,   t1
    vld           vr1,   a0,    0
    vmin.h        vr1,   vr1,   vr4         // signed clamp before scaling
    vmaddwev.w.h  vr2,   vr0,   vr1         // even samples, widened to 32 bits
    vsrai.w       vr2,   vr2,   14
    vmaddwod.w.h  vr3,   vr0,   vr1         // odd samples, widened to 32 bits
    vsrai.w       vr3,   vr3,   14
    vpackev.h     vr1,   vr3,   vr2         // re-interleave the low halfwords
    vst           vr1,   a0,    0
    addi.d        t2,    t2,    -1
    addi.d        a0,    a0,    16
    bnez          t2,    1b
2:
    beqz          t3,    4f
    // Scalar tail: the clamp is done through the vector unit because the
    // clamp constant now only lives in vr4.
3:
    ld.h          t4,    a0,    0
    vreplgr2vr.h  vr1,   t4
    vmin.h        vr1,   vr1,   vr4
    vpickve2gr.h  t4,    vr1,   0
    mul.w         t4,    t4,    t0
    add.w         t4,    t4,    t1
    srai.w        t4,    t4,    14
    st.h          t4,    a0,    0
    addi.d        t3,    t3,    -1
    addi.d        a0,    a0,    2
    bnez          t3,    3b
4:
endfunc
|
||||
|
||||
/*
 * void lumRangeToJpeg_lasx(int16_t *dst, int width)
 *
 * In-place rescale of int16_t samples with an upper clamp on the input:
 *     dst[i] = (min(dst[i], 30189) * 19077 - 39057361) >> 14
 * Only the low 16 bits of each result are stored back.
 * a0 = dst, a1 = width (per the C prototype). Sixteen samples are handled per
 * LASX iteration; the remaining (width & 15) samples go through a scalar loop.
 */
function lumRangeToJpeg_lasx
    li.w          t2,    30189              // input clamp (t2 is reused below)
    xvreplgr2vr.h xr4,   t2                 // clamp in every halfword lane
    li.w          t0,    19077              // multiplier
    xvreplgr2vr.h xr0,   t0                 // multiplier in every halfword lane
    li.w          t1,    -39057361          // additive bias
    andi          t3,    a1,    15          // t3 = leftover sample count
    srli.w        t2,    a1,    4           // t2 = number of 16-sample groups
    beqz          t2,    2f
    // Vector loop: 16 samples (32 bytes) per pass.
1:
    xvreplgr2vr.w xr2,   t1                 // both accumulators start at the bias
    xvreplgr2vr.w xr3,   t1
    xvld          xr1,   a0,    0
    xvmin.h       xr1,   xr1,   xr4         // signed clamp before scaling
    xvmaddwev.w.h xr2,   xr0,   xr1         // even samples, widened to 32 bits
    xvsrai.w      xr2,   xr2,   14
    xvmaddwod.w.h xr3,   xr0,   xr1         // odd samples, widened to 32 bits
    xvsrai.w      xr3,   xr3,   14
    xvpackev.h    xr1,   xr3,   xr2         // re-interleave the low halfwords
    xvst          xr1,   a0,    0
    addi.d        t2,    t2,    -1
    addi.d        a0,    a0,    32
    bnez          t2,    1b
2:
    beqz          t3,    4f
    // Scalar tail: clamps through the 128-bit view (vr1/vr4) of the vector
    // registers; vr4 is the low half of xr4 and so still holds the clamp.
3:
    ld.h          t4,    a0,    0
    vreplgr2vr.h  vr1,   t4
    vmin.h        vr1,   vr1,   vr4
    vpickve2gr.h  t4,    vr1,   0
    mul.w         t4,    t4,    t0
    add.w         t4,    t4,    t1
    srai.w        t4,    t4,    14
    st.h          t4,    a0,    0
    addi.d        t3,    t3,    -1
    addi.d        a0,    a0,    2
    bnez          t3,    3b
4:
endfunc
|
||||
|
||||
/*
 * void chrRangeFromJpeg_lsx(int16_t *dstU, int16_t *dstV, int width)
 *
 * In-place rescale of two int16_t planes, applied identically to each:
 *     dst[i] = (dst[i] * 1799 + 4081085) >> 11
 * Only the low 16 bits of each result are stored back.
 * a0 = dstU, a1 = dstV, a2 = width (per the C prototype). Eight samples of
 * each plane are handled per LSX iteration; the remaining (width & 7)
 * samples go through a scalar loop.
 */
function chrRangeFromJpeg_lsx
    srli.w        t2,    a2,    3           // t2 = number of 8-sample groups
    andi          t3,    a2,    7           // t3 = leftover sample count
    li.w          t0,    1799               // multiplier
    li.w          t1,    4081085            // additive bias
    vreplgr2vr.h  vr0,   t0                 // multiplier in every halfword lane
    beqz          t2,    2f
    // Vector loop: 8 samples (16 bytes) of U and of V per pass.
1:
    vld           vr1,   a0,    0           // U
    vld           vr2,   a1,    0           // V
    // U: accumulators start at the bias, even/odd samples handled separately
    vreplgr2vr.w  vr3,   t1
    vreplgr2vr.w  vr4,   t1
    vmaddwev.w.h  vr3,   vr0,   vr1
    vmaddwod.w.h  vr4,   vr0,   vr1
    vsrai.w       vr3,   vr3,   11
    vsrai.w       vr4,   vr4,   11
    vpackev.h     vr1,   vr4,   vr3         // re-interleave the low halfwords
    // V: same sequence
    vreplgr2vr.w  vr5,   t1
    vreplgr2vr.w  vr6,   t1
    vmaddwev.w.h  vr5,   vr0,   vr2
    vmaddwod.w.h  vr6,   vr0,   vr2
    vsrai.w       vr5,   vr5,   11
    vsrai.w       vr6,   vr6,   11
    vpackev.h     vr2,   vr6,   vr5
    vst           vr1,   a0,    0
    vst           vr2,   a1,    0
    addi.d        t2,    t2,    -1
    addi.d        a0,    a0,    16
    addi.d        a1,    a1,    16
    bnez          t2,    1b
2:
    beqz          t3,    4f
    // Scalar tail: one U and one V sample per pass.
3:
    ld.h          t4,    a0,    0
    ld.h          t5,    a1,    0
    mul.w         t4,    t4,    t0
    add.w         t4,    t4,    t1
    srai.w        t4,    t4,    11
    mul.w         t5,    t5,    t0
    add.w         t5,    t5,    t1
    srai.w        t5,    t5,    11
    st.h          t4,    a0,    0
    st.h          t5,    a1,    0
    addi.d        t3,    t3,    -1
    addi.d        a0,    a0,    2
    addi.d        a1,    a1,    2
    bnez          t3,    3b
4:
endfunc
|
||||
|
||||
/*
 * void chrRangeFromJpeg_lasx(int16_t *dstU, int16_t *dstV, int width)
 *
 * In-place rescale of two int16_t planes, applied identically to each:
 *     dst[i] = (dst[i] * 1799 + 4081085) >> 11
 * Only the low 16 bits of each result are stored back.
 * a0 = dstU, a1 = dstV, a2 = width (per the C prototype). Sixteen samples of
 * each plane are handled per LASX iteration; the remaining (width & 15)
 * samples go through a scalar loop.
 */
function chrRangeFromJpeg_lasx
    srli.w        t2,    a2,    4           // t2 = number of 16-sample groups
    andi          t3,    a2,    15          // t3 = leftover sample count
    li.w          t0,    1799               // multiplier
    li.w          t1,    4081085            // additive bias
    xvreplgr2vr.h xr0,   t0                 // multiplier in every halfword lane
    beqz          t2,    2f
    // Vector loop: 16 samples (32 bytes) of U and of V per pass.
1:
    xvld          xr1,   a0,    0           // U
    xvld          xr2,   a1,    0           // V
    // U: accumulators start at the bias, even/odd samples handled separately
    xvreplgr2vr.w xr3,   t1
    xvreplgr2vr.w xr4,   t1
    xvmaddwev.w.h xr3,   xr0,   xr1
    xvmaddwod.w.h xr4,   xr0,   xr1
    xvsrai.w      xr3,   xr3,   11
    xvsrai.w      xr4,   xr4,   11
    xvpackev.h    xr1,   xr4,   xr3         // re-interleave the low halfwords
    // V: same sequence
    xvreplgr2vr.w xr5,   t1
    xvreplgr2vr.w xr6,   t1
    xvmaddwev.w.h xr5,   xr0,   xr2
    xvmaddwod.w.h xr6,   xr0,   xr2
    xvsrai.w      xr5,   xr5,   11
    xvsrai.w      xr6,   xr6,   11
    xvpackev.h    xr2,   xr6,   xr5
    xvst          xr1,   a0,    0
    xvst          xr2,   a1,    0
    addi.d        t2,    t2,    -1
    addi.d        a0,    a0,    32
    addi.d        a1,    a1,    32
    bnez          t2,    1b
2:
    beqz          t3,    4f
    // Scalar tail: one U and one V sample per pass.
3:
    ld.h          t4,    a0,    0
    ld.h          t5,    a1,    0
    mul.w         t4,    t4,    t0
    add.w         t4,    t4,    t1
    srai.w        t4,    t4,    11
    mul.w         t5,    t5,    t0
    add.w         t5,    t5,    t1
    srai.w        t5,    t5,    11
    st.h          t4,    a0,    0
    st.h          t5,    a1,    0
    addi.d        t3,    t3,    -1
    addi.d        a0,    a0,    2
    addi.d        a1,    a1,    2
    bnez          t3,    3b
4:
endfunc
|
||||
|
||||
/*
 * void chrRangeToJpeg_lsx(int16_t *dstU, int16_t *dstV, int width)
 *
 * In-place rescale of two int16_t planes with an upper clamp on the input,
 * applied identically to each plane:
 *     dst[i] = (min(dst[i], 30775) * 4663 - 9289992) >> 12
 * Only the low 16 bits of each result are stored back.
 * a0 = dstU, a1 = dstV, a2 = width (per the C prototype). Eight samples of
 * each plane are handled per LSX iteration; the remaining (width & 7)
 * samples go through a scalar loop.
 */
function chrRangeToJpeg_lsx
    li.w          t2,    30775              // input clamp (t2 is reused below)
    vreplgr2vr.h  vr7,   t2                 // clamp in every halfword lane
    li.w          t0,    4663               // multiplier
    vreplgr2vr.h  vr0,   t0                 // multiplier in every halfword lane
    li.w          t1,    -9289992           // additive bias
    andi          t3,    a2,    7           // t3 = leftover sample count
    srli.w        t2,    a2,    3           // t2 = number of 8-sample groups
    beqz          t2,    2f
    // Vector loop: 8 samples (16 bytes) of U and of V per pass.
1:
    vld           vr1,   a0,    0           // U
    vld           vr2,   a1,    0           // V
    // U: clamp, then bias + multiply on even/odd samples separately
    vmin.h        vr1,   vr1,   vr7
    vreplgr2vr.w  vr3,   t1
    vreplgr2vr.w  vr4,   t1
    vmaddwev.w.h  vr3,   vr0,   vr1
    vmaddwod.w.h  vr4,   vr0,   vr1
    vsrai.w       vr3,   vr3,   12
    vsrai.w       vr4,   vr4,   12
    vpackev.h     vr1,   vr4,   vr3         // re-interleave the low halfwords
    // V: same sequence
    vmin.h        vr2,   vr2,   vr7
    vreplgr2vr.w  vr5,   t1
    vreplgr2vr.w  vr6,   t1
    vmaddwev.w.h  vr5,   vr0,   vr2
    vmaddwod.w.h  vr6,   vr0,   vr2
    vsrai.w       vr5,   vr5,   12
    vsrai.w       vr6,   vr6,   12
    vpackev.h     vr2,   vr6,   vr5
    vst           vr1,   a0,    0
    vst           vr2,   a1,    0
    addi.d        t2,    t2,    -1
    addi.d        a0,    a0,    16
    addi.d        a1,    a1,    16
    bnez          t2,    1b
2:
    beqz          t3,    4f
    // Scalar tail: the clamp is done through the vector unit because the
    // clamp constant now only lives in vr7.
3:
    ld.h          t4,    a0,    0
    ld.h          t5,    a1,    0
    vreplgr2vr.h  vr1,   t4
    vreplgr2vr.h  vr2,   t5
    vmin.h        vr1,   vr1,   vr7
    vmin.h        vr2,   vr2,   vr7
    vpickve2gr.h  t4,    vr1,   0
    vpickve2gr.h  t5,    vr2,   0
    mul.w         t4,    t4,    t0
    add.w         t4,    t4,    t1
    srai.w        t4,    t4,    12
    mul.w         t5,    t5,    t0
    add.w         t5,    t5,    t1
    srai.w        t5,    t5,    12
    st.h          t4,    a0,    0
    st.h          t5,    a1,    0
    addi.d        t3,    t3,    -1
    addi.d        a0,    a0,    2
    addi.d        a1,    a1,    2
    bnez          t3,    3b
4:
endfunc
|
||||
|
||||
/*
 * void chrRangeToJpeg_lasx(int16_t *dstU, int16_t *dstV, int width)
 *
 * In-place rescale of two int16_t planes with an upper clamp on the input,
 * applied identically to each plane:
 *     dst[i] = (min(dst[i], 30775) * 4663 - 9289992) >> 12
 * Only the low 16 bits of each result are stored back.
 * a0 = dstU, a1 = dstV, a2 = width (per the C prototype). Sixteen samples of
 * each plane are handled per LASX iteration; the remaining (width & 15)
 * samples go through a scalar loop.
 */
function chrRangeToJpeg_lasx
    li.w          t2,    30775              // input clamp (t2 is reused below)
    xvreplgr2vr.h xr7,   t2                 // clamp in every halfword lane
    li.w          t0,    4663               // multiplier
    xvreplgr2vr.h xr0,   t0                 // multiplier in every halfword lane
    li.w          t1,    -9289992           // additive bias
    andi          t3,    a2,    15          // t3 = leftover sample count
    srli.w        t2,    a2,    4           // t2 = number of 16-sample groups
    beqz          t2,    2f
    // Vector loop: 16 samples (32 bytes) of U and of V per pass.
1:
    xvld          xr1,   a0,    0           // U
    xvld          xr2,   a1,    0           // V
    // U: clamp, then bias + multiply on even/odd samples separately
    xvmin.h       xr1,   xr1,   xr7
    xvreplgr2vr.w xr3,   t1
    xvreplgr2vr.w xr4,   t1
    xvmaddwev.w.h xr3,   xr0,   xr1
    xvmaddwod.w.h xr4,   xr0,   xr1
    xvsrai.w      xr3,   xr3,   12
    xvsrai.w      xr4,   xr4,   12
    xvpackev.h    xr1,   xr4,   xr3         // re-interleave the low halfwords
    // V: same sequence
    xvmin.h       xr2,   xr2,   xr7
    xvreplgr2vr.w xr5,   t1
    xvreplgr2vr.w xr6,   t1
    xvmaddwev.w.h xr5,   xr0,   xr2
    xvmaddwod.w.h xr6,   xr0,   xr2
    xvsrai.w      xr5,   xr5,   12
    xvsrai.w      xr6,   xr6,   12
    xvpackev.h    xr2,   xr6,   xr5
    xvst          xr1,   a0,    0
    xvst          xr2,   a1,    0
    addi.d        t2,    t2,    -1
    addi.d        a0,    a0,    32
    addi.d        a1,    a1,    32
    bnez          t2,    1b
2:
    beqz          t3,    4f
    // Scalar tail: clamps through the 128-bit view (vr1/vr2/vr7) of the
    // vector registers; vr7 is the low half of xr7 and so still holds the clamp.
3:
    ld.h          t4,    a0,    0
    ld.h          t5,    a1,    0
    vreplgr2vr.h  vr1,   t4
    vreplgr2vr.h  vr2,   t5
    vmin.h        vr1,   vr1,   vr7
    vmin.h        vr2,   vr2,   vr7
    vpickve2gr.h  t4,    vr1,   0
    vpickve2gr.h  t5,    vr2,   0
    mul.w         t4,    t4,    t0
    add.w         t4,    t4,    t1
    srai.w        t4,    t4,    12
    mul.w         t5,    t5,    t0
    add.w         t5,    t5,    t1
    srai.w        t5,    t5,    12
    st.h          t4,    a0,    0
    st.h          t5,    a1,    0
    addi.d        t3,    t3,    -1
    addi.d        a0,    a0,    2
    addi.d        a1,    a1,    2
    bnez          t3,    3b
4:
endfunc
|
||||
|
@ -24,6 +24,38 @@
|
||||
#include "libswscale/rgb2rgb.h"
|
||||
#include "libavutil/loongarch/cpu.h"
|
||||
|
||||
/**
 * Install the LoongArch SIMD range-conversion routines into a scaler context.
 *
 * The lumConvertRange / chrConvertRange pointers are overridden only when the
 * source and destination ranges differ, the destination format is not RGB and
 * the destination has at most 14 bits per component. Otherwise the context is
 * left untouched. When both LSX and LASX are available the LASX routines win,
 * because they are assigned last.
 *
 * @param c scaler context whose range-conversion pointers may be replaced
 */
av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    /* Same eligibility test for both instruction sets, so check it once. */
    if (c->srcRange == c->dstRange || isAnyRGB(c->dstFormat) || c->dstBpc > 14)
        return;

    if (have_lsx(cpu_flags)) {
        if (c->srcRange) {
            c->lumConvertRange = lumRangeFromJpeg_lsx;
            c->chrConvertRange = chrRangeFromJpeg_lsx;
        } else {
            c->lumConvertRange = lumRangeToJpeg_lsx;
            c->chrConvertRange = chrRangeToJpeg_lsx;
        }
    }
    /* Guarded like the other LASX code in this file: the *_lasx symbols are
     * presumably only declared and built when HAVE_LASX is set, so naming
     * them unconditionally would break builds without LASX support. */
#if HAVE_LASX
    if (have_lasx(cpu_flags)) {
        if (c->srcRange) {
            c->lumConvertRange = lumRangeFromJpeg_lasx;
            c->chrConvertRange = chrRangeFromJpeg_lasx;
        } else {
            c->lumConvertRange = lumRangeToJpeg_lasx;
            c->chrConvertRange = chrRangeToJpeg_lasx;
        }
    }
#endif // #if HAVE_LASX
}
|
||||
|
||||
av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
@ -77,6 +109,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
|
||||
c->yuv2planeX = ff_yuv2planeX_8_lasx;
|
||||
}
|
||||
#endif // #if HAVE_LASX
|
||||
ff_sws_init_range_convert_loongarch(c);
|
||||
}
|
||||
|
||||
av_cold void rgb2rgb_init_loongarch(void)
|
||||
|
@ -50,6 +50,11 @@ void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
|
||||
const uint8_t *_src, const int16_t *filter,
|
||||
const int32_t *filterPos, int filterSize, int sh);
|
||||
|
||||
/* LSX range-conversion routines, implemented in assembly. Each one rewrites
 * `width` int16_t samples in place: the lum variants on one plane (dst), the
 * chr variants on two planes (dstU and dstV) with the same formula. */
void lumRangeFromJpeg_lsx(int16_t *dst, int width);
|
||||
void chrRangeFromJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
|
||||
void lumRangeToJpeg_lsx(int16_t *dst, int width);
|
||||
void chrRangeToJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
|
||||
|
||||
void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
|
||||
int width, int32_t *rgb2yuv, void *opq);
|
||||
|
||||
@ -97,6 +102,11 @@ void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
|
||||
const uint8_t *_src, const int16_t *filter,
|
||||
const int32_t *filterPos, int filterSize);
|
||||
|
||||
/* LASX range-conversion routines, implemented in assembly. Each one rewrites
 * `width` int16_t samples in place: the lum variants on one plane (dst), the
 * chr variants on two planes (dstU and dstV) with the same formula. */
void lumRangeFromJpeg_lasx(int16_t *dst, int width);
|
||||
void chrRangeFromJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
|
||||
void lumRangeToJpeg_lasx(int16_t *dst, int width);
|
||||
void chrRangeToJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
|
||||
|
||||
void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
|
||||
int width, int32_t *rgb2yuv, void *opq);
|
||||
|
||||
@ -130,6 +140,7 @@ void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
|
||||
const uint8_t *dither, int offset);
|
||||
|
||||
av_cold void ff_sws_init_output_lasx(SwsContext *c);
|
||||
|
||||
#endif // #if HAVE_LASX
|
||||
|
||||
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
|
||||
|
@ -697,6 +697,7 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
|
||||
void ff_updateMMXDitherTables(SwsContext *c, int dstY);
|
||||
|
||||
av_cold void ff_sws_init_range_convert(SwsContext *c);
|
||||
av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
|
||||
|
||||
SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
|
||||
SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
|
||||
|
@ -1078,8 +1078,12 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
|
||||
c->srcRange = srcRange;
|
||||
c->dstRange = dstRange;
|
||||
|
||||
if (need_reinit)
|
||||
if (need_reinit) {
|
||||
ff_sws_init_range_convert(c);
|
||||
#if ARCH_LOONGARCH64
|
||||
ff_sws_init_range_convert_loongarch(c);
|
||||
#endif
|
||||
}
|
||||
|
||||
c->dstFormatBpp = av_get_bits_per_pixel(desc_dst);
|
||||
c->srcFormatBpp = av_get_bits_per_pixel(desc_src);
|
||||
|
Loading…
Reference in New Issue
Block a user