swscale/x86/rgb2rgb: add AVX512ICL version of uyvytoyuv422
The scalar loop is replaced with masked AVX512 instructions. For
extracting the Y from UYVY, vperm2b is used instead of various AND and
packuswb. Instead of loading the vectors with interleaved lanes as done
in the AVX2 version, a normal load is used. After packuswb, for U and V,
an extra permute operation is done to get the required layout.

AMD 7950x Zen 4 benchmark data:

uyvytoyuv422_c:         29105.0 ( 1.00x)
uyvytoyuv422_sse2:       3888.0 ( 7.49x)
uyvytoyuv422_avx:        3374.2 ( 8.63x)
uyvytoyuv422_avx2:       2649.8 (10.98x)
uyvytoyuv422_avx512icl:  1615.0 (18.02x)

Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
committed by James Almer
parent 08e37fa082
commit e18f87ed9f
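
For reference, the conversion being vectorized is a plain deinterleave of
packed UYVY into planar Y, U and V. A minimal scalar sketch in C, assuming
the same argument layout as the ff_uyvytoyuv422_* prototypes below
(illustrative only, not FFmpeg's actual C fallback):

    #include <stdint.h>

    /* Scalar sketch of UYVY -> planar 4:2:2: each 4-byte group
     * U0 Y0 V0 Y1 covers two pixels, so a width-w line yields w luma
     * bytes and w/2 bytes in each chroma plane. */
    static void uyvytoyuv422_ref(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                 const uint8_t *src, int width, int height,
                                 int lumStride, int chromStride, int srcStride)
    {
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width / 2; x++) {
                udst[x]         = src[4 * x + 0]; /* U, shared by two pixels */
                ydst[2 * x]     = src[4 * x + 1]; /* Y0 */
                vdst[x]         = src[4 * x + 2]; /* V, shared by two pixels */
                ydst[2 * x + 1] = src[4 * x + 3]; /* Y1 */
            }
            src  += srcStride;
            ydst += lumStride;
            udst += chromStride;
            vdst += chromStride;
        }
    }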
libswscale/x86/rgb2rgb.c
@@ -2383,6 +2383,9 @@ void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
 void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                           const uint8_t *src, int width, int height,
                           int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv422_avx512icl(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                               const uint8_t *src, int width, int height,
+                               int lumStride, int chromStride, int srcStride);
 #endif
 
 #define DEINTERLEAVE_BYTES(cpuext) \
@@ -2477,6 +2480,9 @@ av_cold void rgb2rgb_init_x86(void)
     }
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         uyvytoyuv422 = ff_uyvytoyuv422_avx2;
+    }
+    if (EXTERNAL_AVX512ICL(cpu_flags)) {
+        uyvytoyuv422 = ff_uyvytoyuv422_avx512icl;
 #endif
     }
 #endif
libswscale/x86/rgb_2_rgb.asm
@@ -35,6 +35,20 @@ pb_shuffle2013: db 2, 0, 1, 3, 6, 4, 5, 7, 10, 8, 9, 11, 14, 12, 13, 15
 pb_shuffle2130: db 2, 1, 3, 0, 6, 5, 7, 4, 10, 9, 11, 8, 14, 13, 15, 12
 pb_shuffle1203: db 1, 2, 0, 3, 5, 6, 4, 7, 9, 10, 8, 11, 13, 14, 12, 15
+
+%if HAVE_AVX512ICL_EXTERNAL
+; shuffle vector to rearrange packuswb result to be linear
+shuf_packus: db  0,  1,  2,  3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51,\
+                 4,  5,  6,  7, 20, 21, 22, 23, 36, 37, 38, 39, 52, 53, 54, 55,\
+                 8,  9, 10, 11, 24, 25, 26, 27, 40, 41, 42, 43, 56, 57, 58, 59,\
+                12, 13, 14, 15, 28, 29, 30, 31, 44, 45, 46, 47, 60, 61, 62, 63
+
+; shuffle vector to combine odd elements from two vectors to extract Y
+shuf_perm2b: db  1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,\
+                33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,\
+                65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,\
+                97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
+%endif
 
 SECTION .text
 
 %macro RSHIFT_COPY 5
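
shuf_perm2b lists the odd byte positions 1, 3, 5, ..., 127 of the 128-byte
concatenation of two source vectors, which in UYVY byte order are exactly
the Y samples. A C model of the vpermi2b selection it drives (a sketch of
the instruction's two-table byte-permute semantics, not intrinsics):

    #include <stdint.h>

    /* Each output byte is picked from the 128-byte concatenation of
     * vectors a and b by a 7-bit index; bit 6 selects the table. With
     * shuf_perm2b as idx, dst receives every odd byte, i.e. all the Y
     * samples of two UYVY vectors. */
    static void vpermi2b_model(uint8_t dst[64], const uint8_t idx[64],
                               const uint8_t a[64], const uint8_t b[64])
    {
        for (int i = 0; i < 64; i++) {
            unsigned k = idx[i] & 127;
            dst[i] = k < 64 ? a[k] : b[k - 64];
        }
    }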
@@ -156,9 +170,20 @@ SHUFFLE_BYTES 1, 2, 0, 3
 ;                      int lumStride, int chromStride, int srcStride)
 ;-----------------------------------------------------------------------------------------------
 %macro UYVY_TO_YUV422 0
+%if mmsize == 64
+; need two more registers to store shuffle vectors for AVX512ICL
+cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
+%else
 cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
+%endif
     pxor         m0, m0
+%if mmsize == 64
+    vpternlogd   m1, m1, m1, 0xff ; m1 = _mm512_set1_epi8(0xff)
+    movu         m8, [shuf_packus]
+    movu         m9, [shuf_perm2b]
+%else
     pcmpeqw      m1, m1
+%endif
     psrlw        m1, 8
 
     movsxdifnidn wq, wd
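
vpternlogd with immediate 0xff is the usual EVEX idiom for materializing
an all-ones register: the immediate is an 8-entry truth table indexed by
the three source bits, and 0xff maps every input combination to 1. It is
used here because EVEX-encoded vector compares like vpcmpeqw write a mask
register rather than a vector. A per-bit C model (illustration only):

    #include <stdint.h>

    /* imm8 is a truth table indexed by (a<<2)|(b<<1)|c at each bit
     * position. imm8 = 0xff returns 1 everywhere, so the result is
     * all-ones regardless of the inputs; psrlw m1, 8 then turns that
     * into the 0x00ff word mask used by the AND steps below. */
    static uint32_t ternlog32(uint32_t a, uint32_t b, uint32_t c, uint8_t imm8)
    {
        uint32_t r = 0;
        for (int i = 0; i < 32; i++) {
            unsigned idx = ((a >> i) & 1) << 2 | ((b >> i) & 1) << 1
                         | ((c >> i) & 1);
            r |= (uint32_t)((imm8 >> idx) & 1) << i;
        }
        return r;
    }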
@@ -188,6 +213,63 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
     and            xq, mmsize * 2 - 1
     je .loop_simd
 
+%if mmsize == 64
+    shr            xq, 1
+    mov          tmpq, -1
+    shlx         tmpq, tmpq, xq
+    not          tmpq
+    kmovq          k7, tmpq ; write mask for U/V
+    kmovd          k1, tmpd ; write mask for 1st half of Y
+    kmovw          k3, tmpd ; read mask for 1st vector
+    shr          tmpq, 16
+    kmovw          k4, tmpd ; read mask for 2nd vector
+    shr          tmpq, 16
+    kmovd          k2, tmpd ; write mask for 2nd half of Y
+    kmovw          k5, tmpd ; read mask for 3rd vector
+    shr          tmpd, 16
+    kmovw          k6, tmpd ; read mask for 4th vector
+
+    vmovdqu32 m2{k3}{z}, [srcq + wtwoq             ]
+    vmovdqu32 m3{k4}{z}, [srcq + wtwoq + mmsize    ]
+    vmovdqu32 m4{k5}{z}, [srcq + wtwoq + mmsize * 2]
+    vmovdqu32 m5{k6}{z}, [srcq + wtwoq + mmsize * 3]
+
+    ; extract y part 1
+    mova           m6, m9
+    vpermi2b       m6, m2, m3 ; UYVY UYVY -> YYYY using permute
+    vmovdqu16 [ydstq + wq]{k1}, m6
+
+    ; extract y part 2
+    mova           m7, m9
+    vpermi2b       m7, m4, m5 ; UYVY UYVY -> YYYY using permute
+    vmovdqu16 [ydstq + wq + mmsize]{k2}, m7
+
+    ; extract uv
+    pand           m2, m1 ; UxVx...
+    pand           m3, m1 ; UxVx...
+    pand           m4, m1 ; UxVx...
+    pand           m5, m1 ; UxVx...
+    packuswb       m2, m3 ; UVUV...
+    packuswb       m4, m5 ; UVUV...
+
+    ; U
+    pand           m6, m2, m1 ; UxUx...
+    pand           m7, m4, m1 ; UxUx...
+    packuswb       m6, m7 ; UUUU
+    vpermb         m6, m8, m6
+    vmovdqu8 [udstq + whalfq]{k7}, m6
+
+    ; V
+    psrlw          m2, 8 ; VxVx...
+    psrlw          m4, 8 ; VxVx...
+    packuswb       m2, m4 ; VVVV
+    vpermb         m2, m8, m2
+    vmovdqu8 [vdstq + whalfq]{k7}, m2
+
+    lea            wq, [   wq + 2 * xq]
+    lea         wtwoq, [wtwoq + 4 * xq]
+    add        whalfq, xq
+%else
 .loop_scalar:
     mov          tmpb, [srcq + wtwoq + 0]
     mov [udstq + whalfq], tmpb
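
The tail handling above replaces the scalar loop with k-mask registers:
~(-1 << n) puts n ones in the low bits (the mov/shlx/not sequence), the
whole value becomes the byte store mask for U/V, and 16-bit windows of it
become the dword read masks for the four masked loads. A C sketch, under
the assumption that n is the leftover count held in xq after the shr
(0 < n < 64):

    #include <stdint.h>

    /* Model of the tail-mask setup: m = ~(~0ULL << n) has the low n bits
     * set, matching shlx + not. k7 consumes it whole as the U/V byte
     * store mask; k3..k6 take 16-bit windows as the dword read masks for
     * the four masked vmovdqu32 loads. */
    static void build_tail_masks(unsigned n, uint64_t *uv_store_mask,
                                 uint16_t read_mask[4])
    {
        uint64_t m = ~(~0ULL << n);   /* low n bits set; requires n < 64 */
        *uv_store_mask = m;           /* k7 */
        for (int i = 0; i < 4; i++)
            read_mask[i] = (uint16_t)(m >> (16 * i)); /* k3, k4, k5, k6 */
    }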
@@ -206,6 +288,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
     add        whalfq, 1
     sub            xq, 2
     jg .loop_scalar
+%endif
 
 ; check if simd loop is needed
     cmp            wq, 0
@@ -228,6 +311,17 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
     movu           m5, [srcq + wtwoq + mmsize * 3]
 %endif
 
+%if mmsize == 64
+    ; extract y part 1
+    mova           m6, m9
+    vpermi2b       m6, m2, m3 ; UYVY UYVY -> YYYY using permute
+    movu [ydstq + wq], m6
+
+    ; extract y part 2
+    mova           m7, m9
+    vpermi2b       m7, m4, m5 ; UYVY UYVY -> YYYY using permute
+    movu [ydstq + wq + mmsize], m7
+%else
     ; extract y part 1
     RSHIFT_COPY    m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
     pand           m6, m1 ; YxYx YxYx...
@@ -247,6 +341,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
 
     packuswb       m6, m7 ; YYYY YYYY...
     movu [ydstq + wq + mmsize], m6
+%endif
 
     ; extract uv
     pand           m2, m1 ; UxVx...
@@ -262,6 +357,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
     pand           m7, m4, m1 ; UxUx...
 
     packuswb       m6, m7 ; UUUU
+%if mmsize == 64
+    vpermb         m6, m8, m6
+%endif
     movu [udstq + whalfq], m6
 
 
@@ -269,6 +367,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
     psrlw          m2, 8 ; VxVx...
     psrlw          m4, 8 ; VxVx...
     packuswb       m2, m4 ; VVVV
+%if mmsize == 64
+    vpermb         m2, m8, m2
+%endif
     movu [vdstq + whalfq], m2
 
     add        whalfq, mmsize
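
The vpermb added after each packuswb exists because packuswb packs within
each 128-bit lane: with four lanes per ZMM register the packed U (or V)
bytes come out lane-interleaved, and shuf_packus gathers them back into
linear order (positions 0-3, 16-19, 32-35, 48-51, then 4-7, ...). A C
model of the per-lane pack (saturation omitted, since the inputs here are
already masked down to 0..255):

    #include <stdint.h>

    /* Each 128-bit lane packs 8 words from a, then 8 words from b, so
     * across four lanes the 64 output bytes are lane-interleaved rather
     * than linear -- which is what the vpermb with shuf_packus undoes. */
    static void packuswb_model(uint8_t dst[64], const uint16_t a[32],
                               const uint16_t b[32])
    {
        for (int lane = 0; lane < 4; lane++) {
            for (int i = 0; i < 8; i++) {
                dst[16 * lane + i]     = (uint8_t)a[8 * lane + i];
                dst[16 * lane + 8 + i] = (uint8_t)b[8 * lane + i];
            }
        }
    }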
@@ -303,4 +404,8 @@ UYVY_TO_YUV422
 INIT_YMM avx2
 UYVY_TO_YUV422
 %endif
+%if HAVE_AVX512ICL_EXTERNAL
+INIT_ZMM avx512icl
+UYVY_TO_YUV422
+%endif
 %endif