You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-08-04 22:03:09 +02:00
swscale/x86/rgb2rgb: optimize AVX2 version of uyvytoyuv422
Currently the AVX2 version of uyvytoyuv422 in the SIMD loop does the following: 4 vinsertq to have interleaving of the vector lanes during load from memory. 4 vperm2i128 inside 4 RSHIFT_COPY calls to achieve the desired layout. This patch replaces the above 8 instructions with 2 vpermq and 2 vpermd with a vector register similar to AVX512ICL version. Observed the following numbers on various microarchitectures: On AMD Zen3 laptop: Before: uyvytoyuv422_c: 51979.7 ( 1.00x) uyvytoyuv422_sse2: 5410.5 ( 9.61x) uyvytoyuv422_avx: 4642.7 (11.20x) uyvytoyuv422_avx2: 4249.0 (12.23x) After: uyvytoyuv422_c: 51659.8 ( 1.00x) uyvytoyuv422_sse2: 5420.8 ( 9.53x) uyvytoyuv422_avx: 4651.2 (11.11x) uyvytoyuv422_avx2: 3953.8 (13.07x) On Intel Macbook Pro 2019: Before: uyvytoyuv422_c: 185014.4 ( 1.00x) uyvytoyuv422_sse2: 22800.4 ( 8.11x) uyvytoyuv422_avx: 19796.9 ( 9.35x) uyvytoyuv422_avx2: 13141.9 (14.08x) After: uyvytoyuv422_c: 185093.4 ( 1.00x) uyvytoyuv422_sse2: 22795.4 ( 8.12x) uyvytoyuv422_avx: 19791.9 ( 9.35x) uyvytoyuv422_avx2: 12043.1 (15.37x) On AMD Zen4 desktop: Before: uyvytoyuv422_c: 29105.0 ( 1.00x) uyvytoyuv422_sse2: 3888.0 ( 7.49x) uyvytoyuv422_avx: 3374.2 ( 8.63x) uyvytoyuv422_avx2: 2649.8 (10.98x) uyvytoyuv422_avx512icl: 1615.0 (18.02x) After: uyvytoyuv422_c: 29093.4 ( 1.00x) uyvytoyuv422_sse2: 3874.4 ( 7.51x) uyvytoyuv422_avx: 3371.6 ( 8.63x) uyvytoyuv422_avx2: 2174.6 (13.38x) uyvytoyuv422_avx512icl: 1625.1 (17.90x) Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
This commit is contained in:
committed by
Kieran Kunhya
parent
fc44ccd981
commit
26f2f03e0d
@ -49,18 +49,21 @@ shuf_perm2b: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25,
|
|||||||
97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
|
97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
|
%if HAVE_AVX2_EXTERNAL
|
||||||
|
; shuffle vector to rearrange packuswb result to be linear
|
||||||
|
shuf_packus_avx2: db 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0,\
|
||||||
|
2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0,
|
||||||
|
%endif
|
||||||
|
|
||||||
SECTION .text
|
SECTION .text
|
||||||
|
|
||||||
%macro RSHIFT_COPY 5
|
%macro RSHIFT_COPY 3
|
||||||
; %1 dst ; %2 src ; %3 shift
|
; %1 dst ; %2 src ; %3 shift
|
||||||
%if mmsize == 32
|
%if cpuflag(avx) || cpuflag(avx2) || cpuflag(avx512icl)
|
||||||
vperm2i128 %1, %2, %3, %5
|
psrldq %1, %2, %3
|
||||||
RSHIFT %1, %4
|
|
||||||
%elif cpuflag(avx)
|
|
||||||
psrldq %1, %2, %4
|
|
||||||
%else
|
%else
|
||||||
mova %1, %2
|
mova %1, %2
|
||||||
RSHIFT %1, %4
|
RSHIFT %1, %3
|
||||||
%endif
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
@ -170,18 +173,16 @@ SHUFFLE_BYTES 1, 2, 0, 3
|
|||||||
; int lumStride, int chromStride, int srcStride)
|
; int lumStride, int chromStride, int srcStride)
|
||||||
;-----------------------------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------------------------
|
||||||
%macro UYVY_TO_YUV422 0
|
%macro UYVY_TO_YUV422 0
|
||||||
%if mmsize == 64
|
cglobal uyvytoyuv422, 9, 14, 8 + cpuflag(avx2) + cpuflag(avx512icl), ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
|
||||||
; need two more registers to store shuffle vectors for AVX512ICL
|
|
||||||
cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
|
|
||||||
%else
|
|
||||||
cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
|
|
||||||
%endif
|
|
||||||
pxor m0, m0
|
pxor m0, m0
|
||||||
%if mmsize == 64
|
%if mmsize == 64
|
||||||
vpternlogd m1, m1, m1, 0xff ; m1 = _mm512_set1_epi8(0xff)
|
vpternlogd m1, m1, m1, 0xff ; m1 = _mm512_set1_epi8(0xff)
|
||||||
movu m8, [shuf_packus]
|
movu m8, [shuf_packus]
|
||||||
movu m9, [shuf_perm2b]
|
movu m9, [shuf_perm2b]
|
||||||
%else
|
%else
|
||||||
|
%if cpuflag(avx2)
|
||||||
|
movu m8, [shuf_packus_avx2]
|
||||||
|
%endif
|
||||||
pcmpeqw m1, m1
|
pcmpeqw m1, m1
|
||||||
%endif
|
%endif
|
||||||
psrlw m1, 8
|
psrlw m1, 8
|
||||||
@ -295,21 +296,10 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
|
|||||||
jge .end_line
|
jge .end_line
|
||||||
|
|
||||||
.loop_simd:
|
.loop_simd:
|
||||||
%if mmsize == 32
|
|
||||||
movu xm2, [srcq + wtwoq ]
|
|
||||||
movu xm3, [srcq + wtwoq + 16 ]
|
|
||||||
movu xm4, [srcq + wtwoq + 16 * 2]
|
|
||||||
movu xm5, [srcq + wtwoq + 16 * 3]
|
|
||||||
vinserti128 m2, m2, [srcq + wtwoq + 16 * 4], 1
|
|
||||||
vinserti128 m3, m3, [srcq + wtwoq + 16 * 5], 1
|
|
||||||
vinserti128 m4, m4, [srcq + wtwoq + 16 * 6], 1
|
|
||||||
vinserti128 m5, m5, [srcq + wtwoq + 16 * 7], 1
|
|
||||||
%else
|
|
||||||
movu m2, [srcq + wtwoq ]
|
movu m2, [srcq + wtwoq ]
|
||||||
movu m3, [srcq + wtwoq + mmsize ]
|
movu m3, [srcq + wtwoq + mmsize ]
|
||||||
movu m4, [srcq + wtwoq + mmsize * 2]
|
movu m4, [srcq + wtwoq + mmsize * 2]
|
||||||
movu m5, [srcq + wtwoq + mmsize * 3]
|
movu m5, [srcq + wtwoq + mmsize * 3]
|
||||||
%endif
|
|
||||||
|
|
||||||
%if mmsize == 64
|
%if mmsize == 64
|
||||||
; extract y part 1
|
; extract y part 1
|
||||||
@ -323,23 +313,29 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
|
|||||||
movu [ydstq + wq + mmsize], m7
|
movu [ydstq + wq + mmsize], m7
|
||||||
%else
|
%else
|
||||||
; extract y part 1
|
; extract y part 1
|
||||||
RSHIFT_COPY m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
|
RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
|
||||||
pand m6, m1; YxYx YxYx...
|
pand m6, m1 ; YxYx YxYx...
|
||||||
|
|
||||||
RSHIFT_COPY m7, m3, m5, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
|
RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
|
||||||
pand m7, m1 ; YxYx YxYx...
|
pand m7, m1 ; YxYx YxYx...
|
||||||
|
|
||||||
packuswb m6, m7 ; YYYY YYYY...
|
packuswb m6, m7 ; YYYY YYYY...
|
||||||
|
%if mmsize == 32
|
||||||
|
vpermq m6, m6, 0xd8
|
||||||
|
%endif
|
||||||
movu [ydstq + wq], m6
|
movu [ydstq + wq], m6
|
||||||
|
|
||||||
; extract y part 2
|
; extract y part 2
|
||||||
RSHIFT_COPY m6, m4, m2, 1, 0x13 ; UYVY UYVY -> YVYU YVY...
|
RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
|
||||||
pand m6, m1; YxYx YxYx...
|
pand m6, m1 ; YxYx YxYx...
|
||||||
|
|
||||||
RSHIFT_COPY m7, m5, m3, 1, 0x13 ; UYVY UYVY -> YVYU YVY...
|
RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
|
||||||
pand m7, m1 ; YxYx YxYx...
|
pand m7, m1 ; YxYx YxYx...
|
||||||
|
|
||||||
packuswb m6, m7 ; YYYY YYYY...
|
packuswb m6, m7 ; YYYY YYYY...
|
||||||
|
%if mmsize == 32
|
||||||
|
vpermq m6, m6, 0xd8
|
||||||
|
%endif
|
||||||
movu [ydstq + wq + mmsize], m6
|
movu [ydstq + wq + mmsize], m6
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
@ -359,6 +355,8 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
|
|||||||
packuswb m6, m7 ; UUUU
|
packuswb m6, m7 ; UUUU
|
||||||
%if mmsize == 64
|
%if mmsize == 64
|
||||||
vpermb m6, m8, m6
|
vpermb m6, m8, m6
|
||||||
|
%elif mmsize == 32
|
||||||
|
vpermd m6, m8, m6
|
||||||
%endif
|
%endif
|
||||||
movu [udstq + whalfq], m6
|
movu [udstq + whalfq], m6
|
||||||
|
|
||||||
@ -369,6 +367,8 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
|
|||||||
packuswb m2, m4 ; VVVV
|
packuswb m2, m4 ; VVVV
|
||||||
%if mmsize == 64
|
%if mmsize == 64
|
||||||
vpermb m2, m8, m2
|
vpermb m2, m8, m2
|
||||||
|
%elif mmsize == 32
|
||||||
|
vpermd m2, m8, m2
|
||||||
%endif
|
%endif
|
||||||
movu [vdstq + whalfq], m2
|
movu [vdstq + whalfq], m2
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user