1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2025-08-04 22:03:09 +02:00

swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes

On an AMD 7950X (Zen 4):

shuffle_bytes_0321_c:                                   56.5 ( 1.00x)
shuffle_bytes_0321_ssse3:                               15.2 ( 3.70x)
shuffle_bytes_0321_avx2:                                10.2 ( 5.51x)
shuffle_bytes_0321_avx512icl:                            9.2 ( 6.11x)
shuffle_bytes_1230_c:                                   84.5 ( 1.00x)
shuffle_bytes_1230_ssse3:                               14.2 ( 5.93x)
shuffle_bytes_1230_avx2:                                15.2 ( 5.54x)
shuffle_bytes_1230_avx512icl:                           11.2 ( 7.51x)
shuffle_bytes_2103_c:                                   48.5 ( 1.00x)
shuffle_bytes_2103_ssse3:                               21.2 ( 2.28x)
shuffle_bytes_2103_avx2:                                13.8 ( 3.53x)
shuffle_bytes_2103_avx512icl:                            9.2 ( 5.24x)
shuffle_bytes_3012_c:                                   84.5 ( 1.00x)
shuffle_bytes_3012_ssse3:                               14.2 ( 5.93x)
shuffle_bytes_3012_avx2:                                16.2 ( 5.20x)
shuffle_bytes_3012_avx512icl:                           10.2 ( 8.24x)
shuffle_bytes_3210_c:                                   89.2 ( 1.00x)
shuffle_bytes_3210_ssse3:                               24.2 ( 3.68x)
shuffle_bytes_3210_avx2:                                16.2 ( 5.49x)
shuffle_bytes_3210_avx512icl:                            9.2 ( 9.65x)

Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
This commit is contained in:
Shreesh Adiga
2025-01-28 21:18:57 +05:30
committed by James Almer
parent 957eb2323a
commit 59f9dbaa31
2 changed files with 76 additions and 27 deletions

View File

@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size);
/*
 * Prototypes for the assembly byte-shuffle routines. All share one signature:
 * src (read-only input), dst (output) and src_size.
 * The four digits in each name match the four arguments given to the
 * SHUFFLE_BYTES macro that generates the function; in that macro's scalar
 * path, output byte i of every 4-byte group is read from input byte
 * <digit i> of the same group.
 * NOTE(review): src_size is presumably a byte count -- confirm against the
 * C reference implementations.
 */
void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size);
/* AVX-512 (ICL feature level) variants, selected under EXTERNAL_AVX512ICL(). */
void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void)
shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
}
if (EXTERNAL_AVX512ICL(cpu_flags)) {
shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl;
shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl;
shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl;
shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl;
shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl;
}
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
uyvytoyuv422 = ff_uyvytoyuv422_avx2;
#endif

View File

@ -57,40 +57,53 @@ SECTION .text
;-----------------------------------------------------------------------------------------------
; SHUFFLE_BYTES a, b, c, d
;
; Emits shuffle_bytes_abcd(src, dst, w) for the currently selected instruction
; set. Named registers: src, dst, w (arguments), tmp and x (scratch).
; m0 holds the pshufb control vector pb_shuffle<abcd>; m1 is the data register.
;
; Layout of the generated function:
;   1. src and dst are advanced by w and w is negated, so [srcq + wq] walks
;      from the start of the buffer while wq counts up towards zero.
;   2. The leading (w & (mmsize - 4)) bytes are handled first:
;        - mmsize == 64: one masked load / pshufb / masked store;
;        - otherwise:    a scalar loop moving one 4-byte group per iteration.
;   3. The remainder is processed mmsize bytes per iteration in .loop_simd.
;
; NOTE(review): this listing looks like a diff rendered without +/- markers,
; so several instructions and both loop labels appear twice (old and new
; copy). Confirm against the real source file before assembling.
; NOTE(review): the "3, 5, 2" on the cglobal line is presumably the x86inc
; argument / GPR / vector-register counts -- confirm against x86inc.asm.
;-----------------------------------------------------------------------------------------------
%macro SHUFFLE_BYTES 4
cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
VBROADCASTI128 m0, [pb_shuffle%1%2%3%4]
; x keeps a copy of the (widened) size; it is reduced to the tail size below
movsxdifnidn wq, wd
mov xq, wq
movsxdifnidn wq, wd
mov xq, wq
; point src/dst at the end of the buffers and index them with negative wq
add srcq, wq
add dstq, wq
neg wq
add srcq, wq
add dstq, wq
neg wq
;calc scalar loop
%if mmsize == 64
; AVX-512 path: no scalar loop, the tail is done with one masked vector op.
; x = (size & 60) >> 2 = number of tail dwords, in the range 0..15
and xq, mmsize - 4
shr xq, 2
; tmp = ~(-1 << x): the low x bits set, one mask bit per tail dword
mov tmpd, -1
shlx tmpd, tmpd, xd
not tmpd
kmovw k7, tmpd
; zero-masked load: only the x selected dwords are read, the rest become 0
vmovdqu32 m1{k7}{z}, [srcq + wq]
pshufb m1, m0
; masked store: only the x selected dwords are written to dst
vmovdqu32 [dstq + wq]{k7}, m1
; step over the 4 * x tail bytes just processed
lea wq, [wq + 4 * xq]
%else
;calc scalar loop
; x = tail size in bytes; if it is zero go straight to the vector loop
and xq, mmsize-4
je .loop_simd
; scalar tail: output bytes 0..3 of the group are read from the source byte
; offsets given by macro arguments 1..4; repeats while tail bytes remain
.loop_scalar:
mov tmpb, [srcq + wq + %1]
mov [dstq+wq + 0], tmpb
mov tmpb, [srcq + wq + %2]
mov [dstq+wq + 1], tmpb
mov tmpb, [srcq + wq + %3]
mov [dstq+wq + 2], tmpb
mov tmpb, [srcq + wq + %4]
mov [dstq+wq + 3], tmpb
add wq, 4
sub xq, 4
jg .loop_scalar
.loop_scalar:
mov tmpb, [srcq + wq + %1]
mov [dstq+wq + 0], tmpb
mov tmpb, [srcq + wq + %2]
mov [dstq+wq + 1], tmpb
mov tmpb, [srcq + wq + %3]
mov [dstq+wq + 2], tmpb
mov tmpb, [srcq + wq + %4]
mov [dstq+wq + 3], tmpb
add wq, 4
sub xq, 4
jg .loop_scalar
%endif
;check if src_size < mmsize
; wq >= 0 here means the tail handling already consumed everything
cmp wq, 0
jge .end
;check if src_size < mmsize
cmp wq, 0
jge .end
; main loop: shuffle mmsize bytes per iteration until wq is no longer negative
.loop_simd:
movu m1, [srcq+wq]
pshufb m1, m0
movu [dstq+wq], m1
add wq, mmsize
jl .loop_simd
.loop_simd:
movu m1, [srcq + wq]
pshufb m1, m0
movu [dstq + wq], m1
add wq, mmsize
jl .loop_simd
.end:
RET
@ -122,6 +135,21 @@ SHUFFLE_BYTES 1, 2, 0, 3
%endif
%endif
; AVX-512 instantiations of SHUFFLE_BYTES, one per byte order. They are
; assembled only for x86-64 builds where HAVE_AVX512ICL_EXTERNAL is set.
; NOTE(review): INIT_ZMM presumably sets mmsize to 64, which selects the
; masked-tail branch (mmsize == 64) inside the macro -- confirm against x86inc.asm.
%if ARCH_X86_64
%if HAVE_AVX512ICL_EXTERNAL
INIT_ZMM avx512icl
SHUFFLE_BYTES 2, 1, 0, 3
SHUFFLE_BYTES 0, 3, 2, 1
SHUFFLE_BYTES 1, 2, 3, 0
SHUFFLE_BYTES 3, 0, 1, 2
SHUFFLE_BYTES 3, 2, 1, 0
SHUFFLE_BYTES 3, 1, 0, 2
SHUFFLE_BYTES 2, 0, 1, 3
SHUFFLE_BYTES 2, 1, 3, 0
SHUFFLE_BYTES 1, 2, 0, 3
%endif
%endif
;-----------------------------------------------------------------------------------------------
; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
; const uint8_t *src, int width, int height,