You've already forked FFmpeg
mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-08-04 22:03:09 +02:00
swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes
On an AMD 7950x Zen 4:

shuffle_bytes_0321_c:          56.5 ( 1.00x)
shuffle_bytes_0321_ssse3:      15.2 ( 3.70x)
shuffle_bytes_0321_avx2:       10.2 ( 5.51x)
shuffle_bytes_0321_avx512icl:   9.2 ( 6.11x)
shuffle_bytes_1230_c:          84.5 ( 1.00x)
shuffle_bytes_1230_ssse3:      14.2 ( 5.93x)
shuffle_bytes_1230_avx2:       15.2 ( 5.54x)
shuffle_bytes_1230_avx512icl:  11.2 ( 7.51x)
shuffle_bytes_2103_c:          48.5 ( 1.00x)
shuffle_bytes_2103_ssse3:      21.2 ( 2.28x)
shuffle_bytes_2103_avx2:       13.8 ( 3.53x)
shuffle_bytes_2103_avx512icl:   9.2 ( 5.24x)
shuffle_bytes_3012_c:          84.5 ( 1.00x)
shuffle_bytes_3012_ssse3:      14.2 ( 5.93x)
shuffle_bytes_3012_avx2:       16.2 ( 5.20x)
shuffle_bytes_3012_avx512icl:  10.2 ( 8.24x)
shuffle_bytes_3210_c:          89.2 ( 1.00x)
shuffle_bytes_3210_ssse3:      24.2 ( 3.68x)
shuffle_bytes_3210_avx2:       16.2 ( 5.49x)
shuffle_bytes_3210_avx512icl:   9.2 ( 9.65x)

Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
This commit is contained in:
committed by
James Almer
parent
957eb2323a
commit
59f9dbaa31
@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size);
|
|||||||
void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size);
|
void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size);
|
||||||
void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size);
|
void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size);
|
||||||
|
|
||||||
|
void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
|
||||||
|
void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
|
||||||
|
void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
|
||||||
|
void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
|
||||||
|
void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
|
||||||
|
void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
|
||||||
|
void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
|
||||||
|
void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
|
||||||
|
void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
|
||||||
|
|
||||||
void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
|
void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
|
||||||
const uint8_t *src, int width, int height,
|
const uint8_t *src, int width, int height,
|
||||||
int lumStride, int chromStride, int srcStride);
|
int lumStride, int chromStride, int srcStride);
|
||||||
@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void)
|
|||||||
shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
|
shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
|
||||||
shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
|
shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
|
||||||
}
|
}
|
||||||
|
if (EXTERNAL_AVX512ICL(cpu_flags)) {
|
||||||
|
shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
|
||||||
|
shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
|
||||||
|
shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
|
||||||
|
shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
|
||||||
|
shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl;
|
||||||
|
shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl;
|
||||||
|
shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl;
|
||||||
|
shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl;
|
||||||
|
shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl;
|
||||||
|
}
|
||||||
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
||||||
uyvytoyuv422 = ff_uyvytoyuv422_avx2;
|
uyvytoyuv422 = ff_uyvytoyuv422_avx2;
|
||||||
#endif
|
#endif
|
||||||
|
@ -57,40 +57,53 @@ SECTION .text
|
|||||||
%macro SHUFFLE_BYTES 4
|
%macro SHUFFLE_BYTES 4
|
||||||
cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
|
cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
|
||||||
VBROADCASTI128 m0, [pb_shuffle%1%2%3%4]
|
VBROADCASTI128 m0, [pb_shuffle%1%2%3%4]
|
||||||
movsxdifnidn wq, wd
|
movsxdifnidn wq, wd
|
||||||
mov xq, wq
|
mov xq, wq
|
||||||
|
|
||||||
add srcq, wq
|
add srcq, wq
|
||||||
add dstq, wq
|
add dstq, wq
|
||||||
neg wq
|
neg wq
|
||||||
|
|
||||||
;calc scalar loop
|
%if mmsize == 64
|
||||||
|
and xq, mmsize - 4
|
||||||
|
shr xq, 2
|
||||||
|
mov tmpd, -1
|
||||||
|
shlx tmpd, tmpd, xd
|
||||||
|
not tmpd
|
||||||
|
kmovw k7, tmpd
|
||||||
|
vmovdqu32 m1{k7}{z}, [srcq + wq]
|
||||||
|
pshufb m1, m0
|
||||||
|
vmovdqu32 [dstq + wq]{k7}, m1
|
||||||
|
lea wq, [wq + 4 * xq]
|
||||||
|
%else
|
||||||
|
;calc scalar loop
|
||||||
and xq, mmsize-4
|
and xq, mmsize-4
|
||||||
je .loop_simd
|
je .loop_simd
|
||||||
|
|
||||||
.loop_scalar:
|
.loop_scalar:
|
||||||
mov tmpb, [srcq + wq + %1]
|
mov tmpb, [srcq + wq + %1]
|
||||||
mov [dstq+wq + 0], tmpb
|
mov [dstq+wq + 0], tmpb
|
||||||
mov tmpb, [srcq + wq + %2]
|
mov tmpb, [srcq + wq + %2]
|
||||||
mov [dstq+wq + 1], tmpb
|
mov [dstq+wq + 1], tmpb
|
||||||
mov tmpb, [srcq + wq + %3]
|
mov tmpb, [srcq + wq + %3]
|
||||||
mov [dstq+wq + 2], tmpb
|
mov [dstq+wq + 2], tmpb
|
||||||
mov tmpb, [srcq + wq + %4]
|
mov tmpb, [srcq + wq + %4]
|
||||||
mov [dstq+wq + 3], tmpb
|
mov [dstq+wq + 3], tmpb
|
||||||
add wq, 4
|
add wq, 4
|
||||||
sub xq, 4
|
sub xq, 4
|
||||||
jg .loop_scalar
|
jg .loop_scalar
|
||||||
|
%endif
|
||||||
|
|
||||||
;check if src_size < mmsize
|
;check if src_size < mmsize
|
||||||
cmp wq, 0
|
cmp wq, 0
|
||||||
jge .end
|
jge .end
|
||||||
|
|
||||||
.loop_simd:
|
.loop_simd:
|
||||||
movu m1, [srcq+wq]
|
movu m1, [srcq + wq]
|
||||||
pshufb m1, m0
|
pshufb m1, m0
|
||||||
movu [dstq+wq], m1
|
movu [dstq + wq], m1
|
||||||
add wq, mmsize
|
add wq, mmsize
|
||||||
jl .loop_simd
|
jl .loop_simd
|
||||||
|
|
||||||
.end:
|
.end:
|
||||||
RET
|
RET
|
||||||
@ -122,6 +135,21 @@ SHUFFLE_BYTES 1, 2, 0, 3
|
|||||||
%endif
|
%endif
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
|
%if ARCH_X86_64
|
||||||
|
%if HAVE_AVX512ICL_EXTERNAL
|
||||||
|
INIT_ZMM avx512icl
|
||||||
|
SHUFFLE_BYTES 2, 1, 0, 3
|
||||||
|
SHUFFLE_BYTES 0, 3, 2, 1
|
||||||
|
SHUFFLE_BYTES 1, 2, 3, 0
|
||||||
|
SHUFFLE_BYTES 3, 0, 1, 2
|
||||||
|
SHUFFLE_BYTES 3, 2, 1, 0
|
||||||
|
SHUFFLE_BYTES 3, 1, 0, 2
|
||||||
|
SHUFFLE_BYTES 2, 0, 1, 3
|
||||||
|
SHUFFLE_BYTES 2, 1, 3, 0
|
||||||
|
SHUFFLE_BYTES 1, 2, 0, 3
|
||||||
|
%endif
|
||||||
|
%endif
|
||||||
|
|
||||||
;-----------------------------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------------------------
|
||||||
; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
|
; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
|
||||||
; const uint8_t *src, int width, int height,
|
; const uint8_t *src, int width, int height,
|
||||||
|
Reference in New Issue
Block a user