1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-11-21 10:55:51 +02:00

libavcodec/bswapdsp : add AVX2 func for bswap_buf (swap uint32_t)

This commit is contained in:
Martin Vignali 2017-10-22 19:06:50 +02:00 committed by James Darnley
parent 9b0510a8e3
commit e9930883a2
2 changed files with 38 additions and 13 deletions

View File

@ -35,14 +35,18 @@ SECTION .text
mov r3d, r2d
sar r2d, 3
jz .left4_%1
%if cpuflag(avx2)
sar r2d, 1
jz .left8_%1
%endif
.loop8_%1:
mov%1 m0, [r1 + 0]
mov%1 m1, [r1 + 16]
%if cpuflag(ssse3)
mov%1 m1, [r1 + mmsize]
%if cpuflag(ssse3)||cpuflag(avx2)
pshufb m0, m2
pshufb m1, m2
mov%1 [r0 + 0], m0
mov%1 [r0 + 16], m1
mov%1 [r0 + mmsize], m1
%else
pshuflw m0, m0, 10110001b
pshuflw m1, m1, 10110001b
@ -59,18 +63,29 @@ SECTION .text
mov%1 [r0 + 0], m2
mov%1 [r0 + 16], m3
%endif
add r0, 32
add r1, 32
add r0, mmsize*2
add r1, mmsize*2
dec r2d
jnz .loop8_%1
%if cpuflag(avx2)
.left8_%1:
mov r2d, r3d
test r3d, 8
jz .left4_%1
mov%1 m0, [r1]
pshufb m0, m2
mov%1 [r0 + 0], m0
add r1, mmsize
add r0, mmsize
%endif
.left4_%1:
mov r2d, r3d
test r3d, 4
jz .left
mov%1 m0, [r1]
mov%1 xm0, [r1]
%if cpuflag(ssse3)
pshufb m0, m2
mov%1 [r0], m0
pshufb xm0, xm2
mov%1 [r0], xm0
%else
pshuflw m0, m0, 10110001b
pshufhw m0, m0, 10110001b
@ -86,16 +101,20 @@ SECTION .text
; void ff_bswap32_buf(uint32_t *dst, const uint32_t *src, int w);
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
%if cpuflag(ssse3)||cpuflag(avx2)
cglobal bswap32_buf, 3,4,3
mov r3, r1
%if cpuflag(avx2)
vbroadcasti128 m2, [pb_bswap32]
%else
mova m2, [pb_bswap32]
%endif
%else
cglobal bswap32_buf, 3,4,5
mov r3, r1
%endif
or r3, r0
test r3, 15
test r3, mmsize - 1
jz .start_align
BSWAP_LOOPS u
jmp .left
@ -105,9 +124,9 @@ cglobal bswap32_buf, 3,4,5
%if cpuflag(ssse3)
test r2d, 2
jz .left1
movq m0, [r1]
pshufb m0, m2
movq [r0], m0
movq xm0, [r1]
pshufb xm0, xm2
movq [r0], xm0
add r1, 8
add r0, 8
.left1:
@ -137,3 +156,6 @@ BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF
INIT_YMM avx2
BSWAP32_BUF

View File

@ -25,6 +25,7 @@
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_avx2(uint32_t *dst, const uint32_t *src, int w);
av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c)
{
@ -34,4 +35,6 @@ av_cold void ff_bswapdsp_init_x86(BswapDSPContext *c)
c->bswap_buf = ff_bswap32_buf_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
c->bswap_buf = ff_bswap32_buf_ssse3;
if (EXTERNAL_AVX2_FAST(cpu_flags))
c->bswap_buf = ff_bswap32_buf_avx2;
}