1
0
mirror of https://github.com/FFmpeg/FFmpeg.git synced 2024-11-26 19:01:44 +02:00

lavc/bswapdsp: purge RISC-V V bswap32

This cannot beat the Zbb implementation, and it is unlikely that a real
meaningful CPU design would support V and not Zbb. The best loop rewrite
that I could come up with (4 shifts, 2 ands, 3 ors) is still ~40% slower
than Zbb.

A proper faster vector implementation should be feasible with the
cryptographic vector extensions, but that is a story for another time.
This commit is contained in:
Rémi Denis-Courmont 2023-07-16 17:27:45 +03:00
parent 5de1db5370
commit 61e5ca4ded
2 changed files with 1 addition and 27 deletions

View File

@ -26,7 +26,6 @@
#include "libavcodec/bswapdsp.h"
void ff_bswap32_buf_rvb(uint32_t *dst, const uint32_t *src, int len);
void ff_bswap32_buf_rvv(uint32_t *dst, const uint32_t *src, int len);
void ff_bswap16_buf_rvv(uint16_t *dst, const uint16_t *src, int len);
av_cold void ff_bswapdsp_init_riscv(BswapDSPContext *c)
@ -39,10 +38,8 @@ av_cold void ff_bswapdsp_init_riscv(BswapDSPContext *c)
c->bswap_buf = ff_bswap32_buf_rvb;
#endif
#if HAVE_RVV
if (flags & AV_CPU_FLAG_RVV_I32) {
c->bswap_buf = ff_bswap32_buf_rvv;
if (flags & AV_CPU_FLAG_RVV_I32)
c->bswap16_buf = ff_bswap16_buf_rvv;
}
#endif
}
}

View File

@ -21,29 +21,6 @@
#include "config.h"
#include "libavutil/riscv/asm.S"
/*
 * ff_bswap32_buf_rvv(dst, src, len): reverse the byte order of each of the
 * len 32-bit words read from src and write the result to dst.
 *
 * Register use, following the C prototype and the standard RISC-V integer
 * calling convention: a0 = dst, a1 = src, a2 = len (count of 32-bit words).
 *
 * Method: each word is loaded as a 4-byte segment, so that byte k of every
 * word lands in vector register v(8+k); the four registers are then stored
 * back with a 4-byte stride at the mirrored byte offset (3 - k).
 *
 * NOTE(review): sh2add is a Zba instruction, while only zve32x is named on
 * the func line - confirm that Zba is guaranteed wherever this is assembled
 * and selected at run time.
 */
func ff_bswap32_buf_rvv, zve32x
/* t4 = byte stride between consecutive output words */
li t4, 4
/* t1, t2, t3 = addresses of byte 1, 2 and 3 of the first output word */
addi t1, a0, 1
addi t2, a0, 2
addi t3, a0, 3
1:
/* t0 = number of words handled this iteration (at most the a2 remaining) */
vsetvli t0, a2, e8, m1, ta, ma
/* v8..v11 = bytes 0..3 of each of the t0 input words */
vlseg4e8.v v8, (a1)
/* words left after this iteration */
sub a2, a2, t0
/* each sh2add advances a pointer by t0 * 4 bytes, i.e. t0 words */
sh2add a1, t0, a1
/* input byte 0 -> output byte 3 */
vsse8.v v8, (t3), t4
sh2add t3, t0, t3
/* input byte 1 -> output byte 2 */
vsse8.v v9, (t2), t4
sh2add t2, t0, t2
/* input byte 2 -> output byte 1 */
vsse8.v v10, (t1), t4
sh2add t1, t0, t1
/* input byte 3 -> output byte 0 */
vsse8.v v11, (a0), t4
sh2add a0, t0, a0
/* loop until every word has been processed */
bnez a2, 1b
ret
endfunc
func ff_bswap16_buf_rvv, zve32x
1:
vsetvli t0, a2, e16, m8, ta, ma