mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-01-19 05:49:09 +02:00
lavc/h264dsp: R-V V idct4_add8 (all depths)
These are really just wrappers for idct4_add16intra functions, which are in turn mostly wrappers for idct4_add and idct4_dc_add functions. For benchmarks, refer to the latter two sets.
This commit is contained in:
parent
eb3cc508d8
commit
4edfc11a28
@ -54,7 +54,13 @@ void ff_h264_idct_add16intra_##depth##_rvv(uint8_t *d, const int *soffset, \
|
||||
const uint8_t nnzc[5 * 8]); \
|
||||
void ff_h264_idct8_add4_##depth##_rvv(uint8_t *d, const int *soffset, \
|
||||
int16_t *s, int stride, \
|
||||
const uint8_t nnzc[5 * 8]);
|
||||
const uint8_t nnzc[5 * 8]); \
|
||||
void ff_h264_idct4_add8_##depth##_rvv(uint8_t **d, const int *soffset, \
|
||||
int16_t *s, int stride, \
|
||||
const uint8_t nnzc[5 * 8]); \
|
||||
void ff_h264_idct4_add8_422_##depth##_rvv(uint8_t **d, const int *soffset, \
|
||||
int16_t *s, int stride, \
|
||||
const uint8_t nnzc[5 * 8]);
|
||||
|
||||
IDCT_DEPTH(8)
|
||||
IDCT_DEPTH(9)
|
||||
@ -104,6 +110,10 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
|
||||
dsp->h264_idct_add16intra = ff_h264_idct_add16intra_8_rvv;
|
||||
# if __riscv_xlen == 64
|
||||
dsp->h264_idct8_add4 = ff_h264_idct8_add4_8_rvv;
|
||||
if (chroma_format_idc <= 1)
|
||||
dsp->h264_idct_add8 = ff_h264_idct4_add8_8_rvv;
|
||||
else
|
||||
dsp->h264_idct_add8 = ff_h264_idct4_add8_422_8_rvv;
|
||||
# endif
|
||||
}
|
||||
if (flags & AV_CPU_FLAG_RVV_I64) {
|
||||
@ -123,10 +133,16 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
|
||||
if (zvl128b && (flags & AV_CPU_FLAG_RVB)) { \
|
||||
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_##depth##_rvv; \
|
||||
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_##depth##_rvv; \
|
||||
dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \
|
||||
dsp->h264_idct_add16intra = \
|
||||
ff_h264_idct_add16intra_##depth##_rvv; \
|
||||
if (__riscv_xlen == 64) { \
|
||||
dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \
|
||||
dsp->h264_idct_add16intra = \
|
||||
ff_h264_idct_add16intra_##depth##_rvv; \
|
||||
if (chroma_format_idc <= 1) \
|
||||
dsp->h264_idct_add8 = \
|
||||
ff_h264_idct4_add8_##depth##_rvv; \
|
||||
else \
|
||||
dsp->h264_idct_add8 = \
|
||||
ff_h264_idct4_add8_422_##depth##_rvv; \
|
||||
} \
|
||||
} \
|
||||
if (__riscv_xlen == 64 && (flags & AV_CPU_FLAG_RVB)) \
|
||||
|
@ -57,7 +57,7 @@ endfunc
|
||||
func ff_h264_idct_add_8_rvv, zve32x
|
||||
lpad 0
|
||||
csrwi vxrm, 0
|
||||
.Lidct_add4_8_rvv:
|
||||
.Lidct4_add_8_rvv:
|
||||
vsetivli zero, 4, e16, mf2, ta, ma
|
||||
addi t1, a1, 1 * 4 * 2
|
||||
vle16.v v0, (a1)
|
||||
@ -111,7 +111,7 @@ endfunc
|
||||
|
||||
func ff_h264_idct_add_16_rvv, zve32x
|
||||
csrwi vxrm, 0
|
||||
.Lidct_add4_16_rvv:
|
||||
.Lidct4_add_16_rvv:
|
||||
vsetivli zero, 4, e32, m1, ta, ma
|
||||
addi t1, a1, 1 * 4 * 4
|
||||
vle32.v v0, (a1)
|
||||
@ -543,19 +543,26 @@ endfunc
|
||||
.endr
|
||||
|
||||
const ff_h264_scan8
|
||||
.byte 014, 015, 024, 025, 016, 017, 026, 027
|
||||
.byte 034, 035, 044, 045, 036, 037, 046, 047
|
||||
.byte 014, 015, 024, 025, 016, 017, 026, 027
|
||||
.byte 034, 035, 044, 045, 036, 037, 046, 047
|
||||
.byte 064, 065, 074, 075, 066, 067, 076, 077
|
||||
.byte 0104, 0105, 0114, 0115, 0106, 0107, 0116, 0117
|
||||
.byte 0134, 0135, 0144, 0145, 0136, 0137, 0146, 0147
|
||||
.byte 0154, 0155, 0164, 0165, 0156, 0157, 0166, 0167
|
||||
endconst
|
||||
|
||||
.macro idct4_adds type, depth
|
||||
.macro idct4_add16 type, depth
|
||||
func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b
|
||||
.if \depth == 8
|
||||
lpad 0
|
||||
.endif
|
||||
csrwi vxrm, 0
|
||||
lla t0, ff_h264_scan8
|
||||
li t1, 32 * (\depth / 8)
|
||||
vsetivli zero, 16, e8, m1, ta, ma
|
||||
.ifc \type, 16intra
|
||||
.Lidct4_add4_\depth\()_rvv:
|
||||
.endif
|
||||
li t1, 32 * (\depth / 8)
|
||||
vle8.v v8, (t0)
|
||||
.if \depth == 8
|
||||
vlse16.v v16, (a2), t1
|
||||
@ -587,7 +594,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b
|
||||
mv t5, a1
|
||||
mv a1, a2
|
||||
mv a2, a3
|
||||
li a3, 16
|
||||
csrr a3, vl
|
||||
mv a7, ra
|
||||
1:
|
||||
andi t0, a4, 1
|
||||
@ -603,7 +610,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b
|
||||
.else
|
||||
beqz t0, 2f # if (nnzc[scan8[i]])
|
||||
.endif
|
||||
jal .Lidct_add4_\depth\()_rvv
|
||||
jal .Lidct4_add_\depth\()_rvv
|
||||
j 3f
|
||||
2:
|
||||
.ifnc \type, 16
|
||||
@ -621,9 +628,67 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
// idct4_add8 type, depth
//
// Emits ff_h264_idct4_add<type>_<depth>_rvv, with type = 8 or 8_422.
// Per the C prototype the arguments are (uint8_t **d, const int *soffset,
// int16_t *s, int stride, const uint8_t nnzc[]), i.e. a0 = d, a1 = soffset
// (block_offset), a2 = s (block), a3 = stride, a4 = nnzc.
//
// The function performs no transform itself. For each group of four 4x4
// blocks it sets a0 (destination plane), a1 (block_offset entry), a2
// (coefficient block), a3 (stride), a4 (nnzc) and t0 (scan8 entry), sets
// vl = 4, and enters .Lidct4_add4_<depth>_rvv, the label placed inside the
// 16intra variant of idct4_add16.
//
// Groups, in order (numbers taken from the inline comments below):
//   type 8:      block/offset/scan8 16 on dest[0], then 32 on dest[1]
//   type 8_422:  the two above, then block 20 with offset/scan8 24 on
//                dest[0], then block 36 with offset/scan8 40 on dest[1]
// The last group is entered with a plain jump after the frame is released,
// so the shared body returns straight to the original caller.
//
// Between groups the code relies on register state left by the shared body:
// stride in a2, the block pointer in a1 and the block_offset pointer in t5.
// NOTE(review): the offsets below assume a1 and t5 were each advanced by
// exactly four entries; that part of the shared body is not visible here.
//
// Uses sd/ld and 8-byte dest[] slots, so this assumes a 64-bit target; the C
// init code only installs these functions when __riscv_xlen == 64.
.macro idct4_add8 type, depth
|
||||
func ff_h264_idct4_add\type\()_\depth\()_rvv, zve32x
|
||||
.if \depth == 8
|
||||
lpad 0 # entry landing pad, emitted for the 8-bit variant only
|
||||
.endif
|
||||
csrwi vxrm, 0 # vxrm = 0: round-to-nearest-up for fixed-point vector ops
|
||||
addi sp, sp, -32 # 32-byte frame: s0, ra, d, nnzc
|
||||
addi a2, a2, 16 * 16 * 2 * (\depth / 8) # &block[16 * 16]
|
||||
lla t0, ff_h264_scan8 + 16 # &scan8[16]
|
||||
sd s0, 0(sp) # save the incoming s0
|
||||
sd ra, 8(sp) # save ra, overwritten by the jal calls below
|
||||
mv s0, sp # s0 = frame base
|
||||
sd a0, 16(sp) # keep d (pointer to the dest[] array) for the later groups
|
||||
sd a4, 24(sp) # keep nnzc for the later groups
|
||||
ld a0, 0(a0) # dest[0]
|
||||
addi a1, a1, 16 * 4 # &block_offset[16]
|
||||
vsetivli zero, 4, e8, mf4, ta, ma # vl = 4 byte elements: one group of four blocks
|
||||
jal .Lidct4_add4_\depth\()_rvv # first group, on dest[0]
|
||||
|
||||
ld a4, 24(sp) # nnzc
|
||||
ld a0, 16(sp) # reload d
|
||||
mv a3, a2 # stride
|
||||
addi a2, a1, (16 - 4) * 16 * 2 * (\depth / 8) # &block[32 * 16]
|
||||
addi a1, t5, (16 - 4) * 4 # &block_offset[32]
|
||||
ld a0, 8(a0) # dest[1]
|
||||
lla t0, ff_h264_scan8 + 32 # &scan8[32]
|
||||
// The 8_422 variant runs the group prepared above with a call, then
// prepares two further groups; the plain 8 variant skips to the epilogue.
.ifc \type, 8_422
|
||||
vsetivli zero, 4, e8, mf4, ta, ma # vl = 4 again for the next group
|
||||
jal .Lidct4_add4_\depth\()_rvv # second group, on dest[1]
|
||||
|
||||
ld a4, 24(sp) # nnzc
|
||||
ld a0, 16(sp) # reload d
|
||||
mv a3, a2 # stride
|
||||
addi a2, a1, (-12- 4) * 16 * 2 * (\depth / 8) # &block[20 * 16]
|
||||
addi a1, t5, (-8 - 4) * 4 # &block_offset[24]
|
||||
ld a0, 0(a0) # dest[0]
|
||||
lla t0, ff_h264_scan8 + 24 # &scan8[24]
|
||||
vsetivli zero, 4, e8, mf4, ta, ma # vl = 4 again for the next group
|
||||
jal .Lidct4_add4_\depth\()_rvv # third group, on dest[0]
|
||||
|
||||
ld a4, 24(sp) # nnzc
|
||||
ld a0, 16(sp) # reload d
|
||||
mv a3, a2 # stride
|
||||
addi a2, a1, (16 - 4) * 16 * 2 * (\depth / 8) # &block[36 * 16]
|
||||
addi a1, t5, (16 - 4) * 4 # &block_offset[40]
|
||||
ld a0, 8(a0) # dest[1]
|
||||
lla t0, ff_h264_scan8 + 40 # &scan8[40]
|
||||
.endif
|
||||
ld ra, 8(sp) # restore the original return address
|
||||
ld s0, 0(sp) # restore s0
|
||||
addi sp, sp, 32 # release the frame before the tail jump
|
||||
vsetivli zero, 4, e8, mf4, ta, ma # vl = 4 for the final group
|
||||
j .Lidct4_add4_\depth\()_rvv # tail jump: the last group returns to our caller
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
.irp depth, 8, 16
|
||||
idct4_adds 16, \depth
|
||||
idct4_adds 16intra, \depth
|
||||
idct4_add16 16, \depth
|
||||
idct4_add16 16intra, \depth
|
||||
idct4_add8 8, \depth
|
||||
idct4_add8 8_422, \depth
|
||||
|
||||
#if (__riscv_xlen == 64)
|
||||
func ff_h264_idct8_add4_\depth\()_rvv, zve32x, b
|
||||
@ -724,5 +789,17 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
|
||||
li a5, (1 << \depth) - 1
|
||||
j ff_h264_idct8_add4_16_rvv
|
||||
endfunc
|
||||
|
||||
// High-bit-depth entry point for idct4_add8, instantiated once per depth by
// an enclosing repeat block whose header is outside this view. It only loads
// a5 and tail-jumps to the 16-bit implementation; a0-a4 pass through
// unchanged. The C init code installs this variant when
// chroma_format_idc <= 1.
func ff_h264_idct4_add8_\depth\()_rvv, zve32x
|
||||
lpad 0 # entry landing pad
|
||||
li a5, (1 << \depth) - 1 # a5 = all-ones mask of depth bits; presumably the pixel maximum read by the 16-bit code - TODO confirm
|
||||
j ff_h264_idct4_add8_16_rvv # tail jump; ra is untouched, so the target returns to our caller
|
||||
endfunc
|
||||
|
||||
// High-bit-depth entry point for the 4:2:2 idct4_add8 variant, instantiated
// once per depth by an enclosing repeat block whose header is outside this
// view. It only loads a5 and tail-jumps to the 16-bit implementation; a0-a4
// pass through unchanged. The C init code installs this variant when
// chroma_format_idc is greater than 1.
func ff_h264_idct4_add8_422_\depth\()_rvv, zve32x
|
||||
lpad 0 # entry landing pad
|
||||
li a5, (1 << \depth) - 1 # a5 = all-ones mask of depth bits; presumably the pixel maximum read by the 16-bit code - TODO confirm
|
||||
j ff_h264_idct4_add8_422_16_rvv # tail jump; ra is untouched, so the target returns to our caller
|
||||
endfunc
|
||||
#endif
|
||||
.endr
|
||||
|
Loading…
x
Reference in New Issue
Block a user