1
0
mirror of https://github.com/facebook/zstd.git synced 2025-03-07 01:10:04 +02:00

Optimize decompression speed for gcc and clang (#1892)

* Optimize `ZSTD_decodeSequence()`
* Optimize Huffman decoding
* Optimize `ZSTD_decompressSequences()`
* Delete `ZSTD_decodeSequenceLong()`
This commit is contained in:
Nick Terrell 2019-11-25 18:26:19 -08:00 committed by GitHub
parent 762a0dfc45
commit 718f00ff6f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 174 additions and 148 deletions

View File

@ -48,6 +48,7 @@ extern "C" {
* Dependencies
******************************************/
#include "mem.h" /* unaligned access routines */
#include "compiler.h" /* UNLIKELY() */
#include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */
#include "error_private.h" /* error codes and messages */
@ -411,6 +412,23 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
return value;
}
/*! BIT_reloadDStreamFast() :
* Similar to BIT_reloadDStream(), but with two differences:
* 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold!
* 2. Returns BIT_DStream_overflow when bitD->ptr < bitD->limitPtr, at this
* point you must use BIT_reloadDStream() to reload.
*/
MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD)
{
if (UNLIKELY(bitD->ptr < bitD->limitPtr))
return BIT_DStream_overflow;
assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8);
bitD->ptr -= bitD->bitsConsumed >> 3;
bitD->bitsConsumed &= 7;
bitD->bitContainer = MEM_readLEST(bitD->ptr);
return BIT_DStream_unfinished;
}
/*! BIT_reloadDStream() :
* Refill `bitD` from buffer previously set in BIT_initDStream() .
* This function is safe, it guarantees it will not read beyond src buffer.
@ -422,10 +440,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
return BIT_DStream_overflow;
if (bitD->ptr >= bitD->limitPtr) {
bitD->ptr -= bitD->bitsConsumed >> 3;
bitD->bitsConsumed &= 7;
bitD->bitContainer = MEM_readLEST(bitD->ptr);
return BIT_DStream_unfinished;
return BIT_reloadDStreamFast(bitD);
}
if (bitD->ptr == bitD->start) {
if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;

View File

@ -146,6 +146,19 @@
# define DONT_VECTORIZE
#endif
/* Tell the compiler that a branch is likely or unlikely.
* Only use these macros if it causes the compiler to generate better code.
* If you can remove a LIKELY/UNLIKELY annotation without speed changes in gcc
* and clang, please do.
*/
#if defined(__GNUC__)
#define LIKELY(x) (__builtin_expect((x), 1))
#define UNLIKELY(x) (__builtin_expect((x), 0))
#else
#define LIKELY(x) (x)
#define UNLIKELY(x) (x)
#endif
/* disable warnings */
#ifdef _MSC_VER /* Visual Studio */
# include <intrin.h> /* For Visual 2005 */

View File

@ -294,6 +294,7 @@ HUF_decompress4X1_usingDTable_internal_body(
{ const BYTE* const istart = (const BYTE*) cSrc;
BYTE* const ostart = (BYTE*) dst;
BYTE* const oend = ostart + dstSize;
BYTE* const olimit = oend - 3;
const void* const dtPtr = DTable + 1;
const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
@ -318,9 +319,9 @@ HUF_decompress4X1_usingDTable_internal_body(
BYTE* op2 = opStart2;
BYTE* op3 = opStart3;
BYTE* op4 = opStart4;
U32 endSignal = BIT_DStream_unfinished;
DTableDesc const dtd = HUF_getDTableDesc(DTable);
U32 const dtLog = dtd.tableLog;
U32 endSignal = 1;
if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
@ -329,8 +330,7 @@ HUF_decompress4X1_usingDTable_internal_body(
CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
/* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
while ( (endSignal==BIT_DStream_unfinished) && (op4<(oend-3)) ) {
for ( ; (endSignal) & (op4 < olimit) ; ) {
HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
@ -347,10 +347,10 @@ HUF_decompress4X1_usingDTable_internal_body(
HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
BIT_reloadDStream(&bitD1);
BIT_reloadDStream(&bitD2);
BIT_reloadDStream(&bitD3);
BIT_reloadDStream(&bitD4);
endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
}
/* check corruption */
@ -769,7 +769,6 @@ HUF_decompress1X2_usingDTable_internal_body(
return dstSize;
}
FORCE_INLINE_TEMPLATE size_t
HUF_decompress4X2_usingDTable_internal_body(
void* dst, size_t dstSize,
@ -781,6 +780,7 @@ HUF_decompress4X2_usingDTable_internal_body(
{ const BYTE* const istart = (const BYTE*) cSrc;
BYTE* const ostart = (BYTE*) dst;
BYTE* const oend = ostart + dstSize;
BYTE* const olimit = oend - (sizeof(size_t)-1);
const void* const dtPtr = DTable+1;
const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
@ -805,7 +805,7 @@ HUF_decompress4X2_usingDTable_internal_body(
BYTE* op2 = opStart2;
BYTE* op3 = opStart3;
BYTE* op4 = opStart4;
U32 endSignal;
U32 endSignal = 1;
DTableDesc const dtd = HUF_getDTableDesc(DTable);
U32 const dtLog = dtd.tableLog;
@ -816,8 +816,29 @@ HUF_decompress4X2_usingDTable_internal_body(
CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
/* 16-32 symbols per loop (4-8 symbols per stream) */
endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
for ( ; (endSignal==BIT_DStream_unfinished) & (op4<(oend-(sizeof(bitD4.bitContainer)-1))) ; ) {
for ( ; (endSignal) & (op4 < olimit); ) {
#ifdef __clang__
HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
#else
HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
@ -834,8 +855,11 @@ HUF_decompress4X2_usingDTable_internal_body(
HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4);
endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
#endif
}
/* check corruption */

View File

@ -715,7 +715,7 @@ size_t ZSTD_execSequence(BYTE* op,
/* Errors and uncommon cases handled here. */
assert(oLitEnd < oMatchEnd);
if (iLitEnd > litLimit || oMatchEnd > oend_w)
if (UNLIKELY(iLitEnd > litLimit || oMatchEnd > oend_w))
return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
/* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
@ -729,7 +729,7 @@ size_t ZSTD_execSequence(BYTE* op,
*/
assert(WILDCOPY_OVERLENGTH >= 16);
ZSTD_copy16(op, (*litPtr));
if (sequence.litLength > 16) {
if (UNLIKELY(sequence.litLength > 16)) {
ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
}
op = oLitEnd;
@ -738,7 +738,7 @@ size_t ZSTD_execSequence(BYTE* op,
/* Copy Match */
if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
/* offset beyond prefix -> go into extDict */
RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected);
RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected);
match = dictEnd + (match - prefixStart);
if (match + sequence.matchLength <= dictEnd) {
memmove(oLitEnd, match, sequence.matchLength);
@ -760,7 +760,7 @@ size_t ZSTD_execSequence(BYTE* op,
/* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
* without overlap checking.
*/
if (sequence.offset >= WILDCOPY_VECLEN) {
if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
/* We bet on a full wildcopy for matches, since we expect matches to be
* longer than literals (in general). In silesia, ~10% of matches are longer
* than 16 bytes.
@ -802,6 +802,14 @@ ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
DStatePtr->state = DInfo.nextState + lowBits;
}
FORCE_INLINE_TEMPLATE void
ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo)
{
U32 const nbBits = DInfo.nbBits;
size_t const lowBits = BIT_readBits(bitD, nbBits);
DStatePtr->state = DInfo.nextState + lowBits;
}
/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
* offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
* bits before reloading. This value is the maximum number of bytes we read
@ -813,25 +821,26 @@ ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
: 0)
typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
typedef enum { ZSTD_p_noPrefetch=0, ZSTD_p_prefetch=1 } ZSTD_prefetch_e;
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
FORCE_INLINE_TEMPLATE seq_t
ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const ZSTD_prefetch_e prefetch)
{
seq_t seq;
U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
U32 const totalBits = llBits+mlBits+ofBits;
U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state];
ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state];
ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state];
U32 const llBase = llDInfo.baseValue;
U32 const mlBase = mlDInfo.baseValue;
U32 const ofBase = ofDInfo.baseValue;
BYTE const llBits = llDInfo.nbAdditionalBits;
BYTE const mlBits = mlDInfo.nbAdditionalBits;
BYTE const ofBits = ofDInfo.nbAdditionalBits;
BYTE const totalBits = llBits+mlBits+ofBits;
/* sequence */
{ size_t offset;
if (!ofBits)
offset = 0;
else {
if (ofBits > 1) {
ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
assert(ofBits <= MaxOff);
@ -845,53 +854,89 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
}
}
if (ofBits <= 1) {
offset += (llBase==0);
if (offset) {
size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
seqState->prevOffset[1] = seqState->prevOffset[0];
seqState->prevOffset[0] = offset = temp;
} else { /* offset == 0 */
offset = seqState->prevOffset[0];
}
} else {
seqState->prevOffset[2] = seqState->prevOffset[1];
seqState->prevOffset[1] = seqState->prevOffset[0];
seqState->prevOffset[0] = offset;
}
} else {
U32 const ll0 = (llBase == 0);
if (LIKELY((ofBits == 0))) {
if (LIKELY(!ll0))
offset = seqState->prevOffset[0];
else {
offset = seqState->prevOffset[1];
seqState->prevOffset[1] = seqState->prevOffset[0];
seqState->prevOffset[0] = offset;
}
} else {
offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
{ size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
seqState->prevOffset[1] = seqState->prevOffset[0];
seqState->prevOffset[0] = offset = temp;
} } }
seq.offset = offset;
}
seq.matchLength = mlBase
+ ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/) : 0); /* <= 16 bits */
seq.matchLength = mlBase;
if (mlBits > 0)
seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
BIT_reloadDStream(&seqState->DStream);
if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
BIT_reloadDStream(&seqState->DStream);
/* Ensure there are enough bits to read the rest of data in 64-bit mode. */
ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
seq.litLength = llBase
+ ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits/*>0*/) : 0); /* <= 16 bits */
seq.litLength = llBase;
if (llBits > 0)
seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
if (MEM_32bits())
BIT_reloadDStream(&seqState->DStream);
DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
(U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
/* ANS state update */
ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
if (prefetch == ZSTD_p_prefetch) {
size_t const pos = seqState->pos + seq.litLength;
const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
* No consequence though : no memory access will occur, offset is only used for prefetching */
seqState->pos = pos + seq.matchLength;
}
/* ANS state update
* gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo().
* clang-9.2.0 does 7% worse with ZSTD_updateFseState().
* Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the
* better option, so it is the default for other compilers. But, if you
* measure that it is worse, please put up a pull request.
*/
{
#if defined(__GNUC__) && !defined(__clang__)
const int kUseUpdateFseState = 1;
#else
const int kUseUpdateFseState = 0;
#endif
if (kUseUpdateFseState) {
ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
} else {
ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */
ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */
if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */
}
}
return seq;
}
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
FORCE_INLINE_TEMPLATE size_t
DONT_VECTORIZE
ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
@ -914,6 +959,7 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
/* Regen sequences */
if (nbSeq) {
seqState_t seqState;
size_t error = 0;
dctx->fseEntropy = 1;
{ U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
RETURN_ERROR_IF(
@ -928,17 +974,25 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
BIT_DStream_endOfBuffer < BIT_DStream_completed &&
BIT_DStream_completed < BIT_DStream_overflow);
for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) {
nbSeq--;
{ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
op += oneSeqSize;
} }
for ( ; ; ) {
seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_noPrefetch);
size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
BIT_reloadDStream(&(seqState.DStream));
/* gcc and clang both don't like early returns in this loop.
* gcc doesn't like early breaks either.
* Instead save an error and report it at the end.
* When there is an error, don't increment op, so we don't
* overwrite.
*/
if (UNLIKELY(ZSTD_isError(oneSeqSize))) error = oneSeqSize;
else op += oneSeqSize;
if (UNLIKELY(!--nbSeq)) break;
}
/* check if reached exact end */
DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
if (ZSTD_isError(error)) return error;
RETURN_ERROR_IF(nbSeq, corruption_detected);
RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected);
/* save reps for next block */
@ -965,87 +1019,7 @@ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
}
#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
FORCE_INLINE_TEMPLATE seq_t
ZSTD_decodeSequenceLong(seqState_t* seqState, ZSTD_longOffset_e const longOffsets)
{
seq_t seq;
U32 const llBits = seqState->stateLL.table[seqState->stateLL.state].nbAdditionalBits;
U32 const mlBits = seqState->stateML.table[seqState->stateML.state].nbAdditionalBits;
U32 const ofBits = seqState->stateOffb.table[seqState->stateOffb.state].nbAdditionalBits;
U32 const totalBits = llBits+mlBits+ofBits;
U32 const llBase = seqState->stateLL.table[seqState->stateLL.state].baseValue;
U32 const mlBase = seqState->stateML.table[seqState->stateML.state].baseValue;
U32 const ofBase = seqState->stateOffb.table[seqState->stateOffb.state].baseValue;
/* sequence */
{ size_t offset;
if (!ofBits)
offset = 0;
else {
ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
assert(ofBits <= MaxOff);
if (MEM_32bits() && longOffsets) {
U32 const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN_32-1);
offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
if (MEM_32bits() || extraBits) BIT_reloadDStream(&seqState->DStream);
if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
} else {
offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
}
}
if (ofBits <= 1) {
offset += (llBase==0);
if (offset) {
size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
seqState->prevOffset[1] = seqState->prevOffset[0];
seqState->prevOffset[0] = offset = temp;
} else {
offset = seqState->prevOffset[0];
}
} else {
seqState->prevOffset[2] = seqState->prevOffset[1];
seqState->prevOffset[1] = seqState->prevOffset[0];
seqState->prevOffset[0] = offset;
}
seq.offset = offset;
}
seq.matchLength = mlBase + ((mlBits>0) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */
if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
BIT_reloadDStream(&seqState->DStream);
if (MEM_64bits() && (totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
BIT_reloadDStream(&seqState->DStream);
/* Verify that there is enough bits to read the rest of the data in 64-bit mode. */
ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
seq.litLength = llBase + ((llBits>0) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */
if (MEM_32bits())
BIT_reloadDStream(&seqState->DStream);
{ size_t const pos = seqState->pos + seq.litLength;
const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
* No consequence though : no memory access will occur, overly large offset will be detected in ZSTD_execSequenceLong() */
seqState->pos = pos + seq.matchLength;
}
/* ANS state update */
ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
return seq;
}
FORCE_INLINE_TEMPLATE size_t
ZSTD_decompressSequencesLong_body(
ZSTD_DCtx* dctx,
@ -1088,14 +1062,14 @@ ZSTD_decompressSequencesLong_body(
/* prepare in advance */
for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
sequences[seqNb] = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
}
RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected);
/* decode and decompress */
for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */