diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h index bfa98efaa..48a2a7b53 100644 --- a/lib/common/zstd_internal.h +++ b/lib/common/zstd_internal.h @@ -341,23 +341,28 @@ MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, * Private declarations *********************************************/ typedef struct seqDef_s { - U32 offset; + U32 offset; /* Offset code of the sequence */ U16 litLength; U16 matchLength; } seqDef; typedef struct { seqDef* sequencesStart; - seqDef* sequences; + seqDef* sequences; /* ptr to end of sequences */ BYTE* litStart; - BYTE* lit; + BYTE* lit; /* ptr to end of literals */ BYTE* llCode; BYTE* mlCode; BYTE* ofCode; size_t maxNbSeq; size_t maxNbLit; - U32 longLengthID; /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */ - U32 longLengthPos; + + /* longLengthPos and longLengthID allow us to represent either a single litLength or matchLength + * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment + * the existing value of the litLength or matchLength by 0x10000. 
+ */ + U32 longLengthID; /* 0 == no longLength; 1 == Represent the long literal; 2 == Represent the long match; */ + U32 longLengthPos; /* Index of the sequence to apply long length modification to */ } seqStore_t; typedef struct { diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index b19b74127..a054fa38d 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2464,17 +2464,22 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) { const seqStore_t* seqStore = ZSTD_getSeqStore(zc); - const seqDef* seqs = seqStore->sequencesStart; - size_t seqsSize = seqStore->sequences - seqs; + const seqDef* seqStoreSeqs = seqStore->sequencesStart; + size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs; + size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart); + size_t literalsRead = 0; + size_t lastLLSize; ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; - size_t i; size_t position; int repIdx; + size_t i; + int repIdx; assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); - for (i = 0, position = 0; i < seqsSize; ++i) { - outSeqs[i].offset = seqs[i].offset; - outSeqs[i].litLength = seqs[i].litLength; - outSeqs[i].matchLength = seqs[i].matchLength + MINMATCH; + /* Ensure we have enough space for last literals "sequence" */ + assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); + for (i = 0; i < seqStoreSeqSize; ++i) { + outSeqs[i].litLength = seqStoreSeqs[i].litLength; + outSeqs[i].matchLength = seqStoreSeqs[i].matchLength + MINMATCH; if (i == seqStore->longLengthPos) { if (seqStore->longLengthID == 1) { @@ -2484,32 +2489,39 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) } } - if (outSeqs[i].offset <= ZSTD_REP_NUM) { - outSeqs[i].rep = outSeqs[i].offset; - repIdx = (unsigned int)i - outSeqs[i].offset; + if (seqStoreSeqs[i].offset <= ZSTD_REP_NUM) { + 
outSeqs[i].rep = seqStoreSeqs[i].offset; + repIdx = (unsigned int)i - seqStoreSeqs[i].offset; - if (outSeqs[i].litLength == 0) { - if (outSeqs[i].offset < 3) { + if (seqStoreSeqs[i].litLength == 0) { + if (seqStoreSeqs[i].offset < 3) { --repIdx; } else { repIdx = (unsigned int)i - 1; } - ++outSeqs[i].rep; } assert(repIdx >= -3); outSeqs[i].offset = repIdx >= 0 ? outSeqs[repIdx].offset : repStartValue[-repIdx - 1]; - if (outSeqs[i].rep == 4) { + if (outSeqs[i].rep == 3 && outSeqs[i].litLength == 0) { --outSeqs[i].offset; } } else { - outSeqs[i].offset -= ZSTD_REP_NUM; + outSeqs[i].offset = seqStoreSeqs[i].offset - ZSTD_REP_NUM; } - - position += outSeqs[i].litLength; - outSeqs[i].matchPos = (unsigned int)position; - position += outSeqs[i].matchLength; + literalsRead += outSeqs[i].litLength; } - zc->seqCollector.seqIndex += seqsSize; + + /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. + * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker + * for the block boundary, according to the API. 
+ */ + assert(seqStoreLiteralsSize >= literalsRead); + lastLLSize = seqStoreLiteralsSize - literalsRead; + outSeqs[i].litLength = (U32)lastLLSize; + outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0; + seqStoreSeqSize++; + + zc->seqCollector.seqIndex += seqStoreSeqSize; } size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, @@ -2584,6 +2596,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, if (zc->seqCollector.collectSequences) { ZSTD_copyBlockSequences(zc); + ZSTD_confirmRepcodesAndEntropyTables(zc); return 0; } diff --git a/lib/zstd.h b/lib/zstd.h index 733595d9c..293cc7d2e 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -1116,21 +1116,39 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params; typedef struct { - unsigned int matchPos; /* Match pos in dst */ - /* If seqDef.offset > 3, then this is seqDef.offset - 3 - * If seqDef.offset < 3, then this is the corresponding repeat offset - * But if seqDef.offset < 3 and litLength == 0, this is the - * repeat offset before the corresponding repeat offset - * And if seqDef.offset == 3 and litLength == 0, this is the - * most recent repeat offset - 1 - */ - unsigned int offset; - unsigned int litLength; /* Literal length */ - unsigned int matchLength; /* Match length */ - /* 0 when seq not rep and seqDef.offset otherwise - * when litLength == 0 this will be <= 4, otherwise <= 3 like normal - */ - unsigned int rep; + unsigned int offset; /* The offset of the match. (NOT the same as the offset code) + * If offset == 0 and matchLength == 0, this sequence represents the last + * literals in the block of litLength size. + */ + + unsigned int litLength; /* Literal length of the sequence. */ + unsigned int matchLength; /* Match length of the sequence. */ + + /* Note: Users of this API may provide a sequence with matchLength == litLength == offset == 0. + * In this case, we will treat the sequence as a marker for a block boundary. 
+ */ + + unsigned int rep; /* Indicates which repeat offset the field 'offset' holds. + * Its value lies in the range [0, 3]. + * + * Repeat offsets are essentially previous offsets from previous sequences sorted in + * recency order. For more detail, see doc/zstd_compression_format.md + * + * If rep == 0, then 'offset' does not contain a repeat offset. + * If rep > 0: + * If litLength != 0: + * rep == 1 --> offset == repeat_offset_1 + * rep == 2 --> offset == repeat_offset_2 + * rep == 3 --> offset == repeat_offset_3 + * If litLength == 0: + * rep == 1 --> offset == repeat_offset_2 + * rep == 2 --> offset == repeat_offset_3 + * rep == 3 --> offset == repeat_offset_1 - 1 + * + * Note: This field is optional. ZSTD_getSequences() will calculate the value of + * 'rep', but repeat offsets do not necessarily need to be calculated from an external + * sequence provider's perspective. + */ } ZSTD_Sequence; typedef struct { @@ -1276,7 +1294,9 @@ ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcS ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); /*! ZSTD_getSequences() : - * Extract sequences from the sequence store + * Extract sequences from the sequence store. + * Each block will end with a dummy sequence with offset == 0, matchLength == 0, and litLength == length of last literals. + * * zc can be used to insert custom compression params. 
* This function invokes ZSTD_compress2 * @return : number of sequences extracted diff --git a/tests/fuzzer.c b/tests/fuzzer.c index 52d30710b..755c13bda 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -309,22 +309,23 @@ static void FUZ_decodeSequences(BYTE* dst, ZSTD_Sequence* seqs, size_t seqsSize, { size_t i; size_t j; - for(i = 0; i < seqsSize - 1; ++i) { - assert(dst + seqs[i].litLength + seqs[i].matchLength < dst + size); - assert(src + seqs[i].litLength + seqs[i].matchLength < src + size); + for(i = 0; i < seqsSize; ++i) { + assert(dst + seqs[i].litLength + seqs[i].matchLength <= dst + size); + assert(src + seqs[i].litLength + seqs[i].matchLength <= src + size); memcpy(dst, src, seqs[i].litLength); dst += seqs[i].litLength; src += seqs[i].litLength; size -= seqs[i].litLength; - for (j = 0; j < seqs[i].matchLength; ++j) - dst[j] = dst[j - seqs[i].offset]; - dst += seqs[i].matchLength; - src += seqs[i].matchLength; - size -= seqs[i].matchLength; + if (seqs[i].offset != 0) { + for (j = 0; j < seqs[i].matchLength; ++j) + dst[j] = dst[j - seqs[i].offset]; + dst += seqs[i].matchLength; + src += seqs[i].matchLength; + size -= seqs[i].matchLength; + } } - memcpy(dst, src, size); } /*============================================= @@ -2726,6 +2727,7 @@ static int basicUnitTests(U32 const seed, double compressibility) ZSTD_freeCCtx(cctx); free(seqs); } + DISPLAYLEVEL(3, "OK \n"); /* Multiple blocks of zeros test */ #define LONGZEROSLENGTH 1000000 /* 1MB of zeros */