1
0
mirror of https://github.com/facebook/zstd.git synced 2025-03-06 16:56:49 +02:00

Merge pull request #2376 from senhuang42/clarify_sequence_extraction_api

Refine external ZSTD_Sequence API
This commit is contained in:
sen 2020-10-30 15:47:25 -04:00 committed by GitHub
commit c37c714ef1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 90 additions and 50 deletions

View File

@ -341,23 +341,28 @@ MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src,
* Private declarations * Private declarations
*********************************************/ *********************************************/
typedef struct seqDef_s { typedef struct seqDef_s {
U32 offset; U32 offset; /* Offset code of the sequence */
U16 litLength; U16 litLength;
U16 matchLength; U16 matchLength;
} seqDef; } seqDef;
typedef struct { typedef struct {
seqDef* sequencesStart; seqDef* sequencesStart;
seqDef* sequences; seqDef* sequences; /* ptr to end of sequences */
BYTE* litStart; BYTE* litStart;
BYTE* lit; BYTE* lit; /* ptr to end of literals */
BYTE* llCode; BYTE* llCode;
BYTE* mlCode; BYTE* mlCode;
BYTE* ofCode; BYTE* ofCode;
size_t maxNbSeq; size_t maxNbSeq;
size_t maxNbLit; size_t maxNbLit;
U32 longLengthID; /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
U32 longLengthPos; /* longLengthPos and longLengthID allow us to represent a single litLength or matchLength
* in the seqStore whose value does not fit in a U16 (if one exists). To do so, we increment
* the existing value of that litLength or matchLength by 0x10000.
*/
U32 longLengthID; /* 0 == no longLength; 1 == Represent the long literal; 2 == Represent the long match; */
U32 longLengthPos; /* Index of the sequence to apply long length modification to */
} seqStore_t; } seqStore_t;
typedef struct { typedef struct {

View File

@ -2464,17 +2464,22 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
{ {
const seqStore_t* seqStore = ZSTD_getSeqStore(zc); const seqStore_t* seqStore = ZSTD_getSeqStore(zc);
const seqDef* seqs = seqStore->sequencesStart; const seqDef* seqStoreSeqs = seqStore->sequencesStart;
size_t seqsSize = seqStore->sequences - seqs; size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs;
size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart);
size_t literalsRead = 0;
size_t lastLLSize;
ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex];
size_t i; size_t position; int repIdx; size_t i;
int repIdx;
assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences);
for (i = 0, position = 0; i < seqsSize; ++i) { /* Ensure we have enough space for last literals "sequence" */
outSeqs[i].offset = seqs[i].offset; assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1);
outSeqs[i].litLength = seqs[i].litLength; for (i = 0; i < seqStoreSeqSize; ++i) {
outSeqs[i].matchLength = seqs[i].matchLength + MINMATCH; outSeqs[i].litLength = seqStoreSeqs[i].litLength;
outSeqs[i].matchLength = seqStoreSeqs[i].matchLength + MINMATCH;
if (i == seqStore->longLengthPos) { if (i == seqStore->longLengthPos) {
if (seqStore->longLengthID == 1) { if (seqStore->longLengthID == 1) {
@ -2484,32 +2489,39 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
} }
} }
if (outSeqs[i].offset <= ZSTD_REP_NUM) { if (seqStoreSeqs[i].offset <= ZSTD_REP_NUM) {
outSeqs[i].rep = outSeqs[i].offset; outSeqs[i].rep = seqStoreSeqs[i].offset;
repIdx = (unsigned int)i - outSeqs[i].offset; repIdx = (unsigned int)i - seqStoreSeqs[i].offset;
if (outSeqs[i].litLength == 0) { if (seqStoreSeqs[i].litLength == 0) {
if (outSeqs[i].offset < 3) { if (seqStoreSeqs[i].offset < 3) {
--repIdx; --repIdx;
} else { } else {
repIdx = (unsigned int)i - 1; repIdx = (unsigned int)i - 1;
} }
++outSeqs[i].rep;
} }
assert(repIdx >= -3); assert(repIdx >= -3);
outSeqs[i].offset = repIdx >= 0 ? outSeqs[repIdx].offset : repStartValue[-repIdx - 1]; outSeqs[i].offset = repIdx >= 0 ? outSeqs[repIdx].offset : repStartValue[-repIdx - 1];
if (outSeqs[i].rep == 4) { if (outSeqs[i].rep == 3 && outSeqs[i].litLength == 0) {
--outSeqs[i].offset; --outSeqs[i].offset;
} }
} else { } else {
outSeqs[i].offset -= ZSTD_REP_NUM; outSeqs[i].offset = seqStoreSeqs[i].offset - ZSTD_REP_NUM;
} }
literalsRead += outSeqs[i].litLength;
position += outSeqs[i].litLength;
outSeqs[i].matchPos = (unsigned int)position;
position += outSeqs[i].matchLength;
} }
zc->seqCollector.seqIndex += seqsSize;
/* Insert last literals (if any exist) in the block as a sequence with ml == off == 0.
* If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker
* for the block boundary, according to the API.
*/
assert(seqStoreLiteralsSize >= literalsRead);
lastLLSize = seqStoreLiteralsSize - literalsRead;
outSeqs[i].litLength = (U32)lastLLSize;
outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0;
seqStoreSeqSize++;
zc->seqCollector.seqIndex += seqStoreSeqSize;
} }
size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
@ -2584,6 +2596,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
if (zc->seqCollector.collectSequences) { if (zc->seqCollector.collectSequences) {
ZSTD_copyBlockSequences(zc); ZSTD_copyBlockSequences(zc);
ZSTD_confirmRepcodesAndEntropyTables(zc);
return 0; return 0;
} }

View File

@ -1116,21 +1116,39 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params; typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
typedef struct { typedef struct {
unsigned int matchPos; /* Match pos in dst */ unsigned int offset; /* The offset of the match. (NOT the same as the offset code)
/* If seqDef.offset > 3, then this is seqDef.offset - 3 * If offset == 0 and matchLength == 0, this sequence represents the last
* If seqDef.offset < 3, then this is the corresponding repeat offset * literals in the block of litLength size.
* But if seqDef.offset < 3 and litLength == 0, this is the */
* repeat offset before the corresponding repeat offset
* And if seqDef.offset == 3 and litLength == 0, this is the unsigned int litLength; /* Literal length of the sequence. */
* most recent repeat offset - 1 unsigned int matchLength; /* Match length of the sequence. */
*/
unsigned int offset; /* Note: Users of this API may provide a sequence with matchLength == litLength == offset == 0.
unsigned int litLength; /* Literal length */ * In this case, we will treat the sequence as a marker for a block boundary.
unsigned int matchLength; /* Match length */ */
/* 0 when seq not rep and seqDef.offset otherwise
* when litLength == 0 this will be <= 4, otherwise <= 3 like normal unsigned int rep; /* Indicates which repeat offset is stored in the field 'offset'.
*/ * Ranges from [0, 3].
unsigned int rep; *
* Repeat offsets are essentially previous offsets from previous sequences sorted in
* recency order. For more detail, see doc/zstd_compression_format.md
*
* If rep == 0, then 'offset' does not contain a repeat offset.
* If rep > 0:
* If litLength != 0:
* rep == 1 --> offset == repeat_offset_1
* rep == 2 --> offset == repeat_offset_2
* rep == 3 --> offset == repeat_offset_3
* If litLength == 0:
* rep == 1 --> offset == repeat_offset_2
* rep == 2 --> offset == repeat_offset_3
* rep == 3 --> offset == repeat_offset_1 - 1
*
* Note: This field is optional. ZSTD_getSequences() will calculate the value of
* 'rep', but repeat offsets do not necessarily need to be calculated from an external
* sequence provider's perspective.
*/
} ZSTD_Sequence; } ZSTD_Sequence;
typedef struct { typedef struct {
@ -1276,7 +1294,9 @@ ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcS
ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
/*! ZSTD_getSequences() : /*! ZSTD_getSequences() :
* Extract sequences from the sequence store * Extract sequences from the sequence store.
* Each block will end with a dummy sequence with offset == 0, matchLength == 0, and litLength == length of last literals.
*
* zc can be used to insert custom compression params. * zc can be used to insert custom compression params.
* This function invokes ZSTD_compress2 * This function invokes ZSTD_compress2
* @return : number of sequences extracted * @return : number of sequences extracted

View File

@ -309,22 +309,23 @@ static void FUZ_decodeSequences(BYTE* dst, ZSTD_Sequence* seqs, size_t seqsSize,
{ {
size_t i; size_t i;
size_t j; size_t j;
for(i = 0; i < seqsSize - 1; ++i) { for(i = 0; i < seqsSize; ++i) {
assert(dst + seqs[i].litLength + seqs[i].matchLength < dst + size); assert(dst + seqs[i].litLength + seqs[i].matchLength <= dst + size);
assert(src + seqs[i].litLength + seqs[i].matchLength < src + size); assert(src + seqs[i].litLength + seqs[i].matchLength <= src + size);
memcpy(dst, src, seqs[i].litLength); memcpy(dst, src, seqs[i].litLength);
dst += seqs[i].litLength; dst += seqs[i].litLength;
src += seqs[i].litLength; src += seqs[i].litLength;
size -= seqs[i].litLength; size -= seqs[i].litLength;
for (j = 0; j < seqs[i].matchLength; ++j) if (seqs[i].offset != 0) {
dst[j] = dst[j - seqs[i].offset]; for (j = 0; j < seqs[i].matchLength; ++j)
dst += seqs[i].matchLength; dst[j] = dst[j - seqs[i].offset];
src += seqs[i].matchLength; dst += seqs[i].matchLength;
size -= seqs[i].matchLength; src += seqs[i].matchLength;
size -= seqs[i].matchLength;
}
} }
memcpy(dst, src, size);
} }
/*============================================= /*=============================================
@ -2726,6 +2727,7 @@ static int basicUnitTests(U32 const seed, double compressibility)
ZSTD_freeCCtx(cctx); ZSTD_freeCCtx(cctx);
free(seqs); free(seqs);
} }
DISPLAYLEVEL(3, "OK \n");
/* Multiple blocks of zeros test */ /* Multiple blocks of zeros test */
#define LONGZEROSLENGTH 1000000 /* 1MB of zeros */ #define LONGZEROSLENGTH 1000000 /* 1MB of zeros */