1
0
mirror of https://github.com/facebook/zstd.git synced 2025-03-06 16:56:49 +02:00

Merge pull request #2376 from senhuang42/clarify_sequence_extraction_api

Refine external ZSTD_Sequence API
This commit is contained in:
sen 2020-10-30 15:47:25 -04:00 committed by GitHub
commit c37c714ef1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 90 additions and 50 deletions

View File

@ -341,23 +341,28 @@ MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src,
* Private declarations
*********************************************/
typedef struct seqDef_s {
U32 offset;
U32 offset; /* Offset code of the sequence */
U16 litLength;
U16 matchLength;
} seqDef;
typedef struct {
seqDef* sequencesStart;
seqDef* sequences;
seqDef* sequences; /* ptr to end of sequences */
BYTE* litStart;
BYTE* lit;
BYTE* lit; /* ptr to end of literals */
BYTE* llCode;
BYTE* mlCode;
BYTE* ofCode;
size_t maxNbSeq;
size_t maxNbLit;
U32 longLengthID; /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
U32 longLengthPos;
/* longLengthPos and longLengthID allow us to represent either a single litLength or matchLength
* in the seqStore that has a value larger than a U16 can hold (if it exists). To do so, we increment
* the existing value of the litLength or matchLength by 0x10000.
*/
U32 longLengthID; /* 0 == no longLength; 1 == Represent the long literal; 2 == Represent the long match; */
U32 longLengthPos; /* Index of the sequence to apply long length modification to */
} seqStore_t;
typedef struct {

View File

@ -2464,17 +2464,22 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
{
const seqStore_t* seqStore = ZSTD_getSeqStore(zc);
const seqDef* seqs = seqStore->sequencesStart;
size_t seqsSize = seqStore->sequences - seqs;
const seqDef* seqStoreSeqs = seqStore->sequencesStart;
size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs;
size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart);
size_t literalsRead = 0;
size_t lastLLSize;
ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex];
size_t i; size_t position; int repIdx;
size_t i;
int repIdx;
assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences);
for (i = 0, position = 0; i < seqsSize; ++i) {
outSeqs[i].offset = seqs[i].offset;
outSeqs[i].litLength = seqs[i].litLength;
outSeqs[i].matchLength = seqs[i].matchLength + MINMATCH;
/* Ensure we have enough space for last literals "sequence" */
assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1);
for (i = 0; i < seqStoreSeqSize; ++i) {
outSeqs[i].litLength = seqStoreSeqs[i].litLength;
outSeqs[i].matchLength = seqStoreSeqs[i].matchLength + MINMATCH;
if (i == seqStore->longLengthPos) {
if (seqStore->longLengthID == 1) {
@ -2484,32 +2489,39 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
}
}
if (outSeqs[i].offset <= ZSTD_REP_NUM) {
outSeqs[i].rep = outSeqs[i].offset;
repIdx = (unsigned int)i - outSeqs[i].offset;
if (seqStoreSeqs[i].offset <= ZSTD_REP_NUM) {
outSeqs[i].rep = seqStoreSeqs[i].offset;
repIdx = (unsigned int)i - seqStoreSeqs[i].offset;
if (outSeqs[i].litLength == 0) {
if (outSeqs[i].offset < 3) {
if (seqStoreSeqs[i].litLength == 0) {
if (seqStoreSeqs[i].offset < 3) {
--repIdx;
} else {
repIdx = (unsigned int)i - 1;
}
++outSeqs[i].rep;
}
assert(repIdx >= -3);
outSeqs[i].offset = repIdx >= 0 ? outSeqs[repIdx].offset : repStartValue[-repIdx - 1];
if (outSeqs[i].rep == 4) {
if (outSeqs[i].rep == 3 && outSeqs[i].litLength == 0) {
--outSeqs[i].offset;
}
} else {
outSeqs[i].offset -= ZSTD_REP_NUM;
outSeqs[i].offset = seqStoreSeqs[i].offset - ZSTD_REP_NUM;
}
position += outSeqs[i].litLength;
outSeqs[i].matchPos = (unsigned int)position;
position += outSeqs[i].matchLength;
literalsRead += outSeqs[i].litLength;
}
zc->seqCollector.seqIndex += seqsSize;
/* Insert last literals (if any exist) in the block as a sequence with matchLength == offset == 0.
* If there are no last literals, then we'll emit (offset: 0, matchLength: 0, litLength: 0), which
* is a marker for the block boundary, according to the API.
*/
assert(seqStoreLiteralsSize >= literalsRead);
lastLLSize = seqStoreLiteralsSize - literalsRead;
outSeqs[i].litLength = (U32)lastLLSize;
outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0;
seqStoreSeqSize++;
zc->seqCollector.seqIndex += seqStoreSeqSize;
}
size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
@ -2584,6 +2596,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
if (zc->seqCollector.collectSequences) {
ZSTD_copyBlockSequences(zc);
ZSTD_confirmRepcodesAndEntropyTables(zc);
return 0;
}

View File

@ -1116,21 +1116,39 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
typedef struct {
unsigned int matchPos; /* Match pos in dst */
/* If seqDef.offset > 3, then this is seqDef.offset - 3
* If seqDef.offset < 3, then this is the corresponding repeat offset
* But if seqDef.offset < 3 and litLength == 0, this is the
* repeat offset before the corresponding repeat offset
* And if seqDef.offset == 3 and litLength == 0, this is the
* most recent repeat offset - 1
*/
unsigned int offset;
unsigned int litLength; /* Literal length */
unsigned int matchLength; /* Match length */
/* 0 when seq not rep and seqDef.offset otherwise
* when litLength == 0 this will be <= 4, otherwise <= 3 like normal
*/
unsigned int rep;
unsigned int offset; /* The offset of the match. (NOT the same as the offset code)
* If offset == 0 and matchLength == 0, this sequence represents the last
* literals in the block of litLength size.
*/
unsigned int litLength; /* Literal length of the sequence. */
unsigned int matchLength; /* Match length of the sequence. */
/* Note: Users of this API may provide a sequence with matchLength == litLength == offset == 0.
* In this case, we will treat the sequence as a marker for a block boundary.
*/
unsigned int rep; /* Indicates which repeat offset is represented by the field 'offset'.
* Ranges from [0, 3].
*
* Repeat offsets are essentially previous offsets from previous sequences sorted in
* recency order. For more detail, see doc/zstd_compression_format.md
*
* If rep == 0, then 'offset' does not contain a repeat offset.
* If rep > 0:
* If litLength != 0:
* rep == 1 --> offset == repeat_offset_1
* rep == 2 --> offset == repeat_offset_2
* rep == 3 --> offset == repeat_offset_3
* If litLength == 0:
* rep == 1 --> offset == repeat_offset_2
* rep == 2 --> offset == repeat_offset_3
* rep == 3 --> offset == repeat_offset_1 - 1
*
* Note: This field is optional. ZSTD_getSequences() will calculate the value of
* 'rep', but repeat offsets do not necessarily need to be calculated from an external
* sequence provider's perspective.
*/
} ZSTD_Sequence;
typedef struct {
@ -1276,7 +1294,9 @@ ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcS
ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
/*! ZSTD_getSequences() :
* Extract sequences from the sequence store
* Extract sequences from the sequence store.
* Each block will end with a dummy sequence with offset == 0, matchLength == 0, and litLength == length of last literals.
*
* zc can be used to insert custom compression params.
* This function invokes ZSTD_compress2
* @return : number of sequences extracted

View File

@ -309,22 +309,23 @@ static void FUZ_decodeSequences(BYTE* dst, ZSTD_Sequence* seqs, size_t seqsSize,
{
size_t i;
size_t j;
for(i = 0; i < seqsSize - 1; ++i) {
assert(dst + seqs[i].litLength + seqs[i].matchLength < dst + size);
assert(src + seqs[i].litLength + seqs[i].matchLength < src + size);
for(i = 0; i < seqsSize; ++i) {
assert(dst + seqs[i].litLength + seqs[i].matchLength <= dst + size);
assert(src + seqs[i].litLength + seqs[i].matchLength <= src + size);
memcpy(dst, src, seqs[i].litLength);
dst += seqs[i].litLength;
src += seqs[i].litLength;
size -= seqs[i].litLength;
for (j = 0; j < seqs[i].matchLength; ++j)
dst[j] = dst[j - seqs[i].offset];
dst += seqs[i].matchLength;
src += seqs[i].matchLength;
size -= seqs[i].matchLength;
if (seqs[i].offset != 0) {
for (j = 0; j < seqs[i].matchLength; ++j)
dst[j] = dst[j - seqs[i].offset];
dst += seqs[i].matchLength;
src += seqs[i].matchLength;
size -= seqs[i].matchLength;
}
}
memcpy(dst, src, size);
}
/*=============================================
@ -2726,6 +2727,7 @@ static int basicUnitTests(U32 const seed, double compressibility)
ZSTD_freeCCtx(cctx);
free(seqs);
}
DISPLAYLEVEL(3, "OK \n");
/* Multiple blocks of zeros test */
#define LONGZEROSLENGTH 1000000 /* 1MB of zeros */