Mirror of https://github.com/facebook/zstd.git (synced 2025-03-06 16:56:49 +02:00)
Merge pull request #2376 from senhuang42/clarify_sequence_extraction_api
Refine external ZSTD_Sequence API
This commit is contained in:
commit c37c714ef1
@@ -341,23 +341,28 @@ MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src,
|
|||||||
* Private declarations
|
* Private declarations
|
||||||
*********************************************/
|
*********************************************/
|
||||||
typedef struct seqDef_s {
|
typedef struct seqDef_s {
|
||||||
U32 offset;
|
U32 offset; /* Offset code of the sequence */
|
||||||
U16 litLength;
|
U16 litLength;
|
||||||
U16 matchLength;
|
U16 matchLength;
|
||||||
} seqDef;
|
} seqDef;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
seqDef* sequencesStart;
|
seqDef* sequencesStart;
|
||||||
seqDef* sequences;
|
seqDef* sequences; /* ptr to end of sequences */
|
||||||
BYTE* litStart;
|
BYTE* litStart;
|
||||||
BYTE* lit;
|
BYTE* lit; /* ptr to end of literals */
|
||||||
BYTE* llCode;
|
BYTE* llCode;
|
||||||
BYTE* mlCode;
|
BYTE* mlCode;
|
||||||
BYTE* ofCode;
|
BYTE* ofCode;
|
||||||
size_t maxNbSeq;
|
size_t maxNbSeq;
|
||||||
size_t maxNbLit;
|
size_t maxNbLit;
|
||||||
U32 longLengthID; /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
|
|
||||||
U32 longLengthPos;
|
/* longLengthPos and longLengthID to allow us to represent either a single litLength or matchLength
|
||||||
|
* in the seqStore that has a value larger than U16 (if it exists). To do so, we increment
|
||||||
|
* the existing value of the litLength or matchLength by 0x10000.
|
||||||
|
*/
|
||||||
|
U32 longLengthID; /* 0 == no longLength; 1 == Represent the long literal; 2 == Represent the long match; */
|
||||||
|
U32 longLengthPos; /* Index of the sequence to apply long length modification to */
|
||||||
} seqStore_t;
|
} seqStore_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
@@ -2464,17 +2464,22 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
|
|||||||
static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
|
static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
|
||||||
{
|
{
|
||||||
const seqStore_t* seqStore = ZSTD_getSeqStore(zc);
|
const seqStore_t* seqStore = ZSTD_getSeqStore(zc);
|
||||||
const seqDef* seqs = seqStore->sequencesStart;
|
const seqDef* seqStoreSeqs = seqStore->sequencesStart;
|
||||||
size_t seqsSize = seqStore->sequences - seqs;
|
size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs;
|
||||||
|
size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart);
|
||||||
|
size_t literalsRead = 0;
|
||||||
|
size_t lastLLSize;
|
||||||
|
|
||||||
ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex];
|
ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex];
|
||||||
size_t i; size_t position; int repIdx;
|
size_t i;
|
||||||
|
int repIdx;
|
||||||
|
|
||||||
assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences);
|
assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences);
|
||||||
for (i = 0, position = 0; i < seqsSize; ++i) {
|
/* Ensure we have enough space for last literals "sequence" */
|
||||||
outSeqs[i].offset = seqs[i].offset;
|
assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1);
|
||||||
outSeqs[i].litLength = seqs[i].litLength;
|
for (i = 0; i < seqStoreSeqSize; ++i) {
|
||||||
outSeqs[i].matchLength = seqs[i].matchLength + MINMATCH;
|
outSeqs[i].litLength = seqStoreSeqs[i].litLength;
|
||||||
|
outSeqs[i].matchLength = seqStoreSeqs[i].matchLength + MINMATCH;
|
||||||
|
|
||||||
if (i == seqStore->longLengthPos) {
|
if (i == seqStore->longLengthPos) {
|
||||||
if (seqStore->longLengthID == 1) {
|
if (seqStore->longLengthID == 1) {
|
||||||
@@ -2484,32 +2489,39 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (outSeqs[i].offset <= ZSTD_REP_NUM) {
|
if (seqStoreSeqs[i].offset <= ZSTD_REP_NUM) {
|
||||||
outSeqs[i].rep = outSeqs[i].offset;
|
outSeqs[i].rep = seqStoreSeqs[i].offset;
|
||||||
repIdx = (unsigned int)i - outSeqs[i].offset;
|
repIdx = (unsigned int)i - seqStoreSeqs[i].offset;
|
||||||
|
|
||||||
if (outSeqs[i].litLength == 0) {
|
if (seqStoreSeqs[i].litLength == 0) {
|
||||||
if (outSeqs[i].offset < 3) {
|
if (seqStoreSeqs[i].offset < 3) {
|
||||||
--repIdx;
|
--repIdx;
|
||||||
} else {
|
} else {
|
||||||
repIdx = (unsigned int)i - 1;
|
repIdx = (unsigned int)i - 1;
|
||||||
}
|
}
|
||||||
++outSeqs[i].rep;
|
|
||||||
}
|
}
|
||||||
assert(repIdx >= -3);
|
assert(repIdx >= -3);
|
||||||
outSeqs[i].offset = repIdx >= 0 ? outSeqs[repIdx].offset : repStartValue[-repIdx - 1];
|
outSeqs[i].offset = repIdx >= 0 ? outSeqs[repIdx].offset : repStartValue[-repIdx - 1];
|
||||||
if (outSeqs[i].rep == 4) {
|
if (outSeqs[i].rep == 3 && outSeqs[i].litLength == 0) {
|
||||||
--outSeqs[i].offset;
|
--outSeqs[i].offset;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
outSeqs[i].offset -= ZSTD_REP_NUM;
|
outSeqs[i].offset = seqStoreSeqs[i].offset - ZSTD_REP_NUM;
|
||||||
}
|
}
|
||||||
|
literalsRead += outSeqs[i].litLength;
|
||||||
position += outSeqs[i].litLength;
|
|
||||||
outSeqs[i].matchPos = (unsigned int)position;
|
|
||||||
position += outSeqs[i].matchLength;
|
|
||||||
}
|
}
|
||||||
zc->seqCollector.seqIndex += seqsSize;
|
|
||||||
|
/* Insert last literals (if any exist) in the block as a sequence with ml == off == 0.
|
||||||
|
* If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker
|
||||||
|
* for the block boundary, according to the API.
|
||||||
|
*/
|
||||||
|
assert(seqStoreLiteralsSize >= literalsRead);
|
||||||
|
lastLLSize = seqStoreLiteralsSize - literalsRead;
|
||||||
|
outSeqs[i].litLength = (U32)lastLLSize;
|
||||||
|
outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0;
|
||||||
|
seqStoreSeqSize++;
|
||||||
|
|
||||||
|
zc->seqCollector.seqIndex += seqStoreSeqSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
|
size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
|
||||||
@@ -2584,6 +2596,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
|
|||||||
|
|
||||||
if (zc->seqCollector.collectSequences) {
|
if (zc->seqCollector.collectSequences) {
|
||||||
ZSTD_copyBlockSequences(zc);
|
ZSTD_copyBlockSequences(zc);
|
||||||
|
ZSTD_confirmRepcodesAndEntropyTables(zc);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
lib/zstd.h (52 lines changed)
@@ -1116,21 +1116,39 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
|
|||||||
typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
|
typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
unsigned int matchPos; /* Match pos in dst */
|
unsigned int offset; /* The offset of the match. (NOT the same as the offset code)
|
||||||
/* If seqDef.offset > 3, then this is seqDef.offset - 3
|
* If offset == 0 and matchLength == 0, this sequence represents the last
|
||||||
* If seqDef.offset < 3, then this is the corresponding repeat offset
|
* literals in the block of litLength size.
|
||||||
* But if seqDef.offset < 3 and litLength == 0, this is the
|
*/
|
||||||
* repeat offset before the corresponding repeat offset
|
|
||||||
* And if seqDef.offset == 3 and litLength == 0, this is the
|
unsigned int litLength; /* Literal length of the sequence. */
|
||||||
* most recent repeat offset - 1
|
unsigned int matchLength; /* Match length of the sequence. */
|
||||||
*/
|
|
||||||
unsigned int offset;
|
/* Note: Users of this API may provide a sequence with matchLength == litLength == offset == 0.
|
||||||
unsigned int litLength; /* Literal length */
|
* In this case, we will treat the sequence as a marker for a block boundary.
|
||||||
unsigned int matchLength; /* Match length */
|
*/
|
||||||
/* 0 when seq not rep and seqDef.offset otherwise
|
|
||||||
* when litLength == 0 this will be <= 4, otherwise <= 3 like normal
|
unsigned int rep; /* Represents which repeat offset is represented by the field 'offset'.
|
||||||
*/
|
* Ranges from [0, 3].
|
||||||
unsigned int rep;
|
*
|
||||||
|
* Repeat offsets are essentially previous offsets from previous sequences sorted in
|
||||||
|
* recency order. For more detail, see doc/zstd_compression_format.md
|
||||||
|
*
|
||||||
|
* If rep == 0, then 'offset' does not contain a repeat offset.
|
||||||
|
* If rep > 0:
|
||||||
|
* If litLength != 0:
|
||||||
|
* rep == 1 --> offset == repeat_offset_1
|
||||||
|
* rep == 2 --> offset == repeat_offset_2
|
||||||
|
* rep == 3 --> offset == repeat_offset_3
|
||||||
|
* If litLength == 0:
|
||||||
|
* rep == 1 --> offset == repeat_offset_2
|
||||||
|
* rep == 2 --> offset == repeat_offset_3
|
||||||
|
* rep == 3 --> offset == repeat_offset_1 - 1
|
||||||
|
*
|
||||||
|
* Note: This field is optional. ZSTD_getSequences() will calculate the value of
|
||||||
|
* 'rep', but repeat offsets do not necessarily need to be calculated from an external
|
||||||
|
* sequence provider's perspective.
|
||||||
|
*/
|
||||||
} ZSTD_Sequence;
|
} ZSTD_Sequence;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
@@ -1276,7 +1294,9 @@ ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcS
|
|||||||
ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
|
ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
|
||||||
|
|
||||||
/*! ZSTD_getSequences() :
|
/*! ZSTD_getSequences() :
|
||||||
* Extract sequences from the sequence store
|
* Extract sequences from the sequence store.
|
||||||
|
* Each block will end with a dummy sequence with offset == 0, matchLength == 0, and litLength == length of last literals.
|
||||||
|
*
|
||||||
* zc can be used to insert custom compression params.
|
* zc can be used to insert custom compression params.
|
||||||
* This function invokes ZSTD_compress2
|
* This function invokes ZSTD_compress2
|
||||||
* @return : number of sequences extracted
|
* @return : number of sequences extracted
|
||||||
|
@@ -309,22 +309,23 @@ static void FUZ_decodeSequences(BYTE* dst, ZSTD_Sequence* seqs, size_t seqsSize,
|
|||||||
{
|
{
|
||||||
size_t i;
|
size_t i;
|
||||||
size_t j;
|
size_t j;
|
||||||
for(i = 0; i < seqsSize - 1; ++i) {
|
for(i = 0; i < seqsSize; ++i) {
|
||||||
assert(dst + seqs[i].litLength + seqs[i].matchLength < dst + size);
|
assert(dst + seqs[i].litLength + seqs[i].matchLength <= dst + size);
|
||||||
assert(src + seqs[i].litLength + seqs[i].matchLength < src + size);
|
assert(src + seqs[i].litLength + seqs[i].matchLength <= src + size);
|
||||||
|
|
||||||
memcpy(dst, src, seqs[i].litLength);
|
memcpy(dst, src, seqs[i].litLength);
|
||||||
dst += seqs[i].litLength;
|
dst += seqs[i].litLength;
|
||||||
src += seqs[i].litLength;
|
src += seqs[i].litLength;
|
||||||
size -= seqs[i].litLength;
|
size -= seqs[i].litLength;
|
||||||
|
|
||||||
for (j = 0; j < seqs[i].matchLength; ++j)
|
if (seqs[i].offset != 0) {
|
||||||
dst[j] = dst[j - seqs[i].offset];
|
for (j = 0; j < seqs[i].matchLength; ++j)
|
||||||
dst += seqs[i].matchLength;
|
dst[j] = dst[j - seqs[i].offset];
|
||||||
src += seqs[i].matchLength;
|
dst += seqs[i].matchLength;
|
||||||
size -= seqs[i].matchLength;
|
src += seqs[i].matchLength;
|
||||||
|
size -= seqs[i].matchLength;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
memcpy(dst, src, size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*=============================================
|
/*=============================================
|
||||||
@@ -2726,6 +2727,7 @@ static int basicUnitTests(U32 const seed, double compressibility)
|
|||||||
ZSTD_freeCCtx(cctx);
|
ZSTD_freeCCtx(cctx);
|
||||||
free(seqs);
|
free(seqs);
|
||||||
}
|
}
|
||||||
|
DISPLAYLEVEL(3, "OK \n");
|
||||||
|
|
||||||
/* Multiple blocks of zeros test */
|
/* Multiple blocks of zeros test */
|
||||||
#define LONGZEROSLENGTH 1000000 /* 1MB of zeros */
|
#define LONGZEROSLENGTH 1000000 /* 1MB of zeros */
|
||||||
|
Loading…
x
Reference in New Issue
Block a user