mirror of https://github.com/facebook/zstd.git

speed optimized version of targetCBlockSize

Note that the size of individual compressed blocks will vary more wildly with this modification.
But it seems good enough for a first test, and it fixes the speed regression issue.
Further refinements can be attempted later.
Yann Collet
2024-02-23 14:03:26 -08:00
parent 68a232c591
commit cc4530924b
2 changed files with 58 additions and 59 deletions
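As a reading aid, here is a hedged, self-contained sketch of the arithmetic this commit introduces (the names mirror the diff below, but `splitPlan` and the numeric inputs are invented for illustration): the compressed size of the whole block is estimated once, a ceiling division against the target picks the number of sub-blocks, and the sequences are spread evenly across them, with the last sub-block absorbing the remainder.

```c
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Hedged sketch of the new splitting arithmetic (not the actual zstd API):
 * one size estimate for the whole block, a ceiling division to pick the
 * number of sub-blocks, then an even split of the sequences, with the last
 * sub-block taking the remainder. */
static void splitPlan(size_t cBlockSizeEstimate, size_t targetCBlockSize, size_t nbSeqs)
{
    size_t const nbSubBlocks = (cBlockSizeEstimate + (targetCBlockSize-1)) / targetCBlockSize;
    assert(nbSubBlocks > 0);
    {   size_t const nbSeqsPerBlock = nbSeqs / nbSubBlocks;
        size_t const nbSeqsLastSubBlock = nbSeqs - (nbSubBlocks-1) * nbSeqsPerBlock;
        printf("%u sub-blocks: %u sequences each, %u in the last one\n",
               (unsigned)nbSubBlocks, (unsigned)nbSeqsPerBlock, (unsigned)nbSeqsLastSubBlock);
    }
}

int main(void)
{
    /* made-up inputs: ~350 KB estimated block, ~16 KB target, 5000 sequences */
    splitPlan(350000, 16000, 5000);   /* -> 22 sub-blocks, 227 sequences each, 233 in the last */
    return 0;
}
```

This even split by sequence count, rather than by estimated compressed size, is what makes individual sub-block sizes vary more than before.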


@@ -122,7 +122,7 @@ ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
}
*entropyWritten = 1;
DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart));
return op-ostart;
return (size_t)(op-ostart);
}
static size_t
@@ -187,7 +187,7 @@ ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
else
op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
if (nbSeq==0) {
return op - ostart;
return (size_t)(op - ostart);
}
/* seqHead : flags for FSE encoding type */
@@ -209,7 +209,7 @@ ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
}
{ size_t const bitstreamSize = ZSTD_encodeSequences(
op, oend - op,
op, (size_t)(oend - op),
fseTables->matchlengthCTable, mlCode,
fseTables->offcodeCTable, ofCode,
fseTables->litlengthCTable, llCode,
@@ -253,7 +253,7 @@ ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
#endif
*entropyWritten = 1;
return op - ostart;
return (size_t)(op - ostart);
}
/** ZSTD_compressSubBlock() :
@@ -296,11 +296,11 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
op += cSeqSize;
}
/* Write block header */
{ size_t cSize = (op-ostart)-ZSTD_blockHeaderSize;
{ size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize;
U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
MEM_writeLE24(ostart, cBlockHeader24);
}
return op-ostart;
return (size_t)(op-ostart);
}
static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize,
@@ -419,6 +419,16 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe
return 0;
}
static size_t countLiterals(const seqDef* sp, size_t seqCount)
{
size_t n, total = 0;
assert(sp != NULL);
for (n=0; n<seqCount; n++) {
total += sp[n].litLength;
}
return total;
}
/** ZSTD_compressSubBlock_multi() :
* Breaks super-block into multiple sub-blocks and compresses them.
* Entropy will be written to the first block.
@@ -438,10 +448,12 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
{
const seqDef* const sstart = seqStorePtr->sequencesStart;
const seqDef* const send = seqStorePtr->sequences;
const seqDef* sp = sstart;
size_t const nbSeqs = (size_t)(send - sstart);
size_t nbSeqsPerBlock = nbSeqs;
const BYTE* const lstart = seqStorePtr->litStart;
const BYTE* const lend = seqStorePtr->lit;
const BYTE* lp = lstart;
size_t const nbLiterals = (size_t)(lend - lstart);
BYTE const* ip = (BYTE const*)src;
BYTE const* const iend = ip + srcSize;
BYTE* const ostart = (BYTE*)dst;
@@ -451,52 +463,50 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
const BYTE* mlCodePtr = seqStorePtr->mlCode;
const BYTE* ofCodePtr = seqStorePtr->ofCode;
size_t targetCBlockSize = cctxParams->targetCBlockSize;
size_t litSize, seqCount;
int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed;
int writeSeqEntropy = 1;
int lastSequence = 0;
size_t nbSubBlocks = 1;
DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)",
(unsigned)(lend-lp), (unsigned)(send-sstart));
litSize = 0;
seqCount = 0;
do {
size_t cBlockSizeEstimate = 0;
if (sstart == send) {
lastSequence = 1;
} else {
const seqDef* const sequence = sp + seqCount;
lastSequence = sequence == send - 1;
litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength;
seqCount++;
}
if (lastSequence) {
assert(lp <= lend);
assert(litSize <= (size_t)(lend - lp));
litSize = (size_t)(lend - lp);
}
/* I think there is an optimization opportunity here.
* Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful
* since it recalculates estimate from scratch.
* For example, it would recount literal distribution and symbol codes every time.
*/
cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount,
&nextCBlock->entropy, entropyMetadata,
workspace, wkspSize, writeLitEntropy, writeSeqEntropy);
if (cBlockSizeEstimate > targetCBlockSize || lastSequence) {
/* let's start by a general estimation for the full block */
{ size_t const cBlockSizeEstimate =
ZSTD_estimateSubBlockSize(lp, nbLiterals,
ofCodePtr, llCodePtr, mlCodePtr, nbSeqs,
&nextCBlock->entropy, entropyMetadata,
workspace, wkspSize,
writeLitEntropy, writeSeqEntropy);
/* quick estimation */
nbSubBlocks = (cBlockSizeEstimate + (targetCBlockSize-1)) / targetCBlockSize;
assert(nbSubBlocks > 0);
nbSeqsPerBlock = nbSeqs / nbSubBlocks;
/* Note: this is very approximative. Obviously, some sub-blocks will be larger and others faster.
* But the contract of this feature has always been approximative, so for now we'll leverage it for speed.
* It can be refined later, for closer-to-target compressed block size, if it ever matters. */
}
/* write sub-blocks */
{ size_t n;
for (n=0; n < nbSubBlocks; n++) {
const seqDef* sp = sstart + n*nbSeqsPerBlock;
int lastSubBlock = (n==nbSubBlocks-1);
size_t const nbSeqsLastSubBlock = nbSeqs - (nbSubBlocks-1) * nbSeqsPerBlock;
size_t seqCount = lastSubBlock ? nbSeqsLastSubBlock : nbSeqsPerBlock;
size_t litSize = lastSubBlock ? (size_t)(lend-lp) : countLiterals(sp, seqCount);
int litEntropyWritten = 0;
int seqEntropyWritten = 0;
const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence);
const size_t decompressedSize =
ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSubBlock);
const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
sp, seqCount,
lp, litSize,
llCodePtr, mlCodePtr, ofCodePtr,
cctxParams,
op, oend-op,
op, (size_t)(oend-op),
bmi2, writeLitEntropy, writeSeqEntropy,
&litEntropyWritten, &seqEntropyWritten,
lastBlock && lastSequence);
lastBlock && lastSubBlock);
FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
if (cSize > 0 && cSize < decompressedSize) {
DEBUGLOG(5, "Committed the sub-block");
@@ -519,7 +529,8 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
}
}
}
} while (!lastSequence);
}
if (writeLitEntropy) {
DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten");
ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf));
@@ -531,25 +542,10 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten");
return 0;
}
if (ip < iend) {
size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock);
DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip));
FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
assert(cSize != 0);
op += cSize;
/* We have to regenerate the repcodes because we've skipped some sequences */
if (sp < send) {
seqDef const* seq;
repcodes_t rep;
ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep));
for (seq = sstart; seq < sp; ++seq) {
ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
}
ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep));
}
}
DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed");
return op-ostart;
assert(ip == iend); (void)iend;
DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed: %u subBlocks, total compressed size = %u",
(unsigned)nbSubBlocks, (unsigned)(op-ostart));
return (size_t)(op-ostart);
}
size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,


@@ -221,7 +221,10 @@ the last one takes effect.
* `--target-compressed-block-size=#`:
Attempt to produce compressed blocks of approximately this size.
This will split larger blocks in order to approach this target.
Notably useful to improve latency when the receiver can make use of early data sooner.
This feature is notably useful for improving latency, as the receiver can start making use of early, incomplete data.
This parameter defines a loose target: compressed blocks will target this size "on average", but individual blocks can still be larger or smaller.
Enabling this feature can decrease compression speed by up to ~10% at level 1.
Higher compression levels incur a smaller relative speed regression, which becomes negligible at the highest settings (a library-level sketch follows this option list).
* `-o FILE`:
save result into `FILE`.
* `-f`, `--force`:
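For context, the CLI option above corresponds to the library's advanced parameter `ZSTD_c_targetCBlockSize`. A hedged usage sketch follows: the 16 KB target and the sample input are arbitrary, and the parameter's exact bounds and availability should be checked against the `zstd.h` shipped with your version.

```c
#include <stdio.h>
#include <string.h>
#include <zstd.h>

int main(void)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    const char src[] = "a small sample; the block-size target only matters for large inputs";
    char dst[512];
    size_t cSize;

    /* level 1 exhibits the worst-case relative slowdown mentioned above */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 1);
    /* aim for compressed blocks of roughly 16 KB (arbitrary example value) */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetCBlockSize, 16 * 1024);

    cSize = ZSTD_compress2(cctx, dst, sizeof(dst), src, strlen(src));
    if (ZSTD_isError(cSize)) {
        printf("compression error: %s\n", ZSTD_getErrorName(cSize));
    } else {
        printf("compressed %u -> %u bytes\n", (unsigned)strlen(src), (unsigned)cSize);
    }

    ZSTD_freeCCtx(cctx);
    return 0;
}
```

On the command line, the equivalent invocation would be along the lines of `zstd -1 --target-compressed-block-size=16384 FILE`.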