1
0
mirror of https://github.com/facebook/zstd.git synced 2025-09-16 09:36:32 +02:00

Improve ZSTD_get1BlockSummary

Add a faster scalar implementation of ZSTD_get1BlockSummary which
removes the data dependency of the accumulators in the hot loop to
leverage the superscalar potential of recent out-of-order CPUs.
The new algorithm leverages SWAR (SIMD Within A Register) methodology
to exploit the capabilities of 64-bit architectures. It achieves this
by packing two 32-bit data elements into a single 64-bit register,
enabling parallel operations on these subcomponents while ensuring
that the 32-bit boundaries prevent overflow, thereby optimizing
computational efficiency.

Corresponding unit tests are included.

Relative performance to GCC-13 using: `./fullbench -b19 -l5 enwik5`

Neoverse-V2   before     after
GCC-13:      100.000%  290.527%
GCC-14:      100.000%  291.714%
GCC-15:       99.914%  291.495%
Clang-18:    148.072%  264.524%
Clang-19:    148.075%  264.512%
Clang-20:    148.062%  264.490%

Cortex-A720   before     after
GCC-13:      100.000%  235.261%
GCC-14:      101.064%  234.903%
GCC-15:      112.977%  218.547%
Clang-18:    127.135%  180.359%
Clang-19:    127.149%  180.297%
Clang-20:    127.154%  180.260%

Co-authored-by: Thomas Daubney <Thomas.Daubney@arm.com>
This commit is contained in:
Arpad Panyik
2025-07-08 17:05:45 +00:00
parent 1dbc2e0908
commit 8e4400463a
2 changed files with 171 additions and 13 deletions

View File

@@ -7604,29 +7604,104 @@ BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)
#else
/*
 * Tells whether the match-length half of a packed 64-bit
 * (litLength, matchLength) pair is zero.
 *
 * `litMatchLength` is expected to hold two adjacent 32-bit fields that were
 * loaded together as one native 64-bit value, starting at `litLength`.
 * Which half carries the match length therefore depends on endianness:
 * - On little-endian systems the field at the higher address lands in the
 *   upper 32 bits, so the upper half is tested (equivalent to checking that
 *   the whole value is at most 0xFFFFFFFF).
 * - On big-endian systems the field at the higher address lands in the
 *   lower 32 bits, so the lower half is tested.
 *
 * @returns 1 if the match-length half is zero, 0 otherwise.
 */
FORCE_INLINE_TEMPLATE int matchLengthHalfIsZero(U64 litMatchLength)
{
    if (!MEM_isLittleEndian()) {
        /* big-endian: match length sits in the low word */
        U32 const lowHalf = (U32)litMatchLength;
        return lowHalf == 0;
    }
    /* little-endian: match length sits in the high word */
    return (litMatchLength >> 32) == 0;
}
BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)
{
size_t totalMatchSize = 0;
size_t litSize = 0;
size_t n;
/* Use multiple accumulators for efficient use of wide out-of-order machines. */
U64 litMatchSize0 = 0;
U64 litMatchSize1 = 0;
U64 litMatchSize2 = 0;
U64 litMatchSize3 = 0;
size_t n = 0;
ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) + 4 == offsetof(ZSTD_Sequence, matchLength));
ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) + 4 == offsetof(ZSTD_Sequence, rep));
assert(seqs);
for (n=0; n<nbSeqs; n++) {
totalMatchSize += seqs[n].matchLength;
litSize += seqs[n].litLength;
if (seqs[n].matchLength == 0) {
if (nbSeqs > 3) {
/* Process the input in 4 independent streams to reach high throughput. */
do {
/* Load `litLength` and `matchLength` as a packed `U64`. It is safe
* to use 64-bit unsigned arithmetic here because the sum of `litLength`
* and `matchLength` cannot exceed the block size, so the 32-bit
* subparts will never overflow. */
U64 litMatchLength = MEM_read64(&seqs[n].litLength);
litMatchSize0 += litMatchLength;
if (matchLengthHalfIsZero(litMatchLength)) {
assert(seqs[n].offset == 0);
goto _out;
}
litMatchLength = MEM_read64(&seqs[n + 1].litLength);
litMatchSize1 += litMatchLength;
if (matchLengthHalfIsZero(litMatchLength)) {
n += 1;
assert(seqs[n].offset == 0);
goto _out;
}
litMatchLength = MEM_read64(&seqs[n + 2].litLength);
litMatchSize2 += litMatchLength;
if (matchLengthHalfIsZero(litMatchLength)) {
n += 2;
assert(seqs[n].offset == 0);
goto _out;
}
litMatchLength = MEM_read64(&seqs[n + 3].litLength);
litMatchSize3 += litMatchLength;
if (matchLengthHalfIsZero(litMatchLength)) {
n += 3;
assert(seqs[n].offset == 0);
goto _out;
}
n += 4;
} while(n < nbSeqs - 3);
}
for (; n < nbSeqs; n++) {
U64 litMatchLength = MEM_read64(&seqs[n].litLength);
litMatchSize0 += litMatchLength;
if (matchLengthHalfIsZero(litMatchLength)) {
assert(seqs[n].offset == 0);
break;
goto _out;
}
}
if (n==nbSeqs) {
BlockSummary bs;
/* At this point n == nbSeqs, so no end terminator. */
{ BlockSummary bs;
bs.nbSequences = ERROR(externalSequences_invalid);
return bs;
}
_out:
litMatchSize0 += litMatchSize1 + litMatchSize2 + litMatchSize3;
{ BlockSummary bs;
bs.nbSequences = n+1;
bs.blockSize = litSize + totalMatchSize;
bs.litSize = litSize;
bs.nbSequences = n + 1;
if (MEM_isLittleEndian()) {
bs.litSize = (U32)litMatchSize0;
bs.blockSize = bs.litSize + (litMatchSize0 >> 32);
} else {
bs.litSize = litMatchSize0 >> 32;
bs.blockSize = bs.litSize + (U32)litMatchSize0;
}
return bs;
}
}

View File

@@ -45,6 +45,7 @@
#include "zstd_internal.h" /* ZSTD_WORKSPACETOOLARGE_MAXDURATION, ZSTD_WORKSPACETOOLARGE_FACTOR, KB, MB */
#include "threading.h" /* ZSTD_pthread_create, ZSTD_pthread_join */
#include "compress/hist.h" /* HIST_count_wksp */
#include "compress/zstd_compress_internal.h" /* ZSTD_get1BlockSummary */
/*-************************************
@@ -769,6 +770,86 @@ static void test_blockSplitter_incompressibleExpansionProtection(unsigned testNb
DISPLAYLEVEL(3, "OK \n");
}
/* Unit tests for ZSTD_get1BlockSummary().
 * `testNb` is the number printed for the first test case; it is incremented
 * once per case, and the next unused number is returned. */
static unsigned test_get1BlockSummary(unsigned testNb)
{
/* Shared input for the sliding-window cases at the end of this function.
 * Each entry is presumably { offset, litLength, matchLength, rep } (field
 * order not visible here -- verify against the ZSTD_Sequence declaration).
 * Entry [6] is the only one whose third value is 0, i.e. the terminator. */
static const ZSTD_Sequence nseqs[] = {
{ 10, 2, 4, 1 },
{ 20, 3, 5, 2 },
{ 30, 6, 8, 3 },
{ 40, 7, 9, 4 },
{ 50, 10, 12, 5 },
{ 60, 11, 13, 6 },
{ 0, 14, 0, 7 },
{ 70, 15, 17, 8 },
{ 80, 16, 18, 9 },
{ 90, 19, 21, 1 },
{ 99, 20, 22, 2 },
};
/* blocks[i] is the expected summary when scanning starts at nseqs[i].
 * The values are consistent with the order { nbSequences, blockSize, litSize }:
 * e.g. for i == 0 the second values of entries [0..6] sum to 53 and the
 * third values sum to 51, giving 104 in total. */
static const BlockSummary blocks[] = {
{ 7, 104, 53 },
{ 6, 98, 51 },
{ 5, 90, 48 },
{ 4, 76, 42 },
{ 3, 60, 35 },
{ 2, 38, 25 },
{ 1, 14, 14 },
};
size_t i;
/* Zero sequences: no terminator can be found, so an error code is expected. */
DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with empty array : ", testNb++);
{
BlockSummary bs = ZSTD_get1BlockSummary(nseqs, 0);
CHECK_EQ(bs.nbSequences, ERROR(externalSequences_invalid));
}
DISPLAYLEVEL(3, "OK \n");
/* A lone terminator: one sequence, and the block consists of its 5 literals. */
DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with 1 literal only : ", testNb++);
{
static const ZSTD_Sequence seqs[] = { { 0, 5, 0, 0 } };
BlockSummary bs = ZSTD_get1BlockSummary(seqs, 1);
CHECK_EQ(bs.nbSequences, 1);
CHECK_EQ(bs.litSize, 5);
CHECK_EQ(bs.blockSize, 5);
}
DISPLAYLEVEL(3, "OK \n");
/* Two sequences, neither with a zero third value: an error code is expected. */
DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with no terminator : ", testNb++);
{
static const ZSTD_Sequence seqs[] = { { 10, 2, 4, 0 }, { 20, 3, 5, 0 } };
BlockSummary bs = ZSTD_get1BlockSummary(seqs, 2);
CHECK_EQ(bs.nbSequences, ERROR(externalSequences_invalid));
}
DISPLAYLEVEL(3, "OK \n");
/* Non-zero fourth values (presumably `rep`) must not affect the totals. */
DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with rep ignored : ", testNb++);
{
static const ZSTD_Sequence seqs[] = {
{ 10, 2, 4, 2 },
{ 10, 3, 5, 2 },
{ 0, 7, 0, 3 },
};
BlockSummary bs = ZSTD_get1BlockSummary(seqs, 3);
CHECK_EQ(bs.nbSequences, 3);
CHECK_EQ(bs.litSize, 2 + 3 + 7);
CHECK_EQ(bs.blockSize, (4 + 5) + (2 + 3 + 7));
}
DISPLAYLEVEL(3, "OK \n");
/* Guarantees that COUNTOF(nseqs) - i stays positive for every i used below. */
assert(COUNTOF(nseqs) > COUNTOF(blocks));
/* Slide the start of the input forward one sequence at a time; the number
 * of sequences ahead of the terminator shrinks from 6 down to 0. */
for (i = 0; i < COUNTOF(blocks); ++i) {
BlockSummary bs;
DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with %u inputs : ",
testNb++, (unsigned)(COUNTOF(nseqs) - i));
bs = ZSTD_get1BlockSummary(nseqs + i, COUNTOF(nseqs) - i);
CHECK_EQ(bs.nbSequences, blocks[i].nbSequences);
CHECK_EQ(bs.litSize, blocks[i].litSize);
CHECK_EQ(bs.blockSize, blocks[i].blockSize);
DISPLAYLEVEL(3, "OK \n");
}
/* Hand the next free test number back to the caller. */
return testNb;
}
/* ============================================================= */
static int basicUnitTests(U32 const seed, double compressibility)
@@ -4004,6 +4085,8 @@ static int basicUnitTests(U32 const seed, double compressibility)
}
DISPLAYLEVEL(3, "OK \n");
testNb = test_get1BlockSummary(testNb);
DISPLAYLEVEL(3, "test%3i : ZSTD_compressSequencesAndLiterals : ", testNb++);
{
const size_t srcSize = 497000;