mirror of
https://github.com/facebook/zstd.git
synced 2025-09-16 09:36:32 +02:00
Improve ZSTD_get1BlockSummary
Add a faster scalar implementation of ZSTD_get1BlockSummary which removes the data dependency between the accumulators in the hot loop, to leverage the superscalar potential of recent out-of-order CPUs.

The new algorithm uses the SWAR (SIMD Within A Register) methodology to exploit the capabilities of 64-bit architectures. It packs two 32-bit data elements into a single 64-bit register so that one addition updates both sums in parallel; because each sum is bounded by the block size, neither 32-bit half can overflow into the other.

Corresponding unit tests are included.

Performance relative to the GCC-13 "before" build, measured with `./fullbench -b19 -l5 enwik5`:

Neoverse-V2     before      after
GCC-13:       100.000%   290.527%
GCC-14:       100.000%   291.714%
GCC-15:        99.914%   291.495%
Clang-18:     148.072%   264.524%
Clang-19:     148.075%   264.512%
Clang-20:     148.062%   264.490%

Cortex-A720     before      after
GCC-13:       100.000%   235.261%
GCC-14:       101.064%   234.903%
GCC-15:       112.977%   218.547%
Clang-18:     127.135%   180.359%
Clang-19:     127.149%   180.297%
Clang-20:     127.154%   180.260%

Co-authored-by: Thomas Daubney <Thomas.Daubney@arm.com>
This commit is contained in:
@@ -7604,29 +7604,104 @@ BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)
|
||||
|
||||
#else
|
||||
|
||||
/*
|
||||
* The function assumes `litMatchLength` is a packed 64-bit value where the
|
||||
* lower 32 bits represent the match length. The check varies based on the
|
||||
* system's endianness:
|
||||
* - On little-endian systems, it verifies if the entire 64-bit value is at most
|
||||
* 0xFFFFFFFF, indicating the match length (lower 32 bits) is zero.
|
||||
* - On big-endian systems, it directly checks if the lower 32 bits are zero.
|
||||
*
|
||||
* @returns 1 if the match length is zero, 0 otherwise.
|
||||
*/
|
||||
FORCE_INLINE_TEMPLATE int matchLengthHalfIsZero(U64 litMatchLength)
|
||||
{
|
||||
if (MEM_isLittleEndian()) {
|
||||
return litMatchLength <= 0xFFFFFFFFULL;
|
||||
} else {
|
||||
return (U32)litMatchLength == 0;
|
||||
}
|
||||
}
|
||||
|
||||
BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs)
|
||||
{
|
||||
size_t totalMatchSize = 0;
|
||||
size_t litSize = 0;
|
||||
size_t n;
|
||||
/* Use multiple accumulators for efficient use of wide out-of-order machines. */
|
||||
U64 litMatchSize0 = 0;
|
||||
U64 litMatchSize1 = 0;
|
||||
U64 litMatchSize2 = 0;
|
||||
U64 litMatchSize3 = 0;
|
||||
size_t n = 0;
|
||||
|
||||
ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) + 4 == offsetof(ZSTD_Sequence, matchLength));
|
||||
ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) + 4 == offsetof(ZSTD_Sequence, rep));
|
||||
assert(seqs);
|
||||
for (n=0; n<nbSeqs; n++) {
|
||||
totalMatchSize += seqs[n].matchLength;
|
||||
litSize += seqs[n].litLength;
|
||||
if (seqs[n].matchLength == 0) {
|
||||
|
||||
if (nbSeqs > 3) {
|
||||
/* Process the input in 4 independent streams to reach high throughput. */
|
||||
do {
|
||||
/* Load `litLength` and `matchLength` as a packed `U64`. It is safe
|
||||
* to use 64-bit unsigned arithmetic here because the sum of `litLength`
|
||||
* and `matchLength` cannot exceed the block size, so the 32-bit
|
||||
* subparts will never overflow. */
|
||||
U64 litMatchLength = MEM_read64(&seqs[n].litLength);
|
||||
litMatchSize0 += litMatchLength;
|
||||
if (matchLengthHalfIsZero(litMatchLength)) {
|
||||
assert(seqs[n].offset == 0);
|
||||
goto _out;
|
||||
}
|
||||
|
||||
litMatchLength = MEM_read64(&seqs[n + 1].litLength);
|
||||
litMatchSize1 += litMatchLength;
|
||||
if (matchLengthHalfIsZero(litMatchLength)) {
|
||||
n += 1;
|
||||
assert(seqs[n].offset == 0);
|
||||
goto _out;
|
||||
}
|
||||
|
||||
litMatchLength = MEM_read64(&seqs[n + 2].litLength);
|
||||
litMatchSize2 += litMatchLength;
|
||||
if (matchLengthHalfIsZero(litMatchLength)) {
|
||||
n += 2;
|
||||
assert(seqs[n].offset == 0);
|
||||
goto _out;
|
||||
}
|
||||
|
||||
litMatchLength = MEM_read64(&seqs[n + 3].litLength);
|
||||
litMatchSize3 += litMatchLength;
|
||||
if (matchLengthHalfIsZero(litMatchLength)) {
|
||||
n += 3;
|
||||
assert(seqs[n].offset == 0);
|
||||
goto _out;
|
||||
}
|
||||
|
||||
n += 4;
|
||||
} while(n < nbSeqs - 3);
|
||||
}
|
||||
|
||||
for (; n < nbSeqs; n++) {
|
||||
U64 litMatchLength = MEM_read64(&seqs[n].litLength);
|
||||
litMatchSize0 += litMatchLength;
|
||||
if (matchLengthHalfIsZero(litMatchLength)) {
|
||||
assert(seqs[n].offset == 0);
|
||||
break;
|
||||
goto _out;
|
||||
}
|
||||
}
|
||||
if (n==nbSeqs) {
|
||||
BlockSummary bs;
|
||||
/* At this point n == nbSeqs, so no end terminator. */
|
||||
{ BlockSummary bs;
|
||||
bs.nbSequences = ERROR(externalSequences_invalid);
|
||||
return bs;
|
||||
}
|
||||
_out:
|
||||
litMatchSize0 += litMatchSize1 + litMatchSize2 + litMatchSize3;
|
||||
{ BlockSummary bs;
|
||||
bs.nbSequences = n+1;
|
||||
bs.blockSize = litSize + totalMatchSize;
|
||||
bs.litSize = litSize;
|
||||
bs.nbSequences = n + 1;
|
||||
if (MEM_isLittleEndian()) {
|
||||
bs.litSize = (U32)litMatchSize0;
|
||||
bs.blockSize = bs.litSize + (litMatchSize0 >> 32);
|
||||
} else {
|
||||
bs.litSize = litMatchSize0 >> 32;
|
||||
bs.blockSize = bs.litSize + (U32)litMatchSize0;
|
||||
}
|
||||
return bs;
|
||||
}
|
||||
}
|
||||
|
@@ -45,6 +45,7 @@
|
||||
#include "zstd_internal.h" /* ZSTD_WORKSPACETOOLARGE_MAXDURATION, ZSTD_WORKSPACETOOLARGE_FACTOR, KB, MB */
|
||||
#include "threading.h" /* ZSTD_pthread_create, ZSTD_pthread_join */
|
||||
#include "compress/hist.h" /* HIST_count_wksp */
|
||||
#include "compress/zstd_compress_internal.h" /* ZSTD_get1BlockSummary */
|
||||
|
||||
|
||||
/*-************************************
|
||||
@@ -769,6 +770,86 @@ static void test_blockSplitter_incompressibleExpansionProtection(unsigned testNb
|
||||
DISPLAYLEVEL(3, "OK \n");
|
||||
}
|
||||
|
||||
static unsigned test_get1BlockSummary(unsigned testNb)
|
||||
{
|
||||
static const ZSTD_Sequence nseqs[] = {
|
||||
{ 10, 2, 4, 1 },
|
||||
{ 20, 3, 5, 2 },
|
||||
{ 30, 6, 8, 3 },
|
||||
{ 40, 7, 9, 4 },
|
||||
{ 50, 10, 12, 5 },
|
||||
{ 60, 11, 13, 6 },
|
||||
{ 0, 14, 0, 7 },
|
||||
{ 70, 15, 17, 8 },
|
||||
{ 80, 16, 18, 9 },
|
||||
{ 90, 19, 21, 1 },
|
||||
{ 99, 20, 22, 2 },
|
||||
};
|
||||
static const BlockSummary blocks[] = {
|
||||
{ 7, 104, 53 },
|
||||
{ 6, 98, 51 },
|
||||
{ 5, 90, 48 },
|
||||
{ 4, 76, 42 },
|
||||
{ 3, 60, 35 },
|
||||
{ 2, 38, 25 },
|
||||
{ 1, 14, 14 },
|
||||
};
|
||||
size_t i;
|
||||
|
||||
DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with empty array : ", testNb++);
|
||||
{
|
||||
BlockSummary bs = ZSTD_get1BlockSummary(nseqs, 0);
|
||||
CHECK_EQ(bs.nbSequences, ERROR(externalSequences_invalid));
|
||||
}
|
||||
DISPLAYLEVEL(3, "OK \n");
|
||||
|
||||
DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with 1 literal only : ", testNb++);
|
||||
{
|
||||
static const ZSTD_Sequence seqs[] = { { 0, 5, 0, 0 } };
|
||||
BlockSummary bs = ZSTD_get1BlockSummary(seqs, 1);
|
||||
CHECK_EQ(bs.nbSequences, 1);
|
||||
CHECK_EQ(bs.litSize, 5);
|
||||
CHECK_EQ(bs.blockSize, 5);
|
||||
}
|
||||
DISPLAYLEVEL(3, "OK \n");
|
||||
|
||||
DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with no terminator : ", testNb++);
|
||||
{
|
||||
static const ZSTD_Sequence seqs[] = { { 10, 2, 4, 0 }, { 20, 3, 5, 0 } };
|
||||
BlockSummary bs = ZSTD_get1BlockSummary(seqs, 2);
|
||||
CHECK_EQ(bs.nbSequences, ERROR(externalSequences_invalid));
|
||||
}
|
||||
DISPLAYLEVEL(3, "OK \n");
|
||||
|
||||
DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with rep ignored : ", testNb++);
|
||||
{
|
||||
static const ZSTD_Sequence seqs[] = {
|
||||
{ 10, 2, 4, 2 },
|
||||
{ 10, 3, 5, 2 },
|
||||
{ 0, 7, 0, 3 },
|
||||
};
|
||||
BlockSummary bs = ZSTD_get1BlockSummary(seqs, 3);
|
||||
CHECK_EQ(bs.nbSequences, 3);
|
||||
CHECK_EQ(bs.litSize, 2 + 3 + 7);
|
||||
CHECK_EQ(bs.blockSize, (4 + 5) + (2 + 3 + 7));
|
||||
}
|
||||
DISPLAYLEVEL(3, "OK \n");
|
||||
|
||||
assert(COUNTOF(nseqs) > COUNTOF(blocks));
|
||||
for (i = 0; i < COUNTOF(blocks); ++i) {
|
||||
BlockSummary bs;
|
||||
DISPLAYLEVEL(3, "test%3u : ZSTD_get1BlockSummary with %u inputs : ",
|
||||
testNb++, (unsigned)(COUNTOF(nseqs) - i));
|
||||
bs = ZSTD_get1BlockSummary(nseqs + i, COUNTOF(nseqs) - i);
|
||||
CHECK_EQ(bs.nbSequences, blocks[i].nbSequences);
|
||||
CHECK_EQ(bs.litSize, blocks[i].litSize);
|
||||
CHECK_EQ(bs.blockSize, blocks[i].blockSize);
|
||||
DISPLAYLEVEL(3, "OK \n");
|
||||
}
|
||||
|
||||
return testNb;
|
||||
}
|
||||
|
||||
/* ============================================================= */
|
||||
|
||||
static int basicUnitTests(U32 const seed, double compressibility)
|
||||
@@ -4004,6 +4085,8 @@ static int basicUnitTests(U32 const seed, double compressibility)
|
||||
}
|
||||
DISPLAYLEVEL(3, "OK \n");
|
||||
|
||||
testNb = test_get1BlockSummary(testNb);
|
||||
|
||||
DISPLAYLEVEL(3, "test%3i : ZSTD_compressSequencesAndLiterals : ", testNb++);
|
||||
{
|
||||
const size_t srcSize = 497000;
|
||||
|
Reference in New Issue
Block a user