mirror of
https://github.com/facebook/zstd.git
synced 2025-09-16 09:36:32 +02:00
AArch64: Enhance struct access in Huffman decode 2X
In the multi-stream multi-symbol Huffman decoder GCC generates suboptimal code - emitting more loads for HUF_DEltX2 struct member accesses. Forcing it to use 32-bit loads and bit arithmetic to extract the necessary parts (UBFX) improves the overall decode speed. Also avoid integer type conversions in the symbol decodes, which leads to better instruction selection in table lookup accesses. On AArch64 the decoder no longer runs into register-pressure limits, so we can simplify the hot path and improve throughput Decompression uplifts on a Neoverse V2 system, using Zstd-1.5.8 compiled with "-O3 -march=armv8.2-a+sve2": Clang-20 Clang-* GCC-13 GCC-14 GCC-15 1#silesia.tar: +0.820% +1.365% +2.480% +1.348% +0.987% 2#silesia.tar: +0.426% +0.784% +1.218% +0.665% +0.554% 3#silesia.tar: +0.112% +0.389% +0.508% +0.188% +0.261% * Requires Clang-21 support from LLVM commit hash `a53003fe23cb6c871e72d70ff2d3a075a7490da2` (Clang-21 hasn’t been released as of this writing)
This commit is contained in:
@@ -785,19 +785,19 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \
|
#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \
|
||||||
do { \
|
do { \
|
||||||
int const index = (int)(bits[(_stream)] >> 53); \
|
U64 const index = bits[(_stream)] >> 53; \
|
||||||
int const entry = (int)dtable[index]; \
|
U16 const entry = dtable[index]; \
|
||||||
bits[(_stream)] <<= (entry & 0x3F); \
|
bits[(_stream)] <<= entry & 0x3F; \
|
||||||
op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \
|
op[(_stream)][(_symbol)] = (BYTE)(entry >> 8); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define HUF_4X1_RELOAD_STREAM(_stream) \
|
#define HUF_5X1_RELOAD_STREAM(_stream) \
|
||||||
do { \
|
do { \
|
||||||
int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
|
U64 const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
|
||||||
int const nbBits = ctz & 7; \
|
U64 const nbBits = ctz & 7; \
|
||||||
int const nbBytes = ctz >> 3; \
|
U64 const nbBytes = ctz >> 3; \
|
||||||
op[(_stream)] += 5; \
|
op[(_stream)] += 5; \
|
||||||
ip[(_stream)] -= nbBytes; \
|
ip[(_stream)] -= nbBytes; \
|
||||||
bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
|
bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
|
||||||
@@ -816,11 +816,11 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
|
|||||||
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);
|
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);
|
||||||
|
|
||||||
/* Reload each of the 4 the bitstreams */
|
/* Reload each of the 4 the bitstreams */
|
||||||
HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
|
HUF_4X_FOR_EACH_STREAM(HUF_5X1_RELOAD_STREAM);
|
||||||
} while (op[3] < olimit);
|
} while (op[3] < olimit);
|
||||||
|
|
||||||
#undef HUF_4X1_DECODE_SYMBOL
|
#undef HUF_4X1_DECODE_SYMBOL
|
||||||
#undef HUF_4X1_RELOAD_STREAM
|
#undef HUF_5X1_RELOAD_STREAM
|
||||||
}
|
}
|
||||||
|
|
||||||
_out:
|
_out:
|
||||||
@@ -1603,57 +1603,65 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \
|
#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \
|
||||||
do { \
|
do { \
|
||||||
if ((_decode3) || (_stream) != 3) { \
|
if ((_decode3) || (_stream) != 3) { \
|
||||||
int const index = (int)(bits[(_stream)] >> 53); \
|
U64 const index = bits[(_stream)] >> 53; \
|
||||||
HUF_DEltX2 const entry = dtable[index]; \
|
size_t const entry = MEM_readLE32(&dtable[index]); \
|
||||||
MEM_write16(op[(_stream)], entry.sequence); \
|
MEM_write16(op[(_stream)], (U16)entry); \
|
||||||
bits[(_stream)] <<= (entry.nbBits) & 0x3F; \
|
bits[(_stream)] <<= (entry >> 16) & 0x3F; \
|
||||||
op[(_stream)] += (entry.length); \
|
op[(_stream)] += entry >> 24; \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define HUF_4X2_RELOAD_STREAM(_stream) \
|
#define HUF_5X2_RELOAD_STREAM(_stream, _decode3) \
|
||||||
do { \
|
do { \
|
||||||
HUF_4X2_DECODE_SYMBOL(3, 1); \
|
if (_decode3) HUF_4X2_DECODE_SYMBOL(3, 1); \
|
||||||
{ \
|
{ \
|
||||||
int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
|
U64 const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
|
||||||
int const nbBits = ctz & 7; \
|
U64 const nbBits = ctz & 7; \
|
||||||
int const nbBytes = ctz >> 3; \
|
U64 const nbBytes = ctz >> 3; \
|
||||||
ip[(_stream)] -= nbBytes; \
|
ip[(_stream)] -= nbBytes; \
|
||||||
bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
|
bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
|
||||||
bits[(_stream)] <<= nbBits; \
|
bits[(_stream)] <<= nbBits; \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#if defined(__aarch64__)
|
||||||
|
# define HUF_4X2_4WAY 1
|
||||||
|
#else
|
||||||
|
# define HUF_4X2_4WAY 0
|
||||||
|
#endif
|
||||||
|
#define HUF_4X2_3WAY !HUF_4X2_4WAY
|
||||||
|
|
||||||
/* Manually unroll the loop because compilers don't consistently
|
/* Manually unroll the loop because compilers don't consistently
|
||||||
* unroll the inner loops, which destroys performance.
|
* unroll the inner loops, which destroys performance.
|
||||||
*/
|
*/
|
||||||
do {
|
do {
|
||||||
/* Decode 5 symbols from each of the first 3 streams.
|
/* Decode 5 symbols from each of the first 3 or 4 streams.
|
||||||
* The final stream will be decoded during the reload phase
|
* In the 3-way case the final stream will be decoded during
|
||||||
* to reduce register pressure.
|
* the reload phase to reduce register pressure.
|
||||||
*/
|
*/
|
||||||
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
|
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
|
||||||
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
|
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
|
||||||
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
|
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
|
||||||
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
|
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
|
||||||
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
|
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
|
||||||
|
|
||||||
/* Decode one symbol from the final stream */
|
/* In the 3-way case decode one symbol from the final stream. */
|
||||||
HUF_4X2_DECODE_SYMBOL(3, 1);
|
HUF_4X2_DECODE_SYMBOL(3, HUF_4X2_3WAY);
|
||||||
|
|
||||||
/* Decode 4 symbols from the final stream & reload bitstreams.
|
/* In the 3-way case decode 4 symbols from the final stream &
|
||||||
* The final stream is reloaded last, meaning that all 5 symbols
|
* reload bitstreams. The final stream is reloaded last, meaning
|
||||||
* are decoded from the final stream before it is reloaded.
|
* that all 5 symbols are decoded from the final stream before
|
||||||
|
* it is reloaded.
|
||||||
*/
|
*/
|
||||||
HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM);
|
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_5X2_RELOAD_STREAM, HUF_4X2_3WAY);
|
||||||
} while (op[3] < olimit);
|
} while (op[3] < olimit);
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef HUF_4X2_DECODE_SYMBOL
|
#undef HUF_4X2_DECODE_SYMBOL
|
||||||
#undef HUF_4X2_RELOAD_STREAM
|
#undef HUF_5X2_RELOAD_STREAM
|
||||||
|
|
||||||
_out:
|
_out:
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user