1
0
mirror of https://github.com/facebook/zstd.git synced 2025-03-06 16:56:49 +02:00

Merge pull request #4288 from facebook/stronger_patchfrom

Improve compression ratio of the --patch-from mode
This commit is contained in:
Yann Collet 2025-02-10 12:09:16 -08:00 committed by GitHub
commit d84d70bd04
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 69 additions and 53 deletions

View File

@ -16,7 +16,7 @@
#include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */
#include "zstd_ldm_geartab.h"
#define LDM_BUCKET_SIZE_LOG 3
#define LDM_BUCKET_SIZE_LOG 4
#define LDM_MIN_MATCH_LENGTH 64
#define LDM_HASH_RLOG 7
@ -133,21 +133,35 @@ done:
}
void ZSTD_ldm_adjustParameters(ldmParams_t* params,
ZSTD_compressionParameters const* cParams)
const ZSTD_compressionParameters* cParams)
{
params->windowLog = cParams->windowLog;
ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX);
DEBUGLOG(4, "ZSTD_ldm_adjustParameters");
if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG;
if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH;
if (params->hashLog == 0) {
params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG);
assert(params->hashLog <= ZSTD_HASHLOG_MAX);
}
if (params->hashRateLog == 0) {
params->hashRateLog = params->windowLog < params->hashLog
? 0
: params->windowLog - params->hashLog;
if (params->hashLog > 0) {
/* if params->hashLog is set, derive hashRateLog from it */
assert(params->hashLog <= ZSTD_HASHLOG_MAX);
if (params->windowLog > params->hashLog) {
params->hashRateLog = params->windowLog - params->hashLog;
}
} else {
assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9);
/* mapping from [fast, rate7] to [btultra2, rate4] */
params->hashRateLog = 7 - (cParams->strategy/3);
}
}
if (params->hashLog == 0) {
params->hashLog = BOUNDED(ZSTD_HASHLOG_MIN, params->windowLog - params->hashRateLog, ZSTD_HASHLOG_MAX);
}
if (params->minMatchLength == 0) {
params->minMatchLength = LDM_MIN_MATCH_LENGTH;
if (cParams->strategy >= ZSTD_btultra)
params->minMatchLength /= 2;
}
if (params->bucketSizeLog==0) {
assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9);
params->bucketSizeLog = BOUNDED(LDM_BUCKET_SIZE_LOG, (U32)cParams->strategy, ZSTD_LDM_BUCKETSIZELOG_MAX);
}
params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog);
}

View File

@ -1100,11 +1100,12 @@ static void FIO_adjustParamsForPatchFromMode(FIO_prefs_t* const prefs,
FIO_setLdmFlag(prefs, 1);
}
if (cParams.strategy >= ZSTD_btopt) {
DISPLAYLEVEL(3, "[Optimal parser notes] Consider the following to improve patch size at the cost of speed:\n");
DISPLAYLEVEL(3, "- Use --single-thread mode in the zstd cli\n");
DISPLAYLEVEL(3, "- Set a larger targetLength (e.g. --zstd=targetLength=4096)\n");
DISPLAYLEVEL(3, "- Set a larger chainLog (e.g. --zstd=chainLog=%u)\n", ZSTD_CHAINLOG_MAX);
DISPLAYLEVEL(3, "Also consider playing around with searchLog and hashLog\n");
DISPLAYLEVEL(4, "[Optimal parser notes] Consider the following to improve patch size at the cost of speed:\n");
DISPLAYLEVEL(4, "- Set a larger targetLength (e.g. --zstd=targetLength=4096)\n");
DISPLAYLEVEL(4, "- Set a larger chainLog (e.g. --zstd=chainLog=%u)\n", ZSTD_CHAINLOG_MAX);
DISPLAYLEVEL(4, "- Set a larger LDM hashLog (e.g. --zstd=ldmHashLog=%u)\n", ZSTD_LDM_HASHLOG_MAX);
DISPLAYLEVEL(4, "- Set a smaller LDM rateLog (e.g. --zstd=ldmHashRateLog=%u)\n", ZSTD_LDM_HASHRATELOG_MIN);
DISPLAYLEVEL(4, "Also consider playing around with searchLog and hashLog\n");
}
}

View File

@ -455,6 +455,17 @@ The list of available _options_:
Value 0 is special and means "default": _ovlog_ is automatically determined by `zstd`.
In which case, _ovlog_ will range from 6 to 9, depending on selected _strat_.
- `ldmHashRateLog`=_lhrlog_, `lhrlog`=_lhrlog_:
Specify the frequency of inserting entries into the long distance matching
hash table.
This option is ignored unless long distance matching is enabled.
Larger values will improve compression speed. Deviating far from the
default value will likely result in a decrease in compression ratio.
The default value varies between 4 and 7, depending on `strategy`.
- `ldmHashLog`=_lhlog_, `lhlog`=_lhlog_:
Specify the maximum size for a hash table used for long distance matching.
@ -463,7 +474,7 @@ The list of available _options_:
Bigger hash tables usually improve compression ratio at the expense of more
memory during compression and a decrease in compression speed.
The minimum _lhlog_ is 6 and the maximum is 30 (default: 20).
The minimum _lhlog_ is 6 and the maximum is 30 (default: `windowLog - ldmHashRateLog`).
- `ldmMinMatch`=_lmml_, `lmml`=_lmml_:
Specify the minimum searched length of a match for long distance matching.
@ -472,7 +483,7 @@ The list of available _options_:
Larger/very small values usually decrease compression ratio.
The minimum _lmml_ is 4 and the maximum is 4096 (default: 64).
The minimum _lmml_ is 4 and the maximum is 4096 (default: 32 to 64, depending on `strategy`).
- `ldmBucketSizeLog`=_lblog_, `lblog`=_lblog_:
Specify the size of each bucket for the hash table used for long distance
@ -483,18 +494,8 @@ The list of available _options_:
Larger bucket sizes improve collision resolution but decrease compression
speed.
The minimum _lblog_ is 1 and the maximum is 8 (default: 3).
The minimum _lblog_ is 1 and the maximum is 8 (default: 4 to 8, depending on `strategy`).
- `ldmHashRateLog`=_lhrlog_, `lhrlog`=_lhrlog_:
Specify the frequency of inserting entries into the long distance matching
hash table.
This option is ignored unless long distance matching is enabled.
Larger values will improve compression speed. Deviating far from the
default value will likely result in a decrease in compression ratio.
The default value is `wlog - lhlog`.
### Example
The following parameters set advanced compression options to something

View File

@ -111,9 +111,9 @@ silesia, level 9, zstdcli,
silesia, level 13, zstdcli, 4488438
silesia, level 16, zstdcli, 4358150
silesia, level 19, zstdcli, 4265929
silesia, long distance mode, zstdcli, 4824875
silesia, long distance mode, zstdcli, 4824341
silesia, multithreaded, zstdcli, 4833113
silesia, multithreaded long distance mode, zstdcli, 4824875
silesia, multithreaded long distance mode, zstdcli, 4824341
silesia, small window log, zstdcli, 7094528
silesia, small hash log, zstdcli, 6527214
silesia, small chain log, zstdcli, 4911647
@ -137,9 +137,9 @@ silesia.tar, level 13, zstdcli,
silesia.tar, level 16, zstdcli, 4357018
silesia.tar, level 19, zstdcli, 4259593
silesia.tar, no source size, zstdcli, 4836000
silesia.tar, long distance mode, zstdcli, 4828171
silesia.tar, long distance mode, zstdcli, 4827830
silesia.tar, multithreaded, zstdcli, 4836004
silesia.tar, multithreaded long distance mode, zstdcli, 4828171
silesia.tar, multithreaded long distance mode, zstdcli, 4827830
silesia.tar, small window log, zstdcli, 7100110
silesia.tar, small hash log, zstdcli, 6530127
silesia.tar, small chain log, zstdcli, 4915865
@ -217,9 +217,9 @@ github.tar, level 19, zstdcli,
github.tar, level 19 with dict, zstdcli, 32705
github.tar, no source size, zstdcli, 38885
github.tar, no source size with dict, zstdcli, 38115
github.tar, long distance mode, zstdcli, 40227
github.tar, long distance mode, zstdcli, 40143
github.tar, multithreaded, zstdcli, 38888
github.tar, multithreaded long distance mode, zstdcli, 40227
github.tar, multithreaded long distance mode, zstdcli, 40143
github.tar, small window log, zstdcli, 198539
github.tar, small hash log, zstdcli, 129874
github.tar, small chain log, zstdcli, 41673
@ -251,9 +251,9 @@ silesia, level 13, advanced
silesia, level 16, advanced one pass, 4356799
silesia, level 19, advanced one pass, 4265851
silesia, no source size, advanced one pass, 4832054
silesia, long distance mode, advanced one pass, 4823833
silesia, long distance mode, advanced one pass, 4823264
silesia, multithreaded, advanced one pass, 4833065
silesia, multithreaded long distance mode, advanced one pass, 4824827
silesia, multithreaded long distance mode, advanced one pass, 4824293
silesia, small window log, advanced one pass, 7094480
silesia, small hash log, advanced one pass, 6525510
silesia, small chain log, advanced one pass, 4912248
@ -285,9 +285,9 @@ silesia.tar, level 13, advanced
silesia.tar, level 16, advanced one pass, 4355572
silesia.tar, level 19, advanced one pass, 4257629
silesia.tar, no source size, advanced one pass, 4829268
silesia.tar, long distance mode, advanced one pass, 4816169
silesia.tar, long distance mode, advanced one pass, 4815868
silesia.tar, multithreaded, advanced one pass, 4836000
silesia.tar, multithreaded long distance mode, advanced one pass, 4828167
silesia.tar, multithreaded long distance mode, advanced one pass, 4827826
silesia.tar, small window log, advanced one pass, 7100064
silesia.tar, small hash log, advanced one pass, 6530222
silesia.tar, small chain log, advanced one pass, 4915689
@ -535,9 +535,9 @@ github.tar, level 19 with dict copy, advanced
github.tar, level 19 with dict load, advanced one pass, 32428
github.tar, no source size, advanced one pass, 38884
github.tar, no source size with dict, advanced one pass, 37995
github.tar, long distance mode, advanced one pass, 40242
github.tar, long distance mode, advanced one pass, 40156
github.tar, multithreaded, advanced one pass, 38884
github.tar, multithreaded long distance mode, advanced one pass, 40223
github.tar, multithreaded long distance mode, advanced one pass, 40139
github.tar, small window log, advanced one pass, 198535
github.tar, small hash log, advanced one pass, 129870
github.tar, small chain log, advanced one pass, 41669
@ -569,9 +569,9 @@ silesia, level 13, advanced
silesia, level 16, advanced one pass small out, 4356799
silesia, level 19, advanced one pass small out, 4265851
silesia, no source size, advanced one pass small out, 4832054
silesia, long distance mode, advanced one pass small out, 4823833
silesia, long distance mode, advanced one pass small out, 4823264
silesia, multithreaded, advanced one pass small out, 4833065
silesia, multithreaded long distance mode, advanced one pass small out, 4824827
silesia, multithreaded long distance mode, advanced one pass small out, 4824293
silesia, small window log, advanced one pass small out, 7094480
silesia, small hash log, advanced one pass small out, 6525510
silesia, small chain log, advanced one pass small out, 4912248
@ -603,9 +603,9 @@ silesia.tar, level 13, advanced
silesia.tar, level 16, advanced one pass small out, 4355572
silesia.tar, level 19, advanced one pass small out, 4257629
silesia.tar, no source size, advanced one pass small out, 4829268
silesia.tar, long distance mode, advanced one pass small out, 4816169
silesia.tar, long distance mode, advanced one pass small out, 4815868
silesia.tar, multithreaded, advanced one pass small out, 4836000
silesia.tar, multithreaded long distance mode, advanced one pass small out, 4828167
silesia.tar, multithreaded long distance mode, advanced one pass small out, 4827826
silesia.tar, small window log, advanced one pass small out, 7100064
silesia.tar, small hash log, advanced one pass small out, 6530222
silesia.tar, small chain log, advanced one pass small out, 4915689
@ -853,9 +853,9 @@ github.tar, level 19 with dict copy, advanced
github.tar, level 19 with dict load, advanced one pass small out, 32428
github.tar, no source size, advanced one pass small out, 38884
github.tar, no source size with dict, advanced one pass small out, 37995
github.tar, long distance mode, advanced one pass small out, 40242
github.tar, long distance mode, advanced one pass small out, 40156
github.tar, multithreaded, advanced one pass small out, 38884
github.tar, multithreaded long distance mode, advanced one pass small out, 40223
github.tar, multithreaded long distance mode, advanced one pass small out, 40139
github.tar, small window log, advanced one pass small out, 198535
github.tar, small hash log, advanced one pass small out, 129870
github.tar, small chain log, advanced one pass small out, 41669
@ -887,9 +887,9 @@ silesia, level 13, advanced
silesia, level 16, advanced streaming, 4358094
silesia, level 19, advanced streaming, 4265908
silesia, no source size, advanced streaming, 4835768
silesia, long distance mode, advanced streaming, 4827592
silesia, long distance mode, advanced streaming, 4827032
silesia, multithreaded, advanced streaming, 4833065
silesia, multithreaded long distance mode, advanced streaming, 4824827
silesia, multithreaded long distance mode, advanced streaming, 4824293
silesia, small window log, advanced streaming, 7110591
silesia, small hash log, advanced streaming, 6525259
silesia, small chain log, advanced streaming, 4911577
@ -921,9 +921,9 @@ silesia.tar, level 13, advanced
silesia.tar, level 16, advanced streaming, 4358029
silesia.tar, level 19, advanced streaming, 4258228
silesia.tar, no source size, advanced streaming, 4846779
silesia.tar, long distance mode, advanced streaming, 4826177
silesia.tar, long distance mode, advanced streaming, 4825842
silesia.tar, multithreaded, advanced streaming, 4836000
silesia.tar, multithreaded long distance mode, advanced streaming, 4828167
silesia.tar, multithreaded long distance mode, advanced streaming, 4827826
silesia.tar, small window log, advanced streaming, 7117024
silesia.tar, small hash log, advanced streaming, 6529503
silesia.tar, small chain log, advanced streaming, 4915956
@ -1171,9 +1171,9 @@ github.tar, level 19 with dict copy, advanced
github.tar, level 19 with dict load, advanced streaming, 32428
github.tar, no source size, advanced streaming, 38881
github.tar, no source size with dict, advanced streaming, 38111
github.tar, long distance mode, advanced streaming, 40242
github.tar, long distance mode, advanced streaming, 40156
github.tar, multithreaded, advanced streaming, 38884
github.tar, multithreaded long distance mode, advanced streaming, 40223
github.tar, multithreaded long distance mode, advanced streaming, 40139
github.tar, small window log, advanced streaming, 199553
github.tar, small hash log, advanced streaming, 129870
github.tar, small chain log, advanced streaming, 41669

1 Data Config Method Total compressed size
111 silesia level 13 zstdcli 4488438
112 silesia level 16 zstdcli 4358150
113 silesia level 19 zstdcli 4265929
114 silesia long distance mode zstdcli 4824875 4824341
115 silesia multithreaded zstdcli 4833113
116 silesia multithreaded long distance mode zstdcli 4824875 4824341
117 silesia small window log zstdcli 7094528
118 silesia small hash log zstdcli 6527214
119 silesia small chain log zstdcli 4911647
137 silesia.tar level 16 zstdcli 4357018
138 silesia.tar level 19 zstdcli 4259593
139 silesia.tar no source size zstdcli 4836000
140 silesia.tar long distance mode zstdcli 4828171 4827830
141 silesia.tar multithreaded zstdcli 4836004
142 silesia.tar multithreaded long distance mode zstdcli 4828171 4827830
143 silesia.tar small window log zstdcli 7100110
144 silesia.tar small hash log zstdcli 6530127
145 silesia.tar small chain log zstdcli 4915865
217 github.tar level 19 with dict zstdcli 32705
218 github.tar no source size zstdcli 38885
219 github.tar no source size with dict zstdcli 38115
220 github.tar long distance mode zstdcli 40227 40143
221 github.tar multithreaded zstdcli 38888
222 github.tar multithreaded long distance mode zstdcli 40227 40143
223 github.tar small window log zstdcli 198539
224 github.tar small hash log zstdcli 129874
225 github.tar small chain log zstdcli 41673
251 silesia level 16 advanced one pass 4356799
252 silesia level 19 advanced one pass 4265851
253 silesia no source size advanced one pass 4832054
254 silesia long distance mode advanced one pass 4823833 4823264
255 silesia multithreaded advanced one pass 4833065
256 silesia multithreaded long distance mode advanced one pass 4824827 4824293
257 silesia small window log advanced one pass 7094480
258 silesia small hash log advanced one pass 6525510
259 silesia small chain log advanced one pass 4912248
285 silesia.tar level 16 advanced one pass 4355572
286 silesia.tar level 19 advanced one pass 4257629
287 silesia.tar no source size advanced one pass 4829268
288 silesia.tar long distance mode advanced one pass 4816169 4815868
289 silesia.tar multithreaded advanced one pass 4836000
290 silesia.tar multithreaded long distance mode advanced one pass 4828167 4827826
291 silesia.tar small window log advanced one pass 7100064
292 silesia.tar small hash log advanced one pass 6530222
293 silesia.tar small chain log advanced one pass 4915689
535 github.tar level 19 with dict load advanced one pass 32428
536 github.tar no source size advanced one pass 38884
537 github.tar no source size with dict advanced one pass 37995
538 github.tar long distance mode advanced one pass 40242 40156
539 github.tar multithreaded advanced one pass 38884
540 github.tar multithreaded long distance mode advanced one pass 40223 40139
541 github.tar small window log advanced one pass 198535
542 github.tar small hash log advanced one pass 129870
543 github.tar small chain log advanced one pass 41669
569 silesia level 16 advanced one pass small out 4356799
570 silesia level 19 advanced one pass small out 4265851
571 silesia no source size advanced one pass small out 4832054
572 silesia long distance mode advanced one pass small out 4823833 4823264
573 silesia multithreaded advanced one pass small out 4833065
574 silesia multithreaded long distance mode advanced one pass small out 4824827 4824293
575 silesia small window log advanced one pass small out 7094480
576 silesia small hash log advanced one pass small out 6525510
577 silesia small chain log advanced one pass small out 4912248
603 silesia.tar level 16 advanced one pass small out 4355572
604 silesia.tar level 19 advanced one pass small out 4257629
605 silesia.tar no source size advanced one pass small out 4829268
606 silesia.tar long distance mode advanced one pass small out 4816169 4815868
607 silesia.tar multithreaded advanced one pass small out 4836000
608 silesia.tar multithreaded long distance mode advanced one pass small out 4828167 4827826
609 silesia.tar small window log advanced one pass small out 7100064
610 silesia.tar small hash log advanced one pass small out 6530222
611 silesia.tar small chain log advanced one pass small out 4915689
853 github.tar level 19 with dict load advanced one pass small out 32428
854 github.tar no source size advanced one pass small out 38884
855 github.tar no source size with dict advanced one pass small out 37995
856 github.tar long distance mode advanced one pass small out 40242 40156
857 github.tar multithreaded advanced one pass small out 38884
858 github.tar multithreaded long distance mode advanced one pass small out 40223 40139
859 github.tar small window log advanced one pass small out 198535
860 github.tar small hash log advanced one pass small out 129870
861 github.tar small chain log advanced one pass small out 41669
887 silesia level 16 advanced streaming 4358094
888 silesia level 19 advanced streaming 4265908
889 silesia no source size advanced streaming 4835768
890 silesia long distance mode advanced streaming 4827592 4827032
891 silesia multithreaded advanced streaming 4833065
892 silesia multithreaded long distance mode advanced streaming 4824827 4824293
893 silesia small window log advanced streaming 7110591
894 silesia small hash log advanced streaming 6525259
895 silesia small chain log advanced streaming 4911577
921 silesia.tar level 16 advanced streaming 4358029
922 silesia.tar level 19 advanced streaming 4258228
923 silesia.tar no source size advanced streaming 4846779
924 silesia.tar long distance mode advanced streaming 4826177 4825842
925 silesia.tar multithreaded advanced streaming 4836000
926 silesia.tar multithreaded long distance mode advanced streaming 4828167 4827826
927 silesia.tar small window log advanced streaming 7117024
928 silesia.tar small hash log advanced streaming 6529503
929 silesia.tar small chain log advanced streaming 4915956
1171 github.tar level 19 with dict load advanced streaming 32428
1172 github.tar no source size advanced streaming 38881
1173 github.tar no source size with dict advanced streaming 38111
1174 github.tar long distance mode advanced streaming 40242 40156
1175 github.tar multithreaded advanced streaming 38884
1176 github.tar multithreaded long distance mode advanced streaming 40223 40139
1177 github.tar small window log advanced streaming 199553
1178 github.tar small hash log advanced streaming 129870
1179 github.tar small chain log advanced streaming 41669