mirror of
				https://github.com/facebook/zstd.git
				synced 2025-10-31 16:47:48 +02:00 
			
		
		
		
	Merge pull request #982 from facebook/fix304
Fix for #304 and #977 : error during dictionary creation
This commit is contained in:
		| @@ -206,10 +206,10 @@ The following API allows targeting specific sub-functions for advanced tasks. | ||||
| For example, it's possible to compress several blocks using the same 'CTable', | ||||
| or to save and regenerate 'CTable' using external methods. | ||||
| */ | ||||
| /* FSE_count() : find it within "fse.h" */ | ||||
| /* FSE_count() : exposed within "fse.h" */ | ||||
| unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); | ||||
| typedef struct HUF_CElt_s HUF_CElt;   /* incomplete type */ | ||||
| size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); | ||||
| size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);   /* @return : maxNbBits; CTable and count can overlap, in which case, CTable will overwrite count content */ | ||||
| size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); | ||||
| size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); | ||||
|  | ||||
|   | ||||
| @@ -405,6 +405,7 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValu | ||||
| } | ||||
|  | ||||
| /** HUF_buildCTable() : | ||||
|  * @return : maxNbBits | ||||
|  *  Note : count is used before tree is written, so they can safely overlap | ||||
|  */ | ||||
| size_t HUF_buildCTable (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits) | ||||
|   | ||||
| @@ -2869,7 +2869,7 @@ size_t ZSTD_compress_generic (ZSTD_CCtx* cctx, | ||||
|         if (params.nbThreads > 1) { | ||||
|             if (cctx->mtctx == NULL || (params.nbThreads != ZSTDMT_getNbThreads(cctx->mtctx))) { | ||||
|                 DEBUGLOG(4, "ZSTD_compress_generic: creating new mtctx for nbThreads=%u (previous: %u)", | ||||
|                             params.nbThreads, ZSTDMT_getNbThreads(cctx->mtctx)); | ||||
|                             params.nbThreads, (U32)ZSTDMT_getNbThreads(cctx->mtctx)); | ||||
|                 ZSTDMT_freeCCtx(cctx->mtctx); | ||||
|                 cctx->mtctx = ZSTDMT_createCCtx_advanced(params.nbThreads, cctx->customMem); | ||||
|                 if (cctx->mtctx == NULL) return ERROR(memory_allocation); | ||||
|   | ||||
| @@ -537,8 +537,8 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer, | ||||
|   /* Checks */ | ||||
|   if (totalSamplesSize < MAX(d, sizeof(U64)) || | ||||
|       totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) { | ||||
|     DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n", | ||||
|                  (COVER_MAX_SAMPLES_SIZE >> 20)); | ||||
|     DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n", | ||||
|                  (U32)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20)); | ||||
|     return 0; | ||||
|   } | ||||
|   /* Zero the context */ | ||||
| @@ -651,12 +651,16 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs, | ||||
| } | ||||
|  | ||||
| ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( | ||||
|     void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, | ||||
|     const size_t *samplesSizes, unsigned nbSamples, | ||||
|     ZDICT_cover_params_t parameters) { | ||||
|   BYTE *const dict = (BYTE *)dictBuffer; | ||||
|     void *dictBuffer, size_t dictBufferCapacity, | ||||
|     const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, | ||||
|     ZDICT_cover_params_t parameters) | ||||
| { | ||||
|   BYTE* const dict = (BYTE*)dictBuffer; | ||||
|   COVER_ctx_t ctx; | ||||
|   COVER_map_t activeDmers; | ||||
|  | ||||
|   /* Initialize global data */ | ||||
|   g_displayLevel = parameters.zParams.notificationLevel; | ||||
|   /* Checks */ | ||||
|   if (!COVER_checkParameters(parameters, dictBufferCapacity)) { | ||||
|     DISPLAYLEVEL(1, "Cover parameters incorrect\n"); | ||||
| @@ -671,8 +675,6 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( | ||||
|                  ZDICT_DICTSIZE_MIN); | ||||
|     return ERROR(dstSize_tooSmall); | ||||
|   } | ||||
|   /* Initialize global data */ | ||||
|   g_displayLevel = parameters.zParams.notificationLevel; | ||||
|   /* Initialize context and activeDmers */ | ||||
|   if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, | ||||
|                       parameters.d)) { | ||||
| @@ -947,6 +949,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( | ||||
|   unsigned k; | ||||
|   COVER_best_t best; | ||||
|   POOL_ctx *pool = NULL; | ||||
|  | ||||
|   /* Checks */ | ||||
|   if (kMinK < kMaxD || kMaxK < kMinK) { | ||||
|     LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n"); | ||||
|   | ||||
| @@ -207,7 +207,6 @@ static dictItem ZDICT_analyzePos( | ||||
|     U32 cumulLength[LLIMIT] = {0}; | ||||
|     U32 savings[LLIMIT] = {0}; | ||||
|     const BYTE* b = (const BYTE*)buffer; | ||||
|     size_t length; | ||||
|     size_t maxLength = LLIMIT; | ||||
|     size_t pos = suffix[start]; | ||||
|     U32 end = start; | ||||
| @@ -222,26 +221,30 @@ static dictItem ZDICT_analyzePos( | ||||
|        ||(MEM_read16(b+pos+1) == MEM_read16(b+pos+3)) | ||||
|        ||(MEM_read16(b+pos+2) == MEM_read16(b+pos+4)) ) { | ||||
|         /* skip and mark segment */ | ||||
|         U16 u16 = MEM_read16(b+pos+4); | ||||
|         U32 u, e = 6; | ||||
|         while (MEM_read16(b+pos+e) == u16) e+=2 ; | ||||
|         if (b[pos+e] == b[pos+e-1]) e++; | ||||
|         for (u=1; u<e; u++) | ||||
|         U16 const pattern16 = MEM_read16(b+pos+4); | ||||
|         U32 u, patternEnd = 6; | ||||
|         while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=2 ; | ||||
|         if (b[pos+patternEnd] == b[pos+patternEnd-1]) patternEnd++; | ||||
|         for (u=1; u<patternEnd; u++) | ||||
|             doneMarks[pos+u] = 1; | ||||
|         return solution; | ||||
|     } | ||||
|  | ||||
|     /* look forward */ | ||||
|     do { | ||||
|         end++; | ||||
|         length = ZDICT_count(b + pos, b + suffix[end]); | ||||
|     } while (length >=MINMATCHLENGTH); | ||||
|     {   size_t length; | ||||
|         do { | ||||
|             end++; | ||||
|             length = ZDICT_count(b + pos, b + suffix[end]); | ||||
|         } while (length >= MINMATCHLENGTH); | ||||
|     } | ||||
|  | ||||
|     /* look backward */ | ||||
|     do { | ||||
|         length = ZDICT_count(b + pos, b + *(suffix+start-1)); | ||||
|         if (length >=MINMATCHLENGTH) start--; | ||||
|     } while(length >= MINMATCHLENGTH); | ||||
|     {   size_t length; | ||||
|         do { | ||||
|             length = ZDICT_count(b + pos, b + *(suffix+start-1)); | ||||
|             if (length >=MINMATCHLENGTH) start--; | ||||
|         } while(length >= MINMATCHLENGTH); | ||||
|     } | ||||
|  | ||||
|     /* exit if not found a minimum nb of repetitions */ | ||||
|     if (end-start < minRatio) { | ||||
| @@ -268,7 +271,7 @@ static dictItem ZDICT_analyzePos( | ||||
|             U32 selectedCount = 0; | ||||
|             U32 selectedID = currentID; | ||||
|             for (id =refinedStart; id < refinedEnd; id++) { | ||||
|                 if (b[ suffix[id] + searchLength] != currentChar) { | ||||
|                 if (b[suffix[id] + searchLength] != currentChar) { | ||||
|                     if (currentCount > selectedCount) { | ||||
|                         selectedCount = currentCount; | ||||
|                         selectedID = currentID; | ||||
| @@ -297,20 +300,23 @@ static dictItem ZDICT_analyzePos( | ||||
|         memset(lengthList, 0, sizeof(lengthList)); | ||||
|  | ||||
|         /* look forward */ | ||||
|         do { | ||||
|             end++; | ||||
|             length = ZDICT_count(b + pos, b + suffix[end]); | ||||
|             if (length >= LLIMIT) length = LLIMIT-1; | ||||
|             lengthList[length]++; | ||||
|         } while (length >=MINMATCHLENGTH); | ||||
|         {   size_t length; | ||||
|             do { | ||||
|                 end++; | ||||
|                 length = ZDICT_count(b + pos, b + suffix[end]); | ||||
|                 if (length >= LLIMIT) length = LLIMIT-1; | ||||
|                 lengthList[length]++; | ||||
|             } while (length >=MINMATCHLENGTH); | ||||
|         } | ||||
|  | ||||
|         /* look backward */ | ||||
|         length = MINMATCHLENGTH; | ||||
|         while ((length >= MINMATCHLENGTH) & (start > 0)) { | ||||
|             length = ZDICT_count(b + pos, b + suffix[start - 1]); | ||||
|             if (length >= LLIMIT) length = LLIMIT - 1; | ||||
|             lengthList[length]++; | ||||
|             if (length >= MINMATCHLENGTH) start--; | ||||
|         {   size_t length = MINMATCHLENGTH; | ||||
|             while ((length >= MINMATCHLENGTH) & (start > 0)) { | ||||
|                 length = ZDICT_count(b + pos, b + suffix[start - 1]); | ||||
|                 if (length >= LLIMIT) length = LLIMIT - 1; | ||||
|                 lengthList[length]++; | ||||
|                 if (length >= MINMATCHLENGTH) start--; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         /* largest useful length */ | ||||
| @@ -345,12 +351,12 @@ static dictItem ZDICT_analyzePos( | ||||
|         /* mark positions done */ | ||||
|         {   U32 id; | ||||
|             for (id=start; id<end; id++) { | ||||
|                 U32 p, pEnd; | ||||
|                 U32 p, pEnd, length; | ||||
|                 U32 const testedPos = suffix[id]; | ||||
|                 if (testedPos == pos) | ||||
|                     length = solution.length; | ||||
|                 else { | ||||
|                     length = ZDICT_count(b+pos, b+testedPos); | ||||
|                     length = (U32)ZDICT_count(b+pos, b+testedPos); | ||||
|                     if (length > solution.length) length = solution.length; | ||||
|                 } | ||||
|                 pEnd = (U32)(testedPos + length); | ||||
| @@ -575,29 +581,30 @@ static void ZDICT_fillNoise(void* buffer, size_t length) | ||||
|  | ||||
| typedef struct | ||||
| { | ||||
|     ZSTD_CCtx* ref; | ||||
|     ZSTD_CCtx* zc; | ||||
|     ZSTD_CCtx* ref;    /* contains reference to dictionary */ | ||||
|     ZSTD_CCtx* zc;     /* working context */ | ||||
|     void* workPlace;   /* must be ZSTD_BLOCKSIZE_MAX allocated */ | ||||
| } EStats_ress_t; | ||||
|  | ||||
| #define MAXREPOFFSET 1024 | ||||
|  | ||||
| static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params, | ||||
|                             U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets, | ||||
|                             const void* src, size_t srcSize, U32 notificationLevel) | ||||
|                               U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets, | ||||
|                               const void* src, size_t srcSize, | ||||
|                               U32 notificationLevel) | ||||
| { | ||||
|     size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, 1 << params.cParams.windowLog); | ||||
|     size_t cSize; | ||||
|  | ||||
|     if (srcSize > blockSizeMax) srcSize = blockSizeMax;   /* protection vs large samples */ | ||||
|     {  size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0); | ||||
|             if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } | ||||
|     {   size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, 0); | ||||
|         if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } | ||||
|     } | ||||
|     cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize); | ||||
|     if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; } | ||||
|  | ||||
|     if (cSize) {  /* if == 0; block is not compressible */ | ||||
|         const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc); | ||||
|         const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc); | ||||
|  | ||||
|         /* literals stats */ | ||||
|         {   const BYTE* bytePtr; | ||||
| @@ -659,6 +666,18 @@ static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+1], U32 val, | ||||
|     } | ||||
| } | ||||
|  | ||||
| /* ZDICT_flatLit() : | ||||
|  * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals. | ||||
|  * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode. | ||||
|  * `countLit` must provide room for 256 entries (indexes 0-255) : all of them are overwritten. | ||||
|  * Resulting counts : 4 for symbol 0, 1 for symbols 253 and 254, 2 for every other symbol (total = 512). | ||||
|  */ | ||||
| static void ZDICT_flatLit(U32* countLit) | ||||
| { | ||||
|     int u; | ||||
|     for (u=1; u<256; u++) countLit[u] = 2; | ||||
|     countLit[0]   = 4; | ||||
|     countLit[253] = 1; | ||||
|     countLit[254] = 1; | ||||
| } | ||||
|  | ||||
| #define OFFCODE_MAX 30  /* only applicable to first block */ | ||||
| static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize, | ||||
| @@ -688,6 +707,7 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize, | ||||
|     BYTE* dstPtr = (BYTE*)dstBuffer; | ||||
|  | ||||
|     /* init */ | ||||
|     DEBUGLOG(4, "ZDICT_analyzeEntropy"); | ||||
|     esr.ref = ZSTD_createCCtx(); | ||||
|     esr.zc = ZSTD_createCCtx(); | ||||
|     esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX); | ||||
| @@ -713,7 +733,7 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize, | ||||
|             goto _cleanup; | ||||
|     }   } | ||||
|  | ||||
|     /* collect stats on all files */ | ||||
|     /* collect stats on all samples */ | ||||
|     for (u=0; u<nbFiles; u++) { | ||||
|         ZDICT_countEStats(esr, params, | ||||
|                           countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset, | ||||
| @@ -722,14 +742,21 @@ static size_t ZDICT_analyzeEntropy(void*  dstBuffer, size_t maxDstSize, | ||||
|         pos += fileSizes[u]; | ||||
|     } | ||||
|  | ||||
|     /* analyze */ | ||||
|     errorCode = HUF_buildCTable (hufTable, countLit, 255, huffLog); | ||||
|     if (HUF_isError(errorCode)) { | ||||
|         eSize = ERROR(GENERIC); | ||||
|         DISPLAYLEVEL(1, "HUF_buildCTable error \n"); | ||||
|         goto _cleanup; | ||||
|     /* analyze, build stats, starting with literals */ | ||||
|     {   size_t maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog); | ||||
|         if (HUF_isError(maxNbBits)) { | ||||
|             eSize = ERROR(GENERIC); | ||||
|             DISPLAYLEVEL(1, " HUF_buildCTable error \n"); | ||||
|             goto _cleanup; | ||||
|         } | ||||
|         if (maxNbBits==8) {  /* not compressible : will fail on HUF_writeCTable() */ | ||||
|             DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n"); | ||||
|             ZDICT_flatLit(countLit);  /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */ | ||||
|             maxNbBits = HUF_buildCTable (hufTable, countLit, 255, huffLog); | ||||
|             assert(maxNbBits==9); | ||||
|         } | ||||
|         huffLog = (U32)maxNbBits; | ||||
|     } | ||||
|     huffLog = (U32)errorCode; | ||||
|  | ||||
|     /* looking for most common first offsets */ | ||||
|     {   U32 offset; | ||||
| @@ -850,6 +877,7 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, | ||||
|     U32 const notificationLevel = params.notificationLevel; | ||||
|  | ||||
|     /* check conditions */ | ||||
|     DEBUGLOG(4, "ZDICT_finalizeDictionary"); | ||||
|     if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall); | ||||
|     if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong); | ||||
|     if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall); | ||||
| @@ -1025,8 +1053,9 @@ size_t ZDICT_trainFromBuffer_unsafe_legacy( | ||||
| } | ||||
|  | ||||
|  | ||||
| /* issue : samplesBuffer need to be followed by a noisy guard band. | ||||
| *  work around : duplicate the buffer, and add the noise */ | ||||
| /* ZDICT_trainFromBuffer_legacy() : | ||||
|  * issue : samplesBuffer needs to be followed by a noisy guard band. | ||||
|  * work around : duplicate the buffer, and add the noise */ | ||||
| size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity, | ||||
|                               const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, | ||||
|                               ZDICT_legacy_params_t params) | ||||
| @@ -1054,18 +1083,22 @@ size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, | ||||
|                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) | ||||
| { | ||||
|     ZDICT_cover_params_t params; | ||||
|     DEBUGLOG(3, "ZDICT_trainFromBuffer"); | ||||
|     memset(&params, 0, sizeof(params)); | ||||
|     params.d = 8; | ||||
|     params.steps = 4; | ||||
|     /* Default to level 6 since no compression level information is avaialble */ | ||||
|     /* Default to level 6 since no compression level information is available */ | ||||
|     params.zParams.compressionLevel = 6; | ||||
| #if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1) | ||||
|     params.zParams.notificationLevel = ZSTD_DEBUG; | ||||
| #endif | ||||
|     return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity, | ||||
|                                                samplesBuffer, samplesSizes, | ||||
|                                                nbSamples, &params); | ||||
|                                                samplesBuffer, samplesSizes, nbSamples, | ||||
|                                                &params); | ||||
| } | ||||
|  | ||||
| size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, | ||||
|                                         const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) | ||||
|                                   const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) | ||||
| { | ||||
|     ZDICT_params_t params; | ||||
|     memset(&params, 0, sizeof(params)); | ||||
|   | ||||
| @@ -38,21 +38,21 @@ extern "C" { | ||||
|  | ||||
|  | ||||
| /*! ZDICT_trainFromBuffer(): | ||||
|  * Train a dictionary from an array of samples. | ||||
|  * Uses ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4. | ||||
|  * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, | ||||
|  * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. | ||||
|  * The resulting dictionary will be saved into `dictBuffer`. | ||||
|  *  Train a dictionary from an array of samples. | ||||
|  *  Redirect towards ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4. | ||||
|  *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`, | ||||
|  *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. | ||||
|  *  The resulting dictionary will be saved into `dictBuffer`. | ||||
|  * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) | ||||
|  *           or an error code, which can be tested with ZDICT_isError(). | ||||
|  * Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte. | ||||
|  * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. | ||||
|  *        It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`. | ||||
|  *        In general, it's recommended to provide a few thousands samples, but this can vary a lot. | ||||
|  *          or an error code, which can be tested with ZDICT_isError(). | ||||
|  *  Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte. | ||||
|  *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB. | ||||
|  *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. | ||||
|  *        In general, it's recommended to provide a few thousands samples, though this can vary a lot. | ||||
|  *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary. | ||||
|  */ | ||||
| ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, | ||||
|                        const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); | ||||
|                                     const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); | ||||
|  | ||||
|  | ||||
| /*======   Helper functions   ======*/ | ||||
| @@ -72,14 +72,14 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode); | ||||
|  * ==================================================================================== */ | ||||
|  | ||||
| typedef struct { | ||||
|     int      compressionLevel;   /* 0 means default; target a specific zstd compression level */ | ||||
|     unsigned notificationLevel;  /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ | ||||
|     unsigned dictID;             /* 0 means auto mode (32-bits random value); other : force dictID value */ | ||||
|     int      compressionLevel;   /* optimize for a specific zstd compression level; 0 means default */ | ||||
|     unsigned notificationLevel;  /* Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ | ||||
|     unsigned dictID;             /* force dictID value; 0 means auto mode (32-bits random value) */ | ||||
| } ZDICT_params_t; | ||||
|  | ||||
| /*! ZDICT_cover_params_t: | ||||
|  *  For all values 0 means default. | ||||
|  *  k and d are the only required parameters. | ||||
|  *  For others, value 0 means default. | ||||
|  */ | ||||
| typedef struct { | ||||
|     unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ | ||||
| @@ -91,28 +91,28 @@ typedef struct { | ||||
|  | ||||
|  | ||||
| /*! ZDICT_trainFromBuffer_cover(): | ||||
|  * Train a dictionary from an array of samples using the COVER algorithm. | ||||
|  * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, | ||||
|  * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. | ||||
|  * The resulting dictionary will be saved into `dictBuffer`. | ||||
|  *  Train a dictionary from an array of samples using the COVER algorithm. | ||||
|  *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`, | ||||
|  *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. | ||||
|  *  The resulting dictionary will be saved into `dictBuffer`. | ||||
|  * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) | ||||
|  *           or an error code, which can be tested with ZDICT_isError(). | ||||
|  * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte. | ||||
|  * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. | ||||
|  *        It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`. | ||||
|  *        In general, it's recommended to provide a few thousands samples, but this can vary a lot. | ||||
|  *          or an error code, which can be tested with ZDICT_isError(). | ||||
|  *  Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte. | ||||
|  *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB. | ||||
|  *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. | ||||
|  *        In general, it's recommended to provide a few thousands samples, though this can vary a lot. | ||||
|  *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary. | ||||
|  */ | ||||
| ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( | ||||
|     void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, | ||||
|     const size_t *samplesSizes, unsigned nbSamples, | ||||
|     void *dictBuffer, size_t dictBufferCapacity, | ||||
|     const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, | ||||
|     ZDICT_cover_params_t parameters); | ||||
|  | ||||
| /*! ZDICT_optimizeTrainFromBuffer_cover(): | ||||
|  * The same requirements as above hold for all the parameters except `parameters`. | ||||
|  * This function tries many parameter combinations and picks the best parameters. | ||||
|  * `*parameters` is filled with the best parameters found, and the dictionary | ||||
|  * constructed with those parameters is stored in `dictBuffer`. | ||||
|  * `*parameters` is filled with the best parameters found, | ||||
|  * dictionary constructed with those parameters is stored in `dictBuffer`. | ||||
|  * | ||||
|  * All of the parameters d, k, steps are optional. | ||||
|  * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}. | ||||
| @@ -125,9 +125,9 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( | ||||
|  * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread. | ||||
|  */ | ||||
| ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( | ||||
|     void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, | ||||
|     const size_t *samplesSizes, unsigned nbSamples, | ||||
|     ZDICT_cover_params_t *parameters); | ||||
|     void* dictBuffer, size_t dictBufferCapacity, | ||||
|     const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, | ||||
|     ZDICT_cover_params_t* parameters); | ||||
|  | ||||
| /*! ZDICT_finalizeDictionary(): | ||||
|  * Given a custom content as a basis for dictionary, and a set of samples, | ||||
| @@ -157,22 +157,23 @@ typedef struct { | ||||
| } ZDICT_legacy_params_t; | ||||
|  | ||||
| /*! ZDICT_trainFromBuffer_legacy(): | ||||
|  * Train a dictionary from an array of samples. | ||||
|  * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, | ||||
|  * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. | ||||
|  * The resulting dictionary will be saved into `dictBuffer`. | ||||
|  *  Train a dictionary from an array of samples. | ||||
|  *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`, | ||||
|  *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. | ||||
|  *  The resulting dictionary will be saved into `dictBuffer`. | ||||
|  * `parameters` is optional and can be provided with values set to 0 to mean "default". | ||||
|  * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) | ||||
|  *           or an error code, which can be tested with ZDICT_isError(). | ||||
|  * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. | ||||
|  *        It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`. | ||||
|  *        In general, it's recommended to provide a few thousands samples, but this can vary a lot. | ||||
|  *          or an error code, which can be tested with ZDICT_isError(). | ||||
|  *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB. | ||||
|  *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`. | ||||
|  *        In general, it's recommended to provide a few thousands samples, though this can vary a lot. | ||||
|  *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary. | ||||
|  * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0. | ||||
|  *  Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0. | ||||
|  */ | ||||
| ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy( | ||||
|     void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer, | ||||
|     const size_t *samplesSizes, unsigned nbSamples, ZDICT_legacy_params_t parameters); | ||||
|     void *dictBuffer, size_t dictBufferCapacity, | ||||
|     const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, | ||||
|     ZDICT_legacy_params_t parameters); | ||||
|  | ||||
| /* Deprecation warnings */ | ||||
| /* It is generally possible to disable deprecation warnings from compiler, | ||||
|   | ||||
| @@ -666,12 +666,13 @@ static int basicUnitTests(U32 seed, double compressibility) | ||||
|  | ||||
|     /* Dictionary and dictBuilder tests */ | ||||
|     {   ZSTD_CCtx* const cctx = ZSTD_createCCtx(); | ||||
|         size_t dictSize = 16 KB; | ||||
|         void* dictBuffer = malloc(dictSize); | ||||
|         size_t const dictBufferCapacity = 16 KB; | ||||
|         void* dictBuffer = malloc(dictBufferCapacity); | ||||
|         size_t const totalSampleSize = 1 MB; | ||||
|         size_t const sampleUnitSize = 8 KB; | ||||
|         U32 const nbSamples = (U32)(totalSampleSize / sampleUnitSize); | ||||
|         size_t* const samplesSizes = (size_t*) malloc(nbSamples * sizeof(size_t)); | ||||
|         size_t dictSize; | ||||
|         U32 dictID; | ||||
|  | ||||
|         if (dictBuffer==NULL || samplesSizes==NULL) { | ||||
| @@ -680,9 +681,19 @@ static int basicUnitTests(U32 seed, double compressibility) | ||||
|             goto _output_error; | ||||
|         } | ||||
|  | ||||
|         DISPLAYLEVEL(4, "test%3i : dictBuilder on cyclic data : ", testNb++); | ||||
|         assert(compressedBufferSize >= totalSampleSize); | ||||
|         { U32 u; for (u=0; u<totalSampleSize; u++) ((BYTE*)decodedBuffer)[u] = (BYTE)u; } | ||||
|         { U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; } | ||||
|         {   size_t const sDictSize = ZDICT_trainFromBuffer(dictBuffer, dictBufferCapacity, | ||||
|                                          decodedBuffer, samplesSizes, nbSamples); | ||||
|             if (ZDICT_isError(sDictSize)) goto _output_error; | ||||
|             DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)sDictSize); | ||||
|         } | ||||
|  | ||||
|         DISPLAYLEVEL(4, "test%3i : dictBuilder : ", testNb++); | ||||
|         { U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; } | ||||
|         dictSize = ZDICT_trainFromBuffer(dictBuffer, dictSize, | ||||
|         dictSize = ZDICT_trainFromBuffer(dictBuffer, dictBufferCapacity, | ||||
|                                          CNBuffer, samplesSizes, nbSamples); | ||||
|         if (ZDICT_isError(dictSize)) goto _output_error; | ||||
|         DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)dictSize); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user