1
0
mirror of https://github.com/facebook/zstd.git synced 2025-09-16 09:36:32 +02:00

External matchfinder API (#3333)

* First building commit with sample matchfinder

* Set up ZSTD_externalMatchCtx struct

* move seqBuffer to ZSTD_Sequence*

* support non-contiguous dictionary

* clean up parens

* add clearExternalMatchfinder, handle allocation errors

* Add useExternalMatchfinder cParam

* validate useExternalMatchfinder cParam

* Disable LDM + external matchfinder

* Check for static CCtx

* Validate mState and mStateDestructor

* Improve LDM check to cover both branches

* Error API with optional fallback

* handle RLE properly for external matchfinder

* nit

* Move to a CDict-like model for resource ownership

* Add hidden useExternalMatchfinder bool to CCtx_params_s

* Eliminate malloc, move to cwksp allocation

* Handle CCtx reset properly

* Ensure seqStore has enough space for external sequences

* fix capitalization

* Add DEBUGLOG statements

* Add compressionLevel param to matchfinder API

* fix c99 issues and add a param combination error code

* nits

* Test external matchfinder API

* C90 compat for simpleExternalMatchFinder

* Fix some @nocommits and an ASAN bug

* nit

* nit

* nits

* forward declare copySequencesToSeqStore functions in zstd_compress_internal.h

* nit

* nit

* nits

* Update copyright headers

* Fix CMake zstreamtest build

* Fix copyright headers (again)

* typo

* Add externalMatchfinder demo program to make contrib

* Reduce memory consumption for small blockSize

* ZSTD_postProcessExternalMatchFinderResult nits

* test sum(matchlen) + sum(litlen) == srcSize in debug builds

* refExternalMatchFinder -> registerExternalMatchFinder

* C90 nit

* zstreamtest nits

* contrib nits

* contrib nits

* allow block splitter + external matchfinder, refactor

* add windowSize param

* add contrib/externalMatchfinder/README.md

* docs

* go back to old RLE heuristic because of the first block issue

* fix initializer element is not a constant expression

* ref contrib from zstd.h

* extremely pedantic compiler warning fix, meson fix, typo fix

* Additional docs on API limitations

* minor nits

* Refactor maxNbSeq calculation into a helper function

* Fix copyright
This commit is contained in:
Elliot Gorokhovsky
2022-12-28 16:45:14 -05:00
committed by GitHub
parent 90597d78ea
commit 2a402626dd
18 changed files with 929 additions and 40 deletions

View File

@@ -123,6 +123,7 @@ contrib: lib
$(MAKE) -C contrib/seekable_format/examples all
$(MAKE) -C contrib/seekable_format/tests test
$(MAKE) -C contrib/largeNbDicts all
$(MAKE) -C contrib/externalMatchfinder all
cd build/single_file_libs/ ; ./build_decoder_test.sh
cd build/single_file_libs/ ; ./build_library_test.sh
@@ -142,6 +143,7 @@ clean:
$(Q)$(MAKE) -C contrib/seekable_format/examples $@ > $(VOID)
$(Q)$(MAKE) -C contrib/seekable_format/tests $@ > $(VOID)
$(Q)$(MAKE) -C contrib/largeNbDicts $@ > $(VOID)
$(Q)$(MAKE) -C contrib/externalMatchfinder $@ > $(VOID)
$(Q)$(RM) zstd$(EXT) zstdmt$(EXT) tmp*
$(Q)$(RM) -r lz4
@echo Cleaning completed

View File

@@ -81,7 +81,7 @@ add_test(NAME fuzzer COMMAND fuzzer ${ZSTD_FUZZER_FLAGS})
#
# zstreamtest
#
add_executable(zstreamtest ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/seqgen.c ${TESTS_DIR}/zstreamtest.c)
add_executable(zstreamtest ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/seqgen.c ${TESTS_DIR}/zstreamtest.c ${TESTS_DIR}/external_matchfinder.c)
if (NOT MSVC)
target_compile_options(zstreamtest PRIVATE "-Wno-deprecated-declarations")
endif()

View File

@@ -65,8 +65,10 @@ fuzzer = executable('fuzzer',
dependencies: [ testcommon_dep, thread_dep ],
install: false)
zstreamtest_sources = [join_paths(zstd_rootdir, 'tests/seqgen.c'),
join_paths(zstd_rootdir, 'tests/zstreamtest.c')]
zstreamtest_sources = [
join_paths(zstd_rootdir, 'tests/seqgen.c'),
join_paths(zstd_rootdir, 'tests/zstreamtest.c'),
join_paths(zstd_rootdir, 'tests/external_matchfinder.c')]
zstreamtest = executable('zstreamtest',
zstreamtest_sources,
include_directories: test_includes,

View File

@@ -0,0 +1,2 @@
# build artifacts
externalMatchfinder

View File

@@ -0,0 +1,40 @@
# ################################################################
# Copyright (c) Yann Collet, Meta Platforms, Inc.
# All rights reserved.
#
# This source code is licensed under both the BSD-style license (found in the
# LICENSE file in the root directory of this source tree) and the GPLv2 (found
# in the COPYING file in the root directory of this source tree).
# ################################################################
PROGDIR = ../../programs
LIBDIR = ../../lib
LIBZSTD = $(LIBDIR)/libzstd.a
CPPFLAGS+= -I$(LIBDIR) -I$(LIBDIR)/compress -I$(LIBDIR)/common
CFLAGS ?= -O3
CFLAGS += -std=gnu99
DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
-Wstrict-aliasing=1 -Wswitch-enum \
-Wstrict-prototypes -Wundef -Wpointer-arith \
-Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
-Wredundant-decls
CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS)
default: externalMatchfinder
all: externalMatchfinder
externalMatchfinder: matchfinder.c main.c $(LIBZSTD)
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
.PHONY: $(LIBZSTD)
$(LIBZSTD):
$(MAKE) -C $(LIBDIR) libzstd.a CFLAGS="$(CFLAGS)"
clean:
$(RM) *.o
$(MAKE) -C $(LIBDIR) clean > /dev/null
$(RM) externalMatchfinder

View File

@@ -0,0 +1,14 @@
externalMatchfinder
=====================
`externalMatchfinder` is a test tool for the external matchfinder API.
It demonstrates how to use the API to perform a simple round-trip test.
A sample matchfinder is provided in matchfinder.c, but the user can swap
this out with a different one if desired. The sample matchfinder implements
LZ compression with a 1KB hashtable. Dictionary compression is not currently supported.
Command line :
```
externalMatchfinder filename
```

View File

@@ -0,0 +1,107 @@
/*
* Copyright (c) Yann Collet, Meta Platforms, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#define ZSTD_STATIC_LINKING_ONLY
#include "zstd.h"
#include "zstd_errors.h"
#include "matchfinder.h" // simpleExternalMatchFinder
#define CHECK(res) \
do { \
if (ZSTD_isError(res)) { \
printf("ERROR: %s\n", ZSTD_getErrorName(res)); \
return 1; \
} \
} while (0) \
int main(int argc, char *argv[]) {
if (argc != 2) {
printf("Usage: exampleMatchfinder <file>\n");
return 1;
}
ZSTD_CCtx* const zc = ZSTD_createCCtx();
int simpleExternalMatchState = 0xdeadbeef;
// Here is the crucial bit of code!
ZSTD_registerExternalMatchFinder(
zc,
&simpleExternalMatchState,
simpleExternalMatchFinder
);
{
size_t const res = ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, 1);
CHECK(res);
}
FILE *f = fopen(argv[1], "rb");
assert(f);
{
int const ret = fseek(f, 0, SEEK_END);
assert(ret == 0);
}
size_t const srcSize = ftell(f);
{
int const ret = fseek(f, 0, SEEK_SET);
assert(ret == 0);
}
char* const src = malloc(srcSize + 1);
assert(src);
{
size_t const ret = fread(src, srcSize, 1, f);
assert(ret == 1);
int const ret2 = fclose(f);
assert(ret2 == 0);
}
size_t const dstSize = ZSTD_compressBound(srcSize);
char* const dst = malloc(dstSize);
assert(dst);
size_t const cSize = ZSTD_compress2(zc, dst, dstSize, src, srcSize);
CHECK(cSize);
char* const val = malloc(srcSize);
assert(val);
{
size_t const res = ZSTD_decompress(val, srcSize, dst, cSize);
CHECK(res);
}
if (memcmp(src, val, srcSize) == 0) {
printf("Compression and decompression were successful!\n");
printf("Original size: %lu\n", srcSize);
printf("Compressed size: %lu\n", cSize);
} else {
printf("ERROR: input and validation buffers don't match!\n");
for (size_t i = 0; i < srcSize; i++) {
if (src[i] != val[i]) {
printf("First bad index: %zu\n", i);
break;
}
}
return 1;
}
ZSTD_freeCCtx(zc);
free(src);
free(dst);
free(val);
return 0;
}

View File

@@ -0,0 +1,80 @@
/*
* Copyright (c) Yann Collet, Meta Platforms, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/
#include "zstd_compress_internal.h"
#include "matchfinder.h"
#define HSIZE 1024
static U32 const HLOG = 10;
static U32 const MLS = 4;
static U32 const BADIDX = 0xffffffff;
size_t simpleExternalMatchFinder(
void* externalMatchState,
ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
const void* src, size_t srcSize,
const void* dict, size_t dictSize,
int compressionLevel,
size_t windowSize
) {
const BYTE* const istart = (const BYTE*)src;
const BYTE* const iend = istart + srcSize;
const BYTE* ip = istart;
const BYTE* anchor = istart;
size_t seqCount = 0;
U32 hashTable[HSIZE];
(void)externalMatchState;
(void)dict;
(void)dictSize;
(void)outSeqsCapacity;
(void)compressionLevel;
{ int i;
for (i=0; i < HSIZE; i++) {
hashTable[i] = BADIDX;
} }
while (ip + MLS < iend) {
size_t const hash = ZSTD_hashPtr(ip, HLOG, MLS);
U32 const matchIndex = hashTable[hash];
hashTable[hash] = (U32)(ip - istart);
if (matchIndex != BADIDX) {
const BYTE* const match = istart + matchIndex;
U32 const matchLen = (U32)ZSTD_count(ip, match, iend);
if (matchLen >= ZSTD_MINMATCH_MIN) {
U32 const litLen = (U32)(ip - anchor);
U32 const offset = (U32)(ip - match);
ZSTD_Sequence const seq = {
offset, litLen, matchLen, 0
};
/* Note: it's crucial to stay within the window size! */
if (offset <= windowSize) {
outSeqs[seqCount++] = seq;
ip += matchLen;
anchor = ip;
continue;
}
}
}
ip++;
}
{ ZSTD_Sequence const finalSeq = {
0, (U32)(iend - anchor), 0, 0
};
outSeqs[seqCount++] = finalSeq;
}
return seqCount;
}

View File

@@ -0,0 +1,26 @@
/*
* Copyright (c) Yann Collet, Meta Platforms, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/
#ifndef MATCHFINDER_H
#define MATCHFINDER_H
#define ZSTD_STATIC_LINKING_ONLY
#include "zstd.h"
size_t simpleExternalMatchFinder(
void* externalMatchState,
ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
const void* src, size_t srcSize,
const void* dict, size_t dictSize,
int compressionLevel,
size_t windowSize
);
#endif

View File

@@ -31,6 +31,7 @@ const char* ERR_getErrorString(ERR_enum code)
case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification";
case PREFIX(parameter_unsupported): return "Unsupported parameter";
case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters";
case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
case PREFIX(init_missing): return "Context should be init first";
case PREFIX(memory_allocation): return "Allocation error : not enough memory";
@@ -51,6 +52,7 @@ const char* ERR_getErrorString(ERR_enum code)
case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong";
case PREFIX(srcBuffer_wrong): return "Source buffer is wrong";
case PREFIX(externalMatchFinder_failed): return "External matchfinder returned an error code";
case PREFIX(maxCode):
default: return notErrorCode;
}

View File

@@ -278,6 +278,16 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable;
}
/* Enables validation for external sequences in debug builds. */
static int ZSTD_resolveExternalSequenceValidation(int mode) {
#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=2)
(void)mode;
return 1;
#else
return mode;
#endif
}
/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged.
* If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */
static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) {
@@ -301,6 +311,7 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
}
cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams);
cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences);
assert(!ZSTD_checkCParams(cParams));
return cctxParams;
}
@@ -362,6 +373,7 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par
cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences);
DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
}
@@ -584,6 +596,11 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
bounds.upperBound = (int)ZSTD_ps_disable;
return bounds;
case ZSTD_c_enableMatchFinderFallback:
bounds.lowerBound = 0;
bounds.upperBound = 1;
return bounds;
default:
bounds.error = ERROR(parameter_unsupported);
return bounds;
@@ -649,6 +666,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
case ZSTD_c_useRowMatchFinder:
case ZSTD_c_deterministicRefPrefix:
case ZSTD_c_prefetchCDictTables:
case ZSTD_c_enableMatchFinderFallback:
default:
return 0;
}
@@ -705,6 +723,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
case ZSTD_c_useRowMatchFinder:
case ZSTD_c_deterministicRefPrefix:
case ZSTD_c_prefetchCDictTables:
case ZSTD_c_enableMatchFinderFallback:
break;
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
@@ -937,6 +956,11 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value;
return CCtxParams->prefetchCDictTables;
case ZSTD_c_enableMatchFinderFallback:
BOUNDCHECK(ZSTD_c_enableMatchFinderFallback, value);
CCtxParams->enableMatchFinderFallback = value;
return CCtxParams->enableMatchFinderFallback;
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
}
}
@@ -1072,6 +1096,9 @@ size_t ZSTD_CCtxParams_getParameter(
case ZSTD_c_prefetchCDictTables:
*value = (int)CCtxParams->prefetchCDictTables;
break;
case ZSTD_c_enableMatchFinderFallback:
*value = CCtxParams->enableMatchFinderFallback;
break;
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
}
return 0;
@@ -1243,6 +1270,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
"Can't reset parameters only when not in init stage.");
ZSTD_clearAllDicts(cctx);
ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx));
return ZSTD_CCtxParams_reset(&cctx->requestedParams);
}
return 0;
@@ -1485,6 +1513,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
return tableSpace + optSpace + slackSpace + lazyAdditionalSpace;
}
/* Helper function for calculating memory requirements.
* Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */
static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useExternalMatchFinder) {
U32 const divider = (minMatch==3 || useExternalMatchFinder) ? 3 : 4;
return blockSize / divider;
}
static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
const ZSTD_compressionParameters* cParams,
const ldmParams_t* ldmParams,
@@ -1492,12 +1527,12 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
const ZSTD_paramSwitch_e useRowMatchFinder,
const size_t buffInSize,
const size_t buffOutSize,
const U64 pledgedSrcSize)
const U64 pledgedSrcSize,
int useExternalMatchFinder)
{
size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize);
size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
U32 const divider = (cParams->minMatch==3) ? 3 : 4;
size_t const maxNbSeq = blockSize / divider;
size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useExternalMatchFinder);
size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
+ ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef))
+ 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
@@ -1516,6 +1551,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0;
size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
size_t const externalSeqSpace = useExternalMatchFinder
? ZSTD_cwksp_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence))
: 0;
size_t const neededSpace =
cctxSpace +
entropySpace +
@@ -1524,7 +1564,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
ldmSeqSpace +
matchStateSize +
tokenSpace +
bufferSpace;
bufferSpace +
externalSeqSpace;
DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace);
return neededSpace;
@@ -1542,7 +1583,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
* be needed. However, we still allocate two 0-sized buffers, which can
* take space under ASAN. */
return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
&cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN);
&cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useExternalMatchFinder);
}
size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
@@ -1603,7 +1644,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
&cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
ZSTD_CONTENTSIZE_UNKNOWN);
ZSTD_CONTENTSIZE_UNKNOWN, params->useExternalMatchFinder);
}
}
@@ -1886,8 +1927,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
{ size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize));
size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
U32 const divider = (params->cParams.minMatch==3) ? 3 : 4;
size_t const maxNbSeq = blockSize / divider;
size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useExternalMatchFinder);
size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered)
? ZSTD_compressBound(blockSize) + 1
: 0;
@@ -1904,7 +1944,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
size_t const neededSpace =
ZSTD_estimateCCtxSize_usingCCtxParams_internal(
&params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder,
buffInSize, buffOutSize, pledgedSrcSize);
buffInSize, buffOutSize, pledgedSrcSize, params->useExternalMatchFinder);
int resizeWorkspace;
FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!");
@@ -2017,6 +2057,14 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
zc->ldmState.loadedDictEnd = 0;
}
/* reserve space for block-level external sequences */
if (params->useExternalMatchFinder) {
size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq;
zc->externalMatchCtx.seqBuffer =
(ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence));
}
DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace));
@@ -2868,6 +2916,55 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr)
ssPtr->longLengthType = ZSTD_llt_none;
}
/* ZSTD_postProcessExternalMatchFinderResult() :
* Validates and post-processes sequences obtained through the external matchfinder API:
* - Checks whether nbExternalSeqs represents an error condition.
* - Appends a block delimiter to outSeqs if one is not already present.
* See zstd.h for context regarding block delimiters.
* Returns the number of sequences after post-processing, or an error code. */
static size_t ZSTD_postProcessExternalMatchFinderResult(
ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize
) {
RETURN_ERROR_IF(
nbExternalSeqs > outSeqsCapacity,
externalMatchFinder_failed,
"External matchfinder returned error code %lu",
(unsigned long)nbExternalSeqs
);
RETURN_ERROR_IF(
nbExternalSeqs == 0 && srcSize > 0,
externalMatchFinder_failed,
"External matchfinder produced zero sequences for a non-empty src buffer!"
);
if (srcSize == 0) {
ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence));
return 1;
}
{
ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1];
/* We can return early if lastSeq is already a block delimiter. */
if (lastSeq.offset == 0 && lastSeq.matchLength == 0) {
return nbExternalSeqs;
}
/* This error condition is only possible if the external matchfinder
* produced an invalid parse, by definition of ZSTD_sequenceBound(). */
RETURN_ERROR_IF(
nbExternalSeqs == outSeqsCapacity,
externalMatchFinder_failed,
"nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!"
);
/* lastSeq is not a block delimiter, so we need to append one. */
ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence));
return nbExternalSeqs + 1;
}
}
typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e;
static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
@@ -2915,6 +3012,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
}
if (zc->externSeqStore.pos < zc->externSeqStore.size) {
assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable);
/* External matchfinder + LDM is technically possible, just not implemented yet.
* We need to revisit soon and implement it. */
RETURN_ERROR_IF(
zc->appliedParams.useExternalMatchFinder,
parameter_combination_unsupported,
"Long-distance matching with external matchfinder enabled is not currently supported."
);
/* Updates ldmSeqStore.pos */
lastLLSize =
ZSTD_ldm_blockCompress(&zc->externSeqStore,
@@ -2926,6 +3032,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
} else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {
rawSeqStore_t ldmSeqStore = kNullRawSeqStore;
/* External matchfinder + LDM is technically possible, just not implemented yet.
* We need to revisit soon and implement it. */
RETURN_ERROR_IF(
zc->appliedParams.useExternalMatchFinder,
parameter_combination_unsupported,
"Long-distance matching with external matchfinder enabled is not currently supported."
);
ldmSeqStore.seq = zc->ldmSequences;
ldmSeqStore.capacity = zc->maxNbLdmSequences;
/* Updates ldmSeqStore.size */
@@ -2940,10 +3054,64 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
zc->appliedParams.useRowMatchFinder,
src, srcSize);
assert(ldmSeqStore.pos == ldmSeqStore.size);
} else { /* not long range mode */
} else if (zc->appliedParams.useExternalMatchFinder) {
assert(
zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize)
);
assert(zc->externalMatchCtx.mFinder != NULL);
{ U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog;
size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)(
zc->externalMatchCtx.mState,
zc->externalMatchCtx.seqBuffer,
zc->externalMatchCtx.seqBufferCapacity,
src, srcSize,
NULL, 0, /* dict and dictSize, currently not supported */
zc->appliedParams.compressionLevel,
windowSize
);
size_t const nbPostProcessedSeqs = ZSTD_postProcessExternalMatchFinderResult(
zc->externalMatchCtx.seqBuffer,
nbExternalSeqs,
zc->externalMatchCtx.seqBufferCapacity,
srcSize
);
/* Return early if there is no error, since we don't need to worry about last literals */
if (!ZSTD_isError(nbPostProcessedSeqs)) {
ZSTD_sequencePosition seqPos = {0,0,0};
ZSTD_copySequencesToSeqStoreExplicitBlockDelim(
zc, &seqPos, zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs, src, srcSize
);
ms->ldmSeqStore = NULL;
DEBUGLOG(5, "Copied %lu sequences from external matchfinder to internal seqStore.", (unsigned long)nbExternalSeqs);
return ZSTDbss_compress;
}
/* Propagate the error if fallback is disabled */
if (!zc->appliedParams.enableMatchFinderFallback) {
return nbPostProcessedSeqs;
}
/* Fallback to software matchfinder */
{ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
zc->appliedParams.useRowMatchFinder,
dictMode);
ms->ldmSeqStore = NULL;
DEBUGLOG(
5,
"External matchfinder returned error code %lu. Falling back to internal matchfinder.",
(unsigned long)nbExternalSeqs
);
lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
} }
} else { /* not long range mode and no external matchfinder */
ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
zc->appliedParams.useRowMatchFinder,
dictMode);
assert(zc->externalMatchCtx.mFinder == NULL);
ms->ldmSeqStore = NULL;
lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
}
@@ -5726,6 +5894,7 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams);
params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams);
params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);
params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences);
#ifdef ZSTD_MULTITHREAD
if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) {
@@ -5927,12 +6096,6 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
}
}
typedef struct {
U32 idx; /* Index in array of ZSTD_Sequence */
U32 posInSequence; /* Position within sequence at idx */
size_t posInSrc; /* Number of bytes given by sequences provided so far */
} ZSTD_sequencePosition;
/* ZSTD_validateSequence() :
* @offCode : is presumed to follow format required by ZSTD_storeSeq()
* @returns a ZSTD error code if sequence is not valid
@@ -5970,10 +6133,7 @@ static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32
return offBase;
}
/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
* ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
*/
static size_t
size_t
ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
ZSTD_sequencePosition* seqPos,
const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
@@ -6027,19 +6187,7 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
return 0;
}
/* Returns the number of bytes to move the current read position back by.
* Only non-zero if we ended up splitting a sequence.
* Otherwise, it may return a ZSTD error if something went wrong.
*
* This function will attempt to scan through blockSize bytes
* represented by the sequences in @inSeqs,
* storing any (partial) sequences.
*
* Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to
* avoid splitting a match, or to avoid splitting a match such that it would produce a match
* smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
*/
static size_t
size_t
ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
const void* src, size_t blockSize)
@@ -6572,3 +6720,19 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH
if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown);
}
void ZSTD_registerExternalMatchFinder(
ZSTD_CCtx* zc, void* mState,
ZSTD_externalMatchFinder_F* mFinder
) {
ZSTD_externalMatchCtx emctx = {
mState,
mFinder,
/* seqBuffer is allocated later (from the cwskp) */
NULL, /* seqBuffer */
0 /* seqBufferCapacity */
};
zc->externalMatchCtx = emctx;
zc->requestedParams.useExternalMatchFinder = 1;
}

View File

@@ -150,6 +150,12 @@ typedef struct {
size_t capacity; /* The capacity starting from `seq` pointer */
} rawSeqStore_t;
typedef struct {
U32 idx; /* Index in array of ZSTD_Sequence */
U32 posInSequence; /* Position within sequence at idx */
size_t posInSrc; /* Number of bytes given by sequences provided so far */
} ZSTD_sequencePosition;
UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0};
typedef struct {
@@ -340,6 +346,15 @@ struct ZSTD_CCtx_params_s {
/* Controls prefetching in some dictMatchState matchfinders */
ZSTD_paramSwitch_e prefetchCDictTables;
/* Controls whether zstd will fall back to an internal matchfinder
* if the external matchfinder returns an error code. */
int enableMatchFinderFallback;
/* Indicates whether an external matchfinder has been referenced.
* Users can't set this externally.
* It is set internally in ZSTD_registerExternalMatchFinder(). */
int useExternalMatchFinder;
}; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
#define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2))
@@ -371,6 +386,14 @@ typedef struct {
ZSTD_entropyCTablesMetadata_t entropyMetadata;
} ZSTD_blockSplitCtx;
/* Context for block-level external matchfinder API */
typedef struct {
void* mState;
ZSTD_externalMatchFinder_F* mFinder;
ZSTD_Sequence* seqBuffer;
size_t seqBufferCapacity;
} ZSTD_externalMatchCtx;
struct ZSTD_CCtx_s {
ZSTD_compressionStage_e stage;
int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
@@ -440,6 +463,9 @@ struct ZSTD_CCtx_s {
/* Workspace for block splitter */
ZSTD_blockSplitCtx blockSplitCtx;
/* Workspace for external matchfinder */
ZSTD_externalMatchCtx externalMatchCtx;
};
typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;
@@ -1411,4 +1437,31 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
*/
void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize);
/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
* ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
* Note that the block delimiter must include the last literals of the block.
*/
size_t
ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
ZSTD_sequencePosition* seqPos,
const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
const void* src, size_t blockSize);
/* Returns the number of bytes to move the current read position back by.
* Only non-zero if we ended up splitting a sequence.
* Otherwise, it may return a ZSTD error if something went wrong.
*
* This function will attempt to scan through blockSize bytes
* represented by the sequences in @inSeqs,
* storing any (partial) sequences.
*
* Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to
* avoid splitting a match, or to avoid splitting a match such that it would produce a match
* smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
*/
size_t
ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
const void* src, size_t blockSize);
#endif /* ZSTD_COMPRESS_H */

View File

@@ -478,6 +478,7 @@ typedef enum {
* ZSTD_c_useBlockSplitter
* ZSTD_c_useRowMatchFinder
* ZSTD_c_prefetchCDictTables
* ZSTD_c_enableMatchFinderFallback
* Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
* note : never ever use experimentalParam? names directly;
* also, the enums values themselves are unstable and can still change.
@@ -497,7 +498,8 @@ typedef enum {
ZSTD_c_experimentalParam13=1010,
ZSTD_c_experimentalParam14=1011,
ZSTD_c_experimentalParam15=1012,
ZSTD_c_experimentalParam16=1013
ZSTD_c_experimentalParam16=1013,
ZSTD_c_experimentalParam17=1014
} ZSTD_cParameter;
typedef struct {
@@ -560,7 +562,7 @@ typedef enum {
* They will be used to compress next frame.
* Resetting session never fails.
* - The parameters : changes all parameters back to "default".
* This removes any reference to any dictionary too.
* This also removes any reference to any dictionary or external matchfinder.
* Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing)
* otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError())
* - Both : similar to resetting the session, followed by resetting parameters.
@@ -2044,6 +2046,20 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
*/
#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16
/* ZSTD_c_enableMatchFinderFallback
* Allowed values are 0 (disable) and 1 (enable). The default setting is 0.
*
* Controls whether zstd will fall back to an internal matchfinder if an
* external matchfinder is registered and returns an error code. This fallback is
* block-by-block: the internal matchfinder will only be called for blocks where
* the external matchfinder returns an error code. Fallback compression will
* follow any other cParam settings, such as compression level, the same as in a
* normal (fully-internal) compression operation.
*
* The user is strongly encouraged to read the full external matchfinder API
* documentation (below) before setting this parameter. */
#define ZSTD_c_enableMatchFinderFallback ZSTD_c_experimentalParam17
/*! ZSTD_CCtx_getParameter() :
* Get the requested compression parameter value, selected by enum ZSTD_cParameter,
* and store it into int* value.
@@ -2676,6 +2692,142 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_
ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
/* ********************** EXTERNAL MATCHFINDER API **********************
*
* *** OVERVIEW ***
* This API allows users to replace the zstd internal block-level matchfinder
* with an external matchfinder function. Potential applications of the API
* include hardware-accelerated matchfinders and matchfinders specialized to
* particular types of data.
*
* See contrib/externalMatchfinder for an example program employing the
* external matchfinder API.
*
* *** USAGE ***
* The user is responsible for implementing a function of type
* ZSTD_externalMatchFinder_F. For each block, zstd will pass the following
* arguments to the user-provided function:
*
* - externalMatchState: a pointer to a user-managed state for the external
* matchfinder.
*
* - outSeqs, outSeqsCapacity: an output buffer for sequences produced by the
* external matchfinder. outSeqsCapacity is guaranteed >=
* ZSTD_sequenceBound(srcSize). The memory backing outSeqs is managed by
* the CCtx.
*
* - src, srcSize: an input buffer which the external matchfinder must parse
* into sequences. srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX.
*
* - dict, dictSize: a history buffer, which may be empty, which the external
* matchfinder may reference as it produces sequences for the src buffer.
* Currently, zstd will always pass dictSize == 0 into external matchfinders,
* but this will change in the future.
*
* - compressionLevel: a signed integer representing the zstd compression level
* set by the user for the current operation. The external matchfinder may
* choose to use this information to change its compression strategy and
* speed/ratio tradeoff. Note: The compression level does not reflect zstd
* parameters set through the advanced API.
*
* - windowSize: a size_t representing the maximum allowed offset for external
* sequences. Note that sequence offsets are sometimes allowed to exceed the
* windowSize if a dictionary is present, see doc/zstd_compression_format.md
* for details.
*
* The user-provided function shall return a size_t representing the number of
* sequences written to outSeqs. This return value will be treated as an error
* code if it is greater than outSeqsCapacity. The return value must be non-zero
* if srcSize is non-zero. The ZSTD_EXTERNAL_MATCHFINDER_ERROR macro is provided
* for convenience, but any value greater than outSeqsCapacity will be treated as
* an error code.
*
* If the user-provided function does not return an error code, the sequences
* written to outSeqs must be a valid parse of the src buffer. Data corruption may
* occur if the parse is not valid. A parse is defined to be valid if the
* following conditions hold:
* - The sum of matchLengths and literalLengths is equal to srcSize.
* - All sequences in the parse have matchLength != 0, except for the final
* sequence. matchLength is not constrained for the final sequence.
* - All offsets respect the windowSize parameter as specified in
* doc/zstd_compression_format.md.
*
* zstd will only validate these conditions (and fail compression if they do not
* hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence
* validation has a performance cost.
*
* If the user-provided function returns an error, zstd will either fall back
* to an internal matchfinder or fail the compression operation. The user can
* choose between the two behaviors by setting the
* ZSTD_c_enableMatchFinderFallback cParam. Fallback compression will follow any
* other cParam settings, such as compression level, the same as in a normal
* compression operation.
*
* The user shall instruct zstd to use a particular ZSTD_externalMatchFinder_F
* function by calling ZSTD_registerExternalMatchFinder(cctx, externalMatchState,
* externalMatchFinder). This setting will persist until the next parameter reset
* of the CCtx.
*
* The externalMatchState must be initialized by the user before calling
* ZSTD_registerExternalMatchFinder. The user is responsible for destroying the
* externalMatchState.
*
* *** LIMITATIONS ***
* External matchfinders are compatible with all zstd compression APIs. There are
* only two limitations.
*
* First, the ZSTD_c_enableLongDistanceMatching cParam is not supported.
* COMPRESSION WILL FAIL if it is enabled and the user tries to compress with an
* external matchfinder.
* - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in
* some cases (see its documentation for details). Users must explicitly set
* ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an
* external matchfinder is registered.
* - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default
* whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should
* check the docs on ZSTD_c_enableLongDistanceMatching whenever the external
* matchfinder API is used in conjunction with advanced settings (like windowLog).
*
* Second, history buffers are not supported. Concretely, zstd will always pass
* dictSize == 0 to the external matchfinder (for now). This has two implications:
* - Dictionaries are not supported. Compression will *not* fail if the user
* references a dictionary, but the dictionary won't have any effect.
* - Stream history is not supported. All compression APIs, including streaming
* APIs, work with the external matchfinder, but the external matchfinder won't
* receive any history from the previous block. Each block is an independent chunk.
*
* Long-term, we plan to overcome both limitations. There is no technical blocker to
* overcoming them. It is purely a question of engineering effort.
*/
#define ZSTD_EXTERNAL_MATCHFINDER_ERROR ((size_t)(-1))
typedef size_t ZSTD_externalMatchFinder_F (
void* externalMatchState,
ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
const void* src, size_t srcSize,
const void* dict, size_t dictSize,
int compressionLevel,
size_t windowSize
);
/*! ZSTD_registerExternalMatchFinder() :
* Instruct zstd to use an external matchfinder function.
*
* The externalMatchState must be initialized by the caller, and the caller is
* responsible for managing its lifetime. This parameter is sticky across
* compressions. It will remain set until the user explicitly resets compression
* parameters.
*
* The user is strongly encouraged to read the full API documentation (above)
* before calling this function. */
ZSTDLIB_STATIC_API void
ZSTD_registerExternalMatchFinder(
ZSTD_CCtx* cctx,
void* externalMatchState,
ZSTD_externalMatchFinder_F* externalMatchFinder
);
#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
#if defined (__cplusplus)

View File

@@ -75,6 +75,7 @@ typedef enum {
ZSTD_error_dictionary_wrong = 32,
ZSTD_error_dictionaryCreation_failed = 34,
ZSTD_error_parameter_unsupported = 40,
ZSTD_error_parameter_combination_unsupported = 41,
ZSTD_error_parameter_outOfBound = 42,
ZSTD_error_tableLog_tooLarge = 44,
ZSTD_error_maxSymbolValue_tooLarge = 46,
@@ -92,6 +93,7 @@ typedef enum {
ZSTD_error_seekableIO = 102,
ZSTD_error_dstBuffer_wrong = 104,
ZSTD_error_srcBuffer_wrong = 105,
ZSTD_error_externalMatchFinder_failed = 106,
ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
} ZSTD_ErrorCode;

View File

@@ -169,7 +169,7 @@ fuzzer-dll : $(ZSTDDIR)/common/xxhash.c $(PRGDIR)/util.c $(PRGDIR)/timefn.c $(PR
$(CC) $(CPPFLAGS) $(CFLAGS) $(filter %.c,$^) $(LDFLAGS) -o $@$(EXT)
CLEAN += zstreamtest zstreamtest32
ZSTREAM_LOCAL_FILES := $(PRGDIR)/datagen.c $(PRGDIR)/util.c $(PRGDIR)/timefn.c seqgen.c zstreamtest.c
ZSTREAM_LOCAL_FILES := $(PRGDIR)/datagen.c $(PRGDIR)/util.c $(PRGDIR)/timefn.c seqgen.c zstreamtest.c external_matchfinder.c
ZSTREAM_PROPER_FILES := $(ZDICT_FILES) $(ZSTREAM_LOCAL_FILES)
ZSTREAMFILES := $(ZSTD_FILES) $(ZSTREAM_PROPER_FILES)
zstreamtest32 : CFLAGS += -m32

View File

@@ -0,0 +1,117 @@
/*
* Copyright (c) Yann Collet, Meta Platforms, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/
#include "external_matchfinder.h"
#include <string.h>
#include "zstd_compress_internal.h"
#define HSIZE 1024
static U32 const HLOG = 10;
static U32 const MLS = 4;
static U32 const BADIDX = 0xffffffff;
static size_t simpleExternalMatchFinder(
void* externalMatchState,
ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
const void* src, size_t srcSize,
const void* dict, size_t dictSize,
int compressionLevel,
size_t windowSize
) {
const BYTE* const istart = (const BYTE*)src;
const BYTE* const iend = istart + srcSize;
const BYTE* ip = istart;
const BYTE* anchor = istart;
size_t seqCount = 0;
U32 hashTable[HSIZE];
(void)externalMatchState;
(void)dict;
(void)dictSize;
(void)outSeqsCapacity;
(void)compressionLevel;
{ int i;
for (i=0; i < HSIZE; i++) {
hashTable[i] = BADIDX;
} }
while (ip + MLS < iend) {
size_t const hash = ZSTD_hashPtr(ip, HLOG, MLS);
U32 const matchIndex = hashTable[hash];
hashTable[hash] = (U32)(ip - istart);
if (matchIndex != BADIDX) {
const BYTE* const match = istart + matchIndex;
U32 const matchLen = (U32)ZSTD_count(ip, match, iend);
if (matchLen >= ZSTD_MINMATCH_MIN) {
U32 const litLen = (U32)(ip - anchor);
U32 const offset = (U32)(ip - match);
ZSTD_Sequence const seq = {
offset, litLen, matchLen, 0
};
/* Note: it's crucial to stay within the window size! */
if (offset <= windowSize) {
outSeqs[seqCount++] = seq;
ip += matchLen;
anchor = ip;
continue;
}
}
}
ip++;
}
{ ZSTD_Sequence const finalSeq = {
0, (U32)(iend - anchor), 0, 0
};
outSeqs[seqCount++] = finalSeq;
}
return seqCount;
}
size_t zstreamExternalMatchFinder(
void* externalMatchState,
ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
const void* src, size_t srcSize,
const void* dict, size_t dictSize,
int compressionLevel,
size_t windowSize
) {
EMF_testCase const testCase = *((EMF_testCase*)externalMatchState);
memset(outSeqs, 0, outSeqsCapacity);
switch (testCase) {
case EMF_ZERO_SEQS:
return 0;
case EMF_ONE_BIG_SEQ:
outSeqs[0].offset = 0;
outSeqs[0].matchLength = 0;
outSeqs[0].litLength = (U32)(srcSize);
return 1;
case EMF_LOTS_OF_SEQS:
return simpleExternalMatchFinder(
externalMatchState,
outSeqs, outSeqsCapacity,
src, srcSize,
dict, dictSize,
compressionLevel,
windowSize
);
case EMF_SMALL_ERROR:
return outSeqsCapacity + 1;
case EMF_BIG_ERROR:
default:
return ZSTD_EXTERNAL_MATCHFINDER_ERROR;
}
}

View File

@@ -0,0 +1,35 @@
/*
* Copyright (c) Yann Collet, Meta Platforms, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/
#ifndef EXTERNAL_MATCHFINDER
#define EXTERNAL_MATCHFINDER
#define ZSTD_STATIC_LINKING_ONLY
#include "zstd.h"
/* See external_matchfinder.c for details on each test case */
typedef enum {
EMF_ZERO_SEQS = 0,
EMF_ONE_BIG_SEQ = 1,
EMF_LOTS_OF_SEQS = 2,
EMF_BIG_ERROR = 3,
EMF_SMALL_ERROR = 4
} EMF_testCase;
size_t zstreamExternalMatchFinder(
void* externalMatchState,
ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
const void* src, size_t srcSize,
const void* dict, size_t dictSize,
int compressionLevel,
size_t windowSize
);
#endif // EXTERNAL_MATCHFINDER

View File

@@ -39,7 +39,7 @@
#include "seqgen.h"
#include "util.h"
#include "timefn.h" /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
#include "external_matchfinder.h" /* zstreamExternalMatchFinder, EMF_testCase */
/*-************************************
* Constants
@@ -1834,6 +1834,97 @@ static int basicUnitTests(U32 seed, double compressibility, int bigTests)
}
DISPLAYLEVEL(3, "OK \n");
DISPLAYLEVEL(3, "test%3i : External matchfinder API: ", testNb++);
{
size_t const dstBufSize = ZSTD_compressBound(CNBufferSize);
BYTE* const dstBuf = (BYTE*)malloc(ZSTD_compressBound(dstBufSize));
size_t const checkBufSize = CNBufferSize;
BYTE* const checkBuf = (BYTE*)malloc(checkBufSize);
int enableFallback;
EMF_testCase externalMatchState;
CHECK(dstBuf == NULL || checkBuf == NULL, "allocation failed");
ZSTD_CCtx_reset(zc, ZSTD_reset_session_and_parameters);
/* Reference external matchfinder outside the test loop to
* check that the reference is preserved across compressions */
ZSTD_registerExternalMatchFinder(
zc,
&externalMatchState,
zstreamExternalMatchFinder
);
for (enableFallback = 0; enableFallback < 1; enableFallback++) {
size_t testCaseId;
EMF_testCase const EMF_successCases[] = {
EMF_ONE_BIG_SEQ,
EMF_LOTS_OF_SEQS,
};
size_t const EMF_numSuccessCases = 2;
EMF_testCase const EMF_failureCases[] = {
EMF_ZERO_SEQS,
EMF_BIG_ERROR,
EMF_SMALL_ERROR,
};
size_t const EMF_numFailureCases = 3;
/* Test external matchfinder success scenarios */
for (testCaseId = 0; testCaseId < EMF_numSuccessCases; testCaseId++) {
size_t res;
externalMatchState = EMF_successCases[testCaseId];
ZSTD_CCtx_reset(zc, ZSTD_reset_session_only);
CHECK_Z(ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, enableFallback));
res = ZSTD_compress2(zc, dstBuf, dstBufSize, CNBuffer, CNBufferSize);
CHECK(ZSTD_isError(res), "EMF: Compression error: %s", ZSTD_getErrorName(res));
CHECK_Z(ZSTD_decompress(checkBuf, checkBufSize, dstBuf, res));
CHECK(memcmp(CNBuffer, checkBuf, CNBufferSize) != 0, "EMF: Corruption!");
}
/* Test external matchfinder failure scenarios */
for (testCaseId = 0; testCaseId < EMF_numFailureCases; testCaseId++) {
size_t res;
externalMatchState = EMF_failureCases[testCaseId];
ZSTD_CCtx_reset(zc, ZSTD_reset_session_only);
CHECK_Z(ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, enableFallback));
res = ZSTD_compress2(zc, dstBuf, dstBufSize, CNBuffer, CNBufferSize);
if (enableFallback) {
CHECK_Z(ZSTD_decompress(checkBuf, checkBufSize, dstBuf, res));
CHECK(memcmp(CNBuffer, checkBuf, CNBufferSize) != 0, "EMF: Corruption!");
} else {
CHECK(!ZSTD_isError(res), "EMF: Should have raised an error!");
CHECK(
ZSTD_getErrorCode(res) != ZSTD_error_externalMatchFinder_failed,
"EMF: Wrong error code: %s", ZSTD_getErrorName(res)
);
}
}
/* Test compression with external matchfinder + empty src buffer */
{
size_t res;
externalMatchState = EMF_ZERO_SEQS;
ZSTD_CCtx_reset(zc, ZSTD_reset_session_only);
CHECK_Z(ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, enableFallback));
res = ZSTD_compress2(zc, dstBuf, dstBufSize, CNBuffer, 0);
CHECK(ZSTD_isError(res), "EMF: Compression error: %s", ZSTD_getErrorName(res));
CHECK(ZSTD_decompress(checkBuf, checkBufSize, dstBuf, res) != 0, "EMF: Empty src round trip failed!");
}
}
/* Test that reset clears the external matchfinder */
ZSTD_CCtx_reset(zc, ZSTD_reset_session_and_parameters);
externalMatchState = EMF_BIG_ERROR; /* ensure zstd will fail if the matchfinder wasn't cleared */
CHECK_Z(ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, 0));
CHECK_Z(ZSTD_compress2(zc, dstBuf, dstBufSize, CNBuffer, CNBufferSize));
free(dstBuf);
free(checkBuf);
}
DISPLAYLEVEL(3, "OK \n");
_end:
FUZ_freeDictionary(dictionary);
ZSTD_freeCStream(zc);