Skip to content

Commit

Permalink
add ZSTD_c_fastExternalSequenceParsing cctxParam
Browse files Browse the repository at this point in the history
  • Loading branch information
embg committed Feb 1, 2023
1 parent 64052ef commit 7f8189c
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 18 deletions.
79 changes: 71 additions & 8 deletions lib/compress/zstd_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,15 @@ static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) {
}
}

/* ZSTD_resolveExternalRepcodeSearch() :
 * Resolves the repcode-search switch used when parsing external sequences.
 * @value  : requested setting; any value other than ZSTD_ps_auto is an
 *           explicit choice and is returned unchanged.
 * @cLevel : compression level, only consulted when @value is ZSTD_ps_auto.
 * @return : ZSTD_ps_disable for levels below 10, ZSTD_ps_enable for level 10
 *           and above (auto mode), or @value itself otherwise. */
static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) {
    if (value == ZSTD_ps_auto) {
        /* auto mode : the compression level decides */
        return (cLevel >= 10) ? ZSTD_ps_enable : ZSTD_ps_disable;
    }
    return value;
}

/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged.
* If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */
static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) {
Expand All @@ -315,6 +324,8 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences);
cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize);
cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes,
cctxParams.compressionLevel);
assert(!ZSTD_checkCParams(cParams));
return cctxParams;
}
Expand Down Expand Up @@ -381,6 +392,7 @@ ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams,
cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences);
cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize);
cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel);
DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
}
Expand Down Expand Up @@ -613,6 +625,11 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
bounds.upperBound = ZSTD_BLOCKSIZE_MAX;
return bounds;

case ZSTD_c_searchForExternalRepcodes:
bounds.lowerBound = (int)ZSTD_ps_auto;
bounds.upperBound = (int)ZSTD_ps_disable;
return bounds;

default:
bounds.error = ERROR(parameter_unsupported);
return bounds;
Expand Down Expand Up @@ -680,6 +697,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
case ZSTD_c_prefetchCDictTables:
case ZSTD_c_enableMatchFinderFallback:
case ZSTD_c_maxBlockSize:
case ZSTD_c_searchForExternalRepcodes:
default:
return 0;
}
Expand Down Expand Up @@ -738,6 +756,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
case ZSTD_c_prefetchCDictTables:
case ZSTD_c_enableMatchFinderFallback:
case ZSTD_c_maxBlockSize:
case ZSTD_c_searchForExternalRepcodes:
break;

default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
Expand Down Expand Up @@ -981,6 +1000,11 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
CCtxParams->maxBlockSize = value;
return CCtxParams->maxBlockSize;

case ZSTD_c_searchForExternalRepcodes:
BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value);
CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value;
return CCtxParams->searchForExternalRepcodes;

default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
}
}
Expand Down Expand Up @@ -1122,6 +1146,9 @@ size_t ZSTD_CCtxParams_getParameter(
case ZSTD_c_maxBlockSize:
*value = (int)CCtxParams->maxBlockSize;
break;
case ZSTD_c_searchForExternalRepcodes:
*value = (int)CCtxParams->searchForExternalRepcodes;
break;
default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
}
return 0;
Expand Down Expand Up @@ -3184,7 +3211,8 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
ZSTD_copySequencesToSeqStoreExplicitBlockDelim(
zc, &seqPos,
zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs,
src, srcSize
src, srcSize,
zc->appliedParams.searchForExternalRepcodes
),
"Failed to copy external sequences to seqStore!"
);
Expand Down Expand Up @@ -6000,6 +6028,7 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);
params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences);
params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize);
params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel);

#ifdef ZSTD_MULTITHREAD
/* If external matchfinder is enabled, make sure to fail before checking job size (for consistency) */
Expand Down Expand Up @@ -6259,9 +6288,11 @@ size_t
ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
ZSTD_sequencePosition* seqPos,
const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
const void* src, size_t blockSize)
const void* src, size_t blockSize,
ZSTD_paramSwitch_e externalRepSearch)
{
U32 idx = seqPos->idx;
U32 const startIdx = idx;
BYTE const* ip = (BYTE const*)(src);
const BYTE* const iend = ip + blockSize;
repcodes_t updatedRepcodes;
Expand All @@ -6279,10 +6310,16 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) {
U32 const litLength = inSeqs[idx].litLength;
U32 const ll0 = (litLength == 0);
U32 const matchLength = inSeqs[idx].matchLength;
U32 const offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
U32 offBase;

if (externalRepSearch == ZSTD_ps_disable) {
offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset);
} else {
U32 const ll0 = (litLength == 0);
offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
}

DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
if (cctx->appliedParams.validateSequences) {
Expand All @@ -6296,6 +6333,29 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
ip += matchLength + litLength;
}

/* If we skipped repcode search while parsing, we need to update repcodes now */
assert(idx >= startIdx);
if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) {
U32* const rep = updatedRepcodes.rep;
U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */

if (lastSeqIdx >= startIdx + 2) {
rep[2] = inSeqs[lastSeqIdx - 2].offset;
rep[1] = inSeqs[lastSeqIdx - 1].offset;
rep[0] = inSeqs[lastSeqIdx].offset;
} else if (lastSeqIdx == startIdx + 1) {
rep[2] = rep[0];
rep[1] = inSeqs[lastSeqIdx - 1].offset;
rep[0] = inSeqs[lastSeqIdx].offset;
} else {
assert(lastSeqIdx == startIdx);
rep[2] = rep[1];
rep[1] = rep[0];
rep[0] = inSeqs[lastSeqIdx].offset;
}
}

ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t));

if (inSeqs[idx].litLength) {
Expand All @@ -6312,7 +6372,7 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
size_t
ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
const void* src, size_t blockSize)
const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch)
{
U32 idx = seqPos->idx;
U32 startPosInSequence = seqPos->posInSequence;
Expand All @@ -6324,6 +6384,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
U32 bytesAdjustment = 0;
U32 finalMatchSplit = 0;

/* TODO(embg) support fast parsing mode in noBlockDelim mode */
(void)externalRepSearch;

if (cctx->cdict) {
dictSize = cctx->cdict->dictContentSize;
} else if (cctx->prefixDict.dict) {
Expand Down Expand Up @@ -6431,7 +6494,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*

typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
const void* src, size_t blockSize);
const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
{
ZSTD_sequenceCopier sequenceCopier = NULL;
Expand Down Expand Up @@ -6539,7 +6602,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
ZSTD_resetSeqStore(&cctx->seqStore);
DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize);

additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize);
additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes);
FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy");
blockSize -= additionalByteAdjustment;

Expand Down
7 changes: 5 additions & 2 deletions lib/compress/zstd_compress_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,9 @@ struct ZSTD_CCtx_params_s {

/* Adjust the max block size*/
size_t maxBlockSize;

/* Controls repcode search in external sequence parsing */
ZSTD_paramSwitch_e searchForExternalRepcodes;
}; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */

#define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2))
Expand Down Expand Up @@ -1453,7 +1456,7 @@ size_t
ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
ZSTD_sequencePosition* seqPos,
const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
const void* src, size_t blockSize);
const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);

/* Returns the number of bytes to move the current read position back by.
* Only non-zero if we ended up splitting a sequence.
Expand All @@ -1470,6 +1473,6 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
size_t
ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
const void* src, size_t blockSize);
const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);

#endif /* ZSTD_COMPRESS_H */
40 changes: 32 additions & 8 deletions lib/zstd.h
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,8 @@ typedef enum {
ZSTD_c_experimentalParam15=1012,
ZSTD_c_experimentalParam16=1013,
ZSTD_c_experimentalParam17=1014,
ZSTD_c_experimentalParam18=1015
ZSTD_c_experimentalParam18=1015,
ZSTD_c_experimentalParam19=1016
} ZSTD_cParameter;

typedef struct {
Expand Down Expand Up @@ -2126,18 +2127,41 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
* documentation (below) before setting this parameter. */
#define ZSTD_c_enableMatchFinderFallback ZSTD_c_experimentalParam17

/* ZSTD_c_maxBlockSize
* Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB).
* The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default.
/* ZSTD_c_maxBlockSize
* Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB).
* The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default.
*
* This parameter can be used to set an upper bound on the blocksize
* that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper
* bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make
* compressBound() inaccurate). Only currently meant to be used for testing.
* This parameter can be used to set an upper bound on the blocksize
* that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper
* bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make
* compressBound() inaccurate). Only currently meant to be used for testing.
*
*/
#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18

/* ZSTD_c_searchForExternalRepcodes
* This parameter affects how zstd parses external sequences, such as sequences
* provided through the compressSequences() API or from an external matchfinder.
*
* If set to ZSTD_ps_enable, the library will check for repeated offsets in
* external sequences, even if those repcodes are not explicitly indicated in
* the "rep" field. Note that this is the only way to exploit repcode matches
* while using compressSequences() or an external matchfinder, since zstd
* currently ignores the "rep" field of external sequences.
*
* If set to ZSTD_ps_disable, the library will not exploit repeated offsets in
* external sequences, regardless of whether the "rep" field has been set. This
* reduces sequence compression overhead by about 25% while sacrificing some
* compression ratio.
*
* The default value is ZSTD_ps_auto, for which the library will enable/disable
* based on compression level (currently: disabled below level 10, enabled at
* level 10 and above).
*
* Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is
* set to ZSTD_sf_explicitBlockDelimiters. That may change in the future.
*/
#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19

/*! ZSTD_CCtx_getParameter() :
* Get the requested compression parameter value, selected by enum ZSTD_cParameter,
* and store it into int* value.
Expand Down
1 change: 1 addition & 0 deletions tests/fuzz/zstd_helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ void FUZZ_setRandomParameters(ZSTD_CCtx *cctx, size_t srcSize, FUZZ_dataProducer
setRand(cctx, ZSTD_c_prefetchCDictTables, 0, 2, producer);
setRand(cctx, ZSTD_c_maxBlockSize, ZSTD_BLOCKSIZE_MAX_MIN, ZSTD_BLOCKSIZE_MAX, producer);
setRand(cctx, ZSTD_c_validateSequences, 0, 1, producer);
setRand(cctx, ZSTD_c_searchForExternalRepcodes, 0, 2, producer);
if (FUZZ_dataProducer_uint32Range(producer, 0, 1) == 0) {
setRand(cctx, ZSTD_c_srcSizeHint, ZSTD_SRCSIZEHINT_MIN, 2 * srcSize, producer);
}
Expand Down

0 comments on commit 7f8189c

Please sign in to comment.