
Commit

Merge branch 'dev' of github.com:facebook/zstd into dev
Cyan4973 committed May 2, 2017
2 parents b184589 + 072484a commit 606c04c
Showing 7 changed files with 176 additions and 86 deletions.
45 changes: 34 additions & 11 deletions lib/dictBuilder/cover.c
@@ -234,10 +234,22 @@ static size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
* Returns 1 if the dmer at lp is greater than the dmer at rp.
*/
static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
const U32 lhs = *(const U32 *)lp;
const U32 rhs = *(const U32 *)rp;
U32 const lhs = *(U32 const *)lp;
U32 const rhs = *(U32 const *)rp;
return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
}
/**
* Faster version for d <= 8.
*/
static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);
U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask;
U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask;
if (lhs < rhs) {
return -1;
}
return (lhs > rhs);
}
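
(The sketch below is an illustrative aside, not part of the patch: it shows how the `d <= 8` fast path above reduces a dmer comparison from `memcmp` to masked integer comparisons. Plain `uint64_t` and a hard-coded value stand in for zstd's `U64` and `MEM_readLE64()`; `d = 6` is an arbitrary example.)

```c
#include <stdio.h>
#include <stdint.h>

int main(void) {
    unsigned d = 6;  /* any d in [1, 8]; d == 8 would select all 64 bits */
    /* Same mask construction as COVER_cmp8: keep only the low 8*d bits,
     * i.e. the first d bytes of a little-endian 64-bit load. */
    uint64_t mask = (d == 8) ? (uint64_t)-1 : (((uint64_t)1 << (8 * d)) - 1);
    uint64_t raw  = 0x1122334455667788ULL;  /* pretend MEM_readLE64(samples + pos) */
    printf("mask = 0x%016llx, masked dmer = 0x%016llx\n",
           (unsigned long long)mask, (unsigned long long)(raw & mask));
    /* prints mask = 0x0000ffffffffffff, masked dmer = 0x0000334455667788 */
    return 0;
}
```

Because this path always loads 8 bytes regardless of `d`, the later changes in this file presumably switch the size check and `suffixSize` computation from `d` to `MAX(d, sizeof(U64))` so that every indexed position has 8 readable bytes.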

/**
* Same as COVER_cmp() except ties are broken by pointer value
@@ -251,6 +263,16 @@ static int COVER_strict_cmp(const void *lp, const void *rp) {
}
return result;
}
/**
* Faster version for d <= 8.
*/
static int COVER_strict_cmp8(const void *lp, const void *rp) {
int result = COVER_cmp8(g_ctx, lp, rp);
if (result == 0) {
result = lp < rp ? -1 : 1;
}
return result;
}

/**
* Returns the first pointer in [first, last) whose element does not compare
@@ -506,7 +528,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
const BYTE *const samples = (const BYTE *)samplesBuffer;
const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
/* Checks */
if (totalSamplesSize < d ||
if (totalSamplesSize < MAX(d, sizeof(U64)) ||
totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
DISPLAYLEVEL(1, "Total samples size is too large, maximum size is %u MB\n",
(COVER_MAX_SAMPLES_SIZE >> 20));
@@ -520,7 +542,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
ctx->samplesSizes = samplesSizes;
ctx->nbSamples = nbSamples;
/* Partial suffix array */
ctx->suffixSize = totalSamplesSize - d + 1;
ctx->suffixSize = totalSamplesSize - MAX(d, sizeof(U64)) + 1;
ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
/* Maps index to the dmerID */
ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
@@ -554,7 +576,8 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
}
/* qsort doesn't take an opaque pointer, so pass as a global */
g_ctx = ctx;
qsort(ctx->suffix, ctx->suffixSize, sizeof(U32), &COVER_strict_cmp);
qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
(ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
}
DISPLAYLEVEL(2, "Computing frequencies\n");
/* For each dmer group (group of positions with the same first d bytes):
@@ -564,8 +587,8 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
* 2. We calculate how many samples the dmer occurs in and save it in
* freqs[dmerId].
*/
COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx, &COVER_cmp,
&COVER_group);
COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
(ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
ctx->freqs = ctx->suffix;
ctx->suffix = NULL;
return 1;
@@ -916,10 +939,10 @@ ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer,
/* constants */
const unsigned nbThreads = parameters->nbThreads;
const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
const unsigned kMaxD = parameters->d == 0 ? 16 : parameters->d;
const unsigned kMinK = parameters->k == 0 ? kMaxD : parameters->k;
const unsigned kMaxK = parameters->k == 0 ? 2048 : parameters->k;
const unsigned kSteps = parameters->steps == 0 ? 32 : parameters->steps;
const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
const unsigned kIterations =
(1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
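
(Worked out as an aside: with the new defaults above, d tried at 6 and 8 and k swept over [50, 2000] in 40 steps, this gives kStepSize = max((2000 - 50) / 40, 1) = 48 and kIterations = (1 + (8 - 6) / 2) * (1 + (2000 - 50) / 48) = 2 * 41 = 82 candidate parameter sets, assuming, as the (kMaxD - kMinD) / 2 term suggests, that the surrounding loop advances d in steps of 2.)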
2 changes: 1 addition & 1 deletion lib/dictBuilder/zdict.h
@@ -88,7 +88,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dict

/*! COVER_params_t :
For all values 0 means default.
kMin and d are the only required parameters.
k and d are the only required parameters.
*/
typedef struct {
unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
2 changes: 1 addition & 1 deletion programs/Makefile
@@ -145,7 +145,7 @@ zstd-nogz : ZLIB_MSG := $(NO_ZLIB_MSG)
zstd-nogz : LZMA_MSG := $(NO_LZMA_MSG)
xzstd : CPPFLAGS += $(ZLIBCPP) $(LZMACPP)
xzstd : LDFLAGS += $(ZLIBLD) $(LZMALD)
xzstd : LZ4_MSG := $(NO_LZMA_MSG)
xzstd : LZ4_MSG := $(NO_LZ4_MSG)
zstd4 : CPPFLAGS += $(ZLIBCPP) $(LZ4CPP)
zstd4 : LDFLAGS += $(ZLIBLD) $(LZ4LD)
zstd4 : LZMA_MSG := $(NO_LZMA_MSG)
46 changes: 27 additions & 19 deletions programs/zstd.1
@@ -1,5 +1,5 @@
.
.TH "ZSTD" "1" "April 2017" "zstd 1.1.5" "User Commands"
.TH "ZSTD" "1" "May 2017" "zstd 1.2.0" "User Commands"
.
.SH "NAME"
\fBzstd\fR \- zstd, zstdmt, unzstd, zstdcat \- Compress or decompress \.zst files
@@ -168,49 +168,57 @@ All arguments after \fB\-\-\fR are treated as files
.
.TP
\fB\-\-train FILEs\fR
use FILEs as training set to create a dictionary\. The training set should contain a lot of small files (> 100), and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)\.
Use FILEs as training set to create a dictionary\. The training set should contain a lot of small files (> 100), and weight typically 100x the target dictionary size (for example, 10 MB for a 100 KB dictionary)\.
.
.IP
Supports multithreading if \fBzstd\fR is compiled with threading support\. Additional parameters can be specified with \fB\-\-train\-cover\fR\. The legacy dictionary builder can be accessed with \fB\-\-train\-legacy\fR\. Equivalent to \fB\-\-train\-cover=d=8,steps=4\fR\.
.
.TP
\fB\-o file\fR
dictionary saved into \fBfile\fR (default name: dictionary)
Dictionary saved into \fBfile\fR (default name: dictionary)\.
.
.TP
\fB\-\-maxdict=#\fR
limit dictionary to specified size (default : (112640)
Limit dictionary to specified size (default: 112640)\.
.
.TP
\fB\-\-dictID=#\fR
A dictionary ID is a locally unique ID that a decoder can use to verify it is using the right dictionary\. By default, zstd will create a 4\-bytes random number ID\. It\'s possible to give a precise number instead\. Short numbers have an advantage : an ID < 256 will only need 1 byte in the compressed frame header, and an ID < 65536 will only need 2 bytes\. This compares favorably to 4 bytes default\. However, it\'s up to the dictionary manager to not assign twice the same ID to 2 different dictionaries\.
.
.TP
\fB\-s#\fR
dictionary selectivity level (default: 9) the smaller the value, the denser the dictionary, improving its efficiency but reducing its possible maximum size\.
\fB\-\-train\-cover[=k#,d=#,steps=#]\fR
Select parameters for the default dictionary builder algorithm named cover\. If \fId\fR is not specified, then it tries \fId\fR = 6 and \fId\fR = 8\. If \fIk\fR is not specified, then it tries \fIsteps\fR values in the range [50, 2000]\. If \fIsteps\fR is not specified, then the default value of 40 is used\. Requires that \fId\fR <= \fIk\fR\.
.
.TP
\fB\-\-cover=k#,d=#\fR
Use alternate dictionary builder algorithm named cover with parameters \fIk\fR and \fId\fR with \fId\fR <= \fIk\fR\. Selects segments of size \fIk\fR with the highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of of size \fId\fR\. Generally \fId\fR should be in the range [6, 24]\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [32, 2048]\.
.IP
Selects segments of size \fIk\fR with highest score to put in the dictionary\. The score of a segment is computed by the sum of the frequencies of all the subsegments of size \fId\fR\. Generally \fId\fR should be in the range [6, 8], occasionally up to 16, but the algorithm will run faster with d <= \fI8\fR\. Good values for \fIk\fR vary widely based on the input data, but a safe range is [2 * \fId\fR, 2000]\. Supports multithreading if \fBzstd\fR is compiled with threading support\.
.
.br
Example: \fB\-\-train \-\-cover=k=64,d=8 FILEs\fR\.
.IP
Examples:
.
.TP
\fB\-\-optimize\-cover[=steps=#,k=#,d=#]\fR
If \fIsteps\fR is not specified, the default value of 32 is used\. If \fIk\fR is not specified, the \fIk\fR values in [16, 2048] are checked for each value of \fId\fR\. If \fId\fR is not specified, the values checked are [6, 8, \.\.\., 16]\.
.IP
\fBzstd \-\-train\-cover FILEs\fR
.
.IP
\fBzstd \-\-train\-cover=k=50,d=8 FILEs\fR
.
.IP
Runs the cover dictionary builder for each parameter set and saves the optimal parameters and dictionary\. Prints optimal parameters and writes optimal dictionary into output file\. Supports multithreading if \fBzstd\fR is compiled with threading support\.
\fBzstd \-\-train\-cover=d=8,steps=500 FILEs\fR
.
.IP
The parameter \fIk\fR is more sensitive than \fId\fR, and is faster to optimize over\. Suggested use is to run with a \fIsteps\fR <= 32 with neither \fIk\fR nor \fId\fR set\. Once it completes, use the value of \fId\fR it selects with a higher \fIsteps\fR (in the range [256, 1024])\.
\fBzstd \-\-train\-cover=k=50 FILEs\fR
.
.TP
\fB\-\-train\-legacy[=selectivity=#]\fR
Use legacy dictionary builder algorithm with the given dictionary \fIselectivity\fR (default: 9)\. The smaller the \fIselectivity\fR value, the denser the dictionary, improving its efficiency but reducing its possible maximum size\. \fB\-\-train\-legacy=s=#\fR is also accepted\.
.
.IP
Examples :
Examples:
.
.IP
\fBzstd \-\-train \-\-optimize\-cover FILEs\fR
\fBzstd \-\-train\-legacy FILEs\fR
.
.IP
\fBzstd \-\-train \-\-optimize\-cover=d=d,steps=512 FILEs\fR
\fBzstd \-\-train\-legacy=selectivity=8 FILEs\fR
.
.SH "BENCHMARK"
.
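
(An illustrative aside on the dictionary-training options documented in the man page above: `-o`, `--maxdict` and `--dictID` combine with `--train` in a single invocation. The file names, size and ID below are arbitrary examples, not values taken from the patch.)

`zstd --train FILEs -o my.dict --maxdict=65536 --dictID=37`

This trains a dictionary from FILEs with the default cover builder, caps its size at 64 KiB, writes it to `my.dict`, and stamps it with ID 37, which a decoder can later use to check that the matching dictionary has been loaded.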
73 changes: 40 additions & 33 deletions programs/zstd.1.md
@@ -158,14 +158,19 @@ It will improve compression ratio of small files.
Typical gains range from 10% (at 64KB) to x5 better (at <1KB).

* `--train FILEs`:
use FILEs as training set to create a dictionary.
Use FILEs as training set to create a dictionary.
The training set should contain a lot of small files (> 100),
and weight typically 100x the target dictionary size
(for example, 10 MB for a 100 KB dictionary).

Supports multithreading if `zstd` is compiled with threading support.
Additional parameters can be specified with `--train-cover`.
The legacy dictionary builder can be accessed with `--train-legacy`.
Equivalent to `--train-cover=d=8,steps=4`.
* `-o file`:
dictionary saved into `file` (default name: dictionary)
Dictionary saved into `file` (default name: dictionary).
* `--maxdict=#`:
limit dictionary to specified size (default : (112640)
Limit dictionary to specified size (default: 112640).
* `--dictID=#`:
A dictionary ID is a locally unique ID that a decoder can use to verify it is
using the right dictionary.
@@ -176,42 +181,44 @@ Typical gains range from 10% (at 64KB) to x5 better (at <1KB).
This compares favorably to 4 bytes default.
However, it's up to the dictionary manager to not assign twice the same ID to
2 different dictionaries.
* `-s#`:
dictionary selectivity level (default: 9)
the smaller the value, the denser the dictionary,
improving its efficiency but reducing its possible maximum size.
* `--cover=k#,d=#`:
Use alternate dictionary builder algorithm named cover with parameters
_k_ and _d_ with _d_ <= _k_.
Selects segments of size _k_ with the highest score to put in the dictionary.
* `--train-cover[=k#,d=#,steps=#]`:
Select parameters for the default dictionary builder algorithm named cover.
If _d_ is not specified, then it tries _d_ = 6 and _d_ = 8.
If _k_ is not specified, then it tries _steps_ values in the range [50, 2000].
If _steps_ is not specified, then the default value of 40 is used.
Requires that _d_ <= _k_.

Selects segments of size _k_ with highest score to put in the dictionary.
The score of a segment is computed by the sum of the frequencies of all the
subsegments of of size _d_.
Generally _d_ should be in the range [6, 24].
Good values for _k_ vary widely based on the input data,
but a safe range is [32, 2048].<br />
Example: `--train --cover=k=64,d=8 FILEs`.

* `--optimize-cover[=steps=#,k=#,d=#]`:
If _steps_ is not specified, the default value of 32 is used.
If _k_ is not specified, the _k_ values in [16, 2048] are checked for each
value of _d_.
If _d_ is not specified, the values checked are [6, 8, ..., 16].

Runs the cover dictionary builder for each parameter set
and saves the optimal parameters and dictionary.
Prints optimal parameters and writes optimal dictionary into output file.
subsegments of size _d_.
Generally _d_ should be in the range [6, 8], occasionally up to 16, but the
algorithm will run faster with d <= _8_.
Good values for _k_ vary widely based on the input data, but a safe range is
[2 * _d_, 2000].
Supports multithreading if `zstd` is compiled with threading support.

The parameter _k_ is more sensitive than _d_, and is faster to optimize over.
Suggested use is to run with a _steps_ <= 32 with neither _k_ nor _d_ set.
Once it completes, use the value of _d_ it selects with a higher _steps_
(in the range [256, 1024]).
Examples:

`zstd --train-cover FILEs`

`zstd --train-cover=k=50,d=8 FILEs`

`zstd --train-cover=d=8,steps=500 FILEs`

`zstd --train-cover=k=50 FILEs`

* `--train-legacy[=selectivity=#]`:
Use legacy dictionary builder algorithm with the given dictionary
_selectivity_ (default: 9).
The smaller the _selectivity_ value, the denser the dictionary,
improving its efficiency but reducing its possible maximum size.
`--train-legacy=s=#` is also accepted.

Examples :
Examples:

`zstd --train --optimize-cover FILEs`
`zstd --train-legacy FILEs`

`zstd --train --optimize-cover=d=d,steps=512 FILEs`
`zstd --train-legacy=selectivity=8 FILEs`


BENCHMARK
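
(One more illustrative aside, not taken from the zstd sources: the `--train-cover` text above defines a segment's score as the sum of the frequencies of its subsegments of size d. The sketch below assumes a hypothetical `freqOf()` lookup standing in for the `freqs[dmerId]` table that cover.c builds.)

```c
#include <stdio.h>
#include <stddef.h>
#include <string.h>

/* Hypothetical frequency lookup: how many samples contain this d-byte dmer.
 * In cover.c this information is precomputed into the freqs[] array. */
typedef size_t (*dmer_freq_fn)(const unsigned char *dmer, size_t d);

/* Score of one k-byte candidate segment: the sum of the frequencies of its
 * (k - d + 1) subsegments of size d, as described for --train-cover. */
static size_t segment_score(const unsigned char *segment, size_t k, size_t d,
                            dmer_freq_fn freqOf) {
    size_t score = 0, i;
    for (i = 0; i + d <= k; i++)
        score += freqOf(segment + i, d);
    return score;
}

/* Toy frequency function: pretend dmers starting with 'a' occur in 3 samples,
 * all others in 1. */
static size_t toy_freq(const unsigned char *dmer, size_t d) {
    (void)d;
    return (dmer[0] == 'a') ? 3 : 1;
}

int main(void) {
    const unsigned char seg[] = "abracadabra";      /* k = 11 */
    /* d = 4 -> 8 subsegments; 4 of them start with 'a' -> score = 4*3 + 4*1 = 16 */
    printf("score = %zu\n",
           segment_score(seg, strlen((const char *)seg), 4, toy_freq));
    return 0;
}
```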
(Diffs for the remaining 2 changed files are not shown.)
