From 4b2b2d4f83ffeaac7708e44409fe34896a01a278 Mon Sep 17 00:00:00 2001
From: Eugene Kliuchnikov <eustas@google.com>
Date: Fri, 12 Apr 2019 13:57:42 +0200
Subject: Update (#749)

Update:

 * Bazel: fix MSVC configuration
 * C: common: extended documentation and helpers around distance codes
 * C: common: enable BROTLI_DCHECK in "debug" builds
 * C: common: fix implicit trailing zero in `kPrefixSuffix`
 * C: dec: fix possible bit reader discharge for "large-window" mode
 * C: dec: simplify distance decoding via lookup table
 * C: dec: reuse decoder state members memory via union with lookup table
 * C: dec: add decoder state diagram
 * C: enc: clarify access to static dictionary
 * C: enc: improve static dictionary hash
 * C: enc: add "stream offset" parameter for parallel encoding
 * C: enc: reorganize hasher; now Q2-Q3 require exactly 256KiB
           to avoid global TCMalloc lock
 * C: enc: fix rare access to uninitialized data in ring-buffer
 * C: enc: reorganize logging / checks in `write_bits.h`
 * Java: dec: add "large-window" support
 * Java: dec: improve speed
 * Java: dec: debug and 32-bit mode are now activated via system properties
 * Java: dec: demystify some state variables (use better names)
 * Dictionary generator: add single input mode
 * Java: dec: modernize tests
 * Bazel: js: pick working commit for closure rules
---
 research/dictionary_generator.cc | 144 ++++++++++++++++++++++++++++++++-------
 1 file changed, 119 insertions(+), 25 deletions(-)

(limited to 'research')
diff --git a/research/dictionary_generator.cc b/research/dictionary_generator.cc
index 00cfaba..dcdf2fa 100755
--- a/research/dictionary_generator.cc
+++ b/research/dictionary_generator.cc
@@ -1,3 +1,4 @@
+#include <climits>
 #include <cstddef>
 #include <cstdio>
 #include <cstring>
@@ -8,6 +9,16 @@
 #include "./durchschlag.h"
 #include "./sieve.h"
 
+/* This isn't a definitive list of "--foo" arguments, only those that take an
+ * additional "=#" integer parameter, like "--foo=20" or "--foo=32K".
+ */
+#define LONG_ARG_BLOCK_LEN "--block_len="
+#define LONG_ARG_SLICE_LEN "--slice_len="
+#define LONG_ARG_TARGET_DICT_LEN "--target_dict_len="
+#define LONG_ARG_MIN_SLICE_POP "--min_slice_pop="
+#define LONG_ARG_CHUNK_LEN "--chunk_len="
+#define LONG_ARG_OVERLAP_LEN "--overlap_len="
+
 #define METHOD_DM 0
 #define METHOD_SIEVE 1
 #define METHOD_DURCHSCHLAG 2
@@ -93,11 +104,20 @@ static void printHelp(const char* name) {
       "  --dsh      use 'durchschlag' engine (default)\n"
       "  --purify   rewrite samples; unique text parts are zeroed out\n"
       "  --sieve    use 'sieve' engine\n"
-      "  -b#        set block length for 'durchschlag'; default: 1024\n"
-      "  -s#        set slice length for 'distill', 'durchschlag', 'purify'\n"
+      "  -b#, --block_len=#\n"
+      "             set block length for 'durchschlag'; default: 1024\n"
+      "  -s#, --slice_len=#\n"
+      "             set slice length for 'distill', 'durchschlag', 'purify'\n"
       "             and 'sieve'; default: 16\n"
-      "  -t#        set target dictionary size (limit); default: 16K\n"
-      "  -u#        set minimum slice population (for rewrites); default: 2\n"
+      "  -t#, --target_dict_len=#\n"
+      "             set target dictionary length (limit); default: 16K\n"
+      "  -u#, --min_slice_pop=#\n"
+      "             set minimum slice population (for rewrites); default: 2\n"
+      "  -c#, --chunk_len=#\n"
+      "             if positive, samples are cut into chunks of this length;\n"
+      "             default: 0; cannot mix with 'rewrite samples'\n"
+      "  -o#, --overlap_len=#\n"
+      "             set chunk overlap length; default 0\n"
       "# is a decimal number with optional k/K/m/M suffix.\n"
       "WARNING: 'distill' and 'purify' will overwrite original samples!\n"
       "         Completely unique samples might become empty files.\n\n");
@@ -110,6 +130,8 @@ int main(int argc, char const* argv[]) {
   size_t targetSize = 16 << 10;
   size_t blockSize = 1024;
   size_t minimumPopulation = 2;
+  size_t chunkLen = 0;
+  size_t overlapLen = 0;
 
   std::vector<uint8_t> data;
   std::vector<size_t> sizes;
@@ -119,66 +141,115 @@ int main(int argc, char const* argv[]) {
     if (argv[i] == nullptr) {
       continue;
     }
+
     if (argv[i][0] == '-') {
-      if (argv[i][1] == '-') {
+      char arg1 = argv[i][1];
+      const char* arg2 = arg1 ? &argv[i][2] : nullptr;
+      if (arg1 == '-') {
         if (dictionaryArg != -1) {
           fprintf(stderr,
               "Method should be specified before dictionary / sample '%s'\n",
               argv[i]);
           exit(1);
         }
-        if (std::strcmp("--sieve", argv[i]) == 0) {
+
+        /* Look for "--long_arg" via exact match. */
+        if (std::strcmp(argv[i], "--sieve") == 0) {
           method = METHOD_SIEVE;
           continue;
         }
-        if (std::strcmp("--dm", argv[i]) == 0) {
+        if (std::strcmp(argv[i], "--dm") == 0) {
           method = METHOD_DM;
           continue;
         }
-        if (std::strcmp("--dsh", argv[i]) == 0) {
+        if (std::strcmp(argv[i], "--dsh") == 0) {
           method = METHOD_DURCHSCHLAG;
           continue;
         }
-        if (std::strcmp("--distill", argv[i]) == 0) {
+        if (std::strcmp(argv[i], "--distill") == 0) {
           method = METHOD_DISTILL;
           continue;
         }
-        if (std::strcmp("--purify", argv[i]) == 0) {
+        if (std::strcmp(argv[i], "--purify") == 0) {
           method = METHOD_PURIFY;
           continue;
         }
-        printHelp(fileName(argv[0]));
-        fprintf(stderr, "Invalid option '%s'\n", argv[i]);
-        exit(1);
+
+        /* Look for "--long_arg=#" via prefix match. */
+        if (std::strncmp(argv[i], LONG_ARG_BLOCK_LEN,
+              std::strlen(LONG_ARG_BLOCK_LEN)) == 0) {
+          arg1 = 'b';
+          arg2 = &argv[i][std::strlen(LONG_ARG_BLOCK_LEN)];
+        } else if (std::strncmp(argv[i], LONG_ARG_SLICE_LEN,
+              std::strlen(LONG_ARG_SLICE_LEN)) == 0) {
+          arg1 = 's';
+          arg2 = &argv[i][std::strlen(LONG_ARG_SLICE_LEN)];
+        } else if (std::strncmp(argv[i], LONG_ARG_TARGET_DICT_LEN,
+              std::strlen(LONG_ARG_TARGET_DICT_LEN)) == 0) {
+          arg1 = 't';
+          arg2 = &argv[i][std::strlen(LONG_ARG_TARGET_DICT_LEN)];
+        } else if (std::strncmp(argv[i], LONG_ARG_MIN_SLICE_POP,
+              std::strlen(LONG_ARG_MIN_SLICE_POP)) == 0) {
+          arg1 = 'u';
+          arg2 = &argv[i][std::strlen(LONG_ARG_MIN_SLICE_POP)];
+        } else if (std::strncmp(argv[i], LONG_ARG_CHUNK_LEN,
+              std::strlen(LONG_ARG_CHUNK_LEN)) == 0) {
+          arg1 = 'c';
+          arg2 = &argv[i][std::strlen(LONG_ARG_CHUNK_LEN)];
+        } else if (std::strncmp(argv[i], LONG_ARG_OVERLAP_LEN,
+              std::strlen(LONG_ARG_OVERLAP_LEN)) == 0) {
+          arg1 = 'o';
+          arg2 = &argv[i][std::strlen(LONG_ARG_OVERLAP_LEN)];
+        } else {
+          printHelp(fileName(argv[0]));
+          fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+          exit(1);
+        }
       }
-      if (argv[i][1] == 'b') {
-        blockSize = readInt(&argv[i][2]);
+
+      /* Look for "-f" short args or "--foo=#" long args. */
+      if (arg1 == 'b') {
+        blockSize = readInt(arg2);
         if (blockSize < 16 || blockSize > 65536) {
           printHelp(fileName(argv[0]));
           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
           exit(1);
         }
-      } else if (argv[i][1] == 's') {
-        sliceLen = readInt(&argv[i][2]);
+      } else if (arg1 == 's') {
+        sliceLen = readInt(arg2);
         if (sliceLen < 4 || sliceLen > 256) {
           printHelp(fileName(argv[0]));
           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
           exit(1);
         }
-      } else if (argv[i][1] == 't') {
-        targetSize = readInt(&argv[i][2]);
+      } else if (arg1 == 't') {
+        targetSize = readInt(arg2);
         if (targetSize < 256 || targetSize > (1 << 25)) {
           printHelp(fileName(argv[0]));
           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
           exit(1);
         }
-      } else if (argv[i][1] == 'u') {
-        minimumPopulation = readInt(&argv[i][2]);
+      } else if (arg1 == 'u') {
+        minimumPopulation = readInt(arg2);
         if (minimumPopulation < 256 || minimumPopulation > 65536) {
           printHelp(fileName(argv[0]));
           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
           exit(1);
         }
+      } else if (arg1 == 'c') {
+        chunkLen = readInt(arg2);
+        if (chunkLen < 0 || chunkLen > INT_MAX) {
+          printHelp(fileName(argv[0]));
+          fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+          exit(1);
+        }
+      } else if (arg1 == 'o') {
+        overlapLen = readInt(arg2);
+        if (overlapLen < 0 || overlapLen > INT_MAX) {
+          printHelp(fileName(argv[0]));
+          fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+          exit(1);
+        }
       } else {
         printHelp(fileName(argv[0]));
         fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
@@ -186,21 +257,44 @@ int main(int argc, char const* argv[]) {
       }
       continue;
     }
+
     if (dictionaryArg == -1) {
       if (method != METHOD_DISTILL && method != METHOD_PURIFY) {
         dictionaryArg = i;
         continue;
       }
     }
+
     std::string content = readFile(argv[i]);
-    data.insert(data.end(), content.begin(), content.end());
-    total += content.size();
-    pathArgs.push_back(i);
-    sizes.push_back(content.size());
+    if (chunkLen == 0) {
+      pathArgs.push_back(i);
+      data.insert(data.end(), content.begin(), content.end());
+      total += content.size();
+      sizes.push_back(content.size());
+      continue;
+    } else if (chunkLen <= overlapLen) {
+      printHelp(fileName(argv[0]));
+      fprintf(stderr, "Invalid chunkLen - overlapLen combination\n");
+      exit(1);
+    }
+    for (size_t chunkStart = 0;
+        chunkStart < content.size();
+        chunkStart += chunkLen - overlapLen) {
+      std::string chunk = content.substr(chunkStart, chunkLen);
+      data.insert(data.end(), chunk.begin(), chunk.end());
+      total += chunk.size();
+      sizes.push_back(chunk.size());
+    }
   }
+
   bool wantDictionary = (dictionaryArg == -1);
   if (method == METHOD_DISTILL || method == METHOD_PURIFY) {
     wantDictionary = false;
+    if (chunkLen != 0) {
+      printHelp(fileName(argv[0]));
+      fprintf(stderr, "Cannot mix 'rewrite samples' with positive chunk_len\n");
+      exit(1);
+    }
   }
   if (wantDictionary || total == 0) {
     printHelp(fileName(argv[0]));
-- 
cgit v1.1