New feature: "Large Window Brotli" (#640)

* New feature: "Large Window Brotli" By setting special encoder/decoder flag it is now possible to extend LZ-window up to 30 bits; though produced stream will not be RFC7932 compliant. Added new dictionary generator - "DSH". It combines speed of "Sieve" and quality of "DM". Plus utilities to prepare train corpora (remove unique strings). Improved compression ratio: now two sub-blocks could be stitched: the last copy command could be extended to span the next sub-block. Fixed compression ineffectiveness caused by floating numbers rounding and wrong cost heuristic. Other C changes: - combined / moved `context.h` to `common` - moved transforms to `common` - unified some aspects of code formatting - added an abstraction for encoder (static) dictionary - moved default allocator/deallocator functions to `common` brotli CLI: - window size is auto-adjusted if not specified explicitly Java: - added "eager" decoding both to JNI wrapper and pure decoder - huge speed-up of `DictionaryData` initialization * Add dictionaryless compressed dictionary * Fix `sources.lst` * Fix `sources.lst` and add a note that `libtool` is also required. * Update setup.py * Fix `EagerStreamTest` * Fix BUILD file * Add missing `libdivsufsort` dependency * Fix "unused parameter" warning.
author: Eugene Kliuchnikov <eustas@google.com> 2018-02-26 09:04:36 -0500
committer: GitHub <noreply@github.com> 2018-02-26 09:04:36 -0500
commit: 35e69fc7cf9421ab04ffc9d52cb36d07fa12984a (patch)
tree: a1ed614391936d455da2b0610ef8e8caf88b4289 /research/dictionary_generator.cc
parent: 3af18990f50d8f040038aaa08c41f5d27d62efb5 (diff)
download: brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.zip
brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.gz
brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.bz2
1 files changed, 99 insertions, 20 deletions
diff --git a/research/dictionary_generator.cc b/research/dictionary_generator.cc
index b3ee89c..00cfaba 100755
--- a/research/dictionary_generator.cc
+++ b/research/dictionary_generator.cc
@@ -1,15 +1,20 @@
+#include <cstddef>
 #include <cstdio>
 #include <cstring>
 #include <fstream>
 #include <vector>
 
 #include "./deorummolae.h"
+#include "./durchschlag.h"
 #include "./sieve.h"
 
 #define METHOD_DM 0
 #define METHOD_SIEVE 1
+#define METHOD_DURCHSCHLAG 2
+#define METHOD_DISTILL 3
+#define METHOD_PURIFY 4
 
-size_t readInt(const char* str) {
+static size_t readInt(const char* str) {
   size_t result = 0;
   if (str[0] == 0 || str[0] == '0') {
     return 0;
@@ -51,10 +56,25 @@ static std::string readFile(const std::string& path) {
 
 static void writeFile(const char* file, const std::string& content) {
   std::ofstream outfile(file, std::ofstream::binary);
-  outfile.write(content.c_str(), content.size());
+  outfile.write(content.c_str(), static_cast<std::streamsize>(content.size()));
   outfile.close();
 }
 
+static void writeSamples(char const* argv[], const std::vector<int>& pathArgs,
+    const std::vector<size_t>& sizes, const uint8_t* data) {
+  size_t offset = 0;
+  for (size_t i = 0; i < pathArgs.size(); ++i) {
+    int j = pathArgs[i];
+    const char* file = argv[j];
+    size_t sampleSize = sizes[i];
+    std::ofstream outfile(file, std::ofstream::binary);
+    outfile.write(reinterpret_cast<const char*>(data + offset),
+        static_cast<std::streamsize>(sampleSize));
+    outfile.close();
+    offset += sampleSize;
+  }
+}
+
 /* Returns "base file name" or its tail, if it contains '/' or '\'. */
 static const char* fileName(const char* path) {
   const char* separator_position = strrchr(path, '/');
@@ -68,21 +88,32 @@ static void printHelp(const char* name) {
   fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
   fprintf(stderr,
       "Options:\n"
-      "  --dm     use 'deorummolae' engine\n"
-      "  --sieve  use 'sieve' engine (default)\n"
-      "  -t#      set target dictionary size (limit); default: 16K\n"
-      "  -s#      set slize length for 'sieve'; default: 33\n"
-      "# is a decimal number with optional k/K/m/M suffix.\n\n");
+      "  --dm       use 'deorummolae' engine\n"
+      "  --distill  rewrite samples; unique text parts are removed\n"
+      "  --dsh      use 'durchschlag' engine (default)\n"
+      "  --purify   rewrite samples; unique text parts are zeroed out\n"
+      "  --sieve    use 'sieve' engine\n"
+      "  -b#        set block length for 'durchschlag'; default: 1024\n"
+      "  -s#        set slice length for 'distill', 'durchschlag', 'purify'\n"
+      "             and 'sieve'; default: 16\n"
+      "  -t#        set target dictionary size (limit); default: 16K\n"
+      "  -u#        set minimum slice population (for rewrites); default: 2\n"
+      "# is a decimal number with optional k/K/m/M suffix.\n"
+      "WARNING: 'distill' and 'purify' will overwrite original samples!\n"
+      "         Completely unique samples might become empty files.\n\n");
 }
 
 int main(int argc, char const* argv[]) {
   int dictionaryArg = -1;
-  int method = METHOD_SIEVE;
-  int sieveSliceLen = 33;
-  int targetSize = 16 << 10;
+  int method = METHOD_DURCHSCHLAG;
+  size_t sliceLen = 16;
+  size_t targetSize = 16 << 10;
+  size_t blockSize = 1024;
+  size_t minimumPopulation = 2;
 
   std::vector<uint8_t> data;
   std::vector<size_t> sizes;
+  std::vector<int> pathArgs;
   size_t total = 0;
   for (int i = 1; i < argc; ++i) {
     if (argv[i] == nullptr) {
@@ -90,6 +121,12 @@ int main(int argc, char const* argv[]) {
     }
     if (argv[i][0] == '-') {
       if (argv[i][1] == '-') {
+        if (dictionaryArg != -1) {
+          fprintf(stderr,
+              "Method should be specified before dictionary / sample '%s'\n",
+              argv[i]);
+          exit(1);
+        }
         if (std::strcmp("--sieve", argv[i]) == 0) {
           method = METHOD_SIEVE;
           continue;
@@ -98,13 +135,32 @@ int main(int argc, char const* argv[]) {
           method = METHOD_DM;
           continue;
         }
+        if (std::strcmp("--dsh", argv[i]) == 0) {
+          method = METHOD_DURCHSCHLAG;
+          continue;
+        }
+        if (std::strcmp("--distill", argv[i]) == 0) {
+          method = METHOD_DISTILL;
+          continue;
+        }
+        if (std::strcmp("--purify", argv[i]) == 0) {
+          method = METHOD_PURIFY;
+          continue;
+        }
         printHelp(fileName(argv[0]));
         fprintf(stderr, "Invalid option '%s'\n", argv[i]);
         exit(1);
       }
-      if (argv[i][1] == 's') {
-        sieveSliceLen = readInt(&argv[i][2]);
-        if (sieveSliceLen < 4 || sieveSliceLen > 256) {
+      if (argv[i][1] == 'b') {
+        blockSize = readInt(&argv[i][2]);
+        if (blockSize < 16 || blockSize > 65536) {
+          printHelp(fileName(argv[0]));
+          fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+          exit(1);
+        }
+      } else if (argv[i][1] == 's') {
+        sliceLen = readInt(&argv[i][2]);
+        if (sliceLen < 4 || sliceLen > 256) {
           printHelp(fileName(argv[0]));
           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
           exit(1);
@@ -116,6 +172,13 @@ int main(int argc, char const* argv[]) {
           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
           exit(1);
         }
+      } else if (argv[i][1] == 'u') {
+        minimumPopulation = readInt(&argv[i][2]);
+        if (minimumPopulation < 256 || minimumPopulation > 65536) {
+          printHelp(fileName(argv[0]));
+          fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+          exit(1);
+        }
       } else {
         printHelp(fileName(argv[0]));
         fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
@@ -124,26 +187,42 @@ int main(int argc, char const* argv[]) {
       continue;
     }
     if (dictionaryArg == -1) {
-      dictionaryArg = i;
-      continue;
+      if (method != METHOD_DISTILL && method != METHOD_PURIFY) {
+        dictionaryArg = i;
+        continue;
+      }
     }
     std::string content = readFile(argv[i]);
     data.insert(data.end(), content.begin(), content.end());
     total += content.size();
+    pathArgs.push_back(i);
     sizes.push_back(content.size());
   }
-  if (dictionaryArg == -1 || total == 0) {
+  bool wantDictionary = (dictionaryArg == -1);
+  if (method == METHOD_DISTILL || method == METHOD_PURIFY) {
+    wantDictionary = false;
+  }
+  if (wantDictionary || total == 0) {
     printHelp(fileName(argv[0]));
     fprintf(stderr, "Not enough arguments\n");
     exit(1);
   }
 
   if (method == METHOD_SIEVE) {
-    writeFile(argv[dictionaryArg],
-        sieve_generate(targetSize, sieveSliceLen, sizes, data.data()));
+    writeFile(argv[dictionaryArg], sieve_generate(
+        targetSize, sliceLen, sizes, data.data()));
   } else if (method == METHOD_DM) {
-    writeFile(argv[dictionaryArg],
-        DM_generate(targetSize, sizes, data.data()));
+    writeFile(argv[dictionaryArg], DM_generate(
+        targetSize, sizes, data.data()));
+  } else if (method == METHOD_DURCHSCHLAG) {
+    writeFile(argv[dictionaryArg], durchschlag_generate(
+        targetSize, sliceLen, blockSize, sizes, data.data()));
+  } else if (method == METHOD_DISTILL) {
+    durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data());
+    writeSamples(argv, pathArgs, sizes, data.data());
+  } else if (method == METHOD_PURIFY) {
+    durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data());
+    writeSamples(argv, pathArgs, sizes, data.data());
   } else {
     printHelp(fileName(argv[0]));
     fprintf(stderr, "Unknown generator\n");
author	Eugene Kliuchnikov <eustas@google.com>	2018-02-26 09:04:36 -0500
committer	GitHub <noreply@github.com>	2018-02-26 09:04:36 -0500
commit	35e69fc7cf9421ab04ffc9d52cb36d07fa12984a (patch)
tree	a1ed614391936d455da2b0610ef8e8caf88b4289 /research/dictionary_generator.cc
parent	3af18990f50d8f040038aaa08c41f5d27d62efb5 (diff)
download	brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.zip brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.gz brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.bz2