New feature: "Large Window Brotli" (#640)

* New feature: "Large Window Brotli" By setting special encoder/decoder flag it is now possible to extend LZ-window up to 30 bits; though produced stream will not be RFC7932 compliant. Added new dictionary generator - "DSH". It combines speed of "Sieve" and quality of "DM". Plus utilities to prepare train corpora (remove unique strings). Improved compression ratio: now two sub-blocks could be stitched: the last copy command could be extended to span the next sub-block. Fixed compression ineffectiveness caused by floating numbers rounding and wrong cost heuristic. Other C changes: - combined / moved `context.h` to `common` - moved transforms to `common` - unified some aspects of code formatting - added an abstraction for encoder (static) dictionary - moved default allocator/deallocator functions to `common` brotli CLI: - window size is auto-adjusted if not specified explicitly Java: - added "eager" decoding both to JNI wrapper and pure decoder - huge speed-up of `DictionaryData` initialization * Add dictionaryless compressed dictionary * Fix `sources.lst` * Fix `sources.lst` and add a note that `libtool` is also required. * Update setup.py * Fix `EagerStreamTest` * Fix BUILD file * Add missing `libdivsufsort` dependency * Fix "unused parameter" warning.
author: Eugene Kliuchnikov <eustas@google.com> 2018-02-26 09:04:36 -0500
committer: GitHub <noreply@github.com> 2018-02-26 09:04:36 -0500
commit: 35e69fc7cf9421ab04ffc9d52cb36d07fa12984a (patch)
tree: a1ed614391936d455da2b0610ef8e8caf88b4289 /research
parent: 3af18990f50d8f040038aaa08c41f5d27d62efb5 (diff)
download: brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.zip
brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.gz
brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.bz2
11 files changed, 1206 insertions, 171 deletions
diff --git a/research/BUILD b/research/BUILD
index 6ff5ac2..211b3e7 100755
--- a/research/BUILD
+++ b/research/BUILD
@@ -14,6 +14,13 @@ cc_library(
 )
 
 cc_library(
+    name = "durchschlag",
+    srcs = ["durchschlag.cc"],
+    hdrs = ["durchschlag.h"],
+    deps = ["@divsufsort//:libdivsufsort"],
+)
+
+cc_library(
     name = "sieve",
     srcs = ["sieve.cc"],
     hdrs = ["sieve.h"],
@@ -24,6 +31,7 @@ cc_binary(
     srcs = ["dictionary_generator.cc"],
     deps = [
         ":dm",
+        ":durchschlag",
         ":sieve",
     ],
 )
diff --git a/research/BUILD.libdivsufsort b/research/BUILD.libdivsufsort
new file mode 100644
index 0000000..ce60e9c
--- /dev/null
+++ b/research/BUILD.libdivsufsort
@@ -0,0 +1,55 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "libdivsufsort",
+    srcs = [
+        "lib/divsufsort.c",
+        "lib/sssort.c",
+        "lib/trsort.c",
+        "lib/utils.c",
+    ],
+    hdrs = [
+        "include/config.h",
+        "include/divsufsort.h",
+        "include/divsufsort_private.h",
+    ],
+    copts = [
+        "-DHAVE_CONFIG_H=1",
+    ],
+    includes = ["include"],
+)
+
+commom_awk_replaces = (
+    "gsub(/#cmakedefine/, \"#define\"); " +
+    "gsub(/@DIVSUFSORT_EXPORT@/, \"\"); " +
+    "gsub(/@DIVSUFSORT_IMPORT@/, \"\"); " +
+    "gsub(/@INLINE@/, \"inline\"); " +
+    "gsub(/@INCFILE@/, \"#include <inttypes.h>\"); " +
+    "gsub(/@SAUCHAR_TYPE@/, \"uint8_t\"); " +
+    "gsub(/@SAINT32_TYPE@/, \"int32_t\"); " +
+    "gsub(/@SAINT_PRId@/, \"PRId32\"); "
+)
+
+genrule(
+    name = "config_h",
+    srcs = ["include/config.h.cmake"],
+    outs = ["include/config.h"],
+    cmd = ("awk '{ " +
+           "gsub(/@HAVE_IO_H 1@/, \"HAVE_IO_H 0\"); " +
+           commom_awk_replaces +
+           "print; }' $(<) > $(@)"),
+)
+
+genrule(
+    name = "divsufsort_h",
+    srcs = ["include/divsufsort.h.cmake"],
+    outs = ["include/divsufsort.h"],
+    cmd = ("awk '{ " +
+           "gsub(/@W64BIT@/, \"\"); " +
+           "gsub(/@SAINDEX_TYPE@/, \"int32_t\"); " +
+           "gsub(/@SAINDEX_PRId@/, \"PRId32\"); " +
+           commom_awk_replaces +
+           "print; }' $(<) > $(@)"),
+)
diff --git a/research/deorummolae.cc b/research/deorummolae.cc
index c53a53c..d15b7ee 100644
--- a/research/deorummolae.cc
+++ b/research/deorummolae.cc
@@ -15,20 +15,31 @@
 
 /* Non tunable definitions. */
 #define CHUNK_MASK (CHUNK_SIZE - 1)
-#define COVERAGE_SIZE (1 << (LOG_MAX_FILES - 6))
+#define COVERAGE_SIZE (1 << (DM_LOG_MAX_FILES - 6))
 
 /* File coverage: every bit set to 1 denotes a file covered by an isle. */
 typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;
 
-static int popcount(uint64_t u) { return __builtin_popcountll(u); }
+/* Symbol of text alphabet. */
+typedef int32_t TextChar;
+
+/* Pointer to position in text. */
+typedef uint32_t TextIdx;
+
+/* SAIS sarray_type; unfortunately, must be a signed type. */
+typedef int32_t TextSaIdx;
+
+static size_t popcount(uint64_t u) {
+  return static_cast<size_t>(__builtin_popcountll(u));
+}
 
 /* Condense terminators and pad file entries. */
-static void rewriteText(std::vector<int>* text) {
-  int terminator = text->back();
-  int prev = terminator;
-  size_t to = 0;
-  for (size_t from = 0; from < text->size(); ++from) {
-    int next = text->at(from);
+static void rewriteText(std::vector<TextChar>* text) {
+  TextChar terminator = text->back();
+  TextChar prev = terminator;
+  TextIdx to = 0;
+  for (TextIdx from = 0; from < text->size(); ++from) {
+    TextChar next = text->at(from);
     if (next < 256 || prev < 256) {
       text->at(to++) = next;
       if (next >= 256) terminator = next;
@@ -41,11 +52,12 @@ static void rewriteText(std::vector<int>* text) {
 }
 
 /* Reenumerate terminators for smaller alphabet. */
-static void remapTerminators(std::vector<int>* text, int* next_terminator) {
-  int prev = -1;
-  int x = 256;
-  for (size_t i = 0; i < text->size(); ++i) {
-    int next = text->at(i);
+static void remapTerminators(std::vector<TextChar>* text,
+    TextChar* next_terminator) {
+  TextChar prev = -1;
+  TextChar x = 256;
+  for (TextIdx i = 0; i < text->size(); ++i) {
+    TextChar next = text->at(i);
     if (next < 256) {  // Char.
       // Do nothing.
     } else if (prev < 256) {  // Terminator after char.
@@ -60,15 +72,15 @@ static void remapTerminators(std::vector<int>* text, int* next_terminator) {
 }
 
 /* Combine all file entries; create mapping position->file. */
-static void buildFullText(std::vector<std::vector<int>>* data,
-    std::vector<int>* full_text, std::vector<size_t>* file_map,
-    std::vector<size_t>* file_offset, int* next_terminator) {
+static void buildFullText(std::vector<std::vector<TextChar>>* data,
+    std::vector<TextChar>* full_text, std::vector<TextIdx>* file_map,
+    std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
   file_map->resize(0);
   file_offset->resize(0);
   full_text->resize(0);
-  for (size_t i = 0; i < data->size(); ++i) {
+  for (TextIdx i = 0; i < data->size(); ++i) {
     file_offset->push_back(full_text->size());
-    std::vector<int>& file = data->at(i);
+    std::vector<TextChar>& file = data->at(i);
     rewriteText(&file);
     full_text->insert(full_text->end(), file.begin(), file.end());
     file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i);
@@ -78,18 +90,19 @@ static void buildFullText(std::vector<std::vector<int>>* data,
 
 /* Build longest-common-prefix based on suffix array and text.
    TODO: borrowed -> unknown efficiency. */
-static void buildLcp(std::vector<int>* text, std::vector<int>* sa,
-    std::vector<int>* lcp, std::vector<int>* invese_sa) {
-  int size = static_cast<int>(text->size());
+static void buildLcp(std::vector<TextChar>* text, std::vector<TextIdx>* sa,
+    std::vector<TextIdx>* lcp, std::vector<TextIdx>* invese_sa) {
+  TextIdx size = static_cast<TextIdx>(text->size());
   lcp->resize(size);
-  int k = 0;
+  TextIdx k = 0;
   lcp->at(size - 1) = 0;
-  for (int i = 0; i < size; ++i) {
+  for (TextIdx i = 0; i < size; ++i) {
     if (invese_sa->at(i) == size - 1) {
       k = 0;
       continue;
     }
-    int j = sa->at(invese_sa->at(i) + 1);  // Suffix which follow i-th suffix.
+    // Suffix which follow i-th suffix.
+    TextIdx j = sa->at(invese_sa->at(i) + 1);
     while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) {
       ++k;
     }
@@ -102,21 +115,21 @@ static void buildLcp(std::vector<int>* text, std::vector<int>* sa,
    When we raise the LCP requirement, the isle sunks and smaller isles appear
    instead. */
 typedef struct {
-  int lcp;
-  int l;
-  int r;
+  TextIdx lcp;
+  TextIdx l;
+  TextIdx r;
   Coverage coverage;
 } Isle;
 
 /* Helper routine for `cutMatch`. */
-static void poisonData(int pos, int length, std::vector<std::vector<int>>* data,
-    std::vector<size_t>* file_map, std::vector<size_t>* file_offset,
-    int* next_terminator) {
-  size_t f = file_map->at(pos / CHUNK_SIZE);
+static void poisonData(TextIdx pos, TextIdx length,
+    std::vector<std::vector<TextChar>>* data, std::vector<TextIdx>* file_map,
+    std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
+  TextIdx f = file_map->at(pos / CHUNK_SIZE);
   pos -= file_offset->at(f);
-  std::vector<int>& file = data->at(f);
-  int l = (length == CUT_MATCH) ? CUT_MATCH : 1;
-  for (int j = 0; j < l; j++, pos++) {
+  std::vector<TextChar>& file = data->at(f);
+  TextIdx l = (length == CUT_MATCH) ? CUT_MATCH : 1;
+  for (TextIdx j = 0; j < l; j++, pos++) {
     if (file[pos] >= 256) continue;
     if (file[pos + 1] >= 256) {
       file[pos] = file[pos + 1];
@@ -131,12 +144,12 @@ static void poisonData(int pos, int length, std::vector<std::vector<int>>* data,
 /* Remove substrings of a given match from files.
    Substrings are replaced with unique terminators, so next iteration SA would
    not allow to cross removed areas. */
-static void cutMatch(std::vector<std::vector<int>>* data, int index, int length,
-    std::vector<int>* sa, std::vector<int>* lcp, std::vector<int>* invese_sa,
-    int* next_terminator, std::vector<size_t>* file_map,
-    std::vector<size_t>* file_offset) {
+static void cutMatch(std::vector<std::vector<TextChar>>* data, TextIdx index,
+    TextIdx length, std::vector<TextIdx>* sa, std::vector<TextIdx>* lcp,
+    std::vector<TextIdx>* invese_sa, TextChar* next_terminator,
+    std::vector<TextIdx>* file_map, std::vector<TextIdx>* file_offset) {
   while (length >= CUT_MATCH) {
-    int i = index;
+    TextIdx i = index;
     while (lcp->at(i) >= length) {
       i++;
       poisonData(
@@ -156,54 +169,70 @@ static void cutMatch(std::vector<std::vector<int>>* data, int index, int length,
 std::string DM_generate(size_t dictionary_size_limit,
     const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
   {
-    uint64_t tmp = 0;
-    if (popcount(tmp - 1u) != 64) {
-      fprintf(stderr, "64-bit platform is required\n");
-      return 0;
+    TextIdx tmp = static_cast<TextIdx>(dictionary_size_limit);
+    if ((tmp != dictionary_size_limit) || (tmp > 1u << 30)) {
+      fprintf(stderr, "dictionary_size_limit is too large\n");
+      return "";
     }
   }
 
   /* Could use 256 + '0' for easier debugging. */
-  int next_terminator = 256;
+  TextChar next_terminator = 256;
 
   std::string output;
-  std::vector<std::vector<int>> data;
+  std::vector<std::vector<TextChar>> data;
 
-  size_t offset = 0;
+  TextIdx offset = 0;
   size_t num_samples = sample_sizes.size();
-  if (num_samples > MAX_FILES) num_samples = MAX_FILES;
+  if (num_samples > DM_MAX_FILES) num_samples = DM_MAX_FILES;
   for (size_t n = 0; n < num_samples; ++n) {
-    size_t next_offset = offset + sample_sizes[n];
+    TextIdx delta = static_cast<TextIdx>(sample_sizes[n]);
+    if (delta != sample_sizes[n]) {
+      fprintf(stderr, "sample is too large\n");
+      return "";
+    }
+    if (delta == 0) {
+      fprintf(stderr, "0-length samples are prohibited\n");
+      return "";
+    }
+    TextIdx next_offset = offset + delta;
+    if (next_offset <= offset) {
+      fprintf(stderr, "corpus is too large\n");
+      return "";
+    }
     data.push_back(
-        std::vector<int>(sample_data + offset, sample_data + next_offset));
+        std::vector<TextChar>(sample_data + offset, sample_data + next_offset));
     offset = next_offset;
     data.back().push_back(next_terminator++);
   }
 
   /* Most arrays are allocated once, and then just resized to smaller and
      smaller sizes. */
-  std::vector<int> full_text;
-  std::vector<size_t> file_map;
-  std::vector<size_t> file_offset;
-  std::vector<int> sa;
-  std::vector<int> invese_sa;
-  std::vector<int> lcp;
+  std::vector<TextChar> full_text;
+  std::vector<TextIdx> file_map;
+  std::vector<TextIdx> file_offset;
+  std::vector<TextIdx> sa;
+  std::vector<TextIdx> invese_sa;
+  std::vector<TextIdx> lcp;
   std::vector<Isle> isles;
   std::vector<char> output_data;
-  size_t total = 0;
-  size_t total_cost = 0;
-  size_t best_cost;
+  TextIdx total = 0;
+  TextIdx total_cost = 0;
+  TextIdx best_cost;
   Isle best_isle;
-  int min_count = num_samples;
+  size_t min_count = num_samples;
 
   while (true) {
-    size_t max_match = dictionary_size_limit - total;
+    TextIdx max_match = static_cast<TextIdx>(dictionary_size_limit) - total;
     buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator);
     sa.resize(full_text.size());
-    saisxx(full_text.data(), sa.data(), static_cast<int>(full_text.size()),
-        next_terminator);
+    /* Hopefully, non-negative TextSaIdx is the same sa TextIdx counterpart. */
+    saisxx(full_text.data(), reinterpret_cast<TextSaIdx*>(sa.data()),
+        static_cast<TextChar>(full_text.size()), next_terminator);
     invese_sa.resize(full_text.size());
-    for (int i = 0; i < full_text.size(); ++i) invese_sa[sa[i]] = i;
+    for (TextIdx i = 0; i < full_text.size(); ++i) {
+      invese_sa[sa[i]] = i;
+    }
     buildLcp(&full_text, &sa, &lcp, &invese_sa);
 
     /* Do not rebuild SA/LCP, just use different selection. */
@@ -213,22 +242,22 @@ std::string DM_generate(size_t dictionary_size_limit,
     isles.resize(0);
     isles.push_back(best_isle);
 
-    for (int i = 0; i < static_cast<int>(lcp.size()); ++i) {
-      int l = i;
+    for (TextIdx i = 0; i < lcp.size(); ++i) {
+      TextIdx l = i;
       Coverage cov = {{0}};
-      int f = file_map[sa[i] / CHUNK_SIZE];
-      cov[f >> 6] = ((uint64_t)1) << (f & 63);
+      size_t f = file_map[sa[i] / CHUNK_SIZE];
+      cov[f >> 6] = (static_cast<uint64_t>(1)) << (f & 63);
       while (lcp[i] < isles.back().lcp) {
         Isle& top = isles.back();
         top.r = i;
         l = top.l;
         for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x];
-        int count = 0;
+        size_t count = 0;
         for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]);
-        int effective_lcp = top.lcp;
+        TextIdx effective_lcp = top.lcp;
         /* Restrict (last) dictionary entry length. */
         if (effective_lcp > max_match) effective_lcp = max_match;
-        int cost = count * effective_lcp;
+        TextIdx cost = count * effective_lcp;
         if (cost > best_cost && count >= min_count &&
             effective_lcp >= MIN_MATCH) {
           best_cost = cost;
@@ -251,14 +280,14 @@ std::string DM_generate(size_t dictionary_size_limit,
     if (best_cost == 0 || best_isle.lcp < MIN_MATCH) {
       if (min_count >= 8) {
         min_count = (min_count * 7) / 8;
-        fprintf(stderr, "Retry: min_count=%d\n", min_count);
+        fprintf(stderr, "Retry: min_count=%zu\n", min_count);
         goto retry;
       }
       break;
     }
 
     /* Save the entry. */
-    fprintf(stderr, "Savings: %zu+%zu, dictionary: %zu+%d\n",
+    fprintf(stderr, "Savings: %d+%d, dictionary: %d+%d\n",
         total_cost, best_cost, total, best_isle.lcp);
     int* piece = &full_text[sa[best_isle.l]];
     output.insert(output.end(), piece, piece + best_isle.lcp);
diff --git a/research/deorummolae.h b/research/deorummolae.h
index 7f24add..5815097 100644
--- a/research/deorummolae.h
+++ b/research/deorummolae.h
@@ -1,17 +1,16 @@
 #ifndef BROTLI_RESEARCH_DEORUMMOLAE_H_
 #define BROTLI_RESEARCH_DEORUMMOLAE_H_
 
-#include <stddef.h>
-#include <stdint.h>
-
+#include <cstddef>
+#include <cstdint>
 #include <string>
 #include <vector>
 
 /* log2(maximal number of files). Value 6 provides some speedups. */
-#define LOG_MAX_FILES 6
+#define DM_LOG_MAX_FILES 6
 
 /* Non tunable definitions. */
-#define MAX_FILES (1 << LOG_MAX_FILES)
+#define DM_MAX_FILES (1 << DM_LOG_MAX_FILES)
 
 /**
  * Generate a dictionary for given samples.
diff --git a/research/dictionary_generator.cc b/research/dictionary_generator.cc
index b3ee89c..00cfaba 100755
--- a/research/dictionary_generator.cc
+++ b/research/dictionary_generator.cc
@@ -1,15 +1,20 @@
+#include <cstddef>
 #include <cstdio>
 #include <cstring>
 #include <fstream>
 #include <vector>
 
 #include "./deorummolae.h"
+#include "./durchschlag.h"
 #include "./sieve.h"
 
 #define METHOD_DM 0
 #define METHOD_SIEVE 1
+#define METHOD_DURCHSCHLAG 2
+#define METHOD_DISTILL 3
+#define METHOD_PURIFY 4
 
-size_t readInt(const char* str) {
+static size_t readInt(const char* str) {
   size_t result = 0;
   if (str[0] == 0 || str[0] == '0') {
     return 0;
@@ -51,10 +56,25 @@ static std::string readFile(const std::string& path) {
 
 static void writeFile(const char* file, const std::string& content) {
   std::ofstream outfile(file, std::ofstream::binary);
-  outfile.write(content.c_str(), content.size());
+  outfile.write(content.c_str(), static_cast<std::streamsize>(content.size()));
   outfile.close();
 }
 
+static void writeSamples(char const* argv[], const std::vector<int>& pathArgs,
+    const std::vector<size_t>& sizes, const uint8_t* data) {
+  size_t offset = 0;
+  for (size_t i = 0; i < pathArgs.size(); ++i) {
+    int j = pathArgs[i];
+    const char* file = argv[j];
+    size_t sampleSize = sizes[i];
+    std::ofstream outfile(file, std::ofstream::binary);
+    outfile.write(reinterpret_cast<const char*>(data + offset),
+        static_cast<std::streamsize>(sampleSize));
+    outfile.close();
+    offset += sampleSize;
+  }
+}
+
 /* Returns "base file name" or its tail, if it contains '/' or '\'. */
 static const char* fileName(const char* path) {
   const char* separator_position = strrchr(path, '/');
@@ -68,21 +88,32 @@ static void printHelp(const char* name) {
   fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
   fprintf(stderr,
       "Options:\n"
-      "  --dm     use 'deorummolae' engine\n"
-      "  --sieve  use 'sieve' engine (default)\n"
-      "  -t#      set target dictionary size (limit); default: 16K\n"
-      "  -s#      set slize length for 'sieve'; default: 33\n"
-      "# is a decimal number with optional k/K/m/M suffix.\n\n");
+      "  --dm       use 'deorummolae' engine\n"
+      "  --distill  rewrite samples; unique text parts are removed\n"
+      "  --dsh      use 'durchschlag' engine (default)\n"
+      "  --purify   rewrite samples; unique text parts are zeroed out\n"
+      "  --sieve    use 'sieve' engine\n"
+      "  -b#        set block length for 'durchschlag'; default: 1024\n"
+      "  -s#        set slice length for 'distill', 'durchschlag', 'purify'\n"
+      "             and 'sieve'; default: 16\n"
+      "  -t#        set target dictionary size (limit); default: 16K\n"
+      "  -u#        set minimum slice population (for rewrites); default: 2\n"
+      "# is a decimal number with optional k/K/m/M suffix.\n"
+      "WARNING: 'distill' and 'purify' will overwrite original samples!\n"
+      "         Completely unique samples might become empty files.\n\n");
 }
 
 int main(int argc, char const* argv[]) {
   int dictionaryArg = -1;
-  int method = METHOD_SIEVE;
-  int sieveSliceLen = 33;
-  int targetSize = 16 << 10;
+  int method = METHOD_DURCHSCHLAG;
+  size_t sliceLen = 16;
+  size_t targetSize = 16 << 10;
+  size_t blockSize = 1024;
+  size_t minimumPopulation = 2;
 
   std::vector<uint8_t> data;
   std::vector<size_t> sizes;
+  std::vector<int> pathArgs;
   size_t total = 0;
   for (int i = 1; i < argc; ++i) {
     if (argv[i] == nullptr) {
@@ -90,6 +121,12 @@ int main(int argc, char const* argv[]) {
     }
     if (argv[i][0] == '-') {
       if (argv[i][1] == '-') {
+        if (dictionaryArg != -1) {
+          fprintf(stderr,
+              "Method should be specified before dictionary / sample '%s'\n",
+              argv[i]);
+          exit(1);
+        }
         if (std::strcmp("--sieve", argv[i]) == 0) {
           method = METHOD_SIEVE;
           continue;
@@ -98,13 +135,32 @@ int main(int argc, char const* argv[]) {
           method = METHOD_DM;
           continue;
         }
+        if (std::strcmp("--dsh", argv[i]) == 0) {
+          method = METHOD_DURCHSCHLAG;
+          continue;
+        }
+        if (std::strcmp("--distill", argv[i]) == 0) {
+          method = METHOD_DISTILL;
+          continue;
+        }
+        if (std::strcmp("--purify", argv[i]) == 0) {
+          method = METHOD_PURIFY;
+          continue;
+        }
         printHelp(fileName(argv[0]));
         fprintf(stderr, "Invalid option '%s'\n", argv[i]);
         exit(1);
       }
-      if (argv[i][1] == 's') {
-        sieveSliceLen = readInt(&argv[i][2]);
-        if (sieveSliceLen < 4 || sieveSliceLen > 256) {
+      if (argv[i][1] == 'b') {
+        blockSize = readInt(&argv[i][2]);
+        if (blockSize < 16 || blockSize > 65536) {
+          printHelp(fileName(argv[0]));
+          fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+          exit(1);
+        }
+      } else if (argv[i][1] == 's') {
+        sliceLen = readInt(&argv[i][2]);
+        if (sliceLen < 4 || sliceLen > 256) {
           printHelp(fileName(argv[0]));
           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
           exit(1);
@@ -116,6 +172,13 @@ int main(int argc, char const* argv[]) {
           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
           exit(1);
         }
+      } else if (argv[i][1] == 'u') {
+        minimumPopulation = readInt(&argv[i][2]);
+        if (minimumPopulation < 256 || minimumPopulation > 65536) {
+          printHelp(fileName(argv[0]));
+          fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+          exit(1);
+        }
       } else {
         printHelp(fileName(argv[0]));
         fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
@@ -124,26 +187,42 @@ int main(int argc, char const* argv[]) {
       continue;
     }
     if (dictionaryArg == -1) {
-      dictionaryArg = i;
-      continue;
+      if (method != METHOD_DISTILL && method != METHOD_PURIFY) {
+        dictionaryArg = i;
+        continue;
+      }
     }
     std::string content = readFile(argv[i]);
     data.insert(data.end(), content.begin(), content.end());
     total += content.size();
+    pathArgs.push_back(i);
     sizes.push_back(content.size());
   }
-  if (dictionaryArg == -1 || total == 0) {
+  bool wantDictionary = (dictionaryArg == -1);
+  if (method == METHOD_DISTILL || method == METHOD_PURIFY) {
+    wantDictionary = false;
+  }
+  if (wantDictionary || total == 0) {
     printHelp(fileName(argv[0]));
     fprintf(stderr, "Not enough arguments\n");
     exit(1);
   }
 
   if (method == METHOD_SIEVE) {
-    writeFile(argv[dictionaryArg],
-        sieve_generate(targetSize, sieveSliceLen, sizes, data.data()));
+    writeFile(argv[dictionaryArg], sieve_generate(
+        targetSize, sliceLen, sizes, data.data()));
   } else if (method == METHOD_DM) {
-    writeFile(argv[dictionaryArg],
-        DM_generate(targetSize, sizes, data.data()));
+    writeFile(argv[dictionaryArg], DM_generate(
+        targetSize, sizes, data.data()));
+  } else if (method == METHOD_DURCHSCHLAG) {
+    writeFile(argv[dictionaryArg], durchschlag_generate(
+        targetSize, sliceLen, blockSize, sizes, data.data()));
+  } else if (method == METHOD_DISTILL) {
+    durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data());
+    writeSamples(argv, pathArgs, sizes, data.data());
+  } else if (method == METHOD_PURIFY) {
+    durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data());
+    writeSamples(argv, pathArgs, sizes, data.data());
   } else {
     printHelp(fileName(argv[0]));
     fprintf(stderr, "Unknown generator\n");
diff --git a/research/draw_diff.cc b/research/draw_diff.cc
index 01b6716..6541dac 100644
--- a/research/draw_diff.cc
+++ b/research/draw_diff.cc
@@ -20,18 +20,23 @@
 #define CHECK(X) if (!(X)) exit(EXIT_FAILURE);
 #endif
 
-void ReadPGM(FILE* f, uint8_t*** image, size_t* height, size_t* width) {
+typedef uint8_t* ScanLine;
+typedef ScanLine* Image;
+
+void ReadPGM(FILE* f, Image* image, size_t* height, size_t* width) {
   int colors;
   CHECK(fscanf(f, "P5\n%lu %lu\n%d\n", width, height, &colors) == 3);
   assert(colors == 255);
-  *image = new uint8_t*[*height];
+  ScanLine* lines = new ScanLine[*height];
+  *image = lines;
   for (int i = *height - 1; i >= 0; --i) {
-    (*image)[i] = new uint8_t[*width];
-    CHECK(fread((*image)[i], 1, *width, f) == *width);
+    ScanLine line = new uint8_t[*width];
+    lines[i] = line;
+    CHECK(fread(line, 1, *width, f) == *width);
   }
 }
 
-void CalculateDiff(int** diff, uint8_t** image1, uint8_t** image2,
+void CalculateDiff(int** diff, Image image1, Image image2,
                    size_t height, size_t width) {
   for (size_t i = 0; i < height; ++i) {
     for (size_t j = 0; j < width; ++j) {
@@ -40,7 +45,7 @@ void CalculateDiff(int** diff, uint8_t** image1, uint8_t** image2,
   }
 }
 
-void DrawDiff(int** diff, uint8_t** image1, uint8_t** image2,
+void DrawDiff(int** diff, Image image1, Image image2,
               size_t height, size_t width, FILE* f) {
   int max = -1234;
   int min = +1234;
@@ -78,13 +83,13 @@ void DrawDiff(int** diff, uint8_t** image1, uint8_t** image2,
   delete[] row;
 }
 
-int main(int argc, char* argv[]) {
+int main(int argc, char** argv) {
   if (argc != 4) {
     printf("usage: %s pgm1 pgm2 diff_ppm_path\n", argv[0]);
     return 1;
   }
 
-  uint8_t **image1, **image2;
+  Image image1, image2;
   size_t h1, w1, h2, w2;
 
   FILE* fimage1 = fopen(argv[1], "rb");
diff --git a/research/durchschlag.cc b/research/durchschlag.cc
new file mode 100755
index 0000000..cc4ed68
--- /dev/null
+++ b/research/durchschlag.cc
@@ -0,0 +1,714 @@
+#include "./durchschlag.h"
+
+#include <algorithm>
+#include <exception>  /* terminate */
+
+#include "divsufsort.h"
+
+/* Pointer to position in text. */
+typedef DurchschlagTextIdx TextIdx;
+
+/* (Sum of) value(s) of slice(s). */
+typedef uint32_t Score;
+
+typedef struct HashSlot {
+  TextIdx next;
+  TextIdx offset;
+} HashSlot;
+
+typedef struct MetaSlot {
+  TextIdx mark;
+  Score score;
+} MetaSlot;
+
+typedef struct Range {
+  TextIdx start;
+  TextIdx end;
+} Range;
+
+typedef struct Candidate {
+  Score score;
+  TextIdx position;
+} Candidate;
+
+struct greaterScore {
+  bool operator()(const Candidate& a, const Candidate& b) const {
+    return (a.score > b.score) ||
+        ((a.score == b.score) && (a.position < b.position));
+  }
+};
+
+struct lessScore {
+  bool operator()(const Candidate& a, const Candidate& b) const {
+    return (a.score < b.score) ||
+        ((a.score == b.score) && (a.position > b.position));
+  }
+};
+
+#define CANDIDATE_BUNDLE_SIZE (1 << 18)
+
+static void fatal(const char* error) {
+  fprintf(stderr, "%s\n", error);
+  std::terminate();
+}
+
+static TextIdx calculateDictionarySize(const std::vector<Range>& ranges) {
+  TextIdx result = 0;
+  for (size_t i = 0; i < ranges.size(); ++i) {
+    const Range& r = ranges[i];
+    result += r.end - r.start;
+  }
+  return result;
+}
+
+static std::string createDictionary(
+    const uint8_t* data, const std::vector<Range>& ranges, size_t limit) {
+  std::string output;
+  output.reserve(calculateDictionarySize(ranges));
+  for (size_t i = 0; i < ranges.size(); ++i) {
+    const Range& r = ranges[i];
+    output.insert(output.end(), &data[r.start], &data[r.end]);
+  }
+  if (output.size() > limit) {
+    output.resize(limit);
+  }
+  return output;
+}
+
+static Score buildCandidatesList(std::vector<Candidate>* candidates,
+    std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut,
+    TextIdx end) {
+  candidates->resize(0);
+
+  size_t n = map->size();
+  MetaSlot* slots = map->data();
+  for (size_t j = 0; j < n; ++j) {
+    slots[j].mark = 0;
+  }
+
+  Score score = 0;
+  for (size_t j = 0; j < span; ++j) {
+    MetaSlot& item = slots[shortcut[j]];
+    if (item.mark == 0) {
+      score += item.score;
+    }
+    item.mark++;
+  }
+
+  TextIdx i = 0;
+  TextIdx limit = std::min<TextIdx>(end, CANDIDATE_BUNDLE_SIZE);
+  Score maxScore = 0;
+  for (; i < limit; ++i) {
+    MetaSlot& pick = slots[shortcut[i + span]];
+    if (pick.mark == 0) {
+      score += pick.score;
+    }
+    pick.mark++;
+
+    if (score > maxScore) {
+      maxScore = score;
+    }
+    candidates->push_back({score, i});
+
+    MetaSlot& drop = slots[shortcut[i]];
+    drop.mark--;
+    if (drop.mark == 0) {
+      score -= drop.score;
+    }
+  }
+
+  std::make_heap(candidates->begin(), candidates->end(), greaterScore());
+  Score minScore = candidates->at(0).score;
+  for (; i < end; ++i) {
+    MetaSlot& pick = slots[shortcut[i + span]];
+    if (pick.mark == 0) {
+      score += pick.score;
+    }
+    pick.mark++;
+
+    if (score > maxScore) {
+      maxScore = score;
+    }
+    if (score >= minScore) {
+      candidates->push_back({score, i});
+      std::push_heap(candidates->begin(), candidates->end(), greaterScore());
+      if (candidates->size() > CANDIDATE_BUNDLE_SIZE && maxScore != minScore) {
+        while (candidates->at(0).score == minScore) {
+          std::pop_heap(candidates->begin(), candidates->end(), greaterScore());
+          candidates->pop_back();
+        }
+        minScore = candidates->at(0).score;
+      }
+    }
+
+    MetaSlot& drop = slots[shortcut[i]];
+    drop.mark--;
+    if (drop.mark == 0) {
+      score -= drop.score;
+    }
+  }
+
+  for (size_t j = 0; j < n; ++j) {
+    slots[j].mark = 0;
+  }
+
+  std::make_heap(candidates->begin(), candidates->end(), lessScore());
+  return minScore;
+}
+
+static Score rebuildCandidatesList(std::vector<TextIdx>* candidates,
+    std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut,
+    TextIdx end, TextIdx* next) {
+  size_t n = candidates->size();
+  TextIdx* data = candidates->data();
+  for (size_t i = 0; i < n; ++i) {
+    data[i] = 0;
+  }
+
+  n = map->size();
+  MetaSlot* slots = map->data();
+  for (size_t i = 0; i < n; ++i) {
+    slots[i].mark = 0;
+  }
+
+  Score score = 0;
+  for (TextIdx i = 0; i < span; ++i) {
+    MetaSlot& item = slots[shortcut[i]];
+    if (item.mark == 0) {
+      score += item.score;
+    }
+    item.mark++;
+  }
+
+  Score maxScore = 0;
+  for (TextIdx i = 0; i < end; ++i) {
+    MetaSlot& pick = slots[shortcut[i + span]];
+    if (pick.mark == 0) {
+      score += pick.score;
+    }
+    pick.mark++;
+
+    if (candidates->size() <= score) {
+      candidates->resize(score + 1);
+    }
+    if (score > maxScore) {
+      maxScore = score;
+    }
+    next[i] = candidates->at(score);
+    candidates->at(score) = i;
+
+    MetaSlot& drop = slots[shortcut[i]];
+    drop.mark--;
+    if (drop.mark == 0) {
+      score -= drop.score;
+    }
+  }
+
+  for (size_t i = 0; i < n; ++i) {
+    slots[i].mark = 0;
+  }
+
+  candidates->resize(maxScore + 1);
+  return maxScore;
+}
+
+static void addRange(std::vector<Range>* ranges, TextIdx start, TextIdx end) {
+  for (auto it = ranges->begin(); it != ranges->end();) {
+    if (end < it->start) {
+      ranges->insert(it, {start, end});
+      return;
+    }
+    if (it->end < start) {
+      it++;
+      continue;
+    }
+    // Combine with existing.
+    start = std::min(start, it->start);
+    end = std::max(end, it->end);
+    // Remove consumed vector and continue.
+    it = ranges->erase(it);
+  }
+  ranges->push_back({start, end});
+}
+
+std::string durchschlag_generate(
+    size_t dictionary_size_limit, size_t slice_len, size_t block_len,
+    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
+  DurchschlagContext ctx = durchschlag_prepare(
+      slice_len, sample_sizes, sample_data);
+  return durchschlag_generate(DURCHSCHLAG_COLLABORATIVE,
+      dictionary_size_limit, block_len, ctx, sample_data);
+}
+
+DurchschlagContext durchschlag_prepare(size_t slice_len,
+    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
+  /* Parameters aliasing */
+  TextIdx sliceLen = static_cast<TextIdx>(slice_len);
+  if (sliceLen != slice_len) fatal("slice_len is too large");
+  if (sliceLen < 1) fatal("slice_len is too small");
+  const uint8_t* data = sample_data;
+
+  TextIdx total = 0;
+  std::vector<TextIdx> offsets;
+  offsets.reserve(sample_sizes.size());
+  for (size_t i = 0; i < sample_sizes.size(); ++i) {
+    TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
+    if (delta != sample_sizes[i]) fatal("sample is too large");
+    if (delta == 0) fatal("0-length samples are prohibited");
+    TextIdx next_total = total + delta;
+    if (next_total <= total) fatal("corpus is too large");
+    total = next_total;
+    offsets.push_back(total);
+  }
+
+  if (total < sliceLen) fatal("slice_len is larger than corpus size");
+  TextIdx end = total - static_cast<TextIdx>(sliceLen) + 1;
+  TextIdx hashLen = 11;
+  while (hashLen < 29 && ((1u << hashLen) < end)) {
+    hashLen += 3;
+  }
+  hashLen -= 3;
+  TextIdx hashMask = (1u << hashLen) - 1u;
+  std::vector<TextIdx> hashHead(1 << hashLen);
+  TextIdx hash = 0;
+  TextIdx lShift = 3;
+  TextIdx rShift = hashLen - lShift;
+  for (TextIdx i = 0; i < sliceLen - 1; ++i) {
+    TextIdx v = data[i];
+    hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
+  }
+  TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
+  TextIdx rShiftX = hashLen - lShiftX;
+
+  std::vector<HashSlot> map;
+  map.push_back({0, 0});
+  TextIdx hashSlot = 1;
+  std::vector<TextIdx> sliceMap;
+  sliceMap.reserve(end);
+  for (TextIdx i = 0; i < end; ++i) {
+    TextIdx v = data[i + sliceLen - 1];
+    TextIdx bucket = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
+    v = data[i];
+    hash = bucket ^ (((v << lShiftX) | (v >> rShiftX)) & hashMask);
+    TextIdx slot = hashHead[bucket];
+    while (slot != 0) {
+      HashSlot& item = map[slot];
+      TextIdx start = item.offset;
+      bool miss = false;
+      for (TextIdx j = 0; j < sliceLen; ++j) {
+        if (data[i + j] != data[start + j]) {
+          miss = true;
+          break;
+        }
+      }
+      if (!miss) {
+        sliceMap.push_back(slot);
+        break;
+      }
+      slot = item.next;
+    }
+    if (slot == 0) {
+      map.push_back({hashHead[bucket], i});
+      hashHead[bucket] = hashSlot;
+      sliceMap.push_back(hashSlot);
+      hashSlot++;
+    }
+  }
+
+  return {total, sliceLen, static_cast<TextIdx>(map.size()),
+      std::move(offsets), std::move(sliceMap)};
+}
+
+DurchschlagContext durchschlag_prepare(size_t slice_len,
+    const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index) {
+  /* Parameters aliasing */
+  TextIdx sliceLen = static_cast<TextIdx>(slice_len);
+  if (sliceLen != slice_len) fatal("slice_len is too large");
+  if (sliceLen < 1) fatal("slice_len is too small");
+  const TextIdx* lcp = index.lcp.data();
+  const TextIdx* sa = index.sa.data();
+
+  TextIdx total = 0;
+  std::vector<TextIdx> offsets;
+  offsets.reserve(sample_sizes.size());
+  for (size_t i = 0; i < sample_sizes.size(); ++i) {
+    TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
+    if (delta != sample_sizes[i]) fatal("sample is too large");
+    if (delta == 0) fatal("0-length samples are prohibited");
+    TextIdx next_total = total + delta;
+    if (next_total <= total) fatal("corpus is too large");
+    total = next_total;
+    offsets.push_back(total);
+  }
+
+  if (total < sliceLen) fatal("slice_len is larger than corpus size");
+  TextIdx counter = 1;
+  TextIdx end = total - sliceLen + 1;
+  std::vector<TextIdx> sliceMap(total);
+  TextIdx last = 0;
+  TextIdx current = 1;
+  while (current <= total) {
+    if (lcp[current - 1] < sliceLen) {
+      for (TextIdx i = last; i < current; ++i) {
+        sliceMap[sa[i]] = counter;
+      }
+      counter++;
+      last = current;
+    }
+    current++;
+  }
+  sliceMap.resize(end);
+
+  // Reorder items for the better locality.
+  std::vector<TextIdx> reorder(counter);
+  counter = 1;
+  for (TextIdx i = 0; i < end; ++i) {
+    if (reorder[sliceMap[i]] == 0) {
+      reorder[sliceMap[i]] = counter++;
+    }
+  }
+  for (TextIdx i = 0; i < end; ++i) {
+    sliceMap[i] = reorder[sliceMap[i]];
+  }
+
+  return {total, sliceLen, counter, std::move(offsets), std::move(sliceMap)};
+}
+
+DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data) {
+  TextIdx total = static_cast<TextIdx>(data.size());
+  if (total != data.size()) fatal("corpus is too large");
+  saidx_t saTotal = static_cast<saidx_t>(total);
+  if (saTotal < 0) fatal("corpus is too large");
+  if (static_cast<TextIdx>(saTotal) != total) fatal("corpus is too large");
+  std::vector<TextIdx> sa(total);
+  /* Hopefully, non-negative int32_t values match TextIdx ones. */
+  if (sizeof(TextIdx) != sizeof(int32_t)) fatal("type length mismatch");
+  int32_t* saData = reinterpret_cast<int32_t*>(sa.data());
+  divsufsort(data.data(), saData, saTotal);
+
+  std::vector<TextIdx> isa(total);
+  for (TextIdx i = 0; i < total; ++i) isa[sa[i]] = i;
+
+  // TODO: borrowed -> unknown efficiency.
+  std::vector<TextIdx> lcp(total);
+  TextIdx k = 0;
+  lcp[total - 1] = 0;
+  for (TextIdx i = 0; i < total; ++i) {
+    TextIdx current = isa[i];
+    if (current == total - 1) {
+      k = 0;
+      continue;
+    }
+    TextIdx j = sa[current + 1];  // Suffix which follow i-th suffix.
+    while ((i + k < total) && (j + k < total) && (data[i + k] == data[j + k])) {
+      ++k;
+    }
+    lcp[current] = k;
+    if (k > 0) --k;
+  }
+
+  return {std::move(lcp), std::move(sa)};
+}
+
+static void ScoreSlices(const std::vector<TextIdx>& offsets,
+    std::vector<MetaSlot>& map, const TextIdx* shortcut, TextIdx end) {
+  TextIdx piece = 0;
+  /* Fresh map contains all zeroes -> initial mark should be different. */
+  TextIdx mark = 1;
+  for (TextIdx i = 0; i < end; ++i) {
+    if (offsets[piece] == i) {
+      piece++;
+      mark++;
+    }
+    MetaSlot& item = map[shortcut[i]];
+    if (item.mark != mark) {
+      item.mark = mark;
+      item.score++;
+    }
+  }
+}
+
+static std::string durchschlagGenerateExclusive(
+    size_t dictionary_size_limit, size_t block_len,
+    const DurchschlagContext& context, const uint8_t* sample_data) {
+  /* Parameters aliasing */
+  TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
+  if (targetSize != dictionary_size_limit) {
+    fprintf(stderr, "dictionary_size_limit is too large\n");
+    return "";
+  }
+  TextIdx sliceLen = context.sliceLen;
+  TextIdx total = context.dataSize;
+  TextIdx blockLen = static_cast<TextIdx>(block_len);
+  if (blockLen != block_len) {
+    fprintf(stderr, "block_len is too large\n");
+    return "";
+  }
+  const uint8_t* data = sample_data;
+  const std::vector<TextIdx>& offsets = context.offsets;
+  std::vector<MetaSlot> map(context.numUniqueSlices);
+  const TextIdx* shortcut = context.sliceMap.data();
+
+  /* Initialization */
+  if (blockLen < sliceLen) {
+    fprintf(stderr, "sliceLen is larger than block_len\n");
+    return "";
+  }
+  if (targetSize < blockLen || total < blockLen) {
+    fprintf(stderr, "block_len is too large\n");
+    return "";
+  }
+  TextIdx end = total - sliceLen + 1;
+  ScoreSlices(offsets, map, shortcut, end);
+  end = total - blockLen + 1;
+  std::vector<TextIdx> candidates;
+  std::vector<TextIdx> next(end);
+  TextIdx span = blockLen - sliceLen + 1;
+  Score maxScore = rebuildCandidatesList(
+      &candidates, &map, span, shortcut, end, next.data());
+
+  /* Block selection */
+  const size_t triesLimit = (600 * 1000000) / span;
+  const size_t candidatesLimit = (150 * 1000000) / span;
+  std::vector<Range> ranges;
+  TextIdx mark = 0;
+  size_t numTries = 0;
+  while (true) {
+    TextIdx dictSize = calculateDictionarySize(ranges);
+    size_t numCandidates = 0;
+    if (dictSize > targetSize - blockLen) {
+      break;
+    }
+    if (maxScore == 0) {
+      break;
+    }
+    while (true) {
+      TextIdx candidate = 0;
+      while (maxScore > 0) {
+        if (candidates[maxScore] != 0) {
+          candidate = candidates[maxScore];
+          candidates[maxScore] = next[candidate];
+          break;
+        }
+        maxScore--;
+      }
+      if (maxScore == 0) {
+        break;
+      }
+      mark++;
+      numTries++;
+      numCandidates++;
+      Score score = 0;
+      for (size_t j = candidate; j <= candidate + span; ++j) {
+        MetaSlot& item = map[shortcut[j]];
+        if (item.mark != mark) {
+          score += item.score;
+          item.mark = mark;
+        }
+      }
+      if (score < maxScore) {
+        if (numTries < triesLimit && numCandidates < candidatesLimit) {
+          next[candidate] = candidates[score];
+          candidates[score] = candidate;
+        } else {
+          maxScore = rebuildCandidatesList(
+              &candidates, &map, span, shortcut, end, next.data());
+          mark = 0;
+          numTries = 0;
+          numCandidates = 0;
+        }
+        continue;
+      } else if (score > maxScore) {
+        fprintf(stderr, "Broken invariant\n");
+        return "";
+      }
+      for (TextIdx j = candidate; j <= candidate + span; ++j) {
+        MetaSlot& item = map[shortcut[j]];
+        item.score = 0;
+      }
+      addRange(&ranges, candidate, candidate + blockLen);
+      break;
+    }
+  }
+
+  return createDictionary(data, ranges, targetSize);
+}
+
+static std::string durchschlagGenerateCollaborative(
+    size_t dictionary_size_limit, size_t block_len,
+    const DurchschlagContext& context, const uint8_t* sample_data) {
+  /* Parameters aliasing */
+  TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
+  if (targetSize != dictionary_size_limit) {
+    fprintf(stderr, "dictionary_size_limit is too large\n");
+    return "";
+  }
+  TextIdx sliceLen = context.sliceLen;
+  TextIdx total = context.dataSize;
+  TextIdx blockLen = static_cast<TextIdx>(block_len);
+  if (blockLen != block_len) {
+    fprintf(stderr, "block_len is too large\n");
+    return "";
+  }
+  const uint8_t* data = sample_data;
+  const std::vector<TextIdx>& offsets = context.offsets;
+  std::vector<MetaSlot> map(context.numUniqueSlices);
+  const TextIdx* shortcut = context.sliceMap.data();
+
+  /* Initialization */
+  if (blockLen < sliceLen) {
+    fprintf(stderr, "sliceLen is larger than block_len\n");
+    return "";
+  }
+  if (targetSize < blockLen || total < blockLen) {
+    fprintf(stderr, "block_len is too large\n");
+    return "";
+  }
+  TextIdx end = total - sliceLen + 1;
+  ScoreSlices(offsets, map, shortcut, end);
+  end = total - blockLen + 1;
+  std::vector<Candidate> candidates;
+  candidates.reserve(CANDIDATE_BUNDLE_SIZE + 1024);
+  TextIdx span = blockLen - sliceLen + 1;
+  Score minScore = buildCandidatesList(&candidates, &map, span, shortcut, end);
+
+  /* Block selection */
+  std::vector<Range> ranges;
+  TextIdx mark = 0;
+  while (true) {
+    TextIdx dictSize = calculateDictionarySize(ranges);
+    if (dictSize > targetSize - blockLen) {
+      break;
+    }
+    if (minScore == 0 && candidates.empty()) {
+      break;
+    }
+    while (true) {
+      if (candidates.empty()) {
+        minScore = buildCandidatesList(&candidates, &map, span, shortcut, end);
+        mark = 0;
+      }
+      TextIdx candidate = candidates[0].position;
+      Score expectedScore = candidates[0].score;
+      if (expectedScore == 0) {
+        candidates.resize(0);
+        break;
+      }
+      std::pop_heap(candidates.begin(), candidates.end(), lessScore());
+      candidates.pop_back();
+      mark++;
+      Score score = 0;
+      for (TextIdx j = candidate; j <= candidate + span; ++j) {
+        MetaSlot& item = map[shortcut[j]];
+        if (item.mark != mark) {
+          score += item.score;
+          item.mark = mark;
+        }
+      }
+      if (score < expectedScore) {
+        if (score >= minScore) {
+          candidates.push_back({score, candidate});
+          std::push_heap(candidates.begin(), candidates.end(), lessScore());
+        }
+        continue;
+      } else if (score > expectedScore) {
+        fatal("Broken invariant");
+      }
+      for (TextIdx j = candidate; j <= candidate + span; ++j) {
+        MetaSlot& item = map[shortcut[j]];
+        item.score = 0;
+      }
+      addRange(&ranges, candidate, candidate + blockLen);
+      break;
+    }
+  }
+
+  return createDictionary(data, ranges, targetSize);
+}
+
+std::string durchschlag_generate(DurchschalgResourceStrategy strategy,
+    size_t dictionary_size_limit, size_t block_len,
+    const DurchschlagContext& context, const uint8_t* sample_data) {
+  if (strategy == DURCHSCHLAG_COLLABORATIVE) {
+    return durchschlagGenerateCollaborative(
+        dictionary_size_limit, block_len, context, sample_data);
+  } else {
+    return durchschlagGenerateExclusive(
+        dictionary_size_limit, block_len, context, sample_data);
+  }
+}
+
+void durchschlag_distill(size_t slice_len, size_t minimum_population,
+    std::vector<size_t>* sample_sizes, uint8_t* sample_data) {
+  /* Parameters aliasing */
+  uint8_t* data = sample_data;
+
+  /* Build slice map. */
+  DurchschlagContext context = durchschlag_prepare(
+      slice_len, *sample_sizes, data);
+
+  /* Calculate slice population. */
+  const std::vector<TextIdx>& offsets = context.offsets;
+  std::vector<MetaSlot> map(context.numUniqueSlices);
+  const TextIdx* shortcut = context.sliceMap.data();
+  TextIdx sliceLen = context.sliceLen;
+  TextIdx total = context.dataSize;
+  TextIdx end = total - sliceLen + 1;
+  ScoreSlices(offsets, map, shortcut, end);
+
+  /* Condense samples, omitting unique slices. */
+  TextIdx readPos = 0;
+  TextIdx writePos = 0;
+  TextIdx lastNonUniquePos = 0;
+  for (TextIdx i = 0; i < sample_sizes->size(); ++i) {
+    TextIdx sampleStart = writePos;
+    TextIdx oldSampleEnd =
+        readPos + static_cast<TextIdx>(sample_sizes->at(i));
+    while (readPos < oldSampleEnd) {
+      if (readPos < end) {
+        MetaSlot& item = map[shortcut[readPos]];
+        if (item.score >= minimum_population) {
+          lastNonUniquePos = readPos + sliceLen;
+        }
+      }
+      if (readPos < lastNonUniquePos) {
+        data[writePos++] = data[readPos];
+      }
+      readPos++;
+    }
+    sample_sizes->at(i) = writePos - sampleStart;
+  }
+}
+
+void durchschlag_purify(size_t slice_len, size_t minimum_population,
+    const std::vector<size_t>& sample_sizes, uint8_t* sample_data) {
+  /* Parameters aliasing */
+  uint8_t* data = sample_data;
+
+  /* Build slice map. */
+  DurchschlagContext context = durchschlag_prepare(
+      slice_len, sample_sizes, data);
+
+  /* Calculate slice population. */
+  const std::vector<TextIdx>& offsets = context.offsets;
+  std::vector<MetaSlot> map(context.numUniqueSlices);
+  const TextIdx* shortcut = context.sliceMap.data();
+  TextIdx sliceLen = context.sliceLen;
+  TextIdx total = context.dataSize;
+  TextIdx end = total - sliceLen + 1;
+  ScoreSlices(offsets, map, shortcut, end);
+
+  /* Rewrite samples, zeroing out unique slices. */
+  TextIdx lastNonUniquePos = 0;
+  for (TextIdx readPos = 0; readPos < total; ++readPos) {
+    if (readPos < end) {
+      MetaSlot& item = map[shortcut[readPos]];
+      if (item.score >= minimum_population) {
+        lastNonUniquePos = readPos + sliceLen;
+      }
+    }
+    if (readPos >= lastNonUniquePos) {
+      data[readPos] = 0;
+    }
+  }
+}
diff --git a/research/durchschlag.h b/research/durchschlag.h
new file mode 100755
index 0000000..adbc531
--- /dev/null
+++ b/research/durchschlag.h
@@ -0,0 +1,99 @@
+#ifndef BROTLI_RESEARCH_DURCHSCHLAG_H_
+#define BROTLI_RESEARCH_DURCHSCHLAG_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+/**
+ * Generate a dictionary for given samples.
+ *
+ * @param dictionary_size_limit maximal dictionary size
+ * @param slice_len text slice size
+ * @param block_len score block length
+ * @param sample_sizes vector with sample sizes
+ * @param sample_data concatenated samples
+ * @return generated dictionary
+ */
+std::string durchschlag_generate(
+    size_t dictionary_size_limit, size_t slice_len, size_t block_len,
+    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
+
+//------------------------------------------------------------------------------
+// Lower level API for repetitive dictionary generation.
+//------------------------------------------------------------------------------
+
+/* Pointer to position in text. */
+typedef uint32_t DurchschlagTextIdx;
+
+/* Context is made public for flexible serialization / deserialization. */
+typedef struct DurchschlagContext {
+  DurchschlagTextIdx dataSize;
+  DurchschlagTextIdx sliceLen;
+  DurchschlagTextIdx numUniqueSlices;
+  std::vector<DurchschlagTextIdx> offsets;
+  std::vector<DurchschlagTextIdx> sliceMap;
+} DurchschlagContext;
+
+DurchschlagContext durchschlag_prepare(size_t slice_len,
+    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
+
+typedef enum DurchschalgResourceStrategy {
+  // Faster
+  DURCHSCHLAG_EXCLUSIVE = 0,
+  // Uses much less memory
+  DURCHSCHLAG_COLLABORATIVE = 1
+} DurchschalgResourceStrategy;
+
+std::string durchschlag_generate(DurchschalgResourceStrategy strategy,
+    size_t dictionary_size_limit, size_t block_len,
+    const DurchschlagContext& context, const uint8_t* sample_data);
+
+//------------------------------------------------------------------------------
+// Suffix Array based preparation.
+//------------------------------------------------------------------------------
+
+typedef struct DurchschlagIndex {
+  std::vector<DurchschlagTextIdx> lcp;
+  std::vector<DurchschlagTextIdx> sa;
+} DurchschlagIndex;
+
+DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data);
+
+DurchschlagContext durchschlag_prepare(size_t slice_len,
+    const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index);
+
+//------------------------------------------------------------------------------
+// Data preparation.
+//------------------------------------------------------------------------------
+
+/**
+ * Cut out unique slices.
+ *
+ * Both @p sample_sizes and @p sample_data are modified in-place. Number of
+ * samples remains unchanged, but some samples become shorter.
+ *
+ * @param slice_len (unique) slice size
+ * @param minimum_population minimum non-unique slice occurrence
+ * @param sample_sizes [in / out] vector with sample sizes
+ * @param sample_data [in / out] concatenated samples
+ */
+void durchschlag_distill(size_t slice_len, size_t minimum_population,
+    std::vector<size_t>* sample_sizes, uint8_t* sample_data);
+
+/**
+ * Replace unique slices with zeroes.
+ *
+ * @p sample_data is modified in-place. Number of samples and their length
+ * remain unchanged.
+ *
+ * @param slice_len (unique) slice size
+ * @param minimum_population minimum non-unique slice occurrence
+ * @param sample_sizes vector with sample sizes
+ * @param sample_data [in / out] concatenated samples
+ */
+void durchschlag_purify(size_t slice_len, size_t minimum_population,
+    const std::vector<size_t>& sample_sizes, uint8_t* sample_data);
+
+#endif  // BROTLI_RESEARCH_DURCHSCHLAG_H_
diff --git a/research/libdivsufsort b/research/libdivsufsort
new file mode 160000
+Subproject 5f60d6f026c30fb4ac296f696b3c8b0eb71bd42
diff --git a/research/sieve.cc b/research/sieve.cc
index fbc1dbf..4d147e1 100755
--- a/research/sieve.cc
+++ b/research/sieve.cc
@@ -1,19 +1,27 @@
 #include "./sieve.h"
 
+/* Pointer to position in (combined corpus) text. */
+typedef uint32_t TextIdx;
+
+/* Index of sample / generation. */
+typedef uint16_t SampleIdx;
+
 typedef struct Slot {
-  uint32_t next;
-  uint32_t offset;
-  uint16_t presence;
-  uint16_t mark;
+  TextIdx next;
+  TextIdx offset;
+  SampleIdx presence;
+  SampleIdx mark;
 } Slot;
 
-static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end,
-    size_t middle, uint16_t minPresence, uint16_t iteration) {
-  int from = -2;
-  int to = -1;
-  size_t result = 0;
-  uint16_t targetPresence = minPresence;
-  for (uint32_t i = 0; i < end; ++i) {
+static const TextIdx kNowhere = static_cast<TextIdx>(-1);
+
+static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut,
+    TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) {
+  TextIdx from = kNowhere;
+  TextIdx to = kNowhere;
+  TextIdx result = 0;
+  SampleIdx targetPresence = minPresence;
+  for (TextIdx i = 0; i < end; ++i) {
     if (i == middle) {
       targetPresence++;
     }
@@ -21,8 +29,8 @@ static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end,
     if (item.mark != iteration) {
       item.mark = iteration;
       if (item.presence >= targetPresence) {
-        if (to < i) {
-          if (from > 0) {
+        if ((to == kNowhere) || (to < i)) {
+          if (from != kNowhere) {
             result += to - from;
           }
           from = i;
@@ -31,20 +39,20 @@ static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end,
       }
     }
   }
-  if (from > 0) {
+  if (from != kNowhere) {
     result += to - from;
   }
   return result;
 }
 
-static std::string createDictionary(const uint8_t* data, size_t sliceLen,
-    Slot* map, uint32_t* shortcut, size_t end, size_t middle,
-    uint16_t minPresence, uint16_t iteration) {
+static std::string createDictionary(const uint8_t* data, TextIdx sliceLen,
+    Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle,
+    SampleIdx minPresence, SampleIdx iteration) {
   std::string output;
-  int from = -2;
-  int to = -1;
-  uint16_t targetPresence = minPresence;
-  for (uint32_t i = 0; i < end; ++i) {
+  TextIdx from = kNowhere;
+  TextIdx to = kNowhere;
+  SampleIdx targetPresence = minPresence;
+  for (TextIdx i = 0; i < end; ++i) {
     if (i == middle) {
       targetPresence++;
     }
@@ -52,8 +60,8 @@ static std::string createDictionary(const uint8_t* data, size_t sliceLen,
     if (item.mark != iteration) {
       item.mark = iteration;
       if (item.presence >= targetPresence) {
-        if (to < i) {
-          if (from > 0) {
+        if ((to == kNowhere) || (to < i)) {
+          if (from != kNowhere) {
             output.insert(output.end(), &data[from], &data[to]);
           }
           from = i;
@@ -62,7 +70,7 @@ static std::string createDictionary(const uint8_t* data, size_t sliceLen,
       }
     }
   }
-  if (from > 0) {
+  if (from != kNowhere) {
     output.insert(output.end(), &data[from], &data[to]);
   }
   return output;
@@ -71,55 +79,95 @@ static std::string createDictionary(const uint8_t* data, size_t sliceLen,
 std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
     const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
   /* Parameters aliasing */
-  size_t targetSize = dictionary_size_limit;
-  size_t sliceLen = slice_len;
+  TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
+  if (targetSize != dictionary_size_limit) {
+    fprintf(stderr, "dictionary_size_limit is too large\n");
+    return "";
+  }
+  TextIdx sliceLen = static_cast<TextIdx>(slice_len);
+  if (sliceLen != slice_len) {
+    fprintf(stderr, "slice_len is too large\n");
+    return "";
+  }
+  if (sliceLen < 1) {
+    fprintf(stderr, "slice_len is too small\n");
+    return "";
+  }
+  SampleIdx numSamples = static_cast<SampleIdx>(sample_sizes.size());
+  if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) {
+    fprintf(stderr, "too many samples\n");
+    return "";
+  }
   const uint8_t* data = sample_data;
 
-  size_t total = 0;
-  std::vector<size_t> offsets;
-  for (size_t i = 0; i < sample_sizes.size(); ++i) {
-    total += sample_sizes[i];
+  TextIdx total = 0;
+  std::vector<TextIdx> offsets;
+  for (SampleIdx i = 0; i < numSamples; ++i) {
+    TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
+    if (delta != sample_sizes[i]) {
+      fprintf(stderr, "sample is too large\n");
+      return "";
+    }
+    if (delta == 0) {
+      fprintf(stderr, "empty samples are prohibited\n");
+      return "";
+    }
+    if (total + delta <= total) {
+      fprintf(stderr, "corpus is too large\n");
+      return "";
+    }
+    total += delta;
     offsets.push_back(total);
   }
 
+  if (total * 2 < total) {
+    fprintf(stderr, "corpus is too large\n");
+    return "";
+  }
+
+  if (total < sliceLen) {
+    fprintf(stderr, "slice_len is larger than corpus size\n");
+    return "";
+  }
+
   /*****************************************************************************
    * Build coverage map.
    ****************************************************************************/
   std::vector<Slot> map;
-  std::vector<uint32_t> shortcut;
+  std::vector<TextIdx> shortcut;
   map.push_back({0, 0, 0, 0});
-  size_t end = total - sliceLen;
-  int hashLen = 8;
-  while ((1 << hashLen) < end) {
+  TextIdx end = total - sliceLen;
+  TextIdx hashLen = 11;
+  while (hashLen < 29 && ((1u << hashLen) < end)) {
     hashLen += 3;
   }
   hashLen -= 3;
-  uint32_t hashMask = (1u << hashLen) - 1u;
-  std::vector<uint32_t> hashHead(1 << hashLen);
-  uint32_t hashSlot = 1;
-  uint16_t piece = 0;
-  uint32_t hash = 0;
-  int lShift = 3;
-  int rShift = hashLen - lShift;
-  for (int i = 0; i < sliceLen - 1; ++i) {
-    uint32_t v = data[i];
+  TextIdx hashMask = (1u << hashLen) - 1u;
+  std::vector<TextIdx> hashHead(1 << hashLen);
+  TextIdx hashSlot = 1;
+  SampleIdx piece = 0;
+  TextIdx hash = 0;
+  TextIdx lShift = 3;
+  TextIdx rShift = hashLen - lShift;
+  for (TextIdx i = 0; i < sliceLen - 1; ++i) {
+    TextIdx v = data[i];
     hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
   }
-  int lShiftX = (lShift * (sliceLen - 1)) % hashLen;
-  int rShiftX = hashLen - lShiftX;
-  for (uint32_t i = 0; i < end; ++i) {
-    uint32_t v = data[i + sliceLen - 1];
+  TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
+  TextIdx rShiftX = hashLen - lShiftX;
+  for (TextIdx i = 0; i < end; ++i) {
+    TextIdx v = data[i + sliceLen - 1];
     hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
 
     if (offsets[piece] == i) {
       piece++;
     }
-    uint32_t slot = hashHead[hash];
+    TextIdx slot = hashHead[hash];
     while (slot != 0) {
       Slot& item = map[slot];
-      int start = item.offset;
+      TextIdx start = item.offset;
       bool miss = false;
-      for (size_t j = 0; j < sliceLen; ++j) {
+      for (TextIdx j = 0; j < sliceLen; ++j) {
         if (data[i + j] != data[start + j]) {
           miss = true;
           break;
@@ -148,8 +196,8 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
   /*****************************************************************************
    * Build dictionary of specified size.
    ****************************************************************************/
-  size_t a = 1;
-  size_t size = dryRun(
+  SampleIdx a = 1;
+  TextIdx size = dryRun(
       sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
   /* Maximal output is smaller than target. */
   if (size <= targetSize) {
@@ -157,7 +205,7 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
         data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
   }
 
-  size_t b = offsets.size();
+  SampleIdx b = numSamples;
   size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
   if (size == targetSize) {
     return createDictionary(
@@ -167,7 +215,7 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
   if (size < targetSize) {
     /* size(a) > targetSize > size(b) && a < m < b */
     while (a + 1 < b) {
-      size_t m = (a + b) / 2;
+      SampleIdx m = static_cast<SampleIdx>((a + b) / 2);
       size = dryRun(
           sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
       if (size < targetSize) {
@@ -183,18 +231,18 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
     a = b;
   }
   /* size(minPresence) > targetSize > size(minPresence + 1) */
-  size_t minPresence = a;
-  a = 0;
-  b = end;
+  SampleIdx minPresence = a;
+  TextIdx c = 0;
+  TextIdx d = end;
   /* size(a) < targetSize < size(b) && a < m < b */
-  while (a + 1 < b) {
-    size_t m = (a + b) / 2;
+  while (c + 1 < d) {
+    TextIdx m = (c + d) / 2;
     size = dryRun(
         sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
     if (size < targetSize) {
-      a = m;
+      c = m;
     } else if (size > targetSize) {
-      b = m;
+      d = m;
     } else {
       return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
           m, minPresence, ++piece);
@@ -204,8 +252,8 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
   bool unrestricted = false;
   if (minPresence <= 2 && !unrestricted) {
     minPresence = 2;
-    a = end;
+    c = end;
   }
-  return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, a,
+  return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c,
       minPresence, ++piece);
 }
diff --git a/research/sieve.h b/research/sieve.h
index 2aae669..6c65dc8 100755
--- a/research/sieve.h
+++ b/research/sieve.h
@@ -1,9 +1,8 @@
 #ifndef BROTLI_RESEARCH_SIEVE_H_
 #define BROTLI_RESEARCH_SIEVE_H_
 
-#include <stddef.h>
-#include <stdint.h>
-
+#include <cstddef>
+#include <cstdint>
 #include <string>
 #include <vector>
author	Eugene Kliuchnikov <eustas@google.com>	2018-02-26 09:04:36 -0500
committer	GitHub <noreply@github.com>	2018-02-26 09:04:36 -0500
commit	35e69fc7cf9421ab04ffc9d52cb36d07fa12984a (patch)
tree	a1ed614391936d455da2b0610ef8e8caf88b4289 /research
parent	3af18990f50d8f040038aaa08c41f5d27d62efb5 (diff)
download	brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.zip brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.gz brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.bz2