aboutsummaryrefslogtreecommitdiff
path: root/research
diff options
context:
space:
mode:
authorEugene Kliuchnikov <eustas@google.com>2018-02-26 09:04:36 -0500
committerGitHub <noreply@github.com>2018-02-26 09:04:36 -0500
commit35e69fc7cf9421ab04ffc9d52cb36d07fa12984a (patch)
treea1ed614391936d455da2b0610ef8e8caf88b4289 /research
parent3af18990f50d8f040038aaa08c41f5d27d62efb5 (diff)
downloadbrotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.zip
brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.gz
brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.bz2
New feature: "Large Window Brotli" (#640)
* New feature: "Large Window Brotli" By setting special encoder/decoder flag it is now possible to extend LZ-window up to 30 bits; though produced stream will not be RFC7932 compliant. Added new dictionary generator - "DSH". It combines speed of "Sieve" and quality of "DM". Plus utilities to prepare train corpora (remove unique strings). Improved compression ratio: now two sub-blocks could be stitched: the last copy command could be extended to span the next sub-block. Fixed compression ineffectiveness caused by floating numbers rounding and wrong cost heuristic. Other C changes: - combined / moved `context.h` to `common` - moved transforms to `common` - unified some aspects of code formatting - added an abstraction for encoder (static) dictionary - moved default allocator/deallocator functions to `common` brotli CLI: - window size is auto-adjusted if not specified explicitly Java: - added "eager" decoding both to JNI wrapper and pure decoder - huge speed-up of `DictionaryData` initialization * Add dictionaryless compressed dictionary * Fix `sources.lst` * Fix `sources.lst` and add a note that `libtool` is also required. * Update setup.py * Fix `EagerStreamTest` * Fix BUILD file * Add missing `libdivsufsort` dependency * Fix "unused parameter" warning.
Diffstat (limited to 'research')
-rwxr-xr-xresearch/BUILD8
-rw-r--r--research/BUILD.libdivsufsort55
-rw-r--r--research/deorummolae.cc173
-rw-r--r--research/deorummolae.h9
-rwxr-xr-xresearch/dictionary_generator.cc119
-rw-r--r--research/draw_diff.cc21
-rwxr-xr-xresearch/durchschlag.cc714
-rwxr-xr-xresearch/durchschlag.h99
m---------research/libdivsufsort0
-rwxr-xr-xresearch/sieve.cc174
-rwxr-xr-xresearch/sieve.h5
11 files changed, 1206 insertions, 171 deletions
diff --git a/research/BUILD b/research/BUILD
index 6ff5ac2..211b3e7 100755
--- a/research/BUILD
+++ b/research/BUILD
@@ -14,6 +14,13 @@ cc_library(
)
cc_library(
+ name = "durchschlag",
+ srcs = ["durchschlag.cc"],
+ hdrs = ["durchschlag.h"],
+ deps = ["@divsufsort//:libdivsufsort"],
+)
+
+cc_library(
name = "sieve",
srcs = ["sieve.cc"],
hdrs = ["sieve.h"],
@@ -24,6 +31,7 @@ cc_binary(
srcs = ["dictionary_generator.cc"],
deps = [
":dm",
+ ":durchschlag",
":sieve",
],
)
diff --git a/research/BUILD.libdivsufsort b/research/BUILD.libdivsufsort
new file mode 100644
index 0000000..ce60e9c
--- /dev/null
+++ b/research/BUILD.libdivsufsort
@@ -0,0 +1,55 @@
+package(
+ default_visibility = ["//visibility:public"],
+)
+
+cc_library(
+ name = "libdivsufsort",
+ srcs = [
+ "lib/divsufsort.c",
+ "lib/sssort.c",
+ "lib/trsort.c",
+ "lib/utils.c",
+ ],
+ hdrs = [
+ "include/config.h",
+ "include/divsufsort.h",
+ "include/divsufsort_private.h",
+ ],
+ copts = [
+ "-DHAVE_CONFIG_H=1",
+ ],
+ includes = ["include"],
+)
+
+commom_awk_replaces = (
+ "gsub(/#cmakedefine/, \"#define\"); " +
+ "gsub(/@DIVSUFSORT_EXPORT@/, \"\"); " +
+ "gsub(/@DIVSUFSORT_IMPORT@/, \"\"); " +
+ "gsub(/@INLINE@/, \"inline\"); " +
+ "gsub(/@INCFILE@/, \"#include <inttypes.h>\"); " +
+ "gsub(/@SAUCHAR_TYPE@/, \"uint8_t\"); " +
+ "gsub(/@SAINT32_TYPE@/, \"int32_t\"); " +
+ "gsub(/@SAINT_PRId@/, \"PRId32\"); "
+)
+
+genrule(
+ name = "config_h",
+ srcs = ["include/config.h.cmake"],
+ outs = ["include/config.h"],
+ cmd = ("awk '{ " +
+ "gsub(/@HAVE_IO_H 1@/, \"HAVE_IO_H 0\"); " +
+ commom_awk_replaces +
+ "print; }' $(<) > $(@)"),
+)
+
+genrule(
+ name = "divsufsort_h",
+ srcs = ["include/divsufsort.h.cmake"],
+ outs = ["include/divsufsort.h"],
+ cmd = ("awk '{ " +
+ "gsub(/@W64BIT@/, \"\"); " +
+ "gsub(/@SAINDEX_TYPE@/, \"int32_t\"); " +
+ "gsub(/@SAINDEX_PRId@/, \"PRId32\"); " +
+ commom_awk_replaces +
+ "print; }' $(<) > $(@)"),
+)
diff --git a/research/deorummolae.cc b/research/deorummolae.cc
index c53a53c..d15b7ee 100644
--- a/research/deorummolae.cc
+++ b/research/deorummolae.cc
@@ -15,20 +15,31 @@
/* Non tunable definitions. */
#define CHUNK_MASK (CHUNK_SIZE - 1)
-#define COVERAGE_SIZE (1 << (LOG_MAX_FILES - 6))
+#define COVERAGE_SIZE (1 << (DM_LOG_MAX_FILES - 6))
/* File coverage: every bit set to 1 denotes a file covered by an isle. */
typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;
-static int popcount(uint64_t u) { return __builtin_popcountll(u); }
+/* Symbol of text alphabet. */
+typedef int32_t TextChar;
+
+/* Pointer to position in text. */
+typedef uint32_t TextIdx;
+
+/* SAIS sarray_type; unfortunately, must be a signed type. */
+typedef int32_t TextSaIdx;
+
+static size_t popcount(uint64_t u) {
+ return static_cast<size_t>(__builtin_popcountll(u));
+}
/* Condense terminators and pad file entries. */
-static void rewriteText(std::vector<int>* text) {
- int terminator = text->back();
- int prev = terminator;
- size_t to = 0;
- for (size_t from = 0; from < text->size(); ++from) {
- int next = text->at(from);
+static void rewriteText(std::vector<TextChar>* text) {
+ TextChar terminator = text->back();
+ TextChar prev = terminator;
+ TextIdx to = 0;
+ for (TextIdx from = 0; from < text->size(); ++from) {
+ TextChar next = text->at(from);
if (next < 256 || prev < 256) {
text->at(to++) = next;
if (next >= 256) terminator = next;
@@ -41,11 +52,12 @@ static void rewriteText(std::vector<int>* text) {
}
/* Reenumerate terminators for smaller alphabet. */
-static void remapTerminators(std::vector<int>* text, int* next_terminator) {
- int prev = -1;
- int x = 256;
- for (size_t i = 0; i < text->size(); ++i) {
- int next = text->at(i);
+static void remapTerminators(std::vector<TextChar>* text,
+ TextChar* next_terminator) {
+ TextChar prev = -1;
+ TextChar x = 256;
+ for (TextIdx i = 0; i < text->size(); ++i) {
+ TextChar next = text->at(i);
if (next < 256) { // Char.
// Do nothing.
} else if (prev < 256) { // Terminator after char.
@@ -60,15 +72,15 @@ static void remapTerminators(std::vector<int>* text, int* next_terminator) {
}
/* Combine all file entries; create mapping position->file. */
-static void buildFullText(std::vector<std::vector<int>>* data,
- std::vector<int>* full_text, std::vector<size_t>* file_map,
- std::vector<size_t>* file_offset, int* next_terminator) {
+static void buildFullText(std::vector<std::vector<TextChar>>* data,
+ std::vector<TextChar>* full_text, std::vector<TextIdx>* file_map,
+ std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
file_map->resize(0);
file_offset->resize(0);
full_text->resize(0);
- for (size_t i = 0; i < data->size(); ++i) {
+ for (TextIdx i = 0; i < data->size(); ++i) {
file_offset->push_back(full_text->size());
- std::vector<int>& file = data->at(i);
+ std::vector<TextChar>& file = data->at(i);
rewriteText(&file);
full_text->insert(full_text->end(), file.begin(), file.end());
file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i);
@@ -78,18 +90,19 @@ static void buildFullText(std::vector<std::vector<int>>* data,
/* Build longest-common-prefix based on suffix array and text.
TODO: borrowed -> unknown efficiency. */
-static void buildLcp(std::vector<int>* text, std::vector<int>* sa,
- std::vector<int>* lcp, std::vector<int>* invese_sa) {
- int size = static_cast<int>(text->size());
+static void buildLcp(std::vector<TextChar>* text, std::vector<TextIdx>* sa,
+ std::vector<TextIdx>* lcp, std::vector<TextIdx>* invese_sa) {
+ TextIdx size = static_cast<TextIdx>(text->size());
lcp->resize(size);
- int k = 0;
+ TextIdx k = 0;
lcp->at(size - 1) = 0;
- for (int i = 0; i < size; ++i) {
+ for (TextIdx i = 0; i < size; ++i) {
if (invese_sa->at(i) == size - 1) {
k = 0;
continue;
}
- int j = sa->at(invese_sa->at(i) + 1); // Suffix which follow i-th suffix.
+ // Suffix which follow i-th suffix.
+ TextIdx j = sa->at(invese_sa->at(i) + 1);
while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) {
++k;
}
@@ -102,21 +115,21 @@ static void buildLcp(std::vector<int>* text, std::vector<int>* sa,
When we raise the LCP requirement, the isle sunks and smaller isles appear
instead. */
typedef struct {
- int lcp;
- int l;
- int r;
+ TextIdx lcp;
+ TextIdx l;
+ TextIdx r;
Coverage coverage;
} Isle;
/* Helper routine for `cutMatch`. */
-static void poisonData(int pos, int length, std::vector<std::vector<int>>* data,
- std::vector<size_t>* file_map, std::vector<size_t>* file_offset,
- int* next_terminator) {
- size_t f = file_map->at(pos / CHUNK_SIZE);
+static void poisonData(TextIdx pos, TextIdx length,
+ std::vector<std::vector<TextChar>>* data, std::vector<TextIdx>* file_map,
+ std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
+ TextIdx f = file_map->at(pos / CHUNK_SIZE);
pos -= file_offset->at(f);
- std::vector<int>& file = data->at(f);
- int l = (length == CUT_MATCH) ? CUT_MATCH : 1;
- for (int j = 0; j < l; j++, pos++) {
+ std::vector<TextChar>& file = data->at(f);
+ TextIdx l = (length == CUT_MATCH) ? CUT_MATCH : 1;
+ for (TextIdx j = 0; j < l; j++, pos++) {
if (file[pos] >= 256) continue;
if (file[pos + 1] >= 256) {
file[pos] = file[pos + 1];
@@ -131,12 +144,12 @@ static void poisonData(int pos, int length, std::vector<std::vector<int>>* data,
/* Remove substrings of a given match from files.
Substrings are replaced with unique terminators, so next iteration SA would
not allow to cross removed areas. */
-static void cutMatch(std::vector<std::vector<int>>* data, int index, int length,
- std::vector<int>* sa, std::vector<int>* lcp, std::vector<int>* invese_sa,
- int* next_terminator, std::vector<size_t>* file_map,
- std::vector<size_t>* file_offset) {
+static void cutMatch(std::vector<std::vector<TextChar>>* data, TextIdx index,
+ TextIdx length, std::vector<TextIdx>* sa, std::vector<TextIdx>* lcp,
+ std::vector<TextIdx>* invese_sa, TextChar* next_terminator,
+ std::vector<TextIdx>* file_map, std::vector<TextIdx>* file_offset) {
while (length >= CUT_MATCH) {
- int i = index;
+ TextIdx i = index;
while (lcp->at(i) >= length) {
i++;
poisonData(
@@ -156,54 +169,70 @@ static void cutMatch(std::vector<std::vector<int>>* data, int index, int length,
std::string DM_generate(size_t dictionary_size_limit,
const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
{
- uint64_t tmp = 0;
- if (popcount(tmp - 1u) != 64) {
- fprintf(stderr, "64-bit platform is required\n");
- return 0;
+ TextIdx tmp = static_cast<TextIdx>(dictionary_size_limit);
+ if ((tmp != dictionary_size_limit) || (tmp > 1u << 30)) {
+ fprintf(stderr, "dictionary_size_limit is too large\n");
+ return "";
}
}
/* Could use 256 + '0' for easier debugging. */
- int next_terminator = 256;
+ TextChar next_terminator = 256;
std::string output;
- std::vector<std::vector<int>> data;
+ std::vector<std::vector<TextChar>> data;
- size_t offset = 0;
+ TextIdx offset = 0;
size_t num_samples = sample_sizes.size();
- if (num_samples > MAX_FILES) num_samples = MAX_FILES;
+ if (num_samples > DM_MAX_FILES) num_samples = DM_MAX_FILES;
for (size_t n = 0; n < num_samples; ++n) {
- size_t next_offset = offset + sample_sizes[n];
+ TextIdx delta = static_cast<TextIdx>(sample_sizes[n]);
+ if (delta != sample_sizes[n]) {
+ fprintf(stderr, "sample is too large\n");
+ return "";
+ }
+ if (delta == 0) {
+ fprintf(stderr, "0-length samples are prohibited\n");
+ return "";
+ }
+ TextIdx next_offset = offset + delta;
+ if (next_offset <= offset) {
+ fprintf(stderr, "corpus is too large\n");
+ return "";
+ }
data.push_back(
- std::vector<int>(sample_data + offset, sample_data + next_offset));
+ std::vector<TextChar>(sample_data + offset, sample_data + next_offset));
offset = next_offset;
data.back().push_back(next_terminator++);
}
/* Most arrays are allocated once, and then just resized to smaller and
smaller sizes. */
- std::vector<int> full_text;
- std::vector<size_t> file_map;
- std::vector<size_t> file_offset;
- std::vector<int> sa;
- std::vector<int> invese_sa;
- std::vector<int> lcp;
+ std::vector<TextChar> full_text;
+ std::vector<TextIdx> file_map;
+ std::vector<TextIdx> file_offset;
+ std::vector<TextIdx> sa;
+ std::vector<TextIdx> invese_sa;
+ std::vector<TextIdx> lcp;
std::vector<Isle> isles;
std::vector<char> output_data;
- size_t total = 0;
- size_t total_cost = 0;
- size_t best_cost;
+ TextIdx total = 0;
+ TextIdx total_cost = 0;
+ TextIdx best_cost;
Isle best_isle;
- int min_count = num_samples;
+ size_t min_count = num_samples;
while (true) {
- size_t max_match = dictionary_size_limit - total;
+ TextIdx max_match = static_cast<TextIdx>(dictionary_size_limit) - total;
buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator);
sa.resize(full_text.size());
- saisxx(full_text.data(), sa.data(), static_cast<int>(full_text.size()),
- next_terminator);
+ /* Hopefully, non-negative TextSaIdx is the same sa TextIdx counterpart. */
+ saisxx(full_text.data(), reinterpret_cast<TextSaIdx*>(sa.data()),
+ static_cast<TextChar>(full_text.size()), next_terminator);
invese_sa.resize(full_text.size());
- for (int i = 0; i < full_text.size(); ++i) invese_sa[sa[i]] = i;
+ for (TextIdx i = 0; i < full_text.size(); ++i) {
+ invese_sa[sa[i]] = i;
+ }
buildLcp(&full_text, &sa, &lcp, &invese_sa);
/* Do not rebuild SA/LCP, just use different selection. */
@@ -213,22 +242,22 @@ std::string DM_generate(size_t dictionary_size_limit,
isles.resize(0);
isles.push_back(best_isle);
- for (int i = 0; i < static_cast<int>(lcp.size()); ++i) {
- int l = i;
+ for (TextIdx i = 0; i < lcp.size(); ++i) {
+ TextIdx l = i;
Coverage cov = {{0}};
- int f = file_map[sa[i] / CHUNK_SIZE];
- cov[f >> 6] = ((uint64_t)1) << (f & 63);
+ size_t f = file_map[sa[i] / CHUNK_SIZE];
+ cov[f >> 6] = (static_cast<uint64_t>(1)) << (f & 63);
while (lcp[i] < isles.back().lcp) {
Isle& top = isles.back();
top.r = i;
l = top.l;
for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x];
- int count = 0;
+ size_t count = 0;
for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]);
- int effective_lcp = top.lcp;
+ TextIdx effective_lcp = top.lcp;
/* Restrict (last) dictionary entry length. */
if (effective_lcp > max_match) effective_lcp = max_match;
- int cost = count * effective_lcp;
+ TextIdx cost = count * effective_lcp;
if (cost > best_cost && count >= min_count &&
effective_lcp >= MIN_MATCH) {
best_cost = cost;
@@ -251,14 +280,14 @@ std::string DM_generate(size_t dictionary_size_limit,
if (best_cost == 0 || best_isle.lcp < MIN_MATCH) {
if (min_count >= 8) {
min_count = (min_count * 7) / 8;
- fprintf(stderr, "Retry: min_count=%d\n", min_count);
+ fprintf(stderr, "Retry: min_count=%zu\n", min_count);
goto retry;
}
break;
}
/* Save the entry. */
- fprintf(stderr, "Savings: %zu+%zu, dictionary: %zu+%d\n",
+ fprintf(stderr, "Savings: %d+%d, dictionary: %d+%d\n",
total_cost, best_cost, total, best_isle.lcp);
int* piece = &full_text[sa[best_isle.l]];
output.insert(output.end(), piece, piece + best_isle.lcp);
diff --git a/research/deorummolae.h b/research/deorummolae.h
index 7f24add..5815097 100644
--- a/research/deorummolae.h
+++ b/research/deorummolae.h
@@ -1,17 +1,16 @@
#ifndef BROTLI_RESEARCH_DEORUMMOLAE_H_
#define BROTLI_RESEARCH_DEORUMMOLAE_H_
-#include <stddef.h>
-#include <stdint.h>
-
+#include <cstddef>
+#include <cstdint>
#include <string>
#include <vector>
/* log2(maximal number of files). Value 6 provides some speedups. */
-#define LOG_MAX_FILES 6
+#define DM_LOG_MAX_FILES 6
/* Non tunable definitions. */
-#define MAX_FILES (1 << LOG_MAX_FILES)
+#define DM_MAX_FILES (1 << DM_LOG_MAX_FILES)
/**
* Generate a dictionary for given samples.
diff --git a/research/dictionary_generator.cc b/research/dictionary_generator.cc
index b3ee89c..00cfaba 100755
--- a/research/dictionary_generator.cc
+++ b/research/dictionary_generator.cc
@@ -1,15 +1,20 @@
+#include <cstddef>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <vector>
#include "./deorummolae.h"
+#include "./durchschlag.h"
#include "./sieve.h"
#define METHOD_DM 0
#define METHOD_SIEVE 1
+#define METHOD_DURCHSCHLAG 2
+#define METHOD_DISTILL 3
+#define METHOD_PURIFY 4
-size_t readInt(const char* str) {
+static size_t readInt(const char* str) {
size_t result = 0;
if (str[0] == 0 || str[0] == '0') {
return 0;
@@ -51,10 +56,25 @@ static std::string readFile(const std::string& path) {
static void writeFile(const char* file, const std::string& content) {
std::ofstream outfile(file, std::ofstream::binary);
- outfile.write(content.c_str(), content.size());
+ outfile.write(content.c_str(), static_cast<std::streamsize>(content.size()));
outfile.close();
}
+static void writeSamples(char const* argv[], const std::vector<int>& pathArgs,
+ const std::vector<size_t>& sizes, const uint8_t* data) {
+ size_t offset = 0;
+ for (size_t i = 0; i < pathArgs.size(); ++i) {
+ int j = pathArgs[i];
+ const char* file = argv[j];
+ size_t sampleSize = sizes[i];
+ std::ofstream outfile(file, std::ofstream::binary);
+ outfile.write(reinterpret_cast<const char*>(data + offset),
+ static_cast<std::streamsize>(sampleSize));
+ outfile.close();
+ offset += sampleSize;
+ }
+}
+
/* Returns "base file name" or its tail, if it contains '/' or '\'. */
static const char* fileName(const char* path) {
const char* separator_position = strrchr(path, '/');
@@ -68,21 +88,32 @@ static void printHelp(const char* name) {
fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
fprintf(stderr,
"Options:\n"
- " --dm use 'deorummolae' engine\n"
- " --sieve use 'sieve' engine (default)\n"
- " -t# set target dictionary size (limit); default: 16K\n"
- " -s# set slize length for 'sieve'; default: 33\n"
- "# is a decimal number with optional k/K/m/M suffix.\n\n");
+ " --dm use 'deorummolae' engine\n"
+ " --distill rewrite samples; unique text parts are removed\n"
+ " --dsh use 'durchschlag' engine (default)\n"
+ " --purify rewrite samples; unique text parts are zeroed out\n"
+ " --sieve use 'sieve' engine\n"
+ " -b# set block length for 'durchschlag'; default: 1024\n"
+ " -s# set slice length for 'distill', 'durchschlag', 'purify'\n"
+ " and 'sieve'; default: 16\n"
+ " -t# set target dictionary size (limit); default: 16K\n"
+ " -u# set minimum slice population (for rewrites); default: 2\n"
+ "# is a decimal number with optional k/K/m/M suffix.\n"
+ "WARNING: 'distill' and 'purify' will overwrite original samples!\n"
+ " Completely unique samples might become empty files.\n\n");
}
int main(int argc, char const* argv[]) {
int dictionaryArg = -1;
- int method = METHOD_SIEVE;
- int sieveSliceLen = 33;
- int targetSize = 16 << 10;
+ int method = METHOD_DURCHSCHLAG;
+ size_t sliceLen = 16;
+ size_t targetSize = 16 << 10;
+ size_t blockSize = 1024;
+ size_t minimumPopulation = 2;
std::vector<uint8_t> data;
std::vector<size_t> sizes;
+ std::vector<int> pathArgs;
size_t total = 0;
for (int i = 1; i < argc; ++i) {
if (argv[i] == nullptr) {
@@ -90,6 +121,12 @@ int main(int argc, char const* argv[]) {
}
if (argv[i][0] == '-') {
if (argv[i][1] == '-') {
+ if (dictionaryArg != -1) {
+ fprintf(stderr,
+ "Method should be specified before dictionary / sample '%s'\n",
+ argv[i]);
+ exit(1);
+ }
if (std::strcmp("--sieve", argv[i]) == 0) {
method = METHOD_SIEVE;
continue;
@@ -98,13 +135,32 @@ int main(int argc, char const* argv[]) {
method = METHOD_DM;
continue;
}
+ if (std::strcmp("--dsh", argv[i]) == 0) {
+ method = METHOD_DURCHSCHLAG;
+ continue;
+ }
+ if (std::strcmp("--distill", argv[i]) == 0) {
+ method = METHOD_DISTILL;
+ continue;
+ }
+ if (std::strcmp("--purify", argv[i]) == 0) {
+ method = METHOD_PURIFY;
+ continue;
+ }
printHelp(fileName(argv[0]));
fprintf(stderr, "Invalid option '%s'\n", argv[i]);
exit(1);
}
- if (argv[i][1] == 's') {
- sieveSliceLen = readInt(&argv[i][2]);
- if (sieveSliceLen < 4 || sieveSliceLen > 256) {
+ if (argv[i][1] == 'b') {
+ blockSize = readInt(&argv[i][2]);
+ if (blockSize < 16 || blockSize > 65536) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+ exit(1);
+ }
+ } else if (argv[i][1] == 's') {
+ sliceLen = readInt(&argv[i][2]);
+ if (sliceLen < 4 || sliceLen > 256) {
printHelp(fileName(argv[0]));
fprintf(stderr, "Invalid option '%s'\n", argv[i]);
exit(1);
@@ -116,6 +172,13 @@ int main(int argc, char const* argv[]) {
fprintf(stderr, "Invalid option '%s'\n", argv[i]);
exit(1);
}
+ } else if (argv[i][1] == 'u') {
+ minimumPopulation = readInt(&argv[i][2]);
+ if (minimumPopulation < 256 || minimumPopulation > 65536) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+ exit(1);
+ }
} else {
printHelp(fileName(argv[0]));
fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
@@ -124,26 +187,42 @@ int main(int argc, char const* argv[]) {
continue;
}
if (dictionaryArg == -1) {
- dictionaryArg = i;
- continue;
+ if (method != METHOD_DISTILL && method != METHOD_PURIFY) {
+ dictionaryArg = i;
+ continue;
+ }
}
std::string content = readFile(argv[i]);
data.insert(data.end(), content.begin(), content.end());
total += content.size();
+ pathArgs.push_back(i);
sizes.push_back(content.size());
}
- if (dictionaryArg == -1 || total == 0) {
+ bool wantDictionary = (dictionaryArg == -1);
+ if (method == METHOD_DISTILL || method == METHOD_PURIFY) {
+ wantDictionary = false;
+ }
+ if (wantDictionary || total == 0) {
printHelp(fileName(argv[0]));
fprintf(stderr, "Not enough arguments\n");
exit(1);
}
if (method == METHOD_SIEVE) {
- writeFile(argv[dictionaryArg],
- sieve_generate(targetSize, sieveSliceLen, sizes, data.data()));
+ writeFile(argv[dictionaryArg], sieve_generate(
+ targetSize, sliceLen, sizes, data.data()));
} else if (method == METHOD_DM) {
- writeFile(argv[dictionaryArg],
- DM_generate(targetSize, sizes, data.data()));
+ writeFile(argv[dictionaryArg], DM_generate(
+ targetSize, sizes, data.data()));
+ } else if (method == METHOD_DURCHSCHLAG) {
+ writeFile(argv[dictionaryArg], durchschlag_generate(
+ targetSize, sliceLen, blockSize, sizes, data.data()));
+ } else if (method == METHOD_DISTILL) {
+ durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data());
+ writeSamples(argv, pathArgs, sizes, data.data());
+ } else if (method == METHOD_PURIFY) {
+ durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data());
+ writeSamples(argv, pathArgs, sizes, data.data());
} else {
printHelp(fileName(argv[0]));
fprintf(stderr, "Unknown generator\n");
diff --git a/research/draw_diff.cc b/research/draw_diff.cc
index 01b6716..6541dac 100644
--- a/research/draw_diff.cc
+++ b/research/draw_diff.cc
@@ -20,18 +20,23 @@
#define CHECK(X) if (!(X)) exit(EXIT_FAILURE);
#endif
-void ReadPGM(FILE* f, uint8_t*** image, size_t* height, size_t* width) {
+typedef uint8_t* ScanLine;
+typedef ScanLine* Image;
+
+void ReadPGM(FILE* f, Image* image, size_t* height, size_t* width) {
int colors;
CHECK(fscanf(f, "P5\n%lu %lu\n%d\n", width, height, &colors) == 3);
assert(colors == 255);
- *image = new uint8_t*[*height];
+ ScanLine* lines = new ScanLine[*height];
+ *image = lines;
for (int i = *height - 1; i >= 0; --i) {
- (*image)[i] = new uint8_t[*width];
- CHECK(fread((*image)[i], 1, *width, f) == *width);
+ ScanLine line = new uint8_t[*width];
+ lines[i] = line;
+ CHECK(fread(line, 1, *width, f) == *width);
}
}
-void CalculateDiff(int** diff, uint8_t** image1, uint8_t** image2,
+void CalculateDiff(int** diff, Image image1, Image image2,
size_t height, size_t width) {
for (size_t i = 0; i < height; ++i) {
for (size_t j = 0; j < width; ++j) {
@@ -40,7 +45,7 @@ void CalculateDiff(int** diff, uint8_t** image1, uint8_t** image2,
}
}
-void DrawDiff(int** diff, uint8_t** image1, uint8_t** image2,
+void DrawDiff(int** diff, Image image1, Image image2,
size_t height, size_t width, FILE* f) {
int max = -1234;
int min = +1234;
@@ -78,13 +83,13 @@ void DrawDiff(int** diff, uint8_t** image1, uint8_t** image2,
delete[] row;
}
-int main(int argc, char* argv[]) {
+int main(int argc, char** argv) {
if (argc != 4) {
printf("usage: %s pgm1 pgm2 diff_ppm_path\n", argv[0]);
return 1;
}
- uint8_t **image1, **image2;
+ Image image1, image2;
size_t h1, w1, h2, w2;
FILE* fimage1 = fopen(argv[1], "rb");
diff --git a/research/durchschlag.cc b/research/durchschlag.cc
new file mode 100755
index 0000000..cc4ed68
--- /dev/null
+++ b/research/durchschlag.cc
@@ -0,0 +1,714 @@
+#include "./durchschlag.h"
+
+#include <algorithm>
+#include <exception> /* terminate */
+
+#include "divsufsort.h"
+
+/* Pointer to position in text. */
+typedef DurchschlagTextIdx TextIdx;
+
+/* (Sum of) value(s) of slice(s). */
+typedef uint32_t Score;
+
+typedef struct HashSlot {
+ TextIdx next;
+ TextIdx offset;
+} HashSlot;
+
+typedef struct MetaSlot {
+ TextIdx mark;
+ Score score;
+} MetaSlot;
+
+typedef struct Range {
+ TextIdx start;
+ TextIdx end;
+} Range;
+
+typedef struct Candidate {
+ Score score;
+ TextIdx position;
+} Candidate;
+
+struct greaterScore {
+ bool operator()(const Candidate& a, const Candidate& b) const {
+ return (a.score > b.score) ||
+ ((a.score == b.score) && (a.position < b.position));
+ }
+};
+
+struct lessScore {
+ bool operator()(const Candidate& a, const Candidate& b) const {
+ return (a.score < b.score) ||
+ ((a.score == b.score) && (a.position > b.position));
+ }
+};
+
+#define CANDIDATE_BUNDLE_SIZE (1 << 18)
+
+static void fatal(const char* error) {
+ fprintf(stderr, "%s\n", error);
+ std::terminate();
+}
+
+static TextIdx calculateDictionarySize(const std::vector<Range>& ranges) {
+ TextIdx result = 0;
+ for (size_t i = 0; i < ranges.size(); ++i) {
+ const Range& r = ranges[i];
+ result += r.end - r.start;
+ }
+ return result;
+}
+
+static std::string createDictionary(
+ const uint8_t* data, const std::vector<Range>& ranges, size_t limit) {
+ std::string output;
+ output.reserve(calculateDictionarySize(ranges));
+ for (size_t i = 0; i < ranges.size(); ++i) {
+ const Range& r = ranges[i];
+ output.insert(output.end(), &data[r.start], &data[r.end]);
+ }
+ if (output.size() > limit) {
+ output.resize(limit);
+ }
+ return output;
+}
+
+static Score buildCandidatesList(std::vector<Candidate>* candidates,
+ std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut,
+ TextIdx end) {
+ candidates->resize(0);
+
+ size_t n = map->size();
+ MetaSlot* slots = map->data();
+ for (size_t j = 0; j < n; ++j) {
+ slots[j].mark = 0;
+ }
+
+ Score score = 0;
+ for (size_t j = 0; j < span; ++j) {
+ MetaSlot& item = slots[shortcut[j]];
+ if (item.mark == 0) {
+ score += item.score;
+ }
+ item.mark++;
+ }
+
+ TextIdx i = 0;
+ TextIdx limit = std::min<TextIdx>(end, CANDIDATE_BUNDLE_SIZE);
+ Score maxScore = 0;
+ for (; i < limit; ++i) {
+ MetaSlot& pick = slots[shortcut[i + span]];
+ if (pick.mark == 0) {
+ score += pick.score;
+ }
+ pick.mark++;
+
+ if (score > maxScore) {
+ maxScore = score;
+ }
+ candidates->push_back({score, i});
+
+ MetaSlot& drop = slots[shortcut[i]];
+ drop.mark--;
+ if (drop.mark == 0) {
+ score -= drop.score;
+ }
+ }
+
+ std::make_heap(candidates->begin(), candidates->end(), greaterScore());
+ Score minScore = candidates->at(0).score;
+ for (; i < end; ++i) {
+ MetaSlot& pick = slots[shortcut[i + span]];
+ if (pick.mark == 0) {
+ score += pick.score;
+ }
+ pick.mark++;
+
+ if (score > maxScore) {
+ maxScore = score;
+ }
+ if (score >= minScore) {
+ candidates->push_back({score, i});
+ std::push_heap(candidates->begin(), candidates->end(), greaterScore());
+ if (candidates->size() > CANDIDATE_BUNDLE_SIZE && maxScore != minScore) {
+ while (candidates->at(0).score == minScore) {
+ std::pop_heap(candidates->begin(), candidates->end(), greaterScore());
+ candidates->pop_back();
+ }
+ minScore = candidates->at(0).score;
+ }
+ }
+
+ MetaSlot& drop = slots[shortcut[i]];
+ drop.mark--;
+ if (drop.mark == 0) {
+ score -= drop.score;
+ }
+ }
+
+ for (size_t j = 0; j < n; ++j) {
+ slots[j].mark = 0;
+ }
+
+ std::make_heap(candidates->begin(), candidates->end(), lessScore());
+ return minScore;
+}
+
+static Score rebuildCandidatesList(std::vector<TextIdx>* candidates,
+ std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut,
+ TextIdx end, TextIdx* next) {
+ size_t n = candidates->size();
+ TextIdx* data = candidates->data();
+ for (size_t i = 0; i < n; ++i) {
+ data[i] = 0;
+ }
+
+ n = map->size();
+ MetaSlot* slots = map->data();
+ for (size_t i = 0; i < n; ++i) {
+ slots[i].mark = 0;
+ }
+
+ Score score = 0;
+ for (TextIdx i = 0; i < span; ++i) {
+ MetaSlot& item = slots[shortcut[i]];
+ if (item.mark == 0) {
+ score += item.score;
+ }
+ item.mark++;
+ }
+
+ Score maxScore = 0;
+ for (TextIdx i = 0; i < end; ++i) {
+ MetaSlot& pick = slots[shortcut[i + span]];
+ if (pick.mark == 0) {
+ score += pick.score;
+ }
+ pick.mark++;
+
+ if (candidates->size() <= score) {
+ candidates->resize(score + 1);
+ }
+ if (score > maxScore) {
+ maxScore = score;
+ }
+ next[i] = candidates->at(score);
+ candidates->at(score) = i;
+
+ MetaSlot& drop = slots[shortcut[i]];
+ drop.mark--;
+ if (drop.mark == 0) {
+ score -= drop.score;
+ }
+ }
+
+ for (size_t i = 0; i < n; ++i) {
+ slots[i].mark = 0;
+ }
+
+ candidates->resize(maxScore + 1);
+ return maxScore;
+}
+
+static void addRange(std::vector<Range>* ranges, TextIdx start, TextIdx end) {
+ for (auto it = ranges->begin(); it != ranges->end();) {
+ if (end < it->start) {
+ ranges->insert(it, {start, end});
+ return;
+ }
+ if (it->end < start) {
+ it++;
+ continue;
+ }
+ // Combine with existing.
+ start = std::min(start, it->start);
+ end = std::max(end, it->end);
+ // Remove consumed vector and continue.
+ it = ranges->erase(it);
+ }
+ ranges->push_back({start, end});
+}
+
+std::string durchschlag_generate(
+ size_t dictionary_size_limit, size_t slice_len, size_t block_len,
+ const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
+ DurchschlagContext ctx = durchschlag_prepare(
+ slice_len, sample_sizes, sample_data);
+ return durchschlag_generate(DURCHSCHLAG_COLLABORATIVE,
+ dictionary_size_limit, block_len, ctx, sample_data);
+}
+
+DurchschlagContext durchschlag_prepare(size_t slice_len,
+ const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
+ /* Parameters aliasing */
+ TextIdx sliceLen = static_cast<TextIdx>(slice_len);
+ if (sliceLen != slice_len) fatal("slice_len is too large");
+ if (sliceLen < 1) fatal("slice_len is too small");
+ const uint8_t* data = sample_data;
+
+ TextIdx total = 0;
+ std::vector<TextIdx> offsets;
+ offsets.reserve(sample_sizes.size());
+ for (size_t i = 0; i < sample_sizes.size(); ++i) {
+ TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
+ if (delta != sample_sizes[i]) fatal("sample is too large");
+ if (delta == 0) fatal("0-length samples are prohibited");
+ TextIdx next_total = total + delta;
+ if (next_total <= total) fatal("corpus is too large");
+ total = next_total;
+ offsets.push_back(total);
+ }
+
+ if (total < sliceLen) fatal("slice_len is larger than corpus size");
+ TextIdx end = total - static_cast<TextIdx>(sliceLen) + 1;
+ TextIdx hashLen = 11;
+ while (hashLen < 29 && ((1u << hashLen) < end)) {
+ hashLen += 3;
+ }
+ hashLen -= 3;
+ TextIdx hashMask = (1u << hashLen) - 1u;
+ std::vector<TextIdx> hashHead(1 << hashLen);
+ TextIdx hash = 0;
+ TextIdx lShift = 3;
+ TextIdx rShift = hashLen - lShift;
+ for (TextIdx i = 0; i < sliceLen - 1; ++i) {
+ TextIdx v = data[i];
+ hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
+ }
+ TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
+ TextIdx rShiftX = hashLen - lShiftX;
+
+ std::vector<HashSlot> map;
+ map.push_back({0, 0});
+ TextIdx hashSlot = 1;
+ std::vector<TextIdx> sliceMap;
+ sliceMap.reserve(end);
+ for (TextIdx i = 0; i < end; ++i) {
+ TextIdx v = data[i + sliceLen - 1];
+ TextIdx bucket = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
+ v = data[i];
+ hash = bucket ^ (((v << lShiftX) | (v >> rShiftX)) & hashMask);
+ TextIdx slot = hashHead[bucket];
+ while (slot != 0) {
+ HashSlot& item = map[slot];
+ TextIdx start = item.offset;
+ bool miss = false;
+ for (TextIdx j = 0; j < sliceLen; ++j) {
+ if (data[i + j] != data[start + j]) {
+ miss = true;
+ break;
+ }
+ }
+ if (!miss) {
+ sliceMap.push_back(slot);
+ break;
+ }
+ slot = item.next;
+ }
+ if (slot == 0) {
+ map.push_back({hashHead[bucket], i});
+ hashHead[bucket] = hashSlot;
+ sliceMap.push_back(hashSlot);
+ hashSlot++;
+ }
+ }
+
+ return {total, sliceLen, static_cast<TextIdx>(map.size()),
+ std::move(offsets), std::move(sliceMap)};
+}
+
+DurchschlagContext durchschlag_prepare(size_t slice_len,
+ const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index) {
+ /* Parameters aliasing */
+ TextIdx sliceLen = static_cast<TextIdx>(slice_len);
+ if (sliceLen != slice_len) fatal("slice_len is too large");
+ if (sliceLen < 1) fatal("slice_len is too small");
+ const TextIdx* lcp = index.lcp.data();
+ const TextIdx* sa = index.sa.data();
+
+ TextIdx total = 0;
+ std::vector<TextIdx> offsets;
+ offsets.reserve(sample_sizes.size());
+ for (size_t i = 0; i < sample_sizes.size(); ++i) {
+ TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
+ if (delta != sample_sizes[i]) fatal("sample is too large");
+ if (delta == 0) fatal("0-length samples are prohibited");
+ TextIdx next_total = total + delta;
+ if (next_total <= total) fatal("corpus is too large");
+ total = next_total;
+ offsets.push_back(total);
+ }
+
+ if (total < sliceLen) fatal("slice_len is larger than corpus size");
+ TextIdx counter = 1;
+ TextIdx end = total - sliceLen + 1;
+ std::vector<TextIdx> sliceMap(total);
+ TextIdx last = 0;
+ TextIdx current = 1;
+ while (current <= total) {
+ if (lcp[current - 1] < sliceLen) {
+ for (TextIdx i = last; i < current; ++i) {
+ sliceMap[sa[i]] = counter;
+ }
+ counter++;
+ last = current;
+ }
+ current++;
+ }
+ sliceMap.resize(end);
+
+ // Reorder items for the better locality.
+ std::vector<TextIdx> reorder(counter);
+ counter = 1;
+ for (TextIdx i = 0; i < end; ++i) {
+ if (reorder[sliceMap[i]] == 0) {
+ reorder[sliceMap[i]] = counter++;
+ }
+ }
+ for (TextIdx i = 0; i < end; ++i) {
+ sliceMap[i] = reorder[sliceMap[i]];
+ }
+
+ return {total, sliceLen, counter, std::move(offsets), std::move(sliceMap)};
+}
+
+DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data) {
+ TextIdx total = static_cast<TextIdx>(data.size());
+ if (total != data.size()) fatal("corpus is too large");
+ saidx_t saTotal = static_cast<saidx_t>(total);
+ if (saTotal < 0) fatal("corpus is too large");
+ if (static_cast<TextIdx>(saTotal) != total) fatal("corpus is too large");
+ std::vector<TextIdx> sa(total);
+ /* Hopefully, non-negative int32_t values match TextIdx ones. */
+ if (sizeof(TextIdx) != sizeof(int32_t)) fatal("type length mismatch");
+ int32_t* saData = reinterpret_cast<int32_t*>(sa.data());
+ divsufsort(data.data(), saData, saTotal);
+
+ std::vector<TextIdx> isa(total);
+ for (TextIdx i = 0; i < total; ++i) isa[sa[i]] = i;
+
+ // TODO: borrowed -> unknown efficiency.
+ std::vector<TextIdx> lcp(total);
+ TextIdx k = 0;
+ lcp[total - 1] = 0;
+ for (TextIdx i = 0; i < total; ++i) {
+ TextIdx current = isa[i];
+ if (current == total - 1) {
+ k = 0;
+ continue;
+ }
+ TextIdx j = sa[current + 1]; // Suffix which follow i-th suffix.
+ while ((i + k < total) && (j + k < total) && (data[i + k] == data[j + k])) {
+ ++k;
+ }
+ lcp[current] = k;
+ if (k > 0) --k;
+ }
+
+ return {std::move(lcp), std::move(sa)};
+}
+
+static void ScoreSlices(const std::vector<TextIdx>& offsets,
+ std::vector<MetaSlot>& map, const TextIdx* shortcut, TextIdx end) {
+ TextIdx piece = 0;
+ /* Fresh map contains all zeroes -> initial mark should be different. */
+ TextIdx mark = 1;
+ for (TextIdx i = 0; i < end; ++i) {
+ if (offsets[piece] == i) {
+ piece++;
+ mark++;
+ }
+ MetaSlot& item = map[shortcut[i]];
+ if (item.mark != mark) {
+ item.mark = mark;
+ item.score++;
+ }
+ }
+}
+
+static std::string durchschlagGenerateExclusive(
+ size_t dictionary_size_limit, size_t block_len,
+ const DurchschlagContext& context, const uint8_t* sample_data) {
+ /* Parameters aliasing */
+ TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
+ if (targetSize != dictionary_size_limit) {
+ fprintf(stderr, "dictionary_size_limit is too large\n");
+ return "";
+ }
+ TextIdx sliceLen = context.sliceLen;
+ TextIdx total = context.dataSize;
+ TextIdx blockLen = static_cast<TextIdx>(block_len);
+ if (blockLen != block_len) {
+ fprintf(stderr, "block_len is too large\n");
+ return "";
+ }
+ const uint8_t* data = sample_data;
+ const std::vector<TextIdx>& offsets = context.offsets;
+ std::vector<MetaSlot> map(context.numUniqueSlices);
+ const TextIdx* shortcut = context.sliceMap.data();
+
+ /* Initialization */
+ if (blockLen < sliceLen) {
+ fprintf(stderr, "sliceLen is larger than block_len\n");
+ return "";
+ }
+ if (targetSize < blockLen || total < blockLen) {
+ fprintf(stderr, "block_len is too large\n");
+ return "";
+ }
+ TextIdx end = total - sliceLen + 1;
+ ScoreSlices(offsets, map, shortcut, end);
+ end = total - blockLen + 1;
+ std::vector<TextIdx> candidates;
+ std::vector<TextIdx> next(end);
+ TextIdx span = blockLen - sliceLen + 1;
+ Score maxScore = rebuildCandidatesList(
+ &candidates, &map, span, shortcut, end, next.data());
+
+ /* Block selection */
+ const size_t triesLimit = (600 * 1000000) / span;
+ const size_t candidatesLimit = (150 * 1000000) / span;
+ std::vector<Range> ranges;
+ TextIdx mark = 0;
+ size_t numTries = 0;
+ while (true) {
+ TextIdx dictSize = calculateDictionarySize(ranges);
+ size_t numCandidates = 0;
+ if (dictSize > targetSize - blockLen) {
+ break;
+ }
+ if (maxScore == 0) {
+ break;
+ }
+ while (true) {
+ TextIdx candidate = 0;
+ while (maxScore > 0) {
+ if (candidates[maxScore] != 0) {
+ candidate = candidates[maxScore];
+ candidates[maxScore] = next[candidate];
+ break;
+ }
+ maxScore--;
+ }
+ if (maxScore == 0) {
+ break;
+ }
+ mark++;
+ numTries++;
+ numCandidates++;
+ Score score = 0;
+ for (size_t j = candidate; j <= candidate + span; ++j) {
+ MetaSlot& item = map[shortcut[j]];
+ if (item.mark != mark) {
+ score += item.score;
+ item.mark = mark;
+ }
+ }
+ if (score < maxScore) {
+ if (numTries < triesLimit && numCandidates < candidatesLimit) {
+ next[candidate] = candidates[score];
+ candidates[score] = candidate;
+ } else {
+ maxScore = rebuildCandidatesList(
+ &candidates, &map, span, shortcut, end, next.data());
+ mark = 0;
+ numTries = 0;
+ numCandidates = 0;
+ }
+ continue;
+ } else if (score > maxScore) {
+ fprintf(stderr, "Broken invariant\n");
+ return "";
+ }
+ for (TextIdx j = candidate; j <= candidate + span; ++j) {
+ MetaSlot& item = map[shortcut[j]];
+ item.score = 0;
+ }
+ addRange(&ranges, candidate, candidate + blockLen);
+ break;
+ }
+ }
+
+ return createDictionary(data, ranges, targetSize);
+}
+
+static std::string durchschlagGenerateCollaborative(
+ size_t dictionary_size_limit, size_t block_len,
+ const DurchschlagContext& context, const uint8_t* sample_data) {
+ /* Parameters aliasing */
+ TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
+ if (targetSize != dictionary_size_limit) {
+ fprintf(stderr, "dictionary_size_limit is too large\n");
+ return "";
+ }
+ TextIdx sliceLen = context.sliceLen;
+ TextIdx total = context.dataSize;
+ TextIdx blockLen = static_cast<TextIdx>(block_len);
+ if (blockLen != block_len) {
+ fprintf(stderr, "block_len is too large\n");
+ return "";
+ }
+ const uint8_t* data = sample_data;
+ const std::vector<TextIdx>& offsets = context.offsets;
+ std::vector<MetaSlot> map(context.numUniqueSlices);
+ const TextIdx* shortcut = context.sliceMap.data();
+
+ /* Initialization */
+ if (blockLen < sliceLen) {
+ fprintf(stderr, "sliceLen is larger than block_len\n");
+ return "";
+ }
+ if (targetSize < blockLen || total < blockLen) {
+ fprintf(stderr, "block_len is too large\n");
+ return "";
+ }
+ TextIdx end = total - sliceLen + 1;
+ ScoreSlices(offsets, map, shortcut, end);
+ end = total - blockLen + 1;
+ std::vector<Candidate> candidates;
+ candidates.reserve(CANDIDATE_BUNDLE_SIZE + 1024);
+ TextIdx span = blockLen - sliceLen + 1;
+ Score minScore = buildCandidatesList(&candidates, &map, span, shortcut, end);
+
+ /* Block selection */
+ std::vector<Range> ranges;
+ TextIdx mark = 0;
+ while (true) {
+ TextIdx dictSize = calculateDictionarySize(ranges);
+ if (dictSize > targetSize - blockLen) {
+ break;
+ }
+ if (minScore == 0 && candidates.empty()) {
+ break;
+ }
+ while (true) {
+ if (candidates.empty()) {
+ minScore = buildCandidatesList(&candidates, &map, span, shortcut, end);
+ mark = 0;
+ }
+ TextIdx candidate = candidates[0].position;
+ Score expectedScore = candidates[0].score;
+ if (expectedScore == 0) {
+ candidates.resize(0);
+ break;
+ }
+ std::pop_heap(candidates.begin(), candidates.end(), lessScore());
+ candidates.pop_back();
+ mark++;
+ Score score = 0;
+ for (TextIdx j = candidate; j <= candidate + span; ++j) {
+ MetaSlot& item = map[shortcut[j]];
+ if (item.mark != mark) {
+ score += item.score;
+ item.mark = mark;
+ }
+ }
+ if (score < expectedScore) {
+ if (score >= minScore) {
+ candidates.push_back({score, candidate});
+ std::push_heap(candidates.begin(), candidates.end(), lessScore());
+ }
+ continue;
+ } else if (score > expectedScore) {
+ fatal("Broken invariant");
+ }
+ for (TextIdx j = candidate; j <= candidate + span; ++j) {
+ MetaSlot& item = map[shortcut[j]];
+ item.score = 0;
+ }
+ addRange(&ranges, candidate, candidate + blockLen);
+ break;
+ }
+ }
+
+ return createDictionary(data, ranges, targetSize);
+}
+
+std::string durchschlag_generate(DurchschalgResourceStrategy strategy,
+ size_t dictionary_size_limit, size_t block_len,
+ const DurchschlagContext& context, const uint8_t* sample_data) {
+ if (strategy == DURCHSCHLAG_COLLABORATIVE) {
+ return durchschlagGenerateCollaborative(
+ dictionary_size_limit, block_len, context, sample_data);
+ } else {
+ return durchschlagGenerateExclusive(
+ dictionary_size_limit, block_len, context, sample_data);
+ }
+}
+
+void durchschlag_distill(size_t slice_len, size_t minimum_population,
+ std::vector<size_t>* sample_sizes, uint8_t* sample_data) {
+ /* Parameters aliasing */
+ uint8_t* data = sample_data;
+
+ /* Build slice map. */
+ DurchschlagContext context = durchschlag_prepare(
+ slice_len, *sample_sizes, data);
+
+ /* Calculate slice population. */
+ const std::vector<TextIdx>& offsets = context.offsets;
+ std::vector<MetaSlot> map(context.numUniqueSlices);
+ const TextIdx* shortcut = context.sliceMap.data();
+ TextIdx sliceLen = context.sliceLen;
+ TextIdx total = context.dataSize;
+ TextIdx end = total - sliceLen + 1;
+ ScoreSlices(offsets, map, shortcut, end);
+
+ /* Condense samples, omitting unique slices. */
+ TextIdx readPos = 0;
+ TextIdx writePos = 0;
+ TextIdx lastNonUniquePos = 0;
+ for (TextIdx i = 0; i < sample_sizes->size(); ++i) {
+ TextIdx sampleStart = writePos;
+ TextIdx oldSampleEnd =
+ readPos + static_cast<TextIdx>(sample_sizes->at(i));
+ while (readPos < oldSampleEnd) {
+ if (readPos < end) {
+ MetaSlot& item = map[shortcut[readPos]];
+ if (item.score >= minimum_population) {
+ lastNonUniquePos = readPos + sliceLen;
+ }
+ }
+ if (readPos < lastNonUniquePos) {
+ data[writePos++] = data[readPos];
+ }
+ readPos++;
+ }
+ sample_sizes->at(i) = writePos - sampleStart;
+ }
+}
+
+void durchschlag_purify(size_t slice_len, size_t minimum_population,
+ const std::vector<size_t>& sample_sizes, uint8_t* sample_data) {
+ /* Parameters aliasing */
+ uint8_t* data = sample_data;
+
+ /* Build slice map. */
+ DurchschlagContext context = durchschlag_prepare(
+ slice_len, sample_sizes, data);
+
+ /* Calculate slice population. */
+ const std::vector<TextIdx>& offsets = context.offsets;
+ std::vector<MetaSlot> map(context.numUniqueSlices);
+ const TextIdx* shortcut = context.sliceMap.data();
+ TextIdx sliceLen = context.sliceLen;
+ TextIdx total = context.dataSize;
+ TextIdx end = total - sliceLen + 1;
+ ScoreSlices(offsets, map, shortcut, end);
+
+ /* Rewrite samples, zeroing out unique slices. */
+ TextIdx lastNonUniquePos = 0;
+ for (TextIdx readPos = 0; readPos < total; ++readPos) {
+ if (readPos < end) {
+ MetaSlot& item = map[shortcut[readPos]];
+ if (item.score >= minimum_population) {
+ lastNonUniquePos = readPos + sliceLen;
+ }
+ }
+ if (readPos >= lastNonUniquePos) {
+ data[readPos] = 0;
+ }
+ }
+}
diff --git a/research/durchschlag.h b/research/durchschlag.h
new file mode 100755
index 0000000..adbc531
--- /dev/null
+++ b/research/durchschlag.h
@@ -0,0 +1,99 @@
+#ifndef BROTLI_RESEARCH_DURCHSCHLAG_H_
+#define BROTLI_RESEARCH_DURCHSCHLAG_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+/**
+ * Generate a dictionary for given samples.
+ *
+ * @param dictionary_size_limit maximal dictionary size
+ * @param slice_len text slice size
+ * @param block_len score block length
+ * @param sample_sizes vector with sample sizes
+ * @param sample_data concatenated samples
+ * @return generated dictionary
+ */
+std::string durchschlag_generate(
+ size_t dictionary_size_limit, size_t slice_len, size_t block_len,
+ const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
+
+//------------------------------------------------------------------------------
+// Lower level API for repetitive dictionary generation.
+//------------------------------------------------------------------------------
+
+/* Pointer to position in text. */
+typedef uint32_t DurchschlagTextIdx;
+
+/* Context is made public for flexible serialization / deserialization. */
+typedef struct DurchschlagContext {
+ DurchschlagTextIdx dataSize;
+ DurchschlagTextIdx sliceLen;
+ DurchschlagTextIdx numUniqueSlices;
+ std::vector<DurchschlagTextIdx> offsets;
+ std::vector<DurchschlagTextIdx> sliceMap;
+} DurchschlagContext;
+
+DurchschlagContext durchschlag_prepare(size_t slice_len,
+ const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
+
+typedef enum DurchschalgResourceStrategy {
+ // Faster
+ DURCHSCHLAG_EXCLUSIVE = 0,
+ // Uses much less memory
+ DURCHSCHLAG_COLLABORATIVE = 1
+} DurchschalgResourceStrategy;
+
+std::string durchschlag_generate(DurchschalgResourceStrategy strategy,
+ size_t dictionary_size_limit, size_t block_len,
+ const DurchschlagContext& context, const uint8_t* sample_data);
+
+//------------------------------------------------------------------------------
+// Suffix Array based preparation.
+//------------------------------------------------------------------------------
+
+typedef struct DurchschlagIndex {
+ std::vector<DurchschlagTextIdx> lcp;
+ std::vector<DurchschlagTextIdx> sa;
+} DurchschlagIndex;
+
+DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data);
+
+DurchschlagContext durchschlag_prepare(size_t slice_len,
+ const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index);
+
+//------------------------------------------------------------------------------
+// Data preparation.
+//------------------------------------------------------------------------------
+
+/**
+ * Cut out unique slices.
+ *
+ * Both @p sample_sizes and @p sample_data are modified in-place. Number of
+ * samples remains unchanged, but some samples become shorter.
+ *
+ * @param slice_len (unique) slice size
+ * @param minimum_population minimum non-unique slice occurrence
+ * @param sample_sizes [in / out] vector with sample sizes
+ * @param sample_data [in / out] concatenated samples
+ */
+void durchschlag_distill(size_t slice_len, size_t minimum_population,
+ std::vector<size_t>* sample_sizes, uint8_t* sample_data);
+
+/**
+ * Replace unique slices with zeroes.
+ *
+ * @p sample_data is modified in-place. Number of samples and their length
+ * remain unchanged.
+ *
+ * @param slice_len (unique) slice size
+ * @param minimum_population minimum non-unique slice occurrence
+ * @param sample_sizes vector with sample sizes
+ * @param sample_data [in / out] concatenated samples
+ */
+void durchschlag_purify(size_t slice_len, size_t minimum_population,
+ const std::vector<size_t>& sample_sizes, uint8_t* sample_data);
+
+#endif // BROTLI_RESEARCH_DURCHSCHLAG_H_
diff --git a/research/libdivsufsort b/research/libdivsufsort
new file mode 160000
+Subproject 5f60d6f026c30fb4ac296f696b3c8b0eb71bd42
diff --git a/research/sieve.cc b/research/sieve.cc
index fbc1dbf..4d147e1 100755
--- a/research/sieve.cc
+++ b/research/sieve.cc
@@ -1,19 +1,27 @@
#include "./sieve.h"
+/* Pointer to position in (combined corpus) text. */
+typedef uint32_t TextIdx;
+
+/* Index of sample / generation. */
+typedef uint16_t SampleIdx;
+
typedef struct Slot {
- uint32_t next;
- uint32_t offset;
- uint16_t presence;
- uint16_t mark;
+ TextIdx next;
+ TextIdx offset;
+ SampleIdx presence;
+ SampleIdx mark;
} Slot;
-static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end,
- size_t middle, uint16_t minPresence, uint16_t iteration) {
- int from = -2;
- int to = -1;
- size_t result = 0;
- uint16_t targetPresence = minPresence;
- for (uint32_t i = 0; i < end; ++i) {
+static const TextIdx kNowhere = static_cast<TextIdx>(-1);
+
+static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut,
+ TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) {
+ TextIdx from = kNowhere;
+ TextIdx to = kNowhere;
+ TextIdx result = 0;
+ SampleIdx targetPresence = minPresence;
+ for (TextIdx i = 0; i < end; ++i) {
if (i == middle) {
targetPresence++;
}
@@ -21,8 +29,8 @@ static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end,
if (item.mark != iteration) {
item.mark = iteration;
if (item.presence >= targetPresence) {
- if (to < i) {
- if (from > 0) {
+ if ((to == kNowhere) || (to < i)) {
+ if (from != kNowhere) {
result += to - from;
}
from = i;
@@ -31,20 +39,20 @@ static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end,
}
}
}
- if (from > 0) {
+ if (from != kNowhere) {
result += to - from;
}
return result;
}
-static std::string createDictionary(const uint8_t* data, size_t sliceLen,
- Slot* map, uint32_t* shortcut, size_t end, size_t middle,
- uint16_t minPresence, uint16_t iteration) {
+static std::string createDictionary(const uint8_t* data, TextIdx sliceLen,
+ Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle,
+ SampleIdx minPresence, SampleIdx iteration) {
std::string output;
- int from = -2;
- int to = -1;
- uint16_t targetPresence = minPresence;
- for (uint32_t i = 0; i < end; ++i) {
+ TextIdx from = kNowhere;
+ TextIdx to = kNowhere;
+ SampleIdx targetPresence = minPresence;
+ for (TextIdx i = 0; i < end; ++i) {
if (i == middle) {
targetPresence++;
}
@@ -52,8 +60,8 @@ static std::string createDictionary(const uint8_t* data, size_t sliceLen,
if (item.mark != iteration) {
item.mark = iteration;
if (item.presence >= targetPresence) {
- if (to < i) {
- if (from > 0) {
+ if ((to == kNowhere) || (to < i)) {
+ if (from != kNowhere) {
output.insert(output.end(), &data[from], &data[to]);
}
from = i;
@@ -62,7 +70,7 @@ static std::string createDictionary(const uint8_t* data, size_t sliceLen,
}
}
}
- if (from > 0) {
+ if (from != kNowhere) {
output.insert(output.end(), &data[from], &data[to]);
}
return output;
@@ -71,55 +79,95 @@ static std::string createDictionary(const uint8_t* data, size_t sliceLen,
std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
/* Parameters aliasing */
- size_t targetSize = dictionary_size_limit;
- size_t sliceLen = slice_len;
+ TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
+ if (targetSize != dictionary_size_limit) {
+ fprintf(stderr, "dictionary_size_limit is too large\n");
+ return "";
+ }
+ TextIdx sliceLen = static_cast<TextIdx>(slice_len);
+ if (sliceLen != slice_len) {
+ fprintf(stderr, "slice_len is too large\n");
+ return "";
+ }
+ if (sliceLen < 1) {
+ fprintf(stderr, "slice_len is too small\n");
+ return "";
+ }
+ SampleIdx numSamples = static_cast<SampleIdx>(sample_sizes.size());
+ if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) {
+ fprintf(stderr, "too many samples\n");
+ return "";
+ }
const uint8_t* data = sample_data;
- size_t total = 0;
- std::vector<size_t> offsets;
- for (size_t i = 0; i < sample_sizes.size(); ++i) {
- total += sample_sizes[i];
+ TextIdx total = 0;
+ std::vector<TextIdx> offsets;
+ for (SampleIdx i = 0; i < numSamples; ++i) {
+ TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
+ if (delta != sample_sizes[i]) {
+ fprintf(stderr, "sample is too large\n");
+ return "";
+ }
+ if (delta == 0) {
+ fprintf(stderr, "empty samples are prohibited\n");
+ return "";
+ }
+ if (total + delta <= total) {
+ fprintf(stderr, "corpus is too large\n");
+ return "";
+ }
+ total += delta;
offsets.push_back(total);
}
+ if (total * 2 < total) {
+ fprintf(stderr, "corpus is too large\n");
+ return "";
+ }
+
+ if (total < sliceLen) {
+ fprintf(stderr, "slice_len is larger than corpus size\n");
+ return "";
+ }
+
/*****************************************************************************
* Build coverage map.
****************************************************************************/
std::vector<Slot> map;
- std::vector<uint32_t> shortcut;
+ std::vector<TextIdx> shortcut;
map.push_back({0, 0, 0, 0});
- size_t end = total - sliceLen;
- int hashLen = 8;
- while ((1 << hashLen) < end) {
+ TextIdx end = total - sliceLen;
+ TextIdx hashLen = 11;
+ while (hashLen < 29 && ((1u << hashLen) < end)) {
hashLen += 3;
}
hashLen -= 3;
- uint32_t hashMask = (1u << hashLen) - 1u;
- std::vector<uint32_t> hashHead(1 << hashLen);
- uint32_t hashSlot = 1;
- uint16_t piece = 0;
- uint32_t hash = 0;
- int lShift = 3;
- int rShift = hashLen - lShift;
- for (int i = 0; i < sliceLen - 1; ++i) {
- uint32_t v = data[i];
+ TextIdx hashMask = (1u << hashLen) - 1u;
+ std::vector<TextIdx> hashHead(1 << hashLen);
+ TextIdx hashSlot = 1;
+ SampleIdx piece = 0;
+ TextIdx hash = 0;
+ TextIdx lShift = 3;
+ TextIdx rShift = hashLen - lShift;
+ for (TextIdx i = 0; i < sliceLen - 1; ++i) {
+ TextIdx v = data[i];
hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
}
- int lShiftX = (lShift * (sliceLen - 1)) % hashLen;
- int rShiftX = hashLen - lShiftX;
- for (uint32_t i = 0; i < end; ++i) {
- uint32_t v = data[i + sliceLen - 1];
+ TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
+ TextIdx rShiftX = hashLen - lShiftX;
+ for (TextIdx i = 0; i < end; ++i) {
+ TextIdx v = data[i + sliceLen - 1];
hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
if (offsets[piece] == i) {
piece++;
}
- uint32_t slot = hashHead[hash];
+ TextIdx slot = hashHead[hash];
while (slot != 0) {
Slot& item = map[slot];
- int start = item.offset;
+ TextIdx start = item.offset;
bool miss = false;
- for (size_t j = 0; j < sliceLen; ++j) {
+ for (TextIdx j = 0; j < sliceLen; ++j) {
if (data[i + j] != data[start + j]) {
miss = true;
break;
@@ -148,8 +196,8 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
/*****************************************************************************
* Build dictionary of specified size.
****************************************************************************/
- size_t a = 1;
- size_t size = dryRun(
+ SampleIdx a = 1;
+ TextIdx size = dryRun(
sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
/* Maximal output is smaller than target. */
if (size <= targetSize) {
@@ -157,7 +205,7 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
}
- size_t b = offsets.size();
+ SampleIdx b = numSamples;
size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
if (size == targetSize) {
return createDictionary(
@@ -167,7 +215,7 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
if (size < targetSize) {
/* size(a) > targetSize > size(b) && a < m < b */
while (a + 1 < b) {
- size_t m = (a + b) / 2;
+ SampleIdx m = static_cast<SampleIdx>((a + b) / 2);
size = dryRun(
sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
if (size < targetSize) {
@@ -183,18 +231,18 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
a = b;
}
/* size(minPresence) > targetSize > size(minPresence + 1) */
- size_t minPresence = a;
- a = 0;
- b = end;
+ SampleIdx minPresence = a;
+ TextIdx c = 0;
+ TextIdx d = end;
/* size(a) < targetSize < size(b) && a < m < b */
- while (a + 1 < b) {
- size_t m = (a + b) / 2;
+ while (c + 1 < d) {
+ TextIdx m = (c + d) / 2;
size = dryRun(
sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
if (size < targetSize) {
- a = m;
+ c = m;
} else if (size > targetSize) {
- b = m;
+ d = m;
} else {
return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
m, minPresence, ++piece);
@@ -204,8 +252,8 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
bool unrestricted = false;
if (minPresence <= 2 && !unrestricted) {
minPresence = 2;
- a = end;
+ c = end;
}
- return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, a,
+ return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c,
minPresence, ++piece);
}
diff --git a/research/sieve.h b/research/sieve.h
index 2aae669..6c65dc8 100755
--- a/research/sieve.h
+++ b/research/sieve.h
@@ -1,9 +1,8 @@
#ifndef BROTLI_RESEARCH_SIEVE_H_
#define BROTLI_RESEARCH_SIEVE_H_
-#include <stddef.h>
-#include <stdint.h>
-
+#include <cstddef>
+#include <cstdint>
#include <string>
#include <vector>