aboutsummaryrefslogtreecommitdiff
path: root/research/dictionary_generator.cc
diff options
context:
space:
mode:
authorEugene Kliuchnikov <eustas@google.com>2018-02-26 09:04:36 -0500
committerGitHub <noreply@github.com>2018-02-26 09:04:36 -0500
commit35e69fc7cf9421ab04ffc9d52cb36d07fa12984a (patch)
treea1ed614391936d455da2b0610ef8e8caf88b4289 /research/dictionary_generator.cc
parent3af18990f50d8f040038aaa08c41f5d27d62efb5 (diff)
downloadbrotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.zip
brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.gz
brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.bz2
New feature: "Large Window Brotli" (#640)
* New feature: "Large Window Brotli" By setting special encoder/decoder flag it is now possible to extend LZ-window up to 30 bits; though produced stream will not be RFC7932 compliant. Added new dictionary generator - "DSH". It combines speed of "Sieve" and quality of "DM". Plus utilities to prepare train corpora (remove unique strings). Improved compression ratio: now two sub-blocks could be stitched: the last copy command could be extended to span the next sub-block. Fixed compression ineffectiveness caused by floating numbers rounding and wrong cost heuristic. Other C changes: - combined / moved `context.h` to `common` - moved transforms to `common` - unified some aspects of code formatting - added an abstraction for encoder (static) dictionary - moved default allocator/deallocator functions to `common` brotli CLI: - window size is auto-adjusted if not specified explicitly Java: - added "eager" decoding both to JNI wrapper and pure decoder - huge speed-up of `DictionaryData` initialization * Add dictionaryless compressed dictionary * Fix `sources.lst` * Fix `sources.lst` and add a note that `libtool` is also required. * Update setup.py * Fix `EagerStreamTest` * Fix BUILD file * Add missing `libdivsufsort` dependency * Fix "unused parameter" warning.
Diffstat (limited to 'research/dictionary_generator.cc')
-rwxr-xr-xresearch/dictionary_generator.cc119
1 files changed, 99 insertions, 20 deletions
diff --git a/research/dictionary_generator.cc b/research/dictionary_generator.cc
index b3ee89c..00cfaba 100755
--- a/research/dictionary_generator.cc
+++ b/research/dictionary_generator.cc
@@ -1,15 +1,20 @@
+#include <cstddef>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <vector>
#include "./deorummolae.h"
+#include "./durchschlag.h"
#include "./sieve.h"
#define METHOD_DM 0
#define METHOD_SIEVE 1
+#define METHOD_DURCHSCHLAG 2
+#define METHOD_DISTILL 3
+#define METHOD_PURIFY 4
-size_t readInt(const char* str) {
+static size_t readInt(const char* str) {
size_t result = 0;
if (str[0] == 0 || str[0] == '0') {
return 0;
@@ -51,10 +56,25 @@ static std::string readFile(const std::string& path) {
static void writeFile(const char* file, const std::string& content) {
std::ofstream outfile(file, std::ofstream::binary);
- outfile.write(content.c_str(), content.size());
+ outfile.write(content.c_str(), static_cast<std::streamsize>(content.size()));
outfile.close();
}
+static void writeSamples(char const* argv[], const std::vector<int>& pathArgs,
+ const std::vector<size_t>& sizes, const uint8_t* data) {
+ size_t offset = 0;
+ for (size_t i = 0; i < pathArgs.size(); ++i) {
+ int j = pathArgs[i];
+ const char* file = argv[j];
+ size_t sampleSize = sizes[i];
+ std::ofstream outfile(file, std::ofstream::binary);
+ outfile.write(reinterpret_cast<const char*>(data + offset),
+ static_cast<std::streamsize>(sampleSize));
+ outfile.close();
+ offset += sampleSize;
+ }
+}
+
/* Returns "base file name" or its tail, if it contains '/' or '\'. */
static const char* fileName(const char* path) {
const char* separator_position = strrchr(path, '/');
@@ -68,21 +88,32 @@ static void printHelp(const char* name) {
fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
fprintf(stderr,
"Options:\n"
- " --dm use 'deorummolae' engine\n"
- " --sieve use 'sieve' engine (default)\n"
- " -t# set target dictionary size (limit); default: 16K\n"
- " -s# set slize length for 'sieve'; default: 33\n"
- "# is a decimal number with optional k/K/m/M suffix.\n\n");
+ " --dm use 'deorummolae' engine\n"
+ " --distill rewrite samples; unique text parts are removed\n"
+ " --dsh use 'durchschlag' engine (default)\n"
+ " --purify rewrite samples; unique text parts are zeroed out\n"
+ " --sieve use 'sieve' engine\n"
+ " -b# set block length for 'durchschlag'; default: 1024\n"
+ " -s# set slice length for 'distill', 'durchschlag', 'purify'\n"
+ " and 'sieve'; default: 16\n"
+ " -t# set target dictionary size (limit); default: 16K\n"
+ " -u# set minimum slice population (for rewrites); default: 2\n"
+ "# is a decimal number with optional k/K/m/M suffix.\n"
+ "WARNING: 'distill' and 'purify' will overwrite original samples!\n"
+ " Completely unique samples might become empty files.\n\n");
}
int main(int argc, char const* argv[]) {
int dictionaryArg = -1;
- int method = METHOD_SIEVE;
- int sieveSliceLen = 33;
- int targetSize = 16 << 10;
+ int method = METHOD_DURCHSCHLAG;
+ size_t sliceLen = 16;
+ size_t targetSize = 16 << 10;
+ size_t blockSize = 1024;
+ size_t minimumPopulation = 2;
std::vector<uint8_t> data;
std::vector<size_t> sizes;
+ std::vector<int> pathArgs;
size_t total = 0;
for (int i = 1; i < argc; ++i) {
if (argv[i] == nullptr) {
@@ -90,6 +121,12 @@ int main(int argc, char const* argv[]) {
}
if (argv[i][0] == '-') {
if (argv[i][1] == '-') {
+ if (dictionaryArg != -1) {
+ fprintf(stderr,
+ "Method should be specified before dictionary / sample '%s'\n",
+ argv[i]);
+ exit(1);
+ }
if (std::strcmp("--sieve", argv[i]) == 0) {
method = METHOD_SIEVE;
continue;
@@ -98,13 +135,32 @@ int main(int argc, char const* argv[]) {
method = METHOD_DM;
continue;
}
+ if (std::strcmp("--dsh", argv[i]) == 0) {
+ method = METHOD_DURCHSCHLAG;
+ continue;
+ }
+ if (std::strcmp("--distill", argv[i]) == 0) {
+ method = METHOD_DISTILL;
+ continue;
+ }
+ if (std::strcmp("--purify", argv[i]) == 0) {
+ method = METHOD_PURIFY;
+ continue;
+ }
printHelp(fileName(argv[0]));
fprintf(stderr, "Invalid option '%s'\n", argv[i]);
exit(1);
}
- if (argv[i][1] == 's') {
- sieveSliceLen = readInt(&argv[i][2]);
- if (sieveSliceLen < 4 || sieveSliceLen > 256) {
+ if (argv[i][1] == 'b') {
+ blockSize = readInt(&argv[i][2]);
+ if (blockSize < 16 || blockSize > 65536) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+ exit(1);
+ }
+ } else if (argv[i][1] == 's') {
+ sliceLen = readInt(&argv[i][2]);
+ if (sliceLen < 4 || sliceLen > 256) {
printHelp(fileName(argv[0]));
fprintf(stderr, "Invalid option '%s'\n", argv[i]);
exit(1);
@@ -116,6 +172,13 @@ int main(int argc, char const* argv[]) {
fprintf(stderr, "Invalid option '%s'\n", argv[i]);
exit(1);
}
+ } else if (argv[i][1] == 'u') {
+ minimumPopulation = readInt(&argv[i][2]);
+ if (minimumPopulation < 256 || minimumPopulation > 65536) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+ exit(1);
+ }
} else {
printHelp(fileName(argv[0]));
fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
@@ -124,26 +187,42 @@ int main(int argc, char const* argv[]) {
continue;
}
if (dictionaryArg == -1) {
- dictionaryArg = i;
- continue;
+ if (method != METHOD_DISTILL && method != METHOD_PURIFY) {
+ dictionaryArg = i;
+ continue;
+ }
}
std::string content = readFile(argv[i]);
data.insert(data.end(), content.begin(), content.end());
total += content.size();
+ pathArgs.push_back(i);
sizes.push_back(content.size());
}
- if (dictionaryArg == -1 || total == 0) {
+ bool wantDictionary = (dictionaryArg == -1);
+ if (method == METHOD_DISTILL || method == METHOD_PURIFY) {
+ wantDictionary = false;
+ }
+ if (wantDictionary || total == 0) {
printHelp(fileName(argv[0]));
fprintf(stderr, "Not enough arguments\n");
exit(1);
}
if (method == METHOD_SIEVE) {
- writeFile(argv[dictionaryArg],
- sieve_generate(targetSize, sieveSliceLen, sizes, data.data()));
+ writeFile(argv[dictionaryArg], sieve_generate(
+ targetSize, sliceLen, sizes, data.data()));
} else if (method == METHOD_DM) {
- writeFile(argv[dictionaryArg],
- DM_generate(targetSize, sizes, data.data()));
+ writeFile(argv[dictionaryArg], DM_generate(
+ targetSize, sizes, data.data()));
+ } else if (method == METHOD_DURCHSCHLAG) {
+ writeFile(argv[dictionaryArg], durchschlag_generate(
+ targetSize, sliceLen, blockSize, sizes, data.data()));
+ } else if (method == METHOD_DISTILL) {
+ durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data());
+ writeSamples(argv, pathArgs, sizes, data.data());
+ } else if (method == METHOD_PURIFY) {
+ durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data());
+ writeSamples(argv, pathArgs, sizes, data.data());
} else {
printHelp(fileName(argv[0]));
fprintf(stderr, "Unknown generator\n");