diff options
author | Eugene Kliuchnikov <eustas@google.com> | 2018-02-26 09:04:36 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-02-26 09:04:36 -0500 |
commit | 35e69fc7cf9421ab04ffc9d52cb36d07fa12984a (patch) | |
tree | a1ed614391936d455da2b0610ef8e8caf88b4289 /research/dictionary_generator.cc | |
parent | 3af18990f50d8f040038aaa08c41f5d27d62efb5 (diff) | |
download | brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.zip brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.gz brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.bz2 |
New feature: "Large Window Brotli" (#640)
* New feature: "Large Window Brotli"
By setting a special encoder/decoder flag, it is now possible to extend the
LZ-window up to 30 bits; however, the produced stream will not be RFC7932
compliant.
Added a new dictionary generator, "DSH". It combines the speed of "Sieve"
with the quality of "DM". Also added utilities to prepare training corpora
(removing unique strings).
Improved compression ratio: two sub-blocks can now be stitched together, i.e.
the last copy command can be extended to span the next sub-block.
Fixed compression inefficiency caused by floating-point rounding and a
wrong cost heuristic.
Other C changes:
- combined / moved `context.h` to `common`
- moved transforms to `common`
- unified some aspects of code formatting
- added an abstraction for encoder (static) dictionary
- moved default allocator/deallocator functions to `common`
brotli CLI:
- window size is auto-adjusted if not specified explicitly
Java:
- added "eager" decoding to both the JNI wrapper and the pure decoder
- huge speed-up of `DictionaryData` initialization
* Add dictionaryless compressed dictionary
* Fix `sources.lst`
* Fix `sources.lst` and add a note that `libtool` is also required.
* Update setup.py
* Fix `EagerStreamTest`
* Fix BUILD file
* Add missing `libdivsufsort` dependency
* Fix "unused parameter" warning.
Diffstat (limited to 'research/dictionary_generator.cc')
-rwxr-xr-x | research/dictionary_generator.cc | 119 |
1 files changed, 99 insertions, 20 deletions
diff --git a/research/dictionary_generator.cc b/research/dictionary_generator.cc index b3ee89c..00cfaba 100755 --- a/research/dictionary_generator.cc +++ b/research/dictionary_generator.cc @@ -1,15 +1,20 @@ +#include <cstddef> #include <cstdio> #include <cstring> #include <fstream> #include <vector> #include "./deorummolae.h" +#include "./durchschlag.h" #include "./sieve.h" #define METHOD_DM 0 #define METHOD_SIEVE 1 +#define METHOD_DURCHSCHLAG 2 +#define METHOD_DISTILL 3 +#define METHOD_PURIFY 4 -size_t readInt(const char* str) { +static size_t readInt(const char* str) { size_t result = 0; if (str[0] == 0 || str[0] == '0') { return 0; @@ -51,10 +56,25 @@ static std::string readFile(const std::string& path) { static void writeFile(const char* file, const std::string& content) { std::ofstream outfile(file, std::ofstream::binary); - outfile.write(content.c_str(), content.size()); + outfile.write(content.c_str(), static_cast<std::streamsize>(content.size())); outfile.close(); } +static void writeSamples(char const* argv[], const std::vector<int>& pathArgs, + const std::vector<size_t>& sizes, const uint8_t* data) { + size_t offset = 0; + for (size_t i = 0; i < pathArgs.size(); ++i) { + int j = pathArgs[i]; + const char* file = argv[j]; + size_t sampleSize = sizes[i]; + std::ofstream outfile(file, std::ofstream::binary); + outfile.write(reinterpret_cast<const char*>(data + offset), + static_cast<std::streamsize>(sampleSize)); + outfile.close(); + offset += sampleSize; + } +} + /* Returns "base file name" or its tail, if it contains '/' or '\'. */ static const char* fileName(const char* path) { const char* separator_position = strrchr(path, '/'); @@ -68,21 +88,32 @@ static void printHelp(const char* name) { fprintf(stderr, "Usage: %s [OPTION]... 
DICTIONARY [SAMPLE]...\n", name); fprintf(stderr, "Options:\n" - " --dm use 'deorummolae' engine\n" - " --sieve use 'sieve' engine (default)\n" - " -t# set target dictionary size (limit); default: 16K\n" - " -s# set slize length for 'sieve'; default: 33\n" - "# is a decimal number with optional k/K/m/M suffix.\n\n"); + " --dm use 'deorummolae' engine\n" + " --distill rewrite samples; unique text parts are removed\n" + " --dsh use 'durchschlag' engine (default)\n" + " --purify rewrite samples; unique text parts are zeroed out\n" + " --sieve use 'sieve' engine\n" + " -b# set block length for 'durchschlag'; default: 1024\n" + " -s# set slice length for 'distill', 'durchschlag', 'purify'\n" + " and 'sieve'; default: 16\n" + " -t# set target dictionary size (limit); default: 16K\n" + " -u# set minimum slice population (for rewrites); default: 2\n" + "# is a decimal number with optional k/K/m/M suffix.\n" + "WARNING: 'distill' and 'purify' will overwrite original samples!\n" + " Completely unique samples might become empty files.\n\n"); } int main(int argc, char const* argv[]) { int dictionaryArg = -1; - int method = METHOD_SIEVE; - int sieveSliceLen = 33; - int targetSize = 16 << 10; + int method = METHOD_DURCHSCHLAG; + size_t sliceLen = 16; + size_t targetSize = 16 << 10; + size_t blockSize = 1024; + size_t minimumPopulation = 2; std::vector<uint8_t> data; std::vector<size_t> sizes; + std::vector<int> pathArgs; size_t total = 0; for (int i = 1; i < argc; ++i) { if (argv[i] == nullptr) { @@ -90,6 +121,12 @@ int main(int argc, char const* argv[]) { } if (argv[i][0] == '-') { if (argv[i][1] == '-') { + if (dictionaryArg != -1) { + fprintf(stderr, + "Method should be specified before dictionary / sample '%s'\n", + argv[i]); + exit(1); + } if (std::strcmp("--sieve", argv[i]) == 0) { method = METHOD_SIEVE; continue; @@ -98,13 +135,32 @@ int main(int argc, char const* argv[]) { method = METHOD_DM; continue; } + if (std::strcmp("--dsh", argv[i]) == 0) { + method = 
METHOD_DURCHSCHLAG; + continue; + } + if (std::strcmp("--distill", argv[i]) == 0) { + method = METHOD_DISTILL; + continue; + } + if (std::strcmp("--purify", argv[i]) == 0) { + method = METHOD_PURIFY; + continue; + } printHelp(fileName(argv[0])); fprintf(stderr, "Invalid option '%s'\n", argv[i]); exit(1); } - if (argv[i][1] == 's') { - sieveSliceLen = readInt(&argv[i][2]); - if (sieveSliceLen < 4 || sieveSliceLen > 256) { + if (argv[i][1] == 'b') { + blockSize = readInt(&argv[i][2]); + if (blockSize < 16 || blockSize > 65536) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } + } else if (argv[i][1] == 's') { + sliceLen = readInt(&argv[i][2]); + if (sliceLen < 4 || sliceLen > 256) { printHelp(fileName(argv[0])); fprintf(stderr, "Invalid option '%s'\n", argv[i]); exit(1); @@ -116,6 +172,13 @@ int main(int argc, char const* argv[]) { fprintf(stderr, "Invalid option '%s'\n", argv[i]); exit(1); } + } else if (argv[i][1] == 'u') { + minimumPopulation = readInt(&argv[i][2]); + if (minimumPopulation < 256 || minimumPopulation > 65536) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } } else { printHelp(fileName(argv[0])); fprintf(stderr, "Unrecognized option '%s'\n", argv[i]); @@ -124,26 +187,42 @@ int main(int argc, char const* argv[]) { continue; } if (dictionaryArg == -1) { - dictionaryArg = i; - continue; + if (method != METHOD_DISTILL && method != METHOD_PURIFY) { + dictionaryArg = i; + continue; + } } std::string content = readFile(argv[i]); data.insert(data.end(), content.begin(), content.end()); total += content.size(); + pathArgs.push_back(i); sizes.push_back(content.size()); } - if (dictionaryArg == -1 || total == 0) { + bool wantDictionary = (dictionaryArg == -1); + if (method == METHOD_DISTILL || method == METHOD_PURIFY) { + wantDictionary = false; + } + if (wantDictionary || total == 0) { printHelp(fileName(argv[0])); fprintf(stderr, "Not enough 
arguments\n"); exit(1); } if (method == METHOD_SIEVE) { - writeFile(argv[dictionaryArg], - sieve_generate(targetSize, sieveSliceLen, sizes, data.data())); + writeFile(argv[dictionaryArg], sieve_generate( + targetSize, sliceLen, sizes, data.data())); } else if (method == METHOD_DM) { - writeFile(argv[dictionaryArg], - DM_generate(targetSize, sizes, data.data())); + writeFile(argv[dictionaryArg], DM_generate( + targetSize, sizes, data.data())); + } else if (method == METHOD_DURCHSCHLAG) { + writeFile(argv[dictionaryArg], durchschlag_generate( + targetSize, sliceLen, blockSize, sizes, data.data())); + } else if (method == METHOD_DISTILL) { + durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data()); + writeSamples(argv, pathArgs, sizes, data.data()); + } else if (method == METHOD_PURIFY) { + durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data()); + writeSamples(argv, pathArgs, sizes, data.data()); } else { printHelp(fileName(argv[0])); fprintf(stderr, "Unknown generator\n"); |