From 4b2b2d4f83ffeaac7708e44409fe34896a01a278 Mon Sep 17 00:00:00 2001 From: Eugene Kliuchnikov Date: Fri, 12 Apr 2019 13:57:42 +0200 Subject: Update (#749) Update: * Bazel: fix MSVC configuration * C: common: extended documentation and helpers around distance codes * C: common: enable BROTLI_DCHECK in "debug" builds * C: common: fix implicit trailing zero in `kPrefixSuffix` * C: dec: fix possible bit reader discharge for "large-window" mode * C: dec: simplify distance decoding via lookup table * C: dec: reuse decoder state members memory via union with lookup table * C: dec: add decoder state diagram * C: enc: clarify access to static dictionary * C: enc: improve static dictionary hash * C: enc: add "stream offset" parameter for parallel encoding * C: enc: reorganize hasher; now Q2-Q3 require exactly 256KiB to avoid global TCMalloc lock * C: enc: fix rare access to uninitialized data in ring-buffer * C: enc: reorganize logging / checks in `write_bits.h` * Java: dec: add "large-window" support * Java: dec: improve speed * Java: dec: debug and 32-bit mode are now activated via system properties * Java: dec: demystify some state variables (use better names) * Dictionary generator: add single input mode * Java: dec: modernize tests * Bazel: js: pick working commit for closure rules --- research/dictionary_generator.cc | 144 ++++++++++++++++++++++++++++++++------- 1 file changed, 119 insertions(+), 25 deletions(-) (limited to 'research') diff --git a/research/dictionary_generator.cc b/research/dictionary_generator.cc index 00cfaba..dcdf2fa 100755 --- a/research/dictionary_generator.cc +++ b/research/dictionary_generator.cc @@ -1,3 +1,4 @@ +#include #include #include #include @@ -8,6 +9,16 @@ #include "./durchschlag.h" #include "./sieve.h" +/* This isn't a definitive list of "--foo" arguments, only those that take an + * additional "=#" integer parameter, like "--foo=20" or "--foo=32K". + */ +#define LONG_ARG_BLOCK_LEN "--block_len=" +#define LONG_ARG_SLICE_LEN "--slice_len=" +#define LONG_ARG_TARGET_DICT_LEN "--target_dict_len=" +#define LONG_ARG_MIN_SLICE_POP "--min_slice_pop=" +#define LONG_ARG_CHUNK_LEN "--chunk_len=" +#define LONG_ARG_OVERLAP_LEN "--overlap_len=" + #define METHOD_DM 0 #define METHOD_SIEVE 1 #define METHOD_DURCHSCHLAG 2 @@ -93,11 +104,20 @@ static void printHelp(const char* name) { " --dsh use 'durchschlag' engine (default)\n" " --purify rewrite samples; unique text parts are zeroed out\n" " --sieve use 'sieve' engine\n" - " -b# set block length for 'durchschlag'; default: 1024\n" - " -s# set slice length for 'distill', 'durchschlag', 'purify'\n" + " -b#, --block_len=#\n" + " set block length for 'durchschlag'; default: 1024\n" + " -s#, --slice_len=#\n" + " set slice length for 'distill', 'durchschlag', 'purify'\n" " and 'sieve'; default: 16\n" - " -t# set target dictionary size (limit); default: 16K\n" - " -u# set minimum slice population (for rewrites); default: 2\n" + " -t#, --target_dict_len=#\n" + " set target dictionary length (limit); default: 16K\n" + " -u#, --min_slice_pop=#\n" + " set minimum slice population (for rewrites); default: 2\n" + " -c#, --chunk_len=#\n" + " if positive, samples are cut into chunks of this length;\n" + " default: 0; cannot mix with 'rewrite samples'\n" + " -o#, --overlap_len=#\n" + " set chunk overlap length; default 0\n" "# is a decimal number with optional k/K/m/M suffix.\n" "WARNING: 'distill' and 'purify' will overwrite original samples!\n" " Completely unique samples might become empty files.\n\n"); @@ -110,6 +130,8 @@ int main(int argc, char const* argv[]) { size_t targetSize = 16 << 10; size_t blockSize = 1024; size_t minimumPopulation = 2; + size_t chunkLen = 0; + size_t overlapLen = 0; std::vector data; std::vector sizes; @@ -119,66 +141,115 @@ int main(int argc, char const* argv[]) { if (argv[i] == nullptr) { continue; } + if (argv[i][0] == '-') { - if (argv[i][1] == '-') { + char arg1 = argv[i][1]; + const char* arg2 = arg1 ? &argv[i][2] : nullptr; + if (arg1 == '-') { if (dictionaryArg != -1) { fprintf(stderr, "Method should be specified before dictionary / sample '%s'\n", argv[i]); exit(1); } - if (std::strcmp("--sieve", argv[i]) == 0) { + + /* Look for "--long_arg" via exact match. */ + if (std::strcmp(argv[i], "--sieve") == 0) { method = METHOD_SIEVE; continue; } - if (std::strcmp("--dm", argv[i]) == 0) { + if (std::strcmp(argv[i], "--dm") == 0) { method = METHOD_DM; continue; } - if (std::strcmp("--dsh", argv[i]) == 0) { + if (std::strcmp(argv[i], "--dsh") == 0) { method = METHOD_DURCHSCHLAG; continue; } - if (std::strcmp("--distill", argv[i]) == 0) { + if (std::strcmp(argv[i], "--distill") == 0) { method = METHOD_DISTILL; continue; } - if (std::strcmp("--purify", argv[i]) == 0) { + if (std::strcmp(argv[i], "--purify") == 0) { method = METHOD_PURIFY; continue; } - printHelp(fileName(argv[0])); - fprintf(stderr, "Invalid option '%s'\n", argv[i]); - exit(1); + + /* Look for "--long_arg=#" via prefix match. */ + if (std::strncmp(argv[i], LONG_ARG_BLOCK_LEN, + std::strlen(LONG_ARG_BLOCK_LEN)) == 0) { + arg1 = 'b'; + arg2 = &argv[i][std::strlen(LONG_ARG_BLOCK_LEN)]; + } else if (std::strncmp(argv[i], LONG_ARG_SLICE_LEN, + std::strlen(LONG_ARG_SLICE_LEN)) == 0) { + arg1 = 's'; + arg2 = &argv[i][std::strlen(LONG_ARG_SLICE_LEN)]; + } else if (std::strncmp(argv[i], LONG_ARG_TARGET_DICT_LEN, + std::strlen(LONG_ARG_TARGET_DICT_LEN)) == 0) { + arg1 = 't'; + arg2 = &argv[i][std::strlen(LONG_ARG_TARGET_DICT_LEN)]; + } else if (std::strncmp(argv[i], LONG_ARG_MIN_SLICE_POP, + std::strlen(LONG_ARG_MIN_SLICE_POP)) == 0) { + arg1 = 'u'; + arg2 = &argv[i][std::strlen(LONG_ARG_MIN_SLICE_POP)]; + } else if (std::strncmp(argv[i], LONG_ARG_CHUNK_LEN, + std::strlen(LONG_ARG_CHUNK_LEN)) == 0) { + arg1 = 'c'; + arg2 = &argv[i][std::strlen(LONG_ARG_CHUNK_LEN)]; + } else if (std::strncmp(argv[i], LONG_ARG_OVERLAP_LEN, + std::strlen(LONG_ARG_OVERLAP_LEN)) == 0) { + arg1 = 'o'; + arg2 = &argv[i][std::strlen(LONG_ARG_OVERLAP_LEN)]; + } else { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } } - if (argv[i][1] == 'b') { - blockSize = readInt(&argv[i][2]); + + /* Look for "-f" short args or "--foo=#" long args. */ + if (arg1 == 'b') { + blockSize = readInt(arg2); if (blockSize < 16 || blockSize > 65536) { printHelp(fileName(argv[0])); fprintf(stderr, "Invalid option '%s'\n", argv[i]); exit(1); } - } else if (argv[i][1] == 's') { - sliceLen = readInt(&argv[i][2]); + } else if (arg1 == 's') { + sliceLen = readInt(arg2); if (sliceLen < 4 || sliceLen > 256) { printHelp(fileName(argv[0])); fprintf(stderr, "Invalid option '%s'\n", argv[i]); exit(1); } - } else if (argv[i][1] == 't') { - targetSize = readInt(&argv[i][2]); + } else if (arg1 == 't') { + targetSize = readInt(arg2); if (targetSize < 256 || targetSize > (1 << 25)) { printHelp(fileName(argv[0])); fprintf(stderr, "Invalid option '%s'\n", argv[i]); exit(1); } - } else if (argv[i][1] == 'u') { - minimumPopulation = readInt(&argv[i][2]); + } else if (arg1 == 'u') { + minimumPopulation = readInt(arg2); if (minimumPopulation < 256 || minimumPopulation > 65536) { printHelp(fileName(argv[0])); fprintf(stderr, "Invalid option '%s'\n", argv[i]); exit(1); } + } else if (arg1 == 'c') { + chunkLen = readInt(arg2); + if (chunkLen < 0 || chunkLen > INT_MAX) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } + } else if (arg1 == 'o') { + overlapLen = readInt(arg2); + if (overlapLen < 0 || overlapLen > INT_MAX) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } } else { printHelp(fileName(argv[0])); fprintf(stderr, "Unrecognized option '%s'\n", argv[i]); @@ -186,21 +257,44 @@ int main(int argc, char const* argv[]) { } continue; } + if (dictionaryArg == -1) { if (method != METHOD_DISTILL && method != METHOD_PURIFY) { dictionaryArg = i; continue; } } + std::string content = readFile(argv[i]); - data.insert(data.end(), content.begin(), content.end()); - total += content.size(); - pathArgs.push_back(i); - sizes.push_back(content.size()); + if (chunkLen == 0) { + pathArgs.push_back(i); + data.insert(data.end(), content.begin(), content.end()); + total += content.size(); + sizes.push_back(content.size()); + continue; + } else if (chunkLen <= overlapLen) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid chunkLen - overlapLen combination\n"); + exit(1); + } + for (size_t chunkStart = 0; + chunkStart < content.size(); + chunkStart += chunkLen - overlapLen) { + std::string chunk = content.substr(chunkStart, chunkLen); + data.insert(data.end(), chunk.begin(), chunk.end()); + total += chunk.size(); + sizes.push_back(chunk.size()); + } } + bool wantDictionary = (dictionaryArg == -1); if (method == METHOD_DISTILL || method == METHOD_PURIFY) { wantDictionary = false; + if (chunkLen != 0) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Cannot mix 'rewrite samples' with positive chunk_len\n"); + exit(1); + } } if (wantDictionary || total == 0) { printHelp(fileName(argv[0])); -- cgit v1.1