From f8c671774514357abec2f6b14c8ee13c6fd885d3 Mon Sep 17 00:00:00 2001 From: Eugene Kliuchnikov Date: Wed, 23 Jun 2021 09:40:57 +0200 Subject: Update (#908) * re-enable Js build/test * improve decoder performance * rewrite dictionary data in Java/Js to a shorter uncompressed form * improve dictionary generation tool --- research/dictionary_generator.cc | 71 +++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 27 deletions(-) (limited to 'research') diff --git a/research/dictionary_generator.cc b/research/dictionary_generator.cc index dcdf2fa..715723f 100644 --- a/research/dictionary_generator.cc +++ b/research/dictionary_generator.cc @@ -3,6 +3,7 @@ #include #include #include +#include #include #include "./deorummolae.h" @@ -71,14 +72,13 @@ static void writeFile(const char* file, const std::string& content) { outfile.close(); } -static void writeSamples(char const* argv[], const std::vector& pathArgs, +static void writeSamples(const std::vector& paths, const std::vector& sizes, const uint8_t* data) { size_t offset = 0; - for (size_t i = 0; i < pathArgs.size(); ++i) { - int j = pathArgs[i]; - const char* file = argv[j]; + for (size_t i = 0; i < paths.size(); ++i) { + const char* path = paths[i].c_str(); size_t sampleSize = sizes[i]; - std::ofstream outfile(file, std::ofstream::binary); + std::ofstream outfile(path, std::ofstream::binary); outfile.write(reinterpret_cast(data + offset), static_cast(sampleSize)); outfile.close(); @@ -135,7 +135,7 @@ int main(int argc, char const* argv[]) { std::vector data; std::vector sizes; - std::vector pathArgs; + std::vector paths; size_t total = 0; for (int i = 1; i < argc; ++i) { if (argv[i] == nullptr) { @@ -217,6 +217,9 @@ int main(int argc, char const* argv[]) { } } else if (arg1 == 's') { sliceLen = readInt(arg2); + // TODO: investigate why sliceLen == 4..5 greatly slows down + // durschlag engine, but only from command line; + // durschlag_runner seems to work fine with those. if (sliceLen < 4 || sliceLen > 256) { printHelp(fileName(argv[0])); fprintf(stderr, "Invalid option '%s'\n", argv[i]); @@ -265,28 +268,42 @@ int main(int argc, char const* argv[]) { } } - std::string content = readFile(argv[i]); - if (chunkLen == 0) { - pathArgs.push_back(i); - data.insert(data.end(), content.begin(), content.end()); - total += content.size(); - sizes.push_back(content.size()); - continue; - } else if (chunkLen <= overlapLen) { - printHelp(fileName(argv[0])); - fprintf(stderr, "Invalid chunkLen - overlapLen combination\n"); - exit(1); - } - for (size_t chunkStart = 0; - chunkStart < content.size(); - chunkStart += chunkLen - overlapLen) { - std::string chunk = content.substr(chunkStart, chunkLen); - data.insert(data.end(), chunk.begin(), chunk.end()); - total += chunk.size(); - sizes.push_back(chunk.size()); + glob_t resolved_paths; + memset(&resolved_paths, 0, sizeof(resolved_paths)); + bool ok = true; + if (glob(argv[i], GLOB_TILDE, NULL, &resolved_paths) == 0) { + for(size_t j = 0; j < resolved_paths.gl_pathc; ++j) { + std::string content = readFile(resolved_paths.gl_pathv[j]); + if (chunkLen == 0) { + paths.emplace_back(resolved_paths.gl_pathv[j]); + data.insert(data.end(), content.begin(), content.end()); + total += content.size(); + sizes.push_back(content.size()); + continue; + } else if (chunkLen <= overlapLen) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid chunkLen - overlapLen combination\n"); + exit(1); + } + for (size_t chunkStart = 0; + chunkStart < content.size(); + chunkStart += chunkLen - overlapLen) { + std::string chunk = content.substr(chunkStart, chunkLen); + data.insert(data.end(), chunk.begin(), chunk.end()); + total += chunk.size(); + sizes.push_back(chunk.size()); + } + } + } else { + ok = false; } + globfree(&resolved_paths); + if (!ok) exit(1); } + fprintf(stderr, "Number of chunks: %zu; total size: %zu\n", sizes.size(), + total); + bool wantDictionary = (dictionaryArg == -1); if (method == METHOD_DISTILL || method == METHOD_PURIFY) { wantDictionary = false; @@ -313,10 +330,10 @@ int main(int argc, char const* argv[]) { targetSize, sliceLen, blockSize, sizes, data.data())); } else if (method == METHOD_DISTILL) { durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data()); - writeSamples(argv, pathArgs, sizes, data.data()); + writeSamples(paths, sizes, data.data()); } else if (method == METHOD_PURIFY) { durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data()); - writeSamples(argv, pathArgs, sizes, data.data()); + writeSamples(paths, sizes, data.data()); } else { printHelp(fileName(argv[0])); fprintf(stderr, "Unknown generator\n"); -- cgit v1.1