aboutsummaryrefslogtreecommitdiff
path: root/research
diff options
context:
space:
mode:
authorEugene Kliuchnikov <eustas.ru@gmail.com>2021-06-23 09:40:57 +0200
committerGitHub <noreply@github.com>2021-06-23 09:40:57 +0200
commitf8c671774514357abec2f6b14c8ee13c6fd885d3 (patch)
treefa1605f92feea7b851389d805b5bd8c1f010b636 /research
parentbbe5d72ba37b164c4e8e0d7baeb78a22aa849c38 (diff)
downloadbrotli-f8c671774514357abec2f6b14c8ee13c6fd885d3.zip
brotli-f8c671774514357abec2f6b14c8ee13c6fd885d3.tar.gz
brotli-f8c671774514357abec2f6b14c8ee13c6fd885d3.tar.bz2
Update (#908)
* re-enable Js build/test
* improve decoder performance
* rewrite dictionary data in Java/Js to a shorter uncompressed form
* improve dictionary generation tool
Diffstat (limited to 'research')
-rw-r--r--research/dictionary_generator.cc71
1 files changed, 44 insertions, 27 deletions
diff --git a/research/dictionary_generator.cc b/research/dictionary_generator.cc
index dcdf2fa..715723f 100644
--- a/research/dictionary_generator.cc
+++ b/research/dictionary_generator.cc
@@ -3,6 +3,7 @@
#include <cstdio>
#include <cstring>
#include <fstream>
+#include <glob.h>
#include <vector>
#include "./deorummolae.h"
@@ -71,14 +72,13 @@ static void writeFile(const char* file, const std::string& content) {
outfile.close();
}
-static void writeSamples(char const* argv[], const std::vector<int>& pathArgs,
+static void writeSamples(const std::vector<std::string>& paths,
const std::vector<size_t>& sizes, const uint8_t* data) {
size_t offset = 0;
- for (size_t i = 0; i < pathArgs.size(); ++i) {
- int j = pathArgs[i];
- const char* file = argv[j];
+ for (size_t i = 0; i < paths.size(); ++i) {
+ const char* path = paths[i].c_str();
size_t sampleSize = sizes[i];
- std::ofstream outfile(file, std::ofstream::binary);
+ std::ofstream outfile(path, std::ofstream::binary);
outfile.write(reinterpret_cast<const char*>(data + offset),
static_cast<std::streamsize>(sampleSize));
outfile.close();
@@ -135,7 +135,7 @@ int main(int argc, char const* argv[]) {
std::vector<uint8_t> data;
std::vector<size_t> sizes;
- std::vector<int> pathArgs;
+ std::vector<std::string> paths;
size_t total = 0;
for (int i = 1; i < argc; ++i) {
if (argv[i] == nullptr) {
@@ -217,6 +217,9 @@ int main(int argc, char const* argv[]) {
}
} else if (arg1 == 's') {
sliceLen = readInt(arg2);
+ // TODO: investigate why sliceLen == 4..5 greatly slows down the
+ // durchschlag engine, but only when run from the command line;
+ // durschlag_runner seems to work fine with those values.
if (sliceLen < 4 || sliceLen > 256) {
printHelp(fileName(argv[0]));
fprintf(stderr, "Invalid option '%s'\n", argv[i]);
@@ -265,28 +268,42 @@ int main(int argc, char const* argv[]) {
}
}
- std::string content = readFile(argv[i]);
- if (chunkLen == 0) {
- pathArgs.push_back(i);
- data.insert(data.end(), content.begin(), content.end());
- total += content.size();
- sizes.push_back(content.size());
- continue;
- } else if (chunkLen <= overlapLen) {
- printHelp(fileName(argv[0]));
- fprintf(stderr, "Invalid chunkLen - overlapLen combination\n");
- exit(1);
- }
- for (size_t chunkStart = 0;
- chunkStart < content.size();
- chunkStart += chunkLen - overlapLen) {
- std::string chunk = content.substr(chunkStart, chunkLen);
- data.insert(data.end(), chunk.begin(), chunk.end());
- total += chunk.size();
- sizes.push_back(chunk.size());
+ glob_t resolved_paths;
+ memset(&resolved_paths, 0, sizeof(resolved_paths));
+ bool ok = true;
+ if (glob(argv[i], GLOB_TILDE, NULL, &resolved_paths) == 0) {
+ for(size_t j = 0; j < resolved_paths.gl_pathc; ++j) {
+ std::string content = readFile(resolved_paths.gl_pathv[j]);
+ if (chunkLen == 0) {
+ paths.emplace_back(resolved_paths.gl_pathv[j]);
+ data.insert(data.end(), content.begin(), content.end());
+ total += content.size();
+ sizes.push_back(content.size());
+ continue;
+ } else if (chunkLen <= overlapLen) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid chunkLen - overlapLen combination\n");
+ exit(1);
+ }
+ for (size_t chunkStart = 0;
+ chunkStart < content.size();
+ chunkStart += chunkLen - overlapLen) {
+ std::string chunk = content.substr(chunkStart, chunkLen);
+ data.insert(data.end(), chunk.begin(), chunk.end());
+ total += chunk.size();
+ sizes.push_back(chunk.size());
+ }
+ }
+ } else {
+ ok = false;
}
+ globfree(&resolved_paths);
+ if (!ok) exit(1);
}
+ fprintf(stderr, "Number of chunks: %zu; total size: %zu\n", sizes.size(),
+ total);
+
bool wantDictionary = (dictionaryArg == -1);
if (method == METHOD_DISTILL || method == METHOD_PURIFY) {
wantDictionary = false;
@@ -313,10 +330,10 @@ int main(int argc, char const* argv[]) {
targetSize, sliceLen, blockSize, sizes, data.data()));
} else if (method == METHOD_DISTILL) {
durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data());
- writeSamples(argv, pathArgs, sizes, data.data());
+ writeSamples(paths, sizes, data.data());
} else if (method == METHOD_PURIFY) {
durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data());
- writeSamples(argv, pathArgs, sizes, data.data());
+ writeSamples(paths, sizes, data.data());
} else {
printHelp(fileName(argv[0]));
fprintf(stderr, "Unknown generator\n");