aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEugene Kliuchnikov <eustas@google.com>2017-10-13 11:25:03 +0200
committerGitHub <noreply@github.com>2017-10-13 11:25:03 +0200
commit39ef4bbdcffd6316b5d3e95904cf9b0aa5e19504 (patch)
tree15d7f28cb8942e9137cc66170c2c8592be028724
parent9c75a2a26a2b704ee8895ae298e5bff3e78acc70 (diff)
downloadbrotli-39ef4bbdcffd6316b5d3e95904cf9b0aa5e19504.zip
brotli-39ef4bbdcffd6316b5d3e95904cf9b0aa5e19504.tar.gz
brotli-39ef4bbdcffd6316b5d3e95904cf9b0aa5e19504.tar.bz2
Add new (fast) dictionary generator engine. (#616)
Add CLI for dictionary generation. Add BUILD file for research folder
-rwxr-xr-xresearch/BUILD29
-rw-r--r--research/deorummolae.cc33
-rw-r--r--research/deorummolae.h14
-rwxr-xr-xresearch/dictionary_generator.cc153
-rwxr-xr-xresearch/sieve.cc211
-rwxr-xr-xresearch/sieve.h21
6 files changed, 436 insertions, 25 deletions
diff --git a/research/BUILD b/research/BUILD
new file mode 100755
index 0000000..6ff5ac2
--- /dev/null
+++ b/research/BUILD
@@ -0,0 +1,29 @@
+# Description: brotli research tools.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"]) # MIT
+
+cc_library(
+ name = "dm",
+ srcs = ["deorummolae.cc"],
+ hdrs = [
+ "deorummolae.h",
+ "esaxx/sais.hxx",
+ ],
+)
+
+cc_library(
+ name = "sieve",
+ srcs = ["sieve.cc"],
+ hdrs = ["sieve.h"],
+)
+
+cc_binary(
+ name = "dictionary_generator",
+ srcs = ["dictionary_generator.cc"],
+ deps = [
+ ":dm",
+ ":sieve",
+ ],
+)
diff --git a/research/deorummolae.cc b/research/deorummolae.cc
index f4cc53e..c53a53c 100644
--- a/research/deorummolae.cc
+++ b/research/deorummolae.cc
@@ -1,7 +1,7 @@
#include "./deorummolae.h"
#include <array>
-#include <vector>
+#include <cstdio>
#include "./esaxx/sais.hxx"
@@ -20,9 +20,7 @@
/* File coverage: every bit set to 1 denotes a file covered by an isle. */
typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;
-static int popcount(uint64_t u) {
- return __builtin_popcountll(u);
-}
+static int popcount(uint64_t u) { return __builtin_popcountll(u); }
/* Condense terminators and pad file entries. */
static void rewriteText(std::vector<int>* text) {
@@ -155,9 +153,8 @@ static void cutMatch(std::vector<std::vector<int>>* data, int index, int length,
}
}
-size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
- size_t num_samples, const size_t* sample_sizes,
- const uint8_t* sample_data) {
+std::string DM_generate(size_t dictionary_size_limit,
+ const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
{
uint64_t tmp = 0;
if (popcount(tmp - 1u) != 64) {
@@ -169,9 +166,11 @@ size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
/* Could use 256 + '0' for easier debugging. */
int next_terminator = 256;
+ std::string output;
std::vector<std::vector<int>> data;
size_t offset = 0;
+ size_t num_samples = sample_sizes.size();
if (num_samples > MAX_FILES) num_samples = MAX_FILES;
for (size_t n = 0; n < num_samples; ++n) {
size_t next_offset = offset + sample_sizes[n];
@@ -208,7 +207,7 @@ size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
buildLcp(&full_text, &sa, &lcp, &invese_sa);
/* Do not rebuild SA/LCP, just use different selection. */
-retry:
+ retry:
best_cost = 0;
best_isle = {0, 0, 0, {{0}}};
isles.resize(0);
@@ -259,18 +258,16 @@ retry:
}
/* Save the entry. */
- fprintf(stderr,
- "Savings: %zu+%zu, dictionary: %zu+%d\n",
- total_cost, best_cost, total, best_isle.lcp);
- for (size_t i = 0; i < best_isle.lcp; ++i) {
- dictionary[total + i] =
- static_cast<uint8_t>(full_text[sa[best_isle.l] + i]);
- }
+ fprintf(stderr, "Savings: %zu+%zu, dictionary: %zu+%d\n",
+ total_cost, best_cost, total, best_isle.lcp);
+ int* piece = &full_text[sa[best_isle.l]];
+ output.insert(output.end(), piece, piece + best_isle.lcp);
total += best_isle.lcp;
total_cost += best_cost;
- cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp,
- &invese_sa, &next_terminator, &file_map, &file_offset);
+ cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &invese_sa,
+ &next_terminator, &file_map, &file_offset);
if (total >= dictionary_size_limit) break;
}
- return total;
+
+ return output;
}
diff --git a/research/deorummolae.h b/research/deorummolae.h
index f37015c..7f24add 100644
--- a/research/deorummolae.h
+++ b/research/deorummolae.h
@@ -4,6 +4,9 @@
#include <stddef.h>
#include <stdint.h>
+#include <string>
+#include <vector>
+
/* log2(maximal number of files). Value 6 provides some speedups. */
#define LOG_MAX_FILES 6
@@ -13,15 +16,12 @@
/**
* Generate a dictionary for given samples.
*
- * @param dictionary storage for generated dictionary
* @param dictionary_size_limit maximal dictionary size
- * @param num_samples number of samples
- * @param sample_sizes array with sample sizes
+ * @param sample_sizes vector with sample sizes
* @param sample_data concatenated samples
- * @return generated dictionary size
+ * @return generated dictionary
*/
-size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
- size_t num_samples, const size_t* sample_sizes,
- const uint8_t* sample_data);
+std::string DM_generate(size_t dictionary_size_limit,
+ const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
#endif // BROTLI_RESEARCH_DEORUMMOLAE_H_
diff --git a/research/dictionary_generator.cc b/research/dictionary_generator.cc
new file mode 100755
index 0000000..b3ee89c
--- /dev/null
+++ b/research/dictionary_generator.cc
@@ -0,0 +1,153 @@
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <vector>
+
+#include "./deorummolae.h"
+#include "./sieve.h"
+
+#define METHOD_DM 0
+#define METHOD_SIEVE 1
+
+size_t readInt(const char* str) {
+ size_t result = 0;
+ if (str[0] == 0 || str[0] == '0') {
+ return 0;
+ }
+ for (size_t i = 0; i < 13; ++i) {
+ if (str[i] == 0) {
+ return result;
+ }
+ if (str[i] == 'k' || str[i] == 'K') {
+ if ((str[i + 1] == 0) && ((result << 10) > result)) {
+ return result << 10;
+ }
+ return 0;
+ }
+ if (str[i] == 'm' || str[i] == 'M') {
+ if ((str[i + 1] == 0) && ((result << 20) > result)) {
+ return result << 20;
+ }
+ return 0;
+ }
+ if (str[i] < '0' || str[i] > '9') {
+ return 0;
+ }
+ size_t next = (10 * result) + (str[i] - '0');
+ if (next <= result) {
+ return 0;
+ }
+ result = next;
+ }
+ return 0;
+}
+
+static std::string readFile(const std::string& path) {
+ std::ifstream file(path);
+ std::string content(
+ (std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+ return content;
+}
+
+static void writeFile(const char* file, const std::string& content) {
+ std::ofstream outfile(file, std::ofstream::binary);
+ outfile.write(content.c_str(), content.size());
+ outfile.close();
+}
+
+/* Returns "base file name" or its tail, if it contains '/' or '\'. */
+static const char* fileName(const char* path) {
+ const char* separator_position = strrchr(path, '/');
+ if (separator_position) path = separator_position + 1;
+ separator_position = strrchr(path, '\\');
+ if (separator_position) path = separator_position + 1;
+ return path;
+}
+
+static void printHelp(const char* name) {
+ fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
+ fprintf(stderr,
+ "Options:\n"
+ " --dm use 'deorummolae' engine\n"
+ " --sieve use 'sieve' engine (default)\n"
+ " -t# set target dictionary size (limit); default: 16K\n"
+ " -s# set slize length for 'sieve'; default: 33\n"
+ "# is a decimal number with optional k/K/m/M suffix.\n\n");
+}
+
+int main(int argc, char const* argv[]) {
+ int dictionaryArg = -1;
+ int method = METHOD_SIEVE;
+ int sieveSliceLen = 33;
+ int targetSize = 16 << 10;
+
+ std::vector<uint8_t> data;
+ std::vector<size_t> sizes;
+ size_t total = 0;
+ for (int i = 1; i < argc; ++i) {
+ if (argv[i] == nullptr) {
+ continue;
+ }
+ if (argv[i][0] == '-') {
+ if (argv[i][1] == '-') {
+ if (std::strcmp("--sieve", argv[i]) == 0) {
+ method = METHOD_SIEVE;
+ continue;
+ }
+ if (std::strcmp("--dm", argv[i]) == 0) {
+ method = METHOD_DM;
+ continue;
+ }
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+ exit(1);
+ }
+ if (argv[i][1] == 's') {
+ sieveSliceLen = readInt(&argv[i][2]);
+ if (sieveSliceLen < 4 || sieveSliceLen > 256) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+ exit(1);
+ }
+ } else if (argv[i][1] == 't') {
+ targetSize = readInt(&argv[i][2]);
+ if (targetSize < 256 || targetSize > (1 << 25)) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Invalid option '%s'\n", argv[i]);
+ exit(1);
+ }
+ } else {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
+ exit(1);
+ }
+ continue;
+ }
+ if (dictionaryArg == -1) {
+ dictionaryArg = i;
+ continue;
+ }
+ std::string content = readFile(argv[i]);
+ data.insert(data.end(), content.begin(), content.end());
+ total += content.size();
+ sizes.push_back(content.size());
+ }
+ if (dictionaryArg == -1 || total == 0) {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Not enough arguments\n");
+ exit(1);
+ }
+
+ if (method == METHOD_SIEVE) {
+ writeFile(argv[dictionaryArg],
+ sieve_generate(targetSize, sieveSliceLen, sizes, data.data()));
+ } else if (method == METHOD_DM) {
+ writeFile(argv[dictionaryArg],
+ DM_generate(targetSize, sizes, data.data()));
+ } else {
+ printHelp(fileName(argv[0]));
+ fprintf(stderr, "Unknown generator\n");
+ exit(1);
+ }
+ return 0;
+}
diff --git a/research/sieve.cc b/research/sieve.cc
new file mode 100755
index 0000000..fbc1dbf
--- /dev/null
+++ b/research/sieve.cc
@@ -0,0 +1,211 @@
+#include "./sieve.h"
+
+typedef struct Slot {
+ uint32_t next;
+ uint32_t offset;
+ uint16_t presence;
+ uint16_t mark;
+} Slot;
+
+static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end,
+ size_t middle, uint16_t minPresence, uint16_t iteration) {
+ int from = -2;
+ int to = -1;
+ size_t result = 0;
+ uint16_t targetPresence = minPresence;
+ for (uint32_t i = 0; i < end; ++i) {
+ if (i == middle) {
+ targetPresence++;
+ }
+ Slot& item = map[shortcut[i]];
+ if (item.mark != iteration) {
+ item.mark = iteration;
+ if (item.presence >= targetPresence) {
+ if (to < i) {
+ if (from > 0) {
+ result += to - from;
+ }
+ from = i;
+ }
+ to = i + sliceLen;
+ }
+ }
+ }
+ if (from > 0) {
+ result += to - from;
+ }
+ return result;
+}
+
+static std::string createDictionary(const uint8_t* data, size_t sliceLen,
+ Slot* map, uint32_t* shortcut, size_t end, size_t middle,
+ uint16_t minPresence, uint16_t iteration) {
+ std::string output;
+ int from = -2;
+ int to = -1;
+ uint16_t targetPresence = minPresence;
+ for (uint32_t i = 0; i < end; ++i) {
+ if (i == middle) {
+ targetPresence++;
+ }
+ Slot& item = map[shortcut[i]];
+ if (item.mark != iteration) {
+ item.mark = iteration;
+ if (item.presence >= targetPresence) {
+ if (to < i) {
+ if (from > 0) {
+ output.insert(output.end(), &data[from], &data[to]);
+ }
+ from = i;
+ }
+ to = i + sliceLen;
+ }
+ }
+ }
+ if (from > 0) {
+ output.insert(output.end(), &data[from], &data[to]);
+ }
+ return output;
+}
+
+std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
+ const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
+ /* Parameters aliasing */
+ size_t targetSize = dictionary_size_limit;
+ size_t sliceLen = slice_len;
+ const uint8_t* data = sample_data;
+
+ size_t total = 0;
+ std::vector<size_t> offsets;
+ for (size_t i = 0; i < sample_sizes.size(); ++i) {
+ total += sample_sizes[i];
+ offsets.push_back(total);
+ }
+
+ /*****************************************************************************
+ * Build coverage map.
+ ****************************************************************************/
+ std::vector<Slot> map;
+ std::vector<uint32_t> shortcut;
+ map.push_back({0, 0, 0, 0});
+ size_t end = total - sliceLen;
+ int hashLen = 8;
+ while ((1 << hashLen) < end) {
+ hashLen += 3;
+ }
+ hashLen -= 3;
+ uint32_t hashMask = (1u << hashLen) - 1u;
+ std::vector<uint32_t> hashHead(1 << hashLen);
+ uint32_t hashSlot = 1;
+ uint16_t piece = 0;
+ uint32_t hash = 0;
+ int lShift = 3;
+ int rShift = hashLen - lShift;
+ for (int i = 0; i < sliceLen - 1; ++i) {
+ uint32_t v = data[i];
+ hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
+ }
+ int lShiftX = (lShift * (sliceLen - 1)) % hashLen;
+ int rShiftX = hashLen - lShiftX;
+ for (uint32_t i = 0; i < end; ++i) {
+ uint32_t v = data[i + sliceLen - 1];
+ hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
+
+ if (offsets[piece] == i) {
+ piece++;
+ }
+ uint32_t slot = hashHead[hash];
+ while (slot != 0) {
+ Slot& item = map[slot];
+ int start = item.offset;
+ bool miss = false;
+ for (size_t j = 0; j < sliceLen; ++j) {
+ if (data[i + j] != data[start + j]) {
+ miss = true;
+ break;
+ }
+ }
+ if (!miss) {
+ if (item.mark != piece) {
+ item.mark = piece;
+ item.presence++;
+ }
+ shortcut.push_back(slot);
+ break;
+ }
+ slot = item.next;
+ }
+ if (slot == 0) {
+ map.push_back({hashHead[hash], i, 1, piece});
+ hashHead[hash] = hashSlot;
+ shortcut.push_back(hashSlot);
+ hashSlot++;
+ }
+ v = data[i];
+ hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask;
+ }
+
+ /*****************************************************************************
+ * Build dictionary of specified size.
+ ****************************************************************************/
+ size_t a = 1;
+ size_t size = dryRun(
+ sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
+ /* Maximal output is smaller than target. */
+ if (size <= targetSize) {
+ return createDictionary(
+ data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece);
+ }
+
+ size_t b = offsets.size();
+ size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
+ if (size == targetSize) {
+ return createDictionary(
+ data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
+ }
+ /* Run binary search. */
+ if (size < targetSize) {
+ /* size(a) > targetSize > size(b) && a < m < b */
+ while (a + 1 < b) {
+ size_t m = (a + b) / 2;
+ size = dryRun(
+ sliceLen, map.data(), shortcut.data(), end, end, m, ++piece);
+ if (size < targetSize) {
+ b = m;
+ } else if (size > targetSize) {
+ a = m;
+ } else {
+ return createDictionary(
+ data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece);
+ }
+ }
+ } else {
+ a = b;
+ }
+ /* size(minPresence) > targetSize > size(minPresence + 1) */
+ size_t minPresence = a;
+ a = 0;
+ b = end;
+ /* size(a) < targetSize < size(b) && a < m < b */
+ while (a + 1 < b) {
+ size_t m = (a + b) / 2;
+ size = dryRun(
+ sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece);
+ if (size < targetSize) {
+ a = m;
+ } else if (size > targetSize) {
+ b = m;
+ } else {
+ return createDictionary(data, sliceLen, map.data(), shortcut.data(), end,
+ m, minPresence, ++piece);
+ }
+ }
+
+ bool unrestricted = false;
+ if (minPresence <= 2 && !unrestricted) {
+ minPresence = 2;
+ a = end;
+ }
+ return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, a,
+ minPresence, ++piece);
+}
diff --git a/research/sieve.h b/research/sieve.h
new file mode 100755
index 0000000..f529835
--- /dev/null
+++ b/research/sieve.h
@@ -0,0 +1,21 @@
+#ifndef BROTLI_RESEARCH_SIEVE_H_
+#define BROTLI_RESEARCH_SIEVE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+/**
+ * Generate a dictionary for given samples.
+ *
+ * @param dictionary_size_limit maximal dictionary size
+ * @param sample_sizes vector with sample sizes
+ * @param sample_data concatenated samples
+ * @return generated dictionary
+ */
+std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len,
+ const std::vector<size_t>& sample_sizes, const uint8_t* sample_data);
+
+#endif // BROTLI_RESEARCH_SIEVE_H_