From 35e69fc7cf9421ab04ffc9d52cb36d07fa12984a Mon Sep 17 00:00:00 2001 From: Eugene Kliuchnikov Date: Mon, 26 Feb 2018 09:04:36 -0500 Subject: New feature: "Large Window Brotli" (#640) * New feature: "Large Window Brotli" By setting special encoder/decoder flag it is now possible to extend LZ-window up to 30 bits; though produced stream will not be RFC7932 compliant. Added new dictionary generator - "DSH". It combines speed of "Sieve" and quality of "DM". Plus utilities to prepare train corpora (remove unique strings). Improved compression ratio: now two sub-blocks could be stitched: the last copy command could be extended to span the next sub-block. Fixed compression ineffectiveness caused by floating numbers rounding and wrong cost heuristic. Other C changes: - combined / moved `context.h` to `common` - moved transforms to `common` - unified some aspects of code formatting - added an abstraction for encoder (static) dictionary - moved default allocator/deallocator functions to `common` brotli CLI: - window size is auto-adjusted if not specified explicitly Java: - added "eager" decoding both to JNI wrapper and pure decoder - huge speed-up of `DictionaryData` initialization * Add dictionaryless compressed dictionary * Fix `sources.lst` * Fix `sources.lst` and add a note that `libtool` is also required. * Update setup.py * Fix `EagerStreamTest` * Fix BUILD file * Add missing `libdivsufsort` dependency * Fix "unused parameter" warning. --- research/BUILD | 8 + research/BUILD.libdivsufsort | 55 +++ research/deorummolae.cc | 173 ++++++---- research/deorummolae.h | 9 +- research/dictionary_generator.cc | 119 +++++-- research/draw_diff.cc | 21 +- research/durchschlag.cc | 714 +++++++++++++++++++++++++++++++++++++++ research/durchschlag.h | 99 ++++++ research/libdivsufsort | 1 + research/sieve.cc | 174 ++++++---- research/sieve.h | 5 +- 11 files changed, 1207 insertions(+), 171 deletions(-) create mode 100644 research/BUILD.libdivsufsort create mode 100755 research/durchschlag.cc create mode 100755 research/durchschlag.h create mode 160000 research/libdivsufsort (limited to 'research') diff --git a/research/BUILD b/research/BUILD index 6ff5ac2..211b3e7 100755 --- a/research/BUILD +++ b/research/BUILD @@ -14,6 +14,13 @@ cc_library( ) cc_library( + name = "durchschlag", + srcs = ["durchschlag.cc"], + hdrs = ["durchschlag.h"], + deps = ["@divsufsort//:libdivsufsort"], +) + +cc_library( name = "sieve", srcs = ["sieve.cc"], hdrs = ["sieve.h"], @@ -24,6 +31,7 @@ cc_binary( srcs = ["dictionary_generator.cc"], deps = [ ":dm", + ":durchschlag", ":sieve", ], ) diff --git a/research/BUILD.libdivsufsort b/research/BUILD.libdivsufsort new file mode 100644 index 0000000..ce60e9c --- /dev/null +++ b/research/BUILD.libdivsufsort @@ -0,0 +1,55 @@ +package( + default_visibility = ["//visibility:public"], +) + +cc_library( + name = "libdivsufsort", + srcs = [ + "lib/divsufsort.c", + "lib/sssort.c", + "lib/trsort.c", + "lib/utils.c", + ], + hdrs = [ + "include/config.h", + "include/divsufsort.h", + "include/divsufsort_private.h", + ], + copts = [ + "-DHAVE_CONFIG_H=1", + ], + includes = ["include"], +) + +commom_awk_replaces = ( + "gsub(/#cmakedefine/, \"#define\"); " + + "gsub(/@DIVSUFSORT_EXPORT@/, \"\"); " + + "gsub(/@DIVSUFSORT_IMPORT@/, \"\"); " + + "gsub(/@INLINE@/, \"inline\"); " + + "gsub(/@INCFILE@/, \"#include \"); " + + "gsub(/@SAUCHAR_TYPE@/, \"uint8_t\"); " + + "gsub(/@SAINT32_TYPE@/, \"int32_t\"); " + + "gsub(/@SAINT_PRId@/, \"PRId32\"); " +) + +genrule( + name = "config_h", + srcs = ["include/config.h.cmake"], + outs = ["include/config.h"], + cmd = ("awk '{ " + + "gsub(/@HAVE_IO_H 1@/, \"HAVE_IO_H 0\"); " + + commom_awk_replaces + + "print; }' $(<) > $(@)"), +) + +genrule( + name = "divsufsort_h", + srcs = ["include/divsufsort.h.cmake"], + outs = ["include/divsufsort.h"], + cmd = ("awk '{ " + + "gsub(/@W64BIT@/, \"\"); " + + "gsub(/@SAINDEX_TYPE@/, \"int32_t\"); " + + "gsub(/@SAINDEX_PRId@/, \"PRId32\"); " + + commom_awk_replaces + + "print; }' $(<) > $(@)"), +) diff --git a/research/deorummolae.cc b/research/deorummolae.cc index c53a53c..d15b7ee 100644 --- a/research/deorummolae.cc +++ b/research/deorummolae.cc @@ -15,20 +15,31 @@ /* Non tunable definitions. */ #define CHUNK_MASK (CHUNK_SIZE - 1) -#define COVERAGE_SIZE (1 << (LOG_MAX_FILES - 6)) +#define COVERAGE_SIZE (1 << (DM_LOG_MAX_FILES - 6)) /* File coverage: every bit set to 1 denotes a file covered by an isle. */ typedef std::array Coverage; -static int popcount(uint64_t u) { return __builtin_popcountll(u); } +/* Symbol of text alphabet. */ +typedef int32_t TextChar; + +/* Pointer to position in text. */ +typedef uint32_t TextIdx; + +/* SAIS sarray_type; unfortunately, must be a signed type. */ +typedef int32_t TextSaIdx; + +static size_t popcount(uint64_t u) { + return static_cast(__builtin_popcountll(u)); +} /* Condense terminators and pad file entries. */ -static void rewriteText(std::vector* text) { - int terminator = text->back(); - int prev = terminator; - size_t to = 0; - for (size_t from = 0; from < text->size(); ++from) { - int next = text->at(from); +static void rewriteText(std::vector* text) { + TextChar terminator = text->back(); + TextChar prev = terminator; + TextIdx to = 0; + for (TextIdx from = 0; from < text->size(); ++from) { + TextChar next = text->at(from); if (next < 256 || prev < 256) { text->at(to++) = next; if (next >= 256) terminator = next; @@ -41,11 +52,12 @@ static void rewriteText(std::vector* text) { } /* Reenumerate terminators for smaller alphabet. */ -static void remapTerminators(std::vector* text, int* next_terminator) { - int prev = -1; - int x = 256; - for (size_t i = 0; i < text->size(); ++i) { - int next = text->at(i); +static void remapTerminators(std::vector* text, + TextChar* next_terminator) { + TextChar prev = -1; + TextChar x = 256; + for (TextIdx i = 0; i < text->size(); ++i) { + TextChar next = text->at(i); if (next < 256) { // Char. // Do nothing. } else if (prev < 256) { // Terminator after char. @@ -60,15 +72,15 @@ static void remapTerminators(std::vector* text, int* next_terminator) { } /* Combine all file entries; create mapping position->file. */ -static void buildFullText(std::vector>* data, - std::vector* full_text, std::vector* file_map, - std::vector* file_offset, int* next_terminator) { +static void buildFullText(std::vector>* data, + std::vector* full_text, std::vector* file_map, + std::vector* file_offset, TextChar* next_terminator) { file_map->resize(0); file_offset->resize(0); full_text->resize(0); - for (size_t i = 0; i < data->size(); ++i) { + for (TextIdx i = 0; i < data->size(); ++i) { file_offset->push_back(full_text->size()); - std::vector& file = data->at(i); + std::vector& file = data->at(i); rewriteText(&file); full_text->insert(full_text->end(), file.begin(), file.end()); file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i); @@ -78,18 +90,19 @@ static void buildFullText(std::vector>* data, /* Build longest-common-prefix based on suffix array and text. TODO: borrowed -> unknown efficiency. */ -static void buildLcp(std::vector* text, std::vector* sa, - std::vector* lcp, std::vector* invese_sa) { - int size = static_cast(text->size()); +static void buildLcp(std::vector* text, std::vector* sa, + std::vector* lcp, std::vector* invese_sa) { + TextIdx size = static_cast(text->size()); lcp->resize(size); - int k = 0; + TextIdx k = 0; lcp->at(size - 1) = 0; - for (int i = 0; i < size; ++i) { + for (TextIdx i = 0; i < size; ++i) { if (invese_sa->at(i) == size - 1) { k = 0; continue; } - int j = sa->at(invese_sa->at(i) + 1); // Suffix which follow i-th suffix. + // Suffix which follow i-th suffix. + TextIdx j = sa->at(invese_sa->at(i) + 1); while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) { ++k; } @@ -102,21 +115,21 @@ static void buildLcp(std::vector* text, std::vector* sa, When we raise the LCP requirement, the isle sunks and smaller isles appear instead. */ typedef struct { - int lcp; - int l; - int r; + TextIdx lcp; + TextIdx l; + TextIdx r; Coverage coverage; } Isle; /* Helper routine for `cutMatch`. */ -static void poisonData(int pos, int length, std::vector>* data, - std::vector* file_map, std::vector* file_offset, - int* next_terminator) { - size_t f = file_map->at(pos / CHUNK_SIZE); +static void poisonData(TextIdx pos, TextIdx length, + std::vector>* data, std::vector* file_map, + std::vector* file_offset, TextChar* next_terminator) { + TextIdx f = file_map->at(pos / CHUNK_SIZE); pos -= file_offset->at(f); - std::vector& file = data->at(f); - int l = (length == CUT_MATCH) ? CUT_MATCH : 1; - for (int j = 0; j < l; j++, pos++) { + std::vector& file = data->at(f); + TextIdx l = (length == CUT_MATCH) ? CUT_MATCH : 1; + for (TextIdx j = 0; j < l; j++, pos++) { if (file[pos] >= 256) continue; if (file[pos + 1] >= 256) { file[pos] = file[pos + 1]; @@ -131,12 +144,12 @@ static void poisonData(int pos, int length, std::vector>* data, /* Remove substrings of a given match from files. Substrings are replaced with unique terminators, so next iteration SA would not allow to cross removed areas. */ -static void cutMatch(std::vector>* data, int index, int length, - std::vector* sa, std::vector* lcp, std::vector* invese_sa, - int* next_terminator, std::vector* file_map, - std::vector* file_offset) { +static void cutMatch(std::vector>* data, TextIdx index, + TextIdx length, std::vector* sa, std::vector* lcp, + std::vector* invese_sa, TextChar* next_terminator, + std::vector* file_map, std::vector* file_offset) { while (length >= CUT_MATCH) { - int i = index; + TextIdx i = index; while (lcp->at(i) >= length) { i++; poisonData( @@ -156,54 +169,70 @@ static void cutMatch(std::vector>* data, int index, int length, std::string DM_generate(size_t dictionary_size_limit, const std::vector& sample_sizes, const uint8_t* sample_data) { { - uint64_t tmp = 0; - if (popcount(tmp - 1u) != 64) { - fprintf(stderr, "64-bit platform is required\n"); - return 0; + TextIdx tmp = static_cast(dictionary_size_limit); + if ((tmp != dictionary_size_limit) || (tmp > 1u << 30)) { + fprintf(stderr, "dictionary_size_limit is too large\n"); + return ""; } } /* Could use 256 + '0' for easier debugging. */ - int next_terminator = 256; + TextChar next_terminator = 256; std::string output; - std::vector> data; + std::vector> data; - size_t offset = 0; + TextIdx offset = 0; size_t num_samples = sample_sizes.size(); - if (num_samples > MAX_FILES) num_samples = MAX_FILES; + if (num_samples > DM_MAX_FILES) num_samples = DM_MAX_FILES; for (size_t n = 0; n < num_samples; ++n) { - size_t next_offset = offset + sample_sizes[n]; + TextIdx delta = static_cast(sample_sizes[n]); + if (delta != sample_sizes[n]) { + fprintf(stderr, "sample is too large\n"); + return ""; + } + if (delta == 0) { + fprintf(stderr, "0-length samples are prohibited\n"); + return ""; + } + TextIdx next_offset = offset + delta; + if (next_offset <= offset) { + fprintf(stderr, "corpus is too large\n"); + return ""; + } data.push_back( - std::vector(sample_data + offset, sample_data + next_offset)); + std::vector(sample_data + offset, sample_data + next_offset)); offset = next_offset; data.back().push_back(next_terminator++); } /* Most arrays are allocated once, and then just resized to smaller and smaller sizes. */ - std::vector full_text; - std::vector file_map; - std::vector file_offset; - std::vector sa; - std::vector invese_sa; - std::vector lcp; + std::vector full_text; + std::vector file_map; + std::vector file_offset; + std::vector sa; + std::vector invese_sa; + std::vector lcp; std::vector isles; std::vector output_data; - size_t total = 0; - size_t total_cost = 0; - size_t best_cost; + TextIdx total = 0; + TextIdx total_cost = 0; + TextIdx best_cost; Isle best_isle; - int min_count = num_samples; + size_t min_count = num_samples; while (true) { - size_t max_match = dictionary_size_limit - total; + TextIdx max_match = static_cast(dictionary_size_limit) - total; buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator); sa.resize(full_text.size()); - saisxx(full_text.data(), sa.data(), static_cast(full_text.size()), - next_terminator); + /* Hopefully, non-negative TextSaIdx is the same sa TextIdx counterpart. */ + saisxx(full_text.data(), reinterpret_cast(sa.data()), + static_cast(full_text.size()), next_terminator); invese_sa.resize(full_text.size()); - for (int i = 0; i < full_text.size(); ++i) invese_sa[sa[i]] = i; + for (TextIdx i = 0; i < full_text.size(); ++i) { + invese_sa[sa[i]] = i; + } buildLcp(&full_text, &sa, &lcp, &invese_sa); /* Do not rebuild SA/LCP, just use different selection. */ @@ -213,22 +242,22 @@ std::string DM_generate(size_t dictionary_size_limit, isles.resize(0); isles.push_back(best_isle); - for (int i = 0; i < static_cast(lcp.size()); ++i) { - int l = i; + for (TextIdx i = 0; i < lcp.size(); ++i) { + TextIdx l = i; Coverage cov = {{0}}; - int f = file_map[sa[i] / CHUNK_SIZE]; - cov[f >> 6] = ((uint64_t)1) << (f & 63); + size_t f = file_map[sa[i] / CHUNK_SIZE]; + cov[f >> 6] = (static_cast(1)) << (f & 63); while (lcp[i] < isles.back().lcp) { Isle& top = isles.back(); top.r = i; l = top.l; for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x]; - int count = 0; + size_t count = 0; for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]); - int effective_lcp = top.lcp; + TextIdx effective_lcp = top.lcp; /* Restrict (last) dictionary entry length. */ if (effective_lcp > max_match) effective_lcp = max_match; - int cost = count * effective_lcp; + TextIdx cost = count * effective_lcp; if (cost > best_cost && count >= min_count && effective_lcp >= MIN_MATCH) { best_cost = cost; @@ -251,14 +280,14 @@ std::string DM_generate(size_t dictionary_size_limit, if (best_cost == 0 || best_isle.lcp < MIN_MATCH) { if (min_count >= 8) { min_count = (min_count * 7) / 8; - fprintf(stderr, "Retry: min_count=%d\n", min_count); + fprintf(stderr, "Retry: min_count=%zu\n", min_count); goto retry; } break; } /* Save the entry. */ - fprintf(stderr, "Savings: %zu+%zu, dictionary: %zu+%d\n", + fprintf(stderr, "Savings: %d+%d, dictionary: %d+%d\n", total_cost, best_cost, total, best_isle.lcp); int* piece = &full_text[sa[best_isle.l]]; output.insert(output.end(), piece, piece + best_isle.lcp); diff --git a/research/deorummolae.h b/research/deorummolae.h index 7f24add..5815097 100644 --- a/research/deorummolae.h +++ b/research/deorummolae.h @@ -1,17 +1,16 @@ #ifndef BROTLI_RESEARCH_DEORUMMOLAE_H_ #define BROTLI_RESEARCH_DEORUMMOLAE_H_ -#include -#include - +#include +#include #include #include /* log2(maximal number of files). Value 6 provides some speedups. */ -#define LOG_MAX_FILES 6 +#define DM_LOG_MAX_FILES 6 /* Non tunable definitions. */ -#define MAX_FILES (1 << LOG_MAX_FILES) +#define DM_MAX_FILES (1 << DM_LOG_MAX_FILES) /** * Generate a dictionary for given samples. diff --git a/research/dictionary_generator.cc b/research/dictionary_generator.cc index b3ee89c..00cfaba 100755 --- a/research/dictionary_generator.cc +++ b/research/dictionary_generator.cc @@ -1,15 +1,20 @@ +#include #include #include #include #include #include "./deorummolae.h" +#include "./durchschlag.h" #include "./sieve.h" #define METHOD_DM 0 #define METHOD_SIEVE 1 +#define METHOD_DURCHSCHLAG 2 +#define METHOD_DISTILL 3 +#define METHOD_PURIFY 4 -size_t readInt(const char* str) { +static size_t readInt(const char* str) { size_t result = 0; if (str[0] == 0 || str[0] == '0') { return 0; @@ -51,10 +56,25 @@ static std::string readFile(const std::string& path) { static void writeFile(const char* file, const std::string& content) { std::ofstream outfile(file, std::ofstream::binary); - outfile.write(content.c_str(), content.size()); + outfile.write(content.c_str(), static_cast(content.size())); outfile.close(); } +static void writeSamples(char const* argv[], const std::vector& pathArgs, + const std::vector& sizes, const uint8_t* data) { + size_t offset = 0; + for (size_t i = 0; i < pathArgs.size(); ++i) { + int j = pathArgs[i]; + const char* file = argv[j]; + size_t sampleSize = sizes[i]; + std::ofstream outfile(file, std::ofstream::binary); + outfile.write(reinterpret_cast(data + offset), + static_cast(sampleSize)); + outfile.close(); + offset += sampleSize; + } +} + /* Returns "base file name" or its tail, if it contains '/' or '\'. */ static const char* fileName(const char* path) { const char* separator_position = strrchr(path, '/'); @@ -68,21 +88,32 @@ static void printHelp(const char* name) { fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name); fprintf(stderr, "Options:\n" - " --dm use 'deorummolae' engine\n" - " --sieve use 'sieve' engine (default)\n" - " -t# set target dictionary size (limit); default: 16K\n" - " -s# set slize length for 'sieve'; default: 33\n" - "# is a decimal number with optional k/K/m/M suffix.\n\n"); + " --dm use 'deorummolae' engine\n" + " --distill rewrite samples; unique text parts are removed\n" + " --dsh use 'durchschlag' engine (default)\n" + " --purify rewrite samples; unique text parts are zeroed out\n" + " --sieve use 'sieve' engine\n" + " -b# set block length for 'durchschlag'; default: 1024\n" + " -s# set slice length for 'distill', 'durchschlag', 'purify'\n" + " and 'sieve'; default: 16\n" + " -t# set target dictionary size (limit); default: 16K\n" + " -u# set minimum slice population (for rewrites); default: 2\n" + "# is a decimal number with optional k/K/m/M suffix.\n" + "WARNING: 'distill' and 'purify' will overwrite original samples!\n" + " Completely unique samples might become empty files.\n\n"); } int main(int argc, char const* argv[]) { int dictionaryArg = -1; - int method = METHOD_SIEVE; - int sieveSliceLen = 33; - int targetSize = 16 << 10; + int method = METHOD_DURCHSCHLAG; + size_t sliceLen = 16; + size_t targetSize = 16 << 10; + size_t blockSize = 1024; + size_t minimumPopulation = 2; std::vector data; std::vector sizes; + std::vector pathArgs; size_t total = 0; for (int i = 1; i < argc; ++i) { if (argv[i] == nullptr) { @@ -90,6 +121,12 @@ int main(int argc, char const* argv[]) { } if (argv[i][0] == '-') { if (argv[i][1] == '-') { + if (dictionaryArg != -1) { + fprintf(stderr, + "Method should be specified before dictionary / sample '%s'\n", + argv[i]); + exit(1); + } if (std::strcmp("--sieve", argv[i]) == 0) { method = METHOD_SIEVE; continue; @@ -98,13 +135,32 @@ int main(int argc, char const* argv[]) { method = METHOD_DM; continue; } + if (std::strcmp("--dsh", argv[i]) == 0) { + method = METHOD_DURCHSCHLAG; + continue; + } + if (std::strcmp("--distill", argv[i]) == 0) { + method = METHOD_DISTILL; + continue; + } + if (std::strcmp("--purify", argv[i]) == 0) { + method = METHOD_PURIFY; + continue; + } printHelp(fileName(argv[0])); fprintf(stderr, "Invalid option '%s'\n", argv[i]); exit(1); } - if (argv[i][1] == 's') { - sieveSliceLen = readInt(&argv[i][2]); - if (sieveSliceLen < 4 || sieveSliceLen > 256) { + if (argv[i][1] == 'b') { + blockSize = readInt(&argv[i][2]); + if (blockSize < 16 || blockSize > 65536) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } + } else if (argv[i][1] == 's') { + sliceLen = readInt(&argv[i][2]); + if (sliceLen < 4 || sliceLen > 256) { printHelp(fileName(argv[0])); fprintf(stderr, "Invalid option '%s'\n", argv[i]); exit(1); @@ -116,6 +172,13 @@ int main(int argc, char const* argv[]) { fprintf(stderr, "Invalid option '%s'\n", argv[i]); exit(1); } + } else if (argv[i][1] == 'u') { + minimumPopulation = readInt(&argv[i][2]); + if (minimumPopulation < 256 || minimumPopulation > 65536) { + printHelp(fileName(argv[0])); + fprintf(stderr, "Invalid option '%s'\n", argv[i]); + exit(1); + } } else { printHelp(fileName(argv[0])); fprintf(stderr, "Unrecognized option '%s'\n", argv[i]); @@ -124,26 +187,42 @@ int main(int argc, char const* argv[]) { continue; } if (dictionaryArg == -1) { - dictionaryArg = i; - continue; + if (method != METHOD_DISTILL && method != METHOD_PURIFY) { + dictionaryArg = i; + continue; + } } std::string content = readFile(argv[i]); data.insert(data.end(), content.begin(), content.end()); total += content.size(); + pathArgs.push_back(i); sizes.push_back(content.size()); } - if (dictionaryArg == -1 || total == 0) { + bool wantDictionary = (dictionaryArg == -1); + if (method == METHOD_DISTILL || method == METHOD_PURIFY) { + wantDictionary = false; + } + if (wantDictionary || total == 0) { printHelp(fileName(argv[0])); fprintf(stderr, "Not enough arguments\n"); exit(1); } if (method == METHOD_SIEVE) { - writeFile(argv[dictionaryArg], - sieve_generate(targetSize, sieveSliceLen, sizes, data.data())); + writeFile(argv[dictionaryArg], sieve_generate( + targetSize, sliceLen, sizes, data.data())); } else if (method == METHOD_DM) { - writeFile(argv[dictionaryArg], - DM_generate(targetSize, sizes, data.data())); + writeFile(argv[dictionaryArg], DM_generate( + targetSize, sizes, data.data())); + } else if (method == METHOD_DURCHSCHLAG) { + writeFile(argv[dictionaryArg], durchschlag_generate( + targetSize, sliceLen, blockSize, sizes, data.data())); + } else if (method == METHOD_DISTILL) { + durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data()); + writeSamples(argv, pathArgs, sizes, data.data()); + } else if (method == METHOD_PURIFY) { + durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data()); + writeSamples(argv, pathArgs, sizes, data.data()); } else { printHelp(fileName(argv[0])); fprintf(stderr, "Unknown generator\n"); diff --git a/research/draw_diff.cc b/research/draw_diff.cc index 01b6716..6541dac 100644 --- a/research/draw_diff.cc +++ b/research/draw_diff.cc @@ -20,18 +20,23 @@ #define CHECK(X) if (!(X)) exit(EXIT_FAILURE); #endif -void ReadPGM(FILE* f, uint8_t*** image, size_t* height, size_t* width) { +typedef uint8_t* ScanLine; +typedef ScanLine* Image; + +void ReadPGM(FILE* f, Image* image, size_t* height, size_t* width) { int colors; CHECK(fscanf(f, "P5\n%lu %lu\n%d\n", width, height, &colors) == 3); assert(colors == 255); - *image = new uint8_t*[*height]; + ScanLine* lines = new ScanLine[*height]; + *image = lines; for (int i = *height - 1; i >= 0; --i) { - (*image)[i] = new uint8_t[*width]; - CHECK(fread((*image)[i], 1, *width, f) == *width); + ScanLine line = new uint8_t[*width]; + lines[i] = line; + CHECK(fread(line, 1, *width, f) == *width); } } -void CalculateDiff(int** diff, uint8_t** image1, uint8_t** image2, +void CalculateDiff(int** diff, Image image1, Image image2, size_t height, size_t width) { for (size_t i = 0; i < height; ++i) { for (size_t j = 0; j < width; ++j) { @@ -40,7 +45,7 @@ void CalculateDiff(int** diff, uint8_t** image1, uint8_t** image2, } } -void DrawDiff(int** diff, uint8_t** image1, uint8_t** image2, +void DrawDiff(int** diff, Image image1, Image image2, size_t height, size_t width, FILE* f) { int max = -1234; int min = +1234; @@ -78,13 +83,13 @@ void DrawDiff(int** diff, uint8_t** image1, uint8_t** image2, delete[] row; } -int main(int argc, char* argv[]) { +int main(int argc, char** argv) { if (argc != 4) { printf("usage: %s pgm1 pgm2 diff_ppm_path\n", argv[0]); return 1; } - uint8_t **image1, **image2; + Image image1, image2; size_t h1, w1, h2, w2; FILE* fimage1 = fopen(argv[1], "rb"); diff --git a/research/durchschlag.cc b/research/durchschlag.cc new file mode 100755 index 0000000..cc4ed68 --- /dev/null +++ b/research/durchschlag.cc @@ -0,0 +1,714 @@ +#include "./durchschlag.h" + +#include +#include /* terminate */ + +#include "divsufsort.h" + +/* Pointer to position in text. */ +typedef DurchschlagTextIdx TextIdx; + +/* (Sum of) value(s) of slice(s). */ +typedef uint32_t Score; + +typedef struct HashSlot { + TextIdx next; + TextIdx offset; +} HashSlot; + +typedef struct MetaSlot { + TextIdx mark; + Score score; +} MetaSlot; + +typedef struct Range { + TextIdx start; + TextIdx end; +} Range; + +typedef struct Candidate { + Score score; + TextIdx position; +} Candidate; + +struct greaterScore { + bool operator()(const Candidate& a, const Candidate& b) const { + return (a.score > b.score) || + ((a.score == b.score) && (a.position < b.position)); + } +}; + +struct lessScore { + bool operator()(const Candidate& a, const Candidate& b) const { + return (a.score < b.score) || + ((a.score == b.score) && (a.position > b.position)); + } +}; + +#define CANDIDATE_BUNDLE_SIZE (1 << 18) + +static void fatal(const char* error) { + fprintf(stderr, "%s\n", error); + std::terminate(); +} + +static TextIdx calculateDictionarySize(const std::vector& ranges) { + TextIdx result = 0; + for (size_t i = 0; i < ranges.size(); ++i) { + const Range& r = ranges[i]; + result += r.end - r.start; + } + return result; +} + +static std::string createDictionary( + const uint8_t* data, const std::vector& ranges, size_t limit) { + std::string output; + output.reserve(calculateDictionarySize(ranges)); + for (size_t i = 0; i < ranges.size(); ++i) { + const Range& r = ranges[i]; + output.insert(output.end(), &data[r.start], &data[r.end]); + } + if (output.size() > limit) { + output.resize(limit); + } + return output; +} + +static Score buildCandidatesList(std::vector* candidates, + std::vector* map, TextIdx span, const TextIdx* shortcut, + TextIdx end) { + candidates->resize(0); + + size_t n = map->size(); + MetaSlot* slots = map->data(); + for (size_t j = 0; j < n; ++j) { + slots[j].mark = 0; + } + + Score score = 0; + for (size_t j = 0; j < span; ++j) { + MetaSlot& item = slots[shortcut[j]]; + if (item.mark == 0) { + score += item.score; + } + item.mark++; + } + + TextIdx i = 0; + TextIdx limit = std::min(end, CANDIDATE_BUNDLE_SIZE); + Score maxScore = 0; + for (; i < limit; ++i) { + MetaSlot& pick = slots[shortcut[i + span]]; + if (pick.mark == 0) { + score += pick.score; + } + pick.mark++; + + if (score > maxScore) { + maxScore = score; + } + candidates->push_back({score, i}); + + MetaSlot& drop = slots[shortcut[i]]; + drop.mark--; + if (drop.mark == 0) { + score -= drop.score; + } + } + + std::make_heap(candidates->begin(), candidates->end(), greaterScore()); + Score minScore = candidates->at(0).score; + for (; i < end; ++i) { + MetaSlot& pick = slots[shortcut[i + span]]; + if (pick.mark == 0) { + score += pick.score; + } + pick.mark++; + + if (score > maxScore) { + maxScore = score; + } + if (score >= minScore) { + candidates->push_back({score, i}); + std::push_heap(candidates->begin(), candidates->end(), greaterScore()); + if (candidates->size() > CANDIDATE_BUNDLE_SIZE && maxScore != minScore) { + while (candidates->at(0).score == minScore) { + std::pop_heap(candidates->begin(), candidates->end(), greaterScore()); + candidates->pop_back(); + } + minScore = candidates->at(0).score; + } + } + + MetaSlot& drop = slots[shortcut[i]]; + drop.mark--; + if (drop.mark == 0) { + score -= drop.score; + } + } + + for (size_t j = 0; j < n; ++j) { + slots[j].mark = 0; + } + + std::make_heap(candidates->begin(), candidates->end(), lessScore()); + return minScore; +} + +static Score rebuildCandidatesList(std::vector* candidates, + std::vector* map, TextIdx span, const TextIdx* shortcut, + TextIdx end, TextIdx* next) { + size_t n = candidates->size(); + TextIdx* data = candidates->data(); + for (size_t i = 0; i < n; ++i) { + data[i] = 0; + } + + n = map->size(); + MetaSlot* slots = map->data(); + for (size_t i = 0; i < n; ++i) { + slots[i].mark = 0; + } + + Score score = 0; + for (TextIdx i = 0; i < span; ++i) { + MetaSlot& item = slots[shortcut[i]]; + if (item.mark == 0) { + score += item.score; + } + item.mark++; + } + + Score maxScore = 0; + for (TextIdx i = 0; i < end; ++i) { + MetaSlot& pick = slots[shortcut[i + span]]; + if (pick.mark == 0) { + score += pick.score; + } + pick.mark++; + + if (candidates->size() <= score) { + candidates->resize(score + 1); + } + if (score > maxScore) { + maxScore = score; + } + next[i] = candidates->at(score); + candidates->at(score) = i; + + MetaSlot& drop = slots[shortcut[i]]; + drop.mark--; + if (drop.mark == 0) { + score -= drop.score; + } + } + + for (size_t i = 0; i < n; ++i) { + slots[i].mark = 0; + } + + candidates->resize(maxScore + 1); + return maxScore; +} + +static void addRange(std::vector* ranges, TextIdx start, TextIdx end) { + for (auto it = ranges->begin(); it != ranges->end();) { + if (end < it->start) { + ranges->insert(it, {start, end}); + return; + } + if (it->end < start) { + it++; + continue; + } + // Combine with existing. + start = std::min(start, it->start); + end = std::max(end, it->end); + // Remove consumed vector and continue. + it = ranges->erase(it); + } + ranges->push_back({start, end}); +} + +std::string durchschlag_generate( + size_t dictionary_size_limit, size_t slice_len, size_t block_len, + const std::vector& sample_sizes, const uint8_t* sample_data) { + DurchschlagContext ctx = durchschlag_prepare( + slice_len, sample_sizes, sample_data); + return durchschlag_generate(DURCHSCHLAG_COLLABORATIVE, + dictionary_size_limit, block_len, ctx, sample_data); +} + +DurchschlagContext durchschlag_prepare(size_t slice_len, + const std::vector& sample_sizes, const uint8_t* sample_data) { + /* Parameters aliasing */ + TextIdx sliceLen = static_cast(slice_len); + if (sliceLen != slice_len) fatal("slice_len is too large"); + if (sliceLen < 1) fatal("slice_len is too small"); + const uint8_t* data = sample_data; + + TextIdx total = 0; + std::vector offsets; + offsets.reserve(sample_sizes.size()); + for (size_t i = 0; i < sample_sizes.size(); ++i) { + TextIdx delta = static_cast(sample_sizes[i]); + if (delta != sample_sizes[i]) fatal("sample is too large"); + if (delta == 0) fatal("0-length samples are prohibited"); + TextIdx next_total = total + delta; + if (next_total <= total) fatal("corpus is too large"); + total = next_total; + offsets.push_back(total); + } + + if (total < sliceLen) fatal("slice_len is larger than corpus size"); + TextIdx end = total - static_cast(sliceLen) + 1; + TextIdx hashLen = 11; + while (hashLen < 29 && ((1u << hashLen) < end)) { + hashLen += 3; + } + hashLen -= 3; + TextIdx hashMask = (1u << hashLen) - 1u; + std::vector hashHead(1 << hashLen); + TextIdx hash = 0; + TextIdx lShift = 3; + TextIdx rShift = hashLen - lShift; + for (TextIdx i = 0; i < sliceLen - 1; ++i) { + TextIdx v = data[i]; + hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; + } + TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen; + TextIdx rShiftX = hashLen - lShiftX; + + std::vector map; + map.push_back({0, 0}); + TextIdx hashSlot = 1; + std::vector sliceMap; + sliceMap.reserve(end); + for (TextIdx i = 0; i < end; ++i) { + TextIdx v = data[i + sliceLen - 1]; + TextIdx bucket = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; + v = data[i]; + hash = bucket ^ (((v << lShiftX) | (v >> rShiftX)) & hashMask); + TextIdx slot = hashHead[bucket]; + while (slot != 0) { + HashSlot& item = map[slot]; + TextIdx start = item.offset; + bool miss = false; + for (TextIdx j = 0; j < sliceLen; ++j) { + if (data[i + j] != data[start + j]) { + miss = true; + break; + } + } + if (!miss) { + sliceMap.push_back(slot); + break; + } + slot = item.next; + } + if (slot == 0) { + map.push_back({hashHead[bucket], i}); + hashHead[bucket] = hashSlot; + sliceMap.push_back(hashSlot); + hashSlot++; + } + } + + return {total, sliceLen, static_cast(map.size()), + std::move(offsets), std::move(sliceMap)}; +} + +DurchschlagContext durchschlag_prepare(size_t slice_len, + const std::vector& sample_sizes, const DurchschlagIndex& index) { + /* Parameters aliasing */ + TextIdx sliceLen = static_cast(slice_len); + if (sliceLen != slice_len) fatal("slice_len is too large"); + if (sliceLen < 1) fatal("slice_len is too small"); + const TextIdx* lcp = index.lcp.data(); + const TextIdx* sa = index.sa.data(); + + TextIdx total = 0; + std::vector offsets; + offsets.reserve(sample_sizes.size()); + for (size_t i = 0; i < sample_sizes.size(); ++i) { + TextIdx delta = static_cast(sample_sizes[i]); + if (delta != sample_sizes[i]) fatal("sample is too large"); + if (delta == 0) fatal("0-length samples are prohibited"); + TextIdx next_total = total + delta; + if (next_total <= total) fatal("corpus is too large"); + total = next_total; + offsets.push_back(total); + } + + if (total < sliceLen) fatal("slice_len is larger than corpus size"); + TextIdx counter = 1; + TextIdx end = total - sliceLen + 1; + std::vector sliceMap(total); + TextIdx last = 0; + TextIdx current = 1; + while (current <= total) { + if (lcp[current - 1] < sliceLen) { + for (TextIdx i = last; i < current; ++i) { + sliceMap[sa[i]] = counter; + } + counter++; + last = current; + } + current++; + } + sliceMap.resize(end); + + // Reorder items for the better locality. + std::vector reorder(counter); + counter = 1; + for (TextIdx i = 0; i < end; ++i) { + if (reorder[sliceMap[i]] == 0) { + reorder[sliceMap[i]] = counter++; + } + } + for (TextIdx i = 0; i < end; ++i) { + sliceMap[i] = reorder[sliceMap[i]]; + } + + return {total, sliceLen, counter, std::move(offsets), std::move(sliceMap)}; +} + +DurchschlagIndex durchschlag_index(const std::vector& data) { + TextIdx total = static_cast(data.size()); + if (total != data.size()) fatal("corpus is too large"); + saidx_t saTotal = static_cast(total); + if (saTotal < 0) fatal("corpus is too large"); + if (static_cast(saTotal) != total) fatal("corpus is too large"); + std::vector sa(total); + /* Hopefully, non-negative int32_t values match TextIdx ones. */ + if (sizeof(TextIdx) != sizeof(int32_t)) fatal("type length mismatch"); + int32_t* saData = reinterpret_cast(sa.data()); + divsufsort(data.data(), saData, saTotal); + + std::vector isa(total); + for (TextIdx i = 0; i < total; ++i) isa[sa[i]] = i; + + // TODO: borrowed -> unknown efficiency. + std::vector lcp(total); + TextIdx k = 0; + lcp[total - 1] = 0; + for (TextIdx i = 0; i < total; ++i) { + TextIdx current = isa[i]; + if (current == total - 1) { + k = 0; + continue; + } + TextIdx j = sa[current + 1]; // Suffix which follow i-th suffix. + while ((i + k < total) && (j + k < total) && (data[i + k] == data[j + k])) { + ++k; + } + lcp[current] = k; + if (k > 0) --k; + } + + return {std::move(lcp), std::move(sa)}; +} + +static void ScoreSlices(const std::vector& offsets, + std::vector& map, const TextIdx* shortcut, TextIdx end) { + TextIdx piece = 0; + /* Fresh map contains all zeroes -> initial mark should be different. */ + TextIdx mark = 1; + for (TextIdx i = 0; i < end; ++i) { + if (offsets[piece] == i) { + piece++; + mark++; + } + MetaSlot& item = map[shortcut[i]]; + if (item.mark != mark) { + item.mark = mark; + item.score++; + } + } +} + +static std::string durchschlagGenerateExclusive( + size_t dictionary_size_limit, size_t block_len, + const DurchschlagContext& context, const uint8_t* sample_data) { + /* Parameters aliasing */ + TextIdx targetSize = static_cast(dictionary_size_limit); + if (targetSize != dictionary_size_limit) { + fprintf(stderr, "dictionary_size_limit is too large\n"); + return ""; + } + TextIdx sliceLen = context.sliceLen; + TextIdx total = context.dataSize; + TextIdx blockLen = static_cast(block_len); + if (blockLen != block_len) { + fprintf(stderr, "block_len is too large\n"); + return ""; + } + const uint8_t* data = sample_data; + const std::vector& offsets = context.offsets; + std::vector map(context.numUniqueSlices); + const TextIdx* shortcut = context.sliceMap.data(); + + /* Initialization */ + if (blockLen < sliceLen) { + fprintf(stderr, "sliceLen is larger than block_len\n"); + return ""; + } + if (targetSize < blockLen || total < blockLen) { + fprintf(stderr, "block_len is too large\n"); + return ""; + } + TextIdx end = total - sliceLen + 1; + ScoreSlices(offsets, map, shortcut, end); + end = total - blockLen + 1; + std::vector candidates; + std::vector next(end); + TextIdx span = blockLen - sliceLen + 1; + Score maxScore = rebuildCandidatesList( + &candidates, &map, span, shortcut, end, next.data()); + + /* Block selection */ + const size_t triesLimit = (600 * 1000000) / span; + const size_t candidatesLimit = (150 * 1000000) / span; + std::vector ranges; + TextIdx mark = 0; + size_t numTries = 0; + while (true) { + TextIdx dictSize = calculateDictionarySize(ranges); + size_t numCandidates = 0; + if (dictSize > targetSize - blockLen) { + break; + } + if (maxScore == 0) { + break; + } + while (true) { + TextIdx candidate = 0; + while (maxScore > 0) { + if (candidates[maxScore] != 0) { + candidate = candidates[maxScore]; + candidates[maxScore] = next[candidate]; + break; + } + maxScore--; + } + if (maxScore == 0) { + break; + } + mark++; + numTries++; + numCandidates++; + Score score = 0; + for (size_t j = candidate; j <= candidate + span; ++j) { + MetaSlot& item = map[shortcut[j]]; + if (item.mark != mark) { + score += item.score; + item.mark = mark; + } + } + if (score < maxScore) { + if (numTries < triesLimit && numCandidates < candidatesLimit) { + next[candidate] = candidates[score]; + candidates[score] = candidate; + } else { + maxScore = rebuildCandidatesList( + &candidates, &map, span, shortcut, end, next.data()); + mark = 0; + numTries = 0; + numCandidates = 0; + } + continue; + } else if (score > maxScore) { + fprintf(stderr, "Broken invariant\n"); + return ""; + } + for (TextIdx j = candidate; j <= candidate + span; ++j) { + MetaSlot& item = map[shortcut[j]]; + item.score = 0; + } + addRange(&ranges, candidate, candidate + blockLen); + break; + } + } + + return createDictionary(data, ranges, targetSize); +} + +static std::string durchschlagGenerateCollaborative( + size_t dictionary_size_limit, size_t block_len, + const DurchschlagContext& context, const uint8_t* sample_data) { + /* Parameters aliasing */ + TextIdx targetSize = static_cast(dictionary_size_limit); + if (targetSize != dictionary_size_limit) { + fprintf(stderr, "dictionary_size_limit is too large\n"); + return ""; + } + TextIdx sliceLen = context.sliceLen; + TextIdx total = context.dataSize; + TextIdx blockLen = static_cast(block_len); + if (blockLen != block_len) { + fprintf(stderr, "block_len is too large\n"); + return ""; + } + const uint8_t* data = sample_data; + const std::vector& offsets = context.offsets; + std::vector map(context.numUniqueSlices); + const TextIdx* shortcut = context.sliceMap.data(); + + /* Initialization */ + if (blockLen < sliceLen) { + fprintf(stderr, "sliceLen is larger than block_len\n"); + return ""; + } + if (targetSize < blockLen || total < blockLen) { + fprintf(stderr, "block_len is too large\n"); + return ""; + } + TextIdx end = total - sliceLen + 1; + ScoreSlices(offsets, map, shortcut, end); + end = total - blockLen + 1; + std::vector candidates; + candidates.reserve(CANDIDATE_BUNDLE_SIZE + 1024); + TextIdx span = blockLen - sliceLen + 1; + Score minScore = buildCandidatesList(&candidates, &map, span, shortcut, end); + + /* Block selection */ + std::vector ranges; + TextIdx mark = 0; + while (true) { + TextIdx dictSize = calculateDictionarySize(ranges); + if (dictSize > targetSize - blockLen) { + break; + } + if (minScore == 0 && candidates.empty()) { + break; + } + while (true) { + if (candidates.empty()) { + minScore = buildCandidatesList(&candidates, &map, span, shortcut, end); + mark = 0; + } + TextIdx candidate = candidates[0].position; + Score expectedScore = candidates[0].score; + if (expectedScore == 0) { + candidates.resize(0); + break; + } + std::pop_heap(candidates.begin(), candidates.end(), lessScore()); + candidates.pop_back(); + mark++; + Score score = 0; + for (TextIdx j = candidate; j <= candidate + span; ++j) { + MetaSlot& item = map[shortcut[j]]; + if (item.mark != mark) { + score += item.score; + item.mark = mark; + } + } + if (score < expectedScore) { + if (score >= minScore) { + candidates.push_back({score, candidate}); + std::push_heap(candidates.begin(), candidates.end(), lessScore()); + } + continue; + } else if (score > expectedScore) { + fatal("Broken invariant"); + } + for (TextIdx j = candidate; j <= candidate + span; ++j) { + MetaSlot& item = map[shortcut[j]]; + item.score = 0; + } + addRange(&ranges, candidate, candidate + blockLen); + break; + } + } + + return createDictionary(data, ranges, targetSize); +} + +std::string durchschlag_generate(DurchschalgResourceStrategy strategy, + size_t dictionary_size_limit, size_t block_len, + const DurchschlagContext& context, const uint8_t* sample_data) { + if (strategy == DURCHSCHLAG_COLLABORATIVE) { + return durchschlagGenerateCollaborative( + dictionary_size_limit, block_len, context, sample_data); + } else { + return durchschlagGenerateExclusive( + dictionary_size_limit, block_len, context, sample_data); + } +} + +void durchschlag_distill(size_t slice_len, size_t minimum_population, + std::vector* sample_sizes, uint8_t* sample_data) { + /* Parameters aliasing */ + uint8_t* data = sample_data; + + /* Build slice map. */ + DurchschlagContext context = durchschlag_prepare( + slice_len, *sample_sizes, data); + + /* Calculate slice population. */ + const std::vector& offsets = context.offsets; + std::vector map(context.numUniqueSlices); + const TextIdx* shortcut = context.sliceMap.data(); + TextIdx sliceLen = context.sliceLen; + TextIdx total = context.dataSize; + TextIdx end = total - sliceLen + 1; + ScoreSlices(offsets, map, shortcut, end); + + /* Condense samples, omitting unique slices. */ + TextIdx readPos = 0; + TextIdx writePos = 0; + TextIdx lastNonUniquePos = 0; + for (TextIdx i = 0; i < sample_sizes->size(); ++i) { + TextIdx sampleStart = writePos; + TextIdx oldSampleEnd = + readPos + static_cast(sample_sizes->at(i)); + while (readPos < oldSampleEnd) { + if (readPos < end) { + MetaSlot& item = map[shortcut[readPos]]; + if (item.score >= minimum_population) { + lastNonUniquePos = readPos + sliceLen; + } + } + if (readPos < lastNonUniquePos) { + data[writePos++] = data[readPos]; + } + readPos++; + } + sample_sizes->at(i) = writePos - sampleStart; + } +} + +void durchschlag_purify(size_t slice_len, size_t minimum_population, + const std::vector& sample_sizes, uint8_t* sample_data) { + /* Parameters aliasing */ + uint8_t* data = sample_data; + + /* Build slice map. */ + DurchschlagContext context = durchschlag_prepare( + slice_len, sample_sizes, data); + + /* Calculate slice population. */ + const std::vector& offsets = context.offsets; + std::vector map(context.numUniqueSlices); + const TextIdx* shortcut = context.sliceMap.data(); + TextIdx sliceLen = context.sliceLen; + TextIdx total = context.dataSize; + TextIdx end = total - sliceLen + 1; + ScoreSlices(offsets, map, shortcut, end); + + /* Rewrite samples, zeroing out unique slices. */ + TextIdx lastNonUniquePos = 0; + for (TextIdx readPos = 0; readPos < total; ++readPos) { + if (readPos < end) { + MetaSlot& item = map[shortcut[readPos]]; + if (item.score >= minimum_population) { + lastNonUniquePos = readPos + sliceLen; + } + } + if (readPos >= lastNonUniquePos) { + data[readPos] = 0; + } + } +} diff --git a/research/durchschlag.h b/research/durchschlag.h new file mode 100755 index 0000000..adbc531 --- /dev/null +++ b/research/durchschlag.h @@ -0,0 +1,99 @@ +#ifndef BROTLI_RESEARCH_DURCHSCHLAG_H_ +#define BROTLI_RESEARCH_DURCHSCHLAG_H_ + +#include +#include +#include +#include + +/** + * Generate a dictionary for given samples. + * + * @param dictionary_size_limit maximal dictionary size + * @param slice_len text slice size + * @param block_len score block length + * @param sample_sizes vector with sample sizes + * @param sample_data concatenated samples + * @return generated dictionary + */ +std::string durchschlag_generate( + size_t dictionary_size_limit, size_t slice_len, size_t block_len, + const std::vector& sample_sizes, const uint8_t* sample_data); + +//------------------------------------------------------------------------------ +// Lower level API for repetitive dictionary generation. +//------------------------------------------------------------------------------ + +/* Pointer to position in text. */ +typedef uint32_t DurchschlagTextIdx; + +/* Context is made public for flexible serialization / deserialization. */ +typedef struct DurchschlagContext { + DurchschlagTextIdx dataSize; + DurchschlagTextIdx sliceLen; + DurchschlagTextIdx numUniqueSlices; + std::vector offsets; + std::vector sliceMap; +} DurchschlagContext; + +DurchschlagContext durchschlag_prepare(size_t slice_len, + const std::vector& sample_sizes, const uint8_t* sample_data); + +typedef enum DurchschalgResourceStrategy { + // Faster + DURCHSCHLAG_EXCLUSIVE = 0, + // Uses much less memory + DURCHSCHLAG_COLLABORATIVE = 1 +} DurchschalgResourceStrategy; + +std::string durchschlag_generate(DurchschalgResourceStrategy strategy, + size_t dictionary_size_limit, size_t block_len, + const DurchschlagContext& context, const uint8_t* sample_data); + +//------------------------------------------------------------------------------ +// Suffix Array based preparation. +//------------------------------------------------------------------------------ + +typedef struct DurchschlagIndex { + std::vector lcp; + std::vector sa; +} DurchschlagIndex; + +DurchschlagIndex durchschlag_index(const std::vector& data); + +DurchschlagContext durchschlag_prepare(size_t slice_len, + const std::vector& sample_sizes, const DurchschlagIndex& index); + +//------------------------------------------------------------------------------ +// Data preparation. +//------------------------------------------------------------------------------ + +/** + * Cut out unique slices. + * + * Both @p sample_sizes and @p sample_data are modified in-place. Number of + * samples remains unchanged, but some samples become shorter. + * + * @param slice_len (unique) slice size + * @param minimum_population minimum non-unique slice occurrence + * @param sample_sizes [in / out] vector with sample sizes + * @param sample_data [in / out] concatenated samples + */ +void durchschlag_distill(size_t slice_len, size_t minimum_population, + std::vector* sample_sizes, uint8_t* sample_data); + +/** + * Replace unique slices with zeroes. + * + * @p sample_data is modified in-place. Number of samples and their length + * remain unchanged. + * + * @param slice_len (unique) slice size + * @param minimum_population minimum non-unique slice occurrence + * @param sample_sizes vector with sample sizes + * @param sample_data [in / out] concatenated samples + */ +void durchschlag_purify(size_t slice_len, size_t minimum_population, + const std::vector& sample_sizes, uint8_t* sample_data); + +#endif // BROTLI_RESEARCH_DURCHSCHLAG_H_ diff --git a/research/libdivsufsort b/research/libdivsufsort new file mode 160000 index 0000000..5f60d6f --- /dev/null +++ b/research/libdivsufsort @@ -0,0 +1 @@ +Subproject commit 5f60d6f026c30fb4ac296f696b3c8b0eb71bd428 diff --git a/research/sieve.cc b/research/sieve.cc index fbc1dbf..4d147e1 100755 --- a/research/sieve.cc +++ b/research/sieve.cc @@ -1,19 +1,27 @@ #include "./sieve.h" +/* Pointer to position in (combined corpus) text. */ +typedef uint32_t TextIdx; + +/* Index of sample / generation. */ +typedef uint16_t SampleIdx; + typedef struct Slot { - uint32_t next; - uint32_t offset; - uint16_t presence; - uint16_t mark; + TextIdx next; + TextIdx offset; + SampleIdx presence; + SampleIdx mark; } Slot; -static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end, - size_t middle, uint16_t minPresence, uint16_t iteration) { - int from = -2; - int to = -1; - size_t result = 0; - uint16_t targetPresence = minPresence; - for (uint32_t i = 0; i < end; ++i) { +static const TextIdx kNowhere = static_cast(-1); + +static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut, + TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) { + TextIdx from = kNowhere; + TextIdx to = kNowhere; + TextIdx result = 0; + SampleIdx targetPresence = minPresence; + for (TextIdx i = 0; i < end; ++i) { if (i == middle) { targetPresence++; } @@ -21,8 +29,8 @@ static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end, if (item.mark != iteration) { item.mark = iteration; if (item.presence >= targetPresence) { - if (to < i) { - if (from > 0) { + if ((to == kNowhere) || (to < i)) { + if (from != kNowhere) { result += to - from; } from = i; @@ -31,20 +39,20 @@ static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end, } } } - if (from > 0) { + if (from != kNowhere) { result += to - from; } return result; } -static std::string createDictionary(const uint8_t* data, size_t sliceLen, - Slot* map, uint32_t* shortcut, size_t end, size_t middle, - uint16_t minPresence, uint16_t iteration) { +static std::string createDictionary(const uint8_t* data, TextIdx sliceLen, + Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle, + SampleIdx minPresence, SampleIdx iteration) { std::string output; - int from = -2; - int to = -1; - uint16_t targetPresence = minPresence; - for (uint32_t i = 0; i < end; ++i) { + TextIdx from = kNowhere; + TextIdx to = kNowhere; + SampleIdx targetPresence = minPresence; + for (TextIdx i = 0; i < end; ++i) { if (i == middle) { targetPresence++; } @@ -52,8 +60,8 @@ static std::string createDictionary(const uint8_t* data, size_t sliceLen, if (item.mark != iteration) { item.mark = iteration; if (item.presence >= targetPresence) { - if (to < i) { - if (from > 0) { + if ((to == kNowhere) || (to < i)) { + if (from != kNowhere) { output.insert(output.end(), &data[from], &data[to]); } from = i; @@ -62,7 +70,7 @@ static std::string createDictionary(const uint8_t* data, size_t sliceLen, } } } - if (from > 0) { + if (from != kNowhere) { output.insert(output.end(), &data[from], &data[to]); } return output; @@ -71,55 +79,95 @@ static std::string createDictionary(const uint8_t* data, size_t sliceLen, std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, const std::vector& sample_sizes, const uint8_t* sample_data) { /* Parameters aliasing */ - size_t targetSize = dictionary_size_limit; - size_t sliceLen = slice_len; + TextIdx targetSize = static_cast(dictionary_size_limit); + if (targetSize != dictionary_size_limit) { + fprintf(stderr, "dictionary_size_limit is too large\n"); + return ""; + } + TextIdx sliceLen = static_cast(slice_len); + if (sliceLen != slice_len) { + fprintf(stderr, "slice_len is too large\n"); + return ""; + } + if (sliceLen < 1) { + fprintf(stderr, "slice_len is too small\n"); + return ""; + } + SampleIdx numSamples = static_cast(sample_sizes.size()); + if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) { + fprintf(stderr, "too many samples\n"); + return ""; + } const uint8_t* data = sample_data; - size_t total = 0; - std::vector offsets; - for (size_t i = 0; i < sample_sizes.size(); ++i) { - total += sample_sizes[i]; + TextIdx total = 0; + std::vector offsets; + for (SampleIdx i = 0; i < numSamples; ++i) { + TextIdx delta = static_cast(sample_sizes[i]); + if (delta != sample_sizes[i]) { + fprintf(stderr, "sample is too large\n"); + return ""; + } + if (delta == 0) { + fprintf(stderr, "empty samples are prohibited\n"); + return ""; + } + if (total + delta <= total) { + fprintf(stderr, "corpus is too large\n"); + return ""; + } + total += delta; offsets.push_back(total); } + if (total * 2 < total) { + fprintf(stderr, "corpus is too large\n"); + return ""; + } + + if (total < sliceLen) { + fprintf(stderr, "slice_len is larger than corpus size\n"); + return ""; + } + /***************************************************************************** * Build coverage map. ****************************************************************************/ std::vector map; - std::vector shortcut; + std::vector shortcut; map.push_back({0, 0, 0, 0}); - size_t end = total - sliceLen; - int hashLen = 8; - while ((1 << hashLen) < end) { + TextIdx end = total - sliceLen; + TextIdx hashLen = 11; + while (hashLen < 29 && ((1u << hashLen) < end)) { hashLen += 3; } hashLen -= 3; - uint32_t hashMask = (1u << hashLen) - 1u; - std::vector hashHead(1 << hashLen); - uint32_t hashSlot = 1; - uint16_t piece = 0; - uint32_t hash = 0; - int lShift = 3; - int rShift = hashLen - lShift; - for (int i = 0; i < sliceLen - 1; ++i) { - uint32_t v = data[i]; + TextIdx hashMask = (1u << hashLen) - 1u; + std::vector hashHead(1 << hashLen); + TextIdx hashSlot = 1; + SampleIdx piece = 0; + TextIdx hash = 0; + TextIdx lShift = 3; + TextIdx rShift = hashLen - lShift; + for (TextIdx i = 0; i < sliceLen - 1; ++i) { + TextIdx v = data[i]; hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; } - int lShiftX = (lShift * (sliceLen - 1)) % hashLen; - int rShiftX = hashLen - lShiftX; - for (uint32_t i = 0; i < end; ++i) { - uint32_t v = data[i + sliceLen - 1]; + TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen; + TextIdx rShiftX = hashLen - lShiftX; + for (TextIdx i = 0; i < end; ++i) { + TextIdx v = data[i + sliceLen - 1]; hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; if (offsets[piece] == i) { piece++; } - uint32_t slot = hashHead[hash]; + TextIdx slot = hashHead[hash]; while (slot != 0) { Slot& item = map[slot]; - int start = item.offset; + TextIdx start = item.offset; bool miss = false; - for (size_t j = 0; j < sliceLen; ++j) { + for (TextIdx j = 0; j < sliceLen; ++j) { if (data[i + j] != data[start + j]) { miss = true; break; @@ -148,8 +196,8 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, /***************************************************************************** * Build dictionary of specified size. ****************************************************************************/ - size_t a = 1; - size_t size = dryRun( + SampleIdx a = 1; + TextIdx size = dryRun( sliceLen, map.data(), shortcut.data(), end, end, a, ++piece); /* Maximal output is smaller than target. */ if (size <= targetSize) { @@ -157,7 +205,7 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece); } - size_t b = offsets.size(); + SampleIdx b = numSamples; size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece); if (size == targetSize) { return createDictionary( @@ -167,7 +215,7 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, if (size < targetSize) { /* size(a) > targetSize > size(b) && a < m < b */ while (a + 1 < b) { - size_t m = (a + b) / 2; + SampleIdx m = static_cast((a + b) / 2); size = dryRun( sliceLen, map.data(), shortcut.data(), end, end, m, ++piece); if (size < targetSize) { @@ -183,18 +231,18 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, a = b; } /* size(minPresence) > targetSize > size(minPresence + 1) */ - size_t minPresence = a; - a = 0; - b = end; + SampleIdx minPresence = a; + TextIdx c = 0; + TextIdx d = end; /* size(a) < targetSize < size(b) && a < m < b */ - while (a + 1 < b) { - size_t m = (a + b) / 2; + while (c + 1 < d) { + TextIdx m = (c + d) / 2; size = dryRun( sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece); if (size < targetSize) { - a = m; + c = m; } else if (size > targetSize) { - b = m; + d = m; } else { return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece); @@ -204,8 +252,8 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, bool unrestricted = false; if (minPresence <= 2 && !unrestricted) { minPresence = 2; - a = end; + c = end; } - return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, a, + return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c, minPresence, ++piece); } diff --git a/research/sieve.h b/research/sieve.h index 2aae669..6c65dc8 100755 --- a/research/sieve.h +++ b/research/sieve.h @@ -1,9 +1,8 @@ #ifndef BROTLI_RESEARCH_SIEVE_H_ #define BROTLI_RESEARCH_SIEVE_H_ -#include -#include - +#include +#include #include #include -- cgit v1.1