From 35e69fc7cf9421ab04ffc9d52cb36d07fa12984a Mon Sep 17 00:00:00 2001 From: Eugene Kliuchnikov Date: Mon, 26 Feb 2018 09:04:36 -0500 Subject: New feature: "Large Window Brotli" (#640) * New feature: "Large Window Brotli" By setting special encoder/decoder flag it is now possible to extend LZ-window up to 30 bits; though produced stream will not be RFC7932 compliant. Added new dictionary generator - "DSH". It combines speed of "Sieve" and quality of "DM". Plus utilities to prepare train corpora (remove unique strings). Improved compression ratio: now two sub-blocks could be stitched: the last copy command could be extended to span the next sub-block. Fixed compression ineffectiveness caused by floating numbers rounding and wrong cost heuristic. Other C changes: - combined / moved `context.h` to `common` - moved transforms to `common` - unified some aspects of code formatting - added an abstraction for encoder (static) dictionary - moved default allocator/deallocator functions to `common` brotli CLI: - window size is auto-adjusted if not specified explicitly Java: - added "eager" decoding both to JNI wrapper and pure decoder - huge speed-up of `DictionaryData` initialization * Add dictionaryless compressed dictionary * Fix `sources.lst` * Fix `sources.lst` and add a note that `libtool` is also required. * Update setup.py * Fix `EagerStreamTest` * Fix BUILD file * Add missing `libdivsufsort` dependency * Fix "unused parameter" warning. --- research/sieve.cc | 174 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 111 insertions(+), 63 deletions(-) (limited to 'research/sieve.cc') diff --git a/research/sieve.cc b/research/sieve.cc index fbc1dbf..4d147e1 100755 --- a/research/sieve.cc +++ b/research/sieve.cc @@ -1,19 +1,27 @@ #include "./sieve.h" +/* Pointer to position in (combined corpus) text. */ +typedef uint32_t TextIdx; + +/* Index of sample / generation. */ +typedef uint16_t SampleIdx; + typedef struct Slot { - uint32_t next; - uint32_t offset; - uint16_t presence; - uint16_t mark; + TextIdx next; + TextIdx offset; + SampleIdx presence; + SampleIdx mark; } Slot; -static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end, - size_t middle, uint16_t minPresence, uint16_t iteration) { - int from = -2; - int to = -1; - size_t result = 0; - uint16_t targetPresence = minPresence; - for (uint32_t i = 0; i < end; ++i) { +static const TextIdx kNowhere = static_cast(-1); + +static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut, + TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) { + TextIdx from = kNowhere; + TextIdx to = kNowhere; + TextIdx result = 0; + SampleIdx targetPresence = minPresence; + for (TextIdx i = 0; i < end; ++i) { if (i == middle) { targetPresence++; } @@ -21,8 +29,8 @@ static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end, if (item.mark != iteration) { item.mark = iteration; if (item.presence >= targetPresence) { - if (to < i) { - if (from > 0) { + if ((to == kNowhere) || (to < i)) { + if (from != kNowhere) { result += to - from; } from = i; @@ -31,20 +39,20 @@ static size_t dryRun(size_t sliceLen, Slot* map, uint32_t* shortcut, size_t end, } } } - if (from > 0) { + if (from != kNowhere) { result += to - from; } return result; } -static std::string createDictionary(const uint8_t* data, size_t sliceLen, - Slot* map, uint32_t* shortcut, size_t end, size_t middle, - uint16_t minPresence, uint16_t iteration) { +static std::string createDictionary(const uint8_t* data, TextIdx sliceLen, + Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle, + SampleIdx minPresence, SampleIdx iteration) { std::string output; - int from = -2; - int to = -1; - uint16_t targetPresence = minPresence; - for (uint32_t i = 0; i < end; ++i) { + TextIdx from = kNowhere; + TextIdx to = kNowhere; + SampleIdx targetPresence = minPresence; + for (TextIdx i = 0; i < end; ++i) { if (i == middle) { targetPresence++; } @@ -52,8 +60,8 @@ static std::string createDictionary(const uint8_t* data, size_t sliceLen, if (item.mark != iteration) { item.mark = iteration; if (item.presence >= targetPresence) { - if (to < i) { - if (from > 0) { + if ((to == kNowhere) || (to < i)) { + if (from != kNowhere) { output.insert(output.end(), &data[from], &data[to]); } from = i; @@ -62,7 +70,7 @@ static std::string createDictionary(const uint8_t* data, size_t sliceLen, } } } - if (from > 0) { + if (from != kNowhere) { output.insert(output.end(), &data[from], &data[to]); } return output; @@ -71,55 +79,95 @@ static std::string createDictionary(const uint8_t* data, size_t sliceLen, std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, const std::vector& sample_sizes, const uint8_t* sample_data) { /* Parameters aliasing */ - size_t targetSize = dictionary_size_limit; - size_t sliceLen = slice_len; + TextIdx targetSize = static_cast(dictionary_size_limit); + if (targetSize != dictionary_size_limit) { + fprintf(stderr, "dictionary_size_limit is too large\n"); + return ""; + } + TextIdx sliceLen = static_cast(slice_len); + if (sliceLen != slice_len) { + fprintf(stderr, "slice_len is too large\n"); + return ""; + } + if (sliceLen < 1) { + fprintf(stderr, "slice_len is too small\n"); + return ""; + } + SampleIdx numSamples = static_cast(sample_sizes.size()); + if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) { + fprintf(stderr, "too many samples\n"); + return ""; + } const uint8_t* data = sample_data; - size_t total = 0; - std::vector offsets; - for (size_t i = 0; i < sample_sizes.size(); ++i) { - total += sample_sizes[i]; + TextIdx total = 0; + std::vector offsets; + for (SampleIdx i = 0; i < numSamples; ++i) { + TextIdx delta = static_cast(sample_sizes[i]); + if (delta != sample_sizes[i]) { + fprintf(stderr, "sample is too large\n"); + return ""; + } + if (delta == 0) { + fprintf(stderr, "empty samples are prohibited\n"); + return ""; + } + if (total + delta <= total) { + fprintf(stderr, "corpus is too large\n"); + return ""; + } + total += delta; offsets.push_back(total); } + if (total * 2 < total) { + fprintf(stderr, "corpus is too large\n"); + return ""; + } + + if (total < sliceLen) { + fprintf(stderr, "slice_len is larger than corpus size\n"); + return ""; + } + /***************************************************************************** * Build coverage map. ****************************************************************************/ std::vector map; - std::vector shortcut; + std::vector shortcut; map.push_back({0, 0, 0, 0}); - size_t end = total - sliceLen; - int hashLen = 8; - while ((1 << hashLen) < end) { + TextIdx end = total - sliceLen; + TextIdx hashLen = 11; + while (hashLen < 29 && ((1u << hashLen) < end)) { hashLen += 3; } hashLen -= 3; - uint32_t hashMask = (1u << hashLen) - 1u; - std::vector hashHead(1 << hashLen); - uint32_t hashSlot = 1; - uint16_t piece = 0; - uint32_t hash = 0; - int lShift = 3; - int rShift = hashLen - lShift; - for (int i = 0; i < sliceLen - 1; ++i) { - uint32_t v = data[i]; + TextIdx hashMask = (1u << hashLen) - 1u; + std::vector hashHead(1 << hashLen); + TextIdx hashSlot = 1; + SampleIdx piece = 0; + TextIdx hash = 0; + TextIdx lShift = 3; + TextIdx rShift = hashLen - lShift; + for (TextIdx i = 0; i < sliceLen - 1; ++i) { + TextIdx v = data[i]; hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; } - int lShiftX = (lShift * (sliceLen - 1)) % hashLen; - int rShiftX = hashLen - lShiftX; - for (uint32_t i = 0; i < end; ++i) { - uint32_t v = data[i + sliceLen - 1]; + TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen; + TextIdx rShiftX = hashLen - lShiftX; + for (TextIdx i = 0; i < end; ++i) { + TextIdx v = data[i + sliceLen - 1]; hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; if (offsets[piece] == i) { piece++; } - uint32_t slot = hashHead[hash]; + TextIdx slot = hashHead[hash]; while (slot != 0) { Slot& item = map[slot]; - int start = item.offset; + TextIdx start = item.offset; bool miss = false; - for (size_t j = 0; j < sliceLen; ++j) { + for (TextIdx j = 0; j < sliceLen; ++j) { if (data[i + j] != data[start + j]) { miss = true; break; @@ -148,8 +196,8 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, /***************************************************************************** * Build dictionary of specified size. ****************************************************************************/ - size_t a = 1; - size_t size = dryRun( + SampleIdx a = 1; + TextIdx size = dryRun( sliceLen, map.data(), shortcut.data(), end, end, a, ++piece); /* Maximal output is smaller than target. */ if (size <= targetSize) { @@ -157,7 +205,7 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece); } - size_t b = offsets.size(); + SampleIdx b = numSamples; size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece); if (size == targetSize) { return createDictionary( @@ -167,7 +215,7 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, if (size < targetSize) { /* size(a) > targetSize > size(b) && a < m < b */ while (a + 1 < b) { - size_t m = (a + b) / 2; + SampleIdx m = static_cast((a + b) / 2); size = dryRun( sliceLen, map.data(), shortcut.data(), end, end, m, ++piece); if (size < targetSize) { @@ -183,18 +231,18 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, a = b; } /* size(minPresence) > targetSize > size(minPresence + 1) */ - size_t minPresence = a; - a = 0; - b = end; + SampleIdx minPresence = a; + TextIdx c = 0; + TextIdx d = end; /* size(a) < targetSize < size(b) && a < m < b */ - while (a + 1 < b) { - size_t m = (a + b) / 2; + while (c + 1 < d) { + TextIdx m = (c + d) / 2; size = dryRun( sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece); if (size < targetSize) { - a = m; + c = m; } else if (size > targetSize) { - b = m; + d = m; } else { return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece); @@ -204,8 +252,8 @@ std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, bool unrestricted = false; if (minPresence <= 2 && !unrestricted) { minPresence = 2; - a = end; + c = end; } - return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, a, + return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c, minPresence, ++piece); } -- cgit v1.1