diff options
author | Zoltan Szabadka <szabadka@google.com> | 2014-03-20 14:32:35 +0100 |
---|---|---|
committer | Zoltan Szabadka <szabadka@google.com> | 2014-03-20 14:32:35 +0100 |
commit | e7650080a8cedc159fc4e89d7f144c28f4b028b4 (patch) | |
tree | df57cf3742e529f8daefc74c7abb755213e96a48 /enc/hash.h | |
parent | cddab4adef2cecc2923358c199b793292cab094c (diff) | |
download | brotli-e7650080a8cedc159fc4e89d7f144c28f4b028b4.zip brotli-e7650080a8cedc159fc4e89d7f144c28f4b028b4.tar.gz brotli-e7650080a8cedc159fc4e89d7f144c28f4b028b4.tar.bz2 |
Updates to Brotli compression format, decoder and encoder
This commit contains a batch of changes that were made to the Brotli
compression algorithm in the last month. Most important changes:
* Format change: don't push distances representing static dictionary words to the distance cache.
* Fix decoder invalid memory access bug caused by building a non-complete Huffman tree.
* Add a mode parameter to the encoder interface.
* Use different hashers for text and font mode.
* Add a heuristics to the hasher for skipping non-compressible data.
* Exhaustive search of static dictionary during backward reference search.
Diffstat (limited to 'enc/hash.h')
-rw-r--r-- | enc/hash.h | 257 |
1 files changed, 148 insertions, 109 deletions
@@ -24,12 +24,14 @@ #include <sys/types.h> #include <algorithm> #include <cstdlib> +#include <memory> #include <string> #include "./transform.h" #include "./fast_log.h" #include "./find_match_length.h" #include "./port.h" +#include "./static_dict.h" namespace brotli { @@ -41,11 +43,21 @@ namespace brotli { // * The number has been tuned heuristically against compression benchmarks. static const uint32_t kHashMul32 = 0x1e35a7bd; -inline uint32_t Hash3Bytes(const uint8_t *data, const int bits) { - uint32_t h = (BROTLI_UNALIGNED_LOAD32(data) & 0xffffff) * kHashMul32; - // The higher bits contain more mixture from the multiplication, - // so we take our results from there. - return h >> (32 - bits); +template<int kShiftBits, int kMinLength> +inline uint32_t Hash(const uint8_t *data) { + if (kMinLength <= 3) { + // If kMinLength is 2 or 3, we hash the first 3 bytes of data. + uint32_t h = (BROTLI_UNALIGNED_LOAD32(data) & 0xffffff) * kHashMul32; + // The higher bits contain more mixture from the multiplication, + // so we take our results from there. + return h >> (32 - kShiftBits); + } else { + // If kMinLength is at least 4, we hash the first 4 bytes of data. + uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32; + // The higher bits contain more mixture from the multiplication, + // so we take our results from there. + return h >> (32 - kShiftBits); + } } // Usually, we always choose the longest backward reference. This function @@ -67,32 +79,35 @@ inline double BackwardReferenceScore(double average_cost, double start_cost3, double start_cost2, int copy_length, - int backward_reference_offset, - int last_distance1, - int last_distance2, - int last_distance3, - int last_distance4) { + int backward_reference_offset) { double retval = 0; switch (copy_length) { case 2: retval = start_cost2; break; case 3: retval = start_cost3; break; default: retval = start_cost4 + (copy_length - 4) * average_cost; break; } - int diff_last1 = abs(backward_reference_offset - last_distance1); - int diff_last2 = abs(backward_reference_offset - last_distance2); - if (diff_last1 == 0) { - retval += 0.6; - } else if (diff_last1 < 4) { - retval -= 0.9 + 0.03 * diff_last1; - } else if (diff_last2 < 4) { - retval -= 0.95 + 0.1 * diff_last2; - } else if (backward_reference_offset == last_distance3) { - retval -= 1.17; - } else if (backward_reference_offset == last_distance4) { - retval -= 1.27; - } else { - retval -= 1.20 * Log2Floor(backward_reference_offset); + retval -= 1.20 * Log2Floor(backward_reference_offset); + return retval; +} + +inline double BackwardReferenceScoreUsingLastDistance(double average_cost, + double start_cost4, + double start_cost3, + double start_cost2, + int copy_length, + int distance_short_code) { + double retval = 0; + switch (copy_length) { + case 2: retval = start_cost2; break; + case 3: retval = start_cost3; break; + default: retval = start_cost4 + (copy_length - 4) * average_cost; break; } + static const double kDistanceShortCodeBitCost[16] = { + -0.6, 0.95, 1.17, 1.27, + 0.93, 0.93, 0.96, 0.96, 0.99, 0.99, + 1.05, 1.05, 1.15, 1.15, 1.25, 1.25 + }; + retval -= kDistanceShortCodeBitCost[distance_short_code]; return retval; } @@ -102,7 +117,7 @@ inline double BackwardReferenceScore(double average_cost, // This is a hash map of fixed size (kBucketSize) to a ring buffer of // fixed size (kBlockSize). The ring buffer contains the last kBlockSize // index positions of the given hash key in the compressed data. -template <int kBucketBits, int kBlockBits> +template <int kBucketBits, int kBlockBits, int kMinLength> class HashLongestMatch { public: HashLongestMatch() @@ -111,17 +126,24 @@ class HashLongestMatch { last_distance3_(15), last_distance4_(16), insert_length_(0), - average_cost_(5.4) { + average_cost_(5.4), + static_dict_(NULL) { Reset(); } void Reset() { std::fill(&num_[0], &num_[sizeof(num_) / sizeof(num_[0])], 0); } + void SetStaticDictionary(const StaticDictionary *dict) { + static_dict_ = dict; + } + bool HasStaticDictionary() const { + return static_dict_ != NULL; + } // Look at 3 bytes at data. // Compute a hash from these, and store the value of ix at that position. inline void Store(const uint8_t *data, const int ix) { - const uint32_t key = Hash3Bytes(data, kBucketBits); + const uint32_t key = Hash<kBucketBits, kMinLength>(data); const int minor_ix = num_[key] & kBlockMask; buckets_[key][minor_ix] = ix; ++num_[key]; @@ -218,19 +240,17 @@ class HashLongestMatch { const size_t len = FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked], max_length); - if (len >= 3 || (len == 2 && i < 2)) { + if (len >= std::max(kMinLength, 3) || + (kMinLength == 2 && len == 2 && i < 2)) { // Comparing for >= 2 does not change the semantics, but just saves for // a few unnecessary binary logarithms in backward reference score, // since we are not interested in such short matches. - const double score = BackwardReferenceScore(average_cost_, - start_cost4, - start_cost3, - start_cost2, - len, backward, - last_distance1_, - last_distance2_, - last_distance3_, - last_distance4_); + const double score = BackwardReferenceScoreUsingLastDistance( + average_cost_, + start_cost4, + start_cost3, + start_cost2, + len, i); if (best_score < score) { best_score = score; best_len = len; @@ -244,76 +264,41 @@ class HashLongestMatch { } } } - const uint32_t key = Hash3Bytes(&data[cur_ix_masked], kBucketBits); - const int * __restrict const bucket = &buckets_[key][0]; - const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0; - int stop = int(cur_ix) - 64; - if (stop < 0) { stop = 0; } + if (kMinLength == 2) { + int stop = int(cur_ix) - 64; + if (stop < 0) { stop = 0; } + start_cost2 -= 1.0; + for (int i = cur_ix - 1; i > stop; --i) { + size_t prev_ix = i; + const size_t backward = cur_ix - prev_ix; + if (PREDICT_FALSE(backward > max_backward)) { + break; + } + prev_ix &= ring_buffer_mask; + if (data[cur_ix_masked] != data[prev_ix] || + data[cur_ix_masked + 1] != data[prev_ix + 1]) { + continue; + } + int len = 2; + const double score = start_cost2 - 2.3 * Log2Floor(backward); - start_cost2 -= 1.0; - for (int i = cur_ix - 1; i > stop; --i) { - size_t prev_ix = i; - const size_t backward = cur_ix - prev_ix; - if (PREDICT_FALSE(backward > max_backward)) { - break; - } - prev_ix &= ring_buffer_mask; - if (data[cur_ix_masked] != data[prev_ix] || - data[cur_ix_masked + 1] != data[prev_ix + 1]) { - continue; - } - int len = 2; - const double score = start_cost2 - 2.3 * Log2Floor(backward); - - if (best_score < score) { - best_score = score; - best_len = len; - best_ix = backward; - *best_len_out = best_len; - *best_len_code_out = best_len; - *best_distance_out = best_ix; - match_found = true; + if (best_score < score) { + best_score = score; + best_len = len; + best_ix = backward; + *best_len_out = best_len; + *best_len_code_out = best_len; + *best_distance_out = best_ix; + match_found = true; + } } } + const uint32_t key = Hash<kBucketBits, kMinLength>(&data[cur_ix_masked]); + const int * __restrict const bucket = &buckets_[key][0]; + const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0; for (int i = num_[key] - 1; i >= down; --i) { int prev_ix = bucket[i & kBlockMask]; - if (prev_ix < 0) { - prev_ix *= -1; - prev_ix -= 1; - int copy_len_code = prev_ix >> 20; - int word_id = prev_ix & 0xfffff; - std::string word = GetTransformedDictionaryWord(copy_len_code, word_id); - int len = word.size(); - const size_t backward = max_backward + word_id + 1; - bool word_matched = (len >= 3 && len <= max_length); - for (int k = 0; k < len && word_matched; ++k) { - if ((uint8_t)(word[k]) != data[cur_ix_masked + k]) { - word_matched = false; - } - } - if (word_matched) { - const double score = BackwardReferenceScore(average_cost_, - start_cost4, - start_cost3, - start_cost2, - len, backward, - last_distance1_, - last_distance2_, - last_distance3_, - last_distance4_); - if (best_score < score) { - best_score = score; - best_len = len; - best_ix = backward; - *best_len_out = best_len; - *best_len_code_out = copy_len_code; - *best_distance_out = best_ix; - *best_score_out = best_score; - match_found = true; - *in_dictionary = true; - } - } - } else { + if (prev_ix >= 0) { const size_t backward = cur_ix - prev_ix; if (PREDICT_FALSE(backward > max_backward)) { break; @@ -327,7 +312,7 @@ class HashLongestMatch { const size_t len = FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked], max_length); - if (len >= 3) { + if (len >= std::max(kMinLength, 3)) { // Comparing for >= 3 does not change the semantics, but just saves // for a few unnecessary binary logarithms in backward reference // score, since we are not interested in such short matches. @@ -335,11 +320,7 @@ class HashLongestMatch { start_cost4, start_cost3, start_cost2, - len, backward, - last_distance1_, - last_distance2_, - last_distance3_, - last_distance4_); + len, backward); if (best_score < score) { best_score = score; best_len = len; @@ -354,6 +335,36 @@ class HashLongestMatch { } } } + if (static_dict_ != NULL) { + // We decide based on first 4 bytes how many bytes to test for. + int prefix = BROTLI_UNALIGNED_LOAD32(&data[cur_ix_masked]); + int maxlen = static_dict_->GetLength(prefix); + for (int len = std::min<size_t>(maxlen, max_length); + len > best_len && len >= 4; --len) { + std::string snippet((const char *)&data[cur_ix_masked], len); + int copy_len_code; + int word_id; + if (static_dict_->Get(snippet, ©_len_code, &word_id)) { + const size_t backward = max_backward + word_id + 1; + const double score = BackwardReferenceScore(average_cost_, + start_cost4, + start_cost3, + start_cost2, + len, backward); + if (best_score < score) { + best_score = score; + best_len = len; + best_ix = backward; + *best_len_out = best_len; + *best_len_code_out = copy_len_code; + *best_distance_out = best_ix; + *best_score_out = best_score; + match_found = true; + *in_dictionary = true; + } + } + } + } return match_found; } @@ -399,9 +410,37 @@ class HashLongestMatch { int insert_length_; double average_cost_; + + const StaticDictionary *static_dict_; }; -typedef HashLongestMatch<13, 11> Hasher; +struct Hashers { + enum Type { + HASH_15_8_4 = 0, + HASH_15_8_2 = 1, + }; + + void Init(Type type) { + switch (type) { + case HASH_15_8_4: + hash_15_8_4.reset(new HashLongestMatch<15, 8, 4>()); + break; + case HASH_15_8_2: + hash_15_8_2.reset(new HashLongestMatch<15, 8, 2>()); + break; + default: + break; + } + } + + void SetStaticDictionary(const StaticDictionary *dict) { + if (hash_15_8_4.get() != NULL) hash_15_8_4->SetStaticDictionary(dict); + if (hash_15_8_2.get() != NULL) hash_15_8_2->SetStaticDictionary(dict); + } + + std::unique_ptr<HashLongestMatch<15, 8, 4> > hash_15_8_4; + std::unique_ptr<HashLongestMatch<15, 8, 2> > hash_15_8_2; +}; } // namespace brotli |