Updates to Brotli compression format, decoder and encoder

This commit contains a batch of changes that were made to the Brotli compression algorithm in the last month. Most important changes: * Format change: don't push distances representing static dictionary words to the distance cache. * Fix decoder invalid memory access bug caused by building a non-complete Huffman tree. * Add a mode parameter to the encoder interface. * Use different hashers for text and font mode. * Add a heuristics to the hasher for skipping non-compressible data. * Exhaustive search of static dictionary during backward reference search.
author: Zoltan Szabadka <szabadka@google.com> 2014-03-20 14:32:35 +0100
committer: Zoltan Szabadka <szabadka@google.com> 2014-03-20 14:32:35 +0100
commit: e7650080a8cedc159fc4e89d7f144c28f4b028b4 (patch)
tree: df57cf3742e529f8daefc74c7abb755213e96a48 /enc/hash.h
parent: cddab4adef2cecc2923358c199b793292cab094c (diff)
download: brotli-e7650080a8cedc159fc4e89d7f144c28f4b028b4.zip
brotli-e7650080a8cedc159fc4e89d7f144c28f4b028b4.tar.gz
brotli-e7650080a8cedc159fc4e89d7f144c28f4b028b4.tar.bz2
1 files changed, 148 insertions, 109 deletions
diff --git a/enc/hash.h b/enc/hash.h
index d6a415b..b07a538 100644
--- a/enc/hash.h
+++ b/enc/hash.h
@@ -24,12 +24,14 @@
 #include <sys/types.h>
 #include <algorithm>
 #include <cstdlib>
+#include <memory>
 #include <string>
 
 #include "./transform.h"
 #include "./fast_log.h"
 #include "./find_match_length.h"
 #include "./port.h"
+#include "./static_dict.h"
 
 namespace brotli {
 
@@ -41,11 +43,21 @@ namespace brotli {
 // * The number has been tuned heuristically against compression benchmarks.
 static const uint32_t kHashMul32 = 0x1e35a7bd;
 
-inline uint32_t Hash3Bytes(const uint8_t *data, const int bits) {
-  uint32_t h = (BROTLI_UNALIGNED_LOAD32(data) & 0xffffff) * kHashMul32;
-  // The higher bits contain more mixture from the multiplication,
-  // so we take our results from there.
-  return h >> (32 - bits);
+template<int kShiftBits, int kMinLength>
+inline uint32_t Hash(const uint8_t *data) {
+  if (kMinLength <= 3) {
+    // If kMinLength is 2 or 3, we hash the first 3 bytes of data.
+    uint32_t h = (BROTLI_UNALIGNED_LOAD32(data) & 0xffffff) * kHashMul32;
+    // The higher bits contain more mixture from the multiplication,
+    // so we take our results from there.
+    return h >> (32 - kShiftBits);
+  } else {
+    // If kMinLength is at least 4, we hash the first 4 bytes of data.
+    uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
+    // The higher bits contain more mixture from the multiplication,
+    // so we take our results from there.
+    return h >> (32 - kShiftBits);
+  }
 }
 
 // Usually, we always choose the longest backward reference. This function
@@ -67,32 +79,35 @@ inline double BackwardReferenceScore(double average_cost,
                                      double start_cost3,
                                      double start_cost2,
                                      int copy_length,
-                                     int backward_reference_offset,
-                                     int last_distance1,
-                                     int last_distance2,
-                                     int last_distance3,
-                                     int last_distance4) {
+                                     int backward_reference_offset) {
   double retval = 0;
   switch (copy_length) {
     case 2: retval = start_cost2; break;
     case 3: retval = start_cost3; break;
     default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
   }
-  int diff_last1 = abs(backward_reference_offset - last_distance1);
-  int diff_last2 = abs(backward_reference_offset - last_distance2);
-  if (diff_last1 == 0) {
-    retval += 0.6;
-  } else if (diff_last1 < 4) {
-    retval -= 0.9 + 0.03 * diff_last1;
-  } else if (diff_last2 < 4) {
-    retval -= 0.95 + 0.1 * diff_last2;
-  } else if (backward_reference_offset == last_distance3) {
-    retval -= 1.17;
-  } else if (backward_reference_offset == last_distance4) {
-    retval -= 1.27;
-  } else {
-    retval -= 1.20 * Log2Floor(backward_reference_offset);
+  retval -= 1.20 * Log2Floor(backward_reference_offset);
+  return retval;
+}
+
+inline double BackwardReferenceScoreUsingLastDistance(double average_cost,
+                                                      double start_cost4,
+                                                      double start_cost3,
+                                                      double start_cost2,
+                                                      int copy_length,
+                                                      int distance_short_code) {
+  double retval = 0;
+  switch (copy_length) {
+    case 2: retval = start_cost2; break;
+    case 3: retval = start_cost3; break;
+    default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
   }
+  static const double kDistanceShortCodeBitCost[16] = {
+    -0.6, 0.95, 1.17, 1.27,
+    0.93, 0.93, 0.96, 0.96, 0.99, 0.99,
+    1.05, 1.05, 1.15, 1.15, 1.25, 1.25
+  };
+  retval -= kDistanceShortCodeBitCost[distance_short_code];
   return retval;
 }
 
@@ -102,7 +117,7 @@ inline double BackwardReferenceScore(double average_cost,
 // This is a hash map of fixed size (kBucketSize) to a ring buffer of
 // fixed size (kBlockSize). The ring buffer contains the last kBlockSize
 // index positions of the given hash key in the compressed data.
-template <int kBucketBits, int kBlockBits>
+template <int kBucketBits, int kBlockBits, int kMinLength>
 class HashLongestMatch {
  public:
   HashLongestMatch()
@@ -111,17 +126,24 @@ class HashLongestMatch {
         last_distance3_(15),
         last_distance4_(16),
         insert_length_(0),
-        average_cost_(5.4) {
+        average_cost_(5.4),
+        static_dict_(NULL) {
     Reset();
   }
   void Reset() {
     std::fill(&num_[0], &num_[sizeof(num_) / sizeof(num_[0])], 0);
   }
+  void SetStaticDictionary(const StaticDictionary *dict) {
+    static_dict_ = dict;
+  }
+  bool HasStaticDictionary() const {
+    return static_dict_ != NULL;
+  }
 
   // Look at 3 bytes at data.
   // Compute a hash from these, and store the value of ix at that position.
   inline void Store(const uint8_t *data, const int ix) {
-    const uint32_t key = Hash3Bytes(data, kBucketBits);
+    const uint32_t key = Hash<kBucketBits, kMinLength>(data);
     const int minor_ix = num_[key] & kBlockMask;
     buckets_[key][minor_ix] = ix;
     ++num_[key];
@@ -218,19 +240,17 @@ class HashLongestMatch {
       const size_t len =
           FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
                                    max_length);
-      if (len >= 3 || (len == 2 && i < 2)) {
+      if (len >= std::max(kMinLength, 3) ||
+          (kMinLength == 2 && len == 2 && i < 2)) {
         // Comparing for >= 2 does not change the semantics, but just saves for
         // a few unnecessary binary logarithms in backward reference score,
         // since we are not interested in such short matches.
-        const double score = BackwardReferenceScore(average_cost_,
-                                                    start_cost4,
-                                                    start_cost3,
-                                                    start_cost2,
-                                                    len, backward,
-                                                    last_distance1_,
-                                                    last_distance2_,
-                                                    last_distance3_,
-                                                    last_distance4_);
+        const double score = BackwardReferenceScoreUsingLastDistance(
+            average_cost_,
+            start_cost4,
+            start_cost3,
+            start_cost2,
+            len, i);
         if (best_score < score) {
           best_score = score;
           best_len = len;
@@ -244,76 +264,41 @@ class HashLongestMatch {
         }
       }
     }
-    const uint32_t key = Hash3Bytes(&data[cur_ix_masked], kBucketBits);
-    const int * __restrict const bucket = &buckets_[key][0];
-    const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
-    int stop = int(cur_ix) - 64;
-    if (stop < 0) { stop = 0; }
+    if (kMinLength == 2) {
+      int stop = int(cur_ix) - 64;
+      if (stop < 0) { stop = 0; }
+      start_cost2 -= 1.0;
+      for (int i = cur_ix - 1; i > stop; --i) {
+        size_t prev_ix = i;
+        const size_t backward = cur_ix - prev_ix;
+        if (PREDICT_FALSE(backward > max_backward)) {
+          break;
+        }
+        prev_ix &= ring_buffer_mask;
+        if (data[cur_ix_masked] != data[prev_ix] ||
+            data[cur_ix_masked + 1] != data[prev_ix + 1]) {
+          continue;
+        }
+        int len = 2;
+        const double score = start_cost2 - 2.3 * Log2Floor(backward);
 
-    start_cost2 -= 1.0;
-    for (int i = cur_ix - 1; i > stop; --i) {
-      size_t prev_ix = i;
-      const size_t backward = cur_ix - prev_ix;
-      if (PREDICT_FALSE(backward > max_backward)) {
-        break;
-      }
-      prev_ix &= ring_buffer_mask;
-      if (data[cur_ix_masked] != data[prev_ix] ||
-          data[cur_ix_masked + 1] != data[prev_ix + 1]) {
-        continue;
-      }
-      int len = 2;
-      const double score = start_cost2 - 2.3 * Log2Floor(backward);
-
-      if (best_score < score) {
-        best_score = score;
-        best_len = len;
-        best_ix = backward;
-        *best_len_out = best_len;
-        *best_len_code_out = best_len;
-        *best_distance_out = best_ix;
-        match_found = true;
+        if (best_score < score) {
+          best_score = score;
+          best_len = len;
+          best_ix = backward;
+          *best_len_out = best_len;
+          *best_len_code_out = best_len;
+          *best_distance_out = best_ix;
+          match_found = true;
+        }
       }
     }
+    const uint32_t key = Hash<kBucketBits, kMinLength>(&data[cur_ix_masked]);
+    const int * __restrict const bucket = &buckets_[key][0];
+    const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
     for (int i = num_[key] - 1; i >= down; --i) {
       int prev_ix = bucket[i & kBlockMask];
-      if (prev_ix < 0) {
-        prev_ix *= -1;
-        prev_ix -= 1;
-        int copy_len_code = prev_ix >> 20;
-        int word_id = prev_ix & 0xfffff;
-        std::string word = GetTransformedDictionaryWord(copy_len_code, word_id);
-        int len = word.size();
-        const size_t backward = max_backward + word_id + 1;
-        bool word_matched = (len >= 3 && len <= max_length);
-        for (int k = 0; k < len && word_matched; ++k) {
-          if ((uint8_t)(word[k]) != data[cur_ix_masked + k]) {
-            word_matched = false;
-          }
-        }
-        if (word_matched) {
-          const double score = BackwardReferenceScore(average_cost_,
-                                                      start_cost4,
-                                                      start_cost3,
-                                                      start_cost2,
-                                                      len, backward,
-                                                      last_distance1_,
-                                                      last_distance2_,
-                                                      last_distance3_,
-                                                      last_distance4_);
-          if (best_score < score) {
-            best_score = score;
-            best_len = len;
-            best_ix = backward;
-            *best_len_out = best_len;
-            *best_len_code_out = copy_len_code;
-            *best_distance_out = best_ix;
-            *best_score_out = best_score;
-            match_found = true;
-            *in_dictionary = true;
-          }
-        }
-      } else {
+      if (prev_ix >= 0) {
         const size_t backward = cur_ix - prev_ix;
         if (PREDICT_FALSE(backward > max_backward)) {
           break;
@@ -327,7 +312,7 @@ class HashLongestMatch {
         const size_t len =
             FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
                                      max_length);
-        if (len >= 3) {
+        if (len >= std::max(kMinLength, 3)) {
           // Comparing for >= 3 does not change the semantics, but just saves
           // for a few unnecessary binary logarithms in backward reference
           // score, since we are not interested in such short matches.
@@ -335,11 +320,7 @@ class HashLongestMatch {
                                                       start_cost4,
                                                       start_cost3,
                                                       start_cost2,
-                                                      len, backward,
-                                                      last_distance1_,
-                                                      last_distance2_,
-                                                      last_distance3_,
-                                                      last_distance4_);
+                                                      len, backward);
           if (best_score < score) {
             best_score = score;
             best_len = len;
@@ -354,6 +335,36 @@ class HashLongestMatch {
         }
       }
     }
+    if (static_dict_ != NULL) {
+      // We decide based on first 4 bytes how many bytes to test for.
+      int prefix = BROTLI_UNALIGNED_LOAD32(&data[cur_ix_masked]);
+      int maxlen = static_dict_->GetLength(prefix);
+      for (int len = std::min<size_t>(maxlen, max_length);
+           len > best_len && len >= 4; --len) {
+        std::string snippet((const char *)&data[cur_ix_masked], len);
+        int copy_len_code;
+        int word_id;
+        if (static_dict_->Get(snippet, &copy_len_code, &word_id)) {
+          const size_t backward = max_backward + word_id + 1;
+          const double score = BackwardReferenceScore(average_cost_,
+                                                      start_cost4,
+                                                      start_cost3,
+                                                      start_cost2,
+                                                      len, backward);
+          if (best_score < score) {
+            best_score = score;
+            best_len = len;
+            best_ix = backward;
+            *best_len_out = best_len;
+            *best_len_code_out = copy_len_code;
+            *best_distance_out = best_ix;
+            *best_score_out = best_score;
+            match_found = true;
+            *in_dictionary = true;
+          }
+        }
+      }
+    }
     return match_found;
   }
 
@@ -399,9 +410,37 @@ class HashLongestMatch {
   int insert_length_;
 
   double average_cost_;
+
+  const StaticDictionary *static_dict_;
 };
 
-typedef HashLongestMatch<13, 11> Hasher;
+struct Hashers {
+  enum Type {
+    HASH_15_8_4 = 0,
+    HASH_15_8_2 = 1,
+  };
+
+  void Init(Type type) {
+    switch (type) {
+      case HASH_15_8_4:
+        hash_15_8_4.reset(new HashLongestMatch<15, 8, 4>());
+        break;
+      case HASH_15_8_2:
+        hash_15_8_2.reset(new HashLongestMatch<15, 8, 2>());
+        break;
+      default:
+        break;
+    }
+  }
+
+  void SetStaticDictionary(const StaticDictionary *dict) {
+    if (hash_15_8_4.get() != NULL) hash_15_8_4->SetStaticDictionary(dict);
+    if (hash_15_8_2.get() != NULL) hash_15_8_2->SetStaticDictionary(dict);
+  }
+
+  std::unique_ptr<HashLongestMatch<15, 8, 4> > hash_15_8_4;
+  std::unique_ptr<HashLongestMatch<15, 8, 2> > hash_15_8_2;
+};
 
 }  // namespace brotli
author	Zoltan Szabadka <szabadka@google.com>	2014-03-20 14:32:35 +0100
committer	Zoltan Szabadka <szabadka@google.com>	2014-03-20 14:32:35 +0100
commit	e7650080a8cedc159fc4e89d7f144c28f4b028b4 (patch)
tree	df57cf3742e529f8daefc74c7abb755213e96a48 /enc/hash.h
parent	cddab4adef2cecc2923358c199b793292cab094c (diff)
download	brotli-e7650080a8cedc159fc4e89d7f144c28f4b028b4.zip brotli-e7650080a8cedc159fc4e89d7f144c28f4b028b4.tar.gz brotli-e7650080a8cedc159fc4e89d7f144c28f4b028b4.tar.bz2