aboutsummaryrefslogtreecommitdiff
path: root/enc/hash.h
diff options
context:
space:
mode:
authorZoltan Szabadka <szabadka@google.com>2014-03-20 14:32:35 +0100
committerZoltan Szabadka <szabadka@google.com>2014-03-20 14:32:35 +0100
commite7650080a8cedc159fc4e89d7f144c28f4b028b4 (patch)
treedf57cf3742e529f8daefc74c7abb755213e96a48 /enc/hash.h
parentcddab4adef2cecc2923358c199b793292cab094c (diff)
downloadbrotli-e7650080a8cedc159fc4e89d7f144c28f4b028b4.zip
brotli-e7650080a8cedc159fc4e89d7f144c28f4b028b4.tar.gz
brotli-e7650080a8cedc159fc4e89d7f144c28f4b028b4.tar.bz2
Updates to Brotli compression format, decoder and encoder
This commit contains a batch of changes that were made to the Brotli compression algorithm in the last month. Most important changes: * Format change: don't push distances representing static dictionary words to the distance cache. * Fix decoder invalid memory access bug caused by building a non-complete Huffman tree. * Add a mode parameter to the encoder interface. * Use different hashers for text and font mode. * Add a heuristics to the hasher for skipping non-compressible data. * Exhaustive search of static dictionary during backward reference search.
Diffstat (limited to 'enc/hash.h')
-rw-r--r--enc/hash.h257
1 files changed, 148 insertions, 109 deletions
diff --git a/enc/hash.h b/enc/hash.h
index d6a415b..b07a538 100644
--- a/enc/hash.h
+++ b/enc/hash.h
@@ -24,12 +24,14 @@
#include <sys/types.h>
#include <algorithm>
#include <cstdlib>
+#include <memory>
#include <string>
#include "./transform.h"
#include "./fast_log.h"
#include "./find_match_length.h"
#include "./port.h"
+#include "./static_dict.h"
namespace brotli {
@@ -41,11 +43,21 @@ namespace brotli {
// * The number has been tuned heuristically against compression benchmarks.
static const uint32_t kHashMul32 = 0x1e35a7bd;
-inline uint32_t Hash3Bytes(const uint8_t *data, const int bits) {
- uint32_t h = (BROTLI_UNALIGNED_LOAD32(data) & 0xffffff) * kHashMul32;
- // The higher bits contain more mixture from the multiplication,
- // so we take our results from there.
- return h >> (32 - bits);
+template<int kShiftBits, int kMinLength>
+inline uint32_t Hash(const uint8_t *data) {
+ if (kMinLength <= 3) {
+ // If kMinLength is 2 or 3, we hash the first 3 bytes of data.
+ uint32_t h = (BROTLI_UNALIGNED_LOAD32(data) & 0xffffff) * kHashMul32;
+ // The higher bits contain more mixture from the multiplication,
+ // so we take our results from there.
+ return h >> (32 - kShiftBits);
+ } else {
+ // If kMinLength is at least 4, we hash the first 4 bytes of data.
+ uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
+ // The higher bits contain more mixture from the multiplication,
+ // so we take our results from there.
+ return h >> (32 - kShiftBits);
+ }
}
// Usually, we always choose the longest backward reference. This function
@@ -67,32 +79,35 @@ inline double BackwardReferenceScore(double average_cost,
double start_cost3,
double start_cost2,
int copy_length,
- int backward_reference_offset,
- int last_distance1,
- int last_distance2,
- int last_distance3,
- int last_distance4) {
+ int backward_reference_offset) {
double retval = 0;
switch (copy_length) {
case 2: retval = start_cost2; break;
case 3: retval = start_cost3; break;
default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
}
- int diff_last1 = abs(backward_reference_offset - last_distance1);
- int diff_last2 = abs(backward_reference_offset - last_distance2);
- if (diff_last1 == 0) {
- retval += 0.6;
- } else if (diff_last1 < 4) {
- retval -= 0.9 + 0.03 * diff_last1;
- } else if (diff_last2 < 4) {
- retval -= 0.95 + 0.1 * diff_last2;
- } else if (backward_reference_offset == last_distance3) {
- retval -= 1.17;
- } else if (backward_reference_offset == last_distance4) {
- retval -= 1.27;
- } else {
- retval -= 1.20 * Log2Floor(backward_reference_offset);
+ retval -= 1.20 * Log2Floor(backward_reference_offset);
+ return retval;
+}
+
+inline double BackwardReferenceScoreUsingLastDistance(double average_cost,
+ double start_cost4,
+ double start_cost3,
+ double start_cost2,
+ int copy_length,
+ int distance_short_code) {
+ double retval = 0;
+ switch (copy_length) {
+ case 2: retval = start_cost2; break;
+ case 3: retval = start_cost3; break;
+ default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
}
+ static const double kDistanceShortCodeBitCost[16] = {
+ -0.6, 0.95, 1.17, 1.27,
+ 0.93, 0.93, 0.96, 0.96, 0.99, 0.99,
+ 1.05, 1.05, 1.15, 1.15, 1.25, 1.25
+ };
+ retval -= kDistanceShortCodeBitCost[distance_short_code];
return retval;
}
@@ -102,7 +117,7 @@ inline double BackwardReferenceScore(double average_cost,
// This is a hash map of fixed size (kBucketSize) to a ring buffer of
// fixed size (kBlockSize). The ring buffer contains the last kBlockSize
// index positions of the given hash key in the compressed data.
-template <int kBucketBits, int kBlockBits>
+template <int kBucketBits, int kBlockBits, int kMinLength>
class HashLongestMatch {
public:
HashLongestMatch()
@@ -111,17 +126,24 @@ class HashLongestMatch {
last_distance3_(15),
last_distance4_(16),
insert_length_(0),
- average_cost_(5.4) {
+ average_cost_(5.4),
+ static_dict_(NULL) {
Reset();
}
void Reset() {
std::fill(&num_[0], &num_[sizeof(num_) / sizeof(num_[0])], 0);
}
+ void SetStaticDictionary(const StaticDictionary *dict) {
+ static_dict_ = dict;
+ }
+ bool HasStaticDictionary() const {
+ return static_dict_ != NULL;
+ }
// Look at 3 bytes at data.
// Compute a hash from these, and store the value of ix at that position.
inline void Store(const uint8_t *data, const int ix) {
- const uint32_t key = Hash3Bytes(data, kBucketBits);
+ const uint32_t key = Hash<kBucketBits, kMinLength>(data);
const int minor_ix = num_[key] & kBlockMask;
buckets_[key][minor_ix] = ix;
++num_[key];
@@ -218,19 +240,17 @@ class HashLongestMatch {
const size_t len =
FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
max_length);
- if (len >= 3 || (len == 2 && i < 2)) {
+ if (len >= std::max(kMinLength, 3) ||
+ (kMinLength == 2 && len == 2 && i < 2)) {
// Comparing for >= 2 does not change the semantics, but just saves for
// a few unnecessary binary logarithms in backward reference score,
// since we are not interested in such short matches.
- const double score = BackwardReferenceScore(average_cost_,
- start_cost4,
- start_cost3,
- start_cost2,
- len, backward,
- last_distance1_,
- last_distance2_,
- last_distance3_,
- last_distance4_);
+ const double score = BackwardReferenceScoreUsingLastDistance(
+ average_cost_,
+ start_cost4,
+ start_cost3,
+ start_cost2,
+ len, i);
if (best_score < score) {
best_score = score;
best_len = len;
@@ -244,76 +264,41 @@ class HashLongestMatch {
}
}
}
- const uint32_t key = Hash3Bytes(&data[cur_ix_masked], kBucketBits);
- const int * __restrict const bucket = &buckets_[key][0];
- const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
- int stop = int(cur_ix) - 64;
- if (stop < 0) { stop = 0; }
+ if (kMinLength == 2) {
+ int stop = int(cur_ix) - 64;
+ if (stop < 0) { stop = 0; }
+ start_cost2 -= 1.0;
+ for (int i = cur_ix - 1; i > stop; --i) {
+ size_t prev_ix = i;
+ const size_t backward = cur_ix - prev_ix;
+ if (PREDICT_FALSE(backward > max_backward)) {
+ break;
+ }
+ prev_ix &= ring_buffer_mask;
+ if (data[cur_ix_masked] != data[prev_ix] ||
+ data[cur_ix_masked + 1] != data[prev_ix + 1]) {
+ continue;
+ }
+ int len = 2;
+ const double score = start_cost2 - 2.3 * Log2Floor(backward);
- start_cost2 -= 1.0;
- for (int i = cur_ix - 1; i > stop; --i) {
- size_t prev_ix = i;
- const size_t backward = cur_ix - prev_ix;
- if (PREDICT_FALSE(backward > max_backward)) {
- break;
- }
- prev_ix &= ring_buffer_mask;
- if (data[cur_ix_masked] != data[prev_ix] ||
- data[cur_ix_masked + 1] != data[prev_ix + 1]) {
- continue;
- }
- int len = 2;
- const double score = start_cost2 - 2.3 * Log2Floor(backward);
-
- if (best_score < score) {
- best_score = score;
- best_len = len;
- best_ix = backward;
- *best_len_out = best_len;
- *best_len_code_out = best_len;
- *best_distance_out = best_ix;
- match_found = true;
+ if (best_score < score) {
+ best_score = score;
+ best_len = len;
+ best_ix = backward;
+ *best_len_out = best_len;
+ *best_len_code_out = best_len;
+ *best_distance_out = best_ix;
+ match_found = true;
+ }
}
}
+ const uint32_t key = Hash<kBucketBits, kMinLength>(&data[cur_ix_masked]);
+ const int * __restrict const bucket = &buckets_[key][0];
+ const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
for (int i = num_[key] - 1; i >= down; --i) {
int prev_ix = bucket[i & kBlockMask];
- if (prev_ix < 0) {
- prev_ix *= -1;
- prev_ix -= 1;
- int copy_len_code = prev_ix >> 20;
- int word_id = prev_ix & 0xfffff;
- std::string word = GetTransformedDictionaryWord(copy_len_code, word_id);
- int len = word.size();
- const size_t backward = max_backward + word_id + 1;
- bool word_matched = (len >= 3 && len <= max_length);
- for (int k = 0; k < len && word_matched; ++k) {
- if ((uint8_t)(word[k]) != data[cur_ix_masked + k]) {
- word_matched = false;
- }
- }
- if (word_matched) {
- const double score = BackwardReferenceScore(average_cost_,
- start_cost4,
- start_cost3,
- start_cost2,
- len, backward,
- last_distance1_,
- last_distance2_,
- last_distance3_,
- last_distance4_);
- if (best_score < score) {
- best_score = score;
- best_len = len;
- best_ix = backward;
- *best_len_out = best_len;
- *best_len_code_out = copy_len_code;
- *best_distance_out = best_ix;
- *best_score_out = best_score;
- match_found = true;
- *in_dictionary = true;
- }
- }
- } else {
+ if (prev_ix >= 0) {
const size_t backward = cur_ix - prev_ix;
if (PREDICT_FALSE(backward > max_backward)) {
break;
@@ -327,7 +312,7 @@ class HashLongestMatch {
const size_t len =
FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
max_length);
- if (len >= 3) {
+ if (len >= std::max(kMinLength, 3)) {
// Comparing for >= 3 does not change the semantics, but just saves
// for a few unnecessary binary logarithms in backward reference
// score, since we are not interested in such short matches.
@@ -335,11 +320,7 @@ class HashLongestMatch {
start_cost4,
start_cost3,
start_cost2,
- len, backward,
- last_distance1_,
- last_distance2_,
- last_distance3_,
- last_distance4_);
+ len, backward);
if (best_score < score) {
best_score = score;
best_len = len;
@@ -354,6 +335,36 @@ class HashLongestMatch {
}
}
}
+ if (static_dict_ != NULL) {
+ // We decide based on first 4 bytes how many bytes to test for.
+ int prefix = BROTLI_UNALIGNED_LOAD32(&data[cur_ix_masked]);
+ int maxlen = static_dict_->GetLength(prefix);
+ for (int len = std::min<size_t>(maxlen, max_length);
+ len > best_len && len >= 4; --len) {
+ std::string snippet((const char *)&data[cur_ix_masked], len);
+ int copy_len_code;
+ int word_id;
+ if (static_dict_->Get(snippet, &copy_len_code, &word_id)) {
+ const size_t backward = max_backward + word_id + 1;
+ const double score = BackwardReferenceScore(average_cost_,
+ start_cost4,
+ start_cost3,
+ start_cost2,
+ len, backward);
+ if (best_score < score) {
+ best_score = score;
+ best_len = len;
+ best_ix = backward;
+ *best_len_out = best_len;
+ *best_len_code_out = copy_len_code;
+ *best_distance_out = best_ix;
+ *best_score_out = best_score;
+ match_found = true;
+ *in_dictionary = true;
+ }
+ }
+ }
+ }
return match_found;
}
@@ -399,9 +410,37 @@ class HashLongestMatch {
int insert_length_;
double average_cost_;
+
+ const StaticDictionary *static_dict_;
};
-typedef HashLongestMatch<13, 11> Hasher;
+struct Hashers {
+ enum Type {
+ HASH_15_8_4 = 0,
+ HASH_15_8_2 = 1,
+ };
+
+ void Init(Type type) {
+ switch (type) {
+ case HASH_15_8_4:
+ hash_15_8_4.reset(new HashLongestMatch<15, 8, 4>());
+ break;
+ case HASH_15_8_2:
+ hash_15_8_2.reset(new HashLongestMatch<15, 8, 2>());
+ break;
+ default:
+ break;
+ }
+ }
+
+ void SetStaticDictionary(const StaticDictionary *dict) {
+ if (hash_15_8_4.get() != NULL) hash_15_8_4->SetStaticDictionary(dict);
+ if (hash_15_8_2.get() != NULL) hash_15_8_2->SetStaticDictionary(dict);
+ }
+
+ std::unique_ptr<HashLongestMatch<15, 8, 4> > hash_15_8_4;
+ std::unique_ptr<HashLongestMatch<15, 8, 2> > hash_15_8_2;
+};
} // namespace brotli