aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEugene Kliuchnikov <eustas@google.com>2017-03-21 16:08:23 +0100
committerGitHub <noreply@github.com>2017-03-21 16:08:23 +0100
commit8a06e02935abadcfff46099c43c879a677d56280 (patch)
treeef1daa0494093ded043288bd1e9f9b5f3a4f8dab
parent1ff78b877f0138064f0a0513267e8355affd4be8 (diff)
downloadbrotli-8a06e02935abadcfff46099c43c879a677d56280.zip
brotli-8a06e02935abadcfff46099c43c879a677d56280.tar.gz
brotli-8a06e02935abadcfff46099c43c879a677d56280.tar.bz2
Better compression (#523)
Better compression: * use more complex content modeling on 1MiB+ files
-rw-r--r--enc/encode.c95
-rw-r--r--enc/metablock.c23
2 files changed, 107 insertions, 11 deletions
diff --git a/enc/encode.c b/enc/encode.c
index dc0ddc1..372af65 100644
--- a/enc/encode.c
+++ b/enc/encode.c
@@ -381,12 +381,100 @@ static void ChooseContextMap(int quality,
}
}
+/* Decide if we want to use a more complex static context map containing 13
+ context values, based on the entropy reduction of histograms over the
+ first 5 bits of literals. */
+static BROTLI_BOOL ShouldUseComplexStaticContextMap(const uint8_t* input,
+ size_t start_pos, size_t length, size_t mask, int quality,
+ size_t size_hint, ContextType* literal_context_mode,
+ size_t* num_literal_contexts, const uint32_t** literal_context_map) {
+ static const uint32_t kStaticContextMapComplexUTF8[64] = {
+ 11, 11, 12, 12, /* 0 special */
+ 0, 0, 0, 0, /* 4 lf */
+ 1, 1, 9, 9, /* 8 space */
+ 2, 2, 2, 2, /* !, first after space/lf and after something else. */
+ 1, 1, 1, 1, /* " */
+ 8, 3, 3, 3, /* % */
+ 1, 1, 1, 1, /* ({[ */
+ 2, 2, 2, 2, /* }]) */
+ 8, 4, 4, 4, /* :; */
+ 8, 7, 4, 4, /* . */
+ 8, 0, 0, 0, /* > */
+ 3, 3, 3, 3, /* [0..9] */
+ 5, 5, 10, 5, /* [A-Z] */
+ 5, 5, 10, 5,
+ 6, 6, 6, 6, /* [a-z] */
+ 6, 6, 6, 6,
+ };
+ BROTLI_UNUSED(quality);
+ /* Try the more complex static context map only for long data. */
+ if (size_hint < (1 << 20)) {
+ return BROTLI_FALSE;
+ } else {
+ const size_t end_pos = start_pos + length;
+ /* To make entropy calculations faster and to fit on the stack, we collect
+ histograms over the 5 most significant bits of literals. One histogram
+ without context and 13 additional histograms for each context value. */
+ uint32_t combined_histo[32] = { 0 };
+ uint32_t context_histo[13][32] = { { 0 } };
+ uint32_t total = 0;
+ double entropy[3];
+ size_t dummy;
+ size_t i;
+ for (; start_pos + 64 <= end_pos; start_pos += 4096) {
+ const size_t stride_end_pos = start_pos + 64;
+ uint8_t prev2 = input[start_pos & mask];
+ uint8_t prev1 = input[(start_pos + 1) & mask];
+ size_t pos;
+ /* To make the analysis of the data faster we only examine 64 byte long
+ strides at every 4kB intervals. */
+ for (pos = start_pos + 2; pos < stride_end_pos; ++pos) {
+ const uint8_t literal = input[pos & mask];
+ const uint8_t context = (uint8_t)kStaticContextMapComplexUTF8[
+ Context(prev1, prev2, CONTEXT_UTF8)];
+ ++total;
+ ++combined_histo[literal >> 3];
+ ++context_histo[context][literal >> 3];
+ prev2 = prev1;
+ prev1 = literal;
+ }
+ }
+ entropy[1] = ShannonEntropy(combined_histo, 32, &dummy);
+ entropy[2] = 0;
+ for (i = 0; i < 13; ++i) {
+ entropy[2] += ShannonEntropy(&context_histo[i][0], 32, &dummy);
+ }
+ entropy[0] = 1.0 / (double)total;
+ entropy[1] *= entropy[0];
+ entropy[2] *= entropy[0];
+ /* The triggering heuristics below were tuned by compressing the individual
+ files of the silesia corpus. If we skip this kind of context modeling
+ for not very well compressible input (i.e. entropy using context modeling
+ is 60% of maximal entropy) or if expected savings by symbol are less
+ than 0.2 bits, then in every case when it triggers, the final compression
+ ratio is improved. Note however that this heuristics might be too strict
+ for some cases and could be tuned further. */
+ if (entropy[2] > 3.0 || entropy[1] - entropy[2] < 0.2) {
+ return BROTLI_FALSE;
+ } else {
+ *literal_context_mode = CONTEXT_UTF8;
+ *num_literal_contexts = 13;
+ *literal_context_map = kStaticContextMapComplexUTF8;
+ return BROTLI_TRUE;
+ }
+ }
+}
+
static void DecideOverLiteralContextModeling(const uint8_t* input,
size_t start_pos, size_t length, size_t mask, int quality,
- ContextType* literal_context_mode, size_t* num_literal_contexts,
- const uint32_t** literal_context_map) {
+ size_t size_hint, ContextType* literal_context_mode,
+ size_t* num_literal_contexts, const uint32_t** literal_context_map) {
if (quality < MIN_QUALITY_FOR_CONTEXT_MODELING || length < 64) {
return;
+ } else if (ShouldUseComplexStaticContextMap(
+ input, start_pos, length, mask, quality, size_hint, literal_context_mode,
+ num_literal_contexts, literal_context_map)) {
+ /* Context map was already set, nothing else to do. */
} else {
/* Gather bi-gram data of the UTF8 byte prefixes. To make the analysis of
UTF8 data faster we only examine 64 byte long strides at every 4kB
@@ -508,7 +596,8 @@ static void WriteMetaBlockInternal(MemoryManager* m,
if (!params->disable_literal_context_modeling) {
DecideOverLiteralContextModeling(
data, wrapped_last_flush_pos, bytes, mask, params->quality,
- &literal_context_mode, &num_literal_contexts, &literal_context_map);
+ params->size_hint, &literal_context_mode, &num_literal_contexts,
+ &literal_context_map);
}
BrotliBuildMetaBlockGreedy(m, data, wrapped_last_flush_pos, mask,
prev_byte, prev_byte2, literal_context_mode, num_literal_contexts,
diff --git a/enc/metablock.c b/enc/metablock.c
index 86118c2..1db76da 100644
--- a/enc/metablock.c
+++ b/enc/metablock.c
@@ -156,7 +156,7 @@ void BrotliBuildMetaBlock(MemoryManager* m,
#include "./metablock_inc.h" /* NOLINT(build/include) */
#undef FN
-#define BROTLI_MAX_STATIC_CONTEXTS 3
+#define BROTLI_MAX_STATIC_CONTEXTS 13
/* Greedy block splitter for one block category (literal, command or distance).
Gathers histograms for all context buckets. */
@@ -241,7 +241,7 @@ static void InitContextBlockSplitter(
(2) emits the current block with the type of the second last block;
(3) merges the current block with the last block. */
static void ContextBlockSplitterFinishBlock(
- ContextBlockSplitter* self, BROTLI_BOOL is_final) {
+ ContextBlockSplitter* self, MemoryManager* m, BROTLI_BOOL is_final) {
BlockSplit* split = self->split_;
const size_t num_contexts = self->num_contexts_;
double* last_entropy = self->last_entropy_;
@@ -275,10 +275,12 @@ static void ContextBlockSplitterFinishBlock(
Decide over the split based on the total reduction of entropy across
all contexts. */
double entropy[BROTLI_MAX_STATIC_CONTEXTS];
- HistogramLiteral combined_histo[2 * BROTLI_MAX_STATIC_CONTEXTS]; /* 6KiB */
+ HistogramLiteral* combined_histo =
+ BROTLI_ALLOC(m, HistogramLiteral, 2 * num_contexts);
double combined_entropy[2 * BROTLI_MAX_STATIC_CONTEXTS];
double diff[2] = { 0.0 };
size_t i;
+ if (BROTLI_IS_OOM(m)) return;
for (i = 0; i < num_contexts; ++i) {
size_t curr_histo_ix = self->curr_histogram_ix_ + i;
size_t j;
@@ -350,6 +352,7 @@ static void ContextBlockSplitterFinishBlock(
self->target_block_size_ += self->min_block_size_;
}
}
+ BROTLI_FREE(m, combined_histo);
}
if (is_final) {
*self->histograms_size_ = split->num_types * num_contexts;
@@ -360,12 +363,14 @@ static void ContextBlockSplitterFinishBlock(
/* Adds the next symbol to the current block type and context. When the
current block reaches the target size, decides on merging the block. */
static void ContextBlockSplitterAddSymbol(
- ContextBlockSplitter* self, size_t symbol, size_t context) {
+ ContextBlockSplitter* self, MemoryManager* m,
+ size_t symbol, size_t context) {
HistogramAddLiteral(&self->histograms_[self->curr_histogram_ix_ + context],
symbol);
++self->block_size_;
if (self->block_size_ == self->target_block_size_) {
- ContextBlockSplitterFinishBlock(self, /* is_final = */ BROTLI_FALSE);
+ ContextBlockSplitterFinishBlock(self, m, /* is_final = */ BROTLI_FALSE);
+ if (BROTLI_IS_OOM(m)) return;
}
}
@@ -437,8 +442,9 @@ static BROTLI_INLINE void BrotliBuildMetaBlockGreedyInternal(
BlockSplitterAddSymbolLiteral(&lit_blocks.plain, literal);
} else {
size_t context = Context(prev_byte, prev_byte2, literal_context_mode);
- ContextBlockSplitterAddSymbol(&lit_blocks.ctx, literal,
- static_context_map[context]);
+ ContextBlockSplitterAddSymbol(&lit_blocks.ctx, m, literal,
+ static_context_map[context]);
+ if (BROTLI_IS_OOM(m)) return;
}
prev_byte2 = prev_byte;
prev_byte = literal;
@@ -459,7 +465,8 @@ static BROTLI_INLINE void BrotliBuildMetaBlockGreedyInternal(
&lit_blocks.plain, /* is_final = */ BROTLI_TRUE);
} else {
ContextBlockSplitterFinishBlock(
- &lit_blocks.ctx, /* is_final = */ BROTLI_TRUE);
+ &lit_blocks.ctx, m, /* is_final = */ BROTLI_TRUE);
+ if (BROTLI_IS_OOM(m)) return;
}
BlockSplitterFinishBlockCommand(&cmd_blocks, /* is_final = */ BROTLI_TRUE);
BlockSplitterFinishBlockDistance(&dist_blocks, /* is_final = */ BROTLI_TRUE);