From 52441069ef31c3c02a4aecad727f2ec5a17ab68b Mon Sep 17 00:00:00 2001 From: Eugene Kliuchnikov Date: Fri, 21 Jul 2017 10:07:24 +0200 Subject: Update (#574) * Update * decoder: better behavior after failure * encoder: replace "len_x_code" with delta * research: add experimental dictionary generator * python: test combing --- c/dec/decode.c | 19 ++- c/enc/backward_references_hq.c | 4 +- c/enc/backward_references_inc.h | 22 +-- c/enc/command.h | 13 +- c/enc/hash.h | 11 +- c/enc/hash_forgetful_chain_inc.h | 15 +- c/enc/hash_longest_match64_inc.h | 15 +- c/enc/hash_longest_match_inc.h | 15 +- c/enc/hash_longest_match_quickly_inc.h | 52 ++++--- c/tools/brotli.c | 12 +- python/tests/_test_utils.py | 6 +- research/deorummolae.cc | 273 +++++++++++++++++++++++++++++++++ research/deorummolae.h | 27 ++++ 13 files changed, 397 insertions(+), 87 deletions(-) create mode 100755 research/deorummolae.cc create mode 100755 research/deorummolae.h diff --git a/c/dec/decode.c b/c/dec/decode.c index 53a9b55..ce97cba 100644 --- a/c/dec/decode.c +++ b/c/dec/decode.c @@ -1942,7 +1942,13 @@ BrotliDecoderResult BrotliDecoderDecompressStream( if (result != BROTLI_DECODER_SUCCESS) { /* Error, needs more input/output */ if (result == BROTLI_DECODER_NEEDS_MORE_INPUT) { if (s->ringbuffer != 0) { /* Pro-actively push output. */ - WriteRingBuffer(s, available_out, next_out, total_out, BROTLI_TRUE); + BrotliDecoderErrorCode intermediate_result = WriteRingBuffer(s, + available_out, next_out, total_out, BROTLI_TRUE); + /* WriteRingBuffer checks s->meta_block_remaining_len validity. */ + if ((int)intermediate_result < 0) { + result = intermediate_result; + break; + } } if (s->buffer_length != 0) { /* Used with internal buffer. */ if (br->avail_in == 0) { /* Successfully finished read transaction. */ @@ -2327,6 +2333,10 @@ void BrotliDecoderSetCustomDictionary( } BROTLI_BOOL BrotliDecoderHasMoreOutput(const BrotliDecoderState* s) { + /* After unrecoverable error remaining output is considered nonsensical. */ + if ((int)s->error_code < 0) { + return BROTLI_FALSE; + } return TO_BROTLI_BOOL( s->ringbuffer != 0 && UnwrittenBytes(s, BROTLI_FALSE) != 0); } @@ -2336,17 +2346,20 @@ const uint8_t* BrotliDecoderTakeOutput(BrotliDecoderState* s, size_t* size) { size_t available_out = *size ? *size : 1u << 24; size_t requested_out = available_out; BrotliDecoderErrorCode status; - if (s->ringbuffer == 0) { + if ((s->ringbuffer == 0) || ((int)s->error_code < 0)) { *size = 0; return 0; } WrapRingBuffer(s); status = WriteRingBuffer(s, &available_out, &result, 0, BROTLI_TRUE); + /* Either WriteRingBuffer returns those "success" codes... */ if (status == BROTLI_DECODER_SUCCESS || status == BROTLI_DECODER_NEEDS_MORE_OUTPUT) { *size = requested_out - available_out; } else { - /* This might happen if previous decoder error code was ignored. */ + /* ... or stream is broken. Normally this should be caught by + BrotliDecoderDecompressStream, this is just a safeguard. */ + if ((int)status < 0) SaveErrorCode(s, status); *size = 0; result = 0; } diff --git a/c/enc/backward_references_hq.c b/c/enc/backward_references_hq.c index 5150ae1..d766487 100755 --- a/c/enc/backward_references_hq.c +++ b/c/enc/backward_references_hq.c @@ -549,8 +549,8 @@ void BrotliZopfliCreateCommands(const size_t num_bytes, BROTLI_BOOL is_dictionary = TO_BROTLI_BOOL(distance > max_distance); size_t dist_code = ZopfliNodeDistanceCode(next); - InitCommand( - &commands[i], insert_length, copy_length, len_code, dist_code); + InitCommand(&commands[i], insert_length, + copy_length, (int)len_code - (int)copy_length, dist_code); if (!is_dictionary && dist_code > 0) { dist_cache[3] = dist_cache[2]; diff --git a/c/enc/backward_references_inc.h b/c/enc/backward_references_inc.h index 0479dfd..e7b1665 100644 --- a/c/enc/backward_references_inc.h +++ b/c/enc/backward_references_inc.h @@ -38,29 +38,29 @@ static BROTLI_NOINLINE void FN(CreateBackwardReferences)( size_t max_distance = BROTLI_MIN(size_t, position, max_backward_limit); HasherSearchResult sr; sr.len = 0; - sr.len_x_code = 0; + sr.len_code_delta = 0; sr.distance = 0; sr.score = kMinScore; - if (FN(FindLongestMatch)(hasher, dictionary, dictionary_hash, - ringbuffer, ringbuffer_mask, dist_cache, - position, max_length, max_distance, &sr)) { + FN(FindLongestMatch)(hasher, dictionary, dictionary_hash, ringbuffer, + ringbuffer_mask, dist_cache, position, + max_length, max_distance, &sr); + if (sr.score > kMinScore) { /* Found a match. Let's look for something even better ahead. */ int delayed_backward_references_in_row = 0; --max_length; for (;; --max_length) { const score_t cost_diff_lazy = 175; - BROTLI_BOOL is_match_found; HasherSearchResult sr2; sr2.len = params->quality < MIN_QUALITY_FOR_EXTENSIVE_REFERENCE_SEARCH ? BROTLI_MIN(size_t, sr.len - 1, max_length) : 0; - sr2.len_x_code = 0; + sr2.len_code_delta = 0; sr2.distance = 0; sr2.score = kMinScore; max_distance = BROTLI_MIN(size_t, position + 1, max_backward_limit); - is_match_found = FN(FindLongestMatch)(hasher, dictionary, - dictionary_hash, ringbuffer, ringbuffer_mask, dist_cache, - position + 1, max_length, max_distance, &sr2); - if (is_match_found && sr2.score >= sr.score + cost_diff_lazy) { + FN(FindLongestMatch)(hasher, dictionary, dictionary_hash, ringbuffer, + ringbuffer_mask, dist_cache, position + 1, + max_length, max_distance, &sr2); + if (sr2.score >= sr.score + cost_diff_lazy) { /* Ok, let's just write one byte for now and start a match from the next byte. */ ++position; @@ -88,7 +88,7 @@ static BROTLI_NOINLINE void FN(CreateBackwardReferences)( dist_cache[0] = (int)sr.distance; FN(PrepareDistanceCache)(hasher, dist_cache); } - InitCommand(commands++, insert_length, sr.len, sr.len ^ sr.len_x_code, + InitCommand(commands++, insert_length, sr.len, sr.len_code_delta, distance_code); } *num_literals += insert_length; diff --git a/c/enc/command.h b/c/enc/command.h index 67ac981..632318e 100644 --- a/c/enc/command.h +++ b/c/enc/command.h @@ -114,17 +114,19 @@ typedef struct Command { /* distance_code is e.g. 0 for same-as-last short code, or 16 for offset 1. */ static BROTLI_INLINE void InitCommand(Command* self, size_t insertlen, - size_t copylen, size_t copylen_code, size_t distance_code) { + size_t copylen, int copylen_code_delta, size_t distance_code) { + /* Don't rely on signed int representation, use honest casts. */ + uint32_t delta = (uint8_t)((int8_t)copylen_code_delta); self->insert_len_ = (uint32_t)insertlen; - self->copy_len_ = (uint32_t)(copylen | ((copylen_code ^ copylen) << 24)); + self->copy_len_ = (uint32_t)(copylen | (delta << 24)); /* The distance prefix and extra bits are stored in this Command as if npostfix and ndirect were 0, they are only recomputed later after the clustering if needed. */ PrefixEncodeCopyDistance( distance_code, 0, 0, &self->dist_prefix_, &self->dist_extra_); GetLengthCode( - insertlen, copylen_code, TO_BROTLI_BOOL(self->dist_prefix_ == 0), - &self->cmd_prefix_); + insertlen, (size_t)((int)copylen + copylen_code_delta), + TO_BROTLI_BOOL(self->dist_prefix_ == 0), &self->cmd_prefix_); } static BROTLI_INLINE void InitInsertCommand(Command* self, size_t insertlen) { @@ -167,7 +169,8 @@ static BROTLI_INLINE uint32_t CommandCopyLen(const Command* self) { } static BROTLI_INLINE uint32_t CommandCopyLenCode(const Command* self) { - return (self->copy_len_ & 0xFFFFFF) ^ (self->copy_len_ >> 24); + int32_t delta = (int8_t)((uint8_t)(self->copy_len_ >> 24)); + return (uint32_t)((int32_t)(self->copy_len_ & 0xFFFFFF) + delta); } #if defined(__cplusplus) || defined(c_plusplus) diff --git a/c/enc/hash.h b/c/enc/hash.h index 4c94cda..f97d969 100644 --- a/c/enc/hash.h +++ b/c/enc/hash.h @@ -62,9 +62,9 @@ static const uint64_t kCutoffTransforms = typedef struct HasherSearchResult { size_t len; - size_t len_x_code; /* == len ^ len_code */ size_t distance; score_t score; + int len_code_delta; /* == len_code - len */ } HasherSearchResult; /* kHashMul32 multiplier has these properties: @@ -178,22 +178,21 @@ static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem( return BROTLI_FALSE; } out->len = matchlen; - out->len_x_code = len ^ matchlen; + out->len_code_delta = (int)len - (int)matchlen; out->distance = backward; out->score = score; return BROTLI_TRUE; } -static BROTLI_INLINE BROTLI_BOOL SearchInStaticDictionary( +static BROTLI_INLINE void SearchInStaticDictionary( const BrotliDictionary* dictionary, const uint16_t* dictionary_hash, HasherHandle handle, const uint8_t* data, size_t max_length, size_t max_backward, HasherSearchResult* out, BROTLI_BOOL shallow) { size_t key; size_t i; - BROTLI_BOOL is_match_found = BROTLI_FALSE; HasherCommon* self = GetHasherCommon(handle); if (self->dict_num_matches < (self->dict_num_lookups >> 7)) { - return BROTLI_FALSE; + return; } key = Hash14(data) << 1; for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) { @@ -204,11 +203,9 @@ static BROTLI_INLINE BROTLI_BOOL SearchInStaticDictionary( dictionary, item, data, max_length, max_backward, out); if (item_matches) { self->dict_num_matches++; - is_match_found = BROTLI_TRUE; } } } - return is_match_found; } typedef struct BackwardMatch { diff --git a/c/enc/hash_forgetful_chain_inc.h b/c/enc/hash_forgetful_chain_inc.h index a49fa6d..88b0841 100755 --- a/c/enc/hash_forgetful_chain_inc.h +++ b/c/enc/hash_forgetful_chain_inc.h @@ -152,8 +152,8 @@ static BROTLI_INLINE void FN(PrepareDistanceCache)( Does not look for matches longer than max_length. Does not look for matches further away than max_backward. Writes the best match into |out|. - Returns 1 when match is found, otherwise 0. */ -static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, + |out|->score is updated only if a better match is found. */ +static BROTLI_INLINE void FN(FindLongestMatch)(HasherHandle handle, const BrotliDictionary* dictionary, const uint16_t* dictionary_hash, const uint8_t* BROTLI_RESTRICT data, const size_t ring_buffer_mask, const int* BROTLI_RESTRICT distance_cache, @@ -161,15 +161,15 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, HasherSearchResult* BROTLI_RESTRICT out) { HashForgetfulChain* self = FN(Self)(handle); const size_t cur_ix_masked = cur_ix & ring_buffer_mask; - BROTLI_BOOL is_match_found = BROTLI_FALSE; /* Don't accept a short copy from far away. */ + score_t min_score = out->score; score_t best_score = out->score; size_t best_len = out->len; size_t i; const size_t key = FN(HashBytes)(&data[cur_ix_masked]); const uint8_t tiny_hash = (uint8_t)(key); out->len = 0; - out->len_x_code = 0; + out->len_code_delta = 0; /* Try last distance first. */ for (i = 0; i < NUM_LAST_DISTANCES_TO_CHECK; ++i) { const size_t backward = (size_t)distance_cache[i]; @@ -194,7 +194,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, out->len = best_len; out->distance = backward; out->score = best_score; - is_match_found = BROTLI_TRUE; } } } @@ -234,19 +233,17 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, out->len = best_len; out->distance = backward; out->score = best_score; - is_match_found = BROTLI_TRUE; } } } } FN(Store)(handle, data, ring_buffer_mask, cur_ix); } - if (!is_match_found) { - is_match_found = SearchInStaticDictionary(dictionary, dictionary_hash, + if (out->score == min_score) { + SearchInStaticDictionary(dictionary, dictionary_hash, handle, &data[cur_ix_masked], max_length, max_backward, out, BROTLI_FALSE); } - return is_match_found; } #undef BANK_SIZE diff --git a/c/enc/hash_longest_match64_inc.h b/c/enc/hash_longest_match64_inc.h index 7d4199f..1581770 100755 --- a/c/enc/hash_longest_match64_inc.h +++ b/c/enc/hash_longest_match64_inc.h @@ -156,8 +156,8 @@ static BROTLI_INLINE void FN(PrepareDistanceCache)( Does not look for matches longer than max_length. Does not look for matches further away than max_backward. Writes the best match into |out|. - Returns true when match is found, otherwise false. */ -static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, + |out|->score is updated only if a better match is found. */ +static BROTLI_INLINE void FN(FindLongestMatch)(HasherHandle handle, const BrotliDictionary* dictionary, const uint16_t* dictionary_hash, const uint8_t* BROTLI_RESTRICT data, const size_t ring_buffer_mask, const int* BROTLI_RESTRICT distance_cache, const size_t cur_ix, @@ -168,13 +168,13 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, uint16_t* num = FN(Num)(self); uint32_t* buckets = FN(Buckets)(self); const size_t cur_ix_masked = cur_ix & ring_buffer_mask; - BROTLI_BOOL is_match_found = BROTLI_FALSE; /* Don't accept a short copy from far away. */ + score_t min_score = out->score; score_t best_score = out->score; size_t best_len = out->len; size_t i; out->len = 0; - out->len_x_code = 0; + out->len_code_delta = 0; /* Try last distance first. */ for (i = 0; i < (size_t)common->params.num_last_distances_to_check; ++i) { const size_t backward = (size_t)distance_cache[i]; @@ -209,7 +209,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, out->len = best_len; out->distance = backward; out->score = best_score; - is_match_found = BROTLI_TRUE; } } } @@ -250,7 +249,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, out->len = best_len; out->distance = backward; out->score = best_score; - is_match_found = BROTLI_TRUE; } } } @@ -258,12 +256,11 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, bucket[num[key] & self->block_mask_] = (uint32_t)cur_ix; ++num[key]; } - if (!is_match_found) { - is_match_found = SearchInStaticDictionary(dictionary, dictionary_hash, + if (min_score == out->score) { + SearchInStaticDictionary(dictionary, dictionary_hash, handle, &data[cur_ix_masked], max_length, max_backward, out, BROTLI_FALSE); } - return is_match_found; } #undef HashLongestMatch diff --git a/c/enc/hash_longest_match_inc.h b/c/enc/hash_longest_match_inc.h index 6913c73..fe26206 100644 --- a/c/enc/hash_longest_match_inc.h +++ b/c/enc/hash_longest_match_inc.h @@ -149,8 +149,8 @@ static BROTLI_INLINE void FN(PrepareDistanceCache)( Does not look for matches longer than max_length. Does not look for matches further away than max_backward. Writes the best match into |out|. - Returns true when match is found, otherwise false. */ -static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, + |out|->score is updated only if a better match is found. */ +static BROTLI_INLINE void FN(FindLongestMatch)(HasherHandle handle, const BrotliDictionary* dictionary, const uint16_t* dictionary_hash, const uint8_t* BROTLI_RESTRICT data, const size_t ring_buffer_mask, const int* BROTLI_RESTRICT distance_cache, const size_t cur_ix, @@ -161,13 +161,13 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, uint16_t* num = FN(Num)(self); uint32_t* buckets = FN(Buckets)(self); const size_t cur_ix_masked = cur_ix & ring_buffer_mask; - BROTLI_BOOL is_match_found = BROTLI_FALSE; /* Don't accept a short copy from far away. */ + score_t min_score = out->score; score_t best_score = out->score; size_t best_len = out->len; size_t i; out->len = 0; - out->len_x_code = 0; + out->len_code_delta = 0; /* Try last distance first. */ for (i = 0; i < (size_t)common->params.num_last_distances_to_check; ++i) { const size_t backward = (size_t)distance_cache[i]; @@ -202,7 +202,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, out->len = best_len; out->distance = backward; out->score = best_score; - is_match_found = BROTLI_TRUE; } } } @@ -242,7 +241,6 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, out->len = best_len; out->distance = backward; out->score = best_score; - is_match_found = BROTLI_TRUE; } } } @@ -250,12 +248,11 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)(HasherHandle handle, bucket[num[key] & self->block_mask_] = (uint32_t)cur_ix; ++num[key]; } - if (!is_match_found) { - is_match_found = SearchInStaticDictionary(dictionary, dictionary_hash, + if (min_score == out->score) { + SearchInStaticDictionary(dictionary, dictionary_hash, handle, &data[cur_ix_masked], max_length, max_backward, out, BROTLI_FALSE); } - return is_match_found; } #undef HashLongestMatch diff --git a/c/enc/hash_longest_match_quickly_inc.h b/c/enc/hash_longest_match_quickly_inc.h index ec1553e..b8ddc31 100644 --- a/c/enc/hash_longest_match_quickly_inc.h +++ b/c/enc/hash_longest_match_quickly_inc.h @@ -123,8 +123,8 @@ static BROTLI_INLINE void FN(PrepareDistanceCache)( Does not look for matches longer than max_length. Does not look for matches further away than max_backward. Writes the best match into |out|. - Returns true if match is found, otherwise false. */ -static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)( + |out|->score is updated only if a better match is found. */ +static BROTLI_INLINE void FN(FindLongestMatch)( HasherHandle handle, const BrotliDictionary* dictionary, const uint16_t* dictionary_hash, const uint8_t* BROTLI_RESTRICT data, const size_t ring_buffer_mask, const int* BROTLI_RESTRICT distance_cache, @@ -135,12 +135,12 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)( const size_t cur_ix_masked = cur_ix & ring_buffer_mask; const uint32_t key = FN(HashBytes)(&data[cur_ix_masked]); int compare_char = data[cur_ix_masked + best_len_in]; + score_t min_score = out->score; score_t best_score = out->score; size_t best_len = best_len_in; size_t cached_backward = (size_t)distance_cache[0]; size_t prev_ix = cur_ix - cached_backward; - BROTLI_BOOL is_match_found = BROTLI_FALSE; - out->len_x_code = 0; + out->len_code_delta = 0; if (prev_ix < cur_ix) { prev_ix &= (uint32_t)ring_buffer_mask; if (compare_char == data[prev_ix + best_len]) { @@ -148,17 +148,18 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)( &data[cur_ix_masked], max_length); if (len >= 4) { - best_score = BackwardReferenceScoreUsingLastDistance(len); - best_len = len; - out->len = len; - out->distance = cached_backward; - out->score = best_score; - compare_char = data[cur_ix_masked + best_len]; - if (BUCKET_SWEEP == 1) { - self->buckets_[key] = (uint32_t)cur_ix; - return BROTLI_TRUE; - } else { - is_match_found = BROTLI_TRUE; + const score_t score = BackwardReferenceScoreUsingLastDistance(len); + if (best_score < score) { + best_score = score; + best_len = len; + out->len = len; + out->distance = cached_backward; + out->score = best_score; + compare_char = data[cur_ix_masked + best_len]; + if (BUCKET_SWEEP == 1) { + self->buckets_[key] = (uint32_t)cur_ix; + return; + } } } } @@ -172,19 +173,22 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)( backward = cur_ix - prev_ix; prev_ix &= (uint32_t)ring_buffer_mask; if (compare_char != data[prev_ix + best_len_in]) { - return BROTLI_FALSE; + return; } if (BROTLI_PREDICT_FALSE(backward == 0 || backward > max_backward)) { - return BROTLI_FALSE; + return; } len = FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked], max_length); if (len >= 4) { - out->len = len; - out->distance = backward; - out->score = BackwardReferenceScore(len, backward); - return BROTLI_TRUE; + const score_t score = BackwardReferenceScore(len, backward); + if (best_score < score) { + out->len = len; + out->distance = backward; + out->score = score; + return; + } } } else { uint32_t *bucket = self->buckets_ + key; @@ -212,18 +216,16 @@ static BROTLI_INLINE BROTLI_BOOL FN(FindLongestMatch)( out->distance = backward; out->score = score; compare_char = data[cur_ix_masked + best_len]; - is_match_found = BROTLI_TRUE; } } } } - if (USE_DICTIONARY && !is_match_found) { - is_match_found = SearchInStaticDictionary(dictionary, dictionary_hash, + if (USE_DICTIONARY && min_score == out->score) { + SearchInStaticDictionary(dictionary, dictionary_hash, handle, &data[cur_ix_masked], max_length, max_backward, out, BROTLI_TRUE); } self->buckets_[key + ((cur_ix >> 3) % BUCKET_SWEEP)] = (uint32_t)cur_ix; - return is_match_found; } #undef HASH_MAP_SIZE diff --git a/c/tools/brotli.c b/c/tools/brotli.c index 637e94e..8b6f9e2 100755 --- a/c/tools/brotli.c +++ b/c/tools/brotli.c @@ -681,12 +681,14 @@ static BROTLI_BOOL CloseFiles(Context* context, BROTLI_BOOL success) { } } - if (fclose(context->fin) != 0) { - if (is_ok) { - fprintf(stderr, "fclose failed [%s]: %s\n", - PrintablePath(context->current_input_path), strerror(errno)); + if (context->fin) { + if (fclose(context->fin) != 0) { + if (is_ok) { + fprintf(stderr, "fclose failed [%s]: %s\n", + PrintablePath(context->current_input_path), strerror(errno)); + } + is_ok = BROTLI_FALSE; } - is_ok = BROTLI_FALSE; } if (success && context->junk_source) { unlink(context->current_input_path); diff --git a/python/tests/_test_utils.py b/python/tests/_test_utils.py index 5d1842e..e38edd3 100644 --- a/python/tests/_test_utils.py +++ b/python/tests/_test_utils.py @@ -10,10 +10,12 @@ import unittest project_dir = os.path.abspath(os.path.join(__file__, '..', '..', '..')) +src_dir = os.path.join(project_dir, 'python') +test_dir = os.path.join(project_dir, 'tests') PYTHON = sys.executable or 'python' -BRO = os.path.join(project_dir, 'python', 'bro.py') +BRO = os.path.join(src_dir, 'bro.py') # Get the platform/version-specific build folder. # By default, the distutils build base is in the same location as setup.py. @@ -30,7 +32,7 @@ if 'PYTHONPATH' not in TEST_ENV: else: TEST_ENV['PYTHONPATH'] = build_dir + os.pathsep + TEST_ENV['PYTHONPATH'] -TESTDATA_DIR = os.path.join(project_dir, 'tests', 'testdata') +TESTDATA_DIR = os.path.join(test_dir, 'testdata') TESTDATA_FILES = [ 'empty', # Empty file diff --git a/research/deorummolae.cc b/research/deorummolae.cc new file mode 100755 index 0000000..839179d --- /dev/null +++ b/research/deorummolae.cc @@ -0,0 +1,273 @@ +#include "./deorummolae.h" + +#include +#include + +#include "./esaxx/sais.hxx" + +/* Used for quick SA-entry to file mapping. Each file is padded to size that + is a multiple of chunk size. */ +#define CHUNK_SIZE 64 +/* Length of substring that is considered to be covered by dictionary string. */ +#define CUT_MATCH 6 +/* Minimal dictionary entry size. */ +#define MIN_MATCH 24 + +/* Non tunable definitions. */ +#define CHUNK_MASK (CHUNK_SIZE - 1) +#define COVERAGE_SIZE (1 << (LOG_MAX_FILES - 6)) + +/* File coverage: every bit set to 1 denotes a file covered by an isle. */ +typedef std::array Coverage; + +static int popcount(uint64_t u) { + return __builtin_popcountll(u); +} + +/* Condense terminators and pad file entries. */ +static void rewriteText(std::vector* text) { + int terminator = text->back(); + int prev = terminator; + size_t to = 0; + for (size_t from = 0; from < text->size(); ++from) { + int next = text->at(from); + if (next < 256 || prev < 256) { + text->at(to++) = next; + if (next >= 256) terminator = next; + } + prev = next; + } + text->resize(to); + if (text->empty()) text->push_back(terminator); + while (text->size() & CHUNK_MASK) text->push_back(terminator); +} + +/* Reenumerate terminators for smaller alphabet. */ +static void remapTerminators(std::vector* text, int* next_terminator) { + int prev = -1; + int x = 256; + for (size_t i = 0; i < text->size(); ++i) { + int next = text->at(i); + if (next < 256) { // Char. + // Do nothing. + } else if (prev < 256) { // Terminator after char. + next = x++; + } else { // Terminator after terminator. + next = prev; + } + text->at(i) = next; + prev = next; + } + *next_terminator = x; +} + +/* Combine all file entries; create mapping position->file. */ +static void buildFullText(std::vector>* data, + std::vector* full_text, std::vector* file_map, + std::vector* file_offset, int* next_terminator) { + file_map->resize(0); + file_offset->resize(0); + full_text->resize(0); + for (size_t i = 0; i < data->size(); ++i) { + file_offset->push_back(full_text->size()); + std::vector& file = data->at(i); + rewriteText(&file); + full_text->insert(full_text->end(), file.begin(), file.end()); + file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i); + } + if (false) remapTerminators(full_text, next_terminator); +} + +/* Build longest-common-prefix based on suffix array and text. + TODO: borrowed -> unknown efficiency. */ +static void buildLcp(std::vector* text, std::vector* sa, + std::vector* lcp, std::vector* invese_sa) { + int size = static_cast(text->size()); + lcp->resize(size); + int k = 0; + lcp->at(size - 1) = 0; + for (int i = 0; i < size; ++i) { + if (invese_sa->at(i) == size - 1) { + k = 0; + continue; + } + int j = sa->at(invese_sa->at(i) + 1); // Suffix which follow i-th suffix. + while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) { + ++k; + } + lcp->at(invese_sa->at(i)) = k; + if (k > 0) --k; + } +} + +/* Isle is a range in SA with LCP not less than some value. + When we raise the LCP requirement, the isle sunks and smaller isles appear + instead. */ +typedef struct { + int lcp; + int l; + int r; + Coverage coverage; +} Isle; + +/* Helper routine for `cutMatch`. */ +static void poisonData(int pos, int length, std::vector>* data, + std::vector* file_map, std::vector* file_offset, + int* next_terminator) { + size_t f = file_map->at(pos / CHUNK_SIZE); + pos -= file_offset->at(f); + std::vector& file = data->at(f); + int l = (length == CUT_MATCH) ? CUT_MATCH : 1; + for (int j = 0; j < l; j++, pos++) { + if (file[pos] >= 256) continue; + if (file[pos + 1] >= 256) { + file[pos] = file[pos + 1]; + } else if (pos > 0 && file[pos - 1] >= 256) { + file[pos] = file[pos - 1]; + } else { + file[pos] = (*next_terminator)++; + } + } +} + +/* Remove substrings of a given match from files. + Substrings are replaced with unique terminators, so next iteration SA would + not allow to cross removed areas. */ +static void cutMatch(std::vector>* data, int index, int length, + std::vector* sa, std::vector* lcp, std::vector* invese_sa, + int* next_terminator, std::vector* file_map, + std::vector* file_offset) { + while (length >= CUT_MATCH) { + int i = index; + while (lcp->at(i) >= length) { + i++; + poisonData( + sa->at(i), length, data, file_map, file_offset, next_terminator); + } + while (true) { + poisonData( + sa->at(index), length, data, file_map, file_offset, next_terminator); + if (index == 0 || lcp->at(index - 1) < length) break; + index--; + } + length--; + index = invese_sa->at(sa->at(index) + 1); + } +} + +size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit, + size_t num_samples, const size_t* sample_sizes, + const uint8_t* sample_data) { + { + uint64_t tmp = 0; + if (popcount(tmp - 1u) != 64) { + fprintf(stderr, "64-bit platform is required\n"); + return 0; + } + } + + /* Could use 256 + '0' for easier debugging. */ + int next_terminator = 256; + + std::vector> data; + + size_t offset = 0; + if (num_samples > MAX_FILES) num_samples = MAX_FILES; + for (size_t n = 0; n < num_samples; ++n) { + size_t next_offset = offset + sample_sizes[n]; + data.push_back( + std::vector(sample_data + offset, sample_data + next_offset)); + offset = next_offset; + data.back().push_back(next_terminator++); + } + + /* Most arrays are allocated once, and then just resized to smaller and + smaller sizes. */ + std::vector full_text; + std::vector file_map; + std::vector file_offset; + std::vector sa; + std::vector invese_sa; + std::vector lcp; + std::vector isles; + std::vector output_data; + size_t total = 0; + size_t total_cost = 0; + size_t best_cost; + Isle best_isle; + int min_count = num_samples; + + while (true) { + size_t max_match = dictionary_size_limit - total; + buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator); + sa.resize(full_text.size()); + saisxx(full_text.data(), sa.data(), static_cast(full_text.size()), + next_terminator); + invese_sa.resize(full_text.size()); + for (int i = 0; i < full_text.size(); ++i) invese_sa[sa[i]] = i; + buildLcp(&full_text, &sa, &lcp, &invese_sa); + + /* Do not rebuild SA/LCP, just use different selection. */ +retry: + best_cost = 0; + best_isle = {0, 0, 0, {{0}}}; + isles.resize(0); + isles.push_back(best_isle); + + for (int i = 0; i < static_cast(lcp.size()); ++i) { + int l = i; + Coverage cov = {{0}}; + int f = file_map[sa[i] / CHUNK_SIZE]; + cov[f >> 6] = ((uint64_t)1) << (f & 63); + while (lcp[i] < isles.back().lcp) { + Isle& top = isles.back(); + top.r = i; + l = top.l; + for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x]; + int count = 0; + for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]); + int effective_lcp = top.lcp; + /* Restrict (last) dictionary entry length. */ + if (effective_lcp > max_match) effective_lcp = max_match; + int cost = count * effective_lcp; + if (cost > best_cost && count >= min_count && + effective_lcp >= MIN_MATCH) { + best_cost = cost; + best_isle = top; + best_isle.lcp = effective_lcp; + } + isles.pop_back(); + for (size_t x = 0; x < cov.size(); ++x) { + isles.back().coverage[x] |= cov[x]; + } + } + if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}}); + for (size_t x = 0; x < cov.size(); ++x) { + isles.back().coverage[x] |= cov[x]; + } + } + + /* When saturated matches do not match length restrictions, lower the + saturation requirements. */ + if (best_cost == 0 || best_isle.lcp < MIN_MATCH) { + if (min_count >= 8) { + min_count = (min_count * 7) / 8; + goto retry; + } + break; + } + + /* Save the entry. */ + fprintf(stderr, + "Savings: %zu+%zu, dictionary: %zu+%d\n", + total_cost, best_cost, total, best_isle.lcp); + memcpy( + dictionary + total, full_text.data() + sa[best_isle.l], best_isle.lcp); + total += best_isle.lcp; + total_cost += best_cost; + cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, + &invese_sa, &next_terminator, &file_map, &file_offset); + if (total >= dictionary_size_limit) break; + } + return total; +} diff --git a/research/deorummolae.h b/research/deorummolae.h new file mode 100755 index 0000000..f37015c --- /dev/null +++ b/research/deorummolae.h @@ -0,0 +1,27 @@ +#ifndef BROTLI_RESEARCH_DEORUMMOLAE_H_ +#define BROTLI_RESEARCH_DEORUMMOLAE_H_ + +#include +#include + +/* log2(maximal number of files). Value 6 provides some speedups. */ +#define LOG_MAX_FILES 6 + +/* Non tunable definitions. */ +#define MAX_FILES (1 << LOG_MAX_FILES) + +/** + * Generate a dictionary for given samples. + * + * @param dictionary storage for generated dictionary + * @param dictionary_size_limit maximal dictionary size + * @param num_samples number of samples + * @param sample_sizes array with sample sizes + * @param sample_data concatenated samples + * @return generated dictionary size + */ +size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit, + size_t num_samples, const size_t* sample_sizes, + const uint8_t* sample_data); + +#endif // BROTLI_RESEARCH_DEORUMMOLAE_H_ -- cgit v1.1