aboutsummaryrefslogtreecommitdiff
path: root/research/durchschlag.cc
diff options
context:
space:
mode:
authorEugene Kliuchnikov <eustas@google.com>2018-02-26 09:04:36 -0500
committerGitHub <noreply@github.com>2018-02-26 09:04:36 -0500
commit35e69fc7cf9421ab04ffc9d52cb36d07fa12984a (patch)
treea1ed614391936d455da2b0610ef8e8caf88b4289 /research/durchschlag.cc
parent3af18990f50d8f040038aaa08c41f5d27d62efb5 (diff)
downloadbrotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.zip
brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.gz
brotli-35e69fc7cf9421ab04ffc9d52cb36d07fa12984a.tar.bz2
New feature: "Large Window Brotli" (#640)
* New feature: "Large Window Brotli" By setting special encoder/decoder flag it is now possible to extend LZ-window up to 30 bits; though produced stream will not be RFC7932 compliant. Added new dictionary generator - "DSH". It combines speed of "Sieve" and quality of "DM". Plus utilities to prepare train corpora (remove unique strings). Improved compression ratio: now two sub-blocks could be stitched: the last copy command could be extended to span the next sub-block. Fixed compression ineffectiveness caused by floating numbers rounding and wrong cost heuristic. Other C changes: - combined / moved `context.h` to `common` - moved transforms to `common` - unified some aspects of code formatting - added an abstraction for encoder (static) dictionary - moved default allocator/deallocator functions to `common` brotli CLI: - window size is auto-adjusted if not specified explicitly Java: - added "eager" decoding both to JNI wrapper and pure decoder - huge speed-up of `DictionaryData` initialization * Add dictionaryless compressed dictionary * Fix `sources.lst` * Fix `sources.lst` and add a note that `libtool` is also required. * Update setup.py * Fix `EagerStreamTest` * Fix BUILD file * Add missing `libdivsufsort` dependency * Fix "unused parameter" warning.
Diffstat (limited to 'research/durchschlag.cc')
-rwxr-xr-xresearch/durchschlag.cc714
1 files changed, 714 insertions, 0 deletions
diff --git a/research/durchschlag.cc b/research/durchschlag.cc
new file mode 100755
index 0000000..cc4ed68
--- /dev/null
+++ b/research/durchschlag.cc
@@ -0,0 +1,714 @@
+#include "./durchschlag.h"
+
+#include <algorithm>
+#include <exception> /* terminate */
+
+#include "divsufsort.h"
+
+/* Pointer to position in text. */
+typedef DurchschlagTextIdx TextIdx;
+
+/* (Sum of) value(s) of slice(s). */
+typedef uint32_t Score;
+
+typedef struct HashSlot {
+ TextIdx next;
+ TextIdx offset;
+} HashSlot;
+
+typedef struct MetaSlot {
+ TextIdx mark;
+ Score score;
+} MetaSlot;
+
+typedef struct Range {
+ TextIdx start;
+ TextIdx end;
+} Range;
+
+typedef struct Candidate {
+ Score score;
+ TextIdx position;
+} Candidate;
+
+struct greaterScore {
+ bool operator()(const Candidate& a, const Candidate& b) const {
+ return (a.score > b.score) ||
+ ((a.score == b.score) && (a.position < b.position));
+ }
+};
+
+struct lessScore {
+ bool operator()(const Candidate& a, const Candidate& b) const {
+ return (a.score < b.score) ||
+ ((a.score == b.score) && (a.position > b.position));
+ }
+};
+
+#define CANDIDATE_BUNDLE_SIZE (1 << 18)
+
+static void fatal(const char* error) {
+ fprintf(stderr, "%s\n", error);
+ std::terminate();
+}
+
+static TextIdx calculateDictionarySize(const std::vector<Range>& ranges) {
+ TextIdx result = 0;
+ for (size_t i = 0; i < ranges.size(); ++i) {
+ const Range& r = ranges[i];
+ result += r.end - r.start;
+ }
+ return result;
+}
+
+static std::string createDictionary(
+ const uint8_t* data, const std::vector<Range>& ranges, size_t limit) {
+ std::string output;
+ output.reserve(calculateDictionarySize(ranges));
+ for (size_t i = 0; i < ranges.size(); ++i) {
+ const Range& r = ranges[i];
+ output.insert(output.end(), &data[r.start], &data[r.end]);
+ }
+ if (output.size() > limit) {
+ output.resize(limit);
+ }
+ return output;
+}
+
+static Score buildCandidatesList(std::vector<Candidate>* candidates,
+ std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut,
+ TextIdx end) {
+ candidates->resize(0);
+
+ size_t n = map->size();
+ MetaSlot* slots = map->data();
+ for (size_t j = 0; j < n; ++j) {
+ slots[j].mark = 0;
+ }
+
+ Score score = 0;
+ for (size_t j = 0; j < span; ++j) {
+ MetaSlot& item = slots[shortcut[j]];
+ if (item.mark == 0) {
+ score += item.score;
+ }
+ item.mark++;
+ }
+
+ TextIdx i = 0;
+ TextIdx limit = std::min<TextIdx>(end, CANDIDATE_BUNDLE_SIZE);
+ Score maxScore = 0;
+ for (; i < limit; ++i) {
+ MetaSlot& pick = slots[shortcut[i + span]];
+ if (pick.mark == 0) {
+ score += pick.score;
+ }
+ pick.mark++;
+
+ if (score > maxScore) {
+ maxScore = score;
+ }
+ candidates->push_back({score, i});
+
+ MetaSlot& drop = slots[shortcut[i]];
+ drop.mark--;
+ if (drop.mark == 0) {
+ score -= drop.score;
+ }
+ }
+
+ std::make_heap(candidates->begin(), candidates->end(), greaterScore());
+ Score minScore = candidates->at(0).score;
+ for (; i < end; ++i) {
+ MetaSlot& pick = slots[shortcut[i + span]];
+ if (pick.mark == 0) {
+ score += pick.score;
+ }
+ pick.mark++;
+
+ if (score > maxScore) {
+ maxScore = score;
+ }
+ if (score >= minScore) {
+ candidates->push_back({score, i});
+ std::push_heap(candidates->begin(), candidates->end(), greaterScore());
+ if (candidates->size() > CANDIDATE_BUNDLE_SIZE && maxScore != minScore) {
+ while (candidates->at(0).score == minScore) {
+ std::pop_heap(candidates->begin(), candidates->end(), greaterScore());
+ candidates->pop_back();
+ }
+ minScore = candidates->at(0).score;
+ }
+ }
+
+ MetaSlot& drop = slots[shortcut[i]];
+ drop.mark--;
+ if (drop.mark == 0) {
+ score -= drop.score;
+ }
+ }
+
+ for (size_t j = 0; j < n; ++j) {
+ slots[j].mark = 0;
+ }
+
+ std::make_heap(candidates->begin(), candidates->end(), lessScore());
+ return minScore;
+}
+
+static Score rebuildCandidatesList(std::vector<TextIdx>* candidates,
+ std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut,
+ TextIdx end, TextIdx* next) {
+ size_t n = candidates->size();
+ TextIdx* data = candidates->data();
+ for (size_t i = 0; i < n; ++i) {
+ data[i] = 0;
+ }
+
+ n = map->size();
+ MetaSlot* slots = map->data();
+ for (size_t i = 0; i < n; ++i) {
+ slots[i].mark = 0;
+ }
+
+ Score score = 0;
+ for (TextIdx i = 0; i < span; ++i) {
+ MetaSlot& item = slots[shortcut[i]];
+ if (item.mark == 0) {
+ score += item.score;
+ }
+ item.mark++;
+ }
+
+ Score maxScore = 0;
+ for (TextIdx i = 0; i < end; ++i) {
+ MetaSlot& pick = slots[shortcut[i + span]];
+ if (pick.mark == 0) {
+ score += pick.score;
+ }
+ pick.mark++;
+
+ if (candidates->size() <= score) {
+ candidates->resize(score + 1);
+ }
+ if (score > maxScore) {
+ maxScore = score;
+ }
+ next[i] = candidates->at(score);
+ candidates->at(score) = i;
+
+ MetaSlot& drop = slots[shortcut[i]];
+ drop.mark--;
+ if (drop.mark == 0) {
+ score -= drop.score;
+ }
+ }
+
+ for (size_t i = 0; i < n; ++i) {
+ slots[i].mark = 0;
+ }
+
+ candidates->resize(maxScore + 1);
+ return maxScore;
+}
+
+static void addRange(std::vector<Range>* ranges, TextIdx start, TextIdx end) {
+ for (auto it = ranges->begin(); it != ranges->end();) {
+ if (end < it->start) {
+ ranges->insert(it, {start, end});
+ return;
+ }
+ if (it->end < start) {
+ it++;
+ continue;
+ }
+ // Combine with existing.
+ start = std::min(start, it->start);
+ end = std::max(end, it->end);
+ // Remove consumed vector and continue.
+ it = ranges->erase(it);
+ }
+ ranges->push_back({start, end});
+}
+
+std::string durchschlag_generate(
+ size_t dictionary_size_limit, size_t slice_len, size_t block_len,
+ const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
+ DurchschlagContext ctx = durchschlag_prepare(
+ slice_len, sample_sizes, sample_data);
+ return durchschlag_generate(DURCHSCHLAG_COLLABORATIVE,
+ dictionary_size_limit, block_len, ctx, sample_data);
+}
+
+DurchschlagContext durchschlag_prepare(size_t slice_len,
+ const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
+ /* Parameters aliasing */
+ TextIdx sliceLen = static_cast<TextIdx>(slice_len);
+ if (sliceLen != slice_len) fatal("slice_len is too large");
+ if (sliceLen < 1) fatal("slice_len is too small");
+ const uint8_t* data = sample_data;
+
+ TextIdx total = 0;
+ std::vector<TextIdx> offsets;
+ offsets.reserve(sample_sizes.size());
+ for (size_t i = 0; i < sample_sizes.size(); ++i) {
+ TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
+ if (delta != sample_sizes[i]) fatal("sample is too large");
+ if (delta == 0) fatal("0-length samples are prohibited");
+ TextIdx next_total = total + delta;
+ if (next_total <= total) fatal("corpus is too large");
+ total = next_total;
+ offsets.push_back(total);
+ }
+
+ if (total < sliceLen) fatal("slice_len is larger than corpus size");
+ TextIdx end = total - static_cast<TextIdx>(sliceLen) + 1;
+ TextIdx hashLen = 11;
+ while (hashLen < 29 && ((1u << hashLen) < end)) {
+ hashLen += 3;
+ }
+ hashLen -= 3;
+ TextIdx hashMask = (1u << hashLen) - 1u;
+ std::vector<TextIdx> hashHead(1 << hashLen);
+ TextIdx hash = 0;
+ TextIdx lShift = 3;
+ TextIdx rShift = hashLen - lShift;
+ for (TextIdx i = 0; i < sliceLen - 1; ++i) {
+ TextIdx v = data[i];
+ hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
+ }
+ TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
+ TextIdx rShiftX = hashLen - lShiftX;
+
+ std::vector<HashSlot> map;
+ map.push_back({0, 0});
+ TextIdx hashSlot = 1;
+ std::vector<TextIdx> sliceMap;
+ sliceMap.reserve(end);
+ for (TextIdx i = 0; i < end; ++i) {
+ TextIdx v = data[i + sliceLen - 1];
+ TextIdx bucket = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
+ v = data[i];
+ hash = bucket ^ (((v << lShiftX) | (v >> rShiftX)) & hashMask);
+ TextIdx slot = hashHead[bucket];
+ while (slot != 0) {
+ HashSlot& item = map[slot];
+ TextIdx start = item.offset;
+ bool miss = false;
+ for (TextIdx j = 0; j < sliceLen; ++j) {
+ if (data[i + j] != data[start + j]) {
+ miss = true;
+ break;
+ }
+ }
+ if (!miss) {
+ sliceMap.push_back(slot);
+ break;
+ }
+ slot = item.next;
+ }
+ if (slot == 0) {
+ map.push_back({hashHead[bucket], i});
+ hashHead[bucket] = hashSlot;
+ sliceMap.push_back(hashSlot);
+ hashSlot++;
+ }
+ }
+
+ return {total, sliceLen, static_cast<TextIdx>(map.size()),
+ std::move(offsets), std::move(sliceMap)};
+}
+
+DurchschlagContext durchschlag_prepare(size_t slice_len,
+ const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index) {
+ /* Parameters aliasing */
+ TextIdx sliceLen = static_cast<TextIdx>(slice_len);
+ if (sliceLen != slice_len) fatal("slice_len is too large");
+ if (sliceLen < 1) fatal("slice_len is too small");
+ const TextIdx* lcp = index.lcp.data();
+ const TextIdx* sa = index.sa.data();
+
+ TextIdx total = 0;
+ std::vector<TextIdx> offsets;
+ offsets.reserve(sample_sizes.size());
+ for (size_t i = 0; i < sample_sizes.size(); ++i) {
+ TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
+ if (delta != sample_sizes[i]) fatal("sample is too large");
+ if (delta == 0) fatal("0-length samples are prohibited");
+ TextIdx next_total = total + delta;
+ if (next_total <= total) fatal("corpus is too large");
+ total = next_total;
+ offsets.push_back(total);
+ }
+
+ if (total < sliceLen) fatal("slice_len is larger than corpus size");
+ TextIdx counter = 1;
+ TextIdx end = total - sliceLen + 1;
+ std::vector<TextIdx> sliceMap(total);
+ TextIdx last = 0;
+ TextIdx current = 1;
+ while (current <= total) {
+ if (lcp[current - 1] < sliceLen) {
+ for (TextIdx i = last; i < current; ++i) {
+ sliceMap[sa[i]] = counter;
+ }
+ counter++;
+ last = current;
+ }
+ current++;
+ }
+ sliceMap.resize(end);
+
+ // Reorder items for the better locality.
+ std::vector<TextIdx> reorder(counter);
+ counter = 1;
+ for (TextIdx i = 0; i < end; ++i) {
+ if (reorder[sliceMap[i]] == 0) {
+ reorder[sliceMap[i]] = counter++;
+ }
+ }
+ for (TextIdx i = 0; i < end; ++i) {
+ sliceMap[i] = reorder[sliceMap[i]];
+ }
+
+ return {total, sliceLen, counter, std::move(offsets), std::move(sliceMap)};
+}
+
+DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data) {
+ TextIdx total = static_cast<TextIdx>(data.size());
+ if (total != data.size()) fatal("corpus is too large");
+ saidx_t saTotal = static_cast<saidx_t>(total);
+ if (saTotal < 0) fatal("corpus is too large");
+ if (static_cast<TextIdx>(saTotal) != total) fatal("corpus is too large");
+ std::vector<TextIdx> sa(total);
+ /* Hopefully, non-negative int32_t values match TextIdx ones. */
+ if (sizeof(TextIdx) != sizeof(int32_t)) fatal("type length mismatch");
+ int32_t* saData = reinterpret_cast<int32_t*>(sa.data());
+ divsufsort(data.data(), saData, saTotal);
+
+ std::vector<TextIdx> isa(total);
+ for (TextIdx i = 0; i < total; ++i) isa[sa[i]] = i;
+
+ // TODO: borrowed -> unknown efficiency.
+ std::vector<TextIdx> lcp(total);
+ TextIdx k = 0;
+ lcp[total - 1] = 0;
+ for (TextIdx i = 0; i < total; ++i) {
+ TextIdx current = isa[i];
+ if (current == total - 1) {
+ k = 0;
+ continue;
+ }
+ TextIdx j = sa[current + 1]; // Suffix which follow i-th suffix.
+ while ((i + k < total) && (j + k < total) && (data[i + k] == data[j + k])) {
+ ++k;
+ }
+ lcp[current] = k;
+ if (k > 0) --k;
+ }
+
+ return {std::move(lcp), std::move(sa)};
+}
+
+static void ScoreSlices(const std::vector<TextIdx>& offsets,
+ std::vector<MetaSlot>& map, const TextIdx* shortcut, TextIdx end) {
+ TextIdx piece = 0;
+ /* Fresh map contains all zeroes -> initial mark should be different. */
+ TextIdx mark = 1;
+ for (TextIdx i = 0; i < end; ++i) {
+ if (offsets[piece] == i) {
+ piece++;
+ mark++;
+ }
+ MetaSlot& item = map[shortcut[i]];
+ if (item.mark != mark) {
+ item.mark = mark;
+ item.score++;
+ }
+ }
+}
+
+static std::string durchschlagGenerateExclusive(
+ size_t dictionary_size_limit, size_t block_len,
+ const DurchschlagContext& context, const uint8_t* sample_data) {
+ /* Parameters aliasing */
+ TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
+ if (targetSize != dictionary_size_limit) {
+ fprintf(stderr, "dictionary_size_limit is too large\n");
+ return "";
+ }
+ TextIdx sliceLen = context.sliceLen;
+ TextIdx total = context.dataSize;
+ TextIdx blockLen = static_cast<TextIdx>(block_len);
+ if (blockLen != block_len) {
+ fprintf(stderr, "block_len is too large\n");
+ return "";
+ }
+ const uint8_t* data = sample_data;
+ const std::vector<TextIdx>& offsets = context.offsets;
+ std::vector<MetaSlot> map(context.numUniqueSlices);
+ const TextIdx* shortcut = context.sliceMap.data();
+
+ /* Initialization */
+ if (blockLen < sliceLen) {
+ fprintf(stderr, "sliceLen is larger than block_len\n");
+ return "";
+ }
+ if (targetSize < blockLen || total < blockLen) {
+ fprintf(stderr, "block_len is too large\n");
+ return "";
+ }
+ TextIdx end = total - sliceLen + 1;
+ ScoreSlices(offsets, map, shortcut, end);
+ end = total - blockLen + 1;
+ std::vector<TextIdx> candidates;
+ std::vector<TextIdx> next(end);
+ TextIdx span = blockLen - sliceLen + 1;
+ Score maxScore = rebuildCandidatesList(
+ &candidates, &map, span, shortcut, end, next.data());
+
+ /* Block selection */
+ const size_t triesLimit = (600 * 1000000) / span;
+ const size_t candidatesLimit = (150 * 1000000) / span;
+ std::vector<Range> ranges;
+ TextIdx mark = 0;
+ size_t numTries = 0;
+ while (true) {
+ TextIdx dictSize = calculateDictionarySize(ranges);
+ size_t numCandidates = 0;
+ if (dictSize > targetSize - blockLen) {
+ break;
+ }
+ if (maxScore == 0) {
+ break;
+ }
+ while (true) {
+ TextIdx candidate = 0;
+ while (maxScore > 0) {
+ if (candidates[maxScore] != 0) {
+ candidate = candidates[maxScore];
+ candidates[maxScore] = next[candidate];
+ break;
+ }
+ maxScore--;
+ }
+ if (maxScore == 0) {
+ break;
+ }
+ mark++;
+ numTries++;
+ numCandidates++;
+ Score score = 0;
+ for (size_t j = candidate; j <= candidate + span; ++j) {
+ MetaSlot& item = map[shortcut[j]];
+ if (item.mark != mark) {
+ score += item.score;
+ item.mark = mark;
+ }
+ }
+ if (score < maxScore) {
+ if (numTries < triesLimit && numCandidates < candidatesLimit) {
+ next[candidate] = candidates[score];
+ candidates[score] = candidate;
+ } else {
+ maxScore = rebuildCandidatesList(
+ &candidates, &map, span, shortcut, end, next.data());
+ mark = 0;
+ numTries = 0;
+ numCandidates = 0;
+ }
+ continue;
+ } else if (score > maxScore) {
+ fprintf(stderr, "Broken invariant\n");
+ return "";
+ }
+ for (TextIdx j = candidate; j <= candidate + span; ++j) {
+ MetaSlot& item = map[shortcut[j]];
+ item.score = 0;
+ }
+ addRange(&ranges, candidate, candidate + blockLen);
+ break;
+ }
+ }
+
+ return createDictionary(data, ranges, targetSize);
+}
+
+static std::string durchschlagGenerateCollaborative(
+ size_t dictionary_size_limit, size_t block_len,
+ const DurchschlagContext& context, const uint8_t* sample_data) {
+ /* Parameters aliasing */
+ TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
+ if (targetSize != dictionary_size_limit) {
+ fprintf(stderr, "dictionary_size_limit is too large\n");
+ return "";
+ }
+ TextIdx sliceLen = context.sliceLen;
+ TextIdx total = context.dataSize;
+ TextIdx blockLen = static_cast<TextIdx>(block_len);
+ if (blockLen != block_len) {
+ fprintf(stderr, "block_len is too large\n");
+ return "";
+ }
+ const uint8_t* data = sample_data;
+ const std::vector<TextIdx>& offsets = context.offsets;
+ std::vector<MetaSlot> map(context.numUniqueSlices);
+ const TextIdx* shortcut = context.sliceMap.data();
+
+ /* Initialization */
+ if (blockLen < sliceLen) {
+ fprintf(stderr, "sliceLen is larger than block_len\n");
+ return "";
+ }
+ if (targetSize < blockLen || total < blockLen) {
+ fprintf(stderr, "block_len is too large\n");
+ return "";
+ }
+ TextIdx end = total - sliceLen + 1;
+ ScoreSlices(offsets, map, shortcut, end);
+ end = total - blockLen + 1;
+ std::vector<Candidate> candidates;
+ candidates.reserve(CANDIDATE_BUNDLE_SIZE + 1024);
+ TextIdx span = blockLen - sliceLen + 1;
+ Score minScore = buildCandidatesList(&candidates, &map, span, shortcut, end);
+
+ /* Block selection */
+ std::vector<Range> ranges;
+ TextIdx mark = 0;
+ while (true) {
+ TextIdx dictSize = calculateDictionarySize(ranges);
+ if (dictSize > targetSize - blockLen) {
+ break;
+ }
+ if (minScore == 0 && candidates.empty()) {
+ break;
+ }
+ while (true) {
+ if (candidates.empty()) {
+ minScore = buildCandidatesList(&candidates, &map, span, shortcut, end);
+ mark = 0;
+ }
+ TextIdx candidate = candidates[0].position;
+ Score expectedScore = candidates[0].score;
+ if (expectedScore == 0) {
+ candidates.resize(0);
+ break;
+ }
+ std::pop_heap(candidates.begin(), candidates.end(), lessScore());
+ candidates.pop_back();
+ mark++;
+ Score score = 0;
+ for (TextIdx j = candidate; j <= candidate + span; ++j) {
+ MetaSlot& item = map[shortcut[j]];
+ if (item.mark != mark) {
+ score += item.score;
+ item.mark = mark;
+ }
+ }
+ if (score < expectedScore) {
+ if (score >= minScore) {
+ candidates.push_back({score, candidate});
+ std::push_heap(candidates.begin(), candidates.end(), lessScore());
+ }
+ continue;
+ } else if (score > expectedScore) {
+ fatal("Broken invariant");
+ }
+ for (TextIdx j = candidate; j <= candidate + span; ++j) {
+ MetaSlot& item = map[shortcut[j]];
+ item.score = 0;
+ }
+ addRange(&ranges, candidate, candidate + blockLen);
+ break;
+ }
+ }
+
+ return createDictionary(data, ranges, targetSize);
+}
+
+std::string durchschlag_generate(DurchschalgResourceStrategy strategy,
+ size_t dictionary_size_limit, size_t block_len,
+ const DurchschlagContext& context, const uint8_t* sample_data) {
+ if (strategy == DURCHSCHLAG_COLLABORATIVE) {
+ return durchschlagGenerateCollaborative(
+ dictionary_size_limit, block_len, context, sample_data);
+ } else {
+ return durchschlagGenerateExclusive(
+ dictionary_size_limit, block_len, context, sample_data);
+ }
+}
+
+void durchschlag_distill(size_t slice_len, size_t minimum_population,
+ std::vector<size_t>* sample_sizes, uint8_t* sample_data) {
+ /* Parameters aliasing */
+ uint8_t* data = sample_data;
+
+ /* Build slice map. */
+ DurchschlagContext context = durchschlag_prepare(
+ slice_len, *sample_sizes, data);
+
+ /* Calculate slice population. */
+ const std::vector<TextIdx>& offsets = context.offsets;
+ std::vector<MetaSlot> map(context.numUniqueSlices);
+ const TextIdx* shortcut = context.sliceMap.data();
+ TextIdx sliceLen = context.sliceLen;
+ TextIdx total = context.dataSize;
+ TextIdx end = total - sliceLen + 1;
+ ScoreSlices(offsets, map, shortcut, end);
+
+ /* Condense samples, omitting unique slices. */
+ TextIdx readPos = 0;
+ TextIdx writePos = 0;
+ TextIdx lastNonUniquePos = 0;
+ for (TextIdx i = 0; i < sample_sizes->size(); ++i) {
+ TextIdx sampleStart = writePos;
+ TextIdx oldSampleEnd =
+ readPos + static_cast<TextIdx>(sample_sizes->at(i));
+ while (readPos < oldSampleEnd) {
+ if (readPos < end) {
+ MetaSlot& item = map[shortcut[readPos]];
+ if (item.score >= minimum_population) {
+ lastNonUniquePos = readPos + sliceLen;
+ }
+ }
+ if (readPos < lastNonUniquePos) {
+ data[writePos++] = data[readPos];
+ }
+ readPos++;
+ }
+ sample_sizes->at(i) = writePos - sampleStart;
+ }
+}
+
+void durchschlag_purify(size_t slice_len, size_t minimum_population,
+ const std::vector<size_t>& sample_sizes, uint8_t* sample_data) {
+ /* Parameters aliasing */
+ uint8_t* data = sample_data;
+
+ /* Build slice map. */
+ DurchschlagContext context = durchschlag_prepare(
+ slice_len, sample_sizes, data);
+
+ /* Calculate slice population. */
+ const std::vector<TextIdx>& offsets = context.offsets;
+ std::vector<MetaSlot> map(context.numUniqueSlices);
+ const TextIdx* shortcut = context.sliceMap.data();
+ TextIdx sliceLen = context.sliceLen;
+ TextIdx total = context.dataSize;
+ TextIdx end = total - sliceLen + 1;
+ ScoreSlices(offsets, map, shortcut, end);
+
+ /* Rewrite samples, zeroing out unique slices. */
+ TextIdx lastNonUniquePos = 0;
+ for (TextIdx readPos = 0; readPos < total; ++readPos) {
+ if (readPos < end) {
+ MetaSlot& item = map[shortcut[readPos]];
+ if (item.score >= minimum_population) {
+ lastNonUniquePos = readPos + sliceLen;
+ }
+ }
+ if (readPos >= lastNonUniquePos) {
+ data[readPos] = 0;
+ }
+ }
+}