#include "sieve.h" /* Pointer to position in (combined corpus) text. */ typedef uint32_t TextIdx; /* Index of sample / generation. */ typedef uint16_t SampleIdx; typedef struct Slot { TextIdx next; TextIdx offset; SampleIdx presence; SampleIdx mark; } Slot; static const TextIdx kNowhere = static_cast(-1); static TextIdx dryRun(TextIdx sliceLen, Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) { TextIdx from = kNowhere; TextIdx to = kNowhere; TextIdx result = 0; SampleIdx targetPresence = minPresence; for (TextIdx i = 0; i < end; ++i) { if (i == middle) { targetPresence++; } Slot& item = map[shortcut[i]]; if (item.mark != iteration) { item.mark = iteration; if (item.presence >= targetPresence) { if ((to == kNowhere) || (to < i)) { if (from != kNowhere) { result += to - from; } from = i; } to = i + sliceLen; } } } if (from != kNowhere) { result += to - from; } return result; } static std::string createDictionary(const uint8_t* data, TextIdx sliceLen, Slot* map, TextIdx* shortcut, TextIdx end, TextIdx middle, SampleIdx minPresence, SampleIdx iteration) { std::string output; TextIdx from = kNowhere; TextIdx to = kNowhere; SampleIdx targetPresence = minPresence; for (TextIdx i = 0; i < end; ++i) { if (i == middle) { targetPresence++; } Slot& item = map[shortcut[i]]; if (item.mark != iteration) { item.mark = iteration; if (item.presence >= targetPresence) { if ((to == kNowhere) || (to < i)) { if (from != kNowhere) { output.insert(output.end(), &data[from], &data[to]); } from = i; } to = i + sliceLen; } } } if (from != kNowhere) { output.insert(output.end(), &data[from], &data[to]); } return output; } std::string sieve_generate(size_t dictionary_size_limit, size_t slice_len, const std::vector& sample_sizes, const uint8_t* sample_data) { /* Parameters aliasing */ TextIdx targetSize = static_cast(dictionary_size_limit); if (targetSize != dictionary_size_limit) { fprintf(stderr, "dictionary_size_limit is too large\n"); return ""; } TextIdx sliceLen = static_cast(slice_len); if (sliceLen != slice_len) { fprintf(stderr, "slice_len is too large\n"); return ""; } if (sliceLen < 1) { fprintf(stderr, "slice_len is too small\n"); return ""; } SampleIdx numSamples = static_cast(sample_sizes.size()); if ((numSamples != sample_sizes.size()) || (numSamples * 2 < numSamples)) { fprintf(stderr, "too many samples\n"); return ""; } const uint8_t* data = sample_data; TextIdx total = 0; std::vector offsets; for (SampleIdx i = 0; i < numSamples; ++i) { TextIdx delta = static_cast(sample_sizes[i]); if (delta != sample_sizes[i]) { fprintf(stderr, "sample is too large\n"); return ""; } if (delta == 0) { fprintf(stderr, "empty samples are prohibited\n"); return ""; } if (total + delta <= total) { fprintf(stderr, "corpus is too large\n"); return ""; } total += delta; offsets.push_back(total); } if (total * 2 < total) { fprintf(stderr, "corpus is too large\n"); return ""; } if (total < sliceLen) { fprintf(stderr, "slice_len is larger than corpus size\n"); return ""; } /***************************************************************************** * Build coverage map. ****************************************************************************/ std::vector map; std::vector shortcut; map.push_back({0, 0, 0, 0}); TextIdx end = total - sliceLen; TextIdx hashLen = 11; while (hashLen < 29 && ((1u << hashLen) < end)) { hashLen += 3; } hashLen -= 3; TextIdx hashMask = (1u << hashLen) - 1u; std::vector hashHead(1 << hashLen); TextIdx hashSlot = 1; SampleIdx piece = 0; TextIdx hash = 0; TextIdx lShift = 3; TextIdx rShift = hashLen - lShift; for (TextIdx i = 0; i < sliceLen - 1; ++i) { TextIdx v = data[i]; hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; } TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen; TextIdx rShiftX = hashLen - lShiftX; for (TextIdx i = 0; i < end; ++i) { TextIdx v = data[i + sliceLen - 1]; hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; if (offsets[piece] == i) { piece++; } TextIdx slot = hashHead[hash]; while (slot != 0) { Slot& item = map[slot]; TextIdx start = item.offset; bool miss = false; for (TextIdx j = 0; j < sliceLen; ++j) { if (data[i + j] != data[start + j]) { miss = true; break; } } if (!miss) { if (item.mark != piece) { item.mark = piece; item.presence++; } shortcut.push_back(slot); break; } slot = item.next; } if (slot == 0) { map.push_back({hashHead[hash], i, 1, piece}); hashHead[hash] = hashSlot; shortcut.push_back(hashSlot); hashSlot++; } v = data[i]; hash ^= ((v << lShiftX) | (v >> rShiftX)) & hashMask; } /***************************************************************************** * Build dictionary of specified size. ****************************************************************************/ SampleIdx a = 1; TextIdx size = dryRun( sliceLen, map.data(), shortcut.data(), end, end, a, ++piece); /* Maximal output is smaller than target. */ if (size <= targetSize) { return createDictionary( data, sliceLen, map.data(), shortcut.data(), end, end, a, ++piece); } SampleIdx b = numSamples; size = dryRun(sliceLen, map.data(), shortcut.data(), end, end, b, ++piece); if (size == targetSize) { return createDictionary( data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece); } /* Run binary search. */ if (size < targetSize) { /* size(a) > targetSize > size(b) && a < m < b */ while (a + 1 < b) { SampleIdx m = static_cast((a + b) / 2); size = dryRun( sliceLen, map.data(), shortcut.data(), end, end, m, ++piece); if (size < targetSize) { b = m; } else if (size > targetSize) { a = m; } else { return createDictionary( data, sliceLen, map.data(), shortcut.data(), end, end, b, ++piece); } } } else { a = b; } /* size(minPresence) > targetSize > size(minPresence + 1) */ SampleIdx minPresence = a; TextIdx c = 0; TextIdx d = end; /* size(a) < targetSize < size(b) && a < m < b */ while (c + 1 < d) { TextIdx m = (c + d) / 2; size = dryRun( sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece); if (size < targetSize) { c = m; } else if (size > targetSize) { d = m; } else { return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, m, minPresence, ++piece); } } bool unrestricted = false; if (minPresence <= 2 && !unrestricted) { minPresence = 2; c = end; } return createDictionary(data, sliceLen, map.data(), shortcut.data(), end, c, minPresence, ++piece); }