aboutsummaryrefslogtreecommitdiff
path: root/research/find_opt_references.cc
blob: beab5b68847a317c1ac6d848d2bfcf51f86f139c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
/* Copyright 2016 Google Inc. All Rights Reserved.
   Author: zip753@gmail.com (Ivan Nikulin)

   Distributed under MIT license.
   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
*/

/* Tool for generating optimal backward references for the input file. Uses
   sais-lite library for building suffix array. */

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstring>

#include <gflags/gflags.h>
using gflags::ParseCommandLineFlags;

#include "./esaxx/sais.hxx"

DEFINE_int32(min_length, 1, "Minimal length of found backward references.");
DEFINE_int32(skip, 1, "Number of bytes to skip.");

const size_t kFileBufferSize = (1 << 16);  // 64KB

typedef int sarray_type;  // Can't make it unsigned because of templates :(
typedef uint8_t input_type;
typedef uint32_t lcp_type;

void ReadInput(FILE* fin, input_type* storage, size_t input_size) {
  size_t last_pos = 0;
  size_t available_in;
  fseek(fin, 0, SEEK_SET);
  do {
    available_in = fread(storage + last_pos, 1, kFileBufferSize, fin);
    last_pos += available_in;
  } while (available_in != 0);
  assert(last_pos == input_size);
}

void BuildLCP(input_type* storage, sarray_type* sarray, lcp_type* lcp,
              size_t size, uint32_t* pos) {
  for (int i = 0; i < size; ++i) {
    pos[sarray[i]] = i;
  }
  uint32_t k = 0;
  lcp[size - 1] = 0;
  for (int i = 0; i < size; ++i) {
    if (pos[i] == size - 1) {
      k = 0;
      continue;
    }
    uint32_t j = sarray[pos[i] + 1];  // Suffix which follow i-th suffix in SA.
    while (i + k < size && j + k < size && storage[i + k] == storage[j + k]) {
      ++k;
    }
    lcp[pos[i]] = k;
    if (k > 0) --k;
  }
}

void ProcessReferences(input_type* storage, sarray_type* sarray, lcp_type* lcp,
                       size_t size, uint32_t* pos, FILE* fout) {
  int min_length = FLAGS_min_length;
  for (int idx = FLAGS_skip; idx < size; ++idx) {
    int max_lcp = -1;
    int max_lcp_ix;
    int left_lcp = -1;
    int left_ix;
    for (left_ix = pos[idx] - 1; left_ix >= 0; --left_ix) {
      if (left_lcp == -1 || left_lcp > lcp[left_ix]) {
        left_lcp = lcp[left_ix];
      }
      if (left_lcp == 0) break;
      if (sarray[left_ix] < idx) break;
    }
    if (left_ix >= 0) {
      max_lcp = left_lcp;
      max_lcp_ix = left_ix;
    }

    int right_lcp = -1;
    int right_ix;
    for (right_ix = pos[idx]; right_ix < size - 1; ++right_ix) {
      if (right_lcp == -1 || right_lcp > lcp[right_ix]) {
        right_lcp = lcp[right_ix];
      }
      // Stop if we have better result from the left side already.
      if (right_lcp < max_lcp) break;
      if (right_lcp == 0) break;
      if (sarray[right_ix] < idx) break;
    }
    if (right_lcp > max_lcp && right_ix < size - 1) {
      max_lcp = right_lcp;
      max_lcp_ix = right_ix;
    }

    if (max_lcp >= min_length) {
      int dist = idx - sarray[max_lcp_ix];
      if (dist <= 0) {
        printf("idx = %d, pos[idx] = %u\n", idx, pos[idx]);
        printf("left_ix = %d, right_ix = %d\n",
                left_ix, right_ix);
        printf("left_lcp = %d, right_lcp = %d\n",
                left_lcp, right_lcp);
        printf("sarray[left_ix] = %d, sarray[right_ix] = %d\n",
                sarray[left_ix], sarray[right_ix]);
        assert(dist > 0);
      }
      fputc(1, fout);
      fwrite(&idx, sizeof(int), 1, fout);  // Position in input.
      fwrite(&dist, sizeof(int), 1, fout);  // Backward distance.
    }
  }
}

int main(int argc, char* argv[]) {
  ParseCommandLineFlags(&argc, &argv, true);
  if (argc != 3) {
    printf("usage: %s input_file output_file\n", argv[0]);
    return 1;
  }

  FILE* fin = fopen(argv[1], "rb");
  FILE* fout = fopen(argv[2], "w");

  fseek(fin, 0, SEEK_END);
  int input_size = ftell(fin);
  fseek(fin, 0, SEEK_SET);
  printf("The file size is %u bytes\n", input_size);

  input_type* storage = new input_type[input_size];

  ReadInput(fin, storage, input_size);
  fclose(fin);

  sarray_type* sarray = new sarray_type[input_size];
  saisxx(storage, sarray, input_size);
  printf("Suffix array calculated.\n");

  // Inverse suffix array.
  uint32_t* pos = new uint32_t[input_size];

  lcp_type* lcp = new lcp_type[input_size];
  BuildLCP(storage, sarray, lcp, input_size, pos);
  printf("LCP array constructed.\n");

  ProcessReferences(storage, sarray, lcp, input_size, pos, fout);
  fclose(fout);
  return 0;
}