aboutsummaryrefslogtreecommitdiff
path: root/gcc/rust/lex/rust-input-source.h
blob: 07137debb8f3cf481c81c23540a98bee834d5c76 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#ifndef RUST_INPUT_SOURCE_H
#define RUST_INPUT_SOURCE_H

#include "rust-codepoint.h"
#include "optional.h"

namespace Rust {
// Input source wrapper.  A concrete subclass supplies raw bytes through
// next_byte (); init () decodes them as UTF-8 into `chars`, and next () then
// hands the decoded codepoints out one at a time.
class InputSource
{
private:
  // Index into `chars` of the codepoint the next call to next () returns.
  unsigned int pos;
  // Codepoints decoded by init (), in input order.
  std::vector<Codepoint> chars;
  // Cleared by init () when the input turns out not to be valid UTF-8.
  bool is_valid_utf8;

  // Return the next raw byte of the input, or EOF once it is exhausted.
  virtual int next_byte () = 0;

  // Decode one UTF-8 sequence from next_byte ().
  //
  // Returns Codepoint::eof () when the input ends on a sequence boundary, and
  // the sentinel 0xFFFE when the bytes are not well-formed UTF-8: a stray
  // continuation byte, a truncated sequence, an overlong encoding, a
  // surrogate, or a value above U+10FFFF.  U+FFFE is a noncharacter, so an
  // input that really encodes it (EF BF BE) is reported as invalid as well.
  //
  // A UTF-8 byte order mark (EF BB BF) is skipped wherever it appears and the
  // codepoint following it is returned instead.
  Codepoint next_codepoint ()
  {
    uint32_t input = next_byte ();

    if ((int32_t) input == EOF)
      return Codepoint::eof ();
    else if (input < 128)
      {
	// ascii -- 1 byte
	return {input};
      }
    else if ((input & 0xC0) == 0x80)
      {
	// invalid (continuation; can't be first char)
	return {0xFFFE};
      }
    else if ((input & 0xE0) == 0xC0)
      {
	// 2 bytes
	// An EOF here is truncated to 0xFF, which is not a continuation byte,
	// so a sequence cut short by the end of input is rejected too.  The
	// same holds for every continuation byte read below.
	uint8_t input2 = next_byte ();
	if ((input2 & 0xC0) != 0x80)
	  return {0xFFFE};

	uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);

	// overlong: the value would have fit in 1 byte
	if (output < 0x80)
	  return {0xFFFE};

	return {output};
      }
    else if ((input & 0xF0) == 0xE0)
      {
	// 3 bytes or UTF-8 BOM
	uint8_t input2 = next_byte ();
	if ((input2 & 0xC0) != 0x80)
	  return {0xFFFE};

	uint8_t input3 = next_byte ();
	if ((input3 & 0xC0) != 0x80)
	  return {0xFFFE};

	// Only the exact sequence EF BB BF is a BOM.  Other EF BB xx
	// sequences are ordinary characters (U+FEC0..U+FEFE) and must go
	// through the regular decoding below.
	if (input == 0xEF && input2 == 0xBB && input3 == 0xBF)
	  // found BOM
	  return next_codepoint ();

	uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
			  | ((input3 & 0x3F) << 0);

	// overlong: the value would have fit in 2 bytes
	if (output < 0x800)
	  return {0xFFFE};

	// surrogates are not valid in UTF-8
	if (output >= 0xD800 && output <= 0xDFFF)
	  return {0xFFFE};

	return {output};
      }
    else if ((input & 0xF8) == 0xF0)
      {
	// 4 bytes
	uint8_t input2 = next_byte ();
	if ((input2 & 0xC0) != 0x80)
	  return {0xFFFE};

	uint8_t input3 = next_byte ();
	if ((input3 & 0xC0) != 0x80)
	  return {0xFFFE};

	uint8_t input4 = next_byte ();
	if ((input4 & 0xC0) != 0x80)
	  return {0xFFFE};

	uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
			  | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);

	// overlong (would have fit in 3 bytes) or beyond the last codepoint
	if (output < 0x10000 || output > 0x10FFFF)
	  return {0xFFFE};

	return {output};
      }
    else
      {
	// 0xF8..0xFF never start a UTF-8 sequence
	return {0xFFFE};
      }
  }

protected:
  // Check if the input source is valid as utf-8 and copy all characters to
  // `chars`.  Decoding stops at the first invalid sequence; in that case
  // `chars` keeps the codepoints decoded so far and the source is marked as
  // invalid.  Meant to be called by subclasses once next_byte () is usable.
  void init ()
  {
    Codepoint char32 = next_codepoint ();
    while (!char32.is_eof () && char32 != 0xFFFE)
      {
	chars.push_back (char32);
	char32 = next_codepoint ();
      }

    if (char32 == 0xFFFE)
      {
	// Input source is not valid as utf-8.
	is_valid_utf8 = false;
      }
  }

public:
  // Start out empty and valid; init () fills `chars` and may clear the flag.
  InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}

  virtual ~InputSource () {}

  // Checks if input source is a valid UTF-8 string
  bool is_valid () const { return is_valid_utf8; }

  // Get the next decoded character and advance past it.  Returns
  // Codepoint::eof () once every character has been handed out.
  Codepoint next ()
  {
    if (pos >= chars.size ())
      return Codepoint::eof ();
    else
      {
	Codepoint c = chars[pos];
	pos++;
	return c;
      }
  }

  // Returns a copy of all decoded codepoints if input source is a valid
  // UTF-8 string. Returns nullopt otherwise.
  tl::optional<std::vector<Codepoint>> get_chars () const
  {
    if (is_valid ())
      return {chars};
    else
      return tl::nullopt;
  }
};

// Input source that takes its bytes from a C stdio stream.  The stream is
// only read from here; nothing in this class closes it.
class FileInputSource : public InputSource
{
private:
  // Stream the bytes are read from.
  FILE *input;

  // Fetch one byte from the stream.  fgetc reports end of file and read
  // errors alike as EOF.
  int next_byte () override
  {
    int byte = fgetc (input);
    return byte;
  }

public:
  // Wrap the stream INPUT and decode its content right away.
  FileInputSource (FILE *input) : InputSource (), input (input)
  {
    // TODO make this better?
    init ();
  }
};

// Input source that takes its bytes from an in-memory string.
class BufferInputSource : public InputSource
{
private:
  // String being read.  It is held by reference, so it has to stay alive for
  // as long as next_byte () may be called (in the code visible here, that is
  // during init () in the constructor).
  const std::string &buffer;
  // Index of the next byte of `buffer` to hand out.
  size_t offs;

  // Hand out the byte at `offs` and step past it, or EOF once `offs` is at
  // or beyond the end of the buffer.  Going through uint8_t keeps bytes
  // >= 0x80 from coming back negative where char is signed.
  int next_byte () override
  {
    if (offs < buffer.size ())
      {
	const uint8_t byte = (uint8_t) buffer[offs];
	++offs;
	return byte;
      }

    return EOF;
  }

public:
  // Create new input source from the string B, starting at byte OFFSET, and
  // decode it right away.
  BufferInputSource (const std::string &b, size_t offset)
    : InputSource (), buffer (b), offs (offset)
  {
    // TODO make this better?
    init ();
  }
};

} // namespace Rust

#endif