aboutsummaryrefslogtreecommitdiff
path: root/gcc/rust/lex/rust-input-source.h
blob: 9b7114f4f1d54ea11cac1b98089a17c84841a9d1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
// Copyright (C) 2020-2025 Free Software Foundation, Inc.

// This file is part of GCC.

// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.

// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.

// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3.  If not see
// <http://www.gnu.org/licenses/>.

#ifndef RUST_INPUT_SOURCE_H
#define RUST_INPUT_SOURCE_H

#include "rust-codepoint.h"
#include "optional.h"

namespace Rust {

// The three bytes of the UTF-8 byte order mark (EF BB BF), in the order in
// which they appear in the input.  InputSource::next_codepoint drops this
// sequence instead of returning it as a codepoint.
constexpr uint8_t UTF8_BOM1 = 0xEF;
constexpr uint8_t UTF8_BOM2 = 0xBB;
constexpr uint8_t UTF8_BOM3 = 0xBF;

// Input source wrapper thing.
class InputSource
{
private:
  // position of current character
  unsigned int pos;
  std::vector<Codepoint> chars;
  bool is_valid_utf8;

  // Overload operator () to return next char from input stream.
  virtual int next_byte () = 0;

  Codepoint next_codepoint ()
  {
    uint32_t input = next_byte ();

    if ((int32_t) input == EOF)
      return Codepoint::eof ();
    else if (input <= MAX_ASCII_CODEPOINT)
      {
	// ascii -- 1 byte
	return {input};
      }
    else if ((input & 0xC0) == 0x80)
      {
	// invalid (continuation; can't be first char)
	return {CODEPOINT_INVALID};
      }
    else if ((input & 0xE0) == 0xC0)
      {
	// 2 bytes
	uint8_t input2 = next_byte ();
	if ((input2 & 0xC0) != 0x80)
	  return {CODEPOINT_INVALID};

	uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
	return output;
      }
    else if ((input & 0xF0) == 0xE0)
      {
	// 3 bytes or UTF-8 BOM
	uint8_t input2 = next_byte ();
	// If the second byte is equal to 0xBB then the input is no longer a
	// valid UTF-8 char. Then, we check if the third byte makes up a UTF
	// BOM.
	if (input == UTF8_BOM1 && input2 == UTF8_BOM2)
	  {
	    uint8_t input3 = next_byte ();
	    if (input3 == UTF8_BOM3)
	      // found BOM
	      return next_codepoint ();
	    else
	      return {CODEPOINT_INVALID};
	  }

	if ((input2 & 0xC0) != 0x80)
	  return {CODEPOINT_INVALID};

	uint8_t input3 = next_byte ();

	if ((input3 & 0xC0) != 0x80)
	  return {CODEPOINT_INVALID};

	uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
			  | ((input3 & 0x3F) << 0);
	return {output};
      }
    else if ((input & 0xF8) == 0xF0)
      {
	// 4 bytes
	uint8_t input2 = next_byte ();
	if ((input2 & 0xC0) != 0x80)
	  return {CODEPOINT_INVALID};

	uint8_t input3 = next_byte ();
	if ((input3 & 0xC0) != 0x80)
	  return {CODEPOINT_INVALID};

	uint8_t input4 = next_byte ();
	if ((input4 & 0xC0) != 0x80)
	  return {CODEPOINT_INVALID};

	uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
			  | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
	return {output};
      }
    else
      {
	return {CODEPOINT_INVALID};
      }
  }

protected:
  // This method must be called by the constructor to initialize the input
  // source. We cannot move this to the constructor because it calls a
  // virtual method .
  void init ()
  {
    // Check if the input source is valid as utf-8 and copy all characters to
    // `chars`.
    Codepoint char32 = next_codepoint ();
    while (!char32.is_eof () && char32 != CODEPOINT_INVALID)
      {
	chars.push_back (char32);
	char32 = next_codepoint ();
      }

    if (char32 == CODEPOINT_INVALID)
      {
	// Input source is not valid as utf-8.
	is_valid_utf8 = false;
      }
  }

public:
  InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}

  virtual ~InputSource () {}

  // Checks if input source is a valid UTF-8 string
  bool is_valid () { return is_valid_utf8; }

  // get the next UTF-8 character
  Codepoint next ()
  {
    if (pos >= chars.size ())
      return Codepoint::eof ();
    else
      {
	Codepoint c = chars[pos];
	pos++;
	return c;
      }
  }

  // Returns codepoint if input source is a valid UTF-8 string. Returns
  // nullopt otherwise.
  tl::optional<std::vector<Codepoint>> get_chars ()
  {
    if (is_valid ())
      return {chars};
    else
      return tl::nullopt;
  }
};

// InputSource whose bytes are read from a C stdio stream.  The stream is
// consumed by init () while the object is being constructed; this class
// never closes it.
class FileInputSource : public InputSource
{
private:
  // Stream the bytes are pulled from.
  FILE *stream;

  // One byte per call, straight from the stream; fgetc yields EOF at the end
  // of the file or on a read error.
  int next_byte () override
  {
    return fgetc (stream);
  }

public:
  // Build an input source on top of the already opened stream INPUT and
  // decode its contents right away.
  FileInputSource (FILE *input) : InputSource (), stream (input)
  {
    init ();
  }
};

// InputSource whose bytes come from an in-memory string, starting at a given
// byte offset.
class BufferInputSource : public InputSource
{
private:
  // String being read.  Only a reference is kept, so the string must stay
  // alive at least for the duration of the constructor, which reads it
  // through init ().
  const std::string &text;
  // Index in `text` of the next byte to hand out.
  size_t cursor;

  // Hand out the byte under the cursor and step past it, or report EOF when
  // the cursor is at or beyond the end of the string.
  int next_byte () override
  {
    if (cursor < text.size ())
      {
	// Convert through uint8_t so that bytes >= 0x80 come out as 128-255
	// instead of negative values.
	const uint8_t byte = static_cast<uint8_t> (text[cursor]);
	cursor++;
	return byte;
      }

    return EOF;
  }

public:
  // Create a new input source from the string B, reading from byte OFFSET
  // onwards, and decode its contents right away.
  BufferInputSource (const std::string &b, size_t offset)
    : InputSource (), text (b), cursor (offset)
  {
    init ();
  }
};

} // namespace Rust

#endif