aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
Diffstat (limited to 'gcc')
-rw-r--r-- gcc/rust/lex/rust-lex.cc 4436
-rw-r--r-- gcc/rust/lex/rust-lex.h 56
2 files changed, 2225 insertions, 2267 deletions
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index 322079e..1f0f9cb 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -8,2333 +8,2285 @@
#include <sstream> // for ostringstream
namespace Rust {
- // TODO: move to separate compilation unit?
- // overload += for uint32_t to allow 32-bit encoded utf-8 to be added
- std::string& operator+=(std::string& str, Codepoint char32) {
- if (char32.value < 0x80) {
- str += static_cast<char>(char32.value);
- } else if (char32.value < (0x1F + 1) << (1 * 6)) {
- str += static_cast<char>(0xC0 | ((char32.value >> 6) & 0x1F));
- str += static_cast<char>(0x80 | ((char32.value >> 0) & 0x3F));
- } else if (char32.value < (0x0F + 1) << (2 * 6)) {
- str += static_cast<char>(0xE0 | ((char32.value >> 12) & 0x0F));
- str += static_cast<char>(0x80 | ((char32.value >> 6) & 0x3F));
- str += static_cast<char>(0x80 | ((char32.value >> 0) & 0x3F));
- } else if (char32.value < (0x07 + 1) << (3 * 6)) {
- str += static_cast<char>(0xF0 | ((char32.value >> 18) & 0x07));
- str += static_cast<char>(0x80 | ((char32.value >> 12) & 0x3F));
- str += static_cast<char>(0x80 | ((char32.value >> 6) & 0x3F));
- str += static_cast<char>(0x80 | ((char32.value >> 0) & 0x3F));
- } else {
- fprintf(stderr, "Invalid unicode codepoint found: '%u' \n", char32.value);
- }
- return str;
+/* Appends the UTF-8 encoding of the codepoint CHAR32 to STR and returns STR,
+ * so that 32-bit codepoints can be pushed onto an ordinary byte string.
+ * A value too large for a four-byte sequence is reported on stderr and
+ * leaves STR unchanged.
+ * TODO: move to separate compilation unit?  */
+std::string &
+operator+= (std::string &str, Codepoint char32)
+{
+  const auto cp = char32.value;
+
+  /* Builds a continuation byte (10xxxxxx) from the six bits of the codepoint
+   * that start SHIFT bits up.  */
+  auto trail = [cp] (int shift) {
+    return static_cast<char> (0x80 | ((cp >> shift) & 0x3F));
+  };
+
+  if (cp < 0x80)
+    {
+      // one byte: the value itself
+      str += static_cast<char> (cp);
 }
+  else if (cp < 0x800)
+    {
+      // two bytes: 110xxxxx 10xxxxxx
+      str += static_cast<char> (0xC0 | ((cp >> 6) & 0x1F));
+      str += trail (0);
+    }
+  else if (cp < 0x10000)
+    {
+      // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
+      str += static_cast<char> (0xE0 | ((cp >> 12) & 0x0F));
+      str += trail (6);
+      str += trail (0);
+    }
+  else if (cp < 0x200000)
+    {
+      // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      str += static_cast<char> (0xF0 | ((cp >> 18) & 0x07));
+      str += trail (12);
+      str += trail (6);
+      str += trail (0);
+    }
+  else
+    {
+      // does not fit in 21 bits: nothing is appended
+      fprintf (stderr, "Invalid unicode codepoint found: '%u' \n", cp);
+    }
+
+  return str;
+}
+
+/* Returns this codepoint as a string of its own, built by appending the
+ * codepoint to an empty string with the operator+= overload for Codepoint.  */
+std::string
+Codepoint::as_string ()
+{
+  std::string encoded;
+  encoded += *this;
+  return encoded;
+}
+
+/* Tells whether NUMBER may appear inside a float literal: a decimal digit or
+ * an exponent marker.  '_' and '.' are deliberately not accepted here because
+ * handling them needs lookahead.  */
+bool
+is_float_digit (char number)
+{
+  if (number == 'e' || number == 'E')
+    return true;
+
+  return ISDIGIT (number);
+}
+
+/* Tells whether NUMBER is a hexadecimal digit.  Currently just ISXDIGIT from
+ * safe-ctype, kept behind a function in case Rust's rules turn out to
+ * differ.  */
+bool
+is_x_digit (char number)
+{
+  const bool is_hex = ISXDIGIT (number);
+  return is_hex;
+}
+
+// Tells whether NUMBER is one of the octal digits '0' to '7'.
+bool
+is_octal_digit (char number)
+{
+  if (number < '0')
+    return false;
+
+  return number <= '7';
+}
+
+// Tells whether NUMBER is one of the binary digits '0' and '1'.
+bool
+is_bin_digit (char number)
+{
+  switch (number)
+    {
+    case '0':
+    case '1':
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* Returns false when CHARACTER is '.', '_' or a letter, and true for every
+ * other character.  */
+bool
+check_valid_float_dot_end (char character)
+{
+  const bool rejected
+    = character == '.' || character == '_' || ISALPHA (character);
+
+  return !rejected;
+}
+
+/* Tells whether CHARACTER is whitespace.  Currently just ISSPACE from
+ * safe-ctype, but may change in future.  */
+bool
+is_whitespace (char character)
+{
+  const bool is_space = ISSPACE (character);
+  return is_space;
+}
+
+/* Creates a lexer that reads from INPUT.  Line and column counting both start
+ * at 1, and LINEMAP is told straight away that FILENAME is being entered at
+ * that first line.  */
+Lexer::Lexer (const char *filename, FILE *input, Linemap *linemap)
+  : input (input),
+    current_line (1),
+    current_column (1),
+    line_map (linemap),
+    input_source (input),
+    input_queue (input_source),
+    token_source (this),
+    token_queue (token_source)
+{
+  // the line table must know which file (and line) locations belong to
+  line_map->start_file (filename, current_line);
+}
+
+// Destroys the lexer.  Intentionally does not touch the line map - see below.
+Lexer::~Lexer ()
+{
+  /* The original destructor code is equivalent to calling stop, but stop is
+   * apparently meant to run only once all files have finished parsing, as a
+   * final cleanup.  Then again, the code it calls to leave a line map is
+   * described in the GCC docs as useful for "just leaving an included header"
+   * and similar, so the line mapping here may need fixing.
+   * FIXME: find out whether this occurs.  */
+  // line_map->stop();
+}
+
+/* Returns the location the line map reports for the current column.
+ *
+ * TODO: find a way to avoid the virtual function call in the tight loop.
+ * CRTP is the best idea so far, but it would make storing the "base class"
+ * awkward (a template parameter would be needed everywhere); in practice that
+ * would mostly just look ugly and force enclosing classes such as Parser to
+ * take a type parameter too.  A macro might be better at that point, possibly
+ * replaced by constexpr if or similar where that is possible.  */
+Location
+Lexer::get_current_location ()
+{
+  Location here = line_map->get_location (current_column);
+  return here;
+}
+
+/* Returns what the input queue reports N positions ahead, without consuming
+ * anything.  */
+int
+Lexer::peek_input (int n)
+{
+  const int ahead = input_queue.peek (n);
+  return ahead;
+}
+
+// Shorthand for peek_input (0): looks at the front of the input queue.
+int
+Lexer::peek_input ()
+{
+  const int front = peek_input (0);
+  return front;
+}
+
+/* Forwards N to the input queue's skip.
+ * NOTE(review): callers in this file use skip_input (0) to drop one character
+ * and skip_input (1) to drop two - confirm against the queue's definition.  */
+void
+Lexer::skip_input (int n)
+{
+  input_queue.skip (n);
+  return;
+}
+
+// Shorthand for skip_input (0).
+void
+Lexer::skip_input ()
+{
+  const int no_extra = 0;
+  skip_input (no_extra);
+}
+
+// Asks the token queue to put REPLACEMENT in place of its current value.
+void
+Lexer::replace_current_token (TokenPtr replacement)
+{
+  token_queue.replace_current_value (replacement);
+  return;
+}
+
+/* File-local lookup tables for classify_keyword, which binary-searches
+ * keyword_index.  Both arrays are generated from RS_TOKEN_LIST with x-macros
+ * (plain RS_TOKEN entries expand to nothing), so entry N of keyword_keys is
+ * the token id belonging to the spelling in entry N of keyword_index.
+ * NOTE(review): the binary search needs keyword_index to be sorted, i.e. the
+ * RS_TOKEN_KEYWORD entries of RS_TOKEN_LIST must be listed in ascending
+ * order of spelling - confirm against the list's definition.  */
+namespace {
+// keyword spellings
+const std::string keyword_index[] = {
+#define RS_TOKEN(x, y)
+#define RS_TOKEN_KEYWORD(name, keyword) keyword,
+  RS_TOKEN_LIST
+#undef RS_TOKEN_KEYWORD
+#undef RS_TOKEN
+};
- std::string Codepoint::as_string() {
- std::string str;
+
+// token ids, parallel to keyword_index; read-only like its sibling
+const TokenId keyword_keys[] = {
+#define RS_TOKEN(x, y)
+#define RS_TOKEN_KEYWORD(name, keyword) name,
+  RS_TOKEN_LIST
+#undef RS_TOKEN_KEYWORD
+#undef RS_TOKEN
+};
+
+// number of entries in each of the two tables
+const int num_keywords = sizeof (keyword_index) / sizeof (*keyword_index);
+} // namespace
+
+/* Classifies STR: returns the token id of the keyword it spells, or
+ * IDENTIFIER when it is not found in the keyword table.
+ * TODO: possibly replace this x-macro system with something like hash map?  */
+TokenId
+Lexer::classify_keyword (const std::string &str)
+{
+  const std::string *const table_begin = keyword_index;
+  const std::string *const table_end = table_begin + num_keywords;
+
+  // first entry that does not order before STR
+  const std::string *const hit = std::lower_bound (table_begin, table_end, str);
+
+  if (hit != table_end && *hit == str)
+    return keyword_keys[hit - table_begin];
+
+  return IDENTIFIER;
+}
+
+/* Lexes the next token from the input and returns it.  Whitespace, comments
+ * and a shebang line at the very start of the file are skipped.  An
+ * END_OF_FILE token is returned when peek_input yields EOF.  A character that
+ * cannot start any token is reported with rust_error_at and lexing carries on
+ * with the character after it.  */
+TokenPtr
+Lexer::build_token ()
+{
+  /* Completes a punctuation token whose first character is already consumed:
+   * drops the EXTRA characters that finish it, moves the column past the
+   * whole token and builds the token at START.  As everywhere else in this
+   * file, skip_input (0) is relied on to drop one character and
+   * skip_input (1) to drop two.  */
+  auto punct = [this] (TokenId id, Location start, int extra) -> TokenPtr {
+    if (extra > 0)
+      skip_input (extra - 1);
+    current_column += extra + 1;
+    return Token::make (id, start);
+  };
+
+  // Book-keeping for a newline that has just been consumed.
+  auto begin_new_line = [this] () {
+    current_line++;
+    current_column = 1;
+    // tell line_table that new line starts
+    line_map->start_line (current_line, max_column_hint);
+  };
+
+  /* Each iteration consumes one character and either returns the token it
+   * starts or goes round again (whitespace, comments, errors).  */
+  while (true)
+    {
+      Location loc = get_current_location ();
+      current_char = peek_input ();
+      skip_input ();
+
+      // return end of file token if end of file
+      if (current_char == EOF)
+	return Token::make (END_OF_FILE, loc);
+
+      /* Shebang: "#!/" as the very first characters of the file.  Only
+       * lookahead is used to recognise it, so a '#' that starts anything
+       * else (such as "#![...]") is left intact for the switch below.  */
+      if (current_line == 1 && current_column == 1 && current_char == '#'
+	  && peek_input () == '!' && peek_input (1) == '/')
+	{
+	  // drop "!/"
+	  skip_input (1);
+
+	  /* Ignore the rest of the line.  Stopping at EOF as well keeps a
+	   * file that ends inside the shebang line from looping forever.  */
+	  current_char = peek_input ();
+	  while (current_char != '\n' && current_char != EOF)
+	    {
+	      skip_input ();
+	      current_char = peek_input ();
+	    }
+
+	  if (current_char == '\n')
+	    {
+	      skip_input ();
+	      begin_new_line ();
+	    }
+	  continue;
+	}
+
+      // if not end of file, start tokenising
+      switch (current_char)
+	{
+	/* ignore whitespace characters for tokens but continue updating
+	 * location */
+	case '\n': // newline
+	  begin_new_line ();
+	  continue;
+	case ' ': // space
+	  current_column++;
+	  continue;
+	case '\t': // tab
+	  // width of a tab is not well-defined, assume 8 spaces
+	  current_column += 8;
+	  continue;
+
+	// punctuation - actual tokens
+	case '=':
+	  if (peek_input () == '>')
+	    return punct (MATCH_ARROW, loc, 1); // match arm arrow
+	  if (peek_input () == '=')
+	    return punct (EQUAL_EQUAL, loc, 1); // equality operator
+	  return punct (EQUAL, loc, 0); // assignment operator
+	case '(':
+	  return punct (LEFT_PAREN, loc, 0);
+	case '-':
+	  if (peek_input () == '>')
+	    return punct (RETURN_TYPE, loc, 1); // return type specifier
+	  if (peek_input () == '=')
+	    return punct (MINUS_EQ, loc, 1); // minus-assign
+	  return punct (MINUS, loc, 0);
+	case '+':
+	  if (peek_input () == '=')
+	    return punct (PLUS_EQ, loc, 1); // add-assign
+	  return punct (PLUS, loc, 0);
+	case ')':
+	  return punct (RIGHT_PAREN, loc, 0);
+	case ';':
+	  return punct (SEMICOLON, loc, 0);
+	case '*':
+	  if (peek_input () == '=')
+	    return punct (ASTERISK_EQ, loc, 1); // multiplication-assign
+	  return punct (ASTERISK, loc, 0);
+	case ',':
+	  return punct (COMMA, loc, 0);
+	case '/':
+	  if (peek_input () == '=')
+	    return punct (DIV_EQ, loc, 1); // division-assign
+
+	  if (peek_input () == '/')
+	    {
+	      // TODO: single-line doc comments
+
+	      // single line comment: drop the second '/'
+	      skip_input ();
+	      current_column += 2;
+
+	      /* Ignore everything up to, but not including, the newline so
+	       * that the newline case above does the line accounting.  Each
+	       * character is looked at before it is dropped, so an empty
+	       * comment cannot swallow its own newline.  */
+	      current_char = peek_input ();
+	      while (current_char != '\n' && current_char != EOF)
+		{
+		  skip_input ();
+		  current_column++; // not used
+		  current_char = peek_input ();
+		}
+	      continue;
+	    }
+
+	  if (peek_input () == '*')
+	    {
+	      // block comment: drop the '*'
+	      skip_input ();
+	      current_column += 2;
+
+	      // TODO: block doc comments
+
+	      // block comments nest, so count the open ones
+	      int level = 1;
+	      while (level > 0)
+		{
+		  current_char = peek_input ();
+
+		  if (current_char == EOF)
+		    {
+		      /* Left for the next iteration of the outer loop, which
+		       * turns it into the END_OF_FILE token.  */
+		      rust_error_at (
+			get_current_location (),
+			"unexpected EOF while looking for end of comment");
+		      break;
+		    }
+
+		  if (current_char == '/' && peek_input (1) == '*')
+		    {
+		      // nested comment opens: skip /* characters
+		      skip_input (1);
+		      current_column += 2;
+		      level += 1;
+		    }
+		  else if (current_char == '*' && peek_input (1) == '/')
+		    {
+		      // innermost comment closes: skip */ characters
+		      skip_input (1);
+		      current_column += 2;
+		      level -= 1;
+		    }
+		  else if (current_char == '\n')
+		    {
+		      // keep line numbers right after a multi-line comment
+		      skip_input ();
+		      begin_new_line ();
+		    }
+		  else
+		    {
+		      skip_input ();
+		      current_column++; // for error-handling
+		    }
+		}
+
+	      // refresh new token
+	      continue;
+	    }
+
+	  return punct (DIV, loc, 0); // division
+	case '%':
+	  if (peek_input () == '=')
+	    return punct (PERCENT_EQ, loc, 1); // modulo-assign
+	  return punct (PERCENT, loc, 0);
+	case '^':
+	  if (peek_input () == '=')
+	    return punct (CARET_EQ, loc, 1); // xor-assign?
+	  return punct (CARET, loc, 0); // xor?
+	case '<':
+	  if (peek_input () == '<')
+	    {
+	      if (peek_input (1) == '=')
+		return punct (LEFT_SHIFT_EQ, loc, 2); // left-shift assign
+	      return punct (LEFT_SHIFT, loc, 1);
+	    }
+	  if (peek_input () == '=')
+	    return punct (LESS_OR_EQUAL, loc, 1); // smaller than or equal to
+	  return punct (LEFT_ANGLE, loc, 0); // smaller than
+	case '>':
+	  if (peek_input () == '>')
+	    {
+	      if (peek_input (1) == '=')
+		return punct (RIGHT_SHIFT_EQ, loc, 2); // right-shift-assign
+	      return punct (RIGHT_SHIFT, loc, 1);
+	    }
+	  if (peek_input () == '=')
+	    return punct (GREATER_OR_EQUAL, loc, 1); // larger than or equal to
+	  return punct (RIGHT_ANGLE, loc, 0); // larger than
+	case ':':
+	  if (peek_input () == ':')
+	    return punct (SCOPE_RESOLUTION, loc, 1); // scope resolution ::
+	  return punct (COLON, loc, 0); // single colon :
+	case '!':
+	  // no special handling for macros in lexer?
+	  if (peek_input () == '=')
+	    return punct (NOT_EQUAL, loc, 1); // not equal boolean operator
+	  return punct (EXCLAM, loc, 0); // unary not operator
+	case '?':
+	  return punct (QUESTION_MARK, loc, 0);
+	case '#':
+	  return punct (HASH, loc, 0);
+	case '[':
+	  return punct (LEFT_SQUARE, loc, 0);
+	case ']':
+	  return punct (RIGHT_SQUARE, loc, 0);
+	case '{':
+	  return punct (LEFT_CURLY, loc, 0);
+	case '}':
+	  return punct (RIGHT_CURLY, loc, 0);
+	case '@':
+	  return punct (PATTERN_BIND, loc, 0);
+	case '$':
+	  return punct (DOLLAR_SIGN, loc, 0);
+	case '~':
+	  return punct (TILDE, loc, 0);
+	case '\\':
+	  return punct (BACKSLASH, loc, 0);
+	case '`':
+	  return punct (BACKTICK, loc, 0);
+	case '|':
+	  if (peek_input () == '=')
+	    return punct (PIPE_EQ, loc, 1); // bitwise or-assign?
+	  if (peek_input () == '|')
+	    return punct (OR, loc, 1); // logical or
+	  return punct (PIPE, loc, 0); // bitwise or
+	case '&':
+	  if (peek_input () == '=')
+	    return punct (AMP_EQ, loc, 1); // bitwise and-assign?
+	  if (peek_input () == '&')
+	    return punct (LOGICAL_AND, loc, 1); // logical and
+	  return punct (AMP, loc, 0); // bitwise and/reference
+	case '.':
+	  if (peek_input () == '.')
+	    {
+	      if (peek_input (1) == '.')
+		return punct (ELLIPSIS, loc, 2); // ellipsis
+	      if (peek_input (1) == '=')
+		return punct (DOT_DOT_EQ, loc, 2); // ..=
+	      return punct (DOT_DOT, loc, 1); // ..
+	    }
+	  // single dot only if followed by a non-number - otherwise is float
+	  if (!ISDIGIT (peek_input ()))
+	    return punct (DOT, loc, 0);
+	  break;
+	default:
+	  // not punctuation: try the literal and identifier forms below
+	  break;
+	}
+      // TODO: special handling of _ in the lexer? instead of being identifier
+
+      // byte character, byte string and raw byte string literals
+      if (current_char == 'b')
+	{
+	  if (peek_input () == '\'')
+	    return parse_byte_char (loc);
+	  else if (peek_input () == '"')
+	    return parse_byte_string (loc);
+	  else if (peek_input () == 'r'
+		   && (peek_input (1) == '#' || peek_input (1) == '"'))
+	    return parse_raw_byte_string (loc);
+	}
+
+      // raw identifiers and raw strings
+      if (current_char == 'r')
+	{
+	  int peek = peek_input ();
+	  int peek1 = peek_input (1);
+
+	  if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
+	    {
+	      TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
+	      if (raw_ident_ptr != nullptr)
+		return raw_ident_ptr;
+	    }
+	  else
+	    {
+	      TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc);
+	      if (maybe_raw_string_ptr != nullptr)
+		return maybe_raw_string_ptr;
+	    }
+	}
+
+      // find identifiers and keywords
+      if (ISALPHA (current_char) || current_char == '_')
+	return parse_identifier_or_keyword (loc);
+
+      // int and float literals
+      if (ISDIGIT (current_char) || current_char == '.')
+	{ // _ not allowed as first char
+	  if (current_char == '0' && !ISDIGIT (peek_input ()))
+	    {
+	      // handle binary, octal, hex literals
+	      TokenPtr non_dec_int_lit_ptr
+		= parse_non_decimal_int_literals (loc);
+	      if (non_dec_int_lit_ptr != nullptr)
+		return non_dec_int_lit_ptr;
+	    }
+	  else
+	    {
+	      // handle decimals (integer or float)
+	      TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc);
+	      if (decimal_or_float_ptr != nullptr)
+		return decimal_or_float_ptr;
+	    }
+	}
+
+      // string literals
+      if (current_char == '"')
+	return parse_string (loc);
+
+      // char literals and lifetime names
+      if (current_char == '\'')
+	{
+	  TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc);
+	  if (char_or_lifetime_ptr != nullptr)
+	    return char_or_lifetime_ptr;
+	}
+
+      // didn't match anything so error
+      rust_error_at (loc, "unexpected character '%x'", current_char);
+      current_column++;
+    }
+}
- // str += Codepoint (value);
- str += *this;
+/* Reads the type suffix that may follow a number literal, starting at the
+ * current character.  Returns the core type it names together with the number
+ * of characters consumed.  No suffix at all yields CORETYPE_UNKNOWN silently;
+ * an unrecognised suffix yields CORETYPE_UNKNOWN after reporting an error.  */
+std::pair<PrimitiveCoreType, int>
+Lexer::parse_in_type_suffix ()
+{
+  // the suffix spellings that are recognised and the core types they name
+  static const struct
+  {
+    const char *spelling;
+    PrimitiveCoreType type;
+  } known_suffixes[] = {
+    {"f32", CORETYPE_F32},     {"f64", CORETYPE_F64},
+    {"i8", CORETYPE_I8},       {"i16", CORETYPE_I16},
+    {"i32", CORETYPE_I32},     {"i64", CORETYPE_I64},
+    {"i128", CORETYPE_I128},   {"isize", CORETYPE_ISIZE},
+    {"u8", CORETYPE_U8},       {"u16", CORETYPE_U16},
+    {"u32", CORETYPE_U32},     {"u64", CORETYPE_U64},
+    {"u128", CORETYPE_U128},   {"usize", CORETYPE_USIZE},
+  };
+
+  std::string suffix;
+  suffix.reserve (5);
- return str;
- }
+  int consumed = 0;
- /* Includes all allowable float digits EXCEPT _ and . as that needs lookahead
- * for handling. */
- bool is_float_digit(char number) {
- return ISDIGIT(number) || number == 'E' || number == 'e';
- }
+
+  // gather every letter, digit and '_' that follows the number
+  while (ISALPHA (current_char) || ISDIGIT (current_char)
+	 || current_char == '_')
+    {
+      // '_' is consumed and counted like the rest but never stored
+      if (current_char != '_')
+	suffix += current_char;
- /* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or
- * whatever is different */
- bool is_x_digit(char number) {
- return ISXDIGIT(number);
- }
+
+      consumed++;
- bool is_octal_digit(char number) {
- return number >= '0' && number <= '7';
+
+      skip_input ();
+      current_char = peek_input ();
 }
- bool is_bin_digit(char number) {
- return number == '0' || number == '1';
+
+  // no type suffix: do nothing but also no error
+  if (suffix.empty ())
+    return std::make_pair (CORETYPE_UNKNOWN, consumed);
+
+  for (const auto &entry : known_suffixes)
+    {
+      if (suffix == entry.spelling)
+	return std::make_pair (entry.type, consumed);
 }
+
+  // something was written after the number, but not a known type
+  rust_error_at (get_current_location (), "unknown number suffix '%s'",
+		 suffix.c_str ());
- bool check_valid_float_dot_end(char character) {
- return character != '.' && character != '_' && !ISALPHA(character);
+  return std::make_pair (CORETYPE_UNKNOWN, consumed);
+}
+
+/* Reads the exponent part of a float literal if the current character starts
+ * one.  Returns the exponent text (the marker, a '-' sign if present, then
+ * the digits) together with the number of characters consumed; both are
+ * empty/zero when there is no exponent.  */
+std::pair<std::string, int>
+Lexer::parse_in_exponent_part ()
+{
+  std::string exponent;
+  int consumed = 0;
+
+  // nothing to read unless an exponent marker comes next
+  if (current_char != 'E' && current_char != 'e')
+    return std::make_pair (exponent, consumed);
+
+  // the marker is kept in the text as strtod works with it
+  exponent += current_char;
+  skip_input ();
+  current_char = peek_input ();
+  consumed++;
+
+  // optional sign: '-' is kept, '+' is consumed but not stored
+  if (current_char == '-' || current_char == '+')
+    {
+      if (current_char == '-')
+	exponent += '-';
+
+      skip_input ();
+      current_char = peek_input ();
+      consumed++;
+    }
+
+  // the exponent's digits are just another decimal number
+  const auto digits = parse_in_decimal ();
+  exponent += digits.first;
+  consumed += digits.second;
+
+  return std::make_pair (exponent, consumed);
+}
+
+/* Reads a run of decimal digits and '_' separators starting at the current
+ * character.  Returns the digits (separators removed) together with the
+ * number of characters consumed, separators included.  */
+std::pair<std::string, int>
+Lexer::parse_in_decimal ()
+{
+  std::string digits;
+  int consumed = 0;
+
+  for (; ISDIGIT (current_char) || current_char == '_'; consumed++)
+    {
+      // don't add _ to number
+      if (current_char != '_')
+	digits += current_char;
+
+      skip_input ();
+      current_char = peek_input ();
+    }
+
+  return std::make_pair (digits, consumed);
+}
+
+/* Parses an escape (or string continue) inside a "byte" character or string;
+ * unicode escapes are rejected.  Expects the current character to be the
+ * backslash.  OPENING_CHAR is the literal's quote and only selects the wording
+ * of the \u error.  Returns the escaped character, the number of characters
+ * consumed, and whether the escape was a string continue (in which case the
+ * character is 0 and the length comes from parse_partial_string_continue).
+ * After a reported error the character is 0.  */
+std::tuple<char, int, bool>
+Lexer::parse_escape (char opening_char)
+{
+  int consumed = 0;
+  char result = 0;
+
+  // step from the backslash onto the escape letter
+  skip_input ();
+  current_char = peek_input ();
+  consumed++;
+
+  // backslash at the end of a line: string continue
+  if (current_char == '\r' || current_char == '\n')
+    return std::make_tuple (0, parse_partial_string_continue (), true);
+
+  if (current_char == 'u')
+    {
+      rust_error_at (get_current_location (),
+		     "cannot have a unicode escape \\u in a byte %s!",
+		     opening_char == '\'' ? "character" : "string");
+      return std::make_tuple (result, consumed, false);
+    }
+
+  switch (current_char)
+    {
+    case 'x':
+      {
+	const auto hex = parse_partial_hex_escape ();
+	const long value = hex.first;
+	consumed += hex.second;
+
+	if (value < 0 || value > 255)
+	  rust_error_at (
+	    get_current_location (),
+	    "byte \\x escape '\\x%X' out of range - allows up to '\\xFF'",
+	    static_cast<unsigned int> (value));
+
+	result = static_cast<char> (value);
+      }
+      break;
+    case 'n':
+      result = '\n';
+      break;
+    case 'r':
+      result = '\r';
+      break;
+    case 't':
+      result = '\t';
+      break;
+    case '\\':
+      result = '\\';
+      break;
+    case '0':
+      result = '\0';
+      break;
+    case '\'':
+      result = '\'';
+      break;
+    case '"':
+      result = '"';
+      break;
+    default:
+      // nothing could be parsed; the offending character is not consumed
+      rust_error_at (get_current_location (), "unknown escape sequence '\\%c'",
+		     current_char);
+      return std::make_tuple (result, consumed, false);
+    }
+
+  // every escape that gets here still has its last character to skip
+  skip_input ();
+  current_char = peek_input ();
+  consumed++;
+
+  return std::make_tuple (result, consumed, false);
+}
+
+/* Parses an escape (or string continue) inside a character or string literal,
+ * unicode escapes included.  Expects the current character to be the
+ * backslash.  Returns the escaped codepoint, the number of characters
+ * consumed, and whether the escape was a string continue (in which case the
+ * length comes from parse_partial_string_continue).  OPENING_CHAR is not used
+ * by this function.  */
+std::tuple<Codepoint, int, bool>
+Lexer::parse_utf8_escape (char opening_char)
+{
+  Codepoint result;
+  int consumed = 0;
+
+  // step from the backslash onto the escape letter
+  skip_input ();
+  current_char = peek_input ();
+  consumed++;
+
+  // backslash at the end of a line: string continue
+  if (current_char == '\r' || current_char == '\n')
+    return std::make_tuple (0, parse_partial_string_continue (), true);
+
+  // a unicode escape consumes all of its own characters
+  if (current_char == 'u')
+    {
+      const auto unicode = parse_partial_unicode_escape ();
+      result = unicode.first;
+      consumed += unicode.second;
+
+      return std::make_tuple (result, consumed, false);
+    }
+
+  switch (current_char)
+    {
+    case 'x':
+      {
+	const auto hex = parse_partial_hex_escape ();
+	const long value = hex.first;
+	consumed += hex.second;
+
+	if (value < 0 || value > 127)
+	  rust_error_at (
+	    get_current_location (),
+	    "ascii \\x escape '\\x%X' out of range - allows up to '\\x7F'",
+	    static_cast<unsigned int> (value));
+
+	result = static_cast<char> (value);
+      }
+      break;
+    case 'n':
+      result = '\n';
+      break;
+    case 'r':
+      result = '\r';
+      break;
+    case 't':
+      result = '\t';
+      break;
+    case '\\':
+      result = '\\';
+      break;
+    case '0':
+      result = '\0';
+      break;
+    case '\'':
+      result = '\'';
+      break;
+    case '"':
+      result = '"';
+      break;
+    default:
+      // nothing could be parsed; the offending character is not consumed
+      rust_error_at (get_current_location (), "unknown escape sequence '\\%c'",
+		     current_char);
+      return std::make_tuple (result, consumed, false);
+    }
+
+  // every escape that gets here still has its last character to skip
+  skip_input ();
+  current_char = peek_input ();
+  consumed++;
+
+  return std::make_tuple (result, consumed, false);
+}
+
+/* Consumes the whitespace of a string continue found in an escape, telling
+ * the line map about every newline crossed.  The returned length starts at 1
+ * and is reset to 1 at each newline, so it only counts the whitespace that
+ * follows the last newline.  */
+int
+Lexer::parse_partial_string_continue ()
+{
+  int consumed = 1;
+
+  while (is_whitespace (current_char))
+    {
+      if (current_char == '\n')
+	{
+	  current_line++;
+	  current_column = 1;
+	  // tell line_table that new line starts
+	  line_map->start_line (current_line, max_column_hint);
+
+	  // the "length" starts over on the new line
+	  consumed = 1;
+	}
+      else
+	{
+	  consumed++;
+	}
+
+      // get next char
+      skip_input ();
+      current_char = peek_input ();
 }
- // ISSPACE from safe-ctype but may change in future
- bool is_whitespace(char character) {
- return ISSPACE(character);
+  return consumed;
+}
+
+/* Parses the two digits of a '\x' escape.  Returns their value as converted
+ * by strtol together with the number of characters stepped over (2).  A
+ * character that is not a hex digit is reported but still handed to strtol.
+ * The range of the value is not checked here, and the second digit is left
+ * as the current character.  */
+std::pair<long, int>
+Lexer::parse_partial_hex_escape ()
+{
+  // the two digits, kept null-terminated for strtol
+  char digits[3] = {0, 0, 0};
+  int consumed = 0;
+
+  for (int i = 0; i < 2; i++)
+    {
+      // step onto the next digit
+      skip_input ();
+      current_char = peek_input ();
+      consumed++;
+
+      if (!is_x_digit (current_char))
+	{
+	  rust_error_at (get_current_location (),
+			 "invalid character '\\x%c' in \\x sequence",
+			 current_char);
 }
+      digits[i] = current_char;
- Lexer::Lexer(const char* filename, FILE* input, Linemap* linemap) :
- input(input), current_line(1), current_column(1), line_map(linemap), input_source(input),
- input_queue(input_source), token_source(this), token_queue(token_source) {
- // inform line_table that file is being entered and is in line 1
- line_map->start_file(filename, current_line);
 }
+
+  const long value = std::strtol (digits, nullptr, 16);
+
+  return std::make_pair (value, consumed);
+}
- Lexer::~Lexer() {
- /* ok apparently stop (which is equivalent of original code in destructor) is
- * meant to be called after all files have finished parsing, for cleanup. On
- * the other hand, actual code that it calls to leave a certain line map is
- * mentioned in GCC docs as being useful for "just leaving an included header"
- * and stuff like that, so this line mapping functionality may need fixing.
- * FIXME: find out whether this occurs. */
- // line_map->stop();
+/* Parses the body of a unicode escape; the current character is expected to
+ * be the 'u'.  Accepts 1 to 6 hex digits, optionally separated by '_', with
+ * or without surrounding braces (an opening brace requires a closing one).
+ * Returns the codepoint together with the number of characters consumed.
+ * On any error - missing '}', wrong number of digits, or a value that is not
+ * a unicode scalar value (above U+10FFFF or a surrogate) - the error is
+ * reported and codepoint 0 is returned instead.  */
+std::pair<Codepoint, int>
+Lexer::parse_partial_unicode_escape ()
+{
+  // step past the 'u'
+  skip_input ();
+  current_char = peek_input ();
+  int additional_length_offset = 1;
+
+  bool need_close_brace = false;
+  if (current_char == '{')
+    {
+      need_close_brace = true;
+
+      skip_input ();
+      current_char = peek_input ();
+      additional_length_offset++;
 }
- /* TODO: need to optimise somehow to avoid the virtual function call in the
- * tight loop. Best idea at the moment is CRTP, but that might make lexer
- * implementation annoying when storing the "base class" (i.e. would need
- * template parameter everywhere), although in practice it would mostly just
- * look ugly and make enclosing classes like Parser also require a type
- * parameter. At this point a macro might be better. OK I guess macros can be
- * replaced by constexpr if or something if possible. */
- Location Lexer::get_current_location() {
- return line_map->get_location(current_column);
+  // parse unicode escape - 1-6 hex digits
+  std::string num_str;
+  num_str.reserve (6);
+
+  // loop through to add entire hex number to string
+  while (is_x_digit (current_char) || current_char == '_')
+    {
+      // '_' is consumed and counted but not added to the number
+      if (current_char != '_')
+	num_str += current_char;
+
+      skip_input ();
+      current_char = peek_input ();
+      additional_length_offset++;
 }
- int Lexer::peek_input(int n) {
- return input_queue.peek(n);
+  // ensure closing brace if required
+  if (need_close_brace)
+    {
+      if (current_char != '}')
+	{
+	  // actually an error, but allow propagation anyway
+	  rust_error_at (get_current_location (),
+			 "expected terminating '}' in unicode escape");
+	  return std::make_pair (Codepoint (0), additional_length_offset);
+	}
+
+      skip_input ();
+      current_char = peek_input ();
+      additional_length_offset++;
 }
- int Lexer::peek_input() {
- return peek_input(0);
+  // ensure 1-6 hex characters
+  if (num_str.length () > 6 || num_str.length () < 1)
+    {
+      // the cast makes the argument match %lu whatever size_t is
+      rust_error_at (get_current_location (),
+		     "unicode escape should be between 1 and 6 hex "
+		     "characters; it is %lu",
+		     static_cast<unsigned long> (num_str.length ()));
+      return std::make_pair (Codepoint (0), additional_length_offset);
 }
- void Lexer::skip_input(int n) {
- input_queue.skip(n);
+  // at most 6 hex digits, so the value is at most 0xFFFFFF and fits a long
+  const long hex_num = std::strtol (num_str.c_str (), nullptr, 16);
+
+  /* Only unicode scalar values are valid: nothing above U+10FFFF and no
+   * surrogates (U+D800 to U+DFFF).  */
+  if (hex_num > 0x10FFFF || (hex_num >= 0xD800 && hex_num <= 0xDFFF))
+    {
+      rust_error_at (get_current_location (),
+		     "unicode escape '\\u{%X}' is not a valid unicode "
+		     "scalar value",
+		     static_cast<unsigned int> (hex_num));
+      return std::make_pair (Codepoint (0), additional_length_offset);
+    }
+
+  return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)),
+			 additional_length_offset);
+}
+
+/* Parses a byte character literal.  Called with the 'b' consumed and the
+ * opening quote as the next input character; LOC is the location given to the
+ * resulting token.  Returns a byte char token holding the character (0 when
+ * the literal is empty).  A missing closing quote and an empty literal are
+ * reported with rust_error_at but still produce a token.  */
+TokenPtr
+Lexer::parse_byte_char (Location loc)
+{
+  // consume the opening quote
+  skip_input ();
+  current_column++;
+  // make current char the next character
+  current_char = peek_input ();
+
+  int length = 1;
+
+  // char to save
+  char byte_char = 0;
+
+  if (current_char == '\\')
+    {
+      // escaped character
+      auto escape_length_pair = parse_escape ('\'');
+      byte_char = std::get<0> (escape_length_pair);
+      length += std::get<1> (escape_length_pair);
+
+      /* No range check here: parse_escape already reports \x escapes above
+       * \xFF, and comparing a plain char with 127 depends on whether char is
+       * signed - never true when it is, and rejecting the legal escapes \x80
+       * to \xFF when it is not.  */
+
+      current_char = peek_input ();
+
+      if (current_char != '\'')
+	rust_error_at (get_current_location (), "unclosed byte char");
+
+      skip_input ();
+      current_char = peek_input ();
+      length++; // go to next char
+    }
+  else if (current_char != '\'')
+    {
+      // otherwise, get character from direct input character
+      byte_char = current_char;
+
+      skip_input ();
+      current_char = peek_input ();
+      length++;
+
+      if (current_char != '\'')
+	rust_error_at (get_current_location (), "unclosed byte char");
+
+      skip_input ();
+      current_char = peek_input ();
+      length++; // go to next char
+    }
+  else
+    {
+      rust_error_at (get_current_location (),
+		     "no character inside '' for byte char");
+
+      /* Consume the closing quote as the other branches do; left in the
+       * input it would be lexed again as the start of a new literal.  */
+      skip_input ();
+      current_char = peek_input ();
+      length++;
 }
- void Lexer::skip_input() {
- skip_input(0);
+  current_column += length;
+
+  return Token::make_byte_char (loc, byte_char);
+}
+
+// Parses a byte string.
+// Steps over one character (presumably the opening quote), then collects
+// characters up to the closing '"' and returns a byte string token located
+// at loc. Escapes are decoded through parse_escape.
+// Lexing stops with an "unended byte string literal" error at a newline or
+// at end of input; without the end-of-input test an unterminated literal at
+// the end of the file would keep appending EOF to the string forever.
+TokenPtr
+Lexer::parse_byte_string (Location loc)
+{
+  // byte string
+
+  // skip quote character
+  skip_input ();
+  current_column++;
+
+  std::string str;
+  str.reserve (16); // some sensible default
+
+  int length = 1;
+  current_char = peek_input ();
+
+  while (current_char != '"' && current_char != '\n' && current_char != EOF)
+    {
+      if (current_char == '\\')
+        {
+          auto escape_length_pair = parse_escape ('"');
+          char output_char = std::get<0> (escape_length_pair);
+
+          // when the escape produced no character and the third element is
+          // set, the reported length replaces the running length instead of
+          // being added to it
+          if (output_char == 0 && std::get<2> (escape_length_pair))
+            length = std::get<1> (escape_length_pair) - 1;
+          else
+            length += std::get<1> (escape_length_pair);
+
+          if (output_char > 127)
+            {
+              rust_error_at (get_current_location (),
+                             "char '%c' in byte string out of range",
+                             output_char);
+              output_char = 0;
+            }
+
+          if (output_char != 0)
+            str += output_char;
+
+          // current_char is not refreshed here; parse_escape is relied on
+          // to have advanced it
+          continue;
+        }
+
+      length++;
+
+      str += current_char;
+      skip_input ();
+      current_char = peek_input ();
}
- const_TokenPtr Lexer::peek_token(int n) {
- return token_queue.peek(n);
+  current_column += length;
+
+  if (current_char == '\n' || current_char == EOF)
+    {
+      rust_error_at (get_current_location (), "unended byte string literal");
}
+  else if (current_char == '"')
+    {
+      current_column++;
- const_TokenPtr Lexer::peek_token() {
- return peek_token(0);
+      skip_input ();
+      current_char = peek_input ();
+    }
+  else
+    {
+      gcc_unreachable ();
}
- void Lexer::skip_token(int n) {
- token_queue.skip(n);
+  str.shrink_to_fit ();
+
+  return Token::make_byte_string (loc, str);
+}
+
+// Parses a raw byte string: counts the leading '#' characters, expects a '"', then collects text until a '"' followed by the same number of '#'.
+TokenPtr
+Lexer::parse_raw_byte_string (Location loc)
+{
+ // raw byte string literals
+ std::string str;
+ str.reserve (16); // some sensible default
+
+ int length = 1; // running count, added to current_column after the loop
+ int hash_count = 0;
+
+ // get hash count at beginning
+ skip_input (); // presumably steps over the 'r' of the br prefix - confirm against the caller
+ current_char = peek_input ();
+ length++;
+ while (current_char == '#')
+ {
+ hash_count++;
+ length++;
+
+ skip_input ();
+ current_char = peek_input ();
}
- void Lexer::skip_token() {
- skip_token(0);
+ if (current_char != '"')
+ {
+ rust_error_at (get_current_location (),
+ "raw byte string has no opening '\"'"); // lexing continues after the error
}
- void Lexer::replace_current_token(TokenPtr replacement) {
- token_queue.replace_current_value(replacement);
+ skip_input ();
+ current_char = peek_input ();
+ length++;
+
+ while (true) // NOTE(review): no end-of-input exit is visible here - an unterminated literal looks like it never leaves this loop
+ {
+ if (current_char == '"')
+ {
+ bool enough_hashes = true;
+
+ for (int i = 0; i < hash_count; i++) // look ahead without consuming
+ {
+ if (peek_input (i + 1) != '#')
+ {
+ enough_hashes = false;
+ break;
+ }
+ }
+
+ if (enough_hashes)
+ {
+ // skip enough input and peek enough input
+ skip_input (hash_count); // presumably consumes the quote plus hash_count hashes - confirm skip_input (n) semantics
+ current_char = peek_input ();
+ length += hash_count + 1;
+ break;
+ }
+ }
+
+ length++;
+
+ str += current_char; // a '"' without enough hashes is ordinary content
+ skip_input ();
+ current_char = peek_input ();
}
- /* shitty anonymous namespace that can only be accessed inside the compilation
- * unit - used for classify_keyword Binary search in sorted array of keywords
- * created with x-macros. */
- namespace {
- const std::string keyword_index[] = {
-#define RS_TOKEN(x, y)
-#define RS_TOKEN_KEYWORD(name, keyword) keyword,
- RS_TOKEN_LIST
-#undef RS_TOKEN_KEYWORD
-#undef RS_TOKEN
- };
+ current_column += length;
- TokenId keyword_keys[] = {
-#define RS_TOKEN(x, y)
-#define RS_TOKEN_KEYWORD(name, keyword) name,
- RS_TOKEN_LIST
-#undef RS_TOKEN_KEYWORD
-#undef RS_TOKEN
- };
-
- const int num_keywords = sizeof(keyword_index) / sizeof(*keyword_index);
- } // namespace
-
- /* Determines whether the string passed in is a keyword or not. If it is, it
- * returns the keyword name. */
- TokenId Lexer::classify_keyword(const std::string& str) {
- const std::string* last = keyword_index + num_keywords;
- const std::string* idx = std::lower_bound(keyword_index, last, str);
-
- if (idx == last || str != *idx)
- return IDENTIFIER;
- else
- return keyword_keys[idx - keyword_index];
- }
-
- TokenPtr Lexer::build_token() {
- // loop to go through multiple characters to build a single token
- while (true) {
- Location loc = get_current_location();
- /*int */ current_char = peek_input();
- skip_input();
-
- // return end of file token if end of file
- if (current_char == EOF)
- return Token::make(END_OF_FILE, loc);
-
- // detect shebang
- if (loc == 1 && current_line == 1 && current_char == '#') {
- current_char = peek_input();
-
- if (current_char == '!') {
- skip_input();
- current_char = peek_input();
-
- switch (current_char) {
- case '/':
- // shebang
-
- skip_input();
-
- // ignore rest of line
- while (current_char != '\n') {
- current_char = peek_input();
- skip_input();
- }
-
- // newline
- current_line++;
- current_column = 1;
- // tell line_table that new line starts
- line_map->start_line(current_line, max_column_hint);
- continue;
- }
- }
- }
-
- // if not end of file, start tokenising
- switch (current_char) {
- /* ignore whitespace characters for tokens but continue updating
- * location */
- case '\n': // newline
- current_line++;
- current_column = 1;
- // tell line_table that new line starts
- line_map->start_line(current_line, max_column_hint);
- continue;
- case ' ': // space
- current_column++;
- continue;
- case '\t': // tab
- // width of a tab is not well-defined, assume 8 spaces
- current_column += 8;
- continue;
-
- // punctuation - actual tokens
- case '=':
- if (peek_input() == '>') {
- // match arm arrow
- skip_input();
- current_column += 2;
-
- return Token::make(MATCH_ARROW, loc);
- } else if (peek_input() == '=') {
- // equality operator
- skip_input();
- current_column += 2;
-
- return Token::make(EQUAL_EQUAL, loc);
- } else {
- // assignment operator
- current_column++;
- return Token::make(EQUAL, loc);
- }
- case '(':
- current_column++;
- return Token::make(LEFT_PAREN, loc);
- case '-':
- if (peek_input() == '>') {
- // return type specifier
- skip_input();
- current_column += 2;
-
- return Token::make(RETURN_TYPE, loc);
- } else if (peek_input() == '=') {
- // minus-assign
- skip_input();
- current_column += 2;
-
- return Token::make(MINUS_EQ, loc);
- } else {
- // minus
- current_column++;
- return Token::make(MINUS, loc);
- }
- case '+':
- if (peek_input() == '=') {
- // add-assign
- skip_input();
- current_column += 2;
-
- return Token::make(PLUS_EQ, loc);
- } else {
- // add
- current_column++;
- return Token::make(PLUS, loc);
- }
- case ')':
- current_column++;
- return Token::make(RIGHT_PAREN, loc);
- case ';':
- current_column++;
- return Token::make(SEMICOLON, loc);
- case '*':
- if (peek_input() == '=') {
- // multiplication-assign
- skip_input();
- current_column += 2;
-
- return Token::make(ASTERISK_EQ, loc);
- } else {
- // multiplication
- current_column++;
- return Token::make(ASTERISK, loc);
- }
- case ',':
- current_column++;
- return Token::make(COMMA, loc);
- case '/':
- if (peek_input() == '=') {
- // division-assign
- skip_input();
- current_column += 2;
-
- return Token::make(DIV_EQ, loc);
- } else if (peek_input() == '/') {
- // TODO: single-line doc comments
-
- // single line comment
- skip_input();
- current_column += 2;
-
- // basically ignore until line finishes
- while (current_char != '\n' && current_char != EOF) {
- skip_input();
- current_column++; // not used
- current_char = peek_input();
- }
- continue;
- break;
- } else if (peek_input() == '*') {
- // block comment
- skip_input();
- current_column += 2;
-
- // TODO: block doc comments
-
- current_char = peek_input();
-
- int level = 1;
- while (level > 0) {
- skip_input();
- current_column++; // for error-handling
- current_char = peek_input();
-
- // if /* found
- if (current_char == '/') {
- if (peek_input(1) == '*') {
- // skip /* characters
- skip_input(1);
-
- current_column += 2;
-
- level += 1;
- }
- }
-
- // ignore until */ is found
- if (current_char == '*') {
- if (peek_input(1) == '/') {
- // skip */ characters
- skip_input(1);
-
- current_column += 2;
- // should only break inner loop here - seems to do so
- // break;
-
- level -= 1;
- }
- }
- }
-
- // refresh new token
- continue;
- break;
- } else {
- // division
- current_column++;
- return Token::make(DIV, loc);
- }
- case '%':
- if (peek_input() == '=') {
- // modulo-assign
- current_column += 2;
- return Token::make(PERCENT_EQ, loc);
- } else {
- // modulo
- current_column++;
- return Token::make(PERCENT, loc);
- }
- case '^':
- if (peek_input() == '=') {
- // xor-assign?
- current_column += 2;
- return Token::make(CARET_EQ, loc);
- } else {
- // xor?
- current_column++;
- return Token::make(CARET, loc);
- }
- case '<':
- if (peek_input() == '<') {
- if (peek_input(1) == '=') {
- // left-shift assign
- skip_input(1);
- current_column += 3;
-
- return Token::make(LEFT_SHIFT_EQ, loc);
- } else {
- // left-shift
- skip_input();
- current_column += 2;
-
- return Token::make(LEFT_SHIFT, loc);
- }
- } else if (peek_input() == '=') {
- // smaller than or equal to
- skip_input();
- current_column += 2;
-
- return Token::make(LESS_OR_EQUAL, loc);
- } else {
- // smaller than
- current_column++;
- return Token::make(LEFT_ANGLE, loc);
- }
- break;
- case '>':
- if (peek_input() == '>') {
- if (peek_input(1) == '=') {
- // right-shift-assign
- skip_input(1);
- current_column += 3;
-
- return Token::make(RIGHT_SHIFT_EQ, loc);
- } else {
- // right-shift
- skip_input();
- current_column += 2;
-
- return Token::make(RIGHT_SHIFT, loc);
- }
- } else if (peek_input() == '=') {
- // larger than or equal to
- skip_input();
- current_column += 2;
-
- return Token::make(GREATER_OR_EQUAL, loc);
- } else {
- // larger than
- current_column++;
- return Token::make(RIGHT_ANGLE, loc);
- }
- case ':':
- if (peek_input() == ':') {
- // scope resolution ::
- skip_input();
- current_column += 2;
-
- return Token::make(SCOPE_RESOLUTION, loc);
- } else {
- // single colon :
- current_column++;
- return Token::make(COLON, loc);
- }
- case '!':
- // no special handling for macros in lexer?
- if (peek_input() == '=') {
- // not equal boolean operator
- skip_input();
- current_column += 2;
-
- return Token::make(NOT_EQUAL, loc);
- } else {
- // not equal unary operator
- current_column++;
-
- return Token::make(EXCLAM, loc);
- }
- case '?':
- current_column++;
- return Token::make(QUESTION_MARK, loc);
- case '#':
- current_column++;
- return Token::make(HASH, loc);
- case '[':
- current_column++;
- return Token::make(LEFT_SQUARE, loc);
- case ']':
- current_column++;
- return Token::make(RIGHT_SQUARE, loc);
- case '{':
- current_column++;
- return Token::make(LEFT_CURLY, loc);
- case '}':
- current_column++;
- return Token::make(RIGHT_CURLY, loc);
- case '@':
- current_column++;
- return Token::make(PATTERN_BIND, loc);
- case '$':
- current_column++;
- return Token::make(DOLLAR_SIGN, loc);
- case '~':
- current_column++;
- return Token::make(TILDE, loc);
- case '\\':
- current_column++;
- return Token::make(BACKSLASH, loc);
- case '`':
- current_column++;
- return Token::make(BACKTICK, loc);
- case '|':
- if (peek_input() == '=') {
- // bitwise or-assign?
- skip_input();
- current_column += 2;
-
- return Token::make(PIPE_EQ, loc);
- } else if (peek_input() == '|') {
- // logical or
- skip_input();
- current_column += 2;
-
- return Token::make(OR, loc);
- } else {
- // bitwise or
- current_column++;
-
- return Token::make(PIPE, loc);
- }
- case '&':
- if (peek_input() == '=') {
- // bitwise and-assign?
- skip_input();
- current_column += 2;
-
- return Token::make(AMP_EQ, loc);
- } else if (peek_input() == '&') {
- // logical and
- skip_input();
- current_column += 2;
-
- return Token::make(LOGICAL_AND, loc);
- } else {
- // bitwise and/reference
- current_column++;
-
- return Token::make(AMP, loc);
- }
- case '.':
- if (peek_input() == '.') {
- if (peek_input(1) == '.') {
- // ellipsis
- skip_input(1);
- current_column += 3;
-
- return Token::make(ELLIPSIS, loc);
- } else if (peek_input(1) == '=') {
- // ..=
- skip_input(1);
- current_column += 3;
-
- return Token::make(DOT_DOT_EQ, loc);
- } else {
- // ..
- skip_input();
- current_column += 2;
-
- return Token::make(DOT_DOT, loc);
- }
- } else if (!ISDIGIT(peek_input())) {
- // single dot .
- // Only if followed by a non-number
- current_column++;
- return Token::make(DOT, loc);
- }
- }
- // TODO: special handling of _ in the lexer? instead of being identifier
-
- // byte and byte string test
- if (current_char == 'b') {
- if (peek_input() == '\'') {
- skip_input();
- current_column++;
- // make current char the next character
- current_char = peek_input();
-
- int length = 1;
-
- // char to save
- char byte_char = 0;
-
- // detect escapes
- if (current_char == '\\') {
- auto escape_length_pair = parse_escape('\'');
- byte_char = std::get<0>(escape_length_pair);
- length += std::get<1>(escape_length_pair);
-
- if (byte_char > 127) {
- rust_error_at(
- get_current_location(), "byte char '%c' out of range", byte_char);
- byte_char = 0;
- }
-
- current_char = peek_input();
-
- if (current_char != '\'') {
- rust_error_at(get_current_location(), "unclosed byte char");
- }
-
- skip_input();
- current_char = peek_input();
- length++; // go to next char
- } else if (current_char != '\'') {
- // otherwise, get character from direct input character
- byte_char = current_char;
-
- skip_input();
- current_char = peek_input();
- length++;
-
- if (current_char != '\'') {
- rust_error_at(get_current_location(), "unclosed byte char");
- }
-
- skip_input();
- current_char = peek_input();
- length++; // go to next char
- } else {
- rust_error_at(get_current_location(), "no character inside '' for byte char");
- }
-
- current_column += length;
-
- return Token::make_byte_char(loc, byte_char);
- } else if (peek_input() == '"') {
- // byte string
-
- // skip quote character
- skip_input();
- current_column++;
-
- std::string str;
- str.reserve(16); // some sensible default
-
- int length = 1;
- current_char = peek_input();
-
- while (current_char != '"' && current_char != '\n') {
- if (current_char == '\\') {
- auto escape_length_pair = parse_escape('"');
- char output_char = std::get<0>(escape_length_pair);
- //length += escape_length_pair.second;
-
- // TODO: need to fix length - after escape, the length of the line up to the next non-whitespace char of the string is added to length, which is not what we want - we want length to be replaced by that.
- // possible option could if "if escape_length_pair.first == 0, then length = escape_length_pair.second else length += escape_length_pair.second."
- if (output_char == 0 && std::get<2>(escape_length_pair))
- length = std::get<1>(escape_length_pair) - 1;
- else
- length += std::get<1>(escape_length_pair);
-
- if (output_char > 127) {
- rust_error_at(get_current_location(),
- "char '%c' in byte string out of range", output_char);
- output_char = 0;
- }
-
- if (output_char != 0)
- str += output_char;
-
- continue;
- }
-
- length++;
-
- str += current_char;
- skip_input();
- current_char = peek_input();
- }
-
- current_column += length;
-
- if (current_char == '\n') {
- rust_error_at(get_current_location(), "unended byte string literal");
- } else if (current_char == '"') {
- // TEST: hopefully column inc should make string line up properly
- current_column++;
-
- skip_input();
- current_char = peek_input();
- } else {
- gcc_unreachable();
- }
-
- str.shrink_to_fit();
-
- return Token::make_byte_string(loc, str);
- } else if (peek_input() == 'r' && (peek_input(1) == '#' || peek_input(1) == '"')) {
- // raw byte string literals
- std::string str;
- str.reserve(16); // some sensible default
-
- int length = 1;
- int hash_count = 0;
-
- // get hash count at beginnning
- skip_input();
- current_char = peek_input();
- length++;
- while (current_char == '#') {
- hash_count++;
- length++;
-
- skip_input();
- current_char = peek_input();
- }
-
- if (current_char != '"') {
- rust_error_at(get_current_location(), "raw byte string has no opening '\"'");
- }
-
- skip_input();
- current_char = peek_input();
- length++;
-
- while (true) {
- if (current_char == '"') {
- bool enough_hashes = true;
-
- for (int i = 0; i < hash_count; i++) {
- if (peek_input(i + 1) != '#') {
- enough_hashes = false; // could continue here -
- // improve performance
- }
- }
-
- if (enough_hashes) {
- // skip enough input and peek enough input
- skip_input(hash_count); // is this enough?
- current_char = peek_input();
- length += hash_count + 1;
- break;
- }
- }
-
- length++;
-
- str += current_char;
- skip_input();
- current_char = peek_input();
- }
-
- current_column += length;
-
- str.shrink_to_fit();
-
- return Token::make_byte_string(loc, str);
- }
- }
-
- // raw stuff
- if (current_char == 'r') {
- int peek = peek_input();
- int peek1 = peek_input(1);
-
- if (peek == '#' && (ISALPHA(peek1) || peek1 == '_')) {
- // raw identifier
- std::string str;
- str.reserve(16); // default
-
- skip_input();
- current_char = peek_input();
-
- current_column += 2;
-
- str += current_char;
-
- bool first_is_underscore = current_char == '_';
-
- int length = 1;
- current_char = peek_input();
- // loop through entire name
- while (ISALPHA(current_char) || ISDIGIT(current_char) || current_char == '_') {
- length++;
-
- str += current_char;
- skip_input();
- current_char = peek_input();
- }
-
- current_column += length;
-
- // if just a single underscore, not an identifier
- if (first_is_underscore && length == 1) {
- rust_error_at(get_current_location(), "'_' is not a valid raw identifier");
- }
-
- if (str == "crate" || str == "extern" || str == "self" || str == "super"
- || str == "Self") {
- rust_error_at(
- get_current_location(), "'%s' is a forbidden raw identifier", str.c_str());
- } else {
- str.shrink_to_fit();
-
- return Token::make_identifier(loc, str);
- }
- } else {
- int peek_index = 0;
- while (peek_input(peek_index) == '#')
- peek_index++;
- // TODO: optimise by using "peek_index" as the hash count - 1 or something
-
- if (peek_input(peek_index) == '"') {
- // raw string literals
- std::string str;
- str.reserve(16); // some sensible default
-
- int length = 1;
- int hash_count = 0;
-
- // get hash count at beginnning
- current_char = peek;
- while (current_char == '#') {
- hash_count++;
- length++;
-
- skip_input();
- current_char = peek_input();
- }
-
- if (current_char != '"') {
- rust_error_at(get_current_location(), "raw string has no opening '\"'");
- }
-
- length++;
- skip_input();
- Codepoint current_char32 = test_peek_codepoint_input();
-
- // TODO: didn't account for current_column++ somewhere - one less than is required
+ str.shrink_to_fit ();
- while (true) {
- if (current_char32.value == '"') {
- bool enough_hashes = true;
+ return Token::make_byte_string (loc, str);
+}
- for (int i = 0; i < hash_count; i++) {
- // if (test_peek_codepoint_input(i + 1) != '#') {
- // TODO: ensure this is a good enough replacement
- if (peek_input(i + 1) != '#') {
- enough_hashes = false; // could continue here -
- // improve performance
- }
- }
+// Parses a raw identifier.
+// Steps over one character, adds 2 to the column (presumably for the r#
+// prefix), then collects the run of letters, digits and '_' that follows
+// and returns an identifier token located at loc.
+// A lone '_' is reported as an error (a token is still returned); crate,
+// extern, self, super and Self are reported as forbidden and yield nullptr.
+// Every character is appended exactly once, inside the loop. The first
+// character is only peeked before the loop, so appending it there as well
+// stored it twice (r#foo became "ffoo") and left length one too high, which
+// made the length == 1 test for '_' unreachable.
+TokenPtr
+Lexer::parse_raw_identifier (Location loc)
+{
+  // raw identifier
+  std::string str;
+  str.reserve (16); // default
- if (enough_hashes) {
- // skip enough input and peek enough input
- skip_input(hash_count); // is this enough?
- current_char = peek_input();
- length += hash_count + 1;
- break;
- }
- }
+  skip_input ();
+  current_char = peek_input ();
- length++;
+  current_column += 2;
- str += current_char32;
- test_skip_codepoint_input();
- current_char32 = test_peek_codepoint_input();
- }
- current_column += length;
+  bool first_is_underscore = current_char == '_';
- str.shrink_to_fit();
+  // nothing has been consumed yet: the loop below takes every character of
+  // the name, including the first one that was just peeked
+  int length = 0;
+  // loop through entire name
+  while (ISALPHA (current_char) || ISDIGIT (current_char)
+         || current_char == '_')
+    {
+      length++;
- return Token::make_string(loc, str);
- }
- }
- }
-
- // find identifiers and keywords
- if (ISALPHA(current_char) || current_char == '_') {
- std::string str;
- str.reserve(16); // default
- str += current_char;
-
- bool first_is_underscore = current_char == '_';
-
- int length = 1;
- current_char = peek_input();
- // loop through entire name
- while (ISALPHA(current_char) || ISDIGIT(current_char) || current_char == '_') {
- length++;
-
- str += current_char;
- skip_input();
- current_char = peek_input();
- }
+      str += current_char;
+      skip_input ();
+      current_char = peek_input ();
+    }
- current_column += length;
+  current_column += length;
- // if just a single underscore, not an identifier
- if (first_is_underscore && length == 1)
- return Token::make(UNDERSCORE, loc);
+  // if just a single underscore, not an identifier
+  if (first_is_underscore && length == 1)
+    rust_error_at (get_current_location (),
+                   "'_' is not a valid raw identifier");
- str.shrink_to_fit();
+  if (str == "crate" || str == "extern" || str == "self" || str == "super"
+      || str == "Self")
+    {
+      rust_error_at (get_current_location (),
+                     "'%s' is a forbidden raw identifier", str.c_str ());
- TokenId keyword = classify_keyword(str);
- if (keyword == IDENTIFIER)
- return Token::make_identifier(loc, str);
- else
- return Token::make(keyword, loc);
- }
+      return nullptr;
+    }
+  else
+    {
+      str.shrink_to_fit ();
- // identify literals
- // int or float literals - not processed properly
- if (ISDIGIT(current_char) || current_char == '.') { // _ not allowed as first char
- std::string str;
- str.reserve(16); // some sensible default
- str += current_char;
+      return Token::make_identifier (loc, str);
+    }
+}
- PrimitiveCoreType type_hint = CORETYPE_UNKNOWN;
+// Parses a unicode string: collects code points (decoding escapes) until a '"' or newline and returns a string token located at loc.
+TokenPtr
+Lexer::parse_string (Location loc)
+{
+ Codepoint current_char32;
- bool is_real = (current_char == '.');
+ std::string str;
+ str.reserve (16); // some sensible default
- int length = 1;
+ int length = 1; // running count, added to current_column after the loop
+ current_char32 = peek_codepoint_input ();
- // handle binary, octal, hex literals
- if (current_char == '0' && !ISDIGIT(peek_input())) {
- current_char = peek_input();
+ while (current_char32.value != '\n' && current_char32.value != '"') // NOTE(review): no end-of-input test is visible - confirm what peek_codepoint_input yields at EOF
+ {
+ if (current_char32.value == '\\')
+ {
+ // parse escape
+ auto utf8_escape_pair = parse_utf8_escape ('\''); // NOTE(review): passes '\'' although this literal is delimited by '"' - confirm against parse_utf8_escape
+ current_char32 = std::get<0> (utf8_escape_pair);
- if (current_char == 'x') {
- // hex (integer only)
+ if (current_char32 == Codepoint (0) && std::get<2> (utf8_escape_pair)) // no code point produced and third element set
+ length = std::get<1> (utf8_escape_pair) - 1; // replaces the running length instead of adding to it
+ else
+ length += std::get<1> (utf8_escape_pair);
- skip_input();
- current_char = peek_input();
+ if (current_char32 != Codepoint (0))
+ str += current_char32; // a zero code point is never appended
- length++;
+ // required as parsing utf8 escape only changes current_char
+ current_char32 = peek_codepoint_input ();
- // add 'x' to string after 0 so it is 0xFFAA or whatever
- str += 'x';
+ continue;
+ }
- // loop through to add entire hex number to string
- while (is_x_digit(current_char) || current_char == '_') {
- if (current_char == '_') {
- // don't add _ to number
- skip_input();
- current_char = peek_input();
+ length += get_input_codepoint_length ();
- length++;
+ str += current_char32;
+ skip_codepoint_input ();
+ current_char32 = peek_codepoint_input ();
+ }
- continue;
- }
+ current_column += length;
- length++;
+ if (current_char32.value == '\n')
+ {
+ rust_error_at (get_current_location (), "unended string literal"); // the newline is left unconsumed
+ }
+ else if (current_char32.value == '"')
+ {
+ current_column++;
- // add raw hex numbers
- str += current_char;
- skip_input();
- current_char = peek_input();
- }
+ skip_input (); // step over the closing quote
+ current_char = peek_input ();
+ }
+ else
+ {
+ gcc_unreachable (); // the loop above only exits on '\n' or '"'
+ }
- current_column += length;
+ str.shrink_to_fit ();
+ return Token::make_string (loc, str); // a token is returned even after the unended-literal error
+}
+
+// Lexes an identifier, a keyword or the lone '_' token.
+// current_char holds the first character of the name on entry; the letters,
+// digits and underscores that follow are consumed from the input. The
+// returned token is located at loc.
+TokenPtr
+Lexer::parse_identifier_or_keyword (Location loc)
+{
+  const bool starts_with_underscore = (current_char == '_');
+
+  std::string name;
+  name.reserve (16); // default
+  name += current_char;
+
+  // count of characters in the name, used to advance the column
+  int consumed = 1;
+  for (current_char = peek_input ();
+       ISALPHA (current_char) || ISDIGIT (current_char)
+       || current_char == '_';
+       current_char = peek_input ())
+    {
+      consumed++;
+
+      name += current_char;
+      skip_input ();
+    }
- // convert hex value to decimal representation
- long hex_num = std::strtol(str.c_str(), NULL, 16);
+  current_column += consumed;
+
+  // a single underscore on its own is its own token, not an identifier
+  if (starts_with_underscore && consumed == 1)
+    return Token::make (UNDERSCORE, loc);
+
+  name.shrink_to_fit ();
+
+  // anything classify_keyword does not report as IDENTIFIER is a keyword
+  const TokenId kind = classify_keyword (name);
+  if (kind != IDENTIFIER)
+    return Token::make (kind, loc);
+
+  return Token::make_identifier (loc, name);
+}
+
+// Looks ahead past any run of '#' characters in the pending input. If a '"'
+// follows, the raw string is lexed via parse_raw_string with the number of
+// hashes seen; otherwise nothing is consumed here and null is returned.
+TokenPtr
+Lexer::maybe_parse_raw_string (Location loc)
+{
+  int hashes = 0;
+  for (; peek_input (hashes) == '#'; hashes++)
+    ;
+
+  if (peek_input (hashes) != '"')
+    return nullptr;
+
+  return parse_raw_string (loc, hashes);
+}
+
+// Returns a raw string token.
+// initial_hash_count is the number of '#' characters between the r prefix
+// and the opening '"'. The literal ends at the first '"' that is followed by
+// that many '#' characters; a '"' with too few hashes is ordinary content.
+// The returned token is located at loc.
+// Reaching the end of the input before the terminator is reported as an
+// error and ends the literal, instead of looping forever.
+TokenPtr
+Lexer::parse_raw_string (Location loc, int initial_hash_count)
+{
+  // raw string literals
+  std::string str;
+  str.reserve (16); // some sensible default
+
+  int length = 1 + initial_hash_count;
+
+  // step over the leading hashes
+  if (initial_hash_count > 0)
+    skip_input (initial_hash_count - 1);
+
+  current_char = peek_input ();
+
+  if (current_char != '"')
+    rust_error_at (get_current_location (), "raw string has no opening '\"'");
+
+  length++;
+  skip_input ();
+  Codepoint current_char32 = peek_codepoint_input ();
+
+  while (true)
+    {
+      // an unterminated literal must not run past the end of the input
+      if (peek_input () == EOF)
+        {
+          rust_error_at (get_current_location (),
+                         "unended raw string literal");
+          break;
+        }
+
+      if (current_char32.value == '"')
+        {
+          bool enough_hashes = true;
+
+          // look ahead without consuming anything
+          for (int i = 0; i < initial_hash_count; i++)
+            {
+              if (peek_input (i + 1) != '#')
+                {
+                  enough_hashes = false;
+                  break;
+                }
+            }
+
+          if (enough_hashes)
+            {
+              // skip enough input and peek enough input
+              skip_input (initial_hash_count);
+              current_char = peek_input ();
+              length += initial_hash_count + 1;
+              break;
+            }
+        }
+
+      length++;
+
+      str += current_char32;
+      skip_codepoint_input ();
+      current_char32 = peek_codepoint_input ();
+    }
- str = std::to_string(hex_num);
+  current_column += length;
- // parse in type suffix if it exists
- auto type_suffix_pair = parse_in_type_suffix();
- type_hint = type_suffix_pair.first;
- length += type_suffix_pair.second;
+  str.shrink_to_fit ();
- if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) {
- rust_error_at(get_current_location(),
- "invalid type suffix '%s' for integer (hex) literal",
- get_type_hint_string(type_hint));
- }
- } else if (current_char == 'o') {
- // octal (integer only)
+  return Token::make_string (loc, str);
+}
- skip_input();
- current_char = peek_input();
+// Parses the digits of a hex, octal or binary integer literal.
+// is_digit_func decides which characters are digits of the given base;
+// existent_str is the text collected so far ("0x" for hex, "0" otherwise).
+// '_' separators are consumed but not stored. The value is re-encoded as a
+// decimal string for the token, which is located at loc.
+// Returns nullptr (after reporting an error) when a float type suffix
+// follows the digits.
+// The conversion goes through unsigned long long: std::strtol saturates at
+// LONG_MAX, so literals such as 0xFFFF_FFFF_FFFF_FFFF were silently turned
+// into a different value. Literals wider than unsigned long long still
+// saturate.
+template <typename IsDigitFunc>
+TokenPtr
+Lexer::parse_non_decimal_int_literal (Location loc, IsDigitFunc is_digit_func,
+                                      std::string existent_str, int base)
+{
+  int length = 1;
- length++;
+  // step over the base marker character (x, o or b)
+  skip_input ();
+  current_char = peek_input ();
- // loop through to add entire octal number to string
- while (is_octal_digit(current_char) || current_char == '_') {
- if (current_char == '_') {
- // don't add _ to number
- skip_input();
- current_char = peek_input();
+  length++;
- length++;
+  // loop through to add entire number to string
+  while (is_digit_func (current_char) || current_char == '_')
+    {
+      if (current_char == '_')
+        {
+          // don't add _ to number
+          skip_input ();
+          current_char = peek_input ();
- continue;
- }
+          length++;
- length++;
+          continue;
+        }
- // add raw octal numbers
- str += current_char;
- skip_input();
- current_char = peek_input();
- }
+      length++;
- current_column += length;
+      // add raw numbers
+      existent_str += current_char;
+      skip_input ();
+      current_char = peek_input ();
+    }
- // convert octal value to decimal representation
- long octal_num = std::strtol(str.c_str(), NULL, 8);
+  // convert value to decimal representation
+  unsigned long long dec_num
+    = std::strtoull (existent_str.c_str (), nullptr, base);
+
+  existent_str = std::to_string (dec_num);
+
+  // parse in type suffix if it exists
+  auto type_suffix_pair = parse_in_type_suffix ();
+  PrimitiveCoreType type_hint = type_suffix_pair.first;
+  length += type_suffix_pair.second;
+
+  current_column += length;
+
+  if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64)
+    {
+      rust_error_at (get_current_location (),
+                     "invalid type suffix '%s' for integer (%s) literal",
+                     get_type_hint_string (type_hint),
+                     base == 16
+                       ? "hex"
+                       : (base == 8 ? "octal"
+                                    : (base == 2 ? "binary"
+                                                 : "<insert unknown base>")));
+      return nullptr;
+    }
+  return Token::make_int (loc, existent_str, type_hint);
+}
+
+// Dispatches on the base marker that follows the character in current_char:
+// 'x' lexes a hex literal, 'o' an octal one and 'b' a binary one. Any other
+// character yields null. The resulting token is located at loc.
+TokenPtr
+Lexer::parse_non_decimal_int_literals (Location loc)
+{
+  std::string prefix;
+  prefix.reserve (16); // some sensible default
+  prefix += current_char;
+
+  current_char = peek_input ();
+
+  switch (current_char)
+    {
+    case 'x':
+      // hex (integer only) - the marker is kept in the collected text
+      return parse_non_decimal_int_literal (loc, is_x_digit, prefix + "x", 16);
+    case 'o':
+      // octal (integer only)
+      return parse_non_decimal_int_literal (loc, is_octal_digit,
+                                            std::move (prefix), 8);
+    case 'b':
+      // binary (integer only)
+      return parse_non_decimal_int_literal (loc, is_bin_digit,
+                                            std::move (prefix), 2);
+    default:
+      return nullptr;
+    }
+}
+
+// Parses a decimal-based int literal or float literal; current_char holds the first character on entry and the token is located at loc.
+TokenPtr
+Lexer::parse_decimal_int_or_float (Location loc)
+{
+ std::string str;
+ str.reserve (16); // some sensible default
+ str += current_char;
+
+ int length = 1; // running count, added to current_column just before each return
+
+ current_char = peek_input ();
+
+ // parse initial decimal integer (or first integer part of float) literal
+ auto initial_decimal_pair = parse_in_decimal (); // first: text to append, second: length to add
+ str += initial_decimal_pair.first;
+ length += initial_decimal_pair.second;
+
+ // detect float literal
+ if (current_char == '.' && is_float_digit (peek_input (1))) // peek_input (1) looks one character past the '.'
+ {
+ // float with a '.', parse another decimal into it
+
+ // add . to str
+ str += current_char;
+ skip_input ();
+ current_char = peek_input ();
+ length++;
+
+ // parse another decimal number for float
+ auto second_decimal_pair = parse_in_decimal ();
+ str += second_decimal_pair.first;
+ length += second_decimal_pair.second;
+
+ // parse in exponent part if it exists
+ auto exponent_pair = parse_in_exponent_part ();
+ str += exponent_pair.first;
+ length += exponent_pair.second;
+
+ // parse in type suffix if it exists
+ auto type_suffix_pair = parse_in_type_suffix ();
+ PrimitiveCoreType type_hint = type_suffix_pair.first;
+ length += type_suffix_pair.second;
+
+ if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
+ && type_hint != CORETYPE_UNKNOWN) // only f32, f64 or no suffix are accepted on a float
+ {
+ rust_error_at (get_current_location (),
+ "invalid type suffix '%s' for float literal",
+ get_type_hint_string (type_hint));
+ // ignore invalid type suffix as everything else seems fine
+ type_hint = CORETYPE_UNKNOWN;
+ }
+
+ current_column += length;
+
+ str.shrink_to_fit ();
+ return Token::make_float (loc, str, type_hint);
+ }
+ else if (current_char == '.' && check_valid_float_dot_end (peek_input (1))) // helper decides whether the '.' may end a float here
+ {
+ // float that is just an integer with a terminating '.' character
+
+ // add . to str
+ str += current_char;
+ skip_input ();
+ current_char = peek_input ();
+ length++;
+
+ // add a '0' after the . to prevent ambiguity
+ str += '0'; // not counted in length because it is not in the source text
- str = std::to_string(octal_num);
+ // type hint not allowed
- // parse in type suffix if it exists
- // parse_in_type_suffix (/*current_char, */ type_hint, length);
- auto type_suffix_pair = parse_in_type_suffix();
- type_hint = type_suffix_pair.first;
- length += type_suffix_pair.second;
+ current_column += length;
+
+ str.shrink_to_fit ();
+ return Token::make_float (loc, str, CORETYPE_UNKNOWN);
+ }
+ else if (current_char == 'E' || current_char == 'e')
+ {
+ // exponent float with no '.' character
+
+ // parse exponent part
+ auto exponent_pair = parse_in_exponent_part ();
+ str += exponent_pair.first;
+ length += exponent_pair.second;
+
+ // parse in type suffix if it exists
+ auto type_suffix_pair = parse_in_type_suffix ();
+ PrimitiveCoreType type_hint = type_suffix_pair.first;
+ length += type_suffix_pair.second;
+
+ if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
+ && type_hint != CORETYPE_UNKNOWN) // same suffix rule as the '.' float branch
+ {
+ rust_error_at (get_current_location (),
+ "invalid type suffix '%s' for float literal",
+ get_type_hint_string (type_hint));
+ // ignore invalid type suffix as everything else seems fine
+ type_hint = CORETYPE_UNKNOWN;
+ }
+
+ current_column += length;
+
+ str.shrink_to_fit ();
+ return Token::make_float (loc, str, type_hint);
+ }
+ else
+ {
+ // is an integer
+
+ // parse in type suffix if it exists
+ auto type_suffix_pair = parse_in_type_suffix ();
+ PrimitiveCoreType type_hint = type_suffix_pair.first;
+ length += type_suffix_pair.second;
+
+ if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) // float suffixes are rejected on an integer
+ {
+ rust_error_at (get_current_location (),
+ "invalid type suffix '%s' for integer "
+ "(decimal) literal",
+ get_type_hint_string (type_hint));
+ // ignore invalid type suffix as everything else seems fine
+ type_hint = CORETYPE_UNKNOWN;
+ }
+
+ current_column += length;
+
+ str.shrink_to_fit ();
+ return Token::make_int (loc, str, type_hint);
+ }
+}
+
+/* Lexes what follows a single quote: either a char literal ('x', '\n',
+ * '\u{..}') or a lifetime name ('a, 'static).
+ *
+ * LOC is the location stored in the token that is produced.
+ *
+ * The running length starts at 1 to account for the opening quote, which the
+ * caller is presumed to have consumed already (TODO confirm against the call
+ * site). current_column is advanced by the number of characters consumed on
+ * every path that produces a token.
+ *
+ * Returns a char token or a lifetime token. If the input is neither, an error
+ * is reported and a null TokenPtr is returned (this assumes TokenPtr is a
+ * nullable smart pointer - verify that callers tolerate it). */
+TokenPtr
+Lexer::parse_char_or_lifetime (Location loc)
+{
+  Codepoint current_char32;
+
+  int length = 1;
+
+  current_char32 = peek_codepoint_input ();
+
+  // parse escaped char literal
+  if (current_char32.value == '\\')
+    {
+      // parse escape
+      auto utf8_escape_pair = parse_utf8_escape ('\'');
+      current_char32 = std::get<0> (utf8_escape_pair);
+      length += std::get<1> (utf8_escape_pair);
+
+      if (peek_codepoint_input ().value != '\'')
+	{
+	  // still produce a token below so lexing can carry on after the error
+	  rust_error_at (get_current_location (), "unended char literal");
+	}
+      else
+	{
+	  skip_codepoint_input ();
+	  current_char = peek_input ();
+	  length++;
+	}
+
+      current_column += length;
+
+      return Token::make_char (loc, current_char32);
+    }
+  else
+    {
+      skip_codepoint_input ();
+
+      if (peek_codepoint_input ().value == '\'')
+	{
+	  // parse non-escaped char literal
+
+	  // skip the ' character
+	  skip_input ();
+	  current_char = peek_input ();
+
+	  // TODO fix due to different widths of utf-8 chars?
+	  current_column += 3;
+
+	  return Token::make_char (loc, current_char32);
+	}
+      /* ISDIGIT and ISALPHA classify single bytes, so only hand them ASCII
+       * values: a wider codepoint must never be mistaken for a letter or
+       * digit because of its low byte. */
+      else if (current_char32.value < 0x80
+	       && (ISDIGIT (current_char32.value)
+		   || ISALPHA (current_char32.value)
+		   || current_char32.value == '_'))
+	{
+	  // parse lifetime name
+	  std::string str;
+	  str += current_char32;
+	  length++;
+
+	  current_char = peek_input ();
+	  while (ISDIGIT (current_char) || ISALPHA (current_char)
+		 || current_char == '_')
+	    {
+	      str += current_char;
+	      skip_input ();
+	      current_char = peek_input ();
+	      length++;
+	    }
+
+	  current_column += length;
+
+	  str.shrink_to_fit ();
+	  return Token::make_lifetime (loc, str);
+	}
+      else
+	{
+	  rust_error_at (get_current_location (),
+			 "expected ' after character constant in char literal");
+	  /* Every path must return a value: flowing off the end of a function
+	   * that returns TokenPtr is undefined behaviour. */
+	  return nullptr;
+	}
+    }
+}
- if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) {
- rust_error_at(get_current_location(),
- "invalid type suffix '%s' for integer (octal) literal",
- get_type_hint_string(type_hint));
- }
- } else if (current_char == 'b') {
- // binary (integer only)
+// Returns the length of the codepoint at the current position.
+int
+Lexer::get_input_codepoint_length ()
+{
+ uint8_t input = peek_input ();
- skip_input();
- current_char = peek_input();
+ if (input < 128)
+ {
+ // ascii -- 1 byte
+ // return input;
- length++;
+ return 1;
+ }
+ else if ((input & 0xC0) == 0x80)
+ {
+ // invalid (continuation; can't be first char)
+ // return 0xFFFE;
- // loop through to add entire binary number to string
- while (is_bin_digit(current_char) || current_char == '_') {
- if (current_char == '_') {
- // don't add _ to number
- skip_input();
- current_char = peek_input();
-
- length++;
-
- continue;
- }
-
- length++;
-
- // add raw binary numbers
- str += current_char;
- skip_input();
- current_char = peek_input();
- }
-
- current_column += length;
-
- // convert binary value to decimal representation
- long bin_num = std::strtol(str.c_str(), NULL, 2);
-
- str = std::to_string(bin_num);
-
- // parse in type suffix if it exists
- // parse_in_type_suffix (/*current_char, */ type_hint, length);
- auto type_suffix_pair = parse_in_type_suffix();
- type_hint = type_suffix_pair.first;
- length += type_suffix_pair.second;
-
- if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) {
- rust_error_at(get_current_location(),
- "invalid type suffix '%s' for integer (binary) literal",
- get_type_hint_string(type_hint));
- }
- }
- } else {
- // handle decimals (integer or float)
-
- current_char = peek_input();
-
- // parse initial decimal literal - assuming integer
- // parse_in_decimal (/*current_char, */ str, length);
- auto str_length_pair = parse_in_decimal();
- str += str_length_pair.first;
- length += str_length_pair.second;
-
- // detect float literal - TODO: fix: "242." is not recognised as a
- // float literal
- if (current_char == '.' && is_float_digit(peek_input(1))) {
- // float with a '.', parse another decimal into it
-
- is_real = true;
-
- // add . to str
- str += current_char;
- skip_input();
- current_char = peek_input();
-
- length++;
-
- // parse another decimal number for float
- auto str_length_pair2 = parse_in_decimal();
- str += str_length_pair2.first;
- length += str_length_pair2.second;
-
- // parse in exponent part if it exists
- auto exponent_part = parse_in_exponent_part();
- str += exponent_part.first;
- length += exponent_part.second;
-
- // parse in type suffix if it exists
- auto type_suffix_pair = parse_in_type_suffix();
- type_hint = type_suffix_pair.first;
- length += type_suffix_pair.second;
-
- if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
- && type_hint != CORETYPE_UNKNOWN) {
- rust_error_at(get_current_location(),
- "invalid type suffix '%s' for float literal",
- get_type_hint_string(type_hint));
- }
- } else if (current_char == '.' && check_valid_float_dot_end(peek_input(1))) {
- is_real = true;
-
- // add . to str
- str += current_char;
- skip_input();
- current_char = peek_input();
- length++;
-
- // add a '0' after the . to stop ambiguity
- str += '0';
-
- // don't parse another decimal number for float
-
- if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
- && type_hint != CORETYPE_UNKNOWN) {
- rust_error_at(get_current_location(),
- "invalid type suffix '%s' for float literal",
- get_type_hint_string(type_hint));
- }
- } else if (current_char == 'E' || current_char == 'e') {
- is_real = true;
-
- // parse exponent part
- // parse_in_exponent_part (/*current_char, */ str, length);
- auto exponent_part = parse_in_exponent_part();
- str += exponent_part.first;
- length += exponent_part.second;
-
- // parse in type suffix if it exists
- // parse_in_type_suffix (/*current_char, */ type_hint, length);
- auto type_suffix_pair = parse_in_type_suffix();
- type_hint = type_suffix_pair.first;
- length += type_suffix_pair.second;
-
- if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
- && type_hint != CORETYPE_UNKNOWN) {
- rust_error_at(get_current_location(),
- "invalid type suffix '%s' for float literal",
- get_type_hint_string(type_hint));
- }
- } else {
- // is an integer
-
- // parse in type suffix if it exists
- // parse_in_type_suffix (/*current_char, */ type_hint, length);
- auto type_suffix_pair = parse_in_type_suffix();
- type_hint = type_suffix_pair.first;
- length += type_suffix_pair.second;
-
- if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) {
- rust_error_at(get_current_location(),
- "invalid type suffix '%s' for integer "
- "(decimal) literal",
- get_type_hint_string(type_hint));
- }
- }
-
- current_column += length;
- }
-
- str.shrink_to_fit();
-
- // actually make the tokens
- if (is_real)
- return Token::make_float(loc, str, type_hint);
- else
- return Token::make_int(loc, str, type_hint);
- }
-
- // string literals - not processed properly
- if (current_char == '"') {
- Codepoint current_char32;
-
- std::string str;
- str.reserve(16); // some sensible default
-
- int length = 1;
- current_char32 = test_peek_codepoint_input();
-
- while (current_char32.value != '\n' && current_char32.value != '"') {
- if (current_char32.value == '\\') {
- // parse escape
- auto utf8_escape_pair = parse_utf8_escape('\'');
- current_char32 = std::get<0>(utf8_escape_pair);
- //length += utf8_escape_pair.second;
-
- // TODO: need to fix length - after escape, the length of the line up to the next non-whitespace char of the string is added to length, which is not what we want - we want length to be replaced by that.
- // possible option could if "if escape_length_pair.first == 0, then length = escape_length_pair.second else length += escape_length_pair.second."
- if (current_char32 == Codepoint(0) && std::get<2>(utf8_escape_pair))
- length = std::get<1>(utf8_escape_pair);
- else
- length += std::get<1>(utf8_escape_pair);
-
- if (current_char32 != Codepoint(0))
- str += current_char32;
-
- // required as parsing utf8 escape only changes current_char
- // or something
- current_char32 = test_peek_codepoint_input();
-
- continue;
- }
-
- length += test_get_input_codepoint_length();
-
- str += current_char32;
- test_skip_codepoint_input();
- current_char32 = test_peek_codepoint_input();
- }
-
- current_column += length;
-
- if (current_char32.value == '\n') {
- rust_error_at(get_current_location(), "unended string literal");
- } else if (current_char32.value == '"') {
- current_column++;
-
- skip_input();
- current_char = peek_input();
- } else {
- gcc_unreachable();
- }
-
- str.shrink_to_fit();
- return Token::make_string(loc, str);
- }
-
- // char literal attempt
- if (current_char == '\'') {
- Codepoint current_char32;
-
- int length = 1;
-
- current_char32 = test_peek_codepoint_input();
-
- // parse escaped char literal
- if (current_char32.value == '\\') {
- // parse escape
- auto utf8_escape_pair = parse_utf8_escape('\'');
- current_char32 = std::get<0>(utf8_escape_pair);
- length += std::get<1>(utf8_escape_pair);
-
- if (test_peek_codepoint_input().value != '\'') {
- rust_error_at(get_current_location(), "unended char literal");
- } else {
- test_skip_codepoint_input();
- current_char = peek_input();
- length++;
- }
-
- current_column += length;
-
- return Token::make_char(loc, current_char32);
- } else {
- // current_char32 = test_peek_codepoint_input();
- test_skip_codepoint_input();
-
- if (test_peek_codepoint_input().value == '\'') {
- // parse normal char literal
-
- // skip the ' character
- skip_input();
- current_char = peek_input();
-
- // TODO fix due to different widths of utf-8 chars
- current_column += 3;
-
- return Token::make_char(loc, current_char32);
- } else if (ISDIGIT(current_char32.value) || ISALPHA(current_char32.value)
- || current_char32.value == '_') {
- // parse lifetime name
- std::string str;
- str += current_char32;
-
- /* TODO: fix lifetime name thing - actually, why am I even
- * using utf-8 here? */
-
- int length = 1;
-
- current_char32 = test_peek_codepoint_input();
-
- while (ISDIGIT(current_char32.value) || ISALPHA(current_char32.value)
- || current_char32.value == '_') {
- length += test_get_input_codepoint_length();
-
- str += current_char32;
- test_skip_codepoint_input();
- current_char32 = test_peek_codepoint_input();
- }
-
- current_column += length;
-
- str.shrink_to_fit();
- return Token::make_lifetime(loc, str);
- } else {
- rust_error_at(get_current_location(), "expected ' after character constant");
- }
- }
- }
-
- // didn't match anything so error
- rust_error_at(loc, "unexpected character '%x'", current_char);
- current_column++;
- }
- }
-
- // Shitty pass-by-reference way of parsing in type suffix.
- std::pair<PrimitiveCoreType, int> Lexer::parse_in_type_suffix() {
- std::string suffix;
- suffix.reserve(5);
-
- int additional_length_offset = 0;
-
- // get suffix
- while (ISALPHA(current_char) || ISDIGIT(current_char) || current_char == '_') {
- if (current_char == '_') {
- // don't add _ to suffix
- skip_input();
- current_char = peek_input();
-
- additional_length_offset++;
-
- continue;
- }
-
- additional_length_offset++;
-
- suffix += current_char;
- skip_input();
- current_char = peek_input();
- }
-
- if (suffix.empty()) {
- // no type suffix: do nothing but also no error
- return std::make_pair(CORETYPE_UNKNOWN, additional_length_offset);
- } else if (suffix == "f32") {
- return std::make_pair(CORETYPE_F32, additional_length_offset);
- } else if (suffix == "f64") {
- return std::make_pair(CORETYPE_F64, additional_length_offset);
- } else if (suffix == "i8") {
- return std::make_pair(CORETYPE_I8, additional_length_offset);
- } else if (suffix == "i16") {
- return std::make_pair(CORETYPE_I16, additional_length_offset);
- } else if (suffix == "i32") {
- return std::make_pair(CORETYPE_I32, additional_length_offset);
- } else if (suffix == "i64") {
- return std::make_pair(CORETYPE_I64, additional_length_offset);
- } else if (suffix == "i128") {
- return std::make_pair(CORETYPE_I128, additional_length_offset);
- } else if (suffix == "isize") {
- return std::make_pair(CORETYPE_ISIZE, additional_length_offset);
- } else if (suffix == "u8") {
- return std::make_pair(CORETYPE_U8, additional_length_offset);
- } else if (suffix == "u16") {
- return std::make_pair(CORETYPE_U16, additional_length_offset);
- } else if (suffix == "u32") {
- return std::make_pair(CORETYPE_U32, additional_length_offset);
- } else if (suffix == "u64") {
- return std::make_pair(CORETYPE_U64, additional_length_offset);
- } else if (suffix == "u128") {
- return std::make_pair(CORETYPE_U128, additional_length_offset);
- } else if (suffix == "usize") {
- return std::make_pair(CORETYPE_USIZE, additional_length_offset);
- } else {
- rust_error_at(get_current_location(), "unknown number suffix '%s'", suffix.c_str());
-
- return std::make_pair(CORETYPE_UNKNOWN, additional_length_offset);
- }
- }
-
- std::pair<std::string, int> Lexer::parse_in_exponent_part() {
- int additional_length_offset = 0;
- std::string str;
- if (current_char == 'E' || current_char == 'e') {
- // add exponent to string as strtod works with it
- str += current_char;
- skip_input();
- current_char = peek_input();
-
- additional_length_offset++;
-
- // special - and + handling
- if (current_char == '-') {
- str += '-';
-
- skip_input();
- current_char = peek_input();
-
- additional_length_offset++;
- } else if (current_char == '+') {
- // don't add + but still skip input
- skip_input();
- current_char = peek_input();
-
- additional_length_offset++;
- }
-
- // parse another decimal number for exponent
- auto str_length_pair = parse_in_decimal();
- str += str_length_pair.first;
- additional_length_offset += str_length_pair.second;
- }
- return std::make_pair(str, additional_length_offset);
- }
-
- std::pair<std::string, int> Lexer::parse_in_decimal() {
- int additional_length_offset = 0;
- std::string str;
- while (ISDIGIT(current_char) || current_char == '_') {
- if (current_char == '_') {
- // don't add _ to number
- skip_input();
- current_char = peek_input();
-
- additional_length_offset++;
-
- continue;
- }
-
- additional_length_offset++;
-
- str += current_char;
- skip_input();
- current_char = peek_input();
- }
- return std::make_pair(str, additional_length_offset);
- }
-
- /* Parses escapes (and string continues) in "byte" strings and characters. Does not support unicode. */
- std::tuple<char, int, bool> Lexer::parse_escape(char opening_char) {
- int additional_length_offset = 0;
- char output_char = 0;
-
- // skip to actual letter
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
-
- switch (current_char) {
- case 'x': {
- // hex char string (null-terminated)
- char hexNum[3] = { 0, 0, 0 };
-
- // first hex char
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
-
- if (!is_x_digit(current_char)) {
- rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
- current_char);
- }
- hexNum[0] = current_char;
-
- // second hex char
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
-
- if (!is_x_digit(current_char)) {
- rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
- current_char);
- }
- hexNum[1] = current_char;
-
- long hexLong = std::strtol(hexNum, NULL, 16);
-
- if (hexLong > 255 || hexLong < 0)
- rust_error_at(get_current_location(),
- "byte \\x escape '\\x%s' out of range - allows up to '\\xFF'", hexNum);
- char hexChar = static_cast<char>(hexLong);
-
- output_char = hexChar;
- } break;
- case 'n':
- output_char = '\n';
- break;
- case 'r':
- output_char = '\r';
- break;
- case 't':
- output_char = '\t';
- break;
- case '\\':
- output_char = '\\';
- break;
- case '0':
- output_char = '\0';
- break;
- case '\'':
- output_char = '\'';
- break;
- case '"':
- output_char = '"';
- break;
- case 'u':
- rust_error_at(get_current_location(),
- "cannot have a unicode escape \\u in a byte %s!",
- opening_char == '\'' ? "character" : "string");
- return std::make_tuple(output_char, additional_length_offset, false);
-#if 0
- {
- // TODO: shouldn't be used with this - use parse_utf8_escape
-
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
-
- bool need_close_brace = false;
-
- // TODO: rustc lexer doesn't seem to allow not having { but mrustc lexer
- // does? look at spec?
- if (current_char == '{') {
- need_close_brace = true;
-
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
- }
-
- // parse unicode escape
- // 1-6 hex digits?
- std::string num_str;
- num_str.reserve(6);
-
- // test adding number directly
- uint32_t test_val;
-
- // loop through to add entire hex number to string
- while (is_x_digit(current_char) || current_char == '_') {
- if (current_char == '_') {
- // don't add _ to number
- skip_input();
- current_char = peek_input();
-
- additional_length_offset++;
-
- continue;
- }
-
- additional_length_offset++;
-
- // add raw hex numbers
- num_str += current_char;
-
- // test adding number directly
- char tmp[2] = { current_char, 0 };
- test_val *= 16;
- test_val += std::strtol(tmp, NULL, 16);
-
- skip_input();
- current_char = peek_input();
- }
-
- // ensure closing brace
- if (need_close_brace && current_char != '}') {
- // actually an error
- rust_error_at(
- get_current_location(), "expected terminating '}' in unicode escape");
- // return false;
- return std::make_pair(output_char, additional_length_offset);
- }
-
- // ensure 1-6 hex characters
- if (num_str.length() > 6 || num_str.length() < 1) {
- rust_error_at(get_current_location(),
- "unicode escape should be between 1 and 6 hex "
- "characters; it is %lu",
- num_str.length());
- // return false;
- return std::make_pair(output_char, additional_length_offset);
- }
-
- long hex_num = std::strtol(num_str.c_str(), NULL, 16);
-
- // as debug, check hex_num = test_val
- if (hex_num > 255) {
- rust_error_at(
- get_current_location(), "non-ascii chars not implemented yet, defaulting to 0");
- hex_num = 0;
- }
-
- // make output_char the value - UTF-8?
- // TODO: actually make this work - output char must be 4 bytes, do I
- // need a string for this?
- output_char = static_cast</*uint32_t*/ char>(hex_num);
-
- // return true;
- return std::make_pair(output_char, additional_length_offset);
- } break;
-#endif
- case '\r':
- case '\n':
- // string continue
- while (is_whitespace(current_char)) {
- if (current_char == '\n') {
- current_line++;
- current_column = 1;
- // tell line_table that new line starts
- line_map->start_line(current_line, max_column_hint);
-
- // reset "length"
- additional_length_offset = 1;
-
- // get next char
- skip_input();
- current_char = peek_input();
-
- continue;
- }
-
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
- }
-
- return std::make_tuple(0, additional_length_offset, true);
- default:
- rust_error_at(get_current_location(), "unknown escape sequence '\\%c'", current_char);
- // returns false if no parsing could be done
- // return false;
- return std::make_tuple(output_char, additional_length_offset, false);
- break;
- }
- // all non-special cases (string continue) should skip their used char
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
-
- // returns true if parsing was successful
- // return true;
- return std::make_tuple(output_char, additional_length_offset, false);
- }
-
- // Parses an escape (or string continue) in a string or character. Supports unicode escapes.
- std::tuple<Codepoint, int, bool> Lexer::parse_utf8_escape(char opening_char) {
- Codepoint output_char;
- int additional_length_offset = 0;
-
- // skip to actual letter
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
-
- switch (current_char) {
- case 'x': {
- // hex char string (null-terminated)
- char hexNum[3] = { 0, 0, 0 };
-
- // first hex char
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
-
- if (!is_x_digit(current_char)) {
- rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
- current_char);
- }
- hexNum[0] = current_char;
-
- // second hex char
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
-
- if (!is_x_digit(current_char)) {
- rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
- current_char);
- }
- hexNum[1] = current_char;
-
- long hexLong = std::strtol(hexNum, NULL, 16);
-
- if (hexLong > 127)
- rust_error_at(get_current_location(),
- "ascii \\x escape '\\x%s' out of range - allows up to '\\x7F'", hexNum);
- // gcc_assert(hexLong < 128); // as ascii
- char hexChar = static_cast<char>(hexLong);
-
- output_char = hexChar;
- } break;
- case 'n':
- output_char = '\n';
- break;
- case 'r':
- output_char = '\r';
- break;
- case 't':
- output_char = '\t';
- break;
- case '\\':
- output_char = '\\';
- break;
- case '0':
- output_char = '\0';
- break;
- case '\'':
- output_char = '\'';
- break;
- case '"':
- output_char = '"';
- break;
- case 'u': {
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
-
- bool need_close_brace = false;
- if (current_char == '{') {
- need_close_brace = true;
-
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
- }
-
- // parse unicode escape - 1-6 hex digits
- std::string num_str;
- num_str.reserve(6);
-
- // loop through to add entire hex number to string
- while (is_x_digit(current_char) || current_char == '_') {
- if (current_char == '_') {
- // don't add _ to number
- skip_input();
- current_char = peek_input();
-
- additional_length_offset++;
-
- continue;
- }
-
- additional_length_offset++;
-
- // add raw hex numbers
- num_str += current_char;
-
- skip_input();
- current_char = peek_input();
- }
-
- // ensure closing brace if required
- if (need_close_brace) {
- if (current_char == '}') {
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
- } else {
- // actually an error
- rust_error_at(
- get_current_location(), "expected terminating '}' in unicode escape");
- // return false;
- return std::make_tuple(output_char, additional_length_offset, false);
- }
- }
-
- // ensure 1-6 hex characters
- if (num_str.length() > 6 || num_str.length() < 1) {
- rust_error_at(get_current_location(),
- "unicode escape should be between 1 and 6 hex "
- "characters; it is %lu",
- num_str.length());
- // return false;
- return std::make_tuple(output_char, additional_length_offset, false);
- }
-
- long hex_num = std::strtol(num_str.c_str(), NULL, 16);
-
- // assert fits a uint32_t
- gcc_assert(hex_num < 4294967296);
-
- output_char = Codepoint(static_cast<uint32_t>(hex_num));
-
- // TODO: what is being outputted? the escape code for the unicode char
- // (unicode number) or the character number?
-
- // return true;
- return std::make_tuple(output_char, additional_length_offset, false);
- } break;
- case '\r':
- case '\n':
- // string continue
- while (is_whitespace(current_char)) {
- if (current_char == '\n') {
- current_line++;
- current_column = 1;
- // tell line_table that new line starts
- line_map->start_line(current_line, max_column_hint);
-
- // reset "length"
- additional_length_offset = 1;
-
- // get next char
- skip_input();
- current_char = peek_input();
-
- continue;
- }
-
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
- }
-
- return std::make_tuple(0, additional_length_offset, true);
- default:
- rust_error_at(get_current_location(), "unknown escape sequence '\\%c'", current_char);
- // returns false if no parsing could be done
- // return false;
- return std::make_tuple(output_char, additional_length_offset, false);
- break;
- }
- /* all non-special cases (unicode, string continue) should skip their used
- * char */
- skip_input();
- current_char = peek_input();
- additional_length_offset++;
-
- // returns true if parsing was successful
- // return true;
- return std::make_tuple(output_char, additional_length_offset, false);
- }
-
-#if 0
- bool Lexer::parse_ascii_escape(/*char& current_char, */ int& length, char& output_char) {
- // skip to actual letter
- skip_input();
- current_char = peek_input();
- length++;
-
- switch (current_char) {
- case 'x': {
- // hex char string (null-terminated)
- char hexNum[3] = { 0, 0, 0 };
-
- // first hex char
- skip_input();
- current_char = peek_input();
- length++;
-
- if (!ISXDIGIT(current_char)) {
- rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
- current_char);
- }
- hexNum[0] = current_char;
-
- // second hex char
- skip_input();
- current_char = peek_input();
- length++;
-
- if (!ISXDIGIT(current_char)) {
- rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
- current_char);
- }
- hexNum[1] = current_char;
-
- long hexLong = ::std::strtol(hexNum, NULL, 16);
-
- if (hexLong > 127)
- rust_error_at(get_current_location(),
- "ascii \\x escape '\\x%s' out of range - allows up to '\\x7F'", hexNum);
- // gcc_assert(hexLong < 128); // as ascii
- char hexChar = static_cast<char>(hexLong);
-
- // TODO: fix - does this actually give the right character?
- output_char = hexChar;
- } break;
- case 'n':
- output_char = '\n';
- break;
- case 'r':
- output_char = '\r';
- break;
- case 't':
- output_char = '\t';
- break;
- case '\\':
- output_char = '\\';
- break;
- case '0':
- output_char = '\0';
- break;
- default:
- // rust_error_at(get_current_location(), "unknown escape sequence '\\%c'", current_char);
- // returns false if no parsing could be done
- return false;
- break;
- }
- // returns true if parsing was successful
- return true;
- }
-
- bool Lexer::parse_quote_escape(/*char& current_char, */ int& length, char& output_char) {
- // skip to actual letter
- skip_input();
- current_char = peek_input();
- length++;
-
- switch (current_char) {
- case '\'':
- output_char = '\'';
- break;
- case '"':
- output_char = '"';
- break;
- default:
- return false;
- break;
- }
- return true;
- }
-
- bool Lexer::parse_unicode_escape(
- /*char& current_char, */ int& length, /*char*/ uint32_t& output_char) {
- // skip to actual letter
- skip_input();
- current_char = peek_input();
- length++;
-
- if (current_char != 'u') {
- // not a unicode escape, but not necessarily an error
- return false;
- }
-
- skip_input();
- current_char = peek_input();
- length++;
-
- bool need_close_brace = false;
-
- // TODO: rustc lexer doesn't seem to allow not having { but mrustc lexer does? look at spec?
- if (current_char == '{') {
- need_close_brace = true;
-
- skip_input();
- current_char = peek_input();
- length++;
- }
-
- // parse unicode escape
- // 1-6 hex digits?
- ::std::string num_str;
- num_str.reserve(6);
-
- // test adding number directly
- uint32_t test_val;
-
- // loop through to add entire hex number to string
- while (is_x_digit(current_char) || current_char == '_') {
- if (current_char == '_') {
- // don't add _ to number
- skip_input();
- current_char = peek_input();
-
- length++;
-
- continue;
- }
-
- length++;
-
- // add raw hex numbers
- num_str += current_char;
-
- // test adding number directly
- char tmp[2] = { current_char, 0 };
- test_val *= 16;
- test_val += ::std::strtol(tmp, NULL, 16);
-
- skip_input();
- current_char = peek_input();
- }
-
- // ensure closing brace
- if (need_close_brace && current_char != '}') {
- // actually an error
- rust_error_at(get_current_location(), "expected terminating '}' in unicode escape");
- return false;
- }
-
- // ensure 1-6 hex characters
- if (num_str.length() > 6 || num_str.length() < 1) {
- rust_error_at(get_current_location(),
- "unicode escape should be between 1 and 6 hex characters; it is %lu", num_str.length());
- return false;
- }
-
- long hex_num = ::std::strtol(num_str.c_str(), NULL, 16);
-
- // as debug, check hex_num = test_val
-
- // make output_char the value - UTF-8?
- // TODO: actually make this work - output char must be 4 bytes, do I need a string for this?
- output_char = static_cast<uint32_t>(hex_num);
-
- return true;
- }
-
- bool Lexer::parse_byte_escape(/*char& current_char, */ int& length, char& output_char) {
- // skip to actual letter
- skip_input();
- current_char = peek_input();
- length++;
-
- switch (current_char) {
- case 'x': {
- // hex char string (null-terminated)
- char hexNum[3] = { 0, 0, 0 };
-
- // first hex char
- skip_input();
- current_char = peek_input();
- length++;
-
- if (!ISXDIGIT(current_char)) {
- rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
- current_char);
- }
- hexNum[0] = current_char;
-
- // second hex char
- skip_input();
- current_char = peek_input();
- length++;
-
- if (!ISXDIGIT(current_char)) {
- rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
- current_char);
- }
- hexNum[1] = current_char;
-
- long hexLong = ::std::strtol(hexNum, NULL, 16);
-
- if (hexLong > 255)
- rust_error_at(get_current_location(),
- "ascii \\x escape '\\x%s' out of range - allows up to '\\xFF'", hexNum);
- // gcc_assert(hexLong < 128); // as ascii
- char hexChar = static_cast<char>(hexLong);
-
- // TODO: fix - does this actually give the right character?
- output_char = hexChar;
- } break;
- case 'n':
- output_char = '\n';
- break;
- case 'r':
- output_char = '\r';
- break;
- case 't':
- output_char = '\t';
- break;
- case '\\':
- output_char = '\\';
- break;
- case '0':
- output_char = '\0';
- break;
- default:
- // rust_error_at(get_current_location(), "unknown escape sequence '\\%c'", current_char);
- // returns false if no parsing could be done
- return false;
- break;
- }
- // returns true if parsing was successful
- return true;
- }
-#endif
-
- // Returns the length of the codepoint at the current position.
- int Lexer::test_get_input_codepoint_length() {
- uint8_t input = peek_input();
-
- if (input < 128) {
- // ascii -- 1 byte
- // return input;
-
- return 1;
- } else if ((input & 0xC0) == 0x80) {
- // invalid (continuation; can't be first char)
- // return 0xFFFE;
-
- return 0;
- } else if ((input & 0xE0) == 0xC0) {
- // 2 bytes
- uint8_t input2 = peek_input(1);
- if ((input2 & 0xC0) != 0x80)
- return 0;
- // return 0xFFFE;
-
- // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
- // return output;
- return 2;
- } else if ((input & 0xF0) == 0xE0) {
- // 3 bytes
- uint8_t input2 = peek_input(1);
- if ((input2 & 0xC0) != 0x80)
- return 0;
- // return 0xFFFE;
-
- uint8_t input3 = peek_input(2);
- if ((input3 & 0xC0) != 0x80)
- return 0;
- // return 0xFFFE;
-
- /*uint32_t output
- = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
- 0); return output;*/
- return 3;
- } else if ((input & 0xF8) == 0xF0) {
- // 4 bytes
- uint8_t input2 = peek_input(1);
- if ((input2 & 0xC0) != 0x80)
- return 0;
- // return 0xFFFE;
-
- uint8_t input3 = peek_input(2);
- if ((input3 & 0xC0) != 0x80)
- return 0;
- // return 0xFFFE;
-
- uint8_t input4 = peek_input(3);
- if ((input4 & 0xC0) != 0x80)
- return 0;
- // return 0xFFFE;
-
- /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
- | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
- return output;*/
- return 4;
- } else {
- rust_error_at(get_current_location(), "invalid UTF-8 (too long)");
- return 0;
- }
- }
-
- // Returns the codepoint at the current position.
- Codepoint Lexer::test_peek_codepoint_input() {
- uint8_t input = peek_input();
-
- if (input < 128) {
- // ascii -- 1 byte
- return { input };
- } else if ((input & 0xC0) == 0x80) {
- // invalid (continuation; can't be first char)
- return { 0xFFFE };
- } else if ((input & 0xE0) == 0xC0) {
- // 2 bytes
- uint8_t input2 = peek_input(1);
- if ((input2 & 0xC0) != 0x80)
- return { 0xFFFE };
-
- uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
- return { output };
- } else if ((input & 0xF0) == 0xE0) {
- // 3 bytes
- uint8_t input2 = peek_input(1);
- if ((input2 & 0xC0) != 0x80)
- return { 0xFFFE };
-
- uint8_t input3 = peek_input(2);
- if ((input3 & 0xC0) != 0x80)
- return { 0xFFFE };
-
- uint32_t output
- = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0);
- return { output };
- } else if ((input & 0xF8) == 0xF0) {
- // 4 bytes
- uint8_t input2 = peek_input(1);
- if ((input2 & 0xC0) != 0x80)
- return { 0xFFFE };
-
- uint8_t input3 = peek_input(2);
- if ((input3 & 0xC0) != 0x80)
- return { 0xFFFE };
-
- uint8_t input4 = peek_input(3);
- if ((input4 & 0xC0) != 0x80)
- return { 0xFFFE };
-
- uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
- | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
- return { output };
- } else {
- rust_error_at(get_current_location(), "invalid UTF-8 (too long)");
- return { 0xFFFE };
- }
- }
-
- void Lexer::test_skip_codepoint_input() {
- int toSkip = test_get_input_codepoint_length();
- gcc_assert(toSkip >= 1);
-
- skip_input(toSkip - 1);
- }
-
- int Lexer::test_get_input_codepoint_n_length(int n_start_offset) {
- uint8_t input = peek_input(n_start_offset);
-
- if (input < 128) {
- // ascii -- 1 byte
- // return input;
- return 1;
- } else if ((input & 0xC0) == 0x80) {
- // invalid (continuation; can't be first char)
- // return 0xFFFE;
- return 0;
- } else if ((input & 0xE0) == 0xC0) {
- // 2 bytes
- uint8_t input2 = peek_input(n_start_offset + 1);
- if ((input2 & 0xC0) != 0x80)
- // return 0xFFFE;
- return 0;
-
- // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
- // return output;
- return 2;
- } else if ((input & 0xF0) == 0xE0) {
- // 3 bytes
- uint8_t input2 = peek_input(n_start_offset + 1);
- if ((input2 & 0xC0) != 0x80)
- // return 0xFFFE;
- return 0;
-
- uint8_t input3 = peek_input(n_start_offset + 2);
- if ((input3 & 0xC0) != 0x80)
- // return 0xFFFE;
- return 0;
-
- /*uint32_t output
- = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
- 0); return output;*/
- return 3;
- } else if ((input & 0xF8) == 0xF0) {
- // 4 bytes
- uint8_t input2 = peek_input(n_start_offset + 1);
- if ((input2 & 0xC0) != 0x80)
- // return 0xFFFE;
- return 0;
-
- uint8_t input3 = peek_input(n_start_offset + 2);
- if ((input3 & 0xC0) != 0x80)
- // return 0xFFFE;
- return 0;
-
- uint8_t input4 = peek_input(n_start_offset + 3);
- if ((input4 & 0xC0) != 0x80)
- // return 0xFFFE;
- return 0;
-
- /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
- | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
- return output;*/
- return 4;
- } else {
- rust_error_at(get_current_location(), "invalid UTF-8 (too long)");
- return 0;
- }
- }
-
- // peeks the codepoint input at n codepoints ahead of current codepoint - try
- // not to use
- Codepoint Lexer::test_peek_codepoint_input(int n) {
- int totalOffset = 0;
-
- // add up all offsets into total offset? does this do what I want?
- for (int i = 0; i < n; i++) {
- totalOffset += test_get_input_codepoint_n_length(totalOffset);
- }
- // issues: this would have (at least) O(n) lookup time, not O(1) like the
- // rest?
-
- // TODO: implement if still needed
-
- // error out of function as it is not implemented
- gcc_assert(1 == 0);
- return { 0 };
- /*
- uint8_t input = peek_input();
-
- if (input < 128) {
- // ascii -- 1 byte
- return input;
- } else if ((input & 0xC0) == 0x80) {
- // invalid (continuation; can't be first char)
- return 0xFFFE;
- } else if ((input & 0xE0) == 0xC0) {
- // 2 bytes
- uint8_t input2 = peek_input(1);
- if ((input2 & 0xC0) != 0x80)
- return 0xFFFE;
-
- uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
- return output;
- } else if ((input & 0xF0) == 0xE0) {
- // 3 bytes
- uint8_t input2 = peek_input(1);
- if ((input2 & 0xC0) != 0x80)
- return 0xFFFE;
-
- uint8_t input3 = peek_input(2);
- if ((input3 & 0xC0) != 0x80)
- return 0xFFFE;
-
- uint32_t output
- = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 &
- 0x3F) << 0); return output; } else if ((input & 0xF8) == 0xF0) {
- // 4 bytes
- uint8_t input2 = peek_input(1);
- if ((input2 & 0xC0) != 0x80)
- return 0xFFFE;
-
- uint8_t input3 = peek_input(2);
- if ((input3 & 0xC0) != 0x80)
- return 0xFFFE;
-
- uint8_t input4 = peek_input(3);
- if ((input4 & 0xC0) != 0x80)
- return 0xFFFE;
-
- uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
- | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) <<
- 0); return output; } else { rust_error_at(get_current_location(), "invalid
- UTF-8 (too long)"); return 0xFFFE;
- }*/
+ return 0;
+ }
+ else if ((input & 0xE0) == 0xC0)
+ {
+ // 2 bytes
+ uint8_t input2 = peek_input (1);
+ if ((input2 & 0xC0) != 0x80)
+ return 0;
+ // return 0xFFFE;
+
+ // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
+ // return output;
+ return 2;
+ }
+ else if ((input & 0xF0) == 0xE0)
+ {
+ // 3 bytes
+ uint8_t input2 = peek_input (1);
+ if ((input2 & 0xC0) != 0x80)
+ return 0;
+ // return 0xFFFE;
+
+ uint8_t input3 = peek_input (2);
+ if ((input3 & 0xC0) != 0x80)
+ return 0;
+ // return 0xFFFE;
+
+ /*uint32_t output
+ = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
+ 0); return output;*/
+ return 3;
+ }
+ else if ((input & 0xF8) == 0xF0)
+ {
+ // 4 bytes
+ uint8_t input2 = peek_input (1);
+ if ((input2 & 0xC0) != 0x80)
+ return 0;
+ // return 0xFFFE;
+
+ uint8_t input3 = peek_input (2);
+ if ((input3 & 0xC0) != 0x80)
+ return 0;
+ // return 0xFFFE;
+
+ uint8_t input4 = peek_input (3);
+ if ((input4 & 0xC0) != 0x80)
+ return 0;
+ // return 0xFFFE;
+
+ /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
+ | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
+ return output;*/
+ return 4;
+ }
+ else
+ {
+ rust_error_at (get_current_location (), "invalid UTF-8 (too long)");
+ return 0;
+ }
+}
+
+// Decodes the UTF-8 sequence at the current input position into a Codepoint; malformed input yields 0xFFFE. NOTE(review): overlong forms and surrogate values are not rejected here.
+Codepoint
+Lexer::peek_codepoint_input ()
+{
+ uint8_t input = peek_input (); // lead byte: its high bits select the sequence length
+
+ if (input < 128)
+ {
+ // high bit clear: one-byte ASCII, the byte itself is the codepoint
+ return {input};
+ }
+ else if ((input & 0xC0) == 0x80)
+ {
+ // 10xxxxxx is a continuation byte, which cannot begin a sequence
+ return {0xFFFE};
+ }
+ else if ((input & 0xE0) == 0xC0)
+ {
+ // 110xxxxx lead: two-byte sequence (5 + 6 payload bits)
+ uint8_t input2 = peek_input (1);
+ if ((input2 & 0xC0) != 0x80) // every trailing byte must be 10xxxxxx
+ return {0xFFFE};
+
+ uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
+ return {output};
+ }
+ else if ((input & 0xF0) == 0xE0)
+ {
+ // 1110xxxx lead: three-byte sequence (4 + 6 + 6 payload bits)
+ uint8_t input2 = peek_input (1);
+ if ((input2 & 0xC0) != 0x80)
+ return {0xFFFE};
+
+ uint8_t input3 = peek_input (2);
+ if ((input3 & 0xC0) != 0x80)
+ return {0xFFFE};
+
+ uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
+ | ((input3 & 0x3F) << 0);
+ return {output};
+ }
+ else if ((input & 0xF8) == 0xF0)
+ {
+ // 11110xxx lead: four-byte sequence (3 + 6 + 6 + 6 payload bits)
+ uint8_t input2 = peek_input (1);
+ if ((input2 & 0xC0) != 0x80)
+ return {0xFFFE};
+
+ uint8_t input3 = peek_input (2);
+ if ((input3 & 0xC0) != 0x80)
+ return {0xFFFE};
+
+ uint8_t input4 = peek_input (3);
+ if ((input4 & 0xC0) != 0x80)
+ return {0xFFFE};
+
+ uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
+ | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
+ return {output};
+ }
+ else // only lead bytes 0xF8-0xFF (11111xxx) reach this branch
+ {
+ rust_error_at (get_current_location (), "invalid UTF-8 (too long)");
+ return {0xFFFE};
+ }
+}
+
+void
+Lexer::skip_codepoint_input () // Advances the input past the UTF-8 sequence at the current position.
+{
+ int toSkip = get_input_codepoint_length (); // byte length of that sequence; the helper returns 0 when the bytes are malformed
+ gcc_assert (toSkip >= 1); // NOTE(review): malformed input therefore trips this assertion rather than producing a diagnostic
+
+ skip_input (toSkip - 1); // presumably skip_input (n) consumes n + 1 bytes, mirroring skip_token (n) -- confirm
+}
+
+int
+Lexer::test_get_input_codepoint_n_length (int n_start_offset) // Byte length (1-4) of the UTF-8 sequence whose lead byte is at peek offset n_start_offset, or 0 if it is malformed.
+{
+ uint8_t input = peek_input (n_start_offset); // lead byte: its high bits select the sequence length
+
+ if (input < 128)
+ {
+ // high bit clear: one-byte ASCII
+ // (only the length is reported; the byte value is not returned)
+ return 1;
+ }
+ else if ((input & 0xC0) == 0x80)
+ {
+ // 10xxxxxx is a continuation byte, which cannot begin a sequence
+ // so report length 0 to signal malformed input
+ return 0;
+ }
+ else if ((input & 0xE0) == 0xC0)
+ {
+ // 110xxxxx lead: two-byte sequence
+ uint8_t input2 = peek_input (n_start_offset + 1);
+ if ((input2 & 0xC0) != 0x80)
+ // trailing byte is not 10xxxxxx: malformed
+ return 0;
+
+ // Only the length is wanted, so the payload bits are not assembled
+ // here (peek_codepoint_input performs the actual decode).
+ return 2;
+ }
+ else if ((input & 0xF0) == 0xE0)
+ {
+ // 1110xxxx lead: three-byte sequence
+ uint8_t input2 = peek_input (n_start_offset + 1);
+ if ((input2 & 0xC0) != 0x80)
+ // trailing byte is not 10xxxxxx: malformed
+ return 0;
+
+ uint8_t input3 = peek_input (n_start_offset + 2);
+ if ((input3 & 0xC0) != 0x80)
+ // trailing byte is not 10xxxxxx: malformed
+ return 0;
+
+ /*uint32_t output
+ = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
+ 0); return output;*/
+ return 3;
+ }
+ else if ((input & 0xF8) == 0xF0)
+ {
+ // 11110xxx lead: four-byte sequence
+ uint8_t input2 = peek_input (n_start_offset + 1);
+ if ((input2 & 0xC0) != 0x80)
+ // trailing byte is not 10xxxxxx: malformed
+ return 0;
+
+ uint8_t input3 = peek_input (n_start_offset + 2);
+ if ((input3 & 0xC0) != 0x80)
+ // trailing byte is not 10xxxxxx: malformed
+ return 0;
+
+ uint8_t input4 = peek_input (n_start_offset + 3);
+ if ((input4 & 0xC0) != 0x80)
+ // trailing byte is not 10xxxxxx: malformed
+ return 0;
+
+ /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
+ | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
+ return output;*/
+ return 4;
+ }
+ else // only lead bytes 0xF8-0xFF (11111xxx) reach this branch
+ {
+ rust_error_at (get_current_location (), "invalid UTF-8 (too long)");
+ return 0;
+ }
+}
+
+// Intended to peek the codepoint n codepoints ahead of the current one, but
+// not implemented: every call ends at the always-false assertion below.
+Codepoint
+Lexer::test_peek_codepoint_input (int n)
+{
+ int totalOffset = 0;
+
+ // Sum the byte lengths of the first n codepoints; the total is never used.
+ for (int i = 0; i < n; i++)
+ {
+ totalOffset += test_get_input_codepoint_n_length (totalOffset); // adds 0 for a malformed sequence, leaving the offset where it was
}
+ // Design concern from the original author: locating the n-th codepoint this
+ // way costs at least O(n), not the O(1) expected of the other peek functions.
+
+ // TODO: implement if still needed
+
+ // Fail deliberately: the function is unimplemented.
+ gcc_assert (1 == 0);
+ return {0}; // placeholder value following the assertion
+ /* Disabled draft: decodes the codepoint at the current position and never uses n.
+ uint8_t input = peek_input();
+
+ if (input < 128) {
+ // ascii -- 1 byte
+ return input;
+ } else if ((input & 0xC0) == 0x80) {
+ // invalid (continuation; can't be first char)
+ return 0xFFFE;
+ } else if ((input & 0xE0) == 0xC0) {
+ // 2 bytes
+ uint8_t input2 = peek_input(1);
+ if ((input2 & 0xC0) != 0x80)
+ return 0xFFFE;
+
+ uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
+ return output;
+ } else if ((input & 0xF0) == 0xE0) {
+ // 3 bytes
+ uint8_t input2 = peek_input(1);
+ if ((input2 & 0xC0) != 0x80)
+ return 0xFFFE;
+
+ uint8_t input3 = peek_input(2);
+ if ((input3 & 0xC0) != 0x80)
+ return 0xFFFE;
+
+ uint32_t output
+ = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 &
+ 0x3F) << 0); return output; } else if ((input & 0xF8) == 0xF0) {
+ // 4 bytes
+ uint8_t input2 = peek_input(1);
+ if ((input2 & 0xC0) != 0x80)
+ return 0xFFFE;
+
+ uint8_t input3 = peek_input(2);
+ if ((input3 & 0xC0) != 0x80)
+ return 0xFFFE;
+
+ uint8_t input4 = peek_input(3);
+ if ((input4 & 0xC0) != 0x80)
+ return 0xFFFE;
+
+ uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
+ | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) <<
+ 0); return output; } else { rust_error_at(get_current_location(), "invalid
+ UTF-8 (too long)"); return 0xFFFE;
+ }*/
+}
} // namespace Rust
diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h
index 1465cb2..8a031ed 100644
--- a/gcc/rust/lex/rust-lex.h
+++ b/gcc/rust/lex/rust-lex.h
@@ -31,28 +31,37 @@ private:
// Builds a token from the input queue.
TokenPtr build_token ();
- // ok maybe all these may mean the lexer structure needs to be rethought
- /* separated into functions because main method was too long, but they rely on
- * and change state in the lexer, so variables must be passed by reference. */
std::pair<std::string, int> parse_in_decimal ();
std::pair<std::string, int> parse_in_exponent_part ();
std::pair<PrimitiveCoreType, int> parse_in_type_suffix ();
- /*bool parse_ascii_escape (int &length,
- char &output_char);*/
- /*bool parse_quote_escape (char& current_char, int &length,
- char &output_char);*/
- /*bool parse_unicode_escape (
- char& current_char, int &length, Codepoint &output_char);*/
- /*bool parse_byte_escape (char& current_char, int &length,
- char &output_char);*/
std::tuple<char, int, bool> parse_escape (char opening_char);
std::tuple<Codepoint, int, bool> parse_utf8_escape (char opening_char);
- int test_get_input_codepoint_length ();
+ int parse_partial_string_continue ();
+ std::pair<long, int> parse_partial_hex_escape ();
+ std::pair<Codepoint, int> parse_partial_unicode_escape ();
+
+ int get_input_codepoint_length ();
int test_get_input_codepoint_n_length (int n_start_offset);
- Codepoint test_peek_codepoint_input ();
- Codepoint test_peek_codepoint_input (
- int n); // maybe can use get_input_codepoint_length to get starting index
- void test_skip_codepoint_input ();
+ Codepoint peek_codepoint_input ();
+ Codepoint test_peek_codepoint_input (int n);
+ void skip_codepoint_input ();
+
+ TokenPtr parse_byte_char (Location loc);
+ TokenPtr parse_byte_string (Location loc);
+ TokenPtr parse_raw_byte_string (Location loc);
+ TokenPtr parse_raw_identifier (Location loc);
+ TokenPtr parse_string (Location loc);
+ TokenPtr maybe_parse_raw_string (Location loc);
+ TokenPtr parse_raw_string (Location loc, int initial_hash_count);
+ TokenPtr parse_non_decimal_int_literals (Location loc);
+ TokenPtr parse_decimal_int_or_float (Location loc);
+ TokenPtr parse_char_or_lifetime (Location loc);
+ TokenPtr parse_identifier_or_keyword (Location loc);
+
+ template <typename IsDigitFunc>
+ TokenPtr parse_non_decimal_int_literal (Location loc,
+ IsDigitFunc is_digit_func,
+ std::string existent_str, int base);
public:
// Construct lexer with input file and filename provided
@@ -68,14 +77,14 @@ public:
Lexer &operator= (Lexer &&other) = default;
// Returns token n tokens ahead of current position.
- const_TokenPtr peek_token (int n);
+ const_TokenPtr peek_token (int n) { return token_queue.peek (n); }
// Peeks the current token.
- const_TokenPtr peek_token ();
+ const_TokenPtr peek_token () { return peek_token (0); }
// Advances current token to n + 1 tokens ahead of current position.
- void skip_token (int n);
+ void skip_token (int n) { token_queue.skip (n); }
// Skips the current token.
- void skip_token ();
+ void skip_token () { skip_token (0); }
// Replaces the current token with a specified token.
void replace_current_token (TokenPtr replacement);
@@ -90,6 +99,8 @@ private:
int current_line;
// Current column number.
int current_column;
+ // Current character.
+ int current_char;
// Line map.
Linemap *line_map;
@@ -132,11 +143,6 @@ private:
TokenSource token_source;
// Token stream queue.
buffered_queue<std::shared_ptr<Token>, TokenSource> token_queue;
-
- // START CRAPPY CHANGES
- int current_char;
-
- // END CRAPPY CHANGES
};
} // namespace Rust