// Copyright (C) 2020-2023 Free Software Foundation, Inc.
// This file is part of GCC.
// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.
// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3. If not see
// <http://www.gnu.org/licenses/>.
#include "rust-system.h"
#include "rust-lex.h"
#include "rust-diagnostics.h"
#include "rust-linemap.h"
#include "rust-session-manager.h"
#include "safe-ctype.h"
namespace Rust {
// TODO: move to separate compilation unit?
/* Appends the UTF-8 encoding of the codepoint CHAR32 to STR and returns STR.
 *
 * Values below 0x80 are written as one byte, below 0x800 as two bytes, below
 * 0x10000 as three bytes and below 0x200000 as four bytes. Anything larger is
 * not appended; it is only reported through rust_debug. */
std::string &
operator+= (std::string &str, Codepoint char32)
{
  if (char32.value < 0x80)
    {
      // 0xxxxxxx
      str += static_cast<char> (char32.value);
    }
  else if (char32.value < (0x1F + 1) << (1 * 6))
    {
      // 110xxxxx 10xxxxxx
      str += static_cast<char> (0xC0 | ((char32.value >> 6) & 0x1F));
      str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
    }
  else if (char32.value < (0x0F + 1) << (2 * 6))
    {
      // 1110xxxx 10xxxxxx 10xxxxxx
      str += static_cast<char> (0xE0 | ((char32.value >> 12) & 0x0F));
      str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
      str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
    }
  else if (char32.value < (0x07 + 1) << (3 * 6))
    {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      str += static_cast<char> (0xF0 | ((char32.value >> 18) & 0x07));
      str += static_cast<char> (0x80 | ((char32.value >> 12) & 0x3F));
      str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
      str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
    }
  else
    {
      // Too large for a four-byte sequence: leave STR untouched.
      rust_debug ("Invalid unicode codepoint found: '%u' ", char32.value);
    }
  return str;
}
/* Returns this codepoint as a freshly built string, produced by appending
 * the codepoint to an empty std::string with operator+=. */
std::string
Codepoint::as_string ()
{
  std::string encoded;
  encoded += *this;
  return encoded;
}
/* Returns true for characters that may appear in a float literal: decimal
 * digits and the exponent markers 'e'/'E'. '_' and '.' are deliberately
 * excluded because they need lookahead to be handled. */
bool
is_float_digit (char number)
{
  if (number == 'E' || number == 'e')
    return true;

  return ISDIGIT (number);
}
/* Returns true if NUMBER is a hexadecimal digit. Currently a plain wrapper
 * around ISXDIGIT from safe-ctype, kept as a separate function in case Rust's
 * rules ever need to differ. */
bool
is_x_digit (char number)
{
  const bool is_hex = ISXDIGIT (number);
  return is_hex;
}
/* Returns true if NUMBER is an octal digit ('0' through '7'). */
bool
is_octal_digit (char number)
{
  switch (number)
    {
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
      return true;
    default:
      return false;
    }
}
/* Returns true if NUMBER is a binary digit ('0' or '1'). */
bool
is_bin_digit (char number)
{
  switch (number)
    {
    case '0':
    case '1':
      return true;
    default:
      return false;
    }
}
/* Returns true unless CHARACTER is '.', '_' or an alphabetic character,
 * i.e. true when CHARACTER passes this check for following the dot of a
 * float literal. */
bool
check_valid_float_dot_end (char character)
{
  const bool rejected
    = character == '.' || character == '_' || ISALPHA (character);
  return !rejected;
}
/* Returns true if CHARACTER is whitespace. Currently this is just ISSPACE
 * from safe-ctype, but it may change in future. */
bool
is_whitespace (char character)
{
  return ISSPACE (character) ? true : false;
}
/* Returns true if CHARACTER is one of the letters that introduce a
 * non-decimal integer literal after a leading '0': 'x' (hex), 'o' (octal)
 * or 'b' (binary). Only the lowercase letters are accepted. */
bool
is_non_decimal_int_literal_separator (char character)
{
  switch (character)
    {
    case 'x':
    case 'o':
    case 'b':
      return true;
    default:
      return false;
    }
}
/* Constructs a lexer that reads from the in-memory string INPUT.
 *
 * The file handle member is set to RAIIFile::create_error (), there is no
 * linemap (so get_current_location () returns default locations) and token
 * dumping is disabled. Note that inside the initializer list the name `input`
 * in `BufferInputSource (input, 0)` refers to the constructor parameter, not
 * the member of the same name. */
Lexer::Lexer (const std::string &input)
  : input (RAIIFile::create_error ()), current_line (1), current_column (1),
    line_map (nullptr), dump_lex_out (Optional<std::ofstream &>::none ()),
    raw_input_source (new BufferInputSource (input, 0)),
    input_queue{*raw_input_source}, token_queue (TokenSource (this))
{}
/* Constructs a lexer that reads from the already opened file FILE_INPUT.
 *
 * FILENAME is only used to announce the file to LINEMAP. LINEMAP may be null,
 * in which case no file is registered. DUMP_LEX_OPT optionally holds the
 * stream that skip_token () dumps tokens to. */
Lexer::Lexer (const char *filename, RAIIFile file_input, Linemap *linemap,
	      Optional<std::ofstream &> dump_lex_opt)
  : input (std::move (file_input)), current_line (1), current_column (1),
    line_map (linemap), dump_lex_out (dump_lex_opt),
    raw_input_source (new FileInputSource (input.get_raw ())),
    input_queue{*raw_input_source}, token_queue (TokenSource (this))
{
  // inform line_table that file is being entered and is in line 1
  if (linemap)
    line_map->start_file (filename, current_line);
}
/* Destructor. Intentionally does nothing at the moment. */
Lexer::~Lexer ()
{
/* The line map's stop () (the equivalent of what the original destructor
 * did) is apparently meant to be called once all files have finished
 * parsing, for cleanup. On the other hand, the code it calls to leave a line
 * map is described in the GCC docs as being useful for "just leaving an
 * included header" and similar, so this line mapping functionality may need
 * fixing.
 * FIXME: find out whether this occurs. */
// line_map->stop();
}
/* TODO: need to optimise somehow to avoid the virtual function call in the
* tight loop. Best idea at the moment is CRTP, but that might make lexer
* implementation annoying when storing the "base class" (i.e. would need
* template parameter everywhere), although in practice it would mostly just
* look ugly and make enclosing classes like Parser also require a type
* parameter. At this point a macro might be better. OK I guess macros can be
* replaced by constexpr if or something if possible. */
/* Returns the location for the current column as reported by the linemap,
 * or a default-constructed Location when the lexer has no linemap. */
Location
Lexer::get_current_location ()
{
  // If we have no linemap, we're lexing something without proper locations
  if (!line_map)
    return Location ();

  return line_map->get_location (current_column);
}
/* Returns the input character N positions ahead of the current one without
 * consuming anything; N == 0 is the next unread character. */
int
Lexer::peek_input (int n)
{
  const int upcoming = input_queue.peek (n);
  return upcoming;
}
/* Returns the next unread input character without consuming it. */
int
Lexer::peek_input ()
{
  const int upcoming = peek_input (0);
  return upcoming;
}
/* Forwards N to the input queue's skip (). Judging from the callers in this
 * file, skip (N) consumes N + 1 characters (skip (0) consumes one). */
void
Lexer::skip_input (int n)
{
  input_queue.skip (n);
}
/* Consumes the next input character; shorthand for skip_input (0). */
void
Lexer::skip_input ()
{
  const int no_extra_chars = 0;
  skip_input (no_extra_chars);
}
/* Skips tokens in the token queue, passing N through to its skip (). When a
 * dump stream was supplied (dump-lex option) the skipped tokens are written
 * to it via dump_and_skip () instead of being skipped silently. */
void
Lexer::skip_token (int n)
{
  const bool dumping_enabled = dump_lex_out.is_some ();
  if (!dumping_enabled)
    {
      token_queue.skip (n);
      return;
    }

  dump_and_skip (n);
}
void
Lexer::dump_and_skip (int n)
{
std::ofstream &out = dump_lex_out.get ();
bool found_eof = false;
const_TokenPtr tok;
for (int i = 0; i < n + 1; i++)
{
if (!found_eof)
{
tok = peek_token ();
found_eof |= tok->get_id () == Rust::END_OF_FILE;
Location loc = tok->get_locus ();
out << "token_id_to_str ();
out << (tok->has_str () ? (std::string (", text=") + tok->get_str ()
+ std::string (", typehint=")
+ std::string (tok->get_type_hint_str ()))
: "")
<< " ";
out << get_line_map ()->to_string (loc) << " ";
}
token_queue.skip (0);
}
}
/* Replaces the current value in the token queue with REPLACEMENT.
 * Deprecated: every call emits a debug message saying so. */
void
Lexer::replace_current_token (TokenPtr replacement)
{
  token_queue.replace_current_value (replacement);

  // Make remaining users of this deprecated entry point visible in debug
  // output.
  rust_debug ("called 'replace_current_token' - this is deprecated");
}
/* File-local lookup tables for Lexer::classify_keyword, which binary-searches
 * the array of keyword spellings. Both arrays are generated from
 * RS_TOKEN_LIST with x-macros: RS_TOKEN expands to nothing, so only the
 * RS_TOKEN_KEYWORD entries contribute, and entry i of keyword_index
 * corresponds to entry i of keyword_keys.
 * NOTE(review): the binary search relies on RS_TOKEN_LIST listing keywords in
 * sorted order of their spelling - confirm where the list is defined. */
namespace {
// Keyword spellings, in RS_TOKEN_LIST order.
// TODO: make constexpr when update to c++20
const std::string keyword_index[] = {
#define RS_TOKEN(x, y)
#define RS_TOKEN_KEYWORD(name, keyword) keyword,
RS_TOKEN_LIST
#undef RS_TOKEN_KEYWORD
#undef RS_TOKEN
};
// Token ids of the keywords, in the same order as keyword_index.
constexpr TokenId keyword_keys[] = {
#define RS_TOKEN(x, y)
#define RS_TOKEN_KEYWORD(name, keyword) name,
RS_TOKEN_LIST
#undef RS_TOKEN_KEYWORD
#undef RS_TOKEN
};
// Number of entries in keyword_index (and therefore in keyword_keys).
constexpr int num_keywords = sizeof (keyword_index) / sizeof (*keyword_index);
} // namespace
/* Classifies STR as either a keyword or a plain identifier.
 *
 * Returns the keyword's token id when STR is a reserved keyword in the
 * edition being compiled, and IDENTIFIER otherwise. */
TokenId
Lexer::classify_keyword (const std::string &str)
{
  // Binary search in the keyword spelling table.
  const std::string *const table_end = keyword_index + num_keywords;
  const std::string *const match
    = std::lower_bound (keyword_index, table_end, str);

  const bool is_keyword = match != table_end && *match == str;
  if (!is_keyword)
    return IDENTIFIER;

  // TODO: possibly replace this x-macro system with something like hash map?

  // We now have the expected token ID of the reserved keyword. However, some
  // keywords are reserved starting in certain editions. For example, `try` is
  // only a reserved keyword in editions >=2018. The language might gain new
  // reserved keywords in the future.
  //
  // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords
  const TokenId id = keyword_keys[match - keyword_index];

  // `try` is not a reserved keyword before 2018
  const bool is_edition_2015
    = Session::get_instance ().options.get_edition ()
      == CompileOptions::Edition::E2015;
  if (is_edition_2015 && id == TRY)
    return IDENTIFIER;

  return id;
}
/* Lexes and returns the next token from the input.
 *
 * Characters are read with peek_input ()/skip_input (). Whitespace and
 * ordinary (non-doc) comments produce no token: those paths update
 * current_line/current_column and `continue` the outer loop. Doc comments,
 * punctuation, literals, identifiers and keywords each return a token; the
 * multi-character literal forms are delegated to the parse_* helpers. When
 * nothing matches (or a helper returns nullptr) an "unexpected character"
 * error is reported and scanning carries on with the next character.
 * Returns an END_OF_FILE token once EOF is read. */
TokenPtr
Lexer::build_token ()
{
// loop to go through multiple characters to build a single token
while (true)
{
// Location of the character about to be consumed; multi-character tokens
// adjust it below with `loc += ...`.
Location loc = get_current_location ();
current_char = peek_input ();
skip_input ();
// detect UTF8 bom
//
// Must be the first thing on the first line.
// There might be an optional BOM (Byte Order Mark), which for UTF-8 is
// the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
if (current_line == 1 && current_column == 1 && current_char == 0xef
&& peek_input () == 0xbb && peek_input (1) == 0xbf)
{
// drop the two remaining BOM bytes, then load the first real character
skip_input (1);
current_char = peek_input ();
skip_input ();
}
// detect shebang
// Must be the first thing on the first line, starting with #!
// But since an attribute can also start with an #! we don't count it as a
// shebang line when after any whitespace or comments there is a [. If it
// is a shebang line we simple drop the line. Otherwise we don't consume
// any characters and fall through to the real tokenizer.
if (current_line == 1 && current_column == 1 && current_char == '#'
&& peek_input () == '!')
{
// n is a pure lookahead offset: nothing is consumed while scanning past
// whitespace and comments in search of the deciding character.
int n = 1;
while (true)
{
int next_char = peek_input (n);
if (is_whitespace (next_char))
n++;
else if ((next_char == '/' && peek_input (n + 1) == '/'
&& peek_input (n + 2) != '!'
&& peek_input (n + 2) != '/')
|| (next_char == '/' && peek_input (n + 1) == '/'
&& peek_input (n + 2) == '/'
&& peek_input (n + 3) == '/'))
{
// two // or four ////
// A single line comment
// (but not an inner or outer doc comment)
n += 2;
next_char = peek_input (n);
while (next_char != '\n' && next_char != EOF)
{
n++;
next_char = peek_input (n);
}
if (next_char == '\n')
n++;
}
else if (next_char == '/' && peek_input (n + 1) == '*'
&& peek_input (n + 2) == '*'
&& peek_input (n + 3) == '/')
{
/**/
n += 4;
}
else if (next_char == '/' && peek_input (n + 1) == '*'
&& peek_input (n + 2) == '*' && peek_input (n + 3) == '*'
&& peek_input (n + 4) == '/')
{
/***/
n += 5;
}
else if ((next_char == '/' && peek_input (n + 1) == '*'
&& peek_input (n + 2) != '*'
&& peek_input (n + 2) != '!')
|| (next_char == '/' && peek_input (n + 1) == '*'
&& peek_input (n + 2) == '*'
&& peek_input (n + 3) == '*'))
{
// one /* or three /***
// Start of a block comment
// (but not an inner or outer doc comment)
n += 2;
// block comments nest, so track the depth
int level = 1;
while (level > 0)
{
if (peek_input (n) == EOF)
break;
else if (peek_input (n) == '/'
&& peek_input (n + 1) == '*')
{
n += 2;
level += 1;
}
else if (peek_input (n) == '*'
&& peek_input (n + 1) == '/')
{
n += 2;
level -= 1;
}
else
n++;
}
}
else if (next_char != '[')
{
// definitely shebang, ignore the first line
while (current_char != '\n' && current_char != EOF)
{
current_char = peek_input ();
skip_input ();
}
// newline
// NOTE(review): if the loop above stopped on '\n', current_char is
// still '\n' when control reaches the switch below, whose '\n' case
// increments current_line again - confirm the double count is
// intended.
current_line++;
current_column = 1;
// tell line_table that new line starts
start_line (current_line, max_column_hint);
break;
}
else
break; /* Definitely not a shebang line. */
}
}
// return end of file token if end of file
if (current_char == EOF)
return Token::make (END_OF_FILE, loc);
// if not end of file, start tokenising
switch (current_char)
{
/* ignore whitespace characters for tokens but continue updating
* location */
case '\n': // newline
current_line++;
current_column = 1;
// tell line_table that new line starts
start_line (current_line, max_column_hint);
continue;
case '\r': // cr
// Ignore, we expect a newline (lf) soon.
continue;
case ' ': // space
current_column++;
continue;
case '\t': // tab
// width of a tab is not well-defined, assume 8 spaces
current_column += 8;
continue;
// punctuation - actual tokens
case '=':
if (peek_input () == '>')
{
// match arm arrow
skip_input ();
current_column += 2;
loc += 1;
return Token::make (MATCH_ARROW, loc);
}
else if (peek_input () == '=')
{
// equality operator
skip_input ();
current_column += 2;
loc += 1;
return Token::make (EQUAL_EQUAL, loc);
}
else
{
// assignment operator
current_column++;
return Token::make (EQUAL, loc);
}
case '(':
current_column++;
return Token::make (LEFT_PAREN, loc);
case '-':
if (peek_input () == '>')
{
// return type specifier
skip_input ();
current_column += 2;
loc += 1;
return Token::make (RETURN_TYPE, loc);
}
else if (peek_input () == '=')
{
// minus-assign
skip_input ();
current_column += 2;
loc += 1;
return Token::make (MINUS_EQ, loc);
}
else
{
// minus
current_column++;
return Token::make (MINUS, loc);
}
case '+':
if (peek_input () == '=')
{
// add-assign
skip_input ();
current_column += 2;
loc += 1;
return Token::make (PLUS_EQ, loc);
}
else
{
// add
current_column++;
return Token::make (PLUS, loc);
}
case ')':
current_column++;
return Token::make (RIGHT_PAREN, loc);
case ';':
current_column++;
return Token::make (SEMICOLON, loc);
case '*':
if (peek_input () == '=')
{
// multiplication-assign
skip_input ();
current_column += 2;
loc += 1;
return Token::make (ASTERISK_EQ, loc);
}
else
{
// multiplication
current_column++;
return Token::make (ASTERISK, loc);
}
case ',':
current_column++;
return Token::make (COMMA, loc);
case '/':
// '/' starts division, division-assign, or one of the comment forms;
// the order of the checks below decides which form wins.
if (peek_input () == '=')
{
// division-assign
skip_input ();
current_column += 2;
loc += 1;
return Token::make (DIV_EQ, loc);
}
else if ((peek_input () == '/' && peek_input (1) != '!'
&& peek_input (1) != '/')
|| (peek_input () == '/' && peek_input (1) == '/'
&& peek_input (2) == '/'))
{
// two // or four ////
// single line comment
// (but not an inner or outer doc comment)
skip_input ();
current_column += 2;
current_char = peek_input ();
// basically ignore until line finishes
// (the terminating '\n' is left in the input for the next iteration)
while (current_char != '\n' && current_char != EOF)
{
skip_input ();
current_column++; // not used
current_char = peek_input ();
}
continue;
}
else if (peek_input () == '/'
&& (peek_input (1) == '!' || peek_input (1) == '/'))
{
/* single line doc comment, inner or outer. */
bool is_inner = peek_input (1) == '!';
skip_input (1);
current_column += 3;
// collects the comment text after the three-character opener
std::string str;
str.reserve (32);
current_char = peek_input ();
while (current_char != '\n')
{
skip_input ();
if (current_char == '\r')
{
// "\r\n" ends the comment like a bare "\n"; a lone '\r' is an error
char next_char = peek_input ();
if (next_char == '\n')
{
current_char = '\n';
break;
}
rust_error_at (
loc, "Isolated CR %<\\r%> not allowed in doc comment");
current_char = next_char;
continue;
}
if (current_char == EOF)
{
rust_error_at (
loc, "unexpected EOF while looking for end of comment");
break;
}
str += current_char;
current_char = peek_input ();
}
skip_input ();
current_line++;
current_column = 1;
// tell line_table that new line starts
start_line (current_line, max_column_hint);
str.shrink_to_fit ();
// NOTE(review): str.size () is unsigned, so for an empty doc comment
// `str.size () - 1` wraps around instead of giving -1 - confirm how
// Location's += treats that value.
loc += str.size () - 1;
if (is_inner)
return Token::make_inner_doc_comment (loc, std::move (str));
else
return Token::make_outer_doc_comment (loc, std::move (str));
}
else if (peek_input () == '*' && peek_input (1) == '*'
&& peek_input (2) == '/')
{
/**/
skip_input (2);
current_column += 4;
continue;
}
else if (peek_input () == '*' && peek_input (1) == '*'
&& peek_input (2) == '*' && peek_input (3) == '/')
{
/***/
skip_input (3);
current_column += 5;
continue;
}
else if ((peek_input () == '*' && peek_input (1) != '!'
&& peek_input (1) != '*')
|| (peek_input () == '*' && peek_input (1) == '*'
&& peek_input (2) == '*'))
{
// one /* or three /***
// block comment
// (but not an inner or outer doc comment)
skip_input ();
current_column += 2;
// block comments nest, so track the depth
int level = 1;
while (level > 0)
{
current_char = peek_input ();
if (current_char == EOF)
{
rust_error_at (
loc, "unexpected EOF while looking for end of comment");
break;
}
// if /* found
if (current_char == '/' && peek_input (1) == '*')
{
// skip /* characters
skip_input (1);
current_column += 2;
level += 1;
continue;
}
// ignore until */ is found
if (current_char == '*' && peek_input (1) == '/')
{
// skip */ characters
skip_input (1);
current_column += 2;
level -= 1;
continue;
}
if (current_char == '\n')
{
skip_input ();
current_line++;
current_column = 1;
// tell line_table that new line starts
start_line (current_line, max_column_hint);
continue;
}
skip_input ();
current_column++;
}
// refresh new token
continue;
}
else if (peek_input () == '*'
&& (peek_input (1) == '!' || peek_input (1) == '*'))
{
// block doc comment, inner /*! or outer /**
bool is_inner = peek_input (1) == '!';
skip_input (1);
current_column += 3;
// collects the comment text, including any nested comment delimiters
std::string str;
str.reserve (96);
int level = 1;
while (level > 0)
{
current_char = peek_input ();
if (current_char == EOF)
{
rust_error_at (
loc, "unexpected EOF while looking for end of comment");
break;
}
// if /* found
if (current_char == '/' && peek_input (1) == '*')
{
// skip /* characters
skip_input (1);
current_column += 2;
level += 1;
str += "/*";
continue;
}
// ignore until */ is found
if (current_char == '*' && peek_input (1) == '/')
{
// skip */ characters
skip_input (1);
current_column += 2;
level -= 1;
// only a nested closer belongs to the text; the final one does not
if (level > 0)
str += "*/";
continue;
}
if (current_char == '\r' && peek_input (1) != '\n')
rust_error_at (
loc, "Isolated CR %<\\r%> not allowed in doc comment");
if (current_char == '\n')
{
skip_input ();
current_line++;
current_column = 1;
// tell line_table that new line starts
start_line (current_line, max_column_hint);
str += '\n';
continue;
}
str += current_char;
skip_input ();
current_column++;
}
str.shrink_to_fit ();
// NOTE(review): same unsigned wrap-around concern as for single line
// doc comments when str is empty - confirm.
loc += str.size () - 1;
if (is_inner)
return Token::make_inner_doc_comment (loc, std::move (str));
else
return Token::make_outer_doc_comment (loc, std::move (str));
}
else
{
// division
current_column++;
return Token::make (DIV, loc);
}
case '%':
if (peek_input () == '=')
{
// modulo-assign
skip_input ();
current_column += 2;
loc += 1;
return Token::make (PERCENT_EQ, loc);
}
else
{
// modulo
current_column++;
return Token::make (PERCENT, loc);
}
case '^':
if (peek_input () == '=')
{
// xor-assign?
skip_input ();
current_column += 2;
loc += 1;
return Token::make (CARET_EQ, loc);
}
else
{
// xor?
current_column++;
return Token::make (CARET, loc);
}
case '<':
if (peek_input () == '<')
{
if (peek_input (1) == '=')
{
// left-shift assign
skip_input (1);
current_column += 3;
loc += 2;
return Token::make (LEFT_SHIFT_EQ, loc);
}
else
{
// left-shift
skip_input ();
current_column += 2;
loc += 1;
return Token::make (LEFT_SHIFT, loc);
}
}
else if (peek_input () == '=')
{
// smaller than or equal to
skip_input ();
current_column += 2;
loc += 1;
return Token::make (LESS_OR_EQUAL, loc);
}
else
{
// smaller than
current_column++;
return Token::make (LEFT_ANGLE, loc);
}
// never reached: every branch above returns
break;
case '>':
if (peek_input () == '>')
{
if (peek_input (1) == '=')
{
// right-shift-assign
skip_input (1);
current_column += 3;
loc += 2;
return Token::make (RIGHT_SHIFT_EQ, loc);
}
else
{
// right-shift
skip_input ();
current_column += 2;
loc += 1;
return Token::make (RIGHT_SHIFT, loc);
}
}
else if (peek_input () == '=')
{
// larger than or equal to
skip_input ();
current_column += 2;
loc += 1;
return Token::make (GREATER_OR_EQUAL, loc);
}
else
{
// larger than
current_column++;
return Token::make (RIGHT_ANGLE, loc);
}
case ':':
if (peek_input () == ':')
{
// scope resolution ::
skip_input ();
current_column += 2;
loc += 1;
return Token::make (SCOPE_RESOLUTION, loc);
}
else
{
// single colon :
current_column++;
return Token::make (COLON, loc);
}
case '!':
// no special handling for macros in lexer?
if (peek_input () == '=')
{
// not equal boolean operator
skip_input ();
current_column += 2;
loc += 1;
return Token::make (NOT_EQUAL, loc);
}
else
{
// not equal unary operator
current_column++;
return Token::make (EXCLAM, loc);
}
case '?':
current_column++;
return Token::make (QUESTION_MARK, loc);
case '#':
current_column++;
return Token::make (HASH, loc);
case '[':
current_column++;
return Token::make (LEFT_SQUARE, loc);
case ']':
current_column++;
return Token::make (RIGHT_SQUARE, loc);
case '{':
current_column++;
return Token::make (LEFT_CURLY, loc);
case '}':
current_column++;
return Token::make (RIGHT_CURLY, loc);
case '@':
current_column++;
return Token::make (PATTERN_BIND, loc);
case '$':
current_column++;
return Token::make (DOLLAR_SIGN, loc);
case '~':
current_column++;
return Token::make (TILDE, loc);
case '\\':
current_column++;
return Token::make (BACKSLASH, loc);
case '`':
current_column++;
return Token::make (BACKTICK, loc);
case '|':
if (peek_input () == '=')
{
// bitwise or-assign?
skip_input ();
current_column += 2;
loc += 1;
return Token::make (PIPE_EQ, loc);
}
else if (peek_input () == '|')
{
// logical or
skip_input ();
current_column += 2;
loc += 1;
return Token::make (OR, loc);
}
else
{
// bitwise or
current_column++;
return Token::make (PIPE, loc);
}
case '&':
if (peek_input () == '=')
{
// bitwise and-assign?
skip_input ();
current_column += 2;
loc += 1;
return Token::make (AMP_EQ, loc);
}
else if (peek_input () == '&')
{
// logical and
skip_input ();
current_column += 2;
loc += 1;
return Token::make (LOGICAL_AND, loc);
}
else
{
// bitwise and/reference
current_column++;
return Token::make (AMP, loc);
}
case '.':
if (peek_input () == '.')
{
if (peek_input (1) == '.')
{
// ellipsis
skip_input (1);
current_column += 3;
loc += 2;
return Token::make (ELLIPSIS, loc);
}
else if (peek_input (1) == '=')
{
// ..=
skip_input (1);
current_column += 3;
loc += 2;
return Token::make (DOT_DOT_EQ, loc);
}
else
{
// ..
skip_input ();
current_column += 2;
loc += 1;
return Token::make (DOT_DOT, loc);
}
}
else /*if (!ISDIGIT (peek_input ()))*/
{
// single dot .
// Only if followed by a non-number - otherwise is float
// nope, float cannot start with '.'.
current_column++;
return Token::make (DOT, loc);
}
}
// Not whitespace or punctuation: try the multi-character token kinds in
// turn. Helpers that return nullptr let control fall through to the next
// check and, eventually, to the error at the bottom of the loop.
// TODO: special handling of _ in the lexer? instead of being identifier
// byte character, byte string and raw byte string literals
if (current_char == 'b')
{
if (peek_input () == '\'')
return parse_byte_char (loc);
else if (peek_input () == '"')
return parse_byte_string (loc);
else if (peek_input () == 'r'
&& (peek_input (1) == '#' || peek_input (1) == '"'))
return parse_raw_byte_string (loc);
}
// raw identifiers and raw strings
if (current_char == 'r')
{
int peek = peek_input ();
int peek1 = peek_input (1);
if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
{
TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
if (raw_ident_ptr != nullptr)
return raw_ident_ptr;
else
continue; /* input got parsed, it just wasn't valid. An error
was produced. */
}
else
{
TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc);
if (maybe_raw_string_ptr != nullptr)
return maybe_raw_string_ptr;
}
}
// find identifiers and keywords
if (ISALPHA (current_char) || current_char == '_')
return parse_identifier_or_keyword (loc);
// int and float literals
if (ISDIGIT (current_char))
{ // _ not allowed as first char
if (current_char == '0'
&& is_non_decimal_int_literal_separator (peek_input ()))
{
// handle binary, octal, hex literals
TokenPtr non_dec_int_lit_ptr
= parse_non_decimal_int_literals (loc);
if (non_dec_int_lit_ptr != nullptr)
return non_dec_int_lit_ptr;
}
else
{
// handle decimals (integer or float)
TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc);
if (decimal_or_float_ptr != nullptr)
return decimal_or_float_ptr;
}
}
// string literals
if (current_char == '"')
return parse_string (loc);
// char literals and lifetime names
if (current_char == '\'')
{
TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc);
if (char_or_lifetime_ptr != nullptr)
return char_or_lifetime_ptr;
}
// DEBUG: check for specific character problems:
// (0x5d is the ASCII code of ']', so the third branch can never be taken)
if (current_char == '0')
rust_debug ("'0' uncaught before unexpected character");
else if (current_char == ']')
rust_debug ("']' uncaught before unexpected character");
else if (current_char == 0x5d)
rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before "
"unexpected character");
// didn't match anything so error
rust_error_at (loc, "unexpected character %<%x%>", current_char);
current_column++;
}
}
// Parses in a type suffix.
std::pair
Lexer::parse_in_type_suffix ()
{
std::string suffix;
suffix.reserve (5);
int additional_length_offset = 0;
// get suffix
while (ISALPHA (current_char) || ISDIGIT (current_char)
|| current_char == '_')
{
if (current_char == '_')
{
// don't add _ to suffix
skip_input ();
current_char = peek_input ();
additional_length_offset++;
continue;
}
additional_length_offset++;
suffix += current_char;
skip_input ();
current_char = peek_input ();
}
if (suffix.empty ())
{
// no type suffix: do nothing but also no error
return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
}
else if (suffix == "f32")
{
return std::make_pair (CORETYPE_F32, additional_length_offset);
}
else if (suffix == "f64")
{
return std::make_pair (CORETYPE_F64, additional_length_offset);
}
else if (suffix == "i8")
{
return std::make_pair (CORETYPE_I8, additional_length_offset);
}
else if (suffix == "i16")
{
return std::make_pair (CORETYPE_I16, additional_length_offset);
}
else if (suffix == "i32")
{
return std::make_pair (CORETYPE_I32, additional_length_offset);
}
else if (suffix == "i64")
{
return std::make_pair (CORETYPE_I64, additional_length_offset);
}
else if (suffix == "i128")
{
return std::make_pair (CORETYPE_I128, additional_length_offset);
}
else if (suffix == "isize")
{
return std::make_pair (CORETYPE_ISIZE, additional_length_offset);
}
else if (suffix == "u8")
{
return std::make_pair (CORETYPE_U8, additional_length_offset);
}
else if (suffix == "u16")
{
return std::make_pair (CORETYPE_U16, additional_length_offset);
}
else if (suffix == "u32")
{
return std::make_pair (CORETYPE_U32, additional_length_offset);
}
else if (suffix == "u64")
{
return std::make_pair (CORETYPE_U64, additional_length_offset);
}
else if (suffix == "u128")
{
return std::make_pair (CORETYPE_U128, additional_length_offset);
}
else if (suffix == "usize")
{
return std::make_pair (CORETYPE_USIZE, additional_length_offset);
}
else
{
rust_error_at (get_current_location (), "unknown number suffix %qs",
suffix.c_str ());
return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
}
}
// Parses in the exponent part (if any) of a float literal.
std::pair
Lexer::parse_in_exponent_part ()
{
int additional_length_offset = 0;
std::string str;
if (current_char == 'E' || current_char == 'e')
{
// add exponent to string as strtod works with it
str += current_char;
skip_input ();
current_char = peek_input ();
additional_length_offset++;
// special - and + handling
if (current_char == '-')
{
str += '-';
skip_input ();
current_char = peek_input ();
additional_length_offset++;
}
else if (current_char == '+')
{
// don't add + but still skip input
skip_input ();
current_char = peek_input ();
additional_length_offset++;
}
// parse another decimal number for exponent
auto str_length = parse_in_decimal ();
str += std::get<0> (str_length);
additional_length_offset += std::get<1> (str_length);
}
return std::make_pair (str, additional_length_offset);
}
// Parses a decimal integer.
std::tuple
Lexer::parse_in_decimal ()
{
/* A pure decimal contains only digits. */
bool pure_decimal = true;
int additional_length_offset = 0;
std::string str;
while (ISDIGIT (current_char) || current_char == '_')
{
if (current_char == '_')
{
pure_decimal = false;
// don't add _ to number
skip_input ();
current_char = peek_input ();
additional_length_offset++;
continue;
}
additional_length_offset++;
str += current_char;
skip_input ();
current_char = peek_input ();
}
return std::make_tuple (str, additional_length_offset, pure_decimal);
}
/* Parses escapes (and string continues) in "byte" strings and characters. Does
* not support unicode. */
std::tuple
Lexer::parse_escape (char opening_char)
{
int additional_length_offset = 0;
char output_char = 0;
// skip to actual letter
skip_input ();
current_char = peek_input ();
additional_length_offset++;
switch (current_char)
{
case 'x': {
auto hex_escape_pair = parse_partial_hex_escape ();
long hexLong = hex_escape_pair.first;
additional_length_offset += hex_escape_pair.second;
if (hexLong > 255 || hexLong < 0)
rust_error_at (
get_current_location (),
"byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>",
static_cast (hexLong));
/* TODO: restore capital for escape output - gcc pretty-printer doesn't
* support %X directly */
char hexChar = static_cast (hexLong);
output_char = hexChar;
}
break;
case 'n':
output_char = '\n';
break;
case 'r':
output_char = '\r';
break;
case 't':
output_char = '\t';
break;
case '\\':
output_char = '\\';
break;
case '0':
output_char = '\0';
break;
case '\'':
output_char = '\'';
break;
case '"':
output_char = '"';
break;
case 'u':
rust_error_at (get_current_location (),
"cannot have a unicode escape \\u in a byte %s",
opening_char == '\'' ? "character" : "string");
// Try to parse it anyway, just to skip it
parse_partial_unicode_escape ();
return std::make_tuple (output_char, additional_length_offset, false);
case '\r':
case '\n':
// string continue
return std::make_tuple (0, parse_partial_string_continue (), true);
default:
rust_error_at (get_current_location (),
"unknown escape sequence %<\\%c%>", current_char);
// returns false if no parsing could be done
// return false;
return std::make_tuple (output_char, additional_length_offset, false);
break;
}
// all non-special cases (string continue) should skip their used char
skip_input ();
current_char = peek_input ();
additional_length_offset++;
// returns true if parsing was successful
// return true;
return std::make_tuple (output_char, additional_length_offset, false);
}
/* Parses an escape (or string continue) in a string or character. Supports
* unicode escapes. */
std::tuple
Lexer::parse_utf8_escape ()
{
Codepoint output_char;
int additional_length_offset = 0;
// skip to actual letter
skip_input ();
current_char = peek_input ();
additional_length_offset++;
switch (current_char)
{
case 'x': {
auto hex_escape_pair = parse_partial_hex_escape ();
long hexLong = hex_escape_pair.first;
additional_length_offset += hex_escape_pair.second;
if (hexLong > 127 || hexLong < 0)
rust_error_at (
get_current_location (),
"ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>",
static_cast (hexLong));
/* TODO: restore capital for escape output - gcc pretty-printer doesn't
* support %X directly */
char hexChar = static_cast (hexLong);
output_char = hexChar;
}
break;
case 'n':
output_char = '\n';
break;
case 'r':
output_char = '\r';
break;
case 't':
output_char = '\t';
break;
case '\\':
output_char = '\\';
break;
case '0':
output_char = '\0';
break;
case '\'':
output_char = '\'';
break;
case '"':
output_char = '"';
break;
case 'u': {
auto unicode_escape_pair = parse_partial_unicode_escape ();
output_char = unicode_escape_pair.first;
additional_length_offset += unicode_escape_pair.second;
return std::make_tuple (output_char, additional_length_offset, false);
}
break;
case '\r':
case '\n':
// string continue
return std::make_tuple (0, parse_partial_string_continue (), true);
default:
rust_error_at (get_current_location (),
"unknown escape sequence %<\\%c%>", current_char);
// returns false if no parsing could be done
// return false;
return std::make_tuple (output_char, additional_length_offset, false);
break;
}
/* all non-special cases (unicode, string continue) should skip their used
* char */
skip_input ();
current_char = peek_input ();
additional_length_offset++;
// returns true if parsing was successful
// return true;
return std::make_tuple (output_char, additional_length_offset, false);
}
/* Parses the body of a string continue that has been found in an escape:
 * consumes all following whitespace, updating the line bookkeeping for every
 * newline seen. Returns a length that starts at 1, grows by one for each
 * non-newline whitespace character, and is reset to 1 whenever a newline is
 * consumed. */
int
Lexer::parse_partial_string_continue ()
{
  int length_on_line = 1;

  for (; is_whitespace (current_char); current_char = peek_input ())
    {
      const bool at_newline = current_char == '\n';
      if (at_newline)
	{
	  current_line++;
	  current_column = 1;
	  // tell line_table that new line starts
	  start_line (current_line, max_column_hint);

	  // reset "length"
	  length_on_line = 1;
	}
      else
	{
	  length_on_line++;
	}

      // consume the whitespace character; the loop step loads the next one
      skip_input ();
    }

  return length_on_line;
}
/* Parses the body of a '\x' escape. Note that it does not check that the number
* is valid and smaller than 255. */
std::pair
Lexer::parse_partial_hex_escape ()
{
// hex char string (null-terminated)
char hexNum[3] = {0, 0, 0};
// first hex char
current_char = peek_input (1);
int additional_length_offset = 1;
if (!is_x_digit (current_char))
{
rust_error_at (get_current_location (),
"invalid character %<\\x%c%> in \\x sequence",
current_char);
return std::make_pair (0, 0);
}
hexNum[0] = current_char;
// second hex char
skip_input ();
current_char = peek_input (1);
additional_length_offset++;
if (!is_x_digit (current_char))
{
rust_error_at (get_current_location (),
"invalid character %<\\x%c%c%> in \\x sequence", hexNum[0],
current_char);
return std::make_pair (0, 1);
}
skip_input ();
hexNum[1] = current_char;
long hexLong = std::strtol (hexNum, nullptr, 16);
return std::make_pair (hexLong, additional_length_offset);
}
// Parses the body of a unicode escape, i.e. the '{' hex-digits '}' part.
// Returns the decoded codepoint paired with the length offset accumulated
// while scanning. Every error path reports a diagnostic and returns
// Codepoint (0) together with the offset gathered so far.
std::pair
Lexer::parse_partial_unicode_escape ()
{
// step past the current character and look at what should be the '{'
skip_input ();
current_char = peek_input ();
int additional_length_offset = 0;
if (current_char != '{')
{
rust_error_at (get_current_location (),
"unicode escape should start with %<{%>");
/* Skip what should probably have been between brackets. */
while (is_x_digit (current_char) || current_char == '_')
{
skip_input ();
current_char = peek_input ();
additional_length_offset++;
}
return std::make_pair (Codepoint (0), additional_length_offset);
}
// consume the '{'
skip_input ();
current_char = peek_input ();
additional_length_offset++;
if (current_char == '_')
{
rust_error_at (get_current_location (),
"unicode escape cannot start with %<_%>");
skip_input ();
current_char = peek_input ();
additional_length_offset++;
// fallthrough and try to parse the rest anyway
}
// parse unicode escape - 1-6 hex digits
std::string num_str;
num_str.reserve (6);
// loop through to add entire hex number to string
while (is_x_digit (current_char) || current_char == '_')
{
if (current_char == '_')
{
// don't add _ to number
skip_input ();
current_char = peek_input ();
additional_length_offset++;
continue;
}
additional_length_offset++;
// add raw hex numbers
num_str += current_char;
skip_input ();
current_char = peek_input ();
}
if (current_char == '}')
{
// well-formed terminator: consume it
skip_input ();
current_char = peek_input ();
additional_length_offset++;
}
else
{
// Actually an error, but allow propagation anyway. Assume that a wrong
// bracket, whitespace or single/double quotes are a wrong termination;
// otherwise it is a wrong character, in which case skip to the actual
// terminator.
if (current_char == '{' || is_whitespace (current_char)
|| current_char == '\'' || current_char == '"')
{
// the offending character is left unconsumed here
rust_error_at (get_current_location (),
"expected terminating %<}%> in unicode escape");
return std::make_pair (Codepoint (0), additional_length_offset);
}
else
{
rust_error_at (get_current_location (),
"invalid character %<%c%> in unicode escape",
current_char);
// skip forward until something that looks like a terminator
while (current_char != '}' && current_char != '{'
&& !is_whitespace (current_char) && current_char != '\''
&& current_char != '"')
{
skip_input ();
current_char = peek_input ();
additional_length_offset++;
}
// Consume the actual closing bracket if found
if (current_char == '}')
{
skip_input ();
current_char = peek_input ();
additional_length_offset++;
}
return std::make_pair (Codepoint (0), additional_length_offset);
}
}
// ensure 1-6 hex characters
if (num_str.length () > 6 || num_str.length () < 1)
{
rust_error_at (get_current_location (),
"unicode escape should be between 1 and 6 hex "
"characters; it is %lu",
(unsigned long) num_str.length ());
// return false;
return std::make_pair (Codepoint (0), additional_length_offset);
}
unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16);
// reject the surrogate range D800..DFFF
if (hex_num > 0xd7ff && hex_num < 0xe000)
{
rust_error_at (
get_current_location (),
"unicode escape cannot be a surrogate value (D800 to DFFF)");
return std::make_pair (Codepoint (0), additional_length_offset);
}
// reject values above the last valid codepoint
if (hex_num > 0x10ffff)
{
rust_error_at (get_current_location (),
"unicode escape cannot be larger than 10FFFF");
return std::make_pair (Codepoint (0), additional_length_offset);
}
// return true;
return std::make_pair (Codepoint (static_cast (hex_num)),
additional_length_offset);
}
// Parses a byte character literal and returns the corresponding token.
// Handles both a plain character and an escape between the quotes; a missing
// character or a missing closing quote is reported but a token is still made.
TokenPtr
Lexer::parse_byte_char (Location loc)
{
  // step over one character and look at what follows
  skip_input ();
  current_column++;
  current_char = peek_input ();

  int consumed = 1;
  char value = 0;

  if (current_char == '\'')
    {
      // nothing between the quotes; no further input is consumed
      rust_error_at (get_current_location (),
                     "no character inside %<%> for %");
    }
  else
    {
      if (current_char == '\\')
        {
          // escaped byte: value and consumed length come from parse_escape
          auto escape = parse_escape ('\'');
          value = std::get<0> (escape);
          consumed += std::get<1> (escape);
          current_char = peek_input ();
        }
      else
        {
          // plain byte: taken directly from the input
          value = current_char;
          skip_input ();
          current_char = peek_input ();
          consumed++;
        }

      if (current_char != '\'')
        {
          rust_error_at (get_current_location (), "unclosed %");
        }

      // move past the (expected) closing quote
      skip_input ();
      current_char = peek_input ();
      consumed++;
    }

  current_column += consumed;
  loc += consumed - 1;

  return Token::make_byte_char (loc, value);
}
// Parses a byte string literal up to its closing '"' and returns the token.
// Reaching EOF first is reported and yields an END_OF_FILE token instead.
TokenPtr
Lexer::parse_byte_string (Location loc)
{
  // step over the quote character
  skip_input ();
  current_column++;

  std::string contents;
  contents.reserve (16); // some sensible default

  int length = 1;
  current_char = peek_input ();

  while (!(current_char == '"' || current_char == EOF))
    {
      if (current_char != '\\')
        {
          // ordinary character: store it and advance
          length++;
          contents += current_char;
          skip_input ();
          current_char = peek_input ();
          continue;
        }

      auto escape = parse_escape ('"');
      char escaped = std::get<0> (escape);

      // a zero value with the flag set produces no output character and
      // restarts the length from the escape's own offset
      const bool produces_nothing = escaped == 0 && std::get<2> (escape);
      if (produces_nothing)
        {
          length = std::get<1> (escape) - 1;
        }
      else
        {
          length += std::get<1> (escape);
          contents += escaped;
        }
    }

  current_column += length;

  if (current_char == EOF)
    {
      rust_error_at (get_current_location (), "unended byte string literal");
      return Token::make (END_OF_FILE, get_current_location ());
    }

  // the loop can only stop on '"' or EOF
  if (current_char != '"')
    gcc_unreachable ();

  // consume the closing quote
  current_column++;
  skip_input ();
  current_char = peek_input ();

  contents.shrink_to_fit ();
  loc += contents.size () - 1;

  return Token::make_byte_string (loc, std::move (contents));
}
// Parses a raw byte string and returns the byte string token.
// The literal ends at a '"' followed by as many '#' characters as preceded
// the opening quote.  Characters above 127 are reported and stored as 0.
// If the input ends before the terminator is found, an error is reported and
// an END_OF_FILE token is returned (mirroring parse_byte_string) instead of
// scanning past the end of the input forever.
TokenPtr
Lexer::parse_raw_byte_string (Location loc)
{
  // raw byte string literals
  std::string str;
  str.reserve (16); // some sensible default

  int length = 1;
  int hash_count = 0;

  // get hash count at beginning
  skip_input ();
  current_char = peek_input ();
  length++;
  while (current_char == '#')
    {
      hash_count++;
      length++;

      skip_input ();
      current_char = peek_input ();
    }

  if (current_char != '"')
    {
      rust_error_at (get_current_location (),
                     "raw byte string has no opening %<\"%>");
    }

  skip_input ();
  current_char = peek_input ();
  length++;

  while (true)
    {
      // unterminated literal: stop instead of looping on EOF
      if (current_char == EOF)
        {
          current_column += length;
          rust_error_at (get_current_location (),
                         "unended raw byte string literal");
          return Token::make (END_OF_FILE, get_current_location ());
        }

      if (current_char == '"')
        {
          // a quote only terminates when followed by hash_count hashes
          bool enough_hashes = true;

          for (int i = 0; i < hash_count; i++)
            {
              if (peek_input (i + 1) != '#')
                {
                  enough_hashes = false;
                  break;
                }
            }

          if (enough_hashes)
            {
              // skip enough input and peek enough input
              skip_input (hash_count);
              current_char = peek_input ();
              length += hash_count + 1;
              break;
            }
        }

      if ((unsigned char) current_char > 127)
        {
          rust_error_at (get_current_location (),
                         "character %<%c%> in raw byte string out of range",
                         current_char);
          current_char = 0;
        }

      length++;

      str += current_char;
      skip_input ();
      current_char = peek_input ();
    }

  current_column += length;

  loc += length - 1;

  str.shrink_to_fit ();

  return Token::make_byte_string (loc, std::move (str));
}
// Parses a raw identifier and returns an identifier token.
// Returns nullptr (after reporting an error) for the names that may not be
// used as raw identifiers.
TokenPtr
Lexer::parse_raw_identifier (Location loc)
{
  std::string name;
  name.reserve (16); // default

  skip_input ();
  current_char = peek_input ();
  current_column += 2;

  const bool starts_with_underscore = (current_char == '_');

  // collect every letter, digit and underscore of the name
  int length = 0;
  for (current_char = peek_input ();
       ISALPHA (current_char) || ISDIGIT (current_char) || current_char == '_';
       current_char = peek_input ())
    {
      length++;
      name += current_char;
      skip_input ();
    }

  current_column += length;

  // a lone underscore is not an identifier (reported, but lexing goes on)
  if (starts_with_underscore && length == 1)
    rust_error_at (get_current_location (),
                   "%<_%> is not a valid raw identifier");

  const bool forbidden = name == "crate" || name == "extern" || name == "self"
                         || name == "super" || name == "Self";
  if (forbidden)
    {
      rust_error_at (get_current_location (),
                     "%qs is a forbidden raw identifier", name.c_str ());
      return nullptr;
    }

  name.shrink_to_fit ();
  loc += length - 1;

  return Token::make_identifier (loc, std::move (name));
}
// Skips over broken string input (unterminated strings): advances until a '"'
// or EOF is seen, keeping line/column counters up to date, and consumes the
// '"' if one was found.  Note that the parameter shadows the member of the
// same name, so only the local copy is updated here.
void
Lexer::skip_broken_string_input (int current_char)
{
  for (; current_char != '"' && current_char != EOF;
       current_char = peek_input ())
    {
      if (current_char != '\n')
        {
          current_column++;
        }
      else
        {
          current_line++;
          current_column = 1;
        }

      skip_input ();
    }

  // step over the quote that ended the scan, if any
  if (current_char == '"')
    {
      current_column++;
      skip_input ();
      current_char = peek_input ();
    }

  rust_debug ("skipped to %d:%d due to bad quotes", current_line,
              current_column);
}
// Parses a unicode string literal up to its closing '"' and returns the
// string token.  Reaching EOF first is reported and yields an END_OF_FILE
// token instead.
TokenPtr
Lexer::parse_string (Location loc)
{
  std::string contents;
  contents.reserve (16); // some sensible default

  int length = 1;
  Codepoint codepoint = peek_codepoint_input ();

  // FIXME: This fails if the input ends. How do we check for EOF?
  while (!(codepoint.value == '"' || codepoint.is_eof ()))
    {
      if (codepoint.value != '\\')
        {
          // ordinary codepoint: store it and advance by its byte length
          length += get_input_codepoint_length ();
          contents += codepoint;
          skip_codepoint_input ();
          codepoint = peek_codepoint_input ();
          continue;
        }

      // escape sequence
      auto escape = parse_utf8_escape ();
      codepoint = std::get<0> (escape);

      // a zero codepoint with the flag set produces no output and restarts
      // the length from the escape's own offset
      const bool produces_nothing
        = codepoint == Codepoint (0) && std::get<2> (escape);
      if (produces_nothing)
        {
          length = std::get<1> (escape) - 1;
        }
      else
        {
          length += std::get<1> (escape);
          contents += codepoint;
        }

      // required as parsing utf8 escape only changes current_char
      codepoint = peek_codepoint_input ();
    }

  current_column += length;

  if (codepoint.value == '"')
    {
      // consume the closing quote
      current_column++;
      skip_input ();
      current_char = peek_input ();
    }
  else if (codepoint.is_eof ())
    {
      rust_error_at (get_current_location (), "unended string literal");
      return Token::make (END_OF_FILE, get_current_location ());
    }
  else
    {
      gcc_unreachable ();
    }

  contents.shrink_to_fit ();
  loc += length - 1;

  return Token::make_string (loc, std::move (contents));
}
// Parses an identifier or keyword starting at current_char.  A lone '_'
// yields an UNDERSCORE token; otherwise the text is classified and either a
// keyword token or an identifier token is returned.
TokenPtr
Lexer::parse_identifier_or_keyword (Location loc)
{
  std::string text;
  text.reserve (16); // default
  text += current_char;

  const bool starts_with_underscore = (current_char == '_');

  // gather the remaining letters, digits and underscores
  int length = 1;
  for (current_char = peek_input ();
       ISALPHA (current_char) || ISDIGIT (current_char) || current_char == '_';
       current_char = peek_input ())
    {
      length++;
      text += current_char;
      skip_input ();
    }

  current_column += length;

  // just a single underscore is not an identifier
  if (starts_with_underscore && length == 1)
    return Token::make (UNDERSCORE, loc);

  text.shrink_to_fit ();
  loc += length - 1;

  const TokenId kind = classify_keyword (text);
  if (kind != IDENTIFIER)
    return Token::make (kind, loc);

  return Token::make_identifier (loc, std::move (text));
}
// Looks ahead over any run of '#' characters; if a '"' follows, hands over to
// parse_raw_string with the number of hashes seen, otherwise returns null.
TokenPtr
Lexer::maybe_parse_raw_string (Location loc)
{
  int hashes = 0;
  for (; peek_input (hashes) == '#'; hashes++)
    ;

  if (peek_input (hashes) != '"')
    return nullptr;

  return parse_raw_string (loc, hashes);
}
// Parses a raw string literal and returns the string token.
// initial_hash_count is the number of '#' characters found before the opening
// quote; the literal ends at a '"' followed by that many '#' characters.
// If the input ends before such a terminator is found, an error is reported
// and an END_OF_FILE token is returned (mirroring parse_string) rather than
// silently producing a string token from an unterminated literal.
TokenPtr
Lexer::parse_raw_string (Location loc, int initial_hash_count)
{
  // raw string literals
  std::string str;
  str.reserve (16); // some sensible default

  int length = 1 + initial_hash_count;

  if (initial_hash_count > 0)
    skip_input (initial_hash_count - 1);

  current_char = peek_input ();

  if (current_char != '"')
    rust_error_at (get_current_location (), "raw string has no opening %<\"%>");

  length++;
  skip_input ();
  Codepoint current_char32 = peek_codepoint_input ();

  // set once the closing quote plus hashes have been consumed
  bool terminated = false;

  while (!current_char32.is_eof ())
    {
      if (current_char32.value == '"')
        {
          // a quote only terminates when followed by enough hashes
          bool enough_hashes = true;

          for (int i = 0; i < initial_hash_count; i++)
            {
              if (peek_input (i + 1) != '#')
                {
                  enough_hashes = false;
                  break;
                }
            }

          if (enough_hashes)
            {
              // skip enough input and peek enough input
              skip_input (initial_hash_count);
              current_char = peek_input ();
              length += initial_hash_count + 1;
              terminated = true;
              break;
            }
        }

      length++;

      str += current_char32;
      skip_codepoint_input ();
      current_char32 = peek_codepoint_input ();
    }

  current_column += length;

  if (!terminated)
    {
      // ran into EOF without seeing the terminator
      rust_error_at (get_current_location (), "unended raw string literal");
      return Token::make (END_OF_FILE, get_current_location ());
    }

  loc += length - 1;

  str.shrink_to_fit ();

  return Token::make_string (loc, std::move (str));
}
template
// Parses the digits of a non-decimal integer literal plus an optional type
// suffix and returns the int token, or nullptr if a float suffix was given.
//   loc            - start location of the literal
//   is_digit_func  - predicate accepting the digits valid for this base
//   existent_str   - text gathered so far; digits are appended to it
//   base           - numeric base handed to the string-to-number conversion
// The value is converted with an unsigned 64-bit-capable routine so that
// literals above the range of `long` are not saturated to LONG_MAX.
TokenPtr
Lexer::parse_non_decimal_int_literal (Location loc, IsDigitFunc is_digit_func,
                                      std::string existent_str, int base)
{
  int length = 1;

  skip_input ();
  current_char = peek_input ();

  length++;

  // loop through to add entire number to string
  while (is_digit_func (current_char) || current_char == '_')
    {
      // '_' separators are consumed and counted but not added to the number
      if (current_char != '_')
        existent_str += current_char;

      length++;

      skip_input ();
      current_char = peek_input ();
    }

  // convert value to decimal representation
  const unsigned long long dec_num
    = std::strtoull (existent_str.c_str (), nullptr, base);

  existent_str = std::to_string (dec_num);

  // parse in type suffix if it exists
  auto type_suffix_pair = parse_in_type_suffix ();
  PrimitiveCoreType type_hint = type_suffix_pair.first;
  length += type_suffix_pair.second;

  current_column += length;

  if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64)
    {
      rust_error_at (get_current_location (),
                     "invalid type suffix %qs for integer (%s) literal",
                     get_type_hint_string (type_hint),
                     base == 16
                       ? "hex"
                       : (base == 8 ? "octal"
                                    : (base == 2 ? "binary"
                                                 : "")));
      return nullptr;
    }

  loc += length - 1;

  return Token::make_int (loc, std::move (existent_str), type_hint);
}
// Dispatches on the base marker following the first character of the literal
// ('x' hex, 'o' octal, 'b' binary) and parses the corresponding integer
// literal.  Returns nullptr when no such marker follows.
TokenPtr
Lexer::parse_non_decimal_int_literals (Location loc)
{
  std::string prefix;
  prefix.reserve (16); // some sensible default
  prefix += current_char;

  current_char = peek_input ();

  switch (current_char)
    {
    case 'x':
      // hex (integer only); the 'x' is kept in the text
      return parse_non_decimal_int_literal (loc, is_x_digit, prefix + "x", 16);

    case 'o':
      // octal (integer only)
      return parse_non_decimal_int_literal (loc, is_octal_digit,
                                            std::move (prefix), 8);

    case 'b':
      // binary (integer only)
      return parse_non_decimal_int_literal (loc, is_bin_digit,
                                            std::move (prefix), 2);

    default:
      return nullptr;
    }
}
// Parses a decimal-based int literal or float literal.
// Starting from current_char (already part of the literal), reads the leading
// decimal digits and then picks one of four shapes:
//   - digits '.' digits [exponent] [suffix]   -> float token
//   - digits '.' (valid trailing dot)         -> float token, "0" appended
//   - digits exponent [suffix]                -> float token
//   - digits [suffix]                         -> int token
// In every case current_column and loc are advanced by the literal's length.
TokenPtr
Lexer::parse_decimal_int_or_float (Location loc)
{
std::string str;
str.reserve (16); // some sensible default
str += current_char;
int length = 1;
// remembered for the pure-decimal check in the integer branch
bool first_zero = current_char == '0';
current_char = peek_input ();
// parse initial decimal integer (or first integer part of float) literal
auto initial_decimal = parse_in_decimal ();
str += std::get<0> (initial_decimal);
length += std::get<1> (initial_decimal);
// detect float literal
if (current_char == '.' && is_float_digit (peek_input (1)))
{
// float with a '.', parse another decimal into it
// add . to str
str += current_char;
skip_input ();
current_char = peek_input ();
length++;
// parse another decimal number for float
auto second_decimal = parse_in_decimal ();
str += std::get<0> (second_decimal);
length += std::get<1> (second_decimal);
// parse in exponent part if it exists
auto exponent_pair = parse_in_exponent_part ();
str += exponent_pair.first;
length += exponent_pair.second;
// parse in type suffix if it exists
auto type_suffix_pair = parse_in_type_suffix ();
PrimitiveCoreType type_hint = type_suffix_pair.first;
length += type_suffix_pair.second;
// only f32/f64 (or no suffix at all) are accepted on a float
if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
&& type_hint != CORETYPE_UNKNOWN)
{
rust_error_at (get_current_location (),
"invalid type suffix %qs for floating-point literal",
get_type_hint_string (type_hint));
// ignore invalid type suffix as everything else seems fine
type_hint = CORETYPE_UNKNOWN;
}
current_column += length;
loc += length - 1;
str.shrink_to_fit ();
return Token::make_float (loc, std::move (str), type_hint);
}
else if (current_char == '.' && check_valid_float_dot_end (peek_input (1)))
{
// float that is just an integer with a terminating '.' character
// add . to str
str += current_char;
skip_input ();
current_char = peek_input ();
length++;
// add a '0' after the . to prevent ambiguity
// (the added '0' is not counted in length as it is not in the input)
str += '0';
// type hint not allowed
current_column += length;
loc += length - 1;
str.shrink_to_fit ();
return Token::make_float (loc, std::move (str), CORETYPE_UNKNOWN);
}
else if (current_char == 'E' || current_char == 'e')
{
// exponent float with no '.' character
// parse exponent part
auto exponent_pair = parse_in_exponent_part ();
str += exponent_pair.first;
length += exponent_pair.second;
// parse in type suffix if it exists
auto type_suffix_pair = parse_in_type_suffix ();
PrimitiveCoreType type_hint = type_suffix_pair.first;
length += type_suffix_pair.second;
// same suffix validation as for the float-with-dot case
if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
&& type_hint != CORETYPE_UNKNOWN)
{
rust_error_at (get_current_location (),
"invalid type suffix %qs for floating-point literal",
get_type_hint_string (type_hint));
// ignore invalid type suffix as everything else seems fine
type_hint = CORETYPE_UNKNOWN;
}
current_column += length;
loc += length - 1;
str.shrink_to_fit ();
return Token::make_float (loc, std::move (str), type_hint);
}
else
{
// is an integer
// parse in type suffix if it exists
auto type_suffix_pair = parse_in_type_suffix ();
PrimitiveCoreType type_hint = type_suffix_pair.first;
/* A "real" pure decimal doesn't have a suffix and no zero prefix. */
if (type_hint == CORETYPE_UNKNOWN)
{
// third element of parse_in_decimal's result; presumably false when the
// digit run contained something other than plain digits -- TODO confirm
bool pure_decimal = std::get<2> (initial_decimal);
// a leading zero is only acceptable when the literal is that single char
if (pure_decimal && (!first_zero || str.size () == 1))
type_hint = CORETYPE_PURE_DECIMAL;
}
length += type_suffix_pair.second;
current_column += length;
loc += length - 1;
str.shrink_to_fit ();
return Token::make_int (loc, std::move (str), type_hint);
}
}
// Parses what follows a single quote: either a character literal (escaped or
// plain) or a lifetime name. Returns the matching token, or nullptr when the
// input is at EOF or is neither a character literal nor a lifetime.
TokenPtr
Lexer::parse_char_or_lifetime (Location loc)
{
Codepoint current_char32;
int length = 1;
current_char32 = peek_codepoint_input ();
if (current_char32.is_eof ())
return nullptr;
// parse escaped char literal
if (current_char32.value == '\\')
{
// parse escape
auto utf8_escape_pair = parse_utf8_escape ();
current_char32 = std::get<0> (utf8_escape_pair);
length += std::get<1> (utf8_escape_pair);
if (peek_codepoint_input ().value != '\'')
{
// reported, but a char token is still produced below
rust_error_at (get_current_location (), "unended character literal");
}
else
{
// consume the closing quote
skip_codepoint_input ();
current_char = peek_input ();
length++;
}
current_column += length;
loc += length - 1;
return Token::make_char (loc, current_char32);
}
else
{
// consume the first codepoint, then decide based on what follows it
skip_codepoint_input ();
if (peek_codepoint_input ().value == '\'')
{
// parse non-escaped char literal
// skip the ' character
skip_input ();
current_char = peek_input ();
// TODO fix due to different widths of utf-8 chars?
// (a fixed width of 3 is used here regardless of the codepoint's size)
current_column += 3;
loc += 2;
return Token::make_char (loc, current_char32);
}
else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value)
|| current_char32.value == '_')
{
// parse lifetime name
std::string str;
str += current_char32;
length++;
// the rest of the name is read byte-wise
current_char = peek_input ();
while (ISDIGIT (current_char) || ISALPHA (current_char)
|| current_char == '_')
{
str += current_char;
skip_input ();
current_char = peek_input ();
length++;
}
current_column += length;
loc += length - 1;
str.shrink_to_fit ();
return Token::make_lifetime (loc, std::move (str));
}
else
{
// neither a closing quote nor a lifetime name follows
rust_error_at (
get_current_location (),
"expected %' after character constant in character literal");
return nullptr;
}
}
}
// Returns the length in bytes (1-4) of the UTF-8 sequence at the current
// position, or 0 if the position holds EOF, a stray continuation byte, a
// sequence with a malformed continuation byte, or an invalid lead byte (the
// last case also reports an error).
int
Lexer::get_input_codepoint_length ()
{
  uint8_t lead = peek_input ();

  // NOTE(review): the EOF test happens after narrowing to 8 bits, so a raw
  // 0xFF byte is indistinguishable from EOF here.
  if ((int8_t) lead == EOF)
    return 0;

  // ascii -- 1 byte
  if (lead < 128)
    return 1;

  // invalid (continuation; can't be first char)
  if ((lead & 0xC0) == 0x80)
    return 0;

  // the lead byte announces the total sequence length
  int expected = 0;
  if ((lead & 0xE0) == 0xC0)
    expected = 2;
  else if ((lead & 0xF0) == 0xE0)
    expected = 3;
  else if ((lead & 0xF8) == 0xF0)
    expected = 4;
  else
    {
      rust_error_at (get_current_location (),
                     "invalid UTF-8 [FIRST] (too long)");
      return 0;
    }

  // every following byte must be a continuation byte (10xxxxxx)
  for (int idx = 1; idx < expected; idx++)
    {
      uint8_t trail = peek_input (idx);
      if ((trail & 0xC0) != 0x80)
        return 0;
    }

  return expected;
}
// Decodes and returns the UTF-8 codepoint at the current position without
// consuming it.  Returns Codepoint::eof () at EOF and 0xFFFE for a stray
// continuation byte, a malformed continuation byte, or an invalid lead byte
// (the last case also reports an error).
Codepoint
Lexer::peek_codepoint_input ()
{
  uint8_t lead = peek_input ();

  // NOTE(review): the EOF test happens after narrowing to 8 bits, so a raw
  // 0xFF byte is indistinguishable from EOF here.
  if ((int8_t) lead == EOF)
    return Codepoint::eof ();

  // ascii -- 1 byte
  if (lead < 128)
    return {lead};

  // invalid (continuation; can't be first char)
  if ((lead & 0xC0) == 0x80)
    return {0xFFFE};

  // the lead byte fixes the number of continuation bytes and contributes
  // the most significant payload bits
  int trailing = 0;
  uint32_t decoded = 0;
  if ((lead & 0xE0) == 0xC0)
    {
      trailing = 1;
      decoded = lead & 0x1F;
    }
  else if ((lead & 0xF0) == 0xE0)
    {
      trailing = 2;
      decoded = lead & 0x0F;
    }
  else if ((lead & 0xF8) == 0xF0)
    {
      trailing = 3;
      decoded = lead & 0x07;
    }
  else
    {
      rust_error_at (get_current_location (),
                     "invalid UTF-8 [SECND] (too long)");
      return {0xFFFE};
    }

  // each continuation byte supplies six more payload bits
  for (int idx = 1; idx <= trailing; idx++)
    {
      uint8_t trail = peek_input (idx);
      if ((trail & 0xC0) != 0x80)
        return {0xFFFE};

      decoded = (decoded << 6) | (trail & 0x3F);
    }

  return {decoded};
}
// Skips the whole codepoint at the current position.  Asserts that a valid
// codepoint length (at least one byte) was determined.
void
Lexer::skip_codepoint_input ()
{
  const int byte_count = get_input_codepoint_length ();
  gcc_assert (byte_count >= 1);

  // presumably skip_input (n) advances n + 1 bytes -- confirm against its
  // definition
  skip_input (byte_count - 1);
}
// Returns the length in bytes (1-4) of the UTF-8 sequence that starts
// n_start_offset bytes ahead of the current position, or 0 for a stray
// continuation byte, a malformed continuation byte, or an invalid lead byte
// (the last case also reports an error).
int
Lexer::test_get_input_codepoint_n_length (int n_start_offset)
{
  uint8_t lead = peek_input (n_start_offset);

  // ascii -- 1 byte
  if (lead < 128)
    return 1;

  // invalid (continuation; can't be first char)
  if ((lead & 0xC0) == 0x80)
    return 0;

  // the lead byte announces the total sequence length
  int expected = 0;
  if ((lead & 0xE0) == 0xC0)
    expected = 2;
  else if ((lead & 0xF0) == 0xE0)
    expected = 3;
  else if ((lead & 0xF8) == 0xF0)
    expected = 4;
  else
    {
      rust_error_at (get_current_location (),
                     "invalid UTF-8 [THIRD] (too long)");
      return 0;
    }

  // every following byte must be a continuation byte (10xxxxxx)
  for (int idx = 1; idx < expected; idx++)
    {
      uint8_t trail = peek_input (n_start_offset + idx);
      if ((trail & 0xC0) != 0x80)
        return 0;
    }

  return expected;
}
// Meant to peek the codepoint n codepoints ahead of the current one - try not
// to use.  NOT IMPLEMENTED: the function walks the byte offsets of the first
// n codepoints and then deliberately fails an assertion.
// TODO: implement if still needed.  Note that any implementation along these
// lines has (at least) O(n) lookup time, unlike the other peek functions.  A
// decoder equivalent to peek_codepoint_input, applied at the computed offset,
// would be the natural way to finish it.
Codepoint
Lexer::test_peek_codepoint_input (int n)
{
  // accumulate the byte lengths of the codepoints that would be skipped
  int byte_offset = 0;
  int remaining = n;
  while (remaining-- > 0)
    byte_offset += test_get_input_codepoint_n_length (byte_offset);

  // error out of function as it is not implemented
  gcc_assert (false);
  return {0};
}
// Replaces the current token with two tokens: new_left at the current token's
// location and new_right one position further on.
void
Lexer::split_current_token (TokenId new_left, TokenId new_right)
{
  /* TODO: assert that this TokenId is a "simple token" like punctuation and not
   * like "IDENTIFIER"? */
  Location origin = peek_token ()->get_locus ();

  TokenPtr left_half = Token::make (new_left, origin);
  TokenPtr right_half = Token::make (new_right, origin + 1);

  // the left half takes the current slot, the right half goes right after it
  token_queue.replace_current_value (std::move (left_half));
  token_queue.insert (1, std::move (right_half));
}
// Forwards the start of a new line to the line map, if one is set.
void
Lexer::start_line (int current_line, int current_column)
{
  if (!line_map)
    return;

  line_map->start_line (current_line, current_column);
}
} // namespace Rust