// Copyright (C) 2020-2023 Free Software Foundation, Inc. // This file is part of GCC. // GCC is free software; you can redistribute it and/or modify it under // the terms of the GNU General Public License as published by the Free // Software Foundation; either version 3, or (at your option) any later // version. // GCC is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. // You should have received a copy of the GNU General Public License // along with GCC; see the file COPYING3. If not see // . #include "rust-system.h" #include "rust-lex.h" #include "rust-diagnostics.h" #include "rust-linemap.h" #include "rust-session-manager.h" #include "safe-ctype.h" namespace Rust { // TODO: move to separate compilation unit? // overload += for uint32_t to allow 32-bit encoded utf-8 to be added std::string & operator+= (std::string &str, Codepoint char32) { if (char32.value < 0x80) { str += static_cast (char32.value); } else if (char32.value < (0x1F + 1) << (1 * 6)) { str += static_cast (0xC0 | ((char32.value >> 6) & 0x1F)); str += static_cast (0x80 | ((char32.value >> 0) & 0x3F)); } else if (char32.value < (0x0F + 1) << (2 * 6)) { str += static_cast (0xE0 | ((char32.value >> 12) & 0x0F)); str += static_cast (0x80 | ((char32.value >> 6) & 0x3F)); str += static_cast (0x80 | ((char32.value >> 0) & 0x3F)); } else if (char32.value < (0x07 + 1) << (3 * 6)) { str += static_cast (0xF0 | ((char32.value >> 18) & 0x07)); str += static_cast (0x80 | ((char32.value >> 12) & 0x3F)); str += static_cast (0x80 | ((char32.value >> 6) & 0x3F)); str += static_cast (0x80 | ((char32.value >> 0) & 0x3F)); } else { rust_debug ("Invalid unicode codepoint found: '%u' ", char32.value); } return str; } std::string Codepoint::as_string () { std::string str; // str += Codepoint (value); str += *this; return str; } /* Includes all 
allowable float digits EXCEPT _ and . as that needs lookahead * for handling. */ bool is_float_digit (char number) { return ISDIGIT (number) || number == 'E' || number == 'e'; } /* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or * whatever is different */ bool is_x_digit (char number) { return ISXDIGIT (number); } bool is_octal_digit (char number) { return number >= '0' && number <= '7'; } bool is_bin_digit (char number) { return number == '0' || number == '1'; } bool check_valid_float_dot_end (char character) { return character != '.' && character != '_' && !ISALPHA (character); } // ISSPACE from safe-ctype but may change in future bool is_whitespace (char character) { return ISSPACE (character); } bool is_non_decimal_int_literal_separator (char character) { return character == 'x' || character == 'o' || character == 'b'; } Lexer::Lexer (const std::string &input) : input (RAIIFile::create_error ()), current_line (1), current_column (1), line_map (nullptr), dump_lex_out (Optional::none ()), raw_input_source (new BufferInputSource (input, 0)), input_queue{*raw_input_source}, token_queue (TokenSource (this)) {} Lexer::Lexer (const char *filename, RAIIFile file_input, Linemap *linemap, Optional dump_lex_opt) : input (std::move (file_input)), current_line (1), current_column (1), line_map (linemap), dump_lex_out (dump_lex_opt), raw_input_source (new FileInputSource (input.get_raw ())), input_queue{*raw_input_source}, token_queue (TokenSource (this)) { // inform line_table that file is being entered and is in line 1 if (linemap) line_map->start_file (filename, current_line); } Lexer::~Lexer () { /* ok apparently stop (which is equivalent of original code in destructor) is * meant to be called after all files have finished parsing, for cleanup. 
On * the other hand, actual code that it calls to leave a certain line map is * mentioned in GCC docs as being useful for "just leaving an included header" * and stuff like that, so this line mapping functionality may need fixing. * FIXME: find out whether this occurs. */ // line_map->stop(); } /* TODO: need to optimise somehow to avoid the virtual function call in the * tight loop. Best idea at the moment is CRTP, but that might make lexer * implementation annoying when storing the "base class" (i.e. would need * template parameter everywhere), although in practice it would mostly just * look ugly and make enclosing classes like Parser also require a type * parameter. At this point a macro might be better. OK I guess macros can be * replaced by constexpr if or something if possible. */ Location Lexer::get_current_location () { if (line_map) return line_map->get_location (current_column); else // If we have no linemap, we're lexing something without proper locations return Location (); } int Lexer::peek_input (int n) { return input_queue.peek (n); } int Lexer::peek_input () { return peek_input (0); } void Lexer::skip_input (int n) { input_queue.skip (n); } void Lexer::skip_input () { skip_input (0); } void Lexer::skip_token (int n) { // dump tokens if dump-lex option is enabled if (dump_lex_out.is_some ()) dump_and_skip (n); else token_queue.skip (n); } void Lexer::dump_and_skip (int n) { std::ofstream &out = dump_lex_out.get (); bool found_eof = false; const_TokenPtr tok; for (int i = 0; i < n + 1; i++) { if (!found_eof) { tok = peek_token (); found_eof |= tok->get_id () == Rust::END_OF_FILE; Location loc = tok->get_locus (); out << "token_id_to_str (); out << (tok->has_str () ? 
(std::string (", text=") + tok->get_str () + std::string (", typehint=") + std::string (tok->get_type_hint_str ())) : "") << " "; out << get_line_map ()->to_string (loc) << " "; } token_queue.skip (0); } } void Lexer::replace_current_token (TokenPtr replacement) { token_queue.replace_current_value (replacement); rust_debug ("called 'replace_current_token' - this is deprecated"); } /* shitty anonymous namespace that can only be accessed inside the compilation * unit - used for classify_keyword binary search in sorted array of keywords * created with x-macros. */ namespace { // TODO: make constexpr when update to c++20 const std::string keyword_index[] = { #define RS_TOKEN(x, y) #define RS_TOKEN_KEYWORD(name, keyword) keyword, RS_TOKEN_LIST #undef RS_TOKEN_KEYWORD #undef RS_TOKEN }; constexpr TokenId keyword_keys[] = { #define RS_TOKEN(x, y) #define RS_TOKEN_KEYWORD(name, keyword) name, RS_TOKEN_LIST #undef RS_TOKEN_KEYWORD #undef RS_TOKEN }; constexpr int num_keywords = sizeof (keyword_index) / sizeof (*keyword_index); } // namespace /* Determines whether the string passed in is a keyword or not. If it is, it * returns the keyword name. */ TokenId Lexer::classify_keyword (const std::string &str) { const std::string *last = keyword_index + num_keywords; const std::string *idx = std::lower_bound (keyword_index, last, str); if (idx == last || str != *idx) return IDENTIFIER; // TODO: possibly replace this x-macro system with something like hash map? // We now have the expected token ID of the reserved keyword. However, some // keywords are reserved starting in certain editions. For example, `try` is // only a reserved keyword in editions >=2018. The language might gain new // reserved keywords in the future. 
// // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords auto id = keyword_keys[idx - keyword_index]; // `try` is not a reserved keyword before 2018 if (Session::get_instance ().options.get_edition () == CompileOptions::Edition::E2015 && id == TRY) return IDENTIFIER; return id; } TokenPtr Lexer::build_token () { // loop to go through multiple characters to build a single token while (true) { Location loc = get_current_location (); current_char = peek_input (); skip_input (); // detect UTF8 bom // // Must be the first thing on the first line. // There might be an optional BOM (Byte Order Mark), which for UTF-8 is // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped. if (current_line == 1 && current_column == 1 && current_char == 0xef && peek_input () == 0xbb && peek_input (1) == 0xbf) { skip_input (1); current_char = peek_input (); skip_input (); } // detect shebang // Must be the first thing on the first line, starting with #! // But since an attribute can also start with an #! we don't count it as a // shebang line when after any whitespace or comments there is a [. If it // is a shebang line we simple drop the line. Otherwise we don't consume // any characters and fall through to the real tokenizer. if (current_line == 1 && current_column == 1 && current_char == '#' && peek_input () == '!') { int n = 1; while (true) { int next_char = peek_input (n); if (is_whitespace (next_char)) n++; else if ((next_char == '/' && peek_input (n + 1) == '/' && peek_input (n + 2) != '!' 
&& peek_input (n + 2) != '/') || (next_char == '/' && peek_input (n + 1) == '/' && peek_input (n + 2) == '/' && peek_input (n + 3) == '/')) { // two // or four //// // A single line comment // (but not an inner or outer doc comment) n += 2; next_char = peek_input (n); while (next_char != '\n' && next_char != EOF) { n++; next_char = peek_input (n); } if (next_char == '\n') n++; } else if (next_char == '/' && peek_input (n + 1) == '*' && peek_input (n + 2) == '*' && peek_input (n + 3) == '/') { /**/ n += 4; } else if (next_char == '/' && peek_input (n + 1) == '*' && peek_input (n + 2) == '*' && peek_input (n + 3) == '*' && peek_input (n + 4) == '/') { /***/ n += 5; } else if ((next_char == '/' && peek_input (n + 1) == '*' && peek_input (n + 2) != '*' && peek_input (n + 2) != '!') || (next_char == '/' && peek_input (n + 1) == '*' && peek_input (n + 2) == '*' && peek_input (n + 3) == '*')) { // one /* or three /*** // Start of a block comment // (but not an inner or outer doc comment) n += 2; int level = 1; while (level > 0) { if (peek_input (n) == EOF) break; else if (peek_input (n) == '/' && peek_input (n + 1) == '*') { n += 2; level += 1; } else if (peek_input (n) == '*' && peek_input (n + 1) == '/') { n += 2; level -= 1; } else n++; } } else if (next_char != '[') { // definitely shebang, ignore the first line while (current_char != '\n' && current_char != EOF) { current_char = peek_input (); skip_input (); } // newline current_line++; current_column = 1; // tell line_table that new line starts start_line (current_line, max_column_hint); break; } else break; /* Definitely not a shebang line. 
*/ } } // return end of file token if end of file if (current_char == EOF) return Token::make (END_OF_FILE, loc); // if not end of file, start tokenising switch (current_char) { /* ignore whitespace characters for tokens but continue updating * location */ case '\n': // newline current_line++; current_column = 1; // tell line_table that new line starts start_line (current_line, max_column_hint); continue; case '\r': // cr // Ignore, we expect a newline (lf) soon. continue; case ' ': // space current_column++; continue; case '\t': // tab // width of a tab is not well-defined, assume 8 spaces current_column += 8; continue; // punctuation - actual tokens case '=': if (peek_input () == '>') { // match arm arrow skip_input (); current_column += 2; loc += 1; return Token::make (MATCH_ARROW, loc); } else if (peek_input () == '=') { // equality operator skip_input (); current_column += 2; loc += 1; return Token::make (EQUAL_EQUAL, loc); } else { // assignment operator current_column++; return Token::make (EQUAL, loc); } case '(': current_column++; return Token::make (LEFT_PAREN, loc); case '-': if (peek_input () == '>') { // return type specifier skip_input (); current_column += 2; loc += 1; return Token::make (RETURN_TYPE, loc); } else if (peek_input () == '=') { // minus-assign skip_input (); current_column += 2; loc += 1; return Token::make (MINUS_EQ, loc); } else { // minus current_column++; return Token::make (MINUS, loc); } case '+': if (peek_input () == '=') { // add-assign skip_input (); current_column += 2; loc += 1; return Token::make (PLUS_EQ, loc); } else { // add current_column++; return Token::make (PLUS, loc); } case ')': current_column++; return Token::make (RIGHT_PAREN, loc); case ';': current_column++; return Token::make (SEMICOLON, loc); case '*': if (peek_input () == '=') { // multiplication-assign skip_input (); current_column += 2; loc += 1; return Token::make (ASTERISK_EQ, loc); } else { // multiplication current_column++; return Token::make 
(ASTERISK, loc); } case ',': current_column++; return Token::make (COMMA, loc); case '/': if (peek_input () == '=') { // division-assign skip_input (); current_column += 2; loc += 1; return Token::make (DIV_EQ, loc); } else if ((peek_input () == '/' && peek_input (1) != '!' && peek_input (1) != '/') || (peek_input () == '/' && peek_input (1) == '/' && peek_input (2) == '/')) { // two // or four //// // single line comment // (but not an inner or outer doc comment) skip_input (); current_column += 2; current_char = peek_input (); // basically ignore until line finishes while (current_char != '\n' && current_char != EOF) { skip_input (); current_column++; // not used current_char = peek_input (); } continue; } else if (peek_input () == '/' && (peek_input (1) == '!' || peek_input (1) == '/')) { /* single line doc comment, inner or outer. */ bool is_inner = peek_input (1) == '!'; skip_input (1); current_column += 3; std::string str; str.reserve (32); current_char = peek_input (); while (current_char != '\n') { skip_input (); if (current_char == '\r') { char next_char = peek_input (); if (next_char == '\n') { current_char = '\n'; break; } rust_error_at ( loc, "Isolated CR %<\\r%> not allowed in doc comment"); current_char = next_char; continue; } if (current_char == EOF) { rust_error_at ( loc, "unexpected EOF while looking for end of comment"); break; } str += current_char; current_char = peek_input (); } skip_input (); current_line++; current_column = 1; // tell line_table that new line starts start_line (current_line, max_column_hint); str.shrink_to_fit (); loc += str.size () - 1; if (is_inner) return Token::make_inner_doc_comment (loc, std::move (str)); else return Token::make_outer_doc_comment (loc, std::move (str)); } else if (peek_input () == '*' && peek_input (1) == '*' && peek_input (2) == '/') { /**/ skip_input (2); current_column += 4; continue; } else if (peek_input () == '*' && peek_input (1) == '*' && peek_input (2) == '*' && peek_input (3) == '/') { /***/ 
skip_input (3); current_column += 5; continue; } else if ((peek_input () == '*' && peek_input (1) != '!' && peek_input (1) != '*') || (peek_input () == '*' && peek_input (1) == '*' && peek_input (2) == '*')) { // one /* or three /*** // block comment // (but not an inner or outer doc comment) skip_input (); current_column += 2; int level = 1; while (level > 0) { current_char = peek_input (); if (current_char == EOF) { rust_error_at ( loc, "unexpected EOF while looking for end of comment"); break; } // if /* found if (current_char == '/' && peek_input (1) == '*') { // skip /* characters skip_input (1); current_column += 2; level += 1; continue; } // ignore until */ is found if (current_char == '*' && peek_input (1) == '/') { // skip */ characters skip_input (1); current_column += 2; level -= 1; continue; } if (current_char == '\n') { skip_input (); current_line++; current_column = 1; // tell line_table that new line starts start_line (current_line, max_column_hint); continue; } skip_input (); current_column++; } // refresh new token continue; } else if (peek_input () == '*' && (peek_input (1) == '!' || peek_input (1) == '*')) { // block doc comment, inner /*! 
or outer /** bool is_inner = peek_input (1) == '!'; skip_input (1); current_column += 3; std::string str; str.reserve (96); int level = 1; while (level > 0) { current_char = peek_input (); if (current_char == EOF) { rust_error_at ( loc, "unexpected EOF while looking for end of comment"); break; } // if /* found if (current_char == '/' && peek_input (1) == '*') { // skip /* characters skip_input (1); current_column += 2; level += 1; str += "/*"; continue; } // ignore until */ is found if (current_char == '*' && peek_input (1) == '/') { // skip */ characters skip_input (1); current_column += 2; level -= 1; if (level > 0) str += "*/"; continue; } if (current_char == '\r' && peek_input (1) != '\n') rust_error_at ( loc, "Isolated CR %<\\r%> not allowed in doc comment"); if (current_char == '\n') { skip_input (); current_line++; current_column = 1; // tell line_table that new line starts start_line (current_line, max_column_hint); str += '\n'; continue; } str += current_char; skip_input (); current_column++; } str.shrink_to_fit (); loc += str.size () - 1; if (is_inner) return Token::make_inner_doc_comment (loc, std::move (str)); else return Token::make_outer_doc_comment (loc, std::move (str)); } else { // division current_column++; return Token::make (DIV, loc); } case '%': if (peek_input () == '=') { // modulo-assign skip_input (); current_column += 2; loc += 1; return Token::make (PERCENT_EQ, loc); } else { // modulo current_column++; return Token::make (PERCENT, loc); } case '^': if (peek_input () == '=') { // xor-assign? skip_input (); current_column += 2; loc += 1; return Token::make (CARET_EQ, loc); } else { // xor? 
current_column++; return Token::make (CARET, loc); } case '<': if (peek_input () == '<') { if (peek_input (1) == '=') { // left-shift assign skip_input (1); current_column += 3; loc += 2; return Token::make (LEFT_SHIFT_EQ, loc); } else { // left-shift skip_input (); current_column += 2; loc += 1; return Token::make (LEFT_SHIFT, loc); } } else if (peek_input () == '=') { // smaller than or equal to skip_input (); current_column += 2; loc += 1; return Token::make (LESS_OR_EQUAL, loc); } else { // smaller than current_column++; return Token::make (LEFT_ANGLE, loc); } break; case '>': if (peek_input () == '>') { if (peek_input (1) == '=') { // right-shift-assign skip_input (1); current_column += 3; loc += 2; return Token::make (RIGHT_SHIFT_EQ, loc); } else { // right-shift skip_input (); current_column += 2; loc += 1; return Token::make (RIGHT_SHIFT, loc); } } else if (peek_input () == '=') { // larger than or equal to skip_input (); current_column += 2; loc += 1; return Token::make (GREATER_OR_EQUAL, loc); } else { // larger than current_column++; return Token::make (RIGHT_ANGLE, loc); } case ':': if (peek_input () == ':') { // scope resolution :: skip_input (); current_column += 2; loc += 1; return Token::make (SCOPE_RESOLUTION, loc); } else { // single colon : current_column++; return Token::make (COLON, loc); } case '!': // no special handling for macros in lexer? 
if (peek_input () == '=') { // not equal boolean operator skip_input (); current_column += 2; loc += 1; return Token::make (NOT_EQUAL, loc); } else { // not equal unary operator current_column++; return Token::make (EXCLAM, loc); } case '?': current_column++; return Token::make (QUESTION_MARK, loc); case '#': current_column++; return Token::make (HASH, loc); case '[': current_column++; return Token::make (LEFT_SQUARE, loc); case ']': current_column++; return Token::make (RIGHT_SQUARE, loc); case '{': current_column++; return Token::make (LEFT_CURLY, loc); case '}': current_column++; return Token::make (RIGHT_CURLY, loc); case '@': current_column++; return Token::make (PATTERN_BIND, loc); case '$': current_column++; return Token::make (DOLLAR_SIGN, loc); case '~': current_column++; return Token::make (TILDE, loc); case '\\': current_column++; return Token::make (BACKSLASH, loc); case '`': current_column++; return Token::make (BACKTICK, loc); case '|': if (peek_input () == '=') { // bitwise or-assign? skip_input (); current_column += 2; loc += 1; return Token::make (PIPE_EQ, loc); } else if (peek_input () == '|') { // logical or skip_input (); current_column += 2; loc += 1; return Token::make (OR, loc); } else { // bitwise or current_column++; return Token::make (PIPE, loc); } case '&': if (peek_input () == '=') { // bitwise and-assign? skip_input (); current_column += 2; loc += 1; return Token::make (AMP_EQ, loc); } else if (peek_input () == '&') { // logical and skip_input (); current_column += 2; loc += 1; return Token::make (LOGICAL_AND, loc); } else { // bitwise and/reference current_column++; return Token::make (AMP, loc); } case '.': if (peek_input () == '.') { if (peek_input (1) == '.') { // ellipsis skip_input (1); current_column += 3; loc += 2; return Token::make (ELLIPSIS, loc); } else if (peek_input (1) == '=') { // ..= skip_input (1); current_column += 3; loc += 2; return Token::make (DOT_DOT_EQ, loc); } else { // .. 
skip_input (); current_column += 2; loc += 1; return Token::make (DOT_DOT, loc); } } else /*if (!ISDIGIT (peek_input ()))*/ { // single dot . // Only if followed by a non-number - otherwise is float // nope, float cannot start with '.'. current_column++; return Token::make (DOT, loc); } } // TODO: special handling of _ in the lexer? instead of being identifier // byte character, byte string and raw byte string literals if (current_char == 'b') { if (peek_input () == '\'') return parse_byte_char (loc); else if (peek_input () == '"') return parse_byte_string (loc); else if (peek_input () == 'r' && (peek_input (1) == '#' || peek_input (1) == '"')) return parse_raw_byte_string (loc); } // raw identifiers and raw strings if (current_char == 'r') { int peek = peek_input (); int peek1 = peek_input (1); if (peek == '#' && (ISALPHA (peek1) || peek1 == '_')) { TokenPtr raw_ident_ptr = parse_raw_identifier (loc); if (raw_ident_ptr != nullptr) return raw_ident_ptr; else continue; /* input got parsed, it just wasn't valid. An error was produced. 
*/ } else { TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc); if (maybe_raw_string_ptr != nullptr) return maybe_raw_string_ptr; } } // find identifiers and keywords if (ISALPHA (current_char) || current_char == '_') return parse_identifier_or_keyword (loc); // int and float literals if (ISDIGIT (current_char)) { // _ not allowed as first char if (current_char == '0' && is_non_decimal_int_literal_separator (peek_input ())) { // handle binary, octal, hex literals TokenPtr non_dec_int_lit_ptr = parse_non_decimal_int_literals (loc); if (non_dec_int_lit_ptr != nullptr) return non_dec_int_lit_ptr; } else { // handle decimals (integer or float) TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc); if (decimal_or_float_ptr != nullptr) return decimal_or_float_ptr; } } // string literals if (current_char == '"') return parse_string (loc); // char literals and lifetime names if (current_char == '\'') { TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc); if (char_or_lifetime_ptr != nullptr) return char_or_lifetime_ptr; } // DEBUG: check for specific character problems: if (current_char == '0') rust_debug ("'0' uncaught before unexpected character"); else if (current_char == ']') rust_debug ("']' uncaught before unexpected character"); else if (current_char == 0x5d) rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before " "unexpected character"); // didn't match anything so error rust_error_at (loc, "unexpected character %<%x%>", current_char); current_column++; } } // Parses in a type suffix. 
std::pair Lexer::parse_in_type_suffix () { std::string suffix; suffix.reserve (5); int additional_length_offset = 0; // get suffix while (ISALPHA (current_char) || ISDIGIT (current_char) || current_char == '_') { if (current_char == '_') { // don't add _ to suffix skip_input (); current_char = peek_input (); additional_length_offset++; continue; } additional_length_offset++; suffix += current_char; skip_input (); current_char = peek_input (); } if (suffix.empty ()) { // no type suffix: do nothing but also no error return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset); } else if (suffix == "f32") { return std::make_pair (CORETYPE_F32, additional_length_offset); } else if (suffix == "f64") { return std::make_pair (CORETYPE_F64, additional_length_offset); } else if (suffix == "i8") { return std::make_pair (CORETYPE_I8, additional_length_offset); } else if (suffix == "i16") { return std::make_pair (CORETYPE_I16, additional_length_offset); } else if (suffix == "i32") { return std::make_pair (CORETYPE_I32, additional_length_offset); } else if (suffix == "i64") { return std::make_pair (CORETYPE_I64, additional_length_offset); } else if (suffix == "i128") { return std::make_pair (CORETYPE_I128, additional_length_offset); } else if (suffix == "isize") { return std::make_pair (CORETYPE_ISIZE, additional_length_offset); } else if (suffix == "u8") { return std::make_pair (CORETYPE_U8, additional_length_offset); } else if (suffix == "u16") { return std::make_pair (CORETYPE_U16, additional_length_offset); } else if (suffix == "u32") { return std::make_pair (CORETYPE_U32, additional_length_offset); } else if (suffix == "u64") { return std::make_pair (CORETYPE_U64, additional_length_offset); } else if (suffix == "u128") { return std::make_pair (CORETYPE_U128, additional_length_offset); } else if (suffix == "usize") { return std::make_pair (CORETYPE_USIZE, additional_length_offset); } else { rust_error_at (get_current_location (), "unknown number suffix %qs", 
suffix.c_str ()); return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset); } } // Parses in the exponent part (if any) of a float literal. std::pair Lexer::parse_in_exponent_part () { int additional_length_offset = 0; std::string str; if (current_char == 'E' || current_char == 'e') { // add exponent to string as strtod works with it str += current_char; skip_input (); current_char = peek_input (); additional_length_offset++; // special - and + handling if (current_char == '-') { str += '-'; skip_input (); current_char = peek_input (); additional_length_offset++; } else if (current_char == '+') { // don't add + but still skip input skip_input (); current_char = peek_input (); additional_length_offset++; } // parse another decimal number for exponent auto str_length = parse_in_decimal (); str += std::get<0> (str_length); additional_length_offset += std::get<1> (str_length); } return std::make_pair (str, additional_length_offset); } // Parses a decimal integer. std::tuple Lexer::parse_in_decimal () { /* A pure decimal contains only digits. */ bool pure_decimal = true; int additional_length_offset = 0; std::string str; while (ISDIGIT (current_char) || current_char == '_') { if (current_char == '_') { pure_decimal = false; // don't add _ to number skip_input (); current_char = peek_input (); additional_length_offset++; continue; } additional_length_offset++; str += current_char; skip_input (); current_char = peek_input (); } return std::make_tuple (str, additional_length_offset, pure_decimal); } /* Parses escapes (and string continues) in "byte" strings and characters. Does * not support unicode. 
*/ std::tuple Lexer::parse_escape (char opening_char) { int additional_length_offset = 0; char output_char = 0; // skip to actual letter skip_input (); current_char = peek_input (); additional_length_offset++; switch (current_char) { case 'x': { auto hex_escape_pair = parse_partial_hex_escape (); long hexLong = hex_escape_pair.first; additional_length_offset += hex_escape_pair.second; if (hexLong > 255 || hexLong < 0) rust_error_at ( get_current_location (), "byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>", static_cast (hexLong)); /* TODO: restore capital for escape output - gcc pretty-printer doesn't * support %X directly */ char hexChar = static_cast (hexLong); output_char = hexChar; } break; case 'n': output_char = '\n'; break; case 'r': output_char = '\r'; break; case 't': output_char = '\t'; break; case '\\': output_char = '\\'; break; case '0': output_char = '\0'; break; case '\'': output_char = '\''; break; case '"': output_char = '"'; break; case 'u': rust_error_at (get_current_location (), "cannot have a unicode escape \\u in a byte %s", opening_char == '\'' ? "character" : "string"); // Try to parse it anyway, just to skip it parse_partial_unicode_escape (); return std::make_tuple (output_char, additional_length_offset, false); case '\r': case '\n': // string continue return std::make_tuple (0, parse_partial_string_continue (), true); default: rust_error_at (get_current_location (), "unknown escape sequence %<\\%c%>", current_char); // returns false if no parsing could be done // return false; return std::make_tuple (output_char, additional_length_offset, false); break; } // all non-special cases (string continue) should skip their used char skip_input (); current_char = peek_input (); additional_length_offset++; // returns true if parsing was successful // return true; return std::make_tuple (output_char, additional_length_offset, false); } /* Parses an escape (or string continue) in a string or character. Supports * unicode escapes. 
*/ std::tuple Lexer::parse_utf8_escape () { Codepoint output_char; int additional_length_offset = 0; // skip to actual letter skip_input (); current_char = peek_input (); additional_length_offset++; switch (current_char) { case 'x': { auto hex_escape_pair = parse_partial_hex_escape (); long hexLong = hex_escape_pair.first; additional_length_offset += hex_escape_pair.second; if (hexLong > 127 || hexLong < 0) rust_error_at ( get_current_location (), "ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>", static_cast (hexLong)); /* TODO: restore capital for escape output - gcc pretty-printer doesn't * support %X directly */ char hexChar = static_cast (hexLong); output_char = hexChar; } break; case 'n': output_char = '\n'; break; case 'r': output_char = '\r'; break; case 't': output_char = '\t'; break; case '\\': output_char = '\\'; break; case '0': output_char = '\0'; break; case '\'': output_char = '\''; break; case '"': output_char = '"'; break; case 'u': { auto unicode_escape_pair = parse_partial_unicode_escape (); output_char = unicode_escape_pair.first; additional_length_offset += unicode_escape_pair.second; return std::make_tuple (output_char, additional_length_offset, false); } break; case '\r': case '\n': // string continue return std::make_tuple (0, parse_partial_string_continue (), true); default: rust_error_at (get_current_location (), "unknown escape sequence %<\\%c%>", current_char); // returns false if no parsing could be done // return false; return std::make_tuple (output_char, additional_length_offset, false); break; } /* all non-special cases (unicode, string continue) should skip their used * char */ skip_input (); current_char = peek_input (); additional_length_offset++; // returns true if parsing was successful // return true; return std::make_tuple (output_char, additional_length_offset, false); } // Parses the body of a string continue that has been found in an escape. 
int Lexer::parse_partial_string_continue () { int additional_length_offset = 1; // string continue while (is_whitespace (current_char)) { if (current_char == '\n') { current_line++; current_column = 1; // tell line_table that new line starts start_line (current_line, max_column_hint); // reset "length" additional_length_offset = 1; // get next char skip_input (); current_char = peek_input (); continue; } skip_input (); current_char = peek_input (); additional_length_offset++; } return additional_length_offset; } /* Parses the body of a '\x' escape. Note that it does not check that the number * is valid and smaller than 255. */ std::pair Lexer::parse_partial_hex_escape () { // hex char string (null-terminated) char hexNum[3] = {0, 0, 0}; // first hex char current_char = peek_input (1); int additional_length_offset = 1; if (!is_x_digit (current_char)) { rust_error_at (get_current_location (), "invalid character %<\\x%c%> in \\x sequence", current_char); return std::make_pair (0, 0); } hexNum[0] = current_char; // second hex char skip_input (); current_char = peek_input (1); additional_length_offset++; if (!is_x_digit (current_char)) { rust_error_at (get_current_location (), "invalid character %<\\x%c%c%> in \\x sequence", hexNum[0], current_char); return std::make_pair (0, 1); } skip_input (); hexNum[1] = current_char; long hexLong = std::strtol (hexNum, nullptr, 16); return std::make_pair (hexLong, additional_length_offset); } // Parses the body of a unicode escape. std::pair Lexer::parse_partial_unicode_escape () { skip_input (); current_char = peek_input (); int additional_length_offset = 0; if (current_char != '{') { rust_error_at (get_current_location (), "unicode escape should start with %<{%>"); /* Skip what should probaby have been between brackets. 
*/ while (is_x_digit (current_char) || current_char == '_') { skip_input (); current_char = peek_input (); additional_length_offset++; } return std::make_pair (Codepoint (0), additional_length_offset); } skip_input (); current_char = peek_input (); additional_length_offset++; if (current_char == '_') { rust_error_at (get_current_location (), "unicode escape cannot start with %<_%>"); skip_input (); current_char = peek_input (); additional_length_offset++; // fallthrough and try to parse the rest anyway } // parse unicode escape - 1-6 hex digits std::string num_str; num_str.reserve (6); // loop through to add entire hex number to string while (is_x_digit (current_char) || current_char == '_') { if (current_char == '_') { // don't add _ to number skip_input (); current_char = peek_input (); additional_length_offset++; continue; } additional_length_offset++; // add raw hex numbers num_str += current_char; skip_input (); current_char = peek_input (); } if (current_char == '}') { skip_input (); current_char = peek_input (); additional_length_offset++; } else { // actually an error, but allow propagation anyway Assume that // wrong bracketm whitespace or single/double quotes are wrong // termination, otherwise it is a wrong character, then skip to the actual // terminator. 
if (current_char == '{' || is_whitespace (current_char) || current_char == '\'' || current_char == '"') { rust_error_at (get_current_location (), "expected terminating %<}%> in unicode escape"); return std::make_pair (Codepoint (0), additional_length_offset); } else { rust_error_at (get_current_location (), "invalid character %<%c%> in unicode escape", current_char); while (current_char != '}' && current_char != '{' && !is_whitespace (current_char) && current_char != '\'' && current_char != '"') { skip_input (); current_char = peek_input (); additional_length_offset++; } // Consume the actual closing bracket if found if (current_char == '}') { skip_input (); current_char = peek_input (); additional_length_offset++; } return std::make_pair (Codepoint (0), additional_length_offset); } } // ensure 1-6 hex characters if (num_str.length () > 6 || num_str.length () < 1) { rust_error_at (get_current_location (), "unicode escape should be between 1 and 6 hex " "characters; it is %lu", (unsigned long) num_str.length ()); // return false; return std::make_pair (Codepoint (0), additional_length_offset); } unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16); if (hex_num > 0xd7ff && hex_num < 0xe000) { rust_error_at ( get_current_location (), "unicode escape cannot be a surrogate value (D800 to DFFF)"); return std::make_pair (Codepoint (0), additional_length_offset); } if (hex_num > 0x10ffff) { rust_error_at (get_current_location (), "unicode escape cannot be larger than 10FFFF"); return std::make_pair (Codepoint (0), additional_length_offset); } // return true; return std::make_pair (Codepoint (static_cast (hex_num)), additional_length_offset); } // Parses a byte character. 
// Lexes the remainder of a byte character literal and returns a byte-char
// token for it. The first skip below consumes one character that precedes the
// literal's content (presumably the opening quote - confirm against caller).
// Diagnostics are emitted for a missing closing quote or empty content; a
// token is returned in every case.
TokenPtr
Lexer::parse_byte_char (Location loc)
{
  skip_input ();
  current_column++;
  // make current char the next character
  current_char = peek_input ();

  int length = 1;

  // char to save
  char byte_char = 0;

  // detect escapes
  if (current_char == '\\')
    {
      auto escape_length_pair = parse_escape ('\'');
      byte_char = std::get<0> (escape_length_pair);
      length += std::get<1> (escape_length_pair);

      current_char = peek_input ();

      if (current_char != '\'')
        {
          rust_error_at (get_current_location (), "unclosed %<byte char%>");
        }

      skip_input ();
      current_char = peek_input ();
      length++; // go to next char
    }
  else if (current_char != '\'')
    {
      // otherwise, get character from direct input character
      byte_char = current_char;

      skip_input ();
      current_char = peek_input ();
      length++;

      if (current_char != '\'')
        {
          rust_error_at (get_current_location (), "unclosed %<byte char%>");
        }

      skip_input ();
      current_char = peek_input ();
      length++; // go to next char
    }
  else
    {
      // nothing between the quotes; note that no input is consumed here
      rust_error_at (get_current_location (),
                     "no character inside %<%> for %<byte char%>");
    }

  current_column += length;

  loc += length - 1;
  return Token::make_byte_char (loc, byte_char);
}

// Parses a byte string.
// Lexes a byte string literal up to its closing double quote and returns a
// byte-string token holding the decoded bytes. If the input ends first, an
// error is reported and an END_OF_FILE token is returned instead.
TokenPtr
Lexer::parse_byte_string (Location loc)
{
  // byte string

  // skip quote character
  skip_input ();
  current_column++;

  std::string str;
  str.reserve (16); // some sensible default

  int length = 1;
  current_char = peek_input ();

  while (current_char != '"' && current_char != EOF)
    {
      if (current_char == '\\')
        {
          auto escape_length_pair = parse_escape ('"');
          char output_char = std::get<0> (escape_length_pair);

          // a string continue restarts the length count on the new line and
          // contributes no byte to the literal
          if (output_char == 0 && std::get<2> (escape_length_pair))
            length = std::get<1> (escape_length_pair) - 1;
          else
            length += std::get<1> (escape_length_pair);

          if (output_char != 0 || !std::get<2> (escape_length_pair))
            str += output_char;

          continue;
        }

      length++;

      str += current_char;
      skip_input ();
      current_char = peek_input ();
    }

  current_column += length;

  if (current_char == '"')
    {
      current_column++;

      skip_input ();
      current_char = peek_input ();
    }
  else if (current_char == EOF)
    {
      rust_error_at (get_current_location (), "unended byte string literal");
      return Token::make (END_OF_FILE, get_current_location ());
    }
  else
    {
      gcc_unreachable ();
    }

  str.shrink_to_fit ();
  // Advance by the lexed source length, as the other literal parsers do. The
  // decoded size differs from it when escapes are present, and
  // `str.size () - 1` wraps around for an empty literal.
  loc += length - 1;

  return Token::make_byte_string (loc, std::move (str));
}

// Parses a raw byte string.
// Lexes a raw byte string literal: a run of '#', an opening double quote, the
// content, then a closing double quote followed by the same number of '#'.
// Bytes above 127 are reported and replaced with 0. If the input ends before
// the terminator, an error is reported and an END_OF_FILE token is returned.
TokenPtr
Lexer::parse_raw_byte_string (Location loc)
{
  // raw byte string literals
  std::string str;
  str.reserve (16); // some sensible default

  int length = 1;
  int hash_count = 0;

  // get hash count at beginning
  skip_input ();
  current_char = peek_input ();
  length++;
  while (current_char == '#')
    {
      hash_count++;
      length++;

      skip_input ();
      current_char = peek_input ();
    }

  if (current_char != '"')
    {
      rust_error_at (get_current_location (),
                     "raw byte string has no opening %<\"%>");
    }

  skip_input ();
  current_char = peek_input ();
  length++;

  while (true)
    {
      if (current_char == '"')
        {
          bool enough_hashes = true;

          for (int i = 0; i < hash_count; i++)
            {
              if (peek_input (i + 1) != '#')
                {
                  enough_hashes = false;
                  break;
                }
            }

          if (enough_hashes)
            {
              // skip enough input and peek enough input
              skip_input (hash_count);
              current_char = peek_input ();
              length += hash_count + 1;
              break;
            }
        }

      // Without this exit an unterminated literal would spin forever, since
      // the loop has no other way to stop at end of input.
      if (current_char == EOF)
        {
          current_column += length;
          rust_error_at (get_current_location (),
                         "unended raw byte string literal");
          return Token::make (END_OF_FILE, get_current_location ());
        }

      if ((unsigned char) current_char > 127)
        {
          rust_error_at (get_current_location (),
                         "character %<%c%> in raw byte string out of range",
                         current_char);
          current_char = 0;
        }

      length++;

      str += current_char;
      skip_input ();
      current_char = peek_input ();
    }

  current_column += length;

  loc += length - 1;

  str.shrink_to_fit ();

  return Token::make_byte_string (loc, std::move (str));
}

// Parses a raw identifier.
TokenPtr Lexer::parse_raw_identifier (Location loc) { // raw identifier std::string str; str.reserve (16); // default skip_input (); current_char = peek_input (); current_column += 2; bool first_is_underscore = current_char == '_'; int length = 0; current_char = peek_input (); // loop through entire name while (ISALPHA (current_char) || ISDIGIT (current_char) || current_char == '_') { length++; str += current_char; skip_input (); current_char = peek_input (); } current_column += length; // if just a single underscore, not an identifier if (first_is_underscore && length == 1) rust_error_at (get_current_location (), "%<_%> is not a valid raw identifier"); if (str == "crate" || str == "extern" || str == "self" || str == "super" || str == "Self") { rust_error_at (get_current_location (), "%qs is a forbidden raw identifier", str.c_str ()); return nullptr; } else { str.shrink_to_fit (); loc += length - 1; return Token::make_identifier (loc, std::move (str)); } } // skip broken string input (unterminated strings) void Lexer::skip_broken_string_input (int current_char) { while (current_char != '"' && current_char != EOF) { if (current_char == '\n') { current_line++; current_column = 1; } else { current_column++; } skip_input (); current_char = peek_input (); } if (current_char == '"') { current_column++; skip_input (); current_char = peek_input (); } rust_debug ("skipped to %d:%d due to bad quotes", current_line, current_column); } // Parses a unicode string. TokenPtr Lexer::parse_string (Location loc) { Codepoint current_char32; std::string str; str.reserve (16); // some sensible default int length = 1; current_char32 = peek_codepoint_input (); // FIXME: This fails if the input ends. How do we check for EOF? 
while (current_char32.value != '"' && !current_char32.is_eof ()) { if (current_char32.value == '\\') { // parse escape auto utf8_escape_pair = parse_utf8_escape (); current_char32 = std::get<0> (utf8_escape_pair); if (current_char32 == Codepoint (0) && std::get<2> (utf8_escape_pair)) length = std::get<1> (utf8_escape_pair) - 1; else length += std::get<1> (utf8_escape_pair); if (current_char32 != Codepoint (0) || !std::get<2> (utf8_escape_pair)) str += current_char32; // required as parsing utf8 escape only changes current_char current_char32 = peek_codepoint_input (); continue; } length += get_input_codepoint_length (); str += current_char32; skip_codepoint_input (); current_char32 = peek_codepoint_input (); } current_column += length; if (current_char32.value == '"') { current_column++; skip_input (); current_char = peek_input (); } else if (current_char32.is_eof ()) { rust_error_at (get_current_location (), "unended string literal"); return Token::make (END_OF_FILE, get_current_location ()); } else { gcc_unreachable (); } str.shrink_to_fit (); loc += length - 1; return Token::make_string (loc, std::move (str)); } // Parses an identifier or keyword. 
// Builds a token from the word that starts at current_char: UNDERSCORE for a
// lone '_', the keyword token when classify_keyword recognises the word, and
// an identifier token otherwise.
TokenPtr
Lexer::parse_identifier_or_keyword (Location loc)
{
  std::string word;
  word.reserve (16); // default
  word += current_char;

  const bool starts_with_underscore = (current_char == '_');

  int length = 1;
  current_char = peek_input ();
  // take every following letter, digit and underscore
  for (;;)
    {
      const bool is_word_char = ISALPHA (current_char)
                                || ISDIGIT (current_char)
                                || current_char == '_';
      if (!is_word_char)
        break;

      length++;
      word += current_char;
      skip_input ();
      current_char = peek_input ();
    }

  current_column += length;

  // if just a single underscore, not an identifier
  if (length == 1 && starts_with_underscore)
    return Token::make (UNDERSCORE, loc);

  word.shrink_to_fit ();
  loc += length - 1;

  const TokenId token_id = classify_keyword (word);
  if (token_id != IDENTIFIER)
    return Token::make (token_id, loc);

  return Token::make_identifier (loc, std::move (word));
}

// Possibly returns a raw string token if it exists - otherwise returns null.
TokenPtr
Lexer::maybe_parse_raw_string (Location loc)
{
  // count the '#' characters ahead without consuming anything
  int hash_count = 0;
  for (; peek_input (hash_count) == '#'; hash_count++)
    ;

  // only a double quote after the hashes makes this a raw string
  if (peek_input (hash_count) != '"')
    return nullptr;

  return parse_raw_string (loc, hash_count);
}

// Returns a raw string token.
// Lexes a raw string whose opening is INITIAL_HASH_COUNT '#' characters
// followed by a double quote. Content is taken verbatim until a double quote
// followed by the same number of '#', or until the end of input.
TokenPtr
Lexer::parse_raw_string (Location loc, int initial_hash_count)
{
  // raw string literals
  std::string str;
  str.reserve (16); // some sensible default

  int length = 1 + initial_hash_count;

  if (initial_hash_count > 0)
    skip_input (initial_hash_count - 1);

  current_char = peek_input ();

  if (current_char != '"')
    rust_error_at (get_current_location (), "raw string has no opening %<\"%>");

  length++;
  skip_input ();
  Codepoint current_char32 = peek_codepoint_input ();

  while (!current_char32.is_eof ())
    {
      if (current_char32.value == '"')
        {
          bool enough_hashes = true;

          for (int i = 0; i < initial_hash_count; i++)
            {
              if (peek_input (i + 1) != '#')
                {
                  enough_hashes = false;
                  break;
                }
            }

          if (enough_hashes)
            {
              // skip enough input and peek enough input
              skip_input (initial_hash_count);
              current_char = peek_input ();
              length += initial_hash_count + 1;
              break;
            }
        }

      length++;

      str += current_char32;
      skip_codepoint_input ();
      current_char32 = peek_codepoint_input ();
    }

  current_column += length;

  loc += length - 1;

  str.shrink_to_fit ();

  return Token::make_string (loc, std::move (str));
}

// Lexes the digits of a hex, octal or binary literal. IS_DIGIT_FUNC decides
// which characters are digits, EXISTENT_STR holds the text collected so far
// and BASE is the radix used to convert it. The token carries the value
// rewritten in decimal. Returns nullptr (after a diagnostic) when a float
// type suffix follows.
template <typename IsDigitFunc>
TokenPtr
Lexer::parse_non_decimal_int_literal (Location loc, IsDigitFunc is_digit_func,
                                      std::string existent_str, int base)
{
  int length = 1;

  skip_input ();
  current_char = peek_input ();

  length++;

  // loop through to add entire number to string
  while (is_digit_func (current_char) || current_char == '_')
    {
      if (current_char == '_')
        {
          // don't add _ to number
          skip_input ();
          current_char = peek_input ();

          length++;

          continue;
        }

      length++;

      // add raw numbers
      existent_str += current_char;
      skip_input ();
      current_char = peek_input ();
    }

  // Convert value to decimal representation. The literal has no sign, so use
  // the widest unsigned conversion: a signed long saturates at LONG_MAX and
  // would silently corrupt large unsigned values.
  unsigned long long dec_num
    = std::strtoull (existent_str.c_str (), nullptr, base);
  existent_str = std::to_string (dec_num);

  // parse in type suffix if it exists
  auto type_suffix_pair = parse_in_type_suffix ();
  PrimitiveCoreType type_hint = type_suffix_pair.first;
  length += type_suffix_pair.second;

  current_column += length;

  if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64)
    {
      rust_error_at (get_current_location (),
                     "invalid type suffix %qs for integer (%s) literal",
                     get_type_hint_string (type_hint),
                     base == 16
                       ? "hex"
                       : (base == 8 ? "octal"
                                    : (base == 2 ? "binary"
                                                 : "<insert unknown base>")));
      return nullptr;
    }

  loc += length - 1;

  return Token::make_int (loc, std::move (existent_str), type_hint);
}

// Parses a hex, binary or octal int literal.
// Dispatches on the character after the leading digit; returns nullptr when
// it is none of 'x', 'o' or 'b'.
TokenPtr
Lexer::parse_non_decimal_int_literals (Location loc)
{
  std::string str;
  str.reserve (16); // some sensible default
  str += current_char;

  current_char = peek_input ();

  if (current_char == 'x')
    {
      // hex (integer only)
      return parse_non_decimal_int_literal (loc, is_x_digit, str + "x", 16);
    }
  else if (current_char == 'o')
    {
      // octal (integer only)
      return parse_non_decimal_int_literal (loc, is_octal_digit,
                                            std::move (str), 8);
    }
  else if (current_char == 'b')
    {
      // binary (integer only)
      return parse_non_decimal_int_literal (loc, is_bin_digit,
                                            std::move (str), 2);
    }
  else
    {
      return nullptr;
    }
}

// Parses a decimal-based int literal or float literal.
TokenPtr
Lexer::parse_decimal_int_or_float (Location loc)
{
  std::string str;
  str.reserve (16); // some sensible default
  str += current_char;

  int length = 1;
  bool first_zero = current_char == '0';

  current_char = peek_input ();

  // parse initial decimal integer (or first integer part of float) literal
  auto initial_decimal = parse_in_decimal ();
  str += std::get<0> (initial_decimal);
  length += std::get<1> (initial_decimal);

  // detect float literal
  if (current_char == '.' && is_float_digit (peek_input (1)))
    {
      // float with a '.', parse another decimal into it

      // add . to str
      str += current_char;
      skip_input ();
      current_char = peek_input ();
      length++;

      // parse another decimal number for float
      auto second_decimal = parse_in_decimal ();
      str += std::get<0> (second_decimal);
      length += std::get<1> (second_decimal);

      // parse in exponent part if it exists
      auto exponent_pair = parse_in_exponent_part ();
      str += exponent_pair.first;
      length += exponent_pair.second;

      // parse in type suffix if it exists
      auto type_suffix_pair = parse_in_type_suffix ();
      PrimitiveCoreType type_hint = type_suffix_pair.first;
      length += type_suffix_pair.second;

      if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
          && type_hint != CORETYPE_UNKNOWN)
        {
          rust_error_at (get_current_location (),
                         "invalid type suffix %qs for floating-point literal",
                         get_type_hint_string (type_hint));
          // ignore invalid type suffix as everything else seems fine
          type_hint = CORETYPE_UNKNOWN;
        }

      current_column += length;

      loc += length - 1;

      str.shrink_to_fit ();
      return Token::make_float (loc, std::move (str), type_hint);
    }
  else if (current_char == '.' && check_valid_float_dot_end (peek_input (1)))
    {
      // float that is just an integer with a terminating '.' character

      // add . to str
      str += current_char;
      skip_input ();
      current_char = peek_input ();
      length++;

      // add a '0' after the . to prevent ambiguity
      str += '0';

      // type hint not allowed

      current_column += length;

      loc += length - 1;

      str.shrink_to_fit ();
      return Token::make_float (loc, std::move (str), CORETYPE_UNKNOWN);
    }
  else if (current_char == 'E' || current_char == 'e')
    {
      // exponent float with no '.' character

      // parse exponent part
      auto exponent_pair = parse_in_exponent_part ();
      str += exponent_pair.first;
      length += exponent_pair.second;

      // parse in type suffix if it exists
      auto type_suffix_pair = parse_in_type_suffix ();
      PrimitiveCoreType type_hint = type_suffix_pair.first;
      length += type_suffix_pair.second;

      if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
          && type_hint != CORETYPE_UNKNOWN)
        {
          rust_error_at (get_current_location (),
                         "invalid type suffix %qs for floating-point literal",
                         get_type_hint_string (type_hint));
          // ignore invalid type suffix as everything else seems fine
          type_hint = CORETYPE_UNKNOWN;
        }

      current_column += length;

      loc += length - 1;

      str.shrink_to_fit ();
      return Token::make_float (loc, std::move (str), type_hint);
    }
  else
    {
      // is an integer

      // parse in type suffix if it exists
      auto type_suffix_pair = parse_in_type_suffix ();
      PrimitiveCoreType type_hint = type_suffix_pair.first;
      /* A "real" pure decimal doesn't have a suffix and no zero prefix. */
      if (type_hint == CORETYPE_UNKNOWN)
        {
          bool pure_decimal = std::get<2> (initial_decimal);
          if (pure_decimal && (!first_zero || str.size () == 1))
            type_hint = CORETYPE_PURE_DECIMAL;
        }
      length += type_suffix_pair.second;

      current_column += length;

      loc += length - 1;

      str.shrink_to_fit ();
      return Token::make_int (loc, std::move (str), type_hint);
    }
}

// Lexes what follows a single quote: an escaped char literal, a plain char
// literal, or a lifetime name. Returns nullptr at end of input, or (after a
// diagnostic) when the text is none of these.
TokenPtr
Lexer::parse_char_or_lifetime (Location loc)
{
  Codepoint current_char32;

  int length = 1;

  current_char32 = peek_codepoint_input ();
  if (current_char32.is_eof ())
    return nullptr;

  // parse escaped char literal
  if (current_char32.value == '\\')
    {
      // parse escape
      auto utf8_escape_pair = parse_utf8_escape ();
      current_char32 = std::get<0> (utf8_escape_pair);
      length += std::get<1> (utf8_escape_pair);

      if (peek_codepoint_input ().value != '\'')
        {
          rust_error_at (get_current_location (), "unended character literal");
        }
      else
        {
          skip_codepoint_input ();
          current_char = peek_input ();
          length++;
        }

      current_column += length;

      loc += length - 1;

      return Token::make_char (loc, current_char32);
    }
  else
    {
      skip_codepoint_input ();

      if (peek_codepoint_input ().value == '\'')
        {
          // parse non-escaped char literal

          // skip the ' character
          skip_input ();
          current_char = peek_input ();

          // TODO fix due to different widths of utf-8 chars?
          current_column += 3;

          loc += 2;

          return Token::make_char (loc, current_char32);
        }
      else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value)
               || current_char32.value == '_')
        {
          // parse lifetime name
          std::string str;
          str += current_char32;
          length++;

          current_char = peek_input ();
          while (ISDIGIT (current_char) || ISALPHA (current_char)
                 || current_char == '_')
            {
              str += current_char;
              skip_input ();
              current_char = peek_input ();
              length++;
            }

          current_column += length;

          loc += length - 1;

          str.shrink_to_fit ();
          return Token::make_lifetime (loc, std::move (str));
        }
      else
        {
          rust_error_at (
            get_current_location (),
            "expected %' after character constant in character literal");
          return nullptr;
        }
    }
}

// Returns the length of the codepoint at the current position.
int Lexer::get_input_codepoint_length () { uint8_t input = peek_input (); if ((int8_t) input == EOF) return 0; if (input < 128) { // ascii -- 1 byte // return input; return 1; } else if ((input & 0xC0) == 0x80) { // invalid (continuation; can't be first char) // return 0xFFFE; return 0; } else if ((input & 0xE0) == 0xC0) { // 2 bytes uint8_t input2 = peek_input (1); if ((input2 & 0xC0) != 0x80) return 0; // return 0xFFFE; // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); // return output; return 2; } else if ((input & 0xF0) == 0xE0) { // 3 bytes uint8_t input2 = peek_input (1); if ((input2 & 0xC0) != 0x80) return 0; // return 0xFFFE; uint8_t input3 = peek_input (2); if ((input3 & 0xC0) != 0x80) return 0; // return 0xFFFE; /*uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0); return output;*/ return 3; } else if ((input & 0xF8) == 0xF0) { // 4 bytes uint8_t input2 = peek_input (1); if ((input2 & 0xC0) != 0x80) return 0; // return 0xFFFE; uint8_t input3 = peek_input (2); if ((input3 & 0xC0) != 0x80) return 0; // return 0xFFFE; uint8_t input4 = peek_input (3); if ((input4 & 0xC0) != 0x80) return 0; // return 0xFFFE; /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); return output;*/ return 4; } else { rust_error_at (get_current_location (), "invalid UTF-8 [FIRST] (too long)"); return 0; } } // Returns the codepoint at the current position. 
Codepoint Lexer::peek_codepoint_input () { uint8_t input = peek_input (); if ((int8_t) input == EOF) return Codepoint::eof (); if (input < 128) { // ascii -- 1 byte return {input}; } else if ((input & 0xC0) == 0x80) { // invalid (continuation; can't be first char) return {0xFFFE}; } else if ((input & 0xE0) == 0xC0) { // 2 bytes uint8_t input2 = peek_input (1); if ((input2 & 0xC0) != 0x80) return {0xFFFE}; uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); return {output}; } else if ((input & 0xF0) == 0xE0) { // 3 bytes uint8_t input2 = peek_input (1); if ((input2 & 0xC0) != 0x80) return {0xFFFE}; uint8_t input3 = peek_input (2); if ((input3 & 0xC0) != 0x80) return {0xFFFE}; uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0); return {output}; } else if ((input & 0xF8) == 0xF0) { // 4 bytes uint8_t input2 = peek_input (1); if ((input2 & 0xC0) != 0x80) return {0xFFFE}; uint8_t input3 = peek_input (2); if ((input3 & 0xC0) != 0x80) return {0xFFFE}; uint8_t input4 = peek_input (3); if ((input4 & 0xC0) != 0x80) return {0xFFFE}; uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); return {output}; } else { rust_error_at (get_current_location (), "invalid UTF-8 [SECND] (too long)"); return {0xFFFE}; } } void Lexer::skip_codepoint_input () { int toSkip = get_input_codepoint_length (); gcc_assert (toSkip >= 1); skip_input (toSkip - 1); } int Lexer::test_get_input_codepoint_n_length (int n_start_offset) { uint8_t input = peek_input (n_start_offset); if (input < 128) { // ascii -- 1 byte // return input; return 1; } else if ((input & 0xC0) == 0x80) { // invalid (continuation; can't be first char) // return 0xFFFE; return 0; } else if ((input & 0xE0) == 0xC0) { // 2 bytes uint8_t input2 = peek_input (n_start_offset + 1); if ((input2 & 0xC0) != 0x80) // return 0xFFFE; return 0; // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); // return 
output; return 2; } else if ((input & 0xF0) == 0xE0) { // 3 bytes uint8_t input2 = peek_input (n_start_offset + 1); if ((input2 & 0xC0) != 0x80) // return 0xFFFE; return 0; uint8_t input3 = peek_input (n_start_offset + 2); if ((input3 & 0xC0) != 0x80) // return 0xFFFE; return 0; /*uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0); return output;*/ return 3; } else if ((input & 0xF8) == 0xF0) { // 4 bytes uint8_t input2 = peek_input (n_start_offset + 1); if ((input2 & 0xC0) != 0x80) // return 0xFFFE; return 0; uint8_t input3 = peek_input (n_start_offset + 2); if ((input3 & 0xC0) != 0x80) // return 0xFFFE; return 0; uint8_t input4 = peek_input (n_start_offset + 3); if ((input4 & 0xC0) != 0x80) // return 0xFFFE; return 0; /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); return output;*/ return 4; } else { rust_error_at (get_current_location (), "invalid UTF-8 [THIRD] (too long)"); return 0; } } // peeks the codepoint input at n codepoints ahead of current codepoint - try // not to use Codepoint Lexer::test_peek_codepoint_input (int n) { int totalOffset = 0; // add up all offsets into total offset? does this do what I want? for (int i = 0; i < n; i++) { totalOffset += test_get_input_codepoint_n_length (totalOffset); } // issues: this would have (at least) O(n) lookup time, not O(1) like the // rest? 
// TODO: implement if still needed // error out of function as it is not implemented gcc_assert (1 == 0); return {0}; /* uint8_t input = peek_input(); if (input < 128) { // ascii -- 1 byte return input; } else if ((input & 0xC0) == 0x80) { // invalid (continuation; can't be first char) return 0xFFFE; } else if ((input & 0xE0) == 0xC0) { // 2 bytes uint8_t input2 = peek_input(1); if ((input2 & 0xC0) != 0x80) return 0xFFFE; uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); return output; } else if ((input & 0xF0) == 0xE0) { // 3 bytes uint8_t input2 = peek_input(1); if ((input2 & 0xC0) != 0x80) return 0xFFFE; uint8_t input3 = peek_input(2); if ((input3 & 0xC0) != 0x80) return 0xFFFE; uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0); return output; } else if ((input & 0xF8) == 0xF0) { // 4 bytes uint8_t input2 = peek_input(1); if ((input2 & 0xC0) != 0x80) return 0xFFFE; uint8_t input3 = peek_input(2); if ((input3 & 0xC0) != 0x80) return 0xFFFE; uint8_t input4 = peek_input(3); if ((input4 & 0xC0) != 0x80) return 0xFFFE; uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); return output; } else { rust_error_at(get_current_location(), "invalid UTF-8 (too long)"); return 0xFFFE; }*/ } void Lexer::split_current_token (TokenId new_left, TokenId new_right) { /* TODO: assert that this TokenId is a "simple token" like punctuation and not * like "IDENTIFIER"? */ Location current_loc = peek_token ()->get_locus (); TokenPtr new_left_tok = Token::make (new_left, current_loc); TokenPtr new_right_tok = Token::make (new_right, current_loc + 1); token_queue.replace_current_value (std::move (new_left_tok)); token_queue.insert (1, std::move (new_right_tok)); } void Lexer::start_line (int current_line, int current_column) { if (line_map) line_map->start_line (current_line, current_column); } } // namespace Rust