diff options
author | SimplyTheOther <simplytheother@gmail.com> | 2020-08-20 11:51:25 +0800 |
---|---|---|
committer | Philip Herron <philip.herron@embecosm.com> | 2020-11-28 21:13:20 +0000 |
commit | b758ec724cc06cb866a72ce17dbfd8a426cf21db (patch) | |
tree | cffb4e7c584f7da75ea42e57011c5a1d160d67a2 /gcc | |
parent | c26f60f6a28394e98ac1d830cbe8f632ef576dbb (diff) | |
download | gcc-b758ec724cc06cb866a72ce17dbfd8a426cf21db.zip gcc-b758ec724cc06cb866a72ce17dbfd8a426cf21db.tar.gz gcc-b758ec724cc06cb866a72ce17dbfd8a426cf21db.tar.bz2 |
Lexer cleanup
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/rust/lex/rust-codepoint.h | 23 | ||||
-rw-r--r-- | gcc/rust/lex/rust-lex.cc | 748 | ||||
-rw-r--r-- | gcc/rust/lex/rust-lex.h | 50 | ||||
-rw-r--r-- | gcc/rust/lex/rust-token.h | 1 | ||||
-rw-r--r-- | gcc/rust/rust-linemap.cc | 3 | ||||
-rw-r--r-- | gcc/rust/rust-session-manager.cc | 6 |
6 files changed, 426 insertions, 405 deletions
diff --git a/gcc/rust/lex/rust-codepoint.h b/gcc/rust/lex/rust-codepoint.h index 0f2e5bd..d95bfdf 100644 --- a/gcc/rust/lex/rust-codepoint.h +++ b/gcc/rust/lex/rust-codepoint.h @@ -1,11 +1,6 @@ #ifndef RUST_CODEPOINT_H #define RUST_CODEPOINT_H -#include "config.h" -#include "system.h" -#include "coretypes.h" -// config, system, coretypes - TODO: ensure all are needed - #include <string> namespace Rust { @@ -16,11 +11,21 @@ struct Codepoint // Creates a zero codepoint. Codepoint () : value (0) {} - // Creates a codepoint from UTF-8 value. - Codepoint (uint32_t value_) : value (value_) {} + // Creates a codepoint from an encoded UTF-8 value. + Codepoint (uint32_t value) : value (value) {} + + // Returns a C++ string containing string value of codepoint. + std::string as_string (); + + bool operator== (Codepoint other) const + { + return value == other.value; + } - // Returns a C++ string containing value of codepoint. - ::std::string as_string (); + bool operator!= (Codepoint other) const + { + return !operator== (other); + } }; } // namespace Rust diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc index 54adf48..6f50e21 100644 --- a/gcc/rust/lex/rust-lex.cc +++ b/gcc/rust/lex/rust-lex.cc @@ -1,6 +1,6 @@ #include "rust-lex.h" -#include "rust-system.h" // for rust_assert and rust_unreachable +#include "rust-system.h" // for rust_assert and rust_unreachable #include "rust-diagnostics.h" // for rust_error_at #include "rust-linemap.h" #include "safe-ctype.h" @@ -10,7 +10,7 @@ namespace Rust { // TODO: move to separate compilation unit? // overload += for uint32_t to allow 32-bit encoded utf-8 to be added - ::std::string& operator+=(::std::string& str, Codepoint char32) { + std::string& operator+=(std::string& str, Codepoint char32) { if (char32.value < 0x80) { str += static_cast<char>(char32.value); } else if (char32.value < (0x1F + 1) << (1 * 6)) { @@ -27,64 +27,45 @@ namespace Rust { str += static_cast<char>(0x80 | ((char32.value >> 0) & 0x3F)); } else { fprintf(stderr, "Invalid unicode codepoint found: '%u' \n", char32.value); - // rust_error_at(get_current_location(), "Invalid unicode codepoint found: '%u'", - // char32.value); } return str; } - ::std::string Codepoint::as_string() { + std::string Codepoint::as_string() { std::string str; - // do i need to do this? or can i just do str += value due to op overloading? - - // ok can't figure out how to just convert to codepoint or use "this" so create new one - str += Codepoint(value); - - /*if (value < 0x80) { - str += static_cast<char>(value); - } else if (value < (0x1F + 1) << (1 * 6)) { - str += static_cast<char>(0xC0 | ((value >> 6) & 0x1F)); - str += static_cast<char>(0x80 | ((value >> 0) & 0x3F)); - } else if (value < (0x0F + 1) << (2 * 6)) { - str += static_cast<char>(0xE0 | ((value >> 12) & 0x0F)); - str += static_cast<char>(0x80 | ((value >> 6) & 0x3F)); - str += static_cast<char>(0x80 | ((value >> 0) & 0x3F)); - } else if (value < (0x07 + 1) << (3 * 6)) { - str += static_cast<char>(0xF0 | ((value >> 18) & 0x07)); - str += static_cast<char>(0x80 | ((value >> 12) & 0x3F)); - str += static_cast<char>(0x80 | ((value >> 6) & 0x3F)); - str += static_cast<char>(0x80 | ((value >> 0) & 0x3F)); - } else { - rust_error_at(get_current_location(), "Invalid unicode codepoint found: '%u'", value); - }*/ + // str += Codepoint (value); + str += *this; + return str; } - // Includes all allowable float digits EXCEPT _ and . as that needs lookahead for handling. - inline bool is_float_digit(char number) { + /* Includes all allowable float digits EXCEPT _ and . as that needs lookahead + * for handling. */ + bool is_float_digit(char number) { return ISDIGIT(number) || number == 'E' || number == 'e'; } - // Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or whatever is different - inline bool is_x_digit(char number) { + /* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or + * whatever is different */ + bool is_x_digit(char number) { return ISXDIGIT(number); } - inline bool is_octal_digit(char number) { + bool is_octal_digit(char number) { return number >= '0' && number <= '7'; } - inline bool is_bin_digit(char number) { + bool is_bin_digit(char number) { return number == '0' || number == '1'; } - inline bool check_valid_float_dot_end(char character) { + bool check_valid_float_dot_end(char character) { return character != '.' && character != '_' && !ISALPHA(character); } // ISSPACE from safe-ctype but may change in future - inline bool is_whitespace(char character) { + bool is_whitespace(char character) { return ISSPACE(character); } @@ -96,21 +77,22 @@ namespace Rust { } Lexer::~Lexer() { - /* ok apparently stop (which is equivalent of original code in destructor) is meant to be - * called after all files have finished parsing, for cleanup. On the other hand, actual code - * that it calls to leave a certain line map is mentioned in GCC docs as being useful for - * "just leaving an included header" and stuff like that, so this line mapping functionality - * may need fixing. + /* ok apparently stop (which is equivalent of original code in destructor) is + * meant to be called after all files have finished parsing, for cleanup. On + * the other hand, actual code that it calls to leave a certain line map is + * mentioned in GCC docs as being useful for "just leaving an included header" + * and stuff like that, so this line mapping functionality may need fixing. * FIXME: find out whether this occurs. */ // line_map->stop(); } - // TODO: need to optimise somehow to avoid the virtual function call in the tight loop. - // Best idea at the moment is CRTP, but that might make lexer implementation annoying when storing - // the "base class" (i.e. would need template parameter everywhere), although in practice it would - // mostly just look ugly and make enclosing classes like Parser also require a type parameter. - // At this point a macro might be better. - // OK I guess macros can be replaced by constexpr if or something if possible. + /* TODO: need to optimise somehow to avoid the virtual function call in the + * tight loop. Best idea at the moment is CRTP, but that might make lexer + * implementation annoying when storing the "base class" (i.e. would need + * template parameter everywhere), although in practice it would mostly just + * look ugly and make enclosing classes like Parser also require a type + * parameter. At this point a macro might be better. OK I guess macros can be + * replaced by constexpr if or something if possible. */ Location Lexer::get_current_location() { return line_map->get_location(current_column); } @@ -151,9 +133,9 @@ namespace Rust { token_queue.replace_current_value(replacement); } - /* shitty anonymous namespace that can only be accessed inside the compilation unit - used for - * classify_keyword - * Binary search in sorted array of keywords created with x-macros. */ + /* shitty anonymous namespace that can only be accessed inside the compilation + * unit - used for classify_keyword Binary search in sorted array of keywords + * created with x-macros. */ namespace { const std::string keyword_index[] = { #define RS_TOKEN(x, y) @@ -172,19 +154,18 @@ namespace Rust { }; const int num_keywords = sizeof(keyword_index) / sizeof(*keyword_index); - } + } // namespace - /* Determines whether the string passed in is a keyword or not. If it is, it returns the keyword - * name. */ + /* Determines whether the string passed in is a keyword or not. If it is, it + * returns the keyword name. */ TokenId Lexer::classify_keyword(const std::string& str) { const std::string* last = keyword_index + num_keywords; const std::string* idx = std::lower_bound(keyword_index, last, str); - if (idx == last || str != *idx) { + if (idx == last || str != *idx) return IDENTIFIER; - } else { + else return keyword_keys[idx - keyword_index]; - } } TokenPtr Lexer::build_token() { @@ -195,9 +176,8 @@ namespace Rust { skip_input(); // return end of file token if end of file - if (current_char == EOF) { + if (current_char == EOF) return Token::make(END_OF_FILE, loc); - } // detect shebang if (loc == 1 && current_line == 1 && current_char == '#') { @@ -231,12 +211,13 @@ namespace Rust { // if not end of file, start tokenising switch (current_char) { - // ignore whitespace characters for tokens but continue updating location + /* ignore whitespace characters for tokens but continue updating + * location */ case '\n': // newline current_line++; current_column = 1; // tell line_table that new line starts - linemap_line_start(::line_table, current_line, max_column_hint); + line_map->start_line(current_line, max_column_hint); continue; case ' ': // space current_column++; @@ -509,23 +490,18 @@ namespace Rust { current_column++; return Token::make(RIGHT_CURLY, loc); case '@': - // TODO: i don't know what this does, does it need special handling? current_column++; return Token::make(PATTERN_BIND, loc); case '$': - // TODO: i don't know what this does, does it need special handling? current_column++; return Token::make(DOLLAR_SIGN, loc); case '~': - // TODO: i don't know what this does, does it need special handling? current_column++; return Token::make(TILDE, loc); case '\\': - // TODO: i don't know what this does, does it need special handling? current_column++; return Token::make(BACKSLASH, loc); case '`': - // TODO: i don't know what this does, does it need special handling? current_column++; return Token::make(BACKTICK, loc); case '|': @@ -599,27 +575,21 @@ namespace Rust { // byte and byte string test if (current_char == 'b') { if (peek_input() == '\'') { - // byte - allows any ascii or escapes - // would also have to take into account escapes: \x hex_digit hex_digit, - // \n, \r, \t, \\, \0 + skip_input(); + current_column++; + // make current char the next character + current_char = peek_input(); int length = 1; // char to save - char byte_char; - - skip_input(); - // make current char the next character - current_char = peek_input(); + char byte_char = 0; // detect escapes if (current_char == '\\') { - /*skip_input(); - - // make current_char next character (letter) - current_char = peek_input();*/ - - parse_escape(length, byte_char, '\''); + auto escape_length_pair = parse_escape('\''); + byte_char = escape_length_pair.first; + length += escape_length_pair.second; if (byte_char > 127) { rust_error_at( @@ -627,15 +597,12 @@ namespace Rust { byte_char = 0; } - // skip_input(); current_char = peek_input(); - length++; if (current_char != '\'') { rust_error_at(get_current_location(), "unclosed byte char"); } - // TODO: ensure skipping is needed here skip_input(); current_char = peek_input(); length++; // go to next char @@ -645,12 +612,12 @@ namespace Rust { skip_input(); current_char = peek_input(); + length++; if (current_char != '\'') { rust_error_at(get_current_location(), "unclosed byte char"); } - // TODO: ensure skipping is needed here skip_input(); current_char = peek_input(); length++; // go to next char @@ -666,18 +633,26 @@ namespace Rust { // skip quote character skip_input(); + current_column++; std::string str; str.reserve(16); // some sensible default int length = 1; current_char = peek_input(); - // TODO: handle escapes properly while (current_char != '"' && current_char != '\n') { if (current_char == '\\') { - char output_char = 0; - parse_escape(length, output_char, '"'); + auto escape_length_pair = parse_escape('"'); + char output_char = escape_length_pair.first; + //length += escape_length_pair.second; + + // TODO: need to fix length - after escape, the length of the line up to the next non-whitespace char of the string is added to length, which is not what we want - we want length to be replaced by that. + // possible option could if "if escape_length_pair.first == 0, then length = escape_length_pair.second else length += escape_length_pair.second." + if (output_char == 0) + length = escape_length_pair.second - 1; + else + length += escape_length_pair.second; if (output_char > 127) { rust_error_at(get_current_location(), @@ -685,7 +660,8 @@ namespace Rust { output_char = 0; } - str += output_char; + if (output_char != 0) + str += output_char; continue; } @@ -702,14 +678,18 @@ namespace Rust { if (current_char == '\n') { rust_error_at(get_current_location(), "unended byte string literal"); } else if (current_char == '"') { + // TEST: hopefully column inc should make string line up properly + current_column++; + skip_input(); current_char = peek_input(); } else { - rust_unreachable(); + gcc_unreachable(); } + str.shrink_to_fit(); + return Token::make_byte_string(loc, str); - // TODO: ensure escapes and string continue work properly } else if (peek_input() == 'r' && (peek_input(1) == '#' || peek_input(1) == '"')) { // raw byte string literals std::string str; @@ -721,6 +701,7 @@ namespace Rust { // get hash count at beginnning skip_input(); current_char = peek_input(); + length++; while (current_char == '#') { hash_count++; length++; @@ -735,6 +716,7 @@ namespace Rust { skip_input(); current_char = peek_input(); + length++; while (true) { if (current_char == '"') { @@ -742,8 +724,8 @@ namespace Rust { for (int i = 0; i < hash_count; i++) { if (peek_input(i + 1) != '#') { - enough_hashes - = false; // could continue here - improve performance + enough_hashes = false; // could continue here - + // improve performance } } @@ -765,7 +747,9 @@ namespace Rust { current_column += length; - return Token::make_byte_string(loc, str); // TODO: does this work properly + str.shrink_to_fit(); + + return Token::make_byte_string(loc, str); } } @@ -811,65 +795,79 @@ namespace Rust { rust_error_at( get_current_location(), "'%s' is a forbidden raw identifier", str.c_str()); } else { + str.shrink_to_fit(); + return Token::make_identifier(loc, str); } - } else if (peek == '"' || (peek == '#' && (ISALPHA(peek1) || peek1 == '_'))) { - // raw string literals - std::string str; - str.reserve(16); // some sensible default + } else { + int peek_index = 0; + while (peek_input(peek_index) == '#') + peek_index++; + // TODO: optimise by using "peek_index" as the hash count - 1 or something - int length = 1; - int hash_count = 0; + if (peek_input(peek_index) == '"') { + // raw string literals + std::string str; + str.reserve(16); // some sensible default - // get hash count at beginnning - current_char = peek; - while (current_char == '#') { - hash_count++; - length++; + int length = 1; + int hash_count = 0; + + // get hash count at beginnning + current_char = peek; + while (current_char == '#') { + hash_count++; + length++; + + skip_input(); + current_char = peek_input(); + } + + if (current_char != '"') { + rust_error_at(get_current_location(), "raw string has no opening '\"'"); + } + length++; skip_input(); - current_char = peek_input(); - } + Codepoint current_char32 = test_peek_codepoint_input(); - if (current_char != '"') { - rust_error_at(get_current_location(), "raw string has no opening '\"'"); - } + // TODO: didn't account for current_column++ somewhere - one less than is required - skip_input(); - Codepoint current_char32 = test_peek_codepoint_input(); + while (true) { + if (current_char32.value == '"') { + bool enough_hashes = true; - while (true) { - if (current_char32.value == '"') { - bool enough_hashes = true; + for (int i = 0; i < hash_count; i++) { + // if (test_peek_codepoint_input(i + 1) != '#') { + // TODO: ensure this is a good enough replacement + if (peek_input(i + 1) != '#') { + enough_hashes = false; // could continue here - + // improve performance + } + } - for (int i = 0; i < hash_count; i++) { - // if (test_peek_codepoint_input(i + 1) != '#') { - // TODO: ensure this is a good enough replacement - if (peek_input(i + 1) != '#') { - enough_hashes - = false; // could continue here - improve performance + if (enough_hashes) { + // skip enough input and peek enough input + skip_input(hash_count); // is this enough? + current_char = peek_input(); + length += hash_count + 1; + break; } } - if (enough_hashes) { - // skip enough input and peek enough input - skip_input(hash_count); // is this enough? - current_char = peek_input(); - length += hash_count + 1; - break; - } - } + length++; - length++; + str += current_char32; + test_skip_codepoint_input(); + current_char32 = test_peek_codepoint_input(); + } - str += current_char32; - test_skip_codepoint_input(); - current_char32 = test_peek_codepoint_input(); - } + current_column += length; - current_column += length; + str.shrink_to_fit(); - return Token::make_string(loc, str); // TODO: does this work properly + return Token::make_string(loc, str); + } } } @@ -895,16 +893,16 @@ namespace Rust { current_column += length; // if just a single underscore, not an identifier - if (first_is_underscore && length == 1) { + if (first_is_underscore && length == 1) return Token::make(UNDERSCORE, loc); - } + + str.shrink_to_fit(); TokenId keyword = classify_keyword(str); - if (keyword == IDENTIFIER) { + if (keyword == IDENTIFIER) return Token::make_identifier(loc, str); - } else { + else return Token::make(keyword, loc); - } } // identify literals @@ -958,19 +956,14 @@ namespace Rust { current_column += length; // convert hex value to decimal representation - long hex_num = ::std::strtol(str.c_str(), NULL, 16); - - // create output string stream for hex value to be converted to string - // again - // TODO: if too slow, use sprintf - ::std::ostringstream ostr; - ostr << hex_num; + long hex_num = std::strtol(str.c_str(), NULL, 16); - // reassign string representation to converted value - str = ostr.str(); + str = std::to_string(hex_num); // parse in type suffix if it exists - parse_in_type_suffix(/*current_char, */ type_hint, length); + auto type_suffix_pair = parse_in_type_suffix(); + type_hint = type_suffix_pair.first; + length += type_suffix_pair.second; if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) { rust_error_at(get_current_location(), @@ -985,8 +978,6 @@ namespace Rust { length++; - // don't add any characters as C octals are just 0124 or whatever - // loop through to add entire octal number to string while (is_octal_digit(current_char) || current_char == '_') { if (current_char == '_') { @@ -1010,19 +1001,15 @@ namespace Rust { current_column += length; // convert octal value to decimal representation - long octal_num = ::std::strtol(str.c_str(), NULL, 8); - - // create output string stream for octal value to be converted to - // string again - // TODO: if too slow, use sprintf - ::std::ostringstream ostr; - ostr << octal_num; + long octal_num = std::strtol(str.c_str(), NULL, 8); - // reassign string representation to converted value - str = ostr.str(); + str = std::to_string(octal_num); // parse in type suffix if it exists - parse_in_type_suffix(/*current_char, */ type_hint, length); + // parse_in_type_suffix (/*current_char, */ type_hint, length); + auto type_suffix_pair = parse_in_type_suffix(); + type_hint = type_suffix_pair.first; + length += type_suffix_pair.second; if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) { rust_error_at(get_current_location(), @@ -1037,9 +1024,6 @@ namespace Rust { length++; - // don't add any characters as C binary numbers are not really - // supported - // loop through to add entire binary number to string while (is_bin_digit(current_char) || current_char == '_') { if (current_char == '_') { @@ -1063,19 +1047,15 @@ namespace Rust { current_column += length; // convert binary value to decimal representation - long bin_num = ::std::strtol(str.c_str(), NULL, 2); + long bin_num = std::strtol(str.c_str(), NULL, 2); - // create output string stream for binary value to be converted to - // string again - // TODO: if too slow, use sprintf - ::std::ostringstream ostr; - ostr << bin_num; - - // reassign string representation to converted value - str = ostr.str(); + str = std::to_string(bin_num); // parse in type suffix if it exists - parse_in_type_suffix(/*current_char, */ type_hint, length); + // parse_in_type_suffix (/*current_char, */ type_hint, length); + auto type_suffix_pair = parse_in_type_suffix(); + type_hint = type_suffix_pair.first; + length += type_suffix_pair.second; if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) { rust_error_at(get_current_location(), @@ -1089,10 +1069,13 @@ namespace Rust { current_char = peek_input(); // parse initial decimal literal - assuming integer - // TODO: test if works - parse_in_decimal(/*current_char, */ str, length); + // parse_in_decimal (/*current_char, */ str, length); + auto str_length_pair = parse_in_decimal(); + str += str_length_pair.first; + length += str_length_pair.second; - // detect float literal - TODO: fix: "242." is not recognised as a float literal + // detect float literal - TODO: fix: "242." is not recognised as a + // float literal if (current_char == '.' && is_float_digit(peek_input(1))) { // float with a '.', parse another decimal into it @@ -1106,16 +1089,19 @@ namespace Rust { length++; // parse another decimal number for float - // TODO: test if works - parse_in_decimal(/*current_char, */ str, length); + auto str_length_pair2 = parse_in_decimal(); + str += str_length_pair2.first; + length += str_length_pair2.second; // parse in exponent part if it exists - // test to see if this works: - parse_in_exponent_part(/*current_char, */ str, length); + auto exponent_part = parse_in_exponent_part(); + str += exponent_part.first; + length += exponent_part.second; // parse in type suffix if it exists - // TODO: see if works: - parse_in_type_suffix(/*current_char, */ type_hint, length); + auto type_suffix_pair = parse_in_type_suffix(); + type_hint = type_suffix_pair.first; + length += type_suffix_pair.second; if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64 && type_hint != CORETYPE_UNKNOWN) { @@ -1123,7 +1109,6 @@ namespace Rust { "invalid type suffix '%s' for float literal", get_type_hint_string(type_hint)); } - } else if (current_char == '.' && check_valid_float_dot_end(peek_input(1))) { is_real = true; @@ -1138,13 +1123,6 @@ namespace Rust { // don't parse another decimal number for float - // parse in exponent part if it exists - shouldn't exist? - // parse_in_exponent_part(/*current_char, */ str, length); - - // parse in type suffix if it exists - shouldn't exist? - // TODO: see if works: - // parse_in_type_suffix(/*current_char, */ type_hint, length); - if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64 && type_hint != CORETYPE_UNKNOWN) { rust_error_at(get_current_location(), @@ -1155,10 +1133,16 @@ namespace Rust { is_real = true; // parse exponent part - parse_in_exponent_part(/*current_char, */ str, length); + // parse_in_exponent_part (/*current_char, */ str, length); + auto exponent_part = parse_in_exponent_part(); + str += exponent_part.first; + length += exponent_part.second; // parse in type suffix if it exists - parse_in_type_suffix(/*current_char, */ type_hint, length); + // parse_in_type_suffix (/*current_char, */ type_hint, length); + auto type_suffix_pair = parse_in_type_suffix(); + type_hint = type_suffix_pair.first; + length += type_suffix_pair.second; if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64 && type_hint != CORETYPE_UNKNOWN) { @@ -1170,11 +1154,15 @@ namespace Rust { // is an integer // parse in type suffix if it exists - parse_in_type_suffix(/*current_char, */ type_hint, length); + // parse_in_type_suffix (/*current_char, */ type_hint, length); + auto type_suffix_pair = parse_in_type_suffix(); + type_hint = type_suffix_pair.first; + length += type_suffix_pair.second; if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) { rust_error_at(get_current_location(), - "invalid type suffix '%s' for integer (decimal) literal", + "invalid type suffix '%s' for integer " + "(decimal) literal", get_type_hint_string(type_hint)); } } @@ -1182,12 +1170,13 @@ namespace Rust { current_column += length; } + str.shrink_to_fit(); + // actually make the tokens - if (is_real) { + if (is_real) return Token::make_float(loc, str, type_hint); - } else { + else return Token::make_int(loc, str, type_hint); - } } // string literals - not processed properly @@ -1200,20 +1189,25 @@ namespace Rust { int length = 1; current_char32 = test_peek_codepoint_input(); - // ok initial peek_codepoint seems to work without "too long" - while (current_char32.value != '\n' && current_char32.value != '"') { - // TODO: handle escapes and string continue if (current_char32.value == '\\') { // parse escape - parse_utf8_escape(length, current_char32, '\''); - - // TODO: find a way to parse additional characters after the escape? - // return after parsing escape? - - str += current_char32; + auto utf8_escape_pair = parse_utf8_escape('\''); + current_char32 = utf8_escape_pair.first; + //length += utf8_escape_pair.second; + + // TODO: need to fix length - after escape, the length of the line up to the next non-whitespace char of the string is added to length, which is not what we want - we want length to be replaced by that. + // possible option could if "if escape_length_pair.first == 0, then length = escape_length_pair.second else length += escape_length_pair.second." + if (current_char32 == Codepoint(0)) + length = utf8_escape_pair.second - 1; + else + length += utf8_escape_pair.second; + + if (current_char32 != Codepoint(0)) + str += current_char32; - // required as parsing utf8 escape only changes current_char or something + // required as parsing utf8 escape only changes current_char + // or something current_char32 = test_peek_codepoint_input(); continue; @@ -1221,7 +1215,6 @@ namespace Rust { length += test_get_input_codepoint_length(); - // does this work? not technically a char. maybe have to convert to char series str += current_char32; test_skip_codepoint_input(); current_char32 = test_peek_codepoint_input(); @@ -1232,21 +1225,20 @@ namespace Rust { if (current_char32.value == '\n') { rust_error_at(get_current_location(), "unended string literal"); } else if (current_char32.value == '"') { + current_column++; + skip_input(); - current_char = peek_input(); } else { - rust_unreachable(); + gcc_unreachable(); } + str.shrink_to_fit(); return Token::make_string(loc, str); - // TODO: account for escapes and string continue - // also, in rust a string is a series of unicode characters (4 bytes) } // char literal attempt if (current_char == '\'') { - // rust chars are 4 bytes and have some weird unicode representation thing Codepoint current_char32; int length = 1; @@ -1256,10 +1248,9 @@ namespace Rust { // parse escaped char literal if (current_char32.value == '\\') { // parse escape - parse_utf8_escape(length, current_char32, '\''); - - // TODO - this skip may not be needed? - // test_skip_codepoint_input(); + auto utf8_escape_pair = parse_utf8_escape('\''); + current_char32 = utf8_escape_pair.first; + length += utf8_escape_pair.second; if (test_peek_codepoint_input().value != '\'') { rust_error_at(get_current_location(), "unended char literal"); @@ -1271,7 +1262,6 @@ namespace Rust { current_column += length; - // TODO: FIX - char is actually 4 bytes in Rust (uint32) due to unicode return Token::make_char(loc, current_char32); } else { // current_char32 = test_peek_codepoint_input(); @@ -1279,7 +1269,6 @@ namespace Rust { if (test_peek_codepoint_input().value == '\'') { // parse normal char literal - // TODO: FIX - char is actually 4 bytes in Rust (uint32) due to unicode // skip the ' character skip_input(); @@ -1292,11 +1281,11 @@ namespace Rust { } else if (ISDIGIT(current_char32.value) || ISALPHA(current_char32.value) || current_char32.value == '_') { // parse lifetime name - ::std::string str; - // TODO: does this work properly? + std::string str; str += current_char32; - // TODO: fix lifetime name thing - actually, why am I even using utf-8 here? + /* TODO: fix lifetime name thing - actually, why am I even + * using utf-8 here? */ int length = 1; @@ -1313,6 +1302,7 @@ namespace Rust { current_column += length; + str.shrink_to_fit(); return Token::make_lifetime(loc, str); } else { rust_error_at(get_current_location(), "expected ' after character constant"); @@ -1327,11 +1317,12 @@ namespace Rust { } // Shitty pass-by-reference way of parsing in type suffix. - bool Lexer::parse_in_type_suffix( - /*char& current_char, */ PrimitiveCoreType& type_hint, int& length) { - ::std::string suffix; + std::pair<PrimitiveCoreType, int> Lexer::parse_in_type_suffix() { + std::string suffix; suffix.reserve(5); + int additional_length_offset = 0; + // get suffix while (ISALPHA(current_char) || ISDIGIT(current_char) || current_char == '_') { if (current_char == '_') { @@ -1339,12 +1330,12 @@ namespace Rust { skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; continue; } - length++; + additional_length_offset++; suffix += current_char; skip_input(); @@ -1353,52 +1344,52 @@ namespace Rust { if (suffix.empty()) { // no type suffix: do nothing but also no error - return false; + return std::make_pair(CORETYPE_UNKNOWN, additional_length_offset); } else if (suffix == "f32") { - type_hint = CORETYPE_F32; + return std::make_pair(CORETYPE_F32, additional_length_offset); } else if (suffix == "f64") { - type_hint = CORETYPE_F64; + return std::make_pair(CORETYPE_F64, additional_length_offset); } else if (suffix == "i8") { - type_hint = CORETYPE_I8; + return std::make_pair(CORETYPE_I8, additional_length_offset); } else if (suffix == "i16") { - type_hint = CORETYPE_I16; + return std::make_pair(CORETYPE_I16, additional_length_offset); } else if (suffix == "i32") { - type_hint = CORETYPE_I32; + return std::make_pair(CORETYPE_I32, additional_length_offset); } else if (suffix == "i64") { - type_hint = CORETYPE_I64; + return std::make_pair(CORETYPE_I64, additional_length_offset); } else if (suffix == "i128") { - type_hint = CORETYPE_I128; + return std::make_pair(CORETYPE_I128, additional_length_offset); } else if (suffix == "isize") { - type_hint = CORETYPE_ISIZE; + return std::make_pair(CORETYPE_ISIZE, additional_length_offset); } else if (suffix == "u8") { - type_hint = CORETYPE_U8; + return std::make_pair(CORETYPE_U8, additional_length_offset); } else if (suffix == "u16") { - type_hint = CORETYPE_U16; + return std::make_pair(CORETYPE_U16, additional_length_offset); } else if (suffix == "u32") { - type_hint = CORETYPE_U32; + return std::make_pair(CORETYPE_U32, additional_length_offset); } else if (suffix == "u64") { - type_hint = CORETYPE_U64; + return std::make_pair(CORETYPE_U64, additional_length_offset); } else if (suffix == "u128") { - type_hint = CORETYPE_U128; + return std::make_pair(CORETYPE_U128, additional_length_offset); } else if (suffix == "usize") { - type_hint = CORETYPE_USIZE; + return std::make_pair(CORETYPE_USIZE, additional_length_offset); } else { rust_error_at(get_current_location(), "unknown number suffix '%s'", suffix.c_str()); - return false; + return std::make_pair(CORETYPE_UNKNOWN, additional_length_offset); } - - return true; } - void Lexer::parse_in_exponent_part(/*char& current_char, */ std::string& str, int& length) { + std::pair<std::string, int> Lexer::parse_in_exponent_part() { + int additional_length_offset = 0; + std::string str; if (current_char == 'E' || current_char == 'e') { // add exponent to string as strtod works with it str += current_char; skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; // special - and + handling if (current_char == '-') { @@ -1407,46 +1398,55 @@ namespace Rust { skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; } else if (current_char == '+') { // don't add + but still skip input skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; } // parse another decimal number for exponent - parse_in_decimal(/*current_char, */ str, length); + auto str_length_pair = parse_in_decimal(); + str += str_length_pair.first; + additional_length_offset += str_length_pair.second; } + return std::make_pair(str, additional_length_offset); } - void Lexer::parse_in_decimal(/*char& current_char, */ std::string& str, int& length) { + std::pair<std::string, int> Lexer::parse_in_decimal() { + int additional_length_offset = 0; + std::string str; while (ISDIGIT(current_char) || current_char == '_') { if (current_char == '_') { // don't add _ to number skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; continue; } - length++; + additional_length_offset++; str += current_char; skip_input(); current_char = peek_input(); } + return std::make_pair(str, additional_length_offset); } - // Replace all assorted parse_x_escape with this? Avoids the backwards/peek issue. - bool Lexer::parse_escape(int& length, char& output_char, char opening_char) { + /* Parses escapes (and string continues) in "byte" strings and characters. Does not support unicode. */ + std::pair<char, int> Lexer::parse_escape(char opening_char) { + int additional_length_offset = 0; + char output_char = 0; + // skip to actual letter skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; switch (current_char) { case 'x': { @@ -1456,9 +1456,9 @@ namespace Rust { // first hex char skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; - if (!ISXDIGIT(current_char)) { + if (!is_x_digit(current_char)) { rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence", current_char); } @@ -1467,23 +1467,21 @@ namespace Rust { // second hex char skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; - if (!ISXDIGIT(current_char)) { + if (!is_x_digit(current_char)) { rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence", current_char); } hexNum[1] = current_char; - long hexLong = ::std::strtol(hexNum, NULL, 16); + long hexLong = std::strtol(hexNum, NULL, 16); - if (hexLong > 127) + if (hexLong > 255 || hexLong < 0) rust_error_at(get_current_location(), - "ascii \\x escape '\\x%s' out of range - allows up to '\\x7F'", hexNum); - // gcc_assert(hexLong < 128); // as ascii + "byte \\x escape '\\x%s' out of range - allows up to '\\xFF'", hexNum); char hexChar = static_cast<char>(hexLong); - // TODO: fix - does this actually give the right character? output_char = hexChar; } break; case 'n': @@ -1507,28 +1505,34 @@ namespace Rust { case '"': output_char = '"'; break; - case 'u': { + case 'u': + rust_error_at(get_current_location(), + "cannot have a unicode escape \\u in a byte %s!", + opening_char == '\'' ? "character" : "string"); + return std::make_pair(output_char, additional_length_offset); +#if 0 + { // TODO: shouldn't be used with this - use parse_utf8_escape skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; bool need_close_brace = false; - // TODO: rustc lexer doesn't seem to allow not having { but mrustc lexer does? look at - // spec? + // TODO: rustc lexer doesn't seem to allow not having { but mrustc lexer + // does? look at spec? if (current_char == '{') { need_close_brace = true; skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; } // parse unicode escape // 1-6 hex digits? - ::std::string num_str; + std::string num_str; num_str.reserve(6); // test adding number directly @@ -1541,12 +1545,12 @@ namespace Rust { skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; continue; } - length++; + additional_length_offset++; // add raw hex numbers num_str += current_char; @@ -1554,7 +1558,7 @@ namespace Rust { // test adding number directly char tmp[2] = { current_char, 0 }; test_val *= 16; - test_val += ::std::strtol(tmp, NULL, 16); + test_val += std::strtol(tmp, NULL, 16); skip_input(); current_char = peek_input(); @@ -1565,18 +1569,21 @@ namespace Rust { // actually an error rust_error_at( get_current_location(), "expected terminating '}' in unicode escape"); - return false; + // return false; + return std::make_pair(output_char, additional_length_offset); } // ensure 1-6 hex characters if (num_str.length() > 6 || num_str.length() < 1) { rust_error_at(get_current_location(), - "unicode escape should be between 1 and 6 hex characters; it is %lu", + "unicode escape should be between 1 and 6 hex " + "characters; it is %lu", num_str.length()); - return false; + // return false; + return std::make_pair(output_char, additional_length_offset); } - long hex_num = ::std::strtol(num_str.c_str(), NULL, 16); + long hex_num = std::strtol(num_str.c_str(), NULL, 16); // as debug, check hex_num = test_val if (hex_num > 255) { @@ -1586,12 +1593,14 @@ namespace Rust { } // make output_char the value - UTF-8? - // TODO: actually make this work - output char must be 4 bytes, do I need a string for - // this? + // TODO: actually make this work - output char must be 4 bytes, do I + // need a string for this? output_char = static_cast</*uint32_t*/ char>(hex_num); - return true; + // return true; + return std::make_pair(output_char, additional_length_offset); } break; +#endif case '\r': case '\n': // string continue @@ -1600,10 +1609,10 @@ namespace Rust { current_line++; current_column = 1; // tell line_table that new line starts - linemap_line_start(::line_table, current_line, max_column_hint); + line_map->start_line(current_line, max_column_hint); // reset "length" - length = 1; + additional_length_offset = 1; // get next char skip_input(); @@ -1614,45 +1623,56 @@ namespace Rust { skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; } - if (current_char == '\\') { - parse_escape(length, output_char, opening_char); - return true; - } else if (current_char == opening_char) { + // shouldn't need this +#if 0 + if (current_char == opening_char) { // TODO: does this skip the ' or " character? It shouldn't. output_char = 0; - return true; + // return true; + return std::make_pair(output_char, additional_length_offset); } else { - output_char = current_char; + // TODO: shouldn't this make output_char null so that it isn't added to string? + // or check for escape being zero? + output_char = /*current_char*/0; // TODO: test has right result /*skip_input(); current_char = peek_input();*/ - return true; + // return true; + return std::make_pair(output_char, additional_length_offset); } +#endif + return std::make_pair(0, additional_length_offset); default: rust_error_at(get_current_location(), "unknown escape sequence '\\%c'", current_char); // returns false if no parsing could be done - return false; + // return false; + return std::make_pair(output_char, additional_length_offset); break; } - // all non-special cases (unicode, string continue) should skip their used char + // all non-special cases (string continue) should skip their used char skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; // returns true if parsing was successful - return true; + // return true; + return std::make_pair(output_char, additional_length_offset); } - bool Lexer::parse_utf8_escape(int& length, Codepoint& output_char, char opening_char) { + // Parses an escape (or string continue) in a string or character. Supports unicode escapes. + std::pair<Codepoint, int> Lexer::parse_utf8_escape(char opening_char) { + Codepoint output_char; + int additional_length_offset = 0; + // skip to actual letter skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; switch (current_char) { case 'x': { @@ -1662,9 +1682,9 @@ namespace Rust { // first hex char skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; - if (!ISXDIGIT(current_char)) { + if (!is_x_digit(current_char)) { rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence", current_char); } @@ -1673,15 +1693,15 @@ namespace Rust { // second hex char skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; - if (!ISXDIGIT(current_char)) { + if (!is_x_digit(current_char)) { rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence", current_char); } hexNum[1] = current_char; - long hexLong = ::std::strtol(hexNum, NULL, 16); + long hexLong = std::strtol(hexNum, NULL, 16); if (hexLong > 127) rust_error_at(get_current_location(), @@ -1689,7 +1709,6 @@ namespace Rust { // gcc_assert(hexLong < 128); // as ascii char hexChar = static_cast<char>(hexLong); - // TODO: fix - does this actually give the right character? output_char = hexChar; } break; case 'n': @@ -1716,28 +1735,21 @@ namespace Rust { case 'u': { skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; bool need_close_brace = false; - - // TODO: rustc lexer doesn't seem to allow not having { but mrustc lexer does? look at - // spec? if (current_char == '{') { need_close_brace = true; skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; } - // parse unicode escape - // 1-6 hex digits? - ::std::string num_str; + // parse unicode escape - 1-6 hex digits + std::string num_str; num_str.reserve(6); - // test adding number directly - uint32_t test_val; - // loop through to add entire hex number to string while (is_x_digit(current_char) || current_char == '_') { if (current_char == '_') { @@ -1745,21 +1757,16 @@ namespace Rust { skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; continue; } - length++; + additional_length_offset++; // add raw hex numbers num_str += current_char; - // test adding number directly - char tmp[2] = { current_char, 0 }; - test_val *= 16; - test_val += ::std::strtol(tmp, NULL, 16); - skip_input(); current_char = peek_input(); } @@ -1769,36 +1776,38 @@ namespace Rust { if (current_char == '}') { skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; } else { // actually an error rust_error_at( get_current_location(), "expected terminating '}' in unicode escape"); - return false; + // return false; + return std::make_pair(output_char, additional_length_offset); } } // ensure 1-6 hex characters if (num_str.length() > 6 || num_str.length() < 1) { rust_error_at(get_current_location(), - "unicode escape should be between 1 and 6 hex characters; it is %lu", + "unicode escape should be between 1 and 6 hex " + "characters; it is %lu", num_str.length()); - return false; + // return false; + return std::make_pair(output_char, additional_length_offset); } - long hex_num = ::std::strtol(num_str.c_str(), NULL, 16); + long hex_num = std::strtol(num_str.c_str(), NULL, 16); // assert fits a uint32_t - rust_assert(hex_num < 4294967296); + gcc_assert(hex_num < 4294967296); - // ok can't figure out how to just convert to codepoint or use "this" so create new - // one output_char = Codepoint(static_cast<uint32_t>(hex_num)); - // TODO: what is being outputted? the escape code for the unicode char (unicode - // number) or the character number? + // TODO: what is being outputted? the escape code for the unicode char + // (unicode number) or the character number? - return true; + // return true; + return std::make_pair(output_char, additional_length_offset); } break; case '\r': case '\n': @@ -1808,10 +1817,10 @@ namespace Rust { current_line++; current_column = 1; // tell line_table that new line starts - linemap_line_start(::line_table, current_line, max_column_hint); + line_map->start_line(current_line, max_column_hint); // reset "length" - length = 1; + additional_length_offset = 1; // get next char skip_input(); @@ -1822,38 +1831,39 @@ namespace Rust { skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; } - if (current_char == '\\') { - parse_utf8_escape(length, output_char, opening_char); - return true; - } else if (current_char == opening_char) { - // TODO: does this skip the ' or " character? It shouldn't. + // shouldn't need this +#if 0 + if (current_char == opening_char) { output_char = 0; - return true; + // return true; + return std::make_pair(output_char, additional_length_offset); } else { - output_char = current_char; + output_char = /*current_char*/0; - // TODO: test has right result - /*skip_input(); - current_char = peek_input();*/ - - return true; + // return true; + return std::make_pair(output_char, additional_length_offset); } +#endif + return std::make_pair(0, additional_length_offset); default: rust_error_at(get_current_location(), "unknown escape sequence '\\%c'", current_char); // returns false if no parsing could be done - return false; + // return false; + return std::make_pair(output_char, additional_length_offset); break; } - // all non-special cases (unicode, string continue) should skip their used char + /* all non-special cases (unicode, string continue) should skip their used + * char */ skip_input(); current_char = peek_input(); - length++; + additional_length_offset++; // returns true if parsing was successful - return true; + // return true; + return std::make_pair(output_char, additional_length_offset); } #if 0 @@ -2102,6 +2112,7 @@ namespace Rust { } #endif + // Returns the length of the codepoint at the current position. int Lexer::test_get_input_codepoint_length() { uint8_t input = peek_input(); @@ -2138,8 +2149,8 @@ namespace Rust { // return 0xFFFE; /*uint32_t output - = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0); - return output;*/ + = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << + 0); return output;*/ return 3; } else if ((input & 0xF8) == 0xF0) { // 4 bytes @@ -2168,7 +2179,7 @@ namespace Rust { } } - // TODO: rewrite lexing system to use utf-8 "codepoints" rather than bytes? + // Returns the codepoint at the current position. Codepoint Lexer::test_peek_codepoint_input() { uint8_t input = peek_input(); @@ -2224,7 +2235,7 @@ namespace Rust { void Lexer::test_skip_codepoint_input() { int toSkip = test_get_input_codepoint_length(); - rust_assert(toSkip >= 1); + gcc_assert(toSkip >= 1); skip_input(toSkip - 1); } @@ -2263,8 +2274,8 @@ namespace Rust { return 0; /*uint32_t output - = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0); - return output;*/ + = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << + 0); return output;*/ return 3; } else if ((input & 0xF8) == 0xF0) { // 4 bytes @@ -2293,7 +2304,8 @@ namespace Rust { } } - // peeks the codepoint input at n codepoints ahead of current codepoint - try not to use + // peeks the codepoint input at n codepoints ahead of current codepoint - try + // not to use Codepoint Lexer::test_peek_codepoint_input(int n) { int totalOffset = 0; @@ -2301,12 +2313,13 @@ namespace Rust { for (int i = 0; i < n; i++) { totalOffset += test_get_input_codepoint_n_length(totalOffset); } - // issues: this would have (at least) O(n) lookup time, not O(1) like the rest? + // issues: this would have (at least) O(n) lookup time, not O(1) like the + // rest? // TODO: implement if still needed // error out of function as it is not implemented - rust_assert(1 == 0); + gcc_assert(1 == 0); return { 0 }; /* uint8_t input = peek_input(); @@ -2336,9 +2349,8 @@ namespace Rust { return 0xFFFE; uint32_t output - = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0); - return output; - } else if ((input & 0xF8) == 0xF0) { + = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & + 0x3F) << 0); return output; } else if ((input & 0xF8) == 0xF0) { // 4 bytes uint8_t input2 = peek_input(1); if ((input2 & 0xC0) != 0x80) @@ -2353,11 +2365,9 @@ namespace Rust { return 0xFFFE; uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) - | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); - return output; - } else { - rust_error_at(get_current_location(), "invalid UTF-8 (too long)"); - return 0xFFFE; + | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << + 0); return output; } else { rust_error_at(get_current_location(), "invalid + UTF-8 (too long)"); return 0xFFFE; }*/ } -} +} // namespace Rust diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h index 8dc3e31..5220753 100644 --- a/gcc/rust/lex/rust-lex.h +++ b/gcc/rust/lex/rust-lex.h @@ -5,6 +5,8 @@ #include "rust-buffered-queue.h" #include "rust-token.h" +#include <utility> + namespace Rust { class Lexer { @@ -31,35 +33,39 @@ private: // ok maybe all these may mean the lexer structure needs to be rethought /* separated into functions because main method was too long, but they rely on * and change state in the lexer, so variables must be passed by reference. */ - inline void parse_in_decimal (/*char& current_char, */ std::string &str, - int &length); - inline void parse_in_exponent_part (/*char& current_char, */ std::string &str, - int &length); - inline bool parse_in_type_suffix ( - /*char& current_char, */ PrimitiveCoreType &type_hint, int &length); - inline bool parse_ascii_escape (/*char& current_char, */ int &length, - char &output_char); - inline bool parse_quote_escape (/*char& current_char, */ int &length, - char &output_char); - inline bool parse_unicode_escape ( - /*char& current_char, */ int &length, Codepoint &output_char); - inline bool parse_byte_escape (/*char& current_char, */ int &length, - char &output_char); - inline bool parse_escape (int &length, char &output_char, char opening_char); - inline bool parse_utf8_escape (int &length, Codepoint &output_char, - char opening_char); - inline int test_get_input_codepoint_length (); - inline int test_get_input_codepoint_n_length (int n_start_offset); - inline Codepoint test_peek_codepoint_input (); - inline Codepoint test_peek_codepoint_input ( + std::pair<std::string, int> parse_in_decimal (); + std::pair<std::string, int> parse_in_exponent_part (); + std::pair<PrimitiveCoreType, int> parse_in_type_suffix (); + /*bool parse_ascii_escape (int &length, + char &output_char);*/ + /*bool parse_quote_escape (char& current_char, int &length, + char &output_char);*/ + /*bool parse_unicode_escape ( + char& current_char, int &length, Codepoint &output_char);*/ + /*bool parse_byte_escape (char& current_char, int &length, + char &output_char);*/ + std::pair<char, int> parse_escape (char opening_char); + std::pair<Codepoint, int> parse_utf8_escape (char opening_char); + int test_get_input_codepoint_length (); + int test_get_input_codepoint_n_length (int n_start_offset); + Codepoint test_peek_codepoint_input (); + Codepoint test_peek_codepoint_input ( int n); // maybe can use get_input_codepoint_length to get starting index - inline void test_skip_codepoint_input (); + void test_skip_codepoint_input (); public: // Construct lexer with input file and filename provided Lexer (const char *filename, FILE *input, Linemap *linemap); ~Lexer (); + // don't allow copy semantics (for now, at least) + Lexer (const Lexer &other) = delete; + Lexer &operator= (const Lexer &other) = delete; + + // enable move semantics + Lexer (Lexer &&other) = default; + Lexer &operator= (Lexer &&other) = default; + // Returns token n tokens ahead of current position. const_TokenPtr peek_token (int n); // Peeks the current token. diff --git a/gcc/rust/lex/rust-token.h b/gcc/rust/lex/rust-token.h index 2270fa2..9dd5f0b 100644 --- a/gcc/rust/lex/rust-token.h +++ b/gcc/rust/lex/rust-token.h @@ -53,7 +53,6 @@ enum PrimitiveCoreType // note that abstract, async, become, box, do, final, macro, override, priv, // try, typeof, unsized, virtual, and yield are unused -// TODO finish converting to rust keywords #define RS_TOKEN_LIST \ RS_TOKEN (FIRST_TOKEN, "<first-token-marker>") \ RS_TOKEN (END_OF_FILE, "end of file") \ diff --git a/gcc/rust/rust-linemap.cc b/gcc/rust/rust-linemap.cc index fef4603..5ee76bd 100644 --- a/gcc/rust/rust-linemap.cc +++ b/gcc/rust/rust-linemap.cc @@ -72,7 +72,8 @@ Gcc_linemap::to_string (Location location) // Strip the source file down to the base file, to reduce clutter. std::stringstream ss; - ss << lbasename (path) << ":" << SOURCE_LINE (lmo, location.gcc_location ()); + ss << lbasename (path) << ":" << SOURCE_LINE (lmo, location.gcc_location ()) + << ":" << SOURCE_COLUMN (lmo, location.gcc_location ()); return ss.str (); } diff --git a/gcc/rust/rust-session-manager.cc b/gcc/rust/rust-session-manager.cc index 4308ae3..29933d5 100644 --- a/gcc/rust/rust-session-manager.cc +++ b/gcc/rust/rust-session-manager.cc @@ -340,7 +340,7 @@ Session::enable_dump (::std::string arg) { error_at ( UNKNOWN_LOCATION, - "dumping all is not supported as of now. choose 'lex' or 'parse'"); + "dumping all is not supported as of now. choose 'lex', 'parse', or 'target_options"); return false; } else if (arg == "lex") @@ -379,13 +379,13 @@ Session::enable_dump (::std::string arg) else if (arg == "") { error_at (UNKNOWN_LOCATION, - "dump option was not given a name. choose 'lex' or 'parse'"); + "dump option was not given a name. choose 'lex', 'parse', or 'target_options'"); return false; } else { error_at (UNKNOWN_LOCATION, - "dump option '%s' was unrecognised. choose 'lex' or 'parse'", + "dump option '%s' was unrecognised. choose 'lex', 'parse', or 'target_options", arg.c_str ()); return false; } |