Lexer cleanup

author: SimplyTheOther <simplytheother@gmail.com> 2020-08-20 11:51:25 +0800
committer: Philip Herron <philip.herron@embecosm.com> 2020-11-28 21:13:20 +0000
commit: b758ec724cc06cb866a72ce17dbfd8a426cf21db (patch)
tree: cffb4e7c584f7da75ea42e57011c5a1d160d67a2 /gcc
parent: c26f60f6a28394e98ac1d830cbe8f632ef576dbb (diff)
download: gcc-b758ec724cc06cb866a72ce17dbfd8a426cf21db.zip
gcc-b758ec724cc06cb866a72ce17dbfd8a426cf21db.tar.gz
gcc-b758ec724cc06cb866a72ce17dbfd8a426cf21db.tar.bz2
6 files changed, 426 insertions, 405 deletions
diff --git a/gcc/rust/lex/rust-codepoint.h b/gcc/rust/lex/rust-codepoint.h
index 0f2e5bd..d95bfdf 100644
--- a/gcc/rust/lex/rust-codepoint.h
+++ b/gcc/rust/lex/rust-codepoint.h
@@ -1,11 +1,6 @@
 #ifndef RUST_CODEPOINT_H
 #define RUST_CODEPOINT_H
 
-#include "config.h"
-#include "system.h"
-#include "coretypes.h"
-// config, system, coretypes - TODO: ensure all are needed
-
 #include <string>
 
 namespace Rust {
@@ -16,11 +11,21 @@ struct Codepoint
   // Creates a zero codepoint.
   Codepoint () : value (0) {}
 
-  // Creates a codepoint from UTF-8 value.
-  Codepoint (uint32_t value_) : value (value_) {}
+  // Creates a codepoint from an encoded UTF-8 value.
+  Codepoint (uint32_t value) : value (value) {}
+
+  // Returns a C++ string containing string value of codepoint.
+  std::string as_string ();
+
+  bool operator== (Codepoint other) const
+  {
+    return value == other.value;
+  }
 
-  // Returns a C++ string containing value of codepoint.
-  ::std::string as_string ();
+  bool operator!= (Codepoint other) const
+  {
+    return !operator== (other);
+  }
 };
 } // namespace Rust
 
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index 54adf48..6f50e21 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -1,6 +1,6 @@
 #include "rust-lex.h"
 
-#include "rust-system.h" // for rust_assert and rust_unreachable
+#include "rust-system.h"      // for rust_assert and rust_unreachable
 #include "rust-diagnostics.h" // for rust_error_at
 #include "rust-linemap.h"
 #include "safe-ctype.h"
@@ -10,7 +10,7 @@
 namespace Rust {
     // TODO: move to separate compilation unit?
     // overload += for uint32_t to allow 32-bit encoded utf-8 to be added
-    ::std::string& operator+=(::std::string& str, Codepoint char32) {
+    std::string& operator+=(std::string& str, Codepoint char32) {
         if (char32.value < 0x80) {
             str += static_cast<char>(char32.value);
         } else if (char32.value < (0x1F + 1) << (1 * 6)) {
@@ -27,64 +27,45 @@ namespace Rust {
             str += static_cast<char>(0x80 | ((char32.value >> 0) & 0x3F));
         } else {
             fprintf(stderr, "Invalid unicode codepoint found: '%u' \n", char32.value);
-            // rust_error_at(get_current_location(), "Invalid unicode codepoint found: '%u'",
-            // char32.value);
         }
         return str;
     }
 
-    ::std::string Codepoint::as_string() {
+    std::string Codepoint::as_string() {
         std::string str;
 
-        // do i need to do this? or can i just do str += value due to op overloading?
-
-        // ok can't figure out how to just convert to codepoint or use "this" so create new one
-        str += Codepoint(value);
-
-        /*if (value < 0x80) {
-            str += static_cast<char>(value);
-        } else if (value < (0x1F + 1) << (1 * 6)) {
-            str += static_cast<char>(0xC0 | ((value >> 6) & 0x1F));
-            str += static_cast<char>(0x80 | ((value >> 0) & 0x3F));
-        } else if (value < (0x0F + 1) << (2 * 6)) {
-            str += static_cast<char>(0xE0 | ((value >> 12) & 0x0F));
-            str += static_cast<char>(0x80 | ((value >> 6) & 0x3F));
-            str += static_cast<char>(0x80 | ((value >> 0) & 0x3F));
-        } else if (value < (0x07 + 1) << (3 * 6)) {
-            str += static_cast<char>(0xF0 | ((value >> 18) & 0x07));
-            str += static_cast<char>(0x80 | ((value >> 12) & 0x3F));
-            str += static_cast<char>(0x80 | ((value >> 6) & 0x3F));
-            str += static_cast<char>(0x80 | ((value >> 0) & 0x3F));
-        } else {
-            rust_error_at(get_current_location(), "Invalid unicode codepoint found: '%u'", value);
-        }*/
+        // str += Codepoint (value);
+        str += *this;
+
         return str;
     }
 
-    // Includes all allowable float digits EXCEPT _ and . as that needs lookahead for handling.
-    inline bool is_float_digit(char number) {
+    /* Includes all allowable float digits EXCEPT _ and . as that needs lookahead
+     * for handling. */
+    bool is_float_digit(char number) {
         return ISDIGIT(number) || number == 'E' || number == 'e';
     }
 
-    // Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or whatever is different
-    inline bool is_x_digit(char number) {
+    /* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or
+     * whatever is different */
+    bool is_x_digit(char number) {
         return ISXDIGIT(number);
     }
 
-    inline bool is_octal_digit(char number) {
+    bool is_octal_digit(char number) {
         return number >= '0' && number <= '7';
     }
 
-    inline bool is_bin_digit(char number) {
+    bool is_bin_digit(char number) {
         return number == '0' || number == '1';
     }
 
-    inline bool check_valid_float_dot_end(char character) {
+    bool check_valid_float_dot_end(char character) {
         return character != '.' && character != '_' && !ISALPHA(character);
     }
 
     // ISSPACE from safe-ctype but may change in future
-    inline bool is_whitespace(char character) {
+    bool is_whitespace(char character) {
         return ISSPACE(character);
     }
 
@@ -96,21 +77,22 @@ namespace Rust {
     }
 
     Lexer::~Lexer() {
-        /* ok apparently stop (which is equivalent of original code in destructor) is meant to be
-         * called after all files have finished parsing, for cleanup. On the other hand, actual code
-         * that it calls to leave a certain line map is mentioned in GCC docs as being useful for
-         * "just leaving an included header" and stuff like that, so this line mapping functionality
-         * may need fixing.
+        /* ok apparently stop (which is equivalent of original code in destructor) is
+         * meant to be called after all files have finished parsing, for cleanup. On
+         * the other hand, actual code that it calls to leave a certain line map is
+         * mentioned in GCC docs as being useful for "just leaving an included header"
+         * and stuff like that, so this line mapping functionality may need fixing.
          * FIXME: find out whether this occurs. */
         // line_map->stop();
     }
 
-    // TODO: need to optimise somehow to avoid the virtual function call in the tight loop.
-    // Best idea at the moment is CRTP, but that might make lexer implementation annoying when storing
-    // the "base class" (i.e. would need template parameter everywhere), although in practice it would
-    // mostly just look ugly and make enclosing classes like Parser also require a type parameter.
-    // At this point a macro might be better.
-    // OK I guess macros can be replaced by constexpr if or something if possible.
+    /* TODO: need to optimise somehow to avoid the virtual function call in the
+     * tight loop. Best idea at the moment is CRTP, but that might make lexer
+     * implementation annoying when storing the "base class" (i.e. would need
+     * template parameter everywhere), although in practice it would mostly just
+     * look ugly and make enclosing classes like Parser also require a type
+     * parameter. At this point a macro might be better. OK I guess macros can be
+     * replaced by constexpr if or something if possible. */
     Location Lexer::get_current_location() {
         return line_map->get_location(current_column);
     }
@@ -151,9 +133,9 @@ namespace Rust {
         token_queue.replace_current_value(replacement);
     }
 
-    /* shitty anonymous namespace that can only be accessed inside the compilation unit - used for
-     * classify_keyword
-     * Binary search in sorted array of keywords created with x-macros. */
+    /* Anonymous namespace that can only be accessed inside the compilation
+     * unit - used for classify_keyword. Binary search in sorted array of keywords
+     * created with x-macros. */
     namespace {
         const std::string keyword_index[] = {
 #define RS_TOKEN(x, y)
@@ -172,19 +154,18 @@ namespace Rust {
         };
 
         const int num_keywords = sizeof(keyword_index) / sizeof(*keyword_index);
-    }
+    } // namespace
 
-    /* Determines whether the string passed in is a keyword or not. If it is, it returns the keyword
-     * name.  */
+    /* Determines whether the string passed in is a keyword or not. If it is, it
+     * returns the keyword name.  */
     TokenId Lexer::classify_keyword(const std::string& str) {
         const std::string* last = keyword_index + num_keywords;
         const std::string* idx = std::lower_bound(keyword_index, last, str);
 
-        if (idx == last || str != *idx) {
+        if (idx == last || str != *idx)
             return IDENTIFIER;
-        } else {
+        else
             return keyword_keys[idx - keyword_index];
-        }
     }
 
     TokenPtr Lexer::build_token() {
@@ -195,9 +176,8 @@ namespace Rust {
             skip_input();
 
             // return end of file token if end of file
-            if (current_char == EOF) {
+            if (current_char == EOF) 
                 return Token::make(END_OF_FILE, loc);
-            }
 
             // detect shebang
             if (loc == 1 && current_line == 1 && current_char == '#') {
@@ -231,12 +211,13 @@ namespace Rust {
 
             // if not end of file, start tokenising
             switch (current_char) {
-                // ignore whitespace characters for tokens but continue updating location
+                /* ignore whitespace characters for tokens but continue updating
+                 * location */
                 case '\n': // newline
                     current_line++;
                     current_column = 1;
                     // tell line_table that new line starts
-                    linemap_line_start(::line_table, current_line, max_column_hint);
+                    line_map->start_line(current_line, max_column_hint);
                     continue;
                 case ' ': // space
                     current_column++;
@@ -509,23 +490,18 @@ namespace Rust {
                     current_column++;
                     return Token::make(RIGHT_CURLY, loc);
                 case '@':
-                    // TODO: i don't know what this does, does it need special handling?
                     current_column++;
                     return Token::make(PATTERN_BIND, loc);
                 case '$':
-                    // TODO: i don't know what this does, does it need special handling?
                     current_column++;
                     return Token::make(DOLLAR_SIGN, loc);
                 case '~':
-                    // TODO: i don't know what this does, does it need special handling?
                     current_column++;
                     return Token::make(TILDE, loc);
                 case '\\':
-                    // TODO: i don't know what this does, does it need special handling?
                     current_column++;
                     return Token::make(BACKSLASH, loc);
                 case '`':
-                    // TODO: i don't know what this does, does it need special handling?
                     current_column++;
                     return Token::make(BACKTICK, loc);
                 case '|':
@@ -599,27 +575,21 @@ namespace Rust {
             // byte and byte string test
             if (current_char == 'b') {
                 if (peek_input() == '\'') {
-                    // byte - allows any ascii or escapes
-                    // would also have to take into account escapes: \x hex_digit hex_digit,
-                    // \n, \r, \t, \\, \0
+                    skip_input();
+                    current_column++;
+                    // make current char the next character
+                    current_char = peek_input();
 
                     int length = 1;
 
                     // char to save
-                    char byte_char;
-
-                    skip_input();
-                    // make current char the next character
-                    current_char = peek_input();
+                    char byte_char = 0;
 
                     // detect escapes
                     if (current_char == '\\') {
-                        /*skip_input();
-
-                        // make current_char next character (letter)
-                        current_char = peek_input();*/
-
-                        parse_escape(length, byte_char, '\'');
+                        auto escape_length_pair = parse_escape('\'');
+                        byte_char = escape_length_pair.first;
+                        length += escape_length_pair.second;
 
                         if (byte_char > 127) {
                             rust_error_at(
@@ -627,15 +597,12 @@ namespace Rust {
                             byte_char = 0;
                         }
 
-                        // skip_input();
                         current_char = peek_input();
-                        length++;
 
                         if (current_char != '\'') {
                             rust_error_at(get_current_location(), "unclosed byte char");
                         }
 
-                        // TODO: ensure skipping is needed here
                         skip_input();
                         current_char = peek_input();
                         length++; // go to next char
@@ -645,12 +612,12 @@ namespace Rust {
 
                         skip_input();
                         current_char = peek_input();
+                        length++;
 
                         if (current_char != '\'') {
                             rust_error_at(get_current_location(), "unclosed byte char");
                         }
 
-                        // TODO: ensure skipping is needed here
                         skip_input();
                         current_char = peek_input();
                         length++; // go to next char
@@ -666,18 +633,26 @@ namespace Rust {
 
                     // skip quote character
                     skip_input();
+                    current_column++;
 
                     std::string str;
                     str.reserve(16); // some sensible default
 
                     int length = 1;
                     current_char = peek_input();
-                    // TODO: handle escapes properly
 
                     while (current_char != '"' && current_char != '\n') {
                         if (current_char == '\\') {
-                            char output_char = 0;
-                            parse_escape(length, output_char, '"');
+                            auto escape_length_pair = parse_escape('"');
+                            char output_char = escape_length_pair.first;
+                            //length += escape_length_pair.second;
+
+                            // TODO: need to fix length - after escape, the length of the line up to the next non-whitespace char of the string is added to length, which is not what we want - we want length to be replaced by that.
+                            // possible option could be "if escape_length_pair.first == 0, then length = escape_length_pair.second else length += escape_length_pair.second."
+                            if (output_char == 0)
+                                length = escape_length_pair.second - 1; 
+                            else
+                                length += escape_length_pair.second;
 
                             if (output_char > 127) {
                                 rust_error_at(get_current_location(),
@@ -685,7 +660,8 @@ namespace Rust {
                                 output_char = 0;
                             }
 
-                            str += output_char;
+                            if (output_char != 0)
+                                str += output_char;
 
                             continue;
                         }
@@ -702,14 +678,18 @@ namespace Rust {
                     if (current_char == '\n') {
                         rust_error_at(get_current_location(), "unended byte string literal");
                     } else if (current_char == '"') {
+                        // TEST: hopefully column inc should make string line up properly
+                        current_column++;
+
                         skip_input();
                         current_char = peek_input();
                     } else {
-                        rust_unreachable();
+                        gcc_unreachable();
                     }
 
+                    str.shrink_to_fit();
+
                     return Token::make_byte_string(loc, str);
-                    // TODO: ensure escapes and string continue work properly
                 } else if (peek_input() == 'r' && (peek_input(1) == '#' || peek_input(1) == '"')) {
                     // raw byte string literals
                     std::string str;
@@ -721,6 +701,7 @@ namespace Rust {
                     // get hash count at beginnning
                     skip_input();
                     current_char = peek_input();
+                    length++;
                     while (current_char == '#') {
                         hash_count++;
                         length++;
@@ -735,6 +716,7 @@ namespace Rust {
 
                     skip_input();
                     current_char = peek_input();
+                    length++;
 
                     while (true) {
                         if (current_char == '"') {
@@ -742,8 +724,8 @@ namespace Rust {
 
                             for (int i = 0; i < hash_count; i++) {
                                 if (peek_input(i + 1) != '#') {
-                                    enough_hashes
-                                      = false; // could continue here - improve performance
+                                    enough_hashes = false; // could continue here -
+                                                           // improve performance
                                 }
                             }
 
@@ -765,7 +747,9 @@ namespace Rust {
 
                     current_column += length;
 
-                    return Token::make_byte_string(loc, str); // TODO: does this work properly
+                    str.shrink_to_fit();
+
+                    return Token::make_byte_string(loc, str);
                 }
             }
 
@@ -811,65 +795,79 @@ namespace Rust {
                         rust_error_at(
                           get_current_location(), "'%s' is a forbidden raw identifier", str.c_str());
                     } else {
+                        str.shrink_to_fit();
+
                         return Token::make_identifier(loc, str);
                     }
-                } else if (peek == '"' || (peek == '#' && (ISALPHA(peek1) || peek1 == '_'))) {
-                    // raw string literals
-                    std::string str;
-                    str.reserve(16); // some sensible default
+                } else {
+                    int peek_index = 0;
+                    while (peek_input(peek_index) == '#')
+                        peek_index++;
+                    // TODO: optimise by using "peek_index" as the hash count - 1 or something
 
-                    int length = 1;
-                    int hash_count = 0;
+                    if (peek_input(peek_index) == '"') {
+                        // raw string literals
+                        std::string str;
+                        str.reserve(16); // some sensible default
 
-                    // get hash count at beginnning
-                    current_char = peek;
-                    while (current_char == '#') {
-                        hash_count++;
-                        length++;
+                        int length = 1;
+                        int hash_count = 0;
+
+                        // get hash count at beginning
+                        current_char = peek;
+                        while (current_char == '#') {
+                            hash_count++;
+                            length++;
+
+                            skip_input();
+                            current_char = peek_input();
+                        }
+
+                        if (current_char != '"') {
+                            rust_error_at(get_current_location(), "raw string has no opening '\"'");
+                        }
 
+                        length++;
                         skip_input();
-                        current_char = peek_input();
-                    }
+                        Codepoint current_char32 = test_peek_codepoint_input();
 
-                    if (current_char != '"') {
-                        rust_error_at(get_current_location(), "raw string has no opening '\"'");
-                    }
+                        // TODO: didn't account for current_column++ somewhere - one less than is required
 
-                    skip_input();
-                    Codepoint current_char32 = test_peek_codepoint_input();
+                        while (true) {
+                            if (current_char32.value == '"') {
+                                bool enough_hashes = true;
 
-                    while (true) {
-                        if (current_char32.value == '"') {
-                            bool enough_hashes = true;
+                                for (int i = 0; i < hash_count; i++) {
+                                    // if (test_peek_codepoint_input(i + 1) != '#') {
+                                    // TODO: ensure this is a good enough replacement
+                                    if (peek_input(i + 1) != '#') {
+                                        enough_hashes = false; // could continue here -
+                                                               // improve performance
+                                    }
+                                }
 
-                            for (int i = 0; i < hash_count; i++) {
-                                // if (test_peek_codepoint_input(i + 1) != '#') {
-                                // TODO: ensure this is a good enough replacement
-                                if (peek_input(i + 1) != '#') {
-                                    enough_hashes
-                                      = false; // could continue here - improve performance
+                                if (enough_hashes) {
+                                    // skip enough input and peek enough input
+                                    skip_input(hash_count); // is this enough?
+                                    current_char = peek_input();
+                                    length += hash_count + 1;
+                                    break;
                                 }
                             }
 
-                            if (enough_hashes) {
-                                // skip enough input and peek enough input
-                                skip_input(hash_count); // is this enough?
-                                current_char = peek_input();
-                                length += hash_count + 1;
-                                break;
-                            }
-                        }
+                            length++;
 
-                        length++;
+                            str += current_char32;
+                            test_skip_codepoint_input();
+                            current_char32 = test_peek_codepoint_input();
+                        }
 
-                        str += current_char32;
-                        test_skip_codepoint_input();
-                        current_char32 = test_peek_codepoint_input();
-                    }
+                        current_column += length;
 
-                    current_column += length;
+                        str.shrink_to_fit();
 
-                    return Token::make_string(loc, str); // TODO: does this work properly
+                        return Token::make_string(loc, str);
+                    }
                 }
             }
 
@@ -895,16 +893,16 @@ namespace Rust {
                 current_column += length;
 
                 // if just a single underscore, not an identifier
-                if (first_is_underscore && length == 1) {
+                if (first_is_underscore && length == 1)
                     return Token::make(UNDERSCORE, loc);
-                }
+
+                str.shrink_to_fit();
 
                 TokenId keyword = classify_keyword(str);
-                if (keyword == IDENTIFIER) {
+                if (keyword == IDENTIFIER)
                     return Token::make_identifier(loc, str);
-                } else {
+                else
                     return Token::make(keyword, loc);
-                }
             }
 
             // identify literals
@@ -958,19 +956,14 @@ namespace Rust {
                         current_column += length;
 
                         // convert hex value to decimal representation
-                        long hex_num = ::std::strtol(str.c_str(), NULL, 16);
-
-                        // create output string stream for hex value to be converted to string
-                        // again
-                        // TODO: if too slow, use sprintf
-                        ::std::ostringstream ostr;
-                        ostr << hex_num;
+                        long hex_num = std::strtol(str.c_str(), NULL, 16);
 
-                        // reassign string representation to converted value
-                        str = ostr.str();
+                        str = std::to_string(hex_num);
 
                         // parse in type suffix if it exists
-                        parse_in_type_suffix(/*current_char, */ type_hint, length);
+                        auto type_suffix_pair = parse_in_type_suffix();
+                        type_hint = type_suffix_pair.first;
+                        length += type_suffix_pair.second;
 
                         if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) {
                             rust_error_at(get_current_location(),
@@ -985,8 +978,6 @@ namespace Rust {
 
                         length++;
 
-                        // don't add any characters as C octals are just 0124 or whatever
-
                         // loop through to add entire octal number to string
                         while (is_octal_digit(current_char) || current_char == '_') {
                             if (current_char == '_') {
@@ -1010,19 +1001,15 @@ namespace Rust {
                         current_column += length;
 
                         // convert octal value to decimal representation
-                        long octal_num = ::std::strtol(str.c_str(), NULL, 8);
-
-                        // create output string stream for octal value to be converted to
-                        // string again
-                        // TODO: if too slow, use sprintf
-                        ::std::ostringstream ostr;
-                        ostr << octal_num;
+                        long octal_num = std::strtol(str.c_str(), NULL, 8);
 
-                        // reassign string representation to converted value
-                        str = ostr.str();
+                        str = std::to_string(octal_num);
 
                         // parse in type suffix if it exists
-                        parse_in_type_suffix(/*current_char, */ type_hint, length);
+                        // parse_in_type_suffix (/*current_char, */ type_hint, length);
+                        auto type_suffix_pair = parse_in_type_suffix();
+                        type_hint = type_suffix_pair.first;
+                        length += type_suffix_pair.second;
 
                         if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) {
                             rust_error_at(get_current_location(),
@@ -1037,9 +1024,6 @@ namespace Rust {
 
                         length++;
 
-                        // don't add any characters as C binary numbers are not really
-                        // supported
-
                         // loop through to add entire binary number to string
                         while (is_bin_digit(current_char) || current_char == '_') {
                             if (current_char == '_') {
@@ -1063,19 +1047,15 @@ namespace Rust {
                         current_column += length;
 
                         // convert binary value to decimal representation
-                        long bin_num = ::std::strtol(str.c_str(), NULL, 2);
+                        long bin_num = std::strtol(str.c_str(), NULL, 2);
 
-                        // create output string stream for binary value to be converted to
-                        // string again
-                        // TODO: if too slow, use sprintf
-                        ::std::ostringstream ostr;
-                        ostr << bin_num;
-
-                        // reassign string representation to converted value
-                        str = ostr.str();
+                        str = std::to_string(bin_num);
 
                         // parse in type suffix if it exists
-                        parse_in_type_suffix(/*current_char, */ type_hint, length);
+                        // parse_in_type_suffix (/*current_char, */ type_hint, length);
+                        auto type_suffix_pair = parse_in_type_suffix();
+                        type_hint = type_suffix_pair.first;
+                        length += type_suffix_pair.second;
 
                         if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) {
                             rust_error_at(get_current_location(),
@@ -1089,10 +1069,13 @@ namespace Rust {
                     current_char = peek_input();
 
                     // parse initial decimal literal - assuming integer
-                    // TODO: test if works
-                    parse_in_decimal(/*current_char, */ str, length);
+                    // parse_in_decimal (/*current_char, */ str, length);
+                    auto str_length_pair = parse_in_decimal();
+                    str += str_length_pair.first;
+                    length += str_length_pair.second;
 
-                    // detect float literal - TODO: fix: "242." is not recognised as a float literal
+                    // detect float literal - TODO: fix: "242." is not recognised as a
+                    // float literal
                     if (current_char == '.' && is_float_digit(peek_input(1))) {
                         // float with a '.', parse another decimal into it
 
@@ -1106,16 +1089,19 @@ namespace Rust {
                         length++;
 
                         // parse another decimal number for float
-                        // TODO: test if works
-                        parse_in_decimal(/*current_char, */ str, length);
+                        auto str_length_pair2 = parse_in_decimal();
+                        str += str_length_pair2.first;
+                        length += str_length_pair2.second;
 
                         // parse in exponent part if it exists
-                        // test to see if this works:
-                        parse_in_exponent_part(/*current_char, */ str, length);
+                        auto exponent_part = parse_in_exponent_part();
+                        str += exponent_part.first;
+                        length += exponent_part.second;
 
                         // parse in type suffix if it exists
-                        // TODO: see if works:
-                        parse_in_type_suffix(/*current_char, */ type_hint, length);
+                        auto type_suffix_pair = parse_in_type_suffix();
+                        type_hint = type_suffix_pair.first;
+                        length += type_suffix_pair.second;
 
                         if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
                             && type_hint != CORETYPE_UNKNOWN) {
@@ -1123,7 +1109,6 @@ namespace Rust {
                               "invalid type suffix '%s' for float literal",
                               get_type_hint_string(type_hint));
                         }
-
                     } else if (current_char == '.' && check_valid_float_dot_end(peek_input(1))) {
                         is_real = true;
 
@@ -1138,13 +1123,6 @@ namespace Rust {
 
                         // don't parse another decimal number for float
 
-                        // parse in exponent part if it exists - shouldn't exist?
-                        // parse_in_exponent_part(/*current_char, */ str, length);
-
-                        // parse in type suffix if it exists - shouldn't exist?
-                        // TODO: see if works:
-                        // parse_in_type_suffix(/*current_char, */ type_hint, length);
-
                         if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
                             && type_hint != CORETYPE_UNKNOWN) {
                             rust_error_at(get_current_location(),
@@ -1155,10 +1133,16 @@ namespace Rust {
                         is_real = true;
 
                         // parse exponent part
-                        parse_in_exponent_part(/*current_char, */ str, length);
+                        // parse_in_exponent_part (/*current_char, */ str, length);
+                        auto exponent_part = parse_in_exponent_part();
+                        str += exponent_part.first;
+                        length += exponent_part.second;
 
                         // parse in type suffix if it exists
-                        parse_in_type_suffix(/*current_char, */ type_hint, length);
+                        // parse_in_type_suffix (/*current_char, */ type_hint, length);
+                        auto type_suffix_pair = parse_in_type_suffix();
+                        type_hint = type_suffix_pair.first;
+                        length += type_suffix_pair.second;
 
                         if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
                             && type_hint != CORETYPE_UNKNOWN) {
@@ -1170,11 +1154,15 @@ namespace Rust {
                         // is an integer
 
                         // parse in type suffix if it exists
-                        parse_in_type_suffix(/*current_char, */ type_hint, length);
+                        // parse_in_type_suffix (/*current_char, */ type_hint, length);
+                        auto type_suffix_pair = parse_in_type_suffix();
+                        type_hint = type_suffix_pair.first;
+                        length += type_suffix_pair.second;
 
                         if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) {
                             rust_error_at(get_current_location(),
-                              "invalid type suffix '%s' for integer (decimal) literal",
+                              "invalid type suffix '%s' for integer "
+                              "(decimal) literal",
                               get_type_hint_string(type_hint));
                         }
                     }
@@ -1182,12 +1170,13 @@ namespace Rust {
                     current_column += length;
                 }
 
+                str.shrink_to_fit();
+
                 // actually make the tokens
-                if (is_real) {
+                if (is_real)
                     return Token::make_float(loc, str, type_hint);
-                } else {
+                else
                     return Token::make_int(loc, str, type_hint);
-                }
             }
 
             // string literals - not processed properly
@@ -1200,20 +1189,25 @@ namespace Rust {
                 int length = 1;
                 current_char32 = test_peek_codepoint_input();
 
-                // ok initial peek_codepoint seems to work without "too long"
-
                 while (current_char32.value != '\n' && current_char32.value != '"') {
-                    // TODO: handle escapes and string continue
                     if (current_char32.value == '\\') {
                         // parse escape
-                        parse_utf8_escape(length, current_char32, '\'');
-
-                        // TODO: find a way to parse additional characters after the escape?
-                        // return after parsing escape?
-
-                        str += current_char32;
+                        auto utf8_escape_pair = parse_utf8_escape('\'');
+                        current_char32 = utf8_escape_pair.first;
+                        //length += utf8_escape_pair.second;
+
+                        // TODO: need to fix length - after escape, the length of the line up to the next non-whitespace char of the string is added to length, which is not what we want - we want length to be replaced by that.
+                        // possible option could if "if escape_length_pair.first == 0, then length = escape_length_pair.second else length += escape_length_pair.second."
+                        if (current_char32 == Codepoint(0))
+                            length = utf8_escape_pair.second - 1; 
+                        else
+                            length += utf8_escape_pair.second;
+
+                        if (current_char32 != Codepoint(0))
+                            str += current_char32;
 
-                        // required as parsing utf8 escape only changes current_char or something
+                        // required as parsing utf8 escape only changes current_char
+                        // or something
                         current_char32 = test_peek_codepoint_input();
 
                         continue;
@@ -1221,7 +1215,6 @@ namespace Rust {
 
                     length += test_get_input_codepoint_length();
 
-                    // does this work? not technically a char. maybe have to convert to char series
                     str += current_char32;
                     test_skip_codepoint_input();
                     current_char32 = test_peek_codepoint_input();
@@ -1232,21 +1225,20 @@ namespace Rust {
                 if (current_char32.value == '\n') {
                     rust_error_at(get_current_location(), "unended string literal");
                 } else if (current_char32.value == '"') {
+                    current_column++;
+                    
                     skip_input();
-
                     current_char = peek_input();
                 } else {
-                    rust_unreachable();
+                    gcc_unreachable();
                 }
 
+                str.shrink_to_fit();
                 return Token::make_string(loc, str);
-                // TODO: account for escapes and string continue
-                // also, in rust a string is a series of unicode characters (4 bytes)
             }
 
             // char literal attempt
             if (current_char == '\'') {
-                // rust chars are 4 bytes and have some weird unicode representation thing
                 Codepoint current_char32;
 
                 int length = 1;
@@ -1256,10 +1248,9 @@ namespace Rust {
                 // parse escaped char literal
                 if (current_char32.value == '\\') {
                     // parse escape
-                    parse_utf8_escape(length, current_char32, '\'');
-
-                    // TODO - this skip may not be needed?
-                    // test_skip_codepoint_input();
+                    auto utf8_escape_pair = parse_utf8_escape('\'');
+                    current_char32 = utf8_escape_pair.first;
+                    length += utf8_escape_pair.second;
 
                     if (test_peek_codepoint_input().value != '\'') {
                         rust_error_at(get_current_location(), "unended char literal");
@@ -1271,7 +1262,6 @@ namespace Rust {
 
                     current_column += length;
 
-                    // TODO: FIX - char is actually 4 bytes in Rust (uint32) due to unicode
                     return Token::make_char(loc, current_char32);
                 } else {
                     // current_char32 = test_peek_codepoint_input();
@@ -1279,7 +1269,6 @@ namespace Rust {
 
                     if (test_peek_codepoint_input().value == '\'') {
                         // parse normal char literal
-                        // TODO: FIX - char is actually 4 bytes in Rust (uint32) due to unicode
 
                         // skip the ' character
                         skip_input();
@@ -1292,11 +1281,11 @@ namespace Rust {
                     } else if (ISDIGIT(current_char32.value) || ISALPHA(current_char32.value)
                                || current_char32.value == '_') {
                         // parse lifetime name
-                        ::std::string str;
-                        // TODO: does this work properly?
+                        std::string str;
                         str += current_char32;
 
-                        // TODO: fix lifetime name thing - actually, why am I even using utf-8 here?
+                        /* TODO: fix lifetime name thing - actually, why am I even
+                         * using utf-8 here? */
 
                         int length = 1;
 
@@ -1313,6 +1302,7 @@ namespace Rust {
 
                         current_column += length;
 
+                        str.shrink_to_fit();
                         return Token::make_lifetime(loc, str);
                     } else {
                         rust_error_at(get_current_location(), "expected ' after character constant");
@@ -1327,11 +1317,12 @@ namespace Rust {
     }
 
     // Shitty pass-by-reference way of parsing in type suffix.
-    bool Lexer::parse_in_type_suffix(
-      /*char& current_char, */ PrimitiveCoreType& type_hint, int& length) {
-        ::std::string suffix;
+    std::pair<PrimitiveCoreType, int> Lexer::parse_in_type_suffix() {
+        std::string suffix;
         suffix.reserve(5);
 
+        int additional_length_offset = 0;
+
         // get suffix
         while (ISALPHA(current_char) || ISDIGIT(current_char) || current_char == '_') {
             if (current_char == '_') {
@@ -1339,12 +1330,12 @@ namespace Rust {
                 skip_input();
                 current_char = peek_input();
 
-                length++;
+                additional_length_offset++;
 
                 continue;
             }
 
-            length++;
+            additional_length_offset++;
 
             suffix += current_char;
             skip_input();
@@ -1353,52 +1344,52 @@ namespace Rust {
 
         if (suffix.empty()) {
             // no type suffix: do nothing but also no error
-            return false;
+            return std::make_pair(CORETYPE_UNKNOWN, additional_length_offset);
         } else if (suffix == "f32") {
-            type_hint = CORETYPE_F32;
+            return std::make_pair(CORETYPE_F32, additional_length_offset);
         } else if (suffix == "f64") {
-            type_hint = CORETYPE_F64;
+            return std::make_pair(CORETYPE_F64, additional_length_offset);
         } else if (suffix == "i8") {
-            type_hint = CORETYPE_I8;
+            return std::make_pair(CORETYPE_I8, additional_length_offset);
         } else if (suffix == "i16") {
-            type_hint = CORETYPE_I16;
+            return std::make_pair(CORETYPE_I16, additional_length_offset);
         } else if (suffix == "i32") {
-            type_hint = CORETYPE_I32;
+            return std::make_pair(CORETYPE_I32, additional_length_offset);
         } else if (suffix == "i64") {
-            type_hint = CORETYPE_I64;
+            return std::make_pair(CORETYPE_I64, additional_length_offset);
         } else if (suffix == "i128") {
-            type_hint = CORETYPE_I128;
+            return std::make_pair(CORETYPE_I128, additional_length_offset);
         } else if (suffix == "isize") {
-            type_hint = CORETYPE_ISIZE;
+            return std::make_pair(CORETYPE_ISIZE, additional_length_offset);
         } else if (suffix == "u8") {
-            type_hint = CORETYPE_U8;
+            return std::make_pair(CORETYPE_U8, additional_length_offset);
         } else if (suffix == "u16") {
-            type_hint = CORETYPE_U16;
+            return std::make_pair(CORETYPE_U16, additional_length_offset);
         } else if (suffix == "u32") {
-            type_hint = CORETYPE_U32;
+            return std::make_pair(CORETYPE_U32, additional_length_offset);
         } else if (suffix == "u64") {
-            type_hint = CORETYPE_U64;
+            return std::make_pair(CORETYPE_U64, additional_length_offset);
         } else if (suffix == "u128") {
-            type_hint = CORETYPE_U128;
+            return std::make_pair(CORETYPE_U128, additional_length_offset);
         } else if (suffix == "usize") {
-            type_hint = CORETYPE_USIZE;
+            return std::make_pair(CORETYPE_USIZE, additional_length_offset);
         } else {
             rust_error_at(get_current_location(), "unknown number suffix '%s'", suffix.c_str());
 
-            return false;
+            return std::make_pair(CORETYPE_UNKNOWN, additional_length_offset);
         }
-
-        return true;
     }
 
-    void Lexer::parse_in_exponent_part(/*char& current_char, */ std::string& str, int& length) {
+    std::pair<std::string, int> Lexer::parse_in_exponent_part() {
+        int additional_length_offset = 0;
+        std::string str;
         if (current_char == 'E' || current_char == 'e') {
             // add exponent to string as strtod works with it
             str += current_char;
             skip_input();
             current_char = peek_input();
 
-            length++;
+            additional_length_offset++;
 
             // special - and + handling
             if (current_char == '-') {
@@ -1407,46 +1398,55 @@ namespace Rust {
                 skip_input();
                 current_char = peek_input();
 
-                length++;
+                additional_length_offset++;
             } else if (current_char == '+') {
                 // don't add + but still skip input
                 skip_input();
                 current_char = peek_input();
 
-                length++;
+                additional_length_offset++;
             }
 
             // parse another decimal number for exponent
-            parse_in_decimal(/*current_char, */ str, length);
+            auto str_length_pair = parse_in_decimal();
+            str += str_length_pair.first;
+            additional_length_offset += str_length_pair.second;
         }
+        return std::make_pair(str, additional_length_offset);
     }
 
-    void Lexer::parse_in_decimal(/*char& current_char, */ std::string& str, int& length) {
+    std::pair<std::string, int> Lexer::parse_in_decimal() {
+        int additional_length_offset = 0;
+        std::string str;
         while (ISDIGIT(current_char) || current_char == '_') {
             if (current_char == '_') {
                 // don't add _ to number
                 skip_input();
                 current_char = peek_input();
 
-                length++;
+                additional_length_offset++;
 
                 continue;
             }
 
-            length++;
+            additional_length_offset++;
 
             str += current_char;
             skip_input();
             current_char = peek_input();
         }
+        return std::make_pair(str, additional_length_offset);
     }
 
-    // Replace all assorted parse_x_escape with this? Avoids the backwards/peek issue.
-    bool Lexer::parse_escape(int& length, char& output_char, char opening_char) {
+    /* Parses escapes (and string continues) in "byte" strings and characters. Does not support unicode. */
+    std::pair<char, int> Lexer::parse_escape(char opening_char) {
+        int additional_length_offset = 0;
+        char output_char = 0;
+
         // skip to actual letter
         skip_input();
         current_char = peek_input();
-        length++;
+        additional_length_offset++;
 
         switch (current_char) {
             case 'x': {
@@ -1456,9 +1456,9 @@ namespace Rust {
                 // first hex char
                 skip_input();
                 current_char = peek_input();
-                length++;
+                additional_length_offset++;
 
-                if (!ISXDIGIT(current_char)) {
+                if (!is_x_digit(current_char)) {
                     rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
                       current_char);
                 }
@@ -1467,23 +1467,21 @@ namespace Rust {
                 // second hex char
                 skip_input();
                 current_char = peek_input();
-                length++;
+                additional_length_offset++;
 
-                if (!ISXDIGIT(current_char)) {
+                if (!is_x_digit(current_char)) {
                     rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
                       current_char);
                 }
                 hexNum[1] = current_char;
 
-                long hexLong = ::std::strtol(hexNum, NULL, 16);
+                long hexLong = std::strtol(hexNum, NULL, 16);
 
-                if (hexLong > 127)
+                if (hexLong > 255 || hexLong < 0)
                     rust_error_at(get_current_location(),
-                      "ascii \\x escape '\\x%s' out of range - allows up to '\\x7F'", hexNum);
-                // gcc_assert(hexLong < 128); // as ascii
+                      "byte \\x escape '\\x%s' out of range - allows up to '\\xFF'", hexNum);
                 char hexChar = static_cast<char>(hexLong);
 
-                // TODO: fix - does this actually give the right character?
                 output_char = hexChar;
             } break;
             case 'n':
@@ -1507,28 +1505,34 @@ namespace Rust {
             case '"':
                 output_char = '"';
                 break;
-            case 'u': {
+            case 'u':
+                rust_error_at(get_current_location(),
+                  "cannot have a unicode escape \\u in a byte %s!",
+                  opening_char == '\'' ? "character" : "string");
+                return std::make_pair(output_char, additional_length_offset);
+#if 0
+			{
                 // TODO: shouldn't be used with this - use parse_utf8_escape
 
                 skip_input();
                 current_char = peek_input();
-                length++;
+                additional_length_offset++;
 
                 bool need_close_brace = false;
 
-                // TODO: rustc lexer doesn't seem to allow not having { but mrustc lexer does? look at
-                // spec?
+                // TODO: rustc lexer doesn't seem to allow not having { but mrustc lexer
+                // does? look at spec?
                 if (current_char == '{') {
                     need_close_brace = true;
 
                     skip_input();
                     current_char = peek_input();
-                    length++;
+                    additional_length_offset++;
                 }
 
                 // parse unicode escape
                 // 1-6 hex digits?
-                ::std::string num_str;
+                std::string num_str;
                 num_str.reserve(6);
 
                 // test adding number directly
@@ -1541,12 +1545,12 @@ namespace Rust {
                         skip_input();
                         current_char = peek_input();
 
-                        length++;
+                        additional_length_offset++;
 
                         continue;
                     }
 
-                    length++;
+                    additional_length_offset++;
 
                     // add raw hex numbers
                     num_str += current_char;
@@ -1554,7 +1558,7 @@ namespace Rust {
                     // test adding number directly
                     char tmp[2] = { current_char, 0 };
                     test_val *= 16;
-                    test_val += ::std::strtol(tmp, NULL, 16);
+                    test_val += std::strtol(tmp, NULL, 16);
 
                     skip_input();
                     current_char = peek_input();
@@ -1565,18 +1569,21 @@ namespace Rust {
                     // actually an error
                     rust_error_at(
                       get_current_location(), "expected terminating '}' in unicode escape");
-                    return false;
+                    // return false;
+                    return std::make_pair(output_char, additional_length_offset);
                 }
 
                 // ensure 1-6 hex characters
                 if (num_str.length() > 6 || num_str.length() < 1) {
                     rust_error_at(get_current_location(),
-                      "unicode escape should be between 1 and 6 hex characters; it is %lu",
+                      "unicode escape should be between 1 and 6 hex "
+                      "characters; it is %lu",
                       num_str.length());
-                    return false;
+                    // return false;
+                    return std::make_pair(output_char, additional_length_offset);
                 }
 
-                long hex_num = ::std::strtol(num_str.c_str(), NULL, 16);
+                long hex_num = std::strtol(num_str.c_str(), NULL, 16);
 
                 // as debug, check hex_num = test_val
                 if (hex_num > 255) {
@@ -1586,12 +1593,14 @@ namespace Rust {
                 }
 
                 // make output_char the value - UTF-8?
-                // TODO: actually make this work - output char must be 4 bytes, do I need a string for
-                // this?
+                // TODO: actually make this work - output char must be 4 bytes, do I
+                // need a string for this?
                 output_char = static_cast</*uint32_t*/ char>(hex_num);
 
-                return true;
+                // return true;
+                return std::make_pair(output_char, additional_length_offset);
             } break;
+#endif
             case '\r':
             case '\n':
                 // string continue
@@ -1600,10 +1609,10 @@ namespace Rust {
                         current_line++;
                         current_column = 1;
                         // tell line_table that new line starts
-                        linemap_line_start(::line_table, current_line, max_column_hint);
+                        line_map->start_line(current_line, max_column_hint);
 
                         // reset "length"
-                        length = 1;
+                        additional_length_offset = 1;
 
                         // get next char
                         skip_input();
@@ -1614,45 +1623,56 @@ namespace Rust {
 
                     skip_input();
                     current_char = peek_input();
-                    length++;
+                    additional_length_offset++;
                 }
 
-                if (current_char == '\\') {
-                    parse_escape(length, output_char, opening_char);
-                    return true;
-                } else if (current_char == opening_char) {
+                // shouldn't need this
+#if 0
+                if (current_char == opening_char) {
                     // TODO: does this skip the ' or " character? It shouldn't.
                     output_char = 0;
-                    return true;
+                    // return true;
+                    return std::make_pair(output_char, additional_length_offset);
                 } else {
-                    output_char = current_char;
+                    // TODO: shouldn't this make output_char null so that it isn't added to string?
+                    // or check for escape being zero?
+                    output_char = /*current_char*/0;
 
                     // TODO: test has right result
                     /*skip_input();
                     current_char = peek_input();*/
 
-                    return true;
+                    // return true;
+                    return std::make_pair(output_char, additional_length_offset);
                 }
+#endif
+                return std::make_pair(0, additional_length_offset);
             default:
                 rust_error_at(get_current_location(), "unknown escape sequence '\\%c'", current_char);
                 // returns false if no parsing could be done
-                return false;
+                // return false;
+                return std::make_pair(output_char, additional_length_offset);
                 break;
         }
-        // all non-special cases (unicode, string continue) should skip their used char
+        // all non-special cases (string continue) should skip their used char
         skip_input();
         current_char = peek_input();
-        length++;
+        additional_length_offset++;
 
         // returns true if parsing was successful
-        return true;
+        // return true;
+        return std::make_pair(output_char, additional_length_offset);
     }
 
-    bool Lexer::parse_utf8_escape(int& length, Codepoint& output_char, char opening_char) {
+    // Parses an escape (or string continue) in a string or character. Supports unicode escapes.
+    std::pair<Codepoint, int> Lexer::parse_utf8_escape(char opening_char) {
+        Codepoint output_char;
+        int additional_length_offset = 0;
+
         // skip to actual letter
         skip_input();
         current_char = peek_input();
-        length++;
+        additional_length_offset++;
 
         switch (current_char) {
             case 'x': {
@@ -1662,9 +1682,9 @@ namespace Rust {
                 // first hex char
                 skip_input();
                 current_char = peek_input();
-                length++;
+                additional_length_offset++;
 
-                if (!ISXDIGIT(current_char)) {
+                if (!is_x_digit(current_char)) {
                     rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
                       current_char);
                 }
@@ -1673,15 +1693,15 @@ namespace Rust {
                 // second hex char
                 skip_input();
                 current_char = peek_input();
-                length++;
+                additional_length_offset++;
 
-                if (!ISXDIGIT(current_char)) {
+                if (!is_x_digit(current_char)) {
                     rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
                       current_char);
                 }
                 hexNum[1] = current_char;
 
-                long hexLong = ::std::strtol(hexNum, NULL, 16);
+                long hexLong = std::strtol(hexNum, NULL, 16);
 
                 if (hexLong > 127)
                     rust_error_at(get_current_location(),
@@ -1689,7 +1709,6 @@ namespace Rust {
                 // gcc_assert(hexLong < 128); // as ascii
                 char hexChar = static_cast<char>(hexLong);
 
-                // TODO: fix - does this actually give the right character?
                 output_char = hexChar;
             } break;
             case 'n':
@@ -1716,28 +1735,21 @@ namespace Rust {
             case 'u': {
                 skip_input();
                 current_char = peek_input();
-                length++;
+                additional_length_offset++;
 
                 bool need_close_brace = false;
-
-                // TODO: rustc lexer doesn't seem to allow not having { but mrustc lexer does? look at
-                // spec?
                 if (current_char == '{') {
                     need_close_brace = true;
 
                     skip_input();
                     current_char = peek_input();
-                    length++;
+                    additional_length_offset++;
                 }
 
-                // parse unicode escape
-                // 1-6 hex digits?
-                ::std::string num_str;
+                // parse unicode escape - 1-6 hex digits
+                std::string num_str;
                 num_str.reserve(6);
 
-                // test adding number directly
-                uint32_t test_val;
-
                 // loop through to add entire hex number to string
                 while (is_x_digit(current_char) || current_char == '_') {
                     if (current_char == '_') {
@@ -1745,21 +1757,16 @@ namespace Rust {
                         skip_input();
                         current_char = peek_input();
 
-                        length++;
+                        additional_length_offset++;
 
                         continue;
                     }
 
-                    length++;
+                    additional_length_offset++;
 
                     // add raw hex numbers
                     num_str += current_char;
 
-                    // test adding number directly
-                    char tmp[2] = { current_char, 0 };
-                    test_val *= 16;
-                    test_val += ::std::strtol(tmp, NULL, 16);
-
                     skip_input();
                     current_char = peek_input();
                 }
@@ -1769,36 +1776,38 @@ namespace Rust {
                     if (current_char == '}') {
                         skip_input();
                         current_char = peek_input();
-                        length++;
+                        additional_length_offset++;
                     } else {
                         // actually an error
                         rust_error_at(
                           get_current_location(), "expected terminating '}' in unicode escape");
-                        return false;
+                        // return false;
+                        return std::make_pair(output_char, additional_length_offset);
                     }
                 }
 
                 // ensure 1-6 hex characters
                 if (num_str.length() > 6 || num_str.length() < 1) {
                     rust_error_at(get_current_location(),
-                      "unicode escape should be between 1 and 6 hex characters; it is %lu",
+                      "unicode escape should be between 1 and 6 hex "
+                      "characters; it is %lu",
                       num_str.length());
-                    return false;
+                    // return false;
+                    return std::make_pair(output_char, additional_length_offset);
                 }
 
-                long hex_num = ::std::strtol(num_str.c_str(), NULL, 16);
+                long hex_num = std::strtol(num_str.c_str(), NULL, 16);
 
                 // assert fits a uint32_t
-                rust_assert(hex_num < 4294967296);
+                gcc_assert(hex_num < 4294967296);
 
-                // ok can't figure out how to just convert to codepoint or use "this" so create new
-                // one
                 output_char = Codepoint(static_cast<uint32_t>(hex_num));
 
-                // TODO: what is being outputted? the escape code for the unicode char (unicode
-                // number) or the character number?
+                // TODO: what is being outputted? the escape code for the unicode char
+                // (unicode number) or the character number?
 
-                return true;
+                // return true;
+                return std::make_pair(output_char, additional_length_offset);
             } break;
             case '\r':
             case '\n':
@@ -1808,10 +1817,10 @@ namespace Rust {
                         current_line++;
                         current_column = 1;
                         // tell line_table that new line starts
-                        linemap_line_start(::line_table, current_line, max_column_hint);
+                        line_map->start_line(current_line, max_column_hint);
 
                         // reset "length"
-                        length = 1;
+                        additional_length_offset = 1;
 
                         // get next char
                         skip_input();
@@ -1822,38 +1831,39 @@ namespace Rust {
 
                     skip_input();
                     current_char = peek_input();
-                    length++;
+                    additional_length_offset++;
                 }
 
-                if (current_char == '\\') {
-                    parse_utf8_escape(length, output_char, opening_char);
-                    return true;
-                } else if (current_char == opening_char) {
-                    // TODO: does this skip the ' or " character? It shouldn't.
+                // shouldn't need this
+#if 0
+                if (current_char == opening_char) {
                     output_char = 0;
-                    return true;
+                    // return true;
+                    return std::make_pair(output_char, additional_length_offset);
                 } else {
-                    output_char = current_char;
+                    output_char = /*current_char*/0;
 
-                    // TODO: test has right result
-                    /*skip_input();
-                    current_char = peek_input();*/
-
-                    return true;
+                    // return true;
+                    return std::make_pair(output_char, additional_length_offset);
                 }
+#endif
+                return std::make_pair(0, additional_length_offset);
             default:
                 rust_error_at(get_current_location(), "unknown escape sequence '\\%c'", current_char);
                 // returns false if no parsing could be done
-                return false;
+                // return false;
+                return std::make_pair(output_char, additional_length_offset);
                 break;
         }
-        // all non-special cases (unicode, string continue) should skip their used char
+        /* all non-special cases (unicode, string continue) should skip their used
+         * char */
         skip_input();
         current_char = peek_input();
-        length++;
+        additional_length_offset++;
 
         // returns true if parsing was successful
-        return true;
+        // return true;
+        return std::make_pair(output_char, additional_length_offset);
     }
 
 #if 0
@@ -2102,6 +2112,7 @@ namespace Rust {
     }
 #endif
 
+    // Returns the byte length of the UTF-8 codepoint at the current position.
     int Lexer::test_get_input_codepoint_length() {
         uint8_t input = peek_input();
 
@@ -2138,8 +2149,8 @@ namespace Rust {
             // return 0xFFFE;
 
             /*uint32_t output
-              = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0);
-            return output;*/
+              = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
+                | ((input3 & 0x3F) << 0); return output;*/
             return 3;
         } else if ((input & 0xF8) == 0xF0) {
             // 4 bytes
@@ -2168,7 +2179,7 @@ namespace Rust {
         }
     }
 
-    // TODO: rewrite lexing system to use utf-8 "codepoints" rather than bytes?
+    // Returns the codepoint at the current position.
     Codepoint Lexer::test_peek_codepoint_input() {
         uint8_t input = peek_input();
 
@@ -2224,7 +2235,7 @@ namespace Rust {
 
     void Lexer::test_skip_codepoint_input() {
         int toSkip = test_get_input_codepoint_length();
-        rust_assert(toSkip >= 1);
+        gcc_assert(toSkip >= 1);
 
         skip_input(toSkip - 1);
     }
@@ -2263,8 +2274,8 @@ namespace Rust {
                 return 0;
 
             /*uint32_t output
-              = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0);
-            return output;*/
+              = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
+                | ((input3 & 0x3F) << 0); return output;*/
             return 3;
         } else if ((input & 0xF8) == 0xF0) {
             // 4 bytes
@@ -2293,7 +2304,8 @@ namespace Rust {
         }
     }
 
-    // peeks the codepoint input at n codepoints ahead of current codepoint - try not to use
+    // Peeks the codepoint that is n codepoints ahead of the current one.
+    // Avoid using this: it is not implemented yet and asserts unconditionally.
     Codepoint Lexer::test_peek_codepoint_input(int n) {
         int totalOffset = 0;
 
@@ -2301,12 +2313,13 @@ namespace Rust {
         for (int i = 0; i < n; i++) {
             totalOffset += test_get_input_codepoint_n_length(totalOffset);
         }
-        // issues: this would have (at least) O(n) lookup time, not O(1) like the rest?
+        // issues: this would have (at least) O(n) lookup time, not O(1) like the
+        // rest?
 
         // TODO: implement if still needed
 
         // error out of function as it is not implemented
-        rust_assert(1 == 0);
+        gcc_assert(1 == 0);
         return { 0 };
         /*
                 uint8_t input = peek_input();
@@ -2336,9 +2349,8 @@ namespace Rust {
                         return 0xFFFE;
 
                     uint32_t output
-                      = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0);
-                    return output;
-                } else if ((input & 0xF8) == 0xF0) {
+                      = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 &
+           0x3F) << 0); return output; } else if ((input & 0xF8) == 0xF0) {
                     // 4 bytes
                     uint8_t input2 = peek_input(1);
                     if ((input2 & 0xC0) != 0x80)
@@ -2353,11 +2365,9 @@ namespace Rust {
                         return 0xFFFE;
 
                     uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-                                      | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
-                    return output;
-                } else {
-                    rust_error_at(get_current_location(), "invalid UTF-8 (too long)");
-                    return 0xFFFE;
+                                      | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) <<
+           0); return output; } else { rust_error_at(get_current_location(), "invalid
+           UTF-8 (too long)"); return 0xFFFE;
                 }*/
     }
-}
+} // namespace Rust
diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h
index 8dc3e31..5220753 100644
--- a/gcc/rust/lex/rust-lex.h
+++ b/gcc/rust/lex/rust-lex.h
@@ -5,6 +5,8 @@
 #include "rust-buffered-queue.h"
 #include "rust-token.h"
 
+#include <utility>
+
 namespace Rust {
 class Lexer
 {
@@ -31,35 +33,39 @@ private:
   // ok maybe all these may mean the lexer structure needs to be rethought
   /* separated into functions because main method was too long, but they rely on
    * and change state in the lexer, so variables must be passed by reference. */
-  inline void parse_in_decimal (/*char& current_char, */ std::string &str,
-				int &length);
-  inline void parse_in_exponent_part (/*char& current_char, */ std::string &str,
-				      int &length);
-  inline bool parse_in_type_suffix (
-    /*char& current_char, */ PrimitiveCoreType &type_hint, int &length);
-  inline bool parse_ascii_escape (/*char& current_char, */ int &length,
-				  char &output_char);
-  inline bool parse_quote_escape (/*char& current_char, */ int &length,
-				  char &output_char);
-  inline bool parse_unicode_escape (
-    /*char& current_char, */ int &length, Codepoint &output_char);
-  inline bool parse_byte_escape (/*char& current_char, */ int &length,
-				 char &output_char);
-  inline bool parse_escape (int &length, char &output_char, char opening_char);
-  inline bool parse_utf8_escape (int &length, Codepoint &output_char,
-				 char opening_char);
-  inline int test_get_input_codepoint_length ();
-  inline int test_get_input_codepoint_n_length (int n_start_offset);
-  inline Codepoint test_peek_codepoint_input ();
-  inline Codepoint test_peek_codepoint_input (
+  std::pair<std::string, int> parse_in_decimal ();
+  std::pair<std::string, int> parse_in_exponent_part ();
+  std::pair<PrimitiveCoreType, int> parse_in_type_suffix ();
+  /*bool parse_ascii_escape (int &length,
+				  char &output_char);*/
+  /*bool parse_quote_escape (char& current_char, int &length,
+				  char &output_char);*/
+  /*bool parse_unicode_escape (
+    char& current_char, int &length, Codepoint &output_char);*/
+  /*bool parse_byte_escape (char& current_char, int &length,
+				 char &output_char);*/
+  std::pair<char, int> parse_escape (char opening_char);
+  std::pair<Codepoint, int> parse_utf8_escape (char opening_char);
+  int test_get_input_codepoint_length ();
+  int test_get_input_codepoint_n_length (int n_start_offset);
+  Codepoint test_peek_codepoint_input ();
+  Codepoint test_peek_codepoint_input (
     int n); // maybe can use get_input_codepoint_length to get starting index
-  inline void test_skip_codepoint_input ();
+  void test_skip_codepoint_input ();
 
 public:
   // Construct lexer with input file and filename provided
   Lexer (const char *filename, FILE *input, Linemap *linemap);
   ~Lexer ();
 
+  // don't allow copy semantics (for now, at least)
+  Lexer (const Lexer &other) = delete;
+  Lexer &operator= (const Lexer &other) = delete;
+
+  // enable move semantics
+  Lexer (Lexer &&other) = default;
+  Lexer &operator= (Lexer &&other) = default;
+
   // Returns token n tokens ahead of current position.
   const_TokenPtr peek_token (int n);
   // Peeks the current token.
diff --git a/gcc/rust/lex/rust-token.h b/gcc/rust/lex/rust-token.h
index 2270fa2..9dd5f0b 100644
--- a/gcc/rust/lex/rust-token.h
+++ b/gcc/rust/lex/rust-token.h
@@ -53,7 +53,6 @@ enum PrimitiveCoreType
 
 // note that abstract, async, become, box, do, final, macro, override, priv,
 // try, typeof, unsized, virtual, and yield are unused
-// TODO finish converting to rust keywords
 #define RS_TOKEN_LIST                                                          \
   RS_TOKEN (FIRST_TOKEN, "<first-token-marker>")                               \
   RS_TOKEN (END_OF_FILE, "end of file")                                        \
diff --git a/gcc/rust/rust-linemap.cc b/gcc/rust/rust-linemap.cc
index fef4603..5ee76bd 100644
--- a/gcc/rust/rust-linemap.cc
+++ b/gcc/rust/rust-linemap.cc
@@ -72,7 +72,8 @@ Gcc_linemap::to_string (Location location)
 
   // Strip the source file down to the base file, to reduce clutter.
   std::stringstream ss;
-  ss << lbasename (path) << ":" << SOURCE_LINE (lmo, location.gcc_location ());
+  ss << lbasename (path) << ":" << SOURCE_LINE (lmo, location.gcc_location ()) 
+    << ":" << SOURCE_COLUMN (lmo, location.gcc_location ());
   return ss.str ();
 }
 
diff --git a/gcc/rust/rust-session-manager.cc b/gcc/rust/rust-session-manager.cc
index 4308ae3..29933d5 100644
--- a/gcc/rust/rust-session-manager.cc
+++ b/gcc/rust/rust-session-manager.cc
@@ -340,7 +340,7 @@ Session::enable_dump (::std::string arg)
     {
       error_at (
 	UNKNOWN_LOCATION,
-	"dumping all is not supported as of now. choose 'lex' or 'parse'");
+	"dumping all is not supported as of now. choose 'lex', 'parse', or 'target_options'");
       return false;
     }
   else if (arg == "lex")
@@ -379,13 +379,13 @@ Session::enable_dump (::std::string arg)
   else if (arg == "")
     {
       error_at (UNKNOWN_LOCATION,
-		"dump option was not given a name. choose 'lex' or 'parse'");
+		"dump option was not given a name. choose 'lex', 'parse', or 'target_options'");
       return false;
     }
   else
     {
       error_at (UNKNOWN_LOCATION,
-		"dump option '%s' was unrecognised. choose 'lex' or 'parse'",
+		"dump option '%s' was unrecognised. choose 'lex', 'parse', or 'target_options'",
 		arg.c_str ());
       return false;
     }
author	SimplyTheOther <simplytheother@gmail.com>	2020-08-20 11:51:25 +0800
committer	Philip Herron <philip.herron@embecosm.com>	2020-11-28 21:13:20 +0000
commit	b758ec724cc06cb866a72ce17dbfd8a426cf21db (patch)
tree	cffb4e7c584f7da75ea42e57011c5a1d160d67a2 /gcc
parent	c26f60f6a28394e98ac1d830cbe8f632ef576dbb (diff)
download	gcc-b758ec724cc06cb866a72ce17dbfd8a426cf21db.zip gcc-b758ec724cc06cb866a72ce17dbfd8a426cf21db.tar.gz gcc-b758ec724cc06cb866a72ce17dbfd8a426cf21db.tar.bz2