2 files changed, 2225 insertions, 2267 deletions
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index 322079e..1f0f9cb 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -8,2333 +8,2285 @@
 #include <sstream> // for ostringstream
 
 namespace Rust {
-    // TODO: move to separate compilation unit?
-    // overload += for uint32_t to allow 32-bit encoded utf-8 to be added
-    std::string& operator+=(std::string& str, Codepoint char32) {
-        if (char32.value < 0x80) {
-            str += static_cast<char>(char32.value);
-        } else if (char32.value < (0x1F + 1) << (1 * 6)) {
-            str += static_cast<char>(0xC0 | ((char32.value >> 6) & 0x1F));
-            str += static_cast<char>(0x80 | ((char32.value >> 0) & 0x3F));
-        } else if (char32.value < (0x0F + 1) << (2 * 6)) {
-            str += static_cast<char>(0xE0 | ((char32.value >> 12) & 0x0F));
-            str += static_cast<char>(0x80 | ((char32.value >> 6) & 0x3F));
-            str += static_cast<char>(0x80 | ((char32.value >> 0) & 0x3F));
-        } else if (char32.value < (0x07 + 1) << (3 * 6)) {
-            str += static_cast<char>(0xF0 | ((char32.value >> 18) & 0x07));
-            str += static_cast<char>(0x80 | ((char32.value >> 12) & 0x3F));
-            str += static_cast<char>(0x80 | ((char32.value >> 6) & 0x3F));
-            str += static_cast<char>(0x80 | ((char32.value >> 0) & 0x3F));
-        } else {
-            fprintf(stderr, "Invalid unicode codepoint found: '%u' \n", char32.value);
-        }
-        return str;
+// TODO: move to separate compilation unit?
+/* Appends the UTF-8 encoding of CHAR32 to STR and returns STR. Values of
+ * 0x200000 and above cannot be encoded: they are reported on stderr and STR
+ * is left unchanged. */
+std::string &
+operator+= (std::string &str, Codepoint char32)
+{
+  const auto code = char32.value;
+
+  // appends one continuation byte carrying the six bits found at SHIFT
+  auto push_continuation = [&str, code] (int shift) {
+    str += static_cast<char> (0x80 | ((code >> shift) & 0x3F));
+  };
+
+  if (code < 0x80)
+    {
+      // fits in a single byte
+      str += static_cast<char> (code);
     }
+  else if (code < 0x800)
+    {
+      // two bytes: 5 + 6 payload bits
+      str += static_cast<char> (0xC0 | ((code >> 6) & 0x1F));
+      push_continuation (0);
+    }
+  else if (code < 0x10000)
+    {
+      // three bytes: 4 + 6 + 6 payload bits
+      str += static_cast<char> (0xE0 | ((code >> 12) & 0x0F));
+      push_continuation (6);
+      push_continuation (0);
+    }
+  else if (code < 0x200000)
+    {
+      // four bytes: 3 + 6 + 6 + 6 payload bits
+      str += static_cast<char> (0xF0 | ((code >> 18) & 0x07));
+      push_continuation (12);
+      push_continuation (6);
+      push_continuation (0);
+    }
+  else
+    {
+      fprintf (stderr, "Invalid unicode codepoint found: '%u' \n", code);
+    }
+
+  return str;
+}
+
+/* Returns this code point appended to an empty string, i.e. its encoded
+ * form as produced by operator+= for Codepoint. */
+std::string
+Codepoint::as_string ()
+{
+  std::string encoded;
+  encoded += *this;
+  return encoded;
+}
+
+/* Reports whether NUMBER may appear in a float literal: a decimal digit or an
+ * exponent marker. '_' and '.' are deliberately excluded because handling them
+ * needs lookahead. */
+bool
+is_float_digit (char number)
+{
+  if (number == 'e' || number == 'E')
+    return true;
+
+  return ISDIGIT (number);
+}
+
+/* Reports whether NUMBER is a hexadecimal digit. Currently just ISXDIGIT from
+ * safe-ctype; kept as its own function in case Rust's rules ever differ. */
+bool
+is_x_digit (char number)
+{
+  const bool is_hex = ISXDIGIT (number);
+  return is_hex;
+}
+
+// Reports whether NUMBER is one of the octal digits '0' to '7'.
+bool
+is_octal_digit (char number)
+{
+  return '0' <= number && number <= '7';
+}
+
+// Reports whether NUMBER is a binary digit, i.e. '0' or '1'.
+bool
+is_bin_digit (char number)
+{
+  switch (number)
+    {
+    case '0':
+    case '1':
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* Reports whether CHARACTER is acceptable directly after a float's dot:
+ * anything except another '.', an '_' or a letter. */
+bool
+check_valid_float_dot_end (char character)
+{
+  const bool rejected
+    = character == '.' || character == '_' || ISALPHA (character);
+  return !rejected;
+}
+
+/* Reports whether CHARACTER is whitespace. Currently just ISSPACE from
+ * safe-ctype, but may change in future. */
+bool
+is_whitespace (char character)
+{
+  const bool is_space = ISSPACE (character);
+  return is_space;
+}
+
+/* Constructs a lexer reading from INPUT. Position tracking starts at line 1,
+ * column 1, and FILENAME is registered with LINEMAP as starting at that
+ * line. */
+Lexer::Lexer (const char *filename, FILE *input, Linemap *linemap)
+  : input (input), current_line (1), current_column (1), line_map (linemap),
+    input_source (input), input_queue (input_source), token_source (this),
+    token_queue (token_source)
+{
+  // inform line_table that file is being entered and is in line 1
+  line_map->start_file (filename, current_line);
+}
+
+// Destructor. Intentionally does nothing; see the note in the body.
+Lexer::~Lexer ()
+{
+  /* Apparently stop (the equivalent of the code originally in this
+   * destructor) is meant to be called once all files have finished parsing,
+   * for cleanup. On the other hand, the code it calls to leave a line map is
+   * described in the GCC docs as being useful for "just leaving an included
+   * header" and similar, so this line mapping functionality may need fixing.
+   * FIXME: find out whether this occurs. */
+  // line_map->stop();
+}
+
+/* Returns the location the line map reports for the current column.
+ *
+ * TODO: need to optimise somehow to avoid the virtual function call in the
+ * tight loop. Best idea at the moment is CRTP, but that might make lexer
+ * implementation annoying when storing the "base class" (i.e. would need
+ * template parameter everywhere), although in practice it would mostly just
+ * look ugly and make enclosing classes like Parser also require a type
+ * parameter. At this point a macro might be better. OK I guess macros can be
+ * replaced by constexpr if or something if possible. */
+Location
+Lexer::get_current_location ()
+{
+  Location here = line_map->get_location (current_column);
+  return here;
+}
+
+/* Returns what the input queue reports for offset N. The callers visible
+ * here consume input only through skip_input, so this is presumably a
+ * non-consuming lookahead with 0 being the next character. */
+int
+Lexer::peek_input (int n)
+{
+  const int upcoming = input_queue.peek (n);
+  return upcoming;
+}
+
+// Convenience overload: forwards to peek_input (0).
+int
+Lexer::peek_input ()
+{
+  const int front = peek_input (0);
+  return front;
+}
+
+/* Discards input from the front of the input queue; N is forwarded unchanged
+ * to input_queue.skip. Callers in this file pass 0 to drop one character and
+ * 1 to drop two, so N appears to count the characters to drop beyond the
+ * first - confirm against the queue implementation. */
+void
+Lexer::skip_input (int n)
+{
+  input_queue.skip (n);
+}
+
+/* Convenience overload: forwards to skip_input (0), which callers use to
+ * consume the single character they have just peeked. */
+void
+Lexer::skip_input ()
+{
+  skip_input (0);
+}
+
+/* Hands REPLACEMENT to the token queue as the new value of its current
+ * token. */
+void
+Lexer::replace_current_token (TokenPtr replacement)
+{
+  token_queue.replace_current_value (replacement);
+}
+
+/* Anonymous namespace, so these tables are only accessible inside this
+ * compilation unit. They are generated from RS_TOKEN_LIST with x-macros and
+ * are used by classify_keyword, which binary-searches keyword_index; the
+ * keyword entries of RS_TOKEN_LIST therefore have to be listed in sorted
+ * (std::string operator<) order. */
+namespace {
+// keyword spellings; entry i pairs with keyword_keys[i]
+const std::string keyword_index[] = {
+#define RS_TOKEN(x, y)
+#define RS_TOKEN_KEYWORD(name, keyword) keyword,
+  RS_TOKEN_LIST
+#undef RS_TOKEN_KEYWORD
+#undef RS_TOKEN
+};
 
-    std::string Codepoint::as_string() {
-        std::string str;
+// token ids of the keywords, in the same order as keyword_index
+TokenId keyword_keys[] = {
+#define RS_TOKEN(x, y)
+#define RS_TOKEN_KEYWORD(name, keyword) name,
+  RS_TOKEN_LIST
+#undef RS_TOKEN_KEYWORD
+#undef RS_TOKEN
+};
+
+// number of entries in keyword_index
+const int num_keywords = sizeof (keyword_index) / sizeof (*keyword_index);
+} // namespace
+
+/* Looks STR up in the keyword table. Returns the matching keyword's token id,
+ * or IDENTIFIER when STR is not a keyword.
+ * TODO: possibly replace this x-macro system with something like hash map? */
+TokenId
+Lexer::classify_keyword (const std::string &str)
+{
+  const std::string *const table_end = keyword_index + num_keywords;
+  const std::string *const candidate
+    = std::lower_bound (keyword_index, table_end, str);
+
+  // lower_bound only finds the insertion point; confirm it is an exact match
+  const bool is_keyword = candidate != table_end && *candidate == str;
+  if (!is_keyword)
+    return IDENTIFIER;
+
+  return keyword_keys[candidate - keyword_index];
+}
+
+/* Lexes and returns the next token from the input. Whitespace and comments
+ * are consumed without producing a token, while current_line, current_column
+ * and the line map are updated as input is consumed. Returns an END_OF_FILE
+ * token once the input is exhausted. A character that cannot start any token
+ * is reported with rust_error_at and skipped. */
+TokenPtr
+Lexer::build_token ()
+{
+  // loop to go through multiple characters to build a single token
+  while (true)
+    {
+      Location loc = get_current_location ();
+      current_char = peek_input ();
+      skip_input ();
+
+      // return end of file token if end of file
+      if (current_char == EOF)
+	return Token::make (END_OF_FILE, loc);
+
+      /* detect shebang. Only peek while deciding: anything else starting with
+       * '#' (e.g. "#![...]") has to reach the switch below with current_char
+       * and the input queue untouched, otherwise tokens are lost. */
+      if (loc == 1 && current_line == 1 && current_char == '#'
+	  && peek_input () == '!' && peek_input (1) == '/')
+	{
+	  // definitely shebang - consume the "!/"
+	  skip_input (1);
+
+	  // ignore rest of line; stop at EOF if the line is unterminated
+	  while (current_char != '\n' && current_char != EOF)
+	    {
+	      current_char = peek_input ();
+	      skip_input ();
+	    }
+
+	  // newline
+	  current_line++;
+	  current_column = 1;
+	  // tell line_table that new line starts
+	  line_map->start_line (current_line, max_column_hint);
+	  continue;
+	}
+
+      // if not end of file, start tokenising
+      switch (current_char)
+	{
+	/* ignore whitespace characters for tokens but continue updating
+	 * location */
+	case '\n': // newline
+	  current_line++;
+	  current_column = 1;
+	  // tell line_table that new line starts
+	  line_map->start_line (current_line, max_column_hint);
+	  continue;
+	case ' ': // space
+	  current_column++;
+	  continue;
+	case '\t': // tab
+	  // width of a tab is not well-defined, assume 8 spaces
+	  current_column += 8;
+	  continue;
+
+	// punctuation - actual tokens
+	case '=':
+	  if (peek_input () == '>')
+	    {
+	      // match arm arrow
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (MATCH_ARROW, loc);
+	    }
+	  else if (peek_input () == '=')
+	    {
+	      // equality operator
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (EQUAL_EQUAL, loc);
+	    }
+	  else
+	    {
+	      // assignment operator
+	      current_column++;
+	      return Token::make (EQUAL, loc);
+	    }
+	case '(':
+	  current_column++;
+	  return Token::make (LEFT_PAREN, loc);
+	case '-':
+	  if (peek_input () == '>')
+	    {
+	      // return type specifier
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (RETURN_TYPE, loc);
+	    }
+	  else if (peek_input () == '=')
+	    {
+	      // minus-assign
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (MINUS_EQ, loc);
+	    }
+	  else
+	    {
+	      // minus
+	      current_column++;
+	      return Token::make (MINUS, loc);
+	    }
+	case '+':
+	  if (peek_input () == '=')
+	    {
+	      // add-assign
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (PLUS_EQ, loc);
+	    }
+	  else
+	    {
+	      // add
+	      current_column++;
+	      return Token::make (PLUS, loc);
+	    }
+	case ')':
+	  current_column++;
+	  return Token::make (RIGHT_PAREN, loc);
+	case ';':
+	  current_column++;
+	  return Token::make (SEMICOLON, loc);
+	case '*':
+	  if (peek_input () == '=')
+	    {
+	      // multiplication-assign
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (ASTERISK_EQ, loc);
+	    }
+	  else
+	    {
+	      // multiplication
+	      current_column++;
+	      return Token::make (ASTERISK, loc);
+	    }
+	case ',':
+	  current_column++;
+	  return Token::make (COMMA, loc);
+	case '/':
+	  if (peek_input () == '=')
+	    {
+	      // division-assign
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (DIV_EQ, loc);
+	    }
+	  else if (peek_input () == '/')
+	    {
+	      // TODO: single-line doc comments
+
+	      // single line comment
+	      skip_input ();
+	      current_column += 2;
+
+	      /* basically ignore until line finishes; the newline itself is
+	       * left in the queue so that the '\n' case above handles it */
+	      while (current_char != '\n' && current_char != EOF)
+		{
+		  skip_input ();
+		  current_column++; // not used
+		  current_char = peek_input ();
+		}
+	      continue;
+	    }
+	  else if (peek_input () == '*')
+	    {
+	      // block comment
+	      skip_input ();
+	      current_column += 2;
+
+	      // TODO: block doc comments
+
+	      /* Block comments nest, so count the open levels. Every
+	       * iteration inspects the next unconsumed character before
+	       * consuming anything, so that no delimiter is stepped over
+	       * (e.g. the empty comment, or one delimiter directly following
+	       * another). */
+	      int level = 1;
+	      while (level > 0)
+		{
+		  current_char = peek_input ();
+
+		  // an unterminated comment must not loop forever
+		  if (current_char == EOF)
+		    {
+		      rust_error_at (
+			loc, "unexpected EOF while looking for end of comment");
+		      break;
+		    }
+
+		  if (current_char == '/' && peek_input (1) == '*')
+		    {
+		      // nested comment start: skip /* characters
+		      skip_input (1);
+		      current_column += 2;
+
+		      level += 1;
+		    }
+		  else if (current_char == '*' && peek_input (1) == '/')
+		    {
+		      // skip */ characters
+		      skip_input (1);
+		      current_column += 2;
+
+		      level -= 1;
+		    }
+		  else if (current_char == '\n')
+		    {
+		      // keep line tracking correct across multi-line comments
+		      skip_input ();
+		      current_line++;
+		      current_column = 1;
+		      line_map->start_line (current_line, max_column_hint);
+		    }
+		  else
+		    {
+		      skip_input ();
+		      current_column++; // for error-handling
+		    }
+		}
+
+	      // refresh new token
+	      continue;
+	    }
+	  else
+	    {
+	      // division
+	      current_column++;
+	      return Token::make (DIV, loc);
+	    }
+	case '%':
+	  if (peek_input () == '=')
+	    {
+	      // modulo-assign - the '=' has to be consumed as well
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (PERCENT_EQ, loc);
+	    }
+	  else
+	    {
+	      // modulo
+	      current_column++;
+	      return Token::make (PERCENT, loc);
+	    }
+	case '^':
+	  if (peek_input () == '=')
+	    {
+	      // xor-assign - the '=' has to be consumed as well
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (CARET_EQ, loc);
+	    }
+	  else
+	    {
+	      // xor?
+	      current_column++;
+	      return Token::make (CARET, loc);
+	    }
+	case '<':
+	  if (peek_input () == '<')
+	    {
+	      if (peek_input (1) == '=')
+		{
+		  // left-shift assign
+		  skip_input (1);
+		  current_column += 3;
+
+		  return Token::make (LEFT_SHIFT_EQ, loc);
+		}
+	      else
+		{
+		  // left-shift
+		  skip_input ();
+		  current_column += 2;
+
+		  return Token::make (LEFT_SHIFT, loc);
+		}
+	    }
+	  else if (peek_input () == '=')
+	    {
+	      // smaller than or equal to
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (LESS_OR_EQUAL, loc);
+	    }
+	  else
+	    {
+	      // smaller than
+	      current_column++;
+	      return Token::make (LEFT_ANGLE, loc);
+	    }
+	case '>':
+	  if (peek_input () == '>')
+	    {
+	      if (peek_input (1) == '=')
+		{
+		  // right-shift-assign
+		  skip_input (1);
+		  current_column += 3;
+
+		  return Token::make (RIGHT_SHIFT_EQ, loc);
+		}
+	      else
+		{
+		  // right-shift
+		  skip_input ();
+		  current_column += 2;
+
+		  return Token::make (RIGHT_SHIFT, loc);
+		}
+	    }
+	  else if (peek_input () == '=')
+	    {
+	      // larger than or equal to
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (GREATER_OR_EQUAL, loc);
+	    }
+	  else
+	    {
+	      // larger than
+	      current_column++;
+	      return Token::make (RIGHT_ANGLE, loc);
+	    }
+	case ':':
+	  if (peek_input () == ':')
+	    {
+	      // scope resolution ::
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (SCOPE_RESOLUTION, loc);
+	    }
+	  else
+	    {
+	      // single colon :
+	      current_column++;
+	      return Token::make (COLON, loc);
+	    }
+	case '!':
+	  // no special handling for macros in lexer?
+	  if (peek_input () == '=')
+	    {
+	      // not equal boolean operator
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (NOT_EQUAL, loc);
+	    }
+	  else
+	    {
+	      // not equal unary operator
+	      current_column++;
+
+	      return Token::make (EXCLAM, loc);
+	    }
+	case '?':
+	  current_column++;
+	  return Token::make (QUESTION_MARK, loc);
+	case '#':
+	  current_column++;
+	  return Token::make (HASH, loc);
+	case '[':
+	  current_column++;
+	  return Token::make (LEFT_SQUARE, loc);
+	case ']':
+	  current_column++;
+	  return Token::make (RIGHT_SQUARE, loc);
+	case '{':
+	  current_column++;
+	  return Token::make (LEFT_CURLY, loc);
+	case '}':
+	  current_column++;
+	  return Token::make (RIGHT_CURLY, loc);
+	case '@':
+	  current_column++;
+	  return Token::make (PATTERN_BIND, loc);
+	case '$':
+	  current_column++;
+	  return Token::make (DOLLAR_SIGN, loc);
+	case '~':
+	  current_column++;
+	  return Token::make (TILDE, loc);
+	case '\\':
+	  current_column++;
+	  return Token::make (BACKSLASH, loc);
+	case '`':
+	  current_column++;
+	  return Token::make (BACKTICK, loc);
+	case '|':
+	  if (peek_input () == '=')
+	    {
+	      // bitwise or-assign?
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (PIPE_EQ, loc);
+	    }
+	  else if (peek_input () == '|')
+	    {
+	      // logical or
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (OR, loc);
+	    }
+	  else
+	    {
+	      // bitwise or
+	      current_column++;
+
+	      return Token::make (PIPE, loc);
+	    }
+	case '&':
+	  if (peek_input () == '=')
+	    {
+	      // bitwise and-assign?
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (AMP_EQ, loc);
+	    }
+	  else if (peek_input () == '&')
+	    {
+	      // logical and
+	      skip_input ();
+	      current_column += 2;
+
+	      return Token::make (LOGICAL_AND, loc);
+	    }
+	  else
+	    {
+	      // bitwise and/reference
+	      current_column++;
+
+	      return Token::make (AMP, loc);
+	    }
+	case '.':
+	  if (peek_input () == '.')
+	    {
+	      if (peek_input (1) == '.')
+		{
+		  // ellipsis
+		  skip_input (1);
+		  current_column += 3;
+
+		  return Token::make (ELLIPSIS, loc);
+		}
+	      else if (peek_input (1) == '=')
+		{
+		  // ..=
+		  skip_input (1);
+		  current_column += 3;
+
+		  return Token::make (DOT_DOT_EQ, loc);
+		}
+	      else
+		{
+		  // ..
+		  skip_input ();
+		  current_column += 2;
+
+		  return Token::make (DOT_DOT, loc);
+		}
+	    }
+	  else if (!ISDIGIT (peek_input ()))
+	    {
+	      // single dot .
+	      // Only if followed by a non-number - otherwise is float
+	      current_column++;
+	      return Token::make (DOT, loc);
+	    }
+	  /* a dot followed by a digit leaves the switch on purpose and is
+	   * picked up by the number handling below */
+	}
+      // TODO: special handling of _ in the lexer? instead of being identifier
+
+      // byte character, byte string and raw byte string literals
+      if (current_char == 'b')
+	{
+	  if (peek_input () == '\'')
+	    return parse_byte_char (loc);
+	  else if (peek_input () == '"')
+	    return parse_byte_string (loc);
+	  else if (peek_input () == 'r'
+		   && (peek_input (1) == '#' || peek_input (1) == '"'))
+	    return parse_raw_byte_string (loc);
+	}
+
+      // raw identifiers and raw strings
+      if (current_char == 'r')
+	{
+	  int peek = peek_input ();
+	  int peek1 = peek_input (1);
+
+	  if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
+	    {
+	      TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
+	      if (raw_ident_ptr != nullptr)
+		return raw_ident_ptr;
+	    }
+	  else
+	    {
+	      TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc);
+	      if (maybe_raw_string_ptr != nullptr)
+		return maybe_raw_string_ptr;
+	    }
+	}
+
+      // find identifiers and keywords
+      if (ISALPHA (current_char) || current_char == '_')
+	return parse_identifier_or_keyword (loc);
+
+      // int and float literals
+      if (ISDIGIT (current_char) || current_char == '.')
+	{ //  _ not allowed as first char
+	  if (current_char == '0' && !ISDIGIT (peek_input ()))
+	    {
+	      // handle binary, octal, hex literals
+	      TokenPtr non_dec_int_lit_ptr
+		= parse_non_decimal_int_literals (loc);
+	      if (non_dec_int_lit_ptr != nullptr)
+		return non_dec_int_lit_ptr;
+	    }
+	  else
+	    {
+	      // handle decimals (integer or float)
+	      TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc);
+	      if (decimal_or_float_ptr != nullptr)
+		return decimal_or_float_ptr;
+	    }
+	}
+
+      // string literals
+      if (current_char == '"')
+	return parse_string (loc);
+
+      // char literals and lifetime names
+      if (current_char == '\'')
+	{
+	  TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc);
+	  if (char_or_lifetime_ptr != nullptr)
+	    return char_or_lifetime_ptr;
+	}
+
+      // didn't match anything so error
+      rust_error_at (loc, "unexpected character '%x'", current_char);
+      current_column++;
+    }
+}
 
-        // str += Codepoint (value);
-        str += *this;
+/* Parses in a type suffix. Consumes the run of letters, digits and
+ * underscores starting at current_char (underscores are skipped rather than
+ * stored) and maps the collected text to a core type. Returns that type
+ * together with the number of characters consumed. An empty suffix yields
+ * CORETYPE_UNKNOWN without an error; an unrecognised suffix is reported and
+ * also yields CORETYPE_UNKNOWN. */
+std::pair<PrimitiveCoreType, int>
+Lexer::parse_in_type_suffix ()
+{
+  std::string suffix;
+  suffix.reserve (5);
 
-        return str;
-    }
+  int additional_length_offset = 0;
 
-    /* Includes all allowable float digits EXCEPT _ and . as that needs lookahead
-     * for handling. */
-    bool is_float_digit(char number) {
-        return ISDIGIT(number) || number == 'E' || number == 'e';
-    }
+  // get suffix
+  while (ISALPHA (current_char) || ISDIGIT (current_char)
+	 || current_char == '_')
+    {
+      if (current_char == '_')
+	{
+	  // don't add _ to suffix
+	  skip_input ();
+	  current_char = peek_input ();
 
-    /* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or
-     * whatever is different */
-    bool is_x_digit(char number) {
-        return ISXDIGIT(number);
-    }
+	  additional_length_offset++;
+
+	  continue;
+	}
 
-    bool is_octal_digit(char number) {
-        return number >= '0' && number <= '7';
+      additional_length_offset++;
+
+      suffix += current_char;
+      skip_input ();
+      current_char = peek_input ();
     }
 
-    bool is_bin_digit(char number) {
-        return number == '0' || number == '1';
+  if (suffix.empty ())
+    {
+      // no type suffix: do nothing but also no error
+      return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
+    }
+  else if (suffix == "f32")
+    {
+      return std::make_pair (CORETYPE_F32, additional_length_offset);
+    }
+  else if (suffix == "f64")
+    {
+      return std::make_pair (CORETYPE_F64, additional_length_offset);
+    }
+  else if (suffix == "i8")
+    {
+      return std::make_pair (CORETYPE_I8, additional_length_offset);
+    }
+  else if (suffix == "i16")
+    {
+      return std::make_pair (CORETYPE_I16, additional_length_offset);
+    }
+  else if (suffix == "i32")
+    {
+      return std::make_pair (CORETYPE_I32, additional_length_offset);
+    }
+  else if (suffix == "i64")
+    {
+      return std::make_pair (CORETYPE_I64, additional_length_offset);
+    }
+  else if (suffix == "i128")
+    {
+      return std::make_pair (CORETYPE_I128, additional_length_offset);
+    }
+  else if (suffix == "isize")
+    {
+      return std::make_pair (CORETYPE_ISIZE, additional_length_offset);
+    }
+  else if (suffix == "u8")
+    {
+      return std::make_pair (CORETYPE_U8, additional_length_offset);
+    }
+  else if (suffix == "u16")
+    {
+      return std::make_pair (CORETYPE_U16, additional_length_offset);
+    }
+  else if (suffix == "u32")
+    {
+      return std::make_pair (CORETYPE_U32, additional_length_offset);
     }
+  else if (suffix == "u64")
+    {
+      return std::make_pair (CORETYPE_U64, additional_length_offset);
+    }
+  else if (suffix == "u128")
+    {
+      return std::make_pair (CORETYPE_U128, additional_length_offset);
+    }
+  else if (suffix == "usize")
+    {
+      return std::make_pair (CORETYPE_USIZE, additional_length_offset);
+    }
+  else
+    {
+      rust_error_at (get_current_location (), "unknown number suffix '%s'",
+		     suffix.c_str ());
 
-    bool check_valid_float_dot_end(char character) {
-        return character != '.' && character != '_' && !ISALPHA(character);
+      return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
+    }
+}
+
+/* Parses in the exponent part (if any) of a float literal, starting at
+ * current_char. Returns the exponent text in a form strtod accepts (the
+ * marker, an optional '-', then the digits; a '+' sign is dropped) together
+ * with the number of characters consumed. Both are empty/zero when
+ * current_char is not 'E' or 'e'. */
+std::pair<std::string, int>
+Lexer::parse_in_exponent_part ()
+{
+  std::string exponent;
+  int consumed = 0;
+
+  // nothing to do unless an exponent marker is present
+  if (current_char != 'E' && current_char != 'e')
+    return std::make_pair (exponent, consumed);
+
+  // keep the marker in the string as strtod works with it
+  exponent += current_char;
+  skip_input ();
+  current_char = peek_input ();
+  consumed++;
+
+  // optional sign: '-' is kept, '+' is consumed but not stored
+  if (current_char == '-' || current_char == '+')
+    {
+      if (current_char == '-')
+	exponent += '-';
+
+      skip_input ();
+      current_char = peek_input ();
+      consumed++;
+    }
+
+  // the exponent value itself is just another decimal number
+  auto digits_and_length = parse_in_decimal ();
+  exponent += digits_and_length.first;
+  consumed += digits_and_length.second;
+
+  return std::make_pair (exponent, consumed);
+}
+
+/* Parses a decimal integer starting at current_char. Returns the digits with
+ * any '_' separators removed, together with the number of characters consumed
+ * (separators included). */
+std::pair<std::string, int>
+Lexer::parse_in_decimal ()
+{
+  std::string digits;
+  int consumed = 0;
+
+  for (; ISDIGIT (current_char) || current_char == '_';
+       current_char = peek_input ())
+    {
+      // '_' counts towards the length but is not part of the number
+      if (current_char != '_')
+	digits += current_char;
+
+      consumed++;
+      skip_input ();
+    }
+
+  return std::make_pair (digits, consumed);
+}
+
+/* Parses escapes (and string continues) in "byte" strings and characters.
+ * Does not support unicode. OPENING_CHAR is only used to word the error for a
+ * \u escape ('\'' means byte character, anything else byte string). Returns
+ * the decoded character, the length consumed, and whether the escape was a
+ * string continue. */
+std::tuple<char, int, bool>
+Lexer::parse_escape (char opening_char)
+{
+  // skip to actual letter
+  skip_input ();
+  current_char = peek_input ();
+  int consumed = 1;
+
+  char decoded = 0;
+
+  switch (current_char)
+    {
+    case '\r':
+    case '\n':
+      // string continue
+      return std::make_tuple (0, parse_partial_string_continue (), true);
+    case 'u':
+      rust_error_at (get_current_location (),
+		     "cannot have a unicode escape \\u in a byte %s!",
+		     opening_char == '\'' ? "character" : "string");
+      return std::make_tuple (decoded, consumed, false);
+    case 'x':
+      {
+	const auto hex_result = parse_partial_hex_escape ();
+	const long hex_value = hex_result.first;
+	consumed += hex_result.second;
+
+	if (hex_value < 0 || hex_value > 255)
+	  rust_error_at (
+	    get_current_location (),
+	    "byte \\x escape '\\x%X' out of range - allows up to '\\xFF'",
+	    static_cast<unsigned int> (hex_value));
+
+	decoded = static_cast<char> (hex_value);
+	break;
+      }
+    case 'n':
+      decoded = '\n';
+      break;
+    case 'r':
+      decoded = '\r';
+      break;
+    case 't':
+      decoded = '\t';
+      break;
+    case '\\':
+      decoded = '\\';
+      break;
+    case '0':
+      decoded = '\0';
+      break;
+    case '\'':
+      decoded = '\'';
+      break;
+    case '"':
+      decoded = '"';
+      break;
+    default:
+      rust_error_at (get_current_location (), "unknown escape sequence '\\%c'",
+		     current_char);
+      // no parsing could be done
+      return std::make_tuple (decoded, consumed, false);
+    }
+
+  // every escape that gets here still has its last character to skip
+  skip_input ();
+  current_char = peek_input ();
+  consumed++;
+
+  return std::make_tuple (decoded, consumed, false);
+}
+
+/* Parses an escape (or string continue) in a string or character. Supports
+ * unicode escapes. Returns the decoded code point, the length consumed, and
+ * whether the escape was a string continue. */
+std::tuple<Codepoint, int, bool>
+Lexer::parse_utf8_escape (char opening_char)
+{
+  Codepoint decoded;
+
+  // skip to actual letter
+  skip_input ();
+  current_char = peek_input ();
+  int consumed = 1;
+
+  switch (current_char)
+    {
+    case '\r':
+    case '\n':
+      // string continue
+      return std::make_tuple (0, parse_partial_string_continue (), true);
+    case 'u':
+      {
+	// the unicode helper is used as-is: no further skip happens here
+	auto unicode_result = parse_partial_unicode_escape ();
+	decoded = unicode_result.first;
+	consumed += unicode_result.second;
+
+	return std::make_tuple (decoded, consumed, false);
+      }
+    case 'x':
+      {
+	const auto hex_result = parse_partial_hex_escape ();
+	const long hex_value = hex_result.first;
+	consumed += hex_result.second;
+
+	if (hex_value < 0 || hex_value > 127)
+	  rust_error_at (
+	    get_current_location (),
+	    "ascii \\x escape '\\x%X' out of range - allows up to '\\x7F'",
+	    static_cast<unsigned int> (hex_value));
+
+	decoded = static_cast<char> (hex_value);
+	break;
+      }
+    case 'n':
+      decoded = '\n';
+      break;
+    case 'r':
+      decoded = '\r';
+      break;
+    case 't':
+      decoded = '\t';
+      break;
+    case '\\':
+      decoded = '\\';
+      break;
+    case '0':
+      decoded = '\0';
+      break;
+    case '\'':
+      decoded = '\'';
+      break;
+    case '"':
+      decoded = '"';
+      break;
+    default:
+      rust_error_at (get_current_location (), "unknown escape sequence '\\%c'",
+		     current_char);
+      // no parsing could be done
+      return std::make_tuple (decoded, consumed, false);
+    }
+
+  /* all non-special cases (i.e. not unicode or string continue) still have
+   * their last character to skip */
+  skip_input ();
+  current_char = peek_input ();
+  consumed++;
+
+  return std::make_tuple (decoded, consumed, false);
+}
+
+/* Parses the body of a string continue that has been found in an escape.
+ * Consumes whitespace starting at current_char; each '\n' starts a new line
+ * in the line map. Returns a length counter that starts at 1, is reset to 1
+ * by every newline and is incremented for every other whitespace character
+ * skipped. */
+int
+Lexer::parse_partial_string_continue ()
+{
+  int additional_length_offset = 1;
+
+  // string continue
+  while (is_whitespace (current_char))
+    {
+      if (current_char == '\n')
+	{
+	  current_line++;
+	  current_column = 1;
+	  // tell line_table that new line starts
+	  line_map->start_line (current_line, max_column_hint);
+
+	  // reset "length"
+	  additional_length_offset = 1;
+
+	  // get next char
+	  skip_input ();
+	  current_char = peek_input ();
+
+	  continue;
+	}
+
+      skip_input ();
+      current_char = peek_input ();
+      additional_length_offset++;
     }
 
-    // ISSPACE from safe-ctype but may change in future
-    bool is_whitespace(char character) {
-        return ISSPACE(character);
+  return additional_length_offset;
+}
+
+/* Parses the body of a '\x' escape: the two characters after the 'x'. Each is
+ * reported if it is not a hex digit, but parsing carries on regardless. Note
+ * that it does not check that the number is in range (at most 255); callers
+ * do that. Returns the value strtol extracts from the two characters and the
+ * length consumed (2). On return current_char is the second character, which
+ * has been peeked but not yet skipped. */
+std::pair<long, int>
+Lexer::parse_partial_hex_escape ()
+{
+  // hex char string (null-terminated)
+  char hexNum[3] = {0, 0, 0};
+
+  // first hex char
+  skip_input ();
+  current_char = peek_input ();
+  int additional_length_offset = 1;
+
+  if (!is_x_digit (current_char))
+    {
+      rust_error_at (get_current_location (),
+		     "invalid character '\\x%c' in \\x sequence", current_char);
     }
+  hexNum[0] = current_char;
+
+  // second hex char
+  skip_input ();
+  current_char = peek_input ();
+  additional_length_offset++;
 
-    Lexer::Lexer(const char* filename, FILE* input, Linemap* linemap) :
-      input(input), current_line(1), current_column(1), line_map(linemap), input_source(input),
-      input_queue(input_source), token_source(this), token_queue(token_source) {
-        // inform line_table that file is being entered and is in line 1
-        line_map->start_file(filename, current_line);
+  if (!is_x_digit (current_char))
+    {
+      rust_error_at (get_current_location (),
+		     "invalid character '\\x%c' in \\x sequence", current_char);
     }
+  hexNum[1] = current_char;
+
+  // strtol stops at the first non-hex character, so bad input yields 0
+  long hexLong = std::strtol (hexNum, nullptr, 16);
+
+  return std::make_pair (hexLong, additional_length_offset);
+}
 
-    Lexer::~Lexer() {
-        /* ok apparently stop (which is equivalent of original code in destructor) is
-         * meant to be called after all files have finished parsing, for cleanup. On
-         * the other hand, actual code that it calls to leave a certain line map is
-         * mentioned in GCC docs as being useful for "just leaving an included header"
-         * and stuff like that, so this line mapping functionality may need fixing.
-         * FIXME: find out whether this occurs. */
-        // line_map->stop();
+// Parses a unicode escape body; returns the codepoint and the length offset.
+std::pair<Codepoint, int>
+Lexer::parse_partial_unicode_escape ()
+{
+  skip_input ();
+  current_char = peek_input ();
+  int additional_length_offset = 1;
+
+  bool need_close_brace = false;
+  if (current_char == '{')
+    {
+      need_close_brace = true;
+
+      skip_input ();
+      current_char = peek_input ();
+      additional_length_offset++;
     }
 
-    /* TODO: need to optimise somehow to avoid the virtual function call in the
-     * tight loop. Best idea at the moment is CRTP, but that might make lexer
-     * implementation annoying when storing the "base class" (i.e. would need
-     * template parameter everywhere), although in practice it would mostly just
-     * look ugly and make enclosing classes like Parser also require a type
-     * parameter. At this point a macro might be better. OK I guess macros can be
-     * replaced by constexpr if or something if possible. */
-    Location Lexer::get_current_location() {
-        return line_map->get_location(current_column);
+  // collect the hex digits of the escape (1-6 expected; checked below)
+  std::string num_str;
+  num_str.reserve (6);
+
+  // consume hex digits and '_' separators until some other character appears
+  while (is_x_digit (current_char) || current_char == '_')
+    {
+      if (current_char == '_')
+	{
+	  // underscores are consumed and counted but never stored
+	  skip_input ();
+	  current_char = peek_input ();
+
+	  additional_length_offset++;
+
+	  continue;
+	}
+
+      additional_length_offset++;
+
+      // keep the raw hex digit for the later strtol conversion
+      num_str += current_char;
+
+      skip_input ();
+      current_char = peek_input ();
     }
 
-    int Lexer::peek_input(int n) {
-        return input_queue.peek(n);
+  // an opening '{' must be matched by a closing '}'
+  if (need_close_brace)
+    {
+      if (current_char == '}')
+	{
+	  skip_input ();
+	  current_char = peek_input ();
+	  additional_length_offset++;
+	}
+      else
+	{
+	  // actually an error, but allow propagation anyway
+	  rust_error_at (get_current_location (),
+			 "expected terminating '}' in unicode escape");
+	  // placeholder codepoint 0 lets the caller carry on after the error
+	  return std::make_pair (Codepoint (0), additional_length_offset);
+	}
     }
 
-    int Lexer::peek_input() {
-        return peek_input(0);
+  // ensure 1-6 hex characters (length is cast to match the %lu specifier)
+  if (num_str.length () > 6 || num_str.length () < 1)
+    {
+      rust_error_at (get_current_location (),
+		     "unicode escape should be between 1 and 6 hex "
+		     "characters; it is %lu",
+		     static_cast<unsigned long> (num_str.length ()));
+      // placeholder codepoint 0 lets the caller carry on after the error
+      return std::make_pair (Codepoint (0), additional_length_offset);
     }
 
-    void Lexer::skip_input(int n) {
-        input_queue.skip(n);
+  long hex_num = std::strtol (num_str.c_str (), nullptr, 16);
+
+  // sanity check: at most 6 hex digits were accepted, so this fits a uint32_t
+  gcc_assert (hex_num < 4294967296);
+
+  // success: wrap the parsed value as a Codepoint
+  return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)),
+			 additional_length_offset);
+}
+
+// Parses a byte character literal; reports errors but always returns a token.
+TokenPtr
+Lexer::parse_byte_char (Location loc)
+{
+  skip_input ();
+  current_column++;
+  // refresh current_char with the character that follows
+  current_char = peek_input ();
+
+  int length = 1;
+
+  // decoded value; stays 0 when the literal is empty or rejected
+  char byte_char = 0;
+
+  // escaped form: decode via parse_escape, then expect the closing quote
+  if (current_char == '\\')
+    {
+      auto escape_length_pair = parse_escape ('\'');
+      byte_char = std::get<0> (escape_length_pair);
+      length += std::get<1> (escape_length_pair);
+
+      if (byte_char > 127) // NOTE(review): never true where char is signed
+	{
+	  rust_error_at (get_current_location (), "byte char '%c' out of range",
+			 byte_char);
+	  byte_char = 0;
+	}
+
+      current_char = peek_input ();
+
+      if (current_char != '\'')
+	{
+	  rust_error_at (get_current_location (), "unclosed byte char");
+	}
+
+      skip_input ();
+      current_char = peek_input ();
+      length++; // go to next char
+    }
+  else if (current_char != '\'')
+    {
+      // plain form: the input character itself is the byte value
+      byte_char = current_char;
+
+      skip_input ();
+      current_char = peek_input ();
+      length++;
+
+      if (current_char != '\'')
+	{
+	  rust_error_at (get_current_location (), "unclosed byte char");
+	}
+
+      skip_input ();
+      current_char = peek_input ();
+      length++; // go to next char
+    }
+  else
+    {
+      rust_error_at (get_current_location (),
+		     "no character inside '' for byte char");
     }
 
-    void Lexer::skip_input() {
-        skip_input(0);
+  current_column += length;
+
+  return Token::make_byte_char (loc, byte_char);
+}
+
+// Parses a byte string literal, ending at the closing '"', a newline, or EOF.
+TokenPtr
+Lexer::parse_byte_string (Location loc)
+{
+  // escapes are decoded via parse_escape; other characters are copied as-is
+
+  // skip quote character
+  skip_input ();
+  current_column++;
+
+  std::string str;
+  str.reserve (16); // some sensible default
+
+  int length = 1;
+  current_char = peek_input ();
+
+  while (current_char != '"' && current_char != '\n' && current_char != EOF)
+    {
+      if (current_char == '\\')
+	{
+	  auto escape_length_pair = parse_escape ('"');
+	  char output_char = std::get<0> (escape_length_pair);
+
+	  if (output_char == 0 && std::get<2> (escape_length_pair))
+	    length = std::get<1> (escape_length_pair) - 1;
+	  else
+	    length += std::get<1> (escape_length_pair);
+
+	  if (output_char > 127)
+	    {
+	      rust_error_at (get_current_location (),
+			     "char '%c' in byte string out of range",
+			     output_char);
+	      output_char = 0;
+	    }
+
+	  if (output_char != 0)
+	    str += output_char;
+
+	  continue;
+	}
+
+      length++;
+
+      str += current_char;
+      skip_input ();
+      current_char = peek_input ();
     }
 
-    const_TokenPtr Lexer::peek_token(int n) {
-        return token_queue.peek(n);
+  current_column += length;
+
+  if (current_char == '\n' || current_char == EOF)
+    {
+      rust_error_at (get_current_location (), "unended byte string literal");
     }
+  else if (current_char == '"')
+    {
+      current_column++;
 
-    const_TokenPtr Lexer::peek_token() {
-        return peek_token(0);
+      skip_input ();
+      current_char = peek_input ();
+    }
+  else
+    {
+      gcc_unreachable ();
     }
 
-    void Lexer::skip_token(int n) {
-        token_queue.skip(n);
+  str.shrink_to_fit ();
+
+  return Token::make_byte_string (loc, str);
+}
+
+// Parses a raw byte string.
+TokenPtr
+Lexer::parse_raw_byte_string (Location loc)
+{
+  // raw byte string literals
+  std::string str;
+  str.reserve (16); // some sensible default
+
+  int length = 1;
+  int hash_count = 0;
+
+  // get hash count at beginning
+  skip_input ();
+  current_char = peek_input ();
+  length++;
+  while (current_char == '#')
+    {
+      hash_count++;
+      length++;
+
+      skip_input ();
+      current_char = peek_input ();
     }
 
-    void Lexer::skip_token() {
-        skip_token(0);
+  if (current_char != '"')
+    {
+      rust_error_at (get_current_location (),
+		     "raw byte string has no opening '\"'");
     }
 
-    void Lexer::replace_current_token(TokenPtr replacement) {
-        token_queue.replace_current_value(replacement);
+  skip_input ();
+  current_char = peek_input ();
+  length++;
+
+  while (true)
+    {
+      if (current_char == '"')
+	{
+	  bool enough_hashes = true;
+
+	  for (int i = 0; i < hash_count; i++)
+	    {
+	      if (peek_input (i + 1) != '#')
+		{
+		  enough_hashes = false;
+		  break;
+		}
+	    }
+
+	  if (enough_hashes)
+	    {
+	      // skip enough input and peek enough input
+	      skip_input (hash_count);
+	      current_char = peek_input ();
+	      length += hash_count + 1;
+	      break;
+	    }
+	}
+
+      length++;
+
+      str += current_char;
+      skip_input ();
+      current_char = peek_input ();
     }
 
-    /* shitty anonymous namespace that can only be accessed inside the compilation
-     * unit - used for classify_keyword Binary search in sorted array of keywords
-     * created with x-macros. */
-    namespace {
-        const std::string keyword_index[] = {
-#define RS_TOKEN(x, y)
-#define RS_TOKEN_KEYWORD(name, keyword) keyword,
-            RS_TOKEN_LIST
-#undef RS_TOKEN_KEYWORD
-#undef RS_TOKEN
-        };
+  current_column += length;
 
-        TokenId keyword_keys[] = {
-#define RS_TOKEN(x, y)
-#define RS_TOKEN_KEYWORD(name, keyword) name,
-            RS_TOKEN_LIST
-#undef RS_TOKEN_KEYWORD
-#undef RS_TOKEN
-        };
-
-        const int num_keywords = sizeof(keyword_index) / sizeof(*keyword_index);
-    } // namespace
-
-    /* Determines whether the string passed in is a keyword or not. If it is, it
-     * returns the keyword name.  */
-    TokenId Lexer::classify_keyword(const std::string& str) {
-        const std::string* last = keyword_index + num_keywords;
-        const std::string* idx = std::lower_bound(keyword_index, last, str);
-
-        if (idx == last || str != *idx)
-            return IDENTIFIER;
-        else
-            return keyword_keys[idx - keyword_index];
-    }
-
-    TokenPtr Lexer::build_token() {
-        // loop to go through multiple characters to build a single token
-        while (true) {
-            Location loc = get_current_location();
-            /*int */ current_char = peek_input();
-            skip_input();
-
-            // return end of file token if end of file
-            if (current_char == EOF) 
-                return Token::make(END_OF_FILE, loc);
-
-            // detect shebang
-            if (loc == 1 && current_line == 1 && current_char == '#') {
-                current_char = peek_input();
-
-                if (current_char == '!') {
-                    skip_input();
-                    current_char = peek_input();
-
-                    switch (current_char) {
-                        case '/':
-                            // shebang
-
-                            skip_input();
-
-                            // ignore rest of line
-                            while (current_char != '\n') {
-                                current_char = peek_input();
-                                skip_input();
-                            }
-
-                            // newline
-                            current_line++;
-                            current_column = 1;
-                            // tell line_table that new line starts
-                            line_map->start_line(current_line, max_column_hint);
-                            continue;
-                    }
-                }
-            }
-
-            // if not end of file, start tokenising
-            switch (current_char) {
-                /* ignore whitespace characters for tokens but continue updating
-                 * location */
-                case '\n': // newline
-                    current_line++;
-                    current_column = 1;
-                    // tell line_table that new line starts
-                    line_map->start_line(current_line, max_column_hint);
-                    continue;
-                case ' ': // space
-                    current_column++;
-                    continue;
-                case '\t': // tab
-                    // width of a tab is not well-defined, assume 8 spaces
-                    current_column += 8;
-                    continue;
-
-                // punctuation - actual tokens
-                case '=':
-                    if (peek_input() == '>') {
-                        // match arm arrow
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(MATCH_ARROW, loc);
-                    } else if (peek_input() == '=') {
-                        // equality operator
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(EQUAL_EQUAL, loc);
-                    } else {
-                        // assignment operator
-                        current_column++;
-                        return Token::make(EQUAL, loc);
-                    }
-                case '(':
-                    current_column++;
-                    return Token::make(LEFT_PAREN, loc);
-                case '-':
-                    if (peek_input() == '>') {
-                        // return type specifier
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(RETURN_TYPE, loc);
-                    } else if (peek_input() == '=') {
-                        // minus-assign
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(MINUS_EQ, loc);
-                    } else {
-                        // minus
-                        current_column++;
-                        return Token::make(MINUS, loc);
-                    }
-                case '+':
-                    if (peek_input() == '=') {
-                        // add-assign
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(PLUS_EQ, loc);
-                    } else {
-                        // add
-                        current_column++;
-                        return Token::make(PLUS, loc);
-                    }
-                case ')':
-                    current_column++;
-                    return Token::make(RIGHT_PAREN, loc);
-                case ';':
-                    current_column++;
-                    return Token::make(SEMICOLON, loc);
-                case '*':
-                    if (peek_input() == '=') {
-                        // multiplication-assign
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(ASTERISK_EQ, loc);
-                    } else {
-                        // multiplication
-                        current_column++;
-                        return Token::make(ASTERISK, loc);
-                    }
-                case ',':
-                    current_column++;
-                    return Token::make(COMMA, loc);
-                case '/':
-                    if (peek_input() == '=') {
-                        // division-assign
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(DIV_EQ, loc);
-                    } else if (peek_input() == '/') {
-                        // TODO: single-line doc comments
-
-                        // single line comment
-                        skip_input();
-                        current_column += 2;
-
-                        // basically ignore until line finishes
-                        while (current_char != '\n' && current_char != EOF) {
-                            skip_input();
-                            current_column++; // not used
-                            current_char = peek_input();
-                        }
-                        continue;
-                        break;
-                    } else if (peek_input() == '*') {
-                        // block comment
-                        skip_input();
-                        current_column += 2;
-
-                        // TODO: block doc comments
-
-                        current_char = peek_input();
-
-                        int level = 1;
-                        while (level > 0) {
-                            skip_input();
-                            current_column++; // for error-handling
-                            current_char = peek_input();
-
-                            // if /* found
-                            if (current_char == '/') {
-                                if (peek_input(1) == '*') {
-                                    // skip /* characters
-                                    skip_input(1);
-
-                                    current_column += 2;
-
-                                    level += 1;
-                                }
-                            }
-
-                            // ignore until */ is found
-                            if (current_char == '*') {
-                                if (peek_input(1) == '/') {
-                                    // skip */ characters
-                                    skip_input(1);
-
-                                    current_column += 2;
-                                    // should only break inner loop here - seems to do so
-                                    // break;
-
-                                    level -= 1;
-                                }
-                            }
-                        }
-
-                        // refresh new token
-                        continue;
-                        break;
-                    } else {
-                        // division
-                        current_column++;
-                        return Token::make(DIV, loc);
-                    }
-                case '%':
-                    if (peek_input() == '=') {
-                        // modulo-assign
-                        current_column += 2;
-                        return Token::make(PERCENT_EQ, loc);
-                    } else {
-                        // modulo
-                        current_column++;
-                        return Token::make(PERCENT, loc);
-                    }
-                case '^':
-                    if (peek_input() == '=') {
-                        // xor-assign?
-                        current_column += 2;
-                        return Token::make(CARET_EQ, loc);
-                    } else {
-                        // xor?
-                        current_column++;
-                        return Token::make(CARET, loc);
-                    }
-                case '<':
-                    if (peek_input() == '<') {
-                        if (peek_input(1) == '=') {
-                            // left-shift assign
-                            skip_input(1);
-                            current_column += 3;
-
-                            return Token::make(LEFT_SHIFT_EQ, loc);
-                        } else {
-                            // left-shift
-                            skip_input();
-                            current_column += 2;
-
-                            return Token::make(LEFT_SHIFT, loc);
-                        }
-                    } else if (peek_input() == '=') {
-                        // smaller than or equal to
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(LESS_OR_EQUAL, loc);
-                    } else {
-                        // smaller than
-                        current_column++;
-                        return Token::make(LEFT_ANGLE, loc);
-                    }
-                    break;
-                case '>':
-                    if (peek_input() == '>') {
-                        if (peek_input(1) == '=') {
-                            // right-shift-assign
-                            skip_input(1);
-                            current_column += 3;
-
-                            return Token::make(RIGHT_SHIFT_EQ, loc);
-                        } else {
-                            // right-shift
-                            skip_input();
-                            current_column += 2;
-
-                            return Token::make(RIGHT_SHIFT, loc);
-                        }
-                    } else if (peek_input() == '=') {
-                        // larger than or equal to
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(GREATER_OR_EQUAL, loc);
-                    } else {
-                        // larger than
-                        current_column++;
-                        return Token::make(RIGHT_ANGLE, loc);
-                    }
-                case ':':
-                    if (peek_input() == ':') {
-                        // scope resolution ::
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(SCOPE_RESOLUTION, loc);
-                    } else {
-                        // single colon :
-                        current_column++;
-                        return Token::make(COLON, loc);
-                    }
-                case '!':
-                    // no special handling for macros in lexer?
-                    if (peek_input() == '=') {
-                        // not equal boolean operator
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(NOT_EQUAL, loc);
-                    } else {
-                        // not equal unary operator
-                        current_column++;
-
-                        return Token::make(EXCLAM, loc);
-                    }
-                case '?':
-                    current_column++;
-                    return Token::make(QUESTION_MARK, loc);
-                case '#':
-                    current_column++;
-                    return Token::make(HASH, loc);
-                case '[':
-                    current_column++;
-                    return Token::make(LEFT_SQUARE, loc);
-                case ']':
-                    current_column++;
-                    return Token::make(RIGHT_SQUARE, loc);
-                case '{':
-                    current_column++;
-                    return Token::make(LEFT_CURLY, loc);
-                case '}':
-                    current_column++;
-                    return Token::make(RIGHT_CURLY, loc);
-                case '@':
-                    current_column++;
-                    return Token::make(PATTERN_BIND, loc);
-                case '$':
-                    current_column++;
-                    return Token::make(DOLLAR_SIGN, loc);
-                case '~':
-                    current_column++;
-                    return Token::make(TILDE, loc);
-                case '\\':
-                    current_column++;
-                    return Token::make(BACKSLASH, loc);
-                case '`':
-                    current_column++;
-                    return Token::make(BACKTICK, loc);
-                case '|':
-                    if (peek_input() == '=') {
-                        // bitwise or-assign?
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(PIPE_EQ, loc);
-                    } else if (peek_input() == '|') {
-                        // logical or
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(OR, loc);
-                    } else {
-                        // bitwise or
-                        current_column++;
-
-                        return Token::make(PIPE, loc);
-                    }
-                case '&':
-                    if (peek_input() == '=') {
-                        // bitwise and-assign?
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(AMP_EQ, loc);
-                    } else if (peek_input() == '&') {
-                        // logical and
-                        skip_input();
-                        current_column += 2;
-
-                        return Token::make(LOGICAL_AND, loc);
-                    } else {
-                        // bitwise and/reference
-                        current_column++;
-
-                        return Token::make(AMP, loc);
-                    }
-                case '.':
-                    if (peek_input() == '.') {
-                        if (peek_input(1) == '.') {
-                            // ellipsis
-                            skip_input(1);
-                            current_column += 3;
-
-                            return Token::make(ELLIPSIS, loc);
-                        } else if (peek_input(1) == '=') {
-                            // ..=
-                            skip_input(1);
-                            current_column += 3;
-
-                            return Token::make(DOT_DOT_EQ, loc);
-                        } else {
-                            // ..
-                            skip_input();
-                            current_column += 2;
-
-                            return Token::make(DOT_DOT, loc);
-                        }
-                    } else if (!ISDIGIT(peek_input())) {
-                        // single dot .
-                        // Only if followed by a non-number
-                        current_column++;
-                        return Token::make(DOT, loc);
-                    }
-            }
-            // TODO: special handling of _ in the lexer? instead of being identifier
-
-            // byte and byte string test
-            if (current_char == 'b') {
-                if (peek_input() == '\'') {
-                    skip_input();
-                    current_column++;
-                    // make current char the next character
-                    current_char = peek_input();
-
-                    int length = 1;
-
-                    // char to save
-                    char byte_char = 0;
-
-                    // detect escapes
-                    if (current_char == '\\') {
-                        auto escape_length_pair = parse_escape('\'');
-                        byte_char = std::get<0>(escape_length_pair);
-                        length += std::get<1>(escape_length_pair);
-
-                        if (byte_char > 127) {
-                            rust_error_at(
-                              get_current_location(), "byte char '%c' out of range", byte_char);
-                            byte_char = 0;
-                        }
-
-                        current_char = peek_input();
-
-                        if (current_char != '\'') {
-                            rust_error_at(get_current_location(), "unclosed byte char");
-                        }
-
-                        skip_input();
-                        current_char = peek_input();
-                        length++; // go to next char
-                    } else if (current_char != '\'') {
-                        // otherwise, get character from direct input character
-                        byte_char = current_char;
-
-                        skip_input();
-                        current_char = peek_input();
-                        length++;
-
-                        if (current_char != '\'') {
-                            rust_error_at(get_current_location(), "unclosed byte char");
-                        }
-
-                        skip_input();
-                        current_char = peek_input();
-                        length++; // go to next char
-                    } else {
-                        rust_error_at(get_current_location(), "no character inside '' for byte char");
-                    }
-
-                    current_column += length;
-
-                    return Token::make_byte_char(loc, byte_char);
-                } else if (peek_input() == '"') {
-                    // byte string
-
-                    // skip quote character
-                    skip_input();
-                    current_column++;
-
-                    std::string str;
-                    str.reserve(16); // some sensible default
-
-                    int length = 1;
-                    current_char = peek_input();
-
-                    while (current_char != '"' && current_char != '\n') {
-                        if (current_char == '\\') {
-                            auto escape_length_pair = parse_escape('"');
-                            char output_char = std::get<0>(escape_length_pair);
-                            //length += escape_length_pair.second;
-
-                            // TODO: need to fix length - after escape, the length of the line up to the next non-whitespace char of the string is added to length, which is not what we want - we want length to be replaced by that.
-                            // possible option could if "if escape_length_pair.first == 0, then length = escape_length_pair.second else length += escape_length_pair.second."
-                            if (output_char == 0 && std::get<2>(escape_length_pair))
-                                length = std::get<1>(escape_length_pair) - 1;
-                            else
-                                length += std::get<1>(escape_length_pair);
-
-                            if (output_char > 127) {
-                                rust_error_at(get_current_location(),
-                                  "char '%c' in byte string out of range", output_char);
-                                output_char = 0;
-                            }
-
-                            if (output_char != 0)
-                                str += output_char;
-
-                            continue;
-                        }
-
-                        length++;
-
-                        str += current_char;
-                        skip_input();
-                        current_char = peek_input();
-                    }
-
-                    current_column += length;
-
-                    if (current_char == '\n') {
-                        rust_error_at(get_current_location(), "unended byte string literal");
-                    } else if (current_char == '"') {
-                        // TEST: hopefully column inc should make string line up properly
-                        current_column++;
-
-                        skip_input();
-                        current_char = peek_input();
-                    } else {
-                        gcc_unreachable();
-                    }
-
-                    str.shrink_to_fit();
-
-                    return Token::make_byte_string(loc, str);
-                } else if (peek_input() == 'r' && (peek_input(1) == '#' || peek_input(1) == '"')) {
-                    // raw byte string literals
-                    std::string str;
-                    str.reserve(16); // some sensible default
-
-                    int length = 1;
-                    int hash_count = 0;
-
-                    // get hash count at beginnning
-                    skip_input();
-                    current_char = peek_input();
-                    length++;
-                    while (current_char == '#') {
-                        hash_count++;
-                        length++;
-
-                        skip_input();
-                        current_char = peek_input();
-                    }
-
-                    if (current_char != '"') {
-                        rust_error_at(get_current_location(), "raw byte string has no opening '\"'");
-                    }
-
-                    skip_input();
-                    current_char = peek_input();
-                    length++;
-
-                    while (true) {
-                        if (current_char == '"') {
-                            bool enough_hashes = true;
-
-                            for (int i = 0; i < hash_count; i++) {
-                                if (peek_input(i + 1) != '#') {
-                                    enough_hashes = false; // could continue here -
-                                                           // improve performance
-                                }
-                            }
-
-                            if (enough_hashes) {
-                                // skip enough input and peek enough input
-                                skip_input(hash_count); // is this enough?
-                                current_char = peek_input();
-                                length += hash_count + 1;
-                                break;
-                            }
-                        }
-
-                        length++;
-
-                        str += current_char;
-                        skip_input();
-                        current_char = peek_input();
-                    }
-
-                    current_column += length;
-
-                    str.shrink_to_fit();
-
-                    return Token::make_byte_string(loc, str);
-                }
-            }
-
-            // raw stuff
-            if (current_char == 'r') {
-                int peek = peek_input();
-                int peek1 = peek_input(1);
-
-                if (peek == '#' && (ISALPHA(peek1) || peek1 == '_')) {
-                    // raw identifier
-                    std::string str;
-                    str.reserve(16); // default
-
-                    skip_input();
-                    current_char = peek_input();
-
-                    current_column += 2;
-
-                    str += current_char;
-
-                    bool first_is_underscore = current_char == '_';
-
-                    int length = 1;
-                    current_char = peek_input();
-                    // loop through entire name
-                    while (ISALPHA(current_char) || ISDIGIT(current_char) || current_char == '_') {
-                        length++;
-
-                        str += current_char;
-                        skip_input();
-                        current_char = peek_input();
-                    }
-
-                    current_column += length;
-
-                    // if just a single underscore, not an identifier
-                    if (first_is_underscore && length == 1) {
-                        rust_error_at(get_current_location(), "'_' is not a valid raw identifier");
-                    }
-
-                    if (str == "crate" || str == "extern" || str == "self" || str == "super"
-                        || str == "Self") {
-                        rust_error_at(
-                          get_current_location(), "'%s' is a forbidden raw identifier", str.c_str());
-                    } else {
-                        str.shrink_to_fit();
-
-                        return Token::make_identifier(loc, str);
-                    }
-                } else {
-                    int peek_index = 0;
-                    while (peek_input(peek_index) == '#')
-                        peek_index++;
-                    // TODO: optimise by using "peek_index" as the hash count - 1 or something
-
-                    if (peek_input(peek_index) == '"') {
-                        // raw string literals
-                        std::string str;
-                        str.reserve(16); // some sensible default
-
-                        int length = 1;
-                        int hash_count = 0;
-
-                        // get hash count at beginnning
-                        current_char = peek;
-                        while (current_char == '#') {
-                            hash_count++;
-                            length++;
-
-                            skip_input();
-                            current_char = peek_input();
-                        }
-
-                        if (current_char != '"') {
-                            rust_error_at(get_current_location(), "raw string has no opening '\"'");
-                        }
-
-                        length++;
-                        skip_input();
-                        Codepoint current_char32 = test_peek_codepoint_input();
-
-                        // TODO: didn't account for current_column++ somewhere - one less than is required
+  str.shrink_to_fit ();
 
-                        while (true) {
-                            if (current_char32.value == '"') {
-                                bool enough_hashes = true;
+  return Token::make_byte_string (loc, str);
+}
 
-                                for (int i = 0; i < hash_count; i++) {
-                                    // if (test_peek_codepoint_input(i + 1) != '#') {
-                                    // TODO: ensure this is a good enough replacement
-                                    if (peek_input(i + 1) != '#') {
-                                        enough_hashes = false; // could continue here -
-                                                               // improve performance
-                                    }
-                                }
+// Parses the name of a raw identifier; returns an identifier token, or nullptr for a forbidden name.
+TokenPtr
+Lexer::parse_raw_identifier (Location loc)
+{
+  // text of the identifier, built up by the loop below
+  std::string str;
+  str.reserve (16); // default
 
-                                if (enough_hashes) {
-                                    // skip enough input and peek enough input
-                                    skip_input(hash_count); // is this enough?
-                                    current_char = peek_input();
-                                    length += hash_count + 1;
-                                    break;
-                                }
-                            }
+  skip_input (); // NOTE(review): presumably consumes the '#' of "r#" - caller not visible here
+  current_char = peek_input ();
 
-                            length++;
+  current_column += 2; // presumably accounts for the "r#" prefix - TODO confirm
 
-                            str += current_char32;
-                            test_skip_codepoint_input();
-                            current_char32 = test_peek_codepoint_input();
-                        }
+  // first char is only peeked (not skipped) here, so the loop below appends it exactly once
 
-                        current_column += length;
+  bool first_is_underscore = current_char == '_';
 
-                        str.shrink_to_fit();
+  int length = 0;
+  current_char = peek_input ();
+  // loop through entire name
+  while (ISALPHA (current_char) || ISDIGIT (current_char)
+	 || current_char == '_')
+    {
+      length++;
 
-                        return Token::make_string(loc, str);
-                    }
-                }
-            }
-
-            // find identifiers and keywords
-            if (ISALPHA(current_char) || current_char == '_') {
-                std::string str;
-                str.reserve(16); // default
-                str += current_char;
-
-                bool first_is_underscore = current_char == '_';
-
-                int length = 1;
-                current_char = peek_input();
-                // loop through entire name
-                while (ISALPHA(current_char) || ISDIGIT(current_char) || current_char == '_') {
-                    length++;
-
-                    str += current_char;
-                    skip_input();
-                    current_char = peek_input();
-                }
+      str += current_char;
+      skip_input ();
+      current_char = peek_input ();
+    }
 
-                current_column += length;
+  current_column += length;
 
-                // if just a single underscore, not an identifier
-                if (first_is_underscore && length == 1)
-                    return Token::make(UNDERSCORE, loc);
+  // if just a single underscore, not an identifier
+  if (first_is_underscore && length == 1)
+    rust_error_at (get_current_location (),
+		   "'_' is not a valid raw identifier"); // reported, but lexing continues
 
-                str.shrink_to_fit();
+  if (str == "crate" || str == "extern" || str == "self" || str == "super"
+      || str == "Self")
+    {
+      rust_error_at (get_current_location (),
+		     "'%s' is a forbidden raw identifier", str.c_str ());
 
-                TokenId keyword = classify_keyword(str);
-                if (keyword == IDENTIFIER)
-                    return Token::make_identifier(loc, str);
-                else
-                    return Token::make(keyword, loc);
-            }
+      return nullptr; // no token is produced for a forbidden name
+    }
+  else
+    {
+      str.shrink_to_fit ();
 
-            // identify literals
-            // int or float literals - not processed properly
-            if (ISDIGIT(current_char) || current_char == '.') { //  _ not allowed as first char
-                std::string str;
-                str.reserve(16); // some sensible default
-                str += current_char;
+      return Token::make_identifier (loc, str);
+    }
+}
 
-                PrimitiveCoreType type_hint = CORETYPE_UNKNOWN;
+// Parses a string literal up to its closing '"' (a newline first is reported as an error) and returns a string token.
+TokenPtr
+Lexer::parse_string (Location loc)
+{
+  Codepoint current_char32;
 
-                bool is_real = (current_char == '.');
+  std::string str;
+  str.reserve (16); // some sensible default
 
-                int length = 1;
+  int length = 1; // column width so far; presumably the 1 is the opening quote - TODO confirm
+  current_char32 = peek_codepoint_input ();
 
-                // handle binary, octal, hex literals
-                if (current_char == '0' && !ISDIGIT(peek_input())) {
-                    current_char = peek_input();
+  while (current_char32.value != '\n' && current_char32.value != '"')
+    {
+      if (current_char32.value == '\\')
+	{
+	  // parse escape; the tuple holds (codepoint, length, flag)
+	  auto utf8_escape_pair = parse_utf8_escape ('\'');
+	  current_char32 = std::get<0> (utf8_escape_pair);
 
-                    if (current_char == 'x') {
-                        // hex (integer only)
+	  if (current_char32 == Codepoint (0) && std::get<2> (utf8_escape_pair))
+	    length = std::get<1> (utf8_escape_pair) - 1; // NOTE(review): length is replaced, not added to, here - confirm intent
+	  else
+	    length += std::get<1> (utf8_escape_pair);
 
-                        skip_input();
-                        current_char = peek_input();
+	  if (current_char32 != Codepoint (0)) // a zero codepoint is never appended to the string
+	    str += current_char32;
 
-                        length++;
+	  // re-peek: the escape parser does not update current_char32 to the next input
+	  current_char32 = peek_codepoint_input ();
 
-                        // add 'x' to string after 0 so it is 0xFFAA or whatever
-                        str += 'x';
+	  continue;
+	}
 
-                        // loop through to add entire hex number to string
-                        while (is_x_digit(current_char) || current_char == '_') {
-                            if (current_char == '_') {
-                                // don't add _ to number
-                                skip_input();
-                                current_char = peek_input();
+      length += get_input_codepoint_length ();
 
-                                length++;
+      str += current_char32;
+      skip_codepoint_input ();
+      current_char32 = peek_codepoint_input ();
+    }
 
-                                continue;
-                            }
+  current_column += length;
 
-                            length++;
+  if (current_char32.value == '\n')
+    {
+      rust_error_at (get_current_location (), "unended string literal"); // a token is still returned below
+    }
+  else if (current_char32.value == '"')
+    {
+      current_column++;
 
-                            // add raw hex numbers
-                            str += current_char;
-                            skip_input();
-                            current_char = peek_input();
-                        }
+      skip_input (); // step past the closing quote
+      current_char = peek_input ();
+    }
+  else
+    {
+      gcc_unreachable (); // the loop above only exits on '\n' or '"'
+    }
 
-                        current_column += length;
+  str.shrink_to_fit ();
+  return Token::make_string (loc, str);
+}
+
+// Parses an identifier or keyword beginning with current_char; a lone '_' yields an UNDERSCORE token.
+TokenPtr
+Lexer::parse_identifier_or_keyword (Location loc)
+{
+  std::string str;
+  str.reserve (16); // default
+  str += current_char; // first character of the name is taken from current_char
+
+  bool first_is_underscore = current_char == '_';
+
+  int length = 1; // counts the first character added above
+  current_char = peek_input ();
+  // loop through entire name
+  while (ISALPHA (current_char) || ISDIGIT (current_char)
+	 || current_char == '_')
+    {
+      length++;
+
+      str += current_char;
+      skip_input ();
+      current_char = peek_input ();
+    }
 
-                        // convert hex value to decimal representation
-                        long hex_num = std::strtol(str.c_str(), NULL, 16);
+  current_column += length;
+
+  // if just a single underscore, not an identifier
+  if (first_is_underscore && length == 1)
+    return Token::make (UNDERSCORE, loc);
+
+  str.shrink_to_fit ();
+
+  TokenId keyword = classify_keyword (str); // IDENTIFIER here means the text is not a keyword
+  if (keyword == IDENTIFIER)
+    return Token::make_identifier (loc, str);
+  else
+    return Token::make (keyword, loc);
+}
+
+// Returns a raw string token if the upcoming input is zero or more '#' then '"'; otherwise returns nullptr (only peek_input is used).
+TokenPtr
+Lexer::maybe_parse_raw_string (Location loc)
+{
+  int peek_index = 0; // ends up as the number of leading '#' characters
+  while (peek_input (peek_index) == '#')
+    peek_index++;
+
+  if (peek_input (peek_index) == '"')
+    return parse_raw_string (loc, peek_index);
+  else
+    return nullptr;
+}
+
+// Parses a raw string literal delimited by initial_hash_count '#' characters and returns a string token.
+TokenPtr
+Lexer::parse_raw_string (Location loc, int initial_hash_count)
+{
+  // raw string literals
+  std::string str;
+  str.reserve (16); // some sensible default
+
+  int length = 1 + initial_hash_count;
+
+  if (initial_hash_count > 0)
+    skip_input (initial_hash_count - 1); // NOTE(review): presumably skip_input (n) skips n + 1 chars - confirm
+
+  current_char = peek_input ();
+
+  if (current_char != '"')
+    rust_error_at (get_current_location (), "raw string has no opening '\"'"); // reported, but parsing continues
+
+  length++;
+  skip_input ();
+  Codepoint current_char32 = peek_codepoint_input ();
+
+  while (true) // NOTE(review): no end-of-input exit is visible here - confirm an unterminated literal cannot loop forever
+    {
+      if (current_char32.value == '"')
+	{
+	  bool enough_hashes = true;
+
+	  for (int i = 0; i < initial_hash_count; i++)
+	    {
+	      if (peek_input (i + 1) != '#')
+		{
+		  enough_hashes = false;
+		  break;
+		}
+	    }
+
+	  if (enough_hashes)
+	    {
+	      // the '"' is followed by the required '#'s: this is the terminator
+	      skip_input (initial_hash_count);
+	      current_char = peek_input ();
+	      length += initial_hash_count + 1;
+	      break;
+	    }
+	}
+
+      length++;
+
+      str += current_char32;
+      skip_codepoint_input ();
+      current_char32 = peek_codepoint_input ();
+    }
 
-                        str = std::to_string(hex_num);
+  current_column += length;
 
-                        // parse in type suffix if it exists
-                        auto type_suffix_pair = parse_in_type_suffix();
-                        type_hint = type_suffix_pair.first;
-                        length += type_suffix_pair.second;
+  str.shrink_to_fit ();
 
-                        if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) {
-                            rust_error_at(get_current_location(),
-                              "invalid type suffix '%s' for integer (hex) literal",
-                              get_type_hint_string(type_hint));
-                        }
-                    } else if (current_char == 'o') {
-                        // octal (integer only)
+  return Token::make_string (loc, str);
+}
 
-                        skip_input();
-                        current_char = peek_input();
+template <typename IsDigitFunc> // parses digits and suffix of a hex/octal/binary int literal; is_digit_func tests one digit
+TokenPtr
+Lexer::parse_non_decimal_int_literal (Location loc, IsDigitFunc is_digit_func,
+				      std::string existent_str, int base)
+{
+  int length = 1; // column width so far; existent_str holds the text already read by the caller
 
-                        length++;
+  skip_input ();
+  current_char = peek_input ();
 
-                        // loop through to add entire octal number to string
-                        while (is_octal_digit(current_char) || current_char == '_') {
-                            if (current_char == '_') {
-                                // don't add _ to number
-                                skip_input();
-                                current_char = peek_input();
+  length++;
 
-                                length++;
+  // loop through to add entire number to string
+  while (is_digit_func (current_char) || current_char == '_')
+    {
+      if (current_char == '_')
+	{
+	  // don't add _ to number
+	  skip_input ();
+	  current_char = peek_input ();
 
-                                continue;
-                            }
+	  length++;
 
-                            length++;
+	  continue;
+	}
 
-                            // add raw octal numbers
-                            str += current_char;
-                            skip_input();
-                            current_char = peek_input();
-                        }
+      length++;
 
-                        current_column += length;
+      // add raw numbers
+      existent_str += current_char;
+      skip_input ();
+      current_char = peek_input ();
+    }
 
-                        // convert octal value to decimal representation
-                        long octal_num = std::strtol(str.c_str(), NULL, 8);
+  // convert value to decimal representation; unsigned 64-bit so large literals are not clamped to LONG_MAX
+  unsigned long long dec_num = std::strtoull (existent_str.c_str (), nullptr, base);
+
+  existent_str = std::to_string (dec_num);
+
+  // parse in type suffix if it exists
+  auto type_suffix_pair = parse_in_type_suffix ();
+  PrimitiveCoreType type_hint = type_suffix_pair.first;
+  length += type_suffix_pair.second;
+
+  current_column += length;
+
+  if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64)
+    {
+      rust_error_at (get_current_location (),
+		     "invalid type suffix '%s' for integer (%s) literal",
+		     get_type_hint_string (type_hint),
+		     base == 16
+		       ? "hex"
+		       : (base == 8 ? "octal"
+				    : (base == 2 ? "binary"
+						 : "<insert unknown base>")));
+      return nullptr; // a float suffix on an integer literal produces no token
+    }
+  return Token::make_int (loc, existent_str, type_hint);
+}
+
+// Parses a hex, binary or octal int literal; returns nullptr if the next char is not 'x', 'o' or 'b'.
+TokenPtr
+Lexer::parse_non_decimal_int_literals (Location loc)
+{
+  std::string str;
+  str.reserve (16); // some sensible default
+  str += current_char; // presumably the leading '0' - TODO confirm against caller
+
+  current_char = peek_input (); // base marker is peeked here, not yet skipped
+
+  if (current_char == 'x')
+    {
+      // hex (integer only); the 'x' is appended so the digit string starts "<first char>x"
+      return parse_non_decimal_int_literal (loc, is_x_digit, str + "x", 16);
+    }
+  else if (current_char == 'o')
+    {
+      // octal (integer only)
+      return parse_non_decimal_int_literal (loc, is_octal_digit,
+					    std::move (str), 8);
+    }
+  else if (current_char == 'b')
+    {
+      // binary (integer only)
+      return parse_non_decimal_int_literal (loc, is_bin_digit, std::move (str),
+					    2);
+    }
+  else
+    {
+      return nullptr; // not a recognised base marker
+    }
+}
+
+// Parses a decimal int or float literal beginning with current_char and returns an int or float token.
+TokenPtr
+Lexer::parse_decimal_int_or_float (Location loc)
+{
+  std::string str;
+  str.reserve (16); // some sensible default
+  str += current_char;
+
+  int length = 1; // counts the first character added above
+
+  current_char = peek_input ();
+
+  // parse initial decimal integer (or first integer part of float) literal
+  auto initial_decimal_pair = parse_in_decimal (); // pair of (text, length)
+  str += initial_decimal_pair.first;
+  length += initial_decimal_pair.second;
+
+  // detect float literal
+  if (current_char == '.' && is_float_digit (peek_input (1)))
+    {
+      // float with a '.', parse another decimal into it
+
+      // add . to str
+      str += current_char;
+      skip_input ();
+      current_char = peek_input ();
+      length++;
+
+      // parse another decimal number for float
+      auto second_decimal_pair = parse_in_decimal ();
+      str += second_decimal_pair.first;
+      length += second_decimal_pair.second;
+
+      // parse in exponent part if it exists
+      auto exponent_pair = parse_in_exponent_part ();
+      str += exponent_pair.first;
+      length += exponent_pair.second;
+
+      // parse in type suffix if it exists
+      auto type_suffix_pair = parse_in_type_suffix ();
+      PrimitiveCoreType type_hint = type_suffix_pair.first;
+      length += type_suffix_pair.second;
+
+      if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
+	  && type_hint != CORETYPE_UNKNOWN)
+	{
+	  rust_error_at (get_current_location (),
+			 "invalid type suffix '%s' for float literal",
+			 get_type_hint_string (type_hint));
+	  // ignore invalid type suffix as everything else seems fine
+	  type_hint = CORETYPE_UNKNOWN;
+	}
+
+      current_column += length;
+
+      str.shrink_to_fit ();
+      return Token::make_float (loc, str, type_hint);
+    }
+  else if (current_char == '.' && check_valid_float_dot_end (peek_input (1)))
+    {
+      // float that is just an integer with a terminating '.' character
+
+      // add . to str
+      str += current_char;
+      skip_input ();
+      current_char = peek_input ();
+      length++;
+
+      // add a '0' after the . to prevent ambiguity
+      str += '0';
 
-                        str = std::to_string(octal_num);
+      // type hint not allowed, so no suffix is parsed in this branch
 
-                        // parse in type suffix if it exists
-                        // parse_in_type_suffix (/*current_char, */ type_hint, length);
-                        auto type_suffix_pair = parse_in_type_suffix();
-                        type_hint = type_suffix_pair.first;
-                        length += type_suffix_pair.second;
+      current_column += length;
+
+      str.shrink_to_fit ();
+      return Token::make_float (loc, str, CORETYPE_UNKNOWN);
+    }
+  else if (current_char == 'E' || current_char == 'e')
+    {
+      // exponent float with no '.' character
+
+      // parse exponent part
+      auto exponent_pair = parse_in_exponent_part ();
+      str += exponent_pair.first;
+      length += exponent_pair.second;
+
+      // parse in type suffix if it exists
+      auto type_suffix_pair = parse_in_type_suffix ();
+      PrimitiveCoreType type_hint = type_suffix_pair.first;
+      length += type_suffix_pair.second;
+
+      if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
+	  && type_hint != CORETYPE_UNKNOWN)
+	{
+	  rust_error_at (get_current_location (),
+			 "invalid type suffix '%s' for float literal",
+			 get_type_hint_string (type_hint));
+	  // ignore invalid type suffix as everything else seems fine
+	  type_hint = CORETYPE_UNKNOWN;
+	}
+
+      current_column += length;
+
+      str.shrink_to_fit ();
+      return Token::make_float (loc, str, type_hint);
+    }
+  else
+    {
+      // is an integer
+
+      // parse in type suffix if it exists
+      auto type_suffix_pair = parse_in_type_suffix ();
+      PrimitiveCoreType type_hint = type_suffix_pair.first;
+      length += type_suffix_pair.second;
+
+      if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64)
+	{
+	  rust_error_at (get_current_location (),
+			 "invalid type suffix '%s' for integer "
+			 "(decimal) literal",
+			 get_type_hint_string (type_hint));
+	  // ignore invalid type suffix as everything else seems fine
+	  type_hint = CORETYPE_UNKNOWN;
+	}
+
+      current_column += length;
+
+      str.shrink_to_fit ();
+      return Token::make_int (loc, str, type_hint);
+    }
+}
+
+TokenPtr
+Lexer::parse_char_or_lifetime (Location loc)
+{
+  Codepoint current_char32;
+
+  int length = 1;
+
+  current_char32 = peek_codepoint_input ();
+
+  // parse escaped char literal
+  if (current_char32.value == '\\')
+    {
+      // parse escape
+      auto utf8_escape_pair = parse_utf8_escape ('\'');
+      current_char32 = std::get<0> (utf8_escape_pair);
+      length += std::get<1> (utf8_escape_pair);
+
+      if (peek_codepoint_input ().value != '\'')
+	{
+	  rust_error_at (get_current_location (), "unended char literal");
+	}
+      else
+	{
+	  skip_codepoint_input ();
+	  current_char = peek_input ();
+	  length++;
+	}
+
+      current_column += length;
+
+      return Token::make_char (loc, current_char32);
+    }
+  else
+    {
+      skip_codepoint_input ();
+
+      if (peek_codepoint_input ().value == '\'')
+	{
+	  // parse non-escaped char literal
+
+	  // skip the ' character
+	  skip_input ();
+	  current_char = peek_input ();
+
+	  // TODO fix due to different widths of utf-8 chars?
+	  current_column += 3;
+
+	  return Token::make_char (loc, current_char32);
+	}
+      else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value)
+	       || current_char32.value == '_')
+	{
+	  // parse lifetime name
+	  std::string str;
+	  str += current_char32;
+	  length++;
+
+	  current_char = peek_input ();
+	  while (ISDIGIT (current_char) || ISALPHA (current_char)
+		 || current_char == '_')
+	    {
+	      str += current_char;
+	      skip_input ();
+	      current_char = peek_input ();
+	      length++;
+	    }
+
+	  current_column += length;
+
+	  str.shrink_to_fit ();
+	  return Token::make_lifetime (loc, str);
+	}
+      else
+	{
+	  rust_error_at (get_current_location (),
+			 "expected ' after character constant in char literal");
+	}
+    }
+}
 
-                        if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) {
-                            rust_error_at(get_current_location(),
-                              "invalid type suffix '%s' for integer (octal) literal",
-                              get_type_hint_string(type_hint));
-                        }
-                    } else if (current_char == 'b') {
-                        // binary (integer only)
+// Returns the length of the codepoint at the current position.
+int
+Lexer::get_input_codepoint_length ()
+{
+  uint8_t input = peek_input ();
 
-                        skip_input();
-                        current_char = peek_input();
+  if (input < 128)
+    {
+      // ascii -- 1 byte
+      // return input;
 
-                        length++;
+      return 1;
+    }
+  else if ((input & 0xC0) == 0x80)
+    {
+      // invalid (continuation; can't be first char)
+      // return 0xFFFE;
 
-                        // loop through to add entire binary number to string
-                        while (is_bin_digit(current_char) || current_char == '_') {
-                            if (current_char == '_') {
-                                // don't add _ to number
-                                skip_input();
-                                current_char = peek_input();
-
-                                length++;
-
-                                continue;
-                            }
-
-                            length++;
-
-                            // add raw binary numbers
-                            str += current_char;
-                            skip_input();
-                            current_char = peek_input();
-                        }
-
-                        current_column += length;
-
-                        // convert binary value to decimal representation
-                        long bin_num = std::strtol(str.c_str(), NULL, 2);
-
-                        str = std::to_string(bin_num);
-
-                        // parse in type suffix if it exists
-                        // parse_in_type_suffix (/*current_char, */ type_hint, length);
-                        auto type_suffix_pair = parse_in_type_suffix();
-                        type_hint = type_suffix_pair.first;
-                        length += type_suffix_pair.second;
-
-                        if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) {
-                            rust_error_at(get_current_location(),
-                              "invalid type suffix '%s' for integer (binary) literal",
-                              get_type_hint_string(type_hint));
-                        }
-                    }
-                } else {
-                    // handle decimals (integer or float)
-
-                    current_char = peek_input();
-
-                    // parse initial decimal literal - assuming integer
-                    // parse_in_decimal (/*current_char, */ str, length);
-                    auto str_length_pair = parse_in_decimal();
-                    str += str_length_pair.first;
-                    length += str_length_pair.second;
-
-                    // detect float literal - TODO: fix: "242." is not recognised as a
-                    // float literal
-                    if (current_char == '.' && is_float_digit(peek_input(1))) {
-                        // float with a '.', parse another decimal into it
-
-                        is_real = true;
-
-                        // add . to str
-                        str += current_char;
-                        skip_input();
-                        current_char = peek_input();
-
-                        length++;
-
-                        // parse another decimal number for float
-                        auto str_length_pair2 = parse_in_decimal();
-                        str += str_length_pair2.first;
-                        length += str_length_pair2.second;
-
-                        // parse in exponent part if it exists
-                        auto exponent_part = parse_in_exponent_part();
-                        str += exponent_part.first;
-                        length += exponent_part.second;
-
-                        // parse in type suffix if it exists
-                        auto type_suffix_pair = parse_in_type_suffix();
-                        type_hint = type_suffix_pair.first;
-                        length += type_suffix_pair.second;
-
-                        if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
-                            && type_hint != CORETYPE_UNKNOWN) {
-                            rust_error_at(get_current_location(),
-                              "invalid type suffix '%s' for float literal",
-                              get_type_hint_string(type_hint));
-                        }
-                    } else if (current_char == '.' && check_valid_float_dot_end(peek_input(1))) {
-                        is_real = true;
-
-                        // add . to str
-                        str += current_char;
-                        skip_input();
-                        current_char = peek_input();
-                        length++;
-
-                        // add a '0' after the . to stop ambiguity
-                        str += '0';
-
-                        // don't parse another decimal number for float
-
-                        if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
-                            && type_hint != CORETYPE_UNKNOWN) {
-                            rust_error_at(get_current_location(),
-                              "invalid type suffix '%s' for float literal",
-                              get_type_hint_string(type_hint));
-                        }
-                    } else if (current_char == 'E' || current_char == 'e') {
-                        is_real = true;
-
-                        // parse exponent part
-                        // parse_in_exponent_part (/*current_char, */ str, length);
-                        auto exponent_part = parse_in_exponent_part();
-                        str += exponent_part.first;
-                        length += exponent_part.second;
-
-                        // parse in type suffix if it exists
-                        // parse_in_type_suffix (/*current_char, */ type_hint, length);
-                        auto type_suffix_pair = parse_in_type_suffix();
-                        type_hint = type_suffix_pair.first;
-                        length += type_suffix_pair.second;
-
-                        if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
-                            && type_hint != CORETYPE_UNKNOWN) {
-                            rust_error_at(get_current_location(),
-                              "invalid type suffix '%s' for float literal",
-                              get_type_hint_string(type_hint));
-                        }
-                    } else {
-                        // is an integer
-
-                        // parse in type suffix if it exists
-                        // parse_in_type_suffix (/*current_char, */ type_hint, length);
-                        auto type_suffix_pair = parse_in_type_suffix();
-                        type_hint = type_suffix_pair.first;
-                        length += type_suffix_pair.second;
-
-                        if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) {
-                            rust_error_at(get_current_location(),
-                              "invalid type suffix '%s' for integer "
-                              "(decimal) literal",
-                              get_type_hint_string(type_hint));
-                        }
-                    }
-
-                    current_column += length;
-                }
-
-                str.shrink_to_fit();
-
-                // actually make the tokens
-                if (is_real)
-                    return Token::make_float(loc, str, type_hint);
-                else
-                    return Token::make_int(loc, str, type_hint);
-            }
-
-            // string literals - not processed properly
-            if (current_char == '"') {
-                Codepoint current_char32;
-
-                std::string str;
-                str.reserve(16); // some sensible default
-
-                int length = 1;
-                current_char32 = test_peek_codepoint_input();
-
-                while (current_char32.value != '\n' && current_char32.value != '"') {
-                    if (current_char32.value == '\\') {
-                        // parse escape
-                        auto utf8_escape_pair = parse_utf8_escape('\'');
-                        current_char32 = std::get<0>(utf8_escape_pair);
-                        //length += utf8_escape_pair.second;
-
-                        // TODO: need to fix length - after escape, the length of the line up to the next non-whitespace char of the string is added to length, which is not what we want - we want length to be replaced by that.
-                        // possible option could if "if escape_length_pair.first == 0, then length = escape_length_pair.second else length += escape_length_pair.second."
-                        if (current_char32 == Codepoint(0) && std::get<2>(utf8_escape_pair))
-                            length = std::get<1>(utf8_escape_pair);
-                        else
-                            length += std::get<1>(utf8_escape_pair);
-
-                        if (current_char32 != Codepoint(0))
-                            str += current_char32;
-
-                        // required as parsing utf8 escape only changes current_char
-                        // or something
-                        current_char32 = test_peek_codepoint_input();
-
-                        continue;
-                    }
-
-                    length += test_get_input_codepoint_length();
-
-                    str += current_char32;
-                    test_skip_codepoint_input();
-                    current_char32 = test_peek_codepoint_input();
-                }
-
-                current_column += length;
-
-                if (current_char32.value == '\n') {
-                    rust_error_at(get_current_location(), "unended string literal");
-                } else if (current_char32.value == '"') {
-                    current_column++;
-                    
-                    skip_input();
-                    current_char = peek_input();
-                } else {
-                    gcc_unreachable();
-                }
-
-                str.shrink_to_fit();
-                return Token::make_string(loc, str);
-            }
-
-            // char literal attempt
-            if (current_char == '\'') {
-                Codepoint current_char32;
-
-                int length = 1;
-
-                current_char32 = test_peek_codepoint_input();
-
-                // parse escaped char literal
-                if (current_char32.value == '\\') {
-                    // parse escape
-                    auto utf8_escape_pair = parse_utf8_escape('\'');
-                    current_char32 = std::get<0>(utf8_escape_pair);
-                    length += std::get<1>(utf8_escape_pair);
-
-                    if (test_peek_codepoint_input().value != '\'') {
-                        rust_error_at(get_current_location(), "unended char literal");
-                    } else {
-                        test_skip_codepoint_input();
-                        current_char = peek_input();
-                        length++;
-                    }
-
-                    current_column += length;
-
-                    return Token::make_char(loc, current_char32);
-                } else {
-                    // current_char32 = test_peek_codepoint_input();
-                    test_skip_codepoint_input();
-
-                    if (test_peek_codepoint_input().value == '\'') {
-                        // parse normal char literal
-
-                        // skip the ' character
-                        skip_input();
-                        current_char = peek_input();
-
-                        // TODO fix due to different widths of utf-8 chars
-                        current_column += 3;
-
-                        return Token::make_char(loc, current_char32);
-                    } else if (ISDIGIT(current_char32.value) || ISALPHA(current_char32.value)
-                               || current_char32.value == '_') {
-                        // parse lifetime name
-                        std::string str;
-                        str += current_char32;
-
-                        /* TODO: fix lifetime name thing - actually, why am I even
-                         * using utf-8 here? */
-
-                        int length = 1;
-
-                        current_char32 = test_peek_codepoint_input();
-
-                        while (ISDIGIT(current_char32.value) || ISALPHA(current_char32.value)
-                               || current_char32.value == '_') {
-                            length += test_get_input_codepoint_length();
-
-                            str += current_char32;
-                            test_skip_codepoint_input();
-                            current_char32 = test_peek_codepoint_input();
-                        }
-
-                        current_column += length;
-
-                        str.shrink_to_fit();
-                        return Token::make_lifetime(loc, str);
-                    } else {
-                        rust_error_at(get_current_location(), "expected ' after character constant");
-                    }
-                }
-            }
-
-            // didn't match anything so error
-            rust_error_at(loc, "unexpected character '%x'", current_char);
-            current_column++;
-        }
-    }
-
-    // Shitty pass-by-reference way of parsing in type suffix.
-    std::pair<PrimitiveCoreType, int> Lexer::parse_in_type_suffix() {
-        std::string suffix;
-        suffix.reserve(5);
-
-        int additional_length_offset = 0;
-
-        // get suffix
-        while (ISALPHA(current_char) || ISDIGIT(current_char) || current_char == '_') {
-            if (current_char == '_') {
-                // don't add _ to suffix
-                skip_input();
-                current_char = peek_input();
-
-                additional_length_offset++;
-
-                continue;
-            }
-
-            additional_length_offset++;
-
-            suffix += current_char;
-            skip_input();
-            current_char = peek_input();
-        }
-
-        if (suffix.empty()) {
-            // no type suffix: do nothing but also no error
-            return std::make_pair(CORETYPE_UNKNOWN, additional_length_offset);
-        } else if (suffix == "f32") {
-            return std::make_pair(CORETYPE_F32, additional_length_offset);
-        } else if (suffix == "f64") {
-            return std::make_pair(CORETYPE_F64, additional_length_offset);
-        } else if (suffix == "i8") {
-            return std::make_pair(CORETYPE_I8, additional_length_offset);
-        } else if (suffix == "i16") {
-            return std::make_pair(CORETYPE_I16, additional_length_offset);
-        } else if (suffix == "i32") {
-            return std::make_pair(CORETYPE_I32, additional_length_offset);
-        } else if (suffix == "i64") {
-            return std::make_pair(CORETYPE_I64, additional_length_offset);
-        } else if (suffix == "i128") {
-            return std::make_pair(CORETYPE_I128, additional_length_offset);
-        } else if (suffix == "isize") {
-            return std::make_pair(CORETYPE_ISIZE, additional_length_offset);
-        } else if (suffix == "u8") {
-            return std::make_pair(CORETYPE_U8, additional_length_offset);
-        } else if (suffix == "u16") {
-            return std::make_pair(CORETYPE_U16, additional_length_offset);
-        } else if (suffix == "u32") {
-            return std::make_pair(CORETYPE_U32, additional_length_offset);
-        } else if (suffix == "u64") {
-            return std::make_pair(CORETYPE_U64, additional_length_offset);
-        } else if (suffix == "u128") {
-            return std::make_pair(CORETYPE_U128, additional_length_offset);
-        } else if (suffix == "usize") {
-            return std::make_pair(CORETYPE_USIZE, additional_length_offset);
-        } else {
-            rust_error_at(get_current_location(), "unknown number suffix '%s'", suffix.c_str());
-
-            return std::make_pair(CORETYPE_UNKNOWN, additional_length_offset);
-        }
-    }
-
-    std::pair<std::string, int> Lexer::parse_in_exponent_part() {
-        int additional_length_offset = 0;
-        std::string str;
-        if (current_char == 'E' || current_char == 'e') {
-            // add exponent to string as strtod works with it
-            str += current_char;
-            skip_input();
-            current_char = peek_input();
-
-            additional_length_offset++;
-
-            // special - and + handling
-            if (current_char == '-') {
-                str += '-';
-
-                skip_input();
-                current_char = peek_input();
-
-                additional_length_offset++;
-            } else if (current_char == '+') {
-                // don't add + but still skip input
-                skip_input();
-                current_char = peek_input();
-
-                additional_length_offset++;
-            }
-
-            // parse another decimal number for exponent
-            auto str_length_pair = parse_in_decimal();
-            str += str_length_pair.first;
-            additional_length_offset += str_length_pair.second;
-        }
-        return std::make_pair(str, additional_length_offset);
-    }
-
-    std::pair<std::string, int> Lexer::parse_in_decimal() {
-        int additional_length_offset = 0;
-        std::string str;
-        while (ISDIGIT(current_char) || current_char == '_') {
-            if (current_char == '_') {
-                // don't add _ to number
-                skip_input();
-                current_char = peek_input();
-
-                additional_length_offset++;
-
-                continue;
-            }
-
-            additional_length_offset++;
-
-            str += current_char;
-            skip_input();
-            current_char = peek_input();
-        }
-        return std::make_pair(str, additional_length_offset);
-    }
-
-    /* Parses escapes (and string continues) in "byte" strings and characters. Does not support unicode. */
-    std::tuple<char, int, bool> Lexer::parse_escape(char opening_char) {
-        int additional_length_offset = 0;
-        char output_char = 0;
-
-        // skip to actual letter
-        skip_input();
-        current_char = peek_input();
-        additional_length_offset++;
-
-        switch (current_char) {
-            case 'x': {
-                // hex char string (null-terminated)
-                char hexNum[3] = { 0, 0, 0 };
-
-                // first hex char
-                skip_input();
-                current_char = peek_input();
-                additional_length_offset++;
-
-                if (!is_x_digit(current_char)) {
-                    rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
-                      current_char);
-                }
-                hexNum[0] = current_char;
-
-                // second hex char
-                skip_input();
-                current_char = peek_input();
-                additional_length_offset++;
-
-                if (!is_x_digit(current_char)) {
-                    rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
-                      current_char);
-                }
-                hexNum[1] = current_char;
-
-                long hexLong = std::strtol(hexNum, NULL, 16);
-
-                if (hexLong > 255 || hexLong < 0)
-                    rust_error_at(get_current_location(),
-                      "byte \\x escape '\\x%s' out of range - allows up to '\\xFF'", hexNum);
-                char hexChar = static_cast<char>(hexLong);
-
-                output_char = hexChar;
-            } break;
-            case 'n':
-                output_char = '\n';
-                break;
-            case 'r':
-                output_char = '\r';
-                break;
-            case 't':
-                output_char = '\t';
-                break;
-            case '\\':
-                output_char = '\\';
-                break;
-            case '0':
-                output_char = '\0';
-                break;
-            case '\'':
-                output_char = '\'';
-                break;
-            case '"':
-                output_char = '"';
-                break;
-            case 'u':
-                rust_error_at(get_current_location(),
-                  "cannot have a unicode escape \\u in a byte %s!",
-                  opening_char == '\'' ? "character" : "string");
-                return std::make_tuple(output_char, additional_length_offset, false);
-#if 0
-			{
-                // TODO: shouldn't be used with this - use parse_utf8_escape
-
-                skip_input();
-                current_char = peek_input();
-                additional_length_offset++;
-
-                bool need_close_brace = false;
-
-                // TODO: rustc lexer doesn't seem to allow not having { but mrustc lexer
-                // does? look at spec?
-                if (current_char == '{') {
-                    need_close_brace = true;
-
-                    skip_input();
-                    current_char = peek_input();
-                    additional_length_offset++;
-                }
-
-                // parse unicode escape
-                // 1-6 hex digits?
-                std::string num_str;
-                num_str.reserve(6);
-
-                // test adding number directly
-                uint32_t test_val;
-
-                // loop through to add entire hex number to string
-                while (is_x_digit(current_char) || current_char == '_') {
-                    if (current_char == '_') {
-                        // don't add _ to number
-                        skip_input();
-                        current_char = peek_input();
-
-                        additional_length_offset++;
-
-                        continue;
-                    }
-
-                    additional_length_offset++;
-
-                    // add raw hex numbers
-                    num_str += current_char;
-
-                    // test adding number directly
-                    char tmp[2] = { current_char, 0 };
-                    test_val *= 16;
-                    test_val += std::strtol(tmp, NULL, 16);
-
-                    skip_input();
-                    current_char = peek_input();
-                }
-
-                // ensure closing brace
-                if (need_close_brace && current_char != '}') {
-                    // actually an error
-                    rust_error_at(
-                      get_current_location(), "expected terminating '}' in unicode escape");
-                    // return false;
-                    return std::make_pair(output_char, additional_length_offset);
-                }
-
-                // ensure 1-6 hex characters
-                if (num_str.length() > 6 || num_str.length() < 1) {
-                    rust_error_at(get_current_location(),
-                      "unicode escape should be between 1 and 6 hex "
-                      "characters; it is %lu",
-                      num_str.length());
-                    // return false;
-                    return std::make_pair(output_char, additional_length_offset);
-                }
-
-                long hex_num = std::strtol(num_str.c_str(), NULL, 16);
-
-                // as debug, check hex_num = test_val
-                if (hex_num > 255) {
-                    rust_error_at(
-                      get_current_location(), "non-ascii chars not implemented yet, defaulting to 0");
-                    hex_num = 0;
-                }
-
-                // make output_char the value - UTF-8?
-                // TODO: actually make this work - output char must be 4 bytes, do I
-                // need a string for this?
-                output_char = static_cast</*uint32_t*/ char>(hex_num);
-
-                // return true;
-                return std::make_pair(output_char, additional_length_offset);
-            } break;
-#endif
-            case '\r':
-            case '\n':
-                // string continue
-                while (is_whitespace(current_char)) {
-                    if (current_char == '\n') {
-                        current_line++;
-                        current_column = 1;
-                        // tell line_table that new line starts
-                        line_map->start_line(current_line, max_column_hint);
-
-                        // reset "length"
-                        additional_length_offset = 1;
-
-                        // get next char
-                        skip_input();
-                        current_char = peek_input();
-
-                        continue;
-                    }
-
-                    skip_input();
-                    current_char = peek_input();
-                    additional_length_offset++;
-                }
-
-                return std::make_tuple(0, additional_length_offset, true);
-            default:
-                rust_error_at(get_current_location(), "unknown escape sequence '\\%c'", current_char);
-                // returns false if no parsing could be done
-                // return false;
-                return std::make_tuple(output_char, additional_length_offset, false);
-                break;
-        }
-        // all non-special cases (string continue) should skip their used char
-        skip_input();
-        current_char = peek_input();
-        additional_length_offset++;
-
-        // returns true if parsing was successful
-        // return true;
-        return std::make_tuple(output_char, additional_length_offset, false);
-    }
-
-    // Parses an escape (or string continue) in a string or character. Supports unicode escapes.
-    std::tuple<Codepoint, int, bool> Lexer::parse_utf8_escape(char opening_char) {
-        Codepoint output_char;
-        int additional_length_offset = 0;
-
-        // skip to actual letter
-        skip_input();
-        current_char = peek_input();
-        additional_length_offset++;
-
-        switch (current_char) {
-            case 'x': {
-                // hex char string (null-terminated)
-                char hexNum[3] = { 0, 0, 0 };
-
-                // first hex char
-                skip_input();
-                current_char = peek_input();
-                additional_length_offset++;
-
-                if (!is_x_digit(current_char)) {
-                    rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
-                      current_char);
-                }
-                hexNum[0] = current_char;
-
-                // second hex char
-                skip_input();
-                current_char = peek_input();
-                additional_length_offset++;
-
-                if (!is_x_digit(current_char)) {
-                    rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
-                      current_char);
-                }
-                hexNum[1] = current_char;
-
-                long hexLong = std::strtol(hexNum, NULL, 16);
-
-                if (hexLong > 127)
-                    rust_error_at(get_current_location(),
-                      "ascii \\x escape '\\x%s' out of range - allows up to '\\x7F'", hexNum);
-                // gcc_assert(hexLong < 128); // as ascii
-                char hexChar = static_cast<char>(hexLong);
-
-                output_char = hexChar;
-            } break;
-            case 'n':
-                output_char = '\n';
-                break;
-            case 'r':
-                output_char = '\r';
-                break;
-            case 't':
-                output_char = '\t';
-                break;
-            case '\\':
-                output_char = '\\';
-                break;
-            case '0':
-                output_char = '\0';
-                break;
-            case '\'':
-                output_char = '\'';
-                break;
-            case '"':
-                output_char = '"';
-                break;
-            case 'u': {
-                skip_input();
-                current_char = peek_input();
-                additional_length_offset++;
-
-                bool need_close_brace = false;
-                if (current_char == '{') {
-                    need_close_brace = true;
-
-                    skip_input();
-                    current_char = peek_input();
-                    additional_length_offset++;
-                }
-
-                // parse unicode escape - 1-6 hex digits
-                std::string num_str;
-                num_str.reserve(6);
-
-                // loop through to add entire hex number to string
-                while (is_x_digit(current_char) || current_char == '_') {
-                    if (current_char == '_') {
-                        // don't add _ to number
-                        skip_input();
-                        current_char = peek_input();
-
-                        additional_length_offset++;
-
-                        continue;
-                    }
-
-                    additional_length_offset++;
-
-                    // add raw hex numbers
-                    num_str += current_char;
-
-                    skip_input();
-                    current_char = peek_input();
-                }
-
-                // ensure closing brace if required
-                if (need_close_brace) {
-                    if (current_char == '}') {
-                        skip_input();
-                        current_char = peek_input();
-                        additional_length_offset++;
-                    } else {
-                        // actually an error
-                        rust_error_at(
-                          get_current_location(), "expected terminating '}' in unicode escape");
-                        // return false;
-                        return std::make_tuple(output_char, additional_length_offset, false);
-                    }
-                }
-
-                // ensure 1-6 hex characters
-                if (num_str.length() > 6 || num_str.length() < 1) {
-                    rust_error_at(get_current_location(),
-                      "unicode escape should be between 1 and 6 hex "
-                      "characters; it is %lu",
-                      num_str.length());
-                    // return false;
-                    return std::make_tuple(output_char, additional_length_offset, false);
-                }
-
-                long hex_num = std::strtol(num_str.c_str(), NULL, 16);
-
-                // assert fits a uint32_t
-                gcc_assert(hex_num < 4294967296);
-
-                output_char = Codepoint(static_cast<uint32_t>(hex_num));
-
-                // TODO: what is being outputted? the escape code for the unicode char
-                // (unicode number) or the character number?
-
-                // return true;
-                return std::make_tuple(output_char, additional_length_offset, false);
-            } break;
-            case '\r':
-            case '\n':
-                // string continue
-                while (is_whitespace(current_char)) {
-                    if (current_char == '\n') {
-                        current_line++;
-                        current_column = 1;
-                        // tell line_table that new line starts
-                        line_map->start_line(current_line, max_column_hint);
-
-                        // reset "length"
-                        additional_length_offset = 1;
-
-                        // get next char
-                        skip_input();
-                        current_char = peek_input();
-
-                        continue;
-                    }
-
-                    skip_input();
-                    current_char = peek_input();
-                    additional_length_offset++;
-                }
-
-                return std::make_tuple(0, additional_length_offset, true);
-            default:
-                rust_error_at(get_current_location(), "unknown escape sequence '\\%c'", current_char);
-                // returns false if no parsing could be done
-                // return false;
-                return std::make_tuple(output_char, additional_length_offset, false);
-                break;
-        }
-        /* all non-special cases (unicode, string continue) should skip their used
-         * char */
-        skip_input();
-        current_char = peek_input();
-        additional_length_offset++;
-
-        // returns true if parsing was successful
-        // return true;
-        return std::make_tuple(output_char, additional_length_offset, false);
-    }
-
-#if 0
-    bool Lexer::parse_ascii_escape(/*char& current_char, */ int& length, char& output_char) {
-        // skip to actual letter
-        skip_input();
-        current_char = peek_input();
-        length++;
-
-        switch (current_char) {
-            case 'x': {
-                // hex char string (null-terminated)
-                char hexNum[3] = { 0, 0, 0 };
-
-                // first hex char
-                skip_input();
-                current_char = peek_input();
-                length++;
-
-                if (!ISXDIGIT(current_char)) {
-                    rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
-                      current_char);
-                }
-                hexNum[0] = current_char;
-
-                // second hex char
-                skip_input();
-                current_char = peek_input();
-                length++;
-
-                if (!ISXDIGIT(current_char)) {
-                    rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
-                      current_char);
-                }
-                hexNum[1] = current_char;
-
-                long hexLong = ::std::strtol(hexNum, NULL, 16);
-
-                if (hexLong > 127)
-                    rust_error_at(get_current_location(),
-                      "ascii \\x escape '\\x%s' out of range - allows up to '\\x7F'", hexNum);
-                // gcc_assert(hexLong < 128); // as ascii
-                char hexChar = static_cast<char>(hexLong);
-
-                // TODO: fix - does this actually give the right character?
-                output_char = hexChar;
-            } break;
-            case 'n':
-                output_char = '\n';
-                break;
-            case 'r':
-                output_char = '\r';
-                break;
-            case 't':
-                output_char = '\t';
-                break;
-            case '\\':
-                output_char = '\\';
-                break;
-            case '0':
-                output_char = '\0';
-                break;
-            default:
-                // rust_error_at(get_current_location(), "unknown escape sequence '\\%c'", current_char);
-                // returns false if no parsing could be done
-                return false;
-                break;
-        }
-        // returns true if parsing was successful
-        return true;
-    }
-
-    bool Lexer::parse_quote_escape(/*char& current_char, */ int& length, char& output_char) {
-        // skip to actual letter
-        skip_input();
-        current_char = peek_input();
-        length++;
-
-        switch (current_char) {
-            case '\'':
-                output_char = '\'';
-                break;
-            case '"':
-                output_char = '"';
-                break;
-            default:
-                return false;
-                break;
-        }
-        return true;
-    }
-
-    bool Lexer::parse_unicode_escape(
-      /*char& current_char, */ int& length, /*char*/ uint32_t& output_char) {
-        // skip to actual letter
-        skip_input();
-        current_char = peek_input();
-        length++;
-
-        if (current_char != 'u') {
-            // not a unicode escape, but not necessarily an error
-            return false;
-        }
-
-        skip_input();
-        current_char = peek_input();
-        length++;
-
-        bool need_close_brace = false;
-
-        // TODO: rustc lexer doesn't seem to allow not having { but mrustc lexer does? look at spec?
-        if (current_char == '{') {
-            need_close_brace = true;
-
-            skip_input();
-            current_char = peek_input();
-            length++;
-        }
-
-        // parse unicode escape
-        // 1-6 hex digits?
-        ::std::string num_str;
-        num_str.reserve(6);
-
-        // test adding number directly
-        uint32_t test_val;
-
-        // loop through to add entire hex number to string
-        while (is_x_digit(current_char) || current_char == '_') {
-            if (current_char == '_') {
-                // don't add _ to number
-                skip_input();
-                current_char = peek_input();
-
-                length++;
-
-                continue;
-            }
-
-            length++;
-
-            // add raw hex numbers
-            num_str += current_char;
-
-            // test adding number directly
-            char tmp[2] = { current_char, 0 };
-            test_val *= 16;
-            test_val += ::std::strtol(tmp, NULL, 16);
-
-            skip_input();
-            current_char = peek_input();
-        }
-
-        // ensure closing brace
-        if (need_close_brace && current_char != '}') {
-            // actually an error
-            rust_error_at(get_current_location(), "expected terminating '}' in unicode escape");
-            return false;
-        }
-
-        // ensure 1-6 hex characters
-        if (num_str.length() > 6 || num_str.length() < 1) {
-            rust_error_at(get_current_location(),
-              "unicode escape should be between 1 and 6 hex characters; it is %lu", num_str.length());
-            return false;
-        }
-
-        long hex_num = ::std::strtol(num_str.c_str(), NULL, 16);
-
-        // as debug, check hex_num = test_val
-
-        // make output_char the value - UTF-8?
-        // TODO: actually make this work - output char must be 4 bytes, do I need a string for this?
-        output_char = static_cast<uint32_t>(hex_num);
-
-        return true;
-    }
-
-    bool Lexer::parse_byte_escape(/*char& current_char, */ int& length, char& output_char) {
-        // skip to actual letter
-        skip_input();
-        current_char = peek_input();
-        length++;
-
-        switch (current_char) {
-            case 'x': {
-                // hex char string (null-terminated)
-                char hexNum[3] = { 0, 0, 0 };
-
-                // first hex char
-                skip_input();
-                current_char = peek_input();
-                length++;
-
-                if (!ISXDIGIT(current_char)) {
-                    rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
-                      current_char);
-                }
-                hexNum[0] = current_char;
-
-                // second hex char
-                skip_input();
-                current_char = peek_input();
-                length++;
-
-                if (!ISXDIGIT(current_char)) {
-                    rust_error_at(get_current_location(), "invalid character '\\x%c' in \\x sequence",
-                      current_char);
-                }
-                hexNum[1] = current_char;
-
-                long hexLong = ::std::strtol(hexNum, NULL, 16);
-
-                if (hexLong > 255)
-                    rust_error_at(get_current_location(),
-                      "ascii \\x escape '\\x%s' out of range - allows up to '\\xFF'", hexNum);
-                // gcc_assert(hexLong < 128); // as ascii
-                char hexChar = static_cast<char>(hexLong);
-
-                // TODO: fix - does this actually give the right character?
-                output_char = hexChar;
-            } break;
-            case 'n':
-                output_char = '\n';
-                break;
-            case 'r':
-                output_char = '\r';
-                break;
-            case 't':
-                output_char = '\t';
-                break;
-            case '\\':
-                output_char = '\\';
-                break;
-            case '0':
-                output_char = '\0';
-                break;
-            default:
-                // rust_error_at(get_current_location(), "unknown escape sequence '\\%c'", current_char);
-                // returns false if no parsing could be done
-                return false;
-                break;
-        }
-        // returns true if parsing was successful
-        return true;
-    }
-#endif
-
-    // Returns the length of the codepoint at the current position.
-    int Lexer::test_get_input_codepoint_length() {
-        uint8_t input = peek_input();
-
-        if (input < 128) {
-            // ascii -- 1 byte
-            // return input;
-
-            return 1;
-        } else if ((input & 0xC0) == 0x80) {
-            // invalid (continuation; can't be first char)
-            // return 0xFFFE;
-
-            return 0;
-        } else if ((input & 0xE0) == 0xC0) {
-            // 2 bytes
-            uint8_t input2 = peek_input(1);
-            if ((input2 & 0xC0) != 0x80)
-                return 0;
-            // return 0xFFFE;
-
-            // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
-            // return output;
-            return 2;
-        } else if ((input & 0xF0) == 0xE0) {
-            // 3 bytes
-            uint8_t input2 = peek_input(1);
-            if ((input2 & 0xC0) != 0x80)
-                return 0;
-            // return 0xFFFE;
-
-            uint8_t input3 = peek_input(2);
-            if ((input3 & 0xC0) != 0x80)
-                return 0;
-            // return 0xFFFE;
-
-            /*uint32_t output
-              = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
-            0); return output;*/
-            return 3;
-        } else if ((input & 0xF8) == 0xF0) {
-            // 4 bytes
-            uint8_t input2 = peek_input(1);
-            if ((input2 & 0xC0) != 0x80)
-                return 0;
-            // return 0xFFFE;
-
-            uint8_t input3 = peek_input(2);
-            if ((input3 & 0xC0) != 0x80)
-                return 0;
-            // return 0xFFFE;
-
-            uint8_t input4 = peek_input(3);
-            if ((input4 & 0xC0) != 0x80)
-                return 0;
-            // return 0xFFFE;
-
-            /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-                              | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
-            return output;*/
-            return 4;
-        } else {
-            rust_error_at(get_current_location(), "invalid UTF-8 (too long)");
-            return 0;
-        }
-    }
-
-    // Returns the codepoint at the current position.
-    Codepoint Lexer::test_peek_codepoint_input() {
-        uint8_t input = peek_input();
-
-        if (input < 128) {
-            // ascii -- 1 byte
-            return { input };
-        } else if ((input & 0xC0) == 0x80) {
-            // invalid (continuation; can't be first char)
-            return { 0xFFFE };
-        } else if ((input & 0xE0) == 0xC0) {
-            // 2 bytes
-            uint8_t input2 = peek_input(1);
-            if ((input2 & 0xC0) != 0x80)
-                return { 0xFFFE };
-
-            uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
-            return { output };
-        } else if ((input & 0xF0) == 0xE0) {
-            // 3 bytes
-            uint8_t input2 = peek_input(1);
-            if ((input2 & 0xC0) != 0x80)
-                return { 0xFFFE };
-
-            uint8_t input3 = peek_input(2);
-            if ((input3 & 0xC0) != 0x80)
-                return { 0xFFFE };
-
-            uint32_t output
-              = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0);
-            return { output };
-        } else if ((input & 0xF8) == 0xF0) {
-            // 4 bytes
-            uint8_t input2 = peek_input(1);
-            if ((input2 & 0xC0) != 0x80)
-                return { 0xFFFE };
-
-            uint8_t input3 = peek_input(2);
-            if ((input3 & 0xC0) != 0x80)
-                return { 0xFFFE };
-
-            uint8_t input4 = peek_input(3);
-            if ((input4 & 0xC0) != 0x80)
-                return { 0xFFFE };
-
-            uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-                              | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
-            return { output };
-        } else {
-            rust_error_at(get_current_location(), "invalid UTF-8 (too long)");
-            return { 0xFFFE };
-        }
-    }
-
-    void Lexer::test_skip_codepoint_input() {
-        int toSkip = test_get_input_codepoint_length();
-        gcc_assert(toSkip >= 1);
-
-        skip_input(toSkip - 1);
-    }
-
-    int Lexer::test_get_input_codepoint_n_length(int n_start_offset) {
-        uint8_t input = peek_input(n_start_offset);
-
-        if (input < 128) {
-            // ascii -- 1 byte
-            // return input;
-            return 1;
-        } else if ((input & 0xC0) == 0x80) {
-            // invalid (continuation; can't be first char)
-            // return 0xFFFE;
-            return 0;
-        } else if ((input & 0xE0) == 0xC0) {
-            // 2 bytes
-            uint8_t input2 = peek_input(n_start_offset + 1);
-            if ((input2 & 0xC0) != 0x80)
-                // return 0xFFFE;
-                return 0;
-
-            // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
-            // return output;
-            return 2;
-        } else if ((input & 0xF0) == 0xE0) {
-            // 3 bytes
-            uint8_t input2 = peek_input(n_start_offset + 1);
-            if ((input2 & 0xC0) != 0x80)
-                // return 0xFFFE;
-                return 0;
-
-            uint8_t input3 = peek_input(n_start_offset + 2);
-            if ((input3 & 0xC0) != 0x80)
-                // return 0xFFFE;
-                return 0;
-
-            /*uint32_t output
-              = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
-            0); return output;*/
-            return 3;
-        } else if ((input & 0xF8) == 0xF0) {
-            // 4 bytes
-            uint8_t input2 = peek_input(n_start_offset + 1);
-            if ((input2 & 0xC0) != 0x80)
-                // return 0xFFFE;
-                return 0;
-
-            uint8_t input3 = peek_input(n_start_offset + 2);
-            if ((input3 & 0xC0) != 0x80)
-                // return 0xFFFE;
-                return 0;
-
-            uint8_t input4 = peek_input(n_start_offset + 3);
-            if ((input4 & 0xC0) != 0x80)
-                // return 0xFFFE;
-                return 0;
-
-            /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-                              | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
-            return output;*/
-            return 4;
-        } else {
-            rust_error_at(get_current_location(), "invalid UTF-8 (too long)");
-            return 0;
-        }
-    }
-
-    // peeks the codepoint input at n codepoints ahead of current codepoint - try
-    // not to use
-    Codepoint Lexer::test_peek_codepoint_input(int n) {
-        int totalOffset = 0;
-
-        // add up all offsets into total offset? does this do what I want?
-        for (int i = 0; i < n; i++) {
-            totalOffset += test_get_input_codepoint_n_length(totalOffset);
-        }
-        // issues: this would have (at least) O(n) lookup time, not O(1) like the
-        // rest?
-
-        // TODO: implement if still needed
-
-        // error out of function as it is not implemented
-        gcc_assert(1 == 0);
-        return { 0 };
-        /*
-                uint8_t input = peek_input();
-
-                if (input < 128) {
-                    // ascii -- 1 byte
-                    return input;
-                } else if ((input & 0xC0) == 0x80) {
-                    // invalid (continuation; can't be first char)
-                    return 0xFFFE;
-                } else if ((input & 0xE0) == 0xC0) {
-                    // 2 bytes
-                    uint8_t input2 = peek_input(1);
-                    if ((input2 & 0xC0) != 0x80)
-                        return 0xFFFE;
-
-                    uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
-                    return output;
-                } else if ((input & 0xF0) == 0xE0) {
-                    // 3 bytes
-                    uint8_t input2 = peek_input(1);
-                    if ((input2 & 0xC0) != 0x80)
-                        return 0xFFFE;
-
-                    uint8_t input3 = peek_input(2);
-                    if ((input3 & 0xC0) != 0x80)
-                        return 0xFFFE;
-
-                    uint32_t output
-                      = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 &
-           0x3F) << 0); return output; } else if ((input & 0xF8) == 0xF0) {
-                    // 4 bytes
-                    uint8_t input2 = peek_input(1);
-                    if ((input2 & 0xC0) != 0x80)
-                        return 0xFFFE;
-
-                    uint8_t input3 = peek_input(2);
-                    if ((input3 & 0xC0) != 0x80)
-                        return 0xFFFE;
-
-                    uint8_t input4 = peek_input(3);
-                    if ((input4 & 0xC0) != 0x80)
-                        return 0xFFFE;
-
-                    uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-                                      | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) <<
-           0); return output; } else { rust_error_at(get_current_location(), "invalid
-           UTF-8 (too long)"); return 0xFFFE;
-                }*/
+      return 0;
+    }
+  else if ((input & 0xE0) == 0xC0)
+    {
+      // 2 bytes
+      uint8_t input2 = peek_input (1);
+      if ((input2 & 0xC0) != 0x80)
+	return 0;
+      // return 0xFFFE;
+
+      // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
+      // return output;
+      return 2;
+    }
+  else if ((input & 0xF0) == 0xE0)
+    {
+      // 3 bytes
+      uint8_t input2 = peek_input (1);
+      if ((input2 & 0xC0) != 0x80)
+	return 0;
+      // return 0xFFFE;
+
+      uint8_t input3 = peek_input (2);
+      if ((input3 & 0xC0) != 0x80)
+	return 0;
+      // return 0xFFFE;
+
+      /*uint32_t output
+	= ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
+      0); return output;*/
+      return 3;
+    }
+  else if ((input & 0xF8) == 0xF0)
+    {
+      // 4 bytes
+      uint8_t input2 = peek_input (1);
+      if ((input2 & 0xC0) != 0x80)
+	return 0;
+      // return 0xFFFE;
+
+      uint8_t input3 = peek_input (2);
+      if ((input3 & 0xC0) != 0x80)
+	return 0;
+      // return 0xFFFE;
+
+      uint8_t input4 = peek_input (3);
+      if ((input4 & 0xC0) != 0x80)
+	return 0;
+      // return 0xFFFE;
+
+      /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
+			| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
+      return output;*/
+      return 4;
+    }
+  else
+    {
+      rust_error_at (get_current_location (), "invalid UTF-8 (too long)");
+      return 0;
+    }
+}
+
+// Decodes the UTF-8 codepoint at the current position (0xFFFE if invalid).
+Codepoint
+Lexer::peek_codepoint_input ()
+{
+  uint8_t input = peek_input ();
+  // NOTE(review): overlong forms, surrogates and > 0x10FFFF are not rejected.
+  if (input < 128)
+    {
+      // ASCII: one byte, which is the codepoint itself
+      return {input};
+    }
+  else if ((input & 0xC0) == 0x80)
+    {
+      // invalid: a continuation byte (10xxxxxx) cannot start a sequence
+      return {0xFFFE};
+    }
+  else if ((input & 0xE0) == 0xC0)
+    {
+      // 2 bytes: 110xxxxx 10xxxxxx
+      uint8_t input2 = peek_input (1);
+      if ((input2 & 0xC0) != 0x80)
+	return {0xFFFE};
+
+      uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
+      return {output};
+    }
+  else if ((input & 0xF0) == 0xE0)
+    {
+      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+      uint8_t input2 = peek_input (1);
+      if ((input2 & 0xC0) != 0x80)
+	return {0xFFFE};
+
+      uint8_t input3 = peek_input (2);
+      if ((input3 & 0xC0) != 0x80)
+	return {0xFFFE};
+
+      uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
+			| ((input3 & 0x3F) << 0);
+      return {output};
+    }
+  else if ((input & 0xF8) == 0xF0)
+    {
+      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      uint8_t input2 = peek_input (1);
+      if ((input2 & 0xC0) != 0x80)
+	return {0xFFFE};
+
+      uint8_t input3 = peek_input (2);
+      if ((input3 & 0xC0) != 0x80)
+	return {0xFFFE};
+
+      uint8_t input4 = peek_input (3);
+      if ((input4 & 0xC0) != 0x80)
+	return {0xFFFE};
+
+      uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
+			| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
+      return {output};
+    }
+  else // lead byte is 0xF8-0xFF
+    {
+      rust_error_at (get_current_location (), "invalid UTF-8 (too long)");
+      return {0xFFFE};
+    }
+}
+
+void
+Lexer::skip_codepoint_input ()
+{
+  int toSkip = get_input_codepoint_length (); // byte length, 0 if invalid
+  gcc_assert (toSkip >= 1);
+  // skip_input (n) presumably consumes n + 1 bytes, hence "- 1" (confirm).
+  skip_input (toSkip - 1);
+}
+
+int
+Lexer::test_get_input_codepoint_n_length (int n_start_offset)
+{
+  uint8_t input = peek_input (n_start_offset);
+  // Result: byte length of the UTF-8 sequence at the offset; 0 if invalid.
+  if (input < 128)
+    {
+      // ASCII: a single byte
+      // (no continuation bytes to check)
+      return 1;
+    }
+  else if ((input & 0xC0) == 0x80)
+    {
+      // invalid: a continuation byte (10xxxxxx) cannot start a sequence,
+      // so report length 0
+      return 0;
+    }
+  else if ((input & 0xE0) == 0xC0)
+    {
+      // 2 bytes: 110xxxxx 10xxxxxx
+      uint8_t input2 = peek_input (n_start_offset + 1);
+      if ((input2 & 0xC0) != 0x80)
+	// second byte is not a continuation byte
+	return 0;
+
+      // Well formed: only the length is reported.  The value would be
+      // ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0).
+      return 2;
+    }
+  else if ((input & 0xF0) == 0xE0)
+    {
+      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+      uint8_t input2 = peek_input (n_start_offset + 1);
+      if ((input2 & 0xC0) != 0x80)
+	// second byte is not a continuation byte
+	return 0;
+
+      uint8_t input3 = peek_input (n_start_offset + 2);
+      if ((input3 & 0xC0) != 0x80)
+	// third byte is not a continuation byte
+	return 0;
+
+      // Well formed: only the length is reported.  The value would be
+      // ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
+      // | ((input3 & 0x3F) << 0).
+      return 3;
+    }
+  else if ((input & 0xF8) == 0xF0)
+    {
+      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      uint8_t input2 = peek_input (n_start_offset + 1);
+      if ((input2 & 0xC0) != 0x80)
+	// second byte is not a continuation byte
+	return 0;
+
+      uint8_t input3 = peek_input (n_start_offset + 2);
+      if ((input3 & 0xC0) != 0x80)
+	// third byte is not a continuation byte
+	return 0;
+
+      uint8_t input4 = peek_input (n_start_offset + 3);
+      if ((input4 & 0xC0) != 0x80)
+	// fourth byte is not a continuation byte
+	return 0;
+
+      // Well formed: only the length is reported.  The value would be
+      // ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
+      // | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0).
+      return 4;
+    }
+  else // lead byte is 0xF8-0xFF
+    {
+      rust_error_at (get_current_location (), "invalid UTF-8 (too long)");
+      return 0;
+    }
+}
+
+// Intended to peek the codepoint n codepoints ahead of the current one, but
+// not implemented: it always reaches the gcc_assert below.  Avoid using.
+Codepoint
+Lexer::test_peek_codepoint_input (int n)
+{
+  int totalOffset = 0;
+
+  // sum the byte lengths of the first n codepoints into totalOffset
+  for (int i = 0; i < n; i++)
+    {
+      totalOffset += test_get_input_codepoint_n_length (totalOffset);
     }
+  // issues: this would have (at least) O(n) lookup time, not O(1) like the
+  // rest?
+  // NOTE(review): totalOffset is computed but never read after the loop.
+  // TODO: implement if still needed
+
+  // error out of function as it is not implemented
+  gcc_assert (1 == 0);
+  return {0};
+  /*
+	  uint8_t input = peek_input();
+
+	  if (input < 128) {
+	      // ascii -- 1 byte
+	      return input;
+	  } else if ((input & 0xC0) == 0x80) {
+	      // invalid (continuation; can't be first char)
+	      return 0xFFFE;
+	  } else if ((input & 0xE0) == 0xC0) {
+	      // 2 bytes
+	      uint8_t input2 = peek_input(1);
+	      if ((input2 & 0xC0) != 0x80)
+		  return 0xFFFE;
+
+	      uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
+	      return output;
+	  } else if ((input & 0xF0) == 0xE0) {
+	      // 3 bytes
+	      uint8_t input2 = peek_input(1);
+	      if ((input2 & 0xC0) != 0x80)
+		  return 0xFFFE;
+
+	      uint8_t input3 = peek_input(2);
+	      if ((input3 & 0xC0) != 0x80)
+		  return 0xFFFE;
+
+	      uint32_t output
+		= ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 &
+     0x3F) << 0); return output; } else if ((input & 0xF8) == 0xF0) {
+	      // 4 bytes
+	      uint8_t input2 = peek_input(1);
+	      if ((input2 & 0xC0) != 0x80)
+		  return 0xFFFE;
+
+	      uint8_t input3 = peek_input(2);
+	      if ((input3 & 0xC0) != 0x80)
+		  return 0xFFFE;
+
+	      uint8_t input4 = peek_input(3);
+	      if ((input4 & 0xC0) != 0x80)
+		  return 0xFFFE;
+
+	      uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
+				| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) <<
+     0); return output; } else { rust_error_at(get_current_location(), "invalid
+     UTF-8 (too long)"); return 0xFFFE;
+	  }*/
+}
 } // namespace Rust
diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h
index 1465cb2..8a031ed 100644
--- a/gcc/rust/lex/rust-lex.h
+++ b/gcc/rust/lex/rust-lex.h
@@ -31,28 +31,37 @@ private:
   // Builds a token from the input queue.
   TokenPtr build_token ();
 
-  // ok maybe all these may mean the lexer structure needs to be rethought
-  /* separated into functions because main method was too long, but they rely on
-   * and change state in the lexer, so variables must be passed by reference. */
   std::pair<std::string, int> parse_in_decimal ();
   std::pair<std::string, int> parse_in_exponent_part ();
   std::pair<PrimitiveCoreType, int> parse_in_type_suffix ();
-  /*bool parse_ascii_escape (int &length,
-				  char &output_char);*/
-  /*bool parse_quote_escape (char& current_char, int &length,
-				  char &output_char);*/
-  /*bool parse_unicode_escape (
-    char& current_char, int &length, Codepoint &output_char);*/
-  /*bool parse_byte_escape (char& current_char, int &length,
-				 char &output_char);*/
   std::tuple<char, int, bool> parse_escape (char opening_char);
   std::tuple<Codepoint, int, bool> parse_utf8_escape (char opening_char);
-  int test_get_input_codepoint_length ();
+  int parse_partial_string_continue ();
+  std::pair<long, int> parse_partial_hex_escape ();
+  std::pair<Codepoint, int> parse_partial_unicode_escape ();
+
+  int get_input_codepoint_length ();
   int test_get_input_codepoint_n_length (int n_start_offset);
-  Codepoint test_peek_codepoint_input ();
-  Codepoint test_peek_codepoint_input (
-    int n); // maybe can use get_input_codepoint_length to get starting index
-  void test_skip_codepoint_input ();
+  Codepoint peek_codepoint_input ();
+  Codepoint test_peek_codepoint_input (int n);
+  void skip_codepoint_input ();
+
+  TokenPtr parse_byte_char (Location loc);
+  TokenPtr parse_byte_string (Location loc);
+  TokenPtr parse_raw_byte_string (Location loc);
+  TokenPtr parse_raw_identifier (Location loc);
+  TokenPtr parse_string (Location loc);
+  TokenPtr maybe_parse_raw_string (Location loc);
+  TokenPtr parse_raw_string (Location loc, int initial_hash_count);
+  TokenPtr parse_non_decimal_int_literals (Location loc);
+  TokenPtr parse_decimal_int_or_float (Location loc);
+  TokenPtr parse_char_or_lifetime (Location loc);
+  TokenPtr parse_identifier_or_keyword (Location loc);
+
+  template <typename IsDigitFunc>
+  TokenPtr parse_non_decimal_int_literal (Location loc,
+					  IsDigitFunc is_digit_func,
+					  std::string existent_str, int base);
 
 public:
   // Construct lexer with input file and filename provided
@@ -68,14 +77,14 @@ public:
   Lexer &operator= (Lexer &&other) = default;
 
   // Returns token n tokens ahead of current position.
-  const_TokenPtr peek_token (int n);
+  const_TokenPtr peek_token (int n) { return token_queue.peek (n); }
   // Peeks the current token.
-  const_TokenPtr peek_token ();
+  const_TokenPtr peek_token () { return peek_token (0); }
 
   // Advances current token to n + 1 tokens ahead of current position.
-  void skip_token (int n);
+  void skip_token (int n) { token_queue.skip (n); }
   // Skips the current token.
-  void skip_token ();
+  void skip_token () { skip_token (0); }
 
   // Replaces the current token with a specified token.
   void replace_current_token (TokenPtr replacement);
@@ -90,6 +99,8 @@ private:
   int current_line;
   // Current column number.
   int current_column;
+  // Current character.
+  int current_char;
   // Line map.
   Linemap *line_map;
 
@@ -132,11 +143,6 @@ private:
   TokenSource token_source;
   // Token stream queue.
   buffered_queue<std::shared_ptr<Token>, TokenSource> token_queue;
-
-  // START CRAPPY CHANGES
-  int current_char;
-
-  // END CRAPPY CHANGES
 };
 } // namespace Rust