10 files changed, 304 insertions, 228 deletions
diff --git a/gcc/rust/lex/rust-input-source.h b/gcc/rust/lex/rust-input-source.h
new file mode 100644
index 0000000..07137de
--- /dev/null
+++ b/gcc/rust/lex/rust-input-source.h
@@ -0,0 +1,193 @@
+#ifndef RUST_INPUT_SOURCE_H
+#define RUST_INPUT_SOURCE_H
+
+#include "rust-codepoint.h"
+#include "optional.h"
+
+namespace Rust {
+// Input source wrapper thing.
+class InputSource
+{
+private:
+  // position of current character
+  unsigned int pos;
+  std::vector<Codepoint> chars;
+  bool is_valid_utf8;
+
+  // Returns the next raw input byte, or EOF once the input is exhausted.
+  virtual int next_byte () = 0;
+
+  // Decodes one codepoint from the byte stream.  Returns Codepoint::eof ()
+  // when the stream ends on a character boundary and 0xFFFE for a malformed
+  // sequence: a stray continuation byte, an invalid lead byte, or a missing or
+  // non-continuation trailing byte.  A byte order mark (EF BB BF) is skipped.
+  // NOTE(review): overlong forms, surrogates and values above U+10FFFF are
+  // not rejected here.
+  Codepoint next_codepoint ()
+  {
+    uint32_t input = next_byte ();
+
+    if ((int32_t) input == EOF)
+      return Codepoint::eof ();
+    else if (input < 128)
+      {
+	// ascii -- 1 byte
+	return {input};
+      }
+    else if ((input & 0xC0) == 0x80)
+      {
+	// invalid (continuation; can't be first char)
+	return {0xFFFE};
+      }
+    else if ((input & 0xE0) == 0xC0)
+      {
+	// 2 bytes
+	// A missing byte (EOF, normally -1) narrows to 0xFF in a uint8_t and
+	// fails the continuation test, so truncated input is malformed.
+	uint8_t input2 = next_byte ();
+	if ((input2 & 0xC0) != 0x80)
+	  return {0xFFFE};
+
+	uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
+	return {output};
+      }
+    else if ((input & 0xF0) == 0xE0)
+      {
+	// 3 bytes or UTF-8 BOM
+	uint8_t input2 = next_byte ();
+	if ((input2 & 0xC0) != 0x80)
+	  return {0xFFFE};
+
+	uint8_t input3 = next_byte ();
+	if ((input3 & 0xC0) != 0x80)
+	  return {0xFFFE};
+
+	// EF BB BF is the UTF-8 byte order mark: skip it and decode what
+	// follows.  Any other EF BB xx sequence (U+FEC0..U+FEFE) is an
+	// ordinary 3-byte character and must not be rejected.
+	if (input == 0xEF && input2 == 0xBB && input3 == 0xBF)
+	  return next_codepoint ();
+
+	uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
+			  | ((input3 & 0x3F) << 0);
+	return {output};
+      }
+    else if ((input & 0xF8) == 0xF0)
+      {
+	// 4 bytes
+	uint8_t input2 = next_byte ();
+	if ((input2 & 0xC0) != 0x80)
+	  return {0xFFFE};
+
+	uint8_t input3 = next_byte ();
+	if ((input3 & 0xC0) != 0x80)
+	  return {0xFFFE};
+
+	uint8_t input4 = next_byte ();
+	if ((input4 & 0xC0) != 0x80)
+	  return {0xFFFE};
+
+	uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
+			  | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
+	return {output};
+      }
+    else
+      {
+	return {0xFFFE};
+      }
+  }
+
+protected:
+  // Decodes the whole input into `chars`, stopping at end of input or at the
+  // first malformed sequence (in which case the source is marked invalid).
+  void init ()
+  {
+    Codepoint char32 = next_codepoint ();
+    while (!char32.is_eof () && char32 != 0xFFFE)
+      {
+	chars.push_back (char32);
+	char32 = next_codepoint ();
+      }
+
+    if (char32 == 0xFFFE)
+      {
+	// Malformed UTF-8: `chars` keeps only the codepoints decoded so far.
+	is_valid_utf8 = false;
+      }
+  }
+
+public:
+  InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}
+
+  virtual ~InputSource () {}
+
+  // Checks if input source is a valid UTF-8 string
+  bool is_valid () { return is_valid_utf8; }
+
+  // Returns the codepoint at `pos` and advances; Codepoint::eof () at the end.
+  Codepoint next ()
+  {
+    if (pos >= chars.size ())
+      return Codepoint::eof ();
+    else
+      {
+	Codepoint c = chars[pos];
+	pos++;
+	return c;
+      }
+  }
+
+  // Returns a copy of the decoded codepoints if the input was valid UTF-8,
+  // tl::nullopt otherwise.
+  tl::optional<std::vector<Codepoint>> get_chars ()
+  {
+    if (is_valid ())
+      return {chars};
+    else
+      return tl::nullopt;
+  }
+};
+
+class FileInputSource : public InputSource
+{
+private:
+  // Stream the bytes are read from; this class never closes it.
+  FILE *input;
+
+  int next_byte () override { return fgetc (input); }
+
+public:
+  // Eagerly decodes INPUT as UTF-8, starting at its current file position.
+  FileInputSource (FILE *input) : InputSource (), input (input)
+  {
+    // TODO make this better?
+    init ();
+  }
+};
+
+class BufferInputSource : public InputSource
+{
+private:
+  const std::string &buffer;
+  size_t offs;
+
+  int next_byte () override
+  {
+    if (offs >= buffer.size ())
+      return EOF;
+    return (uint8_t) buffer.at (offs++);
+  }
+
+public:
+  // Eagerly decodes the bytes of B as UTF-8, starting at byte index OFFSET.
+  BufferInputSource (const std::string &b, size_t offset)
+    : InputSource (), buffer (b), offs (offset)
+  {
+    // TODO make this better?
+    init ();
+  }
+};
+
+} // namespace Rust
+
+#endif
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index c40e700..2a92465 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -2534,8 +2534,7 @@ namespace selftest {
 
 // Checks if `src` has the same contents as the given characters
 void
-assert_source_content (Rust::Lexer::InputSource &src,
-		       std::vector<uint32_t> expected)
+assert_source_content (Rust::InputSource &src, std::vector<uint32_t> expected)
 {
   Rust::Codepoint src_char = src.next ();
   for (auto expected_char : expected)
@@ -2553,7 +2552,7 @@ assert_source_content (Rust::Lexer::InputSource &src,
 void
 test_buffer_input_source (std::string str, std::vector<uint32_t> expected)
 {
-  Rust::Lexer::BufferInputSource source (str, 0);
+  Rust::BufferInputSource source (str, 0);
   assert_source_content (source, expected);
 }
 
@@ -2564,7 +2563,7 @@ test_file_input_source (std::string str, std::vector<uint32_t> expected)
   // Moves to the first character
   fputs (str.c_str (), tmpf);
   std::rewind (tmpf);
-  Rust::Lexer::FileInputSource source (tmpf);
+  Rust::FileInputSource source (tmpf);
   assert_source_content (source, expected);
 }
 
diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h
index 273b8c7..6a29c0a 100644
--- a/gcc/rust/lex/rust-lex.h
+++ b/gcc/rust/lex/rust-lex.h
@@ -24,6 +24,7 @@
 #include "rust-token.h"
 #include "optional.h"
 #include "selftest.h"
+#include "rust-input-source.h"
 
 namespace Rust {
 // Simple wrapper for FILE* that simplifies destruction.
@@ -204,186 +205,6 @@ public:
   Linemap *get_line_map () { return line_map; }
   std::string get_filename () { return std::string (input.get_filename ()); }
 
-  // Input source wrapper thing.
-  class InputSource
-  {
-  private:
-    // position of current character
-    unsigned int pos;
-    std::vector<Codepoint> chars;
-    bool is_valid_utf8;
-
-    // Overload operator () to return next char from input stream.
-    virtual int next_byte () = 0;
-
-    Codepoint next_codepoint ()
-    {
-      uint32_t input = next_byte ();
-
-      if ((int32_t) input == EOF)
-	return Codepoint::eof ();
-      else if (input < 128)
-	{
-	  // ascii -- 1 byte
-	  return {input};
-	}
-      else if ((input & 0xC0) == 0x80)
-	{
-	  // invalid (continuation; can't be first char)
-	  return {0xFFFE};
-	}
-      else if ((input & 0xE0) == 0xC0)
-	{
-	  // 2 bytes
-	  uint8_t input2 = next_byte ();
-	  if ((input2 & 0xC0) != 0x80)
-	    return {0xFFFE};
-
-	  uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
-	  return output;
-	}
-      else if ((input & 0xF0) == 0xE0)
-	{
-	  // 3 bytes or UTF-8 BOM
-	  uint8_t input2 = next_byte ();
-	  // If the second byte is equal to 0xBB then the input is no longer a
-	  // valid UTF-8 char. Then, we check if the third byte makes up a UTF
-	  // BOM.
-	  if (input == 0xEF && input2 == 0xBB)
-	    {
-	      uint8_t input3 = next_byte ();
-	      if (input3 == 0xBF)
-		// found BOM
-		return next_codepoint ();
-	      else
-		return {0xFFFE};
-	    }
-
-	  if ((input2 & 0xC0) != 0x80)
-	    return {0xFFFE};
-
-	  uint8_t input3 = next_byte ();
-
-	  if ((input3 & 0xC0) != 0x80)
-	    return {0xFFFE};
-
-	  uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
-			    | ((input3 & 0x3F) << 0);
-	  return {output};
-	}
-      else if ((input & 0xF8) == 0xF0)
-	{
-	  // 4 bytes
-	  uint8_t input2 = next_byte ();
-	  if ((input2 & 0xC0) != 0x80)
-	    return {0xFFFE};
-
-	  uint8_t input3 = next_byte ();
-	  if ((input3 & 0xC0) != 0x80)
-	    return {0xFFFE};
-
-	  uint8_t input4 = next_byte ();
-	  if ((input4 & 0xC0) != 0x80)
-	    return {0xFFFE};
-
-	  uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-			    | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
-	  return {output};
-	}
-      else
-	{
-	  return {0xFFFE};
-	}
-    }
-
-  protected:
-    // Check if the input source is valid as utf-8 and copy all characters to
-    // `chars`.
-    void init ()
-    {
-      Codepoint char32 = next_codepoint ();
-      while (!char32.is_eof () && char32 != 0xFFFE)
-	{
-	  chars.push_back (char32);
-	  char32 = next_codepoint ();
-	}
-
-      if (char32 == 0xFFFE)
-	{
-	  // Input source is not valid as utf-8.
-	  is_valid_utf8 = false;
-	}
-    }
-
-  public:
-    InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}
-
-    virtual ~InputSource () {}
-
-    bool is_valid () { return is_valid_utf8; }
-
-    // get the next UTF-8 character
-    Codepoint next ()
-    {
-      if (pos >= chars.size ())
-	return Codepoint::eof ();
-      else
-	{
-	  Codepoint c = chars[pos];
-	  pos++;
-	  return c;
-	}
-    }
-
-    tl::optional<std::vector<Codepoint>> get_chars ()
-    {
-      if (is_valid ())
-	return {chars};
-      else
-	return tl::nullopt;
-    }
-  };
-
-  class FileInputSource : public InputSource
-  {
-  private:
-    // Input source file.
-    FILE *input;
-
-    int next_byte () override { return fgetc (input); }
-
-  public:
-    // Create new input source from file.
-    FileInputSource (FILE *input) : InputSource (), input (input)
-    {
-      // TODO make this better?
-      init ();
-    }
-  };
-
-  class BufferInputSource : public InputSource
-  {
-  private:
-    const std::string &buffer;
-    size_t offs;
-
-    int next_byte () override
-    {
-      if (offs >= buffer.size ())
-	return EOF;
-      return (uint8_t) buffer.at (offs++);
-    }
-
-  public:
-    // Create new input source from file.
-    BufferInputSource (const std::string &b, size_t offset)
-      : InputSource (), buffer (b), offs (offset)
-    {
-      // TODO make this better?
-      init ();
-    }
-  };
-
 private:
   void start_line (int current_line, int current_column);
 
diff --git a/gcc/rust/lex/rust-token.cc b/gcc/rust/lex/rust-token.cc
index 77ec6cf..9a1132f 100644
--- a/gcc/rust/lex/rust-token.cc
+++ b/gcc/rust/lex/rust-token.cc
@@ -19,6 +19,7 @@
 #include "rust-system.h"
 #include "rust-token.h"
 #include "rust-diagnostics.h"
+#include "rust-unicode.h"
 
 namespace Rust {
 // Hackily defined way to get token description for enum value using x-macros
@@ -150,6 +151,23 @@ Token::get_type_hint_str () const
   return get_type_hint_string (type_hint);
 }
 
+// Returns STR in NFC form when ID is IDENTIFIER or LIFETIME, and STR unchanged
+// otherwise.  An identifier that is not valid UTF-8 is an internal error.
+std::string
+nfc_normalize_token_string (location_t loc, TokenId id, const std::string &str)
+{
+  if (id != IDENTIFIER && id != LIFETIME)
+    return str;
+
+  tl::optional<Utf8String> utf8 = Utf8String::make_utf8_string (str);
+  if (!utf8.has_value ())
+    rust_internal_error_at (loc,
+			    "identifier '%s' is not a valid UTF-8 string",
+			    str.c_str ());
+
+  return utf8.value ().nfc_normalize ().as_string ();
+}
+
 const std::string &
 Token::get_str () const
 {
diff --git a/gcc/rust/lex/rust-token.h b/gcc/rust/lex/rust-token.h
index 5675351..c4ea176 100644
--- a/gcc/rust/lex/rust-token.h
+++ b/gcc/rust/lex/rust-token.h
@@ -21,7 +21,8 @@
 
 #include "rust-system.h"
 #include "rust-linemap.h"
-#include "rust-codepoint.h"
+#include "rust-make-unique.h"
+#include "rust-unicode.h"
 
 namespace Rust {
 // "Primitive core types" in Rust - the different int and float types, as well
@@ -236,6 +237,10 @@ token_id_keyword_string (TokenId id);
 const char *
 get_type_hint_string (PrimitiveCoreType type);
 
+/* NFC-normalize STR if the token is an identifier or a lifetime.  */
+std::string
+nfc_normalize_token_string (location_t loc, TokenId id, const std::string &str);
+
 // Represents a single token. Create using factory static methods.
 class Token
 {
@@ -259,29 +264,40 @@ private:
 
   // Token constructor from token id, location, and a string.
   Token (TokenId token_id, location_t location, std::string &&paramStr)
-    : token_id (token_id), locus (location),
-      str (new std::string (std::move (paramStr))), type_hint (CORETYPE_UNKNOWN)
-  {}
+    : token_id (token_id), locus (location), type_hint (CORETYPE_UNKNOWN)
+  {
+    // Normalize identifier tokens
+    str = Rust::make_unique<std::string> (
+      nfc_normalize_token_string (location, token_id, paramStr));
+  }
 
   // Token constructor from token id, location, and a char.
   Token (TokenId token_id, location_t location, char paramChar)
     : token_id (token_id), locus (location),
       str (new std::string (1, paramChar)), type_hint (CORETYPE_UNKNOWN)
-  {}
+  {
+    // Do not need to normalize 1byte char
+  }
 
   // Token constructor from token id, location, and a "codepoint".
   Token (TokenId token_id, location_t location, Codepoint paramCodepoint)
-    : token_id (token_id), locus (location),
-      str (new std::string (paramCodepoint.as_string ())),
-      type_hint (CORETYPE_UNKNOWN)
-  {}
+    : token_id (token_id), locus (location), type_hint (CORETYPE_UNKNOWN)
+  {
+    // Normalize identifier tokens
+    str = Rust::make_unique<std::string> (
+      nfc_normalize_token_string (location, token_id,
+				  paramCodepoint.as_string ()));
+  }
 
   // Token constructor from token id, location, a string, and type hint.
   Token (TokenId token_id, location_t location, std::string &&paramStr,
 	 PrimitiveCoreType parType)
-    : token_id (token_id), locus (location),
-      str (new std::string (std::move (paramStr))), type_hint (parType)
-  {}
+    : token_id (token_id), locus (location), type_hint (parType)
+  {
+    // Normalize identifier tokens
+    str = Rust::make_unique<std::string> (
+      nfc_normalize_token_string (location, token_id, paramStr));
+  }
 
 public:
   // No default constructor.
diff --git a/gcc/rust/rust-lang.cc b/gcc/rust/rust-lang.cc
index f07d18f..44dc3fc 100644
--- a/gcc/rust/rust-lang.cc
+++ b/gcc/rust/rust-lang.cc
@@ -455,11 +455,11 @@ run_rust_tests ()
 {
   // Call tests for the rust frontend here
   rust_input_source_test ();
+  rust_utf8_normalize_test ();
   rust_cfg_parser_test ();
   rust_privacy_ctx_test ();
   rust_crate_name_validation_test ();
   rust_simple_path_resolve_test ();
-  rust_utf8_normalize_test ();
 }
 } // namespace selftest
 
diff --git a/gcc/rust/rust-session-manager.cc b/gcc/rust/rust-session-manager.cc
index 3c00ccb..b860a32 100644
--- a/gcc/rust/rust-session-manager.cc
+++ b/gcc/rust/rust-session-manager.cc
@@ -115,16 +115,15 @@ infer_crate_name (const std::string &filename)
 static bool
 validate_crate_name (const std::string &crate_name, Error &error)
 {
-  Utf8String utf8_name = {crate_name};
-  tl::optional<std::vector<Codepoint>> uchars_opt = utf8_name.get_chars ();
-
-  if (!uchars_opt.has_value ())
+  tl::optional<Utf8String> utf8_name_opt
+    = Utf8String::make_utf8_string (crate_name);
+  if (!utf8_name_opt.has_value ())
     {
       error = Error (UNDEF_LOCATION, "crate name is not a valid UTF-8 string");
       return false;
     }
 
-  std::vector<Codepoint> uchars = uchars_opt.value ();
+  std::vector<Codepoint> uchars = utf8_name_opt->get_chars ();
   if (uchars.empty ())
     {
       error = Error (UNDEF_LOCATION, "crate name cannot be empty");
diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc
index c6aa063..b2ddaf0 100644
--- a/gcc/rust/util/rust-unicode.cc
+++ b/gcc/rust/util/rust-unicode.cc
@@ -1,12 +1,14 @@
 #include "rust-system.h"
 #include "optional.h"
 #include "selftest.h"
+#include "rust-lex.h"
+#include "rust-unicode.h"
 
 #include "rust-unicode-data.h"
 
 namespace Rust {
 
-typedef uint32_t codepoint_t;
+typedef Codepoint codepoint_t;
 typedef std::vector<codepoint_t> string_t;
 
 // These constants are used to compose and decompose of Hangul syllables.
@@ -85,7 +87,7 @@ binary_search_sorted_array (const std::array<uint32_t, SIZE> &array,
 int
 lookup_cc (codepoint_t c)
 {
-  auto it = Rust::CCC_TABLE.find (c);
+  auto it = Rust::CCC_TABLE.find (c.value);
   if (it != Rust::CCC_TABLE.end ())
     return it->second;
   else
@@ -96,11 +98,11 @@ lookup_cc (codepoint_t c)
 tl::optional<codepoint_t>
 lookup_recomp (codepoint_t starter, codepoint_t c)
 {
-  auto it = Rust::RECOMPOSITION_MAP.find ({starter, c});
+  auto it = Rust::RECOMPOSITION_MAP.find ({starter.value, c.value});
   if (it != Rust::RECOMPOSITION_MAP.end ())
     return {it->second};
 
-  it = Rust::RECOMPOSITION_MAP.find ({starter, 0});
+  it = Rust::RECOMPOSITION_MAP.find ({starter.value, 0});
   if (it != Rust::RECOMPOSITION_MAP.end ())
     return {it->second};
 
@@ -110,11 +112,11 @@ lookup_recomp (codepoint_t starter, codepoint_t c)
 void
 recursive_decomp_cano (codepoint_t c, string_t &buf)
 {
-  auto it = Rust::DECOMPOSITION_MAP.find (c);
+  auto it = Rust::DECOMPOSITION_MAP.find (c.value);
   if (it != Rust::DECOMPOSITION_MAP.end ())
     {
-      string_t decomped = it->second;
-      for (codepoint_t cp : decomped)
+      std::vector<uint32_t> decomped = it->second;
+      for (uint32_t cp : decomped)
 	recursive_decomp_cano (cp, buf);
     }
   else
@@ -127,7 +129,7 @@ decomp_cano (string_t s)
   string_t buf;
   for (codepoint_t c : s)
     {
-      int64_t s_index = c - S_BASE;
+      int64_t s_index = c.value - S_BASE;
       if (0 <= s_index && s_index < S_COUNT)
 	{
 	  // decompose Hangul argorithmically
@@ -160,7 +162,7 @@ sort_cano (string_t &s)
       if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
 	{
 	  // swap
-	  int tmp = s[i];
+	  codepoint_t tmp = s[i];
 	  s[i] = s[i - 1];
 	  s[i - 1] = tmp;
 	  if (i > 1)
@@ -183,10 +185,10 @@ compose_hangul (string_t s)
       codepoint_t ch = s[src_pos];
 
       // L V => LV
-      int64_t l_index = last - L_BASE;
+      int64_t l_index = last.value - L_BASE;
       if (0 <= l_index && l_index < L_COUNT)
 	{
-	  int64_t v_index = ch - V_BASE;
+	  int64_t v_index = ch.value - V_BASE;
 	  if (0 <= v_index && v_index < V_COUNT)
 	    {
 	      last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
@@ -198,13 +200,13 @@ compose_hangul (string_t s)
 	}
 
       // LV T => LVT
-      int64_t s_index = last - S_BASE;
+      int64_t s_index = last.value - S_BASE;
       if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
 	{
-	  int64_t t_index = ch - T_BASE;
+	  int64_t t_index = ch.value - T_BASE;
 	  if (0 < t_index && t_index < T_COUNT)
 	    {
-	      last += t_index;
+	      last.value += t_index;
 	      // pop LV
 	      buf.pop_back ();
 	      buf.push_back (last);
@@ -282,6 +284,12 @@ nfc_normalize (string_t s)
   return r;
 }
 
+Utf8String
+Utf8String::nfc_normalize () const
+{
+  return Utf8String (Rust::nfc_normalize (chars));
+}
+
 bool
 is_alphabetic (uint32_t codepoint)
 {
@@ -309,9 +317,10 @@ is_numeric (uint32_t codepoint)
 namespace selftest {
 
 void
-assert_normalize (std::vector<uint32_t> origin, std::vector<uint32_t> expected)
+assert_normalize (const std::vector<Rust::Codepoint> origin,
+		  const std::vector<Rust::Codepoint> expected)
 {
-  std::vector<uint32_t> actual = Rust::nfc_normalize (origin);
+  std::vector<Rust::Codepoint> actual = Rust::nfc_normalize (origin);
 
   ASSERT_EQ (actual.size (), expected.size ());
   for (unsigned int i = 0; i < actual.size (); i++)
diff --git a/gcc/rust/util/rust-unicode.h b/gcc/rust/util/rust-unicode.h
index 6800558..becf6fb 100644
--- a/gcc/rust/util/rust-unicode.h
+++ b/gcc/rust/util/rust-unicode.h
@@ -21,28 +21,43 @@
 
 #include "optional.h"
 #include "rust-system.h"
-#include "rust-lex.h"
+#include "rust-input-source.h"
 
 namespace Rust {
 
 class Utf8String
 {
 private:
-  tl::optional<std::vector<Codepoint>> chars;
+  std::vector<Codepoint> chars;
 
 public:
-  Utf8String (const std::string &maybe_utf8)
+  static tl::optional<Utf8String>
+  make_utf8_string (const std::string &maybe_utf8)
   {
-    Lexer::BufferInputSource input_source = {maybe_utf8, 0};
-    chars = input_source.get_chars ();
+    BufferInputSource input_source = {maybe_utf8, 0};
+    tl::optional<std::vector<Codepoint>> chars_opt = input_source.get_chars ();
+    if (chars_opt.has_value ())
+      return {Utf8String (chars_opt.value ())};
+    else
+      return tl::nullopt;
   }
 
-  // Returns UTF codepoints when string is valid as UTF-8, returns nullopt
-  // otherwise.
-  tl::optional<std::vector<Codepoint>> get_chars () const { return chars; }
-};
+  Utf8String (const std::vector<Codepoint> &codepoints) : chars (codepoints) {}
+
+  std::string as_string () const
+  {
+    std::string result;
+    for (Codepoint c : chars)
+      result += c.as_string ();
 
-// TODO: add function nfc_normalize
+    return result;
+  }
+
+  // Returns a copy of the codepoints that make up this string.
+  std::vector<Codepoint> get_chars () const { return chars; }
+
+  Utf8String nfc_normalize () const;
+};
 
 bool
 is_alphabetic (uint32_t codepoint);
diff --git a/gcc/testsuite/rust/compile/unicode_norm1.rs b/gcc/testsuite/rust/compile/unicode_norm1.rs
new file mode 100644
index 0000000..d496054
--- /dev/null
+++ b/gcc/testsuite/rust/compile/unicode_norm1.rs
@@ -0,0 +1,6 @@
+fn main() {
+    // U+304C
+    let が = ();
+    // U+304B + U+3099
+    let _ = が;
+}