aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gcc/rust/lex/rust-input-source.h193
-rw-r--r--gcc/rust/lex/rust-lex.cc7
-rw-r--r--gcc/rust/lex/rust-lex.h181
-rw-r--r--gcc/rust/lex/rust-token.cc18
-rw-r--r--gcc/rust/lex/rust-token.h40
-rw-r--r--gcc/rust/rust-lang.cc2
-rw-r--r--gcc/rust/rust-session-manager.cc9
-rw-r--r--gcc/rust/util/rust-unicode.cc41
-rw-r--r--gcc/rust/util/rust-unicode.h35
-rw-r--r--gcc/testsuite/rust/compile/unicode_norm1.rs6
10 files changed, 304 insertions, 228 deletions
diff --git a/gcc/rust/lex/rust-input-source.h b/gcc/rust/lex/rust-input-source.h
new file mode 100644
index 0000000..07137de
--- /dev/null
+++ b/gcc/rust/lex/rust-input-source.h
@@ -0,0 +1,193 @@
+#ifndef RUST_INPUT_SOURCE_H
+#define RUST_INPUT_SOURCE_H
+
+#include "rust-codepoint.h"
+#include "optional.h"
+
+namespace Rust {
+// Base class for the lexer's input sources.  A derived class supplies raw
+// bytes through next_byte (); init () decodes them as UTF-8 into `chars`
+// up front, and next () then hands the code points out one at a time.
+class InputSource
+{
+private:
+  // Index into `chars` of the code point the next call to next () returns.
+  unsigned int pos;
+  // Code points decoded by init (), up to EOF or the first decoding error.
+  std::vector<Codepoint> chars;
+  // Cleared by init () when the input turned out not to be valid UTF-8.
+  bool is_valid_utf8;
+
+  // Returns the next raw byte of the underlying input, or EOF once the
+  // input is exhausted.
+  virtual int next_byte () = 0;
+
+  // True if `byte` has the form 10xxxxxx, i.e. is a UTF-8 continuation byte.
+  static bool is_continuation_byte (uint8_t byte)
+  {
+    return (byte & 0xC0) == 0x80;
+  }
+
+  // Decodes one code point from the byte stream.
+  //
+  // Returns Codepoint::eof () when the input ends before a new sequence
+  // starts, and the sentinel 0xFFFE when the bytes do not form a sequence
+  // this decoder accepts.  A byte order mark (EF BB BF) is skipped and the
+  // code point following it is returned instead.
+  //
+  // NOTE(review): overlong encodings, surrogates and values above U+10FFFF
+  // are not rejected here.
+  Codepoint next_codepoint ()
+  {
+    uint32_t input = next_byte ();
+
+    if ((int32_t) input == EOF)
+      return Codepoint::eof ();
+
+    if (input < 128)
+      {
+        // ascii -- 1 byte
+        return {input};
+      }
+
+    if ((input & 0xC0) == 0x80)
+      {
+        // invalid (continuation; can't be first char)
+        return {0xFFFE};
+      }
+
+    if ((input & 0xE0) == 0xC0)
+      {
+        // 2 bytes.  An EOF from next_byte () truncates to 0xFF here, which
+        // is not a continuation byte, so a cut-off sequence is rejected.
+        uint8_t input2 = next_byte ();
+        if (!is_continuation_byte (input2))
+          return {0xFFFE};
+
+        uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
+        return {output};
+      }
+
+    if ((input & 0xF0) == 0xE0)
+      {
+        // 3 bytes or UTF-8 BOM
+        uint8_t input2 = next_byte ();
+        if (!is_continuation_byte (input2))
+          return {0xFFFE};
+
+        uint8_t input3 = next_byte ();
+        if (!is_continuation_byte (input3))
+          return {0xFFFE};
+
+        // EF BB BF is the byte order mark (U+FEFF): drop it and decode
+        // whatever follows.  Only this exact triple is special; EF BB 80
+        // .. EF BB BE are the ordinary characters U+FEC0 .. U+FEFE and must
+        // be decoded like any other three-byte sequence.
+        if (input == 0xEF && input2 == 0xBB && input3 == 0xBF)
+          return next_codepoint ();
+
+        uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
+                          | ((input3 & 0x3F) << 0);
+        return {output};
+      }
+
+    if ((input & 0xF8) == 0xF0)
+      {
+        // 4 bytes
+        uint8_t input2 = next_byte ();
+        if (!is_continuation_byte (input2))
+          return {0xFFFE};
+
+        uint8_t input3 = next_byte ();
+        if (!is_continuation_byte (input3))
+          return {0xFFFE};
+
+        uint8_t input4 = next_byte ();
+        if (!is_continuation_byte (input4))
+          return {0xFFFE};
+
+        uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
+                          | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
+        return {output};
+      }
+
+    // Lead byte of the form 11111xxx: not a valid start of a sequence.
+    return {0xFFFE};
+  }
+
+protected:
+  // Decodes the whole input and appends every code point to `chars`.
+  // Decoding stops at EOF or at the first invalid sequence; in the latter
+  // case the source is marked as not being valid UTF-8.  Derived classes
+  // call this from their constructor, once next_byte () is usable.
+  void init ()
+  {
+    Codepoint char32 = next_codepoint ();
+    while (!char32.is_eof () && char32 != 0xFFFE)
+      {
+        chars.push_back (char32);
+        char32 = next_codepoint ();
+      }
+
+    if (char32 == 0xFFFE)
+      {
+        // Input source is not valid as utf-8.
+        is_valid_utf8 = false;
+      }
+  }
+
+public:
+  InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}
+
+  virtual ~InputSource () {}
+
+  // Checks if the input source was decoded as valid UTF-8.
+  bool is_valid () const { return is_valid_utf8; }
+
+  // Returns the next decoded code point and advances, or Codepoint::eof ()
+  // once every decoded code point has been handed out.
+  Codepoint next ()
+  {
+    if (pos >= chars.size ())
+      return Codepoint::eof ();
+
+    Codepoint c = chars[pos];
+    pos++;
+    return c;
+  }
+
+  // Returns a copy of the decoded code points if the input source is valid
+  // UTF-8, nullopt otherwise.
+  tl::optional<std::vector<Codepoint>> get_chars () const
+  {
+    if (is_valid ())
+      return {chars};
+    else
+      return tl::nullopt;
+  }
+};
+
+// Input source whose bytes come from a C stdio stream.
+class FileInputSource : public InputSource
+{
+private:
+  // Stream the bytes are read from.  Nothing in this class closes it.
+  FILE *input;
+
+  // Reads one byte with fgetc () and passes its result, EOF included,
+  // straight through.
+  int next_byte () override
+  {
+    int byte = fgetc (input);
+    return byte;
+  }
+
+public:
+  // Create a new input source reading from `input`.  The stream is decoded
+  // in full right here, through init ().
+  FileInputSource (FILE *input) : InputSource (), input (input)
+  {
+    // TODO make this better?
+    init ();
+  }
+};
+
+// Input source whose bytes come from an in-memory string.
+class BufferInputSource : public InputSource
+{
+private:
+  // String the bytes are read from; held by reference, not copied.
+  const std::string &buffer;
+  // Index in `buffer` of the next byte to hand out.
+  size_t offs;
+
+  // Returns the byte at `offs` and advances, or EOF past the end of the
+  // buffer.
+  int next_byte () override
+  {
+    if (offs < buffer.size ())
+      {
+        // Go through uint8_t so that bytes >= 0x80 come back as 128..255
+        // rather than as negative values.
+        uint8_t byte = buffer.at (offs);
+        offs++;
+        return byte;
+      }
+
+    return EOF;
+  }
+
+public:
+  // Create a new input source from the string `b`, starting at byte
+  // `offset`.  The remaining bytes are decoded in full right here, through
+  // init ().
+  BufferInputSource (const std::string &b, size_t offset)
+    : InputSource (), buffer (b), offs (offset)
+  {
+    // TODO make this better?
+    init ();
+  }
+};
+
+} // namespace Rust
+
+#endif
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index c40e700..2a92465 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -2534,8 +2534,7 @@ namespace selftest {
// Checks if `src` has the same contents as the given characters
void
-assert_source_content (Rust::Lexer::InputSource &src,
- std::vector<uint32_t> expected)
+assert_source_content (Rust::InputSource &src, std::vector<uint32_t> expected)
{
Rust::Codepoint src_char = src.next ();
for (auto expected_char : expected)
@@ -2553,7 +2552,7 @@ assert_source_content (Rust::Lexer::InputSource &src,
void
test_buffer_input_source (std::string str, std::vector<uint32_t> expected)
{
- Rust::Lexer::BufferInputSource source (str, 0);
+ Rust::BufferInputSource source (str, 0);
assert_source_content (source, expected);
}
@@ -2564,7 +2563,7 @@ test_file_input_source (std::string str, std::vector<uint32_t> expected)
// Moves to the first character
fputs (str.c_str (), tmpf);
std::rewind (tmpf);
- Rust::Lexer::FileInputSource source (tmpf);
+ Rust::FileInputSource source (tmpf);
assert_source_content (source, expected);
}
diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h
index 273b8c7..6a29c0a 100644
--- a/gcc/rust/lex/rust-lex.h
+++ b/gcc/rust/lex/rust-lex.h
@@ -24,6 +24,7 @@
#include "rust-token.h"
#include "optional.h"
#include "selftest.h"
+#include "rust-input-source.h"
namespace Rust {
// Simple wrapper for FILE* that simplifies destruction.
@@ -204,186 +205,6 @@ public:
Linemap *get_line_map () { return line_map; }
std::string get_filename () { return std::string (input.get_filename ()); }
- // Input source wrapper thing.
- class InputSource
- {
- private:
- // position of current character
- unsigned int pos;
- std::vector<Codepoint> chars;
- bool is_valid_utf8;
-
- // Overload operator () to return next char from input stream.
- virtual int next_byte () = 0;
-
- Codepoint next_codepoint ()
- {
- uint32_t input = next_byte ();
-
- if ((int32_t) input == EOF)
- return Codepoint::eof ();
- else if (input < 128)
- {
- // ascii -- 1 byte
- return {input};
- }
- else if ((input & 0xC0) == 0x80)
- {
- // invalid (continuation; can't be first char)
- return {0xFFFE};
- }
- else if ((input & 0xE0) == 0xC0)
- {
- // 2 bytes
- uint8_t input2 = next_byte ();
- if ((input2 & 0xC0) != 0x80)
- return {0xFFFE};
-
- uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
- return output;
- }
- else if ((input & 0xF0) == 0xE0)
- {
- // 3 bytes or UTF-8 BOM
- uint8_t input2 = next_byte ();
- // If the second byte is equal to 0xBB then the input is no longer a
- // valid UTF-8 char. Then, we check if the third byte makes up a UTF
- // BOM.
- if (input == 0xEF && input2 == 0xBB)
- {
- uint8_t input3 = next_byte ();
- if (input3 == 0xBF)
- // found BOM
- return next_codepoint ();
- else
- return {0xFFFE};
- }
-
- if ((input2 & 0xC0) != 0x80)
- return {0xFFFE};
-
- uint8_t input3 = next_byte ();
-
- if ((input3 & 0xC0) != 0x80)
- return {0xFFFE};
-
- uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
- | ((input3 & 0x3F) << 0);
- return {output};
- }
- else if ((input & 0xF8) == 0xF0)
- {
- // 4 bytes
- uint8_t input2 = next_byte ();
- if ((input2 & 0xC0) != 0x80)
- return {0xFFFE};
-
- uint8_t input3 = next_byte ();
- if ((input3 & 0xC0) != 0x80)
- return {0xFFFE};
-
- uint8_t input4 = next_byte ();
- if ((input4 & 0xC0) != 0x80)
- return {0xFFFE};
-
- uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
- | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
- return {output};
- }
- else
- {
- return {0xFFFE};
- }
- }
-
- protected:
- // Check if the input source is valid as utf-8 and copy all characters to
- // `chars`.
- void init ()
- {
- Codepoint char32 = next_codepoint ();
- while (!char32.is_eof () && char32 != 0xFFFE)
- {
- chars.push_back (char32);
- char32 = next_codepoint ();
- }
-
- if (char32 == 0xFFFE)
- {
- // Input source is not valid as utf-8.
- is_valid_utf8 = false;
- }
- }
-
- public:
- InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}
-
- virtual ~InputSource () {}
-
- bool is_valid () { return is_valid_utf8; }
-
- // get the next UTF-8 character
- Codepoint next ()
- {
- if (pos >= chars.size ())
- return Codepoint::eof ();
- else
- {
- Codepoint c = chars[pos];
- pos++;
- return c;
- }
- }
-
- tl::optional<std::vector<Codepoint>> get_chars ()
- {
- if (is_valid ())
- return {chars};
- else
- return tl::nullopt;
- }
- };
-
- class FileInputSource : public InputSource
- {
- private:
- // Input source file.
- FILE *input;
-
- int next_byte () override { return fgetc (input); }
-
- public:
- // Create new input source from file.
- FileInputSource (FILE *input) : InputSource (), input (input)
- {
- // TODO make this better?
- init ();
- }
- };
-
- class BufferInputSource : public InputSource
- {
- private:
- const std::string &buffer;
- size_t offs;
-
- int next_byte () override
- {
- if (offs >= buffer.size ())
- return EOF;
- return (uint8_t) buffer.at (offs++);
- }
-
- public:
- // Create new input source from file.
- BufferInputSource (const std::string &b, size_t offset)
- : InputSource (), buffer (b), offs (offset)
- {
- // TODO make this better?
- init ();
- }
- };
-
private:
void start_line (int current_line, int current_column);
diff --git a/gcc/rust/lex/rust-token.cc b/gcc/rust/lex/rust-token.cc
index 77ec6cf..9a1132f 100644
--- a/gcc/rust/lex/rust-token.cc
+++ b/gcc/rust/lex/rust-token.cc
@@ -19,6 +19,7 @@
#include "rust-system.h"
#include "rust-token.h"
#include "rust-diagnostics.h"
+#include "rust-unicode.h"
namespace Rust {
// Hackily defined way to get token description for enum value using x-macros
@@ -150,6 +151,23 @@ Token::get_type_hint_str () const
return get_type_hint_string (type_hint);
}
+// Returns the string a token with id `id` should store for `str`.
+// Identifier and lifetime strings are decoded as UTF-8 and NFC-normalized;
+// the string of any other token is returned unchanged.  An identifier or
+// lifetime that is not valid UTF-8 is reported as an internal error at
+// `loc`.
+std::string
+nfc_normalize_token_string (location_t loc, TokenId id, const std::string &str)
+{
+  const bool needs_normalization = (id == IDENTIFIER || id == LIFETIME);
+  if (!needs_normalization)
+    return str;
+
+  tl::optional<Utf8String> decoded = Utf8String::make_utf8_string (str);
+  if (decoded.has_value ())
+    return decoded.value ().nfc_normalize ().as_string ();
+
+  rust_internal_error_at (loc, "identifier '%s' is not a valid UTF-8 string",
+                          str.c_str ());
+}
+
const std::string &
Token::get_str () const
{
diff --git a/gcc/rust/lex/rust-token.h b/gcc/rust/lex/rust-token.h
index 5675351..c4ea176 100644
--- a/gcc/rust/lex/rust-token.h
+++ b/gcc/rust/lex/rust-token.h
@@ -21,7 +21,8 @@
#include "rust-system.h"
#include "rust-linemap.h"
-#include "rust-codepoint.h"
+#include "rust-make-unique.h"
+#include "rust-unicode.h"
namespace Rust {
// "Primitive core types" in Rust - the different int and float types, as well
@@ -236,6 +237,10 @@ token_id_keyword_string (TokenId id);
const char *
get_type_hint_string (PrimitiveCoreType type);
+/* NFC-normalize the string if the token is an identifier or a lifetime */
+std::string
+nfc_normalize_token_string (location_t loc, TokenId id, const std::string &str);
+
// Represents a single token. Create using factory static methods.
class Token
{
@@ -259,29 +264,40 @@ private:
// Token constructor from token id, location, and a string.
Token (TokenId token_id, location_t location, std::string &&paramStr)
- : token_id (token_id), locus (location),
- str (new std::string (std::move (paramStr))), type_hint (CORETYPE_UNKNOWN)
- {}
+ : token_id (token_id), locus (location), type_hint (CORETYPE_UNKNOWN)
+ {
+ // Normalize identifier tokens
+ str = Rust::make_unique<std::string> (
+ nfc_normalize_token_string (location, token_id, paramStr));
+ }
// Token constructor from token id, location, and a char.
Token (TokenId token_id, location_t location, char paramChar)
: token_id (token_id), locus (location),
str (new std::string (1, paramChar)), type_hint (CORETYPE_UNKNOWN)
- {}
+ {
+ // No need to normalize a single-byte char
+ }
// Token constructor from token id, location, and a "codepoint".
Token (TokenId token_id, location_t location, Codepoint paramCodepoint)
- : token_id (token_id), locus (location),
- str (new std::string (paramCodepoint.as_string ())),
- type_hint (CORETYPE_UNKNOWN)
- {}
+ : token_id (token_id), locus (location), type_hint (CORETYPE_UNKNOWN)
+ {
+ // Normalize identifier tokens
+ str = Rust::make_unique<std::string> (
+ nfc_normalize_token_string (location, token_id,
+ paramCodepoint.as_string ()));
+ }
// Token constructor from token id, location, a string, and type hint.
Token (TokenId token_id, location_t location, std::string &&paramStr,
PrimitiveCoreType parType)
- : token_id (token_id), locus (location),
- str (new std::string (std::move (paramStr))), type_hint (parType)
- {}
+ : token_id (token_id), locus (location), type_hint (parType)
+ {
+ // Normalize identifier tokens
+ str = Rust::make_unique<std::string> (
+ nfc_normalize_token_string (location, token_id, paramStr));
+ }
public:
// No default constructor.
diff --git a/gcc/rust/rust-lang.cc b/gcc/rust/rust-lang.cc
index f07d18f..44dc3fc 100644
--- a/gcc/rust/rust-lang.cc
+++ b/gcc/rust/rust-lang.cc
@@ -455,11 +455,11 @@ run_rust_tests ()
{
// Call tests for the rust frontend here
rust_input_source_test ();
+ rust_utf8_normalize_test ();
rust_cfg_parser_test ();
rust_privacy_ctx_test ();
rust_crate_name_validation_test ();
rust_simple_path_resolve_test ();
- rust_utf8_normalize_test ();
}
} // namespace selftest
diff --git a/gcc/rust/rust-session-manager.cc b/gcc/rust/rust-session-manager.cc
index 3c00ccb..b860a32 100644
--- a/gcc/rust/rust-session-manager.cc
+++ b/gcc/rust/rust-session-manager.cc
@@ -115,16 +115,15 @@ infer_crate_name (const std::string &filename)
static bool
validate_crate_name (const std::string &crate_name, Error &error)
{
- Utf8String utf8_name = {crate_name};
- tl::optional<std::vector<Codepoint>> uchars_opt = utf8_name.get_chars ();
-
- if (!uchars_opt.has_value ())
+ tl::optional<Utf8String> utf8_name_opt
+ = Utf8String::make_utf8_string (crate_name);
+ if (!utf8_name_opt.has_value ())
{
error = Error (UNDEF_LOCATION, "crate name is not a valid UTF-8 string");
return false;
}
- std::vector<Codepoint> uchars = uchars_opt.value ();
+ std::vector<Codepoint> uchars = utf8_name_opt->get_chars ();
if (uchars.empty ())
{
error = Error (UNDEF_LOCATION, "crate name cannot be empty");
diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc
index c6aa063..b2ddaf0 100644
--- a/gcc/rust/util/rust-unicode.cc
+++ b/gcc/rust/util/rust-unicode.cc
@@ -1,12 +1,14 @@
#include "rust-system.h"
#include "optional.h"
#include "selftest.h"
+#include "rust-lex.h"
+#include "rust-unicode.h"
#include "rust-unicode-data.h"
namespace Rust {
-typedef uint32_t codepoint_t;
+typedef Codepoint codepoint_t;
typedef std::vector<codepoint_t> string_t;
// These constants are used to compose and decompose of Hangul syllables.
@@ -85,7 +87,7 @@ binary_search_sorted_array (const std::array<uint32_t, SIZE> &array,
int
lookup_cc (codepoint_t c)
{
- auto it = Rust::CCC_TABLE.find (c);
+ auto it = Rust::CCC_TABLE.find (c.value);
if (it != Rust::CCC_TABLE.end ())
return it->second;
else
@@ -96,11 +98,11 @@ lookup_cc (codepoint_t c)
tl::optional<codepoint_t>
lookup_recomp (codepoint_t starter, codepoint_t c)
{
- auto it = Rust::RECOMPOSITION_MAP.find ({starter, c});
+ auto it = Rust::RECOMPOSITION_MAP.find ({starter.value, c.value});
if (it != Rust::RECOMPOSITION_MAP.end ())
return {it->second};
- it = Rust::RECOMPOSITION_MAP.find ({starter, 0});
+ it = Rust::RECOMPOSITION_MAP.find ({starter.value, 0});
if (it != Rust::RECOMPOSITION_MAP.end ())
return {it->second};
@@ -110,11 +112,11 @@ lookup_recomp (codepoint_t starter, codepoint_t c)
void
recursive_decomp_cano (codepoint_t c, string_t &buf)
{
- auto it = Rust::DECOMPOSITION_MAP.find (c);
+ auto it = Rust::DECOMPOSITION_MAP.find (c.value);
if (it != Rust::DECOMPOSITION_MAP.end ())
{
- string_t decomped = it->second;
- for (codepoint_t cp : decomped)
+ std::vector<uint32_t> decomped = it->second;
+ for (uint32_t cp : decomped)
recursive_decomp_cano (cp, buf);
}
else
@@ -127,7 +129,7 @@ decomp_cano (string_t s)
string_t buf;
for (codepoint_t c : s)
{
- int64_t s_index = c - S_BASE;
+ int64_t s_index = c.value - S_BASE;
if (0 <= s_index && s_index < S_COUNT)
{
// decompose Hangul argorithmically
@@ -160,7 +162,7 @@ sort_cano (string_t &s)
if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
{
// swap
- int tmp = s[i];
+ codepoint_t tmp = s[i];
s[i] = s[i - 1];
s[i - 1] = tmp;
if (i > 1)
@@ -183,10 +185,10 @@ compose_hangul (string_t s)
codepoint_t ch = s[src_pos];
// L V => LV
- int64_t l_index = last - L_BASE;
+ int64_t l_index = last.value - L_BASE;
if (0 <= l_index && l_index < L_COUNT)
{
- int64_t v_index = ch - V_BASE;
+ int64_t v_index = ch.value - V_BASE;
if (0 <= v_index && v_index < V_COUNT)
{
last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
@@ -198,13 +200,13 @@ compose_hangul (string_t s)
}
// LV T => LVT
- int64_t s_index = last - S_BASE;
+ int64_t s_index = last.value - S_BASE;
if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
{
- int64_t t_index = ch - T_BASE;
+ int64_t t_index = ch.value - T_BASE;
if (0 < t_index && t_index < T_COUNT)
{
- last += t_index;
+ last.value += t_index;
// pop LV
buf.pop_back ();
buf.push_back (last);
@@ -282,6 +284,12 @@ nfc_normalize (string_t s)
return r;
}
+// Returns a new Utf8String holding the result of Rust::nfc_normalize
+// applied to this string's code points.  This object is not modified.
+Utf8String
+Utf8String::nfc_normalize () const
+{
+  string_t normalized = Rust::nfc_normalize (chars);
+  return Utf8String (std::move (normalized));
+}
+
bool
is_alphabetic (uint32_t codepoint)
{
@@ -309,9 +317,10 @@ is_numeric (uint32_t codepoint)
namespace selftest {
void
-assert_normalize (std::vector<uint32_t> origin, std::vector<uint32_t> expected)
+assert_normalize (const std::vector<Rust::Codepoint> origin,
+ const std::vector<Rust::Codepoint> expected)
{
- std::vector<uint32_t> actual = Rust::nfc_normalize (origin);
+ std::vector<Rust::Codepoint> actual = Rust::nfc_normalize (origin);
ASSERT_EQ (actual.size (), expected.size ());
for (unsigned int i = 0; i < actual.size (); i++)
diff --git a/gcc/rust/util/rust-unicode.h b/gcc/rust/util/rust-unicode.h
index 6800558..becf6fb 100644
--- a/gcc/rust/util/rust-unicode.h
+++ b/gcc/rust/util/rust-unicode.h
@@ -21,28 +21,43 @@
#include "optional.h"
#include "rust-system.h"
-#include "rust-lex.h"
+#include "rust-input-source.h"
namespace Rust {
class Utf8String
{
private:
- tl::optional<std::vector<Codepoint>> chars;
+ std::vector<Codepoint> chars;
public:
- Utf8String (const std::string &maybe_utf8)
+ static tl::optional<Utf8String>
+ make_utf8_string (const std::string &maybe_utf8)
{
- Lexer::BufferInputSource input_source = {maybe_utf8, 0};
- chars = input_source.get_chars ();
+ BufferInputSource input_source = {maybe_utf8, 0};
+ tl::optional<std::vector<Codepoint>> chars_opt = input_source.get_chars ();
+ if (chars_opt.has_value ())
+ return {Utf8String (chars_opt.value ())};
+ else
+ return tl::nullopt;
}
- // Returns UTF codepoints when string is valid as UTF-8, returns nullopt
- // otherwise.
- tl::optional<std::vector<Codepoint>> get_chars () const { return chars; }
-};
+ Utf8String (const std::vector<Codepoint> codepoints) : chars ({codepoints}) {}
+
+ std::string as_string () const
+ {
+ std::stringstream ss;
+ for (Codepoint c : chars)
+ ss << c.as_string ();
-// TODO: add function nfc_normalize
+ return ss.str ();
+ };
+
+ // Returns characters
+ std::vector<Codepoint> get_chars () const { return chars; }
+
+ Utf8String nfc_normalize () const;
+};
bool
is_alphabetic (uint32_t codepoint);
diff --git a/gcc/testsuite/rust/compile/unicode_norm1.rs b/gcc/testsuite/rust/compile/unicode_norm1.rs
new file mode 100644
index 0000000..d496054
--- /dev/null
+++ b/gcc/testsuite/rust/compile/unicode_norm1.rs
@@ -0,0 +1,6 @@
+fn main() {
+ // U+304C
+ let が = ();
+ // U+304B + U+3099
+ let _ = が;
+}