aboutsummaryrefslogtreecommitdiff
path: root/llvm/unittests/Support/UnicodeTest.cpp
diff options
context:
space:
mode:
authorCorentin Jabot <corentinjabot@gmail.com>2022-04-04 12:41:12 +0200
committerCorentin Jabot <corentinjabot@gmail.com>2022-06-25 19:03:33 +0200
commitc92056d038812c23800131892bee48abb2de7ca0 (patch)
tree9d6b03771d9072131513830402ee2312819948fb /llvm/unittests/Support/UnicodeTest.cpp
parentf8c1c9afd3e2286a8fac99fb9978f1566b89fa70 (diff)
downloadllvm-c92056d038812c23800131892bee48abb2de7ca0.zip
llvm-c92056d038812c23800131892bee48abb2de7ca0.tar.gz
llvm-c92056d038812c23800131892bee48abb2de7ca0.tar.bz2
[Clang][C++23] P2071 Named universal character escapes
Implements [[ https://wg21.link/p2071r1 | P2071 Named Universal Character Escapes ]] - as an extension in all language modes; the patch to not warn in C++23 mode will be done later, once this paper is plenary approved (in July). We add * A code generator that transforms `UnicodeData.txt` and `NameAliases.txt` into a space-efficient data structure that can be queried in `O(NameLength)` * A set of functions in `Unicode.h` to query that data, including * A function to find an exact match of a given Unicode character name * A function to perform a loose (ignoring case, space, underscore, medial hyphen) matching * A function returning the best matching codepoint for a given string per edit distance * Support of `\N{}` escape sequences in string and character literals, with loose-matching and typo diagnostics/fix-its * Support of `\N{}` as UCN with loose-matching diagnostics/fix-its. Loose matching is considered an error to match closely the semantics of P2071. The generated data contributes 280kB of data to the binaries. `UnicodeData.txt` and `NameAliases.txt` are not committed to the repository in this patch, and regenerating the data is a manual process. Reviewed By: tahonermann Differential Revision: https://reviews.llvm.org/D123064
Diffstat (limited to 'llvm/unittests/Support/UnicodeTest.cpp')
-rw-r--r--llvm/unittests/Support/UnicodeTest.cpp315
1 files changed, 315 insertions, 0 deletions
diff --git a/llvm/unittests/Support/UnicodeTest.cpp b/llvm/unittests/Support/UnicodeTest.cpp
index 09f1cb3..89fbb5a 100644
--- a/llvm/unittests/Support/UnicodeTest.cpp
+++ b/llvm/unittests/Support/UnicodeTest.cpp
@@ -7,7 +7,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Unicode.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/edit_distance.h"
#include "llvm/Support/ConvertUTF.h"
+#include "gmock/gmock.h"
#include "gtest/gtest.h"
namespace llvm {
@@ -101,6 +104,318 @@ TEST(Unicode, isPrintable) {
}
}
+// Checks nameToCodepointStrict(): a name resolves to a code point only when it
+// is spelled exactly; every other input must yield no value.
+TEST(Unicode, nameToCodepointStrict) {
+  // Collapse the optional result to a plain value. 0xFFFF'FFFF stands in for
+  // "no match" so that each expectation below is a simple equality.
+  auto map = [](StringRef Str) {
+    return nameToCodepointStrict(Str).getValueOr(0xFFFF'FFFF);
+  };
+
+  // Names that embed the code point as a hexadecimal suffix.
+  EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400"));
+  EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF"));
+  EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00"));
+  EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC"));
+  EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000"));
+  EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD"));
+  EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700"));
+  EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740"));
+  EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D"));
+  EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820"));
+  EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1"));
+  EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0"));
+  EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0"));
+  EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000"));
+  EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A"));
+  EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000"));
+  EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7"));
+  EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00"));
+  EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08"));
+  EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00"));
+  EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5"));
+  EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170"));
+  EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB"));
+  EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900"));
+  EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D"));
+  EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70"));
+  EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9"));
+  EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800"));
+  EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D"));
+
+  // Hangul syllable names.
+  EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA"));
+  EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS"));
+  EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH"));
+  EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB"));
+  EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA"));
+  EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A"));
+  EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E"));
+  EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I"));
+
+  // Ordinary character names, from the shortest to a very long one.
+  EXPECT_EQ(0x1F984u, map("UNICORN FACE"));
+  EXPECT_EQ(0x00640u, map("ARABIC TATWEEL"));
+  EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU"));
+  EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001"));
+  EXPECT_EQ(0x02235u, map("BECAUSE"));
+  EXPECT_EQ(0x1F514u, map("BELL"));
+  EXPECT_EQ(0x1F9A9u, map("FLAMINGO"));
+  EXPECT_EQ(0x1F402u, map("OX")); // 2 characters
+  EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                          "ABOVE WITH ALEF MAKSURA ISOLATED FORM"));
+
+  // Aliases
+  EXPECT_EQ(0x0000u, map("NULL"));
+  EXPECT_EQ(0x0007u, map("ALERT"));
+  EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION"));
+  EXPECT_EQ(0x0009u, map("CHARACTER TABULATION"));
+  EXPECT_EQ(0x000Au, map("LINE FEED"));
+  EXPECT_EQ(0x000Au, map("NEW LINE"));
+  EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION"));
+  EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION"));
+  EXPECT_EQ(0x2118u,
+            map("WEIERSTRASS ELLIPTIC FUNCTION")); // correction
+  EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P")); // correction
+  EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK")); // alternate
+  EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE")); // alternate
+
+  // Matching is exact: empty or unknown names, case differences, truncated
+  // names, and missing or altered separators must all fail.
+  EXPECT_EQ(0xFFFFFFFFu, map(""));
+  EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER"));
+  EXPECT_EQ(0xFFFFFFFFu, map("unicorn face"));
+  EXPECT_EQ(0xFFFFFFFFu, map("UNICORN FaCE"));
+  EXPECT_EQ(0xFFFFFFFFu, map("UNICORNFaCE"));
+  EXPECT_EQ(0xFFFFFFFFu, map("UNICORN"));
+  EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE i"));
+  EXPECT_EQ(0xFFFFFFFFu, map("hANGUL SYLLABLE i"));
+  EXPECT_EQ(0xFFFFFFFFu, map("HANGULSYLLABLEI"));
+  EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE"));
+  EXPECT_EQ(0xFFFFFFFFu, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D"));
+  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d"));
+  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D"));
+  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER"));
+  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1"));
+  EXPECT_EQ(0xFFFFFFFFu, map("ZERO WIDTH NO BREAK SPACE"));
+
+  // Should not support abbreviations or figments
+  EXPECT_EQ(0xFFFFFFFFu, map("FVS1"));
+  EXPECT_EQ(0xFFFFFFFFu, map("HIGH OCTET PRESET"));
+  EXPECT_EQ(0xFFFFFFFFu, map("BEL"));
+}
+
+// Checks nameToCodepointLooseMatching(): exact names still resolve, and names
+// that differ only in case, spaces, or medial hyphens resolve to the same code
+// point, while unknown or incomplete names yield no match.
+TEST(Unicode, nameToCodepointLoose) {
+  // Collapse the optional result to its code point. 0xFFFF'FFFF stands in for
+  // "no match" so that each expectation below is a simple equality.
+  auto map = [](StringRef Str) {
+    auto Opt = nameToCodepointLooseMatching(Str);
+    if (!Opt)
+      return char32_t(0xFFFF'FFFF);
+    return Opt->CodePoint;
+  };
+
+  // Names that embed the code point as a hexadecimal suffix.
+  EXPECT_EQ(0x04DBFu, map("CJK UNIFIED IDEOGRAPH-4DBF"));
+  EXPECT_EQ(0x04E00u, map("CJK UNIFIED IDEOGRAPH-4E00"));
+  EXPECT_EQ(0x09FFCu, map("CJK UNIFIED IDEOGRAPH-9FFC"));
+  EXPECT_EQ(0x20000u, map("CJK UNIFIED IDEOGRAPH-20000"));
+  EXPECT_EQ(0x2A6DDu, map("CJK UNIFIED IDEOGRAPH-2A6DD"));
+  EXPECT_EQ(0x2A700u, map("CJK UNIFIED IDEOGRAPH-2A700"));
+  EXPECT_EQ(0x2B740u, map("CJK UNIFIED IDEOGRAPH-2B740"));
+  EXPECT_EQ(0x03400u, map("CJK UNIFIED IDEOGRAPH-3400"));
+  EXPECT_EQ(0x2B81Du, map("CJK UNIFIED IDEOGRAPH-2B81D"));
+  EXPECT_EQ(0x2B820u, map("CJK UNIFIED IDEOGRAPH-2B820"));
+  EXPECT_EQ(0x2CEA1u, map("CJK UNIFIED IDEOGRAPH-2CEA1"));
+  EXPECT_EQ(0x2CEB0u, map("CJK UNIFIED IDEOGRAPH-2CEB0"));
+  EXPECT_EQ(0x2EBE0u, map("CJK UNIFIED IDEOGRAPH-2EBE0"));
+  EXPECT_EQ(0x30000u, map("CJK UNIFIED IDEOGRAPH-30000"));
+  EXPECT_EQ(0x3134Au, map("CJK UNIFIED IDEOGRAPH-3134A"));
+  EXPECT_EQ(0x17000u, map("TANGUT IDEOGRAPH-17000"));
+  EXPECT_EQ(0x187F7u, map("TANGUT IDEOGRAPH-187F7"));
+  EXPECT_EQ(0x18D00u, map("TANGUT IDEOGRAPH-18D00"));
+  EXPECT_EQ(0x18D08u, map("TANGUT IDEOGRAPH-18D08"));
+  EXPECT_EQ(0x18B00u, map("KHITAN SMALL SCRIPT CHARACTER-18B00"));
+  EXPECT_EQ(0x18CD5u, map("KHITAN SMALL SCRIPT CHARACTER-18CD5"));
+  EXPECT_EQ(0x1B170u, map("NUSHU CHARACTER-1B170"));
+  EXPECT_EQ(0x1B2FBu, map("NUSHU CHARACTER-1B2FB"));
+  EXPECT_EQ(0x0F900u, map("CJK COMPATIBILITY IDEOGRAPH-F900"));
+  EXPECT_EQ(0x0FA6Du, map("CJK COMPATIBILITY IDEOGRAPH-FA6D"));
+  EXPECT_EQ(0x0FA70u, map("CJK COMPATIBILITY IDEOGRAPH-FA70"));
+  EXPECT_EQ(0x0FAD9u, map("CJK COMPATIBILITY IDEOGRAPH-FAD9"));
+  EXPECT_EQ(0x2F800u, map("CJK COMPATIBILITY IDEOGRAPH-2F800"));
+  EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1D"));
+
+  // Hangul syllable names.
+  EXPECT_EQ(0xAC00u, map("HANGUL SYLLABLE GA"));
+  EXPECT_EQ(0xAC14u, map("HANGUL SYLLABLE GASS"));
+  EXPECT_EQ(0xAC2Bu, map("HANGUL SYLLABLE GAELH"));
+  EXPECT_EQ(0xAC7Bu, map("HANGUL SYLLABLE GEOLB"));
+  EXPECT_EQ(0xC640u, map("HANGUL SYLLABLE WA"));
+  EXPECT_EQ(0xC544u, map("HANGUL SYLLABLE A"));
+  EXPECT_EQ(0xC5D0u, map("HANGUL SYLLABLE E"));
+  EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE I"));
+
+  // Ordinary character names, from the shortest to a very long one.
+  EXPECT_EQ(0x1F984u, map("UNICORN FACE"));
+  EXPECT_EQ(0x00640u, map("ARABIC TATWEEL"));
+  EXPECT_EQ(0x02C05u, map("GLAGOLITIC CAPITAL LETTER YESTU"));
+  EXPECT_EQ(0x13000u, map("EGYPTIAN HIEROGLYPH A001"));
+  EXPECT_EQ(0x02235u, map("BECAUSE"));
+  EXPECT_EQ(0x1F514u, map("BELL"));
+  EXPECT_EQ(0x1F9A9u, map("FLAMINGO"));
+  EXPECT_EQ(0x1F402u, map("OX")); // 2 characters
+  EXPECT_EQ(0x0FBF9u, map("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                          "ABOVE WITH ALEF MAKSURA ISOLATED FORM"));
+
+  // Aliases
+  EXPECT_EQ(0x0000u, map("NULL"));
+  EXPECT_EQ(0x0007u, map("ALERT"));
+  EXPECT_EQ(0x0009u, map("HORIZONTAL TABULATION"));
+  EXPECT_EQ(0x0009u, map("CHARACTER TABULATION"));
+  EXPECT_EQ(0x000Au, map("LINE FEED"));
+  EXPECT_EQ(0x000Au, map("NEW LINE"));
+  EXPECT_EQ(0x0089u, map("CHARACTER TABULATION WITH JUSTIFICATION"));
+  EXPECT_EQ(0x0089u, map("HORIZONTAL TABULATION WITH JUSTIFICATION"));
+  EXPECT_EQ(0x2118u,
+            map("WEIERSTRASS ELLIPTIC FUNCTION")); // correction
+  EXPECT_EQ(0x2118u, map("SCRIPT CAPITAL P")); // correction
+  EXPECT_EQ(0xFEFFu, map("BYTE ORDER MARK")); // alternate
+  EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO-BREAK SPACE")); // alternate
+  EXPECT_EQ(0xFEFFu, map("ZERO WIDTH NO BREAK SPACE")); // alternate
+
+  // Loose matching ignores case and spaces, but empty, unknown, or truncated
+  // names must still fail.
+  EXPECT_EQ(0xFFFFFFFFu, map(""));
+  EXPECT_EQ(0xFFFFFFFFu, map("NOT A UNICODE CHARACTER"));
+  EXPECT_EQ(0x0001F984u, map("unicorn face"));
+  EXPECT_EQ(0x0001F984u, map("UNICORN FaCE"));
+  EXPECT_EQ(0x0001F984u, map("UNICORNFaCE"));
+  EXPECT_EQ(0xFFFFFFFFu, map("UNICORN"));
+  EXPECT_EQ(0xC774u, map("HANGUL SYLLABLE i"));
+  EXPECT_EQ(0xC774u, map("hANGUL SYLLABLE i"));
+  EXPECT_EQ(0xC774u, map("HANGULSYLLABLEI"));
+  EXPECT_EQ(0xFFFFFFFFu, map("HANGUL SYLLABLE"));
+
+  // The hexadecimal suffix tolerates case and separator differences...
+  EXPECT_EQ(0x2FA1Du, map("cJK COMPATIBILITY IDEOGRAPH-2FA1D"));
+  EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH-2FA1d"));
+  EXPECT_EQ(0x2FA1Du, map("CJK COMPATIBILITY IDEOGRAPH 2FA1D"));
+
+  // ...but it must still spell a code point that carries this kind of name.
+  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-NOTANUMBER"));
+  EXPECT_EQ(0xFFFFFFFFu, map("CJK COMPATIBILITY IDEOGRAPH-1"));
+
+  // https://unicode.org/reports/tr44/#Matching_Names
+  // UAX44-LM2: Medial hyphens are ignored, non-medial hyphens are not
+  EXPECT_EQ(0x1FBC5u, map("S-T-I-C-K-F-I-G-U-R-E"));
+  EXPECT_EQ(0xFFFFFFFFu, map("-STICK FIGURE"));
+  EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE-"));
+  EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE -"));
+  EXPECT_EQ(0xFFFFFFFFu, map("STICK FIGURE --"));
+  EXPECT_EQ(0xFFFFFFFFu, map("STICK--FIGURE"));
+
+  // A hyphen that follows a space is not medial, so it is significant:
+  // "LETTER -A" names a different character than "LETTER A" / "LETTER-A".
+  EXPECT_EQ(0x0F68u, map("TIBETAN LETTER A"));
+  EXPECT_EQ(0x0F68u, map("TIBETAN LETTERA"));
+  EXPECT_EQ(0x0F68u, map("TIBETAN LETTER-A"));
+  EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A"));
+  // NOTE(review): as shown here this repeats the previous expectation
+  // verbatim; confirm whether the literal was meant to differ (for example in
+  // its whitespace).
+  EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A"));
+
+  // Special case: the medial hyphen in HANGUL JUNGSEONG O-E is significant,
+  // because ignoring it would collide with HANGUL JUNGSEONG OE.
+  EXPECT_EQ(0x1180u, map("HANGUL JUNGSEONG O-E"));
+  EXPECT_EQ(0x116Cu, map("HANGUL JUNGSEONG OE"));
+
+  // names that are prefix to existing characters should not match
+  EXPECT_FALSE(nameToCodepointLooseMatching("B"));
+  EXPECT_FALSE(nameToCodepointLooseMatching("BE"));
+  EXPECT_FALSE(nameToCodepointLooseMatching("BEE"));
+  EXPECT_FALSE(nameToCodepointLooseMatching("BEET"));
+  EXPECT_FALSE(nameToCodepointLooseMatching("BEETL"));
+  EXPECT_TRUE(nameToCodepointLooseMatching("BEETLE"));
+}
+
+} // namespace
+
+// Field-wise equality for MatchForCodepointName: two matches are equal when
+// their Name, Distance and Value all compare equal. The ElementsAre
+// expectations in this file compare expected and actual matches through it.
+bool operator==(MatchForCodepointName a, MatchForCodepointName b) {
+  if (!(a.Name == b.Name))
+    return false;
+  if (!(a.Distance == b.Distance))
+    return false;
+  return a.Value == b.Value;
+}
+
+namespace {
+
+// Checks nearestMatchesForCodepointName(): for a query string it returns the
+// requested number of closest character names (three here), each with its
+// edit distance and code point, in the order asserted below.
+TEST(Unicode, nearestMatchesForCodepointName) {
+  // Reduce a name to the form the distances are checked against: only its
+  // alphanumeric characters, upper-cased.
+  auto Normalize = [](StringRef Name) {
+    std::string Out;
+    Out.reserve(Name.size());
+    for (char C : Name) {
+      if (isAlnum(C))
+        Out.push_back(toUpper(C));
+    }
+    return Out;
+  };
+
+  // Fetch the three nearest matches and cross-check every reported distance
+  // against an edit distance computed here on the normalized names.
+  auto L = [&](StringRef Name) {
+    auto Matches = nearestMatchesForCodepointName(Name, 3);
+    // The query is the same for every candidate, so normalize it only once.
+    const std::string Query = Normalize(Name);
+    for (auto &Match : Matches) {
+      const std::string Candidate = Normalize(Match.Name);
+      EXPECT_EQ(StringRef(Candidate).edit_distance(Query, true),
+                Match.Distance);
+    }
+    return Matches;
+  };
+  using ::testing::ElementsAre;
+  using M = MatchForCodepointName;
+
+  // An empty query is closest to the shortest names; the distance is then the
+  // length of the name.
+  ASSERT_THAT(L(""), ElementsAre(M{"OX", 2, 0x1F402}, M{"ANT", 3, 0x1F41C},
+                                 M{"ARC", 3, 0x2312}));
+  // shortest name
+  ASSERT_THAT(L("OX"), ElementsAre(M{"OX", 0, 0x1F402}, M{"AXE", 2, 0x1FA93},
+                                   M{"BOY", 2, 0x1F466}));
+
+  // longest name
+  ASSERT_THAT(L("ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA ABOVE WITH ALEF "
+                "MAKSURA INITIAL FORM"),
+              ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                            "ABOVE WITH ALEF MAKSURA INITIAL FORM",
+                            0, 0xFBFB},
+                          M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                            "ABOVE WITH ALEF MAKSURA FINAL FORM",
+                            4, 0xFBFA},
+                          M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                            "ABOVE WITH ALEF MAKSURA ISOLATED FORM",
+                            7, 0xFBF9}));
+
+  // same result with underscore, spaces, etc
+  ASSERT_THAT(L("______ARABICLIGATUREUIGHUR KIRGHIZ YEH with HAMZA ABOVE WITH "
+                "ALEF MAKsURAINITIAL form_"),
+              ElementsAre(M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                            "ABOVE WITH ALEF MAKSURA INITIAL FORM",
+                            0, 0xFBFB},
+                          M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                            "ABOVE WITH ALEF MAKSURA FINAL FORM",
+                            4, 0xFBFA},
+                          M{"ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA "
+                            "ABOVE WITH ALEF MAKSURA ISOLATED FORM",
+                            7, 0xFBF9}));
+
+  // A common misspelling: the character is named LAMDA, not LAMBDA.
+  ASSERT_THAT(L("GREEK CAPITAL LETTER LAMBDA"),
+              ElementsAre(M{"GREEK CAPITAL LETTER LAMDA", 1, 0x39B},
+                          M{"GREEK CAPITAL LETTER GAMMA", 3, 0x0393},
+                          M{"GREEK CAPITAL LETTER ALPHA", 4, 0x0391}));
+
+  // The same query in lower case, without spaces and with a hyphen, gives the
+  // same matches and distances.
+  ASSERT_THAT(L("greekcapitalletter-lambda"),
+              ElementsAre(M{"GREEK CAPITAL LETTER LAMDA", 1, 0x39B},
+                          M{"GREEK CAPITAL LETTER GAMMA", 3, 0x0393},
+                          M{"GREEK CAPITAL LETTER ALPHA", 4, 0x0391}));
+
+  // typo http://www.unicode.org/notes/tn27/tn27-5.html
+  // Both the misspelled name and its correction are returned for the same
+  // code point.
+  ASSERT_THAT(
+      L("PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET"),
+      ElementsAre(
+          M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET", 0,
+            0xFE18}, // typo
+          M{"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET", 2,
+            0xFE18}, // correction
+          M{"PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET", 6,
+            0xFE17}));
+
+  // typo http://www.unicode.org/notes/tn27/tn27-5.html
+  ASSERT_THAT(
+      L("BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS"),
+      ElementsAre(
+          M{"BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", 0, 0x1D0C5},
+          M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS", 2, 0x1D0C5},
+          M{"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA SYNAFI", 7,
+            0x1D0C6}));
+}
+
} // namespace
} // namespace unicode
} // namespace sys