// Documentation unittests for std.uni.
// Each block below is a self-contained usage example; all of them import
// std.uni locally so they can be read (and compiled) in isolation.

// CodepointSet construction from interval pairs and from a range of intervals.
pure @safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;

    // arguments are [a, b) pairs: Latin a-z followed by Cyrillic а-я
    auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
    foreach (v; 'a'..'z'+1)
        assert(set[v]);
    // Cyrillic lowercase interval
    foreach (v; 'а'..'я'+1)
        assert(set[v]);
    // specific order is not required, intervals may intersect
    auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
    // the same end result
    assert(set2.byInterval.equal(set.byInterval));
    // test constructor this(Range)(Range intervals)
    auto chessPiecesWhite = CodepointInterval(9812, 9818);
    auto chessPiecesBlack = CodepointInterval(9818, 9824);
    auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
    foreach (v; '♔'..'♟'+1)
        assert(set3[v]);
}

// Membership test via opIndex.
pure @safe unittest
{
    import std.uni;

    auto gothic = unicode.Gothic;
    // Gothic letter ahsa
    assert(gothic['\U00010330']);
    // no ascii in Gothic obviously
    assert(!gothic['$']);
}

// Set algebra: intersection (&), difference (-), symmetric difference (~), union (|).
pure @safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;
    import std.range : iota;

    auto lower = unicode.LowerCase;
    auto upper = unicode.UpperCase;
    auto ascii = unicode.ASCII;

    assert((lower & upper).empty); // no intersection
    auto lowerASCII = lower & ascii;
    assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
    // throw away all of the lowercase ASCII
    assert((ascii - lower).length == 128 - 26);

    auto onlyOneOf = lower ~ ascii;
    assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
    assert(onlyOneOf['$']); // ASCII and not lowercase
    assert(!onlyOneOf['a']); // ASCII and lowercase
    assert(onlyOneOf['я']); // not ASCII but lowercase

    // throw away all cased letters from ASCII
    auto noLetters = ascii - (lower | upper);
    assert(noLetters.length == 128 - 26*2);
}

// Membership test via the `in` operator.
pure @safe unittest
{
    import std.uni;

    assert('я' in unicode.Cyrillic);
    assert(!('z' in unicode.Cyrillic));
}

// Iterating a set code point by code point.
pure @safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;
    import std.range : iota;

    auto set = unicode.ASCII;
    // the comparison result must be asserted, otherwise this example checks nothing
    assert(set.byCodepoint.equal(iota(0, 0x80)));
}

// String conversion and format specifiers for sets.
pure @safe unittest
{
    import std.uni;
    import std.conv : to;
    import std.format : format;
    import std.uni : unicode;

    // This was originally using Cyrillic script.
    // Unfortunately this is a pretty active range for changes,
    // and hence broke in an update.
    // Therefore the range Basic latin was used instead as it
    // is unlikely to ever change.

    assert(unicode.InBasic_latin.to!string == "[0..128)");

    // The specs '%s' and '%d' are equivalent to the to!string call above.
    assert(format("%d", unicode.InBasic_latin) == unicode.InBasic_latin.to!string);

    assert(format("%#x", unicode.InBasic_latin) == "[0..0x80)");
    assert(format("%#X", unicode.InBasic_latin) == "[0..0X80)");
}

// Building a set incrementally with add(); calls can be chained.
pure @safe unittest
{
    import std.uni;

    CodepointSet someSet;
    someSet.add('0', '5').add('A','Z'+1);
    someSet.add('5', '9'+1);
    assert(someSet['0']);
    assert(someSet['5']);
    assert(someSet['9']);
    assert(someSet['Z']);
}

// Set inversion.
pure @safe unittest
{
    import std.uni;

    auto set = unicode.ASCII;
    // union with the inverse gets all of the code points in the Unicode
    assert((set | set.inverted).length == 0x110000);
    // no intersection with the inverse
    assert((set & set.inverted).empty);
}

// A default-constructed set is empty.
pure @safe unittest
{
    import std.uni;

    CodepointSet emptySet;
    assert(emptySet.length == 0);
    assert(emptySet.empty);
}

// utfMatcher: match / skip / test semantics on a UTF-8 string.
pure @safe unittest
{
    import std.uni;

    string truth = "2² = 4";
    auto m = utfMatcher!char(unicode.Number);
    assert(m.match(truth)); // '2' is a number all right
    assert(truth == "² = 4"); // skips on match
    assert(m.match(truth)); // so is the superscript '2'
    assert(!m.match(truth)); // space is not a number
    assert(truth == " = 4"); // unaffected on no match
    assert(!m.skip(truth)); // same test ...
    assert(truth == "= 4"); // but skips a codepoint regardless
    assert(!m.test(truth)); // '=' is not a number
    assert(truth == "= 4"); // test never affects argument
}

// Looking up sets by name through `unicode`, at compile time and at run time.
@safe unittest
{
    import std.uni;
    import std.exception : collectException;

    auto ascii = unicode.ASCII;
    assert(ascii['A']);
    assert(ascii['~']);
    assert(!ascii['\u00e0']);
    // matching is case-insensitive
    assert(ascii == unicode.ascII);
    assert(!ascii['à']);
    // underscores, '-' and whitespace in names are ignored too
    auto latin = unicode.in_latin1_Supplement;
    assert(latin['à']);
    assert(!latin['$']);
    // BTW Latin 1 Supplement is a block, hence "In" prefix
    assert(latin == unicode("In Latin 1 Supplement"));
    // run-time look up throws if no such set is found
    assert(collectException(unicode("InCyrilliac")));
}

// Explicit block lookup.
@safe unittest
{
    import std.uni;

    // use .block for explicitness
    assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
}

// Script vs. block lookup for the same name.
@safe unittest
{
    import std.uni;

    auto arabicScript = unicode.script.arabic;
    auto arabicBlock = unicode.block.arabic;
    // there is an intersection between script and block
    // (U+0601 ARABIC SIGN SANAH is an invisible format character, hence the
    // explicit escape instead of a raw literal)
    assert(arabicBlock['\u0601']);
    assert(arabicScript['\u0601']);
    // but they are different
    assert(arabicBlock != arabicScript);
    assert(arabicBlock == unicode.inArabic);
    assert(arabicScript == unicode.arabic);
}

// Hangul syllable type lookup.
@safe unittest
{
    import std.uni;

    // L here is syllable type (leading jamo), not Letter as in unicode.L short-cut
    auto leadingJamo = unicode.hangulSyllableType("L");
    // check that some leading jamo are present
    foreach (jamo; '\u1110'..'\u115F')
        assert(leadingJamo[jamo]);
    assert(leadingJamo == unicode.hangulSyllableType.L);
}

// Parsing a set from a regex-like character class; the input is consumed by ref.
@safe unittest
{
    import std.uni;
    import std.uni : unicode;

    string pat = "[a-zA-Z0-9]hello";
    auto set = unicode.parseSet(pat);
    // check some of the codepoints
    assert(set['a'] && set['A'] && set['9']);
    assert(pat == "hello");
}

// graphemeStride: length in code units of the grapheme starting at an index.
@safe unittest
{
    import std.uni;

    // two spaces: index 1 must address a character inside the string
    assert(graphemeStride("\x20\x20", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); //\u030A has 2 UTF-8 code units
    assert(city[0 .. first] == "A\u030A");
    assert(city[first..$] == "rhus");
}

// popGrapheme on strings of every width and on a non-random-access range.
@safe pure unittest
{
    import std.uni;

    // Two Union Jacks (regional indicator pairs G+B) in each
    string s = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    wstring ws = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    dstring ds = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";

    // String pop length in code units, not points.
    assert(s.popGrapheme() == 8);
    assert(ws.popGrapheme() == 4);
    assert(ds.popGrapheme() == 2);

    assert(s == "\U0001F1EC\U0001F1E7");
    assert(ws == "\U0001F1EC\U0001F1E7");
    assert(ds == "\U0001F1EC\U0001F1E7");

    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;

    // Also works for non-random access ranges as long as the
    // character type is 32-bit.
    auto testPiece = "\r\nhello!"d.filter!(x => !x.isAlpha);
    // Windows-style line ending is two code points in a single grapheme.
    assert(testPiece.popGrapheme() == 2);
    assert(testPiece.equal("!"d));
}

// byGrapheme: iterate a string grapheme by grapheme.
@safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;

    auto text = "noe\u0308l"; // noël using e + combining diaeresis
    assert(text.walkLength == 5); // 5 code points

    auto gText = text.byGrapheme;
    assert(gText.walkLength == 4); // 4 graphemes

    assert(gText.take(3).equal("noe\u0308".byGrapheme));
    assert(gText.drop(3).equal("l".byGrapheme));
}

// byCodePoint: flatten a range of graphemes back into code points.
@safe unittest
{
    import std.uni;
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string s = "noe\u0308l"; // noël

    // reverse it and convert the result to a string
    string reverse = s.byGrapheme
        .array
        .retro
        .byCodePoint
        .text;

    assert(reverse == "le\u0308on"); // lëon
}

// Grapheme.valid after overwriting a code point.
@safe unittest
{
    import std.uni;

    auto g = Grapheme("A\u0302");
    assert(g[0] == 'A');
    assert(g.valid);
    g[1] = '~'; // ASCII tilde is not a combining mark
    assert(g[1] == '~');
    assert(!g.valid);
}

// Appending to a Grapheme.
@safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;

    auto g = Grapheme("A");
    assert(g.valid);
    g ~= '\u0301';
    assert(g[].equal("A\u0301"));
    assert(g.valid);
    g ~= "B";
    // not a valid grapheme cluster anymore
    assert(!g.valid);
    // still could be useful though
    assert(g[].equal("A\u0301B"));
}

// decodeGrapheme and general Grapheme manipulation.
@safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;
    import std.range : isRandomAccessRange;

    string bold = "ku\u0308hn";

    // note that decodeGrapheme takes parameter by ref
    auto first = decodeGrapheme(bold);

    assert(first.length == 1);
    assert(first[0] == 'k');

    // the next grapheme is 2 characters long
    auto wideOne = decodeGrapheme(bold);
    // slicing a grapheme yields a random-access range of dchar
    assert(wideOne[].equal("u\u0308"));
    assert(wideOne.length == 2);
    static assert(isRandomAccessRange!(typeof(wideOne[])));

    // all of the usual range manipulation is possible
    assert(wideOne[].filter!isMark().equal("\u0308"));

    auto g = Grapheme("A");
    assert(g.valid);
    g ~= '\u0301';
    assert(g[].equal("A\u0301"));
    assert(g.valid);
    g ~= "B";
    // not a valid grapheme cluster anymore
    assert(!g.valid);
    // still could be useful though
    assert(g[].equal("A\u0301B"));
}

// sicmp (simple case folding) vs. icmp (full case folding).
@safe @nogc pure nothrow unittest
{
    import std.uni;

    assert(sicmp("Август", "авгусТ") == 0);
    // Greek also works as long as there is no 1:M mapping in sight
    assert(sicmp("ΌΎ", "όύ") == 0);
    // things like the following won't get matched as equal
    // Greek small letter iota with dialytika and tonos
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);

    // while icmp has no problem with that
    assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
    assert(icmp("ΌΎ", "όύ") == 0);
}

// icmp handles 1:M case mappings.
@safe @nogc pure nothrow unittest
{
    import std.uni;

    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
}

// icmp also works on arbitrary ranges of dchar.
@safe @nogc nothrow pure unittest
{
    import std.uni;
    import std.utf : byDchar;

    assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0);
}

// combiningClass lookup.
@safe unittest
{
    import std.uni;

    // shorten the code
    alias CC = combiningClass;

    // combining tilde
    assert(CC('\u0303') == 230);
    // combining ring below
    assert(CC('\u0325') == 220);
    // the simple consequence is that "tilde" should be
    // placed after a "ring below" in a sequence
}

// compose: canonical composition of a starter and a following code point.
@safe unittest
{
    import std.uni;

    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);
}

// decompose: canonical and compatibility decomposition.
@safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;

    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);

    assert(decompose('Ĉ')[].equal("C\u0302"));
    assert(decompose('D')[].equal("D"));
    assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
    assert(decompose!Compatibility('¹')[].equal("1"));
}

// decomposeHangul: split a precomposed syllable into jamo.
@safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;

    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
}

// composeJamo: build a Hangul syllable from jamo.
@safe unittest
{
    import std.uni;

    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    // leaving out the trailing consonant (T), or passing any codepoint
    // that is not a trailing consonant, composes an LV-syllable
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}

// normalize: the four Unicode normalization forms.
@safe pure unittest
{
    import std.uni;

    // any encoding works
    wstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    assert(normalize!NFC("ϓ") == "\u03D3");
    assert(normalize!NFD("ϓ") == "\u03D2\u0301");
    assert(normalize!NFKC("ϓ") == "\u038E");
    assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
}

// allowedIn: quick check whether a code point may appear in a normalization form.
@safe unittest
{
    import std.uni;

    // e.g. Cyrillic is always allowed, so is ASCII
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
    assert(allowedIn!NFC('Z'));
}

// asUpperCase: lazy upper-casing range.
@safe pure unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;

    assert("hEllo".asUpperCase.equal("HELLO"));
}

// asCapitalized: lazy capitalizing range.
@safe pure unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;

    assert("hEllo".asCapitalized.equal("Hello"));
}

// toUpper applied per code point through map, collected into an appender.
@safe unittest
{
    import std.uni;
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    auto abuf = appender!(char[])();
    "hello".map!toUpper.copy(abuf);
    assert(abuf.data == "HELLO");
}