diff options
author | Iain Buclaw <ibuclaw@gdcproject.org> | 2023-03-05 01:47:19 +0100 |
---|---|---|
committer | Iain Buclaw <ibuclaw@gdcproject.org> | 2023-03-16 17:29:57 +0100 |
commit | 8da8c7d337123b28fdeb539a283d00732118712e (patch) | |
tree | 74096a23b9e2f64a7e25ec1e8d4d3b1d8934842e /libphobos/src/std/uni | |
parent | c5e2c3dd6afcf9b152df72b30e205b0180c0afd5 (diff) | |
download | gcc-8da8c7d337123b28fdeb539a283d00732118712e.zip gcc-8da8c7d337123b28fdeb539a283d00732118712e.tar.gz gcc-8da8c7d337123b28fdeb539a283d00732118712e.tar.bz2 |
d: Merge upstream dmd, druntime 4ca4140e58, phobos 454dff14d.
D front-end changes:
- Import dmd v2.103.0-beta.1.
- Using `alias this' for classes has been deprecated.
- The feature `-fpreview=dip25` is now enabled by default.
- The compile-time traits `isVirtualFunction' and
`getVirtualFunctions' have been deprecated.
D runtime changes:
- Import druntime v2.103.0-beta.1.
Phobos changes:
- Import phobos v2.103.0-beta.1.
- Unicode grapheme walking updated to conform to Unicode
version 15.
- Improved friendliness of error messages when instantiating
`std.algorithm.iteration.joiner' and
`std.algorithm.sorting.sort' with wrong inputs.
gcc/d/ChangeLog:
* dmd/MERGE: Merge upstream dmd 4ca4140e58.
* dmd/VERSION: Bump version to v2.103.0-beta.1.
* Make-lang.in (D_FRONTEND_OBJS): Add d/errorsink.o.
* d-ctfloat.cc (CTFloat::sprint): Update signature for new front-end
interface.
* d-frontend.cc (getTypeInfoType): Likewise.
* d-lang.cc (d_handle_option): Remove handling of -fpreview=dip25 and
-frevert=dip25.
(d_post_options): Remove enabling of sealed references language
feature when scoped pointers is enabled.
* d-tree.h (create_typeinfo): Update signature.
* decl.cc (DeclVisitor::finish_vtable): Update for new front-end
interface.
(DeclVisitor::visit (VarDeclaration *)): Likewise.
(DeclVisitor::visit (FuncDeclaration *)): Check skipCodegen to see if
front-end explicitly requested not to generate code.
* expr.cc (ExprVisitor::visit (NewExp *)): Update for new front-end
interface.
* lang.opt (fpreview=dip25): Remove.
(frevert=dip25): Remove.
* modules.cc (layout_moduleinfo_fields): Update for new front-end
interface.
(layout_moduleinfo): Likewise.
* runtime.def (NEWCLASS): Remove.
* toir.cc (IRVisitor::visit (IfStatement *)): Don't generate IR for if
statement list when condition is `__ctfe'.
* typeinfo.cc (create_typeinfo): Add generate parameter.
* types.cc (layout_aggregate_members): Update for new front-end
interface.
libphobos/ChangeLog:
* libdruntime/MERGE: Merge upstream druntime 4ca4140e58.
* libdruntime/Makefile.am (DRUNTIME_DSOURCES): Add core/factory.d.
* libdruntime/Makefile.in: Regenerate.
* src/MERGE: Merge upstream phobos 454dff14d.
* testsuite/libphobos.hash/test_hash.d: Update test.
* testsuite/libphobos.shared/finalize.d: Update test.
* libdruntime/core/factory.d: New file.
gcc/testsuite/ChangeLog:
* gdc.dg/torture/simd23084.d: New test.
* gdc.dg/torture/simd23085.d: New test.
* gdc.dg/torture/simd23218.d: New test.
Diffstat (limited to 'libphobos/src/std/uni')
-rw-r--r-- | libphobos/src/std/uni/package.d | 367 |
1 files changed, 259 insertions, 108 deletions
diff --git a/libphobos/src/std/uni/package.d b/libphobos/src/std/uni/package.d index 5c0659e..e2a0de7 100644 --- a/libphobos/src/std/uni/package.d +++ b/libphobos/src/std/uni/package.d @@ -712,6 +712,8 @@ import std.traits : isConvertibleToString, isIntegral, isSomeChar, isSomeString, Unqual, isDynamicArray; // debug = std_uni; +import std.internal.unicode_tables; // generated file + debug(std_uni) import std.stdio; // writefln, writeln private: @@ -6962,23 +6964,192 @@ private: enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally -// control - '\r' -enum controlSwitch = ` - case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':.. - case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085': -`; // TODO: redo the most of hangul stuff algorithmically in case of Graphemes too -// kill unrolled switches +// Use combined trie instead of checking for '\r' | '\n' | ccTrie, +// or extend | '\u200D' separately private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow { return ch >= '\U0001F1E6' && ch <= '\U0001F1FF'; } +// Our grapheme decoder is a state machine, this is list of all possible +// states before each code point. 
+private enum GraphemeState +{ + Start, + CR, + RI, + L, + V, + LVT, + Emoji, + EmojiZWJ, + Prepend, + End +} + +// Message values whether end of grapheme is reached +private enum TransformRes +{ + // No, unless the source range ends here + // (GB2 - break at end of text, unless text is empty) + goOn, + redo, // Run last character again with new state + retInclude, // Yes, after the just iterated character + retExclude // Yes, before the just iterated character +} + +// The logic of the grapheme decoding is all here +// GB# means Grapheme Breaking rule number # - see Unicode standard annex #29 +// Note, getting GB1 (break at start of text, unless text is empty) right +// relies on the user starting grapheme walking from beginning of the text, and +// not attempting to walk an empty text. +private enum TransformRes + function(ref GraphemeState, dchar) @safe pure nothrow @nogc [] graphemeTransforms = +[ + GraphemeState.Start: (ref state, ch) + { + // GB4. Break after controls. + if (graphemeControlTrie[ch] || ch == '\n') + return TransformRes.retInclude; + + with (GraphemeState) state = + ch == '\r' ? CR : + isRegionalIndicator(ch) ? RI : + isHangL(ch) ? L : + hangLV[ch] || isHangV(ch) ? V : + hangLVT[ch] || isHangT(ch) ? LVT : + prependTrie[ch] ? Prepend : + xpictoTrie[ch] ? Emoji : + End; + + // No matter what we encountered, we always include the + // first code point in the grapheme. + return TransformRes.goOn; + }, + + // GB3, GB4. Do not break between a CR and LF. + // Otherwise, break after controls. + GraphemeState.CR: (ref state, ch) => ch == '\n' ? + TransformRes.retInclude : + TransformRes.retExclude, + + // GB12 - GB13. Do not break within emoji flag sequences. + // That is, do not break between regional indicator (RI) symbols if + // there is an odd number of RI characters before the break point. + // This state applies if one and only one RI code point has been + // encountered. 
+ GraphemeState.RI: (ref state, ch) + { + state = GraphemeState.End; + + return isRegionalIndicator(ch) ? + TransformRes.goOn : + TransformRes.redo; + }, + + // GB6. Do not break Hangul syllable sequences. + GraphemeState.L: (ref state, ch) + { + if (isHangL(ch)) + return TransformRes.goOn; + else if (isHangV(ch) || hangLV[ch]) + { + state = GraphemeState.V; + return TransformRes.goOn; + } + else if (hangLVT[ch]) + { + state = GraphemeState.LVT; + return TransformRes.goOn; + } + + state = GraphemeState.End; + return TransformRes.redo; + }, + + // GB7. Do not break Hangul syllable sequences. + GraphemeState.V: (ref state, ch) + { + if (isHangV(ch)) + return TransformRes.goOn; + else if (isHangT(ch)) + { + state = GraphemeState.LVT; + return TransformRes.goOn; + } + + state = GraphemeState.End; + return TransformRes.redo; + }, + + // GB8. Do not break Hangul syllable sequences. + GraphemeState.LVT: (ref state, ch) + { + if (isHangT(ch)) + return TransformRes.goOn; + + state = GraphemeState.End; + return TransformRes.redo; + }, + + // GB11. Do not break within emoji modifier sequences or emoji + // zwj sequences. This state applies when the last code point was + // NOT a ZWJ. + GraphemeState.Emoji: (ref state, ch) + { + if (graphemeExtendTrie[ch]) + return TransformRes.goOn; + + static assert(!graphemeExtendTrie['\u200D']); + + if (ch == '\u200D') + { + state = GraphemeState.EmojiZWJ; + return TransformRes.goOn; + } + + state = GraphemeState.End; + // There might still be spacing marks + // at the end, which are not allowed in + // the middle of emoji sequences + return TransformRes.redo; + }, + + // GB11. Do not break within emoji modifier sequences or emoji + // zwj sequences. This state applies when the last code point was + // a ZWJ. + GraphemeState.EmojiZWJ: (ref state, ch) + { + state = GraphemeState.Emoji; + if (xpictoTrie[ch]) + return TransformRes.goOn; + return TransformRes.redo; + }, + + // GB9b. Do not break after Prepend characters. 
+ GraphemeState.Prepend: (ref state, ch) + { + // GB5. Break before controls. + if (graphemeControlTrie[ch] || ch == '\r' || ch == '\n') + return TransformRes.retExclude; + + state = GraphemeState.Start; + return TransformRes.redo; + }, + + // GB9, GB9a. Do not break before extending characters, ZWJ + // or SpacingMarks. + // GB999. Otherwise, break everywhere. + GraphemeState.End: (ref state, ch) + => !graphemeExtendTrie[ch] && !spacingMarkTrie[ch] && ch != '\u200D' ? + TransformRes.retExclude : + TransformRes.goOn +]; + template genericDecodeGrapheme(bool getValue) { - alias graphemeExtend = graphemeExtendTrie; - alias spacingMark = mcTrie; static if (getValue) alias Value = Grapheme; else @@ -6986,115 +7157,44 @@ template genericDecodeGrapheme(bool getValue) Value genericDecodeGrapheme(Input)(ref Input range) { - import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file - enum GraphemeState { - Start, - CR, - RI, - L, - V, - LVT - } static if (getValue) Grapheme grapheme; auto state = GraphemeState.Start; - enum eat = q{ - static if (getValue) - grapheme ~= ch; - range.popFront(); - }; - dchar ch; + assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof); + outer: while (!range.empty) { ch = range.front; - final switch (state) with(GraphemeState) + + rerun: + final switch (graphemeTransforms[state](state, ch)) + with(TransformRes) { - case Start: - mixin(eat); - if (ch == '\r') - state = CR; - else if (isRegionalIndicator(ch)) - state = RI; - else if (isHangL(ch)) - state = L; - else if (hangLV[ch] || isHangV(ch)) - state = V; - else if (hangLVT[ch]) - state = LVT; - else if (isHangT(ch)) - state = LVT; - else - { - switch (ch) - { - mixin(controlSwitch); - goto L_End; - default: - goto L_End_Extend; - } - } - break; - case CR: - if (ch == '\n') - mixin(eat); - goto L_End_Extend; - case RI: - if (isRegionalIndicator(ch)) - mixin(eat); - goto L_End_Extend; - case L: - if (isHangL(ch)) - mixin(eat); - else if 
(isHangV(ch) || hangLV[ch]) - { - state = V; - mixin(eat); - } - else if (hangLVT[ch]) - { - state = LVT; - mixin(eat); - } - else - goto L_End_Extend; - break; - case V: - if (isHangV(ch)) - mixin(eat); - else if (isHangT(ch)) - { - state = LVT; - mixin(eat); - } - else - goto L_End_Extend; - break; - case LVT: - if (isHangT(ch)) - { - mixin(eat); - } - else - goto L_End_Extend; - break; + case goOn: + static if (getValue) + grapheme ~= ch; + range.popFront(); + continue; + + case redo: + goto rerun; + + case retInclude: + static if (getValue) + grapheme ~= ch; + range.popFront(); + break outer; + + case retExclude: + break outer; } } - L_End_Extend: - while (!range.empty) - { - ch = range.front; - // extend & spacing marks - if (!graphemeExtend[ch] && !spacingMark[ch]) - break; - mixin(eat); - } - L_End: + static if (getValue) return grapheme; } - } public: // Public API continues @@ -7143,6 +7243,31 @@ if (is(C : dchar)) static assert(c2 == 3); // \u0301 has 2 UTF-8 code units } +// TODO: make this @nogc. Probably no big deal since the state machine is +// already GC-free. 
+@safe pure nothrow unittest +{ + // grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face + assert(graphemeStride("\U0001F600\U0001f3FE\U0001F600"d, 0) == 2); + // skier ~ female sign ~ '€' + assert(graphemeStride("\u26F7\u2640€"d, 0) == 1); + // skier ~ emoji modifier fitzpatrick type-5 ~ female sign ~ '€' + assert(graphemeStride("\u26F7\U0001f3FE\u2640€"d, 0) == 2); + // skier ~ zero-width joiner ~ female sign ~ '€' + assert(graphemeStride("\u26F7\u200D\u2640€"d, 0) == 3); + // skier ~ emoji modifier fitzpatrick type-5 ~ zero-width joiner + // ~ female sign ~ '€' + assert(graphemeStride("\u26F7\U0001f3FE\u200D\u2640€"d, 0) == 4); + // skier ~ zero-width joiner ~ '€' + assert(graphemeStride("\u26F7\u200D€"d, 0) == 2); + //'€' ~ zero-width joiner ~ skier + assert(graphemeStride("€\u200D\u26F7"d, 0) == 2); + // Kaithi number sign ~ Devanagari digit four ~ Devanagari digit two + assert(graphemeStride("\U000110BD\u096A\u0968"d, 0) == 2); + // Kaithi number sign ~ null + assert(graphemeStride("\U000110BD\0"d, 0) == 1); +} + /++ Reads one full grapheme cluster from an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`. @@ -7285,6 +7410,13 @@ private static @safe struct InputRangeString assert(nonForwardRange.walkLength == 4); } +// Issue 23474 +@safe pure unittest +{ + import std.range.primitives : walkLength; + assert(byGrapheme("\r\u0308").walkLength == 2); +} + /++ $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.) @@ -10530,8 +10662,6 @@ private: @safe pure nothrow @nogc @property { - import std.internal.unicode_tables; // generated file - // It's important to use auto return here, so that the compiler // only runs semantic on the return type if the function gets // used. 
Also these are functions rather than templates to not @@ -10578,10 +10708,10 @@ private: } //grapheme breaking algorithm tables - auto mcTrie() + auto spacingMarkTrie() { - import std.internal.unicode_grapheme : mcTrieEntries; - static immutable res = asTrie(mcTrieEntries); + import std.internal.unicode_grapheme : spacingMarkTrieEntries; + static immutable res = asTrie(spacingMarkTrieEntries); return res; } @@ -10606,6 +10736,27 @@ private: return res; } + auto prependTrie() + { + import std.internal.unicode_grapheme : prependTrieEntries; + static immutable res = asTrie(prependTrieEntries); + return res; + } + + auto graphemeControlTrie() + { + import std.internal.unicode_grapheme : controlTrieEntries; + static immutable res = asTrie(controlTrieEntries); + return res; + } + + auto xpictoTrie() + { + import std.internal.unicode_grapheme : Extended_PictographicTrieEntries; + static immutable res = asTrie(Extended_PictographicTrieEntries); + return res; + } + // tables below are used for composition/decomposition auto combiningClassTrie() { |