aboutsummaryrefslogtreecommitdiff
path: root/libphobos/src/std/uni.d
diff options
context:
space:
mode:
Diffstat (limited to 'libphobos/src/std/uni.d')
-rw-r--r--libphobos/src/std/uni.d9756
1 files changed, 9756 insertions, 0 deletions
diff --git a/libphobos/src/std/uni.d b/libphobos/src/std/uni.d
new file mode 100644
index 0000000..5f24ad1
--- /dev/null
+++ b/libphobos/src/std/uni.d
@@ -0,0 +1,9756 @@
+// Written in the D programming language.
+
+/++
+ $(P The $(D std.uni) module provides an implementation
+ of fundamental Unicode algorithms and data structures.
+ This doesn't include UTF encoding and decoding primitives,
+ see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf)
+ for this functionality. )
+
+$(SCRIPT inhibitQuickIndex = 1;)
+$(BOOKTABLE,
+$(TR $(TH Category) $(TH Functions))
+$(TR $(TD Decode) $(TD
+ $(LREF byCodePoint)
+ $(LREF byGrapheme)
+ $(LREF decodeGrapheme)
+ $(LREF graphemeStride)
+))
+$(TR $(TD Comparison) $(TD
+ $(LREF icmp)
+ $(LREF sicmp)
+))
+$(TR $(TD Classification) $(TD
+ $(LREF isAlpha)
+ $(LREF isAlphaNum)
+ $(LREF isCodepointSet)
+ $(LREF isControl)
+ $(LREF isFormat)
+ $(LREF isGraphical)
+ $(LREF isIntegralPair)
+ $(LREF isMark)
+ $(LREF isNonCharacter)
+ $(LREF isNumber)
+ $(LREF isPrivateUse)
+ $(LREF isPunctuation)
+ $(LREF isSpace)
+ $(LREF isSurrogate)
+ $(LREF isSurrogateHi)
+ $(LREF isSurrogateLo)
+ $(LREF isSymbol)
+ $(LREF isWhite)
+))
+$(TR $(TD Normalization) $(TD
+ $(LREF NFC)
+ $(LREF NFD)
+ $(LREF NFKD)
+ $(LREF NormalizationForm)
+ $(LREF normalize)
+))
+$(TR $(TD Decompose) $(TD
+ $(LREF decompose)
+ $(LREF decomposeHangul)
+ $(LREF UnicodeDecomposition)
+))
+$(TR $(TD Compose) $(TD
+ $(LREF compose)
+ $(LREF composeJamo)
+))
+$(TR $(TD Sets) $(TD
+ $(LREF CodepointInterval)
+ $(LREF CodepointSet)
+ $(LREF InversionList)
+ $(LREF unicode)
+))
+$(TR $(TD Trie) $(TD
+ $(LREF codepointSetTrie)
+ $(LREF CodepointSetTrie)
+ $(LREF codepointTrie)
+ $(LREF CodepointTrie)
+ $(LREF toTrie)
+ $(LREF toDelegate)
+))
+$(TR $(TD Casing) $(TD
+ $(LREF asCapitalized)
+ $(LREF asLowerCase)
+ $(LREF asUpperCase)
+ $(LREF isLower)
+ $(LREF isUpper)
+ $(LREF toLower)
+ $(LREF toLowerInPlace)
+ $(LREF toUpper)
+ $(LREF toUpperInPlace)
+))
+$(TR $(TD Utf8Matcher) $(TD
+ $(LREF isUtfMatcher)
+ $(LREF MatcherConcept)
+ $(LREF utfMatcher)
+))
+$(TR $(TD Separators) $(TD
+ $(LREF lineSep)
+ $(LREF nelSep)
+ $(LREF paraSep)
+))
+$(TR $(TD Building blocks) $(TD
+ $(LREF allowedIn)
+ $(LREF combiningClass)
+ $(LREF Grapheme)
+))
+)
+
+ $(P All primitives listed operate on Unicode characters and
+ sets of characters. For functions which operate on ASCII characters
+ and ignore Unicode $(CHARACTERS), see $(MREF std, ascii).
+ For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms
+ used throughout this module see the $(S_LINK Terminology, terminology) section
+ below.
+ )
+ $(P The focus of this module is the core needs of developing Unicode-aware
+ applications. To that effect it provides the following optimized primitives:
+ )
+ $(UL
+ $(LI Character classification by category and common properties:
+ $(LREF isAlpha), $(LREF isWhite) and others.
+ )
+ $(LI
+ Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)).
+ )
+ $(LI
+ Converting text to any of the four normalization forms via $(LREF normalize).
+ )
+ $(LI
+ Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride))
+ by user-perceived characters, that is by $(LREF Grapheme) clusters.
+ )
+ $(LI
+ Decomposing and composing of individual character(s) according to canonical
+ or compatibility rules, see $(LREF compose) and $(LREF decompose),
+ including the specific version for Hangul syllables $(LREF composeJamo)
+ and $(LREF decomposeHangul).
+ )
+ )
+ $(P It's recognized that an application may need further enhancements
+ and extensions, such as less commonly known algorithms,
+ or tailoring existing ones for region specific needs. To help users
+ with building any extra functionality beyond the core primitives,
+ the module provides:
+ )
+ $(UL
+ $(LI
+ $(LREF CodepointSet), a type for easy manipulation of sets of characters.
+ Besides the typical set algebra it provides an unusual feature:
+ a D source code generator for detection of $(CODEPOINTS) in this set.
+ This is a boon for meta-programming parser frameworks,
+ and is used internally to power classification in small
+ sets like $(LREF isWhite).
+ )
+ $(LI
+ A way to construct optimal packed multi-stage tables also known as a
+ special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie).
+ The functions $(LREF codepointTrie), $(LREF codepointSetTrie)
+ construct custom tries that map dchar to value.
+ The end result is a fast and predictable $(BIGOH 1) lookup that powers
+ functions like $(LREF isAlpha) and $(LREF combiningClass),
+ but for user-defined data sets.
+ )
+ $(LI
+ A useful technique for Unicode-aware parsers that perform
+ character classification of encoded $(CODEPOINTS)
+ is to avoid unnecassary decoding at all costs.
+ $(LREF utfMatcher) provides an improvement over the usual workflow
+ of decode-classify-process, combining the decoding and classification
+ steps. By extracting necessary bits directly from encoded
+ $(S_LINK Code unit, code units) matchers achieve
+ significant performance improvements. See $(LREF MatcherConcept) for
+ the common interface of UTF matchers.
+ )
+ $(LI
+ Generally useful building blocks for customized normalization:
+ $(LREF combiningClass) for querying combining class
+ and $(LREF allowedIn) for testing the Quick_Check
+ property of a given normalization form.
+ )
+ $(LI
+ Access to a large selection of commonly used sets of $(CODEPOINTS).
+ $(S_LINK Unicode properties, Supported sets) include Script,
+ Block and General Category. The exact contents of a set can be
+ observed in the CLDR utility, on the
+ $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page
+ of the Unicode website.
+ See $(LREF unicode) for easy and (optionally) compile-time checked set
+ queries.
+ )
+ )
+ $(SECTION Synopsis)
+ ---
+ import std.uni;
+ void main()
+ {
+ // initialize code point sets using script/block or property name
+ // now 'set' contains code points from both scripts.
+ auto set = unicode("Cyrillic") | unicode("Armenian");
+ // same thing but simpler and checked at compile-time
+ auto ascii = unicode.ASCII;
+ auto currency = unicode.Currency_Symbol;
+
+ // easy set ops
+ auto a = set & ascii;
+ assert(a.empty); // as it has no intersection with ascii
+ a = set | ascii;
+ auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
+
+ // some properties of code point sets
+ assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
+ // testing presence of a code point in a set
+ // is just fine, it is O(logN)
+ assert(!b['$']);
+ assert(!b['\u058F']); // Armenian dram sign
+ assert(b['¥']);
+
+ // building fast lookup tables, these guarantee O(1) complexity
+ // 1-level Trie lookup table essentially a huge bit-set ~262Kb
+ auto oneTrie = toTrie!1(b);
+ // 2-level far more compact but typically slightly slower
+ auto twoTrie = toTrie!2(b);
+ // 3-level even smaller, and a bit slower yet
+ auto threeTrie = toTrie!3(b);
+ assert(oneTrie['£']);
+ assert(twoTrie['£']);
+ assert(threeTrie['£']);
+
+ // build the trie with the most sensible trie level
+ // and bind it as a functor
+ auto cyrillicOrArmenian = toDelegate(set);
+ auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
+ assert(balance == "ընկեր!");
+ // compatible with bool delegate(dchar)
+ bool delegate(dchar) bindIt = cyrillicOrArmenian;
+
+ // Normalization
+ string s = "Plain ascii (and not only), is always normalized!";
+ assert(s is normalize(s));// is the same string
+
+ string nonS = "A\u0308ffin"; // A ligature
+ auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
+ assert(nS == "Äffin");
+ assert(nS != nonS);
+ string composed = "Äffin";
+
+ assert(normalize!NFD(composed) == "A\u0308ffin");
+ // to NFKD, compatibility decomposition useful for fuzzy matching/searching
+ assert(normalize!NFKD("2¹⁰") == "210");
+ }
+ ---
+ $(SECTION Terminology
+ )
+ $(P The following is a list of important Unicode notions
+ and definitions. Any conventions used specifically in this
+ module alone are marked as such. The descriptions are based on the formal
+ definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf,
+ chapter three of The Unicode Standard Core Specification.)
+ )
+ $(P $(DEF Abstract character) A unit of information used for the organization,
+ control, or representation of textual data.
+ Note that:
+ $(UL
+ $(LI When representing data, the nature of that data
+ is generally symbolic as opposed to some other
+ kind of data (for example, visual).
+ )
+ $(LI An abstract character has no concrete form
+ and should not be confused with a $(S_LINK Glyph, glyph).
+ )
+ $(LI An abstract character does not necessarily
+ correspond to what a user thinks of as a “character”
+ and should not be confused with a $(LREF Grapheme).
+ )
+ $(LI The abstract characters encoded (see Encoded character)
+ are known as Unicode abstract characters.
+ )
+ $(LI Abstract characters not directly
+ encoded by the Unicode Standard can often be
+ represented by the use of combining character sequences.
+ )
+ )
+ )
+ $(P $(DEF Canonical decomposition)
+ The decomposition of a character or character sequence
+ that results from recursively applying the canonical
+ mappings found in the Unicode Character Database
+ and these described in Conjoining Jamo Behavior
+ (section 12 of
+ $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)).
+ )
+ $(P $(DEF Canonical composition)
+ The precise definition of the Canonical composition
+ is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf,
+ Unicode Conformance) section 11.
+ Informally it's the process that does the reverse of the canonical
+ decomposition with the addition of certain rules
+ that e.g. prevent legacy characters from appearing in the composed result.
+ )
+ $(P $(DEF Canonical equivalent)
+ Two character sequences are said to be canonical equivalents if
+ their full canonical decompositions are identical.
+ )
+ $(P $(DEF Character) Typically differs by context.
+ For the purpose of this documentation the term $(I character)
+ implies $(I encoded character), that is, a code point having
+ an assigned abstract character (a symbolic meaning).
+ )
+ $(P $(DEF Code point) Any value in the Unicode codespace;
+ that is, the range of integers from 0 to 10FFFF (hex).
+ Not all code points are assigned to encoded characters.
+ )
+ $(P $(DEF Code unit) The minimal bit combination that can represent
+ a unit of encoded text for processing or interchange.
+ Depending on the encoding this could be:
+ 8-bit code units in the UTF-8 ($(D char)),
+ 16-bit code units in the UTF-16 ($(D wchar)),
+ and 32-bit code units in the UTF-32 ($(D dchar)).
+ $(I Note that in UTF-32, a code unit is a code point
+ and is represented by the D $(D dchar) type.)
+ )
+ $(P $(DEF Combining character) A character with the General Category
+ of Combining Mark(M).
+ $(UL
+ $(LI All characters with non-zero canonical combining class
+ are combining characters, but the reverse is not the case:
+ there are combining characters with a zero combining class.
+ )
+ $(LI These characters are not normally used in isolation
+ unless they are being described. They include such characters
+ as accents, diacritics, Hebrew points, Arabic vowel signs,
+ and Indic matras.
+ )
+ )
+ )
+ $(P $(DEF Combining class)
+ A numerical value used by the Unicode Canonical Ordering Algorithm
+ to determine which sequences of combining marks are to be
+ considered canonically equivalent and which are not.
+ )
+ $(P $(DEF Compatibility decomposition)
+ The decomposition of a character or character sequence that results
+ from recursively applying both the compatibility mappings and
+ the canonical mappings found in the Unicode Character Database, and those
+ described in Conjoining Jamo Behavior no characters
+ can be further decomposed.
+ )
+ $(P $(DEF Compatibility equivalent)
+ Two character sequences are said to be compatibility
+ equivalents if their full compatibility decompositions are identical.
+ )
+ $(P $(DEF Encoded character) An association (or mapping)
+ between an abstract character and a code point.
+ )
+ $(P $(DEF Glyph) The actual, concrete image of a glyph representation
+ having been rasterized or otherwise imaged onto some display surface.
+ )
+ $(P $(DEF Grapheme base) A character with the property
+ Grapheme_Base, or any standard Korean syllable block.
+ )
+ $(P $(DEF Grapheme cluster) Defined as the text between
+ grapheme boundaries as specified by Unicode Standard Annex #29,
+ $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation).
+ Important general properties of a grapheme:
+ $(UL
+ $(LI The grapheme cluster represents a horizontally segmentable
+ unit of text, consisting of some grapheme base (which may
+ consist of a Korean syllable) together with any number of
+ nonspacing marks applied to it.
+ )
+ $(LI A grapheme cluster typically starts with a grapheme base
+ and then extends across any subsequent sequence of nonspacing marks.
+ A grapheme cluster is most directly relevant to text rendering and
+ processes such as cursor placement and text selection in editing,
+ but may also be relevant to comparison and searching.
+ )
+ $(LI For many processes, a grapheme cluster behaves as if it was a
+ single character with the same properties as its grapheme base.
+ Effectively, nonspacing marks apply $(I graphically) to the base,
+ but do not change its properties.
+ )
+ )
+ $(P This module defines a number of primitives that work with graphemes:
+ $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride).
+ All of them are using $(I extended grapheme) boundaries
+ as defined in the aforementioned standard annex.
+ )
+ )
+ $(P $(DEF Nonspacing mark) A combining character with the
+ General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me).
+ )
+ $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark.
+ )
+ $(SECTION Normalization
+ )
+ $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent)
+ or $(S_LINK Compatibility equivalent, compatibility equivalent)
+ characters in the Unicode Standard make it necessary to have a full, formal
+ definition of equivalence for Unicode strings.
+ String equivalence is determined by a process called normalization,
+ whereby strings are converted into forms which are compared
+ directly for identity. This is the primary goal of the normalization process,
+ see the function $(LREF normalize) to convert into any of
+ the four defined forms.
+ )
+ $(P A very important attribute of the Unicode Normalization Forms
+ is that they must remain stable between versions of the Unicode Standard.
+ A Unicode string normalized to a particular Unicode Normalization Form
+ in one version of the standard is guaranteed to remain in that Normalization
+ Form for implementations of future versions of the standard.
+ )
+ $(P The Unicode Standard specifies four normalization forms.
+ Informally, two of these forms are defined by maximal decomposition
+ of equivalent sequences, and two of these forms are defined
+ by maximal $(I composition) of equivalent sequences.
+ $(UL
+ $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition,
+ canonical decomposition) of a character sequence.)
+ $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition,
+ compatibility decomposition) of a character sequence.)
+ $(LI Normalization Form C (NFC): The canonical composition of the
+ $(S_LINK Canonical decomposition, canonical decomposition)
+ of a coded character sequence.)
+ $(LI Normalization Form KC (NFKC): The canonical composition
+ of the $(S_LINK Compatibility decomposition,
+ compatibility decomposition) of a character sequence)
+ )
+ )
+ $(P The choice of the normalization form depends on the particular use case.
+ NFC is the best form for general text, since it's more compatible with
+ strings converted from legacy encodings. NFKC is the preferred form for
+ identifiers, especially where there are security concerns. NFD and NFKD
+ are the most useful for internal processing.
+ )
+ $(SECTION Construction of lookup tables
+ )
+ $(P The Unicode standard describes a set of algorithms that
+ depend on having the ability to quickly look up various properties
+ of a code point. Given the the codespace of about 1 million $(CODEPOINTS),
+ it is not a trivial task to provide a space-efficient solution for
+ the multitude of properties.
+ )
+ $(P Common approaches such as hash-tables or binary search over
+ sorted code point intervals (as in $(LREF InversionList)) are insufficient.
+ Hash-tables have enormous memory footprint and binary search
+ over intervals is not fast enough for some heavy-duty algorithms.
+ )
+ $(P The recommended solution (see Unicode Implementation Guidelines)
+ is using multi-stage tables that are an implementation of the
+ $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer
+ keys and a fixed number of stages. For the remainder of the section
+ this will be called a fixed trie. The following describes a particular
+ implementation that is aimed for the speed of access at the expense
+ of ideal size savings.
+ )
+ $(P Taking a 2-level Trie as an example the principle of operation is as follows.
+ Split the number of bits in a key (code point, 21 bits) into 2 components
+ (e.g. 15 and 8). The first is the number of bits in the index of the trie
+ and the other is number of bits in each page of the trie.
+ The layout of the trie is then an array of size 2^^bits-of-index followed
+ an array of memory chunks of size 2^^bits-of-page/bits-per-element.
+ )
+ $(P The number of pages is variable (but not less then 1)
+ unlike the number of entries in the index. The slots of the index
+ all have to contain a number of a page that is present. The lookup is then
+ just a couple of operations - slice the upper bits,
+ lookup an index for these, take a page at this index and use
+ the lower bits as an offset within this page.
+
+ Assuming that pages are laid out consequently
+ in one array at $(D pages), the pseudo-code is:
+ )
+ ---
+ auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits;
+ pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)];
+ ---
+ $(P Where if $(D elemsPerPage) is a power of 2 the whole process is
+ a handful of simple instructions and 2 array reads. Subsequent levels
+ of the trie are introduced by recursing on this notion - the index array
+ is treated as values. The number of bits in index is then again
+ split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
+ )
+
+ $(P For completeness a level 1 trie is simply an array.
+ The current implementation takes advantage of bit-packing values
+ when the range is known to be limited in advance (such as $(D bool)).
+ See also $(LREF BitPacked) for enforcing it manually.
+ The major size advantage however comes from the fact
+ that multiple $(B identical pages on every level are merged) by construction.
+ )
+ $(P The process of constructing a trie is more involved and is hidden from
+ the user in a form of the convenience functions $(LREF codepointTrie),
+ $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie).
+ In general a set or built-in AA with $(D dchar) type
+ can be turned into a trie. The trie object in this module
+ is read-only (immutable); it's effectively frozen after construction.
+ )
+ $(SECTION Unicode properties
+ )
+ $(P This is a full list of Unicode properties accessible through $(LREF unicode)
+ with specific helpers per category nested within. Consult the
+ $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility)
+ when in doubt about the contents of a particular set.
+ )
+ $(P General category sets listed below are only accessible with the
+ $(LREF unicode) shorthand accessor.)
+ $(BOOKTABLE $(B General category ),
+ $(TR $(TH Abb.) $(TH Long form)
+ $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form))
+ $(TR $(TD L) $(TD Letter)
+ $(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation))
+ $(TR $(TD Ll) $(TD Lowercase_Letter)
+ $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation))
+ $(TR $(TD Lm) $(TD Modifier_Letter)
+ $(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol))
+ $(TR $(TD Lo) $(TD Other_Letter)
+ $(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol))
+ $(TR $(TD Lt) $(TD Titlecase_Letter)
+ $(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol))
+ $(TR $(TD Lu) $(TD Uppercase_Letter)
+ $(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol))
+ $(TR $(TD M) $(TD Mark)
+ $(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol))
+ $(TR $(TD Mc) $(TD Spacing_Mark)
+ $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator))
+ $(TR $(TD Me) $(TD Enclosing_Mark)
+ $(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator))
+ $(TR $(TD Mn) $(TD Nonspacing_Mark)
+ $(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator))
+ $(TR $(TD C) $(TD Other)
+ $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator))
+ $(TR $(TD Cc) $(TD Control) $(TD Pf)
+ $(TD Final_Punctuation) $(TD -) $(TD Any))
+ $(TR $(TD Cf) $(TD Format)
+ $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII))
+ )
+ $(P Sets for other commonly useful properties that are
+ accessible with $(LREF unicode):)
+ $(BOOKTABLE $(B Common binary properties),
+ $(TR $(TH Name) $(TH Name) $(TH Name))
+ $(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase))
+ $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax))
+ $(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space))
+ $(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark))
+ $(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical))
+ $(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted))
+ $(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD STerm))
+ $(TR $(TD Deprecated) $(TD Math) $(TD Terminal_Punctuation))
+ $(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph))
+ $(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase))
+ $(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector))
+ $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space))
+ $(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue))
+ $(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start))
+ $(TR $(TD Hyphen) $(TD Other_Lowercase) )
+ $(TR $(TD ID_Continue) $(TD Other_Math) )
+ )
+ $(P Below is the table with block names accepted by $(LREF unicode.block).
+ Note that the shorthand version $(LREF unicode) requires "In"
+ to be prepended to the names of blocks so as to disambiguate
+ scripts and blocks.
+ )
+ $(BOOKTABLE $(B Blocks),
+ $(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian))
+ $(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols))
+ $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar))
+ $(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A))
+ $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue))
+ $(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo))
+ $(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms))
+ $(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham))
+ $(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki))
+ $(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic))
+ $(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian))
+ $(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian))
+ $(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic))
+ $(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition))
+ $(TR $(TD Avestan) $(TD Hangul Compatibility Jamo) $(TD Oriya))
+ $(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya))
+ $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa))
+ $(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc))
+ $(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician))
+ $(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions))
+ $(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement))
+ $(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards))
+ $(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area))
+ $(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang))
+ $(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols))
+ $(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic))
+ $(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan))
+ $(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra))
+ $(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada))
+ $(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian))
+ $(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala))
+ $(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants))
+ $(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng))
+ $(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters))
+ $(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials))
+ $(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese))
+ $(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement))
+ $(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts))
+ $(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A))
+ $(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B))
+ $(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators))
+ $(TR $(TD CJK Unified Ideographs) $(TD Lao) $(TD Supplemental Punctuation))
+ $(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A))
+ $(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B))
+ $(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri))
+ $(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac))
+ $(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog))
+ $(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) $(TD Tagbanwa))
+ $(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags))
+ $(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le))
+ $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham))
+ $(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet))
+ $(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols))
+ $(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri))
+ $(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil))
+ $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu))
+ $(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana))
+ $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai))
+ $(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan))
+ $(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh))
+ $(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols))
+ $(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic))
+ $(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics))
+ $(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended))
+ $(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai))
+ $(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors))
+ $(TR $(TD Domino Tiles) $(TD Miao) $(TD Variation Selectors Supplement))
+ $(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions))
+ $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms))
+ $(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols))
+ $(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals))
+ $(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables))
+ $(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) )
+ $(TR $(TD Ethiopic) $(TD Modifier Tone Letters) )
+ )
+ $(P Below is the table with script names accepted by $(LREF unicode.script)
+ and by the shorthand version $(LREF unicode):)
+ $(BOOKTABLE $(B Scripts),
+ $(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic))
+ $(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian))
+ $(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian))
+ $(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic))
+ $(TR $(TD Bamum) $(TD Inherited) $(TD Oriya))
+ $(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya))
+ $(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa))
+ $(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician))
+ $(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang))
+ $(TR $(TD Braille) $(TD Kannada) $(TD Runic))
+ $(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan))
+ $(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra))
+ $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada))
+ $(TR $(TD Carian) $(TD Khmer) $(TD Shavian))
+ $(TR $(TD Chakma) $(TD Lao) $(TD Sinhala))
+ $(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng))
+ $(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese))
+ $(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri))
+ $(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac))
+ $(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog))
+ $(TR $(TD Cypriot) $(TD Lycian) $(TD Tagbanwa))
+ $(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le))
+ $(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham))
+ $(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet))
+ $(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri))
+ $(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil))
+ $(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu))
+ $(TR $(TD Glagolitic) $(TD Miao) $(TD Thaana))
+ $(TR $(TD Gothic) $(TD Mongolian) $(TD Thai))
+ $(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan))
+ $(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh))
+ $(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic))
+ $(TR $(TD Han) $(TD Ogham) $(TD Vai))
+ $(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi))
+ )
+ $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).)
+ $(BOOKTABLE $(B Hangul syllable type),
+ $(TR $(TH Abb.) $(TH Long form))
+ $(TR $(TD L) $(TD Leading_Jamo))
+ $(TR $(TD LV) $(TD LV_Syllable))
+ $(TR $(TD LVT) $(TD LVT_Syllable) )
+ $(TR $(TD T) $(TD Trailing_Jamo))
+ $(TR $(TD V) $(TD Vowel_Jamo))
+ )
+ References:
+ $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table),
+ $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia),
+ $(HTTP www.unicode.org, The Unicode Consortium),
+ $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms),
+ $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation)
+ $(HTTP www.unicode.org/uni2book/ch05.pdf,
+ Unicode Implementation Guidelines)
+ $(HTTP www.unicode.org/uni2book/ch03.pdf,
+ Unicode Conformance)
+ Trademarks:
+ Unicode(tm) is a trademark of Unicode, Inc.
+
+ Copyright: Copyright 2013 -
+ License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
+ Authors: Dmitry Olshansky
+ Source: $(PHOBOSSRC std/_uni.d)
+ Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)
+
+Macros:
+
+SECTION = <h3><a id="$1">$0</a></h3>
+DEF = <div><a id="$1"><i>$0</i></a></div>
+S_LINK = <a href="#$1">$+</a>
+CODEPOINT = $(S_LINK Code point, code point)
+CODEPOINTS = $(S_LINK Code point, code points)
+CHARACTER = $(S_LINK Character, character)
+CHARACTERS = $(S_LINK Character, characters)
+CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
++/
+module std.uni;
+
+import std.meta; // AliasSeq
+import std.range.primitives; // back, ElementEncodingType, ElementType, empty,
+ // front, isForwardRange, isInputRange, isRandomAccessRange, popFront, put,
+ // save
+import std.traits; // isConvertibleToString, isIntegral, isSomeChar,
+ // isSomeString, Unqual
+import std.exception;// : enforce;
+import core.memory; //: pureMalloc, pureRealloc, pureFree;
+import core.exception; // : onOutOfMemoryError;
+static import std.ascii;
+// debug = std_uni;
+
+debug(std_uni) import std.stdio; // writefln, writeln
+
+private:
+
+version (unittest)
+{
+private:
+ struct TestAliasedString
+ {
+ string get() @safe @nogc pure nothrow { return _s; }
+ alias get this;
+ @disable this(this);
+ string _s;
+ }
+
+ bool testAliasedString(alias func, Args...)(string s, Args args)
+ {
+ import std.algorithm.comparison : equal;
+ auto a = func(TestAliasedString(s), args);
+ auto b = func(s, args);
+ static if (is(typeof(equal(a, b))))
+ {
+ // For ranges, compare contents instead of object identity.
+ return equal(a, b);
+ }
+ else
+ {
+ return a == b;
+ }
+ }
+}
+
+void copyBackwards(T,U)(T[] src, U[] dest)
+{
+ assert(src.length == dest.length);
+ for (size_t i=src.length; i-- > 0; )
+ dest[i] = src[i];
+}
+
+void copyForward(T,U)(T[] src, U[] dest)
+{
+ assert(src.length == dest.length);
+ for (size_t i=0; i<src.length; i++)
+ dest[i] = src[i];
+}
+
+// TODO: update to reflect all major CPUs supporting unaligned reads
+version (X86)
+ enum hasUnalignedReads = true;
+else version (X86_64)
+ enum hasUnalignedReads = true;
+else
+ enum hasUnalignedReads = false; // better be safe then sorry
+
+public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator.
+public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator.
+public enum dchar nelSep = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line.
+
+// test the intro example
+@safe unittest
+{
+ import std.algorithm.searching : find;
+ // initialize code point sets using script/block or property name
+ // set contains code points from both scripts.
+ auto set = unicode("Cyrillic") | unicode("Armenian");
+ // or simpler and statically-checked look
+ auto ascii = unicode.ASCII;
+ auto currency = unicode.Currency_Symbol;
+
+ // easy set ops
+ auto a = set & ascii;
+ assert(a.empty); // as it has no intersection with ascii
+ a = set | ascii;
+ auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
+
+ // some properties of code point sets
+ assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
+ // testing presence of a code point in a set
+ // is just fine, it is O(logN)
+ assert(!b['$']);
+ assert(!b['\u058F']); // Armenian dram sign
+ assert(b['¥']);
+
+ // building fast lookup tables, these guarantee O(1) complexity
+ // 1-level Trie lookup table essentially a huge bit-set ~262Kb
+ auto oneTrie = toTrie!1(b);
+ // 2-level far more compact but typically slightly slower
+ auto twoTrie = toTrie!2(b);
+ // 3-level even smaller, and a bit slower yet
+ auto threeTrie = toTrie!3(b);
+ assert(oneTrie['£']);
+ assert(twoTrie['£']);
+ assert(threeTrie['£']);
+
+ // build the trie with the most sensible trie level
+ // and bind it as a functor
+ auto cyrillicOrArmenian = toDelegate(set);
+ auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
+ assert(balance == "ընկեր!");
+ // compatible with bool delegate(dchar)
+ bool delegate(dchar) bindIt = cyrillicOrArmenian;
+
+ // Normalization
+ string s = "Plain ascii (and not only), is always normalized!";
+ assert(s is normalize(s));// is the same string
+
+ string nonS = "A\u0308ffin"; // A ligature
+ auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
+ assert(nS == "Äffin");
+ assert(nS != nonS);
+ string composed = "Äffin";
+
+ assert(normalize!NFD(composed) == "A\u0308ffin");
+ // to NFKD, compatibility decomposition useful for fuzzy matching/searching
+ assert(normalize!NFKD("2¹⁰") == "210");
+}
+
+enum lastDchar = 0x10FFFF;
+
+auto force(T, F)(F from)
+if (isIntegral!T && !is(T == F))
+{
+ assert(from <= T.max && from >= T.min);
+ return cast(T) from;
+}
+
+auto force(T, F)(F from)
+if (isBitPacked!T && !is(T == F))
+{
+ assert(from <= 2^^bitSizeOf!T-1);
+ return T(cast(TypeOfBitPacked!T) from);
+}
+
+auto force(T, F)(F from)
+if (is(T == F))
+{
+ return from;
+}
+
+// repeat X times the bit-pattern in val assuming it's length is 'bits'
+size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
+{
+ static if (times == 1)
+ return val;
+ else static if (bits == 1)
+ {
+ static if (times == size_t.sizeof*8)
+ return val ? size_t.max : 0;
+ else
+ return val ? (1 << times)-1 : 0;
+ }
+ else static if (times % 2)
+ return (replicateBits!(times-1, bits)(val)<<bits) | val;
+ else
+ return replicateBits!(times/2, bits*2)((val << bits) | val);
+}
+
+@safe pure nothrow @nogc unittest // for replicate
+{
+ import std.algorithm.iteration : sum, map;
+ import std.range : iota;
+ size_t m = 0b111;
+ size_t m2 = 0b01;
+ foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
+ {
+ assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
+ assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
+ }
+}
+
+// multiple arrays squashed into one memory block
+struct MultiArray(Types...)
+{
+ import std.range.primitives : isOutputRange;
+ this(size_t[] sizes...) @safe pure nothrow
+ {
+ assert(dim == sizes.length);
+ size_t full_size;
+ foreach (i, v; Types)
+ {
+ full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
+ sz[i] = sizes[i];
+ static if (i >= 1)
+ offsets[i] = offsets[i-1] +
+ spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
+ }
+
+ storage = new size_t[full_size];
+ }
+
+ this(const(size_t)[] raw_offsets,
+ const(size_t)[] raw_sizes, const(size_t)[] data)const @safe pure nothrow @nogc
+ {
+ offsets[] = raw_offsets[];
+ sz[] = raw_sizes[];
+ storage = data;
+ }
+
+ @property auto slice(size_t n)()inout pure nothrow @nogc
+ {
+ auto ptr = raw_ptr!n;
+ return packedArrayView!(Types[n])(ptr, sz[n]);
+ }
+
+ @property auto ptr(size_t n)()inout pure nothrow @nogc
+ {
+ auto ptr = raw_ptr!n;
+ return inout(PackedPtr!(Types[n]))(ptr);
+ }
+
+ template length(size_t n)
+ {
+ @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }
+
+ @property void length(size_t new_size)
+ {
+ if (new_size > sz[n])
+ {// extend
+ size_t delta = (new_size - sz[n]);
+ sz[n] += delta;
+ delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
+ storage.length += delta;// extend space at end
+ // raw_slice!x must follow resize as it could be moved!
+ // next stmts move all data past this array, last-one-goes-first
+ static if (n != dim-1)
+ {
+ auto start = raw_ptr!(n+1);
+ // len includes delta
+ size_t len = (storage.ptr+storage.length-start);
+
+ copyBackwards(start[0 .. len-delta], start[delta .. len]);
+
+ start[0 .. delta] = 0;
+ // offsets are used for raw_slice, ptr etc.
+ foreach (i; n+1 .. dim)
+ offsets[i] += delta;
+ }
+ }
+ else if (new_size < sz[n])
+ {// shrink
+ size_t delta = (sz[n] - new_size);
+ sz[n] -= delta;
+ delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
+ // move all data past this array, forward direction
+ static if (n != dim-1)
+ {
+ auto start = raw_ptr!(n+1);
+ size_t len = (storage.ptr+storage.length-start);
+ copyForward(start[0 .. len-delta], start[delta .. len]);
+
+ // adjust offsets last, they affect raw_slice
+ foreach (i; n+1 .. dim)
+ offsets[i] -= delta;
+ }
+ storage.length -= delta;
+ }
+ // else - NOP
+ }
+ }
+
+ @property size_t bytes(size_t n=size_t.max)() const @safe
+ {
+ static if (n == size_t.max)
+ return storage.length*size_t.sizeof;
+ else static if (n != Types.length-1)
+ return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof;
+ else
+ return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof;
+ }
+
+ void store(OutRange)(scope OutRange sink) const
+ if (isOutputRange!(OutRange, char))
+ {
+ import std.format : formattedWrite;
+ formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
+ formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
+ formattedWrite(sink, ", [%( 0x%x, %)]", storage);
+ }
+
+private:
+ import std.meta : staticMap;
+ @property auto raw_ptr(size_t n)()inout pure nothrow @nogc
+ {
+ static if (n == 0)
+ return storage.ptr;
+ else
+ {
+ return storage.ptr+offsets[n];
+ }
+ }
+ enum dim = Types.length;
+ size_t[dim] offsets;// offset for level x
+ size_t[dim] sz;// size of level x
+ alias bitWidth = staticMap!(bitSizeOf, Types);
+ size_t[] storage;
+}
+
+@system unittest
+{
+ import std.conv : text;
+ enum dg = (){
+ // sizes are:
+ // lvl0: 3, lvl1 : 2, lvl2: 1
+ auto m = MultiArray!(int, ubyte, int)(3,2,1);
+
+ static void check(size_t k, T)(ref T m, int n)
+ {
+ foreach (i; 0 .. n)
+ assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n]));
+ }
+
+ static void checkB(size_t k, T)(ref T m, int n)
+ {
+ foreach (i; 0 .. n)
+ assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n]));
+ }
+
+ static void fill(size_t k, T)(ref T m, int n)
+ {
+ foreach (i; 0 .. n)
+ m.slice!(k)[i] = force!ubyte(i+1);
+ }
+
+ static void fillB(size_t k, T)(ref T m, int n)
+ {
+ foreach (i; 0 .. n)
+ m.slice!(k)[i] = force!ubyte(n-i);
+ }
+
+ m.length!1 = 100;
+ fill!1(m, 100);
+ check!1(m, 100);
+
+ m.length!0 = 220;
+ fill!0(m, 220);
+ check!1(m, 100);
+ check!0(m, 220);
+
+ m.length!2 = 17;
+ fillB!2(m, 17);
+ checkB!2(m, 17);
+ check!0(m, 220);
+ check!1(m, 100);
+
+ m.length!2 = 33;
+ checkB!2(m, 17);
+ fillB!2(m, 33);
+ checkB!2(m, 33);
+ check!0(m, 220);
+ check!1(m, 100);
+
+ m.length!1 = 195;
+ fillB!1(m, 195);
+ checkB!1(m, 195);
+ checkB!2(m, 33);
+ check!0(m, 220);
+
+ auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
+ marr.length!0 = 15;
+ marr.length!1 = 30;
+ fill!1(marr, 30);
+ fill!0(marr, 15);
+ check!1(marr, 30);
+ check!0(marr, 15);
+ return 0;
+ };
+ enum ct = dg();
+ auto rt = dg();
+}
+
+@system unittest
+{// more bitpacking tests
+ import std.conv : text;
+
+ alias Bitty =
+ MultiArray!(BitPacked!(size_t, 3)
+ , BitPacked!(size_t, 4)
+ , BitPacked!(size_t, 3)
+ , BitPacked!(size_t, 6)
+ , bool);
+ alias fn1 = sliceBits!(13, 16);
+ alias fn2 = sliceBits!( 9, 13);
+ alias fn3 = sliceBits!( 6, 9);
+ alias fn4 = sliceBits!( 0, 6);
+ static void check(size_t lvl, MA)(ref MA arr){
+ for (size_t i = 0; i< arr.length!lvl; i++)
+ assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
+ }
+
+ static void fillIdx(size_t lvl, MA)(ref MA arr){
+ for (size_t i = 0; i< arr.length!lvl; i++)
+ arr.slice!(lvl)[i] = i;
+ }
+ Bitty m1;
+
+ m1.length!4 = 10;
+ m1.length!3 = 2^^6;
+ m1.length!2 = 2^^3;
+ m1.length!1 = 2^^4;
+ m1.length!0 = 2^^3;
+
+ m1.length!4 = 2^^16;
+
+ for (size_t i = 0; i< m1.length!4; i++)
+ m1.slice!(4)[i] = i % 2;
+
+ fillIdx!1(m1);
+ check!1(m1);
+ fillIdx!2(m1);
+ check!2(m1);
+ fillIdx!3(m1);
+ check!3(m1);
+ fillIdx!0(m1);
+ check!0(m1);
+ check!3(m1);
+ check!2(m1);
+ check!1(m1);
+ for (size_t i=0; i < 2^^16; i++)
+ {
+ m1.slice!(4)[i] = i % 2;
+ m1.slice!(0)[fn1(i)] = fn1(i);
+ m1.slice!(1)[fn2(i)] = fn2(i);
+ m1.slice!(2)[fn3(i)] = fn3(i);
+ m1.slice!(3)[fn4(i)] = fn4(i);
+ }
+ for (size_t i=0; i < 2^^16; i++)
+ {
+ assert(m1.slice!(4)[i] == i % 2);
+ assert(m1.slice!(0)[fn1(i)] == fn1(i));
+ assert(m1.slice!(1)[fn2(i)] == fn2(i));
+ assert(m1.slice!(2)[fn3(i)] == fn3(i));
+ assert(m1.slice!(3)[fn4(i)] == fn4(i));
+ }
+}
+
+size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
+{
+ import std.math : nextPow2;
+ enum bits = _bits == 1 ? 1 : nextPow2(_bits - 1);// see PackedArrayView
+ static if (bits > 8*size_t.sizeof)
+ {
+ static assert(bits % (size_t.sizeof*8) == 0);
+ return new_len * bits/(8*size_t.sizeof);
+ }
+ else
+ {
+ enum factor = size_t.sizeof*8/bits;
+ return (new_len+factor-1)/factor; // rounded up
+ }
+}
+
+template isBitPackableType(T)
+{
+ enum isBitPackableType = isBitPacked!T
+ || isIntegral!T || is(T == bool) || isSomeChar!T;
+}
+
+//============================================================================
+template PackedArrayView(T)
+if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
+ && isBitPackableType!U) || isBitPackableType!T)
+{
+ import std.math : nextPow2;
+ private enum bits = bitSizeOf!T;
+ alias PackedArrayView = PackedArrayViewImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
+}
+
+//unsafe and fast access to a chunk of RAM as if it contains packed values
+template PackedPtr(T)
+if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
+ && isBitPackableType!U) || isBitPackableType!T)
+{
+ import std.math : nextPow2;
+ private enum bits = bitSizeOf!T;
+ alias PackedPtr = PackedPtrImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
+}
+
+struct PackedPtrImpl(T, size_t bits)
+{
+pure nothrow:
+ static assert(isPow2OrZero(bits));
+
+ this(inout(size_t)* ptr)inout @safe @nogc
+ {
+ origin = ptr;
+ }
+
+ private T simpleIndex(size_t n) inout
+ {
+ immutable q = n / factor;
+ immutable r = n % factor;
+ return cast(T)((origin[q] >> bits*r) & mask);
+ }
+
+ private void simpleWrite(TypeOfBitPacked!T val, size_t n)
+ in
+ {
+ static if (isIntegral!T)
+ assert(val <= mask);
+ }
+ body
+ {
+ immutable q = n / factor;
+ immutable r = n % factor;
+ immutable tgt_shift = bits*r;
+ immutable word = origin[q];
+ origin[q] = (word & ~(mask << tgt_shift))
+ | (cast(size_t) val << tgt_shift);
+ }
+
+ static if (factor == bytesPerWord// can safely pack by byte
+ || factor == 1 // a whole word at a time
+ || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
+ && hasUnalignedReads)) // this needs unaligned reads
+ {
+ static if (factor == bytesPerWord)
+ alias U = ubyte;
+ else static if (factor == bytesPerWord/2)
+ alias U = ushort;
+ else static if (factor == bytesPerWord/4)
+ alias U = uint;
+ else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
+ alias U = ulong;
+
+ T opIndex(size_t idx) inout
+ {
+ return __ctfe ? simpleIndex(idx) :
+ cast(inout(T))(cast(U*) origin)[idx];
+ }
+
+ static if (isBitPacked!T) // lack of user-defined implicit conversion
+ {
+ void opIndexAssign(T val, size_t idx)
+ {
+ return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
+ }
+ }
+
+ void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
+ {
+ if (__ctfe)
+ simpleWrite(val, idx);
+ else
+ (cast(U*) origin)[idx] = cast(U) val;
+ }
+ }
+ else
+ {
+ T opIndex(size_t n) inout
+ {
+ return simpleIndex(n);
+ }
+
+ static if (isBitPacked!T) // lack of user-defined implicit conversion
+ {
+ void opIndexAssign(T val, size_t idx)
+ {
+ return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
+ }
+ }
+
+ void opIndexAssign(TypeOfBitPacked!T val, size_t n)
+ {
+ return simpleWrite(val, n);
+ }
+ }
+
+private:
+ // factor - number of elements in one machine word
+ enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
+ enum bytesPerWord = size_t.sizeof;
+ size_t* origin;
+}
+
+// data is packed only by power of two sized packs per word,
+// thus avoiding mul/div overhead at the cost of ultimate packing
+// this construct doesn't own memory, only provides access, see MultiArray for usage
+struct PackedArrayViewImpl(T, size_t bits)
+{
+pure nothrow:
+
+ this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
+ {
+ ptr = inout(PackedPtr!(T))(origin);
+ ofs = offset;
+ limit = items;
+ }
+
+ bool zeros(size_t s, size_t e)
+ in
+ {
+ assert(s <= e);
+ }
+ body
+ {
+ s += ofs;
+ e += ofs;
+ immutable pad_s = roundUp(s);
+ if ( s >= e)
+ {
+ foreach (i; s .. e)
+ if (ptr[i])
+ return false;
+ return true;
+ }
+ immutable pad_e = roundDown(e);
+ size_t i;
+ for (i=s; i<pad_s; i++)
+ if (ptr[i])
+ return false;
+ // all in between is x*factor elements
+ for (size_t j=i/factor; i<pad_e; i+=factor, j++)
+ if (ptr.origin[j])
+ return false;
+ for (; i<e; i++)
+ if (ptr[i])
+ return false;
+ return true;
+ }
+
+ T opIndex(size_t idx) inout
+ in
+ {
+ assert(idx < limit);
+ }
+ body
+ {
+ return ptr[ofs + idx];
+ }
+
+ static if (isBitPacked!T) // lack of user-defined implicit conversion
+ {
+ void opIndexAssign(T val, size_t idx)
+ {
+ return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
+ }
+ }
+
+ void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
+ in
+ {
+ assert(idx < limit);
+ }
+ body
+ {
+ ptr[ofs + idx] = val;
+ }
+
+ static if (isBitPacked!T) // lack of user-defined implicit conversions
+ {
+ void opSliceAssign(T val, size_t start, size_t end)
+ {
+ opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
+ }
+ }
+
+ void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
+ in
+ {
+ assert(start <= end);
+ assert(end <= limit);
+ }
+ body
+ {
+ // account for ofsetted view
+ start += ofs;
+ end += ofs;
+ // rounded to factor granularity
+ immutable pad_start = roundUp(start);// rounded up
+ if (pad_start >= end) //rounded up >= then end of slice
+ {
+ //nothing to gain, use per element assignment
+ foreach (i; start .. end)
+ ptr[i] = val;
+ return;
+ }
+ immutable pad_end = roundDown(end); // rounded down
+ size_t i;
+ for (i=start; i<pad_start; i++)
+ ptr[i] = val;
+ // all in between is x*factor elements
+ if (pad_start != pad_end)
+ {
+ immutable repval = replicateBits!(factor, bits)(val);
+ for (size_t j=i/factor; i<pad_end; i+=factor, j++)
+ ptr.origin[j] = repval;// so speed it up by factor
+ }
+ for (; i<end; i++)
+ ptr[i] = val;
+ }
+
+ auto opSlice(size_t from, size_t to)inout
+ in
+ {
+ assert(from <= to);
+ assert(ofs + to <= limit);
+ }
+ body
+ {
+ return typeof(this)(ptr.origin, ofs + from, to - from);
+ }
+
+ auto opSlice(){ return opSlice(0, length); }
+
+ bool opEquals(T)(auto ref T arr) const
+ {
+ if (limit != arr.limit)
+ return false;
+ size_t s1 = ofs, s2 = arr.ofs;
+ size_t e1 = s1 + limit, e2 = s2 + limit;
+ if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
+ {
+ return ptr.origin[s1/factor .. e1/factor]
+ == arr.ptr.origin[s2/factor .. e2/factor];
+ }
+ for (size_t i=0;i<limit; i++)
+ if (this[i] != arr[i])
+ return false;
+ return true;
+ }
+
+ @property size_t length()const{ return limit; }
+
+private:
+ auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
+ auto roundDown()(size_t val){ return val/factor*factor; }
+ // factor - number of elements in one machine word
+ enum factor = size_t.sizeof*8/bits;
+ PackedPtr!(T) ptr;
+ size_t ofs, limit;
+}
+
+
+private struct SliceOverIndexed(T)
+{
+ enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
+ enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; }));
+ auto opIndex(size_t idx)const
+ in
+ {
+ assert(idx < to - from);
+ }
+ body
+ {
+ return (*arr)[from+idx];
+ }
+
+ static if (assignableIndex)
+ void opIndexAssign(Item val, size_t idx)
+ in
+ {
+ assert(idx < to - from);
+ }
+ body
+ {
+ (*arr)[from+idx] = val;
+ }
+
+ auto opSlice(size_t a, size_t b)
+ {
+ return typeof(this)(from+a, from+b, arr);
+ }
+
+ // static if (assignableSlice)
+ void opSliceAssign(T)(T val, size_t start, size_t end)
+ {
+ (*arr)[start+from .. end+from] = val;
+ }
+
+ auto opSlice()
+ {
+ return typeof(this)(from, to, arr);
+ }
+
+ @property size_t length()const { return to-from;}
+
+ auto opDollar()const { return length; }
+
+ @property bool empty()const { return from == to; }
+
+ @property auto front()const { return (*arr)[from]; }
+
+ static if (assignableIndex)
+ @property void front(Item val) { (*arr)[from] = val; }
+
+ @property auto back()const { return (*arr)[to-1]; }
+
+ static if (assignableIndex)
+ @property void back(Item val) { (*arr)[to-1] = val; }
+
+ @property auto save() inout { return this; }
+
+ void popFront() { from++; }
+
+ void popBack() { to--; }
+
+ bool opEquals(T)(auto ref T arr) const
+ {
+ if (arr.length != length)
+ return false;
+ for (size_t i=0; i <length; i++)
+ if (this[i] != arr[i])
+ return false;
+ return true;
+ }
+private:
+ alias Item = typeof(T.init[0]);
+ size_t from, to;
+ T* arr;
+}
+
+static assert(isRandomAccessRange!(SliceOverIndexed!(int[])));
+
+SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
+if (is(Unqual!T == T))
+{
+ return SliceOverIndexed!(const(T))(a, b, x);
+}
+
+// BUG? inout is out of reach
+//...SliceOverIndexed.arr only parameters or stack based variables can be inout
+SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
+if (is(Unqual!T == T))
+{
+ return SliceOverIndexed!T(a, b, x);
+}
+
+@system unittest
+{
+ int[] idxArray = [2, 3, 5, 8, 13];
+ auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);
+
+ assert(!sliced.empty);
+ assert(sliced.front == 2);
+ sliced.front = 1;
+ assert(sliced.front == 1);
+ assert(sliced.back == 13);
+ sliced.popFront();
+ assert(sliced.front == 3);
+ assert(sliced.back == 13);
+ sliced.back = 11;
+ assert(sliced.back == 11);
+ sliced.popBack();
+
+ assert(sliced.front == 3);
+ assert(sliced[$-1] == 8);
+ sliced = sliced[];
+ assert(sliced[0] == 3);
+ assert(sliced.back == 8);
+ sliced = sliced[1..$];
+ assert(sliced.front == 5);
+ sliced = sliced[0..$-1];
+ assert(sliced[$-1] == 5);
+
+ int[] other = [2, 5];
+ assert(sliced[] == sliceOverIndexed(1, 2, &other));
+ sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
+ assert(idxArray[0 .. 2] == [-1, -1]);
+ uint[] nullArr = null;
+ auto nullSlice = sliceOverIndexed(0, 0, &idxArray);
+ assert(nullSlice.empty);
+}
+
+private auto packedArrayView(T)(inout(size_t)* ptr, size_t items) @trusted pure nothrow
+{
+ return inout(PackedArrayView!T)(ptr, 0, items);
+}
+
+
+//============================================================================
+// Partially unrolled binary search using Shar's method
+//============================================================================
+
+string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
+{
+ import core.bitop : bsr;
+ import std.array : replace;
+ import std.conv : to;
+ assert(isPow2OrZero(size));
+ string code = `
+ import core.bitop : bsr;
+ auto power = bsr(m)+1;
+ switch (power){`;
+ size_t i = bsr(size);
+ foreach_reverse (val; 0 .. bsr(size))
+ {
+ auto v = 2^^val;
+ code ~= `
+ case pow:
+ if (pred(range[idx+m], needle))
+ idx += m;
+ goto case;
+ `.replace("m", to!string(v))
+ .replace("pow", to!string(i));
+ i--;
+ }
+ code ~= `
+ case 0:
+ if (pred(range[idx], needle))
+ idx += 1;
+ goto default;
+ `;
+ code ~= `
+ default:
+ }`;
+ return code;
+}
+
+bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
+{
+ // See also: std.math.isPowerOf2()
+ return (sz & (sz-1)) == 0;
+}
+
+size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
+if (is(T : ElementType!Range))
+{
+ assert(isPow2OrZero(range.length));
+ size_t idx = 0, m = range.length/2;
+ while (m != 0)
+ {
+ if (pred(range[idx+m], needle))
+ idx += m;
+ m /= 2;
+ }
+ if (pred(range[idx], needle))
+ idx += 1;
+ return idx;
+}
+
+size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
+if (is(T : ElementType!Range))
+{
+ assert(isPow2OrZero(range.length));
+ size_t idx = 0, m = range.length/2;
+ enum max = 1 << 10;
+ while (m >= max)
+ {
+ if (pred(range[idx+m], needle))
+ idx += m;
+ m /= 2;
+ }
+ mixin(genUnrolledSwitchSearch(max));
+ return idx;
+}
+
+template sharMethod(alias uniLowerBound)
+{
+ size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
+ if (is(T : ElementType!Range))
+ {
+ import std.functional : binaryFun;
+ import std.math : nextPow2, truncPow2;
+ alias pred = binaryFun!_pred;
+ if (range.length == 0)
+ return 0;
+ if (isPow2OrZero(range.length))
+ return uniLowerBound!pred(range, needle);
+ size_t n = truncPow2(range.length);
+ if (pred(range[n-1], needle))
+ {// search in another 2^^k area that fully covers the tail of range
+ size_t k = nextPow2(range.length - n + 1);
+ return range.length - k + uniLowerBound!pred(range[$-k..$], needle);
+ }
+ else
+ return uniLowerBound!pred(range[0 .. n], needle);
+ }
+}
+
+alias sharLowerBound = sharMethod!uniformLowerBound;
+alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;
+
+@safe unittest
+{
+ import std.array : array;
+ import std.range : assumeSorted, iota;
+
+ auto stdLowerBound(T)(T[] range, T needle)
+ {
+ return assumeSorted(range).lowerBound(needle).length;
+ }
+ immutable MAX = 5*1173;
+ auto arr = array(iota(5, MAX, 5));
+ assert(arr.length == MAX/5-1);
+ foreach (i; 0 .. MAX+5)
+ {
+ auto st = stdLowerBound(arr, i);
+ assert(st == sharLowerBound(arr, i));
+ assert(st == sharSwitchLowerBound(arr, i));
+ }
+ arr = [];
+ auto st = stdLowerBound(arr, 33);
+ assert(st == sharLowerBound(arr, 33));
+ assert(st == sharSwitchLowerBound(arr, 33));
+}
+//============================================================================
+
+@safe
+{
+// hope to see simillar stuff in public interface... once Allocators are out
+//@@@BUG moveFront and friends? dunno, for now it's POD-only
+
+@trusted size_t genericReplace(Policy=void, T, Range)
+ (ref T dest, size_t from, size_t to, Range stuff)
+{
+ import std.algorithm.mutation : copy;
+ size_t delta = to - from;
+ size_t stuff_end = from+stuff.length;
+ if (stuff.length > delta)
+ {// replace increases length
+ delta = stuff.length - delta;// now, new is > old by delta
+ static if (is(Policy == void))
+ dest.length = dest.length+delta;//@@@BUG lame @property
+ else
+ dest = Policy.realloc(dest, dest.length+delta);
+ copyBackwards(dest[to .. dest.length-delta],
+ dest[to+delta .. dest.length]);
+ copyForward(stuff, dest[from .. stuff_end]);
+ }
+ else if (stuff.length == delta)
+ {
+ copy(stuff, dest[from .. to]);
+ }
+ else
+ {// replace decreases length by delta
+ delta = delta - stuff.length;
+ copy(stuff, dest[from .. stuff_end]);
+ copyForward(dest[to .. dest.length],
+ dest[stuff_end .. dest.length-delta]);
+ static if (is(Policy == void))
+ dest.length = dest.length - delta;//@@@BUG lame @property
+ else
+ dest = Policy.realloc(dest, dest.length-delta);
+ }
+ return stuff_end;
+}
+
+
+// Simple storage manipulation policy
+@trusted private struct GcPolicy
+{
+ import std.traits : isDynamicArray;
+
+ static T[] dup(T)(const T[] arr)
+ {
+ return arr.dup;
+ }
+
+ static T[] alloc(T)(size_t size)
+ {
+ return new T[size];
+ }
+
+ static T[] realloc(T)(T[] arr, size_t sz)
+ {
+ arr.length = sz;
+ return arr;
+ }
+
+ static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
+ {
+ replaceInPlace(dest, from, to, stuff);
+ }
+
+ static void append(T, V)(ref T[] arr, V value)
+ if (!isInputRange!V)
+ {
+ arr ~= force!T(value);
+ }
+
+ static void append(T, V)(ref T[] arr, V value)
+ if (isInputRange!V)
+ {
+ insertInPlace(arr, arr.length, value);
+ }
+
+ static void destroy(T)(ref T arr)
+ if (isDynamicArray!T && is(Unqual!T == T))
+ {
+ debug
+ {
+ arr[] = cast(typeof(T.init[0]))(0xdead_beef);
+ }
+ arr = null;
+ }
+
+ static void destroy(T)(ref T arr)
+ if (isDynamicArray!T && !is(Unqual!T == T))
+ {
+ arr = null;
+ }
+}
+
+// ditto
+@trusted struct ReallocPolicy
+{
+ import std.range.primitives : hasLength;
+
+ static T[] dup(T)(const T[] arr)
+ {
+ auto result = alloc!T(arr.length);
+ result[] = arr[];
+ return result;
+ }
+
+ static T[] alloc(T)(size_t size)
+ {
+ import core.stdc.stdlib : malloc;
+ import std.exception : enforce;
+
+ import core.checkedint : mulu;
+ bool overflow;
+ size_t nbytes = mulu(size, T.sizeof, overflow);
+ if (overflow) assert(0);
+
+ auto ptr = cast(T*) enforce(malloc(nbytes), "out of memory on C heap");
+ return ptr[0 .. size];
+ }
+
+ static T[] realloc(T)(T[] arr, size_t size)
+ {
+ import core.stdc.stdlib : realloc;
+ import std.exception : enforce;
+ if (!size)
+ {
+ destroy(arr);
+ return null;
+ }
+
+ import core.checkedint : mulu;
+ bool overflow;
+ size_t nbytes = mulu(size, T.sizeof, overflow);
+ if (overflow) assert(0);
+
+ auto ptr = cast(T*) enforce(realloc(arr.ptr, nbytes), "out of memory on C heap");
+ return ptr[0 .. size];
+ }
+
+ static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
+ {
+ genericReplace!(ReallocPolicy)(dest, from, to, stuff);
+ }
+
+ static void append(T, V)(ref T[] arr, V value)
+ if (!isInputRange!V)
+ {
+ if (arr.length == size_t.max) assert(0);
+ arr = realloc(arr, arr.length+1);
+ arr[$-1] = force!T(value);
+ }
+
+ @safe unittest
+ {
+ int[] arr;
+ ReallocPolicy.append(arr, 3);
+
+ import std.algorithm.comparison : equal;
+ assert(equal(arr, [3]));
+ }
+
+ static void append(T, V)(ref T[] arr, V value)
+ if (isInputRange!V && hasLength!V)
+ {
+ import core.checkedint : addu;
+ bool overflow;
+ size_t nelems = addu(arr.length, value.length, overflow);
+ if (overflow) assert(0);
+
+ arr = realloc(arr, nelems);
+
+ import std.algorithm.mutation : copy;
+ copy(value, arr[$-value.length..$]);
+ }
+
+ @safe unittest
+ {
+ int[] arr;
+ ReallocPolicy.append(arr, [1,2,3]);
+
+ import std.algorithm.comparison : equal;
+ assert(equal(arr, [1,2,3]));
+ }
+
+ static void destroy(T)(ref T[] arr)
+ {
+ import core.stdc.stdlib : free;
+ if (arr.ptr)
+ free(arr.ptr);
+ arr = null;
+ }
+}
+
+//build hack
+alias _RealArray = CowArray!ReallocPolicy;
+
+@safe unittest
+{
+ import std.algorithm.comparison : equal;
+
+ with(ReallocPolicy)
+ {
+ bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
+ string file = __FILE__, size_t line = __LINE__)
+ {
+ {
+ replaceImpl(orig, from, to, toReplace);
+ scope(exit) destroy(orig);
+ if (!equal(orig, result))
+ return false;
+ }
+ return true;
+ }
+ static T[] arr(T)(T[] args... )
+ {
+ return dup(args);
+ }
+
+ assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
+ assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
+ assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
+ assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
+ assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
+ }
+}
+
+/**
+ Tests if T is some kind a set of code points. Intended for template constraints.
+*/
+public template isCodepointSet(T)
+{
+ static if (is(T dummy == InversionList!(Args), Args...))
+ enum isCodepointSet = true;
+ else
+ enum isCodepointSet = false;
+}
+
+/**
+ Tests if $(D T) is a pair of integers that implicitly convert to $(D V).
+ The following code must compile for any pair $(D T):
+ ---
+ (T x){ V a = x[0]; V b = x[1];}
+ ---
+ The following must not compile:
+ ---
+ (T x){ V c = x[2];}
+ ---
+*/
+public template isIntegralPair(T, V=uint)
+{
+ enum isIntegralPair = is(typeof((T x){ V a = x[0]; V b = x[1];}))
+ && !is(typeof((T x){ V c = x[2]; }));
+}
+
+
+/**
+ The recommended default type for set of $(CODEPOINTS).
+ For details, see the current implementation: $(LREF InversionList).
+*/
+public alias CodepointSet = InversionList!GcPolicy;
+
+
+//@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
+// which relies on std.uni.isGraphical and this chain blows up with Forward reference error
+// hence below doesn't seem to work
+// public alias CodepointInterval = Tuple!(uint, "a", uint, "b");
+
+/**
+ The recommended type of $(REF Tuple, std,_typecons)
+ to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
+ Any interval type should pass $(LREF isIntegralPair) trait.
+*/
+public struct CodepointInterval
+{
+pure:
+ uint[2] _tuple;
+ alias _tuple this;
+
+@safe pure nothrow @nogc:
+
+ this(uint low, uint high)
+ {
+ _tuple[0] = low;
+ _tuple[1] = high;
+ }
+ bool opEquals(T)(T val) const
+ {
+ return this[0] == val[0] && this[1] == val[1];
+ }
+ @property ref inout(uint) a() inout { return _tuple[0]; }
+ @property ref inout(uint) b() inout { return _tuple[1]; }
+}
+
+/**
+ $(P
+ $(D InversionList) is a set of $(CODEPOINTS)
+ represented as an array of open-right [a, b$(RPAREN)
+ intervals (see $(LREF CodepointInterval) above).
+ The name comes from the way the representation reads left to right.
+ For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
+ plus a singular value 60 looks like this:
+ )
+ ---
+ 10, 50, 60, 61, 80, 90
+ ---
+ $(P
+ The way to read this is: start with negative meaning that all numbers
+ smaller then the next one are not present in this set (and positive
+ - the contrary). Then switch positive/negative after each
+ number passed from left to right.
+ )
+ $(P This way negative spans until 10, then positive until 50,
+ then negative until 60, then positive until 61, and so on.
+ As seen this provides a space-efficient storage of highly redundant data
+ that comes in long runs. A description which Unicode $(CHARACTER)
+ properties fit nicely. The technique itself could be seen as a variation
+ on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
+ )
+
+ $(P Sets are value types (just like $(D int) is) thus they
+ are never aliased.
+ )
+ Example:
+ ---
+ auto a = CodepointSet('a', 'z'+1);
+ auto b = CodepointSet('A', 'Z'+1);
+ auto c = a;
+ a = a | b;
+ assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
+ assert(a != c);
+ ---
+ $(P See also $(LREF unicode) for simpler construction of sets
+ from predefined ones.
+ )
+
+ $(P Memory usage is 8 bytes per each contiguous interval in a set.
+ The value semantics are achieved by using the
+ $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
+ and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
+ )
+
+ Note:
+ $(P It's not recommended to rely on the template parameters
+ or the exact type of a current $(CODEPOINT) set in $(D std.uni).
+ The type and parameters may change when the standard
+ allocators design is finalized.
+ Use $(LREF isCodepointSet) with templates or just stick with the default
+ alias $(LREF CodepointSet) throughout the whole code base.
+ )
+*/
+@trusted public struct InversionList(SP=GcPolicy)
+{
+ import std.range : assumeSorted;
+
+ /**
+ Construct from another code point set of any type.
+ */
+ this(Set)(Set set) pure
+ if (isCodepointSet!Set)
+ {
+ uint[] arr;
+ foreach (v; set.byInterval)
+ {
+ arr ~= v.a;
+ arr ~= v.b;
+ }
+ data = CowArray!(SP).reuse(arr);
+ }
+
+ /**
+ Construct a set from a forward range of code point intervals.
+ */
+ this(Range)(Range intervals) pure
+ if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
+ {
+ uint[] arr;
+ foreach (v; intervals)
+ {
+ SP.append(arr, v.a);
+ SP.append(arr, v.b);
+ }
+ data = CowArray!(SP).reuse(arr);
+ sanitize(); //enforce invariant: sort intervals etc.
+ }
+
+ //helper function that avoids sanity check to be CTFE-friendly
+ private static fromIntervals(Range)(Range intervals) pure
+ {
+ import std.algorithm.iteration : map;
+ import std.range : roundRobin;
+ auto flattened = roundRobin(intervals.save.map!"a[0]"(),
+ intervals.save.map!"a[1]"());
+ InversionList set;
+ set.data = CowArray!(SP)(flattened);
+ return set;
+ }
+ //ditto untill sort is CTFE-able
+ private static fromIntervals()(uint[] intervals...) pure
+ in
+ {
+ import std.conv : text;
+ assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
+ for (uint i = 0; i < intervals.length; i += 2)
+ {
+ auto a = intervals[i], b = intervals[i+1];
+ assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
+ }
+ }
+ body
+ {
+ InversionList set;
+ set.data = CowArray!(SP)(intervals);
+ return set;
+ }
+
+ /**
+ Construct a set from plain values of code point intervals.
+ */
+ this()(uint[] intervals...)
+ in
+ {
+ import std.conv : text;
+ assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
+ for (uint i = 0; i < intervals.length; i += 2)
+ {
+ auto a = intervals[i], b = intervals[i+1];
+ assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
+ }
+ }
+ body
+ {
+ data = CowArray!(SP)(intervals);
+ sanitize(); //enforce invariant: sort intervals etc.
+ }
+
+ ///
+ @safe unittest
+ {
+ import std.algorithm.comparison : equal;
+
+ auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
+ foreach (v; 'a'..'z'+1)
+ assert(set[v]);
+ // Cyrillic lowercase interval
+ foreach (v; 'а'..'я'+1)
+ assert(set[v]);
+ //specific order is not required, intervals may interesect
+ auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
+ //the same end result
+ assert(set2.byInterval.equal(set.byInterval));
+ }
+
+ /**
+ Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
+
+ Example:
+ -----------
+ import std.algorithm.comparison : equal;
+ import std.typecons : tuple;
+
+ auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);
+
+ assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
+ -----------
+ */
+ @property auto byInterval()
+ {
+ return Intervals!(typeof(data))(data);
+ }
+
+ /**
+ Tests the presence of code point $(D val) in this set.
+ */
+ bool opIndex(uint val) const
+ {
+ // the <= ensures that searching in interval of [a, b) for 'a' you get .length == 1
+ // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1;
+ return sharSwitchLowerBound!"a <= b"(data[], val) & 1;
+ }
+
+ ///
+ @safe unittest
+ {
+ auto gothic = unicode.Gothic;
+ // Gothic letter ahsa
+ assert(gothic['\U00010330']);
+ // no ascii in Gothic obviously
+ assert(!gothic['$']);
+ }
+
+
+ // Linear scan for $(D ch). Useful only for small sets.
+ // TODO:
+ // used internally in std.regex
+ // should be properly exposed in a public API ?
+ package auto scanFor()(dchar ch) const
+ {
+ immutable len = data.length;
+ for (size_t i = 0; i < len; i++)
+ if (ch < data[i])
+ return i & 1;
+ return 0;
+ }
+
+ /// Number of $(CODEPOINTS) in this set
+ @property size_t length()
+ {
+ size_t sum = 0;
+ foreach (iv; byInterval)
+ {
+ sum += iv.b - iv.a;
+ }
+ return sum;
+ }
+
+// bootstrap full set operations from 4 primitives (suitable as a template mixin):
+// addInterval, skipUpTo, dropUpTo & byInterval iteration
+//============================================================================
+public:
+ /**
+ $(P Sets support natural syntax for set algebra, namely: )
+ $(BOOKTABLE ,
+ $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
+ $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
+ $(TR $(TD |) $(TD a ∪ b) $(TD union) )
+ $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
+ $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
+ )
+ */
+ This opBinary(string op, U)(U rhs)
+ if (isCodepointSet!U || is(U:dchar))
+ {
+ static if (op == "&" || op == "|" || op == "~")
+ {// symmetric ops thus can swap arguments to reuse r-value
+ static if (is(U:dchar))
+ {
+ auto tmp = this;
+ mixin("tmp "~op~"= rhs; ");
+ return tmp;
+ }
+ else
+ {
+ static if (is(Unqual!U == U))
+ {
+ // try hard to reuse r-value
+ mixin("rhs "~op~"= this;");
+ return rhs;
+ }
+ else
+ {
+ auto tmp = this;
+ mixin("tmp "~op~"= rhs;");
+ return tmp;
+ }
+ }
+ }
+ else static if (op == "-") // anti-symmetric
+ {
+ auto tmp = this;
+ tmp -= rhs;
+ return tmp;
+ }
+ else
+ static assert(0, "no operator "~op~" defined for Set");
+ }
+
+ ///
+ @safe unittest
+ {
+ import std.algorithm.comparison : equal;
+ import std.range : iota;
+
+ auto lower = unicode.LowerCase;
+ auto upper = unicode.UpperCase;
+ auto ascii = unicode.ASCII;
+
+ assert((lower & upper).empty); // no intersection
+ auto lowerASCII = lower & ascii;
+ assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
+ // throw away all of the lowercase ASCII
+ assert((ascii - lower).length == 128 - 26);
+
+ auto onlyOneOf = lower ~ ascii;
+ assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
+ assert(onlyOneOf['$']); // ASCII and not lowercase
+ assert(!onlyOneOf['a']); // ASCII and lowercase
+ assert(onlyOneOf['я']); // not ASCII but lowercase
+
+ // throw away all cased letters from ASCII
+ auto noLetters = ascii - (lower | upper);
+ assert(noLetters.length == 128 - 26*2);
+ }
+
+ /// The 'op=' versions of the above overloaded operators.
+ ref This opOpAssign(string op, U)(U rhs)
+ if (isCodepointSet!U || is(U:dchar))
+ {
+ static if (op == "|") // union
+ {
+ static if (is(U:dchar))
+ {
+ this.addInterval(rhs, rhs+1);
+ return this;
+ }
+ else
+ return this.add(rhs);
+ }
+ else static if (op == "&") // intersection
+ return this.intersect(rhs);// overloaded
+ else static if (op == "-") // set difference
+ return this.sub(rhs);// overloaded
+ else static if (op == "~") // symmetric set difference
+ {
+ auto copy = this & rhs;
+ this |= rhs;
+ this -= copy;
+ return this;
+ }
+ else
+ static assert(0, "no operator "~op~" defined for Set");
+ }
+
+ /**
+ Tests the presence of codepoint $(D ch) in this set,
+ the same as $(LREF opIndex).
+ */
+ bool opBinaryRight(string op: "in", U)(U ch) const
+ if (is(U : dchar))
+ {
+ return this[ch];
+ }
+
+ ///
+ @safe unittest
+ {
+ assert('я' in unicode.Cyrillic);
+ assert(!('z' in unicode.Cyrillic));
+ }
+
+
+
+ /**
+ * Obtains a set that is the inversion of this set.
+ *
+ * See_Also: $(LREF inverted)
+ */
+ auto opUnary(string op: "!")()
+ {
+ return this.inverted;
+ }
+
+ /**
+ A range that spans each $(CODEPOINT) in this set.
+ */
+ @property auto byCodepoint()
+ {
+ @trusted static struct CodepointRange
+ {
+ this(This set)
+ {
+ r = set.byInterval;
+ if (!r.empty)
+ cur = r.front.a;
+ }
+
+ @property dchar front() const
+ {
+ return cast(dchar) cur;
+ }
+
+ @property bool empty() const
+ {
+ return r.empty;
+ }
+
+ void popFront()
+ {
+ cur++;
+ while (cur >= r.front.b)
+ {
+ r.popFront();
+ if (r.empty)
+ break;
+ cur = r.front.a;
+ }
+ }
+ private:
+ uint cur;
+ typeof(This.init.byInterval) r;
+ }
+
+ return CodepointRange(this);
+ }
+
+ ///
+ @safe unittest
+ {
+ import std.algorithm.comparison : equal;
+ import std.range : iota;
+
+ auto set = unicode.ASCII;
+ set.byCodepoint.equal(iota(0, 0x80));
+ }
+
+ /**
+ $(P Obtain textual representation of this set in from of
+ open-right intervals and feed it to $(D sink).
+ )
+ $(P Used by various standard formatting facilities such as
+ $(REF formattedWrite, std,_format), $(REF write, std,_stdio),
+ $(REF writef, std,_stdio), $(REF to, std,_conv) and others.
+ )
+ Example:
+ ---
+ import std.conv;
+ assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
+ ---
+ */
+
+ private import std.format : FormatSpec;
+
+ /***************************************
+ * Obtain a textual representation of this InversionList
+ * in form of open-right intervals.
+ *
+ * The formatting flag is applied individually to each value, for example:
+ * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
+ * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
+ * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters)
+ */
+ void toString(Writer)(scope Writer sink,
+ FormatSpec!char fmt) /* const */
+ {
+ import std.format : formatValue;
+ auto range = byInterval;
+ if (range.empty)
+ return;
+
+ while (1)
+ {
+ auto i = range.front;
+ range.popFront();
+
+ put(sink, "[");
+ formatValue(sink, i.a, fmt);
+ put(sink, "..");
+ formatValue(sink, i.b, fmt);
+ put(sink, ")");
+ if (range.empty) return;
+ put(sink, " ");
+ }
+ }
+
+ ///
+ @safe unittest
+ {
+ import std.conv : to;
+ import std.format : format;
+ import std.uni : unicode;
+
+ assert(unicode.Cyrillic.to!string ==
+ "[1024..1157) [1159..1320) [7467..7468) [7544..7545) [11744..11776) [42560..42648) [42655..42656)");
+
+ // The specs '%s' and '%d' are equivalent to the to!string call above.
+ assert(format("%d", unicode.Cyrillic) == unicode.Cyrillic.to!string);
+
+ assert(format("%#x", unicode.Cyrillic) ==
+ "[0x400..0x485) [0x487..0x528) [0x1d2b..0x1d2c) [0x1d78..0x1d79) [0x2de0..0x2e00) "
+ ~"[0xa640..0xa698) [0xa69f..0xa6a0)");
+
+ assert(format("%#X", unicode.Cyrillic) ==
+ "[0X400..0X485) [0X487..0X528) [0X1D2B..0X1D2C) [0X1D78..0X1D79) [0X2DE0..0X2E00) "
+ ~"[0XA640..0XA698) [0XA69F..0XA6A0)");
+ }
+
+ @safe unittest
+ {
+ import std.exception : assertThrown;
+ import std.format : format, FormatException;
+ assertThrown!FormatException(format("%a", unicode.ASCII));
+ }
+
+
+ /**
+ Add an interval [a, b$(RPAREN) to this set.
+ */
+ ref add()(uint a, uint b)
+ {
+ addInterval(a, b);
+ return this;
+ }
+
+ ///
+ @safe unittest
+ {
+ CodepointSet someSet;
+ someSet.add('0', '5').add('A','Z'+1);
+ someSet.add('5', '9'+1);
+ assert(someSet['0']);
+ assert(someSet['5']);
+ assert(someSet['9']);
+ assert(someSet['Z']);
+ }
+
+private:
+
+ package(std) // used from: std.regex.internal.parser
+ ref intersect(U)(U rhs)
+ if (isCodepointSet!U)
+ {
+ Marker mark;
+ foreach ( i; rhs.byInterval)
+ {
+ mark = this.dropUpTo(i.a, mark);
+ mark = this.skipUpTo(i.b, mark);
+ }
+ this.dropUpTo(uint.max, mark);
+ return this;
+ }
+
+ ref intersect()(dchar ch)
+ {
+ foreach (i; byInterval)
+ if (i.a <= ch && ch < i.b)
+ return this = This.init.add(ch, ch+1);
+ this = This.init;
+ return this;
+ }
+
+ @safe unittest
+ {
+ assert(unicode.Cyrillic.intersect('-').byInterval.empty);
+ }
+
+ ref sub()(dchar ch)
+ {
+ return subChar(ch);
+ }
+
+ // same as the above except that skip & drop parts are swapped
+ package(std) // used from: std.regex.internal.parser
+ ref sub(U)(U rhs)
+ if (isCodepointSet!U)
+ {
+ Marker mark;
+ foreach (i; rhs.byInterval)
+ {
+ mark = this.skipUpTo(i.a, mark);
+ mark = this.dropUpTo(i.b, mark);
+ }
+ return this;
+ }
+
+ package(std) // used from: std.regex.internal.parse
+ ref add(U)(U rhs)
+ if (isCodepointSet!U)
+ {
+ Marker start;
+ foreach (i; rhs.byInterval)
+ {
+ start = addInterval(i.a, i.b, start);
+ }
+ return this;
+ }
+
+// end of mixin-able part
+//============================================================================
+public:
+ /**
+ Obtains a set that is the inversion of this set.
+
+ See the '!' $(LREF opUnary) for the same but using operators.
+ */
+ @property auto inverted()
+ {
+ InversionList inversion = this;
+ if (inversion.data.length == 0)
+ {
+ inversion.addInterval(0, lastDchar+1);
+ return inversion;
+ }
+ if (inversion.data[0] != 0)
+ genericReplace(inversion.data, 0, 0, [0]);
+ else
+ genericReplace(inversion.data, 0, 1, cast(uint[]) null);
+ if (data[data.length-1] != lastDchar+1)
+ genericReplace(inversion.data,
+ inversion.data.length, inversion.data.length, [lastDchar+1]);
+ else
+ genericReplace(inversion.data,
+ inversion.data.length-1, inversion.data.length, cast(uint[]) null);
+
+ return inversion;
+ }
+
+ ///
+ @safe unittest
+ {
+ auto set = unicode.ASCII;
+ // union with the inverse gets all of the code points in the Unicode
+ assert((set | set.inverted).length == 0x110000);
+ // no intersection with the inverse
+ assert((set & set.inverted).empty);
+ }
+
+ /**
+ Generates string with D source code of unary function with name of
+ $(D funcName) taking a single $(D dchar) argument. If $(D funcName) is empty
+ the code is adjusted to be a lambda function.
+
+ The function generated tests if the $(CODEPOINT) passed
+ belongs to this set or not. The result is to be used with string mixin.
+ The intended usage area is aggressive optimization via meta programming
+ in parser generators and the like.
+
+ Note: Use with care for relatively small or regular sets. It
+ could end up being slower then just using multi-staged tables.
+
+ Example:
+ ---
+ import std.stdio;
+
+ // construct set directly from [a, b$RPAREN intervals
+ auto set = CodepointSet(10, 12, 45, 65, 100, 200);
+ writeln(set);
+ writeln(set.toSourceCode("func"));
+ ---
+
+ The above outputs something along the lines of:
+ ---
+ bool func(dchar ch) @safe pure nothrow @nogc
+ {
+ if (ch < 45)
+ {
+ if (ch == 10 || ch == 11) return true;
+ return false;
+ }
+ else if (ch < 65) return true;
+ else
+ {
+ if (ch < 100) return false;
+ if (ch < 200) return true;
+ return false;
+ }
+ }
+ ---
+ */
+ string toSourceCode(string funcName="")
+ {
+ import std.algorithm.searching : countUntil;
+ import std.array : array;
+ import std.format : format;
+ enum maxBinary = 3;
+ static string linearScope(R)(R ivals, string indent)
+ {
+ string result = indent~"{\n";
+ string deeper = indent~" ";
+ foreach (ival; ivals)
+ {
+ immutable span = ival[1] - ival[0];
+ assert(span != 0);
+ if (span == 1)
+ {
+ result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
+ }
+ else if (span == 2)
+ {
+ result ~= format("%sif (ch == %s || ch == %s) return true;\n",
+ deeper, ival[0], ival[0]+1);
+ }
+ else
+ {
+ if (ival[0] != 0) // dchar is unsigned and < 0 is useless
+ result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
+ result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
+ }
+ }
+ result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
+ return result;
+ }
+
+ static string binaryScope(R)(R ivals, string indent)
+ {
+ // time to do unrolled comparisons?
+ if (ivals.length < maxBinary)
+ return linearScope(ivals, indent);
+ else
+ return bisect(ivals, ivals.length/2, indent);
+ }
+
+ // not used yet if/elsebinary search is far better with DMD as of 2.061
+ // and GDC is doing fine job either way
+ static string switchScope(R)(R ivals, string indent)
+ {
+ string result = indent~"switch (ch){\n";
+ string deeper = indent~" ";
+ foreach (ival; ivals)
+ {
+ if (ival[0]+1 == ival[1])
+ {
+ result ~= format("%scase %s: return true;\n",
+ deeper, ival[0]);
+ }
+ else
+ {
+ result ~= format("%scase %s: .. case %s: return true;\n",
+ deeper, ival[0], ival[1]-1);
+ }
+ }
+ result ~= deeper~"default: return false;\n"~indent~"}\n";
+ return result;
+ }
+
+ static string bisect(R)(R range, size_t idx, string indent)
+ {
+ string deeper = indent ~ " ";
+ // bisect on one [a, b) interval at idx
+ string result = indent~"{\n";
+ // less branch, < a
+ result ~= format("%sif (ch < %s)\n%s",
+ deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
+ // middle point, >= a && < b
+ result ~= format("%selse if (ch < %s) return true;\n",
+ deeper, range[idx][1]);
+ // greater or equal branch, >= b
+ result ~= format("%selse\n%s",
+ deeper, binaryScope(range[idx+1..$], deeper));
+ return result~indent~"}\n";
+ }
+
+ string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
+ funcName.empty ? "function" : funcName);
+ auto range = byInterval.array();
+ // special case first bisection to be on ASCII vs beyond
+ auto tillAscii = countUntil!"a[0] > 0x80"(range);
+ if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
+ code ~= binaryScope(range, "");
+ else
+ code ~= bisect(range, tillAscii, "");
+ return code;
+ }
+
+ /**
+ True if this set doesn't contain any $(CODEPOINTS).
+ */
+ @property bool empty() const
+ {
+ return data.length == 0;
+ }
+
+ ///
+ @safe unittest
+ {
+ CodepointSet emptySet;
+ assert(emptySet.length == 0);
+ assert(emptySet.empty);
+ }
+
+private:
+ alias This = typeof(this);
+ alias Marker = size_t;
+
+ // a random-access range of integral pairs
+ static struct Intervals(Range)
+ {
+ this(Range sp)
+ {
+ slice = sp;
+ start = 0;
+ end = sp.length;
+ }
+
+ this(Range sp, size_t s, size_t e)
+ {
+ slice = sp;
+ start = s;
+ end = e;
+ }
+
+ @property auto front()const
+ {
+ immutable a = slice[start];
+ immutable b = slice[start+1];
+ return CodepointInterval(a, b);
+ }
+
+ //may break sorted property - but we need std.sort to access it
+ //hence package protection attribute
+ package @property void front(CodepointInterval val)
+ {
+ slice[start] = val.a;
+ slice[start+1] = val.b;
+ }
+
+ @property auto back()const
+ {
+ immutable a = slice[end-2];
+ immutable b = slice[end-1];
+ return CodepointInterval(a, b);
+ }
+
+ //ditto about package
+ package @property void back(CodepointInterval val)
+ {
+ slice[end-2] = val.a;
+ slice[end-1] = val.b;
+ }
+
+ void popFront()
+ {
+ start += 2;
+ }
+
+ void popBack()
+ {
+ end -= 2;
+ }
+
+ auto opIndex(size_t idx) const
+ {
+ immutable a = slice[start+idx*2];
+ immutable b = slice[start+idx*2+1];
+ return CodepointInterval(a, b);
+ }
+
+ //ditto about package
+ package void opIndexAssign(CodepointInterval val, size_t idx)
+ {
+ slice[start+idx*2] = val.a;
+ slice[start+idx*2+1] = val.b;
+ }
+
+ auto opSlice(size_t s, size_t e)
+ {
+ return Intervals(slice, s*2+start, e*2+start);
+ }
+
+ @property size_t length()const { return slice.length/2; }
+
+ @property bool empty()const { return start == end; }
+
+ @property auto save(){ return this; }
+ private:
+ size_t start, end;
+ Range slice;
+ }
+
+ // called after construction from intervals
+ // to make sure invariants hold
+ void sanitize()
+ {
+ import std.algorithm.comparison : max;
+ import std.algorithm.mutation : SwapStrategy;
+ import std.algorithm.sorting : sort;
+ if (data.length == 0)
+ return;
+ alias Ival = CodepointInterval;
+ //intervals wrapper for a _range_ over packed array
+ auto ivals = Intervals!(typeof(data[]))(data[]);
+ //@@@BUG@@@ can't use "a.a < b.a" see issue 12265
+ sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
+ // what follows is a variation on stable remove
+ // differences:
+ // - predicate is binary, and is tested against
+ // the last kept element (at 'i').
+ // - predicate mutates lhs (merges rhs into lhs)
+ size_t len = ivals.length;
+ size_t i = 0;
+ size_t j = 1;
+ while (j < len)
+ {
+ if (ivals[i].b >= ivals[j].a)
+ {
+ ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
+ j++;
+ }
+ else //unmergable
+ {
+ // check if there is a hole after merges
+ // (in the best case we do 0 writes to ivals)
+ if (j != i+1)
+ ivals[i+1] = ivals[j]; //copy over
+ i++;
+ j++;
+ }
+ }
+ len = i + 1;
+ for (size_t k=0; k + 1 < len; k++)
+ {
+ assert(ivals[k].a < ivals[k].b);
+ assert(ivals[k].b < ivals[k+1].a);
+ }
+ data.length = len * 2;
+ }
+
+ // special case for normal InversionList
+ ref subChar(dchar ch)
+ {
+ auto mark = skipUpTo(ch);
+ if (mark != data.length
+ && data[mark] == ch && data[mark-1] == ch)
+ {
+ // it has split, meaning that ch happens to be in one of intervals
+ data[mark] = data[mark]+1;
+ }
+ return this;
+ }
+
+ //
+ Marker addInterval(int a, int b, Marker hint=Marker.init)
+ in
+ {
+ assert(a <= b);
+ }
+ body
+ {
+ import std.range : assumeSorted, SearchPolicy;
+ auto range = assumeSorted(data[]);
+ size_t pos;
+ size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
+ if (a_idx == range.length)
+ {
+ // [---+++----++++----++++++]
+ // [ a b]
+ data.append(a, b);
+ return data.length-1;
+ }
+ size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
+ uint[3] buf = void;
+ uint to_insert;
+ debug(std_uni)
+ {
+ writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
+ }
+ if (b_idx == range.length)
+ {
+ // [-------++++++++----++++++-]
+ // [ s a b]
+ if (a_idx & 1)// a in positive
+ {
+ buf[0] = b;
+ to_insert = 1;
+ }
+ else// a in negative
+ {
+ buf[0] = a;
+ buf[1] = b;
+ to_insert = 2;
+ }
+ pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]);
+ return pos - 1;
+ }
+
+ uint top = data[b_idx];
+
+ debug(std_uni)
+ {
+ writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
+ writefln("a=%s; b=%s; top=%s;", a, b, top);
+ }
+ if (a_idx & 1)
+ {// a in positive
+ if (b_idx & 1)// b in positive
+ {
+ // [-------++++++++----++++++-]
+ // [ s a b ]
+ buf[0] = top;
+ to_insert = 1;
+ }
+ else // b in negative
+ {
+ // [-------++++++++----++++++-]
+ // [ s a b ]
+ if (top == b)
+ {
+ assert(b_idx+1 < data.length);
+ buf[0] = data[b_idx+1];
+ pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]);
+ return pos - 1;
+ }
+ buf[0] = b;
+ buf[1] = top;
+ to_insert = 2;
+ }
+ }
+ else
+ { // a in negative
+ if (b_idx & 1) // b in positive
+ {
+ // [----------+++++----++++++-]
+ // [ a b ]
+ buf[0] = a;
+ buf[1] = top;
+ to_insert = 2;
+ }
+ else// b in negative
+ {
+ // [----------+++++----++++++-]
+ // [ a s b ]
+ if (top == b)
+ {
+ assert(b_idx+1 < data.length);
+ buf[0] = a;
+ buf[1] = data[b_idx+1];
+ pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]);
+ return pos - 1;
+ }
+ buf[0] = a;
+ buf[1] = b;
+ buf[2] = top;
+ to_insert = 3;
+ }
+ }
+ pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]);
+ debug(std_uni)
+ {
+ writefln("marker idx: %d; length=%d", pos, data[pos], data.length);
+ writeln("inserting ", buf[0 .. to_insert]);
+ }
+ return pos - 1;
+ }
+
+ //
+ Marker dropUpTo(uint a, Marker pos=Marker.init)
+ in
+ {
+ assert(pos % 2 == 0); // at start of interval
+ }
+ body
+ {
+ auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
+ if (range.empty)
+ return pos;
+ size_t idx = pos;
+ idx += range.lowerBound(a).length;
+
+ debug(std_uni)
+ {
+ writeln("dropUpTo full length=", data.length);
+ writeln(pos,"~~~", idx);
+ }
+ if (idx == data.length)
+ return genericReplace(data, pos, idx, cast(uint[])[]);
+ if (idx & 1)
+ { // a in positive
+ //[--+++----++++++----+++++++------...]
+ // |<---si s a t
+ genericReplace(data, pos, idx, [a]);
+ }
+ else
+ { // a in negative
+ //[--+++----++++++----+++++++-------+++...]
+ // |<---si s a t
+ genericReplace(data, pos, idx, cast(uint[])[]);
+ }
+ return pos;
+ }
+
+ //
+ Marker skipUpTo(uint a, Marker pos=Marker.init)
+ out(result)
+ {
+ assert(result % 2 == 0);// always start of interval
+ //(may be 0-width after-split)
+ }
+ body
+ {
+ assert(data.length % 2 == 0);
+ auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
+ size_t idx = pos+range.lowerBound(a).length;
+
+ if (idx >= data.length) // could have Marker point to recently removed stuff
+ return data.length;
+
+ if (idx & 1)// inside of interval, check for split
+ {
+
+ immutable top = data[idx];
+ if (top == a)// no need to split, it's end
+ return idx+1;
+ immutable start = data[idx-1];
+ if (a == start)
+ return idx-1;
+ // split it up
+ genericReplace(data, idx, idx+1, [a, a, top]);
+ return idx+1; // avoid odd index
+ }
+ return idx;
+ }
+
+ CowArray!SP data;
+}
+
+@system unittest
+{
+ import std.conv : to;
+ assert(unicode.ASCII.to!string() == "[0..128)");
+}
+
+// pedantic version for ctfe, and aligned-access only architectures
+@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
+{
+ idx *= 3;
+ version (LittleEndian)
+ return ptr[idx] + (cast(uint) ptr[idx+1]<<8)
+ + (cast(uint) ptr[idx+2]<<16);
+ else
+ return (cast(uint) ptr[idx]<<16) + (cast(uint) ptr[idx+1]<<8)
+ + ptr[idx+2];
+}
+
+// ditto
+@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
+{
+ idx *= 3;
+ version (LittleEndian)
+ {
+ ptr[idx] = val & 0xFF;
+ ptr[idx+1] = (val >> 8) & 0xFF;
+ ptr[idx+2] = (val >> 16) & 0xFF;
+ }
+ else
+ {
+ ptr[idx] = (val >> 16) & 0xFF;
+ ptr[idx+1] = (val >> 8) & 0xFF;
+ ptr[idx+2] = val & 0xFF;
+ }
+}
+
+// unaligned x86-like read/write functions
+@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
+{
+ uint* src = cast(uint*)(ptr+3*idx);
+ version (LittleEndian)
+ return *src & 0xFF_FFFF;
+ else
+ return *src >> 8;
+}
+
+// ditto
+@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
+{
+ uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx);
+ version (LittleEndian)
+ *dest = val | (*dest & 0xFF00_0000);
+ else
+ *dest = (val << 8) | (*dest & 0xFF);
+}
+
+@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
+{
+ static if (hasUnalignedReads)
+ return __ctfe ? safeRead24(ptr, idx) : unalignedRead24(ptr, idx);
+ else
+ return safeRead24(ptr, idx);
+}
+
+@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
+{
+ static if (hasUnalignedReads)
+ return __ctfe ? safeWrite24(ptr, val, idx) : unalignedWrite24(ptr, val, idx);
+ else
+ return safeWrite24(ptr, val, idx);
+}
+
+struct CowArray(SP=GcPolicy)
+{
+ import std.range.primitives : hasLength;
+
+ @safe:
+ static auto reuse(uint[] arr)
+ {
+ CowArray cow;
+ cow.data = arr;
+ SP.append(cow.data, 1);
+ assert(cow.refCount == 1);
+ assert(cow.length == arr.length);
+ return cow;
+ }
+
+ this(Range)(Range range)
+ if (isInputRange!Range && hasLength!Range)
+ {
+ import std.algorithm.mutation : copy;
+ length = range.length;
+ copy(range, data[0..$-1]);
+ }
+
+ this(Range)(Range range)
+ if (isForwardRange!Range && !hasLength!Range)
+ {
+ import std.algorithm.mutation : copy;
+ import std.range.primitives : walkLength;
+ immutable len = walkLength(range.save);
+ length = len;
+ copy(range, data[0..$-1]);
+ }
+
+ this(this)
+ {
+ if (!empty)
+ {
+ refCount = refCount + 1;
+ }
+ }
+
+ ~this()
+ {
+ if (!empty)
+ {
+ immutable cnt = refCount;
+ if (cnt == 1)
+ SP.destroy(data);
+ else
+ refCount = cnt - 1;
+ }
+ }
+
+ // no ref-count for empty U24 array
+ @property bool empty() const { return data.length == 0; }
+
+ // report one less then actual size
+ @property size_t length() const
+ {
+ return data.length ? data.length - 1 : 0;
+ }
+
+ //+ an extra slot for ref-count
+ @property void length(size_t len)
+ {
+ import std.algorithm.comparison : min;
+ import std.algorithm.mutation : copy;
+ if (len == 0)
+ {
+ if (!empty)
+ freeThisReference();
+ return;
+ }
+ immutable total = len + 1; // including ref-count
+ if (empty)
+ {
+ data = SP.alloc!uint(total);
+ refCount = 1;
+ return;
+ }
+ immutable cur_cnt = refCount;
+ if (cur_cnt != 1) // have more references to this memory
+ {
+ refCount = cur_cnt - 1;
+ auto new_data = SP.alloc!uint(total);
+ // take shrinking into account
+ auto to_copy = min(total, data.length) - 1;
+ copy(data[0 .. to_copy], new_data[0 .. to_copy]);
+ data = new_data; // before setting refCount!
+ refCount = 1;
+ }
+ else // 'this' is the only reference
+ {
+ // use the realloc (hopefully in-place operation)
+ data = SP.realloc(data, total);
+ refCount = 1; // setup a ref-count in the new end of the array
+ }
+ }
+
+ alias opDollar = length;
+
+ uint opIndex()(size_t idx)const
+ {
+ return data[idx];
+ }
+
+ void opIndexAssign(uint val, size_t idx)
+ {
+ auto cnt = refCount;
+ if (cnt != 1)
+ dupThisReference(cnt);
+ data[idx] = val;
+ }
+
+ //
+ auto opSlice(size_t from, size_t to)
+ {
+ if (!empty)
+ {
+ auto cnt = refCount;
+ if (cnt != 1)
+ dupThisReference(cnt);
+ }
+ return data[from .. to];
+
+ }
+
+ //
+ auto opSlice(size_t from, size_t to) const
+ {
+ return data[from .. to];
+ }
+
+ // length slices before the ref count
+ auto opSlice()
+ {
+ return opSlice(0, length);
+ }
+
+ // ditto
+ auto opSlice() const
+ {
+ return opSlice(0, length);
+ }
+
+ void append(Range)(Range range)
+ if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
+ {
+ size_t nl = length + range.length;
+ length = nl;
+ copy(range, this[nl-range.length .. nl]);
+ }
+
+ void append()(uint[] val...)
+ {
+ length = length + val.length;
+ data[$-val.length-1 .. $-1] = val[];
+ }
+
+ bool opEquals()(auto const ref CowArray rhs)const
+ {
+ if (empty ^ rhs.empty)
+ return false; // one is empty and the other isn't
+ return empty || data[0..$-1] == rhs.data[0..$-1];
+ }
+
+private:
+ // ref-count is right after the data
+ @property uint refCount() const
+ {
+ return data[$-1];
+ }
+
+ @property void refCount(uint cnt)
+ {
+ data[$-1] = cnt;
+ }
+
+ void freeThisReference()
+ {
+ immutable count = refCount;
+ if (count != 1) // have more references to this memory
+ {
+ // dec shared ref-count
+ refCount = count - 1;
+ data = [];
+ }
+ else
+ SP.destroy(data);
+ assert(!data.ptr);
+ }
+
+ void dupThisReference(uint count)
+ in
+ {
+ assert(!empty && count != 1 && count == refCount);
+ }
+ body
+ {
+ import std.algorithm.mutation : copy;
+ // dec shared ref-count
+ refCount = count - 1;
+ // copy to the new chunk of RAM
+ auto new_data = SP.alloc!uint(data.length);
+ // bit-blit old stuff except the counter
+ copy(data[0..$-1], new_data[0..$-1]);
+ data = new_data; // before setting refCount!
+ refCount = 1; // so that this updates the right one
+ }
+
+ uint[] data;
+}
+
+@safe unittest// Uint24 tests
+{
+ import std.algorithm.comparison : equal;
+ import std.algorithm.mutation : copy;
+ import std.conv : text;
+ import std.range : iota, chain;
+ import std.range.primitives : isBidirectionalRange, isOutputRange;
+ void funcRef(T)(ref T u24)
+ {
+ u24.length = 2;
+ u24[1] = 1024;
+ T u24_c = u24;
+ assert(u24[1] == 1024);
+ u24.length = 0;
+ assert(u24.empty);
+ u24.append([1, 2]);
+ assert(equal(u24[], [1, 2]));
+ u24.append(111);
+ assert(equal(u24[], [1, 2, 111]));
+ assert(!u24_c.empty && u24_c[1] == 1024);
+ u24.length = 3;
+ copy(iota(0, 3), u24[]);
+ assert(equal(u24[], iota(0, 3)));
+ assert(u24_c[1] == 1024);
+ }
+
+ void func2(T)(T u24)
+ {
+ T u24_2 = u24;
+ T u24_3;
+ u24_3 = u24_2;
+ assert(u24_2 == u24_3);
+ assert(equal(u24[], u24_2[]));
+ assert(equal(u24_2[], u24_3[]));
+ funcRef(u24_3);
+
+ assert(equal(u24_3[], iota(0, 3)));
+ assert(!equal(u24_2[], u24_3[]));
+ assert(equal(u24_2[], u24[]));
+ u24_2 = u24_3;
+ assert(equal(u24_2[], iota(0, 3)));
+ // to test that passed arg is intact outside
+ // plus try out opEquals
+ u24 = u24_3;
+ u24 = T.init;
+ u24_3 = T.init;
+ assert(u24.empty);
+ assert(u24 == u24_3);
+ assert(u24 != u24_2);
+ }
+
+ foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
+ {
+ alias Range = typeof(CowArray!Policy.init[]);
+ alias U24A = CowArray!Policy;
+ static assert(isForwardRange!Range);
+ static assert(isBidirectionalRange!Range);
+ static assert(isOutputRange!(Range, uint));
+ static assert(isRandomAccessRange!(Range));
+
+ auto arr = U24A([42u, 36, 100]);
+ assert(arr[0] == 42);
+ assert(arr[1] == 36);
+ arr[0] = 72;
+ arr[1] = 0xFE_FEFE;
+ assert(arr[0] == 72);
+ assert(arr[1] == 0xFE_FEFE);
+ assert(arr[2] == 100);
+ U24A arr2 = arr;
+ assert(arr2[0] == 72);
+ arr2[0] = 11;
+ // test COW-ness
+ assert(arr[0] == 72);
+ assert(arr2[0] == 11);
+ // set this to about 100M to stress-test COW memory management
+ foreach (v; 0 .. 10_000)
+ func2(arr);
+ assert(equal(arr[], [72, 0xFE_FEFE, 100]));
+
+ auto r2 = U24A(iota(0, 100));
+ assert(equal(r2[], iota(0, 100)), text(r2[]));
+ copy(iota(10, 170, 2), r2[10 .. 90]);
+ assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
+ , text(r2[]));
+ }
+}
+
+version (unittest)
+{
+ private alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
+}
+
+@safe unittest// core set primitives test
+{
+ import std.conv : text;
+ foreach (CodeList; AllSets)
+ {
+ CodeList a;
+ //"plug a hole" test
+ a.add(10, 20).add(25, 30).add(15, 27);
+ assert(a == CodeList(10, 30), text(a));
+
+ auto x = CodeList.init;
+ x.add(10, 20).add(30, 40).add(50, 60);
+
+ a = x;
+ a.add(20, 49);//[10, 49) [50, 60)
+ assert(a == CodeList(10, 49, 50 ,60));
+
+ a = x;
+ a.add(20, 50);
+ assert(a == CodeList(10, 60), text(a));
+
+ // simple unions, mostly edge effects
+ x = CodeList.init;
+ x.add(10, 20).add(40, 60);
+
+ a = x;
+ a.add(10, 25); //[10, 25) [40, 60)
+ assert(a == CodeList(10, 25, 40, 60));
+
+ a = x;
+ a.add(5, 15); //[5, 20) [40, 60)
+ assert(a == CodeList(5, 20, 40, 60));
+
+ a = x;
+ a.add(0, 10); // [0, 20) [40, 60)
+ assert(a == CodeList(0, 20, 40, 60));
+
+ a = x;
+ a.add(0, 5); // prepand
+ assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));
+
+ a = x;
+ a.add(5, 20);
+ assert(a == CodeList(5, 20, 40, 60));
+
+ a = x;
+ a.add(3, 37);
+ assert(a == CodeList(3, 37, 40, 60));
+
+ a = x;
+ a.add(37, 65);
+ assert(a == CodeList(10, 20, 37, 65));
+
+ // some tests on helpers for set intersection
+ x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
+ a = x;
+
+ auto m = a.skipUpTo(60);
+ a.dropUpTo(110, m);
+ assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));
+
+ a = x;
+ a.dropUpTo(100);
+ assert(a == CodeList(100, 120), text(a.data[]));
+
+ a = x;
+ m = a.skipUpTo(50);
+ a.dropUpTo(140, m);
+ assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
+ a = x;
+ a.dropUpTo(60);
+ assert(a == CodeList(100, 120), text(a.data[]));
+ }
+}
+
+
+//test constructor to work with any order of intervals
+@safe unittest
+{
+ import std.algorithm.comparison : equal;
+ import std.conv : text, to;
+ import std.range : chain, iota;
+ import std.typecons : tuple;
+ //ensure constructor handles bad ordering and overlap
+ auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
+ foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
+ assert(ch in c1, to!string(ch));
+
+ //contiguos
+ assert(CodepointSet(1000, 1006, 1006, 1009)
+ .byInterval.equal([tuple(1000, 1009)]));
+ //contains
+ assert(CodepointSet(900, 1200, 1000, 1100)
+ .byInterval.equal([tuple(900, 1200)]));
+ //intersect left
+ assert(CodepointSet(900, 1100, 1000, 1200)
+ .byInterval.equal([tuple(900, 1200)]));
+ //intersect right
+ assert(CodepointSet(1000, 1200, 900, 1100)
+ .byInterval.equal([tuple(900, 1200)]));
+
+ //ditto with extra items at end
+ assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
+ .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
+ assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
+ .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
+
+ //"plug a hole" test
+ auto c2 = CodepointSet(20, 40,
+ 60, 80, 100, 140, 150, 200,
+ 40, 60, 80, 100, 140, 150
+ );
+ assert(c2.byInterval.equal([tuple(20, 200)]));
+
+ auto c3 = CodepointSet(
+ 20, 40, 60, 80, 100, 140, 150, 200,
+ 0, 10, 15, 100, 10, 20, 200, 220);
+ assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
+}
+
+
+@safe unittest
+{ // full set operations
+ import std.conv : text;
+ foreach (CodeList; AllSets)
+ {
+ CodeList a, b, c, d;
+
+ //"plug a hole"
+ a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
+ b.add(40, 60).add(80, 100).add(140, 150);
+ c = a | b;
+ d = b | a;
+ assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
+ assert(c == d, text(c," vs ", d));
+
+ b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
+ c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
+ d = b | a;
+ assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
+ assert(c == d, text(c," vs ", d));
+
+ b = CodeList.init.add(10, 20).add(30,100).add(145,200);
+ c = a | b;//[10, 140) [145, 200)
+ d = b | a;
+ assert(c == CodeList(10, 140, 145, 200));
+ assert(c == d, text(c," vs ", d));
+
+ b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
+ c = a | b;//[0, 140) [150, 220)
+ d = b | a;
+ assert(c == CodeList(0, 140, 150, 220));
+ assert(c == d, text(c," vs ", d));
+
+
+ a = CodeList.init.add(20, 40).add(60, 80);
+ b = CodeList.init.add(25, 35).add(65, 75);
+ c = a & b;
+ d = b & a;
+ assert(c == CodeList(25, 35, 65, 75), text(c));
+ assert(c == d, text(c," vs ", d));
+
+ a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
+ b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
+ c = a & b;
+ d = b & a;
+ assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
+ assert(c == d, text(c," vs ", d));
+
+ a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
+ b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
+ c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
+ d = b & a;
+
+ assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
+ assert(c == d, text(c, " vs ",d));
+ assert((c & a) == c);
+ assert((d & b) == d);
+ assert((c & d) == d);
+
+ b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
+ c = a & b;
+ d = b & a;
+ assert(c == CodeList(150, 200), text(c));
+ assert(c == d, text(c, " vs ",d));
+ assert((c & a) == c);
+ assert((d & b) == d);
+ assert((c & d) == d);
+
+ assert((a & a) == a);
+ assert((b & b) == b);
+
+ a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
+ b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
+ c = a - b;// [30, 40) [60, 75) [120, 140) [150, 190)
+ d = b - a;// [40, 60) [80, 100) [200, 300)
+ assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
+ assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
+ assert(c - d == c, text(c-d, " vs ", c));
+ assert(d - c == d, text(d-c, " vs ", d));
+ assert(c - c == CodeList.init);
+ assert(d - d == CodeList.init);
+
+ a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150, 200);
+ b = CodeList.init.add(10, 50).add(60, 160).add(190, 300);
+ c = a - b;// [160, 190)
+ d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
+ assert(c == CodeList(160, 190), text(c));
+ assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
+ assert(c - d == c, text(c-d, " vs ", c));
+ assert(d - c == d, text(d-c, " vs ", d));
+ assert(c - c == CodeList.init);
+ assert(d - d == CodeList.init);
+
+ a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
+ b = CodeList.init.add(10, 30).add(45, 100).add(130, 190);
+ c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
+ d = b ~ a;
+ assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
+ text(c));
+ assert(c == d, text(c, " vs ", d));
+ }
+}
+
+}
+
+@safe unittest// vs single dchar
+{
+ import std.conv : text;
+ CodepointSet a = CodepointSet(10, 100, 120, 200);
+ assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A'));
+ assert((a & 'B') == CodepointSet(66, 67));
+}
+
+@safe unittest// iteration & opIndex
+{
+ import std.algorithm.comparison : equal;
+ import std.conv : text;
+ import std.typecons : tuple, Tuple;
+
+ foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
+ {
+ auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
+ auto a = CodeList('A','N','a', 'n');
+ assert(equal(a.byInterval,
+ [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
+ ), text(a.byInterval));
+
+ // same @@@BUG as in issue 8949 ?
+ version (bug8949)
+ {
+ import std.range : retro;
+ assert(equal(retro(a.byInterval),
+ [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
+ ), text(retro(a.byInterval)));
+ }
+ auto achr = a.byCodepoint;
+ assert(equal(achr, arr), text(a.byCodepoint));
+ foreach (ch; a.byCodepoint)
+ assert(a[ch]);
+ auto x = CodeList(100, 500, 600, 900, 1200, 1500);
+ assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
+ foreach (ch; x.byCodepoint)
+ assert(x[ch]);
+ static if (is(CodeList == CodepointSet))
+ {
+ auto y = CodeList(x.byInterval);
+ assert(equal(x.byInterval, y.byInterval));
+ }
+ assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
+ assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
+ }
+}
+
+//============================================================================
+// Generic Trie template and various ways to build it
+//============================================================================
+
+// debug helper to get a shortened array dump
+auto arrayRepr(T)(T x)
+{
+ import std.conv : text;
+ if (x.length > 32)
+ {
+ return text(x[0 .. 16],"~...~", x[x.length-16 .. x.length]);
+ }
+ else
+ return text(x);
+}
+
+/**
+ Maps $(D Key) to a suitable integer index within the range of $(D size_t).
+ The mapping is constructed by applying predicates from $(D Prefix) left to right
+ and concatenating the resulting bits.
+
+ The first (leftmost) predicate defines the most significant bits of
+ the resulting index.
+ */
+template mapTrieIndex(Prefix...)
+{
+ size_t mapTrieIndex(Key)(Key key)
+ if (isValidPrefixForTrie!(Key, Prefix))
+ {
+ alias p = Prefix;
+ size_t idx;
+ foreach (i, v; p[0..$-1])
+ {
+ idx |= p[i](key);
+ idx <<= p[i+1].bitSize;
+ }
+ idx |= p[$-1](key);
+ return idx;
+ }
+}
+
+/*
+ $(D TrieBuilder) is a type used for incremental construction
+ of $(LREF Trie)s.
+
+ See $(LREF buildTrie) for generic helpers built on top of it.
+*/
+@trusted private struct TrieBuilder(Value, Key, Args...)
+if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
+{
+ import std.exception : enforce;
+
+private:
+ // last index is not stored in table, it is used as an offset to values in a block.
+ static if (is(Value == bool))// always pack bool
+ alias V = BitPacked!(Value, 1);
+ else
+ alias V = Value;
+ static auto deduceMaxIndex(Preds...)()
+ {
+ size_t idx = 1;
+ foreach (v; Preds)
+ idx *= 2^^v.bitSize;
+ return idx;
+ }
+
+ static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
+ {
+ alias Prefix = Args[1..$];
+ enum lastPageSize = 2^^Prefix[$-1].bitSize;
+ enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
+ enum roughedMaxIndex =
+ (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
+ // check warp around - if wrapped, use the default deduction rule
+ enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
+ deduceMaxIndex!(Prefix)() : roughedMaxIndex;
+ }
+ else
+ {
+ alias Prefix = Args;
+ enum maxIndex = deduceMaxIndex!(Prefix)();
+ }
+
+ alias getIndex = mapTrieIndex!(Prefix);
+
+ enum lastLevel = Prefix.length-1;
+ struct ConstructState
+ {
+ size_t idx_zeros, idx_ones;
+ }
+ // iteration over levels of Trie, each indexes its own level and thus a shortened domain
+ size_t[Prefix.length] indices;
+ // default filler value to use
+ Value defValue;
+ // this is a full-width index of next item
+ size_t curIndex;
+ // all-zeros page index, all-ones page index (+ indicator if there is such a page)
+ ConstructState[Prefix.length] state;
+ // the table being constructed
+ MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;
+
+ @disable this();
+
+ //shortcut for index variable at level 'level'
+ @property ref idx(size_t level)(){ return indices[level]; }
+
+ // this function assumes no holes in the input so
+ // indices are going one by one
+ void addValue(size_t level, T)(T val, size_t numVals)
+ {
+ alias j = idx!level;
+ enum pageSize = 1 << Prefix[level].bitSize;
+ if (numVals == 0)
+ return;
+ auto ptr = table.slice!(level);
+ if (numVals == 1)
+ {
+ static if (level == Prefix.length-1)
+ ptr[j] = val;
+ else
+ {// can incur narrowing conversion
+ assert(j < ptr.length);
+ ptr[j] = force!(typeof(ptr[j]))(val);
+ }
+ j++;
+ if (j % pageSize == 0)
+ spillToNextPage!level(ptr);
+ return;
+ }
+ // longer row of values
+ // get to the next page boundary
+ immutable nextPB = (j + pageSize) & ~(pageSize-1);
+ immutable n = nextPB - j;// can fill right in this page
+ if (numVals < n) //fits in current page
+ {
+ ptr[j .. j+numVals] = val;
+ j += numVals;
+ return;
+ }
+ static if (level != 0)//on the first level it always fits
+ {
+ numVals -= n;
+ //write till the end of current page
+ ptr[j .. j+n] = val;
+ j += n;
+ //spill to the next page
+ spillToNextPage!level(ptr);
+ // page at once loop
+ if (state[level].idx_zeros != size_t.max && val == T.init)
+ {
+ alias NextIdx = typeof(table.slice!(level-1)[0]);
+ addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
+ numVals/pageSize);
+ ptr = table.slice!level; //table structure might have changed
+ numVals %= pageSize;
+ }
+ else
+ {
+ while (numVals >= pageSize)
+ {
+ numVals -= pageSize;
+ ptr[j .. j+pageSize] = val;
+ j += pageSize;
+ spillToNextPage!level(ptr);
+ }
+ }
+ if (numVals)
+ {
+ // the leftovers, an incomplete page
+ ptr[j .. j+numVals] = val;
+ j += numVals;
+ }
+ }
+ }
+
+ void spillToNextPage(size_t level, Slice)(ref Slice ptr)
+ {
+ // last level (i.e. topmost) has 1 "page"
+ // thus it need not to add a new page on upper level
+ static if (level != 0)
+ spillToNextPageImpl!(level)(ptr);
+ }
+
+ // this can re-use the current page if duplicate or allocate a new one
+ // it also makes sure that previous levels point to the correct page in this level
+ void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
+ {
+ alias NextIdx = typeof(table.slice!(level-1)[0]);
+ NextIdx next_lvl_index;
+ enum pageSize = 1 << Prefix[level].bitSize;
+ assert(idx!level % pageSize == 0);
+ immutable last = idx!level-pageSize;
+ const slice = ptr[idx!level - pageSize .. idx!level];
+ size_t j;
+ for (j=0; j<last; j+=pageSize)
+ {
+ if (ptr[j .. j+pageSize] == slice)
+ {
+ // get index to it, reuse ptr space for the next block
+ next_lvl_index = force!NextIdx(j/pageSize);
+ version (none)
+ {
+ import std.stdio : writefln, writeln;
+ writefln("LEVEL(%s) page mapped idx: %s: 0..%s ---> [%s..%s]"
+ ,level
+ ,indices[level-1], pageSize, j, j+pageSize);
+ writeln("LEVEL(", level
+ , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
+ writeln("LEVEL(", level
+ , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
+ }
+ idx!level -= pageSize; // reuse this page, it is duplicate
+ break;
+ }
+ }
+ if (j == last)
+ {
+ L_allocate_page:
+ next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
+ if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
+ {
+ state[level].idx_zeros = next_lvl_index;
+ }
+ // allocate next page
+ version (none)
+ {
+ import std.stdio : writefln;
+ writefln("LEVEL(%s) page allocated: %s"
+ , level, arrayRepr(slice[0 .. pageSize]));
+ writefln("LEVEL(%s) index: %s ; page at this index %s"
+ , level
+ , next_lvl_index
+ , arrayRepr(
+ table.slice!(level)
+ [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
+ ));
+ }
+ table.length!level = table.length!level + pageSize;
+ }
+ L_know_index:
+ // for the previous level, values are indices to the pages in the current level
+ addValue!(level-1)(next_lvl_index, 1);
+ ptr = table.slice!level; //re-load the slice after moves
+ }
+
+ // idx - full-width index to fill with v (full-width index != key)
+ // fills everything in the range of [curIndex, idx) with filler
+ void putAt(size_t idx, Value v)
+ {
+ assert(idx >= curIndex);
+ immutable numFillers = idx - curIndex;
+ addValue!lastLevel(defValue, numFillers);
+ addValue!lastLevel(v, 1);
+ curIndex = idx + 1;
+ }
+
+ // ditto, but sets the range of [idxA, idxB) to v
+ void putRangeAt(size_t idxA, size_t idxB, Value v)
+ {
+ assert(idxA >= curIndex);
+ assert(idxB >= idxA);
+ size_t numFillers = idxA - curIndex;
+ addValue!lastLevel(defValue, numFillers);
+ addValue!lastLevel(v, idxB - idxA);
+ curIndex = idxB; // open-right
+ }
+
+ enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
+ "duplicate key->value mapping";
+
+public:
+ /**
+ Construct a builder, where $(D filler) is a value
+ to indicate empty slots (or "not found" condition).
+ */
+ this(Value filler)
+ {
+ curIndex = 0;
+ defValue = filler;
+ // zeros-page index, ones-page index
+ foreach (ref v; state)
+ v = ConstructState(size_t.max, size_t.max);
+ table = typeof(table)(indices);
+ // one page per level is a bootstrap minimum
+ foreach (i, Pred; Prefix)
+ table.length!i = (1 << Pred.bitSize);
+ }
+
+ /**
+ Put a value $(D v) into interval as
+ mapped by keys from $(D a) to $(D b).
+ All slots prior to $(D a) are filled with
+ the default filler.
+ */
+ void putRange(Key a, Key b, Value v)
+ {
+ auto idxA = getIndex(a), idxB = getIndex(b);
+ // indexes of key should always grow
+ enforce(idxB >= idxA && idxA >= curIndex, errMsg);
+ putRangeAt(idxA, idxB, v);
+ }
+
+ /**
+ Put a value $(D v) into slot mapped by $(D key).
+ All slots prior to $(D key) are filled with the
+ default filler.
+ */
+ void putValue(Key key, Value v)
+ {
+ import std.conv : text;
+ auto idx = getIndex(key);
+ enforce(idx >= curIndex, text(errMsg, " ", idx));
+ putAt(idx, v);
+ }
+
+ /// Finishes construction of Trie, yielding an immutable Trie instance.
+ auto build()
+ {
+ static if (maxIndex != 0) // doesn't cover full range of size_t
+ {
+ assert(curIndex <= maxIndex);
+ addValue!lastLevel(defValue, maxIndex - curIndex);
+ }
+ else
+ {
+ if (curIndex != 0 // couldn't wrap around
+ || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
+ {
+ addValue!lastLevel(defValue, size_t.max - curIndex);
+ addValue!lastLevel(defValue, 1);
+ }
+ // else curIndex already completed the full range of size_t by wrapping around
+ }
+ return Trie!(V, Key, maxIndex, Prefix)(table);
+ }
+}
+
+/**
+ $(P A generic Trie data-structure for a fixed number of stages.
+ The design goal is optimal speed with smallest footprint size.
+ )
+ $(P It's intentionally read-only and doesn't provide constructors.
+ To construct one use a special builder,
+ see $(LREF TrieBuilder) and $(LREF buildTrie).
+ )
+
+*/
+@trusted private struct Trie(Value, Key, Args...)
+if (isValidPrefixForTrie!(Key, Args)
+ || (isValidPrefixForTrie!(Key, Args[1..$])
+ && is(typeof(Args[0]) : size_t)))
+{
+ import std.range.primitives : isOutputRange;
+ static if (is(typeof(Args[0]) : size_t))
+ {
+ private enum maxIndex = Args[0];
+ private enum hasBoundsCheck = true;
+ private alias Prefix = Args[1..$];
+ }
+ else
+ {
+ private enum hasBoundsCheck = false;
+ private alias Prefix = Args;
+ }
+
+ private this()(typeof(_table) table)
+ {
+ _table = table;
+ }
+
+ // only for constant Tries constructed from precompiled tables
+ private this()(const(size_t)[] offsets, const(size_t)[] sizes,
+ const(size_t)[] data) const
+ {
+ _table = typeof(_table)(offsets, sizes, data);
+ }
+
+ /**
+ $(P Lookup the $(D key) in this $(D Trie). )
+
+ $(P The lookup always succeeds if key fits the domain
+ provided during construction. The whole domain defined
+ is covered so instead of not found condition
+ the sentinel (filler) value could be used. )
+
+ $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
+ define a domain of $(D Trie) keys and the sentinel value. )
+
+ Note:
+ Domain range-checking is only enabled in debug builds
+ and results in assertion failure.
+ */
+ TypeOfBitPacked!Value opIndex()(Key key) const
+ {
+ static if (hasBoundsCheck)
+ assert(mapTrieIndex!Prefix(key) < maxIndex);
+ size_t idx;
+ alias p = Prefix;
+ idx = cast(size_t) p[0](key);
+ foreach (i, v; p[0..$-1])
+ idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key));
+ return _table.ptr!(p.length-1)[idx];
+ }
+
+ ///
+ @property size_t bytes(size_t n=size_t.max)() const
+ {
+ return _table.bytes!n;
+ }
+
+ ///
+ @property size_t pages(size_t n)() const
+ {
+ return (bytes!n+2^^(Prefix[n].bitSize-1))
+ /2^^Prefix[n].bitSize;
+ }
+
+ ///
+ void store(OutRange)(scope OutRange sink) const
+ if (isOutputRange!(OutRange, char))
+ {
+ _table.store(sink);
+ }
+
+private:
+ MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
+}
+
+// create a tuple of 'sliceBits' that slice the 'top' of bits into pieces of sizes 'sizes'
+// left-to-right, the most significant bits first
+template GetBitSlicing(size_t top, sizes...)
+{
+ static if (sizes.length > 0)
+ alias GetBitSlicing =
+ AliasSeq!(sliceBits!(top - sizes[0], top),
+ GetBitSlicing!(top - sizes[0], sizes[1..$]));
+ else
+ alias GetBitSlicing = AliasSeq!();
+}
+
+template callableWith(T)
+{
+ template callableWith(alias Pred)
+ {
+ static if (!is(typeof(Pred(T.init))))
+ enum callableWith = false;
+ else
+ {
+ alias Result = typeof(Pred(T.init));
+ enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
+ }
+ }
+}
+
+/*
+ Check if $(D Prefix) is a valid set of predicates
+ for $(D Trie) template having $(D Key) as the type of keys.
+ This requires all predicates to be callable, take
+ single argument of type $(D Key) and return unsigned value.
+*/
+template isValidPrefixForTrie(Key, Prefix...)
+{
+ import std.meta : allSatisfy;
+ enum isValidPrefixForTrie = allSatisfy!(callableWith!Key, Prefix); // TODO: tighten the screws
+}
+
+/*
+ Check if $(D Args) is a set of maximum key value followed by valid predicates
+ for $(D Trie) template having $(D Key) as the type of keys.
+*/
+template isValidArgsForTrie(Key, Args...)
+{
+ static if (Args.length > 1)
+ {
+ enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
+ || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
+ }
+ else
+ enum isValidArgsForTrie = isValidPrefixForTrie!Args;
+}
+
+@property size_t sumOfIntegerTuple(ints...)()
+{
+ size_t count=0;
+ foreach (v; ints)
+ count += v;
+ return count;
+}
+
+/**
+ A shorthand for creating a custom multi-level fixed Trie
+ from a $(D CodepointSet). $(D sizes) are numbers of bits per level,
+ with the most significant bits used first.
+
+ Note: The sum of $(D sizes) must be equal 21.
+
+ See_Also: $(LREF toTrie), which is even simpler.
+
+ Example:
+ ---
+ {
+ import std.stdio;
+ auto set = unicode("Number");
+ auto trie = codepointSetTrie!(8, 5, 8)(set);
+ writeln("Input code points to test:");
+ foreach (line; stdin.byLine)
+ {
+ int count=0;
+ foreach (dchar ch; line)
+ if (trie[ch])// is number
+ count++;
+ writefln("Contains %d number code points.", count);
+ }
+ }
+ ---
+*/
+public template codepointSetTrie(sizes...)
+if (sumOfIntegerTuple!sizes == 21)
+{
+ auto codepointSetTrie(Set)(Set set)
+ if (isCodepointSet!Set)
+ {
+ auto builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false);
+ foreach (ival; set.byInterval)
+ builder.putRange(ival[0], ival[1], true);
+ return builder.build();
+ }
+}
+
+/// Type of Trie generated by codepointSetTrie function.
+public template CodepointSetTrie(sizes...)
+if (sumOfIntegerTuple!sizes == 21)
+{
+ alias Prefix = GetBitSlicing!(21, sizes);
+ alias CodepointSetTrie = typeof(TrieBuilder!(bool, dchar, lastDchar+1, Prefix)(false).build());
+}
+
+/**
+ A slightly more general tool for building fixed $(D Trie)
+ for the Unicode data.
+
+ Specifically unlike $(D codepointSetTrie) it's allows creating mappings
+ of $(D dchar) to an arbitrary type $(D T).
+
+ Note: Overload taking $(D CodepointSet)s will naturally convert
+ only to bool mapping $(D Trie)s.
+*/
+public template codepointTrie(T, sizes...)
+if (sumOfIntegerTuple!sizes == 21)
+{
+ alias Prefix = GetBitSlicing!(21, sizes);
+
+ static if (is(TypeOfBitPacked!T == bool))
+ {
+ auto codepointTrie(Set)(in Set set)
+ if (isCodepointSet!Set)
+ {
+ return codepointSetTrie(set);
+ }
+ }
+
+ auto codepointTrie()(T[dchar] map, T defValue=T.init)
+ {
+ return buildTrie!(T, dchar, Prefix)(map, defValue);
+ }
+
+ // unsorted range of pairs
+ auto codepointTrie(R)(R range, T defValue=T.init)
+ if (isInputRange!R
+ && is(typeof(ElementType!R.init[0]) : T)
+ && is(typeof(ElementType!R.init[1]) : dchar))
+ {
+ // build from unsorted array of pairs
+ // TODO: expose index sorting functions for Trie
+ return buildTrie!(T, dchar, Prefix)(range, defValue, true);
+ }
+}
+
+@system pure unittest
+{
+ import std.algorithm.comparison : max;
+ import std.algorithm.searching : count;
+
+ // pick characters from the Greek script
+ auto set = unicode.Greek;
+
+ // a user-defined property (or an expensive function)
+ // that we want to look up
+ static uint luckFactor(dchar ch)
+ {
+ // here we consider a character lucky
+ // if its code point has a lot of identical hex-digits
+ // e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2
+ ubyte[6] nibbles; // 6 4-bit chunks of code point
+ uint value = ch;
+ foreach (i; 0 .. 6)
+ {
+ nibbles[i] = value & 0xF;
+ value >>= 4;
+ }
+ uint luck;
+ foreach (n; nibbles)
+ luck = cast(uint) max(luck, count(nibbles[], n));
+ return luck;
+ }
+
+ // only unsigned built-ins are supported at the moment
+ alias LuckFactor = BitPacked!(uint, 3);
+
+ // create a temporary associative array (AA)
+ LuckFactor[dchar] map;
+ foreach (ch; set.byCodepoint)
+ map[ch] = LuckFactor(luckFactor(ch));
+
+ // bits per stage are chosen randomly, fell free to optimize
+ auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);
+
+ // from now on the AA is not needed
+ foreach (ch; set.byCodepoint)
+ assert(trie[ch] == luckFactor(ch)); // verify
+ // CJK is not Greek, thus it has the default value
+ assert(trie['\u4444'] == 0);
+ // and here is a couple of quite lucky Greek characters:
+ // Greek small letter epsilon with dasia
+ assert(trie['\u1F11'] == 3);
+ // Ancient Greek metretes sign
+ assert(trie['\U00010181'] == 3);
+
+}
+
+/// Type of Trie as generated by codepointTrie function.
+public template CodepointTrie(T, sizes...)
+if (sumOfIntegerTuple!sizes == 21)
+{
+ alias Prefix = GetBitSlicing!(21, sizes);
+ alias CodepointTrie = typeof(TrieBuilder!(T, dchar, lastDchar+1, Prefix)(T.init).build());
+}
+
+package template cmpK0(alias Pred)
+{
+ import std.typecons : Tuple;
+ static bool cmpK0(Value, Key)
+ (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
+ {
+ return Pred(a[1]) < Pred(b[1]);
+ }
+}
+
+/**
+ The most general utility for construction of $(D Trie)s
+ short of using $(D TrieBuilder) directly.
+
+ Provides a number of convenience overloads.
+ $(D Args) is tuple of maximum key value followed by
+ predicates to construct index from key.
+
+ Alternatively if the first argument is not a value convertible to $(D Key)
+ then the whole tuple of $(D Args) is treated as predicates
+ and the maximum Key is deduced from predicates.
+*/
+private template buildTrie(Value, Key, Args...)
+if (isValidArgsForTrie!(Key, Args))
+{
+ static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key
+ {
+ alias Prefix = Args[1..$];
+ }
+ else
+ alias Prefix = Args;
+
+ alias getIndex = mapTrieIndex!(Prefix);
+
+ // for multi-sort
+ template GetComparators(size_t n)
+ {
+ static if (n > 0)
+ alias GetComparators =
+ AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
+ else
+ alias GetComparators = AliasSeq!();
+ }
+
+ /*
+ Build $(D Trie) from a range of a Key-Value pairs,
+ assuming it is sorted by Key as defined by the following lambda:
+ ------
+ (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
+ ------
+ Exception is thrown if it's detected that the above order doesn't hold.
+
+ In other words $(LREF mapTrieIndex) should be a
+ monotonically increasing function that maps $(D Key) to an integer.
+
+ See_Also: $(REF sort, std,_algorithm),
+ $(REF SortedRange, std,_range),
+ $(REF setUnion, std,_algorithm).
+ */
+ auto buildTrie(Range)(Range range, Value filler=Value.init)
+ if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
+ && is(typeof(Range.init.front[1]) : Key))
+ {
+ auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
+ foreach (v; range)
+ builder.putValue(v[1], v[0]);
+ return builder.build();
+ }
+
+ /*
+ If $(D Value) is bool (or BitPacked!(bool, x)) then it's possible
+ to build $(D Trie) from a range of open-right intervals of $(D Key)s.
+ The requirement on the ordering of keys (and the behavior on the
+ violation of it) is the same as for Key-Value range overload.
+
+ Intervals denote ranges of !$(D filler) i.e. the opposite of filler.
+ If no filler provided keys inside of the intervals map to true,
+ and $(D filler) is false.
+ */
+ auto buildTrie(Range)(Range range, Value filler=Value.init)
+ if (is(TypeOfBitPacked!Value == bool)
+ && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
+ && is(typeof(Range.init.front[1]) : Key))
+ {
+ auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
+ foreach (ival; range)
+ builder.putRange(ival[0], ival[1], !filler);
+ return builder.build();
+ }
+
+ auto buildTrie(Range)(Range range, Value filler, bool unsorted)
+ if (isInputRange!Range
+ && is(typeof(Range.init.front[0]) : Value)
+ && is(typeof(Range.init.front[1]) : Key))
+ {
+ import std.algorithm.sorting : multiSort;
+ alias Comps = GetComparators!(Prefix.length);
+ if (unsorted)
+ multiSort!(Comps)(range);
+ return buildTrie(range, filler);
+ }
+
+ /*
+ If $(D Value) is bool (or BitPacked!(bool, x)) then it's possible
+ to build $(D Trie) simply from an input range of $(D Key)s.
+ The requirement on the ordering of keys (and the behavior on the
+ violation of it) is the same as for Key-Value range overload.
+
+ Keys found in range denote !$(D filler) i.e. the opposite of filler.
+ If no filler provided keys map to true, and $(D filler) is false.
+ */
+ auto buildTrie(Range)(Range range, Value filler=Value.init)
+ if (is(TypeOfBitPacked!Value == bool)
+ && isInputRange!Range && is(typeof(Range.init.front) : Key))
+ {
+ auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
+ foreach (v; range)
+ builder.putValue(v, !filler);
+ return builder.build();
+ }
+
+ /*
+ If $(D Key) is unsigned integer $(D Trie) could be constructed from array
+ of values where array index serves as key.
+ */
+ auto buildTrie()(Value[] array, Value filler=Value.init)
+ if (isUnsigned!Key)
+ {
+ auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
+ foreach (idx, v; array)
+ builder.putValue(idx, v);
+ return builder.build();
+ }
+
+ /*
+ Builds $(D Trie) from associative array.
+ */
+ auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
+ {
+ import std.array : array;
+ import std.range : zip;
+ auto range = array(zip(map.values, map.keys));
+ return buildTrie(range, filler, true); // sort it
+ }
+}
+
+// helper in place of assumeSize to
+//reduce mangled name & help DMD inline Trie functors
+struct clamp(size_t bits)
+{
+ static size_t opCall(T)(T arg){ return arg; }
+ enum bitSize = bits;
+}
+
+struct clampIdx(size_t idx, size_t bits)
+{
+ static size_t opCall(T)(T arg){ return arg[idx]; }
+ enum bitSize = bits;
+}
+
+/**
+ Conceptual type that outlines the common properties of all UTF Matchers.
+
+ Note: For illustration purposes only, every method
+ call results in assertion failure.
+ Use $(LREF utfMatcher) to obtain a concrete matcher
+ for UTF-8 or UTF-16 encodings.
+*/
+public struct MatcherConcept
+{
+ /**
+ $(P Perform a semantic equivalent 2 operations:
+ decoding a $(CODEPOINT) at front of $(D inp) and testing if
+ it belongs to the set of $(CODEPOINTS) of this matcher. )
+
+ $(P The effect on $(D inp) depends on the kind of function called:)
+
+ $(P Match. If the codepoint is found in the set then range $(D inp)
+ is advanced by its size in $(S_LINK Code unit, code units),
+ otherwise the range is not modifed.)
+
+ $(P Skip. The range is always advanced by the size
+ of the tested $(CODEPOINT) regardless of the result of test.)
+
+ $(P Test. The range is left unaffected regardless
+ of the result of test.)
+ */
+ public bool match(Range)(ref Range inp)
+ if (isRandomAccessRange!Range && is(ElementType!Range : char))
+ {
+ assert(false);
+ }
+
+ ///ditto
+ public bool skip(Range)(ref Range inp)
+ if (isRandomAccessRange!Range && is(ElementType!Range : char))
+ {
+ assert(false);
+ }
+
+ ///ditto
+ public bool test(Range)(ref Range inp)
+ if (isRandomAccessRange!Range && is(ElementType!Range : char))
+ {
+ assert(false);
+ }
+ ///
+ @safe unittest
+ {
+ string truth = "2² = 4";
+ auto m = utfMatcher!char(unicode.Number);
+ assert(m.match(truth)); // '2' is a number all right
+ assert(truth == "² = 4"); // skips on match
+ assert(m.match(truth)); // so is the superscript '2'
+ assert(!m.match(truth)); // space is not a number
+ assert(truth == " = 4"); // unaffected on no match
+ assert(!m.skip(truth)); // same test ...
+ assert(truth == "= 4"); // but skips a codepoint regardless
+ assert(!m.test(truth)); // '=' is not a number
+ assert(truth == "= 4"); // test never affects argument
+ }
+
+ /**
+ Advanced feature - provide direct access to a subset of matcher based a
+ set of known encoding lengths. Lengths are provided in
+ $(S_LINK Code unit, code units). The sub-matcher then may do less
+ operations per any $(D test)/$(D match).
+
+ Use with care as the sub-matcher won't match
+ any $(CODEPOINTS) that have encoded length that doesn't belong
+ to the selected set of lengths. Also the sub-matcher object references
+ the parent matcher and must not be used past the liftetime
+ of the latter.
+
+ Another caveat of using sub-matcher is that skip is not available
+ preciesly because sub-matcher doesn't detect all lengths.
+ */
+ @property auto subMatcher(Lengths...)()
+ {
+ assert(0);
+ return this;
+ }
+
+ @safe unittest
+ {
+ auto m = utfMatcher!char(unicode.Number);
+ string square = "2²";
+ // about sub-matchers
+ assert(!m.subMatcher!(2,3,4).test(square)); // ASCII no covered
+ assert(m.subMatcher!1.match(square)); // ASCII-only, works
+ assert(!m.subMatcher!1.test(square)); // unicode '²'
+ assert(m.subMatcher!(2,3,4).match(square)); //
+ assert(square == "");
+ wstring wsquare = "2²";
+ auto m16 = utfMatcher!wchar(unicode.Number);
+ // may keep ref, but the orignal (m16) must be kept alive
+ auto bmp = m16.subMatcher!1;
+ assert(bmp.match(wsquare)); // Okay, in basic multilingual plan
+ assert(bmp.match(wsquare)); // And '²' too
+ }
+}
+
+/**
+ Test if $(D M) is an UTF Matcher for ranges of $(D Char).
+*/
+public enum isUtfMatcher(M, C) = __traits(compiles, (){
+ C[] s;
+ auto d = s.decoder;
+ M m;
+ assert(is(typeof(m.match(d)) == bool));
+ assert(is(typeof(m.test(d)) == bool));
+ static if (is(typeof(m.skip(d))))
+ {
+ assert(is(typeof(m.skip(d)) == bool));
+ assert(is(typeof(m.skip(s)) == bool));
+ }
+ assert(is(typeof(m.match(s)) == bool));
+ assert(is(typeof(m.test(s)) == bool));
+});
+
+@safe unittest
+{
+ alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init));
+ alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init));
+ static assert(isUtfMatcher!(CharMatcher, char));
+ static assert(isUtfMatcher!(CharMatcher, immutable(char)));
+ static assert(isUtfMatcher!(WcharMatcher, wchar));
+ static assert(isUtfMatcher!(WcharMatcher, immutable(wchar)));
+}
+
+enum Mode {
+ alwaysSkip,
+ neverSkip,
+ skipOnMatch
+}
+
+mixin template ForwardStrings()
+{
+ private bool fwdStr(string fn, C)(ref C[] str) const pure
+ {
+ import std.utf : byCodeUnit;
+ alias type = typeof(byCodeUnit(str));
+ return mixin(fn~"(*cast(type*)&str)");
+ }
+}
+
+template Utf8Matcher()
+{
+ enum validSize(int sz) = sz >= 1 && sz <= 4;
+
+ void badEncoding() pure @safe
+ {
+ import std.utf : UTFException;
+ throw new UTFException("Invalid UTF-8 sequence");
+ }
+
+ //for 1-stage ASCII
+ alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
+ //for 2-stage lookup of 2 byte UTF-8 sequences
+ alias Utf8Spec2 = AliasSeq!(bool, char[2],
+ clampIdx!(0, 5), clampIdx!(1, 6));
+ //ditto for 3 byte
+ alias Utf8Spec3 = AliasSeq!(bool, char[3],
+ clampIdx!(0, 4),
+ clampIdx!(1, 6),
+ clampIdx!(2, 6)
+ );
+ //ditto for 4 byte
+ alias Utf8Spec4 = AliasSeq!(bool, char[4],
+ clampIdx!(0, 3), clampIdx!(1, 6),
+ clampIdx!(2, 6), clampIdx!(3, 6)
+ );
+ alias Tables = AliasSeq!(
+ typeof(TrieBuilder!(AsciiSpec)(false).build()),
+ typeof(TrieBuilder!(Utf8Spec2)(false).build()),
+ typeof(TrieBuilder!(Utf8Spec3)(false).build()),
+ typeof(TrieBuilder!(Utf8Spec4)(false).build())
+ );
+ alias Table(int size) = Tables[size-1];
+
+ enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1;
+ enum encMask(size_t size) = ((1 << size)-1)<<(8-size);
+
+ char truncate()(char ch) pure @safe
+ {
+ ch -= 0x80;
+ if (ch < 0x40)
+ {
+ return ch;
+ }
+ else
+ {
+ badEncoding();
+ return cast(char) 0;
+ }
+ }
+
+ static auto encode(size_t sz)(dchar ch)
+ if (sz > 1)
+ {
+ import std.utf : encodeUTF = encode;
+ char[4] buf;
+ encodeUTF(buf, ch);
+ char[sz] ret;
+ buf[0] &= leadMask!sz;
+ foreach (n; 1 .. sz)
+ buf[n] = buf[n] & 0x3f; //keep 6 lower bits
+ ret[] = buf[0 .. sz];
+ return ret;
+ }
+
+ auto build(Set)(Set set)
+ {
+ import std.algorithm.iteration : map;
+ auto ascii = set & unicode.ASCII;
+ auto utf8_2 = set & CodepointSet(0x80, 0x800);
+ auto utf8_3 = set & CodepointSet(0x800, 0x1_0000);
+ auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1);
+ auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
+ auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
+ auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
+ auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
+ alias Ret = Impl!(1,2,3,4);
+ return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T);
+ }
+
+ // Bootstrap UTF-8 static matcher interface
+ // from 3 primitives: tab!(size), lookup and Sizes
+ mixin template DefMatcher()
+ {
+ import std.format : format;
+ import std.meta : Erase, staticIndexOf;
+ enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
+ alias UniSizes = Erase!(1, Sizes);
+
+ //generate dispatch code sequence for unicode parts
+ static auto genDispatch()
+ {
+ string code;
+ foreach (size; UniSizes)
+ code ~= format(q{
+ if ((ch & ~leadMask!%d) == encMask!(%d))
+ return lookup!(%d, mode)(inp);
+ else
+ }, size, size, size);
+ static if (Sizes.length == 4) //covers all code unit cases
+ code ~= "{ badEncoding(); return false; }";
+ else
+ code ~= "return false;"; //may be just fine but not covered
+ return code;
+ }
+ enum dispatch = genDispatch();
+
+ public bool match(Range)(ref Range inp) const pure
+ if (isRandomAccessRange!Range && is(ElementType!Range : char))
+ {
+ enum mode = Mode.skipOnMatch;
+ assert(!inp.empty);
+ immutable ch = inp[0];
+ static if (hasASCII)
+ {
+ if (ch < 0x80)
+ {
+ immutable r = tab!1[ch];
+ if (r)
+ inp.popFront();
+ return r;
+ }
+ else
+ mixin(dispatch);
+ }
+ else
+ mixin(dispatch);
+ }
+
+ static if (Sizes.length == 4) // can skip iff can detect all encodings
+ {
+ public bool skip(Range)(ref Range inp) const pure @trusted
+ if (isRandomAccessRange!Range && is(ElementType!Range : char))
+ {
+ enum mode = Mode.alwaysSkip;
+ assert(!inp.empty);
+ auto ch = inp[0];
+ static if (hasASCII)
+ {
+ if (ch < 0x80)
+ {
+ inp.popFront();
+ return tab!1[ch];
+ }
+ else
+ mixin(dispatch);
+ }
+ else
+ mixin(dispatch);
+ }
+ }
+
+ public bool test(Range)(ref Range inp) const pure @trusted
+ if (isRandomAccessRange!Range && is(ElementType!Range : char))
+ {
+ enum mode = Mode.neverSkip;
+ assert(!inp.empty);
+ auto ch = inp[0];
+ static if (hasASCII)
+ {
+ if (ch < 0x80)
+ return tab!1[ch];
+ else
+ mixin(dispatch);
+ }
+ else
+ mixin(dispatch);
+ }
+
+ bool match(C)(ref C[] str) const pure @trusted
+ if (isSomeChar!C)
+ {
+ return fwdStr!"match"(str);
+ }
+
+ bool skip(C)(ref C[] str) const pure @trusted
+ if (isSomeChar!C)
+ {
+ return fwdStr!"skip"(str);
+ }
+
+ bool test(C)(ref C[] str) const pure @trusted
+ if (isSomeChar!C)
+ {
+ return fwdStr!"test"(str);
+ }
+
+ mixin ForwardStrings;
+ }
+
+ struct Impl(Sizes...)
+ {
+ import std.meta : allSatisfy, staticMap;
+ static assert(allSatisfy!(validSize, Sizes),
+ "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
+ private:
+ //pick tables for chosen sizes
+ alias OurTabs = staticMap!(Table, Sizes);
+ OurTabs tables;
+ mixin DefMatcher;
+ //static disptach helper UTF size ==> table
+ alias tab(int i) = tables[i - 1];
+
+ package @property auto subMatcher(SizesToPick...)() @trusted
+ {
+ return CherryPick!(Impl, SizesToPick)(&this);
+ }
+
+ bool lookup(int size, Mode mode, Range)(ref Range inp) const pure @trusted
+ {
+ import std.typecons : staticIota;
+ if (inp.length < size)
+ {
+ badEncoding();
+ return false;
+ }
+ char[size] needle = void;
+ needle[0] = leadMask!size & inp[0];
+ foreach (i; staticIota!(1, size))
+ {
+ needle[i] = truncate(inp[i]);
+ }
+ //overlong encoding checks
+ static if (size == 2)
+ {
+ //0x80-0x7FF
+ //got 6 bits in needle[1], must use at least 8 bits
+ //must use at least 2 bits in needle[1]
+ if (needle[0] < 2) badEncoding();
+ }
+ else static if (size == 3)
+ {
+ //0x800-0xFFFF
+ //got 6 bits in needle[2], must use at least 12bits
+ //must use 6 bits in needle[1] or anything in needle[0]
+ if (needle[0] == 0 && needle[1] < 0x20) badEncoding();
+ }
+ else static if (size == 4)
+ {
+ //0x800-0xFFFF
+ //got 2x6=12 bits in needle[2 .. 3] must use at least 17bits
+ //must use 5 bits (or above) in needle[1] or anything in needle[0]
+ if (needle[0] == 0 && needle[1] < 0x10) badEncoding();
+ }
+ static if (mode == Mode.alwaysSkip)
+ {
+ inp.popFrontN(size);
+ return tab!size[needle];
+ }
+ else static if (mode == Mode.neverSkip)
+ {
+ return tab!size[needle];
+ }
+ else
+ {
+ static assert(mode == Mode.skipOnMatch);
+ if (tab!size[needle])
+ {
+ inp.popFrontN(size);
+ return true;
+ }
+ else
+ return false;
+ }
+ }
+ }
+
+ struct CherryPick(I, Sizes...)
+ {
+ import std.meta : allSatisfy;
+ static assert(allSatisfy!(validSize, Sizes),
+ "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
+ private:
+ I* m;
+ @property ref tab(int i)() const pure { return m.tables[i - 1]; }
+ bool lookup(int size, Mode mode, Range)(ref Range inp) const pure
+ {
+ return m.lookup!(size, mode)(inp);
+ }
+ mixin DefMatcher;
+ }
+}
+
+template Utf16Matcher()
+{
+ enum validSize(int sz) = sz >= 1 && sz <= 2;
+
+ void badEncoding() pure
+ {
+ import std.utf : UTFException;
+ throw new UTFException("Invalid UTF-16 sequence");
+ }
+
+ // 1-stage ASCII
+ alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
+ //2-stage BMP
+ alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
+ //4-stage - full Unicode
+ //assume that 0xD800 & 0xDC00 bits are cleared
+ //thus leaving 10 bit per wchar to worry about
+ alias UniSpec = AliasSeq!(bool, wchar[2],
+ assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4),
+ assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6),
+ );
+ alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
+ alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
+ alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());
+
+ auto encode2(dchar ch)
+ {
+ ch -= 0x1_0000;
+ assert(ch <= 0xF_FFFF);
+ wchar[2] ret;
+ //do not put surrogate bits, they are sliced off
+ ret[0] = cast(wchar)(ch >> 10);
+ ret[1] = (ch & 0xFFF);
+ return ret;
+ }
+
+ auto build(Set)(Set set)
+ {
+ import std.algorithm.iteration : map;
+ auto ascii = set & unicode.ASCII;
+ auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
+ - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
+ auto other = set - (bmp | ascii);
+ auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
+ auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec);
+ auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
+ alias Ret = Impl!(1,2);
+ return Ret(asciiT, bmpT, otherT);
+ }
+
+ //bootstrap full UTF-16 matcher interace from
+ //sizeFlags, lookupUni and ascii
+ mixin template DefMatcher()
+ {
+ public bool match(Range)(ref Range inp) const pure @trusted
+ if (isRandomAccessRange!Range && is(ElementType!Range : wchar))
+ {
+ enum mode = Mode.skipOnMatch;
+ assert(!inp.empty);
+ immutable ch = inp[0];
+ static if (sizeFlags & 1)
+ {
+ if (ch < 0x80)
+ {
+ if (ascii[ch])
+ {
+ inp.popFront();
+ return true;
+ }
+ else
+ return false;
+ }
+ return lookupUni!mode(inp);
+ }
+ else
+ return lookupUni!mode(inp);
+ }
+
+ static if (Sizes.length == 2)
+ {
+ public bool skip(Range)(ref Range inp) const pure @trusted
+ if (isRandomAccessRange!Range && is(ElementType!Range : wchar))
+ {
+ enum mode = Mode.alwaysSkip;
+ assert(!inp.empty);
+ immutable ch = inp[0];
+ static if (sizeFlags & 1)
+ {
+ if (ch < 0x80)
+ {
+ inp.popFront();
+ return ascii[ch];
+ }
+ else
+ return lookupUni!mode(inp);
+ }
+ else
+ return lookupUni!mode(inp);
+ }
+ }
+
+ public bool test(Range)(ref Range inp) const pure @trusted
+ if (isRandomAccessRange!Range && is(ElementType!Range : wchar))
+ {
+ enum mode = Mode.neverSkip;
+ assert(!inp.empty);
+ auto ch = inp[0];
+ static if (sizeFlags & 1)
+ return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
+ else
+ return lookupUni!mode(inp);
+ }
+
+ bool match(C)(ref C[] str) const pure @trusted
+ if (isSomeChar!C)
+ {
+ return fwdStr!"match"(str);
+ }
+
+ bool skip(C)(ref C[] str) const pure @trusted
+ if (isSomeChar!C)
+ {
+ return fwdStr!"skip"(str);
+ }
+
+ bool test(C)(ref C[] str) const pure @trusted
+ if (isSomeChar!C)
+ {
+ return fwdStr!"test"(str);
+ }
+
+ mixin ForwardStrings; //dispatch strings to range versions
+ }
+
+ struct Impl(Sizes...)
+ if (Sizes.length >= 1 && Sizes.length <= 2)
+ {
+ private:
+ import std.meta : allSatisfy;
+ static assert(allSatisfy!(validSize, Sizes),
+ "Only lengths of 1 and 2 code units are possible in UTF-16");
+ static if (Sizes.length > 1)
+ enum sizeFlags = Sizes[0] | Sizes[1];
+ else
+ enum sizeFlags = Sizes[0];
+
+ static if (sizeFlags & 1)
+ {
+ Ascii ascii;
+ Bmp bmp;
+ }
+ static if (sizeFlags & 2)
+ {
+ Uni uni;
+ }
+ mixin DefMatcher;
+
+ package @property auto subMatcher(SizesToPick...)() @trusted
+ {
+ return CherryPick!(Impl, SizesToPick)(&this);
+ }
+
+ bool lookupUni(Mode mode, Range)(ref Range inp) const pure
+ {
+ wchar x = cast(wchar)(inp[0] - 0xD800);
+ //not a high surrogate
+ if (x > 0x3FF)
+ {
+ //low surrogate
+ if (x <= 0x7FF) badEncoding();
+ static if (sizeFlags & 1)
+ {
+ auto ch = inp[0];
+ static if (mode == Mode.alwaysSkip)
+ inp.popFront();
+ static if (mode == Mode.skipOnMatch)
+ {
+ if (bmp[ch])
+ {
+ inp.popFront();
+ return true;
+ }
+ else
+ return false;
+ }
+ else
+ return bmp[ch];
+ }
+ else //skip is not available for sub-matchers, so just false
+ return false;
+ }
+ else
+ {
+ static if (sizeFlags & 2)
+ {
+ if (inp.length < 2)
+ badEncoding();
+ wchar y = cast(wchar)(inp[1] - 0xDC00);
+ //not a low surrogate
+ if (y > 0x3FF)
+ badEncoding();
+ wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
+ static if (mode == Mode.alwaysSkip)
+ inp.popFrontN(2);
+ static if (mode == Mode.skipOnMatch)
+ {
+ if (uni[needle])
+ {
+ inp.popFrontN(2);
+ return true;
+ }
+ else
+ return false;
+ }
+ else
+ return uni[needle];
+ }
+ else //ditto
+ return false;
+ }
+ }
+ }
+
+ struct CherryPick(I, Sizes...)
+ if (Sizes.length >= 1 && Sizes.length <= 2)
+ {
+ private:
+ import std.meta : allSatisfy;
+ I* m;
+ enum sizeFlags = I.sizeFlags;
+
+ static if (sizeFlags & 1)
+ {
+ @property ref ascii()() const pure{ return m.ascii; }
+ }
+
+ bool lookupUni(Mode mode, Range)(ref Range inp) const pure
+ {
+ return m.lookupUni!mode(inp);
+ }
+ mixin DefMatcher;
+ static assert(allSatisfy!(validSize, Sizes),
+ "Only lengths of 1 and 2 code units are possible in UTF-16");
+ }
+}
+
+private auto utf8Matcher(Set)(Set set) @trusted
+{
+ return Utf8Matcher!().build(set);
+}
+
+private auto utf16Matcher(Set)(Set set) @trusted
+{
+ return Utf16Matcher!().build(set);
+}
+
+/**
+ Constructs a matcher object
+ to classify $(CODEPOINTS) from the $(D set) for encoding
+ that has $(D Char) as code unit.
+
+ See $(LREF MatcherConcept) for API outline.
+*/
+public auto utfMatcher(Char, Set)(Set set) @trusted
+if (isCodepointSet!Set)
+{
+ static if (is(Char : char))
+ return utf8Matcher(set);
+ else static if (is(Char : wchar))
+ return utf16Matcher(set);
+ else static if (is(Char : dchar))
+ static assert(false, "UTF-32 needs no decoding,
+ and thus not supported by utfMatcher");
+ else
+ static assert(false, "Only character types 'char' and 'wchar' are allowed");
+}
+
+
+//a range of code units, packed with index to speed up forward iteration
+package auto decoder(C)(C[] s, size_t offset=0) @safe pure nothrow @nogc
+if (is(C : wchar) || is(C : char))
+{
+ static struct Decoder
+ {
+ pure nothrow:
+ C[] str;
+ size_t idx;
+ @property C front(){ return str[idx]; }
+ @property C back(){ return str[$-1]; }
+ void popFront(){ idx++; }
+ void popBack(){ str = str[0..$-1]; }
+ void popFrontN(size_t n){ idx += n; }
+ @property bool empty(){ return idx == str.length; }
+ @property auto save(){ return this; }
+ auto opIndex(size_t i){ return str[idx+i]; }
+ @property size_t length(){ return str.length - idx; }
+ alias opDollar = length;
+ auto opSlice(size_t a, size_t b){ return Decoder(str[0 .. idx+b], idx+a); }
+ }
+ static assert(isRandomAccessRange!Decoder);
+ static assert(is(ElementType!Decoder : C));
+ return Decoder(s, offset);
+}
+
+@safe unittest
+{
+ string rs = "hi! ネемног砀 текста";
+ auto codec = rs.decoder;
+ auto utf8 = utf8Matcher(unicode.Letter);
+ auto asc = utf8.subMatcher!(1);
+ auto uni = utf8.subMatcher!(2,3,4);
+ assert(asc.test(codec));
+ assert(!uni.match(codec));
+ assert(utf8.skip(codec));
+ assert(codec.idx == 1);
+
+ assert(!uni.match(codec));
+ assert(asc.test(codec));
+ assert(utf8.skip(codec));
+ assert(codec.idx == 2);
+ assert(!asc.match(codec));
+
+ assert(!utf8.test(codec));
+ assert(!utf8.skip(codec));
+
+ assert(!asc.test(codec));
+ assert(!utf8.test(codec));
+ assert(!utf8.skip(codec));
+ assert(utf8.test(codec));
+ foreach (i; 0 .. 7)
+ {
+ assert(!asc.test(codec));
+ assert(uni.test(codec));
+ assert(utf8.skip(codec));
+ }
+ assert(!utf8.test(codec));
+ assert(!utf8.skip(codec));
+ //the same with match where applicable
+ codec = rs.decoder;
+ assert(utf8.match(codec));
+ assert(codec.idx == 1);
+ assert(utf8.match(codec));
+ assert(codec.idx == 2);
+ assert(!utf8.match(codec));
+ assert(codec.idx == 2);
+ assert(!utf8.skip(codec));
+ assert(!utf8.skip(codec));
+
+ foreach (i; 0 .. 7)
+ {
+ assert(!asc.test(codec));
+ assert(utf8.test(codec));
+ assert(utf8.match(codec));
+ }
+ auto i = codec.idx;
+ assert(!utf8.match(codec));
+ assert(codec.idx == i);
+}
+
+@safe unittest
+{
+ import std.range : stride;
+ static bool testAll(Matcher, Range)(ref Matcher m, ref Range r)
+ {
+ bool t = m.test(r);
+ auto save = r.idx;
+ assert(t == m.match(r));
+ assert(r.idx == save || t); //ether no change or was match
+ r.idx = save;
+ static if (is(typeof(m.skip(r))))
+ {
+ assert(t == m.skip(r));
+ assert(r.idx != save); //always changed
+ r.idx = save;
+ }
+ return t;
+ }
+ auto utf16 = utfMatcher!wchar(unicode.L);
+ auto bmp = utf16.subMatcher!1;
+ auto nonBmp = utf16.subMatcher!1;
+ auto utf8 = utfMatcher!char(unicode.L);
+ auto ascii = utf8.subMatcher!1;
+ auto uni2 = utf8.subMatcher!2;
+ auto uni3 = utf8.subMatcher!3;
+ auto uni24 = utf8.subMatcher!(2,4);
+ foreach (ch; unicode.L.byCodepoint.stride(3))
+ {
+ import std.utf : encode;
+ char[4] buf;
+ wchar[2] buf16;
+ auto len = encode(buf, ch);
+ auto len16 = encode(buf16, ch);
+ auto c8 = buf[0 .. len].decoder;
+ auto c16 = buf16[0 .. len16].decoder;
+ assert(testAll(utf16, c16));
+ assert(testAll(bmp, c16) || len16 != 1);
+ assert(testAll(nonBmp, c16) || len16 != 2);
+
+ assert(testAll(utf8, c8));
+
+ //submatchers return false on out of their domain
+ assert(testAll(ascii, c8) || len != 1);
+ assert(testAll(uni2, c8) || len != 2);
+ assert(testAll(uni3, c8) || len != 3);
+ assert(testAll(uni24, c8) || (len != 2 && len != 4));
+ }
+}
+
+// cover decode fail cases of Matcher
+@system unittest
+{
+ import std.algorithm.iteration : map;
+ import std.exception : collectException;
+ import std.format : format;
+ auto utf16 = utfMatcher!wchar(unicode.L);
+ auto utf8 = utfMatcher!char(unicode.L);
+ //decode failure cases UTF-8
+ alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
+ "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
+ "\xCF\x00\0x00\0x00\x00");
+ foreach (msg; fails8)
+ {
+ assert(collectException((){
+ auto s = msg;
+ size_t idx = 0;
+ utf8.test(s);
+ }()), format("%( %2x %)", cast(ubyte[]) msg));
+ }
+ //decode failure cases UTF-16
+ alias fails16 = AliasSeq!([0xD811], [0xDC02]);
+ foreach (msg; fails16)
+ {
+ assert(collectException((){
+ auto s = msg.map!(x => cast(wchar) x);
+ utf16.test(s);
+ }()));
+ }
+}
+
+/++
+ Convenience function to construct optimal configurations for
+ packed Trie from any $(D set) of $(CODEPOINTS).
+
+ The parameter $(D level) indicates the number of trie levels to use,
+ allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
+ speed-size wise.
+
+ $(P Level 1 is fastest and the most memory hungry (a bit array). )
+ $(P Level 4 is the slowest and has the smallest footprint. )
+
+ See the $(S_LINK Synopsis, Synopsis) section for example.
+
+ Note:
+ Level 4 stays very practical (being faster and more predictable)
+ compared to using direct lookup on the $(D set) itself.
+
+
++/
+public auto toTrie(size_t level, Set)(Set set)
+if (isCodepointSet!Set)
+{
+ static if (level == 1)
+ return codepointSetTrie!(21)(set);
+ else static if (level == 2)
+ return codepointSetTrie!(10, 11)(set);
+ else static if (level == 3)
+ return codepointSetTrie!(8, 5, 8)(set);
+ else static if (level == 4)
+ return codepointSetTrie!(6, 4, 4, 7)(set);
+ else
+ static assert(false,
+ "Sorry, toTrie doesn't support levels > 4, use codepointSetTrie directly");
+}
+
+/**
+ $(P Builds a $(D Trie) with typically optimal speed-size trade-off
+ and wraps it into a delegate of the following type:
+ $(D bool delegate(dchar ch)). )
+
+ $(P Effectively this creates a 'tester' lambda suitable
+ for algorithms like std.algorithm.find that take unary predicates. )
+
+ See the $(S_LINK Synopsis, Synopsis) section for example.
+*/
+public auto toDelegate(Set)(Set set)
+if (isCodepointSet!Set)
+{
+ // 3 is very small and is almost as fast as 2-level (due to CPU caches?)
+ auto t = toTrie!3(set);
+ return (dchar ch) => t[ch];
+}
+
+/**
+ $(P Opaque wrapper around unsigned built-in integers and
+ code unit (char/wchar/dchar) types.
+ Parameter $(D sz) indicates that the value is confined
+ to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
+ packed more tightly when stored in certain
+ data-structures like trie. )
+
+ Note:
+ $(P The $(D BitPacked!(T, sz)) is implicitly convertible to $(D T)
+ but not vise-versa. Users have to ensure the value fits in
+ the range required and use the $(D cast)
+ operator to perform the conversion.)
+*/
+struct BitPacked(T, size_t sz)
+if (isIntegral!T || is(T:dchar))
+{
+ enum bitSize = sz;
+ T _value;
+ alias _value this;
+}
+
+/*
+ Depending on the form of the passed argument $(D bitSizeOf) returns
+ the amount of bits required to represent a given type
+ or a return type of a given functor.
+*/
+template bitSizeOf(Args...)
+if (Args.length == 1)
+{
+ import std.traits : ReturnType;
+ alias T = Args[0];
+ static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t))
+ {
+ enum bitSizeOf = T.bitSize;
+ }
+ else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits))
+ {
+ enum bitSizeOf = bitSizeOf!(ReturnType!T);
+ }
+ else
+ {
+ enum bitSizeOf = T.sizeof*8;
+ }
+}
+
+/**
+ Tests if $(D T) is some instantiation of $(LREF BitPacked)!(U, x)
+ and thus suitable for packing.
+*/
+template isBitPacked(T)
+{
+ static if (is(T dummy == BitPacked!(U, bits), U, size_t bits))
+ enum isBitPacked = true;
+ else
+ enum isBitPacked = false;
+}
+
+/**
+ Gives the type $(D U) from $(LREF BitPacked)!(U, x)
+ or $(D T) itself for every other type.
+*/
+template TypeOfBitPacked(T)
+{
+ static if (is(T dummy == BitPacked!(U, bits), U, size_t bits))
+ alias TypeOfBitPacked = U;
+ else
+ alias TypeOfBitPacked = T;
+}
+
+/*
+ Wrapper, used in definition of custom data structures from $(D Trie) template.
+ Applying it to a unary lambda function indicates that the returned value always
+ fits within $(D bits) of bits.
+*/
+struct assumeSize(alias Fn, size_t bits)
+{
+ enum bitSize = bits;
+ static auto ref opCall(T)(auto ref T arg)
+ {
+ return Fn(arg);
+ }
+}
+
+/*
+ A helper for defining lambda function that yields a slice
+ of certain bits from an unsigned integral value.
+ The resulting lambda is wrapped in assumeSize and can be used directly
+ with $(D Trie) template.
+*/
+struct sliceBits(size_t from, size_t to)
+{
+ //for now bypass assumeSize, DMD has trouble inlining it
+ enum bitSize = to-from;
+ static auto opCall(T)(T x)
+ out(result)
+ {
+ assert(result < (1 << to-from));
+ }
+ body
+ {
+ static assert(from < to);
+ static if (from == 0)
+ return x & ((1 << to)-1);
+ else
+ return (x >> from) & ((1<<(to-from))-1);
+ }
+}
+
+@safe pure nothrow @nogc uint low_8(uint x) { return x&0xFF; }
+@safe pure nothrow @nogc uint midlow_8(uint x){ return (x&0xFF00)>>8; }
+alias lo8 = assumeSize!(low_8, 8);
+alias mlo8 = assumeSize!(midlow_8, 8);
+
+static assert(bitSizeOf!lo8 == 8);
+static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
+static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
+
+template Sequence(size_t start, size_t end)
+{
+ static if (start < end)
+ alias Sequence = AliasSeq!(start, Sequence!(start+1, end));
+ else
+ alias Sequence = AliasSeq!();
+}
+
+//---- TRIE TESTS ----
+@system unittest
+{
+ import std.algorithm.iteration : map;
+ import std.algorithm.sorting : sort;
+ import std.array : array;
+ import std.conv : text, to;
+ import std.range : iota;
+ static trieStats(TRIE)(TRIE t)
+ {
+ version (std_uni_stats)
+ {
+ import std.stdio : writefln, writeln;
+ writeln("---TRIE FOOTPRINT STATS---");
+ foreach (i; staticIota!(0, t.table.dim) )
+ {
+ writefln("lvl%s = %s bytes; %s pages"
+ , i, t.bytes!i, t.pages!i);
+ }
+ writefln("TOTAL: %s bytes", t.bytes);
+ version (none)
+ {
+ writeln("INDEX (excluding value level):");
+ foreach (i; staticIota!(0, t.table.dim-1) )
+ writeln(t.table.slice!(i)[0 .. t.table.length!i]);
+ }
+ writeln("---------------------------");
+ }
+ }
+ //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
+ // alias lo8 = assumeSize!(8, function (uint x) { return x&0xFF; });
+ // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
+ alias Set = CodepointSet;
+ auto set = Set('A','Z','a','z');
+ auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array
+ for (int a='a'; a<'z';a++)
+ assert(trie[a]);
+ for (int a='A'; a<'Z';a++)
+ assert(trie[a]);
+ for (int a=0; a<'A'; a++)
+ assert(!trie[a]);
+ for (int a ='Z'; a<'a'; a++)
+ assert(!trie[a]);
+ trieStats(trie);
+
+ auto redundant2 = Set(
+ 1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
+ auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
+ trieStats(trie2);
+ foreach (e; redundant2.byCodepoint)
+ assert(trie2[e], text(cast(uint) e, " - ", trie2[e]));
+ foreach (i; 0 .. 1024)
+ {
+ assert(trie2[i] == (i in redundant2));
+ }
+
+
+ auto redundant3 = Set(
+ 2, 4, 6, 8, 16,
+ 2+16, 4+16, 16+6, 16+8, 16+16,
+ 2+32, 4+32, 32+6, 32+8,
+ );
+
+ enum max3 = 256;
+ // sliceBits
+ auto trie3 = buildTrie!(bool, uint, max3,
+ sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
+ )(redundant3.byInterval);
+ trieStats(trie3);
+ foreach (i; 0 .. max3)
+ assert(trie3[i] == (i in redundant3), text(cast(uint) i));
+
+ auto redundant4 = Set(
+ 10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
+ 1000, 2000, 3000, 4000, 5000, 6000
+ );
+ enum max4 = 2^^16;
+ auto trie4 = buildTrie!(bool, size_t, max4,
+ sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
+ )(redundant4.byInterval);
+ foreach (i; 0 .. max4)
+ {
+ if (i in redundant4)
+ assert(trie4[i], text(cast(uint) i));
+ }
+ trieStats(trie4);
+
+ alias mapToS = mapTrieIndex!(useItemAt!(0, char));
+ string[] redundantS = ["tea", "start", "orange"];
+ redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
+ auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
+ // using first char only
+ assert(redundantS == ["orange", "start", "tea"]);
+ assert(strie["test"], text(strie["test"]));
+ assert(!strie["aea"]);
+ assert(strie["s"]);
+
+ // a bit size test
+ auto a = array(map!(x => to!ubyte(x))(iota(0, 256)));
+ auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a);
+ trieStats(bt);
+ foreach (i; 0 .. 256)
+ assert(bt[cast(ubyte) i]);
+}
+
+template useItemAt(size_t idx, T)
+if (isIntegral!T || is(T: dchar))
+{
+ size_t impl(in T[] arr){ return arr[idx]; }
+ alias useItemAt = assumeSize!(impl, 8*T.sizeof);
+}
+
+template useLastItem(T)
+{
+ size_t impl(in T[] arr){ return arr[$-1]; }
+ alias useLastItem = assumeSize!(impl, 8*T.sizeof);
+}
+
+template fullBitSize(Prefix...)
+{
+ static if (Prefix.length > 0)
+ enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]);
+ else
+ enum fullBitSize = 0;
+}
+
+template idxTypes(Key, size_t fullBits, Prefix...)
+{
+ static if (Prefix.length == 1)
+ {// the last level is value level, so no index once reduced to 1-level
+ alias idxTypes = AliasSeq!();
+ }
+ else
+ {
+ // Important note on bit packing
+ // Each level has to hold enough of bits to address the next one
+ // The bottom level is known to hold full bit width
+ // thus it's size in pages is full_bit_width - size_of_last_prefix
+ // Recourse on this notion
+ alias idxTypes =
+ AliasSeq!(
+ idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]),
+ BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1]))
+ );
+ }
+}
+
+//============================================================================
+
+@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
+if (is(Char1 : dchar) && is(Char2 : dchar))
+{
+ import std.algorithm.comparison : cmp;
+ import std.algorithm.iteration : map, filter;
+ import std.ascii : toLower;
+ static bool pred(dchar c) {return !c.isWhite && c != '-' && c != '_';}
+ return cmp(
+ a.map!toLower.filter!pred,
+ b.map!toLower.filter!pred);
+}
+
+@safe pure unittest
+{
+ assert(!comparePropertyName("foo-bar", "fooBar"));
+}
+
+bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
+if (is(Char1 : dchar) && is(Char2 : dchar))
+{
+ return comparePropertyName(a, b) < 0;
+}
+
+//============================================================================
+// Utilities for compression of Unicode code point sets
+//============================================================================
+
+@safe void compressTo(uint val, ref ubyte[] arr) pure nothrow
+{
+ // not optimized as usually done 1 time (and not public interface)
+ if (val < 128)
+ arr ~= cast(ubyte) val;
+ else if (val < (1 << 13))
+ {
+ arr ~= (0b1_00 << 5) | cast(ubyte)(val >> 8);
+ arr ~= val & 0xFF;
+ }
+ else
+ {
+ assert(val < (1 << 21));
+ arr ~= (0b1_01 << 5) | cast(ubyte)(val >> 16);
+ arr ~= (val >> 8) & 0xFF;
+ arr ~= val & 0xFF;
+ }
+}
+
+@safe uint decompressFrom(const(ubyte)[] arr, ref size_t idx) pure
+{
+ import std.exception : enforce;
+ immutable first = arr[idx++];
+ if (!(first & 0x80)) // no top bit -> [0 .. 127]
+ return first;
+ immutable extra = ((first >> 5) & 1) + 1; // [1, 2]
+ uint val = (first & 0x1F);
+ enforce(idx + extra <= arr.length, "bad code point interval encoding");
+ foreach (j; 0 .. extra)
+ val = (val << 8) | arr[idx+j];
+ idx += extra;
+ return val;
+}
+
+
+package ubyte[] compressIntervals(Range)(Range intervals)
+if (isInputRange!Range && isIntegralPair!(ElementType!Range))
+{
+ ubyte[] storage;
+ uint base = 0;
+ // RLE encode
+ foreach (val; intervals)
+ {
+ compressTo(val[0]-base, storage);
+ base = val[0];
+ if (val[1] != lastDchar+1) // till the end of the domain so don't store it
+ {
+ compressTo(val[1]-base, storage);
+ base = val[1];
+ }
+ }
+ return storage;
+}
+
+@safe pure unittest
+{
+ import std.algorithm.comparison : equal;
+ import std.typecons : tuple;
+
+ auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)];
+ ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
+ assert(compressIntervals(run) == enc);
+ auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
+ ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed
+ assert(compressIntervals(run2) == enc2);
+ size_t idx = 0;
+ assert(decompressFrom(enc, idx) == 80);
+ assert(decompressFrom(enc, idx) == 47);
+ assert(decompressFrom(enc, idx) == 1);
+ assert(decompressFrom(enc, idx) == (1 << 10));
+ idx = 0;
+ assert(decompressFrom(enc2, idx) == 0);
+ assert(decompressFrom(enc2, idx) == (1 << 20)+512+1);
+ assert(equal(decompressIntervals(compressIntervals(run)), run));
+ assert(equal(decompressIntervals(compressIntervals(run2)), run2));
+}
+
+// Creates a range of $(D CodepointInterval) that lazily decodes compressed data.
+@safe package auto decompressIntervals(const(ubyte)[] data) pure
+{
+ return DecompressedIntervals(data);
+}
+
+@safe struct DecompressedIntervals
+{
+pure:
+ const(ubyte)[] _stream;
+ size_t _idx;
+ CodepointInterval _front;
+
+ this(const(ubyte)[] stream)
+ {
+ _stream = stream;
+ popFront();
+ }
+
+ @property CodepointInterval front()
+ {
+ assert(!empty);
+ return _front;
+ }
+
+ void popFront()
+ {
+ if (_idx == _stream.length)
+ {
+ _idx = size_t.max;
+ return;
+ }
+ uint base = _front[1];
+ _front[0] = base + decompressFrom(_stream, _idx);
+ if (_idx == _stream.length)// odd length ---> till the end
+ _front[1] = lastDchar+1;
+ else
+ {
+ base = _front[0];
+ _front[1] = base + decompressFrom(_stream, _idx);
+ }
+ }
+
+ @property bool empty() const
+ {
+ return _idx == size_t.max;
+ }
+
+ @property DecompressedIntervals save() { return this; }
+}
+
+static assert(isInputRange!DecompressedIntervals);
+static assert(isForwardRange!DecompressedIntervals);
+//============================================================================
+
+version (std_uni_bootstrap){}
+else
+{
+
+// helper for looking up code point sets
+@trusted ptrdiff_t findUnicodeSet(alias table, C)(in C[] name) pure
+{
+ import std.algorithm.iteration : map;
+ import std.range : assumeSorted;
+ auto range = assumeSorted!((a,b) => propertyNameLess(a,b))
+ (table.map!"a.name"());
+ size_t idx = range.lowerBound(name).length;
+ if (idx < range.length && comparePropertyName(range[idx], name) == 0)
+ return idx;
+ return -1;
+}
+
+// another one that loads it
+@trusted bool loadUnicodeSet(alias table, Set, C)(in C[] name, ref Set dest) pure
+{
+ auto idx = findUnicodeSet!table(name);
+ if (idx >= 0)
+ {
+ dest = Set(asSet(table[idx].compressed));
+ return true;
+ }
+ return false;
+}
+
+@trusted bool loadProperty(Set=CodepointSet, C)
+ (in C[] name, ref Set target) pure
+{
+ import std.internal.unicode_tables : uniProps; // generated file
+ alias ucmp = comparePropertyName;
+ // conjure cumulative properties by hand
+ if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
+ {
+ target = asSet(uniProps.Lu);
+ target |= asSet(uniProps.Ll);
+ target |= asSet(uniProps.Lt);
+ target |= asSet(uniProps.Lo);
+ target |= asSet(uniProps.Lm);
+ }
+ else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
+ {
+ target = asSet(uniProps.Ll);
+ target |= asSet(uniProps.Lu);
+ target |= asSet(uniProps.Lt);// Title case
+ }
+ else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
+ {
+ target = asSet(uniProps.Mn);
+ target |= asSet(uniProps.Mc);
+ target |= asSet(uniProps.Me);
+ }
+ else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
+ {
+ target = asSet(uniProps.Nd);
+ target |= asSet(uniProps.Nl);
+ target |= asSet(uniProps.No);
+ }
+ else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
+ {
+ target = asSet(uniProps.Pc);
+ target |= asSet(uniProps.Pd);
+ target |= asSet(uniProps.Ps);
+ target |= asSet(uniProps.Pe);
+ target |= asSet(uniProps.Pi);
+ target |= asSet(uniProps.Pf);
+ target |= asSet(uniProps.Po);
+ }
+ else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
+ {
+ target = asSet(uniProps.Sm);
+ target |= asSet(uniProps.Sc);
+ target |= asSet(uniProps.Sk);
+ target |= asSet(uniProps.So);
+ }
+ else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
+ {
+ target = asSet(uniProps.Zs);
+ target |= asSet(uniProps.Zl);
+ target |= asSet(uniProps.Zp);
+ }
+ else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
+ {
+ target = asSet(uniProps.Co);
+ target |= asSet(uniProps.Lo);
+ target |= asSet(uniProps.No);
+ target |= asSet(uniProps.So);
+ target |= asSet(uniProps.Po);
+ }
+ else if (ucmp(name, "graphical") == 0)
+ {
+ target = asSet(uniProps.Alphabetic);
+
+ target |= asSet(uniProps.Mn);
+ target |= asSet(uniProps.Mc);
+ target |= asSet(uniProps.Me);
+
+ target |= asSet(uniProps.Nd);
+ target |= asSet(uniProps.Nl);
+ target |= asSet(uniProps.No);
+
+ target |= asSet(uniProps.Pc);
+ target |= asSet(uniProps.Pd);
+ target |= asSet(uniProps.Ps);
+ target |= asSet(uniProps.Pe);
+ target |= asSet(uniProps.Pi);
+ target |= asSet(uniProps.Pf);
+ target |= asSet(uniProps.Po);
+
+ target |= asSet(uniProps.Zs);
+
+ target |= asSet(uniProps.Sm);
+ target |= asSet(uniProps.Sc);
+ target |= asSet(uniProps.Sk);
+ target |= asSet(uniProps.So);
+ }
+ else if (ucmp(name, "any") == 0)
+ target = Set.fromIntervals(0, 0x110000);
+ else if (ucmp(name, "ascii") == 0)
+ target = Set.fromIntervals(0, 0x80);
+ else
+ return loadUnicodeSet!(uniProps.tab)(name, target);
+ return true;
+}
+
+// CTFE-only helper for checking property names at compile-time
+@safe bool isPrettyPropertyName(C)(in C[] name)
+{
+ import std.algorithm.searching : find;
+ auto names = [
+ "L", "Letter",
+ "LC", "Cased Letter",
+ "M", "Mark",
+ "N", "Number",
+ "P", "Punctuation",
+ "S", "Symbol",
+ "Z", "Separator",
+ "Graphical",
+ "any",
+ "ascii"
+ ];
+ auto x = find!(x => comparePropertyName(x, name) == 0)(names);
+ return !x.empty;
+}
+
+// ditto, CTFE-only, not optimized
+@safe private static bool findSetName(alias table, C)(in C[] name)
+{
+ return findUnicodeSet!table(name) >= 0;
+}
+
+template SetSearcher(alias table, string kind)
+{
+ /// Run-time checked search.
+ static auto opCall(C)(in C[] name)
+ if (is(C : dchar))
+ {
+ import std.conv : to;
+ CodepointSet set;
+ if (loadUnicodeSet!table(name, set))
+ return set;
+ throw new Exception("No unicode set for "~kind~" by name "
+ ~name.to!string()~" was found.");
+ }
+ /// Compile-time checked search.
+ static @property auto opDispatch(string name)()
+ {
+ static if (findSetName!table(name))
+ {
+ CodepointSet set;
+ loadUnicodeSet!table(name, set);
+ return set;
+ }
+ else
+ static assert(false, "No unicode set for "~kind~" by name "
+ ~name~" was found.");
+ }
+}
+
+/**
+ A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
+ a block, script or general category.
+
+ It uses well defined standard rules of property name lookup.
+ This includes fuzzy matching of names, so that
+ 'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
+ and yield the same set of white space $(CHARACTERS).
+*/
+@safe public struct unicode
+{
+ /**
+ Performs the lookup of set of $(CODEPOINTS)
+ with compile-time correctness checking.
+ This short-cut version combines 3 searches:
+ across blocks, scripts, and common binary properties.
+
+ Note that since scripts and blocks overlap the
+ usual trick to disambiguate is used - to get a block use
+ $(D unicode.InBlockName), to search a script
+ use $(D unicode.ScriptName).
+
+ See_Also: $(LREF block), $(LREF script)
+ and (not included in this search) $(LREF hangulSyllableType).
+ */
+
+ static @property auto opDispatch(string name)() pure
+ {
+ static if (findAny(name))
+ return loadAny(name);
+ else
+ static assert(false, "No unicode set by name "~name~" was found.");
+ }
+
+ ///
+ @safe unittest
+ {
+ import std.exception : collectException;
+ auto ascii = unicode.ASCII;
+ assert(ascii['A']);
+ assert(ascii['~']);
+ assert(!ascii['\u00e0']);
+ // matching is case-insensitive
+ assert(ascii == unicode.ascII);
+ assert(!ascii['à']);
+ // underscores, '-' and whitespace in names are ignored too
+ auto latin = unicode.in_latin1_Supplement;
+ assert(latin['à']);
+ assert(!latin['$']);
+ // BTW Latin 1 Supplement is a block, hence "In" prefix
+ assert(latin == unicode("In Latin 1 Supplement"));
+ // run-time look up throws if no such set is found
+ assert(collectException(unicode("InCyrilliac")));
+ }
+
+ /**
+ The same lookup across blocks, scripts, or binary properties,
+ but performed at run-time.
+ This version is provided for cases where $(D name)
+ is not known beforehand; otherwise compile-time
+ checked $(LREF opDispatch) is typically a better choice.
+
+ See the $(S_LINK Unicode properties, table of properties) for available
+ sets.
+ */
+ static auto opCall(C)(in C[] name)
+ if (is(C : dchar))
+ {
+ return loadAny(name);
+ }
+
+ /**
+ Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.
+
+ Note:
+ Here block names are unambiguous as no scripts are searched
+ and thus to search use simply $(D unicode.block.BlockName) notation.
+
+ See $(S_LINK Unicode properties, table of properties) for available sets.
+ See_Also: $(S_LINK Unicode properties, table of properties).
+ */
+ struct block
+ {
+ import std.internal.unicode_tables : blocks; // generated file
+ mixin SetSearcher!(blocks.tab, "block");
+ }
+
+ ///
+ @safe unittest
+ {
+ // use .block for explicitness
+ assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
+ }
+
+ /**
+ Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.
+
+ See the $(S_LINK Unicode properties, table of properties) for available
+ sets.
+ */
+ struct script
+ {
+ import std.internal.unicode_tables : scripts; // generated file
+ mixin SetSearcher!(scripts.tab, "script");
+ }
+
+ ///
+ @safe unittest
+ {
+ auto arabicScript = unicode.script.arabic;
+ auto arabicBlock = unicode.block.arabic;
+ // there is an intersection between script and block
+ assert(arabicBlock['؁']);
+ assert(arabicScript['؁']);
+ // but they are different
+ assert(arabicBlock != arabicScript);
+ assert(arabicBlock == unicode.inArabic);
+ assert(arabicScript == unicode.arabic);
+ }
+
+ /**
+ Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.
+
+ Other non-binary properties (once supported) follow the same
+ notation - $(D unicode.propertyName.propertyValue) for compile-time
+ checked access and $(D unicode.propertyName(propertyValue))
+ for run-time checked one.
+
+ See the $(S_LINK Unicode properties, table of properties) for available
+ sets.
+ */
+ struct hangulSyllableType
+ {
+ import std.internal.unicode_tables : hangul; // generated file
+ mixin SetSearcher!(hangul.tab, "hangul syllable type");
+ }
+
+ ///
+ @safe unittest
+ {
+ // L here is syllable type not Letter as in unicode.L short-cut
+ auto leadingVowel = unicode.hangulSyllableType("L");
+ // check that some leading vowels are present
+ foreach (vowel; '\u1110'..'\u115F')
+ assert(leadingVowel[vowel]);
+ assert(leadingVowel == unicode.hangulSyllableType.L);
+ }
+
+private:
+ alias ucmp = comparePropertyName;
+
+ static bool findAny(string name)
+ {
+ import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
+ return isPrettyPropertyName(name)
+ || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
+ || (ucmp(name[0 .. 2],"In") == 0 && findSetName!(blocks.tab)(name[2..$]));
+ }
+
+ static auto loadAny(Set=CodepointSet, C)(in C[] name) pure
+ {
+ import std.conv : to;
+ import std.internal.unicode_tables : blocks, scripts; // generated file
+ Set set;
+ immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
+ || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
+ && loadUnicodeSet!(blocks.tab)(name[2..$], set));
+ if (loaded)
+ return set;
+ throw new Exception("No unicode set by name "~name.to!string()~" was found.");
+ }
+
+ // FIXME: re-disable once the compiler is fixed
+ // Disabled to prevent the mistake of creating instances of this pseudo-struct.
+ //@disable ~this();
+}
+
+@safe unittest
+{
+ import std.internal.unicode_tables : blocks, uniProps; // generated file
+ assert(unicode("InHebrew") == asSet(blocks.Hebrew));
+ assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp)));
+ assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi));
+}
+
+enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
+
+// control - '\r'
+enum controlSwitch = `
+ case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':..
+ case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085':
+`;
+// TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
+// kill unrolled switches
+
+private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
+{
+ return ch >= '\U0001F1E6' && ch <= '\U0001F1FF';
+}
+
+template genericDecodeGrapheme(bool getValue)
+{
+ alias graphemeExtend = graphemeExtendTrie;
+ alias spacingMark = mcTrie;
+ static if (getValue)
+ alias Value = Grapheme;
+ else
+ alias Value = void;
+
+ Value genericDecodeGrapheme(Input)(ref Input range)
+ {
+ import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file
+ enum GraphemeState {
+ Start,
+ CR,
+ RI,
+ L,
+ V,
+ LVT
+ }
+ static if (getValue)
+ Grapheme grapheme;
+ auto state = GraphemeState.Start;
+ enum eat = q{
+ static if (getValue)
+ grapheme ~= ch;
+ range.popFront();
+ };
+
+ dchar ch;
+ assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
+ while (!range.empty)
+ {
+ ch = range.front;
+ final switch (state) with(GraphemeState)
+ {
+ case Start:
+ mixin(eat);
+ if (ch == '\r')
+ state = CR;
+ else if (isRegionalIndicator(ch))
+ state = RI;
+ else if (isHangL(ch))
+ state = L;
+ else if (hangLV[ch] || isHangV(ch))
+ state = V;
+ else if (hangLVT[ch])
+ state = LVT;
+ else if (isHangT(ch))
+ state = LVT;
+ else
+ {
+ switch (ch)
+ {
+ mixin(controlSwitch);
+ goto L_End;
+ default:
+ goto L_End_Extend;
+ }
+ }
+ break;
+ case CR:
+ if (ch == '\n')
+ mixin(eat);
+ goto L_End_Extend;
+ case RI:
+ if (isRegionalIndicator(ch))
+ mixin(eat);
+ else
+ goto L_End_Extend;
+ break;
+ case L:
+ if (isHangL(ch))
+ mixin(eat);
+ else if (isHangV(ch) || hangLV[ch])
+ {
+ state = V;
+ mixin(eat);
+ }
+ else if (hangLVT[ch])
+ {
+ state = LVT;
+ mixin(eat);
+ }
+ else
+ goto L_End_Extend;
+ break;
+ case V:
+ if (isHangV(ch))
+ mixin(eat);
+ else if (isHangT(ch))
+ {
+ state = LVT;
+ mixin(eat);
+ }
+ else
+ goto L_End_Extend;
+ break;
+ case LVT:
+ if (isHangT(ch))
+ {
+ mixin(eat);
+ }
+ else
+ goto L_End_Extend;
+ break;
+ }
+ }
+ L_End_Extend:
+ while (!range.empty)
+ {
+ ch = range.front;
+ // extend & spacing marks
+ if (!graphemeExtend[ch] && !spacingMark[ch])
+ break;
+ mixin(eat);
+ }
+ L_End:
+ static if (getValue)
+ return grapheme;
+ }
+
+}
+
+public: // Public API continues
+
+/++
+ Computes the length of grapheme cluster starting at $(D index).
+ Both the resulting length and the $(D index) are measured
+ in $(S_LINK Code unit, code units).
+
+ Params:
+ C = type that is implicitly convertible to $(D dchars)
+ input = array of grapheme clusters
+ index = starting index into $(D input[])
+
+ Returns:
+ length of grapheme cluster
++/
+size_t graphemeStride(C)(in C[] input, size_t index)
+if (is(C : dchar))
+{
+ auto src = input[index..$];
+ auto n = src.length;
+ genericDecodeGrapheme!(false)(src);
+ return n - src.length;
+}
+
+///
+@safe unittest
+{
+ assert(graphemeStride(" ", 1) == 1);
+ // A + combing ring above
+ string city = "A\u030Arhus";
+ size_t first = graphemeStride(city, 0);
+ assert(first == 3); //\u030A has 2 UTF-8 code units
+ assert(city[0 .. first] == "A\u030A");
+ assert(city[first..$] == "rhus");
+}
+
+/++
+ Reads one full grapheme cluster from an input range of dchar $(D inp).
+
+ For examples see the $(LREF Grapheme) below.
+
+ Note:
+ This function modifies $(D inp) and thus $(D inp)
+ must be an L-value.
++/
+Grapheme decodeGrapheme(Input)(ref Input inp)
+if (isInputRange!Input && is(Unqual!(ElementType!Input) == dchar))
+{
+ return genericDecodeGrapheme!true(inp);
+}
+
+@system unittest
+{
+ import std.algorithm.comparison : equal;
+
+ Grapheme gr;
+ string s = " \u0020\u0308 ";
+ gr = decodeGrapheme(s);
+ assert(gr.length == 1 && gr[0] == ' ');
+ gr = decodeGrapheme(s);
+ assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308"));
+ s = "\u0300\u0308\u1100";
+ assert(equal(decodeGrapheme(s)[], "\u0300\u0308"));
+ assert(equal(decodeGrapheme(s)[], "\u1100"));
+ s = "\u11A8\u0308\uAC01";
+ assert(equal(decodeGrapheme(s)[], "\u11A8\u0308"));
+ assert(equal(decodeGrapheme(s)[], "\uAC01"));
+}
+
+/++
+ $(P Iterate a string by grapheme.)
+
+ $(P Useful for doing string manipulation that needs to be aware
+ of graphemes.)
+
+ See_Also:
+ $(LREF byCodePoint)
++/
+auto byGrapheme(Range)(Range range)
+if (isInputRange!Range && is(Unqual!(ElementType!Range) == dchar))
+{
+ // TODO: Bidirectional access
+ static struct Result(R)
+ {
+ private R _range;
+ private Grapheme _front;
+
+ bool empty() @property
+ {
+ return _front.length == 0;
+ }
+
+ Grapheme front() @property
+ {
+ return _front;
+ }
+
+ void popFront()
+ {
+ _front = _range.empty ? Grapheme.init : _range.decodeGrapheme();
+ }
+
+ static if (isForwardRange!R)
+ {
+ Result save() @property
+ {
+ return Result(_range.save, _front);
+ }
+ }
+ }
+
+ auto result = Result!(Range)(range);
+ result.popFront();
+ return result;
+}
+
+///
+@safe unittest
+{
+ import std.algorithm.comparison : equal;
+ import std.range.primitives : walkLength;
+ import std.range : take, drop;
+ auto text = "noe\u0308l"; // noël using e + combining diaeresis
+ assert(text.walkLength == 5); // 5 code points
+
+ auto gText = text.byGrapheme;
+ assert(gText.walkLength == 4); // 4 graphemes
+
+ assert(gText.take(3).equal("noe\u0308".byGrapheme));
+ assert(gText.drop(3).equal("l".byGrapheme));
+}
+
+// For testing non-forward-range input ranges
+version (unittest)
+private static struct InputRangeString
+{
+ private string s;
+
+ bool empty() @property { return s.empty; }
+ dchar front() @property { return s.front; }
+ void popFront() { s.popFront(); }
+}
+
+@system unittest
+{
+ import std.algorithm.comparison : equal;
+ import std.array : array;
+ import std.range : retro;
+ import std.range.primitives : walkLength;
+ assert("".byGrapheme.walkLength == 0);
+
+ auto reverse = "le\u0308on";
+ assert(reverse.walkLength == 5);
+
+ auto gReverse = reverse.byGrapheme;
+ assert(gReverse.walkLength == 4);
+
+ foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
+ {
+ assert(text.walkLength == 5);
+ static assert(isForwardRange!(typeof(text)));
+
+ auto gText = text.byGrapheme;
+ static assert(isForwardRange!(typeof(gText)));
+ assert(gText.walkLength == 4);
+ assert(gText.array.retro.equal(gReverse));
+ }
+
+ auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme;
+ static assert(!isForwardRange!(typeof(nonForwardRange)));
+ assert(nonForwardRange.walkLength == 4);
+}
+
+/++
+ $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)
+
+ $(P Useful for converting the result to a string after doing operations
+ on graphemes.)
+
+ $(P Acts as the identity function when given a range of code points.)
++/
+auto byCodePoint(Range)(Range range)
+if (isInputRange!Range && is(Unqual!(ElementType!Range) == Grapheme))
+{
+ // TODO: Propagate bidirectional access
+ static struct Result
+ {
+ private Range _range;
+ private size_t i = 0;
+
+ bool empty() @property
+ {
+ return _range.empty;
+ }
+
+ dchar front() @property
+ {
+ return _range.front[i];
+ }
+
+ void popFront()
+ {
+ ++i;
+
+ if (i >= _range.front.length)
+ {
+ _range.popFront();
+ i = 0;
+ }
+ }
+
+ static if (isForwardRange!Range)
+ {
+ Result save() @property
+ {
+ return Result(_range.save, i);
+ }
+ }
+ }
+
+ return Result(range);
+}
+
+/// Ditto
+Range byCodePoint(Range)(Range range)
+if (isInputRange!Range && is(Unqual!(ElementType!Range) == dchar))
+{
+ return range;
+}
+
+///
+@safe unittest
+{
+ import std.array : array;
+ import std.conv : text;
+ import std.range : retro;
+
+ string s = "noe\u0308l"; // noël
+
+ // reverse it and convert the result to a string
+ string reverse = s.byGrapheme
+ .array
+ .retro
+ .byCodePoint
+ .text;
+
+ assert(reverse == "le\u0308on"); // lëon
+}
+
+@system unittest
+{
+ import std.algorithm.comparison : equal;
+ import std.range.primitives : walkLength;
+ assert("".byGrapheme.byCodePoint.equal(""));
+
+ string text = "noe\u0308l";
+ static assert(is(typeof(text.byCodePoint) == string));
+
+ auto gText = InputRangeString(text).byGrapheme;
+ static assert(!isForwardRange!(typeof(gText)));
+
+ auto cpText = gText.byCodePoint;
+ static assert(!isForwardRange!(typeof(cpText)));
+
+ assert(cpText.walkLength == text.walkLength);
+}
+
+@trusted:
+
+/++
+ $(P A structure designed to effectively pack $(CHARACTERS)
+ of a $(CLUSTER).
+ )
+
+ $(P $(D Grapheme) has value semantics so 2 copies of a $(D Grapheme)
+ always refer to distinct objects. In most actual scenarios a $(D Grapheme)
+ fits on the stack and avoids memory allocation overhead for all but quite
+ long clusters.
+ )
+
+ See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
++/
+@trusted struct Grapheme
+{
+ import std.traits : isDynamicArray;
+
+public:
+ /// Ctor
+ this(C)(in C[] chars...)
+ if (is(C : dchar))
+ {
+ this ~= chars;
+ }
+
+ ///ditto
+ this(Input)(Input seq)
+ if (!isDynamicArray!Input
+ && isInputRange!Input && is(ElementType!Input : dchar))
+ {
+ this ~= seq;
+ }
+
+ /// Gets a $(CODEPOINT) at the given index in this cluster.
+ dchar opIndex(size_t index) const pure nothrow @nogc
+ {
+ assert(index < length);
+ return read24(isBig ? ptr_ : small_.ptr, index);
+ }
+
+ /++
+ Writes a $(CODEPOINT) $(D ch) at given index in this cluster.
+
+ Warning:
+ Use of this facility may invalidate grapheme cluster,
+ see also $(LREF Grapheme.valid).
+ +/
+ void opIndexAssign(dchar ch, size_t index) pure nothrow @nogc
+ {
+ assert(index < length);
+ write24(isBig ? ptr_ : small_.ptr, ch, index);
+ }
+
+ ///
+ @safe unittest
+ {
+ auto g = Grapheme("A\u0302");
+ assert(g[0] == 'A');
+ assert(g.valid);
+ g[1] = '~'; // ASCII tilda is not a combining mark
+ assert(g[1] == '~');
+ assert(!g.valid);
+ }
+
+ /++
+ Random-access range over Grapheme's $(CHARACTERS).
+
+ Warning: Invalidates when this Grapheme leaves the scope,
+ attempts to use it then would lead to memory corruption.
+ +/
+ @system SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) pure nothrow @nogc
+ {
+ return sliceOverIndexed(a, b, &this);
+ }
+
+ /// ditto
+ @system SliceOverIndexed!Grapheme opSlice() pure nothrow @nogc
+ {
+ return sliceOverIndexed(0, length, &this);
+ }
+
+ /// Grapheme cluster length in $(CODEPOINTS).
+ @property size_t length() const pure nothrow @nogc
+ {
+ return isBig ? len_ : slen_ & 0x7F;
+ }
+
+ /++
+ Append $(CHARACTER) $(D ch) to this grapheme.
+ Warning:
+ Use of this facility may invalidate grapheme cluster,
+ see also $(D valid).
+
+ See_Also: $(LREF Grapheme.valid)
+ +/
+ ref opOpAssign(string op)(dchar ch)
+ {
+ static if (op == "~")
+ {
+ if (!isBig)
+ {
+ if (slen_ == small_cap)
+ convertToBig();// & fallthrough to "big" branch
+ else
+ {
+ write24(small_.ptr, ch, smallLength);
+ slen_++;
+ return this;
+ }
+ }
+
+ assert(isBig);
+ if (len_ == cap_)
+ {
+ import core.checkedint : addu, mulu;
+ bool overflow;
+ cap_ = addu(cap_, grow, overflow);
+ auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
+ if (overflow) assert(0);
+ ptr_ = cast(ubyte*) pureRealloc(ptr_, nelems);
+ if (ptr_ is null) onOutOfMemoryError();
+ }
+ write24(ptr_, ch, len_++);
+ return this;
+ }
+ else
+ static assert(false, "No operation "~op~" defined for Grapheme");
+ }
+
+ ///
+ @system unittest
+ {
+ import std.algorithm.comparison : equal;
+ auto g = Grapheme("A");
+ assert(g.valid);
+ g ~= '\u0301';
+ assert(g[].equal("A\u0301"));
+ assert(g.valid);
+ g ~= "B";
+ // not a valid grapheme cluster anymore
+ assert(!g.valid);
+ // still could be useful though
+ assert(g[].equal("A\u0301B"));
+ }
+
+ /// Append all $(CHARACTERS) from the input range $(D inp) to this Grapheme.
+ ref opOpAssign(string op, Input)(Input inp)
+ if (isInputRange!Input && is(ElementType!Input : dchar))
+ {
+ static if (op == "~")
+ {
+ foreach (dchar ch; inp)
+ this ~= ch;
+ return this;
+ }
+ else
+ static assert(false, "No operation "~op~" defined for Grapheme");
+ }
+
+ /++
+ True if this object contains valid extended grapheme cluster.
+ Decoding primitives of this module always return a valid $(D Grapheme).
+
+ Appending to and direct manipulation of grapheme's $(CHARACTERS) may
+ render it no longer valid. Certain applications may chose to use
+ Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
+ entirely.
+ +/
+ @property bool valid()() /*const*/
+ {
+ auto r = this[];
+ genericDecodeGrapheme!false(r);
+ return r.length == 0;
+ }
+
+ this(this) pure @nogc nothrow
+ {
+ if (isBig)
+ {// dup it
+ import core.checkedint : addu, mulu;
+ bool overflow;
+ auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
+ if (overflow) assert(0);
+
+ auto p = cast(ubyte*) pureMalloc(raw_cap);
+ if (p is null) onOutOfMemoryError();
+ p[0 .. raw_cap] = ptr_[0 .. raw_cap];
+ ptr_ = p;
+ }
+ }
+
+ ~this() pure @nogc nothrow
+ {
+ if (isBig)
+ {
+ pureFree(ptr_);
+ }
+ }
+
+
+private:
+ enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
+ // "out of the blue" grow rate, needs testing
+ // (though graphemes are typically small < 9)
+ enum grow = 20;
+ enum small_cap = small_bytes/3;
+ enum small_flag = 0x80, small_mask = 0x7F;
+ // 16 bytes in 32bits, should be enough for the majority of cases
+ union
+ {
+ struct
+ {
+ ubyte* ptr_;
+ size_t cap_;
+ size_t len_;
+ size_t padding_;
+ }
+ struct
+ {
+ ubyte[small_bytes] small_;
+ ubyte slen_;
+ }
+ }
+
+ void convertToBig() pure @nogc nothrow
+ {
+ static assert(grow.max / 3 - 1 >= grow);
+ enum nbytes = 3 * (grow + 1);
+ size_t k = smallLength;
+ ubyte* p = cast(ubyte*) pureMalloc(nbytes);
+ if (p is null) onOutOfMemoryError();
+ for (int i=0; i<k; i++)
+ write24(p, read24(small_.ptr, i), i);
+ // now we can overwrite small array data
+ ptr_ = p;
+ len_ = slen_;
+ assert(grow > len_);
+ cap_ = grow;
+ setBig();
+ }
+
+ void setBig() pure nothrow @nogc { slen_ |= small_flag; }
+
+ @property size_t smallLength() const pure nothrow @nogc
+ {
+ return slen_ & small_mask;
+ }
+ @property ubyte isBig() const pure nothrow @nogc
+ {
+ return slen_ & small_flag;
+ }
+}
+
+static assert(Grapheme.sizeof == size_t.sizeof*4);
+
+
+@system pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
+{
+ import std.algorithm.comparison : equal;
+ Grapheme[3] data = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")];
+ assert(byGrapheme("ЮУЗ").equal(data[]));
+}
+
+///
+@system unittest
+{
+ import std.algorithm.comparison : equal;
+ import std.algorithm.iteration : filter;
+ import std.range : isRandomAccessRange;
+
+ string bold = "ku\u0308hn";
+
+ // note that decodeGrapheme takes parameter by ref
+ auto first = decodeGrapheme(bold);
+
+ assert(first.length == 1);
+ assert(first[0] == 'k');
+
+ // the next grapheme is 2 characters long
+ auto wideOne = decodeGrapheme(bold);
+ // slicing a grapheme yields a random-access range of dchar
+ assert(wideOne[].equal("u\u0308"));
+ assert(wideOne.length == 2);
+ static assert(isRandomAccessRange!(typeof(wideOne[])));
+
+ // all of the usual range manipulation is possible
+ assert(wideOne[].filter!isMark().equal("\u0308"));
+
+ auto g = Grapheme("A");
+ assert(g.valid);
+ g ~= '\u0301';
+ assert(g[].equal("A\u0301"));
+ assert(g.valid);
+ g ~= "B";
+ // not a valid grapheme cluster anymore
+ assert(!g.valid);
+ // still could be useful though
+ assert(g[].equal("A\u0301B"));
+}
+
+@safe unittest
+{
+ auto g = Grapheme("A\u0302");
+ assert(g[0] == 'A');
+ assert(g.valid);
+ g[1] = '~'; // ASCII tilda is not a combining mark
+ assert(g[1] == '~');
+ assert(!g.valid);
+}
+
+@system unittest
+{
+ import std.algorithm.comparison : equal;
+ import std.algorithm.iteration : map;
+ import std.conv : text;
+ import std.range : iota;
+
+ // not valid clusters (but it just a test)
+ auto g = Grapheme('a', 'b', 'c', 'd', 'e');
+ assert(g[0] == 'a');
+ assert(g[1] == 'b');
+ assert(g[2] == 'c');
+ assert(g[3] == 'd');
+ assert(g[4] == 'e');
+ g[3] = 'Й';
+ assert(g[2] == 'c');
+ assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
+ assert(g[4] == 'e');
+ assert(!g.valid);
+
+ g ~= 'ц';
+ g ~= '~';
+ assert(g[0] == 'a');
+ assert(g[1] == 'b');
+ assert(g[2] == 'c');
+ assert(g[3] == 'Й');
+ assert(g[4] == 'e');
+ assert(g[5] == 'ц');
+ assert(g[6] == '~');
+ assert(!g.valid);
+
+ Grapheme copy = g;
+ copy[0] = 'X';
+ copy[1] = '-';
+ assert(g[0] == 'a' && copy[0] == 'X');
+ assert(g[1] == 'b' && copy[1] == '-');
+ assert(equal(g[2 .. g.length], copy[2 .. copy.length]));
+ copy = Grapheme("АБВГДЕЁЖЗИКЛМ");
+ assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8]));
+ copy ~= "xyz";
+ assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15]));
+ assert(!copy.valid);
+
+ Grapheme h;
+ foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"())
+ h ~= v;
+ assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1)));
+}
+
+/++
+ $(P Does basic case-insensitive comparison of $(D r1) and $(D r2).
+ This function uses simpler comparison rule thus achieving better performance
+ than $(LREF icmp). However keep in mind the warning below.)
+
+ Params:
+ r1 = an input range of characters
+ r2 = an input range of characters
+
+ Returns:
+ An $(D int) that is 0 if the strings match,
+ &lt;0 if $(D r1) is lexicographically "less" than $(D r2),
+ &gt;0 if $(D r1) is lexicographically "greater" than $(D r2)
+
+ Warning:
+ This function only handles 1:1 $(CODEPOINT) mapping
+ and thus is not sufficient for certain alphabets
+ like German, Greek and few others.
+
+ See_Also:
+ $(LREF icmp)
+ $(REF cmp, std,algorithm,comparison)
++/
+int sicmp(S1, S2)(S1 r1, S2 r2)
+if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1)
+ && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2))
+{
+ import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file
+ import std.utf : byDchar;
+
+ auto str1 = r1.byDchar;
+ auto str2 = r2.byDchar;
+
+ foreach (immutable lhs; str1)
+ {
+ if (str2.empty)
+ return 1;
+ immutable rhs = str2.front;
+ str2.popFront();
+ int diff = lhs - rhs;
+ if (!diff)
+ continue;
+ size_t idx = simpleCaseTrie[lhs];
+ size_t idx2 = simpleCaseTrie[rhs];
+ // simpleCaseTrie is packed index table
+ if (idx != EMPTY_CASE_TRIE)
+ {
+ if (idx2 != EMPTY_CASE_TRIE)
+ {// both cased chars
+ // adjust idx --> start of bucket
+ idx = idx - sTable[idx].n;
+ idx2 = idx2 - sTable[idx2].n;
+ if (idx == idx2)// one bucket, equivalent chars
+ continue;
+ else// not the same bucket
+ diff = sTable[idx].ch - sTable[idx2].ch;
+ }
+ else
+ diff = sTable[idx - sTable[idx].n].ch - rhs;
+ }
+ else if (idx2 != EMPTY_CASE_TRIE)
+ {
+ diff = lhs - sTable[idx2 - sTable[idx2].n].ch;
+ }
+ // one of chars is not cased at all
+ return diff;
+ }
+ return str2.empty ? 0 : -1;
+}
+
+///
+@safe @nogc pure nothrow unittest
+{
+ assert(sicmp("Август", "авгусТ") == 0);
+ // Greek also works as long as there is no 1:M mapping in sight
+ assert(sicmp("ΌΎ", "όύ") == 0);
+ // things like the following won't get matched as equal
+ // Greek small letter iota with dialytika and tonos
+ assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
+
+ // while icmp has no problem with that
+ assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
+ assert(icmp("ΌΎ", "όύ") == 0);
+}
+
+// overloads for the most common cases to reduce compile time
+@safe @nogc pure nothrow
+{
+ int sicmp(const(char)[] str1, const(char)[] str2)
+ { return sicmp!(const(char)[], const(char)[])(str1, str2); }
+ int sicmp(const(wchar)[] str1, const(wchar)[] str2)
+ { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); }
+ int sicmp(const(dchar)[] str1, const(dchar)[] str2)
+ { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); }
+}
+
+private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
+{
+ import std.algorithm.searching : skipOver;
+ import std.internal.unicode_tables : fullCaseTable; // generated file
+ alias fTable = fullCaseTable;
+ size_t idx = fullCaseTrie[lhs];
+ // fullCaseTrie is packed index table
+ if (idx == EMPTY_CASE_TRIE)
+ return lhs;
+ immutable start = idx - fTable[idx].n;
+ immutable end = fTable[idx].size + start;
+ assert(fTable[start].entry_len == 1);
+ for (idx=start; idx<end; idx++)
+ {
+ auto entryLen = fTable[idx].entry_len;
+ if (entryLen == 1)
+ {
+ if (fTable[idx].seq[0] == rhs)
+ {
+ return 0;
+ }
+ }
+ else
+ {// OK it's a long chunk, like 'ss' for German
+ dstring seq = fTable[idx].seq[0 .. entryLen];
+ if (rhs == seq[0]
+ && rtail.skipOver(seq[1..$]))
+ {
+ // note that this path modifies rtail
+ // iff we managed to get there
+ return 0;
+ }
+ }
+ }
+ return fTable[start].seq[0]; // new remapped character for accurate diffs
+}
+
+/++
+ Does case insensitive comparison of `r1` and `r2`.
+ Follows the rules of full case-folding mapping.
+ This includes matching as equal german ß with "ss" and
+ other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
+ The cost of `icmp` being pedantically correct is
+ slightly worse performance.
+
+ Params:
+ r1 = a forward range of characters
+ r2 = a forward range of characters
+
+ Returns:
+ An $(D int) that is 0 if the strings match,
+ &lt;0 if $(D str1) is lexicographically "less" than $(D str2),
+ &gt;0 if $(D str1) is lexicographically "greater" than $(D str2)
+
+ See_Also:
+ $(LREF sicmp)
+ $(REF cmp, std,algorithm,comparison)
++/
+int icmp(S1, S2)(S1 r1, S2 r2)
+if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
+ && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
+{
+ import std.utf : byDchar;
+
+ auto str1 = r1.byDchar;
+ auto str2 = r2.byDchar;
+
+ for (;;)
+ {
+ if (str1.empty)
+ return str2.empty ? 0 : -1;
+ immutable lhs = str1.front;
+ if (str2.empty)
+ return 1;
+ immutable rhs = str2.front;
+ str1.popFront();
+ str2.popFront();
+ if (!(lhs - rhs))
+ continue;
+ // first try to match lhs to <rhs,right-tail> sequence
+ immutable cmpLR = fullCasedCmp(lhs, rhs, str2);
+ if (!cmpLR)
+ continue;
+ // then rhs to <lhs,left-tail> sequence
+ immutable cmpRL = fullCasedCmp(rhs, lhs, str1);
+ if (!cmpRL)
+ continue;
+ // cmpXX contain remapped codepoints
+ // to obtain stable ordering of icmp
+ return cmpLR - cmpRL;
+ }
+}
+
+///
+@safe @nogc pure nothrow unittest
+{
+ assert(icmp("Rußland", "Russland") == 0);
+ assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
+}
+
+/**
+ * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding
+ * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`.
+ */
+@safe @nogc nothrow pure unittest
+{
+ import std.utf : byDchar;
+
+ assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0);
+ assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0);
+}
+
+// test different character types
+@safe unittest
+{
+ assert(icmp("Rußland", "Russland") == 0);
+ assert(icmp("Rußland"w, "Russland") == 0);
+ assert(icmp("Rußland", "Russland"w) == 0);
+ assert(icmp("Rußland"w, "Russland"w) == 0);
+ assert(icmp("Rußland"d, "Russland"w) == 0);
+ assert(icmp("Rußland"w, "Russland"d) == 0);
+}
+
+// overloads for the most common cases to reduce compile time
+@safe @nogc pure nothrow
+{
+ int icmp(const(char)[] str1, const(char)[] str2)
+ { return icmp!(const(char)[], const(char)[])(str1, str2); }
+ int icmp(const(wchar)[] str1, const(wchar)[] str2)
+ { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); }
+ int icmp(const(dchar)[] str1, const(dchar)[] str2)
+ { return icmp!(const(dchar)[], const(dchar)[])(str1, str2); }
+}
+
+@safe unittest
+{
+ import std.algorithm.sorting : sort;
+ import std.conv : to;
+ import std.exception : assertCTFEable;
+ assertCTFEable!(
+ {
+ foreach (cfunc; AliasSeq!(icmp, sicmp))
+ {
+ foreach (S1; AliasSeq!(string, wstring, dstring))
+ foreach (S2; AliasSeq!(string, wstring, dstring))
+ (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396
+ assert(cfunc("".to!S1(), "".to!S2()) == 0);
+ assert(cfunc("A".to!S1(), "".to!S2()) > 0);
+ assert(cfunc("".to!S1(), "0".to!S2()) < 0);
+ assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0);
+ assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0);
+ assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0);
+ assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0);
+ assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0);
+ // Check example:
+ assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0);
+ assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0);
+ }();
+ // check that the order is properly agnostic to the case
+ auto strs = [ "Apple", "ORANGE", "orAcle", "amp", "banana"];
+ sort!((a,b) => cfunc(a,b) < 0)(strs);
+ assert(strs == ["amp", "Apple", "banana", "orAcle", "ORANGE"]);
+ }
+ assert(icmp("ßb", "ssa") > 0);
+ // Check example:
+ assert(icmp("Russland", "Rußland") == 0);
+ assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
+ assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
+ assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
+ //bugzilla 11057
+ assert( icmp("K", "L") < 0 );
+ });
+}
+
+// issue 17372
+@safe pure unittest
+{
+ import std.algorithm.iteration : joiner, map;
+ import std.algorithm.sorting : sort;
+ import std.array : array;
+ auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0);
+}
+
+// This is package for the moment to be used as a support tool for std.regex
+// It needs a better API
+/*
+ Return a range of all $(CODEPOINTS) that casefold to
+ and from this $(D ch).
+*/
+package auto simpleCaseFoldings(dchar ch) @safe
+{
+ import std.internal.unicode_tables : simpleCaseTable; // generated file
+ alias sTable = simpleCaseTable;
+ static struct Range
+ {
+ @safe pure nothrow:
+ uint idx; //if == uint.max, then read c.
+ union
+ {
+ dchar c; // == 0 - empty range
+ uint len;
+ }
+ @property bool isSmall() const { return idx == uint.max; }
+
+ this(dchar ch)
+ {
+ idx = uint.max;
+ c = ch;
+ }
+
+ this(uint start, uint size)
+ {
+ idx = start;
+ len = size;
+ }
+
+ @property dchar front() const
+ {
+ assert(!empty);
+ if (isSmall)
+ {
+ return c;
+ }
+ auto ch = sTable[idx].ch;
+ return ch;
+ }
+
+ @property bool empty() const
+ {
+ if (isSmall)
+ {
+ return c == 0;
+ }
+ return len == 0;
+ }
+
+ @property size_t length() const
+ {
+ if (isSmall)
+ {
+ return c == 0 ? 0 : 1;
+ }
+ return len;
+ }
+
+ void popFront()
+ {
+ if (isSmall)
+ c = 0;
+ else
+ {
+ idx++;
+ len--;
+ }
+ }
+ }
+ immutable idx = simpleCaseTrie[ch];
+ if (idx == EMPTY_CASE_TRIE)
+ return Range(ch);
+ auto entry = sTable[idx];
+ immutable start = idx - entry.n;
+ return Range(start, entry.size);
+}
+
+@system unittest
+{
+ import std.algorithm.comparison : equal;
+ import std.algorithm.searching : canFind;
+ import std.array : array;
+ import std.exception : assertCTFEable;
+ assertCTFEable!((){
+ auto r = simpleCaseFoldings('Э').array;
+ assert(r.length == 2);
+ assert(r.canFind('э') && r.canFind('Э'));
+ auto sr = simpleCaseFoldings('~');
+ assert(sr.equal("~"));
+ //A with ring above - casefolds to the same bucket as Angstrom sign
+ sr = simpleCaseFoldings('Å');
+ assert(sr.length == 3);
+ assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
+ });
+}
+
+/++
+ $(P Returns the $(S_LINK Combining class, combining class) of $(D ch).)
++/
+ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
+{
+ return combiningClassTrie[ch];
+}
+
+///
+@safe unittest
+{
+ // shorten the code
+ alias CC = combiningClass;
+
+ // combining tilda
+ assert(CC('\u0303') == 230);
+ // combining ring below
+ assert(CC('\u0325') == 220);
+ // the simple consequence is that "tilda" should be
+ // placed after a "ring below" in a sequence
+}
+
+@safe pure nothrow @nogc unittest
+{
+ foreach (ch; 0 .. 0x80)
+ assert(combiningClass(ch) == 0);
+ assert(combiningClass('\u05BD') == 22);
+ assert(combiningClass('\u0300') == 230);
+ assert(combiningClass('\u0317') == 220);
+ assert(combiningClass('\u1939') == 222);
+}
+
+/// Unicode character decomposition type.
+enum UnicodeDecomposition {
+ /// Canonical decomposition. The result is canonically equivalent sequence.
+ Canonical,
+ /**
+ Compatibility decomposition. The result is compatibility equivalent sequence.
+ Note: Compatibility decomposition is a $(B lossy) conversion,
+ typically suitable only for fuzzy matching and internal processing.
+ */
+ Compatibility
+}
+
+/**
+ Shorthand aliases for character decomposition type, passed as a
+ template parameter to $(LREF decompose).
+*/
+enum {
+ Canonical = UnicodeDecomposition.Canonical,
+ Compatibility = UnicodeDecomposition.Compatibility
+}
+
+/++
+ Try to canonically compose 2 $(CHARACTERS).
+ Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.
+
+ The assumption is that $(D first) comes before $(D second) in the original text,
+ usually meaning that the first is a starter.
+
+ Note: Hangul syllables are not covered by this function.
+ See $(D composeJamo) below.
++/
+public dchar compose(dchar first, dchar second) pure nothrow @safe
+{
+ import std.algorithm.iteration : map;
+ import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;
+ import std.range : assumeSorted;
+ immutable packed = compositionJumpTrie[first];
+ if (packed == ushort.max)
+ return dchar.init;
+ // unpack offset and length
+ immutable idx = packed & composeIdxMask, cnt = packed >> composeCntShift;
+ // TODO: optimize this micro binary search (no more then 4-5 steps)
+ auto r = compositionTable[idx .. idx+cnt].map!"a.rhs"().assumeSorted();
+ immutable target = r.lowerBound(second).length;
+ if (target == cnt)
+ return dchar.init;
+ immutable entry = compositionTable[idx+target];
+ if (entry.rhs != second)
+ return dchar.init;
+ return entry.composed;
+}
+
+///
+@safe unittest
+{
+ assert(compose('A','\u0308') == '\u00C4');
+ assert(compose('A', 'B') == dchar.init);
+ assert(compose('C', '\u0301') == '\u0106');
+ // note that the starter is the first one
+ // thus the following doesn't compose
+ assert(compose('\u0308', 'A') == dchar.init);
+}
+
+/++
+ Returns a full $(S_LINK Canonical decomposition, Canonical)
+ (by default) or $(S_LINK Compatibility decomposition, Compatibility)
+ decomposition of $(CHARACTER) $(D ch).
+ If no decomposition is available returns a $(LREF Grapheme)
+ with the $(D ch) itself.
+
+ Note:
+ This function also decomposes hangul syllables
+ as prescribed by the standard.
+
+ See_Also: $(LREF decomposeHangul) for a restricted version
+ that takes into account only hangul syllables but
+ no other decompositions.
++/
+public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe
+{
+ import std.algorithm.searching : until;
+ import std.internal.unicode_decomp : decompCompatTable, decompCanonTable;
+ static if (decompType == Canonical)
+ {
+ alias table = decompCanonTable;
+ alias mapping = canonMappingTrie;
+ }
+ else static if (decompType == Compatibility)
+ {
+ alias table = decompCompatTable;
+ alias mapping = compatMappingTrie;
+ }
+ immutable idx = mapping[ch];
+ if (!idx) // not found, check hangul arithmetic decomposition
+ return decomposeHangul(ch);
+ auto decomp = table[idx..$].until(0);
+ return Grapheme(decomp);
+}
+
+///
+@system unittest
+{
+ import std.algorithm.comparison : equal;
+
+ assert(compose('A','\u0308') == '\u00C4');
+ assert(compose('A', 'B') == dchar.init);
+ assert(compose('C', '\u0301') == '\u0106');
+ // note that the starter is the first one
+ // thus the following doesn't compose
+ assert(compose('\u0308', 'A') == dchar.init);
+
+ assert(decompose('Ĉ')[].equal("C\u0302"));
+ assert(decompose('D')[].equal("D"));
+ assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
+ assert(decompose!Compatibility('¹')[].equal("1"));
+}
+
+//----------------------------------------------------------------------------
+// Hangul specific composition/decomposition
+enum jamoSBase = 0xAC00;
+enum jamoLBase = 0x1100;
+enum jamoVBase = 0x1161;
+enum jamoTBase = 0x11A7;
+enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28;
+enum jamoNCount = jamoVCount * jamoTCount;
+enum jamoSCount = jamoLCount * jamoNCount;
+
+// Tests if $(D ch) is a Hangul leading consonant jamo.
+bool isJamoL(dchar ch) pure nothrow @nogc @safe
+{
+ // first cmp rejects ~ 1M code points above leading jamo range
+ return ch < jamoLBase+jamoLCount && ch >= jamoLBase;
+}
+
+// Tests if $(D ch) is a Hangul vowel jamo.
+bool isJamoT(dchar ch) pure nothrow @nogc @safe
+{
+ // first cmp rejects ~ 1M code points above trailing jamo range
+ // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0)
+ return ch < jamoTBase+jamoTCount && ch > jamoTBase;
+}
+
+// Tests if $(D ch) is a Hangul trailnig consonant jamo.
+bool isJamoV(dchar ch) pure nothrow @nogc @safe
+{
+ // first cmp rejects ~ 1M code points above vowel range
+ return ch < jamoVBase+jamoVCount && ch >= jamoVBase;
+}
+
+int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
+{
+ int idxS = cast(int) ch - jamoSBase;
+ return idxS >= 0 && idxS < jamoSCount ? idxS : -1;
+}
+
+// internal helper: compose hangul syllables leaving dchar.init in holes
+void hangulRecompose(dchar[] seq) pure nothrow @nogc @safe
+{
+ for (size_t idx = 0; idx + 1 < seq.length; )
+ {
+ if (isJamoL(seq[idx]) && isJamoV(seq[idx+1]))
+ {
+ immutable int indexL = seq[idx] - jamoLBase;
+ immutable int indexV = seq[idx+1] - jamoVBase;
+ immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount;
+ if (idx + 2 < seq.length && isJamoT(seq[idx+2]))
+ {
+ seq[idx] = jamoSBase + indexLV + seq[idx+2] - jamoTBase;
+ seq[idx+1] = dchar.init;
+ seq[idx+2] = dchar.init;
+ idx += 3;
+ }
+ else
+ {
+ seq[idx] = jamoSBase + indexLV;
+ seq[idx+1] = dchar.init;
+ idx += 2;
+ }
+ }
+ else
+ idx++;
+ }
+}
+
+//----------------------------------------------------------------------------
+public:
+
+/**
+ Decomposes a Hangul syllable. If $(D ch) is not a composed syllable
+ then this function returns $(LREF Grapheme) containing only $(D ch) as is.
+*/
+Grapheme decomposeHangul(dchar ch) @safe
+{
+ immutable idxS = cast(int) ch - jamoSBase;
+ if (idxS < 0 || idxS >= jamoSCount) return Grapheme(ch);
+ immutable idxL = idxS / jamoNCount;
+ immutable idxV = (idxS % jamoNCount) / jamoTCount;
+ immutable idxT = idxS % jamoTCount;
+
+ immutable partL = jamoLBase + idxL;
+ immutable partV = jamoVBase + idxV;
+ if (idxT > 0) // there is a trailling consonant (T); <L,V,T> decomposition
+ return Grapheme(partL, partV, jamoTBase + idxT);
+ else // <L, V> decomposition
+ return Grapheme(partL, partV);
+}
+
+///
+@system unittest
+{
+ import std.algorithm.comparison : equal;
+ assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
+}
+
+/++
+ Try to compose hangul syllable out of a leading consonant ($(D lead)),
+ a $(D vowel) and optional $(D trailing) consonant jamos.
+
+ On success returns the composed LV or LVT hangul syllable.
+
+ If any of $(D lead) and $(D vowel) are not a valid hangul jamo
+ of the respective $(CHARACTER) class returns dchar.init.
++/
+dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe
+{
+ if (!isJamoL(lead))
+ return dchar.init;
+ immutable indexL = lead - jamoLBase;
+ if (!isJamoV(vowel))
+ return dchar.init;
+ immutable indexV = vowel - jamoVBase;
+ immutable indexLV = indexL * jamoNCount + indexV * jamoTCount;
+ immutable dchar syllable = jamoSBase + indexLV;
+ return isJamoT(trailing) ? syllable + (trailing - jamoTBase) : syllable;
+}
+
+///
+@safe unittest
+{
+ assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
+ // leaving out T-vowel, or passing any codepoint
+ // that is not trailing consonant composes an LV-syllable
+ assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
+ assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
+ assert(composeJamo('\u1111', 'A') == dchar.init);
+ assert(composeJamo('A', '\u1171') == dchar.init);
+}
+
+@system unittest
+{
+ import std.algorithm.comparison : equal;
+ import std.conv : text;
+
+ static void testDecomp(UnicodeDecomposition T)(dchar ch, string r)
+ {
+ Grapheme g = decompose!T(ch);
+ assert(equal(g[], r), text(g[], " vs ", r));
+ }
+ testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
+ testDecomp!Canonical('\uF907', "\u9F9C");
+ testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
+ testDecomp!Compatibility('\uA7F9', "\u0153");
+
+ // check examples
+ assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
+ assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
+ assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out T-vowel
+ assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
+ assert(composeJamo('\u1111', 'A') == dchar.init);
+ assert(composeJamo('A', '\u1171') == dchar.init);
+}
+
+/**
+ Enumeration type for normalization forms,
+ passed as template parameter for functions like $(LREF normalize).
+*/
+enum NormalizationForm {
+ NFC,
+ NFD,
+ NFKC,
+ NFKD
+}
+
+
+enum {
+ /**
+ Shorthand aliases from values indicating normalization forms.
+ */
+ NFC = NormalizationForm.NFC,
+ ///ditto
+ NFD = NormalizationForm.NFD,
+ ///ditto
+ NFKC = NormalizationForm.NFKC,
+ ///ditto
+ NFKD = NormalizationForm.NFKD
+}
+
+/++
+ Returns $(D input) string normalized to the chosen form.
+ Form C is used by default.
+
+ For more information on normalization forms see
+ the $(S_LINK Normalization, normalization section).
+
+ Note:
+ In cases where the string in question is already normalized,
+ it is returned unmodified and no memory allocation happens.
++/
+inout(C)[] normalize(NormalizationForm norm=NFC, C)(inout(C)[] input)
+{
+ import std.algorithm.mutation : SwapStrategy;
+ import std.algorithm.sorting : sort;
+ import std.array : appender;
+ import std.range : zip;
+
+ auto anchors = splitNormalized!norm(input);
+ if (anchors[0] == input.length && anchors[1] == input.length)
+ return input;
+ dchar[] decomposed;
+ decomposed.reserve(31);
+ ubyte[] ccc;
+ ccc.reserve(31);
+ auto app = appender!(C[])();
+ do
+ {
+ app.put(input[0 .. anchors[0]]);
+ foreach (dchar ch; input[anchors[0]..anchors[1]])
+ static if (norm == NFD || norm == NFC)
+ {
+ foreach (dchar c; decompose!Canonical(ch)[])
+ decomposed ~= c;
+ }
+ else // NFKD & NFKC
+ {
+ foreach (dchar c; decompose!Compatibility(ch)[])
+ decomposed ~= c;
+ }
+ ccc.length = decomposed.length;
+ size_t firstNonStable = 0;
+ ubyte lastClazz = 0;
+
+ foreach (idx, dchar ch; decomposed)
+ {
+ immutable clazz = combiningClass(ch);
+ ccc[idx] = clazz;
+ if (clazz == 0 && lastClazz != 0)
+ {
+ // found a stable code point after unstable ones
+ sort!("a[0] < b[0]", SwapStrategy.stable)
+ (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx]));
+ firstNonStable = decomposed.length;
+ }
+ else if (clazz != 0 && lastClazz == 0)
+ {
+ // found first unstable code point after stable ones
+ firstNonStable = idx;
+ }
+ lastClazz = clazz;
+ }
+ sort!("a[0] < b[0]", SwapStrategy.stable)
+ (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$]));
+ static if (norm == NFC || norm == NFKC)
+ {
+ import std.algorithm.searching : countUntil;
+ auto first = countUntil(ccc, 0);
+ if (first >= 0) // no starters?? no recomposition
+ {
+ for (;;)
+ {
+ immutable second = recompose(first, decomposed, ccc);
+ if (second == decomposed.length)
+ break;
+ first = second;
+ }
+ // 2nd pass for hangul syllables
+ hangulRecompose(decomposed);
+ }
+ }
+ static if (norm == NFD || norm == NFKD)
+ app.put(decomposed);
+ else
+ {
+ import std.algorithm.mutation : remove;
+ auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed);
+ app.put(decomposed[0 .. clean.length]);
+ }
+ // reset variables
+ decomposed.length = 0;
+ decomposed.assumeSafeAppend();
+ ccc.length = 0;
+ ccc.assumeSafeAppend();
+ input = input[anchors[1]..$];
+ // and move on
+ anchors = splitNormalized!norm(input);
+ }while (anchors[0] != input.length);
+ app.put(input[0 .. anchors[0]]);
+ return cast(inout(C)[])app.data;
+}
+
+///
+@safe unittest
+{
+ // any encoding works
+ wstring greet = "Hello world";
+ assert(normalize(greet) is greet); // the same exact slice
+
+ // An example of a character with all 4 forms being different:
+ // Greek upsilon with acute and hook symbol (code point 0x03D3)
+ assert(normalize!NFC("ϓ") == "\u03D3");
+ assert(normalize!NFD("ϓ") == "\u03D2\u0301");
+ assert(normalize!NFKC("ϓ") == "\u038E");
+ assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
+}
+
+@safe unittest
+{
+ import std.conv : text;
+
+ assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def")));
+ assert(normalize!NFKD("2¹⁰") == "210", normalize!NFKD("2¹⁰"));
+ assert(normalize!NFD("Äffin") == "A\u0308ffin");
+
+ // check example
+
+ // any encoding works
+ wstring greet = "Hello world";
+ assert(normalize(greet) is greet); // the same exact slice
+
+ // An example of a character with all 4 forms being different:
+ // Greek upsilon with acute and hook symbol (code point 0x03D3)
+ assert(normalize!NFC("ϓ") == "\u03D3");
+ assert(normalize!NFD("ϓ") == "\u03D2\u0301");
+ assert(normalize!NFKC("ϓ") == "\u038E");
+ assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
+}
+
+// canonically recompose given slice of code points, works in-place and mutates data
+private size_t recompose(size_t start, dchar[] input, ubyte[] ccc) pure nothrow @safe
+{
+ assert(input.length == ccc.length);
+ int accumCC = -1;// so that it's out of 0 .. 255 range
+ // writefln("recomposing %( %04x %)", input);
+ // first one is always a starter thus we start at i == 1
+ size_t i = start+1;
+ for (; ; )
+ {
+ if (i == input.length)
+ break;
+ immutable curCC = ccc[i];
+ // In any character sequence beginning with a starter S
+ // a character C is blocked from S if and only if there
+ // is some character B between S and C, and either B
+ // is a starter or it has the same or higher combining class as C.
+ //------------------------
+ // Applying to our case:
+ // S is input[0]
+ // accumCC is the maximum CCC of characters between C and S,
+ // as ccc are sorted
+ // C is input[i]
+
+ if (curCC > accumCC)
+ {
+ immutable comp = compose(input[start], input[i]);
+ if (comp != dchar.init)
+ {
+ input[start] = comp;
+ input[i] = dchar.init;// put a sentinel
+ // current was merged so its CCC shouldn't affect
+ // composing with the next one
+ }
+ else
+ {
+ // if it was a starter then accumCC is now 0, end of loop
+ accumCC = curCC;
+ if (accumCC == 0)
+ break;
+ }
+ }
+ else
+ {
+ // ditto here
+ accumCC = curCC;
+ if (accumCC == 0)
+ break;
+ }
+ i++;
+ }
+ return i;
+}
+
+// returns tuple of 2 indexes that delimit:
+// normalized text, piece that needs normalization and
+// the rest of input starting with stable code point
+private auto splitNormalized(NormalizationForm norm, C)(const(C)[] input)
+{
+ import std.typecons : tuple;
+ ubyte lastCC = 0;
+
+ foreach (idx, dchar ch; input)
+ {
+ static if (norm == NFC)
+ if (ch < 0x0300)
+ {
+ lastCC = 0;
+ continue;
+ }
+ immutable ubyte CC = combiningClass(ch);
+ if (lastCC > CC && CC != 0)
+ {
+ return seekStable!norm(idx, input);
+ }
+
+ if (notAllowedIn!norm(ch))
+ {
+ return seekStable!norm(idx, input);
+ }
+ lastCC = CC;
+ }
+ return tuple(input.length, input.length);
+}
+
+private auto seekStable(NormalizationForm norm, C)(size_t idx, in C[] input)
+{
+ import std.typecons : tuple;
+ import std.utf : codeLength;
+
+ auto br = input[0 .. idx];
+ size_t region_start = 0;// default
+ for (;;)
+ {
+ if (br.empty)// start is 0
+ break;
+ dchar ch = br.back;
+ if (combiningClass(ch) == 0 && allowedIn!norm(ch))
+ {
+ region_start = br.length - codeLength!C(ch);
+ break;
+ }
+ br.popFront();
+ }
+ ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..."
+ size_t region_end=input.length;// end is $ by default
+ foreach (i, dchar ch; input[idx..$])
+ {
+ if (combiningClass(ch) == 0 && allowedIn!norm(ch))
+ {
+ region_end = i+idx;
+ break;
+ }
+ }
+ // writeln("Region to normalize: ", input[region_start .. region_end]);
+ return tuple(region_start, region_end);
+}
+
+/**
+ Tests if dchar $(D ch) is always allowed (Quick_Check=YES) in normalization
+ form $(D norm).
+*/
+public bool allowedIn(NormalizationForm norm)(dchar ch)
+{
+ return !notAllowedIn!norm(ch);
+}
+
+///
+@safe unittest
+{
+ // e.g. Cyrillic is always allowed, so is ASCII
+ assert(allowedIn!NFC('я'));
+ assert(allowedIn!NFD('я'));
+ assert(allowedIn!NFKC('я'));
+ assert(allowedIn!NFKD('я'));
+ assert(allowedIn!NFC('Z'));
+}
+
+// not user friendly name but more direct
+private bool notAllowedIn(NormalizationForm norm)(dchar ch)
+{
+ static if (norm == NFC)
+ alias qcTrie = nfcQCTrie;
+ else static if (norm == NFD)
+ alias qcTrie = nfdQCTrie;
+ else static if (norm == NFKC)
+ alias qcTrie = nfkcQCTrie;
+ else static if (norm == NFKD)
+ alias qcTrie = nfkdQCTrie;
+ else
+ static assert("Unknown normalization form "~norm);
+ return qcTrie[ch];
+}
+
+@safe unittest
+{
+ assert(allowedIn!NFC('я'));
+ assert(allowedIn!NFD('я'));
+ assert(allowedIn!NFKC('я'));
+ assert(allowedIn!NFKD('я'));
+ assert(allowedIn!NFC('Z'));
+}
+
+}
+
+version (std_uni_bootstrap)
+{
+ // old version used for bootstrapping of gen_uni.d that generates
+ // up to date optimal versions of all of isXXX functions
+ @safe pure nothrow @nogc public bool isWhite(dchar c)
+ {
+ import std.ascii : isWhite;
+ return isWhite(c) ||
+ c == lineSep || c == paraSep ||
+ c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' ||
+ (c >= '\u2000' && c <= '\u200A') ||
+ c == '\u202F' || c == '\u205F' || c == '\u3000';
+ }
+}
+else
+{
+
+// trusted -> avoid bounds check
+@trusted pure nothrow @nogc private
+{
+ import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file
+
+ // hide template instances behind functions (Bugzilla 13232)
+ ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; }
+ ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; }
+ dchar toLowerTab(size_t idx) { return toLowerTable[idx]; }
+
+ ushort toTitleIndex(dchar c) { return toTitleIndexTrie[c]; }
+ ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; }
+ dchar toTitleTab(size_t idx) { return toTitleTable[idx]; }
+
+ ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; }
+ ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; }
+ dchar toUpperTab(size_t idx) { return toUpperTable[idx]; }
+}
+
+public:
+
+/++
+ Whether or not $(D c) is a Unicode whitespace $(CHARACTER).
+ (general Unicode category: Part of C0(tab, vertical tab, form feed,
+ carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))
++/
+@safe pure nothrow @nogc
+public bool isWhite(dchar c)
+{
+ import std.internal.unicode_tables : isWhiteGen; // generated file
+ return isWhiteGen(c); // call pregenerated binary search
+}
+
+/++
+ Return whether $(D c) is a Unicode lowercase $(CHARACTER).
++/
+@safe pure nothrow @nogc
+bool isLower(dchar c)
+{
+ import std.ascii : isLower, isASCII;
+ if (isASCII(c))
+ return isLower(c);
+ return lowerCaseTrie[c];
+}
+
+@safe unittest
+{
+ import std.ascii : isLower;
+ foreach (v; 0 .. 0x80)
+ assert(isLower(v) == .isLower(v));
+ assert(.isLower('я'));
+ assert(.isLower('й'));
+ assert(!.isLower('Ж'));
+ // Greek HETA
+ assert(!.isLower('\u0370'));
+ assert(.isLower('\u0371'));
+ assert(!.isLower('\u039C')); // capital MU
+ assert(.isLower('\u03B2')); // beta
+ // from extended Greek
+ assert(!.isLower('\u1F18'));
+ assert(.isLower('\u1F00'));
+ foreach (v; unicode.lowerCase.byCodepoint)
+ assert(.isLower(v) && !isUpper(v));
+}
+
+
+/++
+ Return whether $(D c) is a Unicode uppercase $(CHARACTER).
++/
+@safe pure nothrow @nogc
+bool isUpper(dchar c)
+{
+ import std.ascii : isUpper, isASCII;
+ if (isASCII(c))
+ return isUpper(c);
+ return upperCaseTrie[c];
+}
+
+@safe unittest
+{
+ import std.ascii : isLower;
+ foreach (v; 0 .. 0x80)
+ assert(isLower(v) == .isLower(v));
+ assert(!isUpper('й'));
+ assert(isUpper('Ж'));
+ // Greek HETA
+ assert(isUpper('\u0370'));
+ assert(!isUpper('\u0371'));
+ assert(isUpper('\u039C')); // capital MU
+ assert(!isUpper('\u03B2')); // beta
+ // from extended Greek
+ assert(!isUpper('\u1F00'));
+ assert(isUpper('\u1F18'));
+ foreach (v; unicode.upperCase.byCodepoint)
+ assert(isUpper(v) && !.isLower(v));
+}
+
+
+//TODO: Hidden for now, needs better API.
+//Other transforms could use better API as well, but this one is a new primitive.
+@safe pure nothrow @nogc
+private dchar toTitlecase(dchar c)
+{
+ // optimize ASCII case
+ if (c < 0xAA)
+ {
+ if (c < 'a')
+ return c;
+ if (c <= 'z')
+ return c - 32;
+ return c;
+ }
+ size_t idx = toTitleSimpleIndex(c);
+ if (idx != ushort.max)
+ {
+ return toTitleTab(idx);
+ }
+ return c;
+}
+
+private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
+private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);
+
+// generic toUpper/toLower on whole string, creates new or returns as is
+private S toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s) @trusted pure
+if (isSomeString!S)
+{
+ import std.array : appender;
+ import std.ascii : isASCII;
+
+ foreach (i, dchar cOuter; s)
+ {
+ ushort idx = indexFn(cOuter);
+ if (idx == ushort.max)
+ continue;
+ auto result = appender!S(s[0 .. i]);
+ result.reserve(s.length);
+ foreach (dchar c; s[i .. $])
+ {
+ if (c.isASCII)
+ {
+ result.put(asciiConvert(c));
+ }
+ else
+ {
+ idx = indexFn(c);
+ if (idx == ushort.max)
+ result.put(c);
+ else if (idx < maxIdx)
+ {
+ c = tableFn(idx);
+ result.put(c);
+ }
+ else
+ {
+ auto val = tableFn(idx);
+ // unpack length + codepoint
+ immutable uint len = val >> 24;
+ result.put(cast(dchar)(val & 0xFF_FFFF));
+ foreach (j; idx+1 .. idx+len)
+ result.put(tableFn(j));
+ }
+ }
+ }
+ return result.data;
+ }
+ return s;
+}
+
+@safe unittest //12428
+{
+ import std.array : replicate;
+ auto s = "abcdefghij".replicate(300);
+ s = s[0 .. 10];
+
+ toUpper(s);
+
+ assert(s == "abcdefghij");
+}
+
+
+// generic toUpper/toLower on whole range, returns range
+private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
+ // Accept range of dchar's
+if (isInputRange!Range &&
+ isSomeChar!(ElementEncodingType!Range) &&
+ ElementEncodingType!Range.sizeof == dchar.sizeof)
+{
+ static struct ToCaserImpl
+ {
+ @property bool empty()
+ {
+ return !nLeft && r.empty;
+ }
+
+ @property auto front()
+ {
+ import std.ascii : isASCII;
+
+ if (!nLeft)
+ {
+ dchar c = r.front;
+ if (c.isASCII)
+ {
+ buf[0] = asciiConvert(c);
+ nLeft = 1;
+ }
+ else
+ {
+ const idx = indexFn(c);
+ if (idx == ushort.max)
+ {
+ buf[0] = c;
+ nLeft = 1;
+ }
+ else if (idx < maxIdx)
+ {
+ buf[0] = tableFn(idx);
+ nLeft = 1;
+ }
+ else
+ {
+ immutable val = tableFn(idx);
+ // unpack length + codepoint
+ nLeft = val >> 24;
+ if (nLeft == 0)
+ nLeft = 1;
+ assert(nLeft <= buf.length);
+ buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
+ foreach (j; 1 .. nLeft)
+ buf[nLeft - j - 1] = tableFn(idx + j);
+ }
+ }
+ }
+ return buf[nLeft - 1];
+ }
+
+ void popFront()
+ {
+ if (!nLeft)
+ front;
+ assert(nLeft);
+ --nLeft;
+ if (!nLeft)
+ r.popFront();
+ }
+
+ static if (isForwardRange!Range)
+ {
+ @property auto save()
+ {
+ auto ret = this;
+ ret.r = r.save;
+ return ret;
+ }
+ }
+
+ private:
+ Range r;
+ uint nLeft;
+ dchar[3] buf = void;
+ }
+
+ return ToCaserImpl(str);
+}
+
+/*********************
+ * Convert input range or string to upper or lower case.
+ *
+ * Does not allocate memory.
+ * Characters in UTF-8 or UTF-16 format that cannot be decoded
+ * are treated as $(REF replacementDchar, std,utf).
+ *
+ * Params:
+ * str = string or range of characters
+ *
+ * Returns:
+ * an InputRange of dchars
+ *
+ * See_Also:
+ * $(LREF toUpper), $(LREF toLower)
+ */
+
+auto asLowerCase(Range)(Range str)
+if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
+ !isConvertibleToString!Range)
+{
+ static if (ElementEncodingType!Range.sizeof < dchar.sizeof)
+ {
+ import std.utf : byDchar;
+
+ // Decode first
+ return asLowerCase(str.byDchar);
+ }
+ else
+ {
+ static import std.ascii;
+ return toCaser!(LowerTriple, std.ascii.toLower)(str);
+ }
+}
+
+/// ditto
+auto asUpperCase(Range)(Range str)
+if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
+ !isConvertibleToString!Range)
+{
+ static if (ElementEncodingType!Range.sizeof < dchar.sizeof)
+ {
+ import std.utf : byDchar;
+
+ // Decode first
+ return asUpperCase(str.byDchar);
+ }
+ else
+ {
+ static import std.ascii;
+ return toCaser!(UpperTriple, std.ascii.toUpper)(str);
+ }
+}
+
+///
+@safe pure unittest
+{
+ import std.algorithm.comparison : equal;
+
+ assert("hEllo".asUpperCase.equal("HELLO"));
+}
+
+// explicitly undocumented
+auto asLowerCase(Range)(auto ref Range str)
+if (isConvertibleToString!Range)
+{
+ import std.traits : StringTypeOf;
+ return asLowerCase!(StringTypeOf!Range)(str);
+}
+
+// explicitly undocumented
+auto asUpperCase(Range)(auto ref Range str)
+if (isConvertibleToString!Range)
+{
+ import std.traits : StringTypeOf;
+ return asUpperCase!(StringTypeOf!Range)(str);
+}
+
+@safe unittest
+{
+ assert(testAliasedString!asLowerCase("hEllo"));
+ assert(testAliasedString!asUpperCase("hEllo"));
+}
+
+@safe unittest
+{
+ import std.array : array;
+
+ auto a = "HELLo".asLowerCase;
+ auto savea = a.save;
+ auto s = a.array;
+ assert(s == "hello");
+ s = savea.array;
+ assert(s == "hello");
+
+ string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
+ string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];
+
+ foreach (i, slwr; lower)
+ {
+ import std.utf : byChar;
+
+ auto sx = slwr.asUpperCase.byChar.array;
+ assert(sx == toUpper(slwr));
+ auto sy = upper[i].asLowerCase.byChar.array;
+ assert(sy == toLower(upper[i]));
+ }
+
+ // Not necessary to call r.front
+ for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
+ {
+ }
+
+ import std.algorithm.comparison : equal;
+
+ "HELLo"w.asLowerCase.equal("hello"d);
+ "HELLo"w.asUpperCase.equal("HELLO"d);
+ "HELLo"d.asLowerCase.equal("hello"d);
+ "HELLo"d.asUpperCase.equal("HELLO"d);
+
+ import std.utf : byChar;
+ assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
+}
+
+// generic capitalizer on whole range, returns range
+private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper,
+ Range)(Range str)
+ // Accept range of dchar's
+if (isInputRange!Range &&
+ isSomeChar!(ElementEncodingType!Range) &&
+ ElementEncodingType!Range.sizeof == dchar.sizeof)
+{
+ static struct ToCapitalizerImpl
+ {
+ @property bool empty()
+ {
+ return lower ? lwr.empty : !nLeft && r.empty;
+ }
+
+ @property auto front()
+ {
+ if (lower)
+ return lwr.front;
+
+ if (!nLeft)
+ {
+ immutable dchar c = r.front;
+ const idx = indexFnUpper(c);
+ if (idx == ushort.max)
+ {
+ buf[0] = c;
+ nLeft = 1;
+ }
+ else if (idx < maxIdxUpper)
+ {
+ buf[0] = tableFnUpper(idx);
+ nLeft = 1;
+ }
+ else
+ {
+ immutable val = tableFnUpper(idx);
+ // unpack length + codepoint
+ nLeft = val >> 24;
+ if (nLeft == 0)
+ nLeft = 1;
+ assert(nLeft <= buf.length);
+ buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
+ foreach (j; 1 .. nLeft)
+ buf[nLeft - j - 1] = tableFnUpper(idx + j);
+ }
+ }
+ return buf[nLeft - 1];
+ }
+
+ void popFront()
+ {
+ if (lower)
+ lwr.popFront();
+ else
+ {
+ if (!nLeft)
+ front;
+ assert(nLeft);
+ --nLeft;
+ if (!nLeft)
+ {
+ r.popFront();
+ lwr = r.asLowerCase();
+ lower = true;
+ }
+ }
+ }
+
+ static if (isForwardRange!Range)
+ {
+ @property auto save()
+ {
+ auto ret = this;
+ ret.r = r.save;
+ ret.lwr = lwr.save;
+ return ret;
+ }
+ }
+
+ private:
+ Range r;
+ typeof(r.asLowerCase) lwr; // range representing the lower case rest of string
+ bool lower = false; // false for first character, true for rest of string
+ dchar[3] buf = void;
+ uint nLeft = 0;
+ }
+
+ return ToCapitalizerImpl(str);
+}
+
+/*********************
+ * Capitalize input range or string, meaning convert the first
+ * character to upper case and subsequent characters to lower case.
+ *
+ * Does not allocate memory.
+ * Characters in UTF-8 or UTF-16 format that cannot be decoded
+ * are treated as $(REF replacementDchar, std,utf).
+ *
+ * Params:
+ * str = string or range of characters
+ *
+ * Returns:
+ * an InputRange of dchars
+ *
+ * See_Also:
+ * $(LREF toUpper), $(LREF toLower)
+ * $(LREF asUpperCase), $(LREF asLowerCase)
+ */
+
+auto asCapitalized(Range)(Range str)
+if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
+ !isConvertibleToString!Range)
+{
+ static if (ElementEncodingType!Range.sizeof < dchar.sizeof)
+ {
+ import std.utf : byDchar;
+
+ // Decode first
+ return toCapitalizer!UpperTriple(str.byDchar);
+ }
+ else
+ {
+ return toCapitalizer!UpperTriple(str);
+ }
+}
+
+///
+@safe pure unittest
+{
+ import std.algorithm.comparison : equal;
+
+ assert("hEllo".asCapitalized.equal("Hello"));
+}
+
+auto asCapitalized(Range)(auto ref Range str)
+if (isConvertibleToString!Range)
+{
+ import std.traits : StringTypeOf;
+ return asCapitalized!(StringTypeOf!Range)(str);
+}
+
+@safe unittest
+{
+ assert(testAliasedString!asCapitalized("hEllo"));
+}
+
+@safe pure nothrow @nogc unittest
+{
+ auto r = "hEllo".asCapitalized();
+ assert(r.front == 'H');
+}
+
+@safe unittest
+{
+ import std.array : array;
+
+ auto a = "hELLo".asCapitalized;
+ auto savea = a.save;
+ auto s = a.array;
+ assert(s == "Hello");
+ s = savea.array;
+ assert(s == "Hello");
+
+ string[2][] cases =
+ [
+ ["", ""],
+ ["h", "H"],
+ ["H", "H"],
+ ["3", "3"],
+ ["123", "123"],
+ ["h123A", "H123a"],
+ ["феж", "Феж"],
+ ["\u1Fe2", "\u03a5\u0308\u0300"],
+ ];
+
+ foreach (i; 0 .. cases.length)
+ {
+ import std.utf : byChar;
+
+ auto r = cases[i][0].asCapitalized.byChar.array;
+ auto result = cases[i][1];
+ assert(r == result);
+ }
+
+ // Don't call r.front
+ for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront())
+ {
+ }
+
+ import std.algorithm.comparison : equal;
+
+ "HELLo"w.asCapitalized.equal("Hello"d);
+ "hElLO"w.asCapitalized.equal("Hello"d);
+ "hello"d.asCapitalized.equal("Hello"d);
+ "HELLO"d.asCapitalized.equal("Hello"d);
+
+ import std.utf : byChar;
+ assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array);
+}
+
+// TODO: helper, I wish std.utf was more flexible (and stright)
+private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
+{
+ if (c <= 0x7F)
+ {
+ buf[idx] = cast(char) c;
+ idx++;
+ }
+ else if (c <= 0x7FF)
+ {
+ buf[idx] = cast(char)(0xC0 | (c >> 6));
+ buf[idx+1] = cast(char)(0x80 | (c & 0x3F));
+ idx += 2;
+ }
+ else if (c <= 0xFFFF)
+ {
+ buf[idx] = cast(char)(0xE0 | (c >> 12));
+ buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
+ buf[idx+2] = cast(char)(0x80 | (c & 0x3F));
+ idx += 3;
+ }
+ else if (c <= 0x10FFFF)
+ {
+ buf[idx] = cast(char)(0xF0 | (c >> 18));
+ buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
+ buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
+ buf[idx+3] = cast(char)(0x80 | (c & 0x3F));
+ idx += 4;
+ }
+ else
+ assert(0);
+ return idx;
+}
+
+@safe unittest
+{
+ char[] s = "abcd".dup;
+ size_t i = 0;
+ i = encodeTo(s, i, 'X');
+ assert(s == "Xbcd");
+
+ i = encodeTo(s, i, cast(dchar)'\u00A9');
+ assert(s == "X\xC2\xA9d");
+}
+
+// TODO: helper, I wish std.utf was more flexible (and stright)
+private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure
+{
+ import std.utf : UTFException;
+ if (c <= 0xFFFF)
+ {
+ if (0xD800 <= c && c <= 0xDFFF)
+ throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);
+ buf[idx] = cast(wchar) c;
+ idx++;
+ }
+ else if (c <= 0x10FFFF)
+ {
+ buf[idx] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
+ buf[idx+1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
+ idx += 2;
+ }
+ else
+ assert(0);
+ return idx;
+}
+
+private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
+{
+ buf[idx] = c;
+ idx++;
+ return idx;
+}
+
+private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
+if (is(C == char) || is(C == wchar) || is(C == dchar))
+{
+ import std.utf : decode, codeLength;
+ size_t curIdx = 0;
+ size_t destIdx = 0;
+ alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);
+ size_t lastUnchanged = 0;
+ // in-buffer move of bytes to a new start index
+ // the trick is that it may not need to copy at all
+ static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
+ {
+ // Interestingly we may just bump pointer for a while
+ // then have to copy if a re-cased char was smaller the original
+ // later we may regain pace with char that got bigger
+ // In the end it sometimes flip-flops between the 2 cases below
+ if (dest == from)
+ return to;
+ // got to copy
+ foreach (C c; str[from .. to])
+ str[dest++] = c;
+ return dest;
+ }
+ while (curIdx != s.length)
+ {
+ size_t startIdx = curIdx;
+ immutable ch = decode(s, curIdx);
+ // TODO: special case for ASCII
+ immutable caseIndex = indexFn(ch);
+ if (caseIndex == ushort.max) // unchanged, skip over
+ {
+ continue;
+ }
+ else if (caseIndex < maxIdx) // 1:1 codepoint mapping
+ {
+ // previous cased chars had the same length as uncased ones
+ // thus can just adjust pointer
+ destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
+ lastUnchanged = curIdx;
+ immutable cased = tableFn(caseIndex);
+ immutable casedLen = codeLength!C(cased);
+ if (casedLen + destIdx > curIdx) // no place to fit cased char
+ {
+ // switch to slow codepath, where we allocate
+ return slowToCase(s, startIdx, destIdx);
+ }
+ else
+ {
+ destIdx = encodeTo(s, destIdx, cased);
+ }
+ }
+ else // 1:m codepoint mapping, slow codepath
+ {
+ destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
+ lastUnchanged = curIdx;
+ return slowToCase(s, startIdx, destIdx);
+ }
+ assert(destIdx <= curIdx);
+ }
+ if (lastUnchanged != s.length)
+ {
+ destIdx = moveTo(s, destIdx, lastUnchanged, s.length);
+ }
+ s = s[0 .. destIdx];
+}
+
+// helper to precalculate size of case-converted string
+private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
+{
+ size_t toCaseLength(C)(in C[] str)
+ {
+ import std.utf : decode, codeLength;
+ size_t codeLen = 0;
+ size_t lastNonTrivial = 0;
+ size_t curIdx = 0;
+ while (curIdx != str.length)
+ {
+ immutable startIdx = curIdx;
+ immutable ch = decode(str, curIdx);
+ immutable ushort caseIndex = indexFn(ch);
+ if (caseIndex == ushort.max)
+ continue;
+ else if (caseIndex < maxIdx)
+ {
+ codeLen += startIdx - lastNonTrivial;
+ lastNonTrivial = curIdx;
+ immutable cased = tableFn(caseIndex);
+ codeLen += codeLength!C(cased);
+ }
+ else
+ {
+ codeLen += startIdx - lastNonTrivial;
+ lastNonTrivial = curIdx;
+ immutable val = tableFn(caseIndex);
+ immutable len = val >> 24;
+ immutable dchar cased = val & 0xFF_FFFF;
+ codeLen += codeLength!C(cased);
+ foreach (j; caseIndex+1 .. caseIndex+len)
+ codeLen += codeLength!C(tableFn(j));
+ }
+ }
+ if (lastNonTrivial != str.length)
+ codeLen += str.length - lastNonTrivial;
+ return codeLen;
+ }
+}
+
+@safe unittest
+{
+ alias toLowerLength = toCaseLength!(LowerTriple);
+ assert(toLowerLength("abcd") == 4);
+ assert(toLowerLength("аБВгд456") == 10+3);
+}
+
+// slower code path that preallocates and then copies
+// case-converted stuf to the new string
+private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn)
+{
+ void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx,
+ size_t destIdx) @trusted pure
+ if (is(C == char) || is(C == wchar) || is(C == dchar))
+ {
+ import std.utf : decode;
+ alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn);
+ auto trueLength = destIdx + caseLength(s[curIdx..$]);
+ C[] ns = new C[trueLength];
+ ns[0 .. destIdx] = s[0 .. destIdx];
+ size_t lastUnchanged = curIdx;
+ while (curIdx != s.length)
+ {
+ immutable startIdx = curIdx; // start of current codepoint
+ immutable ch = decode(s, curIdx);
+ immutable caseIndex = indexFn(ch);
+ if (caseIndex == ushort.max) // skip over
+ {
+ continue;
+ }
+ else if (caseIndex < maxIdx) // 1:1 codepoint mapping
+ {
+ immutable cased = tableFn(caseIndex);
+ auto toCopy = startIdx - lastUnchanged;
+ ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx];
+ lastUnchanged = curIdx;
+ destIdx += toCopy;
+ destIdx = encodeTo(ns, destIdx, cased);
+ }
+ else // 1:m codepoint mapping, slow codepath
+ {
+ auto toCopy = startIdx - lastUnchanged;
+ ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx];
+ lastUnchanged = curIdx;
+ destIdx += toCopy;
+ auto val = tableFn(caseIndex);
+ // unpack length + codepoint
+ immutable uint len = val >> 24;
+ destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF));
+ foreach (j; caseIndex+1 .. caseIndex+len)
+ destIdx = encodeTo(ns, destIdx, tableFn(j));
+ }
+ }
+ if (lastUnchanged != s.length)
+ {
+ auto toCopy = s.length - lastUnchanged;
+ ns[destIdx .. destIdx+toCopy] = s[lastUnchanged..$];
+ destIdx += toCopy;
+ }
+ assert(ns.length == destIdx);
+ s = ns;
+ }
+}
+
+/++
+ Converts $(D s) to lowercase (by performing Unicode lowercase mapping) in place.
+ For a few characters string length may increase after the transformation,
+ in such a case the function reallocates exactly once.
+ If $(D s) does not have any uppercase characters, then $(D s) is unaltered.
++/
+void toLowerInPlace(C)(ref C[] s) @trusted pure
+if (is(C == char) || is(C == wchar) || is(C == dchar))
+{
+ toCaseInPlace!(LowerTriple)(s);
+}
+// overloads for the most common cases to reduce compile time
+@safe pure /*TODO nothrow*/
+{
+ void toLowerInPlace(ref char[] s)
+ { toLowerInPlace!char(s); }
+ void toLowerInPlace(ref wchar[] s)
+ { toLowerInPlace!wchar(s); }
+ void toLowerInPlace(ref dchar[] s)
+ { toLowerInPlace!dchar(s); }
+}
+
+/++
+ Converts $(D s) to uppercase (by performing Unicode uppercase mapping) in place.
+ For a few characters string length may increase after the transformation,
+ in such a case the function reallocates exactly once.
+ If $(D s) does not have any lowercase characters, then $(D s) is unaltered.
++/
+void toUpperInPlace(C)(ref C[] s) @trusted pure
+if (is(C == char) || is(C == wchar) || is(C == dchar))
+{
+ toCaseInPlace!(UpperTriple)(s);
+}
+// overloads for the most common cases to reduce compile time/code size
+@safe pure /*TODO nothrow*/
+{
+ void toUpperInPlace(ref char[] s)
+ { toUpperInPlace!char(s); }
+ void toUpperInPlace(ref wchar[] s)
+ { toUpperInPlace!wchar(s); }
+ void toUpperInPlace(ref dchar[] s)
+ { toUpperInPlace!dchar(s); }
+}
+
+/++
+ If $(D c) is a Unicode uppercase $(CHARACTER), then its lowercase equivalent
+ is returned. Otherwise $(D c) is returned.
+
+ Warning: certain alphabets like German and Greek have no 1:1
+ upper-lower mapping. Use overload of toLower which takes full string instead.
++/
+@safe pure nothrow @nogc
+dchar toLower(dchar c)
+{
+ // optimize ASCII case
+ if (c < 0xAA)
+ {
+ if (c < 'A')
+ return c;
+ if (c <= 'Z')
+ return c + 32;
+ return c;
+ }
+ size_t idx = toLowerSimpleIndex(c);
+ if (idx != ushort.max)
+ {
+ return toLowerTab(idx);
+ }
+ return c;
+}
+
+/++
+ Returns a string which is identical to $(D s) except that all of its
+ characters are converted to lowercase (by preforming Unicode lowercase mapping).
+ If none of $(D s) characters were affected, then $(D s) itself is returned.
++/
+S toLower(S)(S s) @trusted pure
+if (isSomeString!S)
+{
+ static import std.ascii;
+ return toCase!(LowerTriple, std.ascii.toLower)(s);
+}
+// overloads for the most common cases to reduce compile time
+@safe pure /*TODO nothrow*/
+{
+ string toLower(string s)
+ { return toLower!string(s); }
+ wstring toLower(wstring s)
+ { return toLower!wstring(s); }
+ dstring toLower(dstring s)
+ { return toLower!dstring(s); }
+
+ @safe unittest
+ {
+ // https://issues.dlang.org/show_bug.cgi?id=16663
+
+ static struct String
+ {
+ string data;
+ alias data this;
+ }
+
+ void foo()
+ {
+ auto u = toLower(String(""));
+ }
+ }
+}
+
+
+@system unittest //@@@BUG std.format is not @safe
+{
+ static import std.ascii;
+ import std.format : format;
+ foreach (ch; 0 .. 0x80)
+ assert(std.ascii.toLower(ch) == toLower(ch));
+ assert(toLower('Я') == 'я');
+ assert(toLower('Δ') == 'δ');
+ foreach (ch; unicode.upperCase.byCodepoint)
+ {
+ dchar low = ch.toLower();
+ assert(low == ch || isLower(low), format("%s -> %s", ch, low));
+ }
+ assert(toLower("АЯ") == "ая");
+
+ assert("\u1E9E".toLower == "\u00df");
+ assert("\u00df".toUpper == "SS");
+}
+
+//bugzilla 9629
+@safe unittest
+{
+ wchar[] test = "hello þ world"w.dup;
+ auto piece = test[6 .. 7];
+ toUpperInPlace(piece);
+ assert(test == "hello Þ world");
+}
+
+
+@safe unittest
+{
+ import std.algorithm.comparison : cmp;
+ string s1 = "FoL";
+ string s2 = toLower(s1);
+ assert(cmp(s2, "fol") == 0, s2);
+ assert(s2 != s1);
+
+ char[] s3 = s1.dup;
+ toLowerInPlace(s3);
+ assert(s3 == s2);
+
+ s1 = "A\u0100B\u0101d";
+ s2 = toLower(s1);
+ s3 = s1.dup;
+ assert(cmp(s2, "a\u0101b\u0101d") == 0);
+ assert(s2 !is s1);
+ toLowerInPlace(s3);
+ assert(s3 == s2);
+
+ s1 = "A\u0460B\u0461d";
+ s2 = toLower(s1);
+ s3 = s1.dup;
+ assert(cmp(s2, "a\u0461b\u0461d") == 0);
+ assert(s2 !is s1);
+ toLowerInPlace(s3);
+ assert(s3 == s2);
+
+ s1 = "\u0130";
+ s2 = toLower(s1);
+ s3 = s1.dup;
+ assert(s2 == "i\u0307");
+ assert(s2 !is s1);
+ toLowerInPlace(s3);
+ assert(s3 == s2);
+
+ // Test on wchar and dchar strings.
+ assert(toLower("Some String"w) == "some string"w);
+ assert(toLower("Some String"d) == "some string"d);
+
+ // bugzilla 12455
+ dchar c = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE
+ assert(isUpper(c));
+ assert(toLower(c) == 'i');
+ // extend on 12455 reprot - check simple-case toUpper too
+ c = '\u1f87';
+ assert(isLower(c));
+ assert(toUpper(c) == '\u1F8F');
+}
+
+
+/++
+ If $(D c) is a Unicode lowercase $(CHARACTER), then its uppercase equivalent
+ is returned. Otherwise $(D c) is returned.
+
+ Warning:
+ Certain alphabets like German and Greek have no 1:1
+ upper-lower mapping. Use overload of toUpper which takes full string instead.
+
+ toUpper can be used as an argument to $(REF map, std,algorithm,iteration)
+ to produce an algorithm that can convert a range of characters to upper case
+ without allocating memory.
+ A string can then be produced by using $(REF copy, std,algorithm,mutation)
+ to send it to an $(REF appender, std,array).
++/
+@safe pure nothrow @nogc
+dchar toUpper(dchar c)
+{
+ // optimize ASCII case
+ if (c < 0xAA)
+ {
+ if (c < 'a')
+ return c;
+ if (c <= 'z')
+ return c - 32;
+ return c;
+ }
+ size_t idx = toUpperSimpleIndex(c);
+ if (idx != ushort.max)
+ {
+ return toUpperTab(idx);
+ }
+ return c;
+}
+
+///
+@system unittest
+{
+ import std.algorithm.iteration : map;
+ import std.algorithm.mutation : copy;
+ import std.array : appender;
+
+ auto abuf = appender!(char[])();
+ "hello".map!toUpper.copy(&abuf);
+ assert(abuf.data == "HELLO");
+}
+
+@safe unittest
+{
+ static import std.ascii;
+ import std.format : format;
+ foreach (ch; 0 .. 0x80)
+ assert(std.ascii.toUpper(ch) == toUpper(ch));
+ assert(toUpper('я') == 'Я');
+ assert(toUpper('δ') == 'Δ');
+ auto title = unicode.Titlecase_Letter;
+ foreach (ch; unicode.lowerCase.byCodepoint)
+ {
+ dchar up = ch.toUpper();
+ assert(up == ch || isUpper(up) || title[up],
+ format("%x -> %x", ch, up));
+ }
+}
+
+/++
+ Returns a string which is identical to $(D s) except that all of its
+ characters are converted to uppercase (by preforming Unicode uppercase mapping).
+ If none of $(D s) characters were affected, then $(D s) itself is returned.
++/
+S toUpper(S)(S s) @trusted pure
+if (isSomeString!S)
+{
+ static import std.ascii;
+ return toCase!(UpperTriple, std.ascii.toUpper)(s);
+}
+// overloads for the most common cases to reduce compile time
+@safe pure /*TODO nothrow*/
+{
+ string toUpper(string s)
+ { return toUpper!string(s); }
+ wstring toUpper(wstring s)
+ { return toUpper!wstring(s); }
+ dstring toUpper(dstring s)
+ { return toUpper!dstring(s); }
+
+ @safe unittest
+ {
+ // https://issues.dlang.org/show_bug.cgi?id=16663
+
+ static struct String
+ {
+ string data;
+ alias data this;
+ }
+
+ void foo()
+ {
+ auto u = toUpper(String(""));
+ }
+ }
+}
+
+@safe unittest
+{
+ import std.algorithm.comparison : cmp;
+
+ string s1 = "FoL";
+ string s2;
+ char[] s3;
+
+ s2 = toUpper(s1);
+ s3 = s1.dup; toUpperInPlace(s3);
+ assert(s3 == s2, s3);
+ assert(cmp(s2, "FOL") == 0);
+ assert(s2 !is s1);
+
+ s1 = "a\u0100B\u0101d";
+ s2 = toUpper(s1);
+ s3 = s1.dup; toUpperInPlace(s3);
+ assert(s3 == s2);
+ assert(cmp(s2, "A\u0100B\u0100D") == 0);
+ assert(s2 !is s1);
+
+ s1 = "a\u0460B\u0461d";
+ s2 = toUpper(s1);
+ s3 = s1.dup; toUpperInPlace(s3);
+ assert(s3 == s2);
+ assert(cmp(s2, "A\u0460B\u0460D") == 0);
+ assert(s2 !is s1);
+}
+
+@system unittest
+{
+ static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
+ {
+ import std.format : format;
+ string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
+ auto low = s.toLower() , up = s.toUpper();
+ auto lowInp = s.dup, upInp = s.dup;
+ lowInp.toLowerInPlace();
+ upInp.toUpperInPlace();
+ assert(low == trueLow, format(diff, low, trueLow));
+ assert(up == trueUp, format(diff, up, trueUp));
+ assert(lowInp == trueLow,
+ format(diff, cast(ubyte[]) s, cast(ubyte[]) lowInp, cast(ubyte[]) trueLow));
+ assert(upInp == trueUp,
+ format(diff, cast(ubyte[]) s, cast(ubyte[]) upInp, cast(ubyte[]) trueUp));
+ }
+ foreach (S; AliasSeq!(dstring, wstring, string))
+ {
+
+ S easy = "123";
+ S good = "abCФеж";
+ S awful = "\u0131\u023f\u2126";
+ S wicked = "\u0130\u1FE2";
+ auto options = [easy, good, awful, wicked];
+ S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
+ S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];
+
+ foreach (val; AliasSeq!(easy, good))
+ {
+ auto e = val.dup;
+ auto g = e;
+ e.toUpperInPlace();
+ assert(e is g);
+ e.toLowerInPlace();
+ assert(e is g);
+ }
+ foreach (i, v; options)
+ {
+ doTest(v, upper[i], lower[i]);
+ }
+
+ // a few combinatorial runs
+ foreach (i; 0 .. options.length)
+ foreach (j; i .. options.length)
+ foreach (k; j .. options.length)
+ {
+ auto sample = options[i] ~ options[j] ~ options[k];
+ auto sample2 = options[k] ~ options[j] ~ options[i];
+ doTest(sample, upper[i] ~ upper[j] ~ upper[k],
+ lower[i] ~ lower[j] ~ lower[k]);
+ doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
+ lower[k] ~ lower[j] ~ lower[i]);
+ }
+ }
+}
+
+
+/++
+ Returns whether $(D c) is a Unicode alphabetic $(CHARACTER)
+ (general Unicode category: Alphabetic).
++/
+@safe pure nothrow @nogc
+bool isAlpha(dchar c)
+{
+ // optimization
+ if (c < 0xAA)
+ {
+ size_t x = c - 'A';
+ if (x <= 'Z' - 'A')
+ return true;
+ else
+ {
+ x = c - 'a';
+ if (x <= 'z'-'a')
+ return true;
+ }
+ return false;
+ }
+
+ return alphaTrie[c];
+}
+
+@safe unittest
+{
+ auto alpha = unicode("Alphabetic");
+ foreach (ch; alpha.byCodepoint)
+ assert(isAlpha(ch));
+ foreach (ch; 0 .. 0x4000)
+ assert((ch in alpha) == isAlpha(ch));
+}
+
+
+/++
+ Returns whether $(D c) is a Unicode mark
+ (general Unicode category: Mn, Me, Mc).
++/
+@safe pure nothrow @nogc
+bool isMark(dchar c)
+{
+ return markTrie[c];
+}
+
+@safe unittest
+{
+ auto mark = unicode("Mark");
+ foreach (ch; mark.byCodepoint)
+ assert(isMark(ch));
+ foreach (ch; 0 .. 0x4000)
+ assert((ch in mark) == isMark(ch));
+}
+
+/++
+ Returns whether $(D c) is a Unicode numerical $(CHARACTER)
+ (general Unicode category: Nd, Nl, No).
++/
+@safe pure nothrow @nogc
+bool isNumber(dchar c)
+{
+ // optimization for ascii case
+ if (c <= 0x7F)
+ {
+ return c >= '0' && c <= '9';
+ }
+ else
+ {
+ return numberTrie[c];
+ }
+}
+
+@safe unittest
+{
+ auto n = unicode("N");
+ foreach (ch; n.byCodepoint)
+ assert(isNumber(ch));
+ foreach (ch; 0 .. 0x4000)
+ assert((ch in n) == isNumber(ch));
+}
+
+/++
+ Returns whether $(D c) is a Unicode alphabetic $(CHARACTER) or number.
+ (general Unicode category: Alphabetic, Nd, Nl, No).
+
+ Params:
+ c = any Unicode character
+ Returns:
+ `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
+ categories
++/
+@safe pure nothrow @nogc
+bool isAlphaNum(dchar c)
+{
+ static import std.ascii;
+
+ // optimization for ascii case
+ if (std.ascii.isASCII(c))
+ {
+ return std.ascii.isAlphaNum(c);
+ }
+ else
+ {
+ return isAlpha(c) || isNumber(c);
+ }
+}
+
+@safe unittest
+{
+ auto n = unicode("N");
+ auto alpha = unicode("Alphabetic");
+
+ foreach (ch; n.byCodepoint)
+ assert(isAlphaNum(ch));
+
+ foreach (ch; alpha.byCodepoint)
+ assert(isAlphaNum(ch));
+
+ foreach (ch; 0 .. 0x4000)
+ {
+ assert(((ch in n) || (ch in alpha)) == isAlphaNum(ch));
+ }
+}
+
+/++
+ Returns whether $(D c) is a Unicode punctuation $(CHARACTER)
+ (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
++/
+@safe pure nothrow @nogc
+bool isPunctuation(dchar c)
+{
+ static import std.ascii;
+
+ // optimization for ascii case
+ if (c <= 0x7F)
+ {
+ return std.ascii.isPunctuation(c);
+ }
+ else
+ {
+ return punctuationTrie[c];
+ }
+}
+
+@safe unittest
+{
+ assert(isPunctuation('\u0021'));
+ assert(isPunctuation('\u0028'));
+ assert(isPunctuation('\u0029'));
+ assert(isPunctuation('\u002D'));
+ assert(isPunctuation('\u005F'));
+ assert(isPunctuation('\u00AB'));
+ assert(isPunctuation('\u00BB'));
+ foreach (ch; unicode("P").byCodepoint)
+ assert(isPunctuation(ch));
+}
+
+/++
+ Returns whether $(D c) is a Unicode symbol $(CHARACTER)
+ (general Unicode category: Sm, Sc, Sk, So).
++/
+@safe pure nothrow @nogc
+bool isSymbol(dchar c)
+{
+ return symbolTrie[c];
+}
+
+@safe unittest
+{
+ import std.format : format;
+ assert(isSymbol('\u0024'));
+ assert(isSymbol('\u002B'));
+ assert(isSymbol('\u005E'));
+ assert(isSymbol('\u00A6'));
+ foreach (ch; unicode("S").byCodepoint)
+ assert(isSymbol(ch), format("%04x", ch));
+}
+
+/++
+ Returns whether $(D c) is a Unicode space $(CHARACTER)
+ (general Unicode category: Zs)
+ Note: This doesn't include '\n', '\r', \t' and other non-space $(CHARACTER).
+ For commonly used less strict semantics see $(LREF isWhite).
++/
+@safe pure nothrow @nogc
+bool isSpace(dchar c)
+{
+ import std.internal.unicode_tables : isSpaceGen; // generated file
+ return isSpaceGen(c);
+}
+
+@safe unittest
+{
+ assert(isSpace('\u0020'));
+ auto space = unicode.Zs;
+ foreach (ch; space.byCodepoint)
+ assert(isSpace(ch));
+ foreach (ch; 0 .. 0x1000)
+ assert(isSpace(ch) == space[ch]);
+}
+
+
+/++
+ Returns whether $(D c) is a Unicode graphical $(CHARACTER)
+ (general Unicode category: L, M, N, P, S, Zs).
+
++/
+@safe pure nothrow @nogc
+bool isGraphical(dchar c)
+{
+ return graphicalTrie[c];
+}
+
+
+@safe unittest
+{
+ auto set = unicode("Graphical");
+ import std.format : format;
+ foreach (ch; set.byCodepoint)
+ assert(isGraphical(ch), format("%4x", ch));
+ foreach (ch; 0 .. 0x4000)
+ assert((ch in set) == isGraphical(ch));
+}
+
+
+/++
+ Returns whether $(D c) is a Unicode control $(CHARACTER)
+ (general Unicode category: Cc).
++/
+@safe pure nothrow @nogc
+bool isControl(dchar c)
+{
+ import std.internal.unicode_tables : isControlGen; // generated file
+ return isControlGen(c);
+}
+
+@safe unittest
+{
+ assert(isControl('\u0000'));
+ assert(isControl('\u0081'));
+ assert(!isControl('\u0100'));
+ auto cc = unicode.Cc;
+ foreach (ch; cc.byCodepoint)
+ assert(isControl(ch));
+ foreach (ch; 0 .. 0x1000)
+ assert(isControl(ch) == cc[ch]);
+}
+
+
+/++
+ Returns whether $(D c) is a Unicode formatting $(CHARACTER)
+ (general Unicode category: Cf).
++/
+@safe pure nothrow @nogc
+bool isFormat(dchar c)
+{
+ import std.internal.unicode_tables : isFormatGen; // generated file
+ return isFormatGen(c);
+}
+
+
+@safe unittest
+{
+ assert(isFormat('\u00AD'));
+ foreach (ch; unicode("Format").byCodepoint)
+ assert(isFormat(ch));
+}
+
+// code points for private use, surrogates are not likely to change in near feature
+// if need be they can be generated from unicode data as well
+
+/++
+ Returns whether $(D c) is a Unicode Private Use $(CODEPOINT)
+ (general Unicode category: Co).
++/
+@safe pure nothrow @nogc
+bool isPrivateUse(dchar c)
+{
+ return (0x00_E000 <= c && c <= 0x00_F8FF)
+ || (0x0F_0000 <= c && c <= 0x0F_FFFD)
+ || (0x10_0000 <= c && c <= 0x10_FFFD);
+}
+
+/++
+ Returns whether $(D c) is a Unicode surrogate $(CODEPOINT)
+ (general Unicode category: Cs).
++/
+@safe pure nothrow @nogc
+bool isSurrogate(dchar c)
+{
+ return (0xD800 <= c && c <= 0xDFFF);
+}
+
+/++
+ Returns whether $(D c) is a Unicode high surrogate (lead surrogate).
++/
+@safe pure nothrow @nogc
+bool isSurrogateHi(dchar c)
+{
+ return (0xD800 <= c && c <= 0xDBFF);
+}
+
+/++
+ Returns whether $(D c) is a Unicode low surrogate (trail surrogate).
++/
+@safe pure nothrow @nogc
+bool isSurrogateLo(dchar c)
+{
+ return (0xDC00 <= c && c <= 0xDFFF);
+}
+
+/++
+ Returns whether $(D c) is a Unicode non-character i.e.
+ a $(CODEPOINT) with no assigned abstract character.
+ (general Unicode category: Cn)
++/
+@safe pure nothrow @nogc
+bool isNonCharacter(dchar c)
+{
+ return nonCharacterTrie[c];
+}
+
+@safe unittest
+{
+ auto set = unicode("Cn");
+ foreach (ch; set.byCodepoint)
+ assert(isNonCharacter(ch));
+}
+
+private:
+// load static data from pre-generated tables into usable datastructures
+
+
+@safe auto asSet(const (ubyte)[] compressed) pure
+{
+ return CodepointSet.fromIntervals(decompressIntervals(compressed));
+}
+
+@safe pure nothrow auto asTrie(T...)(in TrieEntry!T e)
+{
+ return const(CodepointTrie!T)(e.offsets, e.sizes, e.data);
+}
+
+@safe pure nothrow @nogc @property
+{
+ import std.internal.unicode_tables; // generated file
+
+ // It's important to use auto return here, so that the compiler
+ // only runs semantic on the return type if the function gets
+ // used. Also these are functions rather than templates to not
+ // increase the object size of the caller.
+ auto lowerCaseTrie() { static immutable res = asTrie(lowerCaseTrieEntries); return res; }
+ auto upperCaseTrie() { static immutable res = asTrie(upperCaseTrieEntries); return res; }
+ auto simpleCaseTrie() { static immutable res = asTrie(simpleCaseTrieEntries); return res; }
+ auto fullCaseTrie() { static immutable res = asTrie(fullCaseTrieEntries); return res; }
+ auto alphaTrie() { static immutable res = asTrie(alphaTrieEntries); return res; }
+ auto markTrie() { static immutable res = asTrie(markTrieEntries); return res; }
+ auto numberTrie() { static immutable res = asTrie(numberTrieEntries); return res; }
+ auto punctuationTrie() { static immutable res = asTrie(punctuationTrieEntries); return res; }
+ auto symbolTrie() { static immutable res = asTrie(symbolTrieEntries); return res; }
+ auto graphicalTrie() { static immutable res = asTrie(graphicalTrieEntries); return res; }
+ auto nonCharacterTrie() { static immutable res = asTrie(nonCharacterTrieEntries); return res; }
+
+ //normalization quick-check tables
+ auto nfcQCTrie()
+ {
+ import std.internal.unicode_norm : nfcQCTrieEntries;
+ static immutable res = asTrie(nfcQCTrieEntries);
+ return res;
+ }
+
+ auto nfdQCTrie()
+ {
+ import std.internal.unicode_norm : nfdQCTrieEntries;
+ static immutable res = asTrie(nfdQCTrieEntries);
+ return res;
+ }
+
+ auto nfkcQCTrie()
+ {
+ import std.internal.unicode_norm : nfkcQCTrieEntries;
+ static immutable res = asTrie(nfkcQCTrieEntries);
+ return res;
+ }
+
+ auto nfkdQCTrie()
+ {
+ import std.internal.unicode_norm : nfkdQCTrieEntries;
+ static immutable res = asTrie(nfkdQCTrieEntries);
+ return res;
+ }
+
+ //grapheme breaking algorithm tables
+ auto mcTrie()
+ {
+ import std.internal.unicode_grapheme : mcTrieEntries;
+ static immutable res = asTrie(mcTrieEntries);
+ return res;
+ }
+
+ auto graphemeExtendTrie()
+ {
+ import std.internal.unicode_grapheme : graphemeExtendTrieEntries;
+ static immutable res = asTrie(graphemeExtendTrieEntries);
+ return res;
+ }
+
+ auto hangLV()
+ {
+ import std.internal.unicode_grapheme : hangulLVTrieEntries;
+ static immutable res = asTrie(hangulLVTrieEntries);
+ return res;
+ }
+
+ auto hangLVT()
+ {
+ import std.internal.unicode_grapheme : hangulLVTTrieEntries;
+ static immutable res = asTrie(hangulLVTTrieEntries);
+ return res;
+ }
+
+ // tables below are used for composition/decomposition
+ auto combiningClassTrie()
+ {
+ import std.internal.unicode_comp : combiningClassTrieEntries;
+ static immutable res = asTrie(combiningClassTrieEntries);
+ return res;
+ }
+
+ auto compatMappingTrie()
+ {
+ import std.internal.unicode_decomp : compatMappingTrieEntries;
+ static immutable res = asTrie(compatMappingTrieEntries);
+ return res;
+ }
+
+ auto canonMappingTrie()
+ {
+ import std.internal.unicode_decomp : canonMappingTrieEntries;
+ static immutable res = asTrie(canonMappingTrieEntries);
+ return res;
+ }
+
+ auto compositionJumpTrie()
+ {
+ import std.internal.unicode_comp : compositionJumpTrieEntries;
+ static immutable res = asTrie(compositionJumpTrieEntries);
+ return res;
+ }
+
+ //case conversion tables
+ auto toUpperIndexTrie() { static immutable res = asTrie(toUpperIndexTrieEntries); return res; }
+ auto toLowerIndexTrie() { static immutable res = asTrie(toLowerIndexTrieEntries); return res; }
+ auto toTitleIndexTrie() { static immutable res = asTrie(toTitleIndexTrieEntries); return res; }
+ //simple case conversion tables
+ auto toUpperSimpleIndexTrie() { static immutable res = asTrie(toUpperSimpleIndexTrieEntries); return res; }
+ auto toLowerSimpleIndexTrie() { static immutable res = asTrie(toLowerSimpleIndexTrieEntries); return res; }
+ auto toTitleSimpleIndexTrie() { static immutable res = asTrie(toTitleSimpleIndexTrieEntries); return res; }
+
+}
+
+}// version (!std_uni_bootstrap)