diff options
Diffstat (limited to 'libphobos/src/std/uni.d')
-rw-r--r-- | libphobos/src/std/uni.d | 9756 |
1 files changed, 9756 insertions, 0 deletions
diff --git a/libphobos/src/std/uni.d b/libphobos/src/std/uni.d new file mode 100644 index 0000000..5f24ad1 --- /dev/null +++ b/libphobos/src/std/uni.d @@ -0,0 +1,9756 @@ +// Written in the D programming language. + +/++ + $(P The $(D std.uni) module provides an implementation + of fundamental Unicode algorithms and data structures. + This doesn't include UTF encoding and decoding primitives, + see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf) + for this functionality. ) + +$(SCRIPT inhibitQuickIndex = 1;) +$(BOOKTABLE, +$(TR $(TH Category) $(TH Functions)) +$(TR $(TD Decode) $(TD + $(LREF byCodePoint) + $(LREF byGrapheme) + $(LREF decodeGrapheme) + $(LREF graphemeStride) +)) +$(TR $(TD Comparison) $(TD + $(LREF icmp) + $(LREF sicmp) +)) +$(TR $(TD Classification) $(TD + $(LREF isAlpha) + $(LREF isAlphaNum) + $(LREF isCodepointSet) + $(LREF isControl) + $(LREF isFormat) + $(LREF isGraphical) + $(LREF isIntegralPair) + $(LREF isMark) + $(LREF isNonCharacter) + $(LREF isNumber) + $(LREF isPrivateUse) + $(LREF isPunctuation) + $(LREF isSpace) + $(LREF isSurrogate) + $(LREF isSurrogateHi) + $(LREF isSurrogateLo) + $(LREF isSymbol) + $(LREF isWhite) +)) +$(TR $(TD Normalization) $(TD + $(LREF NFC) + $(LREF NFD) + $(LREF NFKD) + $(LREF NormalizationForm) + $(LREF normalize) +)) +$(TR $(TD Decompose) $(TD + $(LREF decompose) + $(LREF decomposeHangul) + $(LREF UnicodeDecomposition) +)) +$(TR $(TD Compose) $(TD + $(LREF compose) + $(LREF composeJamo) +)) +$(TR $(TD Sets) $(TD + $(LREF CodepointInterval) + $(LREF CodepointSet) + $(LREF InversionList) + $(LREF unicode) +)) +$(TR $(TD Trie) $(TD + $(LREF codepointSetTrie) + $(LREF CodepointSetTrie) + $(LREF codepointTrie) + $(LREF CodepointTrie) + $(LREF toTrie) + $(LREF toDelegate) +)) +$(TR $(TD Casing) $(TD + $(LREF asCapitalized) + $(LREF asLowerCase) + $(LREF asUpperCase) + $(LREF isLower) + $(LREF isUpper) + $(LREF toLower) + $(LREF toLowerInPlace) + $(LREF toUpper) + $(LREF toUpperInPlace) +)) +$(TR $(TD Utf8Matcher) $(TD + $(LREF isUtfMatcher) + $(LREF MatcherConcept) + $(LREF utfMatcher) +)) +$(TR $(TD Separators) $(TD + $(LREF lineSep) + $(LREF nelSep) + $(LREF paraSep) +)) +$(TR $(TD Building blocks) $(TD + $(LREF allowedIn) + $(LREF combiningClass) + $(LREF Grapheme) +)) +) + + $(P All primitives listed operate on Unicode characters and + sets of characters. For functions which operate on ASCII characters + and ignore Unicode $(CHARACTERS), see $(MREF std, ascii). + For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms + used throughout this module see the $(S_LINK Terminology, terminology) section + below. + ) + $(P The focus of this module is the core needs of developing Unicode-aware + applications. To that effect it provides the following optimized primitives: + ) + $(UL + $(LI Character classification by category and common properties: + $(LREF isAlpha), $(LREF isWhite) and others. + ) + $(LI + Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)). + ) + $(LI + Converting text to any of the four normalization forms via $(LREF normalize). + ) + $(LI + Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride)) + by user-perceived characters, that is by $(LREF Grapheme) clusters. + ) + $(LI + Decomposing and composing of individual character(s) according to canonical + or compatibility rules, see $(LREF compose) and $(LREF decompose), + including the specific version for Hangul syllables $(LREF composeJamo) + and $(LREF decomposeHangul). + ) + ) + $(P It's recognized that an application may need further enhancements + and extensions, such as less commonly known algorithms, + or tailoring existing ones for region specific needs. To help users + with building any extra functionality beyond the core primitives, + the module provides: + ) + $(UL + $(LI + $(LREF CodepointSet), a type for easy manipulation of sets of characters. + Besides the typical set algebra it provides an unusual feature: + a D source code generator for detection of $(CODEPOINTS) in this set. + This is a boon for meta-programming parser frameworks, + and is used internally to power classification in small + sets like $(LREF isWhite). + ) + $(LI + A way to construct optimal packed multi-stage tables also known as a + special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie). + The functions $(LREF codepointTrie), $(LREF codepointSetTrie) + construct custom tries that map dchar to value. + The end result is a fast and predictable $(BIGOH 1) lookup that powers + functions like $(LREF isAlpha) and $(LREF combiningClass), + but for user-defined data sets. + ) + $(LI + A useful technique for Unicode-aware parsers that perform + character classification of encoded $(CODEPOINTS) + is to avoid unnecassary decoding at all costs. + $(LREF utfMatcher) provides an improvement over the usual workflow + of decode-classify-process, combining the decoding and classification + steps. By extracting necessary bits directly from encoded + $(S_LINK Code unit, code units) matchers achieve + significant performance improvements. See $(LREF MatcherConcept) for + the common interface of UTF matchers. + ) + $(LI + Generally useful building blocks for customized normalization: + $(LREF combiningClass) for querying combining class + and $(LREF allowedIn) for testing the Quick_Check + property of a given normalization form. + ) + $(LI + Access to a large selection of commonly used sets of $(CODEPOINTS). + $(S_LINK Unicode properties, Supported sets) include Script, + Block and General Category. The exact contents of a set can be + observed in the CLDR utility, on the + $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page + of the Unicode website. + See $(LREF unicode) for easy and (optionally) compile-time checked set + queries. + ) + ) + $(SECTION Synopsis) + --- + import std.uni; + void main() + { + // initialize code point sets using script/block or property name + // now 'set' contains code points from both scripts. + auto set = unicode("Cyrillic") | unicode("Armenian"); + // same thing but simpler and checked at compile-time + auto ascii = unicode.ASCII; + auto currency = unicode.Currency_Symbol; + + // easy set ops + auto a = set & ascii; + assert(a.empty); // as it has no intersection with ascii + a = set | ascii; + auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian + + // some properties of code point sets + assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 + // testing presence of a code point in a set + // is just fine, it is O(logN) + assert(!b['$']); + assert(!b['\u058F']); // Armenian dram sign + assert(b['¥']); + + // building fast lookup tables, these guarantee O(1) complexity + // 1-level Trie lookup table essentially a huge bit-set ~262Kb + auto oneTrie = toTrie!1(b); + // 2-level far more compact but typically slightly slower + auto twoTrie = toTrie!2(b); + // 3-level even smaller, and a bit slower yet + auto threeTrie = toTrie!3(b); + assert(oneTrie['£']); + assert(twoTrie['£']); + assert(threeTrie['£']); + + // build the trie with the most sensible trie level + // and bind it as a functor + auto cyrillicOrArmenian = toDelegate(set); + auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); + assert(balance == "ընկեր!"); + // compatible with bool delegate(dchar) + bool delegate(dchar) bindIt = cyrillicOrArmenian; + + // Normalization + string s = "Plain ascii (and not only), is always normalized!"; + assert(s is normalize(s));// is the same string + + string nonS = "A\u0308ffin"; // A ligature + auto nS = normalize(nonS); // to NFC, the W3C endorsed standard + assert(nS == "Äffin"); + assert(nS != nonS); + string composed = "Äffin"; + + assert(normalize!NFD(composed) == "A\u0308ffin"); + // to NFKD, compatibility decomposition useful for fuzzy matching/searching + assert(normalize!NFKD("2¹⁰") == "210"); + } + --- + $(SECTION Terminology + ) + $(P The following is a list of important Unicode notions + and definitions. Any conventions used specifically in this + module alone are marked as such. The descriptions are based on the formal + definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf, + chapter three of The Unicode Standard Core Specification.) + ) + $(P $(DEF Abstract character) A unit of information used for the organization, + control, or representation of textual data. + Note that: + $(UL + $(LI When representing data, the nature of that data + is generally symbolic as opposed to some other + kind of data (for example, visual). + ) + $(LI An abstract character has no concrete form + and should not be confused with a $(S_LINK Glyph, glyph). + ) + $(LI An abstract character does not necessarily + correspond to what a user thinks of as a “character” + and should not be confused with a $(LREF Grapheme). + ) + $(LI The abstract characters encoded (see Encoded character) + are known as Unicode abstract characters. + ) + $(LI Abstract characters not directly + encoded by the Unicode Standard can often be + represented by the use of combining character sequences. + ) + ) + ) + $(P $(DEF Canonical decomposition) + The decomposition of a character or character sequence + that results from recursively applying the canonical + mappings found in the Unicode Character Database + and these described in Conjoining Jamo Behavior + (section 12 of + $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)). + ) + $(P $(DEF Canonical composition) + The precise definition of the Canonical composition + is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf, + Unicode Conformance) section 11. + Informally it's the process that does the reverse of the canonical + decomposition with the addition of certain rules + that e.g. prevent legacy characters from appearing in the composed result. + ) + $(P $(DEF Canonical equivalent) + Two character sequences are said to be canonical equivalents if + their full canonical decompositions are identical. + ) + $(P $(DEF Character) Typically differs by context. + For the purpose of this documentation the term $(I character) + implies $(I encoded character), that is, a code point having + an assigned abstract character (a symbolic meaning). + ) + $(P $(DEF Code point) Any value in the Unicode codespace; + that is, the range of integers from 0 to 10FFFF (hex). + Not all code points are assigned to encoded characters. + ) + $(P $(DEF Code unit) The minimal bit combination that can represent + a unit of encoded text for processing or interchange. + Depending on the encoding this could be: + 8-bit code units in the UTF-8 ($(D char)), + 16-bit code units in the UTF-16 ($(D wchar)), + and 32-bit code units in the UTF-32 ($(D dchar)). + $(I Note that in UTF-32, a code unit is a code point + and is represented by the D $(D dchar) type.) + ) + $(P $(DEF Combining character) A character with the General Category + of Combining Mark(M). + $(UL + $(LI All characters with non-zero canonical combining class + are combining characters, but the reverse is not the case: + there are combining characters with a zero combining class. + ) + $(LI These characters are not normally used in isolation + unless they are being described. They include such characters + as accents, diacritics, Hebrew points, Arabic vowel signs, + and Indic matras. + ) + ) + ) + $(P $(DEF Combining class) + A numerical value used by the Unicode Canonical Ordering Algorithm + to determine which sequences of combining marks are to be + considered canonically equivalent and which are not. + ) + $(P $(DEF Compatibility decomposition) + The decomposition of a character or character sequence that results + from recursively applying both the compatibility mappings and + the canonical mappings found in the Unicode Character Database, and those + described in Conjoining Jamo Behavior no characters + can be further decomposed. + ) + $(P $(DEF Compatibility equivalent) + Two character sequences are said to be compatibility + equivalents if their full compatibility decompositions are identical. + ) + $(P $(DEF Encoded character) An association (or mapping) + between an abstract character and a code point. + ) + $(P $(DEF Glyph) The actual, concrete image of a glyph representation + having been rasterized or otherwise imaged onto some display surface. + ) + $(P $(DEF Grapheme base) A character with the property + Grapheme_Base, or any standard Korean syllable block. + ) + $(P $(DEF Grapheme cluster) Defined as the text between + grapheme boundaries as specified by Unicode Standard Annex #29, + $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation). + Important general properties of a grapheme: + $(UL + $(LI The grapheme cluster represents a horizontally segmentable + unit of text, consisting of some grapheme base (which may + consist of a Korean syllable) together with any number of + nonspacing marks applied to it. + ) + $(LI A grapheme cluster typically starts with a grapheme base + and then extends across any subsequent sequence of nonspacing marks. + A grapheme cluster is most directly relevant to text rendering and + processes such as cursor placement and text selection in editing, + but may also be relevant to comparison and searching. + ) + $(LI For many processes, a grapheme cluster behaves as if it was a + single character with the same properties as its grapheme base. + Effectively, nonspacing marks apply $(I graphically) to the base, + but do not change its properties. + ) + ) + $(P This module defines a number of primitives that work with graphemes: + $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride). + All of them are using $(I extended grapheme) boundaries + as defined in the aforementioned standard annex. + ) + ) + $(P $(DEF Nonspacing mark) A combining character with the + General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me). + ) + $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark. + ) + $(SECTION Normalization + ) + $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent) + or $(S_LINK Compatibility equivalent, compatibility equivalent) + characters in the Unicode Standard make it necessary to have a full, formal + definition of equivalence for Unicode strings. + String equivalence is determined by a process called normalization, + whereby strings are converted into forms which are compared + directly for identity. This is the primary goal of the normalization process, + see the function $(LREF normalize) to convert into any of + the four defined forms. + ) + $(P A very important attribute of the Unicode Normalization Forms + is that they must remain stable between versions of the Unicode Standard. + A Unicode string normalized to a particular Unicode Normalization Form + in one version of the standard is guaranteed to remain in that Normalization + Form for implementations of future versions of the standard. + ) + $(P The Unicode Standard specifies four normalization forms. + Informally, two of these forms are defined by maximal decomposition + of equivalent sequences, and two of these forms are defined + by maximal $(I composition) of equivalent sequences. + $(UL + $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition, + canonical decomposition) of a character sequence.) + $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition, + compatibility decomposition) of a character sequence.) + $(LI Normalization Form C (NFC): The canonical composition of the + $(S_LINK Canonical decomposition, canonical decomposition) + of a coded character sequence.) + $(LI Normalization Form KC (NFKC): The canonical composition + of the $(S_LINK Compatibility decomposition, + compatibility decomposition) of a character sequence) + ) + ) + $(P The choice of the normalization form depends on the particular use case. + NFC is the best form for general text, since it's more compatible with + strings converted from legacy encodings. NFKC is the preferred form for + identifiers, especially where there are security concerns. NFD and NFKD + are the most useful for internal processing. + ) + $(SECTION Construction of lookup tables + ) + $(P The Unicode standard describes a set of algorithms that + depend on having the ability to quickly look up various properties + of a code point. Given the the codespace of about 1 million $(CODEPOINTS), + it is not a trivial task to provide a space-efficient solution for + the multitude of properties. + ) + $(P Common approaches such as hash-tables or binary search over + sorted code point intervals (as in $(LREF InversionList)) are insufficient. + Hash-tables have enormous memory footprint and binary search + over intervals is not fast enough for some heavy-duty algorithms. + ) + $(P The recommended solution (see Unicode Implementation Guidelines) + is using multi-stage tables that are an implementation of the + $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer + keys and a fixed number of stages. For the remainder of the section + this will be called a fixed trie. The following describes a particular + implementation that is aimed for the speed of access at the expense + of ideal size savings. + ) + $(P Taking a 2-level Trie as an example the principle of operation is as follows. + Split the number of bits in a key (code point, 21 bits) into 2 components + (e.g. 15 and 8). The first is the number of bits in the index of the trie + and the other is number of bits in each page of the trie. + The layout of the trie is then an array of size 2^^bits-of-index followed + an array of memory chunks of size 2^^bits-of-page/bits-per-element. + ) + $(P The number of pages is variable (but not less then 1) + unlike the number of entries in the index. The slots of the index + all have to contain a number of a page that is present. The lookup is then + just a couple of operations - slice the upper bits, + lookup an index for these, take a page at this index and use + the lower bits as an offset within this page. + + Assuming that pages are laid out consequently + in one array at $(D pages), the pseudo-code is: + ) + --- + auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits; + pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)]; + --- + $(P Where if $(D elemsPerPage) is a power of 2 the whole process is + a handful of simple instructions and 2 array reads. Subsequent levels + of the trie are introduced by recursing on this notion - the index array + is treated as values. The number of bits in index is then again + split into 2 parts, with pages over 'current-index' and the new 'upper-index'. + ) + + $(P For completeness a level 1 trie is simply an array. + The current implementation takes advantage of bit-packing values + when the range is known to be limited in advance (such as $(D bool)). + See also $(LREF BitPacked) for enforcing it manually. + The major size advantage however comes from the fact + that multiple $(B identical pages on every level are merged) by construction. + ) + $(P The process of constructing a trie is more involved and is hidden from + the user in a form of the convenience functions $(LREF codepointTrie), + $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie). + In general a set or built-in AA with $(D dchar) type + can be turned into a trie. The trie object in this module + is read-only (immutable); it's effectively frozen after construction. + ) + $(SECTION Unicode properties + ) + $(P This is a full list of Unicode properties accessible through $(LREF unicode) + with specific helpers per category nested within. Consult the + $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility) + when in doubt about the contents of a particular set. + ) + $(P General category sets listed below are only accessible with the + $(LREF unicode) shorthand accessor.) + $(BOOKTABLE $(B General category ), + $(TR $(TH Abb.) $(TH Long form) + $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form)) + $(TR $(TD L) $(TD Letter) + $(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation)) + $(TR $(TD Ll) $(TD Lowercase_Letter) + $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation)) + $(TR $(TD Lm) $(TD Modifier_Letter) + $(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol)) + $(TR $(TD Lo) $(TD Other_Letter) + $(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol)) + $(TR $(TD Lt) $(TD Titlecase_Letter) + $(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol)) + $(TR $(TD Lu) $(TD Uppercase_Letter) + $(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol)) + $(TR $(TD M) $(TD Mark) + $(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol)) + $(TR $(TD Mc) $(TD Spacing_Mark) + $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator)) + $(TR $(TD Me) $(TD Enclosing_Mark) + $(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator)) + $(TR $(TD Mn) $(TD Nonspacing_Mark) + $(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator)) + $(TR $(TD C) $(TD Other) + $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator)) + $(TR $(TD Cc) $(TD Control) $(TD Pf) + $(TD Final_Punctuation) $(TD -) $(TD Any)) + $(TR $(TD Cf) $(TD Format) + $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII)) + ) + $(P Sets for other commonly useful properties that are + accessible with $(LREF unicode):) + $(BOOKTABLE $(B Common binary properties), + $(TR $(TH Name) $(TH Name) $(TH Name)) + $(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase)) + $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax)) + $(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space)) + $(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark)) + $(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical)) + $(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted)) + $(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD STerm)) + $(TR $(TD Deprecated) $(TD Math) $(TD Terminal_Punctuation)) + $(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph)) + $(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase)) + $(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector)) + $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space)) + $(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue)) + $(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start)) + $(TR $(TD Hyphen) $(TD Other_Lowercase) ) + $(TR $(TD ID_Continue) $(TD Other_Math) ) + ) + $(P Below is the table with block names accepted by $(LREF unicode.block). + Note that the shorthand version $(LREF unicode) requires "In" + to be prepended to the names of blocks so as to disambiguate + scripts and blocks. + ) + $(BOOKTABLE $(B Blocks), + $(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian)) + $(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols)) + $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar)) + $(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A)) + $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue)) + $(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo)) + $(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms)) + $(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham)) + $(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki)) + $(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic)) + $(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian)) + $(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian)) + $(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic)) + $(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition)) + $(TR $(TD Avestan) $(TD Hangul Compatibility Jamo) $(TD Oriya)) + $(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya)) + $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa)) + $(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc)) + $(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician)) + $(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions)) + $(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement)) + $(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards)) + $(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area)) + $(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang)) + $(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols)) + $(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic)) + $(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan)) + $(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra)) + $(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada)) + $(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian)) + $(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala)) + $(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants)) + $(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng)) + $(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters)) + $(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials)) + $(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese)) + $(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement)) + $(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts)) + $(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A)) + $(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B)) + $(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators)) + $(TR $(TD CJK Unified Ideographs) $(TD Lao) $(TD Supplemental Punctuation)) + $(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A)) + $(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B)) + $(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri)) + $(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac)) + $(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog)) + $(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) $(TD Tagbanwa)) + $(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags)) + $(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le)) + $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham)) + $(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet)) + $(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols)) + $(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri)) + $(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil)) + $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu)) + $(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana)) + $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai)) + $(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan)) + $(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh)) + $(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols)) + $(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic)) + $(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics)) + $(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended)) + $(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai)) + $(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors)) + $(TR $(TD Domino Tiles) $(TD Miao) $(TD Variation Selectors Supplement)) + $(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions)) + $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms)) + $(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols)) + $(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals)) + $(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables)) + $(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) ) + $(TR $(TD Ethiopic) $(TD Modifier Tone Letters) ) + ) + $(P Below is the table with script names accepted by $(LREF unicode.script) + and by the shorthand version $(LREF unicode):) + $(BOOKTABLE $(B Scripts), + $(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic)) + $(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian)) + $(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian)) + $(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic)) + $(TR $(TD Bamum) $(TD Inherited) $(TD Oriya)) + $(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya)) + $(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa)) + $(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician)) + $(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang)) + $(TR $(TD Braille) $(TD Kannada) $(TD Runic)) + $(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan)) + $(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra)) + $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada)) + $(TR $(TD Carian) $(TD Khmer) $(TD Shavian)) + $(TR $(TD Chakma) $(TD Lao) $(TD Sinhala)) + $(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng)) + $(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese)) + $(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri)) + $(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac)) + $(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog)) + $(TR $(TD Cypriot) $(TD Lycian) $(TD Tagbanwa)) + $(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le)) + $(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham)) + $(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet)) + $(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri)) + $(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil)) + $(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu)) + $(TR $(TD Glagolitic) $(TD Miao) $(TD Thaana)) + $(TR $(TD Gothic) $(TD Mongolian) $(TD Thai)) + $(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan)) + $(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh)) + $(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic)) + $(TR $(TD Han) $(TD Ogham) $(TD Vai)) + $(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi)) + ) + $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).) + $(BOOKTABLE $(B Hangul syllable type), + $(TR $(TH Abb.) $(TH Long form)) + $(TR $(TD L) $(TD Leading_Jamo)) + $(TR $(TD LV) $(TD LV_Syllable)) + $(TR $(TD LVT) $(TD LVT_Syllable) ) + $(TR $(TD T) $(TD Trailing_Jamo)) + $(TR $(TD V) $(TD Vowel_Jamo)) + ) + References: + $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table), + $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia), + $(HTTP www.unicode.org, The Unicode Consortium), + $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms), + $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation) + $(HTTP www.unicode.org/uni2book/ch05.pdf, + Unicode Implementation Guidelines) + $(HTTP www.unicode.org/uni2book/ch03.pdf, + Unicode Conformance) + Trademarks: + Unicode(tm) is a trademark of Unicode, Inc. + + Copyright: Copyright 2013 - + License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). + Authors: Dmitry Olshansky + Source: $(PHOBOSSRC std/_uni.d) + Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2) + +Macros: + +SECTION = <h3><a id="$1">$0</a></h3> +DEF = <div><a id="$1"><i>$0</i></a></div> +S_LINK = <a href="#$1">$+</a> +CODEPOINT = $(S_LINK Code point, code point) +CODEPOINTS = $(S_LINK Code point, code points) +CHARACTER = $(S_LINK Character, character) +CHARACTERS = $(S_LINK Character, characters) +CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster) ++/ +module std.uni; + +import std.meta; // AliasSeq +import std.range.primitives; // back, ElementEncodingType, ElementType, empty, + // front, isForwardRange, isInputRange, isRandomAccessRange, popFront, put, + // save +import std.traits; // isConvertibleToString, isIntegral, isSomeChar, + // isSomeString, Unqual +import std.exception;// : enforce; +import core.memory; //: pureMalloc, pureRealloc, pureFree; +import core.exception; // : onOutOfMemoryError; +static import std.ascii; +// debug = std_uni; + +debug(std_uni) import std.stdio; // writefln, writeln + +private: + +version (unittest) +{ +private: + struct TestAliasedString + { + string get() @safe @nogc pure nothrow { return _s; } + alias get this; + @disable this(this); + string _s; + } + + bool testAliasedString(alias func, Args...)(string s, Args args) + { + import std.algorithm.comparison : equal; + auto a = func(TestAliasedString(s), args); + auto b = func(s, args); + static if (is(typeof(equal(a, b)))) + { + // For ranges, compare contents instead of object identity. + return equal(a, b); + } + else + { + return a == b; + } + } +} + +void copyBackwards(T,U)(T[] src, U[] dest) +{ + assert(src.length == dest.length); + for (size_t i=src.length; i-- > 0; ) + dest[i] = src[i]; +} + +void copyForward(T,U)(T[] src, U[] dest) +{ + assert(src.length == dest.length); + for (size_t i=0; i<src.length; i++) + dest[i] = src[i]; +} + +// TODO: update to reflect all major CPUs supporting unaligned reads +version (X86) + enum hasUnalignedReads = true; +else version (X86_64) + enum hasUnalignedReads = true; +else + enum hasUnalignedReads = false; // better be safe then sorry + +public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator. +public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator. +public enum dchar nelSep = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line. + +// test the intro example +@safe unittest +{ + import std.algorithm.searching : find; + // initialize code point sets using script/block or property name + // set contains code points from both scripts. + auto set = unicode("Cyrillic") | unicode("Armenian"); + // or simpler and statically-checked look + auto ascii = unicode.ASCII; + auto currency = unicode.Currency_Symbol; + + // easy set ops + auto a = set & ascii; + assert(a.empty); // as it has no intersection with ascii + a = set | ascii; + auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian + + // some properties of code point sets + assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 + // testing presence of a code point in a set + // is just fine, it is O(logN) + assert(!b['$']); + assert(!b['\u058F']); // Armenian dram sign + assert(b['¥']); + + // building fast lookup tables, these guarantee O(1) complexity + // 1-level Trie lookup table essentially a huge bit-set ~262Kb + auto oneTrie = toTrie!1(b); + // 2-level far more compact but typically slightly slower + auto twoTrie = toTrie!2(b); + // 3-level even smaller, and a bit slower yet + auto threeTrie = toTrie!3(b); + assert(oneTrie['£']); + assert(twoTrie['£']); + assert(threeTrie['£']); + + // build the trie with the most sensible trie level + // and bind it as a functor + auto cyrillicOrArmenian = toDelegate(set); + auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); + assert(balance == "ընկեր!"); + // compatible with bool delegate(dchar) + bool delegate(dchar) bindIt = cyrillicOrArmenian; + + // Normalization + string s = "Plain ascii (and not only), is always normalized!"; + assert(s is normalize(s));// is the same string + + string nonS = "A\u0308ffin"; // A ligature + auto nS = normalize(nonS); // to NFC, the W3C endorsed standard + assert(nS == "Äffin"); + assert(nS != nonS); + string composed = "Äffin"; + + assert(normalize!NFD(composed) == "A\u0308ffin"); + // to NFKD, compatibility decomposition useful for fuzzy matching/searching + assert(normalize!NFKD("2¹⁰") == "210"); +} + +enum lastDchar = 0x10FFFF; + +auto force(T, F)(F from) +if (isIntegral!T && !is(T == F)) +{ + assert(from <= T.max && from >= T.min); + return cast(T) from; +} + +auto force(T, F)(F from) +if (isBitPacked!T && !is(T == F)) +{ + assert(from <= 2^^bitSizeOf!T-1); + return T(cast(TypeOfBitPacked!T) from); +} + +auto force(T, F)(F from) +if (is(T == F)) +{ + return from; +} + +// repeat X times the bit-pattern in val assuming it's length is 'bits' +size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc +{ + static if (times == 1) + return val; + else static if (bits == 1) + { + static if (times == size_t.sizeof*8) + return val ? size_t.max : 0; + else + return val ? (1 << times)-1 : 0; + } + else static if (times % 2) + return (replicateBits!(times-1, bits)(val)<<bits) | val; + else + return replicateBits!(times/2, bits*2)((val << bits) | val); +} + +@safe pure nothrow @nogc unittest // for replicate +{ + import std.algorithm.iteration : sum, map; + import std.range : iota; + size_t m = 0b111; + size_t m2 = 0b01; + foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) + { + assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i))); + assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum()); + } +} + +// multiple arrays squashed into one memory block +struct MultiArray(Types...) +{ + import std.range.primitives : isOutputRange; + this(size_t[] sizes...) @safe pure nothrow + { + assert(dim == sizes.length); + size_t full_size; + foreach (i, v; Types) + { + full_size += spaceFor!(bitSizeOf!v)(sizes[i]); + sz[i] = sizes[i]; + static if (i >= 1) + offsets[i] = offsets[i-1] + + spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]); + } + + storage = new size_t[full_size]; + } + + this(const(size_t)[] raw_offsets, + const(size_t)[] raw_sizes, const(size_t)[] data)const @safe pure nothrow @nogc + { + offsets[] = raw_offsets[]; + sz[] = raw_sizes[]; + storage = data; + } + + @property auto slice(size_t n)()inout pure nothrow @nogc + { + auto ptr = raw_ptr!n; + return packedArrayView!(Types[n])(ptr, sz[n]); + } + + @property auto ptr(size_t n)()inout pure nothrow @nogc + { + auto ptr = raw_ptr!n; + return inout(PackedPtr!(Types[n]))(ptr); + } + + template length(size_t n) + { + @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; } + + @property void length(size_t new_size) + { + if (new_size > sz[n]) + {// extend + size_t delta = (new_size - sz[n]); + sz[n] += delta; + delta = spaceFor!(bitSizeOf!(Types[n]))(delta); + storage.length += delta;// extend space at end + // raw_slice!x must follow resize as it could be moved! + // next stmts move all data past this array, last-one-goes-first + static if (n != dim-1) + { + auto start = raw_ptr!(n+1); + // len includes delta + size_t len = (storage.ptr+storage.length-start); + + copyBackwards(start[0 .. len-delta], start[delta .. len]); + + start[0 .. delta] = 0; + // offsets are used for raw_slice, ptr etc. + foreach (i; n+1 .. dim) + offsets[i] += delta; + } + } + else if (new_size < sz[n]) + {// shrink + size_t delta = (sz[n] - new_size); + sz[n] -= delta; + delta = spaceFor!(bitSizeOf!(Types[n]))(delta); + // move all data past this array, forward direction + static if (n != dim-1) + { + auto start = raw_ptr!(n+1); + size_t len = (storage.ptr+storage.length-start); + copyForward(start[0 .. len-delta], start[delta .. len]); + + // adjust offsets last, they affect raw_slice + foreach (i; n+1 .. dim) + offsets[i] -= delta; + } + storage.length -= delta; + } + // else - NOP + } + } + + @property size_t bytes(size_t n=size_t.max)() const @safe + { + static if (n == size_t.max) + return storage.length*size_t.sizeof; + else static if (n != Types.length-1) + return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof; + else + return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof; + } + + void store(OutRange)(scope OutRange sink) const + if (isOutputRange!(OutRange, char)) + { + import std.format : formattedWrite; + formattedWrite(sink, "[%( 0x%x, %)]", offsets[]); + formattedWrite(sink, ", [%( 0x%x, %)]", sz[]); + formattedWrite(sink, ", [%( 0x%x, %)]", storage); + } + +private: + import std.meta : staticMap; + @property auto raw_ptr(size_t n)()inout pure nothrow @nogc + { + static if (n == 0) + return storage.ptr; + else + { + return storage.ptr+offsets[n]; + } + } + enum dim = Types.length; + size_t[dim] offsets;// offset for level x + size_t[dim] sz;// size of level x + alias bitWidth = staticMap!(bitSizeOf, Types); + size_t[] storage; +} + +@system unittest +{ + import std.conv : text; + enum dg = (){ + // sizes are: + // lvl0: 3, lvl1 : 2, lvl2: 1 + auto m = MultiArray!(int, ubyte, int)(3,2,1); + + static void check(size_t k, T)(ref T m, int n) + { + foreach (i; 0 .. n) + assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n])); + } + + static void checkB(size_t k, T)(ref T m, int n) + { + foreach (i; 0 .. n) + assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n])); + } + + static void fill(size_t k, T)(ref T m, int n) + { + foreach (i; 0 .. n) + m.slice!(k)[i] = force!ubyte(i+1); + } + + static void fillB(size_t k, T)(ref T m, int n) + { + foreach (i; 0 .. n) + m.slice!(k)[i] = force!ubyte(n-i); + } + + m.length!1 = 100; + fill!1(m, 100); + check!1(m, 100); + + m.length!0 = 220; + fill!0(m, 220); + check!1(m, 100); + check!0(m, 220); + + m.length!2 = 17; + fillB!2(m, 17); + checkB!2(m, 17); + check!0(m, 220); + check!1(m, 100); + + m.length!2 = 33; + checkB!2(m, 17); + fillB!2(m, 33); + checkB!2(m, 33); + check!0(m, 220); + check!1(m, 100); + + m.length!1 = 195; + fillB!1(m, 195); + checkB!1(m, 195); + checkB!2(m, 33); + check!0(m, 220); + + auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10); + marr.length!0 = 15; + marr.length!1 = 30; + fill!1(marr, 30); + fill!0(marr, 15); + check!1(marr, 30); + check!0(marr, 15); + return 0; + }; + enum ct = dg(); + auto rt = dg(); +} + +@system unittest +{// more bitpacking tests + import std.conv : text; + + alias Bitty = + MultiArray!(BitPacked!(size_t, 3) + , BitPacked!(size_t, 4) + , BitPacked!(size_t, 3) + , BitPacked!(size_t, 6) + , bool); + alias fn1 = sliceBits!(13, 16); + alias fn2 = sliceBits!( 9, 13); + alias fn3 = sliceBits!( 6, 9); + alias fn4 = sliceBits!( 0, 6); + static void check(size_t lvl, MA)(ref MA arr){ + for (size_t i = 0; i< arr.length!lvl; i++) + assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i])); + } + + static void fillIdx(size_t lvl, MA)(ref MA arr){ + for (size_t i = 0; i< arr.length!lvl; i++) + arr.slice!(lvl)[i] = i; + } + Bitty m1; + + m1.length!4 = 10; + m1.length!3 = 2^^6; + m1.length!2 = 2^^3; + m1.length!1 = 2^^4; + m1.length!0 = 2^^3; + + m1.length!4 = 2^^16; + + for (size_t i = 0; i< m1.length!4; i++) + m1.slice!(4)[i] = i % 2; + + fillIdx!1(m1); + check!1(m1); + fillIdx!2(m1); + check!2(m1); + fillIdx!3(m1); + check!3(m1); + fillIdx!0(m1); + check!0(m1); + check!3(m1); + check!2(m1); + check!1(m1); + for (size_t i=0; i < 2^^16; i++) + { + m1.slice!(4)[i] = i % 2; + m1.slice!(0)[fn1(i)] = fn1(i); + m1.slice!(1)[fn2(i)] = fn2(i); + m1.slice!(2)[fn3(i)] = fn3(i); + m1.slice!(3)[fn4(i)] = fn4(i); + } + for (size_t i=0; i < 2^^16; i++) + { + assert(m1.slice!(4)[i] == i % 2); + assert(m1.slice!(0)[fn1(i)] == fn1(i)); + assert(m1.slice!(1)[fn2(i)] == fn2(i)); + assert(m1.slice!(2)[fn3(i)] == fn3(i)); + assert(m1.slice!(3)[fn4(i)] == fn4(i)); + } +} + +size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc +{ + import std.math : nextPow2; + enum bits = _bits == 1 ? 1 : nextPow2(_bits - 1);// see PackedArrayView + static if (bits > 8*size_t.sizeof) + { + static assert(bits % (size_t.sizeof*8) == 0); + return new_len * bits/(8*size_t.sizeof); + } + else + { + enum factor = size_t.sizeof*8/bits; + return (new_len+factor-1)/factor; // rounded up + } +} + +template isBitPackableType(T) +{ + enum isBitPackableType = isBitPacked!T + || isIntegral!T || is(T == bool) || isSomeChar!T; +} + +//============================================================================ +template PackedArrayView(T) +if ((is(T dummy == BitPacked!(U, sz), U, size_t sz) + && isBitPackableType!U) || isBitPackableType!T) +{ + import std.math : nextPow2; + private enum bits = bitSizeOf!T; + alias PackedArrayView = PackedArrayViewImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1); +} + +//unsafe and fast access to a chunk of RAM as if it contains packed values +template PackedPtr(T) +if ((is(T dummy == BitPacked!(U, sz), U, size_t sz) + && isBitPackableType!U) || isBitPackableType!T) +{ + import std.math : nextPow2; + private enum bits = bitSizeOf!T; + alias PackedPtr = PackedPtrImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1); +} + +struct PackedPtrImpl(T, size_t bits) +{ +pure nothrow: + static assert(isPow2OrZero(bits)); + + this(inout(size_t)* ptr)inout @safe @nogc + { + origin = ptr; + } + + private T simpleIndex(size_t n) inout + { + immutable q = n / factor; + immutable r = n % factor; + return cast(T)((origin[q] >> bits*r) & mask); + } + + private void simpleWrite(TypeOfBitPacked!T val, size_t n) + in + { + static if (isIntegral!T) + assert(val <= mask); + } + body + { + immutable q = n / factor; + immutable r = n % factor; + immutable tgt_shift = bits*r; + immutable word = origin[q]; + origin[q] = (word & ~(mask << tgt_shift)) + | (cast(size_t) val << tgt_shift); + } + + static if (factor == bytesPerWord// can safely pack by byte + || factor == 1 // a whole word at a time + || ((factor == bytesPerWord/2 || factor == bytesPerWord/4) + && hasUnalignedReads)) // this needs unaligned reads + { + static if (factor == bytesPerWord) + alias U = ubyte; + else static if (factor == bytesPerWord/2) + alias U = ushort; + else static if (factor == bytesPerWord/4) + alias U = uint; + else static if (size_t.sizeof == 8 && factor == bytesPerWord/8) + alias U = ulong; + + T opIndex(size_t idx) inout + { + return __ctfe ? simpleIndex(idx) : + cast(inout(T))(cast(U*) origin)[idx]; + } + + static if (isBitPacked!T) // lack of user-defined implicit conversion + { + void opIndexAssign(T val, size_t idx) + { + return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); + } + } + + void opIndexAssign(TypeOfBitPacked!T val, size_t idx) + { + if (__ctfe) + simpleWrite(val, idx); + else + (cast(U*) origin)[idx] = cast(U) val; + } + } + else + { + T opIndex(size_t n) inout + { + return simpleIndex(n); + } + + static if (isBitPacked!T) // lack of user-defined implicit conversion + { + void opIndexAssign(T val, size_t idx) + { + return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); + } + } + + void opIndexAssign(TypeOfBitPacked!T val, size_t n) + { + return simpleWrite(val, n); + } + } + +private: + // factor - number of elements in one machine word + enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1; + enum bytesPerWord = size_t.sizeof; + size_t* origin; +} + +// data is packed only by power of two sized packs per word, +// thus avoiding mul/div overhead at the cost of ultimate packing +// this construct doesn't own memory, only provides access, see MultiArray for usage +struct PackedArrayViewImpl(T, size_t bits) +{ +pure nothrow: + + this(inout(size_t)* origin, size_t offset, size_t items) inout @safe + { + ptr = inout(PackedPtr!(T))(origin); + ofs = offset; + limit = items; + } + + bool zeros(size_t s, size_t e) + in + { + assert(s <= e); + } + body + { + s += ofs; + e += ofs; + immutable pad_s = roundUp(s); + if ( s >= e) + { + foreach (i; s .. e) + if (ptr[i]) + return false; + return true; + } + immutable pad_e = roundDown(e); + size_t i; + for (i=s; i<pad_s; i++) + if (ptr[i]) + return false; + // all in between is x*factor elements + for (size_t j=i/factor; i<pad_e; i+=factor, j++) + if (ptr.origin[j]) + return false; + for (; i<e; i++) + if (ptr[i]) + return false; + return true; + } + + T opIndex(size_t idx) inout + in + { + assert(idx < limit); + } + body + { + return ptr[ofs + idx]; + } + + static if (isBitPacked!T) // lack of user-defined implicit conversion + { + void opIndexAssign(T val, size_t idx) + { + return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); + } + } + + void opIndexAssign(TypeOfBitPacked!T val, size_t idx) + in + { + assert(idx < limit); + } + body + { + ptr[ofs + idx] = val; + } + + static if (isBitPacked!T) // lack of user-defined implicit conversions + { + void opSliceAssign(T val, size_t start, size_t end) + { + opSliceAssign(cast(TypeOfBitPacked!T) val, start, end); + } + } + + void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end) + in + { + assert(start <= end); + assert(end <= limit); + } + body + { + // account for ofsetted view + start += ofs; + end += ofs; + // rounded to factor granularity + immutable pad_start = roundUp(start);// rounded up + if (pad_start >= end) //rounded up >= then end of slice + { + //nothing to gain, use per element assignment + foreach (i; start .. end) + ptr[i] = val; + return; + } + immutable pad_end = roundDown(end); // rounded down + size_t i; + for (i=start; i<pad_start; i++) + ptr[i] = val; + // all in between is x*factor elements + if (pad_start != pad_end) + { + immutable repval = replicateBits!(factor, bits)(val); + for (size_t j=i/factor; i<pad_end; i+=factor, j++) + ptr.origin[j] = repval;// so speed it up by factor + } + for (; i<end; i++) + ptr[i] = val; + } + + auto opSlice(size_t from, size_t to)inout + in + { + assert(from <= to); + assert(ofs + to <= limit); + } + body + { + return typeof(this)(ptr.origin, ofs + from, to - from); + } + + auto opSlice(){ return opSlice(0, length); } + + bool opEquals(T)(auto ref T arr) const + { + if (limit != arr.limit) + return false; + size_t s1 = ofs, s2 = arr.ofs; + size_t e1 = s1 + limit, e2 = s2 + limit; + if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0) + { + return ptr.origin[s1/factor .. e1/factor] + == arr.ptr.origin[s2/factor .. e2/factor]; + } + for (size_t i=0;i<limit; i++) + if (this[i] != arr[i]) + return false; + return true; + } + + @property size_t length()const{ return limit; } + +private: + auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; } + auto roundDown()(size_t val){ return val/factor*factor; } + // factor - number of elements in one machine word + enum factor = size_t.sizeof*8/bits; + PackedPtr!(T) ptr; + size_t ofs, limit; +} + + +private struct SliceOverIndexed(T) +{ + enum assignableIndex = is(typeof((){ T.init[0] = Item.init; })); + enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; })); + auto opIndex(size_t idx)const + in + { + assert(idx < to - from); + } + body + { + return (*arr)[from+idx]; + } + + static if (assignableIndex) + void opIndexAssign(Item val, size_t idx) + in + { + assert(idx < to - from); + } + body + { + (*arr)[from+idx] = val; + } + + auto opSlice(size_t a, size_t b) + { + return typeof(this)(from+a, from+b, arr); + } + + // static if (assignableSlice) + void opSliceAssign(T)(T val, size_t start, size_t end) + { + (*arr)[start+from .. end+from] = val; + } + + auto opSlice() + { + return typeof(this)(from, to, arr); + } + + @property size_t length()const { return to-from;} + + auto opDollar()const { return length; } + + @property bool empty()const { return from == to; } + + @property auto front()const { return (*arr)[from]; } + + static if (assignableIndex) + @property void front(Item val) { (*arr)[from] = val; } + + @property auto back()const { return (*arr)[to-1]; } + + static if (assignableIndex) + @property void back(Item val) { (*arr)[to-1] = val; } + + @property auto save() inout { return this; } + + void popFront() { from++; } + + void popBack() { to--; } + + bool opEquals(T)(auto ref T arr) const + { + if (arr.length != length) + return false; + for (size_t i=0; i <length; i++) + if (this[i] != arr[i]) + return false; + return true; + } +private: + alias Item = typeof(T.init[0]); + size_t from, to; + T* arr; +} + +static assert(isRandomAccessRange!(SliceOverIndexed!(int[]))); + +SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x) +if (is(Unqual!T == T)) +{ + return SliceOverIndexed!(const(T))(a, b, x); +} + +// BUG? inout is out of reach +//...SliceOverIndexed.arr only parameters or stack based variables can be inout +SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x) +if (is(Unqual!T == T)) +{ + return SliceOverIndexed!T(a, b, x); +} + +@system unittest +{ + int[] idxArray = [2, 3, 5, 8, 13]; + auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray); + + assert(!sliced.empty); + assert(sliced.front == 2); + sliced.front = 1; + assert(sliced.front == 1); + assert(sliced.back == 13); + sliced.popFront(); + assert(sliced.front == 3); + assert(sliced.back == 13); + sliced.back = 11; + assert(sliced.back == 11); + sliced.popBack(); + + assert(sliced.front == 3); + assert(sliced[$-1] == 8); + sliced = sliced[]; + assert(sliced[0] == 3); + assert(sliced.back == 8); + sliced = sliced[1..$]; + assert(sliced.front == 5); + sliced = sliced[0..$-1]; + assert(sliced[$-1] == 5); + + int[] other = [2, 5]; + assert(sliced[] == sliceOverIndexed(1, 2, &other)); + sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1; + assert(idxArray[0 .. 2] == [-1, -1]); + uint[] nullArr = null; + auto nullSlice = sliceOverIndexed(0, 0, &idxArray); + assert(nullSlice.empty); +} + +private auto packedArrayView(T)(inout(size_t)* ptr, size_t items) @trusted pure nothrow +{ + return inout(PackedArrayView!T)(ptr, 0, items); +} + + +//============================================================================ +// Partially unrolled binary search using Shar's method +//============================================================================ + +string genUnrolledSwitchSearch(size_t size) @safe pure nothrow +{ + import core.bitop : bsr; + import std.array : replace; + import std.conv : to; + assert(isPow2OrZero(size)); + string code = ` + import core.bitop : bsr; + auto power = bsr(m)+1; + switch (power){`; + size_t i = bsr(size); + foreach_reverse (val; 0 .. bsr(size)) + { + auto v = 2^^val; + code ~= ` + case pow: + if (pred(range[idx+m], needle)) + idx += m; + goto case; + `.replace("m", to!string(v)) + .replace("pow", to!string(i)); + i--; + } + code ~= ` + case 0: + if (pred(range[idx], needle)) + idx += 1; + goto default; + `; + code ~= ` + default: + }`; + return code; +} + +bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc +{ + // See also: std.math.isPowerOf2() + return (sz & (sz-1)) == 0; +} + +size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle) +if (is(T : ElementType!Range)) +{ + assert(isPow2OrZero(range.length)); + size_t idx = 0, m = range.length/2; + while (m != 0) + { + if (pred(range[idx+m], needle)) + idx += m; + m /= 2; + } + if (pred(range[idx], needle)) + idx += 1; + return idx; +} + +size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle) +if (is(T : ElementType!Range)) +{ + assert(isPow2OrZero(range.length)); + size_t idx = 0, m = range.length/2; + enum max = 1 << 10; + while (m >= max) + { + if (pred(range[idx+m], needle)) + idx += m; + m /= 2; + } + mixin(genUnrolledSwitchSearch(max)); + return idx; +} + +template sharMethod(alias uniLowerBound) +{ + size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle) + if (is(T : ElementType!Range)) + { + import std.functional : binaryFun; + import std.math : nextPow2, truncPow2; + alias pred = binaryFun!_pred; + if (range.length == 0) + return 0; + if (isPow2OrZero(range.length)) + return uniLowerBound!pred(range, needle); + size_t n = truncPow2(range.length); + if (pred(range[n-1], needle)) + {// search in another 2^^k area that fully covers the tail of range + size_t k = nextPow2(range.length - n + 1); + return range.length - k + uniLowerBound!pred(range[$-k..$], needle); + } + else + return uniLowerBound!pred(range[0 .. n], needle); + } +} + +alias sharLowerBound = sharMethod!uniformLowerBound; +alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound; + +@safe unittest +{ + import std.array : array; + import std.range : assumeSorted, iota; + + auto stdLowerBound(T)(T[] range, T needle) + { + return assumeSorted(range).lowerBound(needle).length; + } + immutable MAX = 5*1173; + auto arr = array(iota(5, MAX, 5)); + assert(arr.length == MAX/5-1); + foreach (i; 0 .. MAX+5) + { + auto st = stdLowerBound(arr, i); + assert(st == sharLowerBound(arr, i)); + assert(st == sharSwitchLowerBound(arr, i)); + } + arr = []; + auto st = stdLowerBound(arr, 33); + assert(st == sharLowerBound(arr, 33)); + assert(st == sharSwitchLowerBound(arr, 33)); +} +//============================================================================ + +@safe +{ +// hope to see simillar stuff in public interface... once Allocators are out +//@@@BUG moveFront and friends? dunno, for now it's POD-only + +@trusted size_t genericReplace(Policy=void, T, Range) + (ref T dest, size_t from, size_t to, Range stuff) +{ + import std.algorithm.mutation : copy; + size_t delta = to - from; + size_t stuff_end = from+stuff.length; + if (stuff.length > delta) + {// replace increases length + delta = stuff.length - delta;// now, new is > old by delta + static if (is(Policy == void)) + dest.length = dest.length+delta;//@@@BUG lame @property + else + dest = Policy.realloc(dest, dest.length+delta); + copyBackwards(dest[to .. dest.length-delta], + dest[to+delta .. dest.length]); + copyForward(stuff, dest[from .. stuff_end]); + } + else if (stuff.length == delta) + { + copy(stuff, dest[from .. to]); + } + else + {// replace decreases length by delta + delta = delta - stuff.length; + copy(stuff, dest[from .. stuff_end]); + copyForward(dest[to .. dest.length], + dest[stuff_end .. dest.length-delta]); + static if (is(Policy == void)) + dest.length = dest.length - delta;//@@@BUG lame @property + else + dest = Policy.realloc(dest, dest.length-delta); + } + return stuff_end; +} + + +// Simple storage manipulation policy +@trusted private struct GcPolicy +{ + import std.traits : isDynamicArray; + + static T[] dup(T)(const T[] arr) + { + return arr.dup; + } + + static T[] alloc(T)(size_t size) + { + return new T[size]; + } + + static T[] realloc(T)(T[] arr, size_t sz) + { + arr.length = sz; + return arr; + } + + static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff) + { + replaceInPlace(dest, from, to, stuff); + } + + static void append(T, V)(ref T[] arr, V value) + if (!isInputRange!V) + { + arr ~= force!T(value); + } + + static void append(T, V)(ref T[] arr, V value) + if (isInputRange!V) + { + insertInPlace(arr, arr.length, value); + } + + static void destroy(T)(ref T arr) + if (isDynamicArray!T && is(Unqual!T == T)) + { + debug + { + arr[] = cast(typeof(T.init[0]))(0xdead_beef); + } + arr = null; + } + + static void destroy(T)(ref T arr) + if (isDynamicArray!T && !is(Unqual!T == T)) + { + arr = null; + } +} + +// ditto +@trusted struct ReallocPolicy +{ + import std.range.primitives : hasLength; + + static T[] dup(T)(const T[] arr) + { + auto result = alloc!T(arr.length); + result[] = arr[]; + return result; + } + + static T[] alloc(T)(size_t size) + { + import core.stdc.stdlib : malloc; + import std.exception : enforce; + + import core.checkedint : mulu; + bool overflow; + size_t nbytes = mulu(size, T.sizeof, overflow); + if (overflow) assert(0); + + auto ptr = cast(T*) enforce(malloc(nbytes), "out of memory on C heap"); + return ptr[0 .. size]; + } + + static T[] realloc(T)(T[] arr, size_t size) + { + import core.stdc.stdlib : realloc; + import std.exception : enforce; + if (!size) + { + destroy(arr); + return null; + } + + import core.checkedint : mulu; + bool overflow; + size_t nbytes = mulu(size, T.sizeof, overflow); + if (overflow) assert(0); + + auto ptr = cast(T*) enforce(realloc(arr.ptr, nbytes), "out of memory on C heap"); + return ptr[0 .. size]; + } + + static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff) + { + genericReplace!(ReallocPolicy)(dest, from, to, stuff); + } + + static void append(T, V)(ref T[] arr, V value) + if (!isInputRange!V) + { + if (arr.length == size_t.max) assert(0); + arr = realloc(arr, arr.length+1); + arr[$-1] = force!T(value); + } + + @safe unittest + { + int[] arr; + ReallocPolicy.append(arr, 3); + + import std.algorithm.comparison : equal; + assert(equal(arr, [3])); + } + + static void append(T, V)(ref T[] arr, V value) + if (isInputRange!V && hasLength!V) + { + import core.checkedint : addu; + bool overflow; + size_t nelems = addu(arr.length, value.length, overflow); + if (overflow) assert(0); + + arr = realloc(arr, nelems); + + import std.algorithm.mutation : copy; + copy(value, arr[$-value.length..$]); + } + + @safe unittest + { + int[] arr; + ReallocPolicy.append(arr, [1,2,3]); + + import std.algorithm.comparison : equal; + assert(equal(arr, [1,2,3])); + } + + static void destroy(T)(ref T[] arr) + { + import core.stdc.stdlib : free; + if (arr.ptr) + free(arr.ptr); + arr = null; + } +} + +//build hack +alias _RealArray = CowArray!ReallocPolicy; + +@safe unittest +{ + import std.algorithm.comparison : equal; + + with(ReallocPolicy) + { + bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result, + string file = __FILE__, size_t line = __LINE__) + { + { + replaceImpl(orig, from, to, toReplace); + scope(exit) destroy(orig); + if (!equal(orig, result)) + return false; + } + return true; + } + static T[] arr(T)(T[] args... ) + { + return dup(args); + } + + assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4])); + assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4])); + assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7])); + assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4])); + assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4])); + } +} + +/** + Tests if T is some kind a set of code points. Intended for template constraints. +*/ +public template isCodepointSet(T) +{ + static if (is(T dummy == InversionList!(Args), Args...)) + enum isCodepointSet = true; + else + enum isCodepointSet = false; +} + +/** + Tests if $(D T) is a pair of integers that implicitly convert to $(D V). + The following code must compile for any pair $(D T): + --- + (T x){ V a = x[0]; V b = x[1];} + --- + The following must not compile: + --- + (T x){ V c = x[2];} + --- +*/ +public template isIntegralPair(T, V=uint) +{ + enum isIntegralPair = is(typeof((T x){ V a = x[0]; V b = x[1];})) + && !is(typeof((T x){ V c = x[2]; })); +} + + +/** + The recommended default type for set of $(CODEPOINTS). + For details, see the current implementation: $(LREF InversionList). +*/ +public alias CodepointSet = InversionList!GcPolicy; + + +//@@@BUG: std.typecons tuples depend on std.format to produce fields mixin +// which relies on std.uni.isGraphical and this chain blows up with Forward reference error +// hence below doesn't seem to work +// public alias CodepointInterval = Tuple!(uint, "a", uint, "b"); + +/** + The recommended type of $(REF Tuple, std,_typecons) + to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList). + Any interval type should pass $(LREF isIntegralPair) trait. +*/ +public struct CodepointInterval +{ +pure: + uint[2] _tuple; + alias _tuple this; + +@safe pure nothrow @nogc: + + this(uint low, uint high) + { + _tuple[0] = low; + _tuple[1] = high; + } + bool opEquals(T)(T val) const + { + return this[0] == val[0] && this[1] == val[1]; + } + @property ref inout(uint) a() inout { return _tuple[0]; } + @property ref inout(uint) b() inout { return _tuple[1]; } +} + +/** + $(P + $(D InversionList) is a set of $(CODEPOINTS) + represented as an array of open-right [a, b$(RPAREN) + intervals (see $(LREF CodepointInterval) above). + The name comes from the way the representation reads left to right. + For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN), + plus a singular value 60 looks like this: + ) + --- + 10, 50, 60, 61, 80, 90 + --- + $(P + The way to read this is: start with negative meaning that all numbers + smaller then the next one are not present in this set (and positive + - the contrary). Then switch positive/negative after each + number passed from left to right. + ) + $(P This way negative spans until 10, then positive until 50, + then negative until 60, then positive until 61, and so on. + As seen this provides a space-efficient storage of highly redundant data + that comes in long runs. A description which Unicode $(CHARACTER) + properties fit nicely. The technique itself could be seen as a variation + on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding). + ) + + $(P Sets are value types (just like $(D int) is) thus they + are never aliased. + ) + Example: + --- + auto a = CodepointSet('a', 'z'+1); + auto b = CodepointSet('A', 'Z'+1); + auto c = a; + a = a | b; + assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1)); + assert(a != c); + --- + $(P See also $(LREF unicode) for simpler construction of sets + from predefined ones. + ) + + $(P Memory usage is 8 bytes per each contiguous interval in a set. + The value semantics are achieved by using the + $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique + and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared). + ) + + Note: + $(P It's not recommended to rely on the template parameters + or the exact type of a current $(CODEPOINT) set in $(D std.uni). + The type and parameters may change when the standard + allocators design is finalized. + Use $(LREF isCodepointSet) with templates or just stick with the default + alias $(LREF CodepointSet) throughout the whole code base. + ) +*/ +@trusted public struct InversionList(SP=GcPolicy) +{ + import std.range : assumeSorted; + + /** + Construct from another code point set of any type. + */ + this(Set)(Set set) pure + if (isCodepointSet!Set) + { + uint[] arr; + foreach (v; set.byInterval) + { + arr ~= v.a; + arr ~= v.b; + } + data = CowArray!(SP).reuse(arr); + } + + /** + Construct a set from a forward range of code point intervals. + */ + this(Range)(Range intervals) pure + if (isForwardRange!Range && isIntegralPair!(ElementType!Range)) + { + uint[] arr; + foreach (v; intervals) + { + SP.append(arr, v.a); + SP.append(arr, v.b); + } + data = CowArray!(SP).reuse(arr); + sanitize(); //enforce invariant: sort intervals etc. + } + + //helper function that avoids sanity check to be CTFE-friendly + private static fromIntervals(Range)(Range intervals) pure + { + import std.algorithm.iteration : map; + import std.range : roundRobin; + auto flattened = roundRobin(intervals.save.map!"a[0]"(), + intervals.save.map!"a[1]"()); + InversionList set; + set.data = CowArray!(SP)(flattened); + return set; + } + //ditto untill sort is CTFE-able + private static fromIntervals()(uint[] intervals...) pure + in + { + import std.conv : text; + assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!"); + for (uint i = 0; i < intervals.length; i += 2) + { + auto a = intervals[i], b = intervals[i+1]; + assert(a < b, text("illegal interval [a, b): ", a, " > ", b)); + } + } + body + { + InversionList set; + set.data = CowArray!(SP)(intervals); + return set; + } + + /** + Construct a set from plain values of code point intervals. + */ + this()(uint[] intervals...) + in + { + import std.conv : text; + assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!"); + for (uint i = 0; i < intervals.length; i += 2) + { + auto a = intervals[i], b = intervals[i+1]; + assert(a < b, text("illegal interval [a, b): ", a, " > ", b)); + } + } + body + { + data = CowArray!(SP)(intervals); + sanitize(); //enforce invariant: sort intervals etc. + } + + /// + @safe unittest + { + import std.algorithm.comparison : equal; + + auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1); + foreach (v; 'a'..'z'+1) + assert(set[v]); + // Cyrillic lowercase interval + foreach (v; 'а'..'я'+1) + assert(set[v]); + //specific order is not required, intervals may interesect + auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1); + //the same end result + assert(set2.byInterval.equal(set.byInterval)); + } + + /** + Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList). + + Example: + ----------- + import std.algorithm.comparison : equal; + import std.typecons : tuple; + + auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1); + + assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')])); + ----------- + */ + @property auto byInterval() + { + return Intervals!(typeof(data))(data); + } + + /** + Tests the presence of code point $(D val) in this set. + */ + bool opIndex(uint val) const + { + // the <= ensures that searching in interval of [a, b) for 'a' you get .length == 1 + // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1; + return sharSwitchLowerBound!"a <= b"(data[], val) & 1; + } + + /// + @safe unittest + { + auto gothic = unicode.Gothic; + // Gothic letter ahsa + assert(gothic['\U00010330']); + // no ascii in Gothic obviously + assert(!gothic['$']); + } + + + // Linear scan for $(D ch). Useful only for small sets. + // TODO: + // used internally in std.regex + // should be properly exposed in a public API ? + package auto scanFor()(dchar ch) const + { + immutable len = data.length; + for (size_t i = 0; i < len; i++) + if (ch < data[i]) + return i & 1; + return 0; + } + + /// Number of $(CODEPOINTS) in this set + @property size_t length() + { + size_t sum = 0; + foreach (iv; byInterval) + { + sum += iv.b - iv.a; + } + return sum; + } + +// bootstrap full set operations from 4 primitives (suitable as a template mixin): +// addInterval, skipUpTo, dropUpTo & byInterval iteration +//============================================================================ +public: + /** + $(P Sets support natural syntax for set algebra, namely: ) + $(BOOKTABLE , + $(TR $(TH Operator) $(TH Math notation) $(TH Description) ) + $(TR $(TD &) $(TD a ∩ b) $(TD intersection) ) + $(TR $(TD |) $(TD a ∪ b) $(TD union) ) + $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) ) + $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) ) + ) + */ + This opBinary(string op, U)(U rhs) + if (isCodepointSet!U || is(U:dchar)) + { + static if (op == "&" || op == "|" || op == "~") + {// symmetric ops thus can swap arguments to reuse r-value + static if (is(U:dchar)) + { + auto tmp = this; + mixin("tmp "~op~"= rhs; "); + return tmp; + } + else + { + static if (is(Unqual!U == U)) + { + // try hard to reuse r-value + mixin("rhs "~op~"= this;"); + return rhs; + } + else + { + auto tmp = this; + mixin("tmp "~op~"= rhs;"); + return tmp; + } + } + } + else static if (op == "-") // anti-symmetric + { + auto tmp = this; + tmp -= rhs; + return tmp; + } + else + static assert(0, "no operator "~op~" defined for Set"); + } + + /// + @safe unittest + { + import std.algorithm.comparison : equal; + import std.range : iota; + + auto lower = unicode.LowerCase; + auto upper = unicode.UpperCase; + auto ascii = unicode.ASCII; + + assert((lower & upper).empty); // no intersection + auto lowerASCII = lower & ascii; + assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1))); + // throw away all of the lowercase ASCII + assert((ascii - lower).length == 128 - 26); + + auto onlyOneOf = lower ~ ascii; + assert(!onlyOneOf['Δ']); // not ASCII and not lowercase + assert(onlyOneOf['$']); // ASCII and not lowercase + assert(!onlyOneOf['a']); // ASCII and lowercase + assert(onlyOneOf['я']); // not ASCII but lowercase + + // throw away all cased letters from ASCII + auto noLetters = ascii - (lower | upper); + assert(noLetters.length == 128 - 26*2); + } + + /// The 'op=' versions of the above overloaded operators. + ref This opOpAssign(string op, U)(U rhs) + if (isCodepointSet!U || is(U:dchar)) + { + static if (op == "|") // union + { + static if (is(U:dchar)) + { + this.addInterval(rhs, rhs+1); + return this; + } + else + return this.add(rhs); + } + else static if (op == "&") // intersection + return this.intersect(rhs);// overloaded + else static if (op == "-") // set difference + return this.sub(rhs);// overloaded + else static if (op == "~") // symmetric set difference + { + auto copy = this & rhs; + this |= rhs; + this -= copy; + return this; + } + else + static assert(0, "no operator "~op~" defined for Set"); + } + + /** + Tests the presence of codepoint $(D ch) in this set, + the same as $(LREF opIndex). + */ + bool opBinaryRight(string op: "in", U)(U ch) const + if (is(U : dchar)) + { + return this[ch]; + } + + /// + @safe unittest + { + assert('я' in unicode.Cyrillic); + assert(!('z' in unicode.Cyrillic)); + } + + + + /** + * Obtains a set that is the inversion of this set. + * + * See_Also: $(LREF inverted) + */ + auto opUnary(string op: "!")() + { + return this.inverted; + } + + /** + A range that spans each $(CODEPOINT) in this set. + */ + @property auto byCodepoint() + { + @trusted static struct CodepointRange + { + this(This set) + { + r = set.byInterval; + if (!r.empty) + cur = r.front.a; + } + + @property dchar front() const + { + return cast(dchar) cur; + } + + @property bool empty() const + { + return r.empty; + } + + void popFront() + { + cur++; + while (cur >= r.front.b) + { + r.popFront(); + if (r.empty) + break; + cur = r.front.a; + } + } + private: + uint cur; + typeof(This.init.byInterval) r; + } + + return CodepointRange(this); + } + + /// + @safe unittest + { + import std.algorithm.comparison : equal; + import std.range : iota; + + auto set = unicode.ASCII; + set.byCodepoint.equal(iota(0, 0x80)); + } + + /** + $(P Obtain textual representation of this set in from of + open-right intervals and feed it to $(D sink). + ) + $(P Used by various standard formatting facilities such as + $(REF formattedWrite, std,_format), $(REF write, std,_stdio), + $(REF writef, std,_stdio), $(REF to, std,_conv) and others. + ) + Example: + --- + import std.conv; + assert(unicode.ASCII.to!string == "[0..128$(RPAREN)"); + --- + */ + + private import std.format : FormatSpec; + + /*************************************** + * Obtain a textual representation of this InversionList + * in form of open-right intervals. + * + * The formatting flag is applied individually to each value, for example: + * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals) + * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters) + * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters) + */ + void toString(Writer)(scope Writer sink, + FormatSpec!char fmt) /* const */ + { + import std.format : formatValue; + auto range = byInterval; + if (range.empty) + return; + + while (1) + { + auto i = range.front; + range.popFront(); + + put(sink, "["); + formatValue(sink, i.a, fmt); + put(sink, ".."); + formatValue(sink, i.b, fmt); + put(sink, ")"); + if (range.empty) return; + put(sink, " "); + } + } + + /// + @safe unittest + { + import std.conv : to; + import std.format : format; + import std.uni : unicode; + + assert(unicode.Cyrillic.to!string == + "[1024..1157) [1159..1320) [7467..7468) [7544..7545) [11744..11776) [42560..42648) [42655..42656)"); + + // The specs '%s' and '%d' are equivalent to the to!string call above. + assert(format("%d", unicode.Cyrillic) == unicode.Cyrillic.to!string); + + assert(format("%#x", unicode.Cyrillic) == + "[0x400..0x485) [0x487..0x528) [0x1d2b..0x1d2c) [0x1d78..0x1d79) [0x2de0..0x2e00) " + ~"[0xa640..0xa698) [0xa69f..0xa6a0)"); + + assert(format("%#X", unicode.Cyrillic) == + "[0X400..0X485) [0X487..0X528) [0X1D2B..0X1D2C) [0X1D78..0X1D79) [0X2DE0..0X2E00) " + ~"[0XA640..0XA698) [0XA69F..0XA6A0)"); + } + + @safe unittest + { + import std.exception : assertThrown; + import std.format : format, FormatException; + assertThrown!FormatException(format("%a", unicode.ASCII)); + } + + + /** + Add an interval [a, b$(RPAREN) to this set. + */ + ref add()(uint a, uint b) + { + addInterval(a, b); + return this; + } + + /// + @safe unittest + { + CodepointSet someSet; + someSet.add('0', '5').add('A','Z'+1); + someSet.add('5', '9'+1); + assert(someSet['0']); + assert(someSet['5']); + assert(someSet['9']); + assert(someSet['Z']); + } + +private: + + package(std) // used from: std.regex.internal.parser + ref intersect(U)(U rhs) + if (isCodepointSet!U) + { + Marker mark; + foreach ( i; rhs.byInterval) + { + mark = this.dropUpTo(i.a, mark); + mark = this.skipUpTo(i.b, mark); + } + this.dropUpTo(uint.max, mark); + return this; + } + + ref intersect()(dchar ch) + { + foreach (i; byInterval) + if (i.a <= ch && ch < i.b) + return this = This.init.add(ch, ch+1); + this = This.init; + return this; + } + + @safe unittest + { + assert(unicode.Cyrillic.intersect('-').byInterval.empty); + } + + ref sub()(dchar ch) + { + return subChar(ch); + } + + // same as the above except that skip & drop parts are swapped + package(std) // used from: std.regex.internal.parser + ref sub(U)(U rhs) + if (isCodepointSet!U) + { + Marker mark; + foreach (i; rhs.byInterval) + { + mark = this.skipUpTo(i.a, mark); + mark = this.dropUpTo(i.b, mark); + } + return this; + } + + package(std) // used from: std.regex.internal.parse + ref add(U)(U rhs) + if (isCodepointSet!U) + { + Marker start; + foreach (i; rhs.byInterval) + { + start = addInterval(i.a, i.b, start); + } + return this; + } + +// end of mixin-able part +//============================================================================ +public: + /** + Obtains a set that is the inversion of this set. + + See the '!' $(LREF opUnary) for the same but using operators. + */ + @property auto inverted() + { + InversionList inversion = this; + if (inversion.data.length == 0) + { + inversion.addInterval(0, lastDchar+1); + return inversion; + } + if (inversion.data[0] != 0) + genericReplace(inversion.data, 0, 0, [0]); + else + genericReplace(inversion.data, 0, 1, cast(uint[]) null); + if (data[data.length-1] != lastDchar+1) + genericReplace(inversion.data, + inversion.data.length, inversion.data.length, [lastDchar+1]); + else + genericReplace(inversion.data, + inversion.data.length-1, inversion.data.length, cast(uint[]) null); + + return inversion; + } + + /// + @safe unittest + { + auto set = unicode.ASCII; + // union with the inverse gets all of the code points in the Unicode + assert((set | set.inverted).length == 0x110000); + // no intersection with the inverse + assert((set & set.inverted).empty); + } + + /** + Generates string with D source code of unary function with name of + $(D funcName) taking a single $(D dchar) argument. If $(D funcName) is empty + the code is adjusted to be a lambda function. + + The function generated tests if the $(CODEPOINT) passed + belongs to this set or not. The result is to be used with string mixin. + The intended usage area is aggressive optimization via meta programming + in parser generators and the like. + + Note: Use with care for relatively small or regular sets. It + could end up being slower then just using multi-staged tables. + + Example: + --- + import std.stdio; + + // construct set directly from [a, b$RPAREN intervals + auto set = CodepointSet(10, 12, 45, 65, 100, 200); + writeln(set); + writeln(set.toSourceCode("func")); + --- + + The above outputs something along the lines of: + --- + bool func(dchar ch) @safe pure nothrow @nogc + { + if (ch < 45) + { + if (ch == 10 || ch == 11) return true; + return false; + } + else if (ch < 65) return true; + else + { + if (ch < 100) return false; + if (ch < 200) return true; + return false; + } + } + --- + */ + string toSourceCode(string funcName="") + { + import std.algorithm.searching : countUntil; + import std.array : array; + import std.format : format; + enum maxBinary = 3; + static string linearScope(R)(R ivals, string indent) + { + string result = indent~"{\n"; + string deeper = indent~" "; + foreach (ival; ivals) + { + immutable span = ival[1] - ival[0]; + assert(span != 0); + if (span == 1) + { + result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]); + } + else if (span == 2) + { + result ~= format("%sif (ch == %s || ch == %s) return true;\n", + deeper, ival[0], ival[0]+1); + } + else + { + if (ival[0] != 0) // dchar is unsigned and < 0 is useless + result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]); + result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]); + } + } + result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals + return result; + } + + static string binaryScope(R)(R ivals, string indent) + { + // time to do unrolled comparisons? + if (ivals.length < maxBinary) + return linearScope(ivals, indent); + else + return bisect(ivals, ivals.length/2, indent); + } + + // not used yet if/elsebinary search is far better with DMD as of 2.061 + // and GDC is doing fine job either way + static string switchScope(R)(R ivals, string indent) + { + string result = indent~"switch (ch){\n"; + string deeper = indent~" "; + foreach (ival; ivals) + { + if (ival[0]+1 == ival[1]) + { + result ~= format("%scase %s: return true;\n", + deeper, ival[0]); + } + else + { + result ~= format("%scase %s: .. case %s: return true;\n", + deeper, ival[0], ival[1]-1); + } + } + result ~= deeper~"default: return false;\n"~indent~"}\n"; + return result; + } + + static string bisect(R)(R range, size_t idx, string indent) + { + string deeper = indent ~ " "; + // bisect on one [a, b) interval at idx + string result = indent~"{\n"; + // less branch, < a + result ~= format("%sif (ch < %s)\n%s", + deeper, range[idx][0], binaryScope(range[0 .. idx], deeper)); + // middle point, >= a && < b + result ~= format("%selse if (ch < %s) return true;\n", + deeper, range[idx][1]); + // greater or equal branch, >= b + result ~= format("%selse\n%s", + deeper, binaryScope(range[idx+1..$], deeper)); + return result~indent~"}\n"; + } + + string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n", + funcName.empty ? "function" : funcName); + auto range = byInterval.array(); + // special case first bisection to be on ASCII vs beyond + auto tillAscii = countUntil!"a[0] > 0x80"(range); + if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0) + code ~= binaryScope(range, ""); + else + code ~= bisect(range, tillAscii, ""); + return code; + } + + /** + True if this set doesn't contain any $(CODEPOINTS). + */ + @property bool empty() const + { + return data.length == 0; + } + + /// + @safe unittest + { + CodepointSet emptySet; + assert(emptySet.length == 0); + assert(emptySet.empty); + } + +private: + alias This = typeof(this); + alias Marker = size_t; + + // a random-access range of integral pairs + static struct Intervals(Range) + { + this(Range sp) + { + slice = sp; + start = 0; + end = sp.length; + } + + this(Range sp, size_t s, size_t e) + { + slice = sp; + start = s; + end = e; + } + + @property auto front()const + { + immutable a = slice[start]; + immutable b = slice[start+1]; + return CodepointInterval(a, b); + } + + //may break sorted property - but we need std.sort to access it + //hence package protection attribute + package @property void front(CodepointInterval val) + { + slice[start] = val.a; + slice[start+1] = val.b; + } + + @property auto back()const + { + immutable a = slice[end-2]; + immutable b = slice[end-1]; + return CodepointInterval(a, b); + } + + //ditto about package + package @property void back(CodepointInterval val) + { + slice[end-2] = val.a; + slice[end-1] = val.b; + } + + void popFront() + { + start += 2; + } + + void popBack() + { + end -= 2; + } + + auto opIndex(size_t idx) const + { + immutable a = slice[start+idx*2]; + immutable b = slice[start+idx*2+1]; + return CodepointInterval(a, b); + } + + //ditto about package + package void opIndexAssign(CodepointInterval val, size_t idx) + { + slice[start+idx*2] = val.a; + slice[start+idx*2+1] = val.b; + } + + auto opSlice(size_t s, size_t e) + { + return Intervals(slice, s*2+start, e*2+start); + } + + @property size_t length()const { return slice.length/2; } + + @property bool empty()const { return start == end; } + + @property auto save(){ return this; } + private: + size_t start, end; + Range slice; + } + + // called after construction from intervals + // to make sure invariants hold + void sanitize() + { + import std.algorithm.comparison : max; + import std.algorithm.mutation : SwapStrategy; + import std.algorithm.sorting : sort; + if (data.length == 0) + return; + alias Ival = CodepointInterval; + //intervals wrapper for a _range_ over packed array + auto ivals = Intervals!(typeof(data[]))(data[]); + //@@@BUG@@@ can't use "a.a < b.a" see issue 12265 + sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals); + // what follows is a variation on stable remove + // differences: + // - predicate is binary, and is tested against + // the last kept element (at 'i'). + // - predicate mutates lhs (merges rhs into lhs) + size_t len = ivals.length; + size_t i = 0; + size_t j = 1; + while (j < len) + { + if (ivals[i].b >= ivals[j].a) + { + ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b)); + j++; + } + else //unmergable + { + // check if there is a hole after merges + // (in the best case we do 0 writes to ivals) + if (j != i+1) + ivals[i+1] = ivals[j]; //copy over + i++; + j++; + } + } + len = i + 1; + for (size_t k=0; k + 1 < len; k++) + { + assert(ivals[k].a < ivals[k].b); + assert(ivals[k].b < ivals[k+1].a); + } + data.length = len * 2; + } + + // special case for normal InversionList + ref subChar(dchar ch) + { + auto mark = skipUpTo(ch); + if (mark != data.length + && data[mark] == ch && data[mark-1] == ch) + { + // it has split, meaning that ch happens to be in one of intervals + data[mark] = data[mark]+1; + } + return this; + } + + // + Marker addInterval(int a, int b, Marker hint=Marker.init) + in + { + assert(a <= b); + } + body + { + import std.range : assumeSorted, SearchPolicy; + auto range = assumeSorted(data[]); + size_t pos; + size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length; + if (a_idx == range.length) + { + // [---+++----++++----++++++] + // [ a b] + data.append(a, b); + return data.length-1; + } + size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx; + uint[3] buf = void; + uint to_insert; + debug(std_uni) + { + writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); + } + if (b_idx == range.length) + { + // [-------++++++++----++++++-] + // [ s a b] + if (a_idx & 1)// a in positive + { + buf[0] = b; + to_insert = 1; + } + else// a in negative + { + buf[0] = a; + buf[1] = b; + to_insert = 2; + } + pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]); + return pos - 1; + } + + uint top = data[b_idx]; + + debug(std_uni) + { + writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); + writefln("a=%s; b=%s; top=%s;", a, b, top); + } + if (a_idx & 1) + {// a in positive + if (b_idx & 1)// b in positive + { + // [-------++++++++----++++++-] + // [ s a b ] + buf[0] = top; + to_insert = 1; + } + else // b in negative + { + // [-------++++++++----++++++-] + // [ s a b ] + if (top == b) + { + assert(b_idx+1 < data.length); + buf[0] = data[b_idx+1]; + pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]); + return pos - 1; + } + buf[0] = b; + buf[1] = top; + to_insert = 2; + } + } + else + { // a in negative + if (b_idx & 1) // b in positive + { + // [----------+++++----++++++-] + // [ a b ] + buf[0] = a; + buf[1] = top; + to_insert = 2; + } + else// b in negative + { + // [----------+++++----++++++-] + // [ a s b ] + if (top == b) + { + assert(b_idx+1 < data.length); + buf[0] = a; + buf[1] = data[b_idx+1]; + pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]); + return pos - 1; + } + buf[0] = a; + buf[1] = b; + buf[2] = top; + to_insert = 3; + } + } + pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]); + debug(std_uni) + { + writefln("marker idx: %d; length=%d", pos, data[pos], data.length); + writeln("inserting ", buf[0 .. to_insert]); + } + return pos - 1; + } + + // + Marker dropUpTo(uint a, Marker pos=Marker.init) + in + { + assert(pos % 2 == 0); // at start of interval + } + body + { + auto range = assumeSorted!"a <= b"(data[pos .. data.length]); + if (range.empty) + return pos; + size_t idx = pos; + idx += range.lowerBound(a).length; + + debug(std_uni) + { + writeln("dropUpTo full length=", data.length); + writeln(pos,"~~~", idx); + } + if (idx == data.length) + return genericReplace(data, pos, idx, cast(uint[])[]); + if (idx & 1) + { // a in positive + //[--+++----++++++----+++++++------...] + // |<---si s a t + genericReplace(data, pos, idx, [a]); + } + else + { // a in negative + //[--+++----++++++----+++++++-------+++...] + // |<---si s a t + genericReplace(data, pos, idx, cast(uint[])[]); + } + return pos; + } + + // + Marker skipUpTo(uint a, Marker pos=Marker.init) + out(result) + { + assert(result % 2 == 0);// always start of interval + //(may be 0-width after-split) + } + body + { + assert(data.length % 2 == 0); + auto range = assumeSorted!"a <= b"(data[pos .. data.length]); + size_t idx = pos+range.lowerBound(a).length; + + if (idx >= data.length) // could have Marker point to recently removed stuff + return data.length; + + if (idx & 1)// inside of interval, check for split + { + + immutable top = data[idx]; + if (top == a)// no need to split, it's end + return idx+1; + immutable start = data[idx-1]; + if (a == start) + return idx-1; + // split it up + genericReplace(data, idx, idx+1, [a, a, top]); + return idx+1; // avoid odd index + } + return idx; + } + + CowArray!SP data; +} + +@system unittest +{ + import std.conv : to; + assert(unicode.ASCII.to!string() == "[0..128)"); +} + +// pedantic version for ctfe, and aligned-access only architectures +@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc +{ + idx *= 3; + version (LittleEndian) + return ptr[idx] + (cast(uint) ptr[idx+1]<<8) + + (cast(uint) ptr[idx+2]<<16); + else + return (cast(uint) ptr[idx]<<16) + (cast(uint) ptr[idx+1]<<8) + + ptr[idx+2]; +} + +// ditto +@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc +{ + idx *= 3; + version (LittleEndian) + { + ptr[idx] = val & 0xFF; + ptr[idx+1] = (val >> 8) & 0xFF; + ptr[idx+2] = (val >> 16) & 0xFF; + } + else + { + ptr[idx] = (val >> 16) & 0xFF; + ptr[idx+1] = (val >> 8) & 0xFF; + ptr[idx+2] = val & 0xFF; + } +} + +// unaligned x86-like read/write functions +@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc +{ + uint* src = cast(uint*)(ptr+3*idx); + version (LittleEndian) + return *src & 0xFF_FFFF; + else + return *src >> 8; +} + +// ditto +@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc +{ + uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx); + version (LittleEndian) + *dest = val | (*dest & 0xFF00_0000); + else + *dest = (val << 8) | (*dest & 0xFF); +} + +@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc +{ + static if (hasUnalignedReads) + return __ctfe ? safeRead24(ptr, idx) : unalignedRead24(ptr, idx); + else + return safeRead24(ptr, idx); +} + +@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc +{ + static if (hasUnalignedReads) + return __ctfe ? safeWrite24(ptr, val, idx) : unalignedWrite24(ptr, val, idx); + else + return safeWrite24(ptr, val, idx); +} + +struct CowArray(SP=GcPolicy) +{ + import std.range.primitives : hasLength; + + @safe: + static auto reuse(uint[] arr) + { + CowArray cow; + cow.data = arr; + SP.append(cow.data, 1); + assert(cow.refCount == 1); + assert(cow.length == arr.length); + return cow; + } + + this(Range)(Range range) + if (isInputRange!Range && hasLength!Range) + { + import std.algorithm.mutation : copy; + length = range.length; + copy(range, data[0..$-1]); + } + + this(Range)(Range range) + if (isForwardRange!Range && !hasLength!Range) + { + import std.algorithm.mutation : copy; + import std.range.primitives : walkLength; + immutable len = walkLength(range.save); + length = len; + copy(range, data[0..$-1]); + } + + this(this) + { + if (!empty) + { + refCount = refCount + 1; + } + } + + ~this() + { + if (!empty) + { + immutable cnt = refCount; + if (cnt == 1) + SP.destroy(data); + else + refCount = cnt - 1; + } + } + + // no ref-count for empty U24 array + @property bool empty() const { return data.length == 0; } + + // report one less then actual size + @property size_t length() const + { + return data.length ? data.length - 1 : 0; + } + + //+ an extra slot for ref-count + @property void length(size_t len) + { + import std.algorithm.comparison : min; + import std.algorithm.mutation : copy; + if (len == 0) + { + if (!empty) + freeThisReference(); + return; + } + immutable total = len + 1; // including ref-count + if (empty) + { + data = SP.alloc!uint(total); + refCount = 1; + return; + } + immutable cur_cnt = refCount; + if (cur_cnt != 1) // have more references to this memory + { + refCount = cur_cnt - 1; + auto new_data = SP.alloc!uint(total); + // take shrinking into account + auto to_copy = min(total, data.length) - 1; + copy(data[0 .. to_copy], new_data[0 .. to_copy]); + data = new_data; // before setting refCount! + refCount = 1; + } + else // 'this' is the only reference + { + // use the realloc (hopefully in-place operation) + data = SP.realloc(data, total); + refCount = 1; // setup a ref-count in the new end of the array + } + } + + alias opDollar = length; + + uint opIndex()(size_t idx)const + { + return data[idx]; + } + + void opIndexAssign(uint val, size_t idx) + { + auto cnt = refCount; + if (cnt != 1) + dupThisReference(cnt); + data[idx] = val; + } + + // + auto opSlice(size_t from, size_t to) + { + if (!empty) + { + auto cnt = refCount; + if (cnt != 1) + dupThisReference(cnt); + } + return data[from .. to]; + + } + + // + auto opSlice(size_t from, size_t to) const + { + return data[from .. to]; + } + + // length slices before the ref count + auto opSlice() + { + return opSlice(0, length); + } + + // ditto + auto opSlice() const + { + return opSlice(0, length); + } + + void append(Range)(Range range) + if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint)) + { + size_t nl = length + range.length; + length = nl; + copy(range, this[nl-range.length .. nl]); + } + + void append()(uint[] val...) + { + length = length + val.length; + data[$-val.length-1 .. $-1] = val[]; + } + + bool opEquals()(auto const ref CowArray rhs)const + { + if (empty ^ rhs.empty) + return false; // one is empty and the other isn't + return empty || data[0..$-1] == rhs.data[0..$-1]; + } + +private: + // ref-count is right after the data + @property uint refCount() const + { + return data[$-1]; + } + + @property void refCount(uint cnt) + { + data[$-1] = cnt; + } + + void freeThisReference() + { + immutable count = refCount; + if (count != 1) // have more references to this memory + { + // dec shared ref-count + refCount = count - 1; + data = []; + } + else + SP.destroy(data); + assert(!data.ptr); + } + + void dupThisReference(uint count) + in + { + assert(!empty && count != 1 && count == refCount); + } + body + { + import std.algorithm.mutation : copy; + // dec shared ref-count + refCount = count - 1; + // copy to the new chunk of RAM + auto new_data = SP.alloc!uint(data.length); + // bit-blit old stuff except the counter + copy(data[0..$-1], new_data[0..$-1]); + data = new_data; // before setting refCount! + refCount = 1; // so that this updates the right one + } + + uint[] data; +} + +@safe unittest// Uint24 tests +{ + import std.algorithm.comparison : equal; + import std.algorithm.mutation : copy; + import std.conv : text; + import std.range : iota, chain; + import std.range.primitives : isBidirectionalRange, isOutputRange; + void funcRef(T)(ref T u24) + { + u24.length = 2; + u24[1] = 1024; + T u24_c = u24; + assert(u24[1] == 1024); + u24.length = 0; + assert(u24.empty); + u24.append([1, 2]); + assert(equal(u24[], [1, 2])); + u24.append(111); + assert(equal(u24[], [1, 2, 111])); + assert(!u24_c.empty && u24_c[1] == 1024); + u24.length = 3; + copy(iota(0, 3), u24[]); + assert(equal(u24[], iota(0, 3))); + assert(u24_c[1] == 1024); + } + + void func2(T)(T u24) + { + T u24_2 = u24; + T u24_3; + u24_3 = u24_2; + assert(u24_2 == u24_3); + assert(equal(u24[], u24_2[])); + assert(equal(u24_2[], u24_3[])); + funcRef(u24_3); + + assert(equal(u24_3[], iota(0, 3))); + assert(!equal(u24_2[], u24_3[])); + assert(equal(u24_2[], u24[])); + u24_2 = u24_3; + assert(equal(u24_2[], iota(0, 3))); + // to test that passed arg is intact outside + // plus try out opEquals + u24 = u24_3; + u24 = T.init; + u24_3 = T.init; + assert(u24.empty); + assert(u24 == u24_3); + assert(u24 != u24_2); + } + + foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy)) + { + alias Range = typeof(CowArray!Policy.init[]); + alias U24A = CowArray!Policy; + static assert(isForwardRange!Range); + static assert(isBidirectionalRange!Range); + static assert(isOutputRange!(Range, uint)); + static assert(isRandomAccessRange!(Range)); + + auto arr = U24A([42u, 36, 100]); + assert(arr[0] == 42); + assert(arr[1] == 36); + arr[0] = 72; + arr[1] = 0xFE_FEFE; + assert(arr[0] == 72); + assert(arr[1] == 0xFE_FEFE); + assert(arr[2] == 100); + U24A arr2 = arr; + assert(arr2[0] == 72); + arr2[0] = 11; + // test COW-ness + assert(arr[0] == 72); + assert(arr2[0] == 11); + // set this to about 100M to stress-test COW memory management + foreach (v; 0 .. 10_000) + func2(arr); + assert(equal(arr[], [72, 0xFE_FEFE, 100])); + + auto r2 = U24A(iota(0, 100)); + assert(equal(r2[], iota(0, 100)), text(r2[])); + copy(iota(10, 170, 2), r2[10 .. 90]); + assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100))) + , text(r2[])); + } +} + +version (unittest) +{ + private alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy); +} + +@safe unittest// core set primitives test +{ + import std.conv : text; + foreach (CodeList; AllSets) + { + CodeList a; + //"plug a hole" test + a.add(10, 20).add(25, 30).add(15, 27); + assert(a == CodeList(10, 30), text(a)); + + auto x = CodeList.init; + x.add(10, 20).add(30, 40).add(50, 60); + + a = x; + a.add(20, 49);//[10, 49) [50, 60) + assert(a == CodeList(10, 49, 50 ,60)); + + a = x; + a.add(20, 50); + assert(a == CodeList(10, 60), text(a)); + + // simple unions, mostly edge effects + x = CodeList.init; + x.add(10, 20).add(40, 60); + + a = x; + a.add(10, 25); //[10, 25) [40, 60) + assert(a == CodeList(10, 25, 40, 60)); + + a = x; + a.add(5, 15); //[5, 20) [40, 60) + assert(a == CodeList(5, 20, 40, 60)); + + a = x; + a.add(0, 10); // [0, 20) [40, 60) + assert(a == CodeList(0, 20, 40, 60)); + + a = x; + a.add(0, 5); // prepand + assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a)); + + a = x; + a.add(5, 20); + assert(a == CodeList(5, 20, 40, 60)); + + a = x; + a.add(3, 37); + assert(a == CodeList(3, 37, 40, 60)); + + a = x; + a.add(37, 65); + assert(a == CodeList(10, 20, 37, 65)); + + // some tests on helpers for set intersection + x = CodeList.init.add(10, 20).add(40, 60).add(100, 120); + a = x; + + auto m = a.skipUpTo(60); + a.dropUpTo(110, m); + assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[])); + + a = x; + a.dropUpTo(100); + assert(a == CodeList(100, 120), text(a.data[])); + + a = x; + m = a.skipUpTo(50); + a.dropUpTo(140, m); + assert(a == CodeList(10, 20, 40, 50), text(a.data[])); + a = x; + a.dropUpTo(60); + assert(a == CodeList(100, 120), text(a.data[])); + } +} + + +//test constructor to work with any order of intervals +@safe unittest +{ + import std.algorithm.comparison : equal; + import std.conv : text, to; + import std.range : chain, iota; + import std.typecons : tuple; + //ensure constructor handles bad ordering and overlap + auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1); + foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1))) + assert(ch in c1, to!string(ch)); + + //contiguos + assert(CodepointSet(1000, 1006, 1006, 1009) + .byInterval.equal([tuple(1000, 1009)])); + //contains + assert(CodepointSet(900, 1200, 1000, 1100) + .byInterval.equal([tuple(900, 1200)])); + //intersect left + assert(CodepointSet(900, 1100, 1000, 1200) + .byInterval.equal([tuple(900, 1200)])); + //intersect right + assert(CodepointSet(1000, 1200, 900, 1100) + .byInterval.equal([tuple(900, 1200)])); + + //ditto with extra items at end + assert(CodepointSet(1000, 1200, 900, 1100, 800, 850) + .byInterval.equal([tuple(800, 850), tuple(900, 1200)])); + assert(CodepointSet(900, 1100, 1000, 1200, 800, 850) + .byInterval.equal([tuple(800, 850), tuple(900, 1200)])); + + //"plug a hole" test + auto c2 = CodepointSet(20, 40, + 60, 80, 100, 140, 150, 200, + 40, 60, 80, 100, 140, 150 + ); + assert(c2.byInterval.equal([tuple(20, 200)])); + + auto c3 = CodepointSet( + 20, 40, 60, 80, 100, 140, 150, 200, + 0, 10, 15, 100, 10, 20, 200, 220); + assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)])); +} + + +@safe unittest +{ // full set operations + import std.conv : text; + foreach (CodeList; AllSets) + { + CodeList a, b, c, d; + + //"plug a hole" + a.add(20, 40).add(60, 80).add(100, 140).add(150, 200); + b.add(40, 60).add(80, 100).add(140, 150); + c = a | b; + d = b | a; + assert(c == CodeList(20, 200), text(CodeList.stringof," ", c)); + assert(c == d, text(c," vs ", d)); + + b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210); + c = a | b; //[20,45) [60, 85) [95, 140) [150, 210) + d = b | a; + assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c)); + assert(c == d, text(c," vs ", d)); + + b = CodeList.init.add(10, 20).add(30,100).add(145,200); + c = a | b;//[10, 140) [145, 200) + d = b | a; + assert(c == CodeList(10, 140, 145, 200)); + assert(c == d, text(c," vs ", d)); + + b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220); + c = a | b;//[0, 140) [150, 220) + d = b | a; + assert(c == CodeList(0, 140, 150, 220)); + assert(c == d, text(c," vs ", d)); + + + a = CodeList.init.add(20, 40).add(60, 80); + b = CodeList.init.add(25, 35).add(65, 75); + c = a & b; + d = b & a; + assert(c == CodeList(25, 35, 65, 75), text(c)); + assert(c == d, text(c," vs ", d)); + + a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); + b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180); + c = a & b; + d = b & a; + assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c)); + assert(c == d, text(c," vs ", d)); + + a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); + b = CodeList.init.add(10, 30).add(60, 120).add(135, 160); + c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160) + d = b & a; + + assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c)); + assert(c == d, text(c, " vs ",d)); + assert((c & a) == c); + assert((d & b) == d); + assert((c & d) == d); + + b = CodeList.init.add(40, 60).add(80, 100).add(140, 200); + c = a & b; + d = b & a; + assert(c == CodeList(150, 200), text(c)); + assert(c == d, text(c, " vs ",d)); + assert((c & a) == c); + assert((d & b) == d); + assert((c & d) == d); + + assert((a & a) == a); + assert((b & b) == b); + + a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); + b = CodeList.init.add(30, 60).add(75, 120).add(190, 300); + c = a - b;// [30, 40) [60, 75) [120, 140) [150, 190) + d = b - a;// [40, 60) [80, 100) [200, 300) + assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c)); + assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d)); + assert(c - d == c, text(c-d, " vs ", c)); + assert(d - c == d, text(d-c, " vs ", d)); + assert(c - c == CodeList.init); + assert(d - d == CodeList.init); + + a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150, 200); + b = CodeList.init.add(10, 50).add(60, 160).add(190, 300); + c = a - b;// [160, 190) + d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300) + assert(c == CodeList(160, 190), text(c)); + assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d)); + assert(c - d == c, text(c-d, " vs ", c)); + assert(d - c == d, text(d-c, " vs ", d)); + assert(c - c == CodeList.init); + assert(d - d == CodeList.init); + + a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); + b = CodeList.init.add(10, 30).add(45, 100).add(130, 190); + c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200) + d = b ~ a; + assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200), + text(c)); + assert(c == d, text(c, " vs ", d)); + } +} + +} + +@safe unittest// vs single dchar +{ + import std.conv : text; + CodepointSet a = CodepointSet(10, 100, 120, 200); + assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A')); + assert((a & 'B') == CodepointSet(66, 67)); +} + +@safe unittest// iteration & opIndex +{ + import std.algorithm.comparison : equal; + import std.conv : text; + import std.typecons : tuple, Tuple; + + foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy))) + { + auto arr = "ABCDEFGHIJKLMabcdefghijklm"d; + auto a = CodeList('A','N','a', 'n'); + assert(equal(a.byInterval, + [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')] + ), text(a.byInterval)); + + // same @@@BUG as in issue 8949 ? + version (bug8949) + { + import std.range : retro; + assert(equal(retro(a.byInterval), + [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')] + ), text(retro(a.byInterval))); + } + auto achr = a.byCodepoint; + assert(equal(achr, arr), text(a.byCodepoint)); + foreach (ch; a.byCodepoint) + assert(a[ch]); + auto x = CodeList(100, 500, 600, 900, 1200, 1500); + assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval)); + foreach (ch; x.byCodepoint) + assert(x[ch]); + static if (is(CodeList == CodepointSet)) + { + auto y = CodeList(x.byInterval); + assert(equal(x.byInterval, y.byInterval)); + } + assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[])); + assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[])); + } +} + +//============================================================================ +// Generic Trie template and various ways to build it +//============================================================================ + +// debug helper to get a shortened array dump +auto arrayRepr(T)(T x) +{ + import std.conv : text; + if (x.length > 32) + { + return text(x[0 .. 16],"~...~", x[x.length-16 .. x.length]); + } + else + return text(x); +} + +/** + Maps $(D Key) to a suitable integer index within the range of $(D size_t). + The mapping is constructed by applying predicates from $(D Prefix) left to right + and concatenating the resulting bits. + + The first (leftmost) predicate defines the most significant bits of + the resulting index. + */ +template mapTrieIndex(Prefix...) +{ + size_t mapTrieIndex(Key)(Key key) + if (isValidPrefixForTrie!(Key, Prefix)) + { + alias p = Prefix; + size_t idx; + foreach (i, v; p[0..$-1]) + { + idx |= p[i](key); + idx <<= p[i+1].bitSize; + } + idx |= p[$-1](key); + return idx; + } +} + +/* + $(D TrieBuilder) is a type used for incremental construction + of $(LREF Trie)s. + + See $(LREF buildTrie) for generic helpers built on top of it. +*/ +@trusted private struct TrieBuilder(Value, Key, Args...) +if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args)) +{ + import std.exception : enforce; + +private: + // last index is not stored in table, it is used as an offset to values in a block. + static if (is(Value == bool))// always pack bool + alias V = BitPacked!(Value, 1); + else + alias V = Value; + static auto deduceMaxIndex(Preds...)() + { + size_t idx = 1; + foreach (v; Preds) + idx *= 2^^v.bitSize; + return idx; + } + + static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key + { + alias Prefix = Args[1..$]; + enum lastPageSize = 2^^Prefix[$-1].bitSize; + enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]); + enum roughedMaxIndex = + (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize; + // check warp around - if wrapped, use the default deduction rule + enum maxIndex = roughedMaxIndex < translatedMaxIndex ? + deduceMaxIndex!(Prefix)() : roughedMaxIndex; + } + else + { + alias Prefix = Args; + enum maxIndex = deduceMaxIndex!(Prefix)(); + } + + alias getIndex = mapTrieIndex!(Prefix); + + enum lastLevel = Prefix.length-1; + struct ConstructState + { + size_t idx_zeros, idx_ones; + } + // iteration over levels of Trie, each indexes its own level and thus a shortened domain + size_t[Prefix.length] indices; + // default filler value to use + Value defValue; + // this is a full-width index of next item + size_t curIndex; + // all-zeros page index, all-ones page index (+ indicator if there is such a page) + ConstructState[Prefix.length] state; + // the table being constructed + MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table; + + @disable this(); + + //shortcut for index variable at level 'level' + @property ref idx(size_t level)(){ return indices[level]; } + + // this function assumes no holes in the input so + // indices are going one by one + void addValue(size_t level, T)(T val, size_t numVals) + { + alias j = idx!level; + enum pageSize = 1 << Prefix[level].bitSize; + if (numVals == 0) + return; + auto ptr = table.slice!(level); + if (numVals == 1) + { + static if (level == Prefix.length-1) + ptr[j] = val; + else + {// can incur narrowing conversion + assert(j < ptr.length); + ptr[j] = force!(typeof(ptr[j]))(val); + } + j++; + if (j % pageSize == 0) + spillToNextPage!level(ptr); + return; + } + // longer row of values + // get to the next page boundary + immutable nextPB = (j + pageSize) & ~(pageSize-1); + immutable n = nextPB - j;// can fill right in this page + if (numVals < n) //fits in current page + { + ptr[j .. j+numVals] = val; + j += numVals; + return; + } + static if (level != 0)//on the first level it always fits + { + numVals -= n; + //write till the end of current page + ptr[j .. j+n] = val; + j += n; + //spill to the next page + spillToNextPage!level(ptr); + // page at once loop + if (state[level].idx_zeros != size_t.max && val == T.init) + { + alias NextIdx = typeof(table.slice!(level-1)[0]); + addValue!(level-1)(force!NextIdx(state[level].idx_zeros), + numVals/pageSize); + ptr = table.slice!level; //table structure might have changed + numVals %= pageSize; + } + else + { + while (numVals >= pageSize) + { + numVals -= pageSize; + ptr[j .. j+pageSize] = val; + j += pageSize; + spillToNextPage!level(ptr); + } + } + if (numVals) + { + // the leftovers, an incomplete page + ptr[j .. j+numVals] = val; + j += numVals; + } + } + } + + void spillToNextPage(size_t level, Slice)(ref Slice ptr) + { + // last level (i.e. topmost) has 1 "page" + // thus it need not to add a new page on upper level + static if (level != 0) + spillToNextPageImpl!(level)(ptr); + } + + // this can re-use the current page if duplicate or allocate a new one + // it also makes sure that previous levels point to the correct page in this level + void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr) + { + alias NextIdx = typeof(table.slice!(level-1)[0]); + NextIdx next_lvl_index; + enum pageSize = 1 << Prefix[level].bitSize; + assert(idx!level % pageSize == 0); + immutable last = idx!level-pageSize; + const slice = ptr[idx!level - pageSize .. idx!level]; + size_t j; + for (j=0; j<last; j+=pageSize) + { + if (ptr[j .. j+pageSize] == slice) + { + // get index to it, reuse ptr space for the next block + next_lvl_index = force!NextIdx(j/pageSize); + version (none) + { + import std.stdio : writefln, writeln; + writefln("LEVEL(%s) page mapped idx: %s: 0..%s ---> [%s..%s]" + ,level + ,indices[level-1], pageSize, j, j+pageSize); + writeln("LEVEL(", level + , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize])); + writeln("LEVEL(", level + , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize])); + } + idx!level -= pageSize; // reuse this page, it is duplicate + break; + } + } + if (j == last) + { + L_allocate_page: + next_lvl_index = force!NextIdx(idx!level/pageSize - 1); + if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize)) + { + state[level].idx_zeros = next_lvl_index; + } + // allocate next page + version (none) + { + import std.stdio : writefln; + writefln("LEVEL(%s) page allocated: %s" + , level, arrayRepr(slice[0 .. pageSize])); + writefln("LEVEL(%s) index: %s ; page at this index %s" + , level + , next_lvl_index + , arrayRepr( + table.slice!(level) + [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize] + )); + } + table.length!level = table.length!level + pageSize; + } + L_know_index: + // for the previous level, values are indices to the pages in the current level + addValue!(level-1)(next_lvl_index, 1); + ptr = table.slice!level; //re-load the slice after moves + } + + // idx - full-width index to fill with v (full-width index != key) + // fills everything in the range of [curIndex, idx) with filler + void putAt(size_t idx, Value v) + { + assert(idx >= curIndex); + immutable numFillers = idx - curIndex; + addValue!lastLevel(defValue, numFillers); + addValue!lastLevel(v, 1); + curIndex = idx + 1; + } + + // ditto, but sets the range of [idxA, idxB) to v + void putRangeAt(size_t idxA, size_t idxB, Value v) + { + assert(idxA >= curIndex); + assert(idxB >= idxA); + size_t numFillers = idxA - curIndex; + addValue!lastLevel(defValue, numFillers); + addValue!lastLevel(v, idxB - idxA); + curIndex = idxB; // open-right + } + + enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~ + "duplicate key->value mapping"; + +public: + /** + Construct a builder, where $(D filler) is a value + to indicate empty slots (or "not found" condition). + */ + this(Value filler) + { + curIndex = 0; + defValue = filler; + // zeros-page index, ones-page index + foreach (ref v; state) + v = ConstructState(size_t.max, size_t.max); + table = typeof(table)(indices); + // one page per level is a bootstrap minimum + foreach (i, Pred; Prefix) + table.length!i = (1 << Pred.bitSize); + } + + /** + Put a value $(D v) into interval as + mapped by keys from $(D a) to $(D b). + All slots prior to $(D a) are filled with + the default filler. + */ + void putRange(Key a, Key b, Value v) + { + auto idxA = getIndex(a), idxB = getIndex(b); + // indexes of key should always grow + enforce(idxB >= idxA && idxA >= curIndex, errMsg); + putRangeAt(idxA, idxB, v); + } + + /** + Put a value $(D v) into slot mapped by $(D key). + All slots prior to $(D key) are filled with the + default filler. + */ + void putValue(Key key, Value v) + { + import std.conv : text; + auto idx = getIndex(key); + enforce(idx >= curIndex, text(errMsg, " ", idx)); + putAt(idx, v); + } + + /// Finishes construction of Trie, yielding an immutable Trie instance. + auto build() + { + static if (maxIndex != 0) // doesn't cover full range of size_t + { + assert(curIndex <= maxIndex); + addValue!lastLevel(defValue, maxIndex - curIndex); + } + else + { + if (curIndex != 0 // couldn't wrap around + || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty + { + addValue!lastLevel(defValue, size_t.max - curIndex); + addValue!lastLevel(defValue, 1); + } + // else curIndex already completed the full range of size_t by wrapping around + } + return Trie!(V, Key, maxIndex, Prefix)(table); + } +} + +/** + $(P A generic Trie data-structure for a fixed number of stages. + The design goal is optimal speed with smallest footprint size. + ) + $(P It's intentionally read-only and doesn't provide constructors. + To construct one use a special builder, + see $(LREF TrieBuilder) and $(LREF buildTrie). + ) + +*/ +@trusted private struct Trie(Value, Key, Args...) +if (isValidPrefixForTrie!(Key, Args) + || (isValidPrefixForTrie!(Key, Args[1..$]) + && is(typeof(Args[0]) : size_t))) +{ + import std.range.primitives : isOutputRange; + static if (is(typeof(Args[0]) : size_t)) + { + private enum maxIndex = Args[0]; + private enum hasBoundsCheck = true; + private alias Prefix = Args[1..$]; + } + else + { + private enum hasBoundsCheck = false; + private alias Prefix = Args; + } + + private this()(typeof(_table) table) + { + _table = table; + } + + // only for constant Tries constructed from precompiled tables + private this()(const(size_t)[] offsets, const(size_t)[] sizes, + const(size_t)[] data) const + { + _table = typeof(_table)(offsets, sizes, data); + } + + /** + $(P Lookup the $(D key) in this $(D Trie). ) + + $(P The lookup always succeeds if key fits the domain + provided during construction. The whole domain defined + is covered so instead of not found condition + the sentinel (filler) value could be used. ) + + $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to + define a domain of $(D Trie) keys and the sentinel value. ) + + Note: + Domain range-checking is only enabled in debug builds + and results in assertion failure. + */ + TypeOfBitPacked!Value opIndex()(Key key) const + { + static if (hasBoundsCheck) + assert(mapTrieIndex!Prefix(key) < maxIndex); + size_t idx; + alias p = Prefix; + idx = cast(size_t) p[0](key); + foreach (i, v; p[0..$-1]) + idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key)); + return _table.ptr!(p.length-1)[idx]; + } + + /// + @property size_t bytes(size_t n=size_t.max)() const + { + return _table.bytes!n; + } + + /// + @property size_t pages(size_t n)() const + { + return (bytes!n+2^^(Prefix[n].bitSize-1)) + /2^^Prefix[n].bitSize; + } + + /// + void store(OutRange)(scope OutRange sink) const + if (isOutputRange!(OutRange, char)) + { + _table.store(sink); + } + +private: + MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table; +} + +// create a tuple of 'sliceBits' that slice the 'top' of bits into pieces of sizes 'sizes' +// left-to-right, the most significant bits first +template GetBitSlicing(size_t top, sizes...) +{ + static if (sizes.length > 0) + alias GetBitSlicing = + AliasSeq!(sliceBits!(top - sizes[0], top), + GetBitSlicing!(top - sizes[0], sizes[1..$])); + else + alias GetBitSlicing = AliasSeq!(); +} + +template callableWith(T) +{ + template callableWith(alias Pred) + { + static if (!is(typeof(Pred(T.init)))) + enum callableWith = false; + else + { + alias Result = typeof(Pred(T.init)); + enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result)); + } + } +} + +/* + Check if $(D Prefix) is a valid set of predicates + for $(D Trie) template having $(D Key) as the type of keys. + This requires all predicates to be callable, take + single argument of type $(D Key) and return unsigned value. +*/ +template isValidPrefixForTrie(Key, Prefix...) +{ + import std.meta : allSatisfy; + enum isValidPrefixForTrie = allSatisfy!(callableWith!Key, Prefix); // TODO: tighten the screws +} + +/* + Check if $(D Args) is a set of maximum key value followed by valid predicates + for $(D Trie) template having $(D Key) as the type of keys. +*/ +template isValidArgsForTrie(Key, Args...) +{ + static if (Args.length > 1) + { + enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args) + || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key)); + } + else + enum isValidArgsForTrie = isValidPrefixForTrie!Args; +} + +@property size_t sumOfIntegerTuple(ints...)() +{ + size_t count=0; + foreach (v; ints) + count += v; + return count; +} + +/** + A shorthand for creating a custom multi-level fixed Trie + from a $(D CodepointSet). $(D sizes) are numbers of bits per level, + with the most significant bits used first. + + Note: The sum of $(D sizes) must be equal 21. + + See_Also: $(LREF toTrie), which is even simpler. + + Example: + --- + { + import std.stdio; + auto set = unicode("Number"); + auto trie = codepointSetTrie!(8, 5, 8)(set); + writeln("Input code points to test:"); + foreach (line; stdin.byLine) + { + int count=0; + foreach (dchar ch; line) + if (trie[ch])// is number + count++; + writefln("Contains %d number code points.", count); + } + } + --- +*/ +public template codepointSetTrie(sizes...) +if (sumOfIntegerTuple!sizes == 21) +{ + auto codepointSetTrie(Set)(Set set) + if (isCodepointSet!Set) + { + auto builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false); + foreach (ival; set.byInterval) + builder.putRange(ival[0], ival[1], true); + return builder.build(); + } +} + +/// Type of Trie generated by codepointSetTrie function. +public template CodepointSetTrie(sizes...) +if (sumOfIntegerTuple!sizes == 21) +{ + alias Prefix = GetBitSlicing!(21, sizes); + alias CodepointSetTrie = typeof(TrieBuilder!(bool, dchar, lastDchar+1, Prefix)(false).build()); +} + +/** + A slightly more general tool for building fixed $(D Trie) + for the Unicode data. + + Specifically unlike $(D codepointSetTrie) it's allows creating mappings + of $(D dchar) to an arbitrary type $(D T). + + Note: Overload taking $(D CodepointSet)s will naturally convert + only to bool mapping $(D Trie)s. +*/ +public template codepointTrie(T, sizes...) +if (sumOfIntegerTuple!sizes == 21) +{ + alias Prefix = GetBitSlicing!(21, sizes); + + static if (is(TypeOfBitPacked!T == bool)) + { + auto codepointTrie(Set)(in Set set) + if (isCodepointSet!Set) + { + return codepointSetTrie(set); + } + } + + auto codepointTrie()(T[dchar] map, T defValue=T.init) + { + return buildTrie!(T, dchar, Prefix)(map, defValue); + } + + // unsorted range of pairs + auto codepointTrie(R)(R range, T defValue=T.init) + if (isInputRange!R + && is(typeof(ElementType!R.init[0]) : T) + && is(typeof(ElementType!R.init[1]) : dchar)) + { + // build from unsorted array of pairs + // TODO: expose index sorting functions for Trie + return buildTrie!(T, dchar, Prefix)(range, defValue, true); + } +} + +@system pure unittest +{ + import std.algorithm.comparison : max; + import std.algorithm.searching : count; + + // pick characters from the Greek script + auto set = unicode.Greek; + + // a user-defined property (or an expensive function) + // that we want to look up + static uint luckFactor(dchar ch) + { + // here we consider a character lucky + // if its code point has a lot of identical hex-digits + // e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2 + ubyte[6] nibbles; // 6 4-bit chunks of code point + uint value = ch; + foreach (i; 0 .. 6) + { + nibbles[i] = value & 0xF; + value >>= 4; + } + uint luck; + foreach (n; nibbles) + luck = cast(uint) max(luck, count(nibbles[], n)); + return luck; + } + + // only unsigned built-ins are supported at the moment + alias LuckFactor = BitPacked!(uint, 3); + + // create a temporary associative array (AA) + LuckFactor[dchar] map; + foreach (ch; set.byCodepoint) + map[ch] = LuckFactor(luckFactor(ch)); + + // bits per stage are chosen randomly, fell free to optimize + auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map); + + // from now on the AA is not needed + foreach (ch; set.byCodepoint) + assert(trie[ch] == luckFactor(ch)); // verify + // CJK is not Greek, thus it has the default value + assert(trie['\u4444'] == 0); + // and here is a couple of quite lucky Greek characters: + // Greek small letter epsilon with dasia + assert(trie['\u1F11'] == 3); + // Ancient Greek metretes sign + assert(trie['\U00010181'] == 3); + +} + +/// Type of Trie as generated by codepointTrie function. +public template CodepointTrie(T, sizes...) +if (sumOfIntegerTuple!sizes == 21) +{ + alias Prefix = GetBitSlicing!(21, sizes); + alias CodepointTrie = typeof(TrieBuilder!(T, dchar, lastDchar+1, Prefix)(T.init).build()); +} + +package template cmpK0(alias Pred) +{ + import std.typecons : Tuple; + static bool cmpK0(Value, Key) + (Tuple!(Value, Key) a, Tuple!(Value, Key) b) + { + return Pred(a[1]) < Pred(b[1]); + } +} + +/** + The most general utility for construction of $(D Trie)s + short of using $(D TrieBuilder) directly. + + Provides a number of convenience overloads. + $(D Args) is tuple of maximum key value followed by + predicates to construct index from key. + + Alternatively if the first argument is not a value convertible to $(D Key) + then the whole tuple of $(D Args) is treated as predicates + and the maximum Key is deduced from predicates. +*/ +private template buildTrie(Value, Key, Args...) +if (isValidArgsForTrie!(Key, Args)) +{ + static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key + { + alias Prefix = Args[1..$]; + } + else + alias Prefix = Args; + + alias getIndex = mapTrieIndex!(Prefix); + + // for multi-sort + template GetComparators(size_t n) + { + static if (n > 0) + alias GetComparators = + AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1])); + else + alias GetComparators = AliasSeq!(); + } + + /* + Build $(D Trie) from a range of a Key-Value pairs, + assuming it is sorted by Key as defined by the following lambda: + ------ + (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b) + ------ + Exception is thrown if it's detected that the above order doesn't hold. + + In other words $(LREF mapTrieIndex) should be a + monotonically increasing function that maps $(D Key) to an integer. + + See_Also: $(REF sort, std,_algorithm), + $(REF SortedRange, std,_range), + $(REF setUnion, std,_algorithm). + */ + auto buildTrie(Range)(Range range, Value filler=Value.init) + if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value) + && is(typeof(Range.init.front[1]) : Key)) + { + auto builder = TrieBuilder!(Value, Key, Prefix)(filler); + foreach (v; range) + builder.putValue(v[1], v[0]); + return builder.build(); + } + + /* + If $(D Value) is bool (or BitPacked!(bool, x)) then it's possible + to build $(D Trie) from a range of open-right intervals of $(D Key)s. + The requirement on the ordering of keys (and the behavior on the + violation of it) is the same as for Key-Value range overload. + + Intervals denote ranges of !$(D filler) i.e. the opposite of filler. + If no filler provided keys inside of the intervals map to true, + and $(D filler) is false. + */ + auto buildTrie(Range)(Range range, Value filler=Value.init) + if (is(TypeOfBitPacked!Value == bool) + && isInputRange!Range && is(typeof(Range.init.front[0]) : Key) + && is(typeof(Range.init.front[1]) : Key)) + { + auto builder = TrieBuilder!(Value, Key, Prefix)(filler); + foreach (ival; range) + builder.putRange(ival[0], ival[1], !filler); + return builder.build(); + } + + auto buildTrie(Range)(Range range, Value filler, bool unsorted) + if (isInputRange!Range + && is(typeof(Range.init.front[0]) : Value) + && is(typeof(Range.init.front[1]) : Key)) + { + import std.algorithm.sorting : multiSort; + alias Comps = GetComparators!(Prefix.length); + if (unsorted) + multiSort!(Comps)(range); + return buildTrie(range, filler); + } + + /* + If $(D Value) is bool (or BitPacked!(bool, x)) then it's possible + to build $(D Trie) simply from an input range of $(D Key)s. + The requirement on the ordering of keys (and the behavior on the + violation of it) is the same as for Key-Value range overload. + + Keys found in range denote !$(D filler) i.e. the opposite of filler. + If no filler provided keys map to true, and $(D filler) is false. + */ + auto buildTrie(Range)(Range range, Value filler=Value.init) + if (is(TypeOfBitPacked!Value == bool) + && isInputRange!Range && is(typeof(Range.init.front) : Key)) + { + auto builder = TrieBuilder!(Value, Key, Prefix)(filler); + foreach (v; range) + builder.putValue(v, !filler); + return builder.build(); + } + + /* + If $(D Key) is unsigned integer $(D Trie) could be constructed from array + of values where array index serves as key. + */ + auto buildTrie()(Value[] array, Value filler=Value.init) + if (isUnsigned!Key) + { + auto builder = TrieBuilder!(Value, Key, Prefix)(filler); + foreach (idx, v; array) + builder.putValue(idx, v); + return builder.build(); + } + + /* + Builds $(D Trie) from associative array. + */ + auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init) + { + import std.array : array; + import std.range : zip; + auto range = array(zip(map.values, map.keys)); + return buildTrie(range, filler, true); // sort it + } +} + +// helper in place of assumeSize to +//reduce mangled name & help DMD inline Trie functors +struct clamp(size_t bits) +{ + static size_t opCall(T)(T arg){ return arg; } + enum bitSize = bits; +} + +struct clampIdx(size_t idx, size_t bits) +{ + static size_t opCall(T)(T arg){ return arg[idx]; } + enum bitSize = bits; +} + +/** + Conceptual type that outlines the common properties of all UTF Matchers. + + Note: For illustration purposes only, every method + call results in assertion failure. + Use $(LREF utfMatcher) to obtain a concrete matcher + for UTF-8 or UTF-16 encodings. +*/ +public struct MatcherConcept +{ + /** + $(P Perform a semantic equivalent 2 operations: + decoding a $(CODEPOINT) at front of $(D inp) and testing if + it belongs to the set of $(CODEPOINTS) of this matcher. ) + + $(P The effect on $(D inp) depends on the kind of function called:) + + $(P Match. If the codepoint is found in the set then range $(D inp) + is advanced by its size in $(S_LINK Code unit, code units), + otherwise the range is not modifed.) + + $(P Skip. The range is always advanced by the size + of the tested $(CODEPOINT) regardless of the result of test.) + + $(P Test. The range is left unaffected regardless + of the result of test.) + */ + public bool match(Range)(ref Range inp) + if (isRandomAccessRange!Range && is(ElementType!Range : char)) + { + assert(false); + } + + ///ditto + public bool skip(Range)(ref Range inp) + if (isRandomAccessRange!Range && is(ElementType!Range : char)) + { + assert(false); + } + + ///ditto + public bool test(Range)(ref Range inp) + if (isRandomAccessRange!Range && is(ElementType!Range : char)) + { + assert(false); + } + /// + @safe unittest + { + string truth = "2² = 4"; + auto m = utfMatcher!char(unicode.Number); + assert(m.match(truth)); // '2' is a number all right + assert(truth == "² = 4"); // skips on match + assert(m.match(truth)); // so is the superscript '2' + assert(!m.match(truth)); // space is not a number + assert(truth == " = 4"); // unaffected on no match + assert(!m.skip(truth)); // same test ... + assert(truth == "= 4"); // but skips a codepoint regardless + assert(!m.test(truth)); // '=' is not a number + assert(truth == "= 4"); // test never affects argument + } + + /** + Advanced feature - provide direct access to a subset of matcher based a + set of known encoding lengths. Lengths are provided in + $(S_LINK Code unit, code units). The sub-matcher then may do less + operations per any $(D test)/$(D match). + + Use with care as the sub-matcher won't match + any $(CODEPOINTS) that have encoded length that doesn't belong + to the selected set of lengths. Also the sub-matcher object references + the parent matcher and must not be used past the liftetime + of the latter. + + Another caveat of using sub-matcher is that skip is not available + preciesly because sub-matcher doesn't detect all lengths. + */ + @property auto subMatcher(Lengths...)() + { + assert(0); + return this; + } + + @safe unittest + { + auto m = utfMatcher!char(unicode.Number); + string square = "2²"; + // about sub-matchers + assert(!m.subMatcher!(2,3,4).test(square)); // ASCII no covered + assert(m.subMatcher!1.match(square)); // ASCII-only, works + assert(!m.subMatcher!1.test(square)); // unicode '²' + assert(m.subMatcher!(2,3,4).match(square)); // + assert(square == ""); + wstring wsquare = "2²"; + auto m16 = utfMatcher!wchar(unicode.Number); + // may keep ref, but the orignal (m16) must be kept alive + auto bmp = m16.subMatcher!1; + assert(bmp.match(wsquare)); // Okay, in basic multilingual plan + assert(bmp.match(wsquare)); // And '²' too + } +} + +/** + Test if $(D M) is an UTF Matcher for ranges of $(D Char). +*/ +public enum isUtfMatcher(M, C) = __traits(compiles, (){ + C[] s; + auto d = s.decoder; + M m; + assert(is(typeof(m.match(d)) == bool)); + assert(is(typeof(m.test(d)) == bool)); + static if (is(typeof(m.skip(d)))) + { + assert(is(typeof(m.skip(d)) == bool)); + assert(is(typeof(m.skip(s)) == bool)); + } + assert(is(typeof(m.match(s)) == bool)); + assert(is(typeof(m.test(s)) == bool)); +}); + +@safe unittest +{ + alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init)); + alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init)); + static assert(isUtfMatcher!(CharMatcher, char)); + static assert(isUtfMatcher!(CharMatcher, immutable(char))); + static assert(isUtfMatcher!(WcharMatcher, wchar)); + static assert(isUtfMatcher!(WcharMatcher, immutable(wchar))); +} + +enum Mode { + alwaysSkip, + neverSkip, + skipOnMatch +} + +mixin template ForwardStrings() +{ + private bool fwdStr(string fn, C)(ref C[] str) const pure + { + import std.utf : byCodeUnit; + alias type = typeof(byCodeUnit(str)); + return mixin(fn~"(*cast(type*)&str)"); + } +} + +template Utf8Matcher() +{ + enum validSize(int sz) = sz >= 1 && sz <= 4; + + void badEncoding() pure @safe + { + import std.utf : UTFException; + throw new UTFException("Invalid UTF-8 sequence"); + } + + //for 1-stage ASCII + alias AsciiSpec = AliasSeq!(bool, char, clamp!7); + //for 2-stage lookup of 2 byte UTF-8 sequences + alias Utf8Spec2 = AliasSeq!(bool, char[2], + clampIdx!(0, 5), clampIdx!(1, 6)); + //ditto for 3 byte + alias Utf8Spec3 = AliasSeq!(bool, char[3], + clampIdx!(0, 4), + clampIdx!(1, 6), + clampIdx!(2, 6) + ); + //ditto for 4 byte + alias Utf8Spec4 = AliasSeq!(bool, char[4], + clampIdx!(0, 3), clampIdx!(1, 6), + clampIdx!(2, 6), clampIdx!(3, 6) + ); + alias Tables = AliasSeq!( + typeof(TrieBuilder!(AsciiSpec)(false).build()), + typeof(TrieBuilder!(Utf8Spec2)(false).build()), + typeof(TrieBuilder!(Utf8Spec3)(false).build()), + typeof(TrieBuilder!(Utf8Spec4)(false).build()) + ); + alias Table(int size) = Tables[size-1]; + + enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1; + enum encMask(size_t size) = ((1 << size)-1)<<(8-size); + + char truncate()(char ch) pure @safe + { + ch -= 0x80; + if (ch < 0x40) + { + return ch; + } + else + { + badEncoding(); + return cast(char) 0; + } + } + + static auto encode(size_t sz)(dchar ch) + if (sz > 1) + { + import std.utf : encodeUTF = encode; + char[4] buf; + encodeUTF(buf, ch); + char[sz] ret; + buf[0] &= leadMask!sz; + foreach (n; 1 .. sz) + buf[n] = buf[n] & 0x3f; //keep 6 lower bits + ret[] = buf[0 .. sz]; + return ret; + } + + auto build(Set)(Set set) + { + import std.algorithm.iteration : map; + auto ascii = set & unicode.ASCII; + auto utf8_2 = set & CodepointSet(0x80, 0x800); + auto utf8_3 = set & CodepointSet(0x800, 0x1_0000); + auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1); + auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec); + auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2); + auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3); + auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4); + alias Ret = Impl!(1,2,3,4); + return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T); + } + + // Bootstrap UTF-8 static matcher interface + // from 3 primitives: tab!(size), lookup and Sizes + mixin template DefMatcher() + { + import std.format : format; + import std.meta : Erase, staticIndexOf; + enum hasASCII = staticIndexOf!(1, Sizes) >= 0; + alias UniSizes = Erase!(1, Sizes); + + //generate dispatch code sequence for unicode parts + static auto genDispatch() + { + string code; + foreach (size; UniSizes) + code ~= format(q{ + if ((ch & ~leadMask!%d) == encMask!(%d)) + return lookup!(%d, mode)(inp); + else + }, size, size, size); + static if (Sizes.length == 4) //covers all code unit cases + code ~= "{ badEncoding(); return false; }"; + else + code ~= "return false;"; //may be just fine but not covered + return code; + } + enum dispatch = genDispatch(); + + public bool match(Range)(ref Range inp) const pure + if (isRandomAccessRange!Range && is(ElementType!Range : char)) + { + enum mode = Mode.skipOnMatch; + assert(!inp.empty); + immutable ch = inp[0]; + static if (hasASCII) + { + if (ch < 0x80) + { + immutable r = tab!1[ch]; + if (r) + inp.popFront(); + return r; + } + else + mixin(dispatch); + } + else + mixin(dispatch); + } + + static if (Sizes.length == 4) // can skip iff can detect all encodings + { + public bool skip(Range)(ref Range inp) const pure @trusted + if (isRandomAccessRange!Range && is(ElementType!Range : char)) + { + enum mode = Mode.alwaysSkip; + assert(!inp.empty); + auto ch = inp[0]; + static if (hasASCII) + { + if (ch < 0x80) + { + inp.popFront(); + return tab!1[ch]; + } + else + mixin(dispatch); + } + else + mixin(dispatch); + } + } + + public bool test(Range)(ref Range inp) const pure @trusted + if (isRandomAccessRange!Range && is(ElementType!Range : char)) + { + enum mode = Mode.neverSkip; + assert(!inp.empty); + auto ch = inp[0]; + static if (hasASCII) + { + if (ch < 0x80) + return tab!1[ch]; + else + mixin(dispatch); + } + else + mixin(dispatch); + } + + bool match(C)(ref C[] str) const pure @trusted + if (isSomeChar!C) + { + return fwdStr!"match"(str); + } + + bool skip(C)(ref C[] str) const pure @trusted + if (isSomeChar!C) + { + return fwdStr!"skip"(str); + } + + bool test(C)(ref C[] str) const pure @trusted + if (isSomeChar!C) + { + return fwdStr!"test"(str); + } + + mixin ForwardStrings; + } + + struct Impl(Sizes...) + { + import std.meta : allSatisfy, staticMap; + static assert(allSatisfy!(validSize, Sizes), + "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8"); + private: + //pick tables for chosen sizes + alias OurTabs = staticMap!(Table, Sizes); + OurTabs tables; + mixin DefMatcher; + //static disptach helper UTF size ==> table + alias tab(int i) = tables[i - 1]; + + package @property auto subMatcher(SizesToPick...)() @trusted + { + return CherryPick!(Impl, SizesToPick)(&this); + } + + bool lookup(int size, Mode mode, Range)(ref Range inp) const pure @trusted + { + import std.typecons : staticIota; + if (inp.length < size) + { + badEncoding(); + return false; + } + char[size] needle = void; + needle[0] = leadMask!size & inp[0]; + foreach (i; staticIota!(1, size)) + { + needle[i] = truncate(inp[i]); + } + //overlong encoding checks + static if (size == 2) + { + //0x80-0x7FF + //got 6 bits in needle[1], must use at least 8 bits + //must use at least 2 bits in needle[1] + if (needle[0] < 2) badEncoding(); + } + else static if (size == 3) + { + //0x800-0xFFFF + //got 6 bits in needle[2], must use at least 12bits + //must use 6 bits in needle[1] or anything in needle[0] + if (needle[0] == 0 && needle[1] < 0x20) badEncoding(); + } + else static if (size == 4) + { + //0x800-0xFFFF + //got 2x6=12 bits in needle[2 .. 3] must use at least 17bits + //must use 5 bits (or above) in needle[1] or anything in needle[0] + if (needle[0] == 0 && needle[1] < 0x10) badEncoding(); + } + static if (mode == Mode.alwaysSkip) + { + inp.popFrontN(size); + return tab!size[needle]; + } + else static if (mode == Mode.neverSkip) + { + return tab!size[needle]; + } + else + { + static assert(mode == Mode.skipOnMatch); + if (tab!size[needle]) + { + inp.popFrontN(size); + return true; + } + else + return false; + } + } + } + + struct CherryPick(I, Sizes...) + { + import std.meta : allSatisfy; + static assert(allSatisfy!(validSize, Sizes), + "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8"); + private: + I* m; + @property ref tab(int i)() const pure { return m.tables[i - 1]; } + bool lookup(int size, Mode mode, Range)(ref Range inp) const pure + { + return m.lookup!(size, mode)(inp); + } + mixin DefMatcher; + } +} + +template Utf16Matcher() +{ + enum validSize(int sz) = sz >= 1 && sz <= 2; + + void badEncoding() pure + { + import std.utf : UTFException; + throw new UTFException("Invalid UTF-16 sequence"); + } + + // 1-stage ASCII + alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7); + //2-stage BMP + alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7)); + //4-stage - full Unicode + //assume that 0xD800 & 0xDC00 bits are cleared + //thus leaving 10 bit per wchar to worry about + alias UniSpec = AliasSeq!(bool, wchar[2], + assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4), + assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6), + ); + alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build()); + alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build()); + alias Uni = typeof(TrieBuilder!(UniSpec)(false).build()); + + auto encode2(dchar ch) + { + ch -= 0x1_0000; + assert(ch <= 0xF_FFFF); + wchar[2] ret; + //do not put surrogate bits, they are sliced off + ret[0] = cast(wchar)(ch >> 10); + ret[1] = (ch & 0xFFF); + return ret; + } + + auto build(Set)(Set set) + { + import std.algorithm.iteration : map; + auto ascii = set & unicode.ASCII; + auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1)) + - CodepointSet.fromIntervals(0xD800, 0xDFFF+1); + auto other = set - (bmp | ascii); + auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec); + auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec); + auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec); + alias Ret = Impl!(1,2); + return Ret(asciiT, bmpT, otherT); + } + + //bootstrap full UTF-16 matcher interace from + //sizeFlags, lookupUni and ascii + mixin template DefMatcher() + { + public bool match(Range)(ref Range inp) const pure @trusted + if (isRandomAccessRange!Range && is(ElementType!Range : wchar)) + { + enum mode = Mode.skipOnMatch; + assert(!inp.empty); + immutable ch = inp[0]; + static if (sizeFlags & 1) + { + if (ch < 0x80) + { + if (ascii[ch]) + { + inp.popFront(); + return true; + } + else + return false; + } + return lookupUni!mode(inp); + } + else + return lookupUni!mode(inp); + } + + static if (Sizes.length == 2) + { + public bool skip(Range)(ref Range inp) const pure @trusted + if (isRandomAccessRange!Range && is(ElementType!Range : wchar)) + { + enum mode = Mode.alwaysSkip; + assert(!inp.empty); + immutable ch = inp[0]; + static if (sizeFlags & 1) + { + if (ch < 0x80) + { + inp.popFront(); + return ascii[ch]; + } + else + return lookupUni!mode(inp); + } + else + return lookupUni!mode(inp); + } + } + + public bool test(Range)(ref Range inp) const pure @trusted + if (isRandomAccessRange!Range && is(ElementType!Range : wchar)) + { + enum mode = Mode.neverSkip; + assert(!inp.empty); + auto ch = inp[0]; + static if (sizeFlags & 1) + return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp); + else + return lookupUni!mode(inp); + } + + bool match(C)(ref C[] str) const pure @trusted + if (isSomeChar!C) + { + return fwdStr!"match"(str); + } + + bool skip(C)(ref C[] str) const pure @trusted + if (isSomeChar!C) + { + return fwdStr!"skip"(str); + } + + bool test(C)(ref C[] str) const pure @trusted + if (isSomeChar!C) + { + return fwdStr!"test"(str); + } + + mixin ForwardStrings; //dispatch strings to range versions + } + + struct Impl(Sizes...) + if (Sizes.length >= 1 && Sizes.length <= 2) + { + private: + import std.meta : allSatisfy; + static assert(allSatisfy!(validSize, Sizes), + "Only lengths of 1 and 2 code units are possible in UTF-16"); + static if (Sizes.length > 1) + enum sizeFlags = Sizes[0] | Sizes[1]; + else + enum sizeFlags = Sizes[0]; + + static if (sizeFlags & 1) + { + Ascii ascii; + Bmp bmp; + } + static if (sizeFlags & 2) + { + Uni uni; + } + mixin DefMatcher; + + package @property auto subMatcher(SizesToPick...)() @trusted + { + return CherryPick!(Impl, SizesToPick)(&this); + } + + bool lookupUni(Mode mode, Range)(ref Range inp) const pure + { + wchar x = cast(wchar)(inp[0] - 0xD800); + //not a high surrogate + if (x > 0x3FF) + { + //low surrogate + if (x <= 0x7FF) badEncoding(); + static if (sizeFlags & 1) + { + auto ch = inp[0]; + static if (mode == Mode.alwaysSkip) + inp.popFront(); + static if (mode == Mode.skipOnMatch) + { + if (bmp[ch]) + { + inp.popFront(); + return true; + } + else + return false; + } + else + return bmp[ch]; + } + else //skip is not available for sub-matchers, so just false + return false; + } + else + { + static if (sizeFlags & 2) + { + if (inp.length < 2) + badEncoding(); + wchar y = cast(wchar)(inp[1] - 0xDC00); + //not a low surrogate + if (y > 0x3FF) + badEncoding(); + wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff]; + static if (mode == Mode.alwaysSkip) + inp.popFrontN(2); + static if (mode == Mode.skipOnMatch) + { + if (uni[needle]) + { + inp.popFrontN(2); + return true; + } + else + return false; + } + else + return uni[needle]; + } + else //ditto + return false; + } + } + } + + struct CherryPick(I, Sizes...) + if (Sizes.length >= 1 && Sizes.length <= 2) + { + private: + import std.meta : allSatisfy; + I* m; + enum sizeFlags = I.sizeFlags; + + static if (sizeFlags & 1) + { + @property ref ascii()() const pure{ return m.ascii; } + } + + bool lookupUni(Mode mode, Range)(ref Range inp) const pure + { + return m.lookupUni!mode(inp); + } + mixin DefMatcher; + static assert(allSatisfy!(validSize, Sizes), + "Only lengths of 1 and 2 code units are possible in UTF-16"); + } +} + +private auto utf8Matcher(Set)(Set set) @trusted +{ + return Utf8Matcher!().build(set); +} + +private auto utf16Matcher(Set)(Set set) @trusted +{ + return Utf16Matcher!().build(set); +} + +/** + Constructs a matcher object + to classify $(CODEPOINTS) from the $(D set) for encoding + that has $(D Char) as code unit. + + See $(LREF MatcherConcept) for API outline. +*/ +public auto utfMatcher(Char, Set)(Set set) @trusted +if (isCodepointSet!Set) +{ + static if (is(Char : char)) + return utf8Matcher(set); + else static if (is(Char : wchar)) + return utf16Matcher(set); + else static if (is(Char : dchar)) + static assert(false, "UTF-32 needs no decoding, + and thus not supported by utfMatcher"); + else + static assert(false, "Only character types 'char' and 'wchar' are allowed"); +} + + +//a range of code units, packed with index to speed up forward iteration +package auto decoder(C)(C[] s, size_t offset=0) @safe pure nothrow @nogc +if (is(C : wchar) || is(C : char)) +{ + static struct Decoder + { + pure nothrow: + C[] str; + size_t idx; + @property C front(){ return str[idx]; } + @property C back(){ return str[$-1]; } + void popFront(){ idx++; } + void popBack(){ str = str[0..$-1]; } + void popFrontN(size_t n){ idx += n; } + @property bool empty(){ return idx == str.length; } + @property auto save(){ return this; } + auto opIndex(size_t i){ return str[idx+i]; } + @property size_t length(){ return str.length - idx; } + alias opDollar = length; + auto opSlice(size_t a, size_t b){ return Decoder(str[0 .. idx+b], idx+a); } + } + static assert(isRandomAccessRange!Decoder); + static assert(is(ElementType!Decoder : C)); + return Decoder(s, offset); +} + +@safe unittest +{ + string rs = "hi! ネемног砀 текста"; + auto codec = rs.decoder; + auto utf8 = utf8Matcher(unicode.Letter); + auto asc = utf8.subMatcher!(1); + auto uni = utf8.subMatcher!(2,3,4); + assert(asc.test(codec)); + assert(!uni.match(codec)); + assert(utf8.skip(codec)); + assert(codec.idx == 1); + + assert(!uni.match(codec)); + assert(asc.test(codec)); + assert(utf8.skip(codec)); + assert(codec.idx == 2); + assert(!asc.match(codec)); + + assert(!utf8.test(codec)); + assert(!utf8.skip(codec)); + + assert(!asc.test(codec)); + assert(!utf8.test(codec)); + assert(!utf8.skip(codec)); + assert(utf8.test(codec)); + foreach (i; 0 .. 7) + { + assert(!asc.test(codec)); + assert(uni.test(codec)); + assert(utf8.skip(codec)); + } + assert(!utf8.test(codec)); + assert(!utf8.skip(codec)); + //the same with match where applicable + codec = rs.decoder; + assert(utf8.match(codec)); + assert(codec.idx == 1); + assert(utf8.match(codec)); + assert(codec.idx == 2); + assert(!utf8.match(codec)); + assert(codec.idx == 2); + assert(!utf8.skip(codec)); + assert(!utf8.skip(codec)); + + foreach (i; 0 .. 7) + { + assert(!asc.test(codec)); + assert(utf8.test(codec)); + assert(utf8.match(codec)); + } + auto i = codec.idx; + assert(!utf8.match(codec)); + assert(codec.idx == i); +} + +@safe unittest +{ + import std.range : stride; + static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) + { + bool t = m.test(r); + auto save = r.idx; + assert(t == m.match(r)); + assert(r.idx == save || t); //ether no change or was match + r.idx = save; + static if (is(typeof(m.skip(r)))) + { + assert(t == m.skip(r)); + assert(r.idx != save); //always changed + r.idx = save; + } + return t; + } + auto utf16 = utfMatcher!wchar(unicode.L); + auto bmp = utf16.subMatcher!1; + auto nonBmp = utf16.subMatcher!1; + auto utf8 = utfMatcher!char(unicode.L); + auto ascii = utf8.subMatcher!1; + auto uni2 = utf8.subMatcher!2; + auto uni3 = utf8.subMatcher!3; + auto uni24 = utf8.subMatcher!(2,4); + foreach (ch; unicode.L.byCodepoint.stride(3)) + { + import std.utf : encode; + char[4] buf; + wchar[2] buf16; + auto len = encode(buf, ch); + auto len16 = encode(buf16, ch); + auto c8 = buf[0 .. len].decoder; + auto c16 = buf16[0 .. len16].decoder; + assert(testAll(utf16, c16)); + assert(testAll(bmp, c16) || len16 != 1); + assert(testAll(nonBmp, c16) || len16 != 2); + + assert(testAll(utf8, c8)); + + //submatchers return false on out of their domain + assert(testAll(ascii, c8) || len != 1); + assert(testAll(uni2, c8) || len != 2); + assert(testAll(uni3, c8) || len != 3); + assert(testAll(uni24, c8) || (len != 2 && len != 4)); + } +} + +// cover decode fail cases of Matcher +@system unittest +{ + import std.algorithm.iteration : map; + import std.exception : collectException; + import std.format : format; + auto utf16 = utfMatcher!wchar(unicode.L); + auto utf8 = utfMatcher!char(unicode.L); + //decode failure cases UTF-8 + alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79", + "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00", + "\xCF\x00\0x00\0x00\x00"); + foreach (msg; fails8) + { + assert(collectException((){ + auto s = msg; + size_t idx = 0; + utf8.test(s); + }()), format("%( %2x %)", cast(ubyte[]) msg)); + } + //decode failure cases UTF-16 + alias fails16 = AliasSeq!([0xD811], [0xDC02]); + foreach (msg; fails16) + { + assert(collectException((){ + auto s = msg.map!(x => cast(wchar) x); + utf16.test(s); + }())); + } +} + +/++ + Convenience function to construct optimal configurations for + packed Trie from any $(D set) of $(CODEPOINTS). + + The parameter $(D level) indicates the number of trie levels to use, + allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs + speed-size wise. + + $(P Level 1 is fastest and the most memory hungry (a bit array). ) + $(P Level 4 is the slowest and has the smallest footprint. ) + + See the $(S_LINK Synopsis, Synopsis) section for example. + + Note: + Level 4 stays very practical (being faster and more predictable) + compared to using direct lookup on the $(D set) itself. + + ++/ +public auto toTrie(size_t level, Set)(Set set) +if (isCodepointSet!Set) +{ + static if (level == 1) + return codepointSetTrie!(21)(set); + else static if (level == 2) + return codepointSetTrie!(10, 11)(set); + else static if (level == 3) + return codepointSetTrie!(8, 5, 8)(set); + else static if (level == 4) + return codepointSetTrie!(6, 4, 4, 7)(set); + else + static assert(false, + "Sorry, toTrie doesn't support levels > 4, use codepointSetTrie directly"); +} + +/** + $(P Builds a $(D Trie) with typically optimal speed-size trade-off + and wraps it into a delegate of the following type: + $(D bool delegate(dchar ch)). ) + + $(P Effectively this creates a 'tester' lambda suitable + for algorithms like std.algorithm.find that take unary predicates. ) + + See the $(S_LINK Synopsis, Synopsis) section for example. +*/ +public auto toDelegate(Set)(Set set) +if (isCodepointSet!Set) +{ + // 3 is very small and is almost as fast as 2-level (due to CPU caches?) + auto t = toTrie!3(set); + return (dchar ch) => t[ch]; +} + +/** + $(P Opaque wrapper around unsigned built-in integers and + code unit (char/wchar/dchar) types. + Parameter $(D sz) indicates that the value is confined + to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be + packed more tightly when stored in certain + data-structures like trie. ) + + Note: + $(P The $(D BitPacked!(T, sz)) is implicitly convertible to $(D T) + but not vise-versa. Users have to ensure the value fits in + the range required and use the $(D cast) + operator to perform the conversion.) +*/ +struct BitPacked(T, size_t sz) +if (isIntegral!T || is(T:dchar)) +{ + enum bitSize = sz; + T _value; + alias _value this; +} + +/* + Depending on the form of the passed argument $(D bitSizeOf) returns + the amount of bits required to represent a given type + or a return type of a given functor. +*/ +template bitSizeOf(Args...) +if (Args.length == 1) +{ + import std.traits : ReturnType; + alias T = Args[0]; + static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t)) + { + enum bitSizeOf = T.bitSize; + } + else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits)) + { + enum bitSizeOf = bitSizeOf!(ReturnType!T); + } + else + { + enum bitSizeOf = T.sizeof*8; + } +} + +/** + Tests if $(D T) is some instantiation of $(LREF BitPacked)!(U, x) + and thus suitable for packing. +*/ +template isBitPacked(T) +{ + static if (is(T dummy == BitPacked!(U, bits), U, size_t bits)) + enum isBitPacked = true; + else + enum isBitPacked = false; +} + +/** + Gives the type $(D U) from $(LREF BitPacked)!(U, x) + or $(D T) itself for every other type. +*/ +template TypeOfBitPacked(T) +{ + static if (is(T dummy == BitPacked!(U, bits), U, size_t bits)) + alias TypeOfBitPacked = U; + else + alias TypeOfBitPacked = T; +} + +/* + Wrapper, used in definition of custom data structures from $(D Trie) template. + Applying it to a unary lambda function indicates that the returned value always + fits within $(D bits) of bits. +*/ +struct assumeSize(alias Fn, size_t bits) +{ + enum bitSize = bits; + static auto ref opCall(T)(auto ref T arg) + { + return Fn(arg); + } +} + +/* + A helper for defining lambda function that yields a slice + of certain bits from an unsigned integral value. + The resulting lambda is wrapped in assumeSize and can be used directly + with $(D Trie) template. +*/ +struct sliceBits(size_t from, size_t to) +{ + //for now bypass assumeSize, DMD has trouble inlining it + enum bitSize = to-from; + static auto opCall(T)(T x) + out(result) + { + assert(result < (1 << to-from)); + } + body + { + static assert(from < to); + static if (from == 0) + return x & ((1 << to)-1); + else + return (x >> from) & ((1<<(to-from))-1); + } +} + +@safe pure nothrow @nogc uint low_8(uint x) { return x&0xFF; } +@safe pure nothrow @nogc uint midlow_8(uint x){ return (x&0xFF00)>>8; } +alias lo8 = assumeSize!(low_8, 8); +alias mlo8 = assumeSize!(midlow_8, 8); + +static assert(bitSizeOf!lo8 == 8); +static assert(bitSizeOf!(sliceBits!(4, 7)) == 3); +static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2); + +template Sequence(size_t start, size_t end) +{ + static if (start < end) + alias Sequence = AliasSeq!(start, Sequence!(start+1, end)); + else + alias Sequence = AliasSeq!(); +} + +//---- TRIE TESTS ---- +@system unittest +{ + import std.algorithm.iteration : map; + import std.algorithm.sorting : sort; + import std.array : array; + import std.conv : text, to; + import std.range : iota; + static trieStats(TRIE)(TRIE t) + { + version (std_uni_stats) + { + import std.stdio : writefln, writeln; + writeln("---TRIE FOOTPRINT STATS---"); + foreach (i; staticIota!(0, t.table.dim) ) + { + writefln("lvl%s = %s bytes; %s pages" + , i, t.bytes!i, t.pages!i); + } + writefln("TOTAL: %s bytes", t.bytes); + version (none) + { + writeln("INDEX (excluding value level):"); + foreach (i; staticIota!(0, t.table.dim-1) ) + writeln(t.table.slice!(i)[0 .. t.table.length!i]); + } + writeln("---------------------------"); + } + } + //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2) + // alias lo8 = assumeSize!(8, function (uint x) { return x&0xFF; }); + // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; }); + alias Set = CodepointSet; + auto set = Set('A','Z','a','z'); + auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array + for (int a='a'; a<'z';a++) + assert(trie[a]); + for (int a='A'; a<'Z';a++) + assert(trie[a]); + for (int a=0; a<'A'; a++) + assert(!trie[a]); + for (int a ='Z'; a<'a'; a++) + assert(!trie[a]); + trieStats(trie); + + auto redundant2 = Set( + 1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111); + auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval); + trieStats(trie2); + foreach (e; redundant2.byCodepoint) + assert(trie2[e], text(cast(uint) e, " - ", trie2[e])); + foreach (i; 0 .. 1024) + { + assert(trie2[i] == (i in redundant2)); + } + + + auto redundant3 = Set( + 2, 4, 6, 8, 16, + 2+16, 4+16, 16+6, 16+8, 16+16, + 2+32, 4+32, 32+6, 32+8, + ); + + enum max3 = 256; + // sliceBits + auto trie3 = buildTrie!(bool, uint, max3, + sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4) + )(redundant3.byInterval); + trieStats(trie3); + foreach (i; 0 .. max3) + assert(trie3[i] == (i in redundant3), text(cast(uint) i)); + + auto redundant4 = Set( + 10, 64, 64+10, 128, 128+10, 256, 256+10, 512, + 1000, 2000, 3000, 4000, 5000, 6000 + ); + enum max4 = 2^^16; + auto trie4 = buildTrie!(bool, size_t, max4, + sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6) + )(redundant4.byInterval); + foreach (i; 0 .. max4) + { + if (i in redundant4) + assert(trie4[i], text(cast(uint) i)); + } + trieStats(trie4); + + alias mapToS = mapTrieIndex!(useItemAt!(0, char)); + string[] redundantS = ["tea", "start", "orange"]; + redundantS.sort!((a,b) => mapToS(a) < mapToS(b))(); + auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS); + // using first char only + assert(redundantS == ["orange", "start", "tea"]); + assert(strie["test"], text(strie["test"])); + assert(!strie["aea"]); + assert(strie["s"]); + + // a bit size test + auto a = array(map!(x => to!ubyte(x))(iota(0, 256))); + auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a); + trieStats(bt); + foreach (i; 0 .. 256) + assert(bt[cast(ubyte) i]); +} + +template useItemAt(size_t idx, T) +if (isIntegral!T || is(T: dchar)) +{ + size_t impl(in T[] arr){ return arr[idx]; } + alias useItemAt = assumeSize!(impl, 8*T.sizeof); +} + +template useLastItem(T) +{ + size_t impl(in T[] arr){ return arr[$-1]; } + alias useLastItem = assumeSize!(impl, 8*T.sizeof); +} + +template fullBitSize(Prefix...) +{ + static if (Prefix.length > 0) + enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]); + else + enum fullBitSize = 0; +} + +template idxTypes(Key, size_t fullBits, Prefix...) +{ + static if (Prefix.length == 1) + {// the last level is value level, so no index once reduced to 1-level + alias idxTypes = AliasSeq!(); + } + else + { + // Important note on bit packing + // Each level has to hold enough of bits to address the next one + // The bottom level is known to hold full bit width + // thus it's size in pages is full_bit_width - size_of_last_prefix + // Recourse on this notion + alias idxTypes = + AliasSeq!( + idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]), + BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1])) + ); + } +} + +//============================================================================ + +@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) +if (is(Char1 : dchar) && is(Char2 : dchar)) +{ + import std.algorithm.comparison : cmp; + import std.algorithm.iteration : map, filter; + import std.ascii : toLower; + static bool pred(dchar c) {return !c.isWhite && c != '-' && c != '_';} + return cmp( + a.map!toLower.filter!pred, + b.map!toLower.filter!pred); +} + +@safe pure unittest +{ + assert(!comparePropertyName("foo-bar", "fooBar")); +} + +bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure +if (is(Char1 : dchar) && is(Char2 : dchar)) +{ + return comparePropertyName(a, b) < 0; +} + +//============================================================================ +// Utilities for compression of Unicode code point sets +//============================================================================ + +@safe void compressTo(uint val, ref ubyte[] arr) pure nothrow +{ + // not optimized as usually done 1 time (and not public interface) + if (val < 128) + arr ~= cast(ubyte) val; + else if (val < (1 << 13)) + { + arr ~= (0b1_00 << 5) | cast(ubyte)(val >> 8); + arr ~= val & 0xFF; + } + else + { + assert(val < (1 << 21)); + arr ~= (0b1_01 << 5) | cast(ubyte)(val >> 16); + arr ~= (val >> 8) & 0xFF; + arr ~= val & 0xFF; + } +} + +@safe uint decompressFrom(const(ubyte)[] arr, ref size_t idx) pure +{ + import std.exception : enforce; + immutable first = arr[idx++]; + if (!(first & 0x80)) // no top bit -> [0 .. 127] + return first; + immutable extra = ((first >> 5) & 1) + 1; // [1, 2] + uint val = (first & 0x1F); + enforce(idx + extra <= arr.length, "bad code point interval encoding"); + foreach (j; 0 .. extra) + val = (val << 8) | arr[idx+j]; + idx += extra; + return val; +} + + +package ubyte[] compressIntervals(Range)(Range intervals) +if (isInputRange!Range && isIntegralPair!(ElementType!Range)) +{ + ubyte[] storage; + uint base = 0; + // RLE encode + foreach (val; intervals) + { + compressTo(val[0]-base, storage); + base = val[0]; + if (val[1] != lastDchar+1) // till the end of the domain so don't store it + { + compressTo(val[1]-base, storage); + base = val[1]; + } + } + return storage; +} + +@safe pure unittest +{ + import std.algorithm.comparison : equal; + import std.typecons : tuple; + + auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)]; + ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0]; + assert(compressIntervals(run) == enc); + auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)]; + ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed + assert(compressIntervals(run2) == enc2); + size_t idx = 0; + assert(decompressFrom(enc, idx) == 80); + assert(decompressFrom(enc, idx) == 47); + assert(decompressFrom(enc, idx) == 1); + assert(decompressFrom(enc, idx) == (1 << 10)); + idx = 0; + assert(decompressFrom(enc2, idx) == 0); + assert(decompressFrom(enc2, idx) == (1 << 20)+512+1); + assert(equal(decompressIntervals(compressIntervals(run)), run)); + assert(equal(decompressIntervals(compressIntervals(run2)), run2)); +} + +// Creates a range of $(D CodepointInterval) that lazily decodes compressed data. +@safe package auto decompressIntervals(const(ubyte)[] data) pure +{ + return DecompressedIntervals(data); +} + +@safe struct DecompressedIntervals +{ +pure: + const(ubyte)[] _stream; + size_t _idx; + CodepointInterval _front; + + this(const(ubyte)[] stream) + { + _stream = stream; + popFront(); + } + + @property CodepointInterval front() + { + assert(!empty); + return _front; + } + + void popFront() + { + if (_idx == _stream.length) + { + _idx = size_t.max; + return; + } + uint base = _front[1]; + _front[0] = base + decompressFrom(_stream, _idx); + if (_idx == _stream.length)// odd length ---> till the end + _front[1] = lastDchar+1; + else + { + base = _front[0]; + _front[1] = base + decompressFrom(_stream, _idx); + } + } + + @property bool empty() const + { + return _idx == size_t.max; + } + + @property DecompressedIntervals save() { return this; } +} + +static assert(isInputRange!DecompressedIntervals); +static assert(isForwardRange!DecompressedIntervals); +//============================================================================ + +version (std_uni_bootstrap){} +else +{ + +// helper for looking up code point sets +@trusted ptrdiff_t findUnicodeSet(alias table, C)(in C[] name) pure +{ + import std.algorithm.iteration : map; + import std.range : assumeSorted; + auto range = assumeSorted!((a,b) => propertyNameLess(a,b)) + (table.map!"a.name"()); + size_t idx = range.lowerBound(name).length; + if (idx < range.length && comparePropertyName(range[idx], name) == 0) + return idx; + return -1; +} + +// another one that loads it +@trusted bool loadUnicodeSet(alias table, Set, C)(in C[] name, ref Set dest) pure +{ + auto idx = findUnicodeSet!table(name); + if (idx >= 0) + { + dest = Set(asSet(table[idx].compressed)); + return true; + } + return false; +} + +@trusted bool loadProperty(Set=CodepointSet, C) + (in C[] name, ref Set target) pure +{ + import std.internal.unicode_tables : uniProps; // generated file + alias ucmp = comparePropertyName; + // conjure cumulative properties by hand + if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0) + { + target = asSet(uniProps.Lu); + target |= asSet(uniProps.Ll); + target |= asSet(uniProps.Lt); + target |= asSet(uniProps.Lo); + target |= asSet(uniProps.Lm); + } + else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0) + { + target = asSet(uniProps.Ll); + target |= asSet(uniProps.Lu); + target |= asSet(uniProps.Lt);// Title case + } + else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0) + { + target = asSet(uniProps.Mn); + target |= asSet(uniProps.Mc); + target |= asSet(uniProps.Me); + } + else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0) + { + target = asSet(uniProps.Nd); + target |= asSet(uniProps.Nl); + target |= asSet(uniProps.No); + } + else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0) + { + target = asSet(uniProps.Pc); + target |= asSet(uniProps.Pd); + target |= asSet(uniProps.Ps); + target |= asSet(uniProps.Pe); + target |= asSet(uniProps.Pi); + target |= asSet(uniProps.Pf); + target |= asSet(uniProps.Po); + } + else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0) + { + target = asSet(uniProps.Sm); + target |= asSet(uniProps.Sc); + target |= asSet(uniProps.Sk); + target |= asSet(uniProps.So); + } + else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0) + { + target = asSet(uniProps.Zs); + target |= asSet(uniProps.Zl); + target |= asSet(uniProps.Zp); + } + else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0) + { + target = asSet(uniProps.Co); + target |= asSet(uniProps.Lo); + target |= asSet(uniProps.No); + target |= asSet(uniProps.So); + target |= asSet(uniProps.Po); + } + else if (ucmp(name, "graphical") == 0) + { + target = asSet(uniProps.Alphabetic); + + target |= asSet(uniProps.Mn); + target |= asSet(uniProps.Mc); + target |= asSet(uniProps.Me); + + target |= asSet(uniProps.Nd); + target |= asSet(uniProps.Nl); + target |= asSet(uniProps.No); + + target |= asSet(uniProps.Pc); + target |= asSet(uniProps.Pd); + target |= asSet(uniProps.Ps); + target |= asSet(uniProps.Pe); + target |= asSet(uniProps.Pi); + target |= asSet(uniProps.Pf); + target |= asSet(uniProps.Po); + + target |= asSet(uniProps.Zs); + + target |= asSet(uniProps.Sm); + target |= asSet(uniProps.Sc); + target |= asSet(uniProps.Sk); + target |= asSet(uniProps.So); + } + else if (ucmp(name, "any") == 0) + target = Set.fromIntervals(0, 0x110000); + else if (ucmp(name, "ascii") == 0) + target = Set.fromIntervals(0, 0x80); + else + return loadUnicodeSet!(uniProps.tab)(name, target); + return true; +} + +// CTFE-only helper for checking property names at compile-time +@safe bool isPrettyPropertyName(C)(in C[] name) +{ + import std.algorithm.searching : find; + auto names = [ + "L", "Letter", + "LC", "Cased Letter", + "M", "Mark", + "N", "Number", + "P", "Punctuation", + "S", "Symbol", + "Z", "Separator", + "Graphical", + "any", + "ascii" + ]; + auto x = find!(x => comparePropertyName(x, name) == 0)(names); + return !x.empty; +} + +// ditto, CTFE-only, not optimized +@safe private static bool findSetName(alias table, C)(in C[] name) +{ + return findUnicodeSet!table(name) >= 0; +} + +template SetSearcher(alias table, string kind) +{ + /// Run-time checked search. + static auto opCall(C)(in C[] name) + if (is(C : dchar)) + { + import std.conv : to; + CodepointSet set; + if (loadUnicodeSet!table(name, set)) + return set; + throw new Exception("No unicode set for "~kind~" by name " + ~name.to!string()~" was found."); + } + /// Compile-time checked search. + static @property auto opDispatch(string name)() + { + static if (findSetName!table(name)) + { + CodepointSet set; + loadUnicodeSet!table(name, set); + return set; + } + else + static assert(false, "No unicode set for "~kind~" by name " + ~name~" was found."); + } +} + +/** + A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of + a block, script or general category. + + It uses well defined standard rules of property name lookup. + This includes fuzzy matching of names, so that + 'White_Space', 'white-SpAce' and 'whitespace' are all considered equal + and yield the same set of white space $(CHARACTERS). +*/ +@safe public struct unicode +{ + /** + Performs the lookup of set of $(CODEPOINTS) + with compile-time correctness checking. + This short-cut version combines 3 searches: + across blocks, scripts, and common binary properties. + + Note that since scripts and blocks overlap the + usual trick to disambiguate is used - to get a block use + $(D unicode.InBlockName), to search a script + use $(D unicode.ScriptName). + + See_Also: $(LREF block), $(LREF script) + and (not included in this search) $(LREF hangulSyllableType). + */ + + static @property auto opDispatch(string name)() pure + { + static if (findAny(name)) + return loadAny(name); + else + static assert(false, "No unicode set by name "~name~" was found."); + } + + /// + @safe unittest + { + import std.exception : collectException; + auto ascii = unicode.ASCII; + assert(ascii['A']); + assert(ascii['~']); + assert(!ascii['\u00e0']); + // matching is case-insensitive + assert(ascii == unicode.ascII); + assert(!ascii['à']); + // underscores, '-' and whitespace in names are ignored too + auto latin = unicode.in_latin1_Supplement; + assert(latin['à']); + assert(!latin['$']); + // BTW Latin 1 Supplement is a block, hence "In" prefix + assert(latin == unicode("In Latin 1 Supplement")); + // run-time look up throws if no such set is found + assert(collectException(unicode("InCyrilliac"))); + } + + /** + The same lookup across blocks, scripts, or binary properties, + but performed at run-time. + This version is provided for cases where $(D name) + is not known beforehand; otherwise compile-time + checked $(LREF opDispatch) is typically a better choice. + + See the $(S_LINK Unicode properties, table of properties) for available + sets. + */ + static auto opCall(C)(in C[] name) + if (is(C : dchar)) + { + return loadAny(name); + } + + /** + Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks. + + Note: + Here block names are unambiguous as no scripts are searched + and thus to search use simply $(D unicode.block.BlockName) notation. + + See $(S_LINK Unicode properties, table of properties) for available sets. + See_Also: $(S_LINK Unicode properties, table of properties). + */ + struct block + { + import std.internal.unicode_tables : blocks; // generated file + mixin SetSearcher!(blocks.tab, "block"); + } + + /// + @safe unittest + { + // use .block for explicitness + assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic); + } + + /** + Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts. + + See the $(S_LINK Unicode properties, table of properties) for available + sets. + */ + struct script + { + import std.internal.unicode_tables : scripts; // generated file + mixin SetSearcher!(scripts.tab, "script"); + } + + /// + @safe unittest + { + auto arabicScript = unicode.script.arabic; + auto arabicBlock = unicode.block.arabic; + // there is an intersection between script and block + assert(arabicBlock['']); + assert(arabicScript['']); + // but they are different + assert(arabicBlock != arabicScript); + assert(arabicBlock == unicode.inArabic); + assert(arabicScript == unicode.arabic); + } + + /** + Fetch a set of $(CODEPOINTS) that have the given hangul syllable type. + + Other non-binary properties (once supported) follow the same + notation - $(D unicode.propertyName.propertyValue) for compile-time + checked access and $(D unicode.propertyName(propertyValue)) + for run-time checked one. + + See the $(S_LINK Unicode properties, table of properties) for available + sets. + */ + struct hangulSyllableType + { + import std.internal.unicode_tables : hangul; // generated file + mixin SetSearcher!(hangul.tab, "hangul syllable type"); + } + + /// + @safe unittest + { + // L here is syllable type not Letter as in unicode.L short-cut + auto leadingVowel = unicode.hangulSyllableType("L"); + // check that some leading vowels are present + foreach (vowel; '\u1110'..'\u115F') + assert(leadingVowel[vowel]); + assert(leadingVowel == unicode.hangulSyllableType.L); + } + +private: + alias ucmp = comparePropertyName; + + static bool findAny(string name) + { + import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file + return isPrettyPropertyName(name) + || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name) + || (ucmp(name[0 .. 2],"In") == 0 && findSetName!(blocks.tab)(name[2..$])); + } + + static auto loadAny(Set=CodepointSet, C)(in C[] name) pure + { + import std.conv : to; + import std.internal.unicode_tables : blocks, scripts; // generated file + Set set; + immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set) + || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0 + && loadUnicodeSet!(blocks.tab)(name[2..$], set)); + if (loaded) + return set; + throw new Exception("No unicode set by name "~name.to!string()~" was found."); + } + + // FIXME: re-disable once the compiler is fixed + // Disabled to prevent the mistake of creating instances of this pseudo-struct. + //@disable ~this(); +} + +@safe unittest +{ + import std.internal.unicode_tables : blocks, uniProps; // generated file + assert(unicode("InHebrew") == asSet(blocks.Hebrew)); + assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp))); + assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi)); +} + +enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally + +// control - '\r' +enum controlSwitch = ` + case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':.. + case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085': +`; +// TODO: redo the most of hangul stuff algorithmically in case of Graphemes too +// kill unrolled switches + +private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow +{ + return ch >= '\U0001F1E6' && ch <= '\U0001F1FF'; +} + +template genericDecodeGrapheme(bool getValue) +{ + alias graphemeExtend = graphemeExtendTrie; + alias spacingMark = mcTrie; + static if (getValue) + alias Value = Grapheme; + else + alias Value = void; + + Value genericDecodeGrapheme(Input)(ref Input range) + { + import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file + enum GraphemeState { + Start, + CR, + RI, + L, + V, + LVT + } + static if (getValue) + Grapheme grapheme; + auto state = GraphemeState.Start; + enum eat = q{ + static if (getValue) + grapheme ~= ch; + range.popFront(); + }; + + dchar ch; + assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof); + while (!range.empty) + { + ch = range.front; + final switch (state) with(GraphemeState) + { + case Start: + mixin(eat); + if (ch == '\r') + state = CR; + else if (isRegionalIndicator(ch)) + state = RI; + else if (isHangL(ch)) + state = L; + else if (hangLV[ch] || isHangV(ch)) + state = V; + else if (hangLVT[ch]) + state = LVT; + else if (isHangT(ch)) + state = LVT; + else + { + switch (ch) + { + mixin(controlSwitch); + goto L_End; + default: + goto L_End_Extend; + } + } + break; + case CR: + if (ch == '\n') + mixin(eat); + goto L_End_Extend; + case RI: + if (isRegionalIndicator(ch)) + mixin(eat); + else + goto L_End_Extend; + break; + case L: + if (isHangL(ch)) + mixin(eat); + else if (isHangV(ch) || hangLV[ch]) + { + state = V; + mixin(eat); + } + else if (hangLVT[ch]) + { + state = LVT; + mixin(eat); + } + else + goto L_End_Extend; + break; + case V: + if (isHangV(ch)) + mixin(eat); + else if (isHangT(ch)) + { + state = LVT; + mixin(eat); + } + else + goto L_End_Extend; + break; + case LVT: + if (isHangT(ch)) + { + mixin(eat); + } + else + goto L_End_Extend; + break; + } + } + L_End_Extend: + while (!range.empty) + { + ch = range.front; + // extend & spacing marks + if (!graphemeExtend[ch] && !spacingMark[ch]) + break; + mixin(eat); + } + L_End: + static if (getValue) + return grapheme; + } + +} + +public: // Public API continues + +/++ + Computes the length of grapheme cluster starting at $(D index). + Both the resulting length and the $(D index) are measured + in $(S_LINK Code unit, code units). + + Params: + C = type that is implicitly convertible to $(D dchars) + input = array of grapheme clusters + index = starting index into $(D input[]) + + Returns: + length of grapheme cluster ++/ +size_t graphemeStride(C)(in C[] input, size_t index) +if (is(C : dchar)) +{ + auto src = input[index..$]; + auto n = src.length; + genericDecodeGrapheme!(false)(src); + return n - src.length; +} + +/// +@safe unittest +{ + assert(graphemeStride(" ", 1) == 1); + // A + combing ring above + string city = "A\u030Arhus"; + size_t first = graphemeStride(city, 0); + assert(first == 3); //\u030A has 2 UTF-8 code units + assert(city[0 .. first] == "A\u030A"); + assert(city[first..$] == "rhus"); +} + +/++ + Reads one full grapheme cluster from an input range of dchar $(D inp). + + For examples see the $(LREF Grapheme) below. + + Note: + This function modifies $(D inp) and thus $(D inp) + must be an L-value. ++/ +Grapheme decodeGrapheme(Input)(ref Input inp) +if (isInputRange!Input && is(Unqual!(ElementType!Input) == dchar)) +{ + return genericDecodeGrapheme!true(inp); +} + +@system unittest +{ + import std.algorithm.comparison : equal; + + Grapheme gr; + string s = " \u0020\u0308 "; + gr = decodeGrapheme(s); + assert(gr.length == 1 && gr[0] == ' '); + gr = decodeGrapheme(s); + assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308")); + s = "\u0300\u0308\u1100"; + assert(equal(decodeGrapheme(s)[], "\u0300\u0308")); + assert(equal(decodeGrapheme(s)[], "\u1100")); + s = "\u11A8\u0308\uAC01"; + assert(equal(decodeGrapheme(s)[], "\u11A8\u0308")); + assert(equal(decodeGrapheme(s)[], "\uAC01")); +} + +/++ + $(P Iterate a string by grapheme.) + + $(P Useful for doing string manipulation that needs to be aware + of graphemes.) + + See_Also: + $(LREF byCodePoint) ++/ +auto byGrapheme(Range)(Range range) +if (isInputRange!Range && is(Unqual!(ElementType!Range) == dchar)) +{ + // TODO: Bidirectional access + static struct Result(R) + { + private R _range; + private Grapheme _front; + + bool empty() @property + { + return _front.length == 0; + } + + Grapheme front() @property + { + return _front; + } + + void popFront() + { + _front = _range.empty ? Grapheme.init : _range.decodeGrapheme(); + } + + static if (isForwardRange!R) + { + Result save() @property + { + return Result(_range.save, _front); + } + } + } + + auto result = Result!(Range)(range); + result.popFront(); + return result; +} + +/// +@safe unittest +{ + import std.algorithm.comparison : equal; + import std.range.primitives : walkLength; + import std.range : take, drop; + auto text = "noe\u0308l"; // noël using e + combining diaeresis + assert(text.walkLength == 5); // 5 code points + + auto gText = text.byGrapheme; + assert(gText.walkLength == 4); // 4 graphemes + + assert(gText.take(3).equal("noe\u0308".byGrapheme)); + assert(gText.drop(3).equal("l".byGrapheme)); +} + +// For testing non-forward-range input ranges +version (unittest) +private static struct InputRangeString +{ + private string s; + + bool empty() @property { return s.empty; } + dchar front() @property { return s.front; } + void popFront() { s.popFront(); } +} + +@system unittest +{ + import std.algorithm.comparison : equal; + import std.array : array; + import std.range : retro; + import std.range.primitives : walkLength; + assert("".byGrapheme.walkLength == 0); + + auto reverse = "le\u0308on"; + assert(reverse.walkLength == 5); + + auto gReverse = reverse.byGrapheme; + assert(gReverse.walkLength == 4); + + foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d)) + { + assert(text.walkLength == 5); + static assert(isForwardRange!(typeof(text))); + + auto gText = text.byGrapheme; + static assert(isForwardRange!(typeof(gText))); + assert(gText.walkLength == 4); + assert(gText.array.retro.equal(gReverse)); + } + + auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme; + static assert(!isForwardRange!(typeof(nonForwardRange))); + assert(nonForwardRange.walkLength == 4); +} + +/++ + $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.) + + $(P Useful for converting the result to a string after doing operations + on graphemes.) + + $(P Acts as the identity function when given a range of code points.) ++/ +auto byCodePoint(Range)(Range range) +if (isInputRange!Range && is(Unqual!(ElementType!Range) == Grapheme)) +{ + // TODO: Propagate bidirectional access + static struct Result + { + private Range _range; + private size_t i = 0; + + bool empty() @property + { + return _range.empty; + } + + dchar front() @property + { + return _range.front[i]; + } + + void popFront() + { + ++i; + + if (i >= _range.front.length) + { + _range.popFront(); + i = 0; + } + } + + static if (isForwardRange!Range) + { + Result save() @property + { + return Result(_range.save, i); + } + } + } + + return Result(range); +} + +/// Ditto +Range byCodePoint(Range)(Range range) +if (isInputRange!Range && is(Unqual!(ElementType!Range) == dchar)) +{ + return range; +} + +/// +@safe unittest +{ + import std.array : array; + import std.conv : text; + import std.range : retro; + + string s = "noe\u0308l"; // noël + + // reverse it and convert the result to a string + string reverse = s.byGrapheme + .array + .retro + .byCodePoint + .text; + + assert(reverse == "le\u0308on"); // lëon +} + +@system unittest +{ + import std.algorithm.comparison : equal; + import std.range.primitives : walkLength; + assert("".byGrapheme.byCodePoint.equal("")); + + string text = "noe\u0308l"; + static assert(is(typeof(text.byCodePoint) == string)); + + auto gText = InputRangeString(text).byGrapheme; + static assert(!isForwardRange!(typeof(gText))); + + auto cpText = gText.byCodePoint; + static assert(!isForwardRange!(typeof(cpText))); + + assert(cpText.walkLength == text.walkLength); +} + +@trusted: + +/++ + $(P A structure designed to effectively pack $(CHARACTERS) + of a $(CLUSTER). + ) + + $(P $(D Grapheme) has value semantics so 2 copies of a $(D Grapheme) + always refer to distinct objects. In most actual scenarios a $(D Grapheme) + fits on the stack and avoids memory allocation overhead for all but quite + long clusters. + ) + + See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride) ++/ +@trusted struct Grapheme +{ + import std.traits : isDynamicArray; + +public: + /// Ctor + this(C)(in C[] chars...) + if (is(C : dchar)) + { + this ~= chars; + } + + ///ditto + this(Input)(Input seq) + if (!isDynamicArray!Input + && isInputRange!Input && is(ElementType!Input : dchar)) + { + this ~= seq; + } + + /// Gets a $(CODEPOINT) at the given index in this cluster. + dchar opIndex(size_t index) const pure nothrow @nogc + { + assert(index < length); + return read24(isBig ? ptr_ : small_.ptr, index); + } + + /++ + Writes a $(CODEPOINT) $(D ch) at given index in this cluster. + + Warning: + Use of this facility may invalidate grapheme cluster, + see also $(LREF Grapheme.valid). + +/ + void opIndexAssign(dchar ch, size_t index) pure nothrow @nogc + { + assert(index < length); + write24(isBig ? ptr_ : small_.ptr, ch, index); + } + + /// + @safe unittest + { + auto g = Grapheme("A\u0302"); + assert(g[0] == 'A'); + assert(g.valid); + g[1] = '~'; // ASCII tilda is not a combining mark + assert(g[1] == '~'); + assert(!g.valid); + } + + /++ + Random-access range over Grapheme's $(CHARACTERS). + + Warning: Invalidates when this Grapheme leaves the scope, + attempts to use it then would lead to memory corruption. + +/ + @system SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) pure nothrow @nogc + { + return sliceOverIndexed(a, b, &this); + } + + /// ditto + @system SliceOverIndexed!Grapheme opSlice() pure nothrow @nogc + { + return sliceOverIndexed(0, length, &this); + } + + /// Grapheme cluster length in $(CODEPOINTS). + @property size_t length() const pure nothrow @nogc + { + return isBig ? len_ : slen_ & 0x7F; + } + + /++ + Append $(CHARACTER) $(D ch) to this grapheme. + Warning: + Use of this facility may invalidate grapheme cluster, + see also $(D valid). + + See_Also: $(LREF Grapheme.valid) + +/ + ref opOpAssign(string op)(dchar ch) + { + static if (op == "~") + { + if (!isBig) + { + if (slen_ == small_cap) + convertToBig();// & fallthrough to "big" branch + else + { + write24(small_.ptr, ch, smallLength); + slen_++; + return this; + } + } + + assert(isBig); + if (len_ == cap_) + { + import core.checkedint : addu, mulu; + bool overflow; + cap_ = addu(cap_, grow, overflow); + auto nelems = mulu(3, addu(cap_, 1, overflow), overflow); + if (overflow) assert(0); + ptr_ = cast(ubyte*) pureRealloc(ptr_, nelems); + if (ptr_ is null) onOutOfMemoryError(); + } + write24(ptr_, ch, len_++); + return this; + } + else + static assert(false, "No operation "~op~" defined for Grapheme"); + } + + /// + @system unittest + { + import std.algorithm.comparison : equal; + auto g = Grapheme("A"); + assert(g.valid); + g ~= '\u0301'; + assert(g[].equal("A\u0301")); + assert(g.valid); + g ~= "B"; + // not a valid grapheme cluster anymore + assert(!g.valid); + // still could be useful though + assert(g[].equal("A\u0301B")); + } + + /// Append all $(CHARACTERS) from the input range $(D inp) to this Grapheme. + ref opOpAssign(string op, Input)(Input inp) + if (isInputRange!Input && is(ElementType!Input : dchar)) + { + static if (op == "~") + { + foreach (dchar ch; inp) + this ~= ch; + return this; + } + else + static assert(false, "No operation "~op~" defined for Grapheme"); + } + + /++ + True if this object contains valid extended grapheme cluster. + Decoding primitives of this module always return a valid $(D Grapheme). + + Appending to and direct manipulation of grapheme's $(CHARACTERS) may + render it no longer valid. Certain applications may chose to use + Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property + entirely. + +/ + @property bool valid()() /*const*/ + { + auto r = this[]; + genericDecodeGrapheme!false(r); + return r.length == 0; + } + + this(this) pure @nogc nothrow + { + if (isBig) + {// dup it + import core.checkedint : addu, mulu; + bool overflow; + auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow); + if (overflow) assert(0); + + auto p = cast(ubyte*) pureMalloc(raw_cap); + if (p is null) onOutOfMemoryError(); + p[0 .. raw_cap] = ptr_[0 .. raw_cap]; + ptr_ = p; + } + } + + ~this() pure @nogc nothrow + { + if (isBig) + { + pureFree(ptr_); + } + } + + +private: + enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1); + // "out of the blue" grow rate, needs testing + // (though graphemes are typically small < 9) + enum grow = 20; + enum small_cap = small_bytes/3; + enum small_flag = 0x80, small_mask = 0x7F; + // 16 bytes in 32bits, should be enough for the majority of cases + union + { + struct + { + ubyte* ptr_; + size_t cap_; + size_t len_; + size_t padding_; + } + struct + { + ubyte[small_bytes] small_; + ubyte slen_; + } + } + + void convertToBig() pure @nogc nothrow + { + static assert(grow.max / 3 - 1 >= grow); + enum nbytes = 3 * (grow + 1); + size_t k = smallLength; + ubyte* p = cast(ubyte*) pureMalloc(nbytes); + if (p is null) onOutOfMemoryError(); + for (int i=0; i<k; i++) + write24(p, read24(small_.ptr, i), i); + // now we can overwrite small array data + ptr_ = p; + len_ = slen_; + assert(grow > len_); + cap_ = grow; + setBig(); + } + + void setBig() pure nothrow @nogc { slen_ |= small_flag; } + + @property size_t smallLength() const pure nothrow @nogc + { + return slen_ & small_mask; + } + @property ubyte isBig() const pure nothrow @nogc + { + return slen_ & small_flag; + } +} + +static assert(Grapheme.sizeof == size_t.sizeof*4); + + +@system pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw +{ + import std.algorithm.comparison : equal; + Grapheme[3] data = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")]; + assert(byGrapheme("ЮУЗ").equal(data[])); +} + +/// +@system unittest +{ + import std.algorithm.comparison : equal; + import std.algorithm.iteration : filter; + import std.range : isRandomAccessRange; + + string bold = "ku\u0308hn"; + + // note that decodeGrapheme takes parameter by ref + auto first = decodeGrapheme(bold); + + assert(first.length == 1); + assert(first[0] == 'k'); + + // the next grapheme is 2 characters long + auto wideOne = decodeGrapheme(bold); + // slicing a grapheme yields a random-access range of dchar + assert(wideOne[].equal("u\u0308")); + assert(wideOne.length == 2); + static assert(isRandomAccessRange!(typeof(wideOne[]))); + + // all of the usual range manipulation is possible + assert(wideOne[].filter!isMark().equal("\u0308")); + + auto g = Grapheme("A"); + assert(g.valid); + g ~= '\u0301'; + assert(g[].equal("A\u0301")); + assert(g.valid); + g ~= "B"; + // not a valid grapheme cluster anymore + assert(!g.valid); + // still could be useful though + assert(g[].equal("A\u0301B")); +} + +@safe unittest +{ + auto g = Grapheme("A\u0302"); + assert(g[0] == 'A'); + assert(g.valid); + g[1] = '~'; // ASCII tilda is not a combining mark + assert(g[1] == '~'); + assert(!g.valid); +} + +@system unittest +{ + import std.algorithm.comparison : equal; + import std.algorithm.iteration : map; + import std.conv : text; + import std.range : iota; + + // not valid clusters (but it just a test) + auto g = Grapheme('a', 'b', 'c', 'd', 'e'); + assert(g[0] == 'a'); + assert(g[1] == 'b'); + assert(g[2] == 'c'); + assert(g[3] == 'd'); + assert(g[4] == 'e'); + g[3] = 'Й'; + assert(g[2] == 'c'); + assert(g[3] == 'Й', text(g[3], " vs ", 'Й')); + assert(g[4] == 'e'); + assert(!g.valid); + + g ~= 'ц'; + g ~= '~'; + assert(g[0] == 'a'); + assert(g[1] == 'b'); + assert(g[2] == 'c'); + assert(g[3] == 'Й'); + assert(g[4] == 'e'); + assert(g[5] == 'ц'); + assert(g[6] == '~'); + assert(!g.valid); + + Grapheme copy = g; + copy[0] = 'X'; + copy[1] = '-'; + assert(g[0] == 'a' && copy[0] == 'X'); + assert(g[1] == 'b' && copy[1] == '-'); + assert(equal(g[2 .. g.length], copy[2 .. copy.length])); + copy = Grapheme("АБВГДЕЁЖЗИКЛМ"); + assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8])); + copy ~= "xyz"; + assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15])); + assert(!copy.valid); + + Grapheme h; + foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"()) + h ~= v; + assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1))); +} + +/++ + $(P Does basic case-insensitive comparison of $(D r1) and $(D r2). + This function uses simpler comparison rule thus achieving better performance + than $(LREF icmp). However keep in mind the warning below.) + + Params: + r1 = an input range of characters + r2 = an input range of characters + + Returns: + An $(D int) that is 0 if the strings match, + <0 if $(D r1) is lexicographically "less" than $(D r2), + >0 if $(D r1) is lexicographically "greater" than $(D r2) + + Warning: + This function only handles 1:1 $(CODEPOINT) mapping + and thus is not sufficient for certain alphabets + like German, Greek and few others. + + See_Also: + $(LREF icmp) + $(REF cmp, std,algorithm,comparison) ++/ +int sicmp(S1, S2)(S1 r1, S2 r2) +if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1) + && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2)) +{ + import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file + import std.utf : byDchar; + + auto str1 = r1.byDchar; + auto str2 = r2.byDchar; + + foreach (immutable lhs; str1) + { + if (str2.empty) + return 1; + immutable rhs = str2.front; + str2.popFront(); + int diff = lhs - rhs; + if (!diff) + continue; + size_t idx = simpleCaseTrie[lhs]; + size_t idx2 = simpleCaseTrie[rhs]; + // simpleCaseTrie is packed index table + if (idx != EMPTY_CASE_TRIE) + { + if (idx2 != EMPTY_CASE_TRIE) + {// both cased chars + // adjust idx --> start of bucket + idx = idx - sTable[idx].n; + idx2 = idx2 - sTable[idx2].n; + if (idx == idx2)// one bucket, equivalent chars + continue; + else// not the same bucket + diff = sTable[idx].ch - sTable[idx2].ch; + } + else + diff = sTable[idx - sTable[idx].n].ch - rhs; + } + else if (idx2 != EMPTY_CASE_TRIE) + { + diff = lhs - sTable[idx2 - sTable[idx2].n].ch; + } + // one of chars is not cased at all + return diff; + } + return str2.empty ? 0 : -1; +} + +/// +@safe @nogc pure nothrow unittest +{ + assert(sicmp("Август", "авгусТ") == 0); + // Greek also works as long as there is no 1:M mapping in sight + assert(sicmp("ΌΎ", "όύ") == 0); + // things like the following won't get matched as equal + // Greek small letter iota with dialytika and tonos + assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0); + + // while icmp has no problem with that + assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0); + assert(icmp("ΌΎ", "όύ") == 0); +} + +// overloads for the most common cases to reduce compile time +@safe @nogc pure nothrow +{ + int sicmp(const(char)[] str1, const(char)[] str2) + { return sicmp!(const(char)[], const(char)[])(str1, str2); } + int sicmp(const(wchar)[] str1, const(wchar)[] str2) + { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); } + int sicmp(const(dchar)[] str1, const(dchar)[] str2) + { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); } +} + +private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail) +{ + import std.algorithm.searching : skipOver; + import std.internal.unicode_tables : fullCaseTable; // generated file + alias fTable = fullCaseTable; + size_t idx = fullCaseTrie[lhs]; + // fullCaseTrie is packed index table + if (idx == EMPTY_CASE_TRIE) + return lhs; + immutable start = idx - fTable[idx].n; + immutable end = fTable[idx].size + start; + assert(fTable[start].entry_len == 1); + for (idx=start; idx<end; idx++) + { + auto entryLen = fTable[idx].entry_len; + if (entryLen == 1) + { + if (fTable[idx].seq[0] == rhs) + { + return 0; + } + } + else + {// OK it's a long chunk, like 'ss' for German + dstring seq = fTable[idx].seq[0 .. entryLen]; + if (rhs == seq[0] + && rtail.skipOver(seq[1..$])) + { + // note that this path modifies rtail + // iff we managed to get there + return 0; + } + } + } + return fTable[start].seq[0]; // new remapped character for accurate diffs +} + +/++ + Does case insensitive comparison of `r1` and `r2`. + Follows the rules of full case-folding mapping. + This includes matching as equal german ß with "ss" and + other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp). + The cost of `icmp` being pedantically correct is + slightly worse performance. + + Params: + r1 = a forward range of characters + r2 = a forward range of characters + + Returns: + An $(D int) that is 0 if the strings match, + <0 if $(D str1) is lexicographically "less" than $(D str2), + >0 if $(D str1) is lexicographically "greater" than $(D str2) + + See_Also: + $(LREF sicmp) + $(REF cmp, std,algorithm,comparison) ++/ +int icmp(S1, S2)(S1 r1, S2 r2) +if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1) + && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2)) +{ + import std.utf : byDchar; + + auto str1 = r1.byDchar; + auto str2 = r2.byDchar; + + for (;;) + { + if (str1.empty) + return str2.empty ? 0 : -1; + immutable lhs = str1.front; + if (str2.empty) + return 1; + immutable rhs = str2.front; + str1.popFront(); + str2.popFront(); + if (!(lhs - rhs)) + continue; + // first try to match lhs to <rhs,right-tail> sequence + immutable cmpLR = fullCasedCmp(lhs, rhs, str2); + if (!cmpLR) + continue; + // then rhs to <lhs,left-tail> sequence + immutable cmpRL = fullCasedCmp(rhs, lhs, str1); + if (!cmpRL) + continue; + // cmpXX contain remapped codepoints + // to obtain stable ordering of icmp + return cmpLR - cmpRL; + } +} + +/// +@safe @nogc pure nothrow unittest +{ + assert(icmp("Rußland", "Russland") == 0); + assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0); +} + +/** + * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding + * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`. + */ +@safe @nogc nothrow pure unittest +{ + import std.utf : byDchar; + + assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0); + assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0); +} + +// test different character types +@safe unittest +{ + assert(icmp("Rußland", "Russland") == 0); + assert(icmp("Rußland"w, "Russland") == 0); + assert(icmp("Rußland", "Russland"w) == 0); + assert(icmp("Rußland"w, "Russland"w) == 0); + assert(icmp("Rußland"d, "Russland"w) == 0); + assert(icmp("Rußland"w, "Russland"d) == 0); +} + +// overloads for the most common cases to reduce compile time +@safe @nogc pure nothrow +{ + int icmp(const(char)[] str1, const(char)[] str2) + { return icmp!(const(char)[], const(char)[])(str1, str2); } + int icmp(const(wchar)[] str1, const(wchar)[] str2) + { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); } + int icmp(const(dchar)[] str1, const(dchar)[] str2) + { return icmp!(const(dchar)[], const(dchar)[])(str1, str2); } +} + +@safe unittest +{ + import std.algorithm.sorting : sort; + import std.conv : to; + import std.exception : assertCTFEable; + assertCTFEable!( + { + foreach (cfunc; AliasSeq!(icmp, sicmp)) + { + foreach (S1; AliasSeq!(string, wstring, dstring)) + foreach (S2; AliasSeq!(string, wstring, dstring)) + (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396 + assert(cfunc("".to!S1(), "".to!S2()) == 0); + assert(cfunc("A".to!S1(), "".to!S2()) > 0); + assert(cfunc("".to!S1(), "0".to!S2()) < 0); + assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0); + assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0); + assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0); + assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0); + assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0); + // Check example: + assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0); + assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0); + }(); + // check that the order is properly agnostic to the case + auto strs = [ "Apple", "ORANGE", "orAcle", "amp", "banana"]; + sort!((a,b) => cfunc(a,b) < 0)(strs); + assert(strs == ["amp", "Apple", "banana", "orAcle", "ORANGE"]); + } + assert(icmp("ßb", "ssa") > 0); + // Check example: + assert(icmp("Russland", "Rußland") == 0); + assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0); + assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0); + assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0); + //bugzilla 11057 + assert( icmp("K", "L") < 0 ); + }); +} + +// issue 17372 +@safe pure unittest +{ + import std.algorithm.iteration : joiner, map; + import std.algorithm.sorting : sort; + import std.array : array; + auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0); +} + +// This is package for the moment to be used as a support tool for std.regex +// It needs a better API +/* + Return a range of all $(CODEPOINTS) that casefold to + and from this $(D ch). +*/ +package auto simpleCaseFoldings(dchar ch) @safe +{ + import std.internal.unicode_tables : simpleCaseTable; // generated file + alias sTable = simpleCaseTable; + static struct Range + { + @safe pure nothrow: + uint idx; //if == uint.max, then read c. + union + { + dchar c; // == 0 - empty range + uint len; + } + @property bool isSmall() const { return idx == uint.max; } + + this(dchar ch) + { + idx = uint.max; + c = ch; + } + + this(uint start, uint size) + { + idx = start; + len = size; + } + + @property dchar front() const + { + assert(!empty); + if (isSmall) + { + return c; + } + auto ch = sTable[idx].ch; + return ch; + } + + @property bool empty() const + { + if (isSmall) + { + return c == 0; + } + return len == 0; + } + + @property size_t length() const + { + if (isSmall) + { + return c == 0 ? 0 : 1; + } + return len; + } + + void popFront() + { + if (isSmall) + c = 0; + else + { + idx++; + len--; + } + } + } + immutable idx = simpleCaseTrie[ch]; + if (idx == EMPTY_CASE_TRIE) + return Range(ch); + auto entry = sTable[idx]; + immutable start = idx - entry.n; + return Range(start, entry.size); +} + +@system unittest +{ + import std.algorithm.comparison : equal; + import std.algorithm.searching : canFind; + import std.array : array; + import std.exception : assertCTFEable; + assertCTFEable!((){ + auto r = simpleCaseFoldings('Э').array; + assert(r.length == 2); + assert(r.canFind('э') && r.canFind('Э')); + auto sr = simpleCaseFoldings('~'); + assert(sr.equal("~")); + //A with ring above - casefolds to the same bucket as Angstrom sign + sr = simpleCaseFoldings('Å'); + assert(sr.length == 3); + assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B')); + }); +} + +/++ + $(P Returns the $(S_LINK Combining class, combining class) of $(D ch).) ++/ +ubyte combiningClass(dchar ch) @safe pure nothrow @nogc +{ + return combiningClassTrie[ch]; +} + +/// +@safe unittest +{ + // shorten the code + alias CC = combiningClass; + + // combining tilda + assert(CC('\u0303') == 230); + // combining ring below + assert(CC('\u0325') == 220); + // the simple consequence is that "tilda" should be + // placed after a "ring below" in a sequence +} + +@safe pure nothrow @nogc unittest +{ + foreach (ch; 0 .. 0x80) + assert(combiningClass(ch) == 0); + assert(combiningClass('\u05BD') == 22); + assert(combiningClass('\u0300') == 230); + assert(combiningClass('\u0317') == 220); + assert(combiningClass('\u1939') == 222); +} + +/// Unicode character decomposition type. +enum UnicodeDecomposition { + /// Canonical decomposition. The result is canonically equivalent sequence. + Canonical, + /** + Compatibility decomposition. The result is compatibility equivalent sequence. + Note: Compatibility decomposition is a $(B lossy) conversion, + typically suitable only for fuzzy matching and internal processing. + */ + Compatibility +} + +/** + Shorthand aliases for character decomposition type, passed as a + template parameter to $(LREF decompose). +*/ +enum { + Canonical = UnicodeDecomposition.Canonical, + Compatibility = UnicodeDecomposition.Compatibility +} + +/++ + Try to canonically compose 2 $(CHARACTERS). + Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise. + + The assumption is that $(D first) comes before $(D second) in the original text, + usually meaning that the first is a starter. + + Note: Hangul syllables are not covered by this function. + See $(D composeJamo) below. ++/ +public dchar compose(dchar first, dchar second) pure nothrow @safe +{ + import std.algorithm.iteration : map; + import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask; + import std.range : assumeSorted; + immutable packed = compositionJumpTrie[first]; + if (packed == ushort.max) + return dchar.init; + // unpack offset and length + immutable idx = packed & composeIdxMask, cnt = packed >> composeCntShift; + // TODO: optimize this micro binary search (no more then 4-5 steps) + auto r = compositionTable[idx .. idx+cnt].map!"a.rhs"().assumeSorted(); + immutable target = r.lowerBound(second).length; + if (target == cnt) + return dchar.init; + immutable entry = compositionTable[idx+target]; + if (entry.rhs != second) + return dchar.init; + return entry.composed; +} + +/// +@safe unittest +{ + assert(compose('A','\u0308') == '\u00C4'); + assert(compose('A', 'B') == dchar.init); + assert(compose('C', '\u0301') == '\u0106'); + // note that the starter is the first one + // thus the following doesn't compose + assert(compose('\u0308', 'A') == dchar.init); +} + +/++ + Returns a full $(S_LINK Canonical decomposition, Canonical) + (by default) or $(S_LINK Compatibility decomposition, Compatibility) + decomposition of $(CHARACTER) $(D ch). + If no decomposition is available returns a $(LREF Grapheme) + with the $(D ch) itself. + + Note: + This function also decomposes hangul syllables + as prescribed by the standard. + + See_Also: $(LREF decomposeHangul) for a restricted version + that takes into account only hangul syllables but + no other decompositions. ++/ +public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe +{ + import std.algorithm.searching : until; + import std.internal.unicode_decomp : decompCompatTable, decompCanonTable; + static if (decompType == Canonical) + { + alias table = decompCanonTable; + alias mapping = canonMappingTrie; + } + else static if (decompType == Compatibility) + { + alias table = decompCompatTable; + alias mapping = compatMappingTrie; + } + immutable idx = mapping[ch]; + if (!idx) // not found, check hangul arithmetic decomposition + return decomposeHangul(ch); + auto decomp = table[idx..$].until(0); + return Grapheme(decomp); +} + +/// +@system unittest +{ + import std.algorithm.comparison : equal; + + assert(compose('A','\u0308') == '\u00C4'); + assert(compose('A', 'B') == dchar.init); + assert(compose('C', '\u0301') == '\u0106'); + // note that the starter is the first one + // thus the following doesn't compose + assert(compose('\u0308', 'A') == dchar.init); + + assert(decompose('Ĉ')[].equal("C\u0302")); + assert(decompose('D')[].equal("D")); + assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7")); + assert(decompose!Compatibility('¹')[].equal("1")); +} + +//---------------------------------------------------------------------------- +// Hangul specific composition/decomposition +enum jamoSBase = 0xAC00; +enum jamoLBase = 0x1100; +enum jamoVBase = 0x1161; +enum jamoTBase = 0x11A7; +enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28; +enum jamoNCount = jamoVCount * jamoTCount; +enum jamoSCount = jamoLCount * jamoNCount; + +// Tests if $(D ch) is a Hangul leading consonant jamo. +bool isJamoL(dchar ch) pure nothrow @nogc @safe +{ + // first cmp rejects ~ 1M code points above leading jamo range + return ch < jamoLBase+jamoLCount && ch >= jamoLBase; +} + +// Tests if $(D ch) is a Hangul vowel jamo. +bool isJamoT(dchar ch) pure nothrow @nogc @safe +{ + // first cmp rejects ~ 1M code points above trailing jamo range + // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0) + return ch < jamoTBase+jamoTCount && ch > jamoTBase; +} + +// Tests if $(D ch) is a Hangul trailnig consonant jamo. +bool isJamoV(dchar ch) pure nothrow @nogc @safe +{ + // first cmp rejects ~ 1M code points above vowel range + return ch < jamoVBase+jamoVCount && ch >= jamoVBase; +} + +int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe +{ + int idxS = cast(int) ch - jamoSBase; + return idxS >= 0 && idxS < jamoSCount ? idxS : -1; +} + +// internal helper: compose hangul syllables leaving dchar.init in holes +void hangulRecompose(dchar[] seq) pure nothrow @nogc @safe +{ + for (size_t idx = 0; idx + 1 < seq.length; ) + { + if (isJamoL(seq[idx]) && isJamoV(seq[idx+1])) + { + immutable int indexL = seq[idx] - jamoLBase; + immutable int indexV = seq[idx+1] - jamoVBase; + immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount; + if (idx + 2 < seq.length && isJamoT(seq[idx+2])) + { + seq[idx] = jamoSBase + indexLV + seq[idx+2] - jamoTBase; + seq[idx+1] = dchar.init; + seq[idx+2] = dchar.init; + idx += 3; + } + else + { + seq[idx] = jamoSBase + indexLV; + seq[idx+1] = dchar.init; + idx += 2; + } + } + else + idx++; + } +} + +//---------------------------------------------------------------------------- +public: + +/** + Decomposes a Hangul syllable. If $(D ch) is not a composed syllable + then this function returns $(LREF Grapheme) containing only $(D ch) as is. +*/ +Grapheme decomposeHangul(dchar ch) @safe +{ + immutable idxS = cast(int) ch - jamoSBase; + if (idxS < 0 || idxS >= jamoSCount) return Grapheme(ch); + immutable idxL = idxS / jamoNCount; + immutable idxV = (idxS % jamoNCount) / jamoTCount; + immutable idxT = idxS % jamoTCount; + + immutable partL = jamoLBase + idxL; + immutable partV = jamoVBase + idxV; + if (idxT > 0) // there is a trailling consonant (T); <L,V,T> decomposition + return Grapheme(partL, partV, jamoTBase + idxT); + else // <L, V> decomposition + return Grapheme(partL, partV); +} + +/// +@system unittest +{ + import std.algorithm.comparison : equal; + assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6")); +} + +/++ + Try to compose hangul syllable out of a leading consonant ($(D lead)), + a $(D vowel) and optional $(D trailing) consonant jamos. + + On success returns the composed LV or LVT hangul syllable. + + If any of $(D lead) and $(D vowel) are not a valid hangul jamo + of the respective $(CHARACTER) class returns dchar.init. ++/ +dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe +{ + if (!isJamoL(lead)) + return dchar.init; + immutable indexL = lead - jamoLBase; + if (!isJamoV(vowel)) + return dchar.init; + immutable indexV = vowel - jamoVBase; + immutable indexLV = indexL * jamoNCount + indexV * jamoTCount; + immutable dchar syllable = jamoSBase + indexLV; + return isJamoT(trailing) ? syllable + (trailing - jamoTBase) : syllable; +} + +/// +@safe unittest +{ + assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB'); + // leaving out T-vowel, or passing any codepoint + // that is not trailing consonant composes an LV-syllable + assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); + assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC'); + assert(composeJamo('\u1111', 'A') == dchar.init); + assert(composeJamo('A', '\u1171') == dchar.init); +} + +@system unittest +{ + import std.algorithm.comparison : equal; + import std.conv : text; + + static void testDecomp(UnicodeDecomposition T)(dchar ch, string r) + { + Grapheme g = decompose!T(ch); + assert(equal(g[], r), text(g[], " vs ", r)); + } + testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345"); + testDecomp!Canonical('\uF907', "\u9F9C"); + testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C"); + testDecomp!Compatibility('\uA7F9', "\u0153"); + + // check examples + assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6")); + assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB'); + assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out T-vowel + assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC'); + assert(composeJamo('\u1111', 'A') == dchar.init); + assert(composeJamo('A', '\u1171') == dchar.init); +} + +/** + Enumeration type for normalization forms, + passed as template parameter for functions like $(LREF normalize). +*/ +enum NormalizationForm { + NFC, + NFD, + NFKC, + NFKD +} + + +enum { + /** + Shorthand aliases from values indicating normalization forms. + */ + NFC = NormalizationForm.NFC, + ///ditto + NFD = NormalizationForm.NFD, + ///ditto + NFKC = NormalizationForm.NFKC, + ///ditto + NFKD = NormalizationForm.NFKD +} + +/++ + Returns $(D input) string normalized to the chosen form. + Form C is used by default. + + For more information on normalization forms see + the $(S_LINK Normalization, normalization section). + + Note: + In cases where the string in question is already normalized, + it is returned unmodified and no memory allocation happens. ++/ +inout(C)[] normalize(NormalizationForm norm=NFC, C)(inout(C)[] input) +{ + import std.algorithm.mutation : SwapStrategy; + import std.algorithm.sorting : sort; + import std.array : appender; + import std.range : zip; + + auto anchors = splitNormalized!norm(input); + if (anchors[0] == input.length && anchors[1] == input.length) + return input; + dchar[] decomposed; + decomposed.reserve(31); + ubyte[] ccc; + ccc.reserve(31); + auto app = appender!(C[])(); + do + { + app.put(input[0 .. anchors[0]]); + foreach (dchar ch; input[anchors[0]..anchors[1]]) + static if (norm == NFD || norm == NFC) + { + foreach (dchar c; decompose!Canonical(ch)[]) + decomposed ~= c; + } + else // NFKD & NFKC + { + foreach (dchar c; decompose!Compatibility(ch)[]) + decomposed ~= c; + } + ccc.length = decomposed.length; + size_t firstNonStable = 0; + ubyte lastClazz = 0; + + foreach (idx, dchar ch; decomposed) + { + immutable clazz = combiningClass(ch); + ccc[idx] = clazz; + if (clazz == 0 && lastClazz != 0) + { + // found a stable code point after unstable ones + sort!("a[0] < b[0]", SwapStrategy.stable) + (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx])); + firstNonStable = decomposed.length; + } + else if (clazz != 0 && lastClazz == 0) + { + // found first unstable code point after stable ones + firstNonStable = idx; + } + lastClazz = clazz; + } + sort!("a[0] < b[0]", SwapStrategy.stable) + (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$])); + static if (norm == NFC || norm == NFKC) + { + import std.algorithm.searching : countUntil; + auto first = countUntil(ccc, 0); + if (first >= 0) // no starters?? no recomposition + { + for (;;) + { + immutable second = recompose(first, decomposed, ccc); + if (second == decomposed.length) + break; + first = second; + } + // 2nd pass for hangul syllables + hangulRecompose(decomposed); + } + } + static if (norm == NFD || norm == NFKD) + app.put(decomposed); + else + { + import std.algorithm.mutation : remove; + auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed); + app.put(decomposed[0 .. clean.length]); + } + // reset variables + decomposed.length = 0; + decomposed.assumeSafeAppend(); + ccc.length = 0; + ccc.assumeSafeAppend(); + input = input[anchors[1]..$]; + // and move on + anchors = splitNormalized!norm(input); + }while (anchors[0] != input.length); + app.put(input[0 .. anchors[0]]); + return cast(inout(C)[])app.data; +} + +/// +@safe unittest +{ + // any encoding works + wstring greet = "Hello world"; + assert(normalize(greet) is greet); // the same exact slice + + // An example of a character with all 4 forms being different: + // Greek upsilon with acute and hook symbol (code point 0x03D3) + assert(normalize!NFC("ϓ") == "\u03D3"); + assert(normalize!NFD("ϓ") == "\u03D2\u0301"); + assert(normalize!NFKC("ϓ") == "\u038E"); + assert(normalize!NFKD("ϓ") == "\u03A5\u0301"); +} + +@safe unittest +{ + import std.conv : text; + + assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def"))); + assert(normalize!NFKD("2¹⁰") == "210", normalize!NFKD("2¹⁰")); + assert(normalize!NFD("Äffin") == "A\u0308ffin"); + + // check example + + // any encoding works + wstring greet = "Hello world"; + assert(normalize(greet) is greet); // the same exact slice + + // An example of a character with all 4 forms being different: + // Greek upsilon with acute and hook symbol (code point 0x03D3) + assert(normalize!NFC("ϓ") == "\u03D3"); + assert(normalize!NFD("ϓ") == "\u03D2\u0301"); + assert(normalize!NFKC("ϓ") == "\u038E"); + assert(normalize!NFKD("ϓ") == "\u03A5\u0301"); +} + +// canonically recompose given slice of code points, works in-place and mutates data +private size_t recompose(size_t start, dchar[] input, ubyte[] ccc) pure nothrow @safe +{ + assert(input.length == ccc.length); + int accumCC = -1;// so that it's out of 0 .. 255 range + // writefln("recomposing %( %04x %)", input); + // first one is always a starter thus we start at i == 1 + size_t i = start+1; + for (; ; ) + { + if (i == input.length) + break; + immutable curCC = ccc[i]; + // In any character sequence beginning with a starter S + // a character C is blocked from S if and only if there + // is some character B between S and C, and either B + // is a starter or it has the same or higher combining class as C. + //------------------------ + // Applying to our case: + // S is input[0] + // accumCC is the maximum CCC of characters between C and S, + // as ccc are sorted + // C is input[i] + + if (curCC > accumCC) + { + immutable comp = compose(input[start], input[i]); + if (comp != dchar.init) + { + input[start] = comp; + input[i] = dchar.init;// put a sentinel + // current was merged so its CCC shouldn't affect + // composing with the next one + } + else + { + // if it was a starter then accumCC is now 0, end of loop + accumCC = curCC; + if (accumCC == 0) + break; + } + } + else + { + // ditto here + accumCC = curCC; + if (accumCC == 0) + break; + } + i++; + } + return i; +} + +// returns tuple of 2 indexes that delimit: +// normalized text, piece that needs normalization and +// the rest of input starting with stable code point +private auto splitNormalized(NormalizationForm norm, C)(const(C)[] input) +{ + import std.typecons : tuple; + ubyte lastCC = 0; + + foreach (idx, dchar ch; input) + { + static if (norm == NFC) + if (ch < 0x0300) + { + lastCC = 0; + continue; + } + immutable ubyte CC = combiningClass(ch); + if (lastCC > CC && CC != 0) + { + return seekStable!norm(idx, input); + } + + if (notAllowedIn!norm(ch)) + { + return seekStable!norm(idx, input); + } + lastCC = CC; + } + return tuple(input.length, input.length); +} + +private auto seekStable(NormalizationForm norm, C)(size_t idx, in C[] input) +{ + import std.typecons : tuple; + import std.utf : codeLength; + + auto br = input[0 .. idx]; + size_t region_start = 0;// default + for (;;) + { + if (br.empty)// start is 0 + break; + dchar ch = br.back; + if (combiningClass(ch) == 0 && allowedIn!norm(ch)) + { + region_start = br.length - codeLength!C(ch); + break; + } + br.popFront(); + } + ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..." + size_t region_end=input.length;// end is $ by default + foreach (i, dchar ch; input[idx..$]) + { + if (combiningClass(ch) == 0 && allowedIn!norm(ch)) + { + region_end = i+idx; + break; + } + } + // writeln("Region to normalize: ", input[region_start .. region_end]); + return tuple(region_start, region_end); +} + +/** + Tests if dchar $(D ch) is always allowed (Quick_Check=YES) in normalization + form $(D norm). +*/ +public bool allowedIn(NormalizationForm norm)(dchar ch) +{ + return !notAllowedIn!norm(ch); +} + +/// +@safe unittest +{ + // e.g. Cyrillic is always allowed, so is ASCII + assert(allowedIn!NFC('я')); + assert(allowedIn!NFD('я')); + assert(allowedIn!NFKC('я')); + assert(allowedIn!NFKD('я')); + assert(allowedIn!NFC('Z')); +} + +// not user friendly name but more direct +private bool notAllowedIn(NormalizationForm norm)(dchar ch) +{ + static if (norm == NFC) + alias qcTrie = nfcQCTrie; + else static if (norm == NFD) + alias qcTrie = nfdQCTrie; + else static if (norm == NFKC) + alias qcTrie = nfkcQCTrie; + else static if (norm == NFKD) + alias qcTrie = nfkdQCTrie; + else + static assert("Unknown normalization form "~norm); + return qcTrie[ch]; +} + +@safe unittest +{ + assert(allowedIn!NFC('я')); + assert(allowedIn!NFD('я')); + assert(allowedIn!NFKC('я')); + assert(allowedIn!NFKD('я')); + assert(allowedIn!NFC('Z')); +} + +} + +version (std_uni_bootstrap) +{ + // old version used for bootstrapping of gen_uni.d that generates + // up to date optimal versions of all of isXXX functions + @safe pure nothrow @nogc public bool isWhite(dchar c) + { + import std.ascii : isWhite; + return isWhite(c) || + c == lineSep || c == paraSep || + c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' || + (c >= '\u2000' && c <= '\u200A') || + c == '\u202F' || c == '\u205F' || c == '\u3000'; + } +} +else +{ + +// trusted -> avoid bounds check +@trusted pure nothrow @nogc private +{ + import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file + + // hide template instances behind functions (Bugzilla 13232) + ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; } + ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; } + dchar toLowerTab(size_t idx) { return toLowerTable[idx]; } + + ushort toTitleIndex(dchar c) { return toTitleIndexTrie[c]; } + ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; } + dchar toTitleTab(size_t idx) { return toTitleTable[idx]; } + + ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; } + ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; } + dchar toUpperTab(size_t idx) { return toUpperTable[idx]; } +} + +public: + +/++ + Whether or not $(D c) is a Unicode whitespace $(CHARACTER). + (general Unicode category: Part of C0(tab, vertical tab, form feed, + carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085)) ++/ +@safe pure nothrow @nogc +public bool isWhite(dchar c) +{ + import std.internal.unicode_tables : isWhiteGen; // generated file + return isWhiteGen(c); // call pregenerated binary search +} + +/++ + Return whether $(D c) is a Unicode lowercase $(CHARACTER). ++/ +@safe pure nothrow @nogc +bool isLower(dchar c) +{ + import std.ascii : isLower, isASCII; + if (isASCII(c)) + return isLower(c); + return lowerCaseTrie[c]; +} + +@safe unittest +{ + import std.ascii : isLower; + foreach (v; 0 .. 0x80) + assert(isLower(v) == .isLower(v)); + assert(.isLower('я')); + assert(.isLower('й')); + assert(!.isLower('Ж')); + // Greek HETA + assert(!.isLower('\u0370')); + assert(.isLower('\u0371')); + assert(!.isLower('\u039C')); // capital MU + assert(.isLower('\u03B2')); // beta + // from extended Greek + assert(!.isLower('\u1F18')); + assert(.isLower('\u1F00')); + foreach (v; unicode.lowerCase.byCodepoint) + assert(.isLower(v) && !isUpper(v)); +} + + +/++ + Return whether $(D c) is a Unicode uppercase $(CHARACTER). ++/ +@safe pure nothrow @nogc +bool isUpper(dchar c) +{ + import std.ascii : isUpper, isASCII; + if (isASCII(c)) + return isUpper(c); + return upperCaseTrie[c]; +} + +@safe unittest +{ + import std.ascii : isLower; + foreach (v; 0 .. 0x80) + assert(isLower(v) == .isLower(v)); + assert(!isUpper('й')); + assert(isUpper('Ж')); + // Greek HETA + assert(isUpper('\u0370')); + assert(!isUpper('\u0371')); + assert(isUpper('\u039C')); // capital MU + assert(!isUpper('\u03B2')); // beta + // from extended Greek + assert(!isUpper('\u1F00')); + assert(isUpper('\u1F18')); + foreach (v; unicode.upperCase.byCodepoint) + assert(isUpper(v) && !.isLower(v)); +} + + +//TODO: Hidden for now, needs better API. +//Other transforms could use better API as well, but this one is a new primitive. +@safe pure nothrow @nogc +private dchar toTitlecase(dchar c) +{ + // optimize ASCII case + if (c < 0xAA) + { + if (c < 'a') + return c; + if (c <= 'z') + return c - 32; + return c; + } + size_t idx = toTitleSimpleIndex(c); + if (idx != ushort.max) + { + return toTitleTab(idx); + } + return c; +} + +private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab); +private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab); + +// generic toUpper/toLower on whole string, creates new or returns as is +private S toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s) @trusted pure +if (isSomeString!S) +{ + import std.array : appender; + import std.ascii : isASCII; + + foreach (i, dchar cOuter; s) + { + ushort idx = indexFn(cOuter); + if (idx == ushort.max) + continue; + auto result = appender!S(s[0 .. i]); + result.reserve(s.length); + foreach (dchar c; s[i .. $]) + { + if (c.isASCII) + { + result.put(asciiConvert(c)); + } + else + { + idx = indexFn(c); + if (idx == ushort.max) + result.put(c); + else if (idx < maxIdx) + { + c = tableFn(idx); + result.put(c); + } + else + { + auto val = tableFn(idx); + // unpack length + codepoint + immutable uint len = val >> 24; + result.put(cast(dchar)(val & 0xFF_FFFF)); + foreach (j; idx+1 .. idx+len) + result.put(tableFn(j)); + } + } + } + return result.data; + } + return s; +} + +@safe unittest //12428 +{ + import std.array : replicate; + auto s = "abcdefghij".replicate(300); + s = s[0 .. 10]; + + toUpper(s); + + assert(s == "abcdefghij"); +} + + +// generic toUpper/toLower on whole range, returns range +private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str) + // Accept range of dchar's +if (isInputRange!Range && + isSomeChar!(ElementEncodingType!Range) && + ElementEncodingType!Range.sizeof == dchar.sizeof) +{ + static struct ToCaserImpl + { + @property bool empty() + { + return !nLeft && r.empty; + } + + @property auto front() + { + import std.ascii : isASCII; + + if (!nLeft) + { + dchar c = r.front; + if (c.isASCII) + { + buf[0] = asciiConvert(c); + nLeft = 1; + } + else + { + const idx = indexFn(c); + if (idx == ushort.max) + { + buf[0] = c; + nLeft = 1; + } + else if (idx < maxIdx) + { + buf[0] = tableFn(idx); + nLeft = 1; + } + else + { + immutable val = tableFn(idx); + // unpack length + codepoint + nLeft = val >> 24; + if (nLeft == 0) + nLeft = 1; + assert(nLeft <= buf.length); + buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF); + foreach (j; 1 .. nLeft) + buf[nLeft - j - 1] = tableFn(idx + j); + } + } + } + return buf[nLeft - 1]; + } + + void popFront() + { + if (!nLeft) + front; + assert(nLeft); + --nLeft; + if (!nLeft) + r.popFront(); + } + + static if (isForwardRange!Range) + { + @property auto save() + { + auto ret = this; + ret.r = r.save; + return ret; + } + } + + private: + Range r; + uint nLeft; + dchar[3] buf = void; + } + + return ToCaserImpl(str); +} + +/********************* + * Convert input range or string to upper or lower case. + * + * Does not allocate memory. + * Characters in UTF-8 or UTF-16 format that cannot be decoded + * are treated as $(REF replacementDchar, std,utf). + * + * Params: + * str = string or range of characters + * + * Returns: + * an InputRange of dchars + * + * See_Also: + * $(LREF toUpper), $(LREF toLower) + */ + +auto asLowerCase(Range)(Range str) +if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && + !isConvertibleToString!Range) +{ + static if (ElementEncodingType!Range.sizeof < dchar.sizeof) + { + import std.utf : byDchar; + + // Decode first + return asLowerCase(str.byDchar); + } + else + { + static import std.ascii; + return toCaser!(LowerTriple, std.ascii.toLower)(str); + } +} + +/// ditto +auto asUpperCase(Range)(Range str) +if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && + !isConvertibleToString!Range) +{ + static if (ElementEncodingType!Range.sizeof < dchar.sizeof) + { + import std.utf : byDchar; + + // Decode first + return asUpperCase(str.byDchar); + } + else + { + static import std.ascii; + return toCaser!(UpperTriple, std.ascii.toUpper)(str); + } +} + +/// +@safe pure unittest +{ + import std.algorithm.comparison : equal; + + assert("hEllo".asUpperCase.equal("HELLO")); +} + +// explicitly undocumented +auto asLowerCase(Range)(auto ref Range str) +if (isConvertibleToString!Range) +{ + import std.traits : StringTypeOf; + return asLowerCase!(StringTypeOf!Range)(str); +} + +// explicitly undocumented +auto asUpperCase(Range)(auto ref Range str) +if (isConvertibleToString!Range) +{ + import std.traits : StringTypeOf; + return asUpperCase!(StringTypeOf!Range)(str); +} + +@safe unittest +{ + assert(testAliasedString!asLowerCase("hEllo")); + assert(testAliasedString!asUpperCase("hEllo")); +} + +@safe unittest +{ + import std.array : array; + + auto a = "HELLo".asLowerCase; + auto savea = a.save; + auto s = a.array; + assert(s == "hello"); + s = savea.array; + assert(s == "hello"); + + string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"]; + string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"]; + + foreach (i, slwr; lower) + { + import std.utf : byChar; + + auto sx = slwr.asUpperCase.byChar.array; + assert(sx == toUpper(slwr)); + auto sy = upper[i].asLowerCase.byChar.array; + assert(sy == toLower(upper[i])); + } + + // Not necessary to call r.front + for (auto r = lower[3].asUpperCase; !r.empty; r.popFront()) + { + } + + import std.algorithm.comparison : equal; + + "HELLo"w.asLowerCase.equal("hello"d); + "HELLo"w.asUpperCase.equal("HELLO"d); + "HELLo"d.asLowerCase.equal("hello"d); + "HELLo"d.asUpperCase.equal("HELLO"d); + + import std.utf : byChar; + assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array); +} + +// generic capitalizer on whole range, returns range +private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper, + Range)(Range str) + // Accept range of dchar's +if (isInputRange!Range && + isSomeChar!(ElementEncodingType!Range) && + ElementEncodingType!Range.sizeof == dchar.sizeof) +{ + static struct ToCapitalizerImpl + { + @property bool empty() + { + return lower ? lwr.empty : !nLeft && r.empty; + } + + @property auto front() + { + if (lower) + return lwr.front; + + if (!nLeft) + { + immutable dchar c = r.front; + const idx = indexFnUpper(c); + if (idx == ushort.max) + { + buf[0] = c; + nLeft = 1; + } + else if (idx < maxIdxUpper) + { + buf[0] = tableFnUpper(idx); + nLeft = 1; + } + else + { + immutable val = tableFnUpper(idx); + // unpack length + codepoint + nLeft = val >> 24; + if (nLeft == 0) + nLeft = 1; + assert(nLeft <= buf.length); + buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF); + foreach (j; 1 .. nLeft) + buf[nLeft - j - 1] = tableFnUpper(idx + j); + } + } + return buf[nLeft - 1]; + } + + void popFront() + { + if (lower) + lwr.popFront(); + else + { + if (!nLeft) + front; + assert(nLeft); + --nLeft; + if (!nLeft) + { + r.popFront(); + lwr = r.asLowerCase(); + lower = true; + } + } + } + + static if (isForwardRange!Range) + { + @property auto save() + { + auto ret = this; + ret.r = r.save; + ret.lwr = lwr.save; + return ret; + } + } + + private: + Range r; + typeof(r.asLowerCase) lwr; // range representing the lower case rest of string + bool lower = false; // false for first character, true for rest of string + dchar[3] buf = void; + uint nLeft = 0; + } + + return ToCapitalizerImpl(str); +} + +/********************* + * Capitalize input range or string, meaning convert the first + * character to upper case and subsequent characters to lower case. + * + * Does not allocate memory. + * Characters in UTF-8 or UTF-16 format that cannot be decoded + * are treated as $(REF replacementDchar, std,utf). + * + * Params: + * str = string or range of characters + * + * Returns: + * an InputRange of dchars + * + * See_Also: + * $(LREF toUpper), $(LREF toLower) + * $(LREF asUpperCase), $(LREF asLowerCase) + */ + +auto asCapitalized(Range)(Range str) +if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && + !isConvertibleToString!Range) +{ + static if (ElementEncodingType!Range.sizeof < dchar.sizeof) + { + import std.utf : byDchar; + + // Decode first + return toCapitalizer!UpperTriple(str.byDchar); + } + else + { + return toCapitalizer!UpperTriple(str); + } +} + +/// +@safe pure unittest +{ + import std.algorithm.comparison : equal; + + assert("hEllo".asCapitalized.equal("Hello")); +} + +auto asCapitalized(Range)(auto ref Range str) +if (isConvertibleToString!Range) +{ + import std.traits : StringTypeOf; + return asCapitalized!(StringTypeOf!Range)(str); +} + +@safe unittest +{ + assert(testAliasedString!asCapitalized("hEllo")); +} + +@safe pure nothrow @nogc unittest +{ + auto r = "hEllo".asCapitalized(); + assert(r.front == 'H'); +} + +@safe unittest +{ + import std.array : array; + + auto a = "hELLo".asCapitalized; + auto savea = a.save; + auto s = a.array; + assert(s == "Hello"); + s = savea.array; + assert(s == "Hello"); + + string[2][] cases = + [ + ["", ""], + ["h", "H"], + ["H", "H"], + ["3", "3"], + ["123", "123"], + ["h123A", "H123a"], + ["феж", "Феж"], + ["\u1Fe2", "\u03a5\u0308\u0300"], + ]; + + foreach (i; 0 .. cases.length) + { + import std.utf : byChar; + + auto r = cases[i][0].asCapitalized.byChar.array; + auto result = cases[i][1]; + assert(r == result); + } + + // Don't call r.front + for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront()) + { + } + + import std.algorithm.comparison : equal; + + "HELLo"w.asCapitalized.equal("Hello"d); + "hElLO"w.asCapitalized.equal("Hello"d); + "hello"d.asCapitalized.equal("Hello"d); + "HELLO"d.asCapitalized.equal("Hello"d); + + import std.utf : byChar; + assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array); +} + +// TODO: helper, I wish std.utf was more flexible (and stright) +private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc +{ + if (c <= 0x7F) + { + buf[idx] = cast(char) c; + idx++; + } + else if (c <= 0x7FF) + { + buf[idx] = cast(char)(0xC0 | (c >> 6)); + buf[idx+1] = cast(char)(0x80 | (c & 0x3F)); + idx += 2; + } + else if (c <= 0xFFFF) + { + buf[idx] = cast(char)(0xE0 | (c >> 12)); + buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); + buf[idx+2] = cast(char)(0x80 | (c & 0x3F)); + idx += 3; + } + else if (c <= 0x10FFFF) + { + buf[idx] = cast(char)(0xF0 | (c >> 18)); + buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); + buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); + buf[idx+3] = cast(char)(0x80 | (c & 0x3F)); + idx += 4; + } + else + assert(0); + return idx; +} + +@safe unittest +{ + char[] s = "abcd".dup; + size_t i = 0; + i = encodeTo(s, i, 'X'); + assert(s == "Xbcd"); + + i = encodeTo(s, i, cast(dchar)'\u00A9'); + assert(s == "X\xC2\xA9d"); +} + +// TODO: helper, I wish std.utf was more flexible (and stright) +private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure +{ + import std.utf : UTFException; + if (c <= 0xFFFF) + { + if (0xD800 <= c && c <= 0xDFFF) + throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c); + buf[idx] = cast(wchar) c; + idx++; + } + else if (c <= 0x10FFFF) + { + buf[idx] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); + buf[idx+1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00); + idx += 2; + } + else + assert(0); + return idx; +} + +private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc +{ + buf[idx] = c; + idx++; + return idx; +} + +private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure +if (is(C == char) || is(C == wchar) || is(C == dchar)) +{ + import std.utf : decode, codeLength; + size_t curIdx = 0; + size_t destIdx = 0; + alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn); + size_t lastUnchanged = 0; + // in-buffer move of bytes to a new start index + // the trick is that it may not need to copy at all + static size_t moveTo(C[] str, size_t dest, size_t from, size_t to) + { + // Interestingly we may just bump pointer for a while + // then have to copy if a re-cased char was smaller the original + // later we may regain pace with char that got bigger + // In the end it sometimes flip-flops between the 2 cases below + if (dest == from) + return to; + // got to copy + foreach (C c; str[from .. to]) + str[dest++] = c; + return dest; + } + while (curIdx != s.length) + { + size_t startIdx = curIdx; + immutable ch = decode(s, curIdx); + // TODO: special case for ASCII + immutable caseIndex = indexFn(ch); + if (caseIndex == ushort.max) // unchanged, skip over + { + continue; + } + else if (caseIndex < maxIdx) // 1:1 codepoint mapping + { + // previous cased chars had the same length as uncased ones + // thus can just adjust pointer + destIdx = moveTo(s, destIdx, lastUnchanged, startIdx); + lastUnchanged = curIdx; + immutable cased = tableFn(caseIndex); + immutable casedLen = codeLength!C(cased); + if (casedLen + destIdx > curIdx) // no place to fit cased char + { + // switch to slow codepath, where we allocate + return slowToCase(s, startIdx, destIdx); + } + else + { + destIdx = encodeTo(s, destIdx, cased); + } + } + else // 1:m codepoint mapping, slow codepath + { + destIdx = moveTo(s, destIdx, lastUnchanged, startIdx); + lastUnchanged = curIdx; + return slowToCase(s, startIdx, destIdx); + } + assert(destIdx <= curIdx); + } + if (lastUnchanged != s.length) + { + destIdx = moveTo(s, destIdx, lastUnchanged, s.length); + } + s = s[0 .. destIdx]; +} + +// helper to precalculate size of case-converted string +private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn) +{ + size_t toCaseLength(C)(in C[] str) + { + import std.utf : decode, codeLength; + size_t codeLen = 0; + size_t lastNonTrivial = 0; + size_t curIdx = 0; + while (curIdx != str.length) + { + immutable startIdx = curIdx; + immutable ch = decode(str, curIdx); + immutable ushort caseIndex = indexFn(ch); + if (caseIndex == ushort.max) + continue; + else if (caseIndex < maxIdx) + { + codeLen += startIdx - lastNonTrivial; + lastNonTrivial = curIdx; + immutable cased = tableFn(caseIndex); + codeLen += codeLength!C(cased); + } + else + { + codeLen += startIdx - lastNonTrivial; + lastNonTrivial = curIdx; + immutable val = tableFn(caseIndex); + immutable len = val >> 24; + immutable dchar cased = val & 0xFF_FFFF; + codeLen += codeLength!C(cased); + foreach (j; caseIndex+1 .. caseIndex+len) + codeLen += codeLength!C(tableFn(j)); + } + } + if (lastNonTrivial != str.length) + codeLen += str.length - lastNonTrivial; + return codeLen; + } +} + +@safe unittest +{ + alias toLowerLength = toCaseLength!(LowerTriple); + assert(toLowerLength("abcd") == 4); + assert(toLowerLength("аБВгд456") == 10+3); +} + +// slower code path that preallocates and then copies +// case-converted stuf to the new string +private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn) +{ + void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx, + size_t destIdx) @trusted pure + if (is(C == char) || is(C == wchar) || is(C == dchar)) + { + import std.utf : decode; + alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn); + auto trueLength = destIdx + caseLength(s[curIdx..$]); + C[] ns = new C[trueLength]; + ns[0 .. destIdx] = s[0 .. destIdx]; + size_t lastUnchanged = curIdx; + while (curIdx != s.length) + { + immutable startIdx = curIdx; // start of current codepoint + immutable ch = decode(s, curIdx); + immutable caseIndex = indexFn(ch); + if (caseIndex == ushort.max) // skip over + { + continue; + } + else if (caseIndex < maxIdx) // 1:1 codepoint mapping + { + immutable cased = tableFn(caseIndex); + auto toCopy = startIdx - lastUnchanged; + ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; + lastUnchanged = curIdx; + destIdx += toCopy; + destIdx = encodeTo(ns, destIdx, cased); + } + else // 1:m codepoint mapping, slow codepath + { + auto toCopy = startIdx - lastUnchanged; + ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; + lastUnchanged = curIdx; + destIdx += toCopy; + auto val = tableFn(caseIndex); + // unpack length + codepoint + immutable uint len = val >> 24; + destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF)); + foreach (j; caseIndex+1 .. caseIndex+len) + destIdx = encodeTo(ns, destIdx, tableFn(j)); + } + } + if (lastUnchanged != s.length) + { + auto toCopy = s.length - lastUnchanged; + ns[destIdx .. destIdx+toCopy] = s[lastUnchanged..$]; + destIdx += toCopy; + } + assert(ns.length == destIdx); + s = ns; + } +} + +/++ + Converts $(D s) to lowercase (by performing Unicode lowercase mapping) in place. + For a few characters string length may increase after the transformation, + in such a case the function reallocates exactly once. + If $(D s) does not have any uppercase characters, then $(D s) is unaltered. ++/ +void toLowerInPlace(C)(ref C[] s) @trusted pure +if (is(C == char) || is(C == wchar) || is(C == dchar)) +{ + toCaseInPlace!(LowerTriple)(s); +} +// overloads for the most common cases to reduce compile time +@safe pure /*TODO nothrow*/ +{ + void toLowerInPlace(ref char[] s) + { toLowerInPlace!char(s); } + void toLowerInPlace(ref wchar[] s) + { toLowerInPlace!wchar(s); } + void toLowerInPlace(ref dchar[] s) + { toLowerInPlace!dchar(s); } +} + +/++ + Converts $(D s) to uppercase (by performing Unicode uppercase mapping) in place. + For a few characters string length may increase after the transformation, + in such a case the function reallocates exactly once. + If $(D s) does not have any lowercase characters, then $(D s) is unaltered. ++/ +void toUpperInPlace(C)(ref C[] s) @trusted pure +if (is(C == char) || is(C == wchar) || is(C == dchar)) +{ + toCaseInPlace!(UpperTriple)(s); +} +// overloads for the most common cases to reduce compile time/code size +@safe pure /*TODO nothrow*/ +{ + void toUpperInPlace(ref char[] s) + { toUpperInPlace!char(s); } + void toUpperInPlace(ref wchar[] s) + { toUpperInPlace!wchar(s); } + void toUpperInPlace(ref dchar[] s) + { toUpperInPlace!dchar(s); } +} + +/++ + If $(D c) is a Unicode uppercase $(CHARACTER), then its lowercase equivalent + is returned. Otherwise $(D c) is returned. + + Warning: certain alphabets like German and Greek have no 1:1 + upper-lower mapping. Use overload of toLower which takes full string instead. ++/ +@safe pure nothrow @nogc +dchar toLower(dchar c) +{ + // optimize ASCII case + if (c < 0xAA) + { + if (c < 'A') + return c; + if (c <= 'Z') + return c + 32; + return c; + } + size_t idx = toLowerSimpleIndex(c); + if (idx != ushort.max) + { + return toLowerTab(idx); + } + return c; +} + +/++ + Returns a string which is identical to $(D s) except that all of its + characters are converted to lowercase (by preforming Unicode lowercase mapping). + If none of $(D s) characters were affected, then $(D s) itself is returned. ++/ +S toLower(S)(S s) @trusted pure +if (isSomeString!S) +{ + static import std.ascii; + return toCase!(LowerTriple, std.ascii.toLower)(s); +} +// overloads for the most common cases to reduce compile time +@safe pure /*TODO nothrow*/ +{ + string toLower(string s) + { return toLower!string(s); } + wstring toLower(wstring s) + { return toLower!wstring(s); } + dstring toLower(dstring s) + { return toLower!dstring(s); } + + @safe unittest + { + // https://issues.dlang.org/show_bug.cgi?id=16663 + + static struct String + { + string data; + alias data this; + } + + void foo() + { + auto u = toLower(String("")); + } + } +} + + +@system unittest //@@@BUG std.format is not @safe +{ + static import std.ascii; + import std.format : format; + foreach (ch; 0 .. 0x80) + assert(std.ascii.toLower(ch) == toLower(ch)); + assert(toLower('Я') == 'я'); + assert(toLower('Δ') == 'δ'); + foreach (ch; unicode.upperCase.byCodepoint) + { + dchar low = ch.toLower(); + assert(low == ch || isLower(low), format("%s -> %s", ch, low)); + } + assert(toLower("АЯ") == "ая"); + + assert("\u1E9E".toLower == "\u00df"); + assert("\u00df".toUpper == "SS"); +} + +//bugzilla 9629 +@safe unittest +{ + wchar[] test = "hello þ world"w.dup; + auto piece = test[6 .. 7]; + toUpperInPlace(piece); + assert(test == "hello Þ world"); +} + + +@safe unittest +{ + import std.algorithm.comparison : cmp; + string s1 = "FoL"; + string s2 = toLower(s1); + assert(cmp(s2, "fol") == 0, s2); + assert(s2 != s1); + + char[] s3 = s1.dup; + toLowerInPlace(s3); + assert(s3 == s2); + + s1 = "A\u0100B\u0101d"; + s2 = toLower(s1); + s3 = s1.dup; + assert(cmp(s2, "a\u0101b\u0101d") == 0); + assert(s2 !is s1); + toLowerInPlace(s3); + assert(s3 == s2); + + s1 = "A\u0460B\u0461d"; + s2 = toLower(s1); + s3 = s1.dup; + assert(cmp(s2, "a\u0461b\u0461d") == 0); + assert(s2 !is s1); + toLowerInPlace(s3); + assert(s3 == s2); + + s1 = "\u0130"; + s2 = toLower(s1); + s3 = s1.dup; + assert(s2 == "i\u0307"); + assert(s2 !is s1); + toLowerInPlace(s3); + assert(s3 == s2); + + // Test on wchar and dchar strings. + assert(toLower("Some String"w) == "some string"w); + assert(toLower("Some String"d) == "some string"d); + + // bugzilla 12455 + dchar c = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE + assert(isUpper(c)); + assert(toLower(c) == 'i'); + // extend on 12455 reprot - check simple-case toUpper too + c = '\u1f87'; + assert(isLower(c)); + assert(toUpper(c) == '\u1F8F'); +} + + +/++ + If $(D c) is a Unicode lowercase $(CHARACTER), then its uppercase equivalent + is returned. Otherwise $(D c) is returned. + + Warning: + Certain alphabets like German and Greek have no 1:1 + upper-lower mapping. Use overload of toUpper which takes full string instead. + + toUpper can be used as an argument to $(REF map, std,algorithm,iteration) + to produce an algorithm that can convert a range of characters to upper case + without allocating memory. + A string can then be produced by using $(REF copy, std,algorithm,mutation) + to send it to an $(REF appender, std,array). ++/ +@safe pure nothrow @nogc +dchar toUpper(dchar c) +{ + // optimize ASCII case + if (c < 0xAA) + { + if (c < 'a') + return c; + if (c <= 'z') + return c - 32; + return c; + } + size_t idx = toUpperSimpleIndex(c); + if (idx != ushort.max) + { + return toUpperTab(idx); + } + return c; +} + +/// +@system unittest +{ + import std.algorithm.iteration : map; + import std.algorithm.mutation : copy; + import std.array : appender; + + auto abuf = appender!(char[])(); + "hello".map!toUpper.copy(&abuf); + assert(abuf.data == "HELLO"); +} + +@safe unittest +{ + static import std.ascii; + import std.format : format; + foreach (ch; 0 .. 0x80) + assert(std.ascii.toUpper(ch) == toUpper(ch)); + assert(toUpper('я') == 'Я'); + assert(toUpper('δ') == 'Δ'); + auto title = unicode.Titlecase_Letter; + foreach (ch; unicode.lowerCase.byCodepoint) + { + dchar up = ch.toUpper(); + assert(up == ch || isUpper(up) || title[up], + format("%x -> %x", ch, up)); + } +} + +/++ + Returns a string which is identical to $(D s) except that all of its + characters are converted to uppercase (by preforming Unicode uppercase mapping). + If none of $(D s) characters were affected, then $(D s) itself is returned. ++/ +S toUpper(S)(S s) @trusted pure +if (isSomeString!S) +{ + static import std.ascii; + return toCase!(UpperTriple, std.ascii.toUpper)(s); +} +// overloads for the most common cases to reduce compile time +@safe pure /*TODO nothrow*/ +{ + string toUpper(string s) + { return toUpper!string(s); } + wstring toUpper(wstring s) + { return toUpper!wstring(s); } + dstring toUpper(dstring s) + { return toUpper!dstring(s); } + + @safe unittest + { + // https://issues.dlang.org/show_bug.cgi?id=16663 + + static struct String + { + string data; + alias data this; + } + + void foo() + { + auto u = toUpper(String("")); + } + } +} + +@safe unittest +{ + import std.algorithm.comparison : cmp; + + string s1 = "FoL"; + string s2; + char[] s3; + + s2 = toUpper(s1); + s3 = s1.dup; toUpperInPlace(s3); + assert(s3 == s2, s3); + assert(cmp(s2, "FOL") == 0); + assert(s2 !is s1); + + s1 = "a\u0100B\u0101d"; + s2 = toUpper(s1); + s3 = s1.dup; toUpperInPlace(s3); + assert(s3 == s2); + assert(cmp(s2, "A\u0100B\u0100D") == 0); + assert(s2 !is s1); + + s1 = "a\u0460B\u0461d"; + s2 = toUpper(s1); + s3 = s1.dup; toUpperInPlace(s3); + assert(s3 == s2); + assert(cmp(s2, "A\u0460B\u0460D") == 0); + assert(s2 !is s1); +} + +@system unittest +{ + static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow) + { + import std.format : format; + string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)"; + auto low = s.toLower() , up = s.toUpper(); + auto lowInp = s.dup, upInp = s.dup; + lowInp.toLowerInPlace(); + upInp.toUpperInPlace(); + assert(low == trueLow, format(diff, low, trueLow)); + assert(up == trueUp, format(diff, up, trueUp)); + assert(lowInp == trueLow, + format(diff, cast(ubyte[]) s, cast(ubyte[]) lowInp, cast(ubyte[]) trueLow)); + assert(upInp == trueUp, + format(diff, cast(ubyte[]) s, cast(ubyte[]) upInp, cast(ubyte[]) trueUp)); + } + foreach (S; AliasSeq!(dstring, wstring, string)) + { + + S easy = "123"; + S good = "abCФеж"; + S awful = "\u0131\u023f\u2126"; + S wicked = "\u0130\u1FE2"; + auto options = [easy, good, awful, wicked]; + S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"]; + S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"]; + + foreach (val; AliasSeq!(easy, good)) + { + auto e = val.dup; + auto g = e; + e.toUpperInPlace(); + assert(e is g); + e.toLowerInPlace(); + assert(e is g); + } + foreach (i, v; options) + { + doTest(v, upper[i], lower[i]); + } + + // a few combinatorial runs + foreach (i; 0 .. options.length) + foreach (j; i .. options.length) + foreach (k; j .. options.length) + { + auto sample = options[i] ~ options[j] ~ options[k]; + auto sample2 = options[k] ~ options[j] ~ options[i]; + doTest(sample, upper[i] ~ upper[j] ~ upper[k], + lower[i] ~ lower[j] ~ lower[k]); + doTest(sample2, upper[k] ~ upper[j] ~ upper[i], + lower[k] ~ lower[j] ~ lower[i]); + } + } +} + + +/++ + Returns whether $(D c) is a Unicode alphabetic $(CHARACTER) + (general Unicode category: Alphabetic). ++/ +@safe pure nothrow @nogc +bool isAlpha(dchar c) +{ + // optimization + if (c < 0xAA) + { + size_t x = c - 'A'; + if (x <= 'Z' - 'A') + return true; + else + { + x = c - 'a'; + if (x <= 'z'-'a') + return true; + } + return false; + } + + return alphaTrie[c]; +} + +@safe unittest +{ + auto alpha = unicode("Alphabetic"); + foreach (ch; alpha.byCodepoint) + assert(isAlpha(ch)); + foreach (ch; 0 .. 0x4000) + assert((ch in alpha) == isAlpha(ch)); +} + + +/++ + Returns whether $(D c) is a Unicode mark + (general Unicode category: Mn, Me, Mc). ++/ +@safe pure nothrow @nogc +bool isMark(dchar c) +{ + return markTrie[c]; +} + +@safe unittest +{ + auto mark = unicode("Mark"); + foreach (ch; mark.byCodepoint) + assert(isMark(ch)); + foreach (ch; 0 .. 0x4000) + assert((ch in mark) == isMark(ch)); +} + +/++ + Returns whether $(D c) is a Unicode numerical $(CHARACTER) + (general Unicode category: Nd, Nl, No). ++/ +@safe pure nothrow @nogc +bool isNumber(dchar c) +{ + // optimization for ascii case + if (c <= 0x7F) + { + return c >= '0' && c <= '9'; + } + else + { + return numberTrie[c]; + } +} + +@safe unittest +{ + auto n = unicode("N"); + foreach (ch; n.byCodepoint) + assert(isNumber(ch)); + foreach (ch; 0 .. 0x4000) + assert((ch in n) == isNumber(ch)); +} + +/++ + Returns whether $(D c) is a Unicode alphabetic $(CHARACTER) or number. + (general Unicode category: Alphabetic, Nd, Nl, No). + + Params: + c = any Unicode character + Returns: + `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode + categories ++/ +@safe pure nothrow @nogc +bool isAlphaNum(dchar c) +{ + static import std.ascii; + + // optimization for ascii case + if (std.ascii.isASCII(c)) + { + return std.ascii.isAlphaNum(c); + } + else + { + return isAlpha(c) || isNumber(c); + } +} + +@safe unittest +{ + auto n = unicode("N"); + auto alpha = unicode("Alphabetic"); + + foreach (ch; n.byCodepoint) + assert(isAlphaNum(ch)); + + foreach (ch; alpha.byCodepoint) + assert(isAlphaNum(ch)); + + foreach (ch; 0 .. 0x4000) + { + assert(((ch in n) || (ch in alpha)) == isAlphaNum(ch)); + } +} + +/++ + Returns whether $(D c) is a Unicode punctuation $(CHARACTER) + (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf). ++/ +@safe pure nothrow @nogc +bool isPunctuation(dchar c) +{ + static import std.ascii; + + // optimization for ascii case + if (c <= 0x7F) + { + return std.ascii.isPunctuation(c); + } + else + { + return punctuationTrie[c]; + } +} + +@safe unittest +{ + assert(isPunctuation('\u0021')); + assert(isPunctuation('\u0028')); + assert(isPunctuation('\u0029')); + assert(isPunctuation('\u002D')); + assert(isPunctuation('\u005F')); + assert(isPunctuation('\u00AB')); + assert(isPunctuation('\u00BB')); + foreach (ch; unicode("P").byCodepoint) + assert(isPunctuation(ch)); +} + +/++ + Returns whether $(D c) is a Unicode symbol $(CHARACTER) + (general Unicode category: Sm, Sc, Sk, So). ++/ +@safe pure nothrow @nogc +bool isSymbol(dchar c) +{ + return symbolTrie[c]; +} + +@safe unittest +{ + import std.format : format; + assert(isSymbol('\u0024')); + assert(isSymbol('\u002B')); + assert(isSymbol('\u005E')); + assert(isSymbol('\u00A6')); + foreach (ch; unicode("S").byCodepoint) + assert(isSymbol(ch), format("%04x", ch)); +} + +/++ + Returns whether $(D c) is a Unicode space $(CHARACTER) + (general Unicode category: Zs) + Note: This doesn't include '\n', '\r', \t' and other non-space $(CHARACTER). + For commonly used less strict semantics see $(LREF isWhite). ++/ +@safe pure nothrow @nogc +bool isSpace(dchar c) +{ + import std.internal.unicode_tables : isSpaceGen; // generated file + return isSpaceGen(c); +} + +@safe unittest +{ + assert(isSpace('\u0020')); + auto space = unicode.Zs; + foreach (ch; space.byCodepoint) + assert(isSpace(ch)); + foreach (ch; 0 .. 0x1000) + assert(isSpace(ch) == space[ch]); +} + + +/++ + Returns whether $(D c) is a Unicode graphical $(CHARACTER) + (general Unicode category: L, M, N, P, S, Zs). + ++/ +@safe pure nothrow @nogc +bool isGraphical(dchar c) +{ + return graphicalTrie[c]; +} + + +@safe unittest +{ + auto set = unicode("Graphical"); + import std.format : format; + foreach (ch; set.byCodepoint) + assert(isGraphical(ch), format("%4x", ch)); + foreach (ch; 0 .. 0x4000) + assert((ch in set) == isGraphical(ch)); +} + + +/++ + Returns whether $(D c) is a Unicode control $(CHARACTER) + (general Unicode category: Cc). ++/ +@safe pure nothrow @nogc +bool isControl(dchar c) +{ + import std.internal.unicode_tables : isControlGen; // generated file + return isControlGen(c); +} + +@safe unittest +{ + assert(isControl('\u0000')); + assert(isControl('\u0081')); + assert(!isControl('\u0100')); + auto cc = unicode.Cc; + foreach (ch; cc.byCodepoint) + assert(isControl(ch)); + foreach (ch; 0 .. 0x1000) + assert(isControl(ch) == cc[ch]); +} + + +/++ + Returns whether $(D c) is a Unicode formatting $(CHARACTER) + (general Unicode category: Cf). ++/ +@safe pure nothrow @nogc +bool isFormat(dchar c) +{ + import std.internal.unicode_tables : isFormatGen; // generated file + return isFormatGen(c); +} + + +@safe unittest +{ + assert(isFormat('\u00AD')); + foreach (ch; unicode("Format").byCodepoint) + assert(isFormat(ch)); +} + +// code points for private use, surrogates are not likely to change in near feature +// if need be they can be generated from unicode data as well + +/++ + Returns whether $(D c) is a Unicode Private Use $(CODEPOINT) + (general Unicode category: Co). ++/ +@safe pure nothrow @nogc +bool isPrivateUse(dchar c) +{ + return (0x00_E000 <= c && c <= 0x00_F8FF) + || (0x0F_0000 <= c && c <= 0x0F_FFFD) + || (0x10_0000 <= c && c <= 0x10_FFFD); +} + +/++ + Returns whether $(D c) is a Unicode surrogate $(CODEPOINT) + (general Unicode category: Cs). ++/ +@safe pure nothrow @nogc +bool isSurrogate(dchar c) +{ + return (0xD800 <= c && c <= 0xDFFF); +} + +/++ + Returns whether $(D c) is a Unicode high surrogate (lead surrogate). ++/ +@safe pure nothrow @nogc +bool isSurrogateHi(dchar c) +{ + return (0xD800 <= c && c <= 0xDBFF); +} + +/++ + Returns whether $(D c) is a Unicode low surrogate (trail surrogate). ++/ +@safe pure nothrow @nogc +bool isSurrogateLo(dchar c) +{ + return (0xDC00 <= c && c <= 0xDFFF); +} + +/++ + Returns whether $(D c) is a Unicode non-character i.e. + a $(CODEPOINT) with no assigned abstract character. + (general Unicode category: Cn) ++/ +@safe pure nothrow @nogc +bool isNonCharacter(dchar c) +{ + return nonCharacterTrie[c]; +} + +@safe unittest +{ + auto set = unicode("Cn"); + foreach (ch; set.byCodepoint) + assert(isNonCharacter(ch)); +} + +private: +// load static data from pre-generated tables into usable datastructures + + +@safe auto asSet(const (ubyte)[] compressed) pure +{ + return CodepointSet.fromIntervals(decompressIntervals(compressed)); +} + +@safe pure nothrow auto asTrie(T...)(in TrieEntry!T e) +{ + return const(CodepointTrie!T)(e.offsets, e.sizes, e.data); +} + +@safe pure nothrow @nogc @property +{ + import std.internal.unicode_tables; // generated file + + // It's important to use auto return here, so that the compiler + // only runs semantic on the return type if the function gets + // used. Also these are functions rather than templates to not + // increase the object size of the caller. + auto lowerCaseTrie() { static immutable res = asTrie(lowerCaseTrieEntries); return res; } + auto upperCaseTrie() { static immutable res = asTrie(upperCaseTrieEntries); return res; } + auto simpleCaseTrie() { static immutable res = asTrie(simpleCaseTrieEntries); return res; } + auto fullCaseTrie() { static immutable res = asTrie(fullCaseTrieEntries); return res; } + auto alphaTrie() { static immutable res = asTrie(alphaTrieEntries); return res; } + auto markTrie() { static immutable res = asTrie(markTrieEntries); return res; } + auto numberTrie() { static immutable res = asTrie(numberTrieEntries); return res; } + auto punctuationTrie() { static immutable res = asTrie(punctuationTrieEntries); return res; } + auto symbolTrie() { static immutable res = asTrie(symbolTrieEntries); return res; } + auto graphicalTrie() { static immutable res = asTrie(graphicalTrieEntries); return res; } + auto nonCharacterTrie() { static immutable res = asTrie(nonCharacterTrieEntries); return res; } + + //normalization quick-check tables + auto nfcQCTrie() + { + import std.internal.unicode_norm : nfcQCTrieEntries; + static immutable res = asTrie(nfcQCTrieEntries); + return res; + } + + auto nfdQCTrie() + { + import std.internal.unicode_norm : nfdQCTrieEntries; + static immutable res = asTrie(nfdQCTrieEntries); + return res; + } + + auto nfkcQCTrie() + { + import std.internal.unicode_norm : nfkcQCTrieEntries; + static immutable res = asTrie(nfkcQCTrieEntries); + return res; + } + + auto nfkdQCTrie() + { + import std.internal.unicode_norm : nfkdQCTrieEntries; + static immutable res = asTrie(nfkdQCTrieEntries); + return res; + } + + //grapheme breaking algorithm tables + auto mcTrie() + { + import std.internal.unicode_grapheme : mcTrieEntries; + static immutable res = asTrie(mcTrieEntries); + return res; + } + + auto graphemeExtendTrie() + { + import std.internal.unicode_grapheme : graphemeExtendTrieEntries; + static immutable res = asTrie(graphemeExtendTrieEntries); + return res; + } + + auto hangLV() + { + import std.internal.unicode_grapheme : hangulLVTrieEntries; + static immutable res = asTrie(hangulLVTrieEntries); + return res; + } + + auto hangLVT() + { + import std.internal.unicode_grapheme : hangulLVTTrieEntries; + static immutable res = asTrie(hangulLVTTrieEntries); + return res; + } + + // tables below are used for composition/decomposition + auto combiningClassTrie() + { + import std.internal.unicode_comp : combiningClassTrieEntries; + static immutable res = asTrie(combiningClassTrieEntries); + return res; + } + + auto compatMappingTrie() + { + import std.internal.unicode_decomp : compatMappingTrieEntries; + static immutable res = asTrie(compatMappingTrieEntries); + return res; + } + + auto canonMappingTrie() + { + import std.internal.unicode_decomp : canonMappingTrieEntries; + static immutable res = asTrie(canonMappingTrieEntries); + return res; + } + + auto compositionJumpTrie() + { + import std.internal.unicode_comp : compositionJumpTrieEntries; + static immutable res = asTrie(compositionJumpTrieEntries); + return res; + } + + //case conversion tables + auto toUpperIndexTrie() { static immutable res = asTrie(toUpperIndexTrieEntries); return res; } + auto toLowerIndexTrie() { static immutable res = asTrie(toLowerIndexTrieEntries); return res; } + auto toTitleIndexTrie() { static immutable res = asTrie(toTitleIndexTrieEntries); return res; } + //simple case conversion tables + auto toUpperSimpleIndexTrie() { static immutable res = asTrie(toUpperSimpleIndexTrieEntries); return res; } + auto toLowerSimpleIndexTrie() { static immutable res = asTrie(toLowerSimpleIndexTrieEntries); return res; } + auto toTitleSimpleIndexTrie() { static immutable res = asTrie(toTitleSimpleIndexTrieEntries); return res; } + +} + +}// version (!std_uni_bootstrap) |