/** * Functions related to UTF encoding. * * Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved * Authors: $(LINK2 https://www.digitalmars.com, Walter Bright) * License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/root/utf.d, _utf.d) * Documentation: https://dlang.org/phobos/dmd_root_utf.html * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/root/utf.d */ module dmd.root.utf; @nogc nothrow pure @safe: /// The Unicode code space is the range of code points [0x000000,0x10FFFF] /// except the UTF-16 surrogate pairs in the range [0xD800,0xDFFF] bool utf_isValidDchar(dchar c) { // TODO: Whether non-char code points should be rejected is pending review. // 0xFFFE and 0xFFFF are valid for internal use, like Phobos std.utf.isValidDChar // See also https://issues.dlang.org/show_bug.cgi?id=1357 if (c < 0xD800) // Almost all characters in a typical document. return true; if (c > 0xDFFF && c <= 0x10FFFF) return true; return false; } /** * Returns the code length of c in code units. */ int utf_codeLengthChar(dchar c) { if (c <= 0x7F) return 1; if (c <= 0x7FF) return 2; if (c <= 0xFFFF) return 3; if (c <= 0x10FFFF) return 4; assert(false); } int utf_codeLengthWchar(dchar c) { return c <= 0xFFFF ? 1 : 2; } /** * Returns the code length of c in code units for the encoding. * sz is the encoding: 1 = utf8, 2 = utf16, 4 = utf32. */ int utf_codeLength(int sz, dchar c) { if (sz == 1) return utf_codeLengthChar(c); if (sz == 2) return utf_codeLengthWchar(c); assert(sz == 4); return 1; } void utf_encodeChar(char* s, dchar c) @system { assert(s !is null); assert(utf_isValidDchar(c)); if (c <= 0x7F) { s[0] = cast(char)c; } else if (c <= 0x07FF) { s[0] = cast(char)(0xC0 | (c >> 6)); s[1] = cast(char)(0x80 | (c & 0x3F)); } else if (c <= 0xFFFF) { s[0] = cast(char)(0xE0 | (c >> 12)); s[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); s[2] = cast(char)(0x80 | (c & 0x3F)); } else if (c <= 0x10FFFF) { s[0] = cast(char)(0xF0 | (c >> 18)); s[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); s[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); s[3] = cast(char)(0x80 | (c & 0x3F)); } else assert(0); } void utf_encodeWchar(wchar* s, dchar c) @system { assert(s !is null); assert(utf_isValidDchar(c)); if (c <= 0xFFFF) { s[0] = cast(wchar)c; } else { s[0] = cast(wchar)((((c - 0x010000) >> 10) & 0x03FF) + 0xD800); s[1] = cast(wchar)(((c - 0x010000) & 0x03FF) + 0xDC00); } } void utf_encode(int sz, void* s, dchar c) @system { if (sz == 1) utf_encodeChar(cast(char*)s, c); else if (sz == 2) utf_encodeWchar(cast(wchar*)s, c); else { assert(sz == 4); *(cast(dchar*)s) = c; } } /******************************************** * Checks whether an Unicode code point is a bidirectional * control character. */ bool isBidiControl(dchar c) { // Source: https://www.unicode.org/versions/Unicode15.0.0, table 23-3. switch(c) { case '\u061C': case '\u200E': case '\u200F': case '\u202A': .. case '\u202E': case '\u2066': .. case '\u2069': return true; default: return false; } } /******************************************** * Decode a UTF-8 sequence as a single UTF-32 code point. * Params: * s = UTF-8 sequence * ridx = starting index in s[], updated to reflect number of code units decoded * rresult = set to character decoded * Returns: * null on success, otherwise error message string */ string utf_decodeChar(const(char)[] s, ref size_t ridx, out dchar rresult) { // UTF-8 decoding errors static immutable string UTF8_DECODE_OK = null; // no error static immutable string UTF8_DECODE_OUTSIDE_CODE_SPACE = "Outside Unicode code space"; static immutable string UTF8_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-8 sequence"; static immutable string UTF8_DECODE_OVERLONG = "Overlong UTF-8 sequence"; static immutable string UTF8_DECODE_INVALID_TRAILER = "Invalid trailing code unit"; static immutable string UTF8_DECODE_INVALID_CODE_POINT = "Invalid code point decoded"; /* The following encodings are valid, except for the 5 and 6 byte * combinations: * 0xxxxxxx * 110xxxxx 10xxxxxx * 1110xxxx 10xxxxxx 10xxxxxx * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ static immutable ubyte[256] UTF8_STRIDE = [ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2, 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3, 4,4,4,4, 4,4,4,4, 5,5,5,5, 6,6,0xFF,0xFF ]; assert(s !is null); size_t i = ridx++; const char u = s[i]; // Pre-stage results for ASCII and error cases rresult = u; //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len); // Get expected sequence length const size_t n = UTF8_STRIDE[u]; switch (n) { case 1: // ASCII return UTF8_DECODE_OK; case 2: case 3: case 4: // multi-byte UTF-8 break; default: // 5- or 6-byte sequence return UTF8_DECODE_OUTSIDE_CODE_SPACE; } if (s.length < i + n) // source too short return UTF8_DECODE_TRUNCATED_SEQUENCE; // Pick off 7 - n low bits from first code unit dchar c = u & ((1 << (7 - n)) - 1); /* The following combinations are overlong, and illegal: * 1100000x (10xxxxxx) * 11100000 100xxxxx (10xxxxxx) * 11110000 1000xxxx (10xxxxxx 10xxxxxx) * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) */ const char u2 = s[++i]; // overlong combination if ((u & 0xFE) == 0xC0 || (u == 0xE0 && (u2 & 0xE0) == 0x80) || (u == 0xF0 && (u2 & 0xF0) == 0x80) || (u == 0xF8 && (u2 & 0xF8) == 0x80) || (u == 0xFC && (u2 & 0xFC) == 0x80)) return UTF8_DECODE_OVERLONG; // Decode remaining bits for (const m = n + i - 1; i != m; ++i) { const u3 = s[i]; if ((u3 & 0xC0) != 0x80) // trailing bytes are 10xxxxxx return UTF8_DECODE_INVALID_TRAILER; c = (c << 6) | (u3 & 0x3F); } if (!utf_isValidDchar(c)) return UTF8_DECODE_INVALID_CODE_POINT; ridx = i; rresult = c; return UTF8_DECODE_OK; } /******************************************** * Decode a UTF-16 sequence as a single UTF-32 code point. * Params: * s = UTF-16 sequence * ridx = starting index in s[], updated to reflect number of code units decoded * rresult = set to character decoded * Returns: * null on success, otherwise error message string */ string utf_decodeWchar(const(wchar)[] s, ref size_t ridx, out dchar rresult) { // UTF-16 decoding errors static immutable string UTF16_DECODE_OK = null; // no error static immutable string UTF16_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-16 sequence"; static immutable string UTF16_DECODE_INVALID_SURROGATE = "Invalid low surrogate"; static immutable string UTF16_DECODE_UNPAIRED_SURROGATE = "Unpaired surrogate"; static immutable string UTF16_DECODE_INVALID_CODE_POINT = "Invalid code point decoded"; assert(s !is null); size_t i = ridx++; // Pre-stage results for single wchar and error cases dchar u = rresult = s[i]; if (u < 0xD800) // Single wchar codepoint return UTF16_DECODE_OK; if (0xD800 <= u && u <= 0xDBFF) // Surrogate pair { if (s.length <= i + 1) return UTF16_DECODE_TRUNCATED_SEQUENCE; wchar u2 = s[i + 1]; if (u2 < 0xDC00 || 0xDFFF < u) return UTF16_DECODE_INVALID_SURROGATE; u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00); ++ridx; } else if (0xDC00 <= u && u <= 0xDFFF) return UTF16_DECODE_UNPAIRED_SURROGATE; if (!utf_isValidDchar(u)) return UTF16_DECODE_INVALID_CODE_POINT; rresult = u; return UTF16_DECODE_OK; }