/**
 * Functions related to UTF encoding.
 *
 * Copyright:   Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/root/utf.d, _utf.d)
 * Documentation:  https://dlang.org/phobos/dmd_root_utf.html
 * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/root/utf.d
 */

module dmd.root.utf;

@nogc nothrow pure @safe:

/// Checks whether `c` lies in the Unicode code space, i.e. the range of code
/// points [0x000000,0x10FFFF] minus the UTF-16 surrogate range [0xD800,0xDFFF].
bool utf_isValidDchar(dchar c)
{
    // TODO: Whether non-char code points should be rejected is pending review.
    // 0xFFFE and 0xFFFF are valid for internal use, like Phobos std.utf.isValidDChar
    // See also https://issues.dlang.org/show_bug.cgi?id=1357
    const bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    const bool withinCodeSpace = c <= 0x10FFFF;
    return withinCodeSpace && !isSurrogate;
}

/**
 * Returns the number of UTF-8 code units needed to encode c.
 *
 * Halts via `assert(false)` when c is above 0x10FFFF.
 */
int utf_codeLengthChar(dchar c)
{
    if (c > 0x10FFFF)
        assert(false); // beyond the largest code point handled here

    // Walk the thresholds from the longest encoding down to the shortest.
    return c > 0xFFFF ? 4
         : c > 0x7FF  ? 3
         : c > 0x7F   ? 2
         : 1;
}

/**
 * Returns the number of UTF-16 code units needed to encode c:
 * 1 when c is at most 0xFFFF, otherwise 2.
 */
int utf_codeLengthWchar(dchar c)
{
    if (c > 0xFFFF)
        return 2;
    return 1;
}

/**
 * Returns the number of code units needed to encode c in the given encoding.
 * Params:
 *      sz = size in bytes of one code unit: 1 = utf8, 2 = utf16, 4 = utf32
 *      c = code point to measure
 * Returns:
 *      the code unit count; always 1 when sz is neither 1 nor 2
 *      (sz is then asserted to be 4)
 */
int utf_codeLength(int sz, dchar c)
{
    switch (sz)
    {
    case 1:
        return utf_codeLengthChar(c);
    case 2:
        return utf_codeLengthWchar(c);
    default:
        // Anything else must be UTF-32, where every code point is one unit.
        assert(sz == 4);
        return 1;
    }
}

/**
 * Encodes c as UTF-8 into the buffer at s.
 *
 * Writes 1 to 4 code units starting at s[0]; nothing is written past the
 * encoded sequence (no terminator is appended).
 * Params:
 *      s = destination buffer, must not be null
 *      c = code point to encode, asserted to satisfy utf_isValidDchar
 */
void utf_encodeChar(char* s, dchar c) @system
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    if (c <= 0x7F)
    {
        // Single code unit: 0xxxxxxx
        s[0] = cast(char)c;
        return;
    }

    // Every multi-unit form ends with a continuation unit (10xxxxxx)
    // carrying the lowest six bits.
    const char last = cast(char)(0x80 | (c & 0x3F));
    if (c <= 0x07FF)
    {
        // 110xxxxx 10xxxxxx
        s[0] = cast(char)(0xC0 | (c >> 6));
        s[1] = last;
        return;
    }

    // Continuation unit carrying bits 6..11.
    const char mid = cast(char)(0x80 | ((c >> 6) & 0x3F));
    if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        s[0] = cast(char)(0xE0 | (c >> 12));
        s[1] = mid;
        s[2] = last;
        return;
    }

    if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        s[0] = cast(char)(0xF0 | (c >> 18));
        s[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        s[2] = mid;
        s[3] = last;
        return;
    }

    assert(0);
}

/**
 * Encodes c as UTF-16 into the buffer at s.
 *
 * Writes one code unit when c is at most 0xFFFF, otherwise a surrogate pair
 * (high surrogate in s[0], low surrogate in s[1]).
 * Params:
 *      s = destination buffer, must not be null
 *      c = code point to encode, asserted to satisfy utf_isValidDchar
 */
void utf_encodeWchar(wchar* s, dchar c) @system
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    if (c <= 0xFFFF)
    {
        s[0] = cast(wchar)c;
        return;
    }

    // Offset into the supplementary planes; its upper ten bits go into the
    // high surrogate and its lower ten bits into the low surrogate.
    const offset = c - 0x010000;
    s[0] = cast(wchar)(((offset >> 10) & 0x03FF) + 0xD800);
    s[1] = cast(wchar)((offset & 0x03FF) + 0xDC00);
}

/**
 * Encodes c into the buffer at s using the encoding selected by sz.
 * Params:
 *      sz = size in bytes of one code unit: 1 = utf8, 2 = utf16;
 *           any other value is asserted to be 4 (utf32)
 *      s = destination buffer, reinterpreted as char*, wchar* or dchar*
 *          according to sz
 *      c = code point to encode
 */
void utf_encode(int sz, void* s, dchar c) @system
{
    switch (sz)
    {
    case 1:
        utf_encodeChar(cast(char*)s, c);
        break;
    case 2:
        utf_encodeWchar(cast(wchar*)s, c);
        break;
    default:
        // UTF-32: the code point is stored as-is in a single code unit.
        assert(sz == 4);
        *(cast(dchar*)s) = c;
        break;
    }
}

/********************************************
 * Checks whether a Unicode code point is a bidirectional
 * control character.
 * Params:
 *      c = code point to test
 * Returns:
 *      true for U+061C, U+200E, U+200F, U+202A..U+202E and U+2066..U+2069,
 *      false for everything else
 */
bool isBidiControl(dchar c)
{
    // Source: https://www.unicode.org/versions/Unicode15.0.0, table 23-3.
    if (c == 0x061C)
        return true;
    if (c == 0x200E || c == 0x200F)
        return true;
    if (0x202A <= c && c <= 0x202E)
        return true;
    return 0x2066 <= c && c <= 0x2069;
}

/********************************************
 * Decode a UTF-8 sequence as a single UTF-32 code point.
 *
 * The code unit at s[ridx] is read without an explicit length check in this
 * function, and ridx is advanced by one before any validation happens.
 * Params:
 *      s = UTF-8 sequence
 *      ridx = starting index in s[]. On success it is updated to the index
 *             just past the decoded sequence; on every error return it has
 *             been advanced by exactly one code unit.
 *      rresult = set to character decoded. On every error return it holds the
 *                first code unit of the sequence instead.
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeChar(const(char)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-8 decoding errors
    static immutable string UTF8_DECODE_OK = null; // no error
    static immutable string UTF8_DECODE_OUTSIDE_CODE_SPACE = "Outside Unicode code space";
    static immutable string UTF8_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-8 sequence";
    static immutable string UTF8_DECODE_OVERLONG = "Overlong UTF-8 sequence";
    static immutable string UTF8_DECODE_INVALID_TRAILER = "Invalid trailing code unit";
    static immutable string UTF8_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *      0xxxxxxx
     *      110xxxxx 10xxxxxx
     *      1110xxxx 10xxxxxx 10xxxxxx
     *      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *      111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *      1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */
    // Sequence length indexed by the first code unit. 0xFF marks a code unit
    // that cannot start a sequence.
    static immutable ubyte[256] UTF8_STRIDE =
    [
        // 0x00 .. 0x7F: 0xxxxxxx, single code unit
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,

        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,

        // 0x80 .. 0xBF: 10xxxxxx, trailing code units only
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,

        // 0xC0 .. 0xDF: 110xxxxx
        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,

        // 0xE0 .. 0xEF: 1110xxxx
        3,3,3,3, 3,3,3,3,
        3,3,3,3, 3,3,3,3,

        // 0xF0 .. 0xF7: 11110xxx
        4,4,4,4, 4,4,4,4,
        // 0xF8 .. 0xFB: 5, 0xFC .. 0xFD: 6, 0xFE .. 0xFF: not a first code unit
        5,5,5,5, 6,6,0xFF,0xFF
    ];

    assert(s !is null);
    // i is the index of the first code unit; ridx already points one past it,
    // which is the value callers see on the ASCII path and on every error path.
    size_t i = ridx++;

    const char u = s[i];
    // Pre-stage results for ASCII and error cases
    rresult = u;
    //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len);
    // Get expected sequence length
    const size_t n = UTF8_STRIDE[u];
    switch (n)
    {
    case 1:
        // ASCII
        return UTF8_DECODE_OK;
    case 2:
    case 3:
    case 4:
        // multi-byte UTF-8
        break;
    default:
        // Stride 5 or 6 (5- or 6-byte sequence), or 0xFF (a code unit that
        // cannot start a sequence); all are reported with the same message.
        return UTF8_DECODE_OUTSIDE_CODE_SPACE;
    }
    if (s.length < i + n) // source too short
        return UTF8_DECODE_TRUNCATED_SEQUENCE;
    // Pick off 7 - n low bits from first code unit
    dchar c = u & ((1 << (7 - n)) - 1);
    /* The following combinations are overlong, and illegal:
     *      1100000x (10xxxxxx)
     *      11100000 100xxxxx (10xxxxxx)
     *      11110000 1000xxxx (10xxxxxx 10xxxxxx)
     *      11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
     *      11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
     */
    // i now indexes the second code unit; it is in bounds because of the
    // length check above and n >= 2.
    const char u2 = s[++i];
    // overlong combination
    // NOTE: the 0xF8 and 0xFC clauses cannot match here, because the switch
    // above has already returned for first code units with stride 5 or 6.
    if ((u & 0xFE) == 0xC0 || (u == 0xE0 && (u2 & 0xE0) == 0x80) || (u == 0xF0 && (u2 & 0xF0) == 0x80) || (u == 0xF8 && (u2 & 0xF8) == 0x80) || (u == 0xFC && (u2 & 0xFC) == 0x80))
        return UTF8_DECODE_OVERLONG;
    // Decode remaining bits
    // m is the index one past the last code unit of the sequence. The loop
    // starts at the second code unit, so u2 is validated as a trailer too.
    for (const m = n + i - 1; i != m; ++i)
    {
        const u3 = s[i];
        if ((u3 & 0xC0) != 0x80) // trailing bytes are 10xxxxxx
            return UTF8_DECODE_INVALID_TRAILER;
        // Append the six payload bits of this trailing code unit.
        c = (c << 6) | (u3 & 0x3F);
    }
    // Rejects surrogates and values above 0x10FFFF.
    if (!utf_isValidDchar(c))
        return UTF8_DECODE_INVALID_CODE_POINT;
    // Success: publish the index past the sequence and the decoded code point.
    ridx = i;
    rresult = c;
    return UTF8_DECODE_OK;
}

/********************************************
 * Decode a UTF-16 sequence as a single UTF-32 code point.
 *
 * The code unit at s[ridx] is read without an explicit length check in this
 * function, and ridx is advanced by one before any validation happens.
 * Params:
 *      s = UTF-16 sequence
 *      ridx = starting index in s[]. On success it is advanced past the code
 *             units decoded (one, or two for a surrogate pair); on every error
 *             return it has been advanced by exactly one code unit.
 *      rresult = set to character decoded. On every error return it holds the
 *                first code unit of the sequence instead.
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeWchar(const(wchar)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-16 decoding errors
    static immutable string UTF16_DECODE_OK = null; // no error
    static immutable string UTF16_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-16 sequence";
    static immutable string UTF16_DECODE_INVALID_SURROGATE = "Invalid low surrogate";
    static immutable string UTF16_DECODE_UNPAIRED_SURROGATE = "Unpaired surrogate";
    static immutable string UTF16_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    assert(s !is null);
    size_t i = ridx++;

    // Pre-stage results for single wchar and error cases
    dchar u = rresult = s[i];
    if (u < 0xD800) // Single wchar codepoint
        return UTF16_DECODE_OK;
    if (0xD800 <= u && u <= 0xDBFF) // Surrogate pair
    {
        if (s.length <= i + 1)
            return UTF16_DECODE_TRUNCATED_SEQUENCE;
        wchar u2 = s[i + 1];
        // The second unit must be a low surrogate in [0xDC00,0xDFFF]. The upper
        // bound has to be tested on u2: u is a high surrogate here, so testing
        // it against 0xDFFF could never fail and units in 0xE000..0xFFFF would
        // be combined into a bogus supplementary code point.
        if (u2 < 0xDC00 || 0xDFFF < u2)
            return UTF16_DECODE_INVALID_SURROGATE;
        // 0xD7C0 == 0xD800 - (0x10000 >> 10), so this both removes the high
        // surrogate bias and adds the 0x10000 supplementary-plane offset.
        u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
        ++ridx;
    }
    else if (0xDC00 <= u && u <= 0xDFFF)
        return UTF16_DECODE_UNPAIRED_SURROGATE;
    if (!utf_isValidDchar(u))
        return UTF16_DECODE_INVALID_CODE_POINT;
    rresult = u;
    return UTF16_DECODE_OK;
}