//===-- runtime/utf.h -----------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // UTF-8 is the variant-width standard encoding of Unicode (ISO 10646) // code points. // // 7-bit values in [00 .. 7F] represent themselves as single bytes, so true // 7-bit ASCII is also valid UTF-8. // // Larger values are encoded with a start byte in [C0 .. FE] that carries // the length of the encoding and some of the upper bits of the value, followed // by one or more bytes in the range [80 .. BF]. // // Specifically, the first byte holds two or more uppermost set bits, // a zero bit, and some payload; the second and later bytes each start with // their uppermost bit set, the next bit clear, and six bits of payload. // Payload parcels are in big-endian order. All bytes must be present in a // valid sequence; i.e., low-order sezo bits must be explicit. UTF-8 is // self-synchronizing on input as any byte value cannot be both a valid // first byte or trailing byte. // // 0xxxxxxx - 7 bit ASCII // 110xxxxx 10xxxxxx - 11-bit value // 1110xxxx 10xxxxxx 10xxxxxx - 16-bit value // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21-bit value // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26-bit value // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 31-bit value // 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 36-bit value // // Canonical UTF-8 sequences should be minimal, and our output is so, but // we do not reject non-minimal sequences on input. Unicode only defines // code points up to 0x10FFFF, so 21-bit (4-byte) UTF-8 is the actual // standard maximum. However, we support extended forms up to 32 bits so that // CHARACTER(KIND=4) can be abused to hold arbitrary 32-bit data. #ifndef FORTRAN_RUNTIME_UTF_H_ #define FORTRAN_RUNTIME_UTF_H_ #include #include #include namespace Fortran::runtime { // Derive the length of a UTF-8 character encoding from its first byte. // A zero result signifies an invalid encoding. extern const std::uint8_t UTF8FirstByteTable[256]; static inline std::size_t MeasureUTF8Bytes(char first) { return UTF8FirstByteTable[static_cast(first)]; } static constexpr std::size_t maxUTF8Bytes{7}; // Ensure that all bytes are present in sequence in the input buffer // before calling; use MeasureUTF8Bytes(first byte) to count them. std::optional DecodeUTF8(const char *); // Ensure that at least maxUTF8Bytes remain in the output // buffer before calling. std::size_t EncodeUTF8(char *, char32_t); } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_UTF_H_