diff options
author | sribee8 <sriya.pratipati@gmail.com> | 2025-06-17 09:24:01 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-06-17 16:24:01 +0000 |
commit | e6a41399cb8796e5d18940d49b0151704568321a (patch) | |
tree | 0cfcb4b33e526bad28220a2404ce00ba51ba4d5c | |
parent | eb31c422d0dc816bf285a81bf92690d4d16273ed (diff) | |
download | llvm-e6a41399cb8796e5d18940d49b0151704568321a.zip llvm-e6a41399cb8796e5d18940d49b0151704568321a.tar.gz llvm-e6a41399cb8796e5d18940d49b0151704568321a.tar.bz2 |
Reland "[libc] utf8 to 32 CharacterConverter" (#144450)
Reverts llvm/llvm-project#144446
Figured out the issue, so creating a new pull request.
---------
Co-authored-by: Sriya Pratipati <sriyap@google.com>
-rw-r--r-- | libc/src/__support/wchar/character_converter.cpp | 56 | ||||
-rw-r--r-- | libc/test/src/__support/CMakeLists.txt | 5 | ||||
-rw-r--r-- | libc/test/src/__support/wchar/CMakeLists.txt | 10 | ||||
-rw-r--r-- | libc/test/src/__support/wchar/utf8_to_32_test.cpp | 196 |
4 files changed, 264 insertions, 3 deletions
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp index ca70976..3b9046d 100644 --- a/libc/src/__support/wchar/character_converter.cpp +++ b/libc/src/__support/wchar/character_converter.cpp @@ -8,6 +8,7 @@ #include "hdr/types/char32_t.h" #include "hdr/types/char8_t.h" +#include "src/__support/CPP/bit.h" #include "src/__support/common.h" #include "src/__support/error_or.h" #include "src/__support/math_extras.h" @@ -30,6 +31,50 @@ bool CharacterConverter::isComplete() { return state->bytes_processed == state->total_bytes; } +int CharacterConverter::push(char8_t utf8_byte) { + uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte)); + // Checking the first byte if first push + if (state->bytes_processed == 0) { + // UTF-8 char has 1 byte total + if (num_ones == 0) { + state->total_bytes = 1; + } + // UTF-8 char has 2 through 4 bytes total + else if (num_ones >= 2 && num_ones <= 4) { + /* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4, + we will make the base mask with 7 ones and right shift it as necessary. */ + constexpr size_t SIGNIFICANT_BITS = 7; + char8_t base_mask = + static_cast<char8_t>(mask_trailing_ones<uint8_t, SIGNIFICANT_BITS>()); + state->total_bytes = num_ones; + utf8_byte &= (base_mask >> num_ones); + } + // Invalid first byte + else { + // bytes_processed and total_bytes will always be 0 here + state->partial = static_cast<char32_t>(0); + return -1; + } + state->partial = static_cast<char32_t>(utf8_byte); + state->bytes_processed++; + return 0; + } + // Any subsequent push + // Adding 6 more bits so need to left shift + constexpr size_t ENCODED_BITS_PER_UTF8 = 6; + if (num_ones == 1 && !isComplete()) { + char32_t byte = + utf8_byte & mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>(); + state->partial = state->partial << ENCODED_BITS_PER_UTF8; + state->partial |= byte; + state->bytes_processed++; + return 0; + } + // Invalid byte -> reset the state + clear(); + return -1; +} + int CharacterConverter::push(char32_t utf32) { // we can't be partially through a conversion when pushing a utf32 value if (!isComplete()) @@ -54,6 +99,17 @@ int CharacterConverter::push(char32_t utf32) { return -1; } +ErrorOr<char32_t> CharacterConverter::pop_utf32() { + // If pop is called too early, do not reset the state, use error to determine + // whether enough bytes have been pushed + if (!isComplete() || state->bytes_processed == 0) + return Error(-1); + char32_t utf32 = state->partial; + // reset if successful pop + clear(); + return utf32; +} + ErrorOr<char8_t> CharacterConverter::pop_utf8() { if (isComplete()) return Error(-1); diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index 76218a1..9f626ed 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -275,9 +275,8 @@ add_subdirectory(fixed_point) add_subdirectory(HashTable) add_subdirectory(time) add_subdirectory(threads) - -# Requires access to uchar header which is not on macos -# Therefore, cannot currently build this on macos in overlay mode +# Requires access to uchar header which is not on MacOS +# Cannot currently build this on MacOS in overlay mode if(NOT(LIBC_TARGET_OS_IS_DARWIN)) add_subdirectory(wchar) endif() diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt index 5dff6e9..5176bfd 100644 --- a/libc/test/src/__support/wchar/CMakeLists.txt +++ b/libc/test/src/__support/wchar/CMakeLists.txt @@ -1,6 +1,16 @@ add_custom_target(libc-support-wchar-tests) add_libc_test( + utf8_to_32_test + SUITE + libc-support-tests + SRCS + utf8_to_32_test.cpp + DEPENDS + libc.src.__support.wchar.character_converter +) + +add_libc_test( utf32_to_8_test SUITE libc-support-tests diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp new file mode 100644 index 0000000..9cb059fa --- /dev/null +++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp @@ -0,0 +1,196 @@ +//===-- Unittests for character_converter utf8->utf32 ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/error_or.h" +#include "src/__support/wchar/character_converter.h" +#include "src/__support/wchar/mbstate.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + char ch = 'A'; + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + int err = char_conv.push(static_cast<char8_t>(ch)); + auto wch = char_conv.pop_utf32(); + + ASSERT_EQ(err, 0); + ASSERT_TRUE(wch.has_value()); + ASSERT_EQ(static_cast<int>(wch.value()), 65); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + const char ch[2] = {static_cast<char>(0xC2), + static_cast<char>(0x8E)}; // car symbol + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + char_conv.push(static_cast<char8_t>(ch[0])); + char_conv.push(static_cast<char8_t>(ch[1])); + auto wch = char_conv.pop_utf32(); + + ASSERT_TRUE(wch.has_value()); + ASSERT_EQ(static_cast<int>(wch.value()), 142); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88), + static_cast<char>(0x91)}; // ∑ sigma symbol + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + char_conv.push(static_cast<char8_t>(ch[0])); + char_conv.push(static_cast<char8_t>(ch[1])); + char_conv.push(static_cast<char8_t>(ch[2])); + auto wch = char_conv.pop_utf32(); + + ASSERT_TRUE(wch.has_value()); + ASSERT_EQ(static_cast<int>(wch.value()), 8721); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F), + static_cast<char>(0xA4), + static_cast<char>(0xA1)}; // 🤡 clown emoji + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + char_conv.push(static_cast<char8_t>(ch[0])); + char_conv.push(static_cast<char8_t>(ch[1])); + char_conv.push(static_cast<char8_t>(ch[2])); + char_conv.push(static_cast<char8_t>(ch[3])); + auto wch = char_conv.pop_utf32(); + + ASSERT_TRUE(wch.has_value()); + ASSERT_EQ(static_cast<int>(wch.value()), 129313); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + const char ch = static_cast<char>(0x80); // invalid starting bit sequence + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + int err = char_conv.push(static_cast<char8_t>(ch)); + + ASSERT_EQ(err, -1); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + const char ch[4] = { + static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80), + static_cast<char>(0x00)}; // first and third bytes are invalid + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + int err = char_conv.push(static_cast<char8_t>(ch[0])); + ASSERT_EQ(err, -1); + err = char_conv.push(static_cast<char8_t>(ch[1])); + ASSERT_EQ(err, 0); + // Prev byte was single byte so trying to push another should error. + err = char_conv.push(static_cast<char8_t>(ch[2])); + ASSERT_EQ(err, -1); + err = char_conv.push(static_cast<char8_t>(ch[3])); + ASSERT_EQ(err, 0); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + // Last byte is invalid since it does not have correct starting sequence. + // 0xC0 --> 11000000 starting sequence should be 10xxxxxx + const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80), + static_cast<char>(0x80), static_cast<char>(0xC0)}; + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + int err = char_conv.push(static_cast<char8_t>(ch[0])); + ASSERT_EQ(err, 0); + err = char_conv.push(static_cast<char8_t>(ch[1])); + ASSERT_EQ(err, 0); + err = char_conv.push(static_cast<char8_t>(ch[2])); + ASSERT_EQ(err, 0); + err = char_conv.push(static_cast<char8_t>(ch[3])); + ASSERT_EQ(err, -1); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E), + static_cast<char>(0x80)}; + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + int err = char_conv.push(static_cast<char8_t>(ch[0])); + ASSERT_EQ(err, 0); + err = char_conv.push(static_cast<char8_t>(ch[1])); + ASSERT_EQ(err, 0); + // Should produce an error on 3rd byte + err = char_conv.push(static_cast<char8_t>(ch[2])); + ASSERT_EQ(err, -1); + + // Should produce an error since mbstate was reset + auto wch = char_conv.pop_utf32(); + ASSERT_FALSE(wch.has_value()); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E), + static_cast<char>(0xC7), static_cast<char>(0x8C)}; + + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + int err = char_conv.push(static_cast<char8_t>(ch[0])); + ASSERT_EQ(err, 0); + err = char_conv.push(static_cast<char8_t>(ch[1])); + ASSERT_EQ(err, 0); + auto wch = char_conv.pop_utf32(); + ASSERT_TRUE(wch.has_value()); + ASSERT_EQ(static_cast<int>(wch.value()), 142); + + // Second two byte character + err = char_conv.push(static_cast<char8_t>(ch[2])); + ASSERT_EQ(err, 0); + err = char_conv.push(static_cast<char8_t>(ch[3])); + ASSERT_EQ(err, 0); + wch = char_conv.pop_utf32(); + ASSERT_TRUE(wch.has_value()); + ASSERT_EQ(static_cast<int>(wch.value()), 460); +} + +TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) { + LIBC_NAMESPACE::internal::mbstate state; + state.bytes_processed = 0; + state.total_bytes = 0; + LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state); + const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)}; + int err = char_conv.push(static_cast<char8_t>(ch[0])); + ASSERT_EQ(err, 0); + auto wch = char_conv.pop_utf32(); + ASSERT_FALSE( + wch.has_value()); // Should fail since we have not read enough bytes + err = char_conv.push(static_cast<char8_t>(ch[1])); + ASSERT_EQ(err, 0); + wch = char_conv.pop_utf32(); + ASSERT_TRUE(wch.has_value()); + ASSERT_EQ(static_cast<int>(wch.value()), 142); +} |