aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorsribee8 <sriya.pratipati@gmail.com>2025-06-17 09:24:01 -0700
committerGitHub <noreply@github.com>2025-06-17 16:24:01 +0000
commite6a41399cb8796e5d18940d49b0151704568321a (patch)
tree0cfcb4b33e526bad28220a2404ce00ba51ba4d5c
parenteb31c422d0dc816bf285a81bf92690d4d16273ed (diff)
downloadllvm-e6a41399cb8796e5d18940d49b0151704568321a.zip
llvm-e6a41399cb8796e5d18940d49b0151704568321a.tar.gz
llvm-e6a41399cb8796e5d18940d49b0151704568321a.tar.bz2
Reland "[libc] utf8 to 32 CharacterConverter" (#144450)
Reverts llvm/llvm-project#144446 Figured out the issue, so creating a new pull request. --------- Co-authored-by: Sriya Pratipati <sriyap@google.com>
-rw-r--r--libc/src/__support/wchar/character_converter.cpp56
-rw-r--r--libc/test/src/__support/CMakeLists.txt5
-rw-r--r--libc/test/src/__support/wchar/CMakeLists.txt10
-rw-r--r--libc/test/src/__support/wchar/utf8_to_32_test.cpp196
4 files changed, 264 insertions, 3 deletions
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index ca70976..3b9046d 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,6 +8,7 @@
#include "hdr/types/char32_t.h"
#include "hdr/types/char8_t.h"
+#include "src/__support/CPP/bit.h"
#include "src/__support/common.h"
#include "src/__support/error_or.h"
#include "src/__support/math_extras.h"
@@ -30,6 +31,50 @@ bool CharacterConverter::isComplete() {
return state->bytes_processed == state->total_bytes;
}
+int CharacterConverter::push(char8_t utf8_byte) {
+ uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
+ // Checking the first byte if first push
+ if (state->bytes_processed == 0) {
+ // UTF-8 char has 1 byte total
+ if (num_ones == 0) {
+ state->total_bytes = 1;
+ }
+ // UTF-8 char has 2 through 4 bytes total
+ else if (num_ones >= 2 && num_ones <= 4) {
+ /* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4,
+ we will make the base mask with 7 ones and right shift it as necessary. */
+ constexpr size_t SIGNIFICANT_BITS = 7;
+ char8_t base_mask =
+ static_cast<char8_t>(mask_trailing_ones<uint8_t, SIGNIFICANT_BITS>());
+ state->total_bytes = num_ones;
+ utf8_byte &= (base_mask >> num_ones);
+ }
+ // Invalid first byte
+ else {
+ // bytes_processed and total_bytes will always be 0 here
+ state->partial = static_cast<char32_t>(0);
+ return -1;
+ }
+ state->partial = static_cast<char32_t>(utf8_byte);
+ state->bytes_processed++;
+ return 0;
+ }
+ // Any subsequent push
+ // Adding 6 more bits so need to left shift
+ constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
+ if (num_ones == 1 && !isComplete()) {
+ char32_t byte =
+ utf8_byte & mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
+ state->partial = state->partial << ENCODED_BITS_PER_UTF8;
+ state->partial |= byte;
+ state->bytes_processed++;
+ return 0;
+ }
+ // Invalid byte -> reset the state
+ clear();
+ return -1;
+}
+
int CharacterConverter::push(char32_t utf32) {
// we can't be partially through a conversion when pushing a utf32 value
if (!isComplete())
@@ -54,6 +99,17 @@ int CharacterConverter::push(char32_t utf32) {
return -1;
}
+ErrorOr<char32_t> CharacterConverter::pop_utf32() {
+ // If pop is called too early, do not reset the state, use error to determine
+ // whether enough bytes have been pushed
+ if (!isComplete() || state->bytes_processed == 0)
+ return Error(-1);
+ char32_t utf32 = state->partial;
+ // reset if successful pop
+ clear();
+ return utf32;
+}
+
ErrorOr<char8_t> CharacterConverter::pop_utf8() {
if (isComplete())
return Error(-1);
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 76218a1..9f626ed 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,9 +275,8 @@ add_subdirectory(fixed_point)
add_subdirectory(HashTable)
add_subdirectory(time)
add_subdirectory(threads)
-
-# Requires access to uchar header which is not on macos
-# Therefore, cannot currently build this on macos in overlay mode
+# Requires access to uchar header which is not on MacOS
+# Cannot currently build this on MacOS in overlay mode
if(NOT(LIBC_TARGET_OS_IS_DARWIN))
add_subdirectory(wchar)
endif()
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 5dff6e9..5176bfd 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -1,6 +1,16 @@
add_custom_target(libc-support-wchar-tests)
add_libc_test(
+ utf8_to_32_test
+ SUITE
+ libc-support-tests
+ SRCS
+ utf8_to_32_test.cpp
+ DEPENDS
+ libc.src.__support.wchar.character_converter
+)
+
+add_libc_test(
utf32_to_8_test
SUITE
libc-support-tests
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
new file mode 100644
index 0000000..9cb059fa
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -0,0 +1,196 @@
+//===-- Unittests for character_converter utf8->utf32 ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ char ch = 'A';
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch));
+ auto wch = char_conv.pop_utf32();
+
+ ASSERT_EQ(err, 0);
+ ASSERT_TRUE(wch.has_value());
+ ASSERT_EQ(static_cast<int>(wch.value()), 65);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[2] = {static_cast<char>(0xC2),
+ static_cast<char>(0x8E)}; // Ž car symbol
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ char_conv.push(static_cast<char8_t>(ch[0]));
+ char_conv.push(static_cast<char8_t>(ch[1]));
+ auto wch = char_conv.pop_utf32();
+
+ ASSERT_TRUE(wch.has_value());
+ ASSERT_EQ(static_cast<int>(wch.value()), 142);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+ static_cast<char>(0x91)}; // ∑ sigma symbol
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ char_conv.push(static_cast<char8_t>(ch[0]));
+ char_conv.push(static_cast<char8_t>(ch[1]));
+ char_conv.push(static_cast<char8_t>(ch[2]));
+ auto wch = char_conv.pop_utf32();
+
+ ASSERT_TRUE(wch.has_value());
+ ASSERT_EQ(static_cast<int>(wch.value()), 8721);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+ static_cast<char>(0xA4),
+ static_cast<char>(0xA1)}; // 🤡 clown emoji
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ char_conv.push(static_cast<char8_t>(ch[0]));
+ char_conv.push(static_cast<char8_t>(ch[1]));
+ char_conv.push(static_cast<char8_t>(ch[2]));
+ char_conv.push(static_cast<char8_t>(ch[3]));
+ auto wch = char_conv.pop_utf32();
+
+ ASSERT_TRUE(wch.has_value());
+ ASSERT_EQ(static_cast<int>(wch.value()), 129313);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch = static_cast<char>(0x80); // invalid starting bit sequence
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch));
+
+ ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[4] = {
+ static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
+ static_cast<char>(0x00)}; // first and third bytes are invalid
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, 0);
+ // Prev byte was single byte so trying to push another should error.
+ err = char_conv.push(static_cast<char8_t>(ch[2]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[3]));
+ ASSERT_EQ(err, 0);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ // Last byte is invalid since it does not have correct starting sequence.
+ // 0xC0 --> 11000000 starting sequence should be 10xxxxxx
+ const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
+ static_cast<char>(0x80), static_cast<char>(0xC0)};
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[2]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[3]));
+ ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+ static_cast<char>(0x80)};
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, 0);
+ // Should produce an error on 3rd byte
+ err = char_conv.push(static_cast<char8_t>(ch[2]));
+ ASSERT_EQ(err, -1);
+
+ // Should produce an error since mbstate was reset
+ auto wch = char_conv.pop_utf32();
+ ASSERT_FALSE(wch.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+ static_cast<char>(0xC7), static_cast<char>(0x8C)};
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, 0);
+ auto wch = char_conv.pop_utf32();
+ ASSERT_TRUE(wch.has_value());
+ ASSERT_EQ(static_cast<int>(wch.value()), 142);
+
+ // Second two byte character
+ err = char_conv.push(static_cast<char8_t>(ch[2]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[3]));
+ ASSERT_EQ(err, 0);
+ wch = char_conv.pop_utf32();
+ ASSERT_TRUE(wch.has_value());
+ ASSERT_EQ(static_cast<int>(wch.value()), 460);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, 0);
+ auto wch = char_conv.pop_utf32();
+ ASSERT_FALSE(
+ wch.has_value()); // Should fail since we have not read enough bytes
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, 0);
+ wch = char_conv.pop_utf32();
+ ASSERT_TRUE(wch.has_value());
+ ASSERT_EQ(static_cast<int>(wch.value()), 142);
+}