diff options
author | sribee8 <sriya.pratipati@gmail.com> | 2025-07-24 18:28:25 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-07-24 18:28:25 +0000 |
commit | 4f2686e5a131094d985677591482508586f1f5c9 (patch) | |
tree | 3c0a66c3bddc6e2e3173d56b17934fffb35c0bf5 /libc | |
parent | 1a0f482de8bb095a518a78c421776474de9141e4 (diff) | |
download | llvm-4f2686e5a131094d985677591482508586f1f5c9.zip llvm-4f2686e5a131094d985677591482508586f1f5c9.tar.gz llvm-4f2686e5a131094d985677591482508586f1f5c9.tar.bz2 |
[libc] Implemented mblen functions (#150141)
Implemented mblen and mbrlen as well as tests
---------
Co-authored-by: Sriya Pratipati <sriyap@google.com>
Diffstat (limited to 'libc')
-rw-r--r-- | libc/config/linux/x86_64/entrypoints.txt | 2 | ||||
-rw-r--r-- | libc/include/wchar.yaml | 15 | ||||
-rw-r--r-- | libc/src/wchar/CMakeLists.txt | 31 | ||||
-rw-r--r-- | libc/src/wchar/mblen.cpp | 35 | ||||
-rw-r--r-- | libc/src/wchar/mblen.h | 21 | ||||
-rw-r--r-- | libc/src/wchar/mbrlen.cpp | 37 | ||||
-rw-r--r-- | libc/src/wchar/mbrlen.h | 22 | ||||
-rw-r--r-- | libc/test/src/wchar/CMakeLists.txt | 27 | ||||
-rw-r--r-- | libc/test/src/wchar/mblen_test.cpp | 104 | ||||
-rw-r--r-- | libc/test/src/wchar/mbrlen_test.cpp | 139 |
10 files changed, 433 insertions, 0 deletions
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 381359c..3cb1c48 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -1261,6 +1261,8 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.sys.socket.recvmsg # wchar.h entrypoints + libc.src.wchar.mblen + libc.src.wchar.mbrlen libc.src.wchar.mbrtowc libc.src.wchar.mbtowc libc.src.wchar.wcrtomb diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index 123d344..4adf596 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -53,6 +53,21 @@ functions: - type: wchar_t *__restrict - type: const char *__restrict - type: size_t + - name: mblen + standards: + - stdc + return_type: int + arguments: + - type: const char * + - type: size_t + - name: mbrlen + standards: + - stdc + return_type: size_t + arguments: + - type: const char *__restrict + - type: size_t + - type: mbstate_t *__restrict - name: wmemset standards: - stdc diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index 159778d..2b95d94 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -170,6 +170,37 @@ add_entrypoint_object( ) add_entrypoint_object( + mblen + SRCS + mblen.cpp + HDRS + mblen.h + DEPENDS + libc.hdr.types.size_t + libc.src.__support.common + libc.src.__support.macros.config + libc.src.__support.libc_errno + libc.src.__support.wchar.mbrtowc + libc.src.__support.wchar.mbstate +) + +add_entrypoint_object( + mbrlen + SRCS + mbrlen.cpp + HDRS + mbrlen.h + DEPENDS + libc.hdr.types.size_t + libc.hdr.types.mbstate_t + libc.src.__support.common + libc.src.__support.macros.config + libc.src.__support.wchar.mbrtowc + libc.src.__support.libc_errno + libc.src.__support.wchar.mbstate +) + +add_entrypoint_object( wmemset SRCS wmemset.cpp diff --git a/libc/src/wchar/mblen.cpp b/libc/src/wchar/mblen.cpp new file mode 100644 index 0000000..2d15b3e --- /dev/null +++ b/libc/src/wchar/mblen.cpp @@ -0,0 +1,35 @@ +//===-- Implementation of mblen -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/wchar/mblen.h" + +#include "hdr/types/size_t.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include "src/__support/wchar/mbrtowc.h" +#include "src/__support/wchar/mbstate.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, mblen, (const char *s, size_t n)) { + // returns 0 since UTF-8 encoding is not state-dependent + if (s == nullptr) + return 0; + internal::mbstate internal_mbstate; + auto ret = internal::mbrtowc(nullptr, s, n, &internal_mbstate); + if (!ret.has_value() || static_cast<int>(ret.value()) == -2) { + // Encoding failure + if (!ret.has_value()) + libc_errno = EILSEQ; + return -1; + } + return static_cast<int>(ret.value()); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wchar/mblen.h b/libc/src/wchar/mblen.h new file mode 100644 index 0000000..a315a2f --- /dev/null +++ b/libc/src/wchar/mblen.h @@ -0,0 +1,21 @@ +//===-- Implementation header for mblen -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_WCHAR_MBLEN_H +#define LLVM_LIBC_SRC_WCHAR_MBLEN_H + +#include "hdr/types/size_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int mblen(const char *s, size_t n); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_WCHAR_MBLEN_H diff --git a/libc/src/wchar/mbrlen.cpp b/libc/src/wchar/mbrlen.cpp new file mode 100644 index 0000000..8de78e0 --- /dev/null +++ b/libc/src/wchar/mbrlen.cpp @@ -0,0 +1,37 @@ +//===-- Implementation of mbrlen ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/wchar/mbrlen.h" + +#include "hdr/types/mbstate_t.h" +#include "hdr/types/size_t.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include "src/__support/wchar/mbrtowc.h" +#include "src/__support/wchar/mbstate.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(size_t, mbrlen, + (const char *__restrict s, size_t n, + mbstate_t *__restrict ps)) { + static internal::mbstate internal_mbstate; + auto ret = internal::mbrtowc(nullptr, s, n, + ps == nullptr + ? &internal_mbstate + : reinterpret_cast<internal::mbstate *>(ps)); + if (!ret.has_value()) { + // Encoding failure + libc_errno = ret.error(); + return -1; + } + return ret.value(); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wchar/mbrlen.h b/libc/src/wchar/mbrlen.h new file mode 100644 index 0000000..08b59cf --- /dev/null +++ b/libc/src/wchar/mbrlen.h @@ -0,0 +1,22 @@ +//===-- Implementation header for mbrlen ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_WCHAR_MBRLEN_H +#define LLVM_LIBC_SRC_WCHAR_MBRLEN_H + +#include "hdr/types/mbstate_t.h" +#include "hdr/types/size_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_WCHAR_MBRLEN_H diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt index 176cf7c..baa52b7 100644 --- a/libc/test/src/wchar/CMakeLists.txt +++ b/libc/test/src/wchar/CMakeLists.txt @@ -65,6 +65,33 @@ add_libc_test( ) add_libc_test( + mblen_test + SUITE + libc_wchar_unittests + SRCS + mblen_test.cpp + DEPENDS + libc.src.__support.libc_errno + libc.src.wchar.mblen + libc.test.UnitTest.ErrnoCheckingTest +) + +add_libc_test( + mbrlen_test + SUITE + libc_wchar_unittests + SRCS + mbrlen_test.cpp + DEPENDS + libc.src.__support.libc_errno + libc.src.__support.wchar.mbstate + libc.src.string.memset + libc.src.wchar.mbrlen + libc.hdr.types.mbstate_t + libc.test.UnitTest.ErrnoCheckingTest +) + +add_libc_test( wctob_test SUITE libc_wchar_unittests diff --git a/libc/test/src/wchar/mblen_test.cpp b/libc/test/src/wchar/mblen_test.cpp new file mode 100644 index 0000000..efd4df7 --- /dev/null +++ b/libc/test/src/wchar/mblen_test.cpp @@ -0,0 +1,104 @@ +//===-- Unittests for mblen -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/libc_errno.h" +#include "src/wchar/mblen.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcMBLenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcMBLenTest, OneByte) { + const char *ch = "A"; + int n = LIBC_NAMESPACE::mblen(ch, 1); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(n, 1); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mblen(ch, 0); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(n, -1); +} + +TEST_F(LlvmLibcMBLenTest, TwoByte) { + const char ch[2] = {static_cast<char>(0xC2), + static_cast<char>(0x8E)}; // car symbol + int n = LIBC_NAMESPACE::mblen(ch, 4); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(n, 2); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mblen(ch, 1); + ASSERT_EQ(n, -1); + ASSERT_ERRNO_SUCCESS(); + // Should fail after trying to read next byte too + n = LIBC_NAMESPACE::mblen(ch + 1, 1); + ASSERT_EQ(n, -1); + // This one should be an invalid starting byte so should set errno + ASSERT_ERRNO_EQ(EILSEQ); +} + +TEST_F(LlvmLibcMBLenTest, ThreeByte) { + const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88), + static_cast<char>(0x91)}; // ∑ sigma symbol + int n = LIBC_NAMESPACE::mblen(ch, 3); + ASSERT_EQ(n, 3); + ASSERT_ERRNO_SUCCESS(); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mblen(ch, 2); + ASSERT_EQ(n, -1); + ASSERT_ERRNO_SUCCESS(); +} + +TEST_F(LlvmLibcMBLenTest, FourByte) { + const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F), + static_cast<char>(0xA4), + static_cast<char>(0xA1)}; // 🤡 clown emoji + int n = LIBC_NAMESPACE::mblen(ch, 4); + ASSERT_EQ(n, 4); + ASSERT_ERRNO_SUCCESS(); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mblen(ch, 2); + ASSERT_EQ(n, -1); + ASSERT_ERRNO_SUCCESS(); +} + +TEST_F(LlvmLibcMBLenTest, InvalidByte) { + const char ch[1] = {static_cast<char>(0x80)}; + int n = LIBC_NAMESPACE::mblen(ch, 1); + ASSERT_EQ(n, -1); + ASSERT_ERRNO_EQ(EILSEQ); +} + +TEST_F(LlvmLibcMBLenTest, InvalidMultiByte) { + const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00), + static_cast<char>(0x80), + static_cast<char>(0x00)}; // invalid sequence of bytes + // Trying to push all 4 should error + int n = LIBC_NAMESPACE::mblen(ch, 4); + ASSERT_EQ(n, -1); + ASSERT_ERRNO_EQ(EILSEQ); + + // Trying to push the second and third should correspond to null wc + n = LIBC_NAMESPACE::mblen(ch + 1, 2); + ASSERT_EQ(n, 0); + ASSERT_ERRNO_SUCCESS(); +} + +TEST_F(LlvmLibcMBLenTest, NullString) { + // reading on nullptr should return 0 + int n = LIBC_NAMESPACE::mblen(nullptr, 2); + ASSERT_EQ(n, 0); + ASSERT_ERRNO_SUCCESS(); + // reading a null terminator should return 0 + const char *ch = "\0"; + n = LIBC_NAMESPACE::mblen(ch, 1); + ASSERT_EQ(n, 0); +} diff --git a/libc/test/src/wchar/mbrlen_test.cpp b/libc/test/src/wchar/mbrlen_test.cpp new file mode 100644 index 0000000..e1452bf4 --- /dev/null +++ b/libc/test/src/wchar/mbrlen_test.cpp @@ -0,0 +1,139 @@ +//===-- Unittests for mbrlen ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/types/wchar_t.h" +#include "src/__support/libc_errno.h" +#include "src/__support/wchar/mbstate.h" +#include "src/string/memset.h" +#include "src/wchar/mbrlen.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcMBRLenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcMBRLenTest, OneByte) { + const char *ch = "A"; + mbstate_t mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + size_t n = LIBC_NAMESPACE::mbrlen(ch, 1, &mb); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(n, static_cast<size_t>(1)); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mbrlen(ch, 0, &mb); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(n, static_cast<size_t>(-2)); +} + +TEST_F(LlvmLibcMBRLenTest, TwoByte) { + const char ch[2] = {static_cast<char>(0xC2), + static_cast<char>(0x8E)}; // car symbol + mbstate_t mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + size_t n = LIBC_NAMESPACE::mbrlen(ch, 4, nullptr); + ASSERT_ERRNO_SUCCESS(); + ASSERT_EQ(static_cast<int>(n), 2); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mbrlen(ch, 1, &mb); + ASSERT_EQ(static_cast<int>(n), -2); + ASSERT_ERRNO_SUCCESS(); + // Should pass after trying to read next byte + n = LIBC_NAMESPACE::mbrlen(ch + 1, 1, &mb); + ASSERT_EQ(static_cast<int>(n), 1); + ASSERT_ERRNO_SUCCESS(); +} + +TEST_F(LlvmLibcMBRLenTest, ThreeByte) { + const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88), + static_cast<char>(0x91)}; // ∑ sigma symbol + mbstate_t mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + size_t n = LIBC_NAMESPACE::mbrlen(ch, 3, &mb); + ASSERT_EQ(static_cast<int>(n), 3); + ASSERT_ERRNO_SUCCESS(); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mbrlen(ch, 2, &mb); + ASSERT_EQ(static_cast<int>(n), -2); + ASSERT_ERRNO_SUCCESS(); +} + +TEST_F(LlvmLibcMBRLenTest, FourByte) { + const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F), + static_cast<char>(0xA4), + static_cast<char>(0xA1)}; // 🤡 clown emoji + mbstate_t mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + size_t n = LIBC_NAMESPACE::mbrlen(ch, 4, &mb); + ASSERT_EQ(static_cast<int>(n), 4); + ASSERT_ERRNO_SUCCESS(); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mbrlen(ch, 2, &mb); + ASSERT_EQ(static_cast<int>(n), -2); + ASSERT_ERRNO_SUCCESS(); + + // Should fail since we have not read enough + n = LIBC_NAMESPACE::mbrlen(ch + 2, 1, &mb); + ASSERT_EQ(static_cast<int>(n), -2); + ASSERT_ERRNO_SUCCESS(); + + // Should pass after reading final byte + n = LIBC_NAMESPACE::mbrlen(ch + 3, 5, &mb); + ASSERT_EQ(static_cast<int>(n), 1); + ASSERT_ERRNO_SUCCESS(); +} + +TEST_F(LlvmLibcMBRLenTest, InvalidByte) { + const char ch[1] = {static_cast<char>(0x80)}; + size_t n = LIBC_NAMESPACE::mbrlen(ch, 1, nullptr); + ASSERT_EQ(static_cast<int>(n), -1); + ASSERT_ERRNO_EQ(EILSEQ); +} + +TEST_F(LlvmLibcMBRLenTest, InvalidMultiByte) { + const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00), + static_cast<char>(0x80), + static_cast<char>(0x00)}; // invalid sequence of bytes + mbstate_t mb; + LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t)); + // Trying to push all 4 should error + size_t n = LIBC_NAMESPACE::mbrlen(ch, 4, &mb); + ASSERT_EQ(static_cast<int>(n), -1); + ASSERT_ERRNO_EQ(EILSEQ); + + // Trying to push the second and third should correspond to null wc + n = LIBC_NAMESPACE::mbrlen(ch + 1, 2, &mb); + ASSERT_EQ(static_cast<int>(n), 0); + ASSERT_ERRNO_SUCCESS(); +} + +TEST_F(LlvmLibcMBRLenTest, NullString) { + // reading on nullptr should return 0 + size_t n = LIBC_NAMESPACE::mbrlen(nullptr, 2, nullptr); + ASSERT_EQ(static_cast<int>(n), 0); + ASSERT_ERRNO_SUCCESS(); + // reading a null terminator should return 0 + const char *ch = "\0"; + n = LIBC_NAMESPACE::mbrlen(ch, 1, nullptr); + ASSERT_EQ(static_cast<int>(n), 0); +} + +TEST_F(LlvmLibcMBRLenTest, InvalidMBState) { + const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E), + static_cast<char>(0xC7), static_cast<char>(0x8C)}; + mbstate_t *mb; + LIBC_NAMESPACE::internal::mbstate inv; + inv.total_bytes = 6; + mb = reinterpret_cast<mbstate_t *>(&inv); + // invalid mbstate should error + size_t n = LIBC_NAMESPACE::mbrlen(ch, 2, mb); + ASSERT_EQ(static_cast<int>(n), -1); + ASSERT_ERRNO_EQ(EINVAL); +} |