aboutsummaryrefslogtreecommitdiff
path: root/wcsmbs
diff options
context:
space:
mode:
author Tom Honermann <tom@honermann.net> 2022-06-30 08:52:14 -0400
committer Adhemerval Zanella <adhemerval.zanella@linaro.org> 2022-07-06 09:29:42 -0300
commit 8bcca1db3d7c0dc900a4cad4054c1439baf73684 (patch)
tree c3d2bb8a6e32462178ba347f755b43c5ae51caff /wcsmbs
parent 598f790fb17bcfff7fedde5209933a82d7748328 (diff)
download glibc-8bcca1db3d7c0dc900a4cad4054c1439baf73684.zip
glibc-8bcca1db3d7c0dc900a4cad4054c1439baf73684.tar.gz
glibc-8bcca1db3d7c0dc900a4cad4054c1439baf73684.tar.bz2
stdlib: Implement mbrtoc8, c8rtomb, and the char8_t typedef.
This change provides implementations for the mbrtoc8 and c8rtomb functions adopted for C++20 via WG21 P0482R6 and for C2X via WG14 N2653. It also provides the char8_t typedef from WG14 N2653. The mbrtoc8 and c8rtomb functions are declared in uchar.h in C2X mode or when the _GNU_SOURCE macro or C++20 __cpp_char8_t feature test macro is defined. The char8_t typedef is declared in uchar.h in C2X mode or when the _GNU_SOURCE macro is defined and the C++20 __cpp_char8_t feature test macro is not defined (if __cpp_char8_t is defined, then char8_t is a builtin type).

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Diffstat (limited to 'wcsmbs')
-rw-r--r-- wcsmbs/Makefile 2
-rw-r--r-- wcsmbs/Versions 3
-rw-r--r-- wcsmbs/c8rtomb.c 132
-rw-r--r-- wcsmbs/mbrtoc8.c 126
-rw-r--r-- wcsmbs/uchar.h 21
5 files changed, 283 insertions, 1 deletions
diff --git a/wcsmbs/Makefile b/wcsmbs/Makefile
index df9a85f..bda281a 100644
--- a/wcsmbs/Makefile
+++ b/wcsmbs/Makefile
@@ -42,7 +42,7 @@ routines := wcscat wcschr wcscmp wcscpy wcscspn wcsdup wcslen wcsncat \
wcsmbsload mbsrtowcs_l \
isoc99_wscanf isoc99_vwscanf isoc99_fwscanf isoc99_vfwscanf \
isoc99_swscanf isoc99_vswscanf \
- mbrtoc16 c16rtomb mbrtoc32 c32rtomb
+ mbrtoc8 c8rtomb mbrtoc16 c16rtomb mbrtoc32 c32rtomb
strop-tests := wcscmp wcsncmp wmemcmp wcslen wcschr wcsrchr wcscpy wcsnlen \
wcpcpy wcsncpy wcpncpy wcscat wcsncat wcschrnul wcsspn wcspbrk \
diff --git a/wcsmbs/Versions b/wcsmbs/Versions
index 0b31c1b..ec28acf 100644
--- a/wcsmbs/Versions
+++ b/wcsmbs/Versions
@@ -49,4 +49,7 @@ libc {
wcstof32; wcstof64; wcstof32x;
wcstof32_l; wcstof64_l; wcstof32x_l;
}
+ GLIBC_2.36 {
+ c8rtomb; mbrtoc8;
+ }
}
diff --git a/wcsmbs/c8rtomb.c b/wcsmbs/c8rtomb.c
new file mode 100644
index 0000000..b564770
--- /dev/null
+++ b/wcsmbs/c8rtomb.c
@@ -0,0 +1,132 @@
+/* UTF-8 to multibyte conversion.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <errno.h>
+#include <uchar.h>
+#include <wchar.h>
+
+
+/* This is the private state used if PS is NULL. */
+static mbstate_t state;
+
+size_t
+c8rtomb (char *s, char8_t c8, mbstate_t *ps)
+{
+ /* Accumulate the UTF-8 code unit C8 in *PS and, once a sequence is complete,
+ pass the decoded character to wcrtomb. This depends on the converter
+ invoked by wcrtomb not needing the top most bit of ps->__count or
+ ps->__value between invocations: that bit indicates that trailing code
+ units are expected and ps->__value stores previously seen code units. */
+
+ wchar_t wc;
+
+ if (ps == NULL)
+ ps = &state;
+
+ if (s == NULL)
+ {
+ /* If 's' is a null pointer, behave as if u8'\0' was passed as 'c8'. If
+ this occurs for an incomplete code unit sequence, then an error will
+ be reported below. */
+ c8 = u8""[0];
+ }
+
+ if (! (ps->__count & 0x80000000))
+ {
+ /* Initial state: no lead code unit is pending. */
+ if ((c8 >= 0x80 && c8 <= 0xC1) || c8 >= 0xF5)
+ {
+ /* An invalid lead code unit. */
+ __set_errno (EILSEQ);
+ return -1;
+ }
+ if (c8 >= 0xC2)
+ {
+ /* A valid lead code unit. __wchb[3] counts the code units stored. */
+ ps->__count |= 0x80000000;
+ ps->__value.__wchb[0] = c8;
+ ps->__value.__wchb[3] = 1;
+ return 0;
+ }
+ /* A single byte (ASCII) code unit. */
+ wc = c8;
+ }
+ else
+ {
+ char8_t cu1 = ps->__value.__wchb[0];
+ if (ps->__value.__wchb[3] == 1)
+ {
+ /* A single lead code unit was previously seen. */
+ if ((c8 < 0x80 || c8 > 0xBF)
+ || (cu1 == 0xE0 && c8 < 0xA0)
+ || (cu1 == 0xED && c8 > 0x9F)
+ || (cu1 == 0xF0 && c8 < 0x90)
+ || (cu1 == 0xF4 && c8 > 0x8F))
+ {
+ /* An invalid second code unit, including overlong, surrogate, and out of range forms. */
+ __set_errno (EILSEQ);
+ return -1;
+ }
+ if (cu1 >= 0xE0)
+ {
+ /* A three or four code unit sequence. */
+ ps->__value.__wchb[1] = c8;
+ ++ps->__value.__wchb[3];
+ return 0;
+ }
+ wc = ((cu1 & 0x1F) << 6)
+ + (c8 & 0x3F);
+ }
+ else
+ {
+ char8_t cu2 = ps->__value.__wchb[1];
+ /* A three or four code unit sequence. */
+ if (c8 < 0x80 || c8 > 0xBF)
+ {
+ /* An invalid third or fourth code unit. */
+ __set_errno (EILSEQ);
+ return -1;
+ }
+ if (ps->__value.__wchb[3] == 2 && cu1 >= 0xF0)
+ {
+ /* A four code unit sequence. */
+ ps->__value.__wchb[2] = c8;
+ ++ps->__value.__wchb[3];
+ return 0;
+ }
+ if (cu1 < 0xF0)
+ {
+ wc = ((cu1 & 0x0F) << 12)
+ + ((cu2 & 0x3F) << 6)
+ + (c8 & 0x3F);
+ }
+ else
+ {
+ char8_t cu3 = ps->__value.__wchb[2];
+ wc = ((cu1 & 0x07) << 18)
+ + ((cu2 & 0x3F) << 12)
+ + ((cu3 & 0x3F) << 6)
+ + (c8 & 0x3F);
+ }
+ }
+ ps->__count &= 0x7fffffff;
+ ps->__value.__wch = 0;
+ }
+
+ return wcrtomb (s, wc, ps);
+}
diff --git a/wcsmbs/mbrtoc8.c b/wcsmbs/mbrtoc8.c
new file mode 100644
index 0000000..dd80b52
--- /dev/null
+++ b/wcsmbs/mbrtoc8.c
@@ -0,0 +1,126 @@
+/* Multibyte to UTF-8 conversion.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <assert.h>
+#include <dlfcn.h>
+#include <errno.h>
+#include <gconv.h>
+#include <uchar.h>
+#include <wcsmbsload.h>
+
+#include <sysdep.h>
+
+#ifndef EILSEQ
+# define EILSEQ EINVAL
+#endif
+
+
+/* This is the private state used if PS is NULL. */
+static mbstate_t state;
+
+size_t
+mbrtoc8 (char8_t *pc8, const char *s, size_t n, mbstate_t *ps)
+{
+ /* Convert the next multibyte character with mbrtowc and write its UTF-8
+ encoding to *PC8 one code unit per call. This depends on the converter
+ invoked by mbrtowc not needing the top most bit of ps->__count or
+ ps->__value between invocations: that bit indicates that trailing code
+ units are yet to be written and ps->__value stores those code units. */
+
+ if (ps == NULL)
+ ps = &state;
+
+ /* If state indicates that trailing code units are yet to be written, write
+ the next one regardless of whether 's' is a null pointer and return -3. */
+ if (ps->__count & 0x80000000)
+ {
+ /* ps->__value.__wchb[3] stores the index of the next code unit to
+ write. Code units are stored in reverse order, so index 0 is the last. */
+ size_t i = ps->__value.__wchb[3];
+ if (pc8 != NULL)
+ {
+ *pc8 = ps->__value.__wchb[i];
+ }
+ if (i == 0)
+ {
+ ps->__count &= 0x7fffffff;
+ ps->__value.__wch = 0;
+ }
+ else
+ --ps->__value.__wchb[3];
+ return -3;
+ }
+
+ if (s == NULL)
+ {
+ /* If 's' is a null pointer, behave as if a null pointer was passed for
+ 'pc8', an empty string was passed for 's', and 1 passed for 'n'. */
+ pc8 = NULL;
+ s = "";
+ n = 1;
+ }
+
+ wchar_t wc;
+ size_t result;
+
+ result = mbrtowc (&wc, s, n, ps);
+ if (result <= n)
+ {
+ if (wc <= 0x7F)
+ {
+ if (pc8 != NULL)
+ *pc8 = wc;
+ }
+ else if (wc <= 0x7FF)
+ {
+ if (pc8 != NULL)
+ *pc8 = 0xC0 + ((wc >> 6) & 0x1F);
+ ps->__value.__wchb[0] = 0x80 + (wc & 0x3F);
+ ps->__value.__wchb[3] = 0;
+ ps->__count |= 0x80000000;
+ }
+ else if (wc <= 0xFFFF)
+ {
+ if (pc8 != NULL)
+ *pc8 = 0xE0 + ((wc >> 12) & 0x0F);
+ ps->__value.__wchb[1] = 0x80 + ((wc >> 6) & 0x3F);
+ ps->__value.__wchb[0] = 0x80 + (wc & 0x3F);
+ ps->__value.__wchb[3] = 1;
+ ps->__count |= 0x80000000;
+ }
+ else if (wc <= 0x10FFFF)
+ {
+ if (pc8 != NULL)
+ *pc8 = 0xF0 + ((wc >> 18) & 0x07);
+ ps->__value.__wchb[2] = 0x80 + ((wc >> 12) & 0x3F);
+ ps->__value.__wchb[1] = 0x80 + ((wc >> 6) & 0x3F);
+ ps->__value.__wchb[0] = 0x80 + (wc & 0x3F);
+ ps->__value.__wchb[3] = 2;
+ ps->__count |= 0x80000000;
+ }
+ }
+ if (result == 0 && wc != 0)
+ {
+ /* mbrtowc() never returns -3. When a MB sequence converts to multiple
+ WCs, no input is consumed when writing the subsequent WCs resulting
+ in a result of 0 even if a null character wasn't written. */
+ result = -3;
+ }
+
+ return result;
+}
diff --git a/wcsmbs/uchar.h b/wcsmbs/uchar.h
index 051cdcb..c37e861 100644
--- a/wcsmbs/uchar.h
+++ b/wcsmbs/uchar.h
@@ -31,6 +31,13 @@
#include <bits/types.h>
#include <bits/types/mbstate_t.h>
+/* Declare the C2x char8_t typedef in C2x modes, but only if the C++
+ __cpp_char8_t feature test macro is not defined. */
+#if __GLIBC_USE (ISOC2X) && !defined __cpp_char8_t
+/* Define the 8-bit character type. */
+typedef unsigned char char8_t;
+#endif
+
#ifndef __USE_ISOCXX11
/* Define the 16-bit and 32-bit character types. */
typedef __uint_least16_t char16_t;
@@ -40,6 +47,20 @@ typedef __uint_least32_t char32_t;
__BEGIN_DECLS
+/* Declare the C2x mbrtoc8() and c8rtomb() functions in C2x modes or if
+ the C++ __cpp_char8_t feature test macro is defined. */
+#if __GLIBC_USE (ISOC2X) || defined __cpp_char8_t
+/* Write char8_t representation of multibyte character pointed
+ to by S to PC8. */
+extern size_t mbrtoc8 (char8_t *__restrict __pc8,
+ const char *__restrict __s, size_t __n,
+ mbstate_t *__restrict __p) __THROW;
+
+/* Write multibyte representation of char8_t C8 to S. */
+extern size_t c8rtomb (char *__restrict __s, char8_t __c8,
+ mbstate_t *__restrict __ps) __THROW;
+#endif
+
/* Write char16_t representation of multibyte character pointed
to by S to PC16. */
extern size_t mbrtoc16 (char16_t *__restrict __pc16,