aboutsummaryrefslogtreecommitdiff
path: root/libcpp/charset.c
diff options
context:
space:
mode:
Diffstat (limited to 'libcpp/charset.c')
-rw-r--r--libcpp/charset.c157
1 files changed, 112 insertions, 45 deletions
diff --git a/libcpp/charset.c b/libcpp/charset.c
index 3e5578b..b84a974 100644
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -1,5 +1,5 @@
/* CPP Library - charsets
- Copyright (C) 1998-2020 Free Software Foundation, Inc.
+ Copyright (C) 1998-2021 Free Software Foundation, Inc.
Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
@@ -630,7 +630,11 @@ static const struct cpp_conversion conversion_tab[] = {
cset_converter structure for conversion from FROM to TO. If
iconv_open() fails, issue an error and return an identity
converter. Silently return an identity converter if FROM and TO
- are identical. */
+ are identical.
+
+ PFILE is only used for generating diagnostics; setting it to NULL
+ suppresses diagnostics. */
+
static struct cset_converter
init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
{
@@ -672,25 +676,31 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
if (ret.cd == (iconv_t) -1)
{
- if (errno == EINVAL)
- cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
- "conversion from %s to %s not supported by iconv",
- from, to);
- else
- cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
-
+ if (pfile)
+ {
+ if (errno == EINVAL)
+ cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
+ "conversion from %s to %s not supported by iconv",
+ from, to);
+ else
+ cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
+ }
ret.func = convert_no_conversion;
}
}
else
{
- cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
- "no iconv implementation, cannot convert from %s to %s",
- from, to);
+ if (pfile)
+ {
+ cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
+ "no iconv implementation, cannot convert from %s to %s",
+ from, to);
+ }
ret.func = convert_no_conversion;
ret.cd = (iconv_t) -1;
ret.width = -1;
}
+
return ret;
}
@@ -884,14 +894,18 @@ enum {
C11 = 8,
/* Valid in a C11/C++11 identifier, but not as the first character? */
N11 = 16,
+ /* Valid in a C++23 identifier? */
+ CXX23 = 32,
+ /* Valid in a C++23 identifier, but not as the first character? */
+ NXX23 = 64,
/* NFC representation is not valid in an identifier? */
- CID = 32,
+ CID = 128,
/* Might be valid NFC form? */
- NFC = 64,
+ NFC = 256,
/* Might be valid NFKC form? */
- NKC = 128,
+ NKC = 512,
/* Certain preceding characters might make it not valid NFC/NKFC form? */
- CTX = 256
+ CTX = 1024
};
struct ucnrange {
@@ -938,10 +952,12 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
/* When -pedantic, we require the character to have been listed by
the standard for the current language. Otherwise, we accept the
union of the acceptable sets for all supported language versions. */
- valid_flags = C99 | CXX | C11;
+ valid_flags = C99 | CXX | C11 | CXX23;
if (CPP_PEDANTIC (pfile))
{
- if (CPP_OPTION (pfile, c11_identifiers))
+ if (CPP_OPTION (pfile, cxx23_identifiers))
+ valid_flags = CXX23;
+ else if (CPP_OPTION (pfile, c11_identifiers))
valid_flags = C11;
else if (CPP_OPTION (pfile, c99))
valid_flags = C99;
@@ -950,12 +966,6 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
}
if (! (ucnranges[mn].flags & valid_flags))
return 0;
- if (CPP_OPTION (pfile, c11_identifiers))
- invalid_start_flags = N11;
- else if (CPP_OPTION (pfile, c99))
- invalid_start_flags = N99;
- else
- invalid_start_flags = 0;
/* Update NST. */
if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
@@ -998,6 +1008,28 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
nst->previous = c;
nst->prev_class = ucnranges[mn].combine;
+ if (!CPP_PEDANTIC (pfile))
+ {
+ /* If not -pedantic, a character may begin an identifier
+ if at least one of the supported character sets allows
+ it at that position (the union of the per-standard sets). */
+ if ((ucnranges[mn].flags & (C99 | N99)) == C99
+ || (ucnranges[mn].flags & CXX) != 0
+ || (ucnranges[mn].flags & (C11 | N11)) == C11
+ || (ucnranges[mn].flags & (CXX23 | NXX23)) == CXX23)
+ return 1;
+ return 2;
+ }
+
+ if (CPP_OPTION (pfile, cxx23_identifiers))
+ invalid_start_flags = NXX23;
+ else if (CPP_OPTION (pfile, c11_identifiers))
+ invalid_start_flags = N11;
+ else if (CPP_OPTION (pfile, c99))
+ invalid_start_flags = N99;
+ else
+ invalid_start_flags = 0;
+
/* In C99, UCN digits may not begin identifiers. In C11 and C++11,
UCN combining characters may not begin identifiers. */
if (ucnranges[mn].flags & invalid_start_flags)
@@ -2122,6 +2154,25 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
buf, bufp - buf, HT_ALLOC));
}
+
+/* Utility to detect a UTF-8 byte order mark (BOM) at the beginning
+ of a buffer. DATA points to the buffer and DATA_LENGTH is the number
+ of bytes available in it. The buffer itself is not modified; the
+ caller is expected to skip the returned number of bytes.
+
+ The bytes are examined through an unsigned char pointer so that the
+ comparisons against 0xef, 0xbb and 0xbf do not depend on whether
+ plain char is signed.
+
+ Returns the number of bytes to skip, which currently will be either
+ 0 or 3: 3 when HOST_CHARSET is HOST_CHARSET_ASCII, DATA_LENGTH is at
+ least 3 and the buffer starts with the bytes 0xEF 0xBB 0xBF; 0
+ otherwise. */
+int
+cpp_check_utf8_bom (const char *data, size_t data_length)
+{
+
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+ const unsigned char *udata = (const unsigned char *) data;
+ if (data_length >= 3 && udata[0] == 0xef && udata[1] == 0xbb
+ && udata[2] == 0xbf)
+ return 3;
+#endif
+
+ return 0;
+}
+
+
/* Convert an input buffer (containing the complete contents of one
source file) from INPUT_CHARSET to the source character set. INPUT
points to the input buffer, SIZE is its allocated size, and LEN is
@@ -2135,7 +2186,11 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
INPUT is expected to have been allocated with xmalloc. This
function will either set *BUFFER_START to INPUT, or free it and set
*BUFFER_START to a pointer to another xmalloc-allocated block of
- memory. */
+ memory.
+
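+ PFILE is only used to generate diagnostics; setting it to NULL suppresses
+ diagnostics and, if the conversion fails, causes NULL to be returned
+ instead. In that case INPUT has still been freed, *BUFFER_START is set
+ to NULL and *ST_SIZE is set to 0. */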
+
uchar *
_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
uchar *input, size_t size, size_t len,
@@ -2158,17 +2213,27 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
to.text = XNEWVEC (uchar, to.asize);
to.len = 0;
- if (!APPLY_CONVERSION (input_cset, input, len, &to))
- cpp_error (pfile, CPP_DL_ERROR,
- "failure to convert %s to %s",
- CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
-
+ const bool ok = APPLY_CONVERSION (input_cset, input, len, &to);
free (input);
- }
- /* Clean up the mess. */
- if (input_cset.func == convert_using_iconv)
- iconv_close (input_cset.cd);
+ /* Clean up the mess. */
+ if (input_cset.func == convert_using_iconv)
+ iconv_close (input_cset.cd);
+
+ /* Handle conversion failure. */
+ if (!ok)
+ {
+ if (!pfile)
+ {
+ XDELETEVEC (to.text);
+ *buffer_start = NULL;
+ *st_size = 0;
+ return NULL;
+ }
+ cpp_error (pfile, CPP_DL_ERROR, "failure to convert %s to %s",
+ input_charset, SOURCE_CHARSET);
+ }
+ }
/* Resize buffer if we allocated substantially too much, or if we
haven't enough space for the \n-terminator or following
@@ -2192,19 +2257,14 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
buffer = to.text;
*st_size = to.len;
-#if HOST_CHARSET == HOST_CHARSET_ASCII
- /* The HOST_CHARSET test just above ensures that the source charset
- is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that
- glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
+
+ /* Ignore a UTF-8 BOM if we see one and the source charset is UTF-8. Note
+ that glibc's UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
BOM -- however, even if it did, we would still need this code due
to the 'convert_no_conversion' case. */
- if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
- && to.text[2] == 0xbf)
- {
- *st_size -= 3;
- buffer += 3;
- }
-#endif
+ const int bom_len = cpp_check_utf8_bom ((const char *) to.text, to.len);
+ *st_size -= bom_len;
+ buffer += bom_len;
*buffer_start = to.text;
return buffer;
@@ -2244,6 +2304,13 @@ _cpp_default_encoding (void)
return current_encoding;
}
+/* Check if the input charset INPUT_CHARSET requires no conversion, other
+ than possibly stripping a UTF-8 BOM. Returns true when INPUT_CHARSET
+ names SOURCE_CHARSET, compared case-insensitively with strcasecmp, and
+ false otherwise. INPUT_CHARSET is passed straight to strcasecmp, so it
+ must be a valid NUL-terminated string (not NULL). */
+bool cpp_input_conversion_is_trivial (const char *input_charset)
+{
+ return !strcasecmp (input_charset, SOURCE_CHARSET);
+}
+
/* Implementation of class cpp_string_location_reader. */
/* Constructor for cpp_string_location_reader. */