diff options
Diffstat (limited to 'libcpp/charset.c')
-rw-r--r-- | libcpp/charset.c | 157 |
1 files changed, 112 insertions, 45 deletions
diff --git a/libcpp/charset.c b/libcpp/charset.c index 3e5578b..b84a974 100644 --- a/libcpp/charset.c +++ b/libcpp/charset.c @@ -1,5 +1,5 @@ /* CPP Library - charsets - Copyright (C) 1998-2020 Free Software Foundation, Inc. + Copyright (C) 1998-2021 Free Software Foundation, Inc. Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges. @@ -630,7 +630,11 @@ static const struct cpp_conversion conversion_tab[] = { cset_converter structure for conversion from FROM to TO. If iconv_open() fails, issue an error and return an identity converter. Silently return an identity converter if FROM and TO - are identical. */ + are identical. + + PFILE is only used for generating diagnostics; setting it to NULL + suppresses diagnostics. */ + static struct cset_converter init_iconv_desc (cpp_reader *pfile, const char *to, const char *from) { @@ -672,25 +676,31 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from) if (ret.cd == (iconv_t) -1) { - if (errno == EINVAL) - cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */ - "conversion from %s to %s not supported by iconv", - from, to); - else - cpp_errno (pfile, CPP_DL_ERROR, "iconv_open"); - + if (pfile) + { + if (errno == EINVAL) + cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */ + "conversion from %s to %s not supported by iconv", + from, to); + else + cpp_errno (pfile, CPP_DL_ERROR, "iconv_open"); + } ret.func = convert_no_conversion; } } else { - cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */ - "no iconv implementation, cannot convert from %s to %s", - from, to); + if (pfile) + { + cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */ + "no iconv implementation, cannot convert from %s to %s", + from, to); + } ret.func = convert_no_conversion; ret.cd = (iconv_t) -1; ret.width = -1; } + return ret; } @@ -884,14 +894,18 @@ enum { C11 = 8, /* Valid in a C11/C++11 identifier, but not as the first character? */ N11 = 16, + /* Valid in a C++23 identifier? */ + CXX23 = 32, + /* Valid in a C++23 identifier, but not as the first character? */ + NXX23 = 64, /* NFC representation is not valid in an identifier? */ - CID = 32, + CID = 128, /* Might be valid NFC form? */ - NFC = 64, + NFC = 256, /* Might be valid NFKC form? */ - NKC = 128, + NKC = 512, /* Certain preceding characters might make it not valid NFC/NKFC form? */ - CTX = 256 + CTX = 1024 }; struct ucnrange { @@ -938,10 +952,12 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c, /* When -pedantic, we require the character to have been listed by the standard for the current language. Otherwise, we accept the union of the acceptable sets for all supported language versions. */ - valid_flags = C99 | CXX | C11; + valid_flags = C99 | CXX | C11 | CXX23; if (CPP_PEDANTIC (pfile)) { - if (CPP_OPTION (pfile, c11_identifiers)) + if (CPP_OPTION (pfile, cxx23_identifiers)) + valid_flags = CXX23; + else if (CPP_OPTION (pfile, c11_identifiers)) valid_flags = C11; else if (CPP_OPTION (pfile, c99)) valid_flags = C99; @@ -950,12 +966,6 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c, } if (! (ucnranges[mn].flags & valid_flags)) return 0; - if (CPP_OPTION (pfile, c11_identifiers)) - invalid_start_flags = N11; - else if (CPP_OPTION (pfile, c99)) - invalid_start_flags = N99; - else - invalid_start_flags = 0; /* Update NST. */ if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class) @@ -998,6 +1008,28 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c, nst->previous = c; nst->prev_class = ucnranges[mn].combine; + if (!CPP_PEDANTIC (pfile)) + { + /* If not -pedantic, accept as character that may + begin an identifier a union of characters allowed + at that position in each of the character sets. */ + if ((ucnranges[mn].flags & (C99 | N99)) == C99 + || (ucnranges[mn].flags & CXX) != 0 + || (ucnranges[mn].flags & (C11 | N11)) == C11 + || (ucnranges[mn].flags & (CXX23 | NXX23)) == CXX23) + return 1; + return 2; + } + + if (CPP_OPTION (pfile, cxx23_identifiers)) + invalid_start_flags = NXX23; + else if (CPP_OPTION (pfile, c11_identifiers)) + invalid_start_flags = N11; + else if (CPP_OPTION (pfile, c99)) + invalid_start_flags = N99; + else + invalid_start_flags = 0; + /* In C99, UCN digits may not begin identifiers. In C11 and C++11, UCN combining characters may not begin identifiers. */ if (ucnranges[mn].flags & invalid_start_flags) @@ -2122,6 +2154,25 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) buf, bufp - buf, HT_ALLOC)); } + +/* Utility to strip a UTF-8 byte order marking from the beginning + of a buffer. Returns the number of bytes to skip, which currently + will be either 0 or 3. */ +int +cpp_check_utf8_bom (const char *data, size_t data_length) +{ + +#if HOST_CHARSET == HOST_CHARSET_ASCII + const unsigned char *udata = (const unsigned char *) data; + if (data_length >= 3 && udata[0] == 0xef && udata[1] == 0xbb + && udata[2] == 0xbf) + return 3; +#endif + + return 0; +} + + /* Convert an input buffer (containing the complete contents of one source file) from INPUT_CHARSET to the source character set. INPUT points to the input buffer, SIZE is its allocated size, and LEN is @@ -2135,7 +2186,11 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) INPUT is expected to have been allocated with xmalloc. This function will either set *BUFFER_START to INPUT, or free it and set *BUFFER_START to a pointer to another xmalloc-allocated block of - memory. */ + memory. + + PFILE is only used to generate diagnostics; setting it to NULL suppresses + diagnostics, and causes a return of NULL if there was any error instead. */ + uchar * _cpp_convert_input (cpp_reader *pfile, const char *input_charset, uchar *input, size_t size, size_t len, @@ -2158,17 +2213,27 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, to.text = XNEWVEC (uchar, to.asize); to.len = 0; - if (!APPLY_CONVERSION (input_cset, input, len, &to)) - cpp_error (pfile, CPP_DL_ERROR, - "failure to convert %s to %s", - CPP_OPTION (pfile, input_charset), SOURCE_CHARSET); - + const bool ok = APPLY_CONVERSION (input_cset, input, len, &to); free (input); - } - /* Clean up the mess. */ - if (input_cset.func == convert_using_iconv) - iconv_close (input_cset.cd); + /* Clean up the mess. */ + if (input_cset.func == convert_using_iconv) + iconv_close (input_cset.cd); + + /* Handle conversion failure. */ + if (!ok) + { + if (!pfile) + { + XDELETEVEC (to.text); + *buffer_start = NULL; + *st_size = 0; + return NULL; + } + cpp_error (pfile, CPP_DL_ERROR, "failure to convert %s to %s", + input_charset, SOURCE_CHARSET); + } + } /* Resize buffer if we allocated substantially too much, or if we haven't enough space for the \n-terminator or following @@ -2192,19 +2257,14 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, buffer = to.text; *st_size = to.len; -#if HOST_CHARSET == HOST_CHARSET_ASCII - /* The HOST_CHARSET test just above ensures that the source charset - is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that - glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a + + /* Ignore a UTF-8 BOM if we see one and the source charset is UTF-8. Note + that glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a BOM -- however, even if it did, we would still need this code due to the 'convert_no_conversion' case. */ - if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb - && to.text[2] == 0xbf) - { - *st_size -= 3; - buffer += 3; - } -#endif + const int bom_len = cpp_check_utf8_bom ((const char *) to.text, to.len); + *st_size -= bom_len; + buffer += bom_len; *buffer_start = to.text; return buffer; @@ -2244,6 +2304,13 @@ _cpp_default_encoding (void) return current_encoding; } +/* Check if the configured input charset requires no conversion, other than + possibly stripping a UTF-8 BOM. */ +bool cpp_input_conversion_is_trivial (const char *input_charset) +{ + return !strcasecmp (input_charset, SOURCE_CHARSET); +} + /* Implementation of class cpp_string_location_reader. */ /* Constructor for cpp_string_location_reader. */ |