diff options
author | Lewis Hyatt <lhyatt@gmail.com> | 2021-08-24 19:30:44 -0400 |
---|---|---|
committer | Lewis Hyatt <lhyatt@gmail.com> | 2021-08-25 11:15:28 -0400 |
commit | 3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2 (patch) | |
tree | 688dfb2b2708df32fd2e6b548061eea352e79cea /libcpp/charset.c | |
parent | 43a5d46feabd93ba78983919234f05f5fc9a0982 (diff) | |
download | gcc-3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2.zip gcc-3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2.tar.gz gcc-3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2.tar.bz2 |
diagnostics: Support for -finput-charset [PR93067]
Adds the logic to handle -finput-charset in layout_get_source_line(), so that
source lines are converted from their input encodings prior to being output by
the diagnostics machinery. Also adds the ability to strip a UTF-8 BOM similarly.
gcc/c-family/ChangeLog:
PR other/93067
* c-opts.c (c_common_input_charset_cb): New function.
(c_common_post_options): Call new function
diagnostic_initialize_input_context().
gcc/d/ChangeLog:
PR other/93067
* d-lang.cc (d_input_charset_callback): New function.
(d_init): Call new function
diagnostic_initialize_input_context().
gcc/fortran/ChangeLog:
PR other/93067
* cpp.c (gfc_cpp_post_options): Call new function
diagnostic_initialize_input_context().
gcc/ChangeLog:
PR other/93067
* coretypes.h (typedef diagnostic_input_charset_callback): Declare.
* diagnostic.c (diagnostic_initialize_input_context): New function.
* diagnostic.h (diagnostic_initialize_input_context): Declare.
* input.c (default_charset_callback): New function.
(file_cache::initialize_input_context): New function.
(file_cache_slot::create): Added ability to convert the input
according to the input context.
(file_cache::file_cache): Initialize the new input context.
(class file_cache_slot): Added new m_alloc_offset member.
(file_cache_slot::file_cache_slot): Initialize the new member.
(file_cache_slot::~file_cache_slot): Handle potentially offset buffer.
(file_cache_slot::maybe_grow): Likewise.
(file_cache_slot::needs_read_p): Handle NULL fp, which is now possible.
(file_cache_slot::get_next_line): Likewise.
* input.h (class file_cache): Added input context member.
libcpp/ChangeLog:
PR other/93067
* charset.c (init_iconv_desc): Adapt to permit PFILE argument to
be NULL.
(_cpp_convert_input): Likewise. Also move UTF-8 BOM logic to...
(cpp_check_utf8_bom): ...here. New function.
(cpp_input_conversion_is_trivial): New function.
* files.c (read_file_guts): Allow PFILE argument to be NULL. Add
INPUT_CHARSET argument as an alternate source of this information.
(read_file): Pass the new argument to read_file_guts.
(cpp_get_converted_source): New function.
* include/cpplib.h (struct cpp_converted_source): Declare.
(cpp_get_converted_source): Declare.
(cpp_input_conversion_is_trivial): Declare.
(cpp_check_utf8_bom): Declare.
gcc/testsuite/ChangeLog:
PR other/93067
* gcc.dg/diagnostic-input-charset-1.c: New test.
* gcc.dg/diagnostic-input-utf8-bom.c: New test.
Diffstat (limited to 'libcpp/charset.c')
-rw-r--r-- | libcpp/charset.c | 109 |
1 files changed, 77 insertions, 32 deletions
diff --git a/libcpp/charset.c b/libcpp/charset.c index 99a9b73..61881f9 100644 --- a/libcpp/charset.c +++ b/libcpp/charset.c @@ -630,7 +630,11 @@ static const struct cpp_conversion conversion_tab[] = { cset_converter structure for conversion from FROM to TO. If iconv_open() fails, issue an error and return an identity converter. Silently return an identity converter if FROM and TO - are identical. */ + are identical. + + PFILE is only used for generating diagnostics; setting it to NULL + suppresses diagnostics. */ + static struct cset_converter init_iconv_desc (cpp_reader *pfile, const char *to, const char *from) { @@ -672,25 +676,31 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from) if (ret.cd == (iconv_t) -1) { - if (errno == EINVAL) - cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */ - "conversion from %s to %s not supported by iconv", - from, to); - else - cpp_errno (pfile, CPP_DL_ERROR, "iconv_open"); - + if (pfile) + { + if (errno == EINVAL) + cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */ + "conversion from %s to %s not supported by iconv", + from, to); + else + cpp_errno (pfile, CPP_DL_ERROR, "iconv_open"); + } ret.func = convert_no_conversion; } } else { - cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */ - "no iconv implementation, cannot convert from %s to %s", - from, to); + if (pfile) + { + cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */ + "no iconv implementation, cannot convert from %s to %s", + from, to); + } ret.func = convert_no_conversion; ret.cd = (iconv_t) -1; ret.width = -1; } + return ret; } @@ -2122,6 +2132,25 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) buf, bufp - buf, HT_ALLOC)); } + +/* Utility to strip a UTF-8 byte order marking from the beginning + of a buffer. Returns the number of bytes to skip, which currently + will be either 0 or 3. 
*/ +int +cpp_check_utf8_bom (const char *data, size_t data_length) +{ + +#if HOST_CHARSET == HOST_CHARSET_ASCII + const unsigned char *udata = (const unsigned char *) data; + if (data_length >= 3 && udata[0] == 0xef && udata[1] == 0xbb + && udata[2] == 0xbf) + return 3; +#endif + + return 0; +} + + /* Convert an input buffer (containing the complete contents of one source file) from INPUT_CHARSET to the source character set. INPUT points to the input buffer, SIZE is its allocated size, and LEN is @@ -2135,7 +2164,11 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) INPUT is expected to have been allocated with xmalloc. This function will either set *BUFFER_START to INPUT, or free it and set *BUFFER_START to a pointer to another xmalloc-allocated block of - memory. */ + memory. + + PFILE is only used to generate diagnostics; setting it to NULL suppresses + diagnostics, and causes a return of NULL if there was any error instead. */ + uchar * _cpp_convert_input (cpp_reader *pfile, const char *input_charset, uchar *input, size_t size, size_t len, @@ -2158,17 +2191,27 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, to.text = XNEWVEC (uchar, to.asize); to.len = 0; - if (!APPLY_CONVERSION (input_cset, input, len, &to)) - cpp_error (pfile, CPP_DL_ERROR, - "failure to convert %s to %s", - CPP_OPTION (pfile, input_charset), SOURCE_CHARSET); - + const bool ok = APPLY_CONVERSION (input_cset, input, len, &to); free (input); - } - /* Clean up the mess. */ - if (input_cset.func == convert_using_iconv) - iconv_close (input_cset.cd); + /* Clean up the mess. */ + if (input_cset.func == convert_using_iconv) + iconv_close (input_cset.cd); + + /* Handle conversion failure. 
*/ + if (!ok) + { + if (!pfile) + { + XDELETEVEC (to.text); + *buffer_start = NULL; + *st_size = 0; + return NULL; + } + cpp_error (pfile, CPP_DL_ERROR, "failure to convert %s to %s", + input_charset, SOURCE_CHARSET); + } + } /* Resize buffer if we allocated substantially too much, or if we haven't enough space for the \n-terminator or following @@ -2192,19 +2235,14 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, buffer = to.text; *st_size = to.len; -#if HOST_CHARSET == HOST_CHARSET_ASCII - /* The HOST_CHARSET test just above ensures that the source charset - is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that - glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a + + /* Ignore a UTF-8 BOM if we see one and the source charset is UTF-8. Note + that glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a BOM -- however, even if it did, we would still need this code due to the 'convert_no_conversion' case. */ - if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb - && to.text[2] == 0xbf) - { - *st_size -= 3; - buffer += 3; - } -#endif + const int bom_len = cpp_check_utf8_bom ((const char *) to.text, to.len); + *st_size -= bom_len; + buffer += bom_len; *buffer_start = to.text; return buffer; @@ -2244,6 +2282,13 @@ _cpp_default_encoding (void) return current_encoding; } +/* Check if the configured input charset requires no conversion, other than + possibly stripping a UTF-8 BOM. */ +bool cpp_input_conversion_is_trivial (const char *input_charset) +{ + return !strcasecmp (input_charset, SOURCE_CHARSET); +} + /* Implementation of class cpp_string_location_reader. */ /* Constructor for cpp_string_location_reader. */ |