aboutsummaryrefslogtreecommitdiff
path: root/libcpp
diff options
context:
space:
mode:
author Lewis Hyatt <lhyatt@gmail.com> 2021-08-24 19:30:44 -0400
committer Lewis Hyatt <lhyatt@gmail.com> 2021-08-25 11:15:28 -0400
commit 3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2 (patch)
tree 688dfb2b2708df32fd2e6b548061eea352e79cea /libcpp
parent 43a5d46feabd93ba78983919234f05f5fc9a0982 (diff)
download gcc-3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2.zip
gcc-3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2.tar.gz
gcc-3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2.tar.bz2
diagnostics: Support for -finput-charset [PR93067]
Adds the logic to handle -finput-charset in layout_get_source_line(), so that
source lines are converted from their input encodings prior to being output by
diagnostics machinery. Also adds the ability to strip a UTF-8 BOM similarly.

gcc/c-family/ChangeLog:

	PR other/93067
	* c-opts.c (c_common_input_charset_cb): New function.
	(c_common_post_options): Call new function
	diagnostic_initialize_input_context().

gcc/d/ChangeLog:

	PR other/93067
	* d-lang.cc (d_input_charset_callback): New function.
	(d_init): Call new function
	diagnostic_initialize_input_context().

gcc/fortran/ChangeLog:

	PR other/93067
	* cpp.c (gfc_cpp_post_options): Call new function
	diagnostic_initialize_input_context().

gcc/ChangeLog:

	PR other/93067
	* coretypes.h (typedef diagnostic_input_charset_callback): Declare.
	* diagnostic.c (diagnostic_initialize_input_context): New function.
	* diagnostic.h (diagnostic_initialize_input_context): Declare.
	* input.c (default_charset_callback): New function.
	(file_cache::initialize_input_context): New function.
	(file_cache_slot::create): Added ability to convert the input
	according to the input context.
	(file_cache::file_cache): Initialize the new input context.
	(class file_cache_slot): Added new m_alloc_offset member.
	(file_cache_slot::file_cache_slot): Initialize the new member.
	(file_cache_slot::~file_cache_slot): Handle potentially offset buffer.
	(file_cache_slot::maybe_grow): Likewise.
	(file_cache_slot::needs_read_p): Handle NULL fp, which is now possible.
	(file_cache_slot::get_next_line): Likewise.
	* input.h (class file_cache): Added input context member.

libcpp/ChangeLog:

	PR other/93067
	* charset.c (init_iconv_desc): Adapt to permit PFILE argument to
	be NULL.
	(_cpp_convert_input): Likewise. Also move UTF-8 BOM logic to...
	(cpp_check_utf8_bom): ...here. New function.
	(cpp_input_conversion_is_trivial): New function.
	* files.c (read_file_guts): Allow PFILE argument to be NULL. Add
	INPUT_CHARSET argument as an alternate source of this information.
	(read_file): Pass the new argument to read_file_guts.
	(cpp_get_converted_source): New function.
	* include/cpplib.h (struct cpp_converted_source): Declare.
	(cpp_get_converted_source): Declare.
	(cpp_input_conversion_is_trivial): Declare.
	(cpp_check_utf8_bom): Declare.

gcc/testsuite/ChangeLog:

	PR other/93067
	* gcc.dg/diagnostic-input-charset-1.c: New test.
	* gcc.dg/diagnostic-input-utf8-bom.c: New test.
Diffstat (limited to 'libcpp')
-rw-r--r-- libcpp/charset.c 109
-rw-r--r-- libcpp/files.c 56
-rw-r--r-- libcpp/include/cpplib.h 18
3 files changed, 137 insertions, 46 deletions
diff --git a/libcpp/charset.c b/libcpp/charset.c
index 99a9b73..61881f9 100644
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -630,7 +630,11 @@ static const struct cpp_conversion conversion_tab[] = {
cset_converter structure for conversion from FROM to TO. If
iconv_open() fails, issue an error and return an identity
converter. Silently return an identity converter if FROM and TO
- are identical. */
+ are identical.
+
+ PFILE is only used for generating diagnostics; setting it to NULL
+ suppresses diagnostics. */
+
static struct cset_converter
init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
{
@@ -672,25 +676,31 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
if (ret.cd == (iconv_t) -1)
{
- if (errno == EINVAL)
- cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
- "conversion from %s to %s not supported by iconv",
- from, to);
- else
- cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
-
+ if (pfile)
+ {
+ if (errno == EINVAL)
+ cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
+ "conversion from %s to %s not supported by iconv",
+ from, to);
+ else
+ cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
+ }
ret.func = convert_no_conversion;
}
}
else
{
- cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
- "no iconv implementation, cannot convert from %s to %s",
- from, to);
+ if (pfile)
+ {
+ cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
+ "no iconv implementation, cannot convert from %s to %s",
+ from, to);
+ }
ret.func = convert_no_conversion;
ret.cd = (iconv_t) -1;
ret.width = -1;
}
+
return ret;
}
@@ -2122,6 +2132,25 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
buf, bufp - buf, HT_ALLOC));
}
+
+/* Utility to detect a UTF-8 byte order mark (BOM) at the beginning
+ of a buffer. Returns the number of bytes the caller should skip, which
+ currently will be either 0 or 3. */
+int
+cpp_check_utf8_bom (const char *data, size_t data_length)
+{
+
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+ const unsigned char *udata = (const unsigned char *) data;
+ if (data_length >= 3 && udata[0] == 0xef && udata[1] == 0xbb
+ && udata[2] == 0xbf)
+ return 3;
+#endif
+
+ return 0;
+}
+
+
/* Convert an input buffer (containing the complete contents of one
source file) from INPUT_CHARSET to the source character set. INPUT
points to the input buffer, SIZE is its allocated size, and LEN is
@@ -2135,7 +2164,11 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
INPUT is expected to have been allocated with xmalloc. This
function will either set *BUFFER_START to INPUT, or free it and set
*BUFFER_START to a pointer to another xmalloc-allocated block of
- memory. */
+ memory.
+
+ PFILE is only used to generate diagnostics; if it is NULL, diagnostics are
+ suppressed and any error causes NULL to be returned instead. */
+
uchar *
_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
uchar *input, size_t size, size_t len,
@@ -2158,17 +2191,27 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
to.text = XNEWVEC (uchar, to.asize);
to.len = 0;
- if (!APPLY_CONVERSION (input_cset, input, len, &to))
- cpp_error (pfile, CPP_DL_ERROR,
- "failure to convert %s to %s",
- CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
-
+ const bool ok = APPLY_CONVERSION (input_cset, input, len, &to);
free (input);
- }
- /* Clean up the mess. */
- if (input_cset.func == convert_using_iconv)
- iconv_close (input_cset.cd);
+ /* Clean up the mess. */
+ if (input_cset.func == convert_using_iconv)
+ iconv_close (input_cset.cd);
+
+ /* Handle conversion failure. */
+ if (!ok)
+ {
+ if (!pfile)
+ {
+ XDELETEVEC (to.text);
+ *buffer_start = NULL;
+ *st_size = 0;
+ return NULL;
+ }
+ cpp_error (pfile, CPP_DL_ERROR, "failure to convert %s to %s",
+ input_charset, SOURCE_CHARSET);
+ }
+ }
/* Resize buffer if we allocated substantially too much, or if we
haven't enough space for the \n-terminator or following
@@ -2192,19 +2235,14 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
buffer = to.text;
*st_size = to.len;
-#if HOST_CHARSET == HOST_CHARSET_ASCII
- /* The HOST_CHARSET test just above ensures that the source charset
- is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that
- glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
+
+ /* Ignore a UTF-8 BOM if we see one and the source charset is UTF-8. Note
+ that glibc's UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
BOM -- however, even if it did, we would still need this code due
to the 'convert_no_conversion' case. */
- if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
- && to.text[2] == 0xbf)
- {
- *st_size -= 3;
- buffer += 3;
- }
-#endif
+ const int bom_len = cpp_check_utf8_bom ((const char *) to.text, to.len);
+ *st_size -= bom_len;
+ buffer += bom_len;
*buffer_start = to.text;
return buffer;
@@ -2244,6 +2282,13 @@ _cpp_default_encoding (void)
return current_encoding;
}
+/* Check if the configured input charset requires no conversion, other than
+ possibly stripping a UTF-8 BOM. */
+bool cpp_input_conversion_is_trivial (const char *input_charset)
+{
+ return !strcasecmp (input_charset, SOURCE_CHARSET);
+}
+
/* Implementation of class cpp_string_location_reader. */
/* Constructor for cpp_string_location_reader. */
diff --git a/libcpp/files.c b/libcpp/files.c
index 6e20fc5..c93a03c 100644
--- a/libcpp/files.c
+++ b/libcpp/files.c
@@ -173,7 +173,7 @@ static bool pch_open_file (cpp_reader *pfile, _cpp_file *file,
static bool find_file_in_dir (cpp_reader *pfile, _cpp_file *file,
bool *invalid_pch, location_t loc);
static bool read_file_guts (cpp_reader *pfile, _cpp_file *file,
- location_t loc);
+ location_t loc, const char *input_charset);
static bool read_file (cpp_reader *pfile, _cpp_file *file,
location_t loc);
static struct cpp_dir *search_path_head (cpp_reader *, const char *fname,
@@ -671,9 +671,12 @@ _cpp_find_file (cpp_reader *pfile, const char *fname, cpp_dir *start_dir,
Use LOC for any diagnostics.
+ PFILE may be NULL. In this case, no diagnostics are issued.
+
FIXME: Flush file cache and try again if we run out of memory. */
static bool
-read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc)
+read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
+ const char *input_charset)
{
ssize_t size, total, count;
uchar *buf;
@@ -681,8 +684,9 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc)
if (S_ISBLK (file->st.st_mode))
{
- cpp_error_at (pfile, CPP_DL_ERROR, loc,
- "%s is a block device", file->path);
+ if (pfile)
+ cpp_error_at (pfile, CPP_DL_ERROR, loc,
+ "%s is a block device", file->path);
return false;
}
@@ -699,8 +703,9 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc)
does not bite us. */
if (file->st.st_size > INTTYPE_MAXIMUM (ssize_t))
{
- cpp_error_at (pfile, CPP_DL_ERROR, loc,
- "%s is too large", file->path);
+ if (pfile)
+ cpp_error_at (pfile, CPP_DL_ERROR, loc,
+ "%s is too large", file->path);
return false;
}
@@ -733,29 +738,29 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc)
if (count < 0)
{
- cpp_errno_filename (pfile, CPP_DL_ERROR, file->path, loc);
+ if (pfile)
+ cpp_errno_filename (pfile, CPP_DL_ERROR, file->path, loc);
free (buf);
return false;
}
- if (regular && total != size && STAT_SIZE_RELIABLE (file->st))
+ if (pfile && regular && total != size && STAT_SIZE_RELIABLE (file->st))
cpp_error_at (pfile, CPP_DL_WARNING, loc,
"%s is shorter than expected", file->path);
file->buffer = _cpp_convert_input (pfile,
- CPP_OPTION (pfile, input_charset),
+ input_charset,
buf, size + 16, total,
&file->buffer_start,
&file->st.st_size);
- file->buffer_valid = true;
-
- return true;
+ file->buffer_valid = file->buffer;
+ return file->buffer_valid;
}
/* Convenience wrapper around read_file_guts that opens the file if
necessary and closes the file descriptor after reading. FILE must
have been passed through find_file() at some stage. Use LOC for
- any diagnostics. */
+ any diagnostics. Unlike read_file_guts(), PFILE must not be NULL. */
static bool
read_file (cpp_reader *pfile, _cpp_file *file, location_t loc)
{
@@ -773,7 +778,8 @@ read_file (cpp_reader *pfile, _cpp_file *file, location_t loc)
return false;
}
- file->dont_read = !read_file_guts (pfile, file, loc);
+ file->dont_read = !read_file_guts (pfile, file, loc,
+ CPP_OPTION (pfile, input_charset));
close (file->fd);
file->fd = -1;
@@ -2145,3 +2151,25 @@ _cpp_has_header (cpp_reader *pfile, const char *fname, int angle_brackets,
return file->err_no != ENOENT;
}
+/* Read a file and convert it from INPUT_CHARSET to the source charset, the
+ same as if it were being read by a cpp_reader. */
+
+cpp_converted_source
+cpp_get_converted_source (const char *fname, const char *input_charset)
+{
+ cpp_converted_source res = {};
+ _cpp_file file = {};
+ file.fd = -1;
+ file.name = lbasename (fname);
+ file.path = fname;
+ if (!open_file (&file))
+ return res;
+ const bool ok = read_file_guts (NULL, &file, 0, input_charset);
+ close (file.fd);
+ if (!ok)
+ return res;
+ res.to_free = (char *) file.buffer_start;
+ res.data = (char *) file.buffer;
+ res.len = file.st.st_size;
+ return res;
+}
diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
index 7e84063..af14291 100644
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@@ -1379,6 +1379,20 @@ extern struct _cpp_file *cpp_get_file (cpp_buffer *);
extern cpp_buffer *cpp_get_prev (cpp_buffer *);
extern void cpp_clear_file_cache (cpp_reader *);
+/* cpp_get_converted_source returns the contents of the given file, as it exists
+ after cpplib has read it and converted it from the input charset to the
+ source charset. Return struct will be zero-filled if the data could not be
+ read for any reason. The data starts at the DATA pointer, but the TO_FREE
+ pointer is what should be passed to free(), as there may be an offset. */
+struct cpp_converted_source
+{
+ char *to_free;
+ char *data;
+ size_t len;
+};
+cpp_converted_source cpp_get_converted_source (const char *fname,
+ const char *input_charset);
+
/* In pch.c */
struct save_macro_data;
extern int cpp_save_state (cpp_reader *, FILE *);
@@ -1449,6 +1463,7 @@ class cpp_display_width_computation {
/* Convenience functions that are simple use cases for class
cpp_display_width_computation. Tab characters will be expanded to spaces
as determined by TABSTOP. */
+
int cpp_byte_column_to_display_column (const char *data, int data_length,
int column, int tabstop);
inline int cpp_display_width (const char *data, int data_length,
@@ -1461,4 +1476,7 @@ int cpp_display_column_to_byte_column (const char *data, int data_length,
int display_col, int tabstop);
int cpp_wcwidth (cppchar_t c);
+bool cpp_input_conversion_is_trivial (const char *input_charset);
+int cpp_check_utf8_bom (const char *data, size_t data_length);
+
#endif /* ! LIBCPP_CPPLIB_H */