diagnostics: Support for -finput-charset [PR93067]

Adds the logic to handle -finput-charset in layout_get_source_line(), so that source lines are converted from their input encodings prior to being output by diagnostics machinery. Also adds the ability to strip a UTF-8 BOM similarly. gcc/c-family/ChangeLog: PR other/93067 * c-opts.c (c_common_input_charset_cb): New function. (c_common_post_options): Call new function diagnostic_initialize_input_context(). gcc/d/ChangeLog: PR other/93067 * d-lang.cc (d_input_charset_callback): New function. (d_init): Call new function diagnostic_initialize_input_context(). gcc/fortran/ChangeLog: PR other/93067 * cpp.c (gfc_cpp_post_options): Call new function diagnostic_initialize_input_context(). gcc/ChangeLog: PR other/93067 * coretypes.h (typedef diagnostic_input_charset_callback): Declare. * diagnostic.c (diagnostic_initialize_input_context): New function. * diagnostic.h (diagnostic_initialize_input_context): Declare. * input.c (default_charset_callback): New function. (file_cache::initialize_input_context): New function. (file_cache_slot::create): Added ability to convert the input according to the input context. (file_cache::file_cache): Initialize the new input context. (class file_cache_slot): Added new m_alloc_offset member. (file_cache_slot::file_cache_slot): Initialize the new member. (file_cache_slot::~file_cache_slot): Handle potentially offset buffer. (file_cache_slot::maybe_grow): Likewise. (file_cache_slot::needs_read_p): Handle NULL fp, which is now possible. (file_cache_slot::get_next_line): Likewise. * input.h (class file_cache): Added input context member. libcpp/ChangeLog: PR other/93067 * charset.c (init_iconv_desc): Adapt to permit PFILE argument to be NULL. (_cpp_convert_input): Likewise. Also move UTF-8 BOM logic to... (cpp_check_utf8_bom): ...here. New function. (cpp_input_conversion_is_trivial): New function. * files.c (read_file_guts): Allow PFILE argument to be NULL. Add INPUT_CHARSET argument as an alternate source of this information. (read_file): Pass the new argument to read_file_guts. (cpp_get_converted_source): New function. * include/cpplib.h (struct cpp_converted_source): Declare. (cpp_get_converted_source): Declare. (cpp_input_conversion_is_trivial): Declare. (cpp_check_utf8_bom): Declare. gcc/testsuite/ChangeLog: PR other/93067 * gcc.dg/diagnostic-input-charset-1.c: New test. * gcc.dg/diagnostic-input-utf8-bom.c: New test.
author: Lewis Hyatt <lhyatt@gmail.com> 2021-08-24 19:30:44 -0400
committer: Lewis Hyatt <lhyatt@gmail.com> 2021-08-25 11:15:28 -0400
commit: 3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2 (patch)
tree: 688dfb2b2708df32fd2e6b548061eea352e79cea /gcc
parent: 43a5d46feabd93ba78983919234f05f5fc9a0982 (diff)
download: gcc-3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2.zip
gcc-3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2.tar.gz
gcc-3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2.tar.bz2
10 files changed, 198 insertions, 12 deletions
diff --git a/gcc/c-family/c-opts.c b/gcc/c-family/c-opts.c
index 373af0c..fdde082 100644
--- a/gcc/c-family/c-opts.c
+++ b/gcc/c-family/c-opts.c
@@ -188,6 +188,14 @@ c_common_diagnostics_set_defaults (diagnostic_context *context)
   context->opt_permissive = OPT_fpermissive;
 }
 
+/* Input charset configuration for diagnostics.  */
+static const char *
+c_common_input_charset_cb (const char * /*filename*/)
+{
+  const char *cs = cpp_opts->input_charset;
+  return cpp_input_conversion_is_trivial (cs) ? nullptr : cs;
+}
+
 /* Whether options from all C-family languages should be accepted
    quietly.  */
 static bool accept_all_c_family_options = false;
@@ -1136,6 +1144,11 @@ c_common_post_options (const char **pfilename)
   cpp_post_options (parse_in);
   init_global_opts_from_cpp (&global_options, cpp_get_options (parse_in));
 
+  /* Let diagnostics infrastructure know how to convert input files the same
+     way libcpp will do it, namely using the configured input charset and
+     skipping a UTF-8 BOM if present.  */
+  diagnostic_initialize_input_context (global_dc,
+				       c_common_input_charset_cb, true);
   input_location = UNKNOWN_LOCATION;
 
   *pfilename = this_input_filename
diff --git a/gcc/coretypes.h b/gcc/coretypes.h
index 406572e..726fcad 100644
--- a/gcc/coretypes.h
+++ b/gcc/coretypes.h
@@ -154,6 +154,7 @@ struct cl_option_handlers;
 struct diagnostic_context;
 class pretty_printer;
 class diagnostic_event_id_t;
+typedef const char * (*diagnostic_input_charset_callback)(const char *);
 
 template<typename T> struct array_traits;
 
diff --git a/gcc/d/d-lang.cc b/gcc/d/d-lang.cc
index 4386a48..fa29a46a 100644
--- a/gcc/d/d-lang.cc
+++ b/gcc/d/d-lang.cc
@@ -50,6 +50,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "output.h"
 #include "print-tree.h"
 #include "debug.h"
+#include "input.h"
 
 #include "d-tree.h"
 #include "id.h"
@@ -362,6 +363,19 @@ d_option_lang_mask (void)
   return CL_D;
 }
 
+/* Implements input charset and BOM skipping configuration for
+   diagnostics.  */
+static const char *d_input_charset_callback (const char * /*filename*/)
+{
+  /* TODO: The input charset is automatically determined by code in
+     dmd/dmodule.c based on the contents of the file.  If this detection
+     logic were factored out and could be reused here, then we would be able
+     to return UTF-16 or UTF-32 as needed here.  For now, we return always
+     NULL, which means no conversion is necessary, i.e. the input is assumed
+     to be UTF-8 when diagnostics read this file.  */
+  return nullptr;
+}
+
 /* Implements the lang_hooks.init routine for language D.  */
 
 static bool
@@ -373,6 +387,11 @@ d_init (void)
   Expression::_init ();
   Objc::_init ();
 
+  /* Diagnostics input init, to enable BOM skipping and
+     input charset conversion.  */
+  diagnostic_initialize_input_context (global_dc,
+				       d_input_charset_callback, true);
+
   /* Back-end init.  */
   global_binding_level = ggc_cleared_alloc <binding_level> ();
   current_binding_level = global_binding_level;
diff --git a/gcc/diagnostic.c b/gcc/diagnostic.c
index 8361f68..b3afbea 100644
--- a/gcc/diagnostic.c
+++ b/gcc/diagnostic.c
@@ -293,6 +293,17 @@ diagnostic_urls_init (diagnostic_context *context, int value /*= -1 */)
     = determine_url_format ((diagnostic_url_rule_t) value);
 }
 
+/* Create the file_cache, if not already created, and tell it how to
+   translate files on input.  */
+void diagnostic_initialize_input_context (diagnostic_context *context,
+					  diagnostic_input_charset_callback ccb,
+					  bool should_skip_bom)
+{
+  if (!context->m_file_cache)
+    context->m_file_cache = new file_cache;
+  context->m_file_cache->initialize_input_context (ccb, should_skip_bom);
+}
+
 /* Do any cleaning up required after the last diagnostic is emitted.  */
 
 void
diff --git a/gcc/diagnostic.h b/gcc/diagnostic.h
index 7227dae..f90d20a 100644
--- a/gcc/diagnostic.h
+++ b/gcc/diagnostic.h
@@ -446,6 +446,25 @@ extern void diagnostic_show_locus (diagnostic_context *,
 				   diagnostic_t diagnostic_kind);
 extern void diagnostic_show_any_path (diagnostic_context *, diagnostic_info *);
 
+/* Because we read source files a second time after the frontend did it the
+   first time, we need to know how the frontend handled things like character
+   set conversion and UTF-8 BOM stripping, in order to make everything
+   consistent.  This function needs to be called by each frontend that requires
+   non-default behavior, to inform the diagnostics infrastructure how input is
+   to be processed.  The default behavior is to do no conversion and not to
+   strip a UTF-8 BOM.
+
+   The callback should return the input charset to be used to convert the given
+   file's contents to UTF-8, or it should return NULL if no conversion is needed
+   for this file.  SHOULD_SKIP_BOM only applies in case no conversion was
+   performed, and if true, it will cause a UTF-8 BOM to be skipped at the
+   beginning of the file.  (In case a conversion was performed, the BOM is
+   rather skipped as part of the conversion process.)  */
+
+void diagnostic_initialize_input_context (diagnostic_context *context,
+					  diagnostic_input_charset_callback ccb,
+					  bool should_skip_bom);
+
 /* Force diagnostics controlled by OPTIDX to be kind KIND.  */
 extern diagnostic_t diagnostic_classify_diagnostic (diagnostic_context *,
 						    int /* optidx */,
diff --git a/gcc/fortran/cpp.c b/gcc/fortran/cpp.c
index 419cd6a..83c4517 100644
--- a/gcc/fortran/cpp.c
+++ b/gcc/fortran/cpp.c
@@ -493,6 +493,12 @@ gfc_cpp_post_options (void)
 
   cpp_post_options (cpp_in);
 
+
+  /* Let diagnostics infrastructure know how to convert input files the same
+     way libcpp will do it, namely, with no charset conversion but with
+     skipping of a UTF-8 BOM if present.  */
+  diagnostic_initialize_input_context (global_dc, nullptr, true);
+
   gfc_cpp_register_include_paths ();
 }
 
diff --git a/gcc/input.c b/gcc/input.c
index de20d98..4b80986 100644
--- a/gcc/input.c
+++ b/gcc/input.c
@@ -22,7 +22,6 @@ along with GCC; see the file COPYING3.  If not see
 #include "coretypes.h"
 #include "intl.h"
 #include "diagnostic.h"
-#include "diagnostic-core.h"
 #include "selftest.h"
 #include "cpplib.h"
 
@@ -30,6 +29,20 @@ along with GCC; see the file COPYING3.  If not see
 #define HAVE_ICONV 0
 #endif
 
+/* Input charset configuration.  */
+static const char *default_charset_callback (const char *)
+{
+  return nullptr;
+}
+
+void
+file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
+				      bool should_skip_bom)
+{
+  in_context.ccb = (ccb ? ccb : default_charset_callback);
+  in_context.should_skip_bom = should_skip_bom;
+}
+
 /* This is a cache used by get_next_line to store the content of a
    file to be searched for file lines.  */
 class file_cache_slot
@@ -51,7 +64,8 @@ public:
 
   void inc_use_count () { m_use_count++; }
 
-  void create (const char *file_path, FILE *fp, unsigned highest_use_count);
+  bool create (const file_cache::input_context &in_context,
+	       const char *file_path, FILE *fp, unsigned highest_use_count);
   void evict ();
 
  private:
@@ -110,6 +124,10 @@ public:
      far.  */
   char *m_data;
 
+  /* The allocated buffer to be freed may start a little earlier than DATA,
+     e.g. if a UTF8 BOM was skipped at the beginning.  */
+  int m_alloc_offset;
+
   /*  The size of the DATA array above.*/
   size_t m_size;
 
@@ -147,6 +165,17 @@ public:
      doesn't explode.  We thus scale total_lines down to
      line_record_size.  */
   vec<line_info, va_heap> m_line_record;
+
+  void offset_buffer (int offset)
+  {
+    gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
+		: (size_t) offset <= m_size);
+    gcc_assert (m_data);
+    m_alloc_offset += offset;
+    m_data += offset;
+    m_size -= offset;
+  }
+
 };
 
 /* Current position in real source file.  */
@@ -419,21 +448,25 @@ file_cache::add_file (const char *file_path)
 
   unsigned highest_use_count = 0;
   file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
-  r->create (file_path, fp, highest_use_count);
+  if (!r->create (in_context, file_path, fp, highest_use_count))
+    return NULL;
   return r;
 }
 
 /* Populate this slot for use on FILE_PATH and FP, dropping any
    existing cached content within it.  */
 
-void
-file_cache_slot::create (const char *file_path, FILE *fp,
+bool
+file_cache_slot::create (const file_cache::input_context &in_context,
+			 const char *file_path, FILE *fp,
 			 unsigned highest_use_count)
 {
   m_file_path = file_path;
   if (m_fp)
     fclose (m_fp);
   m_fp = fp;
+  if (m_alloc_offset)
+    offset_buffer (-m_alloc_offset);
   m_nb_read = 0;
   m_line_start_idx = 0;
   m_line_num = 0;
@@ -443,6 +476,36 @@ file_cache_slot::create (const char *file_path, FILE *fp,
   m_use_count = ++highest_use_count;
   m_total_lines = total_lines_num (file_path);
   m_missing_trailing_newline = true;
+
+
+  /* Check the input configuration to determine if we need to do any
+     transformations, such as charset conversion or BOM skipping.  */
+  if (const char *input_charset = in_context.ccb (file_path))
+    {
+      /* Need a full-blown conversion of the input charset.  */
+      fclose (m_fp);
+      m_fp = NULL;
+      const cpp_converted_source cs
+	= cpp_get_converted_source (file_path, input_charset);
+      if (!cs.data)
+	return false;
+      if (m_data)
+	XDELETEVEC (m_data);
+      m_data = cs.data;
+      m_nb_read = m_size = cs.len;
+      m_alloc_offset = cs.data - cs.to_free;
+    }
+  else if (in_context.should_skip_bom)
+    {
+      if (read_data ())
+	{
+	  const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
+	  offset_buffer (offset);
+	  m_nb_read -= offset;
+	}
+    }
+
+  return true;
 }
 
 /* file_cache's ctor.  */
@@ -450,6 +513,7 @@ file_cache_slot::create (const char *file_path, FILE *fp,
 file_cache::file_cache ()
 : m_file_slots (new file_cache_slot[num_file_slots])
 {
+  initialize_input_context (nullptr, false);
 }
 
 /* file_cache's dtor.  */
@@ -478,8 +542,8 @@ file_cache::lookup_or_add_file (const char *file_path)
 
 file_cache_slot::file_cache_slot ()
 : m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
-  m_size (0), m_nb_read (0), m_line_start_idx (0), m_line_num (0),
-  m_total_lines (0), m_missing_trailing_newline (true)
+  m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
+  m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
 {
   m_line_record.create (0);
 }
@@ -495,6 +559,7 @@ file_cache_slot::~file_cache_slot ()
     }
   if (m_data)
     {
+      offset_buffer (-m_alloc_offset);
       XDELETEVEC (m_data);
       m_data = 0;
     }
@@ -509,7 +574,7 @@ file_cache_slot::~file_cache_slot ()
 bool
 file_cache_slot::needs_read_p () const
 {
-  return (m_nb_read == 0
+  return m_fp && (m_nb_read == 0
 	  || m_nb_read == m_size
 	  || (m_line_start_idx >= m_nb_read - 1));
 }
@@ -531,9 +596,20 @@ file_cache_slot::maybe_grow ()
   if (!needs_grow_p ())
     return;
 
-  size_t size = m_size == 0 ? buffer_size : m_size * 2;
-  m_data = XRESIZEVEC (char, m_data, size);
-  m_size = size;
+  if (!m_data)
+    {
+      gcc_assert (m_size == 0 && m_alloc_offset == 0);
+      m_size = buffer_size;
+      m_data = XNEWVEC (char, m_size);
+    }
+  else
+    {
+      const int offset = m_alloc_offset;
+      offset_buffer (-offset);
+      m_size *= 2;
+      m_data = XRESIZEVEC (char, m_data, m_size);
+      offset_buffer (offset);
+    }
 }
 
 /*  Read more data into the cache.  Extends the cache if need be.
@@ -632,7 +708,7 @@ file_cache_slot::get_next_line (char **line, ssize_t *line_len)
       m_missing_trailing_newline = false;
     }
 
-  if (ferror (m_fp))
+  if (m_fp && ferror (m_fp))
     return false;
 
   /* At this point, we've found the end of the of line.  It either
diff --git a/gcc/input.h b/gcc/input.h
index bbcec84..e688107 100644
--- a/gcc/input.h
+++ b/gcc/input.h
@@ -111,6 +111,15 @@ class file_cache
   file_cache_slot *lookup_or_add_file (const char *file_path);
   void forcibly_evict_file (const char *file_path);
 
+  /* See comments in diagnostic.h about the input conversion context.  */
+  struct input_context
+  {
+    diagnostic_input_charset_callback ccb;
+    bool should_skip_bom;
+  };
+  void initialize_input_context (diagnostic_input_charset_callback ccb,
+				 bool should_skip_bom);
+
  private:
   file_cache_slot *evicted_cache_tab_entry (unsigned *highest_use_count);
   file_cache_slot *add_file (const char *file_path);
@@ -119,6 +128,7 @@ class file_cache
  private:
   static const size_t num_file_slots = 16;
   file_cache_slot *m_file_slots;
+  input_context in_context;
 };
 
 extern expanded_location
diff --git a/gcc/testsuite/gcc.dg/diagnostic-input-charset-1.c b/gcc/testsuite/gcc.dg/diagnostic-input-charset-1.c
new file mode 100644
index 0000000..4e56833
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/diagnostic-input-charset-1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-iconv "CP850" } */
+/* { dg-options "-finput-charset=CP850 -fdiagnostics-show-caret" } */
+
+/* Test that diagnostics are converted to UTF-8; this file is encoded in
+   CP850.  Why CP850?  -finput-charset only supports encodings that are a
+   superset of ASCII.  But encodings that look like latin-1 are automatically
+   converted by expect to UTF-8, and hence by the time dg sees them, it can't
+   verify they were actually output in UTF-8.  So codepage 850 was chosen as one
+   that is hopefully available and meets the requirements of matching ASCII and
+   not matching latin-1.  */
+const char *section = "�"
+/* { dg-error "expected .* at end of input" "" { target *-*-*} .-1 } */
+/* { dg-begin-multiline-output "" }
+ const char *section = "§"
+ ^~~~~
+   { dg-end-multiline-output "" } */
diff --git a/gcc/testsuite/gcc.dg/diagnostic-input-utf8-bom.c b/gcc/testsuite/gcc.dg/diagnostic-input-utf8-bom.c
new file mode 100644
index 0000000..1a3f352
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/diagnostic-input-utf8-bom.c
@@ -0,0 +1,14 @@
+int 1;
+/* { dg-do compile } */
+/* { dg-options "-fdiagnostics-show-caret" } */
+
+/* This file begins with a UTF-8 byte order mark.  Verify that diagnostics
+   still point to the right place, since the stripping of the BOM happens twice,
+   once when libcpp reads the file, and once when diagnostics infrastucture
+   reads it.  */
+
+/* { dg-error "expected .* before numeric constant" "" { target *-*-*} 1 } */
+/* { dg-begin-multiline-output "" }
+ int 1;
+     ^
+   { dg-end-multiline-output "" } */
author	Lewis Hyatt <lhyatt@gmail.com>	2021-08-24 19:30:44 -0400
committer	Lewis Hyatt <lhyatt@gmail.com>	2021-08-25 11:15:28 -0400
commit	3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2 (patch)
tree	688dfb2b2708df32fd2e6b548061eea352e79cea /gcc
parent	43a5d46feabd93ba78983919234f05f5fc9a0982 (diff)
download	gcc-3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2.zip gcc-3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2.tar.gz gcc-3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2.tar.bz2