aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
author David Malcolm <dmalcolm@redhat.com> 2016-08-05 18:08:33 +0000
committer David Malcolm <dmalcolm@gcc.gnu.org> 2016-08-05 18:08:33 +0000
commit 88fa5555a309e5d6c6171b957daaf2f800920869 (patch)
tree da539f06e1f9c9f39cb46858aa0305df57b746c7 /gcc
parent 1addb9e62b8970734343074e4a2b38f924954850 (diff)
downloadgcc-88fa5555a309e5d6c6171b957daaf2f800920869.zip
gcc-88fa5555a309e5d6c6171b957daaf2f800920869.tar.gz
gcc-88fa5555a309e5d6c6171b957daaf2f800920869.tar.bz2
On-demand locations within string-literals
gcc/c-family/ChangeLog: * c-common.c: Include "substring-locations.h". (get_cpp_ttype_from_string_type): New function. (g_string_concat_db): New global. (substring_loc::get_range): New method. * c-common.h (g_string_concat_db): New declaration. (class substring_loc): New class. * c-lex.c (lex_string): When concatenating strings, capture the locations of all tokens using a new obstack, and record the concatenation locations within g_string_concat_db. * c-opts.c (c_common_init_options): Construct g_string_concat_db on the ggc-heap. gcc/ChangeLog: * input.c (string_concat::string_concat): New constructor. (string_concat_db::string_concat_db): New constructor. (string_concat_db::record_string_concatenation): New method. (string_concat_db::get_string_concatenation): New method. (string_concat_db::get_key_loc): New method. (class auto_cpp_string_vec): New class. (get_substring_ranges_for_loc): New function. (get_source_range_for_substring): New function. (get_num_source_ranges_for_substring): New function. (class selftest::lexer_test_options): New class. (struct selftest::lexer_test): New struct. (class selftest::ebcdic_execution_charset): New class. (selftest::ebcdic_execution_charset::s_singleton): New variable. (selftest::lexer_test::lexer_test): New constructor. (selftest::lexer_test::~lexer_test): New destructor. (selftest::lexer_test::get_token): New method. (selftest::assert_char_at_range): New function. (ASSERT_CHAR_AT_RANGE): New macro. (selftest::assert_num_substring_ranges): New function. (ASSERT_NUM_SUBSTRING_RANGES): New macro. (selftest::assert_has_no_substring_ranges): New function. (ASSERT_HAS_NO_SUBSTRING_RANGES): New macro. (selftest::test_lexer_string_locations_simple): New function. (selftest::test_lexer_string_locations_ebcdic): New function. (selftest::test_lexer_string_locations_hex): New function. (selftest::test_lexer_string_locations_oct): New function. (selftest::test_lexer_string_locations_letter_escape_1): New function. 
(selftest::test_lexer_string_locations_letter_escape_2): New function. (selftest::test_lexer_string_locations_ucn4): New function. (selftest::test_lexer_string_locations_ucn8): New function. (selftest::uint32_from_big_endian): New function. (selftest::test_lexer_string_locations_wide_string): New function. (selftest::uint16_from_big_endian): New function. (selftest::test_lexer_string_locations_string16): New function. (selftest::test_lexer_string_locations_string32): New function. (selftest::test_lexer_string_locations_u8): New function. (selftest::test_lexer_string_locations_utf8_source): New function. (selftest::test_lexer_string_locations_concatenation_1): New function. (selftest::test_lexer_string_locations_concatenation_2): New function. (selftest::test_lexer_string_locations_concatenation_3): New function. (selftest::test_lexer_string_locations_macro): New function. (selftest::test_lexer_string_locations_stringified_macro_argument): New function. (selftest::test_lexer_string_locations_non_string): New function. (selftest::test_lexer_string_locations_long_line): New function. (selftest::test_lexer_char_constants): New function. (selftest::input_c_tests): Call the new test functions once per case within the line_table test matrix. * input.h (struct string_concat): New struct. (struct location_hash): New struct. (class string_concat_db): New class. * substring-locations.h: New header. gcc/testsuite/ChangeLog: * gcc.dg/plugin/diagnostic-test-string-literals-1.c: New file. * gcc.dg/plugin/diagnostic-test-string-literals-2.c: New file. * gcc.dg/plugin/diagnostic_plugin_test_string_literals.c: New file. * gcc.dg/plugin/plugin.exp (plugin_test_list): Add the above new files. libcpp/ChangeLog: * charset.c (cpp_substring_ranges::cpp_substring_ranges): New constructor. (cpp_substring_ranges::~cpp_substring_ranges): New destructor. (cpp_substring_ranges::add_range): New method. (cpp_substring_ranges::add_n_ranges): New method. 
(_cpp_valid_ucn): Add "char_range" and "loc_reader" params; if they are non-NULL, read position information from *loc_reader and update char_range->m_finish accordingly. (convert_ucn): Add "char_range", "loc_reader", and "ranges" params. If loc_reader is non-NULL, read location information from it, and update *ranges accordingly, using char_range. Conditionalize the conversion into tbuf on tbuf being non-NULL. (convert_hex): Likewise, conditionalizing the call to emit_numeric_escape on tbuf. (convert_oct): Likewise. (convert_escape): Add params "loc_reader" and "ranges". If loc_reader is non-NULL, read location information from it, and update *ranges accordingly. Conditionalize the conversion into tbuf on tbuf being non-NULL. (cpp_interpret_string): Rename to... (cpp_interpret_string_1): ...this, adding params "loc_readers" and "out". Use "to" to conditionalize the initialization and usage of "tbuf", such as running the converter. If "loc_readers" is non-NULL, use the instances within it, reading location information from them, and passing them to convert_escape; likewise write to "out" if loc_readers is non-NULL. Check for leading quote and issue an error if it is not present. Update boundary check from "== limit" to ">= limit" to protect against erroneous location values to calls that are not parsing string literals. (cpp_interpret_string): Reimplement in terms to cpp_interpret_string_1. (noop_error_cb): New function. (cpp_interpret_string_ranges): New function. (cpp_string_location_reader::cpp_string_location_reader): New constructor. (cpp_string_location_reader::get_next): New method. * include/cpplib.h (class cpp_string_location_reader): New class. (class cpp_substring_ranges): New class. (cpp_interpret_string_ranges): New prototype. * internal.h (_cpp_valid_ucn): Add params "char_range" and "loc_reader". * lex.c (forms_identifier_p): Pass NULL for new params to _cpp_valid_ucn. From-SVN: r239175
Diffstat (limited to 'gcc')
-rw-r--r-- gcc/ChangeLog 58
-rw-r--r-- gcc/c-family/ChangeLog 14
-rw-r--r-- gcc/c-family/c-common.c 62
-rw-r--r-- gcc/c-family/c-common.h 29
-rw-r--r-- gcc/c-family/c-lex.c 24
-rw-r--r-- gcc/c-family/c-opts.c 3
-rw-r--r-- gcc/input.c 1547
-rw-r--r-- gcc/input.h 35
-rw-r--r-- gcc/substring-locations.h 30
-rw-r--r-- gcc/testsuite/ChangeLog 7
-rw-r--r-- gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-1.c 211
-rw-r--r-- gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-2.c 53
-rw-r--r-- gcc/testsuite/gcc.dg/plugin/diagnostic_plugin_test_string_literals.c 212
-rw-r--r-- gcc/testsuite/gcc.dg/plugin/plugin.exp 3
14 files changed, 2286 insertions, 2 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 10d7224..40ca18c 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,61 @@
+2016-08-05 David Malcolm <dmalcolm@redhat.com>
+
+ * input.c (string_concat::string_concat): New constructor.
+ (string_concat_db::string_concat_db): New constructor.
+ (string_concat_db::record_string_concatenation): New method.
+ (string_concat_db::get_string_concatenation): New method.
+ (string_concat_db::get_key_loc): New method.
+ (class auto_cpp_string_vec): New class.
+ (get_substring_ranges_for_loc): New function.
+ (get_source_range_for_substring): New function.
+ (get_num_source_ranges_for_substring): New function.
+ (class selftest::lexer_test_options): New class.
+ (struct selftest::lexer_test): New struct.
+ (class selftest::ebcdic_execution_charset): New class.
+ (selftest::ebcdic_execution_charset::s_singleton): New variable.
+ (selftest::lexer_test::lexer_test): New constructor.
+ (selftest::lexer_test::~lexer_test): New destructor.
+ (selftest::lexer_test::get_token): New method.
+ (selftest::assert_char_at_range): New function.
+ (ASSERT_CHAR_AT_RANGE): New macro.
+ (selftest::assert_num_substring_ranges): New function.
+ (ASSERT_NUM_SUBSTRING_RANGES): New macro.
+ (selftest::assert_has_no_substring_ranges): New function.
+ (ASSERT_HAS_NO_SUBSTRING_RANGES): New macro.
+ (selftest::test_lexer_string_locations_simple): New function.
+ (selftest::test_lexer_string_locations_ebcdic): New function.
+ (selftest::test_lexer_string_locations_hex): New function.
+ (selftest::test_lexer_string_locations_oct): New function.
+ (selftest::test_lexer_string_locations_letter_escape_1): New function.
+ (selftest::test_lexer_string_locations_letter_escape_2): New function.
+ (selftest::test_lexer_string_locations_ucn4): New function.
+ (selftest::test_lexer_string_locations_ucn8): New function.
+ (selftest::uint32_from_big_endian): New function.
+ (selftest::test_lexer_string_locations_wide_string): New function.
+ (selftest::uint16_from_big_endian): New function.
+ (selftest::test_lexer_string_locations_string16): New function.
+ (selftest::test_lexer_string_locations_string32): New function.
+ (selftest::test_lexer_string_locations_u8): New function.
+ (selftest::test_lexer_string_locations_utf8_source): New function.
+ (selftest::test_lexer_string_locations_concatenation_1): New
+ function.
+ (selftest::test_lexer_string_locations_concatenation_2): New
+ function.
+ (selftest::test_lexer_string_locations_concatenation_3): New
+ function.
+ (selftest::test_lexer_string_locations_macro): New function.
+ (selftest::test_lexer_string_locations_stringified_macro_argument):
+ New function.
+ (selftest::test_lexer_string_locations_non_string): New function.
+ (selftest::test_lexer_string_locations_long_line): New function.
+ (selftest::test_lexer_char_constants): New function.
+ (selftest::input_c_tests): Call the new test functions once per
+ case within the line_table test matrix.
+ * input.h (struct string_concat): New struct.
+ (struct location_hash): New struct.
+ (class string_concat_db): New class.
+ * substring-locations.h: New header.
+
2016-08-05 Patrick Palka <ppalka@gcc.gnu.org>
PR tree-optimization/72810
diff --git a/gcc/c-family/ChangeLog b/gcc/c-family/ChangeLog
index aed494a..d5cdfed 100644
--- a/gcc/c-family/ChangeLog
+++ b/gcc/c-family/ChangeLog
@@ -1,3 +1,17 @@
+2016-08-05 David Malcolm <dmalcolm@redhat.com>
+
+ * c-common.c: Include "substring-locations.h".
+ (get_cpp_ttype_from_string_type): New function.
+ (g_string_concat_db): New global.
+ (substring_loc::get_range): New method.
+ * c-common.h (g_string_concat_db): New declaration.
+ (class substring_loc): New class.
+ * c-lex.c (lex_string): When concatenating strings, capture the
+ locations of all tokens using a new obstack, and record the
+ concatenation locations within g_string_concat_db.
+ * c-opts.c (c_common_init_options): Construct g_string_concat_db
+ on the ggc-heap.
+
2016-07-29 Marek Polacek <polacek@redhat.com>
PR c/71926
diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c
index 27031b5..569f000 100644
--- a/gcc/c-family/c-common.c
+++ b/gcc/c-family/c-common.c
@@ -45,6 +45,7 @@ along with GCC; see the file COPYING3. If not see
#include "tree-iterator.h"
#include "opts.h"
#include "gimplify.h"
+#include "substring-locations.h"
cpp_reader *parse_in; /* Declared in c-pragma.h. */
@@ -1098,6 +1099,67 @@ fix_string_type (tree value)
TREE_STATIC (value) = 1;
return value;
}
+
+/* Given a string of type STRING_TYPE, determine what kind of string
+ token would give an equivalent execution encoding: CPP_STRING,
+ CPP_STRING16, or CPP_STRING32. Return CPP_OTHER in case of error.
+ This may not be exactly the string token type that initially created
+ the string, since CPP_WSTRING is indistinguishable from the 16/32 bit
+ string type at this point.
+
+ This effectively reverses part of the logic in lex_string and
+ fix_string_type. */
+
+static enum cpp_ttype
+get_cpp_ttype_from_string_type (tree string_type)
+{
+ gcc_assert (string_type);
+ if (TREE_CODE (string_type) != ARRAY_TYPE)
+ return CPP_OTHER;
+
+ tree element_type = TREE_TYPE (string_type);
+ if (TREE_CODE (element_type) != INTEGER_TYPE)
+ return CPP_OTHER;
+
+ int bits_per_character = TYPE_PRECISION (element_type);
+ switch (bits_per_character)
+ {
+ case 8:
+ return CPP_STRING; /* It could have also been CPP_UTF8STRING. */
+ case 16:
+ return CPP_STRING16;
+ case 32:
+ return CPP_STRING32;
+ }
+
+ return CPP_OTHER;
+}
+
+/* The global record of string concatenations, for use in
+ extracting locations within string literals. */
+
+GTY(()) string_concat_db *g_string_concat_db;
+
+/* Attempt to determine the source range of the substring.
+ If successful, return NULL and write the source range to *OUT_RANGE.
+ Otherwise return an error message. Error messages are intended
+ for GCC developers (to help debugging) rather than for end-users. */
+
+const char *
+substring_loc::get_range (source_range *out_range) const
+{
+ gcc_assert (out_range);
+
+ enum cpp_ttype tok_type = get_cpp_ttype_from_string_type (m_string_type);
+ if (tok_type == CPP_OTHER)
+ return "unrecognized string type";
+
+ return get_source_range_for_substring (parse_in, g_string_concat_db,
+ m_fmt_string_loc, tok_type,
+ m_start_idx, m_end_idx,
+ out_range);
+}
+
/* Fold X for consideration by one of the warning functions when checking
whether an expression has a constant value. */
diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h
index 8c80574..7b5da57 100644
--- a/gcc/c-family/c-common.h
+++ b/gcc/c-family/c-common.h
@@ -1110,6 +1110,35 @@ extern time_t cb_get_source_date_epoch (cpp_reader *pfile);
__TIME__ can store. */
#define MAX_SOURCE_DATE_EPOCH HOST_WIDE_INT_C (253402300799)
+extern GTY(()) string_concat_db *g_string_concat_db;
+
+/* libcpp can calculate location information about a range of characters
+ within a string literal, but doing so is non-trivial.
+
+ This class encapsulates such a source location, so that it can be
+ passed around (e.g. within c-format.c). It is effectively a deferred
+ call into libcpp. If needed by a diagnostic, the actual source_range
+ can be calculated by calling the get_range method. */
+
+class substring_loc
+{
+ public:
+ substring_loc (location_t fmt_string_loc, tree string_type,
+ int start_idx, int end_idx)
+ : m_fmt_string_loc (fmt_string_loc), m_string_type (string_type),
+ m_start_idx (start_idx), m_end_idx (end_idx) {}
+
+ const char *get_range (source_range *out_range) const;
+
+ location_t get_fmt_string_loc () const { return m_fmt_string_loc; }
+
+ private:
+ location_t m_fmt_string_loc;
+ tree m_string_type;
+ int m_start_idx;
+ int m_end_idx;
+};
+
/* In c-gimplify.c */
extern void c_genericize (tree);
extern int c_gimplify_expr (tree *, gimple_seq *, gimple_seq *);
diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c
index 8f33d86..4c7e385 100644
--- a/gcc/c-family/c-lex.c
+++ b/gcc/c-family/c-lex.c
@@ -1097,13 +1097,16 @@ lex_string (const cpp_token *tok, tree *valp, bool objc_string, bool translate)
tree value;
size_t concats = 0;
struct obstack str_ob;
+ struct obstack loc_ob;
cpp_string istr;
enum cpp_ttype type = tok->type;
/* Try to avoid the overhead of creating and destroying an obstack
for the common case of just one string. */
cpp_string str = tok->val.str;
+ location_t init_loc = tok->src_loc;
cpp_string *strs = &str;
+ location_t *locs = NULL;
/* objc_at_sign_was_seen is only used when doing Objective-C string
concatenation. It is 'true' if we have seen an '@' before the
@@ -1142,16 +1145,21 @@ lex_string (const cpp_token *tok, tree *valp, bool objc_string, bool translate)
else
error ("unsupported non-standard concatenation of string literals");
}
+ /* FALLTHROUGH */
case CPP_STRING:
if (!concats)
{
gcc_obstack_init (&str_ob);
+ gcc_obstack_init (&loc_ob);
obstack_grow (&str_ob, &str, sizeof (cpp_string));
+ obstack_grow (&loc_ob, &init_loc, sizeof (location_t));
}
concats++;
obstack_grow (&str_ob, &tok->val.str, sizeof (cpp_string));
+ obstack_grow (&loc_ob, &tok->src_loc, sizeof (location_t));
+
if (objc_string)
objc_at_sign_was_seen = false;
goto retry;
@@ -1164,7 +1172,10 @@ lex_string (const cpp_token *tok, tree *valp, bool objc_string, bool translate)
/* We have read one more token than we want. */
_cpp_backup_tokens (parse_in, 1);
if (concats)
- strs = XOBFINISH (&str_ob, cpp_string *);
+ {
+ strs = XOBFINISH (&str_ob, cpp_string *);
+ locs = XOBFINISH (&loc_ob, location_t *);
+ }
if (concats && !objc_string && !in_system_header_at (input_location))
warning (OPT_Wtraditional,
@@ -1176,6 +1187,12 @@ lex_string (const cpp_token *tok, tree *valp, bool objc_string, bool translate)
{
value = build_string (istr.len, (const char *) istr.text);
free (CONST_CAST (unsigned char *, istr.text));
+ if (concats)
+ {
+ gcc_assert (locs);
+ gcc_assert (g_string_concat_db);
+ g_string_concat_db->record_string_concatenation (concats + 1, locs);
+ }
}
else
{
@@ -1227,7 +1244,10 @@ lex_string (const cpp_token *tok, tree *valp, bool objc_string, bool translate)
*valp = fix_string_type (value);
if (concats)
- obstack_free (&str_ob, 0);
+ {
+ obstack_free (&str_ob, 0);
+ obstack_free (&loc_ob, 0);
+ }
return objc_string ? CPP_OBJC_STRING : type;
}
diff --git a/gcc/c-family/c-opts.c b/gcc/c-family/c-opts.c
index c11e7e7..0715b2e 100644
--- a/gcc/c-family/c-opts.c
+++ b/gcc/c-family/c-opts.c
@@ -216,6 +216,9 @@ c_common_init_options (unsigned int decoded_options_count,
unsigned int i;
struct cpp_callbacks *cb;
+ g_string_concat_db
+ = new (ggc_alloc <string_concat_db> ()) string_concat_db ();
+
parse_in = cpp_create_reader (c_dialect_cxx () ? CLK_GNUCXX: CLK_GNUC89,
ident_hash, line_table);
cb = cpp_get_callbacks (parse_in);
diff --git a/gcc/input.c b/gcc/input.c
index f91a702..d058b8a 100644
--- a/gcc/input.c
+++ b/gcc/input.c
@@ -1189,6 +1189,279 @@ dump_location_info (FILE *stream)
MAX_SOURCE_LOCATION + 1, UINT_MAX);
}
+/* string_concat's constructor. */
+
+string_concat::string_concat (int num, location_t *locs)
+ : m_num (num)
+{
+ m_locs = ggc_vec_alloc <location_t> (num);
+ for (int i = 0; i < num; i++)
+ m_locs[i] = locs[i];
+}
+
+/* string_concat_db's constructor. */
+
+string_concat_db::string_concat_db ()
+{
+ m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
+}
+
+/* Record that a string concatenation occurred, covering NUM
+ string literal tokens. LOCS is an array of size NUM, containing the
+ locations of the tokens. A copy of LOCS is taken. */
+
+void
+string_concat_db::record_string_concatenation (int num, location_t *locs)
+{
+ gcc_assert (num > 1);
+ gcc_assert (locs);
+
+ location_t key_loc = get_key_loc (locs[0]);
+
+ string_concat *concat
+ = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
+ m_table->put (key_loc, concat);
+}
+
+/* Determine if LOC was the location of the initial token of a
+ concatenation of string literal tokens.
+ If so, *OUT_NUM is written to with the number of tokens, and
+ *OUT_LOCS with the location of an array of locations of the
+ tokens, and return true. *OUT_LOCS is a borrowed pointer to
+ storage owned by the string_concat_db.
+ Otherwise, return false. */
+
+bool
+string_concat_db::get_string_concatenation (location_t loc,
+ int *out_num,
+ location_t **out_locs)
+{
+ gcc_assert (out_num);
+ gcc_assert (out_locs);
+
+ location_t key_loc = get_key_loc (loc);
+
+ string_concat **concat = m_table->get (key_loc);
+ if (!concat)
+ return false;
+
+ *out_num = (*concat)->m_num;
+ *out_locs =(*concat)->m_locs;
+ return true;
+}
+
+/* Internal function. Canonicalize LOC into a form suitable for
+ use as a key within the database, stripping away macro expansion,
+ ad-hoc information, and range information, using the location of
+ the start of LOC within an ordinary linemap. */
+
+location_t
+string_concat_db::get_key_loc (location_t loc)
+{
+ loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
+ NULL);
+
+ loc = get_range_from_loc (line_table, loc).m_start;
+
+ return loc;
+}
+
+/* Helper class for use within get_substring_ranges_for_loc.
+ A vec of cpp_string with responsibility for releasing all of the
+ str->text for each str in the vector. */
+
+class auto_cpp_string_vec : public auto_vec <cpp_string>
+{
+ public:
+ auto_cpp_string_vec (int alloc)
+ : auto_vec <cpp_string> (alloc) {}
+
+ ~auto_cpp_string_vec ()
+ {
+ /* Clean up the copies within this vec. */
+ int i;
+ cpp_string *str;
+ FOR_EACH_VEC_ELT (*this, i, str)
+ free (const_cast <unsigned char *> (str->text));
+ }
+};
+
+/* Attempt to populate RANGES with source location information on the
+ individual characters within the string literal found at STRLOC.
+ If CONCATS is non-NULL, then any string literals that the token at
+ STRLOC was concatenated with are also added to RANGES.
+
+ Return NULL if successful, or an error message if any errors occurred (in
+ which case RANGES may be only partially populated and should not
+ be used).
+
+ This is implemented by re-parsing the relevant source line(s). */
+
+static const char *
+get_substring_ranges_for_loc (cpp_reader *pfile,
+ string_concat_db *concats,
+ location_t strloc,
+ enum cpp_ttype type,
+ cpp_substring_ranges &ranges)
+{
+ gcc_assert (pfile);
+
+ if (strloc == UNKNOWN_LOCATION)
+ return "unknown location";
+
+ /* If string concatenation has occurred at STRLOC, get the locations
+ of all of the literal tokens making up the compound string.
+ Otherwise, just use STRLOC. */
+ int num_locs = 1;
+ location_t *strlocs = &strloc;
+ if (concats)
+ concats->get_string_concatenation (strloc, &num_locs, &strlocs);
+
+ auto_cpp_string_vec strs (num_locs);
+ auto_vec <cpp_string_location_reader> loc_readers (num_locs);
+ for (int i = 0; i < num_locs; i++)
+ {
+ /* Get range of strloc. We will use it to locate the start and finish
+ of the literal token within the line. */
+ source_range src_range = get_range_from_loc (line_table, strlocs[i]);
+
+ if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
+ /* If the string is within a macro expansion, we can't get at the
+ end location. */
+ return "macro expansion";
+
+ if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
+ /* If so, we can't reliably determine where the token started within
+ its line. */
+ return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
+
+ if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
+ /* If so, we can't reliably determine where the token finished within
+ its line. */
+ return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
+
+ expanded_location start
+ = expand_location_to_spelling_point (src_range.m_start);
+ expanded_location finish
+ = expand_location_to_spelling_point (src_range.m_finish);
+ if (start.file != finish.file)
+ return "range endpoints are in different files";
+ if (start.line != finish.line)
+ return "range endpoints are on different lines";
+ if (start.column > finish.column)
+ return "range endpoints are reversed";
+
+ int line_width;
+ const char *line = location_get_source_line (start.file, start.line,
+ &line_width);
+ if (line == NULL)
+ return "unable to read source line";
+
+ /* Determine the location of the literal (including quotes
+ and leading prefix chars, such as the 'u' in a u""
+ token). */
+ const char *literal = line + start.column - 1;
+ int literal_length = finish.column - start.column + 1;
+
+ gcc_assert (line_width >= (start.column - 1 + literal_length));
+ cpp_string from;
+ from.len = literal_length;
+ /* Make a copy of the literal, to avoid having to rely on
+ the lifetime of the copy of the line within the cache.
+ This will be released by the auto_cpp_string_vec dtor. */
+ from.text = XDUPVEC (unsigned char, literal, literal_length);
+ strs.safe_push (from);
+
+ /* For very long lines, a new linemap could have started
+ halfway through the token.
+ Ensure that the loc_reader uses the linemap of the
+ *end* of the token for its start location. */
+ const line_map_ordinary *final_ord_map;
+ linemap_resolve_location (line_table, src_range.m_finish,
+ LRK_MACRO_EXPANSION_POINT, &final_ord_map);
+ location_t start_loc
+ = linemap_position_for_line_and_column (line_table, final_ord_map,
+ start.line, start.column);
+
+ cpp_string_location_reader loc_reader (start_loc, line_table);
+ loc_readers.safe_push (loc_reader);
+ }
+
+ /* Rerun cpp_interpret_string, or rather, a modified version of it. */
+ const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
+ loc_readers.address (),
+ num_locs, &ranges, type);
+ if (err)
+ return err;
+
+ /* Success: "ranges" should now contain information on the string. */
+ return NULL;
+}
+
+/* Attempt to populate *OUT_RANGE with source location information on the
+ range of given characters within the string literal found at STRLOC.
+ START_IDX and END_IDX refer to offsets within the execution character
+ set.
+ If CONCATS is non-NULL, then any string literals that the token at
+ STRLOC was concatenated with are also considered.
+
+ This is implemented by re-parsing the relevant source line(s).
+
+ Return NULL if successful, or an error message if any errors occurred.
+ Error messages are intended for GCC developers (to help debugging) rather
+ than for end-users. */
+
+const char *
+get_source_range_for_substring (cpp_reader *pfile,
+ string_concat_db *concats,
+ location_t strloc,
+ enum cpp_ttype type,
+ int start_idx, int end_idx,
+ source_range *out_range)
+{
+ gcc_checking_assert (start_idx >= 0);
+ gcc_checking_assert (end_idx >= 0);
+ gcc_assert (out_range);
+
+ cpp_substring_ranges ranges;
+ const char *err
+ = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
+ if (err)
+ return err;
+
+ if (start_idx >= ranges.get_num_ranges ())
+ return "start_idx out of range";
+ if (end_idx >= ranges.get_num_ranges ())
+ return "end_idx out of range";
+
+ out_range->m_start = ranges.get_range (start_idx).m_start;
+ out_range->m_finish = ranges.get_range (end_idx).m_finish;
+ return NULL;
+}
+
+/* As get_source_range_for_substring, but write to *OUT the number
+ of ranges that are available. */
+
+const char *
+get_num_source_ranges_for_substring (cpp_reader *pfile,
+ string_concat_db *concats,
+ location_t strloc,
+ enum cpp_ttype type,
+ int *out)
+{
+ gcc_assert (out);
+
+ cpp_substring_ranges ranges;
+ const char *err
+ = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
+
+ if (err)
+ return err;
+
+ *out = ranges.get_num_ranges ();
+ return NULL;
+}
+
#if CHECKING_P
namespace selftest {
@@ -1541,6 +1814,1259 @@ test_lexer (const line_table_case &case_)
cpp_destroy (parser);
}
+/* Forward decls. */
+
+struct lexer_test;
+class lexer_test_options;
+
+/* A class for specifying options of a lexer_test.
+ The "apply" vfunc is called during the lexer_test constructor. */
+
+class lexer_test_options
+{
+ public:
+ virtual void apply (lexer_test &) = 0;
+};
+
+/* A struct for writing lexer tests. */
+
+struct lexer_test
+{
+ lexer_test (const line_table_case &case_, const char *content,
+ lexer_test_options *options);
+ ~lexer_test ();
+
+ const cpp_token *get_token ();
+
+ temp_source_file m_tempfile;
+ temp_line_table m_tmp_lt;
+ cpp_reader *m_parser;
+ string_concat_db m_concats;
+};
+
+/* Use an EBCDIC encoding for the execution charset, specifically
+ IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
+
+ This exercises iconv integration within libcpp.
+ Not every build of iconv supports the given charset,
+ so we need to flag this error and handle it gracefully. */
+
+class ebcdic_execution_charset : public lexer_test_options
+{
+ public:
+ ebcdic_execution_charset () : m_num_iconv_errors (0)
+ {
+ gcc_assert (s_singleton == NULL);
+ s_singleton = this;
+ }
+ ~ebcdic_execution_charset ()
+ {
+ gcc_assert (s_singleton == this);
+ s_singleton = NULL;
+ }
+
+ void apply (lexer_test &test) FINAL OVERRIDE
+ {
+ cpp_options *cpp_opts = cpp_get_options (test.m_parser);
+ cpp_opts->narrow_charset = "IBM1047";
+
+ cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
+ callbacks->error = on_error;
+ }
+
+ static bool on_error (cpp_reader *pfile ATTRIBUTE_UNUSED,
+ int level ATTRIBUTE_UNUSED,
+ int reason ATTRIBUTE_UNUSED,
+ rich_location *richloc ATTRIBUTE_UNUSED,
+ const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
+ ATTRIBUTE_FPTR_PRINTF(5,0)
+ {
+ gcc_assert (s_singleton);
+ /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
+ when the local iconv build doesn't support the conversion. */
+ if (strstr (msgid, "not supported by iconv"))
+ {
+ s_singleton->m_num_iconv_errors++;
+ return true;
+ }
+
+ /* Otherwise, we have an unexpected error. */
+ abort ();
+ }
+
+ bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
+
+ private:
+ static ebcdic_execution_charset *s_singleton;
+ int m_num_iconv_errors;
+};
+
+ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
+
+/* Constructor. Override line_table with a new instance based on CASE_,
+ and write CONTENT to a tempfile. Create a cpp_reader, and use it to
+ start parsing the tempfile. */
+
+lexer_test::lexer_test (const line_table_case &case_, const char *content,
+ lexer_test_options *options) :
+ /* Create a tempfile and write the text to it. */
+ m_tempfile (SELFTEST_LOCATION, ".c", content),
+ m_tmp_lt (case_),
+ m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
+ m_concats ()
+{
+ if (options)
+ options->apply (*this);
+
+ cpp_init_iconv (m_parser);
+
+ /* Parse the file. */
+ const char *fname = cpp_read_main_file (m_parser,
+ m_tempfile.get_filename ());
+ ASSERT_NE (fname, NULL);
+}
+
+/* Destructor. Verify that the next token in m_parser is EOF. */
+
+lexer_test::~lexer_test ()
+{
+ location_t loc;
+ const cpp_token *tok;
+
+ tok = cpp_get_token_with_location (m_parser, &loc);
+ ASSERT_NE (tok, NULL);
+ ASSERT_EQ (tok->type, CPP_EOF);
+
+ cpp_finish (m_parser, NULL);
+ cpp_destroy (m_parser);
+}
+
+/* Get the next token from m_parser. */
+
+const cpp_token *
+lexer_test::get_token ()
+{
+ location_t loc;
+ const cpp_token *tok;
+
+ tok = cpp_get_token_with_location (m_parser, &loc);
+ ASSERT_NE (tok, NULL);
+ return tok;
+}
+
+/* Verify that locations within string literals are correctly handled. */
+
+/* Check get_source_range_for_substring for the single character at index
+   IDX within the string token(s) of kind TYPE at STRLOC, using TEST's
+   parser and string concatenation database.
+
+   If STRLOC should not have column data, the lookup must fail with the
+   "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS" message and nothing
+   further is checked.  Otherwise the lookup must succeed, both ends of the
+   range must be on EXPECTED_LINE, and each end that should have column
+   data must be at EXPECTED_START_COL / EXPECTED_FINISH_COL respectively.
+
+   LOC is the location reported for any failing assertion.  */
+
+static void
+assert_char_at_range (const location &loc,
+                      lexer_test& test,
+                      location_t strloc, enum cpp_ttype type, int idx,
+                      int expected_line, int expected_start_col,
+                      int expected_finish_col)
+{
+  source_range actual_range;
+  const char *err
+    = get_source_range_for_substring (test.m_parser, &test.m_concats,
+                                      strloc, type, idx, idx,
+                                      &actual_range);
+
+  /* Without column data for STRLOC, all we can verify is that the lookup
+     was rejected with the expected message.  */
+  if (!should_have_column_data_p (strloc))
+    {
+      ASSERT_STREQ_AT (loc,
+                       "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
+                       err);
+      return;
+    }
+
+  ASSERT_EQ_AT (loc, NULL, err);
+
+  /* Both ends of the range must be on the expected line.  */
+  int actual_start_line = LOCATION_LINE (actual_range.m_start);
+  ASSERT_EQ_AT (loc, expected_line, actual_start_line);
+  int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
+  ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
+
+  /* Columns are only compared for endpoints that should have them.  */
+  if (should_have_column_data_p (actual_range.m_start))
+    {
+      int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
+      ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
+    }
+
+  if (should_have_column_data_p (actual_range.m_finish))
+    {
+      int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
+      ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
+    }
+}
+
+/* Convenience wrapper around assert_char_at_range: forwards its arguments
+   unchanged and supplies SELFTEST_LOCATION as the location at which any
+   error is reported.  */
+
+#define ASSERT_CHAR_AT_RANGE(FIXTURE, LOC_OF_STR, STR_KIND, CHAR_IDX, \
+                             ON_LINE, FIRST_COL, LAST_COL) \
+  assert_char_at_range (SELFTEST_LOCATION, (FIXTURE), (LOC_OF_STR), \
+                        (STR_KIND), (CHAR_IDX), (ON_LINE), (FIRST_COL), \
+                        (LAST_COL))
+
+/* Check get_num_source_ranges_for_substring for the string token(s) of
+   kind TYPE at STRLOC, using TEST's parser and string concatenation
+   database.
+
+   If STRLOC should not have column data, the call must fail with the
+   "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS" message.
+   Otherwise it must succeed and report exactly EXPECTED_NUM_RANGES.
+
+   LOC is the location reported for any failing assertion.  */
+
+static void
+assert_num_substring_ranges (const location &loc,
+                             lexer_test& test,
+                             location_t strloc,
+                             enum cpp_ttype type,
+                             int expected_num_ranges)
+{
+  int actual_num_ranges;
+  const char *err
+    = get_num_source_ranges_for_substring (test.m_parser, &test.m_concats,
+                                           strloc, type,
+                                           &actual_num_ranges);
+
+  /* No column data: only the error message can be checked.  */
+  if (!should_have_column_data_p (strloc))
+    {
+      ASSERT_STREQ_AT (loc,
+                       "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
+                       err);
+      return;
+    }
+
+  ASSERT_EQ_AT (loc, NULL, err);
+  ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
+}
+
+/* Convenience wrapper around assert_num_substring_ranges: forwards its
+   arguments unchanged and supplies SELFTEST_LOCATION as the location at
+   which any error is reported.  */
+
+#define ASSERT_NUM_SUBSTRING_RANGES(FIXTURE, LOC_OF_STR, STR_KIND, \
+                                    NUM_RANGES) \
+  assert_num_substring_ranges (SELFTEST_LOCATION, (FIXTURE), (LOC_OF_STR), \
+                               (STR_KIND), (NUM_RANGES))
+
+
+/* Check that get_substring_ranges_for_loc reports an error for the string
+   token(s) of kind TYPE at STRLOC, using TEST's parser and string
+   concatenation database.
+
+   The error must match EXPECTED_ERR when STRLOC should have column data;
+   otherwise it must be the
+   "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS" message.
+
+   LOC is the location reported for any failing assertion.  */
+
+static void
+assert_has_no_substring_ranges (const location &loc,
+                                lexer_test& test,
+                                location_t strloc,
+                                enum cpp_ttype type,
+                                const char *expected_err)
+{
+  cpp_substring_ranges ranges;
+  const char *actual_err
+    = get_substring_ranges_for_loc (test.m_parser, &test.m_concats, strloc,
+                                    type, ranges);
+
+  if (!should_have_column_data_p (strloc))
+    {
+      ASSERT_STREQ_AT (loc,
+                       "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
+                       actual_err);
+    }
+  else
+    {
+      ASSERT_STREQ_AT (loc, expected_err, actual_err);
+    }
+}
+
+/* Convenience wrapper around assert_has_no_substring_ranges: forwards its
+   arguments unchanged and supplies SELFTEST_LOCATION as the location at
+   which any error is reported.  */
+
+#define ASSERT_HAS_NO_SUBSTRING_RANGES(FIXTURE, LOC_OF_STR, STR_KIND, \
+                                       WANTED_ERR) \
+  assert_has_no_substring_ranges (SELFTEST_LOCATION, (FIXTURE), \
+                                  (LOC_OF_STR), (STR_KIND), (WANTED_ERR))
+
+/* Lex a simple string literal. Verify the token itself, the result of
+ running cpp_interpret_string on it, and the location of each character
+ within it. CASE_ is passed through to the lexer_test fixture. */
+
+static void
+test_lexer_string_locations_simple (const line_table_case &case_)
+{
+ /* Digits 0-9 (with 0 at column 10), the simple way.
+ ....................000000000.11111111112.2222222223333333333
+ ....................123456789.01234567890.1234567890123456789
+ We add a trailing comment to ensure that we correctly locate
+ the end of the string literal token. */
+ const char *content = " \"0123456789\" /* not a string */\n";
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected token back, with the correct
+ location information (presumably line 1, columns 9-20, i.e. the
+ literal including both of its quotes). */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
+ ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
+
+ /* At this point in lexing, the quote characters are treated as part of
+ the string (they are stripped off by cpp_interpret_string), hence
+ 10 digits + 2 quotes == 12. */
+
+ ASSERT_EQ (tok->val.str.len, 12);
+
+ /* Verify that cpp_interpret_string works. The buffer it hands back in
+ dst_string is released with free once it has been checked. */
+ cpp_string dst_string;
+ const enum cpp_ttype type = CPP_STRING;
+ bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+ &dst_string, type);
+ ASSERT_TRUE (result);
+ ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
+ free (const_cast <unsigned char *> (dst_string.text));
+
+ /* Verify ranges of individual characters. This no longer includes the
+ quotes. Digit i is a single column wide, at column 10 + i. */
+ for (int i = 0; i <= 9; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
+ 10 + i, 10 + i);
+
+ /* One range per digit. */
+ ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 10);
+}
+
+/* As test_lexer_string_locations_simple, but use an EBCDIC execution
+ encoding, and verify that no substring location data is provided in
+ that case. CASE_ is passed through to the lexer_test fixture. */
+
+static void
+test_lexer_string_locations_ebcdic (const line_table_case &case_)
+{
+ /* EBCDIC support requires iconv. */
+ if (!HAVE_ICONV)
+ return;
+
+ /* Digits 0-9 (with 0 at column 10), the simple way.
+ ....................000000000.11111111112.2222222223333333333
+ ....................123456789.01234567890.1234567890123456789
+ We add a trailing comment to ensure that we correctly locate
+ the end of the string literal token.
+ use_ebcdic is handed to the fixture, and is queried below for
+ iconv errors. */
+ const char *content = " \"0123456789\" /* not a string */\n";
+ ebcdic_execution_charset use_ebcdic;
+ lexer_test test (case_, content, &use_ebcdic);
+
+ /* Verify that we get the expected token back, with the correct
+ location information. */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
+ ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
+
+ /* At this point in lexing, the quote characters are treated as part of
+ the string (they are stripped off by cpp_interpret_string), hence
+ 10 digits + 2 quotes == 12. */
+
+ ASSERT_EQ (tok->val.str.len, 12);
+
+ /* The remainder of the test requires an iconv implementation that
+ can convert from UTF-8 to the EBCDIC encoding requested above. */
+ if (use_ebcdic.iconv_errors_occurred_p ())
+ return;
+
+ /* Verify that cpp_interpret_string works. */
+ cpp_string dst_string;
+ const enum cpp_ttype type = CPP_STRING;
+ bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+ &dst_string, type);
+ ASSERT_TRUE (result);
+ /* We should now have EBCDIC-encoded text, specifically
+ IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
+ The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9. */
+ ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
+ (const char *)dst_string.text);
+ free (const_cast <unsigned char *> (dst_string.text));
+
+ /* Verify that we don't attempt to record substring location information
+ for such cases: the lookup must fail with the message below. */
+ ASSERT_HAS_NO_SUBSTRING_RANGES
+ (test, tok->src_loc, type,
+ "execution character set != source character set");
+}
+
+/* Lex a string literal containing a hex-escaped character.
+ Verify the token, the result of running cpp_interpret_string on it,
+ and the location of each character within it. CASE_ is passed
+ through to the lexer_test fixture. */
+
+static void
+test_lexer_string_locations_hex (const line_table_case &case_)
+{
+ /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
+ and with a space in place of digit 6, to terminate the escaped
+ hex code.
+ ....................000000000.111111.11112222.
+ ....................123456789.012345.67890123. */
+ const char *content = " \"01234\\x35 789\"\n";
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected token back, with the correct
+ location information. */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
+ ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
+
+ /* At this point in lexing, the quote characters are treated as part of
+ the string (they are stripped off by cpp_interpret_string).
+ 2 quotes + 5 digits + 4 bytes of escape + 1 space + 3 digits == 15. */
+ ASSERT_EQ (tok->val.str.len, 15);
+
+ /* Verify that cpp_interpret_string works: the escape becomes '5'. */
+ cpp_string dst_string;
+ const enum cpp_ttype type = CPP_STRING;
+ bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+ &dst_string, type);
+ ASSERT_TRUE (result);
+ ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
+ free (const_cast <unsigned char *> (dst_string.text));
+
+ /* Verify ranges of individual characters. This no longer includes the
+ quotes. Characters 0-4 are one column each starting at column 10;
+ character 5 covers the whole 4-column escape (columns 15-18); the
+ characters after it are therefore at column 13 + index. */
+ for (int i = 0; i <= 4; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
+ for (int i = 6; i <= 9; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
+
+ ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 10);
+}
+
+/* Lex a string literal containing an octal-escaped character.
+ Verify the substring location data after running cpp_interpret_string
+ on it. CASE_ is passed through to the lexer_test fixture. */
+
+static void
+test_lexer_string_locations_oct (const line_table_case &case_)
+{
+ /* Digits 0-9, expressing digit 5 in ASCII as "\065"
+ and with a space in place of digit 6, to terminate the escaped
+ octal code.
+ ....................000000000.111111.11112222.2222223333333333444
+ ....................123456789.012345.67890123.4567890123456789012 */
+ const char *content = " \"01234\\065 789\" /* not a string */\n";
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected token back (only its type and
+ spelling are checked here). */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
+
+ /* Verify that cpp_interpret_string works: the escape becomes '5'. */
+ cpp_string dst_string;
+ const enum cpp_ttype type = CPP_STRING;
+ bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+ &dst_string, type);
+ ASSERT_TRUE (result);
+ ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
+ free (const_cast <unsigned char *> (dst_string.text));
+
+ /* Verify ranges of individual characters. This no longer includes the
+ quotes. Characters 0-4 are one column each starting at column 10;
+ character 5 covers the whole 4-column escape (columns 15-18); the
+ characters after it are therefore at column 13 + index. */
+ for (int i = 0; i < 5; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
+ for (int i = 6; i <= 9; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
+
+ ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 10);
+}
+
+/* Test of string literal containing letter escapes: verify that each
+ two-byte escape maps to a two-column range. CASE_ is passed through
+ to the lexer_test fixture. */
+
+static void
+test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
+{
+ /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
+ .....................000000000.1.11111.1.1.11222.22222223333333
+ .....................123456789.0.12345.6.7.89012.34567890123456. */
+ const char *content = (" \"\\tfoo\\\\\\nbar\" /* non-str */\n");
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected tokens back. */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
+
+ /* Verify ranges of individual characters. */
+ /* "\t" (character 0, columns 10-11). */
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+ 0, 1, 10, 11);
+ /* "foo" (characters 1-3, one column each). */
+ for (int i = 1; i <= 3; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+ i, 1, 11 + i, 11 + i);
+ /* "\\" and "\n" (characters 4 and 5, two columns each). */
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+ 4, 1, 15, 16);
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+ 5, 1, 17, 18);
+
+ /* "bar" (characters 6-8, one column each). */
+ for (int i = 6; i <= 8; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+ i, 1, 13 + i, 13 + i);
+
+ /* Tab + "foo" + backslash + newline + "bar" == 9 characters. */
+ ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 9);
+}
+
+/* Another test of a string literal containing a letter escape.
+ Based on string seen in
+ printf ("%-%\n");
+ in gcc.dg/format/c90-printf-1.c.
+ CASE_ is passed through to the lexer_test fixture. */
+
+static void
+test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
+{
+ /* .....................000000000.1111.11.1111.22222222223.
+ .....................123456789.0123.45.6789.01234567890. */
+ const char *content = (" \"%-%\\n\" /* non-str */\n");
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected tokens back. */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
+
+ /* Verify ranges of individual characters. */
+ /* "%-%" (characters 0-2, one column each). */
+ for (int i = 0; i < 3; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+ i, 1, 10 + i, 10 + i);
+ /* "\n" (character 3, columns 13-14). */
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+ 3, 1, 13, 14);
+
+ /* "%-%" + newline == 4 characters. */
+ ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 4);
+}
+
+/* Lex a string literal containing UCN 4 characters.
+ Verify the substring location data after running cpp_interpret_string
+ on it. CASE_ is passed through to the lexer_test fixture. */
+
+static void
+test_lexer_string_locations_ucn4 (const line_table_case &case_)
+{
+ /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
+ as UCN 4.
+ ....................000000000.111111.111122.222222223.33333333344444
+ ....................123456789.012345.678901.234567890.12345678901234 */
+ const char *content = " \"01234\\u2174\\u2175789\" /* non-str */\n";
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected token back (only its type and
+ spelling are checked here). */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
+
+ /* Verify that cpp_interpret_string works.
+ The string should be encoded in the execution character
+ set. Assuming that that is UTF-8, we should have the following:
+ ----------- ---- ----- ------- ----------------
+ Byte offset Byte Octal Unicode Source Column(s)
+ ----------- ---- ----- ------- ----------------
+ 0 0x30 '0' 10
+ 1 0x31 '1' 11
+ 2 0x32 '2' 12
+ 3 0x33 '3' 13
+ 4 0x34 '4' 14
+ 5 0xE2 \342 U+2174 15-20
+ 6 0x85 \205 (cont) 15-20
+ 7 0xB4 \264 (cont) 15-20
+ 8 0xE2 \342 U+2175 21-26
+ 9 0x85 \205 (cont) 21-26
+ 10 0xB5 \265 (cont) 21-26
+ 11 0x37 '7' 27
+ 12 0x38 '8' 28
+ 13 0x39 '9' 29
+ ----------- ---- ----- ------- ---------------. */
+
+ cpp_string dst_string;
+ const enum cpp_ttype type = CPP_STRING;
+ bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+ &dst_string, type);
+ ASSERT_TRUE (result);
+ ASSERT_STREQ ("01234\342\205\264\342\205\265789",
+ (const char *)dst_string.text);
+ free (const_cast <unsigned char *> (dst_string.text));
+
+ /* Verify ranges of individual characters. This no longer includes the
+ quotes. Indices are byte offsets into the interpreted string, so
+ each of the three bytes produced by a UCN maps back to the whole
+ 6-column escape, as per the table above.
+ '01234'. */
+ for (int i = 0; i <= 4; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
+ /* U+2174. */
+ for (int i = 5; i <= 7; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
+ /* U+2175. */
+ for (int i = 8; i <= 10; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
+ /* '789' at columns 27-29. */
+ for (int i = 11; i <= 13; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
+
+ /* One range per byte of the interpreted string. */
+ ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 14);
+}
+
+/* Lex a string literal containing UCN 8 characters.
+ Verify the substring location data after running cpp_interpret_string
+ on it. CASE_ is passed through to the lexer_test fixture. */
+
+static void
+test_lexer_string_locations_ucn8 (const line_table_case &case_)
+{
+ /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
+ ....................000000000.111111.1111222222.2222333333333.344444
+ ....................123456789.012345.6789012345.6789012345678.901234 */
+ const char *content = " \"01234\\U00002174\\U00002175789\" /* */\n";
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected token back (only its type and
+ spelling are checked here). */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
+ "\"01234\\U00002174\\U00002175789\"");
+
+ /* Verify that cpp_interpret_string works.
+ The UTF-8 encoding of the string is identical to that from
+ the ucn4 testcase above; the only difference is the column
+ locations. */
+ cpp_string dst_string;
+ const enum cpp_ttype type = CPP_STRING;
+ bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+ &dst_string, type);
+ ASSERT_TRUE (result);
+ ASSERT_STREQ ("01234\342\205\264\342\205\265789",
+ (const char *)dst_string.text);
+ free (const_cast <unsigned char *> (dst_string.text));
+
+ /* Verify ranges of individual characters. This no longer includes the
+ quotes. Indices are byte offsets into the interpreted string, so
+ each of the three bytes produced by a UCN maps back to the whole
+ 10-column escape.
+ '01234'. */
+ for (int i = 0; i <= 4; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
+ /* U+2174. */
+ for (int i = 5; i <= 7; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
+ /* U+2175. */
+ for (int i = 8; i <= 10; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
+ /* '789' at columns 35-37 */
+ for (int i = 11; i <= 13; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
+
+ /* One range per byte of the interpreted string. */
+ ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 14);
+}
+
+/* Read the four bytes at PTR_BE_VALUE, treating the first byte as the most
+   significant, and return the result as a host-order 32-bit value.  */
+
+static uint32_t
+uint32_from_big_endian (const uint32_t *ptr_be_value)
+{
+  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
+  uint32_t value = 0;
+
+  /* Shift in one byte at a time, most significant first.  */
+  for (int i = 0; i < 4; i++)
+    value = (value << 8) | (uint32_t) bytes[i];
+
+  return value;
+}
+
+/* Lex a wide string literal and verify that attempts to read substring
+ location data from it fail gracefully. CASE_ is passed through to the
+ lexer_test fixture. */
+
+static void
+test_lexer_string_locations_wide_string (const line_table_case &case_)
+{
+ /* Digits 0-9.
+ ....................000000000.11111111112.22222222233333
+ ....................123456789.01234567890.12345678901234 */
+ const char *content = " L\"0123456789\" /* non-str */\n";
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected token back (only its type and
+ spelling are checked here). */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_WSTRING);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
+
+ /* Verify that cpp_interpret_string works, using CPP_WSTRING. */
+ cpp_string dst_string;
+ const enum cpp_ttype type = CPP_WSTRING;
+ bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+ &dst_string, type);
+ ASSERT_TRUE (result);
+ /* The cpp_reader defaults to big-endian with
+ CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
+ now be encoded as UTF-32BE.
+ Spot-check the first, a middle and the last digit, plus the
+ terminator at index 10. */
+ const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
+ ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
+ ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
+ ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
+ ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
+ free (const_cast <unsigned char *> (dst_string.text));
+
+ /* We don't yet support generating substring location information
+ for L"" strings: the lookup must fail with the message below. */
+ ASSERT_HAS_NO_SUBSTRING_RANGES
+ (test, tok->src_loc, type,
+ "execution character set != source character set");
+}
+
+/* Read the two bytes at PTR_BE_VALUE, treating the first byte as the most
+   significant, and return the result as a host-order 16-bit value.  */
+
+static uint16_t
+uint16_from_big_endian (const uint16_t *ptr_be_value)
+{
+  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
+  unsigned int high = bytes[0];
+  unsigned int low = bytes[1];
+
+  return (high << 8) | low;
+}
+
+/* Lex a u"" string literal and verify that attempts to read substring
+ location data from it fail gracefully. CASE_ is passed through to the
+ lexer_test fixture. */
+
+static void
+test_lexer_string_locations_string16 (const line_table_case &case_)
+{
+ /* Digits 0-9.
+ ....................000000000.11111111112.22222222233333
+ ....................123456789.01234567890.12345678901234 */
+ const char *content = " u\"0123456789\" /* non-str */\n";
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected token back (only its type and
+ spelling are checked here). */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING16);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
+
+ /* Verify that cpp_interpret_string works, using CPP_STRING16. */
+ cpp_string dst_string;
+ const enum cpp_ttype type = CPP_STRING16;
+ bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+ &dst_string, type);
+ ASSERT_TRUE (result);
+
+ /* The cpp_reader defaults to big-endian, so dst_string should
+ now be encoded as UTF-16BE.
+ Spot-check the first, a middle and the last digit, plus the
+ terminator at index 10. */
+ const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
+ ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
+ ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
+ ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
+ ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
+ free (const_cast <unsigned char *> (dst_string.text));
+
+ /* We don't yet support generating substring location information
+ for u"" strings: the lookup must fail with the message below. */
+ ASSERT_HAS_NO_SUBSTRING_RANGES
+ (test, tok->src_loc, type,
+ "execution character set != source character set");
+}
+
+/* Lex a U"" string literal and verify that attempts to read substring
+ location data from it fail gracefully. CASE_ is passed through to the
+ lexer_test fixture. */
+
+static void
+test_lexer_string_locations_string32 (const line_table_case &case_)
+{
+ /* Digits 0-9.
+ ....................000000000.11111111112.22222222233333
+ ....................123456789.01234567890.12345678901234 */
+ const char *content = " U\"0123456789\" /* non-str */\n";
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected token back (only its type and
+ spelling are checked here). */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING32);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
+
+ /* Verify that cpp_interpret_string works, using CPP_STRING32. */
+ cpp_string dst_string;
+ const enum cpp_ttype type = CPP_STRING32;
+ bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+ &dst_string, type);
+ ASSERT_TRUE (result);
+
+ /* The cpp_reader defaults to big-endian, so dst_string should
+ now be encoded as UTF-32BE.
+ Spot-check the first, a middle and the last digit, plus the
+ terminator at index 10. */
+ const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
+ ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
+ ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
+ ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
+ ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
+ free (const_cast <unsigned char *> (dst_string.text));
+
+ /* We don't yet support generating substring location information
+ for U"" strings: the lookup must fail with the message below. */
+ ASSERT_HAS_NO_SUBSTRING_RANGES
+ (test, tok->src_loc, type,
+ "execution character set != source character set");
+}
+
+/* Lex a u8-string literal.
+ Verify the substring location data after running cpp_interpret_string
+ on it. CASE_ is passed through to the lexer_test fixture. */
+
+static void
+test_lexer_string_locations_u8 (const line_table_case &case_)
+{
+ /* Digits 0-9.
+ ....................000000000.11111111112.22222222233333
+ ....................123456789.01234567890.12345678901234 */
+ const char *content = " u8\"0123456789\" /* non-str */\n";
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected token back (only its type and
+ spelling are checked here). */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_UTF8STRING);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
+
+ /* Verify that cpp_interpret_string works.
+ NOTE(review): TYPE is CPP_STRING here although the token lexed above
+ is a CPP_UTF8STRING - confirm that this is intentional. */
+ cpp_string dst_string;
+ const enum cpp_ttype type = CPP_STRING;
+ bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+ &dst_string, type);
+ ASSERT_TRUE (result);
+ ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
+ free (const_cast <unsigned char *> (dst_string.text));
+
+ /* Verify ranges of individual characters. This no longer includes the
+ quotes. Unlike the sibling tests, the total number of ranges is not
+ checked here. */
+ for (int i = 0; i <= 9; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
+}
+
+/* Lex a string literal containing UTF-8 source characters.
+ Verify the substring location data after running cpp_interpret_string
+ on it. CASE_ is passed through to the lexer_test fixture. */
+
+static void
+test_lexer_string_locations_utf8_source (const line_table_case &case_)
+{
+ /* This string literal is written out to the source file as UTF-8,
+ and is of the form "before mojibake after", where "mojibake"
+ is written as the following four unicode code points:
+ U+6587 CJK UNIFIED IDEOGRAPH-6587
+ U+5B57 CJK UNIFIED IDEOGRAPH-5B57
+ U+5316 CJK UNIFIED IDEOGRAPH-5316
+ U+3051 HIRAGANA LETTER KE.
+ Each of these is 3 bytes wide when encoded in UTF-8, whereas the
+ "before" and "after" are 1 byte per unicode character.
+
+ The numbering shown are "columns", which are *byte* numbers within
+ the line, rather than unicode character numbers.
+
+ .................... 000000000.1111111.
+ .................... 123456789.0123456. */
+ const char *content = (" \"before "
+ /* U+6587 CJK UNIFIED IDEOGRAPH-6587
+ UTF-8: 0xE6 0x96 0x87
+ C octal escaped UTF-8: \346\226\207
+ "column" numbers: 17-19. */
+ "\346\226\207"
+
+ /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
+ UTF-8: 0xE5 0xAD 0x97
+ C octal escaped UTF-8: \345\255\227
+ "column" numbers: 20-22. */
+ "\345\255\227"
+
+ /* U+5316 CJK UNIFIED IDEOGRAPH-5316
+ UTF-8: 0xE5 0x8C 0x96
+ C octal escaped UTF-8: \345\214\226
+ "column" numbers: 23-25. */
+ "\345\214\226"
+
+ /* U+3051 HIRAGANA LETTER KE
+ UTF-8: 0xE3 0x81 0x91
+ C octal escaped UTF-8: \343\201\221
+ "column" numbers: 26-28. */
+ "\343\201\221"
+
+ /* column numbers 29 onwards
+ 2333333.33334444444444
+ 9012345.67890123456789. */
+ " after\" /* non-str */\n");
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected token back (only its type and
+ spelling are checked here). */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING);
+ ASSERT_TOKEN_AS_TEXT_EQ
+ (test.m_parser, tok,
+ "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
+
+ /* Verify that cpp_interpret_string works: the bytes between the quotes
+ are expected to come through unchanged. */
+ cpp_string dst_string;
+ const enum cpp_ttype type = CPP_STRING;
+ bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
+ &dst_string, type);
+ ASSERT_TRUE (result);
+ ASSERT_STREQ
+ ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
+ (const char *)dst_string.text);
+ free (const_cast <unsigned char *> (dst_string.text));
+
+ /* Verify ranges of individual characters. This no longer includes the
+ quotes.
+ Assuming that both source and execution encodings are UTF-8, we have
+ a run of 25 octets in each (7 for "before ", 4 * 3 for the non-ASCII
+ code points, and 6 for " after"), each octet being one column. */
+ for (int i = 0; i < 25; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
+
+ ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 25);
+}
+
+/* Test of string literal concatenation: two string tokens on consecutive
+ lines, looked up via the location of the first. CASE_ is passed
+ through to the lexer_test fixture. */
+
+static void
+test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
+{
+ /* Digits 0-9.
+ .....................000000000.111111.11112222222222
+ .....................123456789.012345.67890123456789. */
+ const char *content = (" \"01234\" /* non-str */\n"
+ " \"56789\" /* non-str */\n");
+ lexer_test test (case_, content, NULL);
+
+ location_t input_locs[2];
+
+ /* Verify that we get the expected tokens back, collecting each token's
+ string and location for the calls further down. */
+ auto_vec <cpp_string> input_strings;
+ const cpp_token *tok_a = test.get_token ();
+ ASSERT_EQ (tok_a->type, CPP_STRING);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
+ input_strings.safe_push (tok_a->val.str);
+ input_locs[0] = tok_a->src_loc;
+
+ const cpp_token *tok_b = test.get_token ();
+ ASSERT_EQ (tok_b->type, CPP_STRING);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
+ input_strings.safe_push (tok_b->val.str);
+ input_locs[1] = tok_b->src_loc;
+
+ /* Verify that cpp_interpret_string works on both strings at once. */
+ cpp_string dst_string;
+ const enum cpp_ttype type = CPP_STRING;
+ bool result = cpp_interpret_string (test.m_parser,
+ input_strings.address (), 2,
+ &dst_string, type);
+ ASSERT_TRUE (result);
+ ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
+ free (const_cast <unsigned char *> (dst_string.text));
+
+ /* Simulate c-lex.c's lex_string in order to record concatenation. */
+ test.m_concats.record_string_concatenation (2, input_locs);
+
+ /* All lookups below use the location of the first token. */
+ location_t initial_loc = input_locs[0];
+
+ /* Characters 0-4 come from the first string, on line 1. */
+ for (int i = 0; i <= 4; i++)
+ ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
+ /* Characters 5-9 come from the second string, on line 2, again
+ starting at column 10 (hence 5 + i). */
+ for (int i = 5; i <= 9; i++)
+ ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
+
+ ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 10);
+}
+
+/* Another test of string literal concatenation: five two-character
+ strings, one per line, looked up via the location of the first.
+ CASE_ is passed through to the lexer_test fixture. */
+
+static void
+test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
+{
+ /* Digits 0-9.
+ .....................000000000.111.11111112222222
+ .....................123456789.012.34567890123456. */
+ const char *content = (" \"01\" /* non-str */\n"
+ " \"23\" /* non-str */\n"
+ " \"45\" /* non-str */\n"
+ " \"67\" /* non-str */\n"
+ " \"89\" /* non-str */\n");
+ lexer_test test (case_, content, NULL);
+
+ auto_vec <cpp_string> input_strings;
+ location_t input_locs[5];
+
+ /* Verify that we get the expected tokens back, collecting each token's
+ string and location for the calls further down. */
+ for (int i = 0; i < 5; i++)
+ {
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING);
+ input_strings.safe_push (tok->val.str);
+ input_locs[i] = tok->src_loc;
+ }
+
+ /* Verify that cpp_interpret_string works on all five strings at once. */
+ cpp_string dst_string;
+ const enum cpp_ttype type = CPP_STRING;
+ bool result = cpp_interpret_string (test.m_parser,
+ input_strings.address (), 5,
+ &dst_string, type);
+ ASSERT_TRUE (result);
+ ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
+ free (const_cast <unsigned char *> (dst_string.text));
+
+ /* Simulate c-lex.c's lex_string in order to record concatenation. */
+ test.m_concats.record_string_concatenation (5, input_locs);
+
+ /* All lookups below use the location of the first token. */
+ location_t initial_loc = input_locs[0];
+
+ /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
+ detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
+ and expect get_source_range_for_substring to fail.
+ However, for a string concatenation test, we can have a case
+ where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
+ but subsequent strings can be after it.
+ Attempting to detect this within assert_char_at_range
+ would overcomplicate the logic for the common test cases, so
+ we detect it here. */
+ if (should_have_column_data_p (input_locs[0])
+ && !should_have_column_data_p (input_locs[4]))
+ {
+ /* Verify that get_source_range_for_substring gracefully rejects
+ this case. */
+ source_range actual_range;
+ const char *err
+ = get_source_range_for_substring (test.m_parser, &test.m_concats,
+ initial_loc, type, 0, 0,
+ &actual_range);
+ ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
+ return;
+ }
+
+ /* String i (0-based) is on line i + 1; its two characters are at
+ columns 10 and 11 and have indices 2 * i and 2 * i + 1 in the
+ concatenated string. */
+ for (int i = 0; i < 5; i++)
+ for (int j = 0; j < 2; j++)
+ ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
+ i + 1, 10 + j, 10 + j);
+
+ ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 10);
+}
+
+/* Another test of string literal concatenation, this time combined with
+ various kinds of escaped characters, with all of the strings on a
+ single line. CASE_ is passed through to the lexer_test fixture. */
+
+static void
+test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
+{
+ /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
+ digit 6 in ASCII as octal "\066", concatenating multiple strings. */
+ const char *content
+ /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
+ .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
+ = (" \"01234\" \"\\x35\" \"\\066\" \"789\" /* non-str */\n");
+ lexer_test test (case_, content, NULL);
+
+ auto_vec <cpp_string> input_strings;
+ location_t input_locs[4];
+
+ /* Verify that we get the expected tokens back, collecting each token's
+ string and location for the calls further down. */
+ for (int i = 0; i < 4; i++)
+ {
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING);
+ input_strings.safe_push (tok->val.str);
+ input_locs[i] = tok->src_loc;
+ }
+
+ /* Verify that cpp_interpret_string works on all four strings at once. */
+ cpp_string dst_string;
+ const enum cpp_ttype type = CPP_STRING;
+ bool result = cpp_interpret_string (test.m_parser,
+ input_strings.address (), 4,
+ &dst_string, type);
+ ASSERT_TRUE (result);
+ ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
+ free (const_cast <unsigned char *> (dst_string.text));
+
+ /* Simulate c-lex.c's lex_string in order to record concatenation. */
+ test.m_concats.record_string_concatenation (4, input_locs);
+
+ /* All lookups below use the location of the first token. */
+ location_t initial_loc = input_locs[0];
+
+ /* Characters 0-4 are the digits of the first string; character 5 is
+ the hex escape (columns 19-22); character 6 is the octal escape
+ (columns 27-30); characters 7-9 are the digits of the last string
+ (columns 35-37, hence 28 + i). */
+ for (int i = 0; i <= 4; i++)
+ ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
+ ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
+ ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
+ for (int i = 7; i <= 9; i++)
+ ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
+
+ ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 10);
+}
+
+/* Test of string literal in a macro.
+ The string token is obtained by expanding MACRO; the expected token
+ stream is CPP_PADDING, CPP_STRING, CPP_PADDING. The per-char ranges
+ are expected to be on line 1, i.e. the line of the #define. */
+
+static void
+test_lexer_string_locations_macro (const line_table_case &case_)
+{
+ /* Digits 0-9.
+ .....................0000000001111111111.22222222223.
+ .....................1234567890123456789.01234567890. */
+ const char *content = ("#define MACRO \"0123456789\" /* non-str */\n"
+ " MACRO");
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected tokens back. */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_PADDING);
+
+ tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
+
+ /* Verify ranges of individual characters. We ought to
+ see columns within the macro definition: char I is expected at
+ line 1, column 20 + I. */
+ for (int i = 0; i <= 9; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+ i, 1, 20 + i, 20 + i);
+
+ ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
+
+ tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_PADDING);
+}
+
+/* Test of stringification of a macro argument.
+ MACRO(foo) expands to the string "foo" via the # operator; the
+ expected token stream is CPP_PADDING, CPP_STRING, and then two
+ further CPP_PADDING tokens. */
+
+static void
+test_lexer_string_locations_stringified_macro_argument
+ (const line_table_case &case_)
+{
+ /* .....................000000000111111111122222222223.
+ .....................123456789012345678901234567890. */
+ const char *content = ("#define MACRO(X) #X /* non-str */\n"
+ "MACRO(foo)\n");
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected token back. */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_PADDING);
+
+ tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
+
+ /* We don't support getting the location of a stringified macro
+ argument. Verify that it fails gracefully, with the specific
+ error message given below. */
+ ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
+ "cpp_interpret_string_1 failed");
+
+ tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_PADDING);
+
+ tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_PADDING);
+}
+
+/* Ensure that we fail gracefully if something attempts to pass
+ in a location that isn't a string literal token. Seen on this code:
+
+ const char a[] = " %d ";
+ __builtin_printf (a, 0.5);
+ ^
+
+ when c-format.c erroneously used the indicated one-character
+ location as the format string location, leading to a read past the
+ end of a string buffer in cpp_interpret_string_1. */
+
+static void
+test_lexer_string_locations_non_string (const line_table_case &case_)
+{
+ /* .....................000000000111111111122222222223.
+ .....................123456789012345678901234567890. */
+ const char *content = (" a\n");
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected token back: a CPP_NAME, not a
+ string. */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_NAME);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
+
+ /* At this point, libcpp is attempting to interpret the name as a
+ string literal, despite it not starting with a quote. We don't detect
+ that, but we should at least fail gracefully. */
+ ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
+ "cpp_interpret_string_1 failed");
+}
+
+/* Ensure that we can read substring information for a token which
+ starts in one linemap and ends in another. Adapted from
+ gcc.dg/cpp/pr69985.c. */
+
+static void
+test_lexer_string_locations_long_line (const line_table_case &case_)
+{
+ /* .....................000000.000111111111
+ .....................123456.789012345678. */
+ const char *content = ("/* A very long line, so that we start a new line map. */\n"
+ " \"0123456789012345678901234567890123456789"
+ "0123456789012345678901234567890123456789"
+ "0123456789012345678901234567890123456789"
+ "0123456789\"\n");
+
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected token back. */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_STRING);
+ /* Skip the column-based checks below when should_have_column_data_p
+ rejects the highest location allocated so far. */
+ if (!should_have_column_data_p (line_table->highest_location))
+ return;
+
+ /* Verify ranges of individual characters. The literal holds
+ 3 * 40 + 10 = 130 digits; digit I is expected on line 2 at
+ column 7 + I. */
+ ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 130);
+ for (int i = 0; i < 130; i++)
+ ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
+ i, 2, 7 + i, 7 + i);
+}
+
+/* Test of lexing char constants.
+ Each constant is checked for its token type and spelling. Only the
+ first one, 'a', is additionally passed through
+ cpp_interpret_charconst. */
+
+static void
+test_lexer_char_constants (const line_table_case &case_)
+{
+ /* Various char constants.
+ .....................0000000001111111111.22222222223.
+ .....................1234567890123456789.01234567890. */
+ const char *content = (" 'a'\n"
+ " u'a'\n"
+ " U'a'\n"
+ " L'a'\n"
+ " 'abc'\n");
+ lexer_test test (case_, content, NULL);
+
+ /* Verify that we get the expected tokens back. */
+ /* 'a'. */
+ const cpp_token *tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_CHAR);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
+ /* Interpret the constant: expect the value 'a' built from exactly one
+ char. UNSIGNEDP is written by the call but not checked here. */
+ unsigned int chars_seen;
+ int unsignedp;
+ cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
+ &chars_seen, &unsignedp);
+ ASSERT_EQ (cc, 'a');
+ ASSERT_EQ (chars_seen, 1);
+
+ /* u'a'. */
+ tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_CHAR16);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
+
+ /* U'a'. */
+ tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_CHAR32);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
+
+ /* L'a'. */
+ tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_WCHAR);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
+
+ /* 'abc' (c-char-sequence). This is lexed as a single CPP_CHAR
+ token. */
+ tok = test.get_token ();
+ ASSERT_EQ (tok->type, CPP_CHAR);
+ ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
+}
/* A table of interesting location_t values, giving one axis of our test
matrix. */
@@ -1599,6 +3125,27 @@ input_c_tests ()
/* Run all tests for the given case within the test matrix. */
test_accessing_ordinary_linemaps (c);
test_lexer (c);
+ test_lexer_string_locations_simple (c);
+ test_lexer_string_locations_ebcdic (c);
+ test_lexer_string_locations_hex (c);
+ test_lexer_string_locations_oct (c);
+ test_lexer_string_locations_letter_escape_1 (c);
+ test_lexer_string_locations_letter_escape_2 (c);
+ test_lexer_string_locations_ucn4 (c);
+ test_lexer_string_locations_ucn8 (c);
+ test_lexer_string_locations_wide_string (c);
+ test_lexer_string_locations_string16 (c);
+ test_lexer_string_locations_string32 (c);
+ test_lexer_string_locations_u8 (c);
+ test_lexer_string_locations_utf8_source (c);
+ test_lexer_string_locations_concatenation_1 (c);
+ test_lexer_string_locations_concatenation_2 (c);
+ test_lexer_string_locations_concatenation_3 (c);
+ test_lexer_string_locations_macro (c);
+ test_lexer_string_locations_stringified_macro_argument (c);
+ test_lexer_string_locations_non_string (c);
+ test_lexer_string_locations_long_line (c);
+ test_lexer_char_constants (c);
num_cases_tested++;
}
diff --git a/gcc/input.h b/gcc/input.h
index d51f950..c17e440 100644
--- a/gcc/input.h
+++ b/gcc/input.h
@@ -95,4 +95,39 @@ void dump_location_info (FILE *stream);
void diagnostics_file_cache_fini (void);
+struct GTY(()) string_concat
+{
+ string_concat (int num, location_t *locs);
+ /* NOTE(review): presumably M_LOCS points to M_NUM locations, one per
+ string token taking part in a concatenation, mirroring the
+ constructor's arguments -- confirm against the definition in
+ input.c. */
+ int m_num;
+ location_t * GTY ((atomic)) m_locs;
+};
+
+struct location_hash : int_hash <location_t, UNKNOWN_LOCATION> { };
+ /* A table of string_concat instances keyed by location_t (see M_TABLE).
+ NOTE(review): the selftests in input.c record the locations of the
+ tokens of a concatenation and later query using the location of the
+ first token; presumably get_key_loc maps a location to the key used
+ in the table -- confirm against the definitions in input.c. */
+class GTY(()) string_concat_db
+{
+ public:
+ string_concat_db ();
+ void record_string_concatenation (int num, location_t *locs);
+
+ bool get_string_concatenation (location_t loc,
+ int *out_num,
+ location_t **out_locs);
+
+ private:
+ static location_t get_key_loc (location_t loc);
+
+ /* For the fields to be private, we must grant access to the
+ generated code in gtype-desc.c. */
+
+ friend void ::gt_ggc_mx_string_concat_db (void *x_p);
+ friend void ::gt_pch_nx_string_concat_db (void *x_p);
+ friend void ::gt_pch_p_16string_concat_db (void *this_obj, void *x_p,
+ gt_pointer_operator op,
+ void *cookie);
+
+ hash_map <location_hash, string_concat *> *m_table;
+};
+
#endif
diff --git a/gcc/substring-locations.h b/gcc/substring-locations.h
new file mode 100644
index 0000000..274ebbe
--- /dev/null
+++ b/gcc/substring-locations.h
@@ -0,0 +1,30 @@
+/* Source locations within string literals.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_SUBSTRING_LOCATIONS_H
+#define GCC_SUBSTRING_LOCATIONS_H
+
+extern const char *get_source_range_for_substring (cpp_reader *pfile,
+ string_concat_db *concats,
+ location_t strloc,
+ enum cpp_ttype type,
+ int start_idx, int end_idx,
+ source_range *out_range);
+
+#endif /* ! GCC_SUBSTRING_LOCATIONS_H */
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 25af783..997efac 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,10 @@
+2016-08-05 David Malcolm <dmalcolm@redhat.com>
+
+ * gcc.dg/plugin/diagnostic-test-string-literals-1.c: New file.
+ * gcc.dg/plugin/diagnostic-test-string-literals-2.c: New file.
+ * gcc.dg/plugin/diagnostic_plugin_test_string_literals.c: New file.
+ * gcc.dg/plugin/plugin.exp (plugin_test_list): Add the above new files.
+
2016-08-05 Patrick Palka <ppalka@gcc.gnu.org>
PR tree-optimization/72810
diff --git a/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-1.c b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-1.c
new file mode 100644
index 0000000..82689b4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-1.c
@@ -0,0 +1,211 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdiagnostics-show-caret" } */
+
+/* This is a collection of unittests for ranges within string literals,
+ using diagnostic_plugin_test_string_literals, which handles
+ "__emit_string_literal_range" by generating a warning at the given
+ subset of a string literal.
+
+ The indices are 0-based. It's easiest to verify things using string
+ literals that are runs of 0-based digits (to avoid having to count
+ characters).
+
+ LITERAL is a const void * to allow testing the various kinds of wide
+ string literal, rather than just const char *. */
+
+extern void __emit_string_literal_range (const void *literal,
+ int start_idx, int end_idx);
+
+void
+test_simple_string_literal (void)
+{
+ __emit_string_literal_range ("0123456789", /* { dg-warning "range" } */
+ 6, 7);
+/* { dg-begin-multiline-output "" }
+ __emit_string_literal_range ("0123456789",
+ ^~
+ { dg-end-multiline-output "" } */
+}
+
+void
+test_concatenated_string_literal (void)
+{
+ __emit_string_literal_range ("01234" "56789", /* { dg-warning "range" } */
+ 3, 6);
+/* { dg-begin-multiline-output "" }
+ __emit_string_literal_range ("01234" "56789",
+ ^~~~~~~
+ { dg-end-multiline-output "" } */
+}
+
+void
+test_multiline_string_literal (void)
+{
+ __emit_string_literal_range ("01234" /* { dg-warning "range" } */
+ "56789",
+ 3, 6);
+/* { dg-begin-multiline-output "" }
+ __emit_string_literal_range ("01234"
+ ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ "56789",
+ ~~~
+ { dg-end-multiline-output "" } */
+ /* FIXME: why does the above need two trailing spaces? */
+}
+
+/* Tests of various unicode encodings.
+
+ Digits 0 through 9 are unicode code points:
+ U+0030 DIGIT ZERO
+ ...
+ U+0039 DIGIT NINE
+ However, these are not always valid as UCN (see the comment in
+ libcpp/charset.c:_cpp_valid_ucn).
+
+ Hence we need to test UCN using an alternative unicode
+ representation of numbers; let's use Roman numerals,
+ (though these start at one, not zero):
+ U+2170 SMALL ROMAN NUMERAL ONE
+ ...
+ U+2174 SMALL ROMAN NUMERAL FIVE ("v")
+ U+2175 SMALL ROMAN NUMERAL SIX ("vi")
+ ...
+ U+2178 SMALL ROMAN NUMERAL NINE. */
+
+void
+test_hex (void)
+{
+ /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
+ and with a space in place of digit 6, to terminate the escaped
+ hex code. */
+ __emit_string_literal_range ("01234\x35 789", /* { dg-warning "range" } */
+ 3, 7);
+/* { dg-begin-multiline-output "" }
+ __emit_string_literal_range ("01234\x35 789"
+ ^~~~~~~~
+ { dg-end-multiline-output "" } */
+}
+
+void
+test_oct (void)
+{
+ /* Digits 0-9, expressing digit 5 in ASCII as "\065"
+ and with a space in place of digit 6, to terminate the escaped
+ octal code. */
+ __emit_string_literal_range ("01234\065 789", /* { dg-warning "range" } */
+ 3, 7);
+/* { dg-begin-multiline-output "" }
+ __emit_string_literal_range ("01234\065 789"
+ ^~~~~~~~
+ { dg-end-multiline-output "" } */
+}
+
+void
+test_multiple (void)
+{
+ /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
+ digit 6 in ASCII as octal "\066", concatenating multiple strings. */
+ __emit_string_literal_range ("01234" "\x35" "\066" "789", /* { dg-warning "range" } */
+ 3, 8);
+/* { dg-begin-multiline-output "" }
+ __emit_string_literal_range ("01234" "\x35" "\066" "789",
+ ^~~~~~~~~~~~~~~~~~~~~~~~
+ { dg-end-multiline-output "" } */
+}
+
+void
+test_ucn4 (void)
+{
+ /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
+ as UCN 4.
+ The resulting string is encoded as UTF-8. Most of the digits are 1 byte
+ each, but digits 5 and 6 are encoded with 3 bytes each.
+ Hence to underline digits 4-7 we need to underline bytes 4-11 in
+ the UTF-8 encoding. */
+ __emit_string_literal_range ("01234\u2174\u2175789", /* { dg-warning "range" } */
+ 4, 11);
+/* { dg-begin-multiline-output "" }
+ __emit_string_literal_range ("01234\u2174\u2175789",
+ ^~~~~~~~~~~~~~
+ { dg-end-multiline-output "" } */
+}
+
+void
+test_ucn8 (void)
+{
+ /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
+ The resulting string is the same as in test_ucn4 above, and hence
+ has the same UTF-8 encoding, and so we again need to underline bytes
+ 4-11 in the UTF-8 encoding in order to underline digits 4-7. */
+ __emit_string_literal_range ("01234\U00002174\U00002175789", /* { dg-warning "range" } */
+ 4, 11);
+/* { dg-begin-multiline-output "" }
+ __emit_string_literal_range ("01234\U00002174\U00002175789",
+ ^~~~~~~~~~~~~~~~~~~~~~
+ { dg-end-multiline-output "" } */
+}
+
+void
+test_u8 (void)
+{
+ /* Digits 0-9. */
+ __emit_string_literal_range (u8"0123456789", /* { dg-warning "range" } */
+ 4, 7);
+/* { dg-begin-multiline-output "" }
+ __emit_string_literal_range (u8"0123456789",
+ ^~~~
+ { dg-end-multiline-output "" } */
+}
+
+void
+test_u (void)
+{
+ /* Digits 0-9. */
+ __emit_string_literal_range (u"0123456789", /* { dg-error "unable to read substring range: execution character set != source character set" } */
+ 4, 7);
+/* { dg-begin-multiline-output "" }
+ __emit_string_literal_range (u"0123456789",
+ ^~~~~~~~~~~~~
+ { dg-end-multiline-output "" } */
+}
+
+void
+test_U (void)
+{
+ /* Digits 0-9. */
+ __emit_string_literal_range (U"0123456789", /* { dg-error "unable to read substring range: execution character set != source character set" } */
+ 4, 7);
+/* { dg-begin-multiline-output "" }
+ __emit_string_literal_range (U"0123456789",
+ ^~~~~~~~~~~~~
+ { dg-end-multiline-output "" } */
+}
+
+void
+test_L (void)
+{
+ /* Digits 0-9. */
+ __emit_string_literal_range (L"0123456789", /* { dg-error "unable to read substring range: execution character set != source character set" } */
+ 4, 7);
+/* { dg-begin-multiline-output "" }
+ __emit_string_literal_range (L"0123456789",
+ ^~~~~~~~~~~~~
+ { dg-end-multiline-output "" } */
+}
+
+void
+test_macro (void)
+{
+#define START "01234" /* { dg-warning "range" } */
+ __emit_string_literal_range (START
+ "56789",
+ 3, 6);
+/* { dg-begin-multiline-output "" }
+ #define START "01234"
+ ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ __emit_string_literal_range (START
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ "56789",
+ ~~~
+ { dg-end-multiline-output "" } */
+}
diff --git a/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-2.c b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-2.c
new file mode 100644
index 0000000..7851c02
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-string-literals-2.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+
+/* See the notes in diagnostic-test-string-literals-1.c.
+ This test case has caret-printing disabled. */
+
+extern void __emit_string_literal_range (const void *literal,
+ int start_idx, int end_idx);
+/* Test of a stringified macro argument, by itself. */
+
+void
+test_stringified_token_1 (int x)
+{
+#define STRINGIFY(EXPR) #EXPR
+
+ __emit_string_literal_range (STRINGIFY(x > 0), /* { dg-error "unable to read substring range: macro expansion" } */
+ 0, 4);
+
+#undef STRINGIFY
+}
+
+/* Test of a stringified token within a concatenation. */
+
+void
+test_stringized_token_2 (int x)
+{
+#define EXAMPLE(EXPR, START_IDX, END_IDX) \
+ do { \
+ __emit_string_literal_range (" before " #EXPR " after \n", \
+ START_IDX, END_IDX); \
+ } while (0)
+
+ EXAMPLE(x > 0, 1, 6);
+ /* { dg-error "unable to read substring range: cpp_interpret_string_1 failed" "" { target *-*-* } 28 } */
+
+#undef EXAMPLE
+}
+
+/* Test of a doubly-stringified macro argument (by itself). */
+
+void
+test_stringified_token_3 (int x)
+{
+#define XSTR(s) STR(s)
+#define STR(s) #s
+#define FOO 123456789
+ __emit_string_literal_range (XSTR (FOO), /* { dg-error "unable to read substring range: macro expansion" } */
+ 2, 3);
+
+#undef XSTR
+#undef STR
+#undef FOO
+}
+
diff --git a/gcc/testsuite/gcc.dg/plugin/diagnostic_plugin_test_string_literals.c b/gcc/testsuite/gcc.dg/plugin/diagnostic_plugin_test_string_literals.c
new file mode 100644
index 0000000..d44612a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/plugin/diagnostic_plugin_test_string_literals.c
@@ -0,0 +1,212 @@
+/* This plugin uses the diagnostics code to verify tracking of source code
+ locations within string literals. */
+/* { dg-options "-O" } */
+
+#include "gcc-plugin.h"
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "tree.h"
+#include "stringpool.h"
+#include "toplev.h"
+#include "basic-block.h"
+#include "hash-table.h"
+#include "vec.h"
+#include "ggc.h"
+#include "basic-block.h"
+#include "tree-ssa-alias.h"
+#include "internal-fn.h"
+#include "gimple-fold.h"
+#include "tree-eh.h"
+#include "gimple-expr.h"
+#include "is-a.h"
+#include "gimple.h"
+#include "gimple-iterator.h"
+#include "tree.h"
+#include "tree-pass.h"
+#include "intl.h"
+#include "plugin-version.h"
+#include "c-family/c-common.h"
+#include "diagnostic.h"
+#include "context.h"
+#include "print-tree.h"
+#include "cpplib.h"
+#include "c-family/c-pragma.h"
+
+int plugin_is_GPL_compatible;
+
+/* A custom pass for printing string literal location information.
+ This is its pass_data: a GIMPLE_PASS named "test_string_literals"
+ which requires PROP_ssa and neither provides nor destroys any
+ properties. */
+
+const pass_data pass_data_test_string_literals =
+{
+ GIMPLE_PASS, /* type */
+ "test_string_literals", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_NONE, /* tv_id */
+ PROP_ssa, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_test_string_literals : public gimple_opt_pass
+{
+public:
+ pass_test_string_literals(gcc::context *ctxt)
+ : gimple_opt_pass(pass_data_test_string_literals, ctxt)
+ {}
+
+ /* opt_pass methods: the gate unconditionally returns true; execute is
+ only declared here and is defined out of line. */
+ bool gate (function *) { return true; }
+ virtual unsigned int execute (function *);
+
+}; // class pass_test_string_literals
+
+/* Determine if STMT is a call with NUM_ARGS arguments to a function
+ named FUNCNAME.
+ If so, return STMT as a gcall *. Otherwise return NULL.
+ NULL is returned silently when STMT is not a call, when the call has
+ no fndecl, or when the callee has a different name. A call to
+ FUNCNAME with the wrong number of arguments is reported via error_at
+ before NULL is returned. */
+
+static gcall *
+check_for_named_call (gimple *stmt,
+ const char *funcname, unsigned int num_args)
+{
+ gcc_assert (funcname);
+
+ gcall *call = dyn_cast <gcall *> (stmt);
+ if (!call)
+ return NULL;
+
+ tree fndecl = gimple_call_fndecl (call);
+ if (!fndecl)
+ return NULL;
+
+ if (strcmp (IDENTIFIER_POINTER (DECL_NAME (fndecl)), funcname))
+ return NULL;
+
+ if (gimple_call_num_args (call) != num_args)
+ {
+ /* Both counts are unsigned, so format them with %u rather than %i. */
+ error_at (stmt->location, "expected number of args: %u (got %u)",
+ num_args, gimple_call_num_args (call));
+ return NULL;
+ }
+
+ return call;
+}
+
+/* Issue a warning whose location spans SRC_RANGE and whose caret sits on
+ the first location of SRC_RANGE. The message text gives the line and
+ column of both ends of the range. */
+
+static void
+emit_warning (source_range src_range)
+{
+ const location_t start = src_range.m_start;
+ const location_t finish = src_range.m_finish;
+
+ /* Caret at START; the range runs from START to FINISH. */
+ location_t loc = make_location (start, start, finish);
+
+ warning_at (loc, 0, "range %i:%i-%i:%i",
+ LOCATION_LINE (start), LOCATION_COLUMN (start),
+ LOCATION_LINE (finish), LOCATION_COLUMN (finish));
+}
+
+/* Support code for verifying that we are correctly tracking ranges
+ within string literals, for use by diagnostic-test-string-literals-*.c.
+ Emit a warning showing the range of a string literal, for each call to
+ a function named "__emit_string_literal_range".
+ The initial argument should be a string literal; arguments 2 and 3
+ should be integer constants, giving the range within the string
+ to be printed.
+ Statements that are not such a call are ignored. An argument of the
+ wrong kind is reported with error_at at the call's location. A
+ failure to compute the range is reported with error_at at the
+ location of the string argument. */
+
+static void
+test_string_literals (gimple *stmt)
+{
+ gcall *call = check_for_named_call (stmt, "__emit_string_literal_range", 3);
+ if (!call)
+ return;
+
+ /* We expect an ADDR_EXPR with a STRING_CST inside it for the
+ initial arg. */
+ tree t_addr_string = gimple_call_arg (call, 0);
+ if (TREE_CODE (t_addr_string) != ADDR_EXPR)
+ {
+ error_at (call->location, "string literal required for arg 1");
+ return;
+ }
+
+ tree t_string = TREE_OPERAND (t_addr_string, 0);
+ if (TREE_CODE (t_string) != STRING_CST)
+ {
+ error_at (call->location, "string literal required for arg 1");
+ return;
+ }
+
+ tree t_start_idx = gimple_call_arg (call, 1);
+ if (TREE_CODE (t_start_idx) != INTEGER_CST)
+ {
+ error_at (call->location, "integer constant required for arg 2");
+ return;
+ }
+ int start_idx = TREE_INT_CST_LOW (t_start_idx);
+
+ tree t_end_idx = gimple_call_arg (call, 2);
+ if (TREE_CODE (t_end_idx) != INTEGER_CST)
+ {
+ error_at (call->location, "integer constant required for arg 3");
+ return;
+ }
+ int end_idx = TREE_INT_CST_LOW (t_end_idx);
+
+ /* A STRING_CST doesn't have a location, but the ADDR_EXPR does.
+ get_range returns a non-NULL message on failure; on success
+ SRC_RANGE has been written and is used for the warning. */
+ location_t strloc = EXPR_LOCATION (t_addr_string);
+ source_range src_range;
+ substring_loc substr_loc (strloc, TREE_TYPE (t_string),
+ start_idx, end_idx);
+ const char *err = substr_loc.get_range (&src_range);
+ if (err)
+ error_at (strloc, "unable to read substring range: %s", err);
+ else
+ emit_warning (src_range);
+}
+
+/* Walk every basic block of FUN and hand each statement in turn to
+ test_string_literals. Always returns 0. */
+
+unsigned int
+pass_test_string_literals::execute (function *fun)
+{
+ basic_block bb;
+
+ FOR_EACH_BB_FN (bb, fun)
+ {
+ for (gimple_stmt_iterator it = gsi_start_bb (bb);
+ !gsi_end_p (it);
+ gsi_next (&it))
+ test_string_literals (gsi_stmt (it));
+ }
+
+ return 0;
+}
+
+/* Entrypoint for the plugin. Create and register the custom pass.
+ Return 1 if plugin_default_version_check rejects VERSION, without
+ registering anything; otherwise register the pass and return 0.
+ The plugin takes no arguments, so PLUGIN_INFO's argc/argv are not
+ read. */
+
+int
+plugin_init (struct plugin_name_args *plugin_info,
+ struct plugin_gcc_version *version)
+{
+ struct register_pass_info pass_info;
+ const char *plugin_name = plugin_info->base_name;
+
+ if (!plugin_default_version_check (version, &gcc_version))
+ return 1;
+
+ /* Insert the new pass after instance 1 of the pass named "ssa". */
+ pass_info.pass = new pass_test_string_literals (g);
+ pass_info.reference_pass_name = "ssa";
+ pass_info.ref_pass_instance_number = 1;
+ pass_info.pos_op = PASS_POS_INSERT_AFTER;
+ register_callback (plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL,
+ &pass_info);
+
+ return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/plugin/plugin.exp b/gcc/testsuite/gcc.dg/plugin/plugin.exp
index faebb75..715038a 100644
--- a/gcc/testsuite/gcc.dg/plugin/plugin.exp
+++ b/gcc/testsuite/gcc.dg/plugin/plugin.exp
@@ -70,6 +70,9 @@ set plugin_test_list [list \
diagnostic-test-expressions-1.c } \
{ diagnostic_plugin_show_trees.c \
diagnostic-test-show-trees-1.c } \
+ { diagnostic_plugin_test_string_literals.c \
+ diagnostic-test-string-literals-1.c \
+ diagnostic-test-string-literals-2.c } \
{ location_overflow_plugin.c \
location-overflow-test-1.c \
location-overflow-test-2.c } \