diff options
Diffstat (limited to 'libcpp')
-rw-r--r-- | libcpp/charset.c | 63 | ||||
-rw-r--r-- | libcpp/errors.c | 82 | ||||
-rw-r--r-- | libcpp/include/cpplib.h | 76 | ||||
-rw-r--r-- | libcpp/include/line-map.h | 13 | ||||
-rw-r--r-- | libcpp/internal.h | 23 | ||||
-rw-r--r-- | libcpp/lex.c | 38 | ||||
-rw-r--r-- | libcpp/line-map.c | 3 |
7 files changed, 246 insertions, 52 deletions
diff --git a/libcpp/charset.c b/libcpp/charset.c index e4e45f6..0b0ccc6 100644 --- a/libcpp/charset.c +++ b/libcpp/charset.c @@ -1582,12 +1582,14 @@ convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit, "unknown escape sequence: '\\%c'", (int) c); else { + encoding_rich_location rich_loc (pfile); + /* diagnostic.c does not support "%03o". When it does, this code can use %03o directly in the diagnostic again. */ char buf[32]; sprintf(buf, "%03o", (int) c); - cpp_error (pfile, CPP_DL_PEDWARN, - "unknown escape sequence: '\\%s'", buf); + cpp_error_at (pfile, CPP_DL_PEDWARN, &rich_loc, + "unknown escape sequence: '\\%s'", buf); } } @@ -2345,14 +2347,16 @@ cpp_string_location_reader::get_next () } cpp_display_width_computation:: -cpp_display_width_computation (const char *data, int data_length, int tabstop) : +cpp_display_width_computation (const char *data, int data_length, + const cpp_char_column_policy &policy) : m_begin (data), m_next (m_begin), m_bytes_left (data_length), - m_tabstop (tabstop), + m_policy (policy), m_display_cols (0) { - gcc_assert (m_tabstop > 0); + gcc_assert (policy.m_tabstop > 0); + gcc_assert (policy.m_width_cb); } @@ -2364,19 +2368,28 @@ cpp_display_width_computation (const char *data, int data_length, int tabstop) : point to a valid UTF-8-encoded sequence, then it will be treated as a single byte with display width 1. m_cur_display_col is the current display column, relative to which tab stops should be expanded. Returns the display width of - the codepoint just processed. */ + the codepoint just processed. + If OUT is non-NULL, it is populated. */ int -cpp_display_width_computation::process_next_codepoint () +cpp_display_width_computation::process_next_codepoint (cpp_decoded_char *out) { cppchar_t c; int next_width; + if (out) + out->m_start_byte = m_next; + if (*m_next == '\t') { ++m_next; --m_bytes_left; - next_width = m_tabstop - (m_display_cols % m_tabstop); + next_width = m_policy.m_tabstop - (m_display_cols % m_policy.m_tabstop); + if (out) + { + out->m_ch = '\t'; + out->m_valid_ch = true; + } } else if (one_utf8_to_cppchar ((const uchar **) &m_next, &m_bytes_left, &c) != 0) @@ -2386,14 +2399,24 @@ cpp_display_width_computation::process_next_codepoint () of one. */ ++m_next; --m_bytes_left; - next_width = 1; + next_width = m_policy.m_undecoded_byte_width; + if (out) + out->m_valid_ch = false; } else { /* one_utf8_to_cppchar() has updated m_next and m_bytes_left for us. */ - next_width = cpp_wcwidth (c); + next_width = m_policy.m_width_cb (c); + if (out) + { + out->m_ch = c; + out->m_valid_ch = true; + } } + if (out) + out->m_next_byte = m_next; + m_display_cols += next_width; return next_width; } @@ -2409,7 +2432,7 @@ cpp_display_width_computation::advance_display_cols (int n) const int start = m_display_cols; const int target = start + n; while (m_display_cols < target && !done ()) - process_next_codepoint (); + process_next_codepoint (NULL); return m_display_cols - start; } @@ -2417,29 +2440,33 @@ cpp_display_width_computation::advance_display_cols (int n) how many display columns are occupied by the first COLUMN bytes. COLUMN may exceed DATA_LENGTH, in which case the phantom bytes at the end are treated as if they have display width 1. Tabs are expanded to the next tab - stop, relative to the start of DATA. */ + stop, relative to the start of DATA, and non-printable-ASCII characters + will be escaped as per POLICY. */ int cpp_byte_column_to_display_column (const char *data, int data_length, - int column, int tabstop) + int column, + const cpp_char_column_policy &policy) { const int offset = MAX (0, column - data_length); - cpp_display_width_computation dw (data, column - offset, tabstop); + cpp_display_width_computation dw (data, column - offset, policy); while (!dw.done ()) - dw.process_next_codepoint (); + dw.process_next_codepoint (NULL); return dw.display_cols_processed () + offset; } /* For the string of length DATA_LENGTH bytes that begins at DATA, compute the least number of bytes that will result in at least DISPLAY_COL display columns. The return value may exceed DATA_LENGTH if the entire string does - not occupy enough display columns. */ + not occupy enough display columns. Non-printable-ASCII characters + will be escaped as per POLICY. */ int cpp_display_column_to_byte_column (const char *data, int data_length, - int display_col, int tabstop) + int display_col, + const cpp_char_column_policy &policy) { - cpp_display_width_computation dw (data, data_length, tabstop); + cpp_display_width_computation dw (data, data_length, policy); const int avail_display = dw.advance_display_cols (display_col); return dw.bytes_processed () + MAX (0, display_col - avail_display); } diff --git a/libcpp/errors.c b/libcpp/errors.c index 5e1bf33..f34334a 100644 --- a/libcpp/errors.c +++ b/libcpp/errors.c @@ -27,6 +27,31 @@ along with this program; see the file COPYING3. If not see #include "cpplib.h" #include "internal.h" +/* Get a location_t for the current location in PFILE, + generally that of the previously lexed token. */ + +location_t +cpp_diagnostic_get_current_location (cpp_reader *pfile) +{ + if (CPP_OPTION (pfile, traditional)) + { + if (pfile->state.in_directive) + return pfile->directive_line; + else + return pfile->line_table->highest_line; + } + /* We don't want to refer to a token before the beginning of the + current run -- that is invalid. */ + else if (pfile->cur_token == pfile->cur_run->base) + { + return 0; + } + else + { + return pfile->cur_token[-1].src_loc; + } +} + /* Print a diagnostic at the given location. */ ATTRIBUTE_FPTR_PRINTF(5,0) @@ -52,25 +77,7 @@ cpp_diagnostic (cpp_reader * pfile, enum cpp_diagnostic_level level, enum cpp_warning_reason reason, const char *msgid, va_list *ap) { - location_t src_loc; - - if (CPP_OPTION (pfile, traditional)) - { - if (pfile->state.in_directive) - src_loc = pfile->directive_line; - else - src_loc = pfile->line_table->highest_line; - } - /* We don't want to refer to a token before the beginning of the - current run -- that is invalid. */ - else if (pfile->cur_token == pfile->cur_run->base) - { - src_loc = 0; - } - else - { - src_loc = pfile->cur_token[-1].src_loc; - } + location_t src_loc = cpp_diagnostic_get_current_location (pfile); rich_location richloc (pfile->line_table, src_loc); return cpp_diagnostic_at (pfile, level, reason, &richloc, msgid, ap); } @@ -144,6 +151,43 @@ cpp_warning_syshdr (cpp_reader * pfile, enum cpp_warning_reason reason, return ret; } +/* As cpp_warning above, but use RICHLOC as the location of the diagnostic. */ + +bool cpp_warning_at (cpp_reader *pfile, enum cpp_warning_reason reason, + rich_location *richloc, const char *msgid, ...) +{ + va_list ap; + bool ret; + + va_start (ap, msgid); + + ret = cpp_diagnostic_at (pfile, CPP_DL_WARNING, reason, richloc, + msgid, &ap); + + va_end (ap); + return ret; + +} + +/* As cpp_pedwarning above, but use RICHLOC as the location of the + diagnostic. */ + +bool +cpp_pedwarning_at (cpp_reader * pfile, enum cpp_warning_reason reason, + rich_location *richloc, const char *msgid, ...) +{ + va_list ap; + bool ret; + + va_start (ap, msgid); + + ret = cpp_diagnostic_at (pfile, CPP_DL_PEDWARN, reason, richloc, + msgid, &ap); + + va_end (ap); + return ret; +} + /* Print a diagnostic at a specific location. */ ATTRIBUTE_FPTR_PRINTF(6,0) diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index 56b07ac..176f8c5 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -1268,6 +1268,14 @@ extern bool cpp_warning_syshdr (cpp_reader *, enum cpp_warning_reason reason, const char *msgid, ...) ATTRIBUTE_PRINTF_3; +/* As their counterparts above, but use RICHLOC. */ +extern bool cpp_warning_at (cpp_reader *, enum cpp_warning_reason, + rich_location *richloc, const char *msgid, ...) + ATTRIBUTE_PRINTF_4; +extern bool cpp_pedwarning_at (cpp_reader *, enum cpp_warning_reason, + rich_location *richloc, const char *msgid, ...) + ATTRIBUTE_PRINTF_4; + /* Output a diagnostic with "MSGID: " preceding the error string of errno. No location is printed. */ extern bool cpp_errno (cpp_reader *, enum cpp_diagnostic_level, @@ -1442,43 +1450,95 @@ extern const char * cpp_get_userdef_suffix /* In charset.c */ +/* The result of attempting to decode a run of UTF-8 bytes. */ + +struct cpp_decoded_char +{ + const char *m_start_byte; + const char *m_next_byte; + + bool m_valid_ch; + cppchar_t m_ch; +}; + +/* Information for mapping between code points and display columns. + + This is a tabstop value, along with a callback for getting the + widths of characters. Normally this callback is cpp_wcwidth, but we + support other schemes for escaping non-ASCII unicode as a series of + ASCII chars when printing the user's source code in diagnostic-show-locus.c + + For example, consider: + - the Unicode character U+03C0 "GREEK SMALL LETTER PI" (UTF-8: 0xCF 0x80) + - the Unicode character U+1F642 "SLIGHTLY SMILING FACE" + (UTF-8: 0xF0 0x9F 0x99 0x82) + - the byte 0xBF (a stray trailing byte of a UTF-8 character) + Normally U+03C0 would occupy one display column, U+1F642 + would occupy two display columns, and the stray byte would be + printed verbatim as one display column. + + However when escaping them as unicode code points as "<U+03C0>" + and "<U+1F642>" they occupy 8 and 9 display columns respectively, + and when escaping them as bytes as "<CF><80>" and "<F0><9F><99><82>" + they occupy 8 and 16 display columns respectively. In both cases + the stray byte is escaped to <BF> as 4 display columns. */ + +struct cpp_char_column_policy +{ + cpp_char_column_policy (int tabstop, + int (*width_cb) (cppchar_t c)) + : m_tabstop (tabstop), + m_undecoded_byte_width (1), + m_width_cb (width_cb) + {} + + int m_tabstop; + /* Width in display columns of a stray byte that isn't decodable + as UTF-8. */ + int m_undecoded_byte_width; + int (*m_width_cb) (cppchar_t c); +}; + /* A class to manage the state while converting a UTF-8 sequence to cppchar_t and computing the display width one character at a time. */ class cpp_display_width_computation { public: cpp_display_width_computation (const char *data, int data_length, - int tabstop); + const cpp_char_column_policy &policy); const char *next_byte () const { return m_next; } int bytes_processed () const { return m_next - m_begin; } int bytes_left () const { return m_bytes_left; } bool done () const { return !bytes_left (); } int display_cols_processed () const { return m_display_cols; } - int process_next_codepoint (); + int process_next_codepoint (cpp_decoded_char *out); int advance_display_cols (int n); private: const char *const m_begin; const char *m_next; size_t m_bytes_left; - const int m_tabstop; + const cpp_char_column_policy &m_policy; int m_display_cols; }; /* Convenience functions that are simple use cases for class cpp_display_width_computation. Tab characters will be expanded to spaces - as determined by TABSTOP. */ + as determined by POLICY.m_tabstop, and non-printable-ASCII characters + will be escaped as per POLICY. */ int cpp_byte_column_to_display_column (const char *data, int data_length, - int column, int tabstop); + int column, + const cpp_char_column_policy &policy); inline int cpp_display_width (const char *data, int data_length, - int tabstop) + const cpp_char_column_policy &policy) { return cpp_byte_column_to_display_column (data, data_length, data_length, - tabstop); + policy); } int cpp_display_column_to_byte_column (const char *data, int data_length, - int display_col, int tabstop); + int display_col, + const cpp_char_column_policy &policy); int cpp_wcwidth (cppchar_t c); bool cpp_input_conversion_is_trivial (const char *input_charset); diff --git a/libcpp/include/line-map.h b/libcpp/include/line-map.h index 464494b..8b5e2f8 100644 --- a/libcpp/include/line-map.h +++ b/libcpp/include/line-map.h @@ -1787,6 +1787,18 @@ class rich_location const diagnostic_path *get_path () const { return m_path; } void set_path (const diagnostic_path *path) { m_path = path; } + /* A flag for hinting that the diagnostic involves character encoding + issues, and thus that it will be helpful to the user if we show some + representation of how the characters in the pertinent source lines + are encoded. + The default is false (i.e. do not escape). + When set to true, non-ASCII bytes in the pertinent source lines will + be escaped in a manner controlled by the user-supplied option + -fdiagnostics-escape-format=, so that the user can better understand + what's going on with the encoding in their source file. */ + bool escape_on_output_p () const { return m_escape_on_output; } + void set_escape_on_output (bool flag) { m_escape_on_output = flag; } + private: bool reject_impossible_fixit (location_t where); void stop_supporting_fixits (); @@ -1813,6 +1825,7 @@ protected: bool m_fixits_cannot_be_auto_applied; const diagnostic_path *m_path; + bool m_escape_on_output; }; /* A struct for the result of range_label::get_text: a NUL-terminated buffer diff --git a/libcpp/internal.h b/libcpp/internal.h index fd44de6..8577cab 100644 --- a/libcpp/internal.h +++ b/libcpp/internal.h @@ -769,6 +769,9 @@ extern void _cpp_do_file_change (cpp_reader *, enum lc_reason, const char *, extern void _cpp_pop_buffer (cpp_reader *); extern char *_cpp_bracket_include (cpp_reader *); +/* In errors.c */ +extern location_t cpp_diagnostic_get_current_location (cpp_reader *); + /* In traditional.c. */ extern bool _cpp_scan_out_logical_line (cpp_reader *, cpp_macro *, bool); extern bool _cpp_read_logical_line_trad (cpp_reader *); @@ -935,6 +938,26 @@ int linemap_get_expansion_line (class line_maps *, const char* linemap_get_expansion_filename (class line_maps *, location_t); +/* A subclass of rich_location for emitting a diagnostic + at the current location of the reader, but flagging + it with set_escape_on_output (true). */ +class encoding_rich_location : public rich_location +{ + public: + encoding_rich_location (cpp_reader *pfile) + : rich_location (pfile->line_table, + cpp_diagnostic_get_current_location (pfile)) + { + set_escape_on_output (true); + } + + encoding_rich_location (cpp_reader *pfile, location_t loc) + : rich_location (pfile->line_table, loc) + { + set_escape_on_output (true); + } +}; + #ifdef __cplusplus } #endif diff --git a/libcpp/lex.c b/libcpp/lex.c index 8e3ef09..fa2253d 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -1268,7 +1268,11 @@ skip_whitespace (cpp_reader *pfile, cppchar_t c) while (is_nvspace (c)); if (saw_NUL) - cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored"); + { + encoding_rich_location rich_loc (pfile); + cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc, + "null character(s) ignored"); + } buffer->cur--; } @@ -1297,6 +1301,28 @@ warn_about_normalization (cpp_reader *pfile, if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s) && !pfile->state.skipping) { + location_t loc = token->src_loc; + + /* If possible, create a location range for the token. */ + if (loc >= RESERVED_LOCATION_COUNT + && token->type != CPP_EOF + /* There must be no line notes to process. */ + && (!(pfile->buffer->cur + >= pfile->buffer->notes[pfile->buffer->cur_note].pos + && !pfile->overlaid_buffer))) + { + source_range tok_range; + tok_range.m_start = loc; + tok_range.m_finish + = linemap_position_for_column (pfile->line_table, + CPP_BUF_COLUMN (pfile->buffer, + pfile->buffer->cur)); + loc = COMBINE_LOCATION_DATA (pfile->line_table, + loc, tok_range, NULL); + } + + encoding_rich_location rich_loc (pfile, loc); + /* Make sure that the token is printed using UCNs, even if we'd otherwise happily print UTF-8. */ unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token)); @@ -1304,14 +1330,14 @@ warn_about_normalization (cpp_reader *pfile, sz = cpp_spell_token (pfile, token, buf, false) - buf; if (NORMALIZE_STATE_RESULT (s) == normalized_C) - cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, - "`%.*s' is not in NFKC", (int) sz, buf); + cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc, + "`%.*s' is not in NFKC", (int) sz, buf); else if (CPP_OPTION (pfile, cxx23_identifiers)) - cpp_pedwarning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, + cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc, "`%.*s' is not in NFC", (int) sz, buf); else - cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, - "`%.*s' is not in NFC", (int) sz, buf); + cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc, + "`%.*s' is not in NFC", (int) sz, buf); free (buf); } } diff --git a/libcpp/line-map.c b/libcpp/line-map.c index 1a6902a..1957dd7 100644 --- a/libcpp/line-map.c +++ b/libcpp/line-map.c @@ -2086,7 +2086,8 @@ rich_location::rich_location (line_maps *set, location_t loc, m_fixit_hints (), m_seen_impossible_fixit (false), m_fixits_cannot_be_auto_applied (false), - m_path (NULL) + m_path (NULL), + m_escape_on_output (false) { add_range (loc, SHOW_RANGE_WITH_CARET, label); } |