Diffstat (limited to 'gcc/input.cc')
-rw-r--r-- | gcc/input.cc | 3932
1 file changed, 3932 insertions, 0 deletions
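For orientation before the diff body: the main consumer-facing entry points this file defines (expand_location, location_get_source_line, make_location, and the file_cache behind them) are typically used along the following lines. This is only an illustrative sketch; the wrapper function name and the printing logic are hypothetical and not part of this commit, and it assumes the usual GCC internal headers that input.cc itself includes.

/* Hypothetical caller, for illustration only: resolves a location_t and
   prints the corresponding physical source line via the file cache that
   gcc/input.cc maintains.  */
static void
print_source_excerpt (location_t loc)
{
  /* Resolve LOC to a human-readable file/line/column at the macro
     expansion point.  */
  expanded_location xloc = expand_location (loc);
  if (xloc.file == NULL || xloc.line == 0)
    return;

  /* Fetch the physical source line through the diagnostic file cache.
     The returned char_span is not NUL-terminated and stays valid only
     until the next call into the cache.  */
  char_span line = location_get_source_line (xloc.file, xloc.line);
  if (!line)
    return;

  fprintf (stderr, "%s:%d:%d: %.*s\n",
           xloc.file, xloc.line, xloc.column,
           (int) line.length (), line.get_buffer ());
}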
diff --git a/gcc/input.cc b/gcc/input.cc new file mode 100644 index 0000000..82e79be --- /dev/null +++ b/gcc/input.cc @@ -0,0 +1,3932 @@ +/* Data and functions related to line maps and input files. + Copyright (C) 2004-2022 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "intl.h" +#include "diagnostic.h" +#include "selftest.h" +#include "cpplib.h" + +#ifndef HAVE_ICONV +#define HAVE_ICONV 0 +#endif + +/* Input charset configuration. */ +static const char *default_charset_callback (const char *) +{ + return nullptr; +} + +void +file_cache::initialize_input_context (diagnostic_input_charset_callback ccb, + bool should_skip_bom) +{ + in_context.ccb = (ccb ? ccb : default_charset_callback); + in_context.should_skip_bom = should_skip_bom; +} + +/* This is a cache used by get_next_line to store the content of a + file to be searched for file lines. */ +class file_cache_slot +{ +public: + file_cache_slot (); + ~file_cache_slot (); + + bool read_line_num (size_t line_num, + char ** line, ssize_t *line_len); + + /* Accessors. */ + const char *get_file_path () const { return m_file_path; } + unsigned get_use_count () const { return m_use_count; } + bool missing_trailing_newline_p () const + { + return m_missing_trailing_newline; + } + + void inc_use_count () { m_use_count++; } + + bool create (const file_cache::input_context &in_context, + const char *file_path, FILE *fp, unsigned highest_use_count); + void evict (); + + private: + /* These are information used to store a line boundary. */ + class line_info + { + public: + /* The line number. It starts from 1. */ + size_t line_num; + + /* The position (byte count) of the beginning of the line, + relative to the file data pointer. This starts at zero. */ + size_t start_pos; + + /* The position (byte count) of the last byte of the line. This + normally points to the '\n' character, or to one byte after the + last byte of the file, if the file doesn't contain a '\n' + character. */ + size_t end_pos; + + line_info (size_t l, size_t s, size_t e) + : line_num (l), start_pos (s), end_pos (e) + {} + + line_info () + :line_num (0), start_pos (0), end_pos (0) + {} + }; + + bool needs_read_p () const; + bool needs_grow_p () const; + void maybe_grow (); + bool read_data (); + bool maybe_read_data (); + bool get_next_line (char **line, ssize_t *line_len); + bool read_next_line (char ** line, ssize_t *line_len); + bool goto_next_line (); + + static const size_t buffer_size = 4 * 1024; + static const size_t line_record_size = 100; + + /* The number of time this file has been accessed. This is used + to designate which file cache to evict from the cache + array. */ + unsigned m_use_count; + + /* The file_path is the key for identifying a particular file in + the cache. 
+ For libcpp-using code, the underlying buffer for this field is + owned by the corresponding _cpp_file within the cpp_reader. */ + const char *m_file_path; + + FILE *m_fp; + + /* This points to the content of the file that we've read so + far. */ + char *m_data; + + /* The allocated buffer to be freed may start a little earlier than DATA, + e.g. if a UTF8 BOM was skipped at the beginning. */ + int m_alloc_offset; + + /* The size of the DATA array above.*/ + size_t m_size; + + /* The number of bytes read from the underlying file so far. This + must be less (or equal) than SIZE above. */ + size_t m_nb_read; + + /* The index of the beginning of the current line. */ + size_t m_line_start_idx; + + /* The number of the previous line read. This starts at 1. Zero + means we've read no line so far. */ + size_t m_line_num; + + /* This is the total number of lines of the current file. At the + moment, we try to get this information from the line map + subsystem. Note that this is just a hint. When using the C++ + front-end, this hint is correct because the input file is then + completely tokenized before parsing starts; so the line map knows + the number of lines before compilation really starts. For e.g, + the C front-end, it can happen that we start emitting diagnostics + before the line map has seen the end of the file. */ + size_t m_total_lines; + + /* Could this file be missing a trailing newline on its final line? + Initially true (to cope with empty files), set to true/false + as each line is read. */ + bool m_missing_trailing_newline; + + /* This is a record of the beginning and end of the lines we've seen + while reading the file. This is useful to avoid walking the data + from the beginning when we are asked to read a line that is + before LINE_START_IDX above. Note that the maximum size of this + record is line_record_size, so that the memory consumption + doesn't explode. We thus scale total_lines down to + line_record_size. */ + vec<line_info, va_heap> m_line_record; + + void offset_buffer (int offset) + { + gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0 + : (size_t) offset <= m_size); + gcc_assert (m_data); + m_alloc_offset += offset; + m_data += offset; + m_size -= offset; + } + +}; + +/* Current position in real source file. */ + +location_t input_location = UNKNOWN_LOCATION; + +class line_maps *line_table; + +/* A stashed copy of "line_table" for use by selftest::line_table_test. + This needs to be a global so that it can be a GC root, and thus + prevent the stashed copy from being garbage-collected if the GC runs + during a line_table_test. */ + +class line_maps *saved_line_table; + +/* Expand the source location LOC into a human readable location. If + LOC resolves to a builtin location, the file name of the readable + location is set to the string "<built-in>". If EXPANSION_POINT_P is + TRUE and LOC is virtual, then it is resolved to the expansion + point of the involved macro. Otherwise, it is resolved to the + spelling location of the token. + + When resolving to the spelling location of the token, if the + resulting location is for a built-in location (that is, it has no + associated line/column) in the context of a macro expansion, the + returned location is the first one (while unwinding the macro + location towards its expansion point) that is in real source + code. + + ASPECT controls which part of the location to use. 
*/ + +static expanded_location +expand_location_1 (location_t loc, + bool expansion_point_p, + enum location_aspect aspect) +{ + expanded_location xloc; + const line_map_ordinary *map; + enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT; + tree block = NULL; + + if (IS_ADHOC_LOC (loc)) + { + block = LOCATION_BLOCK (loc); + loc = LOCATION_LOCUS (loc); + } + + memset (&xloc, 0, sizeof (xloc)); + + if (loc >= RESERVED_LOCATION_COUNT) + { + if (!expansion_point_p) + { + /* We want to resolve LOC to its spelling location. + + But if that spelling location is a reserved location that + appears in the context of a macro expansion (like for a + location for a built-in token), let's consider the first + location (toward the expansion point) that is not reserved; + that is, the first location that is in real source code. */ + loc = linemap_unwind_to_first_non_reserved_loc (line_table, + loc, NULL); + lrk = LRK_SPELLING_LOCATION; + } + loc = linemap_resolve_location (line_table, loc, lrk, &map); + + /* loc is now either in an ordinary map, or is a reserved location. + If it is a compound location, the caret is in a spelling location, + but the start/finish might still be a virtual location. + Depending of what the caller asked for, we may need to recurse + one level in order to resolve any virtual locations in the + end-points. */ + switch (aspect) + { + default: + gcc_unreachable (); + /* Fall through. */ + case LOCATION_ASPECT_CARET: + break; + case LOCATION_ASPECT_START: + { + location_t start = get_start (loc); + if (start != loc) + return expand_location_1 (start, expansion_point_p, aspect); + } + break; + case LOCATION_ASPECT_FINISH: + { + location_t finish = get_finish (loc); + if (finish != loc) + return expand_location_1 (finish, expansion_point_p, aspect); + } + break; + } + xloc = linemap_expand_location (line_table, map, loc); + } + + xloc.data = block; + if (loc <= BUILTINS_LOCATION) + xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>"); + + return xloc; +} + +/* Initialize the set of cache used for files accessed by caret + diagnostic. */ + +static void +diagnostic_file_cache_init (void) +{ + gcc_assert (global_dc); + if (global_dc->m_file_cache == NULL) + global_dc->m_file_cache = new file_cache (); +} + +/* Free the resources used by the set of cache used for files accessed + by caret diagnostic. */ + +void +diagnostic_file_cache_fini (void) +{ + if (global_dc->m_file_cache) + { + delete global_dc->m_file_cache; + global_dc->m_file_cache = NULL; + } +} + +/* Return the total lines number that have been read so far by the + line map (in the preprocessor) so far. For languages like C++ that + entirely preprocess the input file before starting to parse, this + equals the actual number of lines of the file. */ + +static size_t +total_lines_num (const char *file_path) +{ + size_t r = 0; + location_t l = 0; + if (linemap_get_file_highest_location (line_table, file_path, &l)) + { + gcc_assert (l >= RESERVED_LOCATION_COUNT); + expanded_location xloc = expand_location (l); + r = xloc.line; + } + return r; +} + +/* Lookup the cache used for the content of a given file accessed by + caret diagnostic. Return the found cached file, or NULL if no + cached file was found. */ + +file_cache_slot * +file_cache::lookup_file (const char *file_path) +{ + gcc_assert (file_path); + + /* This will contain the found cached file. 
*/ + file_cache_slot *r = NULL; + for (unsigned i = 0; i < num_file_slots; ++i) + { + file_cache_slot *c = &m_file_slots[i]; + if (c->get_file_path () && !strcmp (c->get_file_path (), file_path)) + { + c->inc_use_count (); + r = c; + } + } + + if (r) + r->inc_use_count (); + + return r; +} + +/* Purge any mention of FILENAME from the cache of files used for + printing source code. For use in selftests when working + with tempfiles. */ + +void +diagnostics_file_cache_forcibly_evict_file (const char *file_path) +{ + gcc_assert (file_path); + + if (!global_dc->m_file_cache) + return; + + global_dc->m_file_cache->forcibly_evict_file (file_path); +} + +void +file_cache::forcibly_evict_file (const char *file_path) +{ + gcc_assert (file_path); + + file_cache_slot *r = lookup_file (file_path); + if (!r) + /* Not found. */ + return; + + r->evict (); +} + +void +file_cache_slot::evict () +{ + m_file_path = NULL; + if (m_fp) + fclose (m_fp); + m_fp = NULL; + m_nb_read = 0; + m_line_start_idx = 0; + m_line_num = 0; + m_line_record.truncate (0); + m_use_count = 0; + m_total_lines = 0; + m_missing_trailing_newline = true; +} + +/* Return the file cache that has been less used, recently, or the + first empty one. If HIGHEST_USE_COUNT is non-null, + *HIGHEST_USE_COUNT is set to the highest use count of the entries + in the cache table. */ + +file_cache_slot* +file_cache::evicted_cache_tab_entry (unsigned *highest_use_count) +{ + diagnostic_file_cache_init (); + + file_cache_slot *to_evict = &m_file_slots[0]; + unsigned huc = to_evict->get_use_count (); + for (unsigned i = 1; i < num_file_slots; ++i) + { + file_cache_slot *c = &m_file_slots[i]; + bool c_is_empty = (c->get_file_path () == NULL); + + if (c->get_use_count () < to_evict->get_use_count () + || (to_evict->get_file_path () && c_is_empty)) + /* We evict C because it's either an entry with a lower use + count or one that is empty. */ + to_evict = c; + + if (huc < c->get_use_count ()) + huc = c->get_use_count (); + + if (c_is_empty) + /* We've reached the end of the cache; subsequent elements are + all empty. */ + break; + } + + if (highest_use_count) + *highest_use_count = huc; + + return to_evict; +} + +/* Create the cache used for the content of a given file to be + accessed by caret diagnostic. This cache is added to an array of + cache and can be retrieved by lookup_file_in_cache_tab. This + function returns the created cache. Note that only the last + num_file_slots files are cached. */ + +file_cache_slot* +file_cache::add_file (const char *file_path) +{ + + FILE *fp = fopen (file_path, "r"); + if (fp == NULL) + return NULL; + + unsigned highest_use_count = 0; + file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count); + if (!r->create (in_context, file_path, fp, highest_use_count)) + return NULL; + return r; +} + +/* Populate this slot for use on FILE_PATH and FP, dropping any + existing cached content within it. */ + +bool +file_cache_slot::create (const file_cache::input_context &in_context, + const char *file_path, FILE *fp, + unsigned highest_use_count) +{ + m_file_path = file_path; + if (m_fp) + fclose (m_fp); + m_fp = fp; + if (m_alloc_offset) + offset_buffer (-m_alloc_offset); + m_nb_read = 0; + m_line_start_idx = 0; + m_line_num = 0; + m_line_record.truncate (0); + /* Ensure that this cache entry doesn't get evicted next time + add_file_to_cache_tab is called. 
*/ + m_use_count = ++highest_use_count; + m_total_lines = total_lines_num (file_path); + m_missing_trailing_newline = true; + + + /* Check the input configuration to determine if we need to do any + transformations, such as charset conversion or BOM skipping. */ + if (const char *input_charset = in_context.ccb (file_path)) + { + /* Need a full-blown conversion of the input charset. */ + fclose (m_fp); + m_fp = NULL; + const cpp_converted_source cs + = cpp_get_converted_source (file_path, input_charset); + if (!cs.data) + return false; + if (m_data) + XDELETEVEC (m_data); + m_data = cs.data; + m_nb_read = m_size = cs.len; + m_alloc_offset = cs.data - cs.to_free; + } + else if (in_context.should_skip_bom) + { + if (read_data ()) + { + const int offset = cpp_check_utf8_bom (m_data, m_nb_read); + offset_buffer (offset); + m_nb_read -= offset; + } + } + + return true; +} + +/* file_cache's ctor. */ + +file_cache::file_cache () +: m_file_slots (new file_cache_slot[num_file_slots]) +{ + initialize_input_context (nullptr, false); +} + +/* file_cache's dtor. */ + +file_cache::~file_cache () +{ + delete[] m_file_slots; +} + +/* Lookup the cache used for the content of a given file accessed by + caret diagnostic. If no cached file was found, create a new cache + for this file, add it to the array of cached file and return + it. */ + +file_cache_slot* +file_cache::lookup_or_add_file (const char *file_path) +{ + file_cache_slot *r = lookup_file (file_path); + if (r == NULL) + r = add_file (file_path); + return r; +} + +/* Default constructor for a cache of file used by caret + diagnostic. */ + +file_cache_slot::file_cache_slot () +: m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0), + m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0), + m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true) +{ + m_line_record.create (0); +} + +/* Destructor for a cache of file used by caret diagnostic. */ + +file_cache_slot::~file_cache_slot () +{ + if (m_fp) + { + fclose (m_fp); + m_fp = NULL; + } + if (m_data) + { + offset_buffer (-m_alloc_offset); + XDELETEVEC (m_data); + m_data = 0; + } + m_line_record.release (); +} + +/* Returns TRUE iff the cache would need to be filled with data coming + from the file. That is, either the cache is empty or full or the + current line is empty. Note that if the cache is full, it would + need to be extended and filled again. */ + +bool +file_cache_slot::needs_read_p () const +{ + return m_fp && (m_nb_read == 0 + || m_nb_read == m_size + || (m_line_start_idx >= m_nb_read - 1)); +} + +/* Return TRUE iff the cache is full and thus needs to be + extended. */ + +bool +file_cache_slot::needs_grow_p () const +{ + return m_nb_read == m_size; +} + +/* Grow the cache if it needs to be extended. */ + +void +file_cache_slot::maybe_grow () +{ + if (!needs_grow_p ()) + return; + + if (!m_data) + { + gcc_assert (m_size == 0 && m_alloc_offset == 0); + m_size = buffer_size; + m_data = XNEWVEC (char, m_size); + } + else + { + const int offset = m_alloc_offset; + offset_buffer (-offset); + m_size *= 2; + m_data = XRESIZEVEC (char, m_data, m_size); + offset_buffer (offset); + } +} + +/* Read more data into the cache. Extends the cache if need be. + Returns TRUE iff new data could be read. 
*/ + +bool +file_cache_slot::read_data () +{ + if (feof (m_fp) || ferror (m_fp)) + return false; + + maybe_grow (); + + char * from = m_data + m_nb_read; + size_t to_read = m_size - m_nb_read; + size_t nb_read = fread (from, 1, to_read, m_fp); + + if (ferror (m_fp)) + return false; + + m_nb_read += nb_read; + return !!nb_read; +} + +/* Read new data iff the cache needs to be filled with more data + coming from the file FP. Return TRUE iff the cache was filled with + mode data. */ + +bool +file_cache_slot::maybe_read_data () +{ + if (!needs_read_p ()) + return false; + return read_data (); +} + +/* Read a new line from file FP, using C as a cache for the data + coming from the file. Upon successful completion, *LINE is set to + the beginning of the line found. *LINE points directly in the + line cache and is only valid until the next call of get_next_line. + *LINE_LEN is set to the length of the line. Note that the line + does not contain any terminal delimiter. This function returns + true if some data was read or process from the cache, false + otherwise. Note that subsequent calls to get_next_line might + make the content of *LINE invalid. */ + +bool +file_cache_slot::get_next_line (char **line, ssize_t *line_len) +{ + /* Fill the cache with data to process. */ + maybe_read_data (); + + size_t remaining_size = m_nb_read - m_line_start_idx; + if (remaining_size == 0) + /* There is no more data to process. */ + return false; + + char *line_start = m_data + m_line_start_idx; + + char *next_line_start = NULL; + size_t len = 0; + char *line_end = (char *) memchr (line_start, '\n', remaining_size); + if (line_end == NULL) + { + /* We haven't found the end-of-line delimiter in the cache. + Fill the cache with more data from the file and look for the + '\n'. */ + while (maybe_read_data ()) + { + line_start = m_data + m_line_start_idx; + remaining_size = m_nb_read - m_line_start_idx; + line_end = (char *) memchr (line_start, '\n', remaining_size); + if (line_end != NULL) + { + next_line_start = line_end + 1; + break; + } + } + if (line_end == NULL) + { + /* We've loadded all the file into the cache and still no + '\n'. Let's say the line ends up at one byte passed the + end of the file. This is to stay consistent with the case + of when the line ends up with a '\n' and line_end points to + that terminal '\n'. That consistency is useful below in + the len calculation. */ + line_end = m_data + m_nb_read ; + m_missing_trailing_newline = true; + } + else + m_missing_trailing_newline = false; + } + else + { + next_line_start = line_end + 1; + m_missing_trailing_newline = false; + } + + if (m_fp && ferror (m_fp)) + return false; + + /* At this point, we've found the end of the of line. It either + points to the '\n' or to one byte after the last byte of the + file. */ + gcc_assert (line_end != NULL); + + len = line_end - line_start; + + if (m_line_start_idx < m_nb_read) + *line = line_start; + + ++m_line_num; + + /* Before we update our line record, make sure the hint about the + total number of lines of the file is correct. If it's not, then + we give up recording line boundaries from now on. */ + bool update_line_record = true; + if (m_line_num > m_total_lines) + update_line_record = false; + + /* Now update our line record so that re-reading lines from the + before m_line_start_idx is faster. 
*/ + if (update_line_record + && m_line_record.length () < line_record_size) + { + /* If the file lines fits in the line record, we just record all + its lines ...*/ + if (m_total_lines <= line_record_size + && m_line_num > m_line_record.length ()) + m_line_record.safe_push + (file_cache_slot::line_info (m_line_num, + m_line_start_idx, + line_end - m_data)); + else if (m_total_lines > line_record_size) + { + /* ... otherwise, we just scale total_lines down to + (line_record_size lines. */ + size_t n = (m_line_num * line_record_size) / m_total_lines; + if (m_line_record.length () == 0 + || n >= m_line_record.length ()) + m_line_record.safe_push + (file_cache_slot::line_info (m_line_num, + m_line_start_idx, + line_end - m_data)); + } + } + + /* Update m_line_start_idx so that it points to the next line to be + read. */ + if (next_line_start) + m_line_start_idx = next_line_start - m_data; + else + /* We didn't find any terminal '\n'. Let's consider that the end + of line is the end of the data in the cache. The next + invocation of get_next_line will either read more data from the + underlying file or return false early because we've reached the + end of the file. */ + m_line_start_idx = m_nb_read; + + *line_len = len; + + return true; +} + +/* Consume the next bytes coming from the cache (or from its + underlying file if there are remaining unread bytes in the file) + until we reach the next end-of-line (or end-of-file). There is no + copying from the cache involved. Return TRUE upon successful + completion. */ + +bool +file_cache_slot::goto_next_line () +{ + char *l; + ssize_t len; + + return get_next_line (&l, &len); +} + +/* Read an arbitrary line number LINE_NUM from the file cached in C. + If the line was read successfully, *LINE points to the beginning + of the line in the file cache and *LINE_LEN is the length of the + line. *LINE is not nul-terminated, but may contain zero bytes. + *LINE is only valid until the next call of read_line_num. + This function returns bool if a line was read. */ + +bool +file_cache_slot::read_line_num (size_t line_num, + char ** line, ssize_t *line_len) +{ + gcc_assert (line_num > 0); + + if (line_num <= m_line_num) + { + /* We've been asked to read lines that are before m_line_num. + So lets use our line record (if it's not empty) to try to + avoid re-reading the file from the beginning again. */ + + if (m_line_record.is_empty ()) + { + m_line_start_idx = 0; + m_line_num = 0; + } + else + { + file_cache_slot::line_info *i = NULL; + if (m_total_lines <= line_record_size) + { + /* In languages where the input file is not totally + preprocessed up front, the m_total_lines hint + can be smaller than the number of lines of the + file. In that case, only the first + m_total_lines have been recorded. + + Otherwise, the first m_total_lines we've read have + their start/end recorded here. */ + i = (line_num <= m_total_lines) + ? &m_line_record[line_num - 1] + : &m_line_record[m_total_lines - 1]; + gcc_assert (i->line_num <= line_num); + } + else + { + /* So the file had more lines than our line record + size. Thus the number of lines we've recorded has + been scaled down to line_record_size. Let's + pick the start/end of the recorded line that is + closest to line_num. */ + size_t n = (line_num <= m_total_lines) + ? 
line_num * line_record_size / m_total_lines + : m_line_record.length () - 1; + if (n < m_line_record.length ()) + { + i = &m_line_record[n]; + gcc_assert (i->line_num <= line_num); + } + } + + if (i && i->line_num == line_num) + { + /* We have the start/end of the line. */ + *line = m_data + i->start_pos; + *line_len = i->end_pos - i->start_pos; + return true; + } + + if (i) + { + m_line_start_idx = i->start_pos; + m_line_num = i->line_num - 1; + } + else + { + m_line_start_idx = 0; + m_line_num = 0; + } + } + } + + /* Let's walk from line m_line_num up to line_num - 1, without + copying any line. */ + while (m_line_num < line_num - 1) + if (!goto_next_line ()) + return false; + + /* The line we want is the next one. Let's read and copy it back to + the caller. */ + return get_next_line (line, line_len); +} + +/* Return the physical source line that corresponds to FILE_PATH/LINE. + The line is not nul-terminated. The returned pointer is only + valid until the next call of location_get_source_line. + Note that the line can contain several null characters, + so the returned value's length has the actual length of the line. + If the function fails, a NULL char_span is returned. */ + +char_span +location_get_source_line (const char *file_path, int line) +{ + char *buffer = NULL; + ssize_t len; + + if (line == 0) + return char_span (NULL, 0); + + if (file_path == NULL) + return char_span (NULL, 0); + + diagnostic_file_cache_init (); + + file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path); + if (c == NULL) + return char_span (NULL, 0); + + bool read = c->read_line_num (line, &buffer, &len); + if (!read) + return char_span (NULL, 0); + + return char_span (buffer, len); +} + +/* Determine if FILE_PATH missing a trailing newline on its final line. + Only valid to call once all of the file has been loaded, by + requesting a line number beyond the end of the file. */ + +bool +location_missing_trailing_newline (const char *file_path) +{ + diagnostic_file_cache_init (); + + file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path); + if (c == NULL) + return false; + + return c->missing_trailing_newline_p (); +} + +/* Test if the location originates from the spelling location of a + builtin-tokens. That is, return TRUE if LOC is a (possibly + virtual) location of a built-in token that appears in the expansion + list of a macro. Please note that this function also works on + tokens that result from built-in tokens. For instance, the + function would return true if passed a token "4" that is the result + of the expansion of the built-in __LINE__ macro. */ +bool +is_location_from_builtin_token (location_t loc) +{ + const line_map_ordinary *map = NULL; + loc = linemap_resolve_location (line_table, loc, + LRK_SPELLING_LOCATION, &map); + return loc == BUILTINS_LOCATION; +} + +/* Expand the source location LOC into a human readable location. If + LOC is virtual, it resolves to the expansion point of the involved + macro. If LOC resolves to a builtin location, the file name of the + readable location is set to the string "<built-in>". */ + +expanded_location +expand_location (location_t loc) +{ + return expand_location_1 (loc, /*expansion_point_p=*/true, + LOCATION_ASPECT_CARET); +} + +/* Expand the source location LOC into a human readable location. If + LOC is virtual, it resolves to the expansion location of the + relevant macro. If LOC resolves to a builtin location, the file + name of the readable location is set to the string + "<built-in>". 
*/ + +expanded_location +expand_location_to_spelling_point (location_t loc, + enum location_aspect aspect) +{ + return expand_location_1 (loc, /*expansion_point_p=*/false, aspect); +} + +/* The rich_location class within libcpp requires a way to expand + location_t instances, and relies on the client code + providing a symbol named + linemap_client_expand_location_to_spelling_point + to do this. + + This is the implementation for libcommon.a (all host binaries), + which simply calls into expand_location_1. */ + +expanded_location +linemap_client_expand_location_to_spelling_point (location_t loc, + enum location_aspect aspect) +{ + return expand_location_1 (loc, /*expansion_point_p=*/false, aspect); +} + + +/* If LOCATION is in a system header and if it is a virtual location + for a token coming from the expansion of a macro, unwind it to + the location of the expansion point of the macro. If the expansion + point is also in a system header return the original LOCATION. + Otherwise, return the location of the expansion point. + + This is used for instance when we want to emit diagnostics about a + token that may be located in a macro that is itself defined in a + system header, for example, for the NULL macro. In such a case, if + LOCATION were passed directly to diagnostic functions such as + warning_at, the diagnostic would be suppressed (unless + -Wsystem-headers). */ + +location_t +expansion_point_location_if_in_system_header (location_t location) +{ + if (!in_system_header_at (location)) + return location; + + location_t xloc = linemap_resolve_location (line_table, location, + LRK_MACRO_EXPANSION_POINT, + NULL); + return in_system_header_at (xloc) ? location : xloc; +} + +/* If LOCATION is a virtual location for a token coming from the expansion + of a macro, unwind to the location of the expansion point of the macro. */ + +location_t +expansion_point_location (location_t location) +{ + return linemap_resolve_location (line_table, location, + LRK_MACRO_EXPANSION_POINT, NULL); +} + +/* Construct a location with caret at CARET, ranging from START to + finish e.g. + + 11111111112 + 12345678901234567890 + 522 + 523 return foo + bar; + ~~~~^~~~~ + 524 + + The location's caret is at the "+", line 523 column 15, but starts + earlier, at the "f" of "foo" at column 11. The finish is at the "r" + of "bar" at column 19. */ + +location_t +make_location (location_t caret, location_t start, location_t finish) +{ + location_t pure_loc = get_pure_location (caret); + source_range src_range; + src_range.m_start = get_start (start); + src_range.m_finish = get_finish (finish); + location_t combined_loc = COMBINE_LOCATION_DATA (line_table, + pure_loc, + src_range, + NULL); + return combined_loc; +} + +/* Same as above, but taking a source range rather than two locations. */ + +location_t +make_location (location_t caret, source_range src_range) +{ + location_t pure_loc = get_pure_location (caret); + return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL); +} + +/* An expanded_location stores the column in byte units. This function + converts that column to display units. That requires reading the associated + source line in order to calculate the display width. If that cannot be done + for any reason, then returns the byte column as a fallback. 
*/ +int +location_compute_display_column (expanded_location exploc, + const cpp_char_column_policy &policy) +{ + if (!(exploc.file && *exploc.file && exploc.line && exploc.column)) + return exploc.column; + char_span line = location_get_source_line (exploc.file, exploc.line); + /* If line is NULL, this function returns exploc.column which is the + desired fallback. */ + return cpp_byte_column_to_display_column (line.get_buffer (), line.length (), + exploc.column, policy); +} + +/* Dump statistics to stderr about the memory usage of the line_table + set of line maps. This also displays some statistics about macro + expansion. */ + +void +dump_line_table_statistics (void) +{ + struct linemap_stats s; + long total_used_map_size, + macro_maps_size, + total_allocated_map_size; + + memset (&s, 0, sizeof (s)); + + linemap_get_statistics (line_table, &s); + + macro_maps_size = s.macro_maps_used_size + + s.macro_maps_locations_size; + + total_allocated_map_size = s.ordinary_maps_allocated_size + + s.macro_maps_allocated_size + + s.macro_maps_locations_size; + + total_used_map_size = s.ordinary_maps_used_size + + s.macro_maps_used_size + + s.macro_maps_locations_size; + + fprintf (stderr, "Number of expanded macros: %5ld\n", + s.num_expanded_macros); + if (s.num_expanded_macros != 0) + fprintf (stderr, "Average number of tokens per macro expansion: %5ld\n", + s.num_macro_tokens / s.num_expanded_macros); + fprintf (stderr, + "\nLine Table allocations during the " + "compilation process\n"); + fprintf (stderr, "Number of ordinary maps used: " PRsa (5) "\n", + SIZE_AMOUNT (s.num_ordinary_maps_used)); + fprintf (stderr, "Ordinary map used size: " PRsa (5) "\n", + SIZE_AMOUNT (s.ordinary_maps_used_size)); + fprintf (stderr, "Number of ordinary maps allocated: " PRsa (5) "\n", + SIZE_AMOUNT (s.num_ordinary_maps_allocated)); + fprintf (stderr, "Ordinary maps allocated size: " PRsa (5) "\n", + SIZE_AMOUNT (s.ordinary_maps_allocated_size)); + fprintf (stderr, "Number of macro maps used: " PRsa (5) "\n", + SIZE_AMOUNT (s.num_macro_maps_used)); + fprintf (stderr, "Macro maps used size: " PRsa (5) "\n", + SIZE_AMOUNT (s.macro_maps_used_size)); + fprintf (stderr, "Macro maps locations size: " PRsa (5) "\n", + SIZE_AMOUNT (s.macro_maps_locations_size)); + fprintf (stderr, "Macro maps size: " PRsa (5) "\n", + SIZE_AMOUNT (macro_maps_size)); + fprintf (stderr, "Duplicated maps locations size: " PRsa (5) "\n", + SIZE_AMOUNT (s.duplicated_macro_maps_locations_size)); + fprintf (stderr, "Total allocated maps size: " PRsa (5) "\n", + SIZE_AMOUNT (total_allocated_map_size)); + fprintf (stderr, "Total used maps size: " PRsa (5) "\n", + SIZE_AMOUNT (total_used_map_size)); + fprintf (stderr, "Ad-hoc table size: " PRsa (5) "\n", + SIZE_AMOUNT (s.adhoc_table_size)); + fprintf (stderr, "Ad-hoc table entries used: " PRsa (5) "\n", + SIZE_AMOUNT (s.adhoc_table_entries_used)); + fprintf (stderr, "optimized_ranges: " PRsa (5) "\n", + SIZE_AMOUNT (line_table->num_optimized_ranges)); + fprintf (stderr, "unoptimized_ranges: " PRsa (5) "\n", + SIZE_AMOUNT (line_table->num_unoptimized_ranges)); + + fprintf (stderr, "\n"); +} + +/* Get location one beyond the final location in ordinary map IDX. */ + +static location_t +get_end_location (class line_maps *set, unsigned int idx) +{ + if (idx == LINEMAPS_ORDINARY_USED (set) - 1) + return set->highest_location; + + struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1); + return MAP_START_LOCATION (next_map); +} + +/* Helper function for write_digit_row. 
*/ + +static void +write_digit (FILE *stream, int digit) +{ + fputc ('0' + (digit % 10), stream); +} + +/* Helper function for dump_location_info. + Write a row of numbers to STREAM, numbering a source line, + giving the units, tens, hundreds etc of the column number. */ + +static void +write_digit_row (FILE *stream, int indent, + const line_map_ordinary *map, + location_t loc, int max_col, int divisor) +{ + fprintf (stream, "%*c", indent, ' '); + fprintf (stream, "|"); + for (int column = 1; column < max_col; column++) + { + location_t column_loc = loc + (column << map->m_range_bits); + write_digit (stream, column_loc / divisor); + } + fprintf (stream, "\n"); +} + +/* Write a half-closed (START) / half-open (END) interval of + location_t to STREAM. */ + +static void +dump_location_range (FILE *stream, + location_t start, location_t end) +{ + fprintf (stream, + " location_t interval: %u <= loc < %u\n", + start, end); +} + +/* Write a labelled description of a half-closed (START) / half-open (END) + interval of location_t to STREAM. */ + +static void +dump_labelled_location_range (FILE *stream, + const char *name, + location_t start, location_t end) +{ + fprintf (stream, "%s\n", name); + dump_location_range (stream, start, end); + fprintf (stream, "\n"); +} + +/* Write a visualization of the locations in the line_table to STREAM. */ + +void +dump_location_info (FILE *stream) +{ + /* Visualize the reserved locations. */ + dump_labelled_location_range (stream, "RESERVED LOCATIONS", + 0, RESERVED_LOCATION_COUNT); + + /* Visualize the ordinary line_map instances, rendering the sources. */ + for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++) + { + location_t end_location = get_end_location (line_table, idx); + /* half-closed: doesn't include this one. */ + + const line_map_ordinary *map + = LINEMAPS_ORDINARY_MAP_AT (line_table, idx); + fprintf (stream, "ORDINARY MAP: %i\n", idx); + dump_location_range (stream, + MAP_START_LOCATION (map), end_location); + fprintf (stream, " file: %s\n", ORDINARY_MAP_FILE_NAME (map)); + fprintf (stream, " starting at line: %i\n", + ORDINARY_MAP_STARTING_LINE_NUMBER (map)); + fprintf (stream, " column and range bits: %i\n", + map->m_column_and_range_bits); + fprintf (stream, " column bits: %i\n", + map->m_column_and_range_bits - map->m_range_bits); + fprintf (stream, " range bits: %i\n", + map->m_range_bits); + const char * reason; + switch (map->reason) { + case LC_ENTER: + reason = "LC_ENTER"; + break; + case LC_LEAVE: + reason = "LC_LEAVE"; + break; + case LC_RENAME: + reason = "LC_RENAME"; + break; + case LC_RENAME_VERBATIM: + reason = "LC_RENAME_VERBATIM"; + break; + case LC_ENTER_MACRO: + reason = "LC_RENAME_MACRO"; + break; + default: + reason = "Unknown"; + } + fprintf (stream, " reason: %d (%s)\n", map->reason, reason); + + const line_map_ordinary *includer_map + = linemap_included_from_linemap (line_table, map); + fprintf (stream, " included from location: %d", + linemap_included_from (map)); + if (includer_map) { + fprintf (stream, " (in ordinary map %d)", + int (includer_map - line_table->info_ordinary.maps)); + } + fprintf (stream, "\n"); + + /* Render the span of source lines that this "map" covers. */ + for (location_t loc = MAP_START_LOCATION (map); + loc < end_location; + loc += (1 << map->m_range_bits) ) + { + gcc_assert (pure_location_p (line_table, loc) ); + + expanded_location exploc + = linemap_expand_location (line_table, map, loc); + + if (exploc.column == 0) + { + /* Beginning of a new source line: draw the line. 
*/ + + char_span line_text = location_get_source_line (exploc.file, + exploc.line); + if (!line_text) + break; + fprintf (stream, + "%s:%3i|loc:%5i|%.*s\n", + exploc.file, exploc.line, + loc, + (int)line_text.length (), line_text.get_buffer ()); + + /* "loc" is at column 0, which means "the whole line". + Render the locations *within* the line, by underlining + it, showing the location_t numeric values + at each column. */ + size_t max_col = (1 << map->m_column_and_range_bits) - 1; + if (max_col > line_text.length ()) + max_col = line_text.length () + 1; + + int len_lnum = num_digits (exploc.line); + if (len_lnum < 3) + len_lnum = 3; + int len_loc = num_digits (loc); + if (len_loc < 5) + len_loc = 5; + + int indent = 6 + strlen (exploc.file) + len_lnum + len_loc; + + /* Thousands. */ + if (end_location > 999) + write_digit_row (stream, indent, map, loc, max_col, 1000); + + /* Hundreds. */ + if (end_location > 99) + write_digit_row (stream, indent, map, loc, max_col, 100); + + /* Tens. */ + write_digit_row (stream, indent, map, loc, max_col, 10); + + /* Units. */ + write_digit_row (stream, indent, map, loc, max_col, 1); + } + } + fprintf (stream, "\n"); + } + + /* Visualize unallocated values. */ + dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS", + line_table->highest_location, + LINEMAPS_MACRO_LOWEST_LOCATION (line_table)); + + /* Visualize the macro line_map instances, rendering the sources. */ + for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++) + { + /* Each macro map that is allocated owns location_t values + that are *lower* that the one before them. + Hence it's meaningful to view them either in order of ascending + source locations, or in order of ascending macro map index. */ + const bool ascending_location_ts = true; + unsigned int idx = (ascending_location_ts + ? (LINEMAPS_MACRO_USED (line_table) - (i + 1)) + : i); + const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx); + fprintf (stream, "MACRO %i: %s (%u tokens)\n", + idx, + linemap_map_get_macro_name (map), + MACRO_MAP_NUM_MACRO_TOKENS (map)); + dump_location_range (stream, + map->start_location, + (map->start_location + + MACRO_MAP_NUM_MACRO_TOKENS (map))); + inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map), + "expansion point is location %i", + MACRO_MAP_EXPANSION_POINT_LOCATION (map)); + fprintf (stream, " map->start_location: %u\n", + map->start_location); + + fprintf (stream, " macro_locations:\n"); + for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++) + { + location_t x = MACRO_MAP_LOCATIONS (map)[2 * i]; + location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1]; + + /* linemap_add_macro_token encodes token numbers in an expansion + by putting them after MAP_START_LOCATION. */ + + /* I'm typically seeing 4 uninitialized entries at the end of + 0xafafafaf. + This appears to be due to macro.c:replace_args + adding 2 extra args for padding tokens; presumably there may + be a leading and/or trailing padding token injected, + each for 2 more location slots. + This would explain there being up to 4 location_ts slots + that may be uninitialized. 
*/ + + fprintf (stream, " %u: %u, %u\n", + i, + x, + y); + if (x == y) + { + if (x < MAP_START_LOCATION (map)) + inform (x, "token %u has %<x-location == y-location == %u%>", + i, x); + else + fprintf (stream, + "x-location == y-location == %u encodes token # %u\n", + x, x - MAP_START_LOCATION (map)); + } + else + { + inform (x, "token %u has %<x-location == %u%>", i, x); + inform (x, "token %u has %<y-location == %u%>", i, y); + } + } + fprintf (stream, "\n"); + } + + /* It appears that MAX_LOCATION_T itself is never assigned to a + macro map, presumably due to an off-by-one error somewhere + between the logic in linemap_enter_macro and + LINEMAPS_MACRO_LOWEST_LOCATION. */ + dump_labelled_location_range (stream, "MAX_LOCATION_T", + MAX_LOCATION_T, + MAX_LOCATION_T + 1); + + /* Visualize ad-hoc values. */ + dump_labelled_location_range (stream, "AD-HOC LOCATIONS", + MAX_LOCATION_T + 1, UINT_MAX); +} + +/* string_concat's constructor. */ + +string_concat::string_concat (int num, location_t *locs) + : m_num (num) +{ + m_locs = ggc_vec_alloc <location_t> (num); + for (int i = 0; i < num; i++) + m_locs[i] = locs[i]; +} + +/* string_concat_db's constructor. */ + +string_concat_db::string_concat_db () +{ + m_table = hash_map <location_hash, string_concat *>::create_ggc (64); +} + +/* Record that a string concatenation occurred, covering NUM + string literal tokens. LOCS is an array of size NUM, containing the + locations of the tokens. A copy of LOCS is taken. */ + +void +string_concat_db::record_string_concatenation (int num, location_t *locs) +{ + gcc_assert (num > 1); + gcc_assert (locs); + + location_t key_loc = get_key_loc (locs[0]); + /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values: + any data now recorded under key 'key_loc' would be overwritten by a + subsequent call with the same key 'key_loc'. */ + if (RESERVED_LOCATION_P (key_loc)) + return; + + string_concat *concat + = new (ggc_alloc <string_concat> ()) string_concat (num, locs); + m_table->put (key_loc, concat); +} + +/* Determine if LOC was the location of the initial token of a + concatenation of string literal tokens. + If so, *OUT_NUM is written to with the number of tokens, and + *OUT_LOCS with the location of an array of locations of the + tokens, and return true. *OUT_LOCS is a borrowed pointer to + storage owned by the string_concat_db. + Otherwise, return false. */ + +bool +string_concat_db::get_string_concatenation (location_t loc, + int *out_num, + location_t **out_locs) +{ + gcc_assert (out_num); + gcc_assert (out_locs); + + location_t key_loc = get_key_loc (loc); + /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values; see + discussion in 'string_concat_db::record_string_concatenation'. */ + if (RESERVED_LOCATION_P (key_loc)) + return false; + + string_concat **concat = m_table->get (key_loc); + if (!concat) + return false; + + *out_num = (*concat)->m_num; + *out_locs =(*concat)->m_locs; + return true; +} + +/* Internal function. Canonicalize LOC into a form suitable for + use as a key within the database, stripping away macro expansion, + ad-hoc information, and range information, using the location of + the start of LOC within an ordinary linemap. */ + +location_t +string_concat_db::get_key_loc (location_t loc) +{ + loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION, + NULL); + + loc = get_range_from_loc (line_table, loc).m_start; + + return loc; +} + +/* Helper class for use within get_substring_ranges_for_loc. 
+ An vec of cpp_string with responsibility for releasing all of the + str->text for each str in the vector. */ + +class auto_cpp_string_vec : public auto_vec <cpp_string> +{ + public: + auto_cpp_string_vec (int alloc) + : auto_vec <cpp_string> (alloc) {} + + ~auto_cpp_string_vec () + { + /* Clean up the copies within this vec. */ + int i; + cpp_string *str; + FOR_EACH_VEC_ELT (*this, i, str) + free (const_cast <unsigned char *> (str->text)); + } +}; + +/* Attempt to populate RANGES with source location information on the + individual characters within the string literal found at STRLOC. + If CONCATS is non-NULL, then any string literals that the token at + STRLOC was concatenated with are also added to RANGES. + + Return NULL if successful, or an error message if any errors occurred (in + which case RANGES may be only partially populated and should not + be used). + + This is implemented by re-parsing the relevant source line(s). */ + +static const char * +get_substring_ranges_for_loc (cpp_reader *pfile, + string_concat_db *concats, + location_t strloc, + enum cpp_ttype type, + cpp_substring_ranges &ranges) +{ + gcc_assert (pfile); + + if (strloc == UNKNOWN_LOCATION) + return "unknown location"; + + /* Reparsing the strings requires accurate location information. + If -ftrack-macro-expansion has been overridden from its default + of 2, then we might have a location of a macro expansion point, + rather than the location of the literal itself. + Avoid this by requiring that we have full macro expansion tracking + for substring locations to be available. */ + if (cpp_get_options (pfile)->track_macro_expansion != 2) + return "track_macro_expansion != 2"; + + /* If #line or # 44 "file"-style directives are present, then there's + no guarantee that the line numbers we have can be used to locate + the strings. For example, we might have a .i file with # directives + pointing back to lines within a .c file, but the .c file might + have been edited since the .i file was created. + In such a case, the safest course is to disable on-demand substring + locations. */ + if (line_table->seen_line_directive) + return "seen line directive"; + + /* If string concatenation has occurred at STRLOC, get the locations + of all of the literal tokens making up the compound string. + Otherwise, just use STRLOC. */ + int num_locs = 1; + location_t *strlocs = &strloc; + if (concats) + concats->get_string_concatenation (strloc, &num_locs, &strlocs); + + auto_cpp_string_vec strs (num_locs); + auto_vec <cpp_string_location_reader> loc_readers (num_locs); + for (int i = 0; i < num_locs; i++) + { + /* Get range of strloc. We will use it to locate the start and finish + of the literal token within the line. */ + source_range src_range = get_range_from_loc (line_table, strlocs[i]); + + if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table)) + { + /* If the string token was within a macro expansion, then we can + cope with it for the simple case where we have a single token. + Otherwise, bail out. */ + if (src_range.m_start != src_range.m_finish) + return "macro expansion"; + } + else + { + if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS) + /* If so, we can't reliably determine where the token started within + its line. */ + return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS"; + + if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS) + /* If so, we can't reliably determine where the token finished + within its line. 
*/ + return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS"; + } + + expanded_location start + = expand_location_to_spelling_point (src_range.m_start, + LOCATION_ASPECT_START); + expanded_location finish + = expand_location_to_spelling_point (src_range.m_finish, + LOCATION_ASPECT_FINISH); + if (start.file != finish.file) + return "range endpoints are in different files"; + if (start.line != finish.line) + return "range endpoints are on different lines"; + if (start.column > finish.column) + return "range endpoints are reversed"; + + char_span line = location_get_source_line (start.file, start.line); + if (!line) + return "unable to read source line"; + + /* Determine the location of the literal (including quotes + and leading prefix chars, such as the 'u' in a u"" + token). */ + size_t literal_length = finish.column - start.column + 1; + + /* Ensure that we don't crash if we got the wrong location. */ + if (start.column < 1) + return "zero start column"; + if (line.length () < (start.column - 1 + literal_length)) + return "line is not wide enough"; + + char_span literal = line.subspan (start.column - 1, literal_length); + + cpp_string from; + from.len = literal_length; + /* Make a copy of the literal, to avoid having to rely on + the lifetime of the copy of the line within the cache. + This will be released by the auto_cpp_string_vec dtor. */ + from.text = (unsigned char *)literal.xstrdup (); + strs.safe_push (from); + + /* For very long lines, a new linemap could have started + halfway through the token. + Ensure that the loc_reader uses the linemap of the + *end* of the token for its start location. */ + const line_map_ordinary *start_ord_map; + linemap_resolve_location (line_table, src_range.m_start, + LRK_SPELLING_LOCATION, &start_ord_map); + const line_map_ordinary *final_ord_map; + linemap_resolve_location (line_table, src_range.m_finish, + LRK_SPELLING_LOCATION, &final_ord_map); + if (start_ord_map == NULL || final_ord_map == NULL) + return "failed to get ordinary maps"; + /* Bulletproofing. We ought to only have different ordinary maps + for start vs finish due to line-length jumps. */ + if (start_ord_map != final_ord_map + && start_ord_map->to_file != final_ord_map->to_file) + return "start and finish are spelled in different ordinary maps"; + /* The file from linemap_resolve_location ought to match that from + expand_location_to_spelling_point. */ + if (start_ord_map->to_file != start.file) + return "mismatching file after resolving linemap"; + + location_t start_loc + = linemap_position_for_line_and_column (line_table, final_ord_map, + start.line, start.column); + + cpp_string_location_reader loc_reader (start_loc, line_table); + loc_readers.safe_push (loc_reader); + } + + /* Rerun cpp_interpret_string, or rather, a modified version of it. */ + const char *err = cpp_interpret_string_ranges (pfile, strs.address (), + loc_readers.address (), + num_locs, &ranges, type); + if (err) + return err; + + /* Success: "ranges" should now contain information on the string. */ + return NULL; +} + +/* Attempt to populate *OUT_LOC with source location information on the + given characters within the string literal found at STRLOC. + CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution + character set. 
+ + For example, given CARET_IDX = 4, START_IDX = 3, END_IDX = 7 + and string literal "012345\n789" + *OUT_LOC is written to with: + "012345\n789" + ~^~~~~ + + If CONCATS is non-NULL, then any string literals that the token at + STRLOC was concatenated with are also considered. + + This is implemented by re-parsing the relevant source line(s). + + Return NULL if successful, or an error message if any errors occurred. + Error messages are intended for GCC developers (to help debugging) rather + than for end-users. */ + +const char * +get_location_within_string (cpp_reader *pfile, + string_concat_db *concats, + location_t strloc, + enum cpp_ttype type, + int caret_idx, int start_idx, int end_idx, + location_t *out_loc) +{ + gcc_checking_assert (caret_idx >= 0); + gcc_checking_assert (start_idx >= 0); + gcc_checking_assert (end_idx >= 0); + gcc_assert (out_loc); + + cpp_substring_ranges ranges; + const char *err + = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges); + if (err) + return err; + + if (caret_idx >= ranges.get_num_ranges ()) + return "caret_idx out of range"; + if (start_idx >= ranges.get_num_ranges ()) + return "start_idx out of range"; + if (end_idx >= ranges.get_num_ranges ()) + return "end_idx out of range"; + + *out_loc = make_location (ranges.get_range (caret_idx).m_start, + ranges.get_range (start_idx).m_start, + ranges.get_range (end_idx).m_finish); + return NULL; +} + +#if CHECKING_P + +namespace selftest { + +/* Selftests of location handling. */ + +/* Attempt to populate *OUT_RANGE with source location information on the + given character within the string literal found at STRLOC. + CHAR_IDX refers to an offset within the execution character set. + If CONCATS is non-NULL, then any string literals that the token at + STRLOC was concatenated with are also considered. + + This is implemented by re-parsing the relevant source line(s). + + Return NULL if successful, or an error message if any errors occurred. + Error messages are intended for GCC developers (to help debugging) rather + than for end-users. */ + +static const char * +get_source_range_for_char (cpp_reader *pfile, + string_concat_db *concats, + location_t strloc, + enum cpp_ttype type, + int char_idx, + source_range *out_range) +{ + gcc_checking_assert (char_idx >= 0); + gcc_assert (out_range); + + cpp_substring_ranges ranges; + const char *err + = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges); + if (err) + return err; + + if (char_idx >= ranges.get_num_ranges ()) + return "char_idx out of range"; + + *out_range = ranges.get_range (char_idx); + return NULL; +} + +/* As get_source_range_for_char, but write to *OUT the number + of ranges that are available. */ + +static const char * +get_num_source_ranges_for_substring (cpp_reader *pfile, + string_concat_db *concats, + location_t strloc, + enum cpp_ttype type, + int *out) +{ + gcc_assert (out); + + cpp_substring_ranges ranges; + const char *err + = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges); + + if (err) + return err; + + *out = ranges.get_num_ranges (); + return NULL; +} + +/* Selftests of location handling. */ + +/* Verify that compare() on linenum_type handles comparisons over the full + range of the type. 
*/ + +static void +test_linenum_comparisons () +{ + linenum_type min_line (0); + linenum_type max_line (0xffffffff); + ASSERT_EQ (0, compare (min_line, min_line)); + ASSERT_EQ (0, compare (max_line, max_line)); + + ASSERT_GT (compare (max_line, min_line), 0); + ASSERT_LT (compare (min_line, max_line), 0); +} + +/* Helper function for verifying location data: when location_t + values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated + as having column 0. */ + +static bool +should_have_column_data_p (location_t loc) +{ + if (IS_ADHOC_LOC (loc)) + loc = get_location_from_adhoc_loc (line_table, loc); + if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS) + return false; + return true; +} + +/* Selftest for should_have_column_data_p. */ + +static void +test_should_have_column_data_p () +{ + ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT)); + ASSERT_TRUE + (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS)); + ASSERT_FALSE + (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1)); +} + +/* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN + on LOC. */ + +static void +assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum, + location_t loc) +{ + ASSERT_STREQ (exp_filename, LOCATION_FILE (loc)); + ASSERT_EQ (exp_linenum, LOCATION_LINE (loc)); + /* If location_t values are sufficiently high, then column numbers + will be unavailable and LOCATION_COLUMN (loc) will be 0. + When close to the threshold, column numbers *may* be present: if + the final linemap before the threshold contains a line that straddles + the threshold, locations in that line have column information. */ + if (should_have_column_data_p (loc)) + ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc)); +} + +/* Various selftests involve constructing a line table and one or more + line maps within it. + + For maximum test coverage we want to run these tests with a variety + of situations: + - line_table->default_range_bits: some frontends use a non-zero value + and others use zero + - the fallback modes within line-map.c: there are various threshold + values for location_t beyond line-map.c changes + behavior (disabling of the range-packing optimization, disabling + of column-tracking). We can exercise these by starting the line_table + at interesting values at or near these thresholds. + + The following struct describes a particular case within our test + matrix. */ + +class line_table_case +{ +public: + line_table_case (int default_range_bits, int base_location) + : m_default_range_bits (default_range_bits), + m_base_location (base_location) + {} + + int m_default_range_bits; + int m_base_location; +}; + +/* Constructor. Store the old value of line_table, and create a new + one, using sane defaults. */ + +line_table_test::line_table_test () +{ + gcc_assert (saved_line_table == NULL); + saved_line_table = line_table; + line_table = ggc_alloc<line_maps> (); + linemap_init (line_table, BUILTINS_LOCATION); + gcc_assert (saved_line_table->reallocator); + line_table->reallocator = saved_line_table->reallocator; + gcc_assert (saved_line_table->round_alloc_size); + line_table->round_alloc_size = saved_line_table->round_alloc_size; + line_table->default_range_bits = 0; +} + +/* Constructor. Store the old value of line_table, and create a new + one, using the sitation described in CASE_. 
*/ + +line_table_test::line_table_test (const line_table_case &case_) +{ + gcc_assert (saved_line_table == NULL); + saved_line_table = line_table; + line_table = ggc_alloc<line_maps> (); + linemap_init (line_table, BUILTINS_LOCATION); + gcc_assert (saved_line_table->reallocator); + line_table->reallocator = saved_line_table->reallocator; + gcc_assert (saved_line_table->round_alloc_size); + line_table->round_alloc_size = saved_line_table->round_alloc_size; + line_table->default_range_bits = case_.m_default_range_bits; + if (case_.m_base_location) + { + line_table->highest_location = case_.m_base_location; + line_table->highest_line = case_.m_base_location; + } +} + +/* Destructor. Restore the old value of line_table. */ + +line_table_test::~line_table_test () +{ + gcc_assert (saved_line_table != NULL); + line_table = saved_line_table; + saved_line_table = NULL; +} + +/* Verify basic operation of ordinary linemaps. */ + +static void +test_accessing_ordinary_linemaps (const line_table_case &case_) +{ + line_table_test ltt (case_); + + /* Build a simple linemap describing some locations. */ + linemap_add (line_table, LC_ENTER, false, "foo.c", 0); + + linemap_line_start (line_table, 1, 100); + location_t loc_a = linemap_position_for_column (line_table, 1); + location_t loc_b = linemap_position_for_column (line_table, 23); + + linemap_line_start (line_table, 2, 100); + location_t loc_c = linemap_position_for_column (line_table, 1); + location_t loc_d = linemap_position_for_column (line_table, 17); + + /* Example of a very long line. */ + linemap_line_start (line_table, 3, 2000); + location_t loc_e = linemap_position_for_column (line_table, 700); + + /* Transitioning back to a short line. */ + linemap_line_start (line_table, 4, 0); + location_t loc_back_to_short = linemap_position_for_column (line_table, 100); + + if (should_have_column_data_p (loc_back_to_short)) + { + /* Verify that we switched to short lines in the linemap. */ + line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table); + ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits); + } + + /* Example of a line that will eventually be seen to be longer + than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is + below that. */ + linemap_line_start (line_table, 5, 2000); + + location_t loc_start_of_very_long_line + = linemap_position_for_column (line_table, 2000); + location_t loc_too_wide + = linemap_position_for_column (line_table, 4097); + location_t loc_too_wide_2 + = linemap_position_for_column (line_table, 4098); + + /* ...and back to a sane line length. */ + linemap_line_start (line_table, 6, 100); + location_t loc_sane_again = linemap_position_for_column (line_table, 10); + + linemap_add (line_table, LC_LEAVE, false, NULL, 0); + + /* Multiple files. */ + linemap_add (line_table, LC_ENTER, false, "bar.c", 0); + linemap_line_start (line_table, 1, 200); + location_t loc_f = linemap_position_for_column (line_table, 150); + linemap_add (line_table, LC_LEAVE, false, NULL, 0); + + /* Verify that we can recover the location info. */ + assert_loceq ("foo.c", 1, 1, loc_a); + assert_loceq ("foo.c", 1, 23, loc_b); + assert_loceq ("foo.c", 2, 1, loc_c); + assert_loceq ("foo.c", 2, 17, loc_d); + assert_loceq ("foo.c", 3, 700, loc_e); + assert_loceq ("foo.c", 4, 100, loc_back_to_short); + + /* In the very wide line, the initial location should be fully tracked. */ + assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line); + /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should + be disabled. 
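+   Hence loc_too_wide and loc_too_wide_2, requested at columns 4097 and
+   4098 above, are expected to report column 0.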
*/ + assert_loceq ("foo.c", 5, 0, loc_too_wide); + assert_loceq ("foo.c", 5, 0, loc_too_wide_2); + /*...and column-tracking should be re-enabled for subsequent lines. */ + assert_loceq ("foo.c", 6, 10, loc_sane_again); + + assert_loceq ("bar.c", 1, 150, loc_f); + + ASSERT_FALSE (is_location_from_builtin_token (loc_a)); + ASSERT_TRUE (pure_location_p (line_table, loc_a)); + + /* Verify using make_location to build a range, and extracting data + back from it. */ + location_t range_c_b_d = make_location (loc_c, loc_b, loc_d); + ASSERT_FALSE (pure_location_p (line_table, range_c_b_d)); + ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d)); + source_range src_range = get_range_from_loc (line_table, range_c_b_d); + ASSERT_EQ (loc_b, src_range.m_start); + ASSERT_EQ (loc_d, src_range.m_finish); +} + +/* Verify various properties of UNKNOWN_LOCATION. */ + +static void +test_unknown_location () +{ + ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION)); + ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION)); + ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION)); +} + +/* Verify various properties of BUILTINS_LOCATION. */ + +static void +test_builtins () +{ + assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION); + ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION); +} + +/* Regression test for make_location. + Ensure that we use pure locations for the start/finish of the range, + rather than storing a packed or ad-hoc range as the start/finish. */ + +static void +test_make_location_nonpure_range_endpoints (const line_table_case &case_) +{ + /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c + with C++ frontend. + ....................0000000001111111111222. + ....................1234567890123456789012. */ + const char *content = " r += !aaa == bbb;\n"; + temp_source_file tmp (SELFTEST_LOCATION, ".C", content); + line_table_test ltt (case_); + linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1); + + const location_t c11 = linemap_position_for_column (line_table, 11); + const location_t c12 = linemap_position_for_column (line_table, 12); + const location_t c13 = linemap_position_for_column (line_table, 13); + const location_t c14 = linemap_position_for_column (line_table, 14); + const location_t c21 = linemap_position_for_column (line_table, 21); + + if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS) + return; + + /* Use column 13 for the caret location, arbitrarily, to verify that we + handle start != caret. */ + const location_t aaa = make_location (c13, c12, c14); + ASSERT_EQ (c13, get_pure_location (aaa)); + ASSERT_EQ (c12, get_start (aaa)); + ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa))); + ASSERT_EQ (c14, get_finish (aaa)); + ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa))); + + /* Make a location using a location with a range as the start-point. */ + const location_t not_aaa = make_location (c11, aaa, c14); + ASSERT_EQ (c11, get_pure_location (not_aaa)); + /* It should use the start location of the range, not store the range + itself. */ + ASSERT_EQ (c12, get_start (not_aaa)); + ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa))); + ASSERT_EQ (c14, get_finish (not_aaa)); + ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa))); + + /* Similarly, make a location with a range as the end-point. 
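+   Passing aaa_eq_bbb (itself an ad-hoc location whose range runs from c12
+   to c21) as the finish argument should resolve to its finish, c21,
+   rather than storing the ad-hoc value itself.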
*/ + const location_t aaa_eq_bbb = make_location (c12, c12, c21); + ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb)); + ASSERT_EQ (c12, get_start (aaa_eq_bbb)); + ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb))); + ASSERT_EQ (c21, get_finish (aaa_eq_bbb)); + ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb))); + const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb); + /* It should use the finish location of the range, not store the range + itself. */ + ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb)); + ASSERT_EQ (c12, get_start (not_aaa_eq_bbb)); + ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb))); + ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb)); + ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb))); +} + +/* Verify reading of input files (e.g. for caret-based diagnostics). */ + +static void +test_reading_source_line () +{ + /* Create a tempfile and write some text to it. */ + temp_source_file tmp (SELFTEST_LOCATION, ".txt", + "01234567890123456789\n" + "This is the test text\n" + "This is the 3rd line"); + + /* Read back a specific line from the tempfile. */ + char_span source_line = location_get_source_line (tmp.get_filename (), 3); + ASSERT_TRUE (source_line); + ASSERT_TRUE (source_line.get_buffer () != NULL); + ASSERT_EQ (20, source_line.length ()); + ASSERT_TRUE (!strncmp ("This is the 3rd line", + source_line.get_buffer (), source_line.length ())); + + source_line = location_get_source_line (tmp.get_filename (), 2); + ASSERT_TRUE (source_line); + ASSERT_TRUE (source_line.get_buffer () != NULL); + ASSERT_EQ (21, source_line.length ()); + ASSERT_TRUE (!strncmp ("This is the test text", + source_line.get_buffer (), source_line.length ())); + + source_line = location_get_source_line (tmp.get_filename (), 4); + ASSERT_FALSE (source_line); + ASSERT_TRUE (source_line.get_buffer () == NULL); +} + +/* Tests of lexing. */ + +/* Verify that token TOK from PARSER has cpp_token_as_text + equal to EXPECTED_TEXT. */ + +#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT) \ + SELFTEST_BEGIN_STMT \ + unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK)); \ + ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt); \ + SELFTEST_END_STMT + +/* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM, + and ranges from EXP_START_COL to EXP_FINISH_COL. + Use LOC as the effective location of the selftest. */ + +static void +assert_token_loc_eq (const location &loc, + const cpp_token *tok, + const char *exp_filename, int exp_linenum, + int exp_start_col, int exp_finish_col) +{ + location_t tok_loc = tok->src_loc; + ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc)); + ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc)); + + /* If location_t values are sufficiently high, then column numbers + will be unavailable. */ + if (!should_have_column_data_p (tok_loc)) + return; + + ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc)); + source_range tok_range = get_range_from_loc (line_table, tok_loc); + ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start)); + ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish)); +} + +/* Use assert_token_loc_eq to verify the TOK->src_loc, using + SELFTEST_LOCATION as the effective location of the selftest. 
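+   For example, in test_lexer below,
+     ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
+   checks that the "test_name" token is on line 1, columns 1-9 of the
+   tempfile.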
*/ + +#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM, \ + EXP_START_COL, EXP_FINISH_COL) \ + assert_token_loc_eq (SELFTEST_LOCATION, (TOK), (EXP_FILENAME), \ + (EXP_LINENUM), (EXP_START_COL), (EXP_FINISH_COL)) + +/* Test of lexing a file using libcpp, verifying tokens and their + location information. */ + +static void +test_lexer (const line_table_case &case_) +{ + /* Create a tempfile and write some text to it. */ + const char *content = + /*00000000011111111112222222222333333.3333444444444.455555555556 + 12345678901234567890123456789012345.6789012345678.901234567890. */ + ("test_name /* c-style comment */\n" + " \"test literal\"\n" + " // test c++-style comment\n" + " 42\n"); + temp_source_file tmp (SELFTEST_LOCATION, ".txt", content); + + line_table_test ltt (case_); + + cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table); + + const char *fname = cpp_read_main_file (parser, tmp.get_filename ()); + ASSERT_NE (fname, NULL); + + /* Verify that we get the expected tokens back, with the correct + location information. */ + + location_t loc; + const cpp_token *tok; + tok = cpp_get_token_with_location (parser, &loc); + ASSERT_NE (tok, NULL); + ASSERT_EQ (tok->type, CPP_NAME); + ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name"); + ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9); + + tok = cpp_get_token_with_location (parser, &loc); + ASSERT_NE (tok, NULL); + ASSERT_EQ (tok->type, CPP_STRING); + ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\""); + ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48); + + tok = cpp_get_token_with_location (parser, &loc); + ASSERT_NE (tok, NULL); + ASSERT_EQ (tok->type, CPP_NUMBER); + ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42"); + ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5); + + tok = cpp_get_token_with_location (parser, &loc); + ASSERT_NE (tok, NULL); + ASSERT_EQ (tok->type, CPP_EOF); + + cpp_finish (parser, NULL); + cpp_destroy (parser); +} + +/* Forward decls. */ + +class lexer_test; +class lexer_test_options; + +/* A class for specifying options of a lexer_test. + The "apply" vfunc is called during the lexer_test constructor. */ + +class lexer_test_options +{ + public: + virtual void apply (lexer_test &) = 0; +}; + +/* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy + in its dtor. + + This is needed by struct lexer_test to ensure that the cleanup of the + cpp_reader happens *after* the cleanup of the temp_source_file. */ + +class cpp_reader_ptr +{ + public: + cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {} + + ~cpp_reader_ptr () + { + cpp_finish (m_ptr, NULL); + cpp_destroy (m_ptr); + } + + operator cpp_reader * () const { return m_ptr; } + + private: + cpp_reader *m_ptr; +}; + +/* A struct for writing lexer tests. */ + +class lexer_test +{ +public: + lexer_test (const line_table_case &case_, const char *content, + lexer_test_options *options); + ~lexer_test (); + + const cpp_token *get_token (); + + /* The ordering of these fields matters. + The line_table_test must be first, since the cpp_reader_ptr + uses it. + The cpp_reader must be cleaned up *after* the temp_source_file + since the filenames in input.c's input cache are owned by the + cpp_reader; in particular, when ~temp_source_file evicts the + filename the filenames must still be alive. 
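+   (C++ destroys non-static data members in reverse order of declaration,
+   so with the declarations below m_concats and m_tempfile are destroyed
+   before m_parser, and m_parser is destroyed before m_ltt.)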
*/ + line_table_test m_ltt; + cpp_reader_ptr m_parser; + temp_source_file m_tempfile; + string_concat_db m_concats; + bool m_implicitly_expect_EOF; +}; + +/* Use an EBCDIC encoding for the execution charset, specifically + IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047"). + + This exercises iconv integration within libcpp. + Not every build of iconv supports the given charset, + so we need to flag this error and handle it gracefully. */ + +class ebcdic_execution_charset : public lexer_test_options +{ + public: + ebcdic_execution_charset () : m_num_iconv_errors (0) + { + gcc_assert (s_singleton == NULL); + s_singleton = this; + } + ~ebcdic_execution_charset () + { + gcc_assert (s_singleton == this); + s_singleton = NULL; + } + + void apply (lexer_test &test) FINAL OVERRIDE + { + cpp_options *cpp_opts = cpp_get_options (test.m_parser); + cpp_opts->narrow_charset = "IBM1047"; + + cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser); + callbacks->diagnostic = on_diagnostic; + } + + static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED, + enum cpp_diagnostic_level level ATTRIBUTE_UNUSED, + enum cpp_warning_reason reason ATTRIBUTE_UNUSED, + rich_location *richloc ATTRIBUTE_UNUSED, + const char *msgid, va_list *ap ATTRIBUTE_UNUSED) + ATTRIBUTE_FPTR_PRINTF(5,0) + { + gcc_assert (s_singleton); + /* Avoid exgettext from picking this up, it is translated in libcpp. */ + const char *msg = "conversion from %s to %s not supported by iconv"; +#ifdef ENABLE_NLS + msg = dgettext ("cpplib", msg); +#endif + /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc + when the local iconv build doesn't support the conversion. */ + if (strcmp (msgid, msg) == 0) + { + s_singleton->m_num_iconv_errors++; + return true; + } + + /* Otherwise, we have an unexpected error. */ + abort (); + } + + bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; } + + private: + static ebcdic_execution_charset *s_singleton; + int m_num_iconv_errors; +}; + +ebcdic_execution_charset *ebcdic_execution_charset::s_singleton; + +/* A lexer_test_options subclass that records a list of diagnostic + messages emitted by the lexer. */ + +class lexer_diagnostic_sink : public lexer_test_options +{ + public: + lexer_diagnostic_sink () + { + gcc_assert (s_singleton == NULL); + s_singleton = this; + } + ~lexer_diagnostic_sink () + { + gcc_assert (s_singleton == this); + s_singleton = NULL; + + int i; + char *str; + FOR_EACH_VEC_ELT (m_diagnostics, i, str) + free (str); + } + + void apply (lexer_test &test) FINAL OVERRIDE + { + cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser); + callbacks->diagnostic = on_diagnostic; + } + + static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED, + enum cpp_diagnostic_level level ATTRIBUTE_UNUSED, + enum cpp_warning_reason reason ATTRIBUTE_UNUSED, + rich_location *richloc ATTRIBUTE_UNUSED, + const char *msgid, va_list *ap) + ATTRIBUTE_FPTR_PRINTF(5,0) + { + char *msg = xvasprintf (msgid, *ap); + s_singleton->m_diagnostics.safe_push (msg); + return true; + } + + auto_vec<char *> m_diagnostics; + + private: + static lexer_diagnostic_sink *s_singleton; +}; + +lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton; + +/* Constructor. Override line_table with a new instance based on CASE_, + and write CONTENT to a tempfile. Create a cpp_reader, and use it to + start parsing the tempfile. 
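+   A typical use, as in the string-location tests below, is roughly:
+     lexer_test test (case_, content, NULL);
+     const cpp_token *tok = test.get_token ();
+   after which the destructor implicitly verifies that only CPP_EOF
+   remains (unless m_implicitly_expect_EOF has been cleared).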
*/ + +lexer_test::lexer_test (const line_table_case &case_, const char *content, + lexer_test_options *options) +: m_ltt (case_), + m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)), + /* Create a tempfile and write the text to it. */ + m_tempfile (SELFTEST_LOCATION, ".c", content), + m_concats (), + m_implicitly_expect_EOF (true) +{ + if (options) + options->apply (*this); + + cpp_init_iconv (m_parser); + + /* Parse the file. */ + const char *fname = cpp_read_main_file (m_parser, + m_tempfile.get_filename ()); + ASSERT_NE (fname, NULL); +} + +/* Destructor. By default, verify that the next token in m_parser is EOF. */ + +lexer_test::~lexer_test () +{ + location_t loc; + const cpp_token *tok; + + if (m_implicitly_expect_EOF) + { + tok = cpp_get_token_with_location (m_parser, &loc); + ASSERT_NE (tok, NULL); + ASSERT_EQ (tok->type, CPP_EOF); + } +} + +/* Get the next token from m_parser. */ + +const cpp_token * +lexer_test::get_token () +{ + location_t loc; + const cpp_token *tok; + + tok = cpp_get_token_with_location (m_parser, &loc); + ASSERT_NE (tok, NULL); + return tok; +} + +/* Verify that locations within string literals are correctly handled. */ + +/* Verify get_source_range_for_substring for token(s) at STRLOC, + using the string concatenation database for TEST. + + Assert that the character at index IDX is on EXPECTED_LINE, + and that it begins at column EXPECTED_START_COL and ends at + EXPECTED_FINISH_COL (unless the locations are beyond + LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their + columns). */ + +static void +assert_char_at_range (const location &loc, + lexer_test& test, + location_t strloc, enum cpp_ttype type, int idx, + int expected_line, int expected_start_col, + int expected_finish_col) +{ + cpp_reader *pfile = test.m_parser; + string_concat_db *concats = &test.m_concats; + + source_range actual_range = source_range(); + const char *err + = get_source_range_for_char (pfile, concats, strloc, type, idx, + &actual_range); + if (should_have_column_data_p (strloc)) + ASSERT_EQ_AT (loc, NULL, err); + else + { + ASSERT_STREQ_AT (loc, + "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", + err); + return; + } + + int actual_start_line = LOCATION_LINE (actual_range.m_start); + ASSERT_EQ_AT (loc, expected_line, actual_start_line); + int actual_finish_line = LOCATION_LINE (actual_range.m_finish); + ASSERT_EQ_AT (loc, expected_line, actual_finish_line); + + if (should_have_column_data_p (actual_range.m_start)) + { + int actual_start_col = LOCATION_COLUMN (actual_range.m_start); + ASSERT_EQ_AT (loc, expected_start_col, actual_start_col); + } + if (should_have_column_data_p (actual_range.m_finish)) + { + int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish); + ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col); + } +} + +/* Macro for calling assert_char_at_range, supplying SELFTEST_LOCATION for + the effective location of any errors. */ + +#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX, EXPECTED_LINE, \ + EXPECTED_START_COL, EXPECTED_FINISH_COL) \ + assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), (TYPE), \ + (IDX), (EXPECTED_LINE), (EXPECTED_START_COL), \ + (EXPECTED_FINISH_COL)) + +/* Verify get_num_source_ranges_for_substring for token(s) at STRLOC, + using the string concatenation database for TEST. + + Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES. 
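+   For example, the simple "0123456789" literal tested below has 11
+   ranges: one per execution character, plus one for the terminating NUL,
+   which is attributed to the closing quote.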
*/ + +static void +assert_num_substring_ranges (const location &loc, + lexer_test& test, + location_t strloc, + enum cpp_ttype type, + int expected_num_ranges) +{ + cpp_reader *pfile = test.m_parser; + string_concat_db *concats = &test.m_concats; + + int actual_num_ranges = -1; + const char *err + = get_num_source_ranges_for_substring (pfile, concats, strloc, type, + &actual_num_ranges); + if (should_have_column_data_p (strloc)) + ASSERT_EQ_AT (loc, NULL, err); + else + { + ASSERT_STREQ_AT (loc, + "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", + err); + return; + } + ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges); +} + +/* Macro for calling assert_num_substring_ranges, supplying + SELFTEST_LOCATION for the effective location of any errors. */ + +#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, \ + EXPECTED_NUM_RANGES) \ + assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), \ + (TYPE), (EXPECTED_NUM_RANGES)) + + +/* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC + returns an error (using the string concatenation database for TEST). */ + +static void +assert_has_no_substring_ranges (const location &loc, + lexer_test& test, + location_t strloc, + enum cpp_ttype type, + const char *expected_err) +{ + cpp_reader *pfile = test.m_parser; + string_concat_db *concats = &test.m_concats; + cpp_substring_ranges ranges; + const char *actual_err + = get_substring_ranges_for_loc (pfile, concats, strloc, + type, ranges); + if (should_have_column_data_p (strloc)) + ASSERT_STREQ_AT (loc, expected_err, actual_err); + else + ASSERT_STREQ_AT (loc, + "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", + actual_err); +} + +#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR) \ + assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), \ + (STRLOC), (TYPE), (ERR)) + +/* Lex a simple string literal. Verify the substring location data, before + and after running cpp_interpret_string on it. */ + +static void +test_lexer_string_locations_simple (const line_table_case &case_) +{ + /* Digits 0-9 (with 0 at column 10), the simple way. + ....................000000000.11111111112.2222222223333333333 + ....................123456789.01234567890.1234567890123456789 + We add a trailing comment to ensure that we correctly locate + the end of the string literal token. */ + const char *content = " \"0123456789\" /* not a string */\n"; + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back, with the correct + location information. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\""); + ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20); + + /* At this point in lexing, the quote characters are treated as part of + the string (they are stripped off by cpp_interpret_string). */ + + ASSERT_EQ (tok->val.str.len, 12); + + /* Verify that cpp_interpret_string works. */ + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING; + bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1, + &dst_string, type); + ASSERT_TRUE (result); + ASSERT_STREQ ("0123456789", (const char *)dst_string.text); + free (const_cast <unsigned char *> (dst_string.text)); + + /* Verify ranges of individual characters. This no longer includes the + opening quote, but does include the closing quote. 
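+   Here the string body starts at column 10, so character I is expected
+   at column 10 + I, with character 10 (the NUL) at column 20, the
+   closing quote.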
*/ + for (int i = 0; i <= 10; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, + 10 + i, 10 + i); + + ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11); +} + +/* As test_lexer_string_locations_simple, but use an EBCDIC execution + encoding. */ + +static void +test_lexer_string_locations_ebcdic (const line_table_case &case_) +{ + /* EBCDIC support requires iconv. */ + if (!HAVE_ICONV) + return; + + /* Digits 0-9 (with 0 at column 10), the simple way. + ....................000000000.11111111112.2222222223333333333 + ....................123456789.01234567890.1234567890123456789 + We add a trailing comment to ensure that we correctly locate + the end of the string literal token. */ + const char *content = " \"0123456789\" /* not a string */\n"; + ebcdic_execution_charset use_ebcdic; + lexer_test test (case_, content, &use_ebcdic); + + /* Verify that we get the expected token back, with the correct + location information. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\""); + ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20); + + /* At this point in lexing, the quote characters are treated as part of + the string (they are stripped off by cpp_interpret_string). */ + + ASSERT_EQ (tok->val.str.len, 12); + + /* The remainder of the test requires an iconv implementation that + can convert from UTF-8 to the EBCDIC encoding requested above. */ + if (use_ebcdic.iconv_errors_occurred_p ()) + return; + + /* Verify that cpp_interpret_string works. */ + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING; + bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1, + &dst_string, type); + ASSERT_TRUE (result); + /* We should now have EBCDIC-encoded text, specifically + IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047"). + The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9. */ + ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", + (const char *)dst_string.text); + free (const_cast <unsigned char *> (dst_string.text)); + + /* Verify that we don't attempt to record substring location information + for such cases. */ + ASSERT_HAS_NO_SUBSTRING_RANGES + (test, tok->src_loc, type, + "execution character set != source character set"); +} + +/* Lex a string literal containing a hex-escaped character. + Verify the substring location data, before and after running + cpp_interpret_string on it. */ + +static void +test_lexer_string_locations_hex (const line_table_case &case_) +{ + /* Digits 0-9, expressing digit 5 in ASCII as "\x35" + and with a space in place of digit 6, to terminate the escaped + hex code. + ....................000000000.111111.11112222. + ....................123456789.012345.67890123. */ + const char *content = " \"01234\\x35 789\"\n"; + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back, with the correct + location information. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\""); + ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23); + + /* At this point in lexing, the quote characters are treated as part of + the string (they are stripped off by cpp_interpret_string). */ + ASSERT_EQ (tok->val.str.len, 15); + + /* Verify that cpp_interpret_string works. 
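+   "\x35" is ASCII '5', so the interpreted string should be "012345 789",
+   and the four source bytes of the escape (columns 15-18) all map to
+   that single execution character.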
*/ + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING; + bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1, + &dst_string, type); + ASSERT_TRUE (result); + ASSERT_STREQ ("012345 789", (const char *)dst_string.text); + free (const_cast <unsigned char *> (dst_string.text)); + + /* Verify ranges of individual characters. This no longer includes the + opening quote, but does include the closing quote. */ + for (int i = 0; i <= 4; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i); + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18); + for (int i = 6; i <= 10; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i); + + ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11); +} + +/* Lex a string literal containing an octal-escaped character. + Verify the substring location data after running cpp_interpret_string + on it. */ + +static void +test_lexer_string_locations_oct (const line_table_case &case_) +{ + /* Digits 0-9, expressing digit 5 in ASCII as "\065" + and with a space in place of digit 6, to terminate the escaped + octal code. + ....................000000000.111111.11112222.2222223333333333444 + ....................123456789.012345.67890123.4567890123456789012 */ + const char *content = " \"01234\\065 789\" /* not a string */\n"; + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back, with the correct + location information. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\""); + + /* Verify that cpp_interpret_string works. */ + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING; + bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1, + &dst_string, type); + ASSERT_TRUE (result); + ASSERT_STREQ ("012345 789", (const char *)dst_string.text); + free (const_cast <unsigned char *> (dst_string.text)); + + /* Verify ranges of individual characters. This no longer includes the + opening quote, but does include the closing quote. */ + for (int i = 0; i < 5; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i); + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18); + for (int i = 6; i <= 10; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i); + + ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11); +} + +/* Test of string literal containing letter escapes. */ + +static void +test_lexer_string_locations_letter_escape_1 (const line_table_case &case_) +{ + /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar. + .....................000000000.1.11111.1.1.11222.22222223333333 + .....................123456789.0.12345.6.7.89012.34567890123456. */ + const char *content = (" \"\\tfoo\\\\\\nbar\" /* non-str */\n"); + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected tokens back. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\""); + + /* Verify ranges of individual characters. */ + /* "\t". */ + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING, + 0, 1, 10, 11); + /* "foo". */ + for (int i = 1; i <= 3; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING, + i, 1, 11 + i, 11 + i); + /* "\\" and "\n". 
*/ + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING, + 4, 1, 15, 16); + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING, + 5, 1, 17, 18); + + /* "bar" and closing quote for nul-terminator. */ + for (int i = 6; i <= 9; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING, + i, 1, 13 + i, 13 + i); + + ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10); +} + +/* Another test of a string literal containing a letter escape. + Based on string seen in + printf ("%-%\n"); + in gcc.dg/format/c90-printf-1.c. */ + +static void +test_lexer_string_locations_letter_escape_2 (const line_table_case &case_) +{ + /* .....................000000000.1111.11.1111.22222222223. + .....................123456789.0123.45.6789.01234567890. */ + const char *content = (" \"%-%\\n\" /* non-str */\n"); + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected tokens back. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\""); + + /* Verify ranges of individual characters. */ + /* "%-%". */ + for (int i = 0; i < 3; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING, + i, 1, 10 + i, 10 + i); + /* "\n". */ + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING, + 3, 1, 13, 14); + + /* Closing quote for nul-terminator. */ + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING, + 4, 1, 15, 15); + + ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5); +} + +/* Lex a string literal containing UCN 4 characters. + Verify the substring location data after running cpp_interpret_string + on it. */ + +static void +test_lexer_string_locations_ucn4 (const line_table_case &case_) +{ + /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed + as UCN 4. + ....................000000000.111111.111122.222222223.33333333344444 + ....................123456789.012345.678901.234567890.12345678901234 */ + const char *content = " \"01234\\u2174\\u2175789\" /* non-str */\n"; + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back, with the correct + location information. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\""); + + /* Verify that cpp_interpret_string works. + The string should be encoded in the execution character + set. Assuming that is UTF-8, we should have the following: + ----------- ---- ----- ------- ---------------- + Byte offset Byte Octal Unicode Source Column(s) + ----------- ---- ----- ------- ---------------- + 0 0x30 '0' 10 + 1 0x31 '1' 11 + 2 0x32 '2' 12 + 3 0x33 '3' 13 + 4 0x34 '4' 14 + 5 0xE2 \342 U+2174 15-20 + 6 0x85 \205 (cont) 15-20 + 7 0xB4 \264 (cont) 15-20 + 8 0xE2 \342 U+2175 21-26 + 9 0x85 \205 (cont) 21-26 + 10 0xB5 \265 (cont) 21-26 + 11 0x37 '7' 27 + 12 0x38 '8' 28 + 13 0x39 '9' 29 + 14 0x00 30 (closing quote) + ----------- ---- ----- ------- ---------------. */ + + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING; + bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1, + &dst_string, type); + ASSERT_TRUE (result); + ASSERT_STREQ ("01234\342\205\264\342\205\265789", + (const char *)dst_string.text); + free (const_cast <unsigned char *> (dst_string.text)); + + /* Verify ranges of individual characters. This no longer includes the + opening quote, but does include the closing quote. + '01234'. 
*/ + for (int i = 0; i <= 4; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i); + /* U+2174. */ + for (int i = 5; i <= 7; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20); + /* U+2175. */ + for (int i = 8; i <= 10; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26); + /* '789' and nul terminator */ + for (int i = 11; i <= 14; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i); + + ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15); +} + +/* Lex a string literal containing UCN 8 characters. + Verify the substring location data after running cpp_interpret_string + on it. */ + +static void +test_lexer_string_locations_ucn8 (const line_table_case &case_) +{ + /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8. + ....................000000000.111111.1111222222.2222333333333.344444 + ....................123456789.012345.6789012345.6789012345678.901234 */ + const char *content = " \"01234\\U00002174\\U00002175789\" /* */\n"; + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back, with the correct + location information. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, + "\"01234\\U00002174\\U00002175789\""); + + /* Verify that cpp_interpret_string works. + The UTF-8 encoding of the string is identical to that from + the ucn4 testcase above; the only difference is the column + locations. */ + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING; + bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1, + &dst_string, type); + ASSERT_TRUE (result); + ASSERT_STREQ ("01234\342\205\264\342\205\265789", + (const char *)dst_string.text); + free (const_cast <unsigned char *> (dst_string.text)); + + /* Verify ranges of individual characters. This no longer includes the + opening quote, but does include the closing quote. + '01234'. */ + for (int i = 0; i <= 4; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i); + /* U+2174. */ + for (int i = 5; i <= 7; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24); + /* U+2175. */ + for (int i = 8; i <= 10; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34); + /* '789' at columns 35-37 */ + for (int i = 11; i <= 13; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i); + /* Closing quote/nul-terminator at column 38. */ + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38); + + ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15); +} + +/* Fetch a big-endian 32-bit value and convert to host endianness. */ + +static uint32_t +uint32_from_big_endian (const uint32_t *ptr_be_value) +{ + const unsigned char *buf = (const unsigned char *)ptr_be_value; + return (((uint32_t) buf[0] << 24) + | ((uint32_t) buf[1] << 16) + | ((uint32_t) buf[2] << 8) + | (uint32_t) buf[3]); +} + +/* Lex a wide string literal and verify that attempts to read substring + location data from it fail gracefully. */ + +static void +test_lexer_string_locations_wide_string (const line_table_case &case_) +{ + /* Digits 0-9. + ....................000000000.11111111112.22222222233333 + ....................123456789.01234567890.12345678901234 */ + const char *content = " L\"0123456789\" /* non-str */\n"; + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back, with the correct + location information. 
*/ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_WSTRING); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\""); + + /* Verify that cpp_interpret_string works, using CPP_WSTRING. */ + cpp_string dst_string; + const enum cpp_ttype type = CPP_WSTRING; + bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1, + &dst_string, type); + ASSERT_TRUE (result); + /* The cpp_reader defaults to big-endian with + CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should + now be encoded as UTF-32BE. */ + const uint32_t *be32_chars = (const uint32_t *)dst_string.text; + ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0])); + ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5])); + ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9])); + ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10])); + free (const_cast <unsigned char *> (dst_string.text)); + + /* We don't yet support generating substring location information + for L"" strings. */ + ASSERT_HAS_NO_SUBSTRING_RANGES + (test, tok->src_loc, type, + "execution character set != source character set"); +} + +/* Fetch a big-endian 16-bit value and convert to host endianness. */ + +static uint16_t +uint16_from_big_endian (const uint16_t *ptr_be_value) +{ + const unsigned char *buf = (const unsigned char *)ptr_be_value; + return ((uint16_t) buf[0] << 8) | (uint16_t) buf[1]; +} + +/* Lex a u"" string literal and verify that attempts to read substring + location data from it fail gracefully. */ + +static void +test_lexer_string_locations_string16 (const line_table_case &case_) +{ + /* Digits 0-9. + ....................000000000.11111111112.22222222233333 + ....................123456789.01234567890.12345678901234 */ + const char *content = " u\"0123456789\" /* non-str */\n"; + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back, with the correct + location information. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING16); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\""); + + /* Verify that cpp_interpret_string works, using CPP_STRING16. */ + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING16; + bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1, + &dst_string, type); + ASSERT_TRUE (result); + + /* The cpp_reader defaults to big-endian, so dst_string should + now be encoded as UTF-16BE. */ + const uint16_t *be16_chars = (const uint16_t *)dst_string.text; + ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0])); + ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5])); + ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9])); + ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10])); + free (const_cast <unsigned char *> (dst_string.text)); + + /* We don't yet support generating substring location information + for L"" strings. */ + ASSERT_HAS_NO_SUBSTRING_RANGES + (test, tok->src_loc, type, + "execution character set != source character set"); +} + +/* Lex a U"" string literal and verify that attempts to read substring + location data from it fail gracefully. */ + +static void +test_lexer_string_locations_string32 (const line_table_case &case_) +{ + /* Digits 0-9. 
+ ....................000000000.11111111112.22222222233333 + ....................123456789.01234567890.12345678901234 */ + const char *content = " U\"0123456789\" /* non-str */\n"; + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back, with the correct + location information. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING32); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\""); + + /* Verify that cpp_interpret_string works, using CPP_STRING32. */ + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING32; + bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1, + &dst_string, type); + ASSERT_TRUE (result); + + /* The cpp_reader defaults to big-endian, so dst_string should + now be encoded as UTF-32BE. */ + const uint32_t *be32_chars = (const uint32_t *)dst_string.text; + ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0])); + ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5])); + ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9])); + ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10])); + free (const_cast <unsigned char *> (dst_string.text)); + + /* We don't yet support generating substring location information + for L"" strings. */ + ASSERT_HAS_NO_SUBSTRING_RANGES + (test, tok->src_loc, type, + "execution character set != source character set"); +} + +/* Lex a u8-string literal. + Verify the substring location data after running cpp_interpret_string + on it. */ + +static void +test_lexer_string_locations_u8 (const line_table_case &case_) +{ + /* Digits 0-9. + ....................000000000.11111111112.22222222233333 + ....................123456789.01234567890.12345678901234 */ + const char *content = " u8\"0123456789\" /* non-str */\n"; + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back, with the correct + location information. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_UTF8STRING); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\""); + + /* Verify that cpp_interpret_string works. */ + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING; + bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1, + &dst_string, type); + ASSERT_TRUE (result); + ASSERT_STREQ ("0123456789", (const char *)dst_string.text); + free (const_cast <unsigned char *> (dst_string.text)); + + /* Verify ranges of individual characters. This no longer includes the + opening quote, but does include the closing quote. */ + for (int i = 0; i <= 10; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i); +} + +/* Lex a string literal containing UTF-8 source characters. + Verify the substring location data after running cpp_interpret_string + on it. */ + +static void +test_lexer_string_locations_utf8_source (const line_table_case &case_) +{ + /* This string literal is written out to the source file as UTF-8, + and is of the form "before mojibake after", where "mojibake" + is written as the following four unicode code points: + U+6587 CJK UNIFIED IDEOGRAPH-6587 + U+5B57 CJK UNIFIED IDEOGRAPH-5B57 + U+5316 CJK UNIFIED IDEOGRAPH-5316 + U+3051 HIRAGANA LETTER KE. + Each of these is 3 bytes wide when encoded in UTF-8, whereas the + "before" and "after" are 1 byte per unicode character. + + The numbering shown are "columns", which are *byte* numbers within + the line, rather than unicode character numbers. + + .................... 000000000.1111111. + .................... 
123456789.0123456. */ + const char *content = (" \"before " + /* U+6587 CJK UNIFIED IDEOGRAPH-6587 + UTF-8: 0xE6 0x96 0x87 + C octal escaped UTF-8: \346\226\207 + "column" numbers: 17-19. */ + "\346\226\207" + + /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57 + UTF-8: 0xE5 0xAD 0x97 + C octal escaped UTF-8: \345\255\227 + "column" numbers: 20-22. */ + "\345\255\227" + + /* U+5316 CJK UNIFIED IDEOGRAPH-5316 + UTF-8: 0xE5 0x8C 0x96 + C octal escaped UTF-8: \345\214\226 + "column" numbers: 23-25. */ + "\345\214\226" + + /* U+3051 HIRAGANA LETTER KE + UTF-8: 0xE3 0x81 0x91 + C octal escaped UTF-8: \343\201\221 + "column" numbers: 26-28. */ + "\343\201\221" + + /* column numbers 29 onwards + 2333333.33334444444444 + 9012345.67890123456789. */ + " after\" /* non-str */\n"); + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back, with the correct + location information. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + ASSERT_TOKEN_AS_TEXT_EQ + (test.m_parser, tok, + "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\""); + + /* Verify that cpp_interpret_string works. */ + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING; + bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1, + &dst_string, type); + ASSERT_TRUE (result); + ASSERT_STREQ + ("before \346\226\207\345\255\227\345\214\226\343\201\221 after", + (const char *)dst_string.text); + free (const_cast <unsigned char *> (dst_string.text)); + + /* Verify ranges of individual characters. This no longer includes the + opening quote, but does include the closing quote. + Assuming that both source and execution encodings are UTF-8, we have + a run of 25 octets in each, plus the NUL terminator. */ + for (int i = 0; i < 25; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i); + /* NUL-terminator should use the closing quote at column 35. */ + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35); + + ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26); +} + +/* Test of string literal concatenation. */ + +static void +test_lexer_string_locations_concatenation_1 (const line_table_case &case_) +{ + /* Digits 0-9. + .....................000000000.111111.11112222222222 + .....................123456789.012345.67890123456789. */ + const char *content = (" \"01234\" /* non-str */\n" + " \"56789\" /* non-str */\n"); + lexer_test test (case_, content, NULL); + + location_t input_locs[2]; + + /* Verify that we get the expected tokens back. */ + auto_vec <cpp_string> input_strings; + const cpp_token *tok_a = test.get_token (); + ASSERT_EQ (tok_a->type, CPP_STRING); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\""); + input_strings.safe_push (tok_a->val.str); + input_locs[0] = tok_a->src_loc; + + const cpp_token *tok_b = test.get_token (); + ASSERT_EQ (tok_b->type, CPP_STRING); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\""); + input_strings.safe_push (tok_b->val.str); + input_locs[1] = tok_b->src_loc; + + /* Verify that cpp_interpret_string works. */ + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING; + bool result = cpp_interpret_string (test.m_parser, + input_strings.address (), 2, + &dst_string, type); + ASSERT_TRUE (result); + ASSERT_STREQ ("0123456789", (const char *)dst_string.text); + free (const_cast <unsigned char *> (dst_string.text)); + + /* Simulate c-lex.c's lex_string in order to record concatenation. 
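+   Recording the concatenation lets substring queries on the location of
+   the first ("01234") token also cover the second ("56789") token on the
+   next line, so the combined string has 11 ranges spanning both lines.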
*/ + test.m_concats.record_string_concatenation (2, input_locs); + + location_t initial_loc = input_locs[0]; + + /* "01234" on line 1. */ + for (int i = 0; i <= 4; i++) + ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i); + /* "56789" in line 2, plus its closing quote for the nul terminator. */ + for (int i = 5; i <= 10; i++) + ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i); + + ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11); +} + +/* Another test of string literal concatenation. */ + +static void +test_lexer_string_locations_concatenation_2 (const line_table_case &case_) +{ + /* Digits 0-9. + .....................000000000.111.11111112222222 + .....................123456789.012.34567890123456. */ + const char *content = (" \"01\" /* non-str */\n" + " \"23\" /* non-str */\n" + " \"45\" /* non-str */\n" + " \"67\" /* non-str */\n" + " \"89\" /* non-str */\n"); + lexer_test test (case_, content, NULL); + + auto_vec <cpp_string> input_strings; + location_t input_locs[5]; + + /* Verify that we get the expected tokens back. */ + for (int i = 0; i < 5; i++) + { + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + input_strings.safe_push (tok->val.str); + input_locs[i] = tok->src_loc; + } + + /* Verify that cpp_interpret_string works. */ + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING; + bool result = cpp_interpret_string (test.m_parser, + input_strings.address (), 5, + &dst_string, type); + ASSERT_TRUE (result); + ASSERT_STREQ ("0123456789", (const char *)dst_string.text); + free (const_cast <unsigned char *> (dst_string.text)); + + /* Simulate c-lex.c's lex_string in order to record concatenation. */ + test.m_concats.record_string_concatenation (5, input_locs); + + location_t initial_loc = input_locs[0]; + + /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can + detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS + and expect get_source_range_for_substring to fail. + However, for a string concatenation test, we can have a case + where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS, + but subsequent strings can be after it. + Attempting to detect this within assert_char_at_range + would overcomplicate the logic for the common test cases, so + we detect it here. */ + if (should_have_column_data_p (input_locs[0]) + && !should_have_column_data_p (input_locs[4])) + { + /* Verify that get_source_range_for_substring gracefully rejects + this case. */ + source_range actual_range; + const char *err + = get_source_range_for_char (test.m_parser, &test.m_concats, + initial_loc, type, 0, &actual_range); + ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err); + return; + } + + for (int i = 0; i < 5; i++) + for (int j = 0; j < 2; j++) + ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j, + i + 1, 10 + j, 10 + j); + + /* NUL-terminator should use the final closing quote at line 5 column 12. */ + ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12); + + ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11); +} + +/* Another test of string literal concatenation, this time combined with + various kinds of escaped characters. */ + +static void +test_lexer_string_locations_concatenation_3 (const line_table_case &case_) +{ + /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35" + digit 6 in ASCII as octal "\066", concatenating multiple strings. 
*/ + const char *content + /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555 + .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */ + = (" \"01234\" \"\\x35\" \"\\066\" \"789\" /* non-str */\n"); + lexer_test test (case_, content, NULL); + + auto_vec <cpp_string> input_strings; + location_t input_locs[4]; + + /* Verify that we get the expected tokens back. */ + for (int i = 0; i < 4; i++) + { + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + input_strings.safe_push (tok->val.str); + input_locs[i] = tok->src_loc; + } + + /* Verify that cpp_interpret_string works. */ + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING; + bool result = cpp_interpret_string (test.m_parser, + input_strings.address (), 4, + &dst_string, type); + ASSERT_TRUE (result); + ASSERT_STREQ ("0123456789", (const char *)dst_string.text); + free (const_cast <unsigned char *> (dst_string.text)); + + /* Simulate c-lex.c's lex_string in order to record concatenation. */ + test.m_concats.record_string_concatenation (4, input_locs); + + location_t initial_loc = input_locs[0]; + + for (int i = 0; i <= 4; i++) + ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i); + ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22); + ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30); + for (int i = 7; i <= 9; i++) + ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i); + + /* NUL-terminator should use the location of the final closing quote. */ + ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38); + + ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11); +} + +/* Test of string literal in a macro. */ + +static void +test_lexer_string_locations_macro (const line_table_case &case_) +{ + /* Digits 0-9. + .....................0000000001111111111.22222222223. + .....................1234567890123456789.01234567890. */ + const char *content = ("#define MACRO \"0123456789\" /* non-str */\n" + " MACRO"); + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected tokens back. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_PADDING); + + tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\""); + + /* Verify ranges of individual characters. We ought to + see columns within the macro definition. */ + for (int i = 0; i <= 10; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING, + i, 1, 20 + i, 20 + i); + + ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11); + + tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_PADDING); +} + +/* Test of stringification of a macro argument. */ + +static void +test_lexer_string_locations_stringified_macro_argument + (const line_table_case &case_) +{ + /* .....................000000000111111111122222222223. + .....................123456789012345678901234567890. */ + const char *content = ("#define MACRO(X) #X /* non-str */\n" + "MACRO(foo)\n"); + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_PADDING); + + tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\""); + + /* We don't support getting the location of a stringified macro + argument. Verify that it fails gracefully. 
*/ + ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, + "cpp_interpret_string_1 failed"); + + tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_PADDING); + + tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_PADDING); +} + +/* Ensure that we are fail gracefully if something attempts to pass + in a location that isn't a string literal token. Seen on this code: + + const char a[] = " %d "; + __builtin_printf (a, 0.5); + ^ + + when c-format.c erroneously used the indicated one-character + location as the format string location, leading to a read past the + end of a string buffer in cpp_interpret_string_1. */ + +static void +test_lexer_string_locations_non_string (const line_table_case &case_) +{ + /* .....................000000000111111111122222222223. + .....................123456789012345678901234567890. */ + const char *content = (" a\n"); + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_NAME); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a"); + + /* At this point, libcpp is attempting to interpret the name as a + string literal, despite it not starting with a quote. We don't detect + that, but we should at least fail gracefully. */ + ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, + "cpp_interpret_string_1 failed"); +} + +/* Ensure that we can read substring information for a token which + starts in one linemap and ends in another . Adapted from + gcc.dg/cpp/pr69985.c. */ + +static void +test_lexer_string_locations_long_line (const line_table_case &case_) +{ + /* .....................000000.000111111111 + .....................123456.789012346789. */ + const char *content = ("/* A very long line, so that we start a new line map. */\n" + " \"0123456789012345678901234567890123456789" + "0123456789012345678901234567890123456789" + "0123456789012345678901234567890123456789" + "0123456789\"\n"); + + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + + if (!should_have_column_data_p (line_table->highest_location)) + return; + + /* Verify ranges of individual characters. */ + ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131); + for (int i = 0; i < 131; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING, + i, 2, 7 + i, 7 + i); +} + +/* Test of locations within a raw string that doesn't contain a newline. */ + +static void +test_lexer_string_locations_raw_string_one_line (const line_table_case &case_) +{ + /* .....................00.0000000111111111122. + .....................12.3456789012345678901. */ + const char *content = ("R\"foo(0123456789)foo\"\n"); + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + + /* Verify that cpp_interpret_string works. */ + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING; + bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1, + &dst_string, type); + ASSERT_TRUE (result); + ASSERT_STREQ ("0123456789", (const char *)dst_string.text); + free (const_cast <unsigned char *> (dst_string.text)); + + if (!should_have_column_data_p (line_table->highest_location)) + return; + + /* 0-9, plus the nil terminator. 
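+   The raw-string body starts at column 7, immediately after the R"foo(
+   prefix, so character I is expected at column 7 + I; the NUL falls on
+   column 17, the start of the closing delimiter.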
*/ + ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11); + for (int i = 0; i < 11; i++) + ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING, + i, 1, 7 + i, 7 + i); +} + +/* Test of locations within a raw string that contains a newline. */ + +static void +test_lexer_string_locations_raw_string_multiline (const line_table_case &case_) +{ + /* .....................00.0000. + .....................12.3456. */ + const char *content = ("R\"foo(\n" + /* .....................00000. + .....................12345. */ + "hello\n" + "world\n" + /* .....................00000. + .....................12345. */ + ")foo\"\n"); + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected token back. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_STRING); + + /* Verify that cpp_interpret_string works. */ + cpp_string dst_string; + const enum cpp_ttype type = CPP_STRING; + bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1, + &dst_string, type); + ASSERT_TRUE (result); + ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text); + free (const_cast <unsigned char *> (dst_string.text)); + + if (!should_have_column_data_p (line_table->highest_location)) + return; + + /* Currently we don't support locations within raw strings that + contain newlines. */ + ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type, + "range endpoints are on different lines"); +} + +/* Test of parsing an unterminated raw string. */ + +static void +test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_) +{ + const char *content = "R\"ouch()ouCh\" /* etc */"; + + lexer_diagnostic_sink diagnostics; + lexer_test test (case_, content, &diagnostics); + test.m_implicitly_expect_EOF = false; + + /* Attempt to parse the raw string. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_EOF); + + ASSERT_EQ (1, diagnostics.m_diagnostics.length ()); + /* We expect the message "unterminated raw string" + in the "cpplib" translation domain. + It's not clear that dgettext is available on all supported hosts, + so this assertion is commented-out for now. + ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"), + diagnostics.m_diagnostics[0]); + */ +} + +/* Test of lexing char constants. */ + +static void +test_lexer_char_constants (const line_table_case &case_) +{ + /* Various char constants. + .....................0000000001111111111.22222222223. + .....................1234567890123456789.01234567890. */ + const char *content = (" 'a'\n" + " u'a'\n" + " U'a'\n" + " L'a'\n" + " 'abc'\n"); + lexer_test test (case_, content, NULL); + + /* Verify that we get the expected tokens back. */ + /* 'a'. */ + const cpp_token *tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_CHAR); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'"); + + unsigned int chars_seen; + int unsignedp; + cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok, + &chars_seen, &unsignedp); + ASSERT_EQ (cc, 'a'); + ASSERT_EQ (chars_seen, 1); + + /* u'a'. */ + tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_CHAR16); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'"); + + /* U'a'. */ + tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_CHAR32); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'"); + + /* L'a'. */ + tok = test.get_token (); + ASSERT_EQ (tok->type, CPP_WCHAR); + ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'"); + + /* 'abc' (c-char-sequence). 
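+     A multicharacter constant is still lexed as a single CPP_CHAR token;
+     only the token type and spelling are checked here, since the value of
+     such a constant is implementation-defined.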
 */
+  tok = test.get_token ();
+  ASSERT_EQ (tok->type, CPP_CHAR);
+  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
+}
+
+/* A table of interesting location_t values, giving one axis of our test
+   matrix.  */
+
+static const location_t boundary_locations[] = {
+  /* Zero means "don't override the default values for a new line_table".  */
+  0,
+
+  /* An arbitrary non-zero value that isn't close to one of
+     the boundary values below.  */
+  0x10000,
+
+  /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
+  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
+  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
+  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
+  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
+  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
+
+  /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
+  LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
+  LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
+  LINE_MAP_MAX_LOCATION_WITH_COLS,
+  LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
+  LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
+};
+
+/* Run TESTCASE multiple times, once for each case in our test matrix.  */
+
+void
+for_each_line_table_case (void (*testcase) (const line_table_case &))
+{
+  /* As noted above in the description of struct line_table_case,
+     we want to explore a test matrix of interesting line_table
+     situations, running various selftests for each case within the
+     matrix.  */
+
+  /* Run all tests with:
+     (a) line_table->default_range_bits == 0, and
+     (b) line_table->default_range_bits == 5.  */
+  int num_cases_tested = 0;
+  for (int default_range_bits = 0; default_range_bits <= 5;
+       default_range_bits += 5)
+    {
+      /* ...and use each of the "interesting" location values as
+	 the starting location within line_table.  */
+      const int num_boundary_locations
+	= sizeof (boundary_locations) / sizeof (boundary_locations[0]);
+      for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
+	{
+	  line_table_case c (default_range_bits, boundary_locations[loc_idx]);
+
+	  testcase (c);
+
+	  num_cases_tested++;
+	}
+    }
+
+  /* Verify that we fully covered the test matrix.  */
+  ASSERT_EQ (num_cases_tested, 2 * 12);
+}
+
+/* Verify that when presented with a consecutive pair of locations with
+   a very large line offset, we don't attempt to consolidate them into
+   a single ordinary linemap where the line offsets within the line map
+   would lead to overflow (PR lto/88147).  */
+
+static void
+test_line_offset_overflow ()
+{
+  line_table_test ltt (line_table_case (5, 0));
+
+  linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
+  linemap_line_start (line_table, 1, 100);
+  location_t loc_a = linemap_line_start (line_table, 2578, 255);
+  assert_loceq ("foo.c", 2578, 0, loc_a);
+
+  const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
+  ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
+  ASSERT_EQ (ordmap_a->m_range_bits, 5);
+
+  location_t loc_b = linemap_line_start (line_table, 404198, 512);
+  assert_loceq ("foo.c", 404198, 0, loc_b);
+
+  /* We should have started a new linemap, rather than attempting to store
+     a very large line offset.  */
+  const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
+  ASSERT_NE (ordmap_a, ordmap_b);
+}
+
+void test_cpp_utf8 ()
+{
+  const int def_tabstop = 8;
+  cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
+
+  /* Verify that wcwidth of invalid UTF-8 or control bytes is 1.
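+     Each such byte counts as one display column, so the eight bytes in
+     the first string below have width 8 and the five control bytes have
+     width 5.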
*/ + { + int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy); + ASSERT_EQ (8, w_bad); + int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy); + ASSERT_EQ (5, w_ctrl); + } + + /* Verify that wcwidth of valid UTF-8 is as expected. */ + { + const int w_pi = cpp_display_width ("\xcf\x80", 2, policy); + ASSERT_EQ (1, w_pi); + const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy); + ASSERT_EQ (2, w_emoji); + const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2, + policy); + ASSERT_EQ (1, w_umlaut_precomposed); + const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3, + policy); + ASSERT_EQ (1, w_umlaut_combining); + const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy); + ASSERT_EQ (2, w_han); + const int w_ascii = cpp_display_width ("GCC", 3, policy); + ASSERT_EQ (3, w_ascii); + const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82" + "\x9f! \xe4\xb8\xba y\xcc\x88", + 24, policy); + ASSERT_EQ (18, w_mixed); + } + + /* Verify that display width properly expands tabs. */ + { + const char *tstr = "\tabc\td"; + ASSERT_EQ (6, cpp_display_width (tstr, 6, + cpp_char_column_policy (1, cpp_wcwidth))); + ASSERT_EQ (10, cpp_display_width (tstr, 6, + cpp_char_column_policy (3, cpp_wcwidth))); + ASSERT_EQ (17, cpp_display_width (tstr, 6, + cpp_char_column_policy (8, cpp_wcwidth))); + ASSERT_EQ (1, + cpp_display_column_to_byte_column + (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth))); + } + + /* Verify that cpp_byte_column_to_display_column can go past the end, + and similar edge cases. */ + { + const char *str + /* Display columns. + 111111112345 */ + = "\xcf\x80 abc"; + /* 111122223456 + Byte columns. */ + + ASSERT_EQ (5, cpp_display_width (str, 6, policy)); + ASSERT_EQ (105, + cpp_byte_column_to_display_column (str, 6, 106, policy)); + ASSERT_EQ (10000, + cpp_byte_column_to_display_column (NULL, 0, 10000, policy)); + ASSERT_EQ (0, + cpp_byte_column_to_display_column (NULL, 10000, 0, policy)); + } + + /* Verify that cpp_display_column_to_byte_column can go past the end, + and similar edge cases, and check invertibility. */ + { + const char *str + /* Display columns. + 000000000000000000000000000000000000011 + 111111112222222234444444455555555678901 */ + = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello"; + /* 000000000000000000000000000000000111111 + 111122223333444456666777788889999012345 + Byte columns. */ + ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy)); + ASSERT_EQ (15, + cpp_display_column_to_byte_column (str, 15, 11, policy)); + ASSERT_EQ (115, + cpp_display_column_to_byte_column (str, 15, 111, policy)); + ASSERT_EQ (10000, + cpp_display_column_to_byte_column (NULL, 0, 10000, policy)); + ASSERT_EQ (0, + cpp_display_column_to_byte_column (NULL, 10000, 0, policy)); + + /* Verify that we do not interrupt a UTF-8 sequence. */ + ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy)); + + for (int byte_col = 1; byte_col <= 15; ++byte_col) + { + const int disp_col + = cpp_byte_column_to_display_column (str, 15, byte_col, policy); + const int byte_col2 + = cpp_display_column_to_byte_column (str, 15, disp_col, policy); + + /* If we ask for the display column in the middle of a UTF-8 + sequence, it will return the length of the partial sequence, + matching the behavior of GCC before display column support. + Otherwise check the round trip was successful. 
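+	   Here byte columns 1-3 and 6-8 fall in the middle of the two
+	   emoji, which is what the special cases below encode.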
*/ + if (byte_col < 4) + ASSERT_EQ (byte_col, disp_col); + else if (byte_col >= 6 && byte_col < 9) + ASSERT_EQ (3 + (byte_col - 5), disp_col); + else + ASSERT_EQ (byte_col2, byte_col); + } + } + +} + +/* Run all of the selftests within this file. */ + +void +input_c_tests () +{ + test_linenum_comparisons (); + test_should_have_column_data_p (); + test_unknown_location (); + test_builtins (); + for_each_line_table_case (test_make_location_nonpure_range_endpoints); + + for_each_line_table_case (test_accessing_ordinary_linemaps); + for_each_line_table_case (test_lexer); + for_each_line_table_case (test_lexer_string_locations_simple); + for_each_line_table_case (test_lexer_string_locations_ebcdic); + for_each_line_table_case (test_lexer_string_locations_hex); + for_each_line_table_case (test_lexer_string_locations_oct); + for_each_line_table_case (test_lexer_string_locations_letter_escape_1); + for_each_line_table_case (test_lexer_string_locations_letter_escape_2); + for_each_line_table_case (test_lexer_string_locations_ucn4); + for_each_line_table_case (test_lexer_string_locations_ucn8); + for_each_line_table_case (test_lexer_string_locations_wide_string); + for_each_line_table_case (test_lexer_string_locations_string16); + for_each_line_table_case (test_lexer_string_locations_string32); + for_each_line_table_case (test_lexer_string_locations_u8); + for_each_line_table_case (test_lexer_string_locations_utf8_source); + for_each_line_table_case (test_lexer_string_locations_concatenation_1); + for_each_line_table_case (test_lexer_string_locations_concatenation_2); + for_each_line_table_case (test_lexer_string_locations_concatenation_3); + for_each_line_table_case (test_lexer_string_locations_macro); + for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument); + for_each_line_table_case (test_lexer_string_locations_non_string); + for_each_line_table_case (test_lexer_string_locations_long_line); + for_each_line_table_case (test_lexer_string_locations_raw_string_one_line); + for_each_line_table_case (test_lexer_string_locations_raw_string_multiline); + for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated); + for_each_line_table_case (test_lexer_char_constants); + + test_reading_source_line (); + + test_line_offset_overflow (); + + test_cpp_utf8 (); +} + +} // namespace selftest + +#endif /* CHECKING_P */ |