diff options
Diffstat (limited to 'libcpp')
-rw-r--r-- | libcpp/charset.c | 109 | ||||
-rw-r--r-- | libcpp/files.c | 56 | ||||
-rw-r--r-- | libcpp/include/cpplib.h | 18 |
3 files changed, 137 insertions, 46 deletions
diff --git a/libcpp/charset.c b/libcpp/charset.c index 99a9b73..61881f9 100644 --- a/libcpp/charset.c +++ b/libcpp/charset.c @@ -630,7 +630,11 @@ static const struct cpp_conversion conversion_tab[] = { cset_converter structure for conversion from FROM to TO. If iconv_open() fails, issue an error and return an identity converter. Silently return an identity converter if FROM and TO - are identical. */ + are identical. + + PFILE is only used for generating diagnostics; setting it to NULL + suppresses diagnostics. */ + static struct cset_converter init_iconv_desc (cpp_reader *pfile, const char *to, const char *from) { @@ -672,25 +676,31 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from) if (ret.cd == (iconv_t) -1) { - if (errno == EINVAL) - cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */ - "conversion from %s to %s not supported by iconv", - from, to); - else - cpp_errno (pfile, CPP_DL_ERROR, "iconv_open"); - + if (pfile) + { + if (errno == EINVAL) + cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */ + "conversion from %s to %s not supported by iconv", + from, to); + else + cpp_errno (pfile, CPP_DL_ERROR, "iconv_open"); + } ret.func = convert_no_conversion; } } else { - cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */ - "no iconv implementation, cannot convert from %s to %s", - from, to); + if (pfile) + { + cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */ + "no iconv implementation, cannot convert from %s to %s", + from, to); + } ret.func = convert_no_conversion; ret.cd = (iconv_t) -1; ret.width = -1; } + return ret; } @@ -2122,6 +2132,25 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) buf, bufp - buf, HT_ALLOC)); } + +/* Utility to strip a UTF-8 byte order marking from the beginning + of a buffer. Returns the number of bytes to skip, which currently + will be either 0 or 3. 
*/ +int +cpp_check_utf8_bom (const char *data, size_t data_length) +{ + +#if HOST_CHARSET == HOST_CHARSET_ASCII + const unsigned char *udata = (const unsigned char *) data; + if (data_length >= 3 && udata[0] == 0xef && udata[1] == 0xbb + && udata[2] == 0xbf) + return 3; +#endif + + return 0; +} + + /* Convert an input buffer (containing the complete contents of one source file) from INPUT_CHARSET to the source character set. INPUT points to the input buffer, SIZE is its allocated size, and LEN is @@ -2135,7 +2164,11 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) INPUT is expected to have been allocated with xmalloc. This function will either set *BUFFER_START to INPUT, or free it and set *BUFFER_START to a pointer to another xmalloc-allocated block of - memory. */ + memory. + + PFILE is only used to generate diagnostics; setting it to NULL suppresses + diagnostics, and causes a return of NULL if there was any error instead. */ + uchar * _cpp_convert_input (cpp_reader *pfile, const char *input_charset, uchar *input, size_t size, size_t len, @@ -2158,17 +2191,27 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, to.text = XNEWVEC (uchar, to.asize); to.len = 0; - if (!APPLY_CONVERSION (input_cset, input, len, &to)) - cpp_error (pfile, CPP_DL_ERROR, - "failure to convert %s to %s", - CPP_OPTION (pfile, input_charset), SOURCE_CHARSET); - + const bool ok = APPLY_CONVERSION (input_cset, input, len, &to); free (input); - } - /* Clean up the mess. */ - if (input_cset.func == convert_using_iconv) - iconv_close (input_cset.cd); + /* Clean up the mess. */ + if (input_cset.func == convert_using_iconv) + iconv_close (input_cset.cd); + + /* Handle conversion failure. 
*/ + if (!ok) + { + if (!pfile) + { + XDELETEVEC (to.text); + *buffer_start = NULL; + *st_size = 0; + return NULL; + } + cpp_error (pfile, CPP_DL_ERROR, "failure to convert %s to %s", + input_charset, SOURCE_CHARSET); + } + } /* Resize buffer if we allocated substantially too much, or if we haven't enough space for the \n-terminator or following @@ -2192,19 +2235,14 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, buffer = to.text; *st_size = to.len; -#if HOST_CHARSET == HOST_CHARSET_ASCII - /* The HOST_CHARSET test just above ensures that the source charset - is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that - glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a + + /* Ignore a UTF-8 BOM if we see one and the source charset is UTF-8. Note + that glibc's UTF-8 iconv() provider (as of glibc 2.7) does not ignore a BOM -- however, even if it did, we would still need this code due to the 'convert_no_conversion' case. */ - if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb - && to.text[2] == 0xbf) - { - *st_size -= 3; - buffer += 3; - } -#endif + const int bom_len = cpp_check_utf8_bom ((const char *) to.text, to.len); + *st_size -= bom_len; + buffer += bom_len; *buffer_start = to.text; return buffer; @@ -2244,6 +2282,13 @@ _cpp_default_encoding (void) return current_encoding; } +/* Check if the configured input charset requires no conversion, other than + possibly stripping a UTF-8 BOM. */ +bool cpp_input_conversion_is_trivial (const char *input_charset) +{ + return !strcasecmp (input_charset, SOURCE_CHARSET); +} + /* Implementation of class cpp_string_location_reader. */ /* Constructor for cpp_string_location_reader. 
*/ diff --git a/libcpp/files.c b/libcpp/files.c index 6e20fc5..c93a03c 100644 --- a/libcpp/files.c +++ b/libcpp/files.c @@ -173,7 +173,7 @@ static bool pch_open_file (cpp_reader *pfile, _cpp_file *file, static bool find_file_in_dir (cpp_reader *pfile, _cpp_file *file, bool *invalid_pch, location_t loc); static bool read_file_guts (cpp_reader *pfile, _cpp_file *file, - location_t loc); + location_t loc, const char *input_charset); static bool read_file (cpp_reader *pfile, _cpp_file *file, location_t loc); static struct cpp_dir *search_path_head (cpp_reader *, const char *fname, @@ -671,9 +671,12 @@ _cpp_find_file (cpp_reader *pfile, const char *fname, cpp_dir *start_dir, Use LOC for any diagnostics. + PFILE may be NULL. In this case, no diagnostics are issued. + FIXME: Flush file cache and try again if we run out of memory. */ static bool -read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc) +read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, + const char *input_charset) { ssize_t size, total, count; uchar *buf; @@ -681,8 +684,9 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc) if (S_ISBLK (file->st.st_mode)) { - cpp_error_at (pfile, CPP_DL_ERROR, loc, - "%s is a block device", file->path); + if (pfile) + cpp_error_at (pfile, CPP_DL_ERROR, loc, + "%s is a block device", file->path); return false; } @@ -699,8 +703,9 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc) does not bite us. 
*/ if (file->st.st_size > INTTYPE_MAXIMUM (ssize_t)) { - cpp_error_at (pfile, CPP_DL_ERROR, loc, - "%s is too large", file->path); + if (pfile) + cpp_error_at (pfile, CPP_DL_ERROR, loc, + "%s is too large", file->path); return false; } @@ -733,29 +738,29 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc) if (count < 0) { - cpp_errno_filename (pfile, CPP_DL_ERROR, file->path, loc); + if (pfile) + cpp_errno_filename (pfile, CPP_DL_ERROR, file->path, loc); free (buf); return false; } - if (regular && total != size && STAT_SIZE_RELIABLE (file->st)) + if (pfile && regular && total != size && STAT_SIZE_RELIABLE (file->st)) cpp_error_at (pfile, CPP_DL_WARNING, loc, "%s is shorter than expected", file->path); file->buffer = _cpp_convert_input (pfile, - CPP_OPTION (pfile, input_charset), + input_charset, buf, size + 16, total, &file->buffer_start, &file->st.st_size); - file->buffer_valid = true; - - return true; + file->buffer_valid = file->buffer; + return file->buffer_valid; } /* Convenience wrapper around read_file_guts that opens the file if necessary and closes the file descriptor after reading. FILE must have been passed through find_file() at some stage. Use LOC for - any diagnostics. */ + any diagnostics. Unlike read_file_guts(), PFILE may not be NULL. */ static bool read_file (cpp_reader *pfile, _cpp_file *file, location_t loc) { @@ -773,7 +778,8 @@ read_file (cpp_reader *pfile, _cpp_file *file, location_t loc) return false; } - file->dont_read = !read_file_guts (pfile, file, loc); + file->dont_read = !read_file_guts (pfile, file, loc, + CPP_OPTION (pfile, input_charset)); close (file->fd); file->fd = -1; @@ -2145,3 +2151,25 @@ _cpp_has_header (cpp_reader *pfile, const char *fname, int angle_brackets, return file->err_no != ENOENT; } +/* Read a file and convert it from INPUT_CHARSET to the source charset, the + same as if it were being read by a cpp_reader. 
*/ + +cpp_converted_source +cpp_get_converted_source (const char *fname, const char *input_charset) +{ + cpp_converted_source res = {}; + _cpp_file file = {}; + file.fd = -1; + file.name = lbasename (fname); + file.path = fname; + if (!open_file (&file)) + return res; + const bool ok = read_file_guts (NULL, &file, 0, input_charset); + close (file.fd); + if (!ok) + return res; + res.to_free = (char *) file.buffer_start; + res.data = (char *) file.buffer; + res.len = file.st.st_size; + return res; +} diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index 7e84063..af14291 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -1379,6 +1379,20 @@ extern struct _cpp_file *cpp_get_file (cpp_buffer *); extern cpp_buffer *cpp_get_prev (cpp_buffer *); extern void cpp_clear_file_cache (cpp_reader *); +/* cpp_get_converted_source returns the contents of the given file, as it exists + after cpplib has read it and converted it from the input charset to the + source charset. Return struct will be zero-filled if the data could not be + read for any reason. The data starts at the DATA pointer, but the TO_FREE + pointer is what should be passed to free(), as there may be an offset. */ +struct cpp_converted_source +{ + char *to_free; + char *data; + size_t len; +}; +cpp_converted_source cpp_get_converted_source (const char *fname, + const char *input_charset); + /* In pch.c */ struct save_macro_data; extern int cpp_save_state (cpp_reader *, FILE *); @@ -1449,6 +1463,7 @@ class cpp_display_width_computation { /* Convenience functions that are simple use cases for class cpp_display_width_computation. Tab characters will be expanded to spaces as determined by TABSTOP. 
*/ + int cpp_byte_column_to_display_column (const char *data, int data_length, int column, int tabstop); inline int cpp_display_width (const char *data, int data_length, @@ -1461,4 +1476,7 @@ int cpp_display_column_to_byte_column (const char *data, int data_length, int display_col, int tabstop); int cpp_wcwidth (cppchar_t c); +bool cpp_input_conversion_is_trivial (const char *input_charset); +int cpp_check_utf8_bom (const char *data, size_t data_length); + #endif /* ! LIBCPP_CPPLIB_H */ |