diff options
Diffstat (limited to 'libcpp/charset.cc')
-rw-r--r-- | libcpp/charset.cc | 91 |
1 files changed, 69 insertions, 22 deletions
diff --git a/libcpp/charset.cc b/libcpp/charset.cc
index d4f573e..7b625c9 100644
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -1891,7 +1891,7 @@ cpp_valid_utf8_p (const char *buffer, size_t num_bytes)
 	 invalid because they cannot be represented in UTF-16.
 
 	 Reject such values.*/
-      if (cp >= UCS_LIMIT)
+      if (cp > UCS_LIMIT)
 	return false;
     }
   /* No problems encountered.  */
@@ -3154,34 +3154,26 @@ cpp_display_column_to_byte_column (const char *data, int data_length,
   return dw.bytes_processed () + MAX (0, display_col - avail_display);
 }
 
-/* Our own version of wcwidth().  We don't use the actual wcwidth() in glibc,
-   because that will inspect the user's locale, and in particular in an ASCII
-   locale, it will not return anything useful for extended characters.  But GCC
-   in other respects (see e.g. _cpp_default_encoding()) behaves as if
-   everything is UTF-8.  We also make some tweaks that are useful for the way
-   GCC needs to use this data, e.g. tabs and other control characters should be
-   treated as having width 1.  The lookup tables are generated from
-   contrib/unicode/gen_wcwidth.py and were made by simply calling glibc
-   wcwidth() on all codepoints, then applying the small tweaks.  These tables
-   are not highly optimized, but for the present purpose of outputting
-   diagnostics, they are sufficient.  */
-
-#include "generated_cpp_wcwidth.h"
-int cpp_wcwidth (cppchar_t c)
+template <typename PropertyType>
+PropertyType
+get_cppchar_property (cppchar_t c,
+		      const cppchar_t *range_ends,
+		      const PropertyType *range_values,
+		      size_t num_ranges,
+		      PropertyType default_value)
 {
-  if (__builtin_expect (c <= wcwidth_range_ends[0], true))
-    return wcwidth_widths[0];
+  if (__builtin_expect (c <= range_ends[0], true))
+    return range_values[0];
 
   /* Binary search the tables.  */
   int begin = 1;
-  static const int end
-    = sizeof wcwidth_range_ends / sizeof (*wcwidth_range_ends);
+  const int end = num_ranges;  /* Not static: table sizes differ.  */
   int len = end - begin;
   do
     {
       int half = len/2;
       int middle = begin + half;
-      if (c > wcwidth_range_ends[middle])
+      if (c > range_ends[middle])
 	{
 	  begin = middle + 1;
 	  len -= half + 1;
@@ -3191,6 +3183,61 @@ int cpp_wcwidth (cppchar_t c)
     } while (len);
 
   if (__builtin_expect (begin != end, true))
-    return wcwidth_widths[begin];
-  return 1;
+    return range_values[begin];
+
+  return default_value;
+}
+
+/* Our own version of wcwidth().  We don't use the actual wcwidth() in glibc,
+   because that will inspect the user's locale, and in particular in an ASCII
+   locale, it will not return anything useful for extended characters.  But GCC
+   in other respects (see e.g. _cpp_default_encoding()) behaves as if
+   everything is UTF-8.  We also make some tweaks that are useful for the way
+   GCC needs to use this data, e.g. tabs and other control characters should be
+   treated as having width 1.  The lookup tables are generated from
+   contrib/unicode/gen_wcwidth.py and were made by simply calling glibc
+   wcwidth() on all codepoints, then applying the small tweaks.  These tables
+   are not highly optimized, but for the present purpose of outputting
+   diagnostics, they are sufficient.  */
+
+#include "generated_cpp_wcwidth.h"
+
+int
+cpp_wcwidth (cppchar_t c)
+{
+  const size_t num_ranges
+    = sizeof wcwidth_range_ends / sizeof (*wcwidth_range_ends);
+  return get_cppchar_property<unsigned char > (c,
+					       &wcwidth_range_ends[0],
+					       &wcwidth_widths[0],
+					       num_ranges,
+					       1);
+}
+
+#include "combining-chars.inc"
+
+bool
+cpp_is_combining_char (cppchar_t c)
+{
+  const size_t num_ranges
+    = sizeof combining_range_ends / sizeof (*combining_range_ends);
+  return get_cppchar_property<bool> (c,
+				     &combining_range_ends[0],
+				     &is_combining[0],
+				     num_ranges,
+				     false);
+}
+
+#include "printable-chars.inc"
+
+bool
+cpp_is_printable_char (cppchar_t c)
+{
+  const size_t num_ranges
+    = sizeof printable_range_ends / sizeof (*printable_range_ends);
+  return get_cppchar_property<bool> (c,
+				     &printable_range_ends[0],
+				     &is_printable[0],
+				     num_ranges,
+				     false);
 }