aboutsummaryrefslogtreecommitdiff
path: root/libcpp/charset.cc
diff options
context:
space:
mode:
Diffstat (limited to 'libcpp/charset.cc')
-rw-r--r--libcpp/charset.cc91
1 files changed, 69 insertions, 22 deletions
diff --git a/libcpp/charset.cc b/libcpp/charset.cc
index d4f573e..7b625c9 100644
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -1891,7 +1891,7 @@ cpp_valid_utf8_p (const char *buffer, size_t num_bytes)
invalid because they cannot be represented in UTF-16.
Reject such values.*/
- if (cp >= UCS_LIMIT)
+ if (cp > UCS_LIMIT)
return false;
}
/* No problems encountered. */
@@ -3154,34 +3154,26 @@ cpp_display_column_to_byte_column (const char *data, int data_length,
return dw.bytes_processed () + MAX (0, display_col - avail_display);
}
-/* Our own version of wcwidth(). We don't use the actual wcwidth() in glibc,
- because that will inspect the user's locale, and in particular in an ASCII
- locale, it will not return anything useful for extended characters. But GCC
- in other respects (see e.g. _cpp_default_encoding()) behaves as if
- everything is UTF-8. We also make some tweaks that are useful for the way
- GCC needs to use this data, e.g. tabs and other control characters should be
- treated as having width 1. The lookup tables are generated from
- contrib/unicode/gen_wcwidth.py and were made by simply calling glibc
- wcwidth() on all codepoints, then applying the small tweaks. These tables
- are not highly optimized, but for the present purpose of outputting
- diagnostics, they are sufficient. */
-
-#include "generated_cpp_wcwidth.h"
-int cpp_wcwidth (cppchar_t c)
+/* Look up the property recorded for codepoint C in a range table.
+   RANGE_ENDS[i] is the inclusive upper bound of range i and
+   RANGE_VALUES[i] is the value shared by every codepoint in that range;
+   both arrays hold NUM_RANGES entries.  The binary search below assumes
+   RANGE_ENDS is sorted in ascending order.  Return DEFAULT_VALUE if C is
+   greater than every entry searched.  */
+template <typename PropertyType>
+PropertyType
+get_cppchar_property (cppchar_t c,
+ const cppchar_t *range_ends,
+ const PropertyType *range_values,
+ size_t num_ranges,
+ PropertyType default_value)
{
- if (__builtin_expect (c <= wcwidth_range_ends[0], true))
- return wcwidth_widths[0];
+ if (__builtin_expect (c <= range_ends[0], true))
+ return range_values[0];
/* Binary search the tables. */
int begin = 1;
- static const int end
- = sizeof wcwidth_range_ends / sizeof (*wcwidth_range_ends);
+ /* Deliberately not static: a static local is initialized only once per
+ template instantiation, so it would latch the NUM_RANGES of whichever
+ table was searched first and apply it to every other table that
+ shares the same PropertyType. */
+ const int end = num_ranges;
int len = end - begin;
do
{
int half = len/2;
int middle = begin + half;
- if (c > wcwidth_range_ends[middle])
+ if (c > range_ends[middle])
{
begin = middle + 1;
len -= half + 1;
@@ -3191,6 +3183,61 @@ int cpp_wcwidth (cppchar_t c)
} while (len);
if (__builtin_expect (begin != end, true))
- return wcwidth_widths[begin];
- return 1;
+ return range_values[begin];
+
+ return default_value;
+}
+
+/* Our own version of wcwidth(). We don't use the actual wcwidth() in glibc,
+ because that will inspect the user's locale, and in particular in an ASCII
+ locale, it will not return anything useful for extended characters. But GCC
+ in other respects (see e.g. _cpp_default_encoding()) behaves as if
+ everything is UTF-8. We also make some tweaks that are useful for the way
+ GCC needs to use this data, e.g. tabs and other control characters should be
+ treated as having width 1. The lookup tables are generated from
+ contrib/unicode/gen_wcwidth.py and were made by simply calling glibc
+ wcwidth() on all codepoints, then applying the small tweaks. These tables
+ are not highly optimized, but for the present purpose of outputting
+ diagnostics, they are sufficient. */
+
+#include "generated_cpp_wcwidth.h"
+
+/* Return the width recorded for codepoint C in the generated wcwidth
+   tables.  Codepoints past the last table range get width 1.  */
+int
+cpp_wcwidth (cppchar_t c)
+{
+  const size_t table_len
+    = sizeof wcwidth_range_ends / sizeof wcwidth_range_ends[0];
+  const unsigned char width
+    = get_cppchar_property<unsigned char> (c, wcwidth_range_ends,
+                                           wcwidth_widths, table_len, 1);
+  return width;
+}
+
+#include "combining-chars.inc"
+
+/* Return the is_combining table entry whose range covers codepoint C, or
+   false when C lies past the last range in combining_range_ends.  */
+bool
+cpp_is_combining_char (cppchar_t c)
+{
+  const size_t table_len
+    = sizeof combining_range_ends / sizeof combining_range_ends[0];
+  const bool combining
+    = get_cppchar_property<bool> (c, combining_range_ends, is_combining,
+                                  table_len, false);
+  return combining;
+}
+
+#include "printable-chars.inc"
+
+/* Return the is_printable table entry whose range covers codepoint C, or
+   false when C lies past the last range in printable_range_ends.  */
+bool
+cpp_is_printable_char (cppchar_t c)
+{
+  const size_t table_len
+    = sizeof printable_range_ends / sizeof printable_range_ends[0];
+  const bool printable
+    = get_cppchar_property<bool> (c, printable_range_ends, is_printable,
+                                  table_len, false);
+  return printable;
+}