From b1b60145aedb8adcb0b9dcf43a5ae735c2f03b51 Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Tue, 22 May 2018 17:35:38 +0100 Subject: Support UTF-8 identifiers in C/C++ expressions (PR gdb/22973) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Factor cp_ident_is_alpha/cp_ident_is_alnum out of gdb/cp-name-parser.y and use them in the C/C++ expression parser too. New test included. gdb/ChangeLog: 2018-05-22 Pedro Alves 張俊芝 PR gdb/22973 * c-exp.y: Include "c-support.h". (parse_number, c_parse_escape, lex_one_token): Use TOLOWER instead of tolower. Use c_ident_is_alpha to scan names. * c-lang.c: Include "c-support.h". (convert_ucn, convert_octal, convert_hex, convert_escape): Use ISXDIGIT instead of isxdigit and ISDIGIT instead of isdigit. * c-support.h: New file, with bits factored out from ... * cp-name-parser.y: ... this file. Include "c-support.h". (cp_ident_is_alpha, cp_ident_is_alnum): Deleted, moved to c-support.h and renamed. (symbol_end, yylex): Adjust. gdb/testsuite/ChangeLog: 2018-05-22 Pedro Alves PR gdb/22973 * gdb.base/utf8-identifiers.c: New file. * gdb.base/utf8-identifiers.exp: New file. --- gdb/cp-name-parser.y | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) (limited to 'gdb/cp-name-parser.y') diff --git a/gdb/cp-name-parser.y b/gdb/cp-name-parser.y index f522e46..ebae562 100644 --- a/gdb/cp-name-parser.y +++ b/gdb/cp-name-parser.y @@ -35,6 +35,7 @@ #include "safe-ctype.h" #include "demangle.h" #include "cp-support.h" +#include "c-support.h" /* Bison does not make it easy to create a parser without global state, unfortunately.
Here are all the global variables used @@ -1304,28 +1305,6 @@ d_binary (const char *name, struct demangle_component *lhs, struct demangle_comp fill_comp (DEMANGLE_COMPONENT_BINARY_ARGS, lhs, rhs)); } -/* Like ISALPHA, but also returns true for the union of all UTF-8 - multi-byte sequence bytes and non-ASCII characters in - extended-ASCII charsets (e.g., Latin1). I.e., returns true if the - high bit is set. Note that not all UTF-8 ranges are allowed in C++ - identifiers, but we don't need to be pedantic so for simplicity we - ignore that here. Plus this avoids the complication of actually - knowing what was the right encoding. */ - -static inline bool -cp_ident_is_alpha (unsigned char ch) -{ - return ISALPHA (ch) || ch >= 0x80; -} - -/* Similarly, but Like ISALNUM. */ - -static inline bool -cp_ident_is_alnum (unsigned char ch) -{ - return ISALNUM (ch) || ch >= 0x80; -} - /* Find the end of a symbol name starting at LEXPTR. */ static const char * @@ -1333,7 +1312,7 @@ symbol_end (const char *lexptr) { const char *p = lexptr; - while (*p && (cp_ident_is_alnum (*p) || *p == '_' || *p == '$' || *p == '.')) + while (*p && (c_ident_is_alnum (*p) || *p == '_' || *p == '$' || *p == '.')) p++; return p; @@ -1813,7 +1792,7 @@ yylex (void) return ERROR; } - if (!(c == '_' || c == '$' || cp_ident_is_alpha (c))) + if (!(c == '_' || c == '$' || c_ident_is_alpha (c))) { /* We must have come across a bad character (e.g. ';'). */ yyerror (_("invalid character")); @@ -1824,7 +1803,7 @@ yylex (void) namelen = 0; do c = tokstart[++namelen]; - while (cp_ident_is_alnum (c) || c == '_' || c == '$'); + while (c_ident_is_alnum (c) || c == '_' || c == '$'); lexptr += namelen; -- cgit v1.1