diff options
author | Lewis Hyatt <lhyatt@gmail.com> | 2019-09-19 19:56:11 +0000 |
---|---|---|
committer | Joseph Myers <jsm28@gcc.gnu.org> | 2019-09-19 20:56:11 +0100 |
commit | 7d112d6670a0e0e662f8a7e64c33686e475832c8 (patch) | |
tree | 983eb23217b2572ff4fe5a7f7fe0e5c0c0b9a48d /libcpp/lex.c | |
parent | e0710fcf7dc70054a9a20ab1b8d77f4fef26ef2c (diff) | |
download | gcc-7d112d6670a0e0e662f8a7e64c33686e475832c8.zip gcc-7d112d6670a0e0e662f8a7e64c33686e475832c8.tar.gz gcc-7d112d6670a0e0e662f8a7e64c33686e475832c8.tar.bz2 |
Support extended characters in C/C++ identifiers (PR c/67224)
libcpp/ChangeLog
2019-09-19 Lewis Hyatt <lhyatt@gmail.com>
PR c/67224
* charset.c (_cpp_valid_utf8): New function to help lex UTF-8 tokens.
* internal.h (_cpp_valid_utf8): Declare.
* lex.c (forms_identifier_p): Use it to recognize UTF-8 identifiers.
(_cpp_lex_direct): Handle UTF-8 in identifiers and CPP_OTHER tokens.
Do all work in "default" case to avoid slowing down typical code paths.
Also handle $ and UCN in the default case for consistency.
gcc/ChangeLog
2019-09-19 Lewis Hyatt <lhyatt@gmail.com>
PR c/67224
* doc/cpp.texi: Document support for extended characters in
identifiers.
* doc/cppopts.texi: Likewise.
gcc/testsuite/ChangeLog
2019-09-19 Lewis Hyatt <lhyatt@gmail.com>
PR c/67224
* c-c++-common/cpp/ucnid-2011-1-utf8.c: New test.
* g++.dg/cpp/ucnid-1-utf8.C: New test.
* g++.dg/cpp/ucnid-2-utf8.C: New test.
* g++.dg/cpp/ucnid-3-utf8.C: New test.
* g++.dg/cpp/ucnid-4-utf8.C: New test.
* g++.dg/other/ucnid-1-utf8.C: New test.
* gcc.dg/cpp/ucnid-1-utf8.c: New test.
* gcc.dg/cpp/ucnid-10-utf8.c: New test.
* gcc.dg/cpp/ucnid-11-utf8.c: New test.
* gcc.dg/cpp/ucnid-12-utf8.c: New test.
* gcc.dg/cpp/ucnid-13-utf8.c: New test.
* gcc.dg/cpp/ucnid-14-utf8.c: New test.
* gcc.dg/cpp/ucnid-15-utf8.c: New test.
* gcc.dg/cpp/ucnid-2-utf8.c: New test.
* gcc.dg/cpp/ucnid-3-utf8.c: New test.
* gcc.dg/cpp/ucnid-4-utf8.c: New test.
* gcc.dg/cpp/ucnid-6-utf8.c: New test.
* gcc.dg/cpp/ucnid-7-utf8.c: New test.
* gcc.dg/cpp/ucnid-9-utf8.c: New test.
* gcc.dg/ucnid-1-utf8.c: New test.
* gcc.dg/ucnid-10-utf8.c: New test.
* gcc.dg/ucnid-11-utf8.c: New test.
* gcc.dg/ucnid-12-utf8.c: New test.
* gcc.dg/ucnid-13-utf8.c: New test.
* gcc.dg/ucnid-14-utf8.c: New test.
* gcc.dg/ucnid-15-utf8.c: New test.
* gcc.dg/ucnid-16-utf8.c: New test.
* gcc.dg/ucnid-2-utf8.c: New test.
* gcc.dg/ucnid-3-utf8.c: New test.
* gcc.dg/ucnid-4-utf8.c: New test.
* gcc.dg/ucnid-5-utf8.c: New test.
* gcc.dg/ucnid-6-utf8.c: New test.
* gcc.dg/ucnid-7-utf8.c: New test.
* gcc.dg/ucnid-8-utf8.c: New test.
* gcc.dg/ucnid-9-utf8.c: New test.
From-SVN: r275979
Diffstat (limited to 'libcpp/lex.c')
-rw-r--r-- | libcpp/lex.c | 55 |
1 files changed, 37 insertions, 18 deletions
diff --git a/libcpp/lex.c b/libcpp/lex.c index 52e5bce..0e8de38 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -1313,7 +1313,9 @@ warn_about_normalization (cpp_reader *pfile, } } -/* Returns TRUE if the sequence starting at buffer->cur is invalid in +static const cppchar_t utf8_signifier = 0xC0; + +/* Returns TRUE if the sequence starting at buffer->cur is valid in an identifier. FIRST is TRUE if this starts an identifier. */ static bool forms_identifier_p (cpp_reader *pfile, int first, @@ -1336,17 +1338,25 @@ forms_identifier_p (cpp_reader *pfile, int first, return true; } - /* Is this a syntactically valid UCN? */ - if (CPP_OPTION (pfile, extended_identifiers) - && *buffer->cur == '\\' - && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U')) + /* Is this a syntactically valid UCN or a valid UTF-8 char? */ + if (CPP_OPTION (pfile, extended_identifiers)) { cppchar_t s; - buffer->cur += 2; - if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first, - state, &s, NULL, NULL)) - return true; - buffer->cur -= 2; + if (*buffer->cur >= utf8_signifier) + { + if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first, + state, &s)) + return true; + } + else if (*buffer->cur == '\\' + && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U')) + { + buffer->cur += 2; + if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first, + state, &s, NULL, NULL)) + return true; + buffer->cur -= 2; + } } return false; @@ -1464,7 +1474,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn, pfile->buffer->cur = cur; if (starts_ucn || forms_identifier_p (pfile, false, nst)) { - /* Slower version for identifiers containing UCNs (or $). */ + /* Slower version for identifiers containing UCNs + or extended chars (including $). */ do { while (ISIDNUM (*pfile->buffer->cur)) { @@ -3123,12 +3134,12 @@ _cpp_lex_direct (cpp_reader *pfile) /* @ is a punctuator in Objective-C. 
*/ case '@': result->type = CPP_ATSIGN; break; - case '$': - case '\\': + default: { const uchar *base = --buffer->cur; - struct normalize_state nst = INITIAL_NORMALIZE_STATE; + /* Check for an extended identifier ($ or UCN or UTF-8). */ + struct normalize_state nst = INITIAL_NORMALIZE_STATE; if (forms_identifier_p (pfile, true, &nst)) { result->type = CPP_NAME; @@ -3137,13 +3148,21 @@ _cpp_lex_direct (cpp_reader *pfile) warn_about_normalization (pfile, result, &nst); break; } + + /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a + single token. */ buffer->cur++; + if (c >= utf8_signifier) + { + const uchar *pstr = base; + cppchar_t s; + if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s)) + buffer->cur = pstr; + } + create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER); + break; } - /* FALLTHRU */ - default: - create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER); - break; } /* Potentially convert the location of the token to a range. */ |