diff options
author | Lewis Hyatt <lhyatt@gmail.com> | 2019-09-19 19:56:11 +0000 |
---|---|---|
committer | Joseph Myers <jsm28@gcc.gnu.org> | 2019-09-19 20:56:11 +0100 |
commit | 7d112d6670a0e0e662f8a7e64c33686e475832c8 (patch) | |
tree | 983eb23217b2572ff4fe5a7f7fe0e5c0c0b9a48d /libcpp/charset.c | |
parent | e0710fcf7dc70054a9a20ab1b8d77f4fef26ef2c (diff) | |
download | gcc-7d112d6670a0e0e662f8a7e64c33686e475832c8.zip gcc-7d112d6670a0e0e662f8a7e64c33686e475832c8.tar.gz gcc-7d112d6670a0e0e662f8a7e64c33686e475832c8.tar.bz2 |
Support extended characters in C/C++ identifiers (PR c/67224)
libcpp/ChangeLog
2019-09-19 Lewis Hyatt <lhyatt@gmail.com>
PR c/67224
* charset.c (_cpp_valid_utf8): New function to help lex UTF-8 tokens.
* internal.h (_cpp_valid_utf8): Declare.
* lex.c (forms_identifier_p): Use it to recognize UTF-8 identifiers.
(_cpp_lex_direct): Handle UTF-8 in identifiers and CPP_OTHER tokens.
Do all work in "default" case to avoid slowing down typical code paths.
Also handle $ and UCN in the default case for consistency.
gcc/ChangeLog
2019-09-19 Lewis Hyatt <lhyatt@gmail.com>
PR c/67224
* doc/cpp.texi: Document support for extended characters in
identifiers.
* doc/cppopts.texi: Likewise.
gcc/testsuite/ChangeLog
2019-09-19 Lewis Hyatt <lhyatt@gmail.com>
PR c/67224
* c-c++-common/cpp/ucnid-2011-1-utf8.c: New test.
* g++.dg/cpp/ucnid-1-utf8.C: New test.
* g++.dg/cpp/ucnid-2-utf8.C: New test.
* g++.dg/cpp/ucnid-3-utf8.C: New test.
* g++.dg/cpp/ucnid-4-utf8.C: New test.
* g++.dg/other/ucnid-1-utf8.C: New test.
* gcc.dg/cpp/ucnid-1-utf8.c: New test.
* gcc.dg/cpp/ucnid-10-utf8.c: New test.
* gcc.dg/cpp/ucnid-11-utf8.c: New test.
* gcc.dg/cpp/ucnid-12-utf8.c: New test.
* gcc.dg/cpp/ucnid-13-utf8.c: New test.
* gcc.dg/cpp/ucnid-14-utf8.c: New test.
* gcc.dg/cpp/ucnid-15-utf8.c: New test.
* gcc.dg/cpp/ucnid-2-utf8.c: New test.
* gcc.dg/cpp/ucnid-3-utf8.c: New test.
* gcc.dg/cpp/ucnid-4-utf8.c: New test.
* gcc.dg/cpp/ucnid-6-utf8.c: New test.
* gcc.dg/cpp/ucnid-7-utf8.c: New test.
* gcc.dg/cpp/ucnid-9-utf8.c: New test.
* gcc.dg/ucnid-1-utf8.c: New test.
* gcc.dg/ucnid-10-utf8.c: New test.
* gcc.dg/ucnid-11-utf8.c: New test.
* gcc.dg/ucnid-12-utf8.c: New test.
* gcc.dg/ucnid-13-utf8.c: New test.
* gcc.dg/ucnid-14-utf8.c: New test.
* gcc.dg/ucnid-15-utf8.c: New test.
* gcc.dg/ucnid-16-utf8.c: New test.
* gcc.dg/ucnid-2-utf8.c: New test.
* gcc.dg/ucnid-3-utf8.c: New test.
* gcc.dg/ucnid-4-utf8.c: New test.
* gcc.dg/ucnid-5-utf8.c: New test.
* gcc.dg/ucnid-6-utf8.c: New test.
* gcc.dg/ucnid-7-utf8.c: New test.
* gcc.dg/ucnid-8-utf8.c: New test.
* gcc.dg/ucnid-9-utf8.c: New test.
From-SVN: r275979
Diffstat (limited to 'libcpp/charset.c')
-rw-r--r-- | libcpp/charset.c | 83 |
1 files changed, 81 insertions, 2 deletions
diff --git a/libcpp/charset.c b/libcpp/charset.c
index 8a0e5cb..1028621 100644
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -1198,6 +1198,84 @@ convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
   return from;
 }
 
+/* Performs a similar task as _cpp_valid_ucn, but parses UTF-8-encoded
+   extended characters rather than UCNs. If the return value is TRUE, then a
+   character was successfully decoded and stored in *CP; *PSTR has been
+   updated to point one past the valid UTF-8 sequence. Diagnostics may have
+   been emitted if the character parsed is not allowed in the current context.
+   If the return value is FALSE, then *PSTR has not been modified and *CP may
+   equal 0, to indicate that *PSTR does not form a valid UTF-8 sequence, or it
+   may, when processing an identifier in C mode, equal a codepoint that was
+   validly encoded but is not allowed to appear in an identifier. In either
+   case, no diagnostic is emitted, and the return value of FALSE should cause
+   a new token to be formed.
+
+   Unlike _cpp_valid_ucn, this will never be called when lexing a string; only
+   a potential identifier, or a CPP_OTHER token. NST is unused in the latter
+   case.
+
+   As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for
+   the start of an identifier, or 2 otherwise. */
+
+extern bool
+_cpp_valid_utf8 (cpp_reader *pfile,
+                 const uchar **pstr,
+                 const uchar *limit,
+                 int identifier_pos,
+                 struct normalize_state *nst,
+                 cppchar_t *cp)
+{
+  const uchar *base = *pstr;
+  size_t inbytesleft = limit - base;
+  if (one_utf8_to_cppchar (pstr, &inbytesleft, cp))
+    {
+      /* No diagnostic here as this byte will rather become a
+         new token. */
+      *cp = 0;
+      return false;
+    }
+
+  if (identifier_pos)
+    {
+      switch (ucn_valid_in_identifier (pfile, *cp, nst))
+        {
+
+        case 0:
+          /* In C++, this is an error for invalid character in an identifier
+             because logically, the UTF-8 was converted to a UCN during
+             translation phase 1 (even though we don't physically do it that
+             way). In C, this byte rather becomes grammatically a separate
+             token. */
+
+          if (CPP_OPTION (pfile, cplusplus))
+            cpp_error (pfile, CPP_DL_ERROR,
+                       "extended character %.*s is not valid in an identifier",
+                       (int) (*pstr - base), base);
+          else
+            {
+              *pstr = base;
+              return false;
+            }
+
+          break;
+
+        case 2:
+          if (identifier_pos == 1)
+            {
+              /* This is treated the same way in C++ or C99 -- lexed as an
+                 identifier which is then invalid because an identifier is
+                 not allowed to start with this character. */
+              cpp_error (pfile, CPP_DL_ERROR,
+                         "extended character %.*s is not valid at the start of an identifier",
+                         (int) (*pstr - base), base);
+            }
+          break;
+        }
+    }
+
+  return true;
+}
+
 /* Subroutine of convert_hex and convert_oct. N is the representation
    in the execution character set of a numeric escape; write it into the
    string buffer TBUF and update the end-of-string pointer therein. WIDE
@@ -1956,8 +2034,9 @@ cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
 }
 
 /* Convert an identifier denoted by ID and LEN, which might contain
-   UCN escapes, to the source character set, either UTF-8 or
-   UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */
+   UCN escapes or UTF-8 multibyte chars, to the source character set,
+   either UTF-8 or UTF-EBCDIC. Assumes that the identifier is actually
+   a valid identifier. */
 cpp_hashnode *
 _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
 {