diff options
author | Jason Merrill <jason@casey.cygnus.com> | 2000-03-08 09:07:36 +0000 |
---|---|---|
committer | Jason Merrill <jason@gcc.gnu.org> | 2000-03-08 04:07:36 -0500 |
commit | 1c08915645ab308eba6fb8a0ebd577539e3910b3 (patch) | |
tree | f3825b784f6f4350ee57f2d0d99203a984ff7831 | |
parent | f48d005079fcc19e695fb4d9274b68287eeaa123 (diff) | |
download | gcc-1c08915645ab308eba6fb8a0ebd577539e3910b3.zip gcc-1c08915645ab308eba6fb8a0ebd577539e3910b3.tar.gz gcc-1c08915645ab308eba6fb8a0ebd577539e3910b3.tar.bz2 |
Add initial support for '\uNNNN' specifier.
* lex.c (read_ucs): New fn.
(readescape, skip_white_space): Call it.
(is_extended_char, is_extended_char_1): New fns.
(utf8_extend_token): New fn, #if 0'd out.
(real_yylex): Treat extended chars like letters.
From-SVN: r32414
-rw-r--r-- | gcc/cp/ChangeLog | 7 | ||||
-rw-r--r-- | gcc/cp/lex.c | 417 |
2 files changed, 419 insertions, 5 deletions
diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog index 5d7d48e..4b7c03c 100644 --- a/gcc/cp/ChangeLog +++ b/gcc/cp/ChangeLog @@ -1,5 +1,12 @@ 2000-03-07 Jason Merrill <jason@casey.cygnus.com> + Add initial support for '\uNNNN' specifier. + * lex.c (read_ucs): New fn. + (readescape, skip_white_space): Call it. + (is_extended_char, is_extended_char_1): New fns. + (utf8_extend_token): New fn, #if 0'd out. + (real_yylex): Treat extended chars like letters. + * search.c (note_debug_info_needed): Walk the bases even if we weren't deferring the type itself. diff --git a/gcc/cp/lex.c b/gcc/cp/lex.c index 5c42a41..45037aa 100644 --- a/gcc/cp/lex.c +++ b/gcc/cp/lex.c @@ -90,6 +90,9 @@ static int read_line_number PARAMS ((int *)); static int token_getch PARAMS ((void)); static void token_put_back PARAMS ((int)); static void mark_impl_file_chain PARAMS ((void *)); +static int read_ucs PARAMS ((int)); +static int is_extended_char PARAMS ((int)); +static int is_extended_char_1 PARAMS ((int)); /* Given a file name X, return the nondirectory portion. Keep in mind that X can be computed more than once. */ @@ -2236,10 +2239,16 @@ skip_white_space (c) case '\\': c = getch (); if (c == '\n') - lineno++; + { + lineno++; + c = getch (); + } + else if (c == 'u') + c = read_ucs (4); + else if (c == 'U') + c = read_ucs (8); else error ("stray '\\' in program"); - c = getch (); break; default: @@ -2799,6 +2808,376 @@ do_pending_lang_change () pop_lang_context (); } +/* Parse a '\uNNNN' or '\UNNNNNNNN' sequence. + + [lex.charset]: The character designated by the universal-character-name + \UNNNNNNNN is that character whose character short name in ISO/IEC 10646 + is NNNNNNNN; the character designated by the universal-character-name + \uNNNN is that character whose character short name in ISO/IEC 10646 is + 0000NNNN. If the hexadecimal value for a universal character name is + less than 0x20 or in the range 0x7F-0x9F (inclusive), or if the + universal character name designates a character in the basic source + character set, then the program is ill-formed. + + We assume that wchar_t is Unicode, so we don't need to do any + mapping. Is this ever wrong? */ + +static int +read_ucs (length) + int length; +{ + unsigned int code = 0; + int c; + + for (; length; --length) + { + c = getch (); + if (! ISXDIGIT (c)) + { + error ("non hex digit '%c' in universal-character-name", c); + put_back (c); + break; + } + code <<= 4; + if (c >= 'a' && c <= 'f') + code += c - 'a' + 10; + if (c >= 'A' && c <= 'F') + code += c - 'A' + 10; + if (c >= '0' && c <= '9') + code += c - '0'; + } + +#ifdef TARGET_EBCDIC + sorry ("universal-character-name on EBCDIC target"); + return 0x3F; +#endif + + if (code > 0x9f && !(code & 0x80000000)) + /* True extended character, OK. */; + else if (code >= 0x20 && code < 0x7f) + { + /* ASCII printable character. The C character set consists of all of + these except $, @ and `. We use hex escapes so that this also + works with EBCDIC hosts. */ + if (code != 0x24 && code != 0x40 && code != 0x60) + error ("universal-character-name designates `%c', part of the basic source character set", code); + } + else + error ("invalid universal-character-name"); + return code; +} + +/* Returns nonzero if C is a universal-character-name. Give an error if it + is not one which may appear in an identifier, as per [extendid]. */ + +static inline int +is_extended_char (c) + int c; +{ +#ifdef TARGET_EBCDIC + return 0; +#else + /* ASCII. */ + if (c < 0x7f) + return 0; + + return is_extended_char_1 (c); +#endif +} + +static int +is_extended_char_1 (c) + int c; +{ + /* None of the valid chars are outside the Basic Multilingual Plane (the + low 16 bits). */ + if (c > 0xffff) + { + error ("universal-character-name `\\U%08x' not valid in identifier", c); + return 1; + } + + /* Latin */ + if ((c >= 0x00c0 && c <= 0x00d6) + || (c >= 0x00d8 && c <= 0x00f6) + || (c >= 0x00f8 && c <= 0x01f5) + || (c >= 0x01fa && c <= 0x0217) + || (c >= 0x0250 && c <= 0x02a8) + || (c >= 0x1e00 && c <= 0x1e9a) + || (c >= 0x1ea0 && c <= 0x1ef9)) + return 1; + + /* Greek */ + if ((c == 0x0384) + || (c >= 0x0388 && c <= 0x038a) + || (c == 0x038c) + || (c >= 0x038e && c <= 0x03a1) + || (c >= 0x03a3 && c <= 0x03ce) + || (c >= 0x03d0 && c <= 0x03d6) + || (c == 0x03da) + || (c == 0x03dc) + || (c == 0x03de) + || (c == 0x03e0) + || (c >= 0x03e2 && c <= 0x03f3) + || (c >= 0x1f00 && c <= 0x1f15) + || (c >= 0x1f18 && c <= 0x1f1d) + || (c >= 0x1f20 && c <= 0x1f45) + || (c >= 0x1f48 && c <= 0x1f4d) + || (c >= 0x1f50 && c <= 0x1f57) + || (c == 0x1f59) + || (c == 0x1f5b) + || (c == 0x1f5d) + || (c >= 0x1f5f && c <= 0x1f7d) + || (c >= 0x1f80 && c <= 0x1fb4) + || (c >= 0x1fb6 && c <= 0x1fbc) + || (c >= 0x1fc2 && c <= 0x1fc4) + || (c >= 0x1fc6 && c <= 0x1fcc) + || (c >= 0x1fd0 && c <= 0x1fd3) + || (c >= 0x1fd6 && c <= 0x1fdb) + || (c >= 0x1fe0 && c <= 0x1fec) + || (c >= 0x1ff2 && c <= 0x1ff4) + || (c >= 0x1ff6 && c <= 0x1ffc)) + return 1; + + /* Cyrillic */ + if ((c >= 0x0401 && c <= 0x040d) + || (c >= 0x040f && c <= 0x044f) + || (c >= 0x0451 && c <= 0x045c) + || (c >= 0x045e && c <= 0x0481) + || (c >= 0x0490 && c <= 0x04c4) + || (c >= 0x04c7 && c <= 0x04c8) + || (c >= 0x04cb && c <= 0x04cc) + || (c >= 0x04d0 && c <= 0x04eb) + || (c >= 0x04ee && c <= 0x04f5) + || (c >= 0x04f8 && c <= 0x04f9)) + return 1; + + /* Armenian */ + if ((c >= 0x0531 && c <= 0x0556) + || (c >= 0x0561 && c <= 0x0587)) + return 1; + + /* Hebrew */ + if ((c >= 0x05d0 && c <= 0x05ea) + || (c >= 0x05f0 && c <= 0x05f4)) + return 1; + + /* Arabic */ + if ((c >= 0x0621 && c <= 0x063a) + || (c >= 0x0640 && c <= 0x0652) + || (c >= 0x0670 && c <= 0x06b7) + || (c >= 0x06ba && c <= 0x06be) + || (c >= 0x06c0 && c <= 0x06ce) + || (c >= 0x06e5 && c <= 0x06e7)) + return 1; + + /* Devanagari */ + if ((c >= 0x0905 && c <= 0x0939) + || (c >= 0x0958 && c <= 0x0962)) + return 1; + + /* Bengali */ + if ((c >= 0x0985 && c <= 0x098c) + || (c >= 0x098f && c <= 0x0990) + || (c >= 0x0993 && c <= 0x09a8) + || (c >= 0x09aa && c <= 0x09b0) + || (c == 0x09b2) + || (c >= 0x09b6 && c <= 0x09b9) + || (c >= 0x09dc && c <= 0x09dd) + || (c >= 0x09df && c <= 0x09e1) + || (c >= 0x09f0 && c <= 0x09f1)) + return 1; + + /* Gurmukhi */ + if ((c >= 0x0a05 && c <= 0x0a0a) + || (c >= 0x0a0f && c <= 0x0a10) + || (c >= 0x0a13 && c <= 0x0a28) + || (c >= 0x0a2a && c <= 0x0a30) + || (c >= 0x0a32 && c <= 0x0a33) + || (c >= 0x0a35 && c <= 0x0a36) + || (c >= 0x0a38 && c <= 0x0a39) + || (c >= 0x0a59 && c <= 0x0a5c) + || (c == 0x0a5e)) + return 1; + + /* Gujarati */ + if ((c >= 0x0a85 && c <= 0x0a8b) + || (c == 0x0a8d) + || (c >= 0x0a8f && c <= 0x0a91) + || (c >= 0x0a93 && c <= 0x0aa8) + || (c >= 0x0aaa && c <= 0x0ab0) + || (c >= 0x0ab2 && c <= 0x0ab3) + || (c >= 0x0ab5 && c <= 0x0ab9) + || (c == 0x0ae0)) + return 1; + + /* Oriya */ + if ((c >= 0x0b05 && c <= 0x0b0c) + || (c >= 0x0b0f && c <= 0x0b10) + || (c >= 0x0b13 && c <= 0x0b28) + || (c >= 0x0b2a && c <= 0x0b30) + || (c >= 0x0b32 && c <= 0x0b33) + || (c >= 0x0b36 && c <= 0x0b39) + || (c >= 0x0b5c && c <= 0x0b5d) + || (c >= 0x0b5f && c <= 0x0b61)) + return 1; + + /* Tamil */ + if ((c >= 0x0b85 && c <= 0x0b8a) + || (c >= 0x0b8e && c <= 0x0b90) + || (c >= 0x0b92 && c <= 0x0b95) + || (c >= 0x0b99 && c <= 0x0b9a) + || (c == 0x0b9c) + || (c >= 0x0b9e && c <= 0x0b9f) + || (c >= 0x0ba3 && c <= 0x0ba4) + || (c >= 0x0ba8 && c <= 0x0baa) + || (c >= 0x0bae && c <= 0x0bb5) + || (c >= 0x0bb7 && c <= 0x0bb9)) + return 1; + + /* Telugu */ + if ((c >= 0x0c05 && c <= 0x0c0c) + || (c >= 0x0c0e && c <= 0x0c10) + || (c >= 0x0c12 && c <= 0x0c28) + || (c >= 0x0c2a && c <= 0x0c33) + || (c >= 0x0c35 && c <= 0x0c39) + || (c >= 0x0c60 && c <= 0x0c61)) + return 1; + + /* Kannada */ + if ((c >= 0x0c85 && c <= 0x0c8c) + || (c >= 0x0c8e && c <= 0x0c90) + || (c >= 0x0c92 && c <= 0x0ca8) + || (c >= 0x0caa && c <= 0x0cb3) + || (c >= 0x0cb5 && c <= 0x0cb9) + || (c >= 0x0ce0 && c <= 0x0ce1)) + return 1; + + /* Malayalam */ + if ((c >= 0x0d05 && c <= 0x0d0c) + || (c >= 0x0d0e && c <= 0x0d10) + || (c >= 0x0d12 && c <= 0x0d28) + || (c >= 0x0d2a && c <= 0x0d39) + || (c >= 0x0d60 && c <= 0x0d61)) + return 1; + + /* Thai */ + if ((c >= 0x0e01 && c <= 0x0e30) + || (c >= 0x0e32 && c <= 0x0e33) + || (c >= 0x0e40 && c <= 0x0e46) + || (c >= 0x0e4f && c <= 0x0e5b)) + return 1; + + /* Lao */ + if ((c >= 0x0e81 && c <= 0x0e82) + || (c == 0x0e84) + || (c == 0x0e87) + || (c == 0x0e88) + || (c == 0x0e8a) + || (c == 0x0e0d) + || (c >= 0x0e94 && c <= 0x0e97) + || (c >= 0x0e99 && c <= 0x0e9f) + || (c >= 0x0ea1 && c <= 0x0ea3) + || (c == 0x0ea5) + || (c == 0x0ea7) + || (c == 0x0eaa) + || (c == 0x0eab) + || (c >= 0x0ead && c <= 0x0eb0) + || (c == 0x0eb2) + || (c == 0x0eb3) + || (c == 0x0ebd) + || (c >= 0x0ec0 && c <= 0x0ec4) + || (c == 0x0ec6)) + return 1; + + /* Georgian */ + if ((c >= 0x10a0 && c <= 0x10c5) + || (c >= 0x10d0 && c <= 0x10f6)) + return 1; + + /* Hiragana */ + if ((c >= 0x3041 && c <= 0x3094) + || (c >= 0x309b && c <= 0x309e)) + return 1; + + /* Katakana */ + if ((c >= 0x30a1 && c <= 0x30fe)) + return 1; + + /* Bopmofo */ + if ((c >= 0x3105 && c <= 0x312c)) + return 1; + + /* Hangul */ + if ((c >= 0x1100 && c <= 0x1159) + || (c >= 0x1161 && c <= 0x11a2) + || (c >= 0x11a8 && c <= 0x11f9)) + return 1; + + /* CJK Unified Ideographs */ + if ((c >= 0xf900 && c <= 0xfa2d) + || (c >= 0xfb1f && c <= 0xfb36) + || (c >= 0xfb38 && c <= 0xfb3c) + || (c == 0xfb3e) + || (c >= 0xfb40 && c <= 0xfb41) + || (c >= 0xfb42 && c <= 0xfb44) + || (c >= 0xfb46 && c <= 0xfbb1) + || (c >= 0xfbd3 && c <= 0xfd3f) + || (c >= 0xfd50 && c <= 0xfd8f) + || (c >= 0xfd92 && c <= 0xfdc7) + || (c >= 0xfdf0 && c <= 0xfdfb) + || (c >= 0xfe70 && c <= 0xfe72) + || (c == 0xfe74) + || (c >= 0xfe76 && c <= 0xfefc) + || (c >= 0xff21 && c <= 0xff3a) + || (c >= 0xff41 && c <= 0xff5a) + || (c >= 0xff66 && c <= 0xffbe) + || (c >= 0xffc2 && c <= 0xffc7) + || (c >= 0xffca && c <= 0xffcf) + || (c >= 0xffd2 && c <= 0xffd7) + || (c >= 0xffda && c <= 0xffdc) + || (c >= 0x4e00 && c <= 0x9fa5)) + return 1; + + error ("universal-character-name `\\u%04x' not valid in identifier", c); + return 1; +} + +#if 0 +/* Add the UTF-8 representation of C to the token_buffer. */ + +static void +utf8_extend_token (c) + int c; +{ + int shift, mask; + + if (c <= 0x0000007f) + { + extend_token (c); + return; + } + else if (c <= 0x000007ff) + shift = 6, mask = 0xc0; + else if (c <= 0x0000ffff) + shift = 12, mask = 0xe0; + else if (c <= 0x001fffff) + shift = 18, mask = 0xf0; + else if (c <= 0x03ffffff) + shift = 24, mask = 0xf8; + else + shift = 30, mask = 0xfc; + + extend_token (mask | (c >> shift)); + do + { + shift -= 6; + extend_token ((unsigned char) (0x80 | (c >> shift))); + } + while (shift); +} +#endif + #define ENDFILE -1 /* token that represents end-of-file */ /* Read an escape sequence, returning its equivalent as a character, @@ -2869,6 +3248,11 @@ readescape (ignore_ptr) put_back (c); return code; + case 'U': + return read_ucs (8); + case 'u': + return read_ucs (4); + case '\\': case '\'': case '"': return c; @@ -3542,8 +3926,8 @@ real_yylex () case 'z': case '_': case '$': -#if USE_CPPLIB letter: +#if USE_CPPLIB if (cpp_token == CPP_NAME) { /* Note that one character has already been read from @@ -3561,22 +3945,43 @@ real_yylex () #endif { p = token_buffer; - while (ISALNUM (c) || (c == '_') || c == '$') + while (1) { /* Make sure this char really belongs in an identifier. */ - if (c == '$') + if (ISALNUM (c) || c == '_') + /* OK */; + else if (c == '$') { if (! dollars_in_ident) error ("`$' in identifier"); else if (pedantic) pedwarn ("`$' in identifier"); } + /* FIXME we should use some sort of multibyte character + encoding. Locale-dependent? Always UTF-8? */ + else if (is_extended_char (c)) + { + sorry ("universal characters in identifiers"); + c = '_'; + } + else + break; if (p >= token_buffer + maxtoken) p = extend_token_buffer (p); *p++ = c; + + idtryagain: c = token_getch (); + + if (c == '\\') + { + int ignore = 0; + c = readescape (&ignore); + if (ignore) + goto idtryagain; + } } *p = 0; @@ -4634,6 +5039,8 @@ real_yylex () break; default: + if (is_extended_char (c)) + goto letter; value = c; } |