diff options
author | Geoffrey Keating <geoffk@apple.com> | 2005-03-12 10:44:06 +0000 |
---|---|---|
committer | Geoffrey Keating <geoffk@gcc.gnu.org> | 2005-03-12 10:44:06 +0000 |
commit | 47e204910a9a3e154e38121f55b9cafec0620b63 (patch) | |
tree | 96b619db02d90b96e5dc09601db8bd7a58e95367 /libcpp/lex.c | |
parent | 5269bfe2809931ca62a0bcd8cad1bed7e78e5b32 (diff) | |
download | gcc-47e204910a9a3e154e38121f55b9cafec0620b63.zip gcc-47e204910a9a3e154e38121f55b9cafec0620b63.tar.gz gcc-47e204910a9a3e154e38121f55b9cafec0620b63.tar.bz2 |
Index: libcpp/ChangeLog
2005-03-12 Geoffrey Keating <geoffk@apple.com>
* directives.c (glue_header_name): Update call to cpp_spell_token.
* internal.h (_cpp_interpret_identifier): New.
* charset.c (_cpp_interpret_identifier): New.
(_cpp_valid_ucn): Allow UCN version of '$'.
* lex.c (lex_identifier): Add extra parameter to indicate if initial
character was '$' or '\'. Support identifiers with UCNs.
(forms_identifier_p): Allow UCNs.
(_cpp_lex_direct): Pass extra parameter to lex_identifier.
(utf8_to_ucn): New.
(cpp_spell_token): Add FORSTRING parameter. Use it.
(cpp_token_as_text): Update call to cpp_spell_token.
(cpp_output_token): Write UCNs back out.
(stringify_arg): Update call to cpp_spell_token.
(paste_tokens): Likewise.
(cpp_macro_definition): Likewise.
* macro.c (stringify_arg): Likewise.
(paste_tokens): Likewise.
(cpp_macro_definition): Likewise.
* include/cpplib.h: Add parameter to cpp_spell_token.
Index: gcc/ChangeLog
2005-03-12 Geoffrey Keating <geoffk@apple.com>
* c-lex.c (c_lex_with_flags): Add parameter to call to
cpp_spell_token.
Index: gcc/testsuite/ChangeLog
2005-03-12 Geoffrey Keating <geoffk@apple.com>
* gcc.dg/ucnid-1.c: New.
* gcc.dg/ucnid-2.c: New.
* gcc.dg/ucnid-3.c: New.
* gcc.dg/ucnid-4.c: New.
* gcc.dg/ucnid-5.c: New.
* gcc.dg/ucnid-6.c: New.
* gcc.dg/cpp/ucnid-1.c: New.
* gcc.dg/cpp/ucnid-2.c: New.
* gcc.dg/cpp/ucnid-3.c: New.
* g++.dg/other/ucnid-1.C: New.
From-SVN: r96333
Diffstat (limited to 'libcpp/lex.c')
-rw-r--r-- | libcpp/lex.c | 138 |
1 file changed, 103 insertions, 35 deletions
diff --git a/libcpp/lex.c b/libcpp/lex.c index 62a28f8..8398c7c 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -53,7 +53,7 @@ static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE }; static void add_line_note (cpp_buffer *, const uchar *, unsigned int); static int skip_line_comment (cpp_reader *); static void skip_whitespace (cpp_reader *, cppchar_t); -static cpp_hashnode *lex_identifier (cpp_reader *, const uchar *); +static cpp_hashnode *lex_identifier (cpp_reader *, const uchar *, bool); static void lex_number (cpp_reader *, cpp_string *); static bool forms_identifier_p (cpp_reader *, int); static void lex_string (cpp_reader *, cpp_token *, const uchar *); @@ -453,7 +453,7 @@ forms_identifier_p (cpp_reader *pfile, int first) } /* Is this a syntactically valid UCN? */ - if (0 && *buffer->cur == '\\' + if (*buffer->cur == '\\' && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U')) { buffer->cur += 2; @@ -467,39 +467,39 @@ forms_identifier_p (cpp_reader *pfile, int first) /* Lex an identifier starting at BUFFER->CUR - 1. */ static cpp_hashnode * -lex_identifier (cpp_reader *pfile, const uchar *base) +lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn) { cpp_hashnode *result; - const uchar *cur, *limit; + const uchar *cur; unsigned int len; unsigned int hash = HT_HASHSTEP (0, *base); cur = pfile->buffer->cur; - for (;;) + if (! starts_ucn) + while (ISIDNUM (*cur)) + { + hash = HT_HASHSTEP (hash, *cur); + cur++; + } + pfile->buffer->cur = cur; + if (starts_ucn || forms_identifier_p (pfile, false)) { - /* N.B. ISIDNUM does not include $. */ - while (ISIDNUM (*cur)) - { - hash = HT_HASHSTEP (hash, *cur); - cur++; - } - - pfile->buffer->cur = cur; - if (!forms_identifier_p (pfile, false)) - break; - - limit = pfile->buffer->cur; - while (cur < limit) - { - hash = HT_HASHSTEP (hash, *cur); - cur++; - } + /* Slower version for identifiers containing UCNs (or $). 
*/ + do { + while (ISIDNUM (*pfile->buffer->cur)) + pfile->buffer->cur++; + } while (forms_identifier_p (pfile, false)); + result = _cpp_interpret_identifier (pfile, base, + pfile->buffer->cur - base); } - len = cur - base; - hash = HT_HASHFINISH (hash, len); + else + { + len = cur - base; + hash = HT_HASHFINISH (hash, len); - result = (cpp_hashnode *) - ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC); + result = (cpp_hashnode *) + ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC); + } /* Rarely, identifiers require diagnostics when lexed. */ if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC) @@ -922,7 +922,7 @@ _cpp_lex_direct (cpp_reader *pfile) case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': result->type = CPP_NAME; - result->val.node = lex_identifier (pfile, buffer->cur - 1); + result->val.node = lex_identifier (pfile, buffer->cur - 1, false); /* Convert named operators to their proper types. */ if (result->val.node->flags & NODE_OPERATOR) @@ -1155,7 +1155,7 @@ _cpp_lex_direct (cpp_reader *pfile) if (forms_identifier_p (pfile, true)) { result->type = CPP_NAME; - result->val.node = lex_identifier (pfile, base); + result->val.node = lex_identifier (pfile, base, true); break; } buffer->cur++; @@ -1180,19 +1180,56 @@ cpp_token_len (const cpp_token *token) { default: len = 4; break; case SPELL_LITERAL: len = token->val.str.len; break; - case SPELL_IDENT: len = NODE_LEN (token->val.node); break; + case SPELL_IDENT: len = NODE_LEN (token->val.node) * 10; break; } return len; } +/* Parse UTF-8 out of NAMEP and place a \U escape in BUFFER. + Return the number of bytes read out of NAME. (There are always + 10 bytes written to BUFFER.) */ + +static size_t +utf8_to_ucn (unsigned char *buffer, const unsigned char *name) +{ + int j; + int ucn_len = 0; + int ucn_len_c; + unsigned t; + unsigned long utf32; + + /* Compute the length of the UTF-8 sequence. 
*/ + for (t = *name; t & 0x80; t <<= 1) + ucn_len++; + + utf32 = *name & (0x7F >> ucn_len); + for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++) + { + utf32 = (utf32 << 6) | (*++name & 0x3F); + + /* Ill-formed UTF-8. */ + if ((*name & ~0x3F) != 0x80) + abort (); + } + + *buffer++ = '\\'; + *buffer++ = 'U'; + for (j = 7; j >= 0; j--) + *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF]; + return ucn_len; +} + + /* Write the spelling of a token TOKEN to BUFFER. The buffer must already contain the enough space to hold the token's spelling. Returns a pointer to the character after the last character written. + FORSTRING is true if this is to be the spelling after translation + phase 1 (this is different for UCNs). FIXME: Would be nice if we didn't need the PFILE argument. */ unsigned char * cpp_spell_token (cpp_reader *pfile, const cpp_token *token, - unsigned char *buffer) + unsigned char *buffer, bool forstring) { switch (TOKEN_SPELL (token)) { @@ -1216,8 +1253,26 @@ cpp_spell_token (cpp_reader *pfile, const cpp_token *token, spell_ident: case SPELL_IDENT: - memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node)); - buffer += NODE_LEN (token->val.node); + if (forstring) + { + memcpy (buffer, NODE_NAME (token->val.node), + NODE_LEN (token->val.node)); + buffer += NODE_LEN (token->val.node); + } + else + { + size_t i; + const unsigned char * name = NODE_NAME (token->val.node); + + for (i = 0; i < NODE_LEN (token->val.node); i++) + if (name[i] & ~0x7F) + { + i += utf8_to_ucn (buffer, name + i) - 1; + buffer += 10; + } + else + *buffer++ = NODE_NAME (token->val.node)[i]; + } break; case SPELL_LITERAL: @@ -1242,7 +1297,7 @@ cpp_token_as_text (cpp_reader *pfile, const cpp_token *token) unsigned int len = cpp_token_len (token) + 1; unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end; - end = cpp_spell_token (pfile, token, start); + end = cpp_spell_token (pfile, token, start, false); end[0] = '\0'; return start; @@ -1286,8 +1341,21 @@ 
cpp_output_token (const cpp_token *token, FILE *fp) spell_ident: case SPELL_IDENT: - fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp); - break; + { + size_t i; + const unsigned char * name = NODE_NAME (token->val.node); + + for (i = 0; i < NODE_LEN (token->val.node); i++) + if (name[i] & ~0x7F) + { + unsigned char buffer[10]; + i += utf8_to_ucn (buffer, name + i) - 1; + fwrite (buffer, 1, 10, fp); + } + else + fputc (NODE_NAME (token->val.node)[i], fp); + } + break; case SPELL_LITERAL: fwrite (token->val.str.text, 1, token->val.str.len, fp); |