aboutsummaryrefslogtreecommitdiff
path: root/libcpp/charset.c
diff options
context:
space:
mode:
author: Joseph Myers <joseph@codesourcery.com> 2013-11-16 00:05:08 +0000
committer: Joseph Myers <jsm28@gcc.gnu.org> 2013-11-16 00:05:08 +0000
commit: d3f4ff8b51b8875a7862fae1fb574735db4bfd36 (patch)
tree: ddce12237d1c0fb641f1d74daf972657ec50caca /libcpp/charset.c
parent: 3d053a5f72d60cc868defb5108ac0b28bdd9ef4c (diff)
download: gcc-d3f4ff8b51b8875a7862fae1fb574735db4bfd36.zip
          gcc-d3f4ff8b51b8875a7862fae1fb574735db4bfd36.tar.gz
          gcc-d3f4ff8b51b8875a7862fae1fb574735db4bfd36.tar.bz2
ucnid-2011-1.c: New test.
gcc/testsuite:
  * c-c++-common/cpp/ucnid-2011-1.c: New test.

libcpp:
  * ucnid.tab: Add C11 and C11NOSTART data.
  * makeucnid.c (digit): Rename enum value to N99.
    (C11, N11, all_languages): New enum values.
    (NUM_CODE_POINTS, MAX_CODE_POINT): New macros.
    (flags, decomp, combining_value): Use NUM_CODE_POINTS as array size.
    (decomp): Use unsigned int as element type.
    (all_decomp): New array.
    (read_ucnid): Handle C11 and C11NOSTART. Use MAX_CODE_POINT.
    (read_table): Use MAX_CODE_POINT. Store all decompositions in all_decomp.
    (read_derived): Use MAX_CODE_POINT.
    (write_table): Use NUM_CODE_POINTS. Print N99, C11 and N11 flags. Print whole array variable declaration rather than just array contents.
    (char_id_valid, write_context_switch): New functions.
    (main): Call write_context_switch.
  * ucnid.h: Regenerate.
  * include/cpplib.h (struct cpp_options): Add c11_identifiers.
  * init.c (struct lang_flags): Add c11_identifiers.
    (cpp_set_lang): Set c11_identifiers option from selected language.
  * internal.h (struct normalize_state): Document "previous" as previous starter character.
    (NORMALIZE_STATE_UPDATE_IDNUM): Take character as argument.
  * charset.c (DIG): Rename enum value to N99.
    (C11, N11): New enum values.
    (struct ucnrange): Give name to struct. Use short for flags and unsigned int for end of range. Include ucnid.h for whole variable declaration.
    (ucn_valid_in_identifier): Allow for characters up to 0x10FFFF. Allow for C11 in determining valid characters and valid start characters. Use check_nfc for non-Hangul context-dependent checks. Only store starter characters in nst->previous.
    (_cpp_valid_ucn): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM.
  * lex.c (lex_identifier): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM. Call NORMALIZE_STATE_UPDATE_IDNUM after initial non-UCN part of identifier.
    (lex_number): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM.

From-SVN: r204886
Diffstat (limited to 'libcpp/charset.c')
-rw-r--r-- libcpp/charset.c 88
1 files changed, 46 insertions, 42 deletions
diff --git a/libcpp/charset.c b/libcpp/charset.c
index ae56c5a..c48e64a 100644
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -828,29 +828,32 @@ enum {
/* Valid in a C99 identifier? */
C99 = 1,
/* Valid in a C99 identifier, but not as the first character? */
- DIG = 2,
+ N99 = 2,
/* Valid in a C++ identifier? */
CXX = 4,
+ /* Valid in a C11/C++11 identifier? */
+ C11 = 8,
+ /* Valid in a C11/C++11 identifier, but not as the first character? */
+ N11 = 16,
/* NFC representation is not valid in an identifier? */
- CID = 8,
+ CID = 32,
/* Might be valid NFC form? */
- NFC = 16,
+ NFC = 64,
/* Might be valid NFKC form? */
- NKC = 32,
+ NKC = 128,
/* Certain preceding characters might make it not valid NFC/NKFC form? */
- CTX = 64
+ CTX = 256
};
-static const struct {
+struct ucnrange {
/* Bitmap of flags above. */
- unsigned char flags;
+ unsigned short flags;
/* Combining class of the character. */
unsigned char combine;
/* Last character in the range described by this entry. */
- unsigned short end;
-} ucnranges[] = {
-#include "ucnid.h"
+ unsigned int end;
};
+#include "ucnid.h"
/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
the start of an identifier, and 0 if C is not valid in an
@@ -864,8 +867,9 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
struct normalize_state *nst)
{
int mn, mx, md;
+ unsigned short valid_flags, invalid_start_flags;
- if (c > 0xFFFF)
+ if (c > 0x10FFFF)
return 0;
mn = 0;
@@ -881,15 +885,25 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
/* When -pedantic, we require the character to have been listed by
the standard for the current language. Otherwise, we accept the
- union of the acceptable sets for C++98 and C99. */
- if (! (ucnranges[mn].flags & (C99 | CXX)))
+ union of the acceptable sets for all supported language versions. */
+ valid_flags = C99 | CXX | C11;
+ if (CPP_PEDANTIC (pfile))
+ {
+ if (CPP_OPTION (pfile, c11_identifiers))
+ valid_flags = C11;
+ else if (CPP_OPTION (pfile, c99))
+ valid_flags = C99;
+ else if (CPP_OPTION (pfile, cplusplus))
+ valid_flags = CXX;
+ }
+ if (! (ucnranges[mn].flags & valid_flags))
return 0;
-
- if (CPP_PEDANTIC (pfile)
- && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
- || (CPP_OPTION (pfile, cplusplus)
- && !(ucnranges[mn].flags & CXX))))
- return 0;
+ if (CPP_OPTION (pfile, c11_identifiers))
+ invalid_start_flags = N11;
+ else if (CPP_OPTION (pfile, c99))
+ invalid_start_flags = N99;
+ else
+ invalid_start_flags = 0;
/* Update NST. */
if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
@@ -899,17 +913,6 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
bool safe;
cppchar_t p = nst->previous;
- /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam. */
- if (c == 0x09BE)
- safe = p != 0x09C7; /* Use 09CB instead of 09C7 09BE. */
- else if (c == 0x0B3E)
- safe = p != 0x0B47; /* Use 0B4B instead of 0B47 0B3E. */
- else if (c == 0x0BBE)
- safe = p != 0x0BC6 && p != 0x0BC7; /* Use 0BCA/0BCB instead. */
- else if (c == 0x0CC2)
- safe = p != 0x0CC6; /* Use 0CCA instead of 0CC6 0CC2. */
- else if (c == 0x0D3E)
- safe = p != 0x0D46 && p != 0x0D47; /* Use 0D4A/0D4B instead. */
/* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
and are combined algorithmically from a sequence of the form
1100-1112 1161-1175 11A8-11C2
@@ -917,20 +920,19 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
really a valid character).
Unfortunately, C99 allows (only) the NFC form, but C++ allows
only the combining characters. */
- else if (c >= 0x1161 && c <= 0x1175)
+ if (c >= 0x1161 && c <= 0x1175)
safe = p < 0x1100 || p > 0x1112;
else if (c >= 0x11A8 && c <= 0x11C2)
safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
else
+ safe = check_nfc (pfile, c, p);
+ if (!safe)
{
- /* Uh-oh, someone updated ucnid.h without updating this code. */
- cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
- safe = true;
+ if ((c >= 0x1161 && c <= 0x1175) || (c >= 0x11A8 && c <= 0x11C2))
+ nst->level = MAX (nst->level, normalized_identifier_C);
+ else
+ nst->level = normalized_none;
}
- if (!safe && c < 0x1161)
- nst->level = normalized_none;
- else if (!safe)
- nst->level = MAX (nst->level, normalized_identifier_C);
}
else if (ucnranges[mn].flags & NKC)
;
@@ -940,11 +942,13 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
nst->level = MAX (nst->level, normalized_identifier_C);
else
nst->level = normalized_none;
- nst->previous = c;
+ if (ucnranges[mn].combine == 0)
+ nst->previous = c;
nst->prev_class = ucnranges[mn].combine;
- /* In C99, UCN digits may not begin identifiers. */
- if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
+ /* In C99, UCN digits may not begin identifiers. In C11 and C++11,
+ UCN combining characters may not begin identifiers. */
+ if (ucnranges[mn].flags & invalid_start_flags)
return 2;
return 1;
@@ -1054,7 +1058,7 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
CPP_OPTION (pfile, warn_dollars) = 0;
cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
}
- NORMALIZE_STATE_UPDATE_IDNUM (nst);
+ NORMALIZE_STATE_UPDATE_IDNUM (nst, result);
}
else if (identifier_pos)
{