/* CPP Library - charsets Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc. Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "config.h" #include "system.h" #include "coretypes.h" #include "tm.h" #include "cpplib.h" #include "cpphash.h" static int ucn_valid_in_identifier PARAMS ((cpp_reader *, cppchar_t)); /* [lex.charset]: The character designated by the universal character name \UNNNNNNNN is that character whose character short name in ISO/IEC 10646 is NNNNNNNN; the character designated by the universal character name \uNNNN is that character whose character short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value for a universal character name is less than 0x20 or in the range 0x7F-0x9F (inclusive), or if the universal character name designates a character in the basic source character set, then the program is ill-formed. *PSTR must be preceded by "\u" or "\U"; it is assumed that the buffer end is delimited by a non-hex digit. Returns zero if UCNs are not part of the relevant standard, or if the string beginning at *PSTR doesn't syntactically match the form 'NNNN' or 'NNNNNNNN'. Otherwise the non-zero value of the UCN, whether valid or invalid, is returned. Diagnostics are emitted for invalid values. 
PSTR is updated to point one beyond the UCN, or to the syntactically invalid character. IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of an identifier, or 2 otherwise. */ cppchar_t _cpp_valid_ucn (pfile, pstr, identifier_pos) cpp_reader *pfile; const uchar **pstr; int identifier_pos; { cppchar_t result, c; unsigned int length; const uchar *str = *pstr; const uchar *base = str - 2; /* Only attempt to interpret a UCS for C++ and C99. */ if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99)) return 0; /* We don't accept UCNs for an EBCDIC target. */ if (CPP_OPTION (pfile, EBCDIC)) return 0; if (str[-1] == 'u') length = 4; else if (str[-1] == 'U') length = 8; else abort(); result = 0; do { c = *str; if (!ISXDIGIT (c)) break; str++; result = (result << 4) + hex_value (c); } while (--length); *pstr = str; if (length) /* We'll error when we try it out as the start of an identifier. */ cpp_error (pfile, DL_ERROR, "incomplete universal character name %.*s", (int) (str - base), base); /* The standard permits $, @ and ` to be specified as UCNs. We use hex escapes so that this also works with EBCDIC hosts. */ else if ((result < 0xa0 && (result != 0x24 && result != 0x40 && result != 0x60)) || (result & 0x80000000) || (result >= 0xD800 && result <= 0xDFFF)) { cpp_error (pfile, DL_ERROR, "%.*s is not a valid universal character", (int) (str - base), base); } else if (identifier_pos) { int validity = ucn_valid_in_identifier (pfile, result); if (validity == 0) cpp_error (pfile, DL_ERROR, "universal character %.*s is not valid in an identifier", (int) (str - base), base); else if (validity == 2 && identifier_pos == 1) cpp_error (pfile, DL_ERROR, "universal character %.*s is not valid at the start of an identifier", (int) (str - base), base); } if (result == 0) result = 1; return result; } /* Returns 1 if C is valid in an identifier, 2 if C is valid except at the start of an identifier, and 0 if C is not valid in an identifier. 
We assume C has already gone through the checks of _cpp_valid_ucn.  */

/* The tables below are organised in three groups:

     - ranges accepted only when compiling C99, or when not pedantic;
     - ranges accepted only when compiling C++, or when not pedantic;
     - ranges accepted unconditionally (common to both languages).

   PFILE supplies the language and pedantry options; C is the code
   point under test.  Only the C99 group can yield 2, for code points
   in its digit table.  */
static int
ucn_valid_in_identifier (pfile, c)
     cpp_reader *pfile;
     cppchar_t c;
{
  /* None of the valid chars are outside the Basic Multilingual Plane (the
     low 16 bits).  */
  if (c > 0xffff)
    return 0;

  if (CPP_OPTION (pfile, c99) || !CPP_PEDANTIC (pfile))
    {
      /* Digits.  These are tested before the per-script ranges because
         the Thai digits 0x0e50-0x0e59 also lie inside the Thai range
         below; a digit must report 2 (not valid at the start of an
         identifier) rather than 1.  */
      if ((c >= 0x0660 && c <= 0x0669)
          || (c >= 0x06f0 && c <= 0x06f9)
          || (c >= 0x0966 && c <= 0x096f)
          || (c >= 0x09e6 && c <= 0x09ef)
          || (c >= 0x0a66 && c <= 0x0a6f)
          || (c >= 0x0ae6 && c <= 0x0aef)
          || (c >= 0x0b66 && c <= 0x0b6f)
          || (c >= 0x0be7 && c <= 0x0bef)
          || (c >= 0x0c66 && c <= 0x0c6f)
          || (c >= 0x0ce6 && c <= 0x0cef)
          || (c >= 0x0d66 && c <= 0x0d6f)
          || (c >= 0x0e50 && c <= 0x0e59)
          || (c >= 0x0ed0 && c <= 0x0ed9)
          || (c >= 0x0f20 && c <= 0x0f33))
        return 2;

      /* Latin.  */
      if (c == 0x00aa || c == 0x00ba || c == 0x207f || c == 0x1e9b)
        return 1;

      /* Greek.  */
      if (c == 0x0386)
        return 1;

      /* Cyrillic.  0x0401-0x040c is in the common table below; C99
         additionally allows 0x040e.  */
      if (c == 0x040e)
        return 1;

      /* Hebrew.  */
      if ((c >= 0x05b0 && c <= 0x05b9)
          || (c >= 0x05bb && c <= 0x05bd)
          || c == 0x05bf
          || (c >= 0x05c1 && c <= 0x05c2))
        return 1;

      /* Arabic.  */
      if ((c >= 0x06d0 && c <= 0x06dc)
          || c == 0x06e8
          || (c >= 0x06ea && c <= 0x06ed))
        return 1;

      /* Devanagari.  */
      if ((c >= 0x0901 && c <= 0x0903)
          || (c >= 0x093e && c <= 0x094d)
          || (c >= 0x0950 && c <= 0x0952)
          || c == 0x0963)
        return 1;

      /* Bengali.  */
      if ((c >= 0x0981 && c <= 0x0983)
          || (c >= 0x09be && c <= 0x09c4)
          || (c >= 0x09c7 && c <= 0x09c8)
          || (c >= 0x09cb && c <= 0x09cd)
          || (c >= 0x09e2 && c <= 0x09e3))
        return 1;

      /* Gurmukhi.  */
      if (c == 0x0a02
          || (c >= 0x0a3e && c <= 0x0a42)
          || (c >= 0x0a47 && c <= 0x0a48)
          || (c >= 0x0a4b && c <= 0x0a4d)
          || c == 0x0a74)
        return 1;

      /* Gujarati.  */
      if ((c >= 0x0a81 && c <= 0x0a83)
          || (c >= 0x0abd && c <= 0x0ac5)
          || (c >= 0x0ac7 && c <= 0x0ac9)
          || (c >= 0x0acb && c <= 0x0acd)
          || c == 0x0ad0)
        return 1;

      /* Oriya.  */
      if ((c >= 0x0b01 && c <= 0x0b03)
          || (c >= 0x0b3e && c <= 0x0b43)
          || (c >= 0x0b47 && c <= 0x0b48)
          || (c >= 0x0b4b && c <= 0x0b4d))
        return 1;

      /* Tamil.  The last range starts at 0x0bca; 0x0bc9 is not a valid
         identifier character.  */
      if ((c >= 0x0b82 && c <= 0x0b83)
          || (c >= 0x0bbe && c <= 0x0bc2)
          || (c >= 0x0bc6 && c <= 0x0bc8)
          || (c >= 0x0bca && c <= 0x0bcd))
        return 1;

      /* Telugu.  */
      if ((c >= 0x0c01 && c <= 0x0c03)
          || (c >= 0x0c3e && c <= 0x0c44)
          || (c >= 0x0c46 && c <= 0x0c48)
          || (c >= 0x0c4a && c <= 0x0c4d))
        return 1;

      /* Kannada.  */
      if ((c >= 0x0c82 && c <= 0x0c83)
          || (c >= 0x0cbe && c <= 0x0cc4)
          || (c >= 0x0cc6 && c <= 0x0cc8)
          || (c >= 0x0cca && c <= 0x0ccd)
          || c == 0x0cde)
        return 1;

      /* Malayalam.  */
      if ((c >= 0x0d02 && c <= 0x0d03)
          || (c >= 0x0d3e && c <= 0x0d43)
          || (c >= 0x0d46 && c <= 0x0d48)
          || (c >= 0x0d4a && c <= 0x0d4d))
        return 1;

      /* Thai.  The digits inside this range were handled above.  */
      if ((c >= 0x0e01 && c <= 0x0e3a)
          || (c >= 0x0e40 && c <= 0x0e5b))
        return 1;

      /* Lao.  */
      if ((c >= 0x0ead && c <= 0x0eae)
          || (c >= 0x0eb0 && c <= 0x0eb9)
          || (c >= 0x0ebb && c <= 0x0ebd)
          || (c >= 0x0ec0 && c <= 0x0ec4)
          || c == 0x0ec6
          || (c >= 0x0ec8 && c <= 0x0ecd)
          || (c >= 0x0edc && c <= 0x0edd))
        return 1;

      /* Tibetan.  */
      if (c == 0x0f00
          || (c >= 0x0f18 && c <= 0x0f19)
          || c == 0x0f35
          || c == 0x0f37
          || c == 0x0f39
          || (c >= 0x0f3e && c <= 0x0f47)
          || (c >= 0x0f49 && c <= 0x0f69)
          || (c >= 0x0f71 && c <= 0x0f84)
          || (c >= 0x0f86 && c <= 0x0f8b)
          || (c >= 0x0f90 && c <= 0x0f95)
          || c == 0x0f97
          || (c >= 0x0f99 && c <= 0x0fad)
          || (c >= 0x0fb1 && c <= 0x0fb7)
          || c == 0x0fb9)
        return 1;

      /* Katakana.  */
      if ((c >= 0x30a1 && c <= 0x30f6)
          || (c >= 0x30fb && c <= 0x30fc))
        return 1;

      /* CJK Unified Ideographs.  */
      if (c >= 0x4e00 && c <= 0x9fa5)
        return 1;

      /* Hangul.  */
      if (c >= 0xac00 && c <= 0xd7a3)
        return 1;

      /* Special characters.  */
      if (c == 0x00b5
          || c == 0x00b7
          || (c >= 0x02b0 && c <= 0x02b8)
          || c == 0x02bb
          || (c >= 0x02bd && c <= 0x02c1)
          || (c >= 0x02d0 && c <= 0x02d1)
          || (c >= 0x02e0 && c <= 0x02e4)
          || c == 0x037a
          || c == 0x0559
          || c == 0x093d
          || c == 0x0b3d
          || c == 0x1fbe
          || (c >= 0x203f && c <= 0x2040)
          || c == 0x2102
          || c == 0x2107
          || (c >= 0x210a && c <= 0x2113)
          || c == 0x2115
          || (c >= 0x2118 && c <= 0x211d)
          || c == 0x2124
          || c == 0x2126
          || c == 0x2128
          || (c >= 0x212a && c <= 0x2131)
          || (c >= 0x2133 && c <= 0x2138)
          || (c >= 0x2160 && c <= 0x2182)
          || (c >= 0x3005 && c <= 0x3007)
          || (c >= 0x3021 && c <= 0x3029))
        return 1;
    }

  if (CPP_OPTION (pfile, cplusplus) || !CPP_PEDANTIC (pfile))
    {
      /* Greek.  */
      if (c == 0x0384)
        return 1;

      /* Cyrillic.  */
      if (c == 0x040d)
        return 1;

      /* Hebrew.  */
      if (c >= 0x05f3 && c <= 0x05f4)
        return 1;

      /* Lao.  */
      if ((c >= 0x0ead && c <= 0x0eb0)
          || c == 0x0eb2
          || c == 0x0eb3
          || c == 0x0ebd
          || (c >= 0x0ec0 && c <= 0x0ec4)
          || c == 0x0ec6)
        return 1;

      /* Hiragana.  */
      if (c == 0x3094
          || (c >= 0x309d && c <= 0x309e))
        return 1;

      /* Katakana.  */
      if (c >= 0x30a1 && c <= 0x30fe)
        return 1;

      /* Hangul.  */
      if ((c >= 0x1100 && c <= 0x1159)
          || (c >= 0x1161 && c <= 0x11a2)
          || (c >= 0x11a8 && c <= 0x11f9))
        return 1;

      /* CJK Unified Ideographs.  */
      if ((c >= 0xf900 && c <= 0xfa2d)
          || (c >= 0xfb1f && c <= 0xfb36)
          || (c >= 0xfb38 && c <= 0xfb3c)
          || c == 0xfb3e
          || (c >= 0xfb40 && c <= 0xfb41)
          || (c >= 0xfb42 && c <= 0xfb44)
          || (c >= 0xfb46 && c <= 0xfbb1)
          || (c >= 0xfbd3 && c <= 0xfd3f)
          || (c >= 0xfd50 && c <= 0xfd8f)
          || (c >= 0xfd92 && c <= 0xfdc7)
          || (c >= 0xfdf0 && c <= 0xfdfb)
          || (c >= 0xfe70 && c <= 0xfe72)
          || c == 0xfe74
          || (c >= 0xfe76 && c <= 0xfefc)
          || (c >= 0xff21 && c <= 0xff3a)
          || (c >= 0xff41 && c <= 0xff5a)
          || (c >= 0xff66 && c <= 0xffbe)
          || (c >= 0xffc2 && c <= 0xffc7)
          || (c >= 0xffca && c <= 0xffcf)
          || (c >= 0xffd2 && c <= 0xffd7)
          || (c >= 0xffda && c <= 0xffdc)
          || (c >= 0x4e00 && c <= 0x9fa5))
        return 1;
    }

  /* Everything from here on is accepted regardless of language.  */

  /* Latin.  */
  if ((c >= 0x00c0 && c <= 0x00d6)
      || (c >= 0x00d8 && c <= 0x00f6)
      || (c >= 0x00f8 && c <= 0x01f5)
      || (c >= 0x01fa && c <= 0x0217)
      || (c >= 0x0250 && c <= 0x02a8)
      || (c >= 0x1e00 && c <= 0x1e9a)
      || (c >= 0x1ea0 && c <= 0x1ef9))
    return 1;

  /* Greek.  */
  if ((c >= 0x0388 && c <= 0x038a)
      || c == 0x038c
      || (c >= 0x038e && c <= 0x03a1)
      || (c >= 0x03a3 && c <= 0x03ce)
      || (c >= 0x03d0 && c <= 0x03d6)
      || c == 0x03da
      || c == 0x03dc
      || c == 0x03de
      || c == 0x03e0
      || (c >= 0x03e2 && c <= 0x03f3)
      || (c >= 0x1f00 && c <= 0x1f15)
      || (c >= 0x1f18 && c <= 0x1f1d)
      || (c >= 0x1f20 && c <= 0x1f45)
      || (c >= 0x1f48 && c <= 0x1f4d)
      || (c >= 0x1f50 && c <= 0x1f57)
      || c == 0x1f59
      || c == 0x1f5b
      || c == 0x1f5d
      || (c >= 0x1f5f && c <= 0x1f7d)
      || (c >= 0x1f80 && c <= 0x1fb4)
      || (c >= 0x1fb6 && c <= 0x1fbc)
      || (c >= 0x1fc2 && c <= 0x1fc4)
      || (c >= 0x1fc6 && c <= 0x1fcc)
      || (c >= 0x1fd0 && c <= 0x1fd3)
      || (c >= 0x1fd6 && c <= 0x1fdb)
      || (c >= 0x1fe0 && c <= 0x1fec)
      || (c >= 0x1ff2 && c <= 0x1ff4)
      || (c >= 0x1ff6 && c <= 0x1ffc))
    return 1;

  /* Cyrillic.  */
  if ((c >= 0x0401 && c <= 0x040c)
      || (c >= 0x040f && c <= 0x044f)
      || (c >= 0x0451 && c <= 0x045c)
      || (c >= 0x045e && c <= 0x0481)
      || (c >= 0x0490 && c <= 0x04c4)
      || (c >= 0x04c7 && c <= 0x04c8)
      || (c >= 0x04cb && c <= 0x04cc)
      || (c >= 0x04d0 && c <= 0x04eb)
      || (c >= 0x04ee && c <= 0x04f5)
      || (c >= 0x04f8 && c <= 0x04f9))
    return 1;

  /* Armenian.  */
  if ((c >= 0x0531 && c <= 0x0556)
      || (c >= 0x0561 && c <= 0x0587))
    return 1;

  /* Hebrew.  */
  if ((c >= 0x05d0 && c <= 0x05ea)
      || (c >= 0x05f0 && c <= 0x05f2))
    return 1;

  /* Arabic.  */
  if ((c >= 0x0621 && c <= 0x063a)
      || (c >= 0x0640 && c <= 0x0652)
      || (c >= 0x0670 && c <= 0x06b7)
      || (c >= 0x06ba && c <= 0x06be)
      || (c >= 0x06c0 && c <= 0x06ce)
      || (c >= 0x06e5 && c <= 0x06e7))
    return 1;

  /* Devanagari.  */
  if ((c >= 0x0905 && c <= 0x0939)
      || (c >= 0x0958 && c <= 0x0962))
    return 1;

  /* Bengali.  */
  if ((c >= 0x0985 && c <= 0x098c)
      || (c >= 0x098f && c <= 0x0990)
      || (c >= 0x0993 && c <= 0x09a8)
      || (c >= 0x09aa && c <= 0x09b0)
      || c == 0x09b2
      || (c >= 0x09b6 && c <= 0x09b9)
      || (c >= 0x09dc && c <= 0x09dd)
      || (c >= 0x09df && c <= 0x09e1)
      || (c >= 0x09f0 && c <= 0x09f1))
    return 1;

  /* Gurmukhi.  */
  if ((c >= 0x0a05 && c <= 0x0a0a)
      || (c >= 0x0a0f && c <= 0x0a10)
      || (c >= 0x0a13 && c <= 0x0a28)
      || (c >= 0x0a2a && c <= 0x0a30)
      || (c >= 0x0a32 && c <= 0x0a33)
      || (c >= 0x0a35 && c <= 0x0a36)
      || (c >= 0x0a38 && c <= 0x0a39)
      || (c >= 0x0a59 && c <= 0x0a5c)
      || c == 0x0a5e)
    return 1;

  /* Gujarati.  */
  if ((c >= 0x0a85 && c <= 0x0a8b)
      || c == 0x0a8d
      || (c >= 0x0a8f && c <= 0x0a91)
      || (c >= 0x0a93 && c <= 0x0aa8)
      || (c >= 0x0aaa && c <= 0x0ab0)
      || (c >= 0x0ab2 && c <= 0x0ab3)
      || (c >= 0x0ab5 && c <= 0x0ab9)
      || c == 0x0ae0)
    return 1;

  /* Oriya.  */
  if ((c >= 0x0b05 && c <= 0x0b0c)
      || (c >= 0x0b0f && c <= 0x0b10)
      || (c >= 0x0b13 && c <= 0x0b28)
      || (c >= 0x0b2a && c <= 0x0b30)
      || (c >= 0x0b32 && c <= 0x0b33)
      || (c >= 0x0b36 && c <= 0x0b39)
      || (c >= 0x0b5c && c <= 0x0b5d)
      || (c >= 0x0b5f && c <= 0x0b61))
    return 1;

  /* Tamil.  */
  if ((c >= 0x0b85 && c <= 0x0b8a)
      || (c >= 0x0b8e && c <= 0x0b90)
      || (c >= 0x0b92 && c <= 0x0b95)
      || (c >= 0x0b99 && c <= 0x0b9a)
      || c == 0x0b9c
      || (c >= 0x0b9e && c <= 0x0b9f)
      || (c >= 0x0ba3 && c <= 0x0ba4)
      || (c >= 0x0ba8 && c <= 0x0baa)
      || (c >= 0x0bae && c <= 0x0bb5)
      || (c >= 0x0bb7 && c <= 0x0bb9))
    return 1;

  /* Telugu.  */
  if ((c >= 0x0c05 && c <= 0x0c0c)
      || (c >= 0x0c0e && c <= 0x0c10)
      || (c >= 0x0c12 && c <= 0x0c28)
      || (c >= 0x0c2a && c <= 0x0c33)
      || (c >= 0x0c35 && c <= 0x0c39)
      || (c >= 0x0c60 && c <= 0x0c61))
    return 1;

  /* Kannada.  */
  if ((c >= 0x0c85 && c <= 0x0c8c)
      || (c >= 0x0c8e && c <= 0x0c90)
      || (c >= 0x0c92 && c <= 0x0ca8)
      || (c >= 0x0caa && c <= 0x0cb3)
      || (c >= 0x0cb5 && c <= 0x0cb9)
      || (c >= 0x0ce0 && c <= 0x0ce1))
    return 1;

  /* Malayalam.  */
  if ((c >= 0x0d05 && c <= 0x0d0c)
      || (c >= 0x0d0e && c <= 0x0d10)
      || (c >= 0x0d12 && c <= 0x0d28)
      || (c >= 0x0d2a && c <= 0x0d39)
      || (c >= 0x0d60 && c <= 0x0d61))
    return 1;

  /* Thai.  */
  if ((c >= 0x0e01 && c <= 0x0e30)
      || (c >= 0x0e32 && c <= 0x0e33)
      || (c >= 0x0e40 && c <= 0x0e46)
      || (c >= 0x0e4f && c <= 0x0e5b))
    return 1;

  /* Lao.  */
  if ((c >= 0x0e81 && c <= 0x0e82)
      || c == 0x0e84
      || c == 0x0e87
      || c == 0x0e88
      || c == 0x0e8a
      || c == 0x0e8d
      || (c >= 0x0e94 && c <= 0x0e97)
      || (c >= 0x0e99 && c <= 0x0e9f)
      || (c >= 0x0ea1 && c <= 0x0ea3)
      || c == 0x0ea5
      || c == 0x0ea7
      || c == 0x0eaa
      || c == 0x0eab)
    return 1;

  /* Georgian.  */
  if ((c >= 0x10a0 && c <= 0x10c5)
      || (c >= 0x10d0 && c <= 0x10f6))
    return 1;

  /* Hiragana.  */
  if ((c >= 0x3041 && c <= 0x3093)
      || (c >= 0x309b && c <= 0x309c))
    return 1;

  /* Bopomofo.  */
  if (c >= 0x3105 && c <= 0x312c)
    return 1;

  return 0;
}