c-lex.c (lex_charconst): Convert into a simple wrapper around cpp_interpret_charconst...

* c-lex.c (lex_charconst): Convert into a simple wrapper around cpp_interpret_charconst, to which most of the code body is moved. * cppexp.c (MAX_CHAR_TYPE_SIZE, MAX_WCHAR_TYPE_SIZE, MAX_LONG_TYPE_SIZE, MAX_INT_TYPE_SIZE, MAX_CHAR_TYPE_MASK, MAX_WCHAR_TYPE_MASK, parse_escape, parse_charconst): Remove. (lex): Use cpp_interpret_charconst. * cpplex.c (parse_escape, read_ucs, cpp_interpret_charconst, hex_digit_value): New functions. (MAX_CHAR_TYPE_SIZE, MAX_WCHAR_TYPE_SIZE): New macros. * cpplib.h (cpp_interpret_charconst): New prototype. * Makefile.in: Update. From-SVN: r41978
author: Neil Booth <neil@daikokuya.demon.co.uk> 2001-05-11 23:48:21 +0000
committer: Neil Booth <neil@gcc.gnu.org> 2001-05-11 23:48:21 +0000
commit: c8a96070172673d05574e3c9fe21e28750610223 (patch)
tree: eefc97fe7077fa7f6cfc9bb868638feb3419f0dd /gcc/cpplex.c
parent: f42974dc81a20a5ea306fab56faef8c322bf26d2 (diff)
download: gcc-c8a96070172673d05574e3c9fe21e28750610223.zip
gcc-c8a96070172673d05574e3c9fe21e28750610223.tar.gz
gcc-c8a96070172673d05574e3c9fe21e28750610223.tar.bz2
1 files changed, 349 insertions, 0 deletions
diff --git a/gcc/cpplex.c b/gcc/cpplex.c
index 6bf0cdf..9bbab0f 100644
--- a/gcc/cpplex.c
+++ b/gcc/cpplex.c
@@ -39,6 +39,18 @@ Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
 #include "cpplib.h"
 #include "cpphash.h"
 
+/* MULTIBYTE_CHARS support only works for native compilers.
+   ??? Ideally what we want is to model widechar support after
+   the current floating point support.  */
+#ifdef CROSS_COMPILE
+#undef MULTIBYTE_CHARS
+#endif
+
+#ifdef MULTIBYTE_CHARS
+#include "mbchar.h"
+#include <locale.h>
+#endif
+
 /* Tokens with SPELL_STRING store their spelling in the token list,
    and it's length in the token->val.name.len.  */
 enum spell_type
@@ -86,9 +98,15 @@ static void save_comment PARAMS ((cpp_reader *, cpp_token *, const U_CHAR *));
 static void lex_percent PARAMS ((cpp_buffer *, cpp_token *));
 static void lex_dot PARAMS ((cpp_reader *, cpp_token *));
 static int name_p PARAMS ((cpp_reader *, const cpp_string *));
+static unsigned int parse_escape PARAMS ((cpp_reader *, const unsigned char **,
+					  const unsigned char *, HOST_WIDE_INT,
+					  int));
+static unsigned int read_ucs PARAMS ((cpp_reader *, const unsigned char **,
+				      const unsigned char *, unsigned int));
 
 static cpp_chunk *new_chunk PARAMS ((unsigned int));
 static int chunk_suitable PARAMS ((cpp_pool *, cpp_chunk *, unsigned int));
+static unsigned int hex_digit_value PARAMS ((unsigned int));
 
 /* Utility routine:
 
@@ -1640,6 +1658,337 @@ cpp_output_line (pfile, fp)
   putc ('\n', fp);
 }
 
+/* Returns the value of a hexadecimal digit.  */
+static unsigned int
+hex_digit_value (c)
+     unsigned int c;
+{
+  if (c >= 'a' && c <= 'f')
+    return c - 'a' + 10;
+  if (c >= 'A' && c <= 'F')
+    return c - 'A' + 10;
+  if (c >= '0' && c <= '9')
+    return c - '0';
+  abort ();
+}
+
+/* Parse a '\uNNNN' or '\UNNNNNNNN' sequence (C++ and C99).
+
+   [lex.charset]: The character designated by the universal character
+   name \UNNNNNNNN is that character whose character short name in
+   ISO/IEC 10646 is NNNNNNNN; the character designated by the
+   universal character name \uNNNN is that character whose character
+   short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
+   for a universal character name is less than 0x20 or in the range
+   0x7F-0x9F (inclusive), or if the universal character name
+   designates a character in the basic source character set, then the
+   program is ill-formed.
+
+   We assume that wchar_t is Unicode, so we don't need to do any
+   mapping.  Is this ever wrong?  */
+
+static unsigned int
+read_ucs (pfile, pstr, limit, length)
+     cpp_reader *pfile;
+     const unsigned char **pstr;
+     const unsigned char *limit;
+     unsigned int length;
+{
+  const unsigned char *p = *pstr;
+  unsigned int c, code = 0;
+
+  for (; length; --length)
+    {
+      if (p >= limit)
+	{
+	  cpp_error (pfile, "incomplete universal-character-name");
+	  break;
+	}
+
+      c = *p;
+      if (ISXDIGIT (c))
+	{
+	  code = (code << 4) + hex_digit_value (c);
+	  p++;
+	}
+      else
+	{
+	  cpp_error (pfile,
+		     "non-hex digit '%c' in universal-character-name", c);
+	  break;
+	}
+
+    }
+
+#ifdef TARGET_EBCDIC
+  cpp_error (pfile, "universal-character-name on EBCDIC target");
+  code = 0x3f;  /* EBCDIC invalid character */
+#else
+  if (code > 0x9f && !(code & 0x80000000))
+    ; /* True extended character, OK.  */
+  else if (code >= 0x20 && code < 0x7f)
+    {
+      /* ASCII printable character.  The C character set consists of all of
+	 these except $, @ and `.  We use hex escapes so that this also
+	 works with EBCDIC hosts.  */
+      if (code != 0x24 && code != 0x40 && code != 0x60)
+	cpp_error (pfile, "universal-character-name used for '%c'", code);
+    }
+  else
+    cpp_error (pfile, "invalid universal-character-name");
+#endif
+
+  *pstr = p;
+  return code;
+}
+
+/* Interpret an escape sequence, and return its value.  PSTR points to
+   the input pointer, which is just after the backslash.  LIMIT is how
+   much text we have.  MASK is the precision for the target type (char
+   or wchar_t).  TRADITIONAL, if true, does not interpret escapes that
+   did not exist in traditional C.  */
+
+static unsigned int
+parse_escape (pfile, pstr, limit, mask, traditional)
+     cpp_reader *pfile;
+     const unsigned char **pstr;
+     const unsigned char *limit;
+     HOST_WIDE_INT mask;
+     int traditional;
+{
+  int unknown = 0;
+  const unsigned char *str = *pstr;
+  unsigned int c = *str++;
+
+  switch (c)
+    {
+    case '\\': case '\'': case '"': case '?': break;
+    case 'b': c = TARGET_BS;	  break;
+    case 'f': c = TARGET_FF;	  break;
+    case 'n': c = TARGET_NEWLINE; break;
+    case 'r': c = TARGET_CR;	  break;
+    case 't': c = TARGET_TAB;	  break;
+    case 'v': c = TARGET_VT;	  break;
+
+    case '(': case '{': case '[': case '%':
+      /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
+	 '\%' is used to prevent SCCS from getting confused.  */
+      unknown = CPP_PEDANTIC (pfile);
+      break;
+
+    case 'a':
+      if (CPP_WTRADITIONAL (pfile))
+	cpp_warning (pfile, "the meaning of '\\a' varies with -traditional");
+      if (!traditional)
+	c = TARGET_BELL;
+      break;
+
+    case 'e': case 'E':
+      if (CPP_PEDANTIC (pfile))
+	cpp_pedwarn (pfile, "non-ISO-standard escape sequence, '\\%c'", c);
+      c = TARGET_ESC;
+      break;
+      
+      /* Warnings and support checks handled by read_ucs().  */
+    case 'u': case 'U':
+      if (CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99))
+	{
+	  if (CPP_WTRADITIONAL (pfile))
+	    cpp_warning (pfile,
+			 "the meaning of '\\%c' varies with -traditional", c);
+	  c = read_ucs (pfile, &str, limit, c == 'u' ? 4 : 8);
+	}
+      else
+	unknown = 1;
+      break;
+
+    case 'x':
+      if (CPP_WTRADITIONAL (pfile))
+	cpp_warning (pfile, "the meaning of '\\x' varies with -traditional");
+
+      if (!traditional)
+	{
+	  unsigned int i = 0, overflow = 0;
+	  int digits_found = 0;
+
+	  while (str < limit)
+	    {
+	      c = *str;
+	      if (! ISXDIGIT (c))
+		break;
+	      str++;
+	      overflow |= i ^ (i << 4 >> 4);
+	      i = (i << 4) + hex_digit_value (c);
+	      digits_found = 1;
+	    }
+
+	  if (!digits_found)
+	    cpp_error (pfile, "\\x used with no following hex digits");
+
+	  if (overflow | (i != (i & mask)))
+	    {
+	      cpp_pedwarn (pfile, "hex escape sequence out of range");
+	      i &= mask;
+	    }
+	  c = i;
+	}
+      break;
+
+    case '0':  case '1':  case '2':  case '3':
+    case '4':  case '5':  case '6':  case '7':
+      {
+	unsigned int i = c - '0';
+	int count = 0;
+
+	while (str < limit && ++count < 3)
+	  {
+	    c = *str;
+	    if (c < '0' || c > '7')
+	      break;
+	    str++;
+	    i = (i << 3) + c - '0';
+	  }
+
+	if (i != (i & mask))
+	  {
+	    cpp_pedwarn (pfile, "octal escape sequence out of range");
+	    i &= mask;
+	  }
+	c = i;
+      }
+      break;
+
+    default:
+      unknown = 1;
+      break;
+    }
+
+  if (unknown)
+    {
+      if (ISGRAPH (c))
+	cpp_pedwarn (pfile, "unknown escape sequence '\\%c'", c);
+      else
+	cpp_pedwarn (pfile, "unknown escape sequence: '\\%03o'", c);
+    }
+
+  *pstr = str;
+  return c;
+}
+
+#ifndef MAX_CHAR_TYPE_SIZE
+#define MAX_CHAR_TYPE_SIZE CHAR_TYPE_SIZE
+#endif
+
+#ifndef MAX_WCHAR_TYPE_SIZE
+#define MAX_WCHAR_TYPE_SIZE WCHAR_TYPE_SIZE
+#endif
+
+/* Interpret a (possibly wide) character constant in TOKEN.
+   WARN_MULTI warns about multi-character charconsts, if not
+   TRADITIONAL.  TRADITIONAL also indicates not to interpret escapes
+   that did not exist in traditional C.  PCHARS_SEEN points to a
+   variable that is filled in with the number of characters seen.  */
+HOST_WIDE_INT
+cpp_interpret_charconst (pfile, token, warn_multi, traditional, pchars_seen)
+     cpp_reader *pfile;
+     const cpp_token *token;
+     int warn_multi;
+     int traditional;
+     unsigned int *pchars_seen;
+{
+  const unsigned char *str = token->val.str.text;
+  const unsigned char *limit = str + token->val.str.len;
+  unsigned int chars_seen = 0;
+  unsigned int width, max_chars, c;
+  HOST_WIDE_INT result = 0, mask;
+
+#ifdef MULTIBYTE_CHARS
+  (void) local_mbtowc (NULL, NULL, 0);
+#endif
+
+  /* Width in bits.  */
+  if (token->type == CPP_CHAR)
+    width = MAX_CHAR_TYPE_SIZE;
+  else
+    width = MAX_WCHAR_TYPE_SIZE;
+
+  if (width < HOST_BITS_PER_WIDE_INT)
+    mask = ((unsigned HOST_WIDE_INT) 1 << width) - 1;
+  else
+    mask = ~0;
+  max_chars = HOST_BITS_PER_WIDE_INT / width;
+
+  while (str < limit)
+    {
+#ifdef MULTIBYTE_CHARS
+      wchar_t wc;
+      int char_len;
+
+      char_len = local_mbtowc (&wc, str, limit - str);
+      if (char_len == -1)
+	{
+	  cpp_warning (pfile, "ignoring invalid multibyte character");
+	  c = *str++;
+	}
+      else
+	{
+	  str += char_len;
+	  c = wc;
+	}
+#else
+      c = *str++;
+#endif
+
+      if (c == '\\')
+	{
+	  c = parse_escape (pfile, &str, limit, mask, traditional);
+	  if (width < HOST_BITS_PER_WIDE_INT && c > mask)
+	    cpp_pedwarn (pfile, "escape sequence out of range for character");
+	}
+
+#ifdef MAP_CHARACTER
+      if (ISPRINT (c))
+	c = MAP_CHARACTER (c);
+#endif
+      
+      /* Merge character into result; ignore excess chars.  */
+      if (++chars_seen <= max_chars)
+	{
+	  if (width < HOST_BITS_PER_WIDE_INT)
+	    result = (result << width) | (c & mask);
+	  else
+	    result = c;
+	}
+    }
+
+  if (chars_seen == 0)
+    cpp_error (pfile, "empty character constant");
+  else if (chars_seen > max_chars)
+    {
+      chars_seen = max_chars;
+      cpp_error (pfile, "character constant too long");
+    }
+  else if (chars_seen > 1 && !traditional && warn_multi)
+    cpp_warning (pfile, "multi-character character constant");
+
+  /* If char type is signed, sign-extend the constant.  The
+     __CHAR_UNSIGNED__ macro is set by the driver if appropriate.  */
+  if (token->type == CPP_CHAR && chars_seen)
+    {
+      unsigned int nbits = chars_seen * width;
+      unsigned int mask = (unsigned int) ~0 >> (HOST_BITS_PER_INT - nbits);
+
+      if (pfile->spec_nodes.n__CHAR_UNSIGNED__->type == NT_MACRO
+	  || ((result >> (nbits - 1)) & 1) == 0)
+	result &= mask;
+      else
+	result |= ~mask;
+    }
+
+  *pchars_seen = chars_seen;
+  return result;
+}
+
 /* Memory pools.  */
 
 struct dummy
author	Neil Booth <neil@daikokuya.demon.co.uk>	2001-05-11 23:48:21 +0000
committer	Neil Booth <neil@gcc.gnu.org>	2001-05-11 23:48:21 +0000
commit	c8a96070172673d05574e3c9fe21e28750610223 (patch)
tree	eefc97fe7077fa7f6cfc9bb868638feb3419f0dd /gcc/cpplex.c
parent	f42974dc81a20a5ea306fab56faef8c322bf26d2 (diff)
download	gcc-c8a96070172673d05574e3c9fe21e28750610223.zip gcc-c8a96070172673d05574e3c9fe21e28750610223.tar.gz gcc-c8a96070172673d05574e3c9fe21e28750610223.tar.bz2