cppcharset.c (one_utf8_to_cppchar, [...]): New functions.

* cppcharset.c (one_utf8_to_cppchar, one_cppchar_to_utf8, one_utf8_to_utf32, one_utf32_to_utf8, one_utf8_to_utf16, one_utf16_to_utf8, conversion_loop, convert_utf8_utf16, convert_utf8_utf32, convert_utf16_utf8, convert_utf32_utf8, convert_no_conversion, convert_using_iconv): New functions. (APPLY_CONVERSION): New macro. (struct conversion, conversion_tab): New data structure. (init_iconv_desc): Check conversion_tab for a custom conversion primitive before trying to use iconv. (convert_cset): Deleted. (cpp_init_iconv): Use UTF- terminology, not UCS-. (_cpp_destroy_iconv): Update to match. (_cpp_valid_ucn): We don't need iconv to implement UCNs. (convert_ucn): Use one_cppchar_to_utf8 and APPLY_CONVERSION. (convert_escape, cpp_interpret_string): Use APPLY_CONVERSION. (_cpp_interpret_string_notranslate): New function, moved here from cpplib.c. * cpphash.h (convert_f, struct cset_converter): New types. (struct cpp_reader): narrow_cset_desc and wide_cset_desc are now struct cset_converter, not bare iconv_t. Update prototypes. * cpplib.c (interpret_string_notranslate): Moved to cppcharset.c; all callers changed. From-SVN: r69204
author: Zack Weinberg <zack@gcc.gnu.org> 2003-07-10 23:16:31 +0000
committer: Zack Weinberg <zack@gcc.gnu.org> 2003-07-10 23:16:31 +0000
commit: 6b88314c653778739e9f92700c3ae6a94a2a222c (patch)
tree: aecfe84fc61678543cc3833bd2320e614dbbd339 /gcc
parent: 38f4680f542122d06627905369389046efeb4289 (diff)
download: gcc-6b88314c653778739e9f92700c3ae6a94a2a222c.zip
gcc-6b88314c653778739e9f92700c3ae6a94a2a222c.tar.gz
gcc-6b88314c653778739e9f92700c3ae6a94a2a222c.tar.bz2
4 files changed, 634 insertions, 157 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 16da545..5094738 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,11 +1,38 @@
+2003-07-10  Zack Weinberg  <zack@codesourcery.com>
+
+	* cppcharset.c (one_utf8_to_cppchar, one_cppchar_to_utf8,
+	one_utf8_to_utf32, one_utf32_to_utf8, one_utf8_to_utf16,
+	one_utf16_to_utf8, conversion_loop, convert_utf8_utf16,
+	convert_utf8_utf32, convert_utf16_utf8,	convert_utf32_utf8,
+	convert_no_conversion, convert_using_iconv): New functions.
+	(APPLY_CONVERSION): New macro.
+	(struct conversion, conversion_tab): New data structure.
+	(init_iconv_desc): Check conversion_tab for a custom conversion
+	primitive before trying to use iconv.
+	(convert_cset): Deleted.
+	(cpp_init_iconv): Use UTF- terminology, not UCS-.
+	(_cpp_destroy_iconv): Update to match.
+	(_cpp_valid_ucn): We don't need iconv to implement UCNs.
+	(convert_ucn): Use one_cppchar_to_utf8 and APPLY_CONVERSION.
+	(convert_escape, cpp_interpret_string): Use APPLY_CONVERSION.
+	(_cpp_interpret_string_notranslate): New function, moved here
+	from cpplib.c.
+
+	* cpphash.h (convert_f, struct cset_converter): New types.
+	(struct cpp_reader): narrow_cset_desc and wide_cset_desc
+	are now struct cset_converter, not bare iconv_t.
+	Update prototypes.
+	* cpplib.c (interpret_string_notranslate): Moved to cppcharset.c;
+	all callers changed.
+
 2003-07-10  Kelley Cook  <kelleycook@wideopenwest.com>
-        
+
 	* Makefile.in (options.h): Depend on Makefile.  Add move-if-change
 	to opts.sh command line.
 	* opts.sh: Write to temporary files with a move-if-change at the end.
 
 2003-06-30  Denis Chertykov  <denisc@overta.ru>
-            Richard Kenner <kenner@vlsi1.ultra.nyu.edu>
+	    Richard Kenner <kenner@vlsi1.ultra.nyu.edu>
 
 	* combine.c (gen_binary): Handle the CLOBBER rtx and
 	don't build a binary operation with it.
@@ -319,7 +346,7 @@
 2003-07-09  Nathanael Nerode  <neroden@gcc.gnu.org>
 
 	PR bootstrap/11043
-	* config/arc/t-arc: Replace bogus references to "x-crtinit.o", 
+	* config/arc/t-arc: Replace bogus references to "x-crtinit.o",
 	"x-crtfini.o" with "crtinit.o", "crtfini.o".
 
 	* fixinc/inclhack.def (limits_ifndefs): Add select test.
diff --git a/gcc/cppcharset.c b/gcc/cppcharset.c
index 0e9805f..c170b5c 100644
--- a/gcc/cppcharset.c
+++ b/gcc/cppcharset.c
@@ -92,8 +92,7 @@ Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
 #error "Unrecognized basic host character set"
 #endif
 
-/* This structure is used for a resizable string buffer, mostly by
-   convert_cset and cpp_interpret_string.  */
+/* This structure is used for a resizable string buffer throughout.  */
 struct strbuf
 {
   uchar *text;
@@ -103,23 +102,545 @@ struct strbuf
 
 /* This is enough to hold any string that fits on a single 80-column
    line, even if iconv quadruples its size (e.g. conversion from
-   ASCII to UCS-4) rounded up to a power of two.  */
+   ASCII to UTF-32) rounded up to a power of two.  */
 #define OUTBUF_BLOCK_SIZE 256
 
-/* Subroutine of cpp_init_iconv: initialize and return an iconv
-   descriptor for conversion from FROM to TO.  If iconv_open() fails,
-   issue an error and return (iconv_t) -1.  Silently return
-   (iconv_t) -1 if FROM and TO are identical.  */
-static iconv_t
+/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
+   logic.  This is because a depressing number of systems lack iconv,
+   or have iconv libraries that do not do these conversions, so
+   we need a fallback implementation for them.  To ensure the fallback
+   doesn't break due to neglect, it is used on all systems.
+
+   UTF-32 encoding is nice and simple: a four-byte binary number,
+   constrained to the range 00000000-7FFFFFFF to avoid questions of
+   signedness.  We do have to cope with big- and little-endian
+   variants.
+
+   UTF-16 encoding uses two-byte binary numbers, again in big- and
+   little-endian variants, for all values in the 00000000-0000FFFF
+   range.  Values in the 00010000-0010FFFF range are encoded as pairs
+   of two-byte numbers, called "surrogate pairs": given a number S in
+   this range, it is mapped to a pair (H, L) as follows:
+
+     H = (S - 0x10000) / 0x400 + 0xD800
+     L = (S - 0x10000) % 0x400 + 0xDC00
+
+   Two-byte values in the D800...DFFF range are ill-formed except as a
+   component of a surrogate pair.  Even if the encoding within a
+   two-byte value is little-endian, the H member of the surrogate pair
+   comes first.
+
+   There is no way to encode values in the 00110000-7FFFFFFF range,
+   which is not currently a problem as there are no assigned code
+   points in that range; however, the author expects that it will
+   eventually become necessary to abandon UTF-16 due to this
+   limitation.  Note also that, because of these pairs, UTF-16 does
+   not meet the requirements of the C standard for a wide character
+   encoding (see 3.7.3 and 6.4.4.4p11).
+
+   UTF-8 encoding looks like this:
+
+   value range	       encoded as
+   00000000-0000007F   0xxxxxxx
+   00000080-000007FF   110xxxxx 10xxxxxx
+   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
+   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
+   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
+   never occur.  Note also that any value that can be encoded by a
+   given row of the table can also be encoded by all successive rows,
+   but this is not done; only the shortest possible encoding for any
+   given value is valid.  For instance, the character 07C0 could be
+   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
+   FC 80 80 80 9F 80.  Only the first is valid.
+
+   An implementation note: the transformation from UTF-16 to UTF-8, or
+   vice versa, is easiest done by using UTF-32 as an intermediary.  */
+
+/* Internal primitives converting one character between a UTF-8 byte stream
+   and native-endian UTF-32 in a cppchar_t, saving a marshal/unmarshal step.
+   Each returns 0 on success or an errno value (EINVAL/EILSEQ/E2BIG).  */
+static inline int
+one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
+		     cppchar_t *cp)
+{
+  /* Per sequence length N (index N-1): lead-byte payload mask, marker.  */
+  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
+  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+  cppchar_t c;
+  const uchar *inbuf = *inbufp;
+  size_t nbytes, i;
+
+  if (*inbytesleftp < 1)
+    return EINVAL;
+
+  c = *inbuf;
+  if (c < 0x80)
+    {
+      *cp = c;
+      *inbytesleftp -= 1;
+      *inbufp += 1;
+      return 0;
+    }
+
+  /* The number of leading 1-bits in the first byte indicates how many
+     bytes follow.  */
+  for (nbytes = 2; nbytes < 7; nbytes++)
+    if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
+      goto found;
+  return EILSEQ;
+ found:
+
+  if (*inbytesleftp < nbytes)
+    return EINVAL;
+
+  c = (c & masks[nbytes-1]);
+  inbuf++;
+  for (i = 1; i < nbytes; i++)
+    {
+      cppchar_t n = *inbuf++;
+      if ((n & 0xC0) != 0x80)
+	return EILSEQ;
+      c = ((c << 6) + (n & 0x3F));
+    }
+
+  /* Make sure the shortest possible encoding was used.  */
+  if (c <=      0x7F && nbytes > 1) return EILSEQ;
+  if (c <=     0x7FF && nbytes > 2) return EILSEQ;
+  if (c <=    0xFFFF && nbytes > 3) return EILSEQ;
+  if (c <=  0x1FFFFF && nbytes > 4) return EILSEQ;
+  if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
+
+  /* Make sure the character is valid.  */
+  if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
+
+  *cp = c;
+  *inbufp = inbuf;
+  *inbytesleftp -= nbytes;
+  return 0;
+}
+
+static inline int
+one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
+{
+  static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+  static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
+  size_t nbytes;
+  uchar buf[6], *p = &buf[6];
+  uchar *outbuf = *outbufp;
+
+  nbytes = 1;
+  if (c < 0x80)
+    *--p = c;
+  else
+    {
+      do
+	{
+	  *--p = ((c & 0x3F) | 0x80);
+	  c >>= 6;
+	  nbytes++;
+	}
+      while (c >= 0x3F || (c & limits[nbytes-1]));
+      *--p = (c | masks[nbytes-1]);
+    }
+
+  if (*outbytesleftp < nbytes)
+    return E2BIG;
+
+  while (p < &buf[6])
+    *outbuf++ = *p++;
+  *outbytesleftp -= nbytes;
+  *outbufp = outbuf;
+  return 0;
+}
+
+/* The following four functions transform one character between the two
+   encodings named in the function name.  All have the signature
+   int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+           uchar **outbufp, size_t *outbytesleftp)
+
+   BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
+   interpreted as a boolean indicating whether big-endian or
+   little-endian encoding is to be used for the member of the pair
+   that is not UTF-8.
+
+   INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
+   do for iconv.
+
+   The return value is either 0 for success, or an errno value for
+   failure, which may be E2BIG (need more space), EILSEQ (ill-formed
+   input sequence), or EINVAL (incomplete input sequence).  */
+   
+static inline int
+one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+		   uchar **outbufp, size_t *outbytesleftp)
+{
+  uchar *outbuf;
+  cppchar_t s;
+  int rval;
+
+  /* Check for space first, since we know exactly how much we need.  */
+  if (*outbytesleftp < 4)
+    return E2BIG;
+
+  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
+  if (rval)
+    return rval;
+
+  outbuf = *outbufp;
+  outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
+  outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
+  outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
+  outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
+
+  *outbufp += 4;
+  *outbytesleftp -= 4;
+  return 0;
+}
+
+static inline int
+one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+		   uchar **outbufp, size_t *outbytesleftp)
+{
+  cppchar_t s;
+  int rval;
+  const uchar *inbuf;
+
+  if (*inbytesleftp < 4)
+    return EINVAL;
+
+  inbuf = *inbufp;
+  /* Widen before shifting so the top byte never lands in int's sign bit.  */
+  s  = (cppchar_t) inbuf[bigend ? 0 : 3] << 24;
+  s += (cppchar_t) inbuf[bigend ? 1 : 2] << 16;
+  s += (cppchar_t) inbuf[bigend ? 2 : 1] << 8;
+  s += inbuf[bigend ? 3 : 0];
+  /* Accept up to 7FFFFFFF inclusive; surrogates are never valid.  */
+  if (s > 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
+    return EILSEQ;
+
+  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
+  if (rval)
+    return rval;
+
+  *inbufp += 4;
+  *inbytesleftp -= 4;
+  return 0;
+}
+
+static inline int
+one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+		   uchar **outbufp, size_t *outbytesleftp)
+{
+  int rval;
+  cppchar_t s;
+  const uchar *save_inbuf = *inbufp;
+  size_t save_inbytesleft = *inbytesleftp;
+  uchar *outbuf = *outbufp;
+
+  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
+  if (rval)
+    return rval;
+
+  if (s > 0x0010FFFF)
+    {
+      *inbufp = save_inbuf;
+      *inbytesleftp = save_inbytesleft;
+      return EILSEQ;
+    }
+  /* Values up to and including FFFF fit in a single two-byte unit.  */
+  if (s <= 0xFFFF)
+    {
+      if (*outbytesleftp < 2)
+	{
+	  *inbufp = save_inbuf;
+	  *inbytesleftp = save_inbytesleft;
+	  return E2BIG;
+	}
+      outbuf[bigend ? 1 : 0] = (s & 0x00FF);
+      outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
+
+      *outbufp += 2;
+      *outbytesleftp -= 2;
+      return 0;
+    }
+  else
+    {
+      cppchar_t hi, lo;
+
+      if (*outbytesleftp < 4)
+	{
+	  *inbufp = save_inbuf;
+	  *inbytesleftp = save_inbytesleft;
+	  return E2BIG;
+	}
+
+      hi = (s - 0x10000) / 0x400 + 0xD800;
+      lo = (s - 0x10000) % 0x400 + 0xDC00;
+
+      /* Even if we are little-endian, put the high surrogate first.
+	 ??? Matches practice?  */
+      outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
+      outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
+      outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
+      outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
+
+      *outbufp += 4;
+      *outbytesleftp -= 4;
+      return 0;
+    }
+}
+
+static inline int
+one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+		   uchar **outbufp, size_t *outbytesleftp)
+{
+  cppchar_t s;
+  const uchar *inbuf = *inbufp;
+  int rval;
+
+  if (*inbytesleftp < 2)
+    return EINVAL;
+  s  = inbuf[bigend ? 0 : 1] << 8;
+  s += inbuf[bigend ? 1 : 0];
+
+  /* Low surrogate without immediately preceding high surrogate is invalid.  */
+  if (s >= 0xDC00 && s <= 0xDFFF)
+    return EILSEQ;
+  /* High surrogate must have a following low surrogate.  */
+  else if (s >= 0xD800 && s <= 0xDBFF)
+    {
+      cppchar_t hi = s, lo;
+      if (*inbytesleftp < 4)
+	return EINVAL;
+
+      lo  = inbuf[bigend ? 2 : 3] << 8;
+      lo += inbuf[bigend ? 3 : 2];
+
+      if (lo < 0xDC00 || lo > 0xDFFF)
+	return EILSEQ;
+
+      s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
+    }
+
+  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
+  if (rval)
+    return rval;
+
+  /* Success - update the input pointers (one_cppchar_to_utf8 has done
+     the output pointers for us).  */
+  if (s <= 0xFFFF)
+    {
+      *inbufp += 2;
+      *inbytesleftp -= 2;
+    }
+  else
+    {
+      *inbufp += 4;
+      *inbytesleftp -= 4;
+    }
+  return 0;
+}
+
+/* Helper routine for the next few functions.  The 'const' on
+   one_conversion means that we promise not to modify what function is
+   pointed to, which lets the inliner see through it. */
+
+static inline bool
+conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
+					     uchar **, size_t *),
+		 iconv_t cd, const uchar *from, size_t flen, struct strbuf *to)
+{
+  const uchar *inbuf;
+  uchar *outbuf;
+  size_t inbytesleft, outbytesleft;
+  int rval;
+
+  inbuf = from;
+  inbytesleft = flen;
+  outbuf = to->text + to->len;
+  outbytesleft = to->asize - to->len;
+
+  for (;;)
+    {
+      do
+	rval = one_conversion (cd, &inbuf, &inbytesleft,
+			       &outbuf, &outbytesleft);
+      while (inbytesleft && !rval);
+
+      if (__builtin_expect (inbytesleft == 0, 1))
+	{
+	  to->len = to->asize - outbytesleft;
+	  return true;
+	}
+      if (rval != E2BIG)
+	{
+	  errno = rval;
+	  return false;
+	}
+
+      outbytesleft += OUTBUF_BLOCK_SIZE;
+      to->asize += OUTBUF_BLOCK_SIZE;
+      to->text = xrealloc (to->text, to->asize);
+      outbuf = to->text + to->asize - outbytesleft;
+    }
+}
+		 
+
+/* These functions convert entire strings between character sets.
+   They all have the signature
+
+   bool (*)(iconv_t cd, const uchar *from, size_t flen, struct strbuf *to);
+
+   The input string FROM is converted as specified by the function
+   name plus the iconv descriptor CD (which may be fake), and the
+   result appended to TO.  On any error, false is returned, otherwise true.  */
+
+/* These four use the custom conversion code above.  */
+static bool
+convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
+		    struct strbuf *to)
+{
+  return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
+}
+
+static bool
+convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
+		    struct strbuf *to)
+{
+  return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
+}
+
+static bool
+convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
+		    struct strbuf *to)
+{
+  return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
+}
+
+static bool
+convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
+		    struct strbuf *to)
+{
+  return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
+}
+
+/* Identity conversion, used when we have no alternative.  */
+static bool
+convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
+		       const uchar *from, size_t flen, struct strbuf *to)
+{
+  if (to->len + flen > to->asize)
+    {
+      to->asize = to->len + flen;
+      to->text = xrealloc (to->text, to->asize);
+    }
+  memcpy (to->text + to->len, from, flen);
+  to->len += flen;
+  return true;
+}
+
+/* And this one uses the system iconv primitive.  It's a little
+   different, since iconv's interface is a little different.  */
+
+static bool
+convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
+		     struct strbuf *to)
+{
+  ICONV_CONST char *inbuf;
+  char *outbuf;
+  size_t inbytesleft, outbytesleft;
+
+  /* Reset conversion descriptor and check that it is valid.  */
+  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
+    return false;
+
+  inbuf = (ICONV_CONST char *)from;
+  inbytesleft = flen;
+  outbuf = (char *)to->text + to->len;
+  outbytesleft = to->asize - to->len;
+
+  for (;;)
+    {
+      iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+      if (__builtin_expect (inbytesleft == 0, 1))
+	{
+	  to->len = to->asize - outbytesleft;
+	  return true;
+	}
+      if (errno != E2BIG)
+	return false;
+
+      outbytesleft += OUTBUF_BLOCK_SIZE;
+      to->asize += OUTBUF_BLOCK_SIZE;
+      to->text = xrealloc (to->text, to->asize);
+      outbuf = (char *)to->text + to->asize - outbytesleft;
+    }
+}
+
+/* Arrange for the above custom conversion logic to be used automatically
+   when conversion between a suitable pair of character sets is requested.  */
+
+#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
+   CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
+
+struct conversion
+{
+  const char *pair;
+  convert_f func;
+  iconv_t fake_cd;
+};
+static const struct conversion conversion_tab[] = {
+  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
+  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
+  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
+  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
+  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
+  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
+  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
+  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
+};
+
+/* Subroutine of cpp_init_iconv: initialize and return a
+   cset_converter structure for conversion from FROM to TO.  If
+   iconv_open() fails, issue an error and return an identity
+   converter.  Silently return an identity converter if FROM and TO
+   are identical.  */
+static struct cset_converter
 init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
 {
-  iconv_t dsc;
+  struct cset_converter ret;
+  char *pair;
+  size_t i;
+  
+  if (!strcasecmp (to, from))
+    {
+      ret.func = convert_no_conversion;
+      ret.cd = (iconv_t) -1;
+      return ret;
+    }
+
+  pair = alloca(strlen(to) + strlen(from) + 2);
+
+  strcpy(pair, from);
+  strcat(pair, "/");
+  strcat(pair, to);
+  for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
+    if (!strcasecmp (pair, conversion_tab[i].pair))
+      {
+	ret.func = conversion_tab[i].func;
+	ret.cd = conversion_tab[i].fake_cd;
+	return ret;
+      }
 
-  if (!strcmp (to, from))
-    return (iconv_t) -1;
+  /* No custom converter - try iconv.  */
+  ret.func = convert_using_iconv;
+  ret.cd = iconv_open (to, from);
 
-  dsc = iconv_open (to, from);
-  if (dsc == (iconv_t) -1)
+  if (ret.cd == (iconv_t) -1)
     {
       if (errno == EINVAL)
 	cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
@@ -127,8 +648,10 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
 		   from, to);
       else
 	cpp_errno (pfile, DL_ERROR, "iconv_open");
+
+      ret.func = convert_no_conversion;
     }
-  return dsc;
+  return ret;
 }
 
 /* If charset conversion is requested, initialize iconv(3) descriptors
@@ -146,9 +669,9 @@ cpp_init_iconv (cpp_reader *pfile)
   bool be = CPP_OPTION (pfile, bytes_big_endian);
 
   if (CPP_OPTION (pfile, wchar_precision) >= 32)
-    default_wcset = be ? "UCS-4BE" : "UCS-4LE";
+    default_wcset = be ? "UTF-32BE" : "UTF-32LE";
   else if (CPP_OPTION (pfile, wchar_precision) >= 16)
-    default_wcset = be ? "UCS-2BE" : "UCS-2LE";
+    default_wcset = be ? "UTF-16BE" : "UTF-16LE";
   else
     /* This effectively means that wide strings are not supported,
        so don't do any conversion at all.  */
@@ -181,67 +704,13 @@ _cpp_destroy_iconv (cpp_reader *pfile)
 {
   if (HAVE_ICONV)
     {
-      if (pfile->narrow_cset_desc != (iconv_t) -1)
-	iconv_close (pfile->narrow_cset_desc);
-      if (pfile->wide_cset_desc != (iconv_t) -1)
-	iconv_close (pfile->wide_cset_desc);
+      if (pfile->narrow_cset_desc.func == convert_using_iconv)
+	iconv_close (pfile->narrow_cset_desc.cd);
+      if (pfile->wide_cset_desc.func == convert_using_iconv)
+	iconv_close (pfile->wide_cset_desc.cd);
     }
 }
 
-/* iconv(3) utility wrapper.  Convert the string FROM, of length FLEN,
-   according to the iconv descriptor CD.  The result is appended to
-   the string buffer TO.  If DESC is (iconv_t)-1 or iconv is not
-   available, the string is simply copied into TO.
-
-   Returns true on success, false on error.  */
-
-static bool
-convert_cset (iconv_t cd, const uchar *from, size_t flen, struct strbuf *to)
-{
-  if (!HAVE_ICONV || cd == (iconv_t)-1)
-    {
-      if (to->len + flen > to->asize)
-	{
-	  to->asize = to->len + flen;
-	  to->text = xrealloc (to->text, to->asize);
-	}
-      memcpy (to->text + to->len, from, flen);
-      to->len += flen;
-      return true;
-    }
-  else
-    {
-      ICONV_CONST char *inbuf;
-      char *outbuf;
-      size_t inbytesleft, outbytesleft;
-
-      /* Reset conversion descriptor and check that it is valid.  */
-      if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
-	return false;
-
-      inbuf = (ICONV_CONST char *)from;
-      inbytesleft = flen;
-      outbuf = (char *)to->text + to->len;
-      outbytesleft = to->asize - to->len;
-
-      for (;;)
-	{
-	  iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
-	  if (__builtin_expect (inbytesleft == 0, 1))
-	    {
-	      to->len = to->asize - outbytesleft;
-	      return true;
-	    }
-	  if (errno != E2BIG)
-	    return false;
-
-	  outbytesleft += OUTBUF_BLOCK_SIZE;
-	  to->asize += OUTBUF_BLOCK_SIZE;
-	  to->text = xrealloc (to->text, to->asize);
-	  outbuf = (char *)to->text + to->asize - outbytesleft;
-	}
-    }
-}
 
 /* Utility routine that computes a mask of the form 0000...111... with
    WIDTH 1-bits.  */
@@ -390,15 +859,6 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
    "universal character %.*s is not valid at the start of an identifier",
 		   (int) (str - base), base);
     }
-  /* We don't accept UCNs if iconv is not available or will not
-     convert to the target wide character set.  */
-  else if (!HAVE_ICONV || pfile->wide_cset_desc == (iconv_t) -1)
-    {
-      /* XXX should be DL_SORRY */
-      cpp_error (pfile, DL_ERROR,
-	"universal character names are not supported in this configuration");
-    }
-
 
   if (result == 0)
     result = 1;
@@ -408,58 +868,31 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
 
 /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
    it to the execution character set and write the result into TBUF.
-   An advanced pointer is returned.  Issues all relevant diagnostics.
-
-   UTF-8 encoding looks like this:
-
-   value range	       encoded as
-   00000000-0000007F   0xxxxxxx
-   00000080-000007FF   110xxxxx 10xxxxxx
-   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
-   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+   An advanced pointer is returned.  Issues all relevant diagnostics.  */
 
-   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
-   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
-   never occur.  Note also that any value that can be encoded by a
-   given row of the table can also be encoded by all successive rows,
-   but this is not done; only the shortest possible encoding for any
-   given value is valid.  For instance, the character 07C0 could be
-   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
-   FC 80 80 80 9F 80.  Only the first is valid.  */
 
 static const uchar *
 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
 	     struct strbuf *tbuf, bool wide)
 {
-  int nbytes;
-  uchar buf[6], *p = &buf[6];
-  static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
   cppchar_t ucn;
-
-  from++; /* skip u/U */
+  uchar buf[6];
+  uchar *bufp = buf;
+  size_t bytesleft = 6;
+  int rval;
+  struct cset_converter cvt
+    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
+
+  from++;  /* skip u/U */
   ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
-  if (!ucn)
-    return from;
 
-  nbytes = 1;
-  if (ucn < 0x80)
-    *--p = ucn;
-  else
+  rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
+  if (rval)
     {
-      do
-	{
-	  *--p = ((ucn & 0x3F) | 0x80);
-	  ucn >>= 6;
-	  nbytes++;
-	}
-      while (ucn >= 0x3F || (ucn & masks[nbytes-1]));
-      *--p = (ucn | masks[nbytes-1]);
+      errno = rval;
+      cpp_errno (pfile, DL_ERROR, "converting UCN to source character set");
     }
-
-  if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
-		     p, nbytes, tbuf))
+  else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
     cpp_errno (pfile, DL_ERROR, "converting UCN to execution character set");
 
   return from;
@@ -615,6 +1048,8 @@ convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
 #endif
 
   uchar c;
+  struct cset_converter cvt
+    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
 
   c = *from;
   switch (c)
@@ -676,8 +1111,7 @@ convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
     }
 
   /* Now convert what we have to the execution character set.  */
-  if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
-		     &c, 1, tbuf))
+  if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
     cpp_errno (pfile, DL_ERROR,
 	       "converting escape sequence to execution character set");
 
@@ -697,7 +1131,8 @@ cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
   struct strbuf tbuf;
   const uchar *p, *base, *limit;
   size_t i;
-  iconv_t cd = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
+  struct cset_converter cvt
+    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
 
   tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
   tbuf.text = xmalloc (tbuf.asize);
@@ -719,7 +1154,7 @@ cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
 	    {
 	      /* We have a run of normal characters; these can be fed
 		 directly to convert_cset.  */
-	      if (!convert_cset (cd, base, p - base, &tbuf))
+	      if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
 		goto fail;
 	    }
 	  if (p == limit)
@@ -741,6 +1176,25 @@ cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
   free (tbuf.text);
   return false;
 }
+
+/* Subroutine of do_line and do_linemarker.  Convert escape sequences
+   in a string, but do not perform character set conversion.  */
+bool
+_cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *in,
+				   cpp_string *out)
+{
+  struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
+  bool retval;
+
+  pfile->narrow_cset_desc.func = convert_no_conversion;
+  pfile->narrow_cset_desc.cd = (iconv_t) -1;
+
+  retval = cpp_interpret_string (pfile, in, 1, out, false);
+
+  pfile->narrow_cset_desc = save_narrow_cset_desc;
+  return retval;
+}
+
 
 /* Subroutine of cpp_interpret_charconst which performs the conversion
    to a number, for narrow strings.  STR is the string structure returned
diff --git a/gcc/cpphash.h b/gcc/cpphash.h
index f4a7cfc..0e1bb39 100644
--- a/gcc/cpphash.h
+++ b/gcc/cpphash.h
@@ -35,6 +35,15 @@ typedef int iconv_t;  /* dummy */
 struct directive;		/* Deliberately incomplete.  */
 struct pending_option;
 struct op;
+struct strbuf;
+
+typedef bool (*convert_f) (iconv_t, const unsigned char *, size_t,
+			   struct strbuf *);
+struct cset_converter
+{
+  convert_f func;
+  iconv_t cd;
+};
 
 #ifndef HAVE_UCHAR
 typedef unsigned char uchar;
@@ -369,14 +378,13 @@ struct cpp_reader
   unsigned char *macro_buffer;
   unsigned int macro_buffer_len;
 
-  /* Iconv descriptor for converting from the source character set
-     to the execution character set.  (iconv_t)-1 for no conversion.  */
-  iconv_t narrow_cset_desc;
+  /* Descriptor for converting from the source character set to the
+     execution character set.  */
+  struct cset_converter narrow_cset_desc;
 
-  /* Iconv descriptor for converting from the execution character set
-     to the wide execution character set.  (iconv_t)-1 for no conversion
-     other than zero-extending each character to the width of wchar_t.  */
-  iconv_t wide_cset_desc;
+  /* Descriptor for converting from the source character set to the
+     wide execution character set.  */
+  struct cset_converter wide_cset_desc;
 
   /* Tree of other included files.  See cppfiles.c.  */
   struct splay_tree_s *all_include_files;
@@ -555,8 +563,11 @@ extern uchar *_cpp_copy_replacement_text (const cpp_macro *, uchar *);
 extern size_t _cpp_replacement_text_len (const cpp_macro *);
 
 /* In cppcharset.c.  */
-cppchar_t _cpp_valid_ucn (cpp_reader *, const uchar **, const uchar *, int);
-void _cpp_destroy_iconv (cpp_reader *);
+extern cppchar_t _cpp_valid_ucn (cpp_reader *, const uchar **,
+				 const uchar *, int);
+extern void _cpp_destroy_iconv (cpp_reader *);
+extern bool _cpp_interpret_string_notranslate (cpp_reader *, const cpp_string *,
+					       cpp_string *);
 
 /* Utility routines and macros.  */
 #define DSC(str) (const uchar *)str, sizeof str - 1
diff --git a/gcc/cpplib.c b/gcc/cpplib.c
index 2fac44e..1dfef72 100644
--- a/gcc/cpplib.c
+++ b/gcc/cpplib.c
@@ -733,21 +733,6 @@ strtoul_for_line (const uchar *str, unsigned int len, long unsigned int *nump)
   return 0;
 }
 
-/* Subroutine of do_line and do_linemarker.  Convert escape sequences
-   in a string, but do not perform character set conversion.  */
-static bool
-interpret_string_notranslate (cpp_reader *pfile, const cpp_string *in,
-			      cpp_string *out)
-{
-  iconv_t save_narrow_cset_desc = pfile->narrow_cset_desc;
-  bool retval;
-
-  pfile->narrow_cset_desc = (iconv_t) -1;
-  retval = cpp_interpret_string (pfile, in, 1, out, false);
-  pfile->narrow_cset_desc = save_narrow_cset_desc;
-  return retval;
-}
-
 /* Interpret #line command.
    Note that the filename string (if any) is a true string constant
    (escapes are interpreted), unlike in #line.  */
@@ -780,7 +765,7 @@ do_line (cpp_reader *pfile)
   if (token->type == CPP_STRING)
     {
       cpp_string s = { 0, 0 };
-      if (interpret_string_notranslate (pfile, &token->val.str, &s))
+      if (_cpp_interpret_string_notranslate (pfile, &token->val.str, &s))
 	new_file = (const char *)s.text;
       check_eol (pfile);
     }
@@ -829,7 +814,7 @@ do_linemarker (cpp_reader *pfile)
   if (token->type == CPP_STRING)
     {
       cpp_string s = { 0, 0 };
-      if (interpret_string_notranslate (pfile, &token->val.str, &s))
+      if (_cpp_interpret_string_notranslate (pfile, &token->val.str, &s))
 	new_file = (const char *)s.text;
       
       new_sysp = 0;
author	Zack Weinberg <zack@gcc.gnu.org>	2003-07-10 23:16:31 +0000
committer	Zack Weinberg <zack@gcc.gnu.org>	2003-07-10 23:16:31 +0000
commit	6b88314c653778739e9f92700c3ae6a94a2a222c (patch)
tree	aecfe84fc61678543cc3833bd2320e614dbbd339 /gcc
parent	38f4680f542122d06627905369389046efeb4289 (diff)
download	gcc-6b88314c653778739e9f92700c3ae6a94a2a222c.zip gcc-6b88314c653778739e9f92700c3ae6a94a2a222c.tar.gz gcc-6b88314c653778739e9f92700c3ae6a94a2a222c.tar.bz2