aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorZack Weinberg <zack@gcc.gnu.org>2003-07-10 23:16:31 +0000
committerZack Weinberg <zack@gcc.gnu.org>2003-07-10 23:16:31 +0000
commit6b88314c653778739e9f92700c3ae6a94a2a222c (patch)
treeaecfe84fc61678543cc3833bd2320e614dbbd339 /gcc
parent38f4680f542122d06627905369389046efeb4289 (diff)
downloadgcc-6b88314c653778739e9f92700c3ae6a94a2a222c.zip
gcc-6b88314c653778739e9f92700c3ae6a94a2a222c.tar.gz
gcc-6b88314c653778739e9f92700c3ae6a94a2a222c.tar.bz2
cppcharset.c (one_utf8_to_cppchar, [...]): New functions.
* cppcharset.c (one_utf8_to_cppchar, one_cppchar_to_utf8, one_utf8_to_utf32, one_utf32_to_utf8, one_utf8_to_utf16, one_utf16_to_utf8, conversion_loop, convert_utf8_utf16, convert_utf8_utf32, convert_utf16_utf8, convert_utf32_utf8, convert_no_conversion, convert_using_iconv): New functions. (APPLY_CONVERSION): New macro. (struct conversion, conversion_tab): New data structure. (init_iconv_desc): Check conversion_tab for a custom conversion primitive before trying to use iconv. (convert_cset): Deleted. (cpp_init_iconv): Use UTF- terminology, not UCS-. (_cpp_destroy_iconv): Update to match. (_cpp_valid_ucn): We don't need iconv to implement UCNs. (convert_ucn): Use one_cppchar_to_utf8 and APPLY_CONVERSION. (convert_escape, cpp_interpret_string): Use APPLY_CONVERSION. (_cpp_interpret_string_notranslate): New function, moved here from cpplib.c. * cpphash.h (convert_f, struct cset_converter): New types. (struct cpp_reader): narrow_cset_desc and wide_cset_desc are now struct cset_converter, not bare iconv_t. Update prototypes. * cpplib.c (interpret_string_notranslate): Moved to cppcharset.c; all callers changed. From-SVN: r69204
Diffstat (limited to 'gcc')
-rw-r--r--gcc/ChangeLog33
-rw-r--r--gcc/cppcharset.c710
-rw-r--r--gcc/cpphash.h29
-rw-r--r--gcc/cpplib.c19
4 files changed, 634 insertions, 157 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 16da545..5094738 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,11 +1,38 @@
+2003-07-10 Zack Weinberg <zack@codesourcery.com>
+
+ * cppcharset.c (one_utf8_to_cppchar, one_cppchar_to_utf8,
+ one_utf8_to_utf32, one_utf32_to_utf8, one_utf8_to_utf16,
+ one_utf16_to_utf8, conversion_loop, convert_utf8_utf16,
+ convert_utf8_utf32, convert_utf16_utf8, convert_utf32_utf8,
+ convert_no_conversion, convert_using_iconv): New functions.
+ (APPLY_CONVERSION): New macro.
+ (struct conversion, conversion_tab): New data structure.
+ (init_iconv_desc): Check conversion_tab for a custom conversion
+ primitive before trying to use iconv.
+ (convert_cset): Deleted.
+ (cpp_init_iconv): Use UTF- terminology, not UCS-.
+ (_cpp_destroy_iconv): Update to match.
+ (_cpp_valid_ucn): We don't need iconv to implement UCNs.
+ (convert_ucn): Use one_cppchar_to_utf8 and APPLY_CONVERSION.
+ (convert_escape, cpp_interpret_string): Use APPLY_CONVERSION.
+ (_cpp_interpret_string_notranslate): New function, moved here
+ from cpplib.c.
+
+ * cpphash.h (convert_f, struct cset_converter): New types.
+ (struct cpp_reader): narrow_cset_desc and wide_cset_desc
+ are now struct cset_converter, not bare iconv_t.
+ Update prototypes.
+ * cpplib.c (interpret_string_notranslate): Moved to cppcharset.c;
+ all callers changed.
+
2003-07-10 Kelley Cook <kelleycook@wideopenwest.com>
-
+
* Makefile.in (options.h): Depend on Makefile. Add move-if-change
to opts.sh command line.
* opts.sh: Write to temporary files with a move-if-change at the end.
2003-06-30 Denis Chertykov <denisc@overta.ru>
- Richard Kenner <kenner@vlsi1.ultra.nyu.edu>
+ Richard Kenner <kenner@vlsi1.ultra.nyu.edu>
* combine.c (gen_binary): Handle the CLOBBER rtx and
don't build a binary operation with it.
@@ -319,7 +346,7 @@
2003-07-09 Nathanael Nerode <neroden@gcc.gnu.org>
PR bootstrap/11043
- * config/arc/t-arc: Replace bogus references to "x-crtinit.o",
+ * config/arc/t-arc: Replace bogus references to "x-crtinit.o",
"x-crtfini.o" with "crtinit.o", "crtfini.o".
* fixinc/inclhack.def (limits_ifndefs): Add select test.
diff --git a/gcc/cppcharset.c b/gcc/cppcharset.c
index 0e9805f..c170b5c 100644
--- a/gcc/cppcharset.c
+++ b/gcc/cppcharset.c
@@ -92,8 +92,7 @@ Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
#error "Unrecognized basic host character set"
#endif
-/* This structure is used for a resizable string buffer, mostly by
- convert_cset and cpp_interpret_string. */
+/* This structure is used for a resizable string buffer throughout. */
struct strbuf
{
uchar *text;
@@ -103,23 +102,545 @@ struct strbuf
/* This is enough to hold any string that fits on a single 80-column
line, even if iconv quadruples its size (e.g. conversion from
- ASCII to UCS-4) rounded up to a power of two. */
+ ASCII to UTF-32) rounded up to a power of two. */
#define OUTBUF_BLOCK_SIZE 256
-/* Subroutine of cpp_init_iconv: initialize and return an iconv
- descriptor for conversion from FROM to TO. If iconv_open() fails,
- issue an error and return (iconv_t) -1. Silently return
- (iconv_t) -1 if FROM and TO are identical. */
-static iconv_t
+/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
+ logic. This is because a depressing number of systems lack iconv,
+ or have iconv libraries that do not do these conversions, so
+ we need a fallback implementation for them. To ensure the fallback
+ doesn't break due to neglect, it is used on all systems.
+
+ UTF-32 encoding is nice and simple: a four-byte binary number,
+ constrained to the range 00000000-7FFFFFFF to avoid questions of
+ signedness. We do have to cope with big- and little-endian
+ variants.
+
+ UTF-16 encoding uses two-byte binary numbers, again in big- and
+ little-endian variants, for all values in the 00000000-0000FFFF
+ range. Values in the 00010000-0010FFFF range are encoded as pairs
+ of two-byte numbers, called "surrogate pairs": given a number S in
+ this range, it is mapped to a pair (H, L) as follows:
+
+ H = (S - 0x10000) / 0x400 + 0xD800
+ L = (S - 0x10000) % 0x400 + 0xDC00
+
+ Two-byte values in the D800...DFFF range are ill-formed except as a
+ component of a surrogate pair. Even if the encoding within a
+ two-byte value is little-endian, the H member of the surrogate pair
+ comes first.
+
+ There is no way to encode values in the 00110000-7FFFFFFF range,
+ which is not currently a problem as there are no assigned code
+ points in that range; however, the author expects that it will
+ eventually become necessary to abandon UTF-16 due to this
+ limitation. Note also that, because of these pairs, UTF-16 does
+ not meet the requirements of the C standard for a wide character
+ encoding (see 3.7.3 and 6.4.4.4p11).
+
+ UTF-8 encoding looks like this:
+
+ value range encoded as
+ 00000000-0000007F 0xxxxxxx
+ 00000080-000007FF 110xxxxx 10xxxxxx
+ 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
+ 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+ Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
+ which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
+ never occur. Note also that any value that can be encoded by a
+ given row of the table can also be encoded by all successive rows,
+ but this is not done; only the shortest possible encoding for any
+ given value is valid. For instance, the character 07C0 could be
+ encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
+ FC 80 80 80 9F 80. Only the first is valid.
+
+ An implementation note: the transformation from UTF-16 to UTF-8, or
+ vice versa, is easiest done by using UTF-32 as an intermediary. */
+
+/* Internal primitives which go from an UTF-8 byte stream to native-endian
+ UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
+ operation in several places below. */
+/* Decode the single UTF-8 sequence at *INBUFP, of which at most
+   *INBYTESLEFTP bytes are available, and store the code point in *CP.
+
+   Returns 0 on success, after advancing *INBUFP and decreasing
+   *INBYTESLEFTP by the length of the sequence.  Returns EINVAL if the
+   input is empty or ends in the middle of a sequence, and EILSEQ for a
+   bad lead byte, a bad continuation byte, an overlong encoding or a
+   surrogate.  On failure *INBUFP, *INBYTESLEFTP and *CP are left
+   untouched.  */
+static inline int
+one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
+                     cppchar_t *cp)
+{
+  /* For a sequence of N bytes, masks[N-1] selects the payload bits of
+     the lead byte and patns[N-1] is the fixed prefix that the other
+     bits of the lead byte must equal.  A five-byte lead byte is
+     111110xx, i.e. it carries two payload bits, hence 0x03.  */
+  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
+  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+
+  cppchar_t c;
+  const uchar *inbuf = *inbufp;
+  size_t nbytes, i;
+
+  if (*inbytesleftp < 1)
+    return EINVAL;
+
+  /* Fast path: a single-byte (ASCII) character.  */
+  c = *inbuf;
+  if (c < 0x80)
+    {
+      *cp = c;
+      *inbytesleftp -= 1;
+      *inbufp += 1;
+      return 0;
+    }
+
+  /* The run of leading 1-bits in the lead byte gives the total length
+     of the sequence, lead byte included.  */
+  for (nbytes = 2; nbytes <= 6; nbytes++)
+    if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
+      break;
+  if (nbytes > 6)
+    return EILSEQ;
+
+  if (*inbytesleftp < nbytes)
+    return EINVAL;
+
+  /* Accumulate six payload bits from each continuation byte.  */
+  c = (c & masks[nbytes-1]);
+  inbuf++;
+  for (i = 1; i < nbytes; i++)
+    {
+      cppchar_t n = *inbuf++;
+      if ((n & 0xC0) != 0x80)
+        return EILSEQ;
+      c = ((c << 6) + (n & 0x3F));
+    }
+
+  /* Make sure the shortest possible encoding was used.  */
+  if (c <= 0x7F && nbytes > 1) return EILSEQ;
+  if (c <= 0x7FF && nbytes > 2) return EILSEQ;
+  if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
+  if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
+  if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
+
+  /* Make sure the character is valid.  */
+  if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
+
+  *cp = c;
+  *inbufp = inbuf;
+  *inbytesleftp -= nbytes;
+  return 0;
+}
+
+/* Encode the code point C as UTF-8 at *OUTBUFP, which has room for
+   *OUTBYTESLEFTP bytes.
+
+   Returns 0 on success, after advancing *OUTBUFP and decreasing
+   *OUTBYTESLEFTP by the number of bytes written.  Returns E2BIG if the
+   encoding does not fit, and EILSEQ if C is above 0x7FFFFFFF and so has
+   no UTF-8 encoding of six bytes or fewer.  Nothing is written and the
+   output pointers are unchanged on failure.  */
+static inline int
+one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
+{
+  /* masks[N-1] is the prefix of the lead byte of an N-byte sequence;
+     limits[N-1] covers the bits of that lead byte which are not
+     available for payload.  */
+  static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+  static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
+  size_t nbytes;
+  uchar buf[6], *p = &buf[6];
+  uchar *outbuf = *outbufp;
+
+  /* A larger value would need a seventh byte, which would index past
+     the end of LIMITS and write before the start of BUF.  */
+  if (c > 0x7FFFFFFF)
+    return EILSEQ;
+
+  /* Build the sequence backwards in BUF, last byte first.  */
+  nbytes = 1;
+  if (c < 0x80)
+    *--p = c;
+  else
+    {
+      do
+        {
+          *--p = ((c & 0x3F) | 0x80);
+          c >>= 6;
+          nbytes++;
+        }
+      /* Keep emitting continuation bytes until what is left of C fits
+         in the payload bits of an NBYTES-long sequence's lead byte.  */
+      while (c >= 0x3F || (c & limits[nbytes-1]));
+      *--p = (c | masks[nbytes-1]);
+    }
+
+  if (*outbytesleftp < nbytes)
+    return E2BIG;
+
+  while (p < &buf[6])
+    *outbuf++ = *p++;
+  *outbytesleftp -= nbytes;
+  *outbufp = outbuf;
+  return 0;
+}
+
+/* The following four functions transform one character between the two
+ encodings named in the function name. All have the signature
+ int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+ uchar **outbufp, size_t *outbytesleftp)
+
+ BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
+ interpreted as a boolean indicating whether big-endian or
+ little-endian encoding is to be used for the member of the pair
+ that is not UTF-8.
+
+ INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
+ do for iconv.
+
+ The return value is either 0 for success, or an errno value for
+ failure, which may be E2BIG (need more space), EILSEQ (ill-formed
+ input sequence), or EINVAL (incomplete input sequence). */
+
+static inline int
+one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+                   uchar **outbufp, size_t *outbytesleftp)
+{
+  cppchar_t ch;
+  uchar *dst;
+  int status;
+  int byteno;
+
+  /* The output is always exactly four bytes, so reject a short output
+     buffer before consuming any input.  */
+  if (*outbytesleftp < 4)
+    return E2BIG;
+
+  status = one_utf8_to_cppchar (inbufp, inbytesleftp, &ch);
+  if (status != 0)
+    return status;
+
+  /* Store byte BYTENO of CH (0 = least significant) at the position
+     dictated by the requested byte order.  */
+  dst = *outbufp;
+  for (byteno = 0; byteno < 4; byteno++)
+    dst[bigend ? 3 - byteno : byteno] = (ch >> (8 * byteno)) & 0xFF;
+
+  *outbytesleftp -= 4;
+  *outbufp += 4;
+  return 0;
+}
+
+/* Convert the single UTF-32 character at *INBUFP to UTF-8 at *OUTBUFP.
+   BIGEND selects big-endian (nonzero) or little-endian (zero) input.
+
+   Returns 0 on success, EINVAL if fewer than four input bytes remain,
+   EILSEQ if the value is a surrogate or above 0x7FFFFFFF, or whatever
+   one_cppchar_to_utf8 returns (E2BIG when the output does not fit).
+   The input pointers are advanced only on success.  */
+static inline int
+one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+                   uchar **outbufp, size_t *outbytesleftp)
+{
+  cppchar_t s;
+  int rval;
+  const uchar *inbuf;
+
+  if (*inbytesleftp < 4)
+    return EINVAL;
+
+  inbuf = *inbufp;
+
+  /* Widen each byte before shifting: a uchar promotes to int, and
+     shifting a byte >= 0x80 left by 24 would overflow a signed int.  */
+  s = (cppchar_t) inbuf[bigend ? 0 : 3] << 24;
+  s += (cppchar_t) inbuf[bigend ? 1 : 2] << 16;
+  s += (cppchar_t) inbuf[bigend ? 2 : 1] << 8;
+  s += (cppchar_t) inbuf[bigend ? 3 : 0];
+
+  /* 0x7FFFFFFF itself is the largest legal value, matching the check
+     made by one_utf8_to_cppchar.  */
+  if (s > 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
+    return EILSEQ;
+
+  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
+  if (rval)
+    return rval;
+
+  *inbufp += 4;
+  *inbytesleftp -= 4;
+  return 0;
+}
+
+/* Convert the single UTF-8 sequence at *INBUFP to UTF-16 at *OUTBUFP.
+   BIGEND selects big-endian (nonzero) or little-endian (zero) byte
+   order within each two-byte unit.
+
+   Returns 0 on success, the error from one_utf8_to_cppchar for bad or
+   truncated input, EILSEQ for a code point above 0x10FFFF (which
+   UTF-16 cannot represent), or E2BIG if the output does not fit.  On
+   EILSEQ for an unrepresentable code point and on E2BIG the input
+   pointers are restored to their values on entry.  */
+static inline int
+one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+                   uchar **outbufp, size_t *outbytesleftp)
+{
+  int rval;
+  cppchar_t s;
+  const uchar *save_inbuf = *inbufp;
+  size_t save_inbytesleft = *inbytesleftp;
+  uchar *outbuf = *outbufp;
+
+  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
+  if (rval)
+    return rval;
+
+  if (s > 0x0010FFFF)
+    {
+      *inbufp = save_inbuf;
+      *inbytesleftp = save_inbytesleft;
+      return EILSEQ;
+    }
+
+  /* Everything up to and including 0xFFFF fits in one two-byte unit.
+     The bound must be inclusive: 0xFFFF is below 0x10000, so sending it
+     down the surrogate path would make (s - 0x10000) wrap around.  */
+  if (s <= 0xFFFF)
+    {
+      if (*outbytesleftp < 2)
+        {
+          *inbufp = save_inbuf;
+          *inbytesleftp = save_inbytesleft;
+          return E2BIG;
+        }
+      outbuf[bigend ? 1 : 0] = (s & 0x00FF);
+      outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
+
+      *outbufp += 2;
+      *outbytesleftp -= 2;
+      return 0;
+    }
+  else
+    {
+      cppchar_t hi, lo;
+
+      if (*outbytesleftp < 4)
+        {
+          *inbufp = save_inbuf;
+          *inbytesleftp = save_inbytesleft;
+          return E2BIG;
+        }
+
+      hi = (s - 0x10000) / 0x400 + 0xD800;
+      lo = (s - 0x10000) % 0x400 + 0xDC00;
+
+      /* The high surrogate is always written first; BIGEND only
+         affects the byte order inside each two-byte unit.  */
+      outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
+      outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
+      outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
+      outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
+
+      *outbufp += 4;
+      *outbytesleftp -= 4;
+      return 0;
+    }
+}
+
+static inline int
+one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
+                   uchar **outbufp, size_t *outbytesleftp)
+{
+  const uchar *src = *inbufp;
+  cppchar_t unit, trail, ch;
+  size_t consumed = 2;
+  int status;
+
+  if (*inbytesleftp < 2)
+    return EINVAL;
+
+  /* Assemble the first two-byte unit in the requested byte order.  */
+  unit = src[bigend ? 0 : 1] << 8;
+  unit += src[bigend ? 1 : 0];
+  ch = unit;
+
+  /* A low surrogate may only appear right after a high surrogate.  */
+  if (unit >= 0xDC00 && unit <= 0xDFFF)
+    return EILSEQ;
+
+  /* A high surrogate must be completed by the low surrogate that
+     follows it; together they encode one value above 0xFFFF.  */
+  if (unit >= 0xD800 && unit <= 0xDBFF)
+    {
+      if (*inbytesleftp < 4)
+        return EINVAL;
+
+      trail = src[bigend ? 2 : 3] << 8;
+      trail += src[bigend ? 3 : 2];
+
+      if (trail < 0xDC00 || trail > 0xDFFF)
+        return EILSEQ;
+
+      ch = (unit - 0xD800) * 0x400 + (trail - 0xDC00) + 0x10000;
+      consumed = 4;
+    }
+
+  status = one_cppchar_to_utf8 (ch, outbufp, outbytesleftp);
+  if (status)
+    return status;
+
+  /* The output pointers were updated by one_cppchar_to_utf8; only the
+     input side remains to be advanced.  */
+  *inbufp += consumed;
+  *inbytesleftp -= consumed;
+  return 0;
+}
+
+/* Helper routine for the next few functions. The 'const' on
+ one_conversion means that we promise not to modify what function is
+ pointed to, which lets the inliner see through it. */
+
+static inline bool
+conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
+                                             uchar **, size_t *),
+                 iconv_t cd, const uchar *from, size_t flen, struct strbuf *to)
+{
+  const uchar *src = from;
+  size_t src_left = flen;
+  uchar *dst = to->text + to->len;
+  size_t dst_left = to->asize - to->len;
+  int status;
+
+  while (1)
+    {
+      /* Convert one character at a time until the input runs out or a
+         conversion reports an error.  */
+      do
+        status = one_conversion (cd, &src, &src_left, &dst, &dst_left);
+      while (status == 0 && src_left != 0);
+
+      /* All input consumed: record the new length and finish.  */
+      if (__builtin_expect (src_left == 0, 1))
+        {
+          to->len = to->asize - dst_left;
+          return true;
+        }
+
+      /* Anything other than a full output buffer is a hard failure,
+         reported to the caller through errno.  */
+      if (status != E2BIG)
+        {
+          errno = status;
+          return false;
+        }
+
+      /* Grow the output buffer by one block and retry; the write
+         position is recomputed because xrealloc may move the text.  */
+      to->asize += OUTBUF_BLOCK_SIZE;
+      dst_left += OUTBUF_BLOCK_SIZE;
+      to->text = xrealloc (to->text, to->asize);
+      dst = to->text + (to->asize - dst_left);
+    }
+}
+
+
+/* These functions convert entire strings between character sets.
+ They all have the signature
+
+ bool (*)(iconv_t cd, const uchar *from, size_t flen, struct strbuf *to);
+
+ The input string FROM is converted as specified by the function
+ name plus the iconv descriptor CD (which may be fake), and the
+ result appended to TO. On any error, false is returned, otherwise true. */
+
+/* These four use the custom conversion code above. */
+static bool /* Append FROM (FLEN bytes of UTF-8) to TO as UTF-16.  CD is
+   handed to one_utf8_to_utf16 as its BIGEND argument.  */
+convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
+ struct strbuf *to)
+{
+ return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
+}
+
+static bool /* Append FROM (FLEN bytes of UTF-8) to TO as UTF-32.  CD is
+   handed to one_utf8_to_utf32 as its BIGEND argument.  */
+convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
+ struct strbuf *to)
+{
+ return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
+}
+
+static bool /* Append FROM (FLEN bytes of UTF-16) to TO as UTF-8.  CD is
+   handed to one_utf16_to_utf8 as its BIGEND argument.  */
+convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
+ struct strbuf *to)
+{
+ return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
+}
+
+static bool /* Append FROM (FLEN bytes of UTF-32) to TO as UTF-8.  CD is
+   handed to one_utf32_to_utf8 as its BIGEND argument.  */
+convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
+ struct strbuf *to)
+{
+ return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
+}
+
+/* Identity conversion, used when we have no alternative. */
+static bool
+convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
+                       const uchar *from, size_t flen, struct strbuf *to)
+{
+  /* Length of TO once FROM has been appended.  */
+  size_t needed = to->len + flen;
+
+  /* Grow the buffer to exactly the required size when it is too small.  */
+  if (needed > to->asize)
+    {
+      to->text = xrealloc (to->text, needed);
+      to->asize = needed;
+    }
+
+  /* Copy the bytes across unchanged; this conversion cannot fail.  */
+  memcpy (to->text + to->len, from, flen);
+  to->len = needed;
+  return true;
+}
+
+/* And this one uses the system iconv primitive. It's a little
+ different, since iconv's interface is a little different. */
+
+static bool
+convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
+                     struct strbuf *to)
+{
+  ICONV_CONST char *src;
+  char *dst;
+  size_t src_left, dst_left;
+
+  /* Put the descriptor back into its initial state; a failure here
+     also tells us the descriptor is unusable.  */
+  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
+    return false;
+
+  src = (ICONV_CONST char *)from;
+  src_left = flen;
+  dst = (char *)to->text + to->len;
+  dst_left = to->asize - to->len;
+
+  while (1)
+    {
+      iconv (cd, &src, &src_left, &dst, &dst_left);
+
+      /* All input consumed: record the new length and finish.  */
+      if (__builtin_expect (src_left == 0, 1))
+        {
+          to->len = to->asize - dst_left;
+          return true;
+        }
+
+      /* Only a full output buffer is recoverable.  */
+      if (errno != E2BIG)
+        return false;
+
+      /* Grow the output by one block and retry; the write position is
+         recomputed because xrealloc may move the text.  */
+      to->asize += OUTBUF_BLOCK_SIZE;
+      dst_left += OUTBUF_BLOCK_SIZE;
+      to->text = xrealloc (to->text, to->asize);
+      dst = (char *)to->text + (to->asize - dst_left);
+    }
+}
+
+/* Arrange for the above custom conversion logic to be used automatically
+ when conversion between a suitable pair of character sets is requested. */
+
+/* Call the conversion routine of CONVERTER (a struct cset_converter) on
+   the FLEN bytes at FROM, appending the result to the strbuf TO.
+   CONVERTER is evaluated twice, so it must have no side effects.  Every
+   argument is parenthesized so that any expression may be passed.  */
+#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
+   ((CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO)))
+
+struct conversion /* One entry of conversion_tab: a character set pair
+   and the custom routine that converts between them.  */
+{
+ const char *pair; /* "FROM/TO" names; init_iconv_desc compares with strcasecmp.  */
+ convert_f func; /* Routine that performs the conversion.  */
+ iconv_t fake_cd; /* Copied into cset_converter.cd; not a real iconv descriptor.  */
+};
+static const struct conversion conversion_tab[] = { /* Pairs handled by
+   the custom converters.  The fake descriptor is 0 for the -LE variant
+   and 1 for the -BE variant of the non-UTF-8 member of each pair.  */
+ { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
+ { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
+ { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
+ { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
+ { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
+ { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
+ { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
+ { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
+};
+
+/* Subroutine of cpp_init_iconv: initialize and return a
+ cset_converter structure for conversion from FROM to TO. If
+ iconv_open() fails, issue an error and return an identity
+ converter. Silently return an identity converter if FROM and TO
+ are identical. */
+static struct cset_converter
init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
{
- iconv_t dsc;
+ struct cset_converter ret;
+ char *pair;
+ size_t i;
+
+ if (!strcasecmp (to, from))
+ {
+ ret.func = convert_no_conversion;
+ ret.cd = (iconv_t) -1;
+ return ret;
+ }
+
+ pair = alloca(strlen(to) + strlen(from) + 2);
+
+ strcpy(pair, from);
+ strcat(pair, "/");
+ strcat(pair, to);
+ for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
+ if (!strcasecmp (pair, conversion_tab[i].pair))
+ {
+ ret.func = conversion_tab[i].func;
+ ret.cd = conversion_tab[i].fake_cd;
+ return ret;
+ }
- if (!strcmp (to, from))
- return (iconv_t) -1;
+ /* No custom converter - try iconv. */
+ ret.func = convert_using_iconv;
+ ret.cd = iconv_open (to, from);
- dsc = iconv_open (to, from);
- if (dsc == (iconv_t) -1)
+ if (ret.cd == (iconv_t) -1)
{
if (errno == EINVAL)
cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
@@ -127,8 +648,10 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
from, to);
else
cpp_errno (pfile, DL_ERROR, "iconv_open");
+
+ ret.func = convert_no_conversion;
}
- return dsc;
+ return ret;
}
/* If charset conversion is requested, initialize iconv(3) descriptors
@@ -146,9 +669,9 @@ cpp_init_iconv (cpp_reader *pfile)
bool be = CPP_OPTION (pfile, bytes_big_endian);
if (CPP_OPTION (pfile, wchar_precision) >= 32)
- default_wcset = be ? "UCS-4BE" : "UCS-4LE";
+ default_wcset = be ? "UTF-32BE" : "UTF-32LE";
else if (CPP_OPTION (pfile, wchar_precision) >= 16)
- default_wcset = be ? "UCS-2BE" : "UCS-2LE";
+ default_wcset = be ? "UTF-16BE" : "UTF-16LE";
else
/* This effectively means that wide strings are not supported,
so don't do any conversion at all. */
@@ -181,67 +704,13 @@ _cpp_destroy_iconv (cpp_reader *pfile)
{
if (HAVE_ICONV)
{
- if (pfile->narrow_cset_desc != (iconv_t) -1)
- iconv_close (pfile->narrow_cset_desc);
- if (pfile->wide_cset_desc != (iconv_t) -1)
- iconv_close (pfile->wide_cset_desc);
+ if (pfile->narrow_cset_desc.func == convert_using_iconv)
+ iconv_close (pfile->narrow_cset_desc.cd);
+ if (pfile->wide_cset_desc.func == convert_using_iconv)
+ iconv_close (pfile->wide_cset_desc.cd);
}
}
-/* iconv(3) utility wrapper. Convert the string FROM, of length FLEN,
- according to the iconv descriptor CD. The result is appended to
- the string buffer TO. If DESC is (iconv_t)-1 or iconv is not
- available, the string is simply copied into TO.
-
- Returns true on success, false on error. */
-
-static bool
-convert_cset (iconv_t cd, const uchar *from, size_t flen, struct strbuf *to)
-{
- if (!HAVE_ICONV || cd == (iconv_t)-1)
- {
- if (to->len + flen > to->asize)
- {
- to->asize = to->len + flen;
- to->text = xrealloc (to->text, to->asize);
- }
- memcpy (to->text + to->len, from, flen);
- to->len += flen;
- return true;
- }
- else
- {
- ICONV_CONST char *inbuf;
- char *outbuf;
- size_t inbytesleft, outbytesleft;
-
- /* Reset conversion descriptor and check that it is valid. */
- if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
- return false;
-
- inbuf = (ICONV_CONST char *)from;
- inbytesleft = flen;
- outbuf = (char *)to->text + to->len;
- outbytesleft = to->asize - to->len;
-
- for (;;)
- {
- iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
- if (__builtin_expect (inbytesleft == 0, 1))
- {
- to->len = to->asize - outbytesleft;
- return true;
- }
- if (errno != E2BIG)
- return false;
-
- outbytesleft += OUTBUF_BLOCK_SIZE;
- to->asize += OUTBUF_BLOCK_SIZE;
- to->text = xrealloc (to->text, to->asize);
- outbuf = (char *)to->text + to->asize - outbytesleft;
- }
- }
-}
/* Utility routine that computes a mask of the form 0000...111... with
WIDTH 1-bits. */
@@ -390,15 +859,6 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
"universal character %.*s is not valid at the start of an identifier",
(int) (str - base), base);
}
- /* We don't accept UCNs if iconv is not available or will not
- convert to the target wide character set. */
- else if (!HAVE_ICONV || pfile->wide_cset_desc == (iconv_t) -1)
- {
- /* XXX should be DL_SORRY */
- cpp_error (pfile, DL_ERROR,
- "universal character names are not supported in this configuration");
- }
-
if (result == 0)
result = 1;
@@ -408,58 +868,31 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
it to the execution character set and write the result into TBUF.
- An advanced pointer is returned. Issues all relevant diagnostics.
-
- UTF-8 encoding looks like this:
-
- value range encoded as
- 00000000-0000007F 0xxxxxxx
- 00000080-000007FF 110xxxxx 10xxxxxx
- 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
- 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
- 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
- 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ An advanced pointer is returned. Issues all relevant diagnostics. */
- Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
- which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
- never occur. Note also that any value that can be encoded by a
- given row of the table can also be encoded by all successive rows,
- but this is not done; only the shortest possible encoding for any
- given value is valid. For instance, the character 07C0 could be
- encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
- FC 80 80 80 9F 80. Only the first is valid. */
static const uchar *
convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
struct strbuf *tbuf, bool wide)
{
- int nbytes;
- uchar buf[6], *p = &buf[6];
- static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
cppchar_t ucn;
-
- from++; /* skip u/U */
+ uchar buf[6];
+ uchar *bufp = buf;
+ size_t bytesleft = 6;
+ int rval;
+ struct cset_converter cvt
+ = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
+
+ from++; /* skip u/U */
ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
- if (!ucn)
- return from;
- nbytes = 1;
- if (ucn < 0x80)
- *--p = ucn;
- else
+ rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
+ if (rval)
{
- do
- {
- *--p = ((ucn & 0x3F) | 0x80);
- ucn >>= 6;
- nbytes++;
- }
- while (ucn >= 0x3F || (ucn & masks[nbytes-1]));
- *--p = (ucn | masks[nbytes-1]);
+ errno = rval;
+ cpp_errno (pfile, DL_ERROR, "converting UCN to source character set");
}
-
- if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
- p, nbytes, tbuf))
+ else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
cpp_errno (pfile, DL_ERROR, "converting UCN to execution character set");
return from;
@@ -615,6 +1048,8 @@ convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
#endif
uchar c;
+ struct cset_converter cvt
+ = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
c = *from;
switch (c)
@@ -676,8 +1111,7 @@ convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
}
/* Now convert what we have to the execution character set. */
- if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
- &c, 1, tbuf))
+ if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
cpp_errno (pfile, DL_ERROR,
"converting escape sequence to execution character set");
@@ -697,7 +1131,8 @@ cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
struct strbuf tbuf;
const uchar *p, *base, *limit;
size_t i;
- iconv_t cd = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
+ struct cset_converter cvt
+ = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
tbuf.text = xmalloc (tbuf.asize);
@@ -719,7 +1154,7 @@ cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
{
/* We have a run of normal characters; these can be fed
directly to convert_cset. */
- if (!convert_cset (cd, base, p - base, &tbuf))
+ if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
goto fail;
}
if (p == limit)
@@ -741,6 +1176,25 @@ cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
free (tbuf.text);
return false;
}
+
+/* Subroutine of do_line and do_linemarker. Convert escape sequences
+ in a string, but do not perform character set conversion. */
+bool
+_cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *in,
+                                   cpp_string *out)
+{
+  /* Remember the reader's narrow converter so it can be put back.  */
+  const struct cset_converter saved = pfile->narrow_cset_desc;
+  bool ok;
+
+  /* Temporarily install the identity converter so that escapes are
+     still interpreted but no character set conversion happens.  */
+  pfile->narrow_cset_desc.cd = (iconv_t) -1;
+  pfile->narrow_cset_desc.func = convert_no_conversion;
+
+  ok = cpp_interpret_string (pfile, in, 1, out, false);
+
+  pfile->narrow_cset_desc = saved;
+  return ok;
+}
+
/* Subroutine of cpp_interpret_charconst which performs the conversion
to a number, for narrow strings. STR is the string structure returned
diff --git a/gcc/cpphash.h b/gcc/cpphash.h
index f4a7cfc..0e1bb39 100644
--- a/gcc/cpphash.h
+++ b/gcc/cpphash.h
@@ -35,6 +35,15 @@ typedef int iconv_t; /* dummy */
struct directive; /* Deliberately incomplete. */
struct pending_option;
struct op;
+struct strbuf;
+
+typedef bool (*convert_f) (iconv_t, const unsigned char *, size_t,
+ struct strbuf *); /* Signature of the string conversion routines in
+   cppcharset.c: descriptor, input bytes, input length, output buffer.  */
+struct cset_converter /* A conversion routine paired with the descriptor
+   that it is called with.  */
+{
+ convert_f func; /* Routine that performs the conversion.  */
+ iconv_t cd; /* Passed as FUNC's first argument; either a real iconv
+   descriptor or a fake value, depending on FUNC.  */
+};
#ifndef HAVE_UCHAR
typedef unsigned char uchar;
@@ -369,14 +378,13 @@ struct cpp_reader
unsigned char *macro_buffer;
unsigned int macro_buffer_len;
- /* Iconv descriptor for converting from the source character set
- to the execution character set. (iconv_t)-1 for no conversion. */
- iconv_t narrow_cset_desc;
+ /* Descriptor for converting from the source character set to the
+ execution character set. */
+ struct cset_converter narrow_cset_desc;
- /* Iconv descriptor for converting from the execution character set
- to the wide execution character set. (iconv_t)-1 for no conversion
- other than zero-extending each character to the width of wchar_t. */
- iconv_t wide_cset_desc;
+ /* Descriptor for converting from the source character set to the
+ wide execution character set. */
+ struct cset_converter wide_cset_desc;
/* Tree of other included files. See cppfiles.c. */
struct splay_tree_s *all_include_files;
@@ -555,8 +563,11 @@ extern uchar *_cpp_copy_replacement_text (const cpp_macro *, uchar *);
extern size_t _cpp_replacement_text_len (const cpp_macro *);
/* In cppcharset.c. */
-cppchar_t _cpp_valid_ucn (cpp_reader *, const uchar **, const uchar *, int);
-void _cpp_destroy_iconv (cpp_reader *);
+extern cppchar_t _cpp_valid_ucn (cpp_reader *, const uchar **,
+ const uchar *, int);
+extern void _cpp_destroy_iconv (cpp_reader *);
+extern bool _cpp_interpret_string_notranslate (cpp_reader *, const cpp_string *,
+ cpp_string *);
/* Utility routines and macros. */
#define DSC(str) (const uchar *)str, sizeof str - 1
diff --git a/gcc/cpplib.c b/gcc/cpplib.c
index 2fac44e..1dfef72 100644
--- a/gcc/cpplib.c
+++ b/gcc/cpplib.c
@@ -733,21 +733,6 @@ strtoul_for_line (const uchar *str, unsigned int len, long unsigned int *nump)
return 0;
}
-/* Subroutine of do_line and do_linemarker. Convert escape sequences
- in a string, but do not perform character set conversion. */
-static bool
-interpret_string_notranslate (cpp_reader *pfile, const cpp_string *in,
- cpp_string *out)
-{
- iconv_t save_narrow_cset_desc = pfile->narrow_cset_desc;
- bool retval;
-
- pfile->narrow_cset_desc = (iconv_t) -1;
- retval = cpp_interpret_string (pfile, in, 1, out, false);
- pfile->narrow_cset_desc = save_narrow_cset_desc;
- return retval;
-}
-
/* Interpret #line command.
Note that the filename string (if any) is a true string constant
(escapes are interpreted), unlike in #line. */
@@ -780,7 +765,7 @@ do_line (cpp_reader *pfile)
if (token->type == CPP_STRING)
{
cpp_string s = { 0, 0 };
- if (interpret_string_notranslate (pfile, &token->val.str, &s))
+ if (_cpp_interpret_string_notranslate (pfile, &token->val.str, &s))
new_file = (const char *)s.text;
check_eol (pfile);
}
@@ -829,7 +814,7 @@ do_linemarker (cpp_reader *pfile)
if (token->type == CPP_STRING)
{
cpp_string s = { 0, 0 };
- if (interpret_string_notranslate (pfile, &token->val.str, &s))
+ if (_cpp_interpret_string_notranslate (pfile, &token->val.str, &s))
new_file = (const char *)s.text;
new_sysp = 0;