about summary refs log tree commit diff
diff options
context:
space:
mode:
author Tom Tromey <tromey@cygnus.com> 2000-10-26 00:01:46 +0000
committer Tom Tromey <tromey@gcc.gnu.org> 2000-10-26 00:01:46 +0000
commit 07b5e470a6e5a28f5e887d65c98174318b940812 (patch)
tree 76ca02ecce99066ffcf0c9dd82a4c07d5eff2edc
parent 081b49f1443d4bf4f1ab3718db439719c9f33e8d (diff)
download gcc-07b5e470a6e5a28f5e887d65c98174318b940812.zip
gcc-07b5e470a6e5a28f5e887d65c98174318b940812.tar.gz
gcc-07b5e470a6e5a28f5e887d65c98174318b940812.tar.bz2
lex.c (java_new_lexer): Initialize new fields.
* lex.c (java_new_lexer): Initialize new fields. Work around broken iconv() implementations.
(java_read_char): Swap bytes if required. Use fallback decoder if required.
(byteswap_init, need_byteswap): New globals.
(java_destroy_lexer): Only close iconv handle if it is in use.
* lex.h (java_lexer): New fields read_anything, byte_swap, use_fallback. Made out_buffer unsigned.

From-SVN: r37063
-rw-r--r-- gcc/java/ChangeLog 12
-rw-r--r-- gcc/java/lex.c 355
-rw-r--r-- gcc/java/lex.h 12
3 files changed, 244 insertions, 135 deletions
diff --git a/gcc/java/ChangeLog b/gcc/java/ChangeLog
index 14069ab..53408cf 100644
--- a/gcc/java/ChangeLog
+++ b/gcc/java/ChangeLog
@@ -1,3 +1,15 @@
+2000-10-24 Tom Tromey <tromey@cygnus.com>
+
+ * lex.c (java_new_lexer): Initialize new fields. Work around
+ broken iconv() implementations.
+ (java_read_char): Swap bytes if required. Use fallback decoder if
+ required.
+ (byteswap_init, need_byteswap): New globals.
+ (java_destroy_lexer): Only close iconv handle if it is in use.
+ * lex.h (java_lexer): New fields read_anything, byte_swap,
+ use_fallback.
+ Made out_buffer unsigned.
+
2000-10-24 Alexandre Petit-Bianco <apbianco@cygnus.com>
* parse.y (register_incomplete_type): Include JDEP_FIELD as a case
diff --git a/gcc/java/lex.c b/gcc/java/lex.c
index 329d628..b26499b 100644
--- a/gcc/java/lex.c
+++ b/gcc/java/lex.c
@@ -59,6 +59,15 @@ static void java_unget_unicode PARAMS ((void));
static unicode_t java_sneak_unicode PARAMS ((void));
java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
+/* This is nonzero if we have initialized `need_byteswap'. */
+static int byteswap_init = 0;
+
+/* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
+ big-endian order -- not native endian order. We handle this by
+ doing a conversion once at startup and seeing what happens. This
+ flag holds the results of this determination. */
+static int need_byteswap = 0;
+
void
java_init_lex (finput, encoding)
FILE *finput;
@@ -208,19 +217,66 @@ java_new_lexer (finput, encoding)
#ifdef HAVE_ICONV
lex->handle = iconv_open ("UCS-2", encoding);
- if (lex->handle == (iconv_t) -1)
+ if (lex->handle != (iconv_t) -1)
{
- /* FIXME: we should give a nice error based on errno here. */
- enc_error = 1;
+ lex->first = -1;
+ lex->last = -1;
+ lex->out_first = -1;
+ lex->out_last = -1;
+ lex->read_anything = 0;
+ lex->use_fallback = 0;
+
+ /* Work around broken iconv() implementations by doing checking at
+ runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
+ then all UCS-2 encoders will be broken. Perhaps not a valid
+ assumption. */
+ if (! byteswap_init)
+ {
+ iconv_t handle;
+
+ byteswap_init = 1;
+
+ handle = iconv_open ("UCS-2", "UTF-8");
+ if (handle != (iconv_t) -1)
+ {
+ unicode_t result;
+ unsigned char in[3];
+ char *inp, *outp;
+ size_t inc, outc, r;
+
+ /* This is the UTF-8 encoding of \ufeff. */
+ in[0] = 0xef;
+ in[1] = 0xbb;
+ in[2] = 0xbf;
+
+ inp = in;
+ inc = 3;
+ outp = (char *) &result;
+ outc = 2;
+
+ r = iconv (handle, (const char **) &inp, &inc, &outp, &outc);
+ /* Conversion must be complete for us to use the result. */
+ if (r != (size_t) -1 && inc == 0 && outc == 0)
+ need_byteswap = (result != 0xfeff);
+ }
+ }
+
+ lex->byte_swap = need_byteswap;
}
- lex->first = -1;
- lex->last = -1;
- lex->out_first = -1;
- lex->out_last = -1;
-#else /* HAVE_ICONV */
- if (strcmp (encoding, DEFAULT_ENCODING))
- enc_error = 1;
+ else
#endif /* HAVE_ICONV */
+ {
+ /* If iconv failed, use the internal decoder if the default
+ encoding was requested. This code is used on platforms where
+ iconv() exists but is insufficient for our needs. For
+ instance, on Solaris 2.5 iconv() cannot handle UTF-8 or UCS-2. */
+ if (strcmp (encoding, DEFAULT_ENCODING))
+ enc_error = 1;
+#ifdef HAVE_ICONV
+ else
+ lex->use_fallback = 1;
+#endif /* HAVE_ICONV */
+ }
if (enc_error)
fatal ("unknown encoding: `%s'", encoding);
@@ -233,7 +289,8 @@ java_destroy_lexer (lex)
java_lexer *lex;
{
#ifdef HAVE_ICONV
- iconv_close (lex->handle);
+ if (! lex->use_fallback)
+ iconv_close (lex->handle);
#endif
free (lex);
}
@@ -250,140 +307,170 @@ java_read_char (lex)
}
#ifdef HAVE_ICONV
- {
- size_t ir, inbytesleft, in_save, out_count, out_save;
- char *inp, *outp;
- unicode_t result;
+ if (! lex->use_fallback)
+ {
+ size_t ir, inbytesleft, in_save, out_count, out_save;
+ char *inp, *outp;
+ unicode_t result;
- /* If there is data which has already been converted, use it. */
- if (lex->out_first == -1 || lex->out_first >= lex->out_last)
- {
- lex->out_first = 0;
- lex->out_last = 0;
+ /* If there is data which has already been converted, use it. */
+ if (lex->out_first == -1 || lex->out_first >= lex->out_last)
+ {
+ lex->out_first = 0;
+ lex->out_last = 0;
- while (1)
- {
- /* See if we need to read more data. If FIRST == 0 then
- the previous conversion attempt ended in the middle of
- a character at the end of the buffer. Otherwise we
- only have to read if the buffer is empty. */
- if (lex->first == 0 || lex->first >= lex->last)
- {
- int r;
-
- if (lex->first >= lex->last)
- {
- lex->first = 0;
- lex->last = 0;
- }
- if (feof (lex->finput))
+ while (1)
+ {
+ /* See if we need to read more data. If FIRST == 0 then
+ the previous conversion attempt ended in the middle of
+ a character at the end of the buffer. Otherwise we
+ only have to read if the buffer is empty. */
+ if (lex->first == 0 || lex->first >= lex->last)
+ {
+ int r;
+
+ if (lex->first >= lex->last)
+ {
+ lex->first = 0;
+ lex->last = 0;
+ }
+ if (feof (lex->finput))
+ return UEOF;
+ r = fread (&lex->buffer[lex->last], 1,
+ sizeof (lex->buffer) - lex->last,
+ lex->finput);
+ lex->last += r;
+ }
+
+ inbytesleft = lex->last - lex->first;
+ out_count = sizeof (lex->out_buffer) - lex->out_last;
+
+ if (inbytesleft == 0)
+ {
+ /* We've tried to read and there is nothing left. */
return UEOF;
- r = fread (&lex->buffer[lex->last], 1,
- sizeof (lex->buffer) - lex->last,
- lex->finput);
- lex->last += r;
- }
+ }
- inbytesleft = lex->last - lex->first;
- out_count = sizeof (lex->out_buffer) - lex->out_last;
+ in_save = inbytesleft;
+ out_save = out_count;
+ inp = &lex->buffer[lex->first];
+ outp = &lex->out_buffer[lex->out_last];
+ ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
+ &outp, &out_count);
- if (inbytesleft == 0)
- {
- /* We've tried to read and there is nothing left. */
- return UEOF;
- }
+ /* If we haven't read any bytes, then look to see if we
+ have read a BOM. */
+ if (! lex->read_anything && out_save - out_count >= 2)
+ {
+ unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
+ if (uc == 0xfeff)
+ {
+ lex->byte_swap = 0;
+ lex->out_first += 2;
+ }
+ else if (uc == 0xfffe)
+ {
+ lex->byte_swap = 1;
+ lex->out_first += 2;
+ }
+ lex->read_anything = 1;
+ }
- in_save = inbytesleft;
- out_save = out_count;
- inp = &lex->buffer[lex->first];
- outp = &lex->out_buffer[lex->out_last];
- ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
- &outp, &out_count);
- lex->first += in_save - inbytesleft;
- lex->out_last += out_save - out_count;
-
- /* If we converted anything at all, move along. */
- if (out_count != out_save)
- break;
+ if (lex->byte_swap)
+ {
+ unsigned int i;
+ for (i = 0; i < out_save - out_count; i += 2)
+ {
+ char t = lex->out_buffer[lex->out_last + i];
+ lex->out_buffer[lex->out_last + i]
+ = lex->out_buffer[lex->out_last + i + 1];
+ lex->out_buffer[lex->out_last + i + 1] = t;
+ }
+ }
- if (ir == (size_t) -1)
- {
- if (errno == EINVAL)
- {
- /* This is ok. This means that the end of our buffer
- is in the middle of a character sequence. We just
- move the valid part of the buffer to the beginning
- to force a read. */
- /* We use bcopy() because it should work for
- overlapping strings. Use memmove() instead... */
- bcopy (&lex->buffer[lex->first], &lex->buffer[0],
- lex->last - lex->first);
- lex->last -= lex->first;
- lex->first = 0;
- }
- else
- {
- /* A more serious error. */
- java_lex_error ("unrecognized character in input stream",
- 0);
- return UEOF;
- }
- }
- }
- }
+ lex->first += in_save - inbytesleft;
+ lex->out_last += out_save - out_count;
- if (lex->out_first == -1 || lex->out_first >= lex->out_last)
- {
- /* Don't have any data. */
- return UEOF;
- }
+ /* If we converted anything at all, move along. */
+ if (out_count != out_save)
+ break;
- /* Success. We assume that UCS-2 is big-endian. This appears to
- be an ok assumption. */
- result = ((((unsigned char) lex->out_buffer[lex->out_first]) << 8)
- | (unsigned char) lex->out_buffer[lex->out_first + 1]);
- lex->out_first += 2;
- return result;
- }
-#else /* HAVE_ICONV */
- {
- int c, c1, c2;
- c = getc (lex->finput);
-
- if (c < 128)
- return (unicode_t)c;
- if (c == EOF)
- return UEOF;
- else
- {
- if ((c & 0xe0) == 0xc0)
- {
- c1 = getc (lex->finput);
- if ((c1 & 0xc0) == 0x80)
- return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
- c = c1;
- }
- else if ((c & 0xf0) == 0xe0)
- {
- c1 = getc (lex->finput);
- if ((c1 & 0xc0) == 0x80)
- {
- c2 = getc (lex->finput);
- if ((c2 & 0xc0) == 0x80)
- return (unicode_t)(((c & 0xf) << 12) +
- (( c1 & 0x3f) << 6) + (c2 & 0x3f));
- else
- c = c2;
- }
- else
- c = c1;
- }
+ if (ir == (size_t) -1)
+ {
+ if (errno == EINVAL)
+ {
+ /* This is ok. This means that the end of our buffer
+ is in the middle of a character sequence. We just
+ move the valid part of the buffer to the beginning
+ to force a read. */
+ /* We use bcopy() because it should work for
+ overlapping strings. Use memmove() instead... */
+ bcopy (&lex->buffer[lex->first], &lex->buffer[0],
+ lex->last - lex->first);
+ lex->last -= lex->first;
+ lex->first = 0;
+ }
+ else
+ {
+ /* A more serious error. */
+ java_lex_error ("unrecognized character in input stream",
+ 0);
+ return UEOF;
+ }
+ }
+ }
+ }
- /* We simply don't support invalid characters. */
- java_lex_error ("malformed UTF-8 character", 0);
- }
- }
+ if (lex->out_first == -1 || lex->out_first >= lex->out_last)
+ {
+ /* Don't have any data. */
+ return UEOF;
+ }
+
+ /* Success. */
+ result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
+ lex->out_first += 2;
+ return result;
+ }
+ else
#endif /* HAVE_ICONV */
+ {
+ int c, c1, c2;
+ c = getc (lex->finput);
+
+ if (c < 128)
+ return (unicode_t)c;
+ if (c == EOF)
+ return UEOF;
+ else
+ {
+ if ((c & 0xe0) == 0xc0)
+ {
+ c1 = getc (lex->finput);
+ if ((c1 & 0xc0) == 0x80)
+ return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
+ c = c1;
+ }
+ else if ((c & 0xf0) == 0xe0)
+ {
+ c1 = getc (lex->finput);
+ if ((c1 & 0xc0) == 0x80)
+ {
+ c2 = getc (lex->finput);
+ if ((c2 & 0xc0) == 0x80)
+ return (unicode_t)(((c & 0xf) << 12) +
+ (( c1 & 0x3f) << 6) + (c2 & 0x3f));
+ else
+ c = c2;
+ }
+ else
+ c = c1;
+ }
+
+ /* We simply don't support invalid characters. */
+ java_lex_error ("malformed UTF-8 character", 0);
+ }
+ }
/* We only get here on error. */
return UEOF;
diff --git a/gcc/java/lex.h b/gcc/java/lex.h
index 71a030d..ae9eebb 100644
--- a/gcc/java/lex.h
+++ b/gcc/java/lex.h
@@ -115,6 +115,16 @@ typedef struct java_lexer
unicode_t unget_value;
#ifdef HAVE_ICONV
+ /* Nonzero if we've read any bytes. We only recognize the
+ byte-order-marker (BOM) as the first word. */
+ int read_anything : 1;
+
+ /* Nonzero if we have to byte swap. */
+ int byte_swap : 1;
+
+ /* Nonzero if we're using the fallback decoder. */
+ int use_fallback : 1;
+
/* The handle for the iconv converter we're using. */
iconv_t handle;
@@ -132,7 +142,7 @@ typedef struct java_lexer
/* This is a buffer of characters already converted by iconv. We
use `char' here because we're assuming that iconv() converts to
big-endian UCS-2, and then we convert it ourselves. */
- char out_buffer[1024];
+ unsigned char out_buffer[1024];
/* Index of first valid output character. -1 if no valid
characters. */