From f92351d76e07aa24c84fb6f3ce6b7cd97592c9f5 Mon Sep 17 00:00:00 2001 From: Tom Tromey Date: Wed, 1 Nov 2000 17:00:02 +0000 Subject: encodings.pl: Added `ASCII' alias. * scripts/encodings.pl: Added `ASCII' alias. * Makefile.in: Rebuilt. * Makefile.am (convert_source_files): Added new files. * gnu/gcj/convert/Input_ASCII.java: New file. * gnu/gcj/convert/Output_ASCII.java: New file. * gnu/gcj/convert/Output_8859_1.java (write): Use `?' to represent out-of-range characters. * gnu/gcj/convert/natIconv.cc (iconv_init): New method. (read): Swap bytes if required. Treat `count' as character count, not byte count. (write): Likewise. Also, handle case where iconv fails on a given character. (init): Put encoding into exception. * gnu/gcj/convert/IOConverter.java (iconv_byte_swap): New global. (static): Call iconv_init. Rebuilt alias list. (iconv_init): New private method. From-SVN: r37190 --- libjava/gnu/gcj/convert/IOConverter.java | 19 +++++ libjava/gnu/gcj/convert/Input_8859_1.java | 4 +- libjava/gnu/gcj/convert/Input_ASCII.java | 37 ++++++++++ libjava/gnu/gcj/convert/Output_8859_1.java | 12 +-- libjava/gnu/gcj/convert/Output_ASCII.java | 54 ++++++++++++++ libjava/gnu/gcj/convert/natIconv.cc | 115 +++++++++++++++++++++++++---- 6 files changed, 220 insertions(+), 21 deletions(-) create mode 100644 libjava/gnu/gcj/convert/Input_ASCII.java create mode 100644 libjava/gnu/gcj/convert/Output_ASCII.java (limited to 'libjava/gnu') diff --git a/libjava/gnu/gcj/convert/IOConverter.java b/libjava/gnu/gcj/convert/IOConverter.java index c986624..9b5fbad 100644 --- a/libjava/gnu/gcj/convert/IOConverter.java +++ b/libjava/gnu/gcj/convert/IOConverter.java @@ -18,6 +18,10 @@ public abstract class IOConverter // Map encoding aliases to our canonical form. static private Hashtable hash = new Hashtable (); + // True if we have to do byte-order conversions on iconv() + // arguments. + static protected boolean iconv_byte_swap; + static { // Manually maintained aliases. 
Note that the value must be our @@ -25,6 +29,17 @@ public abstract class IOConverter hash.put ("ISO-Latin-1", "8859_1"); // All aliases after this point are automatically generated by the // `encodings.pl' script. Run it to make any corrections. + hash.put ("ANSI_X3.4-1968", "ASCII"); + hash.put ("iso-ir-6", "ASCII"); + hash.put ("ANSI_X3.4-1986", "ASCII"); + hash.put ("ISO_646.irv:1991", "ASCII"); + hash.put ("ASCII", "ASCII"); + hash.put ("ISO646-US", "ASCII"); + hash.put ("US-ASCII", "ASCII"); + hash.put ("us", "ASCII"); + hash.put ("IBM367", "ASCII"); + hash.put ("cp367", "ASCII"); + hash.put ("csASCII", "ASCII"); hash.put ("ISO_8859-1:1987", "8859_1"); hash.put ("iso-ir-100", "8859_1"); hash.put ("ISO_8859-1", "8859_1"); @@ -41,8 +56,12 @@ public abstract class IOConverter hash.put ("Extended_UNIX_Code_Packed_Format_for_Japanese", "EUCJIS"); hash.put ("csEUCPkdFmtJapanese", "EUCJIS"); hash.put ("EUC-JP", "EUCJIS"); + + iconv_byte_swap = iconv_init (); } + private static native boolean iconv_init (); + // Turn an alias into the canonical form. protected static final String canonicalize (String name) { diff --git a/libjava/gnu/gcj/convert/Input_8859_1.java b/libjava/gnu/gcj/convert/Input_8859_1.java index 6c70034..bd5f779 100644 --- a/libjava/gnu/gcj/convert/Input_8859_1.java +++ b/libjava/gnu/gcj/convert/Input_8859_1.java @@ -1,4 +1,4 @@ -/* Copyright (C) 1999 Free Software Foundation +/* Copyright (C) 1999, 2000 Free Software Foundation This file is part of libgcj. @@ -9,7 +9,7 @@ details. */ package gnu.gcj.convert; /** - * Convert ISO-Latin-1 (8851-1) text to Unicode. + * Convert ISO-Latin-1 (8859-1) text to Unicode. * @author Per Bothner * @date March 1999. */ diff --git a/libjava/gnu/gcj/convert/Input_ASCII.java b/libjava/gnu/gcj/convert/Input_ASCII.java new file mode 100644 index 0000000..cb531e9 --- /dev/null +++ b/libjava/gnu/gcj/convert/Input_ASCII.java @@ -0,0 +1,37 @@ +/* Copyright (C) 2000 Free Software Foundation + + This file is part of libgcj. 
+ +This software is copyrighted work licensed under the terms of the +Libgcj License. Please consult the file "LIBGCJ_LICENSE" for +details. */ + +package gnu.gcj.convert; + +/** + * Convert ASCII text to Unicode. + * @date October 2000 + */ + +public class Input_ASCII extends BytesToUnicode +{ + public String getName() { return "ASCII"; } + + public int read (char[] outbuffer, int outpos, int count) + { + int origpos = outpos; + // Make sure fields of this are in registers. + int inpos = this.inpos; + byte[] inbuffer = this.inbuffer; + int inavail = this.inlength - inpos; + int outavail = count; + if (outavail > inavail) + outavail = inavail; + while (--outavail >= 0) + { + outbuffer[outpos++] = (char) (inbuffer[inpos++] & 0x7f); + } + this.inpos = inpos; + return outpos - origpos; + } +} diff --git a/libjava/gnu/gcj/convert/Output_8859_1.java b/libjava/gnu/gcj/convert/Output_8859_1.java index ac04ad6..7ae6a61 100644 --- a/libjava/gnu/gcj/convert/Output_8859_1.java +++ b/libjava/gnu/gcj/convert/Output_8859_1.java @@ -1,4 +1,4 @@ -/* Copyright (C) 1999 Free Software Foundation +/* Copyright (C) 1999, 2000 Free Software Foundation This file is part of libgcj. @@ -10,9 +10,9 @@ package gnu.gcj.convert; /** * Convert Unicode ISO-Latin-1 (8851-1) text. - * The high-order byte of each character is truncated. + * Unrecognized characters are printed as `?'. * @author Per Bothner - * @date Match 1999. + * @date March 1999. */ public class Output_8859_1 extends UnicodeToBytes @@ -30,7 +30,8 @@ public class Output_8859_1 extends UnicodeToBytes inlength = avail; for (int i = inlength; --i >= 0; ) { - buf[count++] = (byte) inbuffer[inpos++]; + char c = inbuffer[inpos++]; + buf[count++] = (byte) ((c > 0xff) ? '?' 
: c); } this.count = count; return inlength; @@ -45,7 +46,8 @@ public class Output_8859_1 extends UnicodeToBytes inlength = avail; for (int i = inlength; --i >= 0; ) { - buf[count++] = (byte) str.charAt(inpos++); + char c = str.charAt(inpos++); + buf[count++] = (byte) ((c > 0xff) ? '?' : c); } this.count = count; return inlength; diff --git a/libjava/gnu/gcj/convert/Output_ASCII.java b/libjava/gnu/gcj/convert/Output_ASCII.java new file mode 100644 index 0000000..9f33645 --- /dev/null +++ b/libjava/gnu/gcj/convert/Output_ASCII.java @@ -0,0 +1,54 @@ +/* Copyright (C) 2000 Free Software Foundation + + This file is part of libgcj. + +This software is copyrighted work licensed under the terms of the +Libgcj License. Please consult the file "LIBGCJ_LICENSE" for +details. */ + +package gnu.gcj.convert; + +/** + * Convert Unicode to ASCII. + * Unrecognized characters are printed as `?'. + * @date October 2000 + */ + +public class Output_ASCII extends UnicodeToBytes +{ + public String getName() { return "ASCII"; } + + /** + * @return number of chars converted. */ + public int write (char[] inbuffer, int inpos, int inlength) + { + int count = this.count; + byte[] buf = this.buf; + int avail = buf.length - count; + if (inlength > avail) + inlength = avail; + for (int i = inlength; --i >= 0; ) + { + char c = inbuffer[inpos++]; + buf[count++] = (byte) ((c > 0x7f) ? '?' : c); + } + this.count = count; + return inlength; + } + + public int write (String str, int inpos, int inlength, char[] work) + { + int count = this.count; + byte[] buf = this.buf; + int avail = buf.length - count; + if (inlength > avail) + inlength = avail; + for (int i = inlength; --i >= 0; ) + { + char c = str.charAt(inpos++); + buf[count++] = (byte) ((c > 0x7f) ? '?'
: c); + } + this.count = count; + return inlength; + } +} diff --git a/libjava/gnu/gcj/convert/natIconv.cc b/libjava/gnu/gcj/convert/natIconv.cc index 061779c..d346b14 100644 --- a/libjava/gnu/gcj/convert/natIconv.cc +++ b/libjava/gnu/gcj/convert/natIconv.cc @@ -44,13 +44,13 @@ gnu::gcj::convert::Input_iconv::init (jstring encoding) iconv_t h = iconv_open ("UCS-2", buffer); if (h == (iconv_t) -1) - JvThrow (new java::io::UnsupportedEncodingException); + throw new java::io::UnsupportedEncodingException (encoding); JvAssert (h != NULL); handle = reinterpret_cast (h); #else /* HAVE_ICONV */ // If no iconv, just throw an exception. - JvThrow (new java::io::UnsupportedEncodingException); + throw new java::io::UnsupportedEncodingException (encoding); #endif /* HAVE_ICONV */ } @@ -75,7 +75,7 @@ gnu::gcj::convert::Input_iconv::read (jcharArray outbuffer, jchar *out = elements (outbuffer); size_t inavail = inlength - inpos; size_t old_in = inavail; - size_t outavail = count; + size_t outavail = count * sizeof (jchar); size_t old_out = outavail; char *inbuf = (char *) &bytes[inpos]; @@ -86,8 +86,20 @@ gnu::gcj::convert::Input_iconv::read (jcharArray outbuffer, &outbuf, &outavail); // FIXME: what if R==-1? + if (iconv_byte_swap) + { + size_t max = (old_out - outavail) / sizeof (jchar); + for (size_t i = 0; i < max; ++i) + { + // Byte swap. 
+ jchar c = (((out[outpos + i] & 0xff) << 8) + | ((out[outpos + i] >> 8) & 0xff)); + outbuf[i] = c; + } + } + inpos += old_in - inavail; - return old_out - outavail; + return (old_out - outavail) / sizeof (jchar); #else /* HAVE_ICONV */ return -1; #endif /* HAVE_ICONV */ @@ -104,13 +116,13 @@ gnu::gcj::convert::Output_iconv::init (jstring encoding) iconv_t h = iconv_open (buffer, "UCS-2"); if (h == (iconv_t) -1) - JvThrow (new java::io::UnsupportedEncodingException); + throw new java::io::UnsupportedEncodingException (encoding); JvAssert (h != NULL); handle = reinterpret_cast (h); #else /* HAVE_ICONV */ // If no iconv, just throw an exception. - JvThrow (new java::io::UnsupportedEncodingException); + throw new java::io::UnsupportedEncodingException (encoding); #endif /* HAVE_ICONV */ } @@ -128,14 +140,15 @@ gnu::gcj::convert::Output_iconv::finalize (void) jint gnu::gcj::convert::Output_iconv::write (jcharArray inbuffer, - jint inpos, jint count) + jint inpos, jint inlength) { #ifdef HAVE_ICONV jchar *chars = elements (inbuffer); jbyte *out = elements (buf); + jchar *temp_buffer = NULL; - size_t inavail = count; - size_t old_in = count; + size_t inavail = inlength * sizeof (jchar); + size_t old_in = inavail; size_t outavail = buf->length - count; size_t old_out = outavail; @@ -143,14 +156,88 @@ gnu::gcj::convert::Output_iconv::write (jcharArray inbuffer, char *inbuf = (char *) &chars[inpos]; char *outbuf = (char *) &out[count]; - size_t r = iconv_adapter (iconv, (iconv_t) handle, - &inbuf, &inavail, - &outbuf, &outavail); - // FIXME: what if R==-1? + if (iconv_byte_swap) + { + // Ugly performance penalty -- don't use losing systems! + temp_buffer = (jchar *) _Jv_Malloc (inlength * sizeof (jchar)); + for (int i = 0; i < inlength; ++i) + { + // Byte swap. 
+ jchar c = (((chars[inpos + i] & 0xff) << 8) + | ((chars[inpos + i] >> 8) & 0xff)); + temp_buffer[i] = c; + } + inbuf = (char *) temp_buffer; + } + + // If the conversion fails on the very first character, then we + // assume that the character can't be represented in the output + // encoding. There's nothing useful we can do here, so we simply + // omit that character. Note that we can't check `errno' because + // glibc 2.1.3 doesn't set it correctly. We could check it if we + // really needed to, but we'd have to disable support for 2.1.3. + size_t loop_old_in = old_in; + while (1) + { + size_t r = iconv_adapter (iconv, (iconv_t) handle, + &inbuf, &inavail, + &outbuf, &outavail); + if (r == -1 && inavail == loop_old_in) + { + inavail -= 2; + if (inavail == 0) + break; + loop_old_in -= 2; + inbuf += 2; + } + else + break; + } + + if (temp_buffer != NULL) + _Jv_Free (temp_buffer); count += old_out - outavail; - return old_in - inavail; + return (old_in - inavail) / sizeof (jchar); #else /* HAVE_ICONV */ return -1; #endif /* HAVE_ICONV */ } + +jboolean +gnu::gcj::convert::IOConverter::iconv_init (void) +{ + // Some versions of iconv() always return their UCS-2 results in + // big-endian order, and they also require UCS-2 inputs to be in + // big-endian order. For instance, glibc 2.1.3 does this. If the + // UTF-8=>UCS-2 iconv converter has this feature, then we assume + // that all UCS-2 converters do. (This might not be the best + // heuristic, but it is all we've got.) + jboolean result = false; +#ifdef HAVE_ICONV + iconv_t handle = iconv_open ("UCS-2", "UTF-8"); + if (handle != (iconv_t) -1) + { + jchar c; + unsigned char in[3]; + char *inp, *outp; + size_t inc, outc, r; + + // This is the UTF-8 encoding of \ufeff. + in[0] = 0xef; + in[1] = 0xbb; + in[2] = 0xbf; + + inp = (char *) in; + inc = 3; + outp = (char *) &c; + outc = 2; + + r = iconv_adapter (iconv, handle, &inp, &inc, &outp, &outc); + // Conversion must be complete for us to use the result.
+ if (r != (size_t) -1 && inc == 0 && outc == 0) + result = (c != 0xfeff); + } +#endif /* HAVE_ICONV */ + return result; +} -- cgit v1.1