aboutsummaryrefslogtreecommitdiff
path: root/libjava/gnu/gcj/convert/Input_UTF8.java
diff options
context:
space:
mode:
author Tom Tromey <tromey@gcc.gnu.org> 1999-04-07 14:42:40 +0000
committer Tom Tromey <tromey@gcc.gnu.org> 1999-04-07 14:42:40 +0000
commit ee9dd3721be68b9fa63dea9aa5a1d86e66958cde (patch)
tree d96801a16fdf03a5682ef98730fe333a46eef944 /libjava/gnu/gcj/convert/Input_UTF8.java
parent 140fa895c6b859f827fc4437b91775a82cd105fb (diff)
download gcc-ee9dd3721be68b9fa63dea9aa5a1d86e66958cde.zip
gcc-ee9dd3721be68b9fa63dea9aa5a1d86e66958cde.tar.gz
gcc-ee9dd3721be68b9fa63dea9aa5a1d86e66958cde.tar.bz2
Initial revision
From-SVN: r26263
Diffstat (limited to 'libjava/gnu/gcj/convert/Input_UTF8.java')
-rw-r--r--libjava/gnu/gcj/convert/Input_UTF8.java107
1 files changed, 107 insertions, 0 deletions
diff --git a/libjava/gnu/gcj/convert/Input_UTF8.java b/libjava/gnu/gcj/convert/Input_UTF8.java
new file mode 100644
index 0000000..c706a52
--- /dev/null
+++ b/libjava/gnu/gcj/convert/Input_UTF8.java
@@ -0,0 +1,107 @@
+/* Copyright (C) 1999 Cygnus Solutions
+
+ This file is part of libgcj.
+
+This software is copyrighted work licensed under the terms of the
+Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
+details. */
+
+package gnu.gcj.convert;
+
+/** Incremental UTF-8 decoder emitting UTF-16 chars; state persists between read() calls. */
+public class Input_UTF8 extends BytesToUnicode
+{
+  public String getName() { return "UTF8"; }
+
+  /** Payload bits gathered so far for the multi-byte sequence in progress. */
+  int partial = 0;
+  /** Continuation bytes still needed to complete the sequence in progress. */
+  int partial_bytes_expected = 0;
+
+  /**
+   * Decodes bytes read from inbuffer[inpos++] (while inpos is below inlength)
+   * into outbuffer, starting at outpos and stopping once outpos reaches
+   * outlength.  Each input byte stores at most one char, so one space check
+   * per iteration is enough.  Code points above 0xFFFF become a surrogate pair.
+   * NOTE(review): inbuffer/inpos/inlength are not declared here; presumably
+   * inherited from BytesToUnicode with inbuffer a signed byte[] -- confirm.
+   *
+   * @return the number of chars stored into outbuffer by this call
+   */
+  public int read (char[] outbuffer, int outpos, int outlength)
+  {
+    int origpos = outpos;
+    for (;;)
+      {
+        if (outpos >= outlength)
+          break;
+        if (inpos >= inlength)
+          break;
+        int b = inbuffer[inpos++];
+        if (b >= 0) // high bit clear: plain ASCII
+          outbuffer[outpos++] = (char) b;
+        else
+          {
+            if ((b & 0xC0) == 0x80) // Continuation byte
+              {
+                partial = (partial << 6) | (b & 0x3F);
+                --partial_bytes_expected;
+                if (partial_bytes_expected == 1)
+                  {
+                    if (partial > (0xFFFF>>6))
+                      {
+                        // The next continuation byte will cause the result
+                        // to exceed 0xFFFF, so we must use a surrogate pair.
+                        // The "Unicode scalar value" (see D28 in section 3.7
+                        // of the Unicode Standard 2.0) is defined as:
+                        // value == (hi-0xD800)*0x400+(lo-0xDC00)+0x10000,
+                        // where (hi, lo) is the Unicode surrogate pair.
+                        // After three bytes partial == (value >> 6), so:
+                        // partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400.
+                        // As (lo-0xDC00)>>6 is in the range 0..15, we get
+                        // (partial-0x400)>>4 == (hi-0xD800), that is
+                        // hi == 0xD7C0+(partial>>4), which can be emitted
+                        // without waiting for the final byte:
+                        outbuffer[outpos++] = (char) (0xD7C0+(partial>>4));
+
+                        // Now set things up so that the final byte, read on
+                        // the next iteration, produces the low surrogate
+                        // without special handling.  I.e. we want:
+                        // lo == (next_partial << 6) | (next & 0x3F)
+                        // where next is the next input byte; this implies
+                        // next_partial == lo >> 6 == ((lo-0xDC00)>>6) + 0x370.
+                        // The formula for partial above shows that
+                        // (lo-0xDC00)>>6 == (partial & 0xF).  Hence:
+                        partial = (partial & 0xF) + 0x370;
+                      }
+                  }
+                else if (partial_bytes_expected == 0)
+                  {
+                    outbuffer[outpos++] = (char) partial;
+                    partial = 0;
+                    partial_bytes_expected = 0;
+                  }
+              }
+            else // prefix byte
+              {
+                if ((b & 0xE0) == 0xC0) // 110xxxxx: two-byte sequence
+                  {
+                    partial = b & 0x1F;
+                    partial_bytes_expected = 1;
+                  }
+                else if ((b & 0xF0) == 0xE0) // 1110xxxx: three-byte sequence
+                  {
+                    partial = b & 0xF;
+                    partial_bytes_expected = 2;
+                  }
+                else // anything else is taken as a four-byte lead (11110xxx)
+                  {
+                    partial = b & 7;
+                    partial_bytes_expected = 3;
+                  }
+              }
+          }
+      }
+    return outpos - origpos;
+  }
+}