about | summary | refs | log | tree | commit | diff
path: root/libjava/gnu
diff options
context:
space:
mode:
author: Tom Tromey <tromey@cygnus.com> 2000-08-08 17:35:32 +0000
committer: Tom Tromey <tromey@gcc.gnu.org> 2000-08-08 17:35:32 +0000
commit: 6dd1b06886deb44ccd82a27c173baa82cf7c702a (patch)
tree: 4e56c11527f3021de4681a3fb9c9570949a66080 /libjava/gnu
parent: 97e242b0a71b4d685eca7514da77f8058adbda38 (diff)
download: gcc-6dd1b06886deb44ccd82a27c173baa82cf7c702a.zip
          gcc-6dd1b06886deb44ccd82a27c173baa82cf7c702a.tar.gz
          gcc-6dd1b06886deb44ccd82a27c173baa82cf7c702a.tar.bz2
Input_UTF8.java (read): Fixed handling of surrogate characters.
* gnu/gcj/convert/Input_UTF8.java (read): Fixed handling of
surrogate characters.
* gnu/gcj/convert/Output_UTF8.java (standardUTF8): Default to true.
(write): Correct handling of surrogate characters.

From-SVN: r35569
Diffstat (limited to 'libjava/gnu')
-rw-r--r--  libjava/gnu/gcj/convert/Input_UTF8.java  | 11
-rw-r--r--  libjava/gnu/gcj/convert/Output_UTF8.java | 35
2 files changed, 31 insertions, 15 deletions
diff --git a/libjava/gnu/gcj/convert/Input_UTF8.java b/libjava/gnu/gcj/convert/Input_UTF8.java
index f76f282..433a0d1 100644
--- a/libjava/gnu/gcj/convert/Input_UTF8.java
+++ b/libjava/gnu/gcj/convert/Input_UTF8.java
@@ -1,4 +1,4 @@
-/* Copyright (C) 1999 Free Software Foundation
+/* Copyright (C) 1999, 2000 Free Software Foundation
This file is part of libgcj.
@@ -56,10 +56,11 @@ public class Input_UTF8 extends BytesToUnicode
// partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400.
// The definition lo>=0xDC00 && lo<=0xDFFF implies
// that (lo-0xDC00)>>6 is in the range 0..15.
- // Hence we can infer (partial-0x400)>>4 == (hi-0xDB00)
- // and we can emit the high-surrogate without waiting
- // for the final byte:
- outbuffer[outpos++] = (char) (0xDA00+(partial>>4));
+ // Hence we can solve for `hi' and we can emit
+ // the high-surrogate without waiting for the
+ // final byte:
+ outbuffer[outpos++]
+ = (char) (0xD800 + ((partial - 0x400) >> 4));
// Now we want to set it up so that when we read
// the final byte on the next iteration, we will
diff --git a/libjava/gnu/gcj/convert/Output_UTF8.java b/libjava/gnu/gcj/convert/Output_UTF8.java
index 7fb5910..01f5ce8 100644
--- a/libjava/gnu/gcj/convert/Output_UTF8.java
+++ b/libjava/gnu/gcj/convert/Output_UTF8.java
@@ -1,4 +1,4 @@
-/* Copyright (C) 1999 Free Software Foundation
+/* Copyright (C) 1999, 2000 Free Software Foundation
This file is part of libgcj.
@@ -21,7 +21,7 @@ public class Output_UTF8 extends UnicodeToBytes
/** True if a surrogate pair should be emitted as a single UTF8 sequence.
* Otherwise, a surrogate pair is treated as two separate characters.
* Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
- public boolean standardUTF8;
+ public boolean standardUTF8 = true;
// Saves the previous char if it was a high-surrogate.
char hi_part;
@@ -60,9 +60,27 @@ public class Output_UTF8 extends UnicodeToBytes
while (bytes_todo > 0 && avail > 0);
continue;
}
+
char ch = inbuffer[inpos++];
inlength--;
- if (ch < 128 && (ch != 0 || standardUTF8))
+
+ if ((hi_part != 0 && (ch <= 0xDBFF || ch > 0xDFFF))
+ || (hi_part == 0 && ch >= 0xDC00 && ch <= 0xDFFF))
+ {
+ // If the previous character was a high surrogate, and we
+ // don't now have a low surrogate, we print the high
+ // surrogate as an isolated character. If this character
+ // is a low surrogate and we didn't previously see a high
+ // surrogate, we do the same thing.
+ --inpos;
+ ++inlength;
+ buf[count++] = (byte) (0xE0 | (hi_part >> 12));
+ value = hi_part;
+ hi_part = 0;
+ avail--;
+ bytes_todo = 2;
+ }
+ else if (ch < 128 && (ch != 0 || standardUTF8))
{
avail--;
buf[count++] = (byte) ch;
@@ -78,19 +96,16 @@ public class Output_UTF8 extends UnicodeToBytes
{
if (ch <= 0xDBFF) // High surrogates
{
- // The first byte is (0xF0 | value>>18), where value is the
- // Unicode scalar value of the combine character - which
- // we may not know yet. But from substituting:
- // value == (hi-0xD800)*0x400+(lo-0xDC00)+0x10000,
- // hi==ch, and cancelling we get:
- buf[count++] = (byte) (0xF0 | ((ch-0xD800) >> 8));
- avail--;
+ // Just save the high surrogate until the next
+ // character comes along.
hi_part = ch;
}
else // Low surrogates
{
value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
+ buf[count++] = (byte) (0xF0 | (value >> 18));
bytes_todo = 3;
+ hi_part = 0;
}
}
else