diff options
author | Tom Tromey <tromey@cygnus.com> | 2000-08-08 17:35:32 +0000 |
---|---|---|
committer | Tom Tromey <tromey@gcc.gnu.org> | 2000-08-08 17:35:32 +0000 |
commit | 6dd1b06886deb44ccd82a27c173baa82cf7c702a (patch) | |
tree | 4e56c11527f3021de4681a3fb9c9570949a66080 /libjava/gnu | |
parent | 97e242b0a71b4d685eca7514da77f8058adbda38 (diff) | |
download | gcc-6dd1b06886deb44ccd82a27c173baa82cf7c702a.zip gcc-6dd1b06886deb44ccd82a27c173baa82cf7c702a.tar.gz gcc-6dd1b06886deb44ccd82a27c173baa82cf7c702a.tar.bz2 |
Input_UTF8.java (read): Fixed handling of surrogate characters.
* gnu/gcj/convert/Input_UTF8.java (read): Fixed handling of
surrogate characters.
* gnu/gcj/convert/Output_UTF8.java (standardUTF8): Default to
true.
(write): Correct handling of surrogate characters.
From-SVN: r35569
Diffstat (limited to 'libjava/gnu')
-rw-r--r-- | libjava/gnu/gcj/convert/Input_UTF8.java | 11 | ||||
-rw-r--r-- | libjava/gnu/gcj/convert/Output_UTF8.java | 35 |
2 files changed, 31 insertions, 15 deletions
```diff
diff --git a/libjava/gnu/gcj/convert/Input_UTF8.java b/libjava/gnu/gcj/convert/Input_UTF8.java
index f76f282..433a0d1 100644
--- a/libjava/gnu/gcj/convert/Input_UTF8.java
+++ b/libjava/gnu/gcj/convert/Input_UTF8.java
@@ -1,4 +1,4 @@
-/* Copyright (C) 1999 Free Software Foundation
+/* Copyright (C) 1999, 2000 Free Software Foundation
 
    This file is part of libgcj.
 
@@ -56,10 +56,11 @@ public class Input_UTF8 extends BytesToUnicode
                     // partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400.
                     // The definition lo>=0xDC00 && lo<=0xDFFF implies
                     // that (lo-0xDC00)>>6 is in the range 0..15.
-                    // Hence we can infer (partial-0x400)>>4 == (hi-0xDB00)
-                    // and we can emit the high-surrogate without waiting
-                    // for the final byte:
-                    outbuffer[outpos++] = (char) (0xDA00+(partial>>4));
+                    // Hence we can solve for `hi' and we can emit
+                    // the high-surrogate without waiting for the
+                    // final byte:
+                    outbuffer[outpos++]
+                      = (char) (0xD800 + ((partial - 0x400) >> 4));
 
                     // Now we want to set it up so that when we read
                     // the final byte on the next iteration, we will
diff --git a/libjava/gnu/gcj/convert/Output_UTF8.java b/libjava/gnu/gcj/convert/Output_UTF8.java
index 7fb5910..01f5ce8 100644
--- a/libjava/gnu/gcj/convert/Output_UTF8.java
+++ b/libjava/gnu/gcj/convert/Output_UTF8.java
@@ -1,4 +1,4 @@
-/* Copyright (C) 1999 Free Software Foundation
+/* Copyright (C) 1999, 2000 Free Software Foundation
 
    This file is part of libgcj.
 
@@ -21,7 +21,7 @@ public class Output_UTF8 extends UnicodeToBytes
   /** True if a surrogate pair should be emitted as a single UTF8 sequence.
    * Otherwise, a surrogate pair is treated as two separate characters.
    * Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
-  public boolean standardUTF8;
+  public boolean standardUTF8 = true;
 
   // Saves the previous char if it was a high-surrogate.
   char hi_part;
@@ -60,9 +60,27 @@ public class Output_UTF8 extends UnicodeToBytes
             while (bytes_todo > 0 && avail > 0);
             continue;
           }
+
         char ch = inbuffer[inpos++];
         inlength--;
-        if (ch < 128 && (ch != 0 || standardUTF8))
+
+        if ((hi_part != 0 && (ch <= 0xDBFF || ch > 0xDFFF))
+            || (hi_part == 0 && ch >= 0xDC00 && ch <= 0xDFFF))
+          {
+            // If the previous character was a high surrogate, and we
+            // don't now have a low surrogate, we print the high
+            // surrogate as an isolated character. If this character
+            // is a low surrogate and we didn't previously see a high
+            // surrogate, we do the same thing.
+            --inpos;
+            ++inlength;
+            buf[count++] = (byte) (0xE0 | (hi_part >> 12));
+            value = hi_part;
+            hi_part = 0;
+            avail--;
+            bytes_todo = 2;
+          }
+        else if (ch < 128 && (ch != 0 || standardUTF8))
           {
             avail--;
             buf[count++] = (byte) ch;
@@ -78,19 +96,16 @@ public class Output_UTF8 extends UnicodeToBytes
               {
                 if (ch <= 0xDBFF) // High surrogates
                   {
-                    // The first byte is (0xF0 | value>>18), where value is the
-                    // Unicode scalar value of the combine character - which
-                    // we may not know yet. But from substituting:
-                    // value == (hi-0xD800)*0x400+(lo-0xDC00)+0x10000,
-                    // hi==ch, and cancelling we get:
-                    buf[count++] = (byte) (0xF0 | ((ch-0xD800) >> 8));
-                    avail--;
+                    // Just save the high surrogate until the next
+                    // character comes along.
                     hi_part = ch;
                   }
                 else // Low surrogates
                   {
                     value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
+                    buf[count++] = (byte) (0xF0 | (value >> 18));
                     bytes_todo = 3;
+                    hi_part = 0;
                   }
               }
             else
```