diff options
-rw-r--r-- | libjava/gnu/gcj/convert/Input_8859_1.java | 6 | ||||
-rw-r--r-- | libjava/gnu/gcj/convert/Input_SJIS.java | 24 | ||||
-rw-r--r-- | libjava/gnu/gcj/convert/JIS0201.h | 159 | ||||
-rw-r--r-- | libjava/gnu/gcj/convert/gen-from-JIS.c | 37 | ||||
-rw-r--r-- | libjava/gnu/gcj/convert/natInput_EUCJIS.cc | 2 | ||||
-rw-r--r-- | libjava/gnu/gcj/convert/natOutput_EUCJIS.cc | 102 |
6 files changed, 319 insertions, 11 deletions
diff --git a/libjava/gnu/gcj/convert/Input_8859_1.java b/libjava/gnu/gcj/convert/Input_8859_1.java index 262099a..e0d2d51 100644 --- a/libjava/gnu/gcj/convert/Input_8859_1.java +++ b/libjava/gnu/gcj/convert/Input_8859_1.java @@ -8,6 +8,12 @@ details. */ package gnu.gcj.convert; +/** + * Convert ISO-Latin-1 (8859-1) text to Unicode. + * @author Per Bothner <bothner@cygnus.com> + * @date March 1999. + */ + public class Input_8859_1 extends BytesToUnicode { public String getName() { return "8859_1"; } diff --git a/libjava/gnu/gcj/convert/Input_SJIS.java b/libjava/gnu/gcj/convert/Input_SJIS.java new file mode 100644 index 0000000..cdc7c4c --- /dev/null +++ b/libjava/gnu/gcj/convert/Input_SJIS.java @@ -0,0 +1,24 @@ +/* Copyright (C) 1999 Cygnus Solutions + + This file is part of libgcj. + +This software is copyrighted work licensed under the terms of the +Libgcj License. Please consult the file "LIBGCJ_LICENSE" for +details. */ + +package gnu.gcj.convert; + +/** + * Convert SJIS (Shift JIS, used on Japanese MS-Windows) to Unicode. + * @author Per Bothner <bothner@cygnus.com> + * @date April 1999. 
+ */ + +public class Input_SJIS extends BytesToUnicode +{ + public String getName() { return "SJIS"; } + + public native int read (char[] outbuffer, int outpos, int outlength); + + int first_byte; +} diff --git a/libjava/gnu/gcj/convert/JIS0201.h b/libjava/gnu/gcj/convert/JIS0201.h new file mode 100644 index 0000000..3a3b4f1 --- /dev/null +++ b/libjava/gnu/gcj/convert/JIS0201.h @@ -0,0 +1,159 @@ +/* This file is automatically generated from Unicode tables */ +MAP(0x00, 0x20, 0x0020) /* SPACE */ +MAP(0x00, 0x21, 0x0021) /* EXCLAMATION MARK */ +MAP(0x00, 0x22, 0x0022) /* QUOTATION MARK */ +MAP(0x00, 0x23, 0x0023) /* NUMBER SIGN */ +MAP(0x00, 0x24, 0x0024) /* DOLLAR SIGN */ +MAP(0x00, 0x25, 0x0025) /* PERCENT SIGN */ +MAP(0x00, 0x26, 0x0026) /* AMPERSAND */ +MAP(0x00, 0x27, 0x0027) /* APOSTROPHE */ +MAP(0x00, 0x28, 0x0028) /* LEFT PARENTHESIS */ +MAP(0x00, 0x29, 0x0029) /* RIGHT PARENTHESIS */ +MAP(0x00, 0x2A, 0x002A) /* ASTERISK */ +MAP(0x00, 0x2B, 0x002B) /* PLUS SIGN */ +MAP(0x00, 0x2C, 0x002C) /* COMMA */ +MAP(0x00, 0x2D, 0x002D) /* HYPHEN-MINUS */ +MAP(0x00, 0x2E, 0x002E) /* FULL STOP */ +MAP(0x00, 0x2F, 0x002F) /* SOLIDUS */ +MAP(0x00, 0x30, 0x0030) /* DIGIT ZERO */ +MAP(0x00, 0x31, 0x0031) /* DIGIT ONE */ +MAP(0x00, 0x32, 0x0032) /* DIGIT TWO */ +MAP(0x00, 0x33, 0x0033) /* DIGIT THREE */ +MAP(0x00, 0x34, 0x0034) /* DIGIT FOUR */ +MAP(0x00, 0x35, 0x0035) /* DIGIT FIVE */ +MAP(0x00, 0x36, 0x0036) /* DIGIT SIX */ +MAP(0x00, 0x37, 0x0037) /* DIGIT SEVEN */ +MAP(0x00, 0x38, 0x0038) /* DIGIT EIGHT */ +MAP(0x00, 0x39, 0x0039) /* DIGIT NINE */ +MAP(0x00, 0x3A, 0x003A) /* COLON */ +MAP(0x00, 0x3B, 0x003B) /* SEMICOLON */ +MAP(0x00, 0x3C, 0x003C) /* LESS-THAN SIGN */ +MAP(0x00, 0x3D, 0x003D) /* EQUALS SIGN */ +MAP(0x00, 0x3E, 0x003E) /* GREATER-THAN SIGN */ +MAP(0x00, 0x3F, 0x003F) /* QUESTION MARK */ +MAP(0x00, 0x40, 0x0040) /* COMMERCIAL AT */ +MAP(0x00, 0x41, 0x0041) /* LATIN CAPITAL LETTER A */ +MAP(0x00, 0x42, 0x0042) /* LATIN CAPITAL LETTER B */ +MAP(0x00, 0x43, 
0x0043) /* LATIN CAPITAL LETTER C */ +MAP(0x00, 0x44, 0x0044) /* LATIN CAPITAL LETTER D */ +MAP(0x00, 0x45, 0x0045) /* LATIN CAPITAL LETTER E */ +MAP(0x00, 0x46, 0x0046) /* LATIN CAPITAL LETTER F */ +MAP(0x00, 0x47, 0x0047) /* LATIN CAPITAL LETTER G */ +MAP(0x00, 0x48, 0x0048) /* LATIN CAPITAL LETTER H */ +MAP(0x00, 0x49, 0x0049) /* LATIN CAPITAL LETTER I */ +MAP(0x00, 0x4A, 0x004A) /* LATIN CAPITAL LETTER J */ +MAP(0x00, 0x4B, 0x004B) /* LATIN CAPITAL LETTER K */ +MAP(0x00, 0x4C, 0x004C) /* LATIN CAPITAL LETTER L */ +MAP(0x00, 0x4D, 0x004D) /* LATIN CAPITAL LETTER M */ +MAP(0x00, 0x4E, 0x004E) /* LATIN CAPITAL LETTER N */ +MAP(0x00, 0x4F, 0x004F) /* LATIN CAPITAL LETTER O */ +MAP(0x00, 0x50, 0x0050) /* LATIN CAPITAL LETTER P */ +MAP(0x00, 0x51, 0x0051) /* LATIN CAPITAL LETTER Q */ +MAP(0x00, 0x52, 0x0052) /* LATIN CAPITAL LETTER R */ +MAP(0x00, 0x53, 0x0053) /* LATIN CAPITAL LETTER S */ +MAP(0x00, 0x54, 0x0054) /* LATIN CAPITAL LETTER T */ +MAP(0x00, 0x55, 0x0055) /* LATIN CAPITAL LETTER U */ +MAP(0x00, 0x56, 0x0056) /* LATIN CAPITAL LETTER V */ +MAP(0x00, 0x57, 0x0057) /* LATIN CAPITAL LETTER W */ +MAP(0x00, 0x58, 0x0058) /* LATIN CAPITAL LETTER X */ +MAP(0x00, 0x59, 0x0059) /* LATIN CAPITAL LETTER Y */ +MAP(0x00, 0x5A, 0x005A) /* LATIN CAPITAL LETTER Z */ +MAP(0x00, 0x5B, 0x005B) /* LEFT SQUARE BRACKET */ +MAP(0x00, 0x5C, 0x00A5) /* YEN SIGN */ +MAP(0x00, 0x5D, 0x005D) /* RIGHT SQUARE BRACKET */ +MAP(0x00, 0x5E, 0x005E) /* CIRCUMFLEX ACCENT */ +MAP(0x00, 0x5F, 0x005F) /* LOW LINE */ +MAP(0x00, 0x60, 0x0060) /* GRAVE ACCENT */ +MAP(0x00, 0x61, 0x0061) /* LATIN SMALL LETTER A */ +MAP(0x00, 0x62, 0x0062) /* LATIN SMALL LETTER B */ +MAP(0x00, 0x63, 0x0063) /* LATIN SMALL LETTER C */ +MAP(0x00, 0x64, 0x0064) /* LATIN SMALL LETTER D */ +MAP(0x00, 0x65, 0x0065) /* LATIN SMALL LETTER E */ +MAP(0x00, 0x66, 0x0066) /* LATIN SMALL LETTER F */ +MAP(0x00, 0x67, 0x0067) /* LATIN SMALL LETTER G */ +MAP(0x00, 0x68, 0x0068) /* LATIN SMALL LETTER H */ +MAP(0x00, 0x69, 0x0069) /* 
LATIN SMALL LETTER I */ +MAP(0x00, 0x6A, 0x006A) /* LATIN SMALL LETTER J */ +MAP(0x00, 0x6B, 0x006B) /* LATIN SMALL LETTER K */ +MAP(0x00, 0x6C, 0x006C) /* LATIN SMALL LETTER L */ +MAP(0x00, 0x6D, 0x006D) /* LATIN SMALL LETTER M */ +MAP(0x00, 0x6E, 0x006E) /* LATIN SMALL LETTER N */ +MAP(0x00, 0x6F, 0x006F) /* LATIN SMALL LETTER O */ +MAP(0x00, 0x70, 0x0070) /* LATIN SMALL LETTER P */ +MAP(0x00, 0x71, 0x0071) /* LATIN SMALL LETTER Q */ +MAP(0x00, 0x72, 0x0072) /* LATIN SMALL LETTER R */ +MAP(0x00, 0x73, 0x0073) /* LATIN SMALL LETTER S */ +MAP(0x00, 0x74, 0x0074) /* LATIN SMALL LETTER T */ +MAP(0x00, 0x75, 0x0075) /* LATIN SMALL LETTER U */ +MAP(0x00, 0x76, 0x0076) /* LATIN SMALL LETTER V */ +MAP(0x00, 0x77, 0x0077) /* LATIN SMALL LETTER W */ +MAP(0x00, 0x78, 0x0078) /* LATIN SMALL LETTER X */ +MAP(0x00, 0x79, 0x0079) /* LATIN SMALL LETTER Y */ +MAP(0x00, 0x7A, 0x007A) /* LATIN SMALL LETTER Z */ +MAP(0x00, 0x7B, 0x007B) /* LEFT CURLY BRACKET */ +MAP(0x00, 0x7C, 0x007C) /* VERTICAL LINE */ +MAP(0x00, 0x7D, 0x007D) /* RIGHT CURLY BRACKET */ +MAP(0x00, 0x7E, 0x203E) /* OVERLINE */ +MAP(0x00, 0xA1, 0xFF61) /* HALFWIDTH IDEOGRAPHIC FULL STOP */ +MAP(0x00, 0xA2, 0xFF62) /* HALFWIDTH LEFT CORNER BRACKET */ +MAP(0x00, 0xA3, 0xFF63) /* HALFWIDTH RIGHT CORNER BRACKET */ +MAP(0x00, 0xA4, 0xFF64) /* HALFWIDTH IDEOGRAPHIC COMMA */ +MAP(0x00, 0xA5, 0xFF65) /* HALFWIDTH KATAKANA MIDDLE DOT */ +MAP(0x00, 0xA6, 0xFF66) /* HALFWIDTH KATAKANA LETTER WO */ +MAP(0x00, 0xA7, 0xFF67) /* HALFWIDTH KATAKANA LETTER SMALL A */ +MAP(0x00, 0xA8, 0xFF68) /* HALFWIDTH KATAKANA LETTER SMALL I */ +MAP(0x00, 0xA9, 0xFF69) /* HALFWIDTH KATAKANA LETTER SMALL U */ +MAP(0x00, 0xAA, 0xFF6A) /* HALFWIDTH KATAKANA LETTER SMALL E */ +MAP(0x00, 0xAB, 0xFF6B) /* HALFWIDTH KATAKANA LETTER SMALL O */ +MAP(0x00, 0xAC, 0xFF6C) /* HALFWIDTH KATAKANA LETTER SMALL YA */ +MAP(0x00, 0xAD, 0xFF6D) /* HALFWIDTH KATAKANA LETTER SMALL YU */ +MAP(0x00, 0xAE, 0xFF6E) /* HALFWIDTH KATAKANA LETTER SMALL YO */ +MAP(0x00, 0xAF, 
0xFF6F) /* HALFWIDTH KATAKANA LETTER SMALL TU */ +MAP(0x00, 0xB0, 0xFF70) /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */ +MAP(0x00, 0xB1, 0xFF71) /* HALFWIDTH KATAKANA LETTER A */ +MAP(0x00, 0xB2, 0xFF72) /* HALFWIDTH KATAKANA LETTER I */ +MAP(0x00, 0xB3, 0xFF73) /* HALFWIDTH KATAKANA LETTER U */ +MAP(0x00, 0xB4, 0xFF74) /* HALFWIDTH KATAKANA LETTER E */ +MAP(0x00, 0xB5, 0xFF75) /* HALFWIDTH KATAKANA LETTER O */ +MAP(0x00, 0xB6, 0xFF76) /* HALFWIDTH KATAKANA LETTER KA */ +MAP(0x00, 0xB7, 0xFF77) /* HALFWIDTH KATAKANA LETTER KI */ +MAP(0x00, 0xB8, 0xFF78) /* HALFWIDTH KATAKANA LETTER KU */ +MAP(0x00, 0xB9, 0xFF79) /* HALFWIDTH KATAKANA LETTER KE */ +MAP(0x00, 0xBA, 0xFF7A) /* HALFWIDTH KATAKANA LETTER KO */ +MAP(0x00, 0xBB, 0xFF7B) /* HALFWIDTH KATAKANA LETTER SA */ +MAP(0x00, 0xBC, 0xFF7C) /* HALFWIDTH KATAKANA LETTER SI */ +MAP(0x00, 0xBD, 0xFF7D) /* HALFWIDTH KATAKANA LETTER SU */ +MAP(0x00, 0xBE, 0xFF7E) /* HALFWIDTH KATAKANA LETTER SE */ +MAP(0x00, 0xBF, 0xFF7F) /* HALFWIDTH KATAKANA LETTER SO */ +MAP(0x00, 0xC0, 0xFF80) /* HALFWIDTH KATAKANA LETTER TA */ +MAP(0x00, 0xC1, 0xFF81) /* HALFWIDTH KATAKANA LETTER TI */ +MAP(0x00, 0xC2, 0xFF82) /* HALFWIDTH KATAKANA LETTER TU */ +MAP(0x00, 0xC3, 0xFF83) /* HALFWIDTH KATAKANA LETTER TE */ +MAP(0x00, 0xC4, 0xFF84) /* HALFWIDTH KATAKANA LETTER TO */ +MAP(0x00, 0xC5, 0xFF85) /* HALFWIDTH KATAKANA LETTER NA */ +MAP(0x00, 0xC6, 0xFF86) /* HALFWIDTH KATAKANA LETTER NI */ +MAP(0x00, 0xC7, 0xFF87) /* HALFWIDTH KATAKANA LETTER NU */ +MAP(0x00, 0xC8, 0xFF88) /* HALFWIDTH KATAKANA LETTER NE */ +MAP(0x00, 0xC9, 0xFF89) /* HALFWIDTH KATAKANA LETTER NO */ +MAP(0x00, 0xCA, 0xFF8A) /* HALFWIDTH KATAKANA LETTER HA */ +MAP(0x00, 0xCB, 0xFF8B) /* HALFWIDTH KATAKANA LETTER HI */ +MAP(0x00, 0xCC, 0xFF8C) /* HALFWIDTH KATAKANA LETTER HU */ +MAP(0x00, 0xCD, 0xFF8D) /* HALFWIDTH KATAKANA LETTER HE */ +MAP(0x00, 0xCE, 0xFF8E) /* HALFWIDTH KATAKANA LETTER HO */ +MAP(0x00, 0xCF, 0xFF8F) /* HALFWIDTH KATAKANA LETTER MA */ +MAP(0x00, 
0xD0, 0xFF90) /* HALFWIDTH KATAKANA LETTER MI */ +MAP(0x00, 0xD1, 0xFF91) /* HALFWIDTH KATAKANA LETTER MU */ +MAP(0x00, 0xD2, 0xFF92) /* HALFWIDTH KATAKANA LETTER ME */ +MAP(0x00, 0xD3, 0xFF93) /* HALFWIDTH KATAKANA LETTER MO */ +MAP(0x00, 0xD4, 0xFF94) /* HALFWIDTH KATAKANA LETTER YA */ +MAP(0x00, 0xD5, 0xFF95) /* HALFWIDTH KATAKANA LETTER YU */ +MAP(0x00, 0xD6, 0xFF96) /* HALFWIDTH KATAKANA LETTER YO */ +MAP(0x00, 0xD7, 0xFF97) /* HALFWIDTH KATAKANA LETTER RA */ +MAP(0x00, 0xD8, 0xFF98) /* HALFWIDTH KATAKANA LETTER RI */ +MAP(0x00, 0xD9, 0xFF99) /* HALFWIDTH KATAKANA LETTER RU */ +MAP(0x00, 0xDA, 0xFF9A) /* HALFWIDTH KATAKANA LETTER RE */ +MAP(0x00, 0xDB, 0xFF9B) /* HALFWIDTH KATAKANA LETTER RO */ +MAP(0x00, 0xDC, 0xFF9C) /* HALFWIDTH KATAKANA LETTER WA */ +MAP(0x00, 0xDD, 0xFF9D) /* HALFWIDTH KATAKANA LETTER N */ +MAP(0x00, 0xDE, 0xFF9E) /* HALFWIDTH KATAKANA VOICED SOUND MARK */ +MAP(0x00, 0xDF, 0xFF9F) /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */ diff --git a/libjava/gnu/gcj/convert/gen-from-JIS.c b/libjava/gnu/gcj/convert/gen-from-JIS.c index c49b894..4df25bc 100644 --- a/libjava/gnu/gcj/convert/gen-from-JIS.c +++ b/libjava/gnu/gcj/convert/gen-from-JIS.c @@ -16,6 +16,11 @@ struct chval #define MAP(B1, B2, C) { B1, B2, C }, +struct chval chtab_0201[] = { +#include "JIS0201.h" + { 255, 255, 0} +}; + struct chval chtab_0208[] = { #include "JIS0208.h" { 255, 255, 0} @@ -50,9 +55,9 @@ int main(int argc, char** argv) { FILE *out = stdout; - unsigned min1 = 256, max1 = 0, min2 = 256, max2 = 0, count = 0; - unsigned short low1_uc = 0xFFFF, high1_uc = 0; - unsigned short low2_uc = 0xFFFF, high2_uc = 0; + int min1 = 256, max1 = 0, min2 = 256, max2 = 0, count = 0; + int low1_uc = 0xFFFF, high1_uc = 0; + int low2_uc = 0xFFFF, high2_uc = 0; int i; int row, col; if (strcmp (argv[1], "JIS0208") == 0) chtab = chtab_0208; @@ -61,14 +66,26 @@ main(int argc, char** argv) else if (strcmp (argv[1], "toJIS") == 0) { int i; - int count = sizeof(sorted)/sizeof(struct chval); - 
qsort (sorted, count, sizeof(struct chval), - compare); - for (i = 0; i < count; i++) + for (i = 0; chtab_0201[i].b1 != 255; i++) + { + enter(chtab_0201[i].uc, chtab_0201[i].b2); + } + for (i = 0; i < 0x20; i++) + { + enter (i, i); + } + enter (127, 127); + for (i = 0; chtab_0208[i].b1 != 255; i++) + { + enter(chtab_0208[i].uc, + (chtab_0208[i].b1 << 8) | chtab_0208[i].b2); + } + for (i = 0; chtab_0212[i].b1 != 255; i++) { - fprintf (out, " 0x%04x -> 0x%02x, 0x%02x\n", - sorted[i].uc, sorted[i].b1, sorted[i].b2); + enter(chtab_0212[i].uc, + 0x8000 | (chtab_0212[i].b1 << 8) | chtab_0212[i].b2); } + print_table ("Unicode_to_JIS", stdout); exit(0); } else @@ -111,7 +128,7 @@ main(int argc, char** argv) { if (row == chtab[i].b1 && col == chtab[i].b2) { - unsigned uc = chtab[i].uc; + int uc = chtab[i].uc; if (uc < 0x2000) { if (uc > high1_uc) diff --git a/libjava/gnu/gcj/convert/natInput_EUCJIS.cc b/libjava/gnu/gcj/convert/natInput_EUCJIS.cc index 68e4cae..4c62818 100644 --- a/libjava/gnu/gcj/convert/natInput_EUCJIS.cc +++ b/libjava/gnu/gcj/convert/natInput_EUCJIS.cc @@ -31,7 +31,7 @@ gnu::gcj::convert::Input_EUCJIS::read(jcharArray outbuffer, jint outpos, { if (b < 128) { -#if 0 +#if 1 // Technically, we should translate 0x5c to Yen symbol; // in practice, it is not clear. if (b == 0x5c) diff --git a/libjava/gnu/gcj/convert/natOutput_EUCJIS.cc b/libjava/gnu/gcj/convert/natOutput_EUCJIS.cc new file mode 100644 index 0000000..585e56b --- /dev/null +++ b/libjava/gnu/gcj/convert/natOutput_EUCJIS.cc @@ -0,0 +1,102 @@ +/* Copyright (C) 1999 Cygnus Solutions + + This file is part of libgcj. + +This software is copyrighted work licensed under the terms of the +Libgcj License. Please consult the file "LIBGCJ_LICENSE" for +details. */ + +#include <config.h> +#include <cni.h> +#include <gnu/gcj/convert/Output_EUCJIS.h> + +/* A trie structure to map unicode values to JIS codes. + * code == -1: the character is undefined. + * code >= 0 && code < 128: JIS-Roman - mostly Ascii. 
+ * code >= 128 && code < 256: Half-width Katakana. + * code >= 256 && code < 0x8000: JIS X 0208:1997. + * code >= 0x8000 && code < 0xFFFF: JIX X 0212-1990. + */ + +extern unsigned short Unicode_to_JIS[]; + +int +trie_lookup (unsigned short *trie, unsigned short key) +{ + unsigned short branch = trie[(key >> 12) & 0xf]; + if (branch == 0) + return -1; + branch = trie[branch + ((key >> 8) & 0xf)]; + if (branch == 0) + return -1; + branch = trie[branch + ((key >> 4) & 0xf)]; + if (branch == 0) + return -1; + return trie[branch + (key & 0xf)]; +} + +static jint +convert_TO_EUCJIS (gnu::gcj::convert::Output_EUCJIS *encoder, + jchar *ptr, jint inlength) +{ + int orig_inlength = inlength; + jint outbuf_length = encoder->buf->length; + for (;;) + { + if (encoder->count >= outbuf_length) + break; + if (encoder->pending1 >= 0) + { + elements(encoder->buf)[encoder->count++] = encoder->pending1; + encoder->pending1 = encoder->pending2; + encoder->pending2 = -1; + continue; + } + if (inlength == 0) + break; + jchar ch = *ptr++; + inlength--; + unsigned short val = trie_lookup(Unicode_to_JIS, ch); + if (val < 0x80) + { + if (val == 0xffff) + val = '?'; + } + else if (val <= 0xFF) + { + encoder->pending1 = val; + encoder->pending2 = -1; + val = 0x8e; + } + else if (val < 0x8000) + { + val |= 0x8080; + encoder->pending1 = val & 0xff; + val = val >> 8; + encoder->pending2 = -1; + } + else + { + val |= 0x8080; + encoder->pending1 = val >> 8; + encoder->pending2 = val & 0xff; + val = 0x8f; + } + elements(encoder->buf)[encoder->count++] = val; + } + return orig_inlength - inlength; +} + +jint +gnu::gcj::convert::Output_EUCJIS::write (jcharArray inbuffer, + jint inpos, jint inlength) +{ + return convert_TO_EUCJIS(this, &elements(inbuffer)[inpos], inlength); +} + +jint +gnu::gcj::convert::Output_EUCJIS::write (jstring str, jint inpos, + jint inlength, jcharArray) +{ + return convert_TO_EUCJIS(this, _Jv_GetStringChars(str)+inpos, inlength); +} |