aboutsummaryrefslogtreecommitdiff
path: root/libjava/chartables.pl
diff options
context:
space:
mode:
Diffstat (limited to 'libjava/chartables.pl')
-rw-r--r--libjava/chartables.pl965
1 files changed, 965 insertions, 0 deletions
diff --git a/libjava/chartables.pl b/libjava/chartables.pl
new file mode 100644
index 0000000..fbcfb27
--- /dev/null
+++ b/libjava/chartables.pl
@@ -0,0 +1,965 @@
+# chartables.pl - A perl program to generate tables for use by the
+# Character class.
+
+# Copyright (C) 1998, 1999 Cygnus Solutions
+#
+# This file is part of libjava.
+#
+# This software is copyrighted work licensed under the terms of the
+# Libjava License. Please consult the file "LIBJAVA_LICENSE" for
+# details.
+
+# This program requires a `unidata.txt' file of the form distributed
+# on the Unicode 2.0 CD ROM. Or, get it more conveniently here:
+# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
+# Version `2.1.8' of this file was last used to update the Character class.
+
+# Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3
+# "The Java Language Specification", ISBN 0-201-63451-1
+# plus online API docs for JDK 1.2 beta from http://www.javasoft.com.
+
+# Usage: perl chartables.pl [-n] UnicodeData-VERSION.txt
+# If this exits with nonzero status, then you must investigate the
+# cause of the problem.
+# Diagnostics and other information to stderr.
+# This creates the new include/java-chartables.h and
+# include/java-chardecomp.h files directly.
+# With -n, the files are not created, but all processing
+# still occurs.
+
+# Fields in the table.
+$CODE = 0;
+$NAME = 1;
+$CATEGORY = 2;
+$DECOMPOSITION = 5;
+$DECIMAL = 6;
+$DIGIT = 7;
+$NUMERIC = 8;
+$UPPERCASE = 12;
+$LOWERCASE = 13;
+$TITLECASE = 14;
+
+# A special case.
+$TAMIL_DIGIT_ONE = 0x0be7;
+$TAMIL_DIGIT_NINE = 0x0bef;
+
+# These are endpoints of legitimate gaps in the tables.
+$CJK_IDEOGRAPH_END = 0x9fa5;
+$HANGUL_END = 0xd7a3;
+$HIGH_SURROGATE_END = 0xdb7f;
+$PRIVATE_HIGH_SURROGATE_END = 0xdbff;
+$LOW_SURROGATE_END = 0xdfff;
+$PRIVATE_END = 0xf8ff;
+
+%title_to_upper = ();
+%title_to_lower = ();
+%numerics = ();
+%name = ();
+
+@digit_start = ();
+@digit_end = ();
+
+@space_start = ();
+@space_end = ();
+
+# @letter_start = ();
+# @letter_end = ();
+
+@all_start = ();
+@all_end = ();
+@all_cats = ();
+
+@upper_start = ();
+@upper_end = ();
+@upper_map = ();
+%upper_anom = ();
+
+@lower_start = ();
+@lower_end = ();
+@lower_map = ();
+%lower_anom = ();
+
+@attributes = ();
+
+# There are a few characters which actually need two attributes.
+# These are special-cased.
+$ROMAN_START = 0x2160;
+$ROMAN_END = 0x217f;
+%second_attributes = ();
+
+$prevcode = -1;
+$status = 0;
+
+%category_map =
+(
+ 'Mn' => 'NON_SPACING_MARK',
+ 'Mc' => 'COMBINING_SPACING_MARK',
+ 'Me' => 'ENCLOSING_MARK',
+ 'Nd' => 'DECIMAL_DIGIT_NUMBER',
+ 'Nl' => 'LETTER_NUMBER',
+ 'No' => 'OTHER_NUMBER',
+ 'Zs' => 'SPACE_SEPARATOR',
+ 'Zl' => 'LINE_SEPARATOR',
+ 'Zp' => 'PARAGRAPH_SEPARATOR',
+ 'Cc' => 'CONTROL',
+ 'Cf' => 'FORMAT',
+ 'Cs' => 'SURROGATE',
+ 'Co' => 'PRIVATE_USE',
+ 'Cn' => 'UNASSIGNED',
+ 'Lu' => 'UPPERCASE_LETTER',
+ 'Ll' => 'LOWERCASE_LETTER',
+ 'Lt' => 'TITLECASE_LETTER',
+ 'Lm' => 'MODIFIER_LETTER',
+ 'Lo' => 'OTHER_LETTER',
+ 'Pc' => 'CONNECTOR_PUNCTUATION',
+ 'Pd' => 'DASH_PUNCTUATION',
+ 'Ps' => 'START_PUNCTUATION',
+ 'Pe' => 'END_PUNCTUATION',
+ 'Pi' => 'START_PUNCTUATION',
+ 'Pf' => 'END_PUNCTUATION',
+ 'Po' => 'OTHER_PUNCTUATION',
+ 'Sm' => 'MATH_SYMBOL',
+ 'Sc' => 'CURRENCY_SYMBOL',
+ 'Sk' => 'MODIFIER_SYMBOL',
+ 'So' => 'OTHER_SYMBOL'
+ );
+
+# These maps characters to their decompositions.
+%canonical_decomposition = ();
+%full_decomposition = ();
+
+
+# Handle `-n' and open output files.
+local ($f1, $f2) = ('include/java-chartables.h',
+ 'include/java-chardecomp.h');
+if ($ARGV[0] eq '-n')
+{
+ shift @ARGV;
+ $f1 = '/dev/null';
+ $f2 = '/dev/null';
+}
+
+open (CHARTABLE, "> $f1");
+open (DECOMP, "> $f2");
+
+# Process the Unicode file.
+while (<>)
+{
+ chop;
+ # Specify a limit for split so that we pick up trailing fields.
+ # We make the limit larger than we need, to catch the case where
+ # there are extra fields.
+ @fields = split (';', $_, 30);
+ # Convert code to number.
+ $ncode = hex ($fields[$CODE]);
+
+ if ($#fields != 14)
+ {
+ print STDERR ("Entry for \\u", $fields[$CODE],
+ " has wrong number of fields: ", $#fields, "\n");
+ }
+
+ $name{$fields[$CODE]} = $fields[$NAME];
+
+ # If we've found a gap in the table, fill it in.
+ if ($ncode != $prevcode + 1)
+ {
+ &process_gap (*fields, $prevcode, $ncode);
+ }
+
+ &process_char (*fields, $ncode);
+
+ $prevcode = $ncode;
+}
+
+if ($prevcode != 0xffff)
+{
+ # Setting of `fields' parameter doesn't matter here.
+ &process_gap (*fields, $prevcode, 0x10000);
+}
+
+print CHARTABLE "// java-chartables.h - Character tables for java.lang.Character -*- c++ -*-\n\n";
+print CHARTABLE "#ifndef __JAVA_CHARTABLES_H__\n";
+print CHARTABLE "#define __JAVA_CHARTABLES_H__\n\n";
+print CHARTABLE "// These tables are automatically generated by the chartables.pl\n";
+print CHARTABLE "// script. DO NOT EDIT the tables. Instead, fix the script\n";
+print CHARTABLE "// and run it again.\n\n";
+print CHARTABLE "// This file should only be included by natCharacter.cc\n\n";
+
+
+$bytes = 0;
+
+# Titlecase mapping tables.
+if ($#title_to_lower != $#title_to_upper)
+{
+ # If this fails we need to reimplement toTitleCase.
+ print STDERR "titlecase mappings have different sizes\n";
+ $status = 1;
+}
+# Also ensure that the tables are entirely parallel.
+foreach $key (sort keys %title_to_lower)
+{
+ if (! defined $title_to_upper{$key})
+ {
+ print STDERR "titlecase mappings have different entries\n";
+ $status = 1;
+ }
+}
+&print_single_map ("title_to_lower_table", %title_to_lower);
+&print_single_map ("title_to_upper_table", %title_to_upper);
+
+print CHARTABLE "#ifdef COMPACT_CHARACTER\n\n";
+
+printf CHARTABLE "#define TAMIL_DIGIT_ONE 0x%04x\n\n", $TAMIL_DIGIT_ONE;
+
+# All numeric values.
+&print_numerics;
+
+# Digits only.
+&print_block ("digit_table", *digit_start, *digit_end);
+
+# Space characters.
+&print_block ("space_table", *space_start, *space_end);
+
+# Letters. We used to generate a separate letter table. But this
+# doesn't really seem worthwhile. Simply using `all_table' saves us
+# about 800 bytes, and only adds 3 table probes to isLetter.
+# &print_block ("letter_table", *letter_start, *letter_end);
+
+# Case tables.
+&print_case_table ("upper", *upper_start, *upper_end, *upper_map, *upper_anom);
+&print_case_table ("lower", *lower_start, *lower_end, *lower_map, *lower_anom);
+
+# Everything else.
+&print_all_block (*all_start, *all_end, *all_cats);
+
+print CHARTABLE "#else /* COMPACT_CHARACTER */\n\n";
+
+printf CHARTABLE "#define ROMAN_START 0x%04x\n", $ROMAN_START;
+printf CHARTABLE "#define ROMAN_END 0x%04x\n\n", $ROMAN_END;
+
+&print_fast_tables (*all_start, *all_end, *all_cats,
+ *attributes, *second_attributes);
+
+print CHARTABLE "#endif /* COMPACT_CHARACTER */\n\n";
+
+print CHARTABLE "#endif /* __JAVA_CHARTABLES_H__ */\n";
+
+printf STDERR "Approximately %d bytes of data generated (compact case)\n",
+ $bytes;
+
+
+# Now generate decomposition tables.
+printf DECOMP "// java-chardecomp.h - Decomposition character tables -*- c++ -*-\n\n";
+printf DECOMP "#ifndef __JAVA_CHARDECOMP_H__\n";
+printf DECOMP "#define __JAVA_CHARDECOMP_H__\n\n";
+print DECOMP "// These tables are automatically generated by the chartables.pl\n";
+print DECOMP "// script. DO NOT EDIT the tables. Instead, fix the script\n";
+print DECOMP "// and run it again.\n\n";
+print DECOMP "// This file should only be included by natCollator.cc\n\n";
+
+print DECOMP "struct decomp_entry\n{\n";
+print DECOMP " jchar key;\n";
+print DECOMP " const char *value;\n";
+print DECOMP "};\n\n";
+
+&write_decompositions;
+
+printf DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";
+
+
+close (CHARTABLE);
+close (DECOMP);
+
+exit $status;
+
+
+# Process a gap in the space.
+sub process_gap
+{
+ local (*fields, $prevcode, $ncode) = @_;
+ local (@gap_fields, $i);
+
+ if ($ncode == $CJK_IDEOGRAPH_END
+ || $ncode == $HANGUL_END
+ || $ncode == $HIGH_SURROGATE_END
+ || $ncode == $PRIVATE_HIGH_SURROGATE_END
+ || $ncode == $LOW_SURROGATE_END
+ || $ncode == $PRIVATE_END)
+ {
+ # The characters in the gap we just found are known to
+ # have the same properties as the character at the end of
+ # the gap.
+ @gap_fields = @fields;
+ }
+ else
+ {
+ # This prints too much to be enabled.
+ # print STDERR "Gap found at \\u", $fields[$CODE], "\n";
+ @gap_fields = ('', '', 'Cn', '', '', '', '', '', '', '', '',
+ '', '', '', '');
+ }
+
+ for ($i = $prevcode + 1; $i < $ncode; ++$i)
+ {
+ $gap_fields[$CODE] = sprintf ("%04x", $i);
+ $gap_fields[$NAME] = "CHARACTER " . $gap_fields[$CODE];
+ &process_char (*gap_fields, $i);
+ }
+}
+
+# Process a single character.
+sub process_char
+{
+ local (*fields, $ncode) = @_;
+
+ if ($fields[$DECOMPOSITION] ne '')
+ {
+ &add_decomposition ($ncode, $fields[$DECOMPOSITION]);
+ }
+
+ # If this is a titlecase character, mark it.
+ if ($fields[$CATEGORY] eq 'Lt')
+ {
+ $title_to_upper{$fields[$CODE]} = $fields[$UPPERCASE];
+ $title_to_lower{$fields[$CODE]} = $fields[$LOWERCASE];
+ }
+ else
+ {
+ # For upper and lower case mappings, we try to build compact
+ # tables that map range onto range. We specifically want to
+ # avoid titlecase characters. Java specifies a range check to
+ # make sure the character is not between 0x2000 and 0x2fff.
+ # We avoid that here because we need to generate table entries
+ # -- toLower and toUpper still work in that range.
+ if ($fields[$UPPERCASE] eq ''
+ && ($fields[$LOWERCASE] ne ''
+ || $fields[$NAME] =~ /CAPITAL (LETTER|LIGATURE)/))
+ {
+ if ($fields[$LOWERCASE] ne '')
+ {
+ &update_case_block (*upper_start, *upper_end, *upper_map,
+ $fields[$CODE], $fields[$LOWERCASE]);
+ &set_attribute ($ncode, hex ($fields[$LOWERCASE]));
+ }
+ else
+ {
+ $upper_anom{$fields[$CODE]} = 1;
+ }
+ }
+ elsif ($fields[$LOWERCASE] ne '')
+ {
+ print STDERR ("Java missed upper case char \\u",
+ $fields[$CODE], "\n");
+ }
+ elsif ($fields[$CATEGORY] eq 'Lu')
+ {
+ # This case is for letters which are marked as upper case
+ # but for which there is no lower case equivalent. For
+ # instance, LATIN LETTER YR.
+ }
+
+ if ($fields[$LOWERCASE] eq ''
+ && ($fields[$UPPERCASE] ne ''
+ || $fields[$NAME] =~ /SMALL (LETTER|LIGATURE)/))
+ {
+ if ($fields[$UPPERCASE] ne '')
+ {
+ &update_case_block (*lower_start, *lower_end, *lower_map,
+ $fields[$CODE], $fields[$UPPERCASE]);
+ &set_attribute ($ncode, hex ($fields[$UPPERCASE]));
+ }
+ else
+ {
+ $lower_anom{$fields[$CODE]} = 1;
+ }
+ }
+ elsif ($fields[$UPPERCASE] ne '')
+ {
+ print STDERR ("Java missed lower case char \\u",
+ $fields[$CODE], "\n");
+ }
+ elsif ($fields[$CATEGORY] eq 'Ll')
+ {
+ # This case is for letters which are marked as lower case
+ # but for which there is no upper case equivalent. For
+ # instance, FEMININE ORDINAL INDICATOR.
+ }
+ }
+
+
+ # If we have a non-decimal numeric value, add it to the list.
+ if ($fields[$CATEGORY] eq 'Nd'
+ && ($ncode < 0x2000 || $ncode > 0x2fff)
+ && $fields[$NAME] =~ /DIGIT/)
+ {
+ # This is a digit character that is handled elsewhere.
+ }
+ elsif ($fields[$DIGIT] ne '' || $fields[$NUMERIC] ne '')
+ {
+ # Do a simple check.
+ if ($fields[$DECIMAL] ne '')
+ {
+ # This catches bugs in an earlier implementation of
+ # chartables.pl. Now it is here for historical interest
+ # only.
+ # print STDERR ("Character \u", $fields[$CODE],
+ # " would have been missed as digit\n");
+ }
+
+ local ($val) = $fields[$DIGIT];
+ $val = $fields[$NUMERIC] if $val eq '';
+ local ($ok) = 1;
+
+ # If we have a value which is not a positive integer, then we
+ # set the value to -2 to make life easier for
+ # Character.getNumericValue.
+ if ($val !~ m/^[0-9]+$/)
+ {
+ if ($fields[$CATEGORY] ne 'Nl'
+ && $fields[$CATEGORY] ne 'No')
+ {
+ # This shows a few errors in the Unicode table. These
+ # characters have a missing Numeric field, and the `N'
+ # for the mirrored field shows up there instead. I
+ # reported these characters to errata@unicode.org on
+ # Thu Sep 10 1998. They said it will be fixed in the
+ # 2.1.6 release of the tables.
+ print STDERR ("Character \u", $fields[$CODE],
+ " has value but is not numeric; val = '",
+ $val, "'\n");
+ # We skip these.
+ $ok = 0;
+ }
+ $val = "-2";
+ }
+
+ if ($ok)
+ {
+ $numerics{$fields[$CODE]} = $val;
+ &set_attribute ($ncode, $val);
+ }
+ }
+
+ # We build a table that lists ranges of ordinary decimal values.
+ # At each step we make sure that the digits are in the correct
+ # order, with no holes, as this is assumed by Character. If this
+ # fails, reimplementation is required. This implementation
+ # dovetails nicely with the Java Spec, which has strange rules for
+ # what constitutes a decimal value. In particular the Unicode
+ # name must contain the word `DIGIT'. The spec doesn't directly
+ # say that digits must have type `Nd' (or that their value must an
+ # integer), but that can be inferred from the list of digits in
+ # the book(s). Currently the only Unicode characters whose name
+ # includes `DIGIT' which would not fit are the Tibetan "half"
+ # digits.
+ if ($fields[$CATEGORY] eq 'Nd')
+ {
+ if (($ncode < 0x2000 || $ncode > 0x2fff)
+ && $fields[$NAME] =~ /DIGIT/)
+ {
+ &update_digit_block (*digit_start, *digit_end, $fields[$CODE],
+ $fields[$DECIMAL]);
+ &set_attribute ($ncode, $fields[$DECIMAL]);
+ }
+ else
+ {
+ # If this fails then Character.getType will fail. We
+ # assume that things in `digit_table' are the only
+ # category `Nd' characters.
+ print STDERR ("Character \u", $fields[$CODE],
+ " is class Nd but not in digit table\n");
+ $status = 1;
+ }
+ }
+
+ # Keep track of space characters.
+ if ($fields[$CATEGORY] =~ /Z[slp]/)
+ {
+ &update_block (*space_start, *space_end, $fields[$CODE]);
+ }
+
+ # Keep track of letters.
+ # if ($fields[$CATEGORY] =~ /L[ultmo]/)
+ # {
+ # &update_letter_block (*letter_start, *letter_end, $fields[$CODE],
+ # $fields[$CATEGORY]);
+ # }
+
+ # Keep track of all characters. You might think we wouldn't have
+ # to do this for uppercase letters, or other characters we already
+ # "classify". The problem is that this classification is
+ # different. E.g., \u216f is uppercase by Java rules, but is a
+ # LETTER_NUMBER here.
+ &update_all_block (*all_start, *all_end, *all_cats,
+ $fields[$CODE], $fields[$CATEGORY]);
+}
+
+
+# Called to add a new decomposition.
+sub add_decomposition
+{
+ local ($ncode, $value) = @_;
+ local ($is_full) = 0;
+ local ($first) = 1;
+ local (@decomp) = ();
+
+ foreach (split (' ', $value))
+ {
+ if ($first && /^\<.*\>$/)
+ {
+ $is_full = 1;
+ }
+ else
+ {
+ push (@decomp, hex ($_));
+ }
+ $first = 0;
+ }
+
+ # We pack the value into a string because this means we can stick
+ # with Perl 4 features.
+ local ($s) = pack "I*", @decomp;
+ if ($is_full)
+ {
+ $full_decomposition{$ncode} = $s;
+ }
+ else
+ {
+ $canonical_decomposition{$ncode} = $s;
+ }
+}
+
+# Write a single decomposition table.
+sub write_single_decomposition
+{
+ local ($name, $is_canon, %table) = @_;
+
+ printf DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n";
+
+ local ($key, @expansion, $char);
+ local ($first_line) = 1;
+
+ for ($key = 0; $key <= 65535; ++$key)
+ {
+ next if ! defined $table{$key};
+
+ printf DECOMP ",\n"
+ unless $first_line;
+ $first_line = 0;
+
+ printf DECOMP " { 0x%04x, \"", $key;
+
+ # We represent the expansion as a series of bytes, terminated
+ # with a double nul. This is ugly, but relatively
+ # space-efficient. Most expansions are short, but there are a
+ # few that are very long (e.g. \uFDFA). This means that if we
+ # chose a fixed-space representation we would waste a lot of
+ # space.
+ @expansion = unpack "I*", $table{$key};
+ foreach $char (@expansion)
+ {
+ printf DECOMP "\\x%02x\\x%02x", ($char / 256), ($char % 256);
+ }
+
+ printf DECOMP "\" }";
+ }
+
+ printf DECOMP "\n};\n\n";
+}
+
+sub write_decompositions
+{
+ &write_single_decomposition ('canonical', 1, %canonical_decomposition);
+ &write_single_decomposition ('full', 0, %full_decomposition);
+}
+
+# We represent a block of characters with a pair of lists. This
+# function updates the pair to account for the new character. Returns
+# 1 if we added to the old block, 0 otherwise.
+sub update_block
+{
+ local (*start, *end, $char) = @_;
+
+ local ($nchar) = hex ($char);
+ local ($count) = $#end;
+ if ($count >= 0 && $end[$count] == $nchar - 1)
+ {
+ ++$end[$count];
+ return 1;
+ }
+ else
+ {
+ ++$count;
+ $start[$count] = $nchar;
+ $end[$count] = $nchar;
+ }
+ return 0;
+}
+
+# Return true if we will be appending this character to the end of the
+# existing block.
+sub block_append_p
+{
+ local (*end, $char) = @_;
+ return $#end >= 0 && $end[$#end] == $char - 1;
+}
+
+# This updates the digit block. This table is much like an ordinary
+# block, but it has an extra constraint.
+sub update_digit_block
+{
+ local (*start, *end, $char, $value) = @_;
+
+ &update_block ($start, $end, $char);
+ local ($nchar) = hex ($char);
+
+ # We want to make sure that the new digit's value is correct for
+ # its place in the block. However, we special-case Tamil digits,
+ # since Tamil does not have a digit `0'.
+ local ($count) = $#start;
+ if (($nchar < $TAMIL_DIGIT_ONE || $nchar > $TAMIL_DIGIT_NINE)
+ && $nchar - $start[$count] != $value)
+ {
+ # If this fails then Character.digit_value will be wrong.
+ print STDERR "Character \\u", $char, " violates digit constraint\n";
+ $status = 1;
+ }
+}
+
+# Update letter table. We could be smart about avoiding upper or
+# lower case letters, but it is much simpler to just track them all.
+sub update_letter_block
+{
+ local (*start, *end, $char, $category) = @_;
+
+ &update_block (*start, *end, $char);
+}
+
+# Update `all' table. This table holds all the characters we don't
+# already categorize for other reasons. FIXME: if a given type has
+# very few characters, we should just inline the code. E.g., there is
+# only one paragraph separator.
+sub update_all_block
+{
+ local (*start, *end, *cats, $char, $category) = @_;
+
+ local ($nchar) = hex ($char);
+ local ($count) = $#end;
+ if ($count >= 0
+ && $end[$count] == $nchar - 1
+ && $cats[$count] eq $category)
+ {
+ ++$end[$count];
+ }
+ else
+ {
+ ++$count;
+ $start[$count] = $nchar;
+ $end[$count] = $nchar;
+ $cats[$count] = $category;
+ }
+}
+
+# Update a case table. We handle case tables specially because we
+# want to map (e.g.) a block of uppercase characters directly onto the
+# corresponding block of lowercase characters. Therefore we generate
+# a new entry when the block would no longer map directly.
+sub update_case_block
+{
+ local (*start, *end, *map, $char, $mapchar) = @_;
+
+ local ($nchar) = hex ($char);
+ local ($nmap) = hex ($mapchar);
+
+ local ($count) = $#end;
+ if ($count >= 0
+ && $end[$count] == $nchar - 1
+ && $nchar - $start[$count] == $nmap - $map[$count])
+ {
+ ++$end[$count];
+ }
+ else
+ {
+ ++$count;
+ $start[$count] = $nchar;
+ $end[$count] = $nchar;
+ $map[$count] = $nmap;
+ }
+}
+
+# Set the attribute value for the character. Each character can have
+# only one attribute.
+sub set_attribute
+{
+ local ($ncode, $attr) = @_;
+
+ if ($attributes{$ncode} ne '' && $attributes{$ncode} ne $attr)
+ {
+ if ($ncode >= $ROMAN_START && $ncode <= $ROMAN_END)
+ {
+ $second_attributes{$ncode} = $attr;
+ }
+ else
+ {
+ printf STDERR "character \\u%04x already has attribute\n", $ncode;
+ }
+ }
+ # Attributes can be interpreted as unsigned in some situations,
+ # so we check against 65535. This could cause errors -- we need
+ # to check the interpretation here.
+ elsif ($attr < -32768 || $attr > 65535)
+ {
+ printf STDERR "attribute out of range for character \\u%04x\n", $ncode;
+ }
+ else
+ {
+ $attributes{$ncode} = $attr;
+ }
+}
+
+
+# Print a block table.
+sub print_block
+{
+ local ($title, *start, *end) = @_;
+
+ print CHARTABLE "static const jchar ", $title, "[][2] =\n";
+ print CHARTABLE " {\n";
+
+ local ($i) = 0;
+ while ($i <= $#start)
+ {
+ print CHARTABLE " { ";
+ &print_char ($start[$i]);
+ print CHARTABLE ", ";
+ &print_char ($end[$i]);
+ print CHARTABLE " }";
+ print CHARTABLE "," if ($i != $#start);
+ print CHARTABLE "\n";
+ ++$i;
+ $bytes += 4; # Two bytes per char.
+ }
+
+ print CHARTABLE " };\n\n";
+}
+
+# Print the numerics table.
+sub print_numerics
+{
+ local ($i, $key, $count, @keys);
+
+ $i = 0;
+ @keys = sort keys %numerics;
+ $count = @keys;
+
+ print CHARTABLE "static const jchar numeric_table[] =\n";
+ print CHARTABLE " { ";
+ foreach $key (@keys)
+ {
+ &print_char (hex ($key));
+ ++$i;
+ print CHARTABLE ", " if $i < $count;
+ # Print 5 per line.
+ print CHARTABLE "\n " if ($i % 5 == 0);
+ $bytes += 2; # One character.
+ }
+ print CHARTABLE " };\n\n";
+
+ print CHARTABLE "static const jshort numeric_value[] =\n";
+ print CHARTABLE " { ";
+ $i = 0;
+ foreach $key (@keys)
+ {
+ print CHARTABLE $numerics{$key};
+ if ($numerics{$key} > 32767 || $numerics{$key} < -32768)
+ {
+ # This means our generated type info is incorrect. We
+ # could just detect and work around this here, but I'm
+ # lazy.
+ print STDERR "numeric value won't fit in a short\n";
+ $status = 1;
+ }
+ ++$i;
+ print CHARTABLE ", " if $i < $count;
+ # Print 10 per line.
+ print CHARTABLE "\n " if ($i % 10 == 0);
+ $bytes += 2; # One short.
+ }
+ print CHARTABLE " };\n\n";
+}
+
+# Print a table that maps one single letter onto another. It assumes
+# the map is index by char code.
+sub print_single_map
+{
+ local ($title, %map) = @_;
+
+ local (@keys) = sort keys %map;
+ $num = @keys;
+ print CHARTABLE "static const jchar ", $title, "[][2] =\n";
+ print CHARTABLE " {\n";
+ $i = 0;
+ for $key (@keys)
+ {
+ print CHARTABLE " { ";
+ &print_char (hex ($key));
+ print CHARTABLE ", ";
+ &print_char (hex ($map{$key}));
+ print CHARTABLE " }";
+ ++$i;
+ if ($i < $num)
+ {
+ print CHARTABLE ",";
+ }
+ else
+ {
+ print CHARTABLE " ";
+ }
+ print CHARTABLE " // ", $name{$key}, "\n";
+ $bytes += 4; # Two bytes per char.
+ }
+ print CHARTABLE " };\n\n";
+}
+
+# Print the `all' block.
+sub print_all_block
+{
+ local (*start, *end, *cats) = @_;
+
+ &print_block ("all_table", *start, *end);
+
+ local ($i) = 0;
+ local ($sum) = 0;
+ while ($i <= $#start)
+ {
+ $sum += $end[$i] - $start[$i] + 1;
+ ++$i;
+ }
+ # We do this computation just to make sure it isn't cheaper to
+ # simply list all the characters individually.
+ printf STDERR ("all_table encodes %d characters in %d entries\n",
+ $sum, $#start + 1);
+
+ print CHARTABLE "static const jbyte category_table[] =\n";
+ print CHARTABLE " { ";
+
+ $i = 0;
+ while ($i <= $#cats)
+ {
+ if ($i > 0 && $cats[$i] eq $cats[$i - 1])
+ {
+ # This isn't an error. We can have a duplicate because
+ # two ranges are not adjacent while the intervening
+ # characters are left out of the table for other reasons.
+ # We could exploit this to make the table a little smaller.
+ # printf STDERR "Duplicate all entry at \\u%04x\n", $start[$i];
+ }
+ print CHARTABLE 'java::lang::Character::', $category_map{$cats[$i]};
+ print CHARTABLE ", " if ($i < $#cats);
+ ++$i;
+ print CHARTABLE "\n ";
+ ++$bytes;
+ }
+ print CHARTABLE " };\n\n";
+}
+
+# Print case table.
+sub print_case_table
+{
+ local ($title, *start, *end, *map, *anomalous) = @_;
+
+ &print_block ($title . '_case_table', *start, *end);
+
+ print CHARTABLE "static const jchar ", $title, "_case_map_table[] =\n";
+ print CHARTABLE " { ";
+
+ local ($i) = 0;
+ while ($i <= $#map)
+ {
+ &print_char ($map[$i]);
+ print CHARTABLE ", " if $i < $#map;
+ ++$i;
+ print CHARTABLE "\n " if $i % 5 == 0;
+ $bytes += 2;
+ }
+ print CHARTABLE " };\n";
+
+
+ local ($key, @keys);
+ @keys = sort keys %anomalous;
+
+ if ($title eq 'upper')
+ {
+ if ($#keys >= 0)
+ {
+ # If these are found we need to change Character.isUpperCase.
+ print STDERR "Found anomalous upper case characters\n";
+ $status = 1;
+ }
+ }
+ else
+ {
+ print CHARTABLE "\n";
+ print CHARTABLE "static const jchar ", $title, "_anomalous_table[] =\n";
+ print CHARTABLE " { ";
+ $i = 0;
+ foreach $key (@keys)
+ {
+ &print_char (hex ($key));
+ print CHARTABLE ", " if $i < $#keys;
+ ++$i;
+ print CHARTABLE "\n " if $i % 5 == 0;
+ $bytes += 2;
+ }
+ print CHARTABLE " };\n";
+ }
+
+ print CHARTABLE "\n";
+}
+
+# Print the type table and attributes table for the fast version.
+sub print_fast_tables
+{
+ local (*start, *end, *cats, *atts, *second_atts) = @_;
+
+ print CHARTABLE "static const jbyte type_table[] =\n{ ";
+
+ local ($i, $j);
+ for ($i = 0; $i <= $#cats; ++$i)
+ {
+ for ($j = $start[$i]; $j <= $end[$i]; ++$j)
+ {
+ print CHARTABLE 'java::lang::Character::', $category_map{$cats[$i]};
+ print CHARTABLE "," if ($i < $#cats || $j < $end[$i]);
+ print CHARTABLE "\n ";
+ }
+ }
+ print CHARTABLE "\n };\n\n";
+
+ print CHARTABLE "static const jshort attribute_table[] =\n{ ";
+ for ($i = 0; $i <= 0xffff; ++$i)
+ {
+ $atts{$i} = 0 if ! defined $atts{$i};
+ print CHARTABLE $atts{$i};
+ print CHARTABLE ", " if $i < 0xffff;
+ print CHARTABLE "\n " if $i % 5 == 1;
+ }
+ print CHARTABLE "\n };\n\n";
+
+ print CHARTABLE "static const jshort secondary_attribute_table[] =\n{ ";
+ for ($i = $ROMAN_START; $i <= $ROMAN_END; ++$i)
+ {
+ print CHARTABLE $second_atts{$i};
+ print CHARTABLE ", " if $i < $ROMAN_END;
+ print CHARTABLE "\n " if $i % 5 == 1;
+ }
+ print CHARTABLE "\n };\n\n";
+}
+
+# Print a character constant.
+sub print_char
+{
+ local ($ncode) = @_;
+ printf CHARTABLE "0x%04x", $ncode;
+}