diff options
Diffstat (limited to 'libjava/chartables.pl')
-rw-r--r-- | libjava/chartables.pl | 965 |
1 files changed, 965 insertions, 0 deletions
diff --git a/libjava/chartables.pl b/libjava/chartables.pl new file mode 100644 index 0000000..fbcfb27 --- /dev/null +++ b/libjava/chartables.pl @@ -0,0 +1,965 @@ +# chartables.pl - A perl program to generate tables for use by the +# Character class. + +# Copyright (C) 1998, 1999 Cygnus Solutions +# +# This file is part of libjava. +# +# This software is copyrighted work licensed under the terms of the +# Libjava License. Please consult the file "LIBJAVA_LICENSE" for +# details. + +# This program requires a `unidata.txt' file of the form distributed +# on the Unicode 2.0 CD ROM. Or, get it more conveniently here: +# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt +# Version `2.1.8' of this file was last used to update the Character class. + +# Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3 +# "The Java Language Specification", ISBN 0-201-63451-1 +# plus online API docs for JDK 1.2 beta from http://www.javasoft.com. + +# Usage: perl chartables.pl [-n] UnicodeData-VERSION.txt +# If this exits with nonzero status, then you must investigate the +# cause of the problem. +# Diagnostics and other information to stderr. +# This creates the new include/java-chartables.h and +# include/java-chardecomp.h files directly. +# With -n, the files are not created, but all processing +# still occurs. + +# Fields in the table. +$CODE = 0; +$NAME = 1; +$CATEGORY = 2; +$DECOMPOSITION = 5; +$DECIMAL = 6; +$DIGIT = 7; +$NUMERIC = 8; +$UPPERCASE = 12; +$LOWERCASE = 13; +$TITLECASE = 14; + +# A special case. +$TAMIL_DIGIT_ONE = 0x0be7; +$TAMIL_DIGIT_NINE = 0x0bef; + +# These are endpoints of legitimate gaps in the tables. +$CJK_IDEOGRAPH_END = 0x9fa5; +$HANGUL_END = 0xd7a3; +$HIGH_SURROGATE_END = 0xdb7f; +$PRIVATE_HIGH_SURROGATE_END = 0xdbff; +$LOW_SURROGATE_END = 0xdfff; +$PRIVATE_END = 0xf8ff; + +%title_to_upper = (); +%title_to_lower = (); +%numerics = (); +%name = (); + +@digit_start = (); +@digit_end = (); + +@space_start = (); +@space_end = (); + +# @letter_start = (); +# @letter_end = (); + +@all_start = (); +@all_end = (); +@all_cats = (); + +@upper_start = (); +@upper_end = (); +@upper_map = (); +%upper_anom = (); + +@lower_start = (); +@lower_end = (); +@lower_map = (); +%lower_anom = (); + +@attributes = (); + +# There are a few characters which actually need two attributes. +# These are special-cased. +$ROMAN_START = 0x2160; +$ROMAN_END = 0x217f; +%second_attributes = (); + +$prevcode = -1; +$status = 0; + +%category_map = +( + 'Mn' => 'NON_SPACING_MARK', + 'Mc' => 'COMBINING_SPACING_MARK', + 'Me' => 'ENCLOSING_MARK', + 'Nd' => 'DECIMAL_DIGIT_NUMBER', + 'Nl' => 'LETTER_NUMBER', + 'No' => 'OTHER_NUMBER', + 'Zs' => 'SPACE_SEPARATOR', + 'Zl' => 'LINE_SEPARATOR', + 'Zp' => 'PARAGRAPH_SEPARATOR', + 'Cc' => 'CONTROL', + 'Cf' => 'FORMAT', + 'Cs' => 'SURROGATE', + 'Co' => 'PRIVATE_USE', + 'Cn' => 'UNASSIGNED', + 'Lu' => 'UPPERCASE_LETTER', + 'Ll' => 'LOWERCASE_LETTER', + 'Lt' => 'TITLECASE_LETTER', + 'Lm' => 'MODIFIER_LETTER', + 'Lo' => 'OTHER_LETTER', + 'Pc' => 'CONNECTOR_PUNCTUATION', + 'Pd' => 'DASH_PUNCTUATION', + 'Ps' => 'START_PUNCTUATION', + 'Pe' => 'END_PUNCTUATION', + 'Pi' => 'START_PUNCTUATION', + 'Pf' => 'END_PUNCTUATION', + 'Po' => 'OTHER_PUNCTUATION', + 'Sm' => 'MATH_SYMBOL', + 'Sc' => 'CURRENCY_SYMBOL', + 'Sk' => 'MODIFIER_SYMBOL', + 'So' => 'OTHER_SYMBOL' + ); + +# These maps characters to their decompositions. +%canonical_decomposition = (); +%full_decomposition = (); + + +# Handle `-n' and open output files. +local ($f1, $f2) = ('include/java-chartables.h', + 'include/java-chardecomp.h'); +if ($ARGV[0] eq '-n') +{ + shift @ARGV; + $f1 = '/dev/null'; + $f2 = '/dev/null'; +} + +open (CHARTABLE, "> $f1"); +open (DECOMP, "> $f2"); + +# Process the Unicode file. +while (<>) +{ + chop; + # Specify a limit for split so that we pick up trailing fields. + # We make the limit larger than we need, to catch the case where + # there are extra fields. + @fields = split (';', $_, 30); + # Convert code to number. + $ncode = hex ($fields[$CODE]); + + if ($#fields != 14) + { + print STDERR ("Entry for \\u", $fields[$CODE], + " has wrong number of fields: ", $#fields, "\n"); + } + + $name{$fields[$CODE]} = $fields[$NAME]; + + # If we've found a gap in the table, fill it in. + if ($ncode != $prevcode + 1) + { + &process_gap (*fields, $prevcode, $ncode); + } + + &process_char (*fields, $ncode); + + $prevcode = $ncode; +} + +if ($prevcode != 0xffff) +{ + # Setting of `fields' parameter doesn't matter here. + &process_gap (*fields, $prevcode, 0x10000); +} + +print CHARTABLE "// java-chartables.h - Character tables for java.lang.Character -*- c++ -*-\n\n"; +print CHARTABLE "#ifndef __JAVA_CHARTABLES_H__\n"; +print CHARTABLE "#define __JAVA_CHARTABLES_H__\n\n"; +print CHARTABLE "// These tables are automatically generated by the chartables.pl\n"; +print CHARTABLE "// script. DO NOT EDIT the tables. Instead, fix the script\n"; +print CHARTABLE "// and run it again.\n\n"; +print CHARTABLE "// This file should only be included by natCharacter.cc\n\n"; + + +$bytes = 0; + +# Titlecase mapping tables. +if ($#title_to_lower != $#title_to_upper) +{ + # If this fails we need to reimplement toTitleCase. + print STDERR "titlecase mappings have different sizes\n"; + $status = 1; +} +# Also ensure that the tables are entirely parallel. +foreach $key (sort keys %title_to_lower) +{ + if (! defined $title_to_upper{$key}) + { + print STDERR "titlecase mappings have different entries\n"; + $status = 1; + } +} +&print_single_map ("title_to_lower_table", %title_to_lower); +&print_single_map ("title_to_upper_table", %title_to_upper); + +print CHARTABLE "#ifdef COMPACT_CHARACTER\n\n"; + +printf CHARTABLE "#define TAMIL_DIGIT_ONE 0x%04x\n\n", $TAMIL_DIGIT_ONE; + +# All numeric values. +&print_numerics; + +# Digits only. +&print_block ("digit_table", *digit_start, *digit_end); + +# Space characters. +&print_block ("space_table", *space_start, *space_end); + +# Letters. We used to generate a separate letter table. But this +# doesn't really seem worthwhile. Simply using `all_table' saves us +# about 800 bytes, and only adds 3 table probes to isLetter. +# &print_block ("letter_table", *letter_start, *letter_end); + +# Case tables. +&print_case_table ("upper", *upper_start, *upper_end, *upper_map, *upper_anom); +&print_case_table ("lower", *lower_start, *lower_end, *lower_map, *lower_anom); + +# Everything else. +&print_all_block (*all_start, *all_end, *all_cats); + +print CHARTABLE "#else /* COMPACT_CHARACTER */\n\n"; + +printf CHARTABLE "#define ROMAN_START 0x%04x\n", $ROMAN_START; +printf CHARTABLE "#define ROMAN_END 0x%04x\n\n", $ROMAN_END; + +&print_fast_tables (*all_start, *all_end, *all_cats, + *attributes, *second_attributes); + +print CHARTABLE "#endif /* COMPACT_CHARACTER */\n\n"; + +print CHARTABLE "#endif /* __JAVA_CHARTABLES_H__ */\n"; + +printf STDERR "Approximately %d bytes of data generated (compact case)\n", + $bytes; + + +# Now generate decomposition tables. +printf DECOMP "// java-chardecomp.h - Decomposition character tables -*- c++ -*-\n\n"; +printf DECOMP "#ifndef __JAVA_CHARDECOMP_H__\n"; +printf DECOMP "#define __JAVA_CHARDECOMP_H__\n\n"; +print DECOMP "// These tables are automatically generated by the chartables.pl\n"; +print DECOMP "// script. DO NOT EDIT the tables. Instead, fix the script\n"; +print DECOMP "// and run it again.\n\n"; +print DECOMP "// This file should only be included by natCollator.cc\n\n"; + +print DECOMP "struct decomp_entry\n{\n"; +print DECOMP " jchar key;\n"; +print DECOMP " const char *value;\n"; +print DECOMP "};\n\n"; + +&write_decompositions; + +printf DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n"; + + +close (CHARTABLE); +close (DECOMP); + +exit $status; + + +# Process a gap in the space. +sub process_gap +{ + local (*fields, $prevcode, $ncode) = @_; + local (@gap_fields, $i); + + if ($ncode == $CJK_IDEOGRAPH_END + || $ncode == $HANGUL_END + || $ncode == $HIGH_SURROGATE_END + || $ncode == $PRIVATE_HIGH_SURROGATE_END + || $ncode == $LOW_SURROGATE_END + || $ncode == $PRIVATE_END) + { + # The characters in the gap we just found are known to + # have the same properties as the character at the end of + # the gap. + @gap_fields = @fields; + } + else + { + # This prints too much to be enabled. + # print STDERR "Gap found at \\u", $fields[$CODE], "\n"; + @gap_fields = ('', '', 'Cn', '', '', '', '', '', '', '', '', + '', '', '', ''); + } + + for ($i = $prevcode + 1; $i < $ncode; ++$i) + { + $gap_fields[$CODE] = sprintf ("%04x", $i); + $gap_fields[$NAME] = "CHARACTER " . $gap_fields[$CODE]; + &process_char (*gap_fields, $i); + } +} + +# Process a single character. +sub process_char +{ + local (*fields, $ncode) = @_; + + if ($fields[$DECOMPOSITION] ne '') + { + &add_decomposition ($ncode, $fields[$DECOMPOSITION]); + } + + # If this is a titlecase character, mark it. + if ($fields[$CATEGORY] eq 'Lt') + { + $title_to_upper{$fields[$CODE]} = $fields[$UPPERCASE]; + $title_to_lower{$fields[$CODE]} = $fields[$LOWERCASE]; + } + else + { + # For upper and lower case mappings, we try to build compact + # tables that map range onto range. We specifically want to + # avoid titlecase characters. Java specifies a range check to + # make sure the character is not between 0x2000 and 0x2fff. + # We avoid that here because we need to generate table entries + # -- toLower and toUpper still work in that range. + if ($fields[$UPPERCASE] eq '' + && ($fields[$LOWERCASE] ne '' + || $fields[$NAME] =~ /CAPITAL (LETTER|LIGATURE)/)) + { + if ($fields[$LOWERCASE] ne '') + { + &update_case_block (*upper_start, *upper_end, *upper_map, + $fields[$CODE], $fields[$LOWERCASE]); + &set_attribute ($ncode, hex ($fields[$LOWERCASE])); + } + else + { + $upper_anom{$fields[$CODE]} = 1; + } + } + elsif ($fields[$LOWERCASE] ne '') + { + print STDERR ("Java missed upper case char \\u", + $fields[$CODE], "\n"); + } + elsif ($fields[$CATEGORY] eq 'Lu') + { + # This case is for letters which are marked as upper case + # but for which there is no lower case equivalent. For + # instance, LATIN LETTER YR. + } + + if ($fields[$LOWERCASE] eq '' + && ($fields[$UPPERCASE] ne '' + || $fields[$NAME] =~ /SMALL (LETTER|LIGATURE)/)) + { + if ($fields[$UPPERCASE] ne '') + { + &update_case_block (*lower_start, *lower_end, *lower_map, + $fields[$CODE], $fields[$UPPERCASE]); + &set_attribute ($ncode, hex ($fields[$UPPERCASE])); + } + else + { + $lower_anom{$fields[$CODE]} = 1; + } + } + elsif ($fields[$UPPERCASE] ne '') + { + print STDERR ("Java missed lower case char \\u", + $fields[$CODE], "\n"); + } + elsif ($fields[$CATEGORY] eq 'Ll') + { + # This case is for letters which are marked as lower case + # but for which there is no upper case equivalent. For + # instance, FEMININE ORDINAL INDICATOR. + } + } + + + # If we have a non-decimal numeric value, add it to the list. + if ($fields[$CATEGORY] eq 'Nd' + && ($ncode < 0x2000 || $ncode > 0x2fff) + && $fields[$NAME] =~ /DIGIT/) + { + # This is a digit character that is handled elsewhere. + } + elsif ($fields[$DIGIT] ne '' || $fields[$NUMERIC] ne '') + { + # Do a simple check. + if ($fields[$DECIMAL] ne '') + { + # This catches bugs in an earlier implementation of + # chartables.pl. Now it is here for historical interest + # only. + # print STDERR ("Character \u", $fields[$CODE], + # " would have been missed as digit\n"); + } + + local ($val) = $fields[$DIGIT]; + $val = $fields[$NUMERIC] if $val eq ''; + local ($ok) = 1; + + # If we have a value which is not a positive integer, then we + # set the value to -2 to make life easier for + # Character.getNumericValue. + if ($val !~ m/^[0-9]+$/) + { + if ($fields[$CATEGORY] ne 'Nl' + && $fields[$CATEGORY] ne 'No') + { + # This shows a few errors in the Unicode table. These + # characters have a missing Numeric field, and the `N' + # for the mirrored field shows up there instead. I + # reported these characters to errata@unicode.org on + # Thu Sep 10 1998. They said it will be fixed in the + # 2.1.6 release of the tables. + print STDERR ("Character \u", $fields[$CODE], + " has value but is not numeric; val = '", + $val, "'\n"); + # We skip these. + $ok = 0; + } + $val = "-2"; + } + + if ($ok) + { + $numerics{$fields[$CODE]} = $val; + &set_attribute ($ncode, $val); + } + } + + # We build a table that lists ranges of ordinary decimal values. + # At each step we make sure that the digits are in the correct + # order, with no holes, as this is assumed by Character. If this + # fails, reimplementation is required. This implementation + # dovetails nicely with the Java Spec, which has strange rules for + # what constitutes a decimal value. In particular the Unicode + # name must contain the word `DIGIT'. The spec doesn't directly + # say that digits must have type `Nd' (or that their value must an + # integer), but that can be inferred from the list of digits in + # the book(s). Currently the only Unicode characters whose name + # includes `DIGIT' which would not fit are the Tibetan "half" + # digits. + if ($fields[$CATEGORY] eq 'Nd') + { + if (($ncode < 0x2000 || $ncode > 0x2fff) + && $fields[$NAME] =~ /DIGIT/) + { + &update_digit_block (*digit_start, *digit_end, $fields[$CODE], + $fields[$DECIMAL]); + &set_attribute ($ncode, $fields[$DECIMAL]); + } + else + { + # If this fails then Character.getType will fail. We + # assume that things in `digit_table' are the only + # category `Nd' characters. + print STDERR ("Character \u", $fields[$CODE], + " is class Nd but not in digit table\n"); + $status = 1; + } + } + + # Keep track of space characters. + if ($fields[$CATEGORY] =~ /Z[slp]/) + { + &update_block (*space_start, *space_end, $fields[$CODE]); + } + + # Keep track of letters. + # if ($fields[$CATEGORY] =~ /L[ultmo]/) + # { + # &update_letter_block (*letter_start, *letter_end, $fields[$CODE], + # $fields[$CATEGORY]); + # } + + # Keep track of all characters. You might think we wouldn't have + # to do this for uppercase letters, or other characters we already + # "classify". The problem is that this classification is + # different. E.g., \u216f is uppercase by Java rules, but is a + # LETTER_NUMBER here. + &update_all_block (*all_start, *all_end, *all_cats, + $fields[$CODE], $fields[$CATEGORY]); +} + + +# Called to add a new decomposition. +sub add_decomposition +{ + local ($ncode, $value) = @_; + local ($is_full) = 0; + local ($first) = 1; + local (@decomp) = (); + + foreach (split (' ', $value)) + { + if ($first && /^\<.*\>$/) + { + $is_full = 1; + } + else + { + push (@decomp, hex ($_)); + } + $first = 0; + } + + # We pack the value into a string because this means we can stick + # with Perl 4 features. + local ($s) = pack "I*", @decomp; + if ($is_full) + { + $full_decomposition{$ncode} = $s; + } + else + { + $canonical_decomposition{$ncode} = $s; + } +} + +# Write a single decomposition table. +sub write_single_decomposition +{ + local ($name, $is_canon, %table) = @_; + + printf DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n"; + + local ($key, @expansion, $char); + local ($first_line) = 1; + + for ($key = 0; $key <= 65535; ++$key) + { + next if ! defined $table{$key}; + + printf DECOMP ",\n" + unless $first_line; + $first_line = 0; + + printf DECOMP " { 0x%04x, \"", $key; + + # We represent the expansion as a series of bytes, terminated + # with a double nul. This is ugly, but relatively + # space-efficient. Most expansions are short, but there are a + # few that are very long (e.g. \uFDFA). This means that if we + # chose a fixed-space representation we would waste a lot of + # space. + @expansion = unpack "I*", $table{$key}; + foreach $char (@expansion) + { + printf DECOMP "\\x%02x\\x%02x", ($char / 256), ($char % 256); + } + + printf DECOMP "\" }"; + } + + printf DECOMP "\n};\n\n"; +} + +sub write_decompositions +{ + &write_single_decomposition ('canonical', 1, %canonical_decomposition); + &write_single_decomposition ('full', 0, %full_decomposition); +} + +# We represent a block of characters with a pair of lists. This +# function updates the pair to account for the new character. Returns +# 1 if we added to the old block, 0 otherwise. +sub update_block +{ + local (*start, *end, $char) = @_; + + local ($nchar) = hex ($char); + local ($count) = $#end; + if ($count >= 0 && $end[$count] == $nchar - 1) + { + ++$end[$count]; + return 1; + } + else + { + ++$count; + $start[$count] = $nchar; + $end[$count] = $nchar; + } + return 0; +} + +# Return true if we will be appending this character to the end of the +# existing block. +sub block_append_p +{ + local (*end, $char) = @_; + return $#end >= 0 && $end[$#end] == $char - 1; +} + +# This updates the digit block. This table is much like an ordinary +# block, but it has an extra constraint. +sub update_digit_block +{ + local (*start, *end, $char, $value) = @_; + + &update_block ($start, $end, $char); + local ($nchar) = hex ($char); + + # We want to make sure that the new digit's value is correct for + # its place in the block. However, we special-case Tamil digits, + # since Tamil does not have a digit `0'. + local ($count) = $#start; + if (($nchar < $TAMIL_DIGIT_ONE || $nchar > $TAMIL_DIGIT_NINE) + && $nchar - $start[$count] != $value) + { + # If this fails then Character.digit_value will be wrong. + print STDERR "Character \\u", $char, " violates digit constraint\n"; + $status = 1; + } +} + +# Update letter table. We could be smart about avoiding upper or +# lower case letters, but it is much simpler to just track them all. +sub update_letter_block +{ + local (*start, *end, $char, $category) = @_; + + &update_block (*start, *end, $char); +} + +# Update `all' table. This table holds all the characters we don't +# already categorize for other reasons. FIXME: if a given type has +# very few characters, we should just inline the code. E.g., there is +# only one paragraph separator. +sub update_all_block +{ + local (*start, *end, *cats, $char, $category) = @_; + + local ($nchar) = hex ($char); + local ($count) = $#end; + if ($count >= 0 + && $end[$count] == $nchar - 1 + && $cats[$count] eq $category) + { + ++$end[$count]; + } + else + { + ++$count; + $start[$count] = $nchar; + $end[$count] = $nchar; + $cats[$count] = $category; + } +} + +# Update a case table. We handle case tables specially because we +# want to map (e.g.) a block of uppercase characters directly onto the +# corresponding block of lowercase characters. Therefore we generate +# a new entry when the block would no longer map directly. +sub update_case_block +{ + local (*start, *end, *map, $char, $mapchar) = @_; + + local ($nchar) = hex ($char); + local ($nmap) = hex ($mapchar); + + local ($count) = $#end; + if ($count >= 0 + && $end[$count] == $nchar - 1 + && $nchar - $start[$count] == $nmap - $map[$count]) + { + ++$end[$count]; + } + else + { + ++$count; + $start[$count] = $nchar; + $end[$count] = $nchar; + $map[$count] = $nmap; + } +} + +# Set the attribute value for the character. Each character can have +# only one attribute. +sub set_attribute +{ + local ($ncode, $attr) = @_; + + if ($attributes{$ncode} ne '' && $attributes{$ncode} ne $attr) + { + if ($ncode >= $ROMAN_START && $ncode <= $ROMAN_END) + { + $second_attributes{$ncode} = $attr; + } + else + { + printf STDERR "character \\u%04x already has attribute\n", $ncode; + } + } + # Attributes can be interpreted as unsigned in some situations, + # so we check against 65535. This could cause errors -- we need + # to check the interpretation here. + elsif ($attr < -32768 || $attr > 65535) + { + printf STDERR "attribute out of range for character \\u%04x\n", $ncode; + } + else + { + $attributes{$ncode} = $attr; + } +} + + +# Print a block table. +sub print_block +{ + local ($title, *start, *end) = @_; + + print CHARTABLE "static const jchar ", $title, "[][2] =\n"; + print CHARTABLE " {\n"; + + local ($i) = 0; + while ($i <= $#start) + { + print CHARTABLE " { "; + &print_char ($start[$i]); + print CHARTABLE ", "; + &print_char ($end[$i]); + print CHARTABLE " }"; + print CHARTABLE "," if ($i != $#start); + print CHARTABLE "\n"; + ++$i; + $bytes += 4; # Two bytes per char. + } + + print CHARTABLE " };\n\n"; +} + +# Print the numerics table. +sub print_numerics +{ + local ($i, $key, $count, @keys); + + $i = 0; + @keys = sort keys %numerics; + $count = @keys; + + print CHARTABLE "static const jchar numeric_table[] =\n"; + print CHARTABLE " { "; + foreach $key (@keys) + { + &print_char (hex ($key)); + ++$i; + print CHARTABLE ", " if $i < $count; + # Print 5 per line. + print CHARTABLE "\n " if ($i % 5 == 0); + $bytes += 2; # One character. + } + print CHARTABLE " };\n\n"; + + print CHARTABLE "static const jshort numeric_value[] =\n"; + print CHARTABLE " { "; + $i = 0; + foreach $key (@keys) + { + print CHARTABLE $numerics{$key}; + if ($numerics{$key} > 32767 || $numerics{$key} < -32768) + { + # This means our generated type info is incorrect. We + # could just detect and work around this here, but I'm + # lazy. + print STDERR "numeric value won't fit in a short\n"; + $status = 1; + } + ++$i; + print CHARTABLE ", " if $i < $count; + # Print 10 per line. + print CHARTABLE "\n " if ($i % 10 == 0); + $bytes += 2; # One short. + } + print CHARTABLE " };\n\n"; +} + +# Print a table that maps one single letter onto another. It assumes +# the map is index by char code. +sub print_single_map +{ + local ($title, %map) = @_; + + local (@keys) = sort keys %map; + $num = @keys; + print CHARTABLE "static const jchar ", $title, "[][2] =\n"; + print CHARTABLE " {\n"; + $i = 0; + for $key (@keys) + { + print CHARTABLE " { "; + &print_char (hex ($key)); + print CHARTABLE ", "; + &print_char (hex ($map{$key})); + print CHARTABLE " }"; + ++$i; + if ($i < $num) + { + print CHARTABLE ","; + } + else + { + print CHARTABLE " "; + } + print CHARTABLE " // ", $name{$key}, "\n"; + $bytes += 4; # Two bytes per char. + } + print CHARTABLE " };\n\n"; +} + +# Print the `all' block. +sub print_all_block +{ + local (*start, *end, *cats) = @_; + + &print_block ("all_table", *start, *end); + + local ($i) = 0; + local ($sum) = 0; + while ($i <= $#start) + { + $sum += $end[$i] - $start[$i] + 1; + ++$i; + } + # We do this computation just to make sure it isn't cheaper to + # simply list all the characters individually. + printf STDERR ("all_table encodes %d characters in %d entries\n", + $sum, $#start + 1); + + print CHARTABLE "static const jbyte category_table[] =\n"; + print CHARTABLE " { "; + + $i = 0; + while ($i <= $#cats) + { + if ($i > 0 && $cats[$i] eq $cats[$i - 1]) + { + # This isn't an error. We can have a duplicate because + # two ranges are not adjacent while the intervening + # characters are left out of the table for other reasons. + # We could exploit this to make the table a little smaller. + # printf STDERR "Duplicate all entry at \\u%04x\n", $start[$i]; + } + print CHARTABLE 'java::lang::Character::', $category_map{$cats[$i]}; + print CHARTABLE ", " if ($i < $#cats); + ++$i; + print CHARTABLE "\n "; + ++$bytes; + } + print CHARTABLE " };\n\n"; +} + +# Print case table. +sub print_case_table +{ + local ($title, *start, *end, *map, *anomalous) = @_; + + &print_block ($title . '_case_table', *start, *end); + + print CHARTABLE "static const jchar ", $title, "_case_map_table[] =\n"; + print CHARTABLE " { "; + + local ($i) = 0; + while ($i <= $#map) + { + &print_char ($map[$i]); + print CHARTABLE ", " if $i < $#map; + ++$i; + print CHARTABLE "\n " if $i % 5 == 0; + $bytes += 2; + } + print CHARTABLE " };\n"; + + + local ($key, @keys); + @keys = sort keys %anomalous; + + if ($title eq 'upper') + { + if ($#keys >= 0) + { + # If these are found we need to change Character.isUpperCase. + print STDERR "Found anomalous upper case characters\n"; + $status = 1; + } + } + else + { + print CHARTABLE "\n"; + print CHARTABLE "static const jchar ", $title, "_anomalous_table[] =\n"; + print CHARTABLE " { "; + $i = 0; + foreach $key (@keys) + { + &print_char (hex ($key)); + print CHARTABLE ", " if $i < $#keys; + ++$i; + print CHARTABLE "\n " if $i % 5 == 0; + $bytes += 2; + } + print CHARTABLE " };\n"; + } + + print CHARTABLE "\n"; +} + +# Print the type table and attributes table for the fast version. +sub print_fast_tables +{ + local (*start, *end, *cats, *atts, *second_atts) = @_; + + print CHARTABLE "static const jbyte type_table[] =\n{ "; + + local ($i, $j); + for ($i = 0; $i <= $#cats; ++$i) + { + for ($j = $start[$i]; $j <= $end[$i]; ++$j) + { + print CHARTABLE 'java::lang::Character::', $category_map{$cats[$i]}; + print CHARTABLE "," if ($i < $#cats || $j < $end[$i]); + print CHARTABLE "\n "; + } + } + print CHARTABLE "\n };\n\n"; + + print CHARTABLE "static const jshort attribute_table[] =\n{ "; + for ($i = 0; $i <= 0xffff; ++$i) + { + $atts{$i} = 0 if ! defined $atts{$i}; + print CHARTABLE $atts{$i}; + print CHARTABLE ", " if $i < 0xffff; + print CHARTABLE "\n " if $i % 5 == 1; + } + print CHARTABLE "\n };\n\n"; + + print CHARTABLE "static const jshort secondary_attribute_table[] =\n{ "; + for ($i = $ROMAN_START; $i <= $ROMAN_END; ++$i) + { + print CHARTABLE $second_atts{$i}; + print CHARTABLE ", " if $i < $ROMAN_END; + print CHARTABLE "\n " if $i % 5 == 1; + } + print CHARTABLE "\n };\n\n"; +} + +# Print a character constant. +sub print_char +{ + local ($ncode) = @_; + printf CHARTABLE "0x%04x", $ncode; +} |