aboutsummaryrefslogtreecommitdiff
path: root/libjava/scripts/unicode-decomp.pl
blob: 8aeed152adf01c6c034defb74f1a57a4501c382b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/perl -w
# unicode-decomp.pl - script to generate database for java.text.Collator
# Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
#
# This file is part of libjava.
# 
# This software is copyrighted work licensed under the terms of the
# Libjava License.  Please consult the file "LIBJAVA_LICENSE" for
# details.

# Code for reading UnicodeData.txt and generating the character
# decomposition tables (java-chardecomp.h) that are included by
# natCollator.cc.  For now, the relevant Unicode definition files
# are found in libjava/gnu/gcj/convert/.
#
# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <java-chardecomp.h>
#   where <UnicodeData.txt> is obtained from www.unicode.org (named
#   UnicodeData-3.0.0.txt for Unicode version 3.0.0), and
#   <java-chardecomp.h> is the final location of include/java-chardecomp.h.
#   As of JDK 1.4, use Unicode version 3.0.0 for best results.
#
# If this exits with nonzero status, then you must investigate the
# cause of the problem.
# Diagnostics and other information to stderr.
# With -n, the files are not created, but all processing still occurs.

# These hashes map a character code to its decomposition.  Each value is
# the list of decomposed character codes packed with pack "n*".
my %canonical_decomposition = ();
my %full_decomposition = ();

# Handle `-n' (write the output to /dev/null) and open the input file.
if (defined $ARGV[0] && $ARGV[0] eq '-n')
{
    shift @ARGV;
    # Only substitute the output name when an input file was actually
    # given.  Assigning $ARGV[1] on an empty @ARGV would pad it to
    # (undef, '/dev/null') and slip past the usage check below.
    $ARGV[1] = '/dev/null' if @ARGV;
}
die "Usage: $0 [-n] <UnicodeData.txt> <java-chardecomp.h>\n" unless @ARGV == 2;
# Three-argument open: the file name is taken verbatim and is never
# parsed for a mode or pipe character.
open (UNICODE, '<', $ARGV[0]) or die "Can't open Unicode attribute file: $!\n";

# Process the Unicode file.  Every line is a `;'-separated record; only
# the first field (character code, in hex) and the sixth field (the
# decomposition mapping) are used.
$| = 1;
my $count = 0;
print STDERR "Parsing attributes file";
while (<UNICODE>)
{
    # Progress indicator: one dot for every 1000 input lines.
    print STDERR "." unless $count++ % 1000;
    chomp;
    s/\r//g;
    my ($ch, undef, undef, undef, undef, $decomp) = split ';';

    # Blank or truncated lines have no sixth field.  Skip them here so
    # that hex() and `ne' below never see undef (which warns under -w).
    next unless defined $decomp;
    $ch = hex($ch);

    if ($decomp ne '')
    {
        my $is_full = 0;
        my @decomp = ();
        foreach my $field (split (' ', $decomp))
        {
            # A <tag> such as <compat> marks the mapping as a
            # non-canonical ("full") decomposition; the tag itself is
            # not part of the expansion.
            if ($field =~ /^\<.*\>$/)
            {
                $is_full = 1;
                next;
            }
            push (@decomp, hex ($field));
        }
        # NOTE(review): the "n" template stores 16 bits per value, so a
        # code above 0xFFFF would wrap.  Confirm the input has none
        # before using data newer than the recommended Unicode 3.0.0.
        my $s = pack "n*", @decomp;
        if ($is_full)
        {
            $full_decomposition{$ch} = $s;
        }
        else
        {
            $canonical_decomposition{$ch} = $s;
        }
    }
}
# The input is fully consumed; release the handle.
close (UNICODE);

# Now generate decomposition tables.
# Three-argument open: the output name is used verbatim and is never
# parsed for a mode.
open (DECOMP, '>', $ARGV[1]) or die "Can't open output file: $!\n";
print STDERR "\nGenerating tables\n";
print DECOMP <<EOF;
// java-chardecomp.h - Decomposition character tables -*- c++ -*-

#ifndef __JAVA_CHARDECOMP_H__
#define __JAVA_CHARDECOMP_H__


// These tables are automatically generated by the $0
// script.  DO NOT EDIT the tables.  Instead, fix the script
// and run it again.

// This file should only be included by natCollator.cc

struct decomp_entry
{
  jchar key;
  const char *value;
};

EOF

# The sub is defined further down with a `()' prototype, so it is called
# with `&' (which ignores prototypes) and an explicit empty argument
# list (so the current @_ is not passed along).
&write_decompositions ();

print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";

# Buffered write errors (for example a full disk) only surface when the
# handle is closed.  Check it so that a truncated header makes the
# script exit with nonzero status.
close (DECOMP) or die "Can't write output file: $!\n";
print STDERR "Done\n";
exit;


# Write one C table named `${name}_decomposition' to the DECOMP handle.
#   $name     - prefix of the generated array name.
#   $is_canon - accepted for the caller's benefit; not used here.
#   %table    - character code => expansion packed with pack "n*".
# Only keys in the range 0 .. 0xffff are written, in ascending order.
sub write_single_decomposition($$%)
{
    my ($name, $is_canon, %table) = @_;

    my @rows = ();
    foreach my $code (grep { defined $table{$_} } 0 .. 0xffff)
    {
        # The expansion is emitted as a C string of hex-escaped bytes,
        # two per 16-bit value, high byte first.  A variable-length
        # string is used because most expansions are short while a few
        # (e.g. \uFDFA) are very long, so a fixed-width layout would
        # waste a lot of space.
        # NOTE(review): no explicit terminator bytes are written here;
        # only the C string literal's own trailing nul follows the data.
        # Confirm the termination convention the reader relies on.
        my $bytes = join ('',
                          map { sprintf ("\\x%02x\\x%02x", $_ >> 8, $_ & 0xff) }
                          unpack ("n*", $table{$code}));
        push (@rows, sprintf ("  { 0x%04x, \"", $code) . $bytes . "\" }");
    }

    print DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n";
    # Entries are separated by ",\n"; the last one gets no trailing comma.
    print DECOMP join (",\n", @rows);
    print DECOMP "\n};\n\n";
}

# Emit both tables to the output file: the canonical one first, then
# the full one.
sub write_decompositions()
{
    my @canonical_args = ('canonical', 1, %canonical_decomposition);
    my @full_args = ('full', 0, %full_decomposition);

    &write_single_decomposition (@canonical_args);
    return &write_single_decomposition (@full_args);
}