blob: e5b0c47bea6db94303d1e53ccec4b3dfb0cc6421 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
|
#!/usr/bin/env tclsh
# Generate UTF-8 case mapping tables
#
# (c) 2010 Steve Bennett <steveb@workware.net.au>
#
# See LICENCE for licence details.
#/
# Parse the unicode data from: http://unicode.org/Public/UNIDATA/UnicodeData.txt
# and http://unicode.org/Public/UNIDATA/EastAsianWidth.txt
# to generate case mapping and display width tables
set map(lower) {}
set map(upper) {}
set map(title) {}
set map(combining) {}
set map(wide) {}
set USAGE "Usage: parse-unidata.tcl \[-width\] UnicodeData.txt \[EastAsianWidth.txt\]"
set do_width 0
if {[lindex $argv 0] eq "-width"} {
set do_width 1
set argv [lrange $argv 1 end]
}
if {[llength $argv] ni {1 2}} {
puts stderr $USAGE
exit 1
}
lassign $argv unicodefile widthfile
set f [open $unicodefile]
while {[gets $f buf] >= 0} {
set title ""
set lower ""
set upper ""
lassign [split $buf ";"] code name class x x x x x x x x x upper lower title
set codex [string tolower 0x$code]
if {[string match M* $class]} {
if {![info exists combining]} {
set combining $codex
}
continue
} elseif {[info exists combining]} {
lappend map(combining) $combining $codex
unset combining
}
if {$codex <= 0x7f} {
continue
}
if {$codex > 0xffff} {
break
}
if {![string match L* $class]} {
continue
}
if {$upper ne ""} {
lappend map(upper) $codex [string tolower 0x$upper]
}
if {$lower ne ""} {
lappend map(lower) $codex [string tolower 0x$lower]
}
if {$title ne "" && $title ne $upper} {
if {$title eq $code} {
set title 0
}
lappend map(title) $codex [string tolower 0x$title]
}
}
close $f
proc output-int-pairs {list} {
set n 0
foreach {v1 v2} $list {
puts -nonewline "\t{ $v1, $v2 },"
if {[incr n] % 4 == 0} {
puts ""
}
}
if {$n % 4} {
puts ""
}
}
foreach type {upper lower title} {
puts "static const struct casemap unicode_case_mapping_$type\[\] = \{"
output-int-pairs $map($type)
puts "\};\n"
}
if {$do_width} {
set f [open $widthfile]
while {[gets $f buf] >= 0} {
if {[regexp {^([0-9A-F.]+);W} $buf -> range]} {
lassign [split $range .] lower - upper
if {$upper eq ""} {
set upper $lower
}
set lower 0x$lower
set upper 0x$upper
if {[info exists endrange]} {
if {$upper == $endrange + 1} {
# Just extend the range
set endrange $upper
continue
}
lappend map(wide) $startrange $endrange
}
set startrange $lower
set endrange $upper
}
}
close $f
}
foreach type {combining wide} {
puts "static const struct utf8range unicode_range_$type\[\] = \{"
if {$do_width} {
output-int-pairs $map($type)
} else {
# Just produce empty width tables in this case
output-int-pairs {}
}
puts "\};\n"
}
|