aboutsummaryrefslogtreecommitdiff
path: root/parse-unidata.tcl
blob: c87cc6445f67ff57f7f546c824df0507225c1b7b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env tclsh

# Generate UTF-8 case mapping tables
#
# (c) 2010 Steve Bennett <steveb@workware.net.au>
#
# See LICENCE for licence details.
#/

# Parse the unicode data from: http://unicode.org/Public/UNIDATA/UnicodeData.txt
# and http://unicode.org/Public/UNIDATA/EastAsianWidth.txt
# to generate case mapping and display width tables
set map(lower) {}
set map(upper) {}
set map(title) {}
set map(combining) {}
set map(wide) {}

set USAGE "Usage: parse-unidata.tcl \[-width\] UnicodeData.txt \[EastAsianWidth.txt\]"
set do_width 0

if {[lindex $argv 0] eq "-width"} {
	set do_width 1
	set argv [lrange $argv 1 end]
}

if {[llength $argv] ni {1 2}} {
	puts stderr $USAGE
	exit 1
}

lassign $argv unicodefile widthfile

set f [open $unicodefile]
while {[gets $f buf] >= 0} {
	# Remove any trailing whitespace, especially errant CR
	set buf [string trim $buf]
	set title ""
	set lower ""
	set upper ""
	lassign [split $buf ";"] code name class x x x x x x x x x upper lower title
	set codex [string tolower 0x$code]
	if {[string match M* $class]} {
		if {![info exists combining]} {
			set combining $codex
		}
		continue
	} elseif {[info exists combining]} {
		lappend map(combining) $combining $codex
		unset combining
	}
	if {$codex <= 0x7f} {
		continue
	}
	if {$codex > 0xffff} {
		break
	}
	if {![string match L* $class]} {
		continue
	}
	if {$upper ne ""} {
		lappend map(upper) $codex [string tolower 0x$upper]
	}
	if {$lower ne ""} {
		lappend map(lower) $codex [string tolower 0x$lower]
	}
	if {$title ne "" && $title ne $upper} {
		if {$title eq $code} {
			set title 0
		}
		lappend map(title) $codex [string tolower 0x$title]
	}
}
close $f

proc output-int-pairs {list} {
	set n 0
	foreach {v1 v2} $list {
		puts -nonewline "\t{ $v1, $v2 },"
		if {[incr n] % 4 == 0} {
			puts ""
		}
	}
	if {$n % 4} {
		puts ""
	}
}

# Merges adjacent ranges in a list of ranges (lower upper lower upper ...)
proc combine-adjacent-ranges {list} {
	set newlist {}
	foreach {lower upper} $list {
		if {[info exists prev_upper]} {
			if {$lower == $prev_upper + 1} {
				# combine these
				set prev_upper $upper
				continue
			} else {
				# can't combine
				lappend newlist $prev_lower $prev_upper
			}
		}
		set prev_lower $lower
		set prev_upper $upper
	}
	# Now add the last range
	lappend newlist $prev_lower $prev_upper
	return $newlist
}

foreach type {upper lower title} {
	puts "static const struct casemap unicode_case_mapping_$type\[\] = \{"
	output-int-pairs $map($type)
	puts "\};\n"
}

if {$do_width} {
	set f [open $widthfile]
	while {[gets $f buf] >= 0} {
		# Remove any trailing whitespace, especially errant CR
		set buf [string trim $buf]
		if {[regexp {^([0-9A-Fa-f.]+);[FW]} $buf -> range]} {
			set range [string tolower $range]
			lassign [split $range .] lower - upper
			if {$upper eq ""} {
				set upper $lower
			}
			lappend map(wide) 0x$lower 0x$upper
		}
	}
	close $f
}

foreach type {combining wide} {
	puts "static const struct utf8range unicode_range_$type\[\] = \{"
	if {$do_width} {
		output-int-pairs [combine-adjacent-ranges $map($type)]
	} else {
		# Just produce empty width tables in this case
		output-int-pairs {}
	}
	puts "\};\n"
}