aboutsummaryrefslogtreecommitdiff
path: root/parse-unidata.tcl
blob: e5b0c47bea6db94303d1e53ccec4b3dfb0cc6421 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env tclsh

# Generate UTF-8 case mapping tables
#
# (c) 2010 Steve Bennett <steveb@workware.net.au>
#
# See LICENCE for licence details.
#/

# Parse the unicode data from: http://unicode.org/Public/UNIDATA/UnicodeData.txt
# and http://unicode.org/Public/UNIDATA/EastAsianWidth.txt
# to generate case mapping and display width tables
set map(lower) {}
set map(upper) {}
set map(title) {}
set map(combining) {}
set map(wide) {}

set USAGE "Usage: parse-unidata.tcl \[-width\] UnicodeData.txt \[EastAsianWidth.txt\]"
set do_width 0

if {[lindex $argv 0] eq "-width"} {
	set do_width 1
	set argv [lrange $argv 1 end]
}

if {[llength $argv] ni {1 2}} {
	puts stderr $USAGE
	exit 1
}

lassign $argv unicodefile widthfile

set f [open $unicodefile]
while {[gets $f buf] >= 0} {
	set title ""
	set lower ""
	set upper ""
	lassign [split $buf ";"] code name class x x x x x x x x x upper lower title
	set codex [string tolower 0x$code]
	if {[string match M* $class]} {
		if {![info exists combining]} {
			set combining $codex
		}
		continue
	} elseif {[info exists combining]} {
		lappend map(combining) $combining $codex
		unset combining
	}
	if {$codex <= 0x7f} {
		continue
	}
	if {$codex > 0xffff} {
		break
	}
	if {![string match L* $class]} {
		continue
	}
	if {$upper ne ""} {
		lappend map(upper) $codex [string tolower 0x$upper]
	}
	if {$lower ne ""} {
		lappend map(lower) $codex [string tolower 0x$lower]
	}
	if {$title ne "" && $title ne $upper} {
		if {$title eq $code} {
			set title 0
		}
		lappend map(title) $codex [string tolower 0x$title]
	}
}
close $f

proc output-int-pairs {list} {
	set n 0
	foreach {v1 v2} $list {
		puts -nonewline "\t{ $v1, $v2 },"
		if {[incr n] % 4 == 0} {
			puts ""
		}
	}
	if {$n % 4} {
		puts ""
	}
}


foreach type {upper lower title} {
	puts "static const struct casemap unicode_case_mapping_$type\[\] = \{"
	output-int-pairs $map($type)
	puts "\};\n"
}

if {$do_width} {
	set f [open $widthfile]
	while {[gets $f buf] >= 0} {
		if {[regexp {^([0-9A-F.]+);W} $buf -> range]} {
			lassign [split $range .] lower - upper
			if {$upper eq ""} {
				set upper $lower
			}
			set lower 0x$lower
			set upper 0x$upper
			if {[info exists endrange]} {
				if {$upper == $endrange + 1} {
					# Just extend the range
					set endrange $upper
					continue
				}
				lappend map(wide) $startrange $endrange
			}
			set startrange $lower
			set endrange $upper
		}
	}
	close $f
}

foreach type {combining wide} {
	puts "static const struct utf8range unicode_range_$type\[\] = \{"
	if {$do_width} {
		output-int-pairs $map($type)
	} else {
		# Just produce empty width tables in this case
		output-int-pairs {}
	}
	puts "\};\n"
}