aboutsummaryrefslogtreecommitdiff
path: root/parse-unidata.tcl
blob: 4b5ec3aae4cceffb150431ac799966c71ca18793 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env tclsh

# Generate UTF-8 case mapping tables
#
# (c) 2010 Steve Bennett <steveb@workware.net.au>
#
# See LICENCE for licence details.
#/

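# Parse the Unicode character data file (UnicodeData.txt, available from
# http://unicode.org/Public/UNIDATA/UnicodeData.txt), whose path is given as
# the first command-line argument, and write the generated case mapping
# tables as C source on stdout.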

# Read the UnicodeData.txt-format file named by the first command-line
# argument and write the primary case mapping table as C source on stdout.
#
# Each emitted row is "{ code, lower_delta, upper_delta }". When either delta
# does not fit in the range -127..127, the row instead carries -128 as the
# lower delta and an index into a secondary table (collected in 'jumptable')
# as the upper delta.
if {[llength $argv] < 1} {
	puts stderr "Usage: $argv0 UnicodeData.txt"
	exit 1
}
set f [open [lindex $argv 0]]
set extoff 0
# Must exist even when no entry needs indirection, since it is read
# unconditionally once the main table has been written.
set jumptable {}
puts "static const struct casemap unicode_case_mapping\[\] = \{"
while {[gets $f buf] >= 0} {
	# Fields are ';' separated: 0 = code point, 2 = general category,
	# 12 = simple uppercase mapping, 13 = simple lowercase mapping.
	# The 'break' makes foreach act as a one-shot multiple assignment.
	foreach {code name class x x x x x x x x x upper lower} [split $buf ";"] break
	# Skip blank or malformed lines rather than failing later in expr
	if {![string is xdigit -strict $code]} {
		continue
	}
	set code 0x$code
	# ASCII is not included in the table
	if {$code <= 0x7f} {
		continue
	}
	# Only code points up to 0xffff are emitted; this relies on the input
	# being sorted by code point, so everything after this is out of range.
	if {$code > 0xffff} {
		break
	}
	# Only upper and lower case letters are considered
	if {$class ne "Lu" && $class ne "Ll"} {
		continue
	}
	# An empty mapping field means the character maps to itself
	if {$upper eq ""} {
		set upper $code
	} else {
		set upper 0x$upper
	}
	if {$lower eq ""} {
		set lower $code
	} else {
		set lower 0x$lower
	}
	# Nothing to record if neither mapping changes the character
	if {$upper == $code && $lower == $code} {
		continue
	}
	set l [expr {$lower - $code}]
	set u [expr {$upper - $code}]
	if {abs($u) > 127 || abs($l) > 127} {
		# Can't encode both in one byte, so use indirection:
		# -128 marks the row, and the upper slot holds the jump table index
		lappend jumptable $code $lower $upper
		set l -128
		set u $extoff
		incr extoff
		# NOTE(review): the limit presumably reflects a one-byte index
		# field in the C struct -- confirm against the consumer.
		if {$extoff > 0xff} {
			error "Too many entries in the offset table!"
		}
	}
	set entry [string tolower "$code, $l, $u"]
	puts "    { $entry },"
}
close $f
puts "\};\n"

# Emit the secondary table: one "{ lower, upper }" row for every entry that
# was queued in 'jumptable' (stored as flat code/lower/upper triples).
puts "static const struct caseextmap unicode_extmap\[\] = \{"
foreach {codepoint lowercase uppercase} $jumptable {
	puts [format "    { %s, %s }," $lowercase $uppercase]
}
# Close the array and leave one blank line after it
puts "\};"
puts ""