aboutsummaryrefslogtreecommitdiff
path: root/parse-unidata.tcl
diff options
context:
space:
mode:
author Steve Bennett <steveb@workware.net.au> 2011-10-10 08:19:01 +1000
committer Steve Bennett <steveb@workware.net.au> 2011-10-10 08:19:01 +1000
commit 510a65c687133c537b428992d0b1df6c336deaeb (patch)
tree 4ce036628d4e46767088e3bfdadb032f23bd707a /parse-unidata.tcl
parent dfbde800afdabc83efc9ebe087b1aed6a90136d8 (diff)
download jimtcl-510a65c687133c537b428992d0b1df6c336deaeb.zip
jimtcl-510a65c687133c537b428992d0b1df6c336deaeb.tar.gz
jimtcl-510a65c687133c537b428992d0b1df6c336deaeb.tar.bz2
Simplify the way unicode case mapping is done
Smaller, faster and includes title-case characters.
Signed-off-by: Steve Bennett <steveb@workware.net.au>
Diffstat (limited to 'parse-unidata.tcl')
-rw-r--r-- parse-unidata.tcl 49
1 files changed, 14 insertions, 35 deletions
diff --git a/parse-unidata.tcl b/parse-unidata.tcl
index 4b5ec3a..1a927a3 100644
--- a/parse-unidata.tcl
+++ b/parse-unidata.tcl
@@ -9,56 +9,35 @@
# Parse the unicode data from: http://unicode.org/Public/UNIDATA/UnicodeData.txt
# to generate case mapping tables
+set map(lower) {}
+set map(upper) {}
set f [open [lindex $argv 0]]
-set extoff 0
-puts "static const struct casemap unicode_case_mapping\[\] = \{"
while {[gets $f buf] >= 0} {
foreach {code name class x x x x x x x x x upper lower} [split $buf ";"] break
- set code 0x$code
+ set code [string tolower 0x$code]
if {$code <= 0x7f} {
continue
}
if {$code > 0xffff} {
break
}
- if {$class ne "Lu" && $class ne "Ll"} {
+ if {![string match L* $class]} {
continue
}
- if {$upper eq ""} {
- set upper $code
- } else {
- set upper 0x$upper
+ if {$upper ne ""} {
+ lappend map(upper) $code [string tolower 0x$upper]
}
- if {$lower eq ""} {
- set lower $code
- } else {
- set lower 0x$lower
+ if {$lower ne ""} {
+ lappend map(lower) $code [string tolower 0x$lower]
}
- if {$upper == $code && $lower == $code} {
- continue
- }
- set l [expr {$lower - $code}]
- set u [expr {$upper - $code}]
- if {abs($u) > 127 || abs($l) > 127} {
- # Can't encode both in one byte, so use indirection
- lappend jumptable $code $lower $upper
- set l -128
- set u $extoff
- incr extoff
- if {$extoff > 0xff} {
- error "Too many entries in the offset table!"
- }
- }
- set entry [string tolower "$code, $l, $u"]
- puts " { $entry },"
}
close $f
-puts "\};\n"
-# Now the jump table
-puts "static const struct caseextmap unicode_extmap\[\] = \{"
-foreach {c l u} $jumptable {
- puts " { $l, $u },"
+foreach type {upper lower} {
+ puts "static const struct casemap unicode_case_mapping_$type\[\] = \{"
+ foreach {code alt} $map($type) {
+ puts "\t{ $code, $alt },"
+ }
+ puts "\};\n"
}
-puts "\};\n"