aboutsummaryrefslogtreecommitdiff
path: root/parse-unidata.tcl
diff options
context:
space:
mode:
authorSteve Bennett <steveb@workware.net.au>2016-09-04 13:54:59 +1000
committerSteve Bennett <steveb@workware.net.au>2016-09-05 09:40:26 +1000
commited4923bc71b7630cc34328b32704e806bb10c614 (patch)
tree024ee39a3b430a447bc2fc1a49a0110a21cef013 /parse-unidata.tcl
parentc672379dca1fe3fa7b89e1d8c6b1a1e570bb4043 (diff)
downloadjimtcl-ed4923bc71b7630cc34328b32704e806bb10c614.zip
jimtcl-ed4923bc71b7630cc34328b32704e806bb10c614.tar.gz
jimtcl-ed4923bc71b7630cc34328b32704e806bb10c614.tar.bz2
Add utf-8 display width info
Needed when using linenoise line editing with utf-8 support. Note that variable width support is not yet added to linenoise. Signed-off-by: Steve Bennett <steveb@workware.net.au>
Diffstat (limited to 'parse-unidata.tcl')
-rw-r--r--parse-unidata.tcl71
1 files changed, 67 insertions, 4 deletions
diff --git a/parse-unidata.tcl b/parse-unidata.tcl
index 348a114..b3beec1 100644
--- a/parse-unidata.tcl
+++ b/parse-unidata.tcl
@@ -8,18 +8,59 @@
#/
# Parse the unicode data from: http://unicode.org/Public/UNIDATA/UnicodeData.txt
-# to generate case mapping tables
+# to generate case mapping and display width tables
set map(lower) {}
set map(upper) {}
set map(title) {}
+set map(combining) {}
-set f [open [lindex $argv 0]]
+# Message shown on stderr when the command line is not understood
+set USAGE "Usage: parse-unidata.tcl \[-width\] UnicodeData.txt"
+
+# Scan the arguments: every "-width" bumps do_width, anything else is
+# the input filename, which must be given exactly once
+set do_width 0
+foreach arg $argv {
+ if {$arg eq "-width"} {
+ incr do_width
+ continue
+ }
+ if {[info exists filename]} {
+ puts stderr $USAGE
+ exit 1
+ }
+ set filename $arg
+}
+if {[info exists filename] == 0} {
+ puts stderr $USAGE
+ exit 1
+}
+
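+# Width data is not available in UnicodeData.txt (Unicode publishes it separately in EastAsianWidth.txt), so the wide ranges are hard-coded here as pairs of range bounds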
+set map(wide) {
+ 0x1100 0x115f 0x2329 0x232a 0x2e80 0x2e99 0x2e9b 0x2ef3
+ 0x2f00 0x2fd5 0x2ff0 0x2ffb 0x3000 0x303e 0x3041 0x3096
+ 0x3099 0x30ff 0x3105 0x312d 0x3131 0x318e 0x3190 0x31ba
+ 0x31c0 0x31e3 0x31f0 0x321e 0x3220 0x3247 0x3250 0x4dbf
+ 0x4e00 0xa48c 0xa490 0xa4c6 0xa960 0xa97c 0xac00 0xd7a3
+ 0xf900 0xfaff 0xfe10 0xfe19 0xfe30 0xfe52 0xfe54 0xfe66
+ 0xfe68 0xfe6b 0xff01 0xffe6 0x1b000 0x1b001 0x1f200 0x1f202
+ 0x1f210 0x1f23a 0x1f240 0x1f248 0x1f250 0x1f251 0x20000 0x3fffd
+}
+
+set f [open $filename]
while {[gets $f buf] >= 0} {
set title ""
set lower ""
set upper ""
foreach {code name class x x x x x x x x x upper lower title} [split $buf ";"] break
set codex [string tolower 0x$code]
+ # Collect runs of consecutive entries whose class starts with "M":
+ # remember the first code point of a run, then record the pair
+ # (run start, first following code point outside the run).
+ if {[string match M* $class]} {
+ if {![info exists combining]} {
+ set combining $codex
+ }
+ continue
+ } elseif {[info exists combining]} {
+ # "info exists" (not the Jim-only "exists" command) so the script
+ # also runs under standard tclsh
+ lappend map(combining) $combining $codex
+ unset combining
+ }
if {$codex <= 0x7f} {
continue
}
@@ -44,10 +85,32 @@ while {[gets $f buf] >= 0} {
}
close $f
+# Print the elements of list two at a time as "{ a, b }," entries,
+# each preceded by a tab, four entries per output line. A final
+# newline is added when the last line holds fewer than four entries.
+proc output-int-pairs {list} {
+ set count 0
+ foreach {first second} $list {
+ puts -nonewline "\t{ $first, $second },"
+ incr count
+ if {$count % 4 == 0} {
+ puts ""
+ }
+ }
+ if {$count % 4 != 0} {
+ puts ""
+ }
+}
+
foreach type {upper lower title} {
puts "static const struct casemap unicode_case_mapping_$type\[\] = \{"
- foreach {code alt} $map($type) {
- puts "\t{ $code, $alt },"
+ output-int-pairs $map($type)
+ puts "\};\n"
+}
+
+foreach type {combining wide} {
+ puts "static const struct utf8range unicode_range_$type\[\] = \{"
+ if {$do_width} {
+ output-int-pairs $map($type)
+ } else {
+ # Just produce empty width tables in this case
+ output-int-pairs {}
}
puts "\};\n"
}