From 6b285bb9542260e2d6d817fea8a1ee5a61181ef6 Mon Sep 17 00:00:00 2001 From: Steve Bennett Date: Sun, 31 Dec 2017 11:01:27 +1000 Subject: utf8: Fix merging of adjacent wide character ranges Adjacent wide character ranges were not being merged correctly, and the final range was not being output. Fix this, and also merge adjacent combining character ranges. Signed-off-by: Steve Bennett --- parse-unidata.tcl | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'parse-unidata.tcl') diff --git a/parse-unidata.tcl b/parse-unidata.tcl index e5b0c47..2bf6547 100644 --- a/parse-unidata.tcl +++ b/parse-unidata.tcl @@ -84,6 +84,27 @@ proc output-int-pairs {list} { } } +# Merges adjacent ranges in a list of ranges (lower upper lower upper ...) +proc combine-adjacent-ranges {list} { + set newlist {} + foreach {lower upper} $list { + if {[info exists prev_upper]} { + if {$lower == $prev_upper + 1} { + # combine these + set prev_upper $upper + continue + } else { + # can't combine + lappend newlist $prev_lower $prev_upper + } + } + set prev_lower $lower + set prev_upper $upper + } + # Now add the last range + lappend newlist $prev_lower $prev_upper + return $newlist +} foreach type {upper lower title} { puts "static const struct casemap unicode_case_mapping_$type\[\] = \{" @@ -94,23 +115,13 @@ foreach type {upper lower title} { if {$do_width} { set f [open $widthfile] while {[gets $f buf] >= 0} { - if {[regexp {^([0-9A-F.]+);W} $buf -> range]} { + if {[regexp {^([0-9A-Fa-f.]+);W} $buf -> range]} { + set range [string tolower $range] lassign [split $range .] lower - upper if {$upper eq ""} { set upper $lower } - set lower 0x$lower - set upper 0x$upper - if {[info exists endrange]} { - if {$upper == $endrange + 1} { - # Just extend the range - set endrange $upper - continue - } - lappend map(wide) $startrange $endrange - } - set startrange $lower - set endrange $upper + lappend map(wide) 0x$lower 0x$upper } } close $f @@ -119,7 +130,7 @@ if {$do_width} { foreach type {combining wide} { puts "static const struct utf8range unicode_range_$type\[\] = \{" if {$do_width} { - output-int-pairs $map($type) + output-int-pairs [combine-adjacent-ranges $map($type)] } else { # Just produce empty width tables in this case output-int-pairs {} -- cgit v1.1