diff options
author | Steve Bennett <steveb@workware.net.au> | 2011-10-10 08:19:01 +1000 |
---|---|---|
committer | Steve Bennett <steveb@workware.net.au> | 2011-10-10 08:19:01 +1000 |
commit | 510a65c687133c537b428992d0b1df6c336deaeb (patch) | |
tree | 4ce036628d4e46767088e3bfdadb032f23bd707a | |
parent | dfbde800afdabc83efc9ebe087b1aed6a90136d8 (diff) | |
download | jimtcl-510a65c687133c537b428992d0b1df6c336deaeb.zip jimtcl-510a65c687133c537b428992d0b1df6c336deaeb.tar.gz jimtcl-510a65c687133c537b428992d0b1df6c336deaeb.tar.bz2 |
Simplify the way unicode case mapping is done
Smaller, faster and includes title-case characters.
Signed-off-by: Steve Bennett <steveb@workware.net.au>
-rw-r--r-- | parse-unidata.tcl | 49 | ||||
-rw-r--r-- | tests/utf8.test | 4 | ||||
-rw-r--r-- | utf8.c | 47 |
3 files changed, 37 insertions, 63 deletions
diff --git a/parse-unidata.tcl b/parse-unidata.tcl index 4b5ec3a..1a927a3 100644 --- a/parse-unidata.tcl +++ b/parse-unidata.tcl @@ -9,56 +9,35 @@ # Parse the unicode data from: http://unicode.org/Public/UNIDATA/UnicodeData.txt # to generate case mapping tables +set map(lower) {} +set map(upper) {} set f [open [lindex $argv 0]] -set extoff 0 -puts "static const struct casemap unicode_case_mapping\[\] = \{" while {[gets $f buf] >= 0} { foreach {code name class x x x x x x x x x upper lower} [split $buf ";"] break - set code 0x$code + set code [string tolower 0x$code] if {$code <= 0x7f} { continue } if {$code > 0xffff} { break } - if {$class ne "Lu" && $class ne "Ll"} { + if {![string match L* $class]} { continue } - if {$upper eq ""} { - set upper $code - } else { - set upper 0x$upper + if {$upper ne ""} { + lappend map(upper) $code [string tolower 0x$upper] } - if {$lower eq ""} { - set lower $code - } else { - set lower 0x$lower + if {$lower ne ""} { + lappend map(lower) $code [string tolower 0x$lower] } - if {$upper == $code && $lower == $code} { - continue - } - set l [expr {$lower - $code}] - set u [expr {$upper - $code}] - if {abs($u) > 127 || abs($l) > 127} { - # Can't encode both in one byte, so use indirection - lappend jumptable $code $lower $upper - set l -128 - set u $extoff - incr extoff - if {$extoff > 0xff} { - error "Too many entries in the offset table!" - } - } - set entry [string tolower "$code, $l, $u"] - puts " { $entry }," } close $f -puts "\};\n" -# Now the jump table -puts "static const struct caseextmap unicode_extmap\[\] = \{" -foreach {c l u} $jumptable { - puts " { $l, $u }," +foreach type {upper lower} { + puts "static const struct casemap unicode_case_mapping_$type\[\] = \{" + foreach {code alt} $map($type) { + puts "\t{ $code, $alt }," + } + puts "\};\n" } -puts "\};\n" diff --git a/tests/utf8.test b/tests/utf8.test index 04c5b57..715df13 100644 --- a/tests/utf8.test +++ b/tests/utf8.test @@ -125,4 +125,8 @@ test utf8-7.2 {append counts correctly} { list [string length $x] [string bytelength $x] } {8 12} +test utf8-7.3 {Upper, lower for titlecase utf-8} { + list [string toupper \u01c5] [string tolower \u01c5] +} "\u01c4 \u01c6" + testreport @@ -136,57 +136,48 @@ int utf8_tounicode(const char *str, int *uc) } struct casemap { - unsigned short code; /* code point */ - signed char lowerdelta; /* add for lowercase, or if -128 use the ext table */ - signed char upperdelta; /* add for uppercase, or offset into the ext table */ -}; - -/* Extended table for codepoints where |delta| > 127 */ -struct caseextmap { - unsigned short lower; - unsigned short upper; + unsigned short code; /* code point */ + unsigned short altcode; /* alternate case code point */ }; /* Generated mapping tables */ #include "_unicode_mapping.c" -#define NUMCASEMAP sizeof(unicode_case_mapping) / sizeof(*unicode_case_mapping) +#define ARRAYSIZE(A) sizeof(A) / sizeof(*(A)) static int cmp_casemap(const void *key, const void *cm) { return *(int *)key - (int)((const struct casemap *)cm)->code; } -static int utf8_map_case(int uc, int upper) +static int utf8_map_case(const struct casemap *mapping, int num, int ch) { - const struct casemap *cm = bsearch(&uc, unicode_case_mapping, NUMCASEMAP, sizeof(*unicode_case_mapping), cmp_casemap); + /* We only support 16 bit case mapping */ + if (ch <= 0xffff) { + const struct casemap *cm = + bsearch(&ch, mapping, num, sizeof(*mapping), cmp_casemap); - if (cm) { - if (cm->lowerdelta == -128) { - uc = upper ? unicode_extmap[cm->upperdelta].upper : unicode_extmap[cm->upperdelta].lower; - } - else { - uc += upper ? cm->upperdelta : cm->lowerdelta; + if (cm) { + return cm->altcode; } } - return uc; + return ch; } -int utf8_upper(int uc) +int utf8_upper(int ch) { - if (isascii(uc)) { - return toupper(uc); + if (isascii(ch)) { + return toupper(ch); } - return utf8_map_case(uc, 1); + return utf8_map_case(unicode_case_mapping_upper, ARRAYSIZE(unicode_case_mapping_upper), ch); } -int utf8_lower(int uc) +int utf8_lower(int ch) { - if (isascii(uc)) { - return tolower(uc); + if (isascii(ch)) { + return tolower(ch); } - - return utf8_map_case(uc, 0); + return utf8_map_case(unicode_case_mapping_lower, ARRAYSIZE(unicode_case_mapping_lower), ch); } #endif /* JIM_BOOTSTRAP */ |