aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--parse-unidata.tcl49
-rw-r--r--tests/utf8.test4
-rw-r--r--utf8.c47
3 files changed, 37 insertions, 63 deletions
diff --git a/parse-unidata.tcl b/parse-unidata.tcl
index 4b5ec3a..1a927a3 100644
--- a/parse-unidata.tcl
+++ b/parse-unidata.tcl
@@ -9,56 +9,35 @@
# Parse the unicode data from: http://unicode.org/Public/UNIDATA/UnicodeData.txt
# to generate case mapping tables
+set map(lower) {}
+set map(upper) {}
set f [open [lindex $argv 0]]
-set extoff 0
-puts "static const struct casemap unicode_case_mapping\[\] = \{"
while {[gets $f buf] >= 0} {
foreach {code name class x x x x x x x x x upper lower} [split $buf ";"] break
- set code 0x$code
+ set code [string tolower 0x$code]
if {$code <= 0x7f} {
continue
}
if {$code > 0xffff} {
break
}
- if {$class ne "Lu" && $class ne "Ll"} {
+ if {![string match L* $class]} {
continue
}
- if {$upper eq ""} {
- set upper $code
- } else {
- set upper 0x$upper
+ if {$upper ne ""} {
+ lappend map(upper) $code [string tolower 0x$upper]
}
- if {$lower eq ""} {
- set lower $code
- } else {
- set lower 0x$lower
+ if {$lower ne ""} {
+ lappend map(lower) $code [string tolower 0x$lower]
}
- if {$upper == $code && $lower == $code} {
- continue
- }
- set l [expr {$lower - $code}]
- set u [expr {$upper - $code}]
- if {abs($u) > 127 || abs($l) > 127} {
- # Can't encode both in one byte, so use indirection
- lappend jumptable $code $lower $upper
- set l -128
- set u $extoff
- incr extoff
- if {$extoff > 0xff} {
- error "Too many entries in the offset table!"
- }
- }
- set entry [string tolower "$code, $l, $u"]
- puts " { $entry },"
}
close $f
-puts "\};\n"
-# Now the jump table
-puts "static const struct caseextmap unicode_extmap\[\] = \{"
-foreach {c l u} $jumptable {
- puts " { $l, $u },"
+foreach type {upper lower} {
+ puts "static const struct casemap unicode_case_mapping_$type\[\] = \{"
+ foreach {code alt} $map($type) {
+ puts "\t{ $code, $alt },"
+ }
+ puts "\};\n"
}
-puts "\};\n"
diff --git a/tests/utf8.test b/tests/utf8.test
index 04c5b57..715df13 100644
--- a/tests/utf8.test
+++ b/tests/utf8.test
@@ -125,4 +125,8 @@ test utf8-7.2 {append counts correctly} {
list [string length $x] [string bytelength $x]
} {8 12}
+test utf8-7.3 {Upper, lower for titlecase utf-8} {
+ list [string toupper \u01c5] [string tolower \u01c5]
+} "\u01c4 \u01c6"
+
testreport
diff --git a/utf8.c b/utf8.c
index a81b3de..cd1c88b 100644
--- a/utf8.c
+++ b/utf8.c
@@ -136,57 +136,48 @@ int utf8_tounicode(const char *str, int *uc)
}
struct casemap {
- unsigned short code; /* code point */
- signed char lowerdelta; /* add for lowercase, or if -128 use the ext table */
- signed char upperdelta; /* add for uppercase, or offset into the ext table */
-};
-
-/* Extended table for codepoints where |delta| > 127 */
-struct caseextmap {
- unsigned short lower;
- unsigned short upper;
+ unsigned short code; /* code point */
+ unsigned short altcode; /* alternate case code point */
};
/* Generated mapping tables */
#include "_unicode_mapping.c"
-#define NUMCASEMAP sizeof(unicode_case_mapping) / sizeof(*unicode_case_mapping)
+#define ARRAYSIZE(A) (sizeof(A) / sizeof(*(A))) /* element count of a true array (not a pointer); fully parenthesized so it is safe inside larger expressions */
static int cmp_casemap(const void *key, const void *cm)
{
return *(int *)key - (int)((const struct casemap *)cm)->code;
}
-static int utf8_map_case(int uc, int upper)
+static int utf8_map_case(const struct casemap *mapping, int num, int ch)
{
- const struct casemap *cm = bsearch(&uc, unicode_case_mapping, NUMCASEMAP, sizeof(*unicode_case_mapping), cmp_casemap);
+ /* We only support 16 bit case mapping */
+ if (ch <= 0xffff) {
+ const struct casemap *cm =
+ bsearch(&ch, mapping, num, sizeof(*mapping), cmp_casemap);
- if (cm) {
- if (cm->lowerdelta == -128) {
- uc = upper ? unicode_extmap[cm->upperdelta].upper : unicode_extmap[cm->upperdelta].lower;
- }
- else {
- uc += upper ? cm->upperdelta : cm->lowerdelta;
+ if (cm) {
+ return cm->altcode;
}
}
- return uc;
+ return ch;
}
-int utf8_upper(int uc)
+int utf8_upper(int ch)
{
- if (isascii(uc)) {
- return toupper(uc);
+ if (isascii(ch)) {
+ return toupper(ch);
}
- return utf8_map_case(uc, 1);
+ return utf8_map_case(unicode_case_mapping_upper, ARRAYSIZE(unicode_case_mapping_upper), ch);
}
-int utf8_lower(int uc)
+int utf8_lower(int ch)
{
- if (isascii(uc)) {
- return tolower(uc);
+ if (isascii(ch)) {
+ return tolower(ch);
}
-
- return utf8_map_case(uc, 0);
+ return utf8_map_case(unicode_case_mapping_lower, ARRAYSIZE(unicode_case_mapping_lower), ch);
}
#endif /* JIM_BOOTSTRAP */