5 files changed, 145 insertions, 12 deletions
diff --git a/Makefile.in b/Makefile.in
index a803b07..7bbf3d9 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -122,7 +122,7 @@ $(OBJS): Makefile $(wildcard *.h)
 utf8.o: _unicode_mapping.c
 
 _unicode_mapping.c: @srcdir@/UnicodeData.txt @srcdir@/parse-unidata.tcl
-	@tclsh@ @srcdir@/parse-unidata.tcl @srcdir@/UnicodeData.txt >$@ || ( rm $@; exit 1)
+	@tclsh@ @srcdir@/parse-unidata.tcl @PARSE_UNIDATA_FLAGS@ @srcdir@/UnicodeData.txt >$@ || ( rm $@; exit 1)
 @endif
 
 _load-static-exts.c: @srcdir@/make-load-static-exts.tcl Makefile
diff --git a/auto.def b/auto.def
index 1d83995..e531743 100644
--- a/auto.def
+++ b/auto.def
@@ -173,6 +173,7 @@ if {[opt-bool utf8 full]} {
     msg-result "Enabling UTF-8"
     define JIM_UTF8
     define-append CCOPTS -DUSE_UTF8
+    define PARSE_UNIDATA_FLAGS ""
     incr jimregexp
 } else {
     define JIM_UTF8 0
@@ -219,6 +220,7 @@ if {[opt-bool lineedit full]} {
         msg-result "Enabling line editing"
         define USE_LINENOISE
         define-append CCOPTS -DNO_COMPLETION
+        define-append PARSE_UNIDATA_FLAGS -width
         lappend extra_objs linenoise.o
     }
 }
diff --git a/parse-unidata.tcl b/parse-unidata.tcl
index 348a114..b3beec1 100644
--- a/parse-unidata.tcl
+++ b/parse-unidata.tcl
@@ -8,18 +8,59 @@
 #/
 
 # Parse the unicode data from: http://unicode.org/Public/UNIDATA/UnicodeData.txt
-# to generate case mapping tables
+# to generate case mapping and display width tables
 set map(lower) {}
 set map(upper) {}
 set map(title) {}
+set map(combining) {}
 
-set f [open [lindex $argv 0]]
+set USAGE "Usage: parse-unidata.tcl \[-width\] UnicodeData.txt"
+# Parse argv: -width enables the width tables; exactly one other argument (the data file) is required
+set do_width 0
+foreach arg $argv {
+	if {$arg eq "-width"} {
+		incr do_width
+	} else {
+		if {[info exists filename]} {
+			puts stderr $USAGE
+			exit 1
+		}
+		set filename $arg
+	}
+}
+if {![info exists filename]} {
+	puts stderr $USAGE
+	exit 1
+}
+
+# Wide ranges, not in UnicodeData.txt (see EastAsianWidth.txt). Pairs are: first, last+1 (upper bound exclusive)
+set map(wide) {
+	0x1100 0x1160 0x2329 0x232b 0x2e80 0x2e9a 0x2e9b 0x2ef4
+	0x2f00 0x2fd6 0x2ff0 0x2ffc 0x3000 0x303f 0x3041 0x3097
+	0x3099 0x3100 0x3105 0x312e 0x3131 0x318f 0x3190 0x31bb
+	0x31c0 0x31e4 0x31f0 0x321f 0x3220 0x3248 0x3250 0x4dc0
+	0x4e00 0xa48d 0xa490 0xa4c7 0xa960 0xa97d 0xac00 0xd7a4
+	0xf900 0xfb00 0xfe10 0xfe1a 0xfe30 0xfe53 0xfe54 0xfe67
+	0xfe68 0xfe6c 0xff01 0xffe7 0x1b000 0x1b002 0x1f200 0x1f203
+	0x1f210 0x1f23b 0x1f240 0x1f249 0x1f250 0x1f252 0x20000 0x3fffe
+}
+
+set f [open $filename]
 while {[gets $f buf] >= 0} {
 	set title ""
 	set lower ""
 	set upper ""
 	foreach {code name class x x x x x x x x x upper lower title} [split $buf ";"] break
 	set codex [string tolower 0x$code]
+	if {[string match M* $class]} {
+		# Class starts with M: remember where this run of combining entries begins
+		if {![info exists combining]} { set combining $codex }
+		continue
+	} elseif {[info exists combining]} {
+		# First entry after a run closes the range; its code is the exclusive upper bound
+		lappend map(combining) $combining $codex
+		unset combining
+	}
 	if {$codex <= 0x7f} {
 		continue
 	}
@@ -44,10 +85,32 @@ while {[gets $f buf] >= 0} {
 }
 close $f
 
+# Print each pair of 'list' as a C struct initialiser, four per output row
+proc output-int-pairs {list} {
+	set count 0
+	foreach {lo hi} $list {
+		incr count
+		puts -nonewline "\t{ $lo, $hi },"
+		if {($count % 4) == 0} {
+			puts ""
+		}
+	}
+	if {($count % 4) != 0} { puts "" }
+}
+
 foreach type {upper lower title} {
 	puts "static const struct casemap unicode_case_mapping_$type\[\] = \{"
-	foreach {code alt} $map($type) {
-		puts "\t{ $code, $alt },"
+	output-int-pairs $map($type)
+	puts "\};\n"
+}
+
+foreach type {combining wide} {
+	puts "static const struct utf8range unicode_range_$type\[\] = \{"
+	if {$do_width} {
+		output-int-pairs $map($type)
+	} else {
+		# No -width: emit one empty range (0 up to 0, matches nothing), not a zero-length C array
+		output-int-pairs {0 0}
 	}
 	puts "\};\n"
 }
diff --git a/utf8.c b/utf8.c
index 3f00f39..0d94c0e 100644
--- a/utf8.c
+++ b/utf8.c
@@ -76,6 +76,19 @@ int utf8_strlen(const char *str, int bytelen)
     return charlen;
 }
 
+/* Sums utf8_width() over the first 'charlen' characters of 'str'; 0 if charlen <= 0. */
+int utf8_strwidth(const char *str, int charlen)
+{
+    int width = 0;
+    /* '> 0' so that a negative count stops at once instead of reading past the buffer */
+    for (; charlen > 0; charlen--) {
+        int c;
+        str += utf8_tounicode(str, &c);     /* decode one char, step over its bytes */
+        width += utf8_width(c);
+    }
+    return width;
+}
+
 int utf8_index(const char *str, int index)
 {
     const char *s = str;
@@ -144,6 +157,12 @@ struct casemap {
     unsigned short altcode;     /* alternate case code point */
 };
 
+struct utf8range {
+    unsigned lower;     /* first code point in the range (inclusive) */
+    unsigned upper;     /* end of the range (exclusive) */
+};
+
+
 /* Generated mapping tables */
 #include "_unicode_mapping.c"
 
@@ -168,10 +187,29 @@ static int utf8_map_case(const struct casemap *mapping, int num, int ch)
     return ch;
 }
 
-/* Some platforms don't have isascii */
-#ifndef isascii
-#define isascii(C) (!((C) & ~0x7f))
-#endif
+/* bsearch() comparator: orders the code point at 'key' against the half-open range 'cm' */
+static int cmp_range(const void *key, const void *cm)
+{
+    const struct utf8range *r = cm;
+    /* compared as unsigned, which is what the implicit int->unsigned conversion did */
+    unsigned cp = (unsigned)*(const int *)key;
+    if (cp < r->lower) {
+        return -1;      /* key sorts before this range */
+    }
+    /* at or past the exclusive end sorts after the range; otherwise it is a hit */
+    return (cp >= r->upper) ? 1 : 0;
+}
+
+/* Returns 1 if 'ch' falls inside one of the 'num' entries of 'range', otherwise 0 */
+static int utf8_in_range(const struct utf8range *range, int num, int ch)
+{
+    /*
+     * Binary search via bsearch(); cmp_range() reads the key as an int,
+     * so the address of the local copy of 'ch' is what gets passed.
+     */
+    const void *hit = bsearch(&ch, range, num, sizeof(*range), cmp_range);
+    return hit ? 1 : 0;
+}
 
 int utf8_upper(int ch)
 {
@@ -191,11 +229,26 @@ int utf8_lower(int ch)
 
 int utf8_title(int ch)
 {
-    int newch = utf8_map_case(unicode_case_mapping_title, ARRAYSIZE(unicode_case_mapping_title), ch);
-    if (newch != ch) {
-        return newch ? newch : ch;
+    if (!isascii(ch)) {
+        int newch = utf8_map_case(unicode_case_mapping_title, ARRAYSIZE(unicode_case_mapping_title), ch);
+        if (newch != ch) {
+            return newch ? newch : ch;
+        }
     }
     return utf8_upper(ch);
 }
 
+int utf8_width(int ch)
+{
+    /* ASCII never consults the tables: always one column */
+    if (isascii(ch)) {
+        return 1;
+    }
+    if (utf8_in_range(unicode_range_combining, ARRAYSIZE(unicode_range_combining), ch)) {
+        return 0;       /* listed in the combining table */
+    }
+    /* two columns when listed in the wide table, one otherwise */
+    return utf8_in_range(unicode_range_wide, ARRAYSIZE(unicode_range_wide), ch) ? 2 : 1;
+}
+
 #endif /* JIM_BOOTSTRAP */
diff --git a/utf8.h b/utf8.h
index 7069d25..40fc95f 100644
--- a/utf8.h
+++ b/utf8.h
@@ -30,6 +30,7 @@ int utf8_fromunicode(char *p, unsigned uc);
 
 /* No utf-8 support. 1 byte = 1 char */
 #define utf8_strlen(S, B) ((B) < 0 ? (int)strlen(S) : (B))
+#define utf8_strwidth(S, B) utf8_strlen((S), (B))
 #define utf8_tounicode(S, CP) (*(CP) = (unsigned char)*(S), 1)
 #define utf8_getchars(CP, C) (*(CP) = (C), 1)
 #define utf8_upper(C) toupper(C)
@@ -38,6 +39,7 @@ int utf8_fromunicode(char *p, unsigned uc);
 #define utf8_index(C, I) (I)
 #define utf8_charlen(C) 1
 #define utf8_prev_len(S, L) 1
+#define utf8_width(C) 1
 
 #else
 #if !defined(JIM_BOOTSTRAP)
@@ -67,6 +69,12 @@ int utf8_charlen(int c);
 int utf8_strlen(const char *str, int bytelen);
 
 /**
+ * Calculates the display width of the first 'charlen' characters in 'str'.
+ * See utf8_width()
+ */
+int utf8_strwidth(const char *str, int charlen);
+
+/**
  * Returns the byte index of the given character in the utf-8 string.
  *
  * The string *must* be null terminated.
@@ -125,6 +133,13 @@ int utf8_title(int uc);
  * Unicode code points > \uffff are returned unchanged.
  */
 int utf8_lower(int uc);
+
+/**
+ * Returns the display width (in columns) of the given unicode codepoint.
+ * This is 0 for combining characters, 2 for wide characters and 1 for everything else.
+ */
+int utf8_width(int ch);
+
 #endif /* JIM_BOOTSTRAP */
 
 #endif