aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile.in2
-rw-r--r--auto.def2
-rw-r--r--parse-unidata.tcl71
-rw-r--r--utf8.c67
-rw-r--r--utf8.h15
5 files changed, 145 insertions, 12 deletions
diff --git a/Makefile.in b/Makefile.in
index a803b07..7bbf3d9 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -122,7 +122,7 @@ $(OBJS): Makefile $(wildcard *.h)
utf8.o: _unicode_mapping.c
_unicode_mapping.c: @srcdir@/UnicodeData.txt @srcdir@/parse-unidata.tcl
- @tclsh@ @srcdir@/parse-unidata.tcl @srcdir@/UnicodeData.txt >$@ || ( rm $@; exit 1)
+ @tclsh@ @srcdir@/parse-unidata.tcl @PARSE_UNIDATA_FLAGS@ @srcdir@/UnicodeData.txt >$@ || ( rm $@; exit 1)
@endif
_load-static-exts.c: @srcdir@/make-load-static-exts.tcl Makefile
diff --git a/auto.def b/auto.def
index 1d83995..e531743 100644
--- a/auto.def
+++ b/auto.def
@@ -173,6 +173,7 @@ if {[opt-bool utf8 full]} {
msg-result "Enabling UTF-8"
define JIM_UTF8
define-append CCOPTS -DUSE_UTF8
+ define PARSE_UNIDATA_FLAGS ""
incr jimregexp
} else {
define JIM_UTF8 0
@@ -219,6 +220,7 @@ if {[opt-bool lineedit full]} {
msg-result "Enabling line editing"
define USE_LINENOISE
define-append CCOPTS -DNO_COMPLETION
+ define-append PARSE_UNIDATA_FLAGS -width
lappend extra_objs linenoise.o
}
}
diff --git a/parse-unidata.tcl b/parse-unidata.tcl
index 348a114..b3beec1 100644
--- a/parse-unidata.tcl
+++ b/parse-unidata.tcl
@@ -8,18 +8,59 @@
#/
# Parse the unicode data from: http://unicode.org/Public/UNIDATA/UnicodeData.txt
-# to generate case mapping tables
+# to generate case mapping and display width tables
set map(lower) {}
set map(upper) {}
set map(title) {}
+set map(combining) {}
-set f [open [lindex $argv 0]]
+# Command line summary, written to stderr whenever the arguments are invalid
+set USAGE "Usage: parse-unidata.tcl \[-width\] UnicodeData.txt"
+
+# do_width is non-zero if -width was given.
+# Any other argument is taken as the input filename, of which exactly one is required.
+set do_width 0
+foreach arg $argv {
+	if {$arg ne "-width"} {
+		# A second filename is an error
+		if {[info exists filename]} {
+			puts stderr $USAGE
+			exit 1
+		}
+		set filename $arg
+		continue
+	}
+	incr do_width
+}
+# The input file is mandatory
+if {[info exists filename] == 0} {
+	puts stderr $USAGE
+	exit 1
+}
+
+# Why isn't this available in UnicodeData.txt?
+#
+# Ranges of wide (two column) code points as {first last} pairs, where 'last'
+# is itself part of the range (e.g. 0xd7a3 and 0x3fffd are the final members
+# of their ranges).
+set wide_inclusive {
+	0x1100 0x115f 0x2329 0x232a 0x2e80 0x2e99 0x2e9b 0x2ef3
+	0x2f00 0x2fd5 0x2ff0 0x2ffb 0x3000 0x303e 0x3041 0x3096
+	0x3099 0x30ff 0x3105 0x312d 0x3131 0x318e 0x3190 0x31ba
+	0x31c0 0x31e3 0x31f0 0x321e 0x3220 0x3247 0x3250 0x4dbf
+	0x4e00 0xa48c 0xa490 0xa4c6 0xa960 0xa97c 0xac00 0xd7a3
+	0xf900 0xfaff 0xfe10 0xfe19 0xfe30 0xfe52 0xfe54 0xfe66
+	0xfe68 0xfe6b 0xff01 0xffe6 0x1b000 0x1b001 0x1f200 0x1f202
+	0x1f210 0x1f23a 0x1f240 0x1f248 0x1f250 0x1f251 0x20000 0x3fffd
+}
+
+# The generated utf8range tables are searched with an exclusive upper bound,
+# so store one past the last code point of each range. Without this the final
+# character of every wide range would be reported as narrow.
+set map(wide) {}
+foreach {wide_first wide_last} $wide_inclusive {
+	lappend map(wide) $wide_first [format 0x%x [expr {$wide_last + 1}]]
+}
+
+set f [open $filename]
while {[gets $f buf] >= 0} {
set title ""
set lower ""
set upper ""
foreach {code name class x x x x x x x x x upper lower title} [split $buf ";"] break
set codex [string tolower 0x$code]
+	# Collect runs of consecutive entries with a mark class (M*) as
+	# {first mark, first following non-mark} pairs in map(combining).
+	# 'combining' only exists while a run is open.
+	if {[string match M* $class]} {
+		if {![info exists combining]} {
+			set combining $codex
+		}
+		# Marks take no further part in the processing below
+		continue
+	} elseif {[info exists combining]} {
+		# Must be 'info exists': a bare 'exists' command is not available in standard tclsh
+		lappend map(combining) $combining $codex
+		unset combining
+	}
if {$codex <= 0x7f} {
continue
}
@@ -44,10 +85,32 @@ while {[gets $f buf] >= 0} {
}
close $f
+# Writes the elements of 'list', taken two at a time, to stdout as
+# tab-prefixed "{ a, b }," initialisers, four to a line.
+# The final line is terminated with a newline if it holds fewer than four pairs.
+proc output-int-pairs {list} {
+	set count 0
+	foreach {first second} $list {
+		incr count
+		puts -nonewline "\t{ $first, $second },"
+		if {($count % 4) == 0} {
+			puts ""
+		}
+	}
+	if {($count % 4) != 0} {
+		puts ""
+	}
+}
+
foreach type {upper lower title} {
puts "static const struct casemap unicode_case_mapping_$type\[\] = \{"
- foreach {code alt} $map($type) {
- puts "\t{ $code, $alt },"
+ output-int-pairs $map($type)
+ puts "\};\n"
+}
+
+foreach type {combining wide} {
+ puts "static const struct utf8range unicode_range_$type\[\] = \{"
+ if {$do_width} {
+ output-int-pairs $map($type)
+ } else {
+		# No width tables wanted. An empty initializer "{}" is not valid ISO C
+		# before C23, so emit one placeholder range that can never match
+		# (lower == upper, and upper is exclusive)
+		output-int-pairs {0 0}
}
puts "\};\n"
}
diff --git a/utf8.c b/utf8.c
index 3f00f39..0d94c0e 100644
--- a/utf8.c
+++ b/utf8.c
@@ -76,6 +76,19 @@ int utf8_strlen(const char *str, int bytelen)
return charlen;
}
+/**
+ * Returns the total display width of the first 'charlen' characters of the
+ * utf-8 string 'str', i.e. the sum of utf8_width() for each decoded character.
+ *
+ * No check is made for a terminator, so 'str' must contain at least 'charlen' characters.
+ * A 'charlen' of zero or less gives a width of 0.
+ */
+int utf8_strwidth(const char *str, int charlen)
+{
+    int width = 0;
+
+    /* Test for > 0 rather than != 0 so that a negative count can't run off the end of 'str' */
+    while (charlen-- > 0) {
+        int c;
+
+        str += utf8_tounicode(str, &c);
+        width += utf8_width(c);
+    }
+    return width;
+}
+
int utf8_index(const char *str, int index)
{
const char *s = str;
@@ -144,6 +157,12 @@ struct casemap {
unsigned short altcode; /* alternate case code point */
};
+/* One entry of the generated unicode_range_* tables: the code points in [lower, upper) */
+struct utf8range {
+ unsigned lower; /* lower inclusive */
+ unsigned upper; /* upper exclusive */
+};
+
+
/* Generated mapping tables */
#include "_unicode_mapping.c"
@@ -168,10 +187,29 @@ static int utf8_map_case(const struct casemap *mapping, int num, int ch)
return ch;
}
-/* Some platforms don't have isascii */
-#ifndef isascii
-#define isascii(C) (!((C) & ~0x7f))
-#endif
+/**
+ * bsearch() comparison function for a table of struct utf8range.
+ *
+ * 'key' points to the code point (an int) being looked up and 'cm' to a table entry.
+ *
+ * Returns -1 if the code point is below the range, 1 if it is at or beyond
+ * the (exclusive) upper bound, or 0 if it lies within the range.
+ */
+static int cmp_range(const void *key, const void *cm)
+{
+    const struct utf8range *range = (const struct utf8range *)cm;
+    /* Compare as unsigned to match the range fields. A negative value becomes
+     * very large, so it compares greater than any real code point */
+    unsigned ch = (unsigned)*(const int *)key;
+
+    if (ch < range->lower) {
+        return -1;
+    }
+    if (ch >= range->upper) {
+        return 1;
+    }
+    return 0;
+}
+
+/**
+ * Returns 1 if 'ch' lies within any of the 'num' entries of the table 'range', or 0 if not.
+ *
+ * The table is binary searched with cmp_range(), so it must be ordered by code point.
+ */
+static int utf8_in_range(const struct utf8range *range, int num, int ch)
+{
+    return bsearch(&ch, range, num, sizeof(*range), cmp_range) ? 1 : 0;
+}
int utf8_upper(int ch)
{
@@ -191,11 +229,26 @@ int utf8_lower(int ch)
int utf8_title(int ch)
{
- int newch = utf8_map_case(unicode_case_mapping_title, ARRAYSIZE(unicode_case_mapping_title), ch);
- if (newch != ch) {
- return newch ? newch : ch;
+ if (!isascii(ch)) {
+ int newch = utf8_map_case(unicode_case_mapping_title, ARRAYSIZE(unicode_case_mapping_title), ch);
+ if (newch != ch) {
+ return newch ? newch : ch;
+ }
}
return utf8_upper(ch);
}
+int utf8_width(int ch)
+{
+    /* ASCII is always reported as width 1, with no table lookup */
+    if (isascii(ch)) {
+        return 1;
+    }
+    /* The combining table is consulted first, so it takes precedence over the wide table */
+    if (utf8_in_range(unicode_range_combining, ARRAYSIZE(unicode_range_combining), ch)) {
+        return 0;
+    }
+    return utf8_in_range(unicode_range_wide, ARRAYSIZE(unicode_range_wide), ch) ? 2 : 1;
+}
+
#endif /* JIM_BOOTSTRAP */
diff --git a/utf8.h b/utf8.h
index 7069d25..40fc95f 100644
--- a/utf8.h
+++ b/utf8.h
@@ -30,6 +30,7 @@ int utf8_fromunicode(char *p, unsigned uc);
/* No utf-8 support. 1 byte = 1 char */
#define utf8_strlen(S, B) ((B) < 0 ? (int)strlen(S) : (B))
+#define utf8_strwidth(S, B) utf8_strlen((S), (B))
#define utf8_tounicode(S, CP) (*(CP) = (unsigned char)*(S), 1)
#define utf8_getchars(CP, C) (*(CP) = (C), 1)
#define utf8_upper(C) toupper(C)
@@ -38,6 +39,7 @@ int utf8_fromunicode(char *p, unsigned uc);
#define utf8_index(C, I) (I)
#define utf8_charlen(C) 1
#define utf8_prev_len(S, L) 1
+#define utf8_width(C) 1
#else
#if !defined(JIM_BOOTSTRAP)
@@ -67,6 +69,12 @@ int utf8_charlen(int c);
int utf8_strlen(const char *str, int bytelen);
/**
+ * Calculates the display width of the first 'charlen' characters in 'str'.
+ * See utf8_width()
+ */
+int utf8_strwidth(const char *str, int charlen);
+
+/**
* Returns the byte index of the given character in the utf-8 string.
*
* The string *must* be null terminated.
@@ -125,6 +133,13 @@ int utf8_title(int uc);
* Unicode code points > \uffff are returned unchanged.
*/
int utf8_lower(int uc);
+
+/**
+ * Returns the display width (in character cells) of the given unicode codepoint.
+ * This is 0 for combining characters, 2 for wide characters and 1 for everything else.
+ */
+int utf8_width(int ch);
+
#endif /* JIM_BOOTSTRAP */
#endif