diff options
-rw-r--r-- | Makefile.in | 2 | ||||
-rw-r--r-- | auto.def | 2 | ||||
-rw-r--r-- | parse-unidata.tcl | 71 | ||||
-rw-r--r-- | utf8.c | 67 | ||||
-rw-r--r-- | utf8.h | 15 |
5 files changed, 145 insertions, 12 deletions
diff --git a/Makefile.in b/Makefile.in
index a803b07..7bbf3d9 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -122,7 +122,7 @@ $(OBJS): Makefile $(wildcard *.h)
 utf8.o: _unicode_mapping.c
 
 _unicode_mapping.c: @srcdir@/UnicodeData.txt @srcdir@/parse-unidata.tcl
-	@tclsh@ @srcdir@/parse-unidata.tcl @srcdir@/UnicodeData.txt >$@ || ( rm $@; exit 1)
+	@tclsh@ @srcdir@/parse-unidata.tcl @PARSE_UNIDATA_FLAGS@ @srcdir@/UnicodeData.txt >$@ || ( rm $@; exit 1)
 @endif
 
 _load-static-exts.c: @srcdir@/make-load-static-exts.tcl Makefile
diff --git a/auto.def b/auto.def
--- a/auto.def
+++ b/auto.def
@@ -173,6 +173,7 @@ if {[opt-bool utf8 full]} {
     msg-result "Enabling UTF-8"
     define JIM_UTF8
     define-append CCOPTS -DUSE_UTF8
+    define PARSE_UNIDATA_FLAGS ""
     incr jimregexp
 } else {
     define JIM_UTF8 0
@@ -219,6 +220,7 @@ if {[opt-bool lineedit full]} {
         msg-result "Enabling line editing"
         define USE_LINENOISE
         define-append CCOPTS -DNO_COMPLETION
+        define-append PARSE_UNIDATA_FLAGS -width
         lappend extra_objs linenoise.o
     }
 }
diff --git a/parse-unidata.tcl b/parse-unidata.tcl
index 348a114..b3beec1 100644
--- a/parse-unidata.tcl
+++ b/parse-unidata.tcl
@@ -8,18 +8,59 @@
 #/
 
 # Parse the unicode data from: http://unicode.org/Public/UNIDATA/UnicodeData.txt
-# to generate case mapping tables
+# to generate case mapping and display width tables
 set map(lower) {}
 set map(upper) {}
 set map(title) {}
+set map(combining) {}
 
-set f [open [lindex $argv 0]]
+set USAGE "Usage: parse-unidata.tcl \[-width\] UnicodeData.txt"
+
+set do_width 0
+foreach arg $argv {
+	if {$arg eq "-width"} {
+		incr do_width
+	} else {
+		if {[info exists filename]} {
+			puts stderr $USAGE
+			exit 1
+		}
+		set filename $arg
+	}
+}
+if {![info exists filename]} {
+	puts stderr $USAGE
+	exit 1
+}
+
+# Wide character ranges are not available in UnicodeData.txt, so list them here
+set map(wide) {
+	0x1100 0x1160 0x2329 0x232b 0x2e80 0x2e9a 0x2e9b 0x2ef4
+	0x2f00 0x2fd6 0x2ff0 0x2ffc 0x3000 0x303f 0x3041 0x3097
+	0x3099 0x3100 0x3105 0x312e 0x3131 0x318f 0x3190 0x31bb
+	0x31c0 0x31e4 0x31f0 0x321f 0x3220 0x3248 0x3250 0x4dc0
+	0x4e00 0xa48d 0xa490 0xa4c7 0xa960 0xa97d 0xac00 0xd7a4
+	0xf900 0xfb00 0xfe10 0xfe1a 0xfe30 0xfe53 0xfe54 0xfe67
+	0xfe68 0xfe6c 0xff01 0xffe7 0x1b000 0x1b002 0x1f200 0x1f203
+	0x1f210 0x1f23b 0x1f240 0x1f249 0x1f250 0x1f252 0x20000 0x3fffe
+}
+
+set f [open $filename]
 while {[gets $f buf] >= 0} {
 	set title ""
 	set lower ""
 	set upper ""
 	foreach {code name class x x x x x x x x x upper lower title} [split $buf ";"] break
 	set codex [string tolower 0x$code]
+	if {[string match M* $class]} {
+		if {![info exists combining]} {
+			set combining $codex
+		}
+		continue
+	} elseif {[info exists combining]} {
+		lappend map(combining) $combining $codex
+		unset combining
+	}
 	if {$codex <= 0x7f} {
 		continue
 	}
@@ -44,10 +85,32 @@ while {[gets $f buf] >= 0} {
 }
 close $f
 
+proc output-int-pairs {list} {
+	set n 0
+	foreach {v1 v2} $list {
+		puts -nonewline "\t{ $v1, $v2 },"
+		if {[incr n] % 4 == 0} {
+			puts ""
+		}
+	}
+	if {$n % 4} {
+		puts ""
+	}
+}
+
 foreach type {upper lower title} {
 	puts "static const struct casemap unicode_case_mapping_$type\[\] = \{"
-	foreach {code alt} $map($type) {
-		puts "\t{ $code, $alt },"
+	output-int-pairs $map($type)
+	puts "\};\n"
+}
+
+foreach type {combining wide} {
+	puts "static const struct utf8range unicode_range_$type\[\] = \{"
+	if {$do_width} {
+		output-int-pairs $map($type)
+	} else {
+		# Just produce empty width tables in this case
+		output-int-pairs {}
 	}
 	puts "\};\n"
 }
diff --git a/utf8.c b/utf8.c
--- a/utf8.c
+++ b/utf8.c
@@ -76,6 +76,19 @@ int utf8_strlen(const char *str, int bytelen)
     return charlen;
 }
 
+int utf8_strwidth(const char *str, int charlen)
+{
+    int width = 0;
+    while (charlen > 0) {
+        int c;
+        int l = utf8_tounicode(str, &c);
+        width += utf8_width(c);
+        str += l;
+        charlen--;
+    }
+    return width;
+}
+
 int utf8_index(const char *str, int index)
 {
     const char *s = str;
@@ -144,6 +157,12 @@ struct casemap {
     unsigned short altcode;     /* alternate case code point */
 };
 
+struct utf8range {
+    unsigned lower;     /* lower inclusive */
+    unsigned upper;     /* upper exclusive */
+};
+
+
 /* Generated mapping tables */
 #include "_unicode_mapping.c"
 
@@ -168,10 +187,29 @@ static int utf8_map_case(const struct casemap *mapping, int num, int ch)
     return ch;
 }
 
-/* Some platforms don't have isascii */
-#ifndef isascii
-#define isascii(C) (!((C) & ~0x7f))
-#endif
+static int cmp_range(const void *key, const void *cm)
+{
+    const struct utf8range *range = (const struct utf8range *)cm;
+    int ch = *(const int *)key;
+    if (ch < range->lower) {
+        return -1;
+    }
+    if (ch >= range->upper) {
+        return 1;
+    }
+    return 0;
+}
+
+static int utf8_in_range(const struct utf8range *range, int num, int ch)
+{
+    const struct utf8range *r =
+        bsearch(&ch, range, num, sizeof(*range), cmp_range);
+
+    if (r) {
+        return 1;
+    }
+    return 0;
+}
 
 int utf8_upper(int ch)
 {
@@ -191,11 +229,26 @@ int utf8_lower(int ch)
 
 int utf8_title(int ch)
 {
-    int newch = utf8_map_case(unicode_case_mapping_title, ARRAYSIZE(unicode_case_mapping_title), ch);
-    if (newch != ch) {
-        return newch ? newch : ch;
+    if (!isascii(ch)) {
+        int newch = utf8_map_case(unicode_case_mapping_title, ARRAYSIZE(unicode_case_mapping_title), ch);
+        if (newch != ch) {
+            return newch ? newch : ch;
+        }
     }
     return utf8_upper(ch);
 }
 
+int utf8_width(int ch)
+{
+    if (!isascii(ch)) {
+        if (utf8_in_range(unicode_range_combining, ARRAYSIZE(unicode_range_combining), ch)) {
+            return 0;
+        }
+        if (utf8_in_range(unicode_range_wide, ARRAYSIZE(unicode_range_wide), ch)) {
+            return 2;
+        }
+    }
+    return 1;
+}
+
 #endif /* JIM_BOOTSTRAP */
diff --git a/utf8.h b/utf8.h
--- a/utf8.h
+++ b/utf8.h
@@ -30,6 +30,7 @@ int utf8_fromunicode(char *p, unsigned uc);
 
 /* No utf-8 support. 1 byte = 1 char */
 #define utf8_strlen(S, B) ((B) < 0 ? (int)strlen(S) : (B))
+#define utf8_strwidth(S, B) utf8_strlen((S), (B))
 #define utf8_tounicode(S, CP) (*(CP) = (unsigned char)*(S), 1)
 #define utf8_getchars(CP, C) (*(CP) = (C), 1)
 #define utf8_upper(C) toupper(C)
@@ -38,6 +39,7 @@ int utf8_fromunicode(char *p, unsigned uc);
 #define utf8_index(C, I) (I)
 #define utf8_charlen(C) 1
 #define utf8_prev_len(S, L) 1
+#define utf8_width(C) 1
 
 #else
 #if !defined(JIM_BOOTSTRAP)
@@ -67,6 +69,12 @@ int utf8_charlen(int c);
 int utf8_strlen(const char *str, int bytelen);
 
 /**
+ * Calculates the display width of the first 'charlen' characters in 'str'.
+ * See utf8_width()
+ */
+int utf8_strwidth(const char *str, int charlen);
+
+/**
  * Returns the byte index of the given character in the utf-8 string.
  *
  * The string *must* be null terminated.
@@ -125,6 +133,13 @@ int utf8_title(int uc);
  * Unicode code points > \uffff are returned unchanged.
  */
 int utf8_lower(int uc);
+
+/**
+ * Returns the display width (in columns) of the given unicode codepoint.
+ * This is 0 for combining characters, 2 for wide characters and 1 for everything else.
+ */
+int utf8_width(int ch);
+
 #endif /* JIM_BOOTSTRAP */
 
 #endif