diff options
author | Thomas Wolff <towo@towo.net> | 2018-03-07 23:55:52 +0100 |
---|---|---|
committer | Corinna Vinschen <corinna@vinschen.de> | 2018-03-12 10:17:20 +0100 |
commit | 37132125bcda103dd723f0cb4226c190ffbc9966 (patch) | |
tree | d1d93cb0497e22607f4dc4ca00467c84906a80f4 /newlib/libc/string | |
parent | 8e8fd6c849472d7bfe3949c44880abfcca7aed48 (diff) | |
download | newlib-37132125bcda103dd723f0cb4226c190ffbc9966.zip newlib-37132125bcda103dd723f0cb4226c190ffbc9966.tar.gz newlib-37132125bcda103dd723f0cb4226c190ffbc9966.tar.bz2 |
width data generation
Diffstat (limited to 'newlib/libc/string')
-rw-r--r-- | newlib/libc/string/WIDTH-A | 569 | ||||
-rwxr-xr-x | newlib/libc/string/mkunidata | 54 | ||||
-rwxr-xr-x | newlib/libc/string/mkwide | 49 | ||||
-rwxr-xr-x | newlib/libc/string/mkwidthA | 20 | ||||
-rwxr-xr-x | newlib/libc/string/uniset | 696 |
5 files changed, 1388 insertions, 0 deletions
diff --git a/newlib/libc/string/WIDTH-A b/newlib/libc/string/WIDTH-A new file mode 100644 index 0000000..51e8f23 --- /dev/null +++ b/newlib/libc/string/WIDTH-A @@ -0,0 +1,569 @@ +# UAX #11: East Asian Ambiguous + +# Plane 00 +# Rows Positions (Cells) + + 00 A1 A4 A7-A8 AA AD-AE B0-B4 B6-BA BC-BF C6 D0 D7-D8 DE-E1 E6 E8-EA + 00 EC-ED F0 F2-F3 F7-FA FC FE + 01 01 11 13 1B 26-27 2B 31-33 38 3F-42 44 48-4B 4D 52-53 66-67 6B + 01 CE D0 D2 D4 D6 D8 DA DC + 02 51 61 C4 C7 C9-CB CD D0 D8-DB DD DF + 03 00-6F 91-A1 A3-A9 B1-C1 C3-C9 + 04 01 10-4F 51 + 20 10 13-16 18-19 1C-1D 20-22 24-27 30 32-33 35 3B 3E 74 7F 81-84 + 20 AC + 21 03 05 09 13 16 21-22 26 2B 53-54 5B-5E 60-6B 70-79 89 90-99 B8-B9 + 21 D2 D4 E7 + 22 00 02-03 07-08 0B 0F 11 15 1A 1D-20 23 25 27-2C 2E 34-37 3C-3D + 22 48 4C 52 60-61 64-67 6A-6B 6E-6F 82-83 86-87 95 99 A5 BF + 23 12 + 24 60-E9 EB-FF + 25 00-4B 50-73 80-8F 92-95 A0-A1 A3-A9 B2-B3 B6-B7 BC-BD C0-C1 C6-C8 + 25 CB CE-D1 E2-E5 EF + 26 05-06 09 0E-0F 1C 1E 40 42 60-61 63-65 67-6A 6C-6D 6F 9E-9F BF + 26 C6-CD CF-D3 D5-E1 E3 E8-E9 EB-F1 F4 F6-F9 FB-FC FE-FF + 27 3D 76-7F + 2B 56-59 + 32 48-4F + E0 00-FF + E1 00-FF + E2 00-FF + E3 00-FF + E4 00-FF + E5 00-FF + E6 00-FF + E7 00-FF + E8 00-FF + E9 00-FF + EA 00-FF + EB 00-FF + EC 00-FF + ED 00-FF + EE 00-FF + EF 00-FF + F0 00-FF + F1 00-FF + F2 00-FF + F3 00-FF + F4 00-FF + F5 00-FF + F6 00-FF + F7 00-FF + F8 00-FF + FE 00-0F + FF FD + 1F1 00-0A 10-2D 30-69 70-8D 8F-90 9B-AC + E01 00-EF + F00 00-FF + F01 00-FF + F02 00-FF + F03 00-FF + F04 00-FF + F05 00-FF + F06 00-FF + F07 00-FF + F08 00-FF + F09 00-FF + F0A 00-FF + F0B 00-FF + F0C 00-FF + F0D 00-FF + F0E 00-FF + F0F 00-FF + F10 00-FF + F11 00-FF + F12 00-FF + F13 00-FF + F14 00-FF + F15 00-FF + F16 00-FF + F17 00-FF + F18 00-FF + F19 00-FF + F1A 00-FF + F1B 00-FF + F1C 00-FF + F1D 00-FF + F1E 00-FF + F1F 00-FF + F20 00-FF + F21 00-FF + F22 00-FF + F23 00-FF + F24 00-FF + F25 00-FF + F26 00-FF + F27 00-FF + F28 00-FF + F29 00-FF + F2A 00-FF + F2B 00-FF + 
F2C 00-FF + F2D 00-FF + F2E 00-FF + F2F 00-FF + F30 00-FF + F31 00-FF + F32 00-FF + F33 00-FF + F34 00-FF + F35 00-FF + F36 00-FF + F37 00-FF + F38 00-FF + F39 00-FF + F3A 00-FF + F3B 00-FF + F3C 00-FF + F3D 00-FF + F3E 00-FF + F3F 00-FF + F40 00-FF + F41 00-FF + F42 00-FF + F43 00-FF + F44 00-FF + F45 00-FF + F46 00-FF + F47 00-FF + F48 00-FF + F49 00-FF + F4A 00-FF + F4B 00-FF + F4C 00-FF + F4D 00-FF + F4E 00-FF + F4F 00-FF + F50 00-FF + F51 00-FF + F52 00-FF + F53 00-FF + F54 00-FF + F55 00-FF + F56 00-FF + F57 00-FF + F58 00-FF + F59 00-FF + F5A 00-FF + F5B 00-FF + F5C 00-FF + F5D 00-FF + F5E 00-FF + F5F 00-FF + F60 00-FF + F61 00-FF + F62 00-FF + F63 00-FF + F64 00-FF + F65 00-FF + F66 00-FF + F67 00-FF + F68 00-FF + F69 00-FF + F6A 00-FF + F6B 00-FF + F6C 00-FF + F6D 00-FF + F6E 00-FF + F6F 00-FF + F70 00-FF + F71 00-FF + F72 00-FF + F73 00-FF + F74 00-FF + F75 00-FF + F76 00-FF + F77 00-FF + F78 00-FF + F79 00-FF + F7A 00-FF + F7B 00-FF + F7C 00-FF + F7D 00-FF + F7E 00-FF + F7F 00-FF + F80 00-FF + F81 00-FF + F82 00-FF + F83 00-FF + F84 00-FF + F85 00-FF + F86 00-FF + F87 00-FF + F88 00-FF + F89 00-FF + F8A 00-FF + F8B 00-FF + F8C 00-FF + F8D 00-FF + F8E 00-FF + F8F 00-FF + F90 00-FF + F91 00-FF + F92 00-FF + F93 00-FF + F94 00-FF + F95 00-FF + F96 00-FF + F97 00-FF + F98 00-FF + F99 00-FF + F9A 00-FF + F9B 00-FF + F9C 00-FF + F9D 00-FF + F9E 00-FF + F9F 00-FF + FA0 00-FF + FA1 00-FF + FA2 00-FF + FA3 00-FF + FA4 00-FF + FA5 00-FF + FA6 00-FF + FA7 00-FF + FA8 00-FF + FA9 00-FF + FAA 00-FF + FAB 00-FF + FAC 00-FF + FAD 00-FF + FAE 00-FF + FAF 00-FF + FB0 00-FF + FB1 00-FF + FB2 00-FF + FB3 00-FF + FB4 00-FF + FB5 00-FF + FB6 00-FF + FB7 00-FF + FB8 00-FF + FB9 00-FF + FBA 00-FF + FBB 00-FF + FBC 00-FF + FBD 00-FF + FBE 00-FF + FBF 00-FF + FC0 00-FF + FC1 00-FF + FC2 00-FF + FC3 00-FF + FC4 00-FF + FC5 00-FF + FC6 00-FF + FC7 00-FF + FC8 00-FF + FC9 00-FF + FCA 00-FF + FCB 00-FF + FCC 00-FF + FCD 00-FF + FCE 00-FF + FCF 00-FF + FD0 00-FF + FD1 00-FF + FD2 
00-FF + FD3 00-FF + FD4 00-FF + FD5 00-FF + FD6 00-FF + FD7 00-FF + FD8 00-FF + FD9 00-FF + FDA 00-FF + FDB 00-FF + FDC 00-FF + FDD 00-FF + FDE 00-FF + FDF 00-FF + FE0 00-FF + FE1 00-FF + FE2 00-FF + FE3 00-FF + FE4 00-FF + FE5 00-FF + FE6 00-FF + FE7 00-FF + FE8 00-FF + FE9 00-FF + FEA 00-FF + FEB 00-FF + FEC 00-FF + FED 00-FF + FEE 00-FF + FEF 00-FF + FF0 00-FF + FF1 00-FF + FF2 00-FF + FF3 00-FF + FF4 00-FF + FF5 00-FF + FF6 00-FF + FF7 00-FF + FF8 00-FF + FF9 00-FF + FFA 00-FF + FFB 00-FF + FFC 00-FF + FFD 00-FF + FFE 00-FF + FFF 00-FD + 1000 00-FF + 1001 00-FF + 1002 00-FF + 1003 00-FF + 1004 00-FF + 1005 00-FF + 1006 00-FF + 1007 00-FF + 1008 00-FF + 1009 00-FF + 100A 00-FF + 100B 00-FF + 100C 00-FF + 100D 00-FF + 100E 00-FF + 100F 00-FF + 1010 00-FF + 1011 00-FF + 1012 00-FF + 1013 00-FF + 1014 00-FF + 1015 00-FF + 1016 00-FF + 1017 00-FF + 1018 00-FF + 1019 00-FF + 101A 00-FF + 101B 00-FF + 101C 00-FF + 101D 00-FF + 101E 00-FF + 101F 00-FF + 1020 00-FF + 1021 00-FF + 1022 00-FF + 1023 00-FF + 1024 00-FF + 1025 00-FF + 1026 00-FF + 1027 00-FF + 1028 00-FF + 1029 00-FF + 102A 00-FF + 102B 00-FF + 102C 00-FF + 102D 00-FF + 102E 00-FF + 102F 00-FF + 1030 00-FF + 1031 00-FF + 1032 00-FF + 1033 00-FF + 1034 00-FF + 1035 00-FF + 1036 00-FF + 1037 00-FF + 1038 00-FF + 1039 00-FF + 103A 00-FF + 103B 00-FF + 103C 00-FF + 103D 00-FF + 103E 00-FF + 103F 00-FF + 1040 00-FF + 1041 00-FF + 1042 00-FF + 1043 00-FF + 1044 00-FF + 1045 00-FF + 1046 00-FF + 1047 00-FF + 1048 00-FF + 1049 00-FF + 104A 00-FF + 104B 00-FF + 104C 00-FF + 104D 00-FF + 104E 00-FF + 104F 00-FF + 1050 00-FF + 1051 00-FF + 1052 00-FF + 1053 00-FF + 1054 00-FF + 1055 00-FF + 1056 00-FF + 1057 00-FF + 1058 00-FF + 1059 00-FF + 105A 00-FF + 105B 00-FF + 105C 00-FF + 105D 00-FF + 105E 00-FF + 105F 00-FF + 1060 00-FF + 1061 00-FF + 1062 00-FF + 1063 00-FF + 1064 00-FF + 1065 00-FF + 1066 00-FF + 1067 00-FF + 1068 00-FF + 1069 00-FF + 106A 00-FF + 106B 00-FF + 106C 00-FF + 106D 00-FF + 106E 00-FF + 106F 
00-FF + 1070 00-FF + 1071 00-FF + 1072 00-FF + 1073 00-FF + 1074 00-FF + 1075 00-FF + 1076 00-FF + 1077 00-FF + 1078 00-FF + 1079 00-FF + 107A 00-FF + 107B 00-FF + 107C 00-FF + 107D 00-FF + 107E 00-FF + 107F 00-FF + 1080 00-FF + 1081 00-FF + 1082 00-FF + 1083 00-FF + 1084 00-FF + 1085 00-FF + 1086 00-FF + 1087 00-FF + 1088 00-FF + 1089 00-FF + 108A 00-FF + 108B 00-FF + 108C 00-FF + 108D 00-FF + 108E 00-FF + 108F 00-FF + 1090 00-FF + 1091 00-FF + 1092 00-FF + 1093 00-FF + 1094 00-FF + 1095 00-FF + 1096 00-FF + 1097 00-FF + 1098 00-FF + 1099 00-FF + 109A 00-FF + 109B 00-FF + 109C 00-FF + 109D 00-FF + 109E 00-FF + 109F 00-FF + 10A0 00-FF + 10A1 00-FF + 10A2 00-FF + 10A3 00-FF + 10A4 00-FF + 10A5 00-FF + 10A6 00-FF + 10A7 00-FF + 10A8 00-FF + 10A9 00-FF + 10AA 00-FF + 10AB 00-FF + 10AC 00-FF + 10AD 00-FF + 10AE 00-FF + 10AF 00-FF + 10B0 00-FF + 10B1 00-FF + 10B2 00-FF + 10B3 00-FF + 10B4 00-FF + 10B5 00-FF + 10B6 00-FF + 10B7 00-FF + 10B8 00-FF + 10B9 00-FF + 10BA 00-FF + 10BB 00-FF + 10BC 00-FF + 10BD 00-FF + 10BE 00-FF + 10BF 00-FF + 10C0 00-FF + 10C1 00-FF + 10C2 00-FF + 10C3 00-FF + 10C4 00-FF + 10C5 00-FF + 10C6 00-FF + 10C7 00-FF + 10C8 00-FF + 10C9 00-FF + 10CA 00-FF + 10CB 00-FF + 10CC 00-FF + 10CD 00-FF + 10CE 00-FF + 10CF 00-FF + 10D0 00-FF + 10D1 00-FF + 10D2 00-FF + 10D3 00-FF + 10D4 00-FF + 10D5 00-FF + 10D6 00-FF + 10D7 00-FF + 10D8 00-FF + 10D9 00-FF + 10DA 00-FF + 10DB 00-FF + 10DC 00-FF + 10DD 00-FF + 10DE 00-FF + 10DF 00-FF + 10E0 00-FF + 10E1 00-FF + 10E2 00-FF + 10E3 00-FF + 10E4 00-FF + 10E5 00-FF + 10E6 00-FF + 10E7 00-FF + 10E8 00-FF + 10E9 00-FF + 10EA 00-FF + 10EB 00-FF + 10EC 00-FF + 10ED 00-FF + 10EE 00-FF + 10EF 00-FF + 10F0 00-FF + 10F1 00-FF + 10F2 00-FF + 10F3 00-FF + 10F4 00-FF + 10F5 00-FF + 10F6 00-FF + 10F7 00-FF + 10F8 00-FF + 10F9 00-FF + 10FA 00-FF + 10FB 00-FF + 10FC 00-FF + 10FD 00-FF + 10FE 00-FF + 10FF 00-FD + diff --git a/newlib/libc/string/mkunidata b/newlib/libc/string/mkunidata new file mode 100755 index 0000000..c0bf5de 
# ===========================================================================
# newlib/libc/string/mkunidata  (new file, mode 100755)
#
# Driver script: checks for (or, with -u, downloads) the uniset tool and the
# Unicode data files, then generates the width tables used by wcwidth.c.
# ===========================================================================
#! /bin/sh

echo generating Unicode width data for newlib/libc/string/wcwidth.c

# Work in the directory that holds this script and its helper tools.
cd "`dirname "$0"`" || exit 9
PATH="$PATH":.	# ensure access to uniset tool

#############################################################################
# checks and (with option -u) downloads

# fetch URL
# Download URL into the current directory under its remote file name,
# keeping the remote timestamp (-R).  If a local copy already exists it is
# passed to curl as time condition (-z), so the file is only transferred
# again when the remote copy is newer (the equivalent of "wget -N").
# A shell function is used because a command line with arguments cannot be
# stored in a plain sh variable assignment.
fetch () {
	target=`basename "$1"`
	if test -r "$target"
	then	curl -R -O --connect-timeout 55 -z "$target" "$1"
	else	curl -R -O --connect-timeout 55 "$1"
	fi
}

case "$1" in
-u)
	echo downloading uniset tool
	fetch http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz
	gzip -dc uniset.tar.gz | tar xvf - uniset

	echo downloading data from unicode.org
	for data in UnicodeData.txt Blocks.txt EastAsianWidth.txt
	do	fetch http://unicode.org/Public/UNIDATA/$data
	done
	;;
*)	echo checking package unicode-ucd
	grep unicode-ucd /etc/setup/installed.db || exit 9
	;;
esac

echo checking uniset tool
type uniset || exit 9

# Fall back to the system-wide copies of the Unicode data files.
for data in UnicodeData.txt Blocks.txt EastAsianWidth.txt
do	test -r $data || ln -s /usr/share/unicode/ucd/$data . || exit 9
done

echo generating from Unicode version `sed -e 's,[^.0-9],,g' -e 1q Blocks.txt`
# NOTE(review): this unconditional exit makes the whole table generation
# section below unreachable; it looks like a leftover -- confirm with the
# author before removing it.
exit

#############################################################################
# table generation

echo generating combining characters table
uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B +D7B0-D7C6 +D7CB-D7FB c > combining.t

echo generating ambiguous width characters table
sh ./mkwidthA && uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c > ambiguous.t

echo generating wide characters table
sh ./mkwide

#############################################################################
# end

# ===========================================================================
# newlib/libc/string/mkwide  (new file, mode 100755)
# ===========================================================================
#! /bin/sh

# generate list of wide characters, with convex closure

skipcheck=false

if [ ! -r EastAsianWidth.txt ]
then	ln -s /usr/share/unicode/ucd/EastAsianWidth.txt . || exit 1
fi
if [ ! -r UnicodeData.txt ]
then	ln -s /usr/share/unicode/ucd/UnicodeData.txt . || exit 1
fi
if [ ! -r Blocks.txt ]
then	ln -s /usr/share/unicode/ucd/Blocks.txt . || exit 1
fi

# Split EastAsianWidth.txt into the code points/ranges whose width property
# starts with N, A or H (wide.na) and those that are W or F (wide.fw).
sed -e "s,^\([^;]*\);[NAH],\1," -e t -e d EastAsianWidth.txt > wide.na
sed -e "s,^\([^;]*\);[WF],\1," -e t -e d EastAsianWidth.txt > wide.fw

PATH="$PATH:."	# for uniset

# Sizes of both sets; "uniset ... nr" prints "...: N", sed keeps the part
# after the last colon.
nrfw=`uniset +wide.fw nr | sed -e 's,.*:,,'`
echo FW $nrfw
nrna=`uniset +wide.na nr | sed -e 's,.*:,,'`
echo NAH $nrna

extrablocks="2E80-303E"

# check all blocks
# includes RANGE SET COUNT
# Succeeds if removing RANGE from wide.SET changes its size from COUNT,
# i.e. if RANGE contains at least one member of the set.
# The operands of test are deliberately left unquoted: the counts carry
# leading blanks from the sed extraction which word splitting strips.
includes () {
	nr=`uniset +wide.$2 -$1 nr | sed -e 's,.*:,,'`
	test $nr != $3
}
echo "adding compact closure of wide ranges, this may take ~10min"
# Keep every block (plus the extra ranges) that contains wide/fullwidth
# characters but no N/A/H characters.
for b in $extrablocks `sed -e 's,^\([0-9A-F]*\)\.\.\([0-9A-F]*\).*,\1-\2,' -e t -e d Blocks.txt`
do	range=$b
	echo checking $range $* >&2
	if includes $range fw $nrfw && ! includes $range na $nrna
	then	echo $range
	fi
done > wide.blocks

# Emit the C interval table, preceded by the first line of each data file
# as a C++-style comment.
(
sed -e "s,^,//," -e 1q EastAsianWidth.txt
sed -e "s,^,//," -e 1q Blocks.txt
uniset `sed -e 's,^,+,' wide.blocks` +wide.fw c
) > wide.t

rm -f wide.na wide.fw wide.blocks

# ===========================================================================
# newlib/libc/string/mkwidthA  (new file, mode 100755)
# ===========================================================================
#! /bin/sh

# generate WIDTH-A file, listing Unicode characters with width property
# Ambiguous, from EastAsianWidth.txt

if [ ! -r EastAsianWidth.txt ]
then	ln -s /usr/share/unicode/ucd/EastAsianWidth.txt . || exit 1
fi
if [ ! -r UnicodeData.txt ]
then	ln -s /usr/share/unicode/ucd/UnicodeData.txt . || exit 1
fi
if [ ! -r Blocks.txt ]
then	ln -s /usr/share/unicode/ucd/Blocks.txt . || exit 1
fi

# Extract the code points/ranges with width property A and let uniset
# rewrite them in its compact row/cell format.
sed -e "s,^\([^;]*\);A,\1," -e t -e d EastAsianWidth.txt > width-a-new
rm -f WIDTH-A
echo "# UAX #11: East Asian Ambiguous" > WIDTH-A
PATH="$PATH:." uniset +width-a-new compact >> WIDTH-A
rm -f width-a-new
#!/usr/bin/perl
# Uniset -- Unicode subset manager -- Markus Kuhn
# http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz
#
# The command-line arguments are processed strictly from left to right:
# arguments starting with "+" or "-" add code points to or remove them from
# the current set, every other word is a command that modifies or prints
# the set collected so far.  UnicodeData.txt and Blocks.txt are loaded
# before the first argument is processed.

require 5.008;
use strict;
use open ':utf8';
use FindBin qw($RealBin); # to find directory where this file is located

binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");

# Tables filled from UnicodeData.txt, all keyed by numeric code point
# except %invname, which maps a character name back to its code point.
my (%name, %invname, %category, %comment);

print <<End if $#ARGV < 0;
Uniset -- Unicode subset manager -- Markus Kuhn

Uniset merges and subtracts Unicode subsets. It can output and
analyse the resulting character set in various formats.

Uniset understand the following command-line arguments:

Commands to define a set of characters:

  + filename   add the character set described in the file to the set
  - filename   remove the character set described in the file from the set
  +: filename  add the characters in the UTF-8 file to the set
  -: filename  remove the characters in the UTF-8 file from the set
  +xxxx..yyyy  add the range to the set (xxxx and yyyy are hex numbers)
  -xxxx..yyyy  remove the range from the set (xxxx and yyyy are hex numbers)
  +cat=Xx      add all Unicode characters with category code Xx
  -cat=Xx      remove all Unicode characters with category code Xx
  -cat!=Xx     remove all Unicode characters without category code Xx
  clean        remove any elements that do not appear in the Unicode database
  unknown      remove any elements that do appear in the Unicode database

Command to output descriptions of the constructed set of characters:

  table        write a full table with one line per character
  compact      output the set in compact MES format
  c            output the set as C interval array
  nr           output the number of characters
  sources      output a table that shows the number of characters contributed
               by the various combinations of input sets added with +.
  utf8-list    output a list of all characters encoded in UTF-8

Commands to tailor the following output commands:

  html         write HTML tables instead of plain text
  ucs          add the unicode character itself to the table (UTF-8 in
               plain table, numeric character reference in HTML)

Formats of character set input files read by the + and - command:

Empty lines, white space at the start and end of the line and any
comment text following a \# are ignored. The following formats are
recognized

xx yyyy             xx is the hex code in an 8-bit character set and yyyy
                    is the corresponding Unicode value. Both can optionally
                    be prefixed by 0x. This is the format used in the
                    files on <ftp://ftp.unicode.org/Public/MAPPINGS/>.

yyyy                yyyy (optionally prefixed with 0x) is a Unicode character
                    belonging to the specified subset.

yyyy-yyyy           a range of Unicode characters belonging to
yyyy..yyyy          the specified subset.

xx yy yy yy-yy yy   xx denotes a row (high-byte) and the yy specify
                    corresponding low bytes or with a hyphen also ranges of
                    low bytes in the Unicode values that belong to this
                    subset. This is also the format that is generated by
                    the compact command.
End
exit 1 if $#ARGV < 0;


# Subroutine to identify whether the ISO 10646/Unicode character code
# ucs belongs into the East Asian Wide (W) or East Asian FullWidth
# (F) category as defined in Unicode Technical Report #11.
# Returns a true value for wide characters and a false value otherwise;
# the result is used numerically as the extra column count in utf8-list.

sub iswide {
    my $ucs = shift(@_);

    return ($ucs >= 0x1100 &&
            ($ucs <= 0x115f ||                      # Hangul Jamo
             $ucs == 0x2329 || $ucs == 0x232a ||
             ($ucs >= 0x2e80 && $ucs <= 0xa4cf &&
              $ucs != 0x303f) ||                    # CJK .. Yi
             ($ucs >= 0xac00 && $ucs <= 0xd7a3) ||  # Hangul Syllables
             ($ucs >= 0xf900 && $ucs <= 0xfaff) ||  # CJK Comp. Ideographs
             ($ucs >= 0xfe30 && $ucs <= 0xfe6f) ||  # CJK Comp. Forms
             ($ucs >= 0xff00 && $ucs <= 0xff60) ||  # Fullwidth Forms
             ($ucs >= 0xffe0 && $ucs <= 0xffe6) ||
             ($ucs >= 0x20000 && $ucs <= 0x2fffd) ||
             ($ucs >= 0x30000 && $ucs <= 0x3fffd)));
}

# Return the Unicode name that belongs to a given character code

# Jamo short names, see Unicode 3.0, table 4-4, page 86

my @lname = ('G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '',
             'J', 'JJ', 'C', 'K', 'T', 'P', 'H'); # 1100..1112
my @vname = ('A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O',
             'WA', 'WAE', 'OE', 'YO', 'U', 'WEO', 'WE', 'WI', 'YU',
             'EU', 'YI', 'I'); # 1161..1175
my @tname = ('G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM',
             'LB', 'LS', 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS',
             'NG', 'J', 'C', 'K', 'T', 'P', 'H'); # 11a8..11c2

# name(ucs): return the character name for code point ucs.  Names of CJK
# unified ideographs and Hangul syllables are generated, everything else is
# looked up in %name (undef if the code point is not listed there).
sub name {
    my $ucs = shift(@_);

    # The intervals used here reflect Unicode Version 3.2
    if (($ucs >= 0x3400 && $ucs <= 0x4db5) ||
        ($ucs >= 0x4e00 && $ucs <= 0x9fa5) ||
        ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
        return "CJK UNIFIED IDEOGRAPH-" . sprintf("%04X", $ucs);
    }

    if ($ucs >= 0xac00 && $ucs <= 0xd7a3) {
        # Decompose the syllable index into leading consonant, vowel and
        # optional trailing consonant (21 vowels, 28 trailing positions).
        my $s = $ucs - 0xac00;
        my $l = int($s / (21 * 28));
        my $v = int(($s % (21 * 28)) / 28);
        my $t = $s % 28;
        # Trailing index 0 means "no trailing consonant".  It must not be
        # turned into $tname[-1], which would wrongly append the last
        # table entry ('H').
        return "HANGUL SYLLABLE " .
            $lname[$l] . $vname[$v] . ($t > 0 ? $tname[$t - 1] : '');
    }

    return $name{$ucs};
}

# is_unicode(ucs): true if ucs lies in one of the algorithmically named
# ranges or has an entry in %name.
sub is_unicode {
    my $ucs = shift(@_);

    # The intervals used here reflect Unicode Version 3.2
    if (($ucs >= 0x3400 && $ucs <= 0x4db5) ||
        ($ucs >= 0x4e00 && $ucs <= 0x9fa5) ||
        ($ucs >= 0xac00 && $ucs <= 0xd7a3) ||
        ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
        return 1;
    }

    return exists $name{$ucs};
}

# Directories that are searched for data and set files which cannot be
# opened under the name given.
my @search_path;
push @search_path, "$ENV{HOME}/local/share/uniset"
    if -d "$ENV{HOME}/local/share/uniset";
push @search_path, "/usr/share/uniset" if -d "/usr/share/uniset";
push @search_path, $RealBin unless $RealBin =~ m|^/usr/bin|;

# search_open(mode, fn): open fn with the given open() mode and return the
# file handle.  A name without "/" that cannot be opened directly is also
# tried in each directory of @search_path.  Returns undef on failure; $!
# then holds the error of the last attempt.
sub search_open {
    my ($mode, $fn) = @_;
    my $file;
    return $file if open($file, $mode, $fn);
    return if $fn =~ m|/|;
    for my $path (@search_path) {
        return $file if open($file, $mode, "$path/$fn");
    }
    return;
}

my $html = 0;      # set by "html"/"img": write HTML instead of plain text
my $image = 0;     # set by "img": add glyph images to HTML tables
my $adducs = 0;    # set by "ucs": add the character itself to tables
my $unicodedata = "UnicodeData.txt";
my $blockdata = "Blocks.txt";

# read list of all Unicode names
my $data = search_open('<', $unicodedata);
unless ($data) {
    die ("Can't open Unicode database '$unicodedata':\n$!\n\n" .
         "Please make sure that you have downloaded the file\n" .
         "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n");
}
# One UnicodeData.txt record: a hex code point followed by 14 further
# semicolon-separated fields.
my $unicodedata_re = '^([0-9A-F]{4,8})' . (';([^;]*)' x 14) . '$';
$unicodedata_re = qr/$unicodedata_re/;
while (<$data>) {
    if (/$unicodedata_re/) {
        # skip range markers such as "<CJK Ideograph, First>"
        next if $2 ne '<control>' && substr($2, 0, 1) eq '<';
        my $ucs = hex($1);
        $name{$ucs} = $2;
        $invname{$2} = $ucs;
        $category{$ucs} = $3;
        $comment{$ucs} = $12;
    } else {
        die("Syntax error in line '$_' in file '$unicodedata'");
    }
}
close($data);

# read list of all Unicode blocks
$data = search_open('<', $blockdata);
unless ($data) {
    die ("Can't open Unicode blockname list '$blockdata':\n$!\n\n" .
         "Please make sure that you have downloaded the file\n" .
         "http://www.unicode.org/Public/UNIDATA/Blocks.txt\n");
}
my $blocks = 0;
my (@blockstart, @blockend, @blockname);
while (<$data>) {
    if (/^\s*([0-9A-F]{4,8})\s*\.\.\s*([0-9A-F]{4,8})\s*;\s*(.*)$/) {
        $blockstart[$blocks] = hex($1);
        $blockend  [$blocks] = hex($2);
        $blockname [$blocks] = $3;
        $blocks++;
    } elsif (/^\s*\#/ || /^\s*$/) {
        # ignore comments and empty lines
    } else {
        die("Syntax error in line '$_' in file '$blockdata'");
    }
}
close($data);
# catch-all pseudo block for everything behind the last listed block
if ($blockend[$blocks-1] < 0x110000) {
    $blockstart[$blocks] = 0x110000;
    $blockend  [$blocks] = 0x7FFFFFFF;
    $blockname [$blocks] = "Beyond Plane 16";
    $blocks++;
}

# The character set under construction: code point => concatenation of
# "[source]" tags naming the inputs that contributed the character.
my %used;
my @SETS;       # names of all set files read so far
my %contribs;   # "sources" command: source combination => character count

# process command line arguments
# (tested with defined() so that an argument "0" is reported as unknown
# command instead of silently ending the processing)
while (defined($_ = shift(@ARGV))) {
    if (/^html$/) {
        $html = 1;
    } elsif (/^ucs$/) {
        $adducs = 1;
    } elsif (/^img$/) {
        $html = 1;
        $image = 1;
    } elsif (/^template$/) {
        # copy a template file to stdout, expanding '#include "file"'
        # (verbatim) and '#quote "file"' (HTML-escaped) lines
        my $template = shift(@ARGV);
        open(my $tfh, '<', $template)
            || die("Can't open template file '$template': '$!'");
        while (my $line = <$tfh>) {
            if ($line =~ /^\#\s*include\s+\"([^\"]*)\"\s*$/) {
                my $incname = $1;
                open(my $inc, '<', $incname)
                    || die("Can't open template include file '$incname': '$!'");
                while (my $incline = <$inc>) {
                    print $incline;
                }
                close($inc);
            } elsif ($line =~ /^\#\s*quote\s+\"([^\"]*)\"\s*$/) {
                my $incname = $1;
                open(my $inc, '<', $incname)
                    || die("Can't open template include file '$incname': '$!'");
                while (my $incline = <$inc>) {
                    # escape the characters that are special in HTML text
                    $incline =~ s/&/&amp;/g;
                    $incline =~ s/</&lt;/g;
                    print $incline;
                }
                close($inc);
            } else {
                print $line;
            }
        }
        close($tfh);
    } elsif (/^\+cat=(.+)$/) {
        # add characters with given category
        my $cat = $1;
        for my $i (keys(%category)) {
            $used{$i} = "[${cat}]" if $category{$i} eq $cat;
        }
    } elsif (/^\-cat=(.+)$/) {
        # remove characters with given category
        my $cat = $1;
        for my $i (keys(%category)) {
            delete $used{$i} if $category{$i} eq $cat;
        }
    } elsif (/^\-cat!=(.+)$/) {
        # remove characters without given category
        my $cat = $1;
        for my $i (keys(%category)) {
            delete $used{$i} unless $category{$i} eq $cat;
        }
    } elsif (/^([+-]):(.*)/) {
        # add/remove every character that occurs in a UTF-8 text file
        my $remove = $1 eq "-";
        my $setfile = $2;
        $setfile = shift(@ARGV) if $setfile eq "";
        push(@SETS, $setfile);
        open(my $set, '<', $setfile)
            || die("Can't open set file '$setfile': '$!'");
        my $setname = $setfile;
        while (my $line = <$set>) {
            # iterate over the characters themselves; testing the rest of
            # the line for truth would stop early on a remaining "0"
            for my $char (split(//, $line)) {
                my $i = ord($char);
                $used{$i} .= "[${setname}]" unless $remove;
                delete $used{$i} if $remove;
            }
        }
        close($set);
    } elsif (/^([+-])(.*)/) {
        my $remove = $1 eq "-";
        my $setfile = $2;
        $setfile = "$setfile..$setfile" if $setfile =~ /^([0-9A-Fa-f]{4,8})$/;
        if ($setfile =~ /^([0-9A-Fa-f]{4,8})(-|\.\.)([0-9A-Fa-f]{4,8})$/) {
            # handle interval specification on command line
            my $first = hex($1);
            my $last = hex($3);
            for my $i ($first .. $last) {
                $used{$i} .= "[ARG]" unless $remove;
                delete $used{$i} if $remove;
            }
            next;
        }
        $setfile = shift(@ARGV) if $setfile eq "";
        push(@SETS, $setfile);
        my $setf = search_open('<', $setfile);
        die("Can't open set file '$setfile': '$!'") unless $setf;
        my $cedf = ($setfile =~ /cedf/); # detect Kosta Kosti's trans CEDF format by path name
        # the set is tagged with the file name up to its first "."
        my $setname = $setfile;
        $setname =~ s/([^.\[\]]*)\..*/$1/;
        while (<$setf>) {
            if (/^<code_set_name>/) {
                # handle ISO 15897 (POSIX registry) charset mapping format;
                # the inner loop consumes the rest of the file
                my ($comment_char, $escape_char);
                while (<$setf>) {
                    if ($comment_char && /^$comment_char/) {
                        # remove comments
                        $_ = $`;
                    }
                    next if (/^\032?\s*$/);   # skip empty lines
                    if (/^<comment_char> (\S)$/) {
                        $comment_char = $1;
                    } elsif (/^<escape_char> (\S)$/) {
                        $escape_char = $1;
                    } elsif (/^(END )?CHARMAP$/) {
                        #ignore
                    } elsif (/^<.*>\s*\/x([0-9A-F]{2})\s*<U([0-9A-F]{4,8})>/) {
                        $used{hex($2)} .= "[${setname}{$1}]" unless $remove;
                        delete $used{hex($2)} if $remove;
                    } else {
                        die("Syntax error in line $. in file '$setfile':\n'$_'\n");
                    }
                }
                next;
            } elsif (/^STARTFONT /) {
                # handle X11 BDF file; the inner loop consumes the rest of
                # the file and only looks at the ENCODING lines
                while (<$setf>) {
                    if (/^ENCODING\s+([0-9]+)/) {
                        $used{$1} .= "[${setname}]" unless $remove;
                        delete $used{$1} if $remove;
                    }
                }
                next;
            }
            tr/a-z/A-Z/;              # make input uppercase
            if ($cedf) {
                if ($. > 4) {
                    if (/^([0-9A-F]{2})\t.?\t(.*)$/) {
                        # handle Kosta Kosti's trans CEDF format
                        next if (hex($1) < 32 || (hex($1) > 0x7e && hex($1) < 0xa0));
                        my $ucs = $invname{$2};
                        die "unknown ISO 10646 name '$2' in '$setfile' line $..\n" if ! $ucs;
                        $used{$ucs} .= "[${setname}{$1}]" unless $remove;
                        delete $used{$ucs} if $remove;
                    } else {
                        die("Syntax error in line $. in CEDF file '$setfile':\n'$_'\n");
                    }
                }
                next;
            }
            if (/^\s*(0X|U\+|U-)?([0-9A-F]{2})\s+\#\s*UNDEFINED\s*$/) {
                # ignore ftp.unicode.org mapping file lines with #UNDEFINED
                next;
            }
            s/^([^\#]*)\#.*$/$1/;     # remove comments
            next if (/^\032?\s*$/);   # skip empty lines
            if (/^\s*(0X)?([0-9A-F-]{2})\s+(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
                # handle entry from a ftp.unicode.org mapping file
                $used{hex($4)} .= "[${setname}{$2}]" unless $remove;
                delete $used{hex($4)} if $remove;
            } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})(\s*-\s*|\s*\.\.\s*|\s+)(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
                # handle interval specification
                my $first = hex($2);
                my $last = hex($5);
                for my $i ($first .. $last) {
                    $used{$i} .= "[${setname}]" unless $remove;
                    delete $used{$i} if $remove;
                }
            } elsif (/^\s*([0-9A-F]{2,6})(\s+[0-9A-F]{2},?|\s+[0-9A-F]{2}-[0-9A-F]{2},?)+/) {
                # handle lines from P10 MES draft: a row number followed
                # by cells or cell ranges within that row
                my $row = $1;
                my $cols = $_;
                $cols =~ s/^\s*([0-9A-F]{2,6})\s*(.*)\s*$/$2/;
                $cols =~ tr/,//d;
                my @cols = split(/\s+/, $cols);
                for (@cols) {
                    my ($first, $last);
                    if (/^(..)$/) {
                        $first = hex("$row$1");
                        $last  = $first;
                    } elsif (/^(..)-(..)$/) {
                        $first = hex("$row$1");
                        $last  = hex("$row$2");
                    } else {
                        die ("this should never happen '$_'");
                    }
                    for my $i ($first .. $last) {
                        $used{$i} .= "[${setname}]" unless $remove;
                        delete $used{$i} if $remove;
                    }
                }
            } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})\s*/) {
                # handle single character
                $used{hex($2)} .= "[${setname}]" unless $remove;
                delete $used{hex($2)} if $remove;
            } else {
                die("Syntax error in line $. in file '$setfile':\n'$_'\n") unless /^\s*(\#.*)?$/;
            }
        }
        close $setf;
    } elsif (/^loadimages$/ || /^loadbigimages$/) {
        # download the glyph images of all characters in the set
        my $prefix;
        if (/^loadimages$/) {
            $prefix = "Small.Glyphs";
        } else {
            $prefix = "Glyphs";
        }
        my $total = 0;
        for my $i (keys(%used)) {
            next if ($name{$i} eq "<control>");
            $total++;
        }
        my $count = 0;
        $| = 1;
        for my $i (sort({$a <=> $b} keys(%used))) {
            next if ($name{$i} eq "<control>");
            $count++;
            my $j = sprintf("%04X", $i);
            $j =~ /(..)(..)/;
            my $gif = "http://charts.unicode.org/Unicode.charts/$prefix/$1/U$j.gif";
            print("\r$count/$total: $gif");
            system("mkdir -p $prefix/$1; cd $prefix/$1; webcopy -u -s $gif &");
            # wait 0.2 s between the background downloads
            select(undef, undef, undef, 0.2);
        }
        print("\n");
        exit 0;
    } elsif (/^giftable/) {
        # form a table of glyphs (requires pbmtools installed)
        my $count = 0;
        for my $i (keys(%used)) {
            $count++ unless $name{$i} eq "<control>";
        }
        # default table width gives a roughly 1:sqrt(2) aspect ratio
        my $width = int(sqrt($count/sqrt(2)) + 0.5);
        $width = $1 if /^giftable([0-9]+)$/;
        system("rm -f tmp-*.pnm table.pnm~ table.pnm");
        my $col = 0;
        my $row = 0;
        for my $i (sort({$a <=> $b} keys(%used))) {
            next if ($name{$i} eq "<control>");
            my $j = sprintf("%04X", $i);
            $j =~ /(..)(..)/;
            my $gif = "Small.Glyphs/$1/U$j.gif";
            my $pnm = sprintf("tmp-%02x.pnm", $col);
            my $fallback = "Small.Glyphs/FF/UFFFD.gif";
            system("giftopnm $gif >$pnm || { rm $pnm ; giftopnm $fallback >$pnm ; }");
            if (++$col == $width) {
                # row complete: join its cells and append it to the table
                system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
                if ($row == 0) {
                    system("mv tmp-row.pnm table.pnm");
                } else {
                    system("mv table.pnm table.pnm~; pnmcat -tb table.pnm~ tmp-row.pnm >table.pnm");
                }
                $row++;
                $col = 0;
                system("rm -f tmp-*.pnm table.pnm~");
            }
        }
        if ($col > 0) {
            # incomplete last row
            system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
            if ($row == 0) {
                system("mv tmp-row.pnm table.pnm");
            } else {
                system("mv table.pnm table.pnm~; pnmcat -tb -jleft -black table.pnm~ tmp-row.pnm >table.pnm");
            }
        }
        system("rm -f table.gif ; ppmtogif table.pnm > table.gif");
        system("rm -f tmp-*.pnm table.pnm~ table.pnm");
    } elsif (/^table$/) {
        # go through all used names to print full table
        print "<TABLE border=2>\n" if $html;
        for my $i (sort({$a <=> $b} keys(%used))) {
            next if ($name{$i} eq "<control>");
            if ($html) {
                # turn "[a{XX}][b]" into "a<SUB>XX</SUB>, b"
                my $sources = $used{$i};
                $sources =~ s/\]\[/, /g;
                $sources =~ s/^\[//g;
                $sources =~ s/\]$//g;
                $sources =~ s/\{(..)\}/<SUB>$1<\/SUB>/g;
                my $j = sprintf("%04X", $i);
                $j =~ /(..)(..)/;
                my $gif = "Small.Glyphs/$1/U$j.gif";
                print "<TR>";
                print "<TD><img width=32 height=32 src=\"$gif\">" if $image;
                printf("<TD>&#%d;", $i) if $adducs;
                print "<TD><SAMP>$j</SAMP><TD><SAMP>" . name($i);
                print " ($comment{$i})" if $comment{$i};
                print "</SAMP><TD><SMALL>$sources</SMALL>\n";
            } else {
                printf("%04X \# ", $i);
                print pack("U", $i) . " " if $adducs;
                print name($i) ."\n";
            }
        }
        print "</TABLE>\n" if $html;
    } elsif (/^imgblock([0-9]*)$/) {
        # HTML table of glyph images with the code points printed below;
        # an optional number appended to the command sets the row width
        my $width = ($1 ne '') ? $1 : 16;
        my $col = 0;
        my $subline = "";
        print "\n<P><TABLE cellspacing=0 cellpadding=0>";
        for my $i (sort({$a <=> $b} keys(%used))) {
            print "<TR>" if $col == 0;
            my $j = sprintf("%04X", $i);
            $j =~ /(..)(..)/;
            my $gif = "Small.Glyphs/$1/U$j.gif";
            my $alt = name($i);
            print "<TD><img width=32 height=32 src=\"$gif\" alt=\"$alt\">";
            $subline .= "<TD><SMALL><SAMP>$j</SAMP></SMALL>";
            if (++$col == $width) {
                print "<TR align=center>$subline";
                $col = 0;
                $subline = "";
            }
        }
        print "<TR align=center>$subline" if ($col > 0);
        print "</TABLE>\n";
    } elsif (/^sources$/) {
        # count how many characters are attributed to the various source set combinations
        print "<P>Number of occurences of source character set combinations:\n<TABLE border=2>" if $html;
        for my $i (keys(%used)) {
            next if ($name{$i} eq "<control>");
            my $sources = $used{$i};
            $sources =~ s/\]\[/, /g;
            $sources =~ s/^\[//g;
            $sources =~ s/\]$//g;
            $sources =~ s/\{(..)\}//g;
            $contribs{$sources} += 1;
        }
        for my $j (sort(keys(%contribs))) {
            if ($html) {
                print "<TR><TD>$contribs{$j}<TD>$j\n";
            } else {
                # plain-text form of the same table
                printf("%6d %s\n", $contribs{$j}, $j);
            }
        }
        print "</TABLE>\n" if $html;
    } elsif (/^compact$/) {
        # print compact table in P10 MES format
        print "<P>Compact representation of this character set:\n<TABLE border=2>" if $html;
        print "<TR><TD><B>Rows</B><TD><B>Positions (Cells)</B>" if $html;
        print "\n# Plane 00\n# Rows\tPositions (Cells)\n" unless $html;
        my $current_row = '';   # row of the line being printed
        my $start_col = '';     # first cell of the pending range
        my $last_col = '';      # last cell of the pending range so far
        my $len;                # characters printed on the current line
        for my $i (sort({$a <=> $b} keys(%used))) {
            next if ($name{$i} eq "<control>");
            my $row = sprintf("%02X", $i >> 8);
            my $col = sprintf("%02X", $i & 0xff);
            if ($row ne $current_row) {
                # finish the pending range of the previous row
                if (($last_col ne '') and ($last_col ne $start_col)) {
                    print "-$last_col";
                    print "</SAMP>" if $html;
                }
                print "<TR><TD><SAMP>$row</SAMP><TD><SAMP>" if $html;
                print "\n $row\t" unless $html;
                $len = 0;
                $current_row = $row;
                $start_col = '';
            }
            if ($start_col eq '') {
                print "$col";
                $len += 2;
                $start_col = $col;
                $last_col = $col;
            } elsif (hex($col) == hex($last_col) + 1) {
                # cell extends the pending range
                $last_col = $col;
            } else {
                if ($last_col ne $start_col) {
                    print "-$last_col";
                    $len += 3;
                }
                # continue long rows on a new line with the same row number
                if ($len > 60 && !$html) {
                    print "\n $row\t";
                    $len = 0;
                };
                print " " if $len;
                print "$col";
                $len += 2 + !! $len;
                $start_col = $col;
                $last_col = $col;
            }
        }
        if (($last_col ne '') and ($last_col ne $start_col)) {
            print "-$last_col";
            print "</SAMP>" if $html;
        }
        print "\n" if ($current_row ne '');
        print "</TABLE>\n" if $html;
        print "\n";
    } elsif (/^c$/) {
        # print table as C interval array
        print "{";
        my $last_i = '';        # end of the interval being collected
        my $columns = 3;        # intervals per output line
        my $col = $columns;
        for my $i (sort({$a <=> $b} keys(%used))) {
            next if ($name{$i} eq "<control>");
            if ($last_i eq '') {
                if (++$col > $columns) { $col = 1; print "\n "; }
                printf(" { 0x%04X, ", $i);
                $last_i = $i;
            } elsif ($i == $last_i + 1) {
                $last_i = $i;
            } else {
                printf("0x%04X },", $last_i);
                if (++$col > $columns) { $col = 1; print "\n "; }
                printf(" { 0x%04X, ", $i);
                $last_i = $i;
            }
        }
        if ($last_i ne '') {
            printf("0x%04X }", $last_i);
        }
        print "\n};\n";
    } elsif (/^utf8-list$/) {
        # print the characters themselves, grouped by Unicode block
        my $col = 0;
        my $block = 0;
        my $last = -1;
        for my $i (sort({$a <=> $b} keys(%used))) {
            next if ($name{$i} eq "<control>");
            while ($blockend[$block] < $i && $block < $blocks - 1) {
                $block++;
            }
            if ($last <= $blockend[$block-1] &&
                $i < $blockstart[$block]) {
                print "\n" if ($col);
                printf "\nFree block (U+%04X-U+%04X):\n\n",
                    $blockend[$block-1] + 1, $blockstart[$block] - 1;
                $col = 0;
            }
            if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
                print "\n" if ($col);
                printf "\n$blockname[$block] (U+%04X-U+%04X):\n\n",
                    $blockstart[$block], $blockend[$block];
                $col = 0;
            }
            if ($category{$i} eq 'Mn') {
                # prefix non-spacing character with U+25CC DOTTED CIRCLE
                print "\x{25CC}";
            } elsif ($category{$i} eq 'Me') {
                # prefix enclosing non-spacing character with space
                print " ";
            }
            print pack("U", $i);
            # wide characters occupy two columns
            $col += 1 + iswide($i);
            if ($col >= 64) {
                print "\n";
                $col = 0;
            }
            $last = $i;
        }
        print "\n" if ($col);
    } elsif (/^collections$/) {
        # list the blocks that contain at least one character of the set
        my $block = 0;
        my $last = -1;
        for my $i (sort({$a <=> $b} keys(%used))) {
            next if ($name{$i} eq "<control>");
            while ($blockend[$block] < $i && $block < $blocks - 1) {
                $block++;
            }
            if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
                print $blockname[$block],
                    " " x (40 - length($blockname[$block]));
                printf "%04X-%04X\n",
                    $blockstart[$block], $blockend[$block];
            }
            $last = $i;
        }
    } elsif (/^nr$/) {
        print "<P>" if $html;
        print "# " unless $html;
        print "Number of characters in above table: ";
        my $count = 0;
        for my $i (keys(%used)) {
            $count++ unless $name{$i} eq "<control>";
        }
        print $count;
        print "\n";
    } elsif (/^clean$/) {
        # remove characters from set that are not in $unicodedata
        for my $i (keys(%used)) {
            delete $used{$i} unless is_unicode($i);
        }
    } elsif (/^unknown$/) {
        # remove characters from set that are in $unicodedata
        for my $i (keys(%used)) {
            delete $used{$i} if is_unicode($i);
        }
    } else {
        die("Unknown command line command '$_'");
    };
}