aboutsummaryrefslogtreecommitdiff
path: root/newlib/libc/string
diff options
context:
space:
mode:
author Thomas Wolff <towo@towo.net> 2018-03-07 23:55:52 +0100
committer Corinna Vinschen <corinna@vinschen.de> 2018-03-12 10:17:20 +0100
commit 37132125bcda103dd723f0cb4226c190ffbc9966 (patch)
tree d1d93cb0497e22607f4dc4ca00467c84906a80f4 /newlib/libc/string
parent 8e8fd6c849472d7bfe3949c44880abfcca7aed48 (diff)
download newlib-37132125bcda103dd723f0cb4226c190ffbc9966.zip
newlib-37132125bcda103dd723f0cb4226c190ffbc9966.tar.gz
newlib-37132125bcda103dd723f0cb4226c190ffbc9966.tar.bz2
width data generation
Diffstat (limited to 'newlib/libc/string')
-rw-r--r-- newlib/libc/string/WIDTH-A 569
-rwxr-xr-x newlib/libc/string/mkunidata 54
-rwxr-xr-x newlib/libc/string/mkwide 49
-rwxr-xr-x newlib/libc/string/mkwidthA 20
-rwxr-xr-x newlib/libc/string/uniset 696
5 files changed, 1388 insertions, 0 deletions
diff --git a/newlib/libc/string/WIDTH-A b/newlib/libc/string/WIDTH-A
new file mode 100644
index 0000000..51e8f23
--- /dev/null
+++ b/newlib/libc/string/WIDTH-A
@@ -0,0 +1,569 @@
+# UAX #11: East Asian Ambiguous
+
+# Plane 00
+# Rows Positions (Cells)
+
+ 00 A1 A4 A7-A8 AA AD-AE B0-B4 B6-BA BC-BF C6 D0 D7-D8 DE-E1 E6 E8-EA
+ 00 EC-ED F0 F2-F3 F7-FA FC FE
+ 01 01 11 13 1B 26-27 2B 31-33 38 3F-42 44 48-4B 4D 52-53 66-67 6B
+ 01 CE D0 D2 D4 D6 D8 DA DC
+ 02 51 61 C4 C7 C9-CB CD D0 D8-DB DD DF
+ 03 00-6F 91-A1 A3-A9 B1-C1 C3-C9
+ 04 01 10-4F 51
+ 20 10 13-16 18-19 1C-1D 20-22 24-27 30 32-33 35 3B 3E 74 7F 81-84
+ 20 AC
+ 21 03 05 09 13 16 21-22 26 2B 53-54 5B-5E 60-6B 70-79 89 90-99 B8-B9
+ 21 D2 D4 E7
+ 22 00 02-03 07-08 0B 0F 11 15 1A 1D-20 23 25 27-2C 2E 34-37 3C-3D
+ 22 48 4C 52 60-61 64-67 6A-6B 6E-6F 82-83 86-87 95 99 A5 BF
+ 23 12
+ 24 60-E9 EB-FF
+ 25 00-4B 50-73 80-8F 92-95 A0-A1 A3-A9 B2-B3 B6-B7 BC-BD C0-C1 C6-C8
+ 25 CB CE-D1 E2-E5 EF
+ 26 05-06 09 0E-0F 1C 1E 40 42 60-61 63-65 67-6A 6C-6D 6F 9E-9F BF
+ 26 C6-CD CF-D3 D5-E1 E3 E8-E9 EB-F1 F4 F6-F9 FB-FC FE-FF
+ 27 3D 76-7F
+ 2B 56-59
+ 32 48-4F
+ E0 00-FF
+ E1 00-FF
+ E2 00-FF
+ E3 00-FF
+ E4 00-FF
+ E5 00-FF
+ E6 00-FF
+ E7 00-FF
+ E8 00-FF
+ E9 00-FF
+ EA 00-FF
+ EB 00-FF
+ EC 00-FF
+ ED 00-FF
+ EE 00-FF
+ EF 00-FF
+ F0 00-FF
+ F1 00-FF
+ F2 00-FF
+ F3 00-FF
+ F4 00-FF
+ F5 00-FF
+ F6 00-FF
+ F7 00-FF
+ F8 00-FF
+ FE 00-0F
+ FF FD
+ 1F1 00-0A 10-2D 30-69 70-8D 8F-90 9B-AC
+ E01 00-EF
+ F00 00-FF
+ F01 00-FF
+ F02 00-FF
+ F03 00-FF
+ F04 00-FF
+ F05 00-FF
+ F06 00-FF
+ F07 00-FF
+ F08 00-FF
+ F09 00-FF
+ F0A 00-FF
+ F0B 00-FF
+ F0C 00-FF
+ F0D 00-FF
+ F0E 00-FF
+ F0F 00-FF
+ F10 00-FF
+ F11 00-FF
+ F12 00-FF
+ F13 00-FF
+ F14 00-FF
+ F15 00-FF
+ F16 00-FF
+ F17 00-FF
+ F18 00-FF
+ F19 00-FF
+ F1A 00-FF
+ F1B 00-FF
+ F1C 00-FF
+ F1D 00-FF
+ F1E 00-FF
+ F1F 00-FF
+ F20 00-FF
+ F21 00-FF
+ F22 00-FF
+ F23 00-FF
+ F24 00-FF
+ F25 00-FF
+ F26 00-FF
+ F27 00-FF
+ F28 00-FF
+ F29 00-FF
+ F2A 00-FF
+ F2B 00-FF
+ F2C 00-FF
+ F2D 00-FF
+ F2E 00-FF
+ F2F 00-FF
+ F30 00-FF
+ F31 00-FF
+ F32 00-FF
+ F33 00-FF
+ F34 00-FF
+ F35 00-FF
+ F36 00-FF
+ F37 00-FF
+ F38 00-FF
+ F39 00-FF
+ F3A 00-FF
+ F3B 00-FF
+ F3C 00-FF
+ F3D 00-FF
+ F3E 00-FF
+ F3F 00-FF
+ F40 00-FF
+ F41 00-FF
+ F42 00-FF
+ F43 00-FF
+ F44 00-FF
+ F45 00-FF
+ F46 00-FF
+ F47 00-FF
+ F48 00-FF
+ F49 00-FF
+ F4A 00-FF
+ F4B 00-FF
+ F4C 00-FF
+ F4D 00-FF
+ F4E 00-FF
+ F4F 00-FF
+ F50 00-FF
+ F51 00-FF
+ F52 00-FF
+ F53 00-FF
+ F54 00-FF
+ F55 00-FF
+ F56 00-FF
+ F57 00-FF
+ F58 00-FF
+ F59 00-FF
+ F5A 00-FF
+ F5B 00-FF
+ F5C 00-FF
+ F5D 00-FF
+ F5E 00-FF
+ F5F 00-FF
+ F60 00-FF
+ F61 00-FF
+ F62 00-FF
+ F63 00-FF
+ F64 00-FF
+ F65 00-FF
+ F66 00-FF
+ F67 00-FF
+ F68 00-FF
+ F69 00-FF
+ F6A 00-FF
+ F6B 00-FF
+ F6C 00-FF
+ F6D 00-FF
+ F6E 00-FF
+ F6F 00-FF
+ F70 00-FF
+ F71 00-FF
+ F72 00-FF
+ F73 00-FF
+ F74 00-FF
+ F75 00-FF
+ F76 00-FF
+ F77 00-FF
+ F78 00-FF
+ F79 00-FF
+ F7A 00-FF
+ F7B 00-FF
+ F7C 00-FF
+ F7D 00-FF
+ F7E 00-FF
+ F7F 00-FF
+ F80 00-FF
+ F81 00-FF
+ F82 00-FF
+ F83 00-FF
+ F84 00-FF
+ F85 00-FF
+ F86 00-FF
+ F87 00-FF
+ F88 00-FF
+ F89 00-FF
+ F8A 00-FF
+ F8B 00-FF
+ F8C 00-FF
+ F8D 00-FF
+ F8E 00-FF
+ F8F 00-FF
+ F90 00-FF
+ F91 00-FF
+ F92 00-FF
+ F93 00-FF
+ F94 00-FF
+ F95 00-FF
+ F96 00-FF
+ F97 00-FF
+ F98 00-FF
+ F99 00-FF
+ F9A 00-FF
+ F9B 00-FF
+ F9C 00-FF
+ F9D 00-FF
+ F9E 00-FF
+ F9F 00-FF
+ FA0 00-FF
+ FA1 00-FF
+ FA2 00-FF
+ FA3 00-FF
+ FA4 00-FF
+ FA5 00-FF
+ FA6 00-FF
+ FA7 00-FF
+ FA8 00-FF
+ FA9 00-FF
+ FAA 00-FF
+ FAB 00-FF
+ FAC 00-FF
+ FAD 00-FF
+ FAE 00-FF
+ FAF 00-FF
+ FB0 00-FF
+ FB1 00-FF
+ FB2 00-FF
+ FB3 00-FF
+ FB4 00-FF
+ FB5 00-FF
+ FB6 00-FF
+ FB7 00-FF
+ FB8 00-FF
+ FB9 00-FF
+ FBA 00-FF
+ FBB 00-FF
+ FBC 00-FF
+ FBD 00-FF
+ FBE 00-FF
+ FBF 00-FF
+ FC0 00-FF
+ FC1 00-FF
+ FC2 00-FF
+ FC3 00-FF
+ FC4 00-FF
+ FC5 00-FF
+ FC6 00-FF
+ FC7 00-FF
+ FC8 00-FF
+ FC9 00-FF
+ FCA 00-FF
+ FCB 00-FF
+ FCC 00-FF
+ FCD 00-FF
+ FCE 00-FF
+ FCF 00-FF
+ FD0 00-FF
+ FD1 00-FF
+ FD2 00-FF
+ FD3 00-FF
+ FD4 00-FF
+ FD5 00-FF
+ FD6 00-FF
+ FD7 00-FF
+ FD8 00-FF
+ FD9 00-FF
+ FDA 00-FF
+ FDB 00-FF
+ FDC 00-FF
+ FDD 00-FF
+ FDE 00-FF
+ FDF 00-FF
+ FE0 00-FF
+ FE1 00-FF
+ FE2 00-FF
+ FE3 00-FF
+ FE4 00-FF
+ FE5 00-FF
+ FE6 00-FF
+ FE7 00-FF
+ FE8 00-FF
+ FE9 00-FF
+ FEA 00-FF
+ FEB 00-FF
+ FEC 00-FF
+ FED 00-FF
+ FEE 00-FF
+ FEF 00-FF
+ FF0 00-FF
+ FF1 00-FF
+ FF2 00-FF
+ FF3 00-FF
+ FF4 00-FF
+ FF5 00-FF
+ FF6 00-FF
+ FF7 00-FF
+ FF8 00-FF
+ FF9 00-FF
+ FFA 00-FF
+ FFB 00-FF
+ FFC 00-FF
+ FFD 00-FF
+ FFE 00-FF
+ FFF 00-FD
+ 1000 00-FF
+ 1001 00-FF
+ 1002 00-FF
+ 1003 00-FF
+ 1004 00-FF
+ 1005 00-FF
+ 1006 00-FF
+ 1007 00-FF
+ 1008 00-FF
+ 1009 00-FF
+ 100A 00-FF
+ 100B 00-FF
+ 100C 00-FF
+ 100D 00-FF
+ 100E 00-FF
+ 100F 00-FF
+ 1010 00-FF
+ 1011 00-FF
+ 1012 00-FF
+ 1013 00-FF
+ 1014 00-FF
+ 1015 00-FF
+ 1016 00-FF
+ 1017 00-FF
+ 1018 00-FF
+ 1019 00-FF
+ 101A 00-FF
+ 101B 00-FF
+ 101C 00-FF
+ 101D 00-FF
+ 101E 00-FF
+ 101F 00-FF
+ 1020 00-FF
+ 1021 00-FF
+ 1022 00-FF
+ 1023 00-FF
+ 1024 00-FF
+ 1025 00-FF
+ 1026 00-FF
+ 1027 00-FF
+ 1028 00-FF
+ 1029 00-FF
+ 102A 00-FF
+ 102B 00-FF
+ 102C 00-FF
+ 102D 00-FF
+ 102E 00-FF
+ 102F 00-FF
+ 1030 00-FF
+ 1031 00-FF
+ 1032 00-FF
+ 1033 00-FF
+ 1034 00-FF
+ 1035 00-FF
+ 1036 00-FF
+ 1037 00-FF
+ 1038 00-FF
+ 1039 00-FF
+ 103A 00-FF
+ 103B 00-FF
+ 103C 00-FF
+ 103D 00-FF
+ 103E 00-FF
+ 103F 00-FF
+ 1040 00-FF
+ 1041 00-FF
+ 1042 00-FF
+ 1043 00-FF
+ 1044 00-FF
+ 1045 00-FF
+ 1046 00-FF
+ 1047 00-FF
+ 1048 00-FF
+ 1049 00-FF
+ 104A 00-FF
+ 104B 00-FF
+ 104C 00-FF
+ 104D 00-FF
+ 104E 00-FF
+ 104F 00-FF
+ 1050 00-FF
+ 1051 00-FF
+ 1052 00-FF
+ 1053 00-FF
+ 1054 00-FF
+ 1055 00-FF
+ 1056 00-FF
+ 1057 00-FF
+ 1058 00-FF
+ 1059 00-FF
+ 105A 00-FF
+ 105B 00-FF
+ 105C 00-FF
+ 105D 00-FF
+ 105E 00-FF
+ 105F 00-FF
+ 1060 00-FF
+ 1061 00-FF
+ 1062 00-FF
+ 1063 00-FF
+ 1064 00-FF
+ 1065 00-FF
+ 1066 00-FF
+ 1067 00-FF
+ 1068 00-FF
+ 1069 00-FF
+ 106A 00-FF
+ 106B 00-FF
+ 106C 00-FF
+ 106D 00-FF
+ 106E 00-FF
+ 106F 00-FF
+ 1070 00-FF
+ 1071 00-FF
+ 1072 00-FF
+ 1073 00-FF
+ 1074 00-FF
+ 1075 00-FF
+ 1076 00-FF
+ 1077 00-FF
+ 1078 00-FF
+ 1079 00-FF
+ 107A 00-FF
+ 107B 00-FF
+ 107C 00-FF
+ 107D 00-FF
+ 107E 00-FF
+ 107F 00-FF
+ 1080 00-FF
+ 1081 00-FF
+ 1082 00-FF
+ 1083 00-FF
+ 1084 00-FF
+ 1085 00-FF
+ 1086 00-FF
+ 1087 00-FF
+ 1088 00-FF
+ 1089 00-FF
+ 108A 00-FF
+ 108B 00-FF
+ 108C 00-FF
+ 108D 00-FF
+ 108E 00-FF
+ 108F 00-FF
+ 1090 00-FF
+ 1091 00-FF
+ 1092 00-FF
+ 1093 00-FF
+ 1094 00-FF
+ 1095 00-FF
+ 1096 00-FF
+ 1097 00-FF
+ 1098 00-FF
+ 1099 00-FF
+ 109A 00-FF
+ 109B 00-FF
+ 109C 00-FF
+ 109D 00-FF
+ 109E 00-FF
+ 109F 00-FF
+ 10A0 00-FF
+ 10A1 00-FF
+ 10A2 00-FF
+ 10A3 00-FF
+ 10A4 00-FF
+ 10A5 00-FF
+ 10A6 00-FF
+ 10A7 00-FF
+ 10A8 00-FF
+ 10A9 00-FF
+ 10AA 00-FF
+ 10AB 00-FF
+ 10AC 00-FF
+ 10AD 00-FF
+ 10AE 00-FF
+ 10AF 00-FF
+ 10B0 00-FF
+ 10B1 00-FF
+ 10B2 00-FF
+ 10B3 00-FF
+ 10B4 00-FF
+ 10B5 00-FF
+ 10B6 00-FF
+ 10B7 00-FF
+ 10B8 00-FF
+ 10B9 00-FF
+ 10BA 00-FF
+ 10BB 00-FF
+ 10BC 00-FF
+ 10BD 00-FF
+ 10BE 00-FF
+ 10BF 00-FF
+ 10C0 00-FF
+ 10C1 00-FF
+ 10C2 00-FF
+ 10C3 00-FF
+ 10C4 00-FF
+ 10C5 00-FF
+ 10C6 00-FF
+ 10C7 00-FF
+ 10C8 00-FF
+ 10C9 00-FF
+ 10CA 00-FF
+ 10CB 00-FF
+ 10CC 00-FF
+ 10CD 00-FF
+ 10CE 00-FF
+ 10CF 00-FF
+ 10D0 00-FF
+ 10D1 00-FF
+ 10D2 00-FF
+ 10D3 00-FF
+ 10D4 00-FF
+ 10D5 00-FF
+ 10D6 00-FF
+ 10D7 00-FF
+ 10D8 00-FF
+ 10D9 00-FF
+ 10DA 00-FF
+ 10DB 00-FF
+ 10DC 00-FF
+ 10DD 00-FF
+ 10DE 00-FF
+ 10DF 00-FF
+ 10E0 00-FF
+ 10E1 00-FF
+ 10E2 00-FF
+ 10E3 00-FF
+ 10E4 00-FF
+ 10E5 00-FF
+ 10E6 00-FF
+ 10E7 00-FF
+ 10E8 00-FF
+ 10E9 00-FF
+ 10EA 00-FF
+ 10EB 00-FF
+ 10EC 00-FF
+ 10ED 00-FF
+ 10EE 00-FF
+ 10EF 00-FF
+ 10F0 00-FF
+ 10F1 00-FF
+ 10F2 00-FF
+ 10F3 00-FF
+ 10F4 00-FF
+ 10F5 00-FF
+ 10F6 00-FF
+ 10F7 00-FF
+ 10F8 00-FF
+ 10F9 00-FF
+ 10FA 00-FF
+ 10FB 00-FF
+ 10FC 00-FF
+ 10FD 00-FF
+ 10FE 00-FF
+ 10FF 00-FD
+
diff --git a/newlib/libc/string/mkunidata b/newlib/libc/string/mkunidata
new file mode 100755
index 0000000..c0bf5de
--- /dev/null
+++ b/newlib/libc/string/mkunidata
@@ -0,0 +1,54 @@
+#! /bin/sh
+
+# Regenerate the Unicode width tables for newlib/libc/string/wcwidth.c:
+# combining.t and ambiguous.t are written here, the wide characters
+# table is produced by ./mkwide.
+#
+# Usage: mkunidata [-u]
+#   -u   download the uniset tool and the Unicode data files with curl;
+#        without it, the unicode-ucd package is looked up in
+#        /etc/setup/installed.db and its files are linked in.
+
+echo generating Unicode width data for newlib/libc/string/wcwidth.c
+
+# work in the directory of this script; give up if it cannot be entered
+cd "$(dirname "$0")" || exit 9
+PATH="$PATH":. # ensure access to uniset tool
+
+#############################################################################
+# checks and (with option -u) downloads
+
+# fetch URL
+# Download URL into the current directory under its remote file name and
+# keep the remote time stamp.  When a local copy already exists it is
+# handed to curl -z, so the transfer only happens if the remote file is
+# newer.  (A plain "WGET=curl ..." assignment cannot be used here: in sh
+# an unquoted assignment followed by words runs those words as a command.)
+fetch () {
+  file=`basename "$1"`
+  if [ -r "$file" ]
+  then curl -R -O --connect-timeout 55 -z "$file" "$1"
+  else curl -R -O --connect-timeout 55 "$1"
+  fi
+}
+
+case "$1" in
+-u)
+  echo downloading uniset tool
+  fetch http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz
+  gzip -dc uniset.tar.gz | tar xvf - uniset
+
+  echo downloading data from unicode.org
+  for data in UnicodeData.txt Blocks.txt EastAsianWidth.txt
+  do fetch http://unicode.org/Public/UNIDATA/$data
+  done
+  ;;
+*) echo checking package unicode-ucd
+  grep unicode-ucd /etc/setup/installed.db || exit 9
+  ;;
+esac
+
+echo checking uniset tool
+type uniset || exit 9
+
+# link in every data file that is not readable in this directory
+for data in UnicodeData.txt Blocks.txt EastAsianWidth.txt
+do test -r $data || ln -s /usr/share/unicode/ucd/$data . || exit 9
+done
+
+# the version number is taken from the first line of Blocks.txt
+echo generating from Unicode version `sed -e 's,[^.0-9],,g' -e 1q Blocks.txt`
+
+#############################################################################
+# table generation
+
+echo generating combining characters table
+uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B +D7B0-D7C6 +D7CB-D7FB c > combining.t
+
+echo generating ambiguous width characters table
+sh ./mkwidthA && uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c > ambiguous.t
+
+echo generating wide characters table
+sh ./mkwide
+
+#############################################################################
+# end
diff --git a/newlib/libc/string/mkwide b/newlib/libc/string/mkwide
new file mode 100755
index 0000000..55a0bab
--- /dev/null
+++ b/newlib/libc/string/mkwide
@@ -0,0 +1,49 @@
+#! /bin/sh
+
+# generate list of wide characters, with convex closure
+#
+# Reads EastAsianWidth.txt and Blocks.txt (UnicodeData.txt is linked in
+# as well) and writes wide.t: the first line of each of the two data
+# files as a // comment, followed by the C interval array that uniset
+# prints for the wide/fullwidth characters plus the selected blocks.
+
+# NOTE(review): skipcheck is not referenced anywhere else in this script.
+skipcheck=false
+
+# link in every data file that is not readable in the current directory
+if [ ! -r EastAsianWidth.txt ]
+then ln -s /usr/share/unicode/ucd/EastAsianWidth.txt . || exit 1
+fi
+if [ ! -r UnicodeData.txt ]
+then ln -s /usr/share/unicode/ucd/UnicodeData.txt . || exit 1
+fi
+if [ ! -r Blocks.txt ]
+then ln -s /usr/share/unicode/ucd/Blocks.txt . || exit 1
+fi
+
+# Keep the lines whose width field starts with N, A or H (wide.na) or
+# with W or F (wide.fw), with the field itself removed; every other line
+# is deleted ("t" skips the "d" after a successful substitution).
+sed -e "s,^\([^;]*\);[NAH],\1," -e t -e d EastAsianWidth.txt > wide.na
+sed -e "s,^\([^;]*\);[WF],\1," -e t -e d EastAsianWidth.txt > wide.fw
+
+PATH="$PATH:." # for uniset
+
+# Size of both sets.  sed removes everything up to the last colon of the
+# line printed by "uniset ... nr", which leaves the number preceded by a
+# blank; the unquoted expansions below rely on word splitting to drop
+# that blank, so they must stay unquoted.
+nrfw=`uniset +wide.fw nr | sed -e 's,.*:,,'`
+echo FW $nrfw
+nrna=`uniset +wide.na nr | sed -e 's,.*:,,'`
+echo NAH $nrna
+
+# extra range that is tested in addition to the blocks of Blocks.txt
+extrablocks="2E80-303E"
+
+# check all blocks
+# includes RANGE SUFFIX TOTAL
+# Succeeds if removing RANGE from the set in wide.SUFFIX changes its
+# size from TOTAL, i.e. if the set has at least one member in RANGE.
+includes () {
+ nr=`uniset +wide.$2 -$1 nr | sed -e 's,.*:,,'`
+ test $nr != $3
+}
+echo "adding compact closure of wide ranges, this may take ~10min"
+# A range is recorded in wide.blocks when it contains W/F characters but
+# no N/A/H characters; progress messages go to stderr.
+for b in $extrablocks `sed -e 's,^\([0-9A-F]*\)\.\.\([0-9A-F]*\).*,\1-\2,' -e t -e d Blocks.txt`
+do range=$b
+ echo checking $range $* >&2
+ if includes $range fw $nrfw && ! includes $range na $nrna
+ then echo $range
+ fi
+done > wide.blocks
+
+# assemble the result: two header comment lines, then the interval array
+(
+sed -e "s,^,//," -e 1q EastAsianWidth.txt
+sed -e "s,^,//," -e 1q Blocks.txt
+uniset `sed -e 's,^,+,' wide.blocks` +wide.fw c
+) > wide.t
+
+# remove the intermediate files
+rm -f wide.na wide.fw wide.blocks
diff --git a/newlib/libc/string/mkwidthA b/newlib/libc/string/mkwidthA
new file mode 100755
index 0000000..343ab40
--- /dev/null
+++ b/newlib/libc/string/mkwidthA
@@ -0,0 +1,20 @@
+#! /bin/sh
+
+# generate WIDTH-A file, listing Unicode characters with width property
+# Ambiguous, from EastAsianWidth.txt
+
+if [ ! -r EastAsianWidth.txt ]
+then ln -s /usr/share/unicode/ucd/EastAsianWidth.txt . || exit 1
+fi
+if [ ! -r UnicodeData.txt ]
+then ln -s /usr/share/unicode/ucd/UnicodeData.txt . || exit 1
+fi
+if [ ! -r Blocks.txt ]
+then ln -s /usr/share/unicode/ucd/Blocks.txt . || exit 1
+fi
+
+sed -e "s,^\([^;]*\);A,\1," -e t -e d EastAsianWidth.txt > width-a-new
+rm -f WIDTH-A
+echo "# UAX #11: East Asian Ambiguous" > WIDTH-A
+PATH="$PATH:." uniset +width-a-new compact >> WIDTH-A
+rm -f width-a-new
diff --git a/newlib/libc/string/uniset b/newlib/libc/string/uniset
new file mode 100755
index 0000000..85d3b2a
--- /dev/null
+++ b/newlib/libc/string/uniset
@@ -0,0 +1,696 @@
+#!/usr/bin/perl
+# Uniset -- Unicode subset manager -- Markus Kuhn
+# http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz
+
+require 5.008;
+use open ':utf8';
+use FindBin qw($RealBin); # to find directory where this file is located
+
+binmode(STDOUT, ":utf8");
+binmode(STDIN, ":utf8");
+
+my (%name, %invname, %category, %comment);
+
+# Called without any argument there is nothing to do: print the usage
+# summary on standard output and leave with exit status 1.
+unless (@ARGV) {
+    print <<End;
+Uniset -- Unicode subset manager -- Markus Kuhn
+
+Uniset merges and subtracts Unicode subsets. It can output and
+analyse the resulting character set in various formats.
+
+Uniset understand the following command-line arguments:
+
+Commands to define a set of characters:
+
+ + filename add the character set described in the file to the set
+ - filename remove the character set described in the file from the set
+ +: filename add the characters in the UTF-8 file to the set
+ -: filename remove the characters in the UTF-8 file from the set
+ +xxxx..yyyy add the range to the set (xxxx and yyyy are hex numbers)
+ -xxxx..yyyy remove the range from the set (xxxx and yyyy are hex numbers)
+ +cat=Xx add all Unicode characters with category code Xx
+ -cat=Xx remove all Unicode characters with category code Xx
+ -cat!=Xx remove all Unicode characters without category code Xx
+ clean remove any elements that do not appear in the Unicode database
+ unknown remove any elements that do appear in the Unicode database
+
+Command to output descriptions of the constructed set of characters:
+
+ table write a full table with one line per character
+ compact output the set in compact MES format
+ c output the set as C interval array
+ nr output the number of characters
+ sources output a table that shows the number of characters contributed
+ by the various combinations of input sets added with +.
+ utf8-list output a list of all characters encoded in UTF-8
+
+Commands to tailor the following output commands:
+
+ html write HTML tables instead of plain text
+ ucs add the unicode character itself to the table (UTF-8 in
+ plain table, numeric character reference in HTML)
+
+Formats of character set input files read by the + and - command:
+
+Empty lines, white space at the start and end of the line and any
+comment text following a \# are ignored. The following formats are
+recognized
+
+xx yyyy xx is the hex code in an 8-bit character set and yyyy
+ is the corresponding Unicode value. Both can optionally
+ be prefixed by 0x. This is the format used in the
+ files on <ftp://ftp.unicode.org/Public/MAPPINGS/>.
+
+yyyy yyyy (optionally prefixed with 0x) is a Unicode character
+ belonging to the specified subset.
+
+yyyy-yyyy a range of Unicode characters belonging to
+yyyy..yyyy the specified subset.
+
+xx yy yy yy-yy yy xx denotes a row (high-byte) and the yy specify
+ corresponding low bytes or with a hyphen also ranges of
+ low bytes in the Unicode values that belong to this
+ subset. This is also the format that is generated by
+ the compact command.
+End
+    exit 1;
+}
+
+
+# Decide whether the ISO 10646/Unicode character code passed in falls
+# into the East Asian Wide (W) or East Asian FullWidth (F) category of
+# Unicode Technical Report #11, based on the fixed ranges listed below.
+# The result is Perl's canonical true value for wide characters and its
+# canonical false value (empty string, numerically 0) for all others.
+
+sub iswide ($) {
+    my ($cp) = @_;
+
+    # nothing below U+1100 is wide
+    return !1 if $cp < 0x1100;
+
+    # Hangul Jamo up to U+115F, and the two characters U+2329/U+232A
+    return !!1 if $cp <= 0x115f || $cp == 0x2329 || $cp == 0x232a;
+
+    # CJK .. Yi, with U+303F excluded
+    return !!1 if $cp >= 0x2e80 && $cp <= 0xa4cf && $cp != 0x303f;
+
+    my @wide_ranges = (
+        [0xac00,  0xd7a3],     # Hangul Syllables
+        [0xf900,  0xfaff],     # CJK Comp. Ideographs
+        [0xfe30,  0xfe6f],     # CJK Comp. Forms
+        [0xff00,  0xff60],     # Fullwidth Forms
+        [0xffe0,  0xffe6],
+        [0x20000, 0x2fffd],
+        [0x30000, 0x3fffd],
+    );
+    for my $range (@wide_ranges) {
+        return !!1 if $cp >= $range->[0] && $cp <= $range->[1];
+    }
+
+    return !1;
+}
+
+# Return the Unicode name that belongs to a given character code
+
+# Jamo short names, see Unicode 3.0, table 4-4, page 86
+
+my @lname = ('G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '',
+             'J', 'JJ', 'C', 'K', 'T', 'P', 'H'); # 1100..1112
+my @vname = ('A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O',
+             'WA', 'WAE', 'OE', 'YO', 'U', 'WEO', 'WE', 'WI', 'YU',
+             'EU', 'YI', 'I'); # 1161..1175
+my @tname = ('G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM',
+             'LB', 'LS', 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS',
+             'NG', 'J', 'C', 'K', 'T', 'P', 'H'); # 11a8..11c2
+
+# name($ucs)
+#   $ucs     numeric character code
+#   returns  "CJK UNIFIED IDEOGRAPH-<hex>" for the ideograph ranges below,
+#            the composed "HANGUL SYLLABLE ..." name for U+AC00..U+D7A3,
+#            and otherwise the entry of %name (undef if there is none).
+sub name {
+    my $ucs = shift(@_);
+
+    # The intervals used here reflect Unicode Version 3.2
+    if (($ucs >= 0x3400 && $ucs <= 0x4db5) ||
+        ($ucs >= 0x4e00 && $ucs <= 0x9fa5) ||
+        ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
+        return "CJK UNIFIED IDEOGRAPH-" . sprintf("%04X", $ucs);
+    }
+
+    if ($ucs >= 0xac00 && $ucs <= 0xd7a3) {
+        # split the syllable index into leading consonant, vowel and
+        # trailing consonant indices (19 x 21 x 28 combinations)
+        my $s = $ucs - 0xac00;
+        my $l = int($s / (21 * 28));
+        my $v = int(($s % (21 * 28)) / 28);
+        my $t = $s % 28;
+        # Trailing index 0 means "no trailing consonant"; @tname only
+        # starts at index 1 (U+11A8).  Looking up $tname[-1] here would
+        # wrongly append the last entry ('H') to every open syllable.
+        my $trailing = $t ? $tname[$t - 1] : '';
+        return "HANGUL SYLLABLE " . $lname[$l] . $vname[$v] . $trailing;
+    }
+
+    return $name{$ucs};
+}
+
+# is_unicode($ucs): true if the character code lies in one of the
+# hard-coded ideograph/Hangul syllable ranges or has an entry in %name.
+sub is_unicode {
+    my ($cp) = @_;
+
+    # The intervals used here reflect Unicode Version 3.2
+    my @implicit_ranges = (
+        [0x3400,  0x4db5],
+        [0x4e00,  0x9fa5],
+        [0xac00,  0xd7a3],
+        [0x20000, 0x2a6d6],
+    );
+    for my $range (@implicit_ranges) {
+        return 1 if $cp >= $range->[0] && $cp <= $range->[1];
+    }
+
+    return exists $name{$cp};
+}
+
+# Directories that are tried, in this order, for files that cannot be
+# opened under the name given: two fixed locations (if they exist) and
+# the directory of this script unless that is below /usr/bin.
+my @search_path = grep { -d $_ }
+    ("$ENV{HOME}/local/share/uniset", "/usr/share/uniset");
+push @search_path, $RealBin unless $RealBin =~ m|^/usr/bin|;
+
+# search_open($mode, $fn)
+#   Open $fn with the given open() mode.  The name is tried as it is
+#   first; if that fails and the name contains no "/", each directory of
+#   @search_path is tried in turn.
+#   Returns the file handle, or undef when no attempt succeeded.
+sub search_open {
+    my ($mode, $fn) = @_;
+
+    my @candidates = ($fn);
+    push @candidates, map { "$_/$fn" } @search_path unless $fn =~ m|/|;
+
+    for my $candidate (@candidates) {
+        my $file;
+        return $file if open($file, $mode, $candidate);
+    }
+    return undef;
+}
+
+# output options, switched on by the html, img and ucs commands
+my ($html, $image, $adducs) = (0, 0, 0);
+# names of the two Unicode data files located through search_open()
+my ($unicodedata, $blockdata) = ("UnicodeData.txt", "Blocks.txt");
+
+# read list of all Unicode names
+my $data = search_open('<', $unicodedata)
+    or die ("Can't open Unicode database '$unicodedata':\n$!\n\n" .
+            "Please make sure that you have downloaded the file\n" .
+            "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n");
+while (<$data>) {
+    # every line must consist of exactly 15 semicolon-separated fields
+    my @field = /^([0-9,A-F]{4,8});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*)$/
+        or die("Syntax error in line '$_' in file '$unicodedata'");
+    # entries whose name is a bracketed placeholder are skipped, with
+    # the exception of '<control>'
+    next if $field[1] ne '<control>' && substr($field[1], 0, 1) eq '<';
+    $ucs = hex($field[0]);
+    $name{$ucs} = $field[1];       # code -> name
+    $invname{$field[1]} = $ucs;    # name -> code
+    $category{$ucs} = $field[2];   # code -> category code
+    $comment{$ucs} = $field[11];   # code -> twelfth field of the line
+}
+close($data);
+
+# read list of all Unicode blocks
+$data = search_open('<', $blockdata)
+    or die ("Can't open Unicode blockname list '$blockdata':\n$!\n\n" .
+            "Please make sure that you have downloaded the file\n" .
+            "http://www.unicode.org/Public/UNIDATA/Blocks.txt\n");
+# number of blocks, and their first code, last code and name by index
+my $blocks = 0;
+my (@blockstart, @blockend, @blockname);
+while (<$data>) {
+    # ignore comments and empty lines
+    next if /^\s*\#/ || /^\s*$/;
+    # anything else has to look like "first..last; name"
+    /^\s*([0-9,A-F]{4,8})\s*\.\.\s*([0-9,A-F]{4,8})\s*;\s*(.*)$/
+        or die("Syntax error in line '$_' in file '$blockdata'");
+    push(@blockstart, hex($1));
+    push(@blockend, hex($2));
+    push(@blockname, $3);
+    $blocks++;
+}
+close($data);
+# add a catch-all block if the last block read ends below 0x110000
+if ($blockend[$blocks-1] < 0x110000) {
+    push(@blockstart, 0x110000);
+    push(@blockend, 0x7FFFFFFF);
+    push(@blockname, "Beyond Plane 16");
+    $blocks++;
+}
+
+# process command line arguments
+# Every argument is a command that is executed in the order given; the
+# set built so far lives in %used (key: character code, value: the
+# concatenated "[source]" tags of the inputs that contributed it).
+# The loop ends at the first argument that is false in Perl terms, i.e.
+# when the list is exhausted or an argument is "" or "0".
+while ($_ = shift(@ARGV)) {
+ if (/^html$/) {
+ # switch the following output commands to HTML
+ $html = 1;
+ } elsif (/^ucs$/) {
+ # let the table command include the character itself
+ $adducs = 1;
+ } elsif (/^img$/) {
+ # HTML output with an additional glyph image column
+ $html = 1;
+ $image = 1;
+ } elsif (/^template$/) {
+ # The next argument names a template file that is copied to the
+ # output.  A line of the form  #include "file"  is replaced by the
+ # content of that file;  #quote "file"  does the same but escapes
+ # "&" and "<" for HTML.  The inner read loops reuse $_.
+ $template = shift(@ARGV);
+ open(TEMPLATE, $template) || die("Can't open template file '$template': '$!'");
+ while (<TEMPLATE>) {
+ if (/^\#\s*include\s+\"([^\"]*)\"\s*$/) {
+ open(INCLUDE, $1) || die("Can't open template include file '$1': '$!'");
+ while (<INCLUDE>) {
+ print $_;
+ }
+ close(INCLUDE);
+ } elsif (/^\#\s*quote\s+\"([^\"]*)\"\s*$/) {
+ open(INCLUDE, $1) || die("Can't open template include file '$1': '$!'");
+ while (<INCLUDE>) {
+ s/&/&amp;/g;
+ s/</&lt;/g;
+ print $_;
+ }
+ close(INCLUDE);
+ } else {
+ print $_;
+ }
+ }
+ close(TEMPLATE);
+ } elsif (/^\+cat=(.+)$/) {
+     # add characters with given category; an existing source tag of
+     # such a character is replaced by "[category]"
+     $cat = $1;
+     $used{$_} = "[${cat}]" for grep { $category{$_} eq $cat } keys(%category);
+ } elsif (/^\-cat=(.+)$/) {
+     # remove characters with given category
+     $cat = $1;
+     delete @used{ grep { $category{$_} eq $cat } keys(%category) };
+ } elsif (/^\-cat!=(.+)$/) {
+     # remove characters without given category (only characters that
+     # have a category entry at all are considered)
+     $cat = $1;
+     delete @used{ grep { $category{$_} ne $cat } keys(%category) };
+ } elsif (/^([+-]):(.*)/) {
+     # "+:file" / "-:file": add or remove every character that occurs in
+     # a text file (read as UTF-8 because of the "use open" default).
+     # The file name may also follow as a separate argument.
+     $remove = $1 eq "-";
+     $setfile = $2;
+     $setfile = shift(@ARGV) if $setfile eq "";
+     push(@SETS, $setfile);
+     open(SET, $setfile) || die("Can't open set file '$setfile': '$!'");
+     $setname = $setfile;
+     while (my $line = <SET>) {
+         # Walk over the characters one by one.  Testing the rest of the
+         # line for truth instead would stop early at a final "0" that is
+         # not followed by a newline, because the string "0" is false.
+         for my $char (split(//, $line)) {
+             $i = ord($char);
+             if ($remove) {
+                 delete $used{$i};
+             } else {
+                 $used{$i} .= "[${setname}]";
+             }
+         }
+     }
+     close SET;
+ } elsif (/^([+-])(.*)/) {
+ # "+x" / "-x": x is either a hex code or range given directly on the
+ # command line, or the name of a character set file whose entries are
+ # added to / removed from %used.
+ $remove = $1 eq "-";
+ $setfile = $2;
+ # a single hex code is treated as a range of one element
+ $setfile = "$setfile..$setfile" if $setfile =~ /^([0-9A-Fa-f]{4,8})$/;
+ if ($setfile =~ /^([0-9A-Fa-f]{4,8})(-|\.\.)([0-9A-Fa-f]{4,8})$/) {
+ # handle interval specification on command line
+ $first = hex($1);
+ $last = hex($3);
+ for ($i = $first; $i <= $last; $i++) {
+ $used{$i} .= "[ARG]" unless $remove;
+ delete $used{$i} if $remove;
+ }
+ next;
+ }
+ # the file name may follow as a separate argument
+ $setfile = shift(@ARGV) if $setfile eq "";
+ push(@SETS, $setfile);
+ my $setf = search_open('<', $setfile);
+ die("Can't open set file '$setfile': '$!'") unless $setf;
+ $cedf = ($setfile =~ /cedf/); # detect Kosta Kosti's trans CEDF format by path name
+ # source tag: the file name cut at the first "." that follows a run of
+ # characters other than ".", "[" and "]"
+ $setname = $setfile;
+ $setname =~ s/([^.\[\]]*)\..*/$1/;
+ # The format is detected line by line; the order of the tests below
+ # matters because several of the patterns overlap.
+ while (<$setf>) {
+ if (/^<code_set_name>/) {
+ # handle ISO 15897 (POSIX registry) charset mapping format
+ # (the rest of the file is consumed by the inner loop)
+ undef $comment_char;
+ undef $escape_char;
+ while (<$setf>) {
+ if ($comment_char && /^$comment_char/) {
+ # remove comments
+ $_ = $`;
+ }
+ next if (/^\032?\s*$/); # skip empty lines
+ if (/^<comment_char> (\S)$/) {
+ $comment_char = $1;
+ } elsif (/^<escape_char> (\S)$/) {
+ $escape_char = $1;
+ } elsif (/^(END )?CHARMAP$/) {
+ #ignore
+ } elsif (/^<.*>\s*\/x([0-9A-F]{2})\s*<U([0-9A-F]{4,8})>/) {
+ # the tag records the 8-bit code in braces after the set name
+ $used{hex($2)} .= "[${setname}{$1}]" unless $remove;
+ delete $used{hex($2)} if $remove;
+ } else {
+ die("Syntax error in line $. in file '$setfile':\n'$_'\n");
+ }
+ }
+ next;
+ } elsif (/^STARTFONT /) {
+ # handle X11 BDF file
+ # (only the decimal ENCODING values are used; the rest of the
+ # file is consumed by the inner loop)
+ while (<$setf>) {
+ if (/^ENCODING\s+([0-9]+)/) {
+ $used{$1} .= "[${setname}]" unless $remove;
+ delete $used{$1} if $remove;
+ }
+ }
+ next;
+ }
+ tr/a-z/A-Z/; # make input uppercase
+ if ($cedf) {
+ # the first four lines of a CEDF file are not interpreted
+ if ($. > 4) {
+ if (/^([0-9A-F]{2})\t.?\t(.*)$/) {
+ # handle Kosta Kosti's trans CEDF format
+ # codes below 0x20 and from 0x7f to 0x9f are skipped; the
+ # character is looked up by its name in %invname
+ next if (hex($1) < 32 || (hex($1) > 0x7e && hex($1) < 0xa0));
+ $ucs = $invname{$2};
+ die "unknown ISO 10646 name '$2' in '$setfile' line $..\n" if ! $ucs;
+ $used{$ucs} .= "[${setname}{$1}]" unless $remove;
+ delete $used{$ucs} if $remove;
+ } else {
+ die("Syntax error in line $. in CEDF file '$setfile':\n'$_'\n");
+ }
+ }
+ next;
+ }
+ if (/^\s*(0X|U\+|U-)?([0-9A-F]{2})\s+\#\s*UNDEFINED\s*$/) {
+ # ignore ftp.unicode.org mapping file lines with #UNDEFINED
+ next;
+ }
+ s/^([^\#]*)\#.*$/$1/; # remove comments
+ next if (/^\032?\s*$/); # skip empty lines
+ if (/^\s*(0X)?([0-9A-F-]{2})\s+(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
+ # handle entry from a ftp.unicode.org mapping file
+ # (8-bit code followed by the Unicode value)
+ $used{hex($4)} .= "[${setname}{$2}]" unless $remove;
+ delete $used{hex($4)} if $remove;
+ } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})(\s*-\s*|\s*\.\.\s*|\s+)(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
+ # handle interval specification
+ # (two codes separated by "-", ".." or white space)
+ $first = hex($2);
+ $last = hex($5);
+ for ($i = $first; $i <= $last; $i++) {
+ $used{$i} .= "[${setname}]" unless $remove;
+ delete $used{$i} if $remove;
+ }
+ } elsif (/^\s*([0-9A-F]{2,6})(\s+[0-9A-F]{2},?|\s+[0-9A-F]{2}-[0-9A-F]{2},?)+/) {
+ # handle lines from P10 MES draft
+ # (a row number followed by cells or cell ranges; this is also
+ # what the compact command writes)
+ $row = $1;
+ $cols = $_;
+ $cols =~ s/^\s*([0-9A-F]{2,6})\s*(.*)\s*$/$2/;
+ $cols =~ tr/,//d;
+ @cols = split(/\s+/, $cols);
+ for (@cols) {
+ # row and cell digits are concatenated to form the hex code
+ if (/^(..)$/) {
+ $first = hex("$row$1");
+ $last = $first;
+ } elsif (/^(..)-(..)$/) {
+ $first = hex("$row$1");
+ $last = hex("$row$2");
+ } else {
+ die ("this should never happen '$_'");
+ }
+ for ($i = $first; $i <= $last; $i++) {
+ $used{$i} .= "[${setname}]" unless $remove;
+ delete $used{$i} if $remove;
+ }
+ }
+ } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})\s*/) {
+ # handle single character
+ $used{hex($2)} .= "[${setname}]" unless $remove;
+ delete $used{hex($2)} if $remove;
+ } else {
+ die("Syntax error in line $. in file '$setfile':\n'$_'\n") unless /^\s*(\#.*)?$/;
+ }
+ }
+ close $setf;
+ } elsif (/^loadimages$/ || /^loadbigimages$/) {
+ # Fetch a glyph image for every non-control character of the set by
+ # running the external "webcopy" command through the shell, then exit;
+ # arguments after this command are therefore never processed.
+ if (/^loadimages$/) {
+ $prefix = "Small.Glyphs";
+ } else {
+ $prefix = "Glyphs";
+ }
+ # number of characters to fetch, for the progress display
+ $total = 0;
+ for $i (keys(%used)) {
+ next if ($name{$i} eq "<control>");
+ $total++;
+ }
+ $count = 0;
+ # unbuffered output so that the "\r" progress line is shown at once
+ $| = 1;
+ for $i (sort({$a <=> $b} keys(%used))) {
+ next if ($name{$i} eq "<control>");
+ $count++;
+ $j = sprintf("%04X", $i);
+ # $1 = first two hex digits, used as the directory name
+ $j =~ /(..)(..)/;
+ $gif = "http://charts.unicode.org/Unicode.charts/$prefix/$1/U$j.gif";
+ print("\r$count/$total: $gif");
+ # the download is started in the background ...
+ system("mkdir -p $prefix/$1; cd $prefix/$1; webcopy -u -s $gif &");
+ # ... followed by a pause of 0.2 seconds
+ select(undef, undef, undef, 0.2);
+ }
+ print("\n");
+ exit 0;
+ } elsif (/^giftable/) {
+ # form a table of glyphs (requires pbmtools installed)
+ # The result is written to table.gif; intermediate files tmp-*.pnm,
+ # table.pnm and table.pnm~ are created and removed in the current
+ # directory.
+ $count = 0;
+ for $i (keys(%used)) {
+ $count++ unless $name{$i} eq "<control>";
+ }
+ # default number of columns, derived from the number of characters;
+ # a number appended to the command ("giftable16") overrides it
+ $width = int(sqrt($count/sqrt(2)) + 0.5);
+ $width = $1 if /^giftable([0-9]+)$/;
+ system("rm -f tmp-*.pnm table.pnm~ table.pnm");
+ $col = 0;
+ $row = 0;
+ for $i (sort({$a <=> $b} keys(%used))) {
+ next if ($name{$i} eq "<control>");
+ $j = sprintf("%04X", $i);
+ # $1 = first two hex digits, used as the directory name
+ $j =~ /(..)(..)/;
+ $gif = "Small.Glyphs/$1/U$j.gif";
+ $pnm = sprintf("tmp-%02x.pnm", $col);
+ $fallback = "Small.Glyphs/FF/UFFFD.gif";
+ # convert the glyph; if that fails, use the image of U+FFFD instead
+ system("giftopnm $gif >$pnm || { rm $pnm ; giftopnm $fallback >$pnm ; }");
+ if (++$col == $width) {
+ # a row is complete: join its cells and append it to the table
+ system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
+ if ($row == 0) {
+ system("mv tmp-row.pnm table.pnm");
+ } else {
+ system("mv table.pnm table.pnm~; pnmcat -tb table.pnm~ tmp-row.pnm >table.pnm");
+ }
+ $row++;
+ $col = 0;
+ system("rm -f tmp-*.pnm table.pnm~");
+ }
+ }
+ if ($col > 0) {
+ # append the incomplete last row
+ system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
+ if ($row == 0) {
+ system("mv tmp-row.pnm table.pnm");
+ } else {
+ system("mv table.pnm table.pnm~; pnmcat -tb -jleft -black table.pnm~ tmp-row.pnm >table.pnm");
+ }
+ }
+ system("rm -f table.gif ; ppmtogif table.pnm > table.gif");
+ system("rm -f tmp-*.pnm table.pnm~ table.pnm");
+ } elsif (/^table$/) {
+ # go through all used names to print full table
+ # (one line or table row per non-control character, in code order)
+ print "<TABLE border=2>\n" if $html;
+ for $i (sort({$a <=> $b} keys(%used))) {
+ next if ($name{$i} eq "<control>");
+ if ($html) {
+ # turn the "[a][b]" source tags into "a, b" and show the 8-bit
+ # code recorded in braces as a subscript
+ $sources = $used{$i};
+ $sources =~ s/\]\[/, /g;
+ $sources =~ s/^\[//g;
+ $sources =~ s/\]$//g;
+ $sources =~ s/\{(..)\}/<SUB>$1<\/SUB>/g;
+ $j = sprintf("%04X", $i);
+ # $1 = first two hex digits, used in the image path below
+ $j =~ /(..)(..)/;
+ $gif = "Small.Glyphs/$1/U$j.gif";
+ print "<TR>";
+ print "<TD><img width=32 height=32 src=\"$gif\">" if $image;
+ printf("<TD>&#%d;", $i) if $adducs;
+ print "<TD><SAMP>$j</SAMP><TD><SAMP>" . name($i);
+ print " ($comment{$i})" if $comment{$i};
+ print "</SAMP><TD><SMALL>$sources</SMALL>\n";
+ } else {
+ # plain text: hex code, "#", optionally the character, the name
+ printf("%04X \# ", $i);
+ print pack("U", $i) . " " if $adducs;
+ print name($i) ."\n";
+ }
+ }
+ print "</TABLE>\n" if $html;
+ } elsif (/^imgblock$/) {
+ # Print an HTML table of glyph images: a row of images is followed by
+ # a row with the corresponding hex codes.
+ $width = 16;
+ # NOTE(review): this override cannot take effect, since $_ is known to
+ # be exactly "imgblock" here; the table is always 16 columns wide.
+ $width = $1 if /giftable([0-9]+)/;
+ $col = 0;
+ $subline = "";
+ print "\n<P><TABLE cellspacing=0 cellpadding=0>";
+ for $i (sort({$a <=> $b} keys(%used))) {
+ print "<TR>" if $col == 0;
+ $j = sprintf("%04X", $i);
+ # $1 = first two hex digits, used in the image path below
+ $j =~ /(..)(..)/;
+ $gif = "Small.Glyphs/$1/U$j.gif";
+ $alt = name($i);
+ print "<TD><img width=32 height=32 src=\"$gif\" alt=\"$alt\">";
+ # the code cells are collected and printed once the row is full
+ $subline .= "<TD><SMALL><SAMP>$j</SAMP></SMALL>";
+ if (++$col == $width) {
+ print "<TR align=center>$subline";
+ $col = 0;
+ $subline = "";
+ }
+ }
+ # codes of an incomplete last row
+ print "<TR align=center>$subline" if ($col > 0);
+ print "</TABLE>\n";
+ } elsif (/^sources$/) {
+ # count how many characters are attributed to the various source set combinations
+ # NOTE(review): all print statements of this command are conditional
+ # on $html, so nothing is written in plain text mode -- confirm that
+ # this is intended.
+ print "<P>Number of occurences of source character set combinations:\n<TABLE border=2>" if $html;
+ for $i (keys(%used)) {
+ next if ($name{$i} eq "<control>");
+ # reduce the tags "[a{xx}][b]" to the combination key "a, b"
+ $sources = $used{$i};
+ $sources =~ s/\]\[/, /g;
+ $sources =~ s/^\[//g;
+ $sources =~ s/\]$//g;
+ $sources =~ s/\{(..)\}//g;
+ $contribs{$sources} += 1;
+ }
+ # one row per combination, in hash order
+ for $j (keys(%contribs)) {
+ print "<TR><TD>$contribs{$j}<TD>$j\n" if $html;
+ }
+ print "</TABLE>\n" if $html;
+ } elsif (/^compact$/) {
+ # print compact table in P10 MES format
+ # Each output line starts with a row (the code shifted right by eight
+ # bits, in hex) followed by the cells (low bytes) of that row, with
+ # runs of consecutive cells written as "first-last".  The plain text
+ # header always reads "# Plane 00", whatever the set contains.
+ print "<P>Compact representation of this character set:\n<TABLE border=2>" if $html;
+ print "<TR><TD><B>Rows</B><TD><B>Positions (Cells)</B>" if $html;
+ print "\n# Plane 00\n# Rows\tPositions (Cells)\n" unless $html;
+ # $start_col/$last_col delimit the run of cells currently being
+ # collected; $start_col has been printed already, $last_col not yet
+ $current_row = '';
+ $start_col = '';
+ $last_col = '';
+ for $i (sort({$a <=> $b} keys(%used))) {
+ next if ($name{$i} eq "<control>");
+ $row = sprintf("%02X", $i >> 8);
+ $col = sprintf("%02X", $i & 0xff);
+ if ($row ne $current_row) {
+ # new row: finish the pending run of the previous row first
+ if (($last_col ne '') and ($last_col ne $start_col)) {
+ print "-$last_col";
+ print "</SAMP>" if $html;
+ }
+ print "<TR><TD><SAMP>$row</SAMP><TD><SAMP>" if $html;
+ print "\n $row\t" unless $html;
+ # $len counts the characters written after the row number
+ $len = 0;
+ $current_row = $row;
+ $start_col = '';
+ }
+ if ($start_col eq '') {
+ # first cell of a row
+ print "$col";
+ $len += 2;
+ $start_col = $col;
+ $last_col = $col;
+ } elsif (hex($col) == hex($last_col) + 1) {
+ # the cell extends the current run
+ $last_col = $col;
+ } else {
+ # gap: close the current run and start a new one
+ if ($last_col ne $start_col) {
+ print "-$last_col";
+ $len += 3;
+ }
+ # plain text lines are continued on a new line for the same row
+ # once more than 60 characters have been written
+ if ($len > 60 && !$html) {
+ print "\n $row\t";
+ $len = 0;
+ };
+ print " " if $len;
+ print "$col";
+ # two characters for the cell, one more if a blank was written
+ $len += 2 + !! $len;
+ $start_col = $col;
+ $last_col = $col;
+ }
+ }
+ # finish the run that is still pending after the last character
+ if (($last_col ne '') and ($last_col ne $start_col)) {
+ print "-$last_col";
+ print "</SAMP>" if $html;
+ }
+ print "\n" if ($current_row ne '');
+ print "</TABLE>\n" if $html;
+ print "\n";
+ } elsif (/^c$/) {
+     # print table as C interval array, three { first, last } pairs
+     # per output line
+     print "{";
+     # collapse the sorted non-control codes into [first, last] runs
+     my @runs;
+     for $i (sort({$a <=> $b} keys(%used))) {
+         next if ($name{$i} eq "<control>");
+         if (@runs && $i == $runs[-1][1] + 1) {
+             $runs[-1][1] = $i;
+         } else {
+             push(@runs, [$i, $i]);
+         }
+     }
+     for my $n (0 .. $#runs) {
+         print "\n " if $n % 3 == 0;
+         printf(" { 0x%04X, 0x%04X }", $runs[$n][0], $runs[$n][1]);
+         # every pair but the last is followed by a comma
+         print "," if $n < $#runs;
+     }
+     print "\n};\n";
+ } elsif (/^utf8-list$/) {
+ # Print all non-control characters of the set themselves, grouped
+ # under a heading per block, with a line break after 64 columns.
+ $col = 0;
+ $block = 0;
+ $last = -1;
+ for $i (sort({$a <=> $b} keys(%used))) {
+ next if ($name{$i} eq "<control>");
+ # advance to the first block that does not end before $i
+ while ($blockend[$block] < $i && $block < $blocks - 1) {
+ $block++;
+ }
+ # $i lies in the gap in front of that block and is the first
+ # character printed from this gap
+ # NOTE(review): for $block == 0 the index $block-1 is -1, which
+ # addresses the last block in Perl.
+ if ($last <= $blockend[$block-1] &&
+ $i < $blockstart[$block]) {
+ print "\n" if ($col);
+ printf "\nFree block (U+%04X-U+%04X):\n\n",
+ $blockend[$block-1] + 1, $blockstart[$block] - 1;
+ $col = 0;
+ }
+ # $i is the first character printed from this block
+ if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
+ print "\n" if ($col);
+ printf "\n$blockname[$block] (U+%04X-U+%04X):\n\n",
+ $blockstart[$block], $blockend[$block];
+ $col = 0;
+ }
+ if ($category{$i} eq 'Mn') {
+ # prefix non-spacing character with U+25CC DOTTED CIRCLE
+ print "\x{25CC}";
+ } elsif ($category{$i} eq 'Me') {
+ # prefix enclosing non-spacing character with space
+ print " ";
+ }
+ print pack("U", $i);
+ # characters for which iswide() is true count as two columns
+ $col += 1 + iswide($i);
+ if ($col >= 64) {
+ print "\n";
+ $col = 0;
+ }
+ $last = $i;
+ }
+ print "\n" if ($col);
+ } elsif (/^collections$/) {
+     # list name and range of every block from which the set contains
+     # at least one non-control character
+     $block = 0;
+     $last = -1;
+     for $i (sort({$a <=> $b} keys(%used))) {
+         next if ($name{$i} eq "<control>");
+         # advance to the first block that does not end before $i
+         $block++ until $blockend[$block] >= $i || $block >= $blocks - 1;
+         # first character seen from this block: print the block once,
+         # with its name padded to 40 columns
+         if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
+             my $padding = " " x (40 - length($blockname[$block]));
+             printf "%s%s%04X-%04X\n", $blockname[$block], $padding,
+                 $blockstart[$block], $blockend[$block];
+         }
+         $last = $i;
+     }
+ } elsif (/^nr$/) {
+     # report the number of non-control characters in the set
+     print($html ? "<P>" : "# ");
+     print "Number of characters in above table: ";
+     $count = grep { $name{$_} ne "<control>" } keys(%used);
+     print $count;
+     print "\n";
+ } elsif (/^clean$/) {
+     # remove characters from set that are not in $unicodedata
+     delete @used{ grep { !is_unicode($_) } keys(%used) };
+ } elsif (/^unknown$/) {
+     # remove characters from set that are in $unicodedata
+     delete @used{ grep { is_unicode($_) } keys(%used) };
+ } else {
+     # anything that was not recognized above is an error
+     die("Unknown command line command '$_'");
+ };
+}