author     Carlos O'Donell <carlos@systemhalted.org>   2015-12-09 22:27:41 -0500
committer  Carlos O'Donell <carlos@systemhalted.org>   2015-12-09 22:52:13 -0500
commit     dd8e8e547647bf7a3f6feb816a848a846feeaf14 (patch)
tree       a2565747c02ddaa9b178a5aa9de6fa42aa5ae979
parent     40b59cace2fd5e5aa04367073a54efc995059376 (diff)
Update transliteration support to Unicode 7.0.0.
The transliteration files are now autogenerated from upstream Unicode
data.
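The generators added by this commit scan the decomposition field of UnicodeData.txt for tags such as <circle>, <square>, <compat>, and <font> — the same extraction the hand-written grep/sed pipelines removed below used to perform — and emit locale rules from the matches. Here is a minimal Python sketch of that extraction for the <circle> tag, assuming a local copy of UnicodeData.txt; the ucs_symbol helper stands in for what a utility module like the new unicode_utils.py would provide, and the code is illustrative rather than the actual gen_translit_circle.py:

```python
# Sketch: emit glibc locale transliteration rules for encircled characters
# from UnicodeData.txt, wrapping each decomposition target in "(...)".
# Illustrative only -- not the actual gen_translit_circle.py code.

def ucs_symbol(code_point):
    """Format a code point as <UXXXX> (BMP) or <UXXXXXXXX> (supplementary)."""
    if code_point < 0x10000:
        return '<U{:04X}>'.format(code_point)
    return '<U{:08X}>'.format(code_point)

with open('UnicodeData.txt', encoding='utf-8') as data_file:
    for line in data_file:
        fields = line.split(';')
        # Field 5 holds the decomposition mapping, e.g. "<circle> 0031".
        decomposition = fields[5]
        if decomposition.startswith('<circle> '):
            name = fields[1]
            source = int(fields[0], 16)
            targets = [int(cp, 16) for cp in decomposition.split()[1:]]
            # Wrap the replacement in parentheses: U+0028 ... U+0029.
            replacement = ''.join(ucs_symbol(cp)
                                  for cp in [0x0028] + targets + [0x0029])
            print('% {0}'.format(name))
            print('{0} "{1}"'.format(ucs_symbol(source), replacement))
```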
-rw-r--r--  localedata/ChangeLog                               |  32
-rw-r--r--  localedata/locales/translit_circle                 |  30
-rw-r--r--  localedata/locales/translit_cjk_compat             | 422
-rw-r--r--  localedata/locales/translit_combining              | 636
-rw-r--r--  localedata/locales/translit_compat                 | 578
-rw-r--r--  localedata/locales/translit_font                   | 151
-rw-r--r--  localedata/locales/translit_fraction               |  15
-rw-r--r--  localedata/unicode-gen/Makefile                    |  42
-rw-r--r--  localedata/unicode-gen/gen_translit_circle.py      | 150
-rw-r--r--  localedata/unicode-gen/gen_translit_cjk_compat.py  | 220
-rw-r--r--  localedata/unicode-gen/gen_translit_combining.py   | 442
-rw-r--r--  localedata/unicode-gen/gen_translit_compat.py      | 326
-rw-r--r--  localedata/unicode-gen/gen_translit_font.py        | 156
-rw-r--r--  localedata/unicode-gen/gen_translit_fraction.py    | 197
-rwxr-xr-x  localedata/unicode-gen/gen_unicode_ctype.py        | 497
-rw-r--r--  localedata/unicode-gen/unicode_utils.py            | 502
-rwxr-xr-x  localedata/unicode-gen/utf8_compatibility.py       | 217
-rwxr-xr-x  localedata/unicode-gen/utf8_gen.py                 |  28
18 files changed, 3928 insertions, 713 deletions
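A note on the rule syntax in the locale files below: each rule maps a source code point to a double-quoted replacement, and alternatives separated by ';' are listed in order of preference — so <U3378> (SQUARE DM SQUARED) falls back from "dm²" to "dm^2" when the first form is not representable in the target charset. Code points outside the BMP are written with eight hex digits; several hunks below correct truncated forms (e.g. <U43AB> becomes <U000243AB>). A small illustrative decoder for one such rule, assuming the syntax described above (not glibc code):

```python
# Sketch: decode one transliteration rule into readable Python strings.
import re

RULE = '<U3378> "<U0064><U006D><U00B2>";"<U0064><U006D><U005E><U0032>"'

def decode(symbols):
    """Turn a run of <UXXXX> symbols into a Python string."""
    return ''.join(chr(int(cp, 16))
                   for cp in re.findall(r'<U([0-9A-F]+)>', symbols))

source, _, replacements = RULE.partition(' ')
# Alternatives are separated by ';' and tried in order of preference.
alternatives = [decode(alt) for alt in replacements.split(';')]
print(decode(source), '->', alternatives)   # SQUARE DM SQUARED -> ['dm²', 'dm^2']
```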
diff --git a/localedata/ChangeLog b/localedata/ChangeLog index 5b58e8a..c58ccfe 100644 --- a/localedata/ChangeLog +++ b/localedata/ChangeLog @@ -1,4 +1,36 @@ 2015-12-09 Mike FABIAN <mfabian@redhat.com> + + [BZ #16061] + * unicode-gen/unicode_utils.py: New file. + * unicode-gen/gen_translit_circle.py: New file. + * unicode-gen/gen_translit_cjk_compat.py: New file. + * unicode-gen/gen_translit_combining.py: New file. + * unicode-gen/gen_translit_compat.py: New file + * unicode-gen/gen_translit_font.py: New file. + * unicode-gen/gen_translit_fraction.py: New file. + * unicode-gen/gen_unicode_ctype.py: Use unicode_utils.py. + * unicode-gen/utf8_compatibility.py: Likewise. + * unicode-gen/utf8_gen.py: Likewise. + * unicode-gen/Makefile (GENERATED): Add translit_combining + translit_compat translit_circle translit_cjk_compat translit_font + translit_fraction. + (install): Install translit_combining translit_compat translit_circle + translit_cjk_compat translit_font translit_fraction. + (UTF-8-report): Reference UnicodeData.txt and EastAsianWidth.txt. + (translit_combining): New target. + (translit_compat): New target. + (translit_circle): New target. + (translit_cjk_compat): New target. + (translit_font): New target. + (translit_fraction): New target. + * locales/translit_circle: Regenerate. + * locales/translit_cjk_compat: Regenerate. + * locales/translit_combining: Regenerate. + * locales/translit_compat: Regenerate. + * locales/translit_font: Regenerate. + * locales/translit_fraction: Regenerate. + +2015-12-09 Mike FABIAN <mfabian@redhat.com> Marko Myllynen <myllynen@redhat.com> * locales/translit_neutral: Change Ö U+00D6 LATIN CAPITAL LETTER O WITH diff --git a/localedata/locales/translit_circle b/localedata/locales/translit_circle index f701bc9..4fa6999 100644 --- a/localedata/locales/translit_circle +++ b/localedata/locales/translit_circle @@ -2,9 +2,7 @@ escape_char / comment_char % % Transliterations of encircled characters. -% Generated through -% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<circle>[^;]*;' UnicodeData.txt | \ -% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<circle> \([^;]*\);.*$/<U\1> "<U0028 \3 0029>"% \2/' -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G' +% Generated automatically from UnicodeData.txt by gen_translit_circle.py on 2015-12-09 for Unicode 7.0.0. 
LC_CTYPE @@ -156,6 +154,14 @@ translit_start <U24E9> "<U0028><U007A><U0029>" % CIRCLED DIGIT ZERO <U24EA> "<U0028><U0030><U0029>" +% CIRCLED IDEOGRAPH QUESTION +<U3244> "<U0028><U554F><U0029>" +% CIRCLED IDEOGRAPH KINDERGARTEN +<U3245> "<U0028><U5E7C><U0029>" +% CIRCLED IDEOGRAPH SCHOOL +<U3246> "<U0028><U6587><U0029>" +% CIRCLED IDEOGRAPH KOTO +<U3247> "<U0028><U7B8F><U0029>" % CIRCLED NUMBER TWENTY ONE <U3251> "<U0028><U0032><U0031><U0029>" % CIRCLED NUMBER TWENTY TWO @@ -242,6 +248,12 @@ translit_start <U327A> "<U0028><U1111><U1161><U0029>" % CIRCLED HANGUL HIEUH A <U327B> "<U0028><U1112><U1161><U0029>" +% CIRCLED KOREAN CHARACTER CHAMKO +<U327C> "<U0028><U110E><U1161><U11B7><U1100><U1169><U0029>" +% CIRCLED KOREAN CHARACTER JUEUI +<U327D> "<U0028><U110C><U116E><U110B><U1174><U0029>" +% CIRCLED HANGUL IEUNG U +<U327E> "<U0028><U110B><U116E><U0029>" % CIRCLED IDEOGRAPH ONE <U3280> "<U0028><U4E00><U0029>" % CIRCLED IDEOGRAPH TWO @@ -464,6 +476,18 @@ translit_start <U32FD> "<U0028><U30F1><U0029>" % CIRCLED KATAKANA WO <U32FE> "<U0028><U30F2><U0029>" +% CIRCLED ITALIC LATIN CAPITAL LETTER C +<U0001F12B> "<U0028><U0043><U0029>" +% CIRCLED ITALIC LATIN CAPITAL LETTER R +<U0001F12C> "<U0028><U0052><U0029>" +% CIRCLED CD +<U0001F12D> "<U0028><U0043><U0044><U0029>" +% CIRCLED WZ +<U0001F12E> "<U0028><U0057><U005A><U0029>" +% CIRCLED IDEOGRAPH ADVANTAGE +<U0001F250> "<U0028><U5F97><U0029>" +% CIRCLED IDEOGRAPH ACCEPT +<U0001F251> "<U0028><U53EF><U0029>" translit_end diff --git a/localedata/locales/translit_cjk_compat b/localedata/locales/translit_cjk_compat index c73e5e3..e61e8e5 100644 --- a/localedata/locales/translit_cjk_compat +++ b/localedata/locales/translit_cjk_compat @@ -2,18 +2,22 @@ escape_char / comment_char % % Transliterations of CJK compatibility characters. -% Generated through -% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<square>[^;]*;' UnicodeData.txt | \ -% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<square> \([^;]*\);.*$/<U\1> "<U\3>"% \2/' | sed -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G' -% and -% $ grep '[^;]*;CJK COMPATIBILITY IDEOGRAPH[^;]*;[^;]*;[^;]*;[^;]*;[^;]' UnicodeData.txt | \ -% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;\([^;]*\);.*$/<U\1> <U\3>% \2/' | sed -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G' | \ -% sed -e 's/<U\(.....\)>/<U000\1>/g' +% Generated automatically from UnicodeData.txt by gen_translit_cjk_compat.py on 2015-12-09 for Unicode 7.0.0. 
LC_CTYPE translit_start +% PARTNERSHIP SIGN +<U3250> "<U0050><U0054><U0045>" +% SQUARE HG +<U32CC> "<U0048><U0067>" +% SQUARE ERG +<U32CD> "<U0065><U0072><U0067>" +% SQUARE EV +<U32CE> "<U0065><U0056>" +% LIMITED LIABILITY SIGN +<U32CF> "<U004C><U0054><U0044>" % SQUARE APAATO <U3300> "<U30A2><U30D1><U30FC><U30C8>" % SQUARE ARUHUA @@ -202,6 +206,14 @@ translit_start <U3375> "<U006F><U0056>" % SQUARE PC <U3376> "<U0070><U0063>" +% SQUARE DM +<U3377> "<U0064><U006D>" +% SQUARE DM SQUARED +<U3378> "<U0064><U006D><U00B2>";"<U0064><U006D><U005E><U0032>" +% SQUARE DM CUBED +<U3379> "<U0064><U006D><U00B3>";"<U0064><U006D><U005E><U0033>" +% SQUARE IU +<U337A> "<U0049><U0055>" % SQUARE ERA NAME HEISEI <U337B> "<U5E73><U6210>" % SQUARE ERA NAME SYOUWA @@ -400,6 +412,170 @@ translit_start <U33DC> "<U0053><U0076>" % SQUARE WB <U33DD> "<U0057><U0062>" +% SQUARE V OVER M +<U33DE> "<U0056><U2215><U006D>";"<U0056><U002F><U006D>" +% SQUARE A OVER M +<U33DF> "<U0041><U2215><U006D>";"<U0041><U002F><U006D>" +% SQUARE GAL +<U33FF> "<U0067><U0061><U006C>" +% SQUARED LATIN CAPITAL LETTER A +<U0001F130> <U0041> +% SQUARED LATIN CAPITAL LETTER B +<U0001F131> <U0042> +% SQUARED LATIN CAPITAL LETTER C +<U0001F132> <U0043> +% SQUARED LATIN CAPITAL LETTER D +<U0001F133> <U0044> +% SQUARED LATIN CAPITAL LETTER E +<U0001F134> <U0045> +% SQUARED LATIN CAPITAL LETTER F +<U0001F135> <U0046> +% SQUARED LATIN CAPITAL LETTER G +<U0001F136> <U0047> +% SQUARED LATIN CAPITAL LETTER H +<U0001F137> <U0048> +% SQUARED LATIN CAPITAL LETTER I +<U0001F138> <U0049> +% SQUARED LATIN CAPITAL LETTER J +<U0001F139> <U004A> +% SQUARED LATIN CAPITAL LETTER K +<U0001F13A> <U004B> +% SQUARED LATIN CAPITAL LETTER L +<U0001F13B> <U004C> +% SQUARED LATIN CAPITAL LETTER M +<U0001F13C> <U004D> +% SQUARED LATIN CAPITAL LETTER N +<U0001F13D> <U004E> +% SQUARED LATIN CAPITAL LETTER O +<U0001F13E> <U004F> +% SQUARED LATIN CAPITAL LETTER P +<U0001F13F> <U0050> +% SQUARED LATIN CAPITAL LETTER Q +<U0001F140> <U0051> +% SQUARED LATIN CAPITAL LETTER R +<U0001F141> <U0052> +% SQUARED LATIN CAPITAL LETTER S +<U0001F142> <U0053> +% SQUARED LATIN CAPITAL LETTER T +<U0001F143> <U0054> +% SQUARED LATIN CAPITAL LETTER U +<U0001F144> <U0055> +% SQUARED LATIN CAPITAL LETTER V +<U0001F145> <U0056> +% SQUARED LATIN CAPITAL LETTER W +<U0001F146> <U0057> +% SQUARED LATIN CAPITAL LETTER X +<U0001F147> <U0058> +% SQUARED LATIN CAPITAL LETTER Y +<U0001F148> <U0059> +% SQUARED LATIN CAPITAL LETTER Z +<U0001F149> <U005A> +% SQUARED HV +<U0001F14A> "<U0048><U0056>" +% SQUARED MV +<U0001F14B> "<U004D><U0056>" +% SQUARED SD +<U0001F14C> "<U0053><U0044>" +% SQUARED SS +<U0001F14D> "<U0053><U0053>" +% SQUARED PPV +<U0001F14E> "<U0050><U0050><U0056>" +% SQUARED WC +<U0001F14F> "<U0057><U0043>" +% SQUARE DJ +<U0001F190> "<U0044><U004A>" +% SQUARE HIRAGANA HOKA +<U0001F200> "<U307B><U304B>" +% SQUARED KATAKANA KOKO +<U0001F201> "<U30B3><U30B3>" +% SQUARED KATAKANA SA +<U0001F202> <U30B5> +% SQUARED CJK UNIFIED IDEOGRAPH-624B +<U0001F210> <U624B> +% SQUARED CJK UNIFIED IDEOGRAPH-5B57 +<U0001F211> <U5B57> +% SQUARED CJK UNIFIED IDEOGRAPH-53CC +<U0001F212> <U53CC> +% SQUARED KATAKANA DE +<U0001F213> <U30C7> +% SQUARED CJK UNIFIED IDEOGRAPH-4E8C +<U0001F214> <U4E8C> +% SQUARED CJK UNIFIED IDEOGRAPH-591A +<U0001F215> <U591A> +% SQUARED CJK UNIFIED IDEOGRAPH-89E3 +<U0001F216> <U89E3> +% SQUARED CJK UNIFIED IDEOGRAPH-5929 +<U0001F217> <U5929> +% SQUARED CJK UNIFIED IDEOGRAPH-4EA4 +<U0001F218> <U4EA4> +% SQUARED CJK UNIFIED IDEOGRAPH-6620 +<U0001F219> <U6620> +% SQUARED CJK UNIFIED 
IDEOGRAPH-7121 +<U0001F21A> <U7121> +% SQUARED CJK UNIFIED IDEOGRAPH-6599 +<U0001F21B> <U6599> +% SQUARED CJK UNIFIED IDEOGRAPH-524D +<U0001F21C> <U524D> +% SQUARED CJK UNIFIED IDEOGRAPH-5F8C +<U0001F21D> <U5F8C> +% SQUARED CJK UNIFIED IDEOGRAPH-518D +<U0001F21E> <U518D> +% SQUARED CJK UNIFIED IDEOGRAPH-65B0 +<U0001F21F> <U65B0> +% SQUARED CJK UNIFIED IDEOGRAPH-521D +<U0001F220> <U521D> +% SQUARED CJK UNIFIED IDEOGRAPH-7D42 +<U0001F221> <U7D42> +% SQUARED CJK UNIFIED IDEOGRAPH-751F +<U0001F222> <U751F> +% SQUARED CJK UNIFIED IDEOGRAPH-8CA9 +<U0001F223> <U8CA9> +% SQUARED CJK UNIFIED IDEOGRAPH-58F0 +<U0001F224> <U58F0> +% SQUARED CJK UNIFIED IDEOGRAPH-5439 +<U0001F225> <U5439> +% SQUARED CJK UNIFIED IDEOGRAPH-6F14 +<U0001F226> <U6F14> +% SQUARED CJK UNIFIED IDEOGRAPH-6295 +<U0001F227> <U6295> +% SQUARED CJK UNIFIED IDEOGRAPH-6355 +<U0001F228> <U6355> +% SQUARED CJK UNIFIED IDEOGRAPH-4E00 +<U0001F229> <U4E00> +% SQUARED CJK UNIFIED IDEOGRAPH-4E09 +<U0001F22A> <U4E09> +% SQUARED CJK UNIFIED IDEOGRAPH-904A +<U0001F22B> <U904A> +% SQUARED CJK UNIFIED IDEOGRAPH-5DE6 +<U0001F22C> <U5DE6> +% SQUARED CJK UNIFIED IDEOGRAPH-4E2D +<U0001F22D> <U4E2D> +% SQUARED CJK UNIFIED IDEOGRAPH-53F3 +<U0001F22E> <U53F3> +% SQUARED CJK UNIFIED IDEOGRAPH-6307 +<U0001F22F> <U6307> +% SQUARED CJK UNIFIED IDEOGRAPH-8D70 +<U0001F230> <U8D70> +% SQUARED CJK UNIFIED IDEOGRAPH-6253 +<U0001F231> <U6253> +% SQUARED CJK UNIFIED IDEOGRAPH-7981 +<U0001F232> <U7981> +% SQUARED CJK UNIFIED IDEOGRAPH-7A7A +<U0001F233> <U7A7A> +% SQUARED CJK UNIFIED IDEOGRAPH-5408 +<U0001F234> <U5408> +% SQUARED CJK UNIFIED IDEOGRAPH-6E80 +<U0001F235> <U6E80> +% SQUARED CJK UNIFIED IDEOGRAPH-6709 +<U0001F236> <U6709> +% SQUARED CJK UNIFIED IDEOGRAPH-6708 +<U0001F237> <U6708> +% SQUARED CJK UNIFIED IDEOGRAPH-7533 +<U0001F238> <U7533> +% SQUARED CJK UNIFIED IDEOGRAPH-5272 +<U0001F239> <U5272> +% SQUARED CJK UNIFIED IDEOGRAPH-55B6 +<U0001F23A> <U55B6> % CJK COMPATIBILITY IDEOGRAPH-F900 <UF900> <U8C48> % CJK COMPATIBILITY IDEOGRAPH-F901 @@ -980,6 +1156,10 @@ translit_start <UFA2C> <U9928> % CJK COMPATIBILITY IDEOGRAPH-FA2D <UFA2D> <U9DB4> +% CJK COMPATIBILITY IDEOGRAPH-FA2E +<UFA2E> <U90DE> +% CJK COMPATIBILITY IDEOGRAPH-FA2F +<UFA2F> <U96B7> % CJK COMPATIBILITY IDEOGRAPH-FA30 <UFA30> <U4FAE> % CJK COMPATIBILITY IDEOGRAPH-FA31 @@ -1098,6 +1278,224 @@ translit_start <UFA69> <U97FF> % CJK COMPATIBILITY IDEOGRAPH-FA6A <UFA6A> <U983B> +% CJK COMPATIBILITY IDEOGRAPH-FA6B +<UFA6B> <U6075> +% CJK COMPATIBILITY IDEOGRAPH-FA6C +<UFA6C> <U000242EE> +% CJK COMPATIBILITY IDEOGRAPH-FA6D +<UFA6D> <U8218> +% CJK COMPATIBILITY IDEOGRAPH-FA70 +<UFA70> <U4E26> +% CJK COMPATIBILITY IDEOGRAPH-FA71 +<UFA71> <U51B5> +% CJK COMPATIBILITY IDEOGRAPH-FA72 +<UFA72> <U5168> +% CJK COMPATIBILITY IDEOGRAPH-FA73 +<UFA73> <U4F80> +% CJK COMPATIBILITY IDEOGRAPH-FA74 +<UFA74> <U5145> +% CJK COMPATIBILITY IDEOGRAPH-FA75 +<UFA75> <U5180> +% CJK COMPATIBILITY IDEOGRAPH-FA76 +<UFA76> <U52C7> +% CJK COMPATIBILITY IDEOGRAPH-FA77 +<UFA77> <U52FA> +% CJK COMPATIBILITY IDEOGRAPH-FA78 +<UFA78> <U559D> +% CJK COMPATIBILITY IDEOGRAPH-FA79 +<UFA79> <U5555> +% CJK COMPATIBILITY IDEOGRAPH-FA7A +<UFA7A> <U5599> +% CJK COMPATIBILITY IDEOGRAPH-FA7B +<UFA7B> <U55E2> +% CJK COMPATIBILITY IDEOGRAPH-FA7C +<UFA7C> <U585A> +% CJK COMPATIBILITY IDEOGRAPH-FA7D +<UFA7D> <U58B3> +% CJK COMPATIBILITY IDEOGRAPH-FA7E +<UFA7E> <U5944> +% CJK COMPATIBILITY IDEOGRAPH-FA7F +<UFA7F> <U5954> +% CJK COMPATIBILITY IDEOGRAPH-FA80 +<UFA80> <U5A62> +% CJK COMPATIBILITY IDEOGRAPH-FA81 +<UFA81> <U5B28> +% CJK COMPATIBILITY 
IDEOGRAPH-FA82 +<UFA82> <U5ED2> +% CJK COMPATIBILITY IDEOGRAPH-FA83 +<UFA83> <U5ED9> +% CJK COMPATIBILITY IDEOGRAPH-FA84 +<UFA84> <U5F69> +% CJK COMPATIBILITY IDEOGRAPH-FA85 +<UFA85> <U5FAD> +% CJK COMPATIBILITY IDEOGRAPH-FA86 +<UFA86> <U60D8> +% CJK COMPATIBILITY IDEOGRAPH-FA87 +<UFA87> <U614E> +% CJK COMPATIBILITY IDEOGRAPH-FA88 +<UFA88> <U6108> +% CJK COMPATIBILITY IDEOGRAPH-FA89 +<UFA89> <U618E> +% CJK COMPATIBILITY IDEOGRAPH-FA8A +<UFA8A> <U6160> +% CJK COMPATIBILITY IDEOGRAPH-FA8B +<UFA8B> <U61F2> +% CJK COMPATIBILITY IDEOGRAPH-FA8C +<UFA8C> <U6234> +% CJK COMPATIBILITY IDEOGRAPH-FA8D +<UFA8D> <U63C4> +% CJK COMPATIBILITY IDEOGRAPH-FA8E +<UFA8E> <U641C> +% CJK COMPATIBILITY IDEOGRAPH-FA8F +<UFA8F> <U6452> +% CJK COMPATIBILITY IDEOGRAPH-FA90 +<UFA90> <U6556> +% CJK COMPATIBILITY IDEOGRAPH-FA91 +<UFA91> <U6674> +% CJK COMPATIBILITY IDEOGRAPH-FA92 +<UFA92> <U6717> +% CJK COMPATIBILITY IDEOGRAPH-FA93 +<UFA93> <U671B> +% CJK COMPATIBILITY IDEOGRAPH-FA94 +<UFA94> <U6756> +% CJK COMPATIBILITY IDEOGRAPH-FA95 +<UFA95> <U6B79> +% CJK COMPATIBILITY IDEOGRAPH-FA96 +<UFA96> <U6BBA> +% CJK COMPATIBILITY IDEOGRAPH-FA97 +<UFA97> <U6D41> +% CJK COMPATIBILITY IDEOGRAPH-FA98 +<UFA98> <U6EDB> +% CJK COMPATIBILITY IDEOGRAPH-FA99 +<UFA99> <U6ECB> +% CJK COMPATIBILITY IDEOGRAPH-FA9A +<UFA9A> <U6F22> +% CJK COMPATIBILITY IDEOGRAPH-FA9B +<UFA9B> <U701E> +% CJK COMPATIBILITY IDEOGRAPH-FA9C +<UFA9C> <U716E> +% CJK COMPATIBILITY IDEOGRAPH-FA9D +<UFA9D> <U77A7> +% CJK COMPATIBILITY IDEOGRAPH-FA9E +<UFA9E> <U7235> +% CJK COMPATIBILITY IDEOGRAPH-FA9F +<UFA9F> <U72AF> +% CJK COMPATIBILITY IDEOGRAPH-FAA0 +<UFAA0> <U732A> +% CJK COMPATIBILITY IDEOGRAPH-FAA1 +<UFAA1> <U7471> +% CJK COMPATIBILITY IDEOGRAPH-FAA2 +<UFAA2> <U7506> +% CJK COMPATIBILITY IDEOGRAPH-FAA3 +<UFAA3> <U753B> +% CJK COMPATIBILITY IDEOGRAPH-FAA4 +<UFAA4> <U761D> +% CJK COMPATIBILITY IDEOGRAPH-FAA5 +<UFAA5> <U761F> +% CJK COMPATIBILITY IDEOGRAPH-FAA6 +<UFAA6> <U76CA> +% CJK COMPATIBILITY IDEOGRAPH-FAA7 +<UFAA7> <U76DB> +% CJK COMPATIBILITY IDEOGRAPH-FAA8 +<UFAA8> <U76F4> +% CJK COMPATIBILITY IDEOGRAPH-FAA9 +<UFAA9> <U774A> +% CJK COMPATIBILITY IDEOGRAPH-FAAA +<UFAAA> <U7740> +% CJK COMPATIBILITY IDEOGRAPH-FAAB +<UFAAB> <U78CC> +% CJK COMPATIBILITY IDEOGRAPH-FAAC +<UFAAC> <U7AB1> +% CJK COMPATIBILITY IDEOGRAPH-FAAD +<UFAAD> <U7BC0> +% CJK COMPATIBILITY IDEOGRAPH-FAAE +<UFAAE> <U7C7B> +% CJK COMPATIBILITY IDEOGRAPH-FAAF +<UFAAF> <U7D5B> +% CJK COMPATIBILITY IDEOGRAPH-FAB0 +<UFAB0> <U7DF4> +% CJK COMPATIBILITY IDEOGRAPH-FAB1 +<UFAB1> <U7F3E> +% CJK COMPATIBILITY IDEOGRAPH-FAB2 +<UFAB2> <U8005> +% CJK COMPATIBILITY IDEOGRAPH-FAB3 +<UFAB3> <U8352> +% CJK COMPATIBILITY IDEOGRAPH-FAB4 +<UFAB4> <U83EF> +% CJK COMPATIBILITY IDEOGRAPH-FAB5 +<UFAB5> <U8779> +% CJK COMPATIBILITY IDEOGRAPH-FAB6 +<UFAB6> <U8941> +% CJK COMPATIBILITY IDEOGRAPH-FAB7 +<UFAB7> <U8986> +% CJK COMPATIBILITY IDEOGRAPH-FAB8 +<UFAB8> <U8996> +% CJK COMPATIBILITY IDEOGRAPH-FAB9 +<UFAB9> <U8ABF> +% CJK COMPATIBILITY IDEOGRAPH-FABA +<UFABA> <U8AF8> +% CJK COMPATIBILITY IDEOGRAPH-FABB +<UFABB> <U8ACB> +% CJK COMPATIBILITY IDEOGRAPH-FABC +<UFABC> <U8B01> +% CJK COMPATIBILITY IDEOGRAPH-FABD +<UFABD> <U8AFE> +% CJK COMPATIBILITY IDEOGRAPH-FABE +<UFABE> <U8AED> +% CJK COMPATIBILITY IDEOGRAPH-FABF +<UFABF> <U8B39> +% CJK COMPATIBILITY IDEOGRAPH-FAC0 +<UFAC0> <U8B8A> +% CJK COMPATIBILITY IDEOGRAPH-FAC1 +<UFAC1> <U8D08> +% CJK COMPATIBILITY IDEOGRAPH-FAC2 +<UFAC2> <U8F38> +% CJK COMPATIBILITY IDEOGRAPH-FAC3 +<UFAC3> <U9072> +% CJK COMPATIBILITY IDEOGRAPH-FAC4 +<UFAC4> <U9199> +% CJK COMPATIBILITY 
IDEOGRAPH-FAC5 +<UFAC5> <U9276> +% CJK COMPATIBILITY IDEOGRAPH-FAC6 +<UFAC6> <U967C> +% CJK COMPATIBILITY IDEOGRAPH-FAC7 +<UFAC7> <U96E3> +% CJK COMPATIBILITY IDEOGRAPH-FAC8 +<UFAC8> <U9756> +% CJK COMPATIBILITY IDEOGRAPH-FAC9 +<UFAC9> <U97DB> +% CJK COMPATIBILITY IDEOGRAPH-FACA +<UFACA> <U97FF> +% CJK COMPATIBILITY IDEOGRAPH-FACB +<UFACB> <U980B> +% CJK COMPATIBILITY IDEOGRAPH-FACC +<UFACC> <U983B> +% CJK COMPATIBILITY IDEOGRAPH-FACD +<UFACD> <U9B12> +% CJK COMPATIBILITY IDEOGRAPH-FACE +<UFACE> <U9F9C> +% CJK COMPATIBILITY IDEOGRAPH-FACF +<UFACF> <U0002284A> +% CJK COMPATIBILITY IDEOGRAPH-FAD0 +<UFAD0> <U00022844> +% CJK COMPATIBILITY IDEOGRAPH-FAD1 +<UFAD1> <U000233D5> +% CJK COMPATIBILITY IDEOGRAPH-FAD2 +<UFAD2> <U3B9D> +% CJK COMPATIBILITY IDEOGRAPH-FAD3 +<UFAD3> <U4018> +% CJK COMPATIBILITY IDEOGRAPH-FAD4 +<UFAD4> <U4039> +% CJK COMPATIBILITY IDEOGRAPH-FAD5 +<UFAD5> <U00025249> +% CJK COMPATIBILITY IDEOGRAPH-FAD6 +<UFAD6> <U00025CD0> +% CJK COMPATIBILITY IDEOGRAPH-FAD7 +<UFAD7> <U00027ED3> +% CJK COMPATIBILITY IDEOGRAPH-FAD8 +<UFAD8> <U9F43> +% CJK COMPATIBILITY IDEOGRAPH-FAD9 +<UFAD9> <U9F8E> % CJK COMPATIBILITY IDEOGRAPH-2F800 <U0002F800> <U4E3D> % CJK COMPATIBILITY IDEOGRAPH-2F801 @@ -1307,7 +1705,7 @@ translit_start % CJK COMPATIBILITY IDEOGRAPH-2F867 <U0002F867> <U36EE> % CJK COMPATIBILITY IDEOGRAPH-2F868 -<U0002F868> <U0002136A> +<U0002F868> <U36FC> % CJK COMPATIBILITY IDEOGRAPH-2F869 <U0002F869> <U5B08> % CJK COMPATIBILITY IDEOGRAPH-2F86A @@ -1331,7 +1729,7 @@ translit_start % CJK COMPATIBILITY IDEOGRAPH-2F873 <U0002F873> <U5C06> % CJK COMPATIBILITY IDEOGRAPH-2F874 -<U0002F874> <U5F33> +<U0002F874> <U5F53> % CJK COMPATIBILITY IDEOGRAPH-2F875 <U0002F875> <U5C22> % CJK COMPATIBILITY IDEOGRAPH-2F876 @@ -1673,7 +2071,7 @@ translit_start % CJK COMPATIBILITY IDEOGRAPH-2F91E <U0002F91E> <U719C> % CJK COMPATIBILITY IDEOGRAPH-2F91F -<U0002F91F> <U43AB> +<U0002F91F> <U000243AB> % CJK COMPATIBILITY IDEOGRAPH-2F920 <U0002F920> <U7228> % CJK COMPATIBILITY IDEOGRAPH-2F921 @@ -1801,7 +2199,7 @@ translit_start % CJK COMPATIBILITY IDEOGRAPH-2F95E <U0002F95E> <U00025AA7> % CJK COMPATIBILITY IDEOGRAPH-2F95F -<U0002F95F> <U7AAE> +<U0002F95F> <U7AEE> % CJK COMPATIBILITY IDEOGRAPH-2F960 <U0002F960> <U4202> % CJK COMPATIBILITY IDEOGRAPH-2F961 @@ -1993,7 +2391,7 @@ translit_start % CJK COMPATIBILITY IDEOGRAPH-2F9BE <U0002F9BE> <U8786> % CJK COMPATIBILITY IDEOGRAPH-2F9BF -<U0002F9BF> <U4D57> +<U0002F9BF> <U45D7> % CJK COMPATIBILITY IDEOGRAPH-2F9C0 <U0002F9C0> <U87E1> % CJK COMPATIBILITY IDEOGRAPH-2F9C1 diff --git a/localedata/locales/translit_combining b/localedata/locales/translit_combining index 44c62f9..67497ec 100644 --- a/localedata/locales/translit_combining +++ b/localedata/locales/translit_combining @@ -3,7 +3,7 @@ comment_char % % Transliterations that remove all combining characters (accents, % pronounciation marks, etc.). -% Generated from UnicodeData.txt. +% Generated automatically from UnicodeData.txt by gen_translit_combining.py on 2015-12-09 for Unicode 7.0.0. 
LC_CTYPE @@ -167,6 +167,40 @@ translit_start <U034D> "" % COMBINING UPWARDS ARROW BELOW <U034E> "" +% COMBINING GRAPHEME JOINER +<U034F> "" +% COMBINING RIGHT ARROWHEAD ABOVE +<U0350> "" +% COMBINING LEFT HALF RING ABOVE +<U0351> "" +% COMBINING FERMATA +<U0352> "" +% COMBINING X BELOW +<U0353> "" +% COMBINING LEFT ARROWHEAD BELOW +<U0354> "" +% COMBINING RIGHT ARROWHEAD BELOW +<U0355> "" +% COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW +<U0356> "" +% COMBINING RIGHT HALF RING ABOVE +<U0357> "" +% COMBINING DOT ABOVE RIGHT +<U0358> "" +% COMBINING ASTERISK BELOW +<U0359> "" +% COMBINING DOUBLE RING BELOW +<U035A> "" +% COMBINING ZIGZAG ABOVE +<U035B> "" +% COMBINING DOUBLE BREVE BELOW +<U035C> "" +% COMBINING DOUBLE BREVE +<U035D> "" +% COMBINING DOUBLE MACRON +<U035E> "" +% COMBINING DOUBLE MACRON BELOW +<U035F> "" % COMBINING DOUBLE TILDE <U0360> "" % COMBINING DOUBLE INVERTED BREVE @@ -199,6 +233,68 @@ translit_start <U036E> "" % COMBINING LATIN SMALL LETTER X <U036F> "" +% HEBREW ACCENT ETNAHTA +<U0591> "" +% HEBREW ACCENT SEGOL +<U0592> "" +% HEBREW ACCENT SHALSHELET +<U0593> "" +% HEBREW ACCENT ZAQEF QATAN +<U0594> "" +% HEBREW ACCENT ZAQEF GADOL +<U0595> "" +% HEBREW ACCENT TIPEHA +<U0596> "" +% HEBREW ACCENT REVIA +<U0597> "" +% HEBREW ACCENT ZARQA +<U0598> "" +% HEBREW ACCENT PASHTA +<U0599> "" +% HEBREW ACCENT YETIV +<U059A> "" +% HEBREW ACCENT TEVIR +<U059B> "" +% HEBREW ACCENT GERESH +<U059C> "" +% HEBREW ACCENT GERESH MUQDAM +<U059D> "" +% HEBREW ACCENT GERSHAYIM +<U059E> "" +% HEBREW ACCENT QARNEY PARA +<U059F> "" +% HEBREW ACCENT TELISHA GEDOLA +<U05A0> "" +% HEBREW ACCENT PAZER +<U05A1> "" +% HEBREW ACCENT ATNAH HAFUKH +<U05A2> "" +% HEBREW ACCENT MUNAH +<U05A3> "" +% HEBREW ACCENT MAHAPAKH +<U05A4> "" +% HEBREW ACCENT MERKHA +<U05A5> "" +% HEBREW ACCENT MERKHA KEFULA +<U05A6> "" +% HEBREW ACCENT DARGA +<U05A7> "" +% HEBREW ACCENT QADMA +<U05A8> "" +% HEBREW ACCENT TELISHA QETANA +<U05A9> "" +% HEBREW ACCENT YERAH BEN YOMO +<U05AA> "" +% HEBREW ACCENT OLE +<U05AB> "" +% HEBREW ACCENT ILUY +<U05AC> "" +% HEBREW ACCENT DEHI +<U05AD> "" +% HEBREW ACCENT ZINOR +<U05AE> "" +% HEBREW MARK MASORA CIRCLE +<U05AF> "" % HEBREW POINT SHEVA <U05B0> "" % HEBREW POINT HATAF SEGOL @@ -219,6 +315,8 @@ translit_start <U05B8> "" % HEBREW POINT HOLAM <U05B9> "" +% HEBREW POINT HOLAM HASER FOR VAV +<U05BA> "" % HEBREW POINT QUBUTS <U05BB> "" % HEBREW POINT DAGESH OR MAPIQ @@ -231,12 +329,358 @@ translit_start <U05C1> "" % HEBREW POINT SIN DOT <U05C2> "" +% HEBREW MARK UPPER DOT +<U05C4> "" +% HEBREW MARK LOWER DOT +<U05C5> "" +% HEBREW POINT QAMATS QATAN +<U05C7> "" +% ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM +<U0610> "" +% ARABIC SIGN ALAYHE ASSALLAM +<U0611> "" +% ARABIC SIGN RAHMATULLAH ALAYHE +<U0612> "" +% ARABIC SIGN RADI ALLAHOU ANHU +<U0613> "" +% ARABIC SIGN TAKHALLUS +<U0614> "" +% ARABIC SMALL HIGH TAH +<U0615> "" +% ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH +<U0616> "" +% ARABIC SMALL HIGH ZAIN +<U0617> "" +% ARABIC SMALL FATHA +<U0618> "" +% ARABIC SMALL DAMMA +<U0619> "" +% ARABIC SMALL KASRA +<U061A> "" +% ARABIC FATHATAN +<U064B> "" +% ARABIC DAMMATAN +<U064C> "" +% ARABIC KASRATAN +<U064D> "" +% ARABIC FATHA +<U064E> "" +% ARABIC DAMMA +<U064F> "" +% ARABIC KASRA +<U0650> "" +% ARABIC SHADDA +<U0651> "" +% ARABIC SUKUN +<U0652> "" % ARABIC MADDAH ABOVE <U0653> "" % ARABIC HAMZA ABOVE <U0654> "" % ARABIC HAMZA BELOW <U0655> "" +% ARABIC SUBSCRIPT ALEF +<U0656> "" +% ARABIC INVERTED DAMMA +<U0657> "" +% ARABIC MARK NOON GHUNNA +<U0658> "" +% ARABIC ZWARAKAY +<U0659> 
"" +% ARABIC VOWEL SIGN SMALL V ABOVE +<U065A> "" +% ARABIC VOWEL SIGN INVERTED SMALL V ABOVE +<U065B> "" +% ARABIC VOWEL SIGN DOT BELOW +<U065C> "" +% ARABIC REVERSED DAMMA +<U065D> "" +% ARABIC FATHA WITH TWO DOTS +<U065E> "" +% ARABIC WAVY HAMZA BELOW +<U065F> "" +% ARABIC LETTER SUPERSCRIPT ALEF +<U0670> "" +% ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA +<U06D6> "" +% ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA +<U06D7> "" +% ARABIC SMALL HIGH MEEM INITIAL FORM +<U06D8> "" +% ARABIC SMALL HIGH LAM ALEF +<U06D9> "" +% ARABIC SMALL HIGH JEEM +<U06DA> "" +% ARABIC SMALL HIGH THREE DOTS +<U06DB> "" +% ARABIC SMALL HIGH SEEN +<U06DC> "" +% ARABIC SMALL HIGH ROUNDED ZERO +<U06DF> "" +% ARABIC SMALL HIGH UPRIGHT RECTANGULAR ZERO +<U06E0> "" +% ARABIC SMALL HIGH DOTLESS HEAD OF KHAH +<U06E1> "" +% ARABIC SMALL HIGH MEEM ISOLATED FORM +<U06E2> "" +% ARABIC SMALL LOW SEEN +<U06E3> "" +% ARABIC SMALL HIGH MADDA +<U06E4> "" +% ARABIC SMALL HIGH YEH +<U06E7> "" +% ARABIC SMALL HIGH NOON +<U06E8> "" +% ARABIC EMPTY CENTRE LOW STOP +<U06EA> "" +% ARABIC EMPTY CENTRE HIGH STOP +<U06EB> "" +% ARABIC ROUNDED HIGH STOP WITH FILLED CENTRE +<U06EC> "" +% ARABIC SMALL LOW MEEM +<U06ED> "" +% ARABIC CURLY FATHA +<U08E4> "" +% ARABIC CURLY DAMMA +<U08E5> "" +% ARABIC CURLY KASRA +<U08E6> "" +% ARABIC CURLY FATHATAN +<U08E7> "" +% ARABIC CURLY DAMMATAN +<U08E8> "" +% ARABIC CURLY KASRATAN +<U08E9> "" +% ARABIC TONE ONE DOT ABOVE +<U08EA> "" +% ARABIC TONE TWO DOTS ABOVE +<U08EB> "" +% ARABIC TONE LOOP ABOVE +<U08EC> "" +% ARABIC TONE ONE DOT BELOW +<U08ED> "" +% ARABIC TONE TWO DOTS BELOW +<U08EE> "" +% ARABIC TONE LOOP BELOW +<U08EF> "" +% ARABIC OPEN FATHATAN +<U08F0> "" +% ARABIC OPEN DAMMATAN +<U08F1> "" +% ARABIC OPEN KASRATAN +<U08F2> "" +% ARABIC SMALL HIGH WAW +<U08F3> "" +% ARABIC FATHA WITH RING +<U08F4> "" +% ARABIC FATHA WITH DOT ABOVE +<U08F5> "" +% ARABIC KASRA WITH DOT BELOW +<U08F6> "" +% ARABIC LEFT ARROWHEAD ABOVE +<U08F7> "" +% ARABIC RIGHT ARROWHEAD ABOVE +<U08F8> "" +% ARABIC LEFT ARROWHEAD BELOW +<U08F9> "" +% ARABIC RIGHT ARROWHEAD BELOW +<U08FA> "" +% ARABIC DOUBLE RIGHT ARROWHEAD ABOVE +<U08FB> "" +% ARABIC DOUBLE RIGHT ARROWHEAD ABOVE WITH DOT +<U08FC> "" +% ARABIC RIGHT ARROWHEAD ABOVE WITH DOT +<U08FD> "" +% ARABIC DAMMA WITH DOT +<U08FE> "" +% ARABIC MARK SIDEWAYS NOON GHUNNA +<U08FF> "" +% COMBINING DOUBLED CIRCUMFLEX ACCENT +<U1AB0> "" +% COMBINING DIAERESIS-RING +<U1AB1> "" +% COMBINING INFINITY +<U1AB2> "" +% COMBINING DOWNWARDS ARROW +<U1AB3> "" +% COMBINING TRIPLE DOT +<U1AB4> "" +% COMBINING X-X BELOW +<U1AB5> "" +% COMBINING WIGGLY LINE BELOW +<U1AB6> "" +% COMBINING OPEN MARK BELOW +<U1AB7> "" +% COMBINING DOUBLE OPEN MARK BELOW +<U1AB8> "" +% COMBINING LIGHT CENTRALIZATION STROKE BELOW +<U1AB9> "" +% COMBINING STRONG CENTRALIZATION STROKE BELOW +<U1ABA> "" +% COMBINING PARENTHESES ABOVE +<U1ABB> "" +% COMBINING DOUBLE PARENTHESES ABOVE +<U1ABC> "" +% COMBINING PARENTHESES BELOW +<U1ABD> "" +% COMBINING PARENTHESES OVERLAY +<U1ABE> "" +% COMBINING DOTTED GRAVE ACCENT +<U1DC0> "" +% COMBINING DOTTED ACUTE ACCENT +<U1DC1> "" +% COMBINING SNAKE BELOW +<U1DC2> "" +% COMBINING SUSPENSION MARK +<U1DC3> "" +% COMBINING MACRON-ACUTE +<U1DC4> "" +% COMBINING GRAVE-MACRON +<U1DC5> "" +% COMBINING MACRON-GRAVE +<U1DC6> "" +% COMBINING ACUTE-MACRON +<U1DC7> "" +% COMBINING GRAVE-ACUTE-GRAVE +<U1DC8> "" +% COMBINING ACUTE-GRAVE-ACUTE +<U1DC9> "" +% COMBINING LATIN SMALL LETTER R BELOW +<U1DCA> "" +% COMBINING BREVE-MACRON +<U1DCB> "" +% COMBINING 
MACRON-BREVE +<U1DCC> "" +% COMBINING DOUBLE CIRCUMFLEX ABOVE +<U1DCD> "" +% COMBINING OGONEK ABOVE +<U1DCE> "" +% COMBINING ZIGZAG BELOW +<U1DCF> "" +% COMBINING IS BELOW +<U1DD0> "" +% COMBINING UR ABOVE +<U1DD1> "" +% COMBINING US ABOVE +<U1DD2> "" +% COMBINING LATIN SMALL LETTER FLATTENED OPEN A ABOVE +<U1DD3> "" +% COMBINING LATIN SMALL LETTER AE +<U1DD4> "" +% COMBINING LATIN SMALL LETTER AO +<U1DD5> "" +% COMBINING LATIN SMALL LETTER AV +<U1DD6> "" +% COMBINING LATIN SMALL LETTER C CEDILLA +<U1DD7> "" +% COMBINING LATIN SMALL LETTER INSULAR D +<U1DD8> "" +% COMBINING LATIN SMALL LETTER ETH +<U1DD9> "" +% COMBINING LATIN SMALL LETTER G +<U1DDA> "" +% COMBINING LATIN LETTER SMALL CAPITAL G +<U1DDB> "" +% COMBINING LATIN SMALL LETTER K +<U1DDC> "" +% COMBINING LATIN SMALL LETTER L +<U1DDD> "" +% COMBINING LATIN LETTER SMALL CAPITAL L +<U1DDE> "" +% COMBINING LATIN LETTER SMALL CAPITAL M +<U1DDF> "" +% COMBINING LATIN SMALL LETTER N +<U1DE0> "" +% COMBINING LATIN LETTER SMALL CAPITAL N +<U1DE1> "" +% COMBINING LATIN LETTER SMALL CAPITAL R +<U1DE2> "" +% COMBINING LATIN SMALL LETTER R ROTUNDA +<U1DE3> "" +% COMBINING LATIN SMALL LETTER S +<U1DE4> "" +% COMBINING LATIN SMALL LETTER LONG S +<U1DE5> "" +% COMBINING LATIN SMALL LETTER Z +<U1DE6> "" +% COMBINING LATIN SMALL LETTER ALPHA +<U1DE7> "" +% COMBINING LATIN SMALL LETTER B +<U1DE8> "" +% COMBINING LATIN SMALL LETTER BETA +<U1DE9> "" +% COMBINING LATIN SMALL LETTER SCHWA +<U1DEA> "" +% COMBINING LATIN SMALL LETTER F +<U1DEB> "" +% COMBINING LATIN SMALL LETTER L WITH DOUBLE MIDDLE TILDE +<U1DEC> "" +% COMBINING LATIN SMALL LETTER O WITH LIGHT CENTRALIZATION STROKE +<U1DED> "" +% COMBINING LATIN SMALL LETTER P +<U1DEE> "" +% COMBINING LATIN SMALL LETTER ESH +<U1DEF> "" +% COMBINING LATIN SMALL LETTER U WITH LIGHT CENTRALIZATION STROKE +<U1DF0> "" +% COMBINING LATIN SMALL LETTER W +<U1DF1> "" +% COMBINING LATIN SMALL LETTER A WITH DIAERESIS +<U1DF2> "" +% COMBINING LATIN SMALL LETTER O WITH DIAERESIS +<U1DF3> "" +% COMBINING LATIN SMALL LETTER U WITH DIAERESIS +<U1DF4> "" +% COMBINING UP TACK ABOVE +<U1DF5> "" +% COMBINING DOUBLE INVERTED BREVE BELOW +<U1DFC> "" +% COMBINING ALMOST EQUAL TO BELOW +<U1DFD> "" +% COMBINING LEFT ARROWHEAD ABOVE +<U1DFE> "" +% COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW +<U1DFF> "" +% COMBINING LEFT HARPOON ABOVE +<U20D0> "" +% COMBINING RIGHT HARPOON ABOVE +<U20D1> "" +% COMBINING LONG VERTICAL LINE OVERLAY +<U20D2> "" +% COMBINING SHORT VERTICAL LINE OVERLAY +<U20D3> "" +% COMBINING ANTICLOCKWISE ARROW ABOVE +<U20D4> "" +% COMBINING CLOCKWISE ARROW ABOVE +<U20D5> "" +% COMBINING LEFT ARROW ABOVE +<U20D6> "" +% COMBINING RIGHT ARROW ABOVE +<U20D7> "" +% COMBINING RING OVERLAY +<U20D8> "" +% COMBINING CLOCKWISE RING OVERLAY +<U20D9> "" +% COMBINING ANTICLOCKWISE RING OVERLAY +<U20DA> "" +% COMBINING THREE DOTS ABOVE +<U20DB> "" +% COMBINING FOUR DOTS ABOVE +<U20DC> "" +% COMBINING ENCLOSING CIRCLE +<U20DD> "" +% COMBINING ENCLOSING SQUARE +<U20DE> "" +% COMBINING ENCLOSING DIAMOND +<U20DF> "" +% COMBINING ENCLOSING CIRCLE BACKSLASH +<U20E0> "" +% COMBINING LEFT RIGHT ARROW ABOVE +<U20E1> "" +% COMBINING ENCLOSING SCREEN +<U20E2> "" +% COMBINING ENCLOSING KEYCAP +<U20E3> "" % COMBINING ENCLOSING UPWARD POINTING TRIANGLE <U20E4> "" % COMBINING REVERSE SOLIDUS OVERLAY @@ -251,10 +695,70 @@ translit_start <U20E9> "" % COMBINING LEFTWARDS ARROW OVERLAY <U20EA> "" +% COMBINING LONG DOUBLE SOLIDUS OVERLAY +<U20EB> "" +% COMBINING RIGHTWARDS HARPOON WITH BARB DOWNWARDS +<U20EC> "" +% COMBINING LEFTWARDS 
HARPOON WITH BARB DOWNWARDS +<U20ED> "" +% COMBINING LEFT ARROW BELOW +<U20EE> "" +% COMBINING RIGHT ARROW BELOW +<U20EF> "" +% COMBINING ASTERISK ABOVE +<U20F0> "" % COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK <U3099> "" % COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK <U309A> "" +% HEBREW POINT JUDEO-SPANISH VARIKA +<UFB1E> "" +% COMBINING LIGATURE LEFT HALF +<UFE20> "" +% COMBINING LIGATURE RIGHT HALF +<UFE21> "" +% COMBINING DOUBLE TILDE LEFT HALF +<UFE22> "" +% COMBINING DOUBLE TILDE RIGHT HALF +<UFE23> "" +% COMBINING MACRON LEFT HALF +<UFE24> "" +% COMBINING MACRON RIGHT HALF +<UFE25> "" +% COMBINING CONJOINING MACRON +<UFE26> "" +% COMBINING LIGATURE LEFT HALF BELOW +<UFE27> "" +% COMBINING LIGATURE RIGHT HALF BELOW +<UFE28> "" +% COMBINING TILDE LEFT HALF BELOW +<UFE29> "" +% COMBINING TILDE RIGHT HALF BELOW +<UFE2A> "" +% COMBINING MACRON LEFT HALF BELOW +<UFE2B> "" +% COMBINING MACRON RIGHT HALF BELOW +<UFE2C> "" +% COMBINING CONJOINING MACRON BELOW +<UFE2D> "" +% PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE +<U000101FD> "" +% COMBINING OLD PERMIC LETTER AN +<U00010376> "" +% COMBINING OLD PERMIC LETTER DOI +<U00010377> "" +% COMBINING OLD PERMIC LETTER ZATA +<U00010378> "" +% COMBINING OLD PERMIC LETTER NENOE +<U00010379> "" +% COMBINING OLD PERMIC LETTER SII +<U0001037A> "" +% COMBINING GREEK MUSICAL TRISEME +<U0001D242> "" +% COMBINING GREEK MUSICAL TETRASEME +<U0001D243> "" +% COMBINING GREEK MUSICAL PENTASEME +<U0001D244> "" % LATIN CAPITAL LETTER A WITH GRAVE <U00C0> <U0041> @@ -268,6 +772,8 @@ translit_start <U00C4> <U0041> % LATIN CAPITAL LETTER A WITH RING ABOVE <U00C5> <U0041> +% LATIN CAPITAL LETTER AE +<U00C6> "<U0041><U0045>" % LATIN CAPITAL LETTER C WITH CEDILLA <U00C7> <U0043> % LATIN CAPITAL LETTER E WITH GRAVE @@ -298,6 +804,8 @@ translit_start <U00D5> <U004F> % LATIN CAPITAL LETTER O WITH DIAERESIS <U00D6> <U004F> +% LATIN CAPITAL LETTER O WITH STROKE +<U00D8> <U004F> % LATIN CAPITAL LETTER U WITH GRAVE <U00D9> <U0055> % LATIN CAPITAL LETTER U WITH ACUTE @@ -320,6 +828,8 @@ translit_start <U00E4> <U0061> % LATIN SMALL LETTER A WITH RING ABOVE <U00E5> <U0061> +% LATIN SMALL LETTER AE +<U00E6> "<U0061><U0065>" % LATIN SMALL LETTER C WITH CEDILLA <U00E7> <U0063> % LATIN SMALL LETTER E WITH GRAVE @@ -350,6 +860,8 @@ translit_start <U00F5> <U006F> % LATIN SMALL LETTER O WITH DIAERESIS <U00F6> <U006F> +% LATIN SMALL LETTER O WITH STROKE +<U00F8> <U006F> % LATIN SMALL LETTER U WITH GRAVE <U00F9> <U0075> % LATIN SMALL LETTER U WITH ACUTE @@ -472,10 +984,6 @@ translit_start <U013D> <U004C> % LATIN SMALL LETTER L WITH CARON <U013E> <U006C> -% LATIN CAPITAL LETTER L WITH STROKE -<U0141> <U004C> -% LATIN SMALL LETTER L WITH STROKE -<U0142> <U006C> % LATIN CAPITAL LETTER N WITH ACUTE <U0143> <U004E> % LATIN SMALL LETTER N WITH ACUTE @@ -673,9 +1181,9 @@ translit_start % LATIN SMALL LETTER AE WITH ACUTE <U01FD> <U00E6>;"<U0061><U0065>" % LATIN CAPITAL LETTER O WITH STROKE AND ACUTE -<U01FE> <U004F> +<U01FE> <U00D8>;<U004F> % LATIN SMALL LETTER O WITH STROKE AND ACUTE -<U01FF> <U006F> +<U01FF> <U00F8>;<U006F> % LATIN CAPITAL LETTER A WITH DOUBLE GRAVE <U0200> <U0041> % LATIN SMALL LETTER A WITH DOUBLE GRAVE @@ -764,14 +1272,6 @@ translit_start <U0232> <U0059> % LATIN SMALL LETTER Y WITH MACRON <U0233> <U0079> -% COMBINING GRAVE TONE MARK -<U0340> <U0300> -% COMBINING ACUTE TONE MARK -<U0341> <U0301> -% COMBINING GREEK KORONIS -<U0343> <U0313> -% COMBINING GREEK DIALYTIKA TONOS -<U0344> <U0308> % GREEK NUMERAL SIGN <U0374> <U02B9> % GREEK QUESTION MARK @@ -928,6 +1428,8 
@@ translit_start <U04F8> <U042B> % CYRILLIC SMALL LETTER YERU WITH DIAERESIS <U04F9> <U044B> +% HEBREW LIGATURE YIDDISH DOUBLE YOD +<U05F2> "<U05D9><U05D9>" % ARABIC LETTER ALEF WITH MADDA ABOVE <U0622> <U0627> % ARABIC LETTER ALEF WITH HAMZA ABOVE @@ -1017,7 +1519,7 @@ translit_start % KANNADA VOWEL SIGN O <U0CCA> "<U0CC6><U0CC2>" % KANNADA VOWEL SIGN OO -<U0CCB> "<U0CCA><U0CD5>" +<U0CCB> "<U0CC6><U0CC2><U0CD5>" % MALAYALAM VOWEL SIGN O <U0D4A> "<U0D46><U0D3E>" % MALAYALAM VOWEL SIGN OO @@ -1029,7 +1531,7 @@ translit_start % SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA <U0DDC> "<U0DD9><U0DCF>" % SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA -<U0DDD> "<U0DDC><U0DCA>" +<U0DDD> "<U0DD9><U0DCF><U0DCA>" % SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA <U0DDE> "<U0DD9><U0DDF>" % TIBETAN LETTER GHA @@ -2020,16 +2522,114 @@ translit_start <U2000> <U2002>;<U0020> % EM QUAD <U2001> <U2003>;<U0020> +% EN SPACE +<U2002> <U0020> +% EM SPACE +<U2003> <U0020> % OHM SIGN <U2126> <U03A9> % KELVIN SIGN <U212A> <U004B> % ANGSTROM SIGN -<U212B> <U00C5> +<U212B> <U0041> +% LEFTWARDS ARROW WITH STROKE +<U219A> <U2190> +% RIGHTWARDS ARROW WITH STROKE +<U219B> <U2192> +% LEFT RIGHT ARROW WITH STROKE +<U21AE> "<U0021><U003C><U002D><U003E>" +% LEFTWARDS DOUBLE ARROW WITH STROKE +<U21CD> "<U0021><U003C><U003D>" +% LEFT RIGHT DOUBLE ARROW WITH STROKE +<U21CE> "<U0021><U003C><U003D><U003E>" +% RIGHTWARDS DOUBLE ARROW WITH STROKE +<U21CF> "<U0021><U003D><U003E>" +% THERE DOES NOT EXIST +<U2204> "<U0021><U2203>" +% NOT AN ELEMENT OF +<U2209> "<U0021><U2208>" +% DOES NOT CONTAIN AS MEMBER +<U220C> "<U0021><U220B>" +% DOES NOT DIVIDE +<U2224> "<U0021><U2223>" +% NOT PARALLEL TO +<U2226> "<U0021><U2225>" +% NOT TILDE +<U2241> "<U0021><U007E>" +% NOT ASYMPTOTICALLY EQUAL TO +<U2244> "<U0021><U007E><U002D>" +% NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO +<U2247> "<U0021><U007E><U003D>" +% NOT ALMOST EQUAL TO +<U2249> "<U0021><U007E><U007E>" +% NOT EQUAL TO +<U2260> "<U0021><U003D>" +% NOT IDENTICAL TO +<U2262> "<U0021><U003D><U003D>" +% NOT EQUIVALENT TO +<U226D> "<U0021><U224D>" +% NOT LESS-THAN +<U226E> "<U0021><U003C>" +% NOT GREATER-THAN +<U226F> "<U0021><U003E>" +% NEITHER LESS-THAN NOR EQUAL TO +<U2270> "<U0021><U003C><U003D>" +% NEITHER GREATER-THAN NOR EQUAL TO +<U2271> "<U0021><U003E><U003D>" +% NEITHER LESS-THAN NOR EQUIVALENT TO +<U2274> "<U0021><U003C><U007E>" +% NEITHER GREATER-THAN NOR EQUIVALENT TO +<U2275> "<U0021><U003E><U007E>" +% NEITHER LESS-THAN NOR GREATER-THAN +<U2278> "<U0021><U003C><U003E>" +% NEITHER GREATER-THAN NOR LESS-THAN +<U2279> "<U0021><U003E><U003C>" +% DOES NOT PRECEDE +<U2280> "<U0021><U227A>" +% DOES NOT SUCCEED +<U2281> "<U0021><U227B>" +% NOT A SUBSET OF +<U2284> "<U0021><U2282>" +% NOT A SUPERSET OF +<U2285> "<U0021><U2283>" +% NEITHER A SUBSET OF NOR EQUAL TO +<U2288> "<U0021><U2282><U003D>" +% NEITHER A SUPERSET OF NOR EQUAL TO +<U2289> "<U0021><U2283><U003D>" +% DOES NOT PROVE +<U22AC> "<U0021><U22A2>" +% NOT TRUE +<U22AD> "<U0021><U22A8>" +% DOES NOT FORCE +<U22AE> "<U0021><U22A9>" +% NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE +<U22AF> "<U0021><U22AB>" +% DOES NOT PRECEDE OR EQUAL +<U22E0> "<U0021><U227C>" +% DOES NOT SUCCEED OR EQUAL +<U22E1> "<U0021><U227D>" +% NOT SQUARE IMAGE OF OR EQUAL TO +<U22E2> "<U0021><U2291>" +% NOT SQUARE ORIGINAL OF OR EQUAL TO +<U22E3> "<U0021><U2292>" +% NOT NORMAL SUBGROUP OF +<U22EA> "<U0021><U22B2>" +% DOES NOT CONTAIN AS NORMAL SUBGROUP +<U22EB> "<U0021><U22B3>" +% NOT NORMAL SUBGROUP OF OR EQUAL TO +<U22EC> "<U0021><U22B4>" +% DOES NOT 
CONTAIN AS NORMAL SUBGROUP OR EQUAL +<U22ED> "<U0021><U22B5>" % LEFT-POINTING ANGLE BRACKET <U2329> <U3008>;<U003C> % RIGHT-POINTING ANGLE BRACKET <U232A> <U3009>;<U003E> +% FORKING +<U2ADC> "<U0021><U2ADD>" +% LEFT ANGLE BRACKET +<U3008> <U003C> +% RIGHT ANGLE BRACKET +<U3009> <U003E> % HIRAGANA LETTER GA <U304C> <U304B> % HIRAGANA LETTER GI diff --git a/localedata/locales/translit_compat b/localedata/locales/translit_compat index bb9d660..bf8d191 100644 --- a/localedata/locales/translit_compat +++ b/localedata/locales/translit_compat @@ -2,18 +2,24 @@ escape_char / comment_char % % Transliterations of compatibility characters and ligatures. -% Generated through -% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<compat>[^;]*;' UnicodeData.txt | \ -% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<compat> \([^;]*\);.*$/<U\1> "<U\3>"% \2/' | grep -v '0020 03[0-6][0-9A-F]' | sed -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G' -% and -% $ grep '[^;]*;[^;]*LIGATURE[^;]*;' UnicodeData.txt +% Generated automatically from UnicodeData.txt by gen_translit_compat.py on 2015-12-09 for Unicode 7.0.0. LC_CTYPE translit_start +% FEMININE ORDINAL INDICATOR +<U00AA> "<U0061>" +% SUPERSCRIPT TWO +<U00B2> "<U0032>" +% SUPERSCRIPT THREE +<U00B3> "<U0033>" % MICRO SIGN -<U00B5> "<U03BC>";<U0075> +<U00B5> "<U03BC>";"<U0075>" +% SUPERSCRIPT ONE +<U00B9> "<U0031>" +% MASCULINE ORDINAL INDICATOR +<U00BA> "<U006F>" % LATIN CAPITAL LIGATURE IJ <U0132> "<U0049><U004A>" % LATIN SMALL LIGATURE IJ @@ -54,6 +60,38 @@ translit_start <U01F2> "<U0044><U007A>" % LATIN SMALL LETTER DZ <U01F3> "<U0064><U007A>" +% MODIFIER LETTER SMALL H +<U02B0> "<U0068>" +% MODIFIER LETTER SMALL H WITH HOOK +<U02B1> "<U0266>" +% MODIFIER LETTER SMALL J +<U02B2> "<U006A>" +% MODIFIER LETTER SMALL R +<U02B3> "<U0072>" +% MODIFIER LETTER SMALL TURNED R +<U02B4> "<U0279>" +% MODIFIER LETTER SMALL TURNED R WITH HOOK +<U02B5> "<U027B>" +% MODIFIER LETTER SMALL CAPITAL INVERTED R +<U02B6> "<U0281>" +% MODIFIER LETTER SMALL W +<U02B7> "<U0077>" +% MODIFIER LETTER SMALL Y +<U02B8> "<U0079>" +% MODIFIER LETTER APOSTROPHE +<U02BC> "<U0027>" +% MODIFIER LETTER SMALL GAMMA +<U02E0> "<U0263>" +% MODIFIER LETTER SMALL L +<U02E1> "<U006C>" +% MODIFIER LETTER SMALL S +<U02E2> "<U0073>" +% MODIFIER LETTER SMALL X +<U02E3> "<U0078>" +% MODIFIER LETTER SMALL REVERSED GLOTTAL STOP +<U02E4> "<U0295>" +% GREEK SMALL LETTER MU +<U03BC> "<U0075>" % GREEK BETA SYMBOL <U03D0> "<U03B2>" % GREEK THETA SYMBOL @@ -74,6 +112,20 @@ translit_start <U03F4> "<U0398>" % GREEK LUNATE EPSILON SYMBOL <U03F5> "<U03B5>" +% GREEK CAPITAL LUNATE SIGMA SYMBOL +<U03F9> "<U03A3>" +% CYRILLIC CAPITAL LIGATURE EN GHE +<U04A4> "<U041D><U0413>" +% CYRILLIC SMALL LIGATURE EN GHE +<U04A5> "<U043D><U0433>" +% CYRILLIC CAPITAL LIGATURE TE TSE +<U04B4> "<U0422><U0426>" +% CYRILLIC SMALL LIGATURE TE TSE +<U04B5> "<U0442><U0446>" +% CYRILLIC CAPITAL LIGATURE A IE +<U04D4> "<U0410><U0415>" +% CYRILLIC SMALL LIGATURE A IE +<U04D5> "<U0430><U0435>" % ARMENIAN SMALL LIGATURE ECH YIWN <U0587> "<U0565><U0582>" % HEBREW LIGATURE YIDDISH DOUBLE VAV @@ -102,6 +154,204 @@ translit_start <U0F77> "<U0FB2><U0F81>" % TIBETAN VOWEL SIGN VOCALIC LL <U0F79> "<U0FB3><U0F81>" +% MODIFIER LETTER GEORGIAN NAR +<U10FC> "<U10DC>" +% MODIFIER LETTER CAPITAL A +<U1D2C> "<U0041>" +% MODIFIER LETTER CAPITAL AE +<U1D2D> "<U00C6>" +% MODIFIER LETTER CAPITAL B +<U1D2E> "<U0042>" +% MODIFIER LETTER CAPITAL D +<U1D30> "<U0044>" +% MODIFIER LETTER CAPITAL E +<U1D31> 
"<U0045>" +% MODIFIER LETTER CAPITAL REVERSED E +<U1D32> "<U018E>" +% MODIFIER LETTER CAPITAL G +<U1D33> "<U0047>" +% MODIFIER LETTER CAPITAL H +<U1D34> "<U0048>" +% MODIFIER LETTER CAPITAL I +<U1D35> "<U0049>" +% MODIFIER LETTER CAPITAL J +<U1D36> "<U004A>" +% MODIFIER LETTER CAPITAL K +<U1D37> "<U004B>" +% MODIFIER LETTER CAPITAL L +<U1D38> "<U004C>" +% MODIFIER LETTER CAPITAL M +<U1D39> "<U004D>" +% MODIFIER LETTER CAPITAL N +<U1D3A> "<U004E>" +% MODIFIER LETTER CAPITAL O +<U1D3C> "<U004F>" +% MODIFIER LETTER CAPITAL OU +<U1D3D> "<U0222>" +% MODIFIER LETTER CAPITAL P +<U1D3E> "<U0050>" +% MODIFIER LETTER CAPITAL R +<U1D3F> "<U0052>" +% MODIFIER LETTER CAPITAL T +<U1D40> "<U0054>" +% MODIFIER LETTER CAPITAL U +<U1D41> "<U0055>" +% MODIFIER LETTER CAPITAL W +<U1D42> "<U0057>" +% MODIFIER LETTER SMALL A +<U1D43> "<U0061>" +% MODIFIER LETTER SMALL TURNED A +<U1D44> "<U0250>" +% MODIFIER LETTER SMALL ALPHA +<U1D45> "<U0251>" +% MODIFIER LETTER SMALL TURNED AE +<U1D46> "<U1D02>" +% MODIFIER LETTER SMALL B +<U1D47> "<U0062>" +% MODIFIER LETTER SMALL D +<U1D48> "<U0064>" +% MODIFIER LETTER SMALL E +<U1D49> "<U0065>" +% MODIFIER LETTER SMALL SCHWA +<U1D4A> "<U0259>" +% MODIFIER LETTER SMALL OPEN E +<U1D4B> "<U025B>" +% MODIFIER LETTER SMALL TURNED OPEN E +<U1D4C> "<U025C>" +% MODIFIER LETTER SMALL G +<U1D4D> "<U0067>" +% MODIFIER LETTER SMALL K +<U1D4F> "<U006B>" +% MODIFIER LETTER SMALL M +<U1D50> "<U006D>" +% MODIFIER LETTER SMALL ENG +<U1D51> "<U014B>" +% MODIFIER LETTER SMALL O +<U1D52> "<U006F>" +% MODIFIER LETTER SMALL OPEN O +<U1D53> "<U0254>" +% MODIFIER LETTER SMALL TOP HALF O +<U1D54> "<U1D16>" +% MODIFIER LETTER SMALL BOTTOM HALF O +<U1D55> "<U1D17>" +% MODIFIER LETTER SMALL P +<U1D56> "<U0070>" +% MODIFIER LETTER SMALL T +<U1D57> "<U0074>" +% MODIFIER LETTER SMALL U +<U1D58> "<U0075>" +% MODIFIER LETTER SMALL SIDEWAYS U +<U1D59> "<U1D1D>" +% MODIFIER LETTER SMALL TURNED M +<U1D5A> "<U026F>" +% MODIFIER LETTER SMALL V +<U1D5B> "<U0076>" +% MODIFIER LETTER SMALL AIN +<U1D5C> "<U1D25>" +% MODIFIER LETTER SMALL BETA +<U1D5D> "<U03B2>" +% MODIFIER LETTER SMALL GREEK GAMMA +<U1D5E> "<U03B3>" +% MODIFIER LETTER SMALL DELTA +<U1D5F> "<U03B4>" +% MODIFIER LETTER SMALL GREEK PHI +<U1D60> "<U03C6>" +% MODIFIER LETTER SMALL CHI +<U1D61> "<U03C7>" +% LATIN SUBSCRIPT SMALL LETTER I +<U1D62> "<U0069>" +% LATIN SUBSCRIPT SMALL LETTER R +<U1D63> "<U0072>" +% LATIN SUBSCRIPT SMALL LETTER U +<U1D64> "<U0075>" +% LATIN SUBSCRIPT SMALL LETTER V +<U1D65> "<U0076>" +% GREEK SUBSCRIPT SMALL LETTER BETA +<U1D66> "<U03B2>" +% GREEK SUBSCRIPT SMALL LETTER GAMMA +<U1D67> "<U03B3>" +% GREEK SUBSCRIPT SMALL LETTER RHO +<U1D68> "<U03C1>" +% GREEK SUBSCRIPT SMALL LETTER PHI +<U1D69> "<U03C6>" +% GREEK SUBSCRIPT SMALL LETTER CHI +<U1D6A> "<U03C7>" +% MODIFIER LETTER CYRILLIC EN +<U1D78> "<U043D>" +% MODIFIER LETTER SMALL TURNED ALPHA +<U1D9B> "<U0252>" +% MODIFIER LETTER SMALL C +<U1D9C> "<U0063>" +% MODIFIER LETTER SMALL C WITH CURL +<U1D9D> "<U0255>" +% MODIFIER LETTER SMALL ETH +<U1D9E> "<U00F0>" +% MODIFIER LETTER SMALL REVERSED OPEN E +<U1D9F> "<U025C>" +% MODIFIER LETTER SMALL F +<U1DA0> "<U0066>" +% MODIFIER LETTER SMALL DOTLESS J WITH STROKE +<U1DA1> "<U025F>" +% MODIFIER LETTER SMALL SCRIPT G +<U1DA2> "<U0261>" +% MODIFIER LETTER SMALL TURNED H +<U1DA3> "<U0265>" +% MODIFIER LETTER SMALL I WITH STROKE +<U1DA4> "<U0268>" +% MODIFIER LETTER SMALL IOTA +<U1DA5> "<U0269>" +% MODIFIER LETTER SMALL CAPITAL I +<U1DA6> "<U026A>" +% MODIFIER LETTER SMALL CAPITAL I WITH STROKE +<U1DA7> "<U1D7B>" +% MODIFIER LETTER 
SMALL J WITH CROSSED-TAIL +<U1DA8> "<U029D>" +% MODIFIER LETTER SMALL L WITH RETROFLEX HOOK +<U1DA9> "<U026D>" +% MODIFIER LETTER SMALL L WITH PALATAL HOOK +<U1DAA> "<U1D85>" +% MODIFIER LETTER SMALL CAPITAL L +<U1DAB> "<U029F>" +% MODIFIER LETTER SMALL M WITH HOOK +<U1DAC> "<U0271>" +% MODIFIER LETTER SMALL TURNED M WITH LONG LEG +<U1DAD> "<U0270>" +% MODIFIER LETTER SMALL N WITH LEFT HOOK +<U1DAE> "<U0272>" +% MODIFIER LETTER SMALL N WITH RETROFLEX HOOK +<U1DAF> "<U0273>" +% MODIFIER LETTER SMALL CAPITAL N +<U1DB0> "<U0274>" +% MODIFIER LETTER SMALL BARRED O +<U1DB1> "<U0275>" +% MODIFIER LETTER SMALL PHI +<U1DB2> "<U0278>" +% MODIFIER LETTER SMALL S WITH HOOK +<U1DB3> "<U0282>" +% MODIFIER LETTER SMALL ESH +<U1DB4> "<U0283>" +% MODIFIER LETTER SMALL T WITH PALATAL HOOK +<U1DB5> "<U01AB>" +% MODIFIER LETTER SMALL U BAR +<U1DB6> "<U0289>" +% MODIFIER LETTER SMALL UPSILON +<U1DB7> "<U028A>" +% MODIFIER LETTER SMALL CAPITAL U +<U1DB8> "<U1D1C>" +% MODIFIER LETTER SMALL V WITH HOOK +<U1DB9> "<U028B>" +% MODIFIER LETTER SMALL TURNED V +<U1DBA> "<U028C>" +% MODIFIER LETTER SMALL Z +<U1DBB> "<U007A>" +% MODIFIER LETTER SMALL Z WITH RETROFLEX HOOK +<U1DBC> "<U0290>" +% MODIFIER LETTER SMALL Z WITH CURL +<U1DBD> "<U0291>" +% MODIFIER LETTER SMALL EZH +<U1DBE> "<U0292>" +% MODIFIER LETTER SMALL THETA +<U1DBF> "<U03B8>" % LATIN SMALL LETTER A WITH RIGHT HALF RING <U1E9A> "<U0061><U02BE>" % EN SPACE @@ -146,6 +396,90 @@ translit_start <U2057> "<U2032><U2032><U2032><U2032>" % MEDIUM MATHEMATICAL SPACE <U205F> "<U0020>" +% SUPERSCRIPT ZERO +<U2070> "<U0030>" +% SUPERSCRIPT LATIN SMALL LETTER I +<U2071> "<U0069>" +% SUPERSCRIPT FOUR +<U2074> "<U0034>" +% SUPERSCRIPT FIVE +<U2075> "<U0035>" +% SUPERSCRIPT SIX +<U2076> "<U0036>" +% SUPERSCRIPT SEVEN +<U2077> "<U0037>" +% SUPERSCRIPT EIGHT +<U2078> "<U0038>" +% SUPERSCRIPT NINE +<U2079> "<U0039>" +% SUPERSCRIPT PLUS SIGN +<U207A> "<U002B>" +% SUPERSCRIPT MINUS +<U207B> "<U2212>" +% SUPERSCRIPT EQUALS SIGN +<U207C> "<U003D>" +% SUPERSCRIPT LEFT PARENTHESIS +<U207D> "<U0028>" +% SUPERSCRIPT RIGHT PARENTHESIS +<U207E> "<U0029>" +% SUPERSCRIPT LATIN SMALL LETTER N +<U207F> "<U006E>" +% SUBSCRIPT ZERO +<U2080> "<U0030>" +% SUBSCRIPT ONE +<U2081> "<U0031>" +% SUBSCRIPT TWO +<U2082> "<U0032>" +% SUBSCRIPT THREE +<U2083> "<U0033>" +% SUBSCRIPT FOUR +<U2084> "<U0034>" +% SUBSCRIPT FIVE +<U2085> "<U0035>" +% SUBSCRIPT SIX +<U2086> "<U0036>" +% SUBSCRIPT SEVEN +<U2087> "<U0037>" +% SUBSCRIPT EIGHT +<U2088> "<U0038>" +% SUBSCRIPT NINE +<U2089> "<U0039>" +% SUBSCRIPT PLUS SIGN +<U208A> "<U002B>" +% SUBSCRIPT MINUS +<U208B> "<U2212>" +% SUBSCRIPT EQUALS SIGN +<U208C> "<U003D>" +% SUBSCRIPT LEFT PARENTHESIS +<U208D> "<U0028>" +% SUBSCRIPT RIGHT PARENTHESIS +<U208E> "<U0029>" +% LATIN SUBSCRIPT SMALL LETTER A +<U2090> "<U0061>" +% LATIN SUBSCRIPT SMALL LETTER E +<U2091> "<U0065>" +% LATIN SUBSCRIPT SMALL LETTER O +<U2092> "<U006F>" +% LATIN SUBSCRIPT SMALL LETTER X +<U2093> "<U0078>" +% LATIN SUBSCRIPT SMALL LETTER SCHWA +<U2094> "<U0259>" +% LATIN SUBSCRIPT SMALL LETTER H +<U2095> "<U0068>" +% LATIN SUBSCRIPT SMALL LETTER K +<U2096> "<U006B>" +% LATIN SUBSCRIPT SMALL LETTER L +<U2097> "<U006C>" +% LATIN SUBSCRIPT SMALL LETTER M +<U2098> "<U006D>" +% LATIN SUBSCRIPT SMALL LETTER N +<U2099> "<U006E>" +% LATIN SUBSCRIPT SMALL LETTER P +<U209A> "<U0070>" +% LATIN SUBSCRIPT SMALL LETTER S +<U209B> "<U0073>" +% LATIN SUBSCRIPT SMALL LETTER T +<U209C> "<U0074>" % RUPEE SIGN <U20A8> "<U0052><U0073>" % ACCOUNT OF @@ -164,8 +498,12 @@ translit_start <U2109> "<U00B0><U0046>" % 
NUMERO SIGN <U2116> "<U004E><U006F>" +% SERVICE MARK +<U2120> "<U0053><U004D>" % TELEPHONE SIGN <U2121> "<U0054><U0045><U004C>" +% TRADE MARK SIGN +<U2122> "<U0054><U004D>" % ALEF SYMBOL <U2135> "<U05D0>" % BET SYMBOL @@ -174,6 +512,8 @@ translit_start <U2137> "<U05D2>" % DALET SYMBOL <U2138> "<U05D3>" +% FACSIMILE SIGN +<U213B> "<U0046><U0041><U0058>" % ROMAN NUMERAL ONE <U2160> "<U0049>" % ROMAN NUMERAL TWO @@ -386,6 +726,12 @@ translit_start <U2A75> "<U003D><U003D>" % THREE CONSECUTIVE EQUALS SIGNS <U2A76> "<U003D><U003D><U003D>" +% LATIN SUBSCRIPT SMALL LETTER J +<U2C7C> "<U006A>" +% MODIFIER LETTER CAPITAL V +<U2C7D> "<U0056>" +% TIFINAGH MODIFIER LETTER LABIALIZATION MARK +<U2D6F> "<U2D61>" % CJK RADICAL MOTHER <U2E9F> "<U6BCD>" % CJK RADICAL C-SIMPLIFIED TURTLE @@ -830,6 +1176,10 @@ translit_start <U309B> "<U0020><U3099>" % KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK <U309C> "<U0020><U309A>" +% HIRAGANA DIGRAPH YORI +<U309F> "<U3088><U308A>" +% KATAKANA DIGRAPH KOTO +<U30FF> "<U30B3><U30C8>" % HANGUL LETTER KIYEOK <U3131> "<U1100>" % HANGUL LETTER SSANGKIYEOK @@ -1018,6 +1368,34 @@ translit_start <U318D> "<U119E>" % HANGUL LETTER ARAEAE <U318E> "<U11A1>" +% IDEOGRAPHIC ANNOTATION ONE MARK +<U3192> "<U4E00>" +% IDEOGRAPHIC ANNOTATION TWO MARK +<U3193> "<U4E8C>" +% IDEOGRAPHIC ANNOTATION THREE MARK +<U3194> "<U4E09>" +% IDEOGRAPHIC ANNOTATION FOUR MARK +<U3195> "<U56DB>" +% IDEOGRAPHIC ANNOTATION TOP MARK +<U3196> "<U4E0A>" +% IDEOGRAPHIC ANNOTATION MIDDLE MARK +<U3197> "<U4E2D>" +% IDEOGRAPHIC ANNOTATION BOTTOM MARK +<U3198> "<U4E0B>" +% IDEOGRAPHIC ANNOTATION FIRST MARK +<U3199> "<U7532>" +% IDEOGRAPHIC ANNOTATION SECOND MARK +<U319A> "<U4E59>" +% IDEOGRAPHIC ANNOTATION THIRD MARK +<U319B> "<U4E19>" +% IDEOGRAPHIC ANNOTATION FOURTH MARK +<U319C> "<U4E01>" +% IDEOGRAPHIC ANNOTATION HEAVEN MARK +<U319D> "<U5929>" +% IDEOGRAPHIC ANNOTATION EARTH MARK +<U319E> "<U5730>" +% IDEOGRAPHIC ANNOTATION MAN MARK +<U319F> "<U4EBA>" % PARENTHESIZED HANGUL KIYEOK <U3200> "<U0028><U1100><U0029>" % PARENTHESIZED HANGUL NIEUN @@ -1076,6 +1454,10 @@ translit_start <U321B> "<U0028><U1112><U1161><U0029>" % PARENTHESIZED HANGUL CIEUC U <U321C> "<U0028><U110C><U116E><U0029>" +% PARENTHESIZED KOREAN CHARACTER OJEON +<U321D> "<U0028><U110B><U1169><U110C><U1165><U11AB><U0029>" +% PARENTHESIZED KOREAN CHARACTER O HU +<U321E> "<U0028><U110B><U1169><U1112><U116E><U0029>" % PARENTHESIZED IDEOGRAPH ONE <U3220> "<U0028><U4E00><U0029>" % PARENTHESIZED IDEOGRAPH TWO @@ -1284,6 +1666,24 @@ translit_start <U33FD> "<U0033><U0030><U65E5>" % IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE <U33FE> "<U0033><U0031><U65E5>" +% MODIFIER LETTER CYRILLIC HARD SIGN +<UA69C> "<U044A>" +% MODIFIER LETTER CYRILLIC SOFT SIGN +<UA69D> "<U044C>" +% MODIFIER LETTER US +<UA770> "<UA76F>" +% MODIFIER LETTER CAPITAL H WITH STROKE +<UA7F8> "<U0126>" +% MODIFIER LETTER SMALL LIGATURE OE +<UA7F9> "<U0153>" +% MODIFIER LETTER SMALL HENG +<UAB5C> "<UA727>" +% MODIFIER LETTER SMALL L WITH INVERTED LAZY S +<UAB5D> "<UAB37>" +% MODIFIER LETTER SMALL L WITH MIDDLE TILDE +<UAB5E> "<U026B>" +% MODIFIER LETTER SMALL U WITH LEFT HOOK +<UAB5F> "<UAB52>" % LATIN SMALL LIGATURE FF <UFB00> "<U0066><U0066>" % LATIN SMALL LIGATURE FI @@ -1295,7 +1695,7 @@ translit_start % LATIN SMALL LIGATURE FFL <UFB04> "<U0066><U0066><U006C>" % LATIN SMALL LIGATURE LONG S T -<UFB05> "<U017F><U0074>" +<UFB05> "<U0073><U0074>" % LATIN SMALL LIGATURE ST <UFB06> "<U0073><U0074>" % ARMENIAN SMALL LIGATURE MEN NOW @@ -1310,6 +1710,72 @@ translit_start <UFB17> "<U0574><U056D>" % 
HEBREW LIGATURE ALEF LAMED <UFB4F> "<U05D0><U05DC>" +% PRESENTATION FORM FOR VERTICAL COMMA +<UFE10> "<U002C>" +% PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA +<UFE11> "<U3001>" +% PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP +<UFE12> "<U3002>" +% PRESENTATION FORM FOR VERTICAL COLON +<UFE13> "<U003A>" +% PRESENTATION FORM FOR VERTICAL SEMICOLON +<UFE14> "<U003B>" +% PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK +<UFE15> "<U0021>" +% PRESENTATION FORM FOR VERTICAL QUESTION MARK +<UFE16> "<U003F>" +% PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET +<UFE17> "<U3016>" +% PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET +<UFE18> "<U3017>" +% PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS +<UFE19> "<U002E><U002E><U002E>" +% PRESENTATION FORM FOR VERTICAL TWO DOT LEADER +<UFE30> "<U002E><U002E>" +% PRESENTATION FORM FOR VERTICAL EM DASH +<UFE31> "<U2014>" +% PRESENTATION FORM FOR VERTICAL EN DASH +<UFE32> "<U2013>" +% PRESENTATION FORM FOR VERTICAL LOW LINE +<UFE33> "<U005F>" +% PRESENTATION FORM FOR VERTICAL WAVY LOW LINE +<UFE34> "<U005F>" +% PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS +<UFE35> "<U0028>" +% PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS +<UFE36> "<U0029>" +% PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET +<UFE37> "<U007B>" +% PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET +<UFE38> "<U007D>" +% PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET +<UFE39> "<U3014>" +% PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET +<UFE3A> "<U3015>" +% PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET +<UFE3B> "<U3010>" +% PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET +<UFE3C> "<U3011>" +% PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET +<UFE3D> "<U300A>" +% PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET +<UFE3E> "<U300B>" +% PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET +<UFE3F> "<U3008>" +% PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET +<UFE40> "<U3009>" +% PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET +<UFE41> "<U300C>" +% PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET +<UFE42> "<U300D>" +% PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET +<UFE43> "<U300E>" +% PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET +<UFE44> "<U300F>" +% PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET +<UFE47> "<U005B>" +% PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET +<UFE48> "<U005D>" % DASHED OVERLINE <UFE49> "<U203E>" % CENTRELINE OVERLINE @@ -1324,6 +1790,104 @@ translit_start <UFE4E> "<U005F>" % WAVY LOW LINE <UFE4F> "<U005F>" +% DIGIT ZERO FULL STOP +<U0001F100> "<U0030><U002E>" +% DIGIT ZERO COMMA +<U0001F101> "<U0030><U002C>" +% DIGIT ONE COMMA +<U0001F102> "<U0031><U002C>" +% DIGIT TWO COMMA +<U0001F103> "<U0032><U002C>" +% DIGIT THREE COMMA +<U0001F104> "<U0033><U002C>" +% DIGIT FOUR COMMA +<U0001F105> "<U0034><U002C>" +% DIGIT FIVE COMMA +<U0001F106> "<U0035><U002C>" +% DIGIT SIX COMMA +<U0001F107> "<U0036><U002C>" +% DIGIT SEVEN COMMA +<U0001F108> "<U0037><U002C>" +% DIGIT EIGHT COMMA +<U0001F109> "<U0038><U002C>" +% DIGIT NINE COMMA +<U0001F10A> "<U0039><U002C>" +% PARENTHESIZED LATIN CAPITAL LETTER A +<U0001F110> "<U0028><U0041><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER B +<U0001F111> "<U0028><U0042><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER C +<U0001F112> "<U0028><U0043><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER D +<U0001F113> "<U0028><U0044><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER E +<U0001F114> 
"<U0028><U0045><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER F +<U0001F115> "<U0028><U0046><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER G +<U0001F116> "<U0028><U0047><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER H +<U0001F117> "<U0028><U0048><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER I +<U0001F118> "<U0028><U0049><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER J +<U0001F119> "<U0028><U004A><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER K +<U0001F11A> "<U0028><U004B><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER L +<U0001F11B> "<U0028><U004C><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER M +<U0001F11C> "<U0028><U004D><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER N +<U0001F11D> "<U0028><U004E><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER O +<U0001F11E> "<U0028><U004F><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER P +<U0001F11F> "<U0028><U0050><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER Q +<U0001F120> "<U0028><U0051><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER R +<U0001F121> "<U0028><U0052><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER S +<U0001F122> "<U0028><U0053><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER T +<U0001F123> "<U0028><U0054><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER U +<U0001F124> "<U0028><U0055><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER V +<U0001F125> "<U0028><U0056><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER W +<U0001F126> "<U0028><U0057><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER X +<U0001F127> "<U0028><U0058><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER Y +<U0001F128> "<U0028><U0059><U0029>" +% PARENTHESIZED LATIN CAPITAL LETTER Z +<U0001F129> "<U0028><U005A><U0029>" +% TORTOISE SHELL BRACKETED LATIN CAPITAL LETTER S +<U0001F12A> "<U3014><U0053><U3015>" +% RAISED MC SIGN +<U0001F16A> "<U004D><U0043>" +% RAISED MD SIGN +<U0001F16B> "<U004D><U0044>" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C +<U0001F240> "<U3014><U672C><U3015>" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-4E09 +<U0001F241> "<U3014><U4E09><U3015>" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-4E8C +<U0001F242> "<U3014><U4E8C><U3015>" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-5B89 +<U0001F243> "<U3014><U5B89><U3015>" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-70B9 +<U0001F244> "<U3014><U70B9><U3015>" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6253 +<U0001F245> "<U3014><U6253><U3015>" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-76D7 +<U0001F246> "<U3014><U76D7><U3015>" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-52DD +<U0001F247> "<U3014><U52DD><U3015>" +% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557 +<U0001F248> "<U3014><U6557><U3015>" translit_end diff --git a/localedata/locales/translit_font b/localedata/locales/translit_font index 9347bd4..7ceb937 100644 --- a/localedata/locales/translit_font +++ b/localedata/locales/translit_font @@ -2,9 +2,7 @@ escape_char / comment_char % % Transliterations of font equivalents. -% Generated through -% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<font>[^;]*;' UnicodeData.txt | \ -% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<font> \([^;]*\);.*$/<U\1> <U\3> % \2/' +% Generated automatically from UnicodeData.txt by gen_translit_font.py on 2015-12-09 for Unicode 7.0.0. 
LC_CTYPE @@ -37,6 +35,7 @@ translit_start <U2133> <U004D> % SCRIPT CAPITAL M <U2134> <U006F> % SCRIPT SMALL O <U2139> <U0069> % INFORMATION SOURCE +<U213C> <U03C0> % DOUBLE-STRUCK SMALL PI <U213D> <U03B3> % DOUBLE-STRUCK SMALL GAMMA <U213E> <U0393> % DOUBLE-STRUCK CAPITAL GAMMA <U213F> <U03A0> % DOUBLE-STRUCK CAPITAL PI @@ -238,6 +237,7 @@ translit_start <U0001D4BE> <U0069> % MATHEMATICAL SCRIPT SMALL I <U0001D4BF> <U006A> % MATHEMATICAL SCRIPT SMALL J <U0001D4C0> <U006B> % MATHEMATICAL SCRIPT SMALL K +<U0001D4C1> <U006C> % MATHEMATICAL SCRIPT SMALL L <U0001D4C2> <U006D> % MATHEMATICAL SCRIPT SMALL M <U0001D4C3> <U006E> % MATHEMATICAL SCRIPT SMALL N <U0001D4C5> <U0070> % MATHEMATICAL SCRIPT SMALL P @@ -707,6 +707,8 @@ translit_start <U0001D6A1> <U0078> % MATHEMATICAL MONOSPACE SMALL X <U0001D6A2> <U0079> % MATHEMATICAL MONOSPACE SMALL Y <U0001D6A3> <U007A> % MATHEMATICAL MONOSPACE SMALL Z +<U0001D6A4> <U0131> % MATHEMATICAL ITALIC SMALL DOTLESS I +<U0001D6A5> <U0237> % MATHEMATICAL ITALIC SMALL DOTLESS J <U0001D6A8> <U0391> % MATHEMATICAL BOLD CAPITAL ALPHA <U0001D6A9> <U0392> % MATHEMATICAL BOLD CAPITAL BETA <U0001D6AA> <U0393> % MATHEMATICAL BOLD CAPITAL GAMMA @@ -997,6 +999,8 @@ translit_start <U0001D7C7> <U03D5> % MATHEMATICAL SANS-SERIF BOLD ITALIC PHI SYMBOL <U0001D7C8> <U03F1> % MATHEMATICAL SANS-SERIF BOLD ITALIC RHO SYMBOL <U0001D7C9> <U03D6> % MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL +<U0001D7CA> <U03DC> % MATHEMATICAL BOLD CAPITAL DIGAMMA +<U0001D7CB> <U03DD> % MATHEMATICAL BOLD SMALL DIGAMMA <U0001D7CE> <U0030> % MATHEMATICAL BOLD DIGIT ZERO <U0001D7CF> <U0031> % MATHEMATICAL BOLD DIGIT ONE <U0001D7D0> <U0032> % MATHEMATICAL BOLD DIGIT TWO @@ -1047,6 +1051,147 @@ translit_start <U0001D7FD> <U0037> % MATHEMATICAL MONOSPACE DIGIT SEVEN <U0001D7FE> <U0038> % MATHEMATICAL MONOSPACE DIGIT EIGHT <U0001D7FF> <U0039> % MATHEMATICAL MONOSPACE DIGIT NINE +<U0001EE00> <U0627> % ARABIC MATHEMATICAL ALEF +<U0001EE01> <U0628> % ARABIC MATHEMATICAL BEH +<U0001EE02> <U062C> % ARABIC MATHEMATICAL JEEM +<U0001EE03> <U062F> % ARABIC MATHEMATICAL DAL +<U0001EE05> <U0648> % ARABIC MATHEMATICAL WAW +<U0001EE06> <U0632> % ARABIC MATHEMATICAL ZAIN +<U0001EE07> <U062D> % ARABIC MATHEMATICAL HAH +<U0001EE08> <U0637> % ARABIC MATHEMATICAL TAH +<U0001EE09> <U064A> % ARABIC MATHEMATICAL YEH +<U0001EE0A> <U0643> % ARABIC MATHEMATICAL KAF +<U0001EE0B> <U0644> % ARABIC MATHEMATICAL LAM +<U0001EE0C> <U0645> % ARABIC MATHEMATICAL MEEM +<U0001EE0D> <U0646> % ARABIC MATHEMATICAL NOON +<U0001EE0E> <U0633> % ARABIC MATHEMATICAL SEEN +<U0001EE0F> <U0639> % ARABIC MATHEMATICAL AIN +<U0001EE10> <U0641> % ARABIC MATHEMATICAL FEH +<U0001EE11> <U0635> % ARABIC MATHEMATICAL SAD +<U0001EE12> <U0642> % ARABIC MATHEMATICAL QAF +<U0001EE13> <U0631> % ARABIC MATHEMATICAL REH +<U0001EE14> <U0634> % ARABIC MATHEMATICAL SHEEN +<U0001EE15> <U062A> % ARABIC MATHEMATICAL TEH +<U0001EE16> <U062B> % ARABIC MATHEMATICAL THEH +<U0001EE17> <U062E> % ARABIC MATHEMATICAL KHAH +<U0001EE18> <U0630> % ARABIC MATHEMATICAL THAL +<U0001EE19> <U0636> % ARABIC MATHEMATICAL DAD +<U0001EE1A> <U0638> % ARABIC MATHEMATICAL ZAH +<U0001EE1B> <U063A> % ARABIC MATHEMATICAL GHAIN +<U0001EE1C> <U066E> % ARABIC MATHEMATICAL DOTLESS BEH +<U0001EE1D> <U06BA> % ARABIC MATHEMATICAL DOTLESS NOON +<U0001EE1E> <U06A1> % ARABIC MATHEMATICAL DOTLESS FEH +<U0001EE1F> <U066F> % ARABIC MATHEMATICAL DOTLESS QAF +<U0001EE21> <U0628> % ARABIC MATHEMATICAL INITIAL BEH +<U0001EE22> <U062C> % ARABIC MATHEMATICAL INITIAL JEEM +<U0001EE24> <U0647> % ARABIC MATHEMATICAL 
INITIAL HEH +<U0001EE27> <U062D> % ARABIC MATHEMATICAL INITIAL HAH +<U0001EE29> <U064A> % ARABIC MATHEMATICAL INITIAL YEH +<U0001EE2A> <U0643> % ARABIC MATHEMATICAL INITIAL KAF +<U0001EE2B> <U0644> % ARABIC MATHEMATICAL INITIAL LAM +<U0001EE2C> <U0645> % ARABIC MATHEMATICAL INITIAL MEEM +<U0001EE2D> <U0646> % ARABIC MATHEMATICAL INITIAL NOON +<U0001EE2E> <U0633> % ARABIC MATHEMATICAL INITIAL SEEN +<U0001EE2F> <U0639> % ARABIC MATHEMATICAL INITIAL AIN +<U0001EE30> <U0641> % ARABIC MATHEMATICAL INITIAL FEH +<U0001EE31> <U0635> % ARABIC MATHEMATICAL INITIAL SAD +<U0001EE32> <U0642> % ARABIC MATHEMATICAL INITIAL QAF +<U0001EE34> <U0634> % ARABIC MATHEMATICAL INITIAL SHEEN +<U0001EE35> <U062A> % ARABIC MATHEMATICAL INITIAL TEH +<U0001EE36> <U062B> % ARABIC MATHEMATICAL INITIAL THEH +<U0001EE37> <U062E> % ARABIC MATHEMATICAL INITIAL KHAH +<U0001EE39> <U0636> % ARABIC MATHEMATICAL INITIAL DAD +<U0001EE3B> <U063A> % ARABIC MATHEMATICAL INITIAL GHAIN +<U0001EE42> <U062C> % ARABIC MATHEMATICAL TAILED JEEM +<U0001EE47> <U062D> % ARABIC MATHEMATICAL TAILED HAH +<U0001EE49> <U064A> % ARABIC MATHEMATICAL TAILED YEH +<U0001EE4B> <U0644> % ARABIC MATHEMATICAL TAILED LAM +<U0001EE4D> <U0646> % ARABIC MATHEMATICAL TAILED NOON +<U0001EE4E> <U0633> % ARABIC MATHEMATICAL TAILED SEEN +<U0001EE4F> <U0639> % ARABIC MATHEMATICAL TAILED AIN +<U0001EE51> <U0635> % ARABIC MATHEMATICAL TAILED SAD +<U0001EE52> <U0642> % ARABIC MATHEMATICAL TAILED QAF +<U0001EE54> <U0634> % ARABIC MATHEMATICAL TAILED SHEEN +<U0001EE57> <U062E> % ARABIC MATHEMATICAL TAILED KHAH +<U0001EE59> <U0636> % ARABIC MATHEMATICAL TAILED DAD +<U0001EE5B> <U063A> % ARABIC MATHEMATICAL TAILED GHAIN +<U0001EE5D> <U06BA> % ARABIC MATHEMATICAL TAILED DOTLESS NOON +<U0001EE5F> <U066F> % ARABIC MATHEMATICAL TAILED DOTLESS QAF +<U0001EE61> <U0628> % ARABIC MATHEMATICAL STRETCHED BEH +<U0001EE62> <U062C> % ARABIC MATHEMATICAL STRETCHED JEEM +<U0001EE64> <U0647> % ARABIC MATHEMATICAL STRETCHED HEH +<U0001EE67> <U062D> % ARABIC MATHEMATICAL STRETCHED HAH +<U0001EE68> <U0637> % ARABIC MATHEMATICAL STRETCHED TAH +<U0001EE69> <U064A> % ARABIC MATHEMATICAL STRETCHED YEH +<U0001EE6A> <U0643> % ARABIC MATHEMATICAL STRETCHED KAF +<U0001EE6C> <U0645> % ARABIC MATHEMATICAL STRETCHED MEEM +<U0001EE6D> <U0646> % ARABIC MATHEMATICAL STRETCHED NOON +<U0001EE6E> <U0633> % ARABIC MATHEMATICAL STRETCHED SEEN +<U0001EE6F> <U0639> % ARABIC MATHEMATICAL STRETCHED AIN +<U0001EE70> <U0641> % ARABIC MATHEMATICAL STRETCHED FEH +<U0001EE71> <U0635> % ARABIC MATHEMATICAL STRETCHED SAD +<U0001EE72> <U0642> % ARABIC MATHEMATICAL STRETCHED QAF +<U0001EE74> <U0634> % ARABIC MATHEMATICAL STRETCHED SHEEN +<U0001EE75> <U062A> % ARABIC MATHEMATICAL STRETCHED TEH +<U0001EE76> <U062B> % ARABIC MATHEMATICAL STRETCHED THEH +<U0001EE77> <U062E> % ARABIC MATHEMATICAL STRETCHED KHAH +<U0001EE79> <U0636> % ARABIC MATHEMATICAL STRETCHED DAD +<U0001EE7A> <U0638> % ARABIC MATHEMATICAL STRETCHED ZAH +<U0001EE7B> <U063A> % ARABIC MATHEMATICAL STRETCHED GHAIN +<U0001EE7C> <U066E> % ARABIC MATHEMATICAL STRETCHED DOTLESS BEH +<U0001EE7E> <U06A1> % ARABIC MATHEMATICAL STRETCHED DOTLESS FEH +<U0001EE80> <U0627> % ARABIC MATHEMATICAL LOOPED ALEF +<U0001EE81> <U0628> % ARABIC MATHEMATICAL LOOPED BEH +<U0001EE82> <U062C> % ARABIC MATHEMATICAL LOOPED JEEM +<U0001EE83> <U062F> % ARABIC MATHEMATICAL LOOPED DAL +<U0001EE84> <U0647> % ARABIC MATHEMATICAL LOOPED HEH +<U0001EE85> <U0648> % ARABIC MATHEMATICAL LOOPED WAW +<U0001EE86> <U0632> % ARABIC MATHEMATICAL LOOPED ZAIN +<U0001EE87> <U062D> % ARABIC MATHEMATICAL 
LOOPED HAH +<U0001EE88> <U0637> % ARABIC MATHEMATICAL LOOPED TAH +<U0001EE89> <U064A> % ARABIC MATHEMATICAL LOOPED YEH +<U0001EE8B> <U0644> % ARABIC MATHEMATICAL LOOPED LAM +<U0001EE8C> <U0645> % ARABIC MATHEMATICAL LOOPED MEEM +<U0001EE8D> <U0646> % ARABIC MATHEMATICAL LOOPED NOON +<U0001EE8E> <U0633> % ARABIC MATHEMATICAL LOOPED SEEN +<U0001EE8F> <U0639> % ARABIC MATHEMATICAL LOOPED AIN +<U0001EE90> <U0641> % ARABIC MATHEMATICAL LOOPED FEH +<U0001EE91> <U0635> % ARABIC MATHEMATICAL LOOPED SAD +<U0001EE92> <U0642> % ARABIC MATHEMATICAL LOOPED QAF +<U0001EE93> <U0631> % ARABIC MATHEMATICAL LOOPED REH +<U0001EE94> <U0634> % ARABIC MATHEMATICAL LOOPED SHEEN +<U0001EE95> <U062A> % ARABIC MATHEMATICAL LOOPED TEH +<U0001EE96> <U062B> % ARABIC MATHEMATICAL LOOPED THEH +<U0001EE97> <U062E> % ARABIC MATHEMATICAL LOOPED KHAH +<U0001EE98> <U0630> % ARABIC MATHEMATICAL LOOPED THAL +<U0001EE99> <U0636> % ARABIC MATHEMATICAL LOOPED DAD +<U0001EE9A> <U0638> % ARABIC MATHEMATICAL LOOPED ZAH +<U0001EE9B> <U063A> % ARABIC MATHEMATICAL LOOPED GHAIN +<U0001EEA1> <U0628> % ARABIC MATHEMATICAL DOUBLE-STRUCK BEH +<U0001EEA2> <U062C> % ARABIC MATHEMATICAL DOUBLE-STRUCK JEEM +<U0001EEA3> <U062F> % ARABIC MATHEMATICAL DOUBLE-STRUCK DAL +<U0001EEA5> <U0648> % ARABIC MATHEMATICAL DOUBLE-STRUCK WAW +<U0001EEA6> <U0632> % ARABIC MATHEMATICAL DOUBLE-STRUCK ZAIN +<U0001EEA7> <U062D> % ARABIC MATHEMATICAL DOUBLE-STRUCK HAH +<U0001EEA8> <U0637> % ARABIC MATHEMATICAL DOUBLE-STRUCK TAH +<U0001EEA9> <U064A> % ARABIC MATHEMATICAL DOUBLE-STRUCK YEH +<U0001EEAB> <U0644> % ARABIC MATHEMATICAL DOUBLE-STRUCK LAM +<U0001EEAC> <U0645> % ARABIC MATHEMATICAL DOUBLE-STRUCK MEEM +<U0001EEAD> <U0646> % ARABIC MATHEMATICAL DOUBLE-STRUCK NOON +<U0001EEAE> <U0633> % ARABIC MATHEMATICAL DOUBLE-STRUCK SEEN +<U0001EEAF> <U0639> % ARABIC MATHEMATICAL DOUBLE-STRUCK AIN +<U0001EEB0> <U0641> % ARABIC MATHEMATICAL DOUBLE-STRUCK FEH +<U0001EEB1> <U0635> % ARABIC MATHEMATICAL DOUBLE-STRUCK SAD +<U0001EEB2> <U0642> % ARABIC MATHEMATICAL DOUBLE-STRUCK QAF +<U0001EEB3> <U0631> % ARABIC MATHEMATICAL DOUBLE-STRUCK REH +<U0001EEB4> <U0634> % ARABIC MATHEMATICAL DOUBLE-STRUCK SHEEN +<U0001EEB5> <U062A> % ARABIC MATHEMATICAL DOUBLE-STRUCK TEH +<U0001EEB6> <U062B> % ARABIC MATHEMATICAL DOUBLE-STRUCK THEH +<U0001EEB7> <U062E> % ARABIC MATHEMATICAL DOUBLE-STRUCK KHAH +<U0001EEB8> <U0630> % ARABIC MATHEMATICAL DOUBLE-STRUCK THAL +<U0001EEB9> <U0636> % ARABIC MATHEMATICAL DOUBLE-STRUCK DAD +<U0001EEBA> <U0638> % ARABIC MATHEMATICAL DOUBLE-STRUCK ZAH +<U0001EEBB> <U063A> % ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN translit_end diff --git a/localedata/locales/translit_fraction b/localedata/locales/translit_fraction index 50dbd78..3108d86 100644 --- a/localedata/locales/translit_fraction +++ b/localedata/locales/translit_fraction @@ -2,10 +2,7 @@ escape_char / comment_char % % Transliterations of fractions. -% Generated through -% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<fraction>[^;]*;' UnicodeData.txt | \ -% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<fraction> \([^;]*\);.*$/<U\1> "<U\3>"% \2/' -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G' -% +% Generated automatically from UnicodeData.txt by gen_translit_fraction.py on 2015-12-09 for Unicode 7.0.0. % The replacements have been surrounded with spaces, because fractions are % often preceded by a decimal number and followed by a unit or a math symbol. 
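The padding is significant: without it, a mixed number such as 3½ would transliterate to the misleading 31/2. A minimal sketch of the intended effect, using only the ASCII fallback replacement from the <U00BD> entry just below (an illustration, not the actual locale machinery):

    # Toy transliteration applying the ASCII fallback " 1/2 " for U+00BD.
    RULES = {'\u00bd': ' 1/2 '}

    def translit(text):
        return ''.join(RULES.get(char, char) for char in text)

    print(translit('3\u00bdm'))  # '3 1/2 m' rather than the ambiguous '31/2m'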
@@ -19,6 +16,12 @@ translit_start <U00BD> "<U0020><U0031><U2044><U0032><U0020>";"<U0020><U0031><U002F><U0032><U0020>" % VULGAR FRACTION THREE QUARTERS <U00BE> "<U0020><U0033><U2044><U0034><U0020>";"<U0020><U0033><U002F><U0034><U0020>" +% VULGAR FRACTION ONE SEVENTH +<U2150> "<U0020><U0031><U2044><U0037><U0020>";"<U0020><U0031><U002F><U0037><U0020>" +% VULGAR FRACTION ONE NINTH +<U2151> "<U0020><U0031><U2044><U0039><U0020>";"<U0020><U0031><U002F><U0039><U0020>" +% VULGAR FRACTION ONE TENTH +<U2152> "<U0020><U0031><U2044><U0031><U0030><U0020>";"<U0020><U0031><U002F><U0031><U0030><U0020>" % VULGAR FRACTION ONE THIRD <U2153> "<U0020><U0031><U2044><U0033><U0020>";"<U0020><U0031><U002F><U0033><U0020>" % VULGAR FRACTION TWO THIRDS @@ -44,7 +47,9 @@ translit_start % VULGAR FRACTION SEVEN EIGHTHS <U215E> "<U0020><U0037><U2044><U0038><U0020>";"<U0020><U0037><U002F><U0038><U0020>" % FRACTION NUMERATOR ONE -<U215F> "<U0020><U0031><U2044>";"<U0020><U0031><U002F>" +<U215F> "<U0020><U0031><U2044><U0020>";"<U0020><U0031><U002F><U0020>" +% VULGAR FRACTION ZERO THIRDS +<U2189> "<U0020><U0030><U2044><U0033><U0020>";"<U0020><U0030><U002F><U0033><U0020>" translit_end diff --git a/localedata/unicode-gen/Makefile b/localedata/unicode-gen/Makefile index 166ee31..920bf0e 100644 --- a/localedata/unicode-gen/Makefile +++ b/localedata/unicode-gen/Makefile @@ -41,7 +41,7 @@ PYTHON3 = python3 WGET = wget DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt -GENERATED = i18n UTF-8 +GENERATED = i18n UTF-8 translit_combining translit_compat translit_circle translit_cjk_compat translit_font translit_fraction REPORTS = i18n-report UTF-8-report all: $(GENERATED) @@ -51,6 +51,12 @@ check: check-i18n check-UTF-8 install: cp -p i18n ../locales/i18n cp -p UTF-8 ../charmaps/UTF-8 + cp -p translit_combining ../locales/translit_combining + cp -p translit_compat ../locales/translit_compat + cp -p translit_circle ../locales/translit_circle + cp -p translit_cjk_compat ../locales/translit_cjk_compat + cp -p translit_font ../locales/translit_font + cp -p translit_fraction ../locales/translit_fraction clean: mostlyclean -rm -rf __pycache__ @@ -82,13 +88,43 @@ UTF-8: utf8_gen.py UTF-8-report: UTF-8 ../charmaps/UTF-8 UTF-8-report: utf8_compatibility.py - $(PYTHON3) ./utf8_compatibility.py -o ../charmaps/UTF-8 \ - -n UTF-8 -a -m > $@ + $(PYTHON3) ./utf8_compatibility.py -u UnicodeData.txt \ + -e EastAsianWidth.txt -o ../charmaps/UTF-8 \ + -n UTF-8 -a -m -c > $@ check-UTF-8: UTF-8-report @if grep '^Total.*: [^0]' UTF-8-report; \ then echo manual verification required; false; else true; fi +translit_combining: UnicodeData.txt +translit_combining: gen_translit_combining.py + $(PYTHON3) ./gen_translit_combining.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + +translit_compat: UnicodeData.txt +translit_compat: gen_translit_compat.py + $(PYTHON3) ./gen_translit_compat.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + +translit_circle: UnicodeData.txt +translit_circle: gen_translit_circle.py + $(PYTHON3) ./gen_translit_circle.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + +translit_cjk_compat: UnicodeData.txt +translit_cjk_compat: gen_translit_cjk_compat.py + $(PYTHON3) ./gen_translit_cjk_compat.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + +translit_font: UnicodeData.txt +translit_font: gen_translit_font.py + $(PYTHON3) ./gen_translit_font.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) + +translit_fraction: 
UnicodeData.txt +translit_fraction: gen_translit_fraction.py + $(PYTHON3) ./gen_translit_fraction.py -u UnicodeData.txt \ + -o $@ --unicode_version $(UNICODE_VERSION) .PHONY: downloads clean-downloads downloads: $(DOWNLOADS) diff --git a/localedata/unicode-gen/gen_translit_circle.py b/localedata/unicode-gen/gen_translit_circle.py new file mode 100644 index 0000000..6142859 --- /dev/null +++ b/localedata/unicode-gen/gen_translit_circle.py @@ -0,0 +1,150 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# Generate a translit_circle file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +''' +Generate a translit_circle file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_circle -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_circle file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. 
+ ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of encircled characters.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_circle.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition.startswith('<circle>'): + decomposition = decomposition[9:] + decomposed_code_points = [int(x, 16) + for x in decomposition.split(' ')] + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} "<U0028>'.format( + unicode_utils.ucs_symbol(code_point))) + for decomposed_code_point in decomposed_code_points: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + translit_file.write('<U0029>"\n') + translit_file.write('\n') + + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_circle file from UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_circle + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_circle.new', + help='''The new translit_circle file, default: %(default)s. If the + original glibc/localedata/locales/translit_circle file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. ''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_cjk_compat.py b/localedata/unicode-gen/gen_translit_cjk_compat.py new file mode 100644 index 0000000..627ff6b --- /dev/null +++ b/localedata/unicode-gen/gen_translit_cjk_compat.py @@ -0,0 +1,220 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# Generate a translit_cjk_compat file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc.
+# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +''' +Generate a translit_cjk_compat file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_cjk_compat -h + … prints usage message … +''' + +import argparse +import time +import sys +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_cjk_compat file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. + ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of CJK compatibility ') + translit_file.write('characters.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_cjk_compat.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def special_decompose(code_point_list): + ''' + Decompositions which are not in UnicodeData.txt at all but which + were used in the original translit_cjk_compat file in glibc and + which seem to make sense. I want to keep the update of + translit_cjk_compat close to the spirit of the original file, + therefore I added these special decomposition rules here.
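For example, assuming the standard UnicodeData.txt entry for U+33A8 SQUARE M OVER S SQUARED, <square> 006D 2215 0073 00B2 (m∕s²), the chaining in output_transliteration() first applies the whole-sequence rule and then the per-character rules, so the generated entry carries progressively more ASCII-safe fallbacks:

    % SQUARE M OVER S SQUARED
    <U33A8> "<U006D><U2215><U0073><U00B2>";"<U006D><U002F><U0073><U00B2>";"<U006D><U002F><U0073><U005E><U0032>"

that is, m∕s², then m/s², then m/s^2.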
+ ''' + special_decompose_dict = { + (0x2215,): [0x002F], # ∕ → / + (0x00B2,): [0x005E, 0x0032], # ² → ^2 + (0x03BC,): [0x00B5], # μ → µ (GREEK SMALL LETTER MU → MICRO SIGN) + (0x2113,): [0x006C], # ℓ → l + (0x00B3,): [0x005E, 0x0033], # ³ → ^3 + (0x00B5,): [0x0075], # µ → u + (0x03BC, 0x2113): [0x03BC, 0x006C], # μℓ → μl + (0x0072, 0x0061, 0x0064, 0x2215, 0x0073, 0x00B2): [ + 0x0072, 0x0061, 0x0064, 0x002F, 0x0073, 0x00B2], + (0x006D, 0x2215, 0x0073, 0x00B2): [0x006D, 0x002F, 0x0073, 0x00B2], + } + if tuple(code_point_list) in special_decompose_dict: + return special_decompose_dict[tuple(code_point_list)] + else: + return code_point_list + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition.startswith('<square>'): + decomposition = decomposition[9:] + decomposed_code_points = [[int(x, 16) + for x in decomposition.split(' ')]] + if decomposed_code_points[0]: + while True: + special_decomposed_code_points = special_decompose( + decomposed_code_points[-1]) + if (special_decomposed_code_points + != decomposed_code_points[-1]): + decomposed_code_points.append( + special_decomposed_code_points) + continue + special_decomposed_code_points = [] + for decomposed_code_point in decomposed_code_points[-1]: + special_decomposed_code_points += special_decompose( + [decomposed_code_point]) + if (special_decomposed_code_points + == decomposed_code_points[-1]): + break + decomposed_code_points.append( + special_decomposed_code_points) + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition and name.startswith('CJK COMPATIBILITY IDEOGRAPH'): + decomposed_code_points = [int(x, 16) + for x in decomposition.split(' ')] + if len(decomposed_code_points) != 1: + sys.stderr.write( + 'Unexpected decomposition length {:x} {:s} {:s}\n'.format( + code_point, name, decomposition)) + exit(1) + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for decomposed_code_point in decomposed_code_points: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + translit_file.write('\n') + translit_file.write('\n') + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_cjk_compat file from UnicodeData.txt. 
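In this commit the Makefile shown above drives the script as $(PYTHON3) ./gen_translit_cjk_compat.py -u UnicodeData.txt -o $@ --unicode_version $(UNICODE_VERSION); with the variables substituted for illustration, a typical invocation would be:

    python3 ./gen_translit_cjk_compat.py -u UnicodeData.txt \
        -o translit_cjk_compat --unicode_version 7.0.0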
+ ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_cjk_compat + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_cjk_compat.new', + help='''The new translit_cjk_compat file, default: %(default)s. If the + original glibc/localedata/locales/translit_cjk_compat file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. ''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_combining.py b/localedata/unicode-gen/gen_translit_combining.py new file mode 100644 index 0000000..2551ce1 --- /dev/null +++ b/localedata/unicode-gen/gen_translit_combining.py @@ -0,0 +1,442 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# Generate a translit_combining file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +''' +Generate a translit_combining file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_combining -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_combining file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. 
+ ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations that remove all ') + translit_file.write('combining characters (accents,\n') + translit_file.write('% pronunciation marks, etc.).\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_combining.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def is_combining_remove(code_point): + '''Check whether this is a combining character which should be listed + in the section of the translit_combining file where combining + characters are replaced by empty strings. + + We ignore combining characters from many scripts here because + the original translit_combining file didn’t do this for the + combining characters from these scripts either and I am not + sure yet whether this would be useful to do for all combining + characters or not. For the moment I think it is better to keep + close to the spirit of the original file. + ''' + if not unicode_utils.is_combining(code_point): + return False + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + for substring in ('DEVANAGARI', + 'BENGALI', + 'CYRILLIC', + 'SYRIAC', + 'THAANA', + 'NKO', + 'GURMUKHI', + 'TAMIL', + 'GUJARATI', + 'ORIYA', + 'TELUGU', + 'KANNADA', + 'MALAYALAM', + 'SINHALA', + 'THAI', + 'LAO', + 'TIBETAN', + 'MYANMAR', + 'ETHIOPIC', + 'TAGALOG', + 'HANUNOO', + 'BUHID', + 'TAGBANWA', + 'KHMER', + 'MONGOLIAN', + 'LIMBU', + 'NEW TAI LUE', + 'BUGINESE', + 'BALINESE', + 'SUNDANESE', + 'LEPCHA', + 'IDEOGRAPHIC', + 'HANGUL', + 'SYLOTI', + 'SAURASHTRA', + 'KAYAH', + 'REJANG', + 'CHAM', + 'VARIATION SELECTOR', + 'KHAROSHTHI', + 'MUSICAL SYMBOL', + 'SAMARITAN', + 'MANDAIC', + 'TAI THAM', + 'BATAK', + 'VEDIC', + 'COPTIC', + 'TIFINAGH', + 'BAMUM', + 'JAVANESE', + 'TAI VIET', + 'MEETEI', + 'MANICHAEAN', + 'BRAHMI', + 'KAITHI', + 'CHAKMA', + 'MAHAJANI', + 'SHARADA', + 'KHOJKI', + 'KHUDAWADI', + 'GRANTHA', + 'TIRHUTA', + 'SIDDHAM', + 'MODI VOWEL', + 'MODI SIGN', + 'TAKRI', + 'BASSA VAH', + 'PAHAWH HMONG', + 'MIAO', + 'DUPLOYAN', + 'MENDE KIKAKUI' + ): + if substring in name: + return False + return True + +def canonical_decompose(code_point): + '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings + + In some instances a canonical mapping or a compatibility mapping + may consist of a single character. For a canonical mapping, this + indicates that the character is a canonical equivalent of another + single character. For a compatibility mapping, this indicates that + the character is a compatibility equivalent of another single + character. + + A canonical mapping may also consist of a pair of characters, but + is never longer than two characters. When a canonical mapping + consists of a pair of characters, the first character may itself + be a character with a decomposition mapping, but the second + character never has a decomposition mapping.
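For example, with the usual UnicodeData.txt entries, U+1E17 LATIN SMALL LETTER E WITH MACRON AND ACUTE decomposes canonically to U+0113 U+0301, and U+0113 in turn to U+0065 U+0304, so the recursion below yields:

    canonical_decompose(0x1E17)                    # ḗ
      → canonical_decompose(0x0113) + [0x0301]
      → [0x0065, 0x0304, 0x0301]                   # e + combining macron + combining acute

after which the combining marks are stripped elsewhere, leaving plain e.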
+ + We ignore the canonical decomposition for code points + matching certain substrings because the original translit_combining + file didn’t include these types of characters either. I am unsure + about the usefulness of including them and want to keep close + to the spirit of the original file for the moment. + ''' + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + for substring in ('MUSICAL SYMBOL', + 'CJK COMPATIBILITY IDEOGRAPH', + 'BALINESE', + 'KAITHI LETTER', + 'CHAKMA VOWEL', + 'GRANTHA VOWEL', + 'TIRHUTA VOWEL', + 'SIDDHAM VOWEL'): + if substring in name: + return [] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition and not decomposition.startswith('<'): + decomposed_code_points = [int(x, 16) for x in decomposition.split(' ')] + if decomposed_code_points: + cd0 = canonical_decompose(decomposed_code_points[0]) + if cd0: + decomposed_code_points = cd0 + decomposed_code_points[1:] + return decomposed_code_points + else: + return [] + +def special_decompose(code_point_list): + ''' + Decompositions which are not canonical or which are not in + UnicodeData.txt at all but some of these were used in the original + translit_combining file in glibc and they seemed to make sense. + I want to keep the update of translit_combining close to the + spirit of the original file, therefore I added these special + decomposition rules here. + ''' + special_decompose_dict = { + # Ø U+00D8 is already handled in translit_neutral. But + # translit_combining is usually included after translit_neutral + # and Ǿ U+01FE LATIN CAPITAL LETTER O WITH STROKE AND ACUTE + # has a canonical decomposition to Ø U+00D8 and we want to + # further decompose this to U+004F. + (0x00D8,): [0x004F], # Ø → O + # ø U+00F8 is already handled in translit_neutral. But + # translit_combining is usually included after translit_neutral + # and ǿ U+01FF LATIN SMALL LETTER O WITH STROKE AND ACUTE + # has a canonical decomposition to ø U+00F8 and we want to + # further decompose this to U+006F. + (0x00F8,): [0x006F], # ø → o + # æ U+00E6 is already in translit_compat because ligatures + # are handled in translit_compat. But ǣ U+01E3 has a + # canonical decomposition to U+00E6, U+0304 and we want to + # further decompose this to “ae”. + (0x00E6,): [0x0061, 0x0065], # æ → ae + # Æ U+00C6 is already in translit_compat because ligatures + # are handled in translit_compat. But Ǣ U+01E2 has a + # canonical decomposition to U+00C6, U+0304 and we want to + # further decompose this to “AE” + (0x00C6,): [0x0041, 0x0045], # Æ → AE + # U+05F2 HEBREW LIGATURE YIDDISH DOUBLE YOD is already in + # translit_compat because ligatures are handled in translit_compat. + # But U+FB1F has a canonical decomposition to U+05F2 and + # we want to further decompose this to U+05D9, U+05D9. + (0x05F2,): [0x05D9, 0x05D9], # ײ → יי + # 0x2002 has a <compat> decomposition to 0x0020 in UnicodeData.txt + # But U+2000 EN QUAD has a canonical decomposition U+2002 + # and we want to further decompose this to U+0020. + (0x2002,): [0x0020], # EN SPACE → SPACE + # 0x2003 has a <compat> decomposition to 0x0020 in UnicodeData.txt + # But U+2001 EM QUAD has a canonical decomposition to U+2003 + # and we want to further decompose this to U+0020. + (0x2003,): [0x0020], # EM SPACE → SPACE + # U+2260 ≠ has the canonical decomposition U+003D U+0338 + # (= followed by ̸). After stripping the combining characters, + # the result is only = which reverses the meaning. 
+ # Therefore, we add special rules here for such mathematical + # negations: + (0x21AE,): [0x0021, 0x003C, 0x002D, 0x003E], # ↮ → !<-> + (0x21CD,): [0x0021, 0x003C, 0x003D], # ⇍ → !<= + (0x21CE,): [0x0021, 0x003C, 0x003D, 0x003E], # ⇎ → !<=> + (0x21CF,): [0x0021, 0x003D, 0x003E], # ⇏ → !=> + (0x2204,): [0x0021, 0x2203], # ∄ → !∃ + (0x2209,): [0x0021, 0x2208], # ∉ → !∈ + (0x220C,): [0x0021, 0x220B], # ∌ → !∋ + (0x2224,): [0x0021, 0x2223], # ∤ → !∣ + (0x2226,): [0x0021, 0x2225], # ∦ → !∥ + (0x2241,): [0x0021, 0x007E], # ≁ → !~ + (0x2244,): [0x0021, 0x007E, 0x002D], # ≄ → !~- + (0x2247,): [0x0021, 0x007E, 0x003D], # ≇ → !~= + (0x2249,): [0x0021, 0x007E, 0x007E], # ≉ → !~~ + (0x2260,): [0x0021, 0x003D], # ≠ → != + (0x2262,): [0x0021, 0x003D, 0x003D], # ≢ → !== + (0x226D,): [0x0021, 0x224D], # ≭ → !≍ + (0x226E,): [0x0021, 0x003C], # ≮ → !< + (0x226F,): [0x0021, 0x003E], # ≯ → !> + (0x2270,): [0x0021, 0x003C, 0x003D], # ≰ → !<= + (0x2271,): [0x0021, 0x003E, 0x003D], # ≱ → !>= + (0x2274,): [0x0021, 0x003C, 0x007E], # ≴ → !<~ + (0x2275,): [0x0021, 0x003E, 0x007E], # ≵ → !>~ + (0x2278,): [0x0021, 0x003C, 0x003E], # ≸ → !<> + (0x2279,): [0x0021, 0x003E, 0x003C], # ≹ → !>< + (0x2280,): [0x0021, 0x227A], # ⊀ → !≺ + (0x2281,): [0x0021, 0x227B], # ⊁ → !≻ + (0x2284,): [0x0021, 0x2282], # ⊄ → !⊂ + (0x2285,): [0x0021, 0x2283], # ⊅ → !⊃ + (0x2288,): [0x0021, 0x2282, 0x003D], # ⊈ → !⊂= + (0x2289,): [0x0021, 0x2283, 0x003D], # ⊉ → !⊃= + (0x22AC,): [0x0021, 0x22A2], # ⊬ → !⊢ + (0x22AD,): [0x0021, 0x22A8], # ⊭ → !⊨ + (0x22AE,): [0x0021, 0x22A9], # ⊮ → !⊩ + (0x22AF,): [0x0021, 0x22AB], # ⊯ → !⊫ + (0x22E0,): [0x0021, 0x227C], # ⋠ → !≼ + (0x22E1,): [0x0021, 0x227D], # ⋡ → !≽ + (0x22E2,): [0x0021, 0x2291], # ⋢ → !⊑ + (0x22E3,): [0x0021, 0x2292], # ⋣ → !⊒ + (0x22EA,): [0x0021, 0x22B2], # ⋪ → !⊲ + (0x22EB,): [0x0021, 0x22B3], # ⋫ → !⊳ + (0x22EC,): [0x0021, 0x22B4], # ⋬ → !⊴ + (0x22ED,): [0x0021, 0x22B5], # ⋭ → !⊵ + (0x2ADC,): [0x0021, 0x2ADD], # ⫝̸ → !⫝ + # Special rule for 〈 U+3008 is added + # because 〈 U+2329 has the canonical decomposition U+3008 + # and we want to further decompose this to < U+003C. + (0x3008,): [0x003C], # 〈 → < + # Special rule for 〉 U+3009 is added + # because 〉 U+232A has the canonical decomposition U+3009 + # and we want to further decompose this to > U+003E. + (0x3009,): [0x003E], # 〉 → > + } + if tuple(code_point_list) in special_decompose_dict: + return special_decompose_dict[tuple(code_point_list)] + else: + return code_point_list + +def output_combining_remove(translit_file): + '''Write the section of the translit_combining file where combining + characters are replaced by empty strings. + ''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + if is_combining_remove(code_point): + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} ""\n'.format( + unicode_utils.ucs_symbol(code_point))) + translit_file.write('\n') + +def output_decompositions(translit_file): + '''Write the section of the translit_combining file where characters + are decomposed and combining characters stripped from + the decompositions.
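For instance, é U+00E9 decomposes canonically to U+0065 U+0301 and loses its combining acute, while ≠ U+2260 is caught by the special rule above, so the generated section should contain entries like:

    % LATIN SMALL LETTER E WITH ACUTE
    <U00E9> <U0065>
    % NOT EQUAL TO
    <U2260> "<U0021><U003D>"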
+ ''' + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + if special_decompose([code_point]) != [code_point]: + decomposed_code_points = [special_decompose([code_point])] + else: + decomposed_code_points = [canonical_decompose(code_point)] + if decomposed_code_points[0]: + while True: + special_decomposed_code_points = special_decompose( + decomposed_code_points[-1]) + if (special_decomposed_code_points + != decomposed_code_points[-1]): + decomposed_code_points.append( + special_decomposed_code_points) + continue + special_decomposed_code_points = [] + for decomposed_code_point in decomposed_code_points[-1]: + special_decomposed_code_points += special_decompose( + [decomposed_code_point]) + if (special_decomposed_code_points + == decomposed_code_points[-1]): + break + decomposed_code_points.append( + special_decomposed_code_points) + for index in range(0, len(decomposed_code_points)): + decomposed_code_points[index] = [ + x for x in decomposed_code_points[index] + if not is_combining_remove(x)] + if decomposed_code_points[0]: + translit_file.write('% {:s}\n'.format( + unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'])) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + translit_file.write('\n') + translit_file.write('\n') + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + output_combining_remove(translit_file) + output_decompositions(translit_file) + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_combining file from UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_combining + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_combining.new', + help='''The new translit_combining file, default: %(default)s. If the + original glibc/localedata/locales/translit_combining file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. 
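One plausible invocation exercising these options (the paths are illustrative; the option names are the ones defined above) regenerates the file while preserving a hand-maintained head and tail:

    python3 ./gen_translit_combining.py -u UnicodeData.txt \
        -i ../locales/translit_combining -o translit_combining \
        --unicode_version 7.0.0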
''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_compat.py b/localedata/unicode-gen/gen_translit_compat.py new file mode 100644 index 0000000..0e824a8 --- /dev/null +++ b/localedata/unicode-gen/gen_translit_compat.py @@ -0,0 +1,326 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# Generate a translit_compat file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +''' +Generate a translit_compat file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_compat -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_compat file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. 
+ ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of compatibility characters ') + translit_file.write('and ligatures.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_compat.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def compatibility_decompose(code_point): + '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings + + “The compatibility decomposition is formed by recursively applying + the canonical and compatibility mappings, then applying the + Canonical Ordering Algorithm.” + + We don’t do the canonical decomposition here because this is + done in gen_translit_combining.py to generate translit_combining. + + And we ignore some of the possible compatibility formatting tags + here. Some of them are used in other translit_* files, not + translit_compat: + + <font>: translit_font + <circle>: translit_circle + <wide>: translit_wide + <narrow>: translit_narrow + <square>: translit_cjk_compat + <fraction>: translit_fraction + + And we ignore + + <noBreak>, <initial>, <medial>, <final>, <isolated> + + because they seem not to be useful for transliteration. + ''' + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + compatibility_tags = ( + '<compat>', '<super>', '<sub>', '<vertical>') + for compatibility_tag in compatibility_tags: + if decomposition.startswith(compatibility_tag): + decomposition = decomposition[len(compatibility_tag)+1:] + decomposed_code_points = [int(x, 16) + for x in decomposition.split(' ')] + if (len(decomposed_code_points) > 1 + and decomposed_code_points[0] == 0x0020 + and decomposed_code_points[1] >= 0x0300 + and decomposed_code_points[1] <= 0x03FF): + # Decomposes into a space followed by a combining character. + # This is not useful for transliteration. + return [] + else: + return_value = [] + for index in range(0, len(decomposed_code_points)): + cd_code_points = compatibility_decompose( + decomposed_code_points[index]) + if cd_code_points: + return_value += cd_code_points + else: + return_value += [decomposed_code_points[index]] + return return_value + return [] + +def special_decompose(code_point_list): + ''' + Decompositions which are not in UnicodeData.txt at all but which + were used in the original translit_compat file in glibc and + which seem to make sense. I want to keep the update of + translit_compat close to the spirit of the original file, + therefore I added these special decomposition rules here.
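For example, U+00B5 MICRO SIGN has the compatibility decomposition <compat> 03BC (μ); the μ → u rule below then adds an ASCII fallback, so the generated file should contain an entry like:

    % MICRO SIGN
    <U00B5> "<U03BC>";"<U0075>"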
+ ''' + special_decompose_dict = { + (0x03BC,): [0x0075], # μ → u + (0x02BC,): [0x0027], # ʼ → ' + } + if tuple(code_point_list) in special_decompose_dict: + return special_decompose_dict[tuple(code_point_list)] + else: + return code_point_list + +def special_ligature_decompose(code_point): + ''' + Decompositions for ligatures which are not in UnicodeData.txt at + all but which were used in the original translit_compat file in + glibc and which seem to make sense. I want to keep the update of + translit_compat close to the spirit of the original file, + therefore I added these special ligature decomposition rules here. + + ''' + special_ligature_decompose_dict = { + 0x00E6: [0x0061, 0x0065], # æ → ae + 0x00C6: [0x0041, 0x0045], # Æ → AE + # These following 5 special ligature decompositions were + # in the original glibc/localedata/locales/translit_compat file + 0x0152: [0x004F, 0x0045], # Œ → OE + 0x0153: [0x006F, 0x0065], # œ → oe + 0x05F0: [0x05D5, 0x05D5], # װ → וו + 0x05F1: [0x05D5, 0x05D9], # ױ → וי + 0x05F2: [0x05D9, 0x05D9], # ײ → יי + # The following special ligature decompositions were + # not in the original glibc/localedata/locales/translit_compat file + # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE + # → U+041D CYRILLIC CAPITAL LETTER EN, + # U+0413 CYRILLIC CAPITAL LETTER GHE + 0x04A4: [0x041D, 0x0413], # Ҥ → НГ + # U+04A5 CYRILLIC SMALL LIGATURE EN GHE + # → U+043D CYRILLIC SMALL LETTER EN, + # U+0433 CYRILLIC SMALL LETTER GHE + 0x04A5: [0x043D, 0x0433], # ҥ → нг + # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE + # → U+0422 CYRILLIC CAPITAL LETTER TE, + # U+0426 CYRILLIC CAPITAL LETTER TSE + 0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ + # U+04B5 CYRILLIC SMALL LIGATURE TE TSE + # → U+0442 CYRILLIC SMALL LETTER TE, + # U+0446 CYRILLIC SMALL LETTER TSE + 0x04B5: [0x0442, 0x0446], # ҵ → тц + # U+04d4 CYRILLIC CAPITAL LIGATURE A IE + # → U+0410 CYRILLIC CAPITAL LETTER A + # U+0415;CYRILLIC CAPITAL LETTER IE + 0x04D4: [0x0410, 0x0415], # Ӕ → АЕ + # U+04D5 CYRILLIC SMALL LIGATURE A IE + # → U+0430 CYRILLIC SMALL LETTER A, + # U+0435 CYRILLIC SMALL LETTER IE + 0x04D5: [0x0430, 0x0435], # ӕ → ае + # I am not sure what to do with the following ligatures + # maybe it makes no sense to decompose them: + # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH + # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA + # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA + # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM + # U+fe20 COMBINING LIGATURE LEFT HALF + # U+fe21 COMBINING LIGATURE RIGHT HALF + # U+fe27 COMBINING LIGATURE LEFT HALF BELOW + # U+fe28 COMBINING LIGATURE RIGHT HALF BELOW + # U+11176 MAHAJANI LIGATURE SHRI + # U+1f670 SCRIPT LIGATURE ET ORNAMENT + # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT + # U+1f672 LIGATURE OPEN ET ORNAMENT + # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT + } + if code_point in special_ligature_decompose_dict: + return special_ligature_decompose_dict[code_point] + else: + return [code_point] + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposed_code_points = [compatibility_decompose(code_point)] + if not decomposed_code_points[0]: + if special_decompose([code_point]) != [code_point]: + decomposed_code_points[0] = special_decompose([code_point]) + else: + special_decomposed_code_points = [] + while True: + 
special_decomposed_code_points = special_decompose( + decomposed_code_points[-1]) + if (special_decomposed_code_points + != decomposed_code_points[-1]): + decomposed_code_points.append( + special_decomposed_code_points) + continue + special_decomposed_code_points = [] + for decomposed_code_point in decomposed_code_points[-1]: + special_decomposed_code_points += special_decompose( + [decomposed_code_point]) + if (special_decomposed_code_points + == decomposed_code_points[-1]): + break + decomposed_code_points.append( + special_decomposed_code_points) + if decomposed_code_points[0]: + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + translit_file.write('"') + translit_file.write('\n') + elif 'LIGATURE' in name and 'ARABIC' not in name: + decomposed_code_points = special_ligature_decompose(code_point) + if decomposed_code_points[0] != code_point: + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + translit_file.write('"') + for decomposed_code_point in decomposed_code_points: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + translit_file.write('"') + translit_file.write('\n') + else: + print('Warning: unhandled ligature: {:x} {:s}'.format( + code_point, name)) + translit_file.write('\n') + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_compat file from UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_compat + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_compat.new', + help='''The new translit_compat file, default: %(default)s. If the + original glibc/localedata/locales/translit_compat file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. ''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_font.py b/localedata/unicode-gen/gen_translit_font.py new file mode 100644 index 0000000..0723622 --- /dev/null +++ b/localedata/unicode-gen/gen_translit_font.py @@ -0,0 +1,156 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# Generate a translit_font file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. 
+# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +''' +Generate a translit_font file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_font -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_font file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. + ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of font equivalents.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_font.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition.startswith('<font>'): + decomposition = decomposition[7:] + decomposed_code_points = [[int(x, 16) + for x in decomposition.split(' ')]] + if decomposed_code_points[0]: + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + translit_file.write(' % {:s}\n'.format(name)) + 
translit_file.write('\n') + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a translit_font file from UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_font + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_font.new', + help='''The new translit_font file, default: %(default)s. If the + original glibc/localedata/locales/translit_font file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. ''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_translit_fraction.py b/localedata/unicode-gen/gen_translit_fraction.py new file mode 100644 index 0000000..5bf63ea --- /dev/null +++ b/localedata/unicode-gen/gen_translit_fraction.py @@ -0,0 +1,197 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# Generate a translit_fraction file from a UnicodeData file. +# Copyright (C) 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +''' +Generate a translit_fraction file from UnicodeData.txt + +To see how this script is used, call it with the “-h” option: + + $ ./gen_translit_fraction -h + … prints usage message … +''' + +import argparse +import time +import unicode_utils + +def read_input_file(filename): + '''Reads the original glibc translit_fraction file to get the + original head and tail. + + We want to replace only the part of the file between + “translit_start” and “translit_end” + ''' + head = tail = '' + with open(filename, mode='r') as translit_file: + for line in translit_file: + head = head + line + if line.startswith('translit_start'): + break + for line in translit_file: + if line.startswith('translit_end'): + tail = line + break + for line in translit_file: + tail = tail + line + return (head, tail) + +def output_head(translit_file, unicode_version, head=''): + '''Write the header of the output file, i.e. the part of the file + before the “translit_start” line. 
+ ''' + if ARGS.input_file and head: + translit_file.write(head) + else: + translit_file.write('escape_char /\n') + translit_file.write('comment_char %\n') + translit_file.write('\n') + translit_file.write('% Transliterations of fractions.\n') + translit_file.write('% Generated automatically from UnicodeData.txt ' + + 'by gen_translit_fraction.py ' + + 'on {:s} '.format(time.strftime('%Y-%m-%d')) + + 'for Unicode {:s}.\n'.format(unicode_version)) + translit_file.write('% The replacements have been surrounded ') + translit_file.write('with spaces, because fractions are\n') + translit_file.write('% often preceded by a decimal number and ') + translit_file.write('followed by a unit or a math symbol.\n') + translit_file.write('\n') + translit_file.write('LC_CTYPE\n') + translit_file.write('\n') + translit_file.write('translit_start\n') + +def output_tail(translit_file, tail=''): + '''Write the tail of the output file''' + if ARGS.input_file and tail: + translit_file.write(tail) + else: + translit_file.write('translit_end\n') + translit_file.write('\n') + translit_file.write('END LC_CTYPE\n') + +def special_decompose(code_point_list): + ''' + Decompositions which are not in UnicodeData.txt at all but which + were used in the original translit_fraction file in glibc and + which seem to make sense. I want to keep the update of + translit_fraction close to the spirit of the original file, + therefore I added these special decomposition rules here. + ''' + special_decompose_dict = { + (0x2044,): [0x002F], # ⁄ → / + } + if tuple(code_point_list) in special_decompose_dict: + return special_decompose_dict[tuple(code_point_list)] + else: + return code_point_list + +def output_transliteration(translit_file): + '''Write the new transliteration to the output file''' + translit_file.write('\n') + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): + name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name'] + decomposition = unicode_utils.UNICODE_ATTRIBUTES[ + code_point]['decomposition'] + if decomposition.startswith('<fraction>'): + decomposition = decomposition[11:] + decomposed_code_points = [[int(x, 16) + for x in decomposition.split(' ')]] + if decomposed_code_points[0]: + decomposed_code_points[0] = [0x0020] \ + + decomposed_code_points[0] \ + + [0x0020] + while True: + special_decomposed_code_points = special_decompose( + decomposed_code_points[-1]) + if (special_decomposed_code_points + != decomposed_code_points[-1]): + decomposed_code_points.append( + special_decomposed_code_points) + continue + special_decomposed_code_points = [] + for decomposed_code_point in decomposed_code_points[-1]: + special_decomposed_code_points += special_decompose( + [decomposed_code_point]) + if (special_decomposed_code_points + == decomposed_code_points[-1]): + break + decomposed_code_points.append( + special_decomposed_code_points) + translit_file.write('% {:s}\n'.format(name)) + translit_file.write('{:s} '.format( + unicode_utils.ucs_symbol(code_point))) + for index in range(0, len(decomposed_code_points)): + if index > 0: + translit_file.write(';') + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + for decomposed_code_point in decomposed_code_points[index]: + translit_file.write('{:s}'.format( + unicode_utils.ucs_symbol(decomposed_code_point))) + if len(decomposed_code_points[index]) > 1: + translit_file.write('"') + translit_file.write('\n') + translit_file.write('\n') + +if __name__ == "__main__": + PARSER = argparse.ArgumentParser( + description=''' + Generate a
translit_fraction file from UnicodeData.txt. + ''') + PARSER.add_argument( + '-u', '--unicode_data_file', + nargs='?', + type=str, + default='UnicodeData.txt', + help=('The UnicodeData.txt file to read, ' + + 'default: %(default)s')) + PARSER.add_argument( + '-i', '--input_file', + nargs='?', + type=str, + help=''' The original glibc/localedata/locales/translit_fraction + file.''') + PARSER.add_argument( + '-o', '--output_file', + nargs='?', + type=str, + default='translit_fraction.new', + help='''The new translit_fraction file, default: %(default)s. If the + original glibc/localedata/locales/translit_fraction file has + been given as an option, the header up to the + “translit_start” line and the tail from the “translit_end” + line to the end of the file will be copied unchanged into the + output file. ''') + PARSER.add_argument( + '--unicode_version', + nargs='?', + required=True, + type=str, + help='The Unicode version of the input files used.') + ARGS = PARSER.parse_args() + + unicode_utils.fill_attributes(ARGS.unicode_data_file) + HEAD = TAIL = '' + if ARGS.input_file: + (HEAD, TAIL) = read_input_file(ARGS.input_file) + with open(ARGS.output_file, mode='w') as TRANSLIT_FILE: + output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD) + output_transliteration(TRANSLIT_FILE) + output_tail(TRANSLIT_FILE, tail=TAIL) diff --git a/localedata/unicode-gen/gen_unicode_ctype.py b/localedata/unicode-gen/gen_unicode_ctype.py index 0c74f2a..0f064f5 100755 --- a/localedata/unicode-gen/gen_unicode_ctype.py +++ b/localedata/unicode-gen/gen_unicode_ctype.py @@ -30,345 +30,9 @@ To see how this script is used, call it with the “-h” option: ''' import argparse -import sys import time import re - -# Dictionary holding the entire contents of the UnicodeData.txt file -# -# Contents of this dictionary look like this: -# -# {0: {'category': 'Cc', -# 'title': None, -# 'digit': '', -# 'name': '<control>', -# 'bidi': 'BN', -# 'combining': '0', -# 'comment': '', -# 'oldname': 'NULL', -# 'decomposition': '', -# 'upper': None, -# 'mirrored': 'N', -# 'lower': None, -# 'decdigit': '', -# 'numeric': ''}, -# … -# } -UNICODE_ATTRIBUTES = {} - -# Dictionary holding the entire contents of the DerivedCoreProperties.txt file -# -# Contents of this dictionary look like this: -# -# {917504: ['Default_Ignorable_Code_Point'], -# 917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'], -# … -# } -DERIVED_CORE_PROPERTIES = {} - -def fill_attribute(code_point, fields): - '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields. - - One entry in the UNICODE_ATTRIBUTES dictionary represents one line - in the UnicodeData.txt file.
- - ''' - UNICODE_ATTRIBUTES[code_point] = { - 'name': fields[1], # Character name - 'category': fields[2], # General category - 'combining': fields[3], # Canonical combining classes - 'bidi': fields[4], # Bidirectional category - 'decomposition': fields[5], # Character decomposition mapping - 'decdigit': fields[6], # Decimal digit value - 'digit': fields[7], # Digit value - 'numeric': fields[8], # Numeric value - 'mirrored': fields[9], # mirrored - 'oldname': fields[10], # Old Unicode 1.0 name - 'comment': fields[11], # comment - # Uppercase mapping - 'upper': int(fields[12], 16) if fields[12] else None, - # Lowercase mapping - 'lower': int(fields[13], 16) if fields[13] else None, - # Titlecase mapping - 'title': int(fields[14], 16) if fields[14] else None, - } - -def fill_attributes(filename): - '''Stores the entire contents of the UnicodeData.txt file - in the UNICODE_ATTRIBUTES dictionary. - - A typical line for a single code point in UnicodeData.txt looks - like this: - - 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; - - Code point ranges are indicated by pairs of lines like this: - - 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; - 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; - ''' - with open(filename, mode='r') as unicode_data_file: - fields_start = [] - for line in unicode_data_file: - fields = line.strip().split(';') - if len(fields) != 15: - sys.stderr.write( - 'short line in file "%(f)s": %(l)s\n' %{ - 'f': filename, 'l': line}) - exit(1) - if fields[2] == 'Cs': - # Surrogates are UTF-16 artefacts, - # not real characters. Ignore them. - fields_start = [] - continue - if fields[1].endswith(', First>'): - fields_start = fields - fields_start[1] = fields_start[1].split(',')[0][1:] - continue - if fields[1].endswith(', Last>'): - fields[1] = fields[1].split(',')[0][1:] - if fields[1:] != fields_start[1:]: - sys.stderr.write( - 'broken code point range in file "%(f)s": %(l)s\n' %{ - 'f': filename, 'l': line}) - exit(1) - for code_point in range( - int(fields_start[0], 16), - int(fields[0], 16)+1): - fill_attribute(code_point, fields) - fields_start = [] - continue - fill_attribute(int(fields[0], 16), fields) - fields_start = [] - -def fill_derived_core_properties(filename): - '''Stores the entire contents of the DerivedCoreProperties.txt file - in the DERIVED_CORE_PROPERTIES dictionary. - - Lines in DerivedCoreProperties.txt are either a code point range like - this: - - 0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z - - or a single code point like this: - - 00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR - - ''' - with open(filename, mode='r') as derived_core_properties_file: - for line in derived_core_properties_file: - match = re.match( - r'^(?P<codepoint1>[0-9A-F]{4,6})' - + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?' 
- + r'\s*;\s*(?P<property>[a-zA-Z_]+)', - line) - if not match: - continue - start = match.group('codepoint1') - end = match.group('codepoint2') - if not end: - end = start - for code_point in range(int(start, 16), int(end, 16)+1): - prop = match.group('property') - if code_point in DERIVED_CORE_PROPERTIES: - DERIVED_CORE_PROPERTIES[code_point].append(prop) - else: - DERIVED_CORE_PROPERTIES[code_point] = [prop] - -def to_upper(code_point): - '''Returns the code point of the uppercase version - of the given code point''' - if (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['upper']): - return UNICODE_ATTRIBUTES[code_point]['upper'] - else: - return code_point - -def to_lower(code_point): - '''Returns the code point of the lowercase version - of the given code point''' - if (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['lower']): - return UNICODE_ATTRIBUTES[code_point]['lower'] - else: - return code_point - -def to_title(code_point): - '''Returns the code point of the titlecase version - of the given code point''' - if (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['title']): - return UNICODE_ATTRIBUTES[code_point]['title'] - else: - return code_point - -def is_upper(code_point): - '''Checks whether the character with this code point is uppercase''' - return (to_lower(code_point) != code_point - or (code_point in DERIVED_CORE_PROPERTIES - and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point])) - -def is_lower(code_point): - '''Checks whether the character with this code point is lowercase''' - # Some characters are defined as “Lowercase” in - # DerivedCoreProperties.txt but do not have a mapping to upper - # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is - # one of these. - return (to_upper(code_point) != code_point - # <U00DF> is lowercase, but without simple to_upper mapping. - or code_point == 0x00DF - or (code_point in DERIVED_CORE_PROPERTIES - and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point])) - -def is_alpha(code_point): - '''Checks whether the character with this code point is alphabetic''' - return ((code_point in DERIVED_CORE_PROPERTIES - and - 'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point]) - or - # Consider all the non-ASCII digits as alphabetic. - # ISO C 99 forbids us to have them in category “digit”, - # but we want iswalnum to return true on them. - (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd' - and not (code_point >= 0x0030 and code_point <= 0x0039))) - -def is_digit(code_point): - '''Checks whether the character with this code point is a digit''' - if False: - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd') - # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without - # a zero. Must add <0> in front of them by hand. - else: - # SUSV2 gives us some freedom for the "digit" category, but ISO C 99 - # takes it away: - # 7.25.2.1.5: - # The iswdigit function tests for any wide character that - # corresponds to a decimal-digit character (as defined in 5.2.1). 
- # 5.2.1: - # the 10 decimal digits 0 1 2 3 4 5 6 7 8 9 - return (code_point >= 0x0030 and code_point <= 0x0039) - -def is_outdigit(code_point): - '''Checks whether the character with this code point is outdigit''' - return (code_point >= 0x0030 and code_point <= 0x0039) - -def is_blank(code_point): - '''Checks whether the character with this code point is blank''' - return (code_point == 0x0009 # '\t' - # Category Zs without mention of '<noBreak>' - or (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs' - and '<noBreak>' not in - UNICODE_ATTRIBUTES[code_point]['decomposition'])) - -def is_space(code_point): - '''Checks whether the character with this code point is a space''' - # Don’t make U+00A0 a space. Non-breaking space means that all programs - # should treat it like a punctuation character, not like a space. - return (code_point == 0x0020 # ' ' - or code_point == 0x000C # '\f' - or code_point == 0x000A # '\n' - or code_point == 0x000D # '\r' - or code_point == 0x0009 # '\t' - or code_point == 0x000B # '\v' - # Categories Zl, Zp, and Zs without mention of "<noBreak>" - or (UNICODE_ATTRIBUTES[code_point]['name'] - and - (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp'] - or - (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs'] - and - '<noBreak>' not in - UNICODE_ATTRIBUTES[code_point]['decomposition'])))) - -def is_cntrl(code_point): - '''Checks whether the character with this code point is - a control character''' - return (UNICODE_ATTRIBUTES[code_point]['name'] - and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>' - or - UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp'])) - -def is_xdigit(code_point): - '''Checks whether the character with this code point is - a hexadecimal digit''' - if False: - return (is_digit(code_point) - or (code_point >= 0x0041 and code_point <= 0x0046) - or (code_point >= 0x0061 and code_point <= 0x0066)) - else: - # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99 - # takes it away: - # 7.25.2.1.12: - # The iswxdigit function tests for any wide character that - # corresponds to a hexadecimal-digit character (as defined - # in 6.4.4.1). - # 6.4.4.1: - # hexadecimal-digit: one of - # 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F - return ((code_point >= 0x0030 and code_point <= 0x0039) - or (code_point >= 0x0041 and code_point <= 0x0046) - or (code_point >= 0x0061 and code_point <= 0x0066)) - -def is_graph(code_point): - '''Checks whether the character with this code point is - a graphical character''' - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>' - and not is_space(code_point)) - -def is_print(code_point): - '''Checks whether the character with this code point is printable''' - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>' - and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp']) - -def is_punct(code_point): - '''Checks whether the character with this code point is punctuation''' - if False: - return (UNICODE_ATTRIBUTES[code_point]['name'] - and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P')) - else: - # The traditional POSIX definition of punctuation is every graphic, - # non-alphanumeric character. 
- return (is_graph(code_point) - and not is_alpha(code_point) - and not is_digit(code_point)) - -def is_combining(code_point): - '''Checks whether the character with this code point is - a combining character''' - # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt - # file. In 3.0.1 it was identical to the union of the general categories - # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the - # PropList.txt file, so we take the latter definition. - return (UNICODE_ATTRIBUTES[code_point]['name'] - and - UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me']) - -def is_combining_level3(code_point): - '''Checks whether the character with this code point is - a combining level3 character''' - return (is_combining(code_point) - and - int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200)) - -def ucs_symbol(code_point): - '''Return the UCS symbol string for a Unicode character.''' - if code_point < 0x10000: - return '<U{:04X}>'.format(code_point) - else: - return '<U{:08X}>'.format(code_point) - -def ucs_symbol_range(code_point_low, code_point_high): - '''Returns a string UCS symbol string for a code point range. - - Example: - - <U0041>..<U005A> - ''' - return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high) +import unicode_utils def code_point_ranges(is_class_function): '''Returns a list of ranges of code points for which is_class_function @@ -379,7 +43,7 @@ def code_point_ranges(is_class_function): [[65, 90], [192, 214], [216, 222], [256], … ] ''' cp_ranges = [] - for code_point in sorted(UNICODE_ATTRIBUTES): + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): if is_class_function(code_point): if (cp_ranges and cp_ranges[-1][-1] == code_point - 1): @@ -413,9 +77,9 @@ def output_charclass(i18n_file, class_name, is_class_function): if line.strip(): line += ';' if len(code_point_range) == 1: - range_string = ucs_symbol(code_point_range[0]) + range_string = unicode_utils.ucs_symbol(code_point_range[0]) else: - range_string = ucs_symbol_range( + range_string = unicode_utils.ucs_symbol_range( code_point_range[0], code_point_range[-1]) if len(line+range_string) > max_column: i18n_file.write(line+'/\n') @@ -441,15 +105,15 @@ def output_charmap(i18n_file, map_name, map_function): line = prefix map_string = '' i18n_file.write('%s /\n' %map_name) - for code_point in sorted(UNICODE_ATTRIBUTES): + for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES): mapped = map_function(code_point) if code_point != mapped: if line.strip(): line += ';' map_string = '(' \ - + ucs_symbol(code_point) \ + + unicode_utils.ucs_symbol(code_point) \ + ',' \ - + ucs_symbol(mapped) \ + + unicode_utils.ucs_symbol(mapped) \ + ')' if len(line+map_string) > max_column: i18n_file.write(line+'/\n') @@ -459,110 +123,6 @@ def output_charmap(i18n_file, map_name, map_function): i18n_file.write(line+'\n') i18n_file.write('\n') -def verifications(): - '''Tests whether the is_* functions observe the known restrictions''' - for code_point in sorted(UNICODE_ATTRIBUTES): - # toupper restriction: "Only characters specified for the keywords - # lower and upper shall be specified. 
- if (to_upper(code_point) != code_point - and not (is_lower(code_point) or is_upper(code_point))): - sys.stderr.write( - ('%(sym)s is not upper|lower ' - + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{ - 'sym': ucs_symbol(code_point), - 'c': code_point, - 'uc': to_upper(code_point)}) - # tolower restriction: "Only characters specified for the keywords - # lower and upper shall be specified. - if (to_lower(code_point) != code_point - and not (is_lower(code_point) or is_upper(code_point))): - sys.stderr.write( - ('%(sym)s is not upper|lower ' - + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{ - 'sym': ucs_symbol(code_point), - 'c': code_point, - 'uc': to_lower(code_point)}) - # alpha restriction: "Characters classified as either upper or lower - # shall automatically belong to this class. - if ((is_lower(code_point) or is_upper(code_point)) - and not is_alpha(code_point)): - sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{ - 'sym': ucs_symbol(code_point)}) - # alpha restriction: “No character specified for the keywords cntrl, - # digit, punct or space shall be specified.” - if (is_alpha(code_point) and is_cntrl(code_point)): - sys.stderr.write('%(sym)s is alpha and cntrl\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_alpha(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is alpha and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_alpha(code_point) and is_punct(code_point)): - sys.stderr.write('%(sym)s is alpha and punct\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_alpha(code_point) and is_space(code_point)): - sys.stderr.write('%(sym)s is alpha and space\n' %{ - 'sym': ucs_symbol(code_point)}) - # space restriction: “No character specified for the keywords upper, - # lower, alpha, digit, graph or xdigit shall be specified.” - # upper, lower, alpha already checked above. - if (is_space(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is space and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_space(code_point) and is_graph(code_point)): - sys.stderr.write('%(sym)s is space and graph\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_space(code_point) and is_xdigit(code_point)): - sys.stderr.write('%(sym)s is space and xdigit\n' %{ - 'sym': ucs_symbol(code_point)}) - # cntrl restriction: “No character specified for the keywords upper, - # lower, alpha, digit, punct, graph, print or xdigit shall be - # specified.” upper, lower, alpha already checked above. - if (is_cntrl(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is cntrl and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_punct(code_point)): - sys.stderr.write('%(sym)s is cntrl and punct\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_graph(code_point)): - sys.stderr.write('%(sym)s is cntrl and graph\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_print(code_point)): - sys.stderr.write('%(sym)s is cntrl and print\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_cntrl(code_point) and is_xdigit(code_point)): - sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{ - 'sym': ucs_symbol(code_point)}) - # punct restriction: “No character specified for the keywords upper, - # lower, alpha, digit, cntrl, xdigit or as the <space> character shall - # be specified.” upper, lower, alpha, cntrl already checked above. 
- if (is_punct(code_point) and is_digit(code_point)): - sys.stderr.write('%(sym)s is punct and digit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_punct(code_point) and is_xdigit(code_point)): - sys.stderr.write('%(sym)s is punct and xdigit\n' %{ - 'sym': ucs_symbol(code_point)}) - if (is_punct(code_point) and code_point == 0x0020): - sys.stderr.write('%(sym)s is punct\n' %{ - 'sym': ucs_symbol(code_point)}) - # graph restriction: “No character specified for the keyword cntrl - # shall be specified.” Already checked above. - - # print restriction: “No character specified for the keyword cntrl - # shall be specified.” Already checked above. - - # graph - print relation: differ only in the <space> character. - # How is this possible if there are more than one space character?! - # I think susv2/xbd/locale.html should speak of “space characters”, - # not “space character”. - if (is_print(code_point) - and not (is_graph(code_point) or is_space(code_point))): - sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{ - 'sym': ucs_symbol(code_point)}) - if (not is_print(code_point) - and (is_graph(code_point) or code_point == 0x0020)): - sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{ - 'sym': ucs_symbol(code_point)}) - def read_input_file(filename): '''Reads the original glibc i18n file to get the original head and tail. @@ -648,18 +208,18 @@ def output_tables(i18n_file, unicode_version): + 'program.\n\n') i18n_file.write('% The "upper" class reflects the uppercase ' + 'characters of class "alpha"\n') - output_charclass(i18n_file, 'upper', is_upper) + output_charclass(i18n_file, 'upper', unicode_utils.is_upper) i18n_file.write('% The "lower" class reflects the lowercase ' + 'characters of class "alpha"\n') - output_charclass(i18n_file, 'lower', is_lower) + output_charclass(i18n_file, 'lower', unicode_utils.is_lower) i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is ' + 'reflecting\n') i18n_file.write('% the recommendations in TR 10176 annex A\n') - output_charclass(i18n_file, 'alpha', is_alpha) + output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha) i18n_file.write('% The "digit" class must only contain the ' + 'BASIC LATIN digits, says ISO C 99\n') i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n') - output_charclass(i18n_file, 'digit', is_digit) + output_charclass(i18n_file, 'digit', unicode_utils.is_digit) i18n_file.write('% The "outdigit" information is by default ' + '"0" to "9". 
We don\'t have to\n') i18n_file.write('% provide it here since localedef will fill ' @@ -669,29 +229,30 @@ def output_tables(i18n_file, unicode_version): i18n_file.write('% outdigit /\n') i18n_file.write('% <U0030>..<U0039>\n\n') # output_charclass(i18n_file, 'outdigit', is_outdigit) - output_charclass(i18n_file, 'space', is_space) - output_charclass(i18n_file, 'cntrl', is_cntrl) - output_charclass(i18n_file, 'punct', is_punct) - output_charclass(i18n_file, 'graph', is_graph) - output_charclass(i18n_file, 'print', is_print) + output_charclass(i18n_file, 'space', unicode_utils.is_space) + output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl) + output_charclass(i18n_file, 'punct', unicode_utils.is_punct) + output_charclass(i18n_file, 'graph', unicode_utils.is_graph) + output_charclass(i18n_file, 'print', unicode_utils.is_print) i18n_file.write('% The "xdigit" class must only contain the ' + 'BASIC LATIN digits and A-F, a-f,\n') i18n_file.write('% says ISO C 99 ' + '(sections 7.25.2.1.12 and 6.4.4.1).\n') - output_charclass(i18n_file, 'xdigit', is_xdigit) - output_charclass(i18n_file, 'blank', is_blank) - output_charmap(i18n_file, 'toupper', to_upper) - output_charmap(i18n_file, 'tolower', to_lower) - output_charmap(i18n_file, 'map "totitle";', to_title) + output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit) + output_charclass(i18n_file, 'blank', unicode_utils.is_blank) + output_charmap(i18n_file, 'toupper', unicode_utils.to_upper) + output_charmap(i18n_file, 'tolower', unicode_utils.to_lower) + output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title) i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 ' + 'annex B.1\n') i18n_file.write('% That is, all combining characters (level 2+3).\n') - output_charclass(i18n_file, 'class "combining";', is_combining) + output_charclass(i18n_file, 'class "combining";', + unicode_utils.is_combining) i18n_file.write('% The "combining_level3" class reflects ' + 'ISO/IEC 10646-1 annex B.2\n') i18n_file.write('% That is, combining characters of level 3.\n') - output_charclass(i18n_file, - 'class "combining_level3";', is_combining_level3) + output_charclass(i18n_file, 'class "combining_level3";', + unicode_utils.is_combining_level3) if __name__ == "__main__": PARSER = argparse.ArgumentParser( @@ -739,9 +300,11 @@ if __name__ == "__main__": help='The Unicode version of the input files used.') ARGS = PARSER.parse_args() - fill_attributes(ARGS.unicode_data_file) - fill_derived_core_properties(ARGS.derived_core_properties_file) - verifications() + unicode_utils.fill_attributes( + ARGS.unicode_data_file) + unicode_utils.fill_derived_core_properties( + ARGS.derived_core_properties_file) + unicode_utils.verifications() HEAD = TAIL = '' if ARGS.input_file: (HEAD, TAIL) = read_input_file(ARGS.input_file) diff --git a/localedata/unicode-gen/unicode_utils.py b/localedata/unicode-gen/unicode_utils.py new file mode 100644 index 0000000..ee91582 --- /dev/null +++ b/localedata/unicode-gen/unicode_utils.py @@ -0,0 +1,502 @@ +# Utilities to generate Unicode data for glibc from upstream Unicode data. +# +# Copyright (C) 2014, 2015 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. 
+# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +''' +This module contains utilities used by the scripts to generate +Unicode data for glibc from upstream Unicode data files. +''' + +import sys +import re + +# Dictionary holding the entire contents of the UnicodeData.txt file +# +# Contents of this dictionary look like this: +# +# {0: {'category': 'Cc', +# 'title': None, +# 'digit': '', +# 'name': '<control>', +# 'bidi': 'BN', +# 'combining': '0', +# 'comment': '', +# 'oldname': 'NULL', +# 'decomposition': '', +# 'upper': None, +# 'mirrored': 'N', +# 'lower': None, +# 'decdigit': '', +# 'numeric': ''}, +# … +# } +UNICODE_ATTRIBUTES = {} + +# Dictionary holding the entire contents of the DerivedCoreProperties.txt file +# +# Contents of this dictionary look like this: +# +# {917504: ['Default_Ignorable_Code_Point'], +# 917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'], +# … +# } +DERIVED_CORE_PROPERTIES = {} + +# Dictionary holding the entire contents of the EastAsianWidths.txt file +# +# Contents of this dictionary look like this: +# +# {0: 'N', … , 45430: 'W', …} +EAST_ASIAN_WIDTHS = {} + +def fill_attribute(code_point, fields): + '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields. + + One entry in the UNICODE_ATTRIBUTES dictionary represents one line + in the UnicodeData.txt file. + + ''' + UNICODE_ATTRIBUTES[code_point] = { + 'name': fields[1], # Character name + 'category': fields[2], # General category + 'combining': fields[3], # Canonical combining classes + 'bidi': fields[4], # Bidirectional category + 'decomposition': fields[5], # Character decomposition mapping + 'decdigit': fields[6], # Decimal digit value + 'digit': fields[7], # Digit value + 'numeric': fields[8], # Numeric value + 'mirrored': fields[9], # mirrored + 'oldname': fields[10], # Old Unicode 1.0 name + 'comment': fields[11], # comment + # Uppercase mapping + 'upper': int(fields[12], 16) if fields[12] else None, + # Lowercase mapping + 'lower': int(fields[13], 16) if fields[13] else None, + # Titlecase mapping + 'title': int(fields[14], 16) if fields[14] else None, + } + +def fill_attributes(filename): + '''Stores the entire contents of the UnicodeData.txt file + in the UNICODE_ATTRIBUTES dictionary. + + A typical line for a single code point in UnicodeData.txt looks + like this: + + 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; + + Code point ranges are indicated by pairs of lines like this: + + 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; + 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; + ''' + with open(filename, mode='r') as unicode_data_file: + fields_start = [] + for line in unicode_data_file: + fields = line.strip().split(';') + if len(fields) != 15: + sys.stderr.write( + 'short line in file "%(f)s": %(l)s\n' %{ + 'f': filename, 'l': line}) + exit(1) + if fields[2] == 'Cs': + # Surrogates are UTF-16 artefacts, + # not real characters. Ignore them. 
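# Editorial sketch (not part of this commit): the “, First>”/“, Last>”
# handling just below expands a UnicodeData.txt range pair such as
#
#     4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
#     9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
#
# into one fill_attribute() call per code point in 0x4E00..0x9FCC,
# each stored under the shared name “CJK Ideograph”.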
+ fields_start = [] + continue + if fields[1].endswith(', First>'): + fields_start = fields + fields_start[1] = fields_start[1].split(',')[0][1:] + continue + if fields[1].endswith(', Last>'): + fields[1] = fields[1].split(',')[0][1:] + if fields[1:] != fields_start[1:]: + sys.stderr.write( + 'broken code point range in file "%(f)s": %(l)s\n' %{ + 'f': filename, 'l': line}) + exit(1) + for code_point in range( + int(fields_start[0], 16), + int(fields[0], 16)+1): + fill_attribute(code_point, fields) + fields_start = [] + continue + fill_attribute(int(fields[0], 16), fields) + fields_start = [] + +def fill_derived_core_properties(filename): + '''Stores the entire contents of the DerivedCoreProperties.txt file + in the DERIVED_CORE_PROPERTIES dictionary. + + Lines in DerivedCoreProperties.txt are either a code point range like + this: + + 0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z + + or a single code point like this: + + 00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR + + ''' + with open(filename, mode='r') as derived_core_properties_file: + for line in derived_core_properties_file: + match = re.match( + r'^(?P<codepoint1>[0-9A-F]{4,6})' + + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?' + + r'\s*;\s*(?P<property>[a-zA-Z_]+)', + line) + if not match: + continue + start = match.group('codepoint1') + end = match.group('codepoint2') + if not end: + end = start + for code_point in range(int(start, 16), int(end, 16)+1): + prop = match.group('property') + if code_point in DERIVED_CORE_PROPERTIES: + DERIVED_CORE_PROPERTIES[code_point].append(prop) + else: + DERIVED_CORE_PROPERTIES[code_point] = [prop] + +def fill_east_asian_widths(filename): + '''Stores the entire contents of the EastAsianWidths.txt file + in the EAST_ASIAN_WIDTHS dictionary. + + Lines in EastAsianWidths.txt are either a code point range like + this: + + 9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF> + + or a single code point like this: + + A015;W # Lm YI SYLLABLE WU + ''' + with open(filename, mode='r') as east_asian_widths_file: + for line in east_asian_widths_file: + match = re.match( + r'^(?P<codepoint1>[0-9A-F]{4,6})' + +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?' 
+ +r'\s*;\s*(?P<property>[a-zA-Z]+)', + line) + if not match: + continue + start = match.group('codepoint1') + end = match.group('codepoint2') + if not end: + end = start + for code_point in range(int(start, 16), int(end, 16)+1): + EAST_ASIAN_WIDTHS[code_point] = match.group('property') + +def to_upper(code_point): + '''Returns the code point of the uppercase version + of the given code point''' + if (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['upper']): + return UNICODE_ATTRIBUTES[code_point]['upper'] + else: + return code_point + +def to_lower(code_point): + '''Returns the code point of the lowercase version + of the given code point''' + if (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['lower']): + return UNICODE_ATTRIBUTES[code_point]['lower'] + else: + return code_point + +def to_title(code_point): + '''Returns the code point of the titlecase version + of the given code point''' + if (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['title']): + return UNICODE_ATTRIBUTES[code_point]['title'] + else: + return code_point + +def is_upper(code_point): + '''Checks whether the character with this code point is uppercase''' + return (to_lower(code_point) != code_point + or (code_point in DERIVED_CORE_PROPERTIES + and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point])) + +def is_lower(code_point): + '''Checks whether the character with this code point is lowercase''' + # Some characters are defined as “Lowercase” in + # DerivedCoreProperties.txt but do not have a mapping to upper + # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is + # one of these. + return (to_upper(code_point) != code_point + # <U00DF> is lowercase, but without simple to_upper mapping. + or code_point == 0x00DF + or (code_point in DERIVED_CORE_PROPERTIES + and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point])) + +def is_alpha(code_point): + '''Checks whether the character with this code point is alphabetic''' + return ((code_point in DERIVED_CORE_PROPERTIES + and + 'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point]) + or + # Consider all the non-ASCII digits as alphabetic. + # ISO C 99 forbids us to have them in category “digit”, + # but we want iswalnum to return true on them. + (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd' + and not (code_point >= 0x0030 and code_point <= 0x0039))) + +def is_digit(code_point): + '''Checks whether the character with this code point is a digit''' + if False: + return (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd') + # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without + # a zero. Must add <0> in front of them by hand. + else: + # SUSV2 gives us some freedom for the "digit" category, but ISO C 99 + # takes it away: + # 7.25.2.1.5: + # The iswdigit function tests for any wide character that + # corresponds to a decimal-digit character (as defined in 5.2.1). 
+ # 5.2.1: + # the 10 decimal digits 0 1 2 3 4 5 6 7 8 9 + return (code_point >= 0x0030 and code_point <= 0x0039) + +def is_outdigit(code_point): + '''Checks whether the character with this code point is outdigit''' + return (code_point >= 0x0030 and code_point <= 0x0039) + +def is_blank(code_point): + '''Checks whether the character with this code point is blank''' + return (code_point == 0x0009 # '\t' + # Category Zs without mention of '<noBreak>' + or (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs' + and '<noBreak>' not in + UNICODE_ATTRIBUTES[code_point]['decomposition'])) + +def is_space(code_point): + '''Checks whether the character with this code point is a space''' + # Don’t make U+00A0 a space. Non-breaking space means that all programs + # should treat it like a punctuation character, not like a space. + return (code_point == 0x0020 # ' ' + or code_point == 0x000C # '\f' + or code_point == 0x000A # '\n' + or code_point == 0x000D # '\r' + or code_point == 0x0009 # '\t' + or code_point == 0x000B # '\v' + # Categories Zl, Zp, and Zs without mention of "<noBreak>" + or (UNICODE_ATTRIBUTES[code_point]['name'] + and + (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp'] + or + (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs'] + and + '<noBreak>' not in + UNICODE_ATTRIBUTES[code_point]['decomposition'])))) + +def is_cntrl(code_point): + '''Checks whether the character with this code point is + a control character''' + return (UNICODE_ATTRIBUTES[code_point]['name'] + and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>' + or + UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp'])) + +def is_xdigit(code_point): + '''Checks whether the character with this code point is + a hexadecimal digit''' + if False: + return (is_digit(code_point) + or (code_point >= 0x0041 and code_point <= 0x0046) + or (code_point >= 0x0061 and code_point <= 0x0066)) + else: + # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99 + # takes it away: + # 7.25.2.1.12: + # The iswxdigit function tests for any wide character that + # corresponds to a hexadecimal-digit character (as defined + # in 6.4.4.1). + # 6.4.4.1: + # hexadecimal-digit: one of + # 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F + return ((code_point >= 0x0030 and code_point <= 0x0039) + or (code_point >= 0x0041 and code_point <= 0x0046) + or (code_point >= 0x0061 and code_point <= 0x0066)) + +def is_graph(code_point): + '''Checks whether the character with this code point is + a graphical character''' + return (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>' + and not is_space(code_point)) + +def is_print(code_point): + '''Checks whether the character with this code point is printable''' + return (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>' + and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp']) + +def is_punct(code_point): + '''Checks whether the character with this code point is punctuation''' + if False: + return (UNICODE_ATTRIBUTES[code_point]['name'] + and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P')) + else: + # The traditional POSIX definition of punctuation is every graphic, + # non-alphanumeric character. 
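# Editorial note (not part of this commit): because is_alpha() above
# counts non-ASCII decimal digits (category Nd) as alphabetic, a
# character such as U+0660 ARABIC-INDIC DIGIT ZERO is excluded here
# from “punct” as well as from “digit”; it lands in “alpha” only.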
+ return (is_graph(code_point) + and not is_alpha(code_point) + and not is_digit(code_point)) + +def is_combining(code_point): + '''Checks whether the character with this code point is + a combining character''' + # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt + # file. In 3.0.1 it was identical to the union of the general categories + # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the + # PropList.txt file, so we take the latter definition. + return (UNICODE_ATTRIBUTES[code_point]['name'] + and + UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me']) + +def is_combining_level3(code_point): + '''Checks whether the character with this code point is + a combining level3 character''' + return (is_combining(code_point) + and + int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200)) + +def ucs_symbol(code_point): + '''Return the UCS symbol string for a Unicode character.''' + if code_point < 0x10000: + return '<U{:04X}>'.format(code_point) + else: + return '<U{:08X}>'.format(code_point) + +def ucs_symbol_range(code_point_low, code_point_high): + '''Returns a string UCS symbol string for a code point range. + + Example: + + <U0041>..<U005A> + ''' + return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high) + +def verifications(): + '''Tests whether the is_* functions observe the known restrictions''' + for code_point in sorted(UNICODE_ATTRIBUTES): + # toupper restriction: "Only characters specified for the keywords + # lower and upper shall be specified. + if (to_upper(code_point) != code_point + and not (is_lower(code_point) or is_upper(code_point))): + sys.stderr.write( + ('%(sym)s is not upper|lower ' + + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{ + 'sym': ucs_symbol(code_point), + 'c': code_point, + 'uc': to_upper(code_point)}) + # tolower restriction: "Only characters specified for the keywords + # lower and upper shall be specified. + if (to_lower(code_point) != code_point + and not (is_lower(code_point) or is_upper(code_point))): + sys.stderr.write( + ('%(sym)s is not upper|lower ' + + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{ + 'sym': ucs_symbol(code_point), + 'c': code_point, + 'uc': to_lower(code_point)}) + # alpha restriction: "Characters classified as either upper or lower + # shall automatically belong to this class. + if ((is_lower(code_point) or is_upper(code_point)) + and not is_alpha(code_point)): + sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{ + 'sym': ucs_symbol(code_point)}) + # alpha restriction: “No character specified for the keywords cntrl, + # digit, punct or space shall be specified.” + if (is_alpha(code_point) and is_cntrl(code_point)): + sys.stderr.write('%(sym)s is alpha and cntrl\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_alpha(code_point) and is_digit(code_point)): + sys.stderr.write('%(sym)s is alpha and digit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_alpha(code_point) and is_punct(code_point)): + sys.stderr.write('%(sym)s is alpha and punct\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_alpha(code_point) and is_space(code_point)): + sys.stderr.write('%(sym)s is alpha and space\n' %{ + 'sym': ucs_symbol(code_point)}) + # space restriction: “No character specified for the keywords upper, + # lower, alpha, digit, graph or xdigit shall be specified.” + # upper, lower, alpha already checked above. 
+ if (is_space(code_point) and is_digit(code_point)): + sys.stderr.write('%(sym)s is space and digit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_space(code_point) and is_graph(code_point)): + sys.stderr.write('%(sym)s is space and graph\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_space(code_point) and is_xdigit(code_point)): + sys.stderr.write('%(sym)s is space and xdigit\n' %{ + 'sym': ucs_symbol(code_point)}) + # cntrl restriction: “No character specified for the keywords upper, + # lower, alpha, digit, punct, graph, print or xdigit shall be + # specified.” upper, lower, alpha already checked above. + if (is_cntrl(code_point) and is_digit(code_point)): + sys.stderr.write('%(sym)s is cntrl and digit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_cntrl(code_point) and is_punct(code_point)): + sys.stderr.write('%(sym)s is cntrl and punct\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_cntrl(code_point) and is_graph(code_point)): + sys.stderr.write('%(sym)s is cntrl and graph\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_cntrl(code_point) and is_print(code_point)): + sys.stderr.write('%(sym)s is cntrl and print\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_cntrl(code_point) and is_xdigit(code_point)): + sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{ + 'sym': ucs_symbol(code_point)}) + # punct restriction: “No character specified for the keywords upper, + # lower, alpha, digit, cntrl, xdigit or as the <space> character shall + # be specified.” upper, lower, alpha, cntrl already checked above. + if (is_punct(code_point) and is_digit(code_point)): + sys.stderr.write('%(sym)s is punct and digit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_punct(code_point) and is_xdigit(code_point)): + sys.stderr.write('%(sym)s is punct and xdigit\n' %{ + 'sym': ucs_symbol(code_point)}) + if (is_punct(code_point) and code_point == 0x0020): + sys.stderr.write('%(sym)s is punct\n' %{ + 'sym': ucs_symbol(code_point)}) + # graph restriction: “No character specified for the keyword cntrl + # shall be specified.” Already checked above. + + # print restriction: “No character specified for the keyword cntrl + # shall be specified.” Already checked above. + + # graph - print relation: differ only in the <space> character. + # How is this possible if there are more than one space character?! + # I think susv2/xbd/locale.html should speak of “space characters”, + # not “space character”. 
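# Editorial note (not part of this commit): U+0020 SPACE itself is the
# canonical case of that difference; it is print and space but not
# graph, so neither of the two checks below reports it.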
+ if (is_print(code_point) + and not (is_graph(code_point) or is_space(code_point))): + sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{ + 'sym': ucs_symbol(code_point)}) + if (not is_print(code_point) + and (is_graph(code_point) or code_point == 0x0020)): + sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{ + 'sym': ucs_symbol(code_point)}) diff --git a/localedata/unicode-gen/utf8_compatibility.py b/localedata/unicode-gen/utf8_compatibility.py index b84a1eb..3b7a94c 100755 --- a/localedata/unicode-gen/utf8_compatibility.py +++ b/localedata/unicode-gen/utf8_compatibility.py @@ -30,146 +30,7 @@ To see how this script is used, call it with the “-h” option: import sys import re import argparse - -# Dictionary holding the entire contents of the UnicodeData.txt file -# -# Contents of this dictionary look like this: -# -# {0: {'category': 'Cc', -# 'title': None, -# 'digit': '', -# 'name': '<control>', -# 'bidi': 'BN', -# 'combining': '0', -# 'comment': '', -# 'oldname': 'NULL', -# 'decomposition': '', -# 'upper': None, -# 'mirrored': 'N', -# 'lower': None, -# 'decdigit': '', -# 'numeric': ''}, -# … -# } -UNICODE_ATTRIBUTES = {} - -# Dictionary holding the entire contents of the EastAsianWidths.txt file -# -# Contents of this dictionary look like this: -# -# {0: 'N', … , 45430: 'W', …} -EAST_ASIAN_WIDTHS = {} - -def fill_attribute(code_point, fields): - '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields. - - One entry in the UNICODE_ATTRIBUTES dictionary represents one line - in the UnicodeData.txt file. - - ''' - UNICODE_ATTRIBUTES[code_point] = { - 'name': fields[1], # Character name - 'category': fields[2], # General category - 'combining': fields[3], # Canonical combining classes - 'bidi': fields[4], # Bidirectional category - 'decomposition': fields[5], # Character decomposition mapping - 'decdigit': fields[6], # Decimal digit value - 'digit': fields[7], # Digit value - 'numeric': fields[8], # Numeric value - 'mirrored': fields[9], # mirrored - 'oldname': fields[10], # Old Unicode 1.0 name - 'comment': fields[11], # comment - # Uppercase mapping - 'upper': int(fields[12], 16) if fields[12] else None, - # Lowercase mapping - 'lower': int(fields[13], 16) if fields[13] else None, - # Titlecase mapping - 'title': int(fields[14], 16) if fields[14] else None, - } - -def fill_attributes(filename): - '''Stores the entire contents of the UnicodeData.txt file - in the UNICODE_ATTRIBUTES dictionary. - - A typical line for a single code point in UnicodeData.txt looks - like this: - - 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061; - - Code point ranges are indicated by pairs of lines like this: - - 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;; - 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;; - ''' - with open(filename, mode='r') as unicode_data_file: - fields_start = [] - for line in unicode_data_file: - fields = line.strip().split(';') - if len(fields) != 15: - sys.stderr.write( - 'short line in file "%(f)s": %(l)s\n' %{ - 'f': filename, 'l': line}) - exit(1) - if fields[2] == 'Cs': - # Surrogates are UTF-16 artefacts, - # not real characters. Ignore them.
- fields_start = [] - continue - if fields[1].endswith(', First>'): - fields_start = fields - fields_start[1] = fields_start[1].split(',')[0][1:] - continue - if fields[1].endswith(', Last>'): - fields[1] = fields[1].split(',')[0][1:] - if fields[1:] != fields_start[1:]: - sys.stderr.write( - 'broken code point range in file "%(f)s": %(l)s\n' %{ - 'f': filename, 'l': line}) - exit(1) - for code_point in range( - int(fields_start[0], 16), - int(fields[0], 16)+1): - fill_attribute(code_point, fields) - fields_start = [] - continue - fill_attribute(int(fields[0], 16), fields) - fields_start = [] - -def fill_east_asian_widths(filename): - '''Stores the entire contents of the EastAsianWidths.txt file - in the EAST_ASIAN_WIDTHS dictionary. - - Lines in EastAsianWidths.txt are either a code point range like - this: - - 9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF> - - or a single code point like this: - - A015;W # Lm YI SYLLABLE WU - ''' - with open(filename, mode='r') as east_asian_widths_file: - for line in east_asian_widths_file: - match = re.match( - r'^(?P<codepoint1>[0-9A-F]{4,6})' - +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?' - +r'\s*;\s*(?P<property>[a-zA-Z]+)', - line) - if not match: - continue - start = match.group('codepoint1') - end = match.group('codepoint2') - if not end: - end = start - for code_point in range(int(start, 16), int(end, 16)+1): - EAST_ASIAN_WIDTHS[code_point] = match.group('property') - -def ucs_symbol(code_point): - '''Return the UCS symbol string for a Unicode character.''' - if code_point < 0x10000: - return '<U{:04X}>'.format(code_point) - else: - return '<U{:08X}>'.format(code_point) +import unicode_utils def create_charmap_dictionary(file_name): '''Create a dictionary for all code points found in the CHARMAP @@ -217,10 +78,10 @@ def check_charmap(original_file_name, new_file_name): if ARGS.show_missing_characters: for key in sorted(set(ocharmap)-set(ncharmap)): print('removed: {:s} {:s} {:s}'.format( - ucs_symbol(key), + unicode_utils.ucs_symbol(key), ocharmap[key], - UNICODE_ATTRIBUTES[key]['name'] \ - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \ + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) print('------------------------------------------------------------') changed_charmap = {} for key in set(ocharmap).intersection(set(ncharmap)): @@ -231,21 +92,21 @@ def check_charmap(original_file_name, new_file_name): if ARGS.show_changed_characters: for key in sorted(changed_charmap): print('changed: {:s} {:s}->{:s} {:s}'.format( - ucs_symbol(key), + unicode_utils.ucs_symbol(key), changed_charmap[key][0], changed_charmap[key][1], - UNICODE_ATTRIBUTES[key]['name'] \ - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \ + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) print('------------------------------------------------------------') print('Total added characters in newly generated CHARMAP: %d' %len(set(ncharmap)-set(ocharmap))) if ARGS.show_added_characters: for key in sorted(set(ncharmap)-set(ocharmap)): print('added: {:s} {:s} {:s}'.format( - ucs_symbol(key), + unicode_utils.ucs_symbol(key), ncharmap[key], - UNICODE_ATTRIBUTES[key]['name'] \ - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \ + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) def create_width_dictionary(file_name): '''Create a dictionary for all code points found in the WIDTH @@ -290,20 +151,20 @@ def check_width(original_file_name, 
new_file_name): + 'i.e. these have width 1 now.)') if ARGS.show_missing_characters: for key in sorted(set(owidth)-set(nwidth)): - print('removed: {:s} '.format(ucs_symbol(key)) + print('removed: {:s} '.format(unicode_utils.ucs_symbol(key)) + '{:d} : '.format(owidth[key]) + 'eaw={:s} '.format( - EAST_ASIAN_WIDTHS[key] - if key in EAST_ASIAN_WIDTHS else None) + unicode_utils.EAST_ASIAN_WIDTHS[key] + if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None') + 'category={:2s} '.format( - UNICODE_ATTRIBUTES[key]['category'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['category'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'bidi={:3s} '.format( - UNICODE_ATTRIBUTES[key]['bidi'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['bidi'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'name={:s}'.format( - UNICODE_ATTRIBUTES[key]['name'] - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) print('------------------------------------------------------------') changed_width = {} for key in set(owidth).intersection(set(nwidth)): @@ -313,21 +174,21 @@ def check_width(original_file_name, new_file_name): %len(changed_width)) if ARGS.show_changed_characters: for key in sorted(changed_width): - print('changed width: {:s} '.format(ucs_symbol(key)) + print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key)) + '{:d}->{:d} : '.format(changed_width[key][0], changed_width[key][1]) + 'eaw={:s} '.format( - EAST_ASIAN_WIDTHS[key] - if key in EAST_ASIAN_WIDTHS else None) + unicode_utils.EAST_ASIAN_WIDTHS[key] + if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None') + 'category={:2s} '.format( - UNICODE_ATTRIBUTES[key]['category'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['category'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'bidi={:3s} '.format( - UNICODE_ATTRIBUTES[key]['bidi'] - if key in UNICODE_ATTRIBUTES else None) + unicode_utils.UNICODE_ATTRIBUTES[key]['bidi'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None') + 'name={:s}'.format( - UNICODE_ATTRIBUTES[key]['name'] - if key in UNICODE_ATTRIBUTES else None)) + unicode_utils.UNICODE_ATTRIBUTES[key]['name'] + if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')) print('------------------------------------------------------------') print('Total added characters in newly generated WIDTH: %d' %len(set(nwidth)-set(owidth))) @@ -335,20 +196,20 @@ def check_width(original_file_name, new_file_name): + 'i.e. 
@@ -335,20 +196,20 @@ def check_width(original_file_name, new_file_name):
+          'i.e. these had width 1 before.)')
     if ARGS.show_added_characters:
         for key in sorted(set(nwidth)-set(owidth)):
-            print('added: {:s} '.format(ucs_symbol(key))
+            print('added: {:s} '.format(unicode_utils.ucs_symbol(key))
                   + '{:d} : '.format(nwidth[key])
                   + 'eaw={:s} '.format(
-                      EAST_ASIAN_WIDTHS[key]
-                      if key in EAST_ASIAN_WIDTHS else None)
+                      unicode_utils.EAST_ASIAN_WIDTHS[key]
+                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                   + 'category={:2s} '.format(
-                      UNICODE_ATTRIBUTES[key]['category']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'bidi={:3s} '.format(
-                      UNICODE_ATTRIBUTES[key]['bidi']
-                      if key in UNICODE_ATTRIBUTES else None)
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                   + 'name={:s}'.format(
-                      UNICODE_ATTRIBUTES[key]['name']
-                      if key in UNICODE_ATTRIBUTES else None))
+                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
+                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
 
 if __name__ == "__main__":
     PARSER = argparse.ArgumentParser(
@@ -392,8 +253,8 @@ if __name__ == "__main__":
     ARGS = PARSER.parse_args()
 
     if ARGS.unicode_data_file:
-        fill_attributes(ARGS.unicode_data_file)
+        unicode_utils.fill_attributes(ARGS.unicode_data_file)
     if ARGS.east_asian_width_file:
-        fill_east_asian_widths(ARGS.east_asian_width_file)
+        unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
     check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
     check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py
index f1b88f5..bc84c07 100755
--- a/localedata/unicode-gen/utf8_gen.py
+++ b/localedata/unicode-gen/utf8_gen.py
@@ -29,6 +29,7 @@ It will output UTF-8 file
 import sys
 import re
+import unicode_utils
 
 # Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
 # sections 3.11 and 4.4.
@@ -49,13 +50,6 @@ JAMO_FINAL_SHORT_NAME = (
     'P', 'H'
 )
 
-def ucs_symbol(code_point):
-    '''Return the UCS symbol string for a Unicode character.'''
-    if code_point < 0x10000:
-        return '<U{:04X}>'.format(code_point)
-    else:
-        return '<U{:08X}>'.format(code_point)
-
 def process_range(start, end, outfile, name):
     '''Writes a range of code points into the CHARMAP section of the
     output file
@@ -78,7 +72,7 @@ def process_range(start, end, outfile, name):
                 + JAMO_MEDIAL_SHORT_NAME[index2] \
                 + JAMO_FINAL_SHORT_NAME[index3]
             outfile.write('{:<11s} {:<12s} {:s}\n'.format(
-                ucs_symbol(i), convert_to_hex(i),
+                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                 hangul_syllable_name))
         return
     # UnicodeData.txt file has contains code point ranges like this:
@@ -95,14 +89,14 @@ def process_range(start, end, outfile, name):
     for i in range(int(start, 16), int(end, 16), 64 ):
         if i > (int(end, 16)-64):
             outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
-                ucs_symbol(i),
-                ucs_symbol(int(end,16)),
+                unicode_utils.ucs_symbol(i),
+                unicode_utils.ucs_symbol(int(end,16)),
                 convert_to_hex(i),
                 name))
             break
         outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
-            ucs_symbol(i),
-            ucs_symbol(i+63),
+            unicode_utils.ucs_symbol(i),
+            unicode_utils.ucs_symbol(i+63),
             convert_to_hex(i),
             name))
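For context, the loop in the hunk above is what turns a <..., First>/<..., Last> range from UnicodeData.txt into CHARMAP entries in blocks of 64 code points. A minimal sketch of just that splitting logic, with a local ucs_symbol standing in for unicode_utils.ucs_symbol and the CJK Extension A range 3400..4DB5 as sample input (the sample is illustrative, not taken from the patch):

def ucs_symbol(code_point):
    # Same formatting as unicode_utils.ucs_symbol.
    if code_point < 0x10000:
        return '<U{:04X}>'.format(code_point)
    return '<U{:08X}>'.format(code_point)

def range_lines(start, end):
    '''Split a code point range into 64-code-point CHARMAP lines.'''
    lines = []
    for i in range(int(start, 16), int(end, 16), 64):
        if i > int(end, 16) - 64:
            # The final block may be shorter than 64 code points and
            # ends exactly at the upper bound of the range.
            lines.append('{:s}..{:s}'.format(
                ucs_symbol(i), ucs_symbol(int(end, 16))))
            break
        lines.append('{:s}..{:s}'.format(ucs_symbol(i), ucs_symbol(i + 63)))
    return lines

lines = range_lines('3400', '4DB5')
print(lines[0])   # <U3400>..<U343F>
print(lines[-1])  # <U4D80>..<U4DB5>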
@@ -168,7 +162,7 @@ def process_charmap(flines, outfile):
             # comments, so we keep these comment lines.
                 outfile.write('%')
             outfile.write('{:<11s} {:<12s} {:s}\n'.format(
-                ucs_symbol(int(fields[0], 16)),
+                unicode_utils.ucs_symbol(int(fields[0], 16)),
                 convert_to_hex(int(fields[0], 16)),
                 fields[1]))
 
@@ -230,7 +224,7 @@ def process_width(outfile, ulines, elines):
     for line in ulines:
         fields = line.split(";")
         if fields[4] == "NSM" or fields[2] == "Cf":
-            width_dict[int(fields[0], 16)] = ucs_symbol(
+            width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
                 int(fields[0], 16)) + '\t0'
 
     for line in elines:
@@ -238,7 +232,7 @@ def process_width(outfile, ulines, elines):
         # UnicodeData.txt:
         fields = line.split(";")
         if not '..' in fields[0]:
-            width_dict[int(fields[0], 16)] = ucs_symbol(
+            width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
                 int(fields[0], 16)) + '\t2'
         else:
             code_points = fields[0].split("..")
@@ -247,8 +241,8 @@ def process_width(outfile, ulines, elines):
                 if key in width_dict:
                     del width_dict[key]
             width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format(
-                ucs_symbol(int(code_points[0], 16)),
-                ucs_symbol(int(code_points[1], 16)))
+                unicode_utils.ucs_symbol(int(code_points[0], 16)),
+                unicode_utils.ucs_symbol(int(code_points[1], 16)))
 
     for key in sorted(width_dict):
         outfile.write(width_dict[key]+'\n')
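Taken together, process_width() implements three rules: non-spacing marks (bidi class NSM) and format characters (category Cf) from UnicodeData.txt get width 0, entries from EastAsianWidth.txt get width 2, and an EastAsianWidth.txt range replaces any per-character entries recorded earlier with a single range entry. A condensed, self-contained sketch of those rules on two illustrative input lines (the sample data lines are not taken from the patch):

def ucs_symbol(code_point):
    # Same formatting as unicode_utils.ucs_symbol.
    return ('<U{:04X}>' if code_point < 0x10000
            else '<U{:08X}>').format(code_point)

width_dict = {}

# Rule 1: NSM/Cf characters from UnicodeData.txt get width 0.
for line in ['0300;COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;;;;;']:
    fields = line.split(';')
    if fields[4] == 'NSM' or fields[2] == 'Cf':
        width_dict[int(fields[0], 16)] = \
            ucs_symbol(int(fields[0], 16)) + '\t0'

# Rules 2 and 3: EastAsianWidth.txt entries get width 2; a range
# becomes one <Uxxxx>...<Uyyyy> entry that overrides earlier
# per-character entries.
for line in ['1100..115F;W']:
    fields = line.split(';')
    code_points = fields[0].split('..')
    for key in range(int(code_points[0], 16), int(code_points[1], 16) + 1):
        width_dict.pop(key, None)
    width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format(
        ucs_symbol(int(code_points[0], 16)),
        ucs_symbol(int(code_points[1], 16)))

for key in sorted(width_dict):
    print(width_dict[key])  # <U0300> 0, then <U1100>...<U115F> 2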