From 04fbc779fe06ebb697c7dfe02493ad2fc0f8e1e5 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 22 Jul 2000 21:22:08 +0000 Subject: Update. * iconv/gconv_trans.c: Correct a few bugs in the search loop. Remove remainders of hash table. * locale/categories.def: Remove remainders of transliteration hash table. * locale/langinfo.h: Likewise. * locale/programs/ld-ctype.c: Likewise. Fix code to write out transliteration tables. * locale/gen-translit.pl: New file. * locale/C-translit.h.in: New file. * locale/C-ctype.c: Include C-translit.h. Initialize transliteration data pointers with data from this file. * locale/Makefile (distribute): Add C-translit.h.in, C-translit.h, and gen-translit.pl. Add rule to generate C-translit.h. --- locale/C-ctype.c | 13 +++-- locale/C-translit.h | 21 +++++++ locale/C-translit.h.in | 97 +++++++++++++++++++++++++++++++ locale/Makefile | 8 +++ locale/categories.def | 3 +- locale/gen-translit.pl | 142 +++++++++++++++++++++++++++++++++++++++++++++ locale/langinfo.h | 3 +- locale/programs/ld-ctype.c | 18 +++--- 8 files changed, 284 insertions(+), 21 deletions(-) create mode 100644 locale/C-translit.h create mode 100644 locale/C-translit.h.in create mode 100644 locale/gen-translit.pl (limited to 'locale') diff --git a/locale/C-ctype.c b/locale/C-ctype.c index e93a585..fe1e8ac 100644 --- a/locale/C-ctype.c +++ b/locale/C-ctype.c @@ -20,6 +20,8 @@ #include "localeinfo.h" #include +#include "C-translit.h" + /* This table's entries are taken from POSIX.2 Table 2-6 ``LC_CTYPE Category Definition in the POSIX Locale''. @@ -420,12 +422,11 @@ const struct locale_data _nl_C_LC_CTYPE = { word: L'7' }, { word: L'8' }, { word: L'9' }, - { word: 0 }, - { word: 0 }, - { string: NULL }, - { string: NULL }, - { string: NULL }, - { string: NULL }, + { word: NTRANSLIT }, + { wstr: translit_from_idx }, + { wstr: (uint32_t *) translit_from_tbl }, + { wstr: translit_to_idx }, + { wstr: (uint32_t *) translit_to_tbl }, { word: 1 }, { wstr: (uint32_t *) L"?" 
}, { word: 0 }, diff --git a/locale/C-translit.h b/locale/C-translit.h new file mode 100644 index 0000000..2d42133 --- /dev/null +++ b/locale/C-translit.h @@ -0,0 +1,21 @@ +#define NTRANSLIT 20 +static const uint32_t translit_from_idx[] = +{ + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, + 24, 26, 28, 30, 32, 34, 36, 38 +}; +static const wchar_t translit_from_tbl[] = + L"\xa9" L"\0" L"\xab" L"\0" L"\xae" L"\0" L"\xbb" L"\0" L"\xbc" L"\0" + L"\xbd" L"\0" L"\xbe" L"\0" L"\xc4" L"\0" L"\xc5" L"\0" L"\xc6" L"\0" + L"\xd6" L"\0" L"\xdc" L"\0" L"\xdf" L"\0" L"\xe4" L"\0" L"\xe5" L"\0" + L"\xe6" L"\0" L"\xf6" L"\0" L"\xfc" L"\0" L"\x201c" L"\0" L"\x201d"; +static const uint32_t translit_to_idx[] = +{ + 0, 5, 9, 14, 18, 23, 28, 33, 37, 41, 45, 49, + 53, 57, 61, 65, 69, 73, 77, 80 +}; +static const wchar_t translit_to_tbl[] = + L"(C)\0" L"\0" L"<<\0" L"\0" L"(R)\0" L"\0" L">>\0" L"\0" L"1/4\0" L"\0" + L"1/2\0" L"\0" L"3/4\0" L"\0" L"AE\0" L"\0" L"AA\0" L"\0" L"AE\0" L"\0" + L"OE\0" L"\0" L"UE\0" L"\0" L"ss\0" L"\0" L"ae\0" L"\0" L"aa\0" L"\0" + L"ae\0" L"\0" L"oe\0" L"\0" L"ue\0" L"\0" L"\"\0" L"\0" L"\"\0"; diff --git a/locale/C-translit.h.in b/locale/C-translit.h.in new file mode 100644 index 0000000..e2f711e --- /dev/null +++ b/locale/C-translit.h.in @@ -0,0 +1,97 @@ +/* Transliteration for the C locale. + Copyright (C) 2000 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper , 2000. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. 
+ + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* The entries here have to be sorted relative to the input string. */ + +/* COPYRIGHT SIGN. */ +"\xa9" "(C)" + +/* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK. */ +"\xab" "<<" + +/* REGISTERED SIGN. */ +"\xae" "(R)" + +/* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK. */ +"\xbb" ">>" + +/* VULGAR FRACTION ONE QUARTER. */ +"\xbc" "1/4" + +/* VULGAR FRACTION ONE HALF. */ +"\xbd" "1/2" + +/* VULGAR FRACTION THREE QUARTERS. */ +"\xbe" "3/4" + +/* LATIN CAPITAL LETTER A WITH DIAERESIS. */ +/* XXX It is not clear whether this is the best transliteration for + all locales. If not, we probably have to take it out completely. */ +"\xc4" "AE" + +/* LATIN CAPITAL LETTER A WITH RING ABOVE. */ +/* XXX It is not clear whether this is the best transliteration for + all locales. If not, we probably have to take it out completely. */ +"\xc5" "AA" + +/* LATIN CAPITAL LETTER AE. */ +"\xc6" "AE" + +/* LATIN CAPITAL LETTER O WITH DIAERESIS. */ +/* XXX It is not clear whether this is the best transliteration for + all locales. If not, we probably have to take it out completely. */ +"\xd6" "OE" + +/* LATIN CAPITAL LETTER U WITH DIAERESIS. */ +/* XXX It is not clear whether this is the best transliteration for + all locales. If not, we probably have to take it out completely. */ +"\xdc" "UE" + +/* LATIN SMALL LETTER SHARP S. */ +"\xdf" "ss" + +/* LATIN SMALL LETTER A WITH DIAERESIS. */ +/* XXX It is not clear whether this is the best transliteration for + all locales. If not, we probably have to take it out completely. */ +"\xe4" "ae" + +/* LATIN SMALL LETTER A WITH RING ABOVE. */ +/* XXX It is not clear whether this is the best transliteration for + all locales. If not, we probably have to take it out completely. 
*/ +"\xe5" "aa" + +/* LATIN SMALL LETTER AE. */ +"\xe6" "ae" + +/* LATIN SMALL LETTER O WITH DIAERESIS. */ +/* XXX It is not clear whether this is the best transliteration for + all locales. If not, we probably have to take it out completely. */ +"\xf6" "oe" + +/* LATIN SMALL LETTER U WITH DIAERESIS. */ +/* XXX It is not clear whether this is the best transliteration for + all locales. If not, we probably have to take it out completely. */ +"\xfc" "ue" + +/* LEFT DOUBLE QUOTATION MARK. */ +"\x201c" "\"" + +/* RIGHT DOUBLE QUOTATION MARK. */ +"\x201d" "\"" diff --git a/locale/Makefile b/locale/Makefile index db71cc2..2825a69 100644 --- a/locale/Makefile +++ b/locale/Makefile @@ -25,6 +25,7 @@ headers = locale.h langinfo.h xlocale.h distribute = localeinfo.h categories.def iso-639.def iso-3166.def \ iso-4217.def weight.h weightwc.h strlen-hash.h elem-hash.h \ indigits.h indigitswc.h outdigits.h outdigitswc.h \ + C-translit.h.in C-translit.h gen-translit.pl \ $(addprefix programs/, \ locale.c localedef.c \ $(localedef-modules:=.c) $(locale-modules:=.c) \ @@ -73,6 +74,13 @@ $(objpfx)localedef: $(localedef-modules:%=$(objpfx)%.o) $(objpfx)locale: $(locale-modules:%=$(objpfx)%.o) $(objpfx)localedef $(objpfx)locale: $(lib-modules:%=$(objpfx)%.o) +C-translit.h: C-translit.h.in gen-translit.pl + $(PERL) gen-translit.pl < $< > $@.tmp + $(move-if-change) $@.tmp $@ +ifeq ($(with-cvs),yes) + test ! 
-d CVS || cvs $(CVSOPTS) commit -mRegenerated $@ +endif + localepath = "$(localedir):$(i18ndir)" locale-CPPFLAGS := -DLOCALE_PATH='$(localepath)' \ diff --git a/locale/categories.def b/locale/categories.def index 8e5e65a..a8fa30e 100644 --- a/locale/categories.def +++ b/locale/categories.def @@ -126,8 +126,7 @@ DEFINE_CATEGORY DEFINE_ELEMENT (_NL_CTYPE_OUTDIGIT7_WC, "ctype-outdigit7_wc", std, word) DEFINE_ELEMENT (_NL_CTYPE_OUTDIGIT8_WC, "ctype-outdigit8_wc", std, word) DEFINE_ELEMENT (_NL_CTYPE_OUTDIGIT9_WC, "ctype-outdigit9_wc", std, word) - DEFINE_ELEMENT (_NL_CTYPE_TRANSLIT_HASH_SIZE, "ctype-translit-hash-size", std, word) - DEFINE_ELEMENT (_NL_CTYPE_TRANSLIT_HASH_LAYERS, "ctype-translit-hash-layers", std, word) + DEFINE_ELEMENT (_NL_CTYPE_TRANSLIT_TAB_SIZE, "ctype-translit-tab-size", std, word) DEFINE_ELEMENT (_NL_CTYPE_TRANSLIT_FROM_IDX, "ctype-translit-from-idx", std, string) DEFINE_ELEMENT (_NL_CTYPE_TRANSLIT_FROM_TBL, "ctype-translit-from-tbl", std, string) DEFINE_ELEMENT (_NL_CTYPE_TRANSLIT_TO_IDX, "ctype-translit-to-idx", std, string) diff --git a/locale/gen-translit.pl b/locale/gen-translit.pl new file mode 100644 index 0000000..b6fba77 --- /dev/null +++ b/locale/gen-translit.pl @@ -0,0 +1,142 @@ +#! /usr/bin/perl -w +open F, "cat C-translit.h.in | gcc -E - |" or die "Cannot preprocess input file"; + + +sub cstrlen { + my($str) = @_; + my($len) = length($str); + my($cnt); + my($res) = 0; + + for ($cnt = 0; $cnt < $len; ++$cnt) { + if (substr($str, $cnt, 1) eq '\\') { + # Recognize the escape sequence.
+ if (substr($str, $cnt + 1, 1) eq 'x') { + my($inner); + for ($inner = $cnt + 2; $inner < $len && $inner < $cnt + 10; ++$inner) { + my($ch) = substr($str, $inner, 1); + next if (($ch ge '0' && $ch le '9') + || ($ch ge 'a' && $ch le 'f') + || ($ch ge 'A' && $ch le 'F')); + last; + } + $cnt = $inner - 1; + ++$res; + } else { + die "invalid input" if ($cnt + 1 >= $len); + ++$res; + ++$cnt; + } + } else { + ++$res; + } + } + + return $res; +} + +while (<F>) { + next if (/^#/); + next if (/^[ ]*$/); + chomp; + + if (/"([^\"]*)"[ ]*"(.*)"/) { + my($from) = $1; + my($to) = $2; + my($fromlen) = cstrlen($from); + my($tolen) = cstrlen($to); + + push(@froms, $from); + push(@fromlens, $fromlen); + push(@tos, $to); + push(@tolens, $tolen); + } +} + +printf "#define NTRANSLIT %d\n", $#froms + 1; + +printf "static const uint32_t translit_from_idx[] =\n{\n "; +$col = 2; +$total = 0; +for ($cnt = 0; $cnt <= $#fromlens; ++$cnt) { + if ($cnt != 0) { + if ($col + 7 >= 79) { + printf(",\n "); + $col = 2; + } else { + printf(", "); + $col += 2; + } + } + printf("%4d", $total); + $total += $fromlens[$cnt] + 1; + $col += 4; +} +printf("\n};\n"); + +printf "static const wchar_t translit_from_tbl[] =\n "; +$col = 1; +for ($cnt = 0; $cnt <= $#froms; ++$cnt) { + if ($cnt != 0) { + if ($col + 6 >= 79) { + printf("\n "); + $col = 1; + } + printf(" L\"\\0\""); + $col += 6; + } + if ($col > 2 && $col + length($froms[$cnt]) + 4 >= 79) { + printf("\n "); + $col = 2; + } else { + printf(" "); + ++$col; + } + printf("L\"$froms[$cnt]\""); + $col += length($froms[$cnt]) + 3; +} +printf(";\n"); + +printf "static const uint32_t translit_to_idx[] =\n{\n "; +$col = 2; +$total = 0; +for ($cnt = 0; $cnt <= $#tolens; ++$cnt) { + if ($cnt != 0) { + if ($col + 7 >= 79) { + printf(",\n "); + $col = 2; + } else { + printf(", "); + $col += 2; + } + } + printf("%4d", $total); + $total += $tolens[$cnt] + 2; + $col += 4; +} +printf("\n};\n"); + +printf "static const wchar_t translit_to_tbl[] =\n "; +$col = 1; +for ($cnt =
0; $cnt <= $#tos; ++$cnt) { + if ($cnt != 0) { + if ($col + 6 >= 79) { + printf("\n "); + $col = 1; + } + printf(" L\"\\0\""); + $col += 6; + } + if ($col > 2 && $col + length($tos[$cnt]) + 6 >= 79) { + printf("\n "); + $col = 2; + } else { + printf(" "); + ++$col; + } + printf("L\"$tos[$cnt]\\0\""); + $col += length($tos[$cnt]) + 5; +} +printf(";\n"); + +exit 0; diff --git a/locale/langinfo.h b/locale/langinfo.h index d2cc2a8..69d7292 100644 --- a/locale/langinfo.h +++ b/locale/langinfo.h @@ -316,8 +316,7 @@ enum _NL_CTYPE_OUTDIGIT7_WC, _NL_CTYPE_OUTDIGIT8_WC, _NL_CTYPE_OUTDIGIT9_WC, - _NL_CTYPE_TRANSLIT_HASH_SIZE, - _NL_CTYPE_TRANSLIT_HASH_LAYERS, + _NL_CTYPE_TRANSLIT_TAB_SIZE, _NL_CTYPE_TRANSLIT_FROM_IDX, _NL_CTYPE_TRANSLIT_FROM_TBL, _NL_CTYPE_TRANSLIT_TO_IDX, diff --git a/locale/programs/ld-ctype.c b/locale/programs/ld-ctype.c index e297aeb..5dfcec3 100644 --- a/locale/programs/ld-ctype.c +++ b/locale/programs/ld-ctype.c @@ -173,13 +173,11 @@ struct locale_ctype_t unsigned char *width; uint32_t mb_cur_max; const char *codeset_name; - uint32_t translit_hash_size; - uint32_t translit_hash_layers; uint32_t *translit_from_idx; uint32_t *translit_from_tbl; uint32_t *translit_to_idx; uint32_t *translit_to_tbl; - size_t translit_idx_size; + uint32_t translit_idx_size; size_t translit_from_tbl_size; size_t translit_to_tbl_size; @@ -866,7 +864,7 @@ ctype_output (struct localedef_t *locale, struct charmap_t *charmap, { #define CTYPE_EMPTY(name) \ case name: \ - iov[2 + elem + offset].iov_base = ""; \ + iov[2 + elem + offset].iov_base = (void *) ""; \ iov[2 + elem + offset].iov_len = 0; \ idx[elem + 1] = idx[elem]; \ break @@ -911,14 +909,12 @@ ctype_output (struct localedef_t *locale, struct charmap_t *charmap, ctype->names, (ctype->plane_size * ctype->plane_cnt * sizeof (uint32_t))); - CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_SIZE, - &ctype->translit_hash_size, sizeof (uint32_t)); - CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_LAYERS, - &ctype->translit_hash_layers, sizeof 
(uint32_t)); + CTYPE_DATA (_NL_CTYPE_TRANSLIT_TAB_SIZE, + &ctype->translit_idx_size, sizeof (uint32_t)); CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_IDX, ctype->translit_from_idx, - ctype->translit_idx_size); + ctype->translit_idx_size * sizeof (uint32_t)); CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_TBL, ctype->translit_from_tbl, @@ -926,7 +922,7 @@ ctype_output (struct localedef_t *locale, struct charmap_t *charmap, CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_IDX, ctype->translit_to_idx, - ctype->translit_idx_size); + ctype->translit_idx_size * sizeof (uint32_t)); CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_TBL, ctype->translit_to_tbl, ctype->translit_to_tbl_size); @@ -3664,7 +3660,7 @@ Computing table size for character classes might take a while..."), } /* Store the information about the length. */ - ctype->translit_idx_size = number * sizeof (uint32_t); + ctype->translit_idx_size = number; ctype->translit_from_tbl_size = from_len * sizeof (uint32_t); ctype->translit_to_tbl_size = to_len * sizeof (uint32_t); } -- cgit v1.1