From b721a2c03c079f8939fae82823b6f75ef5b6639b Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Thu, 12 Apr 2001 20:26:40 +0000 Subject: Update. 2001-04-12 Bruno Haible * iconvdata/TESTS2: New file. * iconvdata/run-iconv-test.sh: Also run tests from TESTS2. * iconvdata/testdata/alfabeta..UTF-8: New file. * iconvdata/testdata/alfabeta..UTF-16.BE: New file. * iconvdata/testdata/alfabeta..UTF-16.LE: New file. * iconvdata/testdata/alfabeta..UTF-32.BE: New file. * iconvdata/testdata/alfabeta..UTF-32.LE: New file. 2001-04-11 Bruno Haible * iconvdata/utf-32.c: New file. * iconvdata/gconv-modules: Add entries for UTF-32, UTF-32LE, UTF-32BE. * iconvdata/Makefile (modules): Add UTF-32. (distribute): Add utf-32.c. 2001-04-11 Bruno Haible * iconvdata/utf-16.c (PREPARE_LOOP): Initialize 'swap' after possibly changing it in the state. After incrementing 'inptr', store it back. * iconvdata/unicode.c (PREPARE_LOOP): After incrementing 'inptr', store it back. 2001-04-11 Bruno Haible * iconvdata/utf-16.c (gconv_init): Use MAX_NEEDED_FROM, not MIN_NEEDED_FROM. 
--- iconvdata/Makefile | 4 +- iconvdata/TESTS2 | 27 ++++ iconvdata/gconv-modules | 12 ++ iconvdata/run-iconv-test.sh | 45 +++++- iconvdata/testdata/alfabeta..UTF-16.BE | Bin 0 -> 644 bytes iconvdata/testdata/alfabeta..UTF-16.LE | Bin 0 -> 644 bytes iconvdata/testdata/alfabeta..UTF-32.BE | Bin 0 -> 1068 bytes iconvdata/testdata/alfabeta..UTF-32.LE | Bin 0 -> 1068 bytes iconvdata/testdata/alfabeta..UTF-8 | 6 + iconvdata/unicode.c | 4 +- iconvdata/utf-16.c | 15 +- iconvdata/utf-32.c | 270 +++++++++++++++++++++++++++++++++ 12 files changed, 371 insertions(+), 12 deletions(-) create mode 100644 iconvdata/TESTS2 create mode 100644 iconvdata/testdata/alfabeta..UTF-16.BE create mode 100644 iconvdata/testdata/alfabeta..UTF-16.LE create mode 100644 iconvdata/testdata/alfabeta..UTF-32.BE create mode 100644 iconvdata/testdata/alfabeta..UTF-32.LE create mode 100644 iconvdata/testdata/alfabeta..UTF-8 create mode 100644 iconvdata/utf-32.c (limited to 'iconvdata') diff --git a/iconvdata/Makefile b/iconvdata/Makefile index 252c6ca..7c9c628 100644 --- a/iconvdata/Makefile +++ b/iconvdata/Makefile @@ -46,7 +46,7 @@ modules := ISO8859-1 ISO8859-2 ISO8859-3 ISO8859-4 ISO8859-5 \ INIS-CYRILLIC ISO_6937-2 ISO_2033 ISO_5427 ISO_5427-EXT \ ISO_5428 ISO_10367-BOX MAC-IS MAC-UK NATS-DANO NATS-SEFI \ SAMI-WS2 ISO-IR-197 TIS-620 KOI8-U GBK ISIRI-3342 GBGBK \ - ISO-2022-CN libISOIR165 UTF-16 UNICODE UTF-7 BIG5HKSCS \ + ISO-2022-CN libISOIR165 UTF-16 UNICODE UTF-32 UTF-7 BIG5HKSCS \ GB18030 ISO-2022-CN-EXT VISCII GBBIG5 modules.so := $(addsuffix .so, $(modules)) @@ -134,7 +134,7 @@ distribute := gconv-modules extra-module.mk gap.awk gaptab.awk \ macintosh.c mac-is.c mac-uk.c nats-dano.c nats-sefi.c sjis.c \ t.61.c uhc.c sami-ws2.c iso-ir-197.c tis-620.c koi8-u.c \ isiri-3342.c isiri-3342.h gbgbk.c iso-2022-cn.c cns11643l2.h \ - iso8859-16.c utf-16.c unicode.c utf-7.c big5hkscs.c \ + iso8859-16.c utf-16.c unicode.c utf-32.c utf-7.c big5hkscs.c \ iso-ir-165.c iso-ir-165.h gb18030.c 
iso-2022-cn-ext.c \ ibm932.c ibm932.h ibm943.c ibm943.h gbbig5.c diff --git a/iconvdata/TESTS2 b/iconvdata/TESTS2 new file mode 100644 index 0000000..7d4ec00 --- /dev/null +++ b/iconvdata/TESTS2 @@ -0,0 +1,27 @@ +# Tests for endianness dependent iconv(1) (and therefore iconv(3)) in GNU libc. +# Copyright (C) 2001 Free Software Foundation, Inc. +# This file is part of the GNU C Library. +# Contributed by Bruno Haible , 2001. +# +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Library General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Library General Public License for more details. +# +# You should have received a copy of the GNU Library General Public +# License along with the GNU C Library; see the file COPYING.LIB. If not, +# write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +# Boston, MA 02111-1307, USA. + +# Each line consists of three fields: +# 1. The endianness independent encoding. +# 2. The endianness dependent encoding. +# 3. The filename stem. 
+ +UTF-8 UTF-16 alfabeta +UTF-8 UTF-32 alfabeta diff --git a/iconvdata/gconv-modules b/iconvdata/gconv-modules index 79b4b1f..3bbe11c 100644 --- a/iconvdata/gconv-modules +++ b/iconvdata/gconv-modules @@ -1319,6 +1319,18 @@ module UNICODE// INTERNAL UNICODE 1 module INTERNAL UNICODE// UNICODE 1 # from to module cost +module UTF-32// INTERNAL UTF-32 1 +module INTERNAL UTF-32// UTF-32 1 + +# from to module cost +module UTF-32LE// INTERNAL UTF-32 1 +module INTERNAL UTF-32LE// UTF-32 1 + +# from to module cost +module UTF-32BE// INTERNAL UTF-32 1 +module INTERNAL UTF-32BE// UTF-32 1 + +# from to module cost module UTF-7// INTERNAL UTF-7 1 module INTERNAL UTF-7// UTF-7 1 diff --git a/iconvdata/run-iconv-test.sh b/iconvdata/run-iconv-test.sh index 3388e80..0d307a3 100755 --- a/iconvdata/run-iconv-test.sh +++ b/iconvdata/run-iconv-test.sh @@ -127,7 +127,7 @@ while read from to subset targets; do fi if test "$subset" != Y; then - echo $ac_n " suntzu: ASCII -> $to -> ASCII $ac_c" + echo $ac_n " suntzu: ASCII -> $to -> ASCII $ac_c" $PROG -f ASCII -t $to testdata/suntzus | $PROG -f $to -t ASCII > $temp1 || { if test $? -gt 128; then exit 1; fi @@ -139,6 +139,49 @@ while read from to subset targets; do fi done < TESTS +# We read the file named TESTS2. All non-empty lines not starting with +# `#' are interpreted as commands. +while read utf8 from filename; do + # Ignore empty and comment lines. + if test -z "$filename" || test "$utf8" = '#'; then continue; fi + + # Expand the variables now. + PROG=`eval echo $ICONV` + + # Test conversion to the endianness dependent encoding. + echo $ac_n "test encoder: $utf8 -> $from $ac_c" + $PROG -f $utf8 -t $from < testdata/${filename}..${utf8} > $temp1 + cmp $temp1 testdata/${filename}..${from}.BE > /dev/null 2>&1 || + cmp $temp1 testdata/${filename}..${from}.LE > /dev/null 2>&1 || + { echo "/FAILED"; failed=1; continue; } + echo "OK" + + # Test conversion from the endianness dependent encoding. 
+ echo $ac_n "test decoder: $from -> $utf8 $ac_c" + $PROG -f $from -t $utf8 < testdata/${filename}..${from}.BE > $temp1 + cmp $temp1 testdata/${filename}..${utf8} > /dev/null 2>&1 || + { echo "/FAILED"; failed=1; continue; } + $PROG -f $from -t $utf8 < testdata/${filename}..${from}.LE > $temp1 + cmp $temp1 testdata/${filename}..${utf8} > /dev/null 2>&1 || + { echo "/FAILED"; failed=1; continue; } + echo "OK" + + # Test byte swapping behaviour. + echo $ac_n "test non-BOM: ${from}BE -> ${from}LE $ac_c" + $PROG -f ${from}BE -t ${from}LE < testdata/${filename}..${from}.BE > $temp1 + cmp $temp1 testdata/${filename}..${from}.LE > /dev/null 2>&1 || + { echo "/FAILED"; failed=1; continue; } + echo "OK" + + # Test byte swapping behaviour. + echo $ac_n "test non-BOM: ${from}LE -> ${from}BE $ac_c" + $PROG -f ${from}LE -t ${from}BE < testdata/${filename}..${from}.LE > $temp1 + cmp $temp1 testdata/${filename}..${from}.BE > /dev/null 2>&1 || + { echo "/FAILED"; failed=1; continue; } + echo "OK" + +done < TESTS2 + exit $failed # Local Variables: # mode:shell-script diff --git a/iconvdata/testdata/alfabeta..UTF-16.BE b/iconvdata/testdata/alfabeta..UTF-16.BE new file mode 100644 index 0000000..904d9be Binary files /dev/null and b/iconvdata/testdata/alfabeta..UTF-16.BE differ diff --git a/iconvdata/testdata/alfabeta..UTF-16.LE b/iconvdata/testdata/alfabeta..UTF-16.LE new file mode 100644 index 0000000..4f3dd2a Binary files /dev/null and b/iconvdata/testdata/alfabeta..UTF-16.LE differ diff --git a/iconvdata/testdata/alfabeta..UTF-32.BE b/iconvdata/testdata/alfabeta..UTF-32.BE new file mode 100644 index 0000000..6aa4dcb Binary files /dev/null and b/iconvdata/testdata/alfabeta..UTF-32.BE differ diff --git a/iconvdata/testdata/alfabeta..UTF-32.LE b/iconvdata/testdata/alfabeta..UTF-32.LE new file mode 100644 index 0000000..2451dac Binary files /dev/null and b/iconvdata/testdata/alfabeta..UTF-32.LE differ diff --git a/iconvdata/testdata/alfabeta..UTF-8 b/iconvdata/testdata/alfabeta..UTF-8 
new file mode 100644 index 0000000..4229c88 --- /dev/null +++ b/iconvdata/testdata/alfabeta..UTF-8 @@ -0,0 +1,6 @@ +ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ <- Greek +𐌀𐌁𐌂𐌃𐌄𐌅𐌆𐌇𐌈𐌉𐌊𐌋𐌌𐌍𐌎𐌏𐌐𐌑𐌒𐌓𐌔𐌕𐌖𐌗𐌘𐌙𐌚𐌛𐌜𐌝 <- Etruscan +ABCDEFGHIJKLMNOPQRSTUVWXYZ <- Latin +АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ <- Cyrillic +𐌰𐌱𐌲𐌳𐌴𐌵𐌶𐌷𐌸𐌹𐌺𐌻𐌼𐌽𐌾𐌿𐍀𐍁𐍂𐍃𐍄𐍅𐍆𐍇𐍈 <- Gothic +אבגדהוזחטיךכלםמןנסעףפץצקרש <- Hebrew diff --git a/iconvdata/unicode.c b/iconvdata/unicode.c index be72f4f..89ec81f 100644 --- a/iconvdata/unicode.c +++ b/iconvdata/unicode.c @@ -53,11 +53,11 @@ \ if (get16u (inptr) == BOM) \ /* Simply ignore the BOM character. */ \ - inptr += 2; \ + *inptrp = inptr += 2; \ else if (get16u (inptr) == BOM_OE) \ { \ ((struct unicode_data *) step->__data)->swap = 1; \ - inptr += 2; \ + *inptrp = inptr += 2; \ } \ } \ } \ diff --git a/iconvdata/utf-16.c b/iconvdata/utf-16.c index c40e296..6001b36 100644 --- a/iconvdata/utf-16.c +++ b/iconvdata/utf-16.c @@ -1,5 +1,5 @@ /* Conversion module for UTF-16. - Copyright (C) 1999, 2000 Free Software Foundation, Inc. + Copyright (C) 1999, 2000, 2001 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1999. @@ -44,7 +44,7 @@ #define PREPARE_LOOP \ enum direction dir = ((struct utf16_data *) step->__data)->dir; \ enum variant var = ((struct utf16_data *) step->__data)->var; \ - int swap = ((struct utf16_data *) step->__data)->swap; \ + int swap; \ if (FROM_DIRECTION && var == UTF_16) \ { \ if (data->__invocation_counter == 0) \ @@ -55,11 +55,11 @@ \ if (get16u (inptr) == BOM) \ /* Simply ignore the BOM character. 
*/ \ - inptr += 2; \ + *inptrp = inptr += 2; \ else if (get16u (inptr) == BOM_OE) \ { \ ((struct utf16_data *) step->__data)->swap = 1; \ - inptr += 2; \ + *inptrp = inptr += 2; \ } \ } \ } \ @@ -72,7 +72,8 @@ \ put16u (outbuf, BOM); \ outbuf += 2; \ - } + } \ + swap = ((struct utf16_data *) step->__data)->swap; #define EXTRA_LOOP_ARGS , var, swap @@ -159,7 +160,7 @@ gconv_init (struct __gconv_step *step) if (dir == from_utf16) { step->__min_needed_from = MIN_NEEDED_FROM; - step->__max_needed_from = MIN_NEEDED_FROM; + step->__max_needed_from = MAX_NEEDED_FROM; step->__min_needed_to = MIN_NEEDED_TO; step->__max_needed_to = MIN_NEEDED_TO; } @@ -168,7 +169,7 @@ gconv_init (struct __gconv_step *step) step->__min_needed_from = MIN_NEEDED_TO; step->__max_needed_from = MIN_NEEDED_TO; step->__min_needed_to = MIN_NEEDED_FROM; - step->__max_needed_to = MIN_NEEDED_FROM; + step->__max_needed_to = MAX_NEEDED_FROM; } step->__stateful = 0; diff --git a/iconvdata/utf-32.c b/iconvdata/utf-32.c new file mode 100644 index 0000000..2e245c9 --- /dev/null +++ b/iconvdata/utf-32.c @@ -0,0 +1,270 @@ +/* Conversion module for UTF-32. + Copyright (C) 1999, 2000, 2001 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. 
If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#include <byteswap.h> +#include <dlfcn.h> +#include <gconv.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +/* This is the Byte Order Mark character (BOM). */ +#define BOM 0x0000feffu +/* And in the other byte order. */ +#define BOM_OE 0xfffe0000u + + +/* Definitions used in the body of the `gconv' function. PREPARE_LOOP loads the direction, variant and swap flag stored in the step data. For the plain UTF-32 variant it also handles the BOM on the first invocation: when decoding it skips a leading BOM (recording in the step data that swapping is needed if the BOM is byte-swapped), and when encoding it writes a BOM first unless the conversion is for internal use. */ +#define FROM_LOOP from_utf32_loop +#define TO_LOOP to_utf32_loop +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +#define MIN_NEEDED_FROM 4 +#define MIN_NEEDED_TO 4 +#define FROM_DIRECTION (dir == from_utf32) +#define PREPARE_LOOP \ + enum direction dir = ((struct utf32_data *) step->__data)->dir; \ + enum variant var = ((struct utf32_data *) step->__data)->var; \ + int swap; \ + if (FROM_DIRECTION && var == UTF_32) \ + { \ + if (data->__invocation_counter == 0) \ + { \ + /* We have to find out which byte order the file is encoded in. One to three leftover bytes are a truncated character, not empty input. */ \ + if (inptr + 4 > inend) \ + return (inptr == inend ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT); \ + \ + if (get32u (inptr) == BOM) \ + /* Simply ignore the BOM character. */ \ + *inptrp = inptr += 4; \ + else if (get32u (inptr) == BOM_OE) \ + { \ + ((struct utf32_data *) step->__data)->swap = 1; \ + *inptrp = inptr += 4; \ + } \ + } \ + } \ + else if (!FROM_DIRECTION && var == UTF_32 && !data->__internal_use \ + && data->__invocation_counter == 0) \ + { \ + /* Emit the Byte Order Mark. */ \ + if (__builtin_expect (outbuf + 4 > outend, 0)) \ + return __GCONV_FULL_OUTPUT; \ + \ + put32u (outbuf, BOM); \ + outbuf += 4; \ + } \ + swap = ((struct utf32_data *) step->__data)->swap; /* Read last: the BOM check above may have just set it. */ +#define EXTRA_LOOP_ARGS , var, swap + + +/* Direction of the transformation.
*/ +enum direction +{ + illegal_dir, /* Neither step name matched a UTF-32 variant. */ + to_utf32, /* The step's target name is a UTF-32 variant. */ + from_utf32 /* The step's source name is a UTF-32 variant. */ +}; + +enum variant +{ + illegal_var, + UTF_32, /* "UTF-32//": the only variant for which PREPARE_LOOP handles a BOM. */ + UTF_32LE, /* "UTF-32LE//". */ + UTF_32BE /* "UTF-32BE//". */ +}; + +struct utf32_data +{ + enum direction dir; + enum variant var; + int swap; /* Nonzero when the conversion loops must byte-swap each 32-bit unit. */ +}; + + +extern int gconv_init (struct __gconv_step *step); +int +gconv_init (struct __gconv_step *step) +{ + /* Set up STEP for a UTF-32 conversion. Determine which direction and which variant from the step's names. Returns __GCONV_OK on success, __GCONV_NOCONV if neither name is a UTF-32 variant, and __GCONV_NOMEM if the private data cannot be allocated. */ + struct utf32_data *new_data; + enum direction dir = illegal_dir; + enum variant var = illegal_var; + int result; + + if (__strcasecmp (step->__from_name, "UTF-32//") == 0) + { + dir = from_utf32; + var = UTF_32; + } + else if (__strcasecmp (step->__to_name, "UTF-32//") == 0) + { + dir = to_utf32; + var = UTF_32; + } + else if (__strcasecmp (step->__from_name, "UTF-32BE//") == 0) + { + dir = from_utf32; + var = UTF_32BE; + } + else if (__strcasecmp (step->__to_name, "UTF-32BE//") == 0) + { + dir = to_utf32; + var = UTF_32BE; + } + else if (__strcasecmp (step->__from_name, "UTF-32LE//") == 0) + { + dir = from_utf32; + var = UTF_32LE; + } + else if (__strcasecmp (step->__to_name, "UTF-32LE//") == 0) + { + dir = to_utf32; + var = UTF_32LE; + } + + result = __GCONV_NOCONV; /* Stays in effect if no name matched above. */ + if (__builtin_expect (dir, to_utf32) != illegal_dir) + { + new_data = (struct utf32_data *) malloc (sizeof (struct utf32_data)); + + result = __GCONV_NOMEM; /* Stays in effect if malloc failed. */ + if (new_data != NULL) + { + new_data->dir = dir; + new_data->var = var; + new_data->swap = ((var == UTF_32LE && BYTE_ORDER == BIG_ENDIAN) + || (var == UTF_32BE + && BYTE_ORDER == LITTLE_ENDIAN)); /* Fixed-endian variants swap only when they differ from the host byte order; plain UTF-32 starts unswapped. */ + step->__data = new_data; /* Freed again by gconv_end. */ + + if (dir == from_utf32) /* Min and max are equal in both branches: MIN_NEEDED_FROM and MIN_NEEDED_TO are both 4. */ + { + step->__min_needed_from = MIN_NEEDED_FROM; + step->__max_needed_from = MIN_NEEDED_FROM; + step->__min_needed_to = MIN_NEEDED_TO; + step->__max_needed_to = MIN_NEEDED_TO; + } + else + { + step->__min_needed_from = MIN_NEEDED_TO; + step->__max_needed_from = MIN_NEEDED_TO; + step->__min_needed_to = MIN_NEEDED_FROM; + step->__max_needed_to = MIN_NEEDED_FROM; + } + + step->__stateful = 0; + + result = __GCONV_OK; + } + } + + return result; +} +
+ +extern void gconv_end (struct __gconv_step *data); +void +gconv_end (struct __gconv_step *data) +{ + free (data->__data); /* Release the struct utf32_data allocated by gconv_init. */ +} + + +/* Convert from the internal (UCS4-like) format to UTF-32. Values of 0x110000 and above go through STANDARD_ERR_HANDLER; surrogates are illegal input unless errors are being ignored, in which case they are skipped and counted as irreversible. */ +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define LOOPFCT TO_LOOP +#define BODY \ + { \ + uint32_t c = get32 (inptr); \ + \ + if (__builtin_expect (c >= 0x110000, 0)) \ + { \ + STANDARD_ERR_HANDLER (4); \ + } \ + else if (__builtin_expect (c >= 0xd800 && c < 0xe000, 0)) \ + { \ + /* Surrogate characters in UCS-4 input are not valid. \ + We must catch this. If we let surrogates pass through, \ + attackers could make a security hole exploit by \ + generating "irregular UTF-32" sequences. */ \ + if (! ignore_errors_p ()) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + inptr += 4; \ + ++*irreversible; \ + continue; \ + } \ + \ + if (swap) \ + put32 (outptr, bswap_32 (c)); \ + else \ + put32 (outptr, c); \ + \ + outptr += 4; \ + inptr += 4; \ + } +#define LOOP_NEED_FLAGS +#define EXTRA_LOOP_DECLS \ + , enum variant var, int swap +#include <iconv/loop.c> + + +/* Convert from UTF-32 to the internal (UCS4-like) format. Each unit is byte-swapped first if required; values of 0x110000 and above and surrogate code points are then rejected, matching what the encoder above refuses to produce. */ +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY \ + { \ + uint32_t u1 = get32 (inptr); \ + \ + if (swap) \ + u1 = bswap_32 (u1); \ + \ + if (__builtin_expect (u1 >= 0x110000 || (u1 >= 0xd800 && u1 < 0xe000), 0)) \ + { \ + /* Out of range or a surrogate: this is illegal. */ \ + if (! ignore_errors_p ()) \ + { \ + /* This is an illegal character. */ \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + \ + inptr += 4; \ + ++*irreversible; \ + continue; \ + } \ + \ + put32 (outptr, u1); \ + inptr += 4; \ + outptr += 4; \ + } +#define LOOP_NEED_FLAGS +#define EXTRA_LOOP_DECLS \ + , enum variant var, int swap +#include <iconv/loop.c> + + +/* Now define the toplevel functions. */ +#include <iconv/skeleton.c> -- cgit v1.1