diff options
author | Jerry DeLisle <jvdelisle@gcc.gnu.org> | 2008-08-16 03:38:31 +0000 |
---|---|---|
committer | Jerry DeLisle <jvdelisle@gcc.gnu.org> | 2008-08-16 03:38:31 +0000 |
commit | 3ae86bf4f45b1f110aa7bd09ea61a8fd30c2a983 (patch) | |
tree | 659ac99319524f53dd916bbef37cd895b6a5cadb /libgfortran/io/read.c | |
parent | dad80a1bff182651128f352095e2163534c5d81c (diff) | |
download | gcc-3ae86bf4f45b1f110aa7bd09ea61a8fd30c2a983.zip gcc-3ae86bf4f45b1f110aa7bd09ea61a8fd30c2a983.tar.gz gcc-3ae86bf4f45b1f110aa7bd09ea61a8fd30c2a983.tar.bz2 |
re PR libfortran/35863 ([F2003] Implement ENCODING="UTF-8")
2008-08-15 Jerry DeLisle <jvdelisle@gcc.gnu.org>
PR libfortran/35863
* intrinsics/selected_char_kind.c: Enable iso_10646.
* io/read.c (typedef uchar): New type.
(read_utf8): New function to read a single UTF-8 encoded character.
(read_utf8_char1): New function to read UTF-8 into a KIND=1 string.
(read_default_char1): New functio to read default into KIND=1 string.
(read_utf8_char4): New function to read UTF-8 into a KIND=4 string.
(read_default_char4): New function to read UTF-8 into a KIND=4 string.
(read_a): Modify to use the new functions.
(read_a_char4): Modify to use the new functions.
* io/write.c (error.h): Add include. (typedef uchar): New type.
(write_default_char4): New function to default write KIND=4 string.
(write_utf8_char4): New function to UTF-8 write KIND=4 string.
(write_a_char4): Modify to use new functions.
(write_character): Modify to use new functions.
From-SVN: r139147
Diffstat (limited to 'libgfortran/io/read.c')
-rw-r--r-- | libgfortran/io/read.c | 239 |
1 files changed, 201 insertions, 38 deletions
diff --git a/libgfortran/io/read.c b/libgfortran/io/read.c index cb88933..8d25493 100644 --- a/libgfortran/io/read.c +++ b/libgfortran/io/read.c @@ -34,6 +34,8 @@ Boston, MA 02110-1301, USA. */ #include <ctype.h> #include <stdlib.h> +typedef unsigned char uchar; + /* read.c -- Deal with formatted reads */ @@ -236,78 +238,239 @@ read_l (st_parameter_dt *dtp, const fnode *f, char *dest, int length) } -/* read_a()-- Read a character record. This one is pretty easy. */ - -void -read_a (st_parameter_dt *dtp, const fnode *f, char *p, int length) +static inline gfc_char4_t +read_utf8 (st_parameter_dt *dtp, size_t *nbytes) { + static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 }; + static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + static uchar buffer[6]; + size_t i, nb, nread; + gfc_char4_t c; + int status; char *s; - int m, n, wi, status; - size_t w; - wi = f->u.w; - if (wi == -1) /* '(A)' edit descriptor */ - wi = length; + *nbytes = 1; + s = (char *) &buffer[0]; + status = read_block_form (dtp, s, nbytes); + if (status == FAILURE) + return 0; - w = wi; + /* If this is a short read, just return. */ + if (*nbytes == 0) + return 0; - s = gfc_alloca (w); + c = buffer[0]; + if (c < 0x80) + return c; - dtp->u.p.sf_read_comma = 0; - status = read_block_form (dtp, s, &w); - dtp->u.p.sf_read_comma = - dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1; + /* The number of leading 1-bits in the first byte indicates how many + bytes follow. */ + for (nb = 2; nb < 7; nb++) + if ((c & ~masks[nb-1]) == patns[nb-1]) + goto found; + goto invalid; + + found: + c = (c & masks[nb-1]); + nread = nb - 1; + + s = (char *) &buffer[1]; + status = read_block_form (dtp, s, &nread); + if (status == FAILURE) + return 0; + /* Decode the bytes read. */ + for (i = 1; i < nb; i++) + { + gfc_char4_t n = *s++; + + if ((n & 0xC0) != 0x80) + goto invalid; + + c = ((c << 6) + (n & 0x3F)); + } + + /* Make sure the shortest possible encoding was used. */ + if (c <= 0x7F && nb > 1) goto invalid; + if (c <= 0x7FF && nb > 2) goto invalid; + if (c <= 0xFFFF && nb > 3) goto invalid; + if (c <= 0x1FFFFF && nb > 4) goto invalid; + if (c <= 0x3FFFFFF && nb > 5) goto invalid; + + /* Make sure the character is valid. */ + if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) + goto invalid; + + return c; + + invalid: + generate_error (&dtp->common, LIBERROR_READ_VALUE, "Invalid UTF-8 encoding"); + return (gfc_char4_t) '?'; +} + + +static void +read_utf8_char1 (st_parameter_dt *dtp, char *p, int len, size_t width) +{ + gfc_char4_t c; + char *dest; + size_t nbytes; + int i, j; + + len = ((int) width < len) ? len : (int) width; + + dest = (char *) p; + + /* Proceed with decoding one character at a time. */ + for (j = 0; j < len; j++, dest++) + { + c = read_utf8 (dtp, &nbytes); + + /* Check for a short read and if so, break out. */ + if (nbytes == 0) + break; + + *dest = c > 255 ? '?' : (uchar) c; + } + + /* If there was a short read, pad the remaining characters. */ + for (i = j; i < len; i++) + *dest++ = ' '; + return; +} + +static void +read_default_char1 (st_parameter_dt *dtp, char *p, int len, size_t width) +{ + char *s; + int m, n, status; + + s = gfc_alloca (width); + + status = read_block_form (dtp, s, &width); + if (status == FAILURE) return; - if (w > (size_t) length) - s += (w - length); + if (width > (size_t) len) + s += (width - len); - m = ((int) w > length) ? length : (int) w; + m = ((int) width > len) ? len : (int) width; memcpy (p, s, m); - n = length - w; + n = len - width; if (n > 0) memset (p + m, ' ', n); } -void -read_a_char4 (st_parameter_dt *dtp, const fnode *f, char *p, int length) + +static void +read_utf8_char4 (st_parameter_dt *dtp, void *p, int len, size_t width) { - char *s; gfc_char4_t *dest; - int m, n, wi, status; - size_t w; + size_t nbytes; + int i, j; - wi = f->u.w; - if (wi == -1) /* '(A)' edit descriptor */ - wi = length; + len = ((int) width < len) ? len : (int) width; - w = wi; + dest = (gfc_char4_t *) p; - s = gfc_alloca (w); + /* Proceed with decoding one character at a time. */ + for (j = 0; j < len; j++, dest++) + { + *dest = read_utf8 (dtp, &nbytes); - /* Read in w bytes, treating comma as not a separator. */ - dtp->u.p.sf_read_comma = 0; - status = read_block_form (dtp, s, &w); - dtp->u.p.sf_read_comma = - dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1; + /* Check for a short read and if so, break out. */ + if (nbytes == 0) + break; + } + + /* If there was a short read, pad the remaining characters. */ + for (i = j; i < len; i++) + *dest++ = (gfc_char4_t) ' '; + return; +} + + +static void +read_default_char4 (st_parameter_dt *dtp, char *p, int len, size_t width) +{ + char *s; + gfc_char4_t *dest; + int m, n, status; + + s = gfc_alloca (width); + + status = read_block_form (dtp, s, &width); if (status == FAILURE) return; - if (w > (size_t) length) - s += (w - length); + if (width > (size_t) len) + s += (width - len); - m = ((int) w > length) ? length : (int) w; + m = ((int) width > len) ? len : (int) width; dest = (gfc_char4_t *) p; for (n = 0; n < m; n++, dest++, s++) *dest = (unsigned char ) *s; - for (n = 0; n < length - (int) w; n++, dest++) + for (n = 0; n < len - (int) width; n++, dest++) *dest = (unsigned char) ' '; } + +/* read_a()-- Read a character record into a KIND=1 character destination, + processing UTF-8 encoding if necessary. */ + +void +read_a (st_parameter_dt *dtp, const fnode *f, char *p, int length) +{ + int wi; + size_t w; + + wi = f->u.w; + if (wi == -1) /* '(A)' edit descriptor */ + wi = length; + w = wi; + + /* Read in w characters, treating comma as not a separator. */ + dtp->u.p.sf_read_comma = 0; + + if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8) + read_utf8_char1 (dtp, p, length, w); + else + read_default_char1 (dtp, p, length, w); + + dtp->u.p.sf_read_comma = + dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1; +} + + +/* read_a_char4()-- Read a character record into a KIND=4 character destination, + processing UTF-8 encoding if necessary. */ + +void +read_a_char4 (st_parameter_dt *dtp, const fnode *f, char *p, int length) +{ + int wi; + size_t w; + + wi = f->u.w; + if (wi == -1) /* '(A)' edit descriptor */ + wi = length; + w = wi; + + /* Read in w characters, treating comma as not a separator. */ + dtp->u.p.sf_read_comma = 0; + + if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8) + read_utf8_char4 (dtp, p, length, w); + else + read_default_char4 (dtp, p, length, w); + + dtp->u.p.sf_read_comma = + dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1; +} + /* eat_leading_spaces()-- Given a character pointer and a width, * ignore the leading spaces. */ |