aboutsummaryrefslogtreecommitdiff
path: root/libgfortran/io/read.c
diff options
context:
space:
mode:
authorJerry DeLisle <jvdelisle@gcc.gnu.org>2008-08-16 03:38:31 +0000
committerJerry DeLisle <jvdelisle@gcc.gnu.org>2008-08-16 03:38:31 +0000
commit3ae86bf4f45b1f110aa7bd09ea61a8fd30c2a983 (patch)
tree659ac99319524f53dd916bbef37cd895b6a5cadb /libgfortran/io/read.c
parentdad80a1bff182651128f352095e2163534c5d81c (diff)
downloadgcc-3ae86bf4f45b1f110aa7bd09ea61a8fd30c2a983.zip
gcc-3ae86bf4f45b1f110aa7bd09ea61a8fd30c2a983.tar.gz
gcc-3ae86bf4f45b1f110aa7bd09ea61a8fd30c2a983.tar.bz2
re PR libfortran/35863 ([F2003] Implement ENCODING="UTF-8")
2008-08-15 Jerry DeLisle <jvdelisle@gcc.gnu.org> PR libfortran/35863 * intrinsics/selected_char_kind.c: Enable iso_10646. * io/read.c (typedef uchar): New type. (read_utf8): New function to read a single UTF-8 encoded character. (read_utf8_char1): New function to read UTF-8 into a KIND=1 string. (read_default_char1): New functio to read default into KIND=1 string. (read_utf8_char4): New function to read UTF-8 into a KIND=4 string. (read_default_char4): New function to read UTF-8 into a KIND=4 string. (read_a): Modify to use the new functions. (read_a_char4): Modify to use the new functions. * io/write.c (error.h): Add include. (typedef uchar): New type. (write_default_char4): New function to default write KIND=4 string. (write_utf8_char4): New function to UTF-8 write KIND=4 string. (write_a_char4): Modify to use new functions. (write_character): Modify to use new functions. From-SVN: r139147
Diffstat (limited to 'libgfortran/io/read.c')
-rw-r--r--libgfortran/io/read.c239
1 files changed, 201 insertions, 38 deletions
diff --git a/libgfortran/io/read.c b/libgfortran/io/read.c
index cb88933..8d25493 100644
--- a/libgfortran/io/read.c
+++ b/libgfortran/io/read.c
@@ -34,6 +34,8 @@ Boston, MA 02110-1301, USA. */
#include <ctype.h>
#include <stdlib.h>
+typedef unsigned char uchar;
+
/* read.c -- Deal with formatted reads */
@@ -236,78 +238,239 @@ read_l (st_parameter_dt *dtp, const fnode *f, char *dest, int length)
}
-/* read_a()-- Read a character record. This one is pretty easy. */
-
-void
-read_a (st_parameter_dt *dtp, const fnode *f, char *p, int length)
+static inline gfc_char4_t
+read_utf8 (st_parameter_dt *dtp, size_t *nbytes)
{
+ static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
+ static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+ static uchar buffer[6];
+ size_t i, nb, nread;
+ gfc_char4_t c;
+ int status;
char *s;
- int m, n, wi, status;
- size_t w;
- wi = f->u.w;
- if (wi == -1) /* '(A)' edit descriptor */
- wi = length;
+ *nbytes = 1;
+ s = (char *) &buffer[0];
+ status = read_block_form (dtp, s, nbytes);
+ if (status == FAILURE)
+ return 0;
- w = wi;
+ /* If this is a short read, just return. */
+ if (*nbytes == 0)
+ return 0;
- s = gfc_alloca (w);
+ c = buffer[0];
+ if (c < 0x80)
+ return c;
- dtp->u.p.sf_read_comma = 0;
- status = read_block_form (dtp, s, &w);
- dtp->u.p.sf_read_comma =
- dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1;
+ /* The number of leading 1-bits in the first byte indicates how many
+ bytes follow. */
+ for (nb = 2; nb < 7; nb++)
+ if ((c & ~masks[nb-1]) == patns[nb-1])
+ goto found;
+ goto invalid;
+
+ found:
+ c = (c & masks[nb-1]);
+ nread = nb - 1;
+
+ s = (char *) &buffer[1];
+ status = read_block_form (dtp, s, &nread);
+ if (status == FAILURE)
+ return 0;
+ /* Decode the bytes read. */
+ for (i = 1; i < nb; i++)
+ {
+ gfc_char4_t n = *s++;
+
+ if ((n & 0xC0) != 0x80)
+ goto invalid;
+
+ c = ((c << 6) + (n & 0x3F));
+ }
+
+ /* Make sure the shortest possible encoding was used. */
+ if (c <= 0x7F && nb > 1) goto invalid;
+ if (c <= 0x7FF && nb > 2) goto invalid;
+ if (c <= 0xFFFF && nb > 3) goto invalid;
+ if (c <= 0x1FFFFF && nb > 4) goto invalid;
+ if (c <= 0x3FFFFFF && nb > 5) goto invalid;
+
+ /* Make sure the character is valid. */
+ if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF))
+ goto invalid;
+
+ return c;
+
+ invalid:
+ generate_error (&dtp->common, LIBERROR_READ_VALUE, "Invalid UTF-8 encoding");
+ return (gfc_char4_t) '?';
+}
+
+
+static void
+read_utf8_char1 (st_parameter_dt *dtp, char *p, int len, size_t width)
+{
+ gfc_char4_t c;
+ char *dest;
+ size_t nbytes;
+ int i, j;
+
+ len = ((int) width < len) ? len : (int) width;
+
+ dest = (char *) p;
+
+ /* Proceed with decoding one character at a time. */
+ for (j = 0; j < len; j++, dest++)
+ {
+ c = read_utf8 (dtp, &nbytes);
+
+ /* Check for a short read and if so, break out. */
+ if (nbytes == 0)
+ break;
+
+ *dest = c > 255 ? '?' : (uchar) c;
+ }
+
+ /* If there was a short read, pad the remaining characters. */
+ for (i = j; i < len; i++)
+ *dest++ = ' ';
+ return;
+}
+
+static void
+read_default_char1 (st_parameter_dt *dtp, char *p, int len, size_t width)
+{
+ char *s;
+ int m, n, status;
+
+ s = gfc_alloca (width);
+
+ status = read_block_form (dtp, s, &width);
+
if (status == FAILURE)
return;
- if (w > (size_t) length)
- s += (w - length);
+ if (width > (size_t) len)
+ s += (width - len);
- m = ((int) w > length) ? length : (int) w;
+ m = ((int) width > len) ? len : (int) width;
memcpy (p, s, m);
- n = length - w;
+ n = len - width;
if (n > 0)
memset (p + m, ' ', n);
}
-void
-read_a_char4 (st_parameter_dt *dtp, const fnode *f, char *p, int length)
+
+static void
+read_utf8_char4 (st_parameter_dt *dtp, void *p, int len, size_t width)
{
- char *s;
gfc_char4_t *dest;
- int m, n, wi, status;
- size_t w;
+ size_t nbytes;
+ int i, j;
- wi = f->u.w;
- if (wi == -1) /* '(A)' edit descriptor */
- wi = length;
+ len = ((int) width < len) ? len : (int) width;
- w = wi;
+ dest = (gfc_char4_t *) p;
- s = gfc_alloca (w);
+ /* Proceed with decoding one character at a time. */
+ for (j = 0; j < len; j++, dest++)
+ {
+ *dest = read_utf8 (dtp, &nbytes);
- /* Read in w bytes, treating comma as not a separator. */
- dtp->u.p.sf_read_comma = 0;
- status = read_block_form (dtp, s, &w);
- dtp->u.p.sf_read_comma =
- dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1;
+ /* Check for a short read and if so, break out. */
+ if (nbytes == 0)
+ break;
+ }
+
+ /* If there was a short read, pad the remaining characters. */
+ for (i = j; i < len; i++)
+ *dest++ = (gfc_char4_t) ' ';
+ return;
+}
+
+
+static void
+read_default_char4 (st_parameter_dt *dtp, char *p, int len, size_t width)
+{
+ char *s;
+ gfc_char4_t *dest;
+ int m, n, status;
+
+ s = gfc_alloca (width);
+
+ status = read_block_form (dtp, s, &width);
if (status == FAILURE)
return;
- if (w > (size_t) length)
- s += (w - length);
+ if (width > (size_t) len)
+ s += (width - len);
- m = ((int) w > length) ? length : (int) w;
+ m = ((int) width > len) ? len : (int) width;
dest = (gfc_char4_t *) p;
for (n = 0; n < m; n++, dest++, s++)
*dest = (unsigned char ) *s;
- for (n = 0; n < length - (int) w; n++, dest++)
+ for (n = 0; n < len - (int) width; n++, dest++)
*dest = (unsigned char) ' ';
}
+
+/* read_a()-- Read a character record into a KIND=1 character destination,
+ processing UTF-8 encoding if necessary. */
+
+void
+read_a (st_parameter_dt *dtp, const fnode *f, char *p, int length)
+{
+ int wi;
+ size_t w;
+
+ wi = f->u.w;
+ if (wi == -1) /* '(A)' edit descriptor */
+ wi = length;
+ w = wi;
+
+ /* Read in w characters, treating comma as not a separator. */
+ dtp->u.p.sf_read_comma = 0;
+
+ if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+ read_utf8_char1 (dtp, p, length, w);
+ else
+ read_default_char1 (dtp, p, length, w);
+
+ dtp->u.p.sf_read_comma =
+ dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1;
+}
+
+
+/* read_a_char4()-- Read a character record into a KIND=4 character destination,
+ processing UTF-8 encoding if necessary. */
+
+void
+read_a_char4 (st_parameter_dt *dtp, const fnode *f, char *p, int length)
+{
+ int wi;
+ size_t w;
+
+ wi = f->u.w;
+ if (wi == -1) /* '(A)' edit descriptor */
+ wi = length;
+ w = wi;
+
+ /* Read in w characters, treating comma as not a separator. */
+ dtp->u.p.sf_read_comma = 0;
+
+ if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+ read_utf8_char4 (dtp, p, length, w);
+ else
+ read_default_char4 (dtp, p, length, w);
+
+ dtp->u.p.sf_read_comma =
+ dtp->u.p.decimal_status == DECIMAL_COMMA ? 0 : 1;
+}
+
/* eat_leading_spaces()-- Given a character pointer and a width,
* ignore the leading spaces. */