aboutsummaryrefslogtreecommitdiff
path: root/iconv/gconv_charset.c
diff options
context:
space:
mode:
author: Arjun Shankar <arjun@redhat.com> 2020-07-07 20:31:48 +0200
committer: Arjun Shankar <arjun@redhat.com> 2020-07-07 20:34:07 +0200
commit: 91927b7c76437db860cd86a7714476b56bb39d07 (patch)
tree: febc3201dd995bb8324b4712a31fef6d1bea388a /iconv/gconv_charset.c
parent: 94d9c76e4acc798894ea23d9ac049ce7ce995ec0 (diff)
download: glibc-91927b7c76437db860cd86a7714476b56bb39d07.zip
glibc-91927b7c76437db860cd86a7714476b56bb39d07.tar.gz
glibc-91927b7c76437db860cd86a7714476b56bb39d07.tar.bz2
Rewrite iconv option parsing [BZ #19519]
This commit replaces string manipulation during `iconv_open' and iconv_prog option parsing with a structured, flag based conversion specification. In doing so, it alters the internal `__gconv_open' interface and accordingly adjusts its uses. This change fixes several hangs in the iconv program and therefore includes a new test to exercise iconv_prog options that originally led to these hangs. It also includes a new regression test for option handling in the iconv function. Reviewed-by: Florian Weimer <fweimer@redhat.com> Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org> Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Diffstat (limited to 'iconv/gconv_charset.c')
-rw-r--r-- iconv/gconv_charset.c 218
1 files changed, 218 insertions, 0 deletions
diff --git a/iconv/gconv_charset.c b/iconv/gconv_charset.c
new file mode 100644
index 0000000..6ccd077
--- /dev/null
+++ b/iconv/gconv_charset.c
@@ -0,0 +1,218 @@
+/* Charset name normalization.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include <stdlib.h>
+#include <ctype.h>
+#include <locale.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/stat.h>
+#include "gconv_int.h"
+#include "gconv_charset.h"
+
+
+/* This function returns a pointer to the last suffix in the conversion code
+ string S, or NULL if S has no suffix. Valid suffixes matched by this
+ function are of the form: '/' or ',' followed by arbitrary text that doesn't
+ contain '/' or ','. The returned pointer addresses that leading separator,
+ not the text after it. A suffix is only reported when S contains at least
+ two '/' characters in total. It does not edit the string in any way. The
+ caller is expected to parse the suffix and remove it (by e.g. truncating the
+ string at the returned pointer) before the next call; otherwise the same
+ suffix is returned again. */
+static char *
+find_suffix (char *s)
+{
+ /* The conversion code is in the form of a triplet, separated by '/' chars.
+ The third component of the triplet contains suffixes. If we don't have two
+ slashes, we don't have a suffix. */
+
+ int slash_count = 0; /* Number of '/' characters seen anywhere in S. */
+ char *suffix_term = NULL; /* Most recent '/' or ',' seen, if any. */
+
+ for (int i = 0; s[i] != '\0'; i++) /* NOTE(review): I is an int, so this
+ assumes strlen (S) <= INT_MAX. */
+ switch (s[i])
+ {
+ case '/':
+ slash_count++;
+ /* Fallthrough */
+ case ',':
+ suffix_term = &s[i];
+ }
+
+ if (slash_count >= 2) /* Commas alone never qualify; two '/' are needed. */
+ return suffix_term;
+
+ return NULL;
+}
+
+
+struct gconv_parsed_code /* Input and result of gconv_parse_code. */
+{
+ char *code; /* Writable, NUL-terminated conversion code; gconv_parse_code
+ truncates it in place as suffixes are removed. */
+ bool translit; /* Set to true by gconv_parse_code when a suffix matching
+ GCONV_TRANSLIT_SUFFIX was found in CODE. */
+ bool ignore; /* Set to true by gconv_parse_code when a suffix matching
+ GCONV_IGNORE_ERRORS_SUFFIX was found in CODE. */
+};
+
+
+/* This function parses an iconv_open encoding PC.CODE, strips any suffixes
+ (such as TRANSLIT or IGNORE) from it and sets corresponding flags in it.
+
+ PC->code is modified in place and must therefore be a writable,
+ NUL-terminated string. PC->translit and PC->ignore are first reset to
+ false; each is then set to true if one of the removed suffixes matches
+ GCONV_TRANSLIT_SUFFIX or GCONV_IGNORE_ERRORS_SUFFIX respectively. On
+ return PC->code holds whatever precedes the removed suffixes, with
+ trailing whitespace, ',' and '/' characters dropped; this may be the
+ empty string. The function reports no errors. */
+static void
+gconv_parse_code (struct gconv_parsed_code *pc)
+{
+ pc->translit = false;
+ pc->ignore = false;
+
+ while (1)
+ {
+ /* First drop any trailing whitespaces and separators. This is what
+ turns inputs such as "//TRANSLIT//" or "//IGNORE," into a string that
+ ends in the suffix text itself.
+ NOTE(review): isspace is handed a plain char here and consults the
+ current locale, whereas the suffix comparisons below are given
+ _nl_C_locobj_ptr - confirm that this difference is intended. */
+ size_t len = strlen (pc->code);
+ while ((len > 0)
+ && (isspace (pc->code[len - 1])
+ || pc->code[len - 1] == ','
+ || pc->code[len - 1] == '/'))
+ len--;
+
+ pc->code[len] = '\0';
+
+ if (len == 0)
+ return;
+
+ char * suffix = find_suffix (pc->code);
+ if (suffix == NULL)
+ {
+ /* At this point, we have processed and removed all suffixes from the
+ code and what remains of the code is suffix free. */
+ return;
+ }
+ else
+ {
+ /* A suffix is processed from the end of the code array going
+ backwards, one suffix at a time. SUFFIX is a pointer into the
+ code character array and points to: one past the end of the code
+ and any unprocessed suffixes, and to the beginning (the leading
+ separator) of the suffix currently being processed during this
+ iteration. We must process this suffix and then drop it from the
+ code by terminating the preceding text with NUL.
+
+ We want to allow and recognize suffixes such as:
+
+ "/TRANSLIT" i.e. single suffix
+ "//TRANSLIT" i.e. single suffix and multiple separators
+ "//TRANSLIT/IGNORE" i.e. suffixes separated by "/"
+ "/TRANSLIT//IGNORE" i.e. suffixes separated by "//"
+ "//IGNORE,TRANSLIT" i.e. suffixes separated by ","
+ "//IGNORE," i.e. trailing ","
+ "//TRANSLIT/" i.e. trailing "/"
+ "//TRANSLIT//" i.e. trailing "//"
+ "/" i.e. empty suffix.
+
+ Unknown suffixes are silently discarded and ignored.
+
+ Each comparison below is against the whole of SUFFIX, including
+ its leading separator, so the suffix text must match exactly
+ (ignoring case) rather than merely as a prefix. */
+
+ if ((__strcasecmp_l (suffix,
+ GCONV_TRIPLE_SEPARATOR
+ GCONV_TRANSLIT_SUFFIX,
+ _nl_C_locobj_ptr) == 0)
+ || (__strcasecmp_l (suffix,
+ GCONV_SUFFIX_SEPARATOR
+ GCONV_TRANSLIT_SUFFIX,
+ _nl_C_locobj_ptr) == 0))
+ pc->translit = true;
+
+ if ((__strcasecmp_l (suffix,
+ GCONV_TRIPLE_SEPARATOR
+ GCONV_IGNORE_ERRORS_SUFFIX,
+ _nl_C_locobj_ptr) == 0)
+ || (__strcasecmp_l (suffix,
+ GCONV_SUFFIX_SEPARATOR
+ GCONV_IGNORE_ERRORS_SUFFIX,
+ _nl_C_locobj_ptr) == 0))
+ pc->ignore = true;
+
+ /* We just processed this suffix. We can now drop it from the
+ code string by truncating it at the suffix's position. SUFFIX
+ points at an existing character of the string, so the string
+ gets strictly shorter on every pass and the loop terminates. */
+ suffix[0] = '\0';
+ }
+ }
+}
+
+
+/* This function accepts the charset names of the source and destination of the
+ conversion and populates *conv_spec with an equivalent conversion
+ specification that may later be used by __gconv_open. The charset names
+ might contain options in the form of suffixes that alter the conversion,
+ e.g. "ISO-10646/UTF-8/TRANSLIT". It processes the charset names, ignoring
+ and truncating any suffix options in fromcode, and processing and truncating
+ any suffix options in tocode. Supported suffix options ("TRANSLIT" or
+ "IGNORE") when found in tocode lead to the corresponding flag in *conv_spec
+ to be set to true. Unrecognized suffix options are silently discarded. If
+ the function succeeds, it returns conv_spec back to the caller. It returns
+ NULL upon failure. conv_spec must be allocated and freed by the caller.
+
+ On success conv_spec->fromcode and conv_spec->tocode point to buffers
+ obtained from malloc; nothing in this function releases them, so that is
+ presumably the caller's job as well. Failure means that an allocation
+ returned NULL. In that case this function leaves no buffer allocated, but
+ *conv_spec may have been partially written: it is untouched if duplicating
+ the inputs failed, otherwise the translit and ignore flags have already
+ been stored and conv_spec->fromcode has been set to NULL.
+
+ NOTE(review): fromcode and tocode are handed to __strdup and strlen
+ without a check, so they are assumed to be non-NULL. */
+struct gconv_spec *
+__gconv_create_spec (struct gconv_spec *conv_spec, const char *fromcode,
+ const char *tocode)
+{
+ struct gconv_parsed_code pfc, ptc;
+ struct gconv_spec *ret = NULL;
+
+ pfc.code = __strdup (fromcode); /* Private writable copies, because */
+ ptc.code = __strdup (tocode); /* gconv_parse_code edits in place. */
+
+ if ((pfc.code == NULL)
+ || (ptc.code == NULL))
+ goto out;
+
+ gconv_parse_code (&pfc);
+ gconv_parse_code (&ptc);
+
+ /* We ignore suffixes in the fromcode because that is how the current
+ implementation has always handled them. Only suffixes in the tocode are
+ processed and handled. The reality is that invalid input in the input
+ character set should only be ignored if the fromcode specifies IGNORE.
+ The current implementation ignores invalid input in the input character
+ set if the tocode contains IGNORE. We preserve this behavior for
+ backwards compatibility. In the future we may split the handling of
+ IGNORE to allow a finer grained specification of ignoring invalid input
+ and/or ignoring invalid output. */
+ conv_spec->translit = ptc.translit;
+ conv_spec->ignore = ptc.ignore;
+
+ /* 3 extra bytes because 1 extra for '\0', and 2 extra so strip might
+ be able to add one or two trailing '/' characters if necessary. The
+ sizes are taken from the original arguments rather than from the parsed
+ copies; parsing only ever truncates, so the copies are never longer. */
+ conv_spec->fromcode = malloc (strlen (fromcode) + 3);
+ if (conv_spec->fromcode == NULL)
+ goto out;
+
+ conv_spec->tocode = malloc (strlen (tocode) + 3);
+ if (conv_spec->tocode == NULL)
+ {
+ free (conv_spec->fromcode);
+ conv_spec->fromcode = NULL;
+ goto out;
+ }
+
+ /* Strip unrecognized characters and ensure that the code has two '/'
+ characters as per conversion code triplet specification. strip is not
+ defined in this file; presumably it comes from gconv_charset.h. */
+ strip (conv_spec->fromcode, pfc.code);
+ strip (conv_spec->tocode, ptc.code);
+ ret = conv_spec;
+
+out:
+ free (pfc.code); /* Reached on success and failure alike; free (NULL) is */
+ free (ptc.code); /* harmless when a duplication failed. */
+
+ return ret;
+}
+libc_hidden_def (__gconv_create_spec)