diff options
Diffstat (limited to 'libidn/idna.c')
-rw-r--r-- | libidn/idna.c | 834 |
1 files changed, 0 insertions, 834 deletions
diff --git a/libidn/idna.c b/libidn/idna.c deleted file mode 100644 index 7a15a25..0000000 --- a/libidn/idna.c +++ /dev/null @@ -1,834 +0,0 @@ -/* idna.c Convert to or from IDN strings. - * Copyright (C) 2002, 2003, 2004, 2011 Simon Josefsson - * - * This file is part of GNU Libidn. - * - * GNU Libidn is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * GNU Libidn is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GNU Libidn; if not, see <http://www.gnu.org/licenses/>. - */ - -#if HAVE_CONFIG_H -# include "config.h" -#endif - -#include <stdlib.h> -#include <string.h> -#include <stringprep.h> -#include <punycode.h> -#include <stdint.h> - -#include "idna.h" - -#define DOTP(c) ((c) == 0x002E || (c) == 0x3002 || \ - (c) == 0xFF0E || (c) == 0xFF61) - -/* Core functions */ - -/** - * idna_to_ascii_4i - * @in: input array with unicode code points. - * @inlen: length of input array with unicode code points. - * @out: output zero terminated string that must have room for at - * least 63 characters plus the terminating zero. - * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. - * - * The ToASCII operation takes a sequence of Unicode code points that make - * up one label and transforms it into a sequence of code points in the - * ASCII range (0..7F). If ToASCII succeeds, the original sequence and the - * resulting sequence are equivalent labels. - * - * It is important to note that the ToASCII operation can fail. ToASCII - * fails if any step of it fails. If any step of the ToASCII operation - * fails on any label in a domain name, that domain name MUST NOT be used - * as an internationalized domain name. The method for deadling with this - * failure is application-specific. - * - * The inputs to ToASCII are a sequence of code points, the AllowUnassigned - * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a - * sequence of ASCII code points or a failure condition. - * - * ToASCII never alters a sequence of code points that are all in the ASCII - * range to begin with (although it could fail). Applying the ToASCII - * operation multiple times has exactly the same effect as applying it just - * once. - * - * Return value: Returns 0 on success, or an error code. - */ -int -idna_to_ascii_4i (const uint32_t * in, size_t inlen, char *out, int flags) -{ - size_t len, outlen; - uint32_t *src; /* XXX don't need to copy data? */ - int rc; - - /* - * ToASCII consists of the following steps: - * - * 1. If all code points in the sequence are in the ASCII range (0..7F) - * then skip to step 3. - */ - - { - size_t i; - int inasciirange; - - inasciirange = 1; - for (i = 0; i < inlen; i++) - if (in[i] > 0x7F) - inasciirange = 0; - if (inasciirange) - { - src = malloc (sizeof (in[0]) * (inlen + 1)); - if (src == NULL) - return IDNA_MALLOC_ERROR; - - memcpy (src, in, sizeof (in[0]) * inlen); - src[inlen] = 0; - - goto step3; - } - } - - /* - * 2. Perform the steps specified in [NAMEPREP] and fail if there is - * an error. The AllowUnassigned flag is used in [NAMEPREP]. - */ - - { - char *p; - - p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL); - if (p == NULL) - return IDNA_MALLOC_ERROR; - - len = strlen (p); - do - { - char *newp; - - len = 2 * len + 10; /* XXX better guess? */ - newp = realloc (p, len); - if (newp == NULL) - { - free (p); - return IDNA_MALLOC_ERROR; - } - p = newp; - - if (flags & IDNA_ALLOW_UNASSIGNED) - rc = stringprep_nameprep (p, len); - else - rc = stringprep_nameprep_no_unassigned (p, len); - } - while (rc == STRINGPREP_TOO_SMALL_BUFFER); - - if (rc != STRINGPREP_OK) - { - free (p); - return IDNA_STRINGPREP_ERROR; - } - - src = stringprep_utf8_to_ucs4 (p, -1, NULL); - - free (p); - } - -step3: - /* - * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks: - * - * (a) Verify the absence of non-LDH ASCII code points; that is, - * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F. - * - * (b) Verify the absence of leading and trailing hyphen-minus; - * that is, the absence of U+002D at the beginning and end of - * the sequence. - */ - - if (flags & IDNA_USE_STD3_ASCII_RULES) - { - size_t i; - - for (i = 0; src[i]; i++) - if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F || - (src[i] >= 0x3A && src[i] <= 0x40) || - (src[i] >= 0x5B && src[i] <= 0x60) || - (src[i] >= 0x7B && src[i] <= 0x7F)) - { - free (src); - return IDNA_CONTAINS_NON_LDH; - } - - if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D)) - { - free (src); - return IDNA_CONTAINS_MINUS; - } - } - - /* - * 4. If all code points in the sequence are in the ASCII range - * (0..7F), then skip to step 8. - */ - - { - size_t i; - int inasciirange; - - inasciirange = 1; - for (i = 0; src[i]; i++) - { - if (src[i] > 0x7F) - inasciirange = 0; - /* copy string to output buffer if we are about to skip to step8 */ - if (i < 64) - out[i] = src[i]; - } - if (i < 64) - out[i] = '\0'; - if (inasciirange) - goto step8; - } - - /* - * 5. Verify that the sequence does NOT begin with the ACE prefix. - * - */ - - { - size_t i; - int match; - - match = 1; - for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++) - if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i]) - match = 0; - if (match) - { - free (src); - return IDNA_CONTAINS_ACE_PREFIX; - } - } - - /* - * 6. Encode the sequence using the encoding algorithm in [PUNYCODE] - * and fail if there is an error. - */ - for (len = 0; src[len]; len++) - ; - src[len] = '\0'; - outlen = 63 - strlen (IDNA_ACE_PREFIX); - rc = punycode_encode (len, src, NULL, - &outlen, &out[strlen (IDNA_ACE_PREFIX)]); - if (rc != PUNYCODE_SUCCESS) - { - free (src); - return IDNA_PUNYCODE_ERROR; - } - out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0'; - - /* - * 7. Prepend the ACE prefix. - */ - - memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)); - - /* - * 8. Verify that the number of code points is in the range 1 to 63 - * inclusive (0 is excluded). - */ - -step8: - free (src); - if (strlen (out) < 1 || strlen (out) > 63) - return IDNA_INVALID_LENGTH; - - return IDNA_SUCCESS; -} - -/* ToUnicode(). May realloc() utf8in. */ -static int -idna_to_unicode_internal (char *utf8in, - uint32_t * out, size_t * outlen, int flags) -{ - int rc; - char tmpout[64]; - size_t utf8len = strlen (utf8in) + 1; - size_t addlen = 0; - - /* - * ToUnicode consists of the following steps: - * - * 1. If the sequence contains any code points outside the ASCII range - * (0..7F) then proceed to step 2, otherwise skip to step 3. - */ - - { - size_t i; - int inasciirange; - - inasciirange = 1; - for (i = 0; utf8in[i]; i++) - if (utf8in[i] & ~0x7F) - inasciirange = 0; - if (inasciirange) - goto step3; - } - - /* - * 2. Perform the steps specified in [NAMEPREP] and fail if there is an - * error. (If step 3 of ToASCII is also performed here, it will not - * affect the overall behavior of ToUnicode, but it is not - * necessary.) The AllowUnassigned flag is used in [NAMEPREP]. - */ - do - { - char *newp = realloc (utf8in, utf8len + addlen); - if (newp == NULL) - { - free (utf8in); - return IDNA_MALLOC_ERROR; - } - utf8in = newp; - if (flags & IDNA_ALLOW_UNASSIGNED) - rc = stringprep_nameprep (utf8in, utf8len + addlen); - else - rc = stringprep_nameprep_no_unassigned (utf8in, utf8len + addlen); - addlen += 1; - } - while (rc == STRINGPREP_TOO_SMALL_BUFFER); - - if (rc != STRINGPREP_OK) - { - free (utf8in); - return IDNA_STRINGPREP_ERROR; - } - - /* 3. Verify that the sequence begins with the ACE prefix, and save a - * copy of the sequence. - */ - -step3: - if (memcmp (IDNA_ACE_PREFIX, utf8in, strlen (IDNA_ACE_PREFIX)) != 0) - { - free (utf8in); - return IDNA_NO_ACE_PREFIX; - } - - /* 4. Remove the ACE prefix. - */ - - memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)], - strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1); - - /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE] - * and fail if there is an error. Save a copy of the result of - * this step. - */ - - (*outlen)--; /* reserve one for the zero */ - - rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL); - if (rc != PUNYCODE_SUCCESS) - { - free (utf8in); - return IDNA_PUNYCODE_ERROR; - } - - out[*outlen] = 0; /* add zero */ - - /* 6. Apply ToASCII. - */ - - rc = idna_to_ascii_4i (out, *outlen, tmpout, flags); - if (rc != IDNA_SUCCESS) - { - free (utf8in); - return rc; - } - - /* 7. Verify that the result of step 6 matches the saved copy from - * step 3, using a case-insensitive ASCII comparison. - */ - - if (strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0) - { - free (utf8in); - return IDNA_ROUNDTRIP_VERIFY_ERROR; - } - - /* 8. Return the saved copy from step 5. - */ - - free (utf8in); - return IDNA_SUCCESS; -} - -/** - * idna_to_unicode_44i - * @in: input array with unicode code points. - * @inlen: length of input array with unicode code points. - * @out: output array with unicode code points. - * @outlen: on input, maximum size of output array with unicode code points, - * on exit, actual size of output array with unicode code points. - * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. - * - * The ToUnicode operation takes a sequence of Unicode code points - * that make up one label and returns a sequence of Unicode code - * points. If the input sequence is a label in ACE form, then the - * result is an equivalent internationalized label that is not in ACE - * form, otherwise the original sequence is returned unaltered. - * - * ToUnicode never fails. If any step fails, then the original input - * sequence is returned immediately in that step. - * - * The Punycode decoder can never output more code points than it - * inputs, but Nameprep can, and therefore ToUnicode can. Note that - * the number of octets needed to represent a sequence of code points - * depends on the particular character encoding used. - * - * The inputs to ToUnicode are a sequence of code points, the - * AllowUnassigned flag, and the UseSTD3ASCIIRules flag. The output of - * ToUnicode is always a sequence of Unicode code points. - * - * Return value: Returns error condition, but it must only be used for - * debugging purposes. The output buffer is always - * guaranteed to contain the correct data according to - * the specification (sans malloc induced errors). NB! - * This means that you normally ignore the return code - * from this function, as checking it means breaking the - * standard. - */ -int -idna_to_unicode_44i (const uint32_t * in, size_t inlen, - uint32_t * out, size_t * outlen, int flags) -{ - int rc; - size_t outlensave = *outlen; - char *p; - - p = stringprep_ucs4_to_utf8 (in, inlen, NULL, NULL); - if (p == NULL) - return IDNA_MALLOC_ERROR; - - rc = idna_to_unicode_internal (p, out, outlen, flags); - if (rc != IDNA_SUCCESS) - { - memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ? - inlen : outlensave)); - *outlen = inlen; - } - - /* p is freed in idna_to_unicode_internal. */ - - return rc; -} - -/* Wrappers that handle several labels */ - -/** - * idna_to_ascii_4z: - * @input: zero terminated input Unicode string. - * @output: pointer to newly allocated output string. - * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. - * - * Convert UCS-4 domain name to ASCII string. The domain name may - * contain several labels, separated by dots. The output buffer must - * be deallocated by the caller. - * - * Return value: Returns IDNA_SUCCESS on success, or error code. - **/ -int -idna_to_ascii_4z (const uint32_t * input, char **output, int flags) -{ - const uint32_t *start = input; - const uint32_t *end = input; - char buf[64]; - char *out = NULL; - int rc; - - /* 1) Whenever dots are used as label separators, the following - characters MUST be recognized as dots: U+002E (full stop), - U+3002 (ideographic full stop), U+FF0E (fullwidth full stop), - U+FF61 (halfwidth ideographic full stop). */ - - if (input[0] == 0) - { - /* Handle implicit zero-length root label. */ - *output = malloc (1); - if (!*output) - return IDNA_MALLOC_ERROR; - strcpy (*output, ""); - return IDNA_SUCCESS; - } - - if (DOTP (input[0]) && input[1] == 0) - { - /* Handle explicit zero-length root label. */ - *output = malloc (2); - if (!*output) - return IDNA_MALLOC_ERROR; - strcpy (*output, "."); - return IDNA_SUCCESS; - } - - *output = NULL; - do - { - end = start; - - for (; *end && !DOTP (*end); end++) - ; - - if (*end == '\0' && start == end) - { - /* Handle explicit zero-length root label. */ - buf[0] = '\0'; - } - else - { - rc = idna_to_ascii_4i (start, end - start, buf, flags); - if (rc != IDNA_SUCCESS) - return rc; - } - - if (out) - { - char *newp = realloc (out, strlen (out) + 1 + strlen (buf) + 1); - if (!newp) - { - free (out); - return IDNA_MALLOC_ERROR; - } - out = newp; - strcat (out, "."); - strcat (out, buf); - } - else - { - out = (char *) malloc (strlen (buf) + 1); - if (!out) - return IDNA_MALLOC_ERROR; - strcpy (out, buf); - } - - start = end + 1; - } - while (*end); - - *output = out; - - return IDNA_SUCCESS; -} - -/** - * idna_to_ascii_8z: - * @input: zero terminated input UTF-8 string. - * @output: pointer to newly allocated output string. - * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. - * - * Convert UTF-8 domain name to ASCII string. The domain name may - * contain several labels, separated by dots. The output buffer must - * be deallocated by the caller. - * - * Return value: Returns IDNA_SUCCESS on success, or error code. - **/ -int -idna_to_ascii_8z (const char *input, char **output, int flags) -{ - uint32_t *ucs4; - size_t ucs4len; - int rc; - - ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len); - if (!ucs4) - return IDNA_ICONV_ERROR; - - rc = idna_to_ascii_4z (ucs4, output, flags); - - free (ucs4); - - return rc; - -} - -/** - * idna_to_ascii_lz: - * @input: zero terminated input UTF-8 string. - * @output: pointer to newly allocated output string. - * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. - * - * Convert domain name in the locale's encoding to ASCII string. The - * domain name may contain several labels, separated by dots. The - * output buffer must be deallocated by the caller. - * - * Return value: Returns IDNA_SUCCESS on success, or error code. - **/ -int -idna_to_ascii_lz (const char *input, char **output, int flags) -{ - char *utf8; - int rc; - - utf8 = stringprep_locale_to_utf8 (input); - if (!utf8) - return IDNA_ICONV_ERROR; - - rc = idna_to_ascii_8z (utf8, output, flags); - - free (utf8); - - return rc; -} - -/** - * idna_to_unicode_4z4z: - * @input: zero-terminated Unicode string. - * @output: pointer to newly allocated output Unicode string. - * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. - * - * Convert possibly ACE encoded domain name in UCS-4 format into a - * UCS-4 string. The domain name may contain several labels, - * separated by dots. The output buffer must be deallocated by the - * caller. - * - * Return value: Returns IDNA_SUCCESS on success, or error code. - **/ -int -idna_to_unicode_4z4z (const uint32_t * input, uint32_t ** output, int flags) -{ - const uint32_t *start = input; - const uint32_t *end = input; - uint32_t *buf; - size_t buflen; - uint32_t *out = NULL; - size_t outlen = 0; - - *output = NULL; - - do - { - end = start; - - for (; *end && !DOTP (*end); end++) - ; - - buflen = end - start; - buf = malloc (sizeof (buf[0]) * (buflen + 1)); - if (!buf) - return IDNA_MALLOC_ERROR; - - idna_to_unicode_44i (start, end - start, buf, &buflen, flags); - /* don't check return value as per specification! */ - - if (out) - { - uint32_t *newp = realloc (out, - sizeof (out[0]) - * (outlen + 1 + buflen + 1)); - if (!newp) - { - free (buf); - free (out); - return IDNA_MALLOC_ERROR; - } - out = newp; - out[outlen++] = 0x002E; /* '.' (full stop) */ - memcpy (out + outlen, buf, sizeof (buf[0]) * buflen); - outlen += buflen; - out[outlen] = 0x0; - free (buf); - } - else - { - out = buf; - outlen = buflen; - out[outlen] = 0x0; - } - - start = end + 1; - } - while (*end); - - *output = out; - - return IDNA_SUCCESS; -} - -/** - * idna_to_unicode_8z4z: - * @input: zero-terminated UTF-8 string. - * @output: pointer to newly allocated output Unicode string. - * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. - * - * Convert possibly ACE encoded domain name in UTF-8 format into a - * UCS-4 string. The domain name may contain several labels, - * separated by dots. The output buffer must be deallocated by the - * caller. - * - * Return value: Returns IDNA_SUCCESS on success, or error code. - **/ -int -idna_to_unicode_8z4z (const char *input, uint32_t ** output, int flags) -{ - uint32_t *ucs4; - size_t ucs4len; - int rc; - - ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len); - if (!ucs4) - return IDNA_ICONV_ERROR; - - rc = idna_to_unicode_4z4z (ucs4, output, flags); - free (ucs4); - - return rc; -} - -/** - * idna_to_unicode_8z8z: - * @input: zero-terminated UTF-8 string. - * @output: pointer to newly allocated output UTF-8 string. - * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. - * - * Convert possibly ACE encoded domain name in UTF-8 format into a - * UTF-8 string. The domain name may contain several labels, - * separated by dots. The output buffer must be deallocated by the - * caller. - * - * Return value: Returns IDNA_SUCCESS on success, or error code. - **/ -int -idna_to_unicode_8z8z (const char *input, char **output, int flags) -{ - uint32_t *ucs4; - int rc; - - rc = idna_to_unicode_8z4z (input, &ucs4, flags); - *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL); - free (ucs4); - - if (!*output) - return IDNA_ICONV_ERROR; - - return rc; -} - -/** - * idna_to_unicode_8zlz: - * @input: zero-terminated UTF-8 string. - * @output: pointer to newly allocated output string encoded in the - * current locale's character set. - * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. - * - * Convert possibly ACE encoded domain name in UTF-8 format into a - * string encoded in the current locale's character set. The domain - * name may contain several labels, separated by dots. The output - * buffer must be deallocated by the caller. - * - * Return value: Returns IDNA_SUCCESS on success, or error code. - **/ -int -idna_to_unicode_8zlz (const char *input, char **output, int flags) -{ - char *utf8; - int rc; - - rc = idna_to_unicode_8z8z (input, &utf8, flags); - *output = stringprep_utf8_to_locale (utf8); - free (utf8); - - if (!*output) - return IDNA_ICONV_ERROR; - - return rc; -} - -/** - * idna_to_unicode_lzlz: - * @input: zero-terminated string encoded in the current locale's - * character set. - * @output: pointer to newly allocated output string encoded in the - * current locale's character set. - * @flags: IDNA flags, e.g. IDNA_ALLOW_UNASSIGNED or IDNA_USE_STD3_ASCII_RULES. - * - * Convert possibly ACE encoded domain name in the locale's character - * set into a string encoded in the current locale's character set. - * The domain name may contain several labels, separated by dots. The - * output buffer must be deallocated by the caller. - * - * Return value: Returns IDNA_SUCCESS on success, or error code. - **/ -int -idna_to_unicode_lzlz (const char *input, char **output, int flags) -{ - char *utf8; - int rc; - - utf8 = stringprep_locale_to_utf8 (input); - if (!utf8) - return IDNA_ICONV_ERROR; - - rc = idna_to_unicode_8zlz (utf8, output, flags); - free (utf8); - - return rc; -} - -/** - * IDNA_ACE_PREFIX - * - * The IANA allocated prefix to use for IDNA. "xn--" - */ - -/** - * Idna_rc: - * @IDNA_SUCCESS: Successful operation. This value is guaranteed to - * always be zero, the remaining ones are only guaranteed to hold - * non-zero values, for logical comparison purposes. - * @IDNA_STRINGPREP_ERROR: Error during string preparation. - * @IDNA_PUNYCODE_ERROR: Error during punycode operation. - * @IDNA_CONTAINS_NON_LDH: For IDNA_USE_STD3_ASCII_RULES, indicate that - * the string contains non-LDH ASCII characters. - * @IDNA_CONTAINS_MINUS: For IDNA_USE_STD3_ASCII_RULES, indicate that - * the string contains a leading or trailing hyphen-minus (U+002D). - * @IDNA_INVALID_LENGTH: The final output string is not within the - * (inclusive) range 1 to 63 characters. - * @IDNA_NO_ACE_PREFIX: The string does not contain the ACE prefix - * (for ToUnicode). - * @IDNA_ROUNDTRIP_VERIFY_ERROR: The ToASCII operation on output - * string does not equal the input. - * @IDNA_CONTAINS_ACE_PREFIX: The input contains the ACE prefix (for - * ToASCII). - * @IDNA_ICONV_ERROR: Could not convert string in locale encoding. - * @IDNA_MALLOC_ERROR: Could not allocate buffer (this is typically a - * fatal error). - * @IDNA_DLOPEN_ERROR: Could not dlopen the libcidn DSO (only used - * internally in libc). - * - * Enumerated return codes of idna_to_ascii_4i(), - * idna_to_unicode_44i() functions (and functions derived from those - * functions). The value 0 is guaranteed to always correspond to - * success. - */ - - -/** - * Idna_flags: - * @IDNA_ALLOW_UNASSIGNED: Don't reject strings containing unassigned - * Unicode code points. - * @IDNA_USE_STD3_ASCII_RULES: Validate strings according to STD3 - * rules (i.e., normal host name rules). - * - * Flags to pass to idna_to_ascii_4i(), idna_to_unicode_44i() etc. - */ |