diff options
Diffstat (limited to 'libphobos/src/std/uri.d')
-rw-r--r-- | libphobos/src/std/uri.d | 592 |
1 files changed, 592 insertions, 0 deletions
// Written in the D programming language.

/**
 * Encode and decode Uniform Resource Identifiers (URIs).
 * URIs are used in internet transfer protocols.
 * Valid URI characters consist of letters, digits,
 * and the characters $(B ;/?:@&=+$,-_.!~*'())
 * Reserved URI characters are $(B ;/?:@&=+$,)
 * Escape sequences consist of $(B %) followed by two hex digits.
 *
 * See_Also:
 *  $(LINK2 http://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
 *  $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
 * Copyright: Copyright Digital Mars 2000 - 2009.
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(HTTP digitalmars.com, Walter Bright)
 * Source:    $(PHOBOSSRC std/_uri.d)
 */
/*          Copyright Digital Mars 2000 - 2009.
 * Distributed under the Boost Software License, Version 1.0.
 *    (See accompanying file LICENSE_1_0.txt or copy at
 *          http://www.boost.org/LICENSE_1_0.txt)
 */
module std.uri;

//debug=uri;        // uncomment to turn on debugging writefln's
debug(uri) import std.stdio;
import std.traits : isSomeChar;

/** This Exception is thrown if something goes wrong when encoding or
decoding a URI.
*/
class URIException : Exception
{
    import std.exception : basicExceptionCtors;
    mixin basicExceptionCtors;
}

// Character-class bits stored in `uri_flags`; OR them together to build the
// "unescaped" / "reserved" sets passed to URI_Encode / URI_Decode.
private enum
{
    URI_Alpha = 1,
    URI_Reserved = 2,
    URI_Mark = 4,
    URI_Digit = 8,
    URI_Hash = 0x10,        // '#'
}

// Upper-case hex digits used when emitting %XX escapes.
private immutable char[16] hex2ascii = "0123456789ABCDEF";

// Classification table for the 128 ASCII characters, built at compile time.
private immutable ubyte[128] uri_flags =      // indexed by character
    ({
        ubyte[128] uflags;

        // Compile time initialize
        uflags['#'] |= URI_Hash;

        foreach (c; 'A' .. 'Z' + 1)
        {
            uflags[c] |= URI_Alpha;
            uflags[c + 0x20] |= URI_Alpha; // lowercase letters
        }
        foreach (c; '0' .. '9' + 1) uflags[c] |= URI_Digit;
        foreach (c; ";/?:@&=+$,") uflags[c] |= URI_Reserved;
        foreach (c; "-_.!~*'()") uflags[c] |= URI_Mark;
        return uflags;
    })();

/* Percent-encode a UTF-32 string.
 *
 * Params:
 *  str          = the code points to encode
 *  unescapedSet = OR-ed URI_* flags; ASCII characters whose `uri_flags`
 *                 entry intersects this set are copied through verbatim,
 *                 everything else is converted to UTF-8 octets and each
 *                 octet is emitted as %XX
 *
 * Returns: a freshly allocated immutable string.
 * Throws: URIException if a code point is above 0x1FFFFF.
 */
private string URI_Encode(dstring str, uint unescapedSet)
{
    uint j;
    uint k;
    dchar V;
    dchar C;

    // Result buffer: starts on the stack and moves to the GC heap when it
    // has to grow. Slices (rather than raw pointers) keep every access
    // bounds-checked.
    char[50] buffer = void;
    char[] R;
    uint Rlen;
    uint Rsize; // alloc'd size

    immutable len = str.length;

    R = buffer[];
    Rsize = buffer.length;
    Rlen = 0;

    for (k = 0; k != len; k++)
    {
        C = str[k];
        // if (C in unescapedSet)
        if (C < uri_flags.length && uri_flags[C] & unescapedSet)
        {
            if (Rlen == Rsize)
            {
                char[] R2;

                Rsize *= 2;
                R2 = new char[Rsize];
                R2[0 .. Rlen] = R[0 .. Rlen];
                R = R2;
            }
            R[Rlen] = cast(char) C;
            Rlen++;
        }
        else
        {
            char[6] Octet;
            uint L;

            V = C;

            // Transform V into octets
            if (V <= 0x7F)
            {
                Octet[0] = cast(char) V;
                L = 1;
            }
            else if (V <= 0x7FF)
            {
                Octet[0] = cast(char)(0xC0 | (V >> 6));
                Octet[1] = cast(char)(0x80 | (V & 0x3F));
                L = 2;
            }
            else if (V <= 0xFFFF)
            {
                Octet[0] = cast(char)(0xE0 | (V >> 12));
                Octet[1] = cast(char)(0x80 | ((V >> 6) & 0x3F));
                Octet[2] = cast(char)(0x80 | (V & 0x3F));
                L = 3;
            }
            else if (V <= 0x1FFFFF)
            {
                Octet[0] = cast(char)(0xF0 | (V >> 18));
                Octet[1] = cast(char)(0x80 | ((V >> 12) & 0x3F));
                Octet[2] = cast(char)(0x80 | ((V >> 6) & 0x3F));
                Octet[3] = cast(char)(0x80 | (V & 0x3F));
                L = 4;
            }
            else
            {
                throw new URIException("Undefined UTF-32 code point");
            }

            // Each octet becomes three output characters: '%', hi, lo.
            if (Rlen + L * 3 > Rsize)
            {
                char[] R2;

                Rsize = 2 * (Rlen + L * 3);
                R2 = new char[Rsize];
                R2[0 .. Rlen] = R[0 .. Rlen];
                R = R2;
            }

            for (j = 0; j < L; j++)
            {
                R[Rlen] = '%';
                R[Rlen + 1] = hex2ascii[Octet[j] >> 4];
                R[Rlen + 2] = hex2ascii[Octet[j] & 15];

                Rlen += 3;
            }
        }
    }

    return R[0 .. Rlen].idup;
}

// Value of a single hexadecimal digit. The caller must have validated `c`
// (e.g. with std.ascii.isHexDigit); no checking is done here.
private uint ascii2hex(dchar c) @nogc @safe pure nothrow
{
    return (c <= '9') ? c - '0' :
        (c <= 'F') ? c - 'A' + 10 :
        c - 'a' + 10;
}

/* Decode the %XX escapes of `uri` into UTF-32.
 *
 * Params:
 *  uri         = the encoded text
 *  reservedSet = OR-ed URI_* flags; an escape sequence that decodes to an
 *                ASCII character in this set is copied through unchanged
 *                (still escaped) instead of being replaced
 *
 * Returns: a freshly allocated dstring.
 * Throws: URIException on a truncated or non-hex escape, or on a malformed
 *         multi-octet sequence.
 */
private dstring URI_Decode(Char)(in Char[] uri, uint reservedSet)
if (isSomeChar!Char)
{
    import std.ascii : isHexDigit;

    uint j;
    uint k;
    uint V;
    dchar C;

    uint Rlen;

    immutable len = uri.length;
    auto s = uri;

    // Preallocate result buffer R guaranteed to be large enough for result:
    // decoding never produces more elements than the input has. Small
    // inputs use the stack buffer, larger ones the GC heap.
    auto Rsize = len;
    dchar[50] buffer = void;
    dchar[] R;
    if (Rsize > buffer.length)
        R = new dchar[Rsize];
    else
        R = buffer[0 .. Rsize];
    Rlen = 0;

    for (k = 0; k != len; k++)
    {
        char B;
        uint start;

        C = s[k];
        if (C != '%')
        {
            R[Rlen] = C;
            Rlen++;
            continue;
        }
        start = k;
        if (k + 2 >= len)
            throw new URIException("Unexpected end of URI");
        if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
            throw new URIException("Expected two hexadecimal digits after '%'");
        B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
        k += 2;
        if ((B & 0x80) == 0)
        {
            C = B;
        }
        else
        {
            uint n;

            // Count the leading 1 bits of the first octet: that is the
            // total number of octets in the sequence.
            for (n = 1; ; n++)
            {
                if (n > 4)
                    throw new URIException("UTF-32 code point size too large");
                if (((B << n) & 0x80) == 0)
                {
                    if (n == 1)
                        throw new URIException("UTF-32 code point size too small");
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            V = B & ((1 << (7 - n)) - 1);   // (!!!)

            // The remaining n - 1 octets each need three input characters.
            if (k + (3 * (n - 1)) >= len)
                throw new URIException("UTF-32 unaligned String");
            for (j = 1; j != n; j++)
            {
                k++;
                if (s[k] != '%')
                    throw new URIException("Expected: '%'");
                if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
                    throw new URIException("Expected two hexadecimal digits after '%'");
                B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
                if ((B & 0xC0) != 0x80)
                    throw new URIException("Incorrect UTF-32 multi-byte sequence");
                k += 2;
                V = (V << 6) | (B & 0x3F);
            }
            if (V > 0x10FFFF)
                throw new URIException("Unknown UTF-32 code point");
            C = V;
        }
        if (C < uri_flags.length && uri_flags[C] & reservedSet)
        {
            // R ~= s[start .. k + 1];
            immutable width = (k + 1) - start;
            for (int ii = 0; ii < width; ii++)
                R[Rlen + ii] = s[start + ii];
            Rlen += width;
        }
        else
        {
            R[Rlen] = C;
            Rlen++;
        }
    }
    assert(Rlen <= Rsize);  // enforce our preallocation size guarantee

    // Copy the working buffer into an immutable result
    return R[0 .. Rlen].idup;
}

/*************************************
 * Decodes the URI string encodedURI into a UTF-8 string and returns it.
 * Escape sequences that resolve to reserved URI characters are not replaced.
 * Escape sequences that resolve to the '#' character are not replaced.
 *
 * Params:
 *  encodedURI = the percent-encoded text
 * Returns: the decoded UTF-8 string
 * Throws: URIException if an escape sequence is malformed
 */

string decode(Char)(in Char[] encodedURI)
if (isSomeChar!Char)
{
    import std.algorithm.iteration : each;
    import std.utf : encode;
    auto s = URI_Decode(encodedURI, URI_Reserved | URI_Hash);
    char[] r;
    s.each!(c => encode(r, c));
    return r;
}

/*******************************
 * Decodes the URI string encodedURI into a UTF-8 string and returns it. All
 * escape sequences are decoded.
 *
 * Params:
 *  encodedURIComponent = the percent-encoded text
 * Returns: the decoded UTF-8 string
 * Throws: URIException if an escape sequence is malformed
 */

string decodeComponent(Char)(in Char[] encodedURIComponent)
if (isSomeChar!Char)
{
    import std.algorithm.iteration : each;
    import std.utf : encode;
    auto s = URI_Decode(encodedURIComponent, 0);
    char[] r;
    s.each!(c => encode(r, c));
    return r;
}

/*****************************
 * Encodes the UTF-8 string uri into a URI and returns that URI. Any character
 * not a valid URI character is escaped. The '#' character is not escaped.
 *
 * Params:
 *  uri = the text to encode
 * Returns: the percent-encoded string
 */

string encode(Char)(in Char[] uri)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;
    auto s = toUTF32(uri);
    return URI_Encode(s, URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark);
}

/********************************
 * Encodes the UTF-8 string uriComponent into a URI and returns that URI.
 * Any character not a letter, digit, or one of -_.!~*'() is escaped.
 *
 * Params:
 *  uriComponent = the text to encode
 * Returns: the percent-encoded string
 */

string encodeComponent(Char)(in Char[] uriComponent)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;
    auto s = toUTF32(uriComponent);
    return URI_Encode(s, URI_Alpha | URI_Digit | URI_Mark);
}

/* Encode associative array using www-form-urlencoding
 *
 * Params:
 *      values = an associative array containing the values to be encoded.
 *
 * Returns:
 *      A string encoded using www-form-urlencoding.
 */
package string urlEncode(in string[string] values)
{
    if (values.length == 0)
        return "";

    import std.array : Appender;
    import std.format : formattedWrite;

    Appender!string enc;
    enc.reserve(values.length * 128);

    bool first = true;
    foreach (k, v; values)
    {
        if (!first)
            enc.put('&');
        formattedWrite(enc, "%s=%s", encodeComponent(k), encodeComponent(v));
        first = false;
    }
    return enc.data;
}

@system unittest
{
    // @system because urlEncode -> encodeComponent -> URI_Encode,
    // and URI_Encode carries no @safe annotation
    string[string] a;
    assert(urlEncode(a) == "");
    assert(urlEncode(["name1" : "value1"]) == "name1=value1");
    assert(urlEncode(["name1" : "value1", "name2" : "value2"]) == "name1=value1&name2=value2");
}

/***************************
 * Does string s[] start with a URL?
 *
 * Only the $(D http://) and $(D https://) schemes are recognized (compared
 * case-insensitively), and the candidate must contain at least one '.'.
 *
 * Returns:
 *  -1   it does not
 *  len  it does, and s[0 .. len] is the slice of s[] that is that URL
 */

ptrdiff_t uriLength(Char)(in Char[] s)
if (isSomeChar!Char)
{
    /* Must start with one of:
     *  http://
     *  https://
     */
    import std.ascii : isAlphaNum;
    import std.uni : icmp;

    ptrdiff_t i;

    if (s.length <= 4)
        return -1;

    if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
    {
        i = 7;
    }
    else
    {
        if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
            i = 8;
        else
            return -1;
    }

    // Scan forward while the characters are ones accepted inside a URL,
    // remembering where the last '.' was seen.
    ptrdiff_t lastdot;
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '?' ||
                c == '=' || c == '%' || c == '&' ||
                c == '/' || c == '+' || c == '#' ||
                c == '~' || c == '$')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }
    if (!lastdot)
        return -1;

    return i;
}

///
@safe unittest
{
    string s1 = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
    assert(uriLength(s1) == 49);
    string s2 = "no uri here";
    assert(uriLength(s2) == -1);
    assert(uriLength("issue 14924") < 0);
}


/***************************
 * Does string s[] start with an email address?
 *
 * The local part must begin with a letter, and the text after the last '.'
 * of the domain must be two or three characters long.
 *
 * Returns:
 *  -1   it does not (this includes an empty s[])
 *  len  it does, and s[0 .. len] is the slice of s[] that is that email address
 * References:
 *  RFC2822
 */
ptrdiff_t emailLength(Char)(in Char[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlpha, isAlphaNum;

    ptrdiff_t i;

    // An empty string cannot start with an address; bail out before s[0]
    // is indexed.
    if (s.length == 0)
        return -1;

    if (!isAlpha(s[0]))
        return -1;

    for (i = 1; 1; i++)
    {
        if (i == s.length)
            return -1;
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '.')
            continue;
        if (c != '@')
            return -1;
        i++;
        break;
    }

    /* Now do the part past the '@'
     */
    ptrdiff_t lastdot;
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }
    if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
        return -1;

    return i;
}

///
@safe unittest
{
    string s1 = "my.e-mail@www.example-domain.com with garbage added";
    assert(emailLength(s1) == 32);
    string s2 = "no email address here";
    assert(emailLength(s2) == -1);
    assert(emailLength("issue 14924") < 0);
    assert(emailLength("") == -1);
}


@system unittest
{
    //@system because of encode -> URI_Encode
    debug(uri) writeln("uri.encodeURI.unittest");

    string source = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
    string target = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";

    auto result = encode(source);
    debug(uri) writefln("result = '%s'", result);
    assert(result == target);
    result = decode(target);
    debug(uri) writefln("result = '%s'", result);
    assert(result == source);

    result = encode(decode("%E3%81%82%E3%81%82"));
    assert(result == "%E3%81%82%E3%81%82");

    result = encodeComponent("c++");
    assert(result == "c%2B%2B");

    auto str = new char[10_000_000];
    str[] = 'A';
    result = encodeComponent(str);
    foreach (char c; result)
        assert(c == 'A');

    result = decode("%41%42%43");
    debug(uri) writeln(result);

    import std.meta : AliasSeq;
    foreach (StringType; AliasSeq!(char[], wchar[], dchar[], string, wstring, dstring))
    {
        import std.conv : to;
        StringType decoded1 = source.to!StringType;
        string encoded1 = encode(decoded1);
        assert(decoded1 == source.to!StringType); // check that `decoded1` wasn't changed
        assert(encoded1 == target);
        assert(decoded1 == decode(encoded1).to!StringType);

        StringType encoded2 = target.to!StringType;
        string decoded2 = decode(encoded2);
        assert(encoded2 == target.to!StringType); // check that `encoded2` wasn't changed
        assert(decoded2 == source);
        assert(encoded2 == encode(decoded2).to!StringType);
    }
}