1 files changed, 592 insertions, 0 deletions
diff --git a/libphobos/src/std/uri.d b/libphobos/src/std/uri.d
new file mode 100644
index 0000000..0852955
--- /dev/null
+++ b/libphobos/src/std/uri.d
@@ -0,0 +1,592 @@
+// Written in the D programming language.
+
+/**
+ * Encode and decode Uniform Resource Identifiers (URIs).
+ * URIs are used in internet transfer protocols.
+ * Valid URI characters consist of letters, digits,
+ * and the characters $(B ;/?:@&amp;=+$,-_.!~*'()).
+ * Reserved URI characters are $(B ;/?:@&amp;=+$,).
+ * Escape sequences consist of $(B %) followed by two hex digits.
+ *
+ * See_Also:
+ *  $(LINK2 http://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
+ *  $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
+ * Copyright: Copyright Digital Mars 2000 - 2009.
+ * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
+ * Authors:   $(HTTP digitalmars.com, Walter Bright)
+ * Source:    $(PHOBOSSRC std/_uri.d)
+ */
+/*          Copyright Digital Mars 2000 - 2009.
+ * Distributed under the Boost Software License, Version 1.0.
+ *    (See accompanying file LICENSE_1_0.txt or copy at
+ *          http://www.boost.org/LICENSE_1_0.txt)
+ */
+module std.uri;
+
+//debug=uri;        // uncomment to turn on debugging writefln's
+debug(uri) import std.stdio;
+import std.traits : isSomeChar;
+
+/**
+ * Thrown when encoding or decoding a URI fails.
+ */
+class URIException : Exception
+{
+    import std.exception : basicExceptionCtors;
+    mixin basicExceptionCtors; // provides the constructors; none are declared by hand
+}
+
+private enum    // character-class bits stored in uri_flags; OR them to form a mask
+{
+    URI_Alpha    = 1 << 0,  // 'A' .. 'Z' and 'a' .. 'z'
+    URI_Reserved = 1 << 1,  // ;/?:@&=+$,
+    URI_Mark     = 1 << 2,  // -_.!~*'()
+    URI_Digit    = 1 << 3,  // '0' .. '9'
+    URI_Hash     = 1 << 4,  // '#'
+}
+
+private immutable char[16] hex2ascii = "0123456789ABCDEF"; // hex2ascii[v] is the uppercase hex digit for v in 0 .. 15
+
+// URI_* class bits for each ASCII code, indexed by character. The function
+// literal below builds the table; it is evaluated at compile time.
+private immutable ubyte[128] uri_flags =
+    ({
+        ubyte[128] table;
+
+        for (int ch = 'a'; ch <= 'z'; ++ch)
+        {
+            table[ch] |= URI_Alpha;
+            table[ch - 0x20] |= URI_Alpha;  // the matching uppercase letter
+        }
+        for (int ch = '0'; ch <= '9'; ++ch) table[ch] |= URI_Digit;
+        foreach (char ch; "-_.!~*'()")      table[ch] |= URI_Mark;
+        foreach (char ch; ";/?:@&=+$,")     table[ch] |= URI_Reserved;
+        table['#'] |= URI_Hash;
+        return table;
+    })();
+
+/**
+ * Percent-encodes the UTF-32 string str.
+ *
+ * A code point below uri_flags.length whose table entry shares a bit with
+ * unescapedSet is copied through unchanged. Any other code point is turned
+ * into its UTF-8 octets, each written as '%' plus two uppercase hex digits.
+ *
+ * Params:
+ *      str = text to encode
+ *      unescapedSet = mask of URI_* bits naming the characters left as they are
+ *
+ * Returns:
+ *      A freshly allocated copy of the encoded text.
+ *
+ * Throws: URIException if a code point is larger than 0x1FFFFF;
+ *      OutOfMemoryError if alloca returns null.
+ */
+private string URI_Encode(dstring str, uint unescapedSet)
+{
+    import core.exception : OutOfMemoryError;
+    import core.stdc.stdlib : alloca;
+
+    // Result buffer. It starts out in `buffer` and moves to a larger block
+    // whenever the next character would not fit.
+    char[50] buffer = void;
+    char* R = buffer.ptr;
+    size_t Rlen = 0;                // number of chars written to R
+    size_t Rsize = buffer.length;   // capacity of R
+
+    immutable len = str.length;
+
+    // k, Rlen and Rsize are size_t so that they cannot wrap around before
+    // reaching str.length, which is itself a size_t.
+    for (size_t k = 0; k != len; k++)
+    {
+        immutable dchar C = str[k];
+        // if (C in unescapedSet)
+        immutable bool verbatim =
+            C < uri_flags.length && (uri_flags[C] & unescapedSet) != 0;
+
+        char[4] Octet;      // UTF-8 form of C; filled in only when escaping
+        size_t L = 0;       // number of octets stored in Octet
+
+        if (!verbatim)
+        {
+            // Transform C into octets
+            if (C <= 0x7F)
+            {
+                Octet[0] = cast(char) C;
+                L = 1;
+            }
+            else if (C <= 0x7FF)
+            {
+                Octet[0] = cast(char)(0xC0 | (C >> 6));
+                Octet[1] = cast(char)(0x80 | (C & 0x3F));
+                L = 2;
+            }
+            else if (C <= 0xFFFF)
+            {
+                Octet[0] = cast(char)(0xE0 | (C >> 12));
+                Octet[1] = cast(char)(0x80 | ((C >> 6) & 0x3F));
+                Octet[2] = cast(char)(0x80 | (C & 0x3F));
+                L = 3;
+            }
+            else if (C <= 0x1FFFFF)
+            {
+                Octet[0] = cast(char)(0xF0 | (C >> 18));
+                Octet[1] = cast(char)(0x80 | ((C >> 12) & 0x3F));
+                Octet[2] = cast(char)(0x80 | ((C >> 6) & 0x3F));
+                Octet[3] = cast(char)(0x80 | (C & 0x3F));
+                L = 4;
+            }
+            else
+            {
+                throw new URIException("Undefined UTF-32 code point");
+            }
+        }
+
+        // One char for a verbatim character, three ("%XX") per escaped octet.
+        immutable size_t need = verbatim ? 1 : L * 3;
+        if (Rlen + need > Rsize)
+        {
+            char* R2;
+
+            Rsize = 2 * (Rlen + need);
+            if (Rsize > 1024)
+            {
+                R2 = (new char[Rsize]).ptr;
+            }
+            else
+            {
+                // alloca is called here, not in a helper, so that the block
+                // belongs to this function's own stack frame.
+                R2 = cast(char *) alloca(Rsize * char.sizeof);
+                if (!R2)
+                    throw new OutOfMemoryError("Alloca failure");
+            }
+            R2[0 .. Rlen] = R[0 .. Rlen];
+            R = R2;
+        }
+
+        if (verbatim)
+        {
+            R[Rlen] = cast(char) C;
+            Rlen++;
+        }
+        else
+        {
+            foreach (immutable char octet; Octet[0 .. L])
+            {
+                R[Rlen] = '%';
+                R[Rlen + 1] = hex2ascii[octet >> 4];
+                R[Rlen + 2] = hex2ascii[octet & 15];
+                Rlen += 3;
+            }
+        }
+    }
+
+    return R[0 .. Rlen].idup;
+}
+
+private uint ascii2hex(dchar c) @nogc @safe pure nothrow
+{
+    if (c <= '9') return c - '0';       // at or below '9': offset from '0'
+    if (c <= 'F') return c - 'A' + 10;  // at or below 'F': offset from 'A', plus 10
+    return c - 'a' + 10;                // anything higher: offset from 'a', plus 10
+}
+
+/**
+ * Decodes the percent escapes in uri and returns the result as UTF-32.
+ *
+ * Characters other than '%' are copied as they are. An escape, or a run of
+ * escapes forming one UTF-8 sequence, is replaced by its code point unless
+ * that code point is below uri_flags.length and has a bit of reservedSet set
+ * in uri_flags; in that case the escape text itself is copied.
+ *
+ * Throws: URIException on a truncated, non-hexadecimal or ill-formed escape.
+ */
+private dstring URI_Decode(Char)(in Char[] uri, uint reservedSet)
+if (isSomeChar!Char)
+{
+    import core.exception : OutOfMemoryError;
+    import core.stdc.stdlib : alloca;
+    import std.ascii : isHexDigit;
+
+    immutable len = uri.length;
+    auto s = uri.ptr;
+
+    // Result buffer R. One slot per input character is always enough: every
+    // step below writes no more characters than it reads.
+    dchar* R;
+    size_t Rlen = 0;
+    auto Rsize = len;
+    if (Rsize > 1024 / dchar.sizeof)
+    {
+        R = (new dchar[Rsize]).ptr;
+    }
+    else
+    {
+        R = cast(dchar *) alloca(Rsize * dchar.sizeof);
+        if (!R)
+            throw new OutOfMemoryError("Alloca failure");
+    }
+
+    // Indices are size_t, like len, so they cannot wrap around on long input.
+    for (size_t k = 0; k != len; k++)
+    {
+        dchar C = s[k];
+        if (C != '%')
+        {
+            R[Rlen] = C;
+            Rlen++;
+            continue;
+        }
+        immutable size_t start = k;     // position of the first '%'
+        if (k + 2 >= len)
+            throw new URIException("Unexpected end of URI");
+        if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
+            throw new URIException("Expected two hexadecimal digits after '%'");
+        char B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
+        k += 2;
+        if ((B & 0x80) == 0)
+        {
+            C = B;
+        }
+        else
+        {
+            // n = number of leading 1 bits in B = octets in this sequence
+            uint n;
+            for (n = 1; ; n++)
+            {
+                if (n > 4)
+                    throw new URIException("UTF-32 code point size too large");
+                if (((B << n) & 0x80) == 0)
+                {
+                    if (n == 1)
+                        throw new URIException("UTF-32 code point size too small");
+                    break;
+                }
+            }
+
+            // Pick off (7 - n) significant bits of B from first byte of octet
+            uint V = B & ((1 << (7 - n)) - 1);
+            // each of the n - 1 continuation octets needs three characters
+            if (k + (3 * (n - 1)) >= len)
+                throw new URIException("UTF-32 unaligned String");
+            for (uint j = 1; j != n; j++)
+            {
+                k++;
+                if (s[k] != '%')
+                    throw new URIException("Expected: '%'");
+                if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
+                    throw new URIException("Expected two hexadecimal digits after '%'");
+                B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
+                if ((B & 0xC0) != 0x80)
+                    throw new URIException("Incorrect UTF-32 multi-byte sequence");
+                k += 2;
+                V = (V << 6) | (B & 0x3F);
+            }
+            if (V > 0x10FFFF)
+                throw new URIException("Unknown UTF-32 code point");
+            C = V;
+        }
+        if (C < uri_flags.length && uri_flags[C] & reservedSet)
+        {
+            // R ~= s[start .. k + 1];
+            immutable width = (k + 1) - start;
+            for (size_t ii = 0; ii < width; ii++)
+                R[Rlen + ii] = s[start + ii];
+            Rlen += width;
+        }
+        else
+        {
+            R[Rlen] = C;
+            Rlen++;
+        }
+    }
+    assert(Rlen <= Rsize);  // enforce our preallocation size guarantee
+    // Copy array on stack to array in memory
+    return R[0 .. Rlen].idup;
+}
+
+/*************************************
+ * Decodes the URI in encodedURI and returns the result as a UTF-8 string.
+ * Escape sequences standing for a reserved URI character or for '#' are
+ * kept escaped; every other escape sequence is decoded.
+ */
+
+string decode(Char)(in Char[] encodedURI)
+if (isSomeChar!Char)
+{
+    import std.utf : encode;
+    enum keepEscaped = URI_Reserved | URI_Hash;
+    char[] utf8;
+    foreach (dchar c; URI_Decode(encodedURI, keepEscaped))
+        encode(utf8, c);
+    return utf8;
+}
+
+/*******************************
+ * Decodes the URI component in encodedURIComponent and returns the result
+ * as a UTF-8 string. Every escape sequence is decoded.
+ */
+
+string decodeComponent(Char)(in Char[] encodedURIComponent)
+if (isSomeChar!Char)
+{
+    import std.utf : encode;
+    enum keepEscaped = 0;   // empty mask: no escape sequence is kept
+    char[] utf8;
+    foreach (dchar c; URI_Decode(encodedURIComponent, keepEscaped))
+        encode(utf8, c);
+    return utf8;
+}
+
+/*****************************
+ * Encodes the string uri as a URI and returns it. Letters, digits, marks,
+ * reserved characters and '#' are left as they are; anything else is escaped.
+ */
+
+string encode(Char)(in Char[] uri)
+if (isSomeChar!Char)
+{
+    import std.utf : toUTF32;
+    enum passThrough = URI_Alpha | URI_Digit | URI_Mark | URI_Reserved | URI_Hash;
+    return URI_Encode(toUTF32(uri), passThrough);
+}
+
+/********************************
+ * Encodes the string uriComponent as a URI component and returns it.
+ * Letters, digits and the marks -_.!~*'() are kept; anything else is escaped.
+ */
+
+string encodeComponent(Char)(in Char[] uriComponent)
+if (isSomeChar!Char)
+{
+    import std.utf : toUTF32;
+    enum passThrough = URI_Alpha | URI_Digit | URI_Mark;
+    return URI_Encode(toUTF32(uriComponent), passThrough);
+}
+
+/* Builds a www-form-urlencoded string from an associative array.
+ *
+ * Params:
+ *      values = the name/value pairs to encode
+ *
+ * Returns:
+ *      Each pair written as name=value, with name and value passed through
+ *      encodeComponent and the pairs joined by '&'; "" for an empty array.
+ */
+package string urlEncode(in string[string] values)
+{
+    import std.array : Appender;
+
+    if (values.length == 0)
+        return "";
+
+    Appender!string enc;
+    enc.reserve(values.length * 128);
+    size_t written = 0;     // pairs emitted so far
+    foreach (k, v; values)
+    {
+        if (written++ != 0)
+            enc.put('&');
+        enc.put(encodeComponent(k));
+        enc.put('=');
+        enc.put(encodeComponent(v));
+    }
+    return enc.data;
+}
+
+@system unittest
+{
+    // @system: urlEncode calls encodeComponent, which calls URI_Encode, and
+    // URI_Encode uses alloca and slices raw pointers
+    string[string] a;
+    assert(urlEncode(a) == "");
+    assert(urlEncode(["name1" : "value1"]) == "name1=value1");
+    assert(urlEncode(["name1" : "value1", "name2" : "value2"]) == "name1=value1&name2=value2");
+}
+
+/***************************
+ * Does string s[] start with a URL?
+ *
+ * A URL is recognised only when s[] begins with http:// or https://, compared
+ * without regard to case. It then runs over letters, digits and any of the
+ * characters - _ ? = % & / + # ~ $ and '.', and must contain at least one
+ * '.' after the prefix.
+ *
+ * Returns:
+ *  -1   it does not
+ *  len  it does, and s[0 .. len] is the slice of s[] that is that URL
+ */
+
+ptrdiff_t uriLength(Char)(in Char[] s)
+if (isSomeChar!Char)
+{
+    import std.ascii : isAlphaNum;
+    import std.uni : icmp;
+
+    // Find where the part after the prefix begins.
+    ptrdiff_t pos;
+    if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
+        pos = 7;
+    else if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
+        pos = 8;
+    else
+        return -1;
+
+    // Step over every accepted character, remembering the latest '.'.
+    ptrdiff_t dotAt = 0;
+    scan: for (; pos < s.length; pos++)
+    {
+        immutable c = s[pos];
+        switch (c)
+        {
+            // punctuation accepted inside a URL
+            case '-', '_', '?', '=', '%', '&':
+            case '/', '+', '#', '~', '$':
+                break;
+
+            // a dot is accepted too; note where it is
+            case '.':
+                dotAt = pos;
+                break;
+
+            // anything else has to be a letter or digit, or the URL ends
+            default:
+                if (!isAlphaNum(c))
+                    break scan;
+                break;
+        }
+    }
+
+    // dotAt is still 0 if no '.' was found after the prefix.
+    if (dotAt == 0)
+        return -1;
+    return pos;
+}
+
+///
+@safe unittest
+{
+    string s1 = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
+    assert(uriLength(s1) == 49);            // the URL stops at the space before "end!"
+    string s2 = "no uri here";
+    assert(uriLength(s2) == -1);            // no http:// or https:// prefix
+    assert(uriLength("issue 14924") < 0);   // no prefix either
+}
+
+
+/***************************
+ * Does string s[] start with an email address?
+ * Returns:
+ *  -1    it does not (this includes an empty s[])
+ *  len   it does, and s[0 .. len] is the slice of s[] that is that email address
+ * References:
+ *  RFC2822
+ */
+ptrdiff_t emailLength(Char)(in Char[] s)
+if (isSomeChar!Char)
+{
+    import std.ascii : isAlpha, isAlphaNum;
+
+    ptrdiff_t i;
+    // The length test comes first: s[0] does not exist for an empty s[].
+    if (s.length == 0 || !isAlpha(s[0]))
+        return -1;
+
+    /* Part before the '@': a letter, then letters, digits, '-', '_' or '.' */
+    for (i = 1; 1; i++)
+    {
+        if (i == s.length)
+            return -1;
+        auto c = s[i];
+        if (isAlphaNum(c))
+            continue;
+        if (c == '-' || c == '_' || c == '.')
+            continue;
+        if (c != '@')
+            return -1;
+        i++;
+        break;
+    }
+
+    /* Now do the part past the '@' */
+    ptrdiff_t lastdot;
+    for (; i < s.length; i++)
+    {
+        auto c = s[i];
+        if (isAlphaNum(c))
+            continue;
+        if (c == '-' || c == '_')
+            continue;
+        if (c == '.')
+        {
+            lastdot = i;
+            continue;
+        }
+        break;
+    }
+    // Accept only if a '.' was seen and exactly 2 or 3 characters follow it.
+    if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
+        return -1;
+    return i;
+}
+
+///
+@safe unittest
+{
+    string s1 = "my.e-mail@www.example-domain.com with garbage added";
+    assert(emailLength(s1) == 32);              // the address stops at the first space
+    string s2 = "no email address here";
+    assert(emailLength(s2) == -1);              // a space is reached before any '@'
+    assert(emailLength("issue 14924") < 0);     // likewise: no '@' before the space
+}
+
+
+@system unittest
+{
+    // @system because encode calls URI_Encode (alloca, raw pointer slicing)
+    debug(uri) writeln("uri.encodeURI.unittest");
+    // encode turns the space into %20 and leaves the rest of this input alone
+    string source = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
+    string target = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";
+
+    auto result = encode(source);
+    debug(uri) writefln("result = '%s'", result);
+    assert(result == target);
+    result = decode(target);
+    debug(uri) writefln("result = '%s'", result);
+    assert(result == source);
+    // a decode followed by an encode reproduces multi-octet escapes
+    result = encode(decode("%E3%81%82%E3%81%82"));
+    assert(result == "%E3%81%82%E3%81%82");
+    // encodeComponent escapes '+' as %2B
+    result = encodeComponent("c++");
+    assert(result == "c%2B%2B");
+    // 10 million characters: far more than URI_Encode's initial 50-char buffer
+    auto str = new char[10_000_000];
+    str[] = 'A';
+    result = encodeComponent(str);
+    foreach (char c; result)
+        assert(c == 'A');
+    // this result is only printed (under debug=uri), not asserted
+    result = decode("%41%42%43");
+    debug(uri) writeln(result);
+    // run the same round trips for each character width and mutability
+    import std.meta : AliasSeq;
+    foreach (StringType; AliasSeq!(char[], wchar[], dchar[], string, wstring, dstring))
+    {
+        import std.conv : to;
+        StringType decoded1 = source.to!StringType;
+        string encoded1 = encode(decoded1);
+        assert(decoded1 == source.to!StringType); // check that `decoded1` wasn't changed
+        assert(encoded1 == target);
+        assert(decoded1 == decode(encoded1).to!StringType);
+
+        StringType encoded2 = target.to!StringType;
+        string decoded2 = decode(encoded2);
+        assert(encoded2 == target.to!StringType); // check that `encoded2` wasn't changed
+        assert(decoded2 == source);
+        assert(encoded2 == encode(decoded2).to!StringType);
+    }
+}