From f911ba985aa7fe0096c386c5be385ac5825ea527 Mon Sep 17 00:00:00 2001
From: Tom Tromey
+ * A URI instance represents that defined by
+ * RFC3986,
+ * with some deviations.
+ *
+ * At its highest level, a URI consists of:
+ * [scheme:]scheme-specific-part
+ * [#fragment]
+ *
+ * where # and : are literal characters, + * and those parts enclosed in square brackets are optional. + *
+ *+ * There are two main types of URI. An opaque URI is one + * which just consists of the above three parts, and is not further + * defined. An example of such a URI would be a mailto: URI. + * In contrast, hierarchical URIs give further definition + * to the scheme-specific part, so as to represent some part of a hierarchical + * structure. + *
+ *
+ * [//authority][path]
+ * [?query]
+ *
+ * with / and ? being literal characters. + * When server-based, the authority section is further subdivided into: + *
+ *
+ * [user-info@]host
+ * [:port]
+ *
+ * with @ and : as literal characters. + * Authority sections that are not server-based are said to be registry-based. + *
+ *+ * Hierarchical URIs can be either relative or absolute. Absolute URIs + * always specify a scheme, while relative URIs don't + * specify a scheme. Opaque URIs are always absolute. + *
+ *
+ * Each part of the URI may have one of three states: undefined, empty
+ * or containing some content. The former two of these are represented
+ * by null
and the empty string in Java, respectively.
+ * The scheme-specific part may never be undefined. It also follows from
+ * this that the path sub-part may also not be undefined, so as to ensure
+ * the former.
+ *
+ * The characters that can be used within a valid URI are restricted. + * There are two main classes of characters which can't be used as is + * within the URI: + *
+ *+ * The set of valid characters differs depending on the section of the URI: + *
+ *+ * These definitions reference the following sets of characters: + *
+ *
+ * The constructors and accessor methods allow the use and retrieval of
+ * URI components which contain non-US-ASCII characters directly.
+ * They are only escaped when the toASCIIString()
method
+ * is used. In contrast, illegal characters are always quoted, with the
+ * exception of the return values of the non-raw accessors.
+ *
+ * Returns the string content of the specified group of the supplied + * matcher. The returned value is modified according to the following: + *
+ *null
is returned to indicate an undefined
+ * value. Otherwise, the value is truly the empty string and this is
+ * the returned value.+ * This method is used for matching against all parts of the URI + * that may be either undefined or empty (i.e. all those but the + * scheme-specific part and the path). In each case, the preceding + * group is the content of the original group, along with some + * additional distinguishing feature. For example, the preceding + * group for the query includes the preceding question mark, + * while that of the fragment includes the hash symbol. The presence + * of these features enables disambiguation between the two cases + * of a completely unspecified value and a simple non-existent value. + * The scheme differs in that it will never return an empty string; + * the delimiter follows the scheme rather than preceding it, so + * it becomes part of the following section. The same is true + * of the user information. + *
+ * + * @param match the matcher, which contains the results of the URI + * matched against the URI regular expression. + * @return either the matched content,null
for undefined
+ * values, or an empty string for a URI part with empty content.
+ */
+ private static String getURIGroup(Matcher match, int group)
+ {
+   // Matcher.group() yields null for a group that took no part in the
+   // match, so a null group is treated exactly like an empty one here
+   // instead of being dereferenced.
+   String matched = match.group(group);
+   if (matched != null && matched.length() > 0)
+     {
+       return matched;
+     }
+
+   // No content.  The preceding group holds the content together with
+   // its delimiter; if that is absent too the part is undefined (null),
+   // otherwise the delimiter was present and the part is merely empty.
+   String enclosing = match.group(group - 1);
+   if (enclosing == null || enclosing.length() == 0)
+     {
+       return null;
+     }
+   return "";
+ }
+
+ /**
+  * Parses the given string and fills in the fields of this URI from
+  * the components found in it.
+  *
+  * @param str The string to parse
+  *
+  * @exception URISyntaxException If the given string violates RFC 2396
+  */
+ private void parseURI(String str) throws URISyntaxException
+ {
+   Matcher uriMatcher = URI_PATTERN.matcher(str);
+
+   // Reject anything the URI expression does not accept up front.
+   if (!uriMatcher.matches())
+     {
+       throw new URISyntaxException(str,
+                                    "doesn't match URI regular expression");
+     }
+
+   scheme = getURIGroup(uriMatcher, SCHEME_GROUP);
+   rawSchemeSpecificPart = uriMatcher.group(SCHEME_SPEC_PART_GROUP);
+   schemeSpecificPart = unquote(rawSchemeSpecificPart);
+
+   // Only hierarchical URIs carry authority, path and query components.
+   boolean hierarchical = !isOpaque();
+   if (hierarchical)
+     {
+       rawAuthority = getURIGroup(uriMatcher, AUTHORITY_GROUP);
+       rawPath = uriMatcher.group(PATH_GROUP);
+       rawQuery = getURIGroup(uriMatcher, QUERY_GROUP);
+     }
+   rawFragment = getURIGroup(uriMatcher, FRAGMENT_GROUP);
+
+   parseServerAuthority();
+
+   // The decoded forms are computed eagerly: this is the only point at
+   // which an exception may still be thrown.
+   authority = unquote(rawAuthority);
+   userInfo = unquote(rawUserInfo);
+   host = unquote(rawHost);
+   path = unquote(rawPath);
+   query = unquote(rawQuery);
+   fragment = unquote(rawFragment);
+ }
+
+ /**
+  * Decodes the "%" + hex escape sequences in the given string.
+  *
+  * Each escape sequence contributes a single byte, while every other
+  * character contributes its UTF-8 encoding.  The resulting byte
+  * sequence is then decoded as UTF-8.  Unescaped non-US-ASCII
+  * characters therefore come back unchanged instead of being truncated
+  * to their low-order byte.
+  *
+  * @param str The string to unquote or null.
+  *
+  * @return The unquoted string or null if str was null.
+  *
+  * @exception URISyntaxException If the given string contains invalid
+  * escape sequences.
+  */
+ private static String unquote(String str) throws URISyntaxException
+ {
+   if (str == null)
+     {
+       return null;
+     }
+   int len = str.length();
+   java.io.ByteArrayOutputStream decoded =
+     new java.io.ByteArrayOutputStream(len);
+   try
+     {
+       int i = 0;
+       while (i < len)
+         {
+           if (str.charAt(i) == '%')
+             {
+               // An escape needs two hex digits after the percent sign.
+               if (i + 2 >= len)
+                 {
+                   throw new URISyntaxException(str,
+                                                "Invalid quoted character");
+                 }
+               int hi = Character.digit(str.charAt(i + 1), 16);
+               int lo = Character.digit(str.charAt(i + 2), 16);
+               if (lo < 0 || hi < 0)
+                 {
+                   throw new URISyntaxException(str,
+                                                "Invalid quoted character");
+                 }
+               decoded.write(hi * 16 + lo);
+               i += 3;
+             }
+           else
+             {
+               // Copy the whole literal run up to the next escape as
+               // UTF-8, so multi-byte characters and surrogate pairs
+               // are encoded as units.
+               int end = str.indexOf('%', i);
+               if (end == -1)
+                 {
+                   end = len;
+                 }
+               byte[] literal = str.substring(i, end).getBytes("utf-8");
+               decoded.write(literal, 0, literal.length);
+               i = end;
+             }
+         }
+       return decoded.toString("utf-8");
+     }
+   catch (java.io.UnsupportedEncodingException x2)
+     {
+       throw (Error) new InternalError().initCause(x2);
+     }
+ }
+
+ /**
+  * Quotes the given string for use in a URI, treating the characters
+  * of RFC3986_SSP as legal.  The work is delegated to
+  * quote(String, String).
+  *
+  * @param str The string to quote
+  *
+  * @return The quoted string.
+  */
+ private static String quote(String str)
+ {
+   String quoted = quote(str, RFC3986_SSP);
+   return quoted;
+ }
+
+ /**
+  * Quotes the given string for use as a URI authority, treating the
+  * characters of RFC3986_REG_NAME as legal.  The work is delegated to
+  * quote(String, String).
+  *
+  * @param str The string to quote
+  *
+  * @return The quoted string.
+  */
+ private static String quoteAuthority(String str)
+ {
+   // RFC2396_AUTHORITY would technically be the set to use here, but it
+   // contains no additional characters.
+   String quoted = quote(str, RFC3986_REG_NAME);
+   return quoted;
+ }
+
+ /**
+  * Quotes the US-ASCII characters in the supplied string that are not
+  * part of the specified set of legal characters, replacing each with
+  * "%" followed by its two-digit hex code taken from HEX.
+  *
+  * Characters above 127 are copied through unchanged, whether or not
+  * they appear in the legal set; they are only escaped by
+  * toASCIIString().
+  *
+  * @param str the string to quote
+  * @param legalCharacters the set of legal characters
+  *
+  * @return the quoted string.
+  */
+ private static String quote(String str, String legalCharacters)
+ {
+   StringBuffer sb = new StringBuffer(str.length());
+   for (int i = 0; i < str.length(); i++)
+     {
+       char c = str.charAt(i);
+       // Both conditions must hold for escaping.  Testing them in nested
+       // ifs made non-US-ASCII characters outside the legal set fall
+       // through both branches and vanish from the result.
+       if (c <= 127 && legalCharacters.indexOf(c) == -1)
+         {
+           sb.append('%');
+           sb.append(HEX.charAt(c / 16));
+           sb.append(HEX.charAt(c % 16));
+         }
+       else
+         {
+           sb.append(c);
+         }
+     }
+   return sb.toString();
+ }
+
+ /**
+  * Quotes the given string for use as a URI host, treating the
+  * characters of RFC3986_HOST as legal.  The work is delegated to
+  * quote(String, String).
+  *
+  * @param str The string to quote
+  *
+  * @return The quoted string.
+  */
+ private static String quoteHost(String str)
+ {
+   String quoted = quote(str, RFC3986_HOST);
+   return quoted;
+ }
+
+ /**
+  * Quotes the given string for use as a URI path, treating the
+  * characters of RFC3986_PATH_SEGMENTS as legal.  The work is
+  * delegated to quote(String, String).
+  *
+  * @param str The string to quote
+  *
+  * @return The quoted string.
+  */
+ private static String quotePath(String str)
+ {
+   // RFC2396_PATH would technically be the set to use here, but it
+   // contains no additional characters.
+   String quoted = quote(str, RFC3986_PATH_SEGMENTS);
+   return quoted;
+ }
+
+ /**
+  * Quotes the given string for use as URI user information, treating
+  * the characters of RFC3986_USERINFO as legal.  The work is delegated
+  * to quote(String, String).
+  *
+  * @param str The string to quote
+  *
+  * @return The quoted string.
+  */
+ private static String quoteUserInfo(String str)
+ {
+   String quoted = quote(str, RFC3986_USERINFO);
+   return quoted;
+ }
+
+ /**
+  * Constructs a URI by parsing the given string.  The string is
+  * recorded as supplied before parsing begins.
+  *
+  * @param str The string to create the URI from
+  *
+  * @exception URISyntaxException If the given string violates RFC 2396
+  * @exception NullPointerException If str is null
+  */
+ public URI(String str) throws URISyntaxException
+ {
+   // Stored first: the parsing code reports errors against this value.
+   string = str;
+   parseURI(str);
+ }
+
+ /**
+  * Constructs a URI from server-based components.  The components are
+  * quoted where necessary, assembled into a single string and then
+  * parsed; null components (and a port of -1) are left out.
+  *
+  * @param scheme The scheme name
+  * @param userInfo The username and authorization info
+  * @param host The hostname
+  * @param port The port number
+  * @param path The path
+  * @param query The query
+  * @param fragment The fragment
+  *
+  * @exception URISyntaxException If the given string violates RFC 2396
+  */
+ public URI(String scheme, String userInfo, String host, int port,
+            String path, String query, String fragment)
+   throws URISyntaxException
+ {
+   // "//" is emitted as soon as any authority component is present.
+   this((scheme != null ? scheme + ":" : "")
+        + (userInfo != null || host != null || port != -1 ? "//" : "")
+        + (userInfo != null ? quoteUserInfo(userInfo) + "@" : "")
+        + (host != null ? quoteHost(host) : "")
+        + (port != -1 ? ":" + port : "")
+        + (path != null ? quotePath(path) : "")
+        + (query != null ? "?" + quote(query) : "")
+        + (fragment != null ? "#" + quote(fragment) : ""));
+ }
+
+ /**
+  * Constructs a URI from hierarchical components.  The components are
+  * quoted where necessary, assembled into a single string and then
+  * parsed; null components are left out.
+  *
+  * @param scheme The scheme name
+  * @param authority The authority
+  * @param path The path
+  * @param query The query
+  * @param fragment The fragment
+  *
+  * @exception URISyntaxException If the given string violates RFC 2396
+  */
+ public URI(String scheme, String authority, String path, String query,
+            String fragment) throws URISyntaxException
+ {
+   this((scheme != null ? scheme + ":" : "")
+        + (authority != null ? "//" + quoteAuthority(authority) : "")
+        + (path != null ? quotePath(path) : "")
+        + (query != null ? "?" + quote(query) : "")
+        + (fragment != null ? "#" + quote(fragment) : ""));
+ }
+
+ /**
+  * Constructs a URI from a scheme, host, path and fragment.  This
+  * delegates to the seven-argument constructor with no user
+  * information, no port (-1) and no query.
+  *
+  * @param scheme The scheme name
+  * @param host The hostname
+  * @param path The path
+  * @param fragment The fragment
+  *
+  * @exception URISyntaxException If the given string violates RFC 2396
+  */
+ public URI(String scheme, String host, String path, String fragment)
+   throws URISyntaxException
+ {
+   this(scheme, (String) null, host, -1, path, (String) null, fragment);
+ }
+
+ /**
+  * Constructs a URI from a scheme, scheme-specific part and fragment.
+  * The parts are quoted where necessary, assembled into a single
+  * string and then parsed; null parts are left out.
+  *
+  * @param scheme The scheme name
+  * @param ssp The scheme specific part
+  * @param fragment The fragment
+  *
+  * @exception URISyntaxException If the given string violates RFC 2396
+  */
+ public URI(String scheme, String ssp, String fragment)
+   throws URISyntaxException
+ {
+   this((scheme != null ? scheme + ":" : "")
+        + (ssp != null ? quote(ssp) : "")
+        + (fragment != null ? "#" + quote(fragment) : ""));
+ }
+
+ /**
+  * Creates a URI from the given string, reporting a syntax error as an
+  * unchecked exception.  The IllegalArgumentException carries both the
+  * message of the underlying URISyntaxException and the exception
+  * itself as its cause.
+  *
+  * @param str The string to create the URI from
+  *
+  * @exception IllegalArgumentException If the given string violates RFC 2396
+  * @exception NullPointerException If str is null
+  */
+ public static URI create(String str)
+ {
+   try
+     {
+       return new URI(str);
+     }
+   catch (URISyntaxException e)
+     {
+       // Pass the message along too, so getMessage() on the unchecked
+       // exception is not null.
+       throw (IllegalArgumentException)
+         new IllegalArgumentException(e.getMessage()).initCause(e);
+     }
+ }
+
+ /**
+  * Attempts to parse this URI's authority component, if defined,
+  * into user-information, host, and port components.  The purpose
+  * of this method was to disambiguate between some authority sections,
+  * which form invalid server-based authorities, but valid registry
+  * based authorities.  In the updated RFC 3986, the authority section
+  * is defined differently, with registry-based authorities part of
+  * the host section.  Thus, this method is now simply an explicit
+  * way of parsing any authority section.
+  *
+  * @return the URI, with the authority section parsed into user
+  * information, host and port components.
+  * @throws URISyntaxException if the given string violates RFC 2396
+  */
+ public URI parseServerAuthority() throws URISyntaxException
+ {
+   // Nothing to parse when no authority is defined.
+   if (rawAuthority == null)
+     {
+       return this;
+     }
+
+   Matcher authMatcher = AUTHORITY_PATTERN.matcher(rawAuthority);
+   if (!authMatcher.matches())
+     {
+       throw new URISyntaxException(string,
+                                    "doesn't match URI regular expression");
+     }
+
+   rawUserInfo = getURIGroup(authMatcher, AUTHORITY_USERINFO_GROUP);
+   rawHost = getURIGroup(authMatcher, AUTHORITY_HOST_GROUP);
+
+   String portText = getURIGroup(authMatcher, AUTHORITY_PORT_GROUP);
+   if (portText == null)
+     {
+       return this;
+     }
+
+   try
+     {
+       port = Integer.parseInt(portText);
+     }
+   catch (NumberFormatException e)
+     {
+       // Report an unparsable port as a syntax error, keeping the cause.
+       URISyntaxException syntaxError =
+         new URISyntaxException(string,
+                                "doesn't match URI regular expression");
+       syntaxError.initCause(e);
+       throw syntaxError;
+     }
+   return this;
+ }
+
+ /**
+  * Returns a normalized version of the URI.  If the URI is opaque,
+  * or its path is already in normal form, then this URI is simply
+  * returned.  Otherwise, the following transformation of the path
+  * element takes place: the path is passed through
+  * normalizePath(String), which removes `.' and `..' segments, and a
+  * new URI is built from the result and the other components.
+  *
+  * The resulting URI will be free of `.' and `..' segments, barring those
+  * that were prepended or which couldn't be paired, respectively.
+  * NOTE(review): normalizePath also strips leading `./' and `../'
+  * prefixes, so the "prepended" wording above may not hold -- confirm.
+  *
+  * @return the normalized URI.
+  */
+ public URI normalize()
+ {
+   // A path is taken to be in normal form when it contains neither
+   // "/./" nor "/../".
+   if (isOpaque() || path.indexOf("/./") == -1 && path.indexOf("/../") == -1)
+     {
+       return this;
+     }
+   try
+     {
+       return new URI(scheme, authority, normalizePath(path), query,
+                      fragment);
+     }
+   catch (URISyntaxException e)
+     {
+       throw (Error) new InternalError("Normalized URI variant could not "+
+                                       "be constructed").initCause(e);
+     }
+ }
+
+ /**
+  * Normalize the given path.  The following transformation takes place,
+  * working from the front of the path:
+  *
+  * A. A leading `../' or `./' is dropped.
+  * B. `/./' becomes `/'; a final `/.' becomes `/'.
+  * C. `/../' becomes `/' and the last segment already output is
+  *    removed; a final `/..' is handled the same way.
+  * D. A path consisting only of `.' or `..' is dropped.
+  * E. Otherwise the first segment, including its leading `/' if any,
+  *    is moved to the output.
+  *
+  * A segment only counts as `.' or `..' when it is a complete segment,
+  * so names such as `.profile' or `..data' are kept as they are.
+  *
+  * @param relativePath the relative path to be normalized.
+  * @return the normalized path.
+  */
+ private String normalizePath(String relativePath)
+ {
+   /*
+      This follows the algorithm in section 5.2.4. of RFC3986,
+      but doesn't modify the input buffer.
+   */
+   String input = relativePath;
+   int len = input.length();
+   StringBuffer output = new StringBuffer(len);
+   int start = 0;
+   while (start < len)
+     {
+       /* A */
+       if (input.startsWith("../", start))
+         {
+           start += 3;
+           continue;
+         }
+       if (input.startsWith("./", start))
+         {
+           start += 2;
+           continue;
+         }
+       /* B */
+       if (input.startsWith("/./", start))
+         {
+           start += 2;
+           continue;
+         }
+       if (start + 2 == len && input.startsWith("/.", start))
+         {
+           // A final "/." stands for "/"; nothing is left after it.
+           output.append('/');
+           break;
+         }
+       /* C */
+       if (input.startsWith("/../", start))
+         {
+           start += 3;
+           removeLastSegment(output);
+           continue;
+         }
+       if (start + 3 == len && input.startsWith("/..", start))
+         {
+           // A final "/.." removes the previous segment and leaves "/".
+           removeLastSegment(output);
+           output.append('/');
+           break;
+         }
+       /* D */
+       int remaining = len - start;
+       if ((remaining == 1 && input.charAt(start) == '.')
+           || (remaining == 2 && input.startsWith("..", start)))
+         {
+           break;
+         }
+       /* E */
+       int nextSlash = input.indexOf('/', start + 1);
+       if (nextSlash == -1)
+         {
+           nextSlash = len;
+         }
+       output.append(input.substring(start, nextSlash));
+       start = nextSlash;
+     }
+   return output.toString();
+ }
+
+ /**
+  * Removes the last segment of the path from the specified buffer,
+  * together with the `/' in front of it.  A buffer without any `/' is
+  * emptied.
+  *
+  * @param buffer the buffer containing the path.
+  */
+ private void removeLastSegment(StringBuffer buffer)
+ {
+   int lastSlash = buffer.lastIndexOf("/");
+   if (lastSlash == -1)
+     {
+       buffer.setLength(0);
+     }
+   else
+     {
+       buffer.setLength(lastSlash);
+     }
+ }
+
+ /**
+  * Resolves the given URI against this URI.  The given URI is returned
+  * as it is when it is absolute or opaque, or when this URI is opaque.
+  *
+  * @param uri The URI to resolve against this URI
+  *
+  * @return The resulting URI, or null when it couldn't be resolved
+  * for some reason.
+  *
+  * @throws NullPointerException if uri is null
+  */
+ public URI resolve(URI uri)
+ {
+   if (uri.isAbsolute())
+     {
+       return uri;
+     }
+   if (uri.isOpaque())
+     {
+       return uri;
+     }
+   // An opaque base has no path to resolve against.
+   if (isOpaque())
+     {
+       return uri;
+     }
+
+   String scheme = uri.getScheme();
+   String authority = uri.getAuthority();
+   String path = uri.getPath();
+   String query = uri.getQuery();
+   String fragment = uri.getFragment();
+
+   try
+     {
+       // A reference consisting of a fragment only keeps everything
+       // else from this URI.
+       if (fragment != null && path != null && path.equals("")
+           && scheme == null && authority == null && query == null)
+         {
+           return new URI(this.scheme, this.schemeSpecificPart, fragment);
+         }
+
+       if (authority == null)
+         {
+           authority = this.authority;
+           if (path == null)
+             {
+               path = "";
+             }
+           if (! (path.startsWith("/")))
+             {
+               // Merge: keep the base path up to and including its last
+               // '/', or none of it when it contains no '/' at all.
+               int i = this.path.lastIndexOf('/');
+               StringBuffer basepath =
+                 new StringBuffer(this.path.substring(0, i + 1));
+               basepath.append(path);
+               path = normalizePath(basepath.toString());
+             }
+         }
+       return new URI(this.scheme, authority, path, query, fragment);
+     }
+   catch (URISyntaxException e)
+     {
+       throw (Error) new InternalError("Resolved URI variant could not "+
+                                       "be constructed").initCause(e);
+     }
+ }
+
+ /**
+  * Resolves the given URI string against this URI.  The string is
+  * first turned into a URI with create(String).
+  *
+  * @param str The URI as string to resolve against this URI
+  *
+  * @return The resulting URI
+  *
+  * @throws IllegalArgumentException If the given URI string
+  * violates RFC 2396
+  * @throws NullPointerException If uri is null
+  */
+ public URI resolve(String str) throws IllegalArgumentException
+ {
+   return resolve(create(str));
+ }
+
+ /**
+  * Relativizes the given URI against this URI.  The following
+  * algorithm is used:
+  *
+ *
+ * Compares the URI with the given object for equality. If the
+ * object is not a URI
, then the method returns false.
+ * Otherwise, the following criteria are observed:
+ *
true
if the objects are equal, according to
+ * the specification above.
+ */
+ public boolean equals(Object obj)
+ {
+ if (!(obj instanceof URI))
+ return false;
+ URI uriObj = (URI) obj;
+ if (scheme == null)
+ {
+ if (uriObj.getScheme() != null)
+ return false;
+ }
+ else
+ if (!(scheme.equalsIgnoreCase(uriObj.getScheme())))
+ return false;
+ if (rawFragment == null)
+ {
+ if (uriObj.getRawFragment() != null)
+ return false;
+ }
+ else
+ if (!(rawFragment.equalsIgnoreCase(uriObj.getRawFragment())))
+ return false;
+ boolean opaqueThis = isOpaque();
+ boolean opaqueObj = uriObj.isOpaque();
+ if (opaqueThis && opaqueObj)
+ return rawSchemeSpecificPart.equals(uriObj.getRawSchemeSpecificPart());
+ else if (!opaqueThis && !opaqueObj)
+ {
+ boolean common = rawPath.equalsIgnoreCase(uriObj.getRawPath())
+ && ((rawQuery == null && uriObj.getRawQuery() == null)
+ || rawQuery.equalsIgnoreCase(uriObj.getRawQuery()));
+ if (rawAuthority == null && uriObj.getRawAuthority() == null)
+ return common;
+ if (host == null)
+ return common
+ && rawAuthority.equalsIgnoreCase(uriObj.getRawAuthority());
+ return common
+ && host.equalsIgnoreCase(uriObj.getHost())
+ && port == uriObj.getPort()
+ && (rawUserInfo == null ?
+ uriObj.getRawUserInfo() == null :
+ rawUserInfo.equalsIgnoreCase(uriObj.getRawUserInfo()));
+ }
+ else
+ return false;
+ }
+
+ /**
+  * Computes the hashcode of the URI.
+  *
+  * Every component that equals(Object) compares without regard to
+  * case is hashed without regard to case, so two URIs that differ only
+  * in the case of such a component get the same hash code.  For
+  * hierarchical URIs the hash is built from the path, query and host;
+  * for opaque URIs from the raw scheme-specific part.
+  *
+  * @return the hash code of this URI.
+  */
+ public int hashCode()
+ {
+   int hash = 13 * hashIgnoreCase(getScheme());
+   if (isOpaque())
+     {
+       // Opaque URIs compare their scheme-specific part exactly.
+       String ssp = getRawSchemeSpecificPart();
+       hash += 17 * (ssp == null ? 0 : ssp.hashCode());
+     }
+   else
+     {
+       // Port and user information are left out; the host is included
+       // on the assumption that authorities equal up to case yield
+       // hosts equal up to case -- TODO confirm against AUTHORITY_PATTERN.
+       hash += 17 * (hashIgnoreCase(getRawPath())
+                     + 31 * hashIgnoreCase(getRawQuery())
+                     + 37 * hashIgnoreCase(getHost()));
+     }
+   String frag = getRawFragment();
+   if (frag != null)
+     {
+       hash += 21 + hashIgnoreCase(frag);
+     }
+   return hash;
+ }
+
+ /**
+  * Computes a hash of the given string that is the same for any two
+  * strings String.equalsIgnoreCase considers equal.
+  *
+  * @param s the string to hash, or null
+  * @return the case-insensitive hash, or 0 if s is null.
+  */
+ private static int hashIgnoreCase(String s)
+ {
+   if (s == null)
+     {
+       return 0;
+     }
+   int h = 0;
+   for (int i = 0; i < s.length(); i++)
+     {
+       // Fold through upper case and then lower case, matching the
+       // per-character comparison equalsIgnoreCase performs.
+       h = 31 * h + Character.toLowerCase(Character.toUpperCase(s.charAt(i)));
+     }
+   return h;
+ }
+
+ /**
+ * Compare the URI with another object that must also be a URI.
+ * Undefined components are taken to be less than any other component.
+ * The following criteria are observed:
+ *
+ * toString()
for URIs that don't contain any non-US-ASCII
+ * characters. Otherwise, the non-US-ASCII characters are replaced
+ * by their percent-encoded representations.
+ *
+ * @return a string representation of the URI, containing only US-ASCII
+ * characters.
+ */
+ public String toASCIIString()
+ {
+   String strRep = toString();
+   StringBuffer buffer = new StringBuffer();
+   // Run of consecutive non-US-ASCII characters still waiting to be
+   // escaped, or null when no such run is open.  Runs are escaped as a
+   // whole by escapeCharacters.
+   StringBuffer encBuffer = null;
+   for (int i = 0; i < strRep.length(); i++)
+     {
+       char c = strRep.charAt(i);
+       if (c <= 127)
+         {
+           if (encBuffer != null)
+             {
+               buffer.append(escapeCharacters(encBuffer.toString()));
+               encBuffer = null;
+             }
+           buffer.append(c);
+         }
+       else
+         {
+           if (encBuffer == null)
+             {
+               encBuffer = new StringBuffer();
+             }
+           encBuffer.append(c);
+         }
+     }
+   // Flush a run that extends to the end of the string; without this a
+   // URI ending in non-US-ASCII characters lost them.
+   if (encBuffer != null)
+     {
+       buffer.append(escapeCharacters(encBuffer.toString()));
+     }
+   return buffer.toString();
+ }
+
+ /**
+  * Percent-encodes the supplied string.  The string is converted to
+  * its UTF-8 bytes and every byte is written as "%" followed by its
+  * two hexadecimal digits taken from HEX.
+  *
+  * @param str a string including non-ASCII characters.
+  * @return the string with the non-ASCII characters converted to their
+  * percent-encoded representations.
+  */
+ private static String escapeCharacters(String str)
+ {
+   byte[] octets;
+   try
+     {
+       // this is far from optimal, but it works
+       octets = str.getBytes("utf-8");
+     }
+   catch (java.io.UnsupportedEncodingException x)
+     {
+       throw (Error) new InternalError("Escaping error").initCause(x);
+     }
+
+   StringBuffer escaped = new StringBuffer();
+   for (int k = 0; k < octets.length; k++)
+     {
+       // Mask to get the unsigned value of the byte.
+       int value = octets[k] & 0xff;
+       escaped.append('%')
+         .append(HEX.charAt(value / 16))
+         .append(HEX.charAt(value % 16));
+     }
+   return escaped.toString();
+ }
+
+}
--
cgit v1.1