/**
 * Gets the canonical url, starting from a relative or absolute url found in a given context
 * (baseURL).
 *
 * <p>The url is first resolved against the base url. The result is then normalized: the scheme
 * and host are lowercased (locale-independently), the scheme's default port is dropped, empty
 * path segments and leading {@code /../} segments are removed, and the query parameters are
 * passed through {@code createParameterMap} and {@code canonicalize}.
 *
 * @param url the url string defining the reference
 * @param baseURL the context in which this url was found; {@code null} is treated as an empty
 *     base
 * @return the canonical url, or {@code null} if the resolved url is not a valid URI/URL or its
 *     scheme or host cannot be determined
 */
public static String getCanonicalURL(String url, String baseURL) {
    try {
        /* Build the absolute URL, from the url and the baseURL */
        String resolvedURL = URLResolver.resolveUrl(baseURL == null ? "" : baseURL, url);
        log.debug("Resolved URL: " + resolvedURL);
        URI canonicalURI = new URI(resolvedURL);

        /* Some checking. */
        if (canonicalURI.getScheme() == null) {
            throw new MalformedURLException(
                    "Protocol could not be reliably evaluated from uri: "
                            + canonicalURI
                            + " and base url: "
                            + baseURL);
        }
        if (canonicalURI.getHost() == null) {
            throw new MalformedURLException(
                    "Host could not be reliably evaluated from: " + canonicalURI);
        }

        /*
         * Normalize: no empty segments (i.e., "//"), no segments equal to ".", and no segments
         * equal to ".." that are preceded by a segment not equal to "..".
         */
        String path = canonicalURI.normalize().getRawPath();

        /* Convert '//' -> '/' (repeat, since "///" collapses to "//" after one pass) */
        while (path.contains("//")) {
            path = path.replace("//", "/");
        }

        /* Drop starting '/../' (keep the trailing slash of the dropped segment) */
        while (path.startsWith("/../")) {
            path = path.substring(3);
        }

        /* Trim */
        path = path.trim();

        /* Process parameters and sort them. */
        final SortedMap<String, String> params =
                createParameterMap(canonicalURI.getRawQuery());
        final String canonicalParams = canonicalize(params);
        final String queryString = canonicalParams.isEmpty() ? "" : "?" + canonicalParams;

        /* Add starting slash if needed */
        if (path.isEmpty()) {
            path = "/";
        }

        /*
         * Lowercasing protocol and host. Locale.ROOT keeps the result independent of the JVM
         * default locale (e.g. Turkish 'I' -> dotless 'i' would corrupt the host name).
         */
        String protocol = canonicalURI.getScheme().toLowerCase(java.util.Locale.ROOT);
        String host = canonicalURI.getHost().toLowerCase(java.util.Locale.ROOT);

        /*
         * Drop default port: http://example.com:80 -> http://example.com and
         * https://example.com:443 -> https://example.com. A port is only dropped when it is the
         * default for the scheme in use; e.g. https://example.com:80 must keep its port, as it is
         * a different endpoint than https://example.com.
         */
        int port = canonicalURI.getPort();
        boolean defaultPort =
                ("http".equals(protocol) && port == 80)
                        || ("https".equals(protocol) && port == 443);
        if (defaultPort) {
            port = -1;
        }

        String pathAndQueryString = normalizePath(path) + queryString;

        URL result = new URL(protocol, host, port, pathAndQueryString);
        return result.toExternalForm();

    } catch (MalformedURLException ex) {
        log.warn(
                "Error while Processing URL in the spidering process (on base "
                        + baseURL
                        + "): "
                        + ex.getMessage());
        return null;
    } catch (URISyntaxException ex) {
        log.warn(
                "Error while Processing URI in the spidering process (on base "
                        + baseURL
                        + "): "
                        + ex.getMessage());
        return null;
    }
}