public String urlStringToKey(final String urlString) throws URIException { if (urlString.startsWith("dns:")) { return urlString; } String searchUrl = canonicalize(urlString); String scheme = UrlOperations.urlToScheme(searchUrl); if (scheme != null) { searchUrl = searchUrl.substring(scheme.length()); } else { scheme = UrlOperations.HTTP_SCHEME; } if (-1 == searchUrl.indexOf("/")) { searchUrl = scheme + searchUrl + "/"; } else { searchUrl = scheme + searchUrl; } // Custom rules for (CanonicalizationRule rule : getProcessingRules()) { searchUrl = rule.processIfMatches(new CanonicalizationInput(searchUrl)); } // Core rules // TODO: These next few lines look crazy -- need to be reworked.. This // was the only easy way I could find to get the correct unescaping // out of UsableURIs, possible a bug. Definitely needs some TLC in any case, // as building UsableURIs is *not* a cheap operation. // unescape anything that can be: UsableURI tmpURI = null; try { tmpURI = UsableURIFactory.getInstance(searchUrl); } catch (StringIndexOutOfBoundsException e) { LOGGER.warning(e.getMessage() + ": " + searchUrl); return searchUrl; // } catch(URIException e) { // LOGGER.warning(e.getMessage() + ": " + searchUrl); // return searchUrl; } tmpURI.setPath(tmpURI.getPath()); // convert to UsableURI to perform required URI fixup: UsableURI searchURI = UsableURIFactory.getInstance(tmpURI.getURI()); // replace ' ' with '+' (this is only to match Alexa's canonicalization) String newPath = searchURI.getEscapedPath().replace("%20", "+"); // replace multiple consecutive '/'s in the path. while (newPath.contains("//")) { newPath = newPath.replace("//", "/"); } // this would remove trailing a '/' character, unless the path is empty // but we're not going to do this just yet.. // if((newPath.length() > 1) && newPath.endsWith("/")) { // newPath = newPath.substring(0,newPath.length()-1); // } StringBuilder sb = new StringBuilder(searchUrl.length()); sb.append(searchURI.getHostBasename()); // omit port if scheme default: int defaultSchemePort = UrlOperations.schemeToDefaultPort(scheme); if (searchURI.getPort() != defaultSchemePort && searchURI.getPort() != -1) { sb.append(":").append(searchURI.getPort()); } sb.append(newPath); if (searchURI.getEscapedQuery() != null) { sb.append("?").append(searchURI.getEscapedQuery()); } return sb.toString(); }