/** * A method to remove any non-canonical '..' or '.' elements in the path, as well as protecting * against illegal path traversal. * * @param url the raw url * @return String the canonicalized url * @throws MalformedURLException */ public String canonicalizePath(String url) throws MalformedURLException { String canonUrl = UrlUtil.normalizeUrl(url, UrlUtil.PATH_TRAVERSAL_ACTION_THROW); // canonicalize "dir" and "dir/" // XXX if these are ever two separate nodes, this is wrong if (canonUrl.endsWith(UrlUtil.URL_PATH_SEPARATOR)) { canonUrl = canonUrl.substring(0, canonUrl.length() - 1); } return canonUrl; }
/**
 * Records a discovered link and files it as included or excluded.
 *
 * <p>The raw URL is always added to {@code m_extracted}. Its normalized form
 * is added to {@code m_incls} when the protocol is supported and the AU says
 * it should be cached; otherwise the normalized form goes to {@code m_excls}.
 * If a {@link MalformedURLException} is raised along the way, the raw URL is
 * added to {@code m_excls} instead.
 *
 * @param url the link as found, before normalization
 */
public void foundLink(String url) {
  m_extracted.add(url);
  try {
    final String normalized = UrlUtil.normalizeUrl(url);

    // Short-circuit order matches the original: the AU is consulted only
    // when the protocol is supported.
    final boolean rejected =
        !BaseCrawler.isSupportedUrlProtocol(normalized)
            || !m_au.shouldBeCached(normalized);

    if (rejected) {
      m_excls.add(normalized);
    } else {
      m_incls.add(normalized);
    }
  } catch (MalformedURLException e) {
    // No normalized form is available here, so exclude the raw URL.
    m_excls.add(url);
  }
}