private void buildUrlSets(String url) { try { outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE); URL srcUrl = new URL(url); // URLConnection conn = srcUrl.openConnection(); // String type = conn.getContentType(); // type = conn.getHeaderField("content-type"); // InputStream istr = conn.getInputStream(); LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool); if (proxyHost != null) { conn.setProxy(proxyHost, proxyPort); } if (userAgent != null) { conn.setRequestProperty("user-agent", userAgent); } try { conn.execute(); int resp = conn.getResponseCode(); if (resp != 200) { outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE); return; } depth_fetched[m_curDepth - 1]++; String cookies = conn.getResponseHeaderValue("Set-Cookie"); if (cookies != null) { outputMessage("Cookies: " + cookies, PLAIN_MESSAGE); } String type = conn.getResponseContentType(); if (type == null || !type.toLowerCase().startsWith("text/html")) { outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE); return; } outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE); InputStream istr = conn.getResponseInputStream(); InputStreamReader reader = new InputStreamReader(istr); // MyMockCachedUrl mcu = new MyMockCachedUrl(srcUrl.toString(), reader); GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor(); extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback()); istr.close(); depth_parsed[m_curDepth - 1]++; } finally { conn.release(); } } catch (MalformedURLException murle) { murle.printStackTrace(); outputErrResults(url, "Malformed URL:" + murle.getMessage()); } catch (IOException ex) { ex.printStackTrace(); outputErrResults(url, "IOException: " + ex.getMessage()); } }
public void foundLink(String url) { m_extracted.add(url); try { String normUrl = UrlUtil.normalizeUrl(url); if (BaseCrawler.isSupportedUrlProtocol(normUrl) && m_au.shouldBeCached(normUrl)) { m_incls.add(normUrl); } else { m_excls.add(normUrl); } } catch (MalformedURLException e) { m_excls.add(url); } }