public class CrawlRuleTester extends Thread { protected static Logger log = Logger.getLogger(CrawlRuleTester.class); /** Proxy host */ public static final String PARAM_PROXY_HOST = Configuration.PREFIX + "crawltest.proxy.host"; /** Proxy port */ public static final String PARAM_PROXY_PORT = Configuration.PREFIX + "crawltest.proxy.port"; public static final int DEFAULT_PROXY_PORT = -1; /** User-Agent */ public static final String PARAM_USER_AGENT = Configuration.PREFIX + "crawltest.userAgent"; /* Message Types */ public static final int ERROR_MESSAGE = 0; public static final int WARNING_MESSAGE = 1; public static final int PLAIN_MESSAGE = 2; public static final int URL_SUMMARY_MESSAGE = 3; public static final int TEST_SUMMARY_MESSAGE = 4; private String m_baseUrl; private int m_crawlDepth; private long m_crawlDelay; private int m_curDepth; private ArchivalUnit m_au; private String m_outputFile = null; private BufferedWriter m_outWriter = null; private Deadline fetchDeadline = Deadline.in(0); private boolean useLocalWriter = true; private MessageHandler m_msgHandler; private LockssUrlConnectionPool connectionPool = new LockssUrlConnectionPool(); private String proxyHost; private String userAgent; private int proxyPort; // our storage for extracted urls private TreeSet m_extracted = new TreeSet(); private TreeSet m_incls = new TreeSet(); private TreeSet m_excls = new TreeSet(); private TreeSet m_reported = new TreeSet(); public CrawlRuleTester(int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { super("crawlrule tester"); m_crawlDepth = crawlDepth; long minFetchDelay = CurrentConfig.getLongParam( BaseArchivalUnit.PARAM_MIN_FETCH_DELAY, BaseArchivalUnit.DEFAULT_MIN_FETCH_DELAY); m_crawlDelay = Math.max(crawlDelay, minFetchDelay); m_baseUrl = baseUrl; m_au = au; } /** * RuleTest * * @param outFile String * @param crawlDepth int * @param crawlDelay long * @param baseUrl String * @param crawlSpec CrawlSpec */ public CrawlRuleTester( String outFile, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { this(crawlDepth, crawlDelay, baseUrl, au); m_outputFile = outFile; } /** * RuleTest * * @param outWriter BufferedWriter * @param crawlDepth int * @param crawlDelay long * @param baseUrl String * @param crawlSpec CrawlSpec */ public CrawlRuleTester( BufferedWriter outWriter, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { this(crawlDepth, crawlDelay, baseUrl, au); m_outWriter = outWriter; } /** * RuleTest * * @param msgHandler MessageHandler to take all output * @param crawlDepth the crawl depth to use * @param crawlDelay the type to wait between fetches * @param baseUrl the url to start from * @param crawlSpec a CrawlSpec to use for url checking. */ public CrawlRuleTester( MessageHandler msgHandler, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { this(crawlDepth, crawlDelay, baseUrl, au); m_msgHandler = msgHandler; } public void run() { try { setConfig(ConfigManager.getCurrentConfig()); if (m_outWriter == null && m_msgHandler == null) { useLocalWriter = true; } else { useLocalWriter = false; } if (useLocalWriter) { openOutputFile(); } checkRules(); if (useLocalWriter) { closeOutputFile(); } } finally { if (m_msgHandler != null) { m_msgHandler.close(); } } } void setConfig(Configuration config) { log.debug("config: " + config); proxyHost = config.get(PARAM_PROXY_HOST); proxyPort = config.getInt(PARAM_PROXY_PORT, DEFAULT_PROXY_PORT); if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) { String http_proxy = System.getenv("http_proxy"); if (!StringUtil.isNullString(http_proxy)) { try { HostPortParser hpp = new HostPortParser(http_proxy); proxyHost = hpp.getHost(); proxyPort = hpp.getPort(); } catch (HostPortParser.InvalidSpec e) { log.warning("Can't parse http_proxy environment var, ignoring: " + http_proxy + ": " + e); } } } if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) { proxyHost = null; } else { log.info("Proxying through " + proxyHost + ":" + proxyPort); } userAgent = config.get(PARAM_USER_AGENT); if (StringUtil.isNullString(userAgent)) { userAgent = null; } else { log.debug("Setting User-Agent to " + userAgent); } } private void openOutputFile() { if (m_outputFile != null) { try { m_outWriter = new BufferedWriter(new FileWriter(m_outputFile, false)); return; } catch (Exception ex) { System.err.println("Error opening output file, writing to stdout: " + ex); } } m_outWriter = new BufferedWriter(new OutputStreamWriter(System.out)); } private void closeOutputFile() { try { if (m_outWriter != null) { m_outWriter.close(); } } catch (IOException ex) { System.err.println("Error closing output file."); } } int[] depth_incl; int[] depth_fetched; int[] depth_parsed; private void checkRules() { outputMessage("\nChecking " + m_baseUrl, TEST_SUMMARY_MESSAGE); outputMessage( "crawl depth: " + m_crawlDepth + " crawl delay: " + m_crawlDelay + " ms.", PLAIN_MESSAGE); TreeSet crawlList = new TreeSet(); TreeSet fetched = new TreeSet(); // inialize with the baseUrl crawlList.add(m_baseUrl); depth_incl = new int[m_crawlDepth]; depth_fetched = new int[m_crawlDepth]; depth_parsed = new int[m_crawlDepth]; long start_time = TimeBase.nowMs(); for (int depth = 1; depth <= m_crawlDepth; depth++) { if (isInterrupted()) { return; } m_curDepth = depth; if (crawlList.isEmpty() && depth <= m_crawlDepth) { outputMessage("\nNothing left to crawl, exiting after depth " + (depth - 1), PLAIN_MESSAGE); break; } String[] urls = (String[]) crawlList.toArray(new String[0]); crawlList.clear(); outputMessage("\nDepth " + depth, PLAIN_MESSAGE); for (int ix = 0; ix < urls.length; ix++) { if (isInterrupted()) { return; } pauseBeforeFetch(); String urlstr = urls[ix]; m_incls.clear(); m_excls.clear(); // crawl the page buildUrlSets(urlstr); fetched.add(urlstr); // output incl/excl results, // add the new_incls to the crawlList for next crawl depth loop crawlList.addAll(outputUrlResults(urlstr, m_incls, m_excls)); } } long elapsed_time = TimeBase.nowMs() - start_time; outputSummary(m_baseUrl, fetched, crawlList, elapsed_time); } private void buildUrlSets(String url) { try { outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE); URL srcUrl = new URL(url); // URLConnection conn = srcUrl.openConnection(); // String type = conn.getContentType(); // type = conn.getHeaderField("content-type"); // InputStream istr = conn.getInputStream(); LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool); if (proxyHost != null) { conn.setProxy(proxyHost, proxyPort); } if (userAgent != null) { conn.setRequestProperty("user-agent", userAgent); } try { conn.execute(); int resp = conn.getResponseCode(); if (resp != 200) { outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE); return; } depth_fetched[m_curDepth - 1]++; String cookies = conn.getResponseHeaderValue("Set-Cookie"); if (cookies != null) { outputMessage("Cookies: " + cookies, PLAIN_MESSAGE); } String type = conn.getResponseContentType(); if (type == null || !type.toLowerCase().startsWith("text/html")) { outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE); return; } outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE); InputStream istr = conn.getResponseInputStream(); InputStreamReader reader = new InputStreamReader(istr); // MyMockCachedUrl mcu = new MyMockCachedUrl(srcUrl.toString(), reader); GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor(); extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback()); istr.close(); depth_parsed[m_curDepth - 1]++; } finally { conn.release(); } } catch (MalformedURLException murle) { murle.printStackTrace(); outputErrResults(url, "Malformed URL:" + murle.getMessage()); } catch (IOException ex) { ex.printStackTrace(); outputErrResults(url, "IOException: " + ex.getMessage()); } } private void pauseBeforeFetch() { if (!fetchDeadline.expired()) { try { fetchDeadline.sleep(); } catch (InterruptedException ie) { // no action } } fetchDeadline.expireIn(m_crawlDelay); } private void outputMessage(String msg, int msgType) { if (isInterrupted()) { return; } if (m_msgHandler != null) { m_msgHandler.outputMessage(msg + "\n", msgType); } else { try { m_outWriter.write(msg); m_outWriter.newLine(); } catch (Exception ex) { System.err.println(msg); } } } private void outputErrResults(String url, String errMsg) { outputMessage("Error: " + errMsg + " occured while processing " + url, ERROR_MESSAGE); } private Set outputUrlResults(String url, Set m_inclset, Set m_exclset) { Set new_incls = new TreeSet(CollectionUtils.subtract(m_inclset, m_reported)); Set new_excls = new TreeSet(CollectionUtils.subtract(m_exclset, m_reported)); if (!m_inclset.isEmpty()) { outputMessage( "\nIncluded Urls: (" + new_incls.size() + " new, " + (m_inclset.size() - new_incls.size()) + " old)", URL_SUMMARY_MESSAGE); depth_incl[m_curDepth - 1] += new_incls.size(); } for (Iterator it = new_incls.iterator(); it.hasNext(); ) { outputMessage(it.next().toString(), PLAIN_MESSAGE); } if (!m_exclset.isEmpty()) { outputMessage( "\nExcluded Urls: (" + new_excls.size() + " new, " + (m_exclset.size() - new_excls.size()) + " old)", URL_SUMMARY_MESSAGE); } for (Iterator it = new_excls.iterator(); it.hasNext(); ) { outputMessage(it.next().toString(), PLAIN_MESSAGE); } m_reported.addAll(new_incls); m_reported.addAll(new_excls); if (m_outWriter != null) { try { m_outWriter.flush(); } catch (IOException ex) { } } return new_incls; } private void outputSummary(String baseUrl, Set fetched, Set toCrawl, long elapsedTime) { int fetchCount = fetched.size(); outputMessage( "\n\nSummary for starting Url: " + baseUrl + " and depth: " + m_crawlDepth, TEST_SUMMARY_MESSAGE); outputMessage( "\nUrls fetched: " + fetchCount + " Urls extracted: " + m_extracted.size(), PLAIN_MESSAGE); outputMessage("\nDepth Fetched Parsed New URLs", PLAIN_MESSAGE); for (int depth = 1; depth <= m_crawlDepth; depth++) { PrintfFormat pf = new PrintfFormat("%5d %7d %6d %8d"); Integer[] args = new Integer[] { new Integer(depth), new Integer(depth_fetched[depth - 1]), new Integer(depth_parsed[depth - 1]), new Integer(depth_incl[depth - 1]), }; String s = pf.sprintf(args); outputMessage(s, PLAIN_MESSAGE); } outputMessage("\nRemaining unfetched: " + toCrawl.size(), PLAIN_MESSAGE); if (false) { for (Iterator iter = toCrawl.iterator(); iter.hasNext(); ) { String url = (String) iter.next(); outputMessage(url, PLAIN_MESSAGE); } } long secs = elapsedTime / Constants.SECOND; long fetchRate = 0; if (secs > 0) { fetchRate = fetchCount * 60 * Constants.SECOND / elapsedTime; } outputMessage( "\nElapsed Time: " + secs + " secs." + " Fetch Rate: " + fetchRate + " p/m", PLAIN_MESSAGE); } public interface MessageHandler { void outputMessage(String message, int messageType); void close(); } private class MyLinkExtractorCallback implements LinkExtractor.Callback { MyLinkExtractorCallback() {} public void foundLink(String url) { m_extracted.add(url); try { String normUrl = UrlUtil.normalizeUrl(url); if (BaseCrawler.isSupportedUrlProtocol(normUrl) && m_au.shouldBeCached(normUrl)) { m_incls.add(normUrl); } else { m_excls.add(normUrl); } } catch (MalformedURLException e) { m_excls.add(url); } } } class MyMockCachedUrl implements CachedUrl { private String url; private boolean doesExist = false; private Reader reader = null; public MyMockCachedUrl(String url, Reader reader) { this.url = url; this.reader = reader; } public ArchivalUnit getArchivalUnit() { throw new UnsupportedOperationException("Not implemented"); } public String getUrl() { return url; } public CachedUrl getCuVersion(int version) { throw new UnsupportedOperationException("Not implemented"); } public CachedUrl[] getCuVersions() { throw new UnsupportedOperationException("Not implemented"); } public CachedUrl[] getCuVersions(int maxVersions) { throw new UnsupportedOperationException("Not implemented"); } public int getVersion() { return 1; } public Reader openForReading() { return reader; } public LinkRewriterFactory getLinkRewriterFactory() { throw new UnsupportedOperationException("Not implemented"); } public String getEncoding() { return Constants.DEFAULT_ENCODING; } /** * getUnfilteredInputStream * * @return InputStream */ public InputStream getUnfilteredInputStream() { throw new UnsupportedOperationException("Not implemented"); } /** * getUnfilteredInputStream * * @return InputStream */ public InputStream getUnfilteredInputStream(HashedInputStream.Hasher hasher) { throw new UnsupportedOperationException("Not implemented"); } public InputStream getUncompressedInputStream() { throw new UnsupportedOperationException("Not implemented"); } public InputStream getUncompressedInputStream(HashedInputStream.Hasher hasher) { throw new UnsupportedOperationException("Not implemented"); } /** * openForHashing * * @return InputStream */ public InputStream openForHashing() { throw new UnsupportedOperationException("Not implemented"); } /** * openForHashing * * @param hasher HashedInputStream.Hasher for unfiltered content * @return InputStream */ public InputStream openForHashing(HashedInputStream.Hasher hasher) { throw new UnsupportedOperationException("Not implemented"); } /** * getContentSize * * @return long */ public long getContentSize() { throw new UnsupportedOperationException("Not implemented"); } public String getContentType() { throw new UnsupportedOperationException("Not implemented"); } public void setOption(String option, String val) {} public boolean hasContent() { return doesExist; } public boolean isLeaf() { return true; } public int getType() { return CachedUrlSetNode.TYPE_CACHED_URL; } public CIProperties getProperties() { return null; } public void addProperty(String key, String value) {} public void release() {} public String toString() { StringBuffer sb = new StringBuffer(url.length() + 17); sb.append("[MyMockCachedUrl: "); sb.append(url); sb.append("]"); return sb.toString(); } @Override public FileMetadataExtractor getFileMetadataExtractor(MetadataTarget target) { return null; } public CachedUrl getArchiveMemberCu(ArchiveMemberSpec ams) { throw new UnsupportedOperationException("Not implemented"); } @Override public boolean isArchiveMember() { return false; } } }
/** Explode the archive into its constituent elements */ public void explode() throws CacheException { CachedUrl cachedUrl = null; int goodEntries = 0; int badEntries = 0; int ignoredEntries = 0; int entriesBetweenSleep = 0; ArchiveReader arcReader = null; logger.info( (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode"); try { if (storeArchive) { UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl)); BitSet bs = new BitSet(); bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG); uc.setFetchFlags(bs); uc.storeContent(); archiveData.resetInputStream(); arcStream = archiveData.input; } // Wrap it in an ArchiveReader logger.debug3("About to wrap stream"); arcReader = wrapStream(fetchUrl, arcStream); logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null")); // Explode it if (arcReader == null) { throw new CacheException.ExploderException("no WarcReader for " + origUrl); } ArchivalUnit au = crawlFacade.getAu(); Set stemSet = new HashSet(); logger.debug("Exploding " + fetchUrl); // Iterate through the elements in the WARC file, except the first Iterator i = arcReader.iterator(); // Skip first record for (i.next(); i.hasNext(); ) { // XXX probably not necessary helper.pokeWDog(); if ((++entriesBetweenSleep % sleepAfter) == 0) { long pauseTime = CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE); Deadline pause = Deadline.in(pauseTime); logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime)); while (!pause.expired()) { try { pause.sleep(); } catch (InterruptedException ie) { // no action } } } ArchiveRecord element = (ArchiveRecord) i.next(); // Each element is a URL to be cached in a suitable AU ArchiveRecordHeader elementHeader = element.getHeader(); String elementUrl = elementHeader.getUrl(); String elementMimeType = elementHeader.getMimetype(); long elementLength = elementHeader.getLength(); logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType); if (elementUrl.startsWith("http:")) { ArchiveEntry ae = new ArchiveEntry( elementUrl, elementLength, 0, // XXX need to convert getDate string to long element, // ArchiveRecord extends InputStream this, fetchUrl); ae.setHeaderFields(makeCIProperties(elementHeader)); long bytesStored = elementLength; logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored); try { helper.process(ae); } catch (PluginException ex) { throw new CacheException.ExploderException("helper.process() threw", ex); } if (ae.getBaseUrl() != null) { if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) { storeEntry(ae); handleAddText(ae); goodEntries++; crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored); } else { ignoredEntries++; } } else { badEntries++; logger.debug2("Can't map " + elementUrl + " from " + archiveUrl); } } } } catch (IOException ex) { throw new CacheException.ExploderException(ex); } finally { if (arcReader != null) try { arcReader.close(); arcReader = null; } catch (IOException ex) { throw new CacheException.ExploderException(ex); } if (cachedUrl != null) { cachedUrl.release(); } IOUtil.safeClose(arcStream); } if (badEntries == 0 && goodEntries > 0) { // Make it look like a new crawl finished on each AU to which // URLs were added. for (Iterator it = touchedAus.iterator(); it.hasNext(); ) { ArchivalUnit au = (ArchivalUnit) it.next(); logger.debug3(archiveUrl + " touching " + au.toString()); AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished(); } } else { ArchivalUnit au = crawlFacade.getAu(); String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries"; throw new CacheException.UnretryableException(msg); } }