public int computeStep(int metric) { int work = 0; if (nSteps == whenToThrow) { throw new ExpectedRuntimeException("Hash step throw test"); } if (nSteps-- > 0) { if (eachStepTime > 0) { Deadline time = Deadline.in(eachStepTime); while (!time.expired()) { try { Thread.sleep(1); } catch (InterruptedException e) { throw new RuntimeException(e.toString()); } work++; } } else { work = -eachStepTime; TimeBase.step(work); try { Thread.sleep(1); } catch (InterruptedException e) { throw new RuntimeException(e.toString()); } } } return work; }
private void pauseBeforeFetch() {
  if (!fetchDeadline.expired()) {
    try {
      fetchDeadline.sleep();
    } catch (InterruptedException ie) {
      // no action
    }
  }
  fetchDeadline.expireIn(m_crawlDelay);
}
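// The wait-then-rearm idiom above (sleep until the previous deadline passes,
// then push the deadline out by the crawl delay) is the general pattern for
// pacing repeated work with Deadline.  A minimal self-contained sketch of the
// same idea; FetchPacer is a hypothetical wrapper, only the Deadline calls
// mirror the usage above:
class FetchPacer {
  private final long minDelayMs;
  private final Deadline next = Deadline.in(0); // already expired, so the first call doesn't wait

  FetchPacer(long minDelayMs) {
    this.minDelayMs = minDelayMs;
  }

  void awaitTurn() {
    if (!next.expired()) {
      try {
        next.sleep();              // block until the previous deadline passes
      } catch (InterruptedException ignore) {
        // as above, interruption just ends the wait early
      }
    }
    next.expireIn(minDelayMs);     // re-arm for the next caller
  }
}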
public void testStoreNodeState() throws Exception { TimeBase.setSimulated(100); CachedUrlSet mcus = new MockCachedUrlSet(mau, new RangeCachedUrlSetSpec("http://www.example.com")); CrawlState crawl = new CrawlState(1, 2, 123); List polls = new ArrayList(2); PollState poll1 = new PollState(1, "sdf", "jkl", 2, 123, Deadline.at(456), false); PollState poll2 = new PollState(2, "abc", "def", 3, 321, Deadline.at(654), false); polls.add(poll1); polls.add(poll2); NodeState nodeState = new NodeStateImpl(mcus, 123321, crawl, polls, repository); ((NodeStateImpl) nodeState).setState(NodeState.DAMAGE_AT_OR_BELOW); repository.storeNodeState(nodeState); String filePath = LockssRepositoryImpl.mapAuToFileLocation(tempDirPath, mau); filePath = LockssRepositoryImpl.mapUrlToFileLocation( filePath, "http://www.example.com/" + HistoryRepositoryImpl.NODE_FILE_NAME); File xmlFile = new File(filePath); assertTrue(xmlFile.exists()); nodeState = null; nodeState = repository.loadNodeState(mcus); assertSame(mcus, nodeState.getCachedUrlSet()); assertEquals(123321, nodeState.getAverageHashDuration()); assertEquals(1, nodeState.getCrawlState().getType()); assertEquals(2, nodeState.getCrawlState().getStatus()); assertEquals(123, nodeState.getCrawlState().getStartTime()); assertEquals(NodeState.DAMAGE_AT_OR_BELOW, nodeState.getState()); Iterator pollIt = nodeState.getActivePolls(); assertTrue(pollIt.hasNext()); PollState loadedPoll = (PollState) pollIt.next(); assertEquals(1, loadedPoll.getType()); assertEquals("sdf", loadedPoll.getLwrBound()); assertEquals("jkl", loadedPoll.getUprBound()); assertEquals(2, loadedPoll.getStatus()); assertEquals(123, loadedPoll.getStartTime()); assertEquals(456, loadedPoll.getDeadline().getExpirationTime()); assertTrue(pollIt.hasNext()); loadedPoll = (PollState) pollIt.next(); assertEquals(2, loadedPoll.getType()); assertEquals("abc", loadedPoll.getLwrBound()); assertEquals("def", loadedPoll.getUprBound()); assertEquals(3, loadedPoll.getStatus()); assertEquals(321, loadedPoll.getStartTime()); assertEquals(654, loadedPoll.getDeadline().getExpirationTime()); assertFalse(pollIt.hasNext()); TimeBase.setReal(); }
private void handlePause(int entriesBetweenSleep) {
  if ((entriesBetweenSleep % sleepAfter) == 0) {
    long pauseTime = CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE);
    Deadline pause = Deadline.in(pauseTime);
    logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime));
    while (!pause.expired()) {
      try {
        pause.sleep();
      } catch (InterruptedException ie) {
        // no action
      }
    }
  }
}
public boolean equals(Object obj) { if (obj instanceof BERec) { BERec o = (BERec) obj; return when.equals(o.when) && task.equals(o.task) && event == o.event; } return false; }
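// BERec overrides equals() but no matching hashCode() appears in this excerpt;
// if these records are ever stored in a hash-based collection the two must
// agree.  A sketch of a compatible hashCode(), assuming when, task, and event
// (the fields compared above) are the only significant fields:
public int hashCode() {
  int h = when.hashCode();
  h = 31 * h + task.hashCode();
  h = 31 * h + (event == null ? 0 : event.hashCode());
  return h;
}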
public void testFindOverrunTaskToRun() { assertFalse(tr.findTaskToRun()); StepTask t1 = task(100, 200, 100); Schedule s = sched(ListUtil.list(t1)); fact.setResult(s); assertTrue(tr.addToSchedule(t1)); assertFalse(tr.findTaskToRun()); assertEquals(Deadline.at(100), tr.runningDeadline); StepTask t2 = task(0, 300, 50); tr.addOverrunner(t2); assertTrue(tr.findTaskToRun()); assertEquals(t2, tr.runningTask); assertEquals(Deadline.at(100), tr.runningDeadline); assertNull(tr.runningChunk); }
HashQueue.Request req(
    Object cookie, long deadlineIn, int duration, int bytes, HashService.Callback callback) {
  MockCachedUrlSetHasher hasher = new MockCachedUrlSetHasher();
  hasher.setNumBytes(bytes);
  cus.setContentHasher(hasher);
  // cus.setHashDuration(duration, bytes);
  HashQueue.Request req =
      new HashQueue.Request(cus, Deadline.in(deadlineIn), callback, cookie, hasher, duration);
  return req;
}
public void lockssRun() {
  setPriority(PRIORITY_PARAM_SIZE_CALC, PRIORITY_DEFAULT_SIZE_CALC);
  startWDog(WDOG_PARAM_SIZE_CALC, WDOG_DEFAULT_SIZE_CALC);
  triggerWDogOnExit(true);
  nowRunning();

  while (goOn) {
    try {
      pokeWDog();
      if (sizeCalcQueue.isEmpty()) {
        Deadline timeout = Deadline.in(Constants.HOUR);
        sizeCalcSem.take(timeout);
      }
      RepositoryNode node;
      synchronized (sizeCalcQueue) {
        node = (RepositoryNode) CollectionUtil.getAnElement(sizeCalcQueue);
      }
      if (node != null) {
        long start = TimeBase.nowMs();
        log.debug2("CalcSize start: " + node);
        long dur = 0;
        try {
          doSizeCalc(node);
          dur = TimeBase.nowMs() - start;
          log.debug2("CalcSize finish (" + StringUtil.timeIntervalToString(dur) + "): " + node);
        } catch (RuntimeException e) {
          log.warning("doSizeCalc: " + node, e);
        }
        synchronized (sizeCalcQueue) {
          sizeCalcQueue.remove(node);
        }
        pokeWDog();
        long sleep = sleepTimeToAchieveLoad(dur, sizeCalcMaxLoad);
        Deadline.in(sleep).sleep();
      }
    } catch (InterruptedException e) {
      // just wakeup and check for exit
    }
  }
  if (!goOn) {
    triggerWDogOnExit(false);
  }
}
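// The loop above throttles itself: after each size calculation it sleeps for
// sleepTimeToAchieveLoad(dur, sizeCalcMaxLoad) so the thread consumes at most
// sizeCalcMaxLoad of wall-clock time.  That helper is not shown in this
// section; a minimal sketch of one common formulation (an assumption, not
// necessarily the actual LOCKSS implementation) is:
static long sleepTimeToAchieveLoad(long runDuration, float maxLoad) {
  // Sleep long enough that runDuration / (runDuration + sleep) <= maxLoad.
  // Assumes 0 < maxLoad <= 1; e.g. a 2000 ms calculation at maxLoad = 0.1
  // yields an 18000 ms sleep.
  return (long) (runDuration * ((1.0 / maxLoad) - 1.0));
}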
private void doSleep() throws IOException { String timestr = getParameter(KEY_TIME); try { long time = StringUtil.parseTimeInterval(timestr); Deadline.in(time).sleep(); statusMsg = "Slept for " + StringUtil.timeIntervalToString(time); } catch (NumberFormatException e) { errMsg = "Illegal duration: " + e; } catch (InterruptedException e) { errMsg = "Interrupted: " + e; } }
public void testGetAvailableHashTimeBefore() {
  HashQueue q = new HashQueue();
  assertEquals(500, q.getAvailableHashTimeBefore(Deadline.in(500)));
  HashQueue.Request r1, r2, r3, r4, r5, r6, r7;
  r1 = simpleReq(200, 100);
  r2 = simpleReq(2000, 1200);
  r3 = simpleReq(3000, 500);
  assertTrue(q.insert(r1));
  assertTrue(q.insert(r2));
  assertTrue(q.insert(r3));
  assertEquals(100, q.getAvailableHashTimeBefore(Deadline.in(100)));
  assertEquals(400, q.getAvailableHashTimeBefore(Deadline.in(500)));
  assertEquals(700, q.getAvailableHashTimeBefore(Deadline.in(1000)));
  assertEquals(700, q.getAvailableHashTimeBefore(Deadline.in(2000)));
  assertEquals(1200, q.getAvailableHashTimeBefore(Deadline.in(3000)));
  assertEquals(2200, q.getAvailableHashTimeBefore(Deadline.in(4000)));
  // this will fully commit first 200 ms
  r4 = simpleReq(200, 100);
  assertTrue(q.insert(r4));
  assertEquals(0, q.getAvailableHashTimeBefore(Deadline.in(100)));
  assertEquals(0, q.getAvailableHashTimeBefore(Deadline.in(0)));
}
public void testFindChunkTaskToRun() { assertFalse(tr.findTaskToRun()); StepTask t1 = task(100, 200, 100); StepTask t2 = task(100, 300, 50); Schedule s = sched(ListUtil.list(t1, t2)); fact.setResults(s, s); assertTrue(tr.addToSchedule(t1)); assertTrue(tr.addToSchedule(t2)); assertFalse(tr.findTaskToRun()); assertEquals(Deadline.at(100), tr.runningDeadline); TimeBase.setSimulated(101); assertTrue(tr.findTaskToRun()); assertEquals(t1, tr.runningTask); assertEquals(t1.getLatestFinish(), tr.runningDeadline); assertEquals(s.getEvents().get(0), tr.runningChunk); }
public void testFindRunnableChunk() { assertFalse(tr.findTaskToRun()); StepTask t1 = task(100, 200, 100); StepTask t2 = task(10, 300, 50); Schedule.Chunk c1 = new Schedule.Chunk(t1, Deadline.at(100), Deadline.at(200), 100); Schedule.Chunk c2 = new Schedule.Chunk(t2, Deadline.at(200), Deadline.at(300), 100); Schedule s = new Schedule(ListUtil.list(c1, c2)); fact.setResults(s, s); assertTrue(tr.addToSchedule(t1)); assertTrue(tr.addToSchedule(t2)); assertFalse(tr.findTaskToRun()); assertEquals(Deadline.at(100), tr.runningDeadline); TimeBase.setSimulated(11); assertTrue(tr.findTaskToRun()); assertEquals(t2, tr.runningTask); assertEquals(c2, tr.runningChunk); assertEquals(Deadline.at(100), tr.runningDeadline); assertEquals(s.getEvents().get(1), tr.runningChunk); }
public void testBackground() { final List rec = new ArrayList(); TaskCallback cb = new TaskCallback() { public void taskEvent(SchedulableTask task, Schedule.EventType event) { rec.add(new BERec(Deadline.in(0), (BackgroundTask) task, event)); } }; assertFalse(tr.findTaskToRun()); BackgroundTask t1 = btask(100, 200, .1, cb); BackgroundTask t2 = btask(100, 300, .2, cb); BackgroundTask t3 = btask(150, 200, .4, cb); Schedule s = sched( ListUtil.list( bEvent(t1, Schedule.EventType.START), bEvent(t2, Schedule.EventType.START), bEvent(t3, Schedule.EventType.START), bEvent(t1, Schedule.EventType.FINISH), bEvent(t3, Schedule.EventType.FINISH), bEvent(t2, Schedule.EventType.FINISH))); fact.setResults(ListUtil.list(s, s, s)); assertTrue(tr.addToSchedule(t1)); assertTrue(tr.addToSchedule(t2)); assertTrue(tr.addToSchedule(t3)); assertEquals(3, tr.getAcceptedTasks().size()); assertIsomorphic(ListUtil.list(t1, t2, t3), tr.getAcceptedTasks()); assertFalse(tr.findTaskToRun()); assertEquals(0, rec.size()); assertEquals(0, tr.getBackgroundLoadFactor(), .005); assertEquals(Deadline.at(100), tr.runningDeadline); TimeBase.setSimulated(101); assertFalse(tr.findTaskToRun()); assertEquals(2, rec.size()); assertEquals(.3, tr.getBackgroundLoadFactor(), .005); TimeBase.setSimulated(151); assertFalse(tr.findTaskToRun()); assertEquals(3, rec.size()); assertEquals(.7, tr.getBackgroundLoadFactor(), .005); assertEquals(3, tr.getAcceptedTasks().size()); TimeBase.setSimulated(201); assertFalse(tr.findTaskToRun()); assertEquals(5, rec.size()); assertEquals(.2, tr.getBackgroundLoadFactor(), .005); assertEquals(1, tr.getAcceptedTasks().size()); t2.taskIsFinished(); TimeBase.setSimulated(202); assertFalse(tr.findTaskToRun()); assertEquals(6, rec.size()); assertEquals(0, tr.getBackgroundLoadFactor(), .005); assertEquals(0, tr.getAcceptedTasks().size()); TimeBase.setSimulated(301); assertFalse(tr.findTaskToRun()); assertEquals(6, rec.size()); assertEquals(0, tr.getBackgroundLoadFactor(), .005); List exp = ListUtil.list( new BERec(101, t1, Schedule.EventType.START), new BERec(101, t2, Schedule.EventType.START), new BERec(151, t3, Schedule.EventType.START), new BERec(201, t1, Schedule.EventType.FINISH), new BERec(201, t3, Schedule.EventType.FINISH), new BERec(201, t2, Schedule.EventType.FINISH)); assertEquals(exp, rec); }
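// The load-factor assertions in testBackground() follow directly from the
// declared loads: at t=101 only t1 (0.1) and t2 (0.2) have started, so the
// expected background load is 0.1 + 0.2 = 0.3; at t=151 t3 adds 0.4, giving
// 0.7; at t=201 t1 and t3 have finished, leaving t2's 0.2; and after
// t2.taskIsFinished() the load drops back to 0.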
static StepperTask taskBetween(long minStart, long deadline, int duration, Stepper stepper) { return new StepperTask( Deadline.at(minStart), Deadline.at(deadline), duration, null, null, stepper); }
HashQueue.Request simpleReq(long deadlineIn, int duration) { return new HashQueue.Request( cus, Deadline.in(deadlineIn), null, null, new GenericContentHasher(cus, dig), duration); }
BackgroundTask btask(long start, long end, double loadFactor, TaskCallback cb) { return new BackgroundTask(Deadline.at(start), Deadline.at(end), loadFactor, cb); }
StepTask task(long start, long end, long duration, TaskCallback cb, Stepper stepper) { return new StepperTask(Deadline.at(start), Deadline.at(end), duration, cb, null, stepper); }
public class CrawlRuleTester extends Thread {
  protected static Logger log = Logger.getLogger(CrawlRuleTester.class);

  /** Proxy host */
  public static final String PARAM_PROXY_HOST = Configuration.PREFIX + "crawltest.proxy.host";

  /** Proxy port */
  public static final String PARAM_PROXY_PORT = Configuration.PREFIX + "crawltest.proxy.port";

  public static final int DEFAULT_PROXY_PORT = -1;

  /** User-Agent */
  public static final String PARAM_USER_AGENT = Configuration.PREFIX + "crawltest.userAgent";

  /* Message Types */
  public static final int ERROR_MESSAGE = 0;
  public static final int WARNING_MESSAGE = 1;
  public static final int PLAIN_MESSAGE = 2;
  public static final int URL_SUMMARY_MESSAGE = 3;
  public static final int TEST_SUMMARY_MESSAGE = 4;

  private String m_baseUrl;
  private int m_crawlDepth;
  private long m_crawlDelay;
  private int m_curDepth;
  private ArchivalUnit m_au;
  private String m_outputFile = null;
  private BufferedWriter m_outWriter = null;
  private Deadline fetchDeadline = Deadline.in(0);
  private boolean useLocalWriter = true;
  private MessageHandler m_msgHandler;
  private LockssUrlConnectionPool connectionPool = new LockssUrlConnectionPool();
  private String proxyHost;
  private String userAgent;
  private int proxyPort;

  // our storage for extracted urls
  private TreeSet m_extracted = new TreeSet();
  private TreeSet m_incls = new TreeSet();
  private TreeSet m_excls = new TreeSet();
  private TreeSet m_reported = new TreeSet();

  public CrawlRuleTester(int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {
    super("crawlrule tester");
    m_crawlDepth = crawlDepth;
    long minFetchDelay =
        CurrentConfig.getLongParam(
            BaseArchivalUnit.PARAM_MIN_FETCH_DELAY, BaseArchivalUnit.DEFAULT_MIN_FETCH_DELAY);
    m_crawlDelay = Math.max(crawlDelay, minFetchDelay);
    m_baseUrl = baseUrl;
    m_au = au;
  }

  /**
   * RuleTest
   *
   * @param outFile String
   * @param crawlDepth int
   * @param crawlDelay long
   * @param baseUrl String
   * @param au ArchivalUnit
   */
  public CrawlRuleTester(
      String outFile, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {
    this(crawlDepth, crawlDelay, baseUrl, au);
    m_outputFile = outFile;
  }

  /**
   * RuleTest
   *
   * @param outWriter BufferedWriter
   * @param crawlDepth int
   * @param crawlDelay long
   * @param baseUrl String
   * @param au ArchivalUnit
   */
  public CrawlRuleTester(
      BufferedWriter outWriter, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {
    this(crawlDepth, crawlDelay, baseUrl, au);
    m_outWriter = outWriter;
  }

  /**
   * RuleTest
   *
   * @param msgHandler MessageHandler to take all output
   * @param crawlDepth the crawl depth to use
   * @param crawlDelay the time to wait between fetches
   * @param baseUrl the url to start from
   * @param au the ArchivalUnit to use for url checking
   */
  public CrawlRuleTester(
      MessageHandler msgHandler, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) {
    this(crawlDepth, crawlDelay, baseUrl, au);
    m_msgHandler = msgHandler;
  }

  public void run() {
    try {
      setConfig(ConfigManager.getCurrentConfig());
      if (m_outWriter == null && m_msgHandler == null) {
        useLocalWriter = true;
      } else {
        useLocalWriter = false;
      }
      if (useLocalWriter) {
        openOutputFile();
      }
      checkRules();
      if (useLocalWriter) {
        closeOutputFile();
      }
    } finally {
      if (m_msgHandler != null) {
        m_msgHandler.close();
      }
    }
  }

  void setConfig(Configuration config) {
    log.debug("config: " + config);
    proxyHost = config.get(PARAM_PROXY_HOST);
    proxyPort = config.getInt(PARAM_PROXY_PORT, DEFAULT_PROXY_PORT);
    if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) {
      String http_proxy = System.getenv("http_proxy");
      if (!StringUtil.isNullString(http_proxy)) {
        try {
          HostPortParser hpp = new HostPortParser(http_proxy);
          proxyHost = hpp.getHost();
          proxyPort = hpp.getPort();
        } catch (HostPortParser.InvalidSpec e) {
          log.warning("Can't parse http_proxy environment var, ignoring: " + http_proxy + ": " + e);
        }
      }
    }
    if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) {
      proxyHost = null;
    } else {
      log.info("Proxying through " + proxyHost + ":" + proxyPort);
    }
    userAgent = config.get(PARAM_USER_AGENT);
    if (StringUtil.isNullString(userAgent)) {
      userAgent = null;
    } else {
      log.debug("Setting User-Agent to " + userAgent);
    }
  }

  private void openOutputFile() {
    if (m_outputFile != null) {
      try {
        m_outWriter = new BufferedWriter(new FileWriter(m_outputFile, false));
        return;
      } catch (Exception ex) {
        System.err.println("Error opening output file, writing to stdout: " + ex);
      }
    }
    m_outWriter = new BufferedWriter(new OutputStreamWriter(System.out));
  }

  private void closeOutputFile() {
    try {
      if (m_outWriter != null) {
        m_outWriter.close();
      }
    } catch (IOException ex) {
      System.err.println("Error closing output file.");
    }
  }

  int[] depth_incl;
  int[] depth_fetched;
  int[] depth_parsed;

  private void checkRules() {
    outputMessage("\nChecking " + m_baseUrl, TEST_SUMMARY_MESSAGE);
    outputMessage(
        "crawl depth: " + m_crawlDepth + " crawl delay: " + m_crawlDelay + " ms.", PLAIN_MESSAGE);
    TreeSet crawlList = new TreeSet();
    TreeSet fetched = new TreeSet();
    // initialize with the baseUrl
    crawlList.add(m_baseUrl);
    depth_incl = new int[m_crawlDepth];
    depth_fetched = new int[m_crawlDepth];
    depth_parsed = new int[m_crawlDepth];
    long start_time = TimeBase.nowMs();
    for (int depth = 1; depth <= m_crawlDepth; depth++) {
      if (isInterrupted()) {
        return;
      }
      m_curDepth = depth;
      if (crawlList.isEmpty() && depth <= m_crawlDepth) {
        outputMessage("\nNothing left to crawl, exiting after depth " + (depth - 1), PLAIN_MESSAGE);
        break;
      }
      String[] urls = (String[]) crawlList.toArray(new String[0]);
      crawlList.clear();
      outputMessage("\nDepth " + depth, PLAIN_MESSAGE);
      for (int ix = 0; ix < urls.length; ix++) {
        if (isInterrupted()) {
          return;
        }
        pauseBeforeFetch();
        String urlstr = urls[ix];
        m_incls.clear();
        m_excls.clear();
        // crawl the page
        buildUrlSets(urlstr);
        fetched.add(urlstr);
        // output incl/excl results,
        // add the new_incls to the crawlList for next crawl depth loop
        crawlList.addAll(outputUrlResults(urlstr, m_incls, m_excls));
      }
    }
    long elapsed_time = TimeBase.nowMs() - start_time;
    outputSummary(m_baseUrl, fetched, crawlList, elapsed_time);
  }

  private void buildUrlSets(String url) {
    try {
      outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE);
      URL srcUrl = new URL(url);
      // URLConnection conn = srcUrl.openConnection();
      // String type = conn.getContentType();
      // type = conn.getHeaderField("content-type");
      // InputStream istr = conn.getInputStream();
      LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool);
      if (proxyHost != null) {
        conn.setProxy(proxyHost, proxyPort);
      }
      if (userAgent != null) {
        conn.setRequestProperty("user-agent", userAgent);
      }
      try {
        conn.execute();
        int resp = conn.getResponseCode();
        if (resp != 200) {
          outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE);
          return;
        }
        depth_fetched[m_curDepth - 1]++;
        String cookies = conn.getResponseHeaderValue("Set-Cookie");
        if (cookies != null) {
          outputMessage("Cookies: " + cookies, PLAIN_MESSAGE);
        }
        String type = conn.getResponseContentType();
        if (type == null || !type.toLowerCase().startsWith("text/html")) {
          outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE);
          return;
        }
        outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE);
        InputStream istr = conn.getResponseInputStream();
        InputStreamReader reader = new InputStreamReader(istr);
        // MyMockCachedUrl mcu = new MyMockCachedUrl(srcUrl.toString(), reader);
        GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor();
        extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback());
        istr.close();
        depth_parsed[m_curDepth - 1]++;
      } finally {
        conn.release();
      }
    } catch (MalformedURLException murle) {
      murle.printStackTrace();
      outputErrResults(url, "Malformed URL:" + murle.getMessage());
    } catch (IOException ex) {
      ex.printStackTrace();
      outputErrResults(url, "IOException: " + ex.getMessage());
    }
  }

  private void pauseBeforeFetch() {
    if (!fetchDeadline.expired()) {
      try {
        fetchDeadline.sleep();
      } catch (InterruptedException ie) {
        // no action
      }
    }
    fetchDeadline.expireIn(m_crawlDelay);
  }

  private void outputMessage(String msg, int msgType) {
    if (isInterrupted()) {
      return;
    }
    if (m_msgHandler != null) {
      m_msgHandler.outputMessage(msg + "\n", msgType);
    } else {
      try {
        m_outWriter.write(msg);
        m_outWriter.newLine();
      } catch (Exception ex) {
        System.err.println(msg);
      }
    }
  }

  private void outputErrResults(String url, String errMsg) {
    outputMessage("Error: " + errMsg + " occurred while processing " + url, ERROR_MESSAGE);
  }

  private Set outputUrlResults(String url, Set m_inclset, Set m_exclset) {
    Set new_incls = new TreeSet(CollectionUtils.subtract(m_inclset, m_reported));
    Set new_excls = new TreeSet(CollectionUtils.subtract(m_exclset, m_reported));
    if (!m_inclset.isEmpty()) {
      outputMessage(
          "\nIncluded Urls: ("
              + new_incls.size()
              + " new, "
              + (m_inclset.size() - new_incls.size())
              + " old)",
          URL_SUMMARY_MESSAGE);
      depth_incl[m_curDepth - 1] += new_incls.size();
    }
    for (Iterator it = new_incls.iterator(); it.hasNext(); ) {
      outputMessage(it.next().toString(), PLAIN_MESSAGE);
    }
    if (!m_exclset.isEmpty()) {
      outputMessage(
          "\nExcluded Urls: ("
              + new_excls.size()
              + " new, "
              + (m_exclset.size() - new_excls.size())
              + " old)",
          URL_SUMMARY_MESSAGE);
    }
    for (Iterator it = new_excls.iterator(); it.hasNext(); ) {
      outputMessage(it.next().toString(), PLAIN_MESSAGE);
    }
    m_reported.addAll(new_incls);
    m_reported.addAll(new_excls);
    if (m_outWriter != null) {
      try {
        m_outWriter.flush();
      } catch (IOException ex) {
      }
    }
    return new_incls;
  }

  private void outputSummary(String baseUrl, Set fetched, Set toCrawl, long elapsedTime) {
    int fetchCount = fetched.size();
    outputMessage(
        "\n\nSummary for starting Url: " + baseUrl + " and depth: " + m_crawlDepth,
        TEST_SUMMARY_MESSAGE);
    outputMessage(
        "\nUrls fetched: " + fetchCount + " Urls extracted: " + m_extracted.size(),
        PLAIN_MESSAGE);
    outputMessage("\nDepth Fetched Parsed New URLs", PLAIN_MESSAGE);
    for (int depth = 1; depth <= m_crawlDepth; depth++) {
      PrintfFormat pf = new PrintfFormat("%5d %7d %6d %8d");
      Integer[] args =
          new Integer[] {
            new Integer(depth),
            new Integer(depth_fetched[depth - 1]),
            new Integer(depth_parsed[depth - 1]),
            new Integer(depth_incl[depth - 1]),
          };
      String s = pf.sprintf(args);
      outputMessage(s, PLAIN_MESSAGE);
    }
    outputMessage("\nRemaining unfetched: " + toCrawl.size(), PLAIN_MESSAGE);
    if (false) {
      for (Iterator iter = toCrawl.iterator(); iter.hasNext(); ) {
        String url = (String) iter.next();
        outputMessage(url, PLAIN_MESSAGE);
      }
    }
    long secs = elapsedTime / Constants.SECOND;
    long fetchRate = 0;
    if (secs > 0) {
      fetchRate = fetchCount * 60 * Constants.SECOND / elapsedTime;
    }
    outputMessage(
        "\nElapsed Time: " + secs + " secs." + " Fetch Rate: " + fetchRate + " p/m",
        PLAIN_MESSAGE);
  }

  public interface MessageHandler {
    void outputMessage(String message, int messageType);

    void close();
  }

  private class MyLinkExtractorCallback implements LinkExtractor.Callback {
    MyLinkExtractorCallback() {}

    public void foundLink(String url) {
      m_extracted.add(url);
      try {
        String normUrl = UrlUtil.normalizeUrl(url);
        if (BaseCrawler.isSupportedUrlProtocol(normUrl) && m_au.shouldBeCached(normUrl)) {
          m_incls.add(normUrl);
        } else {
          m_excls.add(normUrl);
        }
      } catch (MalformedURLException e) {
        m_excls.add(url);
      }
    }
  }

  class MyMockCachedUrl implements CachedUrl {
    private String url;
    private boolean doesExist = false;
    private Reader reader = null;

    public MyMockCachedUrl(String url, Reader reader) {
      this.url = url;
      this.reader = reader;
    }

    public ArchivalUnit getArchivalUnit() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public String getUrl() {
      return url;
    }

    public CachedUrl getCuVersion(int version) {
      throw new UnsupportedOperationException("Not implemented");
    }

    public CachedUrl[] getCuVersions() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public CachedUrl[] getCuVersions(int maxVersions) {
      throw new UnsupportedOperationException("Not implemented");
    }

    public int getVersion() {
      return 1;
    }

    public Reader openForReading() {
      return reader;
    }

    public LinkRewriterFactory getLinkRewriterFactory() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public String getEncoding() {
      return Constants.DEFAULT_ENCODING;
    }

    /**
     * getUnfilteredInputStream
     *
     * @return InputStream
     */
    public InputStream getUnfilteredInputStream() {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * getUnfilteredInputStream
     *
     * @return InputStream
     */
    public InputStream getUnfilteredInputStream(HashedInputStream.Hasher hasher) {
      throw new UnsupportedOperationException("Not implemented");
    }

    public InputStream getUncompressedInputStream() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public InputStream getUncompressedInputStream(HashedInputStream.Hasher hasher) {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * openForHashing
     *
     * @return InputStream
     */
    public InputStream openForHashing() {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * openForHashing
     *
     * @param hasher HashedInputStream.Hasher for unfiltered content
     * @return InputStream
     */
    public InputStream openForHashing(HashedInputStream.Hasher hasher) {
      throw new UnsupportedOperationException("Not implemented");
    }

    /**
     * getContentSize
     *
     * @return long
     */
    public long getContentSize() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public String getContentType() {
      throw new UnsupportedOperationException("Not implemented");
    }

    public void setOption(String option, String val) {}

    public boolean hasContent() {
      return doesExist;
    }

    public boolean isLeaf() {
      return true;
    }

    public int getType() {
      return CachedUrlSetNode.TYPE_CACHED_URL;
    }

    public CIProperties getProperties() {
      return null;
    }

    public void addProperty(String key, String value) {}

    public void release() {}

    public String toString() {
      StringBuffer sb = new StringBuffer(url.length() + 17);
      sb.append("[MyMockCachedUrl: ");
      sb.append(url);
      sb.append("]");
      return sb.toString();
    }

    @Override
    public FileMetadataExtractor getFileMetadataExtractor(MetadataTarget target) {
      return null;
    }

    public CachedUrl getArchiveMemberCu(ArchiveMemberSpec ams) {
      throw new UnsupportedOperationException("Not implemented");
    }

    @Override
    public boolean isArchiveMember() {
      return false;
    }
  }
}
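// Illustrative usage of CrawlRuleTester (a sketch, not taken from the source):
// the class extends Thread, so a caller supplies a MessageHandler, starts the
// thread, and waits for it to finish.  The depth/delay values and the handler
// below are hypothetical.
void runCrawlRuleTest(ArchivalUnit au) throws InterruptedException {
  CrawlRuleTester.MessageHandler handler =
      new CrawlRuleTester.MessageHandler() {
        public void outputMessage(String message, int messageType) {
          System.out.print(message); // route tester output to stdout
        }

        public void close() {}
      };
  CrawlRuleTester tester =
      new CrawlRuleTester(handler, 2 /* depth */, 6 * Constants.SECOND /* delay */,
          "http://www.example.com/", au);
  tester.start();
  tester.join(); // wait for checkRules() to complete
}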
BERec(long when, BackgroundTask task, Schedule.EventType event) { this.when = Deadline.at(when); this.task = task; this.event = event; }
StepTask task(long start, long end, long duration, TaskCallback cb) { return new StepperTask( Deadline.at(start), Deadline.at(end), duration, cb, null, new MyMockStepper()); }
/** Explode the archive into its constituent elements */
public void explode() throws CacheException {
  CachedUrl cachedUrl = null;
  int goodEntries = 0;
  int badEntries = 0;
  int ignoredEntries = 0;
  int entriesBetweenSleep = 0;
  ArchiveReader arcReader = null;
  logger.info(
      (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode");
  try {
    if (storeArchive) {
      UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl));
      BitSet bs = new BitSet();
      bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG);
      uc.setFetchFlags(bs);
      uc.storeContent();
      archiveData.resetInputStream();
      arcStream = archiveData.input;
    }
    // Wrap it in an ArchiveReader
    logger.debug3("About to wrap stream");
    arcReader = wrapStream(fetchUrl, arcStream);
    logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
    // Explode it
    if (arcReader == null) {
      throw new CacheException.ExploderException("no WarcReader for " + origUrl);
    }
    ArchivalUnit au = crawlFacade.getAu();
    Set stemSet = new HashSet();
    logger.debug("Exploding " + fetchUrl);
    // Iterate through the elements in the WARC file, except the first
    Iterator i = arcReader.iterator();
    // Skip first record
    for (i.next(); i.hasNext(); ) {
      // XXX probably not necessary
      helper.pokeWDog();
      if ((++entriesBetweenSleep % sleepAfter) == 0) {
        long pauseTime =
            CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE);
        Deadline pause = Deadline.in(pauseTime);
        logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime));
        while (!pause.expired()) {
          try {
            pause.sleep();
          } catch (InterruptedException ie) {
            // no action
          }
        }
      }
      ArchiveRecord element = (ArchiveRecord) i.next();
      // Each element is a URL to be cached in a suitable AU
      ArchiveRecordHeader elementHeader = element.getHeader();
      String elementUrl = elementHeader.getUrl();
      String elementMimeType = elementHeader.getMimetype();
      long elementLength = elementHeader.getLength();
      logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
      if (elementUrl.startsWith("http:")) {
        ArchiveEntry ae =
            new ArchiveEntry(
                elementUrl,
                elementLength,
                0, // XXX need to convert getDate string to long
                element, // ArchiveRecord extends InputStream
                this,
                fetchUrl);
        ae.setHeaderFields(makeCIProperties(elementHeader));
        long bytesStored = elementLength;
        logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
        try {
          helper.process(ae);
        } catch (PluginException ex) {
          throw new CacheException.ExploderException("helper.process() threw", ex);
        }
        if (ae.getBaseUrl() != null) {
          if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
            storeEntry(ae);
            handleAddText(ae);
            goodEntries++;
            crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
          } else {
            ignoredEntries++;
          }
        } else {
          badEntries++;
          logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
        }
      }
    }
  } catch (IOException ex) {
    throw new CacheException.ExploderException(ex);
  } finally {
    if (arcReader != null)
      try {
        arcReader.close();
        arcReader = null;
      } catch (IOException ex) {
        throw new CacheException.ExploderException(ex);
      }
    if (cachedUrl != null) {
      cachedUrl.release();
    }
    IOUtil.safeClose(arcStream);
  }
  if (badEntries == 0 && goodEntries > 0) {
    // Make it look like a new crawl finished on each AU to which
    // URLs were added.
    for (Iterator it = touchedAus.iterator(); it.hasNext(); ) {
      ArchivalUnit au = (ArchivalUnit) it.next();
      logger.debug3(archiveUrl + " touching " + au.toString());
      AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished();
    }
  } else {
    ArchivalUnit au = crawlFacade.getAu();
    String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
    throw new CacheException.UnretryableException(msg);
  }
}
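// Note: the inline throttling block inside explode() above (the sleepAfter
// modulus check, Deadline.in(pauseTime), and the expired()/sleep() loop) is the
// same logic as the handlePause(int) helper shown earlier in this section.  A
// sketch of the call site after such a refactoring, assuming handlePause is
// visible to explode():
//
//   for (i.next(); i.hasNext(); ) {
//     helper.pokeWDog();
//     handlePause(++entriesBetweenSleep);
//     ArchiveRecord element = (ArchiveRecord) i.next();
//     ...
//   }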