@Test
public void testZeroCrawlDelay() {
    FetcherPolicy policy = new FetcherPolicy(FetcherPolicy.NO_MIN_RESPONSE_RATE,
                                             FetcherPolicy.DEFAULT_MAX_CONTENT_SIZE,
                                             FetcherPolicy.NO_CRAWL_END_TIME,
                                             0,
                                             FetcherPolicy.DEFAULT_MAX_REDIRECTS);

    try {
        // With no crawl delay, a request for 100 URLs should return all 100,
        // and the next request time should already have passed.
        FetchRequest request = policy.getFetchRequest(100);
        Assert.assertEquals(100, request.getNumUrls());
        Assert.assertTrue(request.getNextRequestTime() <= System.currentTimeMillis());
    } catch (Exception e) {
        Assert.fail("Exception: " + e.getMessage());
    }
}
public static void main(String[] args) throws IOException {
    DemoWebMiningOptions options = new DemoWebMiningOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Build and run the flow.
    try {
        Path workingDirPath = new Path(options.getWorkingDir());

        JobConf conf = new JobConf();
        FileSystem fs = workingDirPath.getFileSystem(conf);
        setupWorkingDir(fs, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        if (latestDirPath == null) {
            error("No previous cycle output dirs exist in " + workingDirPath, parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                        CrawlConfig.WEB_ADDRESS);

        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // We only care about mime types that the Tika HTML parser can handle,
        // so restrict the fetcher to those.
        Set<String> validMimeTypes = new HashSet<String>();
        Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
        for (MediaType supportedType : supportedTypes) {
            validMimeTypes.add(String.format("%s/%s", supportedType.getType(),
                            supportedType.getSubtype()));
        }
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // Let's limit our crawl to two loops.
        for (int curLoop = 1; curLoop <= 2; curLoop++) {
            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);
            Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(crawlDbPath, curLoopDirPath,
                            fetcherPolicy, userAgent, options, curLoop == 1);
            flow.complete();

            // Update crawlDbPath to point to the latest crawl db.
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (Exception e) {
        System.err.println("Exception running job: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    }
}
@Override
public void operate(FlowProcess process, BufferCall<NullContext> buffCall) {
    QueuedValues values = new QueuedValues(buffCall.getArgumentsIterator());

    _collector = buffCall.getOutputCollector();
    FetcherPolicy fetcherPolicy = _fetcher.getFetcherPolicy();

    // Each value is a PreFetchedDatum that contains a set of URLs to fetch in one request from
    // a single server, plus other values needed to set state properly.
    while (!Thread.interrupted() && !fetcherPolicy.isTerminateFetch() && !values.isEmpty()) {
        FetchSetDatum datum = values.nextOrNull(_fetcherMode);

        try {
            if (datum == null) {
                trace("Nothing ready to fetch, sleeping...");
                process.keepAlive();
                Thread.sleep(NOTHING_TO_FETCH_SLEEP_TIME);
            } else {
                List<ScoredUrlDatum> urls = datum.getUrls();
                String ref = datum.getGroupingRef();
                trace("Processing %d URLs for %s", urls.size(), ref);

                Runnable doFetch = new FetchTask(this, _fetcher, urls, ref);
                if (datum.isLastList()) {
                    makeActive(ref, 0L);
                    trace("Executing fetch of %d URLs from %s (last batch)", urls.size(), ref);
                } else {
                    long nextFetchTime = System.currentTimeMillis() + datum.getFetchDelay();
                    makeActive(ref, nextFetchTime);
                    trace("Executing fetch of %d URLs from %s (next fetch time %d)", urls.size(),
                                    ref, nextFetchTime);
                }

                long startTime = System.currentTimeMillis();

                try {
                    _executor.execute(doFetch);
                } catch (RejectedExecutionException e) {
                    // should never happen.
                    LOGGER.error("Fetch pool rejected our fetch list for " + ref);

                    finished(ref);
                    skipUrls(urls, UrlStatus.SKIPPED_DEFERRED,
                                    String.format("Execution rejection skipped %d URLs", urls.size()));
                }

                // Adjust for how long it took to get the request queued.
                adjustActive(ref, System.currentTimeMillis() - startTime);
            }
        } catch (InterruptedException e) {
            LOGGER.warn("FetchBuffer interrupted!");
            Thread.currentThread().interrupt();
        }
    }

    // Skip all URLs that we've got left.
    if (!values.isEmpty()) {
        trace("Found unprocessed URLs");

        UrlStatus status = Thread.interrupted() ? UrlStatus.SKIPPED_INTERRUPTED
                        : UrlStatus.SKIPPED_TIME_LIMIT;

        while (!values.isEmpty()) {
            FetchSetDatum datum = values.drain();
            List<ScoredUrlDatum> urls = datum.getUrls();
            trace("Skipping %d URLs from %s (e.g. %s)", urls.size(), datum.getGroupingRef(),
                            urls.get(0).getUrl());
            skipUrls(urls, status, null);
        }
    }
}