示例#1
0
  /**
   * Builds a {@code FetcherPolicy} with {@code 0} as its fourth constructor argument (presumably
   * the crawl delay, going by the test name - confirm against the constructor) and checks that a
   * request for 100 URLs is granted in full with a next-request time that is not in the future.
   *
   * @throws Exception if building the fetch request fails; letting it propagate makes the test
   *     runner report the original exception type and stack trace instead of only its message.
   */
  @Test
  public void testZeroCrawlDelay() throws Exception {
    FetcherPolicy policy =
        new FetcherPolicy(
            FetcherPolicy.NO_MIN_RESPONSE_RATE,
            FetcherPolicy.DEFAULT_MAX_CONTENT_SIZE,
            FetcherPolicy.NO_CRAWL_END_TIME,
            0,
            FetcherPolicy.DEFAULT_MAX_REDIRECTS);

    FetchRequest request = policy.getFetchRequest(100);

    // All 100 requested URLs should be handed back in one request...
    Assert.assertEquals(100, request.getNumUrls());
    // ...and the next request must be allowed no later than "now".
    Assert.assertTrue(request.getNextRequestTime() <= System.currentTimeMillis());
  }
示例#2
0
  /**
   * Command-line entry point: parses the arguments, prepares the working directory, configures the
   * fetcher, and then runs the web mining workflow for two consecutive loops.
   *
   * <p>Any exception raised while building or running the flows is printed to {@code System.err}
   * and the JVM exits with status {@code -1}.
   *
   * @param args command-line arguments, parsed into a {@code DemoWebMiningOptions}
   * @throws IOException declared by the signature; failures inside the crawl itself are caught
   *     and reported here rather than propagated
   */
  public static void main(String[] args) throws IOException {
    DemoWebMiningOptions cliOptions = new DemoWebMiningOptions();
    CmdLineParser cliParser = new CmdLineParser(cliOptions);

    try {
      cliParser.parseArgument(args);
    } catch (CmdLineException badArgs) {
      System.err.println(badArgs.getMessage());
      printUsageAndExit(cliParser);
    }

    try {
      // Resolve the working directory and make sure it is set up with the seed URLs file name.
      Path workDir = new Path(cliOptions.getWorkingDir());
      JobConf jobConf = new JobConf();
      FileSystem fileSystem = workDir.getFileSystem(jobConf);
      setupWorkingDir(fileSystem, workDir, CrawlConfig.SEED_URLS_FILENAME);

      // The crawl db of the most recent loop directory is the input to the first loop below.
      Path newestLoopDir = CrawlDirUtils.findLatestLoopDir(fileSystem, workDir);
      if (newestLoopDir == null) {
        error("No previous cycle output dirs exist in " + workDir, cliParser);
      }
      Path crawlDbPath = new Path(newestLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

      UserAgent agent =
          new UserAgent(
              cliOptions.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);

      FetcherPolicy policy = new FetcherPolicy();
      policy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
      policy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
      policy.setFetcherMode(FetcherMode.EFFICIENT);

      // Only accept the mime types the Tika HTML parser reports as supported, rendered as
      // "type/subtype" strings.
      Set<String> acceptedMimeTypes = new HashSet<String>();
      Set<MediaType> parserTypes = new HtmlParser().getSupportedTypes(new ParseContext());
      for (MediaType mediaType : parserTypes) {
        String mimeType = String.format("%s/%s", mediaType.getType(), mediaType.getSubtype());
        acceptedMimeTypes.add(mimeType);
      }
      policy.setValidMimeTypes(acceptedMimeTypes);

      // The crawl is limited to two loops; each loop reads the crawl db written by the previous
      // one.
      int loopNumber = 1;
      while (loopNumber <= 2) {
        Path loopDir = CrawlDirUtils.makeLoopDir(fileSystem, workDir, loopNumber);
        boolean isFirstLoop = loopNumber == 1;

        DemoWebMiningWorkflow.createWebMiningWorkflow(
                crawlDbPath, loopDir, policy, agent, cliOptions, isFirstLoop)
            .complete();

        // The next loop starts from the crawl db under the directory just produced.
        crawlDbPath = new Path(loopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        loopNumber++;
      }
    } catch (Exception failure) {
      System.err.println("Exception running job: " + failure.getMessage());
      failure.printStackTrace(System.err);
      System.exit(-1);
    }
  }
示例#3
0
  /**
   * Works through the queued fetch sets for this buffer call, submitting each ready set to the
   * fetch executor, and finally skips whatever is still queued once the loop ends.
   *
   * <p>The loop stops when the thread is interrupted, when the fetcher policy reports that
   * fetching should terminate, or when no queued values remain. URLs left over are emitted as
   * skipped with {@code SKIPPED_INTERRUPTED} if an interrupt was observed, otherwise with
   * {@code SKIPPED_TIME_LIMIT}.
   *
   * @param process the flow process, pinged via {@code keepAlive()} while nothing is ready
   * @param buffCall supplies the arguments iterator to fetch from and the output collector
   */
  @Override
  public void operate(FlowProcess process, BufferCall<NullContext> buffCall) {
    QueuedValues values = new QueuedValues(buffCall.getArgumentsIterator());

    _collector = buffCall.getOutputCollector();
    FetcherPolicy fetcherPolicy = _fetcher.getFetcherPolicy();

    // Thread.interrupted() clears the interrupt flag as a side effect of reading it, so the
    // result has to be remembered here: a second call after the loop would report "not
    // interrupted" and the leftover URLs would be mislabelled as a time-limit skip.
    boolean interrupted = Thread.interrupted();

    // Each value is a fetch set: a list of URLs sharing one grouping ref, plus the values
    // (last-list flag, fetch delay) needed to track that ref's state.
    while (!interrupted && !fetcherPolicy.isTerminateFetch() && !values.isEmpty()) {
      FetchSetDatum datum = values.nextOrNull(_fetcherMode);

      try {
        if (datum == null) {
          trace("Nothing ready to fetch, sleeping...");
          process.keepAlive();
          Thread.sleep(NOTHING_TO_FETCH_SLEEP_TIME);
        } else {
          List<ScoredUrlDatum> urls = datum.getUrls();
          String ref = datum.getGroupingRef();
          trace("Processing %d URLs for %s", urls.size(), ref);

          Runnable doFetch = new FetchTask(this, _fetcher, urls, ref);
          if (datum.isLastList()) {
            makeActive(ref, 0L);
            trace("Executing fetch of %d URLs from %s (last batch)", urls.size(), ref);
          } else {
            long nextFetchTime = System.currentTimeMillis() + datum.getFetchDelay();
            makeActive(ref, nextFetchTime);
            trace(
                "Executing fetch of %d URLs from %s (next fetch time %d)",
                urls.size(), ref, nextFetchTime);
          }

          long startTime = System.currentTimeMillis();

          try {
            _executor.execute(doFetch);
          } catch (RejectedExecutionException e) {
            // should never happen.
            LOGGER.error("Fetch pool rejected our fetch list for " + ref, e);

            finished(ref);
            skipUrls(
                urls,
                UrlStatus.SKIPPED_DEFERRED,
                String.format("Execution rejection skipped %d URLs", urls.size()));
          }

          // Adjust for how long it took to get the request queued.
          adjustActive(ref, System.currentTimeMillis() - startTime);
        }
      } catch (InterruptedException e) {
        LOGGER.warn("FetchBuffer interrupted!");
        // Restore the flag; the check just below picks it up and ends the loop.
        Thread.currentThread().interrupt();
      }

      interrupted = Thread.interrupted();
    }

    // Skip all URLs that we've got left.
    if (!values.isEmpty()) {
      trace("Found unprocessed URLs");

      // Also honour an interrupt that arrived after the last in-loop check.
      UrlStatus status =
          (interrupted || Thread.interrupted())
              ? UrlStatus.SKIPPED_INTERRUPTED
              : UrlStatus.SKIPPED_TIME_LIMIT;

      while (!values.isEmpty()) {
        FetchSetDatum datum = values.drain();
        List<ScoredUrlDatum> urls = datum.getUrls();
        trace(
            "Skipping %d urls from %s (e.g. %s) ",
            urls.size(), datum.getGroupingRef(), urls.get(0).getUrl());
        skipUrls(urls, status, null);
      }
    }
  }