Example #1
  @Test
  public void testZeroCrawlDelay() {
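    // A zero crawl delay should yield a full batch of URLs whose next request
    // time is not in the future.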
    FetcherPolicy policy =
        new FetcherPolicy(
            FetcherPolicy.NO_MIN_RESPONSE_RATE,
            FetcherPolicy.DEFAULT_MAX_CONTENT_SIZE,
            FetcherPolicy.NO_CRAWL_END_TIME,
            0,
            FetcherPolicy.DEFAULT_MAX_REDIRECTS);

    try {
      FetchRequest request = policy.getFetchRequest(100);
      Assert.assertEquals(100, request.getNumUrls());
      Assert.assertTrue(request.getNextRequestTime() <= System.currentTimeMillis());
    } catch (Exception e) {
      Assert.fail("Exception: " + e.getMessage());
    }
  }
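Example #2 below builds a FetcherPolicy with the no-argument constructor and setters rather than the five-argument constructor used in this test. A minimal sketch of the same zero-delay setup in that style, assuming setCrawlDelay(0) is equivalent to passing 0 as the constructor's crawl-delay argument:

  // Sketch only: assumes setCrawlDelay(0) mirrors the constructor's crawl-delay argument.
  FetcherPolicy policy = new FetcherPolicy();
  policy.setCrawlDelay(0);

  FetchRequest request = policy.getFetchRequest(100);
  // With no delay between requests, the full batch should be requestable right away.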
Example #2
  public static void main(String[] args) throws IOException {

    DemoWebMiningOptions options = new DemoWebMiningOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
      parser.parseArgument(args);
    } catch (CmdLineException e) {
      System.err.println(e.getMessage());
      printUsageAndExit(parser);
    }

    // Build and run the flow.

    try {

      Path workingDirPath = new Path(options.getWorkingDir());

      JobConf conf = new JobConf();
      FileSystem fs = workingDirPath.getFileSystem(conf);
      setupWorkingDir(fs, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);

      Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
      if (latestDirPath == null) {
        error("No previous cycle output dirs exist in " + workingDirPath, parser);
      }

      Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

      UserAgent userAgent =
          new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);

      FetcherPolicy fetcherPolicy = new FetcherPolicy();
      fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
      fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
      fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

      // We only care about mime types that the Tika HTML parser can handle,
      // so restrict it to the same.
      Set<String> validMimeTypes = new HashSet<String>();
      Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
      for (MediaType supportedType : supportedTypes) {
        validMimeTypes.add(
            String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
      }
      fetcherPolicy.setValidMimeTypes(validMimeTypes);

      // Let's limit our crawl to two loops
      for (int curLoop = 1; curLoop <= 2; curLoop++) {
        Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);
        Flow flow =
            DemoWebMiningWorkflow.createWebMiningWorkflow(
                crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options, curLoop == 1);
        flow.complete();

        // Update crawlDbPath to point to the latest crawl db
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
      }

    } catch (Exception e) {
      System.err.println("Exception running job: " + e.getMessage());
      e.printStackTrace(System.err);
      System.exit(-1);
    }
  }
Example #3
  @Override
  public void operate(FlowProcess process, BufferCall<NullContext> buffCall) {
    QueuedValues values = new QueuedValues(buffCall.getArgumentsIterator());

    _collector = buffCall.getOutputCollector();
    FetcherPolicy fetcherPolicy = _fetcher.getFetcherPolicy();

    // Each value is a PreFetchedDatum that contains a set of URLs to fetch in one request from
    // a single server, plus other values needed to set state properly.
    while (!Thread.interrupted() && !fetcherPolicy.isTerminateFetch() && !values.isEmpty()) {
      FetchSetDatum datum = values.nextOrNull(_fetcherMode);

      try {
        if (datum == null) {
          trace("Nothing ready to fetch, sleeping...");
          process.keepAlive();
          Thread.sleep(NOTHING_TO_FETCH_SLEEP_TIME);
        } else {
          List<ScoredUrlDatum> urls = datum.getUrls();
          String ref = datum.getGroupingRef();
          trace("Processing %d URLs for %s", urls.size(), ref);

          Runnable doFetch = new FetchTask(this, _fetcher, urls, ref);
          if (datum.isLastList()) {
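            // Last batch for this server, so there is no next fetch time to track (hence 0L).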
            makeActive(ref, 0L);
            trace("Executing fetch of %d URLs from %s (last batch)", urls.size(), ref);
          } else {
            Long nextFetchTime = System.currentTimeMillis() + datum.getFetchDelay();
            makeActive(ref, nextFetchTime);
            trace(
                "Executing fetch of %d URLs from %s (next fetch time %d)",
                urls.size(), ref, nextFetchTime);
          }

          long startTime = System.currentTimeMillis();

          try {
            _executor.execute(doFetch);
          } catch (RejectedExecutionException e) {
            // should never happen.
            LOGGER.error("Fetch pool rejected our fetch list for " + ref);

            finished(ref);
            skipUrls(
                urls,
                UrlStatus.SKIPPED_DEFERRED,
                String.format("Execution rejection skipped %d URLs", urls.size()));
          }

          // Adjust for how long it took to get the request queued.
          adjustActive(ref, System.currentTimeMillis() - startTime);
        }
      } catch (InterruptedException e) {
        LOGGER.warn("FetchBuffer interrupted!");
        Thread.currentThread().interrupt();
      }
    }

    // Skip all URLs that we've got left.
    if (!values.isEmpty()) {
      trace("Found unprocessed URLs");

      UrlStatus status =
          Thread.interrupted() ? UrlStatus.SKIPPED_INTERRUPTED : UrlStatus.SKIPPED_TIME_LIMIT;

      while (!values.isEmpty()) {
        FetchSetDatum datum = values.drain();
        List<ScoredUrlDatum> urls = datum.getUrls();
        trace(
            "Skipping %d urls from %s (e.g. %s) ",
            urls.size(), datum.getGroupingRef(), urls.get(0).getUrl());
        skipUrls(urls, status, null);
      }
    }
  }