Java CrawlDirUtils примеры использования

Язык программирования: Java

Пространство имен/Пакет: bixo.utils

Класс/Тип: CrawlDirUtils

Примеров на hotexamples.com: 2

Java CrawlDirUtils — найдено 2 примера. Это лучшие примеры Java-кода для bixo.utils.CrawlDirUtils, полученные из проектов с открытым исходным кодом. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать / Скрыть

findLatestLoopDir(2)

makeLoopDir(2)

Пример #1

Показать файл

Файл: DemoWebMiningTool.java Проект: cloudysunny14/bixo

  /**
   * Resets the working directory to a fresh state and imports the seed URLs.
   *
   * <p>Every loop directory that {@code CrawlDirUtils.findLatestLoopDir} reports under
   * {@code workingDirPath} is deleted recursively, then a new loop directory for loop number 0 is
   * created and the seed URLs are imported into its crawl db sub-directory.
   *
   * @param fs file system used to look up, delete and create the loop directories
   * @param workingDirPath directory that contains the loop sub-directories
   * @param seedUrlsfileName seed URLs file name handed to
   *     {@code DemoWebMiningWorkflow.importSeedUrls}
   * @throws IOException if an existing loop directory cannot be deleted
   * @throws Exception if any of the called file system or workflow operations fails
   */
  static void setupWorkingDir(FileSystem fs, Path workingDirPath, String seedUrlsfileName)
      throws Exception {

    // Clear out any previous loop directories, so we're always starting from scratch
    Path loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
    if (loopDirPath != null) {
      LOGGER.info("deleting existing working dir");
    }
    while (loopDirPath != null) {
      // delete() signals failure through its return value. Without this check a directory
      // that cannot be removed would be returned again by the next lookup and the loop would
      // never terminate. A path that has vanished in the meantime is not treated as an error.
      if (!fs.delete(loopDirPath, true) && fs.exists(loopDirPath)) {
        throw new IOException("Unable to delete existing loop dir " + loopDirPath);
      }
      loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
    }

    // Create a "0-<timestamp>" loop sub-directory and import the seed urls
    loopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, 0);
    Path crawlDbPath = new Path(loopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    DemoWebMiningWorkflow.importSeedUrls(crawlDbPath, seedUrlsfileName);
  }

Пример #2

Показать файл

Файл: DemoWebMiningTool.java Проект: cloudysunny14/bixo

  /**
   * Command-line entry point of the demo web mining tool.
   *
   * <p>Parses the arguments into {@code DemoWebMiningOptions}, resets the working directory via
   * {@code setupWorkingDir}, configures the user agent and fetcher policy, and then runs the web
   * mining workflow for two loops, feeding each loop the crawl db produced by the previous one.
   * Any exception raised while building or running the flow is reported on stderr and the JVM
   * exits with status -1.
   *
   * @param args command-line arguments handed to the args parser
   * @throws IOException declared by the signature; failures inside the flow are caught and
   *     reported rather than propagated
   */
  public static void main(String[] args) throws IOException {

    DemoWebMiningOptions toolOptions = new DemoWebMiningOptions();
    CmdLineParser cmdLineParser = new CmdLineParser(toolOptions);

    try {
      cmdLineParser.parseArgument(args);
    } catch (CmdLineException badArguments) {
      System.err.println(badArguments.getMessage());
      // NOTE(review): presumably terminates the JVM, as the name suggests - confirm.
      printUsageAndExit(cmdLineParser);
    }

    // Build and run the flow.
    try {
      Path workingDir = new Path(toolOptions.getWorkingDir());
      FileSystem fileSystem = workingDir.getFileSystem(new JobConf());
      setupWorkingDir(fileSystem, workingDir, CrawlConfig.SEED_URLS_FILENAME);

      Path latestLoopDir = CrawlDirUtils.findLatestLoopDir(fileSystem, workingDir);
      if (latestLoopDir == null) {
        // NOTE(review): error(...) is assumed not to return; otherwise the null path is used below.
        error("No previous cycle output dirs exist in " + workingDir, cmdLineParser);
      }

      // The first loop reads the crawl db that setupWorkingDir just seeded.
      Path currentCrawlDb = new Path(latestLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

      UserAgent agent =
          new UserAgent(
              toolOptions.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);

      FetcherPolicy policy = new FetcherPolicy();
      policy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
      policy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
      policy.setFetcherMode(FetcherMode.EFFICIENT);

      // Only fetch mime types the Tika HTML parser declares support for.
      Set<String> acceptedMimeTypes = new HashSet<String>();
      for (MediaType mediaType : new HtmlParser().getSupportedTypes(new ParseContext())) {
        acceptedMimeTypes.add(mediaType.getType() + "/" + mediaType.getSubtype());
      }
      policy.setValidMimeTypes(acceptedMimeTypes);

      // The crawl is limited to two loops.
      final int lastLoop = 2;
      for (int loopNumber = 1; loopNumber <= lastLoop; loopNumber++) {
        boolean isFirstLoop = (loopNumber == 1);
        Path loopDir = CrawlDirUtils.makeLoopDir(fileSystem, workingDir, loopNumber);

        Flow miningFlow =
            DemoWebMiningWorkflow.createWebMiningWorkflow(
                currentCrawlDb, loopDir, policy, agent, toolOptions, isFirstLoop);
        miningFlow.complete();

        // The next loop starts from the crawl db written by this one.
        currentCrawlDb = new Path(loopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
      }
    } catch (Exception failure) {
      System.err.println("Exception running job: " + failure.getMessage());
      failure.printStackTrace(System.err);
      System.exit(-1);
    }
  }