Example #1
  static void setupWorkingDir(FileSystem fs, Path workingDirPath, String seedUrlsfileName)
      throws Exception {

    // Clear out any previous loop directories, so we're always starting from scratch
    Path loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
    while (loopDirPath != null) {
      LOGGER.info("Deleting existing loop dir " + loopDirPath);
      fs.delete(loopDirPath, true);
      loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
    }

    // Create a "0-<timestamp>" loop sub-directory and import the seed urls
    loopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, 0);
    Path crawlDbPath = new Path(loopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    DemoWebMiningWorkflow.importSeedUrls(crawlDbPath, seedUrlsfileName);
  }
Example #2
  public static void main(String[] args) throws IOException {

    DemoWebMiningOptions options = new DemoWebMiningOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
      parser.parseArgument(args);
    } catch (CmdLineException e) {
      System.err.println(e.getMessage());
      printUsageAndExit(parser);
    }

    // Build and run the flow.

    try {

      Path workingDirPath = new Path(options.getWorkingDir());

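      // Get the FileSystem for the working dir, then clear old loop dirs and re-import the seed urls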
      JobConf conf = new JobConf();
      FileSystem fs = workingDirPath.getFileSystem(conf);
      setupWorkingDir(fs, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);

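      // Start from the crawl db inside the most recent loop dir (just created by setupWorkingDir)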
      Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
      if (latestDirPath == null) {
        error("No previous cycle output dirs exist in " + workingDirPath, parser);
      }

      Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

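      // Identify the crawler to the sites being fetched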
      UserAgent userAgent =
          new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);

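      // Configure fetch behavior: crawl delay, max content size, and fetcher mode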
      FetcherPolicy fetcherPolicy = new FetcherPolicy();
      fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
      fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
      fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

      // Restrict fetching to the mime types that the Tika HTML parser can handle.
      Set<String> validMimeTypes = new HashSet<String>();
      Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
      for (MediaType supportedType : supportedTypes) {
        validMimeTypes.add(
            String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
      }
      fetcherPolicy.setValidMimeTypes(validMimeTypes);

      // Let's limit our crawl to two loops
      for (int curLoop = 1; curLoop <= 2; curLoop++) {
        Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);
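        // Build the web mining flow for this loop and run it to completion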
        Flow flow =
            DemoWebMiningWorkflow.createWebMiningWorkflow(
                crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options, curLoop == 1);
        flow.complete();

        // Update crawlDbPath to point to the latest crawl db
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
      }

    } catch (Exception e) {
      System.err.println("Exception running job: " + e.getMessage());
      e.printStackTrace(System.err);
      System.exit(-1);
    }
  }