/**
 * Prepares this listener when a crawler starts or resumes.
 *
 * <p>For any other event type the method returns immediately. Otherwise it:
 *
 * <ol>
 *   <li>points {@code outputFile} at a new, timestamped {@code .tsv} file under
 *       {@code outputDir} and makes sure its parent directories exist;
 *   <li>writes the column-title row ({@code Referrer, URL, Status, Reason});
 *   <li>rebuilds {@code parsedCodes} from the {@code statusCodes} string, which is a
 *       comma-separated list of single codes and/or {@code start-end} ranges (inclusive).
 * </ol>
 *
 * <p>A blank {@code statusCodes} leaves {@code parsedCodes} empty.
 *
 * @param type the crawler event type; only {@code CrawlerEvent.CRAWLER_STARTED} and
 *     {@code CrawlerEvent.CRAWLER_RESUMED} trigger initialization
 * @param crawlerId crawler identifier, embedded (in file-name-safe form) in the output file name
 * @throws CollectorException if the directories for the output file cannot be created
 * @throws IllegalArgumentException if a status code range is malformed or its start value is
 *     not strictly lower than its end value
 */
private void initializeOnStartOrResume(String type, String crawlerId) {
    if (!CrawlerEvent.CRAWLER_STARTED.equals(type) && !CrawlerEvent.CRAWLER_RESUMED.equals(type)) {
      return;
    }

    // Create new file on crawler start/resume. The timestamp keeps successive runs from
    // reusing the same file name.
    outputFile =
        new File(
            outputDir,
            fileNamePrefix
                + FileUtil.toSafeFileName(crawlerId)
                + "-"
                + System.currentTimeMillis()
                + ".tsv");
    try {
      FileUtil.createDirsForFile(outputFile);
    } catch (IOException e) {
      throw new CollectorException("Cannot create output directory for file: " + outputFile, e);
    }
    writeLine("Referrer", "URL", "Status", "Reason", false);

    // Always start from a clean slate: this method runs on every start/resume event, so
    // appending to codes parsed by a previous invocation would accumulate duplicates and
    // keep codes that are no longer configured.
    parsedCodes.clear();
    if (StringUtils.isBlank(statusCodes)) {
      return;
    }

    // Parse status codes: comma-separated entries, each a single code or a "start-end" range.
    for (String range : statusCodes.split("\\s*,\\s*")) {
      String[] endPoints = range.split("\\s*-\\s*");
      if (endPoints.length == 1) {
        parsedCodes.add(toInt(endPoints[0]));
      } else if (endPoints.length == 2) {
        int start = toInt(endPoints[0]);
        int end = toInt(endPoints[1]);
        if (start >= end) {
          throw new IllegalArgumentException(
              "Invalid statusCode range: "
                  + range
                  + ". Start value must be lower than end value.");
        }
        // Both end points are part of the range.
        for (int code = start; code <= end; code++) {
          parsedCodes.add(code);
        }
      } else {
        throw new IllegalArgumentException("Invalid statusCode range: " + range);
      }
    }
  }
Code example #2
0
 /**
  * Creates an {@link MVStoreCrawlDataStore} for the given crawler configuration.
  *
  * <p>The store directory is built as
  * {@code <workDir>/crawlstore/mvstore/<file-name-safe crawler id>/}.
  *
  * @param config crawler configuration supplying the work directory and the crawler id
  * @param resume passed through unchanged to the {@link MVStoreCrawlDataStore} constructor
  * @return a new store instance bound to the computed directory
  */
 @Override
 public ICrawlDataStore createCrawlDataStore(ICrawlerConfig config, boolean resume) {
   // Resolve the two variable path segments first, in the same order as they appear in the path.
   final String workDirPath = config.getWorkDir().getPath();
   final String safeCrawlerId = FileUtil.toSafeFileName(config.getId());

   final String storeDir = workDirPath + "/crawlstore/mvstore/" + safeCrawlerId + "/";
   return new MVStoreCrawlDataStore(storeDir, resume);
 }