@Override
public BaseRobotRules failedFetch(int httpStatusCode) {
    SimpleRobotRules result;

    if ((httpStatusCode >= 200) && (httpStatusCode < 300)) {
        throw new IllegalStateException("Can't use status code constructor with 2xx response");
    } else if ((httpStatusCode >= 300) && (httpStatusCode < 400)) {
        // Should only happen if we're getting endless redirects (more than our follow limit),
        // so treat it as a temporary failure.
        result = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
        result.setDeferVisits(true);
    } else if ((httpStatusCode >= 400) && (httpStatusCode < 500)) {
        // Some sites return 410 (gone) instead of 404 (not found), so treat as the same.
        // Actually treat all (including forbidden) as "no robots.txt", as that's what Google
        // and other search engines do.
        result = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
    } else {
        // Treat all other status codes as a temporary failure.
        result = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
        result.setDeferVisits(true);
    }

    return result;
}
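For context, a minimal caller-side sketch of how these failure rules might be consumed. It assumes BaseRobotRules exposes an isDeferVisits() accessor that pairs with the setDeferVisits() setter used above; the robotRulesParser and robotsTxtUrl variables and the requeueRobotsFetch()/scheduleHost() helpers are hypothetical placeholders, not part of the original source.

// Hypothetical caller: decide what to do when the robots.txt fetch itself fails.
BaseRobotRules rules = robotRulesParser.failedFetch(httpStatusCode);
if (rules.isDeferVisits()) {
    // Redirect loops (3xx) and server errors (5xx) come back as ALLOW_NONE with
    // deferVisits set, so retry the robots.txt fetch later instead of crawling.
    requeueRobotsFetch(robotsTxtUrl);
} else {
    // 4xx responses come back as ALLOW_ALL ("no robots.txt"), so crawl normally.
    scheduleHost(robotsTxtUrl, rules);
}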
@Override
public BaseRobotRules parseContent(String url, byte[] content, String contentType, String robotName) {
    _numWarnings = 0;

    // If there's nothing there, treat it like we have no restrictions.
    if ((content == null) || (content.length == 0)) {
        return new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
    }

    int bytesLen = content.length;
    int offset = 0;
    String encoding = "us-ascii";

    // Check for a UTF-8 BOM at the beginning (EF BB BF)
    if ((bytesLen >= 3) && (content[0] == (byte) 0xEF) && (content[1] == (byte) 0xBB) && (content[2] == (byte) 0xBF)) {
        offset = 3;
        bytesLen -= 3;
        encoding = "UTF-8";
    }
    // Check for a UTF-16LE BOM at the beginning (FF FE)
    else if ((bytesLen >= 2) && (content[0] == (byte) 0xFF) && (content[1] == (byte) 0xFE)) {
        offset = 2;
        bytesLen -= 2;
        encoding = "UTF-16LE";
    }
    // Check for a UTF-16BE BOM at the beginning (FE FF)
    else if ((bytesLen >= 2) && (content[0] == (byte) 0xFE) && (content[1] == (byte) 0xFF)) {
        offset = 2;
        bytesLen -= 2;
        encoding = "UTF-16BE";
    }

    String contentAsStr;
    try {
        contentAsStr = new String(content, offset, bytesLen, encoding);
    } catch (UnsupportedEncodingException e) {
        throw new RuntimeException("Impossible unsupported encoding exception for " + encoding);
    }

    // Decide if we need to do special HTML processing.
    boolean isHtmlType = ((contentType != null) && contentType.toLowerCase().startsWith("text/html"));

    // If it looks like it contains HTML, but doesn't have a user agent field, then
    // assume somebody messed up and returned a random HTML page instead of a
    // robots.txt file.
    boolean hasHTML = false;
    if (isHtmlType || SIMPLE_HTML_PATTERN.matcher(contentAsStr).find()) {
        if (!USER_AGENT_PATTERN.matcher(contentAsStr).find()) {
            LOGGER.trace("Found non-robots.txt HTML file: " + url);
            return new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
        } else {
            // We'll try to strip out HTML tags below.
            if (isHtmlType) {
                LOGGER.debug("HTML content type returned for robots.txt file: " + url);
            } else {
                LOGGER.debug("Found HTML in robots.txt file: " + url);
            }

            hasHTML = true;
        }
    }

    // Break on anything that might be used as a line ending. Since the tokenizer doesn't
    // return empty tokens, a \r\n sequence still works, as it looks like an empty
    // string between the \r and \n.
    StringTokenizer lineParser = new StringTokenizer(contentAsStr, "\n\r\u0085\u2028\u2029");
    ParseState parseState = new ParseState(url, robotName.toLowerCase());
    boolean keepGoing = true;

    while (keepGoing && lineParser.hasMoreTokens()) {
        String line = lineParser.nextToken();

        // Get rid of HTML markup, in case some brain-dead webmaster has created an HTML
        // page for robots.txt. We could do more sophisticated processing here to better
        // handle bad HTML, but that's a very tiny percentage of all robots.txt files.
        if (hasHTML) {
            line = line.replaceAll("<[^>]+>", "");
        }

        // Trim out comments and whitespace.
        int hashPos = line.indexOf("#");
        if (hashPos >= 0) {
            line = line.substring(0, hashPos);
        }

        line = line.trim();
        if (line.length() == 0) {
            continue;
        }

        RobotToken token = tokenize(line);
        switch (token.getDirective()) {
            case USER_AGENT:
                keepGoing = handleUserAgent(parseState, token);
                break;

            case DISALLOW:
                keepGoing = handleDisallow(parseState, token);
                break;

            case ALLOW:
                keepGoing = handleAllow(parseState, token);
                break;

            case CRAWL_DELAY:
                keepGoing = handleCrawlDelay(parseState, token);
                break;

            case SITEMAP:
                keepGoing = handleSitemap(parseState, token);
                break;

            case HTTP:
                keepGoing = handleHttp(parseState, token);
                break;

            case UNKNOWN:
                reportWarning("Unknown directive in robots.txt file: " + line, url);
                parseState.setFinishedAgentFields(true);
                break;

            case MISSING:
                reportWarning(String.format("Unknown line in robots.txt file (size %d): %s", content.length, line), url);
                parseState.setFinishedAgentFields(true);
                break;

            default:
                // All other directives we just ignore.
                // TODO KKr - which of these should be setting finishedAgentFields to true?
                // TODO KKr - handle no-index
                // TODO KKr - handle request-rate and visit-time
                break;
        }
    }

    SimpleRobotRules result = parseState.getRobotRules();
    if (result.getCrawlDelay() > MAX_CRAWL_DELAY) {
        // Some evil sites use a value like 3600 (seconds) for the crawl delay, which would
        // cause lots of problems for us.
        LOGGER.debug("Crawl delay exceeds max value - so disallowing all URLs: " + url);
        return new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
    } else {
        result.sortRules();
        return result;
    }
}
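And a hedged usage sketch for the parse path: how a crawler might feed fetched robots.txt bytes into parseContent() and then consult the resulting rules. It assumes the standard BaseRobotRules accessors isAllowed() and getCrawlDelay() that pair with the fields set above; the robotRulesParser variable, the example URLs and agent name, and the fetchLater() helper are hypothetical.

// Hypothetical caller: parse a fetched robots.txt and check a candidate URL.
// robotsTxtBytes holds the raw robots.txt response body fetched elsewhere.
BaseRobotRules rules = robotRulesParser.parseContent(
        "http://example.com/robots.txt",  // URL the robots.txt was fetched from
        robotsTxtBytes,                   // may be null or empty: that yields ALLOW_ALL
        "text/plain",                     // Content-Type reported by the server
        "mycrawler");                     // robot name matched against User-agent lines

if (rules.isAllowed("http://example.com/some/page.html")) {
    // Honor the parsed crawl delay between requests to the same host.
    fetchLater("http://example.com/some/page.html", rules.getCrawlDelay());
}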
// Helper methods that delegate to the current rules object (_curRules) being assembled.
public void addSitemap(String sitemap) {
    _curRules.addSitemap(sitemap);
}

public void setCrawlDelay(long delay) {
    _curRules.setCrawlDelay(delay);
}

public void addRule(String prefix, boolean allow) {
    _curRules.addRule(prefix, allow);
}

public void clearRules() {
    _curRules.clearRules();
}