@Override
public BaseRobotRules failedFetch(int httpStatusCode) {
    SimpleRobotRules result;

    if ((httpStatusCode >= 200) && (httpStatusCode < 300)) {
        throw new IllegalStateException("Can't use status code constructor with 2xx response");
    } else if ((httpStatusCode >= 300) && (httpStatusCode < 400)) {
        // Should only happen if we're getting endless redirects (more than our follow limit),
        // so treat it as a temporary failure.
        result = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
        result.setDeferVisits(true);
    } else if ((httpStatusCode >= 400) && (httpStatusCode < 500)) {
        // Some sites return 410 (gone) instead of 404 (not found), so treat as the same.
        // Actually treat all (including forbidden) as "no robots.txt", as that's what Google
        // and other search engines do.
        result = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
    } else {
        // Treat all other status codes as a temporary failure.
        result = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
        result.setDeferVisits(true);
    }

    return result;
}
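For context, a minimal caller-side sketch of how these failure rules might be consumed. It assumes BaseRobotRules exposes an isDeferVisits() accessor that pairs with the setDeferVisits() setter used above; the robotRulesParser and robotsTxtUrl variables and the requeueRobotsFetch()/scheduleHost() helpers are hypothetical placeholders, not part of the original source.

// Hypothetical caller: decide what to do when the robots.txt fetch itself fails.
BaseRobotRules rules = robotRulesParser.failedFetch(httpStatusCode);
if (rules.isDeferVisits()) {
    // Redirect loops (3xx) and server errors (5xx) come back as ALLOW_NONE with
    // deferVisits set, so retry the robots.txt fetch later instead of crawling.
    requeueRobotsFetch(robotsTxtUrl);
} else {
    // 4xx responses come back as ALLOW_ALL ("no robots.txt"), so crawl normally.
    scheduleHost(robotsTxtUrl, rules);
}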
@Override
public BaseRobotRules parseContent(String url, byte[] content, String contentType, String robotName) {
    _numWarnings = 0;

    // If there's nothing there, treat it like we have no restrictions.
    if ((content == null) || (content.length == 0)) {
        return new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
    }

    int bytesLen = content.length;
    int offset = 0;
    String encoding = "us-ascii";

    // Check for a UTF-8 BOM at the beginning (EF BB BF)
    if ((bytesLen >= 3) && (content[0] == (byte) 0xEF) && (content[1] == (byte) 0xBB) && (content[2] == (byte) 0xBF)) {
        offset = 3;
        bytesLen -= 3;
        encoding = "UTF-8";
    }
    // Check for a UTF-16LE BOM at the beginning (FF FE)
    else if ((bytesLen >= 2) && (content[0] == (byte) 0xFF) && (content[1] == (byte) 0xFE)) {
        offset = 2;
        bytesLen -= 2;
        encoding = "UTF-16LE";
    }
    // Check for a UTF-16BE BOM at the beginning (FE FF)
    else if ((bytesLen >= 2) && (content[0] == (byte) 0xFE) && (content[1] == (byte) 0xFF)) {
        offset = 2;
        bytesLen -= 2;
        encoding = "UTF-16BE";
    }

    String contentAsStr;
    try {
        contentAsStr = new String(content, offset, bytesLen, encoding);
    } catch (UnsupportedEncodingException e) {
        throw new RuntimeException("Impossible unsupported encoding exception for " + encoding);
    }

    // Decide if we need to do special HTML processing.
    boolean isHtmlType = ((contentType != null) && contentType.toLowerCase().startsWith("text/html"));

    // If it looks like it contains HTML, but doesn't have a user agent field, then
    // assume somebody messed up and returned a random HTML page instead of a
    // robots.txt file.
    boolean hasHTML = false;
    if (isHtmlType || SIMPLE_HTML_PATTERN.matcher(contentAsStr).find()) {
        if (!USER_AGENT_PATTERN.matcher(contentAsStr).find()) {
            LOGGER.trace("Found non-robots.txt HTML file: " + url);
            return new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
        } else {
            // We'll try to strip out HTML tags below.
            if (isHtmlType) {
                LOGGER.debug("HTML content type returned for robots.txt file: " + url);
            } else {
                LOGGER.debug("Found HTML in robots.txt file: " + url);
            }

            hasHTML = true;
        }
    }

    // Break on anything that might be used as a line ending. Since the tokenizer doesn't
    // return empty tokens, a \r\n sequence still works, as it looks like an empty
    // string between the \r and \n.
    StringTokenizer lineParser = new StringTokenizer(contentAsStr, "\n\r\u0085\u2028\u2029");
    ParseState parseState = new ParseState(url, robotName.toLowerCase());
    boolean keepGoing = true;

    while (keepGoing && lineParser.hasMoreTokens()) {
        String line = lineParser.nextToken();

        // Get rid of HTML markup, in case some brain-dead webmaster has created an HTML
        // page for robots.txt. We could do more sophisticated processing here to better
        // handle bad HTML, but that's a very tiny percentage of all robots.txt files.
        if (hasHTML) {
            line = line.replaceAll("<[^>]+>", "");
        }

        // Trim out comments and whitespace.
        int hashPos = line.indexOf("#");
        if (hashPos >= 0) {
            line = line.substring(0, hashPos);
        }

        line = line.trim();
        if (line.length() == 0) {
            continue;
        }

        RobotToken token = tokenize(line);
        switch (token.getDirective()) {
            case USER_AGENT:
                keepGoing = handleUserAgent(parseState, token);
                break;

            case DISALLOW:
                keepGoing = handleDisallow(parseState, token);
                break;

            case ALLOW:
                keepGoing = handleAllow(parseState, token);
                break;

            case CRAWL_DELAY:
                keepGoing = handleCrawlDelay(parseState, token);
                break;

            case SITEMAP:
                keepGoing = handleSitemap(parseState, token);
                break;

            case HTTP:
                keepGoing = handleHttp(parseState, token);
                break;

            case UNKNOWN:
                reportWarning("Unknown directive in robots.txt file: " + line, url);
                parseState.setFinishedAgentFields(true);
                break;

            case MISSING:
                reportWarning(String.format("Unknown line in robots.txt file (size %d): %s", content.length, line), url);
                parseState.setFinishedAgentFields(true);
                break;

            default:
                // All other directives we just ignore.
                // TODO KKr - which of these should be setting finishedAgentFields to true?
                // TODO KKr - handle no-index
                // TODO KKr - handle request-rate and visit-time
                break;
        }
    }

    SimpleRobotRules result = parseState.getRobotRules();
    if (result.getCrawlDelay() > MAX_CRAWL_DELAY) {
        // Some evil sites use a value like 3600 (seconds) for the crawl delay, which would
        // cause lots of problems for us.
        LOGGER.debug("Crawl delay exceeds max value - so disallowing all URLs: " + url);
        return new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
    } else {
        result.sortRules();
        return result;
    }
}
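And a hedged usage sketch for the parse path: how a crawler might feed fetched robots.txt bytes into parseContent() and then consult the resulting rules. It assumes the standard BaseRobotRules accessors isAllowed() and getCrawlDelay() that pair with the fields set above; the robotRulesParser variable, the example URLs and agent name, and the fetchLater() helper are hypothetical.

// Hypothetical caller: parse a fetched robots.txt and check a candidate URL.
// robotsTxtBytes holds the raw robots.txt response body fetched elsewhere.
BaseRobotRules rules = robotRulesParser.parseContent(
        "http://example.com/robots.txt",  // URL the robots.txt was fetched from
        robotsTxtBytes,                   // may be null or empty: that yields ALLOW_ALL
        "text/plain",                     // Content-Type reported by the server
        "mycrawler");                     // robot name matched against User-agent lines

if (rules.isAllowed("http://example.com/some/page.html")) {
    // Honor the parsed crawl delay between requests to the same host.
    fetchLater("http://example.com/some/page.html", rules.getCrawlDelay());
}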
// Helper methods that delegate to the current rules object (_curRules) being assembled.
public void addSitemap(String sitemap) {
    _curRules.addSitemap(sitemap);
}

public void setCrawlDelay(long delay) {
    _curRules.setCrawlDelay(delay);
}

public void addRule(String prefix, boolean allow) {
    _curRules.addRule(prefix, allow);
}

public void clearRules() {
    _curRules.clearRules();
}