static {
    // Register every non-special directive under a prefix derived from its
    // enum constant name: lower-cased, with '_' replaced by '-' (a constant
    // named CRAWL_DELAY would yield "crawl-delay").
    //
    // Locale.ROOT makes the lower-casing independent of the JVM default
    // locale. With a Turkish default locale, "DISALLOW".toLowerCase() yields
    // "dısallow" (dotless i), which would never match ASCII robots.txt text.
    for (RobotDirective directive : RobotDirective.values()) {
        if (!directive.isSpecial()) {
            String prefix = directive.name()
                    .toLowerCase(java.util.Locale.ROOT)
                    .replace('_', '-');
            DIRECTIVE_PREFIX.put(prefix, directive);
        }
    }

    // Alternate spellings (missing hyphen, dropped or swapped letters, blank
    // instead of hyphen) that are mapped onto the directive they stand for.
    DIRECTIVE_PREFIX.put("useragent", RobotDirective.USER_AGENT);
    DIRECTIVE_PREFIX.put("useg-agent", RobotDirective.USER_AGENT);
    DIRECTIVE_PREFIX.put("ser-agent", RobotDirective.USER_AGENT);

    DIRECTIVE_PREFIX.put("desallow", RobotDirective.DISALLOW);
    DIRECTIVE_PREFIX.put("dissalow", RobotDirective.DISALLOW);
    DIRECTIVE_PREFIX.put("dssalow", RobotDirective.DISALLOW);
    DIRECTIVE_PREFIX.put("dsallow", RobotDirective.DISALLOW);

    DIRECTIVE_PREFIX.put("crawl delay", RobotDirective.CRAWL_DELAY);
}
/** * Figure out directive on line of text from robots.txt file. We assume the line has been * lower-cased * * @param line * @return robot command found on line */ private static RobotToken tokenize(String line) { String lowerLine = line.toLowerCase(); for (String prefix : DIRECTIVE_PREFIX.keySet()) { int prefixLength = prefix.length(); if (lowerLine.startsWith(prefix)) { RobotDirective directive = DIRECTIVE_PREFIX.get(prefix); String dataPortion = lowerLine.substring(prefixLength); // preserve the original case for sitemaps if (directive.equals(RobotDirective.SITEMAP)) dataPortion = line.substring(prefixLength); if (directive.isPrefix()) { Matcher m = DIRECTIVE_SUFFIX_PATTERN.matcher(dataPortion); if (m.matches()) { dataPortion = m.group(1); } else { continue; } } Matcher m = COLON_DIRECTIVE_DELIMITER.matcher(dataPortion); if (!m.matches()) { m = BLANK_DIRECTIVE_DELIMITER.matcher(dataPortion); } if (m.matches()) { return new RobotToken(directive, m.group(1).trim()); } } } Matcher m = COLON_DIRECTIVE_DELIMITER.matcher(lowerLine); if (m.matches()) { return new RobotToken(RobotDirective.UNKNOWN, line); } else { return new RobotToken(RobotDirective.MISSING, line); } }