/** * Handle the user-agent: directive * * @param state current parsing state * @param token data for directive * @return true to keep going, false if we're done */ private boolean handleUserAgent(ParseState state, RobotToken token) { if (state.isMatchedRealName()) { if (state.isFinishedAgentFields()) { // We're all done. return false; } else { // Skip any more of these, once we have a real name match. We're waiting for some // allow/disallow/crawl delay fields. return true; } } if (state.isFinishedAgentFields()) { // We've got a user agent field, so we haven't yet seen anything that tells us // we're done with this set of agent names. state.setFinishedAgentFields(false); state.setAddingRules(false); } // Handle the case when there are multiple target names are passed String[] targetNames = state.getTargetName().split(","); for (int count = 0; count < targetNames.length; count++) { // Extract possible match names from our target agent name, since it appears // to be expected that "Mozilla botname 1.0" matches "botname" String[] targetNameSplits = targetNames[count].trim().split(" "); // TODO KKr - catch case of multiple names, log as non-standard. String[] agentNames = token.getData().split("[ \t,]"); for (String agentName : agentNames) { agentName = agentName.trim(); if (agentName.isEmpty()) { // Ignore empty names } else if (agentName.equals("*") && !state.isMatchedWildcard()) { state.setMatchedWildcard(true); state.setAddingRules(true); } else { for (String targetName : targetNameSplits) { if (targetName.startsWith(agentName)) { state.setMatchedRealName(true); state.setAddingRules(true); state.clearRules(); // In case we previously hit a wildcard rule match break; } } } } } // Keep going return true; }
/** * Handle the allow: directive * * @param state current parsing state * @param token data for directive * @return true to keep going, false if we're done */ private boolean handleAllow(ParseState state, RobotToken token) { state.setFinishedAgentFields(true); if (!state.isAddingRules()) { return true; } String path = token.getData(); try { path = URLDecoder.decode(path, "UTF-8"); } catch (Exception e) { reportWarning("Error parsing robots rules - can't decode path: " + path, state.getUrl()); } if (path.length() == 0) { // Allow: <nothing> => allow all. state.clearRules(); } else { state.addRule(path, true); } return true; }