public void head(Node source, int depth) { if (source instanceof Element) { Element sourceEl = (Element) source; if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs ElementMeta meta = createSafeElement(sourceEl); Element destChild = meta.el; destination.appendChild(destChild); numDiscarded += meta.numAttribsDiscarded; destination = destChild; } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. numDiscarded++; } } else if (source instanceof TextNode) { TextNode sourceText = (TextNode) source; TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri()); destination.appendChild(destText); } else if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) { DataNode sourceData = (DataNode) source; DataNode destData = new DataNode(sourceData.getWholeData(), source.baseUri()); destination.appendChild(destData); } else { // else, we don't care about comments, xml proc instructions, etc numDiscarded++; } }
/**
 * Get the combined data of this element. Data is e.g. the inside of a {@code script} tag.
 *
 * <p>Children are visited in order: a data node contributes its whole data, a child element
 * contributes its own combined data (recursively), and any other kind of child is skipped.
 *
 * @return the data, or empty string if none
 * @see #dataNodes()
 */
public String data() {
    StringBuilder combined = new StringBuilder();
    for (Node child : childNodes) {
        if (child instanceof DataNode) {
            combined.append(((DataNode) child).getWholeData());
            continue;
        }
        if (child instanceof Element) {
            // Recurse so that data nested inside descendant elements is included.
            combined.append(((Element) child).data());
        }
    }
    return combined.toString();
}
/** Script declaring {@code liveMatchUpdater}; group 1 holds the digits that follow "id: ". */
private static final Pattern MATCH_ID_PATTERN =
    Pattern.compile(
        ".*var liveMatchUpdater = .*parameters: \\{.*id: (\\d*).*\\}.*", Pattern.DOTALL);

/** Script declaring {@code initialMatchDataForScrappers}; group 2 holds the variables to split. */
private static final Pattern SCRIPT_PATTERN =
    Pattern.compile(
        "(.*)var initialMatchDataForScrappers = \\[\\[(.*), \\[(.*)", Pattern.DOTALL);

/**
 * Fixture entry. Groups 1 and 3 feed the home team, groups 2 and 4 the away team, group 5 the
 * date/time and group 6 the elapsed time.
 */
private static final Pattern FIXTURE_PATTERN =
    Pattern.compile(
        "\\[(\\d*),(\\d*),'(.*?)','(.*?)','(.*?)','.*?',\\d*,'(.*?)',(.*)\\]", Pattern.DOTALL);

/** Double-bracketed list of match events; group 1 holds the newline-separated events. */
private static final Pattern MATCH_EVENTS_PATTERN =
    Pattern.compile("\\[\\[(.*?)\\]\\]", Pattern.DOTALL);

/** Goal event (goal, own goal or penalty goal). */
private static final Pattern GOAL_PATTERN =
    Pattern.compile(
        ".*\\['(.*?)',('(.*?)')?,'(goal|owngoal|penalty-goal)','.*?',('OG'|'Pen.')?,(\\d*),(\\d*),(\\d*)\\].*",
        Pattern.DOTALL);

/** Card event (yellow, second yellow or red). */
private static final Pattern CARD_PATTERN =
    Pattern.compile(
        ".*\\['(.*?)',,'(yellow|secondyellow|red)',,,(\\d*),(\\d*),(\\d*)\\].*",
        Pattern.DOTALL);

/** Substitution event. */
private static final Pattern SUBSTITUTION_PATTERN =
    Pattern.compile(
        ".*\\['(.*?)','(.*?)','subst',,,(\\d*),(\\d*),(\\d*)\\].*", Pattern.DOTALL);

/**
 * Fills {@code matchParserObject} from the inline {@code script} elements of the current
 * document and returns it wrapped in a set.
 *
 * <p>For every data node of every {@code script} element:
 *
 * <ul>
 *   <li>if the script declares {@code liveMatchUpdater}, its id is stored when the object's
 *       id is still {@code 0}; a differing non-zero id is only logged as an error;
 *   <li>if the script declares {@code initialMatchDataForScrappers}, fixture details (home
 *       team, away team, date/time, elapsed time) are stored where the object's value is still
 *       {@code null}, and goals, cards and substitutions are added to the owning team.
 * </ul>
 *
 * <p>An event whose team cannot be resolved is logged and skipped. Finally the object is
 * passed to {@code getParserProperties().map(...)}.
 *
 * @param matchParserObject the object to populate; it is mutated in place
 * @return a new mutable set containing exactly {@code matchParserObject}
 * @throws NumberFormatException if a numeric capture is empty (the patterns accept zero
 *     digits) or does not fit in an {@code int}
 */
public Set<MatchParserObject> parse(WhoScoredMatchParserObject matchParserObject) {
    Elements scriptElements = getDocument().getElementsByTag("script");

    for (Element scriptElement : scriptElements) {
        for (DataNode node : scriptElement.dataNodes()) {
            // Render the node once; both script-level patterns inspect the same text.
            String script = node.toString();
            readWhoScoredId(script, matchParserObject);
            readMatchData(script, matchParserObject);
        }
    }

    getParserProperties().map(matchParserObject);

    Set<MatchParserObject> matchParserObjects = new HashSet<MatchParserObject>();
    matchParserObjects.add(matchParserObject);
    return matchParserObjects;
}

/** Stores the match id found in {@code script}, or logs a mismatch with an already-set id. */
private void readWhoScoredId(String script, WhoScoredMatchParserObject matchParserObject) {
    Matcher matchIdMatcher = MATCH_ID_PATTERN.matcher(script);
    if (!matchIdMatcher.matches()) {
        return;
    }

    int whoScoredId = Integer.parseInt(matchIdMatcher.group(1));
    if (matchParserObject.getWhoScoredId() == 0) {
        matchParserObject.setWhoScoredId(whoScoredId);
    } else if (matchParserObject.getWhoScoredId() != whoScoredId) {
        logger.error(
            "Provided WhoScoredMatchParserObject had whoScoredId={} but parsed document had whoScoredId={}",
            matchParserObject.getWhoScoredId(),
            whoScoredId);
    }
}

/** Splits the match-data script into variables and reads the fixture and event variables. */
private void readMatchData(String script, WhoScoredMatchParserObject matchParserObject) {
    Matcher scriptMatcher = SCRIPT_PATTERN.matcher(script);
    if (!scriptMatcher.matches()) {
        return;
    }

    for (String rawVariable : scriptMatcher.group(2).split("\n, ")) {
        String scriptVariable = rawVariable.trim();

        // A variable is treated as a fixture first; only otherwise as an event list.
        Matcher fixtureMatcher = FIXTURE_PATTERN.matcher(scriptVariable);
        if (fixtureMatcher.matches()) {
            readFixture(fixtureMatcher, matchParserObject);
            continue;
        }

        Matcher matchEventsMatcher = MATCH_EVENTS_PATTERN.matcher(scriptVariable);
        if (matchEventsMatcher.matches()) {
            for (String eventVariable : matchEventsMatcher.group(1).split("\n")) {
                readMatchEvent(eventVariable, matchParserObject);
            }
        }
    }
}

/** Copies fixture details into {@code matchParserObject} without overwriting non-null values. */
private void readFixture(Matcher fixtureMatcher, WhoScoredMatchParserObject matchParserObject) {
    if (matchParserObject.getHomeTeam() == null) {
        matchParserObject.setHomeTeam(
            new WhoScoredTeamParserObject(
                fixtureMatcher.group(3), Integer.parseInt(fixtureMatcher.group(1))));
    }
    if (matchParserObject.getAwayTeam() == null) {
        matchParserObject.setAwayTeam(
            new WhoScoredTeamParserObject(
                fixtureMatcher.group(4), Integer.parseInt(fixtureMatcher.group(2))));
    }
    if (matchParserObject.getDateTime() == null) {
        matchParserObject.setDateTime(fixtureMatcher.group(5));
    }
    if (matchParserObject.getTimeElapsed() == null) {
        matchParserObject.setTimeElapsed(fixtureMatcher.group(6));
    }
}

/** Adds one goal, card or substitution event to its team; other event text is ignored. */
private void readMatchEvent(String eventVariable, WhoScoredMatchParserObject matchParserObject) {
    Matcher goalMatcher = GOAL_PATTERN.matcher(eventVariable);
    if (goalMatcher.matches()) {
        String goalType = goalMatcher.group(4);
        WhoScoredGoalParserObject goalParserObject =
            new WhoScoredGoalParserObject(
                goalMatcher.group(1),
                Integer.parseInt(goalMatcher.group(7)),
                goalMatcher.group(3),
                Integer.parseInt(goalMatcher.group(8)),
                Integer.parseInt(goalMatcher.group(6)),
                goalType.equalsIgnoreCase("penalty-goal"),
                goalType.equalsIgnoreCase("owngoal"));
        WhoScoredTeamParserObject team = matchParserObject.getTeamForGoal(goalParserObject);
        if (team == null) {
            logger.error("Could not find team for goal {}.", goalParserObject);
        } else {
            team.getGoals().add(goalParserObject);
        }
        return;
    }

    // NOTE(review): the two branches below assume getTeamForPlayer returns a
    // WhoScoredTeamParserObject (as getTeamForGoal does) and may return null - confirm.
    Matcher cardMatcher = CARD_PATTERN.matcher(eventVariable);
    if (cardMatcher.matches()) {
        // Anything other than a plain yellow (second yellow, red) is recorded as a red card.
        CardType cardType =
            cardMatcher.group(2).equalsIgnoreCase("yellow") ? CardType.YELLOW : CardType.RED;
        WhoScoredCardParserObject cardParserObject =
            new WhoScoredCardParserObject(
                cardMatcher.group(1),
                Integer.parseInt(cardMatcher.group(4)),
                Integer.parseInt(cardMatcher.group(3)),
                cardType);
        WhoScoredTeamParserObject team =
            matchParserObject.getTeamForPlayer(cardParserObject.getPlayerWhoScoredId());
        if (team == null) {
            // Handled like an unresolvable goal: log and skip instead of failing the parse.
            logger.error("Could not find team for card {}.", cardParserObject);
        } else {
            team.getCards().add(cardParserObject);
        }
        return;
    }

    Matcher substitutionMatcher = SUBSTITUTION_PATTERN.matcher(eventVariable);
    if (substitutionMatcher.matches()) {
        WhoScoredSubstitutionParserObject substitutionParserObject =
            new WhoScoredSubstitutionParserObject(
                substitutionMatcher.group(1),
                Integer.parseInt(substitutionMatcher.group(4)),
                substitutionMatcher.group(2),
                Integer.parseInt(substitutionMatcher.group(5)),
                Integer.parseInt(substitutionMatcher.group(3)));
        WhoScoredTeamParserObject team =
            matchParserObject.getTeamForPlayer(
                substitutionParserObject.getPlayerOutWhoScoredId());
        if (team == null) {
            // Handled like an unresolvable goal: log and skip instead of failing the parse.
            logger.error("Could not find team for substitution {}.", substitutionParserObject);
        } else {
            team.getSubstitutions().add(substitutionParserObject);
        }
    }
}