/**
 * Extracts a single venue token from the first node matched by this
 * analyzer's XPath.
 *
 * <p>Token layout:
 * <ul>
 *   <li>[0] - venue name, passed through {@code VenueUtil.cleanupVenueName}</li>
 *   <li>[1] - venue address, passed through {@code CleanupUtil.cleanUpName}</li>
 * </ul>
 *
 * @param document the document to analyze; must not be {@code null}
 * @return {@code null} when the XPath matches no node; an empty list when
 *         the extracted token is rejected by {@code isTokenValid}; otherwise
 *         a list holding exactly one token
 * @throws XPathExpressionException if the XPath cannot be evaluated
 */
@Override
public final List<String[]> analyzeSandboxed(final Document document) throws XPathExpressionException {
    Preconditions.checkNotNull(document);

    final NodeList matches = ParserUtil.evaluateXPath(document, this.getXPath());
    if (matches.getLength() == 0) {
        // Nothing matched: callers receive null here, not an empty list.
        return null;
    }
    final Node venueNode = matches.item(0);

    this.assertBeforeProcessingBegins();
    this.enterProcessingStage();

    final String[] venueToken = new String[this.tokenLenght];
    venueToken[0] = VenueUtil.cleanupVenueName(this.getVenueNameFromNode(venueNode));
    venueToken[1] = CleanupUtil.cleanUpName(this.getVenueAddressFromNode(venueNode));

    // The result stays empty for a rejected token and carries the token otherwise;
    // either way processing of the token is ended before returning.
    final List<String[]> result = Lists.<String[]>newArrayList();
    if (this.isTokenValid(venueToken)) {
        this.assertDuringProcessing();
        result.add(venueToken);
    }
    this.endProcessingOfToken();
    return result;
}
@Override protected final String[] analyzeNodeInternal(final Node node) { final String[] token = new String[this.tokenLenght]; // 0 - event full name token[0] = this.getStringFromNode(node); token[0] = CleanupUtil.cleanUpName(token[0]); // 1 - name of band/artist token[1] = BandUtil.cleanBandName(token[0]); // 2 - name of venue token[2] = this.getVenueFromNode(node.getParentNode().getNextSibling().getNextSibling()); token[2] = CleanupUtil.cleanUpName(token[2]); token[2] = VenueCleanupComponent.extractKnownVenueNameFromLargerText(token[2]); // 3 - date token[3] = this.getDateFromNode(node.getParentNode().getNextSibling().getNextSibling()); token[3] = DateAnalysisUtil.cleanupDate(token[3]); return token; }
/** * - note: the assumption that once the XPath reaches the elements, everything is one big list * with no interruptions is wrong <br> */ @Override protected final String[] analyzeNodeInternal(final Node node) { final String[] token = new String[this.tokenLenght]; String text = null; text = this.getStringFromNode(node); if (text == null) { // skipping over noise return null; } text = text.trim(); token[0] = this.getStreetType(text); if (token[0] != null) { token[0] = CleanupUtil.cleanUpName(token[0]); } token[1] = this.getStreetName(text); if (token[1] != null) { token[1] = StreetUtil.cleanStreetName(token[1]); } return token; }