/** * Uses the extraction template generated by the {@link #templateGeneration} process and utilizes * some own characteristics of the target page pt to extract the content of it which first needs * to be parsed to a DOM tree <em>tp</em>. * * @param tt The previouslz generated extraction template * @param tp_i The root DOM element of the document to predict the main content for */ public void newsContentExtraction(Deque<Token> tt, Token tp_i) { // templateNode <-- tt.firstElement Token templateNode = tt.getFirst(); if (templateNode.getName().equals(tp_i.getName()) && templateNode.getLevel() == tp_i.getLevel() && templateNode.getSibNo() == tp_i.getSibNo()) { tt.removeFirst(); if (tt.size() > 0) { Token nextTemplateNode = tt.getFirst(); if (tp_i.getChildren() == null || tp_i.getChildren().length == 0) { LOG.info(tp_i.getText()); while (nextTemplateNode.getParentNo() == templateNode.getNo()) { tt.removeFirst(); templateNode = nextTemplateNode; nextTemplateNode = tt.getFirst(); } } else { if (nextTemplateNode.getParentNo() != templateNode.getNo()) { System.out.println(this.deleteEmbeddedNoise(tp_i.getSubtreeText())); } for (int j = 0; j < tp_i.getChildren().length; j++) { this.newsContentExtraction(tt, tp_i.getChildren()[j]); } } } else { LOG.info(this.deleteEmbeddedNoise(tp_i.getSubtreeText())); } } }
/** * Generates an extraction template <em>tt</em>, which is the set of non-noisy nodes by * depth-first traversal of the maximum matching tree <em>tm</em> which only contains nodes that * matched in both ta and tb trees. * * <p>If two pages have the same sub-trees, these fractions are considered noise. As anchor-tags * are hidden noise text, the anchor-text-ratio of a sub-segment has to exceed a given threshold * to be excluded. * * <p>Moreover punctuation metrics are used to further separate noise from content. * * @param i The index of the i-th element of the maximum matching tree tm * @return The generated extraction template build from the two provided sources */ private Deque<Token> treeExtraction(int i) { Token matchedNode; if (this.tm.get(i).getMatchedNode() instanceof Tag) { matchedNode = new Tag(this.tm.get(i).getMatchedNode()); } else { matchedNode = new Word(this.tm.get(i).getMatchedNode()); } if (this.tm != null && !this.tm.get(i).getSubtreeText().equals(matchedNode.getSubtreeText()) && this.tm.get(i).getAnchorTextRatio() <= 1.3 * this.ta.get(0).getAnchorTextRatio() && !(this.tm.get(i).getPunctNum() == 0 && this.tm.get(i).getSegNum() >= 3)) { if (this.tm.get(i).getSibNo() == matchedNode.getSibNo()) { this.tt.add(this.tm.get(i)); for (int j = 1; j < this.tm.get(i).getChildren().length; j++) { this.treeExtraction(this.tm.get(i).getChildren()[j].getNo()); } } // expand extraction range of tt to extract content that did not // matched in the maximum matching tree tm as they occurred in tb // but not in ta else if (this.tm.get(this.tm.get(i).getParentNo()).getSibNo() == this.tb.get(matchedNode.getParentNo()).getSibNo()) { while (this.tt.peekLast().getNo() != this.tm.get(i).getParentNo()) { this.tt.removeLast(); } } } return this.tt; }