/** * Generates an extraction template <em>tt</em>, which is the set of non-noisy nodes by * depth-first traversal of the maximum matching tree <em>tm</em> which only contains nodes that * matched in both ta and tb trees. * * <p>If two pages have the same sub-trees, these fractions are considered noise. As anchor-tags * are hidden noise text, the anchor-text-ratio of a sub-segment has to exceed a given threshold * to be excluded. * * <p>Moreover punctuation metrics are used to further separate noise from content. * * @param i The index of the i-th element of the maximum matching tree tm * @return The generated extraction template build from the two provided sources */ private Deque<Token> treeExtraction(int i) { Token matchedNode; if (this.tm.get(i).getMatchedNode() instanceof Tag) { matchedNode = new Tag(this.tm.get(i).getMatchedNode()); } else { matchedNode = new Word(this.tm.get(i).getMatchedNode()); } if (this.tm != null && !this.tm.get(i).getSubtreeText().equals(matchedNode.getSubtreeText()) && this.tm.get(i).getAnchorTextRatio() <= 1.3 * this.ta.get(0).getAnchorTextRatio() && !(this.tm.get(i).getPunctNum() == 0 && this.tm.get(i).getSegNum() >= 3)) { if (this.tm.get(i).getSibNo() == matchedNode.getSibNo()) { this.tt.add(this.tm.get(i)); for (int j = 1; j < this.tm.get(i).getChildren().length; j++) { this.treeExtraction(this.tm.get(i).getChildren()[j].getNo()); } } // expand extraction range of tt to extract content that did not // matched in the maximum matching tree tm as they occurred in tb // but not in ta else if (this.tm.get(this.tm.get(i).getParentNo()).getSibNo() == this.tb.get(matchedNode.getParentNo()).getSibNo()) { while (this.tt.peekLast().getNo() != this.tm.get(i).getParentNo()) { this.tt.removeLast(); } } } return this.tt; }
/** * Uses the extraction template generated by the {@link #templateGeneration} process and utilizes * some own characteristics of the target page pt to extract the content of it which first needs * to be parsed to a DOM tree <em>tp</em>. * * @param tt The previouslz generated extraction template * @param tp_i The root DOM element of the document to predict the main content for */ public void newsContentExtraction(Deque<Token> tt, Token tp_i) { // templateNode <-- tt.firstElement Token templateNode = tt.getFirst(); if (templateNode.getName().equals(tp_i.getName()) && templateNode.getLevel() == tp_i.getLevel() && templateNode.getSibNo() == tp_i.getSibNo()) { tt.removeFirst(); if (tt.size() > 0) { Token nextTemplateNode = tt.getFirst(); if (tp_i.getChildren() == null || tp_i.getChildren().length == 0) { LOG.info(tp_i.getText()); while (nextTemplateNode.getParentNo() == templateNode.getNo()) { tt.removeFirst(); templateNode = nextTemplateNode; nextTemplateNode = tt.getFirst(); } } else { if (nextTemplateNode.getParentNo() != templateNode.getNo()) { System.out.println(this.deleteEmbeddedNoise(tp_i.getSubtreeText())); } for (int j = 0; j < tp_i.getChildren().length; j++) { this.newsContentExtraction(tt, tp_i.getChildren()[j]); } } } else { LOG.info(this.deleteEmbeddedNoise(tp_i.getSubtreeText())); } } }
/** * Matched children backtracking algorithm * * <p>Besides finding the matched children using ta's <em>matchedMatrix</em> field, the main job * is to modify the initial <em>matchedMatrix</em> of each matched child. * * @param p The index of the p-th element of the DOM-tree ta * @param matchedMatrix The matched tree path matrix of ta for node p * @param i The x-position in the matchedMatrix to look for the value * @param j The y-position in the matchedMatrix to look for the value */ private void matchedChildrenBacktracking(int p, int[][] matchedMatrix, int i, int j) { // slight modification as if the 1st page is larger than the second or a // segment of the 1st page contains more children than the 2nd page, // j might become negative while i is 0. // To my understanding if either of i or j gets 0 the border of the // matrix was reached. if (i == 0 && j == 0) { return; } if (matchedMatrix[i][j] == MatchedMatrixValue.UP_LEFT.getValue()) { this.matchedChildrenBacktracking(p, matchedMatrix, i - 1, j - 1); // contains elements from 1st page Token child = this.ta.get(p).getChildren()[i - 1]; // contains elements from 2nd page Token comparedNode = this.tb.get(child.getComparedNodes().peek().getParentNo()); while (this.ta.get(p).getMatchedNode() != comparedNode) { int k = comparedNode.getChildren().length; int n = this.ta.get(p).getChildren().length; for (int _i = 0; _i < k - 1; _i++) // Delete child.comparedNodes.firstElement { child.getComparedNodes().poll(); } for (int h = 0; h < n - 1; h++) { this.ta.get(p).getChildren()[h].setComparedNodes(child.getComparedNodes()); for (int _i = 0; _i < k - 1; _i++) // Delete ta[p].children[h].comparedMatrix.firstElement { this.ta.get(p).getChildren()[h].getComparedMatrix().poll(); } } if (child.getComparedNodes().size() > 0) { comparedNode = this.tb.get(child.getComparedNodes().peek().getParentNo()); } else { break; } } // child.setMatchedNode(child.getComparedNodes().get(j-1)); child.setMatchedNode(comparedNode.getChildren()[j - 1]); child.setMatchedMatrix(null); if (child.getComparedMatrix().size() > 0) { child.setMatchedMatrix(child.getComparedMatrix().get(j - 1)); } // Add child to matchedChildren, tm[p].children // matchedChildren.add(new HTMLNode(child)); this.matchedChildren.add(child); // this.tm[p].addChild(new HTMLNode(child)); this.tm.get(p).addChild(child); } else if (matchedMatrix[i][j] == MatchedMatrixValue.UP.getValue()) { this.matchedChildrenBacktracking(p, matchedMatrix, i - 1, j); } else { this.matchedChildrenBacktracking(p, matchedMatrix, i, j - 1); } }