/** * Uses the extraction template generated by the {@link #templateGeneration} process and utilizes * some own characteristics of the target page pt to extract the content of it which first needs * to be parsed to a DOM tree <em>tp</em>. * * @param tt The previouslz generated extraction template * @param tp_i The root DOM element of the document to predict the main content for */ public void newsContentExtraction(Deque<Token> tt, Token tp_i) { // templateNode <-- tt.firstElement Token templateNode = tt.getFirst(); if (templateNode.getName().equals(tp_i.getName()) && templateNode.getLevel() == tp_i.getLevel() && templateNode.getSibNo() == tp_i.getSibNo()) { tt.removeFirst(); if (tt.size() > 0) { Token nextTemplateNode = tt.getFirst(); if (tp_i.getChildren() == null || tp_i.getChildren().length == 0) { LOG.info(tp_i.getText()); while (nextTemplateNode.getParentNo() == templateNode.getNo()) { tt.removeFirst(); templateNode = nextTemplateNode; nextTemplateNode = tt.getFirst(); } } else { if (nextTemplateNode.getParentNo() != templateNode.getNo()) { System.out.println(this.deleteEmbeddedNoise(tp_i.getSubtreeText())); } for (int j = 0; j < tp_i.getChildren().length; j++) { this.newsContentExtraction(tt, tp_i.getChildren()[j]); } } } else { LOG.info(this.deleteEmbeddedNoise(tp_i.getSubtreeText())); } } }
/** * Matched children backtracking algorithm * * <p>Besides finding the matched children using ta's <em>matchedMatrix</em> field, the main job * is to modify the initial <em>matchedMatrix</em> of each matched child. * * @param p The index of the p-th element of the DOM-tree ta * @param matchedMatrix The matched tree path matrix of ta for node p * @param i The x-position in the matchedMatrix to look for the value * @param j The y-position in the matchedMatrix to look for the value */ private void matchedChildrenBacktracking(int p, int[][] matchedMatrix, int i, int j) { // slight modification as if the 1st page is larger than the second or a // segment of the 1st page contains more children than the 2nd page, // j might become negative while i is 0. // To my understanding if either of i or j gets 0 the border of the // matrix was reached. if (i == 0 && j == 0) { return; } if (matchedMatrix[i][j] == MatchedMatrixValue.UP_LEFT.getValue()) { this.matchedChildrenBacktracking(p, matchedMatrix, i - 1, j - 1); // contains elements from 1st page Token child = this.ta.get(p).getChildren()[i - 1]; // contains elements from 2nd page Token comparedNode = this.tb.get(child.getComparedNodes().peek().getParentNo()); while (this.ta.get(p).getMatchedNode() != comparedNode) { int k = comparedNode.getChildren().length; int n = this.ta.get(p).getChildren().length; for (int _i = 0; _i < k - 1; _i++) // Delete child.comparedNodes.firstElement { child.getComparedNodes().poll(); } for (int h = 0; h < n - 1; h++) { this.ta.get(p).getChildren()[h].setComparedNodes(child.getComparedNodes()); for (int _i = 0; _i < k - 1; _i++) // Delete ta[p].children[h].comparedMatrix.firstElement { this.ta.get(p).getChildren()[h].getComparedMatrix().poll(); } } if (child.getComparedNodes().size() > 0) { comparedNode = this.tb.get(child.getComparedNodes().peek().getParentNo()); } else { break; } } // child.setMatchedNode(child.getComparedNodes().get(j-1)); child.setMatchedNode(comparedNode.getChildren()[j - 1]); child.setMatchedMatrix(null); if (child.getComparedMatrix().size() > 0) { child.setMatchedMatrix(child.getComparedMatrix().get(j - 1)); } // Add child to matchedChildren, tm[p].children // matchedChildren.add(new HTMLNode(child)); this.matchedChildren.add(child); // this.tm[p].addChild(new HTMLNode(child)); this.tm.get(p).addChild(child); } else if (matchedMatrix[i][j] == MatchedMatrixValue.UP.getValue()) { this.matchedChildrenBacktracking(p, matchedMatrix, i - 1, j); } else { this.matchedChildrenBacktracking(p, matchedMatrix, i, j - 1); } }