コード例 #1
0
 /**
  * Uses the extraction template generated by the {@link #templateGeneration} process, together
  * with characteristics of the target page <em>pt</em> itself, to extract that page's content.
  * The page must first be parsed into a DOM tree <em>tp</em>.
  *
  * @param tt The previously generated extraction template
  * @param tp_i The root DOM element of the document to predict the main content for
  */
 public void newsContentExtraction(Deque<Token> tt, Token tp_i) {
   // Walks the target DOM tree depth-first while consuming the template deque tt from the front.
   // A target node is only processed if it agrees with the current head of the template in name,
   // level and sibling number; the extracted text is written to LOG.
   //
   // The deque is shared by all recursive calls. An earlier sibling may therefore already have
   // consumed the last template node; without this guard getFirst() would throw
   // NoSuchElementException for the remaining siblings.
   if (tt.isEmpty()) {
     return;
   }

   // templateNode <-- tt.firstElement
   Token templateNode = tt.getFirst();
   boolean samePosition =
       templateNode.getName().equals(tp_i.getName())
           && templateNode.getLevel() == tp_i.getLevel()
           && templateNode.getSibNo() == tp_i.getSibNo();
   if (!samePosition) {
     // head of the template does not describe this node: nothing is emitted and the children of
     // tp_i are not visited
     return;
   }
   tt.removeFirst();

   if (tt.isEmpty()) {
     // template used up by this node: emit the text of the whole subtree, passed through
     // deleteEmbeddedNoise
     LOG.info(this.deleteEmbeddedNoise(tp_i.getSubtreeText()));
     return;
   }

   Token nextTemplateNode = tt.getFirst();
   Token[] children = tp_i.getChildren();

   if (children == null || children.length == 0) {
     // leaf in the target page: emit its own text
     LOG.info(tp_i.getText());
     // Drop the template nodes chained below the matched one (each removed node is the child of
     // the node removed before it). peekFirst() yields null once the deque is drained, which
     // ends the loop instead of throwing like getFirst() would.
     while (nextTemplateNode != null
         && nextTemplateNode.getParentNo() == templateNode.getNo()) {
       tt.removeFirst();
       templateNode = nextTemplateNode;
       nextTemplateNode = tt.peekFirst();
     }
     return;
   }

   if (nextTemplateNode.getParentNo() != templateNode.getNo()) {
     // The template has no child recorded for this node although the target node has children:
     // emit the subtree text. Logged through LOG like the other extraction results instead of
     // being printed to System.out.
     LOG.info(this.deleteEmbeddedNoise(tp_i.getSubtreeText()));
   }
   for (Token child : children) {
     this.newsContentExtraction(tt, child);
   }
 }
コード例 #2
0
  /**
   * Matched children backtracking algorithm
   *
   * <p>Besides finding the matched children using ta's <em>matchedMatrix</em> field, the main job
   * is to modify the initial <em>matchedMatrix</em> of each matched child.
   *
   * @param p The index of the p-th element of the DOM-tree ta
   * @param matchedMatrix The matched tree path matrix of ta for node p
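   * @param i The x-position (first index) in the matchedMatrix to look for the value;
   *     <code>i - 1</code> is the index of the corresponding child of ta[p]
   * @param j The y-position (second index) in the matchedMatrix to look for the value;
   *     <code>j - 1</code> is the index of the corresponding child of the compared node in tb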
   */
  private void matchedChildrenBacktracking(int p, int[][] matchedMatrix, int i, int j) {
    // Walks the matrix backwards from cell (i, j). The value of each cell decides the next step:
    // UP_LEFT handles the child pair (i - 1, j - 1) and moves diagonally, UP moves to (i - 1, j),
    // any other value moves to (i, j - 1).
    //
    // Index 0 in either dimension is the border of the matrix: children are addressed with i - 1
    // and j - 1 below, so no child pair exists there. Stop as soon as either index reaches the
    // border. Checking only for the corner (i == 0 && j == 0) let the walk continue with a
    // negative index when the 1st page, or a segment of it, had more children than the 2nd.
    if (i <= 0 || j <= 0) {
      return;
    }

    int direction = matchedMatrix[i][j];
    if (direction == MatchedMatrixValue.UP_LEFT.getValue()) {
      // handle the preceding cells first, so children with lower indices are registered first
      this.matchedChildrenBacktracking(p, matchedMatrix, i - 1, j - 1);

      // contains elements from 1st page
      Token parent = this.ta.get(p);
      Token child = parent.getChildren()[i - 1];
      // contains elements from 2nd page
      // NOTE(review): assumes child has at least one compared node at this point; the loop below
      // checks the size before peeking, this first access does not -- confirm against callers.
      Token comparedNode = this.tb.get(child.getComparedNodes().peek().getParentNo());

      // advance through the compared nodes until their parent is the node matched to ta[p]
      while (parent.getMatchedNode() != comparedNode) {
        int k = comparedNode.getChildren().length;
        int n = parent.getChildren().length;

        // Delete child.comparedNodes.firstElement, k - 1 times
        for (int c = 0; c < k - 1; c++) {
          child.getComparedNodes().poll();
        }
        // applies to every child of ta[p] except the last one (h < n - 1)
        for (int h = 0; h < n - 1; h++) {
          Token sibling = parent.getChildren()[h];
          // NOTE(review): the sibling receives the very same collection instance as child, not
          // a copy -- confirm that sharing is intended.
          sibling.setComparedNodes(child.getComparedNodes());
          // Delete ta[p].children[h].comparedMatrix.firstElement, k - 1 times
          for (int c = 0; c < k - 1; c++) {
            sibling.getComparedMatrix().poll();
          }
        }

        if (child.getComparedNodes().size() == 0) {
          // no compared node left to try; keep the last comparedNode
          break;
        }
        comparedNode = this.tb.get(child.getComparedNodes().peek().getParentNo());
      }

      child.setMatchedNode(comparedNode.getChildren()[j - 1]);
      // the matched matrix stays null when no compared matrix is left for this child
      child.setMatchedMatrix(null);
      if (child.getComparedMatrix().size() > 0) {
        child.setMatchedMatrix(child.getComparedMatrix().get(j - 1));
      }

      // Add child to matchedChildren, tm[p].children
      // (the child instance itself is registered in both places, not a copy of it)
      this.matchedChildren.add(child);
      this.tm.get(p).addChild(child);
    } else if (direction == MatchedMatrixValue.UP.getValue()) {
      this.matchedChildrenBacktracking(p, matchedMatrix, i - 1, j);
    } else {
      this.matchedChildrenBacktracking(p, matchedMatrix, i, j - 1);
    }
  }