コード例 #1
0
 /**
  * Extracts the content of a target page by walking its DOM tree <em>tp</em> in lock-step with
  * the extraction template generated by the {@link #templateGeneration} process.
  *
  * <p>The head of {@code tt} is compared with {@code tp_i} by name, level and sibling number.
  * If they differ, or if the template is already empty, nothing happens. On a match the head is
  * removed from {@code tt} and exactly one of the following applies:
  *
  * <ul>
  *   <li>the template is now exhausted: the noise-filtered subtree text of {@code tp_i} is
  *       logged;
  *   <li>{@code tp_i} has no children: its text is logged and the chain of template nodes
  *       hanging below the consumed node is removed from {@code tt};
  *   <li>otherwise: if the next template node is not a child of the consumed one, the
  *       noise-filtered subtree text of {@code tp_i} is logged; afterwards every child of
  *       {@code tp_i} is processed recursively against the same, shared {@code tt}.
  * </ul>
  *
  * @param tt The previously generated extraction template; it is consumed in place
  * @param tp_i The root DOM element of the document to predict the main content for
  */
 public void newsContentExtraction(Deque<Token> tt, Token tp_i) {
   // The deque is shared across the recursion, so a call made for an earlier sibling may
   // already have consumed the whole template; getFirst() would throw in that case.
   if (tt.isEmpty()) {
     return;
   }

   // templateNode <-- tt.firstElement
   Token templateNode = tt.getFirst();
   boolean samePosition =
       templateNode.getName().equals(tp_i.getName())
           && templateNode.getLevel() == tp_i.getLevel()
           && templateNode.getSibNo() == tp_i.getSibNo();
   if (!samePosition) {
     return;
   }

   tt.removeFirst();
   if (tt.isEmpty()) {
     // Last template node: everything below tp_i is emitted.
     LOG.info(this.deleteEmbeddedNoise(tp_i.getSubtreeText()));
     return;
   }

   Token nextTemplateNode = tt.getFirst();
   Token[] children = tp_i.getChildren();
   if (children == null || children.length == 0) {
     LOG.info(tp_i.getText());
     // The page node is a leaf, so template nodes chained below the consumed node cannot be
     // matched any more. peekFirst() yields null once the template runs out, which ends the
     // loop instead of throwing NoSuchElementException.
     while (nextTemplateNode != null
         && nextTemplateNode.getParentNo() == templateNode.getNo()) {
       tt.removeFirst();
       templateNode = nextTemplateNode;
       nextTemplateNode = tt.peekFirst();
     }
     return;
   }

   if (nextTemplateNode.getParentNo() != templateNode.getNo()) {
     // Logged through LOG like the other two outputs of this method (was System.out).
     LOG.info(this.deleteEmbeddedNoise(tp_i.getSubtreeText()));
   }
   for (Token child : children) {
     this.newsContentExtraction(tt, child);
   }
 }
コード例 #2
0
  /**
   * Generates an extraction template <em>tt</em>, which is the set of non-noisy nodes by
   * depth-first traversal of the maximum matching tree <em>tm</em> which only contains nodes that
   * matched in both ta and tb trees.
   *
   * <p>A node of <em>tm</em> is treated as noise, and skipped, when any of the following holds:
   *
   * <ul>
   *   <li>its subtree text equals the subtree text of its matched node;
   *   <li>its anchor-text-ratio is not at most 1.3 times the anchor-text-ratio of the first
   *       element of <em>ta</em>;
   *   <li>it has no punctuation but at least three segments.
   * </ul>
   *
   * <p>A non-noisy node whose sibling number equals that of its matched node is appended to
   * <em>tt</em> and its children are visited recursively. Otherwise, if the parents' sibling
   * numbers agree, trailing entries of <em>tt</em> are removed until the parent of the node is
   * the last entry (or <em>tt</em> is empty).
   *
   * @param i The index of the i-th element of the maximum matching tree tm
   * @return The extraction template held by this instance, as built so far from the two provided
   *     sources; it is returned unchanged when no maximum matching tree is available
   */
  private Deque<Token> treeExtraction(int i) {
    // The guard has to precede the first dereference of tm to have any effect.
    if (this.tm == null) {
      return this.tt;
    }

    final Token node = this.tm.get(i);
    final Token matchedNode;
    if (node.getMatchedNode() instanceof Tag) {
      matchedNode = new Tag(node.getMatchedNode());
    } else {
      matchedNode = new Word(node.getMatchedNode());
    }

    // Evaluated left to right with short-circuiting, exactly like the noise criteria above.
    boolean isContent =
        !node.getSubtreeText().equals(matchedNode.getSubtreeText())
            && node.getAnchorTextRatio() <= 1.3 * this.ta.get(0).getAnchorTextRatio()
            && !(node.getPunctNum() == 0 && node.getSegNum() >= 3);
    if (!isContent) {
      return this.tt;
    }

    if (node.getSibNo() == matchedNode.getSibNo()) {
      this.tt.add(node);
      Token[] children = node.getChildren();
      // Children may be absent (newsContentExtraction checks for null as well).
      if (children != null) {
        // NOTE(review): the loop starts at index 1, so children[0] is never visited, whereas
        // newsContentExtraction iterates from 0. This looks like an off-by-one carried over
        // from 1-based pseudo-code; behaviour is kept as-is until confirmed.
        for (int j = 1; j < children.length; j++) {
          this.treeExtraction(children[j].getNo());
        }
      }
    }
    // expand extraction range of tt to extract content that did not
    // match in the maximum matching tree tm as it occurred in tb
    // but not in ta
    else if (this.tm.get(node.getParentNo()).getSibNo()
        == this.tb.get(matchedNode.getParentNo()).getSibNo()) {
      // peekLast() returns null on an empty deque; stop there instead of failing with an NPE
      // when the parent is not part of tt.
      while (!this.tt.isEmpty() && this.tt.peekLast().getNo() != node.getParentNo()) {
        this.tt.removeLast();
      }
    }
    return this.tt;
  }