Java Token 예제들

프로그래밍 언어: Java

네임스페이스/패키지 이름: at.rovo.parser

클래스/타입: Token

hotexamples.com에서의 예제들: 3

Java Token - 3개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Java의 at.rovo.parser.Token에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

getParentNo(2)

getChildren(2)

getSibNo(2)

getSubtreeText(2)

getComparedMatrix(1)

getComparedNodes(1)

getLevel(1)

getName(1)

getNo(1)

getText(1)

setMatchedMatrix(1)

setMatchedNode(1)

예제 #1

파일 보기

파일: TemplateExtraction.java 프로젝트: RovoMe/ContextExtraction

  /**
   * Generates an extraction template <em>tt</em>, which is the set of non-noisy nodes by
   * depth-first traversal of the maximum matching tree <em>tm</em> which only contains nodes that
   * matched in both ta and tb trees.
   *
   * <p>If two pages have the same sub-trees, these fractions are considered noise. As anchor-tags
   * are hidden noise text, the anchor-text-ratio of a sub-segment has to exceed a given threshold
   * to be excluded.
   *
   * <p>Moreover punctuation metrics are used to further separate noise from content.
   *
   * @param i The index of the i-th element of the maximum matching tree tm
   * @return The generated extraction template built from the two provided sources; the current
   *     template is returned unchanged when no maximum matching tree is available
   */
  private Deque<Token> treeExtraction(int i) {
    // Check for a missing matching tree before the first dereference of tm.
    if (this.tm == null) {
      return this.tt;
    }

    // Wrap the matched node in a Tag or a Word, depending on its runtime type.
    Token matchedNode;
    if (this.tm.get(i).getMatchedNode() instanceof Tag) {
      matchedNode = new Tag(this.tm.get(i).getMatchedNode());
    } else {
      matchedNode = new Word(this.tm.get(i).getMatchedNode());
    }

    // A node is only considered when its subtree text differs from the matched node's text, its
    // anchor text ratio is at most 1.3 times the ratio of the first element of ta, and it is not
    // a punctuation-free node with three or more segments.
    if (!this.tm.get(i).getSubtreeText().equals(matchedNode.getSubtreeText())
        && this.tm.get(i).getAnchorTextRatio() <= 1.3 * this.ta.get(0).getAnchorTextRatio()
        && !(this.tm.get(i).getPunctNum() == 0 && this.tm.get(i).getSegNum() >= 3)) {
      if (this.tm.get(i).getSibNo() == matchedNode.getSibNo()) {
        this.tt.add(this.tm.get(i));
        // NOTE(review): the loop starts at index 1, so the child at index 0 is never visited;
        // confirm whether this is intended or a leftover of 1-based pseudo-code.
        for (int j = 1; j < this.tm.get(i).getChildren().length; j++) {
          this.treeExtraction(this.tm.get(i).getChildren()[j].getNo());
        }
      }
      // expand extraction range of tt to extract content that did not
      // match in the maximum matching tree tm as it occurred in tb
      // but not in ta
      else if (this.tm.get(this.tm.get(i).getParentNo()).getSibNo()
          == this.tb.get(matchedNode.getParentNo()).getSibNo()) {
        // Drop trailing template entries until the parent of node i is the last entry. The
        // emptiness check prevents a NullPointerException on peekLast() when the parent is not
        // contained in the template at all.
        while (!this.tt.isEmpty()
            && this.tt.peekLast().getNo() != this.tm.get(i).getParentNo()) {
          this.tt.removeLast();
        }
      }
    }
    return this.tt;
  }

예제 #2

파일 보기

파일: TemplateExtraction.java 프로젝트: RovoMe/ContextExtraction

 /**
  * Uses the extraction template generated by the {@link #templateGeneration} process and utilizes
  * some own characteristics of the target page pt to extract the content of it which first needs
  * to be parsed to a DOM tree <em>tp</em>.
  *
  * <p>Template entries are removed from the front of <em>tt</em> as they are matched against the
  * visited nodes. Extracted text is written to the log. The method returns without doing
  * anything when the template is {@code null} or already exhausted.
  *
  * @param tt The previously generated extraction template
  * @param tp_i The root DOM element of the document to predict the main content for
  */
 public void newsContentExtraction(Deque<Token> tt, Token tp_i) {
   // The recursion over the children below may consume the whole template before all children
   // were visited; getFirst() would throw NoSuchElementException in that case.
   if (tt == null || tt.isEmpty()) {
     return;
   }

   // templateNode <-- tt.firstElement
   Token templateNode = tt.getFirst();
   boolean sameNode =
       templateNode.getName().equals(tp_i.getName())
           && templateNode.getLevel() == tp_i.getLevel()
           && templateNode.getSibNo() == tp_i.getSibNo();
   if (!sameNode) {
     return;
   }

   tt.removeFirst();
   if (tt.isEmpty()) {
     // The matched node was the last template entry: emit its whole subtree.
     LOG.info(this.deleteEmbeddedNoise(tp_i.getSubtreeText()));
     return;
   }

   Token nextTemplateNode = tt.getFirst();
   Token[] children = tp_i.getChildren();
   if (children == null || children.length == 0) {
     LOG.info(tp_i.getText());
     // The target node is a leaf: skip the chain of template entries descending from the
     // matched template node. peekFirst() yields null once the template is exhausted, which
     // ends the loop instead of throwing.
     while (nextTemplateNode != null
         && nextTemplateNode.getParentNo() == templateNode.getNo()) {
       tt.removeFirst();
       templateNode = nextTemplateNode;
       nextTemplateNode = tt.peekFirst();
     }
   } else {
     // The next template entry is not a child of the matched template node, so the subtree of
     // the target node is emitted as a whole before descending into its children.
     if (nextTemplateNode.getParentNo() != templateNode.getNo()) {
       LOG.info(this.deleteEmbeddedNoise(tp_i.getSubtreeText()));
     }
     for (Token child : children) {
       this.newsContentExtraction(tt, child);
     }
   }
 }

예제 #3

파일 보기

파일: TemplateExtraction.java 프로젝트: RovoMe/ContextExtraction

  /**
   * Matched children backtracking algorithm
   *
   * <p>Besides finding the matched children using ta's <em>matchedMatrix</em> field, the main job
   * is to modify the initial <em>matchedMatrix</em> of each matched child.
   *
   * <p>The matrix is walked recursively from position (i, j) towards its border. For every
   * {@code UP_LEFT} entry the (i-1)-th child of ta[p] gets its matched node and matched matrix
   * assigned and is added to the matched children as well as to tm[p]. An {@code UP} entry
   * continues with the previous row, any other entry with the previous column.
   *
   * @param p The index of the p-th element of the DOM-tree ta
   * @param matchedMatrix The matched tree path matrix of ta for node p
   * @param i The x-position in the matchedMatrix to look for the value
   * @param j The y-position in the matchedMatrix to look for the value
   */
  private void matchedChildrenBacktracking(int p, int[][] matchedMatrix, int i, int j) {
    // slight modification as if the 1st page is larger than the second or a
    // segment of the 1st page contains more children than the 2nd page,
    // one index might become negative while the other one is still positive.
    // As soon as either i or j reaches 0 the border of the matrix was reached;
    // stopping only when both are 0 would let the recursion step to index -1.
    if (i <= 0 || j <= 0) {
      return;
    }
    if (matchedMatrix[i][j] == MatchedMatrixValue.UP_LEFT.getValue()) {
      this.matchedChildrenBacktracking(p, matchedMatrix, i - 1, j - 1);
      // contains elements from 1st page
      Token child = this.ta.get(p).getChildren()[i - 1];
      // contains elements from 2nd page
      Token comparedNode = this.tb.get(child.getComparedNodes().peek().getParentNo());

      // Advance through the compared nodes of the child until the parent of the head entry is
      // the node ta[p] was matched with, or until no compared nodes are left.
      while (this.ta.get(p).getMatchedNode() != comparedNode) {
        int k = comparedNode.getChildren().length;
        int n = this.ta.get(p).getChildren().length;

        // Delete child.comparedNodes.firstElement
        for (int removed = 0; removed < k - 1; removed++) {
          child.getComparedNodes().poll();
        }
        for (int h = 0; h < n - 1; h++) {
          this.ta.get(p).getChildren()[h].setComparedNodes(child.getComparedNodes());
          // Delete ta[p].children[h].comparedMatrix.firstElement
          for (int removed = 0; removed < k - 1; removed++) {
            this.ta.get(p).getChildren()[h].getComparedMatrix().poll();
          }
        }
        if (child.getComparedNodes().size() > 0) {
          comparedNode = this.tb.get(child.getComparedNodes().peek().getParentNo());
        } else {
          break;
        }
      }
      child.setMatchedNode(comparedNode.getChildren()[j - 1]);
      child.setMatchedMatrix(null);
      // NOTE(review): only emptiness is checked here; presumably the compared matrices always
      // hold at least j entries - confirm against the code filling them.
      if (child.getComparedMatrix().size() > 0) {
        child.setMatchedMatrix(child.getComparedMatrix().get(j - 1));
      }
      // Add child to matchedChildren, tm[p].children
      this.matchedChildren.add(child);
      this.tm.get(p).addChild(child);
    } else if (matchedMatrix[i][j] == MatchedMatrixValue.UP.getValue()) {
      this.matchedChildrenBacktracking(p, matchedMatrix, i - 1, j);
    } else {
      this.matchedChildrenBacktracking(p, matchedMatrix, i, j - 1);
    }
  }