コード例 #1
0
ファイル: TreeUtils.java プロジェクト: renaud/dkpro-core
 /**
  * Tells whether a constituent has no child annotations.
  *
  * @param constituent the constituent to inspect
  * @return {@code true} if the children array is {@code null} or has size zero,
  *     {@code false} otherwise
  */
 private static boolean isLeaf(Constituent constituent) {
   // a missing children array counts as "no children"
   if (constituent.getChildren() == null) {
     return true;
   }
   return constituent.getChildren().size() == 0;
 }
コード例 #2
0
ファイル: TreeUtils.java プロジェクト: renaud/dkpro-core
  /**
   * Recursively converts a UIMA annotation into a Stanford {@link Tree}.
   *
   * <p>A {@link Constituent} that has at least one child becomes an inner node labelled with its
   * constituent type, and each child is converted recursively. Any other annotation is cast to
   * {@link Token} and becomes a leaf holding the token's covered text, wrapped in a preterminal
   * node labelled with the value of the POS annotation covered by the token.
   *
   * <p>The leaf label is additionally given {@code BeforeAnnotation} / {@code AfterAnnotation}
   * values: one space character per offset position between the token and its directly
   * preceding / following token (empty if there is no such neighbour).
   *
   * <p>NOTE(review): the leaf label is cast to {@code CoreLabel}, so {@code tFact} presumably
   * has to produce {@code CoreLabel}-labelled leaves - confirm against callers.
   *
   * @param root the annotation to convert; either a {@link Constituent} with children or a
   *     {@link Token}
   * @param tFact factory used to create the tree nodes and leaves
   * @return the tree node created for {@code root}
   * @throws IllegalStateException if the JCas cannot be obtained from the annotation's CAS, or
   *     if no POS annotation is covered by a leaf token
   */
  public static Tree createStanfordTree(Annotation root, TreeFactory tFact) {
    JCas aJCas;
    try {
      aJCas = root.getCAS().getJCas();
    } catch (CASException e) {
      // keep the original exception as cause so the failure stays diagnosable
      throw new IllegalStateException("Unable to get JCas from JCas wrapper", e);
    }

    // define the new (root) node
    Tree rootNode;

    // before we can create a node, we must check if we have any children (we have to know
    // whether to create a node or a leaf - not very dynamic)
    if (root instanceof Constituent && !isLeaf((Constituent) root)) {
      Constituent node = (Constituent) root;
      List<Tree> childNodes = new ArrayList<Tree>();

      // get childNodes from child annotations
      FSArray children = node.getChildren();
      for (int i = 0; i < children.size(); i++) {
        childNodes.add(createStanfordTree(node.getChildren(i), tFact));
      }

      // now create the node with its children
      rootNode = tFact.newTreeNode(node.getConstituentType(), childNodes);

    } else {
      // Handle leaf annotations.
      // Leaves are always Token annotations.
      // We also have to insert a preterminal node with the value of the POS annotation on the
      // token because the POS is not directly stored within the tree.
      Token wordAnnotation = (Token) root;

      // create leaf-node for the tree
      Tree wordNode = tFact.newLeaf(wordAnnotation.getCoveredText());

      // create information about preceding and trailing whitespaces in the leaf node
      StringBuilder preWhitespaces = new StringBuilder();
      StringBuilder trailWhitespaces = new StringBuilder();

      List<Token> precedingTokenList = selectPreceding(aJCas, Token.class, wordAnnotation, 1);
      List<Token> followingTokenList = selectFollowing(aJCas, Token.class, wordAnnotation, 1);

      if (!precedingTokenList.isEmpty()) {
        Token precedingToken = precedingTokenList.get(0);
        // offset gap between the previous token's end and this token's begin
        int precedingWhitespaces = wordAnnotation.getBegin() - precedingToken.getEnd();
        for (int i = 0; i < precedingWhitespaces; i++) {
          preWhitespaces.append(' ');
        }
      }
      if (!followingTokenList.isEmpty()) {
        Token followingToken = followingTokenList.get(0);
        // offset gap between this token's end and the next token's begin
        int trailingWhitespaces = followingToken.getBegin() - wordAnnotation.getEnd();
        for (int i = 0; i < trailingWhitespaces; i++) {
          trailWhitespaces.append(' ');
        }
      }

      // write whitespace information as CoreAnnotations.BeforeAnnotation and
      // CoreAnnotations.AfterAnnotation to the label of the leaf node
      CoreLabel wordLabel = (CoreLabel) wordNode.label();
      wordLabel.set(CoreAnnotations.BeforeAnnotation.class, preWhitespaces.toString());
      wordLabel.set(CoreAnnotations.AfterAnnotation.class, trailWhitespaces.toString());

      // get the POS annotation that is covered by the token
      List<POS> coveredPos = JCasUtil.selectCovered(aJCas, POS.class, wordAnnotation);
      // the token should cover exactly one POS
      assert coveredPos.size() == 1;
      if (coveredPos.isEmpty()) {
        // fail with a descriptive message instead of a bare IndexOutOfBoundsException when
        // assertions are disabled
        throw new IllegalStateException("No POS annotation found for token ["
            + wordAnnotation.getCoveredText() + "] at offsets " + wordAnnotation.getBegin()
            + "-" + wordAnnotation.getEnd());
      }
      POS pos = coveredPos.get(0);

      // create POS-Node in the tree and attach word-node to it
      rootNode = tFact.newTreeNode(pos.getPosValue(), Arrays.asList(wordNode));
    }

    return rootNode;
  }
コード例 #3
0
  /**
   * Computes syntactic-complexity features over all sentences of the given JCas.
   *
   * <p>For every {@link Sentence} the parse depth reported by {@code ParsePatternUtils} is
   * summed, and every {@link Constituent} covered by the sentence is counted per category (NP,
   * VP, PP, SBAR, clause, T-unit, coordinate, complex nominal, verb phrase - the latter ones as
   * decided by {@code ParsePatternUtils}). For NPs, VPs, PPs, clauses and T-units the character
   * length of the covered text is summed as well.
   *
   * <p>From these totals the method derives per-sentence ratios, average lengths, the average
   * tree depth, and per-T-unit / per-clause ratios. Every denominator is clamped to at least 1,
   * so an empty category yields 0 instead of a division by zero.
   *
   * @param jcas the JCas to analyse
   * @return a new set containing one {@link Feature} per computed measure
   */
  public Set<Feature> extract(JCas jcas) {

    // category counts (doubles so that the ratios below use floating-point division)
    double nrOfSentences = 0.0;
    double nrOfNPs = 0.0;
    double nrOfVPs = 0.0;
    double nrOfPPs = 0.0;
    double nrOfSbars = 0.0;
    double nrOfVerbphrases = 0.0;
    double nrOfComplexNominals = 0.0;
    double nrOfClauses = 0.0;
    double nrOfDependentClauses = 0.0;
    double nrOfTunits = 0.0;
    double nrOfComplexTunits = 0.0;
    double nrOfCoords = 0.0;

    // summed character lengths / depths
    int lengthSumNPs = 0;
    int lengthSumVPs = 0;
    int lengthSumPPs = 0;
    int lengthSumClauses = 0;
    int lengthSumTunits = 0;
    int parseTreeDepthSum = 0;

    for (Sentence s : JCasUtil.select(jcas, Sentence.class)) {
      nrOfSentences++;
      parseTreeDepthSum += ParsePatternUtils.getParseDepth(s);
      for (Constituent c : JCasUtil.selectCovered(Constituent.class, s)) {
        // mutually exclusive phrase categories: the first matching branch wins
        if (c instanceof NP) {
          nrOfNPs++;
          lengthSumNPs += c.getCoveredText().length();
        } else if (c instanceof VP) {
          nrOfVPs++;
          lengthSumVPs += c.getCoveredText().length();
        } else if (c instanceof PP) {
          nrOfPPs++;
          lengthSumPPs += c.getCoveredText().length();
        } else if (c instanceof SBAR) {
          nrOfSbars++;
          if (ParsePatternUtils.isDependentClause(c)) {
            nrOfDependentClauses++;
          }
        } else if (ParsePatternUtils.isClause(c)) {
          nrOfClauses++;
          lengthSumClauses += c.getCoveredText().length();
        }

        // the following categories are checked independently of the chain above
        if (ParsePatternUtils.isTunit(c)) {
          nrOfTunits++;
          lengthSumTunits += c.getCoveredText().length();
          if (ParsePatternUtils.isComplexTunit(c)) {
            nrOfComplexTunits++;
          }
        }
        if (ParsePatternUtils.isCoordinate(c)) {
          nrOfCoords++;
        }
        if (ParsePatternUtils.isComplexNominal(c)) {
          nrOfComplexNominals++;
        }
        if (ParsePatternUtils.isVerbPhrase(c)) {
          nrOfVerbphrases++;
        }
      }
    }

    // Denominators, clamped to >= 1 to avoid division by zero. If a category is empty its
    // length sum / related counts are used with a divisor of 1 instead.
    double perSentence = Math.max(1, nrOfSentences);
    double perNP = Math.max(1, nrOfNPs);
    double perVP = Math.max(1, nrOfVPs);
    double perPP = Math.max(1, nrOfPPs);
    double perTunit = Math.max(1, nrOfTunits);
    double perClause = Math.max(1, nrOfClauses);

    Set<Feature> featSet = new HashSet<Feature>();

    // per-sentence ratios (numerators are the raw, unclamped counts)
    featSet.add(new Feature(NPS_PER_SENTENCE, nrOfNPs / perSentence));
    featSet.add(new Feature(VPS_PER_SENTENCE, nrOfVPs / perSentence));
    featSet.add(new Feature(PPS_PER_SENTENCE, nrOfPPs / perSentence));
    featSet.add(new Feature(SBARS_PER_SENTENCE, nrOfSbars / perSentence));
    featSet.add(new Feature(CLAUSES_PER_SENTENCE, nrOfClauses / perSentence));
    featSet.add(new Feature(DEP_CLAUSES_PER_SENTENCE, nrOfDependentClauses / perSentence));
    featSet.add(new Feature(TUNITS_PER_SENTENCE, nrOfTunits / perSentence));
    featSet.add(new Feature(COMPLEX_TUNITS_PER_SENTENCE, nrOfComplexTunits / perSentence));
    featSet.add(new Feature(COORDS_PER_SENTENCE, nrOfCoords / perSentence));

    // average lengths (in characters of covered text) and average tree depth
    featSet.add(new Feature(AVG_NP_LENGTH, lengthSumNPs / perNP));
    featSet.add(new Feature(AVG_VP_LENGTH, lengthSumVPs / perVP));
    featSet.add(new Feature(AVG_PP_LENGTH, lengthSumPPs / perPP));
    featSet.add(new Feature(AVG_TUNIT_LENGTH, lengthSumTunits / perTunit));
    featSet.add(new Feature(AVG_TREE_DEPTH, parseTreeDepthSum / perSentence));
    featSet.add(new Feature(AVG_CLAUSE_LENGTH, lengthSumClauses / perClause));

    // per-T-unit ratios
    featSet.add(new Feature(CLAUSES_PER_TUNIT, nrOfClauses / perTunit));
    featSet.add(new Feature(COMPLEX_TUNITS_PER_TUNIT, nrOfComplexTunits / perTunit));
    featSet.add(new Feature(COORDS_PER_TUNIT, nrOfCoords / perTunit));
    featSet.add(new Feature(COMPLEXNOMINALS_PER_TUNIT, nrOfComplexNominals / perTunit));
    featSet.add(new Feature(VERBPHRASES_PER_TUNIT, nrOfVerbphrases / perTunit));
    featSet.add(new Feature(DEPCLAUSE_TUNIT_RATIO, nrOfDependentClauses / perTunit));

    // per-clause ratios
    featSet.add(new Feature(DEPCLAUSE_CLAUSE_RATIO, nrOfDependentClauses / perClause));
    featSet.add(new Feature(COORDS_PER_CLAUSE, nrOfCoords / perClause));
    featSet.add(new Feature(COMPLEXNOMINALS_PER_CLAUSE, nrOfComplexNominals / perClause));

    return featSet;
  }