/**
 * Returns {@code true} when the given constituent has no child annotations,
 * i.e. its children array is absent or empty.
 */
private static boolean isLeaf(Constituent constituent) {
    // Fetch the children array once instead of calling the getter twice.
    FSArray children = constituent.getChildren();
    return children == null || children.size() == 0;
}
/**
 * Recursively converts a UIMA constituency annotation into a Stanford {@link Tree}.
 *
 * <p>Non-leaf {@link Constituent}s become inner tree nodes labeled with their constituent
 * type. Leaf annotations are expected to be {@link Token}s; for each token a leaf node is
 * created, the surrounding whitespace is recorded as {@code BeforeAnnotation} /
 * {@code AfterAnnotation} on the leaf's {@link CoreLabel}, and a preterminal node carrying
 * the token's POS value is inserted above the leaf (the POS is not stored in the tree
 * itself).
 *
 * @param root the annotation to convert (a {@link Constituent} subtree or a {@link Token})
 * @param tFact factory used to create the Stanford tree nodes and leaves
 * @return the Stanford tree corresponding to {@code root}
 * @throws IllegalStateException if the JCas cannot be obtained from the annotation's CAS
 */
public static Tree createStanfordTree(Annotation root, TreeFactory tFact) {
    JCas aJCas;
    try {
        aJCas = root.getCAS().getJCas();
    } catch (CASException e) {
        // Preserve the original exception as the cause instead of discarding it.
        throw new IllegalStateException("Unable to get JCas from JCas wrapper", e);
    }

    // define the new (root) node
    Tree rootNode;

    // before we can create a node, we must check if we have any children (we have to know
    // whether to create a node or a leaf - not very dynamic)
    if (root instanceof Constituent && !isLeaf((Constituent) root)) {
        Constituent node = (Constituent) root;
        List<Tree> childNodes = new ArrayList<Tree>();

        // get childNodes from child annotations; reuse the fetched array rather than
        // re-invoking the getter on every iteration
        FSArray children = node.getChildren();
        for (int i = 0; i < children.size(); i++) {
            childNodes.add(createStanfordTree(node.getChildren(i), tFact));
        }

        // now create the node with its children
        rootNode = tFact.newTreeNode(node.getConstituentType(), childNodes);
    }
    else {
        // Handle leaf annotations. Leafs are always Token-annotations. We also have to
        // insert a preterminal node with the value of the POS-annotation on the token,
        // because the POS is not directly stored within the tree.
        Token wordAnnotation = (Token) root;

        // create leaf-node for the tree
        Tree wordNode = tFact.newLeaf(wordAnnotation.getCoveredText());

        // create information about preceding and trailing whitespaces in the leaf node
        StringBuilder preWhitespaces = new StringBuilder();
        StringBuilder trailWhitespaces = new StringBuilder();

        List<Token> precedingTokenList = selectPreceding(aJCas, Token.class, wordAnnotation, 1);
        List<Token> followingTokenList = selectFollowing(aJCas, Token.class, wordAnnotation, 1);

        // whitespace count = gap between this token and its neighbor's offsets
        if (precedingTokenList.size() > 0) {
            Token precedingToken = precedingTokenList.get(0);
            int precedingWhitespaces = wordAnnotation.getBegin() - precedingToken.getEnd();
            for (int i = 0; i < precedingWhitespaces; i++) {
                preWhitespaces.append(" ");
            }
        }
        if (followingTokenList.size() > 0) {
            Token followingToken = followingTokenList.get(0);
            int trailingWhitespaces = followingToken.getBegin() - wordAnnotation.getEnd();
            for (int i = 0; i < trailingWhitespaces; i++) {
                trailWhitespaces.append(" ");
            }
        }

        // write whitespace information as CoreAnnotation.BeforeAnnotation and
        // CoreAnnotation.AfterAnnotation to the node label
        ((CoreLabel) wordNode.label())
                .set(CoreAnnotations.BeforeAnnotation.class, preWhitespaces.toString());
        ((CoreLabel) wordNode.label())
                .set(CoreAnnotations.AfterAnnotation.class, trailWhitespaces.toString());

        // get the POS-annotation covering this token; there should be exactly one
        List<POS> coveredPos = JCasUtil.selectCovered(aJCas, POS.class, wordAnnotation);
        assert coveredPos.size() == 1;
        POS pos = coveredPos.get(0);

        // create POS node (preterminal) in the tree and attach the word node to it
        rootNode = tFact.newTreeNode(pos.getPosValue(), Arrays.asList(new Tree[] { wordNode }));
    }

    return rootNode;
}
public Set<Feature> extract(JCas jcas) { double nrOfNPs = 0.0; double nrOfVPs = 0.0; double nrOfPPs = 0.0; int nrOfSbars = 0; int nrOfVerbphrases = 0; int nrOfComplexNominals = 0; double nrOfClauses = 0.0; int nrOfDependentClauses = 0; double nrOfTunits = 0.0; int nrOfComplexTunits = 0; int nrOfCoords = 0; int lengthSumNPs = 0; int lengthSumVPs = 0; int lengthSumPPs = 0; int lengthSumClauses = 0; int lengthSumTunits = 0; int parseTreeDepthSum = 0; Set<Feature> featSet = new HashSet<Feature>(); double nrOfSentences = JCasUtil.select(jcas, Sentence.class).size() * 1.0; for (Sentence s : JCasUtil.select(jcas, Sentence.class)) { parseTreeDepthSum += ParsePatternUtils.getParseDepth(s); for (Constituent c : JCasUtil.selectCovered(Constituent.class, s)) { if (c instanceof NP) { nrOfNPs++; lengthSumNPs += c.getCoveredText().length(); } else if (c instanceof VP) { nrOfVPs++; lengthSumVPs += c.getCoveredText().length(); } else if (c instanceof PP) { nrOfPPs++; lengthSumPPs += c.getCoveredText().length(); } else if (c instanceof SBAR) { nrOfSbars++; if (ParsePatternUtils.isDependentClause(c)) { nrOfDependentClauses++; } } else if (ParsePatternUtils.isClause(c)) { nrOfClauses++; lengthSumClauses += c.getCoveredText().length(); } if (ParsePatternUtils.isTunit(c)) { nrOfTunits++; lengthSumTunits += c.getCoveredText().length(); if (ParsePatternUtils.isComplexTunit(c)) { nrOfComplexTunits++; } } if (ParsePatternUtils.isCoordinate(c)) { nrOfCoords++; } if (ParsePatternUtils.isComplexNominal(c)) { nrOfComplexNominals++; } if (ParsePatternUtils.isVerbPhrase(c)) { nrOfVerbphrases++; } } } // avoid division by zero, there should be at least one sentence in the cas nrOfSentences = Math.max(1, nrOfSentences); featSet.addAll(Arrays.asList(new Feature(NPS_PER_SENTENCE, nrOfNPs / nrOfSentences))); featSet.addAll(Arrays.asList(new Feature(VPS_PER_SENTENCE, nrOfVPs / nrOfSentences))); featSet.addAll(Arrays.asList(new Feature(PPS_PER_SENTENCE, nrOfPPs / nrOfSentences))); 
featSet.addAll(Arrays.asList(new Feature(SBARS_PER_SENTENCE, nrOfSbars / nrOfSentences))); featSet.addAll(Arrays.asList(new Feature(CLAUSES_PER_SENTENCE, nrOfClauses / nrOfSentences))); featSet.addAll( Arrays.asList(new Feature(DEP_CLAUSES_PER_SENTENCE, nrOfDependentClauses / nrOfSentences))); featSet.addAll(Arrays.asList(new Feature(TUNITS_PER_SENTENCE, nrOfTunits / nrOfSentences))); featSet.addAll( Arrays.asList(new Feature(COMPLEX_TUNITS_PER_SENTENCE, nrOfComplexTunits / nrOfSentences))); featSet.addAll(Arrays.asList(new Feature(COORDS_PER_SENTENCE, nrOfCoords / nrOfSentences))); // avoid division by 0, // if we don't have any NPs, the lengthSum is 0, division by 1 will yield 0 as average // length nrOfNPs = Math.max(1, nrOfNPs); nrOfVPs = Math.max(1, nrOfVPs); nrOfPPs = Math.max(1, nrOfPPs); nrOfTunits = Math.max(1, nrOfTunits); featSet.addAll(Arrays.asList(new Feature(AVG_NP_LENGTH, lengthSumNPs / nrOfNPs))); featSet.addAll(Arrays.asList(new Feature(AVG_VP_LENGTH, lengthSumVPs / nrOfVPs))); featSet.addAll(Arrays.asList(new Feature(AVG_PP_LENGTH, lengthSumPPs / nrOfPPs))); featSet.addAll(Arrays.asList(new Feature(AVG_TUNIT_LENGTH, lengthSumTunits / nrOfTunits))); featSet.addAll(Arrays.asList(new Feature(AVG_TREE_DEPTH, parseTreeDepthSum / nrOfSentences))); featSet.addAll(Arrays.asList(new Feature(CLAUSES_PER_TUNIT, nrOfClauses / nrOfTunits))); nrOfClauses = Math.max(1, nrOfClauses); featSet.addAll(Arrays.asList(new Feature(AVG_CLAUSE_LENGTH, lengthSumClauses / nrOfClauses))); featSet.addAll( Arrays.asList(new Feature(COMPLEX_TUNITS_PER_TUNIT, nrOfComplexTunits / nrOfTunits))); featSet.addAll(Arrays.asList(new Feature(COORDS_PER_TUNIT, nrOfCoords / nrOfTunits))); featSet.addAll( Arrays.asList(new Feature(COMPLEXNOMINALS_PER_TUNIT, nrOfComplexNominals / nrOfTunits))); featSet.addAll(Arrays.asList(new Feature(VERBPHRASES_PER_TUNIT, nrOfVerbphrases / nrOfTunits))); featSet.addAll( Arrays.asList(new Feature(DEPCLAUSE_TUNIT_RATIO, nrOfDependentClauses / 
nrOfTunits))); ; featSet.addAll( Arrays.asList(new Feature(DEPCLAUSE_CLAUSE_RATIO, nrOfDependentClauses / nrOfClauses))); featSet.addAll(Arrays.asList(new Feature(COORDS_PER_CLAUSE, nrOfCoords / nrOfClauses))); ; featSet.addAll( Arrays.asList(new Feature(COMPLEXNOMINALS_PER_CLAUSE, nrOfComplexNominals / nrOfClauses))); ; return featSet; }