public Set<Feature> extract(JCas jcas) { double nrOfNPs = 0.0; double nrOfVPs = 0.0; double nrOfPPs = 0.0; int nrOfSbars = 0; int nrOfVerbphrases = 0; int nrOfComplexNominals = 0; double nrOfClauses = 0.0; int nrOfDependentClauses = 0; double nrOfTunits = 0.0; int nrOfComplexTunits = 0; int nrOfCoords = 0; int lengthSumNPs = 0; int lengthSumVPs = 0; int lengthSumPPs = 0; int lengthSumClauses = 0; int lengthSumTunits = 0; int parseTreeDepthSum = 0; Set<Feature> featSet = new HashSet<Feature>(); double nrOfSentences = JCasUtil.select(jcas, Sentence.class).size() * 1.0; for (Sentence s : JCasUtil.select(jcas, Sentence.class)) { parseTreeDepthSum += ParsePatternUtils.getParseDepth(s); for (Constituent c : JCasUtil.selectCovered(Constituent.class, s)) { if (c instanceof NP) { nrOfNPs++; lengthSumNPs += c.getCoveredText().length(); } else if (c instanceof VP) { nrOfVPs++; lengthSumVPs += c.getCoveredText().length(); } else if (c instanceof PP) { nrOfPPs++; lengthSumPPs += c.getCoveredText().length(); } else if (c instanceof SBAR) { nrOfSbars++; if (ParsePatternUtils.isDependentClause(c)) { nrOfDependentClauses++; } } else if (ParsePatternUtils.isClause(c)) { nrOfClauses++; lengthSumClauses += c.getCoveredText().length(); } if (ParsePatternUtils.isTunit(c)) { nrOfTunits++; lengthSumTunits += c.getCoveredText().length(); if (ParsePatternUtils.isComplexTunit(c)) { nrOfComplexTunits++; } } if (ParsePatternUtils.isCoordinate(c)) { nrOfCoords++; } if (ParsePatternUtils.isComplexNominal(c)) { nrOfComplexNominals++; } if (ParsePatternUtils.isVerbPhrase(c)) { nrOfVerbphrases++; } } } // avoid division by zero, there should be at least one sentence in the cas nrOfSentences = Math.max(1, nrOfSentences); featSet.addAll(Arrays.asList(new Feature(NPS_PER_SENTENCE, nrOfNPs / nrOfSentences))); featSet.addAll(Arrays.asList(new Feature(VPS_PER_SENTENCE, nrOfVPs / nrOfSentences))); featSet.addAll(Arrays.asList(new Feature(PPS_PER_SENTENCE, nrOfPPs / nrOfSentences))); featSet.addAll(Arrays.asList(new Feature(SBARS_PER_SENTENCE, nrOfSbars / nrOfSentences))); featSet.addAll(Arrays.asList(new Feature(CLAUSES_PER_SENTENCE, nrOfClauses / nrOfSentences))); featSet.addAll( Arrays.asList(new Feature(DEP_CLAUSES_PER_SENTENCE, nrOfDependentClauses / nrOfSentences))); featSet.addAll(Arrays.asList(new Feature(TUNITS_PER_SENTENCE, nrOfTunits / nrOfSentences))); featSet.addAll( Arrays.asList(new Feature(COMPLEX_TUNITS_PER_SENTENCE, nrOfComplexTunits / nrOfSentences))); featSet.addAll(Arrays.asList(new Feature(COORDS_PER_SENTENCE, nrOfCoords / nrOfSentences))); // avoid division by 0, // if we don't have any NPs, the lengthSum is 0, division by 1 will yield 0 as average // length nrOfNPs = Math.max(1, nrOfNPs); nrOfVPs = Math.max(1, nrOfVPs); nrOfPPs = Math.max(1, nrOfPPs); nrOfTunits = Math.max(1, nrOfTunits); featSet.addAll(Arrays.asList(new Feature(AVG_NP_LENGTH, lengthSumNPs / nrOfNPs))); featSet.addAll(Arrays.asList(new Feature(AVG_VP_LENGTH, lengthSumVPs / nrOfVPs))); featSet.addAll(Arrays.asList(new Feature(AVG_PP_LENGTH, lengthSumPPs / nrOfPPs))); featSet.addAll(Arrays.asList(new Feature(AVG_TUNIT_LENGTH, lengthSumTunits / nrOfTunits))); featSet.addAll(Arrays.asList(new Feature(AVG_TREE_DEPTH, parseTreeDepthSum / nrOfSentences))); featSet.addAll(Arrays.asList(new Feature(CLAUSES_PER_TUNIT, nrOfClauses / nrOfTunits))); nrOfClauses = Math.max(1, nrOfClauses); featSet.addAll(Arrays.asList(new Feature(AVG_CLAUSE_LENGTH, lengthSumClauses / nrOfClauses))); featSet.addAll( Arrays.asList(new Feature(COMPLEX_TUNITS_PER_TUNIT, nrOfComplexTunits / nrOfTunits))); featSet.addAll(Arrays.asList(new Feature(COORDS_PER_TUNIT, nrOfCoords / nrOfTunits))); featSet.addAll( Arrays.asList(new Feature(COMPLEXNOMINALS_PER_TUNIT, nrOfComplexNominals / nrOfTunits))); featSet.addAll(Arrays.asList(new Feature(VERBPHRASES_PER_TUNIT, nrOfVerbphrases / nrOfTunits))); featSet.addAll( Arrays.asList(new Feature(DEPCLAUSE_TUNIT_RATIO, nrOfDependentClauses / nrOfTunits))); ; featSet.addAll( Arrays.asList(new Feature(DEPCLAUSE_CLAUSE_RATIO, nrOfDependentClauses / nrOfClauses))); featSet.addAll(Arrays.asList(new Feature(COORDS_PER_CLAUSE, nrOfCoords / nrOfClauses))); ; featSet.addAll( Arrays.asList(new Feature(COMPLEXNOMINALS_PER_CLAUSE, nrOfComplexNominals / nrOfClauses))); ; return featSet; }