/**
 * Returns the average number of tokens covered by a chunk in a view.
 *
 * @param view the view of the JCas
 * @return average number of tokens per chunk, or 0 if the view contains no chunks
 */
private double getAverageNounPhraseTokenLength(JCas view)
{
    Collection<Chunk> chunks = JCasUtil.select(view, Chunk.class);

    // Guard against division by zero (which would yield NaN) when the view has no chunks
    if (chunks.isEmpty()) {
        return 0.0;
    }

    int totalNumber = 0;
    for (Chunk chunk : chunks) {
        totalNumber += JCasUtil.selectCovered(view, Token.class, chunk).size();
    }
    return totalNumber / (double) chunks.size();
}
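// Worked example (illustrative): a view containing two chunks that cover 3 and 5 tokens
// respectively yields (3 + 5) / 2.0 = 4.0 tokens per chunk.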
@Override
public Set<Feature> extract(JCas view, TextClassificationUnit classificationUnit)
    throws TextClassificationException
{
    boolean isCompound = false;

    POS pos = JCasUtil.selectCovered(Token.class, classificationUnit).get(0).getPos();
    String word = JCasUtil.selectCovered(Lemma.class, classificationUnit).get(0).getValue()
            .toLowerCase();

    // only check for noun compounds
    if (pos instanceof N) {
        try {
            isCompound = isCompound(word);
        }
        catch (ResourceInitializationException e) {
            throw new TextClassificationException(e);
        }
    }

    return new Feature(IS_COMPOUND, isCompound).asSet();
}
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException
{
    // Keep track of the ranges of the different relation candidates so that we won't add
    // duplicate text snippets for different candidates covering the same text
    Map<IndexRange, IndexRange> rangeMappings = new HashMap<IndexRange, IndexRange>();

    if (aggregateJCas == null) {
        aggregateJCas = getEmptyJCas();
    }

    CasCopier copier = new CasCopier(aJCas.getCas(), aggregateJCas.getCas());

    Iterator<RelationCandidate> iter = JCasUtil.iterator(aJCas, RelationCandidate.class);
    while (iter.hasNext()) {
        RelationCandidate candidate = iter.next();
        RelationCandidate candidateCopy = (RelationCandidate) copier.copyFs(candidate);

        // See if we already have this candidate in the aggregate JCas
        IndexRange candidateRange = new IndexRange(candidate);

        // The offset of this relation candidate between the old JCas and the new one
        int offset = 0;
        if (rangeMappings.containsKey(candidateRange)) {
            offset = rangeMappings.get(candidateRange).getStart() - candidateRange.getStart();
            updateAnnotation(candidateCopy, offset);
            // No need to copy the features, that has already been done
        }
        else {
            offset = content.length() - candidateRange.getStart();
            updateAnnotation(candidateCopy, offset);
            rangeMappings.put(candidateRange, new IndexRange(candidateCopy));

            // For every feature type we want to copy ...
            for (Class<? extends Annotation> feature : features) {
                // ... iterate over the annotations of this type covered by the candidate
                for (Annotation annotation : JCasUtil.selectCovered(aJCas, feature, candidate)) {
                    Annotation cAnnotation = (Annotation) copier.copyFs(annotation);
                    // Update the indices of the annotation
                    updateAnnotation(cAnnotation, offset);
                    aggregateJCas.addFsToIndexes(cAnnotation);
                }
            }

            // Append the text content of the relation candidate to the new CAS
            content.append(candidate.getCoveredText());
        }
        aggregateJCas.addFsToIndexes(candidateCopy);
    }
}
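// Worked example (illustrative): if a new candidate spans [100, 120) in the source CAS and
// the aggregate text collected so far is 40 characters long, then
// offset = content.length() - candidateRange.getStart() = 40 - 100 = -60, so the copied
// candidate and its covered annotations are shifted to [40, 60) in the aggregate CAS.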
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException
{
    String documentId = DocumentMetaData.get(aJCas).getDocumentId();

    List<Class<? extends ArgumentComponent>> types = Arrays.asList(Claim.class, Premise.class,
            Backing.class, Rebuttal.class, Refutation.class);

    for (Class<? extends ArgumentComponent> type : types) {
        for (ArgumentComponent argumentComponent : JCasUtil.select(aJCas, type)) {
            int end = argumentComponent.getEnd();
            int begin = argumentComponent.getBegin();

            // non-implicit components
            if (end > begin) {
                List<Sentence> sentences = JCasUtil2.selectOverlapping(Sentence.class,
                        argumentComponent, aJCas);

                String filename = documentId + "_s" + sentences.size() + "_"
                        + argumentComponent.getClass().getSimpleName() + "_" + begin + "_"
                        + end + ".txt";

                StringBuilder sb = new StringBuilder();
                for (Sentence sentence : sentences) {
                    List<String> tokens = new ArrayList<>();
                    for (Token token : JCasUtil.selectCovered(Token.class, sentence)) {
                        tokens.add(token.getCoveredText());
                    }
                    sb.append(StringUtils.join(tokens, " "));
                    sb.append("\n");
                }

                try {
                    FileUtils.write(new File(outputFolder, filename), sb.toString().trim());
                }
                catch (IOException e) {
                    throw new AnalysisEngineProcessException(e);
                }
            }
        }
    }
}
@Test
public void test() throws AnalysisEngineProcessException, ResourceInitializationException
{
    final String text = "The fox jumps over the dog.";
    jCas.setDocumentText(text);
    processJCas();

    final Collection<Sentence> select = JCasUtil.select(jCas, Sentence.class);
    final Sentence s1 = select.iterator().next();

    final List<PhraseChunk> phrases = JCasUtil.selectCovered(jCas, PhraseChunk.class, s1);
    Assert.assertEquals(4, phrases.size());
    Assert.assertEquals("The fox", phrases.get(0).getCoveredText());
    Assert.assertEquals("jumps over the dog", phrases.get(1).getCoveredText());
    Assert.assertEquals("over the dog", phrases.get(2).getCoveredText());
    Assert.assertEquals("the dog", phrases.get(3).getCoveredText());
}
@Test
public void testProcess() throws AnalysisEngineProcessException, ResourceInitializationException
{
    final String text = "The fox jumps over the dog.";
    jCas.setDocumentText(text);
    processJCas();

    final Collection<Sentence> select = JCasUtil.select(jCas, Sentence.class);
    final Sentence s1 = select.iterator().next();

    final List<Dependency> dependencies = JCasUtil.selectCovered(jCas, Dependency.class, s1);

    // We could test the actual output here, but it is so model-dependent that it is not
    // worth it; it suffices that annotations have been created.
    // 7 = 6 words + 1 punctuation mark, each of which should have a dependency
    assertEquals(7, dependencies.size());
}
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException
{
    CAS cas = aJCas.getCas();

    for (AnnotationFS cover : CasUtil.select(cas,
            CasUtil.getAnnotationType(cas, annotationType))) {

        // If there is a constraint, check if it matches
        if (constraint != null) {
            JXPathContext ctx = JXPathContext.newContext(cover);
            boolean match = ctx.iterate(constraint).hasNext();
            if (!match) {
                continue;
            }
        }

        // If the target type is a token, use it directly; otherwise select the covered tokens
        Collection<Token> tokens;
        if (cover instanceof Token) {
            tokens = Collections.singleton((Token) cover);
        }
        else {
            tokens = JCasUtil.selectCovered(aJCas, Token.class, cover);
        }

        for (Token token : tokens) {
            try {
                String semanticField = semanticFieldResource.getSemanticTag(token);
                SemanticField semanticFieldAnnotation = new SemanticField(aJCas,
                        token.getBegin(), token.getEnd());
                semanticFieldAnnotation.setValue(semanticField);
                semanticFieldAnnotation.addToIndexes();
            }
            catch (ResourceAccessException e) {
                throw new AnalysisEngineProcessException(e);
            }
        }
    }
}
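// Wiring sketch (hedged): how an annotator like the one above could be configured with
// uimaFIT's AnalysisEngineFactory. The class name SemanticFieldAnnotator, the parameter
// names, and the JXPath expression are assumptions for illustration, not taken from the
// snippet; the constraint restricts semantic field tagging to tokens inside NP chunks.
AnalysisEngineDescription semanticFields = AnalysisEngineFactory.createEngineDescription(
        SemanticFieldAnnotator.class,
        SemanticFieldAnnotator.PARAM_ANNOTATION_TYPE, Chunk.class.getName(),
        SemanticFieldAnnotator.PARAM_CONSTRAINT, ".[chunkValue = 'NP']");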
public static Tree createStanfordTree(Annotation root, TreeFactory tFact)
{
    JCas aJCas;
    try {
        aJCas = root.getCAS().getJCas();
    }
    catch (CASException e) {
        throw new IllegalStateException("Unable to get JCas from JCas wrapper");
    }

    // define the new (root) node
    Tree rootNode;

    // Before we can create a node, we must check if we have any children (we have to know
    // whether to create a node or a leaf - not very dynamic)
    if (root instanceof Constituent && !isLeaf((Constituent) root)) {
        Constituent node = (Constituent) root;
        List<Tree> childNodes = new ArrayList<Tree>();

        // get child nodes from the child annotations
        FSArray children = node.getChildren();
        for (int i = 0; i < children.size(); i++) {
            childNodes.add(createStanfordTree(node.getChildren(i), tFact));
        }

        // now create the node with its children
        rootNode = tFact.newTreeNode(node.getConstituentType(), childNodes);
    }
    else {
        // Handle leaf annotations: leaves are always Token annotations. We also have to
        // insert a preterminal node carrying the value of the POS annotation on the token,
        // because the POS is not stored directly within the tree.
        Token wordAnnotation = (Token) root;

        // create a leaf node for the tree
        Tree wordNode = tFact.newLeaf(wordAnnotation.getCoveredText());

        // collect information about preceding and trailing whitespace for the leaf node
        StringBuilder preWhitespaces = new StringBuilder();
        StringBuilder trailWhitespaces = new StringBuilder();

        List<Token> precedingTokenList = selectPreceding(aJCas, Token.class, wordAnnotation, 1);
        List<Token> followingTokenList = selectFollowing(aJCas, Token.class, wordAnnotation, 1);

        if (precedingTokenList.size() > 0) {
            Token precedingToken = precedingTokenList.get(0);
            int precedingWhitespaces = wordAnnotation.getBegin() - precedingToken.getEnd();
            for (int i = 0; i < precedingWhitespaces; i++) {
                preWhitespaces.append(" ");
            }
        }
        if (followingTokenList.size() > 0) {
            Token followingToken = followingTokenList.get(0);
            int trailingWhitespaces = followingToken.getBegin() - wordAnnotation.getEnd();
            for (int i = 0; i < trailingWhitespaces; i++) {
                trailWhitespaces.append(" ");
            }
        }

        // write the whitespace information as CoreAnnotations.BeforeAnnotation and
        // CoreAnnotations.AfterAnnotation to the node label
        ((CoreLabel) wordNode.label())
                .set(CoreAnnotations.BeforeAnnotation.class, preWhitespaces.toString());
        ((CoreLabel) wordNode.label())
                .set(CoreAnnotations.AfterAnnotation.class, trailWhitespaces.toString());

        // get the POS annotation covering this token; there should be exactly one
        List<POS> coveredPos = JCasUtil.selectCovered(aJCas, POS.class, wordAnnotation);
        assert coveredPos.size() == 1;
        POS pos = coveredPos.get(0);

        // create the POS node in the tree and attach the word node to it
        rootNode = tFact.newTreeNode(pos.getPosValue(), Arrays.asList(wordNode));
    }

    return rootNode;
}
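// Usage sketch: the leaf branch above casts wordNode.label() to CoreLabel, so the
// TreeFactory passed in must produce CoreLabel-backed labels, e.g. Stanford's
// LabeledScoredTreeFactory constructed with CoreLabel.factory(). Assumes the CAS holds
// constituent annotations (e.g. a DKPro Core ROOT) produced by a constituency parser.
TreeFactory tFact = new LabeledScoredTreeFactory(CoreLabel.factory());
for (ROOT sentenceRoot : JCasUtil.select(jcas, ROOT.class)) {
    Tree tree = createStanfordTree(sentenceRoot, tFact);
    System.out.println(tree.pennString());
}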
/**
 * Recreates a Stanford Tree from the StanfordParser annotations and saves all
 * non-StanfordParser annotations within the scope of the sentence in the label of the best
 * fitting node.
 *
 * <p><strong>CAUTION:</strong> <i>This method is intended for use by CAS multipliers, which
 * create new CASes from this tree. The annotation spans in the source CAS will be changed!
 * You do NOT want to use the source CAS after this method has been called. The
 * createStanfordTree() method does not change the CAS, so use that instead if the annotations
 * do not have to be recovered or accessed in the tree.</i>
 *
 * <p>TODO: This behavior could be changed by making COPIES of the annotations and changing
 * the copies instead of the originals. However, in order to be able to make copies, a dummy
 * CAS must be introduced to which the annotations can be copied. When they are recovered,
 * they will be copied to the new destination CAS anyway.
 *
 * @param root the ROOT annotation
 * @return a {@link Tree} object representing the syntax structure of the sentence
 * @throws CASException if the JCas cannot be accessed
 */
public static Tree createStanfordTreeWithAnnotations(Annotation root) throws CASException
{
    JCas aJCas = root.getCAS().getJCas();

    // Create the tree
    Tree tree = createStanfordTree(root);

    // Get all non-parser-related annotations and all tokens
    // (the tokens are needed for span calculations later on)
    List<Annotation> nonParserAnnotations = new ArrayList<Annotation>();
    List<Token> tokens = new ArrayList<Token>();

    // Using selectCovered instead of iterate, because subiterators did not work in all cases
    List<Annotation> annosWithinRoot = JCasUtil.selectCovered(aJCas, Annotation.class, root);

    for (Annotation curAnno : annosWithinRoot) {
        if (!(curAnno instanceof POS) && !(curAnno instanceof Constituent)
                && !(curAnno instanceof Dependency) && !(curAnno instanceof PennTree)
                && !(curAnno instanceof Lemma) && !(curAnno instanceof Token)
                && !(curAnno instanceof DocumentMetaData)) {
            nonParserAnnotations.add(curAnno);
        }
        else if (curAnno instanceof Token) {
            tokens.add((Token) curAnno);
        }
    }

    // create a wrapper for the tree and its tokens
    TreeWithTokens annoTree = new TreeWithTokens(tree, tokens);

    /*
     * Add annotations to the best-fitting nodes. The best-fitting node for an annotation is
     * the deepest node in the tree that still completely contains the annotation.
     */
    for (Annotation curAnno : nonParserAnnotations) {
        // get the best-fitting node
        Tree bestFittingNode = annoTree.getBestFit(curAnno);

        // add the annotation to the node
        if (bestFittingNode != null) {

            // translate the annotation span to a value relative to the node span
            IntPair span = annoTree.getSpan(bestFittingNode);
            curAnno.setBegin(curAnno.getBegin() - span.getSource());
            curAnno.setEnd(curAnno.getEnd() - span.getSource());

            // get the collection in the label of the best-fitting node in which we store
            // UIMA annotations, or create it if it does not exist
            Collection<Annotation> annotations = ((CoreLabel) bestFittingNode.label())
                    .get(UIMAAnnotations.class);
            if (annotations == null) {
                annotations = new ArrayList<Annotation>();
            }

            // add the annotation to the list and write it back to the node label
            annotations.add(curAnno);
            ((CoreLabel) bestFittingNode.label()).set(UIMAAnnotations.class, annotations);
        }
    }

    return tree;
}
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException
{
    // The answer and question passes are identical, so both delegate to createNgrams
    for (Answer answer : JCasUtil.select(aJCas, Answer.class)) {
        createNgrams(aJCas, answer);
    }
    for (Question question : JCasUtil.select(aJCas, Question.class)) {
        createNgrams(aJCas, question);
    }
}

/**
 * Creates and indexes all token n-grams within the span of the given annotation.
 */
private void createNgrams(JCas aJCas, Annotation span)
{
    // select the tokens within the span (the selection window starts one character
    // before the span begin)
    List<Token> tokenList = JCasUtil.selectCovered(aJCas, Token.class, span.getBegin() - 1,
            span.getEnd());

    int listLen = tokenList.size();
    for (int i = 0; i < listLen - (n - 1); i++) {
        // collect the n tokens of this ngram in an FSArray
        FSArray tokensArray = new FSArray(aJCas, n);
        for (int k = 0; k < n; k++) {
            tokensArray.set(k, tokenList.get(i + k));
        }

        Ngram ngram = new Ngram(aJCas);
        ngram.setBegin(tokenList.get(i).getBegin());
        ngram.setEnd(tokenList.get(i + (n - 1)).getEnd());

        // Here we set the ngram's string value.
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < n; j++) {
            sb.append(tokenList.get(i + j).getToStringValue());
            sb.append(' ');
        }
        ngram.setToStringValue(sb.toString().trim());

        // Here we set the value of n and also the tokens as an FSArray.
        ngram.setN(n);
        ngram.setTokens(tokensArray);
        ngram.addToIndexes();
    }
}
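// Worked example (illustrative): with n = 2 and an Answer covering "the quick fox",
// createNgrams produces two Ngram annotations, "the quick" and "quick fox", each holding
// its two tokens in an FSArray and spanning from its first token's begin offset to its
// last token's end offset.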
public Set<Feature> extract(JCas jcas)
{
    double nrOfNPs = 0.0;
    double nrOfVPs = 0.0;
    double nrOfPPs = 0.0;
    int nrOfSbars = 0;
    int nrOfVerbphrases = 0;
    int nrOfComplexNominals = 0;
    double nrOfClauses = 0.0;
    int nrOfDependentClauses = 0;
    double nrOfTunits = 0.0;
    int nrOfComplexTunits = 0;
    int nrOfCoords = 0;

    int lengthSumNPs = 0;
    int lengthSumVPs = 0;
    int lengthSumPPs = 0;
    int lengthSumClauses = 0;
    int lengthSumTunits = 0;
    int parseTreeDepthSum = 0;

    Set<Feature> featSet = new HashSet<Feature>();

    double nrOfSentences = JCasUtil.select(jcas, Sentence.class).size() * 1.0;
    for (Sentence s : JCasUtil.select(jcas, Sentence.class)) {
        parseTreeDepthSum += ParsePatternUtils.getParseDepth(s);
        for (Constituent c : JCasUtil.selectCovered(Constituent.class, s)) {
            if (c instanceof NP) {
                nrOfNPs++;
                lengthSumNPs += c.getCoveredText().length();
            }
            else if (c instanceof VP) {
                nrOfVPs++;
                lengthSumVPs += c.getCoveredText().length();
            }
            else if (c instanceof PP) {
                nrOfPPs++;
                lengthSumPPs += c.getCoveredText().length();
            }
            else if (c instanceof SBAR) {
                nrOfSbars++;
                if (ParsePatternUtils.isDependentClause(c)) {
                    nrOfDependentClauses++;
                }
            }
            else if (ParsePatternUtils.isClause(c)) {
                nrOfClauses++;
                lengthSumClauses += c.getCoveredText().length();
            }

            if (ParsePatternUtils.isTunit(c)) {
                nrOfTunits++;
                lengthSumTunits += c.getCoveredText().length();
                if (ParsePatternUtils.isComplexTunit(c)) {
                    nrOfComplexTunits++;
                }
            }
            if (ParsePatternUtils.isCoordinate(c)) {
                nrOfCoords++;
            }
            if (ParsePatternUtils.isComplexNominal(c)) {
                nrOfComplexNominals++;
            }
            if (ParsePatternUtils.isVerbPhrase(c)) {
                nrOfVerbphrases++;
            }
        }
    }

    // avoid division by zero; there should be at least one sentence in the CAS
    nrOfSentences = Math.max(1, nrOfSentences);
    featSet.add(new Feature(NPS_PER_SENTENCE, nrOfNPs / nrOfSentences));
    featSet.add(new Feature(VPS_PER_SENTENCE, nrOfVPs / nrOfSentences));
    featSet.add(new Feature(PPS_PER_SENTENCE, nrOfPPs / nrOfSentences));
    featSet.add(new Feature(SBARS_PER_SENTENCE, nrOfSbars / nrOfSentences));
    featSet.add(new Feature(CLAUSES_PER_SENTENCE, nrOfClauses / nrOfSentences));
    featSet.add(new Feature(DEP_CLAUSES_PER_SENTENCE, nrOfDependentClauses / nrOfSentences));
    featSet.add(new Feature(TUNITS_PER_SENTENCE, nrOfTunits / nrOfSentences));
    featSet.add(new Feature(COMPLEX_TUNITS_PER_SENTENCE, nrOfComplexTunits / nrOfSentences));
    featSet.add(new Feature(COORDS_PER_SENTENCE, nrOfCoords / nrOfSentences));

    // avoid division by zero: if we don't have any NPs, the length sum is 0,
    // so dividing by 1 yields 0 as the average length
    nrOfNPs = Math.max(1, nrOfNPs);
    nrOfVPs = Math.max(1, nrOfVPs);
    nrOfPPs = Math.max(1, nrOfPPs);
    nrOfTunits = Math.max(1, nrOfTunits);

    featSet.add(new Feature(AVG_NP_LENGTH, lengthSumNPs / nrOfNPs));
    featSet.add(new Feature(AVG_VP_LENGTH, lengthSumVPs / nrOfVPs));
    featSet.add(new Feature(AVG_PP_LENGTH, lengthSumPPs / nrOfPPs));
    featSet.add(new Feature(AVG_TUNIT_LENGTH, lengthSumTunits / nrOfTunits));
    featSet.add(new Feature(AVG_TREE_DEPTH, parseTreeDepthSum / nrOfSentences));

    featSet.add(new Feature(CLAUSES_PER_TUNIT, nrOfClauses / nrOfTunits));

    nrOfClauses = Math.max(1, nrOfClauses);
    featSet.add(new Feature(AVG_CLAUSE_LENGTH, lengthSumClauses / nrOfClauses));
    featSet.add(new Feature(COMPLEX_TUNITS_PER_TUNIT, nrOfComplexTunits / nrOfTunits));
    featSet.add(new Feature(COORDS_PER_TUNIT, nrOfCoords / nrOfTunits));
    featSet.add(new Feature(COMPLEXNOMINALS_PER_TUNIT, nrOfComplexNominals / nrOfTunits));
    featSet.add(new Feature(VERBPHRASES_PER_TUNIT, nrOfVerbphrases / nrOfTunits));
    featSet.add(new Feature(DEPCLAUSE_TUNIT_RATIO, nrOfDependentClauses / nrOfTunits));
    featSet.add(new Feature(DEPCLAUSE_CLAUSE_RATIO, nrOfDependentClauses / nrOfClauses));
    featSet.add(new Feature(COORDS_PER_CLAUSE, nrOfCoords / nrOfClauses));
    featSet.add(new Feature(COMPLEXNOMINALS_PER_CLAUSE, nrOfComplexNominals / nrOfClauses));

    return featSet;
}