public static Pair<Integer, Integer> getAnnotationProjection( ComponentAnnotation sourceAnno, ArrayListMultimap<StanfordCorenlpToken, StanfordCorenlpToken> tokenAlignment) { // compute aligned start and end int earliestStart = -1; int latestEnd = -1; boolean isFirstToken = true; for (StanfordCorenlpToken token : JCasUtil.selectCovered(StanfordCorenlpToken.class, sourceAnno)) { for (StanfordCorenlpToken projectedToken : tokenAlignment.get(token)) { if (isFirstToken) { earliestStart = projectedToken.getBegin(); latestEnd = projectedToken.getEnd(); } else { if (earliestStart > projectedToken.getBegin()) { earliestStart = projectedToken.getBegin(); } if (latestEnd < projectedToken.getEnd()) { latestEnd = projectedToken.getEnd(); } } isFirstToken = false; } } return Pair.of(earliestStart, latestEnd); }
public void process(JCas jCas) throws AnalysisEngineProcessException { TriageScore doc = JCasUtil.selectSingle(jCas, TriageScore.class); String code = doc.getInOutCode(); File outFile = new File(baseData.getPath() + "/" + code + ".txt"); try { PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(outFile, true))); out.print(doc.getVpdmfId() + " "); for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) { List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence); if (tokens.size() <= 0) { continue; } List<String> tokenStrings = JCasUtil.toText(tokens); for (int i = 0; i < tokens.size(); i++) { out.print(tokenStrings.get(i) + " "); } } out.print("\n"); out.close(); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } }
@Override public void process(JCas jCas) throws AnalysisEngineProcessException { beforeProcess(jCas); try { for (AnnotationFS anno : JCasUtil.select(jCas, annotationType)) { List<String> lemmas = Lists.newLinkedList(); for (Token token : JCasUtil.selectCovered(jCas, Token.class, anno)) { Word w = token2WordIdx.get(token); if (w == null) { lemmas.add(token.getCoveredText()); } else { lemmas.add(MorphCasUtils.getOnlyWordform(w).getLemma()); } } outEntries.add(lemmaJoiner.join(lemmas)); } } finally { afterProcess(jCas); } }
@Override public void process(JCas jCas) throws AnalysisEngineProcessException { Annotation document = this.processor.process(jCas.getDocumentText()); String lastNETag = "O"; int lastNEBegin = -1; int lastNEEnd = -1; for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) { // create the token annotation int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class); int end = tokenAnn.get(CharacterOffsetEndAnnotation.class); String pos = tokenAnn.get(PartOfSpeechAnnotation.class); String lemma = tokenAnn.get(LemmaAnnotation.class); Token token = new Token(jCas, begin, end); token.setPos(pos); token.setLemma(lemma); token.addToIndexes(); // hackery to convert token-level named entity tag into phrase-level tag String neTag = tokenAnn.get(NamedEntityTagAnnotation.class); if (neTag.equals("O") && !lastNETag.equals("O")) { NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd); ne.setMentionType(lastNETag); ne.addToIndexes(); } else { if (lastNETag.equals("O")) { lastNEBegin = begin; } else if (lastNETag.equals(neTag)) { // do nothing - begin was already set } else { NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd); ne.setMentionType(lastNETag); ne.addToIndexes(); lastNEBegin = begin; } lastNEEnd = end; } lastNETag = neTag; } if (!lastNETag.equals("O")) { NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd); ne.setMentionType(lastNETag); ne.addToIndexes(); } // add sentences and trees for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) { // add the sentence annotation int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class); int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class); Sentence sentence = new Sentence(jCas, sentBegin, sentEnd); sentence.addToIndexes(); // add the syntactic tree annotation List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class); Tree tree = sentenceAnn.get(TreeAnnotation.class); if (tree.children().length != 1) { throw new RuntimeException("Expected single root node, found " + tree); } tree = tree.firstChild(); tree.indexSpans(0); TopTreebankNode root = new TopTreebankNode(jCas); root.setTreebankParse(tree.toString()); // TODO: root.setTerminals(v) this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns); // get the dependencies SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class); // convert Stanford nodes to UIMA annotations List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence); Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>(); for (IndexedWord stanfordNode : dependencies.vertexSet()) { int indexBegin = stanfordNode.get(BeginIndexAnnotation.class); int indexEnd = stanfordNode.get(EndIndexAnnotation.class); int tokenBegin = tokens.get(indexBegin).getBegin(); int tokenEnd = tokens.get(indexEnd - 1).getEnd(); DependencyNode node; if (dependencies.getRoots().contains(stanfordNode)) { node = new TopDependencyNode(jCas, tokenBegin, tokenEnd); } else { node = new DependencyNode(jCas, tokenBegin, tokenEnd); } stanfordToUima.put(stanfordNode, node); } // create relation annotations for each Stanford dependency ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create(); ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create(); for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) { DependencyRelation relation = new DependencyRelation(jCas); DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor()); DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent()); String relationType = stanfordEdge.getRelation().toString(); if (head == null || child == null || relationType == null) { throw new RuntimeException( String.format( "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation, child, head)); } relation.setHead(head); relation.setChild(child); relation.setRelation(relationType); relation.addToIndexes(); headRelations.put(child, relation); childRelations.put(head, relation); } // set the relations for each node annotation for (DependencyNode node : stanfordToUima.values()) { List<DependencyRelation> heads = headRelations.get(node); node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size())); if (heads != null) { FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads); } List<DependencyRelation> children = childRelations.get(node); node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size())); if (children != null) { FSCollectionFactory.fillArrayFS(node.getChildRelations(), children); } node.addToIndexes(); } } // map from spans to named entity mentions Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>(); for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) { spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention); } // add mentions for all entities identified by the coreference system List<NamedEntity> entities = new ArrayList<NamedEntity>(); List<List<Token>> sentenceTokens = new ArrayList<List<Token>>(); for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) { sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence)); } Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class); for (CorefChain chain : corefChains.values()) { List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>(); for (CorefMention corefMention : chain.getMentionsInTextualOrder()) { // figure out the character span of the token List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1); int begin = tokens.get(corefMention.startIndex - 1).getBegin(); int end = tokens.get(corefMention.endIndex - 2).getEnd(); // use an existing named entity mention when possible; otherwise create a new one NamedEntityMention mention = spanMentionMap.get(new Span(begin, end)); if (mention == null) { mention = new NamedEntityMention(jCas, begin, end); mention.addToIndexes(); } mentions.add(mention); } // create an entity for the mentions Collections.sort( mentions, new Comparator<NamedEntityMention>() { @Override public int compare(NamedEntityMention m1, NamedEntityMention m2) { return m1.getBegin() - m2.getBegin(); } }); // create mentions and add them to entity NamedEntity entity = new NamedEntity(jCas); entity.setMentions(new FSArray(jCas, mentions.size())); int index = 0; for (NamedEntityMention mention : mentions) { mention.setMentionedEntity(entity); entity.setMentions(index, mention); index += 1; } entities.add(entity); } // add singleton entities for any named entities not picked up by coreference system for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) { if (mention.getMentionedEntity() == null) { NamedEntity entity = new NamedEntity(jCas); entity.setMentions(new FSArray(jCas, 1)); entity.setMentions(0, mention); mention.setMentionedEntity(entity); entity.getMentions(); entities.add(entity); } } // sort entities by document order Collections.sort( entities, new Comparator<NamedEntity>() { @Override public int compare(NamedEntity o1, NamedEntity o2) { return getFirstBegin(o1) - getFirstBegin(o2); } private int getFirstBegin(NamedEntity entity) { int min = Integer.MAX_VALUE; for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) { if (mention.getBegin() < min) { min = mention.getBegin(); } } return min; } }); // add entities to document for (NamedEntity entity : entities) { entity.addToIndexes(); } }
private void verifyToken(Token token, TestTokenInfo info) throws LAPVerificationException { if (!info.text.equals(token.getCoveredText())) throw new LAPVerificationException( "Bad token text for " + info.id + ":" + info.text + ", expected \"" + info.text + "\", got \"" + token.getCoveredText() + "\""); if (info.begin != token.getBegin()) throw new LAPVerificationException( "Bad token begin index for " + info.id + ":" + info.text + ", expected " + info.begin + ", got " + token.getBegin()); if (info.end != token.getEnd()) throw new LAPVerificationException( "Bad token end index for " + info.id + ":" + info.text + ", expected " + info.end + ", got " + token.getEnd()); if (!info.lemma.equals(token.getLemma().getValue())) throw new LAPVerificationException( "Bad token lemma for " + info.id + ":" + info.text + ", expected \"" + info.lemma + "\", got \"" + token.getLemma().getValue() + "\""); if (!info.posType.equals(token.getPos().getType().getShortName())) throw new LAPVerificationException( "Bad token POS type for " + info.id + ":" + info.text + ", expected " + info.posType + ", got " + token.getPos().getType().getShortName()); if (!info.posValue.equals(token.getPos().getPosValue())) throw new LAPVerificationException( "Bad token POS value for " + info.id + ":" + info.text + ", expected \"" + info.posValue + "\", got \"" + token.getPos().getPosValue() + "\""); String nerType = null; List<NamedEntity> ners = JCasUtil.selectCovered(NamedEntity.class, token); if (ners.size() == 1) { nerType = ners.get(0).getType().getShortName(); } else if (ners.size() > 1) { throw new LAPVerificationException( "Got more than one NER annotation for " + info.id + ":" + info.text + " - " + ners); } if (!Objects.equals(info.nerType, nerType)) throw new LAPVerificationException( "Bad token NER value for " + info.id + ":" + info.text + ", expected \"" + info.nerType + "\", got \"" + nerType + "\""); Set<TestDependencyInfo> infoDependencies = new HashSet<TestDependencyInfo>(Arrays.asList(info.dependencies)); if (!infoDependencies.equals(governors.get(token))) throw new LAPVerificationException( "Bad token dependencies for " + info.id + ":" + info.text + ", expected " + infoDependencies + ", got " + governors.get(token)); System.out.println("Verified token: " + info); }