/** * Extracts the number (sg/pl) from the Token. To be applied only on nouns / pronouns. * * @param token * @return */ public String getNumber(Token token) { String pos = token.getPos().getPosValue(); if (!isNounOrPronoun(token)) { System.err.println( "Use method only for nouns / pronouns. " + pos + " " + token.getCoveredText()); // throw new IllegalArgumentException(); return "unknown"; // occurs e.g. for 'there' (existential) } if (pos.matches("NNP?S")) { return "pl"; } if (pos.matches("NNP?")) { return "sg"; } if (pos.matches("PRP\\$?|CD")) { String lemma = token.getLemma().getValue().toLowerCase(); if (lemma.matches( "I|me|myself|he|him|himself|she|her|herself|it|itself|one|onself|mine|thine|his|hers")) { return "sg"; } if (lemma.matches( "we|us|ourselves|ourself|yourselves|they|them|themselves|theirselves|theirs|ours")) { return "pl"; } } return "unknown"; }
/** * Add an alignment link from T to H, based on the rule t->h in which t is a phrase in T from * index textStart to textEnd of the tokens, and h is a phrase in H from index hypoStart to * hypoEnd of the tokens, * * @param textToken Token in TextView to annotate * @param hypoToken Token in HypoView to annotate * @param confidence The confidence of the rule * @param linkDirection The direction of the link (t to h, h to t or bidirectional). * @param linkInfo The relation of the rule (Wordnet synonym, Wikipedia redirect etc). * @param linkGroupLabel * @throws CASException */ private void addAlignmentAnnotations( Token textToken, Token hypoToken, double confidence, Direction linkDirection, String linkInfo, StringList linkGroupLabel) throws CASException { // Prepare the Target instances Target textTarget = new Target(textView); Target hypoTarget = new Target(hypoView); // Prepare an FSArray instance and put the target annotations in it FSArray textAnnots = new FSArray(textView, 1); FSArray hypoAnnots = new FSArray(hypoView, 1); textAnnots.set(0, textToken); hypoAnnots.set(0, hypoToken); textTarget.setTargetAnnotations(textAnnots); hypoTarget.setTargetAnnotations(hypoAnnots); // Set begin and end value of the Target annotations textTarget.setBegin(textToken.getBegin()); textTarget.setEnd(textToken.getEnd()); hypoTarget.setBegin(hypoToken.getBegin()); hypoTarget.setEnd(hypoToken.getEnd()); // Add the targets to the indices textTarget.addToIndexes(); hypoTarget.addToIndexes(); // Mark an alignment.Link and add it to the hypothesis view Link link = new Link(hypoView); link.setTSideTarget(textTarget); link.setHSideTarget(hypoTarget); // Set the link direction link.setDirection(linkDirection); // Set strength link.setStrength(confidence); // Set Group label link.setGroupLabel(linkGroupLabel); // Add the link information link.setAlignerID(ALIGNER_ID); link.setAlignerVersion(ALIGNER_VERSION); link.setLinkInfo(linkInfo); // Mark begin and end according to the hypothesis target 
link.setBegin(hypoTarget.getBegin()); link.setEnd(hypoTarget.getEnd()); // Add to index link.addToIndexes(); }
public static boolean isNounOrPronoun(Token token) { String pos = token.getPos().getPosValue(); // JJ: allows things like "British" / "Australian" which are marked as // NEs in ACE if (!(pos.startsWith("N") || pos.matches("PRP\\$?|CD|JJS?") || pos.matches("DT|WHNP|WP|PRP$?") || (pos.matches("WDT|WP") && token.getLemma().getValue().matches("who|which|that")))) { return false; } return true; }
/**
 * Creates and indexes a Token for the given span after trimming surrounding whitespace.
 * Returns {@code null} when the trimmed span is empty or token writing is disabled.
 */
protected Token createToken(
        final JCas aJCas, final int aBegin, final int aEnd, final int aIndex) {
    // trim() adjusts the span boundaries in place.
    int[] bounds = new int[] {aBegin, aEnd};
    trim(aJCas.getDocumentText(), bounds);
    if (isEmpty(bounds[0], bounds[1]) || !isWriteToken()) {
        return null;
    }
    Token token = new Token(aJCas, bounds[0], bounds[1]);
    token.addToIndexes(aJCas);
    return token;
}
@Override public void process(JCas aJCas) throws AnalysisEngineProcessException { String documentId = DocumentMetaData.get(aJCas).getDocumentId(); Class[] types = {Claim.class, Premise.class, Backing.class, Rebuttal.class, Refutation.class}; for (Class type : types) { for (Object o : JCasUtil.select(aJCas, type)) { ArgumentComponent argumentComponent = (ArgumentComponent) o; // non-implicit components int end = argumentComponent.getEnd(); int begin = argumentComponent.getBegin(); if (end > begin) { List<Sentence> sentences = JCasUtil2.selectOverlapping(Sentence.class, argumentComponent, aJCas); String filename = documentId + "_s" + sentences.size() + "_" + argumentComponent.getClass().getSimpleName() + "_" + begin + "_" + end + ".txt"; StringBuilder sb = new StringBuilder(); for (Sentence sentence : sentences) { List<String> tokens = new ArrayList<>(); for (Token token : JCasUtil.selectCovered(Token.class, sentence)) { tokens.add(token.getCoveredText()); } sb.append(StringUtils.join(tokens, " ")); sb.append("\n"); } try { FileUtils.write(new File(outputFolder, filename), sb.toString().trim()); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } } } } }
/**
 * Returns countability information according to the Celex database of English nouns.
 *
 * @param token the token to look up (lemma must be set for nouns)
 * @return the Celex countability class, "NO-NOUN" for non-nouns, or "none" if the lemma is
 *     not in the database
 * @throws IllegalStateException if Celex usage was not configured
 */
public String getCountability(Token token) {
    if (!USE_CELEX) {
        // Carry the explanation in the exception instead of printing to stderr first.
        throw new IllegalStateException(
            "This should never happen, don't call this function if you did not configure to use Celex!");
    }
    if (!token.getPos().getPosValue().startsWith("N")) {
        return "NO-NOUN";
    }
    // Compute the lower-cased lemma once instead of twice.
    String lemma = token.getLemma().getValue().toLowerCase();
    if (countability.containsKey(lemma)) {
        return countability.get(lemma);
    }
    return "none";
}
@Override public List<Feature> extract(JCas jcas) throws TextClassificationException { double nbToken = 0; double minToken = -1; // Sizes in letter double maxToken = 0; double meanToken = 0; for (Token token : JCasUtil.select(jcas, Token.class)) { nbToken++; if (minToken < 0) { minToken = token.getCoveredText().length(); // gets the size value of the first // token } if (minToken > token.getCoveredText().length()) { minToken = token.getCoveredText().length(); } if (maxToken < token.getCoveredText().length()) { maxToken = token.getCoveredText().length(); } meanToken += token.getCoveredText().length(); } try { meanToken /= nbToken; } catch (Exception e) { meanToken = 0; } List<Feature> featList = new ArrayList<Feature>(); featList.addAll(Arrays.asList(new Feature("nb_" + TOKEN, nbToken))); featList.addAll(Arrays.asList(new Feature("max_" + TOKEN + "_size", maxToken))); featList.addAll(Arrays.asList(new Feature("min_" + TOKEN + "_size", minToken))); featList.addAll(Arrays.asList(new Feature("mean_" + TOKEN + "_size", meanToken))); return featList; }
/** * Extracts person from Token. To be applied only on nouns / pronouns. * * @param token * @return */ public String getPerson(Token token) { if (!isNounOrPronoun(token)) { if (token.getPos().getPosValue().equals("EX")) { return "3"; // existential 'there' } else { System.err.println("Use getPerson method only for nouns / pronouns."); throw new IllegalArgumentException(); } } String lemma = token.getLemma().getValue().toLowerCase(); String person = "3"; if (lemma.matches("i|we|me|us|myself|ourselves|ourself")) { person = "1"; } else if (lemma.matches("you|ye|thou|thee|yourself|thyself|yourselves|yourself")) { person = "2"; } return person; }
/** * Extracts the noun type from the POS tag. Returns proper/common/pronoun. * * @param token * @return */ public static String getNounType(Token token) { if (!isNounOrPronoun(token)) { if (token.getPos().getPosValue().equals("EX")) { return "unknown"; // existential 'there' } else { System.err.println("Use getPerson method only for nouns / pronouns."); throw new IllegalArgumentException(); } } if (token.getPos().getPosValue().matches("NNPS?")) { return "proper"; } if (token.getPos().getPosValue().matches("NNS?")) { return "common"; } if (isPronoun(token.getPos().getPosValue(), token.getLemma().getValue())) { return "pronoun"; } return "unknown"; }
/**
 * Callback invoked when the remote service finished processing a CAS. Reports any remote
 * exceptions and prints each token together with its POS tag.
 */
public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) {
    if (aStatus == null) {
        return;
    }
    if (aStatus.isException()) {
        System.err.println("Error on process CAS call to remote service:");
        // Report every exception the remote side collected.
        for (Exception exception : aStatus.getExceptions()) {
            exception.printStackTrace();
        }
    }
    try {
        JCas jcas = aCas.getJCas();
        for (Token token : JCasUtil.select(jcas, Token.class)) {
            System.out.println(token.getCoveredText() + " " + token.getPos().getPosValue());
        }
    } catch (CASException e) {
        e.printStackTrace();
    }
}
@Override public void process(JCas aJCas) throws AnalysisEngineProcessException { CAS cas = aJCas.getCas(); for (AnnotationFS cover : CasUtil.select(cas, CasUtil.getAnnotationType(cas, annotationType))) { // If there is a constraint, check if it matches if (constraint != null) { JXPathContext ctx = JXPathContext.newContext(cover); boolean match = ctx.iterate(constraint).hasNext(); if (!match) { continue; } } // If the target type is a token, use it directly, otherwise select the covered tokens Collection<Token> tokens; if (cover instanceof Token) { tokens = Collections.singleton((Token) cover); } else { tokens = JCasUtil.selectCovered(aJCas, Token.class, cover); } for (Token token : tokens) { try { String semanticField = semanticFieldResource.getSemanticTag(token); SemanticField semanticFieldAnnotation = new SemanticField(aJCas, token.getBegin(), token.getEnd()); semanticFieldAnnotation.setValue(semanticField); semanticFieldAnnotation.addToIndexes(); } catch (ResourceAccessException e) { throw new AnalysisEngineProcessException(e); } } } }
/**
 * Re-tokenizes existing tokens: each token's text is scanned character by character and split
 * into sub-tokens wherever the active {@link SplitPattern} changes. Sub-tokens whose pattern
 * is excluded from the output are dropped, and the original covering token is removed when
 * configured or when it is entirely excluded.
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    buf = new StringBuilder();
    // Collect additions/removals first; the CAS index must not be mutated while iterating.
    List<Token> toAdd = new ArrayList<Token>();
    List<Token> toRemove = new ArrayList<Token>();
    for (Token t : select(aJCas, Token.class)) {
        String text = t.getCoveredText();
        int offset = t.getBegin();
        // 'start' is the begin of the current sub-token, relative to the token text.
        int start = 0;
        SplitPattern lastPattern = getPattern(text.charAt(0), null);
        Token firstToken = null;
        // A change of pattern between adjacent characters marks a sub-token boundary.
        for (int i = 1; i < text.length(); i++) {
            SplitPattern pattern = getPattern(text.charAt(i), lastPattern);
            if (pattern != lastPattern) {
                // Emit the completed sub-token unless its pattern is excluded from output.
                if (lastPattern == null || lastPattern.includeInOutput) {
                    Token nt = addToken(aJCas, offset, text, start, i, toAdd);
                    firstToken = (firstToken == null) ? nt : firstToken;
                }
                start = i;
            }
            lastPattern = pattern;
        }
        // If we would just create the same token again, better do nothing
        if (start == 0) {
            // That is - if the whole token matches something to exclude, we remove it
            if (lastPattern != null && !lastPattern.includeInOutput) {
                toRemove.add(t);
            }
            continue;
        }
        // Optionally drop the original covering token once it has been split.
        if (deleteCover) {
            toRemove.add(t);
        }
        // The rest goes into the final token
        if (lastPattern == null || lastPattern.includeInOutput) {
            addToken(aJCas, offset, text, start, text.length(), toAdd);
        }
    }
    // Apply the collected changes to the CAS indexes.
    for (Token t : toAdd) {
        t.addToIndexes();
    }
    for (Token t : toRemove) {
        t.removeFromIndexes();
    }
}
/** * Returns true if the Token is a bare plural (definition by Reiter: excludes the quantified cases * -- different from Suh!!). * * @param jCas * @param token * @return */ public static Boolean isBarePlural( JCas jCas, Token token, HashMap<Token, Set<Dependency>> childNodeMap) { // is it a plural? String pos = token.getPos().getPosValue(); if (!pos.matches("NNP?S")) { return false; } if (!childNodeMap.containsKey(token)) { return true; } for (Dependency dep : childNodeMap.get(token)) { if (dep.getGovernor() == token && dep.getDependencyType().matches("det|poss")) { return false; } } return true; }
/**
 * Recursively converts a UIMA constituent/token annotation tree into a Stanford {@link Tree}.
 * Constituents become inner nodes; tokens become leaves under a preterminal POS node, with
 * surrounding whitespace recorded as CoreLabel Before/After annotations.
 */
public static Tree createStanfordTree(Annotation root, TreeFactory tFact) {
    JCas aJCas;
    try {
        aJCas = root.getCAS().getJCas();
    } catch (CASException e) {
        throw new IllegalStateException("Unable to get JCas from JCas wrapper");
    }
    // define the new (root) node
    Tree rootNode;
    // before we can create a node, we must check if we have any children (we have to know
    // whether to create a node or a leaf - not very dynamic)
    if (root instanceof Constituent && !isLeaf((Constituent) root)) {
        Constituent node = (Constituent) root;
        List<Tree> childNodes = new ArrayList<Tree>();
        // get childNodes from child annotations (recursive descent)
        FSArray children = node.getChildren();
        for (int i = 0; i < children.size(); i++) {
            childNodes.add(createStanfordTree(node.getChildren(i), tFact));
        }
        // now create the node with its children
        rootNode = tFact.newTreeNode(node.getConstituentType(), childNodes);
    } else {
        // Handle leaf annotations. Leafs are always Token-annotations. We also have to
        // insert a preterminal node with the value of the POS annotation on the token,
        // because the POS is not directly stored within the tree.
        Token wordAnnotation = (Token) root;
        // create leaf-node for the tree
        Tree wordNode = tFact.newLeaf(wordAnnotation.getCoveredText());
        // create information about preceding and trailing whitespaces in the leaf node
        StringBuilder preWhitespaces = new StringBuilder();
        StringBuilder trailWhitespaces = new StringBuilder();
        List<Token> precedingTokenList = selectPreceding(aJCas, Token.class, wordAnnotation, 1);
        List<Token> followingTokenList = selectFollowing(aJCas, Token.class, wordAnnotation, 1);
        // The gap between this token and its neighbour is reproduced as spaces.
        if (precedingTokenList.size() > 0) {
            Token precedingToken = precedingTokenList.get(0);
            int precedingWhitespaces = wordAnnotation.getBegin() - precedingToken.getEnd();
            for (int i = 0; i < precedingWhitespaces; i++) {
                preWhitespaces.append(" ");
            }
        }
        if (followingTokenList.size() > 0) {
            Token followingToken = followingTokenList.get(0);
            int trailingWhitespaces = followingToken.getBegin() - wordAnnotation.getEnd();
            for (int i = 0; i < trailingWhitespaces; i++) {
                trailWhitespaces.append(" ");
            }
        }
        // write whitespace information as CoreAnnotation.BeforeAnnotation and
        // CoreAnnotation.AfterAnnotation to the node label
        ((CoreLabel) wordNode.label())
            .set(CoreAnnotations.BeforeAnnotation.class, preWhitespaces.toString());
        ((CoreLabel) wordNode.label())
            .set(CoreAnnotations.AfterAnnotation.class, trailWhitespaces.toString());
        // get the POS annotation covering this token; the POS should only cover one token
        List<POS> coveredPos = JCasUtil.selectCovered(aJCas, POS.class, wordAnnotation);
        assert coveredPos.size() == 1;
        POS pos = coveredPos.get(0);
        // create POS node in the tree and attach the word node to it
        rootNode = tFact.newTreeNode(pos.getPosValue(), Arrays.asList((new Tree[] {wordNode})));
    }
    return rootNode;
}
/**
 * Converts a CoNLL-2012-style column document into CAS annotations: tokens, lemmas, POS
 * tags, constituent parses, word senses, coreference chains, named entities, semantic
 * predicates and their arguments, plus one Sentence per input sentence.
 *
 * @param aJCas the CAS to fill
 * @param aReader reader positioned at the start of the column data
 * @throws IOException if reading fails or the mapping providers cannot be configured
 */
public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    try {
        if (readPos) {
            posMappingProvider.configure(aJCas.getCas());
        }
        if (readConstituent) {
            constituentMappingProvider.configure(aJCas.getCas());
        }
    } catch (AnalysisEngineProcessException e) {
        throw new IOException(e);
    }
    // Maps chain id -> most recently created link of that coreference chain.
    Map<String, CoreferenceLink> chains = new HashMap<>();
    JCasBuilder doc = new JCasBuilder(aJCas);
    List<String[]> words;
    while ((words = readSentence(aJCas, aReader)) != null) {
        if (words.isEmpty()) {
            // Ignore empty sentences. This can happen when there are multiple end-of-sentence
            // markers following each other.
            continue;
        }
        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;
        StringBuilder parse = new StringBuilder();
        // Tokens, Lemma, POS
        Map<Integer, Token> tokenById = new HashMap<Integer, Token>();
        List<SemPred> preds = new ArrayList<>();
        for (String[] word : words) {
            // Read token
            Token token = doc.add(word[FORM], Token.class);
            tokenById.put(Integer.valueOf(word[ID]), token);
            doc.add(" ");
            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }
            // Read part-of-speech tag
            if (!UNUSED.equals(word[POS]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POS]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
                        token.getEnd());
                pos.setPosValue(word[POS]);
                pos.addToIndexes();
                token.setPos(pos);
            }
            // Semantic predicate (frame) annotation on this token.
            if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
                SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd());
                pred.setCategory(word[PRED]);
                pred.addToIndexes();
                preds.add(pred);
            }
            // Constituent parse: substitute the "*" placeholder with "(POS FORM)" so the
            // accumulated string becomes a complete Penn-style bracketing.
            if (!UNUSED.equals(word[PARSE]) && readConstituent) {
                String fixed = word[PARSE].replace("*", "(" + word[POS] + " " + word[FORM] + ")");
                parse.append(fixed);
            }
            if (!UNUSED.equals(word[WORD_SENSE]) && readWordSense) {
                WordSense wordSense = new WordSense(aJCas, token.getBegin(), token.getEnd());
                wordSense.setValue(word[WORD_SENSE]);
                wordSense.addToIndexes();
            }
            // Coreference: the last column holds "|"-separated chain fragments such as
            // "(42", "42)" or "(42)".
            if (!UNUSED.equals(word[word.length - 1]) && readCoreference) {
                String[] chainFragments = word[word.length - 1].split("\\|");
                for (String chainFragment : chainFragments) {
                    boolean beginning = chainFragment.startsWith("(");
                    boolean ending = chainFragment.endsWith(")");
                    String chainId = chainFragment.substring(beginning ? 1 : 0,
                            ending ? chainFragment.length() - 1 : chainFragment.length());
                    CoreferenceLink link = chains.get(chainId);
                    if (beginning) {
                        if (link == null) {
                            // First mention of this chain: create chain + first link.
                            link = new CoreferenceLink(aJCas);
                            CoreferenceChain chain = new CoreferenceChain(aJCas);
                            chain.setFirst(link);
                            chain.addToIndexes();
                        } else {
                            // Subsequent mention: append a new link to the chain.
                            CoreferenceLink newLink = new CoreferenceLink(aJCas);
                            link.setNext(newLink);
                            link = newLink;
                        }
                        link.setReferenceType(chainId);
                        link.setBegin(token.getBegin());
                    }
                    // NOTE(review): if a fragment ends a mention that never began (malformed
                    // input), 'link' may be null here — verify input is always well-nested.
                    if (ending) {
                        link.setEnd(token.getEnd());
                        link.addToIndexes();
                    }
                    chains.put(chainId, link);
                }
            }
            sentenceEnd = token.getEnd();
        }
        // Named entities
        if (readNamedEntity) {
            int currentNeBegin = -1;
            String currentNeType = null;
            for (int i = 0; i < words.size(); i++) {
                String ne = words.get(i)[NAMED_ENTITIES];
                boolean beginning = ne.startsWith("(");
                boolean ending = ne.endsWith(")");
                // When a NE is beginning, we remember what the NE is and where it began
                if (beginning) {
                    // The NE is beginning with "(" and either ending with "(" or "*", so we
                    // trim the first and last character
                    currentNeType = ne.substring(1, ne.length() - 1);
                    currentNeBegin = i;
                }
                // We need to create an annotation if the current token is the end of an
                // annotation
                if (ending) {
                    // Determine begin and end of named entity
                    // NOTE(review): looks up tokenById by the loop index i, which assumes
                    // word[ID] is 0-based and sequential — verify against readSentence.
                    int begin = tokenById.get(currentNeBegin).getBegin();
                    int end = tokenById.get(i).getEnd();
                    // Add named entity
                    NamedEntity namedEntity = new NamedEntity(aJCas, begin, end);
                    namedEntity.setValue(currentNeType);
                    namedEntity.addToIndexes();
                    // Forget remembered named entity
                    currentNeBegin = -1;
                    currentNeType = null;
                }
            }
        }
        // Semantic arguments
        if (readSemanticPredicate) {
            // Get arguments for one predicate at a time; predicate p reads column APRED+p.
            for (int p = 0; p < preds.size(); p++) {
                SemPred pred = preds.get(p);
                List<SemArgLink> args = new ArrayList<>();
                int currentArgBegin = -1;
                String currentArgType = null;
                for (int i = 0; i < words.size(); i++) {
                    String ne = words.get(i)[APRED + p];
                    boolean beginning = ne.startsWith("(");
                    boolean ending = ne.endsWith(")");
                    // When an arg is beginning, we remember what it is and where it began
                    if (beginning) {
                        // The arg is beginning with "(" and either ending with "(" or "*",
                        // so we trim the first and last character
                        currentArgType = ne.substring(1, ne.length() - 1);
                        currentArgBegin = i;
                    }
                    // Create an annotation when the current token ends the argument
                    if (ending) {
                        // Determine begin and end of argument
                        int begin = tokenById.get(currentArgBegin).getBegin();
                        int end = tokenById.get(i).getEnd();
                        // Add the argument unless it is a (V*) which has the same offsets
                        // as the predicate itself
                        if (!(pred.getBegin() == begin && pred.getEnd() == end)) {
                            SemArg arg = new SemArg(aJCas, begin, end);
                            arg.addToIndexes();
                            SemArgLink link = new SemArgLink(aJCas);
                            link.setRole(currentArgType);
                            link.setTarget(arg);
                            args.add(link);
                        }
                        // Forget remembered arg
                        currentArgBegin = -1;
                        currentArgType = null;
                    }
                }
                pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
            }
        }
        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();
        converter.convertPennTree(sentence, PennTreeUtils.parsePennTree(parse.toString()));
        // Once sentence per line.
        doc.add("\n");
    }
    doc.close();
}
/**
 * Converts a CoNLL-2009-style column document into CAS annotations: tokens, lemmas, POS tags,
 * morphological features, dependencies, semantic predicates with their arguments, plus one
 * Sentence per input sentence.
 *
 * @param aJCas the CAS to fill
 * @param aReader reader positioned at the start of the column data
 * @throws IOException if reading fails or the POS mapping provider cannot be configured
 */
public void convert(JCas aJCas, BufferedReader aReader) throws IOException {
    if (readPos) {
        try {
            posMappingProvider.configure(aJCas.getCas());
        } catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }
    }
    JCasBuilder doc = new JCasBuilder(aJCas);
    List<String[]> words;
    while ((words = readSentence(aReader)) != null) {
        if (words.isEmpty()) {
            // Ignore empty sentences. This can happen when there are multiple end-of-sentence
            // markers following each other.
            continue;
        }
        int sentenceBegin = doc.getPosition();
        int sentenceEnd = sentenceBegin;
        // Tokens, Lemma, POS — token id (column ID) -> Token, used to wire dependencies.
        Map<Integer, Token> tokens = new HashMap<Integer, Token>();
        List<SemanticPredicate> preds = new ArrayList<>();
        for (String[] word : words) {
            // Read token
            Token token = doc.add(word[FORM], Token.class);
            tokens.put(Integer.valueOf(word[ID]), token);
            doc.add(" ");
            // Read lemma
            if (!UNUSED.equals(word[LEMMA]) && readLemma) {
                Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                lemma.setValue(word[LEMMA]);
                lemma.addToIndexes();
                token.setLemma(lemma);
            }
            // Read part-of-speech tag
            if (!UNUSED.equals(word[POS]) && readPos) {
                Type posTag = posMappingProvider.getTagType(word[POS]);
                POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
                        token.getEnd());
                pos.setPosValue(word[POS]);
                pos.addToIndexes();
                token.setPos(pos);
            }
            // Read morphological features
            if (!UNUSED.equals(word[FEAT]) && readMorph) {
                MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas,
                        token.getBegin(), token.getEnd());
                morphtag.setValue(word[FEAT]);
                morphtag.addToIndexes();
            }
            // Semantic predicate (frame) annotation on this token.
            if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) {
                SemanticPredicate pred = new SemanticPredicate(aJCas, token.getBegin(),
                        token.getEnd());
                pred.setCategory(word[PRED]);
                pred.addToIndexes();
                preds.add(pred);
            }
            sentenceEnd = token.getEnd();
        }
        // Dependencies
        if (readDependency) {
            for (String[] word : words) {
                if (!UNUSED.equals(word[DEPREL])) {
                    int depId = Integer.valueOf(word[ID]);
                    int govId = Integer.valueOf(word[HEAD]);
                    // Model the root as a loop onto itself
                    if (govId == 0) {
                        Dependency rel = new ROOT(aJCas);
                        rel.setGovernor(tokens.get(depId));
                        rel.setDependent(tokens.get(depId));
                        rel.setDependencyType(word[DEPREL]);
                        rel.setBegin(rel.getDependent().getBegin());
                        rel.setEnd(rel.getDependent().getEnd());
                        rel.addToIndexes();
                    } else {
                        Dependency rel = new Dependency(aJCas);
                        rel.setGovernor(tokens.get(govId));
                        rel.setDependent(tokens.get(depId));
                        rel.setDependencyType(word[DEPREL]);
                        rel.setBegin(rel.getDependent().getBegin());
                        rel.setEnd(rel.getDependent().getEnd());
                        rel.addToIndexes();
                    }
                }
            }
        }
        // Semantic arguments
        if (readSemanticPredicate) {
            // Get arguments for one predicate at a time; predicate p reads column APRED+p.
            for (int p = 0; p < preds.size(); p++) {
                List<SemanticArgument> args = new ArrayList<SemanticArgument>();
                for (String[] word : words) {
                    if (!UNUSED.equals(word[APRED + p])) {
                        Token token = tokens.get(Integer.valueOf(word[ID]));
                        SemanticArgument arg = new SemanticArgument(aJCas, token.getBegin(),
                                token.getEnd());
                        arg.setRole(word[APRED + p]);
                        arg.addToIndexes();
                        args.add(arg);
                    }
                }
                SemanticPredicate pred = preds.get(p);
                pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args));
            }
        }
        // Sentence
        Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
        sentence.addToIndexes();
        // Once sentence per line.
        doc.add("\n");
    }
    doc.close();
}
/**
 * Serializes the CAS as a LAPPS Interchange Format (LIF) container: paragraphs, sentences,
 * tokens (with POS and lemma features), named entities, dependency structures per sentence,
 * and constituent phrase structures, then writes the container as pretty-printed JSON.
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    // Convert UIMA to LIF Container
    Container container = new Container();
    container.setLanguage(aJCas.getDocumentLanguage());
    container.setText(aJCas.getDocumentText());
    View view = container.newView();
    // Paragraph
    for (Paragraph p : select(aJCas, Paragraph.class)) {
        view.newAnnotation(id(PARAGRAPH, p), Discriminators.Uri.PARAGRAPH, p.getBegin(),
                p.getEnd());
    }
    // Sentence
    for (Sentence s : select(aJCas, Sentence.class)) {
        view.newAnnotation(id(SENTENCE, s), Discriminators.Uri.SENTENCE, s.getBegin(),
                s.getEnd());
    }
    // Token, POS, Lemma — POS and lemma become features on the token annotation.
    for (Token t : select(aJCas, Token.class)) {
        Annotation a = view.newAnnotation(id(TOKEN, t), Discriminators.Uri.TOKEN, t.getBegin(),
                t.getEnd());
        if (t.getPos() != null) {
            a.addFeature(Features.Token.POS, t.getPos().getPosValue());
        }
        if (t.getLemma() != null) {
            a.addFeature(Features.Token.LEMMA, t.getLemma().getValue());
        }
    }
    // NamedEntity
    for (NamedEntity neAnno : select(aJCas, NamedEntity.class)) {
        Annotation ne = view.newAnnotation(id(NAMED_ENTITY, neAnno), Discriminators.Uri.NE,
                neAnno.getBegin(), neAnno.getEnd());
        ne.setLabel(neAnno.getValue());
    }
    // Dependency — one DependencyStructure per sentence referencing its relation ids.
    for (Sentence s : select(aJCas, Sentence.class)) {
        Set<String> depRelIds = new TreeSet<>();
        for (Dependency dep : selectCovered(Dependency.class, s)) {
            String depRelId = id(DEPENDENCY, dep);
            // LAPPS dependencies inherit from Relation which has no offsets
            Annotation depRel = view.newAnnotation(depRelId, Discriminators.Uri.DEPENDENCY);
            depRel.setLabel(dep.getDependencyType());
            depRel.addFeature(Features.Dependency.GOVERNOR, id(TOKEN, dep.getGovernor()));
            depRel.addFeature(Features.Dependency.DEPENDENT, id(TOKEN, dep.getDependent()));
            depRelIds.add(depRelId);
        }
        if (!depRelIds.isEmpty()) {
            Annotation depStruct = view.newAnnotation(id(DEPENDENCY_STRUCTURE, s),
                    Discriminators.Uri.DEPENDENCY_STRUCTURE, s.getBegin(), s.getEnd());
            depStruct.addFeature(Features.DependencyStructure.DEPENDENCIES, depRelIds);
        }
    }
    // Constituents — one PhraseStructure per parse tree root.
    for (ROOT r : select(aJCas, ROOT.class)) {
        Set<String> constituents = new LinkedHashSet<>();
        convertConstituent(view, r, constituents);
        Annotation phraseStruct = view.newAnnotation(id(PHRASE_STRUCTURE, r),
                Discriminators.Uri.PHRASE_STRUCTURE, r.getBegin(), r.getEnd());
        phraseStruct.addFeature(Features.PhraseStructure.CONSTITUENTS, constituents);
    }
    // Write the container as pretty-printed JSON to the output stream.
    try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) {
        String json = Serializer.toPrettyJson(container);
        IOUtils.write(json, docOS, encoding);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
}
/**
 * Verifies a single token against the expected {@link TestTokenInfo}: covered text, offsets,
 * lemma, POS type and value, NER annotation (at most one), and the set of incoming
 * dependencies recorded in {@code governors}.
 *
 * @param token the token to verify
 * @param info the expected values
 * @throws LAPVerificationException on the first mismatch found
 */
private void verifyToken(Token token, TestTokenInfo info) throws LAPVerificationException {
    if (!info.text.equals(token.getCoveredText()))
        throw new LAPVerificationException(
            "Bad token text for " + info.id + ":" + info.text + ", expected \"" + info.text
                + "\", got \"" + token.getCoveredText() + "\"");
    if (info.begin != token.getBegin())
        throw new LAPVerificationException(
            "Bad token begin index for " + info.id + ":" + info.text + ", expected "
                + info.begin + ", got " + token.getBegin());
    if (info.end != token.getEnd())
        throw new LAPVerificationException(
            "Bad token end index for " + info.id + ":" + info.text + ", expected " + info.end
                + ", got " + token.getEnd());
    if (!info.lemma.equals(token.getLemma().getValue()))
        throw new LAPVerificationException(
            "Bad token lemma for " + info.id + ":" + info.text + ", expected \"" + info.lemma
                + "\", got \"" + token.getLemma().getValue() + "\"");
    if (!info.posType.equals(token.getPos().getType().getShortName()))
        throw new LAPVerificationException(
            "Bad token POS type for " + info.id + ":" + info.text + ", expected "
                + info.posType + ", got " + token.getPos().getType().getShortName());
    if (!info.posValue.equals(token.getPos().getPosValue()))
        throw new LAPVerificationException(
            "Bad token POS value for " + info.id + ":" + info.text + ", expected \""
                + info.posValue + "\", got \"" + token.getPos().getPosValue() + "\"");
    // At most one NER annotation may cover the token; none yields nerType == null.
    String nerType = null;
    List<NamedEntity> ners = JCasUtil.selectCovered(NamedEntity.class, token);
    if (ners.size() == 1) {
        nerType = ners.get(0).getType().getShortName();
    } else if (ners.size() > 1) {
        throw new LAPVerificationException(
            "Got more than one NER annotation for " + info.id + ":" + info.text + " - " + ners);
    }
    // Objects.equals handles the "no NER expected" (null) case symmetrically.
    if (!Objects.equals(info.nerType, nerType))
        throw new LAPVerificationException(
            "Bad token NER value for " + info.id + ":" + info.text + ", expected \""
                + info.nerType + "\", got \"" + nerType + "\"");
    // Compare as sets: dependency order is irrelevant.
    Set<TestDependencyInfo> infoDependencies =
        new HashSet<TestDependencyInfo>(Arrays.asList(info.dependencies));
    if (!infoDependencies.equals(governors.get(token)))
        throw new LAPVerificationException(
            "Bad token dependencies for " + info.id + ":" + info.text + ", expected "
                + infoDependencies + ", got " + governors.get(token));
    System.out.println("Verified token: " + info);
}
/**
 * Removes annotations of the configured feature path whose (associated) POS class is disabled
 * via the corresponding flag (adj, adv, art, ...). When a Token is removed, its lemma, stem
 * and POS annotations are removed as well; when a feature annotation (lemma/stem/POS) is
 * removed, the owning Token's reference to it is cleared.
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    getContext().getLogger().log(Level.CONFIG, "Entering " + this.getClass().getSimpleName());
    Type tokenType = jcas.getCas().getTypeSystem().getType(Token.class.getCanonicalName());
    Type stemType = jcas.getCas().getTypeSystem().getType(Stem.class.getCanonicalName());
    Type lemmaType = jcas.getCas().getTypeSystem().getType(Lemma.class.getCanonicalName());
    Type posType = jcas.getCas().getTypeSystem().getType(POS.class.getCanonicalName());
    Type typeToRemoveType = jcas.getCas().getTypeSystem().getType(typeToRemove);
    if (typeToRemoveType == null) {
        throw new AnalysisEngineProcessException(
            new Throwable("Could not get type for feature path: " + typeToRemove));
    }
    // Collect candidates first; the index must not be modified while iterating.
    List<AnnotationFS> toRemove = new ArrayList<AnnotationFS>();
    try {
        for (Entry<AnnotationFS, String> entry
                : FeaturePathFactory.select(jcas.getCas(), typeToRemove)) {
            AnnotationFS annotation = entry.getKey();
            // Determine the POS annotation to classify by: the annotation itself if we are
            // removing POS annotations, otherwise the POS associated with it.
            AnnotationFS pos;
            if (typeToRemoveType.equals(posType)) {
                pos = annotation;
            } else {
                pos = getAnnotation(posType, annotation);
                if (pos == null) {
                    continue;
                }
            }
            // The coarse POS class is encoded in the annotation's type short name.
            String posString = pos.getType().getShortName();
            if (posString.equals("ADJ") && !adj) {
                toRemove.add(annotation);
                continue;
            }
            if (posString.equals("ADV") && !adv) {
                toRemove.add(annotation);
                continue;
            }
            if (posString.equals("ART") && !art) {
                toRemove.add(annotation);
                continue;
            }
            if (posString.equals("CARD") && !card) {
                toRemove.add(annotation);
                continue;
            }
            if (posString.equals("CONJ") && !conj) {
                toRemove.add(annotation);
                continue;
            }
            if ((posString.equals("N") || posString.equals("NN") || posString.equals("NP"))
                    && !n) {
                toRemove.add(annotation);
                continue;
            }
            if (posString.equals("O") && !o) {
                toRemove.add(annotation);
                continue;
            }
            if (posString.equals("PP") && !pp) {
                toRemove.add(annotation);
                continue;
            }
            if (posString.equals("PR") && !pr) {
                toRemove.add(annotation);
                continue;
            }
            if (posString.equals("PUNC") && !punc) {
                toRemove.add(annotation);
                continue;
            }
            if (posString.equals("V") && !v) {
                toRemove.add(annotation);
                continue;
            }
        }
    } catch (FeaturePathException e) {
        throw new AnalysisEngineProcessException(e);
    }
    for (AnnotationFS fs : toRemove) {
        // If we want to remove tokens, we also remove accompanying lemma, stem, POS tag.
        if (fs.getType().equals(tokenType)) {
            AnnotationFS stemFS = getAnnotation(stemType, fs);
            if (stemFS != null) {
                jcas.getCas().removeFsFromIndexes(stemFS);
            }
            AnnotationFS lemmaFS = getAnnotation(lemmaType, fs);
            if (lemmaFS != null) {
                jcas.getCas().removeFsFromIndexes(lemmaFS);
            }
            AnnotationFS posFS = getAnnotation(posType, fs);
            if (posFS != null) {
                jcas.getCas().removeFsFromIndexes(posFS);
            }
        }
        // We don't want to keep the feature in the token, remove it here.
        else {
            if (fs.getType().equals(stemType) || fs.getType().equals(lemmaType)) {
                Token token = (Token) getAnnotation(tokenType, fs);
                if (token != null) {
                    // Clear the token's stem/lemma feature by its base name.
                    String fbn = fs.getType().getShortName().toLowerCase();
                    Feature f = tokenType.getFeatureByBaseName(fbn);
                    token.setFeatureValue(f, null);
                }
            } else if (fs instanceof POS) {
                Token token = (Token) getAnnotation(tokenType, fs);
                if (token != null) {
                    token.setPos(null);
                }
            }
        }
        jcas.getCas().removeFsFromIndexes(fs);
    }
}
/**
 * Prints each token together with its POS tag, one token per line, tab-separated.
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    for (Token token : select(aJCas, Token.class)) {
        // println uses the platform line separator, same as %n in printf.
        System.out.println(token.getCoveredText() + "\t" + token.getPos().getPosValue());
    }
}