/** * Extracts the number (sg/pl) from the Token. To be applied only on nouns / pronouns. * * @param token * @return */ public String getNumber(Token token) { String pos = token.getPos().getPosValue(); if (!isNounOrPronoun(token)) { System.err.println( "Use method only for nouns / pronouns. " + pos + " " + token.getCoveredText()); // throw new IllegalArgumentException(); return "unknown"; // occurs e.g. for 'there' (existential) } if (pos.matches("NNP?S")) { return "pl"; } if (pos.matches("NNP?")) { return "sg"; } if (pos.matches("PRP\\$?|CD")) { String lemma = token.getLemma().getValue().toLowerCase(); if (lemma.matches( "I|me|myself|he|him|himself|she|her|herself|it|itself|one|onself|mine|thine|his|hers")) { return "sg"; } if (lemma.matches( "we|us|ourselves|ourself|yourselves|they|them|themselves|theirselves|theirs|ours")) { return "pl"; } } return "unknown"; }
/** * Extracts the noun type from the POS tag. Returns proper/common/pronoun. * * @param token * @return */ public static String getNounType(Token token) { if (!isNounOrPronoun(token)) { if (token.getPos().getPosValue().equals("EX")) { return "unknown"; // existential 'there' } else { System.err.println("Use getPerson method only for nouns / pronouns."); throw new IllegalArgumentException(); } } if (token.getPos().getPosValue().matches("NNPS?")) { return "proper"; } if (token.getPos().getPosValue().matches("NNS?")) { return "common"; } if (isPronoun(token.getPos().getPosValue(), token.getLemma().getValue())) { return "pronoun"; } return "unknown"; }
public static boolean isNounOrPronoun(Token token) { String pos = token.getPos().getPosValue(); // JJ: allows things like "British" / "Australian" which are marked as // NEs in ACE if (!(pos.startsWith("N") || pos.matches("PRP\\$?|CD|JJS?") || pos.matches("DT|WHNP|WP|PRP$?") || (pos.matches("WDT|WP") && token.getLemma().getValue().matches("who|which|that")))) { return false; } return true; }
/**
 * Looks up the countability entry for a noun, keyed by its lower-cased lemma, in the {@code
 * countability} map (Celex database of English nouns).
 *
 * @param token the token to look up; its POS must be set, and its lemma must be set for nouns
 * @return {@code "NO-NOUN"} if the POS tag does not start with {@code N}; otherwise the stored
 *     entry for the lemma, or {@code "none"} if the map has no such key
 * @throws IllegalStateException if Celex usage is not enabled ({@code USE_CELEX} is false)
 */
public String getCountability(Token token) {
  if (!USE_CELEX) {
    System.err.println(
        "This should never happen, don't call this function if you did not configure to use Celex!");
    throw new IllegalStateException();
  }

  boolean nounTag = token.getPos().getPosValue().startsWith("N");
  if (!nounTag) {
    return "NO-NOUN";
  }

  String key = token.getLemma().getValue().toLowerCase();
  return countability.containsKey(key) ? countability.get(key) : "none";
}
/** * Returns true if the Token is a bare plural (definition by Reiter: excludes the quantified cases * -- different from Suh!!). * * @param jCas * @param token * @return */ public static Boolean isBarePlural( JCas jCas, Token token, HashMap<Token, Set<Dependency>> childNodeMap) { // is it a plural? String pos = token.getPos().getPosValue(); if (!pos.matches("NNP?S")) { return false; } if (!childNodeMap.containsKey(token)) { return true; } for (Dependency dep : childNodeMap.get(token)) { if (dep.getGovernor() == token && dep.getDependencyType().matches("det|poss")) { return false; } } return true; }
/** * Extracts person from Token. To be applied only on nouns / pronouns. * * @param token * @return */ public String getPerson(Token token) { if (!isNounOrPronoun(token)) { if (token.getPos().getPosValue().equals("EX")) { return "3"; // existential 'there' } else { System.err.println("Use getPerson method only for nouns / pronouns."); throw new IllegalArgumentException(); } } String lemma = token.getLemma().getValue().toLowerCase(); String person = "3"; if (lemma.matches("i|we|me|us|myself|ourselves|ourself")) { person = "1"; } else if (lemma.matches("you|ye|thou|thee|yourself|thyself|yourselves|yourself")) { person = "2"; } return person; }
/**
 * Handles completion of processing for one CAS. Does nothing when no status is supplied. When the
 * status reports an exception, the stack trace of every reported exception is printed to standard
 * error. In either case, every token of the CAS is then printed to standard output as {@code
 * "<text> <POS value>"}.
 *
 * @param aCas the processed CAS
 * @param aStatus the processing status, may be {@code null}
 */
public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) {
  if (aStatus == null) {
    return;
  }

  if (aStatus.isException()) {
    System.err.println("Error on process CAS call to remote service:");
    List<Exception> reported = aStatus.getExceptions();
    for (Exception failure : reported) {
      failure.printStackTrace();
    }
  }

  try {
    JCas jcasView = aCas.getJCas();
    for (Token current : JCasUtil.select(jcasView, Token.class)) {
      String tag = current.getPos().getPosValue();
      System.out.println(current.getCoveredText() + " " + tag);
    }
  } catch (CASException e) {
    e.printStackTrace();
  }
}
@Override public void process(JCas aJCas) throws AnalysisEngineProcessException { // Convert UIMA to LIF Container Container container = new Container(); container.setLanguage(aJCas.getDocumentLanguage()); container.setText(aJCas.getDocumentText()); View view = container.newView(); // Paragraph for (Paragraph p : select(aJCas, Paragraph.class)) { view.newAnnotation(id(PARAGRAPH, p), Discriminators.Uri.PARAGRAPH, p.getBegin(), p.getEnd()); } // Sentence for (Sentence s : select(aJCas, Sentence.class)) { view.newAnnotation(id(SENTENCE, s), Discriminators.Uri.SENTENCE, s.getBegin(), s.getEnd()); } // Token, POS, Lemma for (Token t : select(aJCas, Token.class)) { Annotation a = view.newAnnotation(id(TOKEN, t), Discriminators.Uri.TOKEN, t.getBegin(), t.getEnd()); if (t.getPos() != null) { a.addFeature(Features.Token.POS, t.getPos().getPosValue()); } if (t.getLemma() != null) { a.addFeature(Features.Token.LEMMA, t.getLemma().getValue()); } } // NamedEntity for (NamedEntity neAnno : select(aJCas, NamedEntity.class)) { Annotation ne = view.newAnnotation( id(NAMED_ENTITY, neAnno), Discriminators.Uri.NE, neAnno.getBegin(), neAnno.getEnd()); ne.setLabel(neAnno.getValue()); } // Dependency for (Sentence s : select(aJCas, Sentence.class)) { Set<String> depRelIds = new TreeSet<>(); for (Dependency dep : selectCovered(Dependency.class, s)) { String depRelId = id(DEPENDENCY, dep); // LAPPS dependencies inherit from Relation which has no offsets Annotation depRel = view.newAnnotation(depRelId, Discriminators.Uri.DEPENDENCY); depRel.setLabel(dep.getDependencyType()); depRel.addFeature(Features.Dependency.GOVERNOR, id(TOKEN, dep.getGovernor())); depRel.addFeature(Features.Dependency.DEPENDENT, id(TOKEN, dep.getDependent())); depRelIds.add(depRelId); } if (!depRelIds.isEmpty()) { Annotation depStruct = view.newAnnotation( id(DEPENDENCY_STRUCTURE, s), Discriminators.Uri.DEPENDENCY_STRUCTURE, s.getBegin(), s.getEnd()); depStruct.addFeature(Features.DependencyStructure.DEPENDENCIES, 
depRelIds); } } // Constituents for (ROOT r : select(aJCas, ROOT.class)) { Set<String> constituents = new LinkedHashSet<>(); convertConstituent(view, r, constituents); Annotation phraseStruct = view.newAnnotation( id(PHRASE_STRUCTURE, r), Discriminators.Uri.PHRASE_STRUCTURE, r.getBegin(), r.getEnd()); phraseStruct.addFeature(Features.PhraseStructure.CONSTITUENTS, constituents); } try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) { String json = Serializer.toPrettyJson(container); IOUtils.write(json, docOS, encoding); } catch (Exception e) { throw new AnalysisEngineProcessException(e); } }
/**
 * Prints every token of the CAS to standard output, one per line, as the covered text and the POS
 * value separated by a tab.
 *
 * @param aJCas the CAS whose tokens are printed; every token must have a POS set
 * @throws AnalysisEngineProcessException declared by the overridden method; not thrown here
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
  for (Token current : select(aJCas, Token.class)) {
    String surface = current.getCoveredText();
    String tag = current.getPos().getPosValue();
    System.out.format("%s\t%s%n", surface, tag);
  }
}
/**
 * Verifies a token against the expected values in {@code info}. Checked in order: covered text,
 * begin and end offsets, lemma, POS type (short type name), POS value, the short type name of the
 * single covering named entity (or {@code null} if there is none), and the dependency set recorded
 * for the token in {@code governors}. On success a confirmation line is printed to standard
 * output.
 *
 * @param token the token to verify
 * @param info the expected token properties
 * @throws LAPVerificationException on the first mismatch, or if more than one named entity is
 *     found within the token
 */
private void verifyToken(Token token, TestTokenInfo info) throws LAPVerificationException {
  // Shared "<id>:<text>" label used in every failure message.
  String label = info.id + ":" + info.text;

  String actualText = token.getCoveredText();
  if (!info.text.equals(actualText)) {
    throw new LAPVerificationException(
        "Bad token text for " + label + ", expected \"" + info.text + "\", got \"" + actualText
            + "\"");
  }

  if (info.begin != token.getBegin()) {
    throw new LAPVerificationException(
        "Bad token begin index for " + label + ", expected " + info.begin + ", got "
            + token.getBegin());
  }

  if (info.end != token.getEnd()) {
    throw new LAPVerificationException(
        "Bad token end index for " + label + ", expected " + info.end + ", got "
            + token.getEnd());
  }

  String actualLemma = token.getLemma().getValue();
  if (!info.lemma.equals(actualLemma)) {
    throw new LAPVerificationException(
        "Bad token lemma for " + label + ", expected \"" + info.lemma + "\", got \""
            + actualLemma + "\"");
  }

  String actualPosType = token.getPos().getType().getShortName();
  if (!info.posType.equals(actualPosType)) {
    throw new LAPVerificationException(
        "Bad token POS type for " + label + ", expected " + info.posType + ", got "
            + actualPosType);
  }

  String actualPosValue = token.getPos().getPosValue();
  if (!info.posValue.equals(actualPosValue)) {
    throw new LAPVerificationException(
        "Bad token POS value for " + label + ", expected \"" + info.posValue + "\", got \""
            + actualPosValue + "\"");
  }

  // At most one named entity may be found for the token; none maps to a null type.
  List<NamedEntity> covered = JCasUtil.selectCovered(NamedEntity.class, token);
  if (covered.size() > 1) {
    throw new LAPVerificationException(
        "Got more than one NER annotation for " + label + " - " + covered);
  }
  String nerType = covered.isEmpty() ? null : covered.get(0).getType().getShortName();
  if (!Objects.equals(info.nerType, nerType)) {
    throw new LAPVerificationException(
        "Bad token NER value for " + label + ", expected \"" + info.nerType + "\", got \""
            + nerType + "\"");
  }

  // Dependencies are compared as sets, so their order in info.dependencies is irrelevant.
  Set<TestDependencyInfo> expectedDependencies = new HashSet<>(Arrays.asList(info.dependencies));
  if (!expectedDependencies.equals(governors.get(token))) {
    throw new LAPVerificationException(
        "Bad token dependencies for " + label + ", expected " + expectedDependencies + ", got "
            + governors.get(token));
  }

  System.out.println("Verified token: " + info);
}