public static final String doCorefResolution(Annotation annotation) {
  Map<Integer, CorefChain> corefs = annotation.get(CorefChainAnnotation.class);
  List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
  List<String> resolved = new ArrayList<String>();
  for (CoreMap sentence : sentences) {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    for (CoreLabel token : tokens) {
      Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
      CorefChain chain = corefs.get(corefClustId);
      if (chain == null) {
        // token is not part of any coreference chain; keep it as-is
        resolved.add(token.word());
      } else {
        CorefMention reprMent = chain.getRepresentativeMention();
        int sentIdx = reprMent.sentNum - 1; // sentence numbers are 1-based
        CoreMap corefSentence = sentences.get(sentIdx);
        List<CoreLabel> corefSentenceTokens = corefSentence.get(TokensAnnotation.class);
        // replace the token with the representative mention unless the token is
        // already inside that mention's span
        if (token.index() < reprMent.startIndex || token.index() > reprMent.endIndex) {
          for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) {
            CoreLabel matchedLabel = corefSentenceTokens.get(i - 1); // token indices are 1-based
            resolved.add(matchedLabel.word());
          }
        } else {
          resolved.add(token.word());
        }
      }
    }
  }
  StringBuilder resolvedStr = new StringBuilder();
  for (String str : resolved) {
    resolvedStr.append(str).append(' ');
  }
  return resolvedStr.toString();
}
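// Usage sketch (not part of the original source): doCorefResolution expects an Annotation
// produced by a pipeline that includes coreference resolution. The annotator list, the class
// name, and the example text below are illustrative assumptions, not the original setup.
import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class CorefResolutionDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    // deterministic coreference ("dcoref") is assumed here; newer releases also offer "coref"
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation annotation = new Annotation("John met Mary. He gave her a book.");
    pipeline.annotate(annotation);

    // assumes doCorefResolution (defined above) is reachable, e.g. via a static import
    System.out.println(doCorefResolution(annotation));
  }
}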
/** Speaker extraction */
private void findSpeakers(Dictionaries dict) {
  Boolean useMarkedDiscourseBoolean =
      annotation.get(CoreAnnotations.UseMarkedDiscourseAnnotation.class);
  boolean useMarkedDiscourse =
      (useMarkedDiscourseBoolean != null) ? useMarkedDiscourseBoolean : false;

  if (Constants.USE_GOLD_SPEAKER_TAGS || useMarkedDiscourse) {
    for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
        int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class);
        speakers.put(utterIndex, w.get(CoreAnnotations.SpeakerAnnotation.class));
      }
    }
  } else {
    if (docType == DocType.CONVERSATION) findSpeakersInConversation(dict);
    else if (docType == DocType.ARTICLE) findSpeakersInArticle(dict);

    // set speaker info to annotation
    for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
        int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class);
        if (speakers.containsKey(utterIndex)) {
          w.set(CoreAnnotations.SpeakerAnnotation.class, speakers.get(utterIndex));
        }
      }
    }
  }
}
public static void addFigerAnnotationToDocument(Annotation d) throws SQLException {
  List<CoreMap> sentences = d.get(CoreAnnotations.SentencesAnnotation.class);
  Set<String> entityIds = new HashSet<String>();
  for (CoreMap sen : sentences) {
    List<Triple<Pair<Integer, Integer>, String, Float>> nelAnnotation =
        sen.get(NamedEntityLinkingAnnotation.class);
    for (Triple<Pair<Integer, Integer>, String, Float> t : nelAnnotation) {
      String id = t.second;
      if (!id.equals("null")) {
        entityIds.add(id);
      }
    }
  }
  Map<String, Set<String>> idTypeMap = bigQuery(entityIds);
  // add type onto sentences
  for (CoreMap sen : sentences) {
    List<Triple<Pair<Integer, Integer>, String, Float>> nelAnnotation =
        sen.get(NamedEntityLinkingAnnotation.class);
    List<Triple<Set<String>, Integer, Integer>> figerData = new ArrayList<>();
    for (Triple<Pair<Integer, Integer>, String, Float> t : nelAnnotation) {
      Integer start = t.first.first;
      Integer end = t.first.second;
      Set<String> types = null;
      if (!t.second.equals("null")) {
        types = idTypeMap.get(GuidMidConversion.convertBackward(t.second));
      }
      Triple<Set<String>, Integer, Integer> figerTrip = new Triple<>(types, start, end);
      figerData.add(figerTrip);
    }
    sen.set(FigerAnnotation.class, figerData);
  }
}
public static void main(String[] args) {
  SentenceDAO sentenceDAO = new SentenceDAOImpl();
  List<Sentence> sentences = sentenceDAO.findAll();
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  int i = 0; // progress counter
  for (Sentence sentence : sentences) {
    if (sentence.getPredicate() == null) {
      try {
        System.out.println(i++);
        String text = sentence.getContent();
        Annotation annotation = new Annotation(text);
        pipeline.annotate(annotation);
        for (CoreMap core : annotation.get(SentencesAnnotation.class)) {
          SemanticGraph graph = core.get(CollapsedCCProcessedDependenciesAnnotation.class);
          // use the lemma of the dependency root as the sentence predicate
          sentence.setPredicate(graph.getFirstRoot().lemma());
        }
        sentenceDAO.save(sentence);
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }
}
private void findSpeakersInConversation(Dictionaries dict) {
  for (List<Mention> l : predictedOrderedMentionsBySentence) {
    for (Mention m : l) {
      if (m.predicateNominatives == null) continue;
      for (Mention a : m.predicateNominatives) {
        if (a.spanToString().toLowerCase().equals("i")) {
          speakers.put(
              m.headWord.get(CoreAnnotations.UtteranceAnnotation.class),
              Integer.toString(m.mentionID));
        }
      }
    }
  }
  List<CoreMap> paragraph = new ArrayList<CoreMap>();
  int paragraphUtterIndex = 0;
  String nextParagraphSpeaker = "";
  int paragraphOffset = 0;
  for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    int currentUtter =
        sent.get(CoreAnnotations.TokensAnnotation.class)
            .get(0)
            .get(CoreAnnotations.UtteranceAnnotation.class);
    if (paragraphUtterIndex != currentUtter) {
      nextParagraphSpeaker =
          findParagraphSpeaker(
              paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
      paragraphUtterIndex = currentUtter;
      paragraphOffset += paragraph.size();
      paragraph = new ArrayList<CoreMap>();
    }
    paragraph.add(sent);
  }
  findParagraphSpeaker(paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
}
private void findSpeakersInArticle(Dictionaries dict) {
  List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
  Pair<Integer, Integer> beginQuotation = new Pair<Integer, Integer>();
  Pair<Integer, Integer> endQuotation = new Pair<Integer, Integer>();
  boolean insideQuotation = false;
  int utterNum = -1;
  for (int i = 0; i < sentences.size(); i++) {
    List<CoreLabel> sent = sentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
    for (int j = 0; j < sent.size(); j++) {
      int utterIndex = sent.get(j).get(CoreAnnotations.UtteranceAnnotation.class);
      if (utterIndex != 0 && !insideQuotation) {
        utterNum = utterIndex;
        insideQuotation = true;
        beginQuotation.setFirst(i);
        beginQuotation.setSecond(j);
      } else if (utterIndex == 0 && insideQuotation) {
        insideQuotation = false;
        endQuotation.setFirst(i);
        endQuotation.setSecond(j);
        findQuotationSpeaker(utterNum, sentences, beginQuotation, endQuotation, dict);
      }
    }
  }
}
public static List<String> lemmatizeDocument(String documentText) {
  if (pipeline == null) {
    loadModels();
  }
  List<String> lemmas = new LinkedList<>();
  // create an empty Annotation just with the given text
  Annotation document = new Annotation(documentText);
  // run all Annotators on this text
  pipeline.annotate(document);
  // iterate over all of the sentences found
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    // iterate over all tokens in a sentence and collect the lemma of each word
    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
      lemmas.add(token.get(CoreAnnotations.LemmaAnnotation.class));
    }
  }
  return lemmas;
}
/**
 * Lemmatizes the given text.
 *
 * @param t the text to lemmatize
 * @return the lemmas of all tokens, joined by single spaces
 */
public static String lemmatize(String t) {
  if (pipeline == null) {
    loadModels();
  }
  StringBuilder lemma = new StringBuilder();
  try {
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(t);
    // run all Annotators on this text
    pipeline.annotate(document);
    // iterate over all of the sentences found
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      // iterate over all tokens in a sentence and append each lemma
      for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        lemma.append(' ').append(token.get(CoreAnnotations.LemmaAnnotation.class));
      }
    }
  } catch (Exception e) {
    System.err.println("Stanford lemmatizer failed on input: " + t);
  }
  return lemma.toString().trim();
}
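// Note (assumption, not from the original source): lemmatizeDocument and lemmatize above call a
// static `pipeline` field and a `loadModels()` method that are not shown in this collection.
// A minimal sketch of what that initialization might look like; the annotator list is a guess
// sufficient for lemmatization (the enclosing class would need imports for
// edu.stanford.nlp.pipeline.StanfordCoreNLP and java.util.Properties):
private static StanfordCoreNLP pipeline;

private static synchronized void loadModels() {
  if (pipeline == null) {
    Properties props = new Properties();
    // tokenize, ssplit and pos are prerequisites of the lemma annotator
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
    pipeline = new StanfordCoreNLP(props);
  }
}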
public Map<Integer, Integer> getGeneSpans(String text) {
  Map<Integer, Integer> begin2end = new HashMap<Integer, Integer>();
  Annotation document = new Annotation(text);
  pipeline.annotate(document);
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    // collect maximal runs of consecutive NN* tokens as candidate gene spans
    List<CoreLabel> candidate = new ArrayList<CoreLabel>();
    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
      String pos = token.get(PartOfSpeechAnnotation.class);
      if (pos.startsWith("NN")) {
        candidate.add(token);
      } else if (candidate.size() > 0) {
        // the run ended: record its character offsets
        int begin = candidate.get(0).beginPosition();
        int end = candidate.get(candidate.size() - 1).endPosition();
        begin2end.put(begin, end);
        candidate.clear();
      }
    }
    // flush a run that extends to the end of the sentence
    if (candidate.size() > 0) {
      int begin = candidate.get(0).beginPosition();
      int end = candidate.get(candidate.size() - 1).endPosition();
      begin2end.put(begin, end);
      candidate.clear();
    }
  }
  return begin2end;
}
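// Hypothetical usage of getGeneSpans (the enclosing object and the example text are assumptions):
// the method only needs tokenization, sentence splitting and POS tags, so a pipeline built with
// "tokenize, ssplit, pos" is sufficient. Spans are keyed by character begin offset, with the
// character end offset as the value.
Map<Integer, Integer> spans = getGeneSpans("The BRCA1 gene and the TP53 tumor suppressor were studied.");
for (Map.Entry<Integer, Integer> span : spans.entrySet()) {
  System.out.println("candidate span: [" + span.getKey() + ", " + span.getValue() + ")");
}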
private static List<AnaphorWithReferent> parseText(InputText text) {
  Annotation annotatedText = new Annotation(text.toString());
  Container.getStanfordCoreNLP().annotate(annotatedText);
  List<CoreMap> coreMapSentences = annotatedText.get(CoreAnnotations.SentencesAnnotation.class);
  List<Tree> trees =
      coreMapSentences
          .stream()
          .map(s -> s.get(TreeCoreAnnotations.TreeAnnotation.class))
          .collect(Collectors.toList());
  List<Sentence> allSentences =
      IntStream.range(0, trees.size())
          .mapToObj(
              id ->
                  new Sentence(
                      id,
                      trees.get(id),
                      Container.getNPsFromParseTreeExtractor().extract(trees.get(id))))
          .collect(Collectors.toList());
  List<AnaphorWithReferent> anaphoraWithReferentFromAllSentences =
      allSentences
          .stream()
          .map(s -> Container.getAllAnaphorWithReferentPerSentenceFinder().find(s, allSentences))
          .flatMap(a -> a.stream())
          .collect(Collectors.toList());
  return anaphoraWithReferentFromAllSentences;
}
public List<NLPInfo> analyze(String text) {
  Annotation document = new Annotation(text);
  pipeline.annotate(document);
  List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
  if (sentences == null || sentences.isEmpty()) {
    return null;
  }
  List<NLPInfo> res = new ArrayList<NLPInfo>();
  for (CoreMap sentence : sentences) {
    NLPInfo info = new NLPInfo();
    for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
      NLPToken tokenInfo = new NLPToken();
      tokenInfo.setWord(token.get(CoreAnnotations.TextAnnotation.class));
      tokenInfo.setTag(token.get(CoreAnnotations.PartOfSpeechAnnotation.class));
      tokenInfo.setNer(token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
      info.appendToken(tokenInfo);
    }
    res.add(info);
  }
  return res;
}
public static ArrayList<String[]> extractNounPhrases(
    StanfordCoreNLP pipeline, String text, int searchRange) {
  ArrayList<String[]> wordPairs = new ArrayList<String[]>();
  Annotation document = new Annotation(text);
  pipeline.annotate(document);
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  MAX_STEPS = searchRange;
  for (CoreMap sentence : sentences) {
    List<CoreLabel> labels = sentence.get(TokensAnnotation.class);
    // check whether the sentence contains a negation word
    boolean hasNegation = false;
    for (CoreLabel label : labels) {
      if (NEGATIONS.contains(label.lemma().toLowerCase())) {
        hasNegation = true;
      }
    }
    for (int idx = 0; idx < labels.size(); idx++) {
      CoreLabel label = labels.get(idx);
      if (NN_TAGS.contains(label.get(PartOfSpeechAnnotation.class))) {
        // search outwards from the noun for the nearest adjective within MAX_STEPS tokens
        for (int step = 1; step <= MAX_STEPS; step++) {
          CoreLabel leftLabel = labels.get(Math.max(0, idx - step));
          if (JJ_TAGS.contains(leftLabel.tag())) {
            String adj = leftLabel.get(LemmaAnnotation.class);
            addPair(wordPairs, hasNegation ? NOT_PREFIX + adj : adj, label.get(LemmaAnnotation.class));
            break;
          }
          CoreLabel rightLabel = labels.get(Math.min(idx + step, labels.size() - 1));
          if (JJ_TAGS.contains(rightLabel.tag())) {
            String adj = rightLabel.get(LemmaAnnotation.class);
            addPair(wordPairs, hasNegation ? NOT_PREFIX + adj : adj, label.get(LemmaAnnotation.class));
            break;
          }
        }
      }
    }
  }
  return wordPairs;
}
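// Hypothetical usage of extractNounPhrases: lemmas and POS tags are required, so the pipeline
// needs at least "tokenize, ssplit, pos, lemma" (an assumption about the intended setup).
// addPair is called with the adjective lemma (possibly NOT_-prefixed) first and the noun lemma second.
Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
ArrayList<String[]> pairs = extractNounPhrases(pipeline, "The food was not very tasty.", 3);
for (String[] pair : pairs) {
  System.out.println(java.util.Arrays.toString(pair));
}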
private void handleCvtBinary(FormulaGenerationInfo fgInfo, Set<String> res) {
  String description1 = normalizeFbDescription(fgInfo.bInfo.descriptions.get(0));
  String description2 = normalizeFbDescription(fgInfo.bInfo.descriptions.get(1));
  Annotation a1 = getAnnotation(description1);
  Annotation a2 = getAnnotation(description2);
  String question =
      generateCvtQuestion(
          fgInfo,
          description1,
          description2,
          a1.get(SentencesAnnotation.class).get(0).get(TreeAnnotation.class).firstChild(),
          a2.get(SentencesAnnotation.class).get(0).get(TreeAnnotation.class).firstChild(),
          getPosTagsFromAnnotation(a1),
          getPosTagsFromAnnotation(a2));
  if (question != null) res.add(question);
}
public static void main(String[] args) throws IOException {
  SUTimePipeline pipeline = new SUTimePipeline();
  Annotator timeAnnotator = pipeline.getTimeAnnotator("sutime", new Properties());
  BufferedReader is = new BufferedReader(new InputStreamReader(System.in));
  System.out.print("> ");
  for (String line; (line = is.readLine()) != null; ) {
    Annotation ann = pipeline.process(line, null, timeAnnotator);
    System.out.println(ann.get(TimeAnnotations.TimexAnnotations.class));
    System.out.print("> ");
  }
}
public static void describe(DBObject doc, StanfordCoreNLP pipeline) {
  // create an empty Annotation just with the given text
  Annotation document = new Annotation((String) doc.get("cleansed_text"));
  BasicDBObject m_doc = new BasicDBObject();
  // run all Annotators on this text
  pipeline.annotate(document);
  // these are all the sentences in this document;
  // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    // traversing the words in the current sentence;
    // a CoreLabel is a CoreMap with additional token-specific methods
    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
      // this is the text of the token
      String word = token.get(TextAnnotation.class);
      // this is the POS tag of the token
      String pos = token.get(PartOfSpeechAnnotation.class);
      // this is the NER label of the token
      String ne = token.get(NamedEntityTagAnnotation.class);
    }
    // this is the parse tree of the current sentence
    Tree tree = sentence.get(TreeAnnotation.class);
    // this is the Stanford dependency graph of the current sentence
    SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
  }
  // This is the coreference link graph.
  // Each chain stores a set of mentions that link to each other,
  // along with a method for getting the most representative mention.
  // Both sentence and token offsets start at 1!
  Map<Integer, CorefChain> graph = document.get(CorefChainAnnotation.class);
}
private void handleNonCvtBinary(FormulaGenerationInfo fgInfo, Set<String> res) {
  String description = normalizeFbDescription(fgInfo.bInfo.descriptions.get(0));
  Annotation a = getAnnotation(description);
  String question =
      generateNonCvtQuestion(
          fgInfo,
          description,
          getPosTagsFromAnnotation(a),
          a.get(SentencesAnnotation.class).get(0).get(TreeAnnotation.class).firstChild(),
          fbFormulasInfo.isReversed(fgInfo.bInfo.formula));
  if (question != null) res.add(question);
}
/**
 * Finds the position of the sentence in the given document that achieves the best ROUGE-N scores
 * w.r.t. the reference summaries.
 *
 * @param task the document and the corresponding models
 * @return the position of the best sentence in the document
 */
public int getBestSentencePos(Task task) {
  Document document = task.getDocument();
  Annotation documentAnnotation = annotationProvider.getAnnotation(document.getContent());
  RougeN rouge = rougeFactory.make(task.getModels(), annotationProvider);
  BestSentenceSelector sentenceSelector = new BestSentenceSelector(rouge);
  Annotation bestAnnotation = sentenceSelector.select(documentAnnotation);
  CoreMap sentence = bestAnnotation.get(SentencesAnnotation.class).get(0);
  String bestPos = sentence.get(SentencePositionAnnotation.class);
  return Integer.parseInt(bestPos);
}
private void parseThread(ArrayList<Thread> threads) {
  for (Thread t : threads) {
    ThreadVector tv = new ThreadVector(t);
    allThreads.add(tv);
    for (Email e : t.getEmails()) {
      StringBuilder sb = new StringBuilder();
      for (Sentence s : e.getSentences()) {
        // keep only the original content of this email (not quoted text)
        if (s.getQuotationTimes() == 0) {
          sb.append(s.getText()).append(' ');
        }
      }
      String content = sb.toString().toLowerCase();
      // create an empty Annotation just with the given text
      Annotation document = new Annotation(content);
      // run all Annotators on this text
      this.pipeline.annotate(document);
      // iterate over all of the sentences found
      List<CoreMap> sentences = document.get(SentencesAnnotation.class);
      for (CoreMap sentence : sentences) {
        List<String> lemmas = new LinkedList<String>();
        // collect the lemma of every token in the sentence
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
          lemmas.add(token.get(LemmaAnnotation.class));
        }
        HashMap<String, Integer> wordCount = countWordsInSentence(lemmas);
        // if the sentence has valid words, update the dictionary and add a sentence vector
        if (wordCount.size() > 0) {
          totalSentenceNumber++;
          for (String word : wordCount.keySet()) {
            if (!dictionaryIndex.containsKey(word)) {
              dictionaryIndex.put(word, dictionaryIndex.size());
              dictionaryDocumentCount.put(word, 1);
            } else {
              dictionaryDocumentCount.put(word, dictionaryDocumentCount.get(word) + 1);
            }
          }
          SentenceVector sv = new SentenceVector(sentence.toString(), wordCount);
          tv.addSentenceVectors(sv);
        }
      }
    }
  }
}
public static Collection<String> lemmatize(String rawInput) {
  // consider presizing collections like this in other places too
  Collection<String> lemmas = Lists.newArrayListWithCapacity(30);
  Annotation rawInputAnnotation = new Annotation(rawInput);
  coreNlp.annotate(rawInputAnnotation);
  List<CoreLabel> allTokens = rawInputAnnotation.get(TokensAnnotation.class);
  for (CoreLabel eachToken : allTokens) {
    lemmas.add(eachToken.get(LemmaAnnotation.class));
  }
  return lemmas;
}
public static DependencyParse parse(String text) {
  if (pipeline == null) {
    loadModels();
  }
  DependencyParse parse = new DependencyParse();
  Annotation document = new Annotation(text);
  pipeline.annotate(document);
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
    IndexedWord root = dependencies.getFirstRoot();
    parse.setHeadNode(root.index());
    List<SemanticGraphEdge> edges = dependencies.edgeListSorted();
    for (SemanticGraphEdge t : edges) {
      String dep = t.getDependent().originalText();
      int depIndex = t.getDependent().index();
      String depPOS = t.getDependent().tag();
      int depStart = t.getDependent().beginPosition();
      int depEnd = t.getDependent().endPosition();
      String gov = t.getGovernor().originalText();
      int govIndex = t.getGovernor().index();
      String govPOS = t.getGovernor().tag();
      int govStart = t.getGovernor().beginPosition();
      int govEnd = t.getGovernor().endPosition();
      parse.addNode(govIndex, gov, govPOS, govStart, govEnd);
      parse.addNode(depIndex, dep, depPOS, depStart, depEnd);
      parse.addEdge(depIndex, govIndex, t.getRelation().getShortName());
    }
  }
  return parse;
}
/** Reads annotations from the given filename using the requested input format. */
public static List<Annotation> getAnnotations(
    StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) {
  switch (inputFormat) {
    case TEXT:
      {
        String text = IOUtils.slurpFileNoExceptions(filename);
        Annotation annotation = new Annotation(text);
        tokenizer.annotate(annotation);
        List<Annotation> annotations = Generics.newArrayList();
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
          Annotation nextAnnotation =
              new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));
          nextAnnotation.set(
              CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
          annotations.add(nextAnnotation);
        }
        return annotations;
      }
    case TREES:
      {
        List<Tree> trees;
        if (filterUnknown) {
          trees = SentimentUtils.readTreesWithGoldLabels(filename);
          trees = SentimentUtils.filterUnknownRoots(trees);
        } else {
          trees = Generics.newArrayList();
          MemoryTreebank treebank = new MemoryTreebank("utf-8");
          treebank.loadPath(filename, null);
          for (Tree tree : treebank) {
            trees.add(tree);
          }
        }
        List<Annotation> annotations = Generics.newArrayList();
        for (Tree tree : trees) {
          CoreMap sentence = new Annotation(Sentence.listToString(tree.yield()));
          sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
          List<CoreMap> sentences = Collections.singletonList(sentence);
          Annotation annotation = new Annotation("");
          annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
          annotations.add(annotation);
        }
        return annotations;
      }
    default:
      throw new IllegalArgumentException("Unknown format " + inputFormat);
  }
}
private void addMissingAnnotation(Annotation anno) {
  boolean useConstituency = CorefProperties.useConstituencyTree(props);
  final boolean LEMMATIZE = true;
  List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    boolean hasTree = sentence.containsKey(TreeCoreAnnotations.TreeAnnotation.class);
    Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
    if (!useConstituency) {
      // TODO: temp for dev: make sure we don't use constituency tree
      sentence.remove(TreeCoreAnnotations.TreeAnnotation.class);
    }
    if (LEMMATIZE && hasTree && useConstituency) treeLemmatizer.transformTree(tree); // TODO don't need?
  }
  corenlp.annotate(anno);
}
public static void saveCoNLLFiles(
    String dir, Annotation dataset, boolean useSubTypes, boolean alreadyBIO) throws IOException {
  List<CoreMap> sentences = dataset.get(CoreAnnotations.SentencesAnnotation.class);
  String docid = null;
  PrintStream os = null;
  for (CoreMap sentence : sentences) {
    String myDocid = sentence.get(CoreAnnotations.DocIDAnnotation.class);
    if (docid == null || !myDocid.equals(docid)) {
      if (os != null) {
        os.close();
      }
      docid = myDocid;
      os = new PrintStream(new FileOutputStream(dir + File.separator + docid + ".conll"));
    }
    List<CoreLabel> labeledSentence =
        AnnotationUtils.sentenceEntityMentionsToCoreLabels(
            sentence, true, null, null, useSubTypes, alreadyBIO);
    assert (labeledSentence != null);
    String prev = null;
    for (CoreLabel word : labeledSentence) {
      String w = word.word().replaceAll("[ \t\n]+", "_");
      String t = word.get(CoreAnnotations.PartOfSpeechAnnotation.class);
      String l = word.get(CoreAnnotations.AnswerAnnotation.class);
      String nl = l;
      if (!alreadyBIO && !l.equals("O")) {
        if (prev != null && l.equals(prev)) nl = "I-" + l;
        else nl = "B-" + l;
      }
      String line = w + ' ' + t + ' ' + nl;
      String[] toks = line.split("[ \t\n]+");
      if (toks.length != 3) {
        throw new RuntimeException("INVALID LINE: \"" + line + '"');
      }
      os.printf("%s %s %s\n", w, t, nl);
      prev = l;
    }
    os.println();
  }
  if (os != null) {
    os.close();
  }
}
private void testParseTree() {
  try {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    // read some text in the text variable
    String text = "Give me a list of all bandleaders that play trumpet.";
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    // run all Annotators on this text
    pipeline.annotate(document);
    // these are all the sentences in this document;
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      // this is the parse tree of the current sentence
      Tree tree = sentence.get(TreeAnnotation.class);
      // this is the Stanford dependency graph of the current sentence
      SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
      Set<IndexedWord> vertices = dependencies.vertexSet();
      List<SemanticGraphEdge> edges = dependencies.edgeListSorted();
      for (IndexedWord i : vertices) {
        System.out.println(i.toString());
      }
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
public void annotate(Annotation annotation) {
  if (verbose) {
    timer.start();
    System.err.print("Adding gender annotation...");
  }
  if (!annotation.containsKey(SentencesAnnotation.class))
    throw new RuntimeException("Unable to find sentences in " + annotation);
  List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
    classifier.classify(tokens);
    for (CoreLabel token : tokens) {
      token.set(GenderAnnotation.class, token.get(AnswerAnnotation.class));
    }
  }
  if (verbose) timer.stop("done.");
}
/*
 * Returns the lemmatized form of the given term. Note that if the text contains more than one
 * token, only the lemma of the last token is returned.
 */
private String lemmatize(String text) {
  // create an empty Annotation just with the given text
  Annotation document = new Annotation(text);
  // run all Annotators on this text
  pipeline.annotate(document);
  // these are all the sentences in this document;
  // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  String lemma = null;
  for (CoreMap sentence : sentences) {
    // traversing the words in the current sentence;
    // a CoreLabel is a CoreMap with additional token-specific methods
    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
      lemma = token.get(LemmaAnnotation.class);
    }
  }
  return lemma;
}
/** Find document type: conversation or article */
private DocType findDocType(Dictionaries dict) {
  boolean speakerChange = false;
  Set<Integer> discourseWithIorYou = Generics.newHashSet();
  for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    for (CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
      int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class);
      if (utterIndex != 0) speakerChange = true;
      if (speakerChange && utterIndex == 0) return DocType.ARTICLE;
      if (dict.firstPersonPronouns.contains(
              w.get(CoreAnnotations.TextAnnotation.class).toLowerCase())
          || dict.secondPersonPronouns.contains(
              w.get(CoreAnnotations.TextAnnotation.class).toLowerCase())) {
        discourseWithIorYou.add(utterIndex);
      }
      if (maxUtter < utterIndex) maxUtter = utterIndex;
    }
  }
  if (!speakerChange) return DocType.ARTICLE;
  return DocType.CONVERSATION; // in a conversation, the utterance index keeps increasing
}
private static List<Extraction> getExtractions(
    Corpus c, ArgumentIdentification ai, SententialInstanceGeneration sig, DocumentExtractor de)
    throws SQLException, IOException {
  List<Extraction> extrs = new ArrayList<Extraction>();
  Iterator<Annotation> docs = c.getDocumentIterator();
  Map<Integer, String> ftID2ftMap = ModelUtils.getFeatureIDToFeatureMap(de.getMapping());
  while (docs.hasNext()) {
    Annotation doc = docs.next();
    List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
    int sentenceCount = 1;
    for (CoreMap sentence : sentences) {
      // argument identification
      List<Argument> arguments = ai.identifyArguments(doc, sentence);
      // sentential instance generation
      List<Pair<Argument, Argument>> sententialInstances =
          sig.generateSententialInstances(arguments, sentence);
      for (Pair<Argument, Argument> p : sententialInstances) {
        Pair<Triple<String, Double, Double>, Map<Integer, Double>> extrResult =
            de.extractFromSententialInstanceWithFeatureScores(p.first, p.second, sentence, doc);
        if (extrResult != null) {
          Triple<String, Double, Double> extrScoreTriple = extrResult.first;
          Map<Integer, Double> featureScores = extrResult.second;
          String rel = extrScoreTriple.first;
          if (targetRelations.contains(rel)) {
            String docName = sentence.get(SentDocName.class);
            String senText = sentence.get(CoreAnnotations.TextAnnotation.class);
            Integer sentNum = sentence.get(SentGlobalID.class);
            Extraction e =
                new Extraction(
                    p.first, p.second, docName, rel, sentNum, extrScoreTriple.third, senText);
            e.setFeatureScoreList(EvaluationUtils.getFeatureScoreList(featureScores, ftID2ftMap));
            extrs.add(e);
          }
        }
      }
      sentenceCount++;
    }
  }
  return EvaluationUtils.getUniqueList(extrs);
}
private static void recallErrors(
    List<List<Mention>> goldMentions, List<List<Mention>> predictedMentions, Annotation doc)
    throws IOException {
  List<CoreMap> coreMaps = doc.get(CoreAnnotations.SentencesAnnotation.class);
  int numSentences = goldMentions.size();
  for (int i = 0; i < numSentences; i++) {
    CoreMap coreMap = coreMaps.get(i);
    List<CoreLabel> words = coreMap.get(CoreAnnotations.TokensAnnotation.class);
    Tree tree = coreMap.get(TreeCoreAnnotations.TreeAnnotation.class);
    List<Mention> goldMentionsSent = goldMentions.get(i);
    List<Pair<Integer, Integer>> goldMentionsSpans = extractSpans(goldMentionsSent);
    for (Pair<Integer, Integer> mentionSpan : goldMentionsSpans) {
      logger.finer("RECALL ERROR\n");
      logger.finer(coreMap + "\n");
      for (int x = mentionSpan.first; x < mentionSpan.second; x++) {
        logger.finer(words.get(x).value() + " ");
      }
      logger.finer("\n" + tree + "\n");
    }
  }
}
/** Set paragraph index */
private void setParagraphAnnotation() {
  int paragraphIndex = 0;
  int previousOffset = -10;
  for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    for (CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
      if (w.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
        if (w.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) > previousOffset + 2) {
          paragraphIndex++;
        }
        w.set(CoreAnnotations.ParagraphAnnotation.class, paragraphIndex);
        previousOffset = w.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      } else {
        w.set(CoreAnnotations.ParagraphAnnotation.class, -1);
      }
    }
  }
  for (List<Mention> l : predictedOrderedMentionsBySentence) {
    for (Mention m : l) {
      m.paragraph = m.headWord.get(CoreAnnotations.ParagraphAnnotation.class);
    }
  }
  numParagraph = paragraphIndex;
}