public List<NLPInfo> analyze(String text) {
  Annotation document = new Annotation(text);
  pipeline.annotate(document);
  List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
  if (sentences == null || sentences.isEmpty()) {
    return null;
  }
  List<NLPInfo> res = new ArrayList<NLPInfo>();
  NLPInfo info;
  for (CoreMap sentence : sentences) {
    info = new NLPInfo();
    NLPToken tokenInfo;
    for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
      tokenInfo = new NLPToken();
      tokenInfo.setWord(token.get(CoreAnnotations.TextAnnotation.class));
      tokenInfo.setTag(token.get(CoreAnnotations.PartOfSpeechAnnotation.class));
      tokenInfo.setNer(token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
      info.appendToken(tokenInfo);
    }
    res.add(info);
  }
  return res;
}
public Map<Integer, Integer> getGeneSpans(String text) {
  Map<Integer, Integer> begin2end = new HashMap<Integer, Integer>();
  Annotation document = new Annotation(text);
  pipeline.annotate(document);
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    List<CoreLabel> candidate = new ArrayList<CoreLabel>();
    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
      String pos = token.get(PartOfSpeechAnnotation.class);
      if (pos.startsWith("NN")) {
        candidate.add(token);
      } else if (candidate.size() > 0) {
        int begin = candidate.get(0).beginPosition();
        int end = candidate.get(candidate.size() - 1).endPosition();
        begin2end.put(begin, end);
        candidate.clear();
      }
    }
    // flush a candidate noun span that runs to the end of the sentence
    if (candidate.size() > 0) {
      int begin = candidate.get(0).beginPosition();
      int end = candidate.get(candidate.size() - 1).endPosition();
      begin2end.put(begin, end);
      candidate.clear();
    }
  }
  return begin2end;
}
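// Sketch, not part of the original source: the `pipeline` field used by analyze() and
// getGeneSpans() above must be built with at least the tokenize, ssplit, pos, and ner
// annotators, otherwise the POS and NER lookups return null. The helper name below is
// illustrative only.
private static StanfordCoreNLP buildPipeline() {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
  return new StanfordCoreNLP(props);
}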
/**
 * @param t the text to lemmatize
 * @return the lemmas of all tokens in {@code t}, separated by single spaces
 */
public static String lemmatize(String t) {
  if (pipeline == null) {
    loadModels();
  }
  String lemma = "";
  try {
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(t);
    // run all Annotators on this text
    pipeline.annotate(document);
    // Iterate over all of the sentences found
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      // Iterate over all tokens in a sentence
      for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        // Retrieve and add the lemma for each word into the list of lemmas
        lemma += " " + token.get(CoreAnnotations.LemmaAnnotation.class);
      }
    }
  } catch (Exception e) {
    System.err.println("Stanford Lemmatizer error exception Word: " + t);
  }
  return lemma.trim();
}
public static final String doCorefResolution(Annotation annotation) {
  Map<Integer, CorefChain> corefs = annotation.get(CorefChainAnnotation.class);
  List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
  List<String> resolved = new ArrayList<String>();
  for (CoreMap sentence : sentences) {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    for (CoreLabel token : tokens) {
      Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
      CorefChain chain = corefs.get(corefClustId);
      if (chain == null) {
        resolved.add(token.word());
      } else {
        int sentIdx = chain.getRepresentativeMention().sentNum - 1;
        CoreMap corefSentence = sentences.get(sentIdx);
        List<CoreLabel> corefSentenceTokens = corefSentence.get(TokensAnnotation.class);
        CorefMention reprMent = chain.getRepresentativeMention();
        if (token.index() < reprMent.startIndex || token.index() > reprMent.endIndex) {
          for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) {
            CoreLabel matchedLabel = corefSentenceTokens.get(i - 1);
            resolved.add(matchedLabel.word());
          }
        } else {
          resolved.add(token.word());
        }
      }
    }
  }
  String resolvedStr = "";
  System.out.println();
  for (String str : resolved) {
    resolvedStr += str + " ";
  }
  System.out.println(resolvedStr);
  return resolvedStr;
}
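// Sketch, an assumption about the calling code: doCorefResolution() expects an Annotation
// produced by a pipeline with coreference enabled, for example:
//
//   Properties props = new Properties();
//   props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
//   Annotation annotation = new Annotation(text);
//   new StanfordCoreNLP(props).annotate(annotation);
//   String resolved = doCorefResolution(annotation);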
public static void main(String[] args) {
  SentenceDAO sentenceDAO = new SentenceDAOImpl();
  List<Sentence> sentences = sentenceDAO.findAll();
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  int i = 0;
  for (Sentence sentence : sentences) {
    if (sentence.getPredicate() == null) {
      try {
        System.out.println(i++);
        String text = sentence.getContent();
        Annotation annotation = new Annotation(text);
        pipeline.annotate(annotation);
        for (CoreMap core : annotation.get(SentencesAnnotation.class)) {
          SemanticGraph graph = core.get(CollapsedCCProcessedDependenciesAnnotation.class);
          sentence.setPredicate(graph.getFirstRoot().lemma());
        }
        sentenceDAO.save(sentence);
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }
  // System.out.println(sentence.getWords());
}
/**
 * Make Document for coref (for method coref(Document doc, StringBuilder[] outputs)).
 * Mention detection and document preprocessing are done here.
 *
 * @throws Exception
 */
public Document makeDocument(InputDoc input) throws Exception {
  if (input == null) return null;
  Annotation anno = input.annotation;

  // add missing annotation
  if (needMissingAnnotations) {
    addMissingAnnotation(anno);
  }

  if (Boolean.parseBoolean(props.getProperty("hcoref.useMarkedDiscourse", "false"))) {
    anno.set(CoreAnnotations.UseMarkedDiscourseAnnotation.class, true);
  }

  // remove nested NPs with the same headword, except for Chinese newswire documents
  if (input.conllDoc != null && CorefProperties.getLanguage(props) == Locale.CHINESE) {
    CorefProperties.setRemoveNested(props, !input.conllDoc.documentID.contains("nw"));
  }

  // mention detection: MD provides the mention start/end index, span, and headword;
  // the remaining information is set in the preprocessing step
  List<List<Mention>> mentions = md.findMentions(anno, dict, props);
  Document doc = new Document(input, mentions);

  // find headwords for gold mentions
  if (input.goldMentions != null) findGoldMentionHeads(doc);

  // document preprocessing: initialization (assign IDs), mention processing (gender, number,
  // type, etc.), speaker extraction, etc.
  Preprocessor.preprocess(doc, dict, singletonPredictor, headFinder);

  return doc;
}
/** Speaker extraction */
private void findSpeakers(Dictionaries dict) {
  Boolean useMarkedDiscourseBoolean =
      annotation.get(CoreAnnotations.UseMarkedDiscourseAnnotation.class);
  boolean useMarkedDiscourse =
      (useMarkedDiscourseBoolean != null) ? useMarkedDiscourseBoolean : false;

  if (Constants.USE_GOLD_SPEAKER_TAGS || useMarkedDiscourse) {
    for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
        int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class);
        speakers.put(utterIndex, w.get(CoreAnnotations.SpeakerAnnotation.class));
      }
    }
  } else {
    if (docType == DocType.CONVERSATION) findSpeakersInConversation(dict);
    else if (docType == DocType.ARTICLE) findSpeakersInArticle(dict);

    // set speaker info to annotation
    for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
        int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class);
        if (speakers.containsKey(utterIndex)) {
          w.set(CoreAnnotations.SpeakerAnnotation.class, speakers.get(utterIndex));
        }
      }
    }
  }
}
private static List<AnaphorWithReferent> parseText(InputText text) {
  Annotation annotatedText = new Annotation(text.toString());
  Container.getStanfordCoreNLP().annotate(annotatedText);
  List<CoreMap> coreMapSentences = annotatedText.get(CoreAnnotations.SentencesAnnotation.class);
  List<Tree> trees =
      coreMapSentences
          .stream()
          .map(s -> s.get(TreeCoreAnnotations.TreeAnnotation.class))
          .collect(Collectors.toList());
  List<Sentence> allSentences =
      IntStream.range(0, trees.size())
          .mapToObj(
              id ->
                  new Sentence(
                      id,
                      trees.get(id),
                      Container.getNPsFromParseTreeExtractor().extract(trees.get(id))))
          .collect(Collectors.toList());
  List<AnaphorWithReferent> anaphoraWithReferentFromAllSentences =
      allSentences
          .stream()
          .map(s -> Container.getAllAnaphorWithReferentPerSentenceFinder().find(s, allSentences))
          .flatMap(a -> a.stream())
          .collect(Collectors.toList());
  return anaphoraWithReferentFromAllSentences;
}
public static List<String> lemmatizeDocument(String documentText) {
  if (pipeline == null) {
    loadModels();
  }
  List<String> lemmas = new LinkedList<>();
  // create an empty Annotation just with the given text
  Annotation document = new Annotation(documentText);
  // run all Annotators on this text
  pipeline.annotate(document);
  // Iterate over all of the sentences found
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    // Iterate over all tokens in a sentence
    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
      // Retrieve and add the lemma for each word into the list of lemmas
      lemmas.add(token.get(CoreAnnotations.LemmaAnnotation.class));
    }
  }
  return lemmas;
}
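// Sketch of the loadModels() helper assumed by lemmatize() and lemmatizeDocument() above and
// parse() below (an assumption, shown for completeness, not the original implementation):
// the lemma lookups require the tokenize, ssplit, pos, and lemma annotators, and the
// collapsed-dependencies graph read in parse() additionally requires a parser.
private static void loadModels() {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse");
  pipeline = new StanfordCoreNLP(props);
}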
public static ArrayList<String[]> extractNounPhrases(
    StanfordCoreNLP pipeline, String text, int searchRange) {
  ArrayList<String[]> wordPairs = new ArrayList<String[]>();
  Annotation document = new Annotation(text);
  pipeline.annotate(document);
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  MAX_STEPS = searchRange;
  for (CoreMap sentence : sentences) {
    List<CoreLabel> labels = sentence.get(TokensAnnotation.class);
    // Check negation
    boolean hasNegation = false;
    for (CoreLabel label : labels) {
      if (NEGATIONS.contains(label.lemma().toLowerCase())) {
        hasNegation = true;
      }
    }
    for (int idx = 0; idx < labels.size(); idx++) {
      CoreLabel label = labels.get(idx);
      if (NN_TAGS.contains(label.get(PartOfSpeechAnnotation.class))) {
        for (int step = 1; step <= MAX_STEPS; step++) {
          // look left of the noun for an adjective
          CoreLabel leftLabel = labels.get(Math.max(0, idx - step));
          if (JJ_TAGS.contains(leftLabel.tag())) {
            if (hasNegation)
              addPair(
                  wordPairs,
                  NOT_PREFIX + leftLabel.get(LemmaAnnotation.class),
                  label.get(LemmaAnnotation.class));
            else
              addPair(
                  wordPairs,
                  leftLabel.get(LemmaAnnotation.class),
                  label.get(LemmaAnnotation.class));
            break;
          }
          // look right of the noun for an adjective
          CoreLabel rightLabel = labels.get(Math.min(idx + step, labels.size() - 1));
          if (JJ_TAGS.contains(rightLabel.tag())) {
            if (hasNegation)
              addPair(
                  wordPairs,
                  NOT_PREFIX + rightLabel.get(LemmaAnnotation.class),
                  label.get(LemmaAnnotation.class));
            else
              addPair(
                  wordPairs,
                  rightLabel.get(LemmaAnnotation.class),
                  label.get(LemmaAnnotation.class));
            break;
          }
        }
      }
    }
  }
  return wordPairs;
}
/**
 * Re-annotates a single sentence with the given pipeline: the sentence's tokens are wrapped in a
 * fresh single-sentence Annotation, which is then passed to the pipeline.
 *
 * @param sentence the sentence to annotate
 * @param pipeline the pipeline to run over the sentence
 */
public static void annotate(CoreMap sentence, AnnotationPipeline pipeline) {
  Annotation ann =
      new Annotation(StringUtils.join(sentence.get(CoreAnnotations.TokensAnnotation.class), " "));
  ann.set(
      CoreAnnotations.TokensAnnotation.class,
      sentence.get(CoreAnnotations.TokensAnnotation.class));
  ann.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
  pipeline.annotate(ann);
}
public Annotation process(String sentence, String dateString, Annotator timeAnnotator) {
  log.info("Processing text \"" + sentence + "\" with dateString = " + dateString);
  Annotation anno = new Annotation(sentence);
  if (dateString != null && !dateString.equals("")) {
    anno.set(CoreAnnotations.DocDateAnnotation.class, dateString);
  }
  pipeline.annotate(anno);
  timeAnnotator.annotate(anno);
  return anno;
}
public static void main(String[] args) throws IOException {
  SUTimePipeline pipeline = new SUTimePipeline();
  Annotator timeAnnotator = pipeline.getTimeAnnotator("sutime", new Properties());
  BufferedReader is = new BufferedReader(new InputStreamReader(System.in));
  System.out.print("> ");
  for (String line; (line = is.readLine()) != null; ) {
    Annotation ann = pipeline.process(line, null, timeAnnotator);
    System.out.println(ann.get(TimeAnnotations.TimexAnnotations.class));
    System.out.print("> ");
  }
}
private void handleNonCvtBinary(FormulaGenerationInfo fgInfo, Set<String> res) {
  String description = normalizeFbDescription(fgInfo.bInfo.descriptions.get(0));
  Annotation a = getAnnotation(description);
  String question =
      generateNonCvtQuestion(
          fgInfo,
          description,
          getPosTagsFromAnnotation(a),
          a.get(SentencesAnnotation.class).get(0).get(TreeAnnotation.class).firstChild(),
          fbFormulasInfo.isReversed(fgInfo.bInfo.formula));
  if (question != null) res.add(question);
}
/**
 * Finds the position of the sentence in the given document that achieves the best ROUGE-N scores
 * w.r.t. the reference summaries.
 *
 * @param task the document and the corresponding models
 * @return the position of the best sentence in the document
 */
public int getBestSentencePos(Task task) {
  Document document = task.getDocument();
  Annotation documentAnnotation = annotationProvider.getAnnotation(document.getContent());
  RougeN rouge = rougeFactory.make(task.getModels(), annotationProvider);
  BestSentenceSelector sentenceSelector = new BestSentenceSelector(rouge);
  Annotation bestAnnotation = sentenceSelector.select(documentAnnotation);
  CoreMap sentence = bestAnnotation.get(SentencesAnnotation.class).get(0);
  String bestPos = sentence.get(SentencePositionAnnotation.class);
  return Integer.valueOf(bestPos);
}
private void parseThread(ArrayList<Thread> threads) {
  for (Thread t : threads) {
    ThreadVector tv = new ThreadVector(t);
    allThreads.add(tv);
    for (Email e : t.getEmails()) {
      StringBuffer sb = new StringBuffer();
      for (Sentence s : e.getSentences()) {
        // only keep sentences that are this email's own content, not quoted text
        if (s.getQuotationTimes() == 0) {
          sb.append(s.getText() + " ");
        }
      }
      String content = sb.toString().toLowerCase();
      // create an empty Annotation just with the given text
      Annotation document = new Annotation(content);
      // run all Annotators on this text
      this.pipeline.annotate(document);
      // Iterate over all of the sentences found
      List<CoreMap> sentences = document.get(SentencesAnnotation.class);
      for (CoreMap sentence : sentences) {
        List<String> lemmas = new LinkedList<String>();
        // Iterate over all tokens in a sentence
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
          // Retrieve and add the lemma for each word into the list of lemmas
          lemmas.add(token.get(LemmaAnnotation.class));
        }
        HashMap<String, Integer> wordCount = countWordsInSentence(lemmas);
        // if it has valid words
        if (wordCount.size() > 0) {
          totalSentenceNumber++;
          for (String word : wordCount.keySet()) {
            if (!dictionaryIndex.containsKey(word)) {
              dictionaryIndex.put(word, dictionaryIndex.size());
              dictionaryDocumentCount.put(word, 1);
            } else {
              dictionaryDocumentCount.put(word, dictionaryDocumentCount.get(word) + 1);
            }
          }
          SentenceVector sv = new SentenceVector(sentence.toString(), wordCount);
          tv.addSentenceVectors(sv);
        }
      }
    }
  }
}
public static Collection<String> lemmatize(String rawInput) {
  // should set the initial capacity in the other call sites too
  Collection<String> lemmas = Lists.newArrayListWithCapacity(30);
  Annotation rawInputAnnotation = new Annotation(rawInput);
  coreNlp.annotate(rawInputAnnotation);
  List<CoreLabel> allTokens = rawInputAnnotation.get(TokensAnnotation.class);
  for (CoreLabel eachToken : allTokens) {
    lemmas.add(eachToken.get(LemmaAnnotation.class));
  }
  return lemmas;
}
public static DependencyParse parse(String text) {
  if (pipeline == null) {
    loadModels();
  }
  DependencyParse parse = new DependencyParse();
  Annotation document = new Annotation(text);
  pipeline.annotate(document);
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
    IndexedWord root = dependencies.getFirstRoot();
    parse.setHeadNode(root.index());
    List<SemanticGraphEdge> edges = dependencies.edgeListSorted();
    // System.out.println(edges);
    for (SemanticGraphEdge t : edges) {
      String dep = t.getDependent().originalText();
      int depIndex = t.getDependent().index();
      String depPOS = t.getDependent().tag();
      int depStart = t.getDependent().beginPosition();
      int depEnd = t.getDependent().endPosition();
      String gov = t.getGovernor().originalText();
      int govIndex = t.getGovernor().index();
      String govPOS = t.getGovernor().tag();
      int govStart = t.getGovernor().beginPosition();
      int govEnd = t.getGovernor().endPosition();
      parse.addNode(govIndex, gov, govPOS, govStart, govEnd);
      parse.addNode(depIndex, dep, depPOS, depStart, depEnd);
      parse.addEdge(depIndex, govIndex, t.getRelation().getShortName());
    }
  }
  return parse;
}
private void findSpeakersInConversation(Dictionaries dict) {
  for (List<Mention> l : predictedOrderedMentionsBySentence) {
    for (Mention m : l) {
      if (m.predicateNominatives == null) continue;
      for (Mention a : m.predicateNominatives) {
        if (a.spanToString().toLowerCase().equals("i")) {
          speakers.put(
              m.headWord.get(CoreAnnotations.UtteranceAnnotation.class),
              Integer.toString(m.mentionID));
        }
      }
    }
  }
  List<CoreMap> paragraph = new ArrayList<CoreMap>();
  int paragraphUtterIndex = 0;
  String nextParagraphSpeaker = "";
  int paragraphOffset = 0;
  for (CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    int currentUtter =
        sent.get(CoreAnnotations.TokensAnnotation.class)
            .get(0)
            .get(CoreAnnotations.UtteranceAnnotation.class);
    if (paragraphUtterIndex != currentUtter) {
      nextParagraphSpeaker =
          findParagraphSpeaker(
              paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
      paragraphUtterIndex = currentUtter;
      paragraphOffset += paragraph.size();
      paragraph = new ArrayList<CoreMap>();
    }
    paragraph.add(sent);
  }
  findParagraphSpeaker(
      paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
}
private void findSpeakersInArticle(Dictionaries dict) {
  List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
  Pair<Integer, Integer> beginQuotation = new Pair<Integer, Integer>();
  Pair<Integer, Integer> endQuotation = new Pair<Integer, Integer>();
  boolean insideQuotation = false;
  int utterNum = -1;
  for (int i = 0; i < sentences.size(); i++) {
    List<CoreLabel> sent = sentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
    for (int j = 0; j < sent.size(); j++) {
      int utterIndex = sent.get(j).get(CoreAnnotations.UtteranceAnnotation.class);
      if (utterIndex != 0 && !insideQuotation) {
        utterNum = utterIndex;
        insideQuotation = true;
        beginQuotation.setFirst(i);
        beginQuotation.setSecond(j);
      } else if (utterIndex == 0 && insideQuotation) {
        insideQuotation = false;
        endQuotation.setFirst(i);
        endQuotation.setSecond(j);
        findQuotationSpeaker(utterNum, sentences, beginQuotation, endQuotation, dict);
      }
    }
  }
}
public static void addFigerAnnotationToDocument(Annotation d) throws SQLException {
  List<CoreMap> sentences = d.get(CoreAnnotations.SentencesAnnotation.class);
  Set<String> entityIds = new HashSet<String>();
  for (CoreMap sen : sentences) {
    List<Triple<Pair<Integer, Integer>, String, Float>> nelAnnotation =
        sen.get(NamedEntityLinkingAnnotation.class);
    for (Triple<Pair<Integer, Integer>, String, Float> t : nelAnnotation) {
      String id = t.second;
      if (!id.equals("null")) {
        entityIds.add(id);
      }
    }
  }
  Map<String, Set<String>> idTypeMap = bigQuery(entityIds);
  // add type onto sentences
  for (CoreMap sen : sentences) {
    List<Triple<Pair<Integer, Integer>, String, Float>> nelAnnotation =
        sen.get(NamedEntityLinkingAnnotation.class);
    List<Triple<Set<String>, Integer, Integer>> figerData = new ArrayList<>();
    for (Triple<Pair<Integer, Integer>, String, Float> t : nelAnnotation) {
      Integer start = t.first.first;
      Integer end = t.first.second;
      Set<String> types = null;
      if (!t.second.equals("null")) {
        types = idTypeMap.get(GuidMidConversion.convertBackward(t.second));
      }
      Triple<Set<String>, Integer, Integer> figerTrip = new Triple<>(types, start, end);
      figerData.add(figerTrip);
    }
    sen.set(FigerAnnotation.class, figerData);
  }
}
private void handleCvtBinary(FormulaGenerationInfo fgInfo, Set<String> res) {
  String description1 = normalizeFbDescription(fgInfo.bInfo.descriptions.get(0));
  String description2 = normalizeFbDescription(fgInfo.bInfo.descriptions.get(1));
  Annotation a1 = getAnnotation(description1);
  Annotation a2 = getAnnotation(description2);
  String question =
      generateCvtQuestion(
          fgInfo,
          description1,
          description2,
          a1.get(SentencesAnnotation.class).get(0).get(TreeAnnotation.class).firstChild(),
          a2.get(SentencesAnnotation.class).get(0).get(TreeAnnotation.class).firstChild(),
          getPosTagsFromAnnotation(a1),
          getPosTagsFromAnnotation(a2));
  if (question != null) res.add(question);
}
private void testParseTree() {
  try {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // read some text in the text variable
    String text = "Give me a list of all bandleaders that play trumpet.";

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);

    // run all Annotators on this text
    pipeline.annotate(document);

    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom
    // types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);

    for (CoreMap sentence : sentences) {
      // traversing the words in the current sentence
      // a CoreLabel is a CoreMap with additional token-specific methods

      // this is the parse tree of the current sentence
      Tree tree = sentence.get(TreeAnnotation.class);

      // this is the Stanford dependency graph of the current sentence
      SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
      Set<IndexedWord> vertices = dependencies.vertexSet();
      List<SemanticGraphEdge> edges = dependencies.edgeListSorted();

      for (SemanticGraphEdge e : edges) {}

      for (IndexedWord i : vertices) {
        System.out.println(i.toString());
      }
    }
  } catch (Exception e) {
    // was previously an empty catch block; report the failure instead of swallowing it
    e.printStackTrace();
  }
}
public void annotate(Annotation annotation) {
  if (verbose) {
    timer.start();
    System.err.print("Adding gender annotation...");
  }

  if (!annotation.containsKey(SentencesAnnotation.class))
    throw new RuntimeException("Unable to find sentences in " + annotation);

  List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
    classifier.classify(tokens);
    for (CoreLabel token : tokens)
      token.set(GenderAnnotation.class, token.get(AnswerAnnotation.class));
  }

  if (verbose) timer.stop("done.");
}
public static void describe(DBObject doc, StanfordCoreNLP pipeline) {
  // create an empty Annotation just with the given text
  Annotation document = new Annotation((String) doc.get("cleansed_text"));
  BasicDBObject m_doc = new BasicDBObject();

  // run all Annotators on this text
  pipeline.annotate(document);

  // these are all the sentences in this document
  // a CoreMap is essentially a Map that uses class objects as keys and has values with custom
  // types
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);

  for (CoreMap sentence : sentences) {
    // traversing the words in the current sentence
    // a CoreLabel is a CoreMap with additional token-specific methods
    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
      // this is the text of the token
      String word = token.get(TextAnnotation.class);
      // this is the POS tag of the token
      String pos = token.get(PartOfSpeechAnnotation.class);
      // this is the NER label of the token
      String ne = token.get(NamedEntityTagAnnotation.class);
      // System.out.print("(" + word + ", " + pos + ", " + ne + ")");
    }
    // System.out.println();

    // this is the parse tree of the current sentence
    Tree tree = sentence.get(TreeAnnotation.class);

    // this is the Stanford dependency graph of the current sentence
    SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
  }

  // This is the coreference link graph
  // Each chain stores a set of mentions that link to each other,
  // along with a method for getting the most representative mention
  // Both sentence and token offsets start at 1!
  Map<Integer, CorefChain> graph = document.get(CorefChainAnnotation.class);
}
/*
 * Returns the lemmatized form of the original term.
 */
private String lemmatize(String text) {
  // create an empty Annotation just with the given text
  Annotation document = new Annotation(text);
  // run all Annotators on this text
  pipeline.annotate(document);
  // these are all the sentences in this document
  // a CoreMap is essentially a Map that uses class objects as keys and
  // has values with custom types
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  String lemma = null;
  for (CoreMap sentence : sentences) {
    // traversing the words in the current sentence
    // a CoreLabel is a CoreMap with additional token-specific methods
    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
      // note: if the input contains more than one token, only the last token's lemma is returned
      lemma = token.get(LemmaAnnotation.class);
    }
  }
  return lemma;
}
private static List<Extraction> getExtractions(
    Corpus c, ArgumentIdentification ai, SententialInstanceGeneration sig, DocumentExtractor de)
    throws SQLException, IOException {
  List<Extraction> extrs = new ArrayList<Extraction>();
  Iterator<Annotation> docs = c.getDocumentIterator();
  Map<Integer, String> ftID2ftMap = ModelUtils.getFeatureIDToFeatureMap(de.getMapping());
  while (docs.hasNext()) {
    Annotation doc = docs.next();
    List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
    int sentenceCount = 1;
    for (CoreMap sentence : sentences) {
      // argument identification
      List<Argument> arguments = ai.identifyArguments(doc, sentence);
      // sentential instance generation
      List<Pair<Argument, Argument>> sententialInstances =
          sig.generateSententialInstances(arguments, sentence);
      for (Pair<Argument, Argument> p : sententialInstances) {
        Pair<Triple<String, Double, Double>, Map<Integer, Double>> extrResult =
            de.extractFromSententialInstanceWithFeatureScores(p.first, p.second, sentence, doc);
        if (extrResult != null) {
          Triple<String, Double, Double> extrScoreTriple = extrResult.first;
          Map<Integer, Double> featureScores = extrResult.second;
          String rel = extrScoreTriple.first;
          if (targetRelations.contains(rel)) {
            String docName = sentence.get(SentDocName.class);
            String senText = sentence.get(CoreAnnotations.TextAnnotation.class);
            Integer sentNum = sentence.get(SentGlobalID.class);
            Extraction e =
                new Extraction(
                    p.first, p.second, docName, rel, sentNum, extrScoreTriple.third, senText);
            e.setFeatureScoreList(EvaluationUtils.getFeatureScoreList(featureScores, ftID2ftMap));
            extrs.add(e);
          }
        }
      }
      sentenceCount++;
    }
  }
  return EvaluationUtils.getUniqueList(extrs);
}
private String getHeadNoun(String uri) {
  String[] tokens = lexicalize(uri);
  // if we have multiple tokens, get the head noun
  String head;
  if (tokens.length > 1) {
    head = Joiner.on(" ").join(tokens);
    Annotation document = new Annotation(head);
    pipeline.annotate(document);
    CoreMap sentence = document.get(SentencesAnnotation.class).get(0);
    Tree tree = sentence.get(TreeAnnotation.class);
    Tree headTree = headFinder.determineHead(tree);
    // we assume that the last occurring NN is the head noun
    List<Tree> leaves = headTree.getLeaves();
    head = leaves.get(leaves.size() - 1).label().value();
  } else {
    head = tokens[0];
  }
  return head;
}
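// Assumption, not shown in the original source: the headFinder field used in getHeadNoun()
// can be any edu.stanford.nlp.trees.HeadFinder implementation, for example:
//
//   private final HeadFinder headFinder = new CollinsHeadFinder();
//
// and the pipeline must include the parse annotator so that TreeAnnotation is populated.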
public String[] wordsSegment(String text) {
  String[] listTokenSens = null;
  List<String> listSens = new ArrayList<String>();

  // creates a StanfordCoreNLP object with tokenization and sentence splitting only
  Properties props = new Properties();
  props.put("annotators", "tokenize, ssplit");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

  // create an empty Annotation just with the given text
  Annotation document = new Annotation(text);

  // run all Annotators on this text
  pipeline.annotate(document);

  // these are all the sentences in this document
  // a CoreMap is essentially a Map that uses class objects as keys and
  // has values with custom types
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    // traversing the words in the current sentence
    // a CoreLabel is a CoreMap with additional token-specific methods
    List<String> listWord = new ArrayList<String>();
    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
      // System.err.println(token.lemma()); // lemma is unavailable: no lemma annotator is loaded
      // this is the text of the token
      String word = token.get(TextAnnotation.class);
      listWord.add(word);
      // this is the POS tag of the token
      // String pos = token.get(PartOfSpeechAnnotation.class);
    }
    listSens.add(StringUtils.join(listWord, " "));
  }
  listTokenSens = new String[listSens.size()];
  listTokenSens = listSens.toArray(listTokenSens);
  return listTokenSens;
}
@Override
protected void process(
    ComplexEventChunk<StreamEvent> streamEventChunk,
    Processor nextProcessor,
    StreamEventCloner streamEventCloner,
    ComplexEventPopulater complexEventPopulater) {
  synchronized (this) {
    while (streamEventChunk.hasNext()) {
      StreamEvent streamEvent = streamEventChunk.next();
      if (logger.isDebugEnabled()) {
        logger.debug(
            String.format(
                "Event received. Regex:%s Event:%s", regexPattern.pattern(), streamEvent));
      }

      Annotation document =
          pipeline.process(attributeExpressionExecutors[1].execute(streamEvent).toString());

      for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
        TokenSequenceMatcher matcher =
            regexPattern.getMatcher(sentence.get(CoreAnnotations.TokensAnnotation.class));
        while (matcher.find()) {
          Object[] data = new Object[attributeCount];
          data[0] = matcher.group();
          for (int i = 1; i < attributeCount; i++) {
            data[i] = matcher.group(i);
          }
          StreamEvent newStreamEvent = streamEventCloner.copyStreamEvent(streamEvent);
          complexEventPopulater.populateComplexEvent(newStreamEvent, data);
          streamEventChunk.insertBeforeCurrent(newStreamEvent);
        }
      }
      streamEventChunk.remove();
    }
  }
  nextProcessor.process(streamEventChunk);
}