public Map<Integer, Integer> getGeneSpans(String text) {
  Map<Integer, Integer> begin2end = new HashMap<Integer, Integer>();
  Annotation document = new Annotation(text);
  pipeline.annotate(document);
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    List<CoreLabel> candidate = new ArrayList<CoreLabel>();
    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
      String pos = token.get(PartOfSpeechAnnotation.class);
      if (pos.startsWith("NN")) {
        candidate.add(token);
      } else if (!candidate.isEmpty()) {
        int begin = candidate.get(0).beginPosition();
        int end = candidate.get(candidate.size() - 1).endPosition();
        begin2end.put(begin, end);
        candidate.clear();
      }
    }
    // Flush a candidate noun run that extends to the end of the sentence
    if (!candidate.isEmpty()) {
      int begin = candidate.get(0).beginPosition();
      int end = candidate.get(candidate.size() - 1).endPosition();
      begin2end.put(begin, end);
      candidate.clear();
    }
  }
  return begin2end;
}
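// A hedged usage sketch for getGeneSpans. The `pipeline` field it relies on is not
// shown in the snippet; noun-run detection only needs the tokenize, ssplit, and pos
// annotators, so an assumed setup follows. All names here are illustrative.
public Map<Integer, Integer> demoGeneSpans() {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos");
  this.pipeline = new StanfordCoreNLP(props);
  // Maps the begin character offset of each maximal run of NN* tokens to its end offset
  return getGeneSpans("BRCA1 mutations increase cancer risk.");
}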
/**
 * Get the text value of this entity. The headTokenSpan MUST be set before calling this method!
 */
public String getValue() {
  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
  StringBuilder sb = new StringBuilder();
  for (int i = headTokenSpan.start(); i < headTokenSpan.end(); i++) {
    CoreLabel token = tokens.get(i);
    // We are not guaranteed to have CharacterOffsets, so we can't use them to
    // reconstruct the original spacing; join tokens with a single space instead.
    if (i > headTokenSpan.start()) sb.append(" ");
    sb.append(token.word());
  }
  return sb.toString();
}
private LinkedHashMap<LinkedHashMap<Integer, String>, String> identifyNER(String text) {
  LinkedHashMap<LinkedHashMap<Integer, String>, String> map = new LinkedHashMap<>();
  String serializedClassifier = "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz";
  CRFClassifier<CoreLabel> classifier =
      CRFClassifier.getClassifierNoExceptions(serializedClassifier);
  List<List<CoreLabel>> classify = classifier.classify(text);
  for (List<CoreLabel> coreLabels : classify) {
    for (CoreLabel coreLabel : coreLabels) {
      String word = coreLabel.word();
      int index = coreLabel.index();
      String category = coreLabel.get(CoreAnnotations.AnswerAnnotation.class);
      if (!"O".equals(category)) {
        // Record each entity token as (index -> word) keyed against its NER category
        LinkedHashMap<Integer, String> entry = new LinkedHashMap<>();
        entry.put(index, word);
        map.put(entry, category);
        System.out.println(word + ":" + category);
      }
    }
  }
  return map;
}
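// Design note (hedged): getClassifierNoExceptions reloads the serialized CRF model on
// every call to identifyNER, which is expensive. A sketch of caching it in a field
// instead; the field and method names are assumptions, not from the source.
private CRFClassifier<CoreLabel> cachedNerClassifier;

private CRFClassifier<CoreLabel> nerClassifier() {
  if (cachedNerClassifier == null) {
    cachedNerClassifier = CRFClassifier.getClassifierNoExceptions(
        "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz");
  }
  return cachedNerClassifier;
}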
public static List<String> lemmatizeDocument(String documentText) {
  if (pipeline == null) {
    loadModels();
  }
  List<String> lemmas = new LinkedList<>();
  // Create an empty Annotation just with the given text
  Annotation document = new Annotation(documentText);
  // Run all Annotators on this text
  pipeline.annotate(document);
  // Iterate over all of the sentences found
  List<CoreMap> sentences = document.get(SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    // Iterate over all tokens in a sentence and collect the lemma of each word
    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
      lemmas.add(token.get(CoreAnnotations.LemmaAnnotation.class));
    }
  }
  return lemmas;
}
/**
 * Lemmatize a single string.
 *
 * @param t the text to lemmatize
 * @return the lemmatized text, with lemmas separated by single spaces
 */
public static String lemmatize(String t) {
  if (pipeline == null) {
    loadModels();
  }
  StringBuilder lemma = new StringBuilder();
  try {
    // Create an empty Annotation just with the given text and run all Annotators
    Annotation document = new Annotation(t);
    pipeline.annotate(document);
    // Iterate over all sentences and collect the lemma of each token
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        lemma.append(" ").append(token.get(CoreAnnotations.LemmaAnnotation.class));
      }
    }
  } catch (Exception e) {
    System.err.println("Stanford Lemmatizer error exception Word: " + t);
  }
  return lemma.toString().trim();
}
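// A minimal sketch of the loadModels() helper assumed by the two lemmatizer methods
// above. The annotator list is an assumption: CoreNLP lemmatization requires the
// tokenize, ssplit, pos, and lemma annotators.
private static StanfordCoreNLP pipeline;

private static void loadModels() {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
  pipeline = new StanfordCoreNLP(props);
}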
// TODO: roll check into tokens regex pattern?
// That allows for better matching because unmatched sequences will be eliminated at match time
private boolean checkPosTags(List<CoreLabel> tokens, int start, int end) {
  if (validPosPattern != null) {
    // Need to check POS tag too...
    switch (posMatchType) {
      case MATCH_ONE_TOKEN_PHRASE_ONLY:
        if (tokens.size() > 1) return true;
        // fall through
      case MATCH_AT_LEAST_ONE_TOKEN:
        for (int i = start; i < end; i++) {
          CoreLabel token = tokens.get(i);
          String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
          if (pos != null && validPosPattern.matcher(pos).matches()) {
            return true;
          }
        }
        return false;
      case MATCH_ALL_TOKENS:
        // Checked elsewhere
        return true;
      default:
        // Don't know this match type....
        return true;
    }
  }
  return true;
}
public List<NLPInfo> analyze(String text) {
  Annotation document = new Annotation(text);
  pipeline.annotate(document);
  List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
  if (sentences == null || sentences.isEmpty()) {
    return null;
  }
  List<NLPInfo> res = new ArrayList<NLPInfo>();
  for (CoreMap sentence : sentences) {
    NLPInfo info = new NLPInfo();
    for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
      NLPToken tokenInfo = new NLPToken();
      tokenInfo.setWord(token.get(CoreAnnotations.TextAnnotation.class));
      tokenInfo.setTag(token.get(CoreAnnotations.PartOfSpeechAnnotation.class));
      tokenInfo.setNer(token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
      info.appendToken(tokenInfo);
    }
    res.add(info);
  }
  return res;
}
private String findNextParagraphSpeaker(
    List<CoreMap> paragraph, int paragraphOffset, Dictionaries dict) {
  CoreMap lastSent = paragraph.get(paragraph.size() - 1);
  String speaker = "";
  for (CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) {
    if (w.get(CoreAnnotations.LemmaAnnotation.class).equals("report")
        || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) {
      String word = w.get(CoreAnnotations.TextAnnotation.class);
      SemanticGraph dependency =
          lastSent.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
      IndexedWord t = dependency.getNodeByWordPattern(word);
      if (t == null) continue; // the reporting verb may not surface as a node in the graph
      for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(t)) {
        if (child.first().getShortName().equals("nsubj")) {
          int subjectIndex = child.second().index(); // start from 1
          IntTuple headPosition = new IntTuple(2);
          headPosition.set(0, paragraph.size() - 1 + paragraphOffset);
          headPosition.set(1, subjectIndex - 1);
          if (mentionheadPositions.containsKey(headPosition)
              && mentionheadPositions.get(headPosition).nerString.startsWith("PER")) {
            speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
          }
        }
      }
    }
  }
  return speaker;
}
/**
 * Profile: 29% in FactorTable.getValue(), 28% in CRFCliqueTree.getCalibratedCliqueTree(),
 * 12.6% waiting for threads.
 *
 * <p>Single threaded: 15000 ms - 26000 ms. Multi threaded: 4500 ms - 7000 ms.
 *
 * <p>With 8 cpus: 3.3x - 3.7x speedup, around 800% utilization.
 */
public static void benchmarkCRF() {
  Properties props = new Properties();
  props.setProperty("macro", "true"); // use a generic CRF configuration
  props.setProperty("useIfInteger", "true");
  props.setProperty("featureFactory", "edu.stanford.nlp.benchmarks.BenchmarkFeatureFactory");
  props.setProperty("saveFeatureIndexToDisk", "false");
  CRFClassifier<CoreLabel> crf = new CRFClassifier<CoreLabel>(props);

  // Build 100 synthetic sentences of 20 tokens each with noisy alternating labels
  Random r = new Random(42);
  List<List<CoreLabel>> data = new ArrayList<>();
  for (int i = 0; i < 100; i++) {
    List<CoreLabel> sentence = new ArrayList<>();
    for (int j = 0; j < 20; j++) {
      CoreLabel l = new CoreLabel();
      l.setWord("j:" + j);
      boolean tag = j % 2 == 0 ^ (r.nextDouble() > 0.7);
      l.set(CoreAnnotations.AnswerAnnotation.class, "target:" + tag);
      sentence.add(l);
    }
    data.add(sentence);
  }

  long msStart = System.currentTimeMillis();
  crf.train(data);
  long delay = System.currentTimeMillis() - msStart;
  System.out.println("Training took " + delay + " ms");
}
@Test
public void testCorp() {
  // We test a 2x2 design: {strict, regular} x {no following context, following context}
  for (int sent = 0; sent < 4; sent++) {
    PTBTokenizer<CoreLabel> ptbTokenizer =
        new PTBTokenizer<>(
            new StringReader(corpInputs[sent / 2]),
            new CoreLabelTokenFactory(),
            (sent % 2 == 0) ? "strictTreebank3" : "");
    int i = 0;
    while (ptbTokenizer.hasNext()) {
      CoreLabel w = ptbTokenizer.next();
      try {
        assertEquals("PTBTokenizer problem", corpGold[sent % 2][i], w.word());
      } catch (ArrayIndexOutOfBoundsException aioobe) {
        // the assertion below outside the loop will fail
      }
      i++;
    }
    if (i != corpGold[sent % 2].length) {
      System.out.println("Gold: " + Arrays.toString(corpGold[sent % 2]));
      List<CoreLabel> tokens =
          new PTBTokenizer<>(
                  new StringReader(corpInputs[sent / 2]),
                  new CoreLabelTokenFactory(),
                  (sent % 2 == 0) ? "strictTreebank3" : "")
              .tokenize();
      System.out.println("Guess: " + SentenceUtils.listToString(tokens));
      System.out.flush();
    }
    assertEquals("PTBTokenizer num tokens problem", i, corpGold[sent % 2].length);
  }
}
public static void saveCoNLL(
    PrintStream os, List<List<CoreLabel>> sentences, boolean alreadyBIO) {
  os.println("-DOCSTART- -X- O\n");
  for (List<CoreLabel> sent : sentences) {
    String prev = null;
    for (CoreLabel word : sent) {
      String w = word.word().replaceAll("[ \t\n]+", "_");
      String t = word.get(CoreAnnotations.PartOfSpeechAnnotation.class);
      String l = word.get(CoreAnnotations.AnswerAnnotation.class);
      String nl = l;
      if (!alreadyBIO && !l.equals("O")) {
        if (prev != null && l.equals(prev)) nl = "I-" + l;
        else nl = "B-" + l;
      }
      String line = w + ' ' + t + ' ' + nl;
      String[] toks = line.split("[ \t\n]+");
      if (toks.length != 3) {
        throw new RuntimeException("INVALID LINE: \"" + line + '"');
      }
      os.printf("%s %s %s\n", w, t, nl);
      prev = l;
    }
    os.println();
  }
}
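// A hedged example of driving saveCoNLL: build one two-token sentence by hand and
// write it to standard out. The words and label values are illustrative only.
public static void demoSaveCoNLL() {
  CoreLabel w1 = new CoreLabel();
  w1.setWord("Stanford");
  w1.set(CoreAnnotations.PartOfSpeechAnnotation.class, "NNP");
  w1.set(CoreAnnotations.AnswerAnnotation.class, "ORG");
  CoreLabel w2 = new CoreLabel();
  w2.setWord("University");
  w2.set(CoreAnnotations.PartOfSpeechAnnotation.class, "NNP");
  w2.set(CoreAnnotations.AnswerAnnotation.class, "ORG");
  List<List<CoreLabel>> sentences = Collections.singletonList(Arrays.asList(w1, w2));
  saveCoNLL(System.out, sentences, false); // emits B-ORG for w1, then I-ORG for w2
}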
private static List<String> getTokenStrs(List<CoreLabel> tokens) {
  List<String> mainTokenStrs = new ArrayList<String>(tokens.size());
  for (CoreLabel token : tokens) {
    String text = token.get(CoreAnnotations.TextAnnotation.class);
    mainTokenStrs.add(text);
  }
  return mainTokenStrs;
}
/**
 * Set index for each token and sentence in the document.
 *
 * @param doc the document whose tokens are assigned document-wide indices
 */
public static void setTokenIndices(Document doc) {
  int tokenIndex = 0;
  for (CoreMap sent : doc.annotation.get(SentencesAnnotation.class)) {
    for (CoreLabel token : sent.get(TokensAnnotation.class)) {
      token.set(TokenBeginAnnotation.class, tokenIndex++);
    }
  }
}
@Override
public void printAnswers(List<CoreLabel> doc, PrintWriter out) {
  for (CoreLabel wi : doc) {
    String answer = wi.get(CoreAnnotations.AnswerAnnotation.class);
    String goldAnswer = wi.get(CoreAnnotations.GoldAnswerAnnotation.class);
    out.println(wi.word() + "\t" + goldAnswer + "\t" + answer);
  }
  out.println();
}
public static void fillEntity(List<Entity> entities, List<CoreLabel> tokens) {
  for (Entity entity : entities) {
    for (int i = 0; i < tokens.size(); i++) {
      CoreLabel token = tokens.get(i);
      if (entity.offset == token.beginPosition()) entity.start = i;
      if (entity.offsetEnd == token.endPosition()) entity.end = i;
    }
  }
}
private static String toString(final List<CoreLabel> lineage) {
  StringBuilder sb = new StringBuilder();
  for (CoreLabel cl : lineage) {
    sb.append(cl.value());
    sb.append(" <-- ");
  }
  return sb.toString();
}
private static List<String> getMainTokenStrs(List<CoreLabel> tokens) {
  List<String> mainTokenStrs = new ArrayList<String>(tokens.size());
  for (CoreLabel token : tokens) {
    String text = token.get(CoreAnnotations.TextAnnotation.class);
    if (!text.isEmpty() && (text.length() >= 4 || Character.isUpperCase(text.charAt(0)))) {
      mainTokenStrs.add(text);
    }
  }
  return mainTokenStrs;
}
public String getExtentString() {
  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
  StringBuilder sb = new StringBuilder();
  for (int i = extentTokenSpan.start(); i < extentTokenSpan.end(); i++) {
    CoreLabel token = tokens.get(i);
    if (i > extentTokenSpan.start()) sb.append(" ");
    sb.append(token.word());
  }
  return sb.toString();
}
public static void stanfordNLP() {
  CoreLabelTokenFactory ctf = new CoreLabelTokenFactory();
  PTBTokenizer<CoreLabel> ptb =
      new PTBTokenizer<>(new StringReader(paragraph), ctf, "invertible=true");
  while (ptb.hasNext()) {
    CoreLabel cl = ptb.next();
    System.out.print(
        cl.originalText() + " [" + cl.beginPosition() + "-" + cl.endPosition() + "];");
  }
  System.out.println();
}
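// The `paragraph` field read by stanfordNLP() is not shown in the snippet; an assumed
// declaration for trying out the tokenizer (the text is illustrative only):
private static final String paragraph =
    "Dr. Smith's results, e.g., the p-values, weren't reproducible.";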
private void verifyWord(CoreLabel expected, CoreLabel result) {
  for (Class annotation : tokenAnnotations) {
    if (expected.get(annotation) == null
        && result.get(annotation) != null
        && "".equals(result.get(annotation))) {
      // allow "" in place of null
      continue;
    }
    assertEquals(
        "Different for class " + annotation, expected.get(annotation), result.get(annotation));
  }
}
/**
 * Converts the tree labels to CoreLabels. We need this because we store additional info in the
 * CoreLabel, like token span.
 *
 * @param tree the tree whose labels are converted in place
 */
public static void convertToCoreLabels(Tree tree) {
  Label l = tree.label();
  if (!(l instanceof CoreLabel)) {
    CoreLabel cl = new CoreLabel();
    cl.setValue(l.value());
    tree.setLabel(cl);
  }
  for (Tree kid : tree.children()) {
    convertToCoreLabels(kid);
  }
}
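// A hedged usage sketch: Tree.valueOf parses a Penn Treebank bracketing whose labels
// are not CoreLabels until converted. The bracketing below is illustrative only.
public static void demoConvertToCoreLabels() {
  Tree tree = Tree.valueOf("(ROOT (S (NP (NN dog)) (VP (VBZ barks))))");
  convertToCoreLabels(tree);
  // Every node label is now a CoreLabel and can carry extra annotations like token spans
}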
private static void taggedLeafLabels(Tree t, List<CoreLabel> l) {
  if (t.isPreTerminal()) {
    CoreLabel fl = (CoreLabel) t.getChild(0).label();
    fl.set(TagLabelAnnotation.class, t.label());
    l.add(fl);
  } else {
    Tree[] kids = t.children();
    for (int j = 0, n = kids.length; j < n; j++) {
      taggedLeafLabels(kids[j], l);
    }
  }
}
private static int reIndexLeaves(Tree t, int startIndex) {
  if (t.isLeaf()) {
    CoreLabel afl = (CoreLabel) t.label();
    afl.setIndex(startIndex);
    startIndex++;
  } else {
    for (Tree child : t.children()) {
      startIndex = reIndexLeaves(child, startIndex);
    }
  }
  return startIndex;
}
public static String doCorefResolution(Annotation annotation) {
  Map<Integer, CorefChain> corefs = annotation.get(CorefChainAnnotation.class);
  List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
  List<String> resolved = new ArrayList<String>();
  for (CoreMap sentence : sentences) {
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    for (CoreLabel token : tokens) {
      Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
      CorefChain chain = corefs.get(corefClustId);
      if (chain == null) {
        resolved.add(token.word());
      } else {
        // Substitute the chain's representative mention, unless this token is
        // already part of the representative mention itself
        CorefMention reprMent = chain.getRepresentativeMention();
        int sentIndex = reprMent.sentNum - 1;
        CoreMap corefSentence = sentences.get(sentIndex);
        List<CoreLabel> corefSentenceTokens = corefSentence.get(TokensAnnotation.class);
        if (token.index() < reprMent.startIndex || token.index() > reprMent.endIndex) {
          for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) {
            resolved.add(corefSentenceTokens.get(i - 1).word());
          }
        } else {
          resolved.add(token.word());
        }
      }
    }
  }
  StringBuilder resolvedStr = new StringBuilder();
  for (String str : resolved) {
    resolvedStr.append(str).append(" ");
  }
  System.out.println();
  System.out.println(resolvedStr);
  return resolvedStr.toString();
}
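// A hedged usage sketch for doCorefResolution. The annotator list and the dcoref
// choice are assumptions; coreference in CoreNLP also requires the upstream
// tokenize, ssplit, pos, lemma, ner, and parse annotators.
public static void demoCoref() {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  Annotation annotation = new Annotation("John drove to Judy's house. He made her dinner.");
  pipeline.annotate(annotation);
  // Pronouns should be replaced by their representative mentions,
  // e.g. "He" -> "John" and "her" -> "Judy" in the output
  String resolved = doCorefResolution(annotation);
}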
/**
 * This should be called after the classifier has been trained and parseAndTrain has been called
 * to accumulate the test set.
 *
 * <p>Tallies the per-label true positives, false positives, and false negatives (the raw counts
 * from which precision, recall, and F1 can be computed) and prints each match decision.
 */
public void runTestSet(List<List<CoreLabel>> testSet) {
  Counter<String> tp = new DefaultCounter<>();
  Counter<String> fp = new DefaultCounter<>();
  Counter<String> fn = new DefaultCounter<>();
  Counter<String> actual = new DefaultCounter<>();
  for (List<CoreLabel> labels : testSet) {
    // Create new labels without the answer annotation so the classifier must predict it
    List<CoreLabel> unannotatedLabels = new ArrayList<>();
    for (CoreLabel label : labels) {
      CoreLabel newLabel = new CoreLabel();
      newLabel.set(annotationForWord, label.get(annotationForWord));
      newLabel.set(PartOfSpeechAnnotation.class, label.get(PartOfSpeechAnnotation.class));
      unannotatedLabels.add(newLabel);
    }
    List<CoreLabel> annotatedLabels = this.classifier.classify(unannotatedLabels);
    int ind = 0;
    for (CoreLabel expectedLabel : labels) {
      CoreLabel annotatedLabel = annotatedLabels.get(ind);
      String answer = annotatedLabel.get(AnswerAnnotation.class);
      String expectedAnswer = expectedLabel.get(AnswerAnnotation.class);
      actual.incrementCount(expectedAnswer);
      // Match only non-background symbols
      if (!SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL.equals(expectedAnswer)
          && expectedAnswer.equals(answer)) {
        // true positives
        tp.incrementCount(answer);
        System.out.println("True Positive:" + annotatedLabel);
      } else if (!SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL.equals(answer)) {
        // false positives
        fp.incrementCount(answer);
        System.out.println("False Positive:" + annotatedLabel);
      } else if (!SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL.equals(expectedAnswer)) {
        // false negatives
        fn.incrementCount(expectedAnswer);
        System.out.println("False Negative:" + expectedLabel);
      }
      // else true negatives
      ind++;
    }
  }
  actual.remove(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
}
private void parseThread(ArrayList<Thread> threads) {
  for (Thread t : threads) {
    ThreadVector tv = new ThreadVector(t);
    allThreads.add(tv);
    for (Email e : t.getEmails()) {
      // Collect only the original (non-quoted) content of this email
      StringBuilder sb = new StringBuilder();
      for (Sentence s : e.getSentences()) {
        if (s.getQuotationTimes() == 0) {
          sb.append(s.getText()).append(" ");
        }
      }
      String content = sb.toString().toLowerCase();
      // Create an empty Annotation just with the given text and run all Annotators
      Annotation document = new Annotation(content);
      this.pipeline.annotate(document);
      // Iterate over all of the sentences found
      List<CoreMap> sentences = document.get(SentencesAnnotation.class);
      for (CoreMap sentence : sentences) {
        // Collect the lemma of each token in the sentence
        List<String> lemmas = new LinkedList<String>();
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
          lemmas.add(token.get(LemmaAnnotation.class));
        }
        HashMap<String, Integer> wordCount = countWordsInSentence(lemmas);
        // If the sentence has valid words, index them and record the sentence vector
        if (wordCount.size() > 0) {
          totalSentenceNumber++;
          for (String word : wordCount.keySet()) {
            if (!dictionaryIndex.containsKey(word)) {
              dictionaryIndex.put(word, dictionaryIndex.size());
              dictionaryDocumentCount.put(word, 1);
            } else {
              dictionaryDocumentCount.put(word, dictionaryDocumentCount.get(word) + 1);
            }
          }
          SentenceVector sv = new SentenceVector(sentence.toString(), wordCount);
          tv.addSentenceVectors(sv);
        }
      }
    }
  }
}
public static Collection<String> lemmatize(String rawInput) {
  // TODO: set an initial capacity like this in the other places too
  Collection<String> lemmas = Lists.newArrayListWithCapacity(30);
  Annotation rawInputAnnotation = new Annotation(rawInput);
  coreNlp.annotate(rawInputAnnotation);
  List<CoreLabel> allTokens = rawInputAnnotation.get(TokensAnnotation.class);
  for (CoreLabel eachToken : allTokens) {
    lemmas.add(eachToken.get(LemmaAnnotation.class));
  }
  return lemmas;
}
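// A minimal sketch of the `coreNlp` field assumed by the method above; the annotator
// list is an assumption, since lemmatization needs tokenize, ssplit, pos, and lemma.
// (Lists is Guava's com.google.common.collect.Lists.)
private static final StanfordCoreNLP coreNlp;

static {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
  coreNlp = new StanfordCoreNLP(props);
}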
/**
 * Parse a CoNLL formatted tree into a SemanticGraph object (along with a list of tokens).
 *
 * @param conll The CoNLL formatted tree.
 * @return A pair of a SemanticGraph and a token list, corresponding to the parse of the sentence
 *     and to tokens in the sentence.
 */
protected Pair<SemanticGraph, List<CoreLabel>> mkTree(String conll) {
  List<CoreLabel> sentence = new ArrayList<>();
  SemanticGraph tree = new SemanticGraph();
  // First pass: create a vertex for each token (and register the root)
  for (String line : conll.split("\n")) {
    if (line.trim().equals("")) {
      continue;
    }
    String[] fields = line.trim().split("\\s+");
    int index = Integer.parseInt(fields[0]);
    String word = fields[1];
    CoreLabel label = IETestUtils.mkWord(word, index);
    sentence.add(label);
    if (fields[2].equals("0")) {
      tree.addRoot(new IndexedWord(label));
    } else {
      tree.addVertex(new IndexedWord(label));
    }
    if (fields.length > 4) {
      label.setTag(fields[4]);
    }
    if (fields.length > 5) {
      label.setNER(fields[5]);
    }
    if (fields.length > 6) {
      label.setLemma(fields[6]);
    }
  }
  // Second pass: add the dependency edges between the registered vertices
  int i = 0;
  for (String line : conll.split("\n")) {
    if (line.trim().equals("")) {
      continue;
    }
    String[] fields = line.trim().split("\\s+");
    int parent = Integer.parseInt(fields[2]);
    String reln = fields[3];
    if (parent > 0) {
      tree.addEdge(
          new IndexedWord(sentence.get(parent - 1)),
          new IndexedWord(sentence.get(i)),
          new GrammaticalRelation(Language.UniversalEnglish, reln, null, null),
          1.0,
          false);
    }
    i += 1;
  }
  return Pair.makePair(tree, sentence);
}
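// A hedged usage sketch for mkTree. The CoNLL fragment is illustrative: column 0 is
// the 1-based token index, column 1 the word, column 2 the head index (0 for the
// root), and column 3 the relation name.
protected void demoMkTree() {
  Pair<SemanticGraph, List<CoreLabel>> parse =
      mkTree("1 cats 2 nsubj\n" + "2 purr 0 root\n");
  System.out.println(parse.first); // the dependency graph rooted at "purr"
  System.out.println(parse.second); // the token list [cats, purr]
}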
/**
 * Determine if the given tree contains a leaf which matches the part-of-speech and lexical
 * criteria.
 *
 * @param pos Regular expression to match part of speech (may be null, in which case any POS is
 *     allowed)
 * @param word Regular expression to match word (may be null, in which case any word is allowed)
 */
public static boolean shouldPrintTree(Tree tree, Pattern pos, Pattern word) {
  for (Tree t : tree) {
    if (t.isPreTerminal()) {
      CoreLabel label = (CoreLabel) t.label();
      String tpos = label.value();
      Tree wordNode = t.firstChild();
      CoreLabel wordLabel = (CoreLabel) wordNode.label();
      String tword = wordLabel.value();
      if ((pos == null || pos.matcher(tpos).find())
          && (word == null || word.matcher(tword).find())) {
        return true;
      }
    }
  }
  return false;
}
/**
 * Sets the labels on the tree (except the leaves) to be the integer value of the sentiment
 * prediction. Makes it easy to print out with Tree.toString()
 */
static void setSentimentLabels(Tree tree) {
  if (tree.isLeaf()) {
    return;
  }
  for (Tree child : tree.children()) {
    setSentimentLabels(child);
  }
  Label label = tree.label();
  if (!(label instanceof CoreLabel)) {
    throw new IllegalArgumentException("Required a tree with CoreLabels");
  }
  CoreLabel cl = (CoreLabel) label;
  cl.setValue(Integer.toString(RNNCoreAnnotations.getPredictedClass(tree)));
}
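// A hedged usage sketch for setSentimentLabels, assuming a pipeline with the
// sentiment annotator (which itself requires tokenize, ssplit, and parse). The
// input sentence is illustrative only.
static void demoSentimentLabels() {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  Annotation annotation = new Annotation("This movie was surprisingly good.");
  pipeline.annotate(annotation);
  for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    Tree tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
    setSentimentLabels(tree);
    System.out.println(tree); // inner nodes now carry 0-4 sentiment class values
  }
}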