public static ArrayList<String[]> extractNounPhrases(
        StanfordCoreNLP pipeline, String text, int searchRange) {
    ArrayList<String[]> wordPairs = new ArrayList<String[]>();
    Annotation document = new Annotation(text);
    pipeline.annotate(document);
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    MAX_STEPS = searchRange;
    for (CoreMap sentence : sentences) {
        List<CoreLabel> labels = sentence.get(TokensAnnotation.class);
        // Check whether the sentence contains a negation word
        boolean hasNegation = false;
        for (CoreLabel label : labels) {
            if (NEGATIONS.contains(label.lemma().toLowerCase())) {
                hasNegation = true;
                break;
            }
        }
        // For every noun, scan outward one token at a time (left first,
        // then right) for the nearest adjective within MAX_STEPS positions.
        for (int idx = 0; idx < labels.size(); idx++) {
            CoreLabel label = labels.get(idx);
            if (!NN_TAGS.contains(label.tag())) {
                continue;
            }
            for (int step = 1; step <= MAX_STEPS; step++) {
                // Indices are clamped to the sentence boundaries
                CoreLabel leftLabel = labels.get(Math.max(0, idx - step));
                CoreLabel rightLabel = labels.get(Math.min(idx + step, labels.size() - 1));
                CoreLabel adjLabel = null;
                if (JJ_TAGS.contains(leftLabel.tag())) {
                    adjLabel = leftLabel;
                } else if (JJ_TAGS.contains(rightLabel.tag())) {
                    adjLabel = rightLabel;
                }
                if (adjLabel != null) {
                    String adjective = adjLabel.get(LemmaAnnotation.class);
                    // Prefix the adjective when the sentence is negated
                    if (hasNegation) {
                        adjective = NOT_PREFIX + adjective;
                    }
                    addPair(wordPairs, adjective, label.get(LemmaAnnotation.class));
                    break;
                }
            }
        }
    }
    return wordPairs;
}
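// Usage sketch, not part of the original class: extractNounPhrases expects a
// pipeline that includes the pos and lemma annotators, since it reads the POS
// tag and LemmaAnnotation of every token. The demo method name and the sample
// sentence are illustrative assumptions.
public static void demoExtractNounPhrases() {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    ArrayList<String[]> pairs =
            extractNounPhrases(pipeline, "The battery life is not good.", 3);
    for (String[] pair : pairs) {
        // Assuming addPair stores its two arguments in order, each pair holds
        // {adjective lemma, noun lemma}, with the adjective carrying
        // NOT_PREFIX when the sentence contained a negation word.
        System.out.println(pair[0] + " / " + pair[1]);
    }
}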
public String[] wordsSegment(String text) {
    List<String> listSens = new ArrayList<String>();
    // create a StanfordCoreNLP object with tokenization and sentence
    // splitting only; no further annotators are needed here
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    // run all Annotators on this text
    pipeline.annotate(document);
    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and
    // has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        List<String> listWord = new ArrayList<String>();
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            // this is the text of the token
            String word = token.get(TextAnnotation.class);
            listWord.add(word);
        }
        // re-join the tokens of this sentence with single spaces
        listSens.add(StringUtils.join(listWord, " "));
    }
    return listSens.toArray(new String[listSens.size()]);
}
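// Usage sketch (hypothetical caller, not part of the original class):
// wordsSegment builds its own tokenize/ssplit pipeline internally, so it can
// be called without any setup. Each returned string is one sentence whose
// tokens are separated by single spaces, ready for whitespace splitting.
public void demoWordsSegment() {
    String[] sentences = wordsSegment("Dr. Smith arrived. He was late.");
    for (String sentence : sentences) {
        System.out.println(sentence); // one tokenized sentence per line
    }
}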
public void process(String inFilepath, String outFilepath, String nerOutFile) {
    try {
        StringBuilder inText = new StringBuilder();
        StringBuilder outText = new StringBuilder();
        StringBuilder nerText = new StringBuilder();
        // read the input file into inText, skipping blank lines
        BufferedReader reader = new BufferedReader(new FileReader(inFilepath));
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.trim().length() == 0) continue;
            inText.append(line).append('\n');
        }
        reader.close();
        // create an empty Annotation just with the given text
        Annotation document = new Annotation(inText.toString());
        // run all Annotators on this text
        pipeline.annotate(document);
        // these are all the sentences in this document
        // a CoreMap is essentially a Map that uses class objects as keys and
        // has values with custom types
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            // traversing the words in the current sentence
            // a CoreLabel is a CoreMap with additional token-specific methods
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                totalWords++;
                String pos = token.tag();
                // keep only tokens whose POS tag passes the filter
                if (tagFilter.contains(pos)) {
                    remainWords++;
                    String lemma = token.lemma();
                    outText.append(lemma).append(' ');
                    // also collect the surface form of tokens whose NER
                    // label passes the filter
                    if (nerFilter.contains(token.ner())) {
                        nerText.append(token.word()).append(' ');
                    }
                }
            }
        }
        // write the lemmatized, filtered text to the output file
        FileWriter fw = FileUtil.open(outFilepath);
        fw.append(outText);
        FileUtil.close(fw);
        // optionally write the named-entity tokens to a second file
        if (nerOutFile != null) {
            FileWriter fw2 = FileUtil.open(nerOutFile);
            fw2.append(nerText);
            FileUtil.close(fw2);
        }
    } catch (IOException e) { // also covers FileNotFoundException
        e.printStackTrace();
    }
}
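// Usage sketch (file names here are hypothetical, for illustration only):
// process expects the class-level pipeline to provide pos, lemma and ner
// annotations, since it calls token.tag(), token.lemma() and token.ner().
// Passing null as the third argument skips the NER output file.
public void demoProcess() {
    process("reviews.txt", "reviews.lemmas.txt", "reviews.ner.txt");
    // totalWords and remainWords are the class-level counters updated above
    System.out.println("kept " + remainWords + " of " + totalWords + " tokens");
}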