public void createPhrases() { //// Tokenize List<CoreLabel> tokens = new ArrayList<>(); PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<>(new StringReader(text), new CoreLabelTokenFactory(), ""); while (tokenizer.hasNext()) { tokens.add(tokenizer.next()); } //// Split sentences from tokens List<List<CoreLabel>> sentences = new WordToSentenceProcessor<CoreLabel>().process(tokens); //// Join back together int end; int start = 0; for (List<CoreLabel> sentence : sentences) { end = sentence.get(sentence.size() - 1).endPosition(); phrases.add(new Phrase(text.substring(start, end).trim())); start = end; } /*Reader reader = new StringReader(text); DocumentPreprocessor dp = new DocumentPreprocessor(reader); for (List<HasWord> sentence : dp) { String out = Sentence.listToString(sentence); //replace -LRB- and -RRB- with opening and closing brackets out = out.replace("-LRB-", "("); out = out.replace("-RRB-", ")"); Phrase line = new Phrase(out); phrases.add(line); System.out.println(line.getText()); }*/ }
/**
 * Demonstrates Stanford's PTBTokenizer on {@code paragraph}: prints each token's
 * original surface text followed by its [begin-end) character offsets, then a
 * trailing newline.
 */
public static void standfordNLP() {
    CoreLabelTokenFactory factory = new CoreLabelTokenFactory();
    // "invertible=true" keeps original text and offsets on each CoreLabel.
    PTBTokenizer<CoreLabel> tokenizer =
        new PTBTokenizer<>(new StringReader(paragraph), factory, "invertible=true");
    while (tokenizer.hasNext()) {
        CoreLabel token = tokenizer.next();
        System.out.print(
            token.originalText() + " [" + token.beginPosition() + "-" + token.endPosition() + "];");
    }
    System.out.println();
}
private static float lexSimilarity(String s1, String s2) { String[] split1 = s1.split("[.?!]"); String[] split2 = s2.split("[.?!]"); Set<String> stemsInFirst = new HashSet<String>(); Set<String> stemsInSecond = new HashSet<String>(); for (int i = 0; i < split1.length; i++) { PTBTokenizer<Word> tokenizer1 = PTBTokenizer.newPTBTokenizer(new StringReader(split1[i])); while (tokenizer1.hasNext()) { Word w = tokenizer1.next(); String stem = m.stem(w).word(); stemsInFirst.add(stem); } } for (int j = 0; j < split2.length; j++) { PTBTokenizer<Word> tokenizer2 = PTBTokenizer.newPTBTokenizer(new StringReader(split2[j])); while (tokenizer2.hasNext()) { Word w = tokenizer2.next(); String stem = m.stem(w).word(); stemsInSecond.add(stem); } } Iterator<String> i = stemsInSecond.iterator(); float commonStems = 0; while (i.hasNext()) { String curStem = i.next(); // System.out.println(curStem); if (stemsInFirst.contains(curStem)) commonStems++; } int secondSize = stemsInSecond.size(); if (secondSize > 0) return commonStems / (float) (secondSize); else return 0; }
/**
 * Tokenizes {@code inputDate} into the {@code tokens} field, after normalizing it so
 * that hyphens become stand-alone tokens and commas are replaced by spaces.
 *
 * @param inputDate the raw date string; a {@code null} input is logged and leaves
 *     {@code tokens} empty
 */
private void tokenizeDate(String inputDate) {
    tokens = new ArrayList<String>();
    if (inputDate == null) {
        // Bug fix: previously this only logged and then fell through to
        // pat.matcher(inputDate), throwing a NullPointerException.
        System.out.println("Null input date");
        return;
    }
    // Surround hyphens with spaces so they tokenize separately; drop commas.
    // ("-" as a regex is equivalent to the original one-shot Pattern "[-]".)
    String str = inputDate.replaceAll("-", " - ").replaceAll(",", " ");
    PTBTokenizer<Word> tokenizer =
        PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(str)));
    while (tokenizer.hasNext()) {
        Word nextToken = tokenizer.next();
        tokens.add(nextToken.toString());
    }
    if (DEBUG) {
        System.out.println("tokens:" + tokens);
    }
}
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; // Output bag DataBag bagOfTokens = bagFactory.newDefaultBag(); StringReader textInput = new StringReader(input.get(0).toString()); PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext(); ) { label = (CoreLabel) ptbt.next(); if (label.value().length() > 2) { System.err.println(label.toString()); Tuple termText = tupleFactory.newTuple(label.word()); bagOfTokens.add(termText); } } return bagOfTokens; }
public static void main(String[] args) throws IOException { PrintWriter pw = new PrintWriter(filename + ".tokenized.SPL"); @SuppressWarnings({"unchecked", "rawtypes"}) PTBTokenizer tokenizer = new PTBTokenizer(new FileReader(filename), new CoreLabelTokenFactory(), ""); Sentence s = new Sentence(); for (CoreLabel label; tokenizer.hasNext(); ) { label = (CoreLabel) tokenizer.next(); String token = label.toString("value"); if (token.equals("-LRB-")) { token = "("; } if (token.equals("-RRB-")) { token = ")"; } if (token.equals("-LSB-")) { token = "["; } if (token.equals("-RSB-")) { token = "]"; } if (token.equals("-LCB-")) { token = "{"; } if (token.equals("-RCB-")) { token = "}"; } s.words.add(token); // is it sentence splitter? if (token.equals(".") || token.equals("?") || token.equals("!")) { if (s.isCleanSentence()) { pw.println(s.toString()); } s = new Sentence(); } } pw.close(); }