/**
 * Splits <code>inputDate</code> into tokens and stores their string forms in the
 * <code>tokens</code> field, replacing whatever the field held before.
 *
 * <p>Before tokenizing, every hyphen is padded with a space on each side and every comma is
 * replaced by a space, so that e.g. the parts of a hyphenated date become separate tokens.
 *
 * <p>If <code>inputDate</code> is <code>null</code>, a message is printed and <code>tokens</code>
 * is left empty.
 *
 * @param inputDate the raw date string to tokenize; may be <code>null</code>
 */
private void tokenizeDate(String inputDate) {
  tokens = new ArrayList<String>();
  if (inputDate == null) {
    System.out.println("Null input date");
    // Nothing to tokenize; previously execution fell through and hit a NullPointerException.
    return;
  }
  // Literal replacements: no regular expression is needed for single fixed characters.
  String str = inputDate.replace("-", " - ").replace(",", " ");
  PTBTokenizer<Word> tokenizer =
      PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(str)));
  while (tokenizer.hasNext()) {
    Word nextToken = tokenizer.next();
    tokens.add(nextToken.toString());
  }
  if (DEBUG) {
    System.out.println("tokens:" + tokens);
  }
}
/**
 * Test program for demonstrating the Stemmer. It stems each word of its input and writes the
 * stems, separated by single spaces, to standard output, followed by a newline.
 *
 * <p>Note that the word stemmed is expected to be in lower case: forcing lower case must be done
 * outside the Stemmer class.
 *
 * <p>Usage:
 * <ul>
 *   <li><code>Stemmer -file file-name</code> &mdash; tokenizes the given file (read as UTF-8)
 *       and stems every token;</li>
 *   <li><code>Stemmer word word ...</code> &mdash; stems each command-line argument.</li>
 * </ul>
 *
 * @param args either <code>-file</code> followed by a file name, or the words to stem
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
  boolean fileMode = args.length > 0 && args[0].equals("-file");
  if (args.length == 0 || (fileMode && args.length < 2)) {
    // Previously these cases failed with an ArrayIndexOutOfBoundsException.
    System.err.println("Usage: Stemmer -file file-name | Stemmer word word ...");
    return;
  }

  Stemmer s = new Stemmer();
  if (fileMode) {
    // Both streams are declared as resources so they are closed even if tokenizing fails.
    try (FileInputStream in = new FileInputStream(args[1]);
        InputStreamReader reader = new InputStreamReader(in, "utf-8")) {
      Iterator<Word> it = PTBTokenizer.newPTBTokenizer(reader);
      while (it.hasNext()) {
        Word token = it.next();
        System.out.print(s.stem(token.word()));
        System.out.print(' ');
      }
    }
  } else {
    for (String arg : args) {
      System.out.print(s.stem(arg));
      System.out.print(' ');
    }
  }
  System.out.println();
}
/** * Returns a new Document with the same meta-data as <tt>in</tt>, and the same words except tags * are stripped. */ public List<Word> process(List<? extends Word> in) { List<Word> out = new ArrayList<>(); boolean justInsertedNewline = false; // to prevent contiguous newlines for (Word w : in) { String ws = w.word(); if (ws.startsWith("<") && ws.endsWith(">")) { if (markLineBreaks && !justInsertedNewline) { // finds start and end of tag name (ignores brackets and /) // e.g. <p>, <br/>, or </table> // se s e s e int tagStartIndex = 1; while (tagStartIndex < ws.length() && !Character.isLetter(ws.charAt(tagStartIndex))) { tagStartIndex++; } if (tagStartIndex == ws.length()) { continue; // no tag text } int tagEndIndex = ws.length() - 1; while (tagEndIndex > tagStartIndex && !Character.isLetterOrDigit(ws.charAt(tagEndIndex))) { tagEndIndex--; } // looks up tag name in list of known block-level tags String tagName = ws.substring(tagStartIndex, tagEndIndex + 1).toLowerCase(); if (blockTags.contains(tagName)) { out.add(new Word("\n")); // mark newline for block-level tags justInsertedNewline = true; } } } else { out.add(w); // normal word justInsertedNewline = false; } } return out; }
/**
 * Returns a new <code>Word</code> whose text is the stem of the text of <code>w</code>.
 *
 * @param w the word to stem; its text is read via <code>word()</code>
 * @return a new <code>Word</code> built from the stemmed text
 */
public Word stem(Word w) {
  String stemmedText = stem(w.word());
  return new Word(stemmedText);
}
/**
 * Returns the text of the words yielded by <code>t</code>, joined by single spaces, with
 * leading and trailing whitespace trimmed from the joined string.
 *
 * <p>NOTE(review): <code>yield</code> is a restricted identifier in newer Java versions, where
 * unqualified calls to a method with this name are rejected; confirm how callers invoke it
 * before raising the language level.
 *
 * @param t the tree whose words are read via <code>yieldWords()</code>
 * @return the space-separated word texts
 */
private static String yield(Tree t) {
  StringBuilder joined = new StringBuilder();
  for (Word leaf : t.yieldWords()) {
    joined.append(leaf.word()).append(' ');
  }
  String text = joined.toString();
  return text.trim();
}