/**
 * Splits {@code inputDate} into tokens and stores them in the {@code tokens} field, replacing
 * whatever the field held before.
 *
 * <p>Before tokenizing, every hyphen is surrounded by spaces and every comma is replaced by a
 * space; the resulting text is then run through {@code PTBTokenizer} and each token's string
 * form is appended to {@code tokens}.
 *
 * <p>A {@code null} input is reported on standard output and leaves {@code tokens} as an empty
 * list. (Previously the message was printed and the method then dereferenced the null anyway.)
 *
 * @param inputDate the raw date text to tokenize; may be {@code null}
 */
private void tokenizeDate(String inputDate) {
  tokens = new ArrayList<String>();
  if (inputDate == null) {
    System.out.println("Null input date");
    return; // nothing to tokenize; continuing would throw NullPointerException
  }

  // Literal replacements: no regex needed for a single fixed character.
  String spaced = inputDate.replace("-", " - ").replace(',', ' ');

  PTBTokenizer<Word> tokenizer =
      PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(spaced)));
  while (tokenizer.hasNext()) {
    tokens.add(tokenizer.next().toString());
  }

  if (DEBUG) {
    System.out.println("tokens:" + tokens);
  }
}
示例#2
0
文件: Stemmer.java 项目: hans/CoreNLP
 /**
  * Test program for demonstrating the Stemmer. It stems each word it is given and writes the
  * results to standard output, separated by spaces and followed by a newline. Note that the word
  * stemmed is expected to be in lower case: forcing lower case must be done outside the Stemmer
  * class.
  *
  * <p>Usage: {@code Stemmer -file file-name} tokenizes the named file (read as UTF-8) with
  * {@code PTBTokenizer} and stems every token; {@code Stemmer word ...} stems each command-line
  * argument directly. If no arguments are given, or {@code -file} is given without a file name,
  * a usage message is printed to standard error and nothing is stemmed.
  *
  * @param args either {@code -file} followed by a file name, or the words to stem
  * @throws IOException if the file cannot be opened or read
  */
 public static void main(String[] args) throws IOException {
   boolean fileMode = args.length > 0 && args[0].equals("-file");
   if (args.length == 0 || (fileMode && args.length < 2)) {
     // Previously these cases failed with ArrayIndexOutOfBoundsException.
     System.err.println("Usage: Stemmer -file file-name | Stemmer word ...");
     return;
   }

   Stemmer s = new Stemmer();
   if (fileMode) {
     // try-with-resources so the file handle is released even if tokenizing fails
     try (FileInputStream in = new FileInputStream(args[1]);
         InputStreamReader reader = new InputStreamReader(in, "utf-8")) {
       Iterator<Word> it = PTBTokenizer.newPTBTokenizer(reader);
       while (it.hasNext()) {
         Word token = it.next();
         System.out.print(s.stem(token.word()));
         System.out.print(' ');
       }
     }
   } else {
     for (String arg : args) {
       System.out.print(s.stem(arg));
       System.out.print(' ');
     }
   }
   System.out.println();
 }
示例#3
0
  /**
   * Returns a new list holding the words of <tt>in</tt> with markup tags stripped.
   *
   * <p>A word is treated as a tag when its text starts with {@code <} and ends with {@code >}.
   * Tags are never copied to the output. When {@code markLineBreaks} is set and the tag's name
   * is found in {@code blockTags}, a single {@code "\n"} word is emitted in the tag's place;
   * no further newline word is emitted until a normal word has been copied, so contiguous
   * newlines do not occur. All other words are copied unchanged, in order.
   *
   * @param in the words to filter; not modified
   * @return a newly created list of the retained words and inserted newline markers
   */
  public List<Word> process(List<? extends Word> in) {
    List<Word> out = new ArrayList<>();
    boolean justInsertedNewline = false; // to prevent contiguous newlines
    for (Word w : in) {
      String ws = w.word();
      if (ws.startsWith("<") && ws.endsWith(">")) {
        if (markLineBreaks && !justInsertedNewline) {
          // finds start and end of tag name (ignores brackets and /)
          // e.g. <p>, <br/>, or </table>
          //       se   s e        s    e

          // first letter after the opening bracket
          int tagStartIndex = 1;
          while (tagStartIndex < ws.length() && !Character.isLetter(ws.charAt(tagStartIndex))) {
            tagStartIndex++;
          }
          if (tagStartIndex == ws.length()) {
            continue; // no tag text
          }

          // last letter or digit before the closing bracket
          int tagEndIndex = ws.length() - 1;
          while (tagEndIndex > tagStartIndex
              && !Character.isLetterOrDigit(ws.charAt(tagEndIndex))) {
            tagEndIndex--;
          }

          // looks up tag name in list of known block-level tags; Locale.ROOT keeps the
          // lower-casing independent of the default locale (e.g. Turkish maps 'I' to a
          // dotless 'ı', which would make "DIV" miss the lookup)
          String tagName =
              ws.substring(tagStartIndex, tagEndIndex + 1).toLowerCase(java.util.Locale.ROOT);
          if (blockTags.contains(tagName)) {
            out.add(new Word("\n")); // mark newline for block-level tags
            justInsertedNewline = true;
          }
        }
      } else {
        out.add(w); // normal word
        justInsertedNewline = false;
      }
    }
    return out;
  }
示例#4
0
文件: Stemmer.java 项目: hans/CoreNLP
 /**
  * Stems the text of {@code w} and wraps the result in a newly created {@code Word}.
  *
  * @param w the word whose text is to be stemmed
  * @return a new {@code Word} built from the stemmed text
  */
 public Word stem(Word w) {
   String stemmed = stem(w.word());
   return new Word(stemmed);
 }
示例#5
0
 /**
  * Builds a single string from the words returned by {@code t.yieldWords()}: each word's text is
  * followed by one space, and the combined string is trimmed before being returned.
  *
  * <p>NOTE(review): from Java 14 on, {@code yield} is a restricted identifier, so calls to this
  * method must be qualified (e.g. with the class name) to compile.
  */
 private static String yield(Tree t) {
   StringBuilder joined = new StringBuilder();
   for (Word leaf : t.yieldWords()) {
     joined.append(leaf.word()).append(" ");
   }
   String result = joined.toString();
   return result.trim();
 }