Пример #1
0
  protected void addForDoc(File doc) {
    Set<String> encountered = new HashSet<String>();
    SentenceIterator iter = new LineSentenceIterator(doc);
    while (iter.hasNext()) {
      String line = iter.nextSentence();
      if (line == null) continue;
      Tokenizer tokenizer = tokenizerFactory.create(new InputHomogenization(line).transform());
      while (tokenizer.hasMoreTokens()) {
        String token = tokenizer.nextToken();
        java.util.regex.Matcher m = punct.matcher(token);
        if (validWord(token)) {
          documentWordFrequencies.incrementCount(token, doc.getAbsolutePath(), 1.0);
          tf.incrementCount(token, 1.0);
          if (!encountered.contains(token)) {
            idf.incrementCount(token, 1.0);
            encountered.add(token);
          }
        }
      }

      iter.finish();
    }
  }