Example #1
0
  /**
   * Builds a 1 x vocab-size row vector of per-word scores for the given file.
   *
   * <p>Every line of the file is homogenized and tokenized; for each vocabulary word that
   * occurs at least once in the document, the corresponding column is set to that word's
   * score from {@code wordScores}. Columns for absent words stay 0.
   *
   * @param file the document to score
   * @return a 1 x {@code currVocab.size()} score vector
   * @throws IllegalStateException if the file cannot be read
   */
  public DoubleMatrix getScoreMatrix(File file) {
    Counter<String> docWords = new Counter<String>();
    LineIterator iter = null;
    try {
      iter = FileUtils.lineIterator(file);
      while (iter.hasNext()) {
        Tokenizer t =
            tokenizerFactory.create((new InputHomogenization(iter.nextLine()).transform()));
        while (t.hasMoreTokens()) {
          docWords.incrementCount(t.nextToken(), 1.0);
        }
      }
    } catch (IOException e) {
      throw new IllegalStateException("Unable to read file", e);
    } finally {
      // Close in a finally block so the iterator is released even when reading fails
      // (the original leaked it on IOException). closeQuietly tolerates a null iterator.
      LineIterator.closeQuietly(iter);
    }
    DoubleMatrix ret = new DoubleMatrix(1, currVocab.size());

    for (int i = 0; i < currVocab.size(); i++) {
      // Cache the string form: it was previously recomputed for the lookup and the score.
      String word = currVocab.get(i).toString();
      if (docWords.getCount(word) > 0) {
        ret.put(i, wordScores.getCount(word));
      }
    }

    return ret;
  }
 /**
  * Transforms the given text into a 1 x vocab-size row vector of word counts.
  *
  * <p>Each token present in the vocabulary sets its column to the token's count from
  * {@code wordCounts}; out-of-vocabulary tokens are ignored.
  *
  * @param text the text to transform
  * @return a 1 x {@code vocab.size()} matrix of word counts
  */
 @Override
 public DoubleMatrix transform(String text) {
   Tokenizer tokenizer = tokenizerFactory.create(text);
   List<String> tokens = tokenizer.getTokens();
   DoubleMatrix input = new DoubleMatrix(1, vocab.size());
   for (String token : tokens) {
     // Look the token up once; the original performed the indexOf lookup twice
     // (once for the local, once in the condition), an O(n) scan each time.
     int idx = vocab.indexOf(token);
     if (idx >= 0) {
       input.put(idx, wordCounts.getCount(token));
     }
   }
   return input;
 }
 /**
  * Consumes every sentence from {@code sentenceIter}, tallying token frequencies into
  * {@code wordCounts} and growing {@code vocab} with each newly seen non-stop-word token.
  */
 private void process() {
   while (sentenceIter.hasNext()) {
     String sentence = sentenceIter.nextSentence();
     for (String token : tokenizerFactory.create(sentence).getTokens()) {
       // Stop words contribute neither counts nor vocabulary entries.
       if (stopWords.contains(token)) {
         continue;
       }
       wordCounts.incrementCount(token, 1.0);
       if (!vocab.contains(token)) {
         vocab.add(token);
       }
     }
   }
 }
  /**
   * Vectorizes the passed in text treating it as one document.
   *
   * <p>Tokens found in the vocabulary set their column of the input row vector to the
   * token's count from {@code wordCounts}; out-of-vocabulary tokens are skipped. The label
   * is converted to a one-hot outcome vector over {@code labels}.
   *
   * @param text the text to vectorize
   * @param label the label of the text
   * @return a dataset with a applyTransformToDestination of weights(relative to impl; could be word
   *     counts or tfidf scores)
   */
  @Override
  public DataSet vectorize(String text, String label) {
    Tokenizer tokenizer = tokenizerFactory.create(text);
    List<String> tokens = tokenizer.getTokens();
    DoubleMatrix input = new DoubleMatrix(1, vocab.size());
    for (String token : tokens) {
      // Single indexOf lookup per token; the original scanned the vocab list twice.
      int idx = vocab.indexOf(token);
      if (idx >= 0) {
        input.put(idx, wordCounts.getCount(token));
      }
    }

    DoubleMatrix labelMatrix = MatrixUtil.toOutcomeVector(labels.indexOf(label), labels.size());
    return new DataSet(input, labelMatrix);
  }
Example #5
0
  /**
   * Accumulates term statistics for one document: per-document word frequencies, global
   * term frequency (tf), and document frequency (idf — incremented at most once per word
   * per document).
   *
   * @param doc the document file, read line by line as sentences
   */
  protected void addForDoc(File doc) {
    // Words already counted for this document, so idf is bumped once per word per doc.
    Set<String> encountered = new HashSet<String>();
    SentenceIterator iter = new LineSentenceIterator(doc);
    while (iter.hasNext()) {
      String line = iter.nextSentence();
      if (line == null) continue;
      Tokenizer tokenizer = tokenizerFactory.create(new InputHomogenization(line).transform());
      while (tokenizer.hasMoreTokens()) {
        String token = tokenizer.nextToken();
        // Dropped the unused punct.matcher(token) local; validWord is the sole filter.
        if (validWord(token)) {
          documentWordFrequencies.incrementCount(token, doc.getAbsolutePath(), 1.0);
          tf.incrementCount(token, 1.0);
          // Set.add returns false for duplicates, replacing the contains-then-add pair.
          if (encountered.add(token)) {
            idf.incrementCount(token, 1.0);
          }
        }
      }
    }
    // Release the iterator only after the whole document is consumed; the original called
    // finish() inside the loop, tearing the iterator down after the first sentence.
    iter.finish();
  }