/**
 * Transforms the given text into a 1 x vocab-size row vector of word counts.
 *
 * <p>The text is tokenized; for every token present in the vocab, the token's
 * count from {@code wordCounts} is written at the token's vocab index.
 * Out-of-vocab tokens are skipped.
 *
 * @param text the text to transform
 * @return a 1 x {@code vocab.size()} matrix holding the counts of in-vocab tokens
 */
 @Override
 public DoubleMatrix transform(String text) {
   Tokenizer tokenizer = tokenizerFactory.create(text);
   List<String> tokens = tokenizer.getTokens();
   DoubleMatrix input = new DoubleMatrix(1, vocab.size());
   for (String token : tokens) {
     int idx = vocab.indexOf(token);
     // Reuse the already-computed index instead of a second, redundant vocab lookup.
     if (idx >= 0) input.put(idx, wordCounts.getCount(token));
   }
   return input;
 }
  /**
   * Vectorizes the passed-in text, treating it as one document.
   *
   * @param text the text to vectorize
   * @param label the label of the text; expected to be present in {@code labels}
   * @return a dataset pairing the feature vector with a one-hot outcome vector for
   *     {@code label} (weights are relative to the impl; could be word counts or
   *     tfidf scores)
   */
  @Override
  public DataSet vectorize(String text, String label) {
    Tokenizer tokenizer = tokenizerFactory.create(text);
    List<String> tokens = tokenizer.getTokens();
    DoubleMatrix input = new DoubleMatrix(1, vocab.size());
    for (String token : tokens) {
      int idx = vocab.indexOf(token);
      // Reuse the already-computed index instead of a second, redundant vocab lookup.
      if (idx >= 0) input.put(idx, wordCounts.getCount(token));
    }

    // NOTE(review): labels.indexOf(label) is -1 for an unknown label — presumably
    // callers guarantee a known label; confirm toOutcomeVector's handling of -1.
    DoubleMatrix labelMatrix = MatrixUtil.toOutcomeVector(labels.indexOf(label), labels.size());
    return new DataSet(input, labelMatrix);
  }
// NOTE(review): removed extraction artifact ("예제 #3" / "0") — stray snippet-site
// text that was not valid Java.
  /**
   * Creates an index of the top {@code size} words ranked by tf-idf.
   *
   * <p>Word frequencies are (re)computed first; stop words and pure-punctuation
   * tokens are excluded from the ranking. The resulting vocab is also cached in
   * {@code currVocab}.
   *
   * @param size the number of words in the vocab
   * @return the index of the retained words
   */
  public Index createVocab(int size) {
    Index vocab = new Index();

    // bootstrapping: populate the tf/idf/document-frequency counters
    calcWordFrequencies();

    // term frequency has every word
    for (String word : tf.keySet()) {
      double tfVal = MathUtils.tf((int) documentWordFrequencies.getCount(word));
      double idfVal = MathUtils.idf(numFiles, idf.getCount(word));
      double tfidfVal = MathUtils.tfidf(tfVal, idfVal);
      java.util.regex.Matcher m = punct.matcher(word);

      // skip stop words and tokens that are entirely punctuation
      if (!stopWords.contains(word) && !m.matches()) tfidf.setCount(word, tfidfVal);
    }

    Counter<String> aggregate = tfidf;

    // keep top size keys via tfidf rankings
    // NOTE(review): keepTopNKeys(size - 1) retains one fewer than `size` entries —
    // confirm whether this off-by-one is intentional.
    aggregate.keepTopNKeys(size - 1);

    // Parameterized logging avoids string concatenation when INFO is disabled.
    log.info("Created vocab of size {}", aggregate.size());

    wordScores = aggregate;

    // add words that made it via rankings
    for (String word : aggregate.keySet()) {
      if (vocab.indexOf(word) < 0) vocab.add(word);
    }

    // cache the vocab
    currVocab = vocab;
    return vocab;
  }
 /**
  * Consumes every remaining sentence from the iterator, tallying a count for each
  * non-stop-word token and growing the vocab with any token not yet indexed.
  */
 private void process() {
   while (sentenceIter.hasNext()) {
     List<String> sentenceTokens = tokenizerFactory.create(sentenceIter.nextSentence()).getTokens();
     for (String word : sentenceTokens) {
       if (stopWords.contains(word)) {
         continue; // stop words contribute neither counts nor vocab entries
       }
       wordCounts.incrementCount(word, 1.0);
       if (vocab.indexOf(word) < 0) {
         vocab.add(word);
       }
     }
   }
 }
 /**
  * Looks up the position of {@code word} in the underlying word index.
  *
  * <p>Synchronized to serialize lookups with concurrent modifications of the index.
  *
  * @param word the word whose index should be resolved
  * @return the word's index, or -1 when the word is not present
  */
 @Override
 public synchronized int indexOf(String word) {
   final int position = wordIndex.indexOf(word);
   return position;
 }
 /**
  * Resolves {@code word} to its position in the backing word index.
  *
  * @param word the word to look up
  * @return the word's index, or -1 if the word is unknown
  */
 @Override
 public int indexOf(String word) {
   return this.wordIndex.indexOf(word);
 }