/**
   * Adds a word to the index, registering it as a token and a vocab word if necessary.
   *
   * @param index the index to assign to the word
   * @param word the word to add; must be non-null and non-empty
   */
  @Override
  public synchronized void addWordToIndex(int index, String word) {
    if (word == null || word.isEmpty())
      throw new IllegalArgumentException("Word can't be empty or null");

    if (!tokens.containsKey(word)) {
      VocabWord token = new VocabWord(1.0, word);
      tokens.put(word, token);
      wordFrequencies.incrementCount(word, 1.0);
    }

    /*
       A word added to the index directly becomes a vocab word, not just a token.
    */
    if (!vocabs.containsKey(word)) {
      VocabWord vw = tokenFor(word);
      vw.setIndex(index);
      vocabs.put(word, vw);
    }

    if (!wordFrequencies.containsKey(word)) wordFrequencies.incrementCount(word, 1);

    wordIndex.add(word, index);
  }
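A minimal usage sketch for this method, assuming InMemoryLookupCache has a usable no-arg constructor; the words and the cache variable are illustrative, not from the source:

  InMemoryLookupCache cache = new InMemoryLookupCache();
  cache.addWordToIndex(0, "hello");
  cache.addWordToIndex(1, "world");
  // inverse lookups, as implemented further down in this section
  System.out.println(cache.indexOf("hello")); // 0
  System.out.println(cache.wordAtIndex(1));   // "world"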
 @Override
 public int hashCode() {
   int result = wordIndex != null ? wordIndex.hashCode() : 0;
   result = 31 * result + (wordFrequencies != null ? wordFrequencies.hashCode() : 0);
   result = 31 * result + (docFrequencies != null ? docFrequencies.hashCode() : 0);
   result = 31 * result + (vocabs != null ? vocabs.hashCode() : 0);
   result = 31 * result + (tokens != null ? tokens.hashCode() : 0);
   result = 31 * result + (totalWordOccurrences != null ? totalWordOccurrences.hashCode() : 0);
   result = 31 * result + numDocs;
   return result;
 }
Example #3
  public DoubleMatrix getScoreMatrix(File file) {
    Counter<String> docWords = new Counter<String>();
    LineIterator iter = null;
    try {
      iter = FileUtils.lineIterator(file);
      while (iter.hasNext()) {
        Tokenizer t =
            tokenizerFactory.create((new InputHomogenization(iter.nextLine()).transform()));
        while (t.hasMoreTokens()) {
          docWords.incrementCount(t.nextToken(), 1.0);
        }
      }
    } catch (IOException e) {
      throw new IllegalStateException("Unable to read file", e);
    } finally {
      // release the underlying reader even if tokenization throws
      LineIterator.closeQuietly(iter);
    }
    DoubleMatrix ret = new DoubleMatrix(1, currVocab.size());

    for (int i = 0; i < currVocab.size(); i++) {
      if (docWords.getCount(currVocab.get(i).toString()) > 0) {
        ret.put(i, wordScores.getCount(currVocab.get(i).toString()));
      }
    }

    return ret;
  }
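A usage sketch for getScoreMatrix, assuming createVocab (below) has already populated currVocab and wordScores; the vectorizer instance and the file path are illustrative:

  DoubleMatrix scores = vectorizer.getScoreMatrix(new File("docs/article.txt"));
  // one row, one column per vocab word; non-zero entries are tf-idf scores
  System.out.println("columns: " + scores.columns);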
Example #4
  /**
   * Creates an index of the top {@code size} words ranked by tf-idf score.
   *
   * @param size the number of words to keep in the vocab
   * @return the index of the retained words
   */
  public Index createVocab(int size) {
    Index vocab = new Index();

    // bootstrapping
    calcWordFrequencies();

    // term frequency has every word
    for (String word : tf.keySet()) {
      double tfVal = MathUtils.tf((int) documentWordFrequencies.getCount(word));
      double idfVal = MathUtils.idf(numFiles, idf.getCount(word));
      double tfidfVal = MathUtils.tfidf(tfVal, idfVal);
      java.util.regex.Matcher m = punct.matcher(word);

      if (!stopWords.contains(word) && !m.matches()) tfidf.setCount(word, tfidfVal);
    }

    Counter<String> aggregate = tfidf;

    // keep top size keys via tfidf rankings
    aggregate.keepTopNKeys(size - 1);

    log.info("Created vocab of size " + aggregate.size());

    wordScores = aggregate;

    // add words that made it via rankings
    for (String word : aggregate.keySet()) {
      if (vocab.indexOf(word) < 0) vocab.add(word);
    }

    // cache the vocab
    currVocab = vocab;
    return vocab;
  }
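Taken together with getScoreMatrix above, the intended call order looks roughly like this (the vectorizer instance name is an assumption):

  Index vocab = vectorizer.createVocab(5000); // keep the top 5000 words by tf-idf
  DoubleMatrix scores = vectorizer.getScoreMatrix(new File("docs/article.txt"));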
 /**
  * Transforms the given text into a row vector of word counts over the vocab.
  *
  * @param text the text to transform
  * @return a 1 x vocab-size matrix holding counts for tokens found in the vocab
  */
 @Override
 public DoubleMatrix transform(String text) {
   Tokenizer tokenizer = tokenizerFactory.create(text);
   List<String> tokens = tokenizer.getTokens();
   DoubleMatrix input = new DoubleMatrix(1, vocab.size());
   for (int i = 0; i < tokens.size(); i++) {
     int idx = vocab.indexOf(tokens.get(i));
     if (idx >= 0) input.put(idx, wordCounts.getCount(tokens.get(i)));
   }
   return input;
 }
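A minimal sketch of calling transform once process() (below) has filled vocab and wordCounts; the instance name is illustrative:

  DoubleMatrix row = vectorizer.transform("the quick brown fox");
  // row is 1 x vocab.size(); entries are the global counts for tokens present in the vocab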
 /** Builds the vocab and global word counts from the sentence iterator, skipping stop words. */
 private void process() {
   while (sentenceIter.hasNext()) {
     Tokenizer tokenizer = tokenizerFactory.create(sentenceIter.nextSentence());
     List<String> tokens = tokenizer.getTokens();
     for (String token : tokens)
       if (!stopWords.contains(token)) {
         wordCounts.incrementCount(token, 1.0);
         if (vocab.indexOf(token) < 0) vocab.add(token);
       }
   }
 }
  /**
   * Vectorizes the passed in text, treating it as one document
   *
   * @param text the text to vectorize
   * @param label the label of the text
   * @return a dataset with a set of weights (relative to impl; could be word counts or tf-idf
   *     scores)
   */
  @Override
  public DataSet vectorize(String text, String label) {
    Tokenizer tokenizer = tokenizerFactory.create(text);
    List<String> tokens = tokenizer.getTokens();
    DoubleMatrix input = new DoubleMatrix(1, vocab.size());
    for (int i = 0; i < tokens.size(); i++) {
      int idx = vocab.indexOf(tokens.get(i));
      if (idx >= 0) input.put(idx, wordCounts.getCount(tokens.get(i)));
    }

    DoubleMatrix labelMatrix = MatrixUtil.toOutcomeVector(labels.indexOf(label), labels.size());
    return new DataSet(input, labelMatrix);
  }
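A sketch of producing one labeled training example, assuming the labels list was supplied to the vectorizer beforehand (names illustrative):

  DataSet example = vectorizer.vectorize("this movie was great", "positive");
  // pairs the 1 x vocab-size count vector with a one-hot outcome row for "positive"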
  /**
   * Loads the google binary model Credit to:
   * https://github.com/NLPchina/Word2VEC_java/blob/master/src/com/ansj/vec/Word2VEC.java
   *
   * @param path path to model
   * @throws IOException
   */
  public static Word2Vec loadGoogleModel(String path) throws IOException {
    DataInputStream dis = null;
    BufferedInputStream bis = null;
    double len = 0;
    float vector = 0;
    Word2Vec ret = new Word2Vec();
    Index wordIndex = new Index();
    FloatMatrix wordVectors = null;
    try {
      bis =
          new BufferedInputStream(
              path.endsWith(".gz")
                  ? new GZIPInputStream(new FileInputStream(path))
                  : new FileInputStream(path));
      dis = new DataInputStream(bis);
      Map<String, FloatMatrix> wordMap = new HashMap<>();
      // number of words
      int words = Integer.parseInt(readString(dis));
      // word vector size
      int size = Integer.parseInt(readString(dis));
      wordVectors = new FloatMatrix(words, size);
      String word;
      float[] vectors = null;
      for (int i = 0; i < words; i++) {
        word = readString(dis);
        log.info("Loaded " + word);
        vectors = new float[size];
        len = 0;
        for (int j = 0; j < size; j++) {
          vector = readFloat(dis);
          len += vector * vector;
          vectors[j] = vector;
        }
        len = Math.sqrt(len);

        for (int j = 0; j < size; j++) {
          vectors[j] /= len;
        }
        wordIndex.add(word);
        wordVectors.putRow(i, new FloatMatrix(vectors));
      }
    } finally {
      // dis wraps bis, so closing dis closes both; guard against a failed open
      if (dis != null) dis.close();
      else if (bis != null) bis.close();
    }

    ret.setWordIndex(wordIndex);
    ret.setSyn0(wordVectors);

    return ret;
  }
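A usage sketch, assuming this static factory lives on Word2Vec and the file follows the standard GoogleNews binary layout (the path is illustrative):

  Word2Vec vec = Word2Vec.loadGoogleModel("GoogleNews-vectors-negative300.bin.gz");
  // rows of syn0 are L2-normalized during loading, so dot products act as cosine similarities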
 /**
  * Adds a word to the index, recording an initial frequency if none exists.
  *
  * @param index the index to assign to the word
  * @param word the word to add; must be non-null and non-empty
  */
 @Override
 public synchronized void addWordToIndex(int index, String word) {
   if (word == null || word.isEmpty())
     throw new IllegalArgumentException("Word can't be empty or null");
   if (!wordFrequencies.containsKey(word)) wordFrequencies.incrementCount(word, 1);
   wordIndex.add(word, index);
 }
 /**
  * Promotes an existing token to a vocab word.
  *
  * @param word the word to promote; it must already be registered as a token
  */
 @Override
 public synchronized void putVocabWord(String word) {
   if (word == null || word.isEmpty())
     throw new IllegalArgumentException("Word can't be empty or null");
   // STOP and UNK are not added as tokens
   if (word.equals("STOP") || word.equals("UNK")) return;
   VocabWord token = tokenFor(word);
   if (token == null)
     throw new IllegalStateException("Word " + word + " not found as token in vocab");
   int ind = token.getIndex();
   addWordToIndex(ind, word);
   if (!hasToken(word))
     throw new IllegalStateException("Cannot add word " + word + " to vocab: it is not a registered token");
   vocabs.put(word, token);
   wordIndex.add(word, token.getIndex());
 }
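The token-first contract this method enforces, sketched with an assumed addToken registration step (addToken does not appear in this section and is an assumption):

  cache.addToken(new VocabWord(1.0, "hello")); // register the token first
  cache.putVocabWord("hello");                 // then promote it to a vocab word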
  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;

    InMemoryLookupCache that = (InMemoryLookupCache) o;

    if (numDocs != that.numDocs) return false;
    if (wordIndex != null ? !wordIndex.equals(that.wordIndex) : that.wordIndex != null)
      return false;
    if (wordFrequencies != null
        ? !wordFrequencies.equals(that.wordFrequencies)
        : that.wordFrequencies != null) return false;
    if (docFrequencies != null
        ? !docFrequencies.equals(that.docFrequencies)
        : that.docFrequencies != null) return false;
    return vocabWords().equals(that.vocabWords());
  }
 /**
  * Returns the index of a given word
  *
 * @param word the word to look up
 * @return the index of the word or -1 if not found
  */
 @Override
 public synchronized int indexOf(String word) {
   return wordIndex.indexOf(word);
 }
 /**
  * Returns the word contained at the given index or null
  *
  * @param index the index of the word to get
  * @return the word at the given index
  */
 @Override
 public synchronized String wordAtIndex(int index) {
   return (String) wordIndex.get(index);
 }
 /**
  * Returns the index of a given word
  *
 * @param word the word to look up
 * @return the index of the word or -1 if not found
  */
 @Override
 public int indexOf(String word) {
   return wordIndex.indexOf(word);
 }
 /**
  * Returns the word contained at the given index or null
  *
  * @param index the index of the word to get
  * @return the word at the given index
  */
 @Override
 public String wordAtIndex(int index) {
   return (String) wordIndex.get(index);
 }
 /**
  * Adds a word to the index, creating a default vocab entry if one is missing.
  *
  * @param index the index requested for the word
  * @param word the word to add
  */
 @Override
 public void addWordToIndex(int index, String word) {
   if (!wordFrequencies.containsKey(word)) wordFrequencies.incrementCount(word, 1);
   if (!vocabs.containsKey(word)) vocabs.put(word, new VocabWord(1, vectorLength));
   wordIndex.add(word); // note: appends to the index rather than honoring the supplied index argument
 }