/**
 * Creates a lookup cache, optionally seeded with the {@code Word2Vec.UNK} token.
 *
 * @param addUnk when {@code true}, a {@code VocabWord} for {@code Word2Vec.UNK} is created
 *     with index 0 and registered as a token, in the word index, and as a vocab word;
 *     when {@code false}, nothing is registered here
 */
public InMemoryLookupCache(boolean addUnk) {
    // Nothing to seed unless the caller asked for the UNK entry.
    if (!addUnk) {
      return;
    }

    VocabWord unk = new VocabWord(1.0, Word2Vec.UNK);
    unk.setIndex(0);

    // Register the UNK entry: token first, then the index slot, then the vocab word.
    addToken(unk);
    addWordToIndex(0, Word2Vec.UNK);
    putVocabWord(Word2Vec.UNK);
  }
Пример #2
0
  /**
   * Registers {@code stringToken} in the given cache as a vocab word if the cache does not
   * already contain it as a word.
   *
   * @param vocab the cache to update
   * @param stringToken the raw token text
   */
  private static void addTokenToVocabCache(InMemoryLookupCache vocab, String stringToken) {
    // Reuse the cache's existing token when there is one; otherwise build a fresh VocabWord.
    VocabWord token =
        vocab.hasToken(stringToken) ? vocab.tokenFor(stringToken) : new VocabWord(1, stringToken);

    // Already known as a vocab word: leave the cache untouched.
    if (vocab.containsWord(stringToken)) {
      return;
    }

    vocab.addToken(token);
    // The index is taken from numWords() after addToken and before putVocabWord.
    token.setIndex(vocab.numWords());
    vocab.putVocabWord(stringToken);
  }
  /**
   * Load a lookup cache from an input stream delimited by \n, one word per line.
   *
   * <p>The stream is decoded as UTF-8. Empty lines are skipped. Every other line is counted
   * via {@code incrementWordCount}, wrapped in a {@code VocabWord}, and registered as a token,
   * in the word index, and as a vocab word. Indices are assigned in order of appearance,
   * starting at 0; skipped lines do not consume an index.
   *
   * <p>This method does not close {@code from}; the caller remains responsible for it.
   *
   * @param from the input stream to read from
   * @return the in memory lookup cache
   */
  public static InMemoryLookupCache load(InputStream from) {
    // Decode explicitly as UTF-8: the single-argument InputStreamReader constructor falls
    // back to the platform default charset, which makes non-ASCII words load differently
    // from one machine to another.
    Reader reader = new InputStreamReader(from, java.nio.charset.StandardCharsets.UTF_8);
    LineIterator iter = IOUtils.lineIterator(reader);
    InMemoryLookupCache ret = new InMemoryLookupCache();
    int count = 0;
    while (iter.hasNext()) {
      String line = iter.nextLine();
      if (line.isEmpty()) {
        continue;
      }
      ret.incrementWordCount(line);
      VocabWord word = new VocabWord(1.0, line);
      word.setIndex(count);
      ret.addToken(word);
      ret.addWordToIndex(count, line);
      ret.putVocabWord(line);
      count++;
    }

    return ret;
  }