コード例 #1
0
 /** This methods reset counters for all words in vocabulary */
 public void resetWordCounters() {
   for (VocabularyWord word : getVocabulary()) {
     word.setHuffmanNode(null);
     word.setFrequencyShift(null);
     word.setCount(0);
   }
 }
コード例 #2
0
  /**
   * Builds VocabularyHolder from VocabCache.
   *
   * <p>Basically we just ignore tokens, and transfer VocabularyWords, supposing that it's already
   * truncated by minWordFrequency.
   *
   * <p>Huffman tree data is ignored and recalculated, due to suspectable flaw in dl4j huffman impl,
   * and it's exsessive memory usage.
   *
   * <p>This code is required for compatibility between dl4j w2v implementation, and standalone w2v
   *
   * @param cache
   */
  protected VocabularyHolder(@NonNull VocabCache cache, boolean markAsSpecial) {
    this.vocabCache = cache;
    for (VocabWord word : cache.tokens()) {
      VocabularyWord vw = new VocabularyWord(word.getWord());
      vw.setCount((int) word.getWordFrequency());

      // since we're importing this word from external VocabCache, we'll assume that this word is
      // SPECIAL, and should NOT be affected by minWordFrequency
      vw.setSpecial(markAsSpecial);

      // please note: we don't transfer huffman data, since proper way is  to recalculate it after
      // new words being added
      if (word.getPoints() != null && !word.getPoints().isEmpty()) {
        vw.setHuffmanNode(
            buildNode(word.getCodes(), word.getPoints(), word.getCodeLength(), word.getIndex()));
      }

      vocabulary.put(vw.getWord(), vw);
    }

    // there's no sense building huffman tree just for UNK word
    if (numWords() > 1) updateHuffmanCodes();
    logger.info("Init from VocabCache is complete. " + numWords() + " word(s) were transferred.");
  }