コード例 #1
0
  /**
   * This method is required for compatibility purposes. It just transfers vocabulary from
   * VocabHolder into VocabCache
   *
   * @param cache
   */
  public void transferBackToVocabCache(VocabCache cache, boolean emptyHolder) {
    if (!(cache instanceof InMemoryLookupCache))
      throw new IllegalStateException("Sorry, only InMemoryLookupCache use implemented.");

    // make sure that huffman codes are updated before transfer
    List<VocabularyWord> words = words(); // updateHuffmanCodes();

    for (VocabularyWord word : words) {
      if (word.getWord().isEmpty()) continue;
      VocabWord vocabWord = new VocabWord(1, word.getWord());

      // if we're transferring full model, it CAN contain HistoricalGradient for AdaptiveGradient
      // feature
      if (word.getHistoricalGradient() != null) {
        INDArray gradient = Nd4j.create(word.getHistoricalGradient());
        vocabWord.setHistoricalGradient(gradient);
      }

      // put VocabWord into both Tokens and Vocabs maps
      ((InMemoryLookupCache) cache).getVocabs().put(word.getWord(), vocabWord);
      ((InMemoryLookupCache) cache).getTokens().put(word.getWord(), vocabWord);

      // update Huffman tree information
      if (word.getHuffmanNode() != null) {
        vocabWord.setIndex(word.getHuffmanNode().getIdx());
        vocabWord.setCodeLength(word.getHuffmanNode().getLength());
        vocabWord.setPoints(
            arrayToList(word.getHuffmanNode().getPoint(), word.getHuffmanNode().getLength()));
        vocabWord.setCodes(
            arrayToList(word.getHuffmanNode().getCode(), word.getHuffmanNode().getLength()));

        // put word into index
        cache.addWordToIndex(word.getHuffmanNode().getIdx(), word.getWord());
      }

      // update vocabWord counter. substract 1, since its the base value for any token
      // >1 hack is required since VocabCache impl imples 1 as base word count, not 0
      if (word.getCount() > 1) cache.incrementWordCount(word.getWord(), word.getCount() - 1);
    }

    // at this moment its pretty safe to nullify all vocabs.
    if (emptyHolder) {
      idxMap.clear();
      vocabulary.clear();
    }
  }
コード例 #2
0
  /**
   * Builds VocabularyHolder from VocabCache.
   *
   * <p>Basically we just ignore tokens, and transfer VocabularyWords, supposing that it's already
   * truncated by minWordFrequency.
   *
   * <p>Huffman tree data is ignored and recalculated, due to suspectable flaw in dl4j huffman impl,
   * and it's exsessive memory usage.
   *
   * <p>This code is required for compatibility between dl4j w2v implementation, and standalone w2v
   *
   * @param cache
   */
  protected VocabularyHolder(@NonNull VocabCache cache, boolean markAsSpecial) {
    this.vocabCache = cache;
    for (VocabWord word : cache.tokens()) {
      VocabularyWord vw = new VocabularyWord(word.getWord());
      vw.setCount((int) word.getWordFrequency());

      // since we're importing this word from external VocabCache, we'll assume that this word is
      // SPECIAL, and should NOT be affected by minWordFrequency
      vw.setSpecial(markAsSpecial);

      // please note: we don't transfer huffman data, since proper way is  to recalculate it after
      // new words being added
      if (word.getPoints() != null && !word.getPoints().isEmpty()) {
        vw.setHuffmanNode(
            buildNode(word.getCodes(), word.getPoints(), word.getCodeLength(), word.getIndex()));
      }

      vocabulary.put(vw.getWord(), vw);
    }

    // there's no sense building huffman tree just for UNK word
    if (numWords() > 1) updateHuffmanCodes();
    logger.info("Init from VocabCache is complete. " + numWords() + " word(s) were transferred.");
  }