/** * This method is required for compatibility purposes. It just transfers vocabulary from * VocabHolder into VocabCache * * @param cache */ public void transferBackToVocabCache(VocabCache cache, boolean emptyHolder) { if (!(cache instanceof InMemoryLookupCache)) throw new IllegalStateException("Sorry, only InMemoryLookupCache use implemented."); // make sure that huffman codes are updated before transfer List<VocabularyWord> words = words(); // updateHuffmanCodes(); for (VocabularyWord word : words) { if (word.getWord().isEmpty()) continue; VocabWord vocabWord = new VocabWord(1, word.getWord()); // if we're transferring full model, it CAN contain HistoricalGradient for AdaptiveGradient // feature if (word.getHistoricalGradient() != null) { INDArray gradient = Nd4j.create(word.getHistoricalGradient()); vocabWord.setHistoricalGradient(gradient); } // put VocabWord into both Tokens and Vocabs maps ((InMemoryLookupCache) cache).getVocabs().put(word.getWord(), vocabWord); ((InMemoryLookupCache) cache).getTokens().put(word.getWord(), vocabWord); // update Huffman tree information if (word.getHuffmanNode() != null) { vocabWord.setIndex(word.getHuffmanNode().getIdx()); vocabWord.setCodeLength(word.getHuffmanNode().getLength()); vocabWord.setPoints( arrayToList(word.getHuffmanNode().getPoint(), word.getHuffmanNode().getLength())); vocabWord.setCodes( arrayToList(word.getHuffmanNode().getCode(), word.getHuffmanNode().getLength())); // put word into index cache.addWordToIndex(word.getHuffmanNode().getIdx(), word.getWord()); } // update vocabWord counter. substract 1, since its the base value for any token // >1 hack is required since VocabCache impl imples 1 as base word count, not 0 if (word.getCount() > 1) cache.incrementWordCount(word.getWord(), word.getCount() - 1); } // at this moment its pretty safe to nullify all vocabs. if (emptyHolder) { idxMap.clear(); vocabulary.clear(); } }
/** * Builds VocabularyHolder from VocabCache. * * <p>Basically we just ignore tokens, and transfer VocabularyWords, supposing that it's already * truncated by minWordFrequency. * * <p>Huffman tree data is ignored and recalculated, due to suspectable flaw in dl4j huffman impl, * and it's exsessive memory usage. * * <p>This code is required for compatibility between dl4j w2v implementation, and standalone w2v * * @param cache */ protected VocabularyHolder(@NonNull VocabCache cache, boolean markAsSpecial) { this.vocabCache = cache; for (VocabWord word : cache.tokens()) { VocabularyWord vw = new VocabularyWord(word.getWord()); vw.setCount((int) word.getWordFrequency()); // since we're importing this word from external VocabCache, we'll assume that this word is // SPECIAL, and should NOT be affected by minWordFrequency vw.setSpecial(markAsSpecial); // please note: we don't transfer huffman data, since proper way is to recalculate it after // new words being added if (word.getPoints() != null && !word.getPoints().isEmpty()) { vw.setHuffmanNode( buildNode(word.getCodes(), word.getPoints(), word.getCodeLength(), word.getIndex())); } vocabulary.put(vw.getWord(), vw); } // there's no sense building huffman tree just for UNK word if (numWords() > 1) updateHuffmanCodes(); logger.info("Init from VocabCache is complete. " + numWords() + " word(s) were transferred."); }