/**
 * Resets the per-word counters for every word in the vocabulary.
 *
 * <p>For each word returned by {@code getVocabulary()} the Huffman node and the frequency shift
 * are cleared (set to {@code null}) and the count is set back to {@code 0}.
 */
public void resetWordCounters() {
  for (VocabularyWord entry : getVocabulary()) {
    // drop derived data first, then zero the raw counter
    entry.setHuffmanNode(null);
    entry.setFrequencyShift(null);
    entry.setCount(0);
  }
}
/** * Builds VocabularyHolder from VocabCache. * * <p>Basically we just ignore tokens, and transfer VocabularyWords, supposing that it's already * truncated by minWordFrequency. * * <p>Huffman tree data is ignored and recalculated, due to suspectable flaw in dl4j huffman impl, * and it's exsessive memory usage. * * <p>This code is required for compatibility between dl4j w2v implementation, and standalone w2v * * @param cache */ protected VocabularyHolder(@NonNull VocabCache cache, boolean markAsSpecial) { this.vocabCache = cache; for (VocabWord word : cache.tokens()) { VocabularyWord vw = new VocabularyWord(word.getWord()); vw.setCount((int) word.getWordFrequency()); // since we're importing this word from external VocabCache, we'll assume that this word is // SPECIAL, and should NOT be affected by minWordFrequency vw.setSpecial(markAsSpecial); // please note: we don't transfer huffman data, since proper way is to recalculate it after // new words being added if (word.getPoints() != null && !word.getPoints().isEmpty()) { vw.setHuffmanNode( buildNode(word.getCodes(), word.getPoints(), word.getCodeLength(), word.getIndex())); } vocabulary.put(vw.getWord(), vw); } // there's no sense building huffman tree just for UNK word if (numWords() > 1) updateHuffmanCodes(); logger.info("Init from VocabCache is complete. " + numWords() + " word(s) were transferred."); }