/**
 * This method removes low-frequency words based on their frequency change between activations.
 * I.e. if a word has appeared only once, and it has retained the same frequency over consecutive
 * activations, we can assume it can be removed freely.
 */
protected synchronized void activateScavenger() {
    int initialSize = vocabulary.size();
    List<VocabularyWord> words = new ArrayList<>(vocabulary.values());
    for (VocabularyWord word : words) {
        // scavenging is applied only to non-special tokens that are below minWordFrequency
        if (word.isSpecial() || word.getCount() >= minWordFrequency
                || word.getFrequencyShift() == null) {
            word.setFrequencyShift(null);
            continue;
        }

        // save the current word counter into the byte array at the current retention position
        word.getFrequencyShift()[word.getRetentionStep()] = (byte) word.getCount();

        /*
           We're hunting only low-frequency words that have already passed a few activations,
           so we take the word's personal threshold as 20% of minWordFrequency, but not less
           than 2. If, after a few scavenging cycles, the word count is still <= activation,
           we just remove the word; otherwise we nullify word.frequencyShift to avoid further
           checks.
        */
        int activation = Math.max(minWordFrequency / 5, 2);
        logger.debug("Current state> Activation: [" + activation + "], retention info: "
                + Arrays.toString(word.getFrequencyShift()));
        if (word.getCount() <= activation
                && word.getFrequencyShift()[this.retentionDelay - 1] > 0) {
            // if the word count at the latest retention point is the same as at the beginning
            // of the window, the word has stagnated - just remove it
            if (word.getFrequencyShift()[this.retentionDelay - 1] <= activation
                    && word.getFrequencyShift()[this.retentionDelay - 1] == word.getFrequencyShift()[0]) {
                vocabulary.remove(word.getWord());
            }
        }

        // advance the retention position, or shift the retention history to the left once full
        if (word.getRetentionStep() < retentionDelay - 1) {
            word.incrementRetentionStep();
        } else {
            for (int x = 1; x < retentionDelay; x++) {
                word.getFrequencyShift()[x - 1] = word.getFrequencyShift()[x];
            }
        }
    }
    logger.info("Scavenger was activated. Vocab size before: [" + initialSize + "], after: ["
            + vocabulary.size() + "]");
}
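// Illustrative sketch, NOT part of the original class: the culling rule from
// activateScavenger() factored out in isolation, for readability. It assumes the
// retention window has already been filled; wouldBeScavenged is a hypothetical
// name introduced here purely for illustration.
protected boolean wouldBeScavenged(VocabularyWord word) {
    // personal threshold: 20% of minWordFrequency, but not less than 2
    int activation = Math.max(minWordFrequency / 5, 2);
    byte[] shift = word.getFrequencyShift();
    return !word.isSpecial()                          // special tokens are never scavenged
            && shift != null                          // word is still being tracked
            && word.getCount() <= activation          // still below its personal threshold
            && shift[retentionDelay - 1] > 0          // retention window has been filled
            && shift[retentionDelay - 1] <= activation
            && shift[retentionDelay - 1] == shift[0]; // count stagnated across the window
}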
/**
 * Returns the total number of word occurrences in the vocabulary. The sum is computed
 * lazily on the first call and memoized in totalWordOccurencies afterwards.
 */
public long totalWordsBeyondLimit() {
    if (totalWordOccurencies == 0) {
        for (VocabularyWord word : vocabulary.values()) {
            totalWordOccurencies += word.getCount();
        }
    }
    return totalWordOccurencies;
}
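// Hypothetical helper, NOT part of the original class: totalWordsBeyondLimit() memoizes
// its sum in totalWordOccurencies and never invalidates it, so the cached value can go
// stale after truncateVocabulary() or the scavenger removes words. A caller could force
// a recount like this (invalidateWordCountCache is a name introduced here):
protected void invalidateWordCountCache() {
    totalWordOccurencies = 0; // next totalWordsBeyondLimit() call recomputes the sum
}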
/**
 * This method is required for compatibility purposes. It just transfers the vocabulary from
 * VocabHolder into VocabCache.
 *
 * @param cache target cache; only InMemoryLookupCache is supported
 * @param emptyHolder if true, this holder's internal maps are cleared after the transfer
 */
public void transferBackToVocabCache(VocabCache cache, boolean emptyHolder) {
    if (!(cache instanceof InMemoryLookupCache))
        throw new IllegalStateException("Sorry, only InMemoryLookupCache is implemented.");

    // make sure that Huffman codes are updated before the transfer
    List<VocabularyWord> words = words(); // updateHuffmanCodes();

    for (VocabularyWord word : words) {
        if (word.getWord().isEmpty())
            continue;
        VocabWord vocabWord = new VocabWord(1, word.getWord());

        // if we're transferring the full model, it CAN contain a HistoricalGradient for the
        // AdaptiveGradient feature
        if (word.getHistoricalGradient() != null) {
            INDArray gradient = Nd4j.create(word.getHistoricalGradient());
            vocabWord.setHistoricalGradient(gradient);
        }

        // put the VocabWord into both the Tokens and Vocabs maps
        ((InMemoryLookupCache) cache).getVocabs().put(word.getWord(), vocabWord);
        ((InMemoryLookupCache) cache).getTokens().put(word.getWord(), vocabWord);

        // update Huffman tree information
        if (word.getHuffmanNode() != null) {
            vocabWord.setIndex(word.getHuffmanNode().getIdx());
            vocabWord.setCodeLength(word.getHuffmanNode().getLength());
            vocabWord.setPoints(arrayToList(word.getHuffmanNode().getPoint(),
                    word.getHuffmanNode().getLength()));
            vocabWord.setCodes(arrayToList(word.getHuffmanNode().getCode(),
                    word.getHuffmanNode().getLength()));

            // put the word into the index
            cache.addWordToIndex(word.getHuffmanNode().getIdx(), word.getWord());
        }

        // update the vocabWord counter. Subtract 1, since 1 is the base value for any token:
        // this hack is required because the VocabCache impl implies 1 as the base word count, not 0
        if (word.getCount() > 1)
            cache.incrementWordCount(word.getWord(), word.getCount() - 1);
    }

    // at this moment it's pretty safe to nullify all vocabs
    if (emptyHolder) {
        idxMap.clear();
        vocabulary.clear();
    }
}
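// Hypothetical usage sketch, NOT part of the original class. It assumes
// InMemoryLookupCache has a usable no-arg constructor; demoTransfer is a name
// introduced here purely for illustration.
protected VocabCache demoTransfer() {
    VocabCache cache = new InMemoryLookupCache();
    // true -> this holder's idxMap and vocabulary are cleared once the transfer completes
    transferBackToVocabCache(cache, true);
    return cache;
}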
/**
 * All words with frequency below the threshold will be removed.
 *
 * @param threshold exclusive threshold for removal
 */
public void truncateVocabulary(int threshold) {
    logger.debug("Truncating vocabulary to minWordFrequency: [" + threshold + "]");

    // iterate over a snapshot, since we remove entries from the map as we go
    List<VocabularyWord> words = new ArrayList<>(vocabulary.values());
    for (VocabularyWord vw : words) {
        // please note: the threshold is NOT applied to SPECIAL words
        if (!vw.isSpecial() && vw.getCount() < threshold) {
            vocabulary.remove(vw.getWord());
            if (vw.getHuffmanNode() != null)
                idxMap.remove(vw.getHuffmanNode().getIdx());
        }
    }
}
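// Hypothetical usage sketch, NOT part of the original class: the typical order of
// operations at the end of vocabulary construction - truncate first, so only words
// that survived the frequency cutoff are transferred into the cache.
// demoTruncateAndTransfer is a name introduced here purely for illustration.
protected void demoTruncateAndTransfer(VocabCache cache) {
    truncateVocabulary(minWordFrequency);  // drop non-special words below the threshold
    transferBackToVocabCache(cache, true); // move survivors and empty this holder
}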