  /**
   * This method removes low-frequency words based on how their counts change between activations.
   * I.e. if a word has appeared only a few times and has retained the same count over consecutive
   * activations, we can assume it is safe to remove.
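   *
   * <p>Illustrative use (hypothetical {@code holder}):</p>
   * <pre>{@code
   * VocabularyHolder holder = ...; // populated while scanning a corpus
   * holder.activateScavenger();    // prune words that stayed stagnant across activations
   * }</pre>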
   */
  protected synchronized void activateScavenger() {
    int initialSize = vocabulary.size();
    List<VocabularyWord> words = new ArrayList<>(vocabulary.values());
    for (VocabularyWord word : words) {
      // scavenging applies only to non-special tokens whose counts are below minWordFrequency
      if (word.isSpecial()
          || word.getCount() >= minWordFrequency
          || word.getFrequencyShift() == null) {
        word.setFrequencyShift(null);
        continue;
      }

      // record the current word count in the retention history at the current step
      word.getFrequencyShift()[word.getRetentionStep()] = (byte) word.getCount();

      /*
            we're hunting only low-frequency words that have already survived a few activations,
            so we treat each word's personal threshold ("activation") as 20% of minWordFrequency,
            but not less than 2.

            if, after a few scavenging cycles, the word count is still <= activation, the word is
            removed; words that grow past minWordFrequency get their frequencyShift nulled at the
            top of the loop and are never checked again.
      */
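      // Illustrative arithmetic: with minWordFrequency = 10 the threshold is max(10 / 5, 2) = 2,
      // so a word stuck at count <= 2 across the whole retention window gets removed.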
      int activation = Math.max(minWordFrequency / 5, 2);
      logger.debug(
          "Current state> Activation: ["
              + activation
              + "], retention info: "
              + Arrays.toString(word.getFrequencyShift()));
      if (word.getCount() <= activation && word.getFrequencyShift()[this.retentionDelay - 1] > 0) {

        // if the count at the latest retention point is still within the activation threshold
        // and unchanged since the start of the window, the word is stagnant - remove it
        if (word.getFrequencyShift()[this.retentionDelay - 1] <= activation
            && word.getFrequencyShift()[this.retentionDelay - 1] == word.getFrequencyShift()[0]) {
          vocabulary.remove(word.getWord());
        }
      }

      // advance the retention step or, once the window is full, shift the history one slot left
      if (word.getRetentionStep() < retentionDelay - 1) {
        word.incrementRetentionStep();
      } else {
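        // e.g. with retentionDelay = 3, a history [1, 2, 2] becomes [2, 2, 2]; the last slot is
        // overwritten with the fresh count on the next activation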
        for (int x = 1; x < retentionDelay; x++) {
          word.getFrequencyShift()[x - 1] = word.getFrequencyShift()[x];
        }
      }
    }
    logger.info(
        "Scavenger was activated. Vocab size before: ["
            + initialSize
            + "], after: ["
            + vocabulary.size()
            + "]");
  }
  /**
   * Returns the total number of word occurrences in this vocabulary. The sum is computed lazily
   * on the first call and cached in totalWordOccurencies afterwards.
   */
  public long totalWordsBeyondLimit() {
    if (totalWordOccurencies == 0) {
      for (VocabularyWord word : vocabulary.values()) {
        totalWordOccurencies += word.getCount();
      }
    }
    return totalWordOccurencies;
  }
  /**
   * This method is required for compatibility purposes. It transfers the vocabulary from this
   * VocabularyHolder into the given VocabCache.
   *
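   * <p>A minimal sketch, assuming {@code holder} is a populated VocabularyHolder and a no-arg
   * InMemoryLookupCache constructor:</p>
   * <pre>{@code
   * VocabCache cache = new InMemoryLookupCache();
   * holder.transferBackToVocabCache(cache, true); // true empties the holder after transfer
   * }</pre>
   *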
   * @param cache target cache; must be an instance of InMemoryLookupCache
   * @param emptyHolder if true, this holder's vocabulary and index maps are cleared after transfer
   */
  public void transferBackToVocabCache(VocabCache cache, boolean emptyHolder) {
    if (!(cache instanceof InMemoryLookupCache))
      throw new IllegalStateException("Only InMemoryLookupCache is supported at this time.");

    // make sure that huffman codes are updated before transfer
    List<VocabularyWord> words = words(); // updateHuffmanCodes();

    for (VocabularyWord word : words) {
      if (word.getWord().isEmpty()) continue;
      VocabWord vocabWord = new VocabWord(1, word.getWord());

      // if we're transferring the full model, it CAN contain HistoricalGradient for the
      // AdaptiveGradient feature
      if (word.getHistoricalGradient() != null) {
        INDArray gradient = Nd4j.create(word.getHistoricalGradient());
        vocabWord.setHistoricalGradient(gradient);
      }

      // put VocabWord into both Tokens and Vocabs maps
      ((InMemoryLookupCache) cache).getVocabs().put(word.getWord(), vocabWord);
      ((InMemoryLookupCache) cache).getTokens().put(word.getWord(), vocabWord);

      // update Huffman tree information
      if (word.getHuffmanNode() != null) {
        vocabWord.setIndex(word.getHuffmanNode().getIdx());
        vocabWord.setCodeLength(word.getHuffmanNode().getLength());
        vocabWord.setPoints(
            arrayToList(word.getHuffmanNode().getPoint(), word.getHuffmanNode().getLength()));
        vocabWord.setCodes(
            arrayToList(word.getHuffmanNode().getCode(), word.getHuffmanNode().getLength()));

        // put word into index
        cache.addWordToIndex(word.getHuffmanNode().getIdx(), word.getWord());
      }

      // update vocabWord counter: subtract 1, since 1 is the base value for any token
      // (the >1 check is needed because the VocabCache impl implies 1 as base word count, not 0)
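      // e.g. a word counted 5 times in the holder ends up as 1 + (5 - 1) = 5 in the cache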
      if (word.getCount() > 1) cache.incrementWordCount(word.getWord(), word.getCount() - 1);
    }

    // at this point it's safe to clear all vocab structures in the holder.
    if (emptyHolder) {
      idxMap.clear();
      vocabulary.clear();
    }
  }
  /**
   * All words with frequency below the threshold will be removed
   *
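   * <p>For example, assuming a populated {@code holder}, dropping every non-special word seen
   * fewer than 5 times:</p>
   * <pre>{@code
   * holder.truncateVocabulary(5);
   * }</pre>
   *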
   * @param threshold exclusive threshold: words with count strictly below this value are removed
   */
  public void truncateVocabulary(int threshold) {
    logger.debug("Truncating vocabulary to minWordFrequency: [" + threshold + "]");
    // iterate over a snapshot of the keys, since we remove entries from the map while iterating
    List<String> keys = new ArrayList<>(vocabulary.keySet());
    for (String word : keys) {
      VocabularyWord vw = vocabulary.get(word);

      // please note: we're not applying threshold to SPECIAL words
      if (!vw.isSpecial() && vw.getCount() < threshold) {
        vocabulary.remove(word);
        if (vw.getHuffmanNode() != null) idxMap.remove(vw.getHuffmanNode().getIdx());
      }
    }
  }