コード例 #1
0
 /**
  * Pulls the words of another holder into this one.
  *
  * <p>Each word returned by {@code holder.getVocabulary()} that this holder does not contain yet
  * is added via {@link #addWord}. For words this holder already contains, the counter is
  * incremented on the supplied {@code holder}.
  *
  * @param holder the holder whose vocabulary is consumed
  */
 public void consumeVocabulary(VocabularyHolder holder) {
   for (VocabularyWord candidate : holder.getVocabulary()) {
     final String label = candidate.getWord();
     if (this.containsWord(label)) {
       // NOTE(review): the counter is bumped on the source holder, not on this one -
       // confirm this is intended rather than merging the count into this holder.
       holder.incrementWordCounter(label);
       continue;
     }
     this.addWord(candidate);
   }
 }
コード例 #2
0
  /**
   * Compatibility bridge: copies the words held by this holder into the given VocabCache.
   *
   * <p>Words with empty text are skipped. Every other word is wrapped into a {@code VocabWord}
   * that is stored in both the vocabs and the tokens map of the cache. A historical gradient and
   * Huffman node data are carried over when the source word has them, and the cache word count is
   * raised by {@code count - 1} for words whose count is greater than 1.
   *
   * @param cache destination cache; only {@code InMemoryLookupCache} instances are accepted
   * @param emptyHolder if {@code true}, {@code idxMap} and {@code vocabulary} of this holder are
   *     cleared once the transfer has finished
   * @throws IllegalStateException if {@code cache} is not an {@code InMemoryLookupCache}
   */
  public void transferBackToVocabCache(VocabCache cache, boolean emptyHolder) {
    if (!(cache instanceof InMemoryLookupCache)) {
      throw new IllegalStateException("Sorry, only InMemoryLookupCache use implemented.");
    }
    final InMemoryLookupCache target = (InMemoryLookupCache) cache;

    // Huffman codes are taken as they are; updateHuffmanCodes() is not invoked here
    for (VocabularyWord source : words()) {
      final String label = source.getWord();
      if (label.isEmpty()) {
        continue;
      }

      // every token starts with a base count of 1 on the cache side
      final VocabWord converted = new VocabWord(1, label);

      // a fully transferred model may carry a HistoricalGradient for the AdaptiveGradient feature
      if (source.getHistoricalGradient() != null) {
        converted.setHistoricalGradient(Nd4j.create(source.getHistoricalGradient()));
      }

      // the same VocabWord instance is registered in both lookup maps
      target.getVocabs().put(label, converted);
      target.getTokens().put(label, converted);

      // copy Huffman tree information, if the word has any
      if (source.getHuffmanNode() != null) {
        final int codeLength = source.getHuffmanNode().getLength();
        converted.setIndex(source.getHuffmanNode().getIdx());
        converted.setCodeLength(codeLength);
        converted.setPoints(arrayToList(source.getHuffmanNode().getPoint(), codeLength));
        converted.setCodes(arrayToList(source.getHuffmanNode().getCode(), codeLength));

        // register the word under its Huffman index
        cache.addWordToIndex(source.getHuffmanNode().getIdx(), label);
      }

      // the cache already counts the token once, so only the surplus above 1 is added
      final int occurrences = source.getCount();
      if (occurrences > 1) {
        cache.incrementWordCount(label, occurrences - 1);
      }
    }

    // once everything has been copied, the holder's own storage may be dropped on request
    if (emptyHolder) {
      idxMap.clear();
      vocabulary.clear();
    }
  }
コード例 #3
0
  /**
   * Drops low-frequency words whose counters have stopped changing between scavenger runs.
   *
   * <p>Special words, words whose count reached {@code minWordFrequency}, and words without a
   * frequency history are left in the vocabulary and get their history reset to {@code null}. For
   * every other word the current count is written into its history at the word's retention step.
   * A word is removed when its count does not exceed the activation threshold (one fifth of
   * {@code minWordFrequency}, but at least 2) and the newest history slot is positive, within the
   * threshold, and equal to the oldest slot. Afterwards the retention step advances, or, once the
   * history is full, the history is shifted one slot to the left.
   */
  protected synchronized void activateScavenger() {
    final int sizeBefore = vocabulary.size();

    // work on a snapshot, because entries are removed from the live vocabulary inside the loop
    List<VocabularyWord> snapshot = new ArrayList<>(vocabulary.values());
    for (VocabularyWord candidate : snapshot) {
      // only non-special words below minWordFrequency that still carry a history are tracked
      boolean tracked =
          !candidate.isSpecial()
              && candidate.getCount() < minWordFrequency
              && candidate.getFrequencyShift() != null;
      if (!tracked) {
        candidate.setFrequencyShift(null);
        continue;
      }

      final int lastSlot = this.retentionDelay - 1;

      // record the current counter at the word's current retention position
      // NOTE(review): the count is narrowed to byte here, so large counts would wrap - confirm
      // that the thresholds in use keep tracked counts small.
      candidate.getFrequencyShift()[candidate.getRetentionStep()] = (byte) candidate.getCount();

      // personal threshold for a word: 20% of minWordFrequency, with a floor of 2
      int activation = Math.max(minWordFrequency / 5, 2);
      logger.debug(
          "Current state> Activation: ["
              + activation
              + "], retention info: "
              + Arrays.toString(candidate.getFrequencyShift()));

      if (candidate.getCount() <= activation) {
        int newest = candidate.getFrequencyShift()[lastSlot];
        // the history is full (newest slot set), still under the threshold, and the count at the
        // newest retention point equals the one at the oldest point: the word did not grow
        boolean stagnant =
            newest > 0 && newest <= activation && newest == candidate.getFrequencyShift()[0];
        if (stagnant) {
          vocabulary.remove(candidate.getWord());
        }
      }

      if (candidate.getRetentionStep() >= lastSlot) {
        // history is full: move every slot one position to the left
        for (int slot = 0; slot < lastSlot; slot++) {
          candidate.getFrequencyShift()[slot] = candidate.getFrequencyShift()[slot + 1];
        }
      } else {
        candidate.incrementRetentionStep();
      }
    }

    logger.info(
        "Scavenger was activated. Vocab size before: ["
            + sizeBefore
            + "],  after: ["
            + vocabulary.size()
            + "]");
  }
コード例 #4
0
  /**
   * Builds VocabularyHolder from VocabCache.
   *
   * <p>Basically we just ignore tokens, and transfer VocabularyWords, supposing that it's already
   * truncated by minWordFrequency.
   *
   * <p>Huffman tree data is ignored and recalculated, due to suspectable flaw in dl4j huffman impl,
   * and it's exsessive memory usage.
   *
   * <p>This code is required for compatibility between dl4j w2v implementation, and standalone w2v
   *
   * @param cache
   */
  protected VocabularyHolder(@NonNull VocabCache cache, boolean markAsSpecial) {
    this.vocabCache = cache;
    for (VocabWord word : cache.tokens()) {
      VocabularyWord vw = new VocabularyWord(word.getWord());
      vw.setCount((int) word.getWordFrequency());

      // since we're importing this word from external VocabCache, we'll assume that this word is
      // SPECIAL, and should NOT be affected by minWordFrequency
      vw.setSpecial(markAsSpecial);

      // please note: we don't transfer huffman data, since proper way is  to recalculate it after
      // new words being added
      if (word.getPoints() != null && !word.getPoints().isEmpty()) {
        vw.setHuffmanNode(
            buildNode(word.getCodes(), word.getPoints(), word.getCodeLength(), word.getIndex()));
      }

      vocabulary.put(vw.getWord(), vw);
    }

    // there's no sense building huffman tree just for UNK word
    if (numWords() > 1) updateHuffmanCodes();
    logger.info("Init from VocabCache is complete. " + numWords() + " word(s) were transferred.");
  }
コード例 #5
0
 /**
  * Stores the given word in this holder's vocabulary, keyed by the word's text.
  *
  * @param word the word to store; the value of {@code word.getWord()} is used as the key
  */
 public void addWord(VocabularyWord word) {
   final String key = word.getWord();
   vocabulary.put(key, word);
 }