コード例 #1
0
 /**
  * Resets the per-word training state of every word returned by {@code getVocabulary()}.
  *
  * <p>For each word the Huffman node and the frequency-shift history are set to {@code null}
  * and the occurrence counter is set back to zero. No word is added to or removed from the
  * vocabulary by this method.
  */
 public void resetWordCounters() {
   for (final VocabularyWord entry : getVocabulary()) {
     // drop derived state first, then zero the raw counter
     entry.setHuffmanNode(null);
     entry.setFrequencyShift(null);
     entry.setCount(0);
   }
 }
コード例 #2
0
  /**
   * Removes low-frequency words based on how their counters changed between consecutive
   * scavenger activations.
   *
   * <p>Only non-special words whose count is below {@code minWordFrequency} and that still carry
   * a frequency-shift history are examined; every other word gets its history set to
   * {@code null} and is skipped. For an examined word the current count is written into its
   * history, and the word is removed from the vocabulary when its count is at or below the
   * activation threshold and the last history slot holds the same positive value as the first
   * one, i.e. the recorded count did not change across the retention window.
   *
   * <p>The loop runs over a copy of the vocabulary values, so removing entries from the
   * vocabulary does not disturb the iteration.
   */
  protected synchronized void activateScavenger() {
    int initialSize = vocabulary.size();
    List<VocabularyWord> words = new ArrayList<>(vocabulary.values());

    /*
          We are hunting only low-frequency words that have already survived a few activations,
          so the per-word threshold is 20% of minWordFrequency, but never less than 2.

          If after a few scavenging cycles the word count is still <= activation and unchanged,
          the word is removed. Words that reach minWordFrequency get their frequencyShift
          nullified at the top of the loop, which excludes them from further checks.
    */
    // Loop-invariant values, computed once instead of once per word.
    int activation = Math.max(minWordFrequency / 5, 2);
    int lastSlot = this.retentionDelay - 1;
    boolean debugEnabled = logger.isDebugEnabled();

    for (VocabularyWord word : words) {
      byte[] shift = word.getFrequencyShift();

      // scavenging applies only to non-special tokens that are below minWordFrequency and
      // still have a retention history
      if (word.isSpecial() || word.getCount() >= minWordFrequency || shift == null) {
        word.setFrequencyShift(null);
        continue;
      }

      // save current word counter to the history array at the current retention position
      // NOTE(review): the narrowing cast wraps for counts above 127, which can only matter
      // when minWordFrequency itself is larger than that - confirm whether this is intended.
      shift[word.getRetentionStep()] = (byte) word.getCount();

      // Build the (comparatively expensive) debug message only when it will be emitted.
      if (debugEnabled) {
        logger.debug(
            "Current state> Activation: ["
                + activation
                + "], retention info: "
                + Arrays.toString(shift));
      }

      // Remove the word if the count at the last retention point is positive, within the
      // threshold, and identical to the one at the beginning of the window. A zero in the
      // last slot fails the "> 0" test, so no removal happens for it.
      if (word.getCount() <= activation
          && shift[lastSlot] > 0
          && shift[lastSlot] <= activation
          && shift[lastSlot] == shift[0]) {
        vocabulary.remove(word.getWord());
      }

      // advance within the window until it is full; afterwards shift the history to the left
      // so that the last slot can take the next sample
      if (word.getRetentionStep() < lastSlot) {
        word.incrementRetentionStep();
      } else {
        System.arraycopy(shift, 1, shift, 0, lastSlot);
      }
    }

    logger.info(
        "Scavenger was activated. Vocab size before: ["
            + initialSize
            + "],  after: ["
            + vocabulary.size()
            + "]");
  }
コード例 #3
0
  /**
   * Adds a word to the vocabulary if it is not there yet; a word that is already present is
   * left untouched.
   *
   * <p>When {@code hugeModelExpected} is set, the new entry receives a zeroed frequency-shift
   * history of {@code retentionDelay} bytes. If additionally {@code minWordFrequency > 1}, the
   * hidden-words counter is incremented and the scavenger is activated each time that counter
   * is a multiple of {@code scavengerThreshold}.
   *
   * <p>This method performs no synchronization of its own.
   * TODO: investigate whether it is worth making this internally synchronized and virtually
   * thread-safe.
   *
   * @param word the token to register
   */
  public void addWord(String word) {
    // Guard clause: nothing to do for known words.
    if (vocabulary.containsKey(word)) {
      return;
    }

    VocabularyWord entry = new VocabularyWord(word);

    /*
      TODO: marking the entry as special (setSpecial(markAsSpecial)) is intentionally disabled
      here, since it makes minWordFrequency be ignored entirely whenever markAsSpecial is TRUE.

      Probably the best way to solve it is to remove the markAsSpecial option here and let this
      be regulated by minWordFrequency.
    */

    // The retention history is only useful when a huge model is expected.
    if (hugeModelExpected) {
      entry.setFrequencyShift(new byte[retentionDelay]);
    }

    vocabulary.put(word, entry);

    // The counter is touched only when scavenging can apply at all, so the short-circuit
    // order of the checks matters.
    boolean scavengingApplicable = hugeModelExpected && minWordFrequency > 1;
    if (scavengingApplicable
        && hiddenWordsCounter.incrementAndGet() % scavengerThreshold == 0) {
      activateScavenger();
    }
  }