  /**
   * @param index the position to record for the word
   * @param word the word to register
   */
  public synchronized void addWordToIndex(int index, String word) {
    if (word == null || word.isEmpty())
      throw new IllegalArgumentException("Word can't be empty or null");

    if (!tokens.containsKey(word)) {
      VocabWord token = new VocabWord(1.0, word);
      tokens.put(word, token);
      wordFrequencies.incrementCount(word, 1.0);

       If we're speaking about adding any word to index directly, it means it's going to be vocab word, not token
    if (!vocabs.containsKey(word)) {
      VocabWord vw = tokenFor(word);
      vocabs.put(word, vw);

    if (!wordFrequencies.containsKey(word)) wordFrequencies.incrementCount(word, 1);

    wordIndex.add(word, index);
 public void updateWordsOccurencies() {
   for (VocabWord word : vocabWords()) {
     totalWordOccurrences.addAndGet((long) word.getElementFrequency());
 /**
  * Creates a lookup cache, optionally pre-registering the unknown-word marker.
  *
  * @param addUnk when {@code true}, {@code Word2Vec.UNK} is registered at index 0
  */
 public InMemoryLookupCache(boolean addUnk) {
   if (addUnk) {
     addWordToIndex(0, Word2Vec.UNK);
   }
 }
  /**
   * Increment the count for the given word by the amount increment
   *
   * @param word the word to increment the count for
   * @param increment the amount to increment by
   */
  public synchronized void incrementWordCount(String word, int increment) {
    if (word == null || word.isEmpty())
      throw new IllegalArgumentException("Word can't be empty or null");
    wordFrequencies.incrementCount(word, increment);

    if (hasToken(word)) {
      VocabWord token = tokenFor(word);
    totalWordOccurrences.set(totalWordOccurrences.get() + increment);
 /** @param word */
 public synchronized void putVocabWord(String word) {
   if (word == null || word.isEmpty())
     throw new IllegalArgumentException("Word can't be empty or null");
   // STOP and UNK are not added as tokens
   if (word.equals("STOP") || word.equals("UNK")) return;
   VocabWord token = tokenFor(word);
   if (token == null)
     throw new IllegalStateException("Word " + word + " not found as token in vocab");
   int ind = token.getIndex();
   addWordToIndex(ind, word);
   if (!hasToken(word))
     throw new IllegalStateException("Unable to add token " + word + " when not already a token");
   vocabs.put(word, token);
   wordIndex.add(word, token.getIndex());
// Example no. 6
  private Pair<INDArray, Double> update(
      AdaGrad weightAdaGrad,
      AdaGrad biasAdaGrad,
      INDArray syn0,
      INDArray bias,
      VocabWord w1,
      INDArray wordVector,
      INDArray contextVector,
      double gradient) {
    // gradient for word vectors
    INDArray grad1 = contextVector.mul(gradient);
    INDArray update = weightAdaGrad.getGradient(grad1, w1.getIndex(), syn0.shape());

    double w1Bias = bias.getDouble(w1.getIndex());
    double biasGradient = biasAdaGrad.getGradient(gradient, w1.getIndex(), bias.shape());
    double update2 = w1Bias - biasGradient;
    return new Pair<>(update, update2);
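  /**
   * Loads an in memory cache from the given path (sets syn0 and the vocab)
   *
   * @param vectorsFile the path of the file to load
   * @return the lookup table paired with the vocab cache built while reading
   * @throws FileNotFoundException if the file cannot be opened for reading
   */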
  // Reads a text file with one "word v1 v2 ..." record per line and returns the resulting
  // lookup table together with the vocab cache.
  public static Pair<InMemoryLookupTable, VocabCache> loadTxt(File vectorsFile)
      throws FileNotFoundException {
    // NOTE(review): despite its name, `write` is the reader over the input file. It is not
    // closed anywhere in the visible code - confirm it is released on all paths.
    BufferedReader write = new BufferedReader(new FileReader(vectorsFile));
    VocabCache cache = new InMemoryLookupCache();

    InMemoryLookupTable lookupTable;

    LineIterator iter = IOUtils.lineIterator(write);
    List<INDArray> arrays = new ArrayList<>();
    while (iter.hasNext()) {
      String line = iter.nextLine();
      // Fields are separated by single spaces: the first is the word, the rest are floats.
      String[] split = line.split(" ");
      String word = split[0];
      // NOTE(review): word1 is created but not used in the visible code.
      VocabWord word1 = new VocabWord(1.0, word);
      // The word is indexed at the cache's current word count.
      cache.addWordToIndex(cache.numWords(), word);
      INDArray row = Nd4j.create(Nd4j.createBuffer(split.length - 1));
      for (int i = 1; i < split.length; i++) {
        row.putScalar(i - 1, Float.parseFloat(split[i]));

    // NOTE(review): the closing braces of the two loops above and any statement that stores
    // `row` in `arrays` are not visible here. As shown, `arrays` stays empty and
    // arrays.get(0) below would fail - confirm against the full file.
    // One matrix row per parsed line, as wide as the first parsed vector.
    INDArray syn = Nd4j.create(new int[] {arrays.size(), arrays.get(0).columns()});
    for (int i = 0; i < syn.rows(); i++) {
      syn.putRow(i, arrays.get(i));

    // NOTE(review): the builder expression below is cut off (no configuration calls, no
    // build(), no terminating semicolon) and `syn` is never attached to the table in the
    // visible code - restore the missing lines before relying on this method.
    lookupTable =
            new InMemoryLookupTable.Builder()


    return new Pair<>(lookupTable, cache);
// Example no. 8
  /**
   * Resolves {@code stringToken} to a {@code VocabWord} (reusing the cache's token when one
   * exists, otherwise creating one with a count of 1) and then handles the case where the
   * cache does not contain the word yet.
   *
   * <p>NOTE(review): the visible body ends right after {@code idx} is computed; the statements
   * that use {@code actualToken} and {@code idx}, and the closing braces, are not shown.
   *
   * @param vocab the cache to look the token up in
   * @param stringToken the raw token text
   */
  private static void addTokenToVocabCache(InMemoryLookupCache vocab, String stringToken) {
    // Making string token into actual token if not already an actual token (vocabWord)
    VocabWord actualToken;
    if (vocab.hasToken(stringToken)) {
      actualToken = vocab.tokenFor(stringToken);
    } else {
      actualToken = new VocabWord(1, stringToken);

    // Set the index of the actual token (vocabWord)
    // Put vocabWord into vocabs in InMemoryVocabCache
    boolean vocabContainsWord = vocab.containsWord(stringToken);
    if (!vocabContainsWord) {
      // The next free index is the cache's current word count.
      int idx = vocab.numWords();
  /**
   * Load a look up cache from an input stream delimited by \n
   *
   * @param from the input stream to read from
   * @return the in memory lookup cache
   */
  public static InMemoryLookupCache load(InputStream from) {
    Reader inputStream = new InputStreamReader(from);
    LineIterator iter = IOUtils.lineIterator(inputStream);
    String line;
    InMemoryLookupCache ret = new InMemoryLookupCache();
    int count = 0;
    while ((iter.hasNext())) {
      line = iter.nextLine();
      if (line.isEmpty()) continue;
      VocabWord word = new VocabWord(1.0, line);
      ret.addWordToIndex(count, line);

    return ret;
 public void importVocabulary(VocabCache<VocabWord> vocabCache) {
   for (VocabWord word : vocabCache.vocabWords()) {
     if (vocabs.containsKey(word.getLabel())) {
       wordFrequencies.incrementCount(word.getLabel(), word.getElementFrequency());
     } else {
       tokens.put(word.getLabel(), word);
       vocabs.put(word.getLabel(), word);
       wordFrequencies.incrementCount(word.getLabel(), word.getElementFrequency());
     totalWordOccurrences.addAndGet((long) word.getElementFrequency());
// Example no. 11
  /**
   * Resolves {@code stringToken} to a {@code VocabWord} (reusing the token held by
   * {@code vocabCache} when one exists, otherwise creating one with {@code tokenCount}) and
   * then handles the case where the cache does not contain the word yet.
   *
   * <p>NOTE(review): the visible body ends right after {@code idx} is computed; the statements
   * that use {@code actualToken} and {@code idx}, and the closing braces, are not shown.
   *
   * @param stringToken the raw token text
   * @param tokenCount the count given to a newly created token; it is not applied when the
   *     token already exists (in the visible code)
   */
  private void addTokenToVocabCache(String stringToken, Double tokenCount) {
    // Making string token into actual token if not already an actual token (vocabWord)
    VocabWord actualToken;
    if (vocabCache.hasToken(stringToken)) {
      actualToken = vocabCache.tokenFor(stringToken);
    } else {
      actualToken = new VocabWord(tokenCount, stringToken);

    // Set the index of the actual token (vocabWord)
    // Put vocabWord into vocabs in InMemoryVocabCache
    boolean vocabContainsWord = vocabCache.containsWord(stringToken);
    if (!vocabContainsWord) {

      // The next free index is the cache's current word count.
      int idx = vocabCache.numWords();
  /**
   * Builds VocabularyHolder from VocabCache.
   *
   * <p>Basically we just ignore tokens, and transfer VocabularyWords, supposing that it's already
   * truncated by minWordFrequency.
   *
   * <p>Huffman tree data is ignored and recalculated, due to a suspected flaw in the dl4j Huffman
   * implementation and its excessive memory usage.
   *
   * <p>This code is required for compatibility between the dl4j w2v implementation and the
   * standalone w2v.
   *
   * @param cache the vocab cache to import words from
   */
  // Copies every token of the given cache into this holder's vocabulary map and, when more
  // than one word ends up in the holder, rebuilds the Huffman codes.
  protected VocabularyHolder(@NonNull VocabCache cache, boolean markAsSpecial) {
    this.vocabCache = cache;
    for (VocabWord word : cache.tokens()) {
      VocabularyWord vw = new VocabularyWord(word.getWord());
      // The fractional frequency is truncated to int.
      vw.setCount((int) word.getWordFrequency());

      // since we're importing this word from external VocabCache, we'll assume that this word is
      // SPECIAL, and should NOT be affected by minWordFrequency
      // NOTE(review): no statement in the visible code reads `markAsSpecial` - the line that
      // applied it appears to be missing.

      // please note: we don't transfer huffman data, since proper way is  to recalculate it after
      // new words being added
      // NOTE(review): the statement inside this `if` is cut off - only the tail of a call that
      // wraps buildNode(...) is visible, and the closing braces are missing.
      if (word.getPoints() != null && !word.getPoints().isEmpty()) {
            buildNode(word.getCodes(), word.getPoints(), word.getCodeLength(), word.getIndex()));

      vocabulary.put(vw.getWord(), vw);

    // there's no sense building huffman tree just for UNK word
    if (numWords() > 1) updateHuffmanCodes();
    logger.info("Init from VocabCache is complete. " + numWords() + " word(s) were transferred.");
  /**
   * This method is required for compatibility purposes. It just transfers vocabulary from
   * VocabHolder into VocabCache
   *
   * @param cache the target cache; must be an InMemoryLookupCache
   */
  // Copies every non-empty word of this holder into the given cache (tokens, vocabs, index and
  // counts). Throws IllegalStateException when the cache is not an InMemoryLookupCache.
  public void transferBackToVocabCache(VocabCache cache, boolean emptyHolder) {
    if (!(cache instanceof InMemoryLookupCache))
      throw new IllegalStateException("Sorry, only InMemoryLookupCache use implemented.");

    // make sure that huffman codes are updated before transfer
    // NOTE(review): the updateHuffmanCodes() call is commented out, so only words() runs here;
    // whether words() refreshes the codes itself cannot be seen from this block.
    List<VocabularyWord> words = words(); // updateHuffmanCodes();

    for (VocabularyWord word : words) {
      if (word.getWord().isEmpty()) continue;
      VocabWord vocabWord = new VocabWord(1, word.getWord());

      // if we're transferring full model, it CAN contain HistoricalGradient for AdaptiveGradient
      // feature
      // NOTE(review): `gradient` is created but not used in the visible code, and the closing
      // brace of this `if` is missing - the line that attached it to vocabWord seems dropped.
      if (word.getHistoricalGradient() != null) {
        INDArray gradient = Nd4j.create(word.getHistoricalGradient());

      // put VocabWord into both Tokens and Vocabs maps
      ((InMemoryLookupCache) cache).getVocabs().put(word.getWord(), vocabWord);
      ((InMemoryLookupCache) cache).getTokens().put(word.getWord(), vocabWord);

      // update Huffman tree information
      // NOTE(review): the two arrayToList(...) lines below are the tails of truncated calls;
      // the receivers that consumed the point and code lists are not visible.
      if (word.getHuffmanNode() != null) {
            arrayToList(word.getHuffmanNode().getPoint(), word.getHuffmanNode().getLength()));
            arrayToList(word.getHuffmanNode().getCode(), word.getHuffmanNode().getLength()));

        // put word into index
        cache.addWordToIndex(word.getHuffmanNode().getIdx(), word.getWord());

      // update vocabWord counter. subtract 1, since it's the base value for any token
      // >1 hack is required since VocabCache impl implies 1 as base word count, not 0
      if (word.getCount() > 1) cache.incrementWordCount(word.getWord(), word.getCount() - 1);

    // at this moment it's pretty safe to nullify all vocabs.
    // NOTE(review): the body of this `if` and all closing braces are not visible.
    if (emptyHolder) {
 public synchronized void addToken(VocabWord word) {
   tokens.put(word.getWord(), word);
  /**
   * Runs one update for the word pair ({@code w1}, {@code w2}).
   *
   * <p>The vector for {@code w2} is taken from {@code indexSyn0VecMap} or freshly generated.
   * The method then walks the codes/points of {@code w1}, updating the per-point vectors in
   * {@code pointSyn1VecMap}, and (when {@code negative > 0}) runs the negative-sampling loop
   * against {@code negativeHolder}. Finally the accumulated error is added to the {@code w2}
   * vector, which is stored back in {@code indexSyn0VecMap}.
   *
   * <p>NOTE(review): several closing braces and at least two statements (see notes in the
   * body) are missing from the visible code.
   *
   * @param w1 the word whose codes, points and index drive the updates
   * @param w2 the word whose vector is read and updated; the call is a no-op when either word
   *     is null, when the index of {@code w2} is negative, or when both share an index
   * @param currentSentenceAlpha learning rate used in the codes/points loop
   */
  public void iterateSample(VocabWord w1, VocabWord w2, double currentSentenceAlpha) {

    if (w1 == null || w2 == null || w2.getIndex() < 0 || w2.getIndex() == w1.getIndex()) return;
    final int currentWordIndex = w2.getIndex();

    // error for current word and context
    INDArray neu1e = Nd4j.create(vectorLength);

    // First iteration Syn0 is random numbers
    INDArray l1 = null;
    if (indexSyn0VecMap.containsKey(vocab.elementAtIndex(currentWordIndex))) {
      l1 = indexSyn0VecMap.get(vocab.elementAtIndex(currentWordIndex));
    } else {
      l1 = getRandomSyn0Vec(vectorLength, (long) currentWordIndex);

    for (int i = 0; i < w1.getCodeLength(); i++) {
      int code = w1.getCodes().get(i);
      int point = w1.getPoints().get(i);
      if (point < 0) throw new IllegalStateException("Illegal point " + point);
      // Point to
      // Vectors for points are created lazily as all-zero rows on first use.
      INDArray syn1;
      if (pointSyn1VecMap.containsKey(point)) {
        syn1 = pointSyn1VecMap.get(point);
      } else {
        syn1 = Nd4j.zeros(1, vectorLength); // 1 row of vector length of zeros
        pointSyn1VecMap.put(point, syn1);

      // Dot product of Syn0 and Syn1 vecs
      double dot = Nd4j.getBlasWrapper().level1().dot(vectorLength, 1.0, l1, syn1);

      // Values outside [-maxExp, maxExp) are skipped rather than clamped.
      if (dot < -maxExp || dot >= maxExp) continue;

      int idx = (int) ((dot + maxExp) * ((double) expTable.length / maxExp / 2.0));

      // NOTE(review): this guard uses `>` while the lookup in the negative-sampling branch
      // uses `>=`; idx == expTable.length would pass here and overflow the array below.
      if (idx > expTable.length) continue;

      // score
      double f = expTable[idx];
      // gradient
      double g =
          (1 - code - f)
              * (useAdaGrad
                  ? w1.getGradient(i, currentSentenceAlpha, currentSentenceAlpha)
                  : currentSentenceAlpha);

      // neu1e += g * syn1, then syn1 += g * l1
      Nd4j.getBlasWrapper().level1().axpy(vectorLength, g, syn1, neu1e);
      Nd4j.getBlasWrapper().level1().axpy(vectorLength, g, l1, syn1);

    int target = w1.getIndex();
    int label;
    // negative sampling
    if (negative > 0)
      // d == 0 is the positive example (label 1); the remaining iterations draw negatives.
      for (int d = 0; d < negative + 1; d++) {
        if (d == 0) label = 1;
        else {
          // Linear congruential step (same multiplier and increment as java.util.Random).
          nextRandom.set(nextRandom.get() * 25214903917L + 11);
          int idx = Math.abs((int) (nextRandom.get() >> 16) % negativeHolder.getTable().length());

          target = negativeHolder.getTable().getInt(idx);
          // NOTE(review): the (int) cast applies before `%`, so this fallback can be negative;
          // negative targets are skipped by the range check further down.
          if (target <= 0) target = (int) nextRandom.get() % (vocab.numWords() - 1) + 1;

          if (target == w1.getIndex()) continue;
          label = 0;

        if (target >= negativeHolder.getSyn1Neg().rows() || target < 0) continue;

        double f = Nd4j.getBlasWrapper().dot(l1, negativeHolder.getSyn1Neg().slice(target));
        double g;
        // NOTE(review): this branch uses `alpha`, not the `currentSentenceAlpha` parameter used
        // in the loop above - confirm that is intended.
        if (f > maxExp)
          g = useAdaGrad ? w1.getGradient(target, (label - 1), alpha) : (label - 1) * alpha;
        else if (f < -maxExp)
          g = label * (useAdaGrad ? w1.getGradient(target, alpha, alpha) : alpha);
        else {
          // NOTE(review): unlike the earlier table lookup there is no (double) cast here; if
          // maxExp is an integral type this is integer division - confirm.
          int idx = (int) ((f + maxExp) * (expTable.length / maxExp / 2));
          if (idx >= expTable.length) continue;

          // NOTE(review): the condition of this conditional expression is missing from the
          // visible code (the sibling branches test `useAdaGrad`).
          g =
                  ? w1.getGradient(target, label - expTable[idx], alpha)
                  : (label - expTable[idx]) * alpha;

        // neu1e += g * syn1Neg[target], then syn1Neg[target] += g * l1
        Nd4j.getBlasWrapper().axpy((float) g, negativeHolder.getSyn1Neg().slice(target), neu1e);

        Nd4j.getBlasWrapper().axpy((float) g, l1, negativeHolder.getSyn1Neg().slice(target));

    // Updated the Syn0 vector based on gradient. Syn0 is not random anymore.
    Nd4j.getBlasWrapper().level1().axpy(vectorLength, 1.0f, neu1e, l1);

    // NOTE(review): the body of this synchronized block is not visible.
    if (aff.get() == 0) {
      synchronized (this) {

    VocabWord word = vocab.elementAtIndex(currentWordIndex);
    indexSyn0VecMap.put(word, l1);
 public void removeElement(VocabWord element) {