/**
 * Adds the given word to the vocabulary, provided it has already been registered as a token.
 *
 * @param word the word to add; must be non-null, non-empty, and already present as a token
 */
@Override
public synchronized void putVocabWord(String word) {
    if (word == null || word.isEmpty())
        throw new IllegalArgumentException("Word can't be empty or null");

    // STOP and UNK are not added as tokens
    if (word.equals("STOP") || word.equals("UNK"))
        return;

    VocabWord token = tokenFor(word);
    if (token == null)
        throw new IllegalStateException("Word " + word + " not found as token in vocab");

    int ind = token.getIndex();
    addWordToIndex(ind, word);
    if (!hasToken(word))
        throw new IllegalStateException("Unable to add token " + word + " when not already a token");

    vocabs.put(word, token);
    wordIndex.add(word, token.getIndex());
}
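// Hedged usage sketch (an assumption, not part of the original source): putVocabWord()
// refuses words that were never registered as tokens, so the expected call order is
// addToken() first, putVocabWord() second. InMemoryLookupCache and the VocabWord
// constructor below reflect the legacy dl4j API as we understand it.
private static void putVocabWordExample() {
    VocabCache cache = new InMemoryLookupCache();
    cache.addToken(new VocabWord(1.0, "day")); // register the token first...
    cache.putVocabWord("day"); // ...otherwise an IllegalStateException is thrown
}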
private Pair<INDArray, Double> update(
        AdaGrad weightAdaGrad,
        AdaGrad biasAdaGrad,
        INDArray syn0,
        INDArray bias,
        VocabWord w1,
        INDArray wordVector,
        INDArray contextVector,
        double gradient) {
    // gradient for word vectors
    INDArray grad1 = contextVector.mul(gradient);
    // AdaGrad-adjusted step for w1's row of syn0
    INDArray update = weightAdaGrad.getGradient(grad1, w1.getIndex(), syn0.shape());

    // AdaGrad-adjusted step for w1's bias, applied immediately to yield the new bias value
    double w1Bias = bias.getDouble(w1.getIndex());
    double biasGradient = biasAdaGrad.getGradient(gradient, w1.getIndex(), bias.shape());
    double update2 = w1Bias - biasGradient;

    return new Pair<>(update, update2);
}
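// Hedged usage sketch (an assumption, not part of the original source): the Pair
// returned by update() carries the AdaGrad-adjusted step for the word vector and the
// already-updated bias value. A caller could apply them as below; applyUpdate() and its
// parameter names are hypothetical.
private static void applyUpdate(INDArray syn0, INDArray bias, VocabWord w1, Pair<INDArray, Double> result) {
    // subtract the weight step from w1's row of syn0
    syn0.slice(w1.getIndex()).subi(result.getFirst());
    // the bias was already adjusted inside update(), so just store it
    bias.putScalar(w1.getIndex(), result.getSecond());
}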
/**
 * Builds VocabularyHolder from VocabCache.
 *
 * <p>Basically we just ignore tokens and transfer VocabularyWords, assuming the cache has already
 * been truncated by minWordFrequency.
 *
 * <p>Huffman tree data is ignored and recalculated, due to a suspected flaw in the dl4j Huffman
 * implementation and its excessive memory usage.
 *
 * <p>This code is required for compatibility between the dl4j w2v implementation and standalone w2v.
 *
 * @param cache the VocabCache to import words from
 * @param markAsSpecial whether imported words should be flagged SPECIAL, exempting them from
 *     minWordFrequency truncation
 */
protected VocabularyHolder(@NonNull VocabCache cache, boolean markAsSpecial) {
    this.vocabCache = cache;
    for (VocabWord word : cache.tokens()) {
        VocabularyWord vw = new VocabularyWord(word.getWord());
        vw.setCount((int) word.getWordFrequency());

        // words imported from an external VocabCache can be marked SPECIAL, so they are
        // NOT affected by minWordFrequency
        vw.setSpecial(markAsSpecial);

        // please note: the huffman data copied here is only provisional, since the proper way is
        // to recalculate it after new words are added; see updateHuffmanCodes() below
        if (word.getPoints() != null && !word.getPoints().isEmpty()) {
            vw.setHuffmanNode(
                buildNode(word.getCodes(), word.getPoints(), word.getCodeLength(), word.getIndex()));
        }

        vocabulary.put(vw.getWord(), vw);
    }

    // there's no sense building a huffman tree just for the UNK word
    if (numWords() > 1)
        updateHuffmanCodes();

    logger.info("Init from VocabCache is complete. " + numWords() + " word(s) were transferred.");
}
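// Hedged usage sketch (an assumption, not part of the original source): since the
// constructor is protected, this hypothetical helper assumes a same-package or subclass
// caller. Passing markAsSpecial = true shields imported words from later
// minWordFrequency truncation, as described in the javadoc above.
private static VocabularyHolder importFromCache(VocabCache cache) {
    // Huffman codes are rebuilt by the constructor via updateHuffmanCodes()
    return new VocabularyHolder(cache, true);
}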
public void iterateSample(VocabWord w1, VocabWord w2, double currentSentenceAlpha) {
    if (w1 == null || w2 == null || w2.getIndex() < 0 || w2.getIndex() == w1.getIndex())
        return;

    final int currentWordIndex = w2.getIndex();

    // accumulated error for the current word and context
    INDArray neu1e = Nd4j.create(vectorLength);

    // on the first iteration Syn0 is random numbers
    INDArray l1;
    if (indexSyn0VecMap.containsKey(vocab.elementAtIndex(currentWordIndex))) {
        l1 = indexSyn0VecMap.get(vocab.elementAtIndex(currentWordIndex));
    } else {
        l1 = getRandomSyn0Vec(vectorLength, (long) currentWordIndex);
    }

    // hierarchical softmax: walk w1's Huffman path
    for (int i = 0; i < w1.getCodeLength(); i++) {
        int code = w1.getCodes().get(i);
        int point = w1.getPoints().get(i);
        if (point < 0)
            throw new IllegalStateException("Illegal point " + point);

        // Syn1 vector for the inner node identified by this point
        INDArray syn1;
        if (pointSyn1VecMap.containsKey(point)) {
            syn1 = pointSyn1VecMap.get(point);
        } else {
            syn1 = Nd4j.zeros(1, vectorLength); // 1 row of vector length of zeros
            pointSyn1VecMap.put(point, syn1);
        }

        // Dot product of Syn0 and Syn1 vecs
        double dot = Nd4j.getBlasWrapper().level1().dot(vectorLength, 1.0, l1, syn1);

        if (dot < -maxExp || dot >= maxExp)
            continue;

        int idx = (int) ((dot + maxExp) * ((double) expTable.length / maxExp / 2.0));
        if (idx >= expTable.length)
            continue;

        // score
        double f = expTable[idx];
        // gradient
        double g = (1 - code - f)
            * (useAdaGrad ? w1.getGradient(i, currentSentenceAlpha, currentSentenceAlpha) : currentSentenceAlpha);

        Nd4j.getBlasWrapper().level1().axpy(vectorLength, g, syn1, neu1e);
        Nd4j.getBlasWrapper().level1().axpy(vectorLength, g, l1, syn1);
    }

    int target = w1.getIndex();
    int label;

    // negative sampling
    if (negative > 0)
        for (int d = 0; d < negative + 1; d++) {
            if (d == 0)
                label = 1;
            else {
                nextRandom.set(nextRandom.get() * 25214903917L + 11);
                int idx = Math.abs((int) (nextRandom.get() >> 16) % negativeHolder.getTable().length());

                target = negativeHolder.getTable().getInt(idx);
                if (target <= 0)
                    target = (int) nextRandom.get() % (vocab.numWords() - 1) + 1;

                if (target == w1.getIndex())
                    continue;
                label = 0;
            }

            if (target >= negativeHolder.getSyn1Neg().rows() || target < 0)
                continue;

            double f = Nd4j.getBlasWrapper().dot(l1, negativeHolder.getSyn1Neg().slice(target));
            double g;
            if (f > maxExp)
                g = useAdaGrad ? w1.getGradient(target, (label - 1), alpha) : (label - 1) * alpha;
            else if (f < -maxExp)
                g = label * (useAdaGrad ? w1.getGradient(target, alpha, alpha) : alpha);
            else {
                int idx = (int) ((f + maxExp) * (expTable.length / maxExp / 2));
                if (idx >= expTable.length)
                    continue;

                g = useAdaGrad
                    ? w1.getGradient(target, label - expTable[idx], alpha)
                    : (label - expTable[idx]) * alpha;
            }

            Nd4j.getBlasWrapper().axpy((float) g, negativeHolder.getSyn1Neg().slice(target), neu1e);
            Nd4j.getBlasWrapper().axpy((float) g, l1, negativeHolder.getSyn1Neg().slice(target));
        }

    // update the Syn0 vector with the accumulated gradient; Syn0 is no longer random
    Nd4j.getBlasWrapper().level1().axpy(vectorLength, 1.0f, neu1e, l1);

    // lazily initialize the environment/affinity identifiers once
    if (aff.get() == 0) {
        synchronized (this) {
            cid.set(EnvironmentUtils.buildCId());
            aff.set(EnvironmentUtils.buildEnvironment().getAvailableMemory());
        }
    }

    VocabWord word = vocab.elementAtIndex(currentWordIndex);
    word.setVocabId(cid.get());
    word.setAffinityId(aff.get());
    indexSyn0VecMap.put(word, l1);
}
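// Hedged sketch (an assumption about context, not part of the original source): the
// expTable consulted by iterateSample() is the standard word2vec sigmoid lookup table,
// precomputed so the hot loop never calls Math.exp(). Table size and maxExp are
// illustrative; the reference word2vec implementation uses 1000 entries with maxExp = 6.
private static double[] buildExpTable(int tableSize, double maxExp) {
    double[] expTable = new double[tableSize];
    for (int i = 0; i < tableSize; i++) {
        // map slot i to x in (-maxExp, maxExp), then store sigmoid(x) = e^x / (e^x + 1)
        double x = ((double) i / tableSize * 2.0 - 1.0) * maxExp;
        double e = Math.exp(x);
        expTable[i] = e / (e + 1.0);
    }
    return expTable;
}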