@Override
public GradientUpdater getUpdater() {
    // average the accumulated state across the aggregated updaters
    AdaGrad adaGrad = new AdaGrad(lrSum / count);
    adaGrad.setHistoricalGradient(historicalGradientSum.div(count));
    adaGrad.setNumIterations((int) (numIterationsSum / count));
    return adaGrad;
}
@Test
public void testAdaGrad1() {
    int rows = 1;
    int cols = 1;

    AdaGrad grad = new AdaGrad(rows, cols, 1e-3);
    INDArray W = Nd4j.ones(rows, cols);
    assertEquals(1e-1, grad.getGradient(W, 0).getDouble(0), 1e-1);
}
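/*
 * Illustrative sketch only (not from the original source): a minimal AdaGrad
 * update loop built from the calls exercised in the tests above (the
 * constructor, getGradient, and INDArray arithmetic). The method name
 * "adaGradLoopSketch" and the stand-in gradient (params.mul(2)) are
 * hypothetical placeholders, not part of the library.
 */
private static INDArray adaGradLoopSketch(int rows, int cols, int numIterations) {
    AdaGrad updater = new AdaGrad(rows, cols, 0.1);
    INDArray params = Nd4j.randn(rows, cols);
    for (int i = 0; i < numIterations; i++) {
        // stand-in gradient; a real model would compute this from its loss
        INDArray rawGradient = params.mul(2);
        // AdaGrad rescales each element by its accumulated squared-gradient history
        INDArray update = updater.getGradient(rawGradient, i);
        params.subi(update);
    }
    return params;
}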
private Pair<INDArray, Double> update(
        AdaGrad weightAdaGrad,
        AdaGrad biasAdaGrad,
        INDArray syn0,
        INDArray bias,
        VocabWord w1,
        INDArray wordVector,
        INDArray contextVector,
        double gradient) {
    // gradient for word vectors
    INDArray grad1 = contextVector.mul(gradient);
    INDArray update = weightAdaGrad.getGradient(grad1, w1.getIndex(), syn0.shape());

    // gradient for the bias term
    double w1Bias = bias.getDouble(w1.getIndex());
    double biasGradient = biasAdaGrad.getGradient(gradient, w1.getIndex(), bias.shape());
    double update2 = w1Bias - biasGradient;

    return new Pair<>(update, update2);
}
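/*
 * Illustrative sketch only: one way a caller might apply the pair returned by
 * update(..) -- subtract the AdaGrad-scaled step from the word's vector row and
 * store the new bias value. The original caller is not shown here, so treat the
 * method name "applyUpdateSketch" and this application step as assumptions.
 */
private void applyUpdateSketch(Pair<INDArray, Double> result, INDArray syn0, INDArray bias, VocabWord w1) {
    // subtract the AdaGrad-scaled step from this word's vector row
    syn0.slice(w1.getIndex()).subi(result.getFirst());
    // overwrite the bias with the freshly computed value
    bias.putScalar(w1.getIndex(), result.getSecond());
}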
/* compute the gradient given the current solution, the probabilities and the constant */
protected Pair<Double, INDArray> gradient(INDArray p) {
    INDArray sumY = pow(y, 2).sum(1);
    if (yIncs == null) yIncs = zeros(y.shape());
    if (gains == null) gains = ones(y.shape());

    // Student-t distribution: qu_ij = 1 / (1 + ||y_i - y_j||^2), i.e. the unnormalized q
    INDArray qu = y.mmul(y.transpose())
            .muli(-2)
            .addiRowVector(sumY)
            .transpose()
            .addiRowVector(sumY)
            .addi(1)
            .rdivi(1);

    int n = y.rows();

    // set diagonal to zero
    doAlongDiagonal(qu, new Zero());

    // normalize to get probabilities
    INDArray q = qu.div(qu.sum(Integer.MAX_VALUE));
    BooleanIndexing.applyWhere(q, Conditions.lessThan(realMin), new Value(realMin));

    INDArray PQ = p.sub(q);
    INDArray yGrads = getYGradient(n, PQ, qu);

    // increase the gain where the gradient and the previous step disagree in sign, shrink it where they agree
    gains = gains
            .add(.2)
            .muli(yGrads.cond(Conditions.greaterThan(0)).neqi(yIncs.cond(Conditions.greaterThan(0))))
            .addi(
                    gains
                            .mul(0.8)
                            .muli(
                                    yGrads
                                            .cond(Conditions.greaterThan(0))
                                            .eqi(yIncs.cond(Conditions.greaterThan(0)))));

    BooleanIndexing.applyWhere(gains, Conditions.lessThan(minGain), new Value(minGain));

    INDArray gradChange = gains.mul(yGrads);

    if (useAdaGrad) gradChange = adaGrad.getGradient(gradChange, 0);
    else gradChange.muli(learningRate);

    yIncs.muli(momentum).subi(gradChange);

    // cost is the KL divergence between p and q
    double cost = p.mul(log(p.div(q), false)).sum(Integer.MAX_VALUE).getDouble(0);
    return new Pair<>(cost, yIncs);
}
@Test
public void testAdaGrad() {
    int rows = 10;
    int cols = 2;
    /*
     * Project for tomorrow: BaseElementWiseOp is having issues with the reshape
     * (which produces inconsistent results); the test case for this was AdaGrad.
     */
    AdaGrad grad = new AdaGrad(rows, cols, 0.1);
    INDArray W = Nd4j.zeros(rows, cols);
    Distribution dist = Nd4j.getDistributions().createNormal(1, 1);
    for (int i = 0; i < W.rows(); i++) W.putRow(i, Nd4j.create(dist.sample(W.columns())));

    for (int i = 0; i < 5; i++) {
        String learningRates =
                String.valueOf("\nAdagrad\n " + grad.getGradient(W, i)).replaceAll(";", "\n");
        log.info(learningRates);
        W.addi(Nd4j.randn(rows, cols));
    }
}
/**
 * @param X the data matrix to embed
 * @param nDims the target dimensionality of the embedding
 * @param perplexity the target perplexity for the Gaussian kernels
 */
public INDArray calculate(INDArray X, int nDims, double perplexity) {
    if (usePca) X = PCA.pca(X, Math.min(50, X.columns()), normalize);

    // normalization (don't normalize again after pca)
    if (normalize) {
        X.subi(X.min(Integer.MAX_VALUE));
        X = X.divi(X.max(Integer.MAX_VALUE));
        X = X.subiRowVector(X.mean(0));
    }

    if (nDims > X.columns()) nDims = X.columns();

    INDArray sumX = pow(X, 2).sum(1);
    INDArray D = X.mmul(X.transpose()).muli(-2).addRowVector(sumX).transpose().addRowVector(sumX);

    // output
    if (y == null) y = randn(X.rows(), nDims, Nd4j.getRandom()).muli(1e-3f);

    INDArray p = computeGaussianPerplexity(D, perplexity);

    // lie for better local minima
    p.muli(4);

    // init adagrad where needed
    if (useAdaGrad) {
        if (adaGrad == null) {
            adaGrad = new AdaGrad(y.shape());
            adaGrad.setMasterStepSize(learningRate);
        }
    }

    for (int i = 0; i < maxIter; i++) {
        step(p, i);

        if (i == switchMomentumIteration) momentum = finalMomentum;
        if (i == stopLyingIteration) p.divi(4);

        if (iterationListener != null) iterationListener.iterationDone(null, i);
    }

    return y;
}
public AdaGrad createSubset(int index) {
    if (historicalGradient == null) this.historicalGradient = Nd4j.ones(shape);

    if (Shape.isMatrix(shape)) {
        AdaGrad a = new AdaGrad(1, historicalGradient.columns());
        // grab only the needed elements
        INDArray slice = historicalGradient.slice(index).dup();
        a.historicalGradient = slice;
        a.setLearningRate(learningRate);
        return a;
    } else {
        AdaGrad a = new AdaGrad(1, 1);
        // grab only the needed elements
        INDArray slice = Nd4j.scalar(historicalGradient.getDouble(index));
        a.historicalGradient = slice;
        a.setLearningRate(learningRate);
        return a;
    }
}
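/*
 * Illustrative sketch only: createSubset(index) hands back an AdaGrad whose
 * history is a copy of the single row (or single element) at that index, so a
 * caller can compute an AdaGrad-scaled step for one row of a larger parameter
 * matrix. The names "rowUpdateSketch", "fullUpdater", and "rowGradient" are
 * hypothetical placeholders.
 */
private INDArray rowUpdateSketch(AdaGrad fullUpdater, INDArray rowGradient, int rowIndex, int iteration) {
    // per-row updater backed by a copy of that row's squared-gradient history
    AdaGrad rowUpdater = fullUpdater.createSubset(rowIndex);
    // AdaGrad-scaled step for just this row
    return rowUpdater.getGradient(rowGradient, iteration);
}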
public INDArray getValueGradient(final List<Tree> trainingBatch) {

    // We use TreeMap for each of these so that they stay in a
    // canonical sorted order
    // TODO: factor out the initialization routines
    // binaryTD stands for Transform Derivatives
    final MultiDimensionalMap<String, String, INDArray> binaryTD =
            MultiDimensionalMap.newTreeBackedMap();
    // the derivatives of the INDArray tensors for the binary nodes
    final MultiDimensionalMap<String, String, INDArray> binaryINDArrayTD =
            MultiDimensionalMap.newTreeBackedMap();
    // binaryCD stands for Classification Derivatives
    final MultiDimensionalMap<String, String, INDArray> binaryCD =
            MultiDimensionalMap.newTreeBackedMap();
    // unaryCD stands for Classification Derivatives
    final Map<String, INDArray> unaryCD = new TreeMap<>();
    // word vector derivatives
    final Map<String, INDArray> wordVectorD = new TreeMap<>();

    for (MultiDimensionalMap.Entry<String, String, INDArray> entry : binaryTransform.entrySet()) {
        int numRows = entry.getValue().rows();
        int numCols = entry.getValue().columns();
        binaryTD.put(entry.getFirstKey(), entry.getSecondKey(), Nd4j.create(numRows, numCols));
    }

    if (!combineClassification) {
        for (MultiDimensionalMap.Entry<String, String, INDArray> entry :
                binaryClassification.entrySet()) {
            int numRows = entry.getValue().rows();
            int numCols = entry.getValue().columns();
            binaryCD.put(entry.getFirstKey(), entry.getSecondKey(), Nd4j.create(numRows, numCols));
        }
    }

    if (useDoubleTensors) {
        for (MultiDimensionalMap.Entry<String, String, INDArray> entry : binaryTensors.entrySet()) {
            int numRows = entry.getValue().size(1);
            int numCols = entry.getValue().size(2);
            int numSlices = entry.getValue().slices();
            binaryINDArrayTD.put(
                    entry.getFirstKey(),
                    entry.getSecondKey(),
                    Nd4j.create(numRows, numCols, numSlices));
        }
    }

    for (Map.Entry<String, INDArray> entry : unaryClassification.entrySet()) {
        int numRows = entry.getValue().rows();
        int numCols = entry.getValue().columns();
        unaryCD.put(entry.getKey(), Nd4j.create(numRows, numCols));
    }

    for (String s : vocabCache.words()) {
        INDArray vector = featureVectors.vector(s);
        int numRows = vector.rows();
        int numCols = vector.columns();
        wordVectorD.put(s, Nd4j.create(numRows, numCols));
    }

    final List<Tree> forwardPropTrees = new CopyOnWriteArrayList<>();
    // if(!forwardPropTrees.isEmpty())
    Parallelization.iterateInParallel(
            trainingBatch,
            new Parallelization.RunnableWithParams<Tree>() {
                public void run(Tree currentItem, Object[] args) {
                    Tree trainingTree = new Tree(currentItem);
                    trainingTree.connect(new ArrayList<>(currentItem.children()));
                    // this will attach the error vectors and the node vectors
                    // to each node in the tree
                    forwardPropagateTree(trainingTree);
                    forwardPropTrees.add(trainingTree);
                }
            },
            rnTnActorSystem);

    // TODO: we may find a big speedup by separating the derivatives and then summing
    final AtomicDouble error = new AtomicDouble(0);
    if (!forwardPropTrees.isEmpty())
        Parallelization.iterateInParallel(
                forwardPropTrees,
                new Parallelization.RunnableWithParams<Tree>() {
                    public void run(Tree currentItem, Object[] args) {
                        backpropDerivativesAndError(
                                currentItem, binaryTD, binaryCD, binaryINDArrayTD, unaryCD, wordVectorD);
                        error.addAndGet(currentItem.errorSum());
                    }
                },
                new Parallelization.RunnableWithParams<Tree>() {
                    public void run(Tree currentItem, Object[] args) {}
                },
                rnTnActorSystem,
                new Object[] {binaryTD, binaryCD, binaryINDArrayTD, unaryCD, wordVectorD});

    // scale the error by the number of sentences so that the
    // regularization isn't drowned out for large training batches
    double scale =
            trainingBatch == null || trainingBatch.isEmpty() ? 1.0f : (1.0f / trainingBatch.size());
    value = error.doubleValue() * scale;

    value += scaleAndRegularize(binaryTD, binaryTransform, scale, regTransformMatrix);
    value += scaleAndRegularize(binaryCD, binaryClassification, scale, regClassification);
    value += scaleAndRegularizeINDArray(binaryINDArrayTD, binaryTensors, scale, regTransformINDArray);
    value += scaleAndRegularize(unaryCD, unaryClassification, scale, regClassification);
    value += scaleAndRegularize(wordVectorD, featureVectors, scale, regWordVector);

    INDArray derivative =
            Nd4j.toFlattened(
                    getNumParameters(),
                    binaryTD.values().iterator(),
                    binaryCD.values().iterator(),
                    binaryINDArrayTD.values().iterator(),
                    unaryCD.values().iterator(),
                    wordVectorD.values().iterator());

    if (derivative.length() != numParameters)
        throw new IllegalStateException(
                "Gradient has wrong number of parameters "
                        + derivative.length()
                        + " should have been "
                        + numParameters);

    if (paramAdaGrad == null) paramAdaGrad = new AdaGrad(1, derivative.columns());

    derivative = paramAdaGrad.getGradient(derivative, 0);

    return derivative;
}