@Before public void beforeDo() { weightGradient = Nd4j.ones(nIn, nOut); biasGradient = Nd4j.ones(1, nOut); gradient.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient.dup()); gradient.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient.dup()); }
@Test public void testNoOpUpdater() { Random r = new Random(12345L); double lr = 0.5; NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() .learningRate(lr) .layer( new DenseLayer.Builder() .nIn(nIn) .nOut(nOut) .updater(org.deeplearning4j.nn.conf.Updater.NONE) .build()) .build(); int numParams = LayerFactories.getFactory(conf).initializer().numParams(conf, true); INDArray params = Nd4j.create(1, numParams); Layer layer = LayerFactories.getFactory(conf).create(conf, null, 0, params, true); Updater updater = UpdaterCreator.getUpdater(layer); for (int i = 0; i < weightGradient.length(); i++) weightGradient.putScalar(i, r.nextDouble()); for (int i = 0; i < biasGradient.length(); i++) biasGradient.putScalar(i, r.nextDouble()); gradient.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, weightGradient); gradient.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, biasGradient); updater.update(layer, gradient, -1, 1); INDArray weightGradActual = gradient.getGradientFor(DefaultParamInitializer.WEIGHT_KEY); INDArray biasGradActual = gradient.getGradientFor(DefaultParamInitializer.BIAS_KEY); assertEquals(weightGradient, weightGradActual); assertEquals(biasGradient, biasGradActual); }
@Test public void testSGDUpdater() { double lr = 0.05; NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() .learningRate(lr) .layer( new DenseLayer.Builder() .nIn(nIn) .nOut(nOut) .updater(org.deeplearning4j.nn.conf.Updater.SGD) .build()) .build(); int numParams = LayerFactories.getFactory(conf).initializer().numParams(conf, true); INDArray params = Nd4j.create(1, numParams); Layer layer = LayerFactories.getFactory(conf).create(conf, null, 0, params, true); Updater updater = UpdaterCreator.getUpdater(layer); updater.update(layer, gradient, -1, 1); Gradient gradientDup = new DefaultGradient(); gradientDup.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient.dup()); gradientDup.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient.dup()); for (Map.Entry<String, INDArray> entry : gradientDup.gradientForVariable().entrySet()) { val = entry.getValue(); gradExpected = val.mul(lr); assertEquals(gradExpected, gradient.getGradientFor(entry.getKey())); } assertEquals(lr, layer.conf().getLayer().getLearningRate(), 1e-4); }
private Gradient createPrevGradient() { Gradient gradient = new DefaultGradient(); INDArray pseudoGradients = Nd4j.ones(nExamples, nChannelsIn, featureMapHeight, featureMapWidth); gradient.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, pseudoGradients); gradient.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, pseudoGradients); return gradient; }
/** * Contrastive divergence approximates the log-likelihood gradient around x1 (the input) with * repeated Gibbs sampling from the model. Given an energy-based model, the higher k is (the more * we sample the model), the better the approximation: the update lowers the energy of (increases * the likelihood of) the training data and raises the energy of (lowers the likelihood of) the * samples drawn from the model. * * <p>Other insights: CD-k keeps the first k samples of a Gibbs chain over the model. */ public void contrastiveDivergence() { Gradient gradient = gradient(); getParam(PretrainParamInitializer.VISIBLE_BIAS_KEY) .subi(gradient.gradientForVariable().get(PretrainParamInitializer.VISIBLE_BIAS_KEY)); getParam(PretrainParamInitializer.BIAS_KEY) .subi(gradient.gradientForVariable().get(PretrainParamInitializer.BIAS_KEY)); getParam(PretrainParamInitializer.WEIGHT_KEY) .subi(gradient.gradientForVariable().get(PretrainParamInitializer.WEIGHT_KEY)); }
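// A minimal CD-1 sketch in plain Nd4j, added only to make the update rule above concrete. The
// sigmoid units, use of mean activations instead of samples, and the method name are illustrative
// assumptions, not this class's own sampling code (which lives in computeGradientAndScore()).
private static Gradient cd1GradientSketch(INDArray v0, INDArray W, INDArray vBias, INDArray hBias) {
  INDArray h0 = Transforms.sigmoid(v0.mmul(W).addRowVector(hBias));             // positive phase: P(h|v0)
  INDArray v1 = Transforms.sigmoid(h0.mmul(W.transpose()).addRowVector(vBias)); // one Gibbs step: reconstruction
  INDArray h1 = Transforms.sigmoid(v1.mmul(W).addRowVector(hBias));             // negative phase: P(h|v1)
  Gradient g = new DefaultGradient();
  // positive statistics minus negative statistics
  g.setGradientFor(PretrainParamInitializer.WEIGHT_KEY, v0.transpose().mmul(h0).sub(v1.transpose().mmul(h1)));
  g.setGradientFor(PretrainParamInitializer.VISIBLE_BIAS_KEY, v0.sub(v1).sum(0));
  g.setGradientFor(PretrainParamInitializer.BIAS_KEY, h0.sub(h1).sum(0));
  return g;
}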
@Override public void update(Layer layer, Gradient gradient, int iteration) { preApply(layer, gradient, iteration); for (Map.Entry<String, INDArray> gradientPair : gradient.gradientForVariable().entrySet()) { GradientUpdater updater = init(gradientPair.getKey(), gradientPair.getValue(), layer); INDArray gradient2 = updater.getGradient(gradientPair.getValue(), iteration); postApply(layer, gradient2, gradientPair.getKey()); gradient.setGradientFor(gradientPair.getKey(), gradient2); } }
private Pair<Gradient, INDArray> getGradientsAndDelta(INDArray output) { INDArray labelsSubOut = labels.sub(output); Gradient gradient = new DefaultGradient(); gradient.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, labelsSubOut.sum(0)); switch (conf.getLossFunction()) { case MCXENT: // cross-entropy (multi-class, with one-hot encoding) gradient .gradientForVariable() .put(DefaultParamInitializer.WEIGHT_KEY, input.transpose().mmul(labelsSubOut)); return new Pair<>(gradient, labelsSubOut); case XENT: // cross-entropy (single binary output variable) gradient .gradientForVariable() .put( DefaultParamInitializer.WEIGHT_KEY, input.transpose().mmul(labelsSubOut.div(output.mul(output.rsub(1))))); return new Pair<>(gradient, labelsSubOut); case MSE: // mean squared error gradient .gradientForVariable() .put(DefaultParamInitializer.WEIGHT_KEY, input.transpose().mmul(labelsSubOut.neg())); return new Pair<>(gradient, labelsSubOut); case EXPLL: // exponential log likelihood gradient .gradientForVariable() .put( DefaultParamInitializer.WEIGHT_KEY, input.transpose().mmul(labels.rsub(1).divi(output))); return new Pair<>(gradient, labelsSubOut); case RMSE_XENT: // root mean squared error cross entropy INDArray squaredrmseXentDiff = pow(labelsSubOut, 2.0); INDArray sqrt = sqrt(squaredrmseXentDiff); gradient .gradientForVariable() .put(DefaultParamInitializer.WEIGHT_KEY, input.transpose().mmul(sqrt)); return new Pair<>(gradient, labelsSubOut); case SQUARED_LOSS: gradient .gradientForVariable() .put( DefaultParamInitializer.WEIGHT_KEY, input.transpose().mmul(input.transpose().mmul(pow(labelsSubOut, 2)))); return new Pair<>(gradient, labelsSubOut); case NEGATIVELOGLIKELIHOOD: // multi-class cross-entropy gradient .gradientForVariable() .put(DefaultParamInitializer.WEIGHT_KEY, input.transpose().mmul(labelsSubOut)); return new Pair<>(gradient, labelsSubOut); default: throw new IllegalStateException("Invalid loss function: " + conf.getLossFunction()); } }
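// A tiny worked example of the MCXENT branch above: with one-hot labels Y, softmax output Yhat
// and layer input X, the stored weight gradient is X^T (Y - Yhat) and the bias gradient is the
// column sum of (Y - Yhat). The concrete values below are assumptions for illustration only.
private static void mcxentGradientSketch() {
  INDArray X = Nd4j.create(new double[][] {{1.0, 2.0}, {0.5, -1.0}});   // [2 examples, 2 inputs]
  INDArray Y = Nd4j.create(new double[][] {{1.0, 0.0}, {0.0, 1.0}});    // one-hot labels
  INDArray Yhat = Nd4j.create(new double[][] {{0.7, 0.3}, {0.4, 0.6}}); // softmax output
  INDArray labelsSubOut = Y.sub(Yhat);
  INDArray wGrad = X.transpose().mmul(labelsSubOut);                    // [2 inputs, 2 outputs], as stored under WEIGHT_KEY
  INDArray bGrad = labelsSubOut.sum(0);                                 // as stored under BIAS_KEY
}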
@Test public void testRMSPropUpdater() { double lr = 0.01; double rmsDecay = 0.25; Map<String, INDArray> lastG = new HashMap<>(); NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() .learningRate(lr) .rmsDecay(rmsDecay) .layer( new DenseLayer.Builder() .nIn(nIn) .nOut(nOut) .updater(org.deeplearning4j.nn.conf.Updater.RMSPROP) .build()) .build(); int numParams = LayerFactories.getFactory(conf).initializer().numParams(conf, true); INDArray params = Nd4j.create(1, numParams); Layer layer = LayerFactories.getFactory(conf).create(conf, null, 0, params, true); Updater updater = UpdaterCreator.getUpdater(layer); int updaterStateSize = updater.stateSizeForLayer(layer); INDArray updaterState = Nd4j.create(1, updaterStateSize); updater.setStateViewArray(layer, updaterState, true); updater.update(layer, gradient, -1, 1); Gradient gradientDup = new DefaultGradient(); gradientDup.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient.dup()); gradientDup.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient.dup()); for (Map.Entry<String, INDArray> entry : gradientDup.gradientForVariable().entrySet()) { key = entry.getKey(); val = entry.getValue(); INDArray lastGTmp = lastG.get(key); if (lastGTmp == null) lastGTmp = Nd4j.zeros(val.shape()); lastGTmp.muli(rmsDecay).addi(val.mul(val).muli(1 - rmsDecay)); gradExpected = val.mul(lr).div(Transforms.sqrt(lastGTmp.add(Nd4j.EPS_THRESHOLD))); assertEquals(gradExpected, gradient.getGradientFor(entry.getKey())); lastG.put(key, lastGTmp); } assertEquals(rmsDecay, layer.conf().getLayer().getRmsDecay(), 1e-4); }
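// The RMSProp rule that the expected values in the test above reproduce, isolated for a single
// parameter array; the method and parameter names are local to this sketch, not part of the
// updater API.
private static INDArray rmsPropStepSketch(INDArray grad, INDArray lastG, double lr, double rmsDecay) {
  lastG.muli(rmsDecay).addi(grad.mul(grad).muli(1 - rmsDecay));         // running average of squared gradients
  return grad.mul(lr).div(Transforms.sqrt(lastG.add(Nd4j.EPS_THRESHOLD)));
}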
@Test public void testAdamUpdater() { INDArray m, v; double lr = 0.01; int iteration = 0; double beta1 = 0.8; double beta2 = 0.888; NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() .learningRate(lr) .iterations(iteration) .adamMeanDecay(beta1) .adamVarDecay(beta2) .layer( new DenseLayer.Builder() .nIn(nIn) .nOut(nOut) .updater(org.deeplearning4j.nn.conf.Updater.ADAM) .build()) .build(); int numParams = LayerFactories.getFactory(conf).initializer().numParams(conf, true); INDArray params = Nd4j.create(1, numParams); Layer layer = LayerFactories.getFactory(conf).create(conf, null, 0, params, true); Updater updater = UpdaterCreator.getUpdater(layer); int updaterStateSize = updater.stateSizeForLayer(layer); INDArray updaterState = Nd4j.create(1, updaterStateSize); updater.setStateViewArray(layer, updaterState, true); updater.update(layer, gradient, iteration, 1); double beta1t = FastMath.pow(beta1, iteration); double beta2t = FastMath.pow(beta2, iteration); double alphat = lr * FastMath.sqrt(1 - beta2t) / (1 - beta1t); if (Double.isNaN(alphat) || alphat == 0.0) alphat = epsilon; Gradient gradientDup = new DefaultGradient(); gradientDup.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient); gradientDup.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient); for (Map.Entry<String, INDArray> entry : gradientDup.gradientForVariable().entrySet()) { val = entry.getValue(); m = Nd4j.zeros(val.shape()); v = Nd4j.zeros(val.shape()); m.muli(beta1).addi(val.mul(1.0 - beta1)); v.muli(beta2).addi(val.mul(val).mul(1.0 - beta2)); gradExpected = m.mul(alphat).divi(Transforms.sqrt(v).addi(epsilon)); if (!gradExpected.equals(gradient.getGradientFor(entry.getKey()))) { System.out.println(Arrays.toString(gradExpected.dup().data().asFloat())); System.out.println( Arrays.toString(gradient.getGradientFor(entry.getKey()).dup().data().asFloat())); } assertEquals(gradExpected, gradient.getGradientFor(entry.getKey())); } assertEquals(beta1, layer.conf().getLayer().getAdamMeanDecay(), 1e-4); assertEquals(beta2, layer.conf().getLayer().getAdamVarDecay(), 1e-4); }
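// The bias-corrected Adam step that the expected values above follow, isolated for one parameter
// array. t is the 1-based step count and eps stands in for the test's epsilon field; all names
// are local to this sketch.
private static INDArray adamStepSketch(INDArray grad, INDArray m, INDArray v, double lr, double beta1, double beta2, int t, double eps) {
  m.muli(beta1).addi(grad.mul(1.0 - beta1));                            // first moment estimate
  v.muli(beta2).addi(grad.mul(grad).muli(1.0 - beta2));                 // second moment estimate
  double alphaT = lr * FastMath.sqrt(1.0 - FastMath.pow(beta2, t)) / (1.0 - FastMath.pow(beta1, t));
  return m.mul(alphaT).div(Transforms.sqrt(v).add(eps));
}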
private Gradient createPrevGradient() { int inputWidth = 28; int inputHeight = 28; int[] stride = new int[] {2, 2}; int[] padding = new int[] {0, 0}; int[] kernelSize = new int[] {9, 9}; int nChannelsIn = 1; int nExamples = 5; int featureMapHeight = (inputHeight + padding[0] * 2 - kernelSize[0]) / stride[0] + 1; int featureMapWidth = (inputWidth + padding[1] * 2 - kernelSize[1]) / stride[1] + 1; Gradient gradient = new DefaultGradient(); INDArray pseudoGradients = Nd4j.ones(nExamples, nChannelsIn, featureMapHeight, featureMapWidth); gradient.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, pseudoGradients); gradient.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, pseudoGradients); return gradient; }
@Test public void testNesterovsUpdater() { double lr = 1e-2; double mu = 0.6; INDArray v, vPrev; NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() .learningRate(lr) .momentum(mu) .layer( new DenseLayer.Builder() .nIn(nIn) .nOut(nOut) .updater(org.deeplearning4j.nn.conf.Updater.NESTEROVS) .build()) .build(); int numParams = LayerFactories.getFactory(conf).initializer().numParams(conf, true); INDArray params = Nd4j.create(1, numParams); Layer layer = LayerFactories.getFactory(conf).create(conf, null, 0, params, true); Updater updater = UpdaterCreator.getUpdater(layer); int updaterStateSize = updater.stateSizeForLayer(layer); INDArray updaterState = Nd4j.create(1, updaterStateSize); updater.setStateViewArray(layer, updaterState, true); updater.update(layer, gradient, -1, 1); Gradient gradientDup = new DefaultGradient(); gradientDup.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient.dup()); gradientDup.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient.dup()); for (Map.Entry<String, INDArray> entry : gradientDup.gradientForVariable().entrySet()) { val = entry.getValue(); v = Nd4j.zeros(val.shape()); vPrev = v; v = vPrev.mul(mu).subi(val.mul(lr)); gradExpected = vPrev.muli(mu).addi(v.mul(-mu - 1)); assertEquals(gradExpected, gradient.getGradientFor(entry.getKey())); } assertEquals(mu, layer.conf().getLayer().getMomentum(), 1e-4); }
@Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; ModelAndGradient that = (ModelAndGradient) o; if (gradient != null ? !gradient.equals(that.gradient) : that.gradient != null) return false; return !(model != null ? !model.equals(that.model) : that.model != null); }
@Test public void testAdaGradUpdater() { double lr = 1e-2; NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() .learningRate(lr) .layer( new DenseLayer.Builder() .nIn(nIn) .nOut(nOut) .updater(org.deeplearning4j.nn.conf.Updater.ADAGRAD) .build()) .build(); int numParams = LayerFactories.getFactory(conf).initializer().numParams(conf, true); INDArray params = Nd4j.create(1, numParams); Layer layer = LayerFactories.getFactory(conf).create(conf, null, 0, params, true); Updater updater = UpdaterCreator.getUpdater(layer); int updaterStateSize = updater.stateSizeForLayer(layer); INDArray updaterState = Nd4j.create(1, updaterStateSize); updater.setStateViewArray(layer, updaterState, true); updater.update(layer, gradient, -1, 1); Gradient gradientDup = new DefaultGradient(); gradientDup.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient); gradientDup.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient); for (Map.Entry<String, INDArray> entry : gradientDup.gradientForVariable().entrySet()) { val = entry.getValue(); gradExpected = Transforms.sqrt(val.mul(val).add(epsilon)).rdiv(lr).mul(val); assertEquals(gradExpected, gradient.getGradientFor(entry.getKey())); } assertEquals(lr, layer.conf().getLayer().getLearningRate(), 1e-4); }
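// The AdaGrad step the test above checks: the squared-gradient accumulator starts at zero, so
// after a single update it equals grad^2 and the result is lr / sqrt(grad^2 + eps) * grad.
// Names are local to this sketch; eps stands in for the test's epsilon field.
private static INDArray adaGradStepSketch(INDArray grad, INDArray histGradSquared, double lr, double eps) {
  histGradSquared.addi(grad.mul(grad));                                 // accumulate squared gradients
  return grad.mul(lr).div(Transforms.sqrt(histGradSquared.add(eps)));
}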
@Override public void computeGradientAndScore() { int k = layerConf().getK(); // POSITIVE PHASE Pair<INDArray, INDArray> probHidden = sampleHiddenGivenVisible(input()); /* * Start the gibbs sampling. */ INDArray chainStart = probHidden.getSecond(); /* * Note that at a later date, we can explore alternative methods of * storing the chain transitions for different kinds of sampling * and exploring the search space. */ Pair<Pair<INDArray, INDArray>, Pair<INDArray, INDArray>> matrices; // negative visible means or expected values INDArray nvMeans = null; // negative visible samples INDArray nvSamples = null; // negative hidden means or expected values INDArray nhMeans = null; // negative hidden samples INDArray nhSamples = null; /* * K steps of gibbs sampling. This is the negative phase of contrastive divergence. * * There are 4 matrices being computed for each gibbs step: * the negative visible and hidden samples, and their expected values (means). * */ for (int i = 0; i < k; i++) { // NEGATIVE PHASE if (i == 0) matrices = gibbhVh(chainStart); else matrices = gibbhVh(nhSamples); // get the cost updates for sampling in the chain after k iterations nvMeans = matrices.getFirst().getFirst(); nvSamples = matrices.getFirst().getSecond(); nhMeans = matrices.getSecond().getFirst(); nhSamples = matrices.getSecond().getSecond(); } /* * Update gradient parameters */ INDArray wGradient = input().transposei().mmul(probHidden.getSecond()).subi(nvSamples.transpose().mmul(nhMeans)); INDArray hBiasGradient; if (layerConf().getSparsity() != 0) // all hidden units must stay around this number hBiasGradient = probHidden.getSecond().rsub(layerConf().getSparsity()).sum(0); else // update rule: the expected activations of the hidden units given the input, minus the // negative hidden means hBiasGradient = probHidden.getSecond().sub(nhMeans).sum(0); // update rule: the input minus the negative visible samples INDArray delta = input.sub(nvSamples); INDArray vBiasGradient = delta.sum(0); Gradient ret = new DefaultGradient(); ret.gradientForVariable().put(PretrainParamInitializer.VISIBLE_BIAS_KEY, vBiasGradient); ret.gradientForVariable().put(PretrainParamInitializer.BIAS_KEY, hBiasGradient); ret.gradientForVariable().put(PretrainParamInitializer.WEIGHT_KEY, wGradient); gradient = ret; setScoreWithZ(delta); }
/** Apply gradient normalization: scale based on L2, clipping etc. */ public void preApply(Layer layer, Gradient gradient, int iteration) { GradientNormalization normalization = layer.conf().getLayer().getGradientNormalization(); if (normalization == null || normalization == GradientNormalization.None) return; // no op final double threshold = layer.conf().getLayer().getGradientNormalizationThreshold(); switch (normalization) { case RenormalizeL2PerLayer: double sumSquares = 0.0; for (INDArray g : gradient.gradientForVariable().values()) { double l2 = g.norm2Number().doubleValue(); // l2 norm: sqrt(sum_i g_i^2) sumSquares += l2 * l2; } double layerL2 = FastMath.sqrt(sumSquares); for (INDArray g : gradient.gradientForVariable().values()) { g.divi(layerL2); } break; case RenormalizeL2PerParamType: for (INDArray g : gradient.gradientForVariable().values()) { double l2 = Nd4j.getExecutioner().execAndReturn(new Norm2(g)).getFinalResult().doubleValue(); g.divi(l2); } break; case ClipElementWiseAbsoluteValue: Condition absValueCondition = new AbsValueGreaterThan(threshold); Function<Number, Number> clipFn = new Function<Number, Number>() { @Override public Number apply(Number number) { return (number.doubleValue() > threshold ? threshold : -threshold); } }; for (INDArray g : gradient.gradientForVariable().values()) { BooleanIndexing.applyWhere(g, absValueCondition, clipFn); } break; case ClipL2PerLayer: double sumSquares2 = 0.0; for (INDArray g : gradient.gradientForVariable().values()) { double l2 = Nd4j.getExecutioner().execAndReturn(new Norm2(g)).getFinalResult().doubleValue(); // l2 norm: sqrt(sum_i g_i^2) sumSquares2 += l2 * l2; } double layerL22 = FastMath.sqrt(sumSquares2); if (layerL22 > threshold) { double scalingFactor = threshold / layerL22; // g = g / l2 * threshold -> for (INDArray g : gradient.gradientForVariable().values()) { g.muli(scalingFactor); } } break; case ClipL2PerParamType: for (INDArray g : gradient.gradientForVariable().values()) { double l2 = g.norm2Number().doubleValue(); if (l2 > threshold) { double scalingFactor = l2 / threshold; g.divi(scalingFactor); } } break; default: throw new RuntimeException( "Unknown (or not implemented) gradient normalization strategy: " + normalization); } }
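// A small worked example of the RenormalizeL2PerLayer case above: the layer-wide L2 norm is taken
// over all parameter arrays together, then every array is divided by it, so the concatenated
// gradient ends up with unit L2 norm. The toy gradient map is an assumption for illustration.
private static void renormalizeL2PerLayerSketch() {
  Map<String, INDArray> grads = new HashMap<>();
  grads.put(DefaultParamInitializer.WEIGHT_KEY, Nd4j.rand(3, 4));
  grads.put(DefaultParamInitializer.BIAS_KEY, Nd4j.rand(1, 4));
  double sumSquares = 0.0;
  for (INDArray g : grads.values()) {
    double l2 = g.norm2Number().doubleValue();
    sumSquares += l2 * l2;
  }
  double layerL2 = FastMath.sqrt(sumSquares);
  for (INDArray g : grads.values()) {
    g.divi(layerL2); // scale every parameter array by the same factor
  }
}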
@Override public int hashCode() { int result = gradient != null ? gradient.hashCode() : 0; result = 31 * result + (model != null ? model.hashCode() : 0); return result; }
@Override public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) { // First: Do forward pass to get gate activations etc. INDArray[] activations = activateHelper(true, null); // Order: {outputActivations,rucZs,rucAs} INDArray outputActivations = activations[0]; INDArray rucZs = activations[1]; INDArray rucAs = activations[2]; INDArray inputWeights = getParam(GRUParamInitializer.INPUT_WEIGHT_KEY); // Shape: [n^(L-1),3*n^L], order: [wr,wu,wc] INDArray recurrentWeights = getParam(GRUParamInitializer.RECURRENT_WEIGHT_KEY); // Shape: [n^L,3*n^L]; order: [wR,wU,wC] int layerSize = recurrentWeights.size(0); // i.e., n^L int prevLayerSize = inputWeights.size(0); // n^(L-1) int miniBatchSize = epsilon.size(0); boolean is2dInput = epsilon.rank() < 3; // Edge case: T=1 may have shape [miniBatchSize,n^(L+1)], equiv. to // [miniBatchSize,n^(L+1),1] int timeSeriesLength = (is2dInput ? 1 : epsilon.size(2)); INDArray wr = inputWeights.get(NDArrayIndex.all(), interval(0, layerSize)); INDArray wu = inputWeights.get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize)); INDArray wc = inputWeights.get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize)); INDArray wR = recurrentWeights.get(NDArrayIndex.all(), interval(0, layerSize)); INDArray wU = recurrentWeights.get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize)); INDArray wC = recurrentWeights.get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize)); INDArray wRdiag = Nd4j.diag(wR).transpose(); // INDArray wUdiag = Nd4j.diag(wU).transpose(); INDArray wCdiag = Nd4j.diag(wC).transpose(); // Parameter gradients: Stores sum over each time step here INDArray biasGradients = Nd4j.zeros(new int[] {1, 3 * layerSize}); INDArray inputWeightGradients = Nd4j.zeros(new int[] {prevLayerSize, 3 * layerSize}); INDArray recurrentWeightGradients = Nd4j.zeros(new int[] {layerSize, 3 * layerSize}); INDArray epsilonNext = Nd4j.zeros( miniBatchSize, prevLayerSize, timeSeriesLength); // i.e., what would be W^L*(delta^L)^T. Shape: [m,n^(L-1),T] INDArray deltaOutNext = Nd4j.zeros(miniBatchSize, layerSize); for (int t = timeSeriesLength - 1; t >= 0; t--) { INDArray prevOut = (t == 0 ? Nd4j.zeros(miniBatchSize, layerSize) : outputActivations.tensorAlongDimension(t - 1, 1, 0)); // Shape: [m,n^L] INDArray aSlice = (is2dInput ? rucAs : rucAs.tensorAlongDimension(t, 1, 0)); INDArray zSlice = (is2dInput ? rucZs : rucZs.tensorAlongDimension(t, 1, 0)); INDArray aSliceNext; INDArray zSliceNext; if (t == timeSeriesLength - 1) { aSliceNext = Nd4j.zeros(miniBatchSize, 3 * layerSize); zSliceNext = Nd4j.zeros(miniBatchSize, 3 * layerSize); } else { aSliceNext = rucAs.tensorAlongDimension(t + 1, 1, 0); zSliceNext = rucZs.tensorAlongDimension(t + 1, 1, 0); } INDArray zr = zSlice.get(NDArrayIndex.all(), interval(0, layerSize)); INDArray sigmaPrimeZr = Nd4j.getExecutioner() .execAndReturn(Nd4j.getOpFactory().createTransform("sigmoid", zr.dup()).derivative()); INDArray epsilonSlice = (is2dInput ? epsilon : epsilon.tensorAlongDimension(t, 1, 0)); // (w^{L+1}*(delta^{(L+1)t})^T)^T or equiv. INDArray deltaOut = epsilonSlice.dup(); if (t < timeSeriesLength - 1) { INDArray aOut = (is2dInput ? 
outputActivations : outputActivations.tensorAlongDimension(t, 1, 0)); INDArray arNext = aSliceNext.get(NDArrayIndex.all(), interval(0, layerSize)); INDArray auNext = aSliceNext.get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize)); INDArray acNext = aSliceNext.get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize)); INDArray zrNext = zSliceNext.get(NDArrayIndex.all(), interval(0, layerSize)); INDArray zuNext = zSliceNext.get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize)); INDArray zcNext = zSliceNext.get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize)); INDArray sigmaPrimeZrNext = Nd4j.getExecutioner() .execAndReturn( Nd4j.getOpFactory().createTransform("sigmoid", zrNext.dup()).derivative()); INDArray sigmaPrimeZuNext = Nd4j.getExecutioner() .execAndReturn( Nd4j.getOpFactory().createTransform("sigmoid", zuNext.dup()).derivative()); INDArray sigmaPrimeZcNext = Nd4j.getExecutioner() .execAndReturn( Nd4j.getOpFactory() .createTransform(conf.getLayer().getActivationFunction(), zcNext.dup()) .derivative()); deltaOut.addi(auNext.mul(deltaOutNext)); deltaOut.addi( aOut.sub(acNext) .muli(sigmaPrimeZuNext) .muli(wU.mmul(deltaOutNext.transpose()).transpose())); deltaOut.addi( auNext .rsub(1.0) .muli(sigmaPrimeZcNext) .muli(arNext.add(aOut.mul(sigmaPrimeZrNext).muliRowVector(wRdiag))) .muli(wC.mmul(deltaOutNext.transpose()).transpose())); } // Delta at update gate INDArray zu = zSlice.get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize)); INDArray sigmaPrimeZu = Nd4j.getExecutioner() .execAndReturn(Nd4j.getOpFactory().createTransform("sigmoid", zu.dup()).derivative()); INDArray ac = aSlice.get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize)); INDArray deltaU = deltaOut.mul(sigmaPrimeZu).muli(prevOut.sub(ac)); // Delta for candidate activation INDArray zc = zSlice.get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize)); INDArray sigmaPrimeZc = Nd4j.getExecutioner() .execAndReturn( Nd4j.getOpFactory() .createTransform(conf.getLayer().getActivationFunction(), zc.dup()) .derivative()); INDArray au = aSlice.get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize)); INDArray deltaC = deltaOut.mul(sigmaPrimeZc).muli(au.rsub(1.0)); // Delta at reset gate INDArray deltaR = deltaC.mulRowVector(wCdiag).muli(prevOut).muli(sigmaPrimeZr); // Add input gradients for this time step: INDArray prevLayerActivationSlice = (is2dInput ? 
input : input.tensorAlongDimension(t, 1, 0)); inputWeightGradients .get(NDArrayIndex.all(), interval(0, layerSize)) .addi(deltaR.transpose().mmul(prevLayerActivationSlice).transpose()); inputWeightGradients .get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize)) .addi(deltaU.transpose().mmul(prevLayerActivationSlice).transpose()); inputWeightGradients .get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize)) .addi(deltaC.transpose().mmul(prevLayerActivationSlice).transpose()); // Add recurrent weight gradients for this time step: if (t > 0) { // t=0: no previous output recurrentWeightGradients .get(NDArrayIndex.all(), interval(0, layerSize)) .addi(deltaR.transpose().mmul(prevOut).transpose()); recurrentWeightGradients .get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize)) .addi(deltaU.transpose().mmul(prevOut).transpose()); INDArray ar = aSlice.get(NDArrayIndex.all(), interval(0, layerSize)); recurrentWeightGradients .get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize)) .addi(deltaC.transpose().mmul(prevOut.mul(ar)).transpose()); } // Add bias gradients for this time step: biasGradients.get(NDArrayIndex.point(0), interval(0, layerSize)).addi(deltaR.sum(0)); biasGradients .get(NDArrayIndex.point(0), interval(layerSize, 2 * layerSize)) .addi(deltaU.sum(0)); biasGradients .get(NDArrayIndex.point(0), interval(2 * layerSize, 3 * layerSize)) .addi(deltaC.sum(0)); INDArray epsilonNextSlice = wr.mmul(deltaR.transpose()) .transpose() .addi(wu.mmul(deltaU.transpose()).transpose()) .addi(wc.mmul(deltaC.transpose()).transpose()); epsilonNext.tensorAlongDimension(t, 1, 0).assign(epsilonNextSlice); deltaOutNext = deltaOut; } Gradient g = new DefaultGradient(); g.setGradientFor(GRUParamInitializer.INPUT_WEIGHT_KEY, inputWeightGradients); g.setGradientFor(GRUParamInitializer.RECURRENT_WEIGHT_KEY, recurrentWeightGradients); g.setGradientFor(GRUParamInitializer.BIAS_KEY, biasGradients); return new Pair<>(g, epsilonNext); }
@Test public void testAdaDeltaUpdate() { INDArray dxSquared; Map<String, INDArray> msg = new HashMap<>(); Map<String, INDArray> msdx = new HashMap<>(); double rho = 0.85; NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() .rho(rho) .layer( new DenseLayer.Builder() .nIn(nIn) .nOut(nOut) .updater(org.deeplearning4j.nn.conf.Updater.ADADELTA) .build()) .build(); int numParams = LayerFactories.getFactory(conf).initializer().numParams(conf, true); INDArray params = Nd4j.create(1, numParams); Layer layer = LayerFactories.getFactory(conf).create(conf, null, 0, params, true); Updater updater = UpdaterCreator.getUpdater(layer); int updaterStateSize = updater.stateSizeForLayer(layer); INDArray updaterState = Nd4j.create(1, updaterStateSize); updater.setStateViewArray(layer, updaterState, true); Gradient gradientDup = new DefaultGradient(); gradientDup.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient.dup()); gradientDup.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient.dup()); for (int i = 0; i < 2; i++) { updater.update(layer, gradient, i, 1); // calculations for one iteration / update for (Map.Entry<String, INDArray> entry : gradientDup.gradientForVariable().entrySet()) { key = entry.getKey(); val = entry.getValue(); INDArray msgTmp = msg.get(key); INDArray msdxTmp = msdx.get(key); if (msgTmp == null) { msgTmp = Nd4j.zeros(val.shape()); msdxTmp = Nd4j.zeros(val.shape()); } msgTmp.muli(rho); msgTmp.addi(1 - rho).muli(val.mul(val)); gradExpected = Transforms.sqrt(msdxTmp.add(Nd4j.EPS_THRESHOLD)) .divi(Transforms.sqrt(msgTmp.add(Nd4j.EPS_THRESHOLD))) .muli(val); gradientDup.setGradientFor(key, gradExpected); assertEquals(gradExpected, gradient.getGradientFor(entry.getKey())); msdxTmp.muli(rho); dxSquared = gradExpected.mul(gradExpected); msdxTmp.addi(dxSquared.muli(1 - rho)); msg.put(key, msgTmp); msdx.put(key, msdxTmp); } assertEquals(rho, layer.conf().getLayer().getRho(), 1e-4); } }
@Test public void testMultiLayerUpdater() throws Exception { Nd4j.getRandom().setSeed(12345L); double lr = 0.03; MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .learningRate(lr) .momentum(0.6) .list() .layer( 0, new DenseLayer.Builder() .nIn(4) .nOut(5) .updater(org.deeplearning4j.nn.conf.Updater.SGD) .build()) .layer( 1, new DenseLayer.Builder() .nIn(5) .nOut(6) .updater(org.deeplearning4j.nn.conf.Updater.NONE) .build()) .layer( 2, new DenseLayer.Builder() .nIn(6) .nOut(7) .updater(org.deeplearning4j.nn.conf.Updater.ADAGRAD) .build()) .layer( 3, new DenseLayer.Builder() .nIn(7) .nOut(8) .updater(org.deeplearning4j.nn.conf.Updater.NESTEROVS) .build()) .build(); MultiLayerNetwork net = new MultiLayerNetwork(conf); net.init(); Updater updater = UpdaterCreator.getUpdater(net); assertNotNull(updater); assertTrue(updater.getClass() == MultiLayerUpdater.class); Field f = MultiLayerUpdater.class.getDeclaredField("layerUpdaters"); f.setAccessible(true); Updater[] updaters = (Updater[]) f.get(updater); assertNotNull(updaters); assertTrue(updaters.length == net.getnLayers()); assertTrue(updaters[0] instanceof SgdUpdater); assertTrue(updaters[1] instanceof NoOpUpdater); assertTrue(updaters[2] instanceof AdaGradUpdater); assertTrue(updaters[3] instanceof NesterovsUpdater); Updater[] uArr = new Updater[4]; uArr[0] = new SgdUpdater(); uArr[1] = new NoOpUpdater(); uArr[2] = new AdaGradUpdater(); int updaterStateSize = uArr[2].stateSizeForLayer(net.getLayer(2)); INDArray updaterState = Nd4j.create(1, updaterStateSize); uArr[2].setStateViewArray(net.getLayer(2), updaterState, true); uArr[3] = new NesterovsUpdater(); updaterStateSize = uArr[3].stateSizeForLayer(net.getLayer(3)); updaterState = Nd4j.create(1, updaterStateSize); uArr[3].setStateViewArray(net.getLayer(3), updaterState, true); int[] nIns = {4, 5, 6, 7}; int[] nOuts = {5, 6, 7, 8}; for (int i = 0; i < 5; i++) { Gradient gradient = new DefaultGradient(); Map<String, INDArray> expectedGradient = new LinkedHashMap<>(); for (int j = 0; j < net.getnLayers(); j++) { // Generate test gradient: INDArray wGrad = Nd4j.rand(1, nIns[j] * nOuts[j]); INDArray bGrad = Nd4j.rand(1, nOuts[j]); String wKey = j + "_" + DefaultParamInitializer.WEIGHT_KEY; String bKey = j + "_" + DefaultParamInitializer.BIAS_KEY; gradient.setGradientFor(wKey, wGrad); gradient.setGradientFor(bKey, bGrad); // Also put copy of gradient through separate layer updaters to compare Gradient layerGradient = new DefaultGradient(); layerGradient.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, wGrad.dup()); layerGradient.setGradientFor(DefaultParamInitializer.BIAS_KEY, bGrad.dup()); uArr[j].update(net.getLayer(j), layerGradient, i, 1); for (String s : layerGradient.gradientForVariable().keySet()) { expectedGradient.put(j + "_" + s, layerGradient.getGradientFor(s)); } } updater.update(net, gradient, i, 1); assertEquals(gradient.gradientForVariable(), expectedGradient); } }
@Override public Pair<Gradient, INDArray> backpropGradient( INDArray input, INDArray weights, INDArray delta, int[] kernel, int[] strides, int[] pad, INDArray biasGradView, INDArray weightGradView, String afn) { int miniBatch = input.size(0); int inH = input.size(2); int inW = input.size(3); int outDepth = weights.size(0); int inDepth = weights.size(1); int kH = weights.size(2); int kW = weights.size(3); int outH = Convolution.outSize(inH, kernel[0], strides[0], pad[0], false); int outW = Convolution.outSize(inW, kernel[1], strides[1], pad[1], false); if (!Shape.strideDescendingCAscendingF(delta)) { // apparently not supported by cuDNN delta = delta.dup(); } int[] srcStride = input.stride(); int[] deltaStride = delta.stride(); int[] algo = new int[1]; checkCudnn( cudnnSetTensor4dDescriptorEx( cudnnContext.srcTensorDesc, dataType, miniBatch, inDepth, inH, inW, srcStride[0], srcStride[1], srcStride[2], srcStride[3])); checkCudnn( cudnnSetTensor4dDescriptorEx( cudnnContext.deltaTensorDesc, dataType, miniBatch, outDepth, outH, outW, deltaStride[0], deltaStride[1], deltaStride[2], deltaStride[3])); checkCudnn( cudnnSetConvolution2dDescriptor( cudnnContext.convDesc, pad[0], pad[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION)); checkCudnn( cudnnSetFilter4dDescriptor( cudnnContext.filterDesc, dataType, tensorFormat, outDepth, inDepth, kH, kW)); checkCudnn( cudnnGetConvolutionBackwardFilterAlgorithm( cudnnContext, cudnnContext.srcTensorDesc, cudnnContext.deltaTensorDesc, cudnnContext.convDesc, cudnnContext.filterDesc, CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, algo)); INDArray epsNext = Nd4j.create(new int[] {miniBatch, inDepth, inH, inW}, 'c'); int[] dstStride = epsNext.stride(); Allocator allocator = AtomicAllocator.getInstance(); CudaContext context = allocator .getFlowController() .prepareAction(input, weights, weightGradView, biasGradView, delta, epsNext); Pointer srcData = allocator.getPointer(input, context); Pointer filterData = allocator.getPointer(weights, context); Pointer filterGradData = allocator.getPointer(weightGradView, context); Pointer biasGradData = allocator.getPointer(biasGradView, context); Pointer deltaData = allocator.getPointer(delta, context); Pointer dstData = allocator.getPointer(epsNext, context); checkCudnn(cudnnSetStream(cudnnContext, new CUstream_st(context.getOldStream()))); checkCudnn( cudnnSetTensor4dDescriptorEx( cudnnContext.dstTensorDesc, dataType, miniBatch, inDepth, inH, inW, dstStride[0], dstStride[1], dstStride[2], dstStride[3])); checkCudnn( cudnnGetConvolutionBackwardFilterWorkspaceSize( cudnnContext, cudnnContext.srcTensorDesc, cudnnContext.deltaTensorDesc, cudnnContext.convDesc, cudnnContext.filterDesc, algo[0], sizeInBytes)); long sizeInBytes1 = sizeInBytes.get(0); checkCudnn( cudnnGetConvolutionBackwardDataWorkspaceSize( cudnnContext, cudnnContext.filterDesc, cudnnContext.deltaTensorDesc, cudnnContext.convDesc, cudnnContext.dstTensorDesc, algo[0], sizeInBytes)); long sizeInBytes2 = sizeInBytes.get(0); if (sizeInBytes1 > workSpace.capacity() || sizeInBytes2 > workSpace.capacity()) { workSpace.deallocate(); workSpace = new WorkSpace(Math.max(sizeInBytes1, sizeInBytes2)); } checkCudnn( cudnnSetTensor4dDescriptor( cudnnContext.biasTensorDesc, tensorFormat, dataType, 1, outDepth, 1, 1)); checkCudnn( cudnnConvolutionBackwardBias( cudnnContext, alpha, cudnnContext.deltaTensorDesc, deltaData, beta, cudnnContext.biasTensorDesc, biasGradData)); checkCudnn( cudnnConvolutionBackwardFilter( cudnnContext, alpha, cudnnContext.srcTensorDesc, srcData, 
cudnnContext.deltaTensorDesc, deltaData, cudnnContext.convDesc, algo[0], workSpace, workSpace.capacity(), beta, cudnnContext.filterDesc, filterGradData)); checkCudnn( cudnnConvolutionBackwardData( cudnnContext, alpha, cudnnContext.filterDesc, filterData, cudnnContext.deltaTensorDesc, deltaData, cudnnContext.convDesc, algo[0], workSpace, workSpace.capacity(), beta, cudnnContext.dstTensorDesc, dstData)); allocator.registerAction(context, input, weights, weightGradView, biasGradView, delta, epsNext); Gradient retGradient = new DefaultGradient(); retGradient.setGradientFor(ConvolutionParamInitializer.BIAS_KEY, biasGradView); retGradient.setGradientFor(ConvolutionParamInitializer.WEIGHT_KEY, weightGradView, 'c'); return new Pair<>(retGradient, epsNext); }