Example #1
 @Before
 public void beforeDo() {
   weightGradient = Nd4j.ones(nIn, nOut);
   biasGradient = Nd4j.ones(1, nOut);
   gradient.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient.dup());
   gradient.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient.dup());
 }
Example #2
  @Test
  public void testNoOpUpdater() {
    Random r = new Random(12345L);
    double lr = 0.5;

    NeuralNetConfiguration conf =
        new NeuralNetConfiguration.Builder()
            .learningRate(lr)
            .layer(
                new DenseLayer.Builder()
                    .nIn(nIn)
                    .nOut(nOut)
                    .updater(org.deeplearning4j.nn.conf.Updater.NONE)
                    .build())
            .build();

    int numParams = LayerFactories.getFactory(conf).initializer().numParams(conf, true);
    INDArray params = Nd4j.create(1, numParams);
    Layer layer = LayerFactories.getFactory(conf).create(conf, null, 0, params, true);
    Updater updater = UpdaterCreator.getUpdater(layer);

    for (int i = 0; i < weightGradient.length(); i++) weightGradient.putScalar(i, r.nextDouble());
    for (int i = 0; i < biasGradient.length(); i++) biasGradient.putScalar(i, r.nextDouble());

    gradient.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, weightGradient);
    gradient.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, biasGradient);

    updater.update(layer, gradient, -1, 1);

    INDArray weightGradActual = gradient.getGradientFor(DefaultParamInitializer.WEIGHT_KEY);
    INDArray biasGradActual = gradient.getGradientFor(DefaultParamInitializer.BIAS_KEY);

    assertEquals(weightGradient, weightGradActual);
    assertEquals(biasGradient, biasGradActual);
  }
Example #3
  @Test
  public void testSGDUpdater() {
    double lr = 0.05;

    NeuralNetConfiguration conf =
        new NeuralNetConfiguration.Builder()
            .learningRate(lr)
            .layer(
                new DenseLayer.Builder()
                    .nIn(nIn)
                    .nOut(nOut)
                    .updater(org.deeplearning4j.nn.conf.Updater.SGD)
                    .build())
            .build();

    int numParams = LayerFactories.getFactory(conf).initializer().numParams(conf, true);
    INDArray params = Nd4j.create(1, numParams);
    Layer layer = LayerFactories.getFactory(conf).create(conf, null, 0, params, true);
    Updater updater = UpdaterCreator.getUpdater(layer);

    updater.update(layer, gradient, -1, 1);

    Gradient gradientDup = new DefaultGradient();
    gradientDup.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient.dup());
    gradientDup.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient.dup());

    for (Map.Entry<String, INDArray> entry : gradientDup.gradientForVariable().entrySet()) {
      INDArray val = entry.getValue();
      INDArray gradExpected = val.mul(lr);
      assertEquals(gradExpected, gradient.getGradientFor(entry.getKey()));
    }
    assertEquals(lr, layer.conf().getLayer().getLearningRate(), 1e-4);
  }
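
The check in the loop above reduces to elementwise scaling by the learning rate. A minimal sketch of that arithmetic with plain arrays (illustrative only, not the DL4J API; the method name is made up):

  // Hedged sketch: the SGD-adjusted gradient the test expects, gradExpected = g * lr.
  static double[] sgdAdjustedGradient(double[] gradient, double lr) {
    double[] adjusted = new double[gradient.length];
    for (int i = 0; i < gradient.length; i++) {
      adjusted[i] = lr * gradient[i];
    }
    return adjusted;
  }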
Example #4
  private Gradient createPrevGradient() {
    Gradient gradient = new DefaultGradient();
    INDArray pseudoGradients = Nd4j.ones(nExamples, nChannelsIn, featureMapHeight, featureMapWidth);

    gradient.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, pseudoGradients);
    gradient.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, pseudoGradients);
    return gradient;
  }
Example #5
 /**
  * Contrastive divergence approximates the log likelihood around x1 (the input) by repeated
  * sampling. Given an energy-based model, the higher k is (the more we sample the model), the
  * more we lower the energy of the model (increase its likelihood) and lower the likelihood
  * (raise the energy) of the hidden samples.
  *
  * <p>Other insight: CD-k involves keeping the first k samples of a Gibbs sampling of the model.
  */
 public void contrastiveDivergence() {
   Gradient gradient = gradient();
   getParam(PretrainParamInitializer.VISIBLE_BIAS_KEY)
       .subi(gradient.gradientForVariable().get(PretrainParamInitializer.VISIBLE_BIAS_KEY));
   getParam(PretrainParamInitializer.BIAS_KEY)
       .subi(gradient.gradientForVariable().get(PretrainParamInitializer.BIAS_KEY));
   getParam(PretrainParamInitializer.WEIGHT_KEY)
       .subi(gradient.gradientForVariable().get(PretrainParamInitializer.WEIGHT_KEY));
 }
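
As a rough single-example illustration of what gradient() is expected to produce here, the CD weight gradient is the difference between the data-driven (positive) statistics and the model-driven (negative) statistics after k Gibbs steps. A plain-array sketch (illustrative only; v0/h0 are hypothetical positive-phase visible/hidden vectors, vk/hk the negative-phase ones):

  // Hedged sketch: CD weight gradient ~ outer(v0, h0) - outer(vk, hk).
  static double[][] cdWeightGradient(double[] v0, double[] h0, double[] vk, double[] hk) {
    double[][] grad = new double[v0.length][h0.length];
    for (int i = 0; i < v0.length; i++) {
      for (int j = 0; j < h0.length; j++) {
        grad[i][j] = v0[i] * h0[j] - vk[i] * hk[j];
      }
    }
    return grad;
  }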
Example #6
 @Override
 public void update(Layer layer, Gradient gradient, int iteration) {
   preApply(layer, gradient, iteration);
   for (Map.Entry<String, INDArray> gradientPair : gradient.gradientForVariable().entrySet()) {
     GradientUpdater updater = init(gradientPair.getKey(), gradientPair.getValue(), layer);
     INDArray gradient2 = updater.getGradient(gradientPair.getValue(), iteration);
     postApply(layer, gradient2, gradientPair.getKey());
     gradient.setGradientFor(gradientPair.getKey(), gradient2);
   }
 }
Example #7
  private Pair<Gradient, INDArray> getGradientsAndDelta(INDArray output) {
    INDArray labelsSubOut = labels.sub(output);
    Gradient gradient = new DefaultGradient();
    gradient.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, labelsSubOut.sum(0));

    switch (conf.getLossFunction()) {
      case MCXENT: // cross-entropy (multi-class, with one-hot encoding)
        gradient
            .gradientForVariable()
            .put(DefaultParamInitializer.WEIGHT_KEY, input.transpose().mmul(labelsSubOut));
        return new Pair<>(gradient, labelsSubOut);
      case XENT: // cross-entropy (single binary output variable)
        gradient
            .gradientForVariable()
            .put(
                DefaultParamInitializer.WEIGHT_KEY,
                input.transpose().mmul(labelsSubOut.div(output.mul(output.rsub(1)))));
        return new Pair<>(gradient, labelsSubOut);

      case MSE: // mean squared error
        gradient
            .gradientForVariable()
            .put(DefaultParamInitializer.WEIGHT_KEY, input.transpose().mmul(labelsSubOut.neg()));
        return new Pair<>(gradient, labelsSubOut);

      case EXPLL: // exponential logarithmic
        gradient
            .gradientForVariable()
            .put(
                DefaultParamInitializer.WEIGHT_KEY,
                input.transpose().mmul(labels.rsub(1).divi(output)));
        return new Pair<>(gradient, labelsSubOut);

      case RMSE_XENT: // root mean squared error cross entropy
        INDArray squaredrmseXentDiff = pow(labelsSubOut, 2.0);
        INDArray sqrt = sqrt(squaredrmseXentDiff);
        gradient
            .gradientForVariable()
            .put(DefaultParamInitializer.WEIGHT_KEY, input.transpose().mmul(sqrt));
        return new Pair<>(gradient, labelsSubOut);

      case SQUARED_LOSS:
        gradient
            .gradientForVariable()
            .put(
                DefaultParamInitializer.WEIGHT_KEY,
                input.transpose().mmul(input.transpose().mmul(pow(labelsSubOut, 2))));
        return new Pair<>(gradient, labelsSubOut);

      case NEGATIVELOGLIKELIHOOD: // multi-class cross-entropy
        gradient
            .gradientForVariable()
            .put(DefaultParamInitializer.WEIGHT_KEY, input.transpose().mmul(labelsSubOut));
        return new Pair<>(gradient, labelsSubOut);
      default:
        throw new IllegalStateException("Invalid loss function: " + conf.getLossFunction());
    }
  }
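
Each branch above builds the weight gradient as input^T times a per-output error term; only that error term changes with the loss function. For MCXENT the term is simply (labels - output). A plain-array sketch of that case (illustrative only, not the layer's API):

  // Hedged sketch: weight gradient = input^T * (labels - output), summed over the minibatch.
  static double[][] mcxentWeightGradient(double[][] input, double[][] labels, double[][] output) {
    int nIn = input[0].length, nOut = labels[0].length;
    double[][] grad = new double[nIn][nOut];
    for (int ex = 0; ex < input.length; ex++) {
      for (int i = 0; i < nIn; i++) {
        for (int j = 0; j < nOut; j++) {
          grad[i][j] += input[ex][i] * (labels[ex][j] - output[ex][j]);
        }
      }
    }
    return grad;
  }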
Example #8
  @Test
  public void testRMSPropUpdater() {
    double lr = 0.01;
    double rmsDecay = 0.25;
    Map<String, INDArray> lastG = new HashMap<>();

    NeuralNetConfiguration conf =
        new NeuralNetConfiguration.Builder()
            .learningRate(lr)
            .rmsDecay(rmsDecay)
            .layer(
                new DenseLayer.Builder()
                    .nIn(nIn)
                    .nOut(nOut)
                    .updater(org.deeplearning4j.nn.conf.Updater.RMSPROP)
                    .build())
            .build();

    int numParams = LayerFactories.getFactory(conf).initializer().numParams(conf, true);
    INDArray params = Nd4j.create(1, numParams);
    Layer layer = LayerFactories.getFactory(conf).create(conf, null, 0, params, true);
    Updater updater = UpdaterCreator.getUpdater(layer);
    int updaterStateSize = updater.stateSizeForLayer(layer);
    INDArray updaterState = Nd4j.create(1, updaterStateSize);
    updater.setStateViewArray(layer, updaterState, true);

    updater.update(layer, gradient, -1, 1);

    Gradient gradientDup = new DefaultGradient();
    gradientDup.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient.dup());
    gradientDup.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient.dup());

    for (Map.Entry<String, INDArray> entry : gradientDup.gradientForVariable().entrySet()) {
      String key = entry.getKey();
      INDArray val = entry.getValue();
      INDArray lastGTmp = lastG.get(key);

      if (lastGTmp == null) lastGTmp = Nd4j.zeros(val.shape());

      lastGTmp.muli(rmsDecay).addi(val.mul(val).muli(1 - rmsDecay));
      INDArray gradExpected = val.mul(lr).div(Transforms.sqrt(lastGTmp.add(Nd4j.EPS_THRESHOLD)));

      assertEquals(gradExpected, gradient.getGradientFor(entry.getKey()));
      lastG.put(key, lastGTmp);
    }
    assertEquals(rmsDecay, layer.conf().getLayer().getRmsDecay(), 1e-4);
  }
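
The expected value computed in the loop is the usual RMSProp rule: a decayed cache of squared gradients scales the step. A plain-array sketch of the same arithmetic (illustrative only; eps stands in for Nd4j.EPS_THRESHOLD):

  // Hedged sketch: cache = decay*cache + (1-decay)*g^2; adjusted = lr * g / sqrt(cache + eps).
  static double[] rmsPropStep(double[] g, double[] cache, double lr, double rmsDecay, double eps) {
    double[] adjusted = new double[g.length];
    for (int i = 0; i < g.length; i++) {
      cache[i] = rmsDecay * cache[i] + (1 - rmsDecay) * g[i] * g[i];
      adjusted[i] = lr * g[i] / Math.sqrt(cache[i] + eps);
    }
    return adjusted;
  }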
Example #9
  @Test
  public void testAdamUpdater() {
    INDArray m, v;
    double lr = 0.01;
    int iteration = 0;
    double beta1 = 0.8;
    double beta2 = 0.888;

    NeuralNetConfiguration conf =
        new NeuralNetConfiguration.Builder()
            .learningRate(lr)
            .iterations(iteration)
            .adamMeanDecay(beta1)
            .adamVarDecay(beta2)
            .layer(
                new DenseLayer.Builder()
                    .nIn(nIn)
                    .nOut(nOut)
                    .updater(org.deeplearning4j.nn.conf.Updater.ADAM)
                    .build())
            .build();

    int numParams = LayerFactories.getFactory(conf).initializer().numParams(conf, true);
    INDArray params = Nd4j.create(1, numParams);
    Layer layer = LayerFactories.getFactory(conf).create(conf, null, 0, params, true);
    Updater updater = UpdaterCreator.getUpdater(layer);
    int updaterStateSize = updater.stateSizeForLayer(layer);
    INDArray updaterState = Nd4j.create(1, updaterStateSize);
    updater.setStateViewArray(layer, updaterState, true);

    updater.update(layer, gradient, iteration, 1);

    double beta1t = FastMath.pow(beta1, iteration);
    double beta2t = FastMath.pow(beta2, iteration);
    double alphat = lr * FastMath.sqrt(1 - beta2t) / (1 - beta1t);
    if (Double.isNaN(alphat) || alphat == 0.0) alphat = epsilon;

    Gradient gradientDup = new DefaultGradient();
    gradientDup.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient);
    gradientDup.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient);

    for (Map.Entry<String, INDArray> entry : gradientDup.gradientForVariable().entrySet()) {
      INDArray val = entry.getValue();
      m = Nd4j.zeros(val.shape());
      v = Nd4j.zeros(val.shape());

      m.muli(beta1).addi(val.mul(1.0 - beta1));
      v.muli(beta2).addi(val.mul(val).mul(1.0 - beta2));
      INDArray gradExpected = m.mul(alphat).divi(Transforms.sqrt(v).addi(epsilon));
      if (!gradExpected.equals(gradient.getGradientFor(entry.getKey()))) {
        System.out.println(Arrays.toString(gradExpected.dup().data().asFloat()));
        System.out.println(
            Arrays.toString(gradient.getGradientFor(entry.getKey()).dup().data().asFloat()));
      }
      assertEquals(gradExpected, gradient.getGradientFor(entry.getKey()));
    }

    assertEquals(beta1, layer.conf().getLayer().getAdamMeanDecay(), 1e-4);
    assertEquals(beta2, layer.conf().getLayer().getAdamVarDecay(), 1e-4);
  }
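
The loop reproduces a single Adam step from zero-initialised moments: biased first and second moment estimates, a bias-corrected step size alphat, and an elementwise scaled gradient. A plain-array sketch of the same arithmetic (illustrative only; epsilon is assumed to be a small constant defined elsewhere in the test class):

  // Hedged sketch: one Adam step with m and v starting at zero.
  static double[] adamStep(double[] g, double lr, double beta1, double beta2,
                           int iteration, double epsilon) {
    double beta1t = Math.pow(beta1, iteration);
    double beta2t = Math.pow(beta2, iteration);
    double alphat = lr * Math.sqrt(1 - beta2t) / (1 - beta1t);
    if (Double.isNaN(alphat) || alphat == 0.0) alphat = epsilon; // same guard as the test
    double[] adjusted = new double[g.length];
    for (int i = 0; i < g.length; i++) {
      double m = (1.0 - beta1) * g[i];        // beta1 * 0 + (1 - beta1) * g
      double v = (1.0 - beta2) * g[i] * g[i]; // beta2 * 0 + (1 - beta2) * g^2
      adjusted[i] = alphat * m / (Math.sqrt(v) + epsilon);
    }
    return adjusted;
  }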
Example #10
  private Gradient createPrevGradient() {
    int inputWidth = 28;
    int inputHeight = 28;
    int[] stride = new int[] {2, 2};
    int[] padding = new int[] {0, 0};
    int[] kernelSize = new int[] {9, 9};
    int nChannelsIn = 1;
    int nExamples = 5;
    int featureMapHeight = (inputHeight + padding[0] * 2 - kernelSize[0]) / stride[0] + 1;
    int featureMapWidth = (inputWidth + padding[1] * 2 - kernelSize[1]) / stride[1] + 1;

    Gradient gradient = new DefaultGradient();
    INDArray pseudoGradients = Nd4j.ones(nExamples, nChannelsIn, featureMapHeight, featureMapWidth);

    gradient.gradientForVariable().put(DefaultParamInitializer.BIAS_KEY, pseudoGradients);
    gradient.gradientForVariable().put(DefaultParamInitializer.WEIGHT_KEY, pseudoGradients);
    return gradient;
  }
Example #11
  @Test
  public void testNestorovsUpdater() {
    double lr = 1e-2;
    double mu = 0.6;
    INDArray v, vPrev;

    NeuralNetConfiguration conf =
        new NeuralNetConfiguration.Builder()
            .learningRate(lr)
            .momentum(mu)
            .layer(
                new DenseLayer.Builder()
                    .nIn(nIn)
                    .nOut(nOut)
                    .updater(org.deeplearning4j.nn.conf.Updater.NESTEROVS)
                    .build())
            .build();

    int numParams = LayerFactories.getFactory(conf).initializer().numParams(conf, true);
    INDArray params = Nd4j.create(1, numParams);
    Layer layer = LayerFactories.getFactory(conf).create(conf, null, 0, params, true);
    Updater updater = UpdaterCreator.getUpdater(layer);
    int updaterStateSize = updater.stateSizeForLayer(layer);
    INDArray updaterState = Nd4j.create(1, updaterStateSize);
    updater.setStateViewArray(layer, updaterState, true);

    updater.update(layer, gradient, -1, 1);

    Gradient gradientDup = new DefaultGradient();
    gradientDup.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient.dup());
    gradientDup.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient.dup());

    for (Map.Entry<String, INDArray> entry : gradientDup.gradientForVariable().entrySet()) {
      INDArray val = entry.getValue();
      v = Nd4j.zeros(val.shape());
      vPrev = v;
      v = vPrev.mul(mu).subi(val.mul(lr));
      INDArray gradExpected = vPrev.muli(mu).addi(v.mul(-mu - 1));

      assertEquals(gradExpected, gradient.getGradientFor(entry.getKey()));
    }

    assertEquals(mu, layer.conf().getLayer().getMomentum(), 1e-4);
  }
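
With the velocity initialised to zero, the expected value in the loop is the Nesterov momentum step exactly as the test defines it: vNew = mu*v - lr*g, followed by mu*v - (1 + mu)*vNew. A plain-array sketch of that arithmetic (illustrative only):

  // Hedged sketch: Nesterov momentum step as checked by the test; v is updated in place.
  static double[] nesterovStep(double[] g, double[] v, double lr, double mu) {
    double[] adjusted = new double[g.length];
    for (int i = 0; i < g.length; i++) {
      double vPrev = v[i];
      v[i] = mu * vPrev - lr * g[i];                // new velocity
      adjusted[i] = mu * vPrev - (1.0 + mu) * v[i]; // vPrev*mu + v*(-mu - 1)
    }
    return adjusted;
  }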
Example #12
  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;

    ModelAndGradient that = (ModelAndGradient) o;

    if (gradient != null ? !gradient.equals(that.gradient) : that.gradient != null) return false;
    return !(model != null ? !model.equals(that.model) : that.model != null);
  }
Example #13
  @Test
  public void testAdaGradUpdater() {
    double lr = 1e-2;

    NeuralNetConfiguration conf =
        new NeuralNetConfiguration.Builder()
            .learningRate(lr)
            .layer(
                new DenseLayer.Builder()
                    .nIn(nIn)
                    .nOut(nOut)
                    .updater(org.deeplearning4j.nn.conf.Updater.ADAGRAD)
                    .build())
            .build();

    int numParams = LayerFactories.getFactory(conf).initializer().numParams(conf, true);
    INDArray params = Nd4j.create(1, numParams);
    Layer layer = LayerFactories.getFactory(conf).create(conf, null, 0, params, true);
    Updater updater = UpdaterCreator.getUpdater(layer);
    int updaterStateSize = updater.stateSizeForLayer(layer);
    INDArray updaterState = Nd4j.create(1, updaterStateSize);
    updater.setStateViewArray(layer, updaterState, true);

    updater.update(layer, gradient, -1, 1);

    Gradient gradientDup = new DefaultGradient();
    gradientDup.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient);
    gradientDup.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient);

    for (Map.Entry<String, INDArray> entry : gradientDup.gradientForVariable().entrySet()) {
      INDArray val = entry.getValue();
      INDArray gradExpected = Transforms.sqrt(val.mul(val).add(epsilon)).rdiv(lr).mul(val);
      assertEquals(gradExpected, gradient.getGradientFor(entry.getKey()));
    }
    assertEquals(lr, layer.conf().getLayer().getLearningRate(), 1e-4);
  }
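
The expected value corresponds to AdaGrad after a single step: adjusted = lr * g / sqrt(g^2 + epsilon). A plain-array sketch that also accumulates the squared-gradient history across calls (illustrative only; epsilon is assumed to be a test-class constant):

  // Hedged sketch: AdaGrad with an accumulated squared-gradient history.
  static double[] adaGradStep(double[] g, double[] history, double lr, double eps) {
    double[] adjusted = new double[g.length];
    for (int i = 0; i < g.length; i++) {
      history[i] += g[i] * g[i];
      adjusted[i] = lr / Math.sqrt(history[i] + eps) * g[i];
    }
    return adjusted;
  }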
Example #14
  @Override
  public void computeGradientAndScore() {
    int k = layerConf().getK();

    // POSITIVE PHASE
    Pair<INDArray, INDArray> probHidden = sampleHiddenGivenVisible(input());

    /*
     * Start the gibbs sampling.
     */
    INDArray chainStart = probHidden.getSecond();

    /*
     * Note that at a later date, we can explore alternative methods of
     * storing the chain transitions for different kinds of sampling
     * and exploring the search space.
     */
    Pair<Pair<INDArray, INDArray>, Pair<INDArray, INDArray>> matrices;
    // negative visible means or expected values
    INDArray nvMeans = null;
    // negative value samples
    INDArray nvSamples = null;
    // negative hidden means or expected values
    INDArray nhMeans = null;
    // negative hidden samples
    INDArray nhSamples = null;

    /*
     * K steps of Gibbs sampling. Running the chain here is the negative phase of contrastive
     * divergence (the positive phase was computed above from the input).
     *
     * Four matrices are computed for each Gibbs step: the negative visible and hidden samples
     * and their expected values (means).
     */

    for (int i = 0; i < k; i++) {

      // NEGATIVE PHASE
      if (i == 0) matrices = gibbhVh(chainStart);
      else matrices = gibbhVh(nhSamples);

      // get the cost updates for sampling in the chain after k iterations
      nvMeans = matrices.getFirst().getFirst();
      nvSamples = matrices.getFirst().getSecond();
      nhMeans = matrices.getSecond().getFirst();
      nhSamples = matrices.getSecond().getSecond();
    }

    /*
     * Update gradient parameters
     */
    INDArray wGradient =
        input().transposei().mmul(probHidden.getSecond()).subi(nvSamples.transpose().mmul(nhMeans));

    INDArray hBiasGradient;

    if (layerConf().getSparsity() != 0)
      // all hidden units must stay around this number
      hBiasGradient = probHidden.getSecond().rsub(layerConf().getSparsity()).sum(0);
    else
      // update rule: expected value of the hidden units given the data, minus the negative-phase
      // hidden means
      hBiasGradient = probHidden.getSecond().sub(nhMeans).sum(0);

    // update rule: the input minus the negative-phase visible samples
    INDArray delta = input.sub(nvSamples);
    INDArray vBiasGradient = delta.sum(0);

    Gradient ret = new DefaultGradient();
    ret.gradientForVariable().put(PretrainParamInitializer.VISIBLE_BIAS_KEY, vBiasGradient);
    ret.gradientForVariable().put(PretrainParamInitializer.BIAS_KEY, hBiasGradient);
    ret.gradientForVariable().put(PretrainParamInitializer.WEIGHT_KEY, wGradient);
    gradient = ret;
    setScoreWithZ(delta);
  }
Example #15
  /** Apply gradient normalization: scale based on L2, clipping etc. */
  public void preApply(Layer layer, Gradient gradient, int iteration) {
    GradientNormalization normalization = layer.conf().getLayer().getGradientNormalization();
    if (normalization == null || normalization == GradientNormalization.None) return; // no op

    final double threshold = layer.conf().getLayer().getGradientNormalizationThreshold();

    switch (normalization) {
      case RenormalizeL2PerLayer:
        double sumSquares = 0.0;
        for (INDArray g : gradient.gradientForVariable().values()) {
          double l2 = g.norm2Number().doubleValue();
          // l2 norm: sqrt(sum_i g_i^2)
          sumSquares += l2 * l2;
        }
        double layerL2 = FastMath.sqrt(sumSquares);
        for (INDArray g : gradient.gradientForVariable().values()) {
          g.divi(layerL2);
        }
        break;
      case RenormalizeL2PerParamType:
        for (INDArray g : gradient.gradientForVariable().values()) {
          double l2 =
              Nd4j.getExecutioner().execAndReturn(new Norm2(g)).getFinalResult().doubleValue();
          g.divi(l2);
        }
        break;
      case ClipElementWiseAbsoluteValue:
        Condition absValueCondition = new AbsValueGreaterThan(threshold);
        Function<Number, Number> clipFn =
            new Function<Number, Number>() {
              @Override
              public Number apply(Number number) {
                return (number.doubleValue() > threshold ? threshold : -threshold);
              }
            };

        for (INDArray g : gradient.gradientForVariable().values()) {
          BooleanIndexing.applyWhere(g, absValueCondition, clipFn);
        }
        break;
      case ClipL2PerLayer:
        double sumSquares2 = 0.0;
        for (INDArray g : gradient.gradientForVariable().values()) {
          double l2 =
              Nd4j.getExecutioner().execAndReturn(new Norm2(g)).getFinalResult().doubleValue();
          // l2 norm: sqrt(sum_i g_i^2)
          sumSquares2 += l2 * l2;
        }
        double layerL22 = FastMath.sqrt(sumSquares2);
        if (layerL22 > threshold) {
          double scalingFactor = threshold / layerL22; // g = g / l2 * threshold ->
          for (INDArray g : gradient.gradientForVariable().values()) {
            g.muli(scalingFactor);
          }
        }
        break;
      case ClipL2PerParamType:
        for (INDArray g : gradient.gradientForVariable().values()) {
          double l2 = g.norm2Number().doubleValue();
          if (l2 > threshold) {
            double scalingFactor = l2 / threshold;
            g.divi(scalingFactor);
          }
        }
        break;
      default:
        throw new RuntimeException(
            "Unknown (or not implemented) gradient normalization strategy: " + normalization);
    }
  }
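
The two per-layer strategies both work from the combined L2 norm of all of the layer's gradient arrays: RenormalizeL2PerLayer always divides by it, while ClipL2PerLayer only rescales when the norm exceeds the threshold. A plain-array sketch of the clipping case (illustrative only; the real method operates on the INDArrays in gradientForVariable()):

  // Hedged sketch: clip the layer's combined gradient L2 norm to a threshold.
  static void clipL2PerLayer(double[][] gradients, double threshold) {
    double sumSquares = 0.0;
    for (double[] g : gradients) {
      for (double x : g) sumSquares += x * x;
    }
    double layerL2 = Math.sqrt(sumSquares);
    if (layerL2 > threshold) {
      double scalingFactor = threshold / layerL2;
      for (double[] g : gradients) {
        for (int i = 0; i < g.length; i++) g[i] *= scalingFactor;
      }
    }
  }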
Example #16
 @Override
 public int hashCode() {
   int result = gradient != null ? gradient.hashCode() : 0;
   result = 31 * result + (model != null ? model.hashCode() : 0);
   return result;
 }
Example #17
  @Override
  public Pair<Gradient, INDArray> backpropGradient(INDArray epsilon) {
    // First: Do forward pass to get gate activations etc.
    INDArray[] activations = activateHelper(true, null); // Order: {outputActivations,rucZs,rucAs}
    INDArray outputActivations = activations[0];
    INDArray rucZs = activations[1];
    INDArray rucAs = activations[2];

    INDArray inputWeights =
        getParam(GRUParamInitializer.INPUT_WEIGHT_KEY); // Shape: [n^(L-1),3*n^L], order: [wr,wu,wc]
    INDArray recurrentWeights =
        getParam(GRUParamInitializer.RECURRENT_WEIGHT_KEY); // Shape: [n^L,3*n^L]; order: [wR,wU,wC]

    int layerSize = recurrentWeights.size(0); // i.e., n^L
    int prevLayerSize = inputWeights.size(0); // n^(L-1)
    int miniBatchSize = epsilon.size(0);
    boolean is2dInput =
        epsilon.rank()
            < 3; // Edge case: T=1 may have shape [miniBatchSize,n^(L+1)], equiv. to
                 // [miniBatchSize,n^(L+1),1]
    int timeSeriesLength = (is2dInput ? 1 : epsilon.size(2));

    INDArray wr = inputWeights.get(NDArrayIndex.all(), interval(0, layerSize));
    INDArray wu = inputWeights.get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize));
    INDArray wc = inputWeights.get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize));
    INDArray wR = recurrentWeights.get(NDArrayIndex.all(), interval(0, layerSize));
    INDArray wU = recurrentWeights.get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize));
    INDArray wC = recurrentWeights.get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize));
    INDArray wRdiag = Nd4j.diag(wR).transpose();
    //		INDArray wUdiag = Nd4j.diag(wU).transpose();
    INDArray wCdiag = Nd4j.diag(wC).transpose();

    // Parameter gradients: Stores sum over each time step here
    INDArray biasGradients = Nd4j.zeros(new int[] {1, 3 * layerSize});
    INDArray inputWeightGradients = Nd4j.zeros(new int[] {prevLayerSize, 3 * layerSize});
    INDArray recurrentWeightGradients = Nd4j.zeros(new int[] {layerSize, 3 * layerSize});

    INDArray epsilonNext =
        Nd4j.zeros(
            miniBatchSize,
            prevLayerSize,
            timeSeriesLength); // i.e., what would be W^L*(delta^L)^T. Shape: [m,n^(L-1),T]

    INDArray deltaOutNext = Nd4j.zeros(miniBatchSize, layerSize);
    for (int t = timeSeriesLength - 1; t >= 0; t--) {
      INDArray prevOut =
          (t == 0
              ? Nd4j.zeros(miniBatchSize, layerSize)
              : outputActivations.tensorAlongDimension(t - 1, 1, 0)); // Shape: [m,n^L]

      INDArray aSlice = (is2dInput ? rucAs : rucAs.tensorAlongDimension(t, 1, 0));
      INDArray zSlice = (is2dInput ? rucZs : rucZs.tensorAlongDimension(t, 1, 0));
      INDArray aSliceNext;
      INDArray zSliceNext;
      if (t == timeSeriesLength - 1) {
        aSliceNext = Nd4j.zeros(miniBatchSize, 3 * layerSize);
        zSliceNext = Nd4j.zeros(miniBatchSize, 3 * layerSize);
      } else {
        aSliceNext = rucAs.tensorAlongDimension(t + 1, 1, 0);
        zSliceNext = rucZs.tensorAlongDimension(t + 1, 1, 0);
      }

      INDArray zr = zSlice.get(NDArrayIndex.all(), interval(0, layerSize));
      INDArray sigmaPrimeZr =
          Nd4j.getExecutioner()
              .execAndReturn(Nd4j.getOpFactory().createTransform("sigmoid", zr.dup()).derivative());

      INDArray epsilonSlice =
          (is2dInput
              ? epsilon
              : epsilon.tensorAlongDimension(t, 1, 0)); // (w^{L+1}*(delta^{(L+1)t})^T)^T or equiv.
      INDArray deltaOut = epsilonSlice.dup();
      if (t < timeSeriesLength - 1) {
        INDArray aOut =
            (is2dInput ? outputActivations : outputActivations.tensorAlongDimension(t, 1, 0));
        INDArray arNext = aSliceNext.get(NDArrayIndex.all(), interval(0, layerSize));
        INDArray auNext = aSliceNext.get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize));
        INDArray acNext =
            aSliceNext.get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize));
        INDArray zrNext = zSliceNext.get(NDArrayIndex.all(), interval(0, layerSize));
        INDArray zuNext = zSliceNext.get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize));
        INDArray zcNext =
            zSliceNext.get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize));

        INDArray sigmaPrimeZrNext =
            Nd4j.getExecutioner()
                .execAndReturn(
                    Nd4j.getOpFactory().createTransform("sigmoid", zrNext.dup()).derivative());
        INDArray sigmaPrimeZuNext =
            Nd4j.getExecutioner()
                .execAndReturn(
                    Nd4j.getOpFactory().createTransform("sigmoid", zuNext.dup()).derivative());
        INDArray sigmaPrimeZcNext =
            Nd4j.getExecutioner()
                .execAndReturn(
                    Nd4j.getOpFactory()
                        .createTransform(conf.getLayer().getActivationFunction(), zcNext.dup())
                        .derivative());

        deltaOut.addi(auNext.mul(deltaOutNext));
        deltaOut.addi(
            aOut.sub(acNext)
                .muli(sigmaPrimeZuNext)
                .muli(wU.mmul(deltaOutNext.transpose()).transpose()));
        deltaOut.addi(
            auNext
                .rsub(1.0)
                .muli(sigmaPrimeZcNext)
                .muli(arNext.add(aOut.mul(sigmaPrimeZrNext).muliRowVector(wRdiag)))
                .muli(wC.mmul(deltaOutNext.transpose()).transpose()));
      }

      // Delta at update gate
      INDArray zu = zSlice.get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize));
      INDArray sigmaPrimeZu =
          Nd4j.getExecutioner()
              .execAndReturn(Nd4j.getOpFactory().createTransform("sigmoid", zu.dup()).derivative());
      INDArray ac = aSlice.get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize));
      INDArray deltaU = deltaOut.mul(sigmaPrimeZu).muli(prevOut.sub(ac));

      // Delta for candidate activation
      INDArray zc = zSlice.get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize));
      INDArray sigmaPrimeZc =
          Nd4j.getExecutioner()
              .execAndReturn(
                  Nd4j.getOpFactory()
                      .createTransform(conf.getLayer().getActivationFunction(), zc.dup())
                      .derivative());
      INDArray au = aSlice.get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize));
      INDArray deltaC = deltaOut.mul(sigmaPrimeZc).muli(au.rsub(1.0));

      // Delta at reset gate
      INDArray deltaR = deltaC.mulRowVector(wCdiag).muli(prevOut).muli(sigmaPrimeZr);

      // Add input gradients for this time step:
      INDArray prevLayerActivationSlice = (is2dInput ? input : input.tensorAlongDimension(t, 1, 0));
      inputWeightGradients
          .get(NDArrayIndex.all(), interval(0, layerSize))
          .addi(deltaR.transpose().mmul(prevLayerActivationSlice).transpose());
      inputWeightGradients
          .get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize))
          .addi(deltaU.transpose().mmul(prevLayerActivationSlice).transpose());
      inputWeightGradients
          .get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize))
          .addi(deltaC.transpose().mmul(prevLayerActivationSlice).transpose());

      // Add recurrent weight gradients for this time step:
      if (t > 0) { // t=0: no previous output
        recurrentWeightGradients
            .get(NDArrayIndex.all(), interval(0, layerSize))
            .addi(deltaR.transpose().mmul(prevOut).transpose());
        recurrentWeightGradients
            .get(NDArrayIndex.all(), interval(layerSize, 2 * layerSize))
            .addi(deltaU.transpose().mmul(prevOut).transpose());
        INDArray ar = aSlice.get(NDArrayIndex.all(), interval(0, layerSize));
        recurrentWeightGradients
            .get(NDArrayIndex.all(), interval(2 * layerSize, 3 * layerSize))
            .addi(deltaC.transpose().mmul(prevOut.mul(ar)).transpose());
      }

      // Add bias gradients for this time step:
      biasGradients.get(NDArrayIndex.point(0), interval(0, layerSize)).addi(deltaR.sum(0));
      biasGradients
          .get(NDArrayIndex.point(0), interval(layerSize, 2 * layerSize))
          .addi(deltaU.sum(0));
      biasGradients
          .get(NDArrayIndex.point(0), interval(2 * layerSize, 3 * layerSize))
          .addi(deltaC.sum(0));

      INDArray epsilonNextSlice =
          wr.mmul(deltaR.transpose())
              .transpose()
              .addi(wu.mmul(deltaU.transpose()).transpose())
              .addi(wc.mmul(deltaC.transpose()).transpose());
      epsilonNext.tensorAlongDimension(t, 1, 0).assign(epsilonNextSlice);

      deltaOutNext = deltaOut;
    }

    Gradient g = new DefaultGradient();
    g.setGradientFor(GRUParamInitializer.INPUT_WEIGHT_KEY, inputWeightGradients);
    g.setGradientFor(GRUParamInitializer.RECURRENT_WEIGHT_KEY, recurrentWeightGradients);
    g.setGradientFor(GRUParamInitializer.BIAS_KEY, biasGradients);

    return new Pair<>(g, epsilonNext);
  }
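
The deltas above follow from the forward pass implied by the gate layout [r, u, c]: the reset and update gates are sigmoids of the input and previous output, the candidate applies the layer's activation to the reset-gated previous output, and the new state interpolates between the previous output and the candidate. A single-unit sketch of that forward step (a hedged reconstruction from the backprop code; names are illustrative, and tanh stands in for the configured activation):

  // Hedged sketch: one GRU step for a single unit, gate order [reset, update, candidate].
  static double gruStep(double x, double hPrev,
                        double wr, double wRr, double br,   // reset gate parameters
                        double wu, double wRu, double bu,   // update gate parameters
                        double wc, double wRc, double bc) { // candidate parameters
    double r = sigmoid(wr * x + wRr * hPrev + br);
    double u = sigmoid(wu * x + wRu * hPrev + bu);
    double c = Math.tanh(wc * x + wRc * (r * hPrev) + bc);
    return u * hPrev + (1.0 - u) * c;
  }

  static double sigmoid(double z) {
    return 1.0 / (1.0 + Math.exp(-z));
  }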
Example #18
  @Test
  public void testAdaDeltaUpdate() {
    INDArray dxSquared;
    Map<String, INDArray> msg = new HashMap<>();
    Map<String, INDArray> msdx = new HashMap<>();

    double rho = 0.85;

    NeuralNetConfiguration conf =
        new NeuralNetConfiguration.Builder()
            .rho(rho)
            .layer(
                new DenseLayer.Builder()
                    .nIn(nIn)
                    .nOut(nOut)
                    .updater(org.deeplearning4j.nn.conf.Updater.ADADELTA)
                    .build())
            .build();

    int numParams = LayerFactories.getFactory(conf).initializer().numParams(conf, true);
    INDArray params = Nd4j.create(1, numParams);
    Layer layer = LayerFactories.getFactory(conf).create(conf, null, 0, params, true);
    Updater updater = UpdaterCreator.getUpdater(layer);
    int updaterStateSize = updater.stateSizeForLayer(layer);
    INDArray updaterState = Nd4j.create(1, updaterStateSize);
    updater.setStateViewArray(layer, updaterState, true);

    Gradient gradientDup = new DefaultGradient();
    gradientDup.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, weightGradient.dup());
    gradientDup.setGradientFor(DefaultParamInitializer.BIAS_KEY, biasGradient.dup());

    for (int i = 0; i < 2; i++) {
      updater.update(layer, gradient, i, 1);

      // calculations for one iteration / update

      for (Map.Entry<String, INDArray> entry : gradientDup.gradientForVariable().entrySet()) {
        String key = entry.getKey();
        INDArray val = entry.getValue();
        INDArray msgTmp = msg.get(key);
        INDArray msdxTmp = msdx.get(key);

        if (msgTmp == null) {
          msgTmp = Nd4j.zeros(val.shape());
          msdxTmp = Nd4j.zeros(val.shape());
        }

        msgTmp.muli(rho);
        msgTmp.addi(1 - rho).muli(val.mul(val));

        INDArray gradExpected =
            Transforms.sqrt(msdxTmp.add(Nd4j.EPS_THRESHOLD))
                .divi(Transforms.sqrt(msgTmp.add(Nd4j.EPS_THRESHOLD)))
                .muli(val);
        gradientDup.setGradientFor(key, gradExpected);
        assertEquals(gradExpected, gradient.getGradientFor(entry.getKey()));

        msdxTmp.muli(rho);
        dxSquared = gradExpected.mul(gradExpected);
        msdxTmp.addi(dxSquared.muli(1 - rho));

        msg.put(key, msgTmp);
        msdx.put(key, msdxTmp);
      }
      assertEquals(rho, layer.conf().getLayer().getRho(), 1e-4);
    }
  }
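
The loop is modelled on the AdaDelta rule: two decayed accumulators, one of squared gradients (msg) and one of squared updates (msdx), with the step given by the ratio of their square roots. A plain-array sketch of that rule for a single element (illustrative only; eps stands in for Nd4j.EPS_THRESHOLD):

  // Hedged sketch: one AdaDelta step for element i, updating both accumulators in place.
  static double adaDeltaStep(double g, double[] msg, double[] msdx, int i, double rho, double eps) {
    msg[i] = rho * msg[i] + (1 - rho) * g * g;                          // decayed squared gradients
    double dx = Math.sqrt(msdx[i] + eps) / Math.sqrt(msg[i] + eps) * g; // adjusted update
    msdx[i] = rho * msdx[i] + (1 - rho) * dx * dx;                      // decayed squared updates
    return dx;
  }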
Example #19
  @Test
  public void testMultiLayerUpdater() throws Exception {
    Nd4j.getRandom().setSeed(12345L);
    double lr = 0.03;

    MultiLayerConfiguration conf =
        new NeuralNetConfiguration.Builder()
            .learningRate(lr)
            .momentum(0.6)
            .list()
            .layer(
                0,
                new DenseLayer.Builder()
                    .nIn(4)
                    .nOut(5)
                    .updater(org.deeplearning4j.nn.conf.Updater.SGD)
                    .build())
            .layer(
                1,
                new DenseLayer.Builder()
                    .nIn(5)
                    .nOut(6)
                    .updater(org.deeplearning4j.nn.conf.Updater.NONE)
                    .build())
            .layer(
                2,
                new DenseLayer.Builder()
                    .nIn(6)
                    .nOut(7)
                    .updater(org.deeplearning4j.nn.conf.Updater.ADAGRAD)
                    .build())
            .layer(
                3,
                new DenseLayer.Builder()
                    .nIn(7)
                    .nOut(8)
                    .updater(org.deeplearning4j.nn.conf.Updater.NESTEROVS)
                    .build())
            .build();

    MultiLayerNetwork net = new MultiLayerNetwork(conf);
    net.init();

    Updater updater = UpdaterCreator.getUpdater(net);
    assertNotNull(updater);
    assertTrue(updater.getClass() == MultiLayerUpdater.class);

    Field f = MultiLayerUpdater.class.getDeclaredField("layerUpdaters");
    f.setAccessible(true);
    Updater[] updaters = (Updater[]) f.get(updater);
    assertNotNull(updaters);
    assertTrue(updaters.length == net.getnLayers());
    assertTrue(updaters[0] instanceof SgdUpdater);
    assertTrue(updaters[1] instanceof NoOpUpdater);
    assertTrue(updaters[2] instanceof AdaGradUpdater);
    assertTrue(updaters[3] instanceof NesterovsUpdater);

    Updater[] uArr = new Updater[4];
    uArr[0] = new SgdUpdater();
    uArr[1] = new NoOpUpdater();
    uArr[2] = new AdaGradUpdater();
    int updaterStateSize = uArr[2].stateSizeForLayer(net.getLayer(2));
    INDArray updaterState = Nd4j.create(1, updaterStateSize);
    uArr[2].setStateViewArray(net.getLayer(2), updaterState, true);

    uArr[3] = new NesterovsUpdater();
    updaterStateSize = uArr[3].stateSizeForLayer(net.getLayer(3));
    updaterState = Nd4j.create(1, updaterStateSize);
    uArr[3].setStateViewArray(net.getLayer(3), updaterState, true);

    int[] nIns = {4, 5, 6, 7};
    int[] nOuts = {5, 6, 7, 8};

    for (int i = 0; i < 5; i++) {
      Gradient gradient = new DefaultGradient();
      Map<String, INDArray> expectedGradient = new LinkedHashMap<>();

      for (int j = 0; j < net.getnLayers(); j++) {
        // Generate test gradient:
        INDArray wGrad = Nd4j.rand(1, nIns[j] * nOuts[j]);
        INDArray bGrad = Nd4j.rand(1, nOuts[j]);

        String wKey = j + "_" + DefaultParamInitializer.WEIGHT_KEY;
        String bKey = j + "_" + DefaultParamInitializer.BIAS_KEY;

        gradient.setGradientFor(wKey, wGrad);
        gradient.setGradientFor(bKey, bGrad);

        // Also put copy of gradient through separate layer updaters to compare
        Gradient layerGradient = new DefaultGradient();
        layerGradient.setGradientFor(DefaultParamInitializer.WEIGHT_KEY, wGrad.dup());
        layerGradient.setGradientFor(DefaultParamInitializer.BIAS_KEY, bGrad.dup());

        uArr[j].update(net.getLayer(j), layerGradient, i, 1);
        for (String s : layerGradient.gradientForVariable().keySet()) {
          expectedGradient.put(j + "_" + s, layerGradient.getGradientFor(s));
        }
      }

      updater.update(net, gradient, i, 1);
      assertEquals(gradient.gradientForVariable(), expectedGradient);
    }
  }
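
The flattened gradient used by MultiLayerUpdater keys each array by layer index plus parameter name, which is what the final assertEquals relies on when comparing against the per-layer results. A tiny sketch of that key convention (illustrative helper, not part of the API):

  // Hedged sketch: layer-prefixed gradient keys, e.g. "0_W", "0_b", "1_W", ...
  static String layerParamKey(int layerIndex, String paramKey) {
    return layerIndex + "_" + paramKey;
  }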
Example #20
  @Override
  public Pair<Gradient, INDArray> backpropGradient(
      INDArray input,
      INDArray weights,
      INDArray delta,
      int[] kernel,
      int[] strides,
      int[] pad,
      INDArray biasGradView,
      INDArray weightGradView,
      String afn) {
    int miniBatch = input.size(0);
    int inH = input.size(2);
    int inW = input.size(3);

    int outDepth = weights.size(0);
    int inDepth = weights.size(1);
    int kH = weights.size(2);
    int kW = weights.size(3);

    int outH = Convolution.outSize(inH, kernel[0], strides[0], pad[0], false);
    int outW = Convolution.outSize(inW, kernel[1], strides[1], pad[1], false);

    if (!Shape.strideDescendingCAscendingF(delta)) {
      // apparently not supported by cuDNN
      delta = delta.dup();
    }

    int[] srcStride = input.stride();
    int[] deltaStride = delta.stride();
    int[] algo = new int[1];
    checkCudnn(
        cudnnSetTensor4dDescriptorEx(
            cudnnContext.srcTensorDesc,
            dataType,
            miniBatch,
            inDepth,
            inH,
            inW,
            srcStride[0],
            srcStride[1],
            srcStride[2],
            srcStride[3]));
    checkCudnn(
        cudnnSetTensor4dDescriptorEx(
            cudnnContext.deltaTensorDesc,
            dataType,
            miniBatch,
            outDepth,
            outH,
            outW,
            deltaStride[0],
            deltaStride[1],
            deltaStride[2],
            deltaStride[3]));
    checkCudnn(
        cudnnSetConvolution2dDescriptor(
            cudnnContext.convDesc,
            pad[0],
            pad[1],
            strides[0],
            strides[1],
            1,
            1,
            CUDNN_CROSS_CORRELATION));
    checkCudnn(
        cudnnSetFilter4dDescriptor(
            cudnnContext.filterDesc, dataType, tensorFormat, outDepth, inDepth, kH, kW));
    checkCudnn(
        cudnnGetConvolutionBackwardFilterAlgorithm(
            cudnnContext,
            cudnnContext.srcTensorDesc,
            cudnnContext.deltaTensorDesc,
            cudnnContext.convDesc,
            cudnnContext.filterDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
            0,
            algo));

    INDArray epsNext = Nd4j.create(new int[] {miniBatch, inDepth, inH, inW}, 'c');
    int[] dstStride = epsNext.stride();

    Allocator allocator = AtomicAllocator.getInstance();
    CudaContext context =
        allocator
            .getFlowController()
            .prepareAction(input, weights, weightGradView, biasGradView, delta, epsNext);
    Pointer srcData = allocator.getPointer(input, context);
    Pointer filterData = allocator.getPointer(weights, context);
    Pointer filterGradData = allocator.getPointer(weightGradView, context);
    Pointer biasGradData = allocator.getPointer(biasGradView, context);
    Pointer deltaData = allocator.getPointer(delta, context);
    Pointer dstData = allocator.getPointer(epsNext, context);

    checkCudnn(cudnnSetStream(cudnnContext, new CUstream_st(context.getOldStream())));
    checkCudnn(
        cudnnSetTensor4dDescriptorEx(
            cudnnContext.dstTensorDesc,
            dataType,
            miniBatch,
            inDepth,
            inH,
            inW,
            dstStride[0],
            dstStride[1],
            dstStride[2],
            dstStride[3]));
    checkCudnn(
        cudnnGetConvolutionBackwardFilterWorkspaceSize(
            cudnnContext,
            cudnnContext.srcTensorDesc,
            cudnnContext.deltaTensorDesc,
            cudnnContext.convDesc,
            cudnnContext.filterDesc,
            algo[0],
            sizeInBytes));
    long sizeInBytes1 = sizeInBytes.get(0);
    checkCudnn(
        cudnnGetConvolutionBackwardDataWorkspaceSize(
            cudnnContext,
            cudnnContext.filterDesc,
            cudnnContext.deltaTensorDesc,
            cudnnContext.convDesc,
            cudnnContext.dstTensorDesc,
            algo[0],
            sizeInBytes));
    long sizeInBytes2 = sizeInBytes.get(0);
    if (sizeInBytes1 > workSpace.capacity() || sizeInBytes2 > workSpace.capacity()) {
      workSpace.deallocate();
      workSpace = new WorkSpace(Math.max(sizeInBytes1, sizeInBytes2));
    }

    checkCudnn(
        cudnnSetTensor4dDescriptor(
            cudnnContext.biasTensorDesc, tensorFormat, dataType, 1, outDepth, 1, 1));
    checkCudnn(
        cudnnConvolutionBackwardBias(
            cudnnContext,
            alpha,
            cudnnContext.deltaTensorDesc,
            deltaData,
            beta,
            cudnnContext.biasTensorDesc,
            biasGradData));
    checkCudnn(
        cudnnConvolutionBackwardFilter(
            cudnnContext,
            alpha,
            cudnnContext.srcTensorDesc,
            srcData,
            cudnnContext.deltaTensorDesc,
            deltaData,
            cudnnContext.convDesc,
            algo[0],
            workSpace,
            workSpace.capacity(),
            beta,
            cudnnContext.filterDesc,
            filterGradData));
    checkCudnn(
        cudnnConvolutionBackwardData(
            cudnnContext,
            alpha,
            cudnnContext.filterDesc,
            filterData,
            cudnnContext.deltaTensorDesc,
            deltaData,
            cudnnContext.convDesc,
            algo[0],
            workSpace,
            workSpace.capacity(),
            beta,
            cudnnContext.dstTensorDesc,
            dstData));

    allocator.registerAction(context, input, weights, weightGradView, biasGradView, delta, epsNext);

    Gradient retGradient = new DefaultGradient();
    retGradient.setGradientFor(ConvolutionParamInitializer.BIAS_KEY, biasGradView);
    retGradient.setGradientFor(ConvolutionParamInitializer.WEIGHT_KEY, weightGradView, 'c');

    return new Pair<>(retGradient, epsNext);
  }