Exemple #1
   * Gibbs sampling step: hidden ---> visible ---> hidden
   * @param h the hidden input
   * @return a pair of (expected values, sample) pairs: first for the visible units given the
   *     hidden input, then for the hidden units given the sampled visible units
  public Pair<Pair<INDArray, INDArray>, Pair<INDArray, INDArray>> gibbhVh(INDArray h) {
    // hidden -> visible: the second element of the returned pair is used below as the sample
    Pair<INDArray, INDArray> v1MeanAndSample = sampleVisibleGivenHidden(h);
    INDArray vSample = v1MeanAndSample.getSecond();

    // visible sample -> hidden
    Pair<INDArray, INDArray> h1MeanAndSample = sampleHiddenGivenVisible(vSample);
    // outer pair: first = visible result, second = hidden result
    return new Pair<>(v1MeanAndSample, h1MeanAndSample);
  /**
   * Loads word vectors from the classpath resource {@code words.txt}, collects the vocabulary
   * words into a list, and hands the weight matrix, the word labels and an output CSV path to
   * {@code BarnesHutTsne#plot}.
   *
   * @param args not read in the visible lines
   * @throws Exception declared broadly; which calls actually throw is not visible here
   */
  public static void main(String[] args) throws Exception {
    // NOTE(review): not referenced in the visible lines; the builder chain below looks truncated
    int iterations = 100;
    // switch the global ND4J data type to double before any arrays are loaded
    Nd4j.dtype = DataBuffer.Type.DOUBLE;
    // receives one entry per vocabulary word, in index order (filled by the loop below)
    List<String> cacheList = new ArrayList<>();

    log.info("Load & Vectorize data....");
    File wordFile = new ClassPathResource("words.txt").getFile();
    // first = lookup table (source of the weight matrix), second = vocabulary cache
    Pair<InMemoryLookupTable, VocabCache> vectors = WordVectorSerializer.loadTxt(wordFile);
    VocabCache cache = vectors.getSecond();
    INDArray weights = vectors.getFirst().getSyn0();

    for (int i = 0; i < cache.numWords(); i++) cacheList.add(cache.wordAtIndex(i));

    log.info("Build model....");
    BarnesHutTsne tsne =
        new BarnesHutTsne.Builder()

    log.info("Store TSNE Coordinates for Plotting....");
    String outputFile = "target/archive-tmp/tsne-standard-coords.csv";
    // make sure the target directory exists before plot() is given the path
    (new File(outputFile)).getParentFile().mkdirs();
    tsne.plot(weights, 2, cacheList, outputFile);
  /**
   * Checks that {@code backpropGradient} on the layer built by {@code getCNNConfig} returns, as
   * the pair's second element, an array with the same rank as {@code epsilon} and with
   * {@code nExamples} entries along dimension 1.
   */
  public void testSubSampleLayerNoneBackprop() throws Exception {
    // nChannelsIn, depth, kernelSize, stride, padding, epsilon and nExamples come from outside
    // this method
    Layer layer = getCNNConfig(nChannelsIn, depth, kernelSize, stride, padding);

    Pair<Gradient, INDArray> out = layer.backpropGradient(epsilon);
    assertEquals(epsilon.shape().length, out.getSecond().shape().length);
    // NOTE(review): the trailing comment mentions depth but the expected value is nExamples --
    // confirm which one dimension 1 is supposed to hold
    assertEquals(nExamples, out.getSecond().size(1)); // depth retained
Exemple #4
  * An individual iteration
  * @param p the probabilities that certain points are near each other
  * @param i the iteration (primarily for debugging purposes)
 public void step(INDArray p, int i) {
   // gradient(p) returns the cost (first, logged below) and an array (second) kept as yIncs
   Pair<Double, INDArray> costGradient = gradient(p);
   INDArray yIncs = costGradient.getSecond();
   log.info("Cost at iteration " + i + " was " + costGradient.getFirst());
   // mean of y along dimension 0, tiled; y is defined outside this method.
   // NOTE(review): how yIncs and tiled are used afterwards is not visible here
   INDArray tiled = Nd4j.tile(y.mean(0), new int[] {y.rows(), y.columns()});
  /**
   * Computes the output for the current input and stores the gradient returned by
   * {@code getGradientsAndDelta}. Does nothing when {@code input} or {@code labels} is null.
   */
  public void computeGradientAndScore() {
    // nothing to compute without both input and labels
    if (input == null || labels == null) return;

    INDArray output = output(input);
    // first element of the pair is the gradient; the second element is not used in the
    // visible lines
    Pair<Gradient, INDArray> pair = getGradientsAndDelta(output);
    this.gradient = pair.getFirst();
Exemple #6
   * Convert data to probability co-occurrences (aka calculating the kernel)
   * @param d the data to convert
   * @param u the perplexity of the model
   * @return the probabilities of co-occurrence
  public INDArray computeGaussianPerplexity(final INDArray d, double u) {
    int n = d.rows();
    // p receives one row of values per record; beta holds one value per record, starting at 1
    final INDArray p = zeros(n, n);
    final INDArray beta = ones(n, 1);
    // target that the first result of hBeta is compared against
    final double logU = Math.log(u);

    log.info("Calculating probabilities of data similarities..");
    for (int i = 0; i < n; i++) {
      if (i % 500 == 0 && i > 0) log.info("Handled " + i + " records");

      // search bounds for beta[i]; an infinite value means no bound has been recorded yet
      double betaMin = Double.NEGATIVE_INFINITY;
      double betaMax = Double.POSITIVE_INFINITY;
      // every index except i itself.
      // NOTE(review): the upper bound is d.columns() while the loop runs over d.rows() --
      // presumably d is square, confirm
      int[] vals = Ints.concat(ArrayUtil.range(0, i), ArrayUtil.range(i + 1, d.columns()));
      INDArrayIndex[] range = new INDArrayIndex[] {new NDArrayIndex(vals)};

      INDArray row = d.slice(i).get(range);
      // hBeta returns a pair: first is compared with logU, second is later written into p
      Pair<INDArray, INDArray> pair = hBeta(row, beta.getDouble(i));
      INDArray hDiff = pair.getFirst().sub(logU);
      // NOTE(review): tries is never incremented in the visible lines, so the "< 50" cap
      // below has no effect as shown
      int tries = 0;

      // while hdiff > tolerance
      while (BooleanIndexing.and(abs(hDiff), Conditions.greaterThan(tolerance)) && tries < 50) {
        // if hdiff > 0
        // Raise beta[i]: double it while no upper bound is known, otherwise move halfway
        // towards betaMax.
        // NOTE(review): betaMin is assigned after beta[i] was updated, so it records the new
        // value rather than the previous one -- confirm against the reference binary search
        if (BooleanIndexing.and(hDiff, Conditions.greaterThan(0))) {
          if (Double.isInfinite(betaMax)) beta.putScalar(i, beta.getDouble(i) * 2.0);
          else beta.putScalar(i, (beta.getDouble(i) + betaMax) / 2.0);
          betaMin = beta.getDouble(i);
        } else {
          // Lower beta[i]: halve it while no lower bound is known, otherwise move halfway
          // towards betaMin; betaMax is likewise assigned after the update
          if (Double.isInfinite(betaMin)) beta.putScalar(i, beta.getDouble(i) / 2.0);
          else beta.putScalar(i, (beta.getDouble(i) + betaMin) / 2.0);
          betaMax = beta.getDouble(i);

        // re-evaluate with the updated beta[i].
        // NOTE(review): subi is in-place here, whereas the first evaluation above used sub
        pair = hBeta(row, beta.getDouble(i));
        hDiff = pair.getFirst().subi(logU);

      // store the values for record i at every column except i
      p.slice(i).put(range, pair.getSecond());

    // dont need data in memory after
    log.info("Mean value of sigma " + sqrt(beta.rdiv(1)).mean(Integer.MAX_VALUE));
    // replace NaN entries of p with realMin
    BooleanIndexing.applyWhere(p, Conditions.isNan(), new Value(realMin));

    // set 0 along the diagonal
    INDArray permute = p.transpose();

    // p plus its transpose
    INDArray pOut = p.add(permute);

        pOut, Conditions.lessThan(Nd4j.EPS_THRESHOLD), new Value(Nd4j.EPS_THRESHOLD));
    // ensure no nans
    return pOut;
 /**
  * Checks that the two lists returned by {@code feedForwardActivationsAndDerivatives} have the
  * same number of entries.
  */
 public void testFeedForwardActivationsAndDerivatives() {
   MultiLayerNetwork network = new MultiLayerNetwork(getConf());
   // NOTE(review): data is not passed to the network in the visible lines
   DataSet data = new IrisDataSetIterator(1, 150).next();
   // raw Pair: both elements are cast to List<INDArray> below without a type check
   Pair result = network.feedForwardActivationsAndDerivatives();
   List<INDArray> first = (List) result.getFirst();
   List<INDArray> second = (List) result.getSecond();
   assertEquals(first.size(), second.size());
  /**
   * Returns the gradient from an inner computation on {@code output(input)} together with an
   * array named {@code epsilonNext}.
   *
   * <p>NOTE(review): the expressions that produce {@code pair} and {@code epsilonNext} are
   * truncated in the visible lines, so how {@code epsilon}, {@code nextGradient} and
   * {@code layer} are used cannot be confirmed here.
   *
   * @param epsilon not referenced in the visible lines
   * @param nextGradient not referenced in the visible lines
   * @param layer not referenced in the visible lines
   * @return pair of the gradient (first element of the inner result) and {@code epsilonNext}
   */
  public Pair<Gradient, INDArray> backpropGradient(
      INDArray epsilon, Gradient nextGradient, Layer layer) {
    Pair<Gradient, INDArray> pair =
            output(input)); // Returns Gradient and delta^(this), not Gradient and epsilon^(this-1)
    // second element of the inner result is this layer's delta (see trailing comment above);
    // its use in epsilonNext is not visible
    INDArray delta = pair.getSecond();

    INDArray epsilonNext =
    return new Pair<>(pair.getFirst(), epsilonNext);
  /**
   * Loads word vectors from the classpath resource {@code words.txt} and hands the weight
   * matrix, a list of word labels and an output CSV path to {@code BarnesHutTsne#plot}.
   *
   * @param args not read in the visible lines
   * @throws Exception declared broadly; which calls actually throw is not visible here
   */
  public static void main(String[] args) throws Exception {
    // STEP 1: Initialization
    // NOTE(review): iterations is not referenced in the visible lines
    int iterations = 100;
    // create an n-dimensional array of doubles
    List<String> cacheList =
        new ArrayList<>(); // cacheList is a dynamic array of strings used to hold all words

    // STEP 2: Turn text input into a list of words
    log.info("Load & Vectorize data....");
    File wordFile = new ClassPathResource("words.txt").getFile(); // Open the file
    // Get the data of all unique word vectors
    Pair<InMemoryLookupTable, VocabCache> vectors = WordVectorSerializer.loadTxt(wordFile);
    VocabCache cache = vectors.getSecond();
    INDArray weights =
        vectors.getFirst().getSyn0(); // separate weights of unique words into their own list

    // NOTE(review): the loop body is missing in the visible lines; as written, the next
    // statement would become the loop body and cacheList would stay empty
    for (int i = 0; i < cache.numWords(); i++) // separate strings of words into their own list

    // STEP 3: build a dual-tree tsne to use later
    log.info("Build model....");
    BarnesHutTsne tsne =
        new BarnesHutTsne.Builder()
            //                .usePca(false)

    // STEP 4: establish the tsne values and save them to a file
    log.info("Store TSNE Coordinates for Plotting....");
    String outputFile = "target/archive-tmp/tsne-standard-coords.csv";
    // make sure the target directory exists before plot() is given the path
    (new File(outputFile)).getParentFile().mkdirs();
    tsne.plot(weights, 2, cacheList, outputFile);
    // This tsne will use the weights of the vectors as its matrix, have two dimensions, use the
    // words strings as
    // labels, and be written to the outputFile created on the previous line

    // !!! Possible error: plot was recently deprecated. Might need to re-do the last line
  // note precision is off on this test but the numbers are close
  // investigation in a future release should determine how to resolve
  //
  // Runs backpropGradient on the layer from getContainedConfig() with an all-ones epsilon of
  // shape {1, 2, 4, 4} and compares the returned epsilon and the "W" / "b" gradients with
  // hard-coded expectations, first by shape and then by value.
  public void testBackpropResultsContained() {
    Layer layer = getContainedConfig();
    // NOTE(review): input and col are not referenced in the visible lines
    INDArray input = getContainedData();
    INDArray col = getContainedCol();
    INDArray epsilon = Nd4j.ones(1, 2, 4, 4);

    INDArray expectedBiasGradient =
        Nd4j.create(new double[] {0.16608272, 0.16608272}, new int[] {1, 2});
    // NOTE(review): the initializers below are truncated in the visible lines (the expected
    // weight values and the enclosing create calls are missing)
    INDArray expectedWeightGradient =
            new double[] {
            new int[] {2, 1, 2, 2});
    INDArray expectedEpsilon =
            new double[] {
              0.00039383, 0.00039383, 0.00039383, 0.00039383, 0.00039383,
              0.00039383, 0., 0., 0.00039383, 0.00039383,
              0.00039383, 0.00039383, 0.00039383, 0.00039383, 0.,
              0., 0.02036651, 0.02036651, 0.02036651, 0.02036651,
              0.02036651, 0.02036651, 0., 0., 0.02036651,
              0.02036651, 0.02036651, 0.02036651, 0.02036651, 0.02036651,
              0., 0., 0.00039383, 0.00039383, 0.00039383,
              0.00039383, 0.00039383, 0.00039383, 0., 0.,
              0.00039383, 0.00039383, 0.00039383, 0.00039383, 0.00039383,
              0.00039383, 0., 0., 0., 0.,
              0., 0., 0., 0., 0.,
              0., 0., 0., 0., 0.,
              0., 0., 0., 0.
            new int[] {1, 1, 8, 8});

    // cast to the concrete convolution layer before calling backpropGradient
    org.deeplearning4j.nn.layers.convolution.ConvolutionLayer layer2 =
        (org.deeplearning4j.nn.layers.convolution.ConvolutionLayer) layer;
    Pair<Gradient, INDArray> pair = layer2.backpropGradient(epsilon);

    // shapes first (array comparison), then full value equality
    assertArrayEquals(expectedEpsilon.shape(), pair.getSecond().shape());
    assertArrayEquals(expectedWeightGradient.shape(), pair.getFirst().getGradientFor("W").shape());
    assertArrayEquals(expectedBiasGradient.shape(), pair.getFirst().getGradientFor("b").shape());
    assertEquals(expectedEpsilon, pair.getSecond());
    assertEquals(expectedWeightGradient, pair.getFirst().getGradientFor("W"));
    assertEquals(expectedBiasGradient, pair.getFirst().getGradientFor("b"));
  /**
   * Runs {@code backpropGradient} on the layer from {@code getContainedConfig()} and compares
   * the returned epsilon and the "W" / "b" gradients with hard-coded expectations.
   */
  public void testBackpropResults() {
    Layer layer = getContainedConfig();
    // NOTE(review): col is not referenced in the visible lines
    INDArray col = getContainedCol();

    // NOTE(review): the initializers below are truncated in the visible lines (the enclosing
    // create calls and the bias shape are missing)
    INDArray expectedWeightGradient =
            new double[] {-1440., -1440., -1984., -1984., -1440., -1440., -1984., -1984.},
            new int[] {2, 1, 2, 2});
    INDArray expectedBiasGradient =
            new double[] {-544., -544.},
            new int[] {
    INDArray expectedEpsilon =
            new double[] {
              -12., -12., -12., -12., -12., -12., -12., -12., -12., -12., -12.,
              -12., -12., -12., -12., -12., -56., -56., -56., -56., -56., -56.,
              -56., -56., -56., -56., -56., -56., -56., -56., -56., -56., -12.,
              -12., -12., -12., -12., -12., -12., -12., -12., -12., -12., -12.,
              -12., -12., -12., -12., -56., -56., -56., -56., -56., -56., -56.,
              -56., -56., -56., -56., -56., -56., -56., -56., -56.
            new int[] {1, 1, 8, 8});

    // cast to the concrete convolution layer before calling backpropGradient
    org.deeplearning4j.nn.layers.convolution.ConvolutionLayer layer2 =
        (org.deeplearning4j.nn.layers.convolution.ConvolutionLayer) layer;
    // NOTE(review): epsilon is not declared in the visible lines of this method
    Pair<Gradient, INDArray> pair = layer2.backpropGradient(epsilon);

    // NOTE(review): the three shape checks pass arrays to assertEquals, while
    // testBackpropResultsContained uses assertArrayEquals for the same checks; presumably
    // assertArrayEquals is intended here too -- confirm against the JUnit version in use
    assertEquals(expectedEpsilon.shape(), pair.getSecond().shape());
    assertEquals(expectedWeightGradient.shape(), pair.getFirst().getGradientFor("W").shape());
    assertEquals(expectedBiasGradient.shape(), pair.getFirst().getGradientFor("b").shape());
    assertEquals(expectedEpsilon, pair.getSecond());
    assertEquals(expectedWeightGradient, pair.getFirst().getGradientFor("W"));
    assertEquals(expectedBiasGradient, pair.getFirst().getGradientFor("b"));
 /**
  * Creates a data set from a pair by delegating to the two-argument constructor with the
  * pair's first and second elements, in that order.
  *
  * @param pair the two matrices; which one holds features and which holds labels is decided
  *     by the two-argument constructor, not visible here
  */
 public FloatDataSet(Pair<FloatMatrix, FloatMatrix> pair) {
   this(pair.getFirst(), pair.getSecond());
   * Train on the corpus
   * @param rdd the rdd to train
   * @return the vocab and weights
  public Pair<VocabCache, GloveWeightLookupTable> train(JavaRDD<String> rdd) {
    // build the vocabulary from the corpus; the pair's first element is the vocab cache
    TextPipeline pipeline = new TextPipeline(rdd);
    final Pair<VocabCache, Long> vocabAndNumWords = pipeline.process();
    // hyper-parameters below are read from the Spark configuration of the RDD's context
    SparkConf conf = rdd.context().getConf();
    JavaSparkContext sc = new JavaSparkContext(rdd.context());
    vocabCacheBroadcast = sc.broadcast(vocabAndNumWords.getFirst());

    // NOTE(review): the defaults are maxCount = 100 and xMax = 0.75, yet further down xMax is
    // used as the exponent and as the threshold compared against the raw score -- confirm the
    // two settings are not swapped
    final GloveWeightLookupTable gloveWeightLookupTable =
        new GloveWeightLookupTable.Builder()
            .lr(conf.getDouble(GlovePerformer.ALPHA, 0.025))
            .maxCount(conf.getDouble(GlovePerformer.MAX_COUNT, 100))
            .vectorLength(conf.getInt(GlovePerformer.VECTOR_LENGTH, 300))
            .xMax(conf.getDouble(GlovePerformer.X_MAX, 0.75))

    gloveWeightLookupTable.getBiasAdaGrad().historicalGradient =
    gloveWeightLookupTable.getWeightAdaGrad().historicalGradient =

        "Created lookup table of size "
            + Arrays.toString(gloveWeightLookupTable.getSyn0().shape()));
    // tokenize each line, compute per-line co-occurrence counts, and fold them into one map
    CounterMap<String, String> coOccurrenceCounts =
        rdd.map(new TokenizerFunction(tokenizerFactoryClazz))
            .map(new CoOccurrenceCalculator(symmetric, vocabCacheBroadcast, windowSize))
            .fold(new CounterMap<String, String>(), new CoOccurrenceCounts());

    // flatten the counter map into (word, word, count) triples on the driver
    List<Triple<String, String, Double>> counts = new ArrayList<>();
    Iterator<Pair<String, String>> pairIter = coOccurrenceCounts.getPairIterator();
    while (pairIter.hasNext()) {
      Pair<String, String> pair = pairIter.next();
          new Triple<>(
              coOccurrenceCounts.getCount(pair.getFirst(), pair.getSecond())));

    log.info("Calculated co occurrences");

    // redistribute the triples and re-key them as pairs; the mapping calls themselves are
    // truncated in the visible lines
    JavaRDD<Triple<String, String, Double>> parallel = sc.parallelize(counts);
    JavaPairRDD<String, Tuple2<String, Double>> pairs =
            new PairFunction<Triple<String, String, Double>, String, Tuple2<String, Double>>() {
              public Tuple2<String, Tuple2<String, Double>> call(
                  Triple<String, String, Double> stringStringDoubleTriple) throws Exception {
                return new Tuple2<>(
                    new Tuple2<>(
                        stringStringDoubleTriple.getFirst(), stringStringDoubleTriple.getThird()));

    JavaPairRDD<VocabWord, Tuple2<VocabWord, Double>> pairsVocab =
            new PairFunction<
                Tuple2<String, Tuple2<String, Double>>, VocabWord, Tuple2<VocabWord, Double>>() {
              public Tuple2<VocabWord, Tuple2<VocabWord, Double>> call(
                  Tuple2<String, Tuple2<String, Double>> stringTuple2Tuple2) throws Exception {
                return new Tuple2<>(
                    new Tuple2<>(

    // one pass over all word pairs per iteration; iterations is defined outside this method
    for (int i = 0; i < iterations; i++) {

      JavaRDD<GloveChange> change =
              new Function<Tuple2<VocabWord, Tuple2<VocabWord, Double>>, GloveChange>() {
                public GloveChange call(
                    Tuple2<VocabWord, Tuple2<VocabWord, Double>> vocabWordTuple2Tuple2)
                    throws Exception {
                  VocabWord w1 = vocabWordTuple2Tuple2._1();
                  VocabWord w2 = vocabWordTuple2Tuple2._2()._1();
                  INDArray w1Vector = gloveWeightLookupTable.getSyn0().slice(w1.getIndex());
                  INDArray w2Vector = gloveWeightLookupTable.getSyn0().slice(w2.getIndex());
                  INDArray bias = gloveWeightLookupTable.getBias();
                  double score = vocabWordTuple2Tuple2._2()._2();
                  double xMax = gloveWeightLookupTable.getxMax();
                  double maxCount = gloveWeightLookupTable.getMaxCount();
                  // w1 * w2 + bias
                  double prediction = Nd4j.getBlasWrapper().dot(w1Vector, w2Vector);
                  prediction += bias.getDouble(w1.getIndex()) + bias.getDouble(w2.getIndex());

                  // weighting: (score / maxCount) capped at 1, raised to the power xMax
                  double weight = Math.pow(Math.min(1.0, (score / maxCount)), xMax);

                  // NOTE(review): when score > xMax the raw prediction is used without the
                  // weight or the log(score) term -- confirm this branch is intended
                  double fDiff =
                      score > xMax ? prediction : weight * (prediction - Math.log(score));
                  // guard against NaN by substituting a small constant
                  if (Double.isNaN(fDiff)) fDiff = Nd4j.EPS_THRESHOLD;
                  // amount of change
                  double gradient = fDiff;
                  // update(w1,w1Vector,w2Vector,gradient);
                  // update(w2,w2Vector,w1Vector,gradient);

                  Pair<INDArray, Double> w1Update =
                  Pair<INDArray, Double> w2Update =
                  return new GloveChange(

      // extract the per-pair error from each change
      JavaRDD<Double> error =
              new Function<GloveChange, Double>() {
                public Double call(GloveChange gloveChange) throws Exception {
                  return gloveChange.getError();

      // accumulator whose value is logged below as this iteration's error; the code that adds
      // to it is truncated in the visible lines
      final Accumulator<Double> d = sc.accumulator(0.0);
          new VoidFunction<Double>() {
            public void call(Double aDouble) throws Exception {

      log.info("Error at iteration " + i + " was " + d.value());

    return new Pair<>(vocabAndNumWords.getFirst(), gloveWeightLookupTable);
   * Check backprop gradients for a MultiLayerNetwork.
   * @param mln MultiLayerNetwork to test. This must be initialized.
   * @param epsilon Usually on the order of 1e-4 or so.
   * @param maxRelError Maximum relative error. Usually < 1e-5 or so, though maybe more for deep
   *     networks or those with nonlinear activation
   * @param minAbsoluteError Minimum absolute error to cause a failure. Numerical gradients can be
   *     non-zero due to precision issues. For example, 0.0 vs. 1e-18: relative error is 1.0, but
   *     not really a failure
   * @param print Whether to print full pass/failure details for each parameter gradient
   * @param exitOnFirstError If true: return upon first failure. If false: continue checking even if
   *     one parameter gradient has failed. Typically use false for debugging, true for unit tests.
   * @param input Input array to use for forward pass. May be mini-batch data.
   * @param labels Labels/targets to use to calculate backprop gradient. May be mini-batch data.
   * @return true if gradients are passed, false otherwise.
  public static boolean checkGradients(
      MultiLayerNetwork mln,
      double epsilon,
      double maxRelError,
      double minAbsoluteError,
      boolean print,
      boolean exitOnFirstError,
      INDArray input,
      INDArray labels) {
    // Basic sanity checks on input:
    if (epsilon <= 0.0 || epsilon > 0.1)
      throw new IllegalArgumentException(
          "Invalid epsilon: expect epsilon in range (0,0.1], usually 1e-4 or so");
    if (maxRelError <= 0.0 || maxRelError > 0.25)
      throw new IllegalArgumentException("Invalid maxRelativeError: " + maxRelError);
    if (!(mln.getOutputLayer() instanceof BaseOutputLayer))
      throw new IllegalArgumentException("Cannot check backprop gradients without OutputLayer");

    // Check network configuration:
    // every layer must use Updater.NONE, or SGD with a learning rate of exactly 1.0

    // NOTE(review): layerCount is never incremented in the visible lines, so the messages
    // below would always report layer 0
    int layerCount = 0;
    for (NeuralNetConfiguration n : mln.getLayerWiseConfigurations().getConfs()) {
      org.deeplearning4j.nn.conf.Updater u = n.getLayer().getUpdater();
      if (u == org.deeplearning4j.nn.conf.Updater.SGD) {
        // Must have LR of 1.0
        double lr = n.getLayer().getLearningRate();
        if (lr != 1.0) {
          throw new IllegalStateException(
              "When using SGD updater, must also use lr=1.0 for layer "
                  + layerCount
                  + "; got "
                  + u
                  + " with lr="
                  + lr);
      } else if (u != org.deeplearning4j.nn.conf.Updater.NONE) {
        throw new IllegalStateException(
            "Must have Updater.NONE (or SGD + lr=1.0) for layer " + layerCount + "; got " + u);

    // NOTE(review): input and labels are not passed to mln in the visible lines
    Pair<Gradient, Double> gradAndScore = mln.gradientAndScore();

    // run the gradient through the network's updater before comparing
    Updater updater = UpdaterCreator.getUpdater(mln);
    updater.update(mln, gradAndScore.getFirst(), 0, mln.batchSize());

    INDArray gradientToCheck =
            .dup(); // need dup: gradients are a *view* of the full gradient array (which will
                    // change every time backprop is done)
    INDArray originalParams =
        mln.params().dup(); // need dup: params are a *view* of full parameters

    int nParams = originalParams.length();

    // NOTE(review): totalNFailures is never incremented in the visible lines, so the final
    // return would always be true unless exitOnFirstError triggers
    int totalNFailures = 0;
    double maxError = 0.0;
    for (int i = 0; i < nParams; i++) {
      // (w+epsilon): Do forward pass and score
      // NOTE(review): the perturbed params are not pushed back into mln in the visible lines
      INDArray params = originalParams.dup();
      params.putScalar(i, params.getDouble(i) + epsilon);
      double scorePlus = mln.score();

      // (w-epsilon): Do forward pass and score
      params.putScalar(i, params.getDouble(i) - 2 * epsilon); // +eps - 2*eps = -eps
      double scoreMinus = mln.score();

      // Calculate numerical parameter gradient:
      // central difference: (score(w+eps) - score(w-eps)) / (2*eps)
      double scoreDelta = scorePlus - scoreMinus;

      double numericalGradient = scoreDelta / (2 * epsilon);
      if (Double.isNaN(numericalGradient))
        throw new IllegalStateException(
            "Numerical gradient was NaN for parameter " + i + " of " + nParams);

      double backpropGradient = gradientToCheck.getDouble(i);
      // http://cs231n.github.io/neural-networks-3/#gradcheck
      // use mean centered
      // relative error = |a - b| / (|a| + |b|); this is 0/0 = NaN when both are zero, which
      // the check right after resets to 0.0
      double relError =
          Math.abs(backpropGradient - numericalGradient)
              / (Math.abs(numericalGradient) + Math.abs(backpropGradient));
      if (backpropGradient == 0.0 && numericalGradient == 0.0)
        relError = 0.0; // Edge case: i.e., RNNs with time series length of 1.0

      if (relError > maxError) maxError = relError;
      if (relError > maxRelError || Double.isNaN(relError)) {
        // a large relative error is still accepted when the absolute error is below
        // minAbsoluteError
        double absError = Math.abs(backpropGradient - numericalGradient);
        if (absError < minAbsoluteError) {
              "Param "
                  + i
                  + " passed: grad= "
                  + backpropGradient
                  + ", numericalGrad= "
                  + numericalGradient
                  + ", relError= "
                  + relError
                  + "; absolute error = "
                  + absError
                  + " < minAbsoluteError = "
                  + minAbsoluteError);
        } else {
          if (print)
                "Param "
                    + i
                    + " FAILED: grad= "
                    + backpropGradient
                    + ", numericalGrad= "
                    + numericalGradient
                    + ", relError= "
                    + relError
                    + ", scorePlus="
                    + scorePlus
                    + ", scoreMinus= "
                    + scoreMinus);
          if (exitOnFirstError) return false;
      } else if (print) {
            "Param "
                + i
                + " passed: grad= "
                + backpropGradient
                + ", numericalGrad= "
                + numericalGradient
                + ", relError= "
                + relError);

    // summary of the whole run
    if (print) {
      int nPass = nParams - totalNFailures;
          "GradientCheckUtil.checkGradients(): "
              + nParams
              + " params checked, "
              + nPass
              + " passed, "
              + totalNFailures
              + " failed. Largest relative error = "
              + maxError);

    return totalNFailures == 0;
   * Check backprop gradients for a ComputationGraph
   * @param graph ComputationGraph to test. This must be initialized.
   * @param epsilon Usually on the order of 1e-4 or so.
   * @param maxRelError Maximum relative error. Usually < 0.01, though maybe more for deep networks
   * @param minAbsoluteError Minimum absolute error to cause a failure. Numerical gradients can be
   *     non-zero due to precision issues. For example, 0.0 vs. 1e-18: relative error is 1.0, but
   *     not really a failure
   * @param print Whether to print full pass/failure details for each parameter gradient
   * @param exitOnFirstError If true: return upon first failure. If false: continue checking even if
   *     one parameter gradient has failed. Typically use false for debugging, true for unit tests.
   * @param inputs Input arrays to use for forward pass. May be mini-batch data.
   * @param labels Labels/targets (output) arrays to use to calculate backprop gradient. May be
   *     mini-batch data.
   * @return true if gradients are passed, false otherwise.
  public static boolean checkGradients(
      ComputationGraph graph,
      double epsilon,
      double maxRelError,
      double minAbsoluteError,
      boolean print,
      boolean exitOnFirstError,
      INDArray[] inputs,
      INDArray[] labels) {
    // Basic sanity checks on input:
    if (epsilon <= 0.0 || epsilon > 0.1)
      throw new IllegalArgumentException(
          "Invalid epsilon: expect epsilon in range (0,0.1], usually 1e-4 or so");
    if (maxRelError <= 0.0 || maxRelError > 0.25)
      throw new IllegalArgumentException("Invalid maxRelativeError: " + maxRelError);

    // the number of supplied arrays must match what the graph declares
    if (graph.getNumInputArrays() != inputs.length)
      throw new IllegalArgumentException(
          "Invalid input arrays: expect " + graph.getNumInputArrays() + " inputs");
    if (graph.getNumOutputArrays() != labels.length)
      throw new IllegalArgumentException(
          "Invalid labels arrays: expect " + graph.getNumOutputArrays() + " outputs");

    // Check configuration
    // every layer vertex must use Updater.NONE, or SGD with a learning rate of exactly 1.0
    // NOTE(review): layerCount is not referenced again in the visible lines
    int layerCount = 0;
    for (String vertexName : graph.getConfiguration().getVertices().keySet()) {
      GraphVertex gv = graph.getConfiguration().getVertices().get(vertexName);
      // only layer vertices carry an updater configuration; skip everything else
      if (!(gv instanceof LayerVertex)) continue;
      LayerVertex lv = (LayerVertex) gv;

      org.deeplearning4j.nn.conf.Updater u = lv.getLayerConf().getLayer().getUpdater();
      if (u == org.deeplearning4j.nn.conf.Updater.SGD) {
        // Must have LR of 1.0
        double lr = lv.getLayerConf().getLayer().getLearningRate();
        if (lr != 1.0) {
          throw new IllegalStateException(
              "When using SGD updater, must also use lr=1.0 for layer \""
                  + vertexName
                  + "\"; got "
                  + u);
      } else if (u != org.deeplearning4j.nn.conf.Updater.NONE) {
        throw new IllegalStateException(
            "Must have Updater.NONE (or SGD + lr=1.0) for layer \"" + vertexName + "\"; got " + u);

    // hand the inputs and labels to the graph before computing gradient and score
    for (int i = 0; i < inputs.length; i++) graph.setInput(i, inputs[i]);
    for (int i = 0; i < labels.length; i++) graph.setLabel(i, labels[i]);

    Pair<Gradient, Double> gradAndScore = graph.gradientAndScore();

    // run the gradient through the graph's updater before comparing
    ComputationGraphUpdater updater = new ComputationGraphUpdater(graph);
    updater.update(graph, gradAndScore.getFirst(), 0, graph.batchSize());

    INDArray gradientToCheck =
            .dup(); // need dup: gradients are a *view* of the full gradient array (which will
                    // change every time backprop is done)
    INDArray originalParams =
        graph.params().dup(); // need dup: params are a *view* of full parameters

    int nParams = originalParams.length();

    // NOTE(review): totalNFailures is never incremented in the visible lines, so the final
    // return would always be true unless exitOnFirstError triggers
    int totalNFailures = 0;
    double maxError = 0.0;
    for (int i = 0; i < nParams; i++) {
      // (w+epsilon): Do forward pass and score
      // NOTE(review): the perturbed params are not pushed back into graph in the visible lines
      INDArray params = originalParams.dup();
      params.putScalar(i, params.getDouble(i) + epsilon);
      double scorePlus = graph.score();

      // (w-epsilon): Do forward pass and score
      params.putScalar(i, params.getDouble(i) - 2 * epsilon); // +eps - 2*eps = -eps
      double scoreMinus = graph.score();

      // Calculate numerical parameter gradient:
      // central difference: (score(w+eps) - score(w-eps)) / (2*eps)
      double scoreDelta = scorePlus - scoreMinus;

      double numericalGradient = scoreDelta / (2 * epsilon);
      if (Double.isNaN(numericalGradient))
        throw new IllegalStateException(
            "Numerical gradient was NaN for parameter " + i + " of " + nParams);

      double backpropGradient = gradientToCheck.getDouble(i);
      // http://cs231n.github.io/neural-networks-3/#gradcheck
      // use mean centered
      // relative error = |a - b| / (|a| + |b|); this is 0/0 = NaN when both are zero, which
      // the check right after resets to 0.0
      double relError =
          Math.abs(backpropGradient - numericalGradient)
              / (Math.abs(numericalGradient) + Math.abs(backpropGradient));
      if (backpropGradient == 0.0 && numericalGradient == 0.0)
        relError = 0.0; // Edge case: i.e., RNNs with time series length of 1.0

      if (relError > maxError) maxError = relError;
      if (relError > maxRelError || Double.isNaN(relError)) {
        // a large relative error is still accepted when the absolute error is below
        // minAbsoluteError
        double absError = Math.abs(backpropGradient - numericalGradient);
        if (absError < minAbsoluteError) {
              "Param "
                  + i
                  + " passed: grad= "
                  + backpropGradient
                  + ", numericalGrad= "
                  + numericalGradient
                  + ", relError= "
                  + relError
                  + "; absolute error = "
                  + absError
                  + " < minAbsoluteError = "
                  + minAbsoluteError);
        } else {
          if (print)
                "Param "
                    + i
                    + " FAILED: grad= "
                    + backpropGradient
                    + ", numericalGrad= "
                    + numericalGradient
                    + ", relError= "
                    + relError
                    + ", scorePlus="
                    + scorePlus
                    + ", scoreMinus= "
                    + scoreMinus);
          if (exitOnFirstError) return false;
      } else if (print) {
            "Param "
                + i
                + " passed: grad= "
                + backpropGradient
                + ", numericalGrad= "
                + numericalGradient
                + ", relError= "
                + relError);

    // summary of the whole run
    if (print) {
      int nPass = nParams - totalNFailures;
          "GradientCheckUtil.checkGradients(): "
              + nParams
              + " params checked, "
              + nPass
              + " passed, "
              + totalNFailures
              + " failed. Largest relative error = "
              + maxError);

    return totalNFailures == 0;
Exemple #16
  /**
   * Runs k Gibbs steps starting from the hidden sample of the current input and stores a
   * gradient with entries for the visible bias, the hidden bias and the weights in
   * {@code gradient}.
   */
  public void computeGradientAndScore() {
    // number of Gibbs steps, taken from the layer configuration
    int k = layerConf().getK();

    // hidden values for the current input; the second element starts the chain below
    Pair<INDArray, INDArray> probHidden = sampleHiddenGivenVisible(input());

     * Start the gibbs sampling.
    INDArray chainStart = probHidden.getSecond();

     * Note that at a later date, we can explore alternative methods of
     * storing the chain transitions for different kinds of sampling
     * and exploring the search space.
    Pair<Pair<INDArray, INDArray>, Pair<INDArray, INDArray>> matrices;
    // negative visible means or expected values
    INDArray nvMeans = null;
    // negative value samples
    INDArray nvSamples = null;
    // negative hidden means or expected values
    INDArray nhMeans = null;
    // negative hidden samples
    INDArray nhSamples = null;

     * K steps of gibbs sampling. This is the positive phase of contrastive divergence.
     * There are 4 matrices being computed for each gibbs sampling.
     * The samples from both the positive and negative phases and their expected values
     * or averages.

    for (int i = 0; i < k; i++) {

      // the first step starts from chainStart, later steps from the previous hidden sample
      if (i == 0) matrices = gibbhVh(chainStart);
      else matrices = gibbhVh(nhSamples);

      // get the cost updates for sampling in the chain after k iterations
      nvMeans = matrices.getFirst().getFirst();
      nvSamples = matrices.getFirst().getSecond();
      nhMeans = matrices.getSecond().getFirst();
      nhSamples = matrices.getSecond().getSecond();

     * Update gradient parameters
    INDArray wGradient =

    INDArray hBiasGradient;

    // NOTE(review): no else is visible between the two assignments below; as shown, the
    // second assignment always overwrites the sparsity-based one
    if (layerConf().getSparsity() != 0)
      // all hidden units must stay around this number
      hBiasGradient = probHidden.getSecond().rsub(layerConf().getSparsity()).sum(0);
      // update rule: the expected values of the hidden input - the negative hidden  means adjusted
      // by the learning rate
      hBiasGradient = probHidden.getSecond().sub(nhMeans).sum(0);

    // update rule: the expected values of the input - the negative samples adjusted by the learning
    // rate
    INDArray delta = input.sub(nvSamples);
    INDArray vBiasGradient = delta.sum(0);

    // collect the three gradients under their parameter keys and publish them
    Gradient ret = new DefaultGradient();
    ret.gradientForVariable().put(PretrainParamInitializer.VISIBLE_BIAS_KEY, vBiasGradient);
    ret.gradientForVariable().put(PretrainParamInitializer.BIAS_KEY, hBiasGradient);
    ret.gradientForVariable().put(PretrainParamInitializer.WEIGHT_KEY, wGradient);
    gradient = ret;
Exemple #17
  /**
   * Initializes the model's matrices, maps and size fields. Only the simplified model is
   * supported: both non-simplified branches throw {@link UnsupportedOperationException}.
   */
  private void init() {

    // fall back to a fixed-seed generator when none was supplied
    if (rng == null) rng = new MersenneTwister(123);

    MultiDimensionalSet<String, String> binaryProductions = MultiDimensionalSet.hashSet();
    if (simplifiedModel) {
      // the simplified model uses a single production with empty labels on both sides
      binaryProductions.add("", "");
    } else {
      // TODO
      // figure out what binary productions we have in these trees
      // Note: the current sentiment training data does not actually
      // have any constituent labels
      throw new UnsupportedOperationException("Not yet implemented");

    Set<String> unaryProductions = new HashSet<>();

    // NOTE(review): the simplified branch is empty in the visible lines, so unaryProductions
    // stays empty as shown
    if (simplifiedModel) {
    } else {
      // TODO
      // figure out what unary productions we have in these trees (preterminals only, after the
      // collapsing)
      throw new UnsupportedOperationException("Not yet implemented");

    identity = FloatMatrix.eye(numHidden);

    binaryTransform = MultiDimensionalMap.newTreeBackedMap();
    binaryFloatTensors = MultiDimensionalMap.newTreeBackedMap();
    binaryClassification = MultiDimensionalMap.newTreeBackedMap();

    // When making a flat model (no semantic untying) the
    // basicCategory function will return the same basic category for
    // all labels, so all entries will map to the same matrix
    for (Pair<String, String> binary : binaryProductions) {
      String left = basicCategory(binary.getFirst());
      String right = basicCategory(binary.getSecond());
      // the body of this check is not visible; presumably it skips pairs already registered
      if (binaryTransform.contains(left, right)) {

      binaryTransform.put(left, right, randomTransformMatrix());
      if (useFloatTensors) {
        binaryFloatTensors.put(left, right, randomBinaryFloatTensor());

      if (!combineClassification) {
        binaryClassification.put(left, right, randomClassificationMatrix());

    numBinaryMatrices = binaryTransform.size();
    binaryTransformSize = numHidden * (2 * numHidden + 1);

    if (useFloatTensors) {
      binaryFloatTensorSize = numHidden * numHidden * numHidden * 4;
    } else {
      binaryFloatTensorSize = 0;

    binaryClassificationSize = (combineClassification) ? 0 : numOuts * (numHidden + 1);

    unaryClassification = new TreeMap<>();

    // When making a flat model (no semantic untying) the
    // basicCategory function will return the same basic category for
    // all labels, so all entries will map to the same matrix

    for (String unary : unaryProductions) {
      unary = basicCategory(unary);
      // the body of this check is not visible; presumably it skips categories already registered
      if (unaryClassification.containsKey(unary)) {
      unaryClassification.put(unary, randomClassificationMatrix());

    // NOTE(review): same assignment as earlier in this method
    binaryClassificationSize = (combineClassification) ? 0 : numOuts * (numHidden + 1);

    numUnaryMatrices = unaryClassification.size();
    unaryClassificationSize = numOuts * (numHidden + 1);

    featureVectors.put(UNKNOWN_FEATURE, randomWordVector());
    // NOTE(review): the next two assignments repeat the ones made just above
    numUnaryMatrices = unaryClassification.size();
    unaryClassificationSize = numOuts * (numHidden + 1);
    classWeights = new HashMap<>();
  * Load word vectors from the given pair
  * @param pair the given pair
  * @return a read only word vectors impl based on the given lookup table and vocab
 public static WordVectors fromPair(Pair<InMemoryLookupTable, VocabCache> pair) {
   // NOTE(review): pair is not read in the visible lines; the statements that would copy the
   // lookup table and vocab into the new instance appear to be missing
   WordVectorsImpl vectors = new WordVectorsImpl();
   return vectors;