/**
 * Gibbs sampling step: hidden -> visible -> hidden
 *
 * @param h the hidden input
 * @return a pair of pairs: the visible means and samples given the hidden input, and the new
 *     hidden means and samples given those visible samples
 */
public Pair<Pair<INDArray, INDArray>, Pair<INDArray, INDArray>> gibbhVh(INDArray h) {
  Pair<INDArray, INDArray> v1MeanAndSample = sampleVisibleGivenHidden(h);
  INDArray vSample = v1MeanAndSample.getSecond();
  Pair<INDArray, INDArray> h1MeanAndSample = sampleHiddenGivenVisible(vSample);
  return new Pair<>(v1MeanAndSample, h1MeanAndSample);
}
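// Usage sketch: how the nested pair returned by gibbhVh is typically unpacked (this mirrors the
// consumption in computeGradientAndScore further down; the hiddenSample variable is assumed).
Pair<Pair<INDArray, INDArray>, Pair<INDArray, INDArray>> matrices = gibbhVh(hiddenSample);
INDArray vMeans = matrices.getFirst().getFirst();     // visible means given the hidden input
INDArray vSamples = matrices.getFirst().getSecond();  // visible samples given the hidden input
INDArray hMeans = matrices.getSecond().getFirst();    // new hidden means given those samples
INDArray hSamples = matrices.getSecond().getSecond(); // new hidden samples given those samples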
public static void main(String[] args) throws Exception {
  int iterations = 100;
  Nd4j.dtype = DataBuffer.Type.DOUBLE;
  Nd4j.factory().setDType(DataBuffer.Type.DOUBLE);
  List<String> cacheList = new ArrayList<>();

  log.info("Load & Vectorize data....");
  File wordFile = new ClassPathResource("words.txt").getFile();
  Pair<InMemoryLookupTable, VocabCache> vectors = WordVectorSerializer.loadTxt(wordFile);
  VocabCache cache = vectors.getSecond();
  INDArray weights = vectors.getFirst().getSyn0();
  for (int i = 0; i < cache.numWords(); i++) cacheList.add(cache.wordAtIndex(i));

  log.info("Build model....");
  BarnesHutTsne tsne =
      new BarnesHutTsne.Builder()
          .setMaxIter(iterations)
          .theta(0.5)
          .normalize(false)
          .learningRate(500)
          .useAdaGrad(false)
          .usePca(false)
          .build();

  log.info("Store TSNE Coordinates for Plotting....");
  String outputFile = "target/archive-tmp/tsne-standard-coords.csv";
  (new File(outputFile)).getParentFile().mkdirs();
  tsne.plot(weights, 2, cacheList, outputFile);
}
@Test
public void testSubSampleLayerNoneBackprop() throws Exception {
  Layer layer = getCNNConfig(nChannelsIn, depth, kernelSize, stride, padding);
  Pair<Gradient, INDArray> out = layer.backpropGradient(epsilon);
  assertEquals(epsilon.shape().length, out.getSecond().shape().length);
  assertEquals(nExamples, out.getSecond().size(1)); // depth retained
}
/**
 * An individual iteration
 *
 * @param p the probabilities that certain points are near each other
 * @param i the iteration (primarily for debugging purposes)
 */
public void step(INDArray p, int i) {
  Pair<Double, INDArray> costGradient = gradient(p);
  INDArray yIncs = costGradient.getSecond();
  log.info("Cost at iteration " + i + " was " + costGradient.getFirst());
  y.addi(yIncs);
  y.addi(yIncs).subiRowVector(y.mean(0));
  INDArray tiled = Nd4j.tile(y.mean(0), new int[] {y.rows(), y.columns()});
  y.subi(tiled);
}
@Override
public void computeGradientAndScore() {
  if (input == null || labels == null) return;

  INDArray output = output(input);
  Pair<Gradient, INDArray> pair = getGradientsAndDelta(output);
  this.gradient = pair.getFirst();
  setScoreWithZ(output);
}
/**
 * Convert data to probability co-occurrences (aka calculating the kernel)
 *
 * @param d the data to convert
 * @param u the perplexity of the model
 * @return the probabilities of co-occurrence
 */
public INDArray computeGaussianPerplexity(final INDArray d, double u) {
  int n = d.rows();
  final INDArray p = zeros(n, n);
  final INDArray beta = ones(n, 1);
  final double logU = Math.log(u);

  log.info("Calculating probabilities of data similarities..");
  for (int i = 0; i < n; i++) {
    if (i % 500 == 0 && i > 0) log.info("Handled " + i + " records");

    double betaMin = Double.NEGATIVE_INFINITY;
    double betaMax = Double.POSITIVE_INFINITY;
    int[] vals = Ints.concat(ArrayUtil.range(0, i), ArrayUtil.range(i + 1, d.columns()));
    INDArrayIndex[] range = new INDArrayIndex[] {new NDArrayIndex(vals)};

    INDArray row = d.slice(i).get(range);
    Pair<INDArray, INDArray> pair = hBeta(row, beta.getDouble(i));
    INDArray hDiff = pair.getFirst().sub(logU);
    int tries = 0;

    // while hDiff > tolerance
    while (BooleanIndexing.and(abs(hDiff), Conditions.greaterThan(tolerance)) && tries < 50) {
      // if hDiff > 0
      if (BooleanIndexing.and(hDiff, Conditions.greaterThan(0))) {
        if (Double.isInfinite(betaMax)) beta.putScalar(i, beta.getDouble(i) * 2.0);
        else beta.putScalar(i, (beta.getDouble(i) + betaMax) / 2.0);
        betaMin = beta.getDouble(i);
      } else {
        if (Double.isInfinite(betaMin)) beta.putScalar(i, beta.getDouble(i) / 2.0);
        else beta.putScalar(i, (beta.getDouble(i) + betaMin) / 2.0);
        betaMax = beta.getDouble(i);
      }

      pair = hBeta(row, beta.getDouble(i));
      hDiff = pair.getFirst().subi(logU);
      tries++;
    }

    p.slice(i).put(range, pair.getSecond());
  }

  // don't need data in memory after this point
  log.info("Mean value of sigma " + sqrt(beta.rdiv(1)).mean(Integer.MAX_VALUE));
  BooleanIndexing.applyWhere(p, Conditions.isNan(), new Value(realMin));

  // set 0 along the diagonal
  INDArray permute = p.transpose();
  INDArray pOut = p.add(permute);
  pOut.divi(pOut.sum(Integer.MAX_VALUE));
  BooleanIndexing.applyWhere(
      pOut, Conditions.lessThan(Nd4j.EPS_THRESHOLD), new Value(Nd4j.EPS_THRESHOLD));

  // ensure no nans
  return pOut;
}
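// A minimal sketch of the hBeta helper used above, following the standard t-SNE reference
// implementation (the signature and the Transforms/Nd4j calls here are assumptions, not
// necessarily this library's code): for a row of squared distances d and precision beta it
// returns the Shannon entropy H and the conditional probabilities that the binary search
// compares against log(perplexity).
public Pair<INDArray, INDArray> hBeta(INDArray d, double beta) {
  INDArray P = Transforms.exp(d.mul(-beta));                       // unnormalized probabilities
  double sumP = P.sumNumber().doubleValue();
  double H = Math.log(sumP) + beta * d.mul(P).sumNumber().doubleValue() / sumP;
  P.divi(sumP);                                                    // normalize to a probability row
  return new Pair<>(Nd4j.scalar(H), P);
}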
@Test
public void testFeedForwardActivationsAndDerivatives() {
  MultiLayerNetwork network = new MultiLayerNetwork(getConf());
  network.init();
  DataSet data = new IrisDataSetIterator(1, 150).next();
  network.fit(data);
  Pair result = network.feedForwardActivationsAndDerivatives();
  List<INDArray> first = (List) result.getFirst();
  List<INDArray> second = (List) result.getSecond();
  assertEquals(first.size(), second.size());
}
@Override
public Pair<Gradient, INDArray> backpropGradient(
    INDArray epsilon, Gradient nextGradient, Layer layer) {
  // Returns Gradient and delta^(this), not Gradient and epsilon^(this-1)
  Pair<Gradient, INDArray> pair = getGradientsAndDelta(output(input));
  INDArray delta = pair.getSecond();

  INDArray epsilonNext =
      params.get(DefaultParamInitializer.WEIGHT_KEY).mmul(delta.transpose()).transpose();
  return new Pair<>(pair.getFirst(), epsilonNext);
}
public static void main(String[] args) throws Exception {
  // STEP 1: Initialization
  int iterations = 100;
  // create an n-dimensional array of doubles
  DataTypeUtil.setDTypeForContext(DataBuffer.Type.DOUBLE);
  // cacheList is a dynamic array of strings used to hold all words
  List<String> cacheList = new ArrayList<>();

  // STEP 2: Turn text input into a list of words
  log.info("Load & Vectorize data....");
  File wordFile = new ClassPathResource("words.txt").getFile(); // open the file
  // Get the data of all unique word vectors
  Pair<InMemoryLookupTable, VocabCache> vectors = WordVectorSerializer.loadTxt(wordFile);
  VocabCache cache = vectors.getSecond();
  // separate the weights of unique words into their own list
  INDArray weights = vectors.getFirst().getSyn0();
  // separate the strings of the words into their own list
  for (int i = 0; i < cache.numWords(); i++) cacheList.add(cache.wordAtIndex(i));

  // STEP 3: build a dual-tree tsne to use later
  log.info("Build model....");
  BarnesHutTsne tsne =
      new BarnesHutTsne.Builder()
          .setMaxIter(iterations)
          .theta(0.5)
          .normalize(false)
          .learningRate(500)
          .useAdaGrad(false)
          // .usePca(false)
          .build();

  // STEP 4: establish the tsne values and save them to a file
  log.info("Store TSNE Coordinates for Plotting....");
  String outputFile = "target/archive-tmp/tsne-standard-coords.csv";
  (new File(outputFile)).getParentFile().mkdirs();
  tsne.plot(weights, 2, cacheList, outputFile);
  // This tsne will use the weights of the vectors as its matrix, have two dimensions, use the
  // word strings as labels, and be written to the outputFile created on the previous line.
  // !!! Possible error: plot was recently deprecated. Might need to re-do the last line
}
// note precision is off on this test but the numbers are close
// investigation in a future release should determine how to resolve
@Test
public void testBackpropResultsContained() {
  Layer layer = getContainedConfig();
  INDArray input = getContainedData();
  INDArray col = getContainedCol();
  INDArray epsilon = Nd4j.ones(1, 2, 4, 4);

  INDArray expectedBiasGradient =
      Nd4j.create(new double[] {0.16608272, 0.16608272}, new int[] {1, 2});
  INDArray expectedWeightGradient =
      Nd4j.create(
          new double[] {
            0.17238397, 0.17238397, 0.33846668, 0.33846668,
            0.17238397, 0.17238397, 0.33846668, 0.33846668
          },
          new int[] {2, 1, 2, 2});
  INDArray expectedEpsilon =
      Nd4j.create(
          new double[] {
            0.00039383, 0.00039383, 0.00039383, 0.00039383, 0.00039383, 0.00039383, 0., 0.,
            0.00039383, 0.00039383, 0.00039383, 0.00039383, 0.00039383, 0.00039383, 0., 0.,
            0.02036651, 0.02036651, 0.02036651, 0.02036651, 0.02036651, 0.02036651, 0., 0.,
            0.02036651, 0.02036651, 0.02036651, 0.02036651, 0.02036651, 0.02036651, 0., 0.,
            0.00039383, 0.00039383, 0.00039383, 0.00039383, 0.00039383, 0.00039383, 0., 0.,
            0.00039383, 0.00039383, 0.00039383, 0.00039383, 0.00039383, 0.00039383, 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0.
          },
          new int[] {1, 1, 8, 8});

  layer.setInput(input);
  org.deeplearning4j.nn.layers.convolution.ConvolutionLayer layer2 =
      (org.deeplearning4j.nn.layers.convolution.ConvolutionLayer) layer;
  layer2.setCol(col);
  Pair<Gradient, INDArray> pair = layer2.backpropGradient(epsilon);

  assertArrayEquals(expectedEpsilon.shape(), pair.getSecond().shape());
  assertArrayEquals(expectedWeightGradient.shape(), pair.getFirst().getGradientFor("W").shape());
  assertArrayEquals(expectedBiasGradient.shape(), pair.getFirst().getGradientFor("b").shape());
  assertEquals(expectedEpsilon, pair.getSecond());
  assertEquals(expectedWeightGradient, pair.getFirst().getGradientFor("W"));
  assertEquals(expectedBiasGradient, pair.getFirst().getGradientFor("b"));
}
@Test
public void testBackpropResults() {
  Layer layer = getContainedConfig();
  INDArray col = getContainedCol();

  INDArray expectedWeightGradient =
      Nd4j.create(
          new double[] {-1440., -1440., -1984., -1984., -1440., -1440., -1984., -1984.},
          new int[] {2, 1, 2, 2});
  INDArray expectedBiasGradient = Nd4j.create(new double[] {-544., -544.}, new int[] {2});
  INDArray expectedEpsilon =
      Nd4j.create(
          new double[] {
            -12., -12., -12., -12., -12., -12., -12., -12.,
            -12., -12., -12., -12., -12., -12., -12., -12.,
            -56., -56., -56., -56., -56., -56., -56., -56.,
            -56., -56., -56., -56., -56., -56., -56., -56.,
            -12., -12., -12., -12., -12., -12., -12., -12.,
            -12., -12., -12., -12., -12., -12., -12., -12.,
            -56., -56., -56., -56., -56., -56., -56., -56.,
            -56., -56., -56., -56., -56., -56., -56., -56.
          },
          new int[] {1, 1, 8, 8});

  org.deeplearning4j.nn.layers.convolution.ConvolutionLayer layer2 =
      (org.deeplearning4j.nn.layers.convolution.ConvolutionLayer) layer;
  layer2.setCol(col);
  Pair<Gradient, INDArray> pair = layer2.backpropGradient(epsilon);

  assertEquals(expectedEpsilon.shape(), pair.getSecond().shape());
  assertEquals(expectedWeightGradient.shape(), pair.getFirst().getGradientFor("W").shape());
  assertEquals(expectedBiasGradient.shape(), pair.getFirst().getGradientFor("b").shape());
  assertEquals(expectedEpsilon, pair.getSecond());
  assertEquals(expectedWeightGradient, pair.getFirst().getGradientFor("W"));
  assertEquals(expectedBiasGradient, pair.getFirst().getGradientFor("b"));
}
public FloatDataSet(Pair<FloatMatrix, FloatMatrix> pair) {
  this(pair.getFirst(), pair.getSecond());
}
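// A small usage sketch (the matrices here are made-up random data): the pair's first element is
// presumably the feature matrix and the second the label matrix, matching the delegated
// two-argument constructor.
FloatMatrix features = FloatMatrix.rand(10, 4);
FloatMatrix labels = FloatMatrix.rand(10, 3);
FloatDataSet dataSet = new FloatDataSet(new Pair<>(features, labels));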
/**
 * Train on the corpus
 *
 * @param rdd the rdd to train
 * @return the vocab and weights
 */
public Pair<VocabCache, GloveWeightLookupTable> train(JavaRDD<String> rdd) {
  TextPipeline pipeline = new TextPipeline(rdd);
  final Pair<VocabCache, Long> vocabAndNumWords = pipeline.process();
  SparkConf conf = rdd.context().getConf();
  JavaSparkContext sc = new JavaSparkContext(rdd.context());
  vocabCacheBroadcast = sc.broadcast(vocabAndNumWords.getFirst());

  final GloveWeightLookupTable gloveWeightLookupTable =
      new GloveWeightLookupTable.Builder()
          .cache(vocabAndNumWords.getFirst())
          .lr(conf.getDouble(GlovePerformer.ALPHA, 0.025))
          .maxCount(conf.getDouble(GlovePerformer.MAX_COUNT, 100))
          .vectorLength(conf.getInt(GlovePerformer.VECTOR_LENGTH, 300))
          .xMax(conf.getDouble(GlovePerformer.X_MAX, 0.75))
          .build();
  gloveWeightLookupTable.resetWeights();

  gloveWeightLookupTable.getBiasAdaGrad().historicalGradient =
      Nd4j.zeros(gloveWeightLookupTable.getSyn0().rows());
  gloveWeightLookupTable.getWeightAdaGrad().historicalGradient =
      Nd4j.create(gloveWeightLookupTable.getSyn0().shape());

  log.info(
      "Created lookup table of size "
          + Arrays.toString(gloveWeightLookupTable.getSyn0().shape()));

  CounterMap<String, String> coOccurrenceCounts =
      rdd.map(new TokenizerFunction(tokenizerFactoryClazz))
          .map(new CoOccurrenceCalculator(symmetric, vocabCacheBroadcast, windowSize))
          .fold(new CounterMap<String, String>(), new CoOccurrenceCounts());

  List<Triple<String, String, Double>> counts = new ArrayList<>();
  Iterator<Pair<String, String>> pairIter = coOccurrenceCounts.getPairIterator();
  while (pairIter.hasNext()) {
    Pair<String, String> pair = pairIter.next();
    counts.add(
        new Triple<>(
            pair.getFirst(),
            pair.getSecond(),
            coOccurrenceCounts.getCount(pair.getFirst(), pair.getSecond())));
  }

  log.info("Calculated co occurrences");

  JavaRDD<Triple<String, String, Double>> parallel = sc.parallelize(counts);
  JavaPairRDD<String, Tuple2<String, Double>> pairs =
      parallel.mapToPair(
          new PairFunction<Triple<String, String, Double>, String, Tuple2<String, Double>>() {
            @Override
            public Tuple2<String, Tuple2<String, Double>> call(
                Triple<String, String, Double> stringStringDoubleTriple) throws Exception {
              return new Tuple2<>(
                  stringStringDoubleTriple.getFirst(),
                  new Tuple2<>(
                      // note: the inner key reuses getFirst(); getSecond() may have been intended
                      stringStringDoubleTriple.getFirst(), stringStringDoubleTriple.getThird()));
            }
          });

  JavaPairRDD<VocabWord, Tuple2<VocabWord, Double>> pairsVocab =
      pairs.mapToPair(
          new PairFunction<
              Tuple2<String, Tuple2<String, Double>>, VocabWord, Tuple2<VocabWord, Double>>() {
            @Override
            public Tuple2<VocabWord, Tuple2<VocabWord, Double>> call(
                Tuple2<String, Tuple2<String, Double>> stringTuple2Tuple2) throws Exception {
              return new Tuple2<>(
                  vocabCacheBroadcast.getValue().wordFor(stringTuple2Tuple2._1()),
                  new Tuple2<>(
                      vocabCacheBroadcast.getValue().wordFor(stringTuple2Tuple2._2()._1()),
                      stringTuple2Tuple2._2()._2()));
            }
          });

  for (int i = 0; i < iterations; i++) {
    JavaRDD<GloveChange> change =
        pairsVocab.map(
            new Function<Tuple2<VocabWord, Tuple2<VocabWord, Double>>, GloveChange>() {
              @Override
              public GloveChange call(
                  Tuple2<VocabWord, Tuple2<VocabWord, Double>> vocabWordTuple2Tuple2)
                  throws Exception {
                VocabWord w1 = vocabWordTuple2Tuple2._1();
                VocabWord w2 = vocabWordTuple2Tuple2._2()._1();
                INDArray w1Vector = gloveWeightLookupTable.getSyn0().slice(w1.getIndex());
                INDArray w2Vector = gloveWeightLookupTable.getSyn0().slice(w2.getIndex());
                INDArray bias = gloveWeightLookupTable.getBias();
                double score = vocabWordTuple2Tuple2._2()._2();
                double xMax = gloveWeightLookupTable.getxMax();
                double maxCount = gloveWeightLookupTable.getMaxCount();

                // w1 * w2 + bias
                double prediction = Nd4j.getBlasWrapper().dot(w1Vector, w2Vector);
                prediction += bias.getDouble(w1.getIndex()) + bias.getDouble(w2.getIndex());

                double weight = Math.pow(Math.min(1.0, (score / maxCount)), xMax);
                double fDiff =
                    score > xMax ? prediction : weight * (prediction - Math.log(score));
                if (Double.isNaN(fDiff)) fDiff = Nd4j.EPS_THRESHOLD;
                // amount of change
                double gradient = fDiff;

                // update(w1,w1Vector,w2Vector,gradient);
                // update(w2,w2Vector,w1Vector,gradient);
                Pair<INDArray, Double> w1Update =
                    update(
                        gloveWeightLookupTable.getWeightAdaGrad(),
                        gloveWeightLookupTable.getBiasAdaGrad(),
                        gloveWeightLookupTable.getSyn0(),
                        gloveWeightLookupTable.getBias(),
                        w1,
                        w1Vector,
                        w2Vector,
                        gradient);
                Pair<INDArray, Double> w2Update =
                    update(
                        gloveWeightLookupTable.getWeightAdaGrad(),
                        gloveWeightLookupTable.getBiasAdaGrad(),
                        gloveWeightLookupTable.getSyn0(),
                        gloveWeightLookupTable.getBias(),
                        w2,
                        w2Vector,
                        w1Vector,
                        gradient);

                return new GloveChange(
                    w1,
                    w2,
                    w1Update.getFirst(),
                    w2Update.getFirst(),
                    w1Update.getSecond(),
                    w2Update.getSecond(),
                    fDiff);
              }
            });

    JavaRDD<Double> error =
        change.map(
            new Function<GloveChange, Double>() {
              @Override
              public Double call(GloveChange gloveChange) throws Exception {
                gloveChange.apply(gloveWeightLookupTable);
                return gloveChange.getError();
              }
            });

    final Accumulator<Double> d = sc.accumulator(0.0);
    error.foreach(
        new VoidFunction<Double>() {
          @Override
          public void call(Double aDouble) throws Exception {
            d.$plus$eq(aDouble);
          }
        });

    log.info("Error at iteration " + i + " was " + d.value());
  }

  return new Pair<>(vocabAndNumWords.getFirst(), gloveWeightLookupTable);
}
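// Hypothetical driver sketch (the `glove` instance, the SparkConf and the corpus path are
// assumptions) showing how the returned pair is typically consumed after training:
JavaSparkContext sc = new JavaSparkContext(sparkConf);
JavaRDD<String> corpus = sc.textFile("corpus.txt");
Pair<VocabCache, GloveWeightLookupTable> trained = glove.train(corpus);
VocabCache vocab = trained.getFirst();              // vocabulary built by the text pipeline
GloveWeightLookupTable table = trained.getSecond(); // trained word vectors and biases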
/**
 * Check backprop gradients for a MultiLayerNetwork.
 *
 * @param mln MultiLayerNetwork to test. This must be initialized.
 * @param epsilon Usually on the order of 1e-4 or so.
 * @param maxRelError Maximum relative error. Usually < 1e-5 or so, though maybe more for deep
 *     networks or those with nonlinear activation
 * @param minAbsoluteError Minimum absolute error to cause a failure. Numerical gradients can be
 *     non-zero due to precision issues. For example, 0.0 vs. 1e-18: relative error is 1.0, but
 *     not really a failure
 * @param print Whether to print full pass/failure details for each parameter gradient
 * @param exitOnFirstError If true: return upon first failure. If false: continue checking even if
 *     one parameter gradient has failed. Typically use false for debugging, true for unit tests.
 * @param input Input array to use for forward pass. May be mini-batch data.
 * @param labels Labels/targets to use to calculate backprop gradient. May be mini-batch data.
 * @return true if all gradient checks pass, false otherwise.
 */
public static boolean checkGradients(
    MultiLayerNetwork mln,
    double epsilon,
    double maxRelError,
    double minAbsoluteError,
    boolean print,
    boolean exitOnFirstError,
    INDArray input,
    INDArray labels) {
  // Basic sanity checks on input:
  if (epsilon <= 0.0 || epsilon > 0.1)
    throw new IllegalArgumentException(
        "Invalid epsilon: expect epsilon in range (0,0.1], usually 1e-4 or so");
  if (maxRelError <= 0.0 || maxRelError > 0.25)
    throw new IllegalArgumentException("Invalid maxRelativeError: " + maxRelError);
  if (!(mln.getOutputLayer() instanceof BaseOutputLayer))
    throw new IllegalArgumentException("Cannot check backprop gradients without OutputLayer");

  // Check network configuration:
  int layerCount = 0;
  for (NeuralNetConfiguration n : mln.getLayerWiseConfigurations().getConfs()) {
    org.deeplearning4j.nn.conf.Updater u = n.getLayer().getUpdater();
    if (u == org.deeplearning4j.nn.conf.Updater.SGD) {
      // Must have LR of 1.0
      double lr = n.getLayer().getLearningRate();
      if (lr != 1.0) {
        throw new IllegalStateException(
            "When using SGD updater, must also use lr=1.0 for layer " + layerCount
                + "; got " + u + " with lr=" + lr);
      }
    } else if (u != org.deeplearning4j.nn.conf.Updater.NONE) {
      throw new IllegalStateException(
          "Must have Updater.NONE (or SGD + lr=1.0) for layer " + layerCount + "; got " + u);
    }
    layerCount++; // track the layer index for error messages
  }

  mln.setInput(input);
  mln.setLabels(labels);
  mln.computeGradientAndScore();
  Pair<Gradient, Double> gradAndScore = mln.gradientAndScore();

  Updater updater = UpdaterCreator.getUpdater(mln);
  updater.update(mln, gradAndScore.getFirst(), 0, mln.batchSize());

  // need dup: gradients are a *view* of the full gradient array (which will change every time
  // backprop is done)
  INDArray gradientToCheck = gradAndScore.getFirst().gradient().dup();
  // need dup: params are a *view* of full parameters
  INDArray originalParams = mln.params().dup();

  int nParams = originalParams.length();

  int totalNFailures = 0;
  double maxError = 0.0;
  for (int i = 0; i < nParams; i++) {
    // (w+epsilon): Do forward pass and score
    INDArray params = originalParams.dup();
    params.putScalar(i, params.getDouble(i) + epsilon);
    mln.setParameters(params);
    mln.computeGradientAndScore();
    double scorePlus = mln.score();

    // (w-epsilon): Do forward pass and score
    params.putScalar(i, params.getDouble(i) - 2 * epsilon); // +eps - 2*eps = -eps
    mln.setParameters(params);
    mln.computeGradientAndScore();
    double scoreMinus = mln.score();

    // Calculate numerical parameter gradient:
    double scoreDelta = scorePlus - scoreMinus;
    double numericalGradient = scoreDelta / (2 * epsilon);
    if (Double.isNaN(numericalGradient))
      throw new IllegalStateException(
          "Numerical gradient was NaN for parameter " + i + " of " + nParams);

    double backpropGradient = gradientToCheck.getDouble(i);
    // http://cs231n.github.io/neural-networks-3/#gradcheck
    // use mean centered
    double relError =
        Math.abs(backpropGradient - numericalGradient)
            / (Math.abs(numericalGradient) + Math.abs(backpropGradient));
    if (backpropGradient == 0.0 && numericalGradient == 0.0)
      relError = 0.0; // Edge case: i.e., RNNs with time series length of 1

    if (relError > maxError) maxError = relError;
    if (relError > maxRelError || Double.isNaN(relError)) {
      double absError = Math.abs(backpropGradient - numericalGradient);
      if (absError < minAbsoluteError) {
        log.info("Param " + i + " passed: grad= " + backpropGradient
            + ", numericalGrad= " + numericalGradient + ", relError= " + relError
            + "; absolute error = " + absError + " < minAbsoluteError = " + minAbsoluteError);
      } else {
        if (print)
          log.info("Param " + i + " FAILED: grad= " + backpropGradient
              + ", numericalGrad= " + numericalGradient + ", relError= " + relError
              + ", scorePlus=" + scorePlus + ", scoreMinus= " + scoreMinus);
        if (exitOnFirstError) return false;
        totalNFailures++;
      }
    } else if (print) {
      log.info("Param " + i + " passed: grad= " + backpropGradient
          + ", numericalGrad= " + numericalGradient + ", relError= " + relError);
    }
  }

  if (print) {
    int nPass = nParams - totalNFailures;
    log.info("GradientCheckUtil.checkGradients(): " + nParams + " params checked, " + nPass
        + " passed, " + totalNFailures + " failed. Largest relative error = " + maxError);
  }

  return totalNFailures == 0;
}
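// Example invocation (values are illustrative; mln, input and labels are assumed to exist, and the
// network must use Updater.NONE or SGD with lr=1.0 as enforced above):
boolean gradOK =
    GradientCheckUtil.checkGradients(mln, 1e-6, 1e-3, 1e-8, true, false, input, labels);
assertTrue(gradOK);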
/**
 * Check backprop gradients for a ComputationGraph
 *
 * @param graph ComputationGraph to test. This must be initialized.
 * @param epsilon Usually on the order of 1e-4 or so.
 * @param maxRelError Maximum relative error. Usually < 0.01, though maybe more for deep networks
 * @param minAbsoluteError Minimum absolute error to cause a failure. Numerical gradients can be
 *     non-zero due to precision issues. For example, 0.0 vs. 1e-18: relative error is 1.0, but
 *     not really a failure
 * @param print Whether to print full pass/failure details for each parameter gradient
 * @param exitOnFirstError If true: return upon first failure. If false: continue checking even if
 *     one parameter gradient has failed. Typically use false for debugging, true for unit tests.
 * @param inputs Input arrays to use for forward pass. May be mini-batch data.
 * @param labels Labels/targets (output) arrays to use to calculate backprop gradient. May be
 *     mini-batch data.
 * @return true if all gradient checks pass, false otherwise.
 */
public static boolean checkGradients(
    ComputationGraph graph,
    double epsilon,
    double maxRelError,
    double minAbsoluteError,
    boolean print,
    boolean exitOnFirstError,
    INDArray[] inputs,
    INDArray[] labels) {
  // Basic sanity checks on input:
  if (epsilon <= 0.0 || epsilon > 0.1)
    throw new IllegalArgumentException(
        "Invalid epsilon: expect epsilon in range (0,0.1], usually 1e-4 or so");
  if (maxRelError <= 0.0 || maxRelError > 0.25)
    throw new IllegalArgumentException("Invalid maxRelativeError: " + maxRelError);
  if (graph.getNumInputArrays() != inputs.length)
    throw new IllegalArgumentException(
        "Invalid input arrays: expect " + graph.getNumInputArrays() + " inputs");
  if (graph.getNumOutputArrays() != labels.length)
    throw new IllegalArgumentException(
        "Invalid labels arrays: expect " + graph.getNumOutputArrays() + " outputs");

  // Check configuration
  for (String vertexName : graph.getConfiguration().getVertices().keySet()) {
    GraphVertex gv = graph.getConfiguration().getVertices().get(vertexName);
    if (!(gv instanceof LayerVertex)) continue;
    LayerVertex lv = (LayerVertex) gv;

    org.deeplearning4j.nn.conf.Updater u = lv.getLayerConf().getLayer().getUpdater();
    if (u == org.deeplearning4j.nn.conf.Updater.SGD) {
      // Must have LR of 1.0
      double lr = lv.getLayerConf().getLayer().getLearningRate();
      if (lr != 1.0) {
        throw new IllegalStateException(
            "When using SGD updater, must also use lr=1.0 for layer \"" + vertexName
                + "\"; got " + u);
      }
    } else if (u != org.deeplearning4j.nn.conf.Updater.NONE) {
      throw new IllegalStateException(
          "Must have Updater.NONE (or SGD + lr=1.0) for layer \"" + vertexName + "\"; got " + u);
    }
  }

  for (int i = 0; i < inputs.length; i++) graph.setInput(i, inputs[i]);
  for (int i = 0; i < labels.length; i++) graph.setLabel(i, labels[i]);

  graph.computeGradientAndScore();
  Pair<Gradient, Double> gradAndScore = graph.gradientAndScore();

  ComputationGraphUpdater updater = new ComputationGraphUpdater(graph);
  updater.update(graph, gradAndScore.getFirst(), 0, graph.batchSize());

  // need dup: gradients are a *view* of the full gradient array (which will change every time
  // backprop is done)
  INDArray gradientToCheck = gradAndScore.getFirst().gradient().dup();
  // need dup: params are a *view* of full parameters
  INDArray originalParams = graph.params().dup();

  int nParams = originalParams.length();

  int totalNFailures = 0;
  double maxError = 0.0;
  for (int i = 0; i < nParams; i++) {
    // (w+epsilon): Do forward pass and score
    INDArray params = originalParams.dup();
    params.putScalar(i, params.getDouble(i) + epsilon);
    graph.setParams(params);
    graph.computeGradientAndScore();
    double scorePlus = graph.score();

    // (w-epsilon): Do forward pass and score
    params.putScalar(i, params.getDouble(i) - 2 * epsilon); // +eps - 2*eps = -eps
    graph.setParams(params);
    graph.computeGradientAndScore();
    double scoreMinus = graph.score();

    // Calculate numerical parameter gradient:
    double scoreDelta = scorePlus - scoreMinus;
    double numericalGradient = scoreDelta / (2 * epsilon);
    if (Double.isNaN(numericalGradient))
      throw new IllegalStateException(
          "Numerical gradient was NaN for parameter " + i + " of " + nParams);

    double backpropGradient = gradientToCheck.getDouble(i);
    // http://cs231n.github.io/neural-networks-3/#gradcheck
    // use mean centered
    double relError =
        Math.abs(backpropGradient - numericalGradient)
            / (Math.abs(numericalGradient) + Math.abs(backpropGradient));
    if (backpropGradient == 0.0 && numericalGradient == 0.0)
      relError = 0.0; // Edge case: i.e., RNNs with time series length of 1

    if (relError > maxError) maxError = relError;
    if (relError > maxRelError || Double.isNaN(relError)) {
      double absError = Math.abs(backpropGradient - numericalGradient);
      if (absError < minAbsoluteError) {
        log.info("Param " + i + " passed: grad= " + backpropGradient
            + ", numericalGrad= " + numericalGradient + ", relError= " + relError
            + "; absolute error = " + absError + " < minAbsoluteError = " + minAbsoluteError);
      } else {
        if (print)
          log.info("Param " + i + " FAILED: grad= " + backpropGradient
              + ", numericalGrad= " + numericalGradient + ", relError= " + relError
              + ", scorePlus=" + scorePlus + ", scoreMinus= " + scoreMinus);
        if (exitOnFirstError) return false;
        totalNFailures++;
      }
    } else if (print) {
      log.info("Param " + i + " passed: grad= " + backpropGradient
          + ", numericalGrad= " + numericalGradient + ", relError= " + relError);
    }
  }

  if (print) {
    int nPass = nParams - totalNFailures;
    log.info("GradientCheckUtil.checkGradients(): " + nParams + " params checked, " + nPass
        + " passed, " + totalNFailures + " failed. Largest relative error = " + maxError);
  }

  return totalNFailures == 0;
}
@Override
public void computeGradientAndScore() {
  int k = layerConf().getK();

  // POSITIVE PHASE
  Pair<INDArray, INDArray> probHidden = sampleHiddenGivenVisible(input());

  /*
   * Start the gibbs sampling.
   */
  INDArray chainStart = probHidden.getSecond();

  /*
   * Note that at a later date, we can explore alternative methods of
   * storing the chain transitions for different kinds of sampling
   * and exploring the search space.
   */
  Pair<Pair<INDArray, INDArray>, Pair<INDArray, INDArray>> matrices;
  // negative visible means or expected values
  INDArray nvMeans = null;
  // negative visible samples
  INDArray nvSamples = null;
  // negative hidden means or expected values
  INDArray nhMeans = null;
  // negative hidden samples
  INDArray nhSamples = null;

  /*
   * K steps of gibbs sampling. This is the negative phase of contrastive divergence.
   *
   * There are 4 matrices being computed for each gibbs sampling step:
   * the visible and hidden samples, and their expected values (means).
   */
  for (int i = 0; i < k; i++) {
    // NEGATIVE PHASE
    if (i == 0) matrices = gibbhVh(chainStart);
    else matrices = gibbhVh(nhSamples);

    // get the cost updates for sampling in the chain after k iterations
    nvMeans = matrices.getFirst().getFirst();
    nvSamples = matrices.getFirst().getSecond();
    nhMeans = matrices.getSecond().getFirst();
    nhSamples = matrices.getSecond().getSecond();
  }

  /*
   * Update gradient parameters
   */
  INDArray wGradient =
      input().transposei().mmul(probHidden.getSecond()).subi(nvSamples.transpose().mmul(nhMeans));

  INDArray hBiasGradient;
  if (layerConf().getSparsity() != 0)
    // all hidden units must stay around this number
    hBiasGradient = probHidden.getSecond().rsub(layerConf().getSparsity()).sum(0);
  else
    // update rule: the expected values of the hidden input minus the negative hidden means,
    // adjusted by the learning rate
    hBiasGradient = probHidden.getSecond().sub(nhMeans).sum(0);

  // update rule: the expected values of the input minus the negative samples, adjusted by the
  // learning rate
  INDArray delta = input.sub(nvSamples);
  INDArray vBiasGradient = delta.sum(0);

  Gradient ret = new DefaultGradient();
  ret.gradientForVariable().put(PretrainParamInitializer.VISIBLE_BIAS_KEY, vBiasGradient);
  ret.gradientForVariable().put(PretrainParamInitializer.BIAS_KEY, hBiasGradient);
  ret.gradientForVariable().put(PretrainParamInitializer.WEIGHT_KEY, wGradient);
  gradient = ret;
  setScoreWithZ(delta);
}
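// For reference, the contrastive-divergence-k updates computed above are (before learning rate):
//   dW     = v0^T * p(h|v0) - v_k^T * p(h|v_k)   (wGradient)
//   dhBias = sum_rows( p(h|v0) - p(h|v_k) )      (hBiasGradient, non-sparse case)
//   dvBias = sum_rows( v0 - v_k )                (vBiasGradient)
// where v0 is the input batch, and v_k / p(h|v_k) are the visible samples and hidden means after
// k Gibbs steps.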
private void init() {
  if (rng == null) rng = new MersenneTwister(123);

  MultiDimensionalSet<String, String> binaryProductions = MultiDimensionalSet.hashSet();
  if (simplifiedModel) {
    binaryProductions.add("", "");
  } else {
    // TODO
    // figure out what binary productions we have in these trees
    // Note: the current sentiment training data does not actually
    // have any constituent labels
    throw new UnsupportedOperationException("Not yet implemented");
  }

  Set<String> unaryProductions = new HashSet<>();
  if (simplifiedModel) {
    unaryProductions.add("");
  } else {
    // TODO
    // figure out what unary productions we have in these trees (preterminals only, after the
    // collapsing)
    throw new UnsupportedOperationException("Not yet implemented");
  }

  identity = FloatMatrix.eye(numHidden);

  binaryTransform = MultiDimensionalMap.newTreeBackedMap();
  binaryFloatTensors = MultiDimensionalMap.newTreeBackedMap();
  binaryClassification = MultiDimensionalMap.newTreeBackedMap();

  // When making a flat model (no semantic untying) the
  // basicCategory function will return the same basic category for
  // all labels, so all entries will map to the same matrix
  for (Pair<String, String> binary : binaryProductions) {
    String left = basicCategory(binary.getFirst());
    String right = basicCategory(binary.getSecond());
    if (binaryTransform.contains(left, right)) {
      continue;
    }
    binaryTransform.put(left, right, randomTransformMatrix());
    if (useFloatTensors) {
      binaryFloatTensors.put(left, right, randomBinaryFloatTensor());
    }
    if (!combineClassification) {
      binaryClassification.put(left, right, randomClassificationMatrix());
    }
  }

  numBinaryMatrices = binaryTransform.size();
  binaryTransformSize = numHidden * (2 * numHidden + 1);
  if (useFloatTensors) {
    binaryFloatTensorSize = numHidden * numHidden * numHidden * 4;
  } else {
    binaryFloatTensorSize = 0;
  }
  binaryClassificationSize = (combineClassification) ? 0 : numOuts * (numHidden + 1);

  unaryClassification = new TreeMap<>();

  // When making a flat model (no semantic untying) the
  // basicCategory function will return the same basic category for
  // all labels, so all entries will map to the same matrix
  for (String unary : unaryProductions) {
    unary = basicCategory(unary);
    if (unaryClassification.containsKey(unary)) {
      continue;
    }
    unaryClassification.put(unary, randomClassificationMatrix());
  }

  binaryClassificationSize = (combineClassification) ? 0 : numOuts * (numHidden + 1);

  numUnaryMatrices = unaryClassification.size();
  unaryClassificationSize = numOuts * (numHidden + 1);

  featureVectors.put(UNKNOWN_FEATURE, randomWordVector());
  numUnaryMatrices = unaryClassification.size();
  unaryClassificationSize = numOuts * (numHidden + 1);

  classWeights = new HashMap<>();
}
/**
 * Load word vectors from the given pair
 *
 * @param pair the given pair
 * @return a read-only word vectors impl based on the given lookup table and vocab
 */
public static WordVectors fromPair(Pair<InMemoryLookupTable, VocabCache> pair) {
  WordVectorsImpl vectors = new WordVectorsImpl();
  vectors.setLookupTable(pair.getFirst());
  vectors.setVocab(pair.getSecond());
  return vectors;
}
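// Usage sketch combining the text loader shown earlier with fromPair (the words.txt resource
// mirrors the t-SNE examples above; the enclosing class of fromPair and the example words are
// assumptions):
Pair<InMemoryLookupTable, VocabCache> pair =
    WordVectorSerializer.loadTxt(new ClassPathResource("words.txt").getFile());
WordVectors wordVectors = fromPair(pair);
double sim = wordVectors.similarity("day", "night"); // read-only lookups now work as usual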