@Override public void init( Map<String, INDArray> params, NeuralNetConfiguration conf, INDArray paramsView, boolean initializeParams) { if (((org.deeplearning4j.nn.conf.layers.ConvolutionLayer) conf.getLayer()) .getKernelSize() .length != 2) throw new IllegalArgumentException("Filter size must be == 2"); org.deeplearning4j.nn.conf.layers.ConvolutionLayer layerConf = (org.deeplearning4j.nn.conf.layers.ConvolutionLayer) conf.getLayer(); int[] kernel = layerConf.getKernelSize(); int nIn = layerConf.getNIn(); int nOut = layerConf.getNOut(); INDArray biasView = paramsView.get(NDArrayIndex.point(0), NDArrayIndex.interval(0, nOut)); INDArray weightView = paramsView.get(NDArrayIndex.point(0), NDArrayIndex.interval(nOut, numParams(conf, true))); params.put(BIAS_KEY, createBias(conf, biasView, initializeParams)); params.put(WEIGHT_KEY, createWeightMatrix(conf, weightView, initializeParams)); conf.addVariable(WEIGHT_KEY); conf.addVariable(BIAS_KEY); }
protected INDArray createWeightMatrix( NeuralNetConfiguration conf, INDArray weightView, boolean initializeParams) { /* Create a 4d weight matrix of: (number of kernels, num input channels, kernel height, kernel width) Note c order is used specifically for the CNN weights, as opposed to f order elsewhere Inputs to the convolution layer are: (batch size, num input feature maps, image height, image width) */ org.deeplearning4j.nn.conf.layers.ConvolutionLayer layerConf = (org.deeplearning4j.nn.conf.layers.ConvolutionLayer) conf.getLayer(); if (initializeParams) { Distribution dist = Distributions.createDistribution(conf.getLayer().getDist()); int[] kernel = layerConf.getKernelSize(); return WeightInitUtil.initWeights( new int[] {layerConf.getNOut(), layerConf.getNIn(), kernel[0], kernel[1]}, layerConf.getWeightInit(), dist, 'c', weightView); } else { int[] kernel = layerConf.getKernelSize(); return WeightInitUtil.reshapeWeights( new int[] {layerConf.getNOut(), layerConf.getNIn(), kernel[0], kernel[1]}, weightView, 'c'); } }
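// A minimal sketch (not part of the source) of the flattened parameter layout the convolution
// initializer above assumes: the first nOut entries of the params view are the biases, the
// remaining nIn*nOut*kH*kW entries are the kernel weights in 'c' order. The sizes below are
// hypothetical; Nd4j, INDArray and NDArrayIndex are assumed to be imported as in the snippets above.
int nIn = 3, nOut = 16, kH = 5, kW = 5; // hypothetical layer sizes
INDArray paramsView = Nd4j.zeros(1, nOut + nIn * nOut * kH * kW);
INDArray biasView = paramsView.get(NDArrayIndex.point(0), NDArrayIndex.interval(0, nOut));
INDArray weightView =
    paramsView.get(NDArrayIndex.point(0), NDArrayIndex.interval(nOut, paramsView.length()));
INDArray weights = weightView.reshape('c', nOut, nIn, kH, kW); // shape [16, 3, 5, 5]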
@Override public void init( Map<String, INDArray> params, NeuralNetConfiguration conf, INDArray paramsView, boolean initializeParameters) { if (!(conf.getLayer() instanceof org.deeplearning4j.nn.conf.layers.FeedForwardLayer)) throw new IllegalArgumentException( "unsupported layer type: " + conf.getLayer().getClass().getName()); int length = numParams(conf, true); if (paramsView.length() != length) throw new IllegalStateException( "Expected params view of length " + length + ", got length " + paramsView.length()); org.deeplearning4j.nn.conf.layers.FeedForwardLayer layerConf = (org.deeplearning4j.nn.conf.layers.FeedForwardLayer) conf.getLayer(); int nIn = layerConf.getNIn(); int nOut = layerConf.getNOut(); int nWeightParams = nIn * nOut; INDArray weightView = paramsView.get(NDArrayIndex.point(0), NDArrayIndex.interval(0, nWeightParams)); INDArray biasView = paramsView.get( NDArrayIndex.point(0), NDArrayIndex.interval(nWeightParams, nWeightParams + nOut)); params.put(WEIGHT_KEY, createWeightMatrix(conf, weightView, initializeParameters)); params.put(BIAS_KEY, createBias(conf, biasView, initializeParameters)); conf.addVariable(WEIGHT_KEY); conf.addVariable(BIAS_KEY); }
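// A minimal sketch (not part of the source) of the flattened layout this feed-forward
// initializer uses: nIn*nOut weight values first, read back as an nIn x nOut matrix in 'f'
// order (see getGradientsFromFlattened below), followed by nOut bias values. Sizes are
// hypothetical; the same nd4j imports as above are assumed.
int nIn = 4, nOut = 3; // hypothetical layer sizes
INDArray paramsView = Nd4j.zeros(1, nIn * nOut + nOut);
INDArray weights =
    paramsView
        .get(NDArrayIndex.point(0), NDArrayIndex.interval(0, nIn * nOut))
        .reshape('f', nIn, nOut); // 4 x 3 weight matrix
INDArray bias =
    paramsView.get(
        NDArrayIndex.point(0), NDArrayIndex.interval(nIn * nOut, nIn * nOut + nOut)); // 1 x 3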
@Override
public void init(Map<String, INDArray> params, NeuralNetConfiguration conf) {
  if (conf.getKernelSize().length != 2)
    throw new IllegalArgumentException("Filter (kernel) size must have exactly 2 elements: height, width");
  params.put(BIAS_KEY, createBias(conf));
  params.put(WEIGHT_KEY, createWeightMatrix(conf));
  conf.addVariable(WEIGHT_KEY);
  conf.addVariable(BIAS_KEY);
}
protected INDArray createWeightMatrix(NeuralNetConfiguration conf) { /** * Create a 4d weight matrix of: (number of kernels, num input channels, kernel height, kernel * width) Inputs to the convolution layer are: (batch size, num input feature maps, image * height, image width) */ Distribution dist = Distributions.createDistribution(conf.getDist()); return WeightInitUtil.initWeights( Ints.concat(new int[] {conf.getNOut(), conf.getNIn()}, conf.getKernelSize()), conf.getWeightInit(), dist); }
@Override
public void init(Map<String, INDArray> params, NeuralNetConfiguration conf) {
  Distribution dist = Distributions.createDistribution(conf.getDist());

  int nL = conf.getNOut();   // i.e., n neurons in this layer
  int nLast = conf.getNIn(); // i.e., n neurons in previous layer

  conf.addVariable(RECURRENT_WEIGHTS);
  conf.addVariable(INPUT_WEIGHTS);
  conf.addVariable(BIAS);

  params.put(
      RECURRENT_WEIGHTS,
      WeightInitUtil.initWeights(nL, 4 * nL + 3, conf.getWeightInit(), dist));
  params.put(
      INPUT_WEIGHTS, WeightInitUtil.initWeights(nLast, 4 * nL, conf.getWeightInit(), dist));
  INDArray biases = Nd4j.zeros(1, 4 * nL); // Order: input, forget, output, input modulation, i.e., IFOG
  // The bias is a 1 x (4*nL) row vector, so the forget gate section is the column interval [nL, 2*nL)
  biases.put(
      new NDArrayIndex[] {NDArrayIndex.point(0), NDArrayIndex.interval(nL, 2 * nL)},
      Nd4j.ones(1, nL).muli(5));
  /* The above line initializes the forget gate biases to 5.
   * See Sutskever PhD thesis, pg. 19:
   * "it is important for [the forget gate activations] to be approximately 1 at the early stages of learning,
   * which is accomplished by initializing [the forget gate biases] to a large value (such as 5). If it is
   * not done, it will be harder to learn long range dependencies because the smaller values of the forget
   * gates will create a vanishing gradients problem."
   * http://www.cs.utoronto.ca/~ilya/pubs/ilya_sutskever_phd_thesis.pdf
   */
  params.put(BIAS, biases);

  params.get(RECURRENT_WEIGHTS).data().persist();
  params.get(INPUT_WEIGHTS).data().persist();
  params.get(BIAS).data().persist();
}
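// A small sketch (not part of the source) spelling out the IFOG gate ordering the bias vector
// above uses; nL is a hypothetical layer size. Each gate owns one nL-wide column block of the
// 1 x (4*nL) row vector, and only the forget-gate block is initialized to a non-zero value.
int nL = 4; // hypothetical layer size
INDArray biases = Nd4j.zeros(1, 4 * nL);
// Column ranges per gate: input [0, nL), forget [nL, 2*nL), output [2*nL, 3*nL),
// input modulation [3*nL, 4*nL)
biases.put(
    new NDArrayIndex[] {NDArrayIndex.point(0), NDArrayIndex.interval(nL, 2 * nL)},
    Nd4j.ones(1, nL).muli(5)); // forget gate biases = 5, all other gates stay at 0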
// 1 bias per feature map protected INDArray createBias( NeuralNetConfiguration conf, INDArray biasView, boolean initializeParams) { // the bias is a 1D tensor -- one bias per output feature map org.deeplearning4j.nn.conf.layers.ConvolutionLayer layerConf = (org.deeplearning4j.nn.conf.layers.ConvolutionLayer) conf.getLayer(); if (initializeParams) biasView.assign(layerConf.getBiasInit()); return biasView; }
@Override public int numParams(NeuralNetConfiguration conf, boolean backprop) { org.deeplearning4j.nn.conf.layers.FeedForwardLayer layerConf = (org.deeplearning4j.nn.conf.layers.FeedForwardLayer) conf.getLayer(); int nIn = layerConf.getNIn(); int nOut = layerConf.getNOut(); return nIn * nOut + nOut; // weights + bias }
protected INDArray createBias( NeuralNetConfiguration conf, INDArray biasParamView, boolean initializeParameters) { org.deeplearning4j.nn.conf.layers.FeedForwardLayer layerConf = (org.deeplearning4j.nn.conf.layers.FeedForwardLayer) conf.getLayer(); if (initializeParameters) { INDArray ret = Nd4j.valueArrayOf(layerConf.getNOut(), layerConf.getBiasInit()); biasParamView.assign(ret); } return biasParamView; }
@Override public int numParams(NeuralNetConfiguration conf, boolean backprop) { org.deeplearning4j.nn.conf.layers.ConvolutionLayer layerConf = (org.deeplearning4j.nn.conf.layers.ConvolutionLayer) conf.getLayer(); int[] kernel = layerConf.getKernelSize(); int nIn = layerConf.getNIn(); int nOut = layerConf.getNOut(); return nIn * nOut * kernel[0] * kernel[1] + nOut; }
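// A quick worked example (hypothetical sizes, not from the source) of the parameter count
// computed above: one kH x kW kernel per (input channel, output feature map) pair, plus one
// bias per output feature map.
int nIn = 3, nOut = 16, kH = 5, kW = 5;
int numParams = nIn * nOut * kH * kW + nOut; // 3*16*5*5 + 16 = 1216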
/**
 * Creates a neural net configuration from a JSON model configuration file.
 *
 * <p>If the file contains a MultiLayerConfiguration, its file name MUST contain '_multi';
 * otherwise it is processed as a regular NeuralNetConfiguration.
 *
 * <p>Steps: takes in a JSON file path, checks the file name for the MultiLayer indicator,
 * reads the JSON file to a string, and creates the configuration from that string.
 */
@Override
public <E> E value(String value) throws Exception {
  boolean isMultiLayer = value.contains("_multi");
  String json = FileUtils.readFileToString(new File(value));
  if (isMultiLayer) {
    return (E) MultiLayerConfiguration.fromJson(json);
  } else {
    return (E) NeuralNetConfiguration.fromJson(json);
  }
}
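// A hypothetical usage sketch (names and paths are illustrative, not from the source):
// "handler" is assumed to be an instance of the class declaring value(...) above, and the
// caller handles the declared Exception. A file whose name contains "_multi" is parsed as a
// MultiLayerConfiguration; any other file is parsed as a single NeuralNetConfiguration.
MultiLayerConfiguration multiConf = handler.value("/tmp/mnist_multi.json");
NeuralNetConfiguration singleConf = handler.value("/tmp/autoencoder.json");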
private static void checkNinNoutForEachLayer( int[] expNin, int[] expNout, MultiLayerConfiguration conf, MultiLayerNetwork network) { // Check configuration for (int i = 0; i < expNin.length; i++) { NeuralNetConfiguration layerConf = conf.getConf(i); assertTrue(layerConf.getNIn() == expNin[i]); assertTrue(layerConf.getNOut() == expNout[i]); } // Check Layer for (int i = 0; i < expNin.length; i++) { Layer layer = network.getLayers()[i]; assertTrue(layer.conf().getNIn() == expNin[i]); assertTrue(layer.conf().getNOut() == expNout[i]); int[] weightShape = layer.getParam(DefaultParamInitializer.WEIGHT_KEY).shape(); assertTrue(weightShape[0] == expNin[i]); assertTrue(weightShape[1] == expNout[i]); } }
/**
 * Apply L1/L2 regularization, mini-batch scaling and (optionally) gradient normalization to a
 * parameter gradient.
 *
 * @param layer the layer whose gradient is being post-processed
 * @param gradient the gradient for the given parameter (modified in place)
 * @param param the parameter key, e.g. DefaultParamInitializer.WEIGHT_KEY
 */
public void postApply(Layer layer, INDArray gradient, String param) {
  NeuralNetConfiguration conf = layer.conf();
  INDArray params = layer.getParam(param);
  if (conf.isUseRegularization()
      && conf.getLayer().getL2() > 0
      && !(param.equals(DefaultParamInitializer.BIAS_KEY)))
    // dC/dw = dC0/dw + lambda/n * w where C0 is the pre-L2 cost function
    gradient.addi(params.mul(conf.getLayer().getL2()));
  if (conf.isUseRegularization()
      && conf.getLayer().getL1() > 0
      && !(param.equals(DefaultParamInitializer.BIAS_KEY)))
    gradient.addi(Transforms.sign(params).muli(conf.getLayer().getL1()));
  if (conf.isMiniBatch()) gradient.divi(layer.getInputMiniBatchSize());
  if (conf.isConstrainGradientToUnitNorm()) gradient.divi(gradient.norm2(Integer.MAX_VALUE));
}
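// A standalone sketch (not from the source) of the L2 and L1 terms added above, with assumed
// coefficient values; Transforms is org.nd4j.linalg.ops.transforms.Transforms, as used in the
// method above.
double l2 = 1e-4, l1 = 1e-5; // hypothetical regularization coefficients
INDArray params = Nd4j.rand(1, 10);
INDArray gradient = Nd4j.zeros(1, 10);
gradient.addi(params.mul(l2)); // L2 term: dC/dw = dC0/dw + lambda * w
gradient.addi(Transforms.sign(params).muli(l1)); // L1 term: + l1 * sign(w)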
public ModelAndGradient() { NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() .lossFunction(LossFunctions.LossFunction.MCXENT) .optimizationAlgo(OptimizationAlgorithm.ITERATION_GRADIENT_DESCENT) .activationFunction("softmax") .iterations(10) .weightInit(WeightInit.XAVIER) .learningRate(1e-1) .nIn(4) .nOut(3) .layer(new org.deeplearning4j.nn.conf.layers.OutputLayer()) .build(); OutputLayer l = LayerFactories.getFactory(conf.getLayer()) .create(conf, Arrays.<IterationListener>asList(new ScoreIterationListener(1))); this.model = l; l.setInput(Nd4j.ones(4)); l.setLabels(Nd4j.ones(3)); this.gradient = l.gradient(); }
@Test public void testModelSerde() throws Exception { ObjectMapper mapper = getMapper(); NeuralNetConfiguration conf = new NeuralNetConfiguration.Builder() .momentum(0.9f) .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) .iterations(1000) .constrainGradientToUnitNorm(true) .learningRate(1e-1f) .layer( new org.deeplearning4j.nn.conf.layers.AutoEncoder.Builder() .nIn(4) .nOut(3) .corruptionLevel(0.6) .sparsity(0.5) .lossFunction(LossFunctions.LossFunction.RECONSTRUCTION_CROSSENTROPY) .build()) .build(); DataSet d2 = new IrisDataSetIterator(150, 150).next(); INDArray input = d2.getFeatureMatrix(); AutoEncoder da = LayerFactories.getFactory(conf.getLayer()) .create( conf, Arrays.<IterationListener>asList( new ScoreIterationListener(1), new HistogramIterationListener(1)), 0); da.setInput(input); ModelAndGradient g = new ModelAndGradient(da); String json = mapper.writeValueAsString(g); ModelAndGradient read = mapper.readValue(json, ModelAndGradient.class); assertEquals(g, read); }
protected INDArray createWeightMatrix( NeuralNetConfiguration conf, INDArray weightParamView, boolean initializeParameters) { org.deeplearning4j.nn.conf.layers.FeedForwardLayer layerConf = (org.deeplearning4j.nn.conf.layers.FeedForwardLayer) conf.getLayer(); if (initializeParameters) { Distribution dist = Distributions.createDistribution(layerConf.getDist()); INDArray ret = WeightInitUtil.initWeights( layerConf.getNIn(), layerConf.getNOut(), layerConf.getWeightInit(), dist, weightParamView); return ret; } else { return WeightInitUtil.reshapeWeights( new int[] {layerConf.getNIn(), layerConf.getNOut()}, weightParamView); } }
@Override public Map<String, INDArray> getGradientsFromFlattened( NeuralNetConfiguration conf, INDArray gradientView) { org.deeplearning4j.nn.conf.layers.ConvolutionLayer layerConf = (org.deeplearning4j.nn.conf.layers.ConvolutionLayer) conf.getLayer(); int[] kernel = layerConf.getKernelSize(); int nIn = layerConf.getNIn(); int nOut = layerConf.getNOut(); INDArray biasGradientView = gradientView.get(NDArrayIndex.point(0), NDArrayIndex.interval(0, nOut)); INDArray weightGradientView = gradientView .get(NDArrayIndex.point(0), NDArrayIndex.interval(nOut, numParams(conf, true))) .reshape('c', nOut, nIn, kernel[0], kernel[1]); Map<String, INDArray> out = new LinkedHashMap<>(); out.put(BIAS_KEY, biasGradientView); out.put(WEIGHT_KEY, weightGradientView); return out; }
@Override public Map<String, INDArray> getGradientsFromFlattened( NeuralNetConfiguration conf, INDArray gradientView) { org.deeplearning4j.nn.conf.layers.FeedForwardLayer layerConf = (org.deeplearning4j.nn.conf.layers.FeedForwardLayer) conf.getLayer(); int nIn = layerConf.getNIn(); int nOut = layerConf.getNOut(); int nWeightParams = nIn * nOut; INDArray weightGradientView = gradientView .get(NDArrayIndex.point(0), NDArrayIndex.interval(0, nWeightParams)) .reshape('f', nIn, nOut); INDArray biasView = gradientView.get( NDArrayIndex.point(0), NDArrayIndex.interval(nWeightParams, nWeightParams + nOut)); // Already a row vector Map<String, INDArray> out = new LinkedHashMap<>(); out.put(WEIGHT_KEY, weightGradientView); out.put(BIAS_KEY, biasView); return out; }
/**
 * Check backprop gradients for a MultiLayerNetwork.
 *
 * @param mln MultiLayerNetwork to test. This must be initialized.
 * @param epsilon Usually on the order of 1e-4 or so.
 * @param maxRelError Maximum relative error. Usually < 1e-5 or so, though maybe more for deep
 *     networks or those with nonlinear activation
 * @param minAbsoluteError Minimum absolute error to cause a failure. Numerical gradients can be
 *     non-zero due to precision issues. For example, 0.0 vs. 1e-18: relative error is 1.0, but
 *     not really a failure
 * @param print Whether to print full pass/failure details for each parameter gradient
 * @param exitOnFirstError If true: return upon first failure. If false: continue checking even if
 *     one parameter gradient has failed. Typically use false for debugging, true for unit tests.
 * @param input Input array to use for forward pass. May be mini-batch data.
 * @param labels Labels/targets to use to calculate backprop gradient. May be mini-batch data.
 * @return true if all gradient checks pass, false otherwise.
 */
public static boolean checkGradients(
    MultiLayerNetwork mln,
    double epsilon,
    double maxRelError,
    double minAbsoluteError,
    boolean print,
    boolean exitOnFirstError,
    INDArray input,
    INDArray labels) {
  // Basic sanity checks on input:
  if (epsilon <= 0.0 || epsilon > 0.1)
    throw new IllegalArgumentException(
        "Invalid epsilon: expect epsilon in range (0,0.1], usually 1e-4 or so");
  if (maxRelError <= 0.0 || maxRelError > 0.25)
    throw new IllegalArgumentException("Invalid maxRelativeError: " + maxRelError);
  if (!(mln.getOutputLayer() instanceof BaseOutputLayer))
    throw new IllegalArgumentException("Cannot check backprop gradients without OutputLayer");

  // Check network configuration: every layer must use Updater.NONE, or SGD with lr = 1.0
  int layerCount = 0;
  for (NeuralNetConfiguration n : mln.getLayerWiseConfigurations().getConfs()) {
    org.deeplearning4j.nn.conf.Updater u = n.getLayer().getUpdater();
    if (u == org.deeplearning4j.nn.conf.Updater.SGD) {
      // Must have LR of 1.0
      double lr = n.getLayer().getLearningRate();
      if (lr != 1.0) {
        throw new IllegalStateException(
            "When using SGD updater, must also use lr=1.0 for layer "
                + layerCount
                + "; got "
                + u
                + " with lr="
                + lr);
      }
    } else if (u != org.deeplearning4j.nn.conf.Updater.NONE) {
      throw new IllegalStateException(
          "Must have Updater.NONE (or SGD + lr=1.0) for layer " + layerCount + "; got " + u);
    }
    layerCount++;
  }

  mln.setInput(input);
  mln.setLabels(labels);
  mln.computeGradientAndScore();
  Pair<Gradient, Double> gradAndScore = mln.gradientAndScore();

  Updater updater = UpdaterCreator.getUpdater(mln);
  updater.update(mln, gradAndScore.getFirst(), 0, mln.batchSize());

  // need dup: gradients are a *view* of the full gradient array (which will change every time
  // backprop is done)
  INDArray gradientToCheck = gradAndScore.getFirst().gradient().dup();
  // need dup: params are a *view* of full parameters
  INDArray originalParams = mln.params().dup();

  int nParams = originalParams.length();

  int totalNFailures = 0;
  double maxError = 0.0;
  for (int i = 0; i < nParams; i++) {
    // (w+epsilon): Do forward pass and score
    INDArray params = originalParams.dup();
    params.putScalar(i, params.getDouble(i) + epsilon);
    mln.setParameters(params);
    mln.computeGradientAndScore();
    double scorePlus = mln.score();

    // (w-epsilon): Do forward pass and score
    params.putScalar(i, params.getDouble(i) - 2 * epsilon); // +eps - 2*eps = -eps
    mln.setParameters(params);
    mln.computeGradientAndScore();
    double scoreMinus = mln.score();

    // Calculate numerical parameter gradient:
    double scoreDelta = scorePlus - scoreMinus;
    double numericalGradient = scoreDelta / (2 * epsilon);
    if (Double.isNaN(numericalGradient))
      throw new IllegalStateException(
          "Numerical gradient was NaN for parameter " + i + " of " + nParams);

    double backpropGradient = gradientToCheck.getDouble(i);
    // http://cs231n.github.io/neural-networks-3/#gradcheck
    // use mean centered
    double relError =
        Math.abs(backpropGradient - numericalGradient)
            / (Math.abs(numericalGradient) + Math.abs(backpropGradient));
    if (backpropGradient == 0.0 && numericalGradient == 0.0)
      relError = 0.0; // Edge case: i.e., RNNs with time series length 1

    if (relError > maxError) maxError = relError;
    if (relError > maxRelError || Double.isNaN(relError)) {
      double absError = Math.abs(backpropGradient - numericalGradient);
      if (absError < minAbsoluteError) {
        log.info(
            "Param "
                + i
                + " passed: grad= "
                + backpropGradient
                + ", numericalGrad= "
                + numericalGradient
                + ", relError= "
                + relError
                + "; absolute error = "
                + absError
                + " < minAbsoluteError = "
                + minAbsoluteError);
      } else {
        if (print)
          log.info(
              "Param "
                  + i
                  + " FAILED: grad= "
                  + backpropGradient
                  + ", numericalGrad= "
                  + numericalGradient
                  + ", relError= "
                  + relError
                  + ", scorePlus="
                  + scorePlus
                  + ", scoreMinus= "
                  + scoreMinus);
        if (exitOnFirstError) return false;
        totalNFailures++;
      }
    } else if (print) {
      log.info(
          "Param "
              + i
              + " passed: grad= "
              + backpropGradient
              + ", numericalGrad= "
              + numericalGradient
              + ", relError= "
              + relError);
    }
  }

  if (print) {
    int nPass = nParams - totalNFailures;
    log.info(
        "GradientCheckUtil.checkGradients(): "
            + nParams
            + " params checked, "
            + nPass
            + " passed, "
            + totalNFailures
            + " failed. Largest relative error = "
            + maxError);
  }

  return totalNFailures == 0;
}
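// A hypothetical usage sketch (not from the source): "net", "input" and "labels" are assumed to
// exist, with "net" an initialized MultiLayerNetwork whose layers use Updater.NONE (or SGD with
// lr = 1.0), as required by the configuration check above. The tolerance values follow the
// javadoc; assertTrue is used as in the tests elsewhere in this section.
double epsilon = 1e-6; // within the accepted (0, 0.1] range
double maxRelError = 1e-3; // within the accepted (0, 0.25] range
double minAbsError = 1e-8; // ignore tiny absolute differences that only inflate relative error
boolean ok =
    GradientCheckUtil.checkGradients(
        net, epsilon, maxRelError, minAbsError, true, false, input, labels);
assertTrue(ok);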
// 1 bias per feature map protected INDArray createBias(NeuralNetConfiguration conf) { // the bias is a 1D tensor -- one bias per output feature map return Nd4j.zeros(conf.getNOut()); }