/**
   * Updates the gradient according to the configured update rules, such as AdaGrad, momentum,
   * sparsity, and L2 regularization
   *
   * @param gradient the gradient to modify
   * @param iteration the current iteration
   * @param learningRate the learning rate for the current iteration
   */
  protected void updateGradientAccordingToParams(
      NeuralNetworkGradient gradient, int iteration, double learningRate) {
    DoubleMatrix wGradient = gradient.getwGradient();

    DoubleMatrix hBiasGradient = gradient.gethBiasGradient();
    DoubleMatrix vBiasGradient = gradient.getvBiasGradient();

    // periodically reset the adagrad history
    if (iteration != 0 && resetAdaGradIterations > 0 && iteration % resetAdaGradIterations == 0) {
      // (re)create any missing accumulators before touching their history
      if (this.W != null && this.wAdaGrad == null)
        this.wAdaGrad = new AdaGrad(this.W.rows, this.W.columns);

      if (this.vBias != null && this.vBiasAdaGrad == null)
        this.vBiasAdaGrad = new AdaGrad(this.vBias.rows, this.vBias.columns);

      if (this.hBias != null && this.hBiasAdaGrad == null)
        this.hBiasAdaGrad = new AdaGrad(this.hBias.rows, this.hBias.columns);

      wAdaGrad.historicalGradient = null;
      hBiasAdaGrad.historicalGradient = null;
      vBiasAdaGrad.historicalGradient = null;

      log.info("Resetting adagrad");
    }

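    // per-parameter adagrad learning rates for the weights; also used for the l2 term below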
    DoubleMatrix wLearningRates = wAdaGrad.getLearningRates(wGradient);
    // change up momentum after so many iterations if specified
    double momentum = this.momentum;
    if (momentumAfter != null && !momentumAfter.isEmpty()) {
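      // only the first (iteration -> momentum) entry of the schedule is consulted here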
      int key = momentumAfter.keySet().iterator().next();
      if (iteration >= key) {
        momentum = momentumAfter.get(key);
      }
    }

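    // scale each gradient by the per-parameter adagrad rates or by the flat learning rate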
    if (useAdaGrad) wGradient.muli(wLearningRates);
    else wGradient.muli(learningRate);

    if (useAdaGrad) hBiasGradient = hBiasGradient.mul(hBiasAdaGrad.getLearningRates(hBiasGradient));
    else hBiasGradient = hBiasGradient.mul(learningRate);

    if (useAdaGrad) vBiasGradient = vBiasGradient.mul(vBiasAdaGrad.getLearningRates(vBiasGradient));
    else vBiasGradient = vBiasGradient.mul(learningRate);

    // only do this with binary hidden layers
    if (applySparsity && this.hBiasGradient != null) applySparsity(hBiasGradient, learningRate);

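    // blend in the previous gradient: g = momentum * g_prev + (1 - momentum) * g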
    if (momentum != 0 && this.wGradient != null)
      wGradient.muli(1 - momentum).addi(this.wGradient.mul(momentum));

    if (momentum != 0 && this.vBiasGradient != null)
      vBiasGradient.muli(1 - momentum).addi(this.vBiasGradient.mul(momentum));

    if (momentum != 0 && this.hBiasGradient != null)
      hBiasGradient.muli(1 - momentum).addi(this.hBiasGradient.mul(momentum));

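    // average the gradients over the last mini batch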
    if (normalizeByInputRows) {
      wGradient.divi(lastMiniBatchSize);
      vBiasGradient.divi(lastMiniBatchSize);
      hBiasGradient.divi(lastMiniBatchSize);
    }

    // fold l2 weight decay into the gradient so the applied update also shrinks the weights
    if (useRegularization && l2 > 0) {
      if (useAdaGrad) wGradient.subi(W.mul(l2).mul(wLearningRates));
      else wGradient.subi(W.mul(l2 * learningRate));
    }

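    // rescale each gradient to unit l2 norm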
    if (constrainGradientToUnitNorm) {
      wGradient.divi(wGradient.norm2());
      vBiasGradient.divi(vBiasGradient.norm2());
      hBiasGradient.divi(hBiasGradient.norm2());
    }

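    // keep the adjusted gradients around for the momentum term on the next call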
    this.wGradient = wGradient;
    this.vBiasGradient = vBiasGradient;
    this.hBiasGradient = hBiasGradient;
  }