double calculateMSEOnValidationSet(Matrix featuresValidationSet, Matrix labelsValidationSet)
      throws Exception {

    double sumSquaredError = 0;

    for (int instance = 0; instance < featuresValidationSet.rows(); instance++) {
      double errorAcrossOutputNodes = 0;
      double[] predictedLabel = labelsValidationSet.row(instance); // this is the target
      predict(featuresValidationSet.row(instance), labelsValidationSet.row(instance));
      for (int col = 0; col < globalStoredOutputNodeFNetValues.length; col++) {
        errorAcrossOutputNodes +=
            (globalStoredOutputNodeTargetValues[col] - globalStoredOutputNodeFNetValues[col]);
      }
      sumSquaredError += (errorAcrossOutputNodes * errorAcrossOutputNodes);
    }

    double MSE =
        (sumSquaredError
            / (featuresValidationSet.rows() * globalStoredOutputNodeFNetValues.length));

    return MSE;
  }
  /*
   * We incorporate the ability to create an arbitrary network structure.
   * We use array of arrays of doubles for each inter-layer matrix
   * Thus, between each layer, we need a matrix of weights.
   * Num rows * num columns in matrix = nodes in layer below * nodes in layer above
   *
   * We use the Math library's pow function to raise to exponent: double pow(double base, double exponent)
   *
   *                       Hidden Nodes in current Layer (j)
   * previous layers nodes[                             ]
   *             Features [         Wij                 ]
   *                (i)   [                             ]
   *
   * I set up a matrix with dimensions: [ nodes in previous layer ] [ nodes in next layer ]
   *
   * Since we are traveling through one layer at a time, we need to have another data structure
   * that will be outputs for this layer
   *
   * I use for loops to initialize array of arrays ( allocated necessary memory)
   * Please note that: number of layers + 1 = number of weight arrays needed
   */
  public void train(Matrix features, Matrix labels) throws Exception {

    double[] recentAccuracies = new double[5];
    int currentAccuracyIndex = 0;
    double currentAccuracy = 0;

    Random rand = new Random();
    // SHUFFLE labels, features together
    features.shuffle(rand, labels);

    // need to map 0,1, or 2 to the three dimensional vectors, DO N-OF-K-ENCODING FOR THE
    // BACKPROPAGATION
    Matrix newNOfKLabelsMatrix = new Matrix();
    newNOfKLabelsMatrix.setSize(
        labels.rows(), labels.valueCount(0)); // I HARD CODE IN THAT THERE SHOULD BE 3 OUTPUT NODES
    for (int row = 0; row < newNOfKLabelsMatrix.rows(); row++) { // for each instance
      for (int k = 0; k < labels.valueCount(0); k++) {
        if (labels.get(row, 0) == k) {
          for (int m = 0; m < labels.valueCount(0); m++) {
            newNOfKLabelsMatrix.set(row, m, 0);
          }
          newNOfKLabelsMatrix.set(row, k, 1);
        }
      }
    }
    labels = newNOfKLabelsMatrix;

    // IMMEDIATELY SAVE SOME OF THIS, NEVER WILL TRAIN ON THESE
    // STICK THESE INTO A VALIDATION SET
    // ONCE MSE STARTS TO INCREASE AGAIN ON THE VALIDATION SET, WE'VE GONE TOO FAR
    int numRowsToGetIntoTrainingSet = (int) (features.rows() * validationSetPercentageOfData);

    Matrix featuresForTrainingTrimmed = new Matrix();
    featuresForTrainingTrimmed.setSize(numRowsToGetIntoTrainingSet, features.cols());
    Matrix featuresValidationSet = new Matrix();
    featuresValidationSet.setSize(features.rows() - numRowsToGetIntoTrainingSet, features.cols());

    Matrix labelsForTrainingTrimmed = new Matrix();
    labelsForTrainingTrimmed.setSize(numRowsToGetIntoTrainingSet, labels.cols());
    Matrix labelsValidationSet = new Matrix();
    labelsValidationSet.setSize(features.rows() - numRowsToGetIntoTrainingSet, labels.cols());

    // LOOP THROUGH AND PUT MOST OF FEATURES INTO featuresForTrainingTrimmed
    for (int row = 0; row < features.rows(); row++) {
      for (int col = 0; col < features.cols(); col++) {
        if (row < numRowsToGetIntoTrainingSet) {
          featuresForTrainingTrimmed.set(row, col, features.get(row, col));
        } else {
          featuresValidationSet.set(row - numRowsToGetIntoTrainingSet, col, features.get(row, col));
        }
      }
    }

    // LOOP THROUGH AND PUT MOST OF FEATURES INTO featuresForTrainingTrimmed
    for (int row = 0; row < labels.rows(); row++) {
      for (int col = 0; col < labels.cols(); col++) {
        if (row < numRowsToGetIntoTrainingSet) {
          labelsForTrainingTrimmed.set(row, col, labels.get(row, col));
        } else {
          labelsValidationSet.set(row - numRowsToGetIntoTrainingSet, col, labels.get(row, col));
        }
      }
    }

    features = featuresForTrainingTrimmed;
    labels = labelsForTrainingTrimmed;
    // LOOP THROUGH AND PUT LEFTOVER PORTION OF FEATURES INTO validationSet
    arrayListOfEachLayersWeightMatrices = new ArrayList<double[][]>();

    for (int i = 0; i < numHiddenLayers + 1; i++) { // each layer
      double[][] specificLayersWeightMatrix;
      if (i == 0) { // first hidden layer (Each layer owns its own weights)
        specificLayersWeightMatrix =
            new double[features.cols()][numNodesPerHiddenLayer[i]]; // INPUTS are the rows
      } else if (i == numHiddenLayers) {
        specificLayersWeightMatrix =
            new double[numNodesPerHiddenLayer[i - 1]][labels.cols()]; // OUTPUTS ARE THE COLUMNS
      } else {
        specificLayersWeightMatrix =
            new double[numNodesPerHiddenLayer[i - 1]][numNodesPerHiddenLayer[i]];
      }
      arrayListOfEachLayersWeightMatrices.add(specificLayersWeightMatrix);
    }

    changeInWeightMatricesForEveryLayer = new ArrayList<double[][]>();

    for (int i = 0; i < numHiddenLayers + 1; i++) { // each layer
      double[][] specificLayersWeightMatrix;
      if (i == 0) { // first hidden layer (Each layer owns its own weights)
        specificLayersWeightMatrix =
            new double[features.cols()][numNodesPerHiddenLayer[i]]; // INPUTS are the rows
      } else if (i == numHiddenLayers) {
        specificLayersWeightMatrix =
            new double[numNodesPerHiddenLayer[i - 1]][labels.cols()]; // OUTPUTS ARE THE COLUMNS
      } else {
        specificLayersWeightMatrix =
            new double[numNodesPerHiddenLayer[i - 1]][numNodesPerHiddenLayer[i]];
      }
      changeInWeightMatricesForEveryLayer.add(specificLayersWeightMatrix);
    }

    // allocate space/ initialize the previous change in weights that we'll use for momentum
    temporaryStashChangeInWeightMatricesForEveryLayer = new ArrayList<double[][]>();

    for (int i = 0; i < numHiddenLayers + 1; i++) { // each layer
      double[][] specificLayersWeightMatrix;
      if (i == 0) { // first hidden layer (Each layer owns its own weights)
        specificLayersWeightMatrix =
            new double[features.cols()][numNodesPerHiddenLayer[i]]; // INPUTS are the rows
      } else if (i == numHiddenLayers) {
        specificLayersWeightMatrix =
            new double[numNodesPerHiddenLayer[i - 1]][labels.cols()]; // OUTPUTS ARE THE COLUMNS
      } else {
        specificLayersWeightMatrix =
            new double[numNodesPerHiddenLayer[i - 1]][numNodesPerHiddenLayer[i]];
      }
      temporaryStashChangeInWeightMatricesForEveryLayer.add(specificLayersWeightMatrix);
    }

    // ALLOCATE SPACE FOR DELTA ( INTERMEDIATE VALUES THAT WE USE TO UPDATE THE WEIGHTS)

    arrayListOfEachLayersDeltaArray = new ArrayList<double[]>();
    //  EACH LAYER HAS AN ARRAY OF DELTA VALUES
    for (int i = 0;
        i < numHiddenLayers + 2;
        i++) { // each layer  // OF COURSE WE COULD HAVE DONE numHiddenLayers + 1, but I want
               // consistency with fnet ArrayList
      double[] specificLayersDeltaArray;
      if (i == 0) { // first hidden layer (Each layer owns its own weights)
        specificLayersDeltaArray = new double[features.cols()]; // INPUTS are the rows
      } else if (i == (numHiddenLayers + 1)) {
        // specificLayersDeltaArray = new double[ numNodesPerHiddenLayer[ i-1 ] ]  ; //[
        // numNodesPerHiddenLayer[ labels.cols() ] ] ; // OUTPUTS ARE THE COLUMNS
        specificLayersDeltaArray = new double[labels.cols()]; // FIND OUT # NODES AT EACH LEVEL
      } else {
        specificLayersDeltaArray = new double[numNodesPerHiddenLayer[i - 1]];
      }
      arrayListOfEachLayersDeltaArray.add(specificLayersDeltaArray);
    }

    previousChangeInWeightMatricesForEachLayer = new ArrayList<double[][]>();

    for (int i = 0; i < numHiddenLayers + 1; i++) { // each layer
      double[][] specificLayersWeightMatrix;
      if (i == 0) { // first hidden layer (Each layer owns its own weights)
        specificLayersWeightMatrix =
            new double[features.cols()][numNodesPerHiddenLayer[i]]; // INPUTS are the rows
      } else if (i == numHiddenLayers) {
        specificLayersWeightMatrix =
            new double[numNodesPerHiddenLayer[i - 1]][labels.cols()]; // OUTPUTS ARE THE COLUMNS
      } else {
        specificLayersWeightMatrix =
            new double[numNodesPerHiddenLayer[i - 1]][numNodesPerHiddenLayer[i]];
      }
      previousChangeInWeightMatricesForEachLayer.add(specificLayersWeightMatrix);
    }

    // INITIALIZE ALL OF PREVIOUS DELTA VALUES TO 0 [ THIS IS DONE AUTOMATICALLY, CAN DELETE ALL OF
    // THIS CODE ]

    // initialize all weights randomly ( small random weights with 0 mean)

    double[][] currentLayersWeightMatrix;
    for (int i = 0; i < numNodesPerHiddenLayer.length + 1; i++) { // scroll across each layer

      currentLayersWeightMatrix = arrayListOfEachLayersWeightMatrices.get(i);
      for (int j = 0; j < currentLayersWeightMatrix.length; j++) {
        for (int k = 0; k < currentLayersWeightMatrix[j].length; k++) {
          currentLayersWeightMatrix[j][k] = (2 * rand.nextDouble()) - 1;
        }
      }
    }

    // GO THROUGH AND ADD THE SPECIFIC WEIGHTS
    // Initial Weights:

    // PUT ALL BIAS WEIGHTS INTO ARRAYLIST (ONE ARRAY FOR EACH LAYER'S BIAS WEIGHTS)
    biasWeightsAcrossAllLayers = new ArrayList<double[]>();
    for (int i = 0; i < numHiddenLayers + 1; i++) {
      if (i < numHiddenLayers) {
        double[] biasArrayToBeAdded = new double[numNodesPerHiddenLayer[i]];
        biasWeightsAcrossAllLayers.add(biasArrayToBeAdded);
      } else {
        double[] biasArrayForOutputNodesToBeAdded = new double[labels.cols()];
        biasWeightsAcrossAllLayers.add(biasArrayForOutputNodesToBeAdded);
      }
    }

    double[] currentBiasLayersWeightArray;
    for (int i = 0; i < numNodesPerHiddenLayer.length + 1; i++) { // scroll across each layer
      currentBiasLayersWeightArray = biasWeightsAcrossAllLayers.get(i);
      for (int j = 0; j < currentBiasLayersWeightArray.length; j++) {

        currentBiasLayersWeightArray[j] = (2 * rand.nextDouble()) - 1;
      }
    }

    // We'll need to store the previous bias weights
    previousBiasChangeInWeightsAcrossAllLayers = new ArrayList<double[]>();
    for (int i = 0; i < numHiddenLayers + 1; i++) {
      if (i < numHiddenLayers) {
        double[] biasArrayToBeAdded = new double[numNodesPerHiddenLayer[i]];
        previousBiasChangeInWeightsAcrossAllLayers.add(biasArrayToBeAdded);
      } else {
        double[] biasArrayForOutputNodesToBeAdded = new double[labels.cols()];
        previousBiasChangeInWeightsAcrossAllLayers.add(biasArrayForOutputNodesToBeAdded);
      }
    }

    // temporarily stashed bias weights across all layers
    temporarilyStashedChangeInBiasWeightsAcrossAllLayers = new ArrayList<double[]>();
    for (int i = 0; i < numHiddenLayers + 1; i++) {
      if (i < numHiddenLayers) {
        double[] biasArrayToBeAdded = new double[numNodesPerHiddenLayer[i]];
        temporarilyStashedChangeInBiasWeightsAcrossAllLayers.add(biasArrayToBeAdded);
      } else {
        double[] biasArrayForOutputNodesToBeAdded = new double[labels.cols()];
        temporarilyStashedChangeInBiasWeightsAcrossAllLayers.add(biasArrayForOutputNodesToBeAdded);
      }
    }

    changeInBiasArrayForEveryLayer = new ArrayList<double[]>();
    for (int i = 0; i < numHiddenLayers + 1; i++) {
      if (i < numHiddenLayers) {
        double[] biasArrayToBeAdded = new double[numNodesPerHiddenLayer[i]];
        changeInBiasArrayForEveryLayer.add(biasArrayToBeAdded);
      } else {
        double[] biasArrayForOutputNodesToBeAdded = new double[labels.cols()];
        changeInBiasArrayForEveryLayer.add(biasArrayForOutputNodesToBeAdded);
      }
    }

    // INITIALIZE BIAS FOR HIDDEN AND OUTPUT NEURONS

    // Stochastic weight update
    // SOMEHOW GOT TO INITIALIZE ALL OF THIS, ADD BLANKS, SO THAT LATER WE CAN
    // storedFNetForEachLayer.set( i, blah );

    storedFNetForEachLayer =
        new ArrayList<double[]>(); // f_net is the output that is fed into the next layer
    for (int i = 0;
        i < numHiddenLayers + 2;
        i++) { // WE HAVE ONE MORE layer of fnet( consider inputs as fnet)
      double[] thisLayersFNetValues;
      // COULD DO IF/ELSE STATEMENTS IF WE ARE LOOKING AT INPUTS, OR THEN HIDDEN NODES,
      if (i == 0) {
        thisLayersFNetValues = new double[features.cols()]; // FIND OUT # NODES AT EACH LEVEL
      } else if (i == numHiddenLayers + 1) { // OR IS IT +1
        thisLayersFNetValues = new double[labels.cols()]; // FIND OUT # NODES AT EACH LEVEL
      } else {
        thisLayersFNetValues =
            new double[numNodesPerHiddenLayer[i - 1]]; // FIND OUT # NODES AT EACH LEVEL
      }
      storedFNetForEachLayer.add(thisLayersFNetValues);
    }

    // -----BEGIN THE TRAINING-----
    double netValAtNode = 0;
    double fOfNetValAtNode = 0;
    for (int epoch = 0;
        epoch < 10000;
        epoch++) { // For each epoch, cap it at 10000, we want to avoid infinite loop
      System.out.println("---Epoch " + epoch + "---");
      for (int instance = 0;
          instance < features.rows();
          instance++) { // later we will swap this Matrix for featuresForTrainingTrimmed
        // GO FORWARD
        // ---------------------------------------------------------------------------------------------------------------------
        //				System.out.println("Forward propagating...");
        for (int layer = 0;
            layer < numHiddenLayers + 2;
            layer++) { // HERE LAYER DENOTES HIDDEN LAYER
          if (layer == 0) {
            storedFNetForEachLayer.set(
                layer, Arrays.copyOf(features.row(instance), features.row(0).length));
            continue;
          }
          double[] thisLayersFNetValues =
              storedFNetForEachLayer.get(
                  layer); // make a new array of doubles  CAN I PLEASE DELETE THIS LINE OF CODE
          for (int node = 0; node < storedFNetForEachLayer.get(layer).length; node++) {
            netValAtNode = 0;
            // FIND THE CROSS PRODUCT;
            // use a for loop to multiply each col of weights vector by each col of
            // outputsFromPreviousLayer
            for (int colInInputVector = 0;
                colInInputVector < storedFNetForEachLayer.get(layer - 1).length;
                colInInputVector++) {
              netValAtNode +=
                  (storedFNetForEachLayer.get(layer - 1)[colInInputVector]
                      * arrayListOfEachLayersWeightMatrices.get(layer - 1)[colInInputVector][node]);
            }
            netValAtNode += (biasWeightsAcrossAllLayers.get(layer - 1)[node]);
            if (netValAtNode < 0) { // make special function
              fOfNetValAtNode = (1 / (1 + Math.pow(Math.E, (-1 * netValAtNode))));
            } else { // normal
              fOfNetValAtNode =
                  (1
                      / (1
                          + (1
                              / (Math.pow(
                                  Math.E,
                                  (netValAtNode)))))); // if it was positive, then we raise to neg
                                                       // exponent
            }
            thisLayersFNetValues[node] = fOfNetValAtNode; // stick it into the object
          }
          storedFNetForEachLayer.set(
              layer,
              thisLayersFNetValues); // or if we are editing object, this is not even necessary
                                     // DOUBLE CHECK
        }
        // ---NOW FOR THIS INSTANCE, GO
        // BACKWARDS-----------------------------------------------------------------------------------------------------------------------
        // System.out.println("Back propagating...");
        // UPDATE THE WEIGHTS
        for (int layer = numHiddenLayers + 1; layer > 0; layer--) { // ACROSS EACH LAYER BACKWARD
          if (layer == numHiddenLayers + 1) { // THIS IS AN OUTPUT LAYER
            for (int node = 0; node < labels.cols(); node++) {
              double deltaArrayForThisLayer[] = arrayListOfEachLayersDeltaArray.get(layer);
              deltaArrayForThisLayer[node] =
                  ((labels.get(instance, node) - storedFNetForEachLayer.get(layer)[node])
                      * (storedFNetForEachLayer.get(layer)[node])
                      * (1 - (storedFNetForEachLayer.get(layer)[node])));
              // should automatically be set since we get the objects address from heap memory, and
              // change it
              for (int inputToThisNode = 0;
                  inputToThisNode < numNodesPerHiddenLayer[layer - 2] + 1;
                  inputToThisNode++) {
                double changeInWeightBetweenIJ = 0;
                if (inputToThisNode == numNodesPerHiddenLayer[layer - 2]) { // this is a bias node

                  changeInWeightBetweenIJ =
                      (learningRate
                          * 1
                          * arrayListOfEachLayersDeltaArray
                              .get(layer)[node]); // NEED TO ADD STUFF FOR MOMENTUM
                  double[] thisLayersBiasWeights =
                      changeInBiasArrayForEveryLayer.get(
                          layer - 1); // NEED TO ADD STUFF FOR MOMENTUM
                  thisLayersBiasWeights[node] =
                      (changeInWeightBetweenIJ); // NEED TO ADD STUFF FOR MOMENTUM
                } else {

                  changeInWeightBetweenIJ =
                      (learningRate
                          * storedFNetForEachLayer.get(layer - 1)[inputToThisNode]
                          * arrayListOfEachLayersDeltaArray.get(layer)[node]);
                  // double[][] thisLayersWeightMatrix =
                  // arrayListOfEachLayersWeightMatrices.get(layer-1);
                  // thisLayersWeightMatrix[inputToThisNode][node] += ( changeInWeightBetweenIJ );
                  double[][] changeInWeightsMatrixForThisLayer =
                      changeInWeightMatricesForEveryLayer.get(layer - 1);
                  changeInWeightsMatrixForThisLayer[inputToThisNode][node] =
                      changeInWeightBetweenIJ;
                }
              }
            }
          } else {

            for (int node = 0;
                node < numNodesPerHiddenLayer[layer - 1] + 1;
                node++) { // ACROSS EACH HIDDEN LAYER (ie these are not output nodes)
              double deltaArrayForThisLayer[] = arrayListOfEachLayersDeltaArray.get(layer);

              if (node == numNodesPerHiddenLayer[layer - 1]) { // this is a bias node
                // change in weight = learningRate *
              } else { // this is not a bias node
                double summedOutgoingWeightsCrossOutputDelta = 0;

                for (int outgoingEdgeToOutgoingNode = 0;
                    outgoingEdgeToOutgoingNode
                        < arrayListOfEachLayersDeltaArray.get(layer + 1).length;
                    outgoingEdgeToOutgoingNode++) {
                  summedOutgoingWeightsCrossOutputDelta +=
                      (arrayListOfEachLayersDeltaArray.get(layer + 1)[outgoingEdgeToOutgoingNode]
                          * arrayListOfEachLayersWeightMatrices
                              .get(layer)[node][outgoingEdgeToOutgoingNode]);
                }

                deltaArrayForThisLayer[node] =
                    ((summedOutgoingWeightsCrossOutputDelta)
                        * (storedFNetForEachLayer.get(layer)[node])
                        * (1 - (storedFNetForEachLayer.get(layer)[node])));
                // should automatically be set since we get the objects address from heap memory,
                // and change it

                if (layer == 1) {
                  // need a for loop across the neural net's input nodes
                  for (int inputToTheNeuralNet = 0;
                      inputToTheNeuralNet < features.cols() + 1;
                      inputToTheNeuralNet++) {
                    double changeInWeightBetweenIJ = 0;
                    if (inputToTheNeuralNet
                        == features.cols()) { // then we know that this is our bias node

                      changeInWeightBetweenIJ =
                          (learningRate
                              * 1
                              * arrayListOfEachLayersDeltaArray
                                  .get(layer)[node]); // NEED TO ADD STUFF FOR MOMENTUM
                      double[] thisLayersBiasWeights =
                          changeInBiasArrayForEveryLayer.get(
                              layer - 1); // NEED TO ADD STUFF FOR MOMENTUM
                      thisLayersBiasWeights[node] =
                          (changeInWeightBetweenIJ); // NEED TO ADD STUFF FOR MOMENTUM

                    } else {

                      changeInWeightBetweenIJ =
                          (learningRate
                              * storedFNetForEachLayer.get(layer - 1)[inputToTheNeuralNet]
                              * arrayListOfEachLayersDeltaArray.get(layer)[node]);
                      double[][] changeInWeightsMatrixForThisLayer =
                          changeInWeightMatricesForEveryLayer.get(layer - 1);
                      changeInWeightsMatrixForThisLayer[inputToTheNeuralNet][node] =
                          changeInWeightBetweenIJ;
                    }
                  }
                } else {
                  for (int inputToThisNode = 0;
                      inputToThisNode < numNodesPerHiddenLayer[layer - 2] + 1;
                      inputToThisNode++) {
                    double changeInWeightBetweenIJ = 0;
                    if (inputToThisNode
                        == numNodesPerHiddenLayer[layer - 2]) { // this is a bias node

                      changeInWeightBetweenIJ =
                          (learningRate
                              * 1
                              * arrayListOfEachLayersDeltaArray
                                  .get(layer)[node]); // NEED TO ADD STUFF FOR MOMENTUM
                      double[] thisLayersBiasWeights =
                          changeInBiasArrayForEveryLayer.get(
                              layer - 1); // NEED TO ADD STUFF FOR MOMENTUM
                      thisLayersBiasWeights[node] =
                          (changeInWeightBetweenIJ); // NEED TO ADD STUFF FOR MOMENTUM

                    } else {

                      changeInWeightBetweenIJ =
                          (learningRate
                              * storedFNetForEachLayer.get(layer - 1)[inputToThisNode]
                              * arrayListOfEachLayersDeltaArray.get(layer)[node]);
                      // double[][] thisLayersWeightMatrix =
                      // arrayListOfEachLayersWeightMatrices.get(layer-1);
                      // thisLayersWeightMatrix[inputToThisNode][node] += ( changeInWeightBetweenIJ
                      // );
                      double[][] changeInWeightsMatrixForThisLayer =
                          changeInWeightMatricesForEveryLayer.get(layer - 1);
                      changeInWeightsMatrixForThisLayer[inputToThisNode][node] =
                          changeInWeightBetweenIJ;
                    }
                  }
                }
              }
            }
          }
        }

        //				System.out.printf( "e_0=%.17f,  e_1=%.17f, e_2=%.17f, e_3=%.17f\n" ,
        // arrayListOfEachLayersDeltaArray.get(2)[0], arrayListOfEachLayersDeltaArray.get(1)[0] ,
        //				arrayListOfEachLayersDeltaArray.get(1)[1] ,
        // arrayListOfEachLayersDeltaArray.get(1)[2]);
        //				System.out.println("Descending Gradient...");

        //				// PUT TEMPORARILY STASHED INTO PREVIOUS
        //				// ONLY HERE SHOULD WE PUT IN THE STASHED WEIGHTS INTO THE PREVIOUS-STASH-SPOT
        //				// PUT STASHED INTO PREVIOUS
        //
        //				// update the bias weights

        // GET NEW CHANGE IN WEIGHT THANKS TO MOMENTUM, PLACE IN PREVIOUS SPOT
        // should be changeInBiasArrayForEveryLayer not

        for (int w = 0; w < previousBiasChangeInWeightsAcrossAllLayers.size(); w++) {
          for (int y = 0; y < previousBiasChangeInWeightsAcrossAllLayers.get(w).length; y++) {
            double currentChangeInWeightVal = changeInBiasArrayForEveryLayer.get(w)[y];
            double[] fullBiasWeightList = biasWeightsAcrossAllLayers.get(w);
            double previousXYCoordInBiasWeightMatrix =
                previousBiasChangeInWeightsAcrossAllLayers.get(w)[y];
            double thisIsTheWeightChangeIncludingMomentum =
                (currentChangeInWeightVal + (momentum * previousXYCoordInBiasWeightMatrix));
            fullBiasWeightList[y] += thisIsTheWeightChangeIncludingMomentum;
            double[] arrayOfPreviousBiases = previousBiasChangeInWeightsAcrossAllLayers.get(w);
            arrayOfPreviousBiases[y] = thisIsTheWeightChangeIncludingMomentum;
          }
        }

        // GET NEW CHANGE IN WEIGHT THANKS TO MOMENTUM, PLACE IN PREVIOUS SPOT

        // We update the weights ( by adding the changes in weights to the weight matrices) after
        // every layer has been processed
        for (int w = 0; w < arrayListOfEachLayersWeightMatrices.size(); w++) {
          for (int y = 0; y < arrayListOfEachLayersWeightMatrices.get(w).length; y++) {
            for (int z = 0; z < arrayListOfEachLayersWeightMatrices.get(w)[y].length; z++) {
              double currentXYCoordInMatrix = changeInWeightMatricesForEveryLayer.get(w)[y][z];
              double[] fullWeightListForLayer = arrayListOfEachLayersWeightMatrices.get(w)[y];

              double previousXYCoordInChangeInWeightMatrix =
                  previousChangeInWeightMatricesForEachLayer.get(w)[y][z];
              double thisIsTheWeightChangeIncludingMomentum =
                  (currentXYCoordInMatrix + (previousXYCoordInChangeInWeightMatrix * momentum));
              fullWeightListForLayer[z] += thisIsTheWeightChangeIncludingMomentum;
              double[][] arrayOfPreviousBiases = previousChangeInWeightMatricesForEachLayer.get(w);
              arrayOfPreviousBiases[y][z] = thisIsTheWeightChangeIncludingMomentum;
              // newWeight(at next round t+1) = learningRate * delta_at_node_we_feed_into * Xi +
              // momentum_parameter * change_in_weight_at_t
              // momentum goes into the weight updates ( not in the change in weights)
            }
          }
        }

        //				System.out.printf( "w_0=%.17f,  w_1=%.17f, w_2=%.17f, w_3=%.17f, w_4=%.17f,
        // w_5=%.17f,\n w_6=%.17f, w_7=%.17f, w_8=%.17f, w_9=%.17f," +
        //						"w_10=%.17f, w_11=%.17f,\n w_12=%.17f\n" ,
        //						biasWeightsAcrossAllLayers.get(1)[0],
        // arrayListOfEachLayersWeightMatrices.get(1)[0][0] ,
        //						arrayListOfEachLayersWeightMatrices.get(1)[1][0] ,
        // arrayListOfEachLayersWeightMatrices.get(1)[2][0] , biasWeightsAcrossAllLayers.get(0)[0],
        //				arrayListOfEachLayersWeightMatrices.get(0)[0][0],
        // arrayListOfEachLayersWeightMatrices.get(0)[1][0], biasWeightsAcrossAllLayers.get(0)[1],
        //				arrayListOfEachLayersWeightMatrices.get(0)[0][1],
        // arrayListOfEachLayersWeightMatrices.get(0)[1][1],
        // arrayListOfEachLayersWeightMatrices.get(0)[0][2],
        //				biasWeightsAcrossAllLayers.get(0)[2],
        // arrayListOfEachLayersWeightMatrices.get(0)[0][2],
        // arrayListOfEachLayersWeightMatrices.get(0)[1][2]);
        //				// ONLY AFTER THIS POINT HAS EVERY LAYER BEEN PROCESSED

      }

      // if( STOPPING CRITERIA MET ) {  // HAVE TO USE THE VALIDATION SET THIS TIME FOR THE STOPPING
      // CRITERION
      currentAccuracy = calculateMSEOnValidationSet(featuresValidationSet, labelsValidationSet);
      // currentAccuracy = calculateMSEOnValidationSet( features , labels ); // On the training set
      // now
      System.out.println(" Current MSE on epoch # " + epoch + " is: " + currentAccuracy);
      currentAccuracyIndex++;
      recentAccuracies[currentAccuracyIndex % 5] = currentAccuracy;
      double sumAccuracies = 0;
      if (currentAccuracyIndex > 5) {
        for (int i = 0; i < recentAccuracies.length; i++) {
          sumAccuracies +=
              Math.abs(recentAccuracies[currentAccuracyIndex % 5] - recentAccuracies[i]);
        }
        if (sumAccuracies
            < 0.01) { // we only stop training when measureAccuracy after 5 epochs does not increase
                      // by 0.01
          break;
        }
      }

      // In theory, it would be wise here to go back to the old best weights because now we're
      // already overfitting if the stopping criterion is met
      features.shuffle(
          rand, labels); // MUST SHUFFLE DATA ROWS AFTER EACH EPOCH,labels is the buddy matrix
    }
    return;
  }