    private static SparseRealMatrix initializeMatrix(SparseRealMatrix matrix, double sigma) {
      // Fill every entry of the matrix with an independent draw from N(0, sigma).
      NormalDistribution normRandom = new NormalDistribution(0.0, sigma);
      int r = matrix.getRowDimension();
      int c = matrix.getColumnDimension();

      for (int i = 0; i < r; i++) {
        for (int j = 0; j < c; j++) {
          double x = normRandom.sample();
          matrix.setEntry(i, j, x);
        }
      }
      return matrix;
    }
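
    // Hypothetical usage sketch (not part of the original class): initializing the
    // input and output weight matrices before training. OpenMapRealMatrix is the
    // Commons Math implementation of SparseRealMatrix; vocabSize, embeddingSize,
    // and the 0.01 standard deviation are illustrative assumptions. W1 is indexed
    // by word rows and W2 by context columns, matching the accesses below.
    //
    //   SparseRealMatrix W1 = initializeMatrix(
    //       new OpenMapRealMatrix(vocabSize, embeddingSize), 0.01);
    //   SparseRealMatrix W2 = initializeMatrix(
    //       new OpenMapRealMatrix(embeddingSize, vocabSize), 0.01);
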
    private void stochasticUpdateStep(Pair<Integer, Set<Integer>> wordPlusContexts, int s) {
      // One stochastic gradient step of skip-gram with negative sampling: for each
      // positive context of the center word, update its output vector, the output
      // vectors of k sampled negative contexts, and the center word's input vector.
      double eta = learningRateDecay(s);
      int wordIndex = wordPlusContexts.getFirst(); // index of the center (input) word
      // h is the hidden-layer activation: the wordIndex-th row of the weight
      // matrix W1, i.e. h = x^T * W1 = W1[wordIndex,:] = v(input).
      RealVector h = W1.getRowVector(wordIndex); // 1xN row vector

      for (int contextWordIndex : wordPlusContexts.getSecond()) {
        // Draw k negative contexts, from the unigram noise distribution if enabled.
        Set<Integer> negativeContexts = sampleUnigram
            ? negativeSampleContexts(wordIndex, noiseSampler)
            : negativeSampleContexts(wordIndex);
        // wordIndex is the input word
        // negativeContexts is the k negative contexts
        // contextWordIndex is 1 positive context

        // First update the output vector for the single positive context.
        RealVector vPrime_j = W2.getColumnVector(contextWordIndex); // Nx1 column vector
        double u = h.dotProduct(vPrime_j); // u_j = vPrime(output) . v(input)
        double t_j = 1.0; // t_j := 1{j == contextWordIndex}, so 1 for the positive context
        double scale = eta * (sigmoid(u) - t_j);

        // Compute both gradients from the same forward pass before applying either
        // update, so the input-vector update uses the pre-update vPrime_j rather
        // than an error term recomputed after a partial step.
        RealVector gradientOut2Hidden = h.mapMultiply(scale);
        RealVector gradientHidden2In = vPrime_j.mapMultiply(scale);
        W2.setColumnVector(contextWordIndex, vPrime_j.subtract(gradientOut2Hidden));

        // Backpropagate the error to the hidden layer and update the input vector.
        h = h.subtract(gradientHidden2In);
        W1.setRowVector(wordIndex, h);

        // Repeat the update process for the k negative contexts.
        t_j = 0.0; // t_j = 0: a negative sample is never the observed context word
        for (int negContext : negativeContexts) {
          vPrime_j = W2.getColumnVector(negContext);
          u = h.dotProduct(vPrime_j);
          scale = eta * (sigmoid(u) - t_j);

          // As above, compute both gradients before applying either update.
          gradientOut2Hidden = h.mapMultiply(scale);
          gradientHidden2In = vPrime_j.mapMultiply(scale);
          W2.setColumnVector(negContext, vPrime_j.subtract(gradientOut2Hidden));

          // Backpropagate the error to the hidden layer and update the input vector.
          h = h.subtract(gradientHidden2In);
          W1.setRowVector(wordIndex, h);
        }
      }
    }
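
    // The update steps above call sigmoid(u), which is not shown in this excerpt.
    // A minimal sketch of a numerically stable logistic function, assuming nothing
    // beyond plain java.lang.Math:
    private static double sigmoid(double u) {
      // Branch on the sign of u so Math.exp() never overflows for large |u|.
      if (u >= 0) {
        return 1.0 / (1.0 + Math.exp(-u));
      }
      double e = Math.exp(u);
      return e / (1.0 + e);
    }
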
    private HashMap<String, float[]> convertEmbeddings(Set<String> targetVocab) {
      // For every word in the target vocabulary, fetch the corresponding column
      // of the output matrix W2 and convert it to a float array.
      HashMap<String, float[]> embeddingMatrix = new HashMap<>();

      for (String word : targetVocab) {
        int wordIndex = encodedVocab.get(word);
        double[] wordEmbedding = W2.getColumn(wordIndex);
        float[] wordEmbeddingFloat = new float[wordEmbedding.length];
        for (int i = 0; i < wordEmbedding.length; i++) {
          wordEmbeddingFloat[i] = (float) wordEmbedding[i];
        }
        embeddingMatrix.put(word, wordEmbeddingFloat);
      }
      return embeddingMatrix;
    }
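
    // Hypothetical follow-on (not part of the original class): cosine similarity
    // over the float[] embeddings returned by convertEmbeddings, e.g. for
    // nearest-neighbour queries against the trained vectors. Assumes both arrays
    // have the same length and neither is the zero vector.
    private static double cosineSimilarity(float[] a, float[] b) {
      double dot = 0.0, normA = 0.0, normB = 0.0;
      for (int i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
      }
      return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }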