    private void stochasticUpdateStep(Pair<Integer, Set<Integer>> wordPlusContexts, int s) {
      double eta = learningRateDecay(s);
      int wordIndex = wordPlusContexts.getFirst(); // index of the center (input) word
      // Set h equal to the wordIndex-th row of the weight matrix W1:
      // h = x^T * W1 = W1[wordIndex,:] = v(input)
      RealVector h = W1.getRowVector(wordIndex); // length-N hidden-layer vector

      for (int contextWordIndex : wordPlusContexts.getSecond()) {
        Set<Integer> negativeContexts;
        if (sampleUnigram) {
          negativeContexts = negativeSampleContexts(wordIndex, noiseSampler);
        } else {
          negativeContexts = negativeSampleContexts(wordIndex);
        }
        // wordIndex is the input word
        // negativeContexts is the k negative contexts
        // contextWordIndex is 1 positive context

        // First update the output vector for the 1 positive context
        RealVector vPrime_j = W2.getColumnVector(contextWordIndex); // output vector v'_j, length N
        double u = h.dotProduct(vPrime_j); // u_j = v'_j . h
        double t_j = 1.0; // t_j := 1{j is a true context}, so 1 here
        double scale = eta * (sigmoid(u) - t_j); // learning rate times prediction error
        // The same error is backpropagated to the input vector, and it must flow
        // through the *old* output vector, so take that gradient before updating v'_j
        RealVector gradientHidden2In = vPrime_j.mapMultiply(scale);
        RealVector gradientOut2Hidden = h.mapMultiply(scale);
        vPrime_j = vPrime_j.subtract(gradientOut2Hidden);
        W2.setColumnVector(contextWordIndex, vPrime_j);

        // Next backpropagate the error to the hidden layer and update the input vector
        h = h.subtract(gradientHidden2In);
        W1.setRowVector(wordIndex, h);

        // Repeat the update process for the k negative contexts
        t_j = 0.0; // t_j := 1{j is a true context}, so 0 for negative samples
        for (int negContext : negativeContexts) {
          vPrime_j = W2.getColumnVector(negContext);
          u = h.dotProduct(vPrime_j);
          scale = eta * (sigmoid(u) - t_j);
          // As above, take the hidden-layer gradient from the old output vector
          gradientHidden2In = vPrime_j.mapMultiply(scale);
          gradientOut2Hidden = h.mapMultiply(scale);
          vPrime_j = vPrime_j.subtract(gradientOut2Hidden);
          W2.setColumnVector(negContext, vPrime_j);

          // Backpropagate the error to the hidden layer and update the input vector
          h = h.subtract(gradientHidden2In);
          W1.setRowVector(wordIndex, h);
        }
      }
    }
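
    // learningRateDecay and negativeSampleContexts are defined elsewhere in the
    // class; the sketches below are illustrative assumptions, not the actual code.
    // This one assumes the linear decay schedule of the original word2vec C code;
    // initialEta and totalTrainingSteps are hypothetical fields.
    private double learningRateDecay(int s) {
      double eta = initialEta * (1.0 - (double) s / (double) totalTrainingSteps);
      return Math.max(eta, initialEta * 1e-4); // floor keeps eta strictly positive
    }

    // A sketch of the non-unigram variant of negativeSampleContexts, assuming k
    // draws uniformly at random over the vocabulary, rejecting the input word
    // itself; numNegativeSamples, vocabularySize, and rng (a java.util.Random)
    // are hypothetical fields.
    private Set<Integer> negativeSampleContexts(int wordIndex) {
      Set<Integer> negatives = new HashSet<>();
      while (negatives.size() < numNegativeSamples) {
        int candidate = rng.nextInt(vocabularySize);
        if (candidate != wordIndex) {
          negatives.add(candidate);
        }
      }
      return negatives;
    }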
    private static double sigmoid(RealVector x, RealVector y) {
      // Convenience overload: sigmoid of the dot product of two vectors
      double z = x.dotProduct(y);
      return sigmoid(z);
    }
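
    // The scalar sigmoid delegated to above is not shown in this excerpt; the
    // standard definition would be:
    private static double sigmoid(double z) {
      return 1.0 / (1.0 + Math.exp(-z));
    }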