/**
 * Folds the per-thread partial results into the totals and, when more than one MPI
 * process is in use, sums those totals across processes.
 *
 * <p>Every NumberofPoints entry with index below NumberofThreads is added onto
 * TotalNumberofPoints; the total is not reset here, so it is assumed to hold the
 * intended starting value on entry (TODO confirm against the constructor/callers).
 * Each TotalVectorSum slot covered by ParallelArrayRanges is overwritten with the sum
 * of that slot over the first NumberofThreads rows of VectorSum.
 */
public final void sumOverThreadsAndMPI() {

      // Add each thread's point count onto the running total
      for (int t = 0; t < NumberofThreads; t++) {
        TotalNumberofPoints += NumberofPoints[t];
      }

      // Parallel loop: every thread index handles the slot range ParallelArrayRanges gives it
      try {
        forallChunked(
            0,
            SALSAUtility.ThreadCount - 1,
            threadIndex -> {
              final int first = ParallelArrayRanges[threadIndex].getStartIndex();
              final int limit = first + ParallelArrayRanges[threadIndex].getLength();
              int slot = first;
              while (slot < limit) {
                // Clear the slot, then add the contribution of each thread in index order
                TotalVectorSum[slot] = 0.0;
                for (int t = 0; t < NumberofThreads; t++) {
                  TotalVectorSum[slot] += VectorSum[t][slot];
                }
                slot++;
              }
            });
      } catch (SuspendableException ex) {
        // Forward the message to the project-wide failure helper
        SALSAUtility.printAndThrowRuntimeException(ex.getMessage());
      }

      // With a single MPI process there is nothing to combine across processes
      if (SALSAUtility.MPI_Size <= 1) {
        return;
      }

      SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
      // MPI Allreduce (sum) on the scalar point count; the returned value replaces the total
      TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);
      // MPI Allreduce (sum) on the whole vector; the return value is not used, so the
      // reduction is presumably applied to the array in place - verify against mpiOps
      SALSAUtility.mpiOps.allReduce(TotalVectorSum, MPI.SUM);
      SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
    }
    /**
     * Sums the per-thread vectors into TotalVectorSum and, when more than one MPI process
     * is in use, sums the point count and the vector across processes.
     *
     * <p>The thread-level summation is bracketed by the ThreadTiming sub-timer and the MPI
     * part by the MPIREDUCETiming1 sub-timer. Vectors longer than 4096 entries are reduced
     * in pieces of at most 4096 doubles through a scratch buffer; shorter ones are reduced
     * with a single call.
     */
    public final void sumOverThreadsAndMPI() {
      SALSAUtility.StartSubTimer(SALSAUtility.ThreadTiming);
      // Parallel loop: every thread index handles the slot range ParallelArrayRanges gives it
      try {
        forallChunked(
            0,
            SALSAUtility.ThreadCount - 1,
            threadIndex -> {
              final int first = ParallelArrayRanges[threadIndex].getStartIndex();
              final int limit = first + ParallelArrayRanges[threadIndex].getLength();
              int slot = first;
              while (slot < limit) {
                // Accumulate locally and store once per slot
                double acc = 0.0;
                for (int t = 0; t < NumberofThreads; t++) {
                  acc += VectorSum[t][slot];
                }
                TotalVectorSum[slot] = acc;
                slot++;
              }
            });
      } catch (SuspendableException ex) {
        // Forward the message to the project-wide failure helper
        SALSAUtility.printAndThrowRuntimeException(ex.getMessage());
      }
      SALSAUtility.StopSubTimer(SALSAUtility.ThreadTiming);

      // With a single MPI process there is nothing to combine across processes
      if (SALSAUtility.MPI_Size <= 1) {
        return;
      }

      SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
      // MPI Allreduce (sum) on the scalar point count; the returned value replaces the total
      TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);

      final int chunkSize = 4096;
      final int total = TotalVectorSum.length;
      if (total > chunkSize) {
        // Reduce the vector piecewise through a fixed-size scratch buffer
        final double[] scratch = new double[chunkSize];
        for (int offset = 0; offset < total; ) {
          final int remaining = total - offset;
          final int count = remaining < chunkSize ? remaining : chunkSize;
          System.arraycopy(TotalVectorSum, offset, scratch, 0, count);
          // MPI Allreduce (sum) on the full scratch buffer; only the first `count`
          // entries are copied back, the rest are leftovers from an earlier piece
          SALSAUtility.mpiOps.allReduce(scratch, MPI.SUM);
          System.arraycopy(scratch, 0, TotalVectorSum, offset, count);
          offset += count;
        }
      } else {
        // Small enough for one call: MPI Allreduce (sum) on the whole vector
        SALSAUtility.mpiOps.allReduce(TotalVectorSum, MPI.SUM);
      }
      SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
    }