/**
 * Folds the per-thread partial results into the totals and, when more than one
 * MPI process is in use, reduces those totals across processes.
 *
 * <p>Every entry of NumberofPoints (one per thread) is added to
 * TotalNumberofPoints. Each element of TotalVectorSum is overwritten with the
 * sum of the matching VectorSum element over all threads. If
 * SALSAUtility.MPI_Size is greater than 1, both totals are passed through
 * SALSAUtility.mpiOps.allReduce with MPI.SUM.
 */
public final void sumOverThreadsAndMPI() {

      // Accumulate the point counts contributed by each thread.
      for (int worker = 0; worker < NumberofThreads; worker++) {
        TotalNumberofPoints += NumberofPoints[worker];
      }

      // Note - parallel for
      // Each chunk owns the index range described by ParallelArrayRanges[chunk],
      // so no two chunks write the same element of TotalVectorSum.
      try {
        forallChunked(
            0,
            SALSAUtility.ThreadCount - 1,
            (chunk) -> {
              int first = ParallelArrayRanges[chunk].getStartIndex();
              int limit = first + ParallelArrayRanges[chunk].getLength();
              for (int slot = first; slot < limit; slot++) {
                // Start from zero so any previous content of the total is discarded.
                double acc = 0.0;
                for (int worker = 0; worker < NumberofThreads; worker++) {
                  acc += VectorSum[worker][slot];
                }
                TotalVectorSum[slot] = acc;
              }
            });
      } catch (SuspendableException e) {
        SALSAUtility.printAndThrowRuntimeException(e.getMessage());
      }

      // Nothing left to do when running on a single MPI process.
      if (SALSAUtility.MPI_Size <= 1) {
        return;
      }

      SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
      // Note - MPI Call - Allreduce - double - sum
      TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);
      // Note - MPI Call - Allreduce - double[] - sum
      // The return value is not used, so the array is presumably reduced in place.
      SALSAUtility.mpiOps.allReduce(TotalVectorSum, MPI.SUM);
      SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
    }
    /**
     * Sums the per-thread counts, value sums and squared-value sums, reduces them
     * across MPI processes when SALSAUtility.MPI_Size is greater than 1, and then
     * converts the sums into a mean and a standard deviation.
     *
     * <p>On return (when TotalNumberofPoints is at least 0.5) Totalmean holds
     * sum/count, Totalsquare holds sumOfSquares/count - mean*mean, and Totalsigma
     * holds the square root of that value clamped at zero. When
     * TotalNumberofPoints is below 0.5 the raw sums are left untouched.
     */
    public final void sumOverThreadsAndMPI() {
      // Fold every thread's partial sums into the totals.
      for (int worker = 0; worker < NumberofThreads; worker++) {
        TotalNumberofPoints += NumberofPoints[worker];
        Totalmean += mean[worker];
        Totalsquare += square[worker];
      }

      final boolean multiProcess = SALSAUtility.MPI_Size > 1;
      if (multiProcess) {
        SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
        // Note - MPI Call - Allreduce - double - sum
        TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);
        // Note - MPI Call - Allreduce - double - sum
        Totalmean = SALSAUtility.mpiOps.allReduce(Totalmean, MPI.SUM);
        // Note - MPI Call - Allreduce - double - sum
        Totalsquare = SALSAUtility.mpiOps.allReduce(Totalsquare, MPI.SUM);
        SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
      }

      // With (effectively) no points there is nothing to normalise by.
      if (TotalNumberofPoints < 0.5) {
        return;
      }

      double average = Totalmean / TotalNumberofPoints;
      double variance = (Totalsquare / TotalNumberofPoints) - average * average;
      Totalmean = average;
      Totalsquare = variance;
      // The clamp keeps the square root defined if the variance comes out negative.
      Totalsigma = Math.sqrt(Math.max(0.0, variance));
    }
 /**
  * Adds each thread's NumberofPoints and Intvalue entries to TotalNumberofPoints
  * and TotalInt, then sums both totals across MPI processes when
  * SALSAUtility.MPI_Size is greater than 1.
  */
 public final void sumOverThreadsAndMPI() {
   // Thread-level accumulation.
   for (int worker = 0; worker < NumberofThreads; worker++) {
     TotalNumberofPoints += NumberofPoints[worker];
     TotalInt += Intvalue[worker];
   }

   // A single process already holds the final totals.
   if (SALSAUtility.MPI_Size <= 1) {
     return;
   }

   SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
   // Note - MPI Call - Allreduce - int - sum
   TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);
   // Note - MPI Call - Allreduce - int - sum
   TotalInt = SALSAUtility.mpiOps.allReduce(TotalInt, MPI.SUM);
   SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
 }
 /**
  * Adds each thread's NumberofPoints entry to TotalNumberofPoints and sets
  * TotalOr to true if any thread's Orvalue entry is true. When
  * SALSAUtility.MPI_Size is greater than 1 the count is summed (MPI.SUM) and the
  * flag is OR-ed (MPI.LOR) across processes.
  */
 public final void sumOverThreadsAndMPI() {
   for (int worker = 0; worker < NumberofThreads; worker++) {
     TotalNumberofPoints += NumberofPoints[worker];
     // A true flag from any thread latches the total; false leaves it as it was.
     if (Orvalue[worker]) {
       TotalOr = true;
     }
   }

   // A single process already holds the final totals.
   if (SALSAUtility.MPI_Size <= 1) {
     return;
   }

   SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
   // Note - MPI Call - Allreduce - double - sum
   TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);
   // Note - MPI Call - Allreduce - boolean - or
   TotalOr = SALSAUtility.mpiOps.allReduce(TotalOr, MPI.LOR);
   SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
 }
 /**
  * Adds each thread's NumberofPoints entry to TotalNumberofPoints and raises
  * TotalMax to the largest Maxvalue entry seen. When SALSAUtility.MPI_Size is
  * greater than 1 the count is summed (MPI.SUM) and the maximum is taken
  * (MPI.MAX) across processes.
  */
 public final void sumOverThreadsAndMPI() {
   for (int worker = 0; worker < NumberofThreads; worker++) {
     TotalNumberofPoints += NumberofPoints[worker];
     // TotalMax is not reset here, so its incoming value takes part in the maximum.
     TotalMax = Math.max(TotalMax, Maxvalue[worker]);
   }

   // A single process already holds the final totals.
   if (SALSAUtility.MPI_Size <= 1) {
     return;
   }

   SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
   // Note - MPI Call - Allreduce - double - sum
   TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);
   // Note - MPI Call - Allreduce - double - max
   TotalMax = SALSAUtility.mpiOps.allReduce(TotalMax, MPI.MAX);
   SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
 }
 /**
  * Adds each thread's NumberofPoints entry to TotalNumberofPoints and adds each
  * thread's VectorSum row, element by element over the first ArraySize entries,
  * into TotalVectorSum. When SALSAUtility.MPI_Size is greater than 1 both totals
  * are summed across processes with MPI.SUM.
  */
 public final void sumOverThreadsAndMPI() {
   for (int worker = 0; worker < NumberofThreads; worker++) {
     TotalNumberofPoints += NumberofPoints[worker];
     // TotalVectorSum is added to, not reset, so its incoming content is kept.
     for (int slot = 0; slot < ArraySize; slot++) {
       TotalVectorSum[slot] += VectorSum[worker][slot];
     }
   }

   // A single process already holds the final totals.
   if (SALSAUtility.MPI_Size <= 1) {
     return;
   }

   SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
   // Note - MPI Call - Allreduce - int - sum
   TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);
   // Note - MPI Call - Allreduce - int[] - sum
   // The return value is not used, so the array is presumably reduced in place.
   SALSAUtility.mpiOps.allReduce(TotalVectorSum, MPI.SUM);
   SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
 }
    /**
     * Overwrites each element of TotalVectorSum with the sum of the matching
     * VectorSum element over all threads (computed in parallel, timed under
     * SALSAUtility.ThreadTiming) and, when SALSAUtility.MPI_Size is greater than
     * 1, sums TotalNumberofPoints and TotalVectorSum across MPI processes.
     *
     * <p>Vectors longer than 4096 elements are reduced through a 4096-element
     * scratch buffer, one slice per allReduce call.
     *
     * <p>NOTE(review): unlike the other reductions in this file, this method
     * never adds NumberofPoints[...] into TotalNumberofPoints before the
     * allReduce - confirm that the count is accumulated elsewhere.
     */
    public final void sumOverThreadsAndMPI() {
      SALSAUtility.StartSubTimer(SALSAUtility.ThreadTiming);
      // Note - parallel for
      // Each chunk owns the index range described by ParallelArrayRanges[chunk].
      try {
        forallChunked(
            0,
            SALSAUtility.ThreadCount - 1,
            (chunk) -> {
              int first = ParallelArrayRanges[chunk].getStartIndex();
              int limit = first + ParallelArrayRanges[chunk].getLength();
              for (int slot = first; slot < limit; slot++) {
                double acc = 0.0;
                for (int worker = 0; worker < NumberofThreads; worker++) {
                  acc += VectorSum[worker][slot];
                }
                TotalVectorSum[slot] = acc;
              }
            });
      } catch (SuspendableException e) {
        SALSAUtility.printAndThrowRuntimeException(e.getMessage());
      }
      SALSAUtility.StopSubTimer(SALSAUtility.ThreadTiming);

      // A single process needs no cross-process reduction.
      if (SALSAUtility.MPI_Size <= 1) {
        return;
      }

      SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
      // Note - MPI Call - Allreduce - double - sum
      TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);

      final int sliceCapacity = 4096;
      final int total = TotalVectorSum.length;
      if (total > sliceCapacity) {
        // Reduce slice by slice through a fixed-size scratch buffer.
        double[] scratch = new double[sliceCapacity];
        for (int offset = 0, span; offset < total; offset += span) {
          span = Math.min(total - offset, sliceCapacity);
          System.arraycopy(TotalVectorSum, offset, scratch, 0, span);
          // Note - MPI Call - Allreduce - double[] - sum
          // The whole scratch buffer is passed even for a short final slice;
          // only the first span entries are copied back.
          SALSAUtility.mpiOps.allReduce(scratch, MPI.SUM);
          System.arraycopy(scratch, 0, TotalVectorSum, offset, span);
        }
      } else {
        // Note - MPI Call - Allreduce - double[] - sum
        SALSAUtility.mpiOps.allReduce(TotalVectorSum, MPI.SUM);
      }
      SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
    }
    /**
     * Adds each thread's NumberofPoints entry to TotalNumberofPoints and raises
     * each of the first ArraySize elements of TotalVectorMax to the largest
     * matching VectorMax element over all threads. When SALSAUtility.MPI_Size is
     * greater than 1 the count is summed (MPI.SUM) and the vector is maximised
     * (MPI.MAX) across processes.
     */
    public final void sumOverThreadsAndMPI() {
      for (int worker = 0; worker < NumberofThreads; worker++) {
        TotalNumberofPoints += NumberofPoints[worker];
        // TotalVectorMax is not reset here, so its incoming content takes part
        // in the maximum.
        for (int slot = 0; slot < ArraySize; slot++) {
          TotalVectorMax[slot] = Math.max(TotalVectorMax[slot], VectorMax[worker][slot]);
        }
      }

      // A single process already holds the final totals.
      if (SALSAUtility.MPI_Size <= 1) {
        return;
      }

      SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
      // Note - MPI Call - Allreduce - double - sum
      TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);
      // Note - MPI Call - Allreduce - double[] - max
      // The return value is not used, so the array is presumably reduced in place.
      SALSAUtility.mpiOps.allReduce(TotalVectorMax, MPI.MAX);
      SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
    }
    /**
     * Picks the extreme value (and its index) among the per-thread candidates and
     * then among all MPI processes.
     *
     * <p>A non-zero MinMaxPointer selects the maximum, zero selects the minimum.
     * Threads whose IndexValue entry is negative are skipped entirely, including
     * their point count. When SALSAUtility.MPI_Size is greater than 1 the local
     * winner is reduced with MAX_WITH_INDEX or MIN_WITH_INDEX and
     * TotalNumberofPoints is summed across processes.
     */
    public final void sumOverThreadsAndMPI() {
      final boolean seekingMax = MinMaxPointer != 0;

      for (int worker = 0; worker < NumberofThreads; worker++) {
        // A negative index marks a thread that has no candidate.
        if (IndexValue[worker] >= 0) {
          TotalNumberofPoints += NumberofPoints[worker];

          // The current total survives only if one is already set and it beats
          // this thread's value. Ties differ by mode: for the maximum a tie
          // replaces the total, for the minimum a tie keeps it.
          boolean incumbentWins =
              (TotalIndexValue >= 0)
                  && (seekingMax
                      ? TotalMaxOrMin > MaxOrMinvalue[worker]
                      : TotalMaxOrMin <= MaxOrMinvalue[worker]);
          if (!incumbentWins) {
            TotalMaxOrMin = MaxOrMinvalue[worker];
            TotalIndexValue = IndexValue[worker];
          }
        }
      }

      // A single process already holds the final result.
      if (SALSAUtility.MPI_Size <= 1) {
        return;
      }

      SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
      salsa.mpi.MPIReducePlusIndex localBest =
          new salsa.mpi.MPIReducePlusIndex(TotalIndexValue, TotalMaxOrMin);
      salsa.mpi.MPIReducePlusIndex globalBest;
      if (seekingMax) {
        // Note - MPI Call - Allreduce - MPIReducePlusIndex - max with index
        globalBest =
            SALSAUtility.mpiOps.allReduce(
                localBest, salsa.mpi.MPIReducePlusIndex.Op.MAX_WITH_INDEX);
      } else {
        // Note - MPI Call - Allreduce - MPIReducePlusIndex - min with index
        globalBest =
            SALSAUtility.mpiOps.allReduce(
                localBest, salsa.mpi.MPIReducePlusIndex.Op.MIN_WITH_INDEX);
      }
      TotalMaxOrMin = globalBest.getValue();
      TotalIndexValue = globalBest.getIndex();
      // Note - MPI Call - Allreduce - double - sum
      TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);
      SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
    }
 /**
  * Adds each thread's NumberOfPoints entry to TotalNumberofPoints and adds each
  * thread's OuterDimension x InnerDimension block of Sum, element by element,
  * into TotalSum. When SALSAUtility.MPI_Size is greater than 1 the count is
  * summed across processes and every row of TotalSum is summed with its own
  * allReduce call (MPI.SUM).
  */
 public final void sumOverThreadsAndMPI() {
   for (int worker = 0; worker < NumberOfThreads; worker++) {
     TotalNumberofPoints += NumberOfPoints[worker];
     // TotalSum is added to, not reset, so its incoming content is kept.
     for (int row = 0; row < OuterDimension; ++row) {
       for (int col = 0; col < InnerDimension; ++col) {
         TotalSum[row][col] += Sum[worker][row][col];
       }
     }
   }

   // A single process already holds the final totals.
   if (SALSAUtility.MPI_Size <= 1) {
     return;
   }

   SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
   // Note - MPI Call - Allreduce - double - sum
   TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);
   for (int row = 0; row < OuterDimension; ++row) {
     // Note - MPI Call - Allreduce - double[] - sum
     // The return value is not used, so the row is presumably reduced in place.
     SALSAUtility.mpiOps.allReduce(TotalSum[row], MPI.SUM);
   }
   SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
 }
    /**
     * Sums the per-thread counts, first moments, second moments and cross
     * moment of two variables, reduces them across MPI processes when
     * SALSAUtility.MPI_Size is greater than 1, and converts the sums into means,
     * variances, standard deviations and a correlation coefficient.
     *
     * <p>On return (when TotalNumberofPoints is at least 0.5):
     * Totalmean1/Totalmean2 hold the means, Totalsquare1/Totalsquare2 hold
     * sumOfSquares/count - mean*mean, Totalsigma1/Totalsigma2 hold the square
     * roots of those values clamped at zero, and Totalcross12 holds the
     * covariance divided by the product of the two sigmas. When
     * TotalNumberofPoints is below 0.5 the raw sums are left untouched.
     */
    public final void sumOverThreadsAndMPI() {
      // Fold every thread's partial sums into the totals.
      for (int ThreadNo = 0; ThreadNo < NumberofThreads; ThreadNo++) {
        TotalNumberofPoints += NumberofPoints[ThreadNo];
        Totalmean1 += mean1[ThreadNo];
        Totalmean2 += mean2[ThreadNo];
        Totalsquare1 += square1[ThreadNo];
        Totalsquare2 += square2[ThreadNo];
        Totalcross12 += cross12[ThreadNo];
      }
      if (SALSAUtility.MPI_Size > 1) {
        SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
        // Note - MPI Call - Allreduce - double - sum
        TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);
        // Note - MPI Call - Allreduce - double - sum
        Totalmean1 = SALSAUtility.mpiOps.allReduce(Totalmean1, MPI.SUM);
        // Note - MPI Call - Allreduce - double - sum
        Totalmean2 = SALSAUtility.mpiOps.allReduce(Totalmean2, MPI.SUM);
        // Note - MPI Call - Allreduce - double - sum
        Totalsquare1 = SALSAUtility.mpiOps.allReduce(Totalsquare1, MPI.SUM);
        // Note - MPI Call - Allreduce - double - sum
        Totalsquare2 = SALSAUtility.mpiOps.allReduce(Totalsquare2, MPI.SUM);
        // Note - MPI Call - Allreduce - double - sum
        Totalcross12 = SALSAUtility.mpiOps.allReduce(Totalcross12, MPI.SUM);
        SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
      }

      // With (effectively) no points there is nothing to normalise by.
      if (TotalNumberofPoints < 0.5) {
        return;
      }

      Totalmean1 = Totalmean1 / TotalNumberofPoints;
      Totalmean2 = Totalmean2 / TotalNumberofPoints;
      Totalsquare1 = (Totalsquare1 / TotalNumberofPoints) - Totalmean1 * Totalmean1;
      Totalsquare2 = (Totalsquare2 / TotalNumberofPoints) - Totalmean2 * Totalmean2;
      Totalcross12 = (Totalcross12 / TotalNumberofPoints) - Totalmean1 * Totalmean2;
      // Floating-point rounding can leave a near-zero variance slightly negative,
      // and Math.sqrt of a negative number is NaN. Clamp at zero, matching the
      // single-variable mean/sigma reduction in this file, so sigma is 0 instead.
      Totalsigma1 = Math.sqrt(Math.max(0.0, Totalsquare1));
      Totalsigma2 = Math.sqrt(Math.max(0.0, Totalsquare2));
      // NOTE(review): the division is unguarded; if either sigma is zero the
      // correlation comes out as NaN or an infinity.
      Totalcross12 = Totalcross12 / (Totalsigma1 * Totalsigma2);
    }
    /**
     * Adds each thread's NumberofPoints entry to TotalNumberofPoints and each
     * thread's mean row (first ArraySize entries) into Totalmean, sums both
     * across MPI processes when SALSAUtility.MPI_Size is greater than 1, and
     * finally divides every Totalmean element by TotalNumberofPoints.
     *
     * <p>The division is skipped when TotalNumberofPoints is below 0.5, leaving
     * the raw sums in Totalmean.
     */
    public final void sumOverThreadsAndMPI() {
      // Thread-level accumulation of counts and component sums.
      for (int worker = 0; worker < NumberofThreads; worker++) {
        TotalNumberofPoints += NumberofPoints[worker];
        for (int slot = 0; slot < ArraySize; slot++) {
          Totalmean[slot] += mean[worker][slot];
        }
      }

      final boolean multiProcess = SALSAUtility.MPI_Size > 1;
      if (multiProcess) {
        SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
        // Note - MPI Call - Allreduce - double - sum
        TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);
        // Note - MPI Call - Allreduce - double[] - sum
        // The return value is not used, so the array is presumably reduced in place.
        SALSAUtility.mpiOps.allReduce(Totalmean, MPI.SUM);
        SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
      }

      // With (effectively) no points there is nothing to normalise by.
      if (TotalNumberofPoints < 0.5) {
        return;
      }

      // Turn the component sums into component means.
      for (int slot = 0; slot < ArraySize; slot++) {
        Totalmean[slot] /= TotalNumberofPoints;
      }
    }
 /**
  * Prints the two means, the two sigmas and the correlation on one line via
  * SALSAUtility.SALSAPrint.
  *
  * <p>Nothing is printed when SALSAUtility.DebugPrintOption is 0 or when
  * SALSAUtility.MPI_Rank is not 0.
  *
  * @param label text placed at the start of the line
  * @param FPformat String.format pattern applied to each of the five values
  */
 public final void print(String label, String FPformat) {
   boolean debugEnabled = SALSAUtility.DebugPrintOption != 0;
   boolean onRankZero = SALSAUtility.MPI_Rank == 0;
   if (!(debugEnabled && onRankZero)) {
     return;
   }

   StringBuilder line = new StringBuilder();
   line.append(label);
   line.append(" means ").append(String.format(FPformat, Totalmean1));
   line.append(" ").append(String.format(FPformat, Totalmean2));
   line.append(" sigmas ").append(String.format(FPformat, Totalsigma1));
   line.append(" ").append(String.format(FPformat, Totalsigma2));
   line.append(" correl ").append(String.format(FPformat, Totalcross12));

   SALSAUtility.SALSAPrint(1, line.toString());
 }
    /**
     * Merges the per-thread sets of smallest values into one set for this
     * process, then produces a globally ordered list of the Numbertofind
     * smallest values (with their indices) across all MPI processes.
     *
     * <p>Phase 1 resets TotalMinValue/TotalIndexValue/TotalWorst and feeds every
     * per-thread entry with a non-negative index to FindMinimumSet. Phase 2 runs
     * Numbertofind rounds: each round picks the smallest remaining local entry,
     * reduces it with MIN_WITH_INDEX when SALSAUtility.MPI_Size is greater than
     * 1, records the winner in OrderedMinValue/OrderedIndexValue, and retires the
     * local entry if it was the winner.
     */
    public final void sumOverThreadsAndMPI() {

      // Mark every slot of the process-level set as empty (-1 is the "unset" marker).
      for (int storeloop = 0; storeloop < Numbertofind; storeloop++) {
        TotalMinValue[storeloop] = -1.0;
        TotalIndexValue[storeloop] = -1;
      }
      TotalWorst = -1;
      for (int ThreadNo = 0; ThreadNo < NumberofThreads; ThreadNo++) {
        TotalNumberofPoints += NumberofPoints[ThreadNo];
        for (int storeloop = 0; storeloop < Numbertofind; storeloop++) {
          if (IndexValuebythread[ThreadNo][storeloop] < 0) {
            // NOTE(review): the trailing comment says "End this thread", but
            // `continue` only skips this slot and keeps scanning the remaining
            // slots of the same thread - confirm whether `break` was intended.
            continue; // End this thread
          }
          // TotalWorst is passed through a RefObject wrapper so the helper can
          // update it; the new value is copied back after the call.
          // Presumably FindMinimumSet inserts the candidate into the running set
          // of smallest values - its body is not visible here.
          tangible.RefObject<Integer> tempRef_TotalWorst = new tangible.RefObject<>(TotalWorst);
          FindMinimumSet(
              MinValuebythread[ThreadNo][storeloop],
              IndexValuebythread[ThreadNo][storeloop],
              tempRef_TotalWorst,
              TotalMinValue,
              TotalIndexValue,
              Numbertofind);
          TotalWorst = tempRef_TotalWorst.argValue;
        }
      }
      if (SALSAUtility.MPI_Size > 1) {
        SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
        // Note - MPI Call - Allreduce - double - sum
        TotalNumberofPoints = SALSAUtility.mpiOps.allReduce(TotalNumberofPoints, MPI.SUM);
        SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
      }
      // Sort in absolute order and accumulate over processes. This takes Numbertofind steps
      for (int OrderLoop = 0; OrderLoop < Numbertofind; OrderLoop++) {
        int localindex = -1; // unset
        double localvalue = -1.0;
        int loopused = -1;
        // Scan the entries that have not been retired yet (index >= 0) for the
        // smallest value held by this process.
        for (int internalloop = 0; internalloop < Numbertofind; internalloop++) { // Find minimum
          if (TotalIndexValue[internalloop] < 0) {
            continue;
          }
          if ((localindex < 0) || (TotalMinValue[internalloop] < localvalue)) {
            localindex = TotalIndexValue[internalloop];
            localvalue = TotalMinValue[internalloop];
            loopused = internalloop;
          }
        }
        // Remember this process's own candidate so it can be compared with the
        // winner returned by the reduction.
        int oldlocalindex = localindex;
        if (SALSAUtility.MPI_Size > 1) {
          SALSAUtility.StartSubTimer(SALSAUtility.MPIREDUCETiming1);
          // NOTE(review): a process with no remaining candidate contributes
          // index -1 and value -1.0; how MIN_WITH_INDEX treats that sentinel is
          // not visible here - verify it cannot win the reduction.
          // Note - MPI Call - Allreduce - MPIReducePlusIndex - min with index
          salsa.mpi.MPIReducePlusIndex result =
              SALSAUtility.mpiOps.allReduce(
                  new salsa.mpi.MPIReducePlusIndex(localindex, localvalue),
                  salsa.mpi.MPIReducePlusIndex.Op.MIN_WITH_INDEX);
          localvalue = result.getValue();
          localindex = result.getIndex();
          SALSAUtility.StopSubTimer(SALSAUtility.MPIREDUCETiming1);
        }

        OrderedMinValue[OrderLoop] = localvalue;
        OrderedIndexValue[OrderLoop] = localindex;
        // Retire the local entry only if it was the one selected this round, so
        // it is not offered again in later rounds.
        if ((oldlocalindex >= 0) && (OrderedIndexValue[OrderLoop] == oldlocalindex)) {
          TotalIndexValue[loopused] = -1;
          TotalMinValue[loopused] = -1.0;
        }
      } // Loop over Order Loop
    }