/**
   * Backprojects a single projection into the volume buffer on the OpenCL device.
   *
   * <p>Uploads the projection matrix and the projection image for {@code projectionNumber}, binds
   * the kernel arguments and launches the backprojection kernel over the x/y extent of the
   * reconstruction grid. The call blocks until the kernel has finished.
   *
   * @param projectionNumber index of the projection (and of its projection matrix) to backproject
   * @param dimz number of z slices handed to the kernel; {@code OpenCLRun} passes the sub-volume
   *     depth here when running in large volume mode
   * @param respoffset offset forwarded unchanged to the kernel as its second argument; {@code
   *     OpenCLRun} derives it from the motion field divided by the z voxel spacing
   */
  private synchronized void projectSingleProjection(
      int projectionNumber, int dimz, float respoffset) {
    // Upload the projection matrix and the projection image of this view.
    initProjectionMatrix(projectionNumber);
    Grid2D projection = projections.get(projectionNumber);
    initProjectionData(projection);
    // The projection is intentionally kept in the list; the former removal is disabled:
    // projections.remove(projectionNumber);

    // OpenCL grids are only two dimensional, so the z extent is passed as a kernel argument
    // (presumably the kernel loops over the slices itself -- confirm against the .cl source).
    int reconDimensionZ = dimz;
    double voxelSpacingX = getGeometry().getVoxelSpacingX();
    double voxelSpacingY = getGeometry().getVoxelSpacingY();
    double voxelSpacingZ = getGeometry().getVoxelSpacingZ();

    // Bind the kernel arguments; the order must match the kernel signature.
    kernelFunction.rewind();
    kernelFunction
        .putArg(volumePointer)
        .putArg(respoffset)
        .putArg((int) lineOffset)
        .putArg(reconDimensionZ)
        .putArg((float) voxelSpacingX)
        .putArg((float) voxelSpacingY)
        .putArg((float) voxelSpacingZ)
        .putArg((float) offsetX)
        .putArg((float) offsetY)
        .putArg((float) offsetZ)
        .putArg(projectionTex)
        .putArg(volStride)
        .putArg(projectionMatrix);

    // Clamp the work-group shape to what the device supports. The device limit applies to the
    // TOTAL number of work-items per group (x * y), not to each dimension separately, so the
    // y extent is shrunk whenever the product would exceed the limit.
    int maxWorkGroupSize = device.getMaxWorkGroupSize();
    int localSizeX = Math.min(maxWorkGroupSize, bpBlockSize[0]);
    int localSizeY = Math.min(maxWorkGroupSize, bpBlockSize[1]);
    if (localSizeX > 0 && ((long) localSizeX * localSizeY) > maxWorkGroupSize) {
      localSizeY = Math.max(1, maxWorkGroupSize / localSizeX);
    }

    // NOTE(review): the global size is the raw reconstruction size; it is NOT rounded up to a
    // multiple of the local size. This presumably relies on the reconstruction dimensions being
    // multiples of the block size -- confirm.
    int globalSizeX = getGeometry().getReconDimensionX();
    int globalSizeY = getGeometry().getReconDimensionY();

    // Upload the projection texture, wait for it, then run the kernel, which writes its results
    // into the volume buffer bound as the first argument.
    commandQueue
        .putWriteImage(projectionTex, false)
        .finish()
        .put2DRangeKernel(kernelFunction, 0, 0, globalSizeX, globalSizeY, localSizeX, localSizeY)
        .finish();
  }
  /**
   * Uploads the projection matrix of one projection to the OpenCL device.
   *
   * <p>The matrix returned by {@code computeP()} is flattened row by row into a float array,
   * copied into the (lazily created) read-only device buffer {@code projectionMatrix} and written
   * to the device with a blocking transfer.
   *
   * @param projectionNumber index of the projection whose matrix is uploaded
   */
  private synchronized void initProjectionMatrix(int projectionNumber) {
    SimpleMatrix matrix = getGeometry().getProjectionMatrix(projectionNumber).computeP();
    final int rows = matrix.getRows();
    final int cols = matrix.getCols();

    // Flatten in row-major order: element (r, c) ends up at r * cols + c.
    float[] flattened = new float[cols * rows];
    int next = 0;
    for (int r = 0; r < rows; r++) {
      for (int c = 0; c < cols; c++) {
        flattened[next++] = (float) matrix.getElement(r, c);
      }
    }

    // The device buffer is allocated once and reused for every later projection.
    if (projectionMatrix == null) {
      projectionMatrix = context.createFloatBuffer(flattened.length, Mem.READ_ONLY);
    }

    projectionMatrix.getBuffer().put(flattened);
    projectionMatrix.getBuffer().rewind();
    commandQueue.putWriteBuffer(projectionMatrix, true).finish();
  }
  /**
   * Drives the backprojection on the OpenCL device and copies the result into {@code
   * projectionVolume}.
   *
   * <p>While projections are pending, the method polls (sleeping {@code CONRAD.INVERSE_SPEEDUP}
   * between rounds) and either backprojects them directly or, in large volume mode, only collects
   * them. In large volume mode the volume is then reconstructed in {@code nSteps} sub-volumes of
   * {@code subVolumeZ} slices each: for every sub-volume the device buffer is cleared, all
   * projections are backprojected, and the result is read back and written into {@code
   * projectionVolume}. Voxels outside {@code interestedInVolume} (when set) are written as 0.
   * Finally the device resources are released via {@code unload()}.
   *
   * <p>If the thread is interrupted while waiting, processing stops, the interrupt flag is
   * restored for the caller, and the device resources are still released.
   *
   * @param motionfield per-projection offsets, indexed by projection number; each value is divided
   *     by the z voxel spacing and rounded before it is handed to the kernel
   */
  public void OpenCLRun(double[] motionfield) {
    try {
      while (projectionsAvailable.size() > 0) {
        Thread.sleep(CONRAD.INVERSE_SPEEDUP);
        if (showStatus) {
          float status = (float) (1.0 / projections.size());
          if (largeVolumeMode) {
            IJ.showStatus("Streaming Projections to OpenCL Buffer");
          } else {
            IJ.showStatus("Backprojecting with OpenCL");
          }
          IJ.showProgress(status);
        }
        if (!largeVolumeMode) {
          workOnProjectionData(motionfield);
        } else {
          checkProjectionData();
        }
      }

      if (largeVolumeMode) {
        // All projections have been collected. Reconstruct the sub-volumes one after another
        // and stitch them together in projectionVolume.
        double voxelSpacingX = getGeometry().getVoxelSpacingX();
        double voxelSpacingY = getGeometry().getVoxelSpacingY();
        double voxelSpacingZ = getGeometry().getVoxelSpacingZ();
        int originalReconDimZ = getGeometry().getReconDimensionZ();
        useVOImap = false;
        initialize(projections.get(0));
        double originalOffsetZ = offsetZ;
        int reconDimensionZ = subVolumeZ;
        int maxProjectionNumber = projections.size();
        // Progress is counted in 2 * maxProjectionNumber units per sub-volume: the first half
        // for backprojecting, the second half for fetching the result.
        float all = nSteps * maxProjectionNumber * 2;

        for (int n = 0; n < nSteps; n++) { // for each sub-volume
          // Clear the host volume and push the zeros to the device.
          Arrays.fill(h_volume, 0);
          volumePointer.getBuffer().rewind();
          volumePointer.getBuffer().put(h_volume);
          volumePointer.getBuffer().rewind();
          commandQueue.putWriteBuffer(volumePointer, true).finish();

          // Shift the z offset so the kernel addresses the n-th slab of the full volume.
          offsetZ = originalOffsetZ - (reconDimensionZ * voxelSpacingZ * n);
          for (int p = 0; p < maxProjectionNumber; p++) { // for all projections
            float currentStep = (n * maxProjectionNumber * 2) + p;
            if (showStatus) {
              IJ.showStatus("Backprojecting with OpenCL");
              IJ.showProgress(currentStep / all);
            }
            float respoffset = (float) Math.round(motionfield[p] / voxelSpacingZ);
            try {
              projectSingleProjection(p, reconDimensionZ, respoffset);
            } catch (Exception e) {
              // Best effort: a failing projection is reported and skipped.
              System.out.println("Backprojection of projection " + p + " was not successful.");
              e.printStackTrace();
            }
          }

          // Gather the sub-volume from the device.
          commandQueue.putReadBuffer(volumePointer, true).finish();
          volumePointer.getBuffer().rewind();
          volumePointer.getBuffer().get(h_volume);
          volumePointer.getBuffer().rewind();

          // Move the data into the output volume.
          if (projectionVolume != null) {
            int width = projectionVolume.getSize()[0];
            int height = projectionVolume.getSize()[1];
            for (int k = 0; k < reconDimensionZ; k++) {
              int index = (n * subVolumeZ) + k;
              if (showStatus) {
                float currentStep = (n * maxProjectionNumber * 2) + maxProjectionNumber + k;
                IJ.showStatus("Fetching Volume from OpenCL");
                IJ.showProgress(currentStep / all);
              }
              // The last sub-volume may extend past the real volume; skip the surplus slices.
              if (index >= originalReconDimZ) {
                continue;
              }
              double worldZ = (voxelSpacingZ * index) - originalOffsetZ;
              for (int j = 0; j < height; j++) {
                double worldY = (voxelSpacingY * j) - offsetY;
                for (int i = 0; i < width; i++) {
                  float value = h_volume[(((height * k) + j) * width) + i];
                  // Without a volume of interest every voxel is kept (this also covers the
                  // case "interestedInVolume == null" with large volume mode enabled).
                  boolean keep =
                      (interestedInVolume == null)
                          || interestedInVolume.contains(
                              (voxelSpacingX * i) - offsetX, worldY, worldZ);
                  projectionVolume.setAtIndex(i, j, index, keep ? value : 0);
                }
              }
            }
          }
        }
      }

    } catch (InterruptedException e) {
      // Restore the interrupt flag so that callers can observe the interruption.
      Thread.currentThread().interrupt();
      e.printStackTrace();
    }
    if (showStatus) IJ.showProgress(1.0);
    unload();
    if (debug) System.out.println("Unloaded");
  }
  /**
   * Fetches the reconstruction from the device (when not in large volume mode) and releases all
   * OpenCL resources held by this instance.
   *
   * <p>Does nothing unless {@code initialized} is set. In regular mode the device volume is read
   * back into {@code h_volume} and copied voxel by voxel into {@code projectionVolume}; if {@code
   * useVOImap} is set, voxels whose {@code voiMap} entry is {@code false} are written as 0. In
   * large volume mode nothing is copied here, because {@code OpenCLRun} has already written the
   * sub-volumes. Afterwards every device handle is released and nulled and {@code initialized} is
   * reset.
   */
  private synchronized void unload() {
    if (!initialized) {
      return;
    }

    if (projectionVolume == null) {
      System.out.println("Check ProjectionVolume. It seems null.");
    } else if (!largeVolumeMode) {
      // Read the finished volume back from the device.
      commandQueue.putReadBuffer(volumePointer, true).finish();
      volumePointer.getBuffer().rewind();
      volumePointer.getBuffer().get(h_volume);
      volumePointer.getBuffer().rewind();

      final int width = projectionVolume.getSize()[0];
      final int height = projectionVolume.getSize()[1];
      final int depth = projectionVolume.getSize()[2];
      final boolean maskWithVoi = this.useVOImap;
      for (int k = 0; k < depth; k++) {
        for (int j = 0; j < height; j++) {
          for (int i = 0; i < width; i++) {
            float value = h_volume[(((height * k) + j) * width) + i];
            // Voxels outside the volume-of-interest map are zeroed when masking is enabled.
            boolean keep = !maskWithVoi || voiMap[i][j][k];
            projectionVolume.setAtIndex(i, j, k, keep ? value : 0);
          }
        }
      }
    }

    h_volume = null;

    // Free memory on the device.
    commandQueue.release();

    if (projectionTex != null) projectionTex.release();
    if (projectionMatrix != null) projectionMatrix.release();
    if (volStride != null) volStride.release();
    if (projectionArray != null) projectionArray.release();
    if (volumePointer != null) volumePointer.release();

    kernelFunction.release();
    program.release();
    // Destroy the context last.
    context.release();

    // Drop all handles so that a later init() starts from a clean state.
    commandQueue = null;
    projectionArray = null;
    projectionMatrix = null;
    projectionTex = null;
    volStride = null;
    volumePointer = null;
    kernelFunction = null;
    program = null;
    context = null;

    initialized = false;
  }
  /**
   * Sets up the OpenCL context, command queue, program, kernel and device buffers for the
   * backprojection. Does nothing if this instance is already initialized.
   *
   * <p>The memory needed for the full volume plus one detector image (4 bytes per element) is
   * compared with the device's maximum allocation size. If it does not fit, large volume mode is
   * enabled and the volume is split along z into {@code nSteps} sub-volumes of {@code subVolumeZ}
   * slices.
   *
   * @throws IllegalStateException if the OpenCL device, command queue or program cannot be set
   *     up; the partially created resources are released before the exception is thrown
   */
  protected void init() {
    if (initialized) {
      return;
    }
    largeVolumeMode = false;

    int reconDimensionX = getGeometry().getReconDimensionX();
    int reconDimensionY = getGeometry().getReconDimensionY();
    int reconDimensionZ = getGeometry().getReconDimensionZ();
    projectionsAvailable = new ArrayList<Integer>();
    projectionsDone = new ArrayList<Integer>();

    // Initialize JOCL.
    context = OpenCLUtil.createContext();

    try {
      // get the fastest device
      device = context.getMaxFlopsDevice();
      // create the command queue
      commandQueue = device.createCommandQueue();

      // (re)build the program if there is none yet or it belongs to another context
      if (program == null || !program.getContext().equals(this.context)) {
        program =
            context
                .createProgram(
                    OpenCLCompensatedBackProjector.class.getResourceAsStream(
                        "compensatedBackprojectCL.cl"))
                .build();
      }

    } catch (Exception e) {
      if (commandQueue != null) commandQueue.release();
      if (kernelFunction != null) kernelFunction.release();
      if (program != null) program.release();
      // destroy context
      if (context != null) context.release();
      // Do not keep handles to released resources around.
      commandQueue = null;
      kernelFunction = null;
      program = null;
      context = null;
      // Continuing would only fail later on the released/missing resources, so fail here with
      // the real cause attached.
      throw new IllegalStateException("OpenCL initialization failed", e);
    }

    // Check the space on the device: whole volume plus one detector image, 4 bytes per float.
    long availableMemory = device.getMaxMemAllocSize();
    long requiredMemory =
        (long)
            (((((double) reconDimensionX) * reconDimensionY * ((double) reconDimensionZ) * 4)
                + (((double)
                        Configuration.getGlobalConfiguration().getGeometry().getDetectorHeight())
                    * Configuration.getGlobalConfiguration().getGeometry().getDetectorWidth()
                    * 4)));
    if (debug) {
      System.out.println("Total available Memory on OpenCL card:" + availableMemory);
      System.out.println("Required Memory on OpenCL card:" + requiredMemory);
    }
    if (requiredMemory > availableMemory) {
      nSteps = (int) OpenCLUtil.iDivUp(requiredMemory, availableMemory);
      if (debug) System.out.println("Switching to large volume mode with nSteps = " + nSteps);
      largeVolumeMode = true;
    }
    // TODO: print the device properties in debug mode (the former CUDA implementation did this
    // via JCudaDriver.cuDeviceGetProperties).

    // create the computing kernel
    kernelFunction = program.createCLKernel("backprojectKernel");

    // Create the host-side reconstruction volume (one sub-volume only in large volume mode).
    if (largeVolumeMode) {
      subVolumeZ = OpenCLUtil.iDivUp(reconDimensionZ, nSteps);
      if (debug) System.out.println("SubVolumeZ: " + subVolumeZ);
      h_volume = new float[reconDimensionX * reconDimensionY * subVolumeZ];
      // long arithmetic: the byte count can exceed the int range for big volumes
      long memorysize = ((long) reconDimensionX) * reconDimensionY * subVolumeZ * 4;
      if (debug) System.out.println("Memory: " + memorysize);
    } else {
      h_volume = new float[reconDimensionX * reconDimensionY * reconDimensionZ];
    }

    // Adapted volume size: x and y rounded up to the next multiple of the respective block size.
    int adaptedSizeX =
        ((reconDimensionX % bpBlockSize[0]) == 0)
            ? reconDimensionX
            : ((reconDimensionX / bpBlockSize[0]) + 1) * bpBlockSize[0];
    int adaptedSizeY =
        ((reconDimensionY % bpBlockSize[1]) == 0)
            ? reconDimensionY
            : ((reconDimensionY / bpBlockSize[1]) + 1) * bpBlockSize[1];

    // Volume strides handed to the kernel: row stride and slice stride of the adapted volume.
    int[] volStrideHost = {adaptedSizeX, adaptedSizeX * adaptedSizeY};

    // copy volume to device
    volumePointer = context.createFloatBuffer(h_volume.length, Mem.WRITE_ONLY);
    volumePointer.getBuffer().put(h_volume);
    volumePointer.getBuffer().rewind();

    // copy volume stride to device
    volStride = context.createIntBuffer(volStrideHost.length, Mem.READ_ONLY);
    volStride.getBuffer().put(volStrideHost);
    volStride.getBuffer().rewind();

    commandQueue.putWriteBuffer(volumePointer, true).putWriteBuffer(volStride, true).finish();

    initialized = true;
  }
  /**
   * Combines two images on the OpenCL device with the {@code addImages} kernel of {@code
   * exercise4.cl} (presumably the element-wise sum -- the kernel source is not visible here).
   *
   * <p>The kernel is run over the region common to both images (minimum width and height). The
   * result is a copy of {@code image1} in which that common region is overwritten with the
   * kernel output; for equally sized inputs this is the whole image.
   *
   * <p>The context, device, program and kernel are created lazily and cached in fields. If the
   * program source cannot be read, the stack trace is printed and the JVM exits with status -1.
   *
   * @param image1 first operand; also determines the size of the returned grid
   * @param image2 second operand
   * @return a new grid holding the kernel output inside the common region
   */
  public Grid2D add(OpenCLGrid2D image1, OpenCLGrid2D image2) {

    // create context
    if (context == null) {
      context = OpenCLUtil.getStaticContext();
    }
    // select device
    if (device == null) {
      device = context.getMaxFlopsDevice();
    }
    // The kernel only works on the region both images have in common.
    int width = Math.min(image1.getWidth(), image2.getWidth());
    int height = Math.min(image1.getHeight(), image2.getHeight());

    int imageSize = width * height;
    int localWorkSize = Math.min(device.getMaxWorkGroupSize(), 8);
    // global sizes are rounded up to the nearest multiple of localWorkSize
    int globalWorkSizeW = OpenCLUtil.roundUp(localWorkSize, width);
    int globalWorkSizeH = OpenCLUtil.roundUp(localWorkSize, height);

    // load sources, create and build program
    if (program == null) {
      try {
        program =
            context.createProgram(this.getClass().getResourceAsStream("exercise4.cl")).build();
      } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
      }
    }

    // Device buffer for the result; released in the finally block below.
    CLBuffer<FloatBuffer> output = context.createFloatBuffer(imageSize, Mem.WRITE_ONLY);
    CLCommandQueue queue = null;
    try {
      if (kernel == null) {
        kernel = program.createCLKernel("addImages");
      }
      queue = device.createCommandQueue();
      // make sure both inputs are present on the device
      image1.getDelegate().prepareForDeviceOperation();
      image2.getDelegate().prepareForDeviceOperation();

      // Start at argument index 0 even if an earlier call failed half way through binding.
      kernel.rewind();
      kernel
          .putArg(image1.getDelegate().getCLBuffer())
          .putArg(image2.getDelegate().getCLBuffer())
          .putArg(output)
          .putArg(width)
          .putArg(height);
      kernel.rewind();

      queue
          .put2DRangeKernel(
              kernel, 0, 0, globalWorkSizeW, globalWorkSizeH, localWorkSize, localWorkSize)
          .putBarrier()
          // copy the result from the device to the host
          .putReadBuffer(output, true)
          .finish();

      Grid2D result = new Grid2D(image1);
      FloatBuffer values = output.getBuffer();
      values.rewind();

      // The buffer holds exactly width * height values in row-major order, so the read-back
      // must use these dimensions (not the size of image1, which may be larger).
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; x++) {
          result.setAtIndex(x, y, values.get());
        }
      }
      return result;
    } finally {
      // Release the per-call device resources even if the computation failed.
      output.release();
      if (queue != null) {
        queue.release();
      }
    }
  }
  /**
   * Runs the {@code parallelBackProjection} kernel of {@code exercise4.cl} on the filtered
   * sinogram and copies the kernel output into the {@code image} field.
   *
   * <p>The program is rebuilt on every call. If the program source cannot be read, the stack
   * trace is printed and the JVM exits with status -1. The per-call output buffer and command
   * queue are released even when the computation fails.
   *
   * @param filteredSinogramm filtered sinogram, passed to the kernel as its first argument
   * @param widthPhantom width of the kernel's output grid
   * @param heightPhantom height of the kernel's output grid
   * @param worksize requested local work size per dimension (clamped to the device maximum)
   * @param detectorSpacing detector element spacing passed to the kernel
   * @param numberOfPixel number of detector pixels passed to the kernel
   * @param numberProjections number of projections passed to the kernel
   * @param scanAngle scan angle passed to the kernel (unit not visible here)
   * @param spacing spacing values; elements 0 and 1 are passed to the kernel
   * @param origin origin values; elements 0 and 1 are passed to the kernel
   * @return the {@code image} field after it has been filled with the kernel output
   */
  public Grid2D openCLBackprojection(
      OpenCLGrid2D filteredSinogramm,
      int widthPhantom,
      int heightPhantom,
      int worksize,
      float detectorSpacing,
      int numberOfPixel,
      int numberProjections,
      float scanAngle,
      double[] spacing,
      double[] origin) {
    // create context
    CLContext context = OpenCLUtil.getStaticContext();

    // select device
    CLDevice device = context.getMaxFlopsDevice();

    // Angular increment per projection and position of the first detector pixel
    // (the detector is centered around 0).
    double spacingAngle = (double) (scanAngle / numberProjections);
    double originDetector = -(detectorSpacing * numberOfPixel) / 2.0;

    // define local and global sizes
    int imageSize = widthPhantom * heightPhantom;
    int localWorkSize = Math.min(device.getMaxWorkGroupSize(), worksize);
    // global sizes are rounded up to the nearest multiple of localWorkSize
    int globalWorkSizeW = OpenCLUtil.roundUp(localWorkSize, widthPhantom);
    int globalWorkSizeH = OpenCLUtil.roundUp(localWorkSize, heightPhantom);

    // load sources, create and build program
    // NOTE(review): the program is rebuilt on every call and the previous one is not released
    // here, because a cached kernel may still belong to it -- consider caching the program.
    try {
      this.program =
          context.createProgram(this.getClass().getResourceAsStream("exercise4.cl")).build();
    } catch (IOException e) {
      e.printStackTrace();
      System.exit(-1);
    }

    // Device buffer for the result; released in the finally block below.
    CLBuffer<FloatBuffer> output = context.createFloatBuffer(imageSize, Mem.WRITE_ONLY);
    CLCommandQueue queue = null;
    try {
      // NOTE(review): the kernel is cached in a field and only created when that field is null;
      // if the field was populated elsewhere with a different kernel, that kernel would run
      // here -- confirm.
      if (kernel == null) {
        kernel = program.createCLKernel("parallelBackProjection");
      }

      queue = device.createCommandQueue();
      // make sure the sinogram is present on the device
      filteredSinogramm.getDelegate().prepareForDeviceOperation();

      // Start at argument index 0 even if an earlier call failed half way through binding.
      kernel.rewind();
      kernel
          .putArg(filteredSinogramm.getDelegate().getCLBuffer())
          .putArg(output)
          .putArg(numberProjections)
          .putArg(numberOfPixel)
          .putArg(scanAngle)
          .putArg(widthPhantom)
          .putArg(heightPhantom)
          .putArg(spacing[0])
          .putArg(spacing[1])
          .putArg(origin[0])
          .putArg(origin[1])
          .putArg(detectorSpacing)
          .putArg(spacingAngle)
          .putArg(originDetector)
          .putArg(0.d);

      kernel.rewind();

      queue
          .put2DRangeKernel(
              kernel, 0, 0, globalWorkSizeW, globalWorkSizeH, localWorkSize, localWorkSize)
          .putBarrier()
          .finish();
      // copy the result from the device to the host
      queue.putReadBuffer(output, true).finish();

      output.getBuffer().rewind();

      // Fill the image row by row from the result buffer.
      // NOTE(review): this assumes image has widthPhantom x heightPhantom pixels -- confirm.
      for (int i = 0; i < image.getSize()[1]; ++i) {
        for (int j = 0; j < image.getSize()[0]; j++) {
          image.setAtIndex(j, i, output.getBuffer().get());
        }
      }
    } finally {
      // Release the per-call device resources even if the computation failed.
      output.release();
      if (queue != null) {
        queue.release();
      }
    }

    return image;
  }