/**
   * Runs the backprojection kernel for a single projection.
   *
   * <p>The projection matrix and the projection image for {@code projectionNumber} are set up
   * first, then the kernel arguments are written and the kernel is launched over the x/y extent of
   * the reconstruction. The method returns only after the command queue has finished.
   *
   * @param projectionNumber index of the projection inside {@code projections}
   * @param dimz z extent handed to the kernel
   * @param respoffset value handed to the kernel unchanged as its second argument
   */
private synchronized void projectSingleProjection(
      int projectionNumber, int dimz, float respoffset) {
    // Stage the projection matrix and the projection image for this view. The projection
    // stays in 'projections'; its removal outside large volume mode is disabled.
    initProjectionMatrix(projectionNumber);
    initProjectionData(projections.get(projectionNumber));

    // OpenCL grids are only two dimensional, so the z extent travels as a plain argument.
    final float spacingX = (float) getGeometry().getVoxelSpacingX();
    final float spacingY = (float) getGeometry().getVoxelSpacingY();
    final float spacingZ = (float) getGeometry().getVoxelSpacingZ();

    // Write the kernel parameters starting at argument index 0. The order below is the
    // order in which the arguments are handed to the kernel and must not be changed.
    kernelFunction.rewind();
    kernelFunction.putArg(volumePointer);
    kernelFunction.putArg(respoffset);
    kernelFunction.putArg((int) lineOffset);
    kernelFunction.putArg(dimz);
    kernelFunction.putArg(spacingX);
    kernelFunction.putArg(spacingY);
    kernelFunction.putArg(spacingZ);
    kernelFunction.putArg((float) offsetX);
    kernelFunction.putArg((float) offsetY);
    kernelFunction.putArg((float) offsetZ);
    kernelFunction.putArg(projectionTex);
    kernelFunction.putArg(volStride);
    kernelFunction.putArg(projectionMatrix);

    // Local size: the configured block size, capped per dimension by the device limit.
    final int maxGroupSize = device.getMaxWorkGroupSize();
    final int localX = Math.min(maxGroupSize, bpBlockSize[0]);
    final int localY = Math.min(maxGroupSize, bpBlockSize[1]);

    // Global size: the plain x/y reconstruction extent. It is NOT rounded up to a multiple
    // of the local size in this method.
    final int globalX = getGeometry().getReconDimensionX();
    final int globalY = getGeometry().getReconDimensionY();

    // Upload the projection (non-blocking write, then wait), run the kernel so that it
    // writes into the volume buffer, and wait for it to complete.
    commandQueue.putWriteImage(projectionTex, false);
    commandQueue.finish();
    commandQueue.put2DRangeKernel(kernelFunction, 0, 0, globalX, globalY, localX, localY);
    commandQueue.finish();
  }
  /**
   * Adds two images on the OpenCL device with the {@code addImages} kernel from
   * {@code exercise4.cl}.
   *
   * <p>Only the region common to both inputs (minimum width and minimum height) is handed to the
   * kernel. The result grid is constructed from {@code image1} and the common region is then
   * overwritten with the kernel output.
   *
   * @param image1 first operand; also the template for the returned grid
   * @param image2 second operand
   * @return a new grid holding the kernel output in the common region
   */
  public Grid2D add(OpenCLGrid2D image1, OpenCLGrid2D image2) {

    // create context
    if (context == null) {
      context = OpenCLUtil.getStaticContext();
    }
    // select device
    if (device == null) {
      device = context.getMaxFlopsDevice();
    }
    // define local and global sizes
    int width = Math.min(image1.getWidth(), image2.getWidth());
    int height = Math.min(image1.getHeight(), image2.getHeight());

    int imageSize = width * height;
    int localWorkSize = Math.min(device.getMaxWorkGroupSize(), 8);
    // rounded up to the nearest multiple of localWorkSize
    int globalWorkSizeW = OpenCLUtil.roundUp(localWorkSize, width);
    int globalWorkSizeH = OpenCLUtil.roundUp(localWorkSize, height);

    // load sources, create and build program
    if (program == null) {
      try {
        program =
            context.createProgram(this.getClass().getResourceAsStream("exercise4.cl")).build();
      } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
      }
    }

    // create output image
    CLBuffer<FloatBuffer> output = context.createFloatBuffer(imageSize, Mem.WRITE_ONLY);
    CLKernel addKernel = null;
    CLCommandQueue queue = null;
    try {
      // Use a kernel owned by this call. The shared 'kernel' field is only created when it is
      // null, so it may hold a different kernel that another method created earlier.
      addKernel = program.createCLKernel("addImages");

      queue = device.createCommandQueue();
      image1.getDelegate().prepareForDeviceOperation();
      image2.getDelegate().prepareForDeviceOperation();

      // A freshly created kernel starts at argument index 0.
      addKernel
          .putArg(image1.getDelegate().getCLBuffer())
          .putArg(image2.getDelegate().getCLBuffer())
          .putArg(output)
          .putArg(width)
          .putArg(height);

      queue
          .put2DRangeKernel(
              addKernel, 0, 0, globalWorkSizeW, globalWorkSizeH, localWorkSize, localWorkSize)
          .putBarrier()
          // put memory from graphic card to host
          .putReadBuffer(output, true)
          .finish();

      Grid2D result = new Grid2D(image1);
      FloatBuffer sums = output.getBuffer();
      sums.rewind();

      // The buffer holds exactly width * height values, so iterate over the common region and
      // not over the size of image1, which may be larger.
      // NOTE(review): assumes the kernel stores its output row by row with a row length of
      // 'width' - confirm against exercise4.cl.
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
          result.setAtIndex(x, y, sums.get());
        }
      }
      return result;
    } finally {
      // Release the per-call device resources on failure as well.
      if (queue != null) {
        queue.release();
      }
      if (addKernel != null) {
        addKernel.release();
      }
      output.release();
    }
  }
  /**
   * Sets up the OpenCL resources of this back projector once; later calls return immediately.
   *
   * <p>Creates the context, picks the device with the most flops, creates the command queue,
   * builds {@code compensatedBackprojectCL.cl} (unless a program for the current context already
   * exists), decides whether the volume has to be processed in several z blocks (large volume
   * mode), and allocates and uploads the volume buffer and the volume stride buffer.
   *
   * @throws IllegalStateException if the device, the command queue or the program cannot be
   *     created; everything acquired so far is released before the exception is thrown
   */
  protected void init() {
    if (initialized) {
      return;
    }
    largeVolumeMode = false;

    int reconDimensionX = getGeometry().getReconDimensionX();
    int reconDimensionY = getGeometry().getReconDimensionY();
    int reconDimensionZ = getGeometry().getReconDimensionZ();
    projectionsAvailable = new ArrayList<Integer>();
    projectionsDone = new ArrayList<Integer>();

    // Initialize JOCL.
    context = OpenCLUtil.createContext();

    try {
      // get the fastest device
      device = context.getMaxFlopsDevice();
      // create the command queue
      commandQueue = device.createCommandQueue();

      // initialize the program
      if (program == null || !program.getContext().equals(this.context)) {
        program =
            context
                .createProgram(
                    OpenCLCompensatedBackProjector.class.getResourceAsStream(
                        "compensatedBackprojectCL.cl"))
                .build();
      }
    } catch (Exception e) {
      // Release whatever exists and drop the references so that nothing released is reused.
      if (commandQueue != null) {
        commandQueue.release();
        commandQueue = null;
      }
      if (kernelFunction != null) {
        kernelFunction.release();
        kernelFunction = null;
      }
      if (program != null) {
        program.release();
        program = null;
      }
      // destroy context
      if (context != null) {
        context.release();
        context = null;
      }
      // Continuing would only fail later on the released or missing objects, so report the
      // root cause right here.
      throw new IllegalStateException("Could not initialize the OpenCL back projector", e);
    }

    // check space on device:
    long availableMemory = device.getMaxMemAllocSize();
    // 4 bytes per volume voxel plus 4 bytes per detector pixel
    long requiredMemory =
        (long)
            (((((double) reconDimensionX) * reconDimensionY * ((double) reconDimensionZ) * 4)
                + (((double)
                        Configuration.getGlobalConfiguration().getGeometry().getDetectorHeight())
                    * Configuration.getGlobalConfiguration().getGeometry().getDetectorWidth()
                    * 4)));
    if (debug) {
      System.out.println("Total available Memory on OpenCL card:" + availableMemory);
      System.out.println("Required Memory on OpenCL card:" + requiredMemory);
    }
    if (requiredMemory > availableMemory) {
      nSteps = (int) OpenCLUtil.iDivUp(requiredMemory, availableMemory);
      if (debug) System.out.println("Switching to large volume mode with nSteps = " + nSteps);
      largeVolumeMode = true;
    }

    // create the computing kernel
    kernelFunction = program.createCLKernel("backprojectKernel");

    // create the reconstruction volume;
    if (largeVolumeMode) {
      subVolumeZ = OpenCLUtil.iDivUp(reconDimensionZ, nSteps);
      if (debug) System.out.println("SubVolumeZ: " + subVolumeZ);
      h_volume = new float[reconDimensionX * reconDimensionY * subVolumeZ];
      if (debug) {
        // computed in long so that the printed byte count cannot overflow
        long memorysize = 4L * reconDimensionX * reconDimensionY * subVolumeZ;
        System.out.println("Memory: " + memorysize);
      }
    } else {
      h_volume = new float[reconDimensionX * reconDimensionY * reconDimensionZ];
    }

    // compute adapted volume size
    //    volume size in x = multiple of bpBlockSize[0]
    //    volume size in y = multiple of bpBlockSize[1]
    int adaptedSizeX = reconDimensionX;
    if ((reconDimensionX % bpBlockSize[0]) != 0) {
      adaptedSizeX = ((reconDimensionX / bpBlockSize[0]) + 1) * bpBlockSize[0];
    }
    int adaptedSizeY = reconDimensionY;
    if ((reconDimensionY % bpBlockSize[1]) != 0) {
      adaptedSizeY = ((reconDimensionY / bpBlockSize[1]) + 1) * bpBlockSize[1];
    }
    // NOTE(review): the strides use the adapted (padded) sizes while h_volume is allocated with
    // the unpadded sizes; the two differ when a dimension is not a multiple of the block size.
    // Confirm how the kernel indexes the volume.
    int[] volStrideHost = {adaptedSizeX, adaptedSizeX * adaptedSizeY};

    // copy volume to device
    // READ_WRITE instead of WRITE_ONLY: presumably the kernel adds each projection onto the
    // values already stored in the volume (TODO confirm against the kernel source). READ_WRITE
    // grants a superset of the kernel access rights, so it is valid in either case.
    volumePointer = context.createFloatBuffer(h_volume.length, Mem.READ_WRITE);
    volumePointer.getBuffer().put(h_volume);
    volumePointer.getBuffer().rewind();

    // copy volume stride to device
    volStride = context.createIntBuffer(volStrideHost.length, Mem.READ_ONLY);
    volStride.getBuffer().put(volStrideHost);
    volStride.getBuffer().rewind();

    commandQueue.putWriteBuffer(volumePointer, true).putWriteBuffer(volStride, true).finish();

    initialized = true;
  }
  /**
   * Runs the {@code parallelBackProjection} kernel from {@code exercise4.cl} on the filtered
   * sinogram and copies the kernel output into the {@code image} field of this object.
   *
   * @param filteredSinogramm sinogram whose device buffer is the first kernel argument
   * @param widthPhantom width of the output; the output buffer holds
   *     {@code widthPhantom * heightPhantom} floats
   * @param heightPhantom height of the output
   * @param worksize requested local work size per dimension, capped by the device maximum
   * @param detectorSpacing passed to the kernel; also used for the detector origin
   * @param numberOfPixel passed to the kernel; also used for the detector origin
   * @param numberProjections passed to the kernel; also used for the angular spacing
   * @param scanAngle passed to the kernel; also used for the angular spacing
   * @param spacing elements 0 and 1 are passed to the kernel
   * @param origin elements 0 and 1 are passed to the kernel
   * @return the {@code image} field after it has been filled from the output buffer
   */
  public Grid2D openCLBackprojection(
      OpenCLGrid2D filteredSinogramm,
      int widthPhantom,
      int heightPhantom,
      int worksize,
      float detectorSpacing,
      int numberOfPixel,
      int numberProjections,
      float scanAngle,
      double[] spacing,
      double[] origin) {
    // create context
    CLContext context = OpenCLUtil.getStaticContext();

    // select device
    CLDevice device = context.getMaxFlopsDevice();

    // derived kernel parameters (the division and the product are evaluated in float)
    double spacingAngle = (double) (scanAngle / numberProjections);
    double originDetector = -(detectorSpacing * numberOfPixel) / 2.0;

    // define local and global sizes
    int imageSize = widthPhantom * heightPhantom;
    int localWorkSize = Math.min(device.getMaxWorkGroupSize(), worksize);
    // rounded up to the nearest multiple of localWorkSize
    int globalWorkSizeW = OpenCLUtil.roundUp(localWorkSize, widthPhantom);
    int globalWorkSizeH = OpenCLUtil.roundUp(localWorkSize, heightPhantom);

    // Build the program only when there is none for this context yet. Building it on every
    // call is slow and leaves the previously built program unreleased.
    // NOTE(review): assumes every program stored in 'program' is built from exercise4.cl.
    if (program == null || !program.getContext().equals(context)) {
      try {
        this.program =
            context.createProgram(this.getClass().getResourceAsStream("exercise4.cl")).build();
      } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
      }
    }

    // create output image
    CLBuffer<FloatBuffer> output = context.createFloatBuffer(imageSize, Mem.WRITE_ONLY);
    CLKernel backprojectionKernel = null;
    CLCommandQueue queue = null;
    try {
      // Use a kernel owned by this call. The shared 'kernel' field is only created when it is
      // null, so it may hold a different kernel that another method created earlier.
      backprojectionKernel = program.createCLKernel("parallelBackProjection");

      queue = device.createCommandQueue();
      filteredSinogramm.getDelegate().prepareForDeviceOperation();

      // A freshly created kernel starts at argument index 0.
      backprojectionKernel
          .putArg(filteredSinogramm.getDelegate().getCLBuffer())
          .putArg(output)
          .putArg(numberProjections)
          .putArg(numberOfPixel)
          .putArg(scanAngle)
          .putArg(widthPhantom)
          .putArg(heightPhantom)
          .putArg(spacing[0])
          .putArg(spacing[1])
          .putArg(origin[0])
          .putArg(origin[1])
          .putArg(detectorSpacing)
          .putArg(spacingAngle)
          .putArg(originDetector)
          .putArg(0.d);

      queue
          .put2DRangeKernel(
              backprojectionKernel,
              0,
              0,
              globalWorkSizeW,
              globalWorkSizeH,
              localWorkSize,
              localWorkSize)
          .putBarrier()
          .finish();
      // put memory from graphic card to host
      queue.putReadBuffer(output, true).finish();

      FloatBuffer values = output.getBuffer();
      values.rewind();

      // NOTE(review): the loop bounds come from 'image' while the buffer holds
      // widthPhantom * heightPhantom values - presumably both describe the same size; confirm.
      for (int i = 0; i < image.getSize()[1]; ++i) {
        for (int j = 0; j < image.getSize()[0]; j++) {
          image.setAtIndex(j, i, values.get());
        }
      }
      return image;
    } finally {
      // Release the per-call device resources on failure as well.
      if (queue != null) {
        queue.release();
      }
      if (backprojectionKernel != null) {
        backprojectionKernel.release();
      }
      output.release();
    }
  }