/**
   * Backprojects one projection image into the device volume by configuring and launching the
   * CUDA kernel held in {@code function}, then blocks until the context is synchronized.
   *
   * <p>The projection is cloned, scaled by a constant weight, and handed to {@code
   * initProjectionData}. Outside of large volume mode the entry is removed from {@code
   * projections} afterwards.
   *
   * @param projectionNumber index of the projection (and of its projection matrix) to process
   * @param dimz value passed to the kernel as its third argument (the z parameter)
   */
private synchronized void projectSingleProjection(int projectionNumber, int dimz) {
    // Make the projection matrix for this view available first.
    initProjectionMatrix(projectionNumber);

    // Work on a copy so the buffered projection itself is not scaled.
    Grid2D projection = (Grid2D) projections.get(projectionNumber).clone();

    // Correct for the constant part of the distance weighting and for the angular sampling.
    double sourceToDetector = getGeometry().getSourceToDetectorDistance();
    float weight =
        (float)
            (sourceToDetector
                * sourceToDetector
                * 2
                * Math.PI
                / getGeometry().getNumProjectionMatrices());
    NumericPointwiseOperators.multiplyBy(projection, weight);

    initProjectionData(projection);
    if (!largeVolumeMode) {
      projections.remove(projectionNumber);
    }

    // Backproject for each slice. CUDA grids are only two dimensional, which is why the
    // z extent travels as an ordinary kernel argument.
    double spacingX = getGeometry().getVoxelSpacingX();
    double spacingY = getGeometry().getVoxelSpacingY();
    double spacingZ = getGeometry().getVoxelSpacingZ();

    // Kernel arguments in the order they are written into the parameter space.
    Pointer[] kernelArgs = {
      Pointer.to(volumePointer),
      Pointer.to(new int[] {(int) lineOffset}),
      Pointer.to(new int[] {dimz}),
      Pointer.to(new float[] {(float) spacingX}),
      Pointer.to(new float[] {(float) spacingY}),
      Pointer.to(new float[] {(float) spacingZ}),
      Pointer.to(new float[] {(float) offsetX}),
      Pointer.to(new float[] {(float) offsetY}),
      Pointer.to(new float[] {(float) offsetZ})
    };
    // Byte size of each argument above, index for index.
    int[] argSizes = {
      Sizeof.POINTER,
      Sizeof.INT,
      Sizeof.INT,
      Sizeof.FLOAT,
      Sizeof.FLOAT,
      Sizeof.FLOAT,
      Sizeof.FLOAT,
      Sizeof.FLOAT,
      Sizeof.FLOAT
    };

    // Align, write, advance: once per argument.
    int offset = 0;
    for (int i = 0; i < kernelArgs.length; i++) {
      offset = CUDAUtil.align(offset, argSizes[i]);
      JCudaDriver.cuParamSetv(function, offset, kernelArgs[i], argSizes[i]);
      offset += argSizes[i];
    }
    JCudaDriver.cuParamSetSize(function, offset);

    // Launch the kernel; it writes its results into the volume that volumePointer refers to.
    JCudaDriver.cuFuncSetBlockShape(function, bpBlockSize[0], bpBlockSize[1], 1);
    JCudaDriver.cuLaunchGrid(function, gridSize.x, gridSize.y);
    JCudaDriver.cuCtxSynchronize();
  }
  /**
   * Sets up the CUDA side of the backprojector once; later calls return immediately.
   *
   * <p>Creates the projection buffers and a CUDA context, compares the estimated memory demand
   * against the memory reported by the device, switches to large volume mode if the estimate
   * exceeds it, loads the kernel module, allocates and uploads the (sub-)volume, writes the
   * volume strides to the module global {@code gVolStride}, derives the launch grid and looks up
   * the module global {@code gProjMatrix}.
   */
  protected void init() {
    if (initialized) {
      return;
    }
    largeVolumeMode = false;

    int reconDimensionX = getGeometry().getReconDimensionX();
    int reconDimensionY = getGeometry().getReconDimensionY();
    int reconDimensionZ = getGeometry().getReconDimensionZ();
    projections = new ImageGridBuffer();
    projectionsAvailable = new ArrayList<Integer>();
    projectionsDone = new ArrayList<Integer>();

    // Initialize the JCudaDriver. Note that this has to be done from
    // the same thread that will later use the JCudaDriver API.
    JCudaDriver.setExceptionsEnabled(true);
    JCudaDriver.cuInit(0);
    CUdevice dev = CUDAUtil.getBestDevice();
    cuCtx = new CUcontext();
    JCudaDriver.cuCtxCreate(cuCtx, 0, dev);

    // Check the space on the device. memory[0] is written by both calls below; the value
    // left by cuMemGetInfo is the one that is used.
    int[] memory = new int[1];
    int[] total = new int[1];
    JCudaDriver.cuDeviceTotalMem(memory, dev);
    JCudaDriver.cuMemGetInfo(memory, total);
    int availableMemory = (int) (CUDAUtil.correctMemoryValue(memory[0]) / ((long) 1024 * 1024));

    // Estimated demand in MiB: the full float volume plus one float detector image.
    double volumeBytes =
        ((double) reconDimensionX) * reconDimensionY * ((double) reconDimensionZ) * Sizeof.FLOAT;
    double detectorBytes =
        ((double) Configuration.getGlobalConfiguration().getGeometry().getDetectorHeight())
            * Configuration.getGlobalConfiguration().getGeometry().getDetectorWidth()
            * Sizeof.FLOAT;
    int requiredMemory = (int) ((volumeBytes + detectorBytes) / (1024.0 * 1024));

    if (debug) {
      System.out.println("Total available Memory on CUDA card:" + availableMemory);
      System.out.println("Required Memory on CUDA card:" + requiredMemory);
    }
    if (requiredMemory > availableMemory) {
      // Not enough room for the whole volume: process it in nSteps pieces.
      nSteps = CUDAUtil.iDivUp(requiredMemory, availableMemory);
      if (debug) {
        System.out.println("Switching to large volume mode with nSteps = " + nSteps);
      }
      largeVolumeMode = true;
    }
    if (debug) {
      CUdevprop prop = new CUdevprop();
      JCudaDriver.cuDeviceGetProperties(prop, dev);
      System.out.println(prop.toFormattedString());
    }

    // Load the module file containing the kernel.
    module = new CUmodule();
    JCudaDriver.cuModuleLoad(module, "backprojectWithCuda.ptx");

    // Obtain a function pointer to the kernel function. This function
    // will later be called.
    function = new CUfunction();
    JCudaDriver.cuModuleGetFunction(function, module, "_Z17backprojectKernelPfiiffffff");

    // Create the reconstruction volume: all slices normally, one sub-volume in large volume mode.
    final int sliceCount;
    if (largeVolumeMode) {
      subVolumeZ = CUDAUtil.iDivUp(reconDimensionZ, nSteps);
      if (debug) {
        System.out.println("SubVolumeZ: " + subVolumeZ);
      }
      sliceCount = subVolumeZ;
    } else {
      sliceCount = reconDimensionZ;
    }
    // NOTE(review): these products are plain int arithmetic and could wrap for very large
    // volumes - confirm the expected size range.
    int voxelCount = reconDimensionX * reconDimensionY * sliceCount;
    h_volume = new float[voxelCount];
    int memorysize = voxelCount * Sizeof.FLOAT;
    if (largeVolumeMode && debug) {
      System.out.println("Memory: " + memorysize);
    }

    // Copy the volume to the device.
    volumePointer = new CUdeviceptr();
    JCudaDriver.cuMemAlloc(volumePointer, memorysize);
    JCudaDriver.cuMemcpyHtoD(volumePointer, Pointer.to(h_volume), memorysize);

    // Compute the adapted volume size:
    //    size in x = next multiple of bpBlockSize[0]
    //    size in y = next multiple of bpBlockSize[1]
    int paddedX =
        (reconDimensionX % bpBlockSize[0]) == 0
            ? reconDimensionX
            : ((reconDimensionX / bpBlockSize[0]) + 1) * bpBlockSize[0];
    int paddedY =
        (reconDimensionY % bpBlockSize[1]) == 0
            ? reconDimensionY
            : ((reconDimensionY / bpBlockSize[1]) + 1) * bpBlockSize[1];

    // Compute the volume strides and copy them to the module global.
    int[] volStrideHost = {paddedX, paddedX * paddedY};
    volStride = new CUdeviceptr();
    JCudaDriver.cuModuleGetGlobal(volStride, new int[1], module, "gVolStride");
    JCudaDriver.cuMemcpyHtoD(volStride, Pointer.to(volStrideHost), Sizeof.INT * 2);

    // Calculate the new grid size.
    gridSize =
        new dim3(
            CUDAUtil.iDivUp(paddedX, bpBlockSize[0]),
            CUDAUtil.iDivUp(paddedY, bpBlockSize[1]),
            reconDimensionZ);

    // Obtain the global pointer to the view matrix from the module.
    projectionMatrix = new CUdeviceptr();
    JCudaDriver.cuModuleGetGlobal(projectionMatrix, new int[1], module, "gProjMatrix");

    initialized = true;
  }