private synchronized void projectSingleProjection(int projectionNumber, int dimz) { // load projection matrix initProjectionMatrix(projectionNumber); // load projection Grid2D projection = (Grid2D) projections.get(projectionNumber).clone(); // Correct for constant part of distance weighting + For angular sampling double D = getGeometry().getSourceToDetectorDistance(); NumericPointwiseOperators.multiplyBy( projection, (float) (D * D * 2 * Math.PI / getGeometry().getNumProjectionMatrices())); initProjectionData(projection); if (!largeVolumeMode) { projections.remove(projectionNumber); } // backproject for each slice // CUDA Grids are only two dimensional! int[] zed = new int[1]; int reconDimensionZ = dimz; double voxelSpacingX = getGeometry().getVoxelSpacingX(); double voxelSpacingY = getGeometry().getVoxelSpacingY(); double voxelSpacingZ = getGeometry().getVoxelSpacingZ(); zed[0] = reconDimensionZ; Pointer dOut = Pointer.to(volumePointer); Pointer pWidth = Pointer.to(new int[] {(int) lineOffset}); Pointer pZOffset = Pointer.to(zed); float[] vsx = new float[] {(float) voxelSpacingX}; Pointer pvsx = Pointer.to(vsx); Pointer pvsy = Pointer.to(new float[] {(float) voxelSpacingY}); Pointer pvsz = Pointer.to(new float[] {(float) voxelSpacingZ}); Pointer pox = Pointer.to(new float[] {(float) offsetX}); Pointer poy = Pointer.to(new float[] {(float) offsetY}); Pointer poz = Pointer.to(new float[] {(float) offsetZ}); int offset = 0; // System.out.println(dimz + " " + zed[0] + " " + offsetZ + " " + voxelSpacingZ); offset = CUDAUtil.align(offset, Sizeof.POINTER); JCudaDriver.cuParamSetv(function, offset, dOut, Sizeof.POINTER); offset += Sizeof.POINTER; offset = CUDAUtil.align(offset, Sizeof.INT); JCudaDriver.cuParamSetv(function, offset, pWidth, Sizeof.INT); offset += Sizeof.INT; offset = CUDAUtil.align(offset, Sizeof.INT); JCudaDriver.cuParamSetv(function, offset, pZOffset, Sizeof.INT); offset += Sizeof.INT; offset = CUDAUtil.align(offset, Sizeof.FLOAT); JCudaDriver.cuParamSetv(function, offset, pvsx, Sizeof.FLOAT); offset += Sizeof.FLOAT; offset = CUDAUtil.align(offset, Sizeof.FLOAT); JCudaDriver.cuParamSetv(function, offset, pvsy, Sizeof.FLOAT); offset += Sizeof.FLOAT; offset = CUDAUtil.align(offset, Sizeof.FLOAT); JCudaDriver.cuParamSetv(function, offset, pvsz, Sizeof.FLOAT); offset += Sizeof.FLOAT; offset = CUDAUtil.align(offset, Sizeof.FLOAT); JCudaDriver.cuParamSetv(function, offset, pox, Sizeof.FLOAT); offset += Sizeof.FLOAT; offset = CUDAUtil.align(offset, Sizeof.FLOAT); JCudaDriver.cuParamSetv(function, offset, poy, Sizeof.FLOAT); offset += Sizeof.FLOAT; offset = CUDAUtil.align(offset, Sizeof.FLOAT); JCudaDriver.cuParamSetv(function, offset, poz, Sizeof.FLOAT); offset += Sizeof.FLOAT; JCudaDriver.cuParamSetSize(function, offset); // Call the CUDA kernel, writing the results into the volume which is pointed at JCudaDriver.cuFuncSetBlockShape(function, bpBlockSize[0], bpBlockSize[1], 1); JCudaDriver.cuLaunchGrid(function, gridSize.x, gridSize.y); JCudaDriver.cuCtxSynchronize(); }
protected void init() { if (!initialized) { largeVolumeMode = false; int reconDimensionX = getGeometry().getReconDimensionX(); int reconDimensionY = getGeometry().getReconDimensionY(); int reconDimensionZ = getGeometry().getReconDimensionZ(); projections = new ImageGridBuffer(); projectionsAvailable = new ArrayList<Integer>(); projectionsDone = new ArrayList<Integer>(); // Initialize the JCudaDriver. Note that this has to be done from // the same thread that will later use the JCudaDriver API. JCudaDriver.setExceptionsEnabled(true); JCudaDriver.cuInit(0); CUdevice dev = CUDAUtil.getBestDevice(); cuCtx = new CUcontext(); JCudaDriver.cuCtxCreate(cuCtx, 0, dev); // check space on device: int[] memory = new int[1]; int[] total = new int[1]; JCudaDriver.cuDeviceTotalMem(memory, dev); JCudaDriver.cuMemGetInfo(memory, total); int availableMemory = (int) (CUDAUtil.correctMemoryValue(memory[0]) / ((long) 1024 * 1024)); int requiredMemory = (int) (((((double) reconDimensionX) * reconDimensionY * ((double) reconDimensionZ) * Sizeof.FLOAT) + (((double) Configuration.getGlobalConfiguration() .getGeometry() .getDetectorHeight()) * Configuration.getGlobalConfiguration().getGeometry().getDetectorWidth() * Sizeof.FLOAT)) / (1024.0 * 1024)); if (debug) { System.out.println("Total available Memory on CUDA card:" + availableMemory); System.out.println("Required Memory on CUDA card:" + requiredMemory); } if (requiredMemory > availableMemory) { nSteps = CUDAUtil.iDivUp(requiredMemory, (int) (availableMemory)); if (debug) System.out.println("Switching to large volume mode with nSteps = " + nSteps); largeVolumeMode = true; } if (debug) { CUdevprop prop = new CUdevprop(); JCudaDriver.cuDeviceGetProperties(prop, dev); System.out.println(prop.toFormattedString()); } // Load the CUBIN file containing the kernel module = new CUmodule(); JCudaDriver.cuModuleLoad(module, "backprojectWithCuda.ptx"); // Obtain a function pointer to the kernel function. This function // will later be called. // function = new CUfunction(); JCudaDriver.cuModuleGetFunction(function, module, "_Z17backprojectKernelPfiiffffff"); // create the reconstruction volume; int memorysize = reconDimensionX * reconDimensionY * reconDimensionZ * Sizeof.FLOAT; if (largeVolumeMode) { subVolumeZ = CUDAUtil.iDivUp(reconDimensionZ, nSteps); if (debug) System.out.println("SubVolumeZ: " + subVolumeZ); h_volume = new float[reconDimensionX * reconDimensionY * subVolumeZ]; memorysize = reconDimensionX * reconDimensionY * subVolumeZ * Sizeof.FLOAT; if (debug) System.out.println("Memory: " + memorysize); } else { h_volume = new float[reconDimensionX * reconDimensionY * reconDimensionZ]; } // copy volume to device volumePointer = new CUdeviceptr(); JCudaDriver.cuMemAlloc(volumePointer, memorysize); JCudaDriver.cuMemcpyHtoD(volumePointer, Pointer.to(h_volume), memorysize); // compute adapted volume size // volume size in x = multiple of bpBlockSize[0] // volume size in y = multiple of bpBlockSize[1] int adaptedVolSize[] = new int[3]; if ((reconDimensionX % bpBlockSize[0]) == 0) { adaptedVolSize[0] = reconDimensionX; } else { adaptedVolSize[0] = ((reconDimensionX / bpBlockSize[0]) + 1) * bpBlockSize[0]; } if ((reconDimensionY % bpBlockSize[1]) == 0) { adaptedVolSize[1] = reconDimensionY; } else { adaptedVolSize[1] = ((reconDimensionY / bpBlockSize[1]) + 1) * bpBlockSize[1]; } adaptedVolSize[2] = reconDimensionZ; int volStrideHost[] = new int[2]; // compute volstride and copy it to constant memory volStrideHost[0] = adaptedVolSize[0]; volStrideHost[1] = adaptedVolSize[0] * adaptedVolSize[1]; volStride = new CUdeviceptr(); JCudaDriver.cuModuleGetGlobal(volStride, new int[1], module, "gVolStride"); JCudaDriver.cuMemcpyHtoD(volStride, Pointer.to(volStrideHost), Sizeof.INT * 2); // Calculate new grid size gridSize = new dim3( CUDAUtil.iDivUp(adaptedVolSize[0], bpBlockSize[0]), CUDAUtil.iDivUp(adaptedVolSize[1], bpBlockSize[1]), adaptedVolSize[2]); // Obtain the global pointer to the view matrix from // the module projectionMatrix = new CUdeviceptr(); JCudaDriver.cuModuleGetGlobal(projectionMatrix, new int[1], module, "gProjMatrix"); initialized = true; } }