public static void conv2d( MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { cudnnTensorDescriptor srcTensorDesc = null; cudnnTensorDescriptor dstTensorDesc = null; cudnnFilterDescriptor filterDesc = null; cudnnConvolutionDescriptor convDesc = null; Pointer workSpace = null; long sizeInBytes = 0; Pointer alpha = null; Pointer beta = null; try { // Allocate descriptors srcTensorDesc = allocateTensorDescriptor(N, C, H, W); dstTensorDesc = allocateTensorDescriptor(N, K, P, Q); filterDesc = allocateFilterDescriptor(K, C, R, S); // Allocate data // (Pointer) gpuCtx.prepare(image, true, true); // (Pointer) gpuCtx.prepare(filter, true, true); Pointer imagePointer = ((JCudaObject) image._gpuHandle).jcudaPointer; Pointer filterPointer = ((JCudaObject) filter._gpuHandle).jcudaPointer; Pointer dstPointer = ((JCudaObject) outputBlock._gpuHandle).jcudaPointer; int padding[] = {pad_h, pad_w}; int strides[] = {stride_h, stride_w}; convDesc = allocateConvolutionDescriptor(padding, strides); // Select the best algorithm depending on the data and supported CUDA int algo = -1; workSpace = new Pointer(); if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE) { algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } else if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST) { int[] algos = { jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_GEMM, jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM }; // TODO: Look into FFt, Winograd, etc // Also ensure that GPU has enough memory to allocate memory long sizeInBytesArray[] = {0}; algo = jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm( 
cudnnHandle, srcTensorDesc, filterDesc, convDesc, dstTensorDesc, CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos); cudnnGetConvolutionForwardWorkspaceSize( cudnnHandle, srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algo, sizeInBytesArray); if (sizeInBytesArray[0] != 0) jcuda.runtime.JCuda.cudaMalloc(workSpace, sizeInBytesArray[0]); sizeInBytes = sizeInBytesArray[0]; } else if (CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT) { throw new DMLRuntimeException( "CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT is not implemented"); } else { throw new DMLRuntimeException("Unsupported preference criteria for convolution"); } alpha = pointerTo(1.0); beta = pointerTo(0.0f); int status = cudnnConvolutionForward( cudnnHandle, alpha, srcTensorDesc, imagePointer, filterDesc, filterPointer, convDesc, algo, workSpace, sizeInBytes, beta, dstTensorDesc, dstPointer); if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException( "Could not executed cudnnConvolutionForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); } } finally { if (alpha != null) cudaFree(alpha); if (beta != null) cudaFree(beta); if (srcTensorDesc != null) cudnnDestroyTensorDescriptor(srcTensorDesc); if (dstTensorDesc != null) cudnnDestroyTensorDescriptor(dstTensorDesc); if (filterDesc != null) cudnnDestroyFilterDescriptor(filterDesc); if (convDesc != null) cudnnDestroyConvolutionDescriptor(convDesc); if (workSpace != null && sizeInBytes != 0) cudaFree(workSpace); } }
/**
 * Resets this object's GPU-related state.
 *
 * Drops the host-side reference to the projection array (presumably cached
 * projection data — TODO confirm) so it becomes eligible for garbage
 * collection, then tears down the CUDA context for the calling host thread.
 */
public void reset() {
    this.projectionArray = null;
    // Explicitly destroys all CUDA resources associated with the current thread.
    JCuda.cudaThreadExit();
}