/**
 * Runs a cuDNN forward convolution of {@code image} with {@code filter} and writes the result
 * into {@code outputBlock}.
 *
 * <p>The device pointers of all three matrix objects are read directly from their
 * {@code _gpuHandle}; this method does not prepare or upload them, so callers must have done
 * that beforehand.
 *
 * <p>Descriptor layout used below: source tensor {@code (N, C, H, W)}, filter
 * {@code (K, C, R, S)}, destination tensor {@code (N, K, P, Q)}.
 *
 * @param image input matrix object (must already carry a JCuda GPU handle)
 * @param filter filter matrix object (must already carry a JCuda GPU handle)
 * @param outputBlock destination matrix object (must already carry a JCuda GPU handle)
 * @param N first dimension of the source and destination tensors (presumably batch size)
 * @param C second dimension of the source tensor and of the filter (presumably input channels)
 * @param H third dimension of the source tensor (presumably input height)
 * @param W fourth dimension of the source tensor (presumably input width)
 * @param K first dimension of the filter, second of the destination (presumably filter count)
 * @param R third dimension of the filter (presumably filter height)
 * @param S fourth dimension of the filter (presumably filter width)
 * @param pad_h first entry of the padding array handed to the convolution descriptor
 * @param pad_w second entry of the padding array handed to the convolution descriptor
 * @param stride_h first entry of the stride array handed to the convolution descriptor
 * @param stride_w second entry of the stride array handed to the convolution descriptor
 * @param P third dimension of the destination tensor (presumably output height)
 * @param Q fourth dimension of the destination tensor (presumably output width)
 * @throws DMLRuntimeException if the configured convolution preference is not supported, if
 *     cuDNN cannot select a forward algorithm, or if {@code cudnnConvolutionForward} does not
 *     return {@code CUDNN_STATUS_SUCCESS}
 */
public static void conv2d(
      MatrixObject image,
      MatrixObject filter,
      MatrixObject outputBlock,
      int N,
      int C,
      int H,
      int W,
      int K,
      int R,
      int S,
      int pad_h,
      int pad_w,
      int stride_h,
      int stride_w,
      int P,
      int Q)
      throws DMLRuntimeException {
    cudnnTensorDescriptor srcTensorDesc = null;
    cudnnTensorDescriptor dstTensorDesc = null;
    cudnnFilterDescriptor filterDesc = null;
    cudnnConvolutionDescriptor convDesc = null;
    Pointer workSpace = null;
    long sizeInBytes = 0;
    Pointer alpha = null;
    Pointer beta = null;
    try {
      // Allocate descriptors
      srcTensorDesc = allocateTensorDescriptor(N, C, H, W);
      dstTensorDesc = allocateTensorDescriptor(N, K, P, Q);
      filterDesc = allocateFilterDescriptor(K, C, R, S);

      // The operands are not prepared here; only their existing device pointers are used.
      // (Pointer) gpuCtx.prepare(image, true, true);
      // (Pointer) gpuCtx.prepare(filter, true, true);

      Pointer imagePointer = ((JCudaObject) image._gpuHandle).jcudaPointer;
      Pointer filterPointer = ((JCudaObject) filter._gpuHandle).jcudaPointer;
      Pointer dstPointer = ((JCudaObject) outputBlock._gpuHandle).jcudaPointer;

      int padding[] = {pad_h, pad_w};
      int strides[] = {stride_h, stride_w};
      convDesc = allocateConvolutionDescriptor(padding, strides);

      // Select the best algorithm depending on the data and supported CUDA

      int algo = -1;
      workSpace = new Pointer();

      if (CONVOLUTION_PREFERENCE
          == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE) {
        algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      } else if (CONVOLUTION_PREFERENCE
          == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST) {
        // TODO: Look into FFt, Winograd, etc
        // Also ensure that GPU has enough memory to allocate memory

        // cudnnGetConvolutionForwardAlgorithm RETURNS a status code and writes the chosen
        // algorithm into the int[] output argument; the two must not be confused.
        int[] selectedAlgo = {-1};
        long sizeInBytesArray[] = {0};
        int selectionStatus =
            jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(
                cudnnHandle,
                srcTensorDesc,
                filterDesc,
                convDesc,
                dstTensorDesc,
                CONVOLUTION_PREFERENCE,
                sizeInBytesArray[0],
                selectedAlgo);
        if (selectionStatus != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
          throw new DMLRuntimeException(
              "Could not select an algorithm for cudnnConvolutionForward: "
                  + jcuda.jcudnn.cudnnStatus.stringFor(selectionStatus));
        }
        algo = selectedAlgo[0];

        // Ask cuDNN how much scratch memory the selected algorithm needs.
        cudnnGetConvolutionForwardWorkspaceSize(
            cudnnHandle,
            srcTensorDesc,
            filterDesc,
            convDesc,
            dstTensorDesc,
            algo,
            sizeInBytesArray);
        if (sizeInBytesArray[0] != 0)
          jcuda.runtime.JCuda.cudaMalloc(workSpace, sizeInBytesArray[0]);
        // Recorded only after the allocation so the finally block frees exactly what was
        // allocated.
        sizeInBytes = sizeInBytesArray[0];
      } else if (CONVOLUTION_PREFERENCE
          == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT) {
        throw new DMLRuntimeException(
            "CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT is not implemented");
      } else {
        throw new DMLRuntimeException("Unsupported preference criteria for convolution");
      }

      // Scaling factors: output = alpha * conv(image, filter) + beta * output.
      // NOTE(review): the literals mix double (1.0) and float (0.0f); pointerTo is defined
      // outside this block - confirm both resolve to the element type cuDNN expects.
      alpha = pointerTo(1.0);
      beta = pointerTo(0.0f);
      int status =
          cudnnConvolutionForward(
              cudnnHandle,
              alpha,
              srcTensorDesc,
              imagePointer,
              filterDesc,
              filterPointer,
              convDesc,
              algo,
              workSpace,
              sizeInBytes,
              beta,
              dstTensorDesc,
              dstPointer);
      if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
        throw new DMLRuntimeException(
            "Could not executed cudnnConvolutionForward: "
                + jcuda.jcudnn.cudnnStatus.stringFor(status));
      }
    } finally {

      // NOTE(review): if pointerTo returns host-backed pointers, cudaFree on them is not
      // valid - confirm against the helper's definition before changing this.
      if (alpha != null) cudaFree(alpha);
      if (beta != null) cudaFree(beta);

      // Release every descriptor that was successfully created, even on failure.
      if (srcTensorDesc != null) cudnnDestroyTensorDescriptor(srcTensorDesc);
      if (dstTensorDesc != null) cudnnDestroyTensorDescriptor(dstTensorDesc);
      if (filterDesc != null) cudnnDestroyFilterDescriptor(filterDesc);
      if (convDesc != null) cudnnDestroyConvolutionDescriptor(convDesc);
      // The workspace is only device-allocated when a non-zero size was recorded.
      if (workSpace != null && sizeInBytes != 0) cudaFree(workSpace);
    }
  }
 /**
  * Clears the cached projection array and resets the CUDA state of the current device for
  * this process.
  */
 public void reset() {
   projectionArray = null;
   // cudaThreadExit() is deprecated in the CUDA runtime; cudaDeviceReset() is its documented,
   // functionally identical replacement.
   JCuda.cudaDeviceReset();
 }