public static void conv2d(
      MatrixObject image,
      MatrixObject filter,
      MatrixObject outputBlock,
      int N,
      int C,
      int H,
      int W,
      int K,
      int R,
      int S,
      int pad_h,
      int pad_w,
      int stride_h,
      int stride_w,
      int P,
      int Q)
      throws DMLRuntimeException {
    cudnnTensorDescriptor srcTensorDesc = null;
    cudnnTensorDescriptor dstTensorDesc = null;
    cudnnFilterDescriptor filterDesc = null;
    cudnnConvolutionDescriptor convDesc = null;
    Pointer workSpace = null;
    long sizeInBytes = 0;
    Pointer alpha = null;
    Pointer beta = null;
    try {
      // Allocate descriptors
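      // NCHW layout: the input image is N x C x H x W, the filter is K x C x R x S, and the
      // output is N x K x P x Q, where P = (H + 2*pad_h - R)/stride_h + 1 and
      // Q = (W + 2*pad_w - S)/stride_w + 1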
      srcTensorDesc = allocateTensorDescriptor(N, C, H, W);
      dstTensorDesc = allocateTensorDescriptor(N, K, P, Q);
      filterDesc = allocateFilterDescriptor(K, C, R, S);

      // Allocate data
      Pointer imagePointer = ((JCudaObject) image._gpuHandle).jcudaPointer;
      Pointer filterPointer = ((JCudaObject) filter._gpuHandle).jcudaPointer;
      Pointer dstPointer = ((JCudaObject) outputBlock._gpuHandle).jcudaPointer;

      int padding[] = {pad_h, pad_w};
      int strides[] = {stride_h, stride_w};
      convDesc = allocateConvolutionDescriptor(padding, strides);

      // Select the best forward algorithm depending on the data and the CUDA/cuDNN features
      // supported on this device

      int algo = -1;
      workSpace = new Pointer();

      if (CONVOLUTION_PREFERENCE
          == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE) {
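        // IMPLICIT_GEMM requires no additional workspace memory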
        algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
      } else if (CONVOLUTION_PREFERENCE
          == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST) {
        // cuDNN writes the chosen algorithm into this single-element output array; the return
        // value of cudnnGetConvolutionForwardAlgorithm is only a status code.
        int[] algos = {-1};
        // TODO: Look into FFT, Winograd, etc.
        // Also ensure that the GPU has enough memory for the workspace allocation
        long sizeInBytesArray[] = {0};
        jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(
            cudnnHandle,
            srcTensorDesc,
            filterDesc,
            convDesc,
            dstTensorDesc,
            CONVOLUTION_PREFERENCE,
            0, // workspace memory limit in bytes; only used with SPECIFY_WORKSPACE_LIMIT
            algos);
        algo = algos[0];
        cudnnGetConvolutionForwardWorkspaceSize(
            cudnnHandle,
            srcTensorDesc,
            filterDesc,
            convDesc,
            dstTensorDesc,
            algo,
            sizeInBytesArray);
        if (sizeInBytesArray[0] != 0)
          jcuda.runtime.JCuda.cudaMalloc(workSpace, sizeInBytesArray[0]);
        sizeInBytes = sizeInBytesArray[0];
      } else if (CONVOLUTION_PREFERENCE
          == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT) {
        throw new DMLRuntimeException(
            "CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT is not implemented");
      } else {
        throw new DMLRuntimeException("Unsupported preference criteria for convolution");
      }

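      // Launch the forward convolution: dst = alpha * conv(image, filter) + beta * dst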
      alpha = pointerTo(1.0);
      beta = pointerTo(0.0);
      int status =
          cudnnConvolutionForward(
              cudnnHandle,
              alpha,
              srcTensorDesc,
              imagePointer,
              filterDesc,
              filterPointer,
              convDesc,
              algo,
              workSpace,
              sizeInBytes,
              beta,
              dstTensorDesc,
              dstPointer);
      if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
        throw new DMLRuntimeException(
            "Could not executed cudnnConvolutionForward: "
                + jcuda.jcudnn.cudnnStatus.stringFor(status));
      }
    } finally {

      if (alpha != null) cudaFree(alpha);
      if (beta != null) cudaFree(beta);

      if (srcTensorDesc != null) cudnnDestroyTensorDescriptor(srcTensorDesc);
      if (dstTensorDesc != null) cudnnDestroyTensorDescriptor(dstTensorDesc);
      if (filterDesc != null) cudnnDestroyFilterDescriptor(filterDesc);
      if (convDesc != null) cudnnDestroyConvolutionDescriptor(convDesc);
      if (workSpace != null && sizeInBytes != 0) cudaFree(workSpace);
    }
  }
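
  /**
   * Computes the gradient of a 2D convolution with respect to its input image (the cuDNN
   * "backward data" pass). The filter is of shape K x C x R x S, dout holds the gradient of
   * the output (N x K x P x Q), and the result is written into output as an N x C x H x W block.
   */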
  public static void conv2d_backward_data(
      MatrixObject filter,
      MatrixObject dout,
      MatrixObject output,
      int N,
      int C,
      int H,
      int W,
      int K,
      int R,
      int S,
      int pad_h,
      int pad_w,
      int stride_h,
      int stride_w,
      int P,
      int Q)
      throws DMLRuntimeException {
    Pointer alpha = null;
    Pointer beta = null;
    cudnnTensorDescriptor dyDesc = null;
    cudnnTensorDescriptor dxDesc = null;
    cudnnFilterDescriptor wDesc = null;
    cudnnConvolutionDescriptor convDesc = null;

    Pointer workSpace = null;
    long sizeInBytes = 0;
    try {
      // Allocate descriptors
      wDesc = allocateFilterDescriptor(K, C, R, S);
      dyDesc = allocateTensorDescriptor(N, K, P, Q);
      dxDesc = allocateTensorDescriptor(N, C, H, W);

      // Allocate data
      Pointer w = ((JCudaObject) filter._gpuHandle).jcudaPointer;
      Pointer dy = ((JCudaObject) dout._gpuHandle).jcudaPointer;
      Pointer dx = ((JCudaObject) output._gpuHandle).jcudaPointer;

      alpha = pointerTo(1.0); // TODO
      beta = pointerTo(0.0);

      int padding[] = {pad_h, pad_w};
      int strides[] = {stride_h, stride_w};
      convDesc = allocateConvolutionDescriptor(padding, strides);
      long sizeInBytesArray[] = {0};

      // TODO: Select the best algorithm depending on the data and supported CUDA
      int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      workSpace = new Pointer();
      cudnnGetConvolutionBackwardDataWorkspaceSize(
          cudnnHandle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
      if (sizeInBytesArray[0] != 0)
        jcuda.runtime.JCuda.cudaMalloc(workSpace, sizeInBytesArray[0]);
      sizeInBytes = sizeInBytesArray[0];

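      // Launch the backward-data convolution: dx = alpha * conv_backward_data(w, dy) + beta * dx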
      int status =
          cudnnConvolutionBackwardData(
              cudnnHandle,
              alpha,
              wDesc,
              w,
              dyDesc,
              dy,
              convDesc,
              algo,
              workSpace,
              sizeInBytes,
              beta,
              dxDesc,
              dx);
      if (status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
        throw new DMLRuntimeException(
            "Could not executed cudnnConvolutionBackwardData: "
                + jcuda.jcudnn.cudnnStatus.stringFor(status));
      }
    } finally {
      if (alpha != null) cudaFree(alpha);
      if (beta != null) cudaFree(beta);
      if (dyDesc != null) cudnnDestroyTensorDescriptor(dyDesc);
      if (dxDesc != null) cudnnDestroyTensorDescriptor(dxDesc);
      if (wDesc != null) cudnnDestroyFilterDescriptor(wDesc);
      if (convDesc != null) cudnnDestroyConvolutionDescriptor(convDesc);
      if (workSpace != null && sizeInBytes != 0) cudaFree(workSpace);
    }
  }