/**
 * Normalized 2D convolution applied only to the pixels along the image border.
 *
 * <p>A border pixel is one whose kernel footprint would extend past an image edge: the first
 * {@code offsetL} columns/rows and the last {@code offsetR} columns/rows, where
 * {@code offsetL = kernel.getOffset()} and {@code offsetR = kernel.getWidth() - offsetL - 1}.
 * For each such pixel only the kernel elements that overlap the image are used, and the weighted
 * sum is divided by the sum of those same kernel elements. Interior pixels of {@code output} are
 * not written by this method.</p>
 *
 * <p>The kernel is indexed as a square of side {@code kernel.getWidth()} with the same offset on
 * both axes. No size checks are performed: the loops assume the image is at least
 * {@code kernel.getWidth() - 1} pixels wide and tall; smaller images would be indexed outside
 * the intended rows/columns. If the overlapping kernel weights sum to zero the result of the
 * division is NaN or infinite.</p>
 *
 * @param kernel convolution kernel; its {@code data}, width and offset are read
 * @param input source image; read through {@code data}, {@code startIndex} and {@code stride}
 * @param output destination image; only its border pixels are modified
 */
public static void convolve(Kernel2D_F64 kernel, GrayF64 input, GrayF64 output) {
    final double[] dataSrc = input.data;
    final double[] dataDst = output.data;
    final double[] dataKer = kernel.data;

    final int kernelWidth = kernel.getWidth();
    final int offsetL = kernel.getOffset();
    final int offsetR = kernelWidth - offsetL - 1;

    final int width = input.getWidth();
    final int height = input.getHeight();

    // Left and right borders over the full image height; this also covers the four corners.
    for (int y = 0; y < height; y++) {

      // Clip the kernel's row range so that y + i stays inside [0, height).
      final int minI = y >= offsetL ? -offsetL : -y;
      final int maxI = y < height - offsetR ? offsetR : height - y - 1;

      final int indexSrcRow = input.startIndex + y * input.stride;

      // Left border: columns to the left of the image are dropped (j starts at -x).
      int indexDst = output.startIndex + y * output.stride;
      for (int x = 0; x < offsetL; x++) {
        dataDst[indexDst++] = normalizedBorderValue(dataSrc, dataKer, indexSrcRow + x,
            input.stride, kernelWidth, offsetL, minI, maxI, -x, offsetR);
      }

      // Right border: columns to the right of the image are dropped (j stops at width - x - 1).
      indexDst = output.startIndex + y * output.stride + width - offsetR;
      for (int x = width - offsetR; x < width; x++) {
        dataDst[indexDst++] = normalizedBorderValue(dataSrc, dataKer, indexSrcRow + x,
            input.stride, kernelWidth, offsetL, minI, maxI, -offsetL, width - x - 1);
      }
    }

    // Top border, restricted to the columns not already handled above (avoids redoing corners).
    for (int y = 0; y < offsetL; y++) {

      final int indexSrcRow = input.startIndex + y * input.stride;
      int indexDst = output.startIndex + y * output.stride + offsetL;

      for (int x = offsetL; x < width - offsetR; x++) {
        dataDst[indexDst++] = normalizedBorderValue(dataSrc, dataKer, indexSrcRow + x,
            input.stride, kernelWidth, offsetL, -y, offsetR, -offsetL, offsetR);
      }
    }

    // Bottom border, again restricted to the columns between the left and right borders.
    for (int y = height - offsetR; y < height; y++) {

      final int maxI = height - y - 1;
      final int indexSrcRow = input.startIndex + y * input.stride;
      int indexDst = output.startIndex + y * output.stride + offsetL;

      for (int x = offsetL; x < width - offsetR; x++) {
        dataDst[indexDst++] = normalizedBorderValue(dataSrc, dataKer, indexSrcRow + x,
            input.stride, kernelWidth, offsetL, -offsetL, maxI, -offsetL, offsetR);
      }
    }
  }

  /**
   * Computes the weighted sum of source pixels over a clipped kernel window, divided by the sum
   * of the kernel weights that were used.
   *
   * <p>Accumulation runs row by row (i outer, j inner) so the floating-point summation order is
   * fixed.</p>
   *
   * @param dataSrc source pixel array
   * @param dataKer kernel weights, row-major with {@code kernelWidth} elements per row
   * @param indexCenter index in {@code dataSrc} of the pixel being computed
   * @param stride distance in {@code dataSrc} between vertically adjacent pixels
   * @param kernelWidth number of elements in one kernel row
   * @param offsetL kernel offset, added to i and j to obtain kernel row/column indices
   * @param minI first row offset relative to the pixel, inclusive
   * @param maxI last row offset relative to the pixel, inclusive
   * @param minJ first column offset relative to the pixel, inclusive
   * @param maxJ last column offset relative to the pixel, inclusive
   * @return weighted sum divided by the sum of the weights used
   */
  private static double normalizedBorderValue(double[] dataSrc, double[] dataKer,
      int indexCenter, int stride, int kernelWidth, int offsetL,
      int minI, int maxI, int minJ, int maxJ) {
    double total = 0;
    double weight = 0;

    for (int i = minI; i <= maxI; i++) {
      final int indexSrc = indexCenter + i * stride;
      // Index of the kernel element at row offset i, column offset 0.
      final int indexKer = (i + offsetL) * kernelWidth + offsetL;

      for (int j = minJ; j <= maxJ; j++) {
        final double w = dataKer[indexKer + j];
        weight += w;
        total += dataSrc[indexSrc + j] * w;
      }
    }

    return total / weight;
  }