/**
 * Convolves the first nine kernel coefficients, read as three rows of three, over the
 * interior of {@code src} and writes the scaled result into {@code dest}.
 *
 * <p>Only pixels whose x and y lie in {@code [radius, size - radius)} are written, where
 * {@code radius} is {@code kernel.getRadius()}; all other entries of {@code dest} are left
 * untouched. Each output is {@code (sum + divisor / 2) / divisor} using integer division,
 * narrowed to {@code short}.
 *
 * <p>NOTE(review): the three column taps are hard-coded, so this presumably expects a
 * kernel of radius 1 -- confirm against callers.
 *
 * @param kernel kernel whose {@code data} array supplies the coefficients, row after row
 * @param src image providing the input samples
 * @param dest image receiving the output samples
 * @param divisor value each accumulated sum is divided by; half of it is added first
 */
public static void convolve3(Kernel2D_I32 kernel, ImageSInt16 src, ImageInt16 dest, int divisor) {
    final short[] srcPixels = src.data;
    final short[] dstPixels = dest.data;

    final int imgWidth = src.getWidth();
    final int imgHeight = src.getHeight();
    final int roundingOffset = divisor / 2;

    final int radius = kernel.getRadius();
    final int[] rowSums = new int[imgWidth];

    final int xEnd = imgWidth - radius;
    final int yEnd = imgHeight - radius;

    for (int y = radius; y < yEnd; y++) {

      // accumulate one kernel row at a time into rowSums
      for (int kernelRow = 0; kernelRow < 3; kernelRow++) {
        final int coeffBase = kernelRow * 3;
        final int c0 = kernel.data[coeffBase];
        final int c1 = kernel.data[coeffBase + 1];
        final int c2 = kernel.data[coeffBase + 2];

        // rowOrigin + x is the index of the leftmost sample used for column x
        final int rowOrigin = src.startIndex + (y + kernelRow - radius) * src.stride - radius;

        if (kernelRow == 0) {
          // the first kernel row overwrites whatever the previous image row left behind
          for (int x = radius; x < xEnd; x++) {
            final int tap = rowOrigin + x;
            final int contribution =
                srcPixels[tap] * c0 + srcPixels[tap + 1] * c1 + srcPixels[tap + 2] * c2;
            rowSums[x] = contribution;
          }
        } else {
          // remaining kernel rows are added on top
          for (int x = radius; x < xEnd; x++) {
            final int tap = rowOrigin + x;
            final int contribution =
                srcPixels[tap] * c0 + srcPixels[tap + 1] * c1 + srcPixels[tap + 2] * c2;
            rowSums[x] += contribution;
          }
        }
      }

      // scale the accumulated sums and store them in the matching destination row
      final int dstOrigin = dest.startIndex + y * dest.stride;
      for (int x = radius; x < xEnd; x++) {
        dstPixels[dstOrigin + x] = (short) ((rowSums[x] + roundingOffset) / divisor);
      }
    }
  }
  /**
   * Computes the kernel-weighted sum of the samples around ({@code x}, {@code y}).
   *
   * <p>For every pair of offsets in {@code [0, kernel.getWidth())} the sample at
   * ({@code x + col - radius}, {@code y + row - radius}) is fetched through
   * {@code input.get} and multiplied by {@code kernel.get(row, col)}. The sum is returned
   * as is; no division or rounding is applied here.
   *
   * @param kernel kernel supplying the radius, the width and the coefficients
   * @param input accessor the samples are read from; how it treats coordinates outside the
   *     image is decided by its implementation, which is not visible here
   * @param x x-coordinate the kernel is centered on
   * @param y y-coordinate the kernel is centered on
   * @return the accumulated weighted sum
   */
  public static int convolve(Kernel2D_I32 kernel, ImageBorder_S32 input, int x, int y) {
    final int radius = kernel.getRadius();
    final int size = kernel.getWidth();

    int sum = 0;

    for (int row = 0; row < size; row++) {
      final int sampleY = y + row - radius;
      for (int col = 0; col < size; col++) {
        final int sampleX = x + col - radius;
        // NOTE(review): the kernel is addressed as (row, col) while the image is addressed
        // as (x, y) -- confirm this matches the argument order of Kernel2D_I32.get.
        sum += input.get(sampleX, sampleY) * kernel.get(row, col);
      }
    }

    return sum;
  }