/**
 * Convolves a 1D kernel down the columns of the top and bottom border rows only, dividing each
 * result by the sum of the kernel taps that overlap the image.
 *
 * <p>Rows {@code [0, offset)} and {@code [height - (kernelWidth - offset - 1), height)} of
 * {@code output} are written; every other row is left untouched. Each written value is
 * {@code (total + weight / 2) / weight} using integer division, so a truncated kernel whose
 * taps sum to zero raises {@link ArithmeticException}.
 *
 * @param kernel kernel applied along the vertical axis
 * @param input source image, read through its {@code startIndex} and {@code stride}
 * @param output destination image; its width and height define the processed region
 */
public static void vertical(Kernel1D_I32 kernel, GrayS32 input, GrayS32 output) {
    final int[] src = input.data;
    final int[] dst = output.data;
    final int[] taps = kernel.data;

    final int tapCount = kernel.getWidth();
    final int tapsAbove = kernel.getOffset();
    final int tapsBelow = tapCount - tapsAbove - 1;

    final int cols = output.getWidth();
    final int rows = output.getHeight();

    // Top border: the first (tapsAbove - row) taps would land above row 0 and are skipped.
    for (int row = 0; row < tapsAbove; row++) {
      final int firstTap = tapsAbove - row;

      int tapSum = 0;
      for (int t = firstTap; t < tapCount; t++) {
        tapSum += taps[t];
      }

      final int dstRow = output.startIndex + row * output.stride;
      for (int col = 0; col < cols; col++) {
        // The surviving taps always begin at image row 0 of this column.
        int pos = input.startIndex + col;
        int acc = 0;
        for (int t = firstTap; t < tapCount; t++) {
          acc += src[pos] * taps[t];
          pos += input.stride;
        }
        dst[dstRow + col] = (acc + tapSum / 2) / tapSum;
      }
    }

    // Bottom border: only the leading taps that still fall on or above the last row are used.
    for (int row = rows - tapsBelow; row < rows; row++) {
      final int firstSrcRow = row - tapsAbove;
      final int usedTaps = rows - firstSrcRow;

      int tapSum = 0;
      for (int t = 0; t < usedTaps; t++) {
        tapSum += taps[t];
      }

      final int dstRow = output.startIndex + row * output.stride;
      final int srcTop = input.startIndex + firstSrcRow * input.stride;
      for (int col = 0; col < cols; col++) {
        int pos = srcTop + col;
        int acc = 0;
        for (int t = 0; t < usedTaps; t++) {
          acc += src[pos] * taps[t];
          pos += input.stride;
        }
        dst[dstRow + col] = (acc + tapSum / 2) / tapSum;
      }
    }
  }
  /**
   * Evaluates a vertical 1D convolution at a single pixel, reading samples through a border
   * wrapper.
   *
   * <p>Tap {@code i} is multiplied with the sample at {@code (x, y + i - radius)}; the products
   * are summed without any normalization.
   *
   * @param kernel kernel whose radius and width select the sampled rows
   * @param input accessor used for every sample; presumably resolves out-of-image coordinates
   *     (verify against {@code ImageBorder_S32})
   * @param x column of the pixel being evaluated
   * @param y row of the pixel being evaluated
   * @return the unnormalized weighted sum
   */
  public static int vertical(Kernel1D_I32 kernel, ImageBorder_S32 input, int x, int y) {
    final int firstRow = y - kernel.getRadius();
    final int tapCount = kernel.getWidth();

    int sum = 0;
    int tap = 0;
    while (tap < tapCount) {
      sum += input.get(x, firstRow + tap) * kernel.get(tap);
      tap++;
    }
    return sum;
  }
  /**
   * Convolves a 1D kernel along the rows of the left and right border columns only, dividing
   * each result by the sum of the kernel taps that overlap the image.
   *
   * <p>For every row, columns {@code [0, offset)} and
   * {@code [width - (kernelWidth - offset - 1), width)} of {@code output} are written; the
   * columns in between are skipped. Each written value is
   * {@code (total + weight / 2) / weight} using integer division.
   *
   * @param kernel kernel applied along the horizontal axis
   * @param input source image; its width and height define the processed region
   * @param output destination image, addressed through its own {@code startIndex} and
   *     {@code stride}
   */
  public static void horizontal(Kernel1D_I32 kernel, GrayS32 input, GrayS32 output) {
    final int[] src = input.data;
    final int[] dst = output.data;
    final int[] taps = kernel.data;

    final int tapCount = kernel.getWidth();
    final int tapsLeft = kernel.getOffset();
    final int tapsRight = tapCount - tapsLeft - 1;

    final int cols = input.getWidth();
    final int rows = input.getHeight();

    for (int row = 0; row < rows; row++) {
      final int dstRow = output.startIndex + row * output.stride;
      final int srcRow = input.startIndex + row * input.stride;

      // Left border: taps that would fall left of column 0 are dropped, so the remaining
      // taps always start reading at the first pixel of the row.
      for (int col = 0; col < tapsLeft; col++) {
        int acc = 0;
        int tapSum = 0;
        int pos = srcRow;
        for (int t = tapsLeft - col; t < tapCount; t++, pos++) {
          final int tap = taps[t];
          tapSum += tap;
          acc += src[pos] * tap;
        }
        dst[dstRow + col] = (acc + tapSum / 2) / tapSum;
      }

      // Right border: only the leading taps that stay inside the row are used.
      for (int col = cols - tapsRight; col < cols; col++) {
        int acc = 0;
        int tapSum = 0;
        int pos = srcRow + col - tapsLeft;
        final int usedTaps = cols - col + tapsLeft;
        for (int t = 0; t < usedTaps; t++, pos++) {
          final int tap = taps[t];
          tapSum += tap;
          acc += src[pos] * tap;
        }
        dst[dstRow + col] = (acc + tapSum / 2) / tapSum;
      }
    }
  }
  /**
   * Evaluates a separable convolution at a single pixel of an unsigned 8-bit image.
   *
   * <p>The horizontal kernel is applied to each of the {@code width} rows centered on
   * {@code c_y}; every row sum is divided by {@code divisorHorizontal} and stored. The stored
   * values are then combined with the vertical kernel and divided by {@code divisorVertical}.
   * Both divisions are plain integer divisions. The window size is taken from the horizontal
   * kernel alone, and the image array is indexed directly with no bounds handling.
   *
   * @param horizontal kernel applied along each row; its width defines the window size
   * @param vertical kernel applied to the per-row results; its first {@code width} taps are read
   * @param input image being sampled; pixels are masked with {@code 0xFF}
   * @param c_x column of the pixel being evaluated
   * @param c_y row of the pixel being evaluated
   * @param storage scratch array receiving one intermediate value per window row
   * @param divisorHorizontal divisor applied to each horizontal sum
   * @param divisorVertical divisor applied to the final vertical sum
   * @return the convolution response at {@code (c_x, c_y)}
   */
  public static int convolve(
      Kernel1D_I32 horizontal,
      Kernel1D_I32 vertical,
      ImageUInt8 input,
      int c_x,
      int c_y,
      int storage[],
      int divisorHorizontal,
      int divisorVertical) {
    final int size = horizontal.getWidth();
    final int half = size / 2;

    // Horizontal pass: one intermediate value per row of the window.
    int rowStart = input.startIndex + (c_y - half) * input.stride + (c_x - half);
    for (int row = 0; row < size; row++, rowStart += input.stride) {
      int rowSum = 0;
      for (int col = 0; col < size; col++) {
        final int pixel = input.data[rowStart + col] & 0xFF;
        rowSum += pixel * horizontal.data[col];
      }
      storage[row] = rowSum / divisorHorizontal;
    }

    // Vertical pass over the intermediate column.
    int response = 0;
    for (int row = 0; row < size; row++) {
      response += storage[row] * vertical.data[row];
    }
    return response / divisorVertical;
  }
  /**
   * Horizontal convolution with the first three kernel taps unrolled, followed by a division.
   *
   * <p>Each output is {@code (total + divisor / 2) / divisor} cast to {@code short}. In every
   * processed row only columns {@code [radius, width - radius)} of {@code dest} are written.
   *
   * @param kernel kernel whose taps 0..2 are used and whose radius sets the skipped margin
   * @param image source image
   * @param dest destination image
   * @param divisor value each weighted sum is divided by
   * @param includeBorder when {@code false}, the first and last {@code radius} rows are skipped
   */
  public static void horizontal3(
      Kernel1D_I32 kernel, ImageSInt16 image, ImageInt16 dest, int divisor, boolean includeBorder) {
    final short[] src = image.data;
    final short[] dst = dest.data;

    final int tapA = kernel.data[0];
    final int tapB = kernel.data[1];
    final int tapC = kernel.data[2];

    final int radius = kernel.getRadius();
    final int skippedRows = includeBorder ? 0 : radius;

    // Number of outputs per row: one for every position where the window fits.
    final int outputsPerRow = image.getWidth() - radius - radius;
    final int rowLimit = image.getHeight() - skippedRows;
    final int rounding = divisor / 2;

    for (int row = skippedRows; row < rowLimit; row++) {
      final int dstRow = dest.startIndex + row * dest.stride + radius;
      final int srcRow = image.startIndex + row * image.stride;

      for (int n = 0; n < outputsPerRow; n++) {
        final int pos = srcRow + n;
        final int acc = src[pos] * tapA + src[pos + 1] * tapB + src[pos + 2] * tapC;
        dst[dstRow + n] = (short) ((acc + rounding) / divisor);
      }
    }
  }
  /**
   * Vertical convolution with the first nine kernel taps unrolled, followed by a division.
   *
   * <p>Rows {@code [radius, height - radius)} of {@code dest} are written. Each output is
   * {@code (total + divisor / 2) / divisor} cast to {@code short}.
   *
   * @param kernel kernel whose taps 0..8 are used and whose radius sets the skipped margin
   * @param image source image
   * @param dest destination image; its width and height define the processed region
   * @param divisor value each weighted sum is divided by
   * @param includeBorder when {@code false}, the first and last {@code radius} columns of each
   *     row are skipped
   */
  public static void vertical9(
      Kernel1D_I32 kernel, ImageSInt16 image, ImageInt16 dest, int divisor, boolean includeBorder) {
    final short[] src = image.data;
    final short[] dst = dest.data;

    final int[] taps = kernel.data;
    final int t0 = taps[0];
    final int t1 = taps[1];
    final int t2 = taps[2];
    final int t3 = taps[3];
    final int t4 = taps[4];
    final int t5 = taps[5];
    final int t6 = taps[6];
    final int t7 = taps[7];
    final int t8 = taps[8];

    final int radius = kernel.getRadius();

    final int cols = dest.getWidth();
    final int rows = dest.getHeight();
    final int rounding = divisor / 2;

    final int margin = includeBorder ? 0 : radius;
    final int colLimit = cols - margin;
    final int rowLimit = rows - radius;

    for (int row = radius; row < rowLimit; row++) {
      final int dstRow = dest.startIndex + row * dest.stride;
      // Index of the topmost sample of the window for column 0.
      final int srcTop = image.startIndex + (row - radius) * image.stride;

      for (int col = margin; col < colLimit; col++) {
        final int step = image.stride;
        final int pos = srcTop + col;

        final int acc =
            src[pos] * t0
                + src[pos + step] * t1
                + src[pos + 2 * step] * t2
                + src[pos + 3 * step] * t3
                + src[pos + 4 * step] * t4
                + src[pos + 5 * step] * t5
                + src[pos + 6 * step] * t6
                + src[pos + 7 * step] * t7
                + src[pos + 8 * step] * t8;

        dst[dstRow + col] = (short) ((acc + rounding) / divisor);
      }
    }
  }
  /**
   * Vertical pass of a separable, border-normalized convolution, writing only the image border.
   *
   * <p>{@code input} is convolved down its columns with {@code kernelY}. Each result is divided
   * by {@code weightX * weightY}, where {@code weightY} is the sum of the {@code kernelY} taps
   * that overlap the image at that row and {@code weightX} is the sum of the {@code kernelX}
   * taps that overlap the image at that column. The value written is
   * {@code (total + weight / 2) / weight} cast to {@code short}.
   *
   * <p>Written pixels: every column of the top {@code offsetY} rows and of the bottom
   * {@code kernelWidthY - offsetY - 1} rows, plus, for the rows in between, the left
   * {@code offsetX} columns and the right {@code kernelWidthX - offsetX - 1} columns. Interior
   * pixels are not touched.
   *
   * <p>NOTE(review): the code appears to assume the image is at least as wide as
   * {@code kernelX} and as tall as {@code kernelY}; confirm against the callers.
   *
   * @param kernelX horizontal kernel; only used to derive the per-column weight
   * @param kernelY vertical kernel applied to {@code input}
   * @param input source image; presumably the unnormalized horizontal-pass result (verify
   *     against caller)
   * @param output destination image; its width and height define the processed region
   */
  public static void vertical(
      Kernel1D_I32 kernelX, Kernel1D_I32 kernelY, GrayS32 input, GrayI16 output) {
    final int[] dataSrc = input.data;
    final short[] dataDst = output.data;
    final int[] dataKer = kernelY.data;

    final int offsetY = kernelY.getOffset();
    final int kernelWidthY = kernelY.getWidth();

    final int offsetX = kernelX.getOffset();
    final int kernelWidthX = kernelX.getWidth();
    final int offsetX1 = kernelWidthX - offsetX - 1;

    final int imgWidth = output.getWidth();
    final int imgHeight = output.getHeight();

    final int yEnd = imgHeight - (kernelWidthY - offsetY - 1);

    // Horizontal weight at column 0: only taps at or right of the kernel origin overlap.
    int startWeightX = 0;
    for (int k = offsetX; k < kernelWidthX; k++) {
      startWeightX += kernelX.data[k];
    }

    // Top border rows: taps that would land above row 0 are skipped.
    for (int y = 0; y < offsetY; y++) {
      int indexDst = output.startIndex + y * output.stride;
      int i = input.startIndex + y * input.stride;
      final int iEnd = i + imgWidth;

      int kStart = offsetY - y;

      int weightY = 0;
      for (int k = kStart; k < kernelWidthY; k++) {
        weightY += dataKer[k];
      }
      int weightX = startWeightX;

      for (int x = 0; i < iEnd; i++, x++) {
        int weight = weightX * weightY;
        int total = 0;
        int indexSrc = i - y * input.stride;
        for (int k = kStart; k < kernelWidthY; k++, indexSrc += input.stride) {
          total += (dataSrc[indexSrc]) * dataKer[k];
        }
        dataDst[indexDst++] = (short) ((total + weight / 2) / weight);
        // Update the horizontal weight for column x + 1: a tap enters on the left side of the
        // image, a tap leaves on the right side, and nothing changes in between.
        if (x < offsetX) {
          weightX += kernelX.data[offsetX - x - 1];
        } else if (x >= input.width - (kernelWidthX - offsetX)) {
          weightX -= kernelX.data[input.width - x + offsetX - 1];
        }
      }
    }

    // Bottom border rows: only the leading taps that stay inside the image are used.
    for (int y = yEnd; y < imgHeight; y++) {
      int indexDst = output.startIndex + y * output.stride;
      int i = input.startIndex + y * input.stride;
      final int iEnd = i + imgWidth;

      int kEnd = imgHeight - (y - offsetY);

      int weightY = 0;
      for (int k = 0; k < kEnd; k++) {
        weightY += dataKer[k];
      }
      int weightX = startWeightX;

      for (int x = 0; i < iEnd; i++, x++) {
        int weight = weightX * weightY;
        int total = 0;
        int indexSrc = i - offsetY * input.stride;
        for (int k = 0; k < kEnd; k++, indexSrc += input.stride) {
          total += (dataSrc[indexSrc]) * dataKer[k];
        }
        dataDst[indexDst++] = (short) ((total + weight / 2) / weight);
        if (x < offsetX) {
          weightX += kernelX.data[offsetX - x - 1];
        } else if (x >= input.width - (kernelWidthX - offsetX)) {
          weightX -= kernelX.data[input.width - x + offsetX - 1];
        }
      }
    }

    // left and right border
    int weightY = kernelY.computeSum();
    for (int y = offsetY; y < yEnd; y++) {
      int indexDst = output.startIndex + y * output.stride;
      int i = input.startIndex + y * input.stride;

      // left side
      // The left border is offsetX columns wide: it is governed by the horizontal kernel, not
      // the vertical one. Running exactly offsetX iterations also leaves weightX equal to the
      // full kernelX sum, which the right-side loop below relies on.
      int iEnd = i + offsetX;
      int weightX = startWeightX;
      for (int x = 0; i < iEnd; i++, x++) {
        int weight = weightX * weightY;
        int total = 0;
        int indexSrc = i - offsetY * input.stride;
        for (int k = 0; k < kernelWidthY; k++, indexSrc += input.stride) {
          total += (dataSrc[indexSrc]) * dataKer[k];
        }
        dataDst[indexDst++] = (short) ((total + weight / 2) / weight);
        weightX += kernelX.data[offsetX - x - 1];
      }

      // right side
      int startX = input.width - offsetX1;
      indexDst = output.startIndex + y * output.stride + startX;
      i = input.startIndex + y * input.stride + startX;
      iEnd = input.startIndex + y * input.stride + input.width;
      for (int x = startX; i < iEnd; i++, x++) {
        // Drop the tap that has just moved past the right edge before using the weight.
        weightX -= kernelX.data[input.width - x + offsetX];
        int weight = weightX * weightY;
        int total = 0;
        int indexSrc = i - offsetY * input.stride;
        for (int k = 0; k < kernelWidthY; k++, indexSrc += input.stride) {
          total += (dataSrc[indexSrc]) * dataKer[k];
        }
        dataDst[indexDst++] = (short) ((total + weight / 2) / weight);
      }
    }
  }