/**
   * Applies a vertical convolution to the top and bottom border rows of the image, dividing each
   * result by the sum of the kernel coefficients that were actually used.
   *
   * <p>Only output rows {@code [0, offset)} and {@code [height - (width - offset - 1), height)} are
   * written, where {@code offset} and {@code width} come from the kernel and {@code height} from
   * {@code output}. Rows in between are not touched by this method. Coefficients that would land
   * above the first row or below the last row are skipped.
   *
   * <p>NOTE(review): no size checks are performed. The indexing appears to assume that input and
   * output have the same shape and that the image has at least {@code kernel width - 1} rows;
   * confirm that callers guarantee this.
   *
   * @param kernel 1D kernel; its {@code data}, width and offset are read
   * @param input source image; read through {@code data}, {@code startIndex} and {@code stride}
   * @param output destination image; supplies the image width/height and receives the border rows
   */
  public static void vertical(Kernel1D_F32 kernel, GrayF32 input, GrayF32 output) {
    final float[] src = input.data;
    final float[] dst = output.data;
    final float[] taps = kernel.data;

    final int numTaps = kernel.getWidth();
    final int before = kernel.getOffset();
    final int after = numTaps - before - 1;

    final int cols = output.getWidth();
    final int rows = output.getHeight();

    // Top border: the first (before - row) coefficients would fall above row 0, so they are skipped.
    for (int row = 0; row < before; row++) {
      final int firstTap = before - row;

      // The normalization factor is the same for every pixel in this row.
      float norm = 0;
      for (int t = firstTap; t < numTaps; t++) {
        norm += taps[t];
      }

      int dstIndex = output.startIndex + row * output.stride;
      for (int col = 0; col < cols; col++) {
        // The first usable coefficient always lines up with image row 0 of this column.
        int srcIndex = input.startIndex + col;
        float sum = 0;
        for (int t = firstTap; t < numTaps; t++) {
          sum += src[srcIndex] * taps[t];
          srcIndex += input.stride;
        }
        dst[dstIndex++] = sum / norm;
      }
    }

    // Bottom border: only the leading coefficients that still overlap the image are used.
    for (int row = rows - after; row < rows; row++) {
      final int firstSrcRow = row - before;
      final int tapCount = rows - firstSrcRow;

      float norm = 0;
      for (int t = 0; t < tapCount; t++) {
        norm += taps[t];
      }

      final int srcRowStart = input.startIndex + firstSrcRow * input.stride;
      int dstIndex = output.startIndex + row * output.stride;
      for (int col = 0; col < cols; col++) {
        int srcIndex = srcRowStart + col;
        float sum = 0;
        for (int t = 0; t < tapCount; t++) {
          sum += src[srcIndex] * taps[t];
          srcIndex += input.stride;
        }
        dst[dstIndex++] = sum / norm;
      }
    }
  }
  /**
   * Evaluates a vertical 1D convolution at a single pixel, reading samples through the border
   * wrapper's {@code get(x, y)} accessor.
   *
   * <p>Coefficient {@code i} is multiplied by the sample at row {@code y + i - radius}, for every
   * {@code i} in {@code [0, kernel width)}. The result is the plain weighted sum; no normalization
   * is applied here.
   *
   * @param kernel 1D kernel supplying the radius, width and coefficients
   * @param input pixel source queried with {@code get(x, row)}
   * @param x column of the pixel being evaluated
   * @param y row of the pixel being evaluated
   * @return the weighted sum of the sampled column
   */
  public static float vertical(Kernel1D_F32 kernel, ImageBorder_F32 input, int x, int y) {
    final int radius = kernel.getRadius();
    final int numTaps = kernel.getWidth();

    float sum = 0;
    int row = y - radius;
    for (int tap = 0; tap < numTaps; tap++, row++) {
      sum += input.get(x, row) * kernel.get(tap);
    }
    return sum;
  }
  /**
   * Applies a horizontal convolution to the left and right border columns of every row, dividing
   * each result by the sum of the kernel coefficients that were actually used.
   *
   * <p>For each row, the first {@code offset} columns and the last {@code width - offset - 1}
   * columns are written (kernel {@code offset}/{@code width}); the columns in between are skipped
   * over and left untouched. Coefficients that would land left of the first column or right of the
   * last column are ignored.
   *
   * <p>NOTE(review): no size checks are performed. The image dimensions are taken from
   * {@code input}, and the indexing appears to assume that the output has the same shape and that
   * the image is at least {@code kernel width - 1} columns wide; confirm with callers.
   *
   * @param kernel 1D kernel; its {@code data}, width and offset are read
   * @param input source image; supplies the image width/height and the pixel data
   * @param output destination image receiving the border columns
   */
  public static void horizontal(Kernel1D_F32 kernel, GrayF32 input, GrayF32 output) {
    final float[] src = input.data;
    final float[] dst = output.data;
    final float[] taps = kernel.data;

    final int numTaps = kernel.getWidth();
    final int before = kernel.getOffset();
    final int after = numTaps - before - 1;

    final int cols = input.getWidth();
    final int rows = input.getHeight();

    for (int row = 0; row < rows; row++) {
      final int srcRow = input.startIndex + row * input.stride;
      final int dstRow = output.startIndex + row * output.stride;

      // Left border: coefficients before index (before - col) would fall left of the image.
      int col = 0;
      for (; col < before; col++) {
        float sum = 0;
        float norm = 0;
        int srcIndex = srcRow;
        for (int t = before - col; t < numTaps; t++, srcIndex++) {
          final float tap = taps[t];
          norm += tap;
          sum += src[srcIndex] * tap;
        }
        dst[dstRow + col] = sum / norm;
      }

      // Jump over the interior columns, which are not processed here.
      col += cols - (before + after);

      // Right border: only the leading coefficients that still overlap the row are used.
      for (; col < cols; col++) {
        float sum = 0;
        float norm = 0;
        int srcIndex = srcRow + col - before;
        final int tapCount = cols - col + before;
        for (int t = 0; t < tapCount; t++, srcIndex++) {
          final float tap = taps[t];
          norm += tap;
          sum += src[srcIndex] * tap;
        }
        dst[dstRow + col] = sum / norm;
      }
    }
  }
  /**
   * Evaluates a separable 2D convolution at a single pixel.
   *
   * <p>The horizontal kernel is first applied to each of the {@code width} image rows starting at
   * row {@code c_y - width / 2}, with every row read from column {@code c_x - width / 2}; the
   * per-row results are written to {@code storage}. Those intermediate values are then combined
   * with the vertical kernel's coefficients to produce the returned value.
   *
   * <p>{@code width} is taken from the horizontal kernel and is also used to index the vertical
   * kernel. Pixels are read directly from {@code input.data} with no border handling or bounds
   * checks in this method.
   *
   * @param horizontal kernel applied along each row; also defines the window size
   * @param vertical kernel applied to the per-row results
   * @param input source image
   * @param c_x column of the pixel being evaluated
   * @param c_y row of the pixel being evaluated
   * @param storage work array; indices {@code [0, width)} are overwritten
   * @return the convolution response at {@code (c_x, c_y)}
   */
  public static float convolve(
      Kernel1D_F32 horizontal,
      Kernel1D_F32 vertical,
      ImageFloat32 input,
      int c_x,
      int c_y,
      float storage[]) {
    final int size = horizontal.getWidth();
    final int half = size / 2;

    final float[] pixels = input.data;
    final float[] rowTaps = horizontal.data;

    // Horizontal pass: one weighted sum per row of the window.
    for (int row = 0; row < size; row++) {
      int pixelIndex = input.startIndex + (row + c_y - half) * input.stride + c_x - half;

      float rowSum = 0;
      for (int t = 0; t < size; t++) {
        rowSum += pixels[pixelIndex++] * rowTaps[t];
      }
      storage[row] = rowSum;
    }

    // Vertical pass: combine the per-row sums.
    final float[] colTaps = vertical.data;
    float response = 0;
    for (int row = 0; row < size; row++) {
      response += storage[row] * colTaps[row];
    }
    return response;
  }