  // Same as byteLoop in SubsampleBinaryToGray. Could byteLoop be made protected there, so that
  // this class extends it (using the protected method) and this copy can be dropped?
  /**
   * General subsampling loop over a packed (one bit per pixel) source.
   *
   * <p>For every destination pixel in {@code destRect}, the bits of the matching
   * {@code blockX} x {@code blockY} source block are run through {@code lut} (presumably a
   * set-bit count table - confirm against where it is built), the per-row results are summed,
   * and the sum is translated to an output byte through {@code lutGray}.
   *
   * @param source raster whose packed pixels are fetched over its full bounds
   * @param dest raster to write; it is cast to a {@code PixelInterleavedSampleModel} backed by a
   *     {@code DataBufferByte}, and one byte is written per destination pixel
   * @param destRect region of {@code dest} to fill
   */
  private void byteLoop(Raster source, WritableRaster dest, Rectangle destRect) {
    PixelAccessor accessor = new PixelAccessor(source.getSampleModel(), null);
    PackedImageData packed = accessor.getPackedPixels(source, source.getBounds(), false, false);

    // Packed source bytes and the offset of the first byte of the fetched rectangle.
    final byte[] srcBytes = packed.data;
    final int srcBase = packed.offset;

    final int outX = destRect.x;
    final int outY = destRect.y;
    final int outWidth = destRect.width;
    final int outHeight = destRect.height;

    // Source coordinates are made relative to the origin of the fetched rectangle.
    final int srcOriginX = packed.rect.x;
    final int srcOriginY = packed.rect.y;
    final int srcBitBase = packed.bitOffset;
    final int srcStride = packed.lineStride;

    PixelInterleavedSampleModel outModel = (PixelInterleavedSampleModel) dest.getSampleModel();
    DataBufferByte outBuffer = (DataBufferByte) dest.getDataBuffer();
    final int outTransX = dest.getSampleModelTranslateX();
    final int outTransY = dest.getSampleModelTranslateY();
    final int outStride = outModel.getScanlineStride();
    final byte[] outBytes = outBuffer.getData();
    final int outBase = outBuffer.getOffset();

    // Per destination column: byte index and bit-in-byte of the block's first source bit.
    // These do not depend on the row, so they are computed once.
    int[] firstByte = new int[outWidth];
    int[] firstBit = new int[outWidth];
    int[] bitsOn = new int[outWidth]; // running total per column, zero on allocation
    for (int col = 0; col < outWidth; col++) {
      int bitPos = srcBitBase + (xValues[outX + col] - srcOriginX);
      firstByte[col] = bitPos >> 3;
      firstBit[col] = bitPos % 8;
    }

    for (int row = 0; row < outHeight; row++) {
      final int blockTop = yValues[outY + row];
      final int blockBottom = blockTop + blockY;

      // Accumulate the lut totals of every source row belonging to this block row.
      for (int srcY = blockTop; srcY < blockBottom; srcY++) {
        final int rowBase = (srcY - srcOriginY) * srcStride + srcBase;

        for (int col = 0; col < outWidth; col++) {
          final int startByte = firstByte[col];
          final int startBit = firstBit[col];
          final int spanEnd = startBit + blockX - 1;
          final int endByte = startByte + (spanEnd >> 3); // byte holding the block's last bit
          final int endBit = spanEnd % 8; // position of the last bit inside that byte
          final int head = 0x00ff & (int) srcBytes[rowBase + startByte];
          final int headShift = 24 + startBit;

          int rowCount;
          if (startByte == endByte) {
            // Whole span sits in one byte: push the leading bits out of the top of the int,
            // then bring the span down to the low bits.
            rowCount = lut[(head << headShift) >>> (31 - endBit + startBit)];
          } else {
            // First byte: clear the bits that precede the span.
            rowCount = lut[(head << headShift) >>> headShift];
            // Bytes strictly between the first and last are counted whole.
            for (int b = startByte + 1; b < endByte; b++) {
              rowCount += lut[0x00ff & (int) srcBytes[rowBase + b]];
            }
            // Last byte: drop the bits that follow the span.
            int tail = 0x00ff & (int) srcBytes[rowBase + endByte];
            rowCount += lut[tail >>> (7 - endBit)];
          }
          bitsOn[col] += rowCount;
        }
      }

      // Emit destination row `row` and clear the totals for the next block row.
      final int outRowStart =
          (row + outY - outTransY) * outStride + outBase + (outX - outTransX);
      for (int col = 0; col < outWidth; col++) {
        outBytes[outRowStart + col] = lutGray[bitsOn[col]];
        bitsOn[col] = 0;
      }
    }
  }
  // Fast path for 4x4 blocks. It only applies when every block starts at bit 0 or bit 4
  // of a source byte, so that each source byte holds exactly two block columns.
  /**
   * Fast path for 4x4 blocks that are nibble-aligned in the packed source.
   *
   * <p>Each source byte then holds exactly two block columns (high nibble, low nibble), so the
   * four rows of a block are read with one byte access per row and both nibbles are run through
   * {@code lut} together. The totals are translated to output bytes through {@code lutGray}.
   *
   * <p>Alignment is decided from the bit position of the first block,
   * {@code (destRect.x << 2) - pid.rect.x + pid.bitOffset}, not from {@code pid.bitOffset} alone:
   * if the fetched rectangle's x origin is not a multiple of 4, the blocks are misaligned even
   * when the bit offset is 0 or 4. Unaligned input is delegated to
   * {@link #byteLoop(Raster, WritableRaster, Rectangle)}.
   *
   * @param source raster whose packed pixels are fetched over its full bounds
   * @param dest raster to write; it is cast to a {@code PixelInterleavedSampleModel} backed by a
   *     {@code DataBufferByte}, and one byte is written per destination pixel
   * @param destRect region of {@code dest} to fill
   */
  private void byteLoop4x4(Raster source, WritableRaster dest, Rectangle destRect) {
    PixelAccessor pa = new PixelAccessor(source.getSampleModel(), null);
    PackedImageData pid = pa.getPackedPixels(source, source.getBounds(), false, false);

    int dx = destRect.x;
    int sourceTransX = pid.rect.x;
    int sourceDataBitOffset = pid.bitOffset;

    // Bit position, counted from the start of a packed source row, of the first block's first
    // pixel (blockX * dx - sourceTransX + sourceDataBitOffset with blockX == 4). Every later
    // block is a multiple of 4 bits further on, so checking the first block is sufficient.
    int firstBlockBit = (dx << 2) - sourceTransX + sourceDataBitOffset;
    if ((firstBlockBit & 3) != 0) {
      // Blocks straddle nibble boundaries: the nibble shortcut below would mix neighbouring
      // blocks, so use the general loop instead.
      byteLoop(source, dest, destRect);
      return;
    }

    byte[] sourceData = pid.data;
    int sourceDBOffset = pid.offset;
    int dy = destRect.y;
    int dwi = destRect.width;
    int dhi = destRect.height;
    int sourceTransY = pid.rect.y;
    int sourceScanlineStride = pid.lineStride;

    PixelInterleavedSampleModel destSM = (PixelInterleavedSampleModel) dest.getSampleModel();
    DataBufferByte destDB = (DataBufferByte) dest.getDataBuffer();
    int destTransX = dest.getSampleModelTranslateX();
    int destTransY = dest.getSampleModelTranslateY();
    int destScanlineStride = destSM.getScanlineStride();

    byte[] destData = destDB.getData();
    int destDBOffset = destDB.getOffset();

    // [0] = total for the high nibble of the current byte column, [1] = total for the low nibble.
    int[] sAreaBitsOn = new int[2];

    for (int j = 0; j < dhi; j++) {
      int y = (dy + j) << 2; // top source row of this block row: (dy + j) * 4
      int sourceYOffset = (y - sourceTransY) * sourceScanlineStride + sourceDBOffset;

      int destYOffset = (j + dy - destTransY) * destScanlineStride + destDBOffset;
      destYOffset += dx - destTransX;

      // Bit position of the next block to emit; every row starts at the same column.
      int sbitnumi = firstBlockBit;

      for (int i = 0; i < dwi; ) {
        int sbytenumi = sbitnumi >> 3;
        int sstartbiti = sbitnumi % 8; // 0 or 4 here because of the alignment guard above
        int byteindex = sourceYOffset + sbytenumi;

        // Sum the four rows of the two blocks that share this byte column.
        sAreaBitsOn[0] = sAreaBitsOn[1] = 0;
        for (int k = 0; k < 4; k++, byteindex += sourceScanlineStride) {
          int selement = 0x00ff & (int) sourceData[byteindex];
          sAreaBitsOn[1] += lut[selement & 0x000f];
          sAreaBitsOn[0] += lut[selement >> 4];
        }

        // Turn the bit position into a nibble index: 0 means the block at dest column i is the
        // high nibble, 1 means it is the low nibble (only the first byte of a row can start
        // at 1).
        sstartbiti >>= 2;

        // Emit one or two destination pixels from this byte column, stopping at the row end.
        while (sstartbiti < 2 && i < dwi) {
          destData[destYOffset + i] = lutGray[sAreaBitsOn[sstartbiti]];
          sstartbiti++;
          i++;
          sbitnumi += blockX;
        }
      }
    }
  }