// Fills the gf256res / gf256resInv lookup tables: for every byte position (0..15) and every
// byte value (0..255), a block that is zero except for that one byte is run through kuz_l
// (respectively kuz_l_inv) and the result is stored as 4 ints at (position + 16 * value) * 4.
static {
    gf256res = new int[16 * 256 * 4];
    gf256resInv = new int[16 * 256 * 4];

    byte[] block = new byte[16];
    for (int position = 0; position < 16; position++) {
      for (int value = 0; value < 256; value++) {
        // Both tables use the same slot for a given (position, value) pair.
        int slot = (position + (16 * value)) * 4;

        // Forward table entry.
        java.util.Arrays.fill(block, (byte) 0);
        block[position] = (byte) value;
        kuz_l(block);
        Pack.bigEndianToInt(block, 0, gf256res, slot, 4);

        // Inverse table entry; the scratch block is reset because kuz_l wrote into it
        // (presumably in place -- confirm against kuz_l).
        java.util.Arrays.fill(block, (byte) 0);
        block[position] = (byte) value;
        kuz_l_inv(block);
        Pack.bigEndianToInt(block, 0, gf256resInv, slot, 4);
      }
    }
  }
  /**
   * Expands a 32-byte key into ten round keys of four ints each.
   *
   * <p>The first two round keys are the two 16-byte halves of {@code key}, read with
   * {@code Pack.bigEndianToInt}. The remaining eight are produced by 32 iterations of a
   * mixing step; after every eighth iteration the current pair of 128-bit values is
   * stored as the next two round keys. NOTE(review): this looks like the Kuznechik
   * (GOST R 34.12-2015) key schedule -- confirm against the specification.
   *
   * @param key the raw key; must be exactly 32 bytes
   * @return a {@code [10][4]} array of round-key words
   * @throws IllegalArgumentException if {@code key} is not 32 bytes long
   */
  static int[][] convertKey(byte[] key) {
    if (key.length != 32) {
      throw new IllegalArgumentException("Key must be 32 bytes long");
    }

    int[][] kuz = new int[10][4];

    int[] c = new int[4]; // per-iteration constant
    int[] x = new int[4]; // first half of the running pair
    int[] y = new int[4]; // second half of the running pair
    int[] z = new int[4]; // scratch for the new value of x

    // Round keys 0 and 1 are the key halves themselves.
    for (int w = 0; w < 4; w++) {
      kuz[0][w] = x[w] = Pack.bigEndianToInt(key, 4 * w);
      kuz[1][w] = y[w] = Pack.bigEndianToInt(key, 16 + 4 * w);
    }

    for (int i = 1; i <= 32; i++) {
      // Iteration constant: the counter in the last word, passed through kuz_l_fast.
      c[0] = 0;
      c[1] = 0;
      c[2] = 0;
      c[3] = i;
      kuz_l_fast(c);

      // z = byte-wise kuz_pi substitution of (x ^ c). The "& 0xFF" on each table
      // lookup keeps the looked-up value in 0..255 before it is shifted into place.
      for (int w = 0; w < 4; w++) {
        int mixed = x[w] ^ c[w];
        z[w] =
            (kuz_pi[mixed & 0xFF] & 0xFF)
                | ((kuz_pi[(mixed >> 8) & 0xFF] & 0xFF) << 8)
                | ((kuz_pi[(mixed >> 16) & 0xFF] & 0xFF) << 16)
                | ((kuz_pi[(mixed >> 24) & 0xFF] & 0xFF) << 24);
      }

      kuz_l_fast(z);

      // (x, y) <- (z ^ y, x)
      for (int w = 0; w < 4; w++) {
        z[w] ^= y[w];
        y[w] = x[w];
        x[w] = z[w];
      }

      // Every 8th iteration emits two round keys: i = 8, 16, 24, 32 -> rows 2..9.
      if ((i & 7) == 0) {
        int row = i >> 2;
        for (int w = 0; w < 4; w++) {
          kuz[row][w] = x[w];
          kuz[row + 1][w] = y[w];
        }
      }
    }

    return kuz;
  }
 /**
  * Writes the four state words C0..C3 into {@code dest} with {@code Pack.intToBigEndian},
  * four bytes apart, starting at {@code destOffset}.
  *
  * @param dest destination buffer
  * @param destOffset index in {@code dest} where C0 is written
  */
 private void packBlock(byte[] dest, int destOffset) {
   int[] words = {C0, C1, C2, C3};
   for (int k = 0; k < words.length; k++) {
     Pack.intToBigEndian(words[k], dest, destOffset + 4 * k);
   }
 }
 /**
  * Loads the four state words C0..C3 from {@code bytes} with {@code Pack.bigEndianToInt},
  * reading at four-byte steps starting at {@code off}.
  *
  * @param bytes source buffer
  * @param off index in {@code bytes} where the word for C0 begins
  */
 private void unpackBlock(byte[] bytes, int off) {
   int cursor = off;
   C0 = Pack.bigEndianToInt(bytes, cursor);
   cursor += 4;
   C1 = Pack.bigEndianToInt(bytes, cursor);
   cursor += 4;
   C2 = Pack.bigEndianToInt(bytes, cursor);
   cursor += 4;
   C3 = Pack.bigEndianToInt(bytes, cursor);
 }