Example #1
0
  /**
   * Compares how two charsets encode every char in {@code [0, 0xffff)} and prints each
   * difference: chars only one of them can encode, chars whose encoded bytes differ, and chars
   * whose bytes match but decode differently.
   *
   * @param oldCS reference charset
   * @param newCS charset compared against the reference
   * @return the chars that both charsets report as encodable, in ascending order
   * @throws Exception declared for callers; propagates whatever the helpers throw
   */
  static char[] checkEncoding(Charset oldCS, Charset newCS) throws Exception {
    System.out.printf("Encoding <%s> <%s>...%n", oldCS.name(), newCS.name());
    final CharsetEncoder oldEncoder = oldCS.newEncoder();
    final CharsetEncoder newEncoder = newCS.newEncoder();
    final boolean is970 = "x-IBM970-Old".equals(oldCS.name());
    final char[] shared = new char[0x10000];
    int count = 0;

    for (int code = 0; code < 0xffff; code++) {
      final char c = (char) code;
      final boolean canOld = oldEncoder.canEncode(c);
      final boolean canNew = newEncoder.canEncode(c);

      // U+2299 is skipped entirely for the old 970 charset.
      if (is970 && c == 0x2299) {
        continue;
      }

      if (canOld && canNew) {
        final byte[] bbNew = new String(new char[] {c}).getBytes(newCS);
        final byte[] bbOld = new String(new char[] {c}).getBytes(oldCS);
        if (Arrays.equals(bbNew, bbOld)) {
          // Same bytes: make sure both charsets also decode them to the same text.
          final String sNew = new String(bbNew, newCS);
          final String sOld = new String(bbOld, oldCS);
          if (!sNew.equals(sOld)) {
            System.out.printf("      b2c NEW (c=%x):", c & 0xffff);
            printEntry(sNew.charAt(0), newCS);
            System.out.printf("      b2c OLD:");
            printEntry(sOld.charAt(0), oldCS);
          }
        } else {
          System.out.printf("      c->b NEW: ");
          printEntry(c, newCS);
          System.out.printf("      c->b OLD: ");
          printEntry(c, oldCS);
        }
        // added only both for now
        shared[count++] = c;
      } else if (canNew) {
        System.out.printf("      NEW(only): ");
        printEntry(c, newCS);
      } else if (canOld) {
        if (is970) {
          final byte[] bb = new String(new char[] {c}).getBytes(oldCS);
          // we know 970 has bogus nnnn -> a2c1 -> 2299
          if (bb.length == 2 && bb[0] == (byte) 0xa2 && bb[1] == (byte) 0xc1) {
            continue;
          }
        }
        System.out.printf("      OLD(only): ");
        printEntry(c, oldCS);
      }
    }
    return Arrays.copyOf(shared, count);
  }
Example #2
0
  /**
   * Runs the same char data through two charsets (buffer-based and "direct" variants), prints
   * the timing ratio of the second charset against the first for each phase, and reports any
   * phase whose outputs differ.
   *
   * @param cs1 baseline charset
   * @param cs2 charset measured against the baseline
   * @param cc chars to encode; the bytes produced by {@code cs1} are the decode input
   * @throws Exception if sleeping is interrupted or a helper fails
   */
  static void compare(Charset cs1, Charset cs2, char[] cc) throws Exception {
    // Two collections with a pause in between, to enqueue finalizable objects before timing.
    System.gc();
    Thread.sleep(1000);
    System.gc();

    final String name1 = cs1.name();
    final String name2 = cs2.name();
    System.out.printf("Diff     <%s> <%s>...%n", name1, name2);

    final Time timer1 = new Time();
    final Time timer2 = new Time();

    // --- encode ---
    byte[] bytes1 = encode(cc, cs1, false, timer1);
    byte[] bytes2 = encode(cc, cs2, false, timer2);
    System.out.printf(
        "    Encoding TimeRatio %s/%s: %d,%d :%f%n",
        name2, name1, timer2.t, timer1.t, (double) (timer2.t) / (timer1.t));
    if (!Arrays.equals(bytes1, bytes2)) {
      System.out.printf("        encoding failed%n");
    }

    // --- decode (both charsets decode the bytes produced by cs1) ---
    char[] chars2 = decode(bytes1, cs2, false, timer2);
    char[] chars1 = decode(bytes1, cs1, false, timer1);
    System.out.printf(
        "    Decoding TimeRatio %s/%s: %d,%d :%f%n",
        name2, name1, timer2.t, timer1.t, (double) (timer2.t) / (timer1.t));
    if (!Arrays.equals(chars1, chars2)) {
      System.out.printf("        decoding failed%n");
    }

    // --- encode, direct ---
    bytes1 = encode(cc, cs1, true, timer1);
    bytes2 = encode(cc, cs2, true, timer2);
    System.out.printf(
        "    Encoding(dir) TimeRatio %s/%s: %d,%d :%f%n",
        name2, name1, timer2.t, timer1.t, (double) (timer2.t) / (timer1.t));
    if (!Arrays.equals(bytes1, bytes2)) {
      System.out.printf("        encoding (direct) failed%n");
    }

    // --- decode, direct ---
    chars1 = decode(bytes1, cs1, true, timer1);
    chars2 = decode(bytes1, cs2, true, timer2);
    System.out.printf(
        "    Decoding(dir) TimeRatio %s/%s: %d,%d :%f%n",
        name2, name1, timer2.t, timer1.t, (double) (timer2.t) / (timer1.t));
    if (!Arrays.equals(chars1, chars2)) {
      System.out.printf("        decoding (direct) failed%n");
    }
  }
Example #3
0
 /**
  * Converts a token to its codepoints.
  *
  * @param token token
  * @return array holding one entry per character of the token
  */
 public static int[] cps(final byte[] token) {
   final int len = token.length;
   // At most one codepoint per byte; trimmed below if characters span several bytes.
   final int[] points = new int[len];
   int count = 0;
   int i = 0;
   while (i < len) {
     points[count++] = cp(token, i);
     i += cl(token, i);
   }
   if (count < len) return Arrays.copyOf(points, count);
   return points;
 }
Example #4
0
 /**
  * Verifies that each given byte sequence is reported as malformed by the charset's decoder,
  * with the expected malformed length, in both non-direct and direct mode.
  *
  * @param cs charset under test
  * @param malformed test vectors; element 0 of each row is the expected malformed length and
  *     the remaining bytes are the decoder input
  * @throws Exception if a helper fails; a {@link RuntimeException} if any vector fails the check
  */
 static void checkMalformed(Charset cs, byte[][] malformed) throws Exception {
   final String csn = cs.name();
   System.out.printf("Check malformed <%s>...%n", csn);

   boolean failed = false;
   final boolean[] modes = {false, true};
   for (boolean direct : modes) {
     for (byte[] entry : malformed) {
       final int expectedLen = entry[0];
       final byte[] input = Arrays.copyOfRange(entry, 1, entry.length);
       final CoderResult cr = decodeCR(input, cs, direct);

       // Space-separated hex dump of the input, used in the failure messages.
       final StringBuilder hex = new StringBuilder();
       for (byte b : input) {
         if (hex.length() > 0) {
           hex.append(" ");
         }
         hex.append(Integer.toString((int) b & 0xff, 16));
       }
       final String ashex = hex.toString();

       if (!cr.isMalformed()) {
         System.out.printf(
             "        FAIL(direct=%b): [%s] not malformed. -->cr=%s\n",
             direct, ashex, cr.toString());
         failed = true;
       } else if (cr.length() != expectedLen) {
         System.out.printf(
             "        FAIL(direct=%b): [%s] malformed[len=%d].\n", direct, ashex, cr.length());
         failed = true;
       }
     }
   }

   if (failed) {
     throw new RuntimeException("Check malformed failed " + csn);
   }
 }
  /**
   * Checks that the {@code String} encode/decode entry points agree with the NIO coders on a
   * shuffled string made of every BMP char the charset can encode.
   *
   * @param cs charset under test
   * @throws Throwable a {@link RuntimeException} naming the failing entry point on mismatch, or
   *     whatever the coders throw
   */
  static void testMixed(Charset cs) throws Throwable {
    final CharsetDecoder decoder =
        cs.newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    final CharsetEncoder encoder =
        cs.newEncoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);

    // Collect every encodable BMP char, then randomise the order.
    final List<Integer> encodable = new ArrayList<>(0x10000);
    for (int cp = 0; cp < 0x10000; cp++) {
      if (encoder.canEncode((char) cp)) {
        encodable.add(cp);
      }
    }
    Collections.shuffle(encodable);

    final char[] bmpCA = new char[encodable.size()];
    int idx = 0;
    for (int cp : encodable) {
      bmpCA[idx++] = (char) cp;
    }
    final String bmpStr = new String(bmpCA);

    // getBytes(csn);
    byte[] bmpBA = bmpStr.getBytes(cs.name());
    final ByteBuffer encoded = encoder.reset().encode(CharBuffer.wrap(bmpCA));
    final byte[] baNIO = new byte[encoded.limit()];
    encoded.get(baNIO, 0, baNIO.length);
    if (!Arrays.equals(bmpBA, baNIO)) {
      throw new RuntimeException("getBytes(csn) failed  -> " + cs.name());
    }

    // getBytes(cs);
    bmpBA = bmpStr.getBytes(cs);
    if (!Arrays.equals(bmpBA, baNIO)) {
      throw new RuntimeException("getBytes(cs) failed  -> " + cs.name());
    }

    // new String(csn);
    String strSC = new String(bmpBA, cs.name());
    final String strNIO = decoder.reset().decode(ByteBuffer.wrap(bmpBA)).toString();
    if (!strNIO.equals(strSC)) {
      throw new RuntimeException("new String(csn) failed  -> " + cs.name());
    }

    // new String(cs);
    strSC = new String(bmpBA, cs);
    if (!strNIO.equals(strSC)) {
      throw new RuntimeException("new String(cs) failed  -> " + cs.name());
    }
  }
Example #6
0
 /**
  * Shortens a token to at most {@code max} bytes, replacing the last (up to three) bytes of the
  * shortened copy with dots. A token that already fits is returned unchanged.
  *
  * @param token token to be chopped
  * @param max maximum length
  * @return the original token if it fits, otherwise a chopped copy
  */
 public static byte[] chop(final byte[] token, final int max) {
   if (token.length <= max) return token;

   final byte[] chopped = Arrays.copyOf(token, max);
   // Overwrite the tail with dots, stopping at the start of the array for tiny limits.
   for (int i = max - 1; i >= 0 && i >= max - 3; i--) {
     chopped[i] = '.';
   }
   return chopped;
 }
Example #7
0
 /**
  * Strips leading and trailing whitespace from a token. Bytes in the range {@code 0..' '} count
  * as whitespace; negative bytes do not.
  *
  * @param token token to be trimmed
  * @return the token itself if nothing was stripped, otherwise a trimmed copy (or the shared
  *     empty token)
  */
 public static byte[] trim(final byte[] token) {
   final int len = token.length;

   // First byte that is not whitespace (or len if there is none).
   int start = 0;
   while (start < len && token[start] >= 0 && token[start] <= ' ') start++;

   // Exclusive end: walk back over trailing whitespace, never past the byte at start.
   int end = len;
   while (end - 1 > start && token[end - 1] >= 0 && token[end - 1] <= ' ') end--;

   if (start == 0 && end == len) return token;
   if (start == end) return EMPTY;
   return Arrays.copyOfRange(token, start, end);
 }
  /**
   * Runs the string coding checks for every available charset (except a few that are skipped by
   * name), first without and then with a security manager installed.
   *
   * @param args unused
   * @throws Throwable if any check fails
   */
  public static void main(String[] args) throws Throwable {
    final boolean[] securityManagerModes = {false, true};
    for (boolean hasSM : securityManagerModes) {
      if (hasSM) {
        System.setSecurityManager(new PermissiveSecurityManger());
      }

      for (Charset cs : Charset.availableCharsets().values()) {
        final String name = cs.name();
        final boolean skipped =
            "ISO-2022-CN".equals(name)
                || "x-COMPOUND_TEXT".equals(name)
                || "x-JISAutoDetect".equals(name);
        if (skipped) {
          continue;
        }
        System.out.printf("Testing(sm=%b) " + cs.name() + "....", hasSM);

        // full bmp first
        final char[] bmpCA = new char[0x10000];
        for (int c = 0; c < bmpCA.length; c++) {
          bmpCA[c] = (char) c;
        }
        final byte[] sbBA = new byte[0x100];
        for (int b = 0; b < sbBA.length; b++) {
          sbBA[b] = (byte) b;
        }
        test(cs, bmpCA, sbBA);

        // "randomed" sizes
        final Random rnd = new Random();
        for (int round = 0; round < 10; round++) {
          final int clen = rnd.nextInt(0x10000);
          final int blen = rnd.nextInt(0x100);
          test(cs, Arrays.copyOf(bmpCA, clen), Arrays.copyOf(sbBA, blen));

          // add a pair of surrogates; they stay in bmpCA for the following rounds
          // NOTE(review): the guard compares against blen (byte length); clen may have been
          // intended - confirm before changing.
          final int pos = clen / 2;
          if ((pos + 1) < blen) {
            bmpCA[pos] = '\uD800';
            bmpCA[pos + 1] = '\uDC00';
          }
          test(cs, Arrays.copyOf(bmpCA, clen), Arrays.copyOf(sbBA, blen));
        }

        testMixed(cs);
        System.out.println("done!");
      }
    }
  }
Example #9
0
  /**
   * Returns a partial token. Unlike {@code substring}, the start and end offsets are adjusted in
   * steps of {@code cl(token, pos)} rather than being used as raw byte offsets.
   *
   * <p>If the clamped range covers the whole token, the token itself is returned (no copy). If
   * the clamped range is empty, the shared {@code EMPTY} token is returned.
   *
   * @param token input text
   * @param start start position (values below 0 are treated as 0)
   * @param end end position (values above the token length are treated as the token length)
   * @return resulting text
   */
  public static byte[] subtoken(final byte[] token, final int start, final int end) {
    // Clamp the requested range to the bounds of the token.
    int s = Math.max(0, start);
    final int e = Math.min(end, token.length);
    if (s == 0 && e == token.length) return token;
    if (s >= e) return EMPTY;

    // Begin scanning up to 4 bytes before s and step forward by cl(token, t)
    // (presumably the byte length of the character starting at t - TODO confirm),
    // so that s ends up on a position reached by those steps.
    int t = Math.max(0, s - 4);
    for (; t != s && t < e; t += cl(token, t)) {
      // NOTE(review): once t has stepped past s, s is moved to t on every further iteration
      // until t reaches e - confirm this is intended.
      if (t >= s) s = t;
    }
    // Keep stepping until t reaches or passes e; t becomes the exclusive end of the copy.
    for (; t < e; t += cl(token, t)) ;
    return Arrays.copyOfRange(token, s, t);
  }
Example #10
0
 /**
  * Normalizes the whitespace of a token: leading and trailing whitespace is dropped and every
  * inner run of whitespace is collapsed into a single space.
  *
  * @param token token
  * @return normalized token
  */
 public static byte[] norm(final byte[] token) {
   final int len = token.length;
   final byte[] out = new byte[len];
   int size = 0;
   // Starts as true so that leading whitespace is skipped.
   boolean prevWs = true;
   for (int i = 0; i < len; i++) {
     final byte b = token[i];
     final boolean curWs = ws(b);
     if (!(curWs && prevWs)) {
       if (curWs) {
         out[size++] = (byte) ' ';
       } else {
         out[size++] = b;
       }
       prevWs = curWs;
     }
   }
   // A trailing run of whitespace leaves one space at the end; drop it.
   if (size > 0 && ws(out[size - 1])) size--;
   if (size == len) return out;
   return Arrays.copyOf(out, size);
 }
  /**
   * Converts an object into a compact representation.
   *
   * <ul>
   *   <li>{@code null} stays {@code null};
   *   <li>enums are converted with {@code toString()};
   *   <li>strings, booleans and numbers are returned as is;
   *   <li>collections become an {@code Object[]} of compacted elements;
   *   <li>primitive arrays (including {@code char[]}) are rendered with {@link Arrays#toString};
   *   <li>object arrays are rendered as {@code "[e1, e2, ...]"} using the compacted elements;
   *   <li>any other object is replaced by {@code U.compact} of its class name.
   * </ul>
   *
   * @param obj Object to compact.
   * @return Compact representation, or {@code null} if {@code obj} is {@code null}.
   */
  @Nullable
  public static Object compactObject(Object obj) {
    if (obj == null) return null;

    if (obj instanceof Enum) return obj.toString();

    if (obj instanceof String || obj instanceof Boolean || obj instanceof Number) return obj;

    if (obj instanceof Collection) {
      Collection<?> col = (Collection<?>) obj;

      Object[] res = new Object[col.size()];

      int i = 0;

      for (Object elm : col) res[i++] = compactObject(elm);

      return res;
    }

    if (obj.getClass().isArray()) {
      // Primitive arrays are not Object[] and must be handled one type at a time.
      if (obj instanceof boolean[]) return Arrays.toString((boolean[]) obj);
      if (obj instanceof byte[]) return Arrays.toString((byte[]) obj);
      if (obj instanceof char[]) return Arrays.toString((char[]) obj);
      if (obj instanceof short[]) return Arrays.toString((short[]) obj);
      if (obj instanceof int[]) return Arrays.toString((int[]) obj);
      if (obj instanceof long[]) return Arrays.toString((long[]) obj);
      if (obj instanceof float[]) return Arrays.toString((float[]) obj);
      if (obj instanceof double[]) return Arrays.toString((double[]) obj);

      // All eight primitive array types are covered above, so this cast is safe.
      Object[] arr = (Object[]) obj;

      int iMax = arr.length - 1;

      StringBuilder sb = new StringBuilder("[");

      for (int i = 0; i <= iMax; i++) {
        sb.append(compactObject(arr[i]));

        if (i != iMax) sb.append(", ");
      }

      sb.append("]");

      return sb.toString();
    }

    return U.compact(obj.getClass().getName());
  }
  /**
   * Joins several int arrays into one new array, keeping the order of the arguments.
   *
   * @param arrays Arrays to join; at least two are expected.
   * @return New array containing the elements of all given arrays.
   */
  public static int[] concat(int[]... arrays) {
    assert arrays != null;
    assert arrays.length > 1;

    int total = 0;

    for (int[] part : arrays) total += part.length;

    // The first array is copied as part of allocating the result.
    int[] res = Arrays.copyOf(arrays[0], total);

    int off = arrays[0].length;

    for (int i = 1; i < arrays.length; i++) {
      int[] part = arrays[i];

      System.arraycopy(part, 0, res, off, part.length);

      off += part.length;
    }

    return res;
  }
  /**
   * Serializes a single named field, dispatching on the runtime type of the value.
   *
   * <p>Fields rejected by {@code dbOnlyField} and the {@code _transientFields} field are skipped.
   * A string-valued {@code $where} field is written as code. All other values first pass
   * through {@code Bytes.applyEncodingHooks} and are then written by the matching
   * {@code put*} method.
   *
   * @param name field name
   * @param val field value; may be {@code null}
   * @throws IllegalArgumentException if the value's type cannot be serialized
   */
  private void _putObjectField(String name, Object val) {

    if (dbOnlyField(name) || name.equals("_transientFields")) return;

    if (DEBUG) System.out.println("\t put thing : " + name);

    if (name.equals("$where") && val instanceof String) {
      _put(CODE, name);
      _putValueString(val.toString());
      return;
    }

    val = Bytes.applyEncodingHooks(val);

    if (val == null) putNull(name);
    else if (val instanceof Date) putDate(name, (Date) val);
    else if (val instanceof Number) putNumber(name, (Number) val);
    else if (val instanceof String) putString(name, val.toString());
    else if (val instanceof ObjectId) putObjectId(name, (ObjectId) val);
    else if (val instanceof DBObject) putObject(name, (DBObject) val);
    else if (val instanceof Boolean) putBoolean(name, (Boolean) val);
    else if (val instanceof Pattern) putPattern(name, (Pattern) val);
    else if (val instanceof DBRegex) {
      putDBRegex(name, (DBRegex) val);
    } else if (val instanceof Map) putMap(name, (Map) val);
    else if (val instanceof List) putList(name, (List) val);
    else if (val instanceof byte[]) putBinary(name, (byte[]) val);
    else if (val instanceof DBBinary) putBinary(name, (DBBinary) val);
    else if (val.getClass().isArray()) {
      // Primitive arrays (int[], long[], ...) are not Object[], so a plain cast would throw
      // ClassCastException. Reading elements reflectively boxes them and works for every
      // array type.
      final int len = java.lang.reflect.Array.getLength(val);
      final List<Object> elems = new java.util.ArrayList<Object>(len);
      for (int i = 0; i < len; i++) elems.add(java.lang.reflect.Array.get(val, i));
      putList(name, elems);
    } else if (val instanceof DBPointer) {

      // temporary - there's the notion of "special object" , but for simple level 0...
      DBPointer r = (DBPointer) val;
      putDBPointer(name, r._ns, (ObjectId) r._id);
    } else if (val instanceof DBRefBase) {
      putDBRef(name, (DBRefBase) val);
    } else if (val instanceof DBSymbol) {
      putSymbol(name, (DBSymbol) val);
    } else if (val instanceof DBUndefined) {
      putUndefined(name);
    } else if (val instanceof DBTimestamp) {
      putTimestamp(name, (DBTimestamp) val);
    } else throw new IllegalArgumentException("can't serialize " + val.getClass());
  }
  /**
   * Starts a command in a separate console window, using the platform's terminal launcher
   * ({@code cmd /c start}, the Terminal application via {@code osascript}, or {@code xterm}).
   * If no platform check matches, the arguments are started directly.
   *
   * @param workFolder Working directory for the process, or {@code null} to leave it unset.
   * @param args A string array containing the program and its arguments.
   * @return Started process.
   * @throws IOException If failed to start process.
   */
  public static Process openInConsole(@Nullable File workFolder, String... args)
      throws IOException {
    // The arguments are joined with spaces and embedded unescaped in the launcher command.
    final String cmdLine = F.concat(Arrays.asList(args), " ");

    String[] launch = args;

    // The checks are deliberately independent: a later match overrides an earlier one.
    if (U.isWindows()) {
      launch = F.asArray("cmd", "/c", String.format("start %s", cmdLine));
    }

    if (U.isMacOs()) {
      final String script =
          String.format("tell application \"Terminal\" to do script \"%s\"", cmdLine);

      launch = F.asArray("osascript", "-e", script);
    }

    if (U.isUnix()) {
      launch = F.asArray("xterm", "-sl", "1024", "-geometry", "200x50", "-e", cmdLine);
    }

    final ProcessBuilder builder = new ProcessBuilder(launch);

    if (workFolder != null) {
      builder.directory(workFolder);
    }

    return builder.start();
  }
  /**
   * Benchmarks {@code String} decoding and encoding, by charset name and by {@link Charset}
   * object, for a fixed list of double-byte charsets.
   *
   * <p>For each charset a shuffled sample of up to 0x4000 encodable BMP chars is collected; for
   * each length from 8 to 2048 (doubling) the four jobs are timed on the first {@code len}
   * chars of that sample.
   *
   * <p>System properties read: {@code iterations}, {@code size}, {@code subsize},
   * {@code maxchar}, {@code filter}, {@code SecurityManager}.
   *
   * @param args unused
   * @throws Throwable if a charset is unavailable or a job fails
   */
  public static void main(String[] args) throws Throwable {
    final int itrs = Integer.getInteger("iterations", 100000);
    // final int itrs = Integer.getInteger("iterations", 12);
    // NOTE(review): size, subsize, maxchar, filter and rnd are read but not used below.
    final int size = Integer.getInteger("size", 2048);
    final int subsize = Integer.getInteger("subsize", 128);
    final int maxchar = Integer.getInteger("maxchar", 128);
    final String regex = System.getProperty("filter");
    final Pattern filter = (regex == null) ? null : Pattern.compile(regex);
    final boolean useSecurityManager = Boolean.getBoolean("SecurityManager");
    if (useSecurityManager) System.setSecurityManager(new PermissiveSecurityManger());
    final Random rnd = new Random();

    String[] csns =
        new String[] {
          "Big5",
          "Johab",
          "EUC_CN",
          "EUC_KR",
          "MS932",
          "MS936",
          "MS949",
          "MS950",
          "GBK",
          "Big5_HKSCS",
          "Big5_HKSCS_2001",
          "Big5_Solaris",
          "MS950_HKSCS",
          "MS950_HKSCS_XP",
          "IBM1364",
          "IBM1381",
          "IBM1383",
          "IBM930",
          "IBM933",
          "IBM935",
          "IBM937",
          "IBM939",
          "IBM942",
          "IBM943",
          "IBM948",
          "IBM949",
          "IBM950",
          "IBM970",
        };

    List<long[]> sum = new ArrayList<>();

    // Upper bound on the number of encodable chars sampled per charset.
    final int maxSample = 0x4000;

    for (final String csn : csns) {
      final Charset cs = Charset.forName(csn);
      List<Integer> cps = new ArrayList<>(maxSample);
      CharsetEncoder enc = cs.newEncoder();
      // The bound must be the sample limit: comparing a counter against cps.size() (0 on
      // entry) would skip the loop entirely and leave the benchmark running on NUL chars.
      for (int cp = 0; cp < 0x10000 && cps.size() < maxSample; cp++) {
        if (enc.canEncode((char) cp)) {
          cps.add(cp);
        }
      }
      Collections.shuffle(cps);
      char[] ca = new char[cps.size()];
      for (int i = 0; i < cps.size(); i++) ca[i] = (char) (int) cps.get(i);

      System.out.printf("%n--------%s---------%n", csn);
      for (int sz = 8; sz <= 2048; sz *= 2) {
        System.out.printf("   [len=%d]%n", sz);

        // copyOf pads with NUL chars if the charset has fewer than sz encodable chars.
        final char[] chars = Arrays.copyOf(ca, sz);
        final String str = new String(chars);
        final byte[] bs = str.getBytes(cs);

        Job[] jobs = {
          new Job("String decode: csn") {
            public void work() throws Throwable {
              for (int i = 0; i < itrs; i++) new String(bs, csn);
            }
          },
          new Job("String decode: cs") {
            public void work() throws Throwable {
              for (int i = 0; i < itrs; i++) new String(bs, cs);
            }
          },
          new Job("String encode: csn") {
            public void work() throws Throwable {
              for (int i = 0; i < itrs; i++) str.getBytes(csn);
            }
          },
          new Job("String encode: cs") {
            public void work() throws Throwable {
              for (int i = 0; i < itrs; i++) str.getBytes(cs);
            }
          },
        };
        sum.add(time(jobs));
      }
    }
  }
Example #16
0
 /**
  * Returns the bytes of the token between two byte offsets. The offsets are used as raw byte
  * positions, so a multi-byte UTF8 character may be cut in half; use {@link #subtoken} when
  * that matters.
  *
  * @param token input token
  * @param start start position (values below 0 are treated as 0)
  * @param end end position (values above the token length are treated as the token length)
  * @return the token itself if the range covers it completely, otherwise a copy of the range
  */
 public static byte[] substring(final byte[] token, final int start, final int end) {
   final int len = token.length;
   final int from = start < 0 ? 0 : start;
   final int to = end > len ? len : end;
   if (from == 0 && to == len) return token;
   if (from >= to) return EMPTY;
   return Arrays.copyOfRange(token, from, to);
 }
  /**
   * Checks that the {@code String} encode/decode entry points agree with the NIO coders
   * (configured to replace malformed and unmappable input) and, for ASCII-containing charsets
   * whose encoder is an {@code ArrayEncoder}, that unmappable surrogates are replaced as
   * expected.
   *
   * @param cs charset under test
   * @param bmpCA chars to encode
   * @param sbBA bytes to decode
   * @throws Throwable a {@link RuntimeException} naming the failing check on mismatch, or
   *     whatever the coders throw
   */
  static void test(Charset cs, char[] bmpCA, byte[] sbBA) throws Throwable {
    final String csName = cs.name();
    final String bmpStr = new String(bmpCA);
    final CharsetDecoder decoder =
        cs.newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    final CharsetEncoder enc =
        cs.newEncoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);

    // getBytes(csn);
    byte[] baSC = bmpStr.getBytes(csName);
    final ByteBuffer encoded = enc.reset().encode(CharBuffer.wrap(bmpCA));
    final byte[] baNIO = new byte[encoded.limit()];
    encoded.get(baNIO, 0, baNIO.length);
    if (!Arrays.equals(baSC, baNIO)) {
      throw new RuntimeException("getBytes(csn) failed  -> " + csName);
    }

    // getBytes(cs);
    baSC = bmpStr.getBytes(cs);
    if (!Arrays.equals(baSC, baNIO)) {
      throw new RuntimeException("getBytes(cs) failed  -> " + csName);
    }

    // new String(csn);
    String strSC = new String(sbBA, csName);
    final String strNIO = decoder.reset().decode(ByteBuffer.wrap(sbBA)).toString();
    if (!strNIO.equals(strSC)) {
      throw new RuntimeException("new String(csn) failed  -> " + csName);
    }

    // new String(cs);
    strSC = new String(sbBA, cs);
    if (!strNIO.equals(strSC)) {
      throw new RuntimeException("new String(cs) failed  -> " + csName);
    }

    // encode unmappable surrogates
    if (!(enc instanceof sun.nio.cs.ArrayEncoder) || !cs.contains(Charset.forName("ASCII"))) {
      return;
    }
    // utf8 handles surrogates
    if (csName.equals("UTF-8") || csName.equals("CESU-8")) {
      return;
    }

    enc.replaceWith(new byte[] {(byte) 'A'});
    final sun.nio.cs.ArrayEncoder arrayEncoder = (sun.nio.cs.ArrayEncoder) enc;

    // Two valid surrogate pairs: each pair is replaced by a single 'A'.
    String str = "ab\uD800\uDC00\uD800\uDC00cd";
    byte[] ba = new byte[str.length() - 2];
    int n = arrayEncoder.encode(str.toCharArray(), 0, str.length(), ba);
    if (n != 6 || !"abAAcd".equals(new String(ba, csName))) {
      throw new RuntimeException("encode1(surrogates) failed  -> " + csName);
    }

    // Same input, larger output buffer.
    ba = new byte[str.length()];
    n = arrayEncoder.encode(str.toCharArray(), 0, str.length(), ba);
    if (n != 6 || !"abAAcd".equals(new String(ba, 0, n, csName))) {
      throw new RuntimeException("encode2(surrogates) failed  -> " + csName);
    }

    // Unpaired surrogates: each one is replaced on its own.
    str = "ab\uD800B\uDC00Bcd";
    ba = new byte[str.length()];
    n = arrayEncoder.encode(str.toCharArray(), 0, str.length(), ba);
    if (n != 8 || !"abABABcd".equals(new String(ba, 0, n, csName))) {
      throw new RuntimeException("encode3(surrogates) failed  -> " + csName);
    }
    /* sun.nio.cs.ArrayDeEncoder works on the assumption that the
       invoker (StringCoder) allocates enough output buf, utf8
       and double-byte coder does not check the output buffer limit.
    ba = new byte[str.length() - 1];
    n = cae.encode(str.toCharArray(), 0, str.length(), ba);
    if (n != 7 || !"abABABc".equals(new String(ba, 0, n, cs.name()))) {
        throw new RuntimeException("encode4(surrogates) failed  -> "
                                   + cs.name());
    }
    */
  }