// check and compare canEncoding/Encoding static char[] checkEncoding(Charset oldCS, Charset newCS) throws Exception { System.out.printf("Encoding <%s> <%s>...%n", oldCS.name(), newCS.name()); CharsetEncoder encOLD = oldCS.newEncoder(); CharsetEncoder encNew = newCS.newEncoder(); char[] cc = new char[0x10000]; int pos = 0; boolean is970 = "x-IBM970-Old".equals(oldCS.name()); for (char c = 0; c < 0xffff; c++) { boolean canOld = encOLD.canEncode(c); boolean canNew = encNew.canEncode(c); if (is970 && c == 0x2299) continue; if (canOld != canNew) { if (canNew) { System.out.printf(" NEW(only): "); printEntry(c, newCS); } else { if (is970) { byte[] bb = new String(new char[] {c}).getBytes(oldCS); if (bb.length == 2 && bb[0] == (byte) 0xa2 && bb[1] == (byte) 0xc1) { // we know 970 has bogus nnnn -> a2c1 -> 2299 continue; } } System.out.printf(" OLD(only): "); printEntry(c, oldCS); } } else if (canNew) { byte[] bbNew = new String(new char[] {c}).getBytes(newCS); byte[] bbOld = new String(new char[] {c}).getBytes(oldCS); if (!Arrays.equals(bbNew, bbOld)) { System.out.printf(" c->b NEW: "); printEntry(c, newCS); System.out.printf(" c->b OLD: "); printEntry(c, oldCS); } else { String sNew = new String(bbNew, newCS); String sOld = new String(bbOld, oldCS); if (!sNew.equals(sOld)) { System.out.printf(" b2c NEW (c=%x):", c & 0xffff); printEntry(sNew.charAt(0), newCS); System.out.printf(" b2c OLD:"); printEntry(sOld.charAt(0), oldCS); } } } if (canNew & canOld) { // added only both for now cc[pos++] = c; } } return Arrays.copyOf(cc, pos); }
static void compare(Charset cs1, Charset cs2, char[] cc) throws Exception { System.gc(); // enqueue finalizable objects Thread.sleep(1000); System.gc(); // enqueue finalizable objects String csn1 = cs1.name(); String csn2 = cs2.name(); System.out.printf("Diff <%s> <%s>...%n", csn1, csn2); Time t1 = new Time(); Time t2 = new Time(); byte[] bb1 = encode(cc, cs1, false, t1); byte[] bb2 = encode(cc, cs2, false, t2); System.out.printf( " Encoding TimeRatio %s/%s: %d,%d :%f%n", csn2, csn1, t2.t, t1.t, (double) (t2.t) / (t1.t)); if (!Arrays.equals(bb1, bb2)) { System.out.printf(" encoding failed%n"); } char[] cc2 = decode(bb1, cs2, false, t2); char[] cc1 = decode(bb1, cs1, false, t1); System.out.printf( " Decoding TimeRatio %s/%s: %d,%d :%f%n", csn2, csn1, t2.t, t1.t, (double) (t2.t) / (t1.t)); if (!Arrays.equals(cc1, cc2)) { System.out.printf(" decoding failed%n"); } bb1 = encode(cc, cs1, true, t1); bb2 = encode(cc, cs2, true, t2); System.out.printf( " Encoding(dir) TimeRatio %s/%s: %d,%d :%f%n", csn2, csn1, t2.t, t1.t, (double) (t2.t) / (t1.t)); if (!Arrays.equals(bb1, bb2)) System.out.printf(" encoding (direct) failed%n"); cc1 = decode(bb1, cs1, true, t1); cc2 = decode(bb1, cs2, true, t2); System.out.printf( " Decoding(dir) TimeRatio %s/%s: %d,%d :%f%n", csn2, csn1, t2.t, t1.t, (double) (t2.t) / (t1.t)); if (!Arrays.equals(cc1, cc2)) { System.out.printf(" decoding (direct) failed%n"); } }
/**
 * Converts a token to a sequence of codepoints.
 *
 * <p>The token is walked in steps of {@code cl(token, offset)}; each step contributes the
 * value of {@code cp(token, offset)}.
 *
 * @param token token
 * @return codepoints; trimmed to the number of values actually read
 */
public static int[] cps(final byte[] token) {
  final int bytes = token.length;
  final int[] points = new int[bytes];
  int count = 0;
  int offset = 0;
  while (offset < bytes) {
    points[count++] = cp(token, offset);
    offset += cl(token, offset);
  }
  if (count < bytes) {
    return Arrays.copyOf(points, count);
  }
  return points;
}
/**
 * Decodes each malformed sample with the given charset, in both non-direct and direct mode,
 * and verifies that the decoder reports a malformed-input result of the expected length.
 *
 * @param cs charset whose decoder is checked
 * @param malformed samples; the first byte of each entry is the expected malformed length,
 *     the remaining bytes are the input handed to the decoder
 * @throws Exception if decoding fails unexpectedly
 * @throws RuntimeException if any sample is not reported as malformed with the expected length
 */
static void checkMalformed(Charset cs, byte[][] malformed) throws Exception {
  final String csn = cs.name();
  System.out.printf("Check malformed <%s>...%n", csn);

  boolean failed = false;
  final boolean[] modes = {false, true};
  for (boolean direct : modes) {
    for (byte[] sample : malformed) {
      final int expectedLen = sample[0];
      final byte[] input = Arrays.copyOfRange(sample, 1, sample.length);
      final CoderResult cr = decodeCR(input, cs, direct);

      // Space-separated hex dump of the input, used in failure messages.
      final StringBuilder hex = new StringBuilder();
      for (int i = 0; i < input.length; i++) {
        if (i > 0) {
          hex.append(" ");
        }
        hex.append(Integer.toString((int) input[i] & 0xff, 16));
      }
      final String ashex = hex.toString();

      if (!cr.isMalformed()) {
        System.out.printf(
            " FAIL(direct=%b): [%s] not malformed. -->cr=%s\n", direct, ashex, cr.toString());
        failed = true;
      } else if (cr.length() != expectedLen) {
        System.out.printf(
            " FAIL(direct=%b): [%s] malformed[len=%d].\n", direct, ashex, cr.length());
        failed = true;
      }
    }
  }
  if (failed) {
    throw new RuntimeException("Check malformed failed " + csn);
  }
}
static void testMixed(Charset cs) throws Throwable { CharsetDecoder dec = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); CharsetEncoder enc = cs.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); List<Integer> cps = new ArrayList<>(0x10000); int off = 0; int cp = 0; while (cp < 0x10000) { if (enc.canEncode((char) cp)) { cps.add(cp); } cp++; } Collections.shuffle(cps); char[] bmpCA = new char[cps.size()]; for (int i = 0; i < cps.size(); i++) bmpCA[i] = (char) (int) cps.get(i); String bmpStr = new String(bmpCA); // getBytes(csn); byte[] bmpBA = bmpStr.getBytes(cs.name()); ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA)); byte[] baNIO = new byte[bf.limit()]; bf.get(baNIO, 0, baNIO.length); if (!Arrays.equals(bmpBA, baNIO)) { throw new RuntimeException("getBytes(csn) failed -> " + cs.name()); } // getBytes(cs); bmpBA = bmpStr.getBytes(cs); if (!Arrays.equals(bmpBA, baNIO)) throw new RuntimeException("getBytes(cs) failed -> " + cs.name()); // new String(csn); String strSC = new String(bmpBA, cs.name()); String strNIO = dec.reset().decode(ByteBuffer.wrap(bmpBA)).toString(); if (!strNIO.equals(strSC)) { throw new RuntimeException("new String(csn) failed -> " + cs.name()); } // new String(cs); strSC = new String(bmpBA, cs); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(cs) failed -> " + cs.name()); }
/**
 * Chops a token to the specified length and adds dots.
 *
 * <p>A token that already fits is returned as is (same array). Otherwise a copy of the first
 * {@code max} bytes is returned whose last (up to three) bytes are replaced by {@code '.'}.
 *
 * @param token token to be chopped
 * @param max maximum length
 * @return chopped token
 */
public static byte[] chop(final byte[] token, final int max) {
  if (max >= token.length) {
    return token;
  }
  final byte[] chopped = Arrays.copyOf(token, max);
  // Overwrite the trailing bytes (at most three) with dots.
  for (int i = Math.max(0, max - 3); i < max; i++) {
    chopped[i] = '.';
  }
  return chopped;
}
/**
 * Removes leading and trailing whitespaces from the specified token.
 *
 * <p>A byte counts as whitespace when its value lies in {@code 0..' '}; negative bytes are
 * never treated as whitespace. If nothing has to be removed, the given array itself is
 * returned.
 *
 * @param token token to be trimmed
 * @return trimmed token
 */
public static byte[] trim(final byte[] token) {
  final int len = token.length;

  // First byte that is not whitespace (len if there is none).
  int from = 0;
  while (from < len && token[from] >= 0 && token[from] <= ' ') {
    from++;
  }

  // Exclusive end: one past the last non-whitespace byte after 'from'.
  int to = len;
  while (to - 1 > from && token[to - 1] >= 0 && token[to - 1] <= ' ') {
    to--;
  }

  if (from == 0 && to == len) {
    return token;
  }
  if (from == to) {
    return EMPTY;
  }
  return Arrays.copyOfRange(token, from, to);
}
public static void main(String[] args) throws Throwable { for (Boolean hasSM : new boolean[] {false, true}) { if (hasSM) System.setSecurityManager(new PermissiveSecurityManger()); for (Charset cs : Charset.availableCharsets().values()) { if ("ISO-2022-CN".equals(cs.name()) || "x-COMPOUND_TEXT".equals(cs.name()) || "x-JISAutoDetect".equals(cs.name())) continue; System.out.printf("Testing(sm=%b) " + cs.name() + "....", hasSM); // full bmp first char[] bmpCA = new char[0x10000]; for (int i = 0; i < 0x10000; i++) { bmpCA[i] = (char) i; } byte[] sbBA = new byte[0x100]; for (int i = 0; i < 0x100; i++) { sbBA[i] = (byte) i; } test(cs, bmpCA, sbBA); // "randomed" sizes Random rnd = new Random(); for (int i = 0; i < 10; i++) { int clen = rnd.nextInt(0x10000); int blen = rnd.nextInt(0x100); // System.out.printf(" blen=%d, clen=%d%n", blen, clen); test(cs, Arrays.copyOf(bmpCA, clen), Arrays.copyOf(sbBA, blen)); // add a pair of surrogates int pos = clen / 2; if ((pos + 1) < blen) { bmpCA[pos] = '\uD800'; bmpCA[pos + 1] = '\uDC00'; } test(cs, Arrays.copyOf(bmpCA, clen), Arrays.copyOf(sbBA, blen)); } testMixed(cs); System.out.println("done!"); } } }
/**
 * Returns a partial token.
 *
 * <p>The positions are clamped to the token bounds. Starting up to four bytes before the
 * start position, the token is walked in steps of {@code cl(token, pos)} (presumably the byte
 * length of the character at {@code pos}; confirm against {@code cl}), and both the start and
 * the end of the copied range are taken from positions visited by that walk.
 *
 * @param token input text
 * @param start start position
 * @param end end position
 * @return resulting text
 */
public static byte[] subtoken(final byte[] token, final int start, final int end) {
  final int len = token.length;
  int from = start < 0 ? 0 : start;
  final int to = end < len ? end : len;

  if (from == 0 && to == len) {
    return token;
  }
  if (from >= to) {
    return EMPTY;
  }

  // Walk forward from a few bytes before the start; whenever the walk has reached or passed
  // the start, the start is moved to the current walk position.
  int pos = from < 4 ? 0 : from - 4;
  while (pos != from && pos < to) {
    if (pos >= from) {
      from = pos;
    }
    pos += cl(token, pos);
  }

  // Continue walking until the end position is reached or passed.
  while (pos < to) {
    pos += cl(token, pos);
  }
  return Arrays.copyOfRange(token, from, pos);
}
/**
 * Normalizes all whitespace occurrences from the specified token.
 *
 * <p>Leading whitespace is dropped, every run of whitespace bytes (as decided by
 * {@code ws}) is collapsed into a single space, and a trailing whitespace is removed.
 *
 * @param token token
 * @return normalized token
 */
public static byte[] norm(final byte[] token) {
  final int size = token.length;
  final byte[] out = new byte[size];
  int n = 0;
  // Starts as true so that whitespace at the very beginning is skipped.
  boolean prevWs = true;
  for (int i = 0; i < size; i++) {
    final byte b = token[i];
    if (ws(b)) {
      if (!prevWs) {
        out[n++] = ' ';
        prevWs = true;
      }
    } else {
      out[n++] = b;
      prevWs = false;
    }
  }
  // Drop a trailing whitespace, if any.
  if (n > 0 && ws(out[n - 1])) {
    --n;
  }
  if (n == size) {
    return out;
  }
  return Arrays.copyOf(out, n);
}
/**
 * Returns a compact representation of the given object.
 *
 * <ul>
 *   <li>{@code null} stays {@code null};
 *   <li>enums are converted with {@code toString()};
 *   <li>strings, booleans and numbers are returned unchanged;
 *   <li>collections become an {@code Object[]} of compacted elements;
 *   <li>primitive arrays become their {@link Arrays#toString} text;
 *   <li>object arrays become a {@code "[a, b, ...]"} string of compacted elements;
 *   <li>anything else becomes {@code U.compact(className)}.
 * </ul>
 *
 * @param obj Object to compact.
 * @return Compacted value (a string, the object itself, or an array), or {@code null}.
 */
@Nullable public static Object compactObject(Object obj) {
  if (obj == null) {
    return null;
  }

  if (obj instanceof Enum) {
    return obj.toString();
  }

  if (obj instanceof String || obj instanceof Boolean || obj instanceof Number) {
    return obj;
  }

  if (obj instanceof Collection) {
    Collection<?> col = (Collection<?>) obj;

    Object[] res = new Object[col.size()];

    int i = 0;

    for (Object elm : col) {
      res[i++] = compactObject(elm);
    }

    return res;
  }

  if (obj.getClass().isArray()) {
    // All eight primitive array types are handled explicitly. char[] used to be missing and
    // fell through to the Object[] cast below, which failed with a ClassCastException.
    if (obj instanceof boolean[]) {
      return Arrays.toString((boolean[]) obj);
    }
    if (obj instanceof byte[]) {
      return Arrays.toString((byte[]) obj);
    }
    if (obj instanceof char[]) {
      return Arrays.toString((char[]) obj);
    }
    if (obj instanceof short[]) {
      return Arrays.toString((short[]) obj);
    }
    if (obj instanceof int[]) {
      return Arrays.toString((int[]) obj);
    }
    if (obj instanceof long[]) {
      return Arrays.toString((long[]) obj);
    }
    if (obj instanceof float[]) {
      return Arrays.toString((float[]) obj);
    }
    if (obj instanceof double[]) {
      return Arrays.toString((double[]) obj);
    }

    // Every remaining array type has a reference component type, so the cast is safe.
    Object[] arr = (Object[]) obj;

    int iMax = arr.length - 1;

    StringBuilder sb = new StringBuilder("[");

    for (int i = 0; i <= iMax; i++) {
      sb.append(compactObject(arr[i]));

      if (i != iMax) {
        sb.append(", ");
      }
    }

    sb.append("]");

    return sb.toString();
  }

  return U.compact(obj.getClass().getName());
}
/**
 * Concatenates the given arrays, in order, into one new array.
 *
 * @param arrays Arrays (at least two are expected; checked by assertion only).
 * @return Summary array.
 */
public static int[] concat(int[]... arrays) {
  assert arrays != null;
  assert arrays.length > 1;

  int total = 0;

  for (int[] part : arrays) {
    total += part.length;
  }

  // Start from a widened copy of the first array and append the others behind it.
  int[] first = arrays[0];
  int[] merged = Arrays.copyOf(first, total);

  int offset = first.length;

  for (int idx = 1; idx < arrays.length; idx++) {
    int[] part = arrays[idx];

    System.arraycopy(part, 0, merged, offset, part.length);

    offset += part.length;
  }

  return merged;
}
private void _putObjectField(String name, Object val) { if (dbOnlyField(name) || name.equals("_transientFields")) return; if (DEBUG) System.out.println("\t put thing : " + name); if (name.equals("$where") && val instanceof String) { _put(CODE, name); _putValueString(val.toString()); return; } val = Bytes.applyEncodingHooks(val); if (val == null) putNull(name); else if (val instanceof Date) putDate(name, (Date) val); else if (val instanceof Number) putNumber(name, (Number) val); else if (val instanceof String) putString(name, val.toString()); else if (val instanceof ObjectId) putObjectId(name, (ObjectId) val); else if (val instanceof DBObject) putObject(name, (DBObject) val); else if (val instanceof Boolean) putBoolean(name, (Boolean) val); else if (val instanceof Pattern) putPattern(name, (Pattern) val); else if (val instanceof DBRegex) { putDBRegex(name, (DBRegex) val); } else if (val instanceof Map) putMap(name, (Map) val); else if (val instanceof List) putList(name, (List) val); else if (val instanceof byte[]) putBinary(name, (byte[]) val); else if (val instanceof DBBinary) putBinary(name, (DBBinary) val); else if (val.getClass().isArray()) putList(name, Arrays.asList((Object[]) val)); else if (val instanceof DBPointer) { // temporary - there's the notion of "special object" , but for simple level 0... DBPointer r = (DBPointer) val; putDBPointer(name, r._ns, (ObjectId) r._id); } else if (val instanceof DBRefBase) { putDBRef(name, (DBRefBase) val); } else if (val instanceof DBSymbol) { putSymbol(name, (DBSymbol) val); } else if (val instanceof DBUndefined) { putUndefined(name); } else if (val instanceof DBTimestamp) { putTimestamp(name, (DBTimestamp) val); } else throw new IllegalArgumentException("can't serialize " + val.getClass()); }
/**
 * Run command in separated console.
 *
 * <p>The arguments are joined with spaces into one command line, which is then wrapped for
 * the detected platform: {@code cmd /c start} on Windows, an {@code osascript} call driving
 * Terminal on macOS, and {@code xterm -e} on Unix. The platform checks are evaluated in that
 * order and independently of each other, so a later match replaces an earlier one. If none
 * matches, the arguments are executed unchanged.
 *
 * @param workFolder Work folder for command.
 * @param args A string array containing the program and its arguments.
 * @return Started process.
 * @throws IOException If failed to start process.
 */
public static Process openInConsole(@Nullable File workFolder, String... args)
    throws IOException {
  String cmdLine = F.concat(Arrays.asList(args), " ");

  String[] launch = args;

  if (U.isWindows()) {
    launch = F.asArray("cmd", "/c", String.format("start %s", cmdLine));
  }

  if (U.isMacOs()) {
    String script =
        String.format("tell application \"Terminal\" to do script \"%s\"", cmdLine);

    launch = F.asArray("osascript", "-e", script);
  }

  if (U.isUnix()) {
    launch = F.asArray("xterm", "-sl", "1024", "-geometry", "200x50", "-e", cmdLine);
  }

  ProcessBuilder builder = new ProcessBuilder(launch);

  if (workFolder != null) {
    builder.directory(workFolder);
  }

  return builder.start();
}
public static void main(String[] args) throws Throwable { final int itrs = Integer.getInteger("iterations", 100000); // final int itrs = Integer.getInteger("iterations", 12); final int size = Integer.getInteger("size", 2048); final int subsize = Integer.getInteger("subsize", 128); final int maxchar = Integer.getInteger("maxchar", 128); final String regex = System.getProperty("filter"); final Pattern filter = (regex == null) ? null : Pattern.compile(regex); final boolean useSecurityManager = Boolean.getBoolean("SecurityManager"); if (useSecurityManager) System.setSecurityManager(new PermissiveSecurityManger()); final Random rnd = new Random(); String[] csns = new String[] { "Big5", "Johab", "EUC_CN", "EUC_KR", "MS932", "MS936", "MS949", "MS950", "GBK", "Big5_HKSCS", "Big5_HKSCS_2001", "Big5_Solaris", "MS950_HKSCS", "MS950_HKSCS_XP", "IBM1364", "IBM1381", "IBM1383", "IBM930", "IBM933", "IBM935", "IBM937", "IBM939", "IBM942", "IBM943", "IBM948", "IBM949", "IBM950", "IBM970", }; ArrayList<long[]> sum = new ArrayList<>(); for (final String csn : csns) { final Charset cs = Charset.forName(csn); List<Integer> cps = new ArrayList<>(0x4000); int off = 0; int cp = 0; int n = 0; CharsetEncoder enc = cs.newEncoder(); while (cp < 0x10000 && n < cps.size()) { if (enc.canEncode((char) cp)) { cps.add(cp); n++; } cp++; } Collections.shuffle(cps); char[] ca = new char[cps.size()]; for (int i = 0; i < cps.size(); i++) ca[i] = (char) (int) cps.get(i); System.out.printf("%n--------%s---------%n", csn); for (int sz = 8; sz <= 2048; sz *= 2) { System.out.printf(" [len=%d]%n", sz); final char[] chars = Arrays.copyOf(ca, sz); final String str = new String(chars); final byte[] bs = str.getBytes(cs); Job[] jobs = { new Job("String decode: csn") { public void work() throws Throwable { for (int i = 0; i < itrs; i++) new String(bs, csn); } }, new Job("String decode: cs") { public void work() throws Throwable { for (int i = 0; i < itrs; i++) new String(bs, cs); } }, new Job("String encode: csn") 
{ public void work() throws Throwable { for (int i = 0; i < itrs; i++) str.getBytes(csn); } }, new Job("String encode: cs") { public void work() throws Throwable { for (int i = 0; i < itrs; i++) str.getBytes(cs); } }, }; sum.add(time(jobs)); } } }
/**
 * Returns a substring of the specified token. Note that this method does not correctly split
 * UTF8 character; use {@link #subtoken} instead.
 *
 * <p>Both positions are clamped to the token bounds. If the clamped range covers the whole
 * token, the token itself is returned; an empty or inverted range yields the empty token.
 *
 * @param token input token
 * @param start start position
 * @param end end position
 * @return substring
 */
public static byte[] substring(final byte[] token, final int start, final int end) {
  final int len = token.length;
  final int from = start < 0 ? 0 : start;
  final int to = end < len ? end : len;

  if (from == 0 && to == len) {
    return token;
  }
  if (from < to) {
    return Arrays.copyOfRange(token, from, to);
  }
  return EMPTY;
}
static void test(Charset cs, char[] bmpCA, byte[] sbBA) throws Throwable { String bmpStr = new String(bmpCA); CharsetDecoder dec = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); CharsetEncoder enc = cs.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); // getBytes(csn); byte[] baSC = bmpStr.getBytes(cs.name()); ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA)); byte[] baNIO = new byte[bf.limit()]; bf.get(baNIO, 0, baNIO.length); if (!Arrays.equals(baSC, baNIO)) throw new RuntimeException("getBytes(csn) failed -> " + cs.name()); // getBytes(cs); baSC = bmpStr.getBytes(cs); if (!Arrays.equals(baSC, baNIO)) throw new RuntimeException("getBytes(cs) failed -> " + cs.name()); // new String(csn); String strSC = new String(sbBA, cs.name()); String strNIO = dec.reset().decode(ByteBuffer.wrap(sbBA)).toString(); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(csn) failed -> " + cs.name()); // new String(cs); strSC = new String(sbBA, cs); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(cs) failed -> " + cs.name()); // encode unmappable surrogates if (enc instanceof sun.nio.cs.ArrayEncoder && cs.contains(Charset.forName("ASCII"))) { if (cs.name().equals("UTF-8") || // utf8 handles surrogates cs.name().equals("CESU-8")) // utf8 handles surrogates return; enc.replaceWith(new byte[] {(byte) 'A'}); sun.nio.cs.ArrayEncoder cae = (sun.nio.cs.ArrayEncoder) enc; String str = "ab\uD800\uDC00\uD800\uDC00cd"; byte[] ba = new byte[str.length() - 2]; int n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 6 || !"abAAcd".equals(new String(ba, cs.name()))) throw new RuntimeException("encode1(surrogates) failed -> " + cs.name()); ba = new byte[str.length()]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 6 || !"abAAcd".equals(new String(ba, 0, n, cs.name()))) throw new 
RuntimeException("encode2(surrogates) failed -> " + cs.name()); str = "ab\uD800B\uDC00Bcd"; ba = new byte[str.length()]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 8 || !"abABABcd".equals(new String(ba, 0, n, cs.name()))) throw new RuntimeException("encode3(surrogates) failed -> " + cs.name()); /* sun.nio.cs.ArrayDeEncoder works on the assumption that the invoker (StringCoder) allocates enough output buf, utf8 and double-byte coder does not check the output buffer limit. ba = new byte[str.length() - 1]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 7 || !"abABABc".equals(new String(ba, 0, n, cs.name()))) { throw new RuntimeException("encode4(surrogates) failed -> " + cs.name()); } */ } }