// check and compare canEncoding/Encoding static char[] checkEncoding(Charset oldCS, Charset newCS) throws Exception { System.out.printf("Encoding <%s> <%s>...%n", oldCS.name(), newCS.name()); CharsetEncoder encOLD = oldCS.newEncoder(); CharsetEncoder encNew = newCS.newEncoder(); char[] cc = new char[0x10000]; int pos = 0; boolean is970 = "x-IBM970-Old".equals(oldCS.name()); for (char c = 0; c < 0xffff; c++) { boolean canOld = encOLD.canEncode(c); boolean canNew = encNew.canEncode(c); if (is970 && c == 0x2299) continue; if (canOld != canNew) { if (canNew) { System.out.printf(" NEW(only): "); printEntry(c, newCS); } else { if (is970) { byte[] bb = new String(new char[] {c}).getBytes(oldCS); if (bb.length == 2 && bb[0] == (byte) 0xa2 && bb[1] == (byte) 0xc1) { // we know 970 has bogus nnnn -> a2c1 -> 2299 continue; } } System.out.printf(" OLD(only): "); printEntry(c, oldCS); } } else if (canNew) { byte[] bbNew = new String(new char[] {c}).getBytes(newCS); byte[] bbOld = new String(new char[] {c}).getBytes(oldCS); if (!Arrays.equals(bbNew, bbOld)) { System.out.printf(" c->b NEW: "); printEntry(c, newCS); System.out.printf(" c->b OLD: "); printEntry(c, oldCS); } else { String sNew = new String(bbNew, newCS); String sOld = new String(bbOld, oldCS); if (!sNew.equals(sOld)) { System.out.printf(" b2c NEW (c=%x):", c & 0xffff); printEntry(sNew.charAt(0), newCS); System.out.printf(" b2c OLD:"); printEntry(sOld.charAt(0), oldCS); } } } if (canNew & canOld) { // added only both for now cc[pos++] = c; } } return Arrays.copyOf(cc, pos); }
public static void main(String args[]) throws Exception { String inputFile = "samplein.txt"; String outputFile = "sampleout.txt"; RandomAccessFile inf = new RandomAccessFile(inputFile, "r"); RandomAccessFile outf = new RandomAccessFile(outputFile, "rw"); long inputLength = new File(inputFile).length(); FileChannel inc = inf.getChannel(); FileChannel outc = outf.getChannel(); MappedByteBuffer inputData = inc.map(FileChannel.MapMode.READ_ONLY, 0, inputLength); Charset latin1 = Charset.forName("ISO-8859-1"); CharsetDecoder decoder = latin1.newDecoder(); CharsetEncoder encoder = latin1.newEncoder(); CharBuffer cb = decoder.decode(inputData); // Process char data here ByteBuffer outputData = encoder.encode(cb); outc.write(outputData); inf.close(); outf.close(); }
/** * Creates a string in a specfied character set. * * @param value String constant, must not be null * @param charsetName Name of the character set, may be null * @param collation Collation, may be null * @throws IllegalCharsetNameException If the given charset name is illegal * @throws UnsupportedCharsetException If no support for the named charset is available in this * instance of the Java virtual machine * @throws RuntimeException If the given value cannot be represented in the given charset */ public NlsString(String value, String charsetName, SqlCollation collation) { assert value != null; if (null != charsetName) { charsetName = charsetName.toUpperCase(); this.charsetName = charsetName; String javaCharsetName = SqlUtil.translateCharacterSetName(charsetName); if (javaCharsetName == null) { throw new UnsupportedCharsetException(charsetName); } this.charset = Charset.forName(javaCharsetName); CharsetEncoder encoder = charset.newEncoder(); // dry run to see if encoding hits any problems try { encoder.encode(CharBuffer.wrap(value)); } catch (CharacterCodingException ex) { throw RESOURCE.charsetEncoding(value, javaCharsetName).ex(); } } else { this.charsetName = null; this.charset = null; } this.collation = collation; this.value = value; }
static byte[] encode(char[] cc, Charset cs, boolean testDirect, Time t) throws Exception { ByteBuffer bbf; CharBuffer cbf; CharsetEncoder enc = cs.newEncoder(); String csn = cs.name(); if (testDirect) { bbf = ByteBuffer.allocateDirect(cc.length * 4); cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer(); cbf.put(cc).flip(); } else { bbf = ByteBuffer.allocate(cc.length * 4); cbf = CharBuffer.wrap(cc); } CoderResult cr = null; long t1 = System.nanoTime() / 1000; for (int i = 0; i < iteration; i++) { cbf.rewind(); bbf.clear(); enc.reset(); cr = enc.encode(cbf, bbf, true); } long t2 = System.nanoTime() / 1000; t.t = (t2 - t1) / iteration; if (cr != CoderResult.UNDERFLOW) { System.out.println("ENC-----------------"); int pos = cbf.position(); System.out.printf(" cr=%s, cbf.pos=%d, cc[pos]=%x%n", cr.toString(), pos, cc[pos] & 0xffff); throw new RuntimeException("Encoding err: " + csn); } byte[] bb = new byte[bbf.position()]; bbf.flip(); bbf.get(bb); return bb; }
static void testMixed(Charset cs) throws Throwable { CharsetDecoder dec = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); CharsetEncoder enc = cs.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); List<Integer> cps = new ArrayList<>(0x10000); int off = 0; int cp = 0; while (cp < 0x10000) { if (enc.canEncode((char) cp)) { cps.add(cp); } cp++; } Collections.shuffle(cps); char[] bmpCA = new char[cps.size()]; for (int i = 0; i < cps.size(); i++) bmpCA[i] = (char) (int) cps.get(i); String bmpStr = new String(bmpCA); // getBytes(csn); byte[] bmpBA = bmpStr.getBytes(cs.name()); ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA)); byte[] baNIO = new byte[bf.limit()]; bf.get(baNIO, 0, baNIO.length); if (!Arrays.equals(bmpBA, baNIO)) { throw new RuntimeException("getBytes(csn) failed -> " + cs.name()); } // getBytes(cs); bmpBA = bmpStr.getBytes(cs); if (!Arrays.equals(bmpBA, baNIO)) throw new RuntimeException("getBytes(cs) failed -> " + cs.name()); // new String(csn); String strSC = new String(bmpBA, cs.name()); String strNIO = dec.reset().decode(ByteBuffer.wrap(bmpBA)).toString(); if (!strNIO.equals(strSC)) { throw new RuntimeException("new String(csn) failed -> " + cs.name()); } // new String(cs); strSC = new String(bmpBA, cs); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(cs) failed -> " + cs.name()); }
static void checkInit(String csn) throws Exception { System.out.printf("Check init <%s>...%n", csn); Charset.forName("Big5"); // load in the ExtendedCharsets long t1 = System.nanoTime() / 1000; Charset cs = Charset.forName(csn); long t2 = System.nanoTime() / 1000; System.out.printf(" charset :%d%n", t2 - t1); t1 = System.nanoTime() / 1000; cs.newDecoder(); t2 = System.nanoTime() / 1000; System.out.printf(" new Decoder :%d%n", t2 - t1); t1 = System.nanoTime() / 1000; cs.newEncoder(); t2 = System.nanoTime() / 1000; System.out.printf(" new Encoder :%d%n", t2 - t1); }
public static void main(String[] args) throws Exception { // 创建简体中文对应的Charset Charset cn = Charset.forName("GBK"); // 获取cn对象对应的编码器和解码器 CharsetEncoder cnEncoder = cn.newEncoder(); CharsetDecoder cnDecoder = cn.newDecoder(); // 创建一个CharBuffer对象 CharBuffer cbuff = CharBuffer.allocate(8); cbuff.put('孙'); cbuff.put('悟'); cbuff.put('空'); cbuff.flip(); // 将CharBuffer中的字符序列转换成字节序列 ByteBuffer bbuff = cnEncoder.encode(cbuff); // 循环访问ByteBuffer中的每个字节 for (int i = 0; i < bbuff.capacity(); i++) { System.out.print(bbuff.get(i) + " "); } // 将ByteBuffer的数据解码成字符序列 System.out.println("\n" + cnDecoder.decode(bbuff)); }
static CoderResult encodeCR(char[] cc, Charset cs, boolean testDirect) throws Exception { ByteBuffer bbf; CharBuffer cbf; CharsetEncoder enc = cs.newEncoder(); if (testDirect) { bbf = ByteBuffer.allocateDirect(cc.length * 4); cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer(); cbf.put(cc).flip(); } else { bbf = ByteBuffer.allocate(cc.length * 4); cbf = CharBuffer.wrap(cc); } CoderResult cr = null; for (int i = 0; i < iteration; i++) { cbf.rewind(); bbf.clear(); enc.reset(); cr = enc.encode(cbf, bbf, true); } return cr; }
public static void main(String[] args) throws Throwable { final int itrs = Integer.getInteger("iterations", 100000); // final int itrs = Integer.getInteger("iterations", 12); final int size = Integer.getInteger("size", 2048); final int subsize = Integer.getInteger("subsize", 128); final int maxchar = Integer.getInteger("maxchar", 128); final String regex = System.getProperty("filter"); final Pattern filter = (regex == null) ? null : Pattern.compile(regex); final boolean useSecurityManager = Boolean.getBoolean("SecurityManager"); if (useSecurityManager) System.setSecurityManager(new PermissiveSecurityManger()); final Random rnd = new Random(); String[] csns = new String[] { "Big5", "Johab", "EUC_CN", "EUC_KR", "MS932", "MS936", "MS949", "MS950", "GBK", "Big5_HKSCS", "Big5_HKSCS_2001", "Big5_Solaris", "MS950_HKSCS", "MS950_HKSCS_XP", "IBM1364", "IBM1381", "IBM1383", "IBM930", "IBM933", "IBM935", "IBM937", "IBM939", "IBM942", "IBM943", "IBM948", "IBM949", "IBM950", "IBM970", }; ArrayList<long[]> sum = new ArrayList<>(); for (final String csn : csns) { final Charset cs = Charset.forName(csn); List<Integer> cps = new ArrayList<>(0x4000); int off = 0; int cp = 0; int n = 0; CharsetEncoder enc = cs.newEncoder(); while (cp < 0x10000 && n < cps.size()) { if (enc.canEncode((char) cp)) { cps.add(cp); n++; } cp++; } Collections.shuffle(cps); char[] ca = new char[cps.size()]; for (int i = 0; i < cps.size(); i++) ca[i] = (char) (int) cps.get(i); System.out.printf("%n--------%s---------%n", csn); for (int sz = 8; sz <= 2048; sz *= 2) { System.out.printf(" [len=%d]%n", sz); final char[] chars = Arrays.copyOf(ca, sz); final String str = new String(chars); final byte[] bs = str.getBytes(cs); Job[] jobs = { new Job("String decode: csn") { public void work() throws Throwable { for (int i = 0; i < itrs; i++) new String(bs, csn); } }, new Job("String decode: cs") { public void work() throws Throwable { for (int i = 0; i < itrs; i++) new String(bs, cs); } }, new Job("String encode: csn") { public void work() throws Throwable { for (int i = 0; i < itrs; i++) str.getBytes(csn); } }, new Job("String encode: cs") { public void work() throws Throwable { for (int i = 0; i < itrs; i++) str.getBytes(cs); } }, }; sum.add(time(jobs)); } } }
static void test(Charset cs, char[] bmpCA, byte[] sbBA) throws Throwable { String bmpStr = new String(bmpCA); CharsetDecoder dec = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); CharsetEncoder enc = cs.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); // getBytes(csn); byte[] baSC = bmpStr.getBytes(cs.name()); ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA)); byte[] baNIO = new byte[bf.limit()]; bf.get(baNIO, 0, baNIO.length); if (!Arrays.equals(baSC, baNIO)) throw new RuntimeException("getBytes(csn) failed -> " + cs.name()); // getBytes(cs); baSC = bmpStr.getBytes(cs); if (!Arrays.equals(baSC, baNIO)) throw new RuntimeException("getBytes(cs) failed -> " + cs.name()); // new String(csn); String strSC = new String(sbBA, cs.name()); String strNIO = dec.reset().decode(ByteBuffer.wrap(sbBA)).toString(); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(csn) failed -> " + cs.name()); // new String(cs); strSC = new String(sbBA, cs); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(cs) failed -> " + cs.name()); // encode unmappable surrogates if (enc instanceof sun.nio.cs.ArrayEncoder && cs.contains(Charset.forName("ASCII"))) { if (cs.name().equals("UTF-8") || // utf8 handles surrogates cs.name().equals("CESU-8")) // utf8 handles surrogates return; enc.replaceWith(new byte[] {(byte) 'A'}); sun.nio.cs.ArrayEncoder cae = (sun.nio.cs.ArrayEncoder) enc; String str = "ab\uD800\uDC00\uD800\uDC00cd"; byte[] ba = new byte[str.length() - 2]; int n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 6 || !"abAAcd".equals(new String(ba, cs.name()))) throw new RuntimeException("encode1(surrogates) failed -> " + cs.name()); ba = new byte[str.length()]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 6 || !"abAAcd".equals(new String(ba, 0, n, cs.name()))) throw new RuntimeException("encode2(surrogates) failed -> " + cs.name()); str = "ab\uD800B\uDC00Bcd"; ba = new byte[str.length()]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 8 || !"abABABcd".equals(new String(ba, 0, n, cs.name()))) throw new RuntimeException("encode3(surrogates) failed -> " + cs.name()); /* sun.nio.cs.ArrayDeEncoder works on the assumption that the invoker (StringCoder) allocates enough output buf, utf8 and double-byte coder does not check the output buffer limit. ba = new byte[str.length() - 1]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 7 || !"abABABc".equals(new String(ba, 0, n, cs.name()))) { throw new RuntimeException("encode4(surrogates) failed -> " + cs.name()); } */ } }