public static void main(String args[]) throws Exception { String inputFile = "samplein.txt"; String outputFile = "sampleout.txt"; RandomAccessFile inf = new RandomAccessFile(inputFile, "r"); RandomAccessFile outf = new RandomAccessFile(outputFile, "rw"); long inputLength = new File(inputFile).length(); FileChannel inc = inf.getChannel(); FileChannel outc = outf.getChannel(); MappedByteBuffer inputData = inc.map(FileChannel.MapMode.READ_ONLY, 0, inputLength); Charset latin1 = Charset.forName("ISO-8859-1"); CharsetDecoder decoder = latin1.newDecoder(); CharsetEncoder encoder = latin1.newEncoder(); CharBuffer cb = decoder.decode(inputData); // Process char data here ByteBuffer outputData = encoder.encode(cb); outc.write(outputData); inf.close(); outf.close(); }
/**
 * Creates a string in a specified character set.
 *
 * <p>When a charset name is given it is upper-cased, translated to a Java charset
 * name, and the value is test-encoded once to verify that it can be represented.
 *
 * @param value String constant, must not be null
 * @param charsetName Name of the character set, may be null
 * @param collation Collation, may be null
 * @throws IllegalCharsetNameException If the given charset name is illegal
 * @throws UnsupportedCharsetException If no support for the named charset is available in this
 *     instance of the Java virtual machine
 * @throws RuntimeException If the given value cannot be represented in the given charset
 */
public NlsString(String value, String charsetName, SqlCollation collation) {
    assert value != null;
    if (null != charsetName) {
        // Locale.ROOT keeps the result independent of the default locale
        // (e.g. a Turkish locale would otherwise map 'i' to a dotted capital I).
        charsetName = charsetName.toUpperCase(java.util.Locale.ROOT);
        this.charsetName = charsetName;
        String javaCharsetName = SqlUtil.translateCharacterSetName(charsetName);
        if (javaCharsetName == null) {
            throw new UnsupportedCharsetException(charsetName);
        }
        this.charset = Charset.forName(javaCharsetName);
        CharsetEncoder encoder = charset.newEncoder();
        // dry run to see if encoding hits any problems
        try {
            encoder.encode(CharBuffer.wrap(value));
        } catch (CharacterCodingException ex) {
            throw RESOURCE.charsetEncoding(value, javaCharsetName).ex();
        }
    } else {
        this.charsetName = null;
        this.charset = null;
    }
    this.collation = collation;
    this.value = value;
}
/**
 * Encodes {@code cc} with a fresh encoder of {@code cs}, repeating the encode
 * {@code iteration} times, and stores the average time per pass (microseconds,
 * derived from {@code System.nanoTime() / 1000}) in {@code t.t}.
 *
 * @param cc characters to encode
 * @param cs charset supplying the encoder
 * @param testDirect when true, direct buffers are used for both input and output;
 *     otherwise a heap output buffer and a wrapper around {@code cc}
 * @param t receives the measured average in its {@code t} field
 * @return the bytes produced by the last pass
 * @throws RuntimeException if the last pass did not end with {@code UNDERFLOW}
 */
static byte[] encode(char[] cc, Charset cs, boolean testDirect, Time t) throws Exception {
    final CharsetEncoder encoder = cs.newEncoder();
    final String charsetName = cs.name();

    final ByteBuffer out;
    final CharBuffer in;
    if (!testDirect) {
        out = ByteBuffer.allocate(cc.length * 4);
        in = CharBuffer.wrap(cc);
    } else {
        out = ByteBuffer.allocateDirect(cc.length * 4);
        in = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
        in.put(cc).flip();
    }

    CoderResult result = null;
    final long startMicros = System.nanoTime() / 1000;
    for (int round = 0; round < iteration; round++) {
        // Every pass starts from pristine buffers and encoder state.
        in.rewind();
        out.clear();
        encoder.reset();
        result = encoder.encode(in, out, true);
    }
    final long endMicros = System.nanoTime() / 1000;
    t.t = (endMicros - startMicros) / iteration;

    if (result != CoderResult.UNDERFLOW) {
        System.out.println("ENC-----------------");
        int failedAt = in.position();
        System.out.printf(
                " cr=%s, cbf.pos=%d, cc[pos]=%x%n",
                result.toString(), failedAt, cc[failedAt] & 0xffff);
        throw new RuntimeException("Encoding err: " + charsetName);
    }

    // Copy out exactly the bytes written by the final pass.
    out.flip();
    byte[] encoded = new byte[out.remaining()];
    out.get(encoded);
    return encoded;
}
/** * Write String into byte array * * <p>It will remove a trailing null terminator if exists if the option * RemoveTrailingTerminatorOnWrite has been set. * * @return the data as a byte array in format to write to file */ public byte[] writeByteArray() { byte[] data; // Try and write to buffer using the CharSet defined by getTextEncodingCharSet() String charSetName = getTextEncodingCharSet(); try { stripTrailingNull(); // Special Handling because there is no UTF16 BOM LE charset String stringValue = (String) value; String actualCharSet = null; if (charSetName.equals(TextEncoding.CHARSET_UTF_16)) { if (TagOptionSingleton.getInstance().isEncodeUTF16BomAsLittleEndian()) { actualCharSet = TextEncoding.CHARSET_UTF_16_LE_ENCODING_FORMAT; } else { actualCharSet = TextEncoding.CHARSET_UTF_16_BE_ENCODING_FORMAT; } } // Ensure large enough for any encoding ByteBuffer outputBuffer = ByteBuffer.allocate((stringValue.length() + 3) * 3); // Ensure each string (if multiple values) is written with BOM by writing separately List<String> values = splitByNullSeperator(stringValue); checkTrailingNull(values, stringValue); // For each value for (int i = 0; i < values.size(); i++) { String next = values.get(i); if (actualCharSet != null) { if (actualCharSet.equals(TextEncoding.CHARSET_UTF_16_LE_ENCODING_FORMAT)) { outputBuffer.put(writeStringUTF16LEBOM(next, i, values.size())); } else if (actualCharSet.equals(TextEncoding.CHARSET_UTF_16_BE_ENCODING_FORMAT)) { outputBuffer.put(writeStringUTF16BEBOM(next, i, values.size())); } } else { CharsetEncoder charsetEncoder = Charset.forName(charSetName).newEncoder(); charsetEncoder.onMalformedInput(CodingErrorAction.IGNORE); charsetEncoder.onUnmappableCharacter(CodingErrorAction.IGNORE); outputBuffer.put(writeString(charsetEncoder, next, i, values.size())); } } outputBuffer.flip(); data = new byte[outputBuffer.limit()]; outputBuffer.rewind(); outputBuffer.get(data, 0, outputBuffer.limit()); setSize(data.length); } // 
https://bitbucket.org/ijabz/jaudiotagger/issue/1/encoding-metadata-to-utf-16-can-fail-if catch (CharacterCodingException ce) { logger.severe(ce.getMessage() + ":" + charSetName + ":" + value); throw new RuntimeException(ce); } return data; }
// check and compare canEncoding/Encoding static char[] checkEncoding(Charset oldCS, Charset newCS) throws Exception { System.out.printf("Encoding <%s> <%s>...%n", oldCS.name(), newCS.name()); CharsetEncoder encOLD = oldCS.newEncoder(); CharsetEncoder encNew = newCS.newEncoder(); char[] cc = new char[0x10000]; int pos = 0; boolean is970 = "x-IBM970-Old".equals(oldCS.name()); for (char c = 0; c < 0xffff; c++) { boolean canOld = encOLD.canEncode(c); boolean canNew = encNew.canEncode(c); if (is970 && c == 0x2299) continue; if (canOld != canNew) { if (canNew) { System.out.printf(" NEW(only): "); printEntry(c, newCS); } else { if (is970) { byte[] bb = new String(new char[] {c}).getBytes(oldCS); if (bb.length == 2 && bb[0] == (byte) 0xa2 && bb[1] == (byte) 0xc1) { // we know 970 has bogus nnnn -> a2c1 -> 2299 continue; } } System.out.printf(" OLD(only): "); printEntry(c, oldCS); } } else if (canNew) { byte[] bbNew = new String(new char[] {c}).getBytes(newCS); byte[] bbOld = new String(new char[] {c}).getBytes(oldCS); if (!Arrays.equals(bbNew, bbOld)) { System.out.printf(" c->b NEW: "); printEntry(c, newCS); System.out.printf(" c->b OLD: "); printEntry(c, oldCS); } else { String sNew = new String(bbNew, newCS); String sOld = new String(bbOld, oldCS); if (!sNew.equals(sOld)) { System.out.printf(" b2c NEW (c=%x):", c & 0xffff); printEntry(sNew.charAt(0), newCS); System.out.printf(" b2c OLD:"); printEntry(sOld.charAt(0), oldCS); } } } if (canNew & canOld) { // added only both for now cc[pos++] = c; } } return Arrays.copyOf(cc, pos); }
/**
 * Encodes one value with the supplied encoder.
 *
 * <p>When this is called for several values, every value except the last one
 * is written with a trailing null character.
 *
 * @param encoder encoder to use
 * @param next the value to encode
 * @param i zero-based index of this value
 * @param noOfValues total number of values being written
 * @return the encoded bytes, positioned at the start
 * @throws CharacterCodingException if the encoder reports an error
 */
private ByteBuffer writeString(CharsetEncoder encoder, String next, int i, int noOfValues)
        throws CharacterCodingException {
    final boolean isLastValue = (i + 1) == noOfValues;
    final String text = isLastValue ? next : next + '\0';
    final ByteBuffer encoded = encoder.encode(CharBuffer.wrap(text));
    encoded.rewind();
    return encoded;
}
static void testMixed(Charset cs) throws Throwable { CharsetDecoder dec = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); CharsetEncoder enc = cs.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); List<Integer> cps = new ArrayList<>(0x10000); int off = 0; int cp = 0; while (cp < 0x10000) { if (enc.canEncode((char) cp)) { cps.add(cp); } cp++; } Collections.shuffle(cps); char[] bmpCA = new char[cps.size()]; for (int i = 0; i < cps.size(); i++) bmpCA[i] = (char) (int) cps.get(i); String bmpStr = new String(bmpCA); // getBytes(csn); byte[] bmpBA = bmpStr.getBytes(cs.name()); ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA)); byte[] baNIO = new byte[bf.limit()]; bf.get(baNIO, 0, baNIO.length); if (!Arrays.equals(bmpBA, baNIO)) { throw new RuntimeException("getBytes(csn) failed -> " + cs.name()); } // getBytes(cs); bmpBA = bmpStr.getBytes(cs); if (!Arrays.equals(bmpBA, baNIO)) throw new RuntimeException("getBytes(cs) failed -> " + cs.name()); // new String(csn); String strSC = new String(bmpBA, cs.name()); String strNIO = dec.reset().decode(ByteBuffer.wrap(bmpBA)).toString(); if (!strNIO.equals(strSC)) { throw new RuntimeException("new String(csn) failed -> " + cs.name()); } // new String(cs); strSC = new String(bmpBA, cs); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(cs) failed -> " + cs.name()); }
/** * Write String in UTF-BEBOM format * * <p>When this is called multiple times, all but the last value has a trailing null * * @param next * @param i * @param noOfValues * @return * @throws CharacterCodingException */ private ByteBuffer writeStringUTF16BEBOM(String next, int i, int noOfValues) throws CharacterCodingException { CharsetEncoder encoder = Charset.forName(TextEncoding.CHARSET_UTF_16_BE_ENCODING_FORMAT).newEncoder(); encoder.onMalformedInput(CodingErrorAction.IGNORE); encoder.onUnmappableCharacter(CodingErrorAction.IGNORE); ByteBuffer bb = null; // Add BOM if ((i + 1) == noOfValues) { bb = encoder.encode(CharBuffer.wrap('\ufeff' + next)); } else { bb = encoder.encode(CharBuffer.wrap('\ufeff' + next + '\0')); } bb.rewind(); return bb; }
public static void main(String args[]) throws Exception { String s = "abc\uD800\uDC00qrst"; // Valid surrogate char[] c = s.toCharArray(); CharsetEncoder enc = Charset.forName("ISO8859_1").newEncoder().onUnmappableCharacter(CodingErrorAction.REPLACE); /* Process the first 4 characters, including the high surrogate which should be stored */ ByteBuffer bb = ByteBuffer.allocate(10); CharBuffer cb = CharBuffer.wrap(c); cb.limit(4); enc.encode(cb, bb, false); cb.limit(7); enc.encode(cb, bb, true); byte[] first = bb.array(); for (int i = 0; i < 7; i++) System.err.printf("[%d]=%d was %d\n", i, (int) first[i] & 0xffff, (int) c[i] & 0xffff); }
public static void main(String[] args) throws Exception { // 创建简体中文对应的Charset Charset cn = Charset.forName("GBK"); // 获取cn对象对应的编码器和解码器 CharsetEncoder cnEncoder = cn.newEncoder(); CharsetDecoder cnDecoder = cn.newDecoder(); // 创建一个CharBuffer对象 CharBuffer cbuff = CharBuffer.allocate(8); cbuff.put('孙'); cbuff.put('悟'); cbuff.put('空'); cbuff.flip(); // 将CharBuffer中的字符序列转换成字节序列 ByteBuffer bbuff = cnEncoder.encode(cbuff); // 循环访问ByteBuffer中的每个字节 for (int i = 0; i < bbuff.capacity(); i++) { System.out.print(bbuff.get(i) + " "); } // 将ByteBuffer的数据解码成字符序列 System.out.println("\n" + cnDecoder.decode(bbuff)); }
/**
 * Encodes {@code cc} with a fresh encoder of {@code cs}, {@code iteration} times
 * over, and reports how the last pass ended.
 *
 * @param cc characters to encode
 * @param cs charset supplying the encoder
 * @param testDirect when true, direct buffers are used for both input and output;
 *     otherwise a heap output buffer and a wrapper around {@code cc}
 * @return the result of the final encode call, or {@code null} if no pass ran
 */
static CoderResult encodeCR(char[] cc, Charset cs, boolean testDirect) throws Exception {
    final CharsetEncoder encoder = cs.newEncoder();

    final ByteBuffer out;
    final CharBuffer in;
    if (!testDirect) {
        out = ByteBuffer.allocate(cc.length * 4);
        in = CharBuffer.wrap(cc);
    } else {
        out = ByteBuffer.allocateDirect(cc.length * 4);
        in = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
        in.put(cc).flip();
    }

    CoderResult lastResult = null;
    for (int round = 0; round < iteration; round++) {
        // Every pass starts from pristine buffers and encoder state.
        in.rewind();
        out.clear();
        encoder.reset();
        lastResult = encoder.encode(in, out, true);
    }
    return lastResult;
}
public static void main(String[] args) throws Throwable { final int itrs = Integer.getInteger("iterations", 100000); // final int itrs = Integer.getInteger("iterations", 12); final int size = Integer.getInteger("size", 2048); final int subsize = Integer.getInteger("subsize", 128); final int maxchar = Integer.getInteger("maxchar", 128); final String regex = System.getProperty("filter"); final Pattern filter = (regex == null) ? null : Pattern.compile(regex); final boolean useSecurityManager = Boolean.getBoolean("SecurityManager"); if (useSecurityManager) System.setSecurityManager(new PermissiveSecurityManger()); final Random rnd = new Random(); String[] csns = new String[] { "Big5", "Johab", "EUC_CN", "EUC_KR", "MS932", "MS936", "MS949", "MS950", "GBK", "Big5_HKSCS", "Big5_HKSCS_2001", "Big5_Solaris", "MS950_HKSCS", "MS950_HKSCS_XP", "IBM1364", "IBM1381", "IBM1383", "IBM930", "IBM933", "IBM935", "IBM937", "IBM939", "IBM942", "IBM943", "IBM948", "IBM949", "IBM950", "IBM970", }; ArrayList<long[]> sum = new ArrayList<>(); for (final String csn : csns) { final Charset cs = Charset.forName(csn); List<Integer> cps = new ArrayList<>(0x4000); int off = 0; int cp = 0; int n = 0; CharsetEncoder enc = cs.newEncoder(); while (cp < 0x10000 && n < cps.size()) { if (enc.canEncode((char) cp)) { cps.add(cp); n++; } cp++; } Collections.shuffle(cps); char[] ca = new char[cps.size()]; for (int i = 0; i < cps.size(); i++) ca[i] = (char) (int) cps.get(i); System.out.printf("%n--------%s---------%n", csn); for (int sz = 8; sz <= 2048; sz *= 2) { System.out.printf(" [len=%d]%n", sz); final char[] chars = Arrays.copyOf(ca, sz); final String str = new String(chars); final byte[] bs = str.getBytes(cs); Job[] jobs = { new Job("String decode: csn") { public void work() throws Throwable { for (int i = 0; i < itrs; i++) new String(bs, csn); } }, new Job("String decode: cs") { public void work() throws Throwable { for (int i = 0; i < itrs; i++) new String(bs, cs); } }, new Job("String encode: csn") 
{ public void work() throws Throwable { for (int i = 0; i < itrs; i++) str.getBytes(csn); } }, new Job("String encode: cs") { public void work() throws Throwable { for (int i = 0; i < itrs; i++) str.getBytes(cs); } }, }; sum.add(time(jobs)); } } }
static void test(Charset cs, char[] bmpCA, byte[] sbBA) throws Throwable { String bmpStr = new String(bmpCA); CharsetDecoder dec = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); CharsetEncoder enc = cs.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); // getBytes(csn); byte[] baSC = bmpStr.getBytes(cs.name()); ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA)); byte[] baNIO = new byte[bf.limit()]; bf.get(baNIO, 0, baNIO.length); if (!Arrays.equals(baSC, baNIO)) throw new RuntimeException("getBytes(csn) failed -> " + cs.name()); // getBytes(cs); baSC = bmpStr.getBytes(cs); if (!Arrays.equals(baSC, baNIO)) throw new RuntimeException("getBytes(cs) failed -> " + cs.name()); // new String(csn); String strSC = new String(sbBA, cs.name()); String strNIO = dec.reset().decode(ByteBuffer.wrap(sbBA)).toString(); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(csn) failed -> " + cs.name()); // new String(cs); strSC = new String(sbBA, cs); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(cs) failed -> " + cs.name()); // encode unmappable surrogates if (enc instanceof sun.nio.cs.ArrayEncoder && cs.contains(Charset.forName("ASCII"))) { if (cs.name().equals("UTF-8") || // utf8 handles surrogates cs.name().equals("CESU-8")) // utf8 handles surrogates return; enc.replaceWith(new byte[] {(byte) 'A'}); sun.nio.cs.ArrayEncoder cae = (sun.nio.cs.ArrayEncoder) enc; String str = "ab\uD800\uDC00\uD800\uDC00cd"; byte[] ba = new byte[str.length() - 2]; int n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 6 || !"abAAcd".equals(new String(ba, cs.name()))) throw new RuntimeException("encode1(surrogates) failed -> " + cs.name()); ba = new byte[str.length()]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 6 || !"abAAcd".equals(new String(ba, 0, n, cs.name()))) throw new 
RuntimeException("encode2(surrogates) failed -> " + cs.name()); str = "ab\uD800B\uDC00Bcd"; ba = new byte[str.length()]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 8 || !"abABABcd".equals(new String(ba, 0, n, cs.name()))) throw new RuntimeException("encode3(surrogates) failed -> " + cs.name()); /* sun.nio.cs.ArrayDeEncoder works on the assumption that the invoker (StringCoder) allocates enough output buf, utf8 and double-byte coder does not check the output buffer limit. ba = new byte[str.length() - 1]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 7 || !"abABABc".equals(new String(ba, 0, n, cs.name()))) { throw new RuntimeException("encode4(surrogates) failed -> " + cs.name()); } */ } }
@Override public void visitFile(PsiFile file) { super.visitFile(file); if (InjectedLanguageManager.getInstance(file.getProject()).isInjectedFragment(file) || !file.isPhysical()) { return; } final VirtualFile virtualFile = file.getVirtualFile(); final String text = file.getText(); final Charset charset = LoadTextUtil.extractCharsetFromFileContent(file.getProject(), virtualFile, text); final CharsetEncoder encoder = charset.newEncoder().onUnmappableCharacter(CodingErrorAction.REPORT); final CharBuffer charBuffer = CharBuffer.allocate(1); final ByteBuffer byteBuffer = ByteBuffer.allocate(10); final int length = text.length(); for (int i = 0; i < length; i++) { final char c = text.charAt(i); if (c != '\\') { continue; } boolean isEscape = true; int previousChar = i - 1; while (previousChar >= 0 && text.charAt(previousChar) == '\\') { isEscape = !isEscape; previousChar--; } if (!isEscape) { continue; } int nextChar = i; do { nextChar++; if (nextChar >= length) { break; } } while (text.charAt(nextChar) == 'u'); // \uuuu0061 is a legal unicode escape if (nextChar == i + 1 || nextChar + 3 >= length) { continue; } if (StringUtil.isHexDigit(text.charAt(nextChar)) && StringUtil.isHexDigit(text.charAt(nextChar + 1)) && StringUtil.isHexDigit(text.charAt(nextChar + 2)) && StringUtil.isHexDigit(text.charAt(nextChar + 3))) { final int escapeEnd = nextChar + 4; final char d = (char) Integer.parseInt(text.substring(nextChar, escapeEnd), 16); if (Character.isISOControl(d)) { continue; } byteBuffer.clear(); charBuffer.clear(); charBuffer.put(d).rewind(); final CoderResult coderResult = encoder.encode(charBuffer, byteBuffer, true); if (!coderResult.isUnmappable()) { final PsiElement element = file.findElementAt(i); if (element != null && isSuppressedFor(element)) { return; } registerErrorAtOffset(file, i, escapeEnd - i, Character.valueOf(d)); } } } }