public static void main(String args[]) throws Exception { String inputFile = "samplein.txt"; String outputFile = "sampleout.txt"; RandomAccessFile inf = new RandomAccessFile(inputFile, "r"); RandomAccessFile outf = new RandomAccessFile(outputFile, "rw"); long inputLength = new File(inputFile).length(); FileChannel inc = inf.getChannel(); FileChannel outc = outf.getChannel(); MappedByteBuffer inputData = inc.map(FileChannel.MapMode.READ_ONLY, 0, inputLength); Charset latin1 = Charset.forName("ISO-8859-1"); CharsetDecoder decoder = latin1.newDecoder(); CharsetEncoder encoder = latin1.newEncoder(); CharBuffer cb = decoder.decode(inputData); // Process char data here ByteBuffer outputData = encoder.encode(cb); outc.write(outputData); inf.close(); outf.close(); }
/**
 * Performs one non-final decode step and compares the outcome with the expectation encoded in
 * {@code flow}.
 *
 * <p>{@code flow} holds, in order: input start position, input length, output start position,
 * output length, expected input position, expected output position, and the expected result
 * ({@code 0} means {@link CoderResult#UNDERFLOW}, any other value {@link CoderResult#OVERFLOW}).
 *
 * @param dec decoder under test; it is reset before decoding
 * @param bytes bytes copied into the input buffer starting at the input start position
 * @param direct whether direct buffers are used instead of heap buffers
 * @param flow the seven control values described above
 * @return {@code true} if result and both buffer positions match; otherwise the mismatch is
 *     printed to standard output and {@code false} is returned
 */
static boolean check(CharsetDecoder dec, byte[] bytes, boolean direct, int[] flow) {
  final int srcStart = flow[0];
  final int srcLength = flow[1];
  final int dstStart = flow[2];
  final int dstLength = flow[3];
  final int wantSrcPos = flow[4];
  final int wantDstPos = flow[5];
  final CoderResult wantResult = flow[6] == 0 ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;

  final ByteBuffer src =
      direct
          ? ByteBuffer.allocateDirect(srcStart + bytes.length)
          : ByteBuffer.allocate(srcStart + bytes.length);
  final CharBuffer dst =
      direct
          ? ByteBuffer.allocateDirect((dstStart + dstLength) * 2).asCharBuffer()
          : CharBuffer.allocate(dstStart + dstLength);

  // Lay the payload out behind the leading gap, then expose only the requested window.
  src.position(srcStart);
  src.put(bytes);
  src.flip();
  src.position(srcStart);
  src.limit(srcStart + srcLength);
  dst.position(dstStart);

  dec.reset();
  final CoderResult actual = dec.decode(src, dst, false);

  final boolean asExpected =
      actual == wantResult && src.position() == wantSrcPos && dst.position() == wantDstPos;
  if (asExpected) {
    return true;
  }

  System.out.printf("Expected(direct=%5b): [", direct);
  for (int k = 0; k < flow.length; k++) {
    System.out.print(" " + flow[k]);
  }
  System.out.println(
      "] CR=" + actual + ", inPos=" + src.position() + ", outPos=" + dst.position());
  return false;
}
/** * Read a 'n' bytes from buffer into a String where n is the framesize - offset so therefore * cannot use this if there are other objects after it because it has no delimiter. * * <p>Must take into account the text encoding defined in the Encoding Object ID3 Text Frames * often allow multiple strings seperated by the null char appropriate for the encoding. * * @param arr this is the buffer for the frame * @param offset this is where to start reading in the buffer for this field * @throws NullPointerException * @throws IndexOutOfBoundsException */ public void readByteArray(byte[] arr, int offset) throws InvalidDataTypeException { logger.finest("Reading from array from offset:" + offset); // Get the Specified Decoder String charSetName = getTextEncodingCharSet(); CharsetDecoder decoder = Charset.forName(charSetName).newDecoder(); decoder.reset(); // Decode sliced inBuffer ByteBuffer inBuffer; // #302 [dallen] truncating array manually since the decoder.decode() does not honor the offset // in the in buffer byte[] truncArr = new byte[arr.length - offset]; System.arraycopy(arr, offset, truncArr, 0, truncArr.length); inBuffer = ByteBuffer.wrap(truncArr); CharBuffer outBuffer = CharBuffer.allocate(arr.length - offset); CoderResult coderResult = decoder.decode(inBuffer, outBuffer, true); if (coderResult.isError()) { logger.warning("Decoding error:" + coderResult.toString()); } decoder.flush(outBuffer); outBuffer.flip(); // If using UTF16 with BOM we then search through the text removing any BOMs that could exist // for multiple values, BOM could be Big Endian or Little Endian if (charSetName.equals(TextEncoding.CHARSET_UTF_16)) { value = outBuffer.toString().replace("\ufeff", "").replace("\ufffe", ""); } else { value = outBuffer.toString(); } // SetSize, important this is correct for finding the next datatype setSize(arr.length - offset); logger.config("Read SizeTerminatedString:" + value + " size:" + size); }
public static void main(String[] arguments) { try { // read byte data into a byte buffer String data = "friends.dat"; FileInputStream inData = new FileInputStream(data); FileChannel inChannel = inData.getChannel(); long inSize = inChannel.size(); ByteBuffer source = ByteBuffer.allocate((int) inSize); inChannel.read(source, 0); source.position(0); System.out.println("Original byte data:"); for (int i = 0; source.remaining() > 0; i++) { System.out.print(source.get() + " "); } // convert byte data into character data source.position(0); Charset ascii = Charset.forName("US-ASCII"); CharsetDecoder toAscii = ascii.newDecoder(); CharBuffer destination = toAscii.decode(source); destination.position(0); System.out.println("\n\nNew character data:"); for (int i = 0; destination.remaining() > 0; i++) { System.out.print(destination.get()); } System.out.println(); } catch (FileNotFoundException fne) { System.out.println(fne.getMessage()); } catch (IOException ioe) { System.out.println(ioe.getMessage()); } }
static void testMixed(Charset cs) throws Throwable { CharsetDecoder dec = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); CharsetEncoder enc = cs.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); List<Integer> cps = new ArrayList<>(0x10000); int off = 0; int cp = 0; while (cp < 0x10000) { if (enc.canEncode((char) cp)) { cps.add(cp); } cp++; } Collections.shuffle(cps); char[] bmpCA = new char[cps.size()]; for (int i = 0; i < cps.size(); i++) bmpCA[i] = (char) (int) cps.get(i); String bmpStr = new String(bmpCA); // getBytes(csn); byte[] bmpBA = bmpStr.getBytes(cs.name()); ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA)); byte[] baNIO = new byte[bf.limit()]; bf.get(baNIO, 0, baNIO.length); if (!Arrays.equals(bmpBA, baNIO)) { throw new RuntimeException("getBytes(csn) failed -> " + cs.name()); } // getBytes(cs); bmpBA = bmpStr.getBytes(cs); if (!Arrays.equals(bmpBA, baNIO)) throw new RuntimeException("getBytes(cs) failed -> " + cs.name()); // new String(csn); String strSC = new String(bmpBA, cs.name()); String strNIO = dec.reset().decode(ByteBuffer.wrap(bmpBA)).toString(); if (!strNIO.equals(strSC)) { throw new RuntimeException("new String(csn) failed -> " + cs.name()); } // new String(cs); strSC = new String(bmpBA, cs); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(cs) failed -> " + cs.name()); }
/** * Sets the substitution characters to use when the converter is in substitution mode. The given * chars must not be longer than the value returned by getMaxCharsPerByte for this converter. * * @param chars the substitution chars * @exception IllegalArgumentException if given byte array is longer than the value returned by * the method getMaxBytesPerChar. * @see #setSubstitutionMode * @see #getMaxBytesPerChar */ public void setSubstitutionChars(char[] chars) throws IllegalArgumentException { if (decoder != null) decoder.replaceWith(new String(chars)); else { // only provided for subclasses if (chars.length > getMaxCharsPerByte()) throw new IllegalArgumentException(); subChars = new char[chars.length]; System.arraycopy(chars, 0, subChars, 0, chars.length); } }
/**
 * Decodes {@code bb} with a decoder of {@code cs}, repeating the operation {@code iteration}
 * times, and stores the average time per pass (in microseconds) in {@code t.t}.
 *
 * @param bb bytes to decode
 * @param cs charset providing the decoder
 * @param testDirect whether direct buffers are used instead of heap buffers
 * @param t receives the average duration of one decode pass
 * @return the characters produced by the last pass
 * @throws RuntimeException if the last pass did not end with {@link CoderResult#UNDERFLOW}; the
 *     coder result, the input position and up to four bytes at that position are printed first
 * @throws Exception declared by the signature
 */
static char[] decode(byte[] bb, Charset cs, boolean testDirect, Time t) throws Exception {
  String csn = cs.name();
  CharsetDecoder dec = cs.newDecoder();
  ByteBuffer bbf;
  CharBuffer cbf;
  if (testDirect) {
    bbf = ByteBuffer.allocateDirect(bb.length);
    cbf = ByteBuffer.allocateDirect(bb.length * 2).asCharBuffer();
    // No flip needed: the buffer is exactly bb.length long, so the rewind() below leaves
    // position 0 and limit == capacity == bb.length.
    bbf.put(bb);
  } else {
    bbf = ByteBuffer.wrap(bb);
    cbf = CharBuffer.allocate(bb.length);
  }

  CoderResult cr = null;
  long t1 = System.nanoTime() / 1000;
  for (int i = 0; i < iteration; i++) {
    bbf.rewind();
    cbf.clear();
    dec.reset();
    cr = dec.decode(bbf, cbf, true);
  }
  long t2 = System.nanoTime() / 1000;
  t.t = (t2 - t1) / iteration;

  if (cr != CoderResult.UNDERFLOW) {
    System.out.println("DEC-----------------");
    int pos = bbf.position();
    // Dump at most four bytes from the failure position. Fewer may be left when the error is
    // near the end of the input; reading past the array would replace the real failure with an
    // ArrayIndexOutOfBoundsException.
    StringBuilder dump = new StringBuilder();
    for (int i = pos, end = Math.min(bb.length, pos + 4); i < end; i++) {
      if (dump.length() > 0) {
        dump.append(',');
      }
      dump.append(Integer.toHexString(bb[i] & 0xff));
    }
    System.out.printf(" cr=%s, bbf.pos=%d, bb[pos]=%s%n", cr, pos, dump);
    throw new RuntimeException("Decoding err: " + csn);
  }

  char[] cc = new char[cbf.position()];
  cbf.flip();
  cbf.get(cc);
  return cc;
}
/**
 * Decodes all currently unread bytes into a {@code String} using {@code charset}.
 *
 * <p>Malformed input and unmappable characters are replaced rather than reported. The unread
 * data is consumed chunk by chunk through {@code prepareRead()} and {@code currentReadChunk}.
 *
 * @param charset charset used to decode the unread bytes
 * @return the decoded text, or {@code null} when {@code totalBytesUnread()} is not positive
 * @throws CharacterCodingException if the final decode step or the flush does not end in
 *     underflow
 * @throws IllegalStateException if the last chunk buffer still has bytes remaining after decoding
 */
public String readAsString(Charset charset) throws CharacterCodingException {
  int unreadSize = totalBytesUnread();
  if (unreadSize > 0) {
    CharsetDecoder decoder =
        charset
            .newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    // One char of output capacity per unread byte.
    // NOTE(review): assumes the charset never yields more than one char per byte - confirm.
    CharBuffer charbuffer = CharBuffer.allocate(unreadSize);
    ByteBuffer buf = null;
    while (prepareRead() != -1) {
      buf = currentReadChunk.readToNioBuffer();
      // A second prepareRead() call decides whether this chunk was the last one.
      boolean endOfInput = (prepareRead() == -1);
      CoderResult result = decoder.decode(buf, charbuffer, endOfInput);
      // Only the result of the final chunk is checked; results of earlier chunks are ignored.
      if (endOfInput) {
        if (!result.isUnderflow()) {
          result.throwException();
        }
      }
    }
    CoderResult result = decoder.flush(charbuffer);
    // NOTE(review): buf is still null if the loop body never ran, which would raise a
    // NullPointerException here - confirm prepareRead() cannot return -1 while
    // totalBytesUnread() > 0.
    if (buf.hasRemaining()) {
      throw new IllegalStateException("There's a bug here, buffer wasn't read fully.");
    }
    if (!result.isUnderflow()) result.throwException();
    charbuffer.flip();
    String str;
    if (charbuffer.hasArray()) {
      // Build the string from the backing array, trimmed to the decoded length when the array
      // was not filled completely.
      // NOTE(review): presumably StringCharArrayAccessor avoids an extra copy - confirm.
      int len = charbuffer.remaining();
      char[] ch = charbuffer.array();
      if (len != ch.length) {
        ch = ArrayUtils.subarray(ch, 0, len);
      }
      str = StringCharArrayAccessor.createString(ch);
    } else {
      str = charbuffer.toString();
    }
    return str;
  }
  return null;
}
@Override int read(final TextInput ti) throws IOException { int c = -1; while (++c < 4) { final int ch = ti.readByte(); if (ch < 0) break; cache[c] = (byte) ch; outc.position(0); inc.position(0); inc.limit(c + 1); csd.reset(); final CoderResult cr = csd.decode(inc, outc, true); if (cr.isMalformed()) continue; // return character int i = 0; final int os = outc.position(); for (int o = 0; o < os; ++o) i |= outc.get(o) << (o << 3); return i; } return c == 0 ? -1 : invalid(); }
public static void main(String[] args) throws Exception { // 创建简体中文对应的Charset Charset cn = Charset.forName("GBK"); // 获取cn对象对应的编码器和解码器 CharsetEncoder cnEncoder = cn.newEncoder(); CharsetDecoder cnDecoder = cn.newDecoder(); // 创建一个CharBuffer对象 CharBuffer cbuff = CharBuffer.allocate(8); cbuff.put('孙'); cbuff.put('悟'); cbuff.put('空'); cbuff.flip(); // 将CharBuffer中的字符序列转换成字节序列 ByteBuffer bbuff = cnEncoder.encode(cbuff); // 循环访问ByteBuffer中的每个字节 for (int i = 0; i < bbuff.capacity(); i++) { System.out.print(bbuff.get(i) + " "); } // 将ByteBuffer的数据解码成字符序列 System.out.println("\n" + cnDecoder.decode(bbuff)); }
/**
 * Decodes {@code bb} with a decoder of {@code cs}, {@code iteration} times over, and returns the
 * result of the last pass.
 *
 * @param bb bytes to decode
 * @param cs charset providing the decoder
 * @param testDirect whether direct buffers are used instead of heap buffers
 * @return the {@link CoderResult} of the last pass, or {@code null} if no pass was executed
 * @throws Exception declared by the signature
 */
static CoderResult decodeCR(byte[] bb, Charset cs, boolean testDirect) throws Exception {
  final CharsetDecoder decoder = cs.newDecoder();
  final ByteBuffer input;
  final CharBuffer output;
  if (!testDirect) {
    input = ByteBuffer.wrap(bb);
    output = CharBuffer.allocate(bb.length);
  } else {
    input = ByteBuffer.allocateDirect(bb.length);
    output = ByteBuffer.allocateDirect(bb.length * 2).asCharBuffer();
    input.put(bb).flip();
  }

  CoderResult last = null;
  int round = 0;
  while (round < iteration) {
    // Start every pass from pristine buffers and decoder state.
    input.rewind();
    output.clear();
    decoder.reset();
    last = decoder.decode(input, output, true);
    round++;
  }
  return last;
}
/**
 * Writes any remaining output to the output buffer and resets the converter to its initial state.
 *
 * @param output char array to receive flushed output.
 * @param outStart start writing to output array at this offset.
 * @param outEnd stop writing to output array at this offset (exclusive).
 * @return the number of chars written to {@code output}, i.e. the final output position minus
 *     {@code outStart}.
 * @exception MalformedInputException if the output to be flushed contained a partial or invalid
 *     multibyte character sequence. flush will write what it can to the output buffer and reset
 *     the converter before throwing this exception. An additional call to flush is not required.
 * @exception ConversionBufferFullException if output array is filled before all the output can be
 *     flushed. flush will write what it can to the output buffer and remember its state. An
 *     additional call to flush with a new output buffer will conclude the operation.
 */
public int flush(char[] output, int outStart, int outEnd)
    throws MalformedInputException, ConversionBufferFullException {
  byteOff = charOff = 0;
  // An empty or out-of-range output window is reported as "buffer full".
  if (outStart >= outEnd || outStart >= output.length) throw new ConversionBufferFullException();
  // Reuse the existing CharBuffer when it already wraps this very array; otherwise wrap anew.
  if (dst != null && dst.array() == output) dst.position(outStart).limit(outEnd);
  else dst = CharBuffer.wrap(output, outStart, outEnd - outStart);
  CoderResult cr = null;
  try {
    // Final decode step with endOfInput == true. src.clear() puts the position back to 0 and the
    // limit to the capacity before decoding.
    // NOTE(review): clear() makes the decoder see the whole backing buffer again - confirm that
    // this is intended.
    if (src != null) cr = decoder.decode((ByteBuffer) src.clear(), dst, true);
    // NOTE(review): cr is only assigned when src != null, yet it is dereferenced by the next two
    // statements - calling flush without a prior convert (or after reset) looks like a
    // NullPointerException; confirm.
    assert !cr.isUnmappable();
    if (cr.isMalformed()) {
      badInputLength = cr.length();
      reset();
      throw new MalformedInputException();
    }
  } catch (IllegalStateException ise) {
    // The decoder rejected the call in its current state: reset it and retry once.
    if (src != null) cr = decoder.reset().decode(src, dst, true);
  }
  try {
    cr = decoder.flush(dst);
  } catch (Exception e) {
    assert false;
  } finally {
    // Publish the final offsets and drop the input buffer, whatever the flush did.
    byteOff = 0;
    charOff = dst.position();
    src = null;
  }
  if (cr.isOverflow()) throw new ConversionBufferFullException();
  // Return the length written to the output buffer
  if (cr.isUnderflow()) {
    int written = charOff - outStart;
    reset();
    return written;
  }
  assert false;
  return -1; // should be never reached
}
/**
 * Guesses the charset of a file by trying to decode its first bytes (at most 4096).
 *
 * <p>Candidates are tried in this order: the platform default charset, US-ASCII, UTF-8, UTF-16BE,
 * UTF-16LE, then every other available charset in the order reported by {@link
 * Charset#availableCharsets()}. The first charset whose decoder accepts the sample wins.
 *
 * @param f File to process.
 * @return the first charset able to decode the sample, or the default charset if none can.
 * @throws IOException in case of error.
 */
public static Charset decode(File f) throws IOException {
  SortedMap<String, Charset> charsets = Charset.availableCharsets();

  String[] firstCharsets = {
    Charset.defaultCharset().name(), "US-ASCII", "UTF-8", "UTF-16BE", "UTF-16LE"
  };

  // Insertion-ordered set: the preferred charsets come first and are not repeated afterwards.
  Collection<Charset> orderedCharsets = new java.util.LinkedHashSet<>(charsets.size());

  for (String c : firstCharsets) {
    if (charsets.containsKey(c)) {
      orderedCharsets.add(charsets.get(c));
    }
  }

  orderedCharsets.addAll(charsets.values());

  try (RandomAccessFile raf = new RandomAccessFile(f, "r")) {
    FileChannel ch = raf.getChannel();

    ByteBuffer buf = ByteBuffer.allocate(4096);

    ch.read(buf);

    buf.flip();

    for (Charset charset : orderedCharsets) {
      CharsetDecoder decoder = charset.newDecoder();

      decoder.reset();

      try {
        // Decode an independent view of the sample. A failed attempt moves the position of the
        // buffer it was given, so sharing one buffer would make later charsets see only the tail.
        decoder.decode(buf.duplicate());

        return charset;
      } catch (CharacterCodingException ignored) {
        // This charset cannot decode the sample; try the next candidate.
      }
    }
  }

  return Charset.defaultCharset();
}
/** * Converts an array of bytes containing characters in an external encoding into an array of * Unicode characters. This method allows a buffer by buffer conversion of a data stream. The * state of the conversion is saved between calls to convert. Among other things, this means * multibyte input sequences can be split between calls. If a call to convert results in an * exception, the conversion may be continued by calling convert again with suitably modified * parameters. All conversions should be finished with a call to the flush method. * * @return the number of bytes written to output. * @param input byte array containing text to be converted. * @param inStart begin conversion at this offset in input array. * @param inEnd stop conversion at this offset in input array (exclusive). * @param output character array to receive conversion result. * @param outStart start writing to output array at this offset. * @param outEnd stop writing to output array at this offset (exclusive). * @exception MalformedInputException if the input buffer contains any sequence of bytes that is * illegal for the input character set. * @exception UnknownCharacterException for any character that that cannot be converted to * Unicode. Thrown only when converter is not in substitution mode. * @exception ConversionBufferFullException if output array is filled prior to converting all the * input. 
*/ public int convert(byte[] input, int inStart, int inEnd, char[] output, int outStart, int outEnd) throws UnknownCharacterException, MalformedInputException, ConversionBufferFullException { byteOff = inStart; charOff = outStart; // throw exceptions compatible to legacy ByteToCharXxx converters if (inStart >= inEnd) return 0; if (inStart >= input.length) throw new ArrayIndexOutOfBoundsException(inStart); if (outStart >= outEnd || outStart >= output.length) throw new ConversionBufferFullException(); if (src != null && src.array() == input) src.position(inStart).limit(inEnd); else src = ByteBuffer.wrap(input, inStart, inEnd - inStart); if (dst != null && dst.array() == output) dst.position(outStart).limit(outEnd); else dst = CharBuffer.wrap(output, outStart, outEnd - outStart); CoderResult cr; try { cr = decoder.decode(src, dst, false); } catch (IllegalStateException ise) { cr = decoder.reset().decode(src, dst, false); } finally { byteOff = src.position(); charOff = dst.position(); } if (cr.isUnmappable()) { badInputLength = cr.length(); throw new UnknownCharacterException(); } if (cr.isMalformed()) { badInputLength = cr.length(); throw new MalformedInputException(); } if (cr.isOverflow()) throw new ConversionBufferFullException(); // Return the length written to the output buffer if (cr.isUnderflow()) return charOff - outStart; return -1; // should be never reached }
static void test(Charset cs, char[] bmpCA, byte[] sbBA) throws Throwable { String bmpStr = new String(bmpCA); CharsetDecoder dec = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); CharsetEncoder enc = cs.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); // getBytes(csn); byte[] baSC = bmpStr.getBytes(cs.name()); ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA)); byte[] baNIO = new byte[bf.limit()]; bf.get(baNIO, 0, baNIO.length); if (!Arrays.equals(baSC, baNIO)) throw new RuntimeException("getBytes(csn) failed -> " + cs.name()); // getBytes(cs); baSC = bmpStr.getBytes(cs); if (!Arrays.equals(baSC, baNIO)) throw new RuntimeException("getBytes(cs) failed -> " + cs.name()); // new String(csn); String strSC = new String(sbBA, cs.name()); String strNIO = dec.reset().decode(ByteBuffer.wrap(sbBA)).toString(); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(csn) failed -> " + cs.name()); // new String(cs); strSC = new String(sbBA, cs); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(cs) failed -> " + cs.name()); // encode unmappable surrogates if (enc instanceof sun.nio.cs.ArrayEncoder && cs.contains(Charset.forName("ASCII"))) { if (cs.name().equals("UTF-8") || // utf8 handles surrogates cs.name().equals("CESU-8")) // utf8 handles surrogates return; enc.replaceWith(new byte[] {(byte) 'A'}); sun.nio.cs.ArrayEncoder cae = (sun.nio.cs.ArrayEncoder) enc; String str = "ab\uD800\uDC00\uD800\uDC00cd"; byte[] ba = new byte[str.length() - 2]; int n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 6 || !"abAAcd".equals(new String(ba, cs.name()))) throw new RuntimeException("encode1(surrogates) failed -> " + cs.name()); ba = new byte[str.length()]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 6 || !"abAAcd".equals(new String(ba, 0, n, cs.name()))) throw new 
RuntimeException("encode2(surrogates) failed -> " + cs.name()); str = "ab\uD800B\uDC00Bcd"; ba = new byte[str.length()]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 8 || !"abABABcd".equals(new String(ba, 0, n, cs.name()))) throw new RuntimeException("encode3(surrogates) failed -> " + cs.name()); /* sun.nio.cs.ArrayDeEncoder works on the assumption that the invoker (StringCoder) allocates enough output buf, utf8 and double-byte coder does not check the output buffer limit. ba = new byte[str.length() - 1]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 7 || !"abABABc".equals(new String(ba, 0, n, cs.name()))) { throw new RuntimeException("encode4(surrogates) failed -> " + cs.name()); } */ } }
ByteToCharConverter(Charset charset, String encoding) { super(encoding); decoder = charset.newDecoder().onUnmappableCharacter(CodingErrorAction.REPLACE); // for compatibility to old ByteToCharASCII converter: if (charset.name().equals("US-ASCII")) decoder.onMalformedInput(CodingErrorAction.REPLACE); }
// we assume out is large enough for this conversion // returns number of filled chars in out buffer public int convert(byte[] in, int inOffset, int inLength, char[] out) { final ByteBuffer inBuffer = ByteBuffer.wrap(in, inOffset, inLength); final CharBuffer outBuffer = CharBuffer.wrap(out, 0, out.length); myDecoder.decode(inBuffer, outBuffer, false); return outBuffer.position(); }
/**
 * Switches substitution mode on or off.
 *
 * <p>In substitution mode untranslatable characters in the source encoding are replaced with the
 * substitution characters set by setSubstitutionChars. Outside substitution mode the converter
 * throws an UnknownCharacterException when it encounters untranslatable input.
 *
 * @param doSub if true, enable substitution mode.
 * @see #setSubstitutionChars
 */
public void setSubstitutionMode(boolean doSub) {
  super.setSubstitutionMode(doSub);
  if (decoder == null) {
    return;
  }
  final CodingErrorAction action = doSub ? CodingErrorAction.REPLACE : CodingErrorAction.REPORT;
  decoder.onUnmappableCharacter(action);
}
/** * Returns the maximum number of characters needed to convert a byte. Useful for calculating the * maximum output buffer size needed for a particular input buffer. */ public int getMaxCharsPerByte() { if (decoder != null) return (int) Math.ceil(decoder.maxCharsPerByte()); else // only provided for subclasses return 1; // Until UTF-16, this will do for every encoding }
/**
 * Resets converter to its initial state: the superclass state and the decoder are reset, and the
 * cached input and output buffers are dropped.
 */
public void reset() {
  super.reset();
  // The decoder may be absent (a case kept only for subclasses); other methods of this class
  // guard against that as well.
  if (decoder != null) {
    decoder.reset();
  }
  src = null;
  dst = null;
}
/** Resets the underlying decoder, discarding any state kept from earlier decode calls. */
public void reset() {
  myDecoder.reset();
}