/**
 * Creates a string in a specified character set.
 *
 * <p>When a charset name is given, it is upper-cased, translated to a Java charset name via
 * {@code SqlUtil.translateCharacterSetName}, and {@code value} is encoded once as a dry run so
 * that unrepresentable characters are detected at construction time.
 *
 * @param value String constant, must not be null
 * @param charsetName Name of the character set, may be null
 * @param collation Collation, may be null
 * @throws IllegalCharsetNameException If the given charset name is illegal
 * @throws UnsupportedCharsetException If no support for the named charset is available in this
 *     instance of the Java virtual machine
 * @throws RuntimeException If the given value cannot be represented in the given charset
 */
public NlsString(String value, String charsetName, SqlCollation collation) {
  assert value != null;
  if (null != charsetName) {
    // Charset names are locale-independent identifiers. Upper-casing with the default locale
    // breaks them under e.g. a Turkish locale, where 'i' maps to a dotted capital I.
    charsetName = charsetName.toUpperCase(java.util.Locale.ROOT);
    this.charsetName = charsetName;
    String javaCharsetName = SqlUtil.translateCharacterSetName(charsetName);
    if (javaCharsetName == null) {
      throw new UnsupportedCharsetException(charsetName);
    }
    this.charset = Charset.forName(javaCharsetName);
    CharsetEncoder encoder = charset.newEncoder();

    // dry run to see if encoding hits any problems
    try {
      encoder.encode(CharBuffer.wrap(value));
    } catch (CharacterCodingException ex) {
      throw RESOURCE.charsetEncoding(value, javaCharsetName).ex();
    }
  } else {
    this.charsetName = null;
    this.charset = null;
  }
  this.collation = collation;
  this.value = value;
}
/**
 * Encodes {@code cc} with a fresh encoder of {@code cs}, repeating the encode {@code iteration}
 * times, and stores the average elapsed time per pass (in microseconds) in {@code t.t}.
 *
 * @param cc characters to encode
 * @param cs charset supplying the encoder
 * @param testDirect if true, direct buffers are used for both input and output; otherwise
 *     heap buffers are used
 * @param t receives the average time per pass
 * @return the bytes produced by the last pass
 * @throws RuntimeException if the last pass did not end with {@code CoderResult.UNDERFLOW}
 */
static byte[] encode(char[] cc, Charset cs, boolean testDirect, Time t) throws Exception {
  final CharsetEncoder encoder = cs.newEncoder();
  final String charsetName = cs.name();

  final ByteBuffer target;
  final CharBuffer source;
  if (!testDirect) {
    target = ByteBuffer.allocate(cc.length * 4);
    source = CharBuffer.wrap(cc);
  } else {
    target = ByteBuffer.allocateDirect(cc.length * 4);
    source = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
    source.put(cc).flip();
  }

  CoderResult result = null;
  final long startMicros = System.nanoTime() / 1000;
  int pass = 0;
  while (pass < iteration) {
    // Every pass starts from the same state: full input, empty output, reset encoder.
    source.rewind();
    target.clear();
    encoder.reset();
    result = encoder.encode(source, target, true);
    pass++;
  }
  final long endMicros = System.nanoTime() / 1000;
  t.t = (endMicros - startMicros) / iteration;

  if (result == CoderResult.UNDERFLOW) {
    target.flip();
    final byte[] encoded = new byte[target.remaining()];
    target.get(encoded);
    return encoded;
  }

  // Report where the encoder stopped before failing.
  System.out.println("ENC-----------------");
  final int stoppedAt = source.position();
  System.out.printf(
      " cr=%s, cbf.pos=%d, cc[pos]=%x%n", result.toString(), stoppedAt, cc[stoppedAt] & 0xffff);
  throw new RuntimeException("Encoding err: " + charsetName);
}
static void testMixed(Charset cs) throws Throwable { CharsetDecoder dec = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); CharsetEncoder enc = cs.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); List<Integer> cps = new ArrayList<>(0x10000); int off = 0; int cp = 0; while (cp < 0x10000) { if (enc.canEncode((char) cp)) { cps.add(cp); } cp++; } Collections.shuffle(cps); char[] bmpCA = new char[cps.size()]; for (int i = 0; i < cps.size(); i++) bmpCA[i] = (char) (int) cps.get(i); String bmpStr = new String(bmpCA); // getBytes(csn); byte[] bmpBA = bmpStr.getBytes(cs.name()); ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA)); byte[] baNIO = new byte[bf.limit()]; bf.get(baNIO, 0, baNIO.length); if (!Arrays.equals(bmpBA, baNIO)) { throw new RuntimeException("getBytes(csn) failed -> " + cs.name()); } // getBytes(cs); bmpBA = bmpStr.getBytes(cs); if (!Arrays.equals(bmpBA, baNIO)) throw new RuntimeException("getBytes(cs) failed -> " + cs.name()); // new String(csn); String strSC = new String(bmpBA, cs.name()); String strNIO = dec.reset().decode(ByteBuffer.wrap(bmpBA)).toString(); if (!strNIO.equals(strSC)) { throw new RuntimeException("new String(csn) failed -> " + cs.name()); } // new String(cs); strSC = new String(bmpBA, cs); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(cs) failed -> " + cs.name()); }
/** Generic Decoder. */ private static final class Generic extends TextDecoder { /** Input cache. */ private final byte[] cache = new byte[4]; /** Input buffer. */ private final ByteBuffer inc = ByteBuffer.wrap(cache); /** Output buffer. */ private final CharBuffer outc = CharBuffer.wrap(new char[4]); /** Charset decoder. */ private final CharsetDecoder csd; /** * Constructor. * * @param enc encoding * @throws IOException I/O exception */ private Generic(final String enc) throws IOException { try { csd = Charset.forName(enc).newDecoder(); } catch (final Exception ex) { throw new EncodingException(ex); } } @Override int read(final TextInput ti) throws IOException { int c = -1; while (++c < 4) { final int ch = ti.readByte(); if (ch < 0) break; cache[c] = (byte) ch; outc.position(0); inc.position(0); inc.limit(c + 1); csd.reset(); final CoderResult cr = csd.decode(inc, outc, true); if (cr.isMalformed()) continue; // return character int i = 0; final int os = outc.position(); for (int o = 0; o < os; ++o) i |= outc.get(o) << (o << 3); return i; } return c == 0 ? -1 : invalid(); } }
public static void main(String args[]) throws Exception { String s = "abc\uD800\uDC00qrst"; // Valid surrogate char[] c = s.toCharArray(); CharsetEncoder enc = Charset.forName("ISO8859_1").newEncoder().onUnmappableCharacter(CodingErrorAction.REPLACE); /* Process the first 4 characters, including the high surrogate which should be stored */ ByteBuffer bb = ByteBuffer.allocate(10); CharBuffer cb = CharBuffer.wrap(c); cb.limit(4); enc.encode(cb, bb, false); cb.limit(7); enc.encode(cb, bb, true); byte[] first = bb.array(); for (int i = 0; i < 7; i++) System.err.printf("[%d]=%d was %d\n", i, (int) first[i] & 0xffff, (int) c[i] & 0xffff); }
/**
 * Encodes {@code cc} with a fresh encoder of {@code cs}, repeating the encode
 * {@code iteration} times, and reports how the last pass ended.
 *
 * @param cc characters to encode
 * @param cs charset supplying the encoder
 * @param testDirect if true, direct buffers are used for both input and output; otherwise
 *     heap buffers are used
 * @return the {@code CoderResult} of the last pass, or {@code null} if no pass was run
 */
static CoderResult encodeCR(char[] cc, Charset cs, boolean testDirect) throws Exception {
  final CharsetEncoder encoder = cs.newEncoder();

  final ByteBuffer target;
  final CharBuffer source;
  if (!testDirect) {
    target = ByteBuffer.allocate(cc.length * 4);
    source = CharBuffer.wrap(cc);
  } else {
    target = ByteBuffer.allocateDirect(cc.length * 4);
    source = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
    source.put(cc).flip();
  }

  CoderResult result = null;
  int pass = 0;
  while (pass < iteration) {
    // Every pass starts from the same state: full input, empty output, reset encoder.
    source.rewind();
    target.clear();
    encoder.reset();
    result = encoder.encode(source, target, true);
    pass++;
  }
  return result;
}
/** * Writes any remaining output to the output buffer and resets the converter to its initial state. * * @param output char array to receive flushed output. * @param outStart start writing to output array at this offset. * @param outEnd stop writing to output array at this offset (exclusive). * @exception MalformedInputException if the output to be flushed contained a partial or invalid * multibyte character sequence. flush will write what it can to the output buffer and reset * the converter before throwing this exception. An additional call to flush is not required. * @exception ConversionBufferFullException if output array is filled before all the output can be * flushed. flush will write what it can to the output buffer and remember its state. An * additional call to flush with a new output buffer will conclude the operation. */ public int flush(char[] output, int outStart, int outEnd) throws MalformedInputException, ConversionBufferFullException { byteOff = charOff = 0; if (outStart >= outEnd || outStart >= output.length) throw new ConversionBufferFullException(); if (dst != null && dst.array() == output) dst.position(outStart).limit(outEnd); else dst = CharBuffer.wrap(output, outStart, outEnd - outStart); CoderResult cr = null; try { if (src != null) cr = decoder.decode((ByteBuffer) src.clear(), dst, true); assert !cr.isUnmappable(); if (cr.isMalformed()) { badInputLength = cr.length(); reset(); throw new MalformedInputException(); } } catch (IllegalStateException ise) { if (src != null) cr = decoder.reset().decode(src, dst, true); } try { cr = decoder.flush(dst); } catch (Exception e) { assert false; } finally { byteOff = 0; charOff = dst.position(); src = null; } if (cr.isOverflow()) throw new ConversionBufferFullException(); // Return the length written to the output buffer if (cr.isUnderflow()) { int written = charOff - outStart; reset(); return written; } assert false; return -1; // should be never reached }
/** * Converts an array of bytes containing characters in an external encoding into an array of * Unicode characters. This method allows a buffer by buffer conversion of a data stream. The * state of the conversion is saved between calls to convert. Among other things, this means * multibyte input sequences can be split between calls. If a call to convert results in an * exception, the conversion may be continued by calling convert again with suitably modified * parameters. All conversions should be finished with a call to the flush method. * * @return the number of bytes written to output. * @param input byte array containing text to be converted. * @param inStart begin conversion at this offset in input array. * @param inEnd stop conversion at this offset in input array (exclusive). * @param output character array to receive conversion result. * @param outStart start writing to output array at this offset. * @param outEnd stop writing to output array at this offset (exclusive). * @exception MalformedInputException if the input buffer contains any sequence of bytes that is * illegal for the input character set. * @exception UnknownCharacterException for any character that that cannot be converted to * Unicode. Thrown only when converter is not in substitution mode. * @exception ConversionBufferFullException if output array is filled prior to converting all the * input. 
*/ public int convert(byte[] input, int inStart, int inEnd, char[] output, int outStart, int outEnd) throws UnknownCharacterException, MalformedInputException, ConversionBufferFullException { byteOff = inStart; charOff = outStart; // throw exceptions compatible to legacy ByteToCharXxx converters if (inStart >= inEnd) return 0; if (inStart >= input.length) throw new ArrayIndexOutOfBoundsException(inStart); if (outStart >= outEnd || outStart >= output.length) throw new ConversionBufferFullException(); if (src != null && src.array() == input) src.position(inStart).limit(inEnd); else src = ByteBuffer.wrap(input, inStart, inEnd - inStart); if (dst != null && dst.array() == output) dst.position(outStart).limit(outEnd); else dst = CharBuffer.wrap(output, outStart, outEnd - outStart); CoderResult cr; try { cr = decoder.decode(src, dst, false); } catch (IllegalStateException ise) { cr = decoder.reset().decode(src, dst, false); } finally { byteOff = src.position(); charOff = dst.position(); } if (cr.isUnmappable()) { badInputLength = cr.length(); throw new UnknownCharacterException(); } if (cr.isMalformed()) { badInputLength = cr.length(); throw new MalformedInputException(); } if (cr.isOverflow()) throw new ConversionBufferFullException(); // Return the length written to the output buffer if (cr.isUnderflow()) return charOff - outStart; return -1; // should be never reached }
static void test(Charset cs, char[] bmpCA, byte[] sbBA) throws Throwable { String bmpStr = new String(bmpCA); CharsetDecoder dec = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); CharsetEncoder enc = cs.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); // getBytes(csn); byte[] baSC = bmpStr.getBytes(cs.name()); ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA)); byte[] baNIO = new byte[bf.limit()]; bf.get(baNIO, 0, baNIO.length); if (!Arrays.equals(baSC, baNIO)) throw new RuntimeException("getBytes(csn) failed -> " + cs.name()); // getBytes(cs); baSC = bmpStr.getBytes(cs); if (!Arrays.equals(baSC, baNIO)) throw new RuntimeException("getBytes(cs) failed -> " + cs.name()); // new String(csn); String strSC = new String(sbBA, cs.name()); String strNIO = dec.reset().decode(ByteBuffer.wrap(sbBA)).toString(); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(csn) failed -> " + cs.name()); // new String(cs); strSC = new String(sbBA, cs); if (!strNIO.equals(strSC)) throw new RuntimeException("new String(cs) failed -> " + cs.name()); // encode unmappable surrogates if (enc instanceof sun.nio.cs.ArrayEncoder && cs.contains(Charset.forName("ASCII"))) { if (cs.name().equals("UTF-8") || // utf8 handles surrogates cs.name().equals("CESU-8")) // utf8 handles surrogates return; enc.replaceWith(new byte[] {(byte) 'A'}); sun.nio.cs.ArrayEncoder cae = (sun.nio.cs.ArrayEncoder) enc; String str = "ab\uD800\uDC00\uD800\uDC00cd"; byte[] ba = new byte[str.length() - 2]; int n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 6 || !"abAAcd".equals(new String(ba, cs.name()))) throw new RuntimeException("encode1(surrogates) failed -> " + cs.name()); ba = new byte[str.length()]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 6 || !"abAAcd".equals(new String(ba, 0, n, cs.name()))) throw new 
RuntimeException("encode2(surrogates) failed -> " + cs.name()); str = "ab\uD800B\uDC00Bcd"; ba = new byte[str.length()]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 8 || !"abABABcd".equals(new String(ba, 0, n, cs.name()))) throw new RuntimeException("encode3(surrogates) failed -> " + cs.name()); /* sun.nio.cs.ArrayDeEncoder works on the assumption that the invoker (StringCoder) allocates enough output buf, utf8 and double-byte coder does not check the output buffer limit. ba = new byte[str.length() - 1]; n = cae.encode(str.toCharArray(), 0, str.length(), ba); if (n != 7 || !"abABABc".equals(new String(ba, 0, n, cs.name()))) { throw new RuntimeException("encode4(surrogates) failed -> " + cs.name()); } */ } }
// we assume out is large enough for this conversion // returns number of filled chars in out buffer public int convert(byte[] in, int inOffset, int inLength, char[] out) { final ByteBuffer inBuffer = ByteBuffer.wrap(in, inOffset, inLength); final CharBuffer outBuffer = CharBuffer.wrap(out, 0, out.length); myDecoder.decode(inBuffer, outBuffer, false); return outBuffer.position(); }