Beispiel #1
0
  // check and compare canEncoding/Encoding
  static char[] checkEncoding(Charset oldCS, Charset newCS) throws Exception {
    System.out.printf("Encoding <%s> <%s>...%n", oldCS.name(), newCS.name());
    CharsetEncoder encOLD = oldCS.newEncoder();
    CharsetEncoder encNew = newCS.newEncoder();
    char[] cc = new char[0x10000];
    int pos = 0;
    boolean is970 = "x-IBM970-Old".equals(oldCS.name());

    for (char c = 0; c < 0xffff; c++) {
      boolean canOld = encOLD.canEncode(c);
      boolean canNew = encNew.canEncode(c);

      if (is970 && c == 0x2299) continue;

      if (canOld != canNew) {
        if (canNew) {
          System.out.printf("      NEW(only): ");
          printEntry(c, newCS);
        } else {
          if (is970) {
            byte[] bb = new String(new char[] {c}).getBytes(oldCS);
            if (bb.length == 2 && bb[0] == (byte) 0xa2 && bb[1] == (byte) 0xc1) {
              // we know 970 has bogus nnnn -> a2c1 -> 2299
              continue;
            }
          }
          System.out.printf("      OLD(only): ");
          printEntry(c, oldCS);
        }
      } else if (canNew) {
        byte[] bbNew = new String(new char[] {c}).getBytes(newCS);
        byte[] bbOld = new String(new char[] {c}).getBytes(oldCS);
        if (!Arrays.equals(bbNew, bbOld)) {
          System.out.printf("      c->b NEW: ");
          printEntry(c, newCS);
          System.out.printf("      c->b OLD: ");
          printEntry(c, oldCS);
        } else {
          String sNew = new String(bbNew, newCS);
          String sOld = new String(bbOld, oldCS);
          if (!sNew.equals(sOld)) {
            System.out.printf("      b2c NEW (c=%x):", c & 0xffff);
            printEntry(sNew.charAt(0), newCS);
            System.out.printf("      b2c OLD:");
            printEntry(sOld.charAt(0), oldCS);
          }
        }
      }
      if (canNew & canOld) { // added only both for now
        cc[pos++] = c;
      }
    }
    return Arrays.copyOf(cc, pos);
  }
Beispiel #2
0
  public static void main(String args[]) throws Exception {
    String inputFile = "samplein.txt";
    String outputFile = "sampleout.txt";

    RandomAccessFile inf = new RandomAccessFile(inputFile, "r");
    RandomAccessFile outf = new RandomAccessFile(outputFile, "rw");
    long inputLength = new File(inputFile).length();

    FileChannel inc = inf.getChannel();
    FileChannel outc = outf.getChannel();

    MappedByteBuffer inputData = inc.map(FileChannel.MapMode.READ_ONLY, 0, inputLength);

    Charset latin1 = Charset.forName("ISO-8859-1");
    CharsetDecoder decoder = latin1.newDecoder();
    CharsetEncoder encoder = latin1.newEncoder();

    CharBuffer cb = decoder.decode(inputData);

    // Process char data here

    ByteBuffer outputData = encoder.encode(cb);

    outc.write(outputData);

    inf.close();
    outf.close();
  }
  /**
   * Creates a string in a specfied character set.
   *
   * @param value String constant, must not be null
   * @param charsetName Name of the character set, may be null
   * @param collation Collation, may be null
   * @throws IllegalCharsetNameException If the given charset name is illegal
   * @throws UnsupportedCharsetException If no support for the named charset is available in this
   *     instance of the Java virtual machine
   * @throws RuntimeException If the given value cannot be represented in the given charset
   */
  public NlsString(String value, String charsetName, SqlCollation collation) {
    assert value != null;
    if (null != charsetName) {
      charsetName = charsetName.toUpperCase();
      this.charsetName = charsetName;
      String javaCharsetName = SqlUtil.translateCharacterSetName(charsetName);
      if (javaCharsetName == null) {
        throw new UnsupportedCharsetException(charsetName);
      }
      this.charset = Charset.forName(javaCharsetName);
      CharsetEncoder encoder = charset.newEncoder();

      // dry run to see if encoding hits any problems
      try {
        encoder.encode(CharBuffer.wrap(value));
      } catch (CharacterCodingException ex) {
        throw RESOURCE.charsetEncoding(value, javaCharsetName).ex();
      }
    } else {
      this.charsetName = null;
      this.charset = null;
    }
    this.collation = collation;
    this.value = value;
  }
Beispiel #4
0
 static byte[] encode(char[] cc, Charset cs, boolean testDirect, Time t) throws Exception {
   ByteBuffer bbf;
   CharBuffer cbf;
   CharsetEncoder enc = cs.newEncoder();
   String csn = cs.name();
   if (testDirect) {
     bbf = ByteBuffer.allocateDirect(cc.length * 4);
     cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
     cbf.put(cc).flip();
   } else {
     bbf = ByteBuffer.allocate(cc.length * 4);
     cbf = CharBuffer.wrap(cc);
   }
   CoderResult cr = null;
   long t1 = System.nanoTime() / 1000;
   for (int i = 0; i < iteration; i++) {
     cbf.rewind();
     bbf.clear();
     enc.reset();
     cr = enc.encode(cbf, bbf, true);
   }
   long t2 = System.nanoTime() / 1000;
   t.t = (t2 - t1) / iteration;
   if (cr != CoderResult.UNDERFLOW) {
     System.out.println("ENC-----------------");
     int pos = cbf.position();
     System.out.printf("  cr=%s, cbf.pos=%d, cc[pos]=%x%n", cr.toString(), pos, cc[pos] & 0xffff);
     throw new RuntimeException("Encoding err: " + csn);
   }
   byte[] bb = new byte[bbf.position()];
   bbf.flip();
   bbf.get(bb);
   return bb;
 }
  static void testMixed(Charset cs) throws Throwable {
    CharsetDecoder dec =
        cs.newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    CharsetEncoder enc =
        cs.newEncoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    List<Integer> cps = new ArrayList<>(0x10000);
    int off = 0;
    int cp = 0;
    while (cp < 0x10000) {
      if (enc.canEncode((char) cp)) {
        cps.add(cp);
      }
      cp++;
    }
    Collections.shuffle(cps);
    char[] bmpCA = new char[cps.size()];
    for (int i = 0; i < cps.size(); i++) bmpCA[i] = (char) (int) cps.get(i);
    String bmpStr = new String(bmpCA);
    // getBytes(csn);
    byte[] bmpBA = bmpStr.getBytes(cs.name());
    ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA));
    byte[] baNIO = new byte[bf.limit()];
    bf.get(baNIO, 0, baNIO.length);
    if (!Arrays.equals(bmpBA, baNIO)) {
      throw new RuntimeException("getBytes(csn) failed  -> " + cs.name());
    }

    // getBytes(cs);
    bmpBA = bmpStr.getBytes(cs);
    if (!Arrays.equals(bmpBA, baNIO))
      throw new RuntimeException("getBytes(cs) failed  -> " + cs.name());

    // new String(csn);
    String strSC = new String(bmpBA, cs.name());
    String strNIO = dec.reset().decode(ByteBuffer.wrap(bmpBA)).toString();
    if (!strNIO.equals(strSC)) {
      throw new RuntimeException("new String(csn) failed  -> " + cs.name());
    }

    // new String(cs);
    strSC = new String(bmpBA, cs);
    if (!strNIO.equals(strSC)) throw new RuntimeException("new String(cs) failed  -> " + cs.name());
  }
Beispiel #6
0
  static void checkInit(String csn) throws Exception {
    System.out.printf("Check init <%s>...%n", csn);
    Charset.forName("Big5"); // load in the ExtendedCharsets
    long t1 = System.nanoTime() / 1000;
    Charset cs = Charset.forName(csn);
    long t2 = System.nanoTime() / 1000;
    System.out.printf("    charset     :%d%n", t2 - t1);
    t1 = System.nanoTime() / 1000;
    cs.newDecoder();
    t2 = System.nanoTime() / 1000;
    System.out.printf("    new Decoder :%d%n", t2 - t1);

    t1 = System.nanoTime() / 1000;
    cs.newEncoder();
    t2 = System.nanoTime() / 1000;
    System.out.printf("    new Encoder :%d%n", t2 - t1);
  }
 public static void main(String[] args) throws Exception {
   // 创建简体中文对应的Charset
   Charset cn = Charset.forName("GBK");
   // 获取cn对象对应的编码器和解码器
   CharsetEncoder cnEncoder = cn.newEncoder();
   CharsetDecoder cnDecoder = cn.newDecoder();
   // 创建一个CharBuffer对象
   CharBuffer cbuff = CharBuffer.allocate(8);
   cbuff.put('孙');
   cbuff.put('悟');
   cbuff.put('空');
   cbuff.flip();
   // 将CharBuffer中的字符序列转换成字节序列
   ByteBuffer bbuff = cnEncoder.encode(cbuff);
   // 循环访问ByteBuffer中的每个字节
   for (int i = 0; i < bbuff.capacity(); i++) {
     System.out.print(bbuff.get(i) + " ");
   }
   // 将ByteBuffer的数据解码成字符序列
   System.out.println("\n" + cnDecoder.decode(bbuff));
 }
Beispiel #8
0
 static CoderResult encodeCR(char[] cc, Charset cs, boolean testDirect) throws Exception {
   ByteBuffer bbf;
   CharBuffer cbf;
   CharsetEncoder enc = cs.newEncoder();
   if (testDirect) {
     bbf = ByteBuffer.allocateDirect(cc.length * 4);
     cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
     cbf.put(cc).flip();
   } else {
     bbf = ByteBuffer.allocate(cc.length * 4);
     cbf = CharBuffer.wrap(cc);
   }
   CoderResult cr = null;
   for (int i = 0; i < iteration; i++) {
     cbf.rewind();
     bbf.clear();
     enc.reset();
     cr = enc.encode(cbf, bbf, true);
   }
   return cr;
 }
  public static void main(String[] args) throws Throwable {
    final int itrs = Integer.getInteger("iterations", 100000);
    // final int itrs = Integer.getInteger("iterations", 12);
    final int size = Integer.getInteger("size", 2048);
    final int subsize = Integer.getInteger("subsize", 128);
    final int maxchar = Integer.getInteger("maxchar", 128);
    final String regex = System.getProperty("filter");
    final Pattern filter = (regex == null) ? null : Pattern.compile(regex);
    final boolean useSecurityManager = Boolean.getBoolean("SecurityManager");
    if (useSecurityManager) System.setSecurityManager(new PermissiveSecurityManger());
    final Random rnd = new Random();

    String[] csns =
        new String[] {
          "Big5",
          "Johab",
          "EUC_CN",
          "EUC_KR",
          "MS932",
          "MS936",
          "MS949",
          "MS950",
          "GBK",
          "Big5_HKSCS",
          "Big5_HKSCS_2001",
          "Big5_Solaris",
          "MS950_HKSCS",
          "MS950_HKSCS_XP",
          "IBM1364",
          "IBM1381",
          "IBM1383",
          "IBM930",
          "IBM933",
          "IBM935",
          "IBM937",
          "IBM939",
          "IBM942",
          "IBM943",
          "IBM948",
          "IBM949",
          "IBM950",
          "IBM970",
        };

    ArrayList<long[]> sum = new ArrayList<>();

    for (final String csn : csns) {
      final Charset cs = Charset.forName(csn);
      List<Integer> cps = new ArrayList<>(0x4000);
      int off = 0;
      int cp = 0;
      int n = 0;
      CharsetEncoder enc = cs.newEncoder();
      while (cp < 0x10000 && n < cps.size()) {
        if (enc.canEncode((char) cp)) {
          cps.add(cp);
          n++;
        }
        cp++;
      }
      Collections.shuffle(cps);
      char[] ca = new char[cps.size()];
      for (int i = 0; i < cps.size(); i++) ca[i] = (char) (int) cps.get(i);

      System.out.printf("%n--------%s---------%n", csn);
      for (int sz = 8; sz <= 2048; sz *= 2) {
        System.out.printf("   [len=%d]%n", sz);

        final char[] chars = Arrays.copyOf(ca, sz);
        final String str = new String(chars);
        final byte[] bs = str.getBytes(cs);

        Job[] jobs = {
          new Job("String decode: csn") {
            public void work() throws Throwable {
              for (int i = 0; i < itrs; i++) new String(bs, csn);
            }
          },
          new Job("String decode: cs") {
            public void work() throws Throwable {
              for (int i = 0; i < itrs; i++) new String(bs, cs);
            }
          },
          new Job("String encode: csn") {
            public void work() throws Throwable {
              for (int i = 0; i < itrs; i++) str.getBytes(csn);
            }
          },
          new Job("String encode: cs") {
            public void work() throws Throwable {
              for (int i = 0; i < itrs; i++) str.getBytes(cs);
            }
          },
        };
        sum.add(time(jobs));
      }
    }
  }
  static void test(Charset cs, char[] bmpCA, byte[] sbBA) throws Throwable {
    String bmpStr = new String(bmpCA);
    CharsetDecoder dec =
        cs.newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    CharsetEncoder enc =
        cs.newEncoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);

    // getBytes(csn);
    byte[] baSC = bmpStr.getBytes(cs.name());
    ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA));
    byte[] baNIO = new byte[bf.limit()];
    bf.get(baNIO, 0, baNIO.length);
    if (!Arrays.equals(baSC, baNIO))
      throw new RuntimeException("getBytes(csn) failed  -> " + cs.name());

    // getBytes(cs);
    baSC = bmpStr.getBytes(cs);
    if (!Arrays.equals(baSC, baNIO))
      throw new RuntimeException("getBytes(cs) failed  -> " + cs.name());

    // new String(csn);
    String strSC = new String(sbBA, cs.name());
    String strNIO = dec.reset().decode(ByteBuffer.wrap(sbBA)).toString();

    if (!strNIO.equals(strSC))
      throw new RuntimeException("new String(csn) failed  -> " + cs.name());

    // new String(cs);
    strSC = new String(sbBA, cs);
    if (!strNIO.equals(strSC)) throw new RuntimeException("new String(cs) failed  -> " + cs.name());

    // encode unmappable surrogates
    if (enc instanceof sun.nio.cs.ArrayEncoder && cs.contains(Charset.forName("ASCII"))) {
      if (cs.name().equals("UTF-8")
          || // utf8 handles surrogates
          cs.name().equals("CESU-8")) // utf8 handles surrogates
      return;
      enc.replaceWith(new byte[] {(byte) 'A'});
      sun.nio.cs.ArrayEncoder cae = (sun.nio.cs.ArrayEncoder) enc;

      String str = "ab\uD800\uDC00\uD800\uDC00cd";
      byte[] ba = new byte[str.length() - 2];
      int n = cae.encode(str.toCharArray(), 0, str.length(), ba);
      if (n != 6 || !"abAAcd".equals(new String(ba, cs.name())))
        throw new RuntimeException("encode1(surrogates) failed  -> " + cs.name());

      ba = new byte[str.length()];
      n = cae.encode(str.toCharArray(), 0, str.length(), ba);
      if (n != 6 || !"abAAcd".equals(new String(ba, 0, n, cs.name())))
        throw new RuntimeException("encode2(surrogates) failed  -> " + cs.name());
      str = "ab\uD800B\uDC00Bcd";
      ba = new byte[str.length()];
      n = cae.encode(str.toCharArray(), 0, str.length(), ba);
      if (n != 8 || !"abABABcd".equals(new String(ba, 0, n, cs.name())))
        throw new RuntimeException("encode3(surrogates) failed  -> " + cs.name());
      /* sun.nio.cs.ArrayDeEncoder works on the assumption that the
         invoker (StringCoder) allocates enough output buf, utf8
         and double-byte coder does not check the output buffer limit.
      ba = new byte[str.length() - 1];
      n = cae.encode(str.toCharArray(), 0, str.length(), ba);
      if (n != 7 || !"abABABc".equals(new String(ba, 0, n, cs.name()))) {
          throw new RuntimeException("encode4(surrogates) failed  -> "
                                     + cs.name());
      }
      */
    }
  }