Beispiel #1
0
  public static void main(String args[]) throws Exception {
    String inputFile = "samplein.txt";
    String outputFile = "sampleout.txt";

    RandomAccessFile inf = new RandomAccessFile(inputFile, "r");
    RandomAccessFile outf = new RandomAccessFile(outputFile, "rw");
    long inputLength = new File(inputFile).length();

    FileChannel inc = inf.getChannel();
    FileChannel outc = outf.getChannel();

    MappedByteBuffer inputData = inc.map(FileChannel.MapMode.READ_ONLY, 0, inputLength);

    Charset latin1 = Charset.forName("ISO-8859-1");
    CharsetDecoder decoder = latin1.newDecoder();
    CharsetEncoder encoder = latin1.newEncoder();

    CharBuffer cb = decoder.decode(inputData);

    // Process char data here

    ByteBuffer outputData = encoder.encode(cb);

    outc.write(outputData);

    inf.close();
    outf.close();
  }
  /**
   * Creates a string in a specfied character set.
   *
   * @param value String constant, must not be null
   * @param charsetName Name of the character set, may be null
   * @param collation Collation, may be null
   * @throws IllegalCharsetNameException If the given charset name is illegal
   * @throws UnsupportedCharsetException If no support for the named charset is available in this
   *     instance of the Java virtual machine
   * @throws RuntimeException If the given value cannot be represented in the given charset
   */
  public NlsString(String value, String charsetName, SqlCollation collation) {
    assert value != null;
    if (null != charsetName) {
      charsetName = charsetName.toUpperCase();
      this.charsetName = charsetName;
      String javaCharsetName = SqlUtil.translateCharacterSetName(charsetName);
      if (javaCharsetName == null) {
        throw new UnsupportedCharsetException(charsetName);
      }
      this.charset = Charset.forName(javaCharsetName);
      CharsetEncoder encoder = charset.newEncoder();

      // dry run to see if encoding hits any problems
      try {
        encoder.encode(CharBuffer.wrap(value));
      } catch (CharacterCodingException ex) {
        throw RESOURCE.charsetEncoding(value, javaCharsetName).ex();
      }
    } else {
      this.charsetName = null;
      this.charset = null;
    }
    this.collation = collation;
    this.value = value;
  }
Beispiel #3
0
 static byte[] encode(char[] cc, Charset cs, boolean testDirect, Time t) throws Exception {
   ByteBuffer bbf;
   CharBuffer cbf;
   CharsetEncoder enc = cs.newEncoder();
   String csn = cs.name();
   if (testDirect) {
     bbf = ByteBuffer.allocateDirect(cc.length * 4);
     cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
     cbf.put(cc).flip();
   } else {
     bbf = ByteBuffer.allocate(cc.length * 4);
     cbf = CharBuffer.wrap(cc);
   }
   CoderResult cr = null;
   long t1 = System.nanoTime() / 1000;
   for (int i = 0; i < iteration; i++) {
     cbf.rewind();
     bbf.clear();
     enc.reset();
     cr = enc.encode(cbf, bbf, true);
   }
   long t2 = System.nanoTime() / 1000;
   t.t = (t2 - t1) / iteration;
   if (cr != CoderResult.UNDERFLOW) {
     System.out.println("ENC-----------------");
     int pos = cbf.position();
     System.out.printf("  cr=%s, cbf.pos=%d, cc[pos]=%x%n", cr.toString(), pos, cc[pos] & 0xffff);
     throw new RuntimeException("Encoding err: " + csn);
   }
   byte[] bb = new byte[bbf.position()];
   bbf.flip();
   bbf.get(bb);
   return bb;
 }
  /**
   * Write String into byte array
   *
   * <p>It will remove a trailing null terminator if exists if the option
   * RemoveTrailingTerminatorOnWrite has been set.
   *
   * @return the data as a byte array in format to write to file
   */
  public byte[] writeByteArray() {
    byte[] data;
    // Try and write to buffer using the CharSet defined by getTextEncodingCharSet()
    String charSetName = getTextEncodingCharSet();
    try {

      stripTrailingNull();

      // Special Handling because there is no UTF16 BOM LE charset
      String stringValue = (String) value;
      String actualCharSet = null;
      if (charSetName.equals(TextEncoding.CHARSET_UTF_16)) {
        if (TagOptionSingleton.getInstance().isEncodeUTF16BomAsLittleEndian()) {
          actualCharSet = TextEncoding.CHARSET_UTF_16_LE_ENCODING_FORMAT;
        } else {
          actualCharSet = TextEncoding.CHARSET_UTF_16_BE_ENCODING_FORMAT;
        }
      }

      // Ensure large enough for any encoding
      ByteBuffer outputBuffer = ByteBuffer.allocate((stringValue.length() + 3) * 3);

      // Ensure each string (if multiple values) is written with BOM by writing separately
      List<String> values = splitByNullSeperator(stringValue);
      checkTrailingNull(values, stringValue);

      // For each value
      for (int i = 0; i < values.size(); i++) {
        String next = values.get(i);
        if (actualCharSet != null) {
          if (actualCharSet.equals(TextEncoding.CHARSET_UTF_16_LE_ENCODING_FORMAT)) {
            outputBuffer.put(writeStringUTF16LEBOM(next, i, values.size()));
          } else if (actualCharSet.equals(TextEncoding.CHARSET_UTF_16_BE_ENCODING_FORMAT)) {
            outputBuffer.put(writeStringUTF16BEBOM(next, i, values.size()));
          }
        } else {
          CharsetEncoder charsetEncoder = Charset.forName(charSetName).newEncoder();
          charsetEncoder.onMalformedInput(CodingErrorAction.IGNORE);
          charsetEncoder.onUnmappableCharacter(CodingErrorAction.IGNORE);
          outputBuffer.put(writeString(charsetEncoder, next, i, values.size()));
        }
      }
      outputBuffer.flip();
      data = new byte[outputBuffer.limit()];
      outputBuffer.rewind();
      outputBuffer.get(data, 0, outputBuffer.limit());
      setSize(data.length);
    }
    // https://bitbucket.org/ijabz/jaudiotagger/issue/1/encoding-metadata-to-utf-16-can-fail-if
    catch (CharacterCodingException ce) {
      logger.severe(ce.getMessage() + ":" + charSetName + ":" + value);
      throw new RuntimeException(ce);
    }
    return data;
  }
Beispiel #5
0
  // check and compare canEncoding/Encoding
  static char[] checkEncoding(Charset oldCS, Charset newCS) throws Exception {
    System.out.printf("Encoding <%s> <%s>...%n", oldCS.name(), newCS.name());
    CharsetEncoder encOLD = oldCS.newEncoder();
    CharsetEncoder encNew = newCS.newEncoder();
    char[] cc = new char[0x10000];
    int pos = 0;
    boolean is970 = "x-IBM970-Old".equals(oldCS.name());

    for (char c = 0; c < 0xffff; c++) {
      boolean canOld = encOLD.canEncode(c);
      boolean canNew = encNew.canEncode(c);

      if (is970 && c == 0x2299) continue;

      if (canOld != canNew) {
        if (canNew) {
          System.out.printf("      NEW(only): ");
          printEntry(c, newCS);
        } else {
          if (is970) {
            byte[] bb = new String(new char[] {c}).getBytes(oldCS);
            if (bb.length == 2 && bb[0] == (byte) 0xa2 && bb[1] == (byte) 0xc1) {
              // we know 970 has bogus nnnn -> a2c1 -> 2299
              continue;
            }
          }
          System.out.printf("      OLD(only): ");
          printEntry(c, oldCS);
        }
      } else if (canNew) {
        byte[] bbNew = new String(new char[] {c}).getBytes(newCS);
        byte[] bbOld = new String(new char[] {c}).getBytes(oldCS);
        if (!Arrays.equals(bbNew, bbOld)) {
          System.out.printf("      c->b NEW: ");
          printEntry(c, newCS);
          System.out.printf("      c->b OLD: ");
          printEntry(c, oldCS);
        } else {
          String sNew = new String(bbNew, newCS);
          String sOld = new String(bbOld, oldCS);
          if (!sNew.equals(sOld)) {
            System.out.printf("      b2c NEW (c=%x):", c & 0xffff);
            printEntry(sNew.charAt(0), newCS);
            System.out.printf("      b2c OLD:");
            printEntry(sOld.charAt(0), oldCS);
          }
        }
      }
      if (canNew & canOld) { // added only both for now
        cc[pos++] = c;
      }
    }
    return Arrays.copyOf(cc, pos);
  }
  /**
   * Write String using specified encoding
   *
   * <p>When this is called multiple times, all but the last value has a trailing null
   *
   * @param encoder
   * @param next
   * @param i
   * @param noOfValues
   * @return
   * @throws CharacterCodingException
   */
  private ByteBuffer writeString(CharsetEncoder encoder, String next, int i, int noOfValues)
      throws CharacterCodingException {

    ByteBuffer bb;
    if ((i + 1) == noOfValues) {
      bb = encoder.encode(CharBuffer.wrap(next));
    } else {
      bb = encoder.encode(CharBuffer.wrap(next + '\0'));
    }
    bb.rewind();
    return bb;
  }
  static void testMixed(Charset cs) throws Throwable {
    CharsetDecoder dec =
        cs.newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    CharsetEncoder enc =
        cs.newEncoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    List<Integer> cps = new ArrayList<>(0x10000);
    int off = 0;
    int cp = 0;
    while (cp < 0x10000) {
      if (enc.canEncode((char) cp)) {
        cps.add(cp);
      }
      cp++;
    }
    Collections.shuffle(cps);
    char[] bmpCA = new char[cps.size()];
    for (int i = 0; i < cps.size(); i++) bmpCA[i] = (char) (int) cps.get(i);
    String bmpStr = new String(bmpCA);
    // getBytes(csn);
    byte[] bmpBA = bmpStr.getBytes(cs.name());
    ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA));
    byte[] baNIO = new byte[bf.limit()];
    bf.get(baNIO, 0, baNIO.length);
    if (!Arrays.equals(bmpBA, baNIO)) {
      throw new RuntimeException("getBytes(csn) failed  -> " + cs.name());
    }

    // getBytes(cs);
    bmpBA = bmpStr.getBytes(cs);
    if (!Arrays.equals(bmpBA, baNIO))
      throw new RuntimeException("getBytes(cs) failed  -> " + cs.name());

    // new String(csn);
    String strSC = new String(bmpBA, cs.name());
    String strNIO = dec.reset().decode(ByteBuffer.wrap(bmpBA)).toString();
    if (!strNIO.equals(strSC)) {
      throw new RuntimeException("new String(csn) failed  -> " + cs.name());
    }

    // new String(cs);
    strSC = new String(bmpBA, cs);
    if (!strNIO.equals(strSC)) throw new RuntimeException("new String(cs) failed  -> " + cs.name());
  }
  /**
   * Write String in UTF-BEBOM format
   *
   * <p>When this is called multiple times, all but the last value has a trailing null
   *
   * @param next
   * @param i
   * @param noOfValues
   * @return
   * @throws CharacterCodingException
   */
  private ByteBuffer writeStringUTF16BEBOM(String next, int i, int noOfValues)
      throws CharacterCodingException {
    CharsetEncoder encoder =
        Charset.forName(TextEncoding.CHARSET_UTF_16_BE_ENCODING_FORMAT).newEncoder();
    encoder.onMalformedInput(CodingErrorAction.IGNORE);
    encoder.onUnmappableCharacter(CodingErrorAction.IGNORE);

    ByteBuffer bb = null;
    // Add BOM
    if ((i + 1) == noOfValues) {
      bb = encoder.encode(CharBuffer.wrap('\ufeff' + next));
    } else {
      bb = encoder.encode(CharBuffer.wrap('\ufeff' + next + '\0'));
    }
    bb.rewind();
    return bb;
  }
Beispiel #9
0
 public static void main(String args[]) throws Exception {
   String s = "abc\uD800\uDC00qrst"; // Valid surrogate
   char[] c = s.toCharArray();
   CharsetEncoder enc =
       Charset.forName("ISO8859_1").newEncoder().onUnmappableCharacter(CodingErrorAction.REPLACE);
   /* Process the first 4 characters, including the high surrogate
   which should be stored */
   ByteBuffer bb = ByteBuffer.allocate(10);
   CharBuffer cb = CharBuffer.wrap(c);
   cb.limit(4);
   enc.encode(cb, bb, false);
   cb.limit(7);
   enc.encode(cb, bb, true);
   byte[] first = bb.array();
   for (int i = 0; i < 7; i++)
     System.err.printf("[%d]=%d was %d\n", i, (int) first[i] & 0xffff, (int) c[i] & 0xffff);
 }
 public static void main(String[] args) throws Exception {
   // 创建简体中文对应的Charset
   Charset cn = Charset.forName("GBK");
   // 获取cn对象对应的编码器和解码器
   CharsetEncoder cnEncoder = cn.newEncoder();
   CharsetDecoder cnDecoder = cn.newDecoder();
   // 创建一个CharBuffer对象
   CharBuffer cbuff = CharBuffer.allocate(8);
   cbuff.put('孙');
   cbuff.put('悟');
   cbuff.put('空');
   cbuff.flip();
   // 将CharBuffer中的字符序列转换成字节序列
   ByteBuffer bbuff = cnEncoder.encode(cbuff);
   // 循环访问ByteBuffer中的每个字节
   for (int i = 0; i < bbuff.capacity(); i++) {
     System.out.print(bbuff.get(i) + " ");
   }
   // 将ByteBuffer的数据解码成字符序列
   System.out.println("\n" + cnDecoder.decode(bbuff));
 }
Beispiel #11
0
 static CoderResult encodeCR(char[] cc, Charset cs, boolean testDirect) throws Exception {
   ByteBuffer bbf;
   CharBuffer cbf;
   CharsetEncoder enc = cs.newEncoder();
   if (testDirect) {
     bbf = ByteBuffer.allocateDirect(cc.length * 4);
     cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
     cbf.put(cc).flip();
   } else {
     bbf = ByteBuffer.allocate(cc.length * 4);
     cbf = CharBuffer.wrap(cc);
   }
   CoderResult cr = null;
   for (int i = 0; i < iteration; i++) {
     cbf.rewind();
     bbf.clear();
     enc.reset();
     cr = enc.encode(cbf, bbf, true);
   }
   return cr;
 }
  public static void main(String[] args) throws Throwable {
    final int itrs = Integer.getInteger("iterations", 100000);
    // final int itrs = Integer.getInteger("iterations", 12);
    final int size = Integer.getInteger("size", 2048);
    final int subsize = Integer.getInteger("subsize", 128);
    final int maxchar = Integer.getInteger("maxchar", 128);
    final String regex = System.getProperty("filter");
    final Pattern filter = (regex == null) ? null : Pattern.compile(regex);
    final boolean useSecurityManager = Boolean.getBoolean("SecurityManager");
    if (useSecurityManager) System.setSecurityManager(new PermissiveSecurityManger());
    final Random rnd = new Random();

    String[] csns =
        new String[] {
          "Big5",
          "Johab",
          "EUC_CN",
          "EUC_KR",
          "MS932",
          "MS936",
          "MS949",
          "MS950",
          "GBK",
          "Big5_HKSCS",
          "Big5_HKSCS_2001",
          "Big5_Solaris",
          "MS950_HKSCS",
          "MS950_HKSCS_XP",
          "IBM1364",
          "IBM1381",
          "IBM1383",
          "IBM930",
          "IBM933",
          "IBM935",
          "IBM937",
          "IBM939",
          "IBM942",
          "IBM943",
          "IBM948",
          "IBM949",
          "IBM950",
          "IBM970",
        };

    ArrayList<long[]> sum = new ArrayList<>();

    for (final String csn : csns) {
      final Charset cs = Charset.forName(csn);
      List<Integer> cps = new ArrayList<>(0x4000);
      int off = 0;
      int cp = 0;
      int n = 0;
      CharsetEncoder enc = cs.newEncoder();
      while (cp < 0x10000 && n < cps.size()) {
        if (enc.canEncode((char) cp)) {
          cps.add(cp);
          n++;
        }
        cp++;
      }
      Collections.shuffle(cps);
      char[] ca = new char[cps.size()];
      for (int i = 0; i < cps.size(); i++) ca[i] = (char) (int) cps.get(i);

      System.out.printf("%n--------%s---------%n", csn);
      for (int sz = 8; sz <= 2048; sz *= 2) {
        System.out.printf("   [len=%d]%n", sz);

        final char[] chars = Arrays.copyOf(ca, sz);
        final String str = new String(chars);
        final byte[] bs = str.getBytes(cs);

        Job[] jobs = {
          new Job("String decode: csn") {
            public void work() throws Throwable {
              for (int i = 0; i < itrs; i++) new String(bs, csn);
            }
          },
          new Job("String decode: cs") {
            public void work() throws Throwable {
              for (int i = 0; i < itrs; i++) new String(bs, cs);
            }
          },
          new Job("String encode: csn") {
            public void work() throws Throwable {
              for (int i = 0; i < itrs; i++) str.getBytes(csn);
            }
          },
          new Job("String encode: cs") {
            public void work() throws Throwable {
              for (int i = 0; i < itrs; i++) str.getBytes(cs);
            }
          },
        };
        sum.add(time(jobs));
      }
    }
  }
  static void test(Charset cs, char[] bmpCA, byte[] sbBA) throws Throwable {
    String bmpStr = new String(bmpCA);
    CharsetDecoder dec =
        cs.newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    CharsetEncoder enc =
        cs.newEncoder()
            .onMalformedInput(CodingErrorAction.REPLACE)
            .onUnmappableCharacter(CodingErrorAction.REPLACE);

    // getBytes(csn);
    byte[] baSC = bmpStr.getBytes(cs.name());
    ByteBuffer bf = enc.reset().encode(CharBuffer.wrap(bmpCA));
    byte[] baNIO = new byte[bf.limit()];
    bf.get(baNIO, 0, baNIO.length);
    if (!Arrays.equals(baSC, baNIO))
      throw new RuntimeException("getBytes(csn) failed  -> " + cs.name());

    // getBytes(cs);
    baSC = bmpStr.getBytes(cs);
    if (!Arrays.equals(baSC, baNIO))
      throw new RuntimeException("getBytes(cs) failed  -> " + cs.name());

    // new String(csn);
    String strSC = new String(sbBA, cs.name());
    String strNIO = dec.reset().decode(ByteBuffer.wrap(sbBA)).toString();

    if (!strNIO.equals(strSC))
      throw new RuntimeException("new String(csn) failed  -> " + cs.name());

    // new String(cs);
    strSC = new String(sbBA, cs);
    if (!strNIO.equals(strSC)) throw new RuntimeException("new String(cs) failed  -> " + cs.name());

    // encode unmappable surrogates
    if (enc instanceof sun.nio.cs.ArrayEncoder && cs.contains(Charset.forName("ASCII"))) {
      if (cs.name().equals("UTF-8")
          || // utf8 handles surrogates
          cs.name().equals("CESU-8")) // utf8 handles surrogates
      return;
      enc.replaceWith(new byte[] {(byte) 'A'});
      sun.nio.cs.ArrayEncoder cae = (sun.nio.cs.ArrayEncoder) enc;

      String str = "ab\uD800\uDC00\uD800\uDC00cd";
      byte[] ba = new byte[str.length() - 2];
      int n = cae.encode(str.toCharArray(), 0, str.length(), ba);
      if (n != 6 || !"abAAcd".equals(new String(ba, cs.name())))
        throw new RuntimeException("encode1(surrogates) failed  -> " + cs.name());

      ba = new byte[str.length()];
      n = cae.encode(str.toCharArray(), 0, str.length(), ba);
      if (n != 6 || !"abAAcd".equals(new String(ba, 0, n, cs.name())))
        throw new RuntimeException("encode2(surrogates) failed  -> " + cs.name());
      str = "ab\uD800B\uDC00Bcd";
      ba = new byte[str.length()];
      n = cae.encode(str.toCharArray(), 0, str.length(), ba);
      if (n != 8 || !"abABABcd".equals(new String(ba, 0, n, cs.name())))
        throw new RuntimeException("encode3(surrogates) failed  -> " + cs.name());
      /* sun.nio.cs.ArrayDeEncoder works on the assumption that the
         invoker (StringCoder) allocates enough output buf, utf8
         and double-byte coder does not check the output buffer limit.
      ba = new byte[str.length() - 1];
      n = cae.encode(str.toCharArray(), 0, str.length(), ba);
      if (n != 7 || !"abABABc".equals(new String(ba, 0, n, cs.name()))) {
          throw new RuntimeException("encode4(surrogates) failed  -> "
                                     + cs.name());
      }
      */
    }
  }
 @Override
 public void visitFile(PsiFile file) {
   super.visitFile(file);
   if (InjectedLanguageManager.getInstance(file.getProject()).isInjectedFragment(file)
       || !file.isPhysical()) {
     return;
   }
   final VirtualFile virtualFile = file.getVirtualFile();
   final String text = file.getText();
   final Charset charset =
       LoadTextUtil.extractCharsetFromFileContent(file.getProject(), virtualFile, text);
   final CharsetEncoder encoder =
       charset.newEncoder().onUnmappableCharacter(CodingErrorAction.REPORT);
   final CharBuffer charBuffer = CharBuffer.allocate(1);
   final ByteBuffer byteBuffer = ByteBuffer.allocate(10);
   final int length = text.length();
   for (int i = 0; i < length; i++) {
     final char c = text.charAt(i);
     if (c != '\\') {
       continue;
     }
     boolean isEscape = true;
     int previousChar = i - 1;
     while (previousChar >= 0 && text.charAt(previousChar) == '\\') {
       isEscape = !isEscape;
       previousChar--;
     }
     if (!isEscape) {
       continue;
     }
     int nextChar = i;
     do {
       nextChar++;
       if (nextChar >= length) {
         break;
       }
     } while (text.charAt(nextChar) == 'u'); // \uuuu0061 is a legal unicode escape
     if (nextChar == i + 1 || nextChar + 3 >= length) {
       continue;
     }
     if (StringUtil.isHexDigit(text.charAt(nextChar))
         && StringUtil.isHexDigit(text.charAt(nextChar + 1))
         && StringUtil.isHexDigit(text.charAt(nextChar + 2))
         && StringUtil.isHexDigit(text.charAt(nextChar + 3))) {
       final int escapeEnd = nextChar + 4;
       final char d = (char) Integer.parseInt(text.substring(nextChar, escapeEnd), 16);
       if (Character.isISOControl(d)) {
         continue;
       }
       byteBuffer.clear();
       charBuffer.clear();
       charBuffer.put(d).rewind();
       final CoderResult coderResult = encoder.encode(charBuffer, byteBuffer, true);
       if (!coderResult.isUnmappable()) {
         final PsiElement element = file.findElementAt(i);
         if (element != null && isSuppressedFor(element)) {
           return;
         }
         registerErrorAtOffset(file, i, escapeEnd - i, Character.valueOf(d));
       }
     }
   }
 }