Esempio n. 1
0
 /**
  * Decodes the remaining bytes of the given (pseudo-)UTF-8 buffer, handing every decoded character
  * to the visitor in order.
  *
  * <p>The buffer is read with relative {@code get()} calls, so its position is advanced past every
  * byte that was consumed. One-, two- and three-byte encodings are understood; any lead byte of
  * 0xE0 or above is decoded as a three-byte sequence (four-byte encodings are therefore rejected
  * by the marker-bit check when STRICTLY_CHECK_FORMAT is set).
  *
  * <p>The acceptable input formats are controlled by the STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8,
  * and ALLOW_PSEUDO_UTF8 flags. Note that for multi-byte sequences the visitor is called
  * <em>before</em> the strict format checks run, so it may already have received a character by
  * the time the exception for that character is raised.
  *
  * @param utf8 buffer holding (pseudo-)utf8 bytes between its position and limit
  * @param visitor called once for each decoded character
  * @throws UTFDataFormatException if the bytes are not valid (pseudo-)utf8, including when the
  *     buffer ends in the middle of a multi-byte sequence
  */
 @Inline
 private static void visitUTF8(ByteBuffer utf8, UTF8CharacterVisitor visitor)
     throws UTFDataFormatException {
   while (utf8.hasRemaining()) {
     byte b = utf8.get();
     if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) {
       if (b == 0) {
         throwDataFormatException("0 byte encountered", utf8.position() - 1);
       }
     }
     if (b >= 0) { // < 0x80 unsigned
       // in the range '\001' to '\177'
       visitor.visit_char((char) b);
       continue;
     }

     // A multi-byte sequence needs at least one continuation byte. ByteBuffer.get() signals
     // exhaustion with BufferUnderflowException (not ArrayIndexOutOfBoundsException), so test for
     // remaining input explicitly instead of relying on an exception handler.
     if (!utf8.hasRemaining()) {
       throwDataFormatException("unexpected end", utf8.position());
       return; // only reached if the helper does not throw; nothing is left to decode anyway
     }
     byte nb = utf8.get();

     if (b < -32) { // < 0xe0 unsigned
       // '\000' or in the range '\200' to '\u07FF'
       char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
       visitor.visit_char(c);
       if (STRICTLY_CHECK_FORMAT) {
         if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
           throwDataFormatException(
               "invalid marker bits for double byte char", utf8.position() - 2);
         }
         // Overlong encoding: only the two-byte form of '\000' is tolerated, and only in
         // pseudo-UTF-8 mode. (No upper-bound check is needed: 5 + 6 payload bits cannot
         // exceed '\u07FF'.)
         if (c < '\200') {
           if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
             throwDataFormatException(
                 "encountered double byte char that should have been single byte",
                 utf8.position() - 2);
           }
         }
       }
     } else {
       // Three-byte form: a second continuation byte is required.
       if (!utf8.hasRemaining()) {
         throwDataFormatException("unexpected end", utf8.position());
         return; // only reached if the helper does not throw; nothing is left to decode anyway
       }
       byte nnb = utf8.get();
       // in the range '\u0800' to '\uFFFF'
       char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
       visitor.visit_char(c);
       if (STRICTLY_CHECK_FORMAT) {
         if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
           throwDataFormatException(
               "invalid marker bits for triple byte char", utf8.position() - 3);
         }
         if (c < '\u0800') {
           throwDataFormatException(
               "encountered triple byte char that should have been fewer bytes",
               utf8.position() - 3);
         }
       }
     }
   }
 }
Esempio n. 2
0
 /**
  * Decodes the remaining content of a (pseudo-)utf8 buffer into a String.
  *
  * <p>Which input formats are accepted is governed by the STRICTLY_CHECK_FORMAT,
  * ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8 flags. Decoding reads the buffer with relative gets,
  * so the buffer's position is advanced as bytes are consumed.
  *
  * @param utf8 buffer of (pseudo-)utf8 bytes; everything between its position and limit is read
  * @throws UTFDataFormatException if the buffer content is not valid (pseudo-)utf8
  * @return the decoded unicode string
  */
 public static String fromUTF8(ByteBuffer utf8) throws UTFDataFormatException {
   // The visitor is constructed with the number of input bytes (presumably a capacity bound:
   // each decoded char consumes at least one byte).
   final int byteCount = utf8.remaining();
   final UTF8CharacterVisitor decoded = new ByteBufferStringEncoderVisitor(byteCount);
   visitUTF8(utf8, decoded);
   final String result = decoded.toString();
   return result;
 }
Esempio n. 3
0
 /**
  * Decodes a (pseudo-)utf8 byte array into a String.
  *
  * <p>Which input formats are accepted is governed by the STRICTLY_CHECK_FORMAT,
  * ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8 flags.
  *
  * @param utf8 array of (pseudo-)utf8 bytes
  * @throws UTFDataFormatException if the array content is not valid (pseudo-)utf8
  * @return the decoded unicode string
  */
 public static String fromUTF8(byte[] utf8) throws UTFDataFormatException {
   // The visitor is constructed with the input length (presumably a capacity bound: each
   // decoded char consumes at least one byte).
   final int byteCount = utf8.length;
   final UTF8CharacterVisitor decoded = new ByteArrayStringEncoderVisitor(byteCount);
   visitUTF8(utf8, decoded);
   final String result = decoded.toString();
   return result;
 }