@Test public void verifyNullByteArray() throws Exception { for (Map.Entry<String, Charset> entry : CharsetUtil.getCharsetMap().entrySet()) { // test that the byte array wasn't changed Assert.assertEquals( "Charset " + entry.getKey() + " impl bad -- did not return null", "", CharsetUtil.decode(null, entry.getValue())); } }
@Test public void verifyDecodeDoesNotChangeByteArray() throws Exception { for (Map.Entry<String, Charset> entry : CharsetUtil.getCharsetMap().entrySet()) { byte[] bytes = new byte[] {(byte) 0x40, (byte) 0x5F, (byte) 0x24, (byte) 0x78, (byte) 0x02}; byte[] expectedBytes = Arrays.copyOf(bytes, bytes.length); String str0 = CharsetUtil.decode(bytes, entry.getValue()); // test that the byte array wasn't changed Assert.assertArrayEquals( "Charset " + entry.getKey() + " impl bad -- modified byte array parameter", expectedBytes, bytes); } }
@Test public void normalize() throws Exception { String in = null; // try every charset with simple A-Z, a-z, and 0-9 which should work // in all charsets in = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefjhijklmnopqrstuvwxyz01234567890?&@"; for (Map.Entry<String, Charset> entry : CharsetUtil.getCharsetMap().entrySet()) { Assert.assertEquals( "Charset " + entry.getKey() + " implementation broken", in, CharsetUtil.normalize(in, entry.getValue())); } in = "\u20AC"; // euro currency char (only supported in a couple charsets) Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_GSM)); Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_PACKED_GSM)); Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_AIRWIDE_GSM)); Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_VFD2_GSM)); Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_VFTR_GSM)); Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_ISO_8859_1)); Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_ISO_8859_15)); Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_UCS_2)); Assert.assertEquals("\u20AC", CharsetUtil.normalize(in, CharsetUtil.CHARSET_UTF_8)); in = "\u6025"; // arabic char (only supported in a couple charsets) Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_GSM)); Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_PACKED_GSM)); Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_AIRWIDE_GSM)); Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_VFD2_GSM)); Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_VFTR_GSM)); Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_ISO_8859_1)); Assert.assertEquals("?", CharsetUtil.normalize(in, CharsetUtil.CHARSET_ISO_8859_15)); Assert.assertEquals("\u6025", CharsetUtil.normalize(in, CharsetUtil.CHARSET_UCS_2)); Assert.assertEquals("\u6025", CharsetUtil.normalize(in, CharsetUtil.CHARSET_UTF_8)); }
@Test public void encode() throws Exception { // euro currency symbol String str0 = "\u20ac"; byte[] bytes = null; bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_GSM); Assert.assertArrayEquals(HexUtil.toByteArray("1B65"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_PACKED_GSM); Assert.assertArrayEquals(HexUtil.toByteArray("9B32"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_UCS_2); Assert.assertArrayEquals(HexUtil.toByteArray("20AC"), bytes); Assert.assertArrayEquals(str0.getBytes("UTF-16BE"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_UTF_8); Assert.assertArrayEquals(HexUtil.toByteArray("E282AC"), bytes); Assert.assertArrayEquals(str0.getBytes("UTF-8"), bytes); // latin-1 doesn't contain the euro symbol - replace with '?' bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_ISO_8859_1); Assert.assertArrayEquals(HexUtil.toByteArray("3F"), bytes); Assert.assertArrayEquals(str0.getBytes("ISO-8859-1"), bytes); // latin-9 does contain the euro symbol bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_ISO_8859_15); Assert.assertArrayEquals(HexUtil.toByteArray("A4"), bytes); Assert.assertArrayEquals(str0.getBytes("ISO-8859-15"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_AIRWIDE_IA5); Assert.assertArrayEquals(HexUtil.toByteArray("1B65"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_VFD2_GSM); Assert.assertArrayEquals(HexUtil.toByteArray("1B65"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_VFTR_GSM); Assert.assertArrayEquals(HexUtil.toByteArray("1B65"), bytes); // longer string with @ symbol in-between str0 = "Hello @ World"; bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_GSM); // logger.debug(HexUtil.toHexString(bytes)); Assert.assertArrayEquals(HexUtil.toByteArray("48656C6C6F200020576F726C64"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_PACKED_GSM); // logger.debug(HexUtil.toHexString(bytes)); Assert.assertArrayEquals(HexUtil.toByteArray("C8329BFD060140D7B79C4D06"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_UCS_2); // logger.debug(HexUtil.toHexString(bytes)); Assert.assertArrayEquals( HexUtil.toByteArray("00480065006C006C006F0020004000200057006F0072006C0064"), bytes); Assert.assertArrayEquals(str0.getBytes("UTF-16BE"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_UTF_8); // logger.debug(HexUtil.toHexString(bytes)); Assert.assertArrayEquals(HexUtil.toByteArray("48656C6C6F204020576F726C64"), bytes); Assert.assertArrayEquals(str0.getBytes("UTF-8"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_ISO_8859_1); Assert.assertArrayEquals(HexUtil.toByteArray("48656C6C6F204020576F726C64"), bytes); Assert.assertArrayEquals(str0.getBytes("ISO-8859-1"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_ISO_8859_15); Assert.assertArrayEquals(HexUtil.toByteArray("48656C6C6F204020576F726C64"), bytes); Assert.assertArrayEquals(str0.getBytes("ISO-8859-15"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_AIRWIDE_IA5); Assert.assertArrayEquals(HexUtil.toByteArray("48656C6C6F200020576F726C64"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_VFD2_GSM); // logger.debug(HexUtil.toHexString(bytes)); Assert.assertArrayEquals(HexUtil.toByteArray("48656C6C6F204020576F726C64"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_VFTR_GSM); // logger.debug(HexUtil.toHexString(bytes)); Assert.assertArrayEquals(HexUtil.toByteArray("48656C6C6F204020576F726C64"), bytes); // longer string with @ symbol in-between str0 = "JoeyBlue"; bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_GSM); // logger.debug(HexUtil.toHexString(bytes)); Assert.assertArrayEquals(HexUtil.toByteArray("4A6F6579426C7565"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_PACKED_GSM); // logger.debug(HexUtil.toHexString(bytes)); Assert.assertArrayEquals(HexUtil.toByteArray("CA77392F64D7CB"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_UCS_2); // logger.debug(HexUtil.toHexString(bytes)); Assert.assertArrayEquals(HexUtil.toByteArray("004A006F006500790042006C00750065"), bytes); Assert.assertArrayEquals(str0.getBytes("UTF-16BE"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_UTF_8); // logger.debug(HexUtil.toHexString(bytes)); Assert.assertArrayEquals(HexUtil.toByteArray("4A6F6579426C7565"), bytes); Assert.assertArrayEquals(str0.getBytes("UTF-8"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_ISO_8859_1); Assert.assertArrayEquals(HexUtil.toByteArray("4A6F6579426C7565"), bytes); Assert.assertArrayEquals(str0.getBytes("ISO-8859-1"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_ISO_8859_15); Assert.assertArrayEquals(HexUtil.toByteArray("4A6F6579426C7565"), bytes); Assert.assertArrayEquals(str0.getBytes("ISO-8859-15"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_AIRWIDE_IA5); Assert.assertArrayEquals(HexUtil.toByteArray("4A6F6579426C7565"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_VFD2_GSM); Assert.assertArrayEquals(HexUtil.toByteArray("4A6F6579426C7565"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_VFTR_GSM); Assert.assertArrayEquals(HexUtil.toByteArray("4A6F6579426C7565"), bytes); // longer string with @ symbol in-between str0 = "{}[]$"; bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_GSM); // logger.debug(HexUtil.toHexString(bytes)); Assert.assertArrayEquals(HexUtil.toByteArray("1B281B291B3C1B3E02"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_PACKED_GSM); // logger.debug(HexUtil.toHexString(bytes)); Assert.assertArrayEquals(HexUtil.toByteArray("1BD426B5E16D7C02"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_UCS_2); // logger.debug(HexUtil.toHexString(bytes)); Assert.assertArrayEquals(HexUtil.toByteArray("007B007D005B005D0024"), bytes); Assert.assertArrayEquals(str0.getBytes("UTF-16BE"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_UTF_8); // logger.debug(HexUtil.toHexString(bytes)); Assert.assertArrayEquals(HexUtil.toByteArray("7B7D5B5D24"), bytes); Assert.assertArrayEquals(str0.getBytes("UTF-8"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_ISO_8859_1); Assert.assertArrayEquals(HexUtil.toByteArray("7B7D5B5D24"), bytes); Assert.assertArrayEquals(str0.getBytes("ISO-8859-1"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_ISO_8859_15); Assert.assertArrayEquals(HexUtil.toByteArray("7B7D5B5D24"), bytes); Assert.assertArrayEquals(str0.getBytes("ISO-8859-15"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_AIRWIDE_IA5); Assert.assertArrayEquals(HexUtil.toByteArray("1B281B291B3C1B3E02"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_VFD2_GSM); Assert.assertArrayEquals(HexUtil.toByteArray("1B281B291B3C1B3E24"), bytes); bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_VFTR_GSM); Assert.assertArrayEquals(HexUtil.toByteArray("1B281B291B3C1B3E24"), bytes); // chars specifically to vodafone-turkey // str0 = "$@£¤¥§ÄÅßñΓΔΘΩ€"; str0 = "$@\u00a3\u00a4\u00a5\u00a7\u00c4\u00c5\u00df\u00f1\u0393\u0394\u0398\u03a9\u20ac"; bytes = CharsetUtil.encode(str0, CharsetUtil.CHARSET_VFTR_GSM); Assert.assertArrayEquals(HexUtil.toByteArray("2440A3A4A5A7C4C5DFF1137F19151B65"), bytes); }
@Test public void decode() throws Exception { // euro currency symbol String str0 = "\u20ac"; String str1 = null; str1 = CharsetUtil.decode(HexUtil.toByteArray("1B65"), CharsetUtil.CHARSET_GSM); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("9B32"), CharsetUtil.CHARSET_PACKED_GSM); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("20AC"), CharsetUtil.CHARSET_UCS_2); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("E282AC"), CharsetUtil.CHARSET_UTF_8); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("1B65"), CharsetUtil.CHARSET_AIRWIDE_IA5); Assert.assertEquals(str0, str1); // latin-1 doesn't contain the euro symbol - replace with '?' // str1 = CharsetUtil.decode(HexUtil.toByteArray("3F"), CharsetUtil.CHARSET_ISO_8859_1); // Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("A4"), CharsetUtil.CHARSET_ISO_8859_15); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("1B65"), CharsetUtil.CHARSET_VFD2_GSM); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("1B65"), CharsetUtil.CHARSET_VFTR_GSM); Assert.assertEquals(str0, str1); // longer string with @ symbol in-between str0 = "Hello @ World"; str1 = CharsetUtil.decode( HexUtil.toByteArray("48656C6C6F200020576F726C64"), CharsetUtil.CHARSET_GSM); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode( HexUtil.toByteArray("C8329BFD060140D7B79C4D06"), CharsetUtil.CHARSET_PACKED_GSM); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode( HexUtil.toByteArray("00480065006C006C006F0020004000200057006F0072006C0064"), CharsetUtil.CHARSET_UCS_2); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode( HexUtil.toByteArray("48656C6C6F204020576F726C64"), CharsetUtil.CHARSET_UTF_8); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode( HexUtil.toByteArray("48656C6C6F204020576F726C64"), CharsetUtil.CHARSET_ISO_8859_1); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode( HexUtil.toByteArray("48656C6C6F204020576F726C64"), CharsetUtil.CHARSET_ISO_8859_15); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode( HexUtil.toByteArray("48656C6C6F204020576F726C64"), CharsetUtil.CHARSET_AIRWIDE_IA5); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode( HexUtil.toByteArray("48656C6C6F204020576F726C64"), CharsetUtil.CHARSET_VFD2_GSM); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode( HexUtil.toByteArray("48656C6C6F204020576F726C64"), CharsetUtil.CHARSET_VFTR_GSM); Assert.assertEquals(str0, str1); // longer string with @ symbol in-between str0 = "JoeyBlue"; str1 = CharsetUtil.decode(HexUtil.toByteArray("4A6F6579426C7565"), CharsetUtil.CHARSET_GSM); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("CA77392F64D7CB"), CharsetUtil.CHARSET_PACKED_GSM); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode( HexUtil.toByteArray("004A006F006500790042006C00750065"), CharsetUtil.CHARSET_UCS_2); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("4A6F6579426C7565"), CharsetUtil.CHARSET_UTF_8); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("4A6F6579426C7565"), CharsetUtil.CHARSET_ISO_8859_1); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode( HexUtil.toByteArray("4A6F6579426C7565"), CharsetUtil.CHARSET_ISO_8859_15); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode( HexUtil.toByteArray("4A6F6579426C7565"), CharsetUtil.CHARSET_AIRWIDE_IA5); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("4A6F6579426C7565"), CharsetUtil.CHARSET_VFD2_GSM); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("4A6F6579426C7565"), CharsetUtil.CHARSET_VFTR_GSM); Assert.assertEquals(str0, str1); // longer string with @ symbol in-between str0 = "{}[]$"; str1 = CharsetUtil.decode(HexUtil.toByteArray("1B281B291B3C1B3E02"), CharsetUtil.CHARSET_GSM); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("1BD426B5E16D7C02"), CharsetUtil.CHARSET_PACKED_GSM); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("007B007D005B005D0024"), CharsetUtil.CHARSET_UCS_2); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("7B7D5B5D24"), CharsetUtil.CHARSET_UTF_8); Assert.assertEquals(str0, str1); // airwide is close to GSM, $ is 0x24 rather than 0x02 though str1 = CharsetUtil.decode( HexUtil.toByteArray("1B281B291B3C1B3E24"), CharsetUtil.CHARSET_AIRWIDE_IA5); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("7B7D5B5D24"), CharsetUtil.CHARSET_ISO_8859_1); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("7B7D5B5D24"), CharsetUtil.CHARSET_ISO_8859_15); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("1B281B291B3C1B3E24"), CharsetUtil.CHARSET_VFD2_GSM); Assert.assertEquals(str0, str1); str1 = CharsetUtil.decode(HexUtil.toByteArray("1B281B291B3C1B3E24"), CharsetUtil.CHARSET_VFTR_GSM); Assert.assertEquals(str0, str1); // had problem passing these tests on linux vs. mac os x -- issue with // byte encoding on differnet platforms, replaced tests with source strings // that use Java unicode escapes // helpful URL: http://www.greywyvern.com/code/php/utf8_html // decode a string with every char in VFD2-GSM // str0 = "@$ߤ¡Ñܧñü_"; str0 = "@$\u00df\u00a4\u00a1\u00d1\u00dc\u00a7\u00f1\u00fc_"; str1 = CharsetUtil.decode( HexUtil.toByteArray("40247E02A15F5D5E1E7D11"), CharsetUtil.CHARSET_VFD2_GSM); Assert.assertEquals(str0, str1); // str0 = "@$ß$@ÑÜ_ñü_"; str0 = "@$\u00df$@\u00d1\u00dc_\u00f1\u00fc_"; str1 = CharsetUtil.decode( HexUtil.toByteArray("40241E24405D5E5F7D7E5F"), CharsetUtil.CHARSET_AIRWIDE_IA5); Assert.assertEquals(str0, str1); // chars specifically to vodafone-turkey // str0 = "$@£¤¥§ÄÅßñΓΔΘΩ€"; str0 = "$@\u00a3\u00a4\u00a5\u00a7\u00c4\u00c5\u00df\u00f1\u0393\u0394\u0398\u03a9\u20ac"; str1 = CharsetUtil.decode( HexUtil.toByteArray("2440A3A4A5A7C4C5DFF1137F19151B65"), CharsetUtil.CHARSET_VFTR_GSM); Assert.assertEquals(str0, str1); }