/** * Converts appropriate Unicode characters to their HTML character entity counterparts (c.f., <a * href="http://www.w3.org/TR/REC-html40/sgml/entities.html">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>). * * @param s the string to convert * @return the resulting, possibly modified, string * @see #stripHTMLTags * @see #convertCharacterEntities */ public static String makeCharacterEntities(String s) { // First, make a character-to-entity-name map from the resource bundle. ResourceBundle bundle = getResourceBundle(); Map<Character, String> charToEntityName = new HashMap<Character, String>(); Enumeration<String> keys = bundle.getKeys(); XStringBuffer buf = new XStringBuffer(); while (keys.hasMoreElements()) { String key = keys.nextElement(); String sChar = bundle.getString(key); char c = sChar.charAt(0); // Transform the bundle key into an entity name by removing the // "html_" prefix. buf.clear(); buf.append(key); buf.delete("html_"); charToEntityName.put(c, buf.toString()); } char[] chars = s.toCharArray(); buf.clear(); for (int i = 0; i < chars.length; i++) { char c = chars[i]; String entity = charToEntityName.get(c); if (entity == null) { if (!TextUtil.isPrintable(c)) { buf.append("&#"); buf.append(Integer.valueOf(c)); buf.append(';'); } else { buf.append(c); } } else { buf.append('&'); buf.append(entity); buf.append(';'); } } return buf.toString(); }
/** * Converts all inline HTML character entities (c.f., <a * href="http://www.w3.org/TR/REC-html40/sgml/entities.html">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>) * to their Unicode character counterparts, if possible. * * @param s the string to convert * @return the resulting, possibly modified, string * @see #stripHTMLTags * @see #makeCharacterEntities */ public static String convertCharacterEntities(String s) { // The resource bundle contains the mappings for symbolic entity // names like "amp". Note: Must protect matching and MatchResult in // a critical section, for thread-safety. See javadocs for // Perl5Util. synchronized (HTMLUtil.class) { try { if (entityPattern == null) entityPattern = Pattern.compile("&(#?[^; \t]+);"); } catch (PatternSyntaxException ex) { // Should not happen unless I've screwed up the pattern. // Throw a runtime error. assert (false); } } ResourceBundle bundle = getResourceBundle(); XStringBuffer buf = new XStringBuffer(); Matcher matcher = null; synchronized (HTMLUtil.class) { matcher = entityPattern.matcher(s); } for (; ; ) { String match = null; String preMatch = null; String postMatch = null; if (!matcher.find()) break; match = matcher.group(1); preMatch = s.substring(0, matcher.start(1) - 1); postMatch = s.substring(matcher.end(1) + 1); if (preMatch != null) buf.append(preMatch); if (match.charAt(0) == '#') { if (match.length() == 1) buf.append('#'); else { // It might be a numeric entity code. Try to parse it // as a number. If the parse fails, just put the whole // string in the result, as is. Be sure to handle both // the decimal form (e.g., ™) and the hexadecimal // form (e.g., ™). int cc; boolean isHex = (match.length() > 2) && (match.charAt(1) == 'x'); boolean isLegal = false; try { if (isHex) cc = Integer.parseInt(match.substring(2), 16); else cc = Integer.parseInt(match.substring(1)); // It parsed. Is it a valid Unicode character? if (Character.isDefined((char) cc)) { buf.append((char) cc); isLegal = true; } } catch (NumberFormatException ex) { } if (!isLegal) { buf.append("&#"); if (isHex) buf.append('x'); buf.append(match + ";"); } } } else { // Not a numeric entity. Try to find a matching symbolic // entity. try { String rep = bundle.getString("html_" + match); buf.append(rep); } catch (MissingResourceException ex) { buf.append("&" + match + ";"); } } if (postMatch == null) break; s = postMatch; matcher.reset(s); } if (s.length() > 0) buf.append(s); return buf.toString(); }