Example #1
0
  /**
   * Replaces characters in a string with HTML character entity references (c.f., <a
   * href="http://www.w3.org/TR/REC-html40/sgml/entities.html">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>).
   *
   * <p>A character that has an entity name in the resource bundle is written as
   * {@code &name;}. Any other character is written as a decimal numeric reference
   * ({@code &#NNN;}) when {@code TextUtil.isPrintable} reports it as not printable, and is
   * copied through unchanged otherwise.
   *
   * @param s the string to convert
   * @return the resulting, possibly modified, string
   * @see #stripHTMLTags
   * @see #convertCharacterEntities
   */
  public static String makeCharacterEntities(String s) {
    ResourceBundle entities = getResourceBundle();
    Map<Character, String> nameByChar = new HashMap<Character, String>();
    XStringBuffer scratch = new XStringBuffer();

    // Invert the bundle: each key names an entity (with an "html_" marker that is
    // deleted from the name) and the first character of its value is the character
    // that entity stands for.
    for (Enumeration<String> e = entities.getKeys(); e.hasMoreElements(); ) {
      String bundleKey = e.nextElement();
      char mapped = entities.getString(bundleKey).charAt(0);

      scratch.clear();
      scratch.append(bundleKey);
      scratch.delete("html_");

      nameByChar.put(mapped, scratch.toString());
    }

    // The scratch buffer is reused to accumulate the converted output.
    scratch.clear();

    for (char ch : s.toCharArray()) {
      String name = nameByChar.get(ch);

      if (name != null) {
        // Known symbolic entity.
        scratch.append('&');
        scratch.append(name);
        scratch.append(';');
        continue;
      }

      if (TextUtil.isPrintable(ch)) {
        // Ordinary character: pass through untouched.
        scratch.append(ch);
        continue;
      }

      // No symbolic name and not printable: fall back to the decimal code.
      scratch.append("&#");
      scratch.append(Integer.valueOf(ch));
      scratch.append(';');
    }

    return scratch.toString();
  }
Example #2
0
  /**
   * Converts all inline HTML character entities (c.f., <a
   * href="http://www.w3.org/TR/REC-html40/sgml/entities.html">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>)
   * to their Unicode character counterparts, if possible.
   *
   * <p>Three forms are recognized:
   *
   * <ul>
   *   <li>symbolic references such as {@code &amp;amp;}, resolved by looking up the key
   *       {@code "html_" + name} in the resource bundle;
   *   <li>decimal references such as {@code &amp;#8482;};
   *   <li>hexadecimal references such as {@code &amp;#x2122;} (the {@code x} may be upper or
   *       lower case).
   * </ul>
   *
   * <p>Numeric references may denote any defined Unicode code point, including supplementary
   * characters, which are emitted as a surrogate pair. A reference that cannot be resolved
   * (unknown name, unparseable number, value that is not a defined code point) is copied to the
   * result exactly as it appeared in the input.
   *
   * @param s the string to convert; must not be {@code null}
   * @return the resulting, possibly modified, string
   * @see #stripHTMLTags
   * @see #makeCharacterEntities
   */
  public static String convertCharacterEntities(String s) {
    // The pattern is compiled lazily into a shared static field. Initialize it and take a
    // local reference under the same lock, so the rest of the method never reads the shared
    // field without synchronization.
    Pattern pattern;
    synchronized (HTMLUtil.class) {
      if (entityPattern == null) entityPattern = Pattern.compile("&(#?[^; \t]+);");
      pattern = entityPattern;
    }

    // The resource bundle contains the mappings for symbolic entity names like "amp".
    ResourceBundle bundle = getResourceBundle();
    StringBuilder buf = new StringBuilder(s.length());
    Matcher matcher = pattern.matcher(s);

    // Index in s of the first character not yet copied to buf.
    int copiedUpTo = 0;

    // Single left-to-right pass. Replacement text goes straight into buf and is never
    // rescanned, so the matches found are the same as if the remainder of the string were
    // re-matched after every substitution.
    while (matcher.find()) {
      buf.append(s, copiedUpTo, matcher.start());
      copiedUpTo = matcher.end();

      // Group 1 is the text between '&' and ';'. The pattern guarantees it is non-empty.
      String match = matcher.group(1);
      String replacement;

      if (match.charAt(0) == '#') {
        replacement = decodeNumericEntity(match);
      } else {
        // Not a numeric entity. Try to find a matching symbolic entity.
        try {
          replacement = bundle.getString("html_" + match);
        } catch (MissingResourceException ex) {
          replacement = null;
        }
      }

      if (replacement == null) {
        // Unresolvable: keep the original reference text, byte for byte.
        buf.append(matcher.group());
      } else {
        buf.append(replacement);
      }
    }

    // Whatever follows the last match (or the whole string, if nothing matched).
    buf.append(s, copiedUpTo, s.length());

    return buf.toString();
  }

  /**
   * Decodes the body of a numeric character reference.
   *
   * @param ref the text between {@code &} and {@code ;}, starting with {@code #}
   * @return the character(s) for the referenced code point, or {@code null} if {@code ref} is
   *     not a parseable number or does not denote a defined Unicode code point
   */
  private static String decodeNumericEntity(String ref) {
    // Handle both the decimal form (e.g., #8482) and the hexadecimal form (e.g., #x2122).
    // A bare "#x" with no digits is treated as (invalid) decimal and rejected below.
    boolean isHex = (ref.length() > 2) && (ref.charAt(1) == 'x' || ref.charAt(1) == 'X');
    String digits = ref.substring(isHex ? 2 : 1);

    try {
      int codePoint = Integer.parseInt(digits, isHex ? 16 : 10);

      // Work with the full int code point: narrowing to char would silently map values
      // above U+FFFF (and negative values) onto unrelated BMP characters.
      if (Character.isValidCodePoint(codePoint) && Character.isDefined(codePoint)) {
        return new String(Character.toChars(codePoint));
      }
    } catch (NumberFormatException ignored) {
      // Not a number (or empty, or out of int range): the caller keeps the reference as is.
    }

    return null;
  }