Esempio n. 1
0
  /**
   * Emits the plain-text replacement for a closing tag.
   *
   * <ul>
   *   <li>{@code h1}, {@code h2}, {@code tr}, {@code li}: delegates to {@code addNewline} and
   *       returns the counter it reports.
   *   <li>{@code td}, {@code a}: appends a single blank and leaves the counter untouched.
   *   <li>any other tag: writes nothing and returns the counter unchanged.
   * </ul>
   *
   * <p>The comparison is case-sensitive.
   *
   * @param tagName name of the last tag picked up; must not be {@code null}
   * @param buf output buffer
   * @param newLineCounter current number of newlines held in the output
   * @return updated number of newlines held
   */
  private static int replaceEndTag(String tagName, XStringBuilder buf, int newLineCounter) {
    switch (tagName) {
      case "h1":
      case "h2":
      case "tr":
      case "li":
        // Closing one of these elements ends the current line of text.
        return addNewline(newLineCounter, buf);

      case "td":
      case "a":
        // Keep the following text from running into this element's content.
        buf.append(' ');
        return newLineCounter;

      default:
        return newLineCounter;
    }
  }
Esempio n. 2
0
  /**
   * Drops every HTML element tag from a string and keeps only the character data between tags.
   * Everything from a {@code '<'} up to and including the next {@code '>'} is discarded; a
   * {@code '>'} that is not preceded by an open {@code '<'} is kept as ordinary text. Inline HTML
   * character entity codes are <b>not</b> touched. Use {@link #convertCharacterEntities
   * convertCharacterEntities()} to convert HTML character entity codes.
   *
   * @param s the string to adjust; must not be {@code null}
   * @return the resulting, possibly modified, string
   * @see #convertCharacterEntities
   */
  public static String stripHTMLTags(String s) {
    final char[] chars = s.toCharArray();
    XStringBuilder text = new XStringBuilder();
    boolean insideTag = false;

    for (char current : chars) {
      if (current == '<') {
        // Opening bracket: start (or keep) skipping.
        insideTag = true;
      } else if (insideTag) {
        // Tag content is dropped; only the closing bracket changes state.
        if (current == '>') {
          insideTag = false;
        }
      } else {
        // Plain character data, including a stray '>' outside of any tag.
        text.append(current);
      }
    }

    return text.toString();
  }
Esempio n. 3
0
 /**
  * Appends a newline (the string "\r\n") to the output buffer unless too many have already been
  * written in a row. Several tag substitutions insert newlines for readability, which could
  * otherwise pile up into long blank runs; the newline is therefore written only while the number
  * of newlines already inserted is less than or equal to <code>NEWLINES_TO_HOLD</code>. The
  * counter is advanced on every call, whether or not a newline was actually written.
  *
  * @param newlineCounter number of newlines already inserted
  * @param buf output buffer
  * @return the incoming counter plus one
  */
 private static int addNewline(int newlineCounter, XStringBuilder buf) {
   final boolean withinLimit = newlineCounter <= NEWLINES_TO_HOLD;
   if (withinLimit) {
     buf.append("\r\n");
   }
   return newlineCounter + 1;
 }
Esempio n. 4
0
  /**
   * Translates a String containing HTML into plain text.
   *
   * <p>The input is scanned one character at a time. Characters inside a tag are used only to
   * collect the tag name (attributes are skipped); when the closing {@code '>'} is reached the tag
   * is handed to {@code replaceStartTag}, {@code replaceEndTag} or {@code replaceSingleTag}.
   * Character data outside of tags is copied to the output, except while inside an element listed
   * in the local exclusion table, whose content is dropped entirely. Finally, leading
   * {@code '\r'}, {@code '\n'}, blank and non-breaking-space characters are trimmed from the
   * result; a result made only of such characters becomes the empty string.
   *
   * <p>Only double-quoted attribute values are tracked; inside them {@code '>'} does not end the
   * tag and {@code '/'} does not mark the tag as an end or single tag.
   *
   * @param s html to be translated
   * @return text String
   * @throws NullPointerException if {@code s} is {@code null}
   */
  public static String HTMLToText(String s) {
    char[] ch = s.toCharArray();

    boolean inElement = false;
    boolean inAttributeValue = false;
    boolean isStartTag = true;
    boolean isEndTag = false;
    boolean isTagNameComplete = false;
    boolean isNewLine = false;

    // Greater than zero while the scanner is inside an element whose content is skipped.
    int ignoreLevel = 0;
    int newLineCounter = 0;

    // NOTE(review): the tag name is lower-cased before the lookup below, so "!DOCTYPE" can only
    // match if isInArray compares case-insensitively - confirm against isInArray.
    String[] tagsToExclude = {"style", "head", "!DOCTYPE"};

    StringBuilder tagName = new StringBuilder();
    XStringBuilder buf = new XStringBuilder();

    char c = ' ';
    char c_prec = ' ';

    String name = null;

    for (int i = 0; i < ch.length; i++) {
      c = ch[i];

      switch (c) {
        case '<':
          //
          // Start of a start/end tag.
          //
          isNewLine = false;
          inElement = true;

          // By default a tag is a start tag
          isStartTag = true;
          isEndTag = false;

          isTagNameComplete = false;

          tagName = new StringBuilder();

          break;

        case '/':
          if (inElement) {
            //
            // A '/' inside a quoted attribute value (e.g. href="http://host/path") is just
            // data and must not change the kind of tag being parsed.
            //
            if (!inAttributeValue) {
              if (c_prec == '<') {
                //
                // An end tag has been recognized.
                //
                isStartTag = false;
              }
              //
              // Otherwise isStartTag stays set and the tag is a single tag (eg: <br/>).
              //
              isEndTag = true;
            }
          } else if (ignoreLevel <= 0) {
            //
            // save the / because is in the text (but not while the enclosing
            // element's content is being ignored)
            //
            buf.append(c);
          }
          break;

        case '>':

          //
          // Checks if the '>' char occurs within an attribute value.
          // If this is the case, then the end of the tag has not been
          // reached.
          //
          if (inAttributeValue) {
            break;
          }

          //
          // End of a start/end tag has been reached
          //
          if (inElement) {
            inElement = false;
            isTagNameComplete = true;

            //
            // Is this tag's content to be ignored ?
            //

            name = tagName.toString().toLowerCase();
            if (isInArray(name, tagsToExclude)) {
              if (isStartTag) {
                ignoreLevel++;
                break;
              }
              if (isEndTag) {
                ignoreLevel--;
                break;
              }
            }

            //
            // Tag substitutions
            //
            if (isStartTag && !isEndTag) {
              newLineCounter = replaceStartTag(name, buf, newLineCounter);
            }

            if (isEndTag && !isStartTag) {
              newLineCounter = replaceEndTag(name, buf, newLineCounter);
            }

            if (isEndTag && isStartTag) {
              newLineCounter = replaceSingleTag(name, buf, newLineCounter);
            }
          }
          break;
        case '\"':
          // Inside a tag a double quote opens or closes an attribute value.
          if (inElement && !inAttributeValue) {
            inAttributeValue = true;
            break;
          }

          if (inElement && inAttributeValue) {
            inAttributeValue = false;
            break;
          }

          if (ignoreLevel > 0) {
            break;
          }

          buf.append(c);

          break;

        case '\r':
          //
          // A carriage return in the text becomes a blank, unless it directly follows a tag
          // or belongs to content that is being ignored.
          //
          if (!inElement && c_prec != '>' && ignoreLevel <= 0) {
            buf.append(' ');
          }
          break;
        case '\n':
          // Line feeds are never copied; they only arm the white-space stripping below.
          isNewLine = true;
          break;
        case 0x20: // single white space
        case 0xA0: // non-breaking space character
          //
          // Strips any white space that follows a newline.
          //
          if (isNewLine) {
            break;
          }
          // Otherwise a blank is handled like any other character: falls through.
        default:
          isNewLine = false;

          if (!inElement) {

            if (ignoreLevel > 0) {
              break;
            }

            // Real content was written, so the run of consecutive newlines is over.
            newLineCounter = 0;

            //
            // Content enclosed by a start and an end tag is put in
            // output buffer.
            //
            buf.append(c);

          } else {

            //
            // Retrieve tag name. Any tag attribute is ignored.
            //
            if (c != ' ' && !isTagNameComplete) {
              tagName.append(ch[i]);
            } else {
              isTagNameComplete = true;
            }
          }
          break;
      }
      c_prec = c;
    }

    //
    // Trim any leading sequence of the following characters: '\r', '\n', ' ' and the
    // non-breaking space. When the buffer holds nothing else, startIndex ends up equal to
    // buf.length() and the empty string is returned.
    //

    int startIndex = 0;
    while (startIndex < buf.length()) {
      c = buf.charAt(startIndex);
      if (c != '\r' && c != '\n' && c != 0x20 && c != 0xA0) {
        break;
      }
      startIndex++;
    }

    return buf.substring(startIndex);
  }