/** * Replaces an end tag with its translation, if necessary. * * @param tagName name of the last tag picked up. * @param buf output buffer * @param newLineCounter current number of newline hold in output * @return updated current number of newline hold */ private static int replaceEndTag(String tagName, XStringBuilder buf, int newLineCounter) { int newlineCounterTmp = newLineCounter; if (tagName.equals("h1")) { // buf.append("(T1)"); newlineCounterTmp = addNewline(newLineCounter, buf); } else if (tagName.equals("h2")) { // buf.append("(T2)"); newlineCounterTmp = addNewline(newLineCounter, buf); } else if (tagName.equals("td")) { buf.append(' '); } else if (tagName.equals("tr")) { newlineCounterTmp = addNewline(newLineCounter, buf); } else if (tagName.equals("li")) { newlineCounterTmp = addNewline(newLineCounter, buf); } else if (tagName.equals("a")) { buf.append(' '); } return newlineCounterTmp; }
/**
 * Drops every HTML element tag from a string and keeps only the character data. Inline HTML
 * character entity codes are <b>not</b> touched; use {@link #convertCharacterEntities
 * convertCharacterEntities()} to convert them.
 *
 * <p>Everything from a {@code '<'} up to and including the next {@code '>'} is discarded. A
 * {@code '>'} that is met outside of a tag is copied to the result unchanged.
 *
 * @param s the string to adjust
 * @return the resulting, possibly modified, string
 * @see #convertCharacterEntities
 */
public static String stripHTMLTags(String s) {
  char[] chars = s.toCharArray();
  XStringBuilder out = new XStringBuilder();
  boolean insideTag = false;

  for (char current : chars) {
    if (current == '<') {
      insideTag = true;
    } else if (current == '>' && insideTag) {
      // Closing bracket of the tag being skipped.
      insideTag = false;
    } else if (!insideTag) {
      // Plain character data, including a stray '>' outside of any tag.
      out.append(current);
    }
  }
  return out.toString();
}
/**
 * Appends a newline (the string "\r\n") to the output unless too many have been requested
 * already.
 *
 * <p>Several tag substitutions add a newline to make the translated text easier to read, which
 * could produce long runs of newlines. To avoid that, the newline is only written while the
 * number of newlines already inserted is less than or equal to <code>NEWLINES_TO_HOLD</code>.
 * The counter is incremented on every call, whether or not a newline was written.
 *
 * @param newlineCounter number of newlines already inserted
 * @param buf output buffer
 * @return current number of newlines inserted, i.e. {@code newlineCounter + 1}
 */
private static int addNewline(int newlineCounter, XStringBuilder buf) {
  boolean withinLimit = newlineCounter <= NEWLINES_TO_HOLD;
  if (withinLimit) {
    buf.append("\r\n");
  }
  return newlineCounter + 1;
}
/** * Translates a String containing html to a text. * * @param s html to be translated * @return text String */ public static String HTMLToText(String s) { char[] ch = s.toCharArray(); boolean inElement = false; boolean inAttributeValue = false; boolean isStartTag = true; boolean isEndTag = false; boolean isTagNameComplete = false; boolean isNewLine = false; int ignoreLevel = 0; int newLineCounter = 0; String[] tagsToExclude = {"style", "head", "!DOCTYPE"}; StringBuilder tagName = new StringBuilder(); XStringBuilder buf = new XStringBuilder(); char c = ' '; char c_prec = ' '; String name = null; for (int i = 0; i < ch.length; i++) { c = ch[i]; switch (c) { case '<': // // Start of a start/end tag. // isNewLine = false; inElement = true; // By default a tag is a start tag isStartTag = true; isEndTag = false; isTagNameComplete = false; tagName = new StringBuilder(); break; case '/': if (inElement && c_prec == '<') { // // An end tag has been recognized. // isStartTag = false; isEndTag = true; } else if (inElement && c_prec != '<') { // // A single tag (eg: <br/>) has been recognized. // isEndTag = true; } else { // // save the / because is in the text // buf.append(c); } break; case '>': // // Checks if the '>' char occurs within am attribute value. // If this is the case, then the end of the tag has not been // reached. // if (inAttributeValue) { break; } // // End of a start/end tag has been reached // if (inElement) { inElement = false; isTagNameComplete = true; // // Is this tag's content to be ignored ? 
// name = tagName.toString().toLowerCase(); if (isInArray(name, tagsToExclude)) { if (isStartTag) { ignoreLevel++; break; } if (isEndTag) { ignoreLevel--; break; } } // // Tag substituitions // if (isStartTag && !isEndTag) { newLineCounter = replaceStartTag(name, buf, newLineCounter); } if (isEndTag && !isStartTag) { newLineCounter = replaceEndTag(name, buf, newLineCounter); } if (isEndTag && isStartTag) { newLineCounter = replaceSingleTag(name, buf, newLineCounter); } } break; case '\"': if (inElement && !inAttributeValue) { inAttributeValue = true; break; } if (inElement && inAttributeValue) { inAttributeValue = false; break; } if (ignoreLevel > 0) { break; } buf.append(c); break; case '\r': if (!inElement && c_prec != '>') { buf.append(' '); } break; case '\n': isNewLine = true; break; case 0x20: // single white space case 0xA0: // non-breaking space character // // Strips any white space that follows a newline. // if (isNewLine) { break; } default: isNewLine = false; if (!inElement) { if (ignoreLevel > 0) { break; } newLineCounter = 0; // // Content enclosed by a start and an end tag is put in // output buffer. // buf.append(c); } else { // // Retrieve tag name. Any tag attribute is ignored. // if (c != ' ' && !isTagNameComplete) { tagName.append(ch[i]); } else { isTagNameComplete = true; } } break; } c_prec = c; } // // Trim any leading sequence of the following characters: '\r', '\n', ' ' // int startIndex = 0; for (int i = 0; i < buf.length(); i++) { c = buf.charAt(i); if (c == '\r' || c == '\n' || c == 0x20 || c == 0xA0) { continue; } startIndex = i; break; } return buf.substring(startIndex); }