/** * This method detects the charset even if the first call only returns some bytes. It will read * until 4K bytes are reached and then try to determine the encoding * * @throws IOException */ protected String detectCharset( String key, ByteArrayOutputStream bos, BufferedInputStream in, String enc) throws IOException { // Grab better encoding from stream byte[] arr = new byte[K2]; int nSum = 0; while (nSum < K2) { int n = in.read(arr); if (n < 0) break; nSum += n; bos.write(arr, 0, n); } String str = bos.toString(enc); int encIndex = str.indexOf(key); int clength = key.length(); if (encIndex > 0) { char startChar = str.charAt(encIndex + clength); int lastEncIndex; if (startChar == '\'') // if we have charset='something' lastEncIndex = str.indexOf("'", ++encIndex + clength); else if (startChar == '\"') // if we have charset="something" lastEncIndex = str.indexOf("\"", ++encIndex + clength); else { // if we have "text/html; charset=utf-8" int first = str.indexOf("\"", encIndex + clength); if (first < 0) first = Integer.MAX_VALUE; // or "text/html; charset=utf-8 " int sec = str.indexOf(" ", encIndex + clength); if (sec < 0) sec = Integer.MAX_VALUE; lastEncIndex = Math.min(first, sec); // or "text/html; charset=utf-8 ' int third = str.indexOf("'", encIndex + clength); if (third > 0) lastEncIndex = Math.min(lastEncIndex, third); } // re-read byte array with different encoding // assume that the encoding string cannot be greater than 40 chars if (lastEncIndex > encIndex + clength && lastEncIndex < encIndex + clength + 40) { String tmpEnc = SHelper.encodingCleanup(str.substring(encIndex + clength, lastEncIndex)); try { in.reset(); bos.reset(); return tmpEnc; } catch (IOException ex) { Log.e( Constants.TAG, "Couldn't reset stream to re-read with new encoding " + tmpEnc + " " + ex.toString()); } } } return null; }
/** takes an element and turns the P tags into \n\n */ public String getFormattedText(Element topNode) { removeNodesWithNegativeScores(topNode); StringBuilder sb = new StringBuilder(); append(topNode, sb, nodesToKeepCssSelector); String str = SHelper.innerTrim(sb.toString()); if (str.length() > 100) return str; // no subelements if (str.isEmpty() || !topNode.text().isEmpty() && str.length() <= topNode.ownText().length()) str = topNode.text(); // if jsoup failed to parse the whole html now parse this smaller // snippet again to avoid html tags disturbing our text: return Jsoup.parse(str).text(); }
protected void append(Element node, StringBuilder sb, String tagName) { // is select more costly then getElementsByTag? MAIN: for (Element e : node.select(tagName)) { Element tmpEl = e; // check all elements until 'node' while (tmpEl != null && !tmpEl.equals(node)) { if (unlikely(tmpEl)) continue MAIN; tmpEl = tmpEl.parent(); } String text = node2Text(e); if (text.isEmpty() || text.length() < minParagraphText || text.length() > SHelper.countLetters(text) * 2) continue; sb.append(text); sb.append("\n\n"); } }