Example #1
0
  /**
   * Tries to detect the charset announced inside the first bytes of the given stream.
   *
   * <p>Bytes are read from {@code in} and appended to {@code bos} until at least {@code K2} bytes
   * were buffered or the stream ended. Because every read may fill the whole buffer, up to {@code
   * 2 * K2 - 1} bytes can be consumed. The buffered bytes are decoded with {@code enc} and searched
   * for the first occurrence of {@code key}. The value directly after the key is taken as the
   * encoding; it may be wrapped in single or double quotes, otherwise it ends at the next double
   * quote, space or single quote. Values of 40 or more characters are rejected.
   *
   * <p>If an encoding was found, {@code in} and {@code bos} are reset so that the caller can
   * re-read the data with the new encoding. NOTE(review): {@code in.reset()} presumably relies on
   * the caller having marked the stream with a sufficient read limit; confirm at the call site.
   *
   * @param key the marker preceding the encoding name, e.g. {@code "charset="}
   * @param bos receives the bytes read from {@code in}; emptied again when an encoding is returned
   * @param in the stream to inspect
   * @param enc the encoding used to decode the buffered bytes while searching for {@code key}
   * @return the encoding name as returned by {@code SHelper.encodingCleanup}, or {@code null} if
   *     no encoding was found or the stream could not be reset
   * @throws IOException if reading from {@code in} fails or {@code enc} is not a supported charset
   */
  protected String detectCharset(
      String key, ByteArrayOutputStream bos, BufferedInputStream in, String enc)
      throws IOException {

    // Grab better encoding from stream
    byte[] arr = new byte[K2];
    int nSum = 0;
    while (nSum < K2) {
      int n = in.read(arr);
      if (n < 0) {
        break;
      }

      nSum += n;
      bos.write(arr, 0, n);
    }

    String str = bos.toString(enc);
    int clength = key.length();
    // An empty key would match at index 0 and carries no information: treat it as "not found".
    // A real match at index 0 is valid though, indexOf signals a miss with -1 only.
    int encIndex = clength == 0 ? -1 : str.indexOf(key);
    if (encIndex < 0) {
      return null;
    }

    int valueStart = encIndex + clength;
    if (valueStart >= str.length()) {
      // The key is the very last thing in the buffered data, so there is no value to inspect
      // (and charAt would throw).
      return null;
    }

    char startChar = str.charAt(valueStart);
    int lastEncIndex;
    if (startChar == '\'' || startChar == '"') {
      // charset='something' or charset="something": skip the opening quote and look for the
      // matching closing one (-1 if it is missing, which fails the range check below)
      valueStart++;
      lastEncIndex = str.indexOf(startChar, valueStart);
    } else {
      // unquoted value as in "text/html; charset=utf-8": it ends at the nearest double quote,
      // space or single quote; stays at MAX_VALUE if none of them follows
      lastEncIndex = Integer.MAX_VALUE;
      for (char terminator : new char[] {'"', ' ', '\''}) {
        int idx = str.indexOf(terminator, valueStart);
        if (idx >= 0) {
          lastEncIndex = Math.min(lastEncIndex, idx);
        }
      }
    }

    // re-read byte array with different encoding
    // assume that the encoding string cannot be greater than 40 chars
    if (lastEncIndex > valueStart && lastEncIndex < valueStart + 40) {
      String tmpEnc = SHelper.encodingCleanup(str.substring(valueStart, lastEncIndex));
      try {
        in.reset();
        bos.reset();
        return tmpEnc;
      } catch (IOException ex) {
        // best effort: without a reset the detected encoding cannot be applied, so report it
        // and fall through to "nothing detected"
        Log.e(
            Constants.TAG,
            "Couldn't reset stream to re-read with new encoding " + tmpEnc + " " + ex.toString());
      }
    }
    return null;
  }
  /**
   * Builds the plain text for the given element.
   *
   * <p>After calling {@code removeNodesWithNegativeScores} on {@code topNode}, the texts of the
   * elements matching {@code nodesToKeepCssSelector} are gathered via {@code append} (each
   * followed by {@code "\n\n"}) and passed through {@code SHelper.innerTrim}. A result longer than
   * 100 characters is returned directly. Otherwise the complete text of {@code topNode} may be
   * used instead, and the outcome is parsed by jsoup once more to get rid of leftover markup.
   *
   * @param topNode the element to extract the text from
   * @return the extracted text
   */
  public String getFormattedText(Element topNode) {
    removeNodesWithNegativeScores(topNode);

    StringBuilder collected = new StringBuilder();
    append(topNode, collected, nodesToKeepCssSelector);

    String result = SHelper.innerTrim(collected.toString());
    if (result.length() > 100) {
      return result;
    }

    // no subelements: nothing was collected, or the collected text is not longer than the
    // element's own (non-empty) text
    boolean useWholeText;
    if (result.isEmpty()) {
      useWholeText = true;
    } else {
      useWholeText =
          !topNode.text().isEmpty() && result.length() <= topNode.ownText().length();
    }
    if (useWholeText) {
      result = topNode.text();
    }

    // jsoup may have failed to parse the whole html, so this smaller snippet is parsed again
    // to keep html tags out of the text
    return Jsoup.parse(result).text();
  }
  /**
   * Appends the text of every element below {@code node} that matches {@code tagName} to {@code
   * sb}, each followed by {@code "\n\n"}.
   *
   * <p>An element is skipped if {@code unlikely} returns true for it or for one of its ancestors
   * up to (not including) {@code node}. Its text, as produced by {@code node2Text}, is also
   * skipped when it is empty, shorter than {@code minParagraphText}, or longer than twice the
   * value of {@code SHelper.countLetters} for that text.
   *
   * @param node the root element to search in
   * @param sb the builder receiving the accepted texts
   * @param tagName the selector handed to {@code node.select}
   */
  protected void append(Element node, StringBuilder sb, String tagName) {
    // is select more costly than getElementsByTag?
    for (Element candidate : node.select(tagName)) {
      // walk from the candidate upwards until 'node' (or the document root) is reached
      boolean rejected = false;
      for (Element current = candidate;
          current != null && !current.equals(node);
          current = current.parent()) {
        if (unlikely(current)) {
          rejected = true;
          break;
        }
      }
      if (rejected) {
        continue;
      }

      String text = node2Text(candidate);
      boolean tooShort = text.isEmpty() || text.length() < minParagraphText;
      if (tooShort || text.length() > SHelper.countLetters(text) * 2) {
        continue;
      }

      sb.append(text).append("\n\n");
    }
  }