/**
 * Fetches {@code url} over HTTP with Jsoup and returns the response body as text.
 *
 * <p>The request is sent with {@code USER_AGENT}, follows redirects and uses {@code GET_TIMEOUT}
 * as its timeout. On success the content length and the elapsed time are logged.
 *
 * @param url address of the resource to download
 * @return the response body when the status code equals {@code OK}; an empty string when the
 *     status code differs or when an {@link IOException} occurs while executing the request
 */
public static String extractContent(String url) {
  try {
    Connection connection =
        Jsoup.connect(url).userAgent(USER_AGENT).followRedirects(true).timeout(GET_TIMEOUT);

    // Only the request execution itself is timed.
    long start = System.currentTimeMillis();
    Connection.Response response = connection.execute();
    long diff = System.currentTimeMillis() - start;

    int responseCode = response.statusCode();
    if (responseCode != OK) {
      Logger.error("%s returned %d", url, responseCode);
      return "";
    }

    // Read the body once and reuse it for both the log line and the return value.
    String body = response.body();
    Logger.info(
        "%s retrieved, content length %d, time %s sec.",
        url, body.length(), FormatUtil.millis2Seconds(diff));
    return body;
  } catch (IOException e) {
    Logger.error(e, "%s cannot be read.", url);
    return "";
  }
}
Example #2
0
  /**
   * Downloads the page at {@code originalUrl}, strips it down to its {@code div.content-main}
   * element and saves the resulting HTML into {@code htmlfile}.
   *
   * <p>Steps: fetch the document without following redirects, add a {@code <base>} element to
   * the head when none exists (so relative links keep resolving against the original location),
   * drop the {@code h1.title} heading from the content element if there is one, replace the
   * whole body with that content element, and write the document to disk (two attempts).
   *
   * @param htmlfile destination file for the reduced HTML
   * @param originalUrl address of the page to download
   * @param intent not used by this implementation
   * @return the value of {@code StringUtils.file2Url(htmlfile, PROCESS_FAILED_URL)} when the
   *     file was saved; {@code PROCESS_FAILED_URL} when the body is empty, the content element
   *     is missing, saving fails twice, or an {@link IOException} occurs while fetching
   */
  @Override
  protected String doProcess(File htmlfile, String originalUrl, Intent intent) {
    try {
      Connection coon = HttpConnection.connect(originalUrl);
      // Redirects are disabled on purpose: the page must not be swapped for another one
      // (example: 10.254.7.4).
      coon.followRedirects(false);
      Document doc = coon.get();
      Element head = doc.head();
      Element body = doc.body();
      if (body.children().size() == 0) {
        Log.e(TAG, "body has no child with url=" + originalUrl);
        return PROCESS_FAILED_URL;
      }

      // Without a <base> element, relative links in the saved file would resolve against the
      // local file location, so one pointing at the document's base URI is inserted.
      if (head.select("base").isEmpty()) {
        String baseUri = head.baseUri();
        Attributes attrs = new Attributes();
        attrs.put("href", baseUri);
        ArrayList<Element> baseElements = new ArrayList<>();
        baseElements.add(new Element(Tag.valueOf("base"), baseUri, attrs));
        head.insertChildren(0, baseElements);
      }

      Element div = doc.select("div.content-main").first();
      if (div == null) {
        Log.e(TAG, "not found specific element with url=" + originalUrl);
        return PROCESS_FAILED_URL;
      }

      // first() returns null when nothing matches; a page without the heading is still valid.
      Element title = div.select("h1.title").first();
      if (title != null) {
        title.remove();
      }

      // Replace the entire body with just the content element.
      body.empty();
      ArrayList<Element> content = new ArrayList<>();
      content.add(div);
      body.insertChildren(0, content);

      // Serialize once, then try to save at most two times.
      String html = doc.toString();
      for (int attempt = 0; attempt < 2; attempt++) {
        if (FileUtil.saveStringToFile(html, htmlfile, false)) {
          return StringUtils.file2Url(htmlfile, PROCESS_FAILED_URL);
        }
      }
      Log.e(TAG, "save html to file failed with url=" + originalUrl);
    } catch (IOException e) {
      // Also covers MalformedURLException, which is a subclass of IOException.
      Log.e(TAG, "failed to load page with url=" + originalUrl, e);
    }
    return PROCESS_FAILED_URL;
  }