/**
 * Fetches {@code url} over HTTP and returns the response body as a string.
 *
 * <p>The request is configured with {@code USER_AGENT}, follows redirects and uses
 * {@code GET_TIMEOUT} as its timeout. The elapsed wall-clock time of the request is
 * measured and written to the log together with the body length.
 *
 * @param url the address to download
 * @return the response body when the status code equals {@code OK}; an empty string when
 *     a different status code is returned or an {@link IOException} is raised
 */
public static String extractContent(String url) {
  try {
    Connection request =
        Jsoup.connect(url)
            .userAgent(USER_AGENT)
            .followRedirects(true)
            .timeout(GET_TIMEOUT);

    // Time only the actual network round trip.
    long startedAt = System.currentTimeMillis();
    Connection.Response response = request.execute();
    long elapsedMillis = System.currentTimeMillis() - startedAt;

    int status = response.statusCode();
    if (status != OK) {
      Logger.error("%s returned %d", url, status);
      return "";
    }

    String content = response.body();
    Logger.info(
        "%s retrieved, content length %d, time %s sec.",
        url, content.length(), FormatUtil.millis2Seconds(elapsedMillis));
    return content;
  } catch (IOException e) {
    Logger.error(e, "%s cannot be read.", url);
    return "";
  }
}
@Override protected String doProcess(File htmlfile, String originalUrl, Intent intent) { try { // String charset = "utf-8"; Connection coon = HttpConnection.connect(originalUrl); coon.followRedirects( false); // we don't want it be redirected to other page,example: 10.254.7.4 Document doc = coon.get(); Element head = doc.head(); Element body = doc.body(); if (body.children().size() == 0) { Log.e(TAG, "body has no child with url=" + originalUrl); return PROCESS_FAILED_URL; } /* Elements meta = head.select("meta"); if(!meta.isEmpty()){ Element m = meta.get(0); String content = m.attr("content"); String attr = content.substring(content.indexOf("charset=")+8); if(!attr.trim().isEmpty()){ charset = attr; } } */ Elements base = head.select("base"); if (base.isEmpty()) { String b = head.baseUri(); Attributes attrs = new Attributes(); attrs.put("href", b); ArrayList<Element> a = new ArrayList<>(); a.add(new Element(Tag.valueOf("base"), b, attrs)); head.insertChildren(0, a); } Element div = doc.select("div.content-main").first(); if (div == null) { Log.e(TAG, "not found specific element with url=" + originalUrl); return PROCESS_FAILED_URL; } Element title = div.select("h1.title").first(); title.remove(); body.empty(); ArrayList<Element> a = new ArrayList<>(); a.add(div); body.insertChildren(0, a); int g = 0; while (g < 2) { // try two times. if (FileUtil.saveStringToFile(doc.toString(), htmlfile, false)) { break; } g++; } if (g < 2) return StringUtils.file2Url(htmlfile, PROCESS_FAILED_URL); Log.e(TAG, "save html to file failed with url=" + originalUrl); } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return PROCESS_FAILED_URL; }