/**
 * Crawls a Douban page that is referenced through an in-app "dispatch" URL.
 *
 * <p>The text following {@code "dispatch?uri="} in {@code url} is appended to
 * {@code http://www.douban.com} to build the real link. The link is then
 * classified with {@code AsyncCrawl.verifyLink}; depending on the reported
 * type, the title/content selectors of {@code tNote} or {@code tZhan} are
 * copied into this object's {@code title_rex}/{@code cont_rex} before
 * delegating to {@link #crawl(int, int, String, boolean)}.
 *
 * @param ok    value returned when the crawl succeeds
 * @param fail  value returned when the link cannot be resolved, is rejected
 *              by {@code verifyLink}, is of an unhandled type, or the crawl fails
 * @param url   dispatch URL expected to contain {@code "dispatch?uri="}
 * @param tZhan selector set used when the link type is {@code AsyncCrawl.DOUBAN_XIAOZHAN}
 * @param tNote selector set used when the link type is {@code AsyncCrawl.DOUBAN_NOTE}
 * @return {@code ok} or {@code fail}
 */
public int crawDoubanApp(int ok, int fail, String url, TargetSite tZhan, TargetSite tNote) {
    final String fstr = "dispatch?uri=";
    if (url == null) {
        return fail;
    }
    MLog.e("", url);

    int p = url.indexOf(fstr);
    if (p < 0) {
        // Without the marker, indexOf returns -1 and the substring offset would
        // point into the middle of the URL (or past its end), so bail out.
        return fail;
    }
    String link = "http://www.douban.com" + url.substring(p + fstr.length());
    MLog.e("", link);

    // r[0] is compared against YES, r[1] against the link-type constants.
    int[] r = AsyncCrawl.verifyLink(link, false);
    if (r == null || r.length < 2 || r[0] != AsyncCrawl.YES) {
        return fail;
    }

    TargetSite site;
    if (r[1] == AsyncCrawl.DOUBAN_NOTE) {
        site = tNote;
    } else if (r[1] == AsyncCrawl.DOUBAN_XIAOZHAN) {
        site = tZhan;
    } else {
        return fail;
    }

    title_rex = site.title_rex;
    cont_rex = site.cont_rex;
    return crawl(ok, fail, link, false);
}
public int crawl(int ok, int fail, String url, boolean isShort) { if (!ua.equals("") && ua != null) UA = ua; try { Document doc; if (isShort) { Response resp = Jsoup.connect(url).userAgent(UA).followRedirects(true).execute(); doc = resp.parse(); } else { doc = Jsoup.connect(url).userAgent(UA).timeout(10000).get(); } resultTitle = resultCont = ""; /*MLog.e("","title_rex="+title_rex); MLog.e("","cont_rex="+cont_rex); MLog.e("","auth_rex="+auth_rex); MLog.e("","extra_rex="+extra_rex); MLog.e("","source="+source+" url="+url);*/ if (cont_rex.contains(" ")) { String ctemp = cont_rex.trim(); String[] cgp = ctemp.split(" "); if (cgp[1].equals("all")) { cont_len = -1; cont_rex = cgp[0]; } } Elements eletitle = doc.select(this.title_rex), eleauth = null, elecont = doc.select(this.cont_rex), eleextra = null; if (Constant.DEBUG) FileUtils.writeFile(doc.html(), "clip"); if (!auth_rex.equals("")) eleauth = doc.select(this.auth_rex); if (!extra_rex.equals("")) eleextra = doc.select(this.extra_rex); if (eletitle.size() > 0) { resultTitle = eletitle.get(0).html(); if (elecont.size() > 0) { elecont = addStyleForTable(elecont); if (cont_len == -1) { for (Element ele : elecont) { resultCont = resultCont + ele.html(); } } else resultCont = elecont.get(0).html(); } if (!auth_rex.equals("")) { if (eleauth.size() > 0) resultCont = "<p>" + eleauth.get(0).html() + "</p>" + resultCont; } if (!extra_rex.equals("")) { eleextra = addStyleForTable(eleextra); if (eleextra.size() > 0) resultCont = resultCont + eleextra.get(0).html(); } return ok; } else { MLog.e("", "没有匹配到title"); return fail; } } catch (IOException e) { // TODO Auto-generated catch block MLog.e("", "没有请求到数据"); return fail; } }
public int crawBBWC(int ok, int fail, String url) { try { Document doc = Jsoup.connect(url).userAgent(UA).timeout(3000).get(); Elements frame = doc.select("iframe#verticalContent"); if (frame.size() > 0) { url = frame.attr("src"); } doc = Jsoup.connect(url).userAgent(UA).timeout(3000).get(); resultTitle = resultCont = ""; // 处理图片链接 Pattern p = Pattern.compile("issue_\\d+/articles/\\d+"); Matcher m = p.matcher(url); if (m.find()) { String pre = "http://s4.cdn.bb.bbwc.cn/" + m.group(); Elements imgs = doc.select("img"); if (imgs.size() > 0) { for (Element img : imgs) { String raw = img.attr("data-src"); raw = raw.replace("uploadfile", pre); img.attr("src", raw); } } } // 开始提取 Elements eletitle = doc.select(this.title_rex), eleauth = null, elecont = doc.select(this.cont_rex), eleextra = null; if (Constant.DEBUG) FileUtils.writeFile(doc.html(), "clip"); if (!auth_rex.equals("")) eleauth = doc.select(this.auth_rex); if (!extra_rex.equals("")) eleextra = doc.select(this.extra_rex); if (eletitle.size() > 0) { resultTitle = eletitle.get(0).html(); if (elecont.size() > 0) { elecont = addStyleForTable(elecont); resultCont = elecont.get(0).html(); } if (!auth_rex.equals("")) { if (eleauth.size() > 0) resultCont = "<p>" + eleauth.get(0).html() + "</p>" + resultCont; } if (!extra_rex.equals("")) { eleextra = addStyleForTable(eleextra); if (eleextra.size() > 0) resultCont = resultCont + eleextra.get(0).html(); } return ok; } else { MLog.e("", "没有匹配到title"); return fail; } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return fail; }