Example #1
0
  /**
   * Crawls a Douban page referenced by a "dispatch" style URL.
   *
   * <p>Everything that follows {@code dispatch?uri=} in {@code url} is appended to
   * {@code http://www.douban.com} to build the real link. The link is classified with
   * {@code AsyncCrawl.verifyLink}; when it is recognised as a note or a xiaozhan page, the
   * title/content selectors of the matching {@code TargetSite} are copied into this instance's
   * {@code title_rex}/{@code cont_rex} fields and the work is delegated to
   * {@code crawl(ok, fail, link, false)}.
   *
   * @param ok value returned when the delegated crawl succeeds
   * @param fail value returned when the URL is unusable, the link is not accepted by
   *     {@code verifyLink}, or the delegated crawl fails
   * @param url dispatch URL; must contain {@code dispatch?uri=}
   * @param tZhan selectors used when the link is classified as {@code DOUBAN_XIAOZHAN}
   * @param tNote selectors used when the link is classified as {@code DOUBAN_NOTE}
   * @return {@code ok} or {@code fail} as described above
   */
  public int crawDoubanApp(int ok, int fail, String url, TargetSite tZhan, TargetSite tNote) {
    if (url == null) {
      return fail;
    }
    MLog.e("", url);

    String fstr = "dispatch?uri=";
    int p = url.indexOf(fstr);
    if (p < 0) {
      // Without the marker there is nothing to extract; previously indexOf's -1 was fed into
      // substring(), producing a garbage link or a StringIndexOutOfBoundsException.
      return fail;
    }
    String link = "http://www.douban.com" + url.substring(p + fstr.length());
    MLog.e("", link);

    // NOTE(review): assumes verifyLink returns at least two elements
    // ([0] = verdict, [1] = page type) - confirm against AsyncCrawl.
    int[] r = AsyncCrawl.verifyLink(link, false);
    if (r[0] != AsyncCrawl.YES) {
      return fail;
    }

    if (r[1] == AsyncCrawl.DOUBAN_NOTE) {
      title_rex = tNote.title_rex;
      cont_rex = tNote.cont_rex;
      return crawl(ok, fail, link, false);
    }
    if (r[1] == AsyncCrawl.DOUBAN_XIAOZHAN) {
      title_rex = tZhan.title_rex;
      cont_rex = tZhan.cont_rex;
      return crawl(ok, fail, link, false);
    }

    return fail;
  }
Example #2
0
  /**
   * Downloads {@code url} with jsoup and extracts a title and content using the selectors held in
   * this instance's {@code title_rex}, {@code cont_rex}, {@code auth_rex} and {@code extra_rex}
   * fields. The results are stored in {@code resultTitle} and {@code resultCont}.
   *
   * <p>If {@code cont_rex} consists of a selector followed by the token {@code all}
   * (for example {@code "div.post all"}), the token is stripped, {@code cont_len} is set to
   * {@code -1}, and the HTML of every matched content element is concatenated instead of only
   * the first one. Note that this rewrites the {@code cont_rex} and {@code cont_len} fields.
   *
   * @param ok value returned when a title element was matched
   * @param fail value returned when no title element matched or the request raised an
   *     {@link IOException}
   * @param url page to fetch
   * @param isShort when {@code true} the page is fetched via {@code execute()} with redirects
   *     followed and no explicit timeout; otherwise via {@code get()} with a 10 second timeout
   * @return {@code ok} or {@code fail} as described above
   */
  public int crawl(int ok, int fail, String url, boolean isShort) {
    // Null must be tested before equals(); the reverse order throws NPE for a null ua.
    if (ua != null && !ua.equals("")) {
      UA = ua;
    }
    try {
      Document doc;
      if (isShort) {
        Response resp = Jsoup.connect(url).userAgent(UA).followRedirects(true).execute();
        doc = resp.parse();
      } else {
        doc = Jsoup.connect(url).userAgent(UA).timeout(10000).get();
      }
      resultTitle = resultCont = "";

      if (cont_rex.contains(" ")) {
        String[] cgp = cont_rex.trim().split(" ");
        // A selector with only leading/trailing blanks yields a single token after trim(),
        // so the length must be checked before looking at the second token.
        if (cgp.length > 1 && cgp[1].equals("all")) {
          cont_len = -1;
          cont_rex = cgp[0];
        }
      }

      Elements eletitle = doc.select(this.title_rex);
      Elements elecont = doc.select(this.cont_rex);
      Elements eleauth = null;
      Elements eleextra = null;

      if (Constant.DEBUG) {
        FileUtils.writeFile(doc.html(), "clip");
      }

      // Author and extra selectors are optional; an empty string disables them.
      if (!auth_rex.equals("")) {
        eleauth = doc.select(this.auth_rex);
      }
      if (!extra_rex.equals("")) {
        eleextra = doc.select(this.extra_rex);
      }

      if (eletitle.size() == 0) {
        MLog.e("", "没有匹配到title");
        return fail;
      }

      resultTitle = eletitle.get(0).html();
      if (elecont.size() > 0) {
        elecont = addStyleForTable(elecont);
        if (cont_len == -1) {
          // "all" mode: join the HTML of every matched content element.
          StringBuilder all = new StringBuilder(resultCont);
          for (Element ele : elecont) {
            all.append(ele.html());
          }
          resultCont = all.toString();
        } else {
          resultCont = elecont.get(0).html();
        }
      }
      // The author block, when present, is prepended as its own paragraph.
      if (eleauth != null && eleauth.size() > 0) {
        resultCont = "<p>" + eleauth.get(0).html() + "</p>" + resultCont;
      }
      // The extra block, when present, is appended after the content.
      if (eleextra != null) {
        eleextra = addStyleForTable(eleextra);
        if (eleextra.size() > 0) {
          resultCont = resultCont + eleextra.get(0).html();
        }
      }

      return ok;
    } catch (IOException e) {
      MLog.e("", "没有请求到数据");
      return fail;
    }
  }
Example #3
0
  /**
   * Downloads a BBWC article and extracts its title and content using the selectors held in this
   * instance's {@code title_rex}, {@code cont_rex}, {@code auth_rex} and {@code extra_rex}
   * fields. The results are stored in {@code resultTitle} and {@code resultCont}.
   *
   * <p>If the page contains {@code iframe#verticalContent} with a resolvable {@code src}, the
   * iframe's document is fetched and used instead of the outer page. When the final URL matches
   * {@code issue_<n>/articles/<n>}, every {@code img} that carries a {@code data-src} attribute
   * gets its {@code src} set to that value with {@code uploadfile} replaced by the CDN prefix.
   *
   * @param ok value returned when a title element was matched
   * @param fail value returned when no title element matched or a request raised an
   *     {@link IOException}
   * @param url article page to fetch
   * @return {@code ok} or {@code fail} as described above
   */
  public int crawBBWC(int ok, int fail, String url) {
    try {
      Document doc = Jsoup.connect(url).userAgent(UA).timeout(3000).get();

      // "abs:src" resolves a relative src against the page URL and yields "" when the iframe
      // or its src is missing, so an empty/relative value never reaches Jsoup.connect().
      String frameSrc = doc.select("iframe#verticalContent").attr("abs:src");
      if (!frameSrc.equals("")) {
        url = frameSrc;
        // Only refetch when there really is a different document to load.
        doc = Jsoup.connect(url).userAgent(UA).timeout(3000).get();
      }
      resultTitle = resultCont = "";

      // Rewrite image links.
      Pattern p = Pattern.compile("issue_\\d+/articles/\\d+");
      Matcher m = p.matcher(url);
      if (m.find()) {
        String pre = "http://s4.cdn.bb.bbwc.cn/" + m.group();
        for (Element img : doc.select("img")) {
          String raw = img.attr("data-src");
          if (raw.equals("")) {
            // No lazy-load source: keep the existing src instead of blanking it.
            continue;
          }
          img.attr("src", raw.replace("uploadfile", pre));
        }
      }

      // Begin extraction.
      Elements eletitle = doc.select(this.title_rex);
      Elements elecont = doc.select(this.cont_rex);
      Elements eleauth = null;
      Elements eleextra = null;

      if (Constant.DEBUG) {
        FileUtils.writeFile(doc.html(), "clip");
      }

      // Author and extra selectors are optional; an empty string disables them.
      if (!auth_rex.equals("")) {
        eleauth = doc.select(this.auth_rex);
      }
      if (!extra_rex.equals("")) {
        eleextra = doc.select(this.extra_rex);
      }

      if (eletitle.size() == 0) {
        MLog.e("", "没有匹配到title");
        return fail;
      }

      resultTitle = eletitle.get(0).html();
      if (elecont.size() > 0) {
        elecont = addStyleForTable(elecont);
        resultCont = elecont.get(0).html();
      }
      // The author block, when present, is prepended as its own paragraph.
      if (eleauth != null && eleauth.size() > 0) {
        resultCont = "<p>" + eleauth.get(0).html() + "</p>" + resultCont;
      }
      // The extra block, when present, is appended after the content.
      if (eleextra != null) {
        eleextra = addStyleForTable(eleextra);
        if (eleextra.size() > 0) {
          resultCont = resultCont + eleextra.get(0).html();
        }
      }

      return ok;
    } catch (IOException e) {
      // Report through the project logger, matching crawl(), rather than printStackTrace().
      MLog.e("", "没有请求到数据");
      return fail;
    }
  }