Пример #1
0
 /**
  * Extracts all links from a Jsoup-parsed document and hands the eligible ones to Redis.
  *
  * <p>A link is eligible when its absolute URL contains {@code host} and {@code filter.exist}
  * returns {@code false} for it. Every eligible URL is passed to {@code redis.putURL} and
  * printed to standard output.
  *
  * @date 2014-07-29 11:37:21
  * @author Junco
  * @param doc parsed page whose {@code a[href]} elements are scanned
  * @param host substring that a link's absolute URL must contain to be accepted
  * @param redis receives each accepted URL through {@code putURL}
  * @param filter consulted through {@code exist}; URLs it reports as existing are skipped
  */
 public void getAllURL(Document doc, String host, hiveRedis redis, hiveBloomFilter filter) {
   for (Element anchor : doc.select("a[href]")) {
     // "abs:" asks Jsoup for the href resolved against the document's base URI.
     String absoluteHref = anchor.attr("abs:href");

     // Host check comes first so the filter is only consulted for on-host links.
     if (!absoluteHref.contains(host)) {
       continue;
     }
     // NOTE(review): nothing visible here adds the URL to the filter; presumably exist() or
     // putURL() records it — confirm, otherwise duplicates are never suppressed.
     if (filter.exist(absoluteHref)) {
       continue;
     }

     redis.putURL(absoluteHref);
     System.out.println(absoluteHref);
   }
 }
Пример #2
0
 /**
  * Entry point: crawls outward from a fixed seed URL.
  *
  * <p>The seed is put into Redis, then each iteration fetches the current URL, saves the page,
  * stores the page's links via {@code getAllURL}, removes the current URL from Redis and takes
  * the next one from Redis. The loop ends when Redis reports length 0 or returns no next URL.
  *
  * @param args command-line arguments (unused)
  * @throws IOException if any crawl step raises one; the Redis cleanup still runs first
  */
 public static void main(String[] args) throws IOException {
   String url = "http://politics.people.com.cn/n/2014/0801/c1024-25380603.html";
   hiveRedis redis = new hiveRedis("Junco");
   hiveVertical hv = new hiveVertical();
   hiveBloomFilter filter = new hiveBloomFilter();
   redis.connectRedis();
   try {
     redis.putURL(url);
     while (redis.getLength() != 0) {
       Document doc = hv.getText(url);
       hv.saveText2File(doc);
       hv.getAllURL(doc, hv.getHost(url), redis, filter);
       // Drop the URL just processed before asking Redis for the next one.
       redis.remove(url);
       url = redis.getURL();
       if (url == null) {
         System.out.println("Done");
         break;
       }
     }
   } finally {
     // Always run the Redis cleanup, even when a crawl step throws.
     redis.destory();
   }
 }