/**
 * Extracts the links from a Jsoup-parsed document and queues the ones that
 * qualify.
 *
 * <p>Every {@code <a>} element carrying an {@code href} attribute is visited and
 * its {@code abs:href} value (the absolute form of the link) is read. A link is
 * passed to {@code redis.putURL} and echoed to standard output only when it
 * contains {@code host} as a substring and {@code filter.exist} returns
 * {@code false} for it; all other links are skipped.
 *
 * <p>NOTE(review): {@code filter.exist} is presumably a "seen before" check;
 * whether it also records the URL is not visible from this method - confirm.
 *
 * @date 2014-07-29 11:37:21
 * @author Junco
 * @param doc    parsed document whose anchors are scanned
 * @param host   text a link must contain to be kept (plain substring match)
 * @param redis  store that receives each accepted URL
 * @param filter filter consulted before a URL is accepted
 */
public void getAllURL(Document doc, String host, hiveRedis redis, hiveBloomFilter filter) {
    Elements anchors = doc.select("a[href]");
    for (Element anchor : anchors) {
        String absoluteUrl = anchor.attr("abs:href");
        // Skip links that do not mention the host or that the filter already reports.
        if (!absoluteUrl.contains(host) || filter.exist(absoluteUrl)) {
            continue;
        }
        redis.putURL(absoluteUrl);
        System.out.println(absoluteUrl);
    }
}
/**
 * Runs a crawl starting from a hard-coded seed article URL.
 *
 * <p>The seed is pushed into the Redis-backed store, then the loop repeatedly
 * fetches the current URL, saves the document to a file, queues the links found
 * in it (restricted to the current URL's host), removes the current URL from
 * the store and asks the store for the next one. The loop ends when the store
 * reports a length of zero or hands back {@code null}.
 *
 * <p>The store is released via {@code destory()} in a {@code finally} block so
 * it is cleaned up even when fetching or saving a page throws.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared by the original signature; presumably raised by
 *                     page fetching or file writing - confirm against hiveVertical
 */
public static void main(String[] args) throws IOException {
    String url = "http://politics.people.com.cn/n/2014/0801/c1024-25380603.html";
    hiveRedis redis = new hiveRedis("Junco");
    hiveVertical hv = new hiveVertical();
    hiveBloomFilter filter = new hiveBloomFilter();

    redis.connectRedis();
    try {
        redis.putURL(url);
        while (redis.getLength() != 0) {
            Document doc = hv.getText(url);
            hv.saveText2File(doc);
            hv.getAllURL(doc, hv.getHost(url), redis, filter);

            // Drop the URL just processed before asking for the next one.
            redis.remove(url);
            url = redis.getURL();
            if (url == null) {
                System.out.println("Done");
                break;
            }
        }
    } finally {
        // Always release the store, including when the crawl aborts with an exception.
        redis.destory();
    }
}