/** 启动方法,根据关键字解析种子列表,并设置规则 */ public void start(String kw) { UrlseedDao dao = new UrlseedDao(); List<String> list = dao.findAll(null); for (String li : list) { String match = li.substring(li.indexOf(".") + 1, li.lastIndexOf(".")); regexRule.addRule("^http://.*" + match + ".*/.*"); // 忽略大小写? regexRule.addRule("^https://.*" + match + ".*/.*"); regexRule.addRule("^ftp://.*" + match + ".*/.*"); // regexRule.addRule("-.*[.][(jpg)|(png)|(gif)|(bmp)|(jpeg)]$"); search(li, kw); } }
/**
 * Handles one fetched page: prints its HTML, saves its content to disk when the URL ends
 * in a known image suffix, and returns the links collected from the page's document.
 *
 * <p>Image detection is a plain suffix check on the URL ({@code jpg}, {@code png},
 * {@code gif}). The saved file is named {@code download/<counter>.<suffix>}, so the
 * extension reflects the URL instead of always being {@code .jpg}.
 *
 * <p>The method pauses for 1.5 seconds before returning (presumably to throttle requests —
 * confirm with the crawler configuration). If the thread is interrupted during that pause
 * the interrupt flag is restored so the caller can observe it.
 *
 * @param page the fetched page
 * @return links gathered from the page's document using the {@code .*meishij.*} rule
 */
@Override
public Links visitAndGetNextLinks(Page page) {
    System.out.println(page.getHtml());

    String url = page.getUrl();

    // Determine which image suffix, if any, the URL ends with.
    String imageExtension = null;
    for (String candidate : new String[] {"jpg", "png", "gif"}) {
        if (url.endsWith(candidate)) {
            imageExtension = candidate;
            break;
        }
    }

    if (imageExtension != null) {
        try {
            FileUtils.writeFileWithParent(
                    "download/" + id.incrementAndGet() + "." + imageExtension,
                    page.getContent());
            System.out.println("download:" + url);
        } catch (IOException e) {
            // Best effort: a failed download must not stop link extraction.
            e.printStackTrace();
        }
    }

    MyLinks nextLinks = new MyLinks();
    RegexRule linkRule = new RegexRule();
    linkRule.addRule(".*meishij.*");
    nextLinks.addAllFromDocument(page.getDoc(), linkRule);
    // NOTE(review): filterImgUrl is defined elsewhere; its exact effect on nextLinks is not visible here.
    nextLinks.filterImgUrl(page.getDoc(), linkRule);
    System.out.println(nextLinks.size());

    try {
        Thread.sleep(1500);
    } catch (InterruptedException e) {
        // Restore the flag so code further up the stack can react to the interruption.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
    return nextLinks;
}