public void fetch(String url, String saveFileName) { if (new File(saveFileName).exists()) { System.out.println("网页已存在!"); return; } try { this.doc = Jsoup.connect(url) .userAgent("Mozilla/5.0 (Windows NT 6.1; rv:5.0)") .cookie("auth", "token") .timeout(1000) .get(); String title = doc.body().select("h1[class=title]").text(); if (title.equals("")) { System.out.println("网页不存在!"); return; } System.out.println("网页已保存:" + saveFileName); FileHelp.writeFile(saveFileName, doc.html()); } catch (IOException e) { // e.printStackTrace(); } }
public static void main(String[] args) { String filePath = "D:/data/sites/tech2ipo.com"; List<String> filelist = new ArrayList<String>(); FileHelp.refreshFileList(filePath, filelist, ".html"); Tech2ipoCrawler crawler = new Tech2ipoCrawler(); for (String fileName : filelist) { try { crawler.setFile(new File(fileName)); crawler.fetch(); crawler.getNewsItem().setSource("Tech2IPO"); String name = fileName.substring( fileName.lastIndexOf("tech2ipo.com") + 13, fileName.lastIndexOf("index.html") - 1); // logger.info("D:/data/sites/tech2ipo.com=" + name); // logger.info(fileName); crawler.saveFile("D:/data/xml/tech2ipo.com/" + name.replace("\\", "-") + ".xml", true); // break; } catch (Exception e) { continue; } } }