Ejemplo n.º 1
0
  /**
   * Downloads the page at {@code url} and stores its HTML in {@code saveFileName}.
   *
   * <p>Nothing is downloaded when {@code saveFileName} already exists. A downloaded page is only
   * written out when its body contains a non-empty {@code h1[class=title]} element; otherwise it
   * is reported as missing. The parsed document is kept in the {@code doc} field either way.
   *
   * <p>Download failures are reported on standard error and the method returns normally, so a
   * caller iterating over many URLs is not interrupted by a single bad page.
   *
   * @param url address of the page to download
   * @param saveFileName path of the file that receives the page HTML
   */
  public void fetch(String url, String saveFileName) {

    if (new File(saveFileName).exists()) {
      System.out.println("网页已存在!");
      return;
    }

    try {
      this.doc =
          Jsoup.connect(url)
              .userAgent("Mozilla/5.0 (Windows NT 6.1; rv:5.0)")
              .cookie("auth", "token")
              .timeout(1000)
              .get();

      // An empty title is treated as "this page does not exist" and nothing is saved.
      String title = doc.body().select("h1[class=title]").text();
      if (title.isEmpty()) {
        System.out.println("网页不存在!");
        return;
      }

      // Announce success only after the write has actually been attempted.
      FileHelp.writeFile(saveFileName, doc.html());
      System.out.println("网页已保存:" + saveFileName);
    } catch (IOException e) {
      // Best-effort download: keep going, but do not hide the failure from the operator.
      System.err.println("网页抓取失败:" + url + " (" + e + ")");
    }
  }
Ejemplo n.º 2
0
  /**
   * Batch entry point: runs a {@code Tech2ipoCrawler} over the files listed under
   * {@code D:/data/sites/tech2ipo.com} and saves one XML file per input under
   * {@code D:/data/xml/tech2ipo.com/}.
   *
   * <p>The output name is the part of the input path between the {@code tech2ipo.com} directory
   * and the trailing {@code index.html}, with backslashes replaced by dashes. A file that fails
   * at any step is reported on standard error and skipped; the remaining files are still
   * processed.
   *
   * @param args unused
   */
  public static void main(String[] args) {

    String filePath = "D:/data/sites/tech2ipo.com";
    List<String> filelist = new ArrayList<String>();
    // Presumably fills filelist with the ".html" files found below filePath -- confirm in FileHelp.
    FileHelp.refreshFileList(filePath, filelist, ".html");

    Tech2ipoCrawler crawler = new Tech2ipoCrawler();
    for (String fileName : filelist) {
      try {
        crawler.setFile(new File(fileName));
        crawler.fetch();
        crawler.getNewsItem().setSource("Tech2IPO");

        // +13 skips "tech2ipo.com" (12 chars) plus the separator after it; -1 drops the
        // separator in front of "index.html". A path without "index.html" makes substring
        // throw, which lands in the catch below and skips the file.
        String name =
            fileName.substring(
                fileName.lastIndexOf("tech2ipo.com") + 13, fileName.lastIndexOf("index.html") - 1);
        crawler.saveFile("D:/data/xml/tech2ipo.com/" + name.replace("\\", "-") + ".xml", true);
      } catch (Exception e) {
        // One bad file must not stop the batch, but the failure should be visible.
        System.err.println("Skipping " + fileName + ": " + e);
      }
    }
  }