Пример #1
0
 public void run() {
   String url = pageInfo.getUrl();
   try {
     int distance = pageInfo.getDistance();
     logger.info("Crawling " + url.toString());
     download();
     extractLinks(); // 抽取链接
     crawler.reportPageFetched(pageInfo); // 保存需要store的
     if (pageInfo.getCrawlFlag() == CrawlAction.FOLLOW) {
       logger.debug("url:" + url + " ----------------follow");
       crawler.reportLinks(pageInfo.getLinks(), distance, url);
     } else if (pageInfo.getCrawlFlag() == CrawlAction.STORE) {
       logger.debug("url:" + url + " ----------------store");
       crawler.reportLinks(pageInfo.getLinks(), distance + 1, url);
     }
   } catch (WontFetchException e) {
     logger.error("Won't fetch " + url);
     e.printStackTrace();
   } catch (Exception e) {
     logger.error("Error crawling %s", e, url);
     e.printStackTrace();
   } finally {
     crawler.reportJobDone();
   }
 }
  public static void main(String[] args) {
    if (args.length == 0) {
      System.out.println(USAGE);
    }

    XMLCrawlTaskSpecFactory factory = new XMLCrawlTaskSpecFactory();

    for (String taskFileName : args) {
      if (taskFileName.endsWith(".spring.xml")) {
        ApplicationContext context = new FileSystemXmlApplicationContext("file:" + taskFileName);
        Crawler crawler = context.getBean("crawler", Crawler.class);
        crawler.run();
      } else {
        CrawlTaskSpec taskSpec = factory.load(new File(taskFileName));
        CrawlTask task = taskSpec.createCrawlTask();
        task.run();
      }
    }
  }
Пример #3
0
  public void download() throws WontFetchException {
    logger.debug("Downloading %s", pageInfo.getUrl());
    HttpGet request = new HttpGet(pageInfo.getUrl());
    request.setHeader("User-Agent", USER_AGENT);
    request.setHeader("Referer", pageInfo.getReferURL());
    //		request.setHeader("Accept-Encoding", "gzip");
    request.setHeader("Connection", "keep-alive");

    HttpResponse response;
    try {
      response = httpClient.execute(request);
    } catch (IOException e) {
      logger.error("Error communicating to server: " + pageInfo.getUrl());
      throw new WontFetchException();
    }
    int statusCode = response.getStatusLine().getStatusCode();
    pageInfo.setHttpStatus(statusCode);
    if (statusCode == HttpStatus.SC_NOT_MODIFIED) {

    } else if (statusCode != HttpStatus.SC_OK) {

    }
    for (Header header : response.getAllHeaders()) {
      String name = header.getName();
      String value = header.getValue();
      pageInfo.getHeaders().put(name, value);
    }

    String contentType = pageInfo.getHeaders().get("Content-Type");
    if (contentType != null
        && !contentType.matches("(application|text)/(xml|xhtml|html)(\\s*;.*)?")) {
      logger.error("Wrong content type: " + contentType);
      throw new WontFetchException();
    }

    HttpEntity entity = response.getEntity();
    try {
      String body = IOUtils.toString(entity.getContent(), crawler.getEncoding());
      pageInfo.setContent(body);
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
Пример #4
0
  public void extractLinks() throws Exception {
    logger.debug("Extracting links " + pageInfo.getUrl());
    String content = pageInfo.getContent();
    if (content == null || content.length() == 0) {
      return;
    }
    URI uri = new URI(pageInfo.getUrl());

    Parser parser = new Parser();
    parser.setInputHTML(content);
    NodeList nodeList = parser.parse(new TagNameFilter("A"));
    logger.debug("get links from " + pageInfo.getUrl() + " size : " + nodeList.size());
    for (int i = 0; i < nodeList.size(); i++) {
      Node node = nodeList.elementAt(i);

      LinkTag tag = (LinkTag) node;
      String linkHref = tag.extractLink();
      if (linkHref.indexOf("http") != linkHref.lastIndexOf("http")) {
        continue;
      }
      try {
        URI linkUri = uri.resolve(linkHref);
        String link = linkUri.toString();
        if (link != null && link.length() > 0) {
          for (FilterRule fr : crawler.getFilterRules()) {
            CrawlAction ca = fr.judge(link);
            if (ca == CrawlAction.STORE
                || ca == CrawlAction.FOLLOW
                || ca == CrawlAction.FOLLOW_STORE) {
              logger.debug("linkUri : " + link + " -- ca : " + ca.toString());
              pageInfo.getLinks().add(link);
            }
          }
        }
      } catch (Exception ignore) {
      }
    }
  }