Пример #1
0
  /**
   * Fetches the page at {@code pageInfo.getUrl()} with an HTTP GET and records the result on
   * {@code pageInfo}: the HTTP status code, every response header, and the response body decoded
   * with {@code crawler.getEncoding()}.
   *
   * <p>If the response carries no body (no entity), the status and headers are still recorded
   * and the content of {@code pageInfo} is left untouched.
   *
   * @throws WontFetchException if the request fails with an {@link IOException}, if the
   *     {@code Content-Type} header is present but is not one of the accepted XML/HTML types,
   *     or if the response body cannot be read
   */
  public void download() throws WontFetchException {
    logger.debug("Downloading %s", pageInfo.getUrl());
    HttpGet request = new HttpGet(pageInfo.getUrl());
    request.setHeader("User-Agent", USER_AGENT);
    request.setHeader("Referer", pageInfo.getReferURL());
    //		request.setHeader("Accept-Encoding", "gzip");
    request.setHeader("Connection", "keep-alive");

    HttpResponse response;
    try {
      response = httpClient.execute(request);
    } catch (IOException e) {
      // Include the exception text so the cause of the failure is not lost.
      logger.error("Error communicating to server: " + pageInfo.getUrl() + " (" + e + ")");
      throw new WontFetchException();
    }

    // The status is only recorded; no status code is treated specially here.
    pageInfo.setHttpStatus(response.getStatusLine().getStatusCode());

    for (Header header : response.getAllHeaders()) {
      pageInfo.getHeaders().put(header.getName(), header.getValue());
    }

    // A missing Content-Type is accepted; a present one must be an XML/HTML type.
    String contentType = pageInfo.getHeaders().get("Content-Type");
    if (contentType != null
        && !contentType.matches("(application|text)/(xml|xhtml|html)(\\s*;.*)?")) {
      logger.error("Wrong content type: " + contentType);
      // The body will never be read, so release the underlying connection explicitly.
      request.abort();
      throw new WontFetchException();
    }

    HttpEntity entity = response.getEntity();
    if (entity == null) {
      // Responses without a body (e.g. 304 Not Modified) have no entity to read.
      logger.debug("No response body for %s", pageInfo.getUrl());
      return;
    }

    try {
      java.io.InputStream stream = entity.getContent();
      try {
        pageInfo.setContent(IOUtils.toString(stream, crawler.getEncoding()));
      } finally {
        // Closing the content stream hands the connection back to the client.
        stream.close();
      }
    } catch (IOException e) {
      logger.error("Error reading response body: " + pageInfo.getUrl() + " (" + e + ")");
      request.abort();
      throw new WontFetchException();
    }
  }