Пример #1
0
  /**
   * Fetches the content behind the current URL item, copies the response metadata onto the
   * item and, for a 2xx answer, passes the content stream to {@code parseContent}.
   *
   * <p>The crawl cache is consulted first; the supplied downloader is only used when the cache
   * returns nothing for the normalised URI. Exceptions are caught here: each one is logged,
   * recorded through {@code setError} and reflected in the item's fetch or parser status.
   *
   * @param httpDownloader downloader used when the crawl cache has no entry for the URI
   * @return the item obtained from the cache or from the downloader, or {@code null} when a
   *     failure occurred before any item was obtained
   */
  public DownloadItem download(HttpDownloader httpDownloader) {
    synchronized (this) {
      InputStream contentStream = null;
      DownloadItem item = null;
      try {
        URL target = urlItem.getURL();
        if (target == null) {
          throw new MalformedURLException("Malformed URL: " + urlItem.getUrl());
        }
        // Round trip through URI to normalise the URL.
        URI targetUri = target.toURI();
        target = targetUri.toURL();

        if (credentialManager == null) {
          credentialItem = null;
        } else {
          credentialItem = credentialManager.matchCredential(target);
        }

        String targetAsString = target.toExternalForm();
        item = crawlCacheManager.loadCache(targetUri);
        boolean cacheHit = item != null;

        if (cacheHit) {
          if (Logging.isDebug) {
            Logging.debug("Crawl cache deliver: " + targetUri);
          }
        } else {
          List<CookieItem> cookies = cookieManager.getItems(targetAsString);
          List<HeaderItem> headers = headerManager.getItems(targetAsString);
          item = httpDownloader.get(targetUri, credentialItem, headers, cookies);
        }

        // Copy the response metadata onto the URL item. The fetch status starts as FETCHED
        // and is refined further down once the HTTP code has been inspected.
        urlItem.setContentDispositionFilename(item.getContentDispositionFilename());
        urlItem.setContentBaseType(item.getContentBaseType());
        urlItem.setContentTypeCharset(item.getContentTypeCharset());
        urlItem.setContentEncoding(item.getContentEncoding());
        urlItem.setContentLength(item.getContentLength());
        urlItem.setLastModifiedDate(item.getLastModified());
        urlItem.setFetchStatus(FetchStatus.FETCHED);
        urlItem.setHeaders(item.getHeaders());

        Integer httpStatus = item.getStatusCode();
        if (httpStatus == null) {
          throw new IOException("Http status is null");
        }
        urlItem.setResponseCode(httpStatus);

        redirectUrlLocation = item.getRedirectLocation();
        if (redirectUrlLocation != null) {
          urlItem.setRedirectionUrl(redirectUrlLocation.toURL().toExternalForm());
        }

        urlItem.setBacklinkCount(config.getUrlManager().countBackLinks(urlItem.getUrl()));

        // Dispatch on the status class (hundreds digit); anything outside 2xx-5xx keeps the
        // FETCHED status set above.
        int status = httpStatus.intValue();
        switch (status / 100) {
          case 2:
            // A freshly downloaded item is stored in the crawl cache, which supplies the
            // stream to parse; a cached item exposes its stream directly.
            if (cacheHit) {
              contentStream = item.getContentInputStream();
            } else {
              contentStream = crawlCacheManager.storeCache(item);
            }
            parseContent(contentStream);
            break;
          case 3:
            // 301 is the only code flagged as permanent; 300 itself is left untouched.
            if (status == 301) {
              urlItem.setFetchStatus(FetchStatus.REDIR_PERM);
            } else if (status > 301) {
              urlItem.setFetchStatus(FetchStatus.REDIR_TEMP);
            }
            break;
          case 4:
            urlItem.setFetchStatus(FetchStatus.GONE);
            break;
          case 5:
            urlItem.setFetchStatus(FetchStatus.HTTP_ERROR);
            break;
          default:
            break;
        }
      } catch (FileNotFoundException notFound) {
        Logging.info("FileNotFound: " + urlItem.getUrl());
        urlItem.setFetchStatus(FetchStatus.GONE);
        setError("FileNotFound: " + urlItem.getUrl());
      } catch (LimitException limitReached) {
        Logging.warn(limitReached.toString() + " (" + urlItem.getUrl() + ")");
        urlItem.setFetchStatus(FetchStatus.SIZE_EXCEED);
        setError(limitReached.getMessage());
      } catch (InstantiationException parserFailure) {
        // The three reflective failures below are reported as parser errors.
        Logging.error(parserFailure.getMessage(), parserFailure);
        urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
        setError(parserFailure.getMessage());
      } catch (IllegalAccessException parserFailure) {
        Logging.error(parserFailure.getMessage(), parserFailure);
        urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
        setError(parserFailure.getMessage());
      } catch (ClassNotFoundException parserFailure) {
        Logging.error(parserFailure.getMessage(), parserFailure);
        urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
        setError(parserFailure.getMessage());
      } catch (URISyntaxException badUrl) {
        Logging.warn(badUrl.getMessage(), badUrl);
        urlItem.setFetchStatus(FetchStatus.URL_ERROR);
        setError(badUrl.getMessage());
      } catch (MalformedURLException badUrl) {
        Logging.warn(badUrl.getMessage(), badUrl);
        urlItem.setFetchStatus(FetchStatus.URL_ERROR);
        setError(badUrl.getMessage());
      } catch (Exception failure) {
        // Remaining I/O errors, illegal arguments and any other exception share the same
        // generic ERROR handling.
        Logging.error(failure.getMessage(), failure);
        urlItem.setFetchStatus(FetchStatus.ERROR);
        setError(failure.getMessage());
      } finally {
        IOUtils.close(contentStream);
      }
      return item;
    }
  }