コード例 #1
0
ファイル: FetcherThread.java プロジェクト: Ccccz/nutch
  /**
   * Decides how to follow a redirect from {@code urlString} to {@code newUrl}.
   *
   * <p>The target is first normalized (fetcher scope) and passed through the URL filters. The
   * redirect is dropped, and {@code null} returned, when:
   *
   * <ul>
   *   <li>the filters return {@code null} for the target,
   *   <li>{@code ignoreExternalLinks} is set and the target host differs from the original host,
   *   <li>{@code ignoreInternalLinks} is set and the target host equals the original host, or
   *   <li>the target is equal to {@code urlString}.
   * </ul>
   *
   * <p>Otherwise {@code reprUrl} is updated via {@code URLUtil.chooseRepr}. If {@code maxRedirect}
   * is positive, {@code redirecting} is set, {@code redirectCount} is incremented and the target
   * is returned so the caller can fetch it immediately. If not, a new {@code STATUS_LINKED} datum
   * carrying the fetch interval, score and metadata of {@code datum} is emitted through
   * {@code output(...)} and {@code null} is returned.
   *
   * @param url not read on entry; it is only reassigned to the redirect target inside this method
   * @param datum datum of the redirecting page; source of fetch interval, score and metadata for
   *     the datum emitted when the redirect is not followed immediately
   * @param urlString the URL that issued the redirect
   * @param newUrl the raw redirect target, before normalization and filtering
   * @param temp forwarded to {@code URLUtil.chooseRepr} (presumably "temporary redirect" - confirm
   *     against {@code URLUtil})
   * @param redirType label used only in log messages
   * @return the redirect target to fetch right away, or {@code null} if the redirect was skipped
   *     or queued for later
   * @throws MalformedURLException propagated from the calls outside the host check (presumably
   *     URL normalization); unparsable URLs in the host check itself are logged and tolerated
   * @throws URLFilterException presumably propagated from {@code urlFilters.filter}
   */
  private Text handleRedirect(
      Text url, CrawlDatum datum, String urlString, String newUrl, boolean temp, String redirType)
      throws MalformedURLException, URLFilterException {
    newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
    newUrl = urlFilters.filter(newUrl);

    // Handle a rejected target explicitly rather than relying on new URL(null) throwing.
    if (newUrl == null) {
      if (LOG.isDebugEnabled()) {
        LOG.debug(" - " + redirType + " redirect skipped: " + "filtered");
      }
      return null;
    }

    // Only pay for URL parsing when one of the host-based restrictions is active.
    if (ignoreExternalLinks || ignoreInternalLinks) {
      try {
        String origHost = new URL(urlString).getHost();
        String newHost = new URL(newUrl).getHost();
        // Host names are case-insensitive; equalsIgnoreCase does not depend on the default locale.
        boolean sameHost = origHost.equalsIgnoreCase(newHost);

        if (ignoreExternalLinks && !sameHost) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(
                " - ignoring redirect "
                    + redirType
                    + " from "
                    + urlString
                    + " to "
                    + newUrl
                    + " because external links are ignored");
          }
          return null;
        }

        if (ignoreInternalLinks && sameHost) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(
                " - ignoring redirect "
                    + redirType
                    + " from "
                    + urlString
                    + " to "
                    + newUrl
                    + " because internal links are ignored");
          }
          return null;
        }
      } catch (MalformedURLException e) {
        // Best effort: if either URL cannot be parsed the host restriction cannot be evaluated,
        // so the redirect is processed as if no restriction applied. Record why.
        if (LOG.isDebugEnabled()) {
          LOG.debug(
              " - could not compare hosts for redirect from " + urlString + " to " + newUrl, e);
        }
      }
    }

    if (newUrl.equals(urlString)) {
      if (LOG.isDebugEnabled()) {
        LOG.debug(" - " + redirType + " redirect skipped: " + "to same url");
      }
      return null;
    }

    reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
    url = new Text(newUrl);

    if (maxRedirect > 0) {
      // Redirects are followed inline: hand the target back to the caller.
      redirecting = true;
      redirectCount++;
      if (LOG.isDebugEnabled()) {
        LOG.debug(" - " + redirType + " redirect to " + url + " (fetching now)");
      }
      return url;
    }

    // Redirects are not followed inline: emit the target as a linked page for a later fetch.
    CrawlDatum newDatum =
        new CrawlDatum(CrawlDatum.STATUS_LINKED, datum.getFetchInterval(), datum.getScore());
    // transfer existing metadata
    newDatum.getMetaData().putAll(datum.getMetaData());
    try {
      scfilters.initialScore(url, newDatum);
    } catch (ScoringFilterException e) {
      // Scoring failure is not fatal; the datum is still emitted with the score copied above.
      LOG.warn("Failed to compute initial score for redirect target " + url, e);
    }
    if (reprUrl != null) {
      newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
    }
    output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
    if (LOG.isDebugEnabled()) {
      LOG.debug(" - " + redirType + " redirect to " + url + " (fetching later)");
    }
    return null;
  }