private Text handleRedirect( Text url, CrawlDatum datum, String urlString, String newUrl, boolean temp, String redirType) throws MalformedURLException, URLFilterException { newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); newUrl = urlFilters.filter(newUrl); try { String origHost = new URL(urlString).getHost().toLowerCase(); String newHost = new URL(newUrl).getHost().toLowerCase(); if (ignoreExternalLinks) { if (!origHost.equals(newHost)) { if (LOG.isDebugEnabled()) { LOG.debug( " - ignoring redirect " + redirType + " from " + urlString + " to " + newUrl + " because external links are ignored"); } return null; } } if (ignoreInternalLinks) { if (origHost.equals(newHost)) { if (LOG.isDebugEnabled()) { LOG.debug( " - ignoring redirect " + redirType + " from " + urlString + " to " + newUrl + " because internal links are ignored"); } return null; } } } catch (MalformedURLException e) { } if (newUrl != null && !newUrl.equals(urlString)) { reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp); url = new Text(newUrl); if (maxRedirect > 0) { redirecting = true; redirectCount++; if (LOG.isDebugEnabled()) { LOG.debug(" - " + redirType + " redirect to " + url + " (fetching now)"); } return url; } else { CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED, datum.getFetchInterval(), datum.getScore()); // transfer existing metadata newDatum.getMetaData().putAll(datum.getMetaData()); try { scfilters.initialScore(url, newDatum); } catch (ScoringFilterException e) { e.printStackTrace(); } if (reprUrl != null) { newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl)); } output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED); if (LOG.isDebugEnabled()) { LOG.debug(" - " + redirType + " redirect to " + url + " (fetching later)"); } return null; } } else { if (LOG.isDebugEnabled()) { LOG.debug( " - " + redirType + " redirect skipped: " + (newUrl != null ? "to same url" : "filtered")); } return null; } }