public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException { // convert on the fly from the old format if (key instanceof UTF8) { newKey.set(key.toString()); key = newKey; } if (filters != null) { try { if (filters.filter(((Text) key).toString()) == null) { return; } } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Cannot filter key " + key + ": " + e.getMessage()); } } } output.collect(key, value); }
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String url = value.toString().trim(); // value is line of text if (url != null && (url.length() == 0 || url.startsWith("#"))) { /* Ignore line that start with # */ return; } // if tabs : metadata that could be stored // must be name=value and separated by \t float customScore = -1f; int customInterval = interval; Map<String, String> metadata = new TreeMap<String, String>(); if (url.indexOf("\t") != -1) { String[] splits = url.split("\t"); url = splits[0]; for (int s = 1; s < splits.length; s++) { // find separation between name and value int indexEquals = splits[s].indexOf("="); if (indexEquals == -1) { // skip anything without a = continue; } String metaname = splits[s].substring(0, indexEquals); String metavalue = splits[s].substring(indexEquals + 1); if (metaname.equals(nutchScoreMDName)) { try { customScore = Float.parseFloat(metavalue); } catch (NumberFormatException nfe) { } } else if (metaname.equals(nutchFetchIntervalMDName)) { try { customInterval = Integer.parseInt(metavalue); } catch (NumberFormatException nfe) { } } else metadata.put(metaname, metavalue); } } try { url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT); url = filters.filter(url); // filter the url } catch (Exception e) { LOG.warn("Skipping " + url + ":" + e); url = null; } if (url == null) { context.getCounter("injector", "urls_filtered").increment(1); return; } else { // if it passes String reversedUrl = TableUtil.reverseUrl(url); // collect it WebPage row = new WebPage(); row.setFetchTime(curTime); row.setFetchInterval(customInterval); // now add the metadata Iterator<String> keysIter = metadata.keySet().iterator(); while (keysIter.hasNext()) { String keymd = keysIter.next(); String valuemd = metadata.get(keymd); row.putToMetadata(new Utf8(keymd), ByteBuffer.wrap(valuemd.getBytes())); } if (customScore != -1) row.setScore(customScore); else row.setScore(scoreInjected); try { scfilters.injectedScore(url, row); } catch (ScoringFilterException e) { if (LOG.isWarnEnabled()) { LOG.warn( "Cannot filter injected score for url " + url + ", using default (" + e.getMessage() + ")"); } } context.getCounter("injector", "urls_injected").increment(1); row.putToMarkers(DbUpdaterJob.DISTANCE, new Utf8(String.valueOf(0))); Mark.INJECT_MARK.putMark(row, YES_STRING); context.write(reversedUrl, row); } }
private Text handleRedirect( Text url, CrawlDatum datum, String urlString, String newUrl, boolean temp, String redirType) throws MalformedURLException, URLFilterException { newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); newUrl = urlFilters.filter(newUrl); try { String origHost = new URL(urlString).getHost().toLowerCase(); String newHost = new URL(newUrl).getHost().toLowerCase(); if (ignoreExternalLinks) { if (!origHost.equals(newHost)) { if (LOG.isDebugEnabled()) { LOG.debug( " - ignoring redirect " + redirType + " from " + urlString + " to " + newUrl + " because external links are ignored"); } return null; } } if (ignoreInternalLinks) { if (origHost.equals(newHost)) { if (LOG.isDebugEnabled()) { LOG.debug( " - ignoring redirect " + redirType + " from " + urlString + " to " + newUrl + " because internal links are ignored"); } return null; } } } catch (MalformedURLException e) { } if (newUrl != null && !newUrl.equals(urlString)) { reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp); url = new Text(newUrl); if (maxRedirect > 0) { redirecting = true; redirectCount++; if (LOG.isDebugEnabled()) { LOG.debug(" - " + redirType + " redirect to " + url + " (fetching now)"); } return url; } else { CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED, datum.getFetchInterval(), datum.getScore()); // transfer existing metadata newDatum.getMetaData().putAll(datum.getMetaData()); try { scfilters.initialScore(url, newDatum); } catch (ScoringFilterException e) { e.printStackTrace(); } if (reprUrl != null) { newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl)); } output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED); if (LOG.isDebugEnabled()) { LOG.debug(" - " + redirType + " redirect to " + url + " (fetching later)"); } return null; } } else { if (LOG.isDebugEnabled()) { LOG.debug( " - " + redirType + " redirect skipped: " + (newUrl != null ? "to same url" : "filtered")); } return null; } }
public void map( WritableComparable<?> key, Text value, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { String url = value.toString(); // value is line of text if (url != null && url.trim().startsWith("#")) { /* Ignore line that start with # */ return; } // if tabs : metadata that could be stored // must be name=value and separated by \t float customScore = -1f; int customInterval = interval; int fixedInterval = -1; Map<String, String> metadata = new TreeMap<String, String>(); if (url.indexOf("\t") != -1) { String[] splits = url.split("\t"); url = splits[0]; for (int s = 1; s < splits.length; s++) { // find separation between name and value int indexEquals = splits[s].indexOf("="); if (indexEquals == -1) { // skip anything without a = continue; } String metaname = splits[s].substring(0, indexEquals); String metavalue = splits[s].substring(indexEquals + 1); if (metaname.equals(nutchScoreMDName)) { try { customScore = Float.parseFloat(metavalue); } catch (NumberFormatException nfe) { } } else if (metaname.equals(nutchFetchIntervalMDName)) { try { customInterval = Integer.parseInt(metavalue); } catch (NumberFormatException nfe) { } } else if (metaname.equals(nutchFixedFetchIntervalMDName)) { try { fixedInterval = Integer.parseInt(metavalue); } catch (NumberFormatException nfe) { } } else metadata.put(metaname, metavalue); } } try { url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT); url = filters.filter(url); // filter the url } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Skipping " + url + ":" + e); } url = null; } if (url == null) { reporter.getCounter("injector", "urls_filtered").increment(1); } else { // if it passes value.set(url); // collect it CrawlDatum datum = new CrawlDatum(); datum.setStatus(CrawlDatum.STATUS_INJECTED); // Is interval custom? Then set as meta data if (fixedInterval > -1) { // Set writable using float. Flaot is used by AdaptiveFetchSchedule datum .getMetaData() .put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval)); datum.setFetchInterval(fixedInterval); } else { datum.setFetchInterval(customInterval); } datum.setFetchTime(curTime); // now add the metadata Iterator<String> keysIter = metadata.keySet().iterator(); while (keysIter.hasNext()) { String keymd = keysIter.next(); String valuemd = metadata.get(keymd); datum.getMetaData().put(new Text(keymd), new Text(valuemd)); } if (customScore != -1) datum.setScore(customScore); else datum.setScore(scoreInjected); try { scfilters.injectedScore(value, datum); } catch (ScoringFilterException e) { if (LOG.isWarnEnabled()) { LOG.warn( "Cannot filter injected score for url " + url + ", using default (" + e.getMessage() + ")"); } } reporter.getCounter("injector", "urls_injected").increment(1); output.collect(value, datum); } }
/** * Parses given web page and stores parsed content within page. Puts a meta-redirect to outlinks. * * @param key * @param page */ public void process(String key, WebPage page) { String url = TableUtil.unreverseUrl(key); byte status = page.getStatus().byteValue(); if (status != CrawlStatus.STATUS_FETCHED) { if (LOG.isDebugEnabled()) { LOG.debug("Skipping " + url + " as status is: " + CrawlStatus.getName(status)); } return; } Parse parse; try { parse = parse(url, page); } catch (ParserNotFound e) { // do not print stacktrace for the fact that some types are not mapped. LOG.warn("No suitable parser found: " + e.getMessage()); return; } catch (final Exception e) { LOG.warn("Error parsing: " + url + ": " + StringUtils.stringifyException(e)); return; } if (parse == null) { return; } org.apache.nutch.storage.ParseStatus pstatus = parse.getParseStatus(); page.setParseStatus(pstatus); if (ParseStatusUtils.isSuccess(pstatus)) { if (pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) { String newUrl = ParseStatusUtils.getMessage(pstatus); int refreshTime = Integer.parseInt(ParseStatusUtils.getArg(pstatus, 1)); try { newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); if (newUrl == null) { LOG.warn("redirect normalized to null " + url); return; } try { newUrl = filters.filter(newUrl); } catch (URLFilterException e) { return; } if (newUrl == null) { LOG.warn("redirect filtered to null " + url); return; } } catch (MalformedURLException e) { LOG.warn("malformed url exception parsing redirect " + url); return; } page.getOutlinks().put(new Utf8(newUrl), new Utf8()); page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL); if (newUrl == null || newUrl.equals(url)) { String reprUrl = URLUtil.chooseRepr(url, newUrl, refreshTime < FetcherJob.PERM_REFRESH_TIME); if (reprUrl == null) { LOG.warn("reprUrl==null for " + url); return; } else { page.setReprUrl(new Utf8(reprUrl)); } } } else { page.setText(new Utf8(parse.getText())); page.setTitle(new Utf8(parse.getTitle())); ByteBuffer prevSig = page.getSignature(); if (prevSig != null) { page.setPrevSignature(prevSig); } final byte[] signature = sig.calculate(page); page.setSignature(ByteBuffer.wrap(signature)); if (page.getOutlinks() != null) { page.getOutlinks().clear(); } final Outlink[] outlinks = parse.getOutlinks(); int outlinksToStore = Math.min(maxOutlinks, outlinks.length); String fromHost; if (ignoreExternalLinks) { try { fromHost = new URL(url).getHost().toLowerCase(); } catch (final MalformedURLException e) { fromHost = null; } } else { fromHost = null; } int validCount = 0; for (int i = 0; validCount < outlinksToStore && i < outlinks.length; i++) { String toUrl = outlinks[i].getToUrl(); try { toUrl = normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK); toUrl = filters.filter(toUrl); } catch (MalformedURLException e2) { continue; } catch (URLFilterException e) { continue; } if (toUrl == null) { continue; } Utf8 utf8ToUrl = new Utf8(toUrl); if (page.getOutlinks().get(utf8ToUrl) != null) { // skip duplicate outlinks continue; } String toHost; if (ignoreExternalLinks) { try { toHost = new URL(toUrl).getHost().toLowerCase(); } catch (final MalformedURLException e) { toHost = null; } if (toHost == null || !toHost.equals(fromHost)) { // external links continue; // skip it } } validCount++; page.getOutlinks().put(utf8ToUrl, new Utf8(outlinks[i].getAnchor())); } Utf8 fetchMark = Mark.FETCH_MARK.checkMark(page); if (fetchMark != null) { Mark.PARSE_MARK.putMark(page, fetchMark); } } } }