@Override
protected void reduce(SelectorEntry key, Iterable<WebPage> values,
    Context context) throws IOException, InterruptedException {
  for (WebPage page : values) {
    // Stop generating once the overall limit for this reducer is reached
    if (count >= limit) {
      return;
    }
    if (maxCount > 0) {
      // Enforce the per-host (or per-domain) cap
      String hostOrDomain;
      if (byDomain) {
        hostOrDomain = URLUtil.getDomainName(key.url);
      } else {
        hostOrDomain = URLUtil.getHost(key.url);
      }
      Integer hostCount = hostCountMap.get(hostOrDomain);
      if (hostCount == null) {
        hostCountMap.put(hostOrDomain, 0);
        hostCount = 0;
      }
      if (hostCount >= maxCount) {
        return;
      }
      hostCountMap.put(hostOrDomain, hostCount + 1);
    }

    Mark.GENERATE_MARK.putMark(page, batchId);
    page.setBatchId(batchId);
    try {
      context.write(TableUtil.reverseUrl(key.url), page);
    } catch (MalformedURLException e) {
      context.getCounter("Generator", "MALFORMED_URL").increment(1);
      continue;
    }
    context.getCounter("Generator", "GENERATE_MARK").increment(1);
    count++;
  }
}
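// A minimal, self-contained sketch (not Nutch code) of the per-host cap
// bookkeeping used in reduce() above: count accepted entries per host or
// domain and reject once a configurable maximum is reached. The class and
// method names here (HostCapSketch, tryAcquire) are illustrative only.
import java.util.HashMap;
import java.util.Map;

class HostCapSketch {
  private final Map<String, Integer> hostCountMap = new HashMap<>();
  private final int maxCount;

  HostCapSketch(int maxCount) {
    this.maxCount = maxCount;
  }

  /** Returns true and counts the slot if the host is still under its cap. */
  boolean tryAcquire(String hostOrDomain) {
    int current = hostCountMap.getOrDefault(hostOrDomain, 0);
    if (current >= maxCount) {
      return false; // cap reached, reject this URL
    }
    hostCountMap.put(hostOrDomain, current + 1);
    return true;
  }

  public static void main(String[] args) {
    HostCapSketch cap = new HostCapSketch(2);
    System.out.println(cap.tryAcquire("example.org")); // true
    System.out.println(cap.tryAcquire("example.org")); // true
    System.out.println(cap.tryAcquire("example.org")); // false (cap of 2 hit)
  }
}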
private ParseStatus output(Text key, CrawlDatum datum, Content content,
    ProtocolStatus pstatus, int status, int outlinkDepth) {

  datum.setStatus(status);
  datum.setFetchTime(System.currentTimeMillis());
  if (pstatus != null)
    datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

  ParseResult parseResult = null;
  if (content != null) {
    Metadata metadata = content.getMetadata();

    // store the guessed content type in the crawldatum
    if (content.getContentType() != null)
      datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE),
          new Text(content.getContentType()));

    // add segment to metadata
    metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
    // add score to content metadata so that ParseSegment can pick it up
    try {
      scfilters.passScoreBeforeParsing(key, datum, content);
    } catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
      }
    }

    /*
     * Note: Fetcher will only follow meta-redirects coming from the
     * original URL.
     */
    if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
      if (!skipTruncated || !ParseSegment.isTruncated(content)) {
        try {
          parseResult = this.parseUtil.parse(content);
        } catch (Exception e) {
          LOG.warn("Error parsing: " + key + ": "
              + StringUtils.stringifyException(e));
        }
      }

      if (parseResult == null) {
        byte[] signature = SignatureFactory.getSignature(conf)
            .calculate(content, new ParseStatus().getEmptyParse(conf));
        datum.setSignature(signature);
      }
    }

    /*
     * Store the status code in the content metadata so we can read it
     * during parsing (as a separate job) and decide whether to parse.
     */
    content.getMetadata().add(Nutch.FETCH_STATUS_KEY,
        Integer.toString(status));
  }

  try {
    output.collect(key, new NutchWritable(datum));
    if (content != null && storingContent)
      output.collect(key, new NutchWritable(content));
    if (parseResult != null) {
      for (Entry<Text, Parse> entry : parseResult) {
        Text url = entry.getKey();
        Parse parse = entry.getValue();
        ParseStatus parseStatus = parse.getData().getStatus();
        ParseData parseData = parse.getData();

        if (!parseStatus.isSuccess()) {
          LOG.warn("Error parsing: " + key + ": " + parseStatus);
          parse = parseStatus.getEmptyParse(conf);
        }

        // Calculate page signature. For non-parsing fetchers this will
        // be done in ParseSegment
        byte[] signature = SignatureFactory.getSignature(conf)
            .calculate(content, parse);
        // Ensure segment name and score are in parseData metadata
        parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
        parseData.getContentMeta().set(Nutch.SIGNATURE_KEY,
            StringUtil.toHexString(signature));
        // Pass fetch time to content meta
        parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY,
            Long.toString(datum.getFetchTime()));
        if (url.equals(key))
          datum.setSignature(signature);
        try {
          scfilters.passScoreAfterParsing(url, content, parse);
        } catch (Exception e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
          }
        }

        String origin = null;

        // collect outlinks for subsequent db update
        Outlink[] links = parseData.getOutlinks();
        int outlinksToStore = Math.min(maxOutlinks, links.length);
        if (ignoreExternalLinks || ignoreInternalLinks) {
          URL originURL = new URL(url.toString());
          // match either by domain or by host, depending on configuration
          if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
            origin = URLUtil.getDomainName(originURL).toLowerCase();
          } else {
            origin = originURL.getHost().toLowerCase();
          }
        }

        // used by fetchNode
        if (fetchNode != null) {
          fetchNode.setOutlinks(links);
          fetchNode.setTitle(parseData.getTitle());
          FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(),
              fetchNode);
        }

        int validCount = 0;

        // Process all outlinks: normalize, filter and deduplicate
        List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
        HashSet<String> outlinks = new HashSet<String>(outlinksToStore);
        for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
          String toUrl = links[i].getToUrl();

          toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl,
              origin, ignoreInternalLinks, ignoreExternalLinks,
              ignoreExternalLinksMode, urlFilters, urlExemptionFilters,
              normalizers);
          if (toUrl == null) {
            continue;
          }

          validCount++;
          links[i].setUrl(toUrl);
          outlinkList.add(links[i]);
          outlinks.add(toUrl);
        }

        // Only follow outlinks up to depth N
        if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
          reporter.incrCounter("FetcherOutlinks", "outlinks_detected",
              outlinks.size());

          // Counter to limit the number of outlinks followed per page
          int outlinkCounter = 0;

          // Calculate a variable number of outlinks by depth using the
          // divisor (outlinks = Math.floor(divisor / depth * num.links)).
          // NOTE: all operands are ints, so the division truncates, and the
          // loop below currently caps on maxOutlinkDepthNumLinks instead.
          int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor
              / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);

          String followUrl;

          // Walk over the outlinks and add each as a new FetchItem to the queues
          Iterator<String> iter = outlinks.iterator();
          while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
            followUrl = iter.next();

            // Check whether we'll follow external outlinks
            if (outlinksIgnoreExternal) {
              if (!URLUtil.getHost(url.toString()).equals(
                  URLUtil.getHost(followUrl))) {
                continue;
              }
            }

            reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1);

            // Create a new FetchItem with the depth incremented
            FetchItem fit = FetchItem.create(new Text(followUrl),
                new CrawlDatum(CrawlDatum.STATUS_LINKED, interval),
                queueMode, outlinkDepth + 1);
            ((FetchItemQueues) fetchQueues).addFetchItem(fit);

            outlinkCounter++;
          }
        }

        // Overwrite the outlinks in ParseData with the normalized and
        // filtered set
        parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList
            .size()]));

        output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
            parse.getText()), parseData, parse.isCanonical())));
      }
    }
  } catch (IOException e) {
    if (LOG.isErrorEnabled()) {
      LOG.error("fetcher caught:" + e.toString());
    }
  }

  // return the parse status if it exists
  if (parseResult != null && !parseResult.isEmpty()) {
    Parse p = parseResult.get(content.getUrl());
    if (p != null) {
      reporter.incrCounter("ParserStatus",
          ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1);
      return p.getData().getStatus();
    }
  }
  return null;
}
public int run(String[] args) throws Exception {
  boolean dumpText = false;
  boolean force = false;
  String contentType = null;
  String url = null;

  String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";

  if (args.length == 0) {
    LOG.error(usage);
    return (-1);
  }

  // Parse command-line options; the last argument must be the URL
  for (int i = 0; i < args.length; i++) {
    if (args[i].equals("-forceAs")) {
      force = true;
      contentType = args[++i];
    } else if (args[i].equals("-dumpText")) {
      dumpText = true;
    } else if (i != args.length - 1) {
      LOG.error(usage);
      System.exit(-1);
    } else {
      url = URLUtil.toASCII(args[i]);
    }
  }

  if (LOG.isInfoEnabled()) {
    LOG.info("fetching: " + url);
  }

  ProtocolFactory factory = new ProtocolFactory(conf);
  Protocol protocol = factory.getProtocol(url);
  WebPage page = WebPage.newBuilder().build();

  ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);

  if (!protocolOutput.getStatus().isSuccess()) {
    LOG.error("Fetch failed with protocol status: "
        + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode())
        + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus()));
    return (-1);
  }
  Content content = protocolOutput.getContent();

  if (content == null) {
    LOG.error("No content for " + url);
    return (-1);
  }
  page.setBaseUrl(new org.apache.avro.util.Utf8(url));
  page.setContent(ByteBuffer.wrap(content.getContent()));

  // Either force the content type or take the one guessed by the protocol
  if (force) {
    content.setContentType(contentType);
  } else {
    contentType = content.getContentType();
  }

  if (contentType == null) {
    LOG.error("Failed to determine content type!");
    return (-1);
  }

  page.setContentType(new Utf8(contentType));

  if (ParserJob.isTruncated(url, page)) {
    LOG.warn("Content is truncated, parse may fail!");
  }

  Parse parse = new ParseUtil(conf).parse(url, page);

  if (parse == null) {
    LOG.error("Problem with parse - check log");
    return (-1);
  }

  // Calculate the signature
  byte[] signature = SignatureFactory.getSignature(getConf()).calculate(page);

  if (LOG.isInfoEnabled()) {
    LOG.info("parsing: " + url);
    LOG.info("contentType: " + contentType);
    LOG.info("signature: " + StringUtil.toHexString(signature));
  }

  LOG.info("---------\nUrl\n---------------\n");
  System.out.print(url + "\n");

  LOG.info("---------\nMetadata\n---------\n");
  Map<CharSequence, ByteBuffer> metadata = page.getMetadata();
  StringBuffer sb = new StringBuffer();
  if (metadata != null) {
    Iterator<Entry<CharSequence, ByteBuffer>> iterator = metadata.entrySet()
        .iterator();
    while (iterator.hasNext()) {
      Entry<CharSequence, ByteBuffer> entry = iterator.next();
      sb.append(entry.getKey().toString()).append(" : \t")
          .append(Bytes.toString(entry.getValue())).append("\n");
    }
    System.out.print(sb.toString());
  }

  LOG.info("---------\nOutlinks\n---------\n");
  sb = new StringBuffer();
  for (Outlink l : parse.getOutlinks()) {
    sb.append(" outlink: ").append(l).append('\n');
  }
  System.out.print(sb.toString());

  if (page.getHeaders() != null) {
    LOG.info("---------\nHeaders\n---------\n");
    Map<CharSequence, CharSequence> headers = page.getHeaders();
    StringBuffer headersb = new StringBuffer();
    Iterator<Entry<CharSequence, CharSequence>> headerIterator = headers
        .entrySet().iterator();
    while (headerIterator.hasNext()) {
      Entry<CharSequence, CharSequence> entry = headerIterator.next();
      headersb.append(entry.getKey().toString()).append(" : \t")
          .append(entry.getValue()).append("\n");
    }
    System.out.print(headersb.toString());
  }

  if (dumpText) {
    LOG.info("---------\nParseText\n---------\n");
    System.out.print(parse.getText());
  }

  return 0;
}
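// Example invocation, following the usage string above (the "parsechecker"
// command name assumes the standard bin/nutch wrapper script):
//
//   bin/nutch parsechecker -dumpText -forceAs text/html http://example.org/
//
// This fetches the page, forces the content type to text/html, prints the
// URL, metadata, outlinks and headers, and dumps the extracted parse text.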
private Text handleRedirect(Text url, CrawlDatum datum, String urlString,
    String newUrl, boolean temp, String redirType)
    throws MalformedURLException, URLFilterException {
  newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
  newUrl = urlFilters.filter(newUrl);

  try {
    String origHost = new URL(urlString).getHost().toLowerCase();
    String newHost = new URL(newUrl).getHost().toLowerCase();
    if (ignoreExternalLinks) {
      if (!origHost.equals(newHost)) {
        if (LOG.isDebugEnabled()) {
          LOG.debug(" - ignoring redirect " + redirType + " from "
              + urlString + " to " + newUrl
              + " because external links are ignored");
        }
        return null;
      }
    }

    if (ignoreInternalLinks) {
      if (origHost.equals(newHost)) {
        if (LOG.isDebugEnabled()) {
          LOG.debug(" - ignoring redirect " + redirType + " from "
              + urlString + " to " + newUrl
              + " because internal links are ignored");
        }
        return null;
      }
    }
  } catch (MalformedURLException e) {
    // ignore: a redirect target that cannot be parsed as a URL (including a
    // newUrl filtered to null) is handled by the checks below
  }

  if (newUrl != null && !newUrl.equals(urlString)) {
    reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
    url = new Text(newUrl);
    if (maxRedirect > 0) {
      redirecting = true;
      redirectCount++;
      if (LOG.isDebugEnabled()) {
        LOG.debug(" - " + redirType + " redirect to " + url
            + " (fetching now)");
      }
      return url;
    } else {
      CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED,
          datum.getFetchInterval(), datum.getScore());
      // transfer existing metadata
      newDatum.getMetaData().putAll(datum.getMetaData());
      try {
        scfilters.initialScore(url, newDatum);
      } catch (ScoringFilterException e) {
        LOG.warn("Couldn't assign initial score to " + url + ": " + e);
      }
      if (reprUrl != null) {
        newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
            new Text(reprUrl));
      }
      output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
      if (LOG.isDebugEnabled()) {
        LOG.debug(" - " + redirType + " redirect to " + url
            + " (fetching later)");
      }
      return null;
    }
  } else {
    if (LOG.isDebugEnabled()) {
      LOG.debug(" - " + redirType + " redirect skipped: "
          + (newUrl != null ? "to same url" : "filtered"));
    }
    return null;
  }
}
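// A self-contained sketch (not Nutch code) of the host comparison that
// handleRedirect() uses to classify a redirect as internal or external.
// The helper name isSameHost is illustrative only.
import java.net.MalformedURLException;
import java.net.URL;

class RedirectScopeSketch {
  static boolean isSameHost(String from, String to)
      throws MalformedURLException {
    // Hosts are compared case-insensitively, as in handleRedirect()
    return new URL(from).getHost().toLowerCase()
        .equals(new URL(to).getHost().toLowerCase());
  }

  public static void main(String[] args) throws MalformedURLException {
    System.out.println(
        isSameHost("http://example.org/a", "http://EXAMPLE.org/b")); // true  -> internal
    System.out.println(
        isSameHost("http://example.org/a", "http://other.net/b"));   // false -> external
  }
}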
/**
 * Parses the given web page and stores the parsed content within the page.
 * If the parse status signals a meta-redirect, the redirect target is added
 * to the outlinks.
 *
 * @param key reversed URL of the page (the row key)
 * @param page the {@link WebPage} to parse and update in place
 */
public void process(String key, WebPage page) {
  String url = TableUtil.unreverseUrl(key);
  byte status = page.getStatus().byteValue();
  if (status != CrawlStatus.STATUS_FETCHED) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Skipping " + url + " as status is: "
          + CrawlStatus.getName(status));
    }
    return;
  }

  Parse parse;
  try {
    parse = parse(url, page);
  } catch (ParserNotFound e) {
    // do not print a stacktrace for the fact that some types are not mapped
    LOG.warn("No suitable parser found: " + e.getMessage());
    return;
  } catch (final Exception e) {
    LOG.warn("Error parsing: " + url + ": "
        + StringUtils.stringifyException(e));
    return;
  }

  if (parse == null) {
    return;
  }

  org.apache.nutch.storage.ParseStatus pstatus = parse.getParseStatus();
  page.setParseStatus(pstatus);
  if (ParseStatusUtils.isSuccess(pstatus)) {
    if (pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) {
      String newUrl = ParseStatusUtils.getMessage(pstatus);
      int refreshTime = Integer.parseInt(ParseStatusUtils.getArg(pstatus, 1));
      try {
        newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
        if (newUrl == null) {
          LOG.warn("redirect normalized to null " + url);
          return;
        }
        try {
          newUrl = filters.filter(newUrl);
        } catch (URLFilterException e) {
          return;
        }
        if (newUrl == null) {
          LOG.warn("redirect filtered to null " + url);
          return;
        }
      } catch (MalformedURLException e) {
        LOG.warn("malformed url exception parsing redirect " + url);
        return;
      }
      page.getOutlinks().put(new Utf8(newUrl), new Utf8());
      page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED,
          TableUtil.YES_VAL);
      // Record the representative URL when the redirect target differs from
      // the original URL (newUrl cannot be null here; the earlier checks
      // already returned)
      if (!newUrl.equals(url)) {
        String reprUrl = URLUtil.chooseRepr(url, newUrl,
            refreshTime < FetcherJob.PERM_REFRESH_TIME);
        if (reprUrl == null) {
          LOG.warn("reprUrl==null for " + url);
          return;
        } else {
          page.setReprUrl(new Utf8(reprUrl));
        }
      }
    } else {
      page.setText(new Utf8(parse.getText()));
      page.setTitle(new Utf8(parse.getTitle()));
      ByteBuffer prevSig = page.getSignature();
      if (prevSig != null) {
        page.setPrevSignature(prevSig);
      }
      final byte[] signature = sig.calculate(page);
      page.setSignature(ByteBuffer.wrap(signature));
      if (page.getOutlinks() != null) {
        page.getOutlinks().clear();
      }

      final Outlink[] outlinks = parse.getOutlinks();
      int outlinksToStore = Math.min(maxOutlinks, outlinks.length);
      String fromHost;
      if (ignoreExternalLinks) {
        try {
          fromHost = new URL(url).getHost().toLowerCase();
        } catch (final MalformedURLException e) {
          fromHost = null;
        }
      } else {
        fromHost = null;
      }

      // Normalize, filter and deduplicate the outlinks, up to the cap
      int validCount = 0;
      for (int i = 0; validCount < outlinksToStore && i < outlinks.length; i++) {
        String toUrl = outlinks[i].getToUrl();
        try {
          toUrl = normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
          toUrl = filters.filter(toUrl);
        } catch (MalformedURLException e2) {
          continue;
        } catch (URLFilterException e) {
          continue;
        }
        if (toUrl == null) {
          continue;
        }
        Utf8 utf8ToUrl = new Utf8(toUrl);
        if (page.getOutlinks().get(utf8ToUrl) != null) {
          // skip duplicate outlinks
          continue;
        }
        String toHost;
        if (ignoreExternalLinks) {
          try {
            toHost = new URL(toUrl).getHost().toLowerCase();
          } catch (final MalformedURLException e) {
            toHost = null;
          }
          if (toHost == null || !toHost.equals(fromHost)) { // external link
            continue; // skip it
          }
        }
        validCount++;
        page.getOutlinks().put(utf8ToUrl, new Utf8(outlinks[i].getAnchor()));
      }

      Utf8 fetchMark = Mark.FETCH_MARK.checkMark(page);
      if (fetchMark != null) {
        Mark.PARSE_MARK.putMark(page, fetchMark);
      }
    }
  }
}
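// A minimal sketch (not Nutch code) of the outlink pipeline in the else
// branch of process() above: normalize, filter, then deduplicate while
// enforcing a cap. normalize/filter stand in for URLNormalizers/URLFilters
// and simply lowercase and reject non-http links here; all names in this
// sketch are illustrative only.
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

class OutlinkPipelineSketch {
  static String normalize(String url) { return url.toLowerCase(); }
  static String filter(String url) { return url.startsWith("http") ? url : null; }

  static List<String> collect(String[] outlinks, int maxOutlinks) {
    Set<String> seen = new LinkedHashSet<>(); // keeps first-seen order, drops dupes
    for (int i = 0; seen.size() < maxOutlinks && i < outlinks.length; i++) {
      String toUrl = filter(normalize(outlinks[i]));
      if (toUrl == null) {
        continue; // filtered out; does not count against the cap
      }
      seen.add(toUrl);
    }
    return new ArrayList<>(seen);
  }

  public static void main(String[] args) {
    String[] links = { "HTTP://example.org/a", "ftp://example.org/x",
        "http://example.org/a", "http://example.org/b", "http://example.org/c" };
    System.out.println(collect(links, 2)); // [http://example.org/a, http://example.org/b]
  }
}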