public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
    Inlinks inlinks) throws IndexingException {
  Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
  String reprUrlString = reprUrl != null ? reprUrl.toString() : null;
  String urlString = url.toString();

  String host = null;
  try {
    URL u;
    if (reprUrlString != null) {
      u = new URL(reprUrlString);
    } else {
      u = new URL(urlString);
    }
    host = u.getHost();
  } catch (MalformedURLException e) {
    throw new IndexingException(e);
  }

  if (host != null) {
    doc.add("host", host);
  }

  doc.add("url", reprUrlString == null ? urlString : reprUrlString);

  // content
  String content = parse.getText();
  if (MAX_CONTENT_LENGTH > -1 && content.length() > MAX_CONTENT_LENGTH) {
    content = content.substring(0, MAX_CONTENT_LENGTH);
  }
  doc.add("content", content);

  // title
  String title = parse.getData().getTitle();
  if (title.length() > MAX_TITLE_LENGTH) {
    // truncate title if needed
    title = title.substring(0, MAX_TITLE_LENGTH);
  }
  if (title.length() > 0) {
    // NUTCH-1004 Do not index empty values for title field
    doc.add("title", title);
  }

  // add cached content/summary display policy, if available
  String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
  if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
    doc.add("cache", caching);
  }

  // add timestamp when fetched, for deduplication
  doc.add("tstamp", new Date(datum.getFetchTime()));

  return doc;
}
/**
 * Creates a {@link FtpResponse} object corresponding to the url and returns a
 * {@link ProtocolOutput} object as per the content received
 *
 * @param url Text containing the ftp url
 * @param datum The CrawlDatum object corresponding to the url
 * @return {@link ProtocolOutput} object for the url
 */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
  String urlString = url.toString();
  try {
    URL u = new URL(urlString);

    int redirects = 0;

    while (true) {
      FtpResponse response;
      response = new FtpResponse(u, datum, this, getConf()); // make a request

      int code = response.getCode();
      datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
          new Text(Integer.toString(code)));

      if (code == 200) { // got a good response
        return new ProtocolOutput(response.toContent()); // return it

      } else if (code >= 300 && code < 400) { // handle redirect
        if (redirects == MAX_REDIRECTS)
          throw new FtpException("Too many redirects: " + url);
        u = new URL(response.getHeader("Location"));
        redirects++;
        if (LOG.isTraceEnabled()) {
          LOG.trace("redirect to " + u);
        }

      } else { // convert to exception
        throw new FtpError(code);
      }
    }
  } catch (Exception e) {
    return new ProtocolOutput(null, new ProtocolStatus(e));
  }
}
public void getStats(Path segment, final SegmentReaderStats stats) throws Exception {
  SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(),
      new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
  long cnt = 0L;
  Text key = new Text();
  for (int i = 0; i < readers.length; i++) {
    while (readers[i].next(key))
      cnt++;
    readers[i].close();
  }
  stats.generated = cnt;

  Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
  if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
    cnt = 0L;
    long start = Long.MAX_VALUE;
    long end = Long.MIN_VALUE;
    CrawlDatum value = new CrawlDatum();
    MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, getConf());
    for (int i = 0; i < mreaders.length; i++) {
      while (mreaders[i].next(key, value)) {
        cnt++;
        if (value.getFetchTime() < start)
          start = value.getFetchTime();
        if (value.getFetchTime() > end)
          end = value.getFetchTime();
      }
      mreaders[i].close();
    }
    stats.start = start;
    stats.end = end;
    stats.fetched = cnt;
  }

  Path parseDir = new Path(segment, ParseData.DIR_NAME);
  if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
    cnt = 0L;
    long errors = 0L;
    ParseData value = new ParseData();
    MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
    for (int i = 0; i < mreaders.length; i++) {
      while (mreaders[i].next(key, value)) {
        cnt++;
        if (!value.getStatus().isSuccess())
          errors++;
      }
      mreaders[i].close();
    }
    stats.parsed = cnt;
    stats.parseErrors = errors;
  }
}
private FetchItem queueRedirect(Text redirUrl, FetchItem fit) throws ScoringFilterException {
  CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
      fit.datum.getFetchInterval(), fit.datum.getScore());
  // transfer all existing metadata to the redirect
  newDatum.getMetaData().putAll(fit.datum.getMetaData());
  scfilters.initialScore(redirUrl, newDatum);
  if (reprUrl != null) {
    newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
  }
  fit = FetchItem.create(redirUrl, newDatum, queueMode);
  if (fit != null) {
    FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
    fiq.addInProgressFetchItem(fit);
  } else {
    // stop redirecting
    redirecting = false;
    reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
  }
  return fit;
}
private ParseStatus output(Text key, CrawlDatum datum, Content content,
    ProtocolStatus pstatus, int status, int outlinkDepth) {

  datum.setStatus(status);
  datum.setFetchTime(System.currentTimeMillis());
  if (pstatus != null)
    datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

  ParseResult parseResult = null;
  if (content != null) {
    Metadata metadata = content.getMetadata();

    // store the guessed content type in the crawldatum
    if (content.getContentType() != null)
      datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE),
          new Text(content.getContentType()));

    // add segment to metadata
    metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
    // add score to content metadata so that ParseSegment can pick it up.
    try {
      scfilters.passScoreBeforeParsing(key, datum, content);
    } catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
      }
    }
    /*
     * Note: Fetcher will only follow meta-redirects coming from the
     * original URL.
     */
    if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
      if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
        try {
          parseResult = this.parseUtil.parse(content);
        } catch (Exception e) {
          LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
        }
      }

      if (parseResult == null) {
        byte[] signature = SignatureFactory.getSignature(conf)
            .calculate(content, new ParseStatus().getEmptyParse(conf));
        datum.setSignature(signature);
      }
    }

    /*
     * Store status code in content So we can read this value during parsing
     * (as a separate job) and decide to parse or not.
     */
    content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
  }

  try {
    output.collect(key, new NutchWritable(datum));
    if (content != null && storingContent)
      output.collect(key, new NutchWritable(content));
    if (parseResult != null) {
      for (Entry<Text, Parse> entry : parseResult) {
        Text url = entry.getKey();
        Parse parse = entry.getValue();
        ParseStatus parseStatus = parse.getData().getStatus();
        ParseData parseData = parse.getData();

        if (!parseStatus.isSuccess()) {
          LOG.warn("Error parsing: " + key + ": " + parseStatus);
          parse = parseStatus.getEmptyParse(conf);
        }

        // Calculate page signature. For non-parsing fetchers this will
        // be done in ParseSegment
        byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
        // Ensure segment name and score are in parseData metadata
        parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
        parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
        // Pass fetch time to content meta
        parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY,
            Long.toString(datum.getFetchTime()));
        if (url.equals(key))
          datum.setSignature(signature);
        try {
          scfilters.passScoreAfterParsing(url, content, parse);
        } catch (Exception e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
          }
        }

        String origin = null;

        // collect outlinks for subsequent db update
        Outlink[] links = parseData.getOutlinks();
        int outlinksToStore = Math.min(maxOutlinks, links.length);
        if (ignoreExternalLinks || ignoreInternalLinks) {
          URL originURL = new URL(url.toString());
          // based on domain?
if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) { origin = URLUtil.getDomainName(originURL).toLowerCase(); } // use host else { origin = originURL.getHost().toLowerCase(); } } // used by fetchNode if (fetchNode != null) { fetchNode.setOutlinks(links); fetchNode.setTitle(parseData.getTitle()); FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode); } int validCount = 0; // Process all outlinks, normalize, filter and deduplicate List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore); HashSet<String> outlinks = new HashSet<String>(outlinksToStore); for (int i = 0; i < links.length && validCount < outlinksToStore; i++) { String toUrl = links[i].getToUrl(); toUrl = ParseOutputFormat.filterNormalize( url.toString(), toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFilters, urlExemptionFilters, normalizers); if (toUrl == null) { continue; } validCount++; links[i].setUrl(toUrl); outlinkList.add(links[i]); outlinks.add(toUrl); } // Only process depth N outlinks if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) { reporter.incrCounter("FetcherOutlinks", "outlinks_detected", outlinks.size()); // Counter to limit num outlinks to follow per page int outlinkCounter = 0; // Calculate variable number of outlinks by depth using the // divisor (outlinks = Math.floor(divisor / depth * num.links)) int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks); String followUrl; // Walk over the outlinks and add as new FetchItem to the queues Iterator<String> iter = outlinks.iterator(); while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) { followUrl = iter.next(); // Check whether we'll follow external outlinks if (outlinksIgnoreExternal) { if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) { continue; } } reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1); // Create new FetchItem with depth incremented FetchItem fit = FetchItem.create( new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1); ((FetchItemQueues) fetchQueues).addFetchItem(fit); outlinkCounter++; } } // Overwrite the outlinks in ParseData with the normalized and // filtered set parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()])); output.collect( url, new NutchWritable( new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical()))); } } } catch (IOException e) { if (LOG.isErrorEnabled()) { LOG.error("fetcher caught:" + e.toString()); } } // return parse status if it exits if (parseResult != null && !parseResult.isEmpty()) { Parse p = parseResult.get(content.getUrl()); if (p != null) { reporter.incrCounter( "ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1); return p.getData().getStatus(); } } return null; }
private Text handleRedirect(Text url, CrawlDatum datum, String urlString, String newUrl,
    boolean temp, String redirType) throws MalformedURLException, URLFilterException {
  newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
  newUrl = urlFilters.filter(newUrl);

  try {
    String origHost = new URL(urlString).getHost().toLowerCase();
    String newHost = new URL(newUrl).getHost().toLowerCase();
    if (ignoreExternalLinks) {
      if (!origHost.equals(newHost)) {
        if (LOG.isDebugEnabled()) {
          LOG.debug(" - ignoring redirect " + redirType + " from " + urlString + " to "
              + newUrl + " because external links are ignored");
        }
        return null;
      }
    }

    if (ignoreInternalLinks) {
      if (origHost.equals(newHost)) {
        if (LOG.isDebugEnabled()) {
          LOG.debug(" - ignoring redirect " + redirType + " from " + urlString + " to "
              + newUrl + " because internal links are ignored");
        }
        return null;
      }
    }
  } catch (MalformedURLException e) {
  }

  if (newUrl != null && !newUrl.equals(urlString)) {
    reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
    url = new Text(newUrl);
    if (maxRedirect > 0) {
      redirecting = true;
      redirectCount++;
      if (LOG.isDebugEnabled()) {
        LOG.debug(" - " + redirType + " redirect to " + url + " (fetching now)");
      }
      return url;
    } else {
      CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED,
          datum.getFetchInterval(), datum.getScore());
      // transfer existing metadata
      newDatum.getMetaData().putAll(datum.getMetaData());
      try {
        scfilters.initialScore(url, newDatum);
      } catch (ScoringFilterException e) {
        e.printStackTrace();
      }
      if (reprUrl != null) {
        newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
      }
      output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
      if (LOG.isDebugEnabled()) {
        LOG.debug(" - " + redirType + " redirect to " + url + " (fetching later)");
      }
      return null;
    }
  } else {
    if (LOG.isDebugEnabled()) {
      LOG.debug(" - " + redirType + " redirect skipped: "
          + (newUrl != null ? "to same url" : "filtered"));
    }
    return null;
  }
}
/**
 * NOTE: in selecting the latest version we rely exclusively on the segment
 * name (not all segment data contain time information). Therefore it is
 * extremely important that segments be named in an increasing lexicographic
 * order as their creation time increases.
 */
public void reduce(WritableComparable key, Iterator values, OutputCollector output,
    Reporter reporter) throws IOException {
  CrawlDatum lastG = null;
  CrawlDatum lastF = null;
  CrawlDatum lastSig = null;
  Content lastC = null;
  ParseData lastPD = null;
  ParseText lastPT = null;
  String lastGname = null;
  String lastFname = null;
  String lastSigname = null;
  String lastCname = null;
  String lastPDname = null;
  String lastPTname = null;
  TreeMap linked = new TreeMap();
  while (values.hasNext()) {
    MetaWrapper wrapper = (MetaWrapper) values.next();
    Object o = wrapper.get();
    String spString = wrapper.getMeta(SEGMENT_PART_KEY);
    if (spString == null) {
      throw new IOException("Null segment part, key=" + key);
    }
    SegmentPart sp = SegmentPart.parse(spString);
    if (o instanceof CrawlDatum) {
      CrawlDatum val = (CrawlDatum) o;
      // check which output dir it belongs to
      if (sp.partName.equals(CrawlDatum.GENERATE_DIR_NAME)) {
        if (lastG == null) {
          lastG = val;
          lastGname = sp.segmentName;
        } else {
          // take newer
          if (lastGname.compareTo(sp.segmentName) < 0) {
            lastG = val;
            lastGname = sp.segmentName;
          }
        }
      } else if (sp.partName.equals(CrawlDatum.FETCH_DIR_NAME)) {
        if (lastF == null) {
          lastF = val;
          lastFname = sp.segmentName;
        } else {
          // take newer
          if (lastFname.compareTo(sp.segmentName) < 0) {
            lastF = val;
            lastFname = sp.segmentName;
          }
        }
      } else if (sp.partName.equals(CrawlDatum.PARSE_DIR_NAME)) {
        if (val.getStatus() == CrawlDatum.STATUS_SIGNATURE) {
          if (lastSig == null) {
            lastSig = val;
            lastSigname = sp.segmentName;
          } else {
            // take newer
            if (lastSigname.compareTo(sp.segmentName) < 0) {
              lastSig = val;
              lastSigname = sp.segmentName;
            }
          }
          continue;
        }
        // collect all LINKED values from the latest segment
        ArrayList segLinked = (ArrayList) linked.get(sp.segmentName);
        if (segLinked == null) {
          segLinked = new ArrayList();
          linked.put(sp.segmentName, segLinked);
        }
        segLinked.add(val);
      } else {
        throw new IOException("Cannot determine segment part: " + sp.partName);
      }
    } else if (o instanceof Content) {
      if (lastC == null) {
        lastC = (Content) o;
        lastCname = sp.segmentName;
      } else {
        if (lastCname.compareTo(sp.segmentName) < 0) {
          lastC = (Content) o;
          lastCname = sp.segmentName;
        }
      }
    } else if (o instanceof ParseData) {
      if (lastPD == null) {
        lastPD = (ParseData) o;
        lastPDname = sp.segmentName;
      } else {
        if (lastPDname.compareTo(sp.segmentName) < 0) {
          lastPD = (ParseData) o;
          lastPDname = sp.segmentName;
        }
      }
    } else if (o instanceof ParseText) {
      if (lastPT == null) {
        lastPT = (ParseText) o;
        lastPTname = sp.segmentName;
      } else {
        if (lastPTname.compareTo(sp.segmentName) < 0) {
          lastPT = (ParseText) o;
          lastPTname = sp.segmentName;
        }
      }
    }
  }
  curCount++;
  String sliceName = null;
  MetaWrapper wrapper = new MetaWrapper();
  if (sliceSize > 0) {
    sliceName = String.valueOf(curCount / sliceSize);
    wrapper.setMeta(SEGMENT_SLICE_KEY, sliceName);
  }
  SegmentPart sp = new SegmentPart();
  // now output the latest values
  if (lastG != null) {
    wrapper.set(lastG);
    sp.partName = CrawlDatum.GENERATE_DIR_NAME;
    sp.segmentName = lastGname;
    wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
    output.collect(key, wrapper);
  }
  if (lastF != null) {
    wrapper.set(lastF);
    sp.partName = CrawlDatum.FETCH_DIR_NAME;
    sp.segmentName = lastFname;
    wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
    output.collect(key, wrapper);
  }
  if (lastSig != null) {
    wrapper.set(lastSig);
    sp.partName = CrawlDatum.PARSE_DIR_NAME;
    sp.segmentName = lastSigname;
    wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
    output.collect(key, wrapper);
  }
  if (lastC != null) {
    wrapper.set(lastC);
    sp.partName = Content.DIR_NAME;
    sp.segmentName = lastCname;
    wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
    output.collect(key, wrapper);
  }
  if (lastPD != null) {
    wrapper.set(lastPD);
    sp.partName = ParseData.DIR_NAME;
    sp.segmentName = lastPDname;
    wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
    output.collect(key, wrapper);
  }
  if (lastPT != null) {
    wrapper.set(lastPT);
    sp.partName = ParseText.DIR_NAME;
    sp.segmentName = lastPTname;
    wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
    output.collect(key, wrapper);
  }
  if (linked.size() > 0) {
    String name = (String) linked.lastKey();
    sp.partName = CrawlDatum.PARSE_DIR_NAME;
    sp.segmentName = name;
    wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
    ArrayList segLinked = (ArrayList) linked.get(name);
    for (int i = 0; i < segLinked.size(); i++) {
      CrawlDatum link = (CrawlDatum) segLinked.get(i);
      wrapper.set(link);
      output.collect(key, wrapper);
    }
  }
}
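// Illustration only (not part of Nutch): the NOTE above relies on segment names
// comparing lexicographically in creation order. A minimal standalone sketch with
// hypothetical timestamp-style names shows the compareTo() test the reducer uses
// to decide which value is "newer".
public class SegmentNameOrder {
  public static void main(String[] args) {
    String older = "20240101120000"; // hypothetical segment created earlier
    String newer = "20240102090000"; // hypothetical segment created later
    // Same check as "lastGname.compareTo(sp.segmentName) < 0": a lexicographically
    // greater name is treated as the more recent segment and replaces the kept value.
    System.out.println(older.compareTo(newer) < 0); // prints "true"
  }
}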