public NutchDocument filter( NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); String reprUrlString = reprUrl != null ? reprUrl.toString() : null; String urlString = url.toString(); String host = null; try { URL u; if (reprUrlString != null) { u = new URL(reprUrlString); } else { u = new URL(urlString); } host = u.getHost(); } catch (MalformedURLException e) { throw new IndexingException(e); } if (host != null) { doc.add("host", host); } doc.add("url", reprUrlString == null ? urlString : reprUrlString); // content String content = parse.getText(); if (MAX_CONTENT_LENGTH > -1 && content.length() > MAX_CONTENT_LENGTH) { content = content.substring(0, MAX_CONTENT_LENGTH); } doc.add("content", content); // title String title = parse.getData().getTitle(); if (title.length() > MAX_TITLE_LENGTH) { // truncate title if needed title = title.substring(0, MAX_TITLE_LENGTH); } if (title.length() > 0) { // NUTCH-1004 Do not index empty values for title field doc.add("title", title); } // add cached content/summary display policy, if available String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY); if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) { doc.add("cache", caching); } // add timestamp when fetched, for deduplication doc.add("tstamp", new Date(datum.getFetchTime())); return doc; }
// implements the filter-method which gives you access to important Objects // like NutchDocument public NutchDocument filter( NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) { LOG.info("-------->>>>> WE ARE IN THE INDExer-------------------"); String containsSem = "false"; containsSem = parse.getData().getMeta(WdcParser.META_CONTAINS_SEM); // we don't have to add the triples in a separate field as they are // already in the content field // String triples = ""; // triples = parse.getText(); // doc.add("triples", triples); // // check if the father contains sem data // boolean semFather = false; // try { // semFather = // Boolean.parseBoolean(datum.getMetaData().get(WdcParser.META_CONTAINS_SEM_FATHER).toString()); // // } catch (Exception e) { // LOG.error("CANNOT PROCESS THE FATHER SEM FIELD" + e.getMessage()); // } // adds the new field to the document doc.add("containsSem", containsSem); return doc; }
public void testIt() throws ProtocolException, ParseException { String urlString; Content content; Parse parse; Configuration conf = NutchConfiguration.create(); Protocol protocol; ProtocolFactory factory = new ProtocolFactory(conf); OOParser parser = new OOParser(); parser.setConf(conf); for (int i = 0; i < sampleFiles.length; i++) { urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; protocol = factory.getProtocol(urlString); content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); parse = parser.getParse(content).get(content.getUrl()); String text = parse.getText().replaceAll("[ \t\r\n]+", " "); assertTrue(expectedText.equals(text)); } }
private ParseStatus output( Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status, int outlinkDepth) { datum.setStatus(status); datum.setFetchTime(System.currentTimeMillis()); if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus); ParseResult parseResult = null; if (content != null) { Metadata metadata = content.getMetadata(); // store the guessed content type in the crawldatum if (content.getContentType() != null) datum .getMetaData() .put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType())); // add segment to metadata metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName); // add score to content metadata so that ParseSegment can pick it up. try { scfilters.passScoreBeforeParsing(key, datum, content); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); } } /* * Note: Fetcher will only follow meta-redirects coming from the * original URL. */ if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) { if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) { try { parseResult = this.parseUtil.parse(content); } catch (Exception e) { LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e)); } } if (parseResult == null) { byte[] signature = SignatureFactory.getSignature(conf) .calculate(content, new ParseStatus().getEmptyParse(conf)); datum.setSignature(signature); } } /* * Store status code in content So we can read this value during parsing * (as a separate job) and decide to parse or not. */ content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status)); } try { output.collect(key, new NutchWritable(datum)); if (content != null && storingContent) output.collect(key, new NutchWritable(content)); if (parseResult != null) { for (Entry<Text, Parse> entry : parseResult) { Text url = entry.getKey(); Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); ParseData parseData = parse.getData(); if (!parseStatus.isSuccess()) { LOG.warn("Error parsing: " + key + ": " + parseStatus); parse = parseStatus.getEmptyParse(conf); } // Calculate page signature. For non-parsing fetchers this will // be done in ParseSegment byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse); // Ensure segment name and score are in parseData metadata parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName); parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); // Pass fetch time to content meta parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime())); if (url.equals(key)) datum.setSignature(signature); try { scfilters.passScoreAfterParsing(url, content, parse); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); } } String origin = null; // collect outlinks for subsequent db update Outlink[] links = parseData.getOutlinks(); int outlinksToStore = Math.min(maxOutlinks, links.length); if (ignoreExternalLinks || ignoreInternalLinks) { URL originURL = new URL(url.toString()); // based on domain? if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) { origin = URLUtil.getDomainName(originURL).toLowerCase(); } // use host else { origin = originURL.getHost().toLowerCase(); } } // used by fetchNode if (fetchNode != null) { fetchNode.setOutlinks(links); fetchNode.setTitle(parseData.getTitle()); FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode); } int validCount = 0; // Process all outlinks, normalize, filter and deduplicate List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore); HashSet<String> outlinks = new HashSet<String>(outlinksToStore); for (int i = 0; i < links.length && validCount < outlinksToStore; i++) { String toUrl = links[i].getToUrl(); toUrl = ParseOutputFormat.filterNormalize( url.toString(), toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFilters, urlExemptionFilters, normalizers); if (toUrl == null) { continue; } validCount++; links[i].setUrl(toUrl); outlinkList.add(links[i]); outlinks.add(toUrl); } // Only process depth N outlinks if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) { reporter.incrCounter("FetcherOutlinks", "outlinks_detected", outlinks.size()); // Counter to limit num outlinks to follow per page int outlinkCounter = 0; // Calculate variable number of outlinks by depth using the // divisor (outlinks = Math.floor(divisor / depth * num.links)) int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks); String followUrl; // Walk over the outlinks and add as new FetchItem to the queues Iterator<String> iter = outlinks.iterator(); while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) { followUrl = iter.next(); // Check whether we'll follow external outlinks if (outlinksIgnoreExternal) { if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) { continue; } } reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1); // Create new FetchItem with depth incremented FetchItem fit = FetchItem.create( new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1); ((FetchItemQueues) fetchQueues).addFetchItem(fit); outlinkCounter++; } } // Overwrite the outlinks in ParseData with the normalized and // filtered set parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()])); output.collect( url, new NutchWritable( new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical()))); } } } catch (IOException e) { if (LOG.isErrorEnabled()) { LOG.error("fetcher caught:" + e.toString()); } } // return parse status if it exits if (parseResult != null && !parseResult.isEmpty()) { Parse p = parseResult.get(content.getUrl()); if (p != null) { reporter.incrCounter( "ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1); return p.getData().getStatus(); } } return null; }