public void autoDetectClues(Content content, boolean filter) { byte[] data = content.getContent(); if (minConfidence >= 0 && DETECTABLES.contains(content.getContentType()) && data.length > MIN_LENGTH) { CharsetMatch[] matches = null; // do all these in a try/catch; setText and detect/detectAll // will sometimes throw exceptions try { detector.enableInputFilter(filter); if (data.length > MIN_LENGTH) { detector.setText(data); matches = detector.detectAll(); } } catch (Exception e) { LOG.debug("Exception from ICU4J (ignoring): "); e.printStackTrace(LogUtil.getDebugStream(LOG)); } if (matches != null) { for (CharsetMatch match : matches) { addClue(match.getName(), "detect", match.getConfidence()); } } } // add character encoding coming from HTTP response header addClue(parseCharacterEncoding(content.getMetadata().get(Response.CONTENT_TYPE)), "header"); }
public SolrRecord(Content recordContent, String fileName) { if (recordContent != null) { this.content = recordContent.getContent(); this.contentType = recordContent.getContentType(); this.id = recordContent.getUrl(); this.fileName = new File(fileName).getName(); } }
public boolean equals(Object o) { if (!(o instanceof Content)) { return false; } Content that = (Content) o; return this.url.equals(that.url) && this.base.equals(that.base) && Arrays.equals(this.getContent(), that.getContent()) && this.contentType.equals(that.contentType) && this.metadata.equals(that.metadata); }
/** * Fetches the specified <code>page</code> from the local Jetty server and checks whether the HTTP * response status code matches with the expected code. Also use jsp pages for redirection. * * @param page Page to be fetched. * @param expectedCode HTTP response status code expected while fetching the page. */ private void fetchPage(String page, int expectedCode) throws Exception { URL url = new URL("http", "127.0.0.1", port, page); CrawlDatum crawlDatum = new CrawlDatum(); Response response = http.getResponse(url, crawlDatum, true); ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()), crawlDatum); Content content = out.getContent(); assertEquals("HTTP Status Code for " + url, expectedCode, response.getCode()); if (page.compareTo("/nonexists.html") != 0 && page.compareTo("/brokenpage.jsp") != 0 && page.compareTo("/redirection") != 0) { assertEquals("ContentType " + url, "application/xhtml+xml", content.getContentType()); } }
public ParseResult getParse(final Content content) { String resultText = null; String resultTitle = null; Outlink[] outlinks = null; List outLinksList = new ArrayList(); Properties properties = null; try { final String contentLen = content.getMetadata().get(Response.CONTENT_LENGTH); final int len = Integer.parseInt(contentLen); if (LOG.isDebugEnabled()) { LOG.debug("ziplen: " + len); } final byte[] contentInBytes = content.getContent(); final ByteArrayInputStream bainput = new ByteArrayInputStream(contentInBytes); final InputStream input = bainput; if (contentLen != null && contentInBytes.length != len) { return new ParseStatus( ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED, "Content truncated at " + contentInBytes.length + " bytes. Parser can't handle incomplete pdf file.") .getEmptyParseResult(content.getUrl(), getConf()); } ZipTextExtractor extractor = new ZipTextExtractor(getConf()); // extract text resultText = extractor.extractText( new ByteArrayInputStream(contentInBytes), content.getUrl(), outLinksList); } catch (Exception e) { return new ParseStatus(ParseStatus.FAILED, "Can't be handled as Zip document. " + e) .getEmptyParseResult(content.getUrl(), getConf()); } if (resultText == null) { resultText = ""; } if (resultTitle == null) { resultTitle = ""; } outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]); final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, resultTitle, outlinks, content.getMetadata()); if (LOG.isTraceEnabled()) { LOG.trace("Zip file parsed sucessfully !!"); } return ParseResult.createParseResult(content.getUrl(), new ParseImpl(resultText, parseData)); }
public void testIt() throws ProtocolException, ParseException { String urlString; Content content; Parse parse; Configuration conf = NutchConfiguration.create(); Protocol protocol; ProtocolFactory factory = new ProtocolFactory(conf); OOParser parser = new OOParser(); parser.setConf(conf); for (int i = 0; i < sampleFiles.length; i++) { urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; protocol = factory.getProtocol(urlString); content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); parse = parser.getParse(content).get(content.getUrl()); String text = parse.getText().replaceAll("[ \t\r\n]+", " "); assertTrue(expectedText.equals(text)); } }
/** * Guess the encoding with the previously specified list of clues. * * @param content Content instance * @param defaultValue Default encoding to return if no encoding can be detected with enough * confidence. Note that this will <b>not</b> be normalized with {@link * EncodingDetector#resolveEncodingAlias} * @return Guessed encoding or defaultValue */ public String guessEncoding(Content content, String defaultValue) { /* * This algorithm could be replaced by something more sophisticated; * ideally we would gather a bunch of data on where various clues * (autodetect, HTTP headers, HTML meta tags, etc.) disagree, tag each with * the correct answer, and use machine learning/some statistical method * to generate a better heuristic. */ String base = content.getBaseUrl(); if (LOG.isTraceEnabled()) { findDisagreements(base, clues); } /* * Go down the list of encoding "clues". Use a clue if: * 1. Has a confidence value which meets our confidence threshold, OR * 2. Doesn't meet the threshold, but is the best try, * since nothing else is available. */ EncodingClue defaultClue = new EncodingClue(defaultValue, "default"); EncodingClue bestClue = defaultClue; for (EncodingClue clue : clues) { if (LOG.isTraceEnabled()) { LOG.trace(base + ": charset " + clue); } String charset = clue.value; if (minConfidence >= 0 && clue.confidence >= minConfidence) { if (LOG.isTraceEnabled()) { LOG.trace( base + ": Choosing encoding: " + charset + " with confidence " + clue.confidence); } return resolveEncodingAlias(charset).toLowerCase(); } else if (clue.confidence == NO_THRESHOLD && bestClue == defaultClue) { bestClue = clue; } } if (LOG.isTraceEnabled()) { LOG.trace(base + ": Choosing encoding: " + bestClue); } return bestClue.value.toLowerCase(); }
public SolrRecord(Content recordContent) { this(recordContent, (recordContent == null) ? "" : recordContent.getUrl()); }
public static Content read(DataInput in) throws IOException { Content content = new Content(); content.readFields(in); return content; }
private ParseStatus output( Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status, int outlinkDepth) { datum.setStatus(status); datum.setFetchTime(System.currentTimeMillis()); if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus); ParseResult parseResult = null; if (content != null) { Metadata metadata = content.getMetadata(); // store the guessed content type in the crawldatum if (content.getContentType() != null) datum .getMetaData() .put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType())); // add segment to metadata metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName); // add score to content metadata so that ParseSegment can pick it up. try { scfilters.passScoreBeforeParsing(key, datum, content); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); } } /* * Note: Fetcher will only follow meta-redirects coming from the * original URL. */ if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) { if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) { try { parseResult = this.parseUtil.parse(content); } catch (Exception e) { LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e)); } } if (parseResult == null) { byte[] signature = SignatureFactory.getSignature(conf) .calculate(content, new ParseStatus().getEmptyParse(conf)); datum.setSignature(signature); } } /* * Store status code in content So we can read this value during parsing * (as a separate job) and decide to parse or not. */ content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status)); } try { output.collect(key, new NutchWritable(datum)); if (content != null && storingContent) output.collect(key, new NutchWritable(content)); if (parseResult != null) { for (Entry<Text, Parse> entry : parseResult) { Text url = entry.getKey(); Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); ParseData parseData = parse.getData(); if (!parseStatus.isSuccess()) { LOG.warn("Error parsing: " + key + ": " + parseStatus); parse = parseStatus.getEmptyParse(conf); } // Calculate page signature. For non-parsing fetchers this will // be done in ParseSegment byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse); // Ensure segment name and score are in parseData metadata parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName); parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); // Pass fetch time to content meta parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime())); if (url.equals(key)) datum.setSignature(signature); try { scfilters.passScoreAfterParsing(url, content, parse); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); } } String origin = null; // collect outlinks for subsequent db update Outlink[] links = parseData.getOutlinks(); int outlinksToStore = Math.min(maxOutlinks, links.length); if (ignoreExternalLinks || ignoreInternalLinks) { URL originURL = new URL(url.toString()); // based on domain? if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) { origin = URLUtil.getDomainName(originURL).toLowerCase(); } // use host else { origin = originURL.getHost().toLowerCase(); } } // used by fetchNode if (fetchNode != null) { fetchNode.setOutlinks(links); fetchNode.setTitle(parseData.getTitle()); FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode); } int validCount = 0; // Process all outlinks, normalize, filter and deduplicate List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore); HashSet<String> outlinks = new HashSet<String>(outlinksToStore); for (int i = 0; i < links.length && validCount < outlinksToStore; i++) { String toUrl = links[i].getToUrl(); toUrl = ParseOutputFormat.filterNormalize( url.toString(), toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFilters, urlExemptionFilters, normalizers); if (toUrl == null) { continue; } validCount++; links[i].setUrl(toUrl); outlinkList.add(links[i]); outlinks.add(toUrl); } // Only process depth N outlinks if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) { reporter.incrCounter("FetcherOutlinks", "outlinks_detected", outlinks.size()); // Counter to limit num outlinks to follow per page int outlinkCounter = 0; // Calculate variable number of outlinks by depth using the // divisor (outlinks = Math.floor(divisor / depth * num.links)) int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks); String followUrl; // Walk over the outlinks and add as new FetchItem to the queues Iterator<String> iter = outlinks.iterator(); while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) { followUrl = iter.next(); // Check whether we'll follow external outlinks if (outlinksIgnoreExternal) { if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) { continue; } } reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1); // Create new FetchItem with depth incremented FetchItem fit = FetchItem.create( new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1); ((FetchItemQueues) fetchQueues).addFetchItem(fit); outlinkCounter++; } } // Overwrite the outlinks in ParseData with the normalized and // filtered set parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()])); output.collect( url, new NutchWritable( new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical()))); } } } catch (IOException e) { if (LOG.isErrorEnabled()) { LOG.error("fetcher caught:" + e.toString()); } } // return parse status if it exits if (parseResult != null && !parseResult.isEmpty()) { Parse p = parseResult.get(content.getUrl()); if (p != null) { reporter.incrCounter( "ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1); return p.getData().getStatus(); } } return null; }
@SuppressWarnings("fallthrough") public void run() { activeThreads.incrementAndGet(); // count threads FetchItem fit = null; try { // checking for the server to be running and fetcher.parse to be true if (parsing && NutchServer.getInstance().isRunning()) reportToNutchServer = true; while (true) { // creating FetchNode for storing in FetchNodeDb if (reportToNutchServer) this.fetchNode = new FetchNode(); else this.fetchNode = null; // check whether must be stopped if (isHalted()) { LOG.debug(getName() + " set to halted"); fit = null; return; } fit = ((FetchItemQueues) fetchQueues).getFetchItem(); if (fit == null) { if (feeder.isAlive() || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) { LOG.debug(getName() + " spin-waiting ..."); // spin-wait. ((AtomicInteger) spinWaiting).incrementAndGet(); try { Thread.sleep(500); } catch (Exception e) { } ((AtomicInteger) spinWaiting).decrementAndGet(); continue; } else { // all done, finish this thread LOG.info("Thread " + getName() + " has no more work available"); return; } } lastRequestStart.set(System.currentTimeMillis()); Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); if (reprUrlWritable == null) { setReprUrl(fit.url.toString()); } else { setReprUrl(reprUrlWritable.toString()); } try { // fetch the page redirecting = false; redirectCount = 0; do { if (LOG.isInfoEnabled()) { LOG.info( "fetching " + fit.url + " (queue crawl delay=" + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay + "ms)"); } if (LOG.isDebugEnabled()) { LOG.debug("redirectCount=" + redirectCount); } redirecting = false; Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString()); BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum); if (!rules.isAllowed(fit.u.toString())) { // unblock ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true); if (LOG.isDebugEnabled()) { LOG.debug("Denied by robots.txt: " + fit.url); } output( fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); reporter.incrCounter("FetcherStatus", "robots_denied", 1); continue; } if (rules.getCrawlDelay() > 0) { if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) { // unblock ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true); LOG.debug( "Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping"); output( fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); reporter.incrCounter("FetcherStatus", "robots_denied_maxcrawldelay", 1); continue; } else { FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID); fiq.crawlDelay = rules.getCrawlDelay(); if (LOG.isDebugEnabled()) { LOG.debug( "Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + fit.url); } } } ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum); ProtocolStatus status = output.getStatus(); Content content = output.getContent(); ParseStatus pstatus = null; // unblock queue ((FetchItemQueues) fetchQueues).finishFetchItem(fit); String urlString = fit.url.toString(); // used for FetchNode if (fetchNode != null) { fetchNode.setStatus(status.getCode()); fetchNode.setFetchTime(System.currentTimeMillis()); fetchNode.setUrl(fit.url); } reporter.incrCounter("FetcherStatus", status.getName(), 1); switch (status.getCode()) { case ProtocolStatus.WOULDBLOCK: // retry ? ((FetchItemQueues) fetchQueues).addFetchItem(fit); break; case ProtocolStatus.SUCCESS: // got a page pstatus = output( fit.url, fit.datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth); updateStatus(content.getContent().length); if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) { String newUrl = pstatus.getMessage(); int refreshTime = Integer.valueOf(pstatus.getArgs()[1]); Text redirUrl = handleRedirect( fit.url, fit.datum, urlString, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR); if (redirUrl != null) { fit = queueRedirect(redirUrl, fit); } } break; case ProtocolStatus.MOVED: // redirect case ProtocolStatus.TEMP_MOVED: int code; boolean temp; if (status.getCode() == ProtocolStatus.MOVED) { code = CrawlDatum.STATUS_FETCH_REDIR_PERM; temp = false; } else { code = CrawlDatum.STATUS_FETCH_REDIR_TEMP; temp = true; } output(fit.url, fit.datum, content, status, code); String newUrl = status.getMessage(); Text redirUrl = handleRedirect( fit.url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR); if (redirUrl != null) { fit = queueRedirect(redirUrl, fit); } else { // stop redirecting redirecting = false; } break; case ProtocolStatus.EXCEPTION: logError(fit.url, status.getMessage()); int killedURLs = ((FetchItemQueues) fetchQueues).checkExceptionThreshold(fit.getQueueID()); if (killedURLs != 0) reporter.incrCounter( "FetcherStatus", "AboveExceptionThresholdInQueue", killedURLs); /* FALLTHROUGH */ case ProtocolStatus.RETRY: // retry case ProtocolStatus.BLOCKED: output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY); break; case ProtocolStatus.GONE: // gone case ProtocolStatus.NOTFOUND: case ProtocolStatus.ACCESS_DENIED: case ProtocolStatus.ROBOTS_DENIED: output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE); break; case ProtocolStatus.NOTMODIFIED: output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED); break; default: if (LOG.isWarnEnabled()) { LOG.warn("Unknown ProtocolStatus: " + status.getCode()); } output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY); } if (redirecting && redirectCount > maxRedirect) { ((FetchItemQueues) fetchQueues).finishFetchItem(fit); if (LOG.isInfoEnabled()) { LOG.info(" - redirect count exceeded " + fit.url); } output( fit.url, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, CrawlDatum.STATUS_FETCH_GONE); } } while (redirecting && (redirectCount <= maxRedirect)); } catch (Throwable t) { // unexpected exception // unblock ((FetchItemQueues) fetchQueues).finishFetchItem(fit); logError(fit.url, StringUtils.stringifyException(t)); output( fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY); } } } catch (Throwable e) { if (LOG.isErrorEnabled()) { LOG.error("fetcher caught:" + e.toString()); } } finally { if (fit != null) ((FetchItemQueues) fetchQueues).finishFetchItem(fit); activeThreads.decrementAndGet(); // count threads LOG.info("-finishing thread " + getName() + ", activeThreads=" + activeThreads); } }
public ParseResult getParse(Content content) { String mimeType = content.getContentType(); URL base; try { base = new URL(content.getBaseUrl()); } catch (MalformedURLException e) { return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf()); } // get the right parser using the mime type as a clue Parser parser = tikaConfig.getParser(MediaType.parse(mimeType)); byte[] raw = content.getContent(); if (parser == null) { String message = "Can't retrieve Tika parser for mime-type " + mimeType; LOG.error(message); return new ParseStatus(ParseStatus.FAILED, message) .getEmptyParseResult(content.getUrl(), getConf()); } LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType); Metadata tikamd = new Metadata(); HTMLDocumentImpl doc = new HTMLDocumentImpl(); doc.setErrorChecking(false); DocumentFragment root = doc.createDocumentFragment(); DOMBuilder domhandler = new DOMBuilder(doc, root); ParseContext context = new ParseContext(); try { parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context); } catch (Exception e) { LOG.error("Error parsing " + content.getUrl(), e); return new ParseStatus(ParseStatus.FAILED, e.getMessage()) .getEmptyParseResult(content.getUrl(), getConf()); } HTMLMetaTags metaTags = new HTMLMetaTags(); String text = ""; String title = ""; Outlink[] outlinks = new Outlink[0]; org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata(); // we have converted the sax events generated by Tika into a DOM object // so we can now use the usual HTML resources from Nutch // get meta directives HTMLMetaProcessor.getMetaTags(metaTags, root, base); if (LOG.isTraceEnabled()) { LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); } // check meta directives if (!metaTags.getNoIndex()) { // okay to index StringBuffer sb = new StringBuffer(); if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); } utils.getText(sb, root); // extract text text = sb.toString(); sb.setLength(0); if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); } utils.getTitle(sb, root); // extract title title = sb.toString().trim(); } if (!metaTags.getNoFollow()) { // okay to follow links ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks URL baseTag = utils.getBase(root); if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); } utils.getOutlinks(baseTag != null ? baseTag : base, l, root); outlinks = l.toArray(new Outlink[l.size()]); if (LOG.isTraceEnabled()) { LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl()); } } // populate Nutch metadata with Tika metadata String[] TikaMDNames = tikamd.names(); for (String tikaMDName : TikaMDNames) { if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) continue; // TODO what if multivalued? nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName)); } // no outlinks? try OutlinkExtractor e.g works for mime types where no // explicit markup for anchors if (outlinks.length == 0) { outlinks = OutlinkExtractor.getOutlinks(text, getConf()); } ParseStatus status = new ParseStatus(ParseStatus.SUCCESS); if (metaTags.getRefresh()) { status.setMinorCode(ParseStatus.SUCCESS_REDIRECT); status.setArgs( new String[] { metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime()) }); } ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata); ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); // run filters on parse ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root); if (metaTags.getNoCache()) { // not okay to cache for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy); } return filteredParse; }
/** For debugging. */ public static void main(String[] args) throws Exception { int timeout = Integer.MIN_VALUE; int maxContentLength = Integer.MIN_VALUE; String logLevel = "info"; boolean followTalk = false; boolean keepConnection = false; boolean dumpContent = false; String urlString = null; String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url"; if (args.length == 0) { System.err.println(usage); System.exit(-1); } for (int i = 0; i < args.length; i++) { if (args[i].equals("-logLevel")) { logLevel = args[++i]; } else if (args[i].equals("-followTalk")) { followTalk = true; } else if (args[i].equals("-keepConnection")) { keepConnection = true; } else if (args[i].equals("-timeout")) { timeout = Integer.parseInt(args[++i]) * 1000; } else if (args[i].equals("-maxContentLength")) { maxContentLength = Integer.parseInt(args[++i]); } else if (args[i].equals("-dumpContent")) { dumpContent = true; } else if (i != args.length - 1) { System.err.println(usage); System.exit(-1); } else { urlString = args[i]; } } Ftp ftp = new Ftp(); ftp.setFollowTalk(followTalk); ftp.setKeepConnection(keepConnection); if (timeout != Integer.MIN_VALUE) // set timeout ftp.setTimeout(timeout); if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength ftp.setMaxContentLength(maxContentLength); // set log level // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase())); Content content = ftp.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); System.err.println("Content-Type: " + content.getContentType()); System.err.println("Content-Length: " + content.getMetadata().get(Response.CONTENT_LENGTH)); System.err.println("Last-Modified: " + content.getMetadata().get(Response.LAST_MODIFIED)); if (dumpContent) { System.out.print(new String(content.getContent())); } ftp = null; }
public int run(String[] args) throws Exception { boolean dumpText = false; boolean force = false; String contentType = null; String url = null; String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url"; if (args.length == 0) { LOG.error(usage); return (-1); } for (int i = 0; i < args.length; i++) { if (args[i].equals("-forceAs")) { force = true; contentType = args[++i]; } else if (args[i].equals("-dumpText")) { dumpText = true; } else if (i != args.length - 1) { LOG.error(usage); System.exit(-1); } else { url = URLUtil.toASCII(args[i]); } } if (LOG.isInfoEnabled()) { LOG.info("fetching: " + url); } ProtocolFactory factory = new ProtocolFactory(conf); Protocol protocol = factory.getProtocol(url); WebPage page = WebPage.newBuilder().build(); ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page); if (!protocolOutput.getStatus().isSuccess()) { LOG.error( "Fetch failed with protocol status: " + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode()) + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus())); return (-1); } Content content = protocolOutput.getContent(); if (content == null) { LOG.error("No content for " + url); return (-1); } page.setBaseUrl(new org.apache.avro.util.Utf8(url)); page.setContent(ByteBuffer.wrap(content.getContent())); if (force) { content.setContentType(contentType); } else { contentType = content.getContentType(); } if (contentType == null) { LOG.error("Failed to determine content type!"); return (-1); } page.setContentType(new Utf8(contentType)); if (ParserJob.isTruncated(url, page)) { LOG.warn("Content is truncated, parse may fail!"); } Parse parse = new ParseUtil(conf).parse(url, page); if (parse == null) { LOG.error("Problem with parse - check log"); return (-1); } // Calculate the signature byte[] signature = SignatureFactory.getSignature(getConf()).calculate(page); if (LOG.isInfoEnabled()) { LOG.info("parsing: " + url); LOG.info("contentType: " + contentType); LOG.info("signature: " + StringUtil.toHexString(signature)); } LOG.info("---------\nUrl\n---------------\n"); System.out.print(url + "\n"); LOG.info("---------\nMetadata\n---------\n"); Map<CharSequence, ByteBuffer> metadata = page.getMetadata(); StringBuffer sb = new StringBuffer(); if (metadata != null) { Iterator<Entry<CharSequence, ByteBuffer>> iterator = metadata.entrySet().iterator(); while (iterator.hasNext()) { Entry<CharSequence, ByteBuffer> entry = iterator.next(); sb.append(entry.getKey().toString()) .append(" : \t") .append(Bytes.toString(entry.getValue())) .append("\n"); } System.out.print(sb.toString()); } LOG.info("---------\nOutlinks\n---------\n"); sb = new StringBuffer(); for (Outlink l : parse.getOutlinks()) { sb.append(" outlink: ").append(l).append('\n'); } System.out.print(sb.toString()); if (page.getHeaders() != null) { LOG.info("---------\nHeaders\n---------\n"); Map<CharSequence, CharSequence> headers = page.getHeaders(); StringBuffer headersb = new StringBuffer(); if (metadata != null) { Iterator<Entry<CharSequence, CharSequence>> iterator = headers.entrySet().iterator(); while (iterator.hasNext()) { Entry<CharSequence, CharSequence> entry = iterator.next(); headersb .append(entry.getKey().toString()) .append(" : \t") .append(entry.getValue()) .append("\n"); } System.out.print(headersb.toString()); } } if (dumpText) { LOG.info("---------\nParseText\n---------\n"); System.out.print(parse.getText()); } return 0; }