/**
 * Runs every entry of {@code sampleFiles} through {@link OOParser} and checks
 * that the extracted text, with each run of blanks, tabs and line breaks
 * collapsed to a single space, equals {@code expectedText}.
 *
 * @throws ProtocolException declared for the protocol calls used to load each sample
 * @throws ParseException declared for the parser calls
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();
  final ProtocolFactory factory = new ProtocolFactory(conf);
  final OOParser parser = new OOParser();
  parser.setConf(conf);

  for (String sampleFile : sampleFiles) {
    // Samples are addressed through a file: URL built from the sample directory.
    final String urlString = "file:" + sampleDir + fileSeparator + sampleFile;

    final Protocol protocol = factory.getProtocol(urlString);
    final Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    final Parse parse = parser.getParse(content).get(content.getUrl());

    // Normalise whitespace so the comparison does not depend on layout.
    final String text = parse.getText().replaceAll("[ \t\r\n]+", " ");
    assertTrue(expectedText.equals(text));
  }
}
@SuppressWarnings("fallthrough") public void run() { activeThreads.incrementAndGet(); // count threads FetchItem fit = null; try { // checking for the server to be running and fetcher.parse to be true if (parsing && NutchServer.getInstance().isRunning()) reportToNutchServer = true; while (true) { // creating FetchNode for storing in FetchNodeDb if (reportToNutchServer) this.fetchNode = new FetchNode(); else this.fetchNode = null; // check whether must be stopped if (isHalted()) { LOG.debug(getName() + " set to halted"); fit = null; return; } fit = ((FetchItemQueues) fetchQueues).getFetchItem(); if (fit == null) { if (feeder.isAlive() || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) { LOG.debug(getName() + " spin-waiting ..."); // spin-wait. ((AtomicInteger) spinWaiting).incrementAndGet(); try { Thread.sleep(500); } catch (Exception e) { } ((AtomicInteger) spinWaiting).decrementAndGet(); continue; } else { // all done, finish this thread LOG.info("Thread " + getName() + " has no more work available"); return; } } lastRequestStart.set(System.currentTimeMillis()); Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY); if (reprUrlWritable == null) { setReprUrl(fit.url.toString()); } else { setReprUrl(reprUrlWritable.toString()); } try { // fetch the page redirecting = false; redirectCount = 0; do { if (LOG.isInfoEnabled()) { LOG.info( "fetching " + fit.url + " (queue crawl delay=" + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay + "ms)"); } if (LOG.isDebugEnabled()) { LOG.debug("redirectCount=" + redirectCount); } redirecting = false; Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString()); BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum); if (!rules.isAllowed(fit.u.toString())) { // unblock ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true); if (LOG.isDebugEnabled()) { LOG.debug("Denied by robots.txt: " + fit.url); } output( fit.url, fit.datum, null, 
ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); reporter.incrCounter("FetcherStatus", "robots_denied", 1); continue; } if (rules.getCrawlDelay() > 0) { if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) { // unblock ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true); LOG.debug( "Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping"); output( fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE); reporter.incrCounter("FetcherStatus", "robots_denied_maxcrawldelay", 1); continue; } else { FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID); fiq.crawlDelay = rules.getCrawlDelay(); if (LOG.isDebugEnabled()) { LOG.debug( "Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + fit.url); } } } ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum); ProtocolStatus status = output.getStatus(); Content content = output.getContent(); ParseStatus pstatus = null; // unblock queue ((FetchItemQueues) fetchQueues).finishFetchItem(fit); String urlString = fit.url.toString(); // used for FetchNode if (fetchNode != null) { fetchNode.setStatus(status.getCode()); fetchNode.setFetchTime(System.currentTimeMillis()); fetchNode.setUrl(fit.url); } reporter.incrCounter("FetcherStatus", status.getName(), 1); switch (status.getCode()) { case ProtocolStatus.WOULDBLOCK: // retry ? 
((FetchItemQueues) fetchQueues).addFetchItem(fit); break; case ProtocolStatus.SUCCESS: // got a page pstatus = output( fit.url, fit.datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth); updateStatus(content.getContent().length); if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) { String newUrl = pstatus.getMessage(); int refreshTime = Integer.valueOf(pstatus.getArgs()[1]); Text redirUrl = handleRedirect( fit.url, fit.datum, urlString, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR); if (redirUrl != null) { fit = queueRedirect(redirUrl, fit); } } break; case ProtocolStatus.MOVED: // redirect case ProtocolStatus.TEMP_MOVED: int code; boolean temp; if (status.getCode() == ProtocolStatus.MOVED) { code = CrawlDatum.STATUS_FETCH_REDIR_PERM; temp = false; } else { code = CrawlDatum.STATUS_FETCH_REDIR_TEMP; temp = true; } output(fit.url, fit.datum, content, status, code); String newUrl = status.getMessage(); Text redirUrl = handleRedirect( fit.url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR); if (redirUrl != null) { fit = queueRedirect(redirUrl, fit); } else { // stop redirecting redirecting = false; } break; case ProtocolStatus.EXCEPTION: logError(fit.url, status.getMessage()); int killedURLs = ((FetchItemQueues) fetchQueues).checkExceptionThreshold(fit.getQueueID()); if (killedURLs != 0) reporter.incrCounter( "FetcherStatus", "AboveExceptionThresholdInQueue", killedURLs); /* FALLTHROUGH */ case ProtocolStatus.RETRY: // retry case ProtocolStatus.BLOCKED: output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY); break; case ProtocolStatus.GONE: // gone case ProtocolStatus.NOTFOUND: case ProtocolStatus.ACCESS_DENIED: case ProtocolStatus.ROBOTS_DENIED: output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE); break; case ProtocolStatus.NOTMODIFIED: output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED); break; 
default: if (LOG.isWarnEnabled()) { LOG.warn("Unknown ProtocolStatus: " + status.getCode()); } output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY); } if (redirecting && redirectCount > maxRedirect) { ((FetchItemQueues) fetchQueues).finishFetchItem(fit); if (LOG.isInfoEnabled()) { LOG.info(" - redirect count exceeded " + fit.url); } output( fit.url, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, CrawlDatum.STATUS_FETCH_GONE); } } while (redirecting && (redirectCount <= maxRedirect)); } catch (Throwable t) { // unexpected exception // unblock ((FetchItemQueues) fetchQueues).finishFetchItem(fit); logError(fit.url, StringUtils.stringifyException(t)); output( fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY); } } } catch (Throwable e) { if (LOG.isErrorEnabled()) { LOG.error("fetcher caught:" + e.toString()); } } finally { if (fit != null) ((FetchItemQueues) fetchQueues).finishFetchItem(fit); activeThreads.decrementAndGet(); // count threads LOG.info("-finishing thread " + getName() + ", activeThreads=" + activeThreads); } }
public int run(String[] args) throws Exception { boolean dumpText = false; boolean force = false; String contentType = null; String url = null; String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url"; if (args.length == 0) { LOG.error(usage); return (-1); } for (int i = 0; i < args.length; i++) { if (args[i].equals("-forceAs")) { force = true; contentType = args[++i]; } else if (args[i].equals("-dumpText")) { dumpText = true; } else if (i != args.length - 1) { LOG.error(usage); System.exit(-1); } else { url = URLUtil.toASCII(args[i]); } } if (LOG.isInfoEnabled()) { LOG.info("fetching: " + url); } ProtocolFactory factory = new ProtocolFactory(conf); Protocol protocol = factory.getProtocol(url); WebPage page = WebPage.newBuilder().build(); ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page); if (!protocolOutput.getStatus().isSuccess()) { LOG.error( "Fetch failed with protocol status: " + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode()) + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus())); return (-1); } Content content = protocolOutput.getContent(); if (content == null) { LOG.error("No content for " + url); return (-1); } page.setBaseUrl(new org.apache.avro.util.Utf8(url)); page.setContent(ByteBuffer.wrap(content.getContent())); if (force) { content.setContentType(contentType); } else { contentType = content.getContentType(); } if (contentType == null) { LOG.error("Failed to determine content type!"); return (-1); } page.setContentType(new Utf8(contentType)); if (ParserJob.isTruncated(url, page)) { LOG.warn("Content is truncated, parse may fail!"); } Parse parse = new ParseUtil(conf).parse(url, page); if (parse == null) { LOG.error("Problem with parse - check log"); return (-1); } // Calculate the signature byte[] signature = SignatureFactory.getSignature(getConf()).calculate(page); if (LOG.isInfoEnabled()) { LOG.info("parsing: " + url); LOG.info("contentType: " + contentType); LOG.info("signature: 
" + StringUtil.toHexString(signature)); } LOG.info("---------\nUrl\n---------------\n"); System.out.print(url + "\n"); LOG.info("---------\nMetadata\n---------\n"); Map<CharSequence, ByteBuffer> metadata = page.getMetadata(); StringBuffer sb = new StringBuffer(); if (metadata != null) { Iterator<Entry<CharSequence, ByteBuffer>> iterator = metadata.entrySet().iterator(); while (iterator.hasNext()) { Entry<CharSequence, ByteBuffer> entry = iterator.next(); sb.append(entry.getKey().toString()) .append(" : \t") .append(Bytes.toString(entry.getValue())) .append("\n"); } System.out.print(sb.toString()); } LOG.info("---------\nOutlinks\n---------\n"); sb = new StringBuffer(); for (Outlink l : parse.getOutlinks()) { sb.append(" outlink: ").append(l).append('\n'); } System.out.print(sb.toString()); if (page.getHeaders() != null) { LOG.info("---------\nHeaders\n---------\n"); Map<CharSequence, CharSequence> headers = page.getHeaders(); StringBuffer headersb = new StringBuffer(); if (metadata != null) { Iterator<Entry<CharSequence, CharSequence>> iterator = headers.entrySet().iterator(); while (iterator.hasNext()) { Entry<CharSequence, CharSequence> entry = iterator.next(); headersb .append(entry.getKey().toString()) .append(" : \t") .append(entry.getValue()) .append("\n"); } System.out.print(headersb.toString()); } } if (dumpText) { LOG.info("---------\nParseText\n---------\n"); System.out.print(parse.getText()); } return 0; }