Esempio n. 1
0
 private FetchItem queueRedirect(Text redirUrl, FetchItem fit) throws ScoringFilterException {
   CrawlDatum newDatum =
       new CrawlDatum(
           CrawlDatum.STATUS_DB_UNFETCHED, fit.datum.getFetchInterval(), fit.datum.getScore());
   // transfer all existing metadata to the redirect
   newDatum.getMetaData().putAll(fit.datum.getMetaData());
   scfilters.initialScore(redirUrl, newDatum);
   if (reprUrl != null) {
     newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
   }
   fit = FetchItem.create(redirUrl, newDatum, queueMode);
   if (fit != null) {
     FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
     fiq.addInProgressFetchItem(fit);
   } else {
     // stop redirecting
     redirecting = false;
     reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
   }
   return fit;
 }
Esempio n. 2
0
  @SuppressWarnings("fallthrough")
  public void run() {
    activeThreads.incrementAndGet(); // count threads

    FetchItem fit = null;
    try {
      // checking for the server to be running and fetcher.parse to be true
      if (parsing && NutchServer.getInstance().isRunning()) reportToNutchServer = true;

      while (true) {
        // creating FetchNode for storing in FetchNodeDb
        if (reportToNutchServer) this.fetchNode = new FetchNode();
        else this.fetchNode = null;

        // check whether must be stopped
        if (isHalted()) {
          LOG.debug(getName() + " set to halted");
          fit = null;
          return;
        }

        fit = ((FetchItemQueues) fetchQueues).getFetchItem();
        if (fit == null) {
          if (feeder.isAlive() || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) {
            LOG.debug(getName() + " spin-waiting ...");
            // spin-wait.
            ((AtomicInteger) spinWaiting).incrementAndGet();
            try {
              Thread.sleep(500);
            } catch (Exception e) {
            }
            ((AtomicInteger) spinWaiting).decrementAndGet();
            continue;
          } else {
            // all done, finish this thread
            LOG.info("Thread " + getName() + " has no more work available");
            return;
          }
        }
        lastRequestStart.set(System.currentTimeMillis());
        Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
        if (reprUrlWritable == null) {
          setReprUrl(fit.url.toString());
        } else {
          setReprUrl(reprUrlWritable.toString());
        }
        try {
          // fetch the page
          redirecting = false;
          redirectCount = 0;
          do {
            if (LOG.isInfoEnabled()) {
              LOG.info(
                  "fetching "
                      + fit.url
                      + " (queue crawl delay="
                      + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay
                      + "ms)");
            }
            if (LOG.isDebugEnabled()) {
              LOG.debug("redirectCount=" + redirectCount);
            }
            redirecting = false;
            Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString());
            BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
            if (!rules.isAllowed(fit.u.toString())) {
              // unblock
              ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
              if (LOG.isDebugEnabled()) {
                LOG.debug("Denied by robots.txt: " + fit.url);
              }
              output(
                  fit.url,
                  fit.datum,
                  null,
                  ProtocolStatus.STATUS_ROBOTS_DENIED,
                  CrawlDatum.STATUS_FETCH_GONE);
              reporter.incrCounter("FetcherStatus", "robots_denied", 1);
              continue;
            }
            if (rules.getCrawlDelay() > 0) {
              if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                // unblock
                ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
                LOG.debug(
                    "Crawl-Delay for "
                        + fit.url
                        + " too long ("
                        + rules.getCrawlDelay()
                        + "), skipping");
                output(
                    fit.url,
                    fit.datum,
                    null,
                    ProtocolStatus.STATUS_ROBOTS_DENIED,
                    CrawlDatum.STATUS_FETCH_GONE);
                reporter.incrCounter("FetcherStatus", "robots_denied_maxcrawldelay", 1);
                continue;
              } else {
                FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
                fiq.crawlDelay = rules.getCrawlDelay();
                if (LOG.isDebugEnabled()) {
                  LOG.debug(
                      "Crawl delay for queue: "
                          + fit.queueID
                          + " is set to "
                          + fiq.crawlDelay
                          + " as per robots.txt. url: "
                          + fit.url);
                }
              }
            }
            ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum);
            ProtocolStatus status = output.getStatus();
            Content content = output.getContent();
            ParseStatus pstatus = null;
            // unblock queue
            ((FetchItemQueues) fetchQueues).finishFetchItem(fit);

            String urlString = fit.url.toString();

            // used for FetchNode
            if (fetchNode != null) {
              fetchNode.setStatus(status.getCode());
              fetchNode.setFetchTime(System.currentTimeMillis());
              fetchNode.setUrl(fit.url);
            }

            reporter.incrCounter("FetcherStatus", status.getName(), 1);

            switch (status.getCode()) {
              case ProtocolStatus.WOULDBLOCK:
                // retry ?
                ((FetchItemQueues) fetchQueues).addFetchItem(fit);
                break;

              case ProtocolStatus.SUCCESS: // got a page
                pstatus =
                    output(
                        fit.url,
                        fit.datum,
                        content,
                        status,
                        CrawlDatum.STATUS_FETCH_SUCCESS,
                        fit.outlinkDepth);
                updateStatus(content.getContent().length);
                if (pstatus != null
                    && pstatus.isSuccess()
                    && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                  String newUrl = pstatus.getMessage();
                  int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                  Text redirUrl =
                      handleRedirect(
                          fit.url,
                          fit.datum,
                          urlString,
                          newUrl,
                          refreshTime < Fetcher.PERM_REFRESH_TIME,
                          Fetcher.CONTENT_REDIR);
                  if (redirUrl != null) {
                    fit = queueRedirect(redirUrl, fit);
                  }
                }
                break;

              case ProtocolStatus.MOVED: // redirect
              case ProtocolStatus.TEMP_MOVED:
                int code;
                boolean temp;
                if (status.getCode() == ProtocolStatus.MOVED) {
                  code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                  temp = false;
                } else {
                  code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                  temp = true;
                }
                output(fit.url, fit.datum, content, status, code);
                String newUrl = status.getMessage();
                Text redirUrl =
                    handleRedirect(
                        fit.url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR);
                if (redirUrl != null) {
                  fit = queueRedirect(redirUrl, fit);
                } else {
                  // stop redirecting
                  redirecting = false;
                }
                break;

              case ProtocolStatus.EXCEPTION:
                logError(fit.url, status.getMessage());
                int killedURLs =
                    ((FetchItemQueues) fetchQueues).checkExceptionThreshold(fit.getQueueID());
                if (killedURLs != 0)
                  reporter.incrCounter(
                      "FetcherStatus", "AboveExceptionThresholdInQueue", killedURLs);
                /* FALLTHROUGH */
              case ProtocolStatus.RETRY: // retry
              case ProtocolStatus.BLOCKED:
                output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                break;

              case ProtocolStatus.GONE: // gone
              case ProtocolStatus.NOTFOUND:
              case ProtocolStatus.ACCESS_DENIED:
              case ProtocolStatus.ROBOTS_DENIED:
                output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
                break;

              case ProtocolStatus.NOTMODIFIED:
                output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
                break;

              default:
                if (LOG.isWarnEnabled()) {
                  LOG.warn("Unknown ProtocolStatus: " + status.getCode());
                }
                output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
            }

            if (redirecting && redirectCount > maxRedirect) {
              ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
              if (LOG.isInfoEnabled()) {
                LOG.info(" - redirect count exceeded " + fit.url);
              }
              output(
                  fit.url,
                  fit.datum,
                  null,
                  ProtocolStatus.STATUS_REDIR_EXCEEDED,
                  CrawlDatum.STATUS_FETCH_GONE);
            }

          } while (redirecting && (redirectCount <= maxRedirect));

        } catch (Throwable t) { // unexpected exception
          // unblock
          ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
          logError(fit.url, StringUtils.stringifyException(t));
          output(
              fit.url,
              fit.datum,
              null,
              ProtocolStatus.STATUS_FAILED,
              CrawlDatum.STATUS_FETCH_RETRY);
        }
      }

    } catch (Throwable e) {
      if (LOG.isErrorEnabled()) {
        LOG.error("fetcher caught:" + e.toString());
      }
    } finally {
      if (fit != null) ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
      activeThreads.decrementAndGet(); // count threads
      LOG.info("-finishing thread " + getName() + ", activeThreads=" + activeThreads);
    }
  }