/**
 * Stores the given error handler and passes it on to the robots and links components, for each
 * of them that is currently non-null.
 *
 * @param eh the error handler to store and forward
 */
public void setErrorHandler(ErrorHandler eh) {
    _eh = eh;

    // Forwarding to the sitemaps component is disabled:
    //   if (_sitemaps != null) { _sitemaps.setErrorHandler(eh); }

    if (null != _robots) {
      _robots.setErrorHandler(eh);
    }

    if (null != _links) {
      _links.setErrorHandler(eh);
    }
  }
  /**
   * Creates a crawler with the given number of threads and optional HTTP proxy settings.
   *
   * <p>Every proxy setting passed as <code>null</code> is looked up in the system properties
   * instead. A proxy port that is not a valid integer, whether passed in or read from the system
   * properties, is ignored and the port stays 0.
   *
   * @param threads - the number of threads; threads * CrawlerConstants.MAX_CONNECTIONS_PER_THREAD
   *     is also handed to the ConnectionManager
   * @param proxyHost - the proxy host or <code>null</code> to use
   *     System.getProperties().get("http.proxyHost")
   * @param proxyPort - the proxy port or <code>null</code> to use
   *     System.getProperties().get("http.proxyPort")
   * @param proxyUser - the proxy user or <code>null</code> to use
   *     System.getProperties().get("http.proxyUser")
   * @param proxyPassword - the proxy user password or <code>null</code> to use
   *     System.getProperties().get("http.proxyPassword")
   */
  public Crawler(
      int threads, String proxyHost, String proxyPort, String proxyUser, String proxyPassword) {
    _threads = threads;

    String phost = proxyHost;
    String puser = proxyUser;
    String ppassword = proxyPassword;

    int pport = 0;
    if (proxyPort != null) {
      try {
        pport = Integer.parseInt(proxyPort);
      } catch (NumberFormatException nfe) {
        // Unparsable explicit port: keep 0 so the system property below can still apply.
        pport = 0;
      }
    }

    // Fall back to the system properties for every setting that was not supplied.
    Object sysHost = System.getProperties().get("http.proxyHost");
    if (phost == null && sysHost != null) {
      phost = sysHost.toString();
    }

    Object sysPort = System.getProperties().get("http.proxyPort");
    if (pport == 0 && sysPort != null) {
      try {
        pport = Integer.parseInt(sysPort.toString());
      } catch (NumberFormatException nfe) {
        // Treat a malformed system property like a malformed argument instead of letting the
        // exception escape from the constructor.
        _log.info("ignoring invalid http.proxyPort system property " + sysPort);
        pport = 0;
      }
    }

    Object sysUser = System.getProperties().get("http.proxyUser");
    if (puser == null && sysUser != null) {
      puser = sysUser.toString();
    }

    Object sysPassword = System.getProperties().get("http.proxyPassword");
    if (ppassword == null && sysPassword != null) {
      ppassword = sysPassword.toString();
    }

    _cm =
        new ConnectionManager(
            phost, pport, puser, ppassword, threads * CrawlerConstants.MAX_CONNECTIONS_PER_THREAD);
    _cm.setRetries(CrawlerConstants.RETRIES);

    // Always use the local TldManager implementation. Changed for the one
    // from NxParser for two reasons:
    // * I fixed a couple of bugs there
    // * I updated the Public suffix list and made a change to it to support
    //   .asia
    // (The earlier variant that first tried new TldManager(_cm) and only then fell back to the
    // local constructor is no longer used.)
    try {
      _tldm = new TldManager();
    } catch (IOException e1) {
      // Best effort: the failure is only logged and _tldm is not assigned here.
      _log.info("cannot get tld file locally " + e1.getMessage());
    }

    _eh = new ErrorHandlerDummy();

    _robots = new Robots(_cm);
    _robots.setErrorHandler(_eh);

    // Sitemap support is disabled:
    //   _sitemaps = new Sitemaps(_cm);
    //   _sitemaps.setErrorHandler(_eh);

    _contentHandler = new ContentHandlerRdfXml();
    _output = new SinkDummy();
    _ff = new FetchFilterAllow();

    _blacklist = new FetchFilterAllow();
  }
Example #3
0
  /**
   * Polls URIs from the queue and looks each one up until the queue returns <code>null</code> or
   * the URI limit is reached.
   *
   * <p>Each polled URI is marked as seen and checked against the blacklist and robots.txt. If it
   * is allowed it is requested with HTTP GET:
   *
   * <ul>
   *   <li>A 200 response that passes the fetch filter and whose content type the content handler
   *       can handle is passed to the content handler; a Content-Location header, if present, is
   *       recorded as a redirect.
   *   <li>A 301, 302, 303 or 307 response is recorded as a redirect to its Location header. A
   *       redirect without a Location header is reported to the error handler.
   * </ul>
   *
   * <p>Skips, statuses, redirects and exceptions are reported to the error handler. Any Throwable
   * raised during a lookup is caught, so one failing URI does not stop the loop.
   */
  public void run() {
    _log.info("starting thread ...");

    if (uriLimitReached()) {
      _log.info("URI limit reached. Stopping...");
      return;
    }

    int i = 0;

    URI lu = _q.poll();

    _log.fine("got " + lu);

    while (lu != null) {

      if (uriLimitReached()) {
        _log.info("URI limit reached. Stopping...");
        break;
      }

      setName("LT-" + _no + ":" + lu.toString());

      _q.addSeen(lu);

      i++;

      // time and time1 bracket a redirect lookup that is currently commented out
      // (URI lu = _q.obtainRedirect(u)), so "ms before lookup" measures nothing at present.
      long time = System.currentTimeMillis();
      long time1 = System.currentTimeMillis();
      long time2 = time1;
      long time3 = time1;
      long bytes = -1;
      int status = 0;
      String type = null;

      // Sitemap lookup is disabled:
      //   List<URI> li = _sitemaps.getSitemapUris(lu);

      Header[] headers = null;

      if (!_blacklist.fetchOk(lu, 0, null)) {
        _log.info("access denied per blacklist for " + lu);
        _eh.handleStatus(lu, CrawlerConstants.SKIP_SUFFIX, null, 0, -1);
      } else if (!_robots.accessOk(lu)) {
        _log.info("access denied per robots.txt for " + lu);
        _eh.handleStatus(lu, CrawlerConstants.SKIP_ROBOTS, null, 0, -1);
      } else {
        time2 = System.currentTimeMillis();

        HttpGet hget = new HttpGet(lu);
        hget.setHeaders(CrawlerConstants.HEADERS);

        try {
          HttpResponse hres = _hclient.connect(hget);

          HttpEntity hen = hres.getEntity();

          status = hres.getStatusLine().getStatusCode();

          Header ct = hres.getFirstHeader("Content-Type");
          if (ct != null) {
            type = ct.getValue();
          }

          _log.info("lookup on " + lu + " status " + status + " " + getName());

          if (status == HttpStatus.SC_OK) {
            if (hen != null) {
              if (_ff.fetchOk(lu, status, hen) && _contentHandler.canHandle(type)) {
                InputStream is = hen.getContent();
                Callback contentCb =
                    _content.newDataset(new Provenance(lu, hres.getAllHeaders(), status));
                Callbacks cbs =
                    new Callbacks(
                        new Callback[] {contentCb, _links, _stmtCountingCallback.reset()});
                _contentHandler.handle(lu, type, is, cbs);
                // If handle() throws, the catch block below aborts the request instead.
                is.close();

                _overall200Fetches.incrementAndGet();

                if (_stmtCountingCallback.getStmtCount() > 0) {
                  _overall200FetchesWithRDF.incrementAndGet();
                }

                headers = hres.getAllHeaders();

                Header hloc = hres.getFirstHeader("Content-Location");
                if (hloc != null) {
                  URI to = new URI(hloc.getValue());

                  // handle local redirects
                  if (!to.isAbsolute()) {
                    to = lu.resolve(hloc.getValue());
                  }

                  _q.setRedirect(lu, to, status);
                  _eh.handleRedirect(lu, to, status);
                  _q.addSeen(to);
                }
              } else {
                _log.info("disallowed via fetch filter " + lu + " type " + type);
                _eh.handleStatus(lu, CrawlerConstants.SKIP_MIMETYPE, null, 0, -1);
                hget.abort();
                // Clearing these suppresses the byte count and the status report further down.
                hen = null;
                status = 0;
              }
            } else {
              _log.info("HttpEntity for " + lu + " is null");
            }
          } else if (status == HttpStatus.SC_MOVED_PERMANENTLY
              || status == HttpStatus.SC_MOVED_TEMPORARILY
              || status == HttpStatus.SC_SEE_OTHER
              || status == HttpStatus.SC_TEMPORARY_REDIRECT) {
            // treating all redirects the same but shouldn't: 301 -> rename context URI, 302,307 ->
            // keep original context URI, 303 -> spec inconclusive
            Header[] loc = hres.getHeaders("location");
            if (loc == null || loc.length == 0) {
              // Fail with a descriptive error instead of an ArrayIndexOutOfBoundsException on
              // loc[0]; the catch block below aborts the request and informs the error handler.
              throw new IllegalStateException(
                  "redirect status " + status + " without Location header for " + lu);
            }
            String path = loc[0].getValue();
            _log.info("redirecting (" + status + ") to " + path);
            URI to = new URI(path);

            // handle local redirects
            if (!to.isAbsolute()) {
              to = lu.resolve(path);
            }

            // set redirect from original uri to new uri
            _q.setRedirect(lu, to, status);
            _eh.handleRedirect(lu, to, status);

            headers = hres.getAllHeaders();
          }

          if (hen != null) {
            bytes = hen.getContentLength();
          }
          hget.abort();
        } catch (Throwable e) {
          hget.abort();
          _log.warning("Exception " + e.getClass().getName() + " " + lu);
          _eh.handleError(lu, e);
        }

        time3 = System.currentTimeMillis();

        // status stays 0 when the request failed before a status line was read or when the fetch
        // filter rejected the response; in both cases nothing more is reported here.
        if (status != 0) {
          _eh.handleStatus(lu, status, headers, (time3 - time2), bytes);
        }

        _log.fine(
            lu
                + " "
                + (time1 - time)
                + " ms before lookup, "
                + (time2 - time1)
                + " ms to check if lookup is ok, "
                + (time3 - time2)
                + " ms for lookup");
      }

      lu = _q.poll();
    }

    _log.info(
        "finished thread after fetching "
            + i
            + " uris; "
            + getOverall200Fetches()
            + " in all threads overall until now ("
            + getOverall200FetchesWithNonEmptyRDF()
            + " with non-empty RDF).");
  }

  /**
   * Tells whether the URI limit is enabled and the counter of 200 fetches with RDF has reached
   * CrawlerConstants.URI_LIMIT_WITH_NON_EMPTY_RDF.
   */
  private boolean uriLimitReached() {
    return CrawlerConstants.URI_LIMIT_ENABLED
        && _overall200FetchesWithRDF.get() >= CrawlerConstants.URI_LIMIT_WITH_NON_EMPTY_RDF;
  }