コード例 #1
0
 /**
  * Releases the resources held by this crawler: shuts down the connection manager
  * ({@code _cm}) and closes the error handler ({@code _eh}).
  *
  * <p>The error handler is closed in a {@code finally} block so that it is still closed when
  * shutting down the connection manager throws; the exception is propagated unchanged.
  */
 public void close() {
   try {
     _cm.shutdown();
   } finally {
     // Must run even if the shutdown above fails, otherwise the handler is never closed.
     _eh.close();
   }
 }
コード例 #2
0
  /**
   * Creates a crawler and sets up its default collaborators: a {@code ConnectionManager}
   * (configured with the resolved proxy settings), a local {@code TldManager}, a {@code Robots}
   * instance, an {@code ErrorHandlerDummy}, a {@code ContentHandlerRdfXml}, a {@code SinkDummy}
   * and two {@code FetchFilterAllow} instances (fetch filter and blacklist).
   *
   * <p>Every proxy argument that is {@code null} falls back to the JVM system property of the
   * same purpose. If neither is available, {@code null} (or {@code 0} for the port) is passed to
   * the connection manager.
   *
   * @param threads the number of crawler threads; it is stored and also multiplied by {@code
   *     CrawlerConstants.MAX_CONNECTIONS_PER_THREAD} to size the connection manager
   * @param proxyHost the proxy host, or {@code null} to use the system property {@code
   *     http.proxyHost}
   * @param proxyPort the proxy port as a decimal string; if it is {@code null}, not a valid
   *     integer, or {@code "0"}, the system property {@code http.proxyPort} is used instead. A
   *     missing or malformed system property results in port {@code 0}
   * @param proxyUser the proxy user, or {@code null} to use the system property {@code
   *     http.proxyUser}
   * @param proxyPassword the proxy user's password, or {@code null} to use the system property
   *     {@code http.proxyPassword}
   */
  public Crawler(
      int threads, String proxyHost, String proxyPort, String proxyUser, String proxyPassword) {
    _threads = threads;

    // Explicit arguments win; system properties are only consulted as a fallback.
    String phost = proxyHost != null ? proxyHost : systemProperty("http.proxyHost");
    String puser = proxyUser != null ? proxyUser : systemProperty("http.proxyUser");
    String ppassword =
        proxyPassword != null ? proxyPassword : systemProperty("http.proxyPassword");

    // 0 means "no usable port given"; a malformed system property is tolerated the same way as
    // a malformed argument instead of aborting construction with a NumberFormatException.
    int pport = parsePort(proxyPort);
    if (pport == 0) {
      pport = parsePort(systemProperty("http.proxyPort"));
    }

    _cm =
        new ConnectionManager(
            phost, pport, puser, ppassword, threads * CrawlerConstants.MAX_CONNECTIONS_PER_THREAD);
    _cm.setRetries(CrawlerConstants.RETRIES);

    // Always use the local TldManager implementation rather than the one from NxParser, for two
    // reasons: a couple of bugs were fixed in the local one, and its Public Suffix List was
    // updated and adjusted to support .asia. The earlier approach (load the TLD file online
    // through _cm, then fall back to the local file) is therefore no longer used.
    try {
      _tldm = new TldManager();
    } catch (IOException e1) {
      // Best effort: construction continues and _tldm is not assigned here.
      _log.info("cannot get tld file locally " + e1.getMessage());
    }

    _eh = new ErrorHandlerDummy();

    _robots = new Robots(_cm);
    _robots.setErrorHandler(_eh);

    // Sitemaps handling (a Sitemaps instance on _cm using _eh) is currently disabled.

    _contentHandler = new ContentHandlerRdfXml();
    _output = new SinkDummy();
    _ff = new FetchFilterAllow();

    _blacklist = new FetchFilterAllow();
  }

  /** Returns the string form of the system property {@code key}, or {@code null} if unset. */
  private static String systemProperty(String key) {
    Object value = System.getProperties().get(key);
    return value == null ? null : value.toString();
  }

  /** Parses {@code port} as a decimal integer; returns 0 if it is {@code null} or malformed. */
  private static int parsePort(String port) {
    if (port == null) {
      return 0;
    }
    try {
      return Integer.parseInt(port);
    } catch (NumberFormatException ignored) {
      // Deliberately ignored: an unparsable port is treated as "no port configured".
      return 0;
    }
  }