/**
 * Releases the resources held by this crawler: shuts down the connection
 * manager and closes the error handler.
 */
public void close() {
    try {
        _cm.shutdown();
    } finally {
        // Close the error handler even when shutting down the connection
        // manager throws, so that it is not leaked.
        _eh.close();
    }
}
/**
 * Creates a crawler with the given number of threads and optional proxy
 * settings. Every proxy argument that is {@code null} is looked up in the
 * system properties instead.
 *
 * @param threads the number of threads; it is also multiplied by
 *        {@code CrawlerConstants.MAX_CONNECTIONS_PER_THREAD} and passed to
 *        the connection manager
 * @param proxyHost the proxy host, or {@code null} to use
 *        {@code System.getProperties().get("http.proxyHost")}
 * @param proxyPort the proxy port, or {@code null} to use
 *        {@code System.getProperties().get("http.proxyPort")}; a value that
 *        is not a valid integer (or is {@code 0}) is treated like
 *        {@code null}
 * @param proxyUser the proxy user, or {@code null} to use
 *        {@code System.getProperties().get("http.proxyUser")}
 * @param proxyPassword the proxy user password, or {@code null} to use
 *        {@code System.getProperties().get("http.proxyPassword")}
 */
public Crawler(
        int threads,
        String proxyHost,
        String proxyPort,
        String proxyUser,
        String proxyPassword) {
    _threads = threads;

    String phost = explicitOrSystemProperty(proxyHost, "http.proxyHost");
    String puser = explicitOrSystemProperty(proxyUser, "http.proxyUser");
    String ppassword = explicitOrSystemProperty(proxyPassword, "http.proxyPassword");

    // A port of 0 means "not configured": fall back to the system property,
    // which is parsed just as leniently as the explicit argument so that a
    // malformed property value cannot abort construction.
    int pport = parsePortOrZero(proxyPort);
    if (pport == 0) {
        pport = parsePortOrZero(explicitOrSystemProperty(null, "http.proxyPort"));
    }

    _cm = new ConnectionManager(
            phost,
            pport,
            puser,
            ppassword,
            threads * CrawlerConstants.MAX_CONNECTIONS_PER_THREAD);
    _cm.setRetries(CrawlerConstants.RETRIES);

    // Always use the local TldManager implementation. It is used instead of
    // the one from NxParser for two reasons:
    //  * I fixed a couple of bugs there
    //  * I updated the Public suffix list and made a change to it to support
    //    .asia
    // The earlier attempt to first fetch the TLD file online through the
    // connection manager (new TldManager(_cm)) is therefore disabled.
    try {
        _tldm = new TldManager();
    } catch (IOException e1) {
        // Only logged: _tldm is not assigned by this constructor in that case.
        _log.info("cannot get tld file locally " + e1.getMessage());
    }

    _eh = new ErrorHandlerDummy();

    _robots = new Robots(_cm);
    _robots.setErrorHandler(_eh);

    // _sitemaps = new Sitemaps(_cm);
    // _sitemaps.setErrorHandler(_eh);

    _contentHandler = new ContentHandlerRdfXml();
    _output = new SinkDummy();
    _ff = new FetchFilterAllow();
    _blacklist = new FetchFilterAllow();
}

/** Returns {@code explicit} if it is non-null, otherwise the system property {@code key} as a string (or {@code null} if unset). */
private static String explicitOrSystemProperty(String explicit, String key) {
    if (explicit != null) {
        return explicit;
    }
    Object value = System.getProperties().get(key);
    return value == null ? null : value.toString();
}

/** Parses {@code port} as an integer, returning 0 when it is {@code null} or not a valid integer. */
private static int parsePortOrZero(String port) {
    if (port == null) {
        return 0;
    }
    try {
        return Integer.parseInt(port);
    } catch (NumberFormatException ignored) {
        // Deliberately lenient: an unparsable port counts as "not configured".
        return 0;
    }
}