public void setErrorHandler(ErrorHandler eh) { _eh = eh; if (_robots != null) { _robots.setErrorHandler(eh); } // if (_sitemaps != null) { // _sitemaps.setErrorHandler(eh); // } if (_links != null) { _links.setErrorHandler(eh); } }
/**
 * Creates a crawler and wires up its connection manager, TLD manager,
 * robots.txt handling and default (dummy/allow-all) handlers.
 *
 * <p>Each proxy setting that is not supplied falls back to the corresponding
 * JVM system property. A proxy port that is missing, not a valid integer, or
 * equal to 0 is treated as "not supplied"; if the system property is missing
 * or malformed as well, port 0 is passed on to the connection manager.
 *
 * @param threads stored as the thread count; the connection manager is also
 *     given {@code threads * CrawlerConstants.MAX_CONNECTIONS_PER_THREAD}
 * @param proxyHost - the proxy host or <code>null</code> to use
 *     System.getProperties().get("http.proxyHost")
 * @param proxyPort - the proxy port or <code>null</code> to use
 *     System.getProperties().get("http.proxyPort")
 * @param proxyUser - the proxy user or <code>null</code> to use
 *     System.getProperties().get("http.proxyUser")
 * @param proxyPassword - the proxy user password or <code>null</code> to use
 *     System.getProperties().get("http.proxyPassword")
 */
public Crawler(
    int threads, String proxyHost, String proxyPort, String proxyUser, String proxyPassword) {
  _threads = threads;

  String phost = proxyHost;
  int pport = parseProxyPort(proxyPort);
  String puser = proxyUser;
  String ppassword = proxyPassword;

  // Fall back to the JVM-wide proxy settings for everything not given explicitly.
  Object sysHost = System.getProperties().get("http.proxyHost");
  if (phost == null && sysHost != null) {
    phost = sysHost.toString();
  }

  // The system property is parsed as leniently as the argument: a malformed
  // value leaves the port at 0 instead of throwing out of the constructor.
  Object sysPort = System.getProperties().get("http.proxyPort");
  if (pport == 0 && sysPort != null) {
    pport = parseProxyPort(sysPort.toString());
  }

  Object sysUser = System.getProperties().get("http.proxyUser");
  if (puser == null && sysUser != null) {
    puser = sysUser.toString();
  }

  Object sysPassword = System.getProperties().get("http.proxyPassword");
  if (ppassword == null && sysPassword != null) {
    ppassword = sysPassword.toString();
  }

  _cm =
      new ConnectionManager(
          phost, pport, puser, ppassword, threads * CrawlerConstants.MAX_CONNECTIONS_PER_THREAD);
  _cm.setRetries(CrawlerConstants.RETRIES);

  // Always use the local TldManager implementation. It replaced the one from
  // NxParser for two reasons:
  //  * a couple of bugs were fixed there
  //  * the Public Suffix List was updated and changed to support .asia
  // The TLD list is therefore loaded locally only, not fetched online through
  // the connection manager.
  try {
    _tldm = new TldManager();
  } catch (IOException e1) {
    // Best effort: _tldm is not assigned here when the local list cannot be read.
    _log.info("cannot get tld file locally " + e1.getMessage());
  }

  _eh = new ErrorHandlerDummy();

  _robots = new Robots(_cm);
  _robots.setErrorHandler(_eh);

  // Sitemap support is currently disabled, so no Sitemaps instance is created.

  _contentHandler = new ContentHandlerRdfXml();
  _output = new SinkDummy();
  _ff = new FetchFilterAllow();
  _blacklist = new FetchFilterAllow();
}

/** Parses a proxy port, returning 0 when the value is null or not a valid integer. */
private static int parseProxyPort(String port) {
  if (port == null) {
    return 0;
  }
  try {
    return Integer.parseInt(port);
  } catch (NumberFormatException nfe) {
    return 0;
  }
}
public void run() { _log.info("starting thread ..."); if (!(!CrawlerConstants.URI_LIMIT_ENABLED || (_overall200FetchesWithRDF.get() < CrawlerConstants.URI_LIMIT_WITH_NON_EMPTY_RDF))) { _log.info("URI limit reached. Stopping..."); return; } int i = 0; URI lu = _q.poll(); _log.fine("got " + lu); while (lu != null) { if (!(!CrawlerConstants.URI_LIMIT_ENABLED || (_overall200FetchesWithRDF.get() < CrawlerConstants.URI_LIMIT_WITH_NON_EMPTY_RDF))) { _log.info("URI limit reached. Stopping..."); break; } setName("LT-" + _no + ":" + lu.toString()); _q.addSeen(lu); i++; long time = System.currentTimeMillis(); // URI lu = _q.obtainRedirect(u); long time1 = System.currentTimeMillis(); long time2 = time1; long time3 = time1; long bytes = -1; int status = 0; String type = null; // List<URI> li = _sitemaps.getSitemapUris(lu); // if (li != null && li.size() > 0) { // _log.info("sitemap surprisingly actually has uris " + li); // } Header[] headers = null; if (!_blacklist.fetchOk(lu, 0, null)) { _log.info("access denied per blacklist for " + lu); _eh.handleStatus(lu, CrawlerConstants.SKIP_SUFFIX, null, 0, -1); } else if (!_robots.accessOk(lu)) { _log.info("access denied per robots.txt for " + lu); _eh.handleStatus(lu, CrawlerConstants.SKIP_ROBOTS, null, 0, -1); } else { time2 = System.currentTimeMillis(); HttpGet hget = new HttpGet(lu); hget.setHeaders(CrawlerConstants.HEADERS); try { HttpResponse hres = _hclient.connect(hget); HttpEntity hen = hres.getEntity(); status = hres.getStatusLine().getStatusCode(); Header ct = hres.getFirstHeader("Content-Type"); if (ct != null) { type = hres.getFirstHeader("Content-Type").getValue(); } _log.info("lookup on " + lu + " status " + status + " " + getName()); if (status == HttpStatus.SC_OK) { if (hen != null) { if (_ff.fetchOk(lu, status, hen) && _contentHandler.canHandle(type)) { InputStream is = hen.getContent(); Callback contentCb = _content.newDataset(new Provenance(lu, hres.getAllHeaders(), status)); Callbacks cbs = new Callbacks( new 
Callback[] {contentCb, _links, _stmtCountingCallback.reset()}); _contentHandler.handle(lu, type, is, cbs); is.close(); _overall200Fetches.incrementAndGet(); if (_stmtCountingCallback.getStmtCount() > 0) _overall200FetchesWithRDF.incrementAndGet(); // System.out.println("done with " + lu); headers = hres.getAllHeaders(); Header hloc = hres.getFirstHeader("Content-Location"); if (hloc != null) { URI to = new URI(hloc.getValue()); // handle local redirects if (!to.isAbsolute()) { to = lu.resolve(hloc.getValue()); } _q.setRedirect(lu, to, status); _eh.handleRedirect(lu, to, status); _q.addSeen(to); } } else { _log.info("disallowed via fetch filter " + lu + " type " + type); _eh.handleStatus(lu, CrawlerConstants.SKIP_MIMETYPE, null, 0, -1); hget.abort(); hen = null; status = 0; } } else { _log.info("HttpEntity for " + lu + " is null"); } } else if (status == HttpStatus.SC_MOVED_PERMANENTLY || status == HttpStatus.SC_MOVED_TEMPORARILY || status == HttpStatus.SC_SEE_OTHER || status == HttpStatus.SC_TEMPORARY_REDIRECT) { // treating all redirects the same but shouldn't: 301 -> rename context URI, 302,307 -> // keep original context URI, 303 -> spec inconclusive Header[] loc = hres.getHeaders("location"); String path = loc[0].getValue(); _log.info("redirecting (" + status + ") to " + path); URI to = new URI(path); // handle local redirects if (!to.isAbsolute()) { to = lu.resolve(path); } // set redirect from original uri to new uri _q.setRedirect(lu, to, status); _eh.handleRedirect(lu, to, status); headers = hres.getAllHeaders(); } if (hen != null) { bytes = hen.getContentLength(); } hget.abort(); } catch (Throwable e) { hget.abort(); _log.warning("Exception " + e.getClass().getName() + " " + lu); _eh.handleError(lu, e); } time3 = System.currentTimeMillis(); if (status != 0) { _eh.handleStatus(lu, status, headers, (time3 - time2), bytes); } _log.fine( lu + " " + (time1 - time) + " ms before lookup, " + (time2 - time1) + " ms to check if lookup is ok, " + (time3 - time2) + " 
ms for lookup"); } lu = _q.poll(); } _log.info( "finished thread after fetching " + i + " uris; " + getOverall200Fetches() + " in all threads overall until now (" + getOverall200FetchesWithNonEmptyRDF() + " with non-empty RDF)."); }