public void evaluateSequential(Frontier frontier, Seen seen) {
    Redirects r = null;
    try {
        r = _redirsClass.newInstance();
    } catch (InstantiationException e) {
        _log.info("InstantiationException. Using dummy.");
        r = new DummyRedirects();
    } catch (IllegalAccessException e) {
        _log.info("IllegalAccessException. Using dummy.");
        r = new DummyRedirects();
    }

    _queue = new BreadthFirstQueue(_tldm, r, seen, Integer.MAX_VALUE, Integer.MAX_VALUE, -1, false);
    _queue.schedule(frontier);

    _log.info(_queue.toString());

    int i = 0;

    while (_queue.size() > 0 && i <= CrawlerConstants.MAX_REDIRECTS) {
        List<Thread> ts = new ArrayList<Thread>();

        for (int j = 0; j < _threads; j++) {
            LookupThread lt = new LookupThread(_cm, _queue, _contentHandler, _output, _links,
                    _robots, _eh, _ff, _blacklist, j);
            ts.add(lt);
        }

        _log.info("Starting threads round " + i++ + " with " + _queue.size() + " uris");

        for (Thread t : ts) {
            t.start();
        }

        Monitor m = new Monitor(ts, System.err, 1000 * 10);
        m.start();

        for (Thread t : ts) {
            try {
                t.join();
            } catch (InterruptedException e1) {
                _log.info(e1.getMessage());
            }
        }

        m.shutdown();

        _queue.schedule(frontier);

        _log.info("ROUND " + i + " DONE with " + _queue.size() + " uris remaining in queue");
        _log.info("new queue: \n" + _queue.toString());
    }

    _log.info("DONE with " + _queue.size() + " uris remaining in queue");
}
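// The loop above is a round-based worker pool: a fixed batch of LookupThreads
// is started, joined, and the queue is re-scheduled between rounds. Below is a
// minimal, self-contained sketch of that pattern using only java.util types;
// RoundBasedWorkersSketch and its names are hypothetical and not part of this
// crawler. Call drain() with a thread-safe queue such as a
// java.util.concurrent.ConcurrentLinkedQueue.
private static final class RoundBasedWorkersSketch {
    static void drain(final java.util.Queue<String> queue) throws InterruptedException {
        final int numWorkers = 4; // stand-in for the crawler's _threads setting

        while (!queue.isEmpty()) {
            java.util.List<Thread> workers = new java.util.ArrayList<Thread>();

            for (int j = 0; j < numWorkers; j++) {
                workers.add(new Thread(new Runnable() {
                    public void run() {
                        // each worker polls the shared queue until it is drained
                        String item;
                        while ((item = queue.poll()) != null) {
                            System.out.println(Thread.currentThread().getName() + " got " + item);
                        }
                    }
                }, "Worker-" + j));
            }

            for (Thread t : workers) {
                t.start();
            }

            // joining all workers ends the round; work added in the meantime is
            // picked up by the next iteration of the outer loop
            for (Thread t : workers) {
                t.join();
            }
        }
    }
}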
public void evaluateBreadthFirst(Frontier frontier, Seen seen, Redirects redirects, int depth,
        int maxuris, int maxplds, int minActPlds, boolean minActPldsAlready4Seedlist,
        Mode crawlingMode) {
    Redirects r = redirects;
    if (_queue != null) {
        r = _queue.getRedirects();
    }

    if (_queue == null
            || !(_queue instanceof BreadthFirstQueue || _queue instanceof DiskBreadthFirstQueue)) {
        if (CrawlerConstants.BREADTHFIRSTQUEUE_ONDISK) {
            _queue = new DiskBreadthFirstQueue(_tldm, r, seen, minActPlds,
                    minActPldsAlready4Seedlist);
        } else {
            _queue = new BreadthFirstQueue(_tldm, r, seen, maxuris, maxplds, minActPlds,
                    minActPldsAlready4Seedlist);
        }
    } else {
        Seen tempseen = _queue.getSeen();
        _queue = new BreadthFirstQueue(_tldm, r, seen, maxuris, maxplds, minActPlds,
                minActPldsAlready4Seedlist);
        _queue.setRedirects(r);
        _queue.setSeen(tempseen);
    }

    if (_links == null) {
        _links = new LinkFilterDefault(frontier);
    }

    _queue.schedule(frontier);

    _links.setFollowABox(crawlingMode.followABox());
    _links.setFollowTBox(crawlingMode.followTBox());

    _log.info(_queue.toString());

    int rounds = crawlingMode.doExtraRound() ? depth + 1 : depth;

    for (int curRound = 0;
            curRound <= rounds
                    && (!CrawlerConstants.URI_LIMIT_ENABLED
                            || LookupThread.getOverall200FetchesWithNonEmptyRDF()
                                    < CrawlerConstants.URI_LIMIT_WITH_NON_EMPTY_RDF);
            curRound++) {
        List<Thread> ts = new ArrayList<Thread>();

        // Extra round to get TBox
        if (curRound == depth) {
            _links.setFollowABox(false);
            _links.setFollowTBox(true);
        }

        for (int j = 0; j < _threads; j++) {
            LookupThread lt = new LookupThread(_cm, _queue, _contentHandler, _output, _links,
                    _robots, _eh, _ff, _blacklist, j);
            ts.add(lt);
        }

        _log.info("Starting threads round " + curRound + " with " + _queue.size() + " uris");

        Monitor m = new Monitor(ts, System.err, 1000 * 10);
        m.start();

        for (Thread t : ts) {
            t.start();
        }

        for (Thread t : ts) {
            try {
                t.join();
            } catch (InterruptedException e1) {
                _log.info(e1.getMessage());
            }
        }

        m.shutdown();

        _log.info("ROUND " + curRound + " DONE with " + _queue.size() + " uris remaining in queue");
        _log.fine("old queue: \n" + _queue.toString());

        if (_output instanceof LastReporter) {
            _log.info("Last non-empty context of this hop (# " + curRound + "): "
                    + ((LastReporter) _output).whoWasLast());
        }

        if (CrawlerConstants.SPLIT_HOPWISE) {
            for (TakingHopsIntoAccount thia : CrawlerConstants.THOSE_WHO_TAKE_HOPS_INTO_ACCOUNT) {
                try {
                    thia.nextHop(curRound + 1);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }

        _queue.schedule(frontier);
        _eh.handleNextRound();

        _log.fine("new queue: \n" + _queue.toString());
    }
}
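// A compact, self-contained sketch of the hop-wise breadth-first scheduling
// performed above: each round is one hop, the current frontier is drained
// completely, and newly discovered URIs form the next round's frontier. All
// names here are hypothetical, the fetch/parse step is stubbed out, and only
// java.util / java.net types are used.
private static final class BreadthFirstSketch {
    static void crawl(java.util.Set<java.net.URI> seeds, int depth) {
        java.util.Set<java.net.URI> seen = new java.util.HashSet<java.net.URI>(seeds);
        java.util.Queue<java.net.URI> frontier = new java.util.LinkedList<java.net.URI>(seeds);

        for (int hop = 0; hop <= depth && !frontier.isEmpty(); hop++) {
            java.util.Queue<java.net.URI> next = new java.util.LinkedList<java.net.URI>();
            for (java.net.URI u : frontier) {
                // stub: a real crawler would fetch u and extract its links here
                for (java.net.URI link : extractLinks(u)) {
                    if (seen.add(link)) { // schedule each URI at most once
                        next.add(link);
                    }
                }
            }
            frontier = next; // the frontier of the next hop
        }
    }

    private static java.util.List<java.net.URI> extractLinks(java.net.URI u) {
        return java.util.Collections.emptyList(); // placeholder for real parsing
    }
}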
public void evaluateLoadBalanced(Frontier frontier, Seen seen, int maxuris) {
    if (_queue == null || !(_queue instanceof LoadBalancingQueue)) {
        Redirects r = null;
        if (_queue != null) {
            r = _queue.getRedirects();
        }
        if (r == null) {
            r = new DummyRedirects();
        }
        _queue = new LoadBalancingQueue(_tldm, r, seen);
    } else {
        Redirects r = _queue.getRedirects();
        seen = _queue.getSeen();
        _queue = new LoadBalancingQueue(_tldm, r, seen);
        _queue.setSeen(seen);
    }

    if (_links == null) {
        _links = new LinkFilterDefault(frontier);
    }

    _queue.schedule(frontier);

    _log.fine(_queue.toString());

    int i = 0;
    int uris = 0;

    while (uris < maxuris && _queue.size() > 0) {
        int size = _queue.size();

        List<Thread> ts = new ArrayList<Thread>();

        for (int j = 0; j < _threads; j++) {
            LookupThread lt = new LookupThread(_cm, _queue, _contentHandler, _output, _links,
                    _robots, _eh, _ff, _blacklist, j);
            ts.add(lt);
        }

        _log.info("Starting threads round " + i++ + " with " + _queue.size() + " uris");

        for (Thread t : ts) {
            t.start();
        }

        Monitor m = new Monitor(ts, System.err, 1000 * 10);
        m.start();

        for (Thread t : ts) {
            try {
                t.join();
            } catch (InterruptedException e1) {
                _log.info(e1.getMessage());
            }
        }

        m.shutdown();

        uris += size - _queue.size();

        _log.info("ROUND " + i + " DONE with " + _queue.size() + " uris remaining in queue");
        _log.fine("old queue: \n" + _queue.toString());
        _log.fine("frontier " + frontier);

        _queue.schedule(frontier);

        _log.info("new queue: \n" + _queue.toString());
    }
}
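// LoadBalancingQueue itself is not shown in this excerpt. As a rough
// illustration of what such a queue is for, the hypothetical sketch below keeps
// one sub-queue per host (the real queue takes the _tldm pay-level-domain
// manager, suggesting it balances per PLD; this simplifies to hosts) and
// rotates over them on poll(), so consecutive lookups hit different servers.
private static final class HostBalancedQueueSketch {
    private final java.util.LinkedHashMap<String, java.util.Queue<java.net.URI>> _perHost =
            new java.util.LinkedHashMap<String, java.util.Queue<java.net.URI>>();

    synchronized void add(java.net.URI u) {
        java.util.Queue<java.net.URI> q = _perHost.get(u.getHost());
        if (q == null) {
            q = new java.util.LinkedList<java.net.URI>();
            _perHost.put(u.getHost(), q);
        }
        q.add(u);
    }

    synchronized java.net.URI poll() {
        java.util.Iterator<java.util.Map.Entry<String, java.util.Queue<java.net.URI>>> it =
                _perHost.entrySet().iterator();
        if (!it.hasNext()) {
            return null;
        }
        java.util.Map.Entry<String, java.util.Queue<java.net.URI>> e = it.next();
        java.net.URI u = e.getValue().poll();
        it.remove();
        if (!e.getValue().isEmpty()) {
            // re-append the host so the next poll() serves a different one
            _perHost.put(e.getKey(), e.getValue());
        }
        return u;
    }
}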
public void run() {
    _log.info("starting thread ...");

    if (CrawlerConstants.URI_LIMIT_ENABLED
            && _overall200FetchesWithRDF.get() >= CrawlerConstants.URI_LIMIT_WITH_NON_EMPTY_RDF) {
        _log.info("URI limit reached. Stopping...");
        return;
    }

    int i = 0;

    URI lu = _q.poll();

    _log.fine("got " + lu);

    while (lu != null) {
        if (CrawlerConstants.URI_LIMIT_ENABLED
                && _overall200FetchesWithRDF.get() >= CrawlerConstants.URI_LIMIT_WITH_NON_EMPTY_RDF) {
            _log.info("URI limit reached. Stopping...");
            break;
        }

        setName("LT-" + _no + ":" + lu.toString());

        _q.addSeen(lu);

        i++;

        long time = System.currentTimeMillis();
        long time1 = System.currentTimeMillis();
        long time2 = time1;
        long time3 = time1;

        long bytes = -1;
        int status = 0;
        String type = null;

        Header[] headers = null;

        if (!_blacklist.fetchOk(lu, 0, null)) {
            _log.info("access denied per blacklist for " + lu);
            _eh.handleStatus(lu, CrawlerConstants.SKIP_SUFFIX, null, 0, -1);
        } else if (!_robots.accessOk(lu)) {
            _log.info("access denied per robots.txt for " + lu);
            _eh.handleStatus(lu, CrawlerConstants.SKIP_ROBOTS, null, 0, -1);
        } else {
            time2 = System.currentTimeMillis();

            HttpGet hget = new HttpGet(lu);
            hget.setHeaders(CrawlerConstants.HEADERS);

            try {
                HttpResponse hres = _hclient.connect(hget);
                HttpEntity hen = hres.getEntity();

                status = hres.getStatusLine().getStatusCode();

                Header ct = hres.getFirstHeader("Content-Type");
                if (ct != null) {
                    type = ct.getValue();
                }

                _log.info("lookup on " + lu + " status " + status + " " + getName());

                if (status == HttpStatus.SC_OK) {
                    if (hen != null) {
                        if (_ff.fetchOk(lu, status, hen) && _contentHandler.canHandle(type)) {
                            InputStream is = hen.getContent();
                            Callback contentCb =
                                    _content.newDataset(new Provenance(lu, hres.getAllHeaders(), status));
                            Callbacks cbs = new Callbacks(
                                    new Callback[] {contentCb, _links, _stmtCountingCallback.reset()});
                            _contentHandler.handle(lu, type, is, cbs);
                            is.close();

                            _overall200Fetches.incrementAndGet();
                            if (_stmtCountingCallback.getStmtCount() > 0) {
                                _overall200FetchesWithRDF.incrementAndGet();
                            }

                            headers = hres.getAllHeaders();

                            Header hloc = hres.getFirstHeader("Content-Location");
                            if (hloc != null) {
                                URI to = new URI(hloc.getValue());
                                // handle local redirects
                                if (!to.isAbsolute()) {
                                    to = lu.resolve(hloc.getValue());
                                }
                                _q.setRedirect(lu, to, status);
                                _eh.handleRedirect(lu, to, status);
                                _q.addSeen(to);
                            }
                        } else {
                            _log.info("disallowed via fetch filter " + lu + " type " + type);
                            _eh.handleStatus(lu, CrawlerConstants.SKIP_MIMETYPE, null, 0, -1);
                            hget.abort();
                            hen = null;
                            status = 0;
                        }
                    } else {
                        _log.info("HttpEntity for " + lu + " is null");
                    }
                } else if (status == HttpStatus.SC_MOVED_PERMANENTLY
                        || status == HttpStatus.SC_MOVED_TEMPORARILY
                        || status == HttpStatus.SC_SEE_OTHER
                        || status == HttpStatus.SC_TEMPORARY_REDIRECT) {
                    // treating all redirects the same but shouldn't: 301 -> rename context URI;
                    // 302, 307 -> keep original context URI; 303 -> spec inconclusive
                    Header[] loc = hres.getHeaders("location");
                    String path = loc[0].getValue();

                    _log.info("redirecting (" + status + ") to " + path);

                    URI to = new URI(path);
                    // handle local redirects
                    if (!to.isAbsolute()) {
                        to = lu.resolve(path);
                    }

                    // set redirect from original uri to new uri
                    _q.setRedirect(lu, to, status);
                    _eh.handleRedirect(lu, to, status);

                    headers = hres.getAllHeaders();
                }

                if (hen != null) {
                    bytes = hen.getContentLength();
                }

                hget.abort();
            } catch (Throwable e) {
                hget.abort();
                _log.warning("Exception " + e.getClass().getName() + " " + lu);
                _eh.handleError(lu, e);
            }

            time3 = System.currentTimeMillis();

            if (status != 0) {
                _eh.handleStatus(lu, status, headers, (time3 - time2), bytes);
            }

            _log.fine(lu + " " + (time1 - time) + " ms before lookup, " + (time2 - time1)
                    + " ms to check if lookup is ok, " + (time3 - time2) + " ms for lookup");
        }

        lu = _q.poll();
    }

    _log.info("finished thread after fetching " + i + " uris; " + getOverall200Fetches()
            + " in all threads overall until now (" + getOverall200FetchesWithNonEmptyRDF()
            + " with non-empty RDF).");
}
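// The relative-redirect handling in run() relies on java.net.URI.resolve: a
// Location or Content-Location value that is not absolute is resolved against
// the request URI. A minimal standalone check of that behavior (the URIs here
// are made up for illustration):
private static final class RedirectResolveSketch {
    public static void main(String[] args) throws java.net.URISyntaxException {
        java.net.URI base = new java.net.URI("http://example.org/data/resource");
        java.net.URI loc = new java.net.URI("/other"); // e.g. a relative Location header
        java.net.URI to = loc.isAbsolute() ? loc : base.resolve("/other");
        System.out.println(to); // prints http://example.org/other
    }
}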