public void evaluateLoadBalanced(Frontier frontier, Seen seen, int maxuris) { if (_queue == null || !(_queue instanceof LoadBalancingQueue)) { Redirects r = null; if (_queue != null) r = _queue.getRedirects(); if (r == null) // try { // _log.info("The instance get executed"); // r = _redirsClass.newInstance(); r = new DummyRedirects(); // } // catch (InstantiationException e) { // _log.info("InstantiationException. Using dummy."); // r = new DummyRedirects(); // } catch (IllegalAccessException e) { // _log.info("IllegalAccessException. Using dummy."); // r = new DummyRedirects(); // } _queue = new LoadBalancingQueue(_tldm, r, seen); } else { Redirects r = _queue.getRedirects(); seen = _queue.getSeen(); _queue = new LoadBalancingQueue(_tldm, r, seen); _queue.setSeen(seen); } if (_links == null) { _links = new LinkFilterDefault(frontier); } _queue.schedule(frontier); _log.fine(_queue.toString()); int i = 0; int uris = 0; while (uris < maxuris && _queue.size() > 0) { int size = _queue.size(); List<Thread> ts = new ArrayList<Thread>(); for (int j = 0; j < _threads; j++) { LookupThread lt = new LookupThread( _cm, _queue, _contentHandler, _output, _links, _robots, _eh, _ff, _blacklist, j); ts.add(lt); // new Thread(lt,"LookupThread-"+j)); } _log.info("Starting threads round " + i++ + " with " + _queue.size() + " uris"); for (Thread t : ts) { t.start(); } Monitor m = new Monitor(ts, System.err, 1000 * 10); m.start(); for (Thread t : ts) { try { t.join(); } catch (InterruptedException e1) { _log.info(e1.getMessage()); // e1.printStackTrace(); } } m.shutdown(); uris += size - _queue.size(); _log.info("ROUND " + i + " DONE with " + _queue.size() + " uris remaining in queue"); _log.fine("old queue: \n" + _queue.toString()); _log.fine("frontier" + frontier); _queue.schedule(frontier); _log.info("new queue: \n" + _queue.toString()); } }
public void evaluateBreadthFirst( Frontier frontier, Seen seen, Redirects redirects, int depth, int maxuris, int maxplds, int minActPlds, boolean minActPldsAlready4Seedlist, Mode crawlingMode) { Redirects r = redirects; if (_queue != null) r = _queue.getRedirects(); if (_queue == null || !(_queue instanceof BreadthFirstQueue || _queue instanceof DiskBreadthFirstQueue)) { if (CrawlerConstants.BREADTHFIRSTQUEUE_ONDISK) _queue = new DiskBreadthFirstQueue(_tldm, r, seen, minActPlds, minActPldsAlready4Seedlist); else _queue = new BreadthFirstQueue( _tldm, r, seen, maxuris, maxplds, minActPlds, minActPldsAlready4Seedlist); } else { Seen tempseen = _queue.getSeen(); _queue = new BreadthFirstQueue( _tldm, r, seen, maxuris, maxplds, minActPlds, minActPldsAlready4Seedlist); _queue.setRedirects(r); _queue.setSeen(tempseen); } if (_links == null) { _links = new LinkFilterDefault(frontier); } _queue.schedule(frontier); _links.setFollowABox(crawlingMode.followABox()); _links.setFollowTBox(crawlingMode.followTBox()); _log.info(_queue.toString()); int rounds = crawlingMode.doExtraRound() ? depth + 1 : depth; for (int curRound = 0; (curRound <= rounds) && (CrawlerConstants.URI_LIMIT_ENABLED ? (LookupThread.getOverall200FetchesWithNonEmptyRDF() < CrawlerConstants.URI_LIMIT_WITH_NON_EMPTY_RDF) : true); curRound++) { List<Thread> ts = new ArrayList<Thread>(); // Extra round to get TBox if (curRound == depth) { _links.setFollowABox(false); _links.setFollowTBox(true); } for (int j = 0; j < _threads; j++) { LookupThread lt = new LookupThread( _cm, _queue, _contentHandler, _output, _links, _robots, _eh, _ff, _blacklist, j); ts.add(lt); // new Thread(lt,"LookupThread-"+j)); } _log.info("Starting threads round " + curRound + " with " + _queue.size() + " uris"); Monitor m = new Monitor(ts, System.err, 1000 * 10); m.start(); for (Thread t : ts) { t.start(); } for (Thread t : ts) { try { t.join(); } catch (InterruptedException e1) { _log.info(e1.getMessage()); // e1.printStackTrace(); } } m.shutdown(); _log.info("ROUND " + curRound + " DONE with " + _queue.size() + " uris remaining in queue"); _log.fine("old queue: \n" + _queue.toString()); if (_output instanceof LastReporter) _log.info( "Last non-empty context of this hop (# " + curRound + " ): " + ((LastReporter) _output).whoWasLast()); if (CrawlerConstants.SPLIT_HOPWISE) { for (TakingHopsIntoAccount thia : CrawlerConstants.THOSE_WHO_TAKE_HOPS_INTO_ACCOUNT) { try { thia.nextHop(curRound + 1); } catch (Exception e) { e.printStackTrace(); } } } _queue.schedule(frontier); _eh.handleNextRound(); _log.fine("new queue: \n" + _queue.toString()); } }