@Override
public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
    assert entry != null;
    final byte[] hash = entry.url().hash();
    synchronized (this) {
        // double-check
        if (this.has(hash)) return "double occurrence in urlFileIndex";

        // increase dom counter
        if (profile != null) {
            final int maxPages = profile.domMaxPages();
            if (maxPages != Integer.MAX_VALUE && maxPages > 0) {
                final String host = entry.url().getHost();
                profile.domInc(host);
            }
        }

        // add to index
        final Index depthStack = getStack(entry.depth());
        final int s = depthStack.size();
        depthStack.put(entry.toRow());
        assert s < depthStack.size() : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + depthStack.size();
        assert depthStack.has(hash) : "hash = " + ASCII.String(hash);
    }
    return null;
}
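// Hedged usage sketch (caller names and the Request constructor signature are assumed,
// not taken from this file): push() returns null on success and a human-readable reason
// string, such as the duplicate-hash message above, when the entry is rejected.
//
//   final Request req = new Request(url, referrerHash);           // hypothetical constructor
//   final String rejectReason = queue.push(req, profile, robots);
//   if (rejectReason != null) log.fine("not queued: " + rejectReason);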
/**
 * remove urls from the queue
 *
 * @param urlHashes a set of url hashes that shall be removed
 * @return number of entries that had been removed
 * @throws IOException
 */
@Override
public synchronized int remove(final HandleSet urlHashes) throws IOException {
    int removedCounter = 0;
    for (final Index depthStack : this.depthStacks.values()) {
        final int s = depthStack.size();
        // count removals per stack so the size assertion below stays valid and
        // stacks after a stack that contained no matches are still searched
        int removedHere = 0;
        for (final byte[] urlhash : urlHashes) {
            final Row.Entry entry = depthStack.remove(urlhash);
            if (entry != null) removedHere++;
        }
        assert depthStack.size() + removedHere == s : "urlFileIndex.size() = " + depthStack.size() + ", s = " + s;
        removedCounter += removedHere;
    }
    return removedCounter;
}
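// Hedged usage sketch: RowHandleSet and the Word constants exist in YaCy, but their use
// here is illustrative and the surrounding names are assumed.
//
//   final HandleSet hashes = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 2);
//   hashes.put(urlA.hash());
//   hashes.put(urlB.hash());
//   final int removed = queue.remove(hashes);  // number of entries actually deleted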
private int openAllStacks() {
    final String[] l = this.hostPath.list();
    int c = 0;
    if (l != null) {
        for (final String s : l) {
            if (!s.endsWith(indexSuffix)) continue;
            try {
                final int depth = Integer.parseInt(s.substring(0, s.length() - indexSuffix.length()));
                final File stackFile = new File(this.hostPath, s);
                final Index depthStack = openStack(stackFile);
                if (depthStack != null) {
                    final int sz = depthStack.size();
                    if (sz == 0) {
                        // drop empty stack files instead of keeping them open
                        depthStack.close();
                        deletedelete(stackFile);
                    } else {
                        this.depthStacks.put(depth, depthStack);
                        c += sz;
                    }
                }
            } catch (final NumberFormatException e) {
                // file name does not encode a crawl depth; ignore it
            }
        }
    }
    return c;
}
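// Note on the on-disk layout (the suffix value is assumed for illustration): each depth
// stack is persisted as "<depth><indexSuffix>" inside hostPath. If indexSuffix were
// ".stack", a file named "4.stack" would hold all queued requests with crawl depth 4;
// files whose prefix is not a number are skipped by the NumberFormatException handler above.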
@Override
public int size() {
    int size = 0;
    for (final Index depthStack : this.depthStacks.values()) {
        size += depthStack.size();
    }
    return size;
}
@Override
public Request pop(boolean delay, CrawlSwitchboard cs, RobotsTxt robots) throws IOException {
    // returns a crawl entry from the stack and ensures minimum delta times
    long sleeptime = 0;
    Request crawlEntry = null;
    CrawlProfile profileEntry = null;
    synchronized (this) {
        mainloop: while (true) {
            final Index depthStack = getLowestStack();
            if (depthStack == null) return null;
            Row.Entry rowEntry = null;
            while (depthStack.size() > 0) {
                rowEntry = depthStack.removeOne();
                if (rowEntry != null) break;
            }
            if (rowEntry == null) continue mainloop;
            crawlEntry = new Request(rowEntry);

            // check blacklist (again) because the user may have created blacklist
            // entries after the queue was filled
            if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
                if (log.isFine()) log.fine("URL '" + crawlEntry.url() + "' is in blacklist.");
                continue mainloop;
            }

            // at this point we must check whether the crawlEntry is still relevant,
            // i.e. whether its crawl profile still exists; if not, skip the entry.
            // A calling method must handle a null return value and try again.
            profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
            if (profileEntry == null) {
                if (log.isFine()) log.fine("no profile entry for handle " + crawlEntry.profileHandle());
                continue mainloop;
            }

            // depending on the caching policy we need sleep time to avoid DoS-like situations
            sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());
            break;
        }
    }
    if (crawlEntry == null) return null;

    final ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
    final long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);
    Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
    if (delay && sleeptime > 0) {
        // force busy waiting here; in the best case this never happens if the balancer
        // works properly. This is only a protection against the worst case, where the
        // crawler could behave in a DoS-like manner.
        if (log.isInfo()) log.info("forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent));

        // split the sleep time into full seconds plus a remainder; short delays
        // (less than 3 seconds) are folded into a single wait
        long loops = sleeptime / 1000;
        long rest = sleeptime % 1000;
        if (loops < 3) {
            rest = rest + 1000 * loops;
            loops = 0;
        }
        Thread.currentThread().setName("Balancer waiting for " + crawlEntry.url().getHost() + ": " + sleeptime + " milliseconds");
        synchronized (this) {
            // must be synchronized here to avoid 'takeover' moves from other threads,
            // which would then idle the same time, which would not be enough
            if (rest > 0) {
                try { this.wait(rest); } catch (final InterruptedException e) {}
            }
            for (int i = 0; i < loops; i++) {
                if (log.isInfo()) log.info("waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
                try { this.wait(1000); } catch (final InterruptedException e) {}
            }
        }
        Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
    }
    return crawlEntry;
}
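// Hedged usage sketch (a minimal crawler drain loop; the switchboard field names and the
// fetch routine are assumptions, not part of this file):
//
//   Request next;
//   while ((next = queue.pop(true, sb.crawler, sb.robots)) != null) {
//       // pop(true, ...) has already enforced the per-host crawl delay,
//       // so the entry can be fetched immediately
//       fetch(next.url());  // hypothetical fetch routine
//   }
//
// A null return means the queue ran empty; entries whose crawl profile has been deleted
// are skipped internally, and callers are expected to handle null and try again later.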