/**
 * Push a crawl request onto the stack that matches the request's crawl depth.
 * @return null on success, or an error message if the entry is already queued
 */
@Override
public String push(final Request entry, final CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
    assert entry != null;
    final byte[] hash = entry.url().hash();
    synchronized (this) {
        // double-check: reject entries that are already queued
        if (this.has(hash)) return "double occurrence in urlFileIndex";

        // increase the domain counter if the profile limits the number of pages per domain
        if (profile != null) {
            final int maxPages = profile.domMaxPages();
            if (maxPages != Integer.MAX_VALUE && maxPages > 0) {
                final String host = entry.url().getHost();
                profile.domInc(host);
            }
        }

        // add the entry to the index for its crawl depth
        final Index depthStack = getStack(entry.depth());
        final int s = depthStack.size();
        depthStack.put(entry.toRow());
        assert s < depthStack.size() : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + depthStack.size();
        assert depthStack.has(hash) : "hash = " + ASCII.String(hash);
    }
    return null;
}
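/**
 * Remove all queued entries that belong to the given crawl profile handle.
 * The scan over the depth stacks is aborted once the given timeout has elapsed;
 * a non-positive timeout (or Long.MAX_VALUE) disables the time limit.
 * @return the number of removed entries
 */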
@Override
public int removeAllByProfileHandle(final String profileHandle, final long timeout) throws IOException, SpaceExceededException {
    // compute the absolute termination time; Long.MAX_VALUE is excluded because
    // adding it to the current time would overflow
    final long terminate = (timeout > 0 && timeout != Long.MAX_VALUE) ? System.currentTimeMillis() + timeout : Long.MAX_VALUE;
    int count = 0;
    synchronized (this) {
        for (final Index depthStack : this.depthStacks.values()) {
            // first collect the url hashes that shall be deleted ...
            final HandleSet urlHashes = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 100);
            final Iterator<Row.Entry> i = depthStack.rows();
            while (i.hasNext() && System.currentTimeMillis() < terminate) {
                final Row.Entry rowEntry = i.next();
                final Request crawlEntry = new Request(rowEntry);
                if (crawlEntry.profileHandle().equals(profileHandle)) {
                    urlHashes.put(crawlEntry.url().hash());
                }
            }
            // ... then remove them from the stack
            for (final byte[] urlhash : urlHashes) {
                depthStack.remove(urlhash);
                count++;
            }
        }
    }
    return count;
}
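/**
 * Return the next crawl entry from the lowest non-empty depth stack and ensure
 * the minimum access delta time for the target host. May return null when the
 * queue is empty or when the entry's crawl profile no longer exists; the caller
 * must handle the null value and try again.
 */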
@Override
public Request pop(final boolean delay, final CrawlSwitchboard cs, final RobotsTxt robots) throws IOException {
    long sleeptime = 0;
    Request crawlEntry = null;
    CrawlProfile profileEntry = null;
    synchronized (this) {
        mainloop: while (true) {
            final Index depthStack = getLowestStack();
            if (depthStack == null) return null;
            Row.Entry rowEntry = null;
            while (depthStack.size() > 0) {
                rowEntry = depthStack.removeOne();
                if (rowEntry != null) break;
            }
            if (rowEntry == null) continue mainloop;
            crawlEntry = new Request(rowEntry);

            // check the blacklist (again) because the user may have created
            // blacklist entries after the queue was filled
            if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
                if (log.isFine()) log.fine("URL '" + crawlEntry.url() + "' is in blacklist.");
                continue mainloop;
            }

            // the entry is only relevant if its crawl profile still exists;
            // if not, skip it and try the next entry
            profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
            if (profileEntry == null) {
                if (log.isFine()) log.fine("no profile entry for handle " + crawlEntry.profileHandle());
                continue mainloop;
            }

            // depending on the caching policy we need sleep time to avoid DoS-like situations
            sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());
            break;
        }
    }
    if (crawlEntry == null) return null;

    final ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
    final long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);
    Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);

    if (delay && sleeptime > 0) {
        // force busy waiting here; in the best case this never happens if the
        // balancer works properly. This is only a protection against the worst
        // case, in which the crawler could behave in a DoS-like manner.
        if (log.isInfo()) log.info("forcing crawl-delay of " + sleeptime + " milliseconds for "
                + crawlEntry.url().getHost() + ": "
                + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent));

        // split the delay into one-second chunks so the remaining time can be
        // logged; delays shorter than three seconds are waited out in one piece
        long loops = sleeptime / 1000;
        long rest = sleeptime % 1000;
        if (loops < 3) {
            rest = rest + 1000 * loops;
            loops = 0;
        }
        Thread.currentThread().setName("Balancer waiting for " + crawlEntry.url().getHost() + ": " + sleeptime + " milliseconds");
        synchronized (this) {
            // must be synchronized here to avoid 'takeover' moves from other threads,
            // which would then idle for the same time, which would not be enough
            if (rest > 0) {
                try { this.wait(rest); } catch (final InterruptedException e) {}
            }
            for (int i = 0; i < loops; i++) {
                if (log.isInfo()) log.info("waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
                try { this.wait(1000); } catch (final InterruptedException e) {}
            }
        }
        Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
    }
    return crawlEntry;
}
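// A minimal sketch of the caller contract described above; the names
// "balancer", "sb" and "robots" are illustrative and not part of this class:
//
//   final Request next = balancer.pop(true, sb.crawler, robots);
//   if (next == null) {
//       // queue empty or the entry's profile vanished: try again later
//   } else {
//       // hand the request over to a fetch worker
//   }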