Code Example #1
  @Override
  public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots)
      throws IOException, SpaceExceededException {
    assert entry != null;
    final byte[] hash = entry.url().hash();
    synchronized (this) {
      // double-check
      if (this.has(hash)) return "double occurrence in urlFileIndex";

      // increase dom counter
      if (profile != null) {
        int maxPages = profile.domMaxPages();
        if (maxPages != Integer.MAX_VALUE && maxPages > 0) {
          String host = entry.url().getHost();
          profile.domInc(host);
        }
      }

      // add to index
      Index depthStack = getStack(entry.depth());
      final int s = depthStack.size();
      depthStack.put(entry.toRow());
      assert s < depthStack.size()
          : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + depthStack.size();
      assert depthStack.has(hash) : "hash = " + ASCII.String(hash);
    }
    return null; // null signals success; a non-null String explains why the entry was not queued
  }
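
The push method above relies on a map of per-depth stacks (this.depthStacks, reached through getStack(entry.depth()) here and getLowestStack() in the pop example below). The backing types Index and Row are internal to the project and not shown; the following is only a minimal, hypothetical sketch of the same per-depth bookkeeping built on plain JDK collections, to make the structure these methods operate on explicit.

  import java.util.ArrayDeque;
  import java.util.Deque;
  import java.util.Map;
  import java.util.TreeMap;

  // Hypothetical stand-in for the per-depth stacks used by push()/pop();
  // not the project's actual Index/Row classes. A TreeMap keeps the depths
  // sorted, so the lowest non-empty stack can be found by simple iteration.
  final class DepthStacks<T> {

    private final TreeMap<Integer, Deque<T>> stacks = new TreeMap<>();

    synchronized void push(final int depth, final T entry) {
      this.stacks.computeIfAbsent(depth, d -> new ArrayDeque<>()).add(entry);
    }

    // analogous to getLowestStack() followed by removeOne() in pop()
    synchronized T popLowest() {
      for (final Map.Entry<Integer, Deque<T>> e : this.stacks.entrySet()) {
        final T entry = e.getValue().poll();
        if (entry != null) return entry;
      }
      return null; // every stack is empty
    }

    synchronized int size() {
      int size = 0;
      for (final Deque<T> stack : this.stacks.values()) size += stack.size();
      return size;
    }
  }
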
Code Example #2
 /**
  * remove URLs from the queue
  *
  * @param urlHashes a set of URL hashes that shall be removed
  * @return the number of entries that were removed
  * @throws IOException
  */
 @Override
 public synchronized int remove(final HandleSet urlHashes) throws IOException {
   int removedCounter = 0;
   for (final Index depthStack : this.depthStacks.values()) {
     final int s = depthStack.size();
     // count removals per stack so the assertion below checks this stack only,
     // not the running total across all stacks
     int removedFromStack = 0;
     for (final byte[] urlhash : urlHashes) {
       final Row.Entry entry = depthStack.remove(urlhash);
       if (entry != null) removedFromStack++;
     }
     assert depthStack.size() + removedFromStack == s
         : "urlFileIndex.size() = " + depthStack.size() + ", s = " + s;
     removedCounter += removedFromStack;
   }
   return removedCounter;
 }
Code Example #3
 private int openAllStacks() {
   final String[] l = this.hostPath.list();
   int c = 0;
   if (l != null) {
     for (final String s : l) {
       if (s.endsWith(indexSuffix)) {
         try {
           final int depth = Integer.parseInt(s.substring(0, s.length() - indexSuffix.length()));
           final File stackFile = new File(this.hostPath, s);
           final Index depthStack = openStack(stackFile);
           if (depthStack != null) {
             final int sz = depthStack.size();
             if (sz == 0) {
               // empty stack files are closed and deleted right away
               depthStack.close();
               deletedelete(stackFile);
             } else {
               this.depthStacks.put(depth, depthStack);
               c += sz;
             }
           }
         } catch (final NumberFormatException e) {
           // ignore files whose name does not encode a crawl depth
         }
       }
     }
   }
   return c;
 }
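
openAllStacks assumes that every on-disk stack file is named after its crawl depth followed by a fixed suffix, so the depth can be recovered by parsing the file name. The following is only a hedged illustration of that convention; the concrete suffix is defined elsewhere in the class, and ".stack" is merely an assumed placeholder value.

  // Hypothetical helpers mirroring the naming convention parsed above.
  // The real indexSuffix constant lives elsewhere in the class; ".stack"
  // is an assumed placeholder value.
  static final String indexSuffix = ".stack";

  static String stackFileName(final int depth) {
    return depth + indexSuffix; // e.g. "3.stack" for crawl depth 3
  }

  static int depthFromFileName(final String name) {
    // same parse as in openAllStacks(); throws NumberFormatException for
    // names that do not start with a decimal depth
    return Integer.parseInt(name.substring(0, name.length() - indexSuffix.length()));
  }
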
Code Example #4
 @Override
 public int size() {
   int size = 0;
   for (Index depthStack : this.depthStacks.values()) {
     size += depthStack.size();
   }
   return size;
 }
Code Example #5
  @Override
  public Request pop(boolean delay, CrawlSwitchboard cs, RobotsTxt robots) throws IOException {
    // returns a crawl entry from the stack and ensures minimum delta times
    long sleeptime = 0;
    Request crawlEntry = null;
    CrawlProfile profileEntry = null;
    synchronized (this) {
      mainloop:
      while (true) {
        Index depthStack = getLowestStack();
        if (depthStack == null) return null;
        Row.Entry rowEntry = null;
        while (depthStack.size() > 0) {
          rowEntry = depthStack.removeOne();
          if (rowEntry != null) break;
        }
        if (rowEntry == null) continue mainloop;
        crawlEntry = new Request(rowEntry);

        // check blacklist (again) because the user may have created blacklist entries after the
        // queue has been filled
        if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
          if (log.isFine()) log.fine("URL '" + crawlEntry.url() + "' is in blacklist.");
          continue mainloop;
        }

        // at this point we must check whether the crawlEntry is still relevant, i.e. whether its
        // crawl profile still exists; if not, the entry is skipped. Once the queue is drained
        // this way, pop() returns null and the calling method must handle that and try again.
        profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
        if (profileEntry == null) {
          if (log.isFine()) log.fine("no profile entry for handle " + crawlEntry.profileHandle());
          continue mainloop;
        }

        // depending on the caching policy we need sleep time to avoid DoS-like situations
        sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());
        break;
      }
    }
    if (crawlEntry == null) return null;
    ClientIdentification.Agent agent =
        profileEntry == null
            ? ClientIdentification.yacyInternetCrawlerAgent
            : profileEntry.getAgent();
    long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);
    Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
    if (delay && sleeptime > 0) {
      // force a busy wait here
      // in the best case this should never happen if the balancer works properly
      // this is only a protection against the worst case, in which the crawler could
      // behave in a DoS-like manner
      if (log.isInfo())
        log.info(
            "forcing crawl-delay of "
                + sleeptime
                + " milliseconds for "
                + crawlEntry.url().getHost()
                + ": "
                + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent));
      long loops = sleeptime / 1000;
      long rest = sleeptime % 1000;
      if (loops < 3) {
        rest = rest + 1000 * loops;
        loops = 0;
      }
      Thread.currentThread()
          .setName(
              "Balancer waiting for "
                  + crawlEntry.url().getHost()
                  + ": "
                  + sleeptime
                  + " milliseconds");
      synchronized (this) {
        // must be synchronized here to prevent 'takeover' moves by other threads, which would
        // then wait the same time again, which would not be enough
        if (rest > 0) {
          try {
            this.wait(rest);
          } catch (final InterruptedException e) {
            // an interrupted wait simply ends early; the remaining delay is skipped
          }
        }
        for (int i = 0; i < loops; i++) {
          if (log.isInfo())
            log.info(
                "waiting for "
                    + crawlEntry.url().getHost()
                    + ": "
                    + (loops - i)
                    + " seconds remaining...");
          try {
            this.wait(1000);
          } catch (final InterruptedException e) {
            // interruption cuts this one-second slice short; the loop continues
          }
        }
      }
      Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
    }
    return crawlEntry;
  }
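
pop() returns null when the queue is empty, including the case where every remaining entry gets skipped as blacklisted or orphaned from its crawl profile, and the comments above note that a caller must handle that and try again. A minimal, hypothetical caller loop under that assumption follows; queue, crawlSwitchboard, robots and fetch are placeholder names, not identifiers from the class shown.

  // Hypothetical polling loop around pop(); Request, CrawlSwitchboard and
  // RobotsTxt are the types used above, the variable names are placeholders.
  Request next = null;
  while (next == null && queue.size() > 0) {
    // delay = true lets pop() itself enforce the per-host crawl delay
    next = queue.pop(true, crawlSwitchboard, robots);
  }
  if (next != null) {
    fetch(next); // placeholder for the actual fetch/parse step
  }

Inside pop(), a required crawl delay is split into whole one-second waits plus a remainder so that a progress message can be logged every second; delays shorter than three full seconds are folded into a single wait. For example, sleeptime = 2500 ms gives loops = 2 and rest = 500 ms, which the loops < 3 branch collapses into one 2500 ms wait.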