コード例 #1
0
  /**
   * Decides whether the crawler should fetch the given URL.
   *
   * <p>A URL is rejected when any of the following holds:
   * <ul>
   *   <li>its lower-cased form matches {@code FILTERS} (file-extension filter);</li>
   *   <li>its domain, or the seed URL's domain, is {@code null};</li>
   *   <li>its domain is neither the seed domain nor a sub-domain of it;</li>
   *   <li>{@code stats.intendToVisit} refuses it (guards against crawling the same page
   *       too many times).</li>
   * </ul>
   *
   * @param url candidate URL discovered by the crawler
   * @return {@code true} if the URL should be visited, {@code false} otherwise
   */
  @Override
  public boolean shouldVisit(WebURL url) {
    // Don't crawl non-HTML pages. Lower-case with a fixed locale so the extension filter
    // behaves the same on every machine (e.g. a Turkish default locale maps 'I' to a
    // dotless 'ı', which would let ".GIF" slip through).
    String href = url.getURL().toLowerCase(java.util.Locale.ROOT);
    if (FILTERS.matcher(href).matches()) {
      return false;
    }

    // Only crawl within the domain of the seed URL.
    String currentUrlDomain = URLHelper.getDomain(url.getURL());
    String seedUrlDomain = URLHelper.getDomain(this.params.getSeedUrl());
    if (currentUrlDomain == null || seedUrlDomain == null) {
      // Without both domains there is nothing to compare; refuse instead of throwing.
      return false;
    }

    // Require an exact match or a '.'-delimited sub-domain. A bare endsWith() would also
    // accept unrelated hosts such as "notexample.com" for the seed domain "example.com".
    boolean sameDomain = currentUrlDomain.equals(seedUrlDomain);
    boolean subDomain = currentUrlDomain.endsWith("." + seedUrlDomain);
    if (!sameDomain && !subDomain) {
      return false;
    }

    // Don't crawl the same pages too many times (avoid infinite loops).
    return stats.intendToVisit(url.getURL());
  }
コード例 #2
0
  /**
   * Records an intent to visit {@code url} and reports whether that visit is allowed.
   *
   * <p>Intents are counted per result of {@code URLHelper.removeQuery(url)}, so URLs that
   * map to the same value share one counter. Each key is granted at most 20 intents;
   * once the limit is reached the call returns {@code false} and the counter is left
   * unchanged. This guards against crawling one page endlessly with different query
   * strings.
   *
   * <p>If {@code URLHelper.removeQuery} returns {@code null}, the URL is always allowed
   * and nothing is counted.
   *
   * @param url the URL the crawler intends to visit
   * @return {@code true} if the visit is allowed (and has been counted), {@code false} if
   *     the page has already reached its limit
   */
  public boolean intendToVisit(String url) {
    String urlWithoutQuery = URLHelper.removeQuery(url);
    if (urlWithoutQuery == null) {
      return true;
    }

    // Visit a single page at most this many times (with different query strings).
    final int maxIntentsPerPage = 20;

    // Number of intents already recorded; a page seen for the first time has none.
    // (Starting at 1 here would grant only 19 visits instead of the documented 20.)
    Integer recorded = this.pagesToCrawl.get(urlWithoutQuery);
    int count = (recorded == null) ? 0 : recorded;

    if (count >= maxIntentsPerPage) {
      return false;
    }

    // Update with another intent.
    this.pagesToCrawl.put(urlWithoutQuery, count + 1);
    return true;
  }