@Override public boolean shouldVisit(WebURL url) { // Don't crawl non-HTML pages String href = url.getURL().toLowerCase(); // Turns http://www.ics.uci.edu/SomePage.PHP -> // http://www.ics.uci.edu/somepage.php if (FILTERS.matcher(href).matches()) // filter using file extension return false; // Only crawl within the domain of the seed URL String currentUrlDomain = URLHelper.getDomain(url.getURL()); String seedUrlDomain = URLHelper.getDomain(this.params.getSeedUrl()); if (currentUrlDomain == null || !currentUrlDomain.endsWith(seedUrlDomain)) return false; // Don't crawl the same pages too many times (avoid infinite loops) if (!stats.intendToVisit(url.getURL())) return false; return true; }
public boolean intendToVisit(String url) { // Determine if the page has been visited too many times (might be infinite loop) String urlWithoutQuery = URLHelper.removeQuery(url); if (urlWithoutQuery == null) return true; // Get the current visit-intent count int count = 1; if (this.pagesToCrawl.containsKey(urlWithoutQuery)) count = this.pagesToCrawl.get(urlWithoutQuery); if (count >= 20) return false; // visit single page at most 20 times (with different query strings) // Update with another intent this.pagesToCrawl.put(urlWithoutQuery, count + 1); return true; }