/**
 * Supplies the target page URLs for this crawler.
 *
 * <p>Described by the original author as an override that adds the target URL;
 * the superclass is not visible from here, so that contract should be confirmed there.
 *
 * @return a newly created, mutable set holding the single hard-coded infoq.com URL
 */
public Set<URL> getTargetPageURLs() {
    final Set<URL> targets = new HashSet<URL>();
    targets.add(URL.valueOf(
            "http://www.infoq.com/news/2012/12/twemproxy;jsessionid=1652D82C3359CBAB67DA00B26BE7784B"));
    return targets;
}
/**
 * Supplies the URLs that should be filtered out, overriding the inherited
 * behavior in order to drop one unnecessary URL.
 *
 * @return a newly created, mutable collection holding the single hard-coded infoq.com URL
 */
@Override
public Collection<URL> getUrlsToFilter() {
    final Collection<URL> excluded = new HashSet<URL>();
    // NOTE(review): the literal below ends with a trailing space. It is kept as-is
    // here; confirm whether that is intentional or a copy/paste artifact.
    excluded.add(URL.valueOf(
            "http://www.infoq.com/news/2012/11/Panel-WinRT-Answers;jsessionid=91AB81A159E85692E6F1199644E2053C "));
    return excluded;
}
/**
 * Builds the paging (list page) URLs for this crawler.
 *
 * <p>The values 10, 20, ..., 100 are appended in turn to the base
 * {@code newsidx} query URL. Each resulting string is printed to standard
 * output and then converted with {@code URL.valueOf} and added to the result.
 *
 * @return a newly created, mutable set holding the URLs built from those ten strings
 * @see AbstractWebCrawler#getListPageURLs()
 */
public Set<URL> getListPageURLs() {
    final String base = "http://www.infoq.com/infoq.action?newsidx=";
    final int step = 10;   // increment of the newsidx value between consecutive pages
    final int pages = 10;  // number of paging URLs to generate

    final Set<URL> listPages = new HashSet<URL>();
    for (int offset = step; offset <= step * pages; offset += step) {
        final String pageUrl = base + offset;
        System.out.println(pageUrl);
        listPages.add(URL.valueOf(pageUrl));
    }
    return listPages;
}