@Override
public WebURL entryToObject(TupleInput input) {
  // Rebuild a WebURL from the tuple. Read order must mirror the write
  // order used by objectToEntry exactly, field for field.
  final WebURL result = new WebURL();
  result.setPrimeKey(input.readString());
  result.setURL(input.readString());
  result.setDocid(input.readInt());
  result.setParentDocid(input.readInt());
  result.setParentUrl(input.readString());
  result.setDepth(input.readShort());
  result.setCookie(input.readString());
  result.setFormParams(input.readString());
  result.setMethod(input.readString());
  return result;
}
@Override
public void objectToEntry(WebURL url, TupleOutput output) {
  // Serialize every WebURL field into the tuple. The write order here is
  // the wire format: entryToObject must read fields back in this order.
  output.writeString(url.getPrimeKey());
  output.writeString(url.getURL());
  output.writeInt(url.getDocid());
  output.writeInt(url.getParentDocid());
  output.writeString(url.getParentUrl());
  output.writeShort(url.getDepth());
  output.writeString(url.getCookie());
  output.writeString(url.getFormParams());
  output.writeString(url.getMethod());
}
public void visit(Page page) { String url = page.getWebURL().getURL(); // standard out contains a single line per URL, with the URL // followed by all the words found on the page // String text = page.getText().replaceAll("[^a-zA-Z]+", " "); System.out.println(url + "\t" + text); // standard err contains a line for each outgoing link from the // page we're crawling // for (WebURL link : page.getURLs()) { System.err.println(url + "\t" + link.getURL()); } }
@Override public boolean shouldVisit(WebURL url) { // Don't crawl non-HTML pages String href = url.getURL().toLowerCase(); // Turns http://www.ics.uci.edu/SomePage.PHP -> // http://www.ics.uci.edu/somepage.php if (FILTERS.matcher(href).matches()) // filter using file extension return false; // Only crawl within the domain of the seed URL String currentUrlDomain = URLHelper.getDomain(url.getURL()); String seedUrlDomain = URLHelper.getDomain(this.params.getSeedUrl()); if (currentUrlDomain == null || !currentUrlDomain.endsWith(seedUrlDomain)) return false; // Don't crawl the same pages too many times (avoid infinite loops) if (!stats.intendToVisit(url.getURL())) return false; return true; }
@Override
public boolean shouldVisit(Page page, WebURL url) {
  // Decode percent-escapes before matching so patterns see the real path.
  try {
    String href = url.getURL().toLowerCase();
    // "UTF-8" is the canonical charset name; the original "UTF8" is only a
    // legacy alias and is not guaranteed by the URLDecoder contract.
    String decoded = URLDecoder.decode(href, "UTF-8");
    return filter1.matcher(decoded).matches() || filter2.matcher(decoded).matches();
  } catch (UnsupportedEncodingException ex) {
    // Cannot happen for UTF-8 (always supported), but the API declares it;
    // treat a failure as "do not visit".
    ex.printStackTrace();
    return false;
  }
}
/**
 * Inserts one crawled page into the PAGES table.
 *
 * @param URL  page URL; also used to derive the SUBDOMAIN column via WebURL
 * @param Text extracted page text, stored in BODY
 * @param HTML raw page HTML, stored in HTML
 * @throws SQLException if the insert fails
 */
public void InsertItem(String URL, String Text, String HTML) throws SQLException {
  WebURL curURL = new WebURL();
  curURL.setURL(URL);
  // try-with-resources guarantees the statement is closed even when the
  // insert throws. (The original also created an unrelated Statement that
  // was never used and never closed — a resource leak, removed here.)
  try (PreparedStatement statement =
      c.prepareStatement("INSERT INTO PAGES (URL,SUBDOMAIN,BODY,HTML) values (?, ?, ?, ?)")) {
    statement.setString(1, URL);
    statement.setString(2, curURL.getSubDomain());
    statement.setString(3, Text);
    statement.setString(4, HTML);
    statement.executeUpdate();
  }
}
@Override public boolean shouldVisit(Page referringPage, WebURL url) { String href = url.getURL().toLowerCase().toString(); // System.out.println("start tag = // "+href.startsWith("http://estekhdame.ir/blog/tag/")+url); if (FILTERS.matcher(href).matches()) return false; if (href.contains("city") || href.contains("study")) { // System.out.println("hi"+href); return false; } return href.startsWith("http://eshetab.com/state/") || href.contains("/ads/"); }
@Test
public void test() {
  final String url1 = "http://nba.hupu.com/";
  final String url2 = "http://www.google.com/";

  WebURL first = new WebURL();
  first.setURL(url1);
  first.setDepth((short) 3);

  WebURL second = new WebURL();
  second.setURL(url2);
  second.setDepth((short) 2);

  // Scheduling the same URL twice must not grow the queue.
  scheduler.schedule(first);
  assertEquals(1, scheduler.getQueueLength());
  scheduler.schedule(first);
  assertEquals(1, scheduler.getQueueLength());

  // A distinct URL does grow it.
  scheduler.schedule(second);
  assertEquals(2, scheduler.getQueueLength());

  // Fetching one URL should yield the shallower one (depth 2) first and
  // shrink the queue, while the scheduled count stays at 2.
  List<WebURL> fetched = new ArrayList<WebURL>();
  scheduler.getNextURLs(1, fetched);
  assertEquals(url2, fetched.get(0).getURL());
  assertEquals(1, scheduler.getQueueLength());
  assertEquals(2, scheduler.getScheduledNum());
}
/**
 * Fetches and parses a single URL, returning the parsed {@link Page} or
 * {@code null} on any failure (non-200 status, fetch error, parse error).
 */
private Page download(String url) {
  WebURL curURL = new WebURL();
  curURL.setURL(url);
  PageFetchResult fetchResult = null;
  try {
    fetchResult = pageFetcher.fetchHeader(curURL);
    if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
      try {
        Page page = new Page(curURL);
        fetchResult.fetchContent(page);
        if (parser.parse(page, curURL.getURL())) {
          return page;
        }
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  } finally {
    // Bug fix: if fetchHeader() itself threw, fetchResult is still null and
    // the original code raised a NullPointerException here, masking the
    // real error. Guard before releasing the connection.
    if (fetchResult != null) {
      fetchResult.discardContentIfNotConsumed();
    }
  }
  return null;
}
@Override
public boolean shouldVisit(WebURL url) {
  final String href = url.getURL().toLowerCase();

  // Never follow mailto-style links.
  if (PATTERN_MAIL_FILTERS.matcher(href).matches()) {
    return false;
  }

  // Accept only URLs rooted in one of the configured crawl domains.
  for (String domainPrefix : crawlDomains) {
    if (href.startsWith(domainPrefix)) {
      return true;
    }
  }
  return false;
}
/**
 * Writes the extracted page text to E:/crawlData/pageText/&lt;docid&gt;.txt,
 * creating parent directories as needed.
 *
 * @param pageText text content to persist
 * @param webUrl   source URL; its docid names the output file
 * @throws IOException if the file cannot be created or written
 */
public void downLoadText(String pageText, WebURL webUrl) throws IOException {
  int docid = webUrl.getDocid();
  File outputFile = new File("E:/crawlData/pageText/" + docid + ".txt");
  // Ensure the target directory exists before opening the writer.
  File parent = outputFile.getParentFile();
  if (parent != null && !parent.exists()) {
    parent.mkdirs();
  }
  // Bug fix: the original leaked the FileWriter when write() threw;
  // try-with-resources closes it on every path.
  try (FileWriter writer = new FileWriter(outputFile)) {
    writer.write(pageText);
  }
}
@Override
public boolean shouldVisit(WebURL url) {
  final String href = url.getURL().toLowerCase();

  // Hard reject anything matching the general exclusion filters.
  if (filters.matcher(href).matches()) {
    return false;
  }
  // Images are always wanted, regardless of domain.
  if (imgPatterns.matcher(href).matches()) {
    return true;
  }
  // Otherwise only follow links inside a configured crawl domain.
  for (String allowedPrefix : crawlDomains) {
    if (href.startsWith(allowedPrefix)) {
      return true;
    }
  }
  return false;
}
/* * @see * com.qq.crawler.webcrawler.RailgunCrawler#shouldVisit(edu.uci.ics.crawler4j * .url.WebURL) */ @Override public boolean shouldVisit(WebURL url) { String href = url.getURL().toLowerCase(); if (urlPattern.matcher(href).matches()) { return false; } if (null != shouldUnVisitPatterns) { for (Pattern pattern : shouldUnVisitPatterns) { if (pattern.matcher(href).matches()) { return false; } } } boolean shouldVisit = false; if (null != shouldVisitPatterns) { for (Pattern pattern : shouldVisitPatterns) { if (pattern.matcher(href).matches()) { shouldVisit = true; break; } } } else { shouldVisit = true; } if (!shouldVisit) { return false; } if (!CrawlerUtils.FILTERS.matcher(href).matches()) { for (String host : hosts) { if (href.startsWith(host)) { // System.out.println(href + ": fetch"); return true; } } } return false; }
/**
 * This method receives two parameters. The first parameter is the page in which we have
 * discovered this new url and the second parameter is the new url. You should implement this
 * function to specify whether the given url should be crawled or not (based on your crawling
 * logic). In this example, we are instructing the crawler to ignore urls that have css, js, git,
 * ... extensions and to only accept urls that start with "http://www.ics.uci.edu/". In this case,
 * we didn't need the referringPage parameter to make the decision.
 */
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
  String dest = url.getURL().toLowerCase();
  // Bug fix: the FILTERS check was missing its negation, so the method
  // accepted ONLY the filtered extensions (css, js, ...) instead of
  // ignoring them — the inverse of the documented behavior and of the
  // sibling shouldVisit implementations in this project.
  return !FILTERS.matcher(dest).matches() && dest.startsWith(getBaseURL());
}
/**
 * This method receives two parameters. The first parameter is the page in which we have
 * discovered this new url and the second parameter is the new url. You should implement this
 * function to specify whether the given url should be crawled or not (based on your crawling
 * logic). In this example, we are instructing the crawler to ignore urls that have css, js, git,
 * ... extensions and to only accept urls that start with "http://www.sanook.com/". In this case,
 * we didn't need the referringPage parameter to make the decision.
 */
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
  final String href = url.getURL().toLowerCase();
  // Reject filtered extensions outright.
  if (FILTERS.matcher(href).matches()) {
    return false;
  }
  // Only stay inside the sanook.com site.
  return href.startsWith("http://www.sanook.com/");
}
@Override
public boolean shouldVisit(WebURL url) {
  final String href = url.getURL().toLowerCase();
  // Skip filtered extensions; otherwise stay within lankadeepa.lk.
  if (filters.matcher(href).matches()) {
    return false;
  }
  return href.startsWith("http://www.lankadeepa.lk/");
}
public boolean shouldVisit(WebURL url) {
  final String href = url.getURL().toLowerCase();
  // Skip filtered extensions; otherwise accept any URL mentioning apache.org.
  if (filters.matcher(href).matches()) {
    return false;
  }
  return href.contains("apache.org");
}
/**
 * You should implement this function to specify whether the given url should be crawled or not
 * (based on your crawling logic).
 */
@Override
public boolean shouldVisit(WebURL url) {
  final String href = url.getURL().toLowerCase();
  // Skip filtered extensions; otherwise stay within www.ics.uci.edu.
  if (FILTERS.matcher(href).matches()) {
    return false;
  }
  return href.startsWith("http://www.ics.uci.edu/");
}
@Override
public boolean shouldVisit(WebURL url) {
  final String href = url.getURL().toLowerCase();
  // Skip anything the mission's filters exclude; otherwise stay within the
  // mission's configured crawl domain.
  if (Mission.crawlFilters.matcher(href).matches()) {
    return false;
  }
  return href.startsWith(Mission.crawlDomain);
}