/** * begin crawling with a specific url use depth first search * * @throws IOException * @throws SQLException */ public void crawl(String starturl) throws IOException, SQLException { if (urlid >= MAXURL) // base case return; Document doc; try { doc = Jsoup.connect(starturl).get(); } catch (IOException e) { // if the url is not valid, stop the crawling process return; } catch (IllegalArgumentException e) { System.out.println("Must supply a valid URL : " + starturl); return; } if (!urlList.contains(starturl)) { urlList.add(starturl); } // if the url has already been crawled else if (urlList.contains(starturl)) { return; } Elements hrefs = doc.select("a"); urlid += 1; // terminate the process if there is no more link in a webpage if (hrefs == null || hrefs.size() == 0) return; HashMap<String, Integer> wordMap = parseHTML(getHTMLContent(starturl)); insertDBWord(starturl, wordMap, urlid); insertDBDescription(starturl, topOneHundred(starturl), urlid); for (Element e : hrefs) { String href = e.attr("href"); crawl(href); // depth first search; } }
/** * begin crawling with a specific url use breadth first search * * @param starturl * @throws IOException * @throws SQLException */ public void bfscrawl(String starturl) throws IOException, SQLException { if (starturl == null) { return; } webqueue.add(starturl); while (!webqueue.isEmpty() && urlid < MAXURL) { String url = webqueue.poll(); Document doc; Elements hrefs; try { doc = Jsoup.connect(url).get(); hrefs = doc.select("a"); // if (!urlList.contains(url)) { if (!db.urlInDB(url)) { String description = topOneHundred(url); if (!db.descriptionInDB(description)) { // urlList.add(url); System.out.println(urlid); urlid += 1; HashMap<String, Integer> wordMap = parseHTML(getHTMLContent(url)); insertDBWord(url, wordMap, urlid); insertDBDescription(url, description, urlid); for (Element e : hrefs) { String href = e.attr("href"); webqueue.add(href); } } } } catch (IOException e) { // if the url is not valid, stop the crawling process System.out.println("IOException : " + url); continue; } catch (IllegalArgumentException e) { System.out.println(e + " Must supply a valid URL : " + url); continue; } } }