Example #1
 /**
  * Begin crawling from a specific URL using depth-first search.
  *
  * @param starturl the URL to start crawling from
  * @throws IOException
  * @throws SQLException
  */
 public void crawl(String starturl) throws IOException, SQLException {
   if (urlid >= MAXURL) { // base case: stop once the crawl limit has been reached
     return;
   }
   Document doc;
   try {
     doc = Jsoup.connect(starturl).get();
   } catch (IOException e) {
      // if the page cannot be fetched, abandon this branch of the crawl
     return;
   } catch (IllegalArgumentException e) {
     System.out.println("Must supply a valid URL : " + starturl);
     return;
   }
   // skip URLs that have already been crawled
   if (urlList.contains(starturl)) {
     return;
   }
   urlList.add(starturl);
   Elements hrefs = doc.select("a");
   urlid += 1;
   // stop descending if the page contains no links
   if (hrefs.isEmpty()) return;
   HashMap<String, Integer> wordMap = parseHTML(getHTMLContent(starturl));
   insertDBWord(starturl, wordMap, urlid);
   insertDBDescription(starturl, topOneHundred(starturl), urlid);
   for (Element e : hrefs) {
     String href = e.attr("abs:href"); // resolve relative links to absolute URLs
     crawl(href); // recurse: depth-first search
   }
 }
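
Both methods refer to fields and helpers of the enclosing crawler class that are not shown here (urlid, MAXURL, urlList, webqueue, db, parseHTML, getHTMLContent, insertDBWord, insertDBDescription, topOneHundred). The skeleton below is only a sketch of how those members might be declared so the methods compile; the concrete types, initial values, and the placeholder DB wrapper are assumptions, not part of the original code.

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

public class Crawler {
  // assumed field declarations -- types and initial values are guesses based on how they are used
  private static final int MAXURL = 1000;                    // crawl limit checked by both methods
  private int urlid = 0;                                     // running id assigned to each indexed page
  private final List<String> urlList = new ArrayList<>();    // visited URLs (used by the DFS version)
  private final Queue<String> webqueue = new LinkedList<>(); // frontier queue (used by the BFS version)
  // db is a hypothetical wrapper exposing urlInDB(String) and descriptionInDB(String);
  // parseHTML, getHTMLContent, insertDBWord, insertDBDescription and topOneHundred are
  // the indexing helpers referenced above and are omitted here.

  // crawl(String) and bfscrawl(String) from the two examples would be members of this class.
}
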
Example #2
 /**
  * Begin crawling from a specific URL using breadth-first search.
  *
  * @param starturl the URL to start crawling from
  * @throws IOException
  * @throws SQLException
  */
 public void bfscrawl(String starturl) throws IOException, SQLException {
   if (starturl == null) {
     return;
   }
   webqueue.add(starturl);
   while (!webqueue.isEmpty() && urlid < MAXURL) {
     String url = webqueue.poll();
     Document doc;
     Elements hrefs;
     try {
       doc = Jsoup.connect(url).get();
       hrefs = doc.select("a");
       // skip URLs that are already recorded in the database
       if (!db.urlInDB(url)) {
         String description = topOneHundred(url);
         if (!db.descriptionInDB(description)) {
           System.out.println(urlid); // progress output: id of the page being indexed
           urlid += 1;
           HashMap<String, Integer> wordMap = parseHTML(getHTMLContent(url));
           insertDBWord(url, wordMap, urlid);
           insertDBDescription(url, description, urlid);
           for (Element e : hrefs) {
             String href = e.attr("abs:href"); // resolve relative links to absolute URLs
             webqueue.add(href);
           }
         }
       }
     } catch (IOException e) {
       // if the page cannot be fetched, skip this URL and continue with the queue
       System.out.println("IOException : " + url);
       continue;
     } catch (IllegalArgumentException e) {
       System.out.println(e + " Must supply a valid URL : " + url);
       continue;
     }
   }
 }
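
The two examples differ mainly in traversal order: crawl recurses immediately on every link (depth-first, bounded only by MAXURL and the urlList check), while bfscrawl drains an explicit queue so pages are visited level by level, and duplicates are filtered against the database before a page is indexed and its links are enqueued. As a minimal, self-contained illustration of the same breadth-first pattern without the database and indexing helpers, the sketch below uses only Jsoup plus an in-memory visited set; the class name, start URL, and page limit are made up for the example.

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SimpleBfsCrawler {
  public static void main(String[] args) {
    String start = "https://example.com/"; // hypothetical start page
    int maxPages = 50;                     // stand-in for MAXURL

    Queue<String> frontier = new ArrayDeque<>();
    Set<String> visited = new HashSet<>();
    frontier.add(start);

    while (!frontier.isEmpty() && visited.size() < maxPages) {
      String url = frontier.poll();
      if (!visited.add(url)) {
        continue; // already crawled this URL
      }
      try {
        Document doc = Jsoup.connect(url).get();
        System.out.println(visited.size() + " " + url);
        for (Element link : doc.select("a")) {
          String href = link.attr("abs:href"); // absolute URL of the link target
          if (!href.isEmpty() && !visited.contains(href)) {
            frontier.add(href); // enqueue so it is visited after the current level
          }
        }
      } catch (IOException | IllegalArgumentException e) {
        System.out.println("Skipping " + url + " : " + e);
      }
    }
  }
}
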