/** * to download web page contents and store in the BerkleyDB based on checks * * @return * @throws IOException */ public WebContentEntity extract_add_webpage() throws IOException { /** check for absolute urls */ Date date_modified = null; String absolute_url = crawler_client.get_curr_url(); if (this.response_headers.containsKey("last-modified")) { String last_modified = this.response_headers.get("last-modified"); date_modified = DBWrapper.get_date_from_header_string(last_modified); } // check if the webpage already exists WebContentEntity webpage = DBWrapper.get_webpage(absolute_url); String document_contents = null; Date current_date = new Date(); if (webpage == null) { // the webpage doesn't exist, thus, add webpage // System.out.println("[Display] Downloading :" + absolute_url); document_contents = crawler_client.extract_content_from_url(absolute_url); // System.out.println("[Display] Downloading :" + absolute_url); ThreadCrawler.num_files_crawled++; System.out.println("[Display] Number of files crawled + " + ThreadCrawler.num_files_crawled); /** * initialize the webcontent attributes in constructor with data from head request and * extracted body content */ webpage = new WebContentEntity( absolute_url, current_date, document_contents, this.content_type.split(";")[0], this.content_length); // add webpage to BerkleyDB DBWrapper.add_webcontent(webpage); return webpage; } // else if the webpage already exists in the database Date last_crawled_date = webpage.get_last_crawled_date(); if (last_crawled_date.after(date_modified)) { // no need to download the file and extract file from the database document_contents = webpage.get_url_content(); System.out.println("[Display] Not Modified :" + absolute_url); } else { // when the document is modified at later date after storing document_contents = crawler_client.extract_content_from_url(absolute_url); // delete existing tuple for webpage DBWrapper.delete_webpage(absolute_url); // create new tuple for the webpage /** * initialize the webcontent attributes in constructor with data from head request and * extracted body content */ webpage = new WebContentEntity( absolute_url, current_date, document_contents, this.content_type, this.content_length); // add webpage to BerkleyDB DBWrapper.add_webcontent(webpage); } return webpage; }