Java WebCrawlerClient.get_curr_url示例

编程语言: Java

类/类型: WebCrawlerClient

方法/功能: get_curr_url

hotexamples.com的示例: 1

Java WebCrawlerClient.get_curr_url - 已找到 1 个示例。以下是从开源项目中提取的、评分最高的 WebCrawlerClient.get_curr_url 真实 Java 代码示例。您可以为这些示例评分，以帮助我们提高示例质量。

常用方法

显示/隐藏

extract_content_from_url(1)

fetch_head_response_url(1)

fetch_robots_rules(1)

get_curr_url(1)

get_hostname(1)

示例#1

显示文件

文件： PolitenessModule.java 项目： aakritis/XPath-Engine-and-Web-Crawler

  /**
   * Downloads the crawler's current page and stores it in Berkeley DB, reusing the stored copy
   * when it is still up to date.
   *
   * <p>Three cases are handled:
   *
   * <ul>
   *   <li>The URL is not in the database: the content is downloaded, the crawl counter is
   *       incremented, and a new record is added.
   *   <li>The URL is stored and its last crawl is strictly after the {@code last-modified} date
   *       from the response headers: the stored record is returned without downloading.
   *   <li>Otherwise (the page changed since the last crawl, or no usable {@code last-modified}
   *       date is available): the content is downloaded again and the stored record is replaced.
   * </ul>
   *
   * @return the record for the current URL: either the freshly stored one or the existing record
   *     when it is still up to date
   * @throws IOException propagated from the calls made by this method (presumably the content
   *     download; confirm against {@code extract_content_from_url})
   */
  public WebContentEntity extract_add_webpage() throws IOException {
    String absolute_url = crawler_client.get_curr_url();

    // Stays null when the response carries no last-modified header.
    Date date_modified = null;
    if (this.response_headers.containsKey("last-modified")) {
      String last_modified = this.response_headers.get("last-modified");
      date_modified = DBWrapper.get_date_from_header_string(last_modified);
    }

    // check if the webpage already exists
    WebContentEntity webpage = DBWrapper.get_webpage(absolute_url);
    Date current_date = new Date();

    if (webpage == null) {
      // the webpage doesn't exist yet: download it and add a new record
      String document_contents = crawler_client.extract_content_from_url(absolute_url);
      ThreadCrawler.num_files_crawled++;
      System.out.println("[Display] Number of files crawled + " + ThreadCrawler.num_files_crawled);
      webpage = new_webpage_entity(absolute_url, current_date, document_contents);
      DBWrapper.add_webcontent(webpage);
      return webpage;
    }

    // The webpage already exists in the database. Date.after(null) throws
    // NullPointerException, so a missing date on either side is treated as "freshness
    // unknown" and falls through to the re-download path.
    Date last_crawled_date = webpage.get_last_crawled_date();
    boolean stored_copy_is_current =
        date_modified != null
            && last_crawled_date != null
            && last_crawled_date.after(date_modified);

    if (stored_copy_is_current) {
      // no need to download; the stored record is returned as is
      System.out.println("[Display] Not Modified :" + absolute_url);
      return webpage;
    }

    // The document may have changed since it was stored: download first, then swap the record,
    // so a failed download leaves the existing record untouched.
    String document_contents = crawler_client.extract_content_from_url(absolute_url);
    DBWrapper.delete_webpage(absolute_url);
    webpage = new_webpage_entity(absolute_url, current_date, document_contents);
    DBWrapper.add_webcontent(webpage);
    return webpage;
  }

  /**
   * Builds a record from the downloaded body plus the content type and length held by this
   * object. Only the part of the content type before the first {@code ';'} is stored, so new and
   * refreshed records carry the same form of the value.
   */
  private WebContentEntity new_webpage_entity(String url, Date crawled_at, String contents) {
    String media_type = this.content_type.split(";")[0];
    return new WebContentEntity(url, crawled_at, contents, media_type, this.content_length);
  }