Example #1
 @Override
 public WebURL entryToObject(TupleInput input) {
   WebURL webURL = new WebURL();
   webURL.setPrimeKey(input.readString());
   webURL.setURL(input.readString());
   webURL.setDocid(input.readInt());
   webURL.setParentDocid(input.readInt());
   webURL.setParentUrl(input.readString());
   webURL.setDepth(input.readShort());
   webURL.setCookie(input.readString());
   webURL.setFormParams(input.readString());
   webURL.setMethod(input.readString());
   return webURL;
 }
Example #2
 @Override
 public void objectToEntry(WebURL url, TupleOutput output) {
   output.writeString(url.getPrimeKey());
   output.writeString(url.getURL());
   output.writeInt(url.getDocid());
   output.writeInt(url.getParentDocid());
   output.writeString(url.getParentUrl());
   output.writeShort(url.getDepth());
   output.writeString(url.getCookie());
   output.writeString(url.getFormParams());
   output.writeString(url.getMethod());
 }
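Examples #1 and #2 are the two halves of a single Berkeley DB tuple binding (crawler4j's WebURLTupleBinding): the read order in entryToObject must mirror the write order in objectToEntry field for field, or values land in the wrong setters. A minimal usage sketch, assuming these methods live in a TupleBinding<WebURL> subclass; the variable names are illustrative:

 // Round-trip a WebURL through a DatabaseEntry
 // (com.sleepycat.bind.tuple.TupleBinding, com.sleepycat.je.DatabaseEntry)
 TupleBinding<WebURL> binding = new WebURLTupleBinding();
 WebURL in = new WebURL();
 in.setURL("http://www.ics.uci.edu/");
 DatabaseEntry entry = new DatabaseEntry();
 binding.objectToEntry(in, entry);          // drives objectToEntry(WebURL, TupleOutput)
 WebURL out = binding.entryToObject(entry); // drives entryToObject(TupleInput)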
Example #3
    public void visit(Page page) {
      String url = page.getWebURL().getURL();

      // standard out contains a single line per URL, with the URL
      // followed by all the words found on the page
      //
      String text = page.getText().replaceAll("[^a-zA-Z]+", " ");
      System.out.println(url + "\t" + text);

      // standard err contains a line for each outgoing link from the
      // page we're crawling
      //
      for (WebURL link : page.getURLs()) {
        System.err.println(url + "\t" + link.getURL());
      }
    }
Example #4
  @Override
  public boolean shouldVisit(WebURL url) {
    // Don't crawl non-HTML pages
    // Turns http://www.ics.uci.edu/SomePage.PHP -> http://www.ics.uci.edu/somepage.php
    String href = url.getURL().toLowerCase();
    if (FILTERS.matcher(href).matches()) { // filter by file extension
      return false;
    }

    // Only crawl within the domain of the seed URL
    String currentUrlDomain = URLHelper.getDomain(url.getURL());
    String seedUrlDomain = URLHelper.getDomain(this.params.getSeedUrl());
    if (currentUrlDomain == null || !currentUrlDomain.endsWith(seedUrlDomain)) return false;

    // Don't crawl the same pages too many times (avoid infinite loops)
    if (!stats.intendToVisit(url.getURL())) return false;

    return true;
  }
Example #5
 @Override
 public boolean shouldVisit(Page page, WebURL url) {
   try {
     String href = url.getURL().toLowerCase();
     String decodedString = URLDecoder.decode(href, "UTF-8");
     return filter1.matcher(decodedString).matches() || filter2.matcher(decodedString).matches();
   } catch (UnsupportedEncodingException ex) {
     ex.printStackTrace();
     return false;
   }
 }
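On Java 10 and newer, URLDecoder.decode also accepts a java.nio.charset.Charset directly, which eliminates the checked UnsupportedEncodingException and the catch block. A sketch of the same check under that assumption, with filter1 and filter2 as in the example above:

 @Override
 public boolean shouldVisit(Page page, WebURL url) {
   // The Charset overload (Java 10+) cannot throw UnsupportedEncodingException
   String decoded = URLDecoder.decode(url.getURL().toLowerCase(), StandardCharsets.UTF_8);
   return filter1.matcher(decoded).matches() || filter2.matcher(decoded).matches();
 }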
Example #6
  public void InsertItem(String URL, String Text, String HTML) throws SQLException {
    WebURL curURL = new WebURL();
    curURL.setURL(URL);

    // try-with-resources ensures the statement is closed even on failure
    try (PreparedStatement statement =
        c.prepareStatement("INSERT INTO PAGES (URL,SUBDOMAIN,BODY,HTML) values (?, ?, ?, ?)")) {
      statement.setString(1, URL);
      statement.setString(2, curURL.getSubDomain());
      statement.setString(3, Text);
      statement.setString(4, HTML);
      statement.executeUpdate();
    }
  }
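The snippet assumes a Connection field c and an existing PAGES table. A hypothetical schema matching the four columns of the INSERT; the column types are guesses, not taken from the original project:

  try (Statement ddl = c.createStatement()) {
    ddl.executeUpdate("CREATE TABLE IF NOT EXISTS PAGES ("
        + "URL VARCHAR(2048), SUBDOMAIN VARCHAR(255), BODY CLOB, HTML CLOB)");
  }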
Example #7
 @Override
 public boolean shouldVisit(Page referringPage, WebURL url) {
    String href = url.getURL().toLowerCase();
    if (FILTERS.matcher(href).matches()) {
      return false;
    }
    if (href.contains("city") || href.contains("study")) {
      return false;
    }
   return href.startsWith("http://eshetab.com/state/") || href.contains("/ads/");
 }
Example #8
  @Test
  public void test() {
    String url1 = "http://nba.hupu.com/";
    String url2 = "http://www.google.com/";

    WebURL a = new WebURL();
    a.setURL(url1);
    a.setDepth((short) 3);

    WebURL b = new WebURL();
    b.setURL(url2);
    b.setDepth((short) 2);

    scheduler.schedule(a);
    assertEquals(1, scheduler.getQueueLength());

    scheduler.schedule(a);
    assertEquals(1, scheduler.getQueueLength());

    scheduler.schedule(b);
    assertEquals(2, scheduler.getQueueLength());

    List<WebURL> l = new ArrayList<>();
    scheduler.getNextURLs(1, l);
    assertEquals(url2, l.get(0).getURL());

    assertEquals(1, scheduler.getQueueLength());
    assertEquals(2, scheduler.getScheduledNum());
  }
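The assertions pin down the scheduler's contract: scheduling the same URL twice is a no-op, getScheduledNum() counts unique URLs ever scheduled, and getNextURLs returns b (depth 2) before a (depth 3) even though a was scheduled first. Shallowest-depth-first ordering is one reading consistent with that (a LIFO stack would also pass); a minimal sketch under that assumption, with an invented class name:

  class DepthPriorityScheduler {
    // Shallower URLs are polled first (java.util.PriorityQueue)
    private final PriorityQueue<WebURL> queue =
        new PriorityQueue<>(Comparator.comparingInt(WebURL::getDepth));
    private final Set<String> seen = new HashSet<>();
    private int scheduledNum = 0;

    // Duplicate URLs (by string) are ignored; only first-time URLs count
    public synchronized void schedule(WebURL url) {
      if (seen.add(url.getURL())) {
        queue.add(url);
        scheduledNum++;
      }
    }

    // Moves up to 'max' URLs, shallowest first, into 'result'
    public synchronized void getNextURLs(int max, List<WebURL> result) {
      for (int i = 0; i < max && !queue.isEmpty(); i++) {
        result.add(queue.poll());
      }
    }

    public synchronized int getQueueLength() { return queue.size(); }

    public synchronized int getScheduledNum() { return scheduledNum; }
  }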
Example #9
 private Page download(String url) {
   WebURL curURL = new WebURL();
   curURL.setURL(url);
   PageFetchResult fetchResult = null;
   try {
     fetchResult = pageFetcher.fetchHeader(curURL);
     if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
       try {
         Page page = new Page(curURL);
         fetchResult.fetchContent(page);
         if (parser.parse(page, curURL.getURL())) {
           return page;
         }
       } catch (Exception e) {
         e.printStackTrace();
       }
     }
    } finally {
      // fetchResult stays null if fetchHeader() threw before assigning it
      if (fetchResult != null) {
        fetchResult.discardContentIfNotConsumed();
      }
    }
   return null;
 }
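The pageFetcher and parser fields are crawler4j collaborators normally built from a shared CrawlConfig; the boolean-returning parse call suggests a 3.x-era crawler4j API. A hypothetical wiring sketch:

  CrawlConfig config = new CrawlConfig();
  PageFetcher pageFetcher = new PageFetcher(config);
  Parser parser = new Parser(config);

  Page page = download("http://www.ics.uci.edu/");
  if (page != null) {
    System.out.println("Fetched " + page.getWebURL().getURL());
  }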
Example #10
 @Override
 public boolean shouldVisit(WebURL url) {
   String href = url.getURL().toLowerCase();
   if (PATTERN_MAIL_FILTERS.matcher(href).matches()) {
     return false;
   }
   for (String crawlDomain : crawlDomains) {
     if (href.startsWith(crawlDomain)) {
       return true;
     }
   }
   return false;
 }
Example #11
 public void downLoadText(String pageText, WebURL webUrl) throws IOException {
   int docid = webUrl.getDocid();
   File file_output = new File("E:/crawlData/pageText/" + docid + ".txt");
   // Make sure the target directory exists before writing
   File parent = file_output.getParentFile();
   if (parent != null && !parent.exists()) {
     parent.mkdirs();
   }
   // try-with-resources closes the writer even if write() throws
   try (FileWriter fWriter = new FileWriter(file_output)) {
     fWriter.write(pageText);
   }
 }
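On Java 7 and newer the same write can be expressed with java.nio.file, which handles directory creation and stream closing in one call each; a minimal alternative sketch keeping the same hard-coded output root:

 public void downLoadText(String pageText, WebURL webUrl) throws IOException {
   Path out = Paths.get("E:/crawlData/pageText", webUrl.getDocid() + ".txt");
   Files.createDirectories(out.getParent()); // no-op if the directory already exists
   Files.write(out, pageText.getBytes(StandardCharsets.UTF_8));
 }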
Example #12
  @Override
  public boolean shouldVisit(WebURL url) {
    String href = url.getURL().toLowerCase();
    if (filters.matcher(href).matches()) {
      return false;
    }

    if (imgPatterns.matcher(href).matches()) {
      return true;
    }

    for (String domain : crawlDomains) {
      if (href.startsWith(domain)) {
        return true;
      }
    }
    return false;
  }
Example #13
  /*
   * @see com.qq.crawler.webcrawler.RailgunCrawler#shouldVisit(edu.uci.ics.crawler4j.url.WebURL)
   */
  @Override
  public boolean shouldVisit(WebURL url) {
    String href = url.getURL().toLowerCase();
    if (urlPattern.matcher(href).matches()) {
      return false;
    }
    if (null != shouldUnVisitPatterns) {
      for (Pattern pattern : shouldUnVisitPatterns) {
        if (pattern.matcher(href).matches()) {
          return false;
        }
      }
    }

    boolean shouldVisit = false;

    if (null != shouldVisitPatterns) {
      for (Pattern pattern : shouldVisitPatterns) {
        if (pattern.matcher(href).matches()) {
          shouldVisit = true;
          break;
        }
      }
    } else {
      shouldVisit = true;
    }

    if (!shouldVisit) {
      return false;
    }
    if (!CrawlerUtils.FILTERS.matcher(href).matches()) {
      for (String host : hosts) {
        if (href.startsWith(host)) {
          // System.out.println(href + ":  fetch");
          return true;
        }
      }
    }
    return false;
  }
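Note the precedence: the deny list (shouldUnVisitPatterns) is checked first and always wins, and a null allow list means everything not denied is visited. A hypothetical configuration of the two fields; the pattern values are invented for illustration:

  List<Pattern> shouldVisitPatterns = Arrays.asList(
      Pattern.compile("http://news\\.qq\\.com/.*"));
  List<Pattern> shouldUnVisitPatterns = Arrays.asList(
      Pattern.compile(".*/comment/.*"));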
Example #14
 /**
  * This method receives two parameters. The first parameter is the page in which we have
  * discovered this new url and the second parameter is the new url. You should implement this
  * function to specify whether the given url should be crawled or not (based on your crawling
  * logic). In this example, we are instructing the crawler to ignore urls that have css, js, git,
  * ... extensions and to only accept urls that start with the configured base URL. In this case,
  * we didn't need the referringPage parameter to make the decision.
  */
 @Override
 public boolean shouldVisit(Page referringPage, WebURL url) {
   String dest = url.getURL().toLowerCase();
    // Skip URLs matching the extension filter; stay under the base URL
    return !FILTERS.matcher(dest).matches() && dest.startsWith(getBaseURL());
 }
Example #15
 /**
  * This method receives two parameters. The first parameter is the page in which we have
  * discovered this new url and the second parameter is the new url. You should implement this
  * function to specify whether the given url should be crawled or not (based on your crawling
  * logic). In this example, we are instructing the crawler to ignore urls that have css, js, git,
  * ... extensions and to only accept urls that start with "http://www.sanook.com/". In this case,
  * we didn't need the referringPage parameter to make the decision.
  */
 @Override
 public boolean shouldVisit(Page referringPage, WebURL url) {
   String href = url.getURL().toLowerCase();
   return !FILTERS.matcher(href).matches() && href.startsWith("http://www.sanook.com/");
 }
Example #16
 @Override
 public boolean shouldVisit(WebURL url) {
   String href = url.getURL().toLowerCase();
   return !filters.matcher(href).matches() && href.startsWith("http://www.lankadeepa.lk/");
 }
Example #17
 public boolean shouldVisit(WebURL url) {
   String href = url.getURL().toLowerCase();
   return !filters.matcher(href).matches() && href.contains("apache.org");
 }
Example #18
 /**
  * You should implement this function to specify whether the given url should be crawled or not
  * (based on your crawling logic).
  */
 @Override
 public boolean shouldVisit(WebURL url) {
   String href = url.getURL().toLowerCase();
   return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
 }
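None of these snippets show the FILTERS constant itself. In the stock crawler4j example this code follows, it is a regular expression over file extensions, roughly as below (the exact extension list varies by version):

  private static final Pattern FILTERS =
      Pattern.compile(".*(\\.(css|js|gif|jpg|png|mp3|mp4|zip|gz))$");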
Example #19
 @Override
 public boolean shouldVisit(WebURL url) {
   String href = url.getURL().toLowerCase();
   return !Mission.crawlFilters.matcher(href).matches() && href.startsWith(Mission.crawlDomain);
 }