Code Example #1
  /** This function is called when a page is fetched and ready to be processed by your program. */
  @Override
  public void visit(Page page) {
    // Only process ad/listing pages; extract url, title, date, and body.
    if (page.getWebURL().toString().contains("/ads/")
        || page.getWebURL().toString().contains("/view/")) {
      if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();

        String html = htmlParseData.getHtml();
        Document doc = Jsoup.parse(html);

        String url = page.getWebURL().getURL();
        String title = doc.title();
        String date = doc.getElementsByClass("nameDiv").get(1).text();
        String body = doc.getElementsByClass("addsTexts").get(0).text();

        System.out.println(date);
        System.out.println(body);
        Indexer.add(url, title, body, date);
      }
    }
  }
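For context, visit() is only invoked after the crawler class has been registered with a CrawlController and the crawl has been started. Below is a minimal sketch of that wiring using crawler4j's CrawlConfig, PageFetcher, RobotstxtServer, and CrawlController classes; the storage folder, seed URL, thread count, and crawler class name are placeholders, not values taken from the example above.

  public static void main(String[] args) throws Exception {
    String crawlStorageFolder = "/tmp/crawl"; // placeholder folder for crawler4j's internal data
    int numberOfCrawlers = 4;                 // placeholder thread count

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);

    // The fetcher and robots.txt handler are shared by all crawler threads.
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    controller.addSeed("http://www.example.com/"); // placeholder seed URL

    // Blocks until the crawl finishes; each worker thread gets its own crawler
    // instance, and visit(Page) is called for every successfully fetched page.
    controller.start(MyCrawler.class, numberOfCrawlers);
  }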
Code Example #2
  /** This function is called when a page is fetched and ready to be processed by your program. */
  @Override
  public void visit(Page page) {
    if (page.getParseData() instanceof HtmlParseData) {
      ResultSet res = null;
      try {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String html = htmlParseData.getHtml();
        Document doc = Jsoup.parse(html);

        // check whether this page should be crawled at all
        String url = page.getWebURL().getURL();
        if (!shouldVisit(url)) {
          return;
        }

        currentPage = getDocumentURL(url.toLowerCase());

        // check if the page was already crawled
        queryDocumentFind.setString(1, currentPage);
        res = queryDocumentFind.executeQuery();
        if (res.next()) {
          return;
        }
        logger.debug("Code: " + currentPage);

        // insert into DOCUMENT
        queryDocumentInsert.setString(1, getSHA256(currentPage));
        queryDocumentInsert.setString(2, currentPage);
        queryDocumentInsert.setString(3, getDocumentType(url).name());
        queryDocumentInsert.setString(4, getDocumentBand(url, doc));
        queryDocumentInsert.setString(5, getDocumentAlbum(url, doc));
        queryDocumentInsert.setAsciiStream(6, getDocumentContent(url, doc));
        queryDocumentInsert.setDate(7, getDocumentCreationDate(url, doc));
        queryDocumentInsert.setString(8, getDocumentCover(url, doc));
        queryDocumentInsert.setString(9, getDocumentAuthor(url, doc));
        queryDocumentInsert.setString(10, getDocumentGenre(url, doc));
        queryDocumentInsert.setInt(11, getDocumentYear(url, doc));
        queryDocumentInsert.setString(12, getDocumentLabel(url, doc));
        queryDocumentInsert.setFloat(13, getDocumentVote(url, doc));
        queryDocumentInsert.setBoolean(14, getDocumentMilestone(url, doc));
        queryDocumentInsert.executeUpdate();
        logger.info("Document " + currentPage + " added into DOCUMENT");
      } catch (Throwable t) {
        logger.error("Error parsing page " + page.getWebURL().getURL() + ": " + t.getMessage());
      } finally {
        try {
          if (res != null) {
            res.close();
          }
        } catch (Throwable t) {
          // do nothing
        }
      }
    }
  }
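The queryDocumentFind and queryDocumentInsert prepared statements used above are initialized elsewhere in the project and are not shown here. A hypothetical setup consistent with the parameter indices in the example (the table and column names below are invented for illustration only):

  // Hypothetical SQL; the real schema of the source project is not shown.
  queryDocumentFind = connection.prepareStatement(
      "SELECT 1 FROM DOCUMENT WHERE url = ?");
  queryDocumentInsert = connection.prepareStatement(
      "INSERT INTO DOCUMENT (hash, url, type, band, album, content, creation_date, "
          + "cover, author, genre, year, label, vote, milestone) "
          + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");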
Code Example #3
File: ImageCrawler.java Project: yijuchung/crawler4j
  @Override
  public void visit(Page page) {
    String url = page.getWebURL().getURL();

    // We are only interested in processing images
    if (!(page.getParseData() instanceof BinaryParseData)) {
      return;
    }

    if (!imgPatterns.matcher(url).matches()) {
      return;
    }

    // Not interested in very small images
    if (page.getContentData().length < 10 * 1024) {
      return;
    }

    // get a unique name for storing this image
    String extension = url.substring(url.lastIndexOf("."));
    String hashedName = Cryptography.MD5(url) + extension;

    // store image
    IO.writeBytesToFile(page.getContentData(), storageFolder.getAbsolutePath() + "/" + hashedName);

    System.out.println("Stored: " + url);
  }
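The imgPatterns matcher and storageFolder referenced above are fields configured outside this method. In crawler4j's sample ImageCrawler they are set up roughly as follows; treat the extension list and the configure() helper as illustrative rather than exact:

  private static final Pattern imgPatterns = Pattern.compile(".*(\\.(bmp|gif|jpe?g|png|tiff?))$");
  private static File storageFolder;

  public static void configure(String storageFolderName) {
    // Create the image output folder once, before the crawl starts.
    storageFolder = new File(storageFolderName);
    if (!storageFolder.exists()) {
      storageFolder.mkdirs();
    }
  }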
Code Example #4
File: MyCrawler.java Project: champillon/crawler4j
  /** This function is called when a page is fetched and ready to be processed by your program. */
  @Override
  public void visit(Page page) {
    String url = page.getWebURL().getURL();
    System.out.println("URL: " + url);

    if (page.getParseData() instanceof HtmlParseData) {
      i++;

      HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
      String text = htmlParseData.getText();
      String html = htmlParseData.getHtml();
      Set<WebURL> links = htmlParseData.getOutgoingUrls();

      System.out.println("Text: " + text);
      System.out.println("Text length: " + text.length());
      System.out.println("Html length: " + html.length());
      System.out.println("Number of outgoing links: " + links.size());

      // Write the extracted text and HTML to a per-page file; try-with-resources
      // guarantees the writer is closed even if an exception is thrown.
      try (PrintWriter writer = new PrintWriter(i + ".txt", "UTF-8")) {
        writer.println("Text: " + text);
        writer.println("Html: " + html);
      } catch (FileNotFoundException | UnsupportedEncodingException e) {
        e.printStackTrace();
      }
    }
  }
Code Example #5
  @Override
  public void visit(Page page) {

    if (page.getWebURL().getURL().equals("http://www.eshetab.com/")) return;
    else if (filter1.matcher(page.getWebURL().getURL()).matches()) {
      return;
    } else {
      System.out.println(page.getWebURL().getURL());
      Document doc = Jsoup.parse(((HtmlParseData) page.getParseData()).getHtml());
      Elements elements =
          doc.select(".addsTexts , .nameDiv:nth-child(3) , .nameDiv:nth-child(2) , .DivTitle span");
      String title = elements.get(0).text();
      // The field value comes after a colon; substring(1) drops the leading space.
      String date = elements.get(2).text().split(":")[1].substring(1);

      // Reverse the '-'-separated date parts and join them with '/' before conversion.
      date = date.split("-")[2] + "/" + date.split("-")[1] + "/" + date.split("-")[0];
      date = CalendarUtility.getEnglishDate(date);
      System.out.println(date);

      String city = "نامعلوم"; // "unknown"
      city = elements.get(3).text().split(":")[1].substring(1).split("-")[0];
      if (city.contains(",")) {
        city = city.substring(1, city.length() - 1);
      } else {
        city = "نامعلوم";
      }
      System.out.println(date);

      String body = elements.get(4).text();
      try {

        feeds.add(
            new Feed(
                title, body, city, URLDecoder.decode(page.getWebURL().toString(), "UTF8"), date));

      } catch (ParseException e) {
        e.printStackTrace();
      } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
      }
    }
  }
Code Example #6
    @Override
    public void visit(Page page) {
      int docid = page.getWebURL().getDocid();
      String url = page.getWebURL().getURL();
      System.out.println("Docid: " + docid);

      // *** Filter urls which need to be processed ************
      if (url.startsWith(Mission.crawlDomain + "topic/")
          && page.getParseData() instanceof HtmlParseData) {
        // *************************************************
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        Document dom = Jsoup.parse(htmlParseData.getHtml());

        // *** Scrape elements inside a page **************
        Elements titles = dom.select("div.title").select("h2");
        /*
         * TODO bug: text will also contain title
         */
        Elements texts = dom.select("div[class=bd art-content clearfix]");
        // *************************************************
        // Read the elements only after confirming they exist, to avoid an
        // IndexOutOfBoundsException on pages that do not match.
        if (titles.size() == 1 && texts.size() == 1) {
          String title = titles.get(0).text();
          String text = texts.get(0).text();
          System.out.println("title:" + title);
          System.out.println("text:" + text);
          String crawlClass = Mission.dbNameMap.get("crawlClass");
          String[] crawlFields = Mission.dbNameMap.get("crawlFields").split(",");
          // *** Save to database *************************
          ODatabaseRecordThreadLocal.INSTANCE.set(Mission.db);
          ODocument doc = new ODocument(crawlClass);
          doc.field(crawlFields[0], url);
          doc.field(crawlFields[1], title);
          doc.field(crawlFields[2], text);
          doc.save();
          // *************************************************
        }
      }
    }
Code Example #7
    @Override
    public void visit(Page page) {
      String url = page.getWebURL().getURL();
      System.out.println("Visited: " + url);

      if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        List<WebURL> links = htmlParseData.getOutgoingUrls();
        System.out.println("Text length: " + text.length());
        System.out.println("Html length: " + html.length());
        System.out.println("Number of outgoing links: " + links.size());
      }
    }
Code Example #8
    public void visit(Page page) {
      String url = page.getWebURL().getURL();

      // standard out contains a single line per URL, with the URL
      // followed by all the words found on the page
      //
      String text = page.getText().replaceAll("[^a-zA-Z]+", " ");
      System.out.println(url + "\t" + text);

      // standard err contains a line for each outgoing link from the
      // page we're crawling
      //
      for (WebURL link : page.getURLs()) {
        System.err.println(url + "\t" + link.getURL());
      }
    }
Code Example #9
  @Override
  public void visit(Page page) {
    // Keep track of visited URLs
    String url = page.getWebURL().getURL();
    stats.addCrawledUrl(url);
    System.out.println("Crawled: " + url);

    // Get the page terms and store them locally
    if (page.getParseData() instanceof HtmlParseData) { // make sure document has HTML data
      HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
      String html = htmlParseData.getHtml();

      // Don't keep documents that are too big (bigger than 2MB)
      if (html.length() <= 2097152) {
        // Store the HTML
        this.params.getDocumentStorage().storeDocument(url, html);
      }
    }
  }
Code Example #10
  @Override
  public void visit(Page page) {
    String url = page.getWebURL().getURL();

    urlQueue.add(new EmailCollectorUrlMessage(url));

    logger.info("Scanning URL: " + url);

    if (page.getParseData() instanceof HtmlParseData) {
      HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
      String html = htmlParseData.getHtml();

      try {
        addEmail(html);
      } catch (InterruptedException e) {
        e.printStackTrace();
      }
    }

    logger.info("=============");
  }
Code Example #11
File: BasicCrawler.java Project: ind9/crawler4j
  /** This function is called when a page is fetched and ready to be processed by your program. */
  @Override
  public void visit(Page page) {
    int docid = page.getWebURL().getDocid();
    String url = page.getWebURL().getURL();
    String domain = page.getWebURL().getDomain();
    String path = page.getWebURL().getPath();
    String subDomain = page.getWebURL().getSubDomain();
    String parentUrl = page.getWebURL().getParentUrl();
    String anchor = page.getWebURL().getAnchor();

    System.out.println("Docid: " + docid);
    System.out.println("URL: " + url);
    System.out.println("Domain: '" + domain + "'");
    System.out.println("Sub-domain: '" + subDomain + "'");
    System.out.println("Path: '" + path + "'");
    System.out.println("Parent page: " + parentUrl);
    System.out.println("Anchor text: " + anchor);

    if (page.getParseData() instanceof HtmlParseData) {
      HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
      String text = htmlParseData.getText();
      String html = htmlParseData.getHtml();
      List<WebURL> links = htmlParseData.getOutgoingUrls();

      System.out.println("Text length: " + text.length());
      System.out.println("Html length: " + html.length());
      System.out.println("Number of outgoing links: " + links.size());
    }

    Header[] responseHeaders = page.getFetchResponseHeaders();
    if (responseHeaders != null) {
      System.out.println("Response headers:");
      for (Header header : responseHeaders) {
        System.out.println("\t" + header.getName() + ": " + header.getValue());
      }
    }

    System.out.println("=============");
  }
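A visit() implementation like this is usually paired with a shouldVisit() override that limits which links the crawler follows. Below is a minimal sketch with a placeholder filter pattern and domain prefix; in crawler4j 4.x shouldVisit also receives the referring Page, while older versions take only the WebURL.

  private static final Pattern FILTERS =
      Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|mp3|mp4|zip|gz))$");

  @Override
  public boolean shouldVisit(Page referringPage, WebURL url) {
    String href = url.getURL().toLowerCase();
    // Skip static/binary resources and stay inside the placeholder domain.
    return !FILTERS.matcher(href).matches() && href.startsWith("http://www.example.com/");
  }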
Code Example #12
  private void PrintLongestPage() {
    String url = longestPage.getWebURL().getURL().toLowerCase();
    System.out.println("Longest page is: " + url);
    System.out.println("Number of words in the page: " + longestPageWordCount);
  }
Code Example #13
  public ExtractorContext extractContent() {

    HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();

    ExtractorContext ctx = new ExtractorContext(page, htmlParseData.getHtml());

    ctx.setMetaTags(MetadataExtractor.INST.extractMetaTags(ctx.getDoc()));

    String language = ctx.getMetaTags().get(MetadataExtractor.CONTENT_LANGUAGE_TAG);

    // skip pages that are not in English
    if ((language != null && !ENGLISH_US.equals(language.toLowerCase()))) {
      LOG.info("Skipping " + ctx.getUrl() + " not ENGLISH");
      return null;
    }

    if (ctx.getUrl().contains(YOUTUBE)) {

      String keyWords = ctx.getMetaTags().get(MetadataExtractor.KEYWORDS_TAG);

      if (keyWords == null) {
        LOG.info("Skipping " + ctx.getUrl() + " no KEYWORDS");
        return null;
      }

      keyWords = keyWords.toLowerCase();

      if (!keyWords.contains(SAP)) {
        LOG.info("Skipping " + ctx.getUrl() + " no SAP in KEYWORDS");
        return null;
      }
    }

    Document clearedHtml = cleanHtml(ctx);

    if (clearedHtml.text().length() < MIN_CONTENT_LENGTH) {
      LOG.info("Skipping " + ctx.getUrl() + " less then " + MIN_CONTENT_LENGTH + " characters");
      return null;
    }

    // skip pages whose content is not in English
    if (!LanguageDetector.INST.isEnglish(clearedHtml.text())) {
      LOG.info("Skipping " + ctx.getUrl() + " not ENGLISH");
      return null;
    }

    NavigableSet<ExtractedImage> images = MostRelevantImageExtractor.INST.extract(ctx);

    String uniqueFileName = CryptographyUtils.sha1(clearedHtml.text());

    PageObject pageObject = new PageObject();
    pageObject.setId(uniqueFileName);
    pageObject.setTitle(ctx.getTitle());
    pageObject.setContent(clearedHtml.html());
    pageObject.setUrl(page.getWebURL().getURL());
    pageObject.setHtmlCleanerName(ctx.getCleaner().getName());

    List<TextBlock> blocks = TextBlocksExtractor.INST.extract(clearedHtml);

    if (!blocks.isEmpty()) {
      pageObject.setMainBlock(blocks.get(0));
      blocks.remove(0);
      pageObject.addBlocks(blocks);
    }

    // Set the title from the first text block if it is null or empty.
    if ((pageObject.getTitle() == null || "".equals(pageObject.getTitle().trim()))
        && !pageObject.getBlocks().isEmpty()) {
      pageObject.setTitle(pageObject.getBlocks().get(0).getTitle());
    }

    pageObject.addAllKeywords(BaseClassifier.INST.classify(ctx));

    if (!images.isEmpty()) {
      pageObject.addImages(images);
    }

    ctx.setPageObject(pageObject);

    return ctx;
  }