Пример #1
0
  /**
   * Rebuilds the index entries of a book by crawling it from its location.
   *
   * <p>The method first calls {@code IndexManager.deleteBook} with the book's name, then
   * configures a {@code Crawler} rooted at {@code book.getLocation()} with a
   * {@code CrawlLinkListener} attached, and finally invokes the crawler's {@code run()}.
   *
   * @param book the book to index; its name and location are read here
   * @throws BookException declared by this method; which of the calls below raises it is
   *     not visible from this method alone
   * @throws IOException declared by this method; presumably raised by the index manager
   *     (to be confirmed against {@code IndexManager})
   */
  public void indexBook(Book book) throws BookException, IOException {
    // deleteBook runs before the crawl starts; presumably this clears entries left over
    // from an earlier indexing run of the same book (confirm against IndexManager).
    IndexManager indexManager = new IndexManager();
    indexManager.deleteBook(book.getName());

    // The listener receives both the book and the index manager it should report to.
    CrawlLinkListener pageListener = new CrawlLinkListener(book, indexManager);

    // Every change* call hands back a parameters object, so the calls are chained.
    // NOTE(review): 0 threads and a page size of -1 look like library sentinel values;
    // their exact meaning should be confirmed in the crawler library's documentation.
    DownloadParameters downloadParameters =
        new DownloadParameters().changeMaxThreads(0).changeMaxPageSize(-1);

    Crawler crawler = new Crawler();
    crawler.setRoot(new Link(book.getLocation()));
    crawler.addClassifier(new StandardClassifier());
    crawler.setDownloadParameters(downloadParameters);

    crawler.addLinkListener(pageListener);
    crawler.setDomain(Crawler.SERVER);
    crawler.setLinkType(Crawler.HYPERLINKS);
    crawler.setMaxDepth(15);
    crawler.run();
  }
Пример #2
0
  /**
   * Handles a link event reported by the crawler.
   *
   * <p>{@code ERROR} events are logged at error level, {@code TOO_DEEP} and {@code SKIPPED}
   * events at warn level. Only {@code DOWNLOADED} events are processed further: a
   * {@code BookPage} is filled from the downloaded page, its content is run through a
   * {@code FileContentParser}, the page is handed to the index manager and the page counter
   * is incremented. All other event types are ignored.
   *
   * <p>Parsing and indexing are best effort: a failure in either step is logged and does not
   * stop the remaining steps, so a page whose content could not be parsed is still passed to
   * the index manager with the metadata set here.
   *
   * @param le the link event to handle
   */
  public void crawled(LinkEvent le) {
    Link link = le.getLink();

    switch (le.getID()) {
      case LinkEvent.ERROR:
        logger.error(
            "Crawling error occured during download of URL: "
                + link.getURL()
                + " Root cause:\t"
                + le.getException());
        break;

      case LinkEvent.TOO_DEEP:
      case LinkEvent.SKIPPED:
        logger.warn("Crawling event: " + le.getName() + ", URL: " + link.getURL());
        break;

      default:
        break;
    }

    // Everything below applies to successfully downloaded pages only.
    if (le.getID() != LinkEvent.DOWNLOADED) {
      return;
    }

    logger.info("Crawling event: " + le.getName() + ", URL: " + link.getURL());

    Page p = link.getPage();

    BookPage page = new BookPage();
    page.setBookName(book.getName());
    // The running counter makes the page name distinct within the book.
    page.setName(book.getName() + counter);
    page.setParentName(book.getName());
    page.setTitle(p.getTitle());
    page.setLocation(p.getURL().toString());
    // NOTE(review): the child count is set to the running page counter, as in the original
    // code; confirm that this is the intended meaning of the field.
    page.setChildCount(counter);
    page.setContentType(book.getContentType());
    page.setPath(p.getURL().getPath());

    try {
      ByteArrayInputStream is = new ByteArrayInputStream(p.getContentBytes());

      FileContentParser parser = new FileContentParser();
      parser.processContentType(p.getContentType(), is, page);
    } catch (ParserException e) {
      logger.error("Could not parse content of URL: " + link.getURL(), e);
    } catch (Exception e) {
      // Also covers runtime failures such as missing content bytes; the page is still
      // indexed below with the metadata gathered so far.
      logger.error("Unexpected failure while processing content of URL: " + link.getURL(), e);
    }

    try {
      im.addPage(page);
    } catch (IOException e) {
      logger.error("Could not add page to the index, URL: " + link.getURL(), e);
    }

    // Incremented even when parsing or indexing failed, matching the previous behavior.
    counter++;
  }