public void indexBook(Book book) throws BookException, IOException {
    IndexManager inMan = new IndexManager();
    // Remove any stale entries for this book before re-indexing it.
    inMan.deleteBook(book.getName());
    CrawlLinkListener listener = new CrawlLinkListener(book, inMan);

    DownloadParameters dlPars = new DownloadParameters();
    dlPars = dlPars.changeMaxThreads(0);
    dlPars = dlPars.changeMaxPageSize(-1); // negative value: no page size limit

    Crawler c = new Crawler();
    c.setRoot(new Link(book.getLocation()));
    c.addClassifier(new StandardClassifier());
    c.setDownloadParameters(dlPars);
    c.addLinkListener(listener);
    c.setDomain(Crawler.SERVER);       // stay on the originating server
    c.setLinkType(Crawler.HYPERLINKS); // follow hyperlinks only
    c.setMaxDepth(15);
    c.run();
}
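For orientation, here is a minimal sketch of the CrawlLinkListener class that the two methods in this listing imply. It implements WebSPHINX's LinkListener interface and holds the book, the IndexManager (named im, as it is used in crawled() below), and the page counter. The field names, constructor, and package imports are inferred from how the listener is used here, not taken verbatim from the original class; crawled() also uses a logger field whose logging framework is not shown in the listing.

import websphinx.LinkEvent;
import websphinx.LinkListener;

// Sketch only: the fields book, im, and counter are assumptions inferred
// from their use in crawled(); the real class may differ.
public class CrawlLinkListener implements LinkListener {

    private final Book book;       // the book being indexed
    private final IndexManager im; // receives each parsed page
    private int counter = 0;       // numbers pages in crawl order

    public CrawlLinkListener(Book book, IndexManager im) {
        this.book = book;
        this.im = im;
    }

    public void crawled(LinkEvent le) {
        // See the full implementation below.
    }
}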
public void crawled(LinkEvent le) {
    switch (le.getID()) {
    case LinkEvent.ERROR:
        logger.error("Crawling error occurred during download of URL: "
                + le.getLink().getURL() + " Root cause: " + le.getException());
        break;
    case LinkEvent.TOO_DEEP:
    case LinkEvent.SKIPPED:
        logger.warn("Crawling event: " + le.getName() + ", URL: "
                + le.getLink().getURL());
        break;
    default:
        break;
    }

    // Only fully downloaded pages get indexed.
    if (le.getID() != LinkEvent.DOWNLOADED) {
        return;
    }
    logger.info("Crawling event: " + le.getName() + ", URL: "
            + le.getLink().getURL());

    Link l = le.getLink();
    Page p = l.getPage();

    BookPage page = new BookPage();
    page.setBookName(book.getName());
    page.setName(book.getName() + counter);
    page.setParentName(book.getName());
    page.setTitle(p.getTitle());
    page.setLocation(p.getURL().toString());
    page.setChildCount(counter);
    page.setContentType(book.getContentType());
    page.setPath(p.getURL().getPath());

    try {
        ByteArrayInputStream is = new ByteArrayInputStream(p.getContentBytes());
        FileContentParser parser = new FileContentParser();
        parser.processContentType(p.getContentType(), is, page);
    } catch (ParserException e) {
        logger.error("Failed to parse page content: " + p.getURL(), e);
    } catch (Exception e) {
        logger.error("Unexpected error while parsing page content: " + p.getURL(), e);
    }

    try {
        im.addPage(page);
    } catch (IOException e) {
        logger.error("Failed to add page to index: " + page.getName(), e);
    }
    counter++;
}
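A hedged usage sketch of the whole flow: the setter names on Book are hypothetical, mirroring the getters (getName(), getLocation(), getContentType()) that indexBook() and crawled() call, and BookIndexer is a placeholder name for whatever class hosts indexBook().

// Hypothetical caller; Book setters and the BookIndexer class name are
// assumptions, not part of the original listing.
Book book = new Book();
book.setName("sample-book");
book.setLocation("http://www.example.com/books/sample/index.html");
book.setContentType("text/html");

try {
    new BookIndexer().indexBook(book);
} catch (BookException e) {
    e.printStackTrace();
} catch (IOException e) {
    e.printStackTrace();
}

Note that indexBook() invokes Crawler.run() directly rather than spawning a thread, so the call blocks until the crawl finishes and every downloaded page has passed through crawled() into the index.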