public static WebDocument getDocument(Page page) { InputStream stream = new ByteArrayInputStream(page.getContentBytes()); try { Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, page.getContentType()); metadata.set(Metadata.CONTENT_LOCATION, page.getURL().toString()); metadata.set(Metadata.LOCATION, page.getURL().toString()); metadata.set(Metadata.MIME_TYPE_MAGIC, page.getContentType()); metadata.set(Metadata.CONTENT_ENCODING, page.getContentEncoding()); metadata.set(Metadata.TIKA_MIME_FILE, page.getContentType()); String text = getTika().parseToString(stream, metadata); WebDocument wd = new WebDocument(0, metadata.get(Metadata.TITLE), text, page.getURL().toString()); stream.close(); return wd; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (TikaException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { try { stream.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } // close the stream } return null; }
public void crawled(LinkEvent le) { switch (le.getID()) { case LinkEvent.ERROR: logger.error( "Crawling error occured during download of URL: " + le.getLink().getURL() + " Root cause: " + le.getException()); break; case LinkEvent.TOO_DEEP: case LinkEvent.SKIPPED: logger.warn("Crawling event: " + le.getName() + ", URL: " + le.getLink().getURL()); break; default: break; } if (le.getID() != LinkEvent.DOWNLOADED) return; logger.info("Crawling event: " + le.getName() + ", URL: " + le.getLink().getURL()); Link l = le.getLink(); Page p = l.getPage(); BookPage page = new BookPage(); page.setBookName(book.getName()); page.setName(book.getName() + counter); page.setParentName(book.getName()); page.setTitle(p.getTitle()); page.setLocation(p.getURL().toString()); page.setChildCount(counter); page.setContentType(book.getContentType()); page.setPath(p.getURL().getPath()); System.out.println(p.getContentBytes()); try { ByteArrayInputStream is = new ByteArrayInputStream(p.getContentBytes()); FileContentParser parser = new FileContentParser(); parser.processContentType(p.getContentType(), is, page); // } catch (UnsupportedEncodingException e1) { } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } try { im.addPage(page); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } counter++; }