/**
 * Extracts the text and title of a page with Tika and wraps them in a {@link WebDocument}.
 *
 * <p>The page's content type, URL and content encoding are handed to Tika as metadata before
 * parsing. The resulting document is created with id {@code 0}, the value Tika stored under
 * {@code Metadata.TITLE}, the extracted text and the page URL.
 *
 * @param page the page to convert; its content bytes, content type, content encoding and URL
 *     are read
 * @return the extracted document, or {@code null} if an {@link IOException} or a
 *     {@code TikaException} occurred (the stack trace is printed in that case)
 */
public static WebDocument getDocument(Page page) {
    String url = page.getURL().toString();
    String contentType = page.getContentType();

    // try-with-resources closes the stream exactly once on every path (success or failure).
    try (InputStream stream = new ByteArrayInputStream(page.getContentBytes())) {
      Metadata metadata = new Metadata();
      metadata.set(Metadata.CONTENT_TYPE, contentType);
      metadata.set(Metadata.CONTENT_LOCATION, url);
      metadata.set(Metadata.LOCATION, url);
      metadata.set(Metadata.MIME_TYPE_MAGIC, contentType);
      metadata.set(Metadata.CONTENT_ENCODING, page.getContentEncoding());
      metadata.set(Metadata.TIKA_MIME_FILE, contentType);

      // The title is read from the metadata only after parsing has run.
      String text = getTika().parseToString(stream, metadata);
      return new WebDocument(0, metadata.get(Metadata.TITLE), text, url);
    } catch (IOException | TikaException e) {
      // Extraction is best-effort: report the failure and signal it to the caller with null.
      e.printStackTrace();
      return null;
    }
  }
Esempio n. 2
0
  /**
   * Handles a link event: logs errors and skipped links, and indexes downloaded pages.
   *
   * <p>{@code ERROR} events are logged at error level, {@code TOO_DEEP} and {@code SKIPPED}
   * events at warn level. Every event other than {@code DOWNLOADED} is then ignored. For a
   * {@code DOWNLOADED} event a {@link BookPage} is built from the linked page and the current
   * book, the page content is run through a {@link FileContentParser}, and the result is passed
   * to {@code im.addPage}. Parsing and indexing failures are logged and not rethrown; the page
   * is still added when parsing fails, and the counter is advanced in either case.
   *
   * @param le the link event to handle
   */
  public void crawled(LinkEvent le) {
    switch (le.getID()) {
      case LinkEvent.ERROR:
        logger.error(
            "Crawling error occured during download of URL: "
                + le.getLink().getURL()
                + " Root cause:\t"
                + le.getException());
        break;
      case LinkEvent.TOO_DEEP:
      case LinkEvent.SKIPPED:
        logger.warn("Crawling event: " + le.getName() + ", URL: " + le.getLink().getURL());
        break;
      default:
        break;
    }

    // Only successfully downloaded pages are indexed.
    if (le.getID() != LinkEvent.DOWNLOADED) {
      return;
    }

    logger.info("Crawling event: " + le.getName() + ", URL: " + le.getLink().getURL());

    Link link = le.getLink();
    Page downloaded = link.getPage();
    String bookName = book.getName();

    BookPage page = new BookPage();
    page.setBookName(bookName);
    // The running counter makes the page name unique within the book.
    page.setName(bookName + counter);
    page.setParentName(bookName);
    page.setTitle(downloaded.getTitle());
    page.setLocation(downloaded.getURL().toString());
    page.setChildCount(counter);
    page.setContentType(book.getContentType());
    page.setPath(downloaded.getURL().getPath());

    try {
      ByteArrayInputStream content = new ByteArrayInputStream(downloaded.getContentBytes());
      FileContentParser parser = new FileContentParser();
      parser.processContentType(downloaded.getContentType(), content, page);
    } catch (ParserException e) {
      // Best-effort: the page is still indexed below with whatever was populated so far.
      logger.error("Could not parse content of URL: " + downloaded.getURL(), e);
    } catch (Exception e) {
      logger.error("Unexpected error while parsing content of URL: " + downloaded.getURL(), e);
    }

    try {
      im.addPage(page);
    } catch (IOException e) {
      logger.error("Could not add page for URL: " + downloaded.getURL(), e);
    }

    counter++;
  }