/**
   * Finishes processing of the PDF: pulls in bookmark text, embedded documents and, when the
   * configuration asks for it, AcroForm content, and then ends the XHTML document.
   *
   * <p>An {@link IOException} raised while extracting embedded documents or the AcroForm is
   * handed to {@code handleCatchableIOE} instead of aborting the remaining steps directly.
   *
   * @param pdf the document that has just been processed
   * @throws IOException if a step fails; a {@code TikaException} or {@code SAXException} is
   *     rethrown wrapped in an {@code IOExceptionWithCause}
   */
  @Override
  protected void endDocument(PDDocument pdf) throws IOException {
    try {
      // Bookmarks come first, before any embedded content.
      extractBookmarkText();

      try {
        extractEmbeddedDocuments(pdf);
      } catch (IOException embeddedFailure) {
        handleCatchableIOE(embeddedFailure);
      }

      // AcroForm data is optional and is emitted at the very end of the document.
      if (config.getExtractAcroFormContent()) {
        try {
          extractAcroForm(pdf);
        } catch (IOException acroFormFailure) {
          handleCatchableIOE(acroFormFailure);
        }
      }

      xhtml.endDocument();
    } catch (TikaException | SAXException failure) {
      // The overridden signature only allows IOException, so wrap and keep the cause.
      throw new IOExceptionWithCause("Unable to end a document", failure);
    }
  }
Beispiel #2
0
  /**
   * Reads a DBF stream and emits its content as a single XHTML table.
   *
   * <p>The content type is taken from the DBF header version, and the modification date is
   * recorded when the header provides one. Up to {@code ROWS_TO_BUFFER_FOR_CHARSET_DETECTION}
   * rows are buffered and passed to {@code getCharset}; the resulting charset is stored as the
   * content encoding and used for column names and row output.
   *
   * @param stream the DBF data to read
   * @param handler receiver of the generated XHTML SAX events
   * @param metadata metadata object that is updated with type, date and encoding
   * @param context parse context (not used by this method)
   * @throws IOException if reading fails
   * @throws SAXException if the handler rejects an event
   * @throws TikaException declared by the overridden method
   */
  @Override
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    DBFReader dbf = DBFReader.open(stream);
    DBFFileHeader fileHeader = dbf.getHeader();
    metadata.set(Metadata.CONTENT_TYPE, fileHeader.getVersion().getFullMimeString());

    // Only record a modification date when the header actually carries one.
    Calendar modified = fileHeader.getLastModified();
    if (modified != null) {
      metadata.set(TikaCoreProperties.MODIFIED, modified);
    }

    // Hold back the leading rows so the charset can be detected before anything is written.
    // Rows are copied before the next read (presumably the reader reuses its row instance;
    // confirm against DBFReader).
    List<DBFRow> sample = new LinkedList<>();
    DBFRow current = dbf.next();
    for (int buffered = 0;
        current != null && buffered < ROWS_TO_BUFFER_FOR_CHARSET_DETECTION;
        buffered++) {
      sample.add(current.deepCopy());
      current = dbf.next();
    }

    Charset detected = getCharset(sample, fileHeader);
    metadata.set(Metadata.CONTENT_ENCODING, detected.toString());

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.startElement("table");

    // Header section: one <th> per column.
    xhtml.startElement("thead");
    for (DBFColumnHeader column : fileHeader.getCols()) {
      xhtml.startElement("th");
      xhtml.characters(column.getName(detected));
      xhtml.endElement("th");
    }
    xhtml.endElement("thead");

    xhtml.startElement("tbody");

    // Drain the buffered rows first, in the order they were read.
    while (!sample.isEmpty()) {
      writeRow(sample.remove(0), detected, xhtml);
    }

    // 'current' still holds the first row that was read but not buffered (or null at EOF).
    for (; current != null; current = dbf.next()) {
      writeRow(current, detected, xhtml);
    }

    xhtml.endElement("tbody");
    xhtml.endElement("table");
    xhtml.endDocument();
  }
  /**
   * Opens a database connection for the stream and writes every table as an XHTML
   * {@code <table>} (a header row of column names followed by the data rows).
   *
   * <p>Each table name is also added to {@code metadata} under {@code Database.TABLE_NAME}
   * before the document is started.
   *
   * <p>The connection is released through {@code close()} on every exit path once it has been
   * obtained: failure to list the tables, failure to start the document, failure while
   * writing rows, and failure while ending the document.
   *
   * @param stream the database file to read
   * @param handler receiver of the generated XHTML SAX events
   * @param metadata metadata object; table names are added to it
   * @param context parse context passed through to the connection and table readers
   * @throws IOException if reading fails, or wrapping the {@link SQLException} raised while
   *     listing the tables
   * @throws SAXException if the handler rejects an event
   * @throws TikaException declared by the overridden method
   */
  @Override
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    connection = getConnection(stream, metadata, context);
    try {
      List<String> tableNames;
      try {
        tableNames = getTableNames(connection, metadata, context);
      } catch (SQLException e) {
        throw new IOExceptionWithCause(e);
      }
      for (String tableName : tableNames) {
        // add table names to parent metadata
        metadata.add(Database.TABLE_NAME, tableName);
      }

      XHTMLContentHandler xHandler = new XHTMLContentHandler(handler, metadata);
      xHandler.startDocument();
      try {
        for (String tableName : tableNames) {
          JDBCTableReader tableReader = getTableReader(connection, tableName, context);
          xHandler.startElement("table", "name", tableReader.getTableName());
          xHandler.startElement("thead");
          xHandler.startElement("tr");
          for (String header : tableReader.getHeaders()) {
            xHandler.startElement("th");
            xHandler.characters(header);
            xHandler.endElement("th");
          }
          xHandler.endElement("tr");
          xHandler.endElement("thead");
          xHandler.startElement("tbody");
          while (tableReader.nextRow(xHandler, context)) {
            // nextRow emits the row itself; nothing else to do per iteration
          }
          xHandler.endElement("tbody");
          xHandler.endElement("table");
        }
      } finally {
        // Always end the document once it has been started, even if a table failed midway.
        xHandler.endDocument();
      }
    } finally {
      // Runs for every outcome above, so the connection is not leaked when listing tables,
      // starting the document, or ending the document throws.
      try {
        close();
      } catch (SQLException ignored) {
        // Best-effort cleanup: a failure to close must not mask the parse result or the
        // exception that is already propagating.
      }
    }
  }
Beispiel #4
0
  /**
   * Emits an XHTML document containing a single empty paragraph.
   *
   * <p>If {@code metadata} has no content type yet, it is set to {@code application/xml};
   * an existing value is left alone. The stream and the parse context are not read.
   *
   * @param stream the input (not consumed by this method)
   * @param handler receiver of the generated XHTML SAX events
   * @param metadata metadata object; receives a default content type when none is set
   * @param context parse context (not used by this method)
   * @throws IOException declared by the overridden method
   * @throws SAXException if the handler rejects an event
   * @throws TikaException declared by the overridden method
   */
  @Override
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    // Supply a default type only when the caller has not provided one.
    String declaredType = metadata.get(Metadata.CONTENT_TYPE);
    if (declaredType == null) {
      metadata.set(Metadata.CONTENT_TYPE, "application/xml");
    }

    final XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
    out.startDocument();

    // The body is intentionally a lone, empty <p> element.
    out.startElement("p");
    out.endElement("p");

    out.endDocument();
  }
Beispiel #5
0
  /**
   * Runs the stream through the SAX parser supplied by the parse context and forwards the
   * events, via the handler returned by {@code getContentHandler}, into an XHTML document.
   *
   * <p>The XHTML document is started before parsing and is always ended afterwards, whether
   * or not parsing succeeded.
   *
   * @param stream the XML input; it is wrapped in a {@code CloseShieldInputStream} before
   *     being handed to the SAX parser
   * @param handler receiver of the generated XHTML SAX events
   * @param metadata metadata object; passed to {@code setContentType} and the content handler
   * @param context parse context that provides the SAX parser
   * @throws IOException if reading fails
   * @throws SAXException if ending the document fails, or if {@code throwIfCauseOf} rethrows
   *     the parse failure
   * @throws TikaException with message {@code "XML parse error"} for any other
   *     {@code SAXException} raised while parsing
   */
  @Override
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    setContentType(metadata);

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    TaggedContentHandler taggedXhtml = new TaggedContentHandler(xhtml);
    try {
      // Build the pieces in the same order the parse call needs them: parser, input, handlers.
      javax.xml.parsers.SAXParser saxParser = context.getSAXParser();
      CloseShieldInputStream shieldedInput = new CloseShieldInputStream(stream);
      OfflineContentHandler offlineHandler =
          new OfflineContentHandler(
              new EmbeddedContentHandler(getContentHandler(taggedXhtml, metadata, context)));

      saxParser.parse(shieldedInput, offlineHandler);
    } catch (SAXException parseFailure) {
      // Give the tagged handler the chance to rethrow the exception as-is; anything it does
      // not rethrow is reported as a Tika-level parse error.
      taggedXhtml.throwIfCauseOf(parseFailure);
      throw new TikaException("XML parse error", parseFailure);
    } finally {
      xhtml.endDocument();
    }
  }