@Override protected void endDocument(PDDocument pdf) throws IOException { try { // Extract text for any bookmarks: extractBookmarkText(); try { extractEmbeddedDocuments(pdf); } catch (IOException e) { handleCatchableIOE(e); } // extract acroform data at end of doc if (config.getExtractAcroFormContent() == true) { try { extractAcroForm(pdf); } catch (IOException e) { handleCatchableIOE(e); } } xhtml.endDocument(); } catch (TikaException e) { throw new IOExceptionWithCause("Unable to end a document", e); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a document", e); } }
@Override public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { DBFReader reader = DBFReader.open(stream); DBFFileHeader header = reader.getHeader(); metadata.set(Metadata.CONTENT_TYPE, header.getVersion().getFullMimeString()); // insert metadata here Calendar lastModified = header.getLastModified(); if (lastModified != null) { metadata.set(TikaCoreProperties.MODIFIED, lastModified); } // buffer first X rows for charset detection List<DBFRow> firstRows = new LinkedList<>(); DBFRow row = reader.next(); int i = 0; while (row != null && i++ < ROWS_TO_BUFFER_FOR_CHARSET_DETECTION) { firstRows.add(row.deepCopy()); row = reader.next(); } Charset charset = getCharset(firstRows, header); metadata.set(Metadata.CONTENT_ENCODING, charset.toString()); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.startElement("table"); xhtml.startElement("thead"); for (DBFColumnHeader col : header.getCols()) { xhtml.startElement("th"); xhtml.characters(col.getName(charset)); xhtml.endElement("th"); } xhtml.endElement("thead"); xhtml.startElement("tbody"); // now write cached rows while (firstRows.size() > 0) { DBFRow cachedRow = firstRows.remove(0); writeRow(cachedRow, charset, xhtml); } // now continue with rest while (row != null) { writeRow(row, charset, xhtml); row = reader.next(); } xhtml.endElement("tbody"); xhtml.endElement("table"); xhtml.endDocument(); }
@Override public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { connection = getConnection(stream, metadata, context); XHTMLContentHandler xHandler = null; List<String> tableNames = null; try { tableNames = getTableNames(connection, metadata, context); } catch (SQLException e) { throw new IOExceptionWithCause(e); } for (String tableName : tableNames) { // add table names to parent metadata metadata.add(Database.TABLE_NAME, tableName); } xHandler = new XHTMLContentHandler(handler, metadata); xHandler.startDocument(); try { for (String tableName : tableNames) { JDBCTableReader tableReader = getTableReader(connection, tableName, context); xHandler.startElement("table", "name", tableReader.getTableName()); xHandler.startElement("thead"); xHandler.startElement("tr"); for (String header : tableReader.getHeaders()) { xHandler.startElement("th"); xHandler.characters(header); xHandler.endElement("th"); } xHandler.endElement("tr"); xHandler.endElement("thead"); xHandler.startElement("tbody"); while (tableReader.nextRow(xHandler, context)) { // no-op } xHandler.endElement("tbody"); xHandler.endElement("table"); } } finally { if (xHandler != null) { xHandler.endDocument(); } try { close(); } catch (SQLException e) { // swallow } } }
/**
 * Emits an XHTML document containing a single empty paragraph; the input
 * stream is not read.
 *
 * <p>If {@code metadata} has no content type yet, it is set to
 * {@code application/xml}; an existing value is left untouched.
 *
 * @param stream   the input (unused by this method)
 * @param handler  receiver of the generated XHTML events
 * @param metadata metadata whose content type is defaulted when absent
 * @param context  parse context (unused by this method)
 * @throws IOException   declared by the overridden method
 * @throws SAXException  propagated from the XHTML handler
 * @throws TikaException declared by the overridden method
 */
@Override
public void parse(
        InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final boolean contentTypeMissing = metadata.get(Metadata.CONTENT_TYPE) == null;
    if (contentTypeMissing) {
        metadata.set(Metadata.CONTENT_TYPE, "application/xml");
    }

    final XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
    out.startDocument();
    // The body is just one empty <p/> element.
    out.startElement("p");
    out.endElement("p");
    out.endDocument();
}
/**
 * Parses an XML stream with the SAX parser supplied by the parse context and
 * forwards the resulting events into an XHTML document.
 *
 * <p>The input is wrapped in a {@code CloseShieldInputStream} before it is handed
 * to the SAX parser. A {@code SAXException} that the tagged handler recognises as
 * its own is rethrown as-is via {@code throwIfCauseOf}; any other
 * {@code SAXException} is reported as a {@code TikaException} with the message
 * "XML parse error". The XHTML document is ended in all cases once it has been
 * started.
 *
 * @param stream   the XML input
 * @param handler  receiver of the generated XHTML events
 * @param metadata metadata passed to {@code setContentType} and the handler chain
 * @param context  parse context providing the SAX parser
 * @throws IOException   propagated from reading the stream
 * @throws SAXException  if the failure originated in the downstream handler
 * @throws TikaException if the XML itself could not be parsed
 */
@Override
public void parse(
        InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    setContentType(metadata);

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    final TaggedContentHandler taggedHandler = new TaggedContentHandler(xhtml);
    try {
        context.getSAXParser()
                .parse(
                        new CloseShieldInputStream(stream),
                        wrapForSaxParsing(taggedHandler, metadata, context));
    } catch (SAXException parseFailure) {
        // Rethrows the exception unchanged when it came from the tagged handler.
        taggedHandler.throwIfCauseOf(parseFailure);
        throw new TikaException("XML parse error", parseFailure);
    } finally {
        xhtml.endDocument();
    }
}

/** Builds the handler chain given to the SAX parser: offline -> embedded -> content handler. */
private OfflineContentHandler wrapForSaxParsing(
        TaggedContentHandler taggedHandler, Metadata metadata, ParseContext context) {
    return new OfflineContentHandler(
            new EmbeddedContentHandler(getContentHandler(taggedHandler, metadata, context)));
}