private void addFieldString(PDField field) throws SAXException { // Pick partial name to present in content and altName for attribute // Ignoring FullyQualifiedName for now String partName = field.getPartialName(); String altName = field.getAlternateFieldName(); StringBuilder sb = new StringBuilder(); AttributesImpl attrs = new AttributesImpl(); if (partName != null) { sb.append(partName).append(": "); } if (altName != null) { attrs.addAttribute("", "altName", "altName", "CDATA", altName); } // return early if PDSignature field if (field instanceof PDSignatureField) { handleSignature(attrs, (PDSignatureField) field); return; } String value = field.getValueAsString(); if (value != null && !value.equals("null")) { sb.append(value); } if (attrs.getLength() > 0 || sb.length() > 0) { xhtml.startElement("li", attrs); xhtml.characters(sb.toString()); xhtml.endElement("li"); } }
/**
 * Emits one DBF row as a {@code <tr>} containing one {@code <td>} per cell.
 *
 * @param row the row whose cells are written
 * @param charset the charset used to decode each cell's content
 * @param xhtml the handler receiving the table markup
 * @throws SAXException if the handler fails
 */
private void writeRow(DBFRow row, Charset charset, XHTMLContentHandler xhtml)
        throws SAXException {
    final String rowTag = "tr";
    final String cellTag = "td";

    xhtml.startElement(rowTag);
    for (DBFCell dbfCell : row.cells) {
        xhtml.startElement(cellTag);
        // Decode only after the cell element has been opened, as before.
        final String cellText = dbfCell.getString(charset);
        xhtml.characters(cellText);
        xhtml.endElement(cellTag);
    }
    xhtml.endElement(rowTag);
}
@Override public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { DBFReader reader = DBFReader.open(stream); DBFFileHeader header = reader.getHeader(); metadata.set(Metadata.CONTENT_TYPE, header.getVersion().getFullMimeString()); // insert metadata here Calendar lastModified = header.getLastModified(); if (lastModified != null) { metadata.set(TikaCoreProperties.MODIFIED, lastModified); } // buffer first X rows for charset detection List<DBFRow> firstRows = new LinkedList<>(); DBFRow row = reader.next(); int i = 0; while (row != null && i++ < ROWS_TO_BUFFER_FOR_CHARSET_DETECTION) { firstRows.add(row.deepCopy()); row = reader.next(); } Charset charset = getCharset(firstRows, header); metadata.set(Metadata.CONTENT_ENCODING, charset.toString()); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.startElement("table"); xhtml.startElement("thead"); for (DBFColumnHeader col : header.getCols()) { xhtml.startElement("th"); xhtml.characters(col.getName(charset)); xhtml.endElement("th"); } xhtml.endElement("thead"); xhtml.startElement("tbody"); // now write cached rows while (firstRows.size() > 0) { DBFRow cachedRow = firstRows.remove(0); writeRow(cachedRow, charset, xhtml); } // now continue with rest while (row != null) { writeRow(row, charset, xhtml); row = reader.next(); } xhtml.endElement("tbody"); xhtml.endElement("table"); xhtml.endDocument(); }
/**
 * Produces an XHTML document containing a single empty paragraph.
 *
 * <p>The input stream is not read. If no content type has been set on the metadata yet,
 * it is set to {@code application/xml}.
 *
 * @param stream ignored
 * @param handler receives the generated XHTML
 * @param metadata gets a default content type when none is present
 * @param context ignored
 */
@Override
public void parse(
        InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final String declaredType = metadata.get(Metadata.CONTENT_TYPE);
    if (declaredType == null) {
        metadata.set(Metadata.CONTENT_TYPE, "application/xml");
    }

    final XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
    out.startDocument();
    // An empty paragraph is the entire body.
    out.startElement("p");
    out.endElement("p");
    out.endDocument();
}
void extractBookmarkText(PDOutlineNode bookmark) throws SAXException { PDOutlineItem current = bookmark.getFirstChild(); if (current != null) { xhtml.startElement("ul"); while (current != null) { xhtml.startElement("li"); xhtml.characters(current.getTitle()); xhtml.endElement("li"); // Recurse: extractBookmarkText(current); current = current.getNextSibling(); } xhtml.endElement("ul"); } }
void extractAcroForm(PDDocument pdf) throws IOException, SAXException { // Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields // this code derives from Ben's code PDDocumentCatalog catalog = pdf.getDocumentCatalog(); if (catalog == null) return; PDAcroForm form = catalog.getAcroForm(); if (form == null) return; // if it has xfa, try that. // if it doesn't exist or there's an exception, // go with traditional AcroForm PDXFAResource pdxfa = form.getXFA(); if (pdxfa != null) { // if successful, return XFAExtractor xfaExtractor = new XFAExtractor(); try (InputStream is = new BufferedInputStream(new ByteArrayInputStream(pdxfa.getBytes()))) { xfaExtractor.extract(is, xhtml, metadata, context); return; } catch (XMLStreamException | IOException e) { // if there was an xml parse exception in xfa, try the AcroForm } } @SuppressWarnings("rawtypes") List fields = form.getFields(); if (fields == null) return; @SuppressWarnings("rawtypes") ListIterator itr = fields.listIterator(); if (itr == null) return; xhtml.startElement("div", "class", "acroform"); xhtml.startElement("ol"); while (itr.hasNext()) { Object obj = itr.next(); if (obj != null && obj instanceof PDField) { processAcroField((PDField) obj, 0); } } xhtml.endElement("ol"); xhtml.endElement("div"); }
/**
 * Writes a form field and, for non-terminal fields, its children as a nested list.
 *
 * <p>Recursion stops silently once {@code currentRecursiveDepth} reaches
 * {@code MAX_ACROFORM_RECURSIONS}; a field at or beyond that depth is not written at all.
 *
 * @param field the field to write
 * @param currentRecursiveDepth depth of {@code field} in the field tree, 0 for top level
 * @throws SAXException if the content handler fails
 * @throws IOException declared for callers; propagated from recursive calls
 */
private void processAcroField(PDField field, final int currentRecursiveDepth)
        throws SAXException, IOException {
    if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
        return;
    }

    addFieldString(field);

    if (!(field instanceof PDNonTerminalField)) {
        return;
    }
    final PDNonTerminalField parent = (PDNonTerminalField) field;
    final int childDepth = currentRecursiveDepth + 1;

    xhtml.startElement("ol");
    for (PDField kid : parent.getChildren()) {
        processAcroField(kid, childDepth);
    }
    xhtml.endElement("ol");
}
@Override protected void endDocument(PDDocument pdf) throws IOException { try { // Extract text for any bookmarks: extractBookmarkText(); try { extractEmbeddedDocuments(pdf); } catch (IOException e) { handleCatchableIOE(e); } // extract acroform data at end of doc if (config.getExtractAcroFormContent() == true) { try { extractAcroForm(pdf); } catch (IOException e) { handleCatchableIOE(e); } } xhtml.endDocument(); } catch (TikaException e) { throw new IOExceptionWithCause("Unable to end a document", e); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a document", e); } }
/**
 * Opens the XHTML document.
 *
 * @param pdf the document being started (not read here)
 * @throws IOException wrapping any {@code SAXException} raised by the content handler
 */
@Override
protected void startDocument(PDDocument pdf) throws IOException {
    try {
        xhtml.startDocument();
    } catch (SAXException saxFailure) {
        final String message = "Unable to start a document";
        throw new IOExceptionWithCause(message, saxFailure);
    }
}
/**
 * Opens the {@code <div class="page">} wrapper for a page and starts its first paragraph.
 *
 * @param page the page being started (not read here)
 * @throws IOException wrapping any {@code SAXException} raised while opening the div, or
 *     propagated from {@code writeParagraphStart}
 */
@Override
protected void startPage(PDPage page) throws IOException {
    try {
        xhtml.startElement("div", "class", "page");
    } catch (SAXException saxFailure) {
        final String message = "Unable to start a page";
        throw new IOExceptionWithCause(message, saxFailure);
    }
    // Only reached when the page div was opened successfully.
    writeParagraphStart();
}
private void extractPDEmbeddedFile( String displayName, String unicodeFileName, String fileName, PDEmbeddedFile file, EmbeddedDocumentExtractor extractor) throws SAXException, IOException, TikaException { if (file == null) { // skip silently return; } fileName = (fileName == null) ? displayName : fileName; // TODO: other metadata? Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); metadata.set( TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName); if (extractor.shouldParseEmbedded(metadata)) { TikaInputStream stream = null; try { stream = TikaInputStream.get(file.createInputStream()); extractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml), metadata, false); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", fileName); xhtml.startElement("div", attributes); xhtml.endElement("div"); } finally { IOUtils.closeQuietly(stream); } } }
/**
 * Parses an XML stream with the SAX parser supplied by the parse context.
 *
 * <p>The caller's stream is shielded from being closed by the parser. SAX events pass
 * through an {@code OfflineContentHandler} and an {@code EmbeddedContentHandler} into the
 * handler returned by {@code getContentHandler}. A {@code SAXException} that originated in
 * the downstream handler is rethrown as-is via the tagged handler; any other is wrapped in
 * a {@code TikaException}. The XHTML document is always ended, even on failure.
 *
 * @param stream the XML input; not closed by this method
 * @param handler receives the generated XHTML
 * @param metadata passed to {@code setContentType} and to the handlers
 * @param context supplies the SAX parser
 */
@Override
public void parse(
        InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    setContentType(metadata);

    final XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
    out.startDocument();

    final TaggedContentHandler taggedOut = new TaggedContentHandler(out);
    try {
        context.getSAXParser()
                .parse(
                        new CloseShieldInputStream(stream),
                        new OfflineContentHandler(
                                new EmbeddedContentHandler(
                                        getContentHandler(taggedOut, metadata, context))));
    } catch (SAXException saxFailure) {
        // Rethrows unchanged when the failure came from the wrapped handler.
        taggedOut.throwIfCauseOf(saxFailure);
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        out.endDocument();
    }
}
private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField) throws SAXException { PDSignature sig = sigField.getSignature(); if (sig == null) { return; } Map<String, String> vals = new TreeMap<>(); vals.put("name", sig.getName()); vals.put("contactInfo", sig.getContactInfo()); vals.put("location", sig.getLocation()); vals.put("reason", sig.getReason()); Calendar cal = sig.getSignDate(); if (cal != null) { dateFormat.setTimeZone(cal.getTimeZone()); vals.put("date", dateFormat.format(cal.getTime())); } // see if there is any data int nonNull = 0; for (String val : vals.keySet()) { if (val != null && !val.equals("")) { nonNull++; } } // if there is, process it if (nonNull > 0) { xhtml.startElement("li", parentAttributes); AttributesImpl attrs = new AttributesImpl(); attrs.addAttribute("", "type", "type", "CDATA", "signaturedata"); xhtml.startElement("ol", attrs); for (Map.Entry<String, String> e : vals.entrySet()) { if (e.getValue() == null || e.getValue().equals("")) { continue; } attrs = new AttributesImpl(); attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey()); xhtml.startElement("li", attrs); xhtml.characters(e.getValue()); xhtml.endElement("li"); } xhtml.endElement("ol"); xhtml.endElement("li"); } }
@Override public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { connection = getConnection(stream, metadata, context); XHTMLContentHandler xHandler = null; List<String> tableNames = null; try { tableNames = getTableNames(connection, metadata, context); } catch (SQLException e) { throw new IOExceptionWithCause(e); } for (String tableName : tableNames) { // add table names to parent metadata metadata.add(Database.TABLE_NAME, tableName); } xHandler = new XHTMLContentHandler(handler, metadata); xHandler.startDocument(); try { for (String tableName : tableNames) { JDBCTableReader tableReader = getTableReader(connection, tableName, context); xHandler.startElement("table", "name", tableReader.getTableName()); xHandler.startElement("thead"); xHandler.startElement("tr"); for (String header : tableReader.getHeaders()) { xHandler.startElement("th"); xHandler.characters(header); xHandler.endElement("th"); } xHandler.endElement("tr"); xHandler.endElement("thead"); xHandler.startElement("tbody"); while (tableReader.nextRow(xHandler, context)) { // no-op } xHandler.endElement("tbody"); xHandler.endElement("table"); } } finally { if (xHandler != null) { xHandler.endDocument(); } try { close(); } catch (SQLException e) { // swallow } } }
@Override protected void endPage(PDPage page) throws IOException { try { EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, extractor); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } catch (IOException e) { handleCatchableIOE(e); } } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { // can't currently associate link to text. // for now, extract link and repeat the link as if it // were the visible text PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null && link.trim().length() > 0) { xhtml.startElement("div", "class", "annotation"); xhtml.startElement("a", "href", link); xhtml.characters(link); xhtml.endElement("a"); xhtml.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? 
if (title != null || subject != null || contents != null) { xhtml.startElement("div", "class", "annotation"); if (title != null) { xhtml.startElement("div", "class", "annotationTitle"); xhtml.characters(title); xhtml.endElement("div"); } if (subject != null) { xhtml.startElement("div", "class", "annotationSubject"); xhtml.characters(subject); xhtml.endElement("div"); } if (contents != null) { xhtml.startElement("div", "class", "annotationContents"); xhtml.characters(contents); xhtml.endElement("div"); } xhtml.endElement("div"); } } } } if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { doOCROnCurrentPage(); } xhtml.endElement("div"); } catch (SAXException | TikaException e) { throw new IOExceptionWithCause("Unable to end a page", e); } catch (IOException e) { exceptions.add(e); } finally { pageIndex++; } }