/**
 * Extracts the clickable URI links from a PDF document, page by page.
 *
 * <p>Only link annotations whose action is a URI action are considered.
 * Extraction is best-effort: a link that cannot be converted into an
 * {@link AnchorURL} is skipped on its own, and a page whose annotations
 * cannot be read contributes an empty collection. Neither case aborts
 * processing of the remaining links or pages.
 *
 * @param pdf the document to parse
 * @return an array with one entry per page, in page order; each entry holds
 *         the links detected on that page (possibly empty, never {@code null})
 */
private Collection<AnchorURL>[] extractPdfLinks(final PDDocument pdf) {
    @SuppressWarnings("unchecked")
    List<PDPage> allPages = pdf.getDocumentCatalog().getAllPages();
    // Generic arrays cannot be created directly, hence the wildcard array plus cast.
    @SuppressWarnings("unchecked")
    Collection<AnchorURL>[] linkCollections = (Collection<AnchorURL>[]) new Collection<?>[allPages.size()];
    int pagecount = 0;
    for (PDPage page : allPages) {
        final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
        try {
            List<PDAnnotation> annotations = page.getAnnotations();
            if (annotations != null) {
                for (PDAnnotation pdfannotation : annotations) {
                    if (!(pdfannotation instanceof PDAnnotationLink)) {
                        continue;
                    }
                    PDAction link = ((PDAnnotationLink) pdfannotation).getAction();
                    // instanceof is already false for null, so no separate null check is needed
                    if (!(link instanceof PDActionURI)) {
                        continue;
                    }
                    String uristr = ((PDActionURI) link).getURI();
                    if (uristr == null) {
                        // a URI action without a URI string carries no usable link
                        continue;
                    }
                    try {
                        pdflinks.add(new AnchorURL(uristr));
                    } catch (IOException ignored) {
                        // Deliberately ignored: one unusable URI must not discard
                        // the other links found on this page.
                    }
                }
            }
        } catch (IOException ignored) {
            // Deliberately ignored: the annotations of this page could not be read.
            // The page keeps whatever links were collected so far (normally none).
        }
        linkCollections[pagecount++] = pdflinks;
    }
    return linkCollections;
}
@Override protected void endPage(PDPage page) throws IOException { try { EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); for (PDAnnotation annotation : page.getAnnotations()) { if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, extractor); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); } catch (TikaException e) { throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } catch (IOException e) { handleCatchableIOE(e); } } // TODO: remove once PDFBOX-1143 is fixed: if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation; if (annotationlink.getAction() != null) { PDAction action = annotationlink.getAction(); if (action instanceof PDActionURI) { // can't currently associate link to text. // for now, extract link and repeat the link as if it // were the visible text PDActionURI uri = (PDActionURI) action; String link = uri.getURI(); if (link != null && link.trim().length() > 0) { xhtml.startElement("div", "class", "annotation"); xhtml.startElement("a", "href", link); xhtml.characters(link); xhtml.endElement("a"); xhtml.endElement("div"); } } } } if (annotation instanceof PDAnnotationMarkup) { PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation; String title = annotationMarkup.getTitlePopup(); String subject = annotationMarkup.getSubject(); String contents = annotationMarkup.getContents(); // TODO: maybe also annotationMarkup.getRichContents()? 
if (title != null || subject != null || contents != null) { xhtml.startElement("div", "class", "annotation"); if (title != null) { xhtml.startElement("div", "class", "annotationTitle"); xhtml.characters(title); xhtml.endElement("div"); } if (subject != null) { xhtml.startElement("div", "class", "annotationSubject"); xhtml.characters(subject); xhtml.endElement("div"); } if (contents != null) { xhtml.startElement("div", "class", "annotationContents"); xhtml.characters(contents); xhtml.endElement("div"); } xhtml.endElement("div"); } } } } if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { doOCROnCurrentPage(); } xhtml.endElement("div"); } catch (SAXException | TikaException e) { throw new IOExceptionWithCause("Unable to end a page", e); } catch (IOException e) { exceptions.add(e); } finally { pageIndex++; } }