/**
 * Extracts the clickable URI links from every page of a PDF document.
 *
 * <p>Only link annotations whose action is a URI action are considered. Extraction is best
 * effort: a page whose annotations cannot be read keeps whatever was collected so far (possibly
 * nothing), and a single link that cannot be converted into an {@code AnchorURL} is skipped
 * without discarding the remaining links of that page.
 *
 * @param pdf the document to parse
 * @return an array with one entry per page, in the order the pages are returned by the document
 *     catalog; each entry holds the links detected on that page and is never {@code null}
 */
 private Collection<AnchorURL>[] extractPdfLinks(final PDDocument pdf) {
   @SuppressWarnings("unchecked")
   List<PDPage> allPages = pdf.getDocumentCatalog().getAllPages();
   // Generic arrays cannot be created directly, hence the wildcard array plus cast.
   @SuppressWarnings("unchecked")
   Collection<AnchorURL>[] linkCollections =
       (Collection<AnchorURL>[]) new Collection<?>[allPages.size()];
   int pagecount = 0;
   for (PDPage page : allPages) {
     final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
     try {
       List<PDAnnotation> annotations = page.getAnnotations();
       if (annotations != null) {
         for (PDAnnotation pdfannotation : annotations) {
           if (!(pdfannotation instanceof PDAnnotationLink)) {
             continue;
           }
           // instanceof is false for null, so no separate null check is needed.
           PDAction link = ((PDAnnotationLink) pdfannotation).getAction();
           if (!(link instanceof PDActionURI)) {
             continue;
           }
           String uristr = ((PDActionURI) link).getURI();
           if (uristr == null) {
             // A URI action without a URI string carries nothing to extract.
             continue;
           }
           try {
             pdflinks.add(new AnchorURL(uristr));
           } catch (IOException ignored) {
             // Best effort: skip only this link so one unusable URI does not cost the rest
             // of the page. NOTE(review): assumes the AnchorURL constructor reports an
             // unparsable URI with an IOException subtype (e.g. MalformedURLException).
           }
         }
       }
     } catch (IOException ignored) {
       // Best effort: the annotations of this page could not be read; keep what was
       // collected so far and continue with the next page.
     }
     linkCollections[pagecount++] = pdflinks;
   }
   return linkCollections;
 }
// Example no. 2
// 0
  /**
   * Finishes a page: handles its annotations, optionally runs OCR, and closes the open
   * {@code div} element (presumably the one opened when the page was started).
   *
   * <p>For every annotation of the page:
   *
   * <ul>
   *   <li>file attachment annotations are handed to {@code extractMultiOSPDEmbeddedFiles};
   *   <li>if {@code config.getExtractAnnotationText()} is set, URI link annotations are written
   *       as an anchor whose text is the link itself, and markup annotations are written with
   *       their title, subject and contents.
   * </ul>
   *
   * <p>{@code pageIndex} is incremented in all cases, including when an exception is thrown.
   *
   * <p>Error handling: an {@code IOException} raised inside the outer {@code try} is recorded in
   * {@code exceptions} and processing of this page stops without closing the {@code div}. This
   * presumably includes the {@code IOExceptionWithCause} wrappers thrown while extracting an
   * attachment, assuming that type is an {@code IOException}. An {@code IOException} raised by
   * the attachment extraction call itself is delegated to {@code handleCatchableIOE}.
   *
   * @param page the page that has just been processed
   * @throws IOException wrapping a {@code SAXException} or {@code TikaException} that escaped
   *     from writing the annotation markup, running OCR or closing the {@code div}
   */
  @Override
  protected void endPage(PDPage page) throws IOException {

    try {
      EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
      for (PDAnnotation annotation : page.getAnnotations()) {

        if (annotation instanceof PDAnnotationFileAttachment) {
          PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
          // The file specification is not guaranteed to be the complex kind; casting blindly
          // would abort the whole parse with a ClassCastException. A null specification is
          // still passed on, exactly as before.
          Object rawSpec = fann.getFile();
          if (rawSpec == null || rawSpec instanceof PDComplexFileSpecification) {
            PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) rawSpec;
            try {
              extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, extractor);
            } catch (SAXException e) {
              throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
            } catch (TikaException e) {
              throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
            } catch (IOException e) {
              handleCatchableIOE(e);
            }
          }
        }
        // TODO: remove once PDFBOX-1143 is fixed:
        if (config.getExtractAnnotationText()) {
          if (annotation instanceof PDAnnotationLink) {
            PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
            // instanceof is false for null, so a missing action is skipped as well.
            PDAction action = annotationlink.getAction();
            if (action instanceof PDActionURI) {
              // can't currently associate link to text.
              // for now, extract link and repeat the link as if it
              // were the visible text
              PDActionURI uri = (PDActionURI) action;
              String link = uri.getURI();
              if (link != null && link.trim().length() > 0) {
                xhtml.startElement("div", "class", "annotation");
                xhtml.startElement("a", "href", link);
                xhtml.characters(link);
                xhtml.endElement("a");
                xhtml.endElement("div");
              }
            }
          }

          if (annotation instanceof PDAnnotationMarkup) {
            PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
            String title = annotationMarkup.getTitlePopup();
            String subject = annotationMarkup.getSubject();
            String contents = annotationMarkup.getContents();
            // TODO: maybe also annotationMarkup.getRichContents()?
            // Only emit the wrapper div when there is at least one part to put inside it.
            if (title != null || subject != null || contents != null) {
              xhtml.startElement("div", "class", "annotation");

              if (title != null) {
                xhtml.startElement("div", "class", "annotationTitle");
                xhtml.characters(title);
                xhtml.endElement("div");
              }

              if (subject != null) {
                xhtml.startElement("div", "class", "annotationSubject");
                xhtml.characters(subject);
                xhtml.endElement("div");
              }

              if (contents != null) {
                xhtml.startElement("div", "class", "annotationContents");
                xhtml.characters(contents);
                xhtml.endElement("div");
              }

              xhtml.endElement("div");
            }
          }
        }
      }
      if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
        doOCROnCurrentPage();
      }
      xhtml.endElement("div");
    } catch (SAXException | TikaException e) {
      throw new IOExceptionWithCause("Unable to end a page", e);
    } catch (IOException e) {
      // Recorded instead of propagated, so the caller can carry on with later pages.
      exceptions.add(e);
    } finally {
      pageIndex++;
    }
  }