Java PDDocument примеры, org.apache.pdfbox.pdmodel.PDDocument Java примеры использования

Пример #1

1

Показать файл

Файл: PdfBoxParser.java Проект: littleStar711/josm-plugins

  /**
   * Parses a single-page, unencrypted PDF file and draws its page through a
   * {@code GraphicsProcessor} bound to {@code target}.
   *
   * <p>After drawing, {@code target.bounds} is set to the page's media box. The loaded document is
   * always closed and the monitor task is always finished, even when parsing fails.
   *
   * @param file the PDF file to load
   * @param maxPaths passed through to the {@code GraphicsProcessor}
   * @param monitor progress monitor whose task is started and finished by this method
   * @throws Exception if the document is encrypted, does not contain exactly one page, or cannot
   *     be loaded or drawn
   */
  public void parse(File file, int maxPaths, ProgressMonitor monitor) throws Exception {
    // NOTE(review): the ", 1" is an argument of tr(), not of beginTask() - possibly a misplaced
    // parenthesis. Left unchanged; confirm the intended call.
    monitor.beginTask(tr("Parsing PDF", 1));

    try {
      PDDocument document = PDDocument.load(file);
      try {
        if (document.isEncrypted()) {
          throw new Exception(tr("Encrypted documents not supported."));
        }

        List<?> allPages = document.getDocumentCatalog().getAllPages();

        if (allPages.size() != 1) {
          throw new Exception(tr("The PDF file must have exactly one page."));
        }

        PDPage page = (PDPage) allPages.get(0);
        PDRectangle pageSize = page.findMediaBox();

        // getRotation() may return null; treat that as "no rotation".
        Integer rotationVal = page.getRotation();
        int rotation = (rotationVal != null) ? rotationVal.intValue() : 0;

        GraphicsProcessor p = new GraphicsProcessor(target, rotation, maxPaths, monitor);
        PageDrawer drawer = new PageDrawer();
        drawer.drawPage(p, page);
        this.target.bounds =
            new Rectangle2D.Double(
                pageSize.getLowerLeftX(),
                pageSize.getLowerLeftY(),
                pageSize.getWidth(),
                pageSize.getHeight());
      } finally {
        // Release the resources held by the loaded document on every exit path.
        document.close();
      }
    } finally {
      monitor.finishTask();
    }
  }

Пример #2

0

Показать файл

Файл: PdfBoxSignatureService.java Проект: rvillido/dss

  /**
   * Writes an incremental save of {@code pdDocument} to {@code fileOutputStream}, using
   * {@code signedFile} as the original content.
   *
   * @param parameters signature parameters; the signing date is read when the document has no id
   * @param signedFile file opened as the input of the incremental save
   * @param fileOutputStream destination of the incremental save
   * @param pdDocument document to save
   * @throws DSSException wrapping any {@code IOException} or {@code COSVisitorException}
   */
  public void saveDocumentIncrementally(
      PAdESSignatureParameters parameters,
      File signedFile,
      FileOutputStream fileOutputStream,
      PDDocument pdDocument)
      throws DSSException {

    FileInputStream originalContent = null;
    try {
      originalContent = new FileInputStream(signedFile);

      // The document must carry an id. Without one, an id based on the current system time would
      // be used and the digest of the signed data would then differ.
      final boolean idMissing = pdDocument.getDocumentId() == null;
      if (idMissing) {
        final String signingDate = parameters.bLevel().getSigningDate().toString();
        final byte[] documentIdBytes =
            DSSUtils.digest(DigestAlgorithm.MD5, signingDate.getBytes());
        pdDocument.setDocumentId(DSSUtils.toLong(documentIdBytes));
        // NOTE(review): this immediately overwrites the digest-derived id set just above, so the
        // effective id is always 0 - confirm which of the two values is intended.
        pdDocument.setDocumentId(0L);
      }

      pdDocument.saveIncremental(originalContent, fileOutputStream);
    } catch (IOException ioFailure) {
      throw new DSSException(ioFailure);
    } catch (COSVisitorException visitorFailure) {
      throw new DSSException(visitorFailure);
    } finally {
      IOUtils.closeQuietly(originalContent);
    }
  }

Пример #3

0

Показать файл

Файл: PDFContentExtractor.java Проект: gSafe/mark

 /**
  * Extracts the text of a PDF read from {@code src}, one page at a time, and returns the
  * concatenation of all page texts.
  *
  * <p>Both the low-level and the high-level document are closed before returning; failures while
  * closing are ignored.
  *
  * @param src stream containing the PDF
  * @return the text of all pages, in page order
  * @throws IOException if parsing or text extraction fails
  */
 public static String extractText(InputStream src) throws IOException {
   final StringBuilder collected = new StringBuilder();
   COSDocument cosDoc = null;
   PDDocument pdDoc = null;
   try {
     final PDFParser parser = new PDFParser(src);
     parser.parse();
     cosDoc = parser.getDocument();
     final PDFTextStripper stripper = new PDFTextStripper();
     pdDoc = new PDDocument(cosDoc);
     final int pageCount = pdDoc.getDocumentCatalog().getPages().getCount();
     // The stripper is restricted to a single page per call: pages 1..pageCount.
     for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
       stripper.setStartPage(pageNumber);
       stripper.setEndPage(pageNumber);
       collected.append(stripper.getText(pdDoc));
     }
   } finally {
     if (cosDoc != null) {
       try {
         cosDoc.close();
       } catch (IOException ignored) {
         // Best-effort cleanup: a failed close must not hide the extraction result.
       }
     }
     if (pdDoc != null) {
       try {
         pdDoc.close();
       } catch (IOException ignored) {
         // Best-effort cleanup: a failed close must not hide the extraction result.
       }
     }
   }
   return collected.toString();
 }

Пример #4

0

Показать файл

Файл: ResourceHelper.java Проект: muh6mm3d/javlo

 /**
  * Returns the textual content of a file, selecting the extractor by file name.
  *
  * <ul>
  *   <li>extension {@code doc}: text is read with {@code WordExtractor};
  *   <li>extension {@code pdf}: text is read with {@code PDFTextStripper};
  *   <li>names accepted by {@code StringHelper.isHTML}: the file is returned via
  *       {@code loadStringFromFile}.
  * </ul>
  *
  * <p>Any failure while reading is logged and whatever content was obtained so far (possibly the
  * empty string) is returned. The input stream and the PDF document are always closed.
  *
  * @param file the file to read; must not be {@code null}
  * @return the extracted text, or the empty string when the type is unsupported or reading failed
  * @throws FileNotFoundException declared for callers; failures inside the body are logged
  * @throws IOException declared for callers; failures inside the body are logged
  */
 public static String getFileContent(File file) throws FileNotFoundException, IOException {
   // file.getName() already fails for a null file, so no later null check is needed.
   String ext = FilenameUtils.getExtension(file.getName());
   String outContent = "";
   try {
     if (ext.toLowerCase().equals("doc")) {
       FileInputStream in = new FileInputStream(file);
       try {
         WordExtractor we = new WordExtractor(in);
         outContent = we.getText();
       } finally {
         in.close();
       }
     } else if (ext.toLowerCase().equals("pdf")) {
       PDDocument doc = PDDocument.load(file);
       try {
         PDFTextStripper text = new PDFTextStripper();
         outContent = text.getText(doc);
       } finally {
         doc.close();
       }
     } else if (StringHelper.isHTML(file.getName())) {
       return loadStringFromFile(file);
     }
   } catch (Throwable t) {
     logger.warning("error when read : " + file + "+ [" + t.getMessage() + "]");
     t.printStackTrace();
   }
   return outContent;
 }

Пример #5

0

Показать файл

Файл: PDFProcessor.java Проект: BavithraRajendran/liferay-portal

  /**
   * Loads a PDF from {@code inputStream} and generates the thumbnail and/or preview images that
   * are enabled for {@code fileVersion}.
   *
   * <p>The thumbnail is generated from the first page only. Preview images are generated for every
   * page; when previews are disabled the loop stops after the first page. The document is closed
   * on every exit path.
   *
   * @param fileVersion the file version the images belong to
   * @param inputStream stream containing the PDF
   * @throws Exception if loading the document or generating an image fails
   */
  private void _generateImagesPB(FileVersion fileVersion, InputStream inputStream)
      throws Exception {

    final boolean previewWanted = _isGeneratePreview(fileVersion);
    final boolean thumbnailWanted = _isGenerateThumbnail(fileVersion);

    PDDocument pdDocument = null;

    try {
      pdDocument = PDDocument.load(inputStream);

      List<PDPage> pdPages = pdDocument.getDocumentCatalog().getAllPages();

      int pageIndex = 0;

      for (PDPage pdPage : pdPages) {
        final boolean firstPage = (pageIndex == 0);

        if (firstPage && thumbnailWanted) {
          _generateImagesPB(
              fileVersion,
              pdPage,
              PropsValues.DL_FILE_ENTRY_THUMBNAIL_DPI,
              PropsValues.DL_FILE_ENTRY_THUMBNAIL_HEIGHT,
              PropsValues.DL_FILE_ENTRY_THUMBNAIL_WIDTH,
              true,
              0);

          if (_log.isInfoEnabled()) {
            _log.info("PDFBox generated a thumbnail for " + fileVersion.getFileVersionId());
          }
        }

        // Without previews only the first page matters, so stop here.
        if (!previewWanted) {
          break;
        }

        _generateImagesPB(
            fileVersion,
            pdPage,
            PropsValues.DL_FILE_ENTRY_PREVIEW_DOCUMENT_DPI,
            PropsValues.DL_FILE_ENTRY_PREVIEW_DOCUMENT_HEIGHT,
            PropsValues.DL_FILE_ENTRY_PREVIEW_DOCUMENT_WIDTH,
            false,
            pageIndex + 1);

        pageIndex++;
      }

      if (previewWanted && _log.isInfoEnabled()) {
        _log.info(
            "PDFBox generated "
                + getPreviewFileCount(fileVersion)
                + " preview pages for "
                + fileVersion.getFileVersionId());
      }
    } finally {
      if (pdDocument != null) {
        pdDocument.close();
      }
    }
  }

Пример #6

0

Показать файл

  /**
   * Builds a one-page PDF containing a small two-column table and saves it as {@code box.pdf} in
   * the working directory.
   *
   * @param args unused
   * @throws IOException if the table cannot be drawn or the file cannot be written
   */
  public static void main(String[] args) throws IOException {

    try (PDDocument doc = new PDDocument()) {
      PDPage page = new PDPage();

      // Create a landscape page
      // page.setMediaBox(new PDRectangle(PDRectangle.A4.getHeight(),
      // PDRectangle.A4.getWidth()));
      doc.addPage(page);

      // Initialize table
      float margin = 10;
      float tableWidth = page.getMediaBox().getWidth() - (2 * margin);
      float yStartNewPage = page.getMediaBox().getHeight() - (2 * margin);
      float yStart = yStartNewPage;
      float bottomMargin = 0;

      // Create the data: one header row followed by five key/value rows.
      List<List> data = new ArrayList<>();
      data.add(new ArrayList<>(Arrays.asList("Key", "Value")));
      for (int i = 1; i <= 5; i++) {
        data.add(new ArrayList<>(Arrays.asList(String.valueOf(i), "value:" + i)));
      }

      BaseTable dataTable =
          new BaseTable(
              yStart, yStartNewPage, bottomMargin, tableWidth, margin, doc, page, true, true);
      DataTable t = new DataTable(dataTable, page);
      t.addListToTable(data, DataTable.HASHEADER);
      dataTable.draw();

      File file = new File("box.pdf");
      doc.save(file);
      // Report success only after the save has actually completed.
      System.out.println("Sample file saved at : " + file.getAbsolutePath());
    }
  }

Пример #7

0

Показать файл

Файл: WordToPdfRenditionProviderTest.java Проект: paulcwarren/spring-content

  /**
   * Extracts the text of a PDF read from {@code in}.
   *
   * <p>The parsed documents are closed on every exit path, including the successful one.
   *
   * @param in stream containing the PDF
   * @return the extracted text, or {@code null} if parsing or extraction failed (the failure is
   *     printed to standard error)
   */
  private String pdfToText(InputStream in) {
    PDDocument pdDoc = null;
    COSDocument cosDoc = null;

    try {
      PDFParser parser = new PDFParser(in);
      parser.parse();
      cosDoc = parser.getDocument();
      PDFTextStripper pdfStripper = new PDFTextStripper();
      pdDoc = new PDDocument(cosDoc);
      return pdfStripper.getText(pdDoc);
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    } finally {
      // Close in finally so the documents are released after a successful extraction too.
      try {
        if (cosDoc != null) {
          cosDoc.close();
        }
        if (pdDoc != null) {
          pdDoc.close();
        }
      } catch (Exception e1) {
        // Report the close failure itself, not the earlier extraction failure.
        e1.printStackTrace();
      }
    }
  }

Пример #8

0

Показать файл

Файл: DocumentHelper.java Проект: kluna/JGAAP

  /**
   * Extracts the text of a PDF read from an input stream and returns it as a character array.
   *
   * <p>Position-based sorting is disabled on the stripper. The document is closed on every exit
   * path.
   *
   * @param filesInputStream an input stream providing the PDF content
   * @return the extracted text
   * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
   */
  private static char[] loadPDF(InputStream filesInputStream) throws IOException {
    PDDocument doc = PDDocument.load(filesInputStream);
    try {
      PDFTextStripper pdfStripper = new PDFTextStripper();
      pdfStripper.setSortByPosition(false);
      return pdfStripper.getText(doc).toCharArray();
    } finally {
      // Release the document even when text extraction throws.
      doc.close();
    }
  }

Пример #9

0

Показать файл

Файл: PublicKeySecurityHandler.java Проект: sato-shigezo/pdfBox

  /**
   * Decrypts the given document.
   *
   * <p>The document is remembered in {@code this.document}, decryption is prepared from the
   * document's encryption dictionary and document ID together with the supplied material, and the
   * decryption is then carried out.
   *
   * @param doc the document to decrypt
   * @param decryptionMaterial the data used to decrypt the document
   * @throws CryptographyException if there is an error during decryption
   * @throws IOException if there is an error accessing data
   */
  public void decryptDocument(PDDocument doc, DecryptionMaterial decryptionMaterial)
      throws CryptographyException, IOException {
    this.document = doc;

    prepareForDecryption(
        doc.getEncryptionDictionary(), doc.getDocument().getDocumentID(), decryptionMaterial);
    proceedDecryption();
  }

Пример #10

0

Показать файл

Файл: StringPlus.java Проект: rupumped/NicksAppsJava

 /**
  * Runs the given parser and returns the text of the parsed PDF.
  *
  * <p>Both the low-level and the high-level document are closed on every exit path.
  *
  * @param parser a parser already bound to its PDF input
  * @return the extracted text
  * @throws IOException if parsing, text extraction or closing fails
  */
 public static String getContent(PDFParser parser) throws IOException {
   parser.parse();
   COSDocument cosDoc = parser.getDocument();
   try {
     PDFTextStripper pdfStripper = new PDFTextStripper();
     PDDocument pdDoc = new PDDocument(cosDoc);
     try {
       return pdfStripper.getText(pdDoc);
     } finally {
       pdDoc.close();
     }
   } finally {
     // Closed separately so the parsed document is released even if no PDDocument was created.
     cosDoc.close();
   }
 }

Пример #11

0

Показать файл

Файл: PdfReportGenerator.java Проект: Catalysts/cat-boot

 /**
  * Generates the report and writes the resulting PDF to {@code stream}.
  *
  * <p>The generated document is closed on every exit path, including a failed save.
  *
  * @param pageConfig page layout passed to {@code generate}
  * @param templateResource template passed to {@code generate}
  * @param report report structure passed to {@code generate}
  * @param stream destination of the PDF
  * @param document document passed to {@code generate}
  * @throws IOException if generating, saving or closing fails
  */
 public void printToStream(
     PdfPageLayout pageConfig,
     Resource templateResource,
     PdfReportStructure report,
     OutputStream stream,
     PDDocument document)
     throws IOException {
   PDDocument page = generate(pageConfig, templateResource, report, document);
   try {
     page.save(stream);
   } finally {
     page.close();
   }
 }

Пример #12

0

Показать файл

Файл: LucenePDFDocument.java Проект: lumpchen/pdfbox_ua

  /**
   * Adds the text, metadata and a short summary of a PDF to the Lucene document.
   *
   * <p>The PDF is loaded with an empty password. Its full text is added as the {@code contents}
   * field, the document information entries as individual fields, and the first 500 characters
   * (or fewer) of the text as the {@code summary} field. The PDF is closed on every exit path.
   *
   * @param document the Lucene document to add the contents to
   * @param is the stream to read the PDF from
   * @param documentLocation the location of the document, used only in the error message
   * @throws IOException if the PDF cannot be parsed, or if it is encrypted with a non-empty
   *     password
   */
  private void addContent(Document document, InputStream is, String documentLocation)
      throws IOException {
    PDDocument pdfDocument = null;
    try {
      pdfDocument = PDDocument.load(is, "");

      // The stripper is created on first use and then reused.
      if (stripper == null) {
        stripper = new PDFTextStripper();
      }
      StringWriter textSink = new StringWriter();
      stripper.writeText(pdfDocument, textSink);

      String contents = textSink.getBuffer().toString();

      // Add the extracted text as a Reader-valued field so it gets tokenized and indexed.
      addTextField(document, "contents", new StringReader(contents));

      PDDocumentInformation metadata = pdfDocument.getDocumentInformation();
      if (metadata != null) {
        addTextField(document, "Author", metadata.getAuthor());
        addTextField(document, "CreationDate", metadata.getCreationDate());
        addTextField(document, "Creator", metadata.getCreator());
        addTextField(document, "Keywords", metadata.getKeywords());
        addTextField(document, "ModificationDate", metadata.getModificationDate());
        addTextField(document, "Producer", metadata.getProducer());
        addTextField(document, "Subject", metadata.getSubject());
        addTextField(document, "Title", metadata.getTitle());
        addTextField(document, "Trapped", metadata.getTrapped());
      }

      // The summary is stored unindexed so it can be returned with hits for display.
      String summary = contents.substring(0, Math.min(contents.length(), 500));
      addUnindexedField(document, "summary", summary);
    } catch (InvalidPasswordException e) {
      // No password was supplied and the default of "" was wrong.
      throw new IOException(
          "Error: The document(" + documentLocation + ") is encrypted and will not be indexed.", e);
    } finally {
      if (pdfDocument != null) {
        pdfDocument.close();
      }
    }
  }

Пример #13

0

Показать файл

Файл: PdfParser.java Проект: vahid-g/PublicationAnalyzer

  /**
   * Loads a PDF file and returns its text.
   *
   * <p>The document is closed on every exit path.
   *
   * @param filename path of the PDF file
   * @return the extracted text
   * @throws IOException if the file cannot be loaded or its text cannot be extracted
   */
  public static String loadPdfToString(String filename) throws IOException {
    PDDocument document = PDDocument.load(new File(filename));
    try {
      PDFTextStripper reader = new PDFTextStripper();
      return reader.getText(document);
    } finally {
      // Release the document even when text extraction throws.
      document.close();
    }
  }

Пример #14

0

Показать файл

Файл: Labs.java Проект: fabianonunes/freemandela

  /**
   * Experimental dump of image data found in the PDF referenced by {@code iconFile}.
   *
   * <p>First every stream object whose subtype is {@code COSName.IMAGE} is printed as its filter
   * followed by the hex encoding of its filtered bytes; then, for every page, the bytes of each
   * XObject in the page resources are printed as hex. The document and every opened stream are
   * closed on all exit paths.
   *
   * @param args unused
   * @throws IOException if the document cannot be loaded or a stream cannot be read
   */
  @SuppressWarnings("unchecked")
  public static void main_3(String[] args) throws IOException {

    PDDocument doc = PDDocument.load(iconFile);
    try {
      List<PDPage> pages = doc.getDocumentCatalog().getAllPages();

      List<COSObject> objects = doc.getDocument().getObjects();

      for (COSObject cosObject : objects) {

        COSBase cosbase = cosObject.getObject();

        if (cosbase instanceof COSStream) {

          COSStream cosstream = (COSStream) cosbase;

          COSBase filter = cosstream.getDictionaryObject(COSName.FILTER);

          COSBase subtype = cosstream.getDictionaryObject(COSName.SUBTYPE);

          if (subtype != null && subtype.equals(COSName.IMAGE)) {

            System.out.println(filter);

            InputStream filtered = cosstream.getFilteredStream();
            try {
              System.out.println(Hex.encodeHex(IOUtils.toByteArray(filtered)));
            } finally {
              filtered.close();
            }
          }
        }
      }

      for (PDPage pdPage : pages) {

        PDResources resources = pdPage.getResources();
        if (resources == null) {
          // Nothing to dump for a page that reports no resources.
          continue;
        }

        Map<String, PDXObject> images = resources.getXObjects();
        if (images == null) {
          continue;
        }

        for (PDXObject image : images.values()) {

          byte[] imgData = image.getPDStream().getByteArray();

          System.out.println(Hex.encodeHex(imgData));
        }
      }
    } finally {
      doc.close();
    }
  }

Пример #15

0

Показать файл

Файл: PDFReader.java Проект: kripke/friendlyreader

  /**
   * Loads a PDF document and, when it is encrypted, attempts to decrypt it with the empty
   * password.
   *
   * <p>A failed decryption is only reported on standard error; the document is returned in either
   * case.
   *
   * @param input the input stream for the document
   * @return the loaded document
   * @throws IOException if there is an error parsing the document
   */
  private static PDDocument parseDocument(InputStream input) throws IOException {
    final PDDocument loaded = PDDocument.load(input);
    if (!loaded.isEncrypted()) {
      return loaded;
    }

    try {
      loaded.decrypt("");
    } catch (org.apache.pdfbox.exceptions.InvalidPasswordException wrongPassword) {
      System.err.println("Error: The document is encrypted.");
    } catch (org.apache.pdfbox.exceptions.CryptographyException cryptoFailure) {
      cryptoFailure.printStackTrace();
    }
    return loaded;
  }

Пример #16

0

Показать файл

Файл: PDFTextExtractor.java Проект: Knixli/Zeen

 /**
  * Extracts the text of all pages of a PDF file, sorted by position.
  *
  * <p>The file stream and both document objects are closed on every exit path.
  *
  * @param pdfFile the PDF file to read; must not be {@code null}
  * @return the extracted text
  * @throws IOException if the file cannot be read, parsed or closed
  */
 public static String extract(File pdfFile) throws IOException {
   checkNotNull(pdfFile, "pdfFile");
   FileInputStream in = new FileInputStream(pdfFile);
   try {
     PDFParser parser = new PDFParser(in);
     parser.parse();
     COSDocument cosDoc = parser.getDocument();
     try {
       PDFTextStripper pdfStripper = new PDFTextStripper();
       PDDocument pdDoc = new PDDocument(cosDoc);
       try {
         pdfStripper.setStartPage(1);
         pdfStripper.setEndPage(pdDoc.getNumberOfPages());
         pdfStripper.setSortByPosition(true);
         return pdfStripper.getText(pdDoc);
       } finally {
         pdDoc.close();
       }
     } finally {
       cosDoc.close();
     }
   } finally {
     // The stream opened here is owned by this method, so it is closed here as well.
     in.close();
   }
 }

Пример #17

0

Показать файл

Файл: PDFRenderer.java Проект: liuwenhui2365/EasyEnglishV2

  /**
   * Renders the given page into a bitmap at the given scale.
   *
   * <p>The bitmap size is the page's crop box scaled by {@code scale}; width and height are
   * swapped for pages rotated by 90 or 270 degrees. For every config other than
   * {@code ARGB_8888} the bitmap is first filled with white.
   *
   * @param pageIndex the zero-based index of the page to be converted
   * @param scale the scaling factor, where 1 = 72 DPI
   * @param config the bitmap config to create
   * @return the rendered page image
   * @throws IOException if the PDF cannot be read
   */
  public Bitmap renderImage(int pageIndex, float scale, Bitmap.Config config) throws IOException {
    PDPage page = document.getPage(pageIndex);

    PDRectangle cropBox = page.getCropBox();
    int widthPx = Math.round(cropBox.getWidth() * scale);
    int heightPx = Math.round(cropBox.getHeight() * scale);
    int rotationAngle = page.getRotation();

    // A quarter-turn page needs a bitmap with swapped width and height.
    boolean quarterTurn = (rotationAngle == 90) || (rotationAngle == 270);
    Bitmap image =
        quarterTurn
            ? Bitmap.createBitmap(heightPx, widthPx, config)
            : Bitmap.createBitmap(widthPx, heightPx, config);

    Paint paint = new Paint();
    Canvas canvas = new Canvas(image);
    boolean needsWhiteBackground = (config != Bitmap.Config.ARGB_8888);
    if (needsWhiteBackground) {
      // Only ARGB_8888 is left unfilled; every other config gets a white background.
      paint.setColor(Color.WHITE);
      paint.setStyle(Paint.Style.FILL);
      canvas.drawRect(0, 0, image.getWidth(), image.getHeight(), paint);
      paint.reset();
    }

    renderPage(page, paint, canvas, image.getWidth(), image.getHeight(), scale, scale);

    return image;
  }

Пример #18

0

Показать файл

Файл: PDFRenderer.java Проект: aclarkxyz/PDFBoxLite

  /**
   * Returns the given page as an RGB or ARGB image at the given scale.
   *
   * <p>The image size is the page's crop box scaled by {@code scale}; width and height are
   * swapped for pages rotated by 90 or 270 degrees. The graphics context is disposed on every exit
   * path.
   *
   * @param pageIndex the zero-based index of the page to be converted
   * @param scale the scaling factor, where 1 = 72 DPI
   * @param imageType the type of image to return
   * @return the rendered page image
   * @throws IOException if the PDF cannot be read
   */
  public BufferedImage renderImage(int pageIndex, float scale, ImageType imageType)
      throws IOException {
    PDPage page = document.getPage(pageIndex);

    PDRectangle cropBox = page.getCropBox();
    float widthPt = cropBox.getWidth();
    float heightPt = cropBox.getHeight();
    int widthPx = Math.round(widthPt * scale);
    int heightPx = Math.round(heightPt * scale);
    int rotationAngle = page.getRotation();

    // swap width and height
    BufferedImage image;
    if (rotationAngle == 90 || rotationAngle == 270) {
      image = new BufferedImage(heightPx, widthPx, imageType.toBufferedImageType());
    } else {
      image = new BufferedImage(widthPx, heightPx, imageType.toBufferedImageType());
    }

    Graphics2D g = image.createGraphics();
    try {
      // use a transparent background if the imageType supports alpha
      if (imageType == ImageType.ARGB) {
        g.setBackground(new Color(0, 0, 0, 0));
      } else {
        g.setBackground(Color.WHITE);
      }

      renderPage(page, g, image.getWidth(), image.getHeight(), scale, scale);
    } finally {
      // Dispose the graphics context even when rendering fails.
      g.dispose();
    }

    return image;
  }

Пример #19

0

Показать файл

Файл: AbstractPDF2XHTML.java Проект: Zarana-Parekh/tika

  /**
   * Processes the embedded files listed in the document's name dictionary.
   *
   * <p>Names are taken from the embedded-files tree root when present; otherwise from each of its
   * direct kids. Deeper levels of the tree are not searched.
   *
   * @param document the PDF document to inspect
   * @throws IOException if reading the name tree or an embedded file fails
   * @throws SAXException propagated from {@code processEmbeddedDocNames}
   * @throws TikaException propagated from {@code processEmbeddedDocNames}
   */
  private void extractEmbeddedDocuments(PDDocument document)
      throws IOException, SAXException, TikaException {
    PDDocumentNameDictionary nameDictionary =
        new PDDocumentNameDictionary(document.getDocumentCatalog());
    PDEmbeddedFilesNameTreeNode embeddedFilesRoot = nameDictionary.getEmbeddedFiles();
    if (embeddedFilesRoot == null) {
      return;
    }

    // For now only the root and its direct kids are consulted, following
    // pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java. A fully recursive search could be added
    // if there is a need.
    Map<String, PDComplexFileSpecification> rootNames = embeddedFilesRoot.getNames();
    if (rootNames != null) {
      processEmbeddedDocNames(rootNames);
      return;
    }

    List<PDNameTreeNode<PDComplexFileSpecification>> kids = embeddedFilesRoot.getKids();
    if (kids == null) {
      return;
    }
    for (PDNameTreeNode<PDComplexFileSpecification> kid : kids) {
      Map<String, PDComplexFileSpecification> kidNames = kid.getNames();
      if (kidNames != null) {
        processEmbeddedDocNames(kidNames);
      }
    }
  }

Пример #20

0

Показать файл

Файл: pdfParser.java Проект: supertanglang/yacy_search_server

 /**
  * Extracts the clickable URI links of a PDF, page by page.
  *
  * <p>The returned array has one collection per page, in page order. If reading a page's
  * annotations fails with an {@code IOException}, that page keeps the links collected up to the
  * failure.
  *
  * @param pdf the document to parse
  * @return all detected links, one collection per page
  */
 private Collection<AnchorURL>[] extractPdfLinks(final PDDocument pdf) {
   @SuppressWarnings("unchecked")
   List<PDPage> allPages = pdf.getDocumentCatalog().getAllPages();
   @SuppressWarnings("unchecked")
   Collection<AnchorURL>[] linkCollections =
       (Collection<AnchorURL>[]) new Collection<?>[allPages.size()];
   int pageSlot = 0;
   for (PDPage page : allPages) {
     final Collection<AnchorURL> pageLinks = new ArrayList<AnchorURL>();
     try {
       List<PDAnnotation> annotations = page.getAnnotations();
       if (annotations != null) {
         for (PDAnnotation annotation : annotations) {
           if (!(annotation instanceof PDAnnotationLink)) {
             continue;
           }
           PDAction action = ((PDAnnotationLink) annotation).getAction();
           // instanceof is false for null, so no separate null check is needed.
           if (action instanceof PDActionURI) {
             String uri = ((PDActionURI) action).getURI();
             pageLinks.add(new AnchorURL(uri));
           }
         }
       }
     } catch (IOException ex) {
       // Deliberately ignored: the page keeps whatever links were collected so far.
     }
     linkCollections[pageSlot++] = pageLinks;
   }
   return linkCollections;
 }

Пример #21

0

Показать файл

Файл: TextConvertor.java Проект: policygrid/ourSpaces

  /**
   * Detects the type of the input file from its first two bytes and converts PDF, DOCX and DOC
   * files to text.
   *
   * <p>Files whose leading bytes match none of the three signatures are ignored. Any failure is
   * printed to standard error. Every stream opened here is closed before the constructor returns.
   *
   * @param infile path of the file to extract text from
   * @param outfile path handed to the conversion method for the detected type
   */
  public TextConvertor(String infile, String outfile) {
    FileInputStream fis = null;
    try {
      File input = new File(infile); // The file from where you would like to extract

      // Read the two signature bytes with a short-lived stream of its own.
      int x;
      int y;
      FileInputStream probe = new FileInputStream(input.getAbsolutePath());
      try {
        x = probe.read();
        y = probe.read();
      } finally {
        probe.close();
      }

      if (x == 37 && y == 80) { // "%P"
        filetype = "pdf";
        pd = PDDocument.load(input);
        PDF2Text(outfile);
      } else if (x == 80 && y == 75) { // "PK"
        filetype = "docx";
        fis = new FileInputStream(input.getAbsolutePath());
        dx = new XWPFDocument(fis);
        DOCX2Text(outfile);
      } else if (x == 208 && y == 207) { // 0xD0 0xCF
        filetype = "doc";
        fis = new FileInputStream(input.getAbsolutePath());
        dc = new HWPFDocument(fis);
        DOC2Text(outfile);
      }

    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      if (fis != null) {
        try {
          fis.close();
        } catch (java.io.IOException ignored) {
          // Best-effort cleanup; the conversion outcome has already been decided.
        }
      }
    }
  }

Пример #22

0

Показать файл

Файл: Pdf2CasConverter.java Проект: renaud/dkpro-core

  /**
   * Loads a PDF from the stream and writes its text through {@code writeText(PDDocument)}.
   *
   * <p>Before writing, {@code cas} is set to {@code aCas} and {@code text} is reset to an empty
   * builder. The document is closed on every exit path.
   *
   * @param aCas the CAS stored in {@code cas} for the duration of the conversion
   * @param aIs stream containing the PDF
   * @throws IOException if the document cannot be loaded, is encrypted, or cannot be processed
   */
  public void writeText(final CAS aCas, final InputStream aIs) throws IOException {
    final PDDocument pdf = PDDocument.load(aIs);

    try {
      final boolean encrypted = pdf.isEncrypted();
      if (encrypted) {
        throw new IOException("Encrypted documents currently not supported");
      }

      cas = aCas;
      text = new StringBuilder();

      writeText(pdf);
    } finally {
      pdf.close();
    }
  }

Пример #23

0

Показать файл

Файл: WarehouseReportPDF.java Проект: JoshFajardo/storageQuest

  /**
   * Saves the report document to the given file.
   *
   * @param fileName path of the file to write
   * @throws ReportException if saving fails; the message carries the underlying error message
   */
  @Override
  public void outputReportToFile(String fileName) throws ReportException {
    try {
      doc.save(fileName);
    } catch (IOException saveFailure) {
      final String reason = "Error in report save to file: " + saveFailure.getMessage();
      throw new ReportException(reason);
    }
  }

Пример #24

0

Показать файл

Файл: WarehouseReportPDF.java Проект: JoshFajardo/storageQuest

 /**
  * Closes the report: first the superclass, then the underlying document. A failure while
  * closing the document is printed to standard error and not propagated.
  */
 @Override
 public void close() {
   super.close();
   try {
     doc.close();
   } catch (IOException closeFailure) {
     closeFailure.printStackTrace();
   }
 }

Пример #25

0

Показать файл

Файл: ExtractPageContent.java Проект: slonka/project1_jtp2

 /**
  * Reads the contents of the PDF at {@code filePath}.
  *
  * <p>The file is first read through {@code PdfReader} and {@code getContents()}. If that fails
  * for any reason, the text is extracted with {@code PDFTextStripper} instead and stored in
  * {@code fileContents}. If the fallback fails as well, {@code fileContents} is left unchanged.
  *
  * @param filePath path of the PDF file
  */
 ExtractPageContent(String filePath) {
   this.filePath = filePath;
   try {
     reader = new PdfReader(filePath);
     parser = new PdfReaderContentParser(reader);
     getContents();
   } catch (Exception e) {
     // Primary reader failed; fall back to PDFBox text extraction.
     try {
       PDDocument doc = PDDocument.load(filePath);
       try {
         PDFTextStripper stripper = new PDFTextStripper();
         this.fileContents = stripper.getText(doc);
       } finally {
         // Release the document even when text extraction throws.
         doc.close();
       }
     } catch (IOException ignored) {
       // Best effort: both readers failed, so the contents stay unset.
     }
   }
 }

Пример #26

0

Показать файл

Файл: IndexadorPDF.java Проект: DinamicArea/formacion-lucene

  /**
   * Indexes a single PDF file, page by page.
   *
   * <p>Every page from 1 to the page count is extracted and lower-cased. The creation of the
   * per-page index document is still a placeholder in this method. The PDF is closed on every exit
   * path.
   *
   * @param f the PDF file
   * @param writer the IndexWriter
   * @throws IOException if the file cannot be loaded or a page cannot be extracted
   */
  public static void indexFile(File f, IndexWriter writer) throws IOException {

    // Load the file through PDFBox
    PDDocument pddDocument = PDDocument.load(f.getAbsolutePath());
    try {
      PDFTextStripper textStripper = new PDFTextStripper();
      int numPages = pddDocument.getNumberOfPages();
      String pageContent;

      // Declare a custom field type
      FieldType fieldText = new FieldType();
      fieldText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
      fieldText.setStored(false);
      fieldText.setStoreTermVectorOffsets(true);
      fieldText.setStoreTermVectorPositions(true);
      fieldText.setStoreTermVectors(true);

      // Walk every page of the file. The stripper is given pages 1..numPages; the previous
      // 0-based loop with an "i == 0 -> i++" adjustment never reached the last page of a
      // multi-page file.
      for (int pageNumber = 1; pageNumber <= numPages; pageNumber++) {
        textStripper.setStartPage(pageNumber);
        textStripper.setEndPage(pageNumber);
        // Extract one page
        pageContent = textStripper.getText(pddDocument);
        if (pageContent != null && !pageContent.isEmpty()) {
          pageContent = pageContent.toLowerCase();
        }

        if (pageContent != null) {
          // Declare the document to index for this page

          // Page number
          // Page content
          // File title

          // Add the document
        }
      }
    } finally {
      // Close the PDF file
      pddDocument.close();
    }
  }

Пример #27

0

Показать файл

Файл: PDPageLabels.java Проект: joaopauloribeiro/PDF-to-standard-HTML

 /**
  * Returns a mapping with computed page labels as keys and the corresponding 0-based page indices
  * as values. The returned map contains at most as many entries as the document has pages.
  *
  * <p><strong>NOTE:</strong> If the document contains duplicate page labels, the returned map
  * contains <em>fewer</em> entries than the document has pages, because a later label replaces
  * an earlier entry with the same key.
  *
  * @return a mapping from labels to 0-based page indices.
  */
 public Map<String, Integer> getPageIndicesByLabels() {
   final int pageCount = doc.getNumberOfPages();
   final Map<String, Integer> indicesByLabel = new HashMap<String, Integer>(pageCount);
   final LabelHandler collector =
       new LabelHandler() {
         public void newLabel(int pageIndex, String label) {
           indicesByLabel.put(label, pageIndex);
         }
       };
   computeLabels(collector);
   return indicesByLabel;
 }

Пример #28

0

Показать файл

Файл: PDFReader.java Проект: kripke/friendlyreader

 /**
  * Opens the given PDF file, replacing any document that is currently open, and shows its first
  * page.
  *
  * <p>A previously opened document is closed and the document panel is cleared first. The file
  * stream is closed once the document has been parsed.
  *
  * @param file path of the PDF file to open
  * @throws Exception if the file cannot be opened or parsed, or the page cannot be shown
  */
 private void openPDFFile(String file) throws Exception {
   if (document != null) {
     document.close();
     documentPanel.removeAll();
   }
   File f = new File(file);
   InputStream input = new FileInputStream(f);
   try {
     document = parseDocument(input);
   } finally {
     // The stream is owned by this method; release the file handle after parsing.
     input.close();
   }
   pages = document.getDocumentCatalog().getAllPages();
   numberOfPages = pages.size();
   // AH* Page count for the GUI:
   sumPan.Sidantal.setText("" + 1 + " Av " + numberOfPages);
   sumPan.sidnrantal = numberOfPages;
   currentFilename = f.getAbsolutePath(); // AH* Removed in a later version.
   currentPage = 0;
   updateTitle();
   showPage(0);
 }

Пример #29

0

Показать файл

Файл: PdfAppendTest.java Проект: Rabeea/camel

  /**
   * Sends a one-page PDF containing "Test" together with the text "Append" to
   * {@code direct:start} and expects a two-page PDF containing both texts at the result endpoint.
   */
  @Test
  public void testAppend() throws Exception {
    final String originalText = "Test";
    final String textToAppend = "Append";
    PDDocument document = new PDDocument();
    PDPage page = new PDPage(PDPage.PAGE_SIZE_A4);
    document.addPage(page);
    PDPageContentStream contentStream = new PDPageContentStream(document, page);
    contentStream.setFont(PDType1Font.HELVETICA, 12);
    contentStream.beginText();
    contentStream.moveTextPositionByAmount(20, 400);
    contentStream.drawString(originalText);
    contentStream.endText();
    contentStream.close();

    template.sendBodyAndHeader(
        "direct:start", textToAppend, PdfHeaderConstants.PDF_DOCUMENT_HEADER_NAME, document);

    resultEndpoint.setExpectedMessageCount(1);
    resultEndpoint.expectedMessagesMatches(
        new Predicate() {
          @Override
          public boolean matches(Exchange exchange) {
            Object body = exchange.getIn().getBody();
            assertThat(body, instanceOf(ByteArrayOutputStream.class));
            try {
              PDDocument doc =
                  PDDocument.load(
                      new ByteArrayInputStream(((ByteArrayOutputStream) body).toByteArray()));
              try {
                PDFTextStripper pdfTextStripper = new PDFTextStripper();
                String text = pdfTextStripper.getText(doc);
                assertEquals(2, doc.getNumberOfPages());
                assertThat(text, containsString(originalText));
                assertThat(text, containsString(textToAppend));
              } finally {
                // Close the reloaded document even when an assertion fails.
                doc.close();
              }
            } catch (IOException e) {
              throw new RuntimeException(e);
            }
            return true;
          }
        });
    resultEndpoint.assertIsSatisfied();
  }

Пример #30

0

Показать файл

Файл: TestObjectExtractor.java Проект: burakcardakk/tabula-java

 /**
  * Verifies that extracting the first page of {@code labor.pdf} does not fail with a
  * {@code NullPointerException}. The document is closed when the test finishes.
  */
 @Test
 public void testDontThrowNPEInShfill() throws IOException {
   PDDocument pdf_document = PDDocument.load("src/test/resources/technology/tabula/labor.pdf");
   try {
     ObjectExtractor oe = new ObjectExtractor(pdf_document);
     PageIterator pi = oe.extract();
     try {
       // Only the absence of an NPE matters; the extracted page itself is not inspected.
       pi.next();
     } catch (NullPointerException e) {
       fail("NPE in ObjectExtractor " + e.toString());
     }
   } finally {
     pdf_document.close();
   }
 }

Java PDDocument примеры использования