Java PDDocument.getDocumentInformationの例

プログラミング言語: Java

名前空間/パッケージ名: org.apache.pdfbox.pdmodel

クラス/型: PDDocument

メソッド/関数: getDocumentInformation

hotexamples.comのコード掲載数: 3

Java PDDocument.getDocumentInformation - 3件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたJavaのorg.apache.pdfbox.pdmodel.PDDocument.getDocumentInformationの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

load(30)

close(30)

getDocumentCatalog(19)

getNumberOfPages(12)

isEncrypted(11)

save(10)

addPage(7)

getDocument(5)

decrypt(4)

getSignatureDictionaries(3)

saveIncremental(3)

getPage(3)

getDocumentInformation(3)

getEncryptionDictionary(2)

protect(2)

getDocumentId(2)

getCurrentAccessPermission(2)

setDocumentId(2)

addSignature(1)

importPage(1)

loadNonSeq(1)

openProtection(1)

setEncryptionDictionary(1)

コード例 #1

ファイルを表示

ファイル: PDFIndexer.java プロジェクト: schoenm1/lucene

  /**
   * Extracts the text and the document-information metadata of a PDF and adds them as fields to
   * the given Lucene document.
   *
   * <p>Encrypted documents are decrypted with the empty password. A failure while extracting the
   * text is reported on standard output and indexing continues with whatever text was written
   * before the failure. Unparseable creation/modification dates are skipped.
   *
   * @param document The document to add the contents to.
   * @param is The stream to get the contents from.
   * @param documentLocation The location of the document, used just for debug messages.
   * @throws IOException If there is an error parsing the document, or if it is encrypted and
   *     cannot be decrypted with the empty password (the original exception is attached as the
   *     cause).
   */
  private void addContent(Document document, InputStream is, String documentLocation)
      throws IOException {
    PDDocument pdfDocument = null;
    try {
      pdfDocument = PDDocument.load(is);
      if (pdfDocument.isEncrypted()) {
        // Just try using the default password and move on
        pdfDocument.decrypt("");
      }

      // create a writer where to append the text content.
      StringWriter writer = new StringWriter();
      PDFTextStripper stripper = new PDFTextStripper();
      try {
        stripper.writeText(pdfDocument, writer);
      } catch (Exception e) {
        // Deliberately best effort: keep indexing the metadata and any text extracted so far,
        // but say which document failed and why.
        System.out.println(
            "Error in stripper.writeText() for document(" + documentLocation + "): " + e);
      }
      String contents = writer.getBuffer().toString();

      StringReader reader = new StringReader(contents);
      addTextField(document, Indexer.contents, reader);
      PDDocumentInformation info = pdfDocument.getDocumentInformation();
      if (info != null) {
        addTextField(document, Indexer.Author, info.getAuthor());
        try {
          addTextField(document, Indexer.created, info.getCreationDate());
        } catch (IOException ignored) {
          // ignore, bad date but continue with indexing
        }

        addTextField(document, Indexer.keywords, info.getKeywords());
        try {
          addTextField(document, Indexer.modified, info.getModificationDate());
        } catch (IOException ignored) {
          // ignore, bad date but continue with indexing
        }
        addTextField(document, "Subject", info.getSubject());
        addTextField(document, Indexer.Title, info.getTitle());
      }
      // Add the first 500 characters (or fewer) as an unindexed summary field, so that it is
      // stored and returned with hit documents for display.
      int summarySize = Math.min(contents.length(), 500);
      String summary = contents.substring(0, summarySize);
      addUnindexedField(document, Indexer.summary, summary);
    } catch (CryptographyException e) {
      // keep the original exception as the cause so the stack trace is not lost
      throw new IOException("Error decrypting document(" + documentLocation + "): " + e, e);
    } catch (InvalidPasswordException e) {
      // they didn't supply a password and the default of "" was wrong.
      throw new IOException(
          "Error: The document(" + documentLocation + ") is encrypted and will not be indexed.", e);
    } finally {
      if (pdfDocument != null) {
        pdfDocument.close();
      }
    }
  }

コード例 #2

ファイルを表示

ファイル: LucenePDFDocument.java プロジェクト: lumpchen/pdfbox_ua

  /**
   * Indexes a PDF: its extracted text, its document-information entries and a short summary are
   * added as fields to the given Lucene document.
   *
   * <p>The PDF is loaded with the empty password, and is always closed before this method
   * returns.
   *
   * @param document The document to add the contents to.
   * @param is The stream to get the contents from.
   * @param documentLocation The location of the document, used just for debug messages.
   * @throws IOException If there is an error parsing the document, or if the document is
   *     encrypted with a password other than the empty one.
   */
  private void addContent(Document document, InputStream is, String documentLocation)
      throws IOException {
    PDDocument pdf = null;
    try {
      pdf = PDDocument.load(is, "");

      // The text stripper is created lazily on first use and then reused.
      StringWriter textSink = new StringWriter();
      if (stripper == null) {
        stripper = new PDFTextStripper();
      }
      stripper.writeText(pdf, textSink);
      String text = textSink.toString();

      // The extracted text goes in as a Reader-valued field so that it is tokenized and indexed.
      addTextField(document, "contents", new StringReader(text));

      PDDocumentInformation metadata = pdf.getDocumentInformation();
      if (metadata != null) {
        addTextField(document, "Author", metadata.getAuthor());
        addTextField(document, "CreationDate", metadata.getCreationDate());
        addTextField(document, "Creator", metadata.getCreator());
        addTextField(document, "Keywords", metadata.getKeywords());
        addTextField(document, "ModificationDate", metadata.getModificationDate());
        addTextField(document, "Producer", metadata.getProducer());
        addTextField(document, "Subject", metadata.getSubject());
        addTextField(document, "Title", metadata.getTitle());
        addTextField(document, "Trapped", metadata.getTrapped());
      }

      // The leading 500 characters (or the whole text when shorter) become an unindexed summary
      // field that is stored and returned with hit documents for display.
      String summary = text.substring(0, Math.min(text.length(), 500));
      addUnindexedField(document, "summary", summary);
    } catch (InvalidPasswordException e) {
      // No password was supplied and the default of "" did not open the document.
      throw new IOException(
          "Error: The document(" + documentLocation + ") is encrypted and will not be indexed.", e);
    } finally {
      if (pdf != null) {
        pdf.close();
      }
    }
  }

コード例 #3

ファイルを表示

ファイル: pdfParser.java プロジェクト: supertanglang/yacy_search_server

  /**
   * Parses a PDF stream into one or more index documents.
   *
   * <p>The method first requests 200 MB from {@code MemoryControl}, then parses the stream with a
   * {@code PDFParser}. Encrypted documents are opened with the empty password and must permit
   * content extraction. Title, subject, author, producer/creator, keywords and modification date
   * are read from the document information; the title falls back to the file name of {@code
   * location} and then to the subject.
   *
   * <p>When {@code individualPages} is set, one document per page is produced, each with a
   * virtual URL carrying the page number. Otherwise a single document is produced: the first
   * three pages are extracted synchronously, the remaining pages in a helper thread that is given
   * three seconds before it is interrupted.
   *
   * @param location URL of the PDF; used for error reporting, the fallback title and result URLs
   * @param mimeType mime type passed through to the created documents
   * @param charset not used by this method
   * @param scraper not used by this method
   * @param timezoneOffset not used by this method
   * @param source stream to read the PDF from
   * @return the created documents, or {@code null} when link or text extraction failed (such
   *     failures are deliberately swallowed)
   * @throws Parser.Failure if memory is insufficient, the PDF cannot be parsed, or it is
   *     encrypted and cannot be opened for content extraction
   * @throws InterruptedException if the calling thread is interrupted while waiting for the text
   *     extraction thread
   */
  @Override
  public Document[] parse(
      final AnchorURL location,
      final String mimeType,
      final String charset,
      final VocabularyScraper scraper,
      final int timezoneOffset,
      final InputStream source)
      throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
      throw new Parser.Failure(
          "Not enough Memory available for pdf parser: " + MemoryControl.available(), location);

    // create a pdf parser
    PDDocument pdfDoc;
    try {
      Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
      // pdfDoc = PDDocument.load(source);
      final PDFParser pdfParser = new PDFParser(source);
      pdfParser.setTempDirectory(new File(System.getProperty("java.io.tmpdir")));
      pdfParser.parse();
      pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
      throw new Parser.Failure(e.getMessage(), location);
    } finally {
      Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    if (pdfDoc.isEncrypted()) {
      try {
        pdfDoc.openProtection(new StandardDecryptionMaterial(""));
      } catch (final BadSecurityHandlerException e) {
        try {
          pdfDoc.close();
        } catch (final IOException ee) {
          // best-effort close; the Parser.Failure below is the error that matters
        }
        throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
      } catch (final IOException e) {
        try {
          pdfDoc.close();
        } catch (final IOException ee) {
          // best-effort close; the Parser.Failure below is the error that matters
        }
        throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
      } catch (final CryptographyException e) {
        try {
          pdfDoc.close();
        } catch (final IOException ee) {
          // best-effort close; the Parser.Failure below is the error that matters
        }
        throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
      }
      final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
      if (perm == null || !perm.canExtractContent()) {
        try {
          pdfDoc.close();
        } catch (final IOException ee) {
          // best-effort close; the Parser.Failure below is the error that matters
        }
        throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
      }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null,
        docSubject = null,
        docAuthor = null,
        docPublisher = null,
        docKeywordStr = null;
    Date docDate = new Date(); // defaults to "now" when no modification date is available
    if (info != null) {
      docTitle = info.getTitle();
      docSubject = info.getSubject();
      docAuthor = info.getAuthor();
      docPublisher = info.getProducer();
      if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator();
      docKeywordStr = info.getKeywords();
      try {
        if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime();
      } catch (IOException e) {
        // unreadable modification date: keep the default date
      }
      // unused:
      // info.getTrapped());
    }
    info = null;

    if (docTitle == null || docTitle.isEmpty()) {
      docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
      docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
      // keywords are separated by a single space or a comma
      docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
      // get the links
      pdflinks = extractPdfLinks(pdfDoc);

      // get the fulltext (either per document or for each page)
      final PDFTextStripper stripper = new PDFTextStripper(StandardCharsets.UTF_8.name());

      if (individualPages) {
        // this is a hack which stores individual pages of the source pdf into individual index
        // documents
        // the new documents will get a virtual link with a post argument page=X appended to the
        // original url

        // collect text
        int pagecount = pdfDoc.getNumberOfPages();
        String[] pages = new String[pagecount];
        for (int page = 1; page <= pagecount; page++) {
          stripper.setStartPage(page);
          stripper.setEndPage(page);
          pages[page - 1] = stripper.getText(pdfDoc);
          // System.out.println("PAGE " + page + ": " + pages[page - 1]);
        }

        // create individual documents for each page
        assert pages.length == pdflinks.length
            : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length;
        result = new Document[Math.min(pages.length, pdflinks.length)];
        String loc = location.toNormalform(true);
        for (int page = 0; page < result.length; page++) {
          // these are virtual new pages; we cannot combine them with '#' as that would be
          // removed when computing the urlhash
          result[page] =
              new Document(
                  new AnchorURL(
                      loc
                          + (loc.indexOf('?') > 0 ? '&' : '?')
                          + individualPagePropertyname
                          + '='
                          + (page + 1)),
                  mimeType,
                  StandardCharsets.UTF_8.name(),
                  this,
                  null,
                  docKeywords,
                  singleList(docTitle),
                  docAuthor,
                  docPublisher,
                  null,
                  null,
                  0.0f,
                  0.0f,
                  // bounds guard uses >= so that it matches the pdflinks guard below
                  pages == null || page >= pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                  pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
                  null,
                  null,
                  false,
                  docDate);
        }
      } else {
        // collect the whole text at once
        final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
        byte[] contentBytes = new byte[0];
        stripper.setEndPage(3); // get first 3 pages (always)
        writer.append(stripper.getText(pdfDoc));
        contentBytes = writer.getBytes(); // remember text in case of interrupting thread

        if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
          stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
          stripper.setEndPage(Integer.MAX_VALUE); // set to default
          // we start the pdf parsing in a separate thread to ensure that it can be terminated
          final PDDocument pdfDocC = pdfDoc;
          final Thread t =
              new Thread() {
                @Override
                public void run() {
                  Thread.currentThread().setName("pdfParser.getText:" + location);
                  try {
                    writer.append(stripper.getText(pdfDocC));
                  } catch (final Throwable e) {
                    // best effort: keep whatever text was appended before the failure
                  }
                }
              };
          t.start();
          try {
            t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
          } catch (final InterruptedException e) {
            // the caller was interrupted: stop the worker as well and pass the interruption on
            t.interrupt();
            throw e;
          }
          if (t.isAlive()) t.interrupt();
        }
        contentBytes = writer.getBytes(); // get final text before closing writer

        Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
        for (Collection<AnchorURL> pdflinksx : pdflinks)
          if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
        result =
            new Document[] {
              new Document(
                  location,
                  mimeType,
                  StandardCharsets.UTF_8.name(),
                  this,
                  null,
                  docKeywords,
                  singleList(docTitle),
                  docAuthor,
                  docPublisher,
                  null,
                  null,
                  0.0f,
                  0.0f,
                  contentBytes,
                  pdflinksCombined,
                  null,
                  null,
                  false,
                  docDate)
            };
      }
    } catch (final InterruptedException e) {
      // do not swallow interruption: the method declares it and callers rely on it
      // (the document is still closed in finally)
      throw e;
    } catch (final Throwable e) {
      // deliberately best effort: any other failure leaves result as it is (possibly null)
      // throw new Parser.Failure(e.getMessage(), location);
    } finally {
      try {
        pdfDoc.close();
      } catch (final Throwable e) {
        // best-effort close
      }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // the pdfbox still generates an enormous number of object allocations and doesn't delete them
    // the following Objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java Visual VM
    // we try to get these objects out of memory here by forced clear calls and hope for the best.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
  }