Exemple #1
0
  /**
   * Copies selected entries of a PDF's Info dictionary onto an item as Dublin Core values.
   *
   * <p>The stream is parsed with PDFBox. Encrypted documents and documents without a
   * {@code Title} entry are rejected with a {@code MetadataValidationException}. After the
   * values have been added, {@code item.update()} is called. The underlying
   * {@code COSDocument} is closed on every exit path once it has been obtained.
   *
   * @param context the current context; not referenced inside this method
   * @param item the item that receives the DC values and is then updated
   * @param metadata stream holding the PDF document to read
   * @throws CrosswalkException declared by this method; NOTE(review): the
   *     {@code MetadataValidationException} thrown below is presumably a subtype -- confirm
   * @throws IOException declared by this method (parsing and closing the document)
   * @throws SQLException declared by this method
   * @throws AuthorizeException declared by this method
   */
  private void crosswalkPDF(Context context, Item item, InputStream metadata)
      throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cos = null;

    try {
      PDFParser parser = new PDFParser(metadata);
      parser.parse();
      cos = parser.getDocument();

      // sanity check: PDFBox breaks on encrypted documents, so give up.
      if (cos.getEncryptionDictionary() != null) {
        throw new MetadataValidationException(
            "This packager cannot accept an encrypted PDF document.");
      }

      /* PDF to DC "crosswalk":
       *
       * NOTE: This is not in a crosswalk plugin because (a) it isn't
       * useful anywhere else, and more importantly, (b) the source
       * data is not XML so it doesn't fit the plugin's interface.
       *
       * pattern of crosswalk -- PDF dict entries to DC:
       *   Title -> title.null
       *   Author -> contributor.author
       *   CreationDate -> date.created
       *   ModDate -> date.created (only when CreationDate is absent)
       *   Creator -> description.provenance (application that created orig)
       *   Producer -> description.provenance (convertor to pdf)
       *   Subject -> description.abstract
       *   Keywords -> subject.other
       *    date is java.util.Calendar
       */
      PDDocument pd = new PDDocument(cos);
      PDDocumentInformation docinfo = pd.getDocumentInformation();

      // sanity check: item must have a title.
      String title = docinfo.getTitle();
      if (title == null) {
        throw new MetadataValidationException(
            "This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
      }
      log.debug("PDF Info dict title=\"" + title + "\"");
      item.addDC("title", null, "en", title);

      String author = docinfo.getAuthor();
      if (author != null) {
        item.addDC("contributor", "author", null, author);
        log.debug("PDF Info dict author=\"" + author + "\"");
      }

      String creator = docinfo.getCreator();
      if (creator != null) {
        item.addDC(
            "description",
            "provenance",
            "en",
            "Application that created the original document: " + creator);
      }

      String producer = docinfo.getProducer();
      if (producer != null) {
        item.addDC(
            "description",
            "provenance",
            "en",
            "Original document converted to PDF by: " + producer);
      }

      String subject = docinfo.getSubject();
      if (subject != null) {
        item.addDC("description", "abstract", null, subject);
      }

      String keywords = docinfo.getKeywords();
      if (keywords != null) {
        item.addDC("subject", "other", null, keywords);
      }

      // Take either CreationDate or ModDate as "date.created",
      // Too bad there's no place to put "last modified" in the DC.
      Calendar created = docinfo.getCreationDate();
      if (created == null) {
        created = docinfo.getModificationDate();
      }
      if (created != null) {
        item.addDC("date", "created", null, new DCDate(created.getTime()).toString());
      }

      item.update();
    } finally {
      // Only the COSDocument is closed here; it stays null if parsing failed before
      // getDocument() returned, in which case there is nothing to close.
      if (cos != null) {
        cos.close();
      }
    }
  }
package lius.index.pdf;