Пример #1
0
  String pdf2Text(File fll) {

    String fileName = fll.getName();

    System.out.println("Parsing PDF file " + fileName + "...");
    File fl = fll;

    if (!fl.isFile()) {
      System.out.println("The File : " + fileName + " does not exist!");
      return null;
    }

    try {
      parser = new PDFParser(new FileInputStream(fl));
    } catch (Exception e) {
      System.out.println("Could not open PDF Parser.");
      return null;
    }

    try {
      parser.parse();
      csDoc = parser.getDocument();

      pdfStrp = new PDFTextStripper();
      pdDoc = new PDDocument(csDoc);
      NewText = pdfStrp.getText(pdDoc);

    } catch (Exception e) {
      System.out.println("Error while parsing PDF file!");
      e.printStackTrace();
      try {
        if (csDoc != null) csDoc.close();
        if (pdDoc != null) pdDoc.close();
      } catch (Exception e1) {
        e.printStackTrace();
      }
      return null;
    }
    System.out.println("Done.");
    return NewText;
  }
Пример #2
0
  private void crosswalkPDF(Context context, Item item, InputStream metadata)
      throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cos = null;

    try {
      PDFParser parser = new PDFParser(metadata);
      parser.parse();
      cos = parser.getDocument();

      // sanity check: PDFBox breaks on encrypted documents, so give up.
      if (cos.getEncryptionDictionary() != null)
        throw new MetadataValidationException(
            "This packager cannot accept an encrypted PDF document.");

      /* PDF to DC "crosswalk":
       *
       * NOTE: This is not in a crosswalk plugin because (a) it isn't
       * useful anywhere else, and more importantly, (b) the source
       * data is not XML so it doesn't fit the plugin's interface.
       *
       * pattern of crosswalk -- PDF dict entries to DC:
       *   Title -> title.null
       *   Author -> contributor.author
       *   CreationDate -> date.created
       *   ModDate -> date.created
       *   Creator -> description.provenance (application that created orig)
       *   Producer -> description.provenance (convertor to pdf)
       *   Subject -> description.abstract
       *   Keywords -> subject.other
       *    date is java.util.Calendar
       */
      PDDocument pd = new PDDocument(cos);
      PDDocumentInformation docinfo = pd.getDocumentInformation();
      String title = docinfo.getTitle();

      // sanity check: item must have a title.
      if (title == null)
        throw new MetadataValidationException(
            "This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
      log.debug("PDF Info dict title=\"" + title + "\"");
      item.addDC("title", null, "en", title);
      String value;
      Calendar date;
      if ((value = docinfo.getAuthor()) != null) {
        item.addDC("contributor", "author", null, value);
        log.debug("PDF Info dict author=\"" + value + "\"");
      }
      if ((value = docinfo.getCreator()) != null)
        item.addDC(
            "description",
            "provenance",
            "en",
            "Application that created the original document: " + value);
      if ((value = docinfo.getProducer()) != null)
        item.addDC(
            "description", "provenance", "en", "Original document converted to PDF by: " + value);
      if ((value = docinfo.getSubject()) != null)
        item.addDC("description", "abstract", null, value);
      if ((value = docinfo.getKeywords()) != null) item.addDC("subject", "other", null, value);

      // Take either CreationDate or ModDate as "date.created",
      // Too bad there's no place to put "last modified" in the DC.
      Calendar calValue;
      if ((calValue = docinfo.getCreationDate()) == null) calValue = docinfo.getModificationDate();
      if (calValue != null)
        item.addDC("date", "created", null, (new DCDate(calValue.getTime())).toString());
      item.update();
    } finally {
      if (cos != null) cos.close();
    }
  }
Пример #3
0
  /**
   * Create new Item out of the ingested package, in the indicated collection. It creates a
   * workspace item, which the application can then install if it chooses to bypass Workflow.
   *
   * <p>This is a VERY crude import of a single Adobe PDF (Portable Document Format) file, using the
   * document's embedded metadata for package metadata. If the PDF file hasn't got the minimal
   * metadata available, it is rejected.
   *
   * <p>
   *
   * @param context DSpace context.
   * @param collection collection under which to create new item.
   * @param pkg input stream containing package to ingest.
   * @param params package parameters (none recognized)
   * @param license may be null, which takes default license.
   * @return workspace item created by ingest.
   * @throws PackageException if package is unacceptable or there is a fatal error turning it into
   *     an Item.
   */
  public WorkspaceItem ingest(
      Context context,
      Collection collection,
      InputStream pkg,
      PackageParameters params,
      String license)
      throws PackageValidationException, CrosswalkException, AuthorizeException, SQLException,
          IOException {
    InputStream bis = null;
    COSDocument cos = null;
    boolean success = false;
    Bundle original = null;
    Bitstream bs = null;
    WorkspaceItem wi = null;

    /**
     * XXX comment out for now // XXX for debugging of parameter handling if (params != null) {
     * Enumeration pe = params.propertyNames(); while (pe.hasMoreElements()) { String name =
     * (String)pe.nextElement(); String v[] = params.getProperties(name); StringBuffer msg = new
     * StringBuffer("PackageParam: "); msg.append(name).append(" = "); for (int i = 0; i < v.length;
     * ++i) { if (i > 0) msg.append(", "); msg.append(v[i]); } log.debug(msg); } }
     */
    try {
      // Save the PDF in a bitstream first, since the parser
      // has to read it as well, and we cannot "rewind" it after that.
      wi = WorkspaceItem.create(context, collection, false);
      Item myitem = wi.getItem();
      original = myitem.createBundle("ORIGINAL");
      bs = original.createBitstream(pkg);
      pkg.close();
      bs.setName("package.pdf");
      setFormatToMIMEType(context, bs, "application/pdf");
      bs.update();
      log.debug("Created bitstream ID=" + String.valueOf(bs.getID()) + ", parsing...");

      crosswalkPDF(context, myitem, bs.retrieve());

      wi.update();
      context.commit();
      success = true;
      log.info(
          LogManager.getHeader(
              context,
              "ingest",
              "Created new Item, db ID="
                  + String.valueOf(myitem.getID())
                  + ", WorkspaceItem ID="
                  + String.valueOf(wi.getID())));
      return wi;
    } finally {
      try {
        // Close bitstream input stream and PDF file.
        if (bis != null) bis.close();
        if (cos != null) cos.close();
      } catch (IOException ie) {
      }

      // get rid of bitstream and item if ingest fails
      if (!success) {
        if (original != null && bs != null) original.removeBitstream(bs);
        if (wi != null) wi.deleteAll();
      }
      context.commit();
    }
  }