/**
 * Extracts the text content of a PDF file.
 *
 * <p>Progress and error messages are printed to standard output. The intermediate parser,
 * document and stripper objects, as well as the extracted text, are stored in variables declared
 * outside this method ({@code parser}, {@code csDoc}, {@code pdfStrp}, {@code pdDoc},
 * {@code NewText}).
 *
 * <p>The file input stream and both PDF document objects are always closed before this method
 * returns, whether extraction succeeded or failed, so the document objects left in
 * {@code csDoc} and {@code pdDoc} must not be used afterwards.
 *
 * @param fll the PDF file to read; must not be null.
 * @return the extracted text, or {@code null} if {@code fll} is not a regular file, the parser
 *     could not be created, or parsing / text extraction failed.
 */
String pdf2Text(File fll) {
  String fileName = fll.getName();
  System.out.println("Parsing PDF file " + fileName + "...");

  if (!fll.isFile()) {
    System.out.println("The File : " + fileName + " does not exist!");
    return null;
  }

  FileInputStream in = null;
  try {
    try {
      in = new FileInputStream(fll);
      parser = new PDFParser(in);
    } catch (Exception e) {
      System.out.println("Could not open PDF Parser.");
      return null;
    }

    try {
      parser.parse();
      csDoc = parser.getDocument();
      pdfStrp = new PDFTextStripper();
      pdDoc = new PDDocument(csDoc);
      NewText = pdfStrp.getText(pdDoc);
    } catch (Exception e) {
      System.out.println("Error while parsing PDF file!");
      e.printStackTrace();
      return null;
    } finally {
      // Release the documents on success as well as on failure. Each close is guarded
      // separately so that a failure in one does not prevent the other from running.
      try {
        if (csDoc != null) {
          csDoc.close();
        }
      } catch (Exception closeError) {
        closeError.printStackTrace();
      }
      try {
        if (pdDoc != null) {
          pdDoc.close();
        }
      } catch (Exception closeError) {
        closeError.printStackTrace();
      }
    }

    System.out.println("Done.");
    return NewText;
  } finally {
    // The stream was opened by this method, so it is this method's job to close it.
    if (in != null) {
      try {
        in.close();
      } catch (Exception closeError) {
        closeError.printStackTrace();
      }
    }
  }
}
private void crosswalkPDF(Context context, Item item, InputStream metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException { COSDocument cos = null; try { PDFParser parser = new PDFParser(metadata); parser.parse(); cos = parser.getDocument(); // sanity check: PDFBox breaks on encrypted documents, so give up. if (cos.getEncryptionDictionary() != null) throw new MetadataValidationException( "This packager cannot accept an encrypted PDF document."); /* PDF to DC "crosswalk": * * NOTE: This is not in a crosswalk plugin because (a) it isn't * useful anywhere else, and more importantly, (b) the source * data is not XML so it doesn't fit the plugin's interface. * * pattern of crosswalk -- PDF dict entries to DC: * Title -> title.null * Author -> contributor.author * CreationDate -> date.created * ModDate -> date.created * Creator -> description.provenance (application that created orig) * Producer -> description.provenance (convertor to pdf) * Subject -> description.abstract * Keywords -> subject.other * date is java.util.Calendar */ PDDocument pd = new PDDocument(cos); PDDocumentInformation docinfo = pd.getDocumentInformation(); String title = docinfo.getTitle(); // sanity check: item must have a title. 
if (title == null) throw new MetadataValidationException( "This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary."); log.debug("PDF Info dict title=\"" + title + "\""); item.addDC("title", null, "en", title); String value; Calendar date; if ((value = docinfo.getAuthor()) != null) { item.addDC("contributor", "author", null, value); log.debug("PDF Info dict author=\"" + value + "\""); } if ((value = docinfo.getCreator()) != null) item.addDC( "description", "provenance", "en", "Application that created the original document: " + value); if ((value = docinfo.getProducer()) != null) item.addDC( "description", "provenance", "en", "Original document converted to PDF by: " + value); if ((value = docinfo.getSubject()) != null) item.addDC("description", "abstract", null, value); if ((value = docinfo.getKeywords()) != null) item.addDC("subject", "other", null, value); // Take either CreationDate or ModDate as "date.created", // Too bad there's no place to put "last modified" in the DC. Calendar calValue; if ((calValue = docinfo.getCreationDate()) == null) calValue = docinfo.getModificationDate(); if (calValue != null) item.addDC("date", "created", null, (new DCDate(calValue.getTime())).toString()); item.update(); } finally { if (cos != null) cos.close(); } }
/** * Create new Item out of the ingested package, in the indicated collection. It creates a * workspace item, which the application can then install if it chooses to bypass Workflow. * * <p>This is a VERY crude import of a single Adobe PDF (Portable Document Format) file, using the * document's embedded metadata for package metadata. If the PDF file hasn't got the minimal * metadata available, it is rejected. * * <p> * * @param context DSpace context. * @param collection collection under which to create new item. * @param pkg input stream containing package to ingest. * @param params package parameters (none recognized) * @param license may be null, which takes default license. * @return workspace item created by ingest. * @throws PackageException if package is unacceptable or there is a fatal error turning it into * an Item. */ public WorkspaceItem ingest( Context context, Collection collection, InputStream pkg, PackageParameters params, String license) throws PackageValidationException, CrosswalkException, AuthorizeException, SQLException, IOException { InputStream bis = null; COSDocument cos = null; boolean success = false; Bundle original = null; Bitstream bs = null; WorkspaceItem wi = null; /** * XXX comment out for now // XXX for debugging of parameter handling if (params != null) { * Enumeration pe = params.propertyNames(); while (pe.hasMoreElements()) { String name = * (String)pe.nextElement(); String v[] = params.getProperties(name); StringBuffer msg = new * StringBuffer("PackageParam: "); msg.append(name).append(" = "); for (int i = 0; i < v.length; * ++i) { if (i > 0) msg.append(", "); msg.append(v[i]); } log.debug(msg); } } */ try { // Save the PDF in a bitstream first, since the parser // has to read it as well, and we cannot "rewind" it after that. 
wi = WorkspaceItem.create(context, collection, false); Item myitem = wi.getItem(); original = myitem.createBundle("ORIGINAL"); bs = original.createBitstream(pkg); pkg.close(); bs.setName("package.pdf"); setFormatToMIMEType(context, bs, "application/pdf"); bs.update(); log.debug("Created bitstream ID=" + String.valueOf(bs.getID()) + ", parsing..."); crosswalkPDF(context, myitem, bs.retrieve()); wi.update(); context.commit(); success = true; log.info( LogManager.getHeader( context, "ingest", "Created new Item, db ID=" + String.valueOf(myitem.getID()) + ", WorkspaceItem ID=" + String.valueOf(wi.getID()))); return wi; } finally { try { // Close bitstream input stream and PDF file. if (bis != null) bis.close(); if (cos != null) cos.close(); } catch (IOException ie) { } // get rid of bitstream and item if ingest fails if (!success) { if (original != null && bs != null) original.removeBitstream(bs); if (wi != null) wi.deleteAll(); } context.commit(); } }