@Override public Document[] parse( final AnchorURL location, final String mimeType, final String charset, final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) throws Parser.Failure, InterruptedException { // check memory for parser if (!MemoryControl.request(200 * 1024 * 1024, false)) throw new Parser.Failure( "Not enough Memory available for pdf parser: " + MemoryControl.available(), location); // create a pdf parser PDDocument pdfDoc; try { Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain // pdfDoc = PDDocument.load(source); final PDFParser pdfParser = new PDFParser(source); pdfParser.setTempDirectory(new File(System.getProperty("java.io.tmpdir"))); pdfParser.parse(); pdfDoc = pdfParser.getPDDocument(); } catch (final IOException e) { throw new Parser.Failure(e.getMessage(), location); } finally { Thread.currentThread().setPriority(Thread.NORM_PRIORITY); } if (pdfDoc.isEncrypted()) { try { pdfDoc.openProtection(new StandardDecryptionMaterial("")); } catch (final BadSecurityHandlerException e) { try { pdfDoc.close(); } catch (final IOException ee) { } throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location); } catch (final IOException e) { try { pdfDoc.close(); } catch (final IOException ee) { } throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location); } catch (final CryptographyException e) { try { pdfDoc.close(); } catch (final IOException ee) { } throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location); } final AccessPermission perm = pdfDoc.getCurrentAccessPermission(); if (perm == null || !perm.canExtractContent()) { try { pdfDoc.close(); } catch (final IOException ee) { } throw new Parser.Failure("Document is encrypted and cannot be decrypted", location); } } // extracting some metadata PDDocumentInformation info = pdfDoc.getDocumentInformation(); String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null; Date docDate = new Date(); if (info != null) { docTitle = info.getTitle(); docSubject = info.getSubject(); docAuthor = info.getAuthor(); docPublisher = info.getProducer(); if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator(); docKeywordStr = info.getKeywords(); try { if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime(); } catch (IOException e) { } // unused: // info.getTrapped()); } info = null; if (docTitle == null || docTitle.isEmpty()) { docTitle = MultiProtocolURL.unescape(location.getFileName()); } if (docTitle == null) { docTitle = docSubject; } String[] docKeywords = null; if (docKeywordStr != null) { docKeywords = docKeywordStr.split(" |,"); } Collection<AnchorURL>[] pdflinks = null; Document[] result = null; try { // get the links pdflinks = extractPdfLinks(pdfDoc); // get the fulltext (either per document or for each page) final PDFTextStripper stripper = new PDFTextStripper(StandardCharsets.UTF_8.name()); if (individualPages) { // this is a hack which stores individual pages of the source pdf into individual index // documents // the new documents will get a virtual link with a post argument page=X appended to the // original url // collect text int pagecount = pdfDoc.getNumberOfPages(); String[] pages = new String[pagecount]; for (int page = 1; page <= pagecount; page++) { stripper.setStartPage(page); stripper.setEndPage(page); pages[page - 1] = stripper.getText(pdfDoc); // System.out.println("PAGE " + page + ": " + pages[page - 1]); } // create individual documents for each page assert pages.length == pdflinks.length : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length; result = new Document[Math.min(pages.length, pdflinks.length)]; String loc = location.toNormalform(true); for (int page = 0; page < result.length; page++) { result[page] = new Document( new AnchorURL( loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' // as that would be removed when computing the urlhash mimeType, StandardCharsets.UTF_8.name(), this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), pdflinks == null || page >= pdflinks.length ? null : pdflinks[page], null, null, false, docDate); } } else { // collect the whole text at once final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); byte[] contentBytes = new byte[0]; stripper.setEndPage(3); // get first 3 pages (always) writer.append(stripper.getText(pdfDoc)); contentBytes = writer.getBytes(); // remember text in case of interrupting thread if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text) stripper.setEndPage(Integer.MAX_VALUE); // set to default // we start the pdf parsing in a separate thread to ensure that it can be terminated final PDDocument pdfDocC = pdfDoc; final Thread t = new Thread() { @Override public void run() { Thread.currentThread().setName("pdfParser.getText:" + location); try { writer.append(stripper.getText(pdfDocC)); } catch (final Throwable e) { } } }; t.start(); t.join(3000); // pdfbox likes to forget to terminate ... (quite often) if (t.isAlive()) t.interrupt(); } contentBytes = writer.getBytes(); // get final text before closing writer Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>(); for (Collection<AnchorURL> pdflinksx : pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); result = new Document[] { new Document( location, mimeType, StandardCharsets.UTF_8.name(), this, null, docKeywords, singleList(docTitle), docAuthor, docPublisher, null, null, 0.0f, 0.0f, contentBytes, pdflinksCombined, null, null, false, docDate) }; } } catch (final Throwable e) { // close the writer (in finally) // throw new Parser.Failure(e.getMessage(), location); } finally { try { pdfDoc.close(); } catch (final Throwable e) { } } // clear resources in pdfbox. they say that is resolved but it's not. see: // https://issues.apache.org/jira/browse/PDFBOX-313 // https://issues.apache.org/jira/browse/PDFBOX-351 // https://issues.apache.org/jira/browse/PDFBOX-441 // the pdfbox still generates enormeous number of object allocations and don't delete these // the following Object are statically stored and never flushed: // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary, // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull // the great number of these objects can easily be seen in Java Visual VM // we try to get this shit out of the memory here by forced clear calls, hope the best the // rubbish gets out. pdfDoc = null; clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); return result; }
/** * Starts the text extraction. * * @param args the commandline arguments. * @throws IOException if there is an error reading the document or extracting the text. */ public void startExtraction(String[] args) throws IOException { boolean toConsole = false; boolean toHTML = false; boolean sort = false; boolean separateBeads = true; String password = ""; String encoding = "UTF-8"; String pdfFile = null; String outputFile = null; // Defaults to text files String ext = ".txt"; int startPage = 1; int endPage = Integer.MAX_VALUE; for (int i = 0; i < args.length; i++) { switch (args[i]) { case PASSWORD: i++; if (i >= args.length) { usage(); } password = args[i]; break; case ENCODING: i++; if (i >= args.length) { usage(); } encoding = args[i]; break; case START_PAGE: i++; if (i >= args.length) { usage(); } startPage = Integer.parseInt(args[i]); break; case HTML: toHTML = true; ext = ".html"; break; case SORT: sort = true; break; case IGNORE_BEADS: separateBeads = false; break; case DEBUG: debug = true; break; case END_PAGE: i++; if (i >= args.length) { usage(); } endPage = Integer.parseInt(args[i]); break; case CONSOLE: toConsole = true; break; default: if (pdfFile == null) { pdfFile = args[i]; } else { outputFile = args[i]; } break; } } if (pdfFile == null) { usage(); } else { Writer output = null; PDDocument document = null; try { long startTime = startProcessing("Loading PDF " + pdfFile); if (outputFile == null && pdfFile.length() > 4) { outputFile = new File(pdfFile.substring(0, pdfFile.length() - 4) + ext).getAbsolutePath(); } document = PDDocument.load(new File(pdfFile), password); AccessPermission ap = document.getCurrentAccessPermission(); if (!ap.canExtractContent()) { throw new IOException("You do not have permission to extract text"); } stopProcessing("Time for loading: ", startTime); if (toConsole) { output = new OutputStreamWriter(System.out, encoding); } else { output = new OutputStreamWriter(new FileOutputStream(outputFile), encoding); } PDFTextStripper stripper; if (toHTML) { stripper = new PDFText2HTML(); } else { stripper = new PDFTextStripper(); } stripper.setSortByPosition(sort); stripper.setShouldSeparateByBeads(separateBeads); stripper.setStartPage(startPage); stripper.setEndPage(endPage); startTime = startProcessing("Starting text extraction"); if (debug) { System.err.println("Writing to " + outputFile); } // Extract text for main document: stripper.writeText(document, output); // ... also for any embedded PDFs: PDDocumentCatalog catalog = document.getDocumentCatalog(); PDDocumentNameDictionary names = catalog.getNames(); if (names != null) { PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles(); if (embeddedFiles != null) { Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames(); if (embeddedFileNames != null) { for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) { if (debug) { System.err.println("Processing embedded file " + ent.getKey() + ":"); } PDComplexFileSpecification spec = ent.getValue(); PDEmbeddedFile file = spec.getEmbeddedFile(); if (file != null && "application/pdf".equals(file.getSubtype())) { if (debug) { System.err.println(" is PDF (size=" + file.getSize() + ")"); } InputStream fis = file.createInputStream(); PDDocument subDoc = null; try { subDoc = PDDocument.load(fis); } finally { fis.close(); } try { stripper.writeText(subDoc, output); } finally { IOUtils.closeQuietly(subDoc); } } } } } } stopProcessing("Time for extraction: ", startTime); } finally { IOUtils.closeQuietly(output); IOUtils.closeQuietly(document); } } }