/**
 * This will add the contents to the lucene document.
 *
 * @param document The document to add the contents to.
 * @param is The stream to get the contents from.
 * @param documentLocation The location of the document, used just for debug messages.
 * @throws IOException If there is an error parsing the document.
 */
private void addContent(Document document, InputStream is, String documentLocation)
    throws IOException {
  PDDocument pdfDocument = null;
  PDFTextStripper stripper;
  try {
    pdfDocument = PDDocument.load(is);
    if (pdfDocument.isEncrypted()) {
      // Just try using the default password and move on
      pdfDocument.decrypt("");
    }

    // create a writer where to append the text content.
    StringWriter writer = new StringWriter();
    stripper = new PDFTextStripper();
    try {
      stripper.writeText(pdfDocument, writer);
    } catch (Exception e) {
      System.out.println("Error in stripper.writeText(): " + e);
    }

    String contents = writer.getBuffer().toString();
    StringReader reader = new StringReader(contents);
    addTextField(document, Indexer.contents, reader);

    PDDocumentInformation info = pdfDocument.getDocumentInformation();
    if (info != null) {
      addTextField(document, Indexer.Author, info.getAuthor());
      try {
        addTextField(document, Indexer.created, info.getCreationDate());
      } catch (IOException io) {
        // ignore, bad date but continue with indexing
      }
      addTextField(document, Indexer.keywords, info.getKeywords());
      try {
        addTextField(document, Indexer.modified, info.getModificationDate());
      } catch (IOException io) {
        // ignore, bad date but continue with indexing
      }
      addTextField(document, "Subject", info.getSubject());
      addTextField(document, Indexer.Title, info.getTitle());
    }

    int summarySize = Math.min(contents.length(), 500);
    String summary = contents.substring(0, summarySize);
    // Add the summary as an UnIndexed field, so that it is stored and returned
    // with hit documents for display.
    addUnindexedField(document, Indexer.summary, summary);
  } catch (CryptographyException e) {
    throw new IOException("Error decrypting document(" + documentLocation + "): " + e);
  } catch (InvalidPasswordException e) {
    // they didn't supply a password and the default of "" was wrong.
    throw new IOException(
        "Error: The document(" + documentLocation + ") is encrypted and will not be indexed.");
  } finally {
    if (pdfDocument != null) {
      pdfDocument.close();
    }
  }
}
private String pdfToText(InputStream in) {
  PDFParser parser = null;
  PDDocument pdDoc = null;
  COSDocument cosDoc = null;
  PDFTextStripper pdfStripper;
  try {
    parser = new PDFParser(in);
    parser.parse();
    cosDoc = parser.getDocument();
    pdfStripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    return pdfStripper.getText(pdDoc);
    // System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", ""));
  } catch (Exception e) {
    e.printStackTrace();
  } finally {
    // close the documents on the success path as well, not only on failure
    try {
      if (cosDoc != null) cosDoc.close();
      if (pdDoc != null) pdDoc.close();
    } catch (Exception e1) {
      e1.printStackTrace();
    }
  }
  return null;
}
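// NOTE (not part of the original snippets): several of the methods in this collection only close
// the PDF on the error path. A minimal sketch of the same extraction with try-with-resources,
// assuming PDFBox 2.x, where PDDocument implements Closeable and PDDocument.load(InputStream)
// replaces the PDFParser/COSDocument handling:
import java.io.IOException;
import java.io.InputStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

public final class PdfToTextSketch {

  /** Returns the full text of the PDF, or null if it cannot be parsed. */
  public static String pdfToText(InputStream in) {
    // try-with-resources closes the document on both the success and the failure path
    try (PDDocument doc = PDDocument.load(in)) {
      return new PDFTextStripper().getText(doc);
    } catch (IOException e) {
      e.printStackTrace();
      return null;
    }
  }
}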
public static String getFileContent(File file) throws FileNotFoundException, IOException {
  String ext = FilenameUtils.getExtension(file.getName());
  String outContent = "";
  try {
    if (ext.toLowerCase().equals("doc")) {
      if (file != null) {
        WordExtractor we = new WordExtractor(new FileInputStream(file));
        outContent = we.getText();
      } else {
        logger.warning("file not found : " + file);
      }
    } else if (ext.toLowerCase().equals("pdf")) {
      PDDocument doc = PDDocument.load(file);
      PDFTextStripper text = new PDFTextStripper();
      outContent = text.getText(doc);
      doc.close();
    } else if (StringHelper.isHTML(file.getName())) {
      return loadStringFromFile(file);
    }
  } catch (Throwable t) {
    logger.warning("error when reading : " + file + " [" + t.getMessage() + "]");
    t.printStackTrace();
  }
  return outContent;
}
public static String extractText(InputStream src) throws IOException {
  StringBuilder text = new StringBuilder();
  COSDocument cosDoc = null;
  PDDocument pdDoc = null;
  try {
    PDFParser parser = new PDFParser(src);
    parser.parse();
    cosDoc = parser.getDocument();
    PDFTextStripper stripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    int nbPages = pdDoc.getDocumentCatalog().getPages().getCount();
    for (int i = 0; i < nbPages; i++) {
      stripper.setStartPage(i + 1);
      stripper.setEndPage(i + 1);
      text.append(stripper.getText(pdDoc));
    }
  } finally {
    try {
      if (cosDoc != null) {
        cosDoc.close();
      }
    } catch (IOException e) {
      // Do nada
    }
    try {
      if (pdDoc != null) {
        pdDoc.close();
      }
    } catch (IOException e) {
      // Do nada
    }
  }
  return text.toString();
}
/**
 * Extracts the text of a PDF supplied as an input stream (rather than a file name) and returns it
 * as a character array.
 *
 * @param filesInputStream An input stream pointing to a PDF file.
 * @return The extracted text.
 * @throws IOException If the PDF cannot be loaded or parsed.
 */
private static char[] loadPDF(InputStream filesInputStream) throws IOException {
  PDDocument doc = PDDocument.load(filesInputStream);
  PDFTextStripper pdfStripper = new PDFTextStripper();
  pdfStripper.setSortByPosition(false);
  char[] origText = pdfStripper.getText(doc).toCharArray();
  doc.close();
  return origText;
}
public static String getContent(PDFParser parser) throws IOException {
  parser.parse();
  COSDocument cosDoc = parser.getDocument();
  PDFTextStripper pdfStripper = new PDFTextStripper();
  PDDocument pdDoc = new PDDocument(cosDoc);
  String content = pdfStripper.getText(pdDoc);
  cosDoc.close();
  pdDoc.close();
  return content;
}
/** Extracts the textual contents from a PDF file as one long string. */
public String extractPDFContents(File f) throws IOException {
  FileInputStream fi = new FileInputStream(f);
  PDFParser parser = new PDFParser(fi);
  parser.parse();
  fi.close();
  COSDocument cd = parser.getDocument();
  PDFTextStripper stripper = new PDFTextStripper();
  // keep a reference to the PDDocument so it can be closed as well
  PDDocument pd = new PDDocument(cd);
  String result = stripper.getText(pd);
  pd.close();
  cd.close();
  return result;
}
public static String extract(File pdfFile) throws IOException {
  checkNotNull(pdfFile, "pdfFile");
  PDFParser parser = new PDFParser(new FileInputStream(pdfFile));
  parser.parse();
  COSDocument cosDoc = parser.getDocument();
  PDFTextStripper pdfStripper = new PDFTextStripper();
  PDDocument pdDoc = new PDDocument(cosDoc);
  pdfStripper.setStartPage(1);
  pdfStripper.setEndPage(pdDoc.getNumberOfPages());
  pdfStripper.setSortByPosition(true);
  String pdfText = pdfStripper.getText(pdDoc);
  pdDoc.close();
  cosDoc.close();
  return pdfText;
}
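// NOTE (not part of the original snippets): a hypothetical usage sketch showing the same stripper
// settings applied to a page range instead of the whole document, assuming PDFBox 2.x; the path
// "sample.pdf" is a placeholder:
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

public class PageRangeExtractSketch {

  public static void main(String[] args) throws IOException {
    try (PDDocument doc = PDDocument.load(new File("sample.pdf"))) {
      PDFTextStripper stripper = new PDFTextStripper();
      stripper.setSortByPosition(true); // order text by position on the page
      stripper.setStartPage(2); // 1-based, inclusive
      stripper.setEndPage(3); // 1-based, inclusive
      System.out.println(stripper.getText(doc));
    }
  }
}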
ExtractPageContent(String filePath) {
  this.filePath = filePath;
  try {
    reader = new PdfReader(filePath);
    parser = new PdfReaderContentParser(reader);
    getContents();
  } catch (Exception e) {
    // fall back to PDFBox if iText cannot read the file
    try {
      PDDocument doc = PDDocument.load(filePath);
      PDFTextStripper stripper = new PDFTextStripper();
      this.fileContents = stripper.getText(doc);
      doc.close();
    } catch (IOException e1) {
      // the file could not be read with either library; leave fileContents unset
      // e1.printStackTrace();
    }
  }
}
/**
 * Indexes a single PDF file.
 *
 * @param f the PDF file
 * @param writer the IndexWriter
 * @throws IOException
 */
public static void indexFile(File f, IndexWriter writer) throws IOException {
  // Load the file with PDFBox
  PDDocument pddDocument = PDDocument.load(f.getAbsolutePath());
  PDFTextStripper textStripper = new PDFTextStripper();
  int numPages = pddDocument.getNumberOfPages();
  String pageContent;

  // Declare our own Field type
  FieldType fieldText = new FieldType();
  fieldText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  fieldText.setStored(false);
  fieldText.setStoreTermVectorOffsets(true);
  fieldText.setStoreTermVectorPositions(true);
  fieldText.setStoreTermVectors(true);

  // Walk and index every page of the file, storing the page number and the file title and
  // indexing the content. PDFTextStripper pages are 1-based, so iterate from 1 to numPages.
  for (int i = 1; i <= numPages; i++) {
    textStripper.setStartPage(i);
    textStripper.setEndPage(i);
    // grab one page
    pageContent = textStripper.getText(pddDocument);
    if (pageContent != null && !pageContent.isEmpty()) {
      pageContent = pageContent.toLowerCase();
    }
    if (pageContent != null) {
      // Build the document to index for this page:
      //   page number
      //   page content
      //   file title
      // Add the document to the writer
    }
  }
  // Close the PDF file
  pddDocument.close();
}
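// NOTE (not part of the original snippets): the indexing step in indexFile() above is only
// sketched in comments. A hypothetical helper showing what building and adding one Lucene
// document per page could look like; the field names "content", "page" and "title" are invented
// for illustration, fieldText is the FieldType declared in indexFile(), and the Lucene classes
// org.apache.lucene.document.{Document, Field, StoredField, StringField} are assumed imported:
static void addPageDocument(
    IndexWriter writer, FieldType fieldText, String title, int pageNumber, String pageContent)
    throws IOException {
  Document doc = new Document();
  doc.add(new Field("content", pageContent, fieldText)); // indexed with term vectors, not stored
  doc.add(new StoredField("page", pageNumber)); // stored page number
  doc.add(new StringField("title", title, Field.Store.YES)); // file title
  writer.addDocument(doc);
}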
public static void main(String[] args) {
  PDDocument pd;
  try {
    File input = new File("pdf/1.pdf"); // The PDF file from where you would like to extract
    pd = PDDocument.load(input);
    int numberOfPages = pd.getNumberOfPages();
    PDFTextStripper stripper = new PDFTextStripper();
    String fullText = stripper.getText(pd);
    int indexReferences = fullText.lastIndexOf("References\n");
    String textOutReferences =
        fullText.substring(0, indexReferences > 0 ? indexReferences : fullText.length());
    String textOutStop = removeStopWords(textOutReferences);
    findMoreCiteds(textOutStop);
    extractReferences(fullText);
    stripper.setEndPage(3);
    String startText = stripper.getText(pd);
    System.out.println("Autores");
    extractAuthor(startText);
    System.out.println("Objetivos");
    extractObjective(startText);
    System.out.println("\n\nProblemas");
    extractProblem(startText);
    System.out.println("\n\nMetodologia");
    extractMethodology(fullText);
    System.out.println("\n\nContribuições");
    extractContributes(fullText);
    pd.close();
  } catch (Exception e) {
    e.printStackTrace();
  }
}
public static void main(String[] args) throws Exception {
  File file = new File("C:/Users/jatin.goyal/Desktop/demoexcel.pdf");
  PDDocument pd = PDDocument.load(file);
  System.out.println(pd.getNumberOfPages());
  PDFTextStripper st = new PDFTextStripper();
  st.setStartPage(1);
  // st.setEndPage(4);

  // PDFTextStripperByArea stripper = new PDFTextStripperByArea();
  // stripper.setSortByPosition( true );
  // Rectangle rect1 = new Rectangle( 50, 140, 60, 20 );
  // Rectangle rect2 = new Rectangle( 110, 140, 20, 20 );
  // stripper.addRegion( "row1column1", rect1 );
  // stripper.addRegion( "row1column2", rect2 );
  // List allPages = pd.getDocumentCatalog().getAllPages();
  // PDPage firstPage = (PDPage)allPages.get( 0 );
  // stripper.extractRegions( firstPage );
  // System.out.println(stripper.getTextForRegion( "row1column1" ));
  // System.out.println(stripper.getTextForRegion( "row1column2" ));

  System.out.println(st.getText(pd));
  pd.close();
}
public void PDF2TextPreProssesd(String filename) {
  try {
    stripper = new PDFTextStripper();
    stripper.setParagraphStart("&*&");
    stripper.setLineSeparator("#%#");
    stripper.setPageSeparator("#%#");
    String fulltxt = stripper.getText(pd);
    // split() takes a regular expression, so the '*' in the paragraph marker must be escaped
    String[] paras = fulltxt.split("&\\*&");
    File file = new File(filename);
    try {
      BufferedWriter out = new BufferedWriter(new FileWriter(file));
      int i = 0;
      while (i < paras.length) {
        if (paras[i].length() > 200) {
          String para = paras[i].replace("#%#", " ");
          out.write(para + "\r\n");
        }
        i++;
      }
      out.close();
    } catch (IOException ex) {
      ex.printStackTrace();
    }
    if (pd != null) {
      pd.close();
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/**
 * Saves the converted text (without any processing) to the given file.
 *
 * @param filename the text file where the extracted data is stored
 */
public void PDF2Text(String filename) {
  try {
    File output = new File(filename); // The text file where you are going to store the extracted data
    stripper = new PDFTextStripper();
    wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output)));
    stripper.writeText(pd, wr);
    // flush and close the writer so the extracted text actually reaches the file
    wr.close();
    if (pd != null) {
      pd.close();
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
String pdftoText(String fileName) {
  System.out.println("Parsing text from PDF file " + fileName + "....");
  File f = new File(fileName);
  if (!f.isFile()) {
    System.out.println("File " + fileName + " does not exist.");
    return null;
  }
  try {
    parser = new PDFParser(new FileInputStream(f));
  } catch (Exception e) {
    System.out.println("Unable to open PDF Parser.");
    return null;
  }
  try {
    parser.parse();
    cosDoc = parser.getDocument();
    pdfStripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    parsedText = pdfStripper.getText(pdDoc);
  } catch (Exception e) {
    System.out.println("An exception occurred in parsing the PDF Document.");
    e.printStackTrace();
    try {
      if (cosDoc != null) cosDoc.close();
      if (pdDoc != null) pdDoc.close();
    } catch (Exception e1) {
      e1.printStackTrace();
    }
    return null;
  }
  System.out.println("Done.");
  return parsedText;
}
/** {@inheritDoc} */
@Override
public void resetEngine() {
  super.resetEngine();
  textCache = null;
}
/**
 * Default constructor.
 *
 * @throws IOException If there is an error loading text stripper properties.
 */
public PrintTextLocations() throws IOException {
  super.setSortByPosition(true);
}
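// NOTE (not part of the original snippets): PrintTextLocations builds on PDFTextStripper's
// per-chunk callbacks. A minimal sketch of that idea, assuming PDFBox 2.x, where the stripper
// exposes writeString(String, List<TextPosition>):
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

public class PrintTextLocationsSketch extends PDFTextStripper {

  public PrintTextLocationsSketch() throws IOException {
    setSortByPosition(true);
  }

  @Override
  protected void writeString(String text, List<TextPosition> positions) throws IOException {
    // report the page coordinates of the first glyph of every written chunk
    if (!positions.isEmpty()) {
      TextPosition first = positions.get(0);
      System.out.printf("%.1f,%.1f: %s%n", first.getXDirAdj(), first.getYDirAdj(), text);
    }
  }

  public static void main(String[] args) throws IOException {
    try (PDDocument doc = PDDocument.load(new File(args[0]))) {
      new PrintTextLocationsSketch().getText(doc); // getText() drives the callbacks
    }
  }
}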
@Override
public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) throws IOException {
  final ArrayList<BibtexEntry> res = new ArrayList<BibtexEntry>(1);

  PDDocument document;
  try {
    document = PDDocument.load(in);
  } catch (IOException e) {
    LOGGER.error("Could not load document", e);
    return res;
  }

  try {
    if (document.isEncrypted()) {
      LOGGER.error(Localization.lang("Encrypted documents are not supported"));
      // return res;
    }

    PDFTextStripper stripper = new PDFTextStripper();
    stripper.setStartPage(1);
    stripper.setEndPage(1);
    stripper.setSortByPosition(true);
    stripper.setParagraphEnd(System.lineSeparator());
    StringWriter writer = new StringWriter();
    stripper.writeText(document, writer);
    String textResult = writer.toString();

    String doi = new DOI(textResult).getDOI();
    if (doi.length() < textResult.length()) {
      // A DOI was found in the text
      // We do NO parsing of the text, but use the DOI fetcher
      ImportInspector i = new ImportInspector() {

        @Override
        public void toFront() {}

        @Override
        public void setProgress(int current, int max) {}

        @Override
        public void addEntry(BibtexEntry entry) {
          // add the entry to the result object
          res.add(entry);
        }
      };
      PdfContentImporter.doiToBibTeXFetcher.processQuery(doi, i, status);
      if (!res.isEmpty()) {
        // if something has been found, return the result
        return res;
      } else {
        // otherwise, we just parse the PDF
      }
    }

    String author;
    String editor = null;
    String institution = null;
    String abstractT = null;
    String keywords = null;
    String title;
    String conference = null;
    String DOI = null;
    String series = null;
    String volume = null;
    String number = null;
    String pages = null;
    // year is a class variable as the method extractYear() uses it;
    String publisher = null;
    BibtexEntryType type = BibtexEntryTypes.INPROCEEDINGS;

    final String lineBreak = System.lineSeparator();
    split = textResult.split(lineBreak);

    // idea: split[] contains the different lines
    // blocks are separated by empty lines
    // treat each block
    // or do special treatment at authors (which are not broken)
    // therefore, we do a line-based and not a block-based splitting
    // i points to the current line
    // curString (mostly) contains the current block
    // the different lines are joined into one and thereby separated by " "

    proceedToNextNonEmptyLine();
    if (i >= split.length) {
      // PDF could not be parsed or is empty
      // return empty list
      return res;
    }

    curString = split[i];
    i = i + 1;

    if (curString.length() > 4) {
      // special case: possibly conference as first line on the page
      extractYear();
      if (curString.contains("Conference")) {
        fillCurStringWithNonEmptyLines();
        conference = curString;
        curString = "";
      } else {
        // e.g. Copyright (c) 1998 by the Genetics Society of America
        // future work: get year using RegEx
        String lower = curString.toLowerCase();
        if (lower.contains("copyright")) {
          fillCurStringWithNonEmptyLines();
          publisher = curString;
          curString = "";
        }
      }
    }

    // start: title
    fillCurStringWithNonEmptyLines();
    title = streamlineTitle(curString);
    curString = "";
    // i points to the next non-empty line

    // after title: authors
    author = null;
    while (i < split.length && !split[i].equals("")) {
      // author names are unlikely to be split among different lines
      // treat them line by line
      curString = streamlineNames(split[i]);
      if (author == null) {
        author = curString;
      } else {
        if (curString.equals("")) {
          // if split[i] is "and" then "" is returned by streamlineNames -> do nothing
        } else {
          author = author.concat(" and ").concat(curString);
        }
      }
      i++;
    }
    curString = "";
    i++;

    // then, abstract and keywords follow
    while (i < split.length) {
      curString = split[i];
      if (curString.length() >= "Abstract".length()
          && curString.substring(0, "Abstract".length()).equalsIgnoreCase("Abstract")) {
        if (curString.length() == "Abstract".length()) {
          // only word "abstract" found -- skip line
          curString = "";
        } else {
          curString = curString.substring("Abstract".length() + 1).trim().concat(lineBreak);
        }
        i++;
        // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator
        // whereas we need linebreak as separator
        while (i < split.length && !split[i].equals("")) {
          curString = curString.concat(split[i]).concat(lineBreak);
          i++;
        }
        abstractT = curString;
        i++;
      } else if (curString.length() >= "Keywords".length()
          && curString.substring(0, "Keywords".length()).equalsIgnoreCase("Keywords")) {
        if (curString.length() == "Keywords".length()) {
          // only word "Keywords" found -- skip line
          curString = "";
        } else {
          curString = curString.substring("Keywords".length() + 1).trim();
        }
        i++;
        fillCurStringWithNonEmptyLines();
        keywords = removeNonLettersAtEnd(curString);
      } else {
        String lower = curString.toLowerCase();
        int pos = lower.indexOf("technical");
        if (pos >= 0) {
          type = BibtexEntryTypes.TECHREPORT;
          pos = curString.trim().lastIndexOf(' ');
          if (pos >= 0) {
            // assumption: last character of curString is NOT ' '
            // otherwise pos+1 leads to an out-of-bounds exception
            number = curString.substring(pos + 1);
          }
        }
        i++;
        proceedToNextNonEmptyLine();
      }
    }

    i = split.length - 1;

    // last block: DOI, detailed information
    // sometimes, this information is in the third last block etc...
    // therefore, read until the beginning of the file
    while (i >= 0) {
      readLastBlock();
      // i now points to the block before or is -1
      // curString contains the last block, separated by " "
      extractYear();
      int pos = curString.indexOf("(Eds.)");
      if (pos >= 0 && publisher == null) {
        // looks like a Springer last line
        // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
        publisher = "Springer";
        editor = streamlineNames(curString.substring(0, pos - 1));
        // +2 because of ":" after (Eds.) and the subsequent space
        curString = curString.substring(pos + "(Eds.)".length() + 2);
        String[] springerSplit = curString.split(", ");
        if (springerSplit.length >= 4) {
          conference = springerSplit[0];
          String seriesData = springerSplit[1];
          int lastSpace = seriesData.lastIndexOf(' ');
          series = seriesData.substring(0, lastSpace);
          volume = seriesData.substring(lastSpace + 1);
          pages = springerSplit[2].substring(4);
          if (springerSplit[3].length() >= 4) {
            year = springerSplit[3].substring(0, 4);
          }
        }
      } else {
        if (DOI == null) {
          pos = curString.indexOf("DOI");
          if (pos < 0) {
            pos = curString.indexOf("doi");
          }
          if (pos >= 0) {
            pos += 3;
            char delimiter = curString.charAt(pos);
            if (delimiter == ':' || delimiter == ' ') {
              pos++;
            }
            int nextSpace = curString.indexOf(' ', pos);
            if (nextSpace > 0) {
              DOI = curString.substring(pos, nextSpace);
            } else {
              DOI = curString.substring(pos);
            }
          }
        }
        if (publisher == null && curString.contains("IEEE")) {
          // IEEE has the conference things at the end
          publisher = "IEEE";
          // year is extracted by extractYear
          // otherwise, we could determine it as follows:
          // String yearStr = curString.substring(curString.length()-4);
          // if (isYear(yearStr)) {
          //   year = yearStr;
          // }
          if (conference == null) {
            pos = curString.indexOf('$');
            if (pos > 0) {
              // we found the price
              // before the price, the ISSN is stated
              // skip that
              pos -= 2;
              while (pos >= 0 && curString.charAt(pos) != ' ') {
                pos--;
              }
              if (pos > 0) {
                conference = curString.substring(0, pos);
              }
            }
          }
        }
        // String lower = curString.toLowerCase();
        // if (institution == null) {
        //
        // }
      }
    }

    BibtexEntry entry = new BibtexEntry();
    entry.setType(type);
    if (author != null) {
      entry.setField("author", author);
    }
    if (editor != null) {
      entry.setField("editor", editor);
    }
    if (institution != null) {
      entry.setField("institution", institution);
    }
    if (abstractT != null) {
      entry.setField("abstract", abstractT);
    }
    if (keywords != null) {
      entry.setField("keywords", keywords);
    }
    if (title != null) {
      entry.setField("title", title);
    }
    if (conference != null) {
      entry.setField("booktitle", conference);
    }
    if (DOI != null) {
      entry.setField("doi", DOI);
    }
    if (series != null) {
      entry.setField("series", series);
    }
    if (volume != null) {
      entry.setField("volume", volume);
    }
    if (number != null) {
      entry.setField("number", number);
    }
    if (pages != null) {
      entry.setField("pages", pages);
    }
    if (year != null) {
      entry.setField("year", year);
    }
    if (publisher != null) {
      entry.setField("publisher", publisher);
    }
    entry.setField("review", textResult);

    res.add(entry);
  } catch (NoClassDefFoundError e) {
    if (e.getMessage().equals("org/bouncycastle/jce/provider/BouncyCastleProvider")) {
      status.showMessage(
          Localization.lang(
              "Java Bouncy Castle library not found. Please download and install it. For more information see http://www.bouncycastle.org/."));
    } else {
      LOGGER.error("Could not find class", e);
    }
  } finally {
    document.close();
  }

  return res;
}
@Override
public Document[] parse(
    final AnchorURL location,
    final String mimeType,
    final String charset,
    final VocabularyScraper scraper,
    final int timezoneOffset,
    final InputStream source)
    throws Parser.Failure, InterruptedException {

  // check memory for parser
  if (!MemoryControl.request(200 * 1024 * 1024, false))
    throw new Parser.Failure(
        "Not enough Memory available for pdf parser: " + MemoryControl.available(), location);

  // create a pdf parser
  PDDocument pdfDoc;
  try {
    Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
    // pdfDoc = PDDocument.load(source);
    final PDFParser pdfParser = new PDFParser(source);
    pdfParser.setTempDirectory(new File(System.getProperty("java.io.tmpdir")));
    pdfParser.parse();
    pdfDoc = pdfParser.getPDDocument();
  } catch (final IOException e) {
    throw new Parser.Failure(e.getMessage(), location);
  } finally {
    Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
  }

  if (pdfDoc.isEncrypted()) {
    try {
      pdfDoc.openProtection(new StandardDecryptionMaterial(""));
    } catch (final BadSecurityHandlerException e) {
      try {
        pdfDoc.close();
      } catch (final IOException ee) {
      }
      throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
    } catch (final IOException e) {
      try {
        pdfDoc.close();
      } catch (final IOException ee) {
      }
      throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
    } catch (final CryptographyException e) {
      try {
        pdfDoc.close();
      } catch (final IOException ee) {
      }
      throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
    }
    final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
    if (perm == null || !perm.canExtractContent()) {
      try {
        pdfDoc.close();
      } catch (final IOException ee) {
      }
      throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
    }
  }

  // extracting some metadata
  PDDocumentInformation info = pdfDoc.getDocumentInformation();
  String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
  Date docDate = new Date();
  if (info != null) {
    docTitle = info.getTitle();
    docSubject = info.getSubject();
    docAuthor = info.getAuthor();
    docPublisher = info.getProducer();
    if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator();
    docKeywordStr = info.getKeywords();
    try {
      if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime();
    } catch (IOException e) {
    }
    // unused:
    // info.getTrapped());
  }
  info = null;

  if (docTitle == null || docTitle.isEmpty()) {
    docTitle = MultiProtocolURL.unescape(location.getFileName());
  }
  if (docTitle == null) {
    docTitle = docSubject;
  }
  String[] docKeywords = null;
  if (docKeywordStr != null) {
    docKeywords = docKeywordStr.split(" |,");
  }

  Collection<AnchorURL>[] pdflinks = null;
  Document[] result = null;
  try {
    // get the links
    pdflinks = extractPdfLinks(pdfDoc);

    // get the fulltext (either per document or for each page)
    final PDFTextStripper stripper = new PDFTextStripper(StandardCharsets.UTF_8.name());

    if (individualPages) {
      // this is a hack which stores individual pages of the source pdf into individual index
      // documents
      // the new documents will get a virtual link with a post argument page=X appended to the
      // original url

      // collect text
      int pagecount = pdfDoc.getNumberOfPages();
      String[] pages = new String[pagecount];
      for (int page = 1; page <= pagecount; page++) {
        stripper.setStartPage(page);
        stripper.setEndPage(page);
        pages[page - 1] = stripper.getText(pdfDoc);
        // System.out.println("PAGE " + page + ": " + pages[page - 1]);
      }

      // create individual documents for each page
      assert pages.length == pdflinks.length
          : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length;
      result = new Document[Math.min(pages.length, pdflinks.length)];
      String loc = location.toNormalform(true);
      for (int page = 0; page < result.length; page++) {
        result[page] =
            new Document(
                new AnchorURL(
                    loc
                        + (loc.indexOf('?') > 0 ? '&' : '?')
                        + individualPagePropertyname
                        + '='
                        + (page + 1)), // these are virtual new pages; we cannot combine them with '#'
                                       // as that would be removed when computing the urlhash
                mimeType,
                StandardCharsets.UTF_8.name(),
                this,
                null,
                docKeywords,
                singleList(docTitle),
                docAuthor,
                docPublisher,
                null,
                null,
                0.0f,
                0.0f,
                pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
                null,
                null,
                false,
                docDate);
      }
    } else {
      // collect the whole text at once
      final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
      byte[] contentBytes = new byte[0];
      stripper.setEndPage(3); // get first 3 pages (always)
      writer.append(stripper.getText(pdfDoc));
      contentBytes = writer.getBytes(); // remember text in case of interrupting thread

      if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
        stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
        stripper.setEndPage(Integer.MAX_VALUE); // set to default
        // we start the pdf parsing in a separate thread to ensure that it can be terminated
        final PDDocument pdfDocC = pdfDoc;
        final Thread t =
            new Thread() {
              @Override
              public void run() {
                Thread.currentThread().setName("pdfParser.getText:" + location);
                try {
                  writer.append(stripper.getText(pdfDocC));
                } catch (final Throwable e) {
                }
              }
            };
        t.start();
        t.join(3000);
        // pdfbox likes to forget to terminate ... (quite often)
        if (t.isAlive()) t.interrupt();
      }
      contentBytes = writer.getBytes(); // get final text before closing writer

      Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
      for (Collection<AnchorURL> pdflinksx : pdflinks)
        if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
      result =
          new Document[] {
            new Document(
                location,
                mimeType,
                StandardCharsets.UTF_8.name(),
                this,
                null,
                docKeywords,
                singleList(docTitle),
                docAuthor,
                docPublisher,
                null,
                null,
                0.0f,
                0.0f,
                contentBytes,
                pdflinksCombined,
                null,
                null,
                false,
                docDate)
          };
    }
  } catch (final Throwable e) {
    // close the writer (in finally)
    // throw new Parser.Failure(e.getMessage(), location);
  } finally {
    try {
      pdfDoc.close();
    } catch (final Throwable e) {
    }
  }

  // clear resources in pdfbox. they say that is resolved but it's not. see:
  // https://issues.apache.org/jira/browse/PDFBOX-313
  // https://issues.apache.org/jira/browse/PDFBOX-351
  // https://issues.apache.org/jira/browse/PDFBOX-441
  // pdfbox still generates an enormous number of object allocations and doesn't delete them;
  // the following objects are statically stored and never flushed:
  // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
  // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
  // the great number of these objects can easily be seen in Java Visual VM
  // we try to get this shit out of the memory here by forced clear calls, hope the best the
  // rubbish gets out.
  pdfDoc = null;
  clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

  return result;
}
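// NOTE (not part of the original snippets): the parse() method above time-boxes PDFBox with a raw
// Thread plus join/interrupt. A hypothetical sketch of the same time-boxing idea using an
// ExecutorService, assuming PDFBox 2.x:
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

final class TimedExtractionSketch {

  /** Extracts text but gives up after the given timeout, returning "" on failure. */
  static String getTextWithTimeout(PDDocument doc, long timeoutMillis) throws InterruptedException {
    ExecutorService pool = Executors.newSingleThreadExecutor();
    try {
      Future<String> text = pool.submit(() -> new PDFTextStripper().getText(doc));
      return text.get(timeoutMillis, TimeUnit.MILLISECONDS);
    } catch (ExecutionException | TimeoutException e) {
      return ""; // extraction failed or took too long; fall back to empty text
    } finally {
      pool.shutdownNow(); // interrupt the worker if it is still running
    }
  }
}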