/**
 * Extracts the text of a PDF and returns it as a character array. Takes an input stream rather
 * than a file name.
 *
 * <p>The loaded document is always closed, even when text extraction fails. The supplied stream
 * is not closed explicitly by this method.
 *
 * @param filesInputStream an input stream pointing to a PDF file
 * @return the text extracted from the PDF (position sorting disabled)
 * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
 */
private static char[] loadPDF(InputStream filesInputStream) throws IOException {
    PDDocument doc = PDDocument.load(filesInputStream);
    try {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdfStripper.setSortByPosition(false);
        return pdfStripper.getText(doc).toCharArray();
    } finally {
        // Release the document even if stripping throws; previously it leaked on failure.
        doc.close();
    }
}
/**
 * Extracts the text of all pages of a PDF file, sorted by position.
 *
 * <p>The file stream and both document objects are released on every path, including when
 * parsing or text extraction throws.
 *
 * @param pdfFile the PDF file to read; must not be null
 * @return the text of pages 1 through the last page of the document
 * @throws IOException if the file cannot be opened, parsed, or stripped
 */
public static String extract(File pdfFile) throws IOException {
    checkNotNull(pdfFile, "pdfFile");
    FileInputStream input = new FileInputStream(pdfFile);
    try {
        PDFParser parser = new PDFParser(input);
        parser.parse();
        COSDocument cosDoc = parser.getDocument();
        try {
            PDDocument pdDoc = new PDDocument(cosDoc);
            try {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                pdfStripper.setStartPage(1);
                pdfStripper.setEndPage(pdDoc.getNumberOfPages());
                pdfStripper.setSortByPosition(true);
                return pdfStripper.getText(pdDoc);
            } finally {
                pdDoc.close();
            }
        } finally {
            // Same close order as before (pdDoc, then cosDoc), now also on failure paths.
            cosDoc.close();
        }
    } finally {
        // The stream was never closed explicitly before; close it regardless of outcome.
        input.close();
    }
}
@Override public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) throws IOException { final ArrayList<BibtexEntry> res = new ArrayList<BibtexEntry>(1); PDDocument document; try { document = PDDocument.load(in); } catch (IOException e) { LOGGER.error("Could not load document", e); return res; } try { if (document.isEncrypted()) { LOGGER.error(Localization.lang("Encrypted documents are not supported")); // return res; } PDFTextStripper stripper = new PDFTextStripper(); stripper.setStartPage(1); stripper.setEndPage(1); stripper.setSortByPosition(true); stripper.setParagraphEnd(System.lineSeparator()); StringWriter writer = new StringWriter(); stripper.writeText(document, writer); String textResult = writer.toString(); String doi = new DOI(textResult).getDOI(); if (doi.length() < textResult.length()) { // A Doi was found in the text // We do NO parsing of the text, but use the Doi fetcher ImportInspector i = new ImportInspector() { @Override public void toFront() {} @Override public void setProgress(int current, int max) {} @Override public void addEntry(BibtexEntry entry) { // add the entry to the result object res.add(entry); } }; PdfContentImporter.doiToBibTeXFetcher.processQuery(doi, i, status); if (!res.isEmpty()) { // if something has been found, return the result return res; } else { // otherwise, we just parse the PDF } } String author; String editor = null; String institution = null; String abstractT = null; String keywords = null; String title; String conference = null; String DOI = null; String series = null; String volume = null; String number = null; String pages = null; // year is a class variable as the method extractYear() uses it; String publisher = null; BibtexEntryType type = BibtexEntryTypes.INPROCEEDINGS; final String lineBreak = System.lineSeparator(); split = textResult.split(lineBreak); // idea: split[] contains the different lines // blocks are separated by empty lines // treat each block // or do special treatment at 
authors (which are not broken) // therefore, we do a line-based and not a block-based splitting // i points to the current line // curString (mostly) contains the current block // the different lines are joined into one and thereby separated by " " proceedToNextNonEmptyLine(); if (i >= split.length) { // PDF could not be parsed or is empty // return empty list return res; } curString = split[i]; i = i + 1; if (curString.length() > 4) { // special case: possibly conference as first line on the page extractYear(); if (curString.contains("Conference")) { fillCurStringWithNonEmptyLines(); conference = curString; curString = ""; } else { // e.g. Copyright (c) 1998 by the Genetics Society of America // future work: get year using RegEx String lower = curString.toLowerCase(); if (lower.contains("copyright")) { fillCurStringWithNonEmptyLines(); publisher = curString; curString = ""; } } } // start: title fillCurStringWithNonEmptyLines(); title = streamlineTitle(curString); curString = ""; // i points to the next non-empty line // after title: authors author = null; while (i < split.length && !split[i].equals("")) { // author names are unlikely to be split among different lines // treat them line by line curString = streamlineNames(split[i]); if (author == null) { author = curString; } else { if (curString.equals("")) { // if split[i] is "and" then "" is returned by streamlineNames -> do nothing } else { author = author.concat(" and ").concat(curString); } } i++; } curString = ""; i++; // then, abstract and keywords follow while (i < split.length) { curString = split[i]; if (curString.length() >= "Abstract".length() && curString.substring(0, "Abstract".length()).equalsIgnoreCase("Abstract")) { if (curString.length() == "Abstract".length()) { // only word "abstract" found -- skip line curString = ""; } else { curString = curString.substring("Abstract".length() + 1).trim().concat(lineBreak); } i++; // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line 
separator // whereas we need linebreak as separator while (i < split.length && !split[i].equals("")) { curString = curString.concat(split[i]).concat(lineBreak); i++; } abstractT = curString; i++; } else if (curString.length() >= "Keywords".length() && curString.substring(0, "Keywords".length()).equalsIgnoreCase("Keywords")) { if (curString.length() == "Keywords".length()) { // only word "Keywords" found -- skip line curString = ""; } else { curString = curString.substring("Keywords".length() + 1).trim(); } i++; fillCurStringWithNonEmptyLines(); keywords = removeNonLettersAtEnd(curString); } else { String lower = curString.toLowerCase(); int pos = lower.indexOf("technical"); if (pos >= 0) { type = BibtexEntryTypes.TECHREPORT; pos = curString.trim().lastIndexOf(' '); if (pos >= 0) { // assumption: last character of curString is NOT ' ' // otherwise pos+1 leads to an out-of-bounds exception number = curString.substring(pos + 1); } } i++; proceedToNextNonEmptyLine(); } } i = split.length - 1; // last block: DOI, detailed information // sometimes, this information is in the third last block etc... // therefore, read until the beginning of the file while (i >= 0) { readLastBlock(); // i now points to the block before or is -1 // curString contains the last block, separated by " " extractYear(); int pos = curString.indexOf("(Eds.)"); if (pos >= 0 && publisher == null) { // looks like a Springer last line // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009. publisher = "Springer"; editor = streamlineNames(curString.substring(0, pos - 1)); curString = curString.substring( pos + "(Eds.)".length() + 2); // +2 because of ":" after (Eds.) 
and the subsequent space String[] springerSplit = curString.split(", "); if (springerSplit.length >= 4) { conference = springerSplit[0]; String seriesData = springerSplit[1]; int lastSpace = seriesData.lastIndexOf(' '); series = seriesData.substring(0, lastSpace); volume = seriesData.substring(lastSpace + 1); pages = springerSplit[2].substring(4); if (springerSplit[3].length() >= 4) { year = springerSplit[3].substring(0, 4); } } } else { if (DOI == null) { pos = curString.indexOf("DOI"); if (pos < 0) { pos = curString.indexOf("doi"); } if (pos >= 0) { pos += 3; char delimiter = curString.charAt(pos); if (delimiter == ':' || delimiter == ' ') { pos++; } int nextSpace = curString.indexOf(' ', pos); if (nextSpace > 0) { DOI = curString.substring(pos, nextSpace); } else { DOI = curString.substring(pos); } } } if (publisher == null && curString.contains("IEEE")) { // IEEE has the conference things at the end publisher = "IEEE"; // year is extracted by extractYear // otherwise, we could it determine as follows: // String yearStr = curString.substring(curString.length()-4); // if (isYear(yearStr)) { // year = yearStr; // } if (conference == null) { pos = curString.indexOf('$'); if (pos > 0) { // we found the price // before the price, the ISSN is stated // skip that pos -= 2; while (pos >= 0 && curString.charAt(pos) != ' ') { pos--; } if (pos > 0) { conference = curString.substring(0, pos); } } } } // String lower = curString.toLowerCase(); // if (institution == null) { // // } } } BibtexEntry entry = new BibtexEntry(); entry.setType(type); if (author != null) { entry.setField("author", author); } if (editor != null) { entry.setField("editor", editor); } if (institution != null) { entry.setField("institution", institution); } if (abstractT != null) { entry.setField("abstract", abstractT); } if (keywords != null) { entry.setField("keywords", keywords); } if (title != null) { entry.setField("title", title); } if (conference != null) { entry.setField("booktitle", 
conference); } if (DOI != null) { entry.setField("doi", DOI); } if (series != null) { entry.setField("series", series); } if (volume != null) { entry.setField("volume", volume); } if (number != null) { entry.setField("number", number); } if (pages != null) { entry.setField("pages", pages); } if (year != null) { entry.setField("year", year); } if (publisher != null) { entry.setField("publisher", publisher); } entry.setField("review", textResult); res.add(entry); } catch (NoClassDefFoundError e) { if (e.getMessage().equals("org/bouncycastle/jce/provider/BouncyCastleProvider")) { status.showMessage( Localization.lang( "Java Bouncy Castle library not found. Please download and install it. For more information see http://www.bouncycastle.org/.")); } else { LOGGER.error("Could not find class", e); } } finally { document.close(); } return res; }
/**
 * Creates the stripper with position-based sorting switched on.
 *
 * @throws IOException if construction of the superclass fails (per the original documentation:
 *     an error loading text stripper properties)
 */
public PrintTextLocations() throws IOException {
    super();
    // Call the superclass implementation directly, bypassing any override.
    super.setSortByPosition(true);
}