示例#1
0
  /**
   * Extracts the text of a PDF read from an input stream and returns it as a character array.
   *
   * <p>Position-based sorting is switched off on the text stripper before extraction.
   *
   * @param filesInputStream An input stream pointing to a PDF file.
   * @return the extracted text of the document as a {@code char[]}
   * @throws IOException if the document cannot be loaded, stripped or closed
   */
  private static char[] loadPDF(InputStream filesInputStream) throws IOException {
    PDDocument doc = PDDocument.load(filesInputStream);
    try {
      PDFTextStripper pdfStripper = new PDFTextStripper();
      pdfStripper.setSortByPosition(false);
      return pdfStripper.getText(doc).toCharArray();
    } finally {
      // Close on every path: previously a failure while stripping left the document open.
      doc.close();
    }
  }
示例#2
0
 /**
  * Extracts the text of every page of a PDF file, sorted by position on the page.
  *
  * <p>The file stream and both the COS-level and PD-level documents are released on every path,
  * including when parsing or text extraction fails.
  *
  * @param pdfFile the PDF file to read; passed through {@code checkNotNull} before use
  * @return the text returned by the stripper for pages 1 through the last page
  * @throws IOException if the file cannot be opened, parsed, stripped or closed
  */
 public static String extract(File pdfFile) throws IOException {
   checkNotNull(pdfFile, "pdfFile");
   // try-with-resources: the original never closed this stream.
   try (FileInputStream fileStream = new FileInputStream(pdfFile)) {
     PDFParser parser = new PDFParser(fileStream);
     parser.parse();
     COSDocument cosDoc = parser.getDocument();
     PDDocument pdDoc = new PDDocument(cosDoc);
     try {
       PDFTextStripper pdfStripper = new PDFTextStripper();
       pdfStripper.setStartPage(1);
       pdfStripper.setEndPage(pdDoc.getNumberOfPages());
       pdfStripper.setSortByPosition(true);
       return pdfStripper.getText(pdDoc);
     } finally {
       // Same close order as before (PD document, then COS document), but now also on failure.
       try {
         pdDoc.close();
       } finally {
         cosDoc.close();
       }
     }
   }
 }
  /**
   * Builds BibTeX entries from the text of the first page of a PDF.
   *
   * <p>Processing steps:
   *
   * <ol>
   *   <li>The PDF is loaded from {@code in}; if loading fails, the error is logged and an empty
   *       list is returned.
   *   <li>The text of page 1 is extracted, sorted by position.
   *   <li>A DOI lookup is attempted on the text. When the value returned by the DOI helper is
   *       shorter than the page text, it is treated as a found DOI and passed to the DOI fetcher;
   *       if the fetcher delivers at least one entry, those entries are returned.
   *   <li>Otherwise the page is parsed heuristically, line by line: an optional conference or
   *       copyright line, the title, the authors, then abstract / keywords / technical-report
   *       markers, and finally a backwards scan over the blocks for a Springer "(Eds.)" line, a
   *       DOI and IEEE information.
   * </ol>
   *
   * <p>The heuristic parser works on the instance fields {@code split}, {@code i},
   * {@code curString} and {@code year}, which are also used by the argument-less helper methods
   * called here.
   *
   * @param in stream to read the PDF from
   * @param status receives the user-facing message when the Bouncy Castle provider class is
   *     missing; also handed to the DOI fetcher
   * @return the entries obtained from the DOI fetcher, or a list with the single heuristically
   *     built entry; empty if the PDF could not be loaded or page 1 contains no non-empty line
   * @throws IOException if text extraction or closing the document fails (load failures are
   *     logged instead of thrown)
   */
  @Override
  public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) throws IOException {
    final List<BibtexEntry> res = new ArrayList<BibtexEntry>(1);

    PDDocument document;
    try {
      document = PDDocument.load(in);
    } catch (IOException e) {
      LOGGER.error("Could not load document", e);
      return res;
    }

    try {
      if (document.isEncrypted()) {
        LOGGER.error(Localization.lang("Encrypted documents are not supported"));
        // the early return is disabled, so extraction is still attempted
        // return res;
      }

      PDFTextStripper stripper = new PDFTextStripper();
      stripper.setStartPage(1);
      stripper.setEndPage(1);
      stripper.setSortByPosition(true);
      stripper.setParagraphEnd(System.lineSeparator());
      StringWriter writer = new StringWriter();
      stripper.writeText(document, writer);
      String textResult = writer.toString();

      String doi = new DOI(textResult).getDOI();
      if (doi.length() < textResult.length()) {
        // A Doi was found in the text
        // We do NO parsing of the text, but use the Doi fetcher

        // named "inspector" so that it does not shadow the line index field "i"
        ImportInspector inspector =
            new ImportInspector() {

              @Override
              public void toFront() {}

              @Override
              public void setProgress(int current, int max) {}

              @Override
              public void addEntry(BibtexEntry entry) {
                // add the entry to the result object
                res.add(entry);
              }
            };
        PdfContentImporter.doiToBibTeXFetcher.processQuery(doi, inspector, status);
        if (!res.isEmpty()) {
          // if something has been found, return the result
          return res;
        }
        // otherwise, we just parse the PDF
      }

      String author;
      String editor = null;
      // NOTE(review): institution is never assigned below, so it is never written to the entry
      String institution = null;
      String abstractT = null;
      String keywords = null;
      String title;
      String conference = null;
      String DOI = null;
      String series = null;
      String volume = null;
      String number = null;
      String pages = null;
      // year is a class variable as the method extractYear() uses it;
      String publisher = null;
      BibtexEntryType type = BibtexEntryTypes.INPROCEEDINGS;

      final String lineBreak = System.lineSeparator();

      split = textResult.split(lineBreak);

      // idea: split[] contains the different lines
      // blocks are separated by empty lines
      // treat each block
      //   or do special treatment at authors (which are not broken)
      //   therefore, we do a line-based and not a block-based splitting
      // i points to the current line
      // curString (mostly) contains the current block
      //   the different lines are joined into one and thereby separated by " "

      proceedToNextNonEmptyLine();
      if (i >= split.length) {
        // PDF could not be parsed or is empty
        // return empty list
        return res;
      }
      curString = split[i];
      i = i + 1;

      if (curString.length() > 4) {
        // special case: possibly conference as first line on the page
        extractYear();
        if (curString.contains("Conference")) {
          fillCurStringWithNonEmptyLines();
          conference = curString;
          curString = "";
        } else {
          // e.g. Copyright (c) 1998 by the Genetics Society of America
          // future work: get year using RegEx
          String lower = curString.toLowerCase();
          if (lower.contains("copyright")) {
            fillCurStringWithNonEmptyLines();
            publisher = curString;
            curString = "";
          }
        }
      }

      // start: title
      fillCurStringWithNonEmptyLines();
      title = streamlineTitle(curString);
      curString = "";
      // i points to the next non-empty line

      // after title: authors
      author = null;
      while (i < split.length && !split[i].equals("")) {
        // author names are unlikely to be split among different lines
        // treat them line by line
        curString = streamlineNames(split[i]);
        if (author == null) {
          author = curString;
        } else if (!curString.equals("")) {
          // if split[i] is "and" then "" is returned by streamlineNames -> nothing to append
          author = author.concat(" and ").concat(curString);
        }
        i++;
      }
      curString = "";
      i++;

      // then, abstract and keywords follow
      while (i < split.length) {
        curString = split[i];
        if (curString.length() >= "Abstract".length()
            && curString.substring(0, "Abstract".length()).equalsIgnoreCase("Abstract")) {
          if (curString.length() == "Abstract".length()) {
            // only word "abstract" found -- skip line
            curString = "";
          } else {
            // +1 skips the character following the keyword (e.g. ':' or '.')
            curString = curString.substring("Abstract".length() + 1).trim().concat(lineBreak);
          }
          i++;
          // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator
          // whereas we need linebreak as separator
          while (i < split.length && !split[i].equals("")) {
            curString = curString.concat(split[i]).concat(lineBreak);
            i++;
          }
          abstractT = curString;
          i++;
        } else if (curString.length() >= "Keywords".length()
            && curString.substring(0, "Keywords".length()).equalsIgnoreCase("Keywords")) {
          if (curString.length() == "Keywords".length()) {
            // only word "Keywords" found -- skip line
            curString = "";
          } else {
            curString = curString.substring("Keywords".length() + 1).trim();
          }
          i++;
          fillCurStringWithNonEmptyLines();
          keywords = removeNonLettersAtEnd(curString);
        } else {
          String lower = curString.toLowerCase();

          if (lower.indexOf("technical") >= 0) {
            type = BibtexEntryTypes.TECHREPORT;
            // Search and cut on the same trimmed string; the original searched the trimmed
            // line but cut the untrimmed one, which shifted the result when the line had
            // leading whitespace and kept trailing whitespace in the number.
            String trimmedLine = curString.trim();
            int lastSpace = trimmedLine.lastIndexOf(' ');
            if (lastSpace >= 0) {
              number = trimmedLine.substring(lastSpace + 1);
            }
          }

          i++;
          proceedToNextNonEmptyLine();
        }
      }

      i = split.length - 1;

      // last block: DOI, detailed information
      // sometimes, this information is in the third last block etc...
      // therefore, read until the beginning of the file

      while (i >= 0) {
        readLastBlock();
        // i now points to the block before or is -1
        // curString contains the last block, separated by " "

        extractYear();

        int pos = curString.indexOf("(Eds.)");
        if (pos >= 0 && publisher == null) {
          // looks like a Springer last line
          // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
          publisher = "Springer";
          if (pos > 0) {
            // pos - 1 drops the space before "(Eds.)"; guarded because a block that starts
            // with "(Eds.)" used to raise StringIndexOutOfBoundsException here
            editor = streamlineNames(curString.substring(0, pos - 1));
          }
          // +2 because of ":" after (Eds.) and the subsequent space
          int detailsStart = pos + "(Eds.)".length() + 2;
          curString =
              detailsStart <= curString.length() ? curString.substring(detailsStart) : "";
          String[] springerSplit = curString.split(", ");
          if (springerSplit.length >= 4) {
            conference = springerSplit[0];

            String seriesData = springerSplit[1];
            int lastSpace = seriesData.lastIndexOf(' ');
            if (lastSpace >= 0) {
              series = seriesData.substring(0, lastSpace);
              volume = seriesData.substring(lastSpace + 1);
            } else {
              // no volume part present; keep the whole token as series instead of failing
              series = seriesData;
            }

            // skip the leading "pp. " when the token is long enough to contain it
            String pagesData = springerSplit[2];
            pages = pagesData.length() >= 4 ? pagesData.substring(4) : pagesData;

            if (springerSplit[3].length() >= 4) {
              year = springerSplit[3].substring(0, 4);
            }
          }
        } else {
          if (DOI == null) {
            pos = curString.indexOf("DOI");
            if (pos < 0) {
              pos = curString.indexOf("doi");
            }
            if (pos >= 0) {
              pos += 3;
              // guard: "DOI" at the very end of the block has no delimiter and no value
              if (pos < curString.length()) {
                char delimiter = curString.charAt(pos);
                if (delimiter == ':' || delimiter == ' ') {
                  pos++;
                }
                int nextSpace = curString.indexOf(' ', pos);
                if (nextSpace > 0) {
                  DOI = curString.substring(pos, nextSpace);
                } else {
                  DOI = curString.substring(pos);
                }
              }
            }
          }

          if (publisher == null && curString.contains("IEEE")) {
            // IEEE has the conference things at the end
            publisher = "IEEE";

            // year is extracted by extractYear

            if (conference == null) {
              pos = curString.indexOf('$');
              if (pos > 0) {
                // we found the price
                // before the price, the ISSN is stated
                // skip that
                pos -= 2;
                while (pos >= 0 && curString.charAt(pos) != ' ') {
                  pos--;
                }
                if (pos > 0) {
                  conference = curString.substring(0, pos);
                }
              }
            }
          }
        }
      }

      BibtexEntry entry = new BibtexEntry();
      entry.setType(type);

      if (author != null) {
        entry.setField("author", author);
      }
      if (editor != null) {
        entry.setField("editor", editor);
      }
      if (institution != null) {
        entry.setField("institution", institution);
      }
      if (abstractT != null) {
        entry.setField("abstract", abstractT);
      }
      if (keywords != null) {
        entry.setField("keywords", keywords);
      }
      if (title != null) {
        entry.setField("title", title);
      }
      if (conference != null) {
        entry.setField("booktitle", conference);
      }
      if (DOI != null) {
        entry.setField("doi", DOI);
      }
      if (series != null) {
        entry.setField("series", series);
      }
      if (volume != null) {
        entry.setField("volume", volume);
      }
      if (number != null) {
        entry.setField("number", number);
      }
      if (pages != null) {
        entry.setField("pages", pages);
      }
      if (year != null) {
        entry.setField("year", year);
      }
      if (publisher != null) {
        entry.setField("publisher", publisher);
      }

      // the complete page text is kept in the "review" field
      entry.setField("review", textResult);

      res.add(entry);
    } catch (NoClassDefFoundError e) {
      // constant-first comparison: getMessage() may be null
      if ("org/bouncycastle/jce/provider/BouncyCastleProvider".equals(e.getMessage())) {
        status.showMessage(
            Localization.lang(
                "Java Bouncy Castle library not found. Please download and install it. For more information see http://www.bouncycastle.org/."));
      } else {
        LOGGER.error("Could not find class", e);
      }
    } finally {
      document.close();
    }

    return res;
  }
示例#4
0
 /**
  * Default constructor; enables position-based sorting via the superclass setter.
  *
  * <p>{@code setSortByPosition} is invoked explicitly through {@code super}, so the superclass
  * implementation is the one that runs.
  *
  * @throws IOException If there is an error loading text stripper properties.
  */
 public PrintTextLocations() throws IOException {
   super.setSortByPosition(true);
 }