Example #1
 public static String extractText(InputStream src) throws IOException {
   StringBuilder text = new StringBuilder();
   COSDocument cosDoc = null;
   PDDocument pdDoc = null;
   try {
     PDFParser parser = new PDFParser(src);
     parser.parse();
     cosDoc = parser.getDocument();
     PDFTextStripper stripper = new PDFTextStripper();
     pdDoc = new PDDocument(cosDoc);
     int nbPages = pdDoc.getDocumentCatalog().getPages().getCount();
     for (int i = 0; i < nbPages; i++) {
       stripper.setStartPage(i + 1);
       stripper.setEndPage(i + 1);
       text.append(stripper.getText(pdDoc));
     }
   } finally {
     try {
       if (cosDoc != null) {
         cosDoc.close();
       }
     } catch (IOException e) {
        // Ignore; a failure to close is non-fatal here
     }
     try {
       if (pdDoc != null) {
         pdDoc.close();
       }
     } catch (IOException e) {
        // Ignore; a failure to close is non-fatal here
     }
   }
   return text.toString();
 }
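A minimal usage sketch for the helper above, assuming a hypothetical sample.pdf and the same java.io/PDFBox imports as the example. extractText closes the parsed documents itself, but the caller owns the stream:

  public static void main(String[] args) throws IOException {
    // try-with-resources closes the stream that the helper does not own
    try (InputStream in = new FileInputStream("sample.pdf")) {
      System.out.println(extractText(in));
    }
  }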
Example #2
 public static String extract(File pdfFile) throws IOException {
   checkNotNull(pdfFile, "pdfFile");
   // Close the stream and the document even when parsing fails;
   // closing the PDDocument also closes the underlying COSDocument.
   try (InputStream in = new FileInputStream(pdfFile)) {
     PDFParser parser = new PDFParser(in);
     parser.parse();
     COSDocument cosDoc = parser.getDocument();
     PDFTextStripper pdfStripper = new PDFTextStripper();
     try (PDDocument pdDoc = new PDDocument(cosDoc)) {
       pdfStripper.setStartPage(1);
       pdfStripper.setEndPage(pdDoc.getNumberOfPages());
       pdfStripper.setSortByPosition(true);
       return pdfStripper.getText(pdDoc);
     }
   }
 }
Example #3
  /**
   * Indexes a single PDF file.
   *
   * @param f the PDF file
   * @param writer the IndexWriter
   * @throws IOException
   */
  public static void indexFile(File f, IndexWriter writer) throws IOException {

    // Load the file with PDFBox
    PDDocument pddDocument = PDDocument.load(f.getAbsolutePath());
    PDFTextStripper textStripper = new PDFTextStripper();
    int numPages = pddDocument.getNumberOfPages();
    String pageContent;

    // Declare a custom Field type
    FieldType fieldText = new FieldType();
    fieldText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    fieldText.setStored(false);
    fieldText.setStoreTermVectorOffsets(true);
    fieldText.setStoreTermVectorPositions(true);
    fieldText.setStoreTermVectors(true);

    // Walk through and index each page of the file, storing the page number and
    // the file title, and indexing the content
    for (int i = 1; i <= numPages; i++) {
      // page numbers in PDFTextStripper are 1-based
      textStripper.setStartPage(i);
      textStripper.setEndPage(i);
      // extract a single page
      pageContent = textStripper.getText(pddDocument);
      if (pageContent != null && !pageContent.isEmpty()) {
        pageContent = pageContent.toLowerCase();
      }

      if (pageContent != null) {
        // Build the Lucene document to index for this page
        // (the field names below are illustrative)
        Document doc = new Document();

        // Page number
        doc.add(new StoredField("page", i));
        // Page content
        doc.add(new Field("contents", pageContent, fieldText));
        // File title
        doc.add(new StringField("title", f.getName(), Field.Store.YES));

        // Add the document
        writer.addDocument(doc);
      }
    }

    // Close the PDF file
    pddDocument.close();
  }
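A hedged usage sketch for indexFile, assuming Lucene 5.x (to match the IndexOptions API above) and a hypothetical index directory name:

  public static void main(String[] args) throws IOException {
    // Open a filesystem index and write one PDF into it
    try (Directory dir = FSDirectory.open(Paths.get("lucene-index"));
         IndexWriter writer =
             new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
      indexFile(new File("sample.pdf"), writer);
      writer.commit();
    }
  }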
Example #4
  public static void main(String[] args) throws Exception {

    File file = new File("C:/Users/jatin.goyal/Desktop/demoexcel.pdf");
    PDDocument pd = PDDocument.load(file);
    System.out.println(pd.getNumberOfPages());
    PDFTextStripper st = new PDFTextStripper();
    st.setStartPage(1);
    // st.setEndPage(4);

    // PDFTextStripperByArea stripper = new PDFTextStripperByArea();
    // stripper.setSortByPosition(true);
    // Rectangle rect1 = new Rectangle(50, 140, 60, 20);
    // Rectangle rect2 = new Rectangle(110, 140, 20, 20);
    // stripper.addRegion("row1column1", rect1);
    // stripper.addRegion("row1column2", rect2);
    // List allPages = pd.getDocumentCatalog().getAllPages();
    // PDPage firstPage = (PDPage) allPages.get(0);
    // stripper.extractRegions(firstPage);
    // System.out.println(stripper.getTextForRegion("row1column1"));
    // System.out.println(stripper.getTextForRegion("row1column2"));

    System.out.println(st.getText(pd));
    pd.close();
  }
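The commented-out block above hints at region-based extraction. Here is a runnable sketch of it under the same PDFBox 1.x API, with the rectangles and region names taken from those comments (java.awt.Rectangle assumed):

  public static void printRegions(PDDocument pd) throws IOException {
    PDFTextStripperByArea stripper = new PDFTextStripperByArea();
    stripper.setSortByPosition(true);
    // Page-space regions: x, y, width, height
    stripper.addRegion("row1column1", new Rectangle(50, 140, 60, 20));
    stripper.addRegion("row1column2", new Rectangle(110, 140, 20, 20));
    PDPage firstPage = (PDPage) pd.getDocumentCatalog().getAllPages().get(0);
    stripper.extractRegions(firstPage);
    System.out.println(stripper.getTextForRegion("row1column1"));
    System.out.println(stripper.getTextForRegion("row1column2"));
  }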
Example #5
  @Override
  public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) throws IOException {
    final ArrayList<BibtexEntry> res = new ArrayList<BibtexEntry>(1);

    PDDocument document;
    try {
      document = PDDocument.load(in);
    } catch (IOException e) {
      LOGGER.error("Could not load document", e);
      return res;
    }

    try {
      if (document.isEncrypted()) {
        LOGGER.error(Localization.lang("Encrypted documents are not supported"));
        // return res;
      }

      PDFTextStripper stripper = new PDFTextStripper();
      stripper.setStartPage(1);
      stripper.setEndPage(1);
      stripper.setSortByPosition(true);
      stripper.setParagraphEnd(System.lineSeparator());
      StringWriter writer = new StringWriter();
      stripper.writeText(document, writer);
      String textResult = writer.toString();

      String doi = new DOI(textResult).getDOI();
      if (doi.length() < textResult.length()) {
        // A DOI was found in the text.
        // We do NO parsing of the text, but use the DOI fetcher

        ImportInspector i =
            new ImportInspector() {

              @Override
              public void toFront() {}

              @Override
              public void setProgress(int current, int max) {}

              @Override
              public void addEntry(BibtexEntry entry) {
                // add the entry to the result object
                res.add(entry);
              }
            };
        PdfContentImporter.doiToBibTeXFetcher.processQuery(doi, i, status);
        if (!res.isEmpty()) {
          // if something has been found, return the result
          return res;
        } else {
          // otherwise, we just parse the PDF
        }
      }

      String author;
      String editor = null;
      String institution = null;
      String abstractT = null;
      String keywords = null;
      String title;
      String conference = null;
      String DOI = null;
      String series = null;
      String volume = null;
      String number = null;
      String pages = null;
      // year is a class variable as the method extractYear() uses it;
      String publisher = null;
      BibtexEntryType type = BibtexEntryTypes.INPROCEEDINGS;

      final String lineBreak = System.lineSeparator();

      split = textResult.split(lineBreak);

      // idea: split[] contains the different lines
      // blocks are separated by empty lines
      // treat each block
      //   or do special treatment at authors (which are not broken)
      //   therefore, we do a line-based and not a block-based splitting
      // i points to the current line
      // curString (mostly) contains the current block
      //   the different lines are joined into one and thereby separated by " "

      proceedToNextNonEmptyLine();
      if (i >= split.length) {
        // PDF could not be parsed or is empty
        // return empty list
        return res;
      }
      curString = split[i];
      i = i + 1;

      if (curString.length() > 4) {
        // special case: possibly conference as first line on the page
        extractYear();
        if (curString.contains("Conference")) {
          fillCurStringWithNonEmptyLines();
          conference = curString;
          curString = "";
        } else {
          // e.g. Copyright (c) 1998 by the Genetics Society of America
          // future work: get year using RegEx
          String lower = curString.toLowerCase();
          if (lower.contains("copyright")) {
            fillCurStringWithNonEmptyLines();
            publisher = curString;
            curString = "";
          }
        }
      }

      // start: title
      fillCurStringWithNonEmptyLines();
      title = streamlineTitle(curString);
      curString = "";
      // i points to the next non-empty line

      // after title: authors
      author = null;
      while (i < split.length && !split[i].equals("")) {
        // author names are unlikely to be split among different lines
        // treat them line by line
        curString = streamlineNames(split[i]);
        if (author == null) {
          author = curString;
        } else {
          if (curString.equals("")) {
            // if split[i] is "and" then "" is returned by streamlineNames -> do nothing
          } else {
            author = author.concat(" and ").concat(curString);
          }
        }
        i++;
      }
      curString = "";
      i++;

      // then, abstract and keywords follow
      while (i < split.length) {
        curString = split[i];
        if (curString.length() >= "Abstract".length()
            && curString.substring(0, "Abstract".length()).equalsIgnoreCase("Abstract")) {
          if (curString.length() == "Abstract".length()) {
            // only word "abstract" found -- skip line
            curString = "";
          } else {
            curString = curString.substring("Abstract".length() + 1).trim().concat(lineBreak);
          }
          i++;
          // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator
          // whereas we need linebreak as separator
          while (i < split.length && !split[i].equals("")) {
            curString = curString.concat(split[i]).concat(lineBreak);
            i++;
          }
          abstractT = curString;
          i++;
        } else if (curString.length() >= "Keywords".length()
            && curString.substring(0, "Keywords".length()).equalsIgnoreCase("Keywords")) {
          if (curString.length() == "Keywords".length()) {
            // only word "Keywords" found -- skip line
            curString = "";
          } else {
            curString = curString.substring("Keywords".length() + 1).trim();
          }
          i++;
          fillCurStringWithNonEmptyLines();
          keywords = removeNonLettersAtEnd(curString);
        } else {
          String lower = curString.toLowerCase();

          int pos = lower.indexOf("technical");
          if (pos >= 0) {
            type = BibtexEntryTypes.TECHREPORT;
            pos = curString.trim().lastIndexOf(' ');
            if (pos >= 0) {
              // assumption: last character of curString is NOT ' '
              //   otherwise pos+1 leads to an out-of-bounds exception
              number = curString.substring(pos + 1);
            }
          }

          i++;
          proceedToNextNonEmptyLine();
        }
      }

      i = split.length - 1;

      // last block: DOI, detailed information
      // sometimes, this information is in the third last block etc...
      // therefore, read until the beginning of the file

      while (i >= 0) {
        readLastBlock();
        // i now points to the block before or is -1
        // curString contains the last block, separated by " "

        extractYear();

        int pos = curString.indexOf("(Eds.)");
        if (pos >= 0 && publisher == null) {
          // looks like a Springer last line
          // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
          publisher = "Springer";
          editor = streamlineNames(curString.substring(0, pos - 1));
          curString =
              curString.substring(
                  pos
                      + "(Eds.)".length()
                      + 2); // +2 because of ":" after (Eds.) and the subsequent space
          String[] springerSplit = curString.split(", ");
          if (springerSplit.length >= 4) {
            conference = springerSplit[0];

            String seriesData = springerSplit[1];
            int lastSpace = seriesData.lastIndexOf(' ');
            series = seriesData.substring(0, lastSpace);
            volume = seriesData.substring(lastSpace + 1);

            pages = springerSplit[2].substring(4);

            if (springerSplit[3].length() >= 4) {
              year = springerSplit[3].substring(0, 4);
            }
          }
        } else {
          if (DOI == null) {
            pos = curString.indexOf("DOI");
            if (pos < 0) {
              pos = curString.indexOf("doi");
            }
            if (pos >= 0) {
              pos += 3;
              char delimiter = curString.charAt(pos);
              if (delimiter == ':' || delimiter == ' ') {
                pos++;
              }
              int nextSpace = curString.indexOf(' ', pos);
              if (nextSpace > 0) {
                DOI = curString.substring(pos, nextSpace);
              } else {
                DOI = curString.substring(pos);
              }
            }
          }

          if (publisher == null && curString.contains("IEEE")) {
            // IEEE has the conference things at the end
            publisher = "IEEE";

            // year is extracted by extractYear
            // otherwise, we could determine it as follows:
            // String yearStr = curString.substring(curString.length()-4);
            // if (isYear(yearStr)) {
            //	year = yearStr;
            // }

            if (conference == null) {
              pos = curString.indexOf('$');
              if (pos > 0) {
                // we found the price
                // before the price, the ISSN is stated
                // skip that
                pos -= 2;
                while (pos >= 0 && curString.charAt(pos) != ' ') {
                  pos--;
                }
                if (pos > 0) {
                  conference = curString.substring(0, pos);
                }
              }
            }
          }

          //					String lower = curString.toLowerCase();
          //					if (institution == null) {
          //
          //					}

        }
      }

      BibtexEntry entry = new BibtexEntry();
      entry.setType(type);

      if (author != null) {
        entry.setField("author", author);
      }
      if (editor != null) {
        entry.setField("editor", editor);
      }
      if (institution != null) {
        entry.setField("institution", institution);
      }
      if (abstractT != null) {
        entry.setField("abstract", abstractT);
      }
      if (keywords != null) {
        entry.setField("keywords", keywords);
      }
      if (title != null) {
        entry.setField("title", title);
      }
      if (conference != null) {
        entry.setField("booktitle", conference);
      }
      if (DOI != null) {
        entry.setField("doi", DOI);
      }
      if (series != null) {
        entry.setField("series", series);
      }
      if (volume != null) {
        entry.setField("volume", volume);
      }
      if (number != null) {
        entry.setField("number", number);
      }
      if (pages != null) {
        entry.setField("pages", pages);
      }
      if (year != null) {
        entry.setField("year", year);
      }
      if (publisher != null) {
        entry.setField("publisher", publisher);
      }

      entry.setField("review", textResult);

      res.add(entry);
    } catch (NoClassDefFoundError e) {
      if (e.getMessage().equals("org/bouncycastle/jce/provider/BouncyCastleProvider")) {
        status.showMessage(
            Localization.lang(
                "Java Bouncy Castle library not found. Please download and install it. For more information see http://www.bouncycastle.org/."));
      } else {
        LOGGER.error("Could not find class", e);
      }
    } finally {
      document.close();
    }

    return res;
  }
Example #6
  @Override
  public Document[] parse(
      final AnchorURL location,
      final String mimeType,
      final String charset,
      final VocabularyScraper scraper,
      final int timezoneOffset,
      final InputStream source)
      throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
      throw new Parser.Failure(
          "Not enough Memory available for pdf parser: " + MemoryControl.available(), location);

    // create a pdf parser
    PDDocument pdfDoc;
    try {
      Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
      // pdfDoc = PDDocument.load(source);
      final PDFParser pdfParser = new PDFParser(source);
      pdfParser.setTempDirectory(new File(System.getProperty("java.io.tmpdir")));
      pdfParser.parse();
      pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
      throw new Parser.Failure(e.getMessage(), location);
    } finally {
      Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    if (pdfDoc.isEncrypted()) {
      try {
        pdfDoc.openProtection(new StandardDecryptionMaterial(""));
      } catch (final BadSecurityHandlerException e) {
        try {
          pdfDoc.close();
        } catch (final IOException ee) {
        }
        throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
      } catch (final IOException e) {
        try {
          pdfDoc.close();
        } catch (final IOException ee) {
        }
        throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
      } catch (final CryptographyException e) {
        try {
          pdfDoc.close();
        } catch (final IOException ee) {
        }
        throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
      }
      final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
      if (perm == null || !perm.canExtractContent()) {
        try {
          pdfDoc.close();
        } catch (final IOException ee) {
        }
        throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
      }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null,
        docSubject = null,
        docAuthor = null,
        docPublisher = null,
        docKeywordStr = null;
    Date docDate = new Date();
    if (info != null) {
      docTitle = info.getTitle();
      docSubject = info.getSubject();
      docAuthor = info.getAuthor();
      docPublisher = info.getProducer();
      if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator();
      docKeywordStr = info.getKeywords();
      try {
        if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime();
      } catch (IOException e) {
      }
      // unused:
      // info.getTrapped());
    }
    info = null;

    if (docTitle == null || docTitle.isEmpty()) {
      docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
      docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
      docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
      // get the links
      pdflinks = extractPdfLinks(pdfDoc);

      // get the fulltext (either per document or for each page)
      final PDFTextStripper stripper = new PDFTextStripper(StandardCharsets.UTF_8.name());

      if (individualPages) {
        // this is a hack which stores individual pages of the source pdf into individual index
        // documents
        // the new documents will get a virtual link with a post argument page=X appended to the
        // original url

        // collect text
        int pagecount = pdfDoc.getNumberOfPages();
        String[] pages = new String[pagecount];
        for (int page = 1; page <= pagecount; page++) {
          stripper.setStartPage(page);
          stripper.setEndPage(page);
          pages[page - 1] = stripper.getText(pdfDoc);
          // System.out.println("PAGE " + page + ": " + pages[page - 1]);
        }

        // create individual documents for each page
        assert pages.length == pdflinks.length
            : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length;
        result = new Document[Math.min(pages.length, pdflinks.length)];
        String loc = location.toNormalform(true);
        for (int page = 0; page < result.length; page++) {
          result[page] =
              new Document(
                  new AnchorURL(
                      loc
                          + (loc.indexOf('?') > 0 ? '&' : '?')
                          + individualPagePropertyname
                          + '='
                          + (page
                              + 1)), // these are virtual new pages; we cannot combine them with '#'
                                     // as that would be removed when computing the urlhash
                  mimeType,
                  StandardCharsets.UTF_8.name(),
                  this,
                  null,
                  docKeywords,
                  singleList(docTitle),
                  docAuthor,
                  docPublisher,
                  null,
                  null,
                  0.0f,
                  0.0f,
                  pages == null || page >= pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                  pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
                  null,
                  null,
                  false,
                  docDate);
        }
      } else {
        // collect the whole text at once
        final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
        byte[] contentBytes = new byte[0];
        stripper.setEndPage(3); // get first 3 pages (always)
        writer.append(stripper.getText(pdfDoc));
        contentBytes = writer.getBytes(); // remember text in case of interrupting thread

        if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
          stripper.setStartPage(4); // continue with page 4 (if the thread is terminated early, the remaining pages simply yield no text)
          stripper.setEndPage(Integer.MAX_VALUE); // set to default
          // we start the pdf parsing in a separate thread to ensure that it can be terminated
          final PDDocument pdfDocC = pdfDoc;
          final Thread t =
              new Thread() {
                @Override
                public void run() {
                  Thread.currentThread().setName("pdfParser.getText:" + location);
                  try {
                    writer.append(stripper.getText(pdfDocC));
                  } catch (final Throwable e) {
                  }
                }
              };
          t.start();
          t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
          if (t.isAlive()) t.interrupt();
        }
        contentBytes = writer.getBytes(); // get final text before closing writer

        Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
        for (Collection<AnchorURL> pdflinksx : pdflinks)
          if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
        result =
            new Document[] {
              new Document(
                  location,
                  mimeType,
                  StandardCharsets.UTF_8.name(),
                  this,
                  null,
                  docKeywords,
                  singleList(docTitle),
                  docAuthor,
                  docPublisher,
                  null,
                  null,
                  0.0f,
                  0.0f,
                  contentBytes,
                  pdflinksCombined,
                  null,
                  null,
                  false,
                  docDate)
            };
      }
    } catch (final Throwable e) {
      // close the writer (in finally)
      // throw new Parser.Failure(e.getMessage(), location);
    } finally {
      try {
        pdfDoc.close();
      } catch (final Throwable e) {
      }
    }

    // clear resources in pdfbox. they say this is resolved, but it is not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still generates an enormous number of object allocations and never releases them;
    // the following objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the large number of these objects is easy to see in Java VisualVM.
    // we try to force this garbage out of memory here with explicit clear calls
    // and hope that the rubbish gets collected.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
  }
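The font-cache cleanup helper called above is not shown in this snippet. A minimal sketch of what such a method might do, assuming PDFBox 1.x, where statically cached font resources can be dropped via PDFont.clearResources() (an assumption, not necessarily the project's actual implementation):

  private static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
    // Assumed PDFBox 1.x hook that releases statically cached font resources
    PDFont.clearResources();
  }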