/**
 * This will add the contents to the lucene document.
 *
 * @param document The document to add the contents to.
 * @param is The stream to get the contents from.
 * @param documentLocation The location of the document, used just for debug messages.
 * @throws IOException If there is an error parsing the document.
 */
private void addContent(Document document, InputStream is, String documentLocation)
    throws IOException {
  PDDocument pdfDocument = null;
  PDFTextStripper stripper;
  try {
    pdfDocument = PDDocument.load(is);
    if (pdfDocument.isEncrypted()) {
      // Just try using the default password and move on
      pdfDocument.decrypt("");
    }

    // create a writer where to append the text content.
    StringWriter writer = new StringWriter();
    stripper = new PDFTextStripper();
    try {
      stripper.writeText(pdfDocument, writer);
    } catch (Exception e) {
      System.out.println("Error in stripper.writeText(): " + e);
    }

    String contents = writer.getBuffer().toString();
    StringReader reader = new StringReader(contents);
    addTextField(document, Indexer.contents, reader);

    PDDocumentInformation info = pdfDocument.getDocumentInformation();
    if (info != null) {
      addTextField(document, Indexer.Author, info.getAuthor());
      try {
        addTextField(document, Indexer.created, info.getCreationDate());
      } catch (IOException io) {
        // ignore, bad date but continue with indexing
      }
      addTextField(document, Indexer.keywords, info.getKeywords());
      try {
        addTextField(document, Indexer.modified, info.getModificationDate());
      } catch (IOException io) {
        // ignore, bad date but continue with indexing
      }
      addTextField(document, "Subject", info.getSubject());
      addTextField(document, Indexer.Title, info.getTitle());
    }

    int summarySize = Math.min(contents.length(), 500);
    String summary = contents.substring(0, summarySize);
    // Add the summary as an UnIndexed field, so that it is stored and returned
    // with hit documents for display.
    addUnindexedField(document, Indexer.summary, summary);
  } catch (CryptographyException e) {
    throw new IOException("Error decrypting document(" + documentLocation + "): " + e);
  } catch (InvalidPasswordException e) {
    // they didn't supply a password and the default of "" was wrong.
    throw new IOException(
        "Error: The document(" + documentLocation + ") is encrypted and will not be indexed.");
  } finally {
    if (pdfDocument != null) {
      pdfDocument.close();
    }
  }
}
private String pdfToText(InputStream in) {
  PDFParser parser = null;
  PDDocument pdDoc = null;
  COSDocument cosDoc = null;
  PDFTextStripper pdfStripper;
  try {
    parser = new PDFParser(in);
    parser.parse();
    cosDoc = parser.getDocument();
    pdfStripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    return pdfStripper.getText(pdDoc);
    // System.out.println(parsedText.replaceAll("[^A-Za-z0-9. ]+", ""));
  } catch (Exception e) {
    e.printStackTrace();
  } finally {
    // close the documents on the success path as well, not only on failure
    try {
      if (cosDoc != null) cosDoc.close();
      if (pdDoc != null) pdDoc.close();
    } catch (Exception e1) {
      e1.printStackTrace();
    }
  }
  return null;
}
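// NOTE (not part of the original snippets): several of the methods in this collection only close
// the PDF on the error path. A minimal sketch of the same extraction with try-with-resources,
// assuming PDFBox 2.x, where PDDocument implements Closeable and PDDocument.load(InputStream)
// replaces the PDFParser/COSDocument handling:
import java.io.IOException;
import java.io.InputStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

public final class PdfToTextSketch {

  /** Returns the full text of the PDF, or null if it cannot be parsed. */
  public static String pdfToText(InputStream in) {
    // try-with-resources closes the document on both the success and the failure path
    try (PDDocument doc = PDDocument.load(in)) {
      return new PDFTextStripper().getText(doc);
    } catch (IOException e) {
      e.printStackTrace();
      return null;
    }
  }
}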
public static String getFileContent(File file) throws FileNotFoundException, IOException {
  String ext = FilenameUtils.getExtension(file.getName());
  String outContent = "";
  try {
    if (ext.toLowerCase().equals("doc")) {
      if (file != null) {
        WordExtractor we = new WordExtractor(new FileInputStream(file));
        outContent = we.getText();
      } else {
        logger.warning("file not found : " + file);
      }
    } else if (ext.toLowerCase().equals("pdf")) {
      PDDocument doc = PDDocument.load(file);
      PDFTextStripper text = new PDFTextStripper();
      outContent = text.getText(doc);
      doc.close();
    } else if (StringHelper.isHTML(file.getName())) {
      return loadStringFromFile(file);
    }
  } catch (Throwable t) {
    logger.warning("error when reading : " + file + " [" + t.getMessage() + "]");
    t.printStackTrace();
  }
  return outContent;
}
public static String extractText(InputStream src) throws IOException {
  StringBuilder text = new StringBuilder();
  COSDocument cosDoc = null;
  PDDocument pdDoc = null;
  try {
    PDFParser parser = new PDFParser(src);
    parser.parse();
    cosDoc = parser.getDocument();
    PDFTextStripper stripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    int nbPages = pdDoc.getDocumentCatalog().getPages().getCount();
    for (int i = 0; i < nbPages; i++) {
      stripper.setStartPage(i + 1);
      stripper.setEndPage(i + 1);
      text.append(stripper.getText(pdDoc));
    }
  } finally {
    try {
      if (cosDoc != null) {
        cosDoc.close();
      }
    } catch (IOException e) {
      // Do nada
    }
    try {
      if (pdDoc != null) {
        pdDoc.close();
      }
    } catch (IOException e) {
      // Do nada
    }
  }
  return text.toString();
}
/**
 * Extracts the text of a PDF supplied as an input stream (rather than a file name) and returns it
 * as a character array.
 *
 * @param filesInputStream An input stream pointing to a PDF file.
 * @return The extracted text.
 * @throws IOException If the PDF cannot be loaded or parsed.
 */
private static char[] loadPDF(InputStream filesInputStream) throws IOException {
  PDDocument doc = PDDocument.load(filesInputStream);
  PDFTextStripper pdfStripper = new PDFTextStripper();
  pdfStripper.setSortByPosition(false);
  char[] origText = pdfStripper.getText(doc).toCharArray();
  doc.close();
  return origText;
}
public static String getContent(PDFParser parser) throws IOException {
  parser.parse();
  COSDocument cosDoc = parser.getDocument();
  PDFTextStripper pdfStripper = new PDFTextStripper();
  PDDocument pdDoc = new PDDocument(cosDoc);
  String content = pdfStripper.getText(pdDoc);
  cosDoc.close();
  pdDoc.close();
  return content;
}
/** Extracts the textual contents from a PDF file as one long string. */
public String extractPDFContents(File f) throws IOException {
  FileInputStream fi = new FileInputStream(f);
  PDFParser parser = new PDFParser(fi);
  parser.parse();
  fi.close();
  COSDocument cd = parser.getDocument();
  PDFTextStripper stripper = new PDFTextStripper();
  // keep a reference to the PDDocument so it can be closed as well
  PDDocument pd = new PDDocument(cd);
  String result = stripper.getText(pd);
  pd.close();
  cd.close();
  return result;
}
public static String extract(File pdfFile) throws IOException {
  checkNotNull(pdfFile, "pdfFile");
  PDFParser parser = new PDFParser(new FileInputStream(pdfFile));
  parser.parse();
  COSDocument cosDoc = parser.getDocument();
  PDFTextStripper pdfStripper = new PDFTextStripper();
  PDDocument pdDoc = new PDDocument(cosDoc);
  pdfStripper.setStartPage(1);
  pdfStripper.setEndPage(pdDoc.getNumberOfPages());
  pdfStripper.setSortByPosition(true);
  String pdfText = pdfStripper.getText(pdDoc);
  pdDoc.close();
  cosDoc.close();
  return pdfText;
}
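// NOTE (not part of the original snippets): a hypothetical usage sketch showing the same stripper
// settings applied to a page range instead of the whole document, assuming PDFBox 2.x; the path
// "sample.pdf" is a placeholder:
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

public class PageRangeExtractSketch {

  public static void main(String[] args) throws IOException {
    try (PDDocument doc = PDDocument.load(new File("sample.pdf"))) {
      PDFTextStripper stripper = new PDFTextStripper();
      stripper.setSortByPosition(true); // order text by position on the page
      stripper.setStartPage(2); // 1-based, inclusive
      stripper.setEndPage(3); // 1-based, inclusive
      System.out.println(stripper.getText(doc));
    }
  }
}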
ExtractPageContent(String filePath) {
  this.filePath = filePath;
  try {
    reader = new PdfReader(filePath);
    parser = new PdfReaderContentParser(reader);
    getContents();
  } catch (Exception e) {
    // fall back to PDFBox if iText cannot read the file
    try {
      PDDocument doc = PDDocument.load(filePath);
      PDFTextStripper stripper = new PDFTextStripper();
      this.fileContents = stripper.getText(doc);
      doc.close();
    } catch (IOException e1) {
      // the file could not be read with either library; leave fileContents unset
      // e1.printStackTrace();
    }
  }
}
/**
 * Indexes a single PDF file.
 *
 * @param f the PDF file
 * @param writer the IndexWriter
 * @throws IOException
 */
public static void indexFile(File f, IndexWriter writer) throws IOException {
  // Load the file with PDFBox
  PDDocument pddDocument = PDDocument.load(f.getAbsolutePath());
  PDFTextStripper textStripper = new PDFTextStripper();
  int numPages = pddDocument.getNumberOfPages();
  String pageContent;

  // Declare our own Field type
  FieldType fieldText = new FieldType();
  fieldText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  fieldText.setStored(false);
  fieldText.setStoreTermVectorOffsets(true);
  fieldText.setStoreTermVectorPositions(true);
  fieldText.setStoreTermVectors(true);

  // Walk and index every page of the file, storing the page number and the file title and
  // indexing the content. PDFTextStripper pages are 1-based, so iterate from 1 to numPages.
  for (int i = 1; i <= numPages; i++) {
    textStripper.setStartPage(i);
    textStripper.setEndPage(i);
    // grab one page
    pageContent = textStripper.getText(pddDocument);
    if (pageContent != null && !pageContent.isEmpty()) {
      pageContent = pageContent.toLowerCase();
    }
    if (pageContent != null) {
      // Build the document to index for this page:
      //   page number
      //   page content
      //   file title
      // Add the document to the writer
    }
  }
  // Close the PDF file
  pddDocument.close();
}
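// NOTE (not part of the original snippets): the indexing step in indexFile() above is only
// sketched in comments. A hypothetical helper showing what building and adding one Lucene
// document per page could look like; the field names "content", "page" and "title" are invented
// for illustration, fieldText is the FieldType declared in indexFile(), and the Lucene classes
// org.apache.lucene.document.{Document, Field, StoredField, StringField} are assumed imported:
static void addPageDocument(
    IndexWriter writer, FieldType fieldText, String title, int pageNumber, String pageContent)
    throws IOException {
  Document doc = new Document();
  doc.add(new Field("content", pageContent, fieldText)); // indexed with term vectors, not stored
  doc.add(new StoredField("page", pageNumber)); // stored page number
  doc.add(new StringField("title", title, Field.Store.YES)); // file title
  writer.addDocument(doc);
}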
public static void main(String[] args) {
  PDDocument pd;
  try {
    File input = new File("pdf/1.pdf"); // The PDF file from where you would like to extract
    pd = PDDocument.load(input);
    int numberOfPages = pd.getNumberOfPages();
    PDFTextStripper stripper = new PDFTextStripper();
    String fullText = stripper.getText(pd);
    int indexReferences = fullText.lastIndexOf("References\n");
    String textOutReferences =
        fullText.substring(0, indexReferences > 0 ? indexReferences : fullText.length());
    String textOutStop = removeStopWords(textOutReferences);
    findMoreCiteds(textOutStop);
    extractReferences(fullText);
    stripper.setEndPage(3);
    String startText = stripper.getText(pd);
    System.out.println("Autores");
    extractAuthor(startText);
    System.out.println("Objetivos");
    extractObjective(startText);
    System.out.println("\n\nProblemas");
    extractProblem(startText);
    System.out.println("\n\nMetodologia");
    extractMethodology(fullText);
    System.out.println("\n\nContribuições");
    extractContributes(fullText);
    pd.close();
  } catch (Exception e) {
    e.printStackTrace();
  }
}
public static void main(String[] args) throws Exception {
  File file = new File("C:/Users/jatin.goyal/Desktop/demoexcel.pdf");
  PDDocument pd = PDDocument.load(file);
  System.out.println(pd.getNumberOfPages());
  PDFTextStripper st = new PDFTextStripper();
  st.setStartPage(1);
  // st.setEndPage(4);

  // PDFTextStripperByArea stripper = new PDFTextStripperByArea();
  // stripper.setSortByPosition( true );
  // Rectangle rect1 = new Rectangle( 50, 140, 60, 20 );
  // Rectangle rect2 = new Rectangle( 110, 140, 20, 20 );
  // stripper.addRegion( "row1column1", rect1 );
  // stripper.addRegion( "row1column2", rect2 );
  // List allPages = pd.getDocumentCatalog().getAllPages();
  // PDPage firstPage = (PDPage)allPages.get( 0 );
  // stripper.extractRegions( firstPage );
  // System.out.println(stripper.getTextForRegion( "row1column1" ));
  // System.out.println(stripper.getTextForRegion( "row1column2" ));

  System.out.println(st.getText(pd));
  pd.close();
}
public void PDF2TextPreProssesd(String filename) {
  try {
    stripper = new PDFTextStripper();
    stripper.setParagraphStart("&*&");
    stripper.setLineSeparator("#%#");
    stripper.setPageSeparator("#%#");
    String fulltxt = stripper.getText(pd);
    // split() takes a regular expression, so the '*' in the paragraph marker must be escaped
    String[] paras = fulltxt.split("&\\*&");
    File file = new File(filename);
    try {
      BufferedWriter out = new BufferedWriter(new FileWriter(file));
      int i = 0;
      while (i < paras.length) {
        if (paras[i].length() > 200) {
          String para = paras[i].replace("#%#", " ");
          out.write(para + "\r\n");
        }
        i++;
      }
      out.close();
    } catch (IOException ex) {
      ex.printStackTrace();
    }
    if (pd != null) {
      pd.close();
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/**
 * Saves the converted text (without any processing) to the given file.
 *
 * @param filename the text file where the extracted data is stored
 */
public void PDF2Text(String filename) {
  try {
    File output = new File(filename); // The text file where you are going to store the extracted data
    stripper = new PDFTextStripper();
    wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output)));
    stripper.writeText(pd, wr);
    // flush and close the writer so the extracted text actually reaches the file
    wr.close();
    if (pd != null) {
      pd.close();
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
String pdftoText(String fileName) {
  System.out.println("Parsing text from PDF file " + fileName + "....");
  File f = new File(fileName);
  if (!f.isFile()) {
    System.out.println("File " + fileName + " does not exist.");
    return null;
  }
  try {
    parser = new PDFParser(new FileInputStream(f));
  } catch (Exception e) {
    System.out.println("Unable to open PDF Parser.");
    return null;
  }
  try {
    parser.parse();
    cosDoc = parser.getDocument();
    pdfStripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    parsedText = pdfStripper.getText(pdDoc);
  } catch (Exception e) {
    System.out.println("An exception occurred in parsing the PDF Document.");
    e.printStackTrace();
    try {
      if (cosDoc != null) cosDoc.close();
      if (pdDoc != null) pdDoc.close();
    } catch (Exception e1) {
      e1.printStackTrace();
    }
    return null;
  }
  System.out.println("Done.");
  return parsedText;
}
/** {@inheritDoc} */
@Override
public void resetEngine() {
  super.resetEngine();
  textCache = null;
}
/**
 * Default constructor.
 *
 * @throws IOException If there is an error loading text stripper properties.
 */
public PrintTextLocations() throws IOException {
  super.setSortByPosition(true);
}
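// NOTE (not part of the original snippets): PrintTextLocations builds on PDFTextStripper's
// per-chunk callbacks. A minimal sketch of that idea, assuming PDFBox 2.x, where the stripper
// exposes writeString(String, List<TextPosition>):
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

public class PrintTextLocationsSketch extends PDFTextStripper {

  public PrintTextLocationsSketch() throws IOException {
    setSortByPosition(true);
  }

  @Override
  protected void writeString(String text, List<TextPosition> positions) throws IOException {
    // report the page coordinates of the first glyph of every written chunk
    if (!positions.isEmpty()) {
      TextPosition first = positions.get(0);
      System.out.printf("%.1f,%.1f: %s%n", first.getXDirAdj(), first.getYDirAdj(), text);
    }
  }

  public static void main(String[] args) throws IOException {
    try (PDDocument doc = PDDocument.load(new File(args[0]))) {
      new PrintTextLocationsSketch().getText(doc); // getText() drives the callbacks
    }
  }
}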
@Override
public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) throws IOException {
  final ArrayList<BibtexEntry> res = new ArrayList<BibtexEntry>(1);

  PDDocument document;
  try {
    document = PDDocument.load(in);
  } catch (IOException e) {
    LOGGER.error("Could not load document", e);
    return res;
  }

  try {
    if (document.isEncrypted()) {
      LOGGER.error(Localization.lang("Encrypted documents are not supported"));
      // return res;
    }

    PDFTextStripper stripper = new PDFTextStripper();
    stripper.setStartPage(1);
    stripper.setEndPage(1);
    stripper.setSortByPosition(true);
    stripper.setParagraphEnd(System.lineSeparator());
    StringWriter writer = new StringWriter();
    stripper.writeText(document, writer);
    String textResult = writer.toString();

    String doi = new DOI(textResult).getDOI();
    if (doi.length() < textResult.length()) {
      // A DOI was found in the text
      // We do NO parsing of the text, but use the DOI fetcher
      ImportInspector i = new ImportInspector() {

        @Override
        public void toFront() {}

        @Override
        public void setProgress(int current, int max) {}

        @Override
        public void addEntry(BibtexEntry entry) {
          // add the entry to the result object
          res.add(entry);
        }
      };
      PdfContentImporter.doiToBibTeXFetcher.processQuery(doi, i, status);
      if (!res.isEmpty()) {
        // if something has been found, return the result
        return res;
      } else {
        // otherwise, we just parse the PDF
      }
    }

    String author;
    String editor = null;
    String institution = null;
    String abstractT = null;
    String keywords = null;
    String title;
    String conference = null;
    String DOI = null;
    String series = null;
    String volume = null;
    String number = null;
    String pages = null;
    // year is a class variable as the method extractYear() uses it;
    String publisher = null;
    BibtexEntryType type = BibtexEntryTypes.INPROCEEDINGS;

    final String lineBreak = System.lineSeparator();
    split = textResult.split(lineBreak);

    // idea: split[] contains the different lines
    // blocks are separated by empty lines
    // treat each block
    // or do special treatment at authors (which are not broken)
    // therefore, we do a line-based and not a block-based splitting
    // i points to the current line
    // curString (mostly) contains the current block
    // the different lines are joined into one and thereby separated by " "

    proceedToNextNonEmptyLine();
    if (i >= split.length) {
      // PDF could not be parsed or is empty
      // return empty list
      return res;
    }

    curString = split[i];
    i = i + 1;

    if (curString.length() > 4) {
      // special case: possibly conference as first line on the page
      extractYear();
      if (curString.contains("Conference")) {
        fillCurStringWithNonEmptyLines();
        conference = curString;
        curString = "";
      } else {
        // e.g. Copyright (c) 1998 by the Genetics Society of America
        // future work: get year using RegEx
        String lower = curString.toLowerCase();
        if (lower.contains("copyright")) {
          fillCurStringWithNonEmptyLines();
          publisher = curString;
          curString = "";
        }
      }
    }

    // start: title
    fillCurStringWithNonEmptyLines();
    title = streamlineTitle(curString);
    curString = "";
    // i points to the next non-empty line

    // after title: authors
    author = null;
    while (i < split.length && !split[i].equals("")) {
      // author names are unlikely to be split among different lines
      // treat them line by line
      curString = streamlineNames(split[i]);
      if (author == null) {
        author = curString;
      } else {
        if (curString.equals("")) {
          // if split[i] is "and" then "" is returned by streamlineNames -> do nothing
        } else {
          author = author.concat(" and ").concat(curString);
        }
      }
      i++;
    }
    curString = "";
    i++;

    // then, abstract and keywords follow
    while (i < split.length) {
      curString = split[i];
      if (curString.length() >= "Abstract".length()
          && curString.substring(0, "Abstract".length()).equalsIgnoreCase("Abstract")) {
        if (curString.length() == "Abstract".length()) {
          // only word "abstract" found -- skip line
          curString = "";
        } else {
          curString = curString.substring("Abstract".length() + 1).trim().concat(lineBreak);
        }
        i++;
        // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator
        // whereas we need linebreak as separator
        while (i < split.length && !split[i].equals("")) {
          curString = curString.concat(split[i]).concat(lineBreak);
          i++;
        }
        abstractT = curString;
        i++;
      } else if (curString.length() >= "Keywords".length()
          && curString.substring(0, "Keywords".length()).equalsIgnoreCase("Keywords")) {
        if (curString.length() == "Keywords".length()) {
          // only word "Keywords" found -- skip line
          curString = "";
        } else {
          curString = curString.substring("Keywords".length() + 1).trim();
        }
        i++;
        fillCurStringWithNonEmptyLines();
        keywords = removeNonLettersAtEnd(curString);
      } else {
        String lower = curString.toLowerCase();
        int pos = lower.indexOf("technical");
        if (pos >= 0) {
          type = BibtexEntryTypes.TECHREPORT;
          pos = curString.trim().lastIndexOf(' ');
          if (pos >= 0) {
            // assumption: last character of curString is NOT ' '
            // otherwise pos+1 leads to an out-of-bounds exception
            number = curString.substring(pos + 1);
          }
        }
        i++;
        proceedToNextNonEmptyLine();
      }
    }

    i = split.length - 1;

    // last block: DOI, detailed information
    // sometimes, this information is in the third last block etc...
    // therefore, read until the beginning of the file
    while (i >= 0) {
      readLastBlock();
      // i now points to the block before or is -1
      // curString contains the last block, separated by " "
      extractYear();
      int pos = curString.indexOf("(Eds.)");
      if (pos >= 0 && publisher == null) {
        // looks like a Springer last line
        // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
        publisher = "Springer";
        editor = streamlineNames(curString.substring(0, pos - 1));
        // +2 because of ":" after (Eds.) and the subsequent space
        curString = curString.substring(pos + "(Eds.)".length() + 2);
        String[] springerSplit = curString.split(", ");
        if (springerSplit.length >= 4) {
          conference = springerSplit[0];
          String seriesData = springerSplit[1];
          int lastSpace = seriesData.lastIndexOf(' ');
          series = seriesData.substring(0, lastSpace);
          volume = seriesData.substring(lastSpace + 1);
          pages = springerSplit[2].substring(4);
          if (springerSplit[3].length() >= 4) {
            year = springerSplit[3].substring(0, 4);
          }
        }
      } else {
        if (DOI == null) {
          pos = curString.indexOf("DOI");
          if (pos < 0) {
            pos = curString.indexOf("doi");
          }
          if (pos >= 0) {
            pos += 3;
            char delimiter = curString.charAt(pos);
            if (delimiter == ':' || delimiter == ' ') {
              pos++;
            }
            int nextSpace = curString.indexOf(' ', pos);
            if (nextSpace > 0) {
              DOI = curString.substring(pos, nextSpace);
            } else {
              DOI = curString.substring(pos);
            }
          }
        }
        if (publisher == null && curString.contains("IEEE")) {
          // IEEE has the conference things at the end
          publisher = "IEEE";
          // year is extracted by extractYear
          // otherwise, we could determine it as follows:
          // String yearStr = curString.substring(curString.length()-4);
          // if (isYear(yearStr)) {
          //   year = yearStr;
          // }
          if (conference == null) {
            pos = curString.indexOf('$');
            if (pos > 0) {
              // we found the price
              // before the price, the ISSN is stated
              // skip that
              pos -= 2;
              while (pos >= 0 && curString.charAt(pos) != ' ') {
                pos--;
              }
              if (pos > 0) {
                conference = curString.substring(0, pos);
              }
            }
          }
        }
        // String lower = curString.toLowerCase();
        // if (institution == null) {
        //
        // }
      }
    }

    BibtexEntry entry = new BibtexEntry();
    entry.setType(type);
    if (author != null) {
      entry.setField("author", author);
    }
    if (editor != null) {
      entry.setField("editor", editor);
    }
    if (institution != null) {
      entry.setField("institution", institution);
    }
    if (abstractT != null) {
      entry.setField("abstract", abstractT);
    }
    if (keywords != null) {
      entry.setField("keywords", keywords);
    }
    if (title != null) {
      entry.setField("title", title);
    }
    if (conference != null) {
      entry.setField("booktitle", conference);
    }
    if (DOI != null) {
      entry.setField("doi", DOI);
    }
    if (series != null) {
      entry.setField("series", series);
    }
    if (volume != null) {
      entry.setField("volume", volume);
    }
    if (number != null) {
      entry.setField("number", number);
    }
    if (pages != null) {
      entry.setField("pages", pages);
    }
    if (year != null) {
      entry.setField("year", year);
    }
    if (publisher != null) {
      entry.setField("publisher", publisher);
    }
    entry.setField("review", textResult);

    res.add(entry);
  } catch (NoClassDefFoundError e) {
    if (e.getMessage().equals("org/bouncycastle/jce/provider/BouncyCastleProvider")) {
      status.showMessage(
          Localization.lang(
              "Java Bouncy Castle library not found. Please download and install it. For more information see http://www.bouncycastle.org/."));
    } else {
      LOGGER.error("Could not find class", e);
    }
  } finally {
    document.close();
  }

  return res;
}
@Override
public Document[] parse(
    final AnchorURL location,
    final String mimeType,
    final String charset,
    final VocabularyScraper scraper,
    final int timezoneOffset,
    final InputStream source)
    throws Parser.Failure, InterruptedException {

  // check memory for parser
  if (!MemoryControl.request(200 * 1024 * 1024, false))
    throw new Parser.Failure(
        "Not enough Memory available for pdf parser: " + MemoryControl.available(), location);

  // create a pdf parser
  PDDocument pdfDoc;
  try {
    Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
    // pdfDoc = PDDocument.load(source);
    final PDFParser pdfParser = new PDFParser(source);
    pdfParser.setTempDirectory(new File(System.getProperty("java.io.tmpdir")));
    pdfParser.parse();
    pdfDoc = pdfParser.getPDDocument();
  } catch (final IOException e) {
    throw new Parser.Failure(e.getMessage(), location);
  } finally {
    Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
  }

  if (pdfDoc.isEncrypted()) {
    try {
      pdfDoc.openProtection(new StandardDecryptionMaterial(""));
    } catch (final BadSecurityHandlerException e) {
      try {
        pdfDoc.close();
      } catch (final IOException ee) {
      }
      throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
    } catch (final IOException e) {
      try {
        pdfDoc.close();
      } catch (final IOException ee) {
      }
      throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
    } catch (final CryptographyException e) {
      try {
        pdfDoc.close();
      } catch (final IOException ee) {
      }
      throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
    }
    final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
    if (perm == null || !perm.canExtractContent()) {
      try {
        pdfDoc.close();
      } catch (final IOException ee) {
      }
      throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
    }
  }

  // extracting some metadata
  PDDocumentInformation info = pdfDoc.getDocumentInformation();
  String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
  Date docDate = new Date();
  if (info != null) {
    docTitle = info.getTitle();
    docSubject = info.getSubject();
    docAuthor = info.getAuthor();
    docPublisher = info.getProducer();
    if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator();
    docKeywordStr = info.getKeywords();
    try {
      if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime();
    } catch (IOException e) {
    }
    // unused:
    // info.getTrapped());
  }
  info = null;

  if (docTitle == null || docTitle.isEmpty()) {
    docTitle = MultiProtocolURL.unescape(location.getFileName());
  }
  if (docTitle == null) {
    docTitle = docSubject;
  }
  String[] docKeywords = null;
  if (docKeywordStr != null) {
    docKeywords = docKeywordStr.split(" |,");
  }

  Collection<AnchorURL>[] pdflinks = null;
  Document[] result = null;
  try {
    // get the links
    pdflinks = extractPdfLinks(pdfDoc);

    // get the fulltext (either per document or for each page)
    final PDFTextStripper stripper = new PDFTextStripper(StandardCharsets.UTF_8.name());

    if (individualPages) {
      // this is a hack which stores individual pages of the source pdf into individual index
      // documents
      // the new documents will get a virtual link with a post argument page=X appended to the
      // original url

      // collect text
      int pagecount = pdfDoc.getNumberOfPages();
      String[] pages = new String[pagecount];
      for (int page = 1; page <= pagecount; page++) {
        stripper.setStartPage(page);
        stripper.setEndPage(page);
        pages[page - 1] = stripper.getText(pdfDoc);
        // System.out.println("PAGE " + page + ": " + pages[page - 1]);
      }

      // create individual documents for each page
      assert pages.length == pdflinks.length
          : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length;
      result = new Document[Math.min(pages.length, pdflinks.length)];
      String loc = location.toNormalform(true);
      for (int page = 0; page < result.length; page++) {
        result[page] =
            new Document(
                new AnchorURL(
                    loc
                        + (loc.indexOf('?') > 0 ? '&' : '?')
                        + individualPagePropertyname
                        + '='
                        + (page + 1)), // these are virtual new pages; we cannot combine them with '#'
                                       // as that would be removed when computing the urlhash
                mimeType,
                StandardCharsets.UTF_8.name(),
                this,
                null,
                docKeywords,
                singleList(docTitle),
                docAuthor,
                docPublisher,
                null,
                null,
                0.0f,
                0.0f,
                pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
                null,
                null,
                false,
                docDate);
      }
    } else {
      // collect the whole text at once
      final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
      byte[] contentBytes = new byte[0];
      stripper.setEndPage(3); // get first 3 pages (always)
      writer.append(stripper.getText(pdfDoc));
      contentBytes = writer.getBytes(); // remember text in case of interrupting thread

      if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
        stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
        stripper.setEndPage(Integer.MAX_VALUE); // set to default
        // we start the pdf parsing in a separate thread to ensure that it can be terminated
        final PDDocument pdfDocC = pdfDoc;
        final Thread t =
            new Thread() {
              @Override
              public void run() {
                Thread.currentThread().setName("pdfParser.getText:" + location);
                try {
                  writer.append(stripper.getText(pdfDocC));
                } catch (final Throwable e) {
                }
              }
            };
        t.start();
        t.join(3000);
        // pdfbox likes to forget to terminate ... (quite often)
        if (t.isAlive()) t.interrupt();
      }
      contentBytes = writer.getBytes(); // get final text before closing writer

      Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
      for (Collection<AnchorURL> pdflinksx : pdflinks)
        if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
      result =
          new Document[] {
            new Document(
                location,
                mimeType,
                StandardCharsets.UTF_8.name(),
                this,
                null,
                docKeywords,
                singleList(docTitle),
                docAuthor,
                docPublisher,
                null,
                null,
                0.0f,
                0.0f,
                contentBytes,
                pdflinksCombined,
                null,
                null,
                false,
                docDate)
          };
    }
  } catch (final Throwable e) {
    // close the writer (in finally)
    // throw new Parser.Failure(e.getMessage(), location);
  } finally {
    try {
      pdfDoc.close();
    } catch (final Throwable e) {
    }
  }

  // clear resources in pdfbox. they say that is resolved but it's not. see:
  // https://issues.apache.org/jira/browse/PDFBOX-313
  // https://issues.apache.org/jira/browse/PDFBOX-351
  // https://issues.apache.org/jira/browse/PDFBOX-441
  // pdfbox still generates an enormous number of object allocations and doesn't delete them;
  // the following objects are statically stored and never flushed:
  // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
  // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
  // the great number of these objects can easily be seen in Java Visual VM
  // we try to get this shit out of the memory here by forced clear calls, hope the best the
  // rubbish gets out.
  pdfDoc = null;
  clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

  return result;
}
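// NOTE (not part of the original snippets): the parse() method above time-boxes PDFBox with a raw
// Thread plus join/interrupt. A hypothetical sketch of the same time-boxing idea using an
// ExecutorService, assuming PDFBox 2.x:
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

final class TimedExtractionSketch {

  /** Extracts text but gives up after the given timeout, returning "" on failure. */
  static String getTextWithTimeout(PDDocument doc, long timeoutMillis) throws InterruptedException {
    ExecutorService pool = Executors.newSingleThreadExecutor();
    try {
      Future<String> text = pool.submit(() -> new PDFTextStripper().getText(doc));
      return text.get(timeoutMillis, TimeUnit.MILLISECONDS);
    } catch (ExecutionException | TimeoutException e) {
      return ""; // extraction failed or took too long; fall back to empty text
    } finally {
      pool.shutdownNow(); // interrupt the worker if it is still running
    }
  }
}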