Example #1
 public static String extractText(InputStream src) throws IOException {
   StringBuilder text = new StringBuilder();
   COSDocument cosDoc = null;
   PDDocument pdDoc = null;
   try {
     PDFParser parser = new PDFParser(src);
     parser.parse();
     cosDoc = parser.getDocument();
     PDFTextStripper stripper = new PDFTextStripper();
     pdDoc = new PDDocument(cosDoc);
     int nbPages = pdDoc.getDocumentCatalog().getPages().getCount();
     for (int i = 0; i < nbPages; i++) {
       stripper.setStartPage(i + 1);
       stripper.setEndPage(i + 1);
       text.append(stripper.getText(pdDoc));
     }
   } finally {
     try {
       if (cosDoc != null) {
         cosDoc.close();
       }
     } catch (IOException e) {
        // Ignore; a failure to close is non-fatal here
     }
     try {
       if (pdDoc != null) {
         pdDoc.close();
       }
     } catch (IOException e) {
        // Ignore; a failure to close is non-fatal here
     }
   }
   return text.toString();
 }
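A minimal usage sketch for the helper above, assuming a hypothetical sample.pdf and the same java.io/PDFBox imports as the example. extractText closes the parsed documents itself, but the caller owns the stream:

  public static void main(String[] args) throws IOException {
    // try-with-resources closes the stream that the helper does not own
    try (InputStream in = new FileInputStream("sample.pdf")) {
      System.out.println(extractText(in));
    }
  }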
Example #2
 public static String extract(File pdfFile) throws IOException {
   checkNotNull(pdfFile, "pdfFile");
   // Close the stream and the document even when parsing fails;
   // closing the PDDocument also closes the underlying COSDocument.
   try (InputStream in = new FileInputStream(pdfFile)) {
     PDFParser parser = new PDFParser(in);
     parser.parse();
     COSDocument cosDoc = parser.getDocument();
     PDFTextStripper pdfStripper = new PDFTextStripper();
     try (PDDocument pdDoc = new PDDocument(cosDoc)) {
       pdfStripper.setStartPage(1);
       pdfStripper.setEndPage(pdDoc.getNumberOfPages());
       pdfStripper.setSortByPosition(true);
       return pdfStripper.getText(pdDoc);
     }
   }
 }
Example #3
  /**
   * Indexes a single PDF file.
   *
   * @param f the PDF file
   * @param writer the IndexWriter
   * @throws IOException
   */
  public static void indexFile(File f, IndexWriter writer) throws IOException {

    // Load the file with PDFBox
    PDDocument pddDocument = PDDocument.load(f.getAbsolutePath());
    PDFTextStripper textStripper = new PDFTextStripper();
    int numPages = pddDocument.getNumberOfPages();
    String pageContent;

    // Declare a custom Field type
    FieldType fieldText = new FieldType();
    fieldText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    fieldText.setStored(false);
    fieldText.setStoreTermVectorOffsets(true);
    fieldText.setStoreTermVectorPositions(true);
    fieldText.setStoreTermVectors(true);

    // Walk through and index each page of the file, storing the page number and
    // the file title, and indexing the content
    for (int i = 1; i <= numPages; i++) {
      // page numbers in PDFTextStripper are 1-based
      textStripper.setStartPage(i);
      textStripper.setEndPage(i);
      // extract a single page
      pageContent = textStripper.getText(pddDocument);
      if (pageContent != null && !pageContent.isEmpty()) {
        pageContent = pageContent.toLowerCase();
      }

      if (pageContent != null) {
        // Build the Lucene document to index for this page
        // (the field names below are illustrative)
        Document doc = new Document();

        // Page number
        doc.add(new StoredField("page", i));
        // Page content
        doc.add(new Field("contents", pageContent, fieldText));
        // File title
        doc.add(new StringField("title", f.getName(), Field.Store.YES));

        // Add the document
        writer.addDocument(doc);
      }
    }

    // Close the PDF file
    pddDocument.close();
  }
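A hedged usage sketch for indexFile, assuming Lucene 5.x (to match the IndexOptions API above) and a hypothetical index directory name:

  public static void main(String[] args) throws IOException {
    // Open a filesystem index and write one PDF into it
    try (Directory dir = FSDirectory.open(Paths.get("lucene-index"));
         IndexWriter writer =
             new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
      indexFile(new File("sample.pdf"), writer);
      writer.commit();
    }
  }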
Example #4
  public static void main(String[] args) throws Exception {

    File file = new File("C:/Users/jatin.goyal/Desktop/demoexcel.pdf");
    PDDocument pd = PDDocument.load(file);
    System.out.println(pd.getNumberOfPages());
    PDFTextStripper st = new PDFTextStripper();
    st.setStartPage(1);
    // st.setEndPage(4);

    // PDFTextStripperByArea stripper = new PDFTextStripperByArea();
    // stripper.setSortByPosition(true);
    // Rectangle rect1 = new Rectangle(50, 140, 60, 20);
    // Rectangle rect2 = new Rectangle(110, 140, 20, 20);
    // stripper.addRegion("row1column1", rect1);
    // stripper.addRegion("row1column2", rect2);
    // List allPages = pd.getDocumentCatalog().getAllPages();
    // PDPage firstPage = (PDPage) allPages.get(0);
    // stripper.extractRegions(firstPage);
    // System.out.println(stripper.getTextForRegion("row1column1"));
    // System.out.println(stripper.getTextForRegion("row1column2"));

    System.out.println(st.getText(pd));
    pd.close();
  }
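The commented-out block above hints at region-based extraction. Here is a runnable sketch of it under the same PDFBox 1.x API, with the rectangles and region names taken from those comments (java.awt.Rectangle assumed):

  public static void printRegions(PDDocument pd) throws IOException {
    PDFTextStripperByArea stripper = new PDFTextStripperByArea();
    stripper.setSortByPosition(true);
    // Page-space regions: x, y, width, height
    stripper.addRegion("row1column1", new Rectangle(50, 140, 60, 20));
    stripper.addRegion("row1column2", new Rectangle(110, 140, 20, 20));
    PDPage firstPage = (PDPage) pd.getDocumentCatalog().getAllPages().get(0);
    stripper.extractRegions(firstPage);
    System.out.println(stripper.getTextForRegion("row1column1"));
    System.out.println(stripper.getTextForRegion("row1column2"));
  }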
Example #5
  @Override
  public List<BibtexEntry> importEntries(InputStream in, OutputPrinter status) throws IOException {
    final ArrayList<BibtexEntry> res = new ArrayList<BibtexEntry>(1);

    PDDocument document;
    try {
      document = PDDocument.load(in);
    } catch (IOException e) {
      LOGGER.error("Could not load document", e);
      return res;
    }

    try {
      if (document.isEncrypted()) {
        LOGGER.error(Localization.lang("Encrypted documents are not supported"));
        // return res;
      }

      PDFTextStripper stripper = new PDFTextStripper();
      stripper.setStartPage(1);
      stripper.setEndPage(1);
      stripper.setSortByPosition(true);
      stripper.setParagraphEnd(System.lineSeparator());
      StringWriter writer = new StringWriter();
      stripper.writeText(document, writer);
      String textResult = writer.toString();

      String doi = new DOI(textResult).getDOI();
      if (doi.length() < textResult.length()) {
        // A DOI was found in the text.
        // We do NO parsing of the text, but use the DOI fetcher

        ImportInspector i =
            new ImportInspector() {

              @Override
              public void toFront() {}

              @Override
              public void setProgress(int current, int max) {}

              @Override
              public void addEntry(BibtexEntry entry) {
                // add the entry to the result object
                res.add(entry);
              }
            };
        PdfContentImporter.doiToBibTeXFetcher.processQuery(doi, i, status);
        if (!res.isEmpty()) {
          // if something has been found, return the result
          return res;
        } else {
          // otherwise, we just parse the PDF
        }
      }

      String author;
      String editor = null;
      String institution = null;
      String abstractT = null;
      String keywords = null;
      String title;
      String conference = null;
      String DOI = null;
      String series = null;
      String volume = null;
      String number = null;
      String pages = null;
      // year is a class variable as the method extractYear() uses it;
      String publisher = null;
      BibtexEntryType type = BibtexEntryTypes.INPROCEEDINGS;

      final String lineBreak = System.lineSeparator();

      split = textResult.split(lineBreak);

      // idea: split[] contains the different lines
      // blocks are separated by empty lines
      // treat each block
      //   or do special treatment at authors (which are not broken)
      //   therefore, we do a line-based and not a block-based splitting
      // i points to the current line
      // curString (mostly) contains the current block
      //   the different lines are joined into one and thereby separated by " "

      proceedToNextNonEmptyLine();
      if (i >= split.length) {
        // PDF could not be parsed or is empty
        // return empty list
        return res;
      }
      curString = split[i];
      i = i + 1;

      if (curString.length() > 4) {
        // special case: possibly conference as first line on the page
        extractYear();
        if (curString.contains("Conference")) {
          fillCurStringWithNonEmptyLines();
          conference = curString;
          curString = "";
        } else {
          // e.g. Copyright (c) 1998 by the Genetics Society of America
          // future work: get year using RegEx
          String lower = curString.toLowerCase();
          if (lower.contains("copyright")) {
            fillCurStringWithNonEmptyLines();
            publisher = curString;
            curString = "";
          }
        }
      }

      // start: title
      fillCurStringWithNonEmptyLines();
      title = streamlineTitle(curString);
      curString = "";
      // i points to the next non-empty line

      // after title: authors
      author = null;
      while (i < split.length && !split[i].equals("")) {
        // author names are unlikely to be split among different lines
        // treat them line by line
        curString = streamlineNames(split[i]);
        if (author == null) {
          author = curString;
        } else {
          if (curString.equals("")) {
            // if split[i] is "and" then "" is returned by streamlineNames -> do nothing
          } else {
            author = author.concat(" and ").concat(curString);
          }
        }
        i++;
      }
      curString = "";
      i++;

      // then, abstract and keywords follow
      while (i < split.length) {
        curString = split[i];
        if (curString.length() >= "Abstract".length()
            && curString.substring(0, "Abstract".length()).equalsIgnoreCase("Abstract")) {
          if (curString.length() == "Abstract".length()) {
            // only word "abstract" found -- skip line
            curString = "";
          } else {
            curString = curString.substring("Abstract".length() + 1).trim().concat(lineBreak);
          }
          i++;
          // fillCurStringWithNonEmptyLines() cannot be used as that uses " " as line separator
          // whereas we need linebreak as separator
          while (i < split.length && !split[i].equals("")) {
            curString = curString.concat(split[i]).concat(lineBreak);
            i++;
          }
          abstractT = curString;
          i++;
        } else if (curString.length() >= "Keywords".length()
            && curString.substring(0, "Keywords".length()).equalsIgnoreCase("Keywords")) {
          if (curString.length() == "Keywords".length()) {
            // only word "Keywords" found -- skip line
            curString = "";
          } else {
            curString = curString.substring("Keywords".length() + 1).trim();
          }
          i++;
          fillCurStringWithNonEmptyLines();
          keywords = removeNonLettersAtEnd(curString);
        } else {
          String lower = curString.toLowerCase();

          int pos = lower.indexOf("technical");
          if (pos >= 0) {
            type = BibtexEntryTypes.TECHREPORT;
            pos = curString.trim().lastIndexOf(' ');
            if (pos >= 0) {
              // assumption: last character of curString is NOT ' '
              //   otherwise pos+1 leads to an out-of-bounds exception
              number = curString.substring(pos + 1);
            }
          }

          i++;
          proceedToNextNonEmptyLine();
        }
      }

      i = split.length - 1;

      // last block: DOI, detailed information
      // sometimes, this information is in the third last block etc...
      // therefore, read until the beginning of the file

      while (i >= 0) {
        readLastBlock();
        // i now points to the block before or is -1
        // curString contains the last block, separated by " "

        extractYear();

        int pos = curString.indexOf("(Eds.)");
        if (pos >= 0 && publisher == null) {
          // looks like a Springer last line
          // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
          publisher = "Springer";
          editor = streamlineNames(curString.substring(0, pos - 1));
          curString =
              curString.substring(
                  pos
                      + "(Eds.)".length()
                      + 2); // +2 because of ":" after (Eds.) and the subsequent space
          String[] springerSplit = curString.split(", ");
          if (springerSplit.length >= 4) {
            conference = springerSplit[0];

            String seriesData = springerSplit[1];
            int lastSpace = seriesData.lastIndexOf(' ');
            series = seriesData.substring(0, lastSpace);
            volume = seriesData.substring(lastSpace + 1);

            pages = springerSplit[2].substring(4);

            if (springerSplit[3].length() >= 4) {
              year = springerSplit[3].substring(0, 4);
            }
          }
        } else {
          if (DOI == null) {
            pos = curString.indexOf("DOI");
            if (pos < 0) {
              pos = curString.indexOf("doi");
            }
            if (pos >= 0) {
              pos += 3;
              char delimiter = curString.charAt(pos);
              if (delimiter == ':' || delimiter == ' ') {
                pos++;
              }
              int nextSpace = curString.indexOf(' ', pos);
              if (nextSpace > 0) {
                DOI = curString.substring(pos, nextSpace);
              } else {
                DOI = curString.substring(pos);
              }
            }
          }

          if (publisher == null && curString.contains("IEEE")) {
            // IEEE has the conference things at the end
            publisher = "IEEE";

            // year is extracted by extractYear
            // otherwise, we could determine it as follows:
            // String yearStr = curString.substring(curString.length()-4);
            // if (isYear(yearStr)) {
            //	year = yearStr;
            // }

            if (conference == null) {
              pos = curString.indexOf('$');
              if (pos > 0) {
                // we found the price
                // before the price, the ISSN is stated
                // skip that
                pos -= 2;
                while (pos >= 0 && curString.charAt(pos) != ' ') {
                  pos--;
                }
                if (pos > 0) {
                  conference = curString.substring(0, pos);
                }
              }
            }
          }

          //					String lower = curString.toLowerCase();
          //					if (institution == null) {
          //
          //					}

        }
      }

      BibtexEntry entry = new BibtexEntry();
      entry.setType(type);

      if (author != null) {
        entry.setField("author", author);
      }
      if (editor != null) {
        entry.setField("editor", editor);
      }
      if (institution != null) {
        entry.setField("institution", institution);
      }
      if (abstractT != null) {
        entry.setField("abstract", abstractT);
      }
      if (keywords != null) {
        entry.setField("keywords", keywords);
      }
      if (title != null) {
        entry.setField("title", title);
      }
      if (conference != null) {
        entry.setField("booktitle", conference);
      }
      if (DOI != null) {
        entry.setField("doi", DOI);
      }
      if (series != null) {
        entry.setField("series", series);
      }
      if (volume != null) {
        entry.setField("volume", volume);
      }
      if (number != null) {
        entry.setField("number", number);
      }
      if (pages != null) {
        entry.setField("pages", pages);
      }
      if (year != null) {
        entry.setField("year", year);
      }
      if (publisher != null) {
        entry.setField("publisher", publisher);
      }

      entry.setField("review", textResult);

      res.add(entry);
    } catch (NoClassDefFoundError e) {
      if (e.getMessage().equals("org/bouncycastle/jce/provider/BouncyCastleProvider")) {
        status.showMessage(
            Localization.lang(
                "Java Bouncy Castle library not found. Please download and install it. For more information see http://www.bouncycastle.org/."));
      } else {
        LOGGER.error("Could not find class", e);
      }
    } finally {
      document.close();
    }

    return res;
  }
Example #6
  @Override
  public Document[] parse(
      final AnchorURL location,
      final String mimeType,
      final String charset,
      final VocabularyScraper scraper,
      final int timezoneOffset,
      final InputStream source)
      throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
      throw new Parser.Failure(
          "Not enough Memory available for pdf parser: " + MemoryControl.available(), location);

    // create a pdf parser
    PDDocument pdfDoc;
    try {
      Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
      // pdfDoc = PDDocument.load(source);
      final PDFParser pdfParser = new PDFParser(source);
      pdfParser.setTempDirectory(new File(System.getProperty("java.io.tmpdir")));
      pdfParser.parse();
      pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
      throw new Parser.Failure(e.getMessage(), location);
    } finally {
      Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    if (pdfDoc.isEncrypted()) {
      try {
        pdfDoc.openProtection(new StandardDecryptionMaterial(""));
      } catch (final BadSecurityHandlerException e) {
        try {
          pdfDoc.close();
        } catch (final IOException ee) {
        }
        throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
      } catch (final IOException e) {
        try {
          pdfDoc.close();
        } catch (final IOException ee) {
        }
        throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
      } catch (final CryptographyException e) {
        try {
          pdfDoc.close();
        } catch (final IOException ee) {
        }
        throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
      }
      final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
      if (perm == null || !perm.canExtractContent()) {
        try {
          pdfDoc.close();
        } catch (final IOException ee) {
        }
        throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
      }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null,
        docSubject = null,
        docAuthor = null,
        docPublisher = null,
        docKeywordStr = null;
    Date docDate = new Date();
    if (info != null) {
      docTitle = info.getTitle();
      docSubject = info.getSubject();
      docAuthor = info.getAuthor();
      docPublisher = info.getProducer();
      if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator();
      docKeywordStr = info.getKeywords();
      try {
        if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime();
      } catch (IOException e) {
      }
      // unused:
      // info.getTrapped());
    }
    info = null;

    if (docTitle == null || docTitle.isEmpty()) {
      docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
      docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
      docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
      // get the links
      pdflinks = extractPdfLinks(pdfDoc);

      // get the fulltext (either per document or for each page)
      final PDFTextStripper stripper = new PDFTextStripper(StandardCharsets.UTF_8.name());

      if (individualPages) {
        // this is a hack which stores individual pages of the source pdf into individual index
        // documents
        // the new documents will get a virtual link with a post argument page=X appended to the
        // original url

        // collect text
        int pagecount = pdfDoc.getNumberOfPages();
        String[] pages = new String[pagecount];
        for (int page = 1; page <= pagecount; page++) {
          stripper.setStartPage(page);
          stripper.setEndPage(page);
          pages[page - 1] = stripper.getText(pdfDoc);
          // System.out.println("PAGE " + page + ": " + pages[page - 1]);
        }

        // create individual documents for each page
        assert pages.length == pdflinks.length
            : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length;
        result = new Document[Math.min(pages.length, pdflinks.length)];
        String loc = location.toNormalform(true);
        for (int page = 0; page < result.length; page++) {
          result[page] =
              new Document(
                  new AnchorURL(
                      loc
                          + (loc.indexOf('?') > 0 ? '&' : '?')
                          + individualPagePropertyname
                          + '='
                          + (page
                              + 1)), // these are virtual new pages; we cannot combine them with '#'
                                     // as that would be removed when computing the urlhash
                  mimeType,
                  StandardCharsets.UTF_8.name(),
                  this,
                  null,
                  docKeywords,
                  singleList(docTitle),
                  docAuthor,
                  docPublisher,
                  null,
                  null,
                  0.0f,
                  0.0f,
                  pages == null || page >= pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                  pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
                  null,
                  null,
                  false,
                  docDate);
        }
      } else {
        // collect the whole text at once
        final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
        byte[] contentBytes = new byte[0];
        stripper.setEndPage(3); // get first 3 pages (always)
        writer.append(stripper.getText(pdfDoc));
        contentBytes = writer.getBytes(); // remember text in case of interrupting thread

        if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
          stripper.setStartPage(4); // continue with page 4 (if the thread is terminated early, the remaining pages simply yield no text)
          stripper.setEndPage(Integer.MAX_VALUE); // set to default
          // we start the pdf parsing in a separate thread to ensure that it can be terminated
          final PDDocument pdfDocC = pdfDoc;
          final Thread t =
              new Thread() {
                @Override
                public void run() {
                  Thread.currentThread().setName("pdfParser.getText:" + location);
                  try {
                    writer.append(stripper.getText(pdfDocC));
                  } catch (final Throwable e) {
                  }
                }
              };
          t.start();
          t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
          if (t.isAlive()) t.interrupt();
        }
        contentBytes = writer.getBytes(); // get final text before closing writer

        Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
        for (Collection<AnchorURL> pdflinksx : pdflinks)
          if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
        result =
            new Document[] {
              new Document(
                  location,
                  mimeType,
                  StandardCharsets.UTF_8.name(),
                  this,
                  null,
                  docKeywords,
                  singleList(docTitle),
                  docAuthor,
                  docPublisher,
                  null,
                  null,
                  0.0f,
                  0.0f,
                  contentBytes,
                  pdflinksCombined,
                  null,
                  null,
                  false,
                  docDate)
            };
      }
    } catch (final Throwable e) {
      // close the writer (in finally)
      // throw new Parser.Failure(e.getMessage(), location);
    } finally {
      try {
        pdfDoc.close();
      } catch (final Throwable e) {
      }
    }

    // clear resources in pdfbox. they say this is resolved, but it is not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still generates an enormous number of object allocations and never releases them;
    // the following objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the large number of these objects is easy to see in Java VisualVM.
    // we try to force this garbage out of memory here with explicit clear calls
    // and hope that the rubbish gets collected.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
  }
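The font-cache cleanup helper called above is not shown in this snippet. A minimal sketch of what such a method might do, assuming PDFBox 1.x, where statically cached font resources can be dropped via PDFont.clearResources() (an assumption, not necessarily the project's actual implementation):

  private static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
    // Assumed PDFBox 1.x hook that releases statically cached font resources
    PDFont.clearResources();
  }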