示例#1
0
 /**
  * Extracts the textual content of a file, choosing the extractor from the file name.
  *
  * <ul>
  *   <li>{@code .doc} (case-insensitive): text is read with {@code WordExtractor}.
  *   <li>{@code .pdf} (case-insensitive): text is read with {@code PDFTextStripper}.
  *   <li>names accepted by {@code StringHelper.isHTML}: the result of
  *       {@code loadStringFromFile(file)} is returned as is.
  *   <li>anything else: an empty string is returned.
  * </ul>
  *
  * <p>This is a best-effort reader: any failure while reading is logged as a warning and an
  * empty string is returned instead of propagating the error.
  *
  * @param file the file to read; when {@code null} a warning is logged and an empty string is
  *     returned
  * @return the extracted text, or an empty string when the file is {@code null}, unsupported,
  *     or could not be read
  * @throws FileNotFoundException declared for compatibility with existing callers; read
  *     failures are caught and logged inside the method
  * @throws IOException declared for compatibility with existing callers; read failures are
  *     caught and logged inside the method
  */
 public static String getFileContent(File file) throws FileNotFoundException, IOException {
   // Guard first: the null check used to sit after file.getName() had already been called,
   // so it could never trigger and a null file ended in a NullPointerException.
   if (file == null) {
     logger.warning("file not found : " + file);
     return "";
   }
   String ext = FilenameUtils.getExtension(file.getName());
   String outContent = "";
   try {
     // equalsIgnoreCase avoids the locale-sensitive toLowerCase() and tolerates a null ext.
     if ("doc".equalsIgnoreCase(ext)) {
       // Close the stream even when the extractor rejects the document.
       try (FileInputStream in = new FileInputStream(file)) {
         WordExtractor we = new WordExtractor(in);
         outContent = we.getText();
       }
     } else if ("pdf".equalsIgnoreCase(ext)) {
       // Release the PDF document even when text stripping fails.
       try (PDDocument doc = PDDocument.load(file)) {
         PDFTextStripper text = new PDFTextStripper();
         outContent = text.getText(doc);
       }
     } else if (StringHelper.isHTML(file.getName())) {
       return loadStringFromFile(file);
     }
   } catch (Throwable t) {
     // Deliberately broad: a broken document must not abort the caller.
     logger.warning("error when read : " + file + "+ [" + t.getMessage() + "]");
     t.printStackTrace();
   }
   return outContent;
 }
示例#2
0
  /**
   * Reads a Word document from the given stream and returns its extracted text.
   *
   * @param inputStream an input stream supplying the Word document to be read
   * @return the text produced by the {@code WordExtractor}, as a character array
   * @throws IOException if the document cannot be read from the stream
   */
  private static char[] loadMSWord(InputStream inputStream) throws IOException {
    HWPFDocument wordDocument = new HWPFDocument(new POIFSFileSystem(inputStream));
    String extractedText = new WordExtractor(wordDocument).getText();
    return extractedText.toCharArray();
  }
示例#3
0
  /**
   * Saves the converted text (without any processing) to the given file.
   *
   * <p>The paragraphs reported by a {@code WordExtractor} built from the {@code dc} field are
   * written back to back, skipping {@code null} entries. The writer is stored in the {@code wr}
   * field and is always closed before the method returns, including when extraction or writing
   * fails. Any exception is printed to standard error and not rethrown.
   *
   * @param filename path of the text file that receives the extracted data
   */
  public void DOC2Text(String filename) {
    try {
      File output =
          new File(filename); // The text file where you are going to store the extracted data
      // NOTE(review): the writer uses the platform default charset; kept as is so the encoding
      // of existing output files does not change.
      wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output)));
      try {
        WordExtractor extractor = new WordExtractor(dc);
        for (String paragraph : extractor.getParagraphText()) {
          if (paragraph != null) {
            wr.write(paragraph);
          }
        }
      } finally {
        // Close (and flush) the writer even when extraction or writing throws, so the file
        // handle is not leaked.
        wr.close();
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
示例#4
0
  /**
   * Downloads a Word document from a URL and returns its text.
   *
   * <p>Progress messages are printed to standard output before and after the connection is
   * opened. The network stream and the extractor are closed on every path, including failures.
   *
   * @param sUrl the URL of the Word document to read
   * @return the extracted text, or an error message starting with
   *     {@code "Error al leer el archivo"} followed by the exception message when the document
   *     cannot be fetched or parsed
   */
  public static String GetText(String sUrl) {

    String sRet = "";
    try {
      System.out.print(" Connecting to: " + sUrl + "... \n");
      // try-with-resources: the stream used to stay open on both success and failure.
      try (InputStream inputStream = new URL(sUrl).openStream()) {
        System.out.print(" Stream readed from: " + sUrl + "\n");

        HWPFDocument docx = new HWPFDocument(inputStream);
        WordExtractor we = new WordExtractor(docx);
        try {
          sRet = we.getText();
        } finally {
          // Close the extractor even when getText() fails.
          we.close();
        }
      }

    } catch (Exception e) {
      sRet = "Error al leer el archivo" + e.getMessage();
    }

    return sRet;
  }