/**
 * Extracts the textual content of a file, choosing the extraction strategy from the file name.
 *
 * <ul>
 *   <li>extension {@code doc} (case-insensitive): text is read through {@code WordExtractor};</li>
 *   <li>extension {@code pdf} (case-insensitive): text is read through {@code PDFTextStripper};</li>
 *   <li>file names accepted by {@code StringHelper.isHTML}: the result of
 *       {@code loadStringFromFile(file)} is returned as is;</li>
 *   <li>anything else: the empty string.</li>
 * </ul>
 *
 * Extraction is best-effort: any failure while reading is logged and whatever content was
 * gathered so far (normally the empty string) is returned instead of propagating the error.
 *
 * @param file the file to read; when {@code null} a warning is logged and {@code ""} is returned
 * @return the extracted text, or {@code ""} when nothing could be extracted
 * @throws FileNotFoundException declared for caller compatibility; read failures are caught and logged here
 * @throws IOException declared for caller compatibility; read failures are caught and logged here
 */
public static String getFileContent(File file) throws FileNotFoundException, IOException {
    // The null check must come before any dereference of file; otherwise it can never trigger.
    if (file == null) {
        logger.warning("file not found : " + file);
        return "";
    }

    String ext = FilenameUtils.getExtension(file.getName());
    String outContent = "";
    try {
        if ("doc".equalsIgnoreCase(ext)) {
            // try-with-resources releases the file handle even when extraction fails.
            try (FileInputStream in = new FileInputStream(file)) {
                WordExtractor we = new WordExtractor(in);
                outContent = we.getText();
            }
        } else if ("pdf".equalsIgnoreCase(ext)) {
            PDDocument doc = PDDocument.load(file);
            try {
                PDFTextStripper text = new PDFTextStripper();
                outContent = text.getText(doc);
            } finally {
                // Close the document on the failure path too.
                doc.close();
            }
        } else if (StringHelper.isHTML(file.getName())) {
            return loadStringFromFile(file);
        }
    } catch (Throwable t) {
        // Deliberately broad: this method is best-effort and must not fail the caller.
        logger.warning("error when read : " + file + "+ [" + t.getMessage() + "]");
        t.printStackTrace();
    }
    return outContent;
}
/**
 * Reads a Word document from a stream and returns its extracted text as characters.
 *
 * @param inputStream an input stream positioned at the start of the Word document;
 *                    this method does not close it
 * @return the text reported by {@code WordExtractor.getText()}, as a character array
 * @throws IOException if the document cannot be read from the stream
 */
private static char[] loadMSWord(InputStream inputStream) throws IOException {
    HWPFDocument wordDocument = new HWPFDocument(new POIFSFileSystem(inputStream));
    String plainText = new WordExtractor(wordDocument).getText();
    return plainText.toCharArray();
}
/** * save the converted text (without any processing) to the given file. * * @param filename * @return */ public void DOC2Text(String filename) { try { File output = new File(filename); // The text file where you are going to store the extracted data wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output))); WordExtractor extractor = new WordExtractor(dc); String[] fileData = extractor.getParagraphText(); for (int i = 0; i < fileData.length; i++) { if (fileData[i] != null) wr.write(fileData[i]); } wr.close(); } catch (Exception e) { e.printStackTrace(); } }
/**
 * Downloads a Word document from a URL and returns its extracted text.
 *
 * <p>Progress messages are printed to standard output. Failures are not propagated: on any
 * exception the returned string is {@code "Error al leer el archivo"} followed by the
 * exception message.
 *
 * @param sUrl the URL of the Word document to read
 * @return the extracted text, or the error string described above when reading fails
 */
public static String GetText(String sUrl) {
    String sRet = "";
    try {
        System.out.print(" Connecting to: " + sUrl + "... \n");
        // try-with-resources closes the network stream on both success and failure.
        try (InputStream inputStream = new URL(sUrl).openStream()) {
            System.out.print(" Stream readed from: " + sUrl + "\n");
            HWPFDocument docx = new HWPFDocument(inputStream);
            WordExtractor we = new WordExtractor(docx);
            try {
                sRet = we.getText();
            } finally {
                // Close the extractor even when getText() throws.
                we.close();
            }
        }
    } catch (Exception e) {
        sRet = "Error al leer el archivo" + e.getMessage();
    }
    return sRet;
}