/** * save the converted text (without any processing) to the given file. * * @param filename * @return */ public void DOCX2Text(String filename) { try { File output = new File(filename); // The text file where you are going to store the extracted data XWPFWordExtractor ex = new XWPFWordExtractor(dx); String fileData = ex.getText(); wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output))); wr.write(fileData); wr.close(); } catch (Exception e) { e.printStackTrace(); } }
/** * save the converted text (without any processing) to the given file. * * @param filename * @return */ public void DOC2Text(String filename) { try { File output = new File(filename); // The text file where you are going to store the extracted data wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output))); WordExtractor extractor = new WordExtractor(dc); String[] fileData = extractor.getParagraphText(); for (int i = 0; i < fileData.length; i++) { if (fileData[i] != null) wr.write(fileData[i]); } wr.close(); } catch (Exception e) { e.printStackTrace(); } }
/**
 * Saves a filtered version of the PDF text to the given file.
 *
 * <p>The text of the document held in the {@code pd} field is extracted with the marker
 * {@code "&*&"} as paragraph start and {@code "#%#"} as line and page separator. It is then
 * split on the paragraph marker. Only paragraphs longer than 200 characters are kept; in
 * those, every {@code "#%#"} is replaced by a space and the paragraph is written followed by
 * {@code "\r\n"}. Output uses the platform default charset.
 *
 * <p>After the write attempt, {@code pd} is closed if it is non-null. Failures are not
 * propagated: any exception is caught and its stack trace printed.
 *
 * @param filename path of the text file that receives the filtered paragraphs
 */
public void PDF2TextPreProssesd(String filename) {
    try {
        stripper = new PDFTextStripper();
        stripper.setParagraphStart("&*&");
        stripper.setLineSeparator("#%#");
        stripper.setPageSeparator("#%#");
        String fulltxt = stripper.getText(pd);
        // String.split takes a regex: unescaped, "&*&" means "one or more '&'", which splits
        // on every ampersand and leaves a stray '*' at the start of each paragraph.
        String[] paras = fulltxt.split("&\\*&");
        File file = new File(filename);
        // try-with-resources closes the writer even when write() throws.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(file))) {
            for (String rawPara : paras) {
                if (rawPara.length() > 200) {
                    String para = rawPara.replace("#%#", " ");
                    out.write(para + "\r\n");
                }
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        if (pd != null) {
            pd.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}