Esempio n. 1
0
  /**
   * Saves the text extracted from the DOCX document (without any processing) to the given file.
   *
   * <p>The text is read from the {@code dx} document before the output file is opened, so a
   * failed extraction neither creates nor truncates the file. Failures are not propagated to the
   * caller: any exception is caught and its stack trace is printed.
   *
   * @param filename path of the text file that receives the extracted text
   */
  public void DOCX2Text(String filename) {
    try {
      File output = new File(filename);

      // The extractor is intentionally not closed here, exactly as before.
      // NOTE(review): closing it may also close the underlying dx document - confirm against the
      // POI version in use before adding a close().
      XWPFWordExtractor ex = new XWPFWordExtractor(dx);
      String fileData = ex.getText();

      // try-with-resources flushes and closes the writer even when write() throws; previously a
      // failed write skipped close() and leaked the file handle.
      // NOTE(review): OutputStreamWriter without an explicit charset uses the platform default
      // encoding; kept unchanged so existing readers of the output file are not affected.
      try (BufferedWriter writer =
          new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output)))) {
        // wr is declared outside this method; it is still assigned so that any code reading it
        // observes the same value as before.
        wr = writer;
        writer.write(fileData);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Esempio n. 2
0
  /**
   * Saves the text extracted from the DOC document (without any processing) to the given file.
   *
   * <p>Every non-null entry returned by {@code getParagraphText()} for the {@code dc} document is
   * written in order, with no separator added between entries. The text is extracted before the
   * output file is opened, so a failed extraction neither creates nor truncates the file.
   * Failures are not propagated to the caller: any exception is caught and its stack trace is
   * printed.
   *
   * @param filename path of the text file that receives the extracted text
   */
  public void DOC2Text(String filename) {
    try {
      File output = new File(filename);

      // Extract first: opening the FileOutputStream truncates the target file, so doing it before
      // a failing extraction used to leave an empty file and an unclosed writer behind.
      WordExtractor extractor = new WordExtractor(dc);
      String[] fileData = extractor.getParagraphText();

      // try-with-resources flushes and closes the writer even when a write() throws.
      // NOTE(review): OutputStreamWriter without an explicit charset uses the platform default
      // encoding; kept unchanged so existing readers of the output file are not affected.
      try (BufferedWriter writer =
          new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output)))) {
        // wr is declared outside this method; it is still assigned so that any code reading it
        // observes the same value as before.
        wr = writer;
        for (String paragraph : fileData) {
          if (paragraph != null) {
            writer.write(paragraph);
          }
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Esempio n. 3
0
  /**
   * Extracts the text of the {@code pd} document, splits it into paragraphs and writes every
   * paragraph longer than 200 characters to the given file, one paragraph per line.
   *
   * <p>The stripper is configured to emit {@code "&*&"} at each paragraph start and {@code "#%#"}
   * as both line and page separator. The extracted text is split on the paragraph marker; in each
   * kept paragraph the {@code "#%#"} markers are replaced by a single space and the paragraph is
   * terminated with {@code "\r\n"}. The 200-character threshold is applied to the paragraph as
   * extracted, i.e. before the separator markers are replaced.
   *
   * <p>The {@code pd} document is closed before the method returns, whether or not extraction
   * succeeded. Failures are not propagated to the caller: exceptions are caught and their stack
   * traces are printed.
   *
   * @param filename path of the text file that receives the filtered paragraphs
   */
  public void PDF2TextPreProssesd(String filename) {
    try {
      stripper = new PDFTextStripper();
      stripper.setParagraphStart("&*&");
      stripper.setLineSeparator("#%#");
      stripper.setPageSeparator("#%#");
      String fulltxt = stripper.getText(pd);

      // String.split() takes a regular expression. Unescaped, "&*&" means "one or more '&'", so
      // every ampersand in the document text also split (and was removed from) a paragraph.
      // Escaping '*' makes the split match only the literal paragraph marker.
      String[] paras = fulltxt.split("&\\*&");

      // try-with-resources closes the writer even when a write() throws.
      try (BufferedWriter out = new BufferedWriter(new FileWriter(new File(filename)))) {
        for (String rawParagraph : paras) {
          // Short fragments are skipped; length is measured before the markers are replaced.
          if (rawParagraph.length() > 200) {
            out.write(rawParagraph.replace("#%#", " ") + "\r\n");
          }
        }
      } catch (IOException ex) {
        ex.printStackTrace();
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      // Close the document on every path; previously a failed extraction skipped the close.
      if (pd != null) {
        try {
          pd.close();
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    }
  }