Example #1
0
  /**
   * Constructor sets the input and output file and convert the pdf, docx and doc files to text .
   *
   * @param infile,outfile
   * @return
   */
  public TextConvertor(String infile, String outfile) {
    try {
      File input = new File(infile); // The file from where you would like to extract
      FileInputStream fis = new FileInputStream(input.getAbsolutePath());
      int x = fis.read();
      int y = fis.read();
      fis = new FileInputStream(input.getAbsolutePath());
      if (x == 37 && y == 80) {
        filetype = "pdf";
        pd = PDDocument.load(input);
        PDF2Text(outfile);
      } else if (x == 80 && y == 75) {
        filetype = "docx";

        dx = new XWPFDocument(fis);
        DOCX2Text(outfile);
      } else if (x == 208 && y == 207) {
        filetype = "doc";
        dc = new HWPFDocument(fis);
        DOC2Text(outfile);
      }

    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Example #2
0
  /**
   * save the converted text (without any processing) to the given file.
   *
   * @param filename
   * @return
   */
  public void PDF2Text(String filename) {
    try {
      File output =
          new File(filename); // The text file where you are going to store the extracted data

      stripper = new PDFTextStripper();

      wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output)));

      stripper.writeText(pd, wr);

      if (pd != null) {
        pd.close();
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Example #3
0
  public void PDF2TextPreProssesd(String filename) {
    try {

      stripper = new PDFTextStripper();
      stripper.setParagraphStart("&*&");
      stripper.setLineSeparator("#%#");
      stripper.setPageSeparator("#%#");
      String fulltxt = stripper.getText(pd);
      String paras[] = fulltxt.split("&*&");

      File file = new File(filename);
      try {
        BufferedWriter out = new BufferedWriter(new FileWriter(file));

        int i = 0;
        while (i < paras.length) {
          if (paras[i].length() > 200) {
            String para = paras[i].replace("#%#", " ");

            out.write(para + "\r\n");
          }
          i++;
        }
        out.close();

      } catch (IOException ex) {
        ex.printStackTrace();
      }

      if (pd != null) {
        pd.close();
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }