/**
   * Detects the type of {@code infile} from its first two bytes and converts it to text, writing
   * the result to {@code outfile}.
   *
   * <p>Recognised leading bytes: {@code %P} (handled as PDF), {@code PK} (handled as DOCX) and
   * {@code 0xD0 0xCF} (handled as DOC). For any other file nothing is converted and a message is
   * printed to standard error. Exceptions raised while reading or converting are caught here and
   * their stack trace is printed; nothing is rethrown to the caller.
   *
   * @param infile path of the document to convert
   * @param outfile path of the text file to write
   */
  public TextConvertor(String infile, String outfile) {
    try {
      File input = new File(infile); // The file from where you would like to extract

      // Peek at the two signature bytes, then release the handle immediately.
      int x;
      int y;
      FileInputStream probe = new FileInputStream(input.getAbsolutePath());
      try {
        x = probe.read();
        y = probe.read();
      } finally {
        probe.close();
      }

      if (x == 0x25 && y == 0x50) { // '%' 'P'
        filetype = "pdf";
        pd = PDDocument.load(input);
        PDF2Text(outfile);
      } else if (x == 0x50 && y == 0x4B) { // 'P' 'K'
        filetype = "docx";
        // A fresh stream positioned at byte 0; kept open until the conversion has finished.
        FileInputStream fis = new FileInputStream(input.getAbsolutePath());
        try {
          dx = new XWPFDocument(fis);
          DOCX2Text(outfile);
        } finally {
          fis.close();
        }
      } else if (x == 0xD0 && y == 0xCF) { // 208, 207
        filetype = "doc";
        FileInputStream fis = new FileInputStream(input.getAbsolutePath());
        try {
          dc = new HWPFDocument(fis);
          DOC2Text(outfile);
        } finally {
          fis.close();
        }
      } else {
        // Previously this case fell through silently, leaving no hint why no output appeared.
        System.err.println("Unsupported or unrecognised file type: " + infile);
      }

    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  // ---- Example #2 (0) ----
  /**
   * Parses a PDF from {@code in}, reads the document catalog's metadata as XML and prints the text
   * of the {@code foaf:cuit} element found inside the first {@code foaf:SEmployee} element,
   * followed by a {@code ---} line and the raw metadata string.
   *
   * <p>If the document has no metadata nothing is printed. An encrypted document only triggers a
   * warning on standard error; processing continues.
   *
   * @param in stream positioned at the start of the PDF; it is not closed by this method
   * @throws IllegalStateException if the metadata lacks the {@code foaf:SEmployee} or
   *     {@code foaf:cuit} element
   * @throws Exception if the PDF or its metadata XML cannot be parsed
   */
  private static void extract(InputStream in) throws Exception {
    PDDocument document = null;
    try {
      PDFParser parser = new PDFParser(in);
      parser.parse();
      document = parser.getPDDocument();
      if (document.isEncrypted()) {
        System.err.println("Document is Encrypted!");
      }
      PDDocumentCatalog cat = document.getDocumentCatalog();
      PDMetadata metadata = cat.getMetadata();
      if (metadata != null) {
        // Load the metadata. The XML comes from inside the PDF, so treat it as untrusted:
        // refuse DOCTYPE declarations and entity expansion to rule out XXE.
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        dbFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        dbFactory.setExpandEntityReferences(false);
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();

        Document doc;
        InputStream xml = metadata.createInputStream();
        try {
          doc = dBuilder.parse(xml);
        } finally {
          xml.close();
        }

        // Look up the SEmployee tag and, inside it, the CUIT element.
        NodeList nList = doc.getElementsByTagName("foaf:SEmployee");
        Element elem = (Element) nList.item(0);
        if (elem == null) {
          throw new IllegalStateException("Metadata has no foaf:SEmployee element");
        }
        NodeList cuitNodes = elem.getElementsByTagName("foaf:cuit");
        if (cuitNodes.getLength() == 0) {
          throw new IllegalStateException("foaf:SEmployee has no foaf:cuit element");
        }
        String cuit = cuitNodes.item(0).getTextContent();

        System.out.println(cuit);

        System.out.println("---");
        System.out.println(metadata.getInputStreamAsString());
      }
    } finally {
      if (document != null) {
        try {
          document.close();
        } catch (Exception ignored) {
          // Best-effort close: a failure here must not mask an exception from the try body.
        }
      }
    }
  }
  /**
   * Writes the text of the loaded PDF ({@code pd}), without any further processing, to the given
   * file and then closes both the writer and the document.
   *
   * <p>Exceptions are caught and their stack trace is printed; nothing is rethrown.
   *
   * @param filename path of the text file to write
   */
  public void PDF2Text(String filename) {
    try {
      File output =
          new File(filename); // The text file where you are going to store the extracted data

      stripper = new PDFTextStripper();

      wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output)));
      try {
        stripper.writeText(pd, wr);
      } finally {
        // close() flushes the buffer; without it buffered text may never reach the file.
        wr.close();
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      // Release the document even when extraction failed.
      if (pd != null) {
        try {
          pd.close();
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    }
  }
  /**
   * Extracts the text of the loaded PDF ({@code pd}) paragraph by paragraph and writes every
   * paragraph longer than 200 characters to the given file, one per line (terminated by
   * {@code \r\n}), with line and page breaks inside the paragraph replaced by a single space.
   *
   * <p>The length check is applied before the {@code #%#} break markers are replaced. The
   * document is closed afterwards. Exceptions are caught and their stack trace is printed;
   * nothing is rethrown.
   *
   * @param filename path of the text file to write
   */
  public void PDF2TextPreProssesd(String filename) {
    try {

      stripper = new PDFTextStripper();
      // Temporary markers so paragraphs and line/page breaks can be told apart in the raw text.
      stripper.setParagraphStart("&*&");
      stripper.setLineSeparator("#%#");
      stripper.setPageSeparator("#%#");
      String fulltxt = stripper.getText(pd);
      // String.split takes a regex: unescaped, "&*&" means "one or more '&'" and would also cut
      // paragraphs at every literal ampersand. Escape the '*' to match the marker itself.
      String[] paras = fulltxt.split("&\\*&");

      File file = new File(filename);
      try {
        BufferedWriter out = new BufferedWriter(new FileWriter(file));
        try {
          for (String rawPara : paras) {
            if (rawPara.length() > 200) {
              String para = rawPara.replace("#%#", " ");

              out.write(para + "\r\n");
            }
          }
        } finally {
          // Close (and flush) even when a write fails so the file handle is not leaked.
          out.close();
        }

      } catch (IOException ex) {
        ex.printStackTrace();
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      // Release the document even when extraction failed.
      if (pd != null) {
        try {
          pd.close();
        } catch (Exception e) {
          e.printStackTrace();
        }
      }
    }
  }