コード例 #1
0
  /**
   * Detects the type of the input file from its first two bytes and converts a PDF, DOCX or DOC
   * file to text, written to the output file.
   *
   * <p>Files whose leading bytes match none of the known signatures are silently ignored. Any
   * exception raised while reading or converting is printed to standard error and not rethrown.
   *
   * @param infile path of the document to convert
   * @param outfile path of the text file to write
   */
  public TextConvertor(String infile, String outfile) {
    try {
      File input = new File(infile); // The file from where you would like to extract

      // Sniff the first two bytes; the stream is closed as soon as they are read.
      int x;
      int y;
      try (FileInputStream header = new FileInputStream(input)) {
        x = header.read();
        y = header.read();
      }

      if (x == '%' && y == 'P') {
        // "%P" — start of the "%PDF" header.
        filetype = "pdf";
        pd = PDDocument.load(input);
        PDF2Text(outfile);
      } else if (x == 'P' && y == 'K') {
        // "PK" — ZIP container signature. NOTE(review): this matches any ZIP file, not only DOCX.
        filetype = "docx";
        // Open the content stream only in the branch that needs it and keep it open until the
        // conversion is finished, then close it.
        try (FileInputStream fis = new FileInputStream(input)) {
          dx = new XWPFDocument(fis);
          DOCX2Text(outfile);
        }
      } else if (x == 0xD0 && y == 0xCF) {
        // 0xD0 0xCF — first bytes of the OLE2 compound document header used by legacy .doc files.
        filetype = "doc";
        try (FileInputStream fis = new FileInputStream(input)) {
          dc = new HWPFDocument(fis);
          DOC2Text(outfile);
        }
      }

    } catch (Exception e) {
      e.printStackTrace();
    }
  }
コード例 #2
0
 /**
  * Walks every page of the document and gathers the dictionaries of all widget annotations.
  *
  * <p>Some writers do not produce a correct list of all PDAcroFormField objects in the AcroForm.
  * When the list of children is empty, the candidates are searched for here instead. Each
  * collected dictionary has its {@code DK_Parent} entry removed. If any collected dictionary has
  * a {@code DK_FT} value equal to {@code CN_FT_Sig}, the append-only and signature-exists bits
  * are set in this object's {@code DK_SigFlags} entry.
  *
  * @param doc The document to reconstruct.
  * @return The collected widget annotation dictionaries; empty when {@code doc} or its page tree
  *     is {@code null}.
  */
 protected COSArray reconstruct(PDDocument doc) {
   COSArray widgets = COSArray.create();
   PDPageTree pages = (doc == null) ? null : doc.getPageTree();
   if (pages == null) {
     return widgets;
   }

   boolean foundSignature = false;
   PDPage current = pages.getFirstPage();
   while (current != null) {
     List pageAnnotations = current.getAnnotations();
     if (pageAnnotations != null) {
       Iterator cursor = pageAnnotations.iterator();
       while (cursor.hasNext()) {
         PDAnnotation candidate = (PDAnnotation) cursor.next();
         if (!candidate.isWidgetAnnotation()) {
           continue;
         }
         COSDictionary dict = candidate.cosGetDict();
         widgets.basicAddSilent(dict);
         dict.basicRemoveSilent(PDAcroFormField.DK_Parent);
         if (dict.get(PDAcroFormField.DK_FT).equals(PDAcroFormField.CN_FT_Sig)) {
           foundSignature = true;
         }
       }
     }
     current = current.getNextPage();
   }

   if (foundSignature) {
     int sigFlags =
         getFieldInt(PDAcroForm.DK_SigFlags, 0)
             | AcroFormSigFlags.Bit_AppendOnly
             | AcroFormSigFlags.Bit_SignatureExists;
     cosGetDict().basicPutSilent(PDAcroForm.DK_SigFlags, COSInteger.create(sigFlags));
   }
   return widgets;
 }
コード例 #3
0
 /**
  * Resolves the page this destination points to. ATTENTION: dangling destinations that refer to
  * invalid (null) pages are common!
  *
  * <p>The first element of the destination array is inspected: a number is treated as a page
  * index into the page tree of {@code doc}, a dictionary is wrapped as a page object directly.
  *
  * @param doc The document whose page tree is consulted when the destination holds a page index.
  * @return The destination page. Be sure to handle null return values.
  */
 public PDPage getPage(PDDocument doc) {
   COSObject target = cosGetArray().get(0);

   // Numeric target: look the page up by index.
   if (target.asNumber() != null) {
     return doc.getPageTree().getPageAt(target.asNumber().intValue());
   }

   // Neither a number nor a dictionary: nothing to resolve.
   if (target.asDictionary() == null) {
     return null;
   }
   return (PDPage) PDPageNode.META.createFromCos(target.asDictionary());
 }
コード例 #4
0
ファイル: ReadMetadata.java プロジェクト: Martum/pdfbox-test
  /**
   * Parses a PDF from the given stream and prints the CUIT found in its XML metadata, followed by
   * a separator line and the raw metadata text.
   *
   * <p>Nothing is printed when the document has no metadata. When the metadata lacks the expected
   * elements, a notice is written to standard error and the raw metadata is still printed.
   *
   * @param in stream positioned at the start of the PDF data
   * @throws Exception if the PDF or its metadata XML cannot be parsed
   */
  private static void extract(InputStream in) throws Exception {
    PDDocument document = null;
    try {
      PDFParser parser = new PDFParser(in);
      parser.parse();
      document = parser.getPDDocument();
      if (document.isEncrypted()) {
        System.err.println("Document is Encrypted!");
      }
      PDDocumentCatalog cat = document.getDocumentCatalog();
      PDMetadata metadata = cat.getMetadata();
      if (metadata != null) {
        // Load the metadata. It comes from inside the PDF and is therefore untrusted, so DOCTYPE
        // declarations are rejected to rule out external-entity (XXE) attacks.
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        dbFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        Document doc;
        try (InputStream xml = metadata.createInputStream()) {
          doc = dBuilder.parse(xml);
        }

        // Look up the SEmployee tag and its CUIT element.
        String cuit = findCuit(doc);
        if (cuit != null) {
          System.out.println(cuit);
        } else {
          System.err.println("No foaf:cuit element found in document metadata");
        }

        System.out.println("---");
        System.out.println(metadata.getInputStreamAsString());
      }
    } finally {
      if (document != null) {
        try {
          document.close();
        } catch (Throwable closeError) {
          // Best effort: a failed close must not mask the outcome of the extraction itself.
          System.err.println("Failed to close document: " + closeError);
        }
      }
    }
  }

  /** Returns the text of the first foaf:cuit inside the first foaf:SEmployee, or null if absent. */
  private static String findCuit(Document doc) {
    NodeList employees = doc.getElementsByTagName("foaf:SEmployee");
    if (employees.getLength() == 0) {
      return null;
    }
    Element employee = (Element) employees.item(0);
    NodeList cuits = employee.getElementsByTagName("foaf:cuit");
    if (cuits.getLength() == 0) {
      return null;
    }
    return cuits.item(0).getTextContent();
  }
コード例 #5
0
  /**
   * Saves the extracted text of the loaded PDF (without any processing) to the given file.
   *
   * <p>The output writer is closed once extraction finishes so buffered text actually reaches the
   * file, and the PDF document is closed whether or not extraction succeeds. Any exception is
   * printed to standard error and not rethrown.
   *
   * @param filename path of the text file to write
   */
  public void PDF2Text(String filename) {
    try {
      File output =
          new File(filename); // The text file where you are going to store the extracted data

      stripper = new PDFTextStripper();

      // NOTE(review): the writer uses the platform default charset, as before.
      try (BufferedWriter writer =
          new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output)))) {
        wr = writer;
        stripper.writeText(pd, writer);
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      // Release the document even when extraction failed part-way.
      if (pd != null) {
        try {
          pd.close();
        } catch (Exception closeError) {
          closeError.printStackTrace();
        }
      }
    }
  }
コード例 #6
0
  /**
   * Saves a filtered version of the loaded PDF's text to the given file.
   *
   * <p>The text is extracted with marker strings for paragraph starts and line/page breaks, split
   * into paragraphs at the paragraph marker, and only paragraphs longer than 200 characters are
   * written, one per line, with the line/page markers replaced by spaces. The PDF document is
   * closed whether or not extraction succeeds. Any exception is printed to standard error and not
   * rethrown.
   *
   * @param filename path of the text file to write
   */
  public void PDF2TextPreProssesd(String filename) {
    try {

      stripper = new PDFTextStripper();
      stripper.setParagraphStart("&*&");
      stripper.setLineSeparator("#%#");
      stripper.setPageSeparator("#%#");
      String fulltxt = stripper.getText(pd);
      // String.split takes a regular expression; the '*' must be escaped so the split happens on
      // the literal "&*&" marker instead of on every run of ampersands in the text.
      String[] paras = fulltxt.split("&\\*&");

      File file = new File(filename);
      // try-with-resources closes the writer even when a write fails.
      try (BufferedWriter out = new BufferedWriter(new FileWriter(file))) {
        for (String rawPara : paras) {
          if (rawPara.length() > 200) {
            String para = rawPara.replace("#%#", " ");
            out.write(para + "\r\n");
          }
        }
      } catch (IOException ex) {
        ex.printStackTrace();
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      // Release the document even when extraction failed part-way.
      if (pd != null) {
        try {
          pd.close();
        } catch (Exception closeError) {
          closeError.printStackTrace();
        }
      }
    }
  }