/** * Constructor sets the input and output file and convert the pdf, docx and doc files to text . * * @param infile,outfile * @return */ public TextConvertor(String infile, String outfile) { try { File input = new File(infile); // The file from where you would like to extract FileInputStream fis = new FileInputStream(input.getAbsolutePath()); int x = fis.read(); int y = fis.read(); fis = new FileInputStream(input.getAbsolutePath()); if (x == 37 && y == 80) { filetype = "pdf"; pd = PDDocument.load(input); PDF2Text(outfile); } else if (x == 80 && y == 75) { filetype = "docx"; dx = new XWPFDocument(fis); DOCX2Text(outfile); } else if (x == 208 && y == 207) { filetype = "doc"; dc = new HWPFDocument(fis); DOC2Text(outfile); } } catch (Exception e) { e.printStackTrace(); } }
/**
 * Collects the dictionaries of all widget annotations found on the pages of a document.
 *
 * <p>Some writers do not produce a correct list of PDAcroFormField objects in the AcroForm. When
 * that list is empty, the candidates are gathered here by walking every page instead.
 *
 * <p>For each widget annotation its dictionary is appended to the result and its
 * {@code PDAcroFormField.DK_Parent} entry is removed. If at least one widget has a
 * {@code PDAcroFormField.DK_FT} entry equal to {@code PDAcroFormField.CN_FT_Sig}, the
 * {@code PDAcroForm.DK_SigFlags} entry of this object's dictionary is updated with the
 * append-only and signature-exists bits.
 *
 * @param doc The document to reconstruct; may be {@code null}.
 * @return The collected widget dictionaries; empty when {@code doc} is {@code null} or has no page
 *     tree.
 */
protected COSArray reconstruct(PDDocument doc) {
    COSArray widgets = COSArray.create();
    PDPageTree tree = (doc == null) ? null : doc.getPageTree();
    if (tree == null) {
        return widgets;
    }

    boolean foundSignature = false;
    PDPage current = tree.getFirstPage();
    while (current != null) {
        List pageAnnotations = current.getAnnotations();
        if (pageAnnotations != null) {
            Iterator cursor = pageAnnotations.iterator();
            while (cursor.hasNext()) {
                PDAnnotation candidate = (PDAnnotation) cursor.next();
                if (!candidate.isWidgetAnnotation()) {
                    continue;
                }
                COSDictionary widgetDict = candidate.cosGetDict();
                widgets.basicAddSilent(widgetDict);
                widgetDict.basicRemoveSilent(PDAcroFormField.DK_Parent);
                if (widgetDict.get(PDAcroFormField.DK_FT).equals(PDAcroFormField.CN_FT_Sig)) {
                    foundSignature = true;
                }
            }
        }
        current = current.getNextPage();
    }

    if (foundSignature) {
        int sigFlags =
                getFieldInt(PDAcroForm.DK_SigFlags, 0)
                        | AcroFormSigFlags.Bit_AppendOnly
                        | AcroFormSigFlags.Bit_SignatureExists;
        cosGetDict().basicPutSilent(PDAcroForm.DK_SigFlags, COSInteger.create(sigFlags));
    }
    return widgets;
}
/**
 * Resolves the page this destination points to.
 *
 * <p>The first element of the destination array is inspected: a number is treated as a page index
 * and looked up in the page tree of {@code doc}; a dictionary is wrapped directly as a page.
 *
 * <p>ATTENTION: dangling destinations that refer to invalid (null) pages are common!
 *
 * @param doc The document whose page tree is consulted when the destination holds a page index.
 * @return The destination page. Be sure to handle null return values.
 */
public PDPage getPage(PDDocument doc) {
    COSObject pageRef = cosGetArray().get(0);

    // Destination given as a page index.
    if (pageRef.asNumber() != null) {
        return doc.getPageTree().getPageAt(pageRef.asNumber().intValue());
    }

    // Destination given as a page dictionary; anything else cannot be resolved.
    return (pageRef.asDictionary() == null)
            ? null
            : (PDPage) PDPageNode.META.createFromCos(pageRef.asDictionary());
}
private static void extract(InputStream in) throws Exception { PDDocument document = null; try { PDFParser parser = new PDFParser(in); parser.parse(); document = parser.getPDDocument(); if (document.isEncrypted()) { System.err.println("Document is Encrypted!"); } PDDocumentCatalog cat = document.getDocumentCatalog(); PDMetadata metadata = cat.getMetadata(); if (metadata != null) { // System.out.println(metadata.getStream().getStreamTokens()); // Levantamos la MetaData DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); Document doc = dBuilder.parse(metadata.createInputStream()); // Buscamos el tag de SEmployee y el Element -> CUIT NodeList nList = doc.getElementsByTagName("foaf:SEmployee"); Element elem = (Element) nList.item(0); String cuit = elem.getElementsByTagName("foaf:cuit").item(0).getTextContent(); System.out.println(cuit); System.out.println("---"); System.out.println(metadata.getInputStreamAsString()); } } catch (Exception err) { throw err; } finally { if (document != null) try { document.close(); } catch (Throwable err2) { } } }
/** * save the converted text (without any processing) to the given file. * * @param filename * @return */ public void PDF2Text(String filename) { try { File output = new File(filename); // The text file where you are going to store the extracted data stripper = new PDFTextStripper(); wr = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output))); stripper.writeText(pd, wr); if (pd != null) { pd.close(); } } catch (Exception e) { e.printStackTrace(); } }
/**
 * Extracts the text of the loaded PDF paragraph by paragraph and writes the long paragraphs to
 * the given file.
 *
 * <p>The stripper is configured to emit {@code &*&} at each paragraph start and {@code #%#} as
 * line and page separator. The extracted text is split on the literal paragraph marker; every
 * paragraph longer than 200 characters is written on its own line (terminated by {@code \r\n})
 * with the separators replaced by single spaces. Shorter paragraphs are dropped.
 *
 * <p>The output is encoded with the platform default charset. The PDF document is closed whether
 * or not extraction succeeded. Failures are not propagated: they are printed via
 * {@link Throwable#printStackTrace()}.
 *
 * @param filename path of the text file that receives the selected paragraphs
 */
public void PDF2TextPreProssesd(String filename) {
    if (pd == null) {
        System.err.println(
                "PDF2TextPreProssesd: no PDF document is loaded, nothing written to " + filename);
        return;
    }
    final String paragraphMarker = "&*&";
    final String lineMarker = "#%#";
    try {
        stripper = new PDFTextStripper();
        stripper.setParagraphStart(paragraphMarker);
        stripper.setLineSeparator(lineMarker);
        stripper.setPageSeparator(lineMarker);
        String fulltxt = stripper.getText(pd);

        // String.split takes a regular expression; unquoted, "&*&" would match any run of '&'
        // and break paragraphs at every ampersand in the text.
        String[] paras = fulltxt.split(java.util.regex.Pattern.quote(paragraphMarker));

        File file = new File(filename);
        try (BufferedWriter out = new BufferedWriter(new FileWriter(file))) {
            for (String rawParagraph : paras) {
                if (rawParagraph.length() > 200) {
                    String para = rawParagraph.replace(lineMarker, " ");
                    out.write(para + "\r\n");
                }
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            pd.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}