/**
 * Prepares a fresh in-memory document for each test.
 *
 * <p>Creates a new output buffer and POI filesystem, then initializes the {@code dsi}
 * (DocumentSummaryInformation) and {@code si} (SummaryInformation) fields. Each property set
 * is read from the filesystem root if its stream exists; otherwise a new, empty one is created
 * through {@link PropertySetFactory}. Any other error fails the test.
 */
public void setUp() {
    bout = new ByteArrayOutputStream();
    poifs = new POIFSFileSystem();
    dir = poifs.getRoot();
    dsi = null;

    try {
        DocumentEntry dsiEntry =
            (DocumentEntry) dir.getEntry(DocumentSummaryInformation.DEFAULT_STREAM_NAME);
        DocumentInputStream dis = new DocumentInputStream(dsiEntry);
        PropertySet ps = new PropertySet(dis);
        dis.close();
        dsi = new DocumentSummaryInformation(ps);
    } catch (FileNotFoundException ex) {
        // There is no document summary information yet. We have to create a new one.
        dsi = PropertySetFactory.newDocumentSummaryInformation();
        assertNotNull(dsi);
    } catch (IOException e) {
        e.printStackTrace();
        fail("Could not read the document summary information: " + e);
    } catch (NoPropertySetStreamException e) {
        e.printStackTrace();
        fail("Could not read the document summary information: " + e);
    } catch (MarkUnsupportedException e) {
        e.printStackTrace();
        fail("Could not read the document summary information: " + e);
    } catch (UnexpectedPropertySetTypeException e) {
        e.printStackTrace();
        fail("Could not read the document summary information: " + e);
    }
    assertNotNull(dsi);

    try {
        DocumentEntry siEntry =
            (DocumentEntry) dir.getEntry(SummaryInformation.DEFAULT_STREAM_NAME);
        DocumentInputStream dis = new DocumentInputStream(siEntry);
        PropertySet ps = new PropertySet(dis);
        dis.close();
        si = new SummaryInformation(ps);
    } catch (FileNotFoundException ex) {
        // There is no summary information yet. We have to create a new one.
        si = PropertySetFactory.newSummaryInformation();
        assertNotNull(si);
    } catch (IOException e) {
        e.printStackTrace();
        fail("Could not read the summary information: " + e);
    } catch (NoPropertySetStreamException e) {
        e.printStackTrace();
        fail("Could not read the summary information: " + e);
    } catch (MarkUnsupportedException e) {
        e.printStackTrace();
        fail("Could not read the summary information: " + e);
    } catch (UnexpectedPropertySetTypeException e) {
        e.printStackTrace();
        fail("Could not read the summary information: " + e);
    }
    // Check the summary information itself; dsi has already been verified above.
    assertNotNull(si);
}
/**
 * Copies the bytes from a {@link DocumentInputStream} to a new stream in a POI filesystem.
 *
 * <p>The source stream is read completely into memory and is always closed, even if resolving
 * the target directory or reading the stream fails.
 *
 * @param poiFs The POI filesystem to write to.
 * @param path The source document's path.
 * @param name The source document's name.
 * @param stream The stream containing the source document. It is closed by this method.
 * @throws IOException if the source stream cannot be read or closed, or if the target
 *     document cannot be created.
 */
public void copy(
        final POIFSFileSystem poiFs,
        final POIFSDocumentPath path,
        final String name,
        final DocumentInputStream stream) throws IOException {
    final DirectoryEntry de;
    final ByteArrayOutputStream out = new ByteArrayOutputStream();
    try {
        de = getPath(poiFs, path);
        // Copy in chunks instead of one byte per call.
        final byte[] buffer = new byte[4096];
        int count;
        while ((count = stream.read(buffer)) != -1) {
            out.write(buffer, 0, count);
        }
    } finally {
        stream.close();
    }
    final InputStream in = new ByteArrayInputStream(out.toByteArray());
    de.createDocument(name, in);
}
/**
 * Closes the underlying input and releases the reference to it.
 *
 * <p>Calling this method when the input has already been released is a no-op, so repeated
 * calls do not throw a {@link NullPointerException}. The reference is cleared even if closing
 * the underlying input fails.
 *
 * @throws IOException if the underlying input cannot be closed.
 */
public void close() throws IOException {
    if (input == null) {
        return;
    }
    try {
        input.close();
    } finally {
        input = null;
    }
}
/**
 * Writes the current property sets to the filesystem, serializes the filesystem to the
 * in-memory buffer, and reads both property sets back from the serialized bytes.
 *
 * <p>Tests call this after modifying {@code dsi} and {@code si} so that they can then check
 * whether the values read back match what was written. After this method returns, {@code dsi}
 * and {@code si} refer to the re-read property sets. The {@code poifs} and {@code dir} fields
 * are left untouched and still refer to the filesystem that was written.
 */
public void closeAndReOpen() {
    try {
        dsi.write(dir, DocumentSummaryInformation.DEFAULT_STREAM_NAME);
        si.write(dir, SummaryInformation.DEFAULT_STREAM_NAME);
    } catch (WritingNotSupportedException e) {
        e.printStackTrace();
        fail("Could not write the property sets: " + e);
    } catch (IOException e) {
        e.printStackTrace();
        fail("Could not write the property sets: " + e);
    }

    si = null;
    dsi = null;
    try {
        poifs.writeFilesystem(bout);
        bout.flush();
    } catch (IOException e) {
        e.printStackTrace();
        fail("Could not write the filesystem: " + e);
    }

    InputStream is = new ByteArrayInputStream(bout.toByteArray());
    // Deliberately a local: the poifs field keeps pointing at the written filesystem.
    POIFSFileSystem reopenedFs = null;
    try {
        reopenedFs = new POIFSFileSystem(is);
    } catch (IOException e) {
        e.printStackTrace();
        fail("Could not re-open the filesystem: " + e);
    }
    try {
        is.close();
    } catch (IOException e) {
        e.printStackTrace();
        fail("Could not close the input stream: " + e);
    }
    assertNotNull(reopenedFs);

    /* Read the document summary information. */
    DirectoryEntry reopenedRoot = reopenedFs.getRoot();
    try {
        DocumentEntry dsiEntry =
            (DocumentEntry) reopenedRoot.getEntry(DocumentSummaryInformation.DEFAULT_STREAM_NAME);
        DocumentInputStream dis = new DocumentInputStream(dsiEntry);
        PropertySet ps = new PropertySet(dis);
        dis.close();
        dsi = new DocumentSummaryInformation(ps);
    } catch (FileNotFoundException ex) {
        ex.printStackTrace();
        fail("The document summary information was not found after re-opening: " + ex);
    } catch (IOException e) {
        e.printStackTrace();
        fail("Could not read the document summary information: " + e);
    } catch (NoPropertySetStreamException e) {
        e.printStackTrace();
        fail("Could not read the document summary information: " + e);
    } catch (MarkUnsupportedException e) {
        e.printStackTrace();
        fail("Could not read the document summary information: " + e);
    } catch (UnexpectedPropertySetTypeException e) {
        e.printStackTrace();
        fail("Could not read the document summary information: " + e);
    }

    /* Read the summary information. */
    try {
        DocumentEntry siEntry =
            (DocumentEntry) reopenedRoot.getEntry(SummaryInformation.DEFAULT_STREAM_NAME);
        DocumentInputStream dis = new DocumentInputStream(siEntry);
        PropertySet ps = new PropertySet(dis);
        dis.close();
        si = new SummaryInformation(ps);
    } catch (FileNotFoundException ex) {
        // The stream was written just above, so its absence means the round trip lost data.
        // Silently creating a new, empty property set here would hide that failure.
        ex.printStackTrace();
        fail("The summary information was not found after re-opening: " + ex);
    } catch (IOException e) {
        e.printStackTrace();
        fail("Could not read the summary information: " + e);
    } catch (NoPropertySetStreamException e) {
        e.printStackTrace();
        fail("Could not read the summary information: " + e);
    } catch (MarkUnsupportedException e) {
        e.printStackTrace();
        fail("Could not read the summary information: " + e);
    } catch (UnexpectedPropertySetTypeException e) {
        e.printStackTrace();
        fail("Could not read the summary information: " + e);
    }
}
// will throw IOException if not actually POIFS // can return null byte[] private byte[] handleEmbeddedPOIFS( InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException { NPOIFSFileSystem fs = null; byte[] ret = null; try { fs = new NPOIFSFileSystem(is); DirectoryNode root = fs.getRoot(); if (root == null) { return ret; } if (root.hasEntry("Package")) { Entry ooxml = root.getEntry("Package"); TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml)); ByteArrayOutputStream out = new ByteArrayOutputStream(); IOUtils.copy(stream, out); ret = out.toByteArray(); } else { // try poifs POIFSDocumentType type = POIFSDocumentType.detectType(root); if (type == POIFSDocumentType.OLE10_NATIVE) { try { // Try to un-wrap the OLE10Native record: Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root); ret = ole.getDataBuffer(); } catch (Ole10NativeException ex) { // Not a valid OLE10Native record, skip it } } else if (type == POIFSDocumentType.COMP_OBJ) { DocumentEntry contentsEntry; try { contentsEntry = (DocumentEntry) root.getEntry("CONTENTS"); } catch (FileNotFoundException ioe) { contentsEntry = (DocumentEntry) root.getEntry("Contents"); } DocumentInputStream inp = null; try { inp = new DocumentInputStream(contentsEntry); ret = new byte[contentsEntry.getSize()]; inp.readFully(ret); } finally { if (inp != null) { inp.close(); } } } else { ByteArrayOutputStream out = new ByteArrayOutputStream(); is.reset(); IOUtils.copy(is, out); ret = out.toByteArray(); metadata.set( Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension()); metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); } } } finally { if (fs != null) { fs.close(); } } return ret; }
public String extractText(InputStream in) throws IOException { ArrayList<WordTextPiece> text = new ArrayList<WordTextPiece>(); POIFSFileSystem fsys = new POIFSFileSystem(in); DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header); din.close(); // Prende le informazioni dall'header del documento int info = LittleEndian.getShort(header, 0xa); boolean useTable1 = (info & 0x200) != 0; // boolean useTable1 = true; // Prende informazioni dalla piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // int complexOffset = LittleEndian.getInt(header); String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); din = null; fsys = null; table = null; headerProps = null; int multiple = findText(tableStream, complexOffset, text); StringBuffer sb = new StringBuffer(); tableStream = null; for (int x = 0; x < text.size(); x++) { WordTextPiece nextPiece = (WordTextPiece) text.get(x); int start = nextPiece.getStart(); int length = nextPiece.getLength(); boolean unicode = nextPiece.usesUnicode(); String toStr = null; if (unicode) { toStr = new String(header, start, length * multiple, "UTF-8"); } else { toStr = new String(header, start, length, "big5"); } sb.append(toStr).append(" "); } return sb.toString(); }