/**
 * Is this one of the kinds of formats which uses CompObj to store all of their data, eg Star
 * Draw, Star Impress or (older) Works? If not, it's likely an embedded resource.
 *
 * <p>The CompObj stream is read completely and searched for the ASCII name of the application
 * that created the file.
 *
 * @param root the directory that is expected to contain the CompObj entry
 * @return the media type whose application name was found in the CompObj stream, or {@code OLE}
 *     when no known name matched, the entry is not a document entry, or reading it failed
 */
private static MediaType processCompObjFormatType(DirectoryEntry root) {
  try {
    Entry e = root.getEntry("\u0001CompObj");
    if (e != null && e.isDocumentEntry()) {
      DocumentNode dn = (DocumentNode) e;
      DocumentInputStream stream = new DocumentInputStream(dn);
      byte[] bytes;
      try {
        bytes = IOUtils.toByteArray(stream);
      } finally {
        // Always release the stream, even when reading fails part-way through.
        stream.close();
      }
      /*
       * This array contains a string with a normal ASCII name of the
       * application used to create this file. We want to search for that
       * name.
       */
      if (arrayContains(bytes, MS_GRAPH_CHART_BYTES)) {
        return MS_GRAPH_CHART;
      } else if (arrayContains(bytes, STAR_DRAW)) {
        return SDA;
      } else if (arrayContains(bytes, STAR_IMPRESS)) {
        return SDD;
      } else if (arrayContains(bytes, WORKS_QUILL96)) {
        return WPS;
      }
    }
  } catch (Exception ignored) {
    /*
     * "root.getEntry" can throw FileNotFoundException. The code inside
     * "if" can throw IOExceptions. Theoretically. Practically no
     * exceptions will likely ever appear.
     *
     * Swallow all of them on purpose. If any occur, we just assume that we
     * can't distinguish between Draw and Impress and return something safe:
     * x-tika-msoffice
     */
  }
  return OLE;
}
/**
 * Collects the names of all entries that sit directly below the given directory node.
 *
 * @param root the directory node whose immediate children are inspected
 * @return a new, mutable set holding the name of every direct child entry
 */
private static Set<String> getTopLevelNames(DirectoryNode root) {
  final Set<String> topLevelNames = new HashSet<String>();
  for (final Entry child : root) {
    final String childName = child.getName();
    topLevelNames.add(childName);
  }
  return topLevelNames;
}
protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) throws IOException { for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) { if (entry instanceof DirectoryEntry) { // Need to recurse DirectoryEntry newDir = destDir.createDirectory(entry.getName()); copy((DirectoryEntry) entry, newDir); } else { // Copy entry InputStream contents = new DocumentInputStream((DocumentEntry) entry); try { destDir.createDocument(entry.getName(), contents); } finally { contents.close(); } } } }
/**
 * Compares two {@link DirectoryEntry} instances of a POI file system. The directories must
 * contain the same streams with the same names and contents.
 *
 * <p>The comparison stops at the first difference found.
 *
 * @param d1 The first directory.
 * @param d2 The second directory.
 * @param msg The method may append human-readable comparison messages to this string buffer.
 * @return <code>true</code> if the directories are equal, else <code>false</code>.
 * @exception MarkUnsupportedException if a POI document stream does not support the mark()
 *     operation.
 * @exception NoPropertySetStreamException if the application tries to create a property set from
 *     a POI document stream that is not a property set stream.
 * @exception UnsupportedEncodingException if it is raised while comparing the contained entries.
 * @exception IOException if any I/O exception occurs.
 */
private static boolean equal(
    final DirectoryEntry d1, final DirectoryEntry d2, final StringBuffer msg)
    throws NoPropertySetStreamException,
        MarkUnsupportedException,
        UnsupportedEncodingException,
        IOException {
  /* Iterate over d1 and compare each entry with its counterpart in d2. */
  for (final Iterator<?> i = d1.getEntries(); i.hasNext(); ) {
    final Entry e1 = (Entry) i.next();
    final String n1 = e1.getName();
    final Entry e2;
    try {
      e2 = d2.getEntry(n1);
    } catch (FileNotFoundException ex) {
      // There is no counterpart object to print here, so report the name and both directories.
      msg.append("Document \"")
          .append(n1)
          .append("\" exists in the first directory (\"")
          .append(d1.getName())
          .append("\") but not in the second directory (\"")
          .append(d2.getName())
          .append("\").\n");
      return false;
    }

    final boolean same;
    if (e1.isDirectoryEntry() && e2.isDirectoryEntry()) {
      same = equal((DirectoryEntry) e1, (DirectoryEntry) e2, msg);
    } else if (e1.isDocumentEntry() && e2.isDocumentEntry()) {
      same = equal((DocumentEntry) e1, (DocumentEntry) e2, msg);
    } else {
      msg.append("One of \"")
          .append(e1)
          .append("\" and \"")
          .append(e2)
          .append("\" is a document while the other one is a directory.\n");
      same = false;
    }
    if (!same) {
      return false;
    }
  }

  /* Iterate over d2 just to make sure that there are no entries in d2
   * that are not in d1. Only the existence of a same-named entry is
   * checked; contents were already compared by the loop above. */
  for (final Iterator<?> i = d2.getEntries(); i.hasNext(); ) {
    final Entry e2 = (Entry) i.next();
    final String n2 = e2.getName();
    try {
      d1.getEntry(n2);
    } catch (FileNotFoundException ex) {
      msg.append("Document \"")
          .append(n2)
          .append("\" exists in the second directory (\"")
          .append(d2.getName())
          .append("\") but not in the first directory (\"")
          .append(d1.getName())
          .append("\").\n");
      return false;
    }
  }
  return true;
}
/** * Writes out the word file that is represented by an instance of this class. * * @param out The OutputStream to write to. * @throws IOException If there is an unexpected IOException from the passed in OutputStream. */ public void write(OutputStream out) throws IOException { // initialize our streams for writing. HWPFFileSystem docSys = new HWPFFileSystem(); HWPFOutputStream wordDocumentStream = docSys.getStream(STREAM_WORD_DOCUMENT); HWPFOutputStream tableStream = docSys.getStream(STREAM_TABLE_1); // HWPFOutputStream dataStream = docSys.getStream("Data"); int tableOffset = 0; // FileInformationBlock fib = (FileInformationBlock)_fib.clone(); // clear the offsets and sizes in our FileInformationBlock. _fib.clearOffsetsSizes(); // determine the FileInformationBLock size int fibSize = _fib.getSize(); fibSize += POIFSConstants.SMALLER_BIG_BLOCK_SIZE - (fibSize % POIFSConstants.SMALLER_BIG_BLOCK_SIZE); // preserve space for the FileInformationBlock because we will be writing // it after we write everything else. byte[] placeHolder = new byte[fibSize]; wordDocumentStream.write(placeHolder); int mainOffset = wordDocumentStream.getOffset(); // write out the StyleSheet. _fib.setFcStshf(tableOffset); _ss.writeTo(tableStream); _fib.setLcbStshf(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); // get fcMin and fcMac because we will be writing the actual text with the // complex table. int fcMin = mainOffset; /* * clx (encoding of the sprm lists for a complex file and piece table * for a any file) Written immediately after the end of the previously * recorded structure. This is recorded in all Word documents * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 23 of 210 */ // write out the Complex table, includes text. 
_fib.setFcClx(tableOffset); _cft.writeTo(wordDocumentStream, tableStream); _fib.setLcbClx(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); int fcMac = wordDocumentStream.getOffset(); /* * dop (document properties record) Written immediately after the end of * the previously recorded structure. This is recorded in all Word * documents * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 23 of 210 */ // write out the DocumentProperties. _fib.setFcDop(tableOffset); _dop.writeTo(tableStream); _fib.setLcbDop(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); /* * plcfBkmkf (table recording beginning CPs of bookmarks) Written * immediately after the sttbfBkmk, if the document contains bookmarks. * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 24 of 210 */ if (_bookmarksTables != null) { _bookmarksTables.writePlcfBkmkf(_fib, tableStream); tableOffset = tableStream.getOffset(); } /* * plcfBkmkl (table recording limit CPs of bookmarks) Written * immediately after the plcfBkmkf, if the document contains bookmarks. * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 24 of 210 */ if (_bookmarksTables != null) { _bookmarksTables.writePlcfBkmkl(_fib, tableStream); tableOffset = tableStream.getOffset(); } /* * plcfbteChpx (bin table for CHP FKPs) Written immediately after the * previously recorded table. This is recorded in all Word documents. * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 24 of 210 */ // write out the CHPBinTable. _fib.setFcPlcfbteChpx(tableOffset); _cbt.writeTo(wordDocumentStream, tableStream, fcMin, _cft.getTextPieceTable()); _fib.setLcbPlcfbteChpx(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); /* * plcfbtePapx (bin table for PAP FKPs) Written immediately after the * plcfbteChpx. This is recorded in all Word documents. 
* * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 24 of 210 */ // write out the PAPBinTable. _fib.setFcPlcfbtePapx(tableOffset); _pbt.writeTo(wordDocumentStream, tableStream, _cft.getTextPieceTable()); _fib.setLcbPlcfbtePapx(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); /* * plcfendRef (endnote reference position table) Written immediately * after the previously recorded table if the document contains endnotes * * plcfendTxt (endnote text position table) Written immediately after * the plcfendRef if the document contains endnotes * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 24 of 210 */ _endnotesTables.writeRef(_fib, tableStream); _endnotesTables.writeTxt(_fib, tableStream); tableOffset = tableStream.getOffset(); /* * plcffld*** (table of field positions and statuses for annotation * subdocument) Written immediately after the previously recorded table, * if the ******* subdocument contains fields. * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 24 of 210 */ if (_fieldsTables != null) { _fieldsTables.write(_fib, tableStream); tableOffset = tableStream.getOffset(); } /* * plcffndRef (footnote reference position table) Written immediately * after the stsh if the document contains footnotes * * plcffndTxt (footnote text position table) Written immediately after * the plcffndRef if the document contains footnotes * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 24 of 210 */ _footnotesTables.writeRef(_fib, tableStream); _footnotesTables.writeTxt(_fib, tableStream); tableOffset = tableStream.getOffset(); /* * plcfsed (section table) Written immediately after the previously * recorded table. Recorded in all Word documents * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 25 of 210 */ // write out the SectionTable. 
_fib.setFcPlcfsed(tableOffset); _st.writeTo(wordDocumentStream, tableStream); _fib.setLcbPlcfsed(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); // write out the list tables if (_lt != null) { /* * plcflst (list formats) Written immediately after the end of the * previously recorded, if there are any lists defined in the * document. This begins with a short count of LSTF structures * followed by those LSTF structures. This is immediately followed * by the allocated data hanging off the LSTFs. This data consists * of the array of LVLs for each LSTF. (Each LVL consists of an LVLF * followed by two grpprls and an XST.) * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 25 of 210 */ _lt.writeListDataTo(_fib, tableStream); tableOffset = tableStream.getOffset(); /* * plflfo (more list formats) Written immediately after the end of * the plcflst and its accompanying data, if there are any lists * defined in the document. This consists first of a PL of LFO * records, followed by the allocated data (if any) hanging off the * LFOs. The allocated data consists of the array of LFOLVLFs for * each LFO (and each LFOLVLF is immediately followed by some LVLs). * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 26 of 210 */ _fib.setFcPlfLfo(tableStream.getOffset()); _lt.writeListOverridesTo(tableStream); _fib.setLcbPlfLfo(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); } /* * sttbfBkmk (table of bookmark name strings) Written immediately after * the previously recorded table, if the document contains bookmarks. * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 27 of 210 */ if (_bookmarksTables != null) { _bookmarksTables.writeSttbfBkmk(_fib, tableStream); tableOffset = tableStream.getOffset(); } /* * sttbSavedBy (last saved by string table) Written immediately after * the previously recorded table. 
* * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 27 of 210 */ // write out the saved-by table. if (_sbt != null) { _fib.setFcSttbSavedBy(tableOffset); _sbt.writeTo(tableStream); _fib.setLcbSttbSavedBy(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); } // write out the revision mark authors table. if (_rmat != null) { _fib.setFcSttbfRMark(tableOffset); _rmat.writeTo(tableStream); _fib.setLcbSttbfRMark(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); } // write out the FontTable. _fib.setFcSttbfffn(tableOffset); _ft.writeTo(tableStream); _fib.setLcbSttbfffn(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); // set some variables in the FileInformationBlock. _fib.getFibBase().setFcMin(fcMin); _fib.getFibBase().setFcMac(fcMac); _fib.setCbMac(wordDocumentStream.getOffset()); // make sure that the table, doc and data streams use big blocks. byte[] mainBuf = wordDocumentStream.toByteArray(); if (mainBuf.length < 4096) { byte[] tempBuf = new byte[4096]; System.arraycopy(mainBuf, 0, tempBuf, 0, mainBuf.length); mainBuf = tempBuf; } // Table1 stream will be used _fib.getFibBase().setFWhichTblStm(true); // write out the FileInformationBlock. 
// _fib.serialize(mainBuf, 0); _fib.writeTo(mainBuf, tableStream); byte[] tableBuf = tableStream.toByteArray(); if (tableBuf.length < 4096) { byte[] tempBuf = new byte[4096]; System.arraycopy(tableBuf, 0, tempBuf, 0, tableBuf.length); tableBuf = tempBuf; } byte[] dataBuf = _dataStream; if (dataBuf == null) { dataBuf = new byte[4096]; } if (dataBuf.length < 4096) { byte[] tempBuf = new byte[4096]; System.arraycopy(dataBuf, 0, tempBuf, 0, dataBuf.length); dataBuf = tempBuf; } // create new document preserving order of entries POIFSFileSystem pfs = new POIFSFileSystem(); boolean docWritten = false; boolean dataWritten = false; boolean objectPoolWritten = false; boolean tableWritten = false; boolean propertiesWritten = false; for (Iterator<Entry> iter = directory.getEntries(); iter.hasNext(); ) { Entry entry = iter.next(); if (entry.getName().equals(STREAM_WORD_DOCUMENT)) { if (!docWritten) { pfs.createDocument(new ByteArrayInputStream(mainBuf), STREAM_WORD_DOCUMENT); docWritten = true; } } else if (entry.getName().equals(STREAM_OBJECT_POOL)) { if (!objectPoolWritten) { _objectPool.writeTo(pfs.getRoot()); objectPoolWritten = true; } } else if (entry.getName().equals(STREAM_TABLE_0) || entry.getName().equals(STREAM_TABLE_1)) { if (!tableWritten) { pfs.createDocument(new ByteArrayInputStream(tableBuf), STREAM_TABLE_1); tableWritten = true; } } else if (entry.getName().equals(SummaryInformation.DEFAULT_STREAM_NAME) || entry.getName().equals(DocumentSummaryInformation.DEFAULT_STREAM_NAME)) { if (!propertiesWritten) { writeProperties(pfs); propertiesWritten = true; } } else if (entry.getName().equals(STREAM_DATA)) { if (!dataWritten) { pfs.createDocument(new ByteArrayInputStream(dataBuf), STREAM_DATA); dataWritten = true; } } else { EntryUtils.copyNodeRecursively(entry, pfs.getRoot()); } } if (!docWritten) pfs.createDocument(new ByteArrayInputStream(mainBuf), STREAM_WORD_DOCUMENT); if (!tableWritten) pfs.createDocument(new ByteArrayInputStream(tableBuf), STREAM_TABLE_1); if 
(!propertiesWritten) writeProperties(pfs); if (!dataWritten) pfs.createDocument(new ByteArrayInputStream(dataBuf), STREAM_DATA); if (!objectPoolWritten) _objectPool.writeTo(pfs.getRoot()); pfs.writeFilesystem(out); this.directory = pfs.getRoot(); /* * since we updated all references in FIB and etc, using new arrays to * access data */ this.directory = pfs.getRoot(); this._tableStream = tableStream.toByteArray(); this._dataStream = dataBuf; }