/**
 * Returns the text piece table held by this document's complex file table.
 *
 * @return the {@link TextPieceTable} obtained from {@code _cft}
 */
@Internal
public TextPieceTable getTextTable() {
  final TextPieceTable pieces = _cft.getTextPieceTable();
  return pieces;
}
/** * Writes out the word file that is represented by an instance of this class. * * @param out The OutputStream to write to. * @throws IOException If there is an unexpected IOException from the passed in OutputStream. */ public void write(OutputStream out) throws IOException { // initialize our streams for writing. HWPFFileSystem docSys = new HWPFFileSystem(); HWPFOutputStream wordDocumentStream = docSys.getStream(STREAM_WORD_DOCUMENT); HWPFOutputStream tableStream = docSys.getStream(STREAM_TABLE_1); // HWPFOutputStream dataStream = docSys.getStream("Data"); int tableOffset = 0; // FileInformationBlock fib = (FileInformationBlock)_fib.clone(); // clear the offsets and sizes in our FileInformationBlock. _fib.clearOffsetsSizes(); // determine the FileInformationBLock size int fibSize = _fib.getSize(); fibSize += POIFSConstants.SMALLER_BIG_BLOCK_SIZE - (fibSize % POIFSConstants.SMALLER_BIG_BLOCK_SIZE); // preserve space for the FileInformationBlock because we will be writing // it after we write everything else. byte[] placeHolder = new byte[fibSize]; wordDocumentStream.write(placeHolder); int mainOffset = wordDocumentStream.getOffset(); // write out the StyleSheet. _fib.setFcStshf(tableOffset); _ss.writeTo(tableStream); _fib.setLcbStshf(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); // get fcMin and fcMac because we will be writing the actual text with the // complex table. int fcMin = mainOffset; /* * clx (encoding of the sprm lists for a complex file and piece table * for a any file) Written immediately after the end of the previously * recorded structure. This is recorded in all Word documents * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 23 of 210 */ // write out the Complex table, includes text. 
_fib.setFcClx(tableOffset); _cft.writeTo(wordDocumentStream, tableStream); _fib.setLcbClx(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); int fcMac = wordDocumentStream.getOffset(); /* * dop (document properties record) Written immediately after the end of * the previously recorded structure. This is recorded in all Word * documents * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 23 of 210 */ // write out the DocumentProperties. _fib.setFcDop(tableOffset); _dop.writeTo(tableStream); _fib.setLcbDop(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); /* * plcfBkmkf (table recording beginning CPs of bookmarks) Written * immediately after the sttbfBkmk, if the document contains bookmarks. * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 24 of 210 */ if (_bookmarksTables != null) { _bookmarksTables.writePlcfBkmkf(_fib, tableStream); tableOffset = tableStream.getOffset(); } /* * plcfBkmkl (table recording limit CPs of bookmarks) Written * immediately after the plcfBkmkf, if the document contains bookmarks. * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 24 of 210 */ if (_bookmarksTables != null) { _bookmarksTables.writePlcfBkmkl(_fib, tableStream); tableOffset = tableStream.getOffset(); } /* * plcfbteChpx (bin table for CHP FKPs) Written immediately after the * previously recorded table. This is recorded in all Word documents. * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 24 of 210 */ // write out the CHPBinTable. _fib.setFcPlcfbteChpx(tableOffset); _cbt.writeTo(wordDocumentStream, tableStream, fcMin, _cft.getTextPieceTable()); _fib.setLcbPlcfbteChpx(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); /* * plcfbtePapx (bin table for PAP FKPs) Written immediately after the * plcfbteChpx. This is recorded in all Word documents. 
* * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 24 of 210 */ // write out the PAPBinTable. _fib.setFcPlcfbtePapx(tableOffset); _pbt.writeTo(wordDocumentStream, tableStream, _cft.getTextPieceTable()); _fib.setLcbPlcfbtePapx(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); /* * plcfendRef (endnote reference position table) Written immediately * after the previously recorded table if the document contains endnotes * * plcfendTxt (endnote text position table) Written immediately after * the plcfendRef if the document contains endnotes * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 24 of 210 */ _endnotesTables.writeRef(_fib, tableStream); _endnotesTables.writeTxt(_fib, tableStream); tableOffset = tableStream.getOffset(); /* * plcffld*** (table of field positions and statuses for annotation * subdocument) Written immediately after the previously recorded table, * if the ******* subdocument contains fields. * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 24 of 210 */ if (_fieldsTables != null) { _fieldsTables.write(_fib, tableStream); tableOffset = tableStream.getOffset(); } /* * plcffndRef (footnote reference position table) Written immediately * after the stsh if the document contains footnotes * * plcffndTxt (footnote text position table) Written immediately after * the plcffndRef if the document contains footnotes * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 24 of 210 */ _footnotesTables.writeRef(_fib, tableStream); _footnotesTables.writeTxt(_fib, tableStream); tableOffset = tableStream.getOffset(); /* * plcfsed (section table) Written immediately after the previously * recorded table. Recorded in all Word documents * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 25 of 210 */ // write out the SectionTable. 
_fib.setFcPlcfsed(tableOffset); _st.writeTo(wordDocumentStream, tableStream); _fib.setLcbPlcfsed(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); // write out the list tables if (_lt != null) { /* * plcflst (list formats) Written immediately after the end of the * previously recorded, if there are any lists defined in the * document. This begins with a short count of LSTF structures * followed by those LSTF structures. This is immediately followed * by the allocated data hanging off the LSTFs. This data consists * of the array of LVLs for each LSTF. (Each LVL consists of an LVLF * followed by two grpprls and an XST.) * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 25 of 210 */ _lt.writeListDataTo(_fib, tableStream); tableOffset = tableStream.getOffset(); /* * plflfo (more list formats) Written immediately after the end of * the plcflst and its accompanying data, if there are any lists * defined in the document. This consists first of a PL of LFO * records, followed by the allocated data (if any) hanging off the * LFOs. The allocated data consists of the array of LFOLVLFs for * each LFO (and each LFOLVLF is immediately followed by some LVLs). * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 26 of 210 */ _fib.setFcPlfLfo(tableStream.getOffset()); _lt.writeListOverridesTo(tableStream); _fib.setLcbPlfLfo(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); } /* * sttbfBkmk (table of bookmark name strings) Written immediately after * the previously recorded table, if the document contains bookmarks. * * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 27 of 210 */ if (_bookmarksTables != null) { _bookmarksTables.writeSttbfBkmk(_fib, tableStream); tableOffset = tableStream.getOffset(); } /* * sttbSavedBy (last saved by string table) Written immediately after * the previously recorded table. 
* * Microsoft Office Word 97-2007 Binary File Format (.doc) * Specification; Page 27 of 210 */ // write out the saved-by table. if (_sbt != null) { _fib.setFcSttbSavedBy(tableOffset); _sbt.writeTo(tableStream); _fib.setLcbSttbSavedBy(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); } // write out the revision mark authors table. if (_rmat != null) { _fib.setFcSttbfRMark(tableOffset); _rmat.writeTo(tableStream); _fib.setLcbSttbfRMark(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); } // write out the FontTable. _fib.setFcSttbfffn(tableOffset); _ft.writeTo(tableStream); _fib.setLcbSttbfffn(tableStream.getOffset() - tableOffset); tableOffset = tableStream.getOffset(); // set some variables in the FileInformationBlock. _fib.getFibBase().setFcMin(fcMin); _fib.getFibBase().setFcMac(fcMac); _fib.setCbMac(wordDocumentStream.getOffset()); // make sure that the table, doc and data streams use big blocks. byte[] mainBuf = wordDocumentStream.toByteArray(); if (mainBuf.length < 4096) { byte[] tempBuf = new byte[4096]; System.arraycopy(mainBuf, 0, tempBuf, 0, mainBuf.length); mainBuf = tempBuf; } // Table1 stream will be used _fib.getFibBase().setFWhichTblStm(true); // write out the FileInformationBlock. 
// _fib.serialize(mainBuf, 0); _fib.writeTo(mainBuf, tableStream); byte[] tableBuf = tableStream.toByteArray(); if (tableBuf.length < 4096) { byte[] tempBuf = new byte[4096]; System.arraycopy(tableBuf, 0, tempBuf, 0, tableBuf.length); tableBuf = tempBuf; } byte[] dataBuf = _dataStream; if (dataBuf == null) { dataBuf = new byte[4096]; } if (dataBuf.length < 4096) { byte[] tempBuf = new byte[4096]; System.arraycopy(dataBuf, 0, tempBuf, 0, dataBuf.length); dataBuf = tempBuf; } // create new document preserving order of entries POIFSFileSystem pfs = new POIFSFileSystem(); boolean docWritten = false; boolean dataWritten = false; boolean objectPoolWritten = false; boolean tableWritten = false; boolean propertiesWritten = false; for (Iterator<Entry> iter = directory.getEntries(); iter.hasNext(); ) { Entry entry = iter.next(); if (entry.getName().equals(STREAM_WORD_DOCUMENT)) { if (!docWritten) { pfs.createDocument(new ByteArrayInputStream(mainBuf), STREAM_WORD_DOCUMENT); docWritten = true; } } else if (entry.getName().equals(STREAM_OBJECT_POOL)) { if (!objectPoolWritten) { _objectPool.writeTo(pfs.getRoot()); objectPoolWritten = true; } } else if (entry.getName().equals(STREAM_TABLE_0) || entry.getName().equals(STREAM_TABLE_1)) { if (!tableWritten) { pfs.createDocument(new ByteArrayInputStream(tableBuf), STREAM_TABLE_1); tableWritten = true; } } else if (entry.getName().equals(SummaryInformation.DEFAULT_STREAM_NAME) || entry.getName().equals(DocumentSummaryInformation.DEFAULT_STREAM_NAME)) { if (!propertiesWritten) { writeProperties(pfs); propertiesWritten = true; } } else if (entry.getName().equals(STREAM_DATA)) { if (!dataWritten) { pfs.createDocument(new ByteArrayInputStream(dataBuf), STREAM_DATA); dataWritten = true; } } else { EntryUtils.copyNodeRecursively(entry, pfs.getRoot()); } } if (!docWritten) pfs.createDocument(new ByteArrayInputStream(mainBuf), STREAM_WORD_DOCUMENT); if (!tableWritten) pfs.createDocument(new ByteArrayInputStream(tableBuf), STREAM_TABLE_1); if 
(!propertiesWritten) writeProperties(pfs); if (!dataWritten) pfs.createDocument(new ByteArrayInputStream(dataBuf), STREAM_DATA); if (!objectPoolWritten) _objectPool.writeTo(pfs.getRoot()); pfs.writeFilesystem(out); this.directory = pfs.getRoot(); /* * since we updated all references in FIB and etc, using new arrays to * access data */ this.directory = pfs.getRoot(); this._tableStream = tableStream.toByteArray(); this._dataStream = dataBuf; }
/**
 * Loads a Word document from a specific point in a POIFSFileSystem, probably not the default.
 * Typically used to open embedded documents.
 *
 * <p>The main stream and the FIB are loaded by the superclass constructor. This constructor then
 * reads the table stream selected by the FIB and the optional data stream, and builds the
 * document structures (text, character/paragraph bin tables, sections, styles, fonts, lists,
 * bookmarks, notes, fields, drawings) from them.
 *
 * <p>Two system properties change how the text structures are loaded:
 * {@code PROPERTY_PRESERVE_BIN_TABLES} keeps the PAPX/CHPX structure as found in the file, and
 * {@code PROPERTY_PRESERVE_TEXT_TABLE} keeps the text piece table as found in the file.
 *
 * @param directory The DirectoryNode that contains the Word document.
 * @throws IOException If there is an unexpected IOException from the passed in POIFSFileSystem.
 * @throws OldWordFileFormatException if the {@code nFib} value of the FIB is below 106
 * @throws IllegalStateException if the table stream named by the FIB is not in {@code directory}
 */
public HWPFDocument(DirectoryNode directory) throws IOException {
  // Load the main stream and FIB
  // Also handles HPSF bits
  super(directory);

  // Is this document too old for us?
  if (_fib.getFibBase().getNFib() < 106) {
    throw new OldWordFileFormatException(
        "The document is too old - Word 95 or older. Try HWPFOldDocument instead?");
  }

  // Use the FIB to determine the name of the table stream.
  String name = _fib.getFibBase().isFWhichTblStm() ? STREAM_TABLE_1 : STREAM_TABLE_0;

  // Grab the table stream.
  DocumentEntry tableProps;
  try {
    tableProps = (DocumentEntry) directory.getEntry(name);
  } catch (FileNotFoundException fnfe) {
    // Keep the original exception as the cause so the failing lookup stays visible.
    throw new IllegalStateException(
        "Table Stream '"
            + name
            + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)",
        fnfe);
  }

  // Read in the table stream.
  _tableStream = readWholeStream(directory, name, tableProps.getSize());

  _fib.fillVariableFields(_mainStream, _tableStream);

  // Read in the data stream; a document without one gets an empty array.
  byte[] dataBytes;
  try {
    DocumentEntry dataProps = (DocumentEntry) directory.getEntry(STREAM_DATA);
    dataBytes = readWholeStream(directory, STREAM_DATA, dataProps.getSize());
  } catch (FileNotFoundException e) {
    dataBytes = new byte[0];
  }
  _dataStream = dataBytes;

  // Get the cp of the start of text in the main stream
  // The latest spec doc says this is always zero!
  int fcMin = 0;

  // Start to load up our standard structures.
  _dop = new DocumentProperties(_tableStream, _fib.getFcDop(), _fib.getLcbDop());
  _cft = new ComplexFileTable(_mainStream, _tableStream, _fib.getFcClx(), fcMin);
  TextPieceTable textPieceTable = _cft.getTextPieceTable();

  // Now load the rest of the properties, which need to be adjusted
  // for where text really begin
  _cbt =
      new CHPBinTable(
          _mainStream,
          _tableStream,
          _fib.getFcPlcfbteChpx(),
          _fib.getLcbPlcfbteChpx(),
          textPieceTable);
  _pbt =
      new PAPBinTable(
          _mainStream,
          _tableStream,
          _dataStream,
          _fib.getFcPlcfbtePapx(),
          _fib.getLcbPlcfbtePapx(),
          textPieceTable);

  _text = textPieceTable.getText();

  /*
   * in this mode we preserving PAPX/CHPX structure from file, so text may
   * miss from output, and text order may be corrupted
   */
  boolean preserveBinTables = isSystemFlagSet(PROPERTY_PRESERVE_BIN_TABLES);
  if (!preserveBinTables) {
    _cbt.rebuild(_cft);
    _pbt.rebuild(_text, _cft);
  }

  /*
   * Property to disable text rebuilding. In this mode changing the text
   * will lead to unpredictable behavior
   */
  boolean preserveTextTable = isSystemFlagSet(PROPERTY_PRESERVE_TEXT_TABLE);
  if (!preserveTextTable) {
    // Replace the loaded piece table with a fresh one holding the whole text as one piece.
    _cft = new ComplexFileTable();
    textPieceTable = _cft.getTextPieceTable();
    final TextPiece textPiece = new SinglentonTextPiece(_text);
    textPieceTable.add(textPiece);
    _text = textPiece.getStringBuilder();
  }

  // Read FSPA and Escher information
  _fspaHeaders = new FSPATable(_tableStream, _fib, FSPADocumentPart.HEADER);
  _fspaMain = new FSPATable(_tableStream, _fib, FSPADocumentPart.MAIN);

  if (_fib.getFcDggInfo() != 0) {
    _escherRecordHolder =
        new EscherRecordHolder(_tableStream, _fib.getFcDggInfo(), _fib.getLcbDggInfo());
  } else {
    _escherRecordHolder = new EscherRecordHolder();
  }

  // read in the pictures stream
  _pictures = new PicturesTable(this, _dataStream, _mainStream, _fspaMain, _escherRecordHolder);

  // And the art shapes stream
  _officeArts = new ShapesTable(_tableStream, _fib);

  // And escher pictures
  _officeDrawingsHeaders =
      new OfficeDrawingsImpl(_fspaHeaders, _escherRecordHolder, _mainStream);
  _officeDrawingsMain = new OfficeDrawingsImpl(_fspaMain, _escherRecordHolder, _mainStream);

  // Uses whichever text piece table is current after the optional rebuild above.
  _st =
      new SectionTable(
          _mainStream,
          _tableStream,
          _fib.getFcPlcfsed(),
          _fib.getLcbPlcfsed(),
          fcMin,
          textPieceTable,
          _fib.getSubdocumentTextStreamLength(SubdocumentType.MAIN));
  _ss = new StyleSheet(_tableStream, _fib.getFcStshf());
  _ft = new FontTable(_tableStream, _fib.getFcSttbfffn(), _fib.getLcbSttbfffn());

  // The list, saved-by and revision-author tables are only loaded when the FIB gives them a
  // non-zero offset and a non-zero length.
  int listOffset = _fib.getFcPlcfLst();
  int lfoOffset = _fib.getFcPlfLfo();
  if (listOffset != 0 && _fib.getLcbPlcfLst() != 0) {
    _lt = new ListTables(_tableStream, listOffset, lfoOffset);
  }

  int sbtOffset = _fib.getFcSttbSavedBy();
  int sbtLength = _fib.getLcbSttbSavedBy();
  if (sbtOffset != 0 && sbtLength != 0) {
    _sbt = new SavedByTable(_tableStream, sbtOffset, sbtLength);
  }

  int rmarkOffset = _fib.getFcSttbfRMark();
  int rmarkLength = _fib.getLcbSttbfRMark();
  if (rmarkOffset != 0 && rmarkLength != 0) {
    _rmat = new RevisionMarkAuthorTable(_tableStream, rmarkOffset, rmarkLength);
  }

  _bookmarksTables = new BookmarksTables(_tableStream, _fib);
  _bookmarks = new BookmarksImpl(_bookmarksTables);

  _endnotesTables = new NotesTables(NoteType.ENDNOTE, _tableStream, _fib);
  _endnotes = new NotesImpl(_endnotesTables);
  _footnotesTables = new NotesTables(NoteType.FOOTNOTE, _tableStream, _fib);
  _footnotes = new NotesImpl(_footnotesTables);

  _fieldsTables = new FieldsTables(_tableStream, _fib);
  _fields = new FieldsImpl(_fieldsTables);
}

/**
 * Reads the named stream of {@code directory} into a new array of {@code size} bytes and closes
 * the stream afterwards.
 *
 * <p>Reading continues until the array is full or the stream ends. If the stream ends early the
 * remaining bytes are left as zero and no exception is raised.
 *
 * @param directory the directory that contains the stream
 * @param name the name of the stream entry
 * @param size the number of bytes to read
 * @return a new array of length {@code size}
 * @throws IOException if the stream cannot be opened, read or closed
 */
private static byte[] readWholeStream(DirectoryNode directory, String name, int size)
    throws IOException {
  byte[] buffer = new byte[size];
  java.io.InputStream in = directory.createDocumentInputStream(name);
  try {
    int filled = 0;
    while (filled < size) {
      // A single read may deliver fewer bytes than requested, so keep going until done.
      int count = in.read(buffer, filled, size - filled);
      if (count < 0) {
        break;
      }
      filled += count;
    }
  } finally {
    in.close();
  }
  return buffer;
}

/**
 * Reads a boolean system property on a best-effort basis.
 *
 * @param propertyName the name of the system property
 * @return the property parsed with {@link Boolean#parseBoolean(String)}, or {@code false} when
 *     the property cannot be read
 */
private static boolean isSystemFlagSet(String propertyName) {
  try {
    return Boolean.parseBoolean(System.getProperty(propertyName));
  } catch (RuntimeException ignored) {
    // Deliberately ignored: System.getProperty can fail (for example with a
    // SecurityException), and an unreadable flag is treated as not set.
    return false;
  }
}