/** * This constructor loads a Word document from a specific point in a POIFSFileSystem, probably not * the default. Used typically to open embeded documents. * * @param directory The DirectoryNode that contains the Word document. * @throws IOException If there is an unexpected IOException from the passed in POIFSFileSystem. */ public HWPFDocumentCore(DirectoryNode directory) throws IOException { // Sort out the hpsf properties super(directory); // read in the main stream. DocumentEntry documentProps = (DocumentEntry) directory.getEntry("WordDocument"); _mainStream = new byte[documentProps.getSize()]; directory.createDocumentInputStream(STREAM_WORD_DOCUMENT).read(_mainStream); // Create our FIB, and check for the doc being encrypted _fib = new FileInformationBlock(_mainStream); DirectoryEntry objectPoolEntry; try { objectPoolEntry = (DirectoryEntry) directory.getEntry(STREAM_OBJECT_POOL); } catch (FileNotFoundException exc) { objectPoolEntry = null; } _objectPool = new ObjectPoolImpl(objectPoolEntry); }
/**
 * Loads a Word document from a specific point in a POIFSFileSystem, probably not the default.
 * Typically used to open embedded documents.
 *
 * <p>After the superclass has loaded the main stream and the FIB, this reads the table stream
 * and the (optional) data stream and builds the document structures from them.
 *
 * @param directory The DirectoryNode that contains the Word document.
 * @throws IOException If there is an unexpected IOException from the passed in POIFSFileSystem.
 * @throws OldWordFileFormatException If the FIB reports an nFib below 106.
 * @throws IllegalStateException If the table stream named by the FIB is not present.
 */
public HWPFDocument(DirectoryNode directory) throws IOException {
    // Load the main stream and FIB
    // Also handles HPSF bits
    super(directory);

    // Is this document too old for us?
    if (_fib.getFibBase().getNFib() < 106) {
        throw new OldWordFileFormatException(
                "The document is too old - Word 95 or older. Try HWPFOldDocument instead?");
    }

    // Use the fib to determine the name of the table stream.
    String name = STREAM_TABLE_0;
    if (_fib.getFibBase().isFWhichTblStm()) {
        name = STREAM_TABLE_1;
    }

    // Grab the table stream.
    DocumentEntry tableProps;
    try {
        tableProps = (DocumentEntry) directory.getEntry(name);
    } catch (FileNotFoundException fnfe) {
        // Keep the original exception as the cause so the failure stays diagnosable.
        throw new IllegalStateException(
                "Table Stream '"
                        + name
                        + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)",
                fnfe);
    }

    // Read in the table stream.
    _tableStream = readEntryBytes(directory, name, tableProps.getSize());

    _fib.fillVariableFields(_mainStream, _tableStream);

    // Read in the data stream; it is optional, so a missing entry yields an empty array.
    try {
        DocumentEntry dataProps = (DocumentEntry) directory.getEntry(STREAM_DATA);
        _dataStream = readEntryBytes(directory, STREAM_DATA, dataProps.getSize());
    } catch (FileNotFoundException e) {
        _dataStream = new byte[0];
    }

    // Get the cp of the start of text in the main stream
    // The latest spec doc says this is always zero!
    int fcMin = 0;

    // Start to load up our standard structures.
    _dop = new DocumentProperties(_tableStream, _fib.getFcDop(), _fib.getLcbDop());
    _cft = new ComplexFileTable(_mainStream, _tableStream, _fib.getFcClx(), fcMin);
    TextPieceTable _tpt = _cft.getTextPieceTable();

    // Now load the rest of the properties, which need to be adjusted
    // for where text really begins
    _cbt =
            new CHPBinTable(
                    _mainStream,
                    _tableStream,
                    _fib.getFcPlcfbteChpx(),
                    _fib.getLcbPlcfbteChpx(),
                    _tpt);
    _pbt =
            new PAPBinTable(
                    _mainStream,
                    _tableStream,
                    _dataStream,
                    _fib.getFcPlcfbtePapx(),
                    _fib.getLcbPlcfbtePapx(),
                    _tpt);

    _text = _tpt.getText();

    /*
     * When this property is set, the PAPX/CHPX structures are preserved as
     * read from the file, so text may be missing from the output and the
     * text order may be corrupted.
     */
    boolean preserveBinTables = false;
    try {
        preserveBinTables =
                Boolean.parseBoolean(System.getProperty(PROPERTY_PRESERVE_BIN_TABLES));
    } catch (Exception ignored) {
        // Best effort: keep the default (false) if the property cannot be read.
    }

    if (!preserveBinTables) {
        _cbt.rebuild(_cft);
        _pbt.rebuild(_text, _cft);
    }

    /*
     * Property to disable text rebuilding. In this mode changing the text
     * will lead to unpredictable behavior.
     */
    boolean preserveTextTable = false;
    try {
        preserveTextTable =
                Boolean.parseBoolean(System.getProperty(PROPERTY_PRESERVE_TEXT_TABLE));
    } catch (Exception ignored) {
        // Best effort: keep the default (false) if the property cannot be read.
    }

    if (!preserveTextTable) {
        // Replace the loaded piece table with a fresh one holding the whole text as one piece.
        _cft = new ComplexFileTable();
        _tpt = _cft.getTextPieceTable();
        final TextPiece textPiece = new SinglentonTextPiece(_text);
        _tpt.add(textPiece);
        _text = textPiece.getStringBuilder();
    }

    // Read FSPA and Escher information
    _fspaHeaders = new FSPATable(_tableStream, _fib, FSPADocumentPart.HEADER);
    _fspaMain = new FSPATable(_tableStream, _fib, FSPADocumentPart.MAIN);

    if (_fib.getFcDggInfo() != 0) {
        _escherRecordHolder =
                new EscherRecordHolder(_tableStream, _fib.getFcDggInfo(), _fib.getLcbDggInfo());
    } else {
        _escherRecordHolder = new EscherRecordHolder();
    }

    // Read in the pictures stream
    _pictures = new PicturesTable(this, _dataStream, _mainStream, _fspaMain, _escherRecordHolder);

    // And the art shapes stream
    _officeArts = new ShapesTable(_tableStream, _fib);

    // And escher pictures
    _officeDrawingsHeaders =
            new OfficeDrawingsImpl(_fspaHeaders, _escherRecordHolder, _mainStream);
    _officeDrawingsMain = new OfficeDrawingsImpl(_fspaMain, _escherRecordHolder, _mainStream);

    _st =
            new SectionTable(
                    _mainStream,
                    _tableStream,
                    _fib.getFcPlcfsed(),
                    _fib.getLcbPlcfsed(),
                    fcMin,
                    _tpt,
                    _fib.getSubdocumentTextStreamLength(SubdocumentType.MAIN));
    _ss = new StyleSheet(_tableStream, _fib.getFcStshf());
    _ft = new FontTable(_tableStream, _fib.getFcSttbfffn(), _fib.getLcbSttbfffn());

    // The list tables are only built when the FIB gives a non-zero offset and length.
    int listOffset = _fib.getFcPlcfLst();
    int lfoOffset = _fib.getFcPlfLfo();
    if (listOffset != 0 && _fib.getLcbPlcfLst() != 0) {
        _lt = new ListTables(_tableStream, listOffset, lfoOffset);
    }

    int sbtOffset = _fib.getFcSttbSavedBy();
    int sbtLength = _fib.getLcbSttbSavedBy();
    if (sbtOffset != 0 && sbtLength != 0) {
        _sbt = new SavedByTable(_tableStream, sbtOffset, sbtLength);
    }

    int rmarkOffset = _fib.getFcSttbfRMark();
    int rmarkLength = _fib.getLcbSttbfRMark();
    if (rmarkOffset != 0 && rmarkLength != 0) {
        _rmat = new RevisionMarkAuthorTable(_tableStream, rmarkOffset, rmarkLength);
    }

    _bookmarksTables = new BookmarksTables(_tableStream, _fib);
    _bookmarks = new BookmarksImpl(_bookmarksTables);

    _endnotesTables = new NotesTables(NoteType.ENDNOTE, _tableStream, _fib);
    _endnotes = new NotesImpl(_endnotesTables);
    _footnotesTables = new NotesTables(NoteType.FOOTNOTE, _tableStream, _fib);
    _footnotes = new NotesImpl(_footnotesTables);

    _fieldsTables = new FieldsTables(_tableStream, _fib);
    _fields = new FieldsImpl(_fieldsTables);
}

/**
 * Reads up to {@code size} bytes of the named document stream into a new array and closes the
 * stream afterwards.
 *
 * <p>Reading continues until the array is full, since a single {@code read} call may return
 * fewer bytes than requested. A premature end of stream is tolerated and leaves the remaining
 * bytes zeroed.
 *
 * @param directory the directory that holds the stream
 * @param name the name of the stream entry to open
 * @param size the number of bytes to read
 * @return a new array of length {@code size} holding the stream contents
 * @throws IOException if the stream cannot be opened or read
 */
private static byte[] readEntryBytes(DirectoryNode directory, String name, int size)
        throws IOException {
    byte[] contents = new byte[size];
    InputStream entryIn = directory.createDocumentInputStream(name);
    try {
        int filled = 0;
        while (filled < contents.length) {
            int count = entryIn.read(contents, filled, contents.length - filled);
            if (count < 0) {
                break;
            }
            filled += count;
        }
    } finally {
        // Always release the stream, even when reading fails.
        entryIn.close();
    }
    return contents;
}
// will throw IOException if not actually POIFS // can return null byte[] private byte[] handleEmbeddedPOIFS( InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException { NPOIFSFileSystem fs = null; byte[] ret = null; try { fs = new NPOIFSFileSystem(is); DirectoryNode root = fs.getRoot(); if (root == null) { return ret; } if (root.hasEntry("Package")) { Entry ooxml = root.getEntry("Package"); TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml)); ByteArrayOutputStream out = new ByteArrayOutputStream(); IOUtils.copy(stream, out); ret = out.toByteArray(); } else { // try poifs POIFSDocumentType type = POIFSDocumentType.detectType(root); if (type == POIFSDocumentType.OLE10_NATIVE) { try { // Try to un-wrap the OLE10Native record: Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root); ret = ole.getDataBuffer(); } catch (Ole10NativeException ex) { // Not a valid OLE10Native record, skip it } } else if (type == POIFSDocumentType.COMP_OBJ) { DocumentEntry contentsEntry; try { contentsEntry = (DocumentEntry) root.getEntry("CONTENTS"); } catch (FileNotFoundException ioe) { contentsEntry = (DocumentEntry) root.getEntry("Contents"); } DocumentInputStream inp = null; try { inp = new DocumentInputStream(contentsEntry); ret = new byte[contentsEntry.getSize()]; inp.readFully(ret); } finally { if (inp != null) { inp.close(); } } } else { ByteArrayOutputStream out = new ByteArrayOutputStream(); is.reset(); IOUtils.copy(is, out); ret = out.toByteArray(); metadata.set( Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension()); metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); } } } finally { if (fs != null) { fs.close(); } } return ret; }
public String extractText(InputStream in) throws IOException { ArrayList<WordTextPiece> text = new ArrayList<WordTextPiece>(); POIFSFileSystem fsys = new POIFSFileSystem(in); DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header); din.close(); // Prende le informazioni dall'header del documento int info = LittleEndian.getShort(header, 0xa); boolean useTable1 = (info & 0x200) != 0; // boolean useTable1 = true; // Prende informazioni dalla piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // int complexOffset = LittleEndian.getInt(header); String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); din = null; fsys = null; table = null; headerProps = null; int multiple = findText(tableStream, complexOffset, text); StringBuffer sb = new StringBuffer(); tableStream = null; for (int x = 0; x < text.size(); x++) { WordTextPiece nextPiece = (WordTextPiece) text.get(x); int start = nextPiece.getStart(); int length = nextPiece.getLength(); boolean unicode = nextPiece.usesUnicode(); String toStr = null; if (unicode) { toStr = new String(header, start, length * multiple, "UTF-8"); } else { toStr = new String(header, start, length, "big5"); } sb.append(toStr).append(" "); } return sb.toString(); }