/** Creates a Workbook from the given NPOIFSFileSystem, which may be password protected */ private static Workbook create(NPOIFSFileSystem fs, String password) throws IOException, InvalidFormatException { DirectoryNode root = fs.getRoot(); // Encrypted OOXML files go inside OLE2 containers, is this one? if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) { EncryptionInfo info = new EncryptionInfo(fs); Decryptor d = Decryptor.getInstance(info); boolean passwordCorrect = false; InputStream stream = null; try { if (password != null && d.verifyPassword(password)) { passwordCorrect = true; } if (!passwordCorrect && d.verifyPassword(Decryptor.DEFAULT_PASSWORD)) { passwordCorrect = true; } if (passwordCorrect) { stream = d.getDataStream(root); } } catch (GeneralSecurityException e) { throw new IOException(e); } if (!passwordCorrect) { if (password != null) throw new EncryptedDocumentException("Password incorrect"); else throw new EncryptedDocumentException( "The supplied spreadsheet is protected, but no password was supplied"); } OPCPackage pkg = OPCPackage.open(stream); return create(pkg); } // If we get here, it isn't an encrypted XLSX file // So, treat it as a regular HSSF XLS one if (password != null) { Biff8EncryptionKey.setCurrentUserPassword(password); } Workbook wb = new HSSFWorkbook(root, true); Biff8EncryptionKey.setCurrentUserPassword(null); return wb; }
/** * Processes a file into essentially record events. * * @param req an Instance of HSSFRequest which has your registered listeners * @param dir a DirectoryNode containing your workbook */ public void processWorkbookEvents(HSSFRequest req, DirectoryNode dir) throws IOException { // some old documents have "WORKBOOK" or "BOOK" final String name; Set<String> entryNames = dir.getEntryNames(); if (entryNames.contains("Workbook")) { name = "Workbook"; } else if (entryNames.contains("WORKBOOK")) { name = "WORKBOOK"; } else if (entryNames.contains("BOOK")) { name = "BOOK"; } else { name = "Workbook"; } InputStream in = dir.createDocumentInputStream(name); processEvents(req, in); }
/** * This constructor loads a Word document from a specific point in a POIFSFileSystem, probably not * the default. Used typically to open embeded documents. * * @param directory The DirectoryNode that contains the Word document. * @throws IOException If there is an unexpected IOException from the passed in POIFSFileSystem. */ public HWPFDocumentCore(DirectoryNode directory) throws IOException { // Sort out the hpsf properties super(directory); // read in the main stream. DocumentEntry documentProps = (DocumentEntry) directory.getEntry("WordDocument"); _mainStream = new byte[documentProps.getSize()]; directory.createDocumentInputStream(STREAM_WORD_DOCUMENT).read(_mainStream); // Create our FIB, and check for the doc being encrypted _fib = new FileInformationBlock(_mainStream); DirectoryEntry objectPoolEntry; try { objectPoolEntry = (DirectoryEntry) directory.getEntry(STREAM_OBJECT_POOL); } catch (FileNotFoundException exc) { objectPoolEntry = null; } _objectPool = new ObjectPoolImpl(objectPoolEntry); }
/** * Add a embedded object to this presentation * * @return 0-based index of the embedded object */ public int addEmbed(POIFSFileSystem poiData) { DirectoryNode root = poiData.getRoot(); // prepare embedded data if (new ClassID().equals(root.getStorageClsid())) { // need to set class id Map<String, ClassID> olemap = getOleMap(); ClassID classID = null; for (Map.Entry<String, ClassID> entry : olemap.entrySet()) { if (root.hasEntry(entry.getKey())) { classID = entry.getValue(); break; } } if (classID == null) { throw new IllegalArgumentException("Unsupported embedded document"); } root.setStorageClsid(classID); } ExEmbed exEmbed = new ExEmbed(); // remove unneccessary infos, so we don't need to specify the type // of the ole object multiple times Record children[] = exEmbed.getChildRecords(); exEmbed.removeChild(children[2]); exEmbed.removeChild(children[3]); exEmbed.removeChild(children[4]); ExEmbedAtom eeEmbed = exEmbed.getExEmbedAtom(); eeEmbed.setCantLockServerB(true); ExOleObjAtom eeAtom = exEmbed.getExOleObjAtom(); eeAtom.setDrawAspect(ExOleObjAtom.DRAW_ASPECT_VISIBLE); eeAtom.setType(ExOleObjAtom.TYPE_EMBEDDED); // eeAtom.setSubType(ExOleObjAtom.SUBTYPE_EXCEL); // should be ignored?!?, see MS-PPT ExOleObjAtom, but Libre Office sets it ... eeAtom.setOptions(1226240); ExOleObjStg exOleObjStg = new ExOleObjStg(); try { final String OLESTREAM_NAME = "\u0001Ole"; if (!root.hasEntry(OLESTREAM_NAME)) { // the following data was taken from an example libre office document // beside this "\u0001Ole" record there were several other records, e.g. 
CompObj, // OlePresXXX, but it seems, that they aren't neccessary byte oleBytes[] = {1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; poiData.createDocument(new ByteArrayInputStream(oleBytes), OLESTREAM_NAME); } ByteArrayOutputStream bos = new ByteArrayOutputStream(); poiData.writeFilesystem(bos); exOleObjStg.setData(bos.toByteArray()); } catch (IOException e) { throw new HSLFException(e); } int psrId = addPersistentObject(exOleObjStg); exOleObjStg.setPersistId(psrId); eeAtom.setObjStgDataRef(psrId); int objectId = addToObjListAtom(exEmbed); eeAtom.setObjID(objectId); return objectId; }
/**
 * Processes a file into essentially record events.
 *
 * <p>The workbook stream is looked up under the names "Workbook", "WORKBOOK" and "BOOK", in that
 * order, matching {@code processWorkbookEvents}; if none is present, "Workbook" is used so that
 * the lookup fails in the usual way. The stream opened here is closed before this method returns.
 *
 * @param req an Instance of HSSFRequest which has your registered listeners
 * @param dir a DirectoryNode containing your workbook
 * @return numeric user-specified result code.
 * @throws IOException if the workbook stream cannot be opened or read
 * @throws HSSFUserException as raised by {@code abortableProcessEvents}
 */
public short abortableProcessWorkbookEvents(HSSFRequest req, DirectoryNode dir)
    throws IOException, HSSFUserException {
  // some old documents have "WORKBOOK" or "BOOK"
  String name = "Workbook";
  if (!dir.hasEntry(name)) {
    if (dir.hasEntry("WORKBOOK")) {
      name = "WORKBOOK";
    } else if (dir.hasEntry("BOOK")) {
      name = "BOOK";
    }
    // otherwise keep the default name
  }

  InputStream in = dir.createDocumentInputStream(name);
  try {
    return abortableProcessEvents(req, in);
  } finally {
    // This method opened the stream, so it is responsible for closing it.
    in.close();
  }
}
// will throw IOException if not actually POIFS // can return null byte[] private byte[] handleEmbeddedPOIFS( InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException { NPOIFSFileSystem fs = null; byte[] ret = null; try { fs = new NPOIFSFileSystem(is); DirectoryNode root = fs.getRoot(); if (root == null) { return ret; } if (root.hasEntry("Package")) { Entry ooxml = root.getEntry("Package"); TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml)); ByteArrayOutputStream out = new ByteArrayOutputStream(); IOUtils.copy(stream, out); ret = out.toByteArray(); } else { // try poifs POIFSDocumentType type = POIFSDocumentType.detectType(root); if (type == POIFSDocumentType.OLE10_NATIVE) { try { // Try to un-wrap the OLE10Native record: Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root); ret = ole.getDataBuffer(); } catch (Ole10NativeException ex) { // Not a valid OLE10Native record, skip it } } else if (type == POIFSDocumentType.COMP_OBJ) { DocumentEntry contentsEntry; try { contentsEntry = (DocumentEntry) root.getEntry("CONTENTS"); } catch (FileNotFoundException ioe) { contentsEntry = (DocumentEntry) root.getEntry("Contents"); } DocumentInputStream inp = null; try { inp = new DocumentInputStream(contentsEntry); ret = new byte[contentsEntry.getSize()]; inp.readFully(ret); } finally { if (inp != null) { inp.close(); } } } else { ByteArrayOutputStream out = new ByteArrayOutputStream(); is.reset(); IOUtils.copy(is, out); ret = out.toByteArray(); metadata.set( Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension()); metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); } } } finally { if (fs != null) { fs.close(); } } return ret; }
/**
 * This constructor loads a Word document from a specific point in a POIFSFileSystem, probably not
 * the default. Used typically to open embedded documents.
 *
 * <p>After the superclass has loaded the main stream and the FIB, this reads the table stream
 * selected by the FIB and the optional data stream, then builds the document structures (text
 * pieces, character/paragraph tables, shapes, pictures, sections, styles, fonts, lists,
 * bookmarks, notes and fields) from them.
 *
 * @param directory The DirectoryNode that contains the Word document.
 * @throws IOException If there is an unexpected IOException from the passed in POIFSFileSystem,
 *     or a stream ends before the size reported by its entry has been read.
 * @throws OldWordFileFormatException if the FIB reports a version below 106
 * @throws IllegalStateException if the table stream named by the FIB is missing
 */
public HWPFDocument(DirectoryNode directory) throws IOException {
  // Load the main stream and FIB
  // Also handles HPSF bits
  super(directory);

  // Is this document too old for us?
  if (_fib.getFibBase().getNFib() < 106) {
    throw new OldWordFileFormatException(
        "The document is too old - Word 95 or older. Try HWPFOldDocument instead?");
  }

  // use the fib to determine the name of the table stream.
  String name = STREAM_TABLE_0;
  if (_fib.getFibBase().isFWhichTblStm()) {
    name = STREAM_TABLE_1;
  }

  // Grab the table stream.
  DocumentEntry tableProps;
  try {
    tableProps = (DocumentEntry) directory.getEntry(name);
  } catch (FileNotFoundException fnfe) {
    // keep the original exception as the cause for easier diagnosis
    throw new IllegalStateException(
        "Table Stream '"
            + name
            + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)",
        fnfe);
  }

  // read in the table stream.
  _tableStream = new byte[tableProps.getSize()];
  readDocumentStreamFully(directory, name, _tableStream);

  _fib.fillVariableFields(_mainStream, _tableStream);

  // read in the data stream; it is optional, so a missing entry means "no data".
  try {
    DocumentEntry dataProps = (DocumentEntry) directory.getEntry(STREAM_DATA);
    _dataStream = new byte[dataProps.getSize()];
    readDocumentStreamFully(directory, STREAM_DATA, _dataStream);
  } catch (FileNotFoundException e) {
    _dataStream = new byte[0];
  }

  // Get the cp of the start of text in the main stream
  // The latest spec doc says this is always zero!
  int fcMin = 0;

  // Start to load up our standard structures.
  _dop = new DocumentProperties(_tableStream, _fib.getFcDop(), _fib.getLcbDop());
  _cft = new ComplexFileTable(_mainStream, _tableStream, _fib.getFcClx(), fcMin);
  TextPieceTable tpt = _cft.getTextPieceTable();

  // Now load the rest of the properties, which need to be adjusted
  // for where text really begin
  _cbt =
      new CHPBinTable(
          _mainStream, _tableStream, _fib.getFcPlcfbteChpx(), _fib.getLcbPlcfbteChpx(), tpt);
  _pbt =
      new PAPBinTable(
          _mainStream,
          _tableStream,
          _dataStream,
          _fib.getFcPlcfbtePapx(),
          _fib.getLcbPlcfbtePapx(),
          tpt);

  _text = tpt.getText();

  /*
   * in this mode we preserving PAPX/CHPX structure from file, so text may
   * miss from output, and text order may be corrupted
   */
  boolean preserveBinTables = false;
  try {
    preserveBinTables = Boolean.parseBoolean(System.getProperty(PROPERTY_PRESERVE_BIN_TABLES));
  } catch (Exception ignored) {
    // reading the system property is best-effort; keep the default on failure
  }

  if (!preserveBinTables) {
    _cbt.rebuild(_cft);
    _pbt.rebuild(_text, _cft);
  }

  /*
   * Property to disable text rebuilding. In this mode changing the text
   * will lead to unpredictable behavior
   */
  boolean preserveTextTable = false;
  try {
    preserveTextTable = Boolean.parseBoolean(System.getProperty(PROPERTY_PRESERVE_TEXT_TABLE));
  } catch (Exception ignored) {
    // reading the system property is best-effort; keep the default on failure
  }

  if (!preserveTextTable) {
    // Replace the text piece table by one holding a single piece with the whole text
    _cft = new ComplexFileTable();
    tpt = _cft.getTextPieceTable();
    final TextPiece textPiece = new SinglentonTextPiece(_text);
    tpt.add(textPiece);
    _text = textPiece.getStringBuilder();
  }

  // Read FSPA and Escher information
  _fspaHeaders = new FSPATable(_tableStream, _fib, FSPADocumentPart.HEADER);
  _fspaMain = new FSPATable(_tableStream, _fib, FSPADocumentPart.MAIN);

  if (_fib.getFcDggInfo() != 0) {
    _escherRecordHolder =
        new EscherRecordHolder(_tableStream, _fib.getFcDggInfo(), _fib.getLcbDggInfo());
  } else {
    _escherRecordHolder = new EscherRecordHolder();
  }

  // read in the pictures stream
  _pictures = new PicturesTable(this, _dataStream, _mainStream, _fspaMain, _escherRecordHolder);

  // And the art shapes stream
  _officeArts = new ShapesTable(_tableStream, _fib);

  // And escher pictures
  _officeDrawingsHeaders =
      new OfficeDrawingsImpl(_fspaHeaders, _escherRecordHolder, _mainStream);
  _officeDrawingsMain = new OfficeDrawingsImpl(_fspaMain, _escherRecordHolder, _mainStream);

  _st =
      new SectionTable(
          _mainStream,
          _tableStream,
          _fib.getFcPlcfsed(),
          _fib.getLcbPlcfsed(),
          fcMin,
          tpt,
          _fib.getSubdocumentTextStreamLength(SubdocumentType.MAIN));
  _ss = new StyleSheet(_tableStream, _fib.getFcStshf());
  _ft = new FontTable(_tableStream, _fib.getFcSttbfffn(), _fib.getLcbSttbfffn());

  // The optional tables below are only created when the FIB gives them an offset and a length
  int listOffset = _fib.getFcPlcfLst();
  int lfoOffset = _fib.getFcPlfLfo();
  if (listOffset != 0 && _fib.getLcbPlcfLst() != 0) {
    _lt = new ListTables(_tableStream, listOffset, lfoOffset);
  }

  int sbtOffset = _fib.getFcSttbSavedBy();
  int sbtLength = _fib.getLcbSttbSavedBy();
  if (sbtOffset != 0 && sbtLength != 0) {
    _sbt = new SavedByTable(_tableStream, sbtOffset, sbtLength);
  }

  int rmarkOffset = _fib.getFcSttbfRMark();
  int rmarkLength = _fib.getLcbSttbfRMark();
  if (rmarkOffset != 0 && rmarkLength != 0) {
    _rmat = new RevisionMarkAuthorTable(_tableStream, rmarkOffset, rmarkLength);
  }

  _bookmarksTables = new BookmarksTables(_tableStream, _fib);
  _bookmarks = new BookmarksImpl(_bookmarksTables);

  _endnotesTables = new NotesTables(NoteType.ENDNOTE, _tableStream, _fib);
  _endnotes = new NotesImpl(_endnotesTables);
  _footnotesTables = new NotesTables(NoteType.FOOTNOTE, _tableStream, _fib);
  _footnotes = new NotesImpl(_footnotesTables);

  _fieldsTables = new FieldsTables(_tableStream, _fib);
  _fields = new FieldsImpl(_fieldsTables);
}

/**
 * Opens the named document of {@code directory}, reads exactly {@code target.length} bytes into
 * {@code target}, and closes the stream again.
 *
 * @throws IOException if the stream cannot be opened or ends before {@code target} is full
 */
private static void readDocumentStreamFully(
    DirectoryNode directory, String streamName, byte[] target) throws IOException {
  InputStream in = directory.createDocumentInputStream(streamName);
  try {
    // A single read() is allowed to return fewer bytes than requested.
    int filled = 0;
    while (filled < target.length) {
      int count = in.read(target, filled, target.length - filled);
      if (count < 0) {
        throw new IOException(
            "Unexpected end of stream '"
                + streamName
                + "': read "
                + filled
                + " of "
                + target.length
                + " bytes");
      }
      filled += count;
    }
  } finally {
    in.close();
  }
}