/**
 * Compares two {@link DocumentEntry} instances of a POI file system. Documents that are not
 * property set streams must be bitwise identical. Property set streams must be logically equal.
 *
 * <p>The two input streams this method opens on the entries are closed before it returns,
 * whether it returns normally or by throwing.
 *
 * @param d1 The first document.
 * @param d2 The second document.
 * @param msg The method may append human-readable comparison messages to this string buffer.
 * @return <code>true</code> if the documents are equal, else <code>false</code>.
 * @exception MarkUnsupportedException if a POI document stream does not support the mark()
 *     operation.
 * @exception NoPropertySetStreamException if the application tries to create a property set from
 *     a POI document stream that is not a property set stream.
 * @throws UnsupportedEncodingException if raised by the property set handling this method
 *     delegates to (presumably for an unsupported character encoding).
 * @exception IOException if any I/O exception occurs.
 */
private static boolean equal(
    final DocumentEntry d1, final DocumentEntry d2, final StringBuffer msg)
    throws NoPropertySetStreamException,
        MarkUnsupportedException,
        UnsupportedEncodingException,
        IOException {
  final DocumentInputStream dis1 = new DocumentInputStream(d1);
  try {
    final DocumentInputStream dis2 = new DocumentInputStream(d2);
    try {
      if (PropertySet.isPropertySetStream(dis1) && PropertySet.isPropertySetStream(dis2)) {
        // Both are property set streams: compare them logically.
        final PropertySet ps1 = PropertySetFactory.create(dis1);
        final PropertySet ps2 = PropertySetFactory.create(dis2);
        if (!ps1.equals(ps2)) {
          msg.append("Property sets are not equal.\n");
          return false;
        }
        return true;
      }

      // Otherwise compare byte by byte until both streams hit end-of-stream together. A length
      // mismatch shows up as -1 on one side only and is reported as a difference.
      int i1;
      int i2;
      do {
        i1 = dis1.read();
        i2 = dis2.read();
        if (i1 != i2) {
          msg.append("Documents are not equal.\n");
          return false;
        }
      } while (i1 != -1);
      return true;
    } finally {
      dis2.close();
    }
  } finally {
    dis1.close();
  }
}
/**
 * Prepares the fixture: creates a fresh output buffer and an empty POI file system, then makes
 * sure both the {@link DocumentSummaryInformation} and the {@link SummaryInformation} fields are
 * populated. Each one is read from the file system's root directory when its default stream
 * exists; otherwise a new, empty instance is created through {@link PropertySetFactory}.
 *
 * <p>Any other error while reading a stream fails the test.
 */
public void setUp() {
  bout = new ByteArrayOutputStream();
  poifs = new POIFSFileSystem();
  dir = poifs.getRoot();
  dsi = null;
  try {
    DocumentEntry dsiEntry =
        (DocumentEntry) dir.getEntry(DocumentSummaryInformation.DEFAULT_STREAM_NAME);
    DocumentInputStream dis = new DocumentInputStream(dsiEntry);
    PropertySet ps = new PropertySet(dis);
    dis.close();
    dsi = new DocumentSummaryInformation(ps);
  } catch (FileNotFoundException ex) {
    /* There is no document summary information yet. We have to create a
     * new one. */
    dsi = PropertySetFactory.newDocumentSummaryInformation();
    assertNotNull(dsi);
  } catch (IOException e) {
    e.printStackTrace();
    fail();
  } catch (NoPropertySetStreamException e) {
    e.printStackTrace();
    fail();
  } catch (MarkUnsupportedException e) {
    e.printStackTrace();
    fail();
  } catch (UnexpectedPropertySetTypeException e) {
    e.printStackTrace();
    fail();
  }
  assertNotNull(dsi);
  try {
    DocumentEntry siEntry = (DocumentEntry) dir.getEntry(SummaryInformation.DEFAULT_STREAM_NAME);
    DocumentInputStream dis = new DocumentInputStream(siEntry);
    PropertySet ps = new PropertySet(dis);
    dis.close();
    si = new SummaryInformation(ps);
  } catch (FileNotFoundException ex) {
    /* There is no summary information yet. We have to create a
     * new one. */
    si = PropertySetFactory.newSummaryInformation();
    assertNotNull(si);
  } catch (IOException e) {
    e.printStackTrace();
    fail();
  } catch (NoPropertySetStreamException e) {
    e.printStackTrace();
    fail();
  } catch (MarkUnsupportedException e) {
    e.printStackTrace();
    fail();
  } catch (UnexpectedPropertySetTypeException e) {
    e.printStackTrace();
    fail();
  }
  // Check the summary information that was just set up (the original re-checked dsi here).
  assertNotNull(si);
}
public byte[] getData(int len) throws IOException { // determine the amount that is possible to read (if len is too much) int actualAmount = Math.min(input.available(), len); byte[] b = new byte[actualAmount]; int i = input.read(b); return b; }
/**
 * Copies the bytes from a {@link DocumentInputStream} to a new stream in a POI filesystem.
 *
 * <p>The source stream is always closed by this method, including when resolving the target
 * directory or reading the stream fails.
 *
 * @param poiFs The POI filesystem to write to.
 * @param path The source document's path.
 * @param name The source document's name.
 * @param stream The stream containing the source document.
 * @throws IOException if reading the source stream or creating the target document fails.
 */
public void copy(
    final POIFSFileSystem poiFs,
    final POIFSDocumentPath path,
    final String name,
    final DocumentInputStream stream)
    throws IOException {
  final DirectoryEntry de;
  final ByteArrayOutputStream out = new ByteArrayOutputStream();
  try {
    de = getPath(poiFs, path);
    // Buffer the whole document in memory, copying in chunks rather than byte by byte.
    final byte[] buffer = new byte[4096];
    int n;
    while ((n = stream.read(buffer)) != -1) {
      out.write(buffer, 0, n);
    }
  } finally {
    stream.close();
  }
  final InputStream in = new ByteArrayInputStream(out.toByteArray());
  de.createDocument(name, in);
}
/** * Creates a {@link DocumentDescriptor}. * * @param name The stream's name. * @param path The stream's path in the POI filesystem hierarchy. * @param stream The stream. * @param nrOfBytes The maximum number of bytes to display in a dump starting at the beginning of * the stream. */ public DocumentDescriptor( final String name, final POIFSDocumentPath path, final DocumentInputStream stream, final int nrOfBytes) { this.name = name; this.path = path; this.stream = stream; try { size = stream.available(); if (stream.markSupported()) { stream.mark(nrOfBytes); final byte[] b = new byte[nrOfBytes]; final int read = stream.read(b, 0, Math.min(size, b.length)); bytes = new byte[read]; System.arraycopy(b, 0, bytes, 0, read); stream.reset(); } } catch (IOException ex) { System.out.println(ex); } }
/**
 * Closes the underlying stream and drops the reference to it.
 *
 * <p>Calling this method again after the stream has been released is a no-op. The reference is
 * cleared even when closing the stream throws.
 *
 * @throws IOException if closing the underlying stream fails
 */
public void close() throws IOException {
  if (input == null) {
    // Already closed.
    return;
  }
  try {
    input.close();
  } finally {
    input = null;
  }
}
/**
 * Moves the read position to {@code i} bytes past the marked position of the underlying stream.
 *
 * <p>The stream is reset to its mark and then advanced. If the stream ends before {@code i}
 * bytes could be skipped, the position is left at the end of the stream. A non-positive
 * {@code i} leaves the position at the mark.
 *
 * @param i the offset, in bytes, from the marked position
 * @throws IOException if resetting or skipping fails
 */
public void setPosition(long i) throws IOException {
  input.reset();
  // skip() may advance by fewer bytes than asked, so loop until the offset is reached or the
  // stream stops advancing. The offset is kept as a long rather than narrowed to int.
  long remaining = i;
  while (remaining > 0) {
    long skipped = input.skip(remaining);
    if (skipped <= 0) {
      break;
    }
    remaining -= skipped;
  }
}
/**
 * Creates a data source backed by the given document stream.
 *
 * <p>The stream's position at construction time is marked; the original comment describes this
 * as the start of the file.
 *
 * @param input the document stream to read from
 */
public DocumentDataSource(DocumentInputStream input) {
  // Remember where the stream currently stands before keeping a reference to it.
  input.mark(0);
  this.input = input;
}
/**
 * Writes both property sets into the fixture's file system, serialises that file system into the
 * byte buffer and parses the buffer again as a new POI file system. The document summary
 * information and the summary information are then re-read from the re-parsed file system so
 * that tests can compare what was written with what comes back.
 *
 * <p>A missing document summary information stream fails the test; a missing summary
 * information stream is replaced by a newly created, empty one. Any other error fails the test.
 */
public void closeAndReOpen() {
  // Step 1: store the current property sets in the fixture's root directory.
  try {
    dsi.write(dir, DocumentSummaryInformation.DEFAULT_STREAM_NAME);
    si.write(dir, SummaryInformation.DEFAULT_STREAM_NAME);
  } catch (WritingNotSupportedException e) {
    e.printStackTrace();
    fail();
  } catch (IOException e) {
    e.printStackTrace();
    fail();
  }
  si = null;
  dsi = null;

  // Step 2: serialise the fixture's file system into the byte buffer.
  try {
    poifs.writeFilesystem(bout);
    bout.flush();
  } catch (IOException e) {
    e.printStackTrace();
    fail();
  }

  // Step 3: parse the serialised bytes into a separate, method-local file system. The fixture's
  // own file system and directory fields are left untouched.
  InputStream serialized = new ByteArrayInputStream(bout.toByteArray());
  assertNotNull(serialized);
  POIFSFileSystem reopenedFs = null;
  try {
    reopenedFs = new POIFSFileSystem(serialized);
  } catch (IOException e) {
    e.printStackTrace();
    fail();
  }
  try {
    serialized.close();
  } catch (IOException e) {
    e.printStackTrace();
    fail();
  }
  assertNotNull(reopenedFs);

  /* Read the document summary information. */
  DirectoryEntry reopenedRoot = reopenedFs.getRoot();
  try {
    DocumentInputStream dsiStream =
        new DocumentInputStream(
            (DocumentEntry) reopenedRoot.getEntry(DocumentSummaryInformation.DEFAULT_STREAM_NAME));
    PropertySet dsiProps = new PropertySet(dsiStream);
    dsiStream.close();
    dsi = new DocumentSummaryInformation(dsiProps);
  } catch (FileNotFoundException ex) {
    // The stream written in step 1 must be present after the round trip.
    fail();
  } catch (IOException e) {
    e.printStackTrace();
    fail();
  } catch (NoPropertySetStreamException e) {
    e.printStackTrace();
    fail();
  } catch (MarkUnsupportedException e) {
    e.printStackTrace();
    fail();
  } catch (UnexpectedPropertySetTypeException e) {
    e.printStackTrace();
    fail();
  }

  /* Read the summary information. */
  try {
    DocumentInputStream siStream =
        new DocumentInputStream(
            (DocumentEntry) reopenedRoot.getEntry(SummaryInformation.DEFAULT_STREAM_NAME));
    PropertySet siProps = new PropertySet(siStream);
    siStream.close();
    si = new SummaryInformation(siProps);
  } catch (FileNotFoundException ex) {
    /* No summary information stream was found in the re-parsed file system,
     * so fall back to a newly created one. */
    si = PropertySetFactory.newSummaryInformation();
    assertNotNull(si);
  } catch (IOException e) {
    e.printStackTrace();
    fail();
  } catch (NoPropertySetStreamException e) {
    e.printStackTrace();
    fail();
  } catch (MarkUnsupportedException e) {
    e.printStackTrace();
    fail();
  } catch (UnexpectedPropertySetTypeException e) {
    e.printStackTrace();
    fail();
  }
}
// will throw IOException if not actually POIFS // can return null byte[] private byte[] handleEmbeddedPOIFS( InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException { NPOIFSFileSystem fs = null; byte[] ret = null; try { fs = new NPOIFSFileSystem(is); DirectoryNode root = fs.getRoot(); if (root == null) { return ret; } if (root.hasEntry("Package")) { Entry ooxml = root.getEntry("Package"); TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml)); ByteArrayOutputStream out = new ByteArrayOutputStream(); IOUtils.copy(stream, out); ret = out.toByteArray(); } else { // try poifs POIFSDocumentType type = POIFSDocumentType.detectType(root); if (type == POIFSDocumentType.OLE10_NATIVE) { try { // Try to un-wrap the OLE10Native record: Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root); ret = ole.getDataBuffer(); } catch (Ole10NativeException ex) { // Not a valid OLE10Native record, skip it } } else if (type == POIFSDocumentType.COMP_OBJ) { DocumentEntry contentsEntry; try { contentsEntry = (DocumentEntry) root.getEntry("CONTENTS"); } catch (FileNotFoundException ioe) { contentsEntry = (DocumentEntry) root.getEntry("Contents"); } DocumentInputStream inp = null; try { inp = new DocumentInputStream(contentsEntry); ret = new byte[contentsEntry.getSize()]; inp.readFully(ret); } finally { if (inp != null) { inp.close(); } } } else { ByteArrayOutputStream out = new ByteArrayOutputStream(); is.reset(); IOUtils.copy(is, out); ret = out.toByteArray(); metadata.set( Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension()); metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); } } } finally { if (fs != null) { fs.close(); } } return ret; }
public String extractText(InputStream in) throws IOException { ArrayList<WordTextPiece> text = new ArrayList<WordTextPiece>(); POIFSFileSystem fsys = new POIFSFileSystem(in); DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument"); DocumentInputStream din = fsys.createDocumentInputStream("WordDocument"); byte[] header = new byte[headerProps.getSize()]; din.read(header); din.close(); // Prende le informazioni dall'header del documento int info = LittleEndian.getShort(header, 0xa); boolean useTable1 = (info & 0x200) != 0; // boolean useTable1 = true; // Prende informazioni dalla piece table int complexOffset = LittleEndian.getInt(header, 0x1a2); // int complexOffset = LittleEndian.getInt(header); String tableName = null; if (useTable1) { tableName = "1Table"; } else { tableName = "0Table"; } DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName); byte[] tableStream = new byte[table.getSize()]; din = fsys.createDocumentInputStream(tableName); din.read(tableStream); din.close(); din = null; fsys = null; table = null; headerProps = null; int multiple = findText(tableStream, complexOffset, text); StringBuffer sb = new StringBuffer(); tableStream = null; for (int x = 0; x < text.size(); x++) { WordTextPiece nextPiece = (WordTextPiece) text.get(x); int start = nextPiece.getStart(); int length = nextPiece.getLength(); boolean unicode = nextPiece.usesUnicode(); String toStr = null; if (unicode) { toStr = new String(header, start, length * multiple, "UTF-8"); } else { toStr = new String(header, start, length, "big5"); } sb.append(toStr).append(" "); } return sb.toString(); }