Beispiel #1
0
 /**
  * Compares two {@link DocumentEntry} instances of a POI file system. Documents that are not
  * property set streams must be bitwise identical. Property set streams must be logically equal.
  *
  * <p>Both document streams opened by this method are closed before it returns, whether it
  * returns normally or by throwing.
  *
  * @param d1 The first document.
  * @param d2 The second document.
  * @param msg The method may append human-readable comparison messages to this string buffer.
  * @return <code>true</code> if the documents are equal, else <code>false</code>.
  * @exception MarkUnsupportedException if a POI document stream does not support the mark()
  *     operation.
  * @exception NoPropertySetStreamException if the application tries to create a property set from
  *     a POI document stream that is not a property set stream.
  * @throws UnsupportedEncodingException declared because property set creation may need a
  *     character encoding that is unavailable (NOTE(review): confirm against PropertySetFactory).
  * @exception IOException if any I/O exception occurs.
  */
 private static boolean equal(
     final DocumentEntry d1, final DocumentEntry d2, final StringBuffer msg)
     throws NoPropertySetStreamException, MarkUnsupportedException, UnsupportedEncodingException,
         IOException {
   final DocumentInputStream dis1 = new DocumentInputStream(d1);
   try {
     final DocumentInputStream dis2 = new DocumentInputStream(d2);
     try {
       if (PropertySet.isPropertySetStream(dis1) && PropertySet.isPropertySetStream(dis2)) {
         // Both documents are property sets: compare them logically.
         final PropertySet ps1 = PropertySetFactory.create(dis1);
         final PropertySet ps2 = PropertySetFactory.create(dis2);
         if (!ps1.equals(ps2)) {
           msg.append("Property sets are not equal.\n");
           return false;
         }
         return true;
       }

       // At least one document is not a property set: compare byte by byte until
       // either a mismatch is found or both streams hit end-of-stream together.
       int i1;
       int i2;
       do {
         i1 = dis1.read();
         i2 = dis2.read();
         if (i1 != i2) {
           // Covers differing bytes as well as differing lengths (-1 vs. a byte).
           msg.append("Documents are not equal.\n");
           return false;
         }
       } while (i1 != -1);
       return true;
     } finally {
       dis2.close();
     }
   } finally {
     dis1.close();
   }
 }
Beispiel #2
0
  /**
   * Prepares a fresh in-memory POI filesystem for a test and makes sure that both the
   * {@code DocumentSummaryInformation} ({@code dsi}) and the {@code SummaryInformation}
   * ({@code si}) fields hold usable values.
   *
   * <p>For each of the two property sets the method first tries to read the stream from the root
   * directory. If the stream does not exist yet, a new empty property set is created through
   * {@link PropertySetFactory}. Any other exception fails the test.
   */
  public void setUp() {
    bout = new ByteArrayOutputStream();
    poifs = new POIFSFileSystem();
    dir = poifs.getRoot();
    dsi = null;
    try {
      DocumentEntry dsiEntry =
          (DocumentEntry) dir.getEntry(DocumentSummaryInformation.DEFAULT_STREAM_NAME);
      DocumentInputStream dis = new DocumentInputStream(dsiEntry);
      PropertySet ps = new PropertySet(dis);
      dis.close();
      dsi = new DocumentSummaryInformation(ps);

    } catch (FileNotFoundException ex) {
      /* There is no document summary information yet. We have to create a
       * new one. */
      dsi = PropertySetFactory.newDocumentSummaryInformation();
      assertNotNull(dsi);
    } catch (IOException e) {
      e.printStackTrace();
      fail();
    } catch (NoPropertySetStreamException e) {
      e.printStackTrace();
      fail();
    } catch (MarkUnsupportedException e) {
      e.printStackTrace();
      fail();
    } catch (UnexpectedPropertySetTypeException e) {
      e.printStackTrace();
      fail();
    }
    assertNotNull(dsi);
    try {
      DocumentEntry siEntry = (DocumentEntry) dir.getEntry(SummaryInformation.DEFAULT_STREAM_NAME);
      DocumentInputStream dis = new DocumentInputStream(siEntry);
      PropertySet ps = new PropertySet(dis);
      dis.close();
      si = new SummaryInformation(ps);

    } catch (FileNotFoundException ex) {
      /* There is no summary information yet. We have to create a
       * new one. */
      si = PropertySetFactory.newSummaryInformation();
      assertNotNull(si);
    } catch (IOException e) {
      e.printStackTrace();
      fail();
    } catch (NoPropertySetStreamException e) {
      e.printStackTrace();
      fail();
    } catch (MarkUnsupportedException e) {
      e.printStackTrace();
      fail();
    } catch (UnexpectedPropertySetTypeException e) {
      e.printStackTrace();
      fail();
    }
    // Verify the summary information; dsi was already checked after the first block.
    assertNotNull(si);
  }
  /**
   * Reads up to {@code len} bytes from the underlying input.
   *
   * <p>The size of the returned array is the smaller of {@code len} and the number of bytes the
   * input reports as available. Reading is repeated until the array is full, because a single
   * {@code read} call is allowed to deliver fewer bytes than requested.
   *
   * @param len the maximum number of bytes to read
   * @return a new array holding the bytes read; if the input ends early the tail of the array is
   *     left zero-filled
   * @throws IOException if reading from the input fails
   */
  public byte[] getData(int len) throws IOException {
    // determine the amount that is possible to read (if len is too much)
    final int actualAmount = Math.min(input.available(), len);

    final byte[] b = new byte[actualAmount];
    int filled = 0;
    while (filled < b.length) {
      final int n = input.read(b, filled, b.length - filled);
      if (n <= 0) {
        // End of stream (or no progress): stop instead of spinning.
        break;
      }
      filled += n;
    }
    return b;
  }
Beispiel #4
0
 /**
  * Copies the bytes from a {@link DocumentInputStream} to a new stream in a POI filesystem.
  *
  * <p>The source stream is read completely into memory and is always closed, even if resolving
  * the target directory or reading fails. The new document is created only after the source has
  * been read and closed.
  *
  * @param poiFs The POI filesystem to write to.
  * @param path The source document's path.
  * @param name The source document's name.
  * @param stream The stream containing the source document.
  * @throws IOException if resolving the target directory, reading the source or creating the
  *     document fails.
  */
 public void copy(
     final POIFSFileSystem poiFs,
     final POIFSDocumentPath path,
     final String name,
     final DocumentInputStream stream)
     throws IOException {
   final DirectoryEntry de;
   final ByteArrayOutputStream out = new ByteArrayOutputStream();
   try {
     de = getPath(poiFs, path);
     // Copy in chunks rather than one byte per call.
     final byte[] buffer = new byte[4096];
     int n;
     while ((n = stream.read(buffer)) != -1) {
       out.write(buffer, 0, n);
     }
   } finally {
     stream.close();
   }
   final InputStream in = new ByteArrayInputStream(out.toByteArray());
   de.createDocument(name, in);
 }
Beispiel #5
0
 /**
  * Creates a {@link DocumentDescriptor}.
  *
  * <p>If the stream supports mark/reset, up to {@code nrOfBytes} bytes from its beginning are
  * captured into {@code bytes} and the stream is reset afterwards. If the stream reports
  * end-of-stream right away, {@code bytes} becomes an empty array. An {@link IOException} is
  * printed to standard output and otherwise ignored.
  *
  * @param name The stream's name.
  * @param path The stream's path in the POI filesystem hierarchy.
  * @param stream The stream.
  * @param nrOfBytes The maximum number of bytes to display in a dump starting at the beginning of
  *     the stream.
  */
 public DocumentDescriptor(
     final String name,
     final POIFSDocumentPath path,
     final DocumentInputStream stream,
     final int nrOfBytes) {
   this.name = name;
   this.path = path;
   this.stream = stream;
   try {
     size = stream.available();
     if (stream.markSupported()) {
       stream.mark(nrOfBytes);
       final byte[] b = new byte[nrOfBytes];
       final int read = stream.read(b, 0, Math.min(size, b.length));
       // read() signals end-of-stream with -1; never use that as an array length.
       final int count = Math.max(read, 0);
       bytes = new byte[count];
       System.arraycopy(b, 0, bytes, 0, count);
       stream.reset();
     }
   } catch (IOException ex) {
     System.out.println(ex);
   }
 }
 /**
  * Closes the wrapped input and then drops the reference to it.
  *
  * <p>If closing throws, the reference is left in place.
  *
  * @throws IOException if closing the wrapped input fails
  */
 public void close() throws IOException {
   this.input.close();
   this.input = null;
 }
 /**
  * Moves the read position of the wrapped input to the given offset.
  *
  * <p>The input is reset and then advanced by {@code i} bytes. Skipping is repeated because a
  * single {@code skip} call may skip fewer bytes than requested; it stops early if the input
  * makes no further progress (for example at end of stream). A non-positive {@code i} leaves the
  * input at the reset position.
  *
  * @param i the offset to move to, counted from the position restored by {@code reset()}
  * @throws IOException if resetting or skipping fails
  */
 public void setPosition(long i) throws IOException {
   input.reset();
   // Pass the full long offset; narrowing to int would corrupt large positions.
   long remaining = i;
   while (remaining > 0) {
     final long skipped = input.skip(remaining);
     if (skipped <= 0) {
       break;
     }
     remaining -= skipped;
   }
 }
 /**
  * Wraps the given document stream as a data source.
  *
  * @param input the stream to read from; it is marked at its current position during
  *     construction
  */
 public DocumentDataSource(DocumentInputStream input) {
   // Remember where the stream currently stands (the start of the file).
   input.mark(0);
   this.input = input;
 }
Beispiel #9
0
  /**
   * Writes both property sets and the filesystem into the byte buffer, then reads the buffer back
   * as a new POI filesystem and reloads {@code dsi} and {@code si} from it.
   *
   * <p>Tests call this after they have finished modifying the property sets, so that the
   * subsequent assertions run against what was actually serialized. Any unexpected exception
   * fails the test. A missing summary information stream is replaced by a newly created one,
   * whereas a missing document summary information stream fails the test.
   */
  public void closeAndReOpen() {

    // Step 1: store both property sets in the current root directory.
    try {
      dsi.write(dir, DocumentSummaryInformation.DEFAULT_STREAM_NAME);
      si.write(dir, SummaryInformation.DEFAULT_STREAM_NAME);
    } catch (WritingNotSupportedException writeEx) {
      writeEx.printStackTrace();
      fail();
    } catch (IOException ioEx) {
      ioEx.printStackTrace();
      fail();
    }

    // Forget the in-memory instances; they are re-read below.
    si = null;
    dsi = null;

    // Step 2: serialize the whole filesystem into the byte buffer.
    try {
      poifs.writeFilesystem(bout);
      bout.flush();
    } catch (IOException ioEx) {
      ioEx.printStackTrace();
      fail();
    }

    // Step 3: parse the serialized bytes as a new filesystem.
    final InputStream serialized = new ByteArrayInputStream(bout.toByteArray());
    assertNotNull(serialized);
    POIFSFileSystem reopenedFs = null;
    try {
      reopenedFs = new POIFSFileSystem(serialized);
    } catch (IOException ioEx) {
      ioEx.printStackTrace();
      fail();
    }
    try {
      serialized.close();
    } catch (IOException ioEx) {
      ioEx.printStackTrace();
      fail();
    }
    assertNotNull(reopenedFs);
    final DirectoryEntry reopenedRoot = reopenedFs.getRoot();

    // Step 4: read the document summary information; it must exist.
    try {
      final DocumentEntry dsiEntry =
          (DocumentEntry) reopenedRoot.getEntry(DocumentSummaryInformation.DEFAULT_STREAM_NAME);
      final DocumentInputStream dsiStream = new DocumentInputStream(dsiEntry);
      final PropertySet dsiProps = new PropertySet(dsiStream);
      dsiStream.close();
      dsi = new DocumentSummaryInformation(dsiProps);
    } catch (FileNotFoundException notFound) {
      fail();
    } catch (IOException ioEx) {
      ioEx.printStackTrace();
      fail();
    } catch (NoPropertySetStreamException noPsEx) {
      noPsEx.printStackTrace();
      fail();
    } catch (MarkUnsupportedException markEx) {
      markEx.printStackTrace();
      fail();
    } catch (UnexpectedPropertySetTypeException typeEx) {
      typeEx.printStackTrace();
      fail();
    }

    // Step 5: read the summary information, creating a new one if the stream is absent.
    try {
      final DocumentEntry siEntry =
          (DocumentEntry) reopenedRoot.getEntry(SummaryInformation.DEFAULT_STREAM_NAME);
      final DocumentInputStream siStream = new DocumentInputStream(siEntry);
      final PropertySet siProps = new PropertySet(siStream);
      siStream.close();
      si = new SummaryInformation(siProps);
    } catch (FileNotFoundException notFound) {
      /* There is no summary information yet. We have to create a
       * new one. */
      si = PropertySetFactory.newSummaryInformation();
      assertNotNull(si);
    } catch (IOException ioEx) {
      ioEx.printStackTrace();
      fail();
    } catch (NoPropertySetStreamException noPsEx) {
      noPsEx.printStackTrace();
      fail();
    } catch (MarkUnsupportedException markEx) {
      markEx.printStackTrace();
      fail();
    } catch (UnexpectedPropertySetTypeException typeEx) {
      typeEx.printStackTrace();
      fail();
    }
  }
  /**
   * Extracts the payload bytes of an embedded OLE2 (POIFS) object.
   *
   * <p>The handling depends on what the root directory contains:
   *
   * <ul>
   *   <li>a "Package" entry: the bytes of that entry are returned;
   *   <li>an OLE10Native record: its data buffer is returned, or {@code null} if the record turns
   *       out to be invalid;
   *   <li>a CompObj container: the bytes of its "CONTENTS" (or, failing that, "Contents") entry
   *       are returned;
   *   <li>anything else: the input stream is reset and returned in full, and a generated
   *       resource name plus the detected content type are stored in {@code metadata}.
   * </ul>
   *
   * @param is the stream holding the embedded object; NOTE(review): the fallback branch calls
   *     {@code is.reset()}, so the caller presumably has to pass a stream that supports reset —
   *     confirm against callers
   * @param metadata receives the resource name and content type in the fallback branch
   * @param unknownFilenameCount counter used to number generated file names; incremented in the
   *     fallback branch
   * @return the extracted bytes; may be {@code null}
   * @throws IOException if the stream is not actually POIFS or another I/O error occurs
   */
  private byte[] handleEmbeddedPOIFS(
      InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException {

    NPOIFSFileSystem fs = null;
    byte[] ret = null;
    try {

      fs = new NPOIFSFileSystem(is);

      DirectoryNode root = fs.getRoot();

      if (root == null) {
        return ret;
      }

      if (root.hasEntry("Package")) {
        Entry ooxml = root.getEntry("Package");
        TikaInputStream stream =
            TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml));
        try {
          ByteArrayOutputStream out = new ByteArrayOutputStream();
          IOUtils.copy(stream, out);
          ret = out.toByteArray();
        } finally {
          // Release the wrapper (and the document stream behind it) even if the copy fails.
          stream.close();
        }
      } else {
        // try poifs
        POIFSDocumentType type = POIFSDocumentType.detectType(root);
        if (type == POIFSDocumentType.OLE10_NATIVE) {
          try {
            // Try to un-wrap the OLE10Native record:
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
            ret = ole.getDataBuffer();
          } catch (Ole10NativeException ex) {
            // Not a valid OLE10Native record, skip it
          }
        } else if (type == POIFSDocumentType.COMP_OBJ) {

          DocumentEntry contentsEntry;
          try {
            contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
          } catch (FileNotFoundException ioe) {
            // Fall back to the alternative spelling of the entry name.
            contentsEntry = (DocumentEntry) root.getEntry("Contents");
          }

          DocumentInputStream inp = null;
          try {
            inp = new DocumentInputStream(contentsEntry);
            ret = new byte[contentsEntry.getSize()];
            inp.readFully(ret);
          } finally {
            if (inp != null) {
              inp.close();
            }
          }
        } else {
          // Unknown container: hand back the raw bytes and describe them in the metadata.
          ByteArrayOutputStream out = new ByteArrayOutputStream();
          is.reset();
          IOUtils.copy(is, out);
          ret = out.toByteArray();
          metadata.set(
              Metadata.RESOURCE_NAME_KEY,
              "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
          metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
        }
      }
    } finally {
      if (fs != null) {
        fs.close();
      }
    }
    return ret;
  }
Beispiel #11
0
  /**
   * Extracts the text of a Word document read from the given stream.
   *
   * <p>The "WordDocument" stream is loaded completely, a flag in its header selects which table
   * stream ("1Table" or "0Table") to load, and {@code findText} fills the list of text pieces
   * from that table. Each piece is then decoded from the "WordDocument" bytes and appended to
   * the result, followed by a single space.
   *
   * @param in stream containing the OLE2 document
   * @return the concatenated text pieces, each followed by a space
   * @throws IOException if the document cannot be read
   */
  public String extractText(InputStream in) throws IOException {
    ArrayList<WordTextPiece> pieces = new ArrayList<WordTextPiece>();
    POIFSFileSystem fileSystem = new POIFSFileSystem(in);

    // Load the complete main document stream.
    DocumentEntry mainEntry = (DocumentEntry) fileSystem.getRoot().getEntry("WordDocument");
    DocumentInputStream mainStream = fileSystem.createDocumentInputStream("WordDocument");
    byte[] header = new byte[mainEntry.getSize()];
    mainStream.read(header);
    mainStream.close();

    // Take the flags from the document header; bit 0x200 selects the table stream.
    int flags = LittleEndian.getShort(header, 0xa);
    boolean useTable1 = (flags & 0x200) != 0;

    // Take the piece table information from the header.
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    String tableName = useTable1 ? "1Table" : "0Table";

    // Load the complete table stream.
    DocumentEntry tableEntry = (DocumentEntry) fileSystem.getRoot().getEntry(tableName);
    byte[] tableBytes = new byte[tableEntry.getSize()];
    DocumentInputStream tableInput = fileSystem.createDocumentInputStream(tableName);
    tableInput.read(tableBytes);
    tableInput.close();

    int multiple = findText(tableBytes, complexOffset, pieces);

    StringBuffer result = new StringBuffer();
    for (WordTextPiece piece : pieces) {
      int start = piece.getStart();
      int length = piece.getLength();

      String decoded;
      if (piece.usesUnicode()) {
        decoded = new String(header, start, length * multiple, "UTF-8");
      } else {
        decoded = new String(header, start, length, "big5");
      }
      result.append(decoded).append(" ");
    }
    return result.toString();
  }