Exemple #1
0
  /**
   * Prepares the fixture: a fresh output buffer, a new POI filesystem with its root directory,
   * and non-null {@code dsi} (DocumentSummaryInformation) and {@code si} (SummaryInformation)
   * property sets.
   *
   * <p>For each property set the method first tries to read the corresponding default stream
   * from the root directory. If the stream does not exist ({@link FileNotFoundException}), a new
   * empty property set is created through {@code PropertySetFactory}. Any other exception is
   * printed and fails the test.
   */
  public void setUp() {
    bout = new ByteArrayOutputStream();
    poifs = new POIFSFileSystem();
    dir = poifs.getRoot();
    dsi = null;
    try {
      DocumentEntry dsiEntry =
          (DocumentEntry) dir.getEntry(DocumentSummaryInformation.DEFAULT_STREAM_NAME);
      DocumentInputStream dis = new DocumentInputStream(dsiEntry);
      PropertySet ps;
      try {
        ps = new PropertySet(dis);
      } finally {
        // Release the stream even when parsing the property set fails.
        dis.close();
      }
      dsi = new DocumentSummaryInformation(ps);

    } catch (FileNotFoundException ex) {
      /* There is no document summary information yet. We have to create a
       * new one. */
      dsi = PropertySetFactory.newDocumentSummaryInformation();
      assertNotNull(dsi);
    } catch (IOException e) {
      e.printStackTrace();
      fail();
    } catch (NoPropertySetStreamException e) {
      e.printStackTrace();
      fail();
    } catch (MarkUnsupportedException e) {
      e.printStackTrace();
      fail();
    } catch (UnexpectedPropertySetTypeException e) {
      e.printStackTrace();
      fail();
    }
    assertNotNull(dsi);
    try {
      DocumentEntry siEntry = (DocumentEntry) dir.getEntry(SummaryInformation.DEFAULT_STREAM_NAME);
      DocumentInputStream dis = new DocumentInputStream(siEntry);
      PropertySet ps;
      try {
        ps = new PropertySet(dis);
      } finally {
        // Release the stream even when parsing the property set fails.
        dis.close();
      }
      si = new SummaryInformation(ps);

    } catch (FileNotFoundException ex) {
      /* There is no summary information yet. We have to create a
       * new one. */
      si = PropertySetFactory.newSummaryInformation();
      assertNotNull(si);
    } catch (IOException e) {
      e.printStackTrace();
      fail();
    } catch (NoPropertySetStreamException e) {
      e.printStackTrace();
      fail();
    } catch (MarkUnsupportedException e) {
      e.printStackTrace();
      fail();
    } catch (UnexpectedPropertySetTypeException e) {
      e.printStackTrace();
      fail();
    }
    // Verify the summary information, not the document summary information a second time.
    assertNotNull(si);
  }
Exemple #2
0
 /**
  * Copies the bytes from a {@link DocumentInputStream} to a new stream in a POI filesystem.
  *
  * <p>The source is read completely into memory before the target document is created. The
  * source stream is always closed by this method, whether or not the copy succeeds.
  *
  * @param poiFs The POI filesystem to write to.
  * @param path The source document's path.
  * @param name The source document's name.
  * @param stream The stream containing the source document.
  * @throws IOException if resolving the target directory, reading the source stream, or
  *     creating the target document fails.
  */
 public void copy(
     final POIFSFileSystem poiFs,
     final POIFSDocumentPath path,
     final String name,
     final DocumentInputStream stream)
     throws IOException {
   final ByteArrayOutputStream out = new ByteArrayOutputStream();
   final DirectoryEntry de;
   try {
     de = getPath(poiFs, path);
     // Copy in chunks instead of one byte per call.
     final byte[] chunk = new byte[4096];
     int count;
     while ((count = stream.read(chunk)) != -1) {
       out.write(chunk, 0, count);
     }
   } finally {
     // Close the source even when path resolution or reading fails.
     stream.close();
   }
   final InputStream in = new ByteArrayInputStream(out.toByteArray());
   de.createDocument(name, in);
 }
 /**
  * Closes the underlying {@code input} and clears the reference to it.
  *
  * <p>Calling this method when {@code input} is already {@code null} has no effect. The
  * reference is cleared even if closing throws.
  *
  * @throws IOException if closing the underlying input fails.
  */
 public void close() throws IOException {
   if (input == null) {
     // Already closed (or never opened): nothing to do.
     return;
   }
   try {
     input.close();
   } finally {
     input = null;
   }
 }
Exemple #4
0
  /**
   * Writes the current property sets into the filesystem, serializes the filesystem to the byte
   * buffer, and then parses that buffer again to reload {@code dsi} and {@code si}.
   *
   * <p>Tests call this after modifying the property sets so that subsequent assertions run
   * against values that have been through a full write/read cycle. Any I/O or property-set
   * exception is printed and fails the test; the only exception is a missing summary information
   * stream on re-read, which is replaced by a newly created one.
   *
   * <p>Note: the re-opened filesystem and its root are held in local variables only; the
   * {@code poifs} and {@code dir} fields are not reassigned here.
   */
  public void closeAndReOpen() {

    // Store both property sets as streams under the current root directory.
    try {
      dsi.write(dir, DocumentSummaryInformation.DEFAULT_STREAM_NAME);
      si.write(dir, SummaryInformation.DEFAULT_STREAM_NAME);
    } catch (WritingNotSupportedException writeEx) {
      writeEx.printStackTrace();
      fail();
    } catch (IOException ioEx) {
      ioEx.printStackTrace();
      fail();
    }

    // Forget the in-memory property sets so that only re-read values are visible afterwards.
    si = null;
    dsi = null;

    // Serialize the whole filesystem into the byte buffer.
    try {
      poifs.writeFilesystem(bout);
      bout.flush();
    } catch (IOException ioEx) {
      ioEx.printStackTrace();
      fail();
    }

    // Parse the serialized bytes back into a separate filesystem instance.
    InputStream serialized = new ByteArrayInputStream(bout.toByteArray());
    assertNotNull(serialized);
    POIFSFileSystem reopenedFs = null;
    try {
      reopenedFs = new POIFSFileSystem(serialized);
    } catch (IOException ioEx) {
      ioEx.printStackTrace();
      fail();
    }
    try {
      serialized.close();
    } catch (IOException ioEx) {
      ioEx.printStackTrace();
      fail();
    }
    assertNotNull(reopenedFs);
    DirectoryEntry reopenedRoot = reopenedFs.getRoot();

    // Reload the document summary information; it must exist after the write above.
    try {
      DocumentEntry entry =
          (DocumentEntry) reopenedRoot.getEntry(DocumentSummaryInformation.DEFAULT_STREAM_NAME);
      DocumentInputStream in = new DocumentInputStream(entry);
      PropertySet props = new PropertySet(in);
      in.close();
      dsi = new DocumentSummaryInformation(props);
    } catch (FileNotFoundException missing) {
      fail();
    } catch (IOException ioEx) {
      ioEx.printStackTrace();
      fail();
    } catch (NoPropertySetStreamException noStreamEx) {
      noStreamEx.printStackTrace();
      fail();
    } catch (MarkUnsupportedException markEx) {
      markEx.printStackTrace();
      fail();
    } catch (UnexpectedPropertySetTypeException typeEx) {
      typeEx.printStackTrace();
      fail();
    }

    // Reload the summary information; a missing stream is tolerated and replaced by a new set.
    try {
      DocumentEntry entry =
          (DocumentEntry) reopenedRoot.getEntry(SummaryInformation.DEFAULT_STREAM_NAME);
      DocumentInputStream in = new DocumentInputStream(entry);
      PropertySet props = new PropertySet(in);
      in.close();
      si = new SummaryInformation(props);
    } catch (FileNotFoundException missing) {
      si = PropertySetFactory.newSummaryInformation();
      assertNotNull(si);
    } catch (IOException ioEx) {
      ioEx.printStackTrace();
      fail();
    } catch (NoPropertySetStreamException noStreamEx) {
      noStreamEx.printStackTrace();
      fail();
    } catch (MarkUnsupportedException markEx) {
      markEx.printStackTrace();
      fail();
    } catch (UnexpectedPropertySetTypeException typeEx) {
      typeEx.printStackTrace();
      fail();
    }
  }
  /**
   * Extracts the payload bytes of an embedded object that is stored as a POIFS (OLE2) container.
   *
   * <p>The container is inspected in this order:
   *
   * <ul>
   *   <li>If the root has a "Package" entry, that entry's bytes are returned.
   *   <li>If the detected type is {@code OLE10_NATIVE}, the unwrapped data buffer is returned; an
   *       invalid record is skipped and {@code null} is returned.
   *   <li>If the detected type is {@code COMP_OBJ}, the bytes of the "CONTENTS" entry (or
   *       "Contents" if the former is missing) are returned.
   *   <li>Otherwise {@code is} is reset and returned in full, and a generated resource name and
   *       the detected content type are recorded in {@code metadata}.
   * </ul>
   *
   * @param is stream positioned at the start of the POIFS data; the fallback branch calls
   *     {@code reset()} on it, so it must support being reset to its start
   * @param metadata receives the resource name and content type in the fallback branch
   * @param unknownFilenameCount counter used to build a unique name in the fallback branch
   * @return the extracted bytes; may be {@code null}
   * @throws IOException if the stream is not actually POIFS or reading fails
   */
  private byte[] handleEmbeddedPOIFS(
      InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException {

    NPOIFSFileSystem fs = null;
    byte[] ret = null;
    try {

      fs = new NPOIFSFileSystem(is);

      DirectoryNode root = fs.getRoot();

      if (root == null) {
        return ret;
      }

      if (root.hasEntry("Package")) {
        Entry ooxml = root.getEntry("Package");
        TikaInputStream stream =
            TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml));
        try {
          ByteArrayOutputStream out = new ByteArrayOutputStream();

          IOUtils.copy(stream, out);
          ret = out.toByteArray();
        } finally {
          // Release the wrapper stream once its bytes have been copied.
          stream.close();
        }
      } else {
        // try poifs
        POIFSDocumentType type = POIFSDocumentType.detectType(root);
        if (type == POIFSDocumentType.OLE10_NATIVE) {
          try {
            // Try to un-wrap the OLE10Native record:
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
            ret = ole.getDataBuffer();
          } catch (Ole10NativeException ex) {
            // Not a valid OLE10Native record, skip it
          }
        } else if (type == POIFSDocumentType.COMP_OBJ) {

          // The entry name is looked up in two spellings; if neither exists the second
          // lookup's FileNotFoundException propagates to the caller.
          DocumentEntry contentsEntry;
          try {
            contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
          } catch (FileNotFoundException ioe) {
            contentsEntry = (DocumentEntry) root.getEntry("Contents");
          }

          DocumentInputStream inp = null;
          try {
            inp = new DocumentInputStream(contentsEntry);
            ret = new byte[contentsEntry.getSize()];
            inp.readFully(ret);
          } finally {
            if (inp != null) {
              inp.close();
            }
          }
        } else {

          // Unknown container type: hand back the raw bytes and label them with a
          // generated file name and the detected content type.
          ByteArrayOutputStream out = new ByteArrayOutputStream();
          is.reset();
          IOUtils.copy(is, out);
          ret = out.toByteArray();
          metadata.set(
              Metadata.RESOURCE_NAME_KEY,
              "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
          metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
        }
      }
    } finally {
      if (fs != null) {
        fs.close();
      }
    }
    return ret;
  }
  /**
   * Extracts the text of a Word document stored in a POI filesystem.
   *
   * <p>The method reads the "WordDocument" stream, uses bit {@code 0x200} of the short at offset
   * {@code 0xa} to choose between the "1Table" and "0Table" streams, and passes the chosen table
   * stream together with the int at offset {@code 0x1a2} to {@code findText}, which fills the
   * list of text pieces. Each piece is then decoded from the "WordDocument" bytes and appended
   * to the result, followed by a single space.
   *
   * @param in stream containing the POI filesystem of the document
   * @return the concatenated text pieces, each followed by a space
   * @throws IOException if the filesystem or one of its streams cannot be read, or a charset is
   *     not supported
   */
  public String extractText(InputStream in) throws IOException {
    ArrayList<WordTextPiece> text = new ArrayList<WordTextPiece>();
    POIFSFileSystem fsys = new POIFSFileSystem(in);

    byte[] header = readWholeDocument(fsys, "WordDocument");

    // Get the flags from the document header.
    int info = LittleEndian.getShort(header, 0xa);
    boolean useTable1 = (info & 0x200) != 0;

    // Get the piece table location from the header.
    int complexOffset = LittleEndian.getInt(header, 0x1a2);

    String tableName = useTable1 ? "1Table" : "0Table";
    byte[] tableStream = readWholeDocument(fsys, tableName);

    // The value returned by findText scales the byte length of Unicode pieces below.
    int multiple = findText(tableStream, complexOffset, text);

    StringBuilder sb = new StringBuilder();
    for (WordTextPiece nextPiece : text) {
      int start = nextPiece.getStart();
      int length = nextPiece.getLength();

      // NOTE(review): the charset names are kept exactly as they were; confirm that "UTF-8"
      // and "big5" are really the encodings of the documents this extractor is used on.
      String toStr;
      if (nextPiece.usesUnicode()) {
        toStr = new String(header, start, length * multiple, "UTF-8");
      } else {
        toStr = new String(header, start, length, "big5");
      }
      sb.append(toStr).append(" ");
    }
    return sb.toString();
  }

  /**
   * Reads the named root-level document into a byte array sized from its entry, looping until
   * the array is full or the stream ends, and always closing the stream.
   */
  private byte[] readWholeDocument(POIFSFileSystem fsys, String name) throws IOException {
    DocumentEntry entry = (DocumentEntry) fsys.getRoot().getEntry(name);
    byte[] data = new byte[entry.getSize()];
    DocumentInputStream din = fsys.createDocumentInputStream(name);
    try {
      int filled = 0;
      while (filled < data.length) {
        // A single read may return fewer bytes than requested, so keep reading.
        int count = din.read(data, filled, data.length - filled);
        if (count <= 0) {
          break;
        }
        filled += count;
      }
    } finally {
      din.close();
    }
    return data;
  }