Example #1
0
  /**
   * Detects the media type of a stream that may hold an OLE2 compound document.
   *
   * <p>If the stream is a {@link TikaInputStream} that already carries an open
   * {@code NPOIFSFileSystem} or {@code DirectoryNode}, the top level entry names
   * are taken from that container. Otherwise the first eight bytes are compared
   * with the OLE2 header signature, and the stream is reset afterwards.
   *
   * @param input the stream to inspect, may be {@code null}
   * @param metadata document metadata; not read by this method
   * @return {@link MediaType#OCTET_STREAM} when {@code input} is {@code null} or
   *     the header signature does not match; otherwise whatever the name based
   *     {@code detect} overload returns
   * @throws IOException if reading from or resetting the stream fails
   */
  public MediaType detect(InputStream input, Metadata metadata) throws IOException {
    // Nothing to inspect without a stream
    if (input == null) {
      return MediaType.OCTET_STREAM;
    }

    TikaInputStream tis = TikaInputStream.cast(input);

    // Reuse the entry names of a container that was opened earlier, if any
    Set<String> names = null;
    Object preOpened = (tis == null) ? null : tis.getOpenContainer();
    if (preOpened instanceof NPOIFSFileSystem) {
      names = getTopLevelNames(((NPOIFSFileSystem) preOpened).getRoot());
    } else if (preOpened instanceof DirectoryNode) {
      names = getTopLevelNames((DirectoryNode) preOpened);
    }

    if (names == null) {
      // No container at hand: the stream must start with the OLE header
      final int[] oleSignature = {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1};
      input.mark(oleSignature.length);
      try {
        for (int expected : oleSignature) {
          if (input.read() != expected) {
            return MediaType.OCTET_STREAM;
          }
        }
      } finally {
        // Always rewind so later consumers see the stream from its start
        input.reset();
      }

      // The exact type can only be detected when given a TikaInputStream
      if (tis != null) {
        names = getTopLevelNames(tis);
      }
    }

    // Ask for the container again rather than reusing the earlier lookup,
    // since it is queried after the names have been gathered
    Object current = (tis == null) ? null : tis.getOpenContainer();
    if (current instanceof NPOIFSFileSystem) {
      return detect(names, ((NPOIFSFileSystem) current).getRoot());
    }
    return detect(names, null);
  }
Example #2
0
    /**
     * Saves an embedded resource as a file below {@code extractDir}.
     *
     * <p>The file name is taken from {@link Metadata#RESOURCE_NAME_KEY}, or
     * generated from a running counter when absent. If the name contains no
     * {@code '.'}, an extension derived from the detected content type is
     * appended. If an embedded relationship id is present and the name does not
     * already start with it, the name is prefixed with that id.
     *
     * <p>Failures while writing the file content are logged and ignored, so one
     * broken attachment does not stop the extraction.
     *
     * @param inputStream stream holding the embedded resource
     * @param contentHandler not used by this implementation
     * @param metadata metadata of the embedded resource
     * @param outputHtml not used by this implementation
     * @throws SAXException declared for callers; not thrown directly by this method
     * @throws IOException if the target resolves outside the extraction
     *     directory, the parent directory cannot be created, or closing the
     *     output file fails
     */
    public void parseEmbedded(
        InputStream inputStream,
        ContentHandler contentHandler,
        Metadata metadata,
        boolean outputHtml)
        throws SAXException, IOException {
      String name = metadata.get(Metadata.RESOURCE_NAME_KEY);

      if (name == null) {
        name = "file" + count++;
      }

      MediaType contentType = detector.detect(inputStream, metadata);

      if (name.indexOf('.') == -1 && contentType != null) {
        try {
          name += config.getMimeRepository().forName(contentType.toString()).getExtension();
        } catch (MimeTypeException e) {
          // Best effort only: keep the name without an extension
          logger.warn("Unable to determine file extension for " + contentType, e);
        }
      }

      String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
      if (relID != null && !name.startsWith(relID)) {
        name = relID + "_" + name;
      }

      File outputFile = new File(extractDir, name);

      // The name originates from the document being parsed, so refuse anything
      // (for example "../../x") that resolves outside the extraction directory.
      String basePath = new File(extractDir, ".").getCanonicalPath();
      String basePrefix = basePath.endsWith(File.separator) ? basePath : basePath + File.separator;
      if (!outputFile.getCanonicalPath().startsWith(basePrefix)) {
        throw new IOException(
            "embedded file name \"" + name + "\" resolves outside of the extraction directory");
      }

      File parent = outputFile.getParentFile();
      if (parent != null && !parent.exists()) {
        if (!parent.mkdirs()) {
          throw new IOException("unable to create directory \"" + parent + "\"");
        }
      }
      System.out.println("Extracting '" + name + "' (" + contentType + ") to " + outputFile);

      FileOutputStream os = null;

      try {
        os = new FileOutputStream(outputFile);

        Object container = null;
        if (inputStream instanceof TikaInputStream) {
          container = ((TikaInputStream) inputStream).getOpenContainer();
        }

        if (container instanceof DirectoryEntry) {
          // An open OLE2 directory is rebuilt as a fresh POIFS file system and
          // written out, instead of copying the raw stream bytes
          POIFSFileSystem fs = new POIFSFileSystem();
          copy((DirectoryEntry) container, fs.getRoot());
          fs.writeFilesystem(os);
        } else {
          IOUtils.copy(inputStream, os);
        }
      } catch (Exception e) {
        // Deliberately broad: a failure on one embedded file must not abort the rest
        logger.warn("Ignoring unexpected exception trying to save embedded file " + name, e);
      } finally {
        if (os != null) {
          os.close();
        }
      }
    }