public MediaType detect(InputStream input, Metadata metadata) throws IOException { // Check if we have access to the document if (input == null) { return MediaType.OCTET_STREAM; } // If this is a TikaInputStream wrapping an already // parsed NPOIFileSystem/DirectoryNode, just get the // names from the root: TikaInputStream tis = TikaInputStream.cast(input); Set<String> names = null; if (tis != null) { Object container = tis.getOpenContainer(); if (container instanceof NPOIFSFileSystem) { names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot()); } else if (container instanceof DirectoryNode) { names = getTopLevelNames((DirectoryNode) container); } } if (names == null) { // Check if the document starts with the OLE header input.mark(8); try { if (input.read() != 0xd0 || input.read() != 0xcf || input.read() != 0x11 || input.read() != 0xe0 || input.read() != 0xa1 || input.read() != 0xb1 || input.read() != 0x1a || input.read() != 0xe1) { return MediaType.OCTET_STREAM; } } finally { input.reset(); } } // We can only detect the exact type when given a TikaInputStream if (names == null && tis != null) { // Look for known top level entry names to detect the document type names = getTopLevelNames(tis); } // Detect based on the names (as available) if (tis != null && tis.getOpenContainer() != null && tis.getOpenContainer() instanceof NPOIFSFileSystem) { return detect(names, ((NPOIFSFileSystem) tis.getOpenContainer()).getRoot()); } else { return detect(names, null); } }
/**
 * Writes one embedded document to a file below the extraction directory.
 *
 * <p>The file name is taken from the {@code RESOURCE_NAME_KEY} metadata
 * entry, falling back to {@code "file" + count}. When the name has no
 * dot and a content type was detected, the extension registered for that
 * type is appended. When an {@code EMBEDDED_RELATIONSHIP_ID} is present
 * and the name does not already start with it, the id is prefixed.
 *
 * <p>Because the name comes from the document being processed, the
 * resolved target is verified to lie inside the extraction directory
 * before any directory or file is created.
 *
 * <p>Failures while writing the file content are logged and ignored.
 *
 * @param inputStream    content of the embedded document
 * @param contentHandler not used by this method
 * @param metadata       metadata of the embedded document; supplies the
 *                       resource name and relationship id
 * @param outputHtml     not used by this method
 * @throws SAXException declared by the signature; not thrown directly here
 * @throws IOException  if the target would fall outside the extraction
 *                      directory, if its parent directory cannot be
 *                      created, if type detection fails on the stream,
 *                      or if closing the output file fails
 */
public void parseEmbedded(
        InputStream inputStream, ContentHandler contentHandler,
        Metadata metadata, boolean outputHtml)
        throws SAXException, IOException {
    String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
    if (name == null) {
        name = "file" + count++;
    }

    MediaType contentType = detector.detect(inputStream, metadata);
    if (name.indexOf('.') == -1 && contentType != null) {
        try {
            name += config.getMimeRepository().forName(
                    contentType.toString()).getExtension();
        } catch (MimeTypeException e) {
            // A missing extension is not fatal; keep the bare name
            logger.warn("Unable to determine file extension for "
                    + contentType, e);
        }
    }

    String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
    if (relID != null && !name.startsWith(relID)) {
        name = relID + "_" + name;
    }

    File outputFile = new File(extractDir, name);

    // The name is controlled by the embedded document, so a value such as
    // "../../x" must not be allowed to escape the extraction directory.
    // Canonical paths resolve ".." segments before the comparison.
    String basePath = new File(extractDir, ".").getCanonicalPath();
    String basePrefix = basePath.endsWith(File.separator)
            ? basePath : basePath + File.separator;
    if (!outputFile.getCanonicalPath().startsWith(basePrefix)) {
        throw new IOException("Refusing to extract embedded file \""
                + name + "\" outside of directory \"" + basePath + "\"");
    }

    File parent = outputFile.getParentFile();
    if (parent != null && !parent.exists()) {
        if (!parent.mkdirs()) {
            throw new IOException(
                    "unable to create directory \"" + parent + "\"");
        }
    }
    System.out.println("Extracting '" + name + "' (" + contentType
            + ") to " + outputFile);

    FileOutputStream os = null;
    try {
        os = new FileOutputStream(outputFile);

        // A stream that carries an opened DirectoryEntry is rebuilt as a
        // POIFS file system; anything else is copied through as is
        Object container = null;
        if (inputStream instanceof TikaInputStream) {
            container = ((TikaInputStream) inputStream).getOpenContainer();
        }
        if (container instanceof DirectoryEntry) {
            POIFSFileSystem fs = new POIFSFileSystem();
            copy((DirectoryEntry) container, fs.getRoot());
            fs.writeFilesystem(os);
        } else {
            IOUtils.copy(inputStream, os);
        }
    } catch (Exception e) {
        // Saving is best effort: one bad attachment must not stop the rest
        logger.warn("Ignoring unexpected exception trying to save embedded file "
                + name, e);
    } finally {
        if (os != null) {
            os.close();
        }
    }
}