public MediaType detect(TikaInputStream input, Metadata metadata) throws IOException { ZipFile zip = new ZipFile(input.getFile()); for (ZipEntry entry : Collections.list(zip.entries())) { // Is it an Open Document file? if (entry.getName().equals("mimetype")) { InputStream stream = zip.getInputStream(entry); try { return fromString(IOUtils.toString(stream, "UTF-8")); } finally { stream.close(); } } else if (entry.getName().equals("_rels/.rels") || entry.getName().equals("[Content_Types].xml")) { // Office Open XML File // As POI to open and investigate it for us try { OPCPackage pkg = OPCPackage.open(input.getFile().toString()); input.setOpenContainer(pkg); PackageRelationshipCollection core = pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL); if (core.size() != 1) { throw new IOException( "Invalid OOXML Package received - expected 1 core document, found " + core.size()); } // Get the type of the core document part PackagePart corePart = pkg.getPart(core.getRelationship(0)); String coreType = corePart.getContentType(); // Turn that into the type of the overall document String docType = coreType.substring(0, coreType.lastIndexOf('.')); return fromString(docType); } catch (InvalidFormatException e) { throw new IOException("Office Open XML File detected, but corrupted - " + e.getMessage()); } } else if (entry.getName().equals("buildVersionHistory.plist")) { // This is an iWork document // Reset and ask zip.close(); zip = new ZipFile(input.getFile()); return IWorkPackageParser.identifyType(zip); } else if (entry.getName().equals("META-INF/")) { // Java Jar return MediaType.application("java-archive"); } } return MediaType.APPLICATION_ZIP; }
private static Set<String> getTopLevelNames(TikaInputStream stream) throws IOException { // Force the document stream to a (possibly temporary) file // so we don't modify the current position of the stream File file = stream.getFile(); try { NPOIFSFileSystem fs = new NPOIFSFileSystem(file, true); // Optimize a possible later parsing process by keeping // a reference to the already opened POI file system stream.setOpenContainer(fs); return getTopLevelNames(fs.getRoot()); } catch (IOException e) { // Parse error in POI, so we don't know the file type return Collections.emptySet(); } catch (RuntimeException e) { // Another problem in POI return Collections.emptySet(); } }