public void parse( InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // As we don't know which of the metadata or the content // we'll hit first, catch the endDocument call initially EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(baseHandler); // Process the file in turn ZipInputStream zip = new ZipInputStream(stream); ZipEntry entry = zip.getNextEntry(); while (entry != null) { if (entry.getName().equals("mimetype")) { String type = IOUtils.toString(zip, "UTF-8"); metadata.set(Metadata.CONTENT_TYPE, type); } else if (entry.getName().equals("meta.xml")) { meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith("content.xml")) { content.parse(zip, handler, metadata, context); } entry = zip.getNextEntry(); } // Only now call the end document if (handler.getEndDocumentWasCalled()) { handler.reallyEndDocument(); } }
public MediaType detect(TikaInputStream input, Metadata metadata) throws IOException { ZipFile zip = new ZipFile(input.getFile()); for (ZipEntry entry : Collections.list(zip.entries())) { // Is it an Open Document file? if (entry.getName().equals("mimetype")) { InputStream stream = zip.getInputStream(entry); try { return fromString(IOUtils.toString(stream, "UTF-8")); } finally { stream.close(); } } else if (entry.getName().equals("_rels/.rels") || entry.getName().equals("[Content_Types].xml")) { // Office Open XML File // As POI to open and investigate it for us try { OPCPackage pkg = OPCPackage.open(input.getFile().toString()); input.setOpenContainer(pkg); PackageRelationshipCollection core = pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL); if (core.size() != 1) { throw new IOException( "Invalid OOXML Package received - expected 1 core document, found " + core.size()); } // Get the type of the core document part PackagePart corePart = pkg.getPart(core.getRelationship(0)); String coreType = corePart.getContentType(); // Turn that into the type of the overall document String docType = coreType.substring(0, coreType.lastIndexOf('.')); return fromString(docType); } catch (InvalidFormatException e) { throw new IOException("Office Open XML File detected, but corrupted - " + e.getMessage()); } } else if (entry.getName().equals("buildVersionHistory.plist")) { // This is an iWork document // Reset and ask zip.close(); zip = new ZipFile(input.getFile()); return IWorkPackageParser.identifyType(zip); } else if (entry.getName().equals("META-INF/")) { // Java Jar return MediaType.application("java-archive"); } } return MediaType.APPLICATION_ZIP; }