protected void importDataArchive( Resource archive, InputStream resourceStream, BatchImportOptions options) { BufferedInputStream bufferedResourceStream = null; try { // Make sure the stream is buffered if (resourceStream instanceof BufferedInputStream) { bufferedResourceStream = (BufferedInputStream) resourceStream; } else { bufferedResourceStream = new BufferedInputStream(resourceStream); } // Buffer up to 100MB, bad things will happen if we bust this buffer. // TODO see if there is a buffered stream that will write to a file once the buffer fills up bufferedResourceStream.mark(100 * 1024 * 1024); final MediaType type = getMediaType(bufferedResourceStream, archive.getFilename()); if (MT_JAVA_ARCHIVE.equals(type)) { final ArchiveInputStream archiveStream = new JarArchiveInputStream(bufferedResourceStream); importDataArchive(archive, archiveStream, options); } else if (MediaType.APPLICATION_ZIP.equals(type)) { final ArchiveInputStream archiveStream = new ZipArchiveInputStream(bufferedResourceStream); importDataArchive(archive, archiveStream, options); } else if (MT_CPIO.equals(type)) { final ArchiveInputStream archiveStream = new CpioArchiveInputStream(bufferedResourceStream); importDataArchive(archive, archiveStream, options); } else if (MT_AR.equals(type)) { final ArchiveInputStream archiveStream = new ArArchiveInputStream(bufferedResourceStream); importDataArchive(archive, archiveStream, options); } else if (MT_TAR.equals(type)) { final ArchiveInputStream archiveStream = new TarArchiveInputStream(bufferedResourceStream); importDataArchive(archive, archiveStream, options); } else if (MT_BZIP2.equals(type)) { final CompressorInputStream compressedStream = new BZip2CompressorInputStream(bufferedResourceStream); importDataArchive(archive, compressedStream, options); } else if (MT_GZIP.equals(type)) { final CompressorInputStream compressedStream = new GzipCompressorInputStream(bufferedResourceStream); importDataArchive(archive, compressedStream, options); } else if (MT_PACK200.equals(type)) { final CompressorInputStream compressedStream = new Pack200CompressorInputStream(bufferedResourceStream); importDataArchive(archive, compressedStream, options); } else if (MT_XZ.equals(type)) { final CompressorInputStream compressedStream = new XZCompressorInputStream(bufferedResourceStream); importDataArchive(archive, compressedStream, options); } else { throw new RuntimeException("Unrecognized archive media type: " + type); } } catch (IOException e) { throw new RuntimeException("Could not load InputStream for resource: " + archive, e); } finally { IOUtils.closeQuietly(bufferedResourceStream); } }
/** @return SiteMap/SiteMapIndex given a content type, byte content and the URL of a sitemap */ public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url) throws UnknownFormatException, IOException { MediaType mediaType = MediaType.parse(contentType); // Octet-stream is the father of all binary types while (mediaType != null && !mediaType.equals(MediaType.OCTET_STREAM)) { if (XML_MEDIA_TYPES.contains(mediaType)) { return processXml(url, content); } else if (TEXT_MEDIA_TYPES.contains(mediaType)) { return (AbstractSiteMap) processText(url.toString(), content); } else if (GZ_MEDIA_TYPES.contains(mediaType)) { return processGzip(url, content); } else { mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check // parent return parseSiteMap(mediaType.toString(), content, url); } } throw new UnknownFormatException( "Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")"); }
/** * Internal detection of the specific kind of OLE2 document, based on the names of the top-level * streams within the file. In some cases the detection may need access to the root {@link * DirectoryEntry} of that file for best results. The entry can be given as a second, optional * argument. * * @param names * @param root * @return */ protected static MediaType detect(Set<String> names, DirectoryEntry root) { if (names != null) { if (names.contains("SwDocContentMgr") && names.contains("SwDocMgrTempStorage")) { return SLDWORKS; } else if (names.contains("StarCalcDocument")) { // Star Office Calc return SDC; } else if (names.contains("StarWriterDocument")) { return SDW; } else if (names.contains("StarDrawDocument3")) { if (root == null) { /* * This is either StarOfficeDraw or StarOfficeImpress, we have * to consult the CompObj to distinguish them, if this method is * called in "legacy mode", without the root, just return * x-tika-msoffice. The one-argument method is only for backward * compatibility, if someone calls old API he/she can get the * old result. */ return OLE; } else { return processCompObjFormatType(root); } } else if (names.contains("\u0005HwpSummaryInformation")) { // Hangul Word Processor v5+ (previous aren't OLE2-based) return HWP; } else if (names.contains("WksSSWorkBook")) { // This check has to be before names.contains("Workbook") // Works 7.0 spreadsheet files contain both // we want to avoid classifying this as Excel return XLR; } else if (names.contains("Workbook") || names.contains("WORKBOOK")) { MediaType tmp = processCompObjFormatType(root); if (tmp.equals(MS_GRAPH_CHART)) { return MS_GRAPH_CHART; } return XLS; } else if (names.contains("Book")) { // Excel 95 or older, we won't be able to parse this.... return XLS; } else if (names.contains("EncryptedPackage") && names.contains("EncryptionInfo") && names.contains("\u0006DataSpaces")) { // This is a protected OOXML document, which is an OLE2 file // with an Encrypted Stream which holds the OOXML data // Without decrypting the stream, we can't tell what kind of // OOXML file we have. Return a general OOXML Protected type, // and hope the name based detection can guess the rest! return OOXML_PROTECTED; } else if (names.contains("EncryptedPackage")) { return OLE; } else if (names.contains("WordDocument")) { return DOC; } else if (names.contains("Quill")) { return PUB; } else if (names.contains("PowerPoint Document")) { return PPT; } else if (names.contains("VisioDocument")) { return VSD; } else if (names.contains("\u0001Ole10Native")) { return OLE10_NATIVE; } else if (names.contains("MatOST")) { // this occurs on older Works Word Processor files (versions 3.0 and 4.0) return WPS; } else if (names.contains("CONTENTS") && names.contains("SPELLING")) { // Newer Works files return WPS; } else if (names.contains("Contents") && names.contains("\u0003ObjInfo")) { return COMP_OBJ; } else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) { // CompObj is a general kind of OLE2 embedding, but this may be an old Works file // If we have the Directory, check if (root != null) { MediaType type = processCompObjFormatType(root); if (type == WPS) { return WPS; } else { // Assume it's a general CompObj embedded resource return COMP_OBJ; } } else { // Assume it's a general CompObj embedded resource return COMP_OBJ; } } else if (names.contains("CONTENTS")) { // CONTENTS without SPELLING nor CompObj normally means some sort // of embedded non-office file inside an OLE2 document // This is most commonly triggered on nested directories return OLE; } else if (names.contains("\u0001CompObj") && (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) { // Could be Project, look for common name patterns for (String name : names) { if (mppDataMatch.matcher(name).matches()) { return MPP; } } } else if (names.contains("PerfectOffice_MAIN")) { if (names.contains("SlideShow")) { return MediaType.application("x-corelpresentations"); // .shw } else if (names.contains("PerfectOffice_OBJECTS")) { return new MediaType(QUATTROPRO, "version", "7-8"); // .wb? } } else if (names.contains("NativeContent_MAIN")) { return new MediaType(QUATTROPRO, "version", "9"); // .qpw } else { for (String name : names) { if (name.startsWith("__substg1.0_")) { return MSG; } } } } // Couldn't detect a more specific type return OLE; }