/**
 * Imports a data archive supplied as a raw stream. The media type is looked up via {@code
 * getMediaType} and the stream is wrapped in the matching archive or compressor stream before
 * being handed to the corresponding {@code importDataArchive} overload.
 *
 * @param archive the resource being imported; its filename is passed to the type lookup and it
 *     is named in error messages
 * @param resourceStream the stream to read the archive from; it is closed before returning
 * @param options import options forwarded unchanged to the delegate
 * @throws RuntimeException if the media type is not a supported archive/compression format, or
 *     if an {@link IOException} occurs while opening the stream
 */
protected void importDataArchive(
      Resource archive, InputStream resourceStream, BatchImportOptions options) {
    // Reuse the caller's buffering when it is already there, otherwise add our own.
    final BufferedInputStream input =
        (resourceStream instanceof BufferedInputStream)
            ? (BufferedInputStream) resourceStream
            : new BufferedInputStream(resourceStream);
    try {
      // Buffer up to 100MB, bad things will happen if we bust this buffer.
      // TODO see if there is a buffered stream that will write to a file once the buffer fills up
      input.mark(100 * 1024 * 1024);
      final MediaType type = getMediaType(input, archive.getFilename());

      // Phase 1: container formats. The variable is declared as ArchiveInputStream so the
      // archive-specific overload is the one selected.
      final ArchiveInputStream archiveStream;
      if (MT_JAVA_ARCHIVE.equals(type)) {
        archiveStream = new JarArchiveInputStream(input);
      } else if (MediaType.APPLICATION_ZIP.equals(type)) {
        archiveStream = new ZipArchiveInputStream(input);
      } else if (MT_CPIO.equals(type)) {
        archiveStream = new CpioArchiveInputStream(input);
      } else if (MT_AR.equals(type)) {
        archiveStream = new ArArchiveInputStream(input);
      } else if (MT_TAR.equals(type)) {
        archiveStream = new TarArchiveInputStream(input);
      } else {
        archiveStream = null;
      }
      if (archiveStream != null) {
        importDataArchive(archive, archiveStream, options);
        return;
      }

      // Phase 2: single-stream compression formats; anything else is rejected.
      final CompressorInputStream compressedStream;
      if (MT_BZIP2.equals(type)) {
        compressedStream = new BZip2CompressorInputStream(input);
      } else if (MT_GZIP.equals(type)) {
        compressedStream = new GzipCompressorInputStream(input);
      } else if (MT_PACK200.equals(type)) {
        compressedStream = new Pack200CompressorInputStream(input);
      } else if (MT_XZ.equals(type)) {
        compressedStream = new XZCompressorInputStream(input);
      } else {
        throw new RuntimeException("Unrecognized archive media type: " + type);
      }
      importDataArchive(archive, compressedStream, options);
    } catch (IOException e) {
      throw new RuntimeException("Could not load InputStream for resource: " + archive, e);
    } finally {
      // Always release the underlying stream, whichever path was taken.
      IOUtils.closeQuietly(input);
    }
  }
// Example #2
// 0
  /**
   * Parses a sitemap or sitemap index from raw bytes, picking the parser from the media type.
   *
   * <p>The declared content type is tried first. If it is not in any of the supported sets, the
   * lookup moves to its supertype as reported by {@code MEDIA_TYPE_REGISTRY.getSupertype} and
   * tries again, stopping when a supported type is found, when no supertype is available, or
   * when octet-stream is reached.
   *
   * @param contentType the content type of the sitemap, as a media type string
   * @param content the raw bytes of the sitemap
   * @param url the URL of the sitemap
   * @return SiteMap/SiteMapIndex given a content type, byte content and the URL of a sitemap
   * @throws UnknownFormatException if neither the content type nor any of its supertypes is
   *     supported; the message reports the content type originally passed in
   * @throws IOException declared by this method; presumably raised by the format-specific
   *     processors (confirm against their signatures)
   */
  public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url)
      throws UnknownFormatException, IOException {
    MediaType mediaType = MediaType.parse(contentType);

    // Octet-stream is the father of all binary types, so reaching it (or running out of
    // supertypes) means nothing in the hierarchy is a format we can parse.
    while (mediaType != null && !mediaType.equals(MediaType.OCTET_STREAM)) {
      if (XML_MEDIA_TYPES.contains(mediaType)) {
        return processXml(url, content);
      } else if (TEXT_MEDIA_TYPES.contains(mediaType)) {
        return (AbstractSiteMap) processText(url.toString(), content);
      } else if (GZ_MEDIA_TYPES.contains(mediaType)) {
        return processGzip(url, content);
      }
      // Not directly supported: check the parent type on the next iteration. Iterating here
      // (rather than recursing through the type's string form) keeps the loop guard in charge
      // of a missing supertype and keeps the caller's content type for the error message.
      mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType);
    }

    throw new UnknownFormatException(
        "Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
  }
// Example #3
// 0
  /**
   * Internal detection of the specific kind of OLE2 document, based on the names of the top-level
   * streams within the file. In some cases the detection may need access to the root {@link
   * DirectoryEntry} of that file for best results. The entry can be given as a second, optional
   * argument.
   *
   * <p>The checks are order-sensitive: several formats share stream names, so the more specific
   * signature is always tested before the more general one.
   *
   * @param names the names of the top-level streams in the file; may be {@code null}, in which
   *     case the generic OLE type is returned
   * @param root the root directory entry of the file; may be {@code null}, in which case the
   *     branches that need it fall back to a less specific type
   * @return the most specific media type that could be determined, or the generic OLE type when
   *     nothing more specific matched
   */
  protected static MediaType detect(Set<String> names, DirectoryEntry root) {
    if (names != null) {
      if (names.contains("SwDocContentMgr") && names.contains("SwDocMgrTempStorage")) {
        return SLDWORKS;
      } else if (names.contains("StarCalcDocument")) {
        // Star Office Calc
        return SDC;
      } else if (names.contains("StarWriterDocument")) {
        return SDW;
      } else if (names.contains("StarDrawDocument3")) {
        if (root == null) {
          /*
           * This is either StarOfficeDraw or StarOfficeImpress, we have
           * to consult the CompObj to distinguish them, if this method is
           * called in "legacy mode", without the root, just return
           * x-tika-msoffice. The one-argument method is only for backward
           * compatibility, if someone calls old API he/she can get the
           * old result.
           */
          return OLE;
        } else {
          return processCompObjFormatType(root);
        }
      } else if (names.contains("\u0005HwpSummaryInformation")) {
        // Hangul Word Processor v5+ (previous aren't OLE2-based)
        return HWP;
      } else if (names.contains("WksSSWorkBook")) {
        // This check has to be before names.contains("Workbook")
        // Works 7.0 spreadsheet files contain both
        // we want to avoid classifying this as Excel
        return XLR;
      } else if (names.contains("Workbook") || names.contains("WORKBOOK")) {
        // NOTE(review): root is not null-checked on this path; presumably
        // processCompObjFormatType tolerates a null root - confirm.
        // Constant-first equals so an unexpected null result means "plain Excel"
        // instead of a NullPointerException.
        if (MS_GRAPH_CHART.equals(processCompObjFormatType(root))) {
          return MS_GRAPH_CHART;
        }
        return XLS;
      } else if (names.contains("Book")) {
        // Excel 95 or older, we won't be able to parse this....
        return XLS;
      } else if (names.contains("EncryptedPackage")
          && names.contains("EncryptionInfo")
          && names.contains("\u0006DataSpaces")) {
        // This is a protected OOXML document, which is an OLE2 file
        //  with an Encrypted Stream which holds the OOXML data
        // Without decrypting the stream, we can't tell what kind of
        //  OOXML file we have. Return a general OOXML Protected type,
        //  and hope the name based detection can guess the rest!
        return OOXML_PROTECTED;
      } else if (names.contains("EncryptedPackage")) {
        return OLE;
      } else if (names.contains("WordDocument")) {
        return DOC;
      } else if (names.contains("Quill")) {
        return PUB;
      } else if (names.contains("PowerPoint Document")) {
        return PPT;
      } else if (names.contains("VisioDocument")) {
        return VSD;
      } else if (names.contains("\u0001Ole10Native")) {
        return OLE10_NATIVE;
      } else if (names.contains("MatOST")) {
        // this occurs on older Works Word Processor files (versions 3.0 and 4.0)
        return WPS;
      } else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
        // Newer Works files
        return WPS;
      } else if (names.contains("Contents") && names.contains("\u0003ObjInfo")) {
        return COMP_OBJ;
      } else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) {
        // CompObj is a general kind of OLE2 embedding, but this may be an old Works file
        // If we have the Directory, check. Compared by value rather than by reference so
        // the result does not depend on the helper handing back the very same instance.
        if (root != null && WPS.equals(processCompObjFormatType(root))) {
          return WPS;
        }
        // Assume it's a general CompObj embedded resource
        return COMP_OBJ;
      } else if (names.contains("CONTENTS")) {
        // CONTENTS without SPELLING nor CompObj normally means some sort
        //  of embedded non-office file inside an OLE2 document
        // This is most commonly triggered on nested directories
        return OLE;
      } else if (names.contains("\u0001CompObj")
          && (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) {
        // Could be Project, look for common name patterns
        for (String name : names) {
          if (mppDataMatch.matcher(name).matches()) {
            return MPP;
          }
        }
        // No Project-style stream name found: fall through to the generic result below.
      } else if (names.contains("PerfectOffice_MAIN")) {
        if (names.contains("SlideShow")) {
          return MediaType.application("x-corelpresentations"); // .shw
        } else if (names.contains("PerfectOffice_OBJECTS")) {
          return new MediaType(QUATTROPRO, "version", "7-8"); // .wb?
        }
        // Neither marker present: fall through to the generic result below.
      } else if (names.contains("NativeContent_MAIN")) {
        return new MediaType(QUATTROPRO, "version", "9"); // .qpw
      } else {
        // Last resort: scan for a stream name carrying the prefix that maps to MSG.
        for (String name : names) {
          if (name.startsWith("__substg1.0_")) {
            return MSG;
          }
        }
      }
    }

    // Couldn't detect a more specific type
    return OLE;
  }