Пример #1
0
  /**
   * Detects the media type of a zip-based container by looking at its entry names.
   *
   * <p>Entries are examined in the order the archive lists them, and the first
   * entry that matches one of the known markers decides the result:
   * <ul>
   *   <li>{@code mimetype} - the entry's content is read as UTF-8 and passed to
   *       {@code fromString};</li>
   *   <li>{@code _rels/.rels} or {@code [Content_Types].xml} - the file is opened
   *       with POI as an OPC package, the package is registered on {@code input}
   *       as its open container, and the type is derived from the content type
   *       of the single core document part;</li>
   *   <li>{@code buildVersionHistory.plist} - the decision is delegated to
   *       {@code IWorkPackageParser.identifyType};</li>
   *   <li>{@code META-INF/} - {@code MediaType.application("java-archive")}.</li>
   * </ul>
   * If no entry matches, {@code MediaType.APPLICATION_ZIP} is returned.
   *
   * <p>Every {@link ZipFile} opened here is closed before the method returns,
   * whether it returns normally or throws.
   *
   * @param input stream whose backing file ({@code input.getFile()}) is opened as a zip
   * @param metadata not consulted by this method
   * @return the detected media type
   * @throws IOException if the file cannot be read as a zip, or if an Office Open XML
   *     package does not have exactly one core document or is reported as invalid by POI
   */
  public MediaType detect(TikaInputStream input, Metadata metadata) throws IOException {
    ZipFile zip = new ZipFile(input.getFile());
    try {
      for (ZipEntry entry : Collections.list(zip.entries())) {
        String entryName = entry.getName();

        // Is it an Open Document file?
        if (entryName.equals("mimetype")) {
          InputStream stream = zip.getInputStream(entry);
          try {
            return fromString(IOUtils.toString(stream, "UTF-8"));
          } finally {
            stream.close();
          }
        } else if (entryName.equals("_rels/.rels")
            || entryName.equals("[Content_Types].xml")) {
          // Office Open XML File
          // Ask POI to open and investigate it for us
          try {
            OPCPackage pkg = OPCPackage.open(input.getFile().toString());
            // Keep the opened package on the stream so it can be reused later
            input.setOpenContainer(pkg);

            PackageRelationshipCollection core =
                pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL);
            if (core.size() != 1) {
              throw new IOException(
                  "Invalid OOXML Package received - expected 1 core document, found "
                      + core.size());
            }

            // Get the type of the core document part
            PackagePart corePart = pkg.getPart(core.getRelationship(0));
            String coreType = corePart.getContentType();

            // Turn that into the type of the overall document by dropping the
            // final ".xxx" segment. A content type without any dot is used
            // unchanged instead of failing with an index error.
            int lastDot = coreType.lastIndexOf('.');
            String docType = lastDot < 0 ? coreType : coreType.substring(0, lastDot);
            return fromString(docType);
          } catch (InvalidFormatException e) {
            // Same message as before, but keep the POI exception as the cause
            IOException wrapped =
                new IOException(
                    "Office Open XML File detected, but corrupted - " + e.getMessage());
            wrapped.initCause(e);
            throw wrapped;
          }
        } else if (entryName.equals("buildVersionHistory.plist")) {
          // This is an iWork document

          // Reset and ask: hand the parser a freshly opened zip. The outer
          // finally closes whichever ZipFile "zip" refers to at that point.
          zip.close();
          zip = new ZipFile(input.getFile());
          return IWorkPackageParser.identifyType(zip);
        } else if (entryName.equals("META-INF/")) {
          // Java Jar
          return MediaType.application("java-archive");
        }
      }

      return MediaType.APPLICATION_ZIP;
    } finally {
      // Closing an already closed ZipFile has no effect, so this is safe even
      // if the handle was closed earlier on the iWork path.
      zip.close();
    }
  }
Пример #2
0
  /**
   * Collects the names of the top-level entries of the POI file system backing
   * the given stream.
   *
   * <p>The file system that gets opened is registered on the stream through
   * {@code setOpenContainer}, so a later parsing step can pick it up instead of
   * opening the file again. If POI fails with an {@link IOException} or a
   * {@link RuntimeException}, the type is treated as unknown and an empty set
   * is returned.
   *
   * @param stream stream whose backing file is inspected
   * @return the top-level entry names, or an empty set if POI could not read the file
   * @throws IOException if the stream's backing file cannot be obtained
   */
  private static Set<String> getTopLevelNames(TikaInputStream stream) throws IOException {
    // Work on the (possibly temporary) backing file rather than the stream
    // itself, so the stream's current position is left untouched. This call is
    // kept outside the try so that its failures reach the caller.
    final File backingFile = stream.getFile();

    Set<String> topLevelNames;
    try {
      NPOIFSFileSystem poiFileSystem = new NPOIFSFileSystem(backingFile, true);

      // Remember the opened file system on the stream for possible reuse
      stream.setOpenContainer(poiFileSystem);

      topLevelNames = getTopLevelNames(poiFileSystem.getRoot());
    } catch (IOException e) {
      // POI could not parse the file, so its type is unknown
      topLevelNames = Collections.emptySet();
    } catch (RuntimeException e) {
      // Some other failure inside POI; treated the same way
      topLevelNames = Collections.emptySet();
    }
    return topLevelNames;
  }