Beispiel #1
0
  /**
   * Walks the zip container entry by entry and dispatches the parts of interest.
   *
   * <ul>
   *   <li>{@code mimetype}: its text is stored as the content type in {@code metadata};</li>
   *   <li>{@code meta.xml}: handed to the {@code meta} parser with a throw-away handler;</li>
   *   <li>any entry whose name ends in {@code content.xml}: handed to the {@code content}
   *       parser, writing to the caller's handler through a shield.</li>
   * </ul>
   *
   * <p>The order of entries in the archive is not known up front, so the end-of-document
   * call is held back by the shielding handler and only replayed once every entry has
   * been visited.
   *
   * @param stream zip-formatted input; it is wrapped but not closed here
   * @param baseHandler receiver of the content events
   * @param metadata metadata object that is populated while parsing
   * @param context parse context passed through to the delegate parsers
   * @throws IOException if reading the archive fails
   * @throws SAXException if a delegate parser reports a SAX problem
   * @throws TikaException if a delegate parser fails
   */
  public void parse(
      InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {

    // Hold back endDocument until the whole archive has been processed
    EndDocumentShieldingContentHandler shieldedHandler =
        new EndDocumentShieldingContentHandler(baseHandler);

    ZipInputStream archive = new ZipInputStream(stream);
    for (ZipEntry item = archive.getNextEntry(); item != null; item = archive.getNextEntry()) {
      String entryName = item.getName();

      if (entryName.equals("mimetype")) {
        // The entry body is the media type itself
        metadata.set(Metadata.CONTENT_TYPE, IOUtils.toString(archive, "UTF-8"));
        continue;
      }

      if (entryName.equals("meta.xml")) {
        // Only the metadata side effects matter, so the SAX events are discarded
        meta.parse(archive, new DefaultHandler(), metadata, context);
        continue;
      }

      if (entryName.endsWith("content.xml")) {
        content.parse(archive, shieldedHandler, metadata, context);
      }
    }

    // Replay the deferred endDocument, if the content parser issued one
    if (shieldedHandler.getEndDocumentWasCalled()) {
      shieldedHandler.reallyEndDocument();
    }
  }
Beispiel #2
0
  /**
   * Detects the media type of a zip-based container from the names of its entries.
   *
   * <p>Entries are inspected in archive order and the first recognised marker wins:
   *
   * <ul>
   *   <li>{@code mimetype}: the entry's text is converted with {@code fromString};</li>
   *   <li>{@code _rels/.rels} or {@code [Content_Types].xml}: the file is opened as an OPC
   *       package, registered on {@code input} as the open container, and the type is derived
   *       from the content type of its single core document part;</li>
   *   <li>{@code buildVersionHistory.plist}: the decision is delegated to
   *       {@code IWorkPackageParser.identifyType};</li>
   *   <li>{@code META-INF/}: reported as {@code application/java-archive}.</li>
   * </ul>
   *
   * <p>If no marker is found the generic zip type is returned. The zip handle opened for the
   * scan is always closed before this method returns or throws.
   *
   * @param input the stream to inspect; its backing file is opened as a zip archive
   * @param metadata not consulted by this method
   * @return the detected media type, or {@code MediaType.APPLICATION_ZIP} as a fallback
   * @throws IOException if the archive cannot be read, or if an OOXML package is corrupted or
   *     does not contain exactly one core document
   */
  public MediaType detect(TikaInputStream input, Metadata metadata) throws IOException {
    ZipFile zip = new ZipFile(input.getFile());
    try {
      for (ZipEntry entry : Collections.list(zip.entries())) {
        String name = entry.getName();

        // Is it an Open Document file?
        if (name.equals("mimetype")) {
          InputStream stream = zip.getInputStream(entry);
          try {
            return fromString(IOUtils.toString(stream, "UTF-8"));
          } finally {
            stream.close();
          }
        } else if (name.equals("_rels/.rels") || name.equals("[Content_Types].xml")) {
          // Office Open XML File
          // Ask POI to open and investigate it for us
          try {
            OPCPackage pkg = OPCPackage.open(input.getFile().toString());
            // Ownership of pkg passes to input here, so it is not closed in this method
            input.setOpenContainer(pkg);

            PackageRelationshipCollection core =
                pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL);
            if (core.size() != 1) {
              throw new IOException(
                  "Invalid OOXML Package received - expected 1 core document, found "
                      + core.size());
            }

            // Get the type of the core document part
            PackagePart corePart = pkg.getPart(core.getRelationship(0));
            String coreType = corePart.getContentType();

            // Turn that into the type of the overall document by dropping the last
            // dot-separated segment of the part's content type
            String docType = coreType.substring(0, coreType.lastIndexOf('.'));
            return fromString(docType);
          } catch (InvalidFormatException e) {
            // Keep the original failure attached so the root cause is not lost
            IOException failure =
                new IOException("Office Open XML File detected, but corrupted - " + e.getMessage());
            failure.initCause(e);
            throw failure;
          }
        } else if (name.equals("buildVersionHistory.plist")) {
          // This is an iWork document

          // Reset and ask; the fresh handle is released by the finally block below
          // (closing twice is harmless should the helper close it as well)
          zip.close();
          zip = new ZipFile(input.getFile());
          return IWorkPackageParser.identifyType(zip);
        } else if (name.equals("META-INF/")) {
          // Java Jar
          return MediaType.application("java-archive");
        }
      }

      return MediaType.APPLICATION_ZIP;
    } finally {
      // Release the file handle on every exit path, normal or exceptional
      zip.close();
    }
  }