/**
  * Hands the document over to the component parser selected for the given metadata and context.
  *
  * <p>Before parsing starts, the class name of the selected parser (or, for a {@link
  * ParserDecorator}, of the parser it wraps) is appended to the {@code X-Parsed-By} metadata
  * field.
  *
  * <p>{@link RuntimeException}s, and {@link IOException}s / {@link SAXException}s that are
  * unrelated to the given input stream and content handler, are wrapped into {@link
  * TikaException}s to better honor the {@link Parser} contract. Temporary resources created for
  * the stream are disposed of when the call returns, normally or not.
  */
 public void parse(
     InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
     throws IOException, SAXException, TikaException {
   final Parser delegate = getParser(metadata, context);
   final TemporaryResources resources = new TemporaryResources();
   try {
     final TikaInputStream wrappedStream = TikaInputStream.get(stream, resources);
     final TaggedContentHandler wrappedHandler =
         (handler == null) ? null : new TaggedContentHandler(handler);

     // Report the decorated parser rather than the decorator itself
     final Parser reported =
         (delegate instanceof ParserDecorator)
             ? ((ParserDecorator) delegate).getWrappedParser()
             : delegate;
     metadata.add("X-Parsed-By", reported.getClass().getName());

     try {
       delegate.parse(wrappedStream, wrappedHandler, metadata, context);
     } catch (RuntimeException e) {
       throw new TikaException("Unexpected RuntimeException from " + delegate, e);
     } catch (IOException e) {
       // Lets the tagged stream rethrow its own failures before we wrap the rest
       wrappedStream.throwIfCauseOf(e);
       throw new TikaException("TIKA-198: Illegal IOException from " + delegate, e);
     } catch (SAXException e) {
       // Same idea for the handler, which may be absent
       if (wrappedHandler != null) {
         wrappedHandler.throwIfCauseOf(e);
       }
       throw new TikaException("TIKA-237: Illegal SAXException from " + delegate, e);
     }
   } finally {
     resources.dispose();
   }
 }
Example #2
0
 /**
  * Detects the type of a zip-based document.
  *
  * <p>Only a {@link TikaInputStream} is inspected further (via the {@code TikaInputStream}
  * overload); any other stream is reported as a plain zip archive.
  */
 public MediaType detect(InputStream input, Metadata metadata) throws IOException {
   if (!TikaInputStream.isTikaInputStream(input)) {
     return MediaType.APPLICATION_ZIP;
   }
   return detect(TikaInputStream.get(input), metadata);
 }
Example #3
0
  /**
   * Detects the type of an OLE-based document.
   *
   * <p>Returns {@link MediaType#OCTET_STREAM} when no stream is given or the stream does not
   * start with the OLE header. Otherwise the decision is delegated to the name-based {@code
   * detect} overload, using the top level entry names when they can be obtained (only possible
   * for a {@link TikaInputStream}).
   */
  public MediaType detect(InputStream input, Metadata metadata) throws IOException {
    // Without a stream there is nothing to inspect
    if (input == null) {
      return MediaType.OCTET_STREAM;
    }

    // A TikaInputStream may already carry an opened NPOIFSFileSystem/DirectoryNode;
    // in that case the entry names come straight from its root
    final TikaInputStream tis = TikaInputStream.cast(input);
    Set<String> names = null;
    if (tis != null) {
      final Object opened = tis.getOpenContainer();
      if (opened instanceof NPOIFSFileSystem) {
        names = getTopLevelNames(((NPOIFSFileSystem) opened).getRoot());
      } else if (opened instanceof DirectoryNode) {
        names = getTopLevelNames((DirectoryNode) opened);
      }
    }

    if (names == null) {
      // Peek at the first eight bytes and compare them with the OLE header,
      // stopping at the first mismatch and always rewinding afterwards
      final int[] oleHeader = {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1};
      input.mark(8);
      try {
        for (int expected : oleHeader) {
          if (input.read() != expected) {
            return MediaType.OCTET_STREAM;
          }
        }
      } finally {
        input.reset();
      }

      // The exact type can only be worked out from a TikaInputStream:
      // look up the known top level entry names
      if (tis != null) {
        names = getTopLevelNames(tis);
      }
    }

    // Detect based on the names (as available), passing the root along when we have one
    final Object container = (tis == null) ? null : tis.getOpenContainer();
    if (container instanceof NPOIFSFileSystem) {
      return detect(names, ((NPOIFSFileSystem) container).getRoot());
    }
    return detect(names, null);
  }
Example #4
0
  /**
   * Detects the type of a zip-based document by looking at its entry names.
   *
   * <p>The first entry that matches decides the result:
   *
   * <ul>
   *   <li>{@code mimetype}: its UTF-8 content is used as the type (Open Document)
   *   <li>{@code _rels/.rels} or {@code [Content_Types].xml}: the package is opened with POI,
   *       stored on the stream as its open container, and the type is derived from the content
   *       type of the single core document part (Office Open XML)
   *   <li>{@code buildVersionHistory.plist}: the decision is delegated to {@code
   *       IWorkPackageParser.identifyType}
   *   <li>{@code META-INF/}: Java archive
   * </ul>
   *
   * If nothing matches, a plain zip archive is reported.
   *
   * @param input stream whose backing file is opened as a zip file
   * @param metadata not used by this method
   * @return the detected media type
   * @throws IOException if the zip file cannot be read, or an Office Open XML package is
   *     corrupted or does not have exactly one core document
   */
  public MediaType detect(TikaInputStream input, Metadata metadata) throws IOException {
    ZipFile zip = new ZipFile(input.getFile());
    try {
      for (ZipEntry entry : Collections.list(zip.entries())) {
        // Is it an Open Document file?
        if (entry.getName().equals("mimetype")) {
          InputStream stream = zip.getInputStream(entry);
          try {
            return fromString(IOUtils.toString(stream, "UTF-8"));
          } finally {
            stream.close();
          }
        } else if (entry.getName().equals("_rels/.rels")
            || entry.getName().equals("[Content_Types].xml")) {
          // Office Open XML File
          // Ask POI to open and investigate it for us
          try {
            OPCPackage pkg = OPCPackage.open(input.getFile().toString());
            // Kept open on purpose: handed to the stream as its open container
            input.setOpenContainer(pkg);

            PackageRelationshipCollection core =
                pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL);
            if (core.size() != 1) {
              throw new IOException(
                  "Invalid OOXML Package received - expected 1 core document, found "
                      + core.size());
            }

            // Get the type of the core document part
            PackagePart corePart = pkg.getPart(core.getRelationship(0));
            String coreType = corePart.getContentType();

            // Turn that into the type of the overall document
            String docType = coreType.substring(0, coreType.lastIndexOf('.'));
            return fromString(docType);
          } catch (InvalidFormatException e) {
            // Keep the original exception as the cause so the stack trace is not lost
            throw new IOException(
                "Office Open XML File detected, but corrupted - " + e.getMessage(), e);
          }
        } else if (entry.getName().equals("buildVersionHistory.plist")) {
          // This is an iWork document

          // Reset and ask
          zip.close();
          zip = new ZipFile(input.getFile());
          return IWorkPackageParser.identifyType(zip);
        } else if (entry.getName().equals("META-INF/")) {
          // Java Jar
          return MediaType.application("java-archive");
        }
      }

      return MediaType.APPLICATION_ZIP;
    } finally {
      // Release the file handle on every path; closes whichever ZipFile is current
      zip.close();
    }
  }
  /**
   * Returns a stream over the content behind the given URL.
   *
   * <p>The connection is not opened here; it is set up inside the {@code
   * initBeforeFirstStreamDataAccess()} callback of the returned stream's {@code
   * ShiftInitInputStream}. The request advertises gzip support, and a response whose {@code
   * Content-Encoding} is gzip (compared case-insensitively) is decompressed transparently.
   *
   * @param url2getStream the URL to read from
   * @param metadata not used by this method
   * @param parseContext not used by this method
   * @return a buffered stream over the (decompressed) response body
   * @throws Exception if the URL is malformed
   */
  @Override
  public TikaInputStream getStream(
      URLName url2getStream, Metadata metadata, ParseContext parseContext) throws Exception {
    final URL asUrl = new URL(url2getStream.toString());

    return TikaInputStream.get(
        new ShiftInitInputStream() {
          @Override
          protected InputStream initBeforeFirstStreamDataAccess() throws Exception {
            URLConnection connection = asUrl.openConnection();

            connection.setConnectTimeout(connectTimeout);
            connection.setReadTimeout(readTimeout);
            connection.setRequestProperty("Accept-Encoding", "gzip");

            InputStream rawStream = connection.getInputStream();

            // equalsIgnoreCase is locale independent, unlike toLowerCase(), which
            // turns "GZIP" into "gz\u0131p" under a Turkish default locale
            String contentEncoding = connection.getHeaderField("Content-Encoding");
            boolean gzipped =
                contentEncoding != null && "gzip".equalsIgnoreCase(contentEncoding.trim());

            if (!gzipped) {
              return new BufferedInputStream(rawStream);
            }

            try {
              return new BufferedInputStream(new GZIPInputStream(rawStream));
            } catch (Exception e) {
              // The GZIP header could not be read: do not leak the connection stream
              try {
                rawStream.close();
              } catch (Exception ignored) {
                // best effort; the original failure is the one worth reporting
              }
              throw e;
            }
          }
        });
  }
Example #6
0
  /**
   * Parses an ODT test document obtained through a resource URL and checks that the stream is
   * file backed, that the OpenDocument text type is reported and that known text is extracted.
   */
  @Test
  public void testFromFile() throws Exception {
    try (TikaInputStream odtStream =
        TikaInputStream.get(this.getClass().getResource("/test-documents/testODFwithOOo3.odt"))) {
      // The stream obtained from the resource URL is expected to be file backed
      assertEquals(true, odtStream.hasFile());

      Metadata extracted = new Metadata();
      ContentHandler body = new BodyContentHandler();
      new OpenDocumentParser().parse(odtStream, body, extracted, new ParseContext());

      assertEquals(
          "application/vnd.oasis.opendocument.text", extracted.get(Metadata.CONTENT_TYPE));
      assertContains("Tika is part of the Lucene project.", body.toString());
    }
  }
  /**
   * Renders the current page to a temporary image file and feeds it to Tesseract, writing the
   * recognised content inline into {@code xhtml}.
   *
   * <p>Does nothing when the configured OCR strategy is {@code NO_OCR}.
   *
   * @throws TikaException if Tesseract is not available
   * @throws IOException if writing the OCR content fails with a {@link SAXException}; other
   *     {@link IOException}s are passed to {@code handleCatchableIOE}
   */
  void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
    if (config.getOCRStrategy().equals(NO_OCR)) {
      return;
    }

    final TesseractOCRConfig ocrConfig =
        context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
    final TesseractOCRParser ocrParser = new TesseractOCRParser();
    if (!ocrParser.hasTesseract(ocrConfig)) {
      throw new TikaException(
          "Tesseract is not available. "
              + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
    }

    final PDFRenderer pageRenderer = new PDFRenderer(pdDocument);
    final TemporaryResources scratch = new TemporaryResources();
    try {
      // Render the page, then persist it so the OCR parser can read it back
      final BufferedImage rendered =
          pageRenderer.renderImage(pageIndex, 2.0f, config.getOCRImageType());
      final Path imageFile = scratch.createTempFile();
      try (OutputStream out = Files.newOutputStream(imageFile)) {
        // TODO: get output format from TesseractConfig
        ImageIOUtil.writeImage(rendered, config.getOCRImageFormatName(), out, config.getOCRDPI());
      }
      try (InputStream in = TikaInputStream.get(imageFile)) {
        ocrParser.parseInline(in, xhtml, ocrConfig);
      }
    } catch (IOException e) {
      handleCatchableIOE(e);
    } catch (SAXException e) {
      throw new IOExceptionWithCause("error writing OCR content from PDF", e);
    } finally {
      // Removes the temporary image file
      scratch.dispose();
    }
  }
Example #8
0
  /**
   * Opens the stream's backing file as a POI file system and returns the names of its top level
   * entries.
   *
   * <p>The opened file system is stored on the stream as its open container so a later parse can
   * reuse it. If POI fails with an {@link IOException} or {@link RuntimeException}, an empty set
   * is returned instead.
   *
   * @throws IOException if the backing file of the stream cannot be obtained
   */
  private static Set<String> getTopLevelNames(TikaInputStream stream) throws IOException {
    // Work from the (possibly temporary) file behind the stream so that
    // the current position of the stream itself is left untouched
    final File backingFile = stream.getFile();

    final NPOIFSFileSystem poifs;
    try {
      poifs = new NPOIFSFileSystem(backingFile, true);

      // Keep a reference to the opened POI file system on the stream
      // to speed up a possible later parsing process
      stream.setOpenContainer(poifs);

      return getTopLevelNames(poifs.getRoot());
    } catch (IOException e) {
      // POI could not parse the file, so the type stays unknown
      return Collections.emptySet();
    } catch (RuntimeException e) {
      // Some other problem inside POI
      return Collections.emptySet();
    }
  }
  /**
   * Determines the content type of a file from its content and title.
   *
   * <p>If the file cannot be found, the lookup falls back to the title alone. The stream opened
   * on the file is always cleaned up.
   */
  public String getContentType(File file, String title) {
    InputStream fileStream = null;

    try {
      fileStream = TikaInputStream.get(file);

      return getContentType(fileStream, title);
    } catch (FileNotFoundException fnfe) {
      // No content to inspect: decide from the title only
      return getContentType(title);
    } finally {
      StreamUtil.cleanUp(fileStream);
    }
  }
Example #10
0
    /**
     * Parses the given stream with the parser detected for the record and passes the resulting
     * Solr document, converted back to a record, to the child command.
     *
     * <p>All record fields are copied into the parse metadata first. When an XPath expression is
     * configured, only matching content reaches the Solr content handler. The stream is closed
     * quietly once parsing is over.
     *
     * @return {@code false} if no parser could be detected, otherwise the result of the child
     * @throws MorphlineRuntimeException if parsing fails
     */
    @Override
    protected boolean doProcess(Record record, InputStream inputStream) {
      final Parser detectedParser = detectParser(record);
      if (detectedParser == null) {
        return false;
      }

      final ParseContext parseContext = new ParseContext();
      parseContext.set(Locale.class, locale);

      // Expose every record field to the parser as metadata
      final Metadata metadata = new Metadata();
      for (Entry<String, Object> field : record.getFields().entries()) {
        metadata.add(field.getKey(), field.getValue().toString());
      }

      final SolrContentHandler solrHandler =
          solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);

      // Whatever stream we end up holding is the one that gets closed
      InputStream toClose = inputStream;
      try {
        final InputStream tikaStream = TikaInputStream.get(inputStream);
        toClose = tikaStream;

        // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()";
        final ContentHandler parsingHandler =
            (xpathExpr == null)
                ? solrHandler
                : new MatchingContentHandler(solrHandler, PARSER.parse(xpathExpr));

        try {
          detectedParser.parse(tikaStream, parsingHandler, metadata, parseContext);
        } catch (IOException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (SAXException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (TikaException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        }
      } finally {
        if (toClose != null) {
          Closeables.closeQuietly(toClose);
        }
      }

      final SolrInputDocument solrDoc = solrHandler.newDocument();
      LOG.debug("solr doc: {}", solrDoc);
      return getChild().process(toRecord(solrDoc));
    }
Example #11
0
  /**
   * Parses the {@code testNPEOpenDocument.odt} test document from a resource URL and checks the
   * reported content type and a known phrase of the extracted text.
   */
  @Test
  public void testNPEFromFile() throws Exception {
    OpenDocumentParser odfParser = new OpenDocumentParser();
    try (TikaInputStream odtStream =
        TikaInputStream.get(
            this.getClass().getResource("/test-documents/testNPEOpenDocument.odt"))) {
      Metadata extracted = new Metadata();
      ContentHandler body = new BodyContentHandler();
      odfParser.parse(odtStream, body, extracted, new ParseContext());

      assertEquals(
          "application/vnd.oasis.opendocument.text", extracted.get(Metadata.CONTENT_TYPE));
      assertContains("primero hay que generar un par de claves", body.toString());
    }
  }
Example #12
0
  /**
   * Passes a file embedded in the PDF to the embedded document extractor and, afterwards, emits
   * an empty {@code div} of class {@code embedded} whose id is the file name.
   *
   * <p>A {@code null} file is skipped silently. When no file name is given, the display name is
   * used instead. Nothing is emitted if the extractor declines to parse the attachment.
   *
   * @param displayName fallback for a missing {@code fileName}
   * @param unicodeFileName not used by this method
   * @param fileName name recorded in the metadata and used as the element id
   * @param file the embedded file, may be {@code null}
   * @param extractor decides whether to parse the attachment, and parses it
   */
  private void extractPDEmbeddedFile(
      String displayName,
      String unicodeFileName,
      String fileName,
      PDEmbeddedFile file,
      EmbeddedDocumentExtractor extractor)
      throws SAXException, IOException, TikaException {

    if (file == null) {
      // skip silently
      return;
    }

    if (fileName == null) {
      fileName = displayName;
    }

    // TODO: other metadata?
    final Metadata attachmentMetadata = new Metadata();
    attachmentMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
    attachmentMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
    attachmentMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
    attachmentMetadata.set(
        TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
        TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
    attachmentMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);

    if (!extractor.shouldParseEmbedded(attachmentMetadata)) {
      return;
    }

    TikaInputStream embedded = null;
    try {
      embedded = TikaInputStream.get(file.createInputStream());
      extractor.parseEmbedded(
          embedded, new EmbeddedContentHandler(xhtml), attachmentMetadata, false);

      // Marker element for the attachment in the output
      final AttributesImpl divAttributes = new AttributesImpl();
      divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
      divAttributes.addAttribute("", "id", "id", "CDATA", fileName);
      xhtml.startElement("div", divAttributes);
      xhtml.endElement("div");
    } finally {
      IOUtils.closeQuietly(embedded);
    }
  }
  /**
   * Detects the media type of the buffered stream, using the file name as a hint.
   *
   * <p>The stream is shielded from being closed by the detector and is reset afterwards to make
   * up for anything the detector read.
   *
   * @return the detected type, or {@code null} if detection failed with an {@link IOException}
   *     (the warning that is logged mentions XML, but no type is returned in that case)
   * @throws IOException if resetting the buffered stream fails
   */
  protected MediaType getMediaType(BufferedInputStream inputStream, String fileName)
      throws IOException {
    final TikaInputStream detectionStream =
        TikaInputStream.get(new CloseShieldInputStream(inputStream));
    try {
      final Detector defaultDetector = new DefaultDetector();
      final Metadata hints = new Metadata();
      hints.set(Metadata.RESOURCE_NAME_KEY, fileName);

      final MediaType detected = defaultDetector.detect(detectionStream, hints);
      logger.debug("Determined '{}' for '{}'", detected, fileName);
      return detected;
    } catch (IOException e) {
      logger.warn("Failed to determine media type for '" + fileName + "' assuming XML", e);
      return null;
    } finally {
      IOUtils.closeQuietly(detectionStream);

      // Rewind the buffered stream past whatever the detector consumed
      inputStream.reset();
    }
  }
  /**
   * Determines the content type from the stream content, using the file name as a hint.
   *
   * <p>A detected type containing {@code "tika"} is treated as invalid: the lookup is retried
   * with the file name alone, and if that result is invalid as well, or anything throws, {@code
   * ContentTypes.APPLICATION_OCTET_STREAM} is returned. The same default is returned when there
   * is no stream and {@code Validator.isNull(fileName)} holds.
   */
  public String getContentType(InputStream inputStream, String fileName) {
    if ((inputStream == null) && Validator.isNull(fileName)) {
      return ContentTypes.APPLICATION_OCTET_STREAM;
    }

    try {
      Metadata metadata = new Metadata();

      metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);

      MediaType mediaType = _detector.detect(TikaInputStream.get(inputStream), metadata);

      String detected = mediaType.toString();

      // First attempt was not usable: ask again by file name only
      if (detected.contains("tika")) {
        if (_log.isDebugEnabled()) {
          _log.debug("Retrieved invalid content type " + detected);
        }

        detected = getContentType(fileName);
      }

      // Still not usable: give up and report the generic type
      if (detected.contains("tika")) {
        if (_log.isDebugEnabled()) {
          _log.debug("Retrieved invalid content type " + detected);
        }

        return ContentTypes.APPLICATION_OCTET_STREAM;
      }

      return detected;
    } catch (Exception e) {
      _log.error(e, e);

      return ContentTypes.APPLICATION_OCTET_STREAM;
    }
  }
Example #15
0
 /**
  * Guesses the charset of the file from the bytes of the character ({@code C}) columns of the
  * given rows.
  *
  * <p>Bytes are collected until more than {@code MAX_CHARS_FOR_CHARSET_DETECTION} have been
  * gathered. Detection is only attempted when more than 20 bytes are available; otherwise, or
  * when the detector returns no result, {@code DEFAULT_CHARSET} is returned.
  *
  * @param firstRows rows to sample
  * @param header currently unused (see TODO)
  * @return the detected charset, or {@code DEFAULT_CHARSET}
  */
 private Charset getCharset(List<DBFRow> firstRows, DBFFileHeader header)
     throws IOException, TikaException {
   // TODO: potentially use codepage info in the header
   ByteArrayOutputStream sample = new ByteArrayOutputStream();
   collect:
   for (DBFRow row : firstRows) {
     for (DBFCell cell : row.cells) {
       if (cell.getColType().equals(DBFColumnHeader.ColType.C)) {
         sample.write(cell.getBytes());
         if (sample.size() > MAX_CHARS_FOR_CHARSET_DETECTION) {
           // Enough data: stop sampling altogether, not just for the current row
           break collect;
         }
       }
     }
   }

   byte[] bytes = sample.toByteArray();
   if (bytes.length <= 20) {
     // Too little text for a meaningful guess
     return DEFAULT_CHARSET;
   }

   // A single detection pass is enough; a second, discarded one only cost time
   // and left a TikaInputStream unclosed
   EncodingDetector detector = new Icu4jEncodingDetector();
   Charset detected = detector.detect(new ByteArrayInputStream(bytes), new Metadata());
   return (detected != null) ? detected : DEFAULT_CHARSET;
 }
Example #16
0
  /**
   * Gets the content and defers to registered viewers to generate the markup.
   *
   * <p>The request is handled in these steps:
   *
   * <ol>
   *   <li>Look up the {@code ContentAccess} stored in the servlet context and build the content
   *       request; a missing data URI, a failed content lookup or an invalid {@code mode}
   *       parameter each forward to the {@code /message} page of the {@code /nifi} context.
   *   <li>If the content type is unknown ({@code null} or octet stream), detect it with Tika
   *       from the buffered content and the file name.
   *   <li>Include the header JSP, then either the hex view (for {@code DisplayMode.Hex}), the
   *       "no viewer" JSP, or the {@code /view-content} resource of the viewer context that is
   *       registered for the mime type as a servlet context init parameter.
   *   <li>Include the footer JSP.
   * </ol>
   *
   * @param request servlet request
   * @param response servlet response
   * @throws ServletException if a servlet-specific error occurs
   * @throws IOException if an I/O error occurs
   */
  @Override
  protected void doGet(final HttpServletRequest request, final HttpServletResponse response)
      throws ServletException, IOException {
    // specify the charset in a response header
    response.addHeader("Content-Type", "text/html; charset=UTF-8");

    // get the content
    final ServletContext servletContext = request.getServletContext();
    final ContentAccess contentAccess =
        (ContentAccess) servletContext.getAttribute("nifi-content-access");

    final ContentRequestContext contentRequest = getContentRequest(request);
    if (contentRequest.getDataUri() == null) {
      request.setAttribute("title", "Error");
      request.setAttribute("messages", "The data reference must be specified.");

      // forward to the error page
      final ServletContext viewerContext = servletContext.getContext("/nifi");
      viewerContext.getRequestDispatcher("/message").forward(request, response);
      return;
    }

    // get the content
    final DownloadableContent downloadableContent;
    try {
      downloadableContent = contentAccess.getContent(contentRequest);
    } catch (final ResourceNotFoundException rnfe) {
      request.setAttribute("title", "Error");
      request.setAttribute("messages", "Unable to find the specified content");

      // forward to the error page
      final ServletContext viewerContext = servletContext.getContext("/nifi");
      viewerContext.getRequestDispatcher("/message").forward(request, response);
      return;
    } catch (final AccessDeniedException ade) {
      request.setAttribute("title", "Acess Denied");
      request.setAttribute(
          "messages", "Unable to approve access to the specified content: " + ade.getMessage());

      // forward to the error page
      final ServletContext viewerContext = servletContext.getContext("/nifi");
      viewerContext.getRequestDispatcher("/message").forward(request, response);
      return;
    } catch (final Exception e) {
      // catch-all: any other failure is reported to the user the same way
      request.setAttribute("title", "Error");
      request.setAttribute("messages", "An unexcepted error has occurred: " + e.getMessage());

      // forward to the error page
      final ServletContext viewerContext = servletContext.getContext("/nifi");
      viewerContext.getRequestDispatcher("/message").forward(request, response);
      return;
    }

    // determine how we want to view the data
    String mode = request.getParameter("mode");

    // if the name isn't set, use original
    if (mode == null) {
      mode = DisplayMode.Original.name();
    }

    // determine the display mode
    final DisplayMode displayMode;
    try {
      displayMode = DisplayMode.valueOf(mode);
    } catch (final IllegalArgumentException iae) {
      // valueOf rejects any name that is not a DisplayMode constant
      request.setAttribute("title", "Error");
      request.setAttribute("messages", "Invalid display mode: " + mode);

      // forward to the error page
      final ServletContext viewerContext = servletContext.getContext("/nifi");
      viewerContext.getRequestDispatcher("/message").forward(request, response);
      return;
    }

    // buffer the content to support reseting in case we need to detect the content type or char
    // encoding
    try (final BufferedInputStream bis =
        new BufferedInputStream(downloadableContent.getContent()); ) {
      final String mimeType;

      // when standalone and we don't know the type is null as we were able to directly access the
      // content bypassing the rest endpoint,
      // when clustered and we don't know the type set to octet stream since the content was
      // retrieved from the node's rest endpoint
      if (downloadableContent.getType() == null
          || downloadableContent.getType().equals(MediaType.OCTET_STREAM.toString())) {
        // attempt to detect the content stream if we don't know what it is ()
        final DefaultDetector detector = new DefaultDetector();

        // create the stream for tika to process, buffered to support reseting
        // NOTE(review): tikaStream is never closed in this method; presumably deliberate,
        // since bis is still read further down - confirm
        final TikaInputStream tikaStream = TikaInputStream.get(bis);

        // provide a hint based on the filename
        final Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, downloadableContent.getFilename());

        // Get mime type
        final MediaType mediatype = detector.detect(tikaStream, metadata);
        mimeType = mediatype.toString();
      } else {
        mimeType = downloadableContent.getType();
      }

      // add attributes needed for the header
      request.setAttribute("filename", downloadableContent.getFilename());
      request.setAttribute("contentType", mimeType);

      // generate the header
      request.getRequestDispatcher("/WEB-INF/jsp/header.jsp").include(request, response);

      // remove the attributes needed for the header
      request.removeAttribute("filename");
      request.removeAttribute("contentType");

      // generate the markup for the content based on the display mode
      if (DisplayMode.Hex.equals(displayMode)) {
        // the hex view shows at most BUFFER_LENGTH bytes of the content
        final byte[] buffer = new byte[BUFFER_LENGTH];
        final int read = StreamUtils.fillBuffer(bis, buffer, false);

        // trim the byte array if necessary
        byte[] bytes = buffer;
        if (read != buffer.length) {
          bytes = new byte[read];
          System.arraycopy(buffer, 0, bytes, 0, read);
        }

        // convert bytes into the base 64 bytes
        final String base64 = Base64.encodeBase64String(bytes);

        // defer to the jsp
        request.setAttribute("content", base64);
        request.getRequestDispatcher("/WEB-INF/jsp/hexview.jsp").include(request, response);
      } else {
        // lookup a viewer for the content
        // (the init parameter named after the mime type holds the viewer's context path)
        final String contentViewerUri = servletContext.getInitParameter(mimeType);

        // handle no viewer for content type
        if (contentViewerUri == null) {
          request.getRequestDispatcher("/WEB-INF/jsp/no-viewer.jsp").include(request, response);
        } else {
          // create a request attribute for accessing the content
          request.setAttribute(
              ViewableContent.CONTENT_REQUEST_ATTRIBUTE,
              new ViewableContent() {
                @Override
                public InputStream getContentStream() {
                  return bis;
                }

                @Override
                public String getContent() throws IOException {
                  // detect the charset
                  final CharsetDetector detector = new CharsetDetector();
                  detector.setText(bis);
                  detector.enableInputFilter(true);
                  final CharsetMatch match = detector.detect();

                  // ensure we were able to detect the charset
                  if (match == null) {
                    throw new IOException("Unable to detect character encoding.");
                  }

                  // convert the stream using the detected charset
                  return IOUtils.toString(bis, match.getName());
                }

                @Override
                public ViewableContent.DisplayMode getDisplayMode() {
                  return displayMode;
                }

                @Override
                public String getFileName() {
                  return downloadableContent.getFilename();
                }

                @Override
                public String getContentType() {
                  return mimeType;
                }
              });

          try {
            // generate the content
            final ServletContext viewerContext = servletContext.getContext(contentViewerUri);
            viewerContext.getRequestDispatcher("/view-content").include(request, response);
          } catch (final Exception e) {
            String message = e.getMessage() != null ? e.getMessage() : e.toString();
            message = "Unable to generate view of data: " + message;

            // log the error
            // (the stack trace is only written when debug logging is enabled)
            logger.error(message);
            if (logger.isDebugEnabled()) {
              logger.error(StringUtils.EMPTY, e);
            }

            // populate the request attributes
            request.setAttribute("title", "Error");
            request.setAttribute("messages", message);

            // forward to the error page
            final ServletContext viewerContext = servletContext.getContext("/nifi");
            viewerContext.getRequestDispatcher("/message").forward(request, response);
            return;
          }

          // remove the request attribute
          request.removeAttribute(ViewableContent.CONTENT_REQUEST_ATTRIBUTE);
        }
      }

      // generate footer
      request.getRequestDispatcher("/WEB-INF/jsp/footer.jsp").include(request, response);
    }
  }
Example #17
0
    /**
     * Saves an embedded document to a file below the extraction directory.
     *
     * <p>The file name is taken from the resource name in the metadata, or generated from a
     * running counter when absent. A name without a dot gets the extension registered for the
     * detected media type, and the embedded relationship id, if any, is prepended. When the
     * stream is a {@link TikaInputStream} whose open container is a {@code DirectoryEntry}, that
     * directory is copied into a new POIFS file system and written out; otherwise the stream
     * bytes are copied as they are.
     *
     * <p>Failures while writing the file are logged and ignored.
     *
     * @param inputStream content of the embedded document
     * @param contentHandler not used by this method
     * @param metadata metadata of the embedded document
     * @param outputHtml not used by this method
     * @throws IOException if the target would lie outside the extraction directory, if its parent
     *     directory cannot be created, or if closing the output file fails
     */
    public void parseEmbedded(
        InputStream inputStream,
        ContentHandler contentHandler,
        Metadata metadata,
        boolean outputHtml)
        throws SAXException, IOException {
      String name = metadata.get(Metadata.RESOURCE_NAME_KEY);

      if (name == null) {
        name = "file" + count++;
      }

      MediaType contentType = detector.detect(inputStream, metadata);

      if (name.indexOf('.') == -1 && contentType != null) {
        try {
          name += config.getMimeRepository().forName(contentType.toString()).getExtension();
        } catch (MimeTypeException e) {
          // Not fatal: the file is simply saved without an extension
          logger.warn("Unable to determine a file extension for " + contentType, e);
        }
      }

      String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
      if (relID != null && !name.startsWith(relID)) {
        name = relID + "_" + name;
      }

      File outputFile = new File(extractDir, name);

      // The name comes from the document being parsed, so it is untrusted: refuse
      // anything that would resolve outside of the extraction directory ("../..").
      String extractRoot = extractDir.getCanonicalPath();
      if (!extractRoot.endsWith(File.separator)) {
        extractRoot += File.separator;
      }
      if (!outputFile.getCanonicalPath().startsWith(extractRoot)) {
        throw new IOException(
            "refusing to extract \"" + name + "\" outside of \"" + extractDir + "\"");
      }

      File parent = outputFile.getParentFile();
      if (!parent.exists()) {
        if (!parent.mkdirs()) {
          throw new IOException("unable to create directory \"" + parent + "\"");
        }
      }
      System.out.println("Extracting '" + name + "' (" + contentType + ") to " + outputFile);

      FileOutputStream os = null;

      try {
        os = new FileOutputStream(outputFile);

        if (inputStream instanceof TikaInputStream) {
          TikaInputStream tin = (TikaInputStream) inputStream;

          if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) {
            // Embedded OLE document: rebuild it as a standalone POIFS file
            POIFSFileSystem fs = new POIFSFileSystem();
            copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
            fs.writeFilesystem(os);
          } else {
            IOUtils.copy(inputStream, os);
          }
        } else {
          IOUtils.copy(inputStream, os);
        }
      } catch (Exception e) {
        logger.warn("Ignoring unexpected exception trying to save embedded file " + name, e);
      } finally {
        if (os != null) {
          os.close();
        }
      }
    }
Example #18
0
 public void process(String arg) throws Exception {
   if (arg.equals("-?") || arg.equals("--help")) {
     pipeMode = false;
     usage();
   } else if (arg.equals("-V") || arg.equals("--version")) {
     pipeMode = false;
     version();
   } else if (arg.equals("-v") || arg.equals("--verbose")) {
     //            Logger.getRootLogger().setLevel(Level.DEBUG);
   } else if (arg.equals("-g") || arg.equals("--gui")) {
     pipeMode = false;
     TikaGUI.main(new String[0]);
   } else if (arg.equals("--list-parser") || arg.equals("--list-parsers")) {
     pipeMode = false;
     displayParsers(false);
   } else if (arg.equals("--list-detector") || arg.equals("--list-detectors")) {
     pipeMode = false;
     displayDetectors();
   } else if (arg.equals("--list-parser-detail") || arg.equals("--list-parser-details")) {
     pipeMode = false;
     displayParsers(true);
   } else if (arg.equals("--list-met-models")) {
     pipeMode = false;
     displayMetModels();
   } else if (arg.equals("--list-supported-types")) {
     pipeMode = false;
     displaySupportedTypes();
   } else if (arg.equals("--container-aware") || arg.equals("--container-aware-detector")) {
     // ignore, as container-aware detectors are now always used
   } else if (arg.equals("-f") || arg.equals("--fork")) {
     fork = true;
   } else if (arg.startsWith("-e")) {
     encoding = arg.substring("-e".length());
   } else if (arg.startsWith("--encoding=")) {
     encoding = arg.substring("--encoding=".length());
   } else if (arg.startsWith("-p") && !arg.equals("-p")) {
     password = arg.substring("-p".length());
   } else if (arg.startsWith("--password="******"--password="******"-j") || arg.equals("--json")) {
     type = JSON;
   } else if (arg.equals("-y") || arg.equals("--xmp")) {
     type = XMP;
   } else if (arg.equals("-x") || arg.equals("--xml")) {
     type = XML;
   } else if (arg.equals("-h") || arg.equals("--html")) {
     type = HTML;
   } else if (arg.equals("-t") || arg.equals("--text")) {
     type = TEXT;
   } else if (arg.equals("-T") || arg.equals("--text-main")) {
     type = TEXT_MAIN;
   } else if (arg.equals("-m") || arg.equals("--metadata")) {
     type = METADATA;
   } else if (arg.equals("-l") || arg.equals("--language")) {
     type = LANGUAGE;
   } else if (arg.equals("-d") || arg.equals("--detect")) {
     type = DETECT;
   } else if (arg.startsWith("--extract-dir=")) {
     extractDir = new File(arg.substring("--extract-dir=".length()));
   } else if (arg.equals("-z") || arg.equals("--extract")) {
     type = NO_OUTPUT;
     context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
   } else if (arg.equals("-r") || arg.equals("--pretty-print")) {
     prettyPrint = true;
   } else if (arg.equals("-p")
       || arg.equals("--port")
       || arg.equals("-s")
       || arg.equals("--server")) {
     serverMode = true;
     pipeMode = false;
   } else if (arg.startsWith("-c")) {
     URI uri = new URI(arg.substring("-c".length()));
     parser = new NetworkParser(uri);
   } else if (arg.startsWith("--client=")) {
     URI uri = new URI(arg.substring("--client=".length()));
     parser = new NetworkParser(uri);
   } else if (arg.startsWith("--create-profile=")) {
     profileName = arg.substring("--create-profile=".length());
     type = CREATE_PROFILE;
   } else {
     pipeMode = false;
     if (serverMode) {
       new TikaServer(Integer.parseInt(arg)).start();
     } else if (arg.equals("-")) {
       InputStream stream = TikaInputStream.get(new CloseShieldInputStream(System.in));
       try {
         type.process(stream, System.out, new Metadata());
       } finally {
         stream.close();
       }
     } else {
       URL url;
       File file = new File(arg);
       if (file.isFile()) {
         url = file.toURI().toURL();
       } else {
         url = new URL(arg);
       }
       Metadata metadata = new Metadata();
       InputStream input = TikaInputStream.get(url, metadata);
       try {
         type.process(input, System.out, metadata);
       } finally {
         input.close();
         System.out.flush();
       }
     }
   }
 }
  // Extracts the payload of an embedded OLE (POIFS) object.
  //
  // - A "Package" entry is returned as its raw bytes.
  // - An OLE10Native object is unwrapped to its data buffer; an invalid record is skipped.
  // - A CompObj object yields the bytes of its "CONTENTS" (or "Contents") entry.
  // - Anything else is returned as the complete stream (after resetting it), and the
  //   metadata receives a generated file name and the detected content type.
  //
  // will throw IOException if not actually POIFS
  // can return null byte[]
  private byte[] handleEmbeddedPOIFS(
      InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException {

    NPOIFSFileSystem fs = null;
    byte[] ret = null;
    try {

      fs = new NPOIFSFileSystem(is);

      DirectoryNode root = fs.getRoot();

      if (root == null) {
        return ret;
      }

      if (root.hasEntry("Package")) {
        Entry ooxml = root.getEntry("Package");
        TikaInputStream stream =
            TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml));
        try {
          ByteArrayOutputStream out = new ByteArrayOutputStream();

          IOUtils.copy(stream, out);
          ret = out.toByteArray();
        } finally {
          // Close the wrapper (and the document stream behind it) once copied
          stream.close();
        }
      } else {
        // try poifs
        POIFSDocumentType type = POIFSDocumentType.detectType(root);
        if (type == POIFSDocumentType.OLE10_NATIVE) {
          try {
            // Try to un-wrap the OLE10Native record:
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
            ret = ole.getDataBuffer();
          } catch (Ole10NativeException ex) {
            // Not a valid OLE10Native record, skip it
          }
        } else if (type == POIFSDocumentType.COMP_OBJ) {

          // The entry name is looked up in both spellings
          DocumentEntry contentsEntry;
          try {
            contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
          } catch (FileNotFoundException ioe) {
            contentsEntry = (DocumentEntry) root.getEntry("Contents");
          }

          DocumentInputStream inp = null;
          try {
            inp = new DocumentInputStream(contentsEntry);
            ret = new byte[contentsEntry.getSize()];
            inp.readFully(ret);
          } finally {
            if (inp != null) {
              inp.close();
            }
          }
        } else {

          // Unknown kind of object: hand back the whole stream as is
          ByteArrayOutputStream out = new ByteArrayOutputStream();
          is.reset();
          IOUtils.copy(is, out);
          ret = out.toByteArray();
          metadata.set(
              Metadata.RESOURCE_NAME_KEY,
              "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
          metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
        }
      }
    } finally {
      if (fs != null) {
        fs.close();
      }
    }
    return ret;
  }