/**
 * Returns a {@link TikaInputStream} over the content behind the given URL.
 *
 * <p>The URL connection is opened inside {@code initBeforeFirstStreamDataAccess()}, i.e. by the
 * wrapping {@code ShiftInitInputStream} rather than by this method itself (presumably on first
 * read, judging by the hook name). The request advertises gzip support; a gzip-encoded response
 * is transparently decompressed.
 *
 * @param url2getStream the URL to read from
 * @param metadata not used by this implementation
 * @param parseContext not used by this implementation
 * @return a stream over the (decoded) URL content
 * @throws Exception if the URL is malformed
 */
@Override
public TikaInputStream getStream(
    URLName url2getStream, Metadata metadata, ParseContext parseContext) throws Exception {
  final URL asUrl = new URL(url2getStream.toString());

  return TikaInputStream.get(
      new ShiftInitInputStream() {
        @Override
        protected InputStream initBeforeFirstStreamDataAccess() throws Exception {
          URLConnection connection = asUrl.openConnection();
          connection.setConnectTimeout(connectTimeout);
          connection.setReadTimeout(readTimeout);
          connection.setRequestProperty("Accept-Encoding", "gzip");

          InputStream ourStream = connection.getInputStream();

          // Compare case-insensitively without going through the default locale:
          // toLowerCase() turns "GZIP" into "gz\u0131p" under a Turkish locale, which would
          // make a gzip response be handed out undecoded.
          String strContentEncoding = connection.getHeaderField("Content-Encoding");
          boolean gzipped =
              strContentEncoding != null && "gzip".equalsIgnoreCase(strContentEncoding.trim());

          if (gzipped) {
            ourStream = new BufferedInputStream(new GZIPInputStream(ourStream));
          } else {
            ourStream = new BufferedInputStream(ourStream);
          }

          return ourStream;
        }
      });
}
/**
 * Hands the document over to the component parser selected by {@code getParser}.
 *
 * <p>{@link RuntimeException}s, {@link IOException}s and {@link SAXException}s that did not
 * originate from the supplied input stream or content handler are wrapped into {@link
 * TikaException}s so that the {@link Parser} contract is honored. The class name of the parser
 * that was used (the wrapped parser for a {@link ParserDecorator}) is added to the metadata under
 * {@code X-Parsed-By}.
 *
 * @param stream the document stream
 * @param handler receives the parse events; may be {@code null}
 * @param metadata document metadata, also used for parser selection
 * @param context the parse context
 */
public void parse(
    InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
    throws IOException, SAXException, TikaException {
  final Parser parser = getParser(metadata, context);
  final TemporaryResources resources = new TemporaryResources();
  try {
    final TikaInputStream wrappedStream = TikaInputStream.get(stream, resources);

    TaggedContentHandler wrappedHandler = null;
    if (handler != null) {
      wrappedHandler = new TaggedContentHandler(handler);
    }

    // Report the real parser, not the decorator around it.
    final String parsedBy;
    if (parser instanceof ParserDecorator) {
      parsedBy = ((ParserDecorator) parser).getWrappedParser().getClass().getName();
    } else {
      parsedBy = parser.getClass().getName();
    }
    metadata.add("X-Parsed-By", parsedBy);

    try {
      parser.parse(wrappedStream, wrappedHandler, metadata, context);
    } catch (RuntimeException e) {
      throw new TikaException("Unexpected RuntimeException from " + parser, e);
    } catch (IOException e) {
      // Rethrows as-is when the tagged stream itself caused the failure.
      wrappedStream.throwIfCauseOf(e);
      throw new TikaException("TIKA-198: Illegal IOException from " + parser, e);
    } catch (SAXException e) {
      // Rethrows as-is when the tagged handler itself caused the failure.
      if (wrappedHandler != null) {
        wrappedHandler.throwIfCauseOf(e);
      }
      throw new TikaException("TIKA-237: Illegal SAXException from " + parser, e);
    }
  } finally {
    resources.dispose();
  }
}
/**
 * Detects the media type of the given stream.
 *
 * <p>Only streams that already are {@link TikaInputStream}s are inspected; any other stream is
 * reported as {@code MediaType.APPLICATION_ZIP} without being read.
 *
 * @param input the stream to inspect
 * @param metadata metadata passed on to the {@link TikaInputStream}-based detection
 * @return the detected type, or {@code MediaType.APPLICATION_ZIP} for non-Tika streams
 * @throws IOException if the delegated detection fails
 */
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
  if (!TikaInputStream.isTikaInputStream(input)) {
    return MediaType.APPLICATION_ZIP;
  }
  return detect(TikaInputStream.get(input), metadata);
}
void doOCROnCurrentPage() throws IOException, TikaException, SAXException { if (config.getOCRStrategy().equals(NO_OCR)) { return; } TesseractOCRConfig tesseractConfig = context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG); TesseractOCRParser tesseractOCRParser = new TesseractOCRParser(); if (!tesseractOCRParser.hasTesseract(tesseractConfig)) { throw new TikaException( "Tesseract is not available. " + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly"); } PDFRenderer renderer = new PDFRenderer(pdDocument); TemporaryResources tmp = new TemporaryResources(); try { BufferedImage image = renderer.renderImage(pageIndex, 2.0f, config.getOCRImageType()); Path tmpFile = tmp.createTempFile(); try (OutputStream os = Files.newOutputStream(tmpFile)) { // TODO: get output format from TesseractConfig ImageIOUtil.writeImage(image, config.getOCRImageFormatName(), os, config.getOCRDPI()); } try (InputStream is = TikaInputStream.get(tmpFile)) { tesseractOCRParser.parseInline(is, xhtml, tesseractConfig); } } catch (IOException e) { handleCatchableIOE(e); } catch (SAXException e) { throw new IOExceptionWithCause("error writing OCR content from PDF", e); } finally { tmp.dispose(); } }
/**
 * Determines the content type of a file.
 *
 * <p>The file content is consulted first; if the file cannot be found, the type is derived from
 * {@code title} alone.
 *
 * @param file the file to inspect
 * @param title the name passed along to the content-type lookups
 * @return the content type reported by the delegated lookup
 */
public String getContentType(File file, String title) {
  InputStream fileStream = null;

  try {
    fileStream = TikaInputStream.get(file);

    return getContentType(fileStream, title);
  } catch (FileNotFoundException fileNotFoundException) {
    // No content to look at: fall back to the name-only lookup.
    return getContentType(title);
  } finally {
    StreamUtil.cleanUp(fileStream);
  }
}
@Override protected boolean doProcess(Record record, InputStream inputStream) { Parser parser = detectParser(record); if (parser == null) { return false; } ParseContext parseContext = new ParseContext(); parseContext.set(Locale.class, locale); Metadata metadata = new Metadata(); for (Entry<String, Object> entry : record.getFields().entries()) { metadata.add(entry.getKey(), entry.getValue().toString()); } SolrContentHandler handler = solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema); try { inputStream = TikaInputStream.get(inputStream); ContentHandler parsingHandler = handler; // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()"; if (xpathExpr != null) { Matcher matcher = PARSER.parse(xpathExpr); parsingHandler = new MatchingContentHandler(parsingHandler, matcher); } try { parser.parse(inputStream, parsingHandler, metadata, parseContext); } catch (IOException e) { throw new MorphlineRuntimeException("Cannot parse", e); } catch (SAXException e) { throw new MorphlineRuntimeException("Cannot parse", e); } catch (TikaException e) { throw new MorphlineRuntimeException("Cannot parse", e); } } finally { if (inputStream != null) { Closeables.closeQuietly(inputStream); } } SolrInputDocument doc = handler.newDocument(); LOG.debug("solr doc: {}", doc); Record outputRecord = toRecord(doc); return getChild().process(outputRecord); }
/**
 * Parses {@code testNPEOpenDocument.odt} from a URL-backed {@link TikaInputStream} and checks the
 * reported content type and a snippet of the extracted text.
 */
@Test
public void testNPEFromFile() throws Exception {
  final OpenDocumentParser odfParser = new OpenDocumentParser();
  final java.net.URL document =
      this.getClass().getResource("/test-documents/testNPEOpenDocument.odt");

  try (TikaInputStream input = TikaInputStream.get(document)) {
    final Metadata metadata = new Metadata();
    final ContentHandler body = new BodyContentHandler();

    odfParser.parse(input, body, metadata, new ParseContext());

    assertEquals("application/vnd.oasis.opendocument.text", metadata.get(Metadata.CONTENT_TYPE));
    assertContains("primero hay que generar un par de claves", body.toString());
  }
}
/**
 * Parses {@code testODFwithOOo3.odt} from a URL-backed {@link TikaInputStream}, first checking
 * that the stream reports a backing file, then verifying content type and extracted text.
 */
@Test
public void testFromFile() throws Exception {
  final java.net.URL document =
      this.getClass().getResource("/test-documents/testODFwithOOo3.odt");

  try (TikaInputStream input = TikaInputStream.get(document)) {
    assertEquals(true, input.hasFile());

    final OpenDocumentParser odfParser = new OpenDocumentParser();
    final Metadata metadata = new Metadata();
    final ContentHandler body = new BodyContentHandler();

    odfParser.parse(input, body, metadata, new ParseContext());

    assertEquals("application/vnd.oasis.opendocument.text", metadata.get(Metadata.CONTENT_TYPE));
    assertContains("Tika is part of the Lucene project.", body.toString());
  }
}
private void extractPDEmbeddedFile( String displayName, String unicodeFileName, String fileName, PDEmbeddedFile file, EmbeddedDocumentExtractor extractor) throws SAXException, IOException, TikaException { if (file == null) { // skip silently return; } fileName = (fileName == null) ? displayName : fileName; // TODO: other metadata? Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); metadata.set( TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString()); metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName); if (extractor.shouldParseEmbedded(metadata)) { TikaInputStream stream = null; try { stream = TikaInputStream.get(file.createInputStream()); extractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml), metadata, false); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", fileName); xhtml.startElement("div", attributes); xhtml.endElement("div"); } finally { IOUtils.closeQuietly(stream); } } }
protected MediaType getMediaType(BufferedInputStream inputStream, String fileName) throws IOException { final TikaInputStream tikaInputStreamStream = TikaInputStream.get(new CloseShieldInputStream(inputStream)); try { final Detector detector = new DefaultDetector(); final Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); final MediaType type = detector.detect(tikaInputStreamStream, metadata); logger.debug("Determined '{}' for '{}'", type, fileName); return type; } catch (IOException e) { logger.warn("Failed to determine media type for '" + fileName + "' assuming XML", e); return null; } finally { IOUtils.closeQuietly(tikaInputStreamStream); // Reset the buffered stream to make up for anything read by the detector inputStream.reset(); } }
/**
 * Determines a content type from stream content and file name.
 *
 * <p>If the detector yields a type containing {@code "tika"}, the name-only lookup is tried; if
 * that result also contains {@code "tika"}, {@code application/octet-stream} is used. Any
 * exception is logged and likewise results in {@code application/octet-stream}.
 *
 * @param inputStream the content to inspect
 * @param fileName the resource name given to the detector as a hint
 * @return the resolved content type
 */
public String getContentType(InputStream inputStream, String fileName) {
  if ((inputStream == null) && Validator.isNull(fileName)) {
    return ContentTypes.APPLICATION_OCTET_STREAM;
  }

  try {
    Metadata metadata = new Metadata();

    metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);

    MediaType detectedType = _detector.detect(TikaInputStream.get(inputStream), metadata);

    String resolved = detectedType.toString();

    // First fallback: derive the type from the file name only.
    if (resolved.contains("tika")) {
      if (_log.isDebugEnabled()) {
        _log.debug("Retrieved invalid content type " + resolved);
      }

      resolved = getContentType(fileName);
    }

    // Second fallback: give up and report a generic binary type.
    if (resolved.contains("tika")) {
      if (_log.isDebugEnabled()) {
        _log.debug("Retrieved invalid content type " + resolved);
      }

      resolved = ContentTypes.APPLICATION_OCTET_STREAM;
    }

    return resolved;
  } catch (Exception e) {
    _log.error(e, e);

    return ContentTypes.APPLICATION_OCTET_STREAM;
  }
}
private Charset getCharset(List<DBFRow> firstRows, DBFFileHeader header) throws IOException, TikaException { // TODO: potentially use codepage info in the header Charset charset = DEFAULT_CHARSET; ByteArrayOutputStream bos = new ByteArrayOutputStream(); for (DBFRow row : firstRows) { for (DBFCell cell : row.cells) { if (cell.getColType().equals(DBFColumnHeader.ColType.C)) { byte[] bytes = cell.getBytes(); bos.write(bytes); if (bos.size() > MAX_CHARS_FOR_CHARSET_DETECTION) { break; } } } } byte[] bytes = bos.toByteArray(); if (bytes.length > 20) { EncodingDetector detector = new Icu4jEncodingDetector(); detector.detect(TikaInputStream.get(bytes), new Metadata()); charset = detector.detect(new ByteArrayInputStream(bytes), new Metadata()); } return charset; }
public void process(String arg) throws Exception { if (arg.equals("-?") || arg.equals("--help")) { pipeMode = false; usage(); } else if (arg.equals("-V") || arg.equals("--version")) { pipeMode = false; version(); } else if (arg.equals("-v") || arg.equals("--verbose")) { // Logger.getRootLogger().setLevel(Level.DEBUG); } else if (arg.equals("-g") || arg.equals("--gui")) { pipeMode = false; TikaGUI.main(new String[0]); } else if (arg.equals("--list-parser") || arg.equals("--list-parsers")) { pipeMode = false; displayParsers(false); } else if (arg.equals("--list-detector") || arg.equals("--list-detectors")) { pipeMode = false; displayDetectors(); } else if (arg.equals("--list-parser-detail") || arg.equals("--list-parser-details")) { pipeMode = false; displayParsers(true); } else if (arg.equals("--list-met-models")) { pipeMode = false; displayMetModels(); } else if (arg.equals("--list-supported-types")) { pipeMode = false; displaySupportedTypes(); } else if (arg.equals("--container-aware") || arg.equals("--container-aware-detector")) { // ignore, as container-aware detectors are now always used } else if (arg.equals("-f") || arg.equals("--fork")) { fork = true; } else if (arg.startsWith("-e")) { encoding = arg.substring("-e".length()); } else if (arg.startsWith("--encoding=")) { encoding = arg.substring("--encoding=".length()); } else if (arg.startsWith("-p") && !arg.equals("-p")) { password = arg.substring("-p".length()); } else if (arg.startsWith("--password="******"--password="******"-j") || arg.equals("--json")) { type = JSON; } else if (arg.equals("-y") || arg.equals("--xmp")) { type = XMP; } else if (arg.equals("-x") || arg.equals("--xml")) { type = XML; } else if (arg.equals("-h") || arg.equals("--html")) { type = HTML; } else if (arg.equals("-t") || arg.equals("--text")) { type = TEXT; } else if (arg.equals("-T") || arg.equals("--text-main")) { type = TEXT_MAIN; } else if (arg.equals("-m") || arg.equals("--metadata")) { type = METADATA; } else if 
(arg.equals("-l") || arg.equals("--language")) { type = LANGUAGE; } else if (arg.equals("-d") || arg.equals("--detect")) { type = DETECT; } else if (arg.startsWith("--extract-dir=")) { extractDir = new File(arg.substring("--extract-dir=".length())); } else if (arg.equals("-z") || arg.equals("--extract")) { type = NO_OUTPUT; context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor()); } else if (arg.equals("-r") || arg.equals("--pretty-print")) { prettyPrint = true; } else if (arg.equals("-p") || arg.equals("--port") || arg.equals("-s") || arg.equals("--server")) { serverMode = true; pipeMode = false; } else if (arg.startsWith("-c")) { URI uri = new URI(arg.substring("-c".length())); parser = new NetworkParser(uri); } else if (arg.startsWith("--client=")) { URI uri = new URI(arg.substring("--client=".length())); parser = new NetworkParser(uri); } else if (arg.startsWith("--create-profile=")) { profileName = arg.substring("--create-profile=".length()); type = CREATE_PROFILE; } else { pipeMode = false; if (serverMode) { new TikaServer(Integer.parseInt(arg)).start(); } else if (arg.equals("-")) { InputStream stream = TikaInputStream.get(new CloseShieldInputStream(System.in)); try { type.process(stream, System.out, new Metadata()); } finally { stream.close(); } } else { URL url; File file = new File(arg); if (file.isFile()) { url = file.toURI().toURL(); } else { url = new URL(arg); } Metadata metadata = new Metadata(); InputStream input = TikaInputStream.get(url, metadata); try { type.process(input, System.out, metadata); } finally { input.close(); System.out.flush(); } } } }
// will throw IOException if not actually POIFS // can return null byte[] private byte[] handleEmbeddedPOIFS( InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException { NPOIFSFileSystem fs = null; byte[] ret = null; try { fs = new NPOIFSFileSystem(is); DirectoryNode root = fs.getRoot(); if (root == null) { return ret; } if (root.hasEntry("Package")) { Entry ooxml = root.getEntry("Package"); TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml)); ByteArrayOutputStream out = new ByteArrayOutputStream(); IOUtils.copy(stream, out); ret = out.toByteArray(); } else { // try poifs POIFSDocumentType type = POIFSDocumentType.detectType(root); if (type == POIFSDocumentType.OLE10_NATIVE) { try { // Try to un-wrap the OLE10Native record: Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root); ret = ole.getDataBuffer(); } catch (Ole10NativeException ex) { // Not a valid OLE10Native record, skip it } } else if (type == POIFSDocumentType.COMP_OBJ) { DocumentEntry contentsEntry; try { contentsEntry = (DocumentEntry) root.getEntry("CONTENTS"); } catch (FileNotFoundException ioe) { contentsEntry = (DocumentEntry) root.getEntry("Contents"); } DocumentInputStream inp = null; try { inp = new DocumentInputStream(contentsEntry); ret = new byte[contentsEntry.getSize()]; inp.readFully(ret); } finally { if (inp != null) { inp.close(); } } } else { ByteArrayOutputStream out = new ByteArrayOutputStream(); is.reset(); IOUtils.copy(is, out); ret = out.toByteArray(); metadata.set( Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension()); metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); } } } finally { if (fs != null) { fs.close(); } } return ret; }
/** * Gets the content and defers to registered viewers to generate the markup. * * @param request servlet request * @param response servlet response * @throws ServletException if a servlet-specific error occurs * @throws IOException if an I/O error occurs */ @Override protected void doGet(final HttpServletRequest request, final HttpServletResponse response) throws ServletException, IOException { // specify the charset in a response header response.addHeader("Content-Type", "text/html; charset=UTF-8"); // get the content final ServletContext servletContext = request.getServletContext(); final ContentAccess contentAccess = (ContentAccess) servletContext.getAttribute("nifi-content-access"); final ContentRequestContext contentRequest = getContentRequest(request); if (contentRequest.getDataUri() == null) { request.setAttribute("title", "Error"); request.setAttribute("messages", "The data reference must be specified."); // forward to the error page final ServletContext viewerContext = servletContext.getContext("/nifi"); viewerContext.getRequestDispatcher("/message").forward(request, response); return; } // get the content final DownloadableContent downloadableContent; try { downloadableContent = contentAccess.getContent(contentRequest); } catch (final ResourceNotFoundException rnfe) { request.setAttribute("title", "Error"); request.setAttribute("messages", "Unable to find the specified content"); // forward to the error page final ServletContext viewerContext = servletContext.getContext("/nifi"); viewerContext.getRequestDispatcher("/message").forward(request, response); return; } catch (final AccessDeniedException ade) { request.setAttribute("title", "Acess Denied"); request.setAttribute( "messages", "Unable to approve access to the specified content: " + ade.getMessage()); // forward to the error page final ServletContext viewerContext = servletContext.getContext("/nifi"); viewerContext.getRequestDispatcher("/message").forward(request, response); return; } catch (final 
Exception e) { request.setAttribute("title", "Error"); request.setAttribute("messages", "An unexcepted error has occurred: " + e.getMessage()); // forward to the error page final ServletContext viewerContext = servletContext.getContext("/nifi"); viewerContext.getRequestDispatcher("/message").forward(request, response); return; } // determine how we want to view the data String mode = request.getParameter("mode"); // if the name isn't set, use original if (mode == null) { mode = DisplayMode.Original.name(); } // determine the display mode final DisplayMode displayMode; try { displayMode = DisplayMode.valueOf(mode); } catch (final IllegalArgumentException iae) { request.setAttribute("title", "Error"); request.setAttribute("messages", "Invalid display mode: " + mode); // forward to the error page final ServletContext viewerContext = servletContext.getContext("/nifi"); viewerContext.getRequestDispatcher("/message").forward(request, response); return; } // buffer the content to support reseting in case we need to detect the content type or char // encoding try (final BufferedInputStream bis = new BufferedInputStream(downloadableContent.getContent()); ) { final String mimeType; // when standalone and we don't know the type is null as we were able to directly access the // content bypassing the rest endpoint, // when clustered and we don't know the type set to octet stream since the content was // retrieved from the node's rest endpoint if (downloadableContent.getType() == null || downloadableContent.getType().equals(MediaType.OCTET_STREAM.toString())) { // attempt to detect the content stream if we don't know what it is () final DefaultDetector detector = new DefaultDetector(); // create the stream for tika to process, buffered to support reseting final TikaInputStream tikaStream = TikaInputStream.get(bis); // provide a hint based on the filename final Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, downloadableContent.getFilename()); // Get 
mime type final MediaType mediatype = detector.detect(tikaStream, metadata); mimeType = mediatype.toString(); } else { mimeType = downloadableContent.getType(); } // add attributes needed for the header request.setAttribute("filename", downloadableContent.getFilename()); request.setAttribute("contentType", mimeType); // generate the header request.getRequestDispatcher("/WEB-INF/jsp/header.jsp").include(request, response); // remove the attributes needed for the header request.removeAttribute("filename"); request.removeAttribute("contentType"); // generate the markup for the content based on the display mode if (DisplayMode.Hex.equals(displayMode)) { final byte[] buffer = new byte[BUFFER_LENGTH]; final int read = StreamUtils.fillBuffer(bis, buffer, false); // trim the byte array if necessary byte[] bytes = buffer; if (read != buffer.length) { bytes = new byte[read]; System.arraycopy(buffer, 0, bytes, 0, read); } // convert bytes into the base 64 bytes final String base64 = Base64.encodeBase64String(bytes); // defer to the jsp request.setAttribute("content", base64); request.getRequestDispatcher("/WEB-INF/jsp/hexview.jsp").include(request, response); } else { // lookup a viewer for the content final String contentViewerUri = servletContext.getInitParameter(mimeType); // handle no viewer for content type if (contentViewerUri == null) { request.getRequestDispatcher("/WEB-INF/jsp/no-viewer.jsp").include(request, response); } else { // create a request attribute for accessing the content request.setAttribute( ViewableContent.CONTENT_REQUEST_ATTRIBUTE, new ViewableContent() { @Override public InputStream getContentStream() { return bis; } @Override public String getContent() throws IOException { // detect the charset final CharsetDetector detector = new CharsetDetector(); detector.setText(bis); detector.enableInputFilter(true); final CharsetMatch match = detector.detect(); // ensure we were able to detect the charset if (match == null) { throw new IOException("Unable to 
detect character encoding."); } // convert the stream using the detected charset return IOUtils.toString(bis, match.getName()); } @Override public ViewableContent.DisplayMode getDisplayMode() { return displayMode; } @Override public String getFileName() { return downloadableContent.getFilename(); } @Override public String getContentType() { return mimeType; } }); try { // generate the content final ServletContext viewerContext = servletContext.getContext(contentViewerUri); viewerContext.getRequestDispatcher("/view-content").include(request, response); } catch (final Exception e) { String message = e.getMessage() != null ? e.getMessage() : e.toString(); message = "Unable to generate view of data: " + message; // log the error logger.error(message); if (logger.isDebugEnabled()) { logger.error(StringUtils.EMPTY, e); } // populate the request attributes request.setAttribute("title", "Error"); request.setAttribute("messages", message); // forward to the error page final ServletContext viewerContext = servletContext.getContext("/nifi"); viewerContext.getRequestDispatcher("/message").forward(request, response); return; } // remove the request attribute request.removeAttribute(ViewableContent.CONTENT_REQUEST_ATTRIBUTE); } } // generate footer request.getRequestDispatcher("/WEB-INF/jsp/footer.jsp").include(request, response); } }