public static String getMimeTypeFromContentType(String contentType) {
    String result = "";
    MediaType mt = MediaType.parse(contentType);
    if (mt != null) {
        result = mt.getType() + "/" + mt.getSubtype();
    }
    return result;
}
public static String getCharsetFromContentType(String contentType) {
    String result = "";
    MediaType mt = MediaType.parse(contentType);
    if (mt != null) {
        String charset = mt.getParameters().get("charset");
        if (charset != null) {
            result = charset;
        }
    }
    return result;
}
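// A minimal usage sketch for the two helpers above, assuming they are in scope;
// the sample header value and the expected results in the comments are illustrative.
public class ContentTypeHelperDemo {
    public static void main(String[] args) {
        String contentType = "text/html; charset=ISO-8859-1";
        // Tika lower-cases type and subtype, so this prints "text/html"
        System.out.println(getMimeTypeFromContentType(contentType));
        // Parameter values are preserved as given, so this prints "ISO-8859-1"
        System.out.println(getCharsetFromContentType(contentType));
    }
}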
/**
 * Performs a one-time initialization of Tika's Media-Type components and media type collection
 * constants <br>
 * Please note that this is a private static method which is called once per CLASS (not per
 * instance / object)
 */
private static void initMediaTypes() {
    /* XML media types (and all aliases) */
    XML_MEDIA_TYPES.add(APPLICATION_XML);
    XML_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(APPLICATION_XML));

    /* TEXT media types (and all aliases) */
    TEXT_MEDIA_TYPES.add(TEXT_PLAIN);
    TEXT_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(TEXT_PLAIN));

    /* GZIP media types (and all aliases) */
    MediaType gzipMediaType = MediaType.parse("application/gzip");
    GZ_MEDIA_TYPES.add(gzipMediaType);
    GZ_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(gzipMediaType));
}
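// For reference, getAliases(...) is what pulls alternate spellings into the sets
// built above; with Tika's default registry the alias set for application/xml
// typically includes text/xml (a sketch, output not guaranteed).
import java.util.SortedSet;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;

public class AliasDemo {
    public static void main(String[] args) {
        MediaTypeRegistry registry = MediaTypeRegistry.getDefaultRegistry();
        SortedSet<MediaType> aliases = registry.getAliases(MediaType.APPLICATION_XML);
        System.out.println(aliases); // typically contains text/xml
    }
}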
protected Parser getParser(Metadata metadata, ParseContext context) {
    Map<MediaType, Parser> map = getParsers(context);
    MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
    if (type != null) {
        // We always work on the normalised, canonical form
        type = registry.normalize(type);
    }
    while (type != null) {
        // Try finding a parser for the type
        Parser parser = map.get(type);
        if (parser != null) {
            return parser;
        }
        // Failing that, try for the parent of the type
        type = registry.getSupertype(type);
    }
    return fallback;
}
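// The supertype walk in getParser(...) is easy to see in isolation; a small sketch
// using Tika's default registry (the printed chain is typical, not guaranteed).
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;

public class SupertypeWalkDemo {
    public static void main(String[] args) {
        MediaTypeRegistry registry = MediaTypeRegistry.getDefaultRegistry();
        MediaType type = MediaType.parse("application/xhtml+xml");
        // Walk up the hierarchy exactly as the lookup loop does
        while (type != null) {
            System.out.println(type); // e.g. application/xhtml+xml, application/xml,
                                      // text/plain, application/octet-stream
            type = registry.getSupertype(type);
        }
    }
}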
/*
 * (non-Javadoc)
 * @see org.alfresco.repo.rendition.executer.AbstractRenderingEngine#render(org.alfresco.repo.rendition.executer.AbstractRenderingEngine.RenderingContext)
 */
@Override
protected void render(RenderingContext context) {
    ContentReader contentReader = context.makeContentReader();
    String sourceMimeType = contentReader.getMimetype();

    // Check that Tika supports the supplied file
    AutoDetectParser p = new AutoDetectParser(tikaConfig);
    MediaType sourceMediaType = MediaType.parse(sourceMimeType);
    if (!p.getParsers().containsKey(sourceMediaType)) {
        throw new RenditionServiceException(
                "Source mime type of " + sourceMimeType
                        + " is not supported by Tika for HTML conversions");
    }

    // Make the HTML version using Tika
    // This will also extract out any images as found
    generateHTML(p, context);
}
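// The support check above reduces to a map lookup; a standalone sketch with a
// default-configured AutoDetectParser (the result depends on which Tika parser
// modules are on the classpath).
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;

public class TikaSupportCheckDemo {
    public static void main(String[] args) {
        AutoDetectParser p = new AutoDetectParser();
        boolean supported = p.getParsers().containsKey(MediaType.parse("application/pdf"));
        System.out.println("PDF supported: " + supported);
    }
}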
/** @return SiteMap/SiteMapIndex given a content type, byte content and the URL of a sitemap */
public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url)
        throws UnknownFormatException, IOException {
    MediaType mediaType = MediaType.parse(contentType);

    // Octet-stream is the father of all binary types
    while (mediaType != null && !mediaType.equals(MediaType.OCTET_STREAM)) {
        if (XML_MEDIA_TYPES.contains(mediaType)) {
            return processXml(url, content);
        } else if (TEXT_MEDIA_TYPES.contains(mediaType)) {
            return (AbstractSiteMap) processText(url.toString(), content);
        } else if (GZ_MEDIA_TYPES.contains(mediaType)) {
            return processGzip(url, content);
        } else {
            mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check parent
            return parseSiteMap(mediaType.toString(), content, url);
        }
    }

    throw new UnknownFormatException(
            "Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
}
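// A hypothetical call site for parseSiteMap(...), assuming it lives on a
// crawler-commons style SiteMapParser instance; the URL and file path are illustrative.
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;

public class SiteMapDemo {
    public static void main(String[] args) throws Exception {
        SiteMapParser siteMapParser = new SiteMapParser();
        byte[] content = Files.readAllBytes(Paths.get("/tmp/sitemap.xml"));
        URL url = new URL("https://example.com/sitemap.xml");
        AbstractSiteMap siteMap = siteMapParser.parseSiteMap("application/xml", content, url);
        System.out.println(siteMap);
    }
}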
private static Set<MediaType> mediaTypesListFromDomElement(Element node, String tag)
        throws TikaException, IOException {
    Set<MediaType> types = null;
    NodeList children = node.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
        Node cNode = children.item(i);
        if (cNode instanceof Element) {
            Element cElement = (Element) cNode;
            if (tag.equals(cElement.getTagName())) {
                String mime = getText(cElement);
                MediaType type = MediaType.parse(mime);
                if (type != null) {
                    if (types == null) {
                        types = new HashSet<>();
                    }
                    types.add(type);
                } else {
                    throw new TikaException("Invalid media type name: " + mime);
                }
            }
        }
    }
    if (types != null) {
        return types;
    }
    return Collections.emptySet();
}
@Override
public void load(
        SolrQueryRequest req,
        SolrQueryResponse rsp,
        ContentStream stream,
        UpdateRequestProcessor processor)
        throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
        // Cache? Parsers are lightweight to construct and thread-safe, so I'm told
        MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
        parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
        parser = autoDetectParser;
    }

    if (parser != null) {
        Metadata metadata = new Metadata();

        // If you specify the resource name (the filename, roughly) with this parameter,
        // then Tika can make use of it in guessing the appropriate MIME type:
        String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
        if (resourceName != null) {
            metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
        }
        // Provide stream's content type as hint for auto detection
        if (stream.getContentType() != null) {
            metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
        }

        InputStream inputStream = null;
        try {
            inputStream = stream.getStream();
            metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
            metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
            metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
            metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
            // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
            String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
            if (charset != null) {
                metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
            }

            String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
            boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
            SolrContentHandler handler =
                    factory.createSolrContentHandler(metadata, params, req.getSchema());
            ContentHandler parsingHandler = handler;

            StringWriter writer = null;
            BaseMarkupSerializer serializer = null;
            if (extractOnly) {
                String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
                writer = new StringWriter();
                if (extractFormat.equals(TEXT_FORMAT)) {
                    serializer = new TextSerializer();
                    serializer.setOutputCharStream(writer);
                    serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
                } else {
                    serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
                }
                if (xpathExpr != null) {
                    Matcher matcher = PARSER.parse(xpathExpr);
                    // The MatchingContentHandler does not invoke startDocument. See
                    // http://tika.markmail.org/message/kknu3hw7argwiqin
                    serializer.startDocument();
                    parsingHandler = new MatchingContentHandler(serializer, matcher);
                } else {
                    parsingHandler = serializer;
                }
            } else if (xpathExpr != null) {
                Matcher matcher = PARSER.parse(xpathExpr);
                parsingHandler = new MatchingContentHandler(handler, matcher);
            } // else leave it as is

            try {
                // potentially use a wrapper handler for parsing, but we still need the
                // SolrContentHandler for getting the document.
                ParseContext context = parseContextConfig.create();
                context.set(Parser.class, parser);
                context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

                // Password handling
                RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
                String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
                if (pwMapFile != null && pwMapFile.length() > 0) {
                    InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
                    if (is != null) {
                        log.debug("Password file supplied: " + pwMapFile);
                        epp.parse(is);
                    }
                }
                context.set(PasswordProvider.class, epp);
                String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
                if (resourcePassword != null) {
                    epp.setExplicitPassword(resourcePassword);
                    log.debug("Literal password supplied for file " + resourceName);
                }

                parser.parse(inputStream, parsingHandler, metadata, context);
            } catch (TikaException e) {
                if (ignoreTikaException) {
                    log.warn("skip extracting text due to " + e.getLocalizedMessage()
                            + ". metadata=" + metadata.toString());
                } else {
                    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
                }
            }

            if (!extractOnly) {
                addDoc(handler);
            } else {
                // serializer is not null, so we need to call endDoc on it if using xpath
                if (xpathExpr != null) {
                    serializer.endDocument();
                }
                rsp.add(stream.getName(), writer.toString());
                writer.close();
                String[] names = metadata.names();
                NamedList metadataNL = new NamedList();
                for (int i = 0; i < names.length; i++) {
                    String[] vals = metadata.getValues(names[i]);
                    metadataNL.add(names[i], vals);
                }
                rsp.add(stream.getName() + "_metadata", metadataNL);
            }
        } catch (SAXException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else {
        throw new SolrException(
                SolrException.ErrorCode.BAD_REQUEST,
                "Stream type of " + streamType + " didn't match any known parsers. Please supply the "
                        + ExtractingParams.STREAM_TYPE + " parameter.");
    }
}
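// Standalone version of the STREAM_TYPE lookup in load(...): normalize the declared
// type, then ask a DefaultParser's media-type map for a matching parser. Whether a
// parser is found depends on which Tika parser modules are on the classpath.
import java.util.Locale;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.Parser;

public class StreamTypeLookupDemo {
    public static void main(String[] args) {
        MediaType mt = MediaType.parse("application/pdf".trim().toLowerCase(Locale.ROOT));
        Parser parser = new DefaultParser(TikaConfig.getDefaultConfig().getMediaTypeRegistry())
                .getParsers().get(mt);
        System.out.println(parser == null ? "no parser for type" : parser.getClass().getName());
    }
}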
public ParseResult getParse(Content content) {
    String mimeType = content.getContentType();

    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
    byte[] raw = content.getContent();
    if (parser == null) {
        String message = "Can't retrieve Tika parser for mime-type " + mimeType;
        LOG.error(message);
        return new ParseStatus(ParseStatus.FAILED, message)
                .getEmptyParseResult(content.getUrl(), getConf());
    }

    LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType);

    Metadata tikamd = new Metadata();

    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    DOMBuilder domhandler = new DOMBuilder(doc, root);
    ParseContext context = new ParseContext();
    try {
        parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context);
    } catch (Exception e) {
        LOG.error("Error parsing " + content.getUrl(), e);
        return new ParseStatus(ParseStatus.FAILED, e.getMessage())
                .getEmptyParseResult(content.getUrl(), getConf());
    }

    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();

    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch

    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }

    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
        StringBuffer sb = new StringBuffer();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        utils.getText(sb, root); // extract text
        text = sb.toString();
        sb.setLength(0);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        utils.getTitle(sb, root); // extract title
        title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
        ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
        URL baseTag = utils.getBase(root);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links...");
        }
        utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
        outlinks = l.toArray(new Outlink[l.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
        }
    }

    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
        if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) {
            continue;
        }
        // TODO what if multivalued?
        nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName));
    }

    // no outlinks? try OutlinkExtractor, which e.g. works for mime types
    // that have no explicit markup for anchors
    if (outlinks.length == 0) {
        outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        status.setArgs(new String[] {
                metaTags.getRefreshHref().toString(),
                Integer.toString(metaTags.getRefreshTime())
        });
    }

    ParseData parseData =
            new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult =
            ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse =
            this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
        for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) {
            entry.getValue().getData().getParseMeta()
                    .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
        }
    }
    return filteredParse;
}
private MediaType parseMediaType(String mediaTypeStr) {
    MediaType mediaType = MediaType.parse(mediaTypeStr.trim().toLowerCase(Locale.ROOT));
    if (mediaType == null) {
        // MediaType.parse returns null for unparseable input; guard against the NPE
        // that calling getBaseType() on it would otherwise throw
        return null;
    }
    return mediaType.getBaseType();
}
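// A quick illustration of what the helper above ultimately returns:
// getBaseType() strips parameters such as charset, leaving only type/subtype.
import org.apache.tika.mime.MediaType;

public class BaseTypeDemo {
    public static void main(String[] args) {
        MediaType parsed = MediaType.parse("text/html; charset=utf-8");
        System.out.println(parsed);               // text/html; charset=utf-8
        System.out.println(parsed.getBaseType()); // text/html
    }
}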