/**
 * Get the Mime type of an Asset based on its type. If the Asset already has the "content-type"
 * property set, we return that. Otherwise the Apache Tika library is used to do file type
 * detection.
 *
 * @return A string representation of the content type suitable for use in an HTTP header,
 *     e.g. "image/jpeg" for a JPEG image.
 */
public <T> String getMimeType(Entity entity, T type) {
    Map<String, Object> fileMetadata = AssetUtils.getFileMetadata(entity);
    if (fileMetadata.get(AssetUtils.CONTENT_TYPE) != null) {
        return (String) fileMetadata.get(AssetUtils.CONTENT_TYPE);
    }

    Metadata metadata = new Metadata();
    MediaType mediaType = MediaType.OCTET_STREAM;
    try {
        if (type instanceof byte[]) {
            ByteArrayInputStream bais = new ByteArrayInputStream((byte[]) type);
            mediaType = detector.detect(bais, metadata);
        } else if (type instanceof File) {
            InputStream fis = new BufferedInputStream(new FileInputStream((File) type));
            try {
                mediaType = detector.detect(fis, metadata);
            } finally {
                fis.close();
            }
        } else {
            return mediaType.toString();
        }
        fileMetadata.put(AssetUtils.CONTENT_TYPE, mediaType.toString());
    } catch (IOException e) {
        LOG.error("error detecting mime type", e);
    }
    return mediaType.toString();
}
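// Illustrative usage of the helper above (a sketch, not part of the original class).
// "entity" and "photo" are hypothetical, and we assume the entity has no "content-type"
// property set yet. Only byte[] and File payloads are sniffed by Tika; any other payload
// type falls through to "application/octet-stream".
String fromBytes = getMimeType(entity, java.nio.file.Files.readAllBytes(photo.toPath()));
String fromFile = getMimeType(entity, photo);
String fallback = getMimeType(entity, "unsupported payload"); // "application/octet-stream"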
public String getContentType(String fileName) {
    if (Validator.isNull(fileName)) {
        return ContentTypes.APPLICATION_OCTET_STREAM;
    }

    try {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);

        MediaType mediaType = _detector.detect(null, metadata);
        String contentType = mediaType.toString();

        if (!contentType.contains("tika")) {
            return contentType;
        } else if (_log.isDebugEnabled()) {
            _log.debug("Retrieved invalid content type " + contentType);
        }
    } catch (Exception e) {
        _log.error(e, e);
    }

    return ContentTypes.APPLICATION_OCTET_STREAM;
}
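// Minimal sketch of name-only detection with Tika (not from the class above). The Detector
// contract allows a null stream, in which case the default detector falls back to the
// RESOURCE_NAME_KEY glob patterns, which is exactly what getContentType(fileName) relies on.
Detector detector = new DefaultDetector();
Metadata md = new Metadata();
md.set(Metadata.RESOURCE_NAME_KEY, "report.pdf");
MediaType byName = detector.detect(null, md); // typically "application/pdf"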
/**
 * We don't currently support the .xlsb file format (an OOXML container with binary blobs),
 * but we shouldn't break on these files either (TIKA-826)
 */
@Test
public void testExcelXLSB() throws Exception {
    Detector detector = new DefaultDetector();
    AutoDetectParser parser = new AutoDetectParser();

    Metadata m = new Metadata();
    m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

    // Should be detected correctly
    MediaType type;
    try (InputStream input =
            ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
        type = detector.detect(input, m);
        assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
    }

    // OfficeParser won't handle it
    assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));

    // OOXMLParser won't handle it
    assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));

    // AutoDetectParser doesn't break on it
    try (InputStream input =
            ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
        ContentHandler handler = new BodyContentHandler(-1);
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);
        parser.parse(input, handler, m, context);

        String content = handler.toString();
        assertEquals("", content);
    }
}
@Override
public void parse(
        InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // Is it a supported image?
    String filename = metadata.get(Metadata.RESOURCE_NAME_KEY);
    String type = metadata.get(Metadata.CONTENT_TYPE);
    boolean accept = false;

    if (type != null) {
        for (MediaType mt : types) {
            if (mt.toString().equals(type)) {
                accept = true;
            }
        }
    }
    if (filename != null) {
        for (MediaType mt : types) {
            String ext = "." + mt.getSubtype();
            if (filename.endsWith(ext)) {
                accept = true;
            }
        }
    }

    if (!accept) return;

    handleImage(stream, filename, type);
}
/** Returns true if mediaType falls within the given range (pattern), false otherwise */
private boolean isMediaTypeMatch(MediaType mediaType, MediaType rangePattern) {
    String WILDCARD = "*";
    String rangePatternType = rangePattern.getType();
    String rangePatternSubtype = rangePattern.getSubtype();
    return (rangePatternType.equals(WILDCARD) || rangePatternType.equals(mediaType.getType()))
        && (rangePatternSubtype.equals(WILDCARD)
            || rangePatternSubtype.equals(mediaType.getSubtype()));
}
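// Illustration of the range semantics above (a sketch; Tika's two-argument MediaType
// constructor builds wildcard ranges such as "text/*" or the fully wildcarded "*/*"):
MediaType anyText = new MediaType("text", "*");
boolean a = isMediaTypeMatch(MediaType.TEXT_PLAIN, anyText);                          // true
boolean b = isMediaTypeMatch(MediaType.application("pdf"), anyText);                  // false
boolean c = isMediaTypeMatch(MediaType.application("pdf"), new MediaType("*", "*"));  // true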
static {
    SUPPORTED_MIMETYPES = new ArrayList<String>();
    Parser p = new PackageParser();
    for (MediaType mt : p.getSupportedTypes(null)) {
        // Tika can probably do some useful text
        SUPPORTED_MIMETYPES.add(mt.toString());
    }
}
public static String getMimeTypeFromContentType(String contentType) {
    String result = "";
    MediaType mt = MediaType.parse(contentType);
    if (mt != null) {
        result = mt.getType() + "/" + mt.getSubtype();
    }
    return result;
}
public static String getCharsetFromContentType(String contentType) {
    String result = "";
    MediaType mt = MediaType.parse(contentType);
    if (mt != null) {
        String charset = mt.getParameters().get("charset");
        if (charset != null) {
            result = charset;
        }
    }
    return result;
}
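// A quick sketch of the two helpers above: Tika's MediaType.parse understands parameters,
// so a full Content-Type header value splits cleanly into mime type and charset.
MediaType mt = MediaType.parse("text/html; charset=UTF-8");
String mimeType = mt.getType() + "/" + mt.getSubtype();   // "text/html"
String charset = mt.getParameters().get("charset");       // "UTF-8"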
/** * A more "natural" implementation of an XML parser. Instead of "generating" HTML-like wrapper * events and then producing the PCDATA (only - this is the reason why the default XMLParser should * be called "embedded"), this parser produces the actual XML start and end document and tag events * (that get wrapped by Tika's own {@link org.apache.tika.parser.xml.XMLParser XMLParser}). * Furthermore, this parser semi-structures the element's PCDATA text by separating content from * different elements by linebreaks, indenting PCDATA content according to the current element's * depth, and drops any ("ignorable") character stretches consisting only of spaces. * * @author Florian Leitner */ public class UnembeddedXMLParser extends AbstractParser { /** Serial version UID */ private static final long serialVersionUID = -6028860725229212437L; /** Only support XML */ private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( new HashSet<MediaType>(Arrays.asList(MediaType.application("xml")))); /** {@inheritDoc} */ public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } /** * Parse the input stream with a SAX parser. Wraps the content handler with an {@link * org.apache.tika.sax.OfflineContentHandler} to avoid that any namespace lookups are made. In * addition, by overriding {@link #getContentHandler(ContentHandler, Metadata, ParseContext)}, it * is possible to add additional wrappers. * * @param stream that should be parsed * @param handler that will receive the SAX events * @param metadata of current document stream * @param context of current parse * @throws IOException if the stream cannot be read * @throws SAXException if the SAX parsing fails. * @throws TikaException if the XML parsing fails. */ public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { final TaggedContentHandler tagged = new TaggedContentHandler(handler); if (metadata.get(HttpHeaders.CONTENT_TYPE) == null) { metadata.set(HttpHeaders.CONTENT_TYPE, "application/xml"); } try { context .getSAXParser() .parse( new CloseShieldInputStream(stream), new OfflineContentHandler(getContentHandler(tagged, metadata, context))); } catch (final SAXException e) { tagged.throwIfCauseOf(e); throw new TikaException("XML parse error", e); } } /** * Return the handler (ie., does nothing). This method can be overridden to add wrap the content * handler with additional handlers. * * @param handler to wrap * @param metadata of current document * @param context of current parse * @return */ protected ContentHandler getContentHandler( ContentHandler handler, Metadata metadata, ParseContext context) { return handler; } }
/**
 * Detects the content type of the given input event. Returns
 * <code>application/octet-stream</code> if the type of the event cannot be detected.
 *
 * <p>It is legal for the event headers or body to be empty. The detector may read bytes from
 * the start of the body stream to help in type detection.
 *
 * @return detected media type, or <code>application/octet-stream</code>
 */
private String getMediaType(InputStream in, Metadata metadata, boolean excludeParameters) {
    MediaType mediaType;
    try {
        mediaType = getDetector().detect(in, metadata);
    } catch (IOException e) {
        throw new MorphlineRuntimeException(e);
    }
    String mediaTypeStr = mediaType.toString();
    if (excludeParameters) {
        int i = mediaTypeStr.indexOf(';');
        if (i >= 0) {
            mediaTypeStr = mediaTypeStr.substring(0, i);
        }
    }
    return mediaTypeStr;
}
/** Tika Parser for Microsoft Project MPX files (Text based) */
public class MPXParser extends AbstractParser {
    private static final long serialVersionUID = -4791025107910605527L;

    private static List<MediaType> TYPES =
        Arrays.asList(new MediaType[] {MediaType.application("x-project")});

    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return new HashSet<MediaType>(TYPES);
    }

    public void parse(
            InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
            throws IOException, TikaException, SAXException {
        MPXReader reader = new MPXReader();
        ProjectFile project = null;
        try {
            project = reader.read(stream);
        } catch (MPXJException e) {
            throw new TikaException("Error reading MPX file", e);
        }

        // Extract helpful information out
        ProjectFileProcessor.parse(project, handler, metadata, context);
    }
}
private TikaImageExtractingParser(RenderingContext renderingContext) {
    this.renderingContext = renderingContext;

    // Our expected types
    types = new HashSet<MediaType>();
    types.add(MediaType.image("bmp"));
    types.add(MediaType.image("gif"));
    types.add(MediaType.image("jpg"));
    types.add(MediaType.image("jpeg"));
    types.add(MediaType.image("png"));
    types.add(MediaType.image("tiff"));

    // Are images going in the same place as the HTML?
    if (renderingContext.getParamWithDefault(PARAM_IMAGES_SAME_FOLDER, false)) {
        RenditionLocation location =
            resolveRenditionLocation(
                renderingContext.getSourceNode(),
                renderingContext.getDefinition(),
                renderingContext.getDestinationNode());
        imgFolder = location.getParentRef();
        if (logger.isDebugEnabled()) {
            logger.debug("Using imgFolder: " + imgFolder);
        }
    }
}
public String getContentType(InputStream inputStream, String fileName) {
    if ((inputStream == null) && Validator.isNull(fileName)) {
        return ContentTypes.APPLICATION_OCTET_STREAM;
    }

    String contentType = null;
    try {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);

        MediaType mediaType = _detector.detect(TikaInputStream.get(inputStream), metadata);
        contentType = mediaType.toString();

        if (contentType.contains("tika")) {
            if (_log.isDebugEnabled()) {
                _log.debug("Retrieved invalid content type " + contentType);
            }
            contentType = getContentType(fileName);
        }

        if (contentType.contains("tika")) {
            if (_log.isDebugEnabled()) {
                _log.debug("Retrieved invalid content type " + contentType);
            }
            contentType = ContentTypes.APPLICATION_OCTET_STREAM;
        }
    } catch (Exception e) {
        _log.error(e, e);
        contentType = ContentTypes.APPLICATION_OCTET_STREAM;
    }
    return contentType;
}
/** @return SiteMap/SiteMapIndex given a content type, byte content and the URL of a sitemap */
public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url)
        throws UnknownFormatException, IOException {
    MediaType mediaType = MediaType.parse(contentType);

    // Octet-stream is the father of all binary types
    while (mediaType != null && !mediaType.equals(MediaType.OCTET_STREAM)) {
        if (XML_MEDIA_TYPES.contains(mediaType)) {
            return processXml(url, content);
        } else if (TEXT_MEDIA_TYPES.contains(mediaType)) {
            return (AbstractSiteMap) processText(url.toString(), content);
        } else if (GZ_MEDIA_TYPES.contains(mediaType)) {
            return processGzip(url, content);
        } else {
            // Check parent
            mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType);
            return parseSiteMap(mediaType.toString(), content, url);
        }
    }

    throw new UnknownFormatException(
        "Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
}
public MediaType detect(TikaInputStream input, Metadata metadata) throws IOException {
    ZipFile zip = new ZipFile(input.getFile());
    for (ZipEntry entry : Collections.list(zip.entries())) {
        // Is it an Open Document file?
        if (entry.getName().equals("mimetype")) {
            InputStream stream = zip.getInputStream(entry);
            try {
                return fromString(IOUtils.toString(stream, "UTF-8"));
            } finally {
                stream.close();
            }
        } else if (entry.getName().equals("_rels/.rels")
            || entry.getName().equals("[Content_Types].xml")) {
            // Office Open XML File
            // Ask POI to open and investigate it for us
            try {
                OPCPackage pkg = OPCPackage.open(input.getFile().toString());
                input.setOpenContainer(pkg);

                PackageRelationshipCollection core =
                    pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL);
                if (core.size() != 1) {
                    throw new IOException(
                        "Invalid OOXML Package received - expected 1 core document, found "
                            + core.size());
                }

                // Get the type of the core document part
                PackagePart corePart = pkg.getPart(core.getRelationship(0));
                String coreType = corePart.getContentType();

                // Turn that into the type of the overall document
                String docType = coreType.substring(0, coreType.lastIndexOf('.'));
                return fromString(docType);
            } catch (InvalidFormatException e) {
                throw new IOException(
                    "Office Open XML File detected, but corrupted - " + e.getMessage());
            }
        } else if (entry.getName().equals("buildVersionHistory.plist")) {
            // This is an iWork document

            // Reset and ask
            zip.close();
            zip = new ZipFile(input.getFile());
            return IWorkPackageParser.identifyType(zip);
        } else if (entry.getName().equals("META-INF/")) {
            // Java Jar
            return MediaType.application("java-archive");
        }
    }

    return MediaType.APPLICATION_ZIP;
}
/**
 * Performs a one-time initialization of Tika's Media-Type components and media type collection
 * constants.
 *
 * <br>
 * Please note that this is a private static method which is called once per CLASS (not per
 * instance / object)
 */
private static void initMediaTypes() {
    /* XML media types (and all aliases) */
    XML_MEDIA_TYPES.add(APPLICATION_XML);
    XML_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(APPLICATION_XML));

    /* TEXT media types (and all aliases) */
    TEXT_MEDIA_TYPES.add(TEXT_PLAIN);
    TEXT_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(TEXT_PLAIN));

    /* GZIP media types (and all aliases) */
    MediaType gzipMediaType = MediaType.parse("application/gzip");
    GZ_MEDIA_TYPES.add(gzipMediaType);
    GZ_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(gzipMediaType));
}
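// Rough illustration of what the alias sets above end up holding (assuming Tika's default
// media type registry): aliases let callers accept common synonyms such as "text/xml" for
// XML or "application/x-gzip" for gzip without listing each one by hand.
MediaTypeRegistry defaultRegistry = MediaTypeRegistry.getDefaultRegistry();
SortedSet<MediaType> xmlAliases = defaultRegistry.getAliases(MediaType.APPLICATION_XML);        // includes text/xml
SortedSet<MediaType> gzAliases = defaultRegistry.getAliases(MediaType.parse("application/gzip")); // includes application/x-gzip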
private static Metadata tika_parse(File audioFile) {
    Metadata metadata = new Metadata();
    try {
        String filetype = new Tika().detect(audioFile);
        metadata.set("tika.filetype", filetype);
        metadata.set("file.size", Long.toString(audioFile.length()));

        BufferedInputStream inputStream = new BufferedInputStream(new FileInputStream(audioFile));
        new AutoDetectParser().parse(inputStream, new BodyContentHandler(), metadata);
        inputStream.close();

        // Flatten multi-valued metadata entries into a single comma-separated value
        for (String key : metadata.names()) {
            StringBuilder dataBuilder = new StringBuilder();
            if (metadata.isMultiValued(key)) {
                for (String val : metadata.getValues(key)) {
                    if (dataBuilder.length() > 0) {
                        dataBuilder.append(", ");
                    }
                    dataBuilder.append(val);
                }
            } else {
                dataBuilder.append(metadata.get(key));
            }
            metadata.set(key, dataBuilder.toString().trim());
        }

        inputStream = new BufferedInputStream(new FileInputStream(audioFile));
        MediaType media = new DefaultDetector().detect(inputStream, new Metadata());
        metadata.set("media", media.toString());
    } catch (SAXException | IOException | TikaException e) {
        metadata.set(
            "error_tika_parse",
            "tika_parse error processing file (" + audioFile.getName() + "): " + e.getMessage());
    }
    return metadata;
}
/* If Tesseract is found, test we retrieve the proper number of supporting Parsers. */
@Test
public void offersTypesIfFound() throws Exception {
    TesseractOCRParser parser = new TesseractOCRParser();
    DefaultParser defaultParser = new DefaultParser();
    ParseContext parseContext = new ParseContext();
    MediaType png = MediaType.image("png");

    // Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG.
    assumeTrue(canRun());
    assertEquals(5, parser.getSupportedTypes(parseContext).size());
    assertTrue(parser.getSupportedTypes(parseContext).contains(png));

    // DefaultParser will now select the TesseractOCRParser.
    assertEquals(
        TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
}
protected Parser getParser(Metadata metadata, ParseContext context) {
    Map<MediaType, Parser> map = getParsers(context);
    MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
    if (type != null) {
        // We always work on the normalised, canonical form
        type = registry.normalize(type);
    }
    while (type != null) {
        // Try finding a parser for the type
        Parser parser = map.get(type);
        if (parser != null) {
            return parser;
        }

        // Failing that, try for the parent of the type
        type = registry.getSupertype(type);
    }
    return fallback;
}
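// Sketch of how the supertype walk above resolves a specific type to a registered parser
// (assumes Tika's default media type registry): an unrecognised "+xml" subtype falls back to
// application/xml, and most other types eventually reach application/octet-stream, which is
// where the fallback parser takes over.
MediaTypeRegistry defaultRegistry = MediaTypeRegistry.getDefaultRegistry();
MediaType rss = MediaType.parse("application/rss+xml");
MediaType xmlParent = defaultRegistry.getSupertype(rss);                   // application/xml
MediaType binParent = defaultRegistry.getSupertype(MediaType.TEXT_PLAIN);  // application/octet-stream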
/*
 * (non-Javadoc)
 * @see org.alfresco.repo.rendition.executer.AbstractRenderingEngine#render(org.alfresco.repo.rendition.executer.AbstractRenderingEngine.RenderingContext)
 */
@Override
protected void render(RenderingContext context) {
    ContentReader contentReader = context.makeContentReader();
    String sourceMimeType = contentReader.getMimetype();

    // Check that Tika supports the supplied file
    AutoDetectParser p = new AutoDetectParser(tikaConfig);
    MediaType sourceMediaType = MediaType.parse(sourceMimeType);
    if (!p.getParsers().containsKey(sourceMediaType)) {
        throw new RenditionServiceException(
            "Source mime type of "
                + sourceMimeType
                + " is not supported by Tika for HTML conversions");
    }

    // Make the HTML Version using Tika
    // This will also extract out any images as found
    generateHTML(p, context);
}
/*
 * Check that if Tesseract is not found, the TesseractOCRParser claims to not support any file
 * types. So, the standard image parser is called instead.
 */
@Test
public void offersNoTypesIfNotFound() throws Exception {
    TesseractOCRParser parser = new TesseractOCRParser();
    DefaultParser defaultParser = new DefaultParser();
    MediaType png = MediaType.image("png");

    // With an invalid path, will offer no types
    TesseractOCRConfig invalidConfig = new TesseractOCRConfig();
    invalidConfig.setTesseractPath("/made/up/path");
    ParseContext parseContext = new ParseContext();
    parseContext.set(TesseractOCRConfig.class, invalidConfig);

    // No types offered
    assertEquals(0, parser.getSupportedTypes(parseContext).size());

    // And DefaultParser won't use us
    assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
}
private static Set<MediaType> mediaTypesListFromDomElement(Element node, String tag)
        throws TikaException, IOException {
    Set<MediaType> types = null;
    NodeList children = node.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
        Node cNode = children.item(i);
        if (cNode instanceof Element) {
            Element cElement = (Element) cNode;
            if (tag.equals(cElement.getTagName())) {
                String mime = getText(cElement);
                MediaType type = MediaType.parse(mime);
                if (type != null) {
                    if (types == null) {
                        types = new HashSet<>();
                    }
                    types.add(type);
                } else {
                    throw new TikaException("Invalid media type name: " + mime);
                }
            }
        }
    }
    if (types != null) {
        return types;
    }
    return Collections.emptySet();
}
public MediaTypeFilter() {
    this.mediaTypes =
        MediaType.set(MediaType.TEXT_HTML, MediaType.TEXT_PLAIN, MediaType.APPLICATION_XML);
}
/** * HTML parser. Uses TagSoup to turn the input document to HTML SAX events, and post-processes the * events to produce XHTML and metadata expected by Tika clients. */ public class HtmlParser extends AbstractParser { /** Serial version UID */ private static final long serialVersionUID = 7895315240498733128L; private static final MediaType XHTML = MediaType.application("xhtml+xml"); private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml"); private static final MediaType X_ASP = MediaType.application("x-asp"); private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( new HashSet<MediaType>(Arrays.asList(MediaType.text("html"), XHTML, WAP_XHTML, X_ASP))); private static final ServiceLoader LOADER = new ServiceLoader(HtmlParser.class.getClassLoader()); /** HTML schema singleton used to amortise the heavy instantiation time. */ private static final Schema HTML_SCHEMA = new HTMLSchema(); public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // Automatically detect the character encoding AutoDetectReader reader = new AutoDetectReader( new CloseShieldInputStream(stream), metadata, context.get(ServiceLoader.class, LOADER)); try { Charset charset = reader.getCharset(); String previous = metadata.get(Metadata.CONTENT_TYPE); MediaType contentType = null; if (previous == null || previous.startsWith("text/html")) { contentType = new MediaType(MediaType.TEXT_HTML, charset); } else if (previous.startsWith("application/xhtml+xml")) { contentType = new MediaType(XHTML, charset); } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) { contentType = new MediaType(WAP_XHTML, charset); } else if (previous.startsWith("application/x-asp")) { contentType = new MediaType(X_ASP, charset); } if (contentType != null) { metadata.set(Metadata.CONTENT_TYPE, contentType.toString()); } // deprecated, see TIKA-431 metadata.set(Metadata.CONTENT_ENCODING, charset.name()); // Get the HTML mapper from the parse context HtmlMapper mapper = context.get(HtmlMapper.class, new HtmlParserMapper()); // Parse the HTML document org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser(); // Use schema from context or default Schema schema = context.get(Schema.class, HTML_SCHEMA); // TIKA-528: Reuse share schema to avoid heavy instantiation parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema); // TIKA-599: Shared schema is thread-safe only if bogons are ignored parser.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true); parser.setContentHandler( new XHTMLDowngradeHandler(new HtmlHandler(mapper, handler, metadata))); parser.parse(reader.asInputSource()); } finally { reader.close(); } } /** * Maps "safe" HTML element names to semantic XHTML equivalents. If the given element is unknown * or deemed unsafe for inclusion in the parse output, then this method returns <code>null</code> * and the element will be ignored but the content inside it is still processed. See the {@link * #isDiscardElement(String)} method for a way to discard the entire contents of an element. * * <p>Subclasses can override this method to customize the default mapping. 
* * @param name HTML element name (upper case) * @return XHTML element name (lower case), or <code>null</code> if the element is unsafe * @since Apache Tika 0.5 * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method * will be removed in Tika 1.0. */ protected String mapSafeElement(String name) { return DefaultHtmlMapper.INSTANCE.mapSafeElement(name); } /** * Checks whether all content within the given HTML element should be discarded instead of * including it in the parse output. Subclasses can override this method to customize the set of * discarded elements. * * @param name HTML element name (upper case) * @return <code>true</code> if content inside the named element should be ignored, <code>false * </code> otherwise * @since Apache Tika 0.5 * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method * will be removed in Tika 1.0. */ protected boolean isDiscardElement(String name) { return DefaultHtmlMapper.INSTANCE.isDiscardElement(name); } /** * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method * will be removed in Tika 1.0. */ public String mapSafeAttribute(String elementName, String attributeName) { return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName); } /** * Adapter class that maintains backwards compatibility with the protected HtmlParser methods. * Making HtmlParser implement HtmlMapper directly would require those methods to be public, which * would break backwards compatibility with subclasses. * * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This class will * be removed in Tika 1.0. */ private class HtmlParserMapper implements HtmlMapper { public String mapSafeElement(String name) { return HtmlParser.this.mapSafeElement(name); } public boolean isDiscardElement(String name) { return HtmlParser.this.isDiscardElement(name); } public String mapSafeAttribute(String elementName, String attributeName) { return HtmlParser.this.mapSafeAttribute(elementName, attributeName); } } }
@Override public void load( SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception { Parser parser = null; String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null); if (streamType != null) { // Cache? Parsers are lightweight to construct and thread-safe, so I'm told MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT)); parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt); } else { parser = autoDetectParser; } if (parser != null) { Metadata metadata = new Metadata(); // If you specify the resource name (the filename, roughly) with this parameter, // then Tika can make use of it in guessing the appropriate MIME type: String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null); if (resourceName != null) { metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName); } // Provide stream's content type as hint for auto detection if (stream.getContentType() != null) { metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType()); } InputStream inputStream = null; try { inputStream = stream.getStream(); metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName()); metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo()); metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize())); metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType()); // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType()); if (charset != null) { metadata.add(HttpHeaders.CONTENT_ENCODING, charset); } String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema()); ContentHandler parsingHandler = handler; StringWriter writer = null; BaseMarkupSerializer serializer = null; if (extractOnly == true) { String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml"); writer = new StringWriter(); if (extractFormat.equals(TEXT_FORMAT)) { serializer = new TextSerializer(); serializer.setOutputCharStream(writer); serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true)); } else { serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true)); } if (xpathExpr != null) { Matcher matcher = PARSER.parse(xpathExpr); serializer .startDocument(); // The MatchingContentHandler does not invoke startDocument. See // http://tika.markmail.org/message/kknu3hw7argwiqin parsingHandler = new MatchingContentHandler(serializer, matcher); } else { parsingHandler = serializer; } } else if (xpathExpr != null) { Matcher matcher = PARSER.parse(xpathExpr); parsingHandler = new MatchingContentHandler(handler, matcher); } // else leave it as is try { // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler // for getting the document. 
ParseContext context = parseContextConfig.create(); context.set(Parser.class, parser); context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE); // Password handling RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider(); String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE); if (pwMapFile != null && pwMapFile.length() > 0) { InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile); if (is != null) { log.debug("Password file supplied: " + pwMapFile); epp.parse(is); } } context.set(PasswordProvider.class, epp); String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD); if (resourcePassword != null) { epp.setExplicitPassword(resourcePassword); log.debug("Literal password supplied for file " + resourceName); } parser.parse(inputStream, parsingHandler, metadata, context); } catch (TikaException e) { if (ignoreTikaException) log.warn( new StringBuilder("skip extracting text due to ") .append(e.getLocalizedMessage()) .append(". metadata=") .append(metadata.toString()) .toString()); else throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } if (extractOnly == false) { addDoc(handler); } else { // serializer is not null, so we need to call endDoc on it if using xpath if (xpathExpr != null) { serializer.endDocument(); } rsp.add(stream.getName(), writer.toString()); writer.close(); String[] names = metadata.names(); NamedList metadataNL = new NamedList(); for (int i = 0; i < names.length; i++) { String[] vals = metadata.getValues(names[i]); metadataNL.add(names[i], vals); } rsp.add(stream.getName() + "_metadata", metadataNL); } } catch (SAXException e) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } finally { IOUtils.closeQuietly(inputStream); } } else { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers. Please supply the " + ExtractingParams.STREAM_TYPE + " parameter."); } }
public void parseEmbedded( InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) throws SAXException, IOException { String name = metadata.get(Metadata.RESOURCE_NAME_KEY); if (name == null) { name = "file" + count++; } MediaType contentType = detector.detect(inputStream, metadata); if (name.indexOf('.') == -1 && contentType != null) { try { name += config.getMimeRepository().forName(contentType.toString()).getExtension(); } catch (MimeTypeException e) { e.printStackTrace(); } } String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID); if (relID != null && !name.startsWith(relID)) { name = relID + "_" + name; } File outputFile = new File(extractDir, name); File parent = outputFile.getParentFile(); if (!parent.exists()) { if (!parent.mkdirs()) { throw new IOException("unable to create directory \"" + parent + "\""); } } System.out.println("Extracting '" + name + "' (" + contentType + ") to " + outputFile); FileOutputStream os = null; try { os = new FileOutputStream(outputFile); if (inputStream instanceof TikaInputStream) { TikaInputStream tin = (TikaInputStream) inputStream; if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) { POIFSFileSystem fs = new POIFSFileSystem(); copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot()); fs.writeFilesystem(os); } else { IOUtils.copy(inputStream, os); } } else { IOUtils.copy(inputStream, os); } } catch (Exception e) { logger.warn("Ignoring unexpected exception trying to save embedded file " + name, e); } finally { if (os != null) { os.close(); } } }
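// For reference, a minimal sketch of the extension lookup used in parseEmbedded above
// (config.getMimeRepository() is assumed to return Tika's MimeTypes registry):
try {
    MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes();
    String extension = mimeTypes.forName("image/jpeg").getExtension(); // ".jpg"
} catch (MimeTypeException e) {
    // thrown for an unknown or malformed media type name
}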
/** * Pulls together {@link IPortalDataType}, {@link IDataUpgrader}, and {@link IDataImporter} * implementations to handle data upgrade, import, export and removal operations. * * @author Eric Dalquist */ @Service("portalDataHandlerService") public class JaxbPortalDataHandlerService implements IPortalDataHandlerService { /** Tracks the base import directory to allow for easier to read logging when importing */ private static final ThreadLocal<String> IMPORT_BASE_DIR = new ThreadLocal<String>(); private static final String REPORT_FORMAT = "%s,%s,%.2fms\n"; private static final MediaType MT_JAVA_ARCHIVE = MediaType.application("java-archive"); private static final MediaType MT_CPIO = MediaType.application("x-cpio"); private static final MediaType MT_AR = MediaType.application("x-archive"); private static final MediaType MT_TAR = MediaType.application("x-tar"); private static final MediaType MT_BZIP2 = MediaType.application("x-bzip2"); private static final MediaType MT_GZIP = MediaType.application("x-gzip"); private static final MediaType MT_PACK200 = MediaType.application("x-java-pack200"); private static final MediaType MT_XZ = MediaType.application("x-xz"); protected final Logger logger = LoggerFactory.getLogger(getClass()); // Order in which data must be imported private List<PortalDataKey> dataKeyImportOrder = Collections.emptyList(); // Map to lookup the associated IPortalDataType for each known PortalDataKey private Map<PortalDataKey, IPortalDataType> dataKeyTypes = Collections.emptyMap(); // Ant path matcher patterns that a file must match when scanning directories (unless a pattern is // explicitly specified) private Set<String> dataFileIncludes = Collections.emptySet(); private Set<String> dataFileExcludes = ImmutableSet.copyOf(DirectoryScanner.getDefaultExcludes()); // Data upgraders mapped by PortalDataKey private Map<PortalDataKey, IDataUpgrader> portalDataUpgraders = Collections.emptyMap(); // Data importers mapped by PortalDataKey private Map<PortalDataKey, IDataImporter<Object>> portalDataImporters = Collections.emptyMap(); // ExportAll data types private Set<IPortalDataType> exportAllPortalDataTypes = null; // All portal data types available for export private Set<IPortalDataType> exportPortalDataTypes = Collections.emptySet(); // Data exporters mapped by IPortalDateType#getTypeId() private Map<String, IDataExporter<Object>> portalDataExporters = Collections.emptyMap(); // All portal data types available for delete private Set<IPortalDataType> deletePortalDataTypes = Collections.emptySet(); // Data deleters mapped by IPortalDateType#getTypeId() private Map<String, IDataDeleter<Object>> portalDataDeleters = Collections.emptyMap(); private org.apereo.portal.utils.DirectoryScanner directoryScanner; private ExecutorService importExportThreadPool; private XmlUtilities xmlUtilities; private long maxWait = -1; private TimeUnit maxWaitTimeUnit = TimeUnit.MILLISECONDS; @Autowired public void setXmlUtilities(XmlUtilities xmlUtilities) { this.xmlUtilities = xmlUtilities; } @Autowired public void setImportExportThreadPool( @Qualifier("importExportThreadPool") ExecutorService importExportThreadPool) { this.importExportThreadPool = importExportThreadPool; this.directoryScanner = new ConcurrentDirectoryScanner(this.importExportThreadPool); } /** Maximum time to wait for an import, export, or delete to execute. */ public void setMaxWait(long maxWait) { this.maxWait = maxWait; } /** {@link TimeUnit} for {@link #setMaxWait(long)} value. 
*/ public void setMaxWaitTimeUnit(TimeUnit maxWaitTimeUnit) { this.maxWaitTimeUnit = maxWaitTimeUnit; } /** Order in which data types should be imported. */ @javax.annotation.Resource(name = "dataTypeImportOrder") public void setDataTypeImportOrder(List<IPortalDataType> dataTypeImportOrder) { final ArrayList<PortalDataKey> dataKeyImportOrder = new ArrayList<PortalDataKey>(dataTypeImportOrder.size() * 2); final Map<PortalDataKey, IPortalDataType> dataKeyTypes = new LinkedHashMap<PortalDataKey, IPortalDataType>(dataTypeImportOrder.size() * 2); for (final IPortalDataType portalDataType : dataTypeImportOrder) { final List<PortalDataKey> supportedDataKeys = portalDataType.getDataKeyImportOrder(); for (final PortalDataKey portalDataKey : supportedDataKeys) { dataKeyImportOrder.add(portalDataKey); dataKeyTypes.put(portalDataKey, portalDataType); } } dataKeyImportOrder.trimToSize(); this.dataKeyImportOrder = Collections.unmodifiableList(dataKeyImportOrder); this.dataKeyTypes = Collections.unmodifiableMap(dataKeyTypes); } /** Ant path matching patterns that files must match to be included */ @javax.annotation.Resource(name = "dataFileIncludes") public void setDataFileIncludes(Set<String> dataFileIncludes) { this.dataFileIncludes = dataFileIncludes; } /** * Ant path matching patterns that exclude matched files. Defaults to {@link * DirectoryScanner#addDefaultExcludes()} */ public void setDataFileExcludes(Set<String> dataFileExcludes) { this.dataFileExcludes = dataFileExcludes; } /** {@link IDataImporter} implementations to delegate import operations to. */ @SuppressWarnings("unchecked") @Autowired(required = false) public void setDataImporters(Collection<IDataImporter<? extends Object>> dataImporters) { final Map<PortalDataKey, IDataImporter<Object>> dataImportersMap = new LinkedHashMap<PortalDataKey, IDataImporter<Object>>(); for (final IDataImporter<?> dataImporter : dataImporters) { try { final Set<PortalDataKey> importDataKeys = dataImporter.getImportDataKeys(); for (final PortalDataKey importDataKey : importDataKeys) { this.logger.debug( "Registering IDataImporter for '{}' - {}", new Object[] {importDataKey, dataImporter}); final IDataImporter<Object> existing = dataImportersMap.put(importDataKey, (IDataImporter<Object>) dataImporter); if (existing != null) { this.logger.warn( "Duplicate IDataImporter PortalDataKey for {} Replacing {} with {}", new Object[] {importDataKey, existing, dataImporter}); } } } catch (Exception exception) { logger.error("Failed to register data importer {}.", dataImporter, exception); } } this.portalDataImporters = Collections.unmodifiableMap(dataImportersMap); } /** {@link IDataExporter} implementations to delegate export operations to. */ @SuppressWarnings("unchecked") @Autowired(required = false) public void setDataExporters(Collection<IDataExporter<? 
extends Object>> dataExporters) { final Map<String, IDataExporter<Object>> dataExportersMap = new LinkedHashMap<String, IDataExporter<Object>>(); final Set<IPortalDataType> portalDataTypes = new LinkedHashSet<IPortalDataType>(); for (final IDataExporter<?> dataExporter : dataExporters) { try { final IPortalDataType portalDataType = dataExporter.getPortalDataType(); final String typeId = portalDataType.getTypeId(); this.logger.debug( "Registering IDataExporter for '{}' - {}", new Object[] {typeId, dataExporter}); final IDataExporter<Object> existing = dataExportersMap.put(typeId, (IDataExporter<Object>) dataExporter); if (existing != null) { this.logger.warn( "Duplicate IDataExporter typeId for {} Replacing {} with {}", new Object[] {typeId, existing, dataExporter}); } portalDataTypes.add(portalDataType); } catch (Exception exception) { logger.error("Failed to register data exporter {}.", dataExporter, exception); } } this.portalDataExporters = Collections.unmodifiableMap(dataExportersMap); this.exportPortalDataTypes = Collections.unmodifiableSet(portalDataTypes); } /** * Optional set of all portal data types to export. If not specified all available portal data * types will be listed. */ @javax.annotation.Resource(name = "exportAllPortalDataTypes") public void setExportAllPortalDataTypes(Set<IPortalDataType> exportAllPortalDataTypes) { this.exportAllPortalDataTypes = ImmutableSet.copyOf(exportAllPortalDataTypes); } /** {@link IDataDeleter} implementations to delegate delete operations to. */ @SuppressWarnings("unchecked") @Autowired(required = false) public void setDataDeleters(Collection<IDataDeleter<? extends Object>> dataDeleters) { final Map<String, IDataDeleter<Object>> dataDeletersMap = new LinkedHashMap<String, IDataDeleter<Object>>(); final Set<IPortalDataType> portalDataTypes = new LinkedHashSet<IPortalDataType>(); for (final IDataDeleter<?> dataDeleter : dataDeleters) { try { final IPortalDataType portalDataType = dataDeleter.getPortalDataType(); final String typeId = portalDataType.getTypeId(); this.logger.debug( "Registering IDataDeleter for '{}' - {}", new Object[] {typeId, dataDeleter}); final IDataDeleter<Object> existing = dataDeletersMap.put(typeId, (IDataDeleter<Object>) dataDeleter); if (existing != null) { this.logger.warn( "Duplicate IDataDeleter typeId for {} Replacing {} with {}", new Object[] {typeId, existing, dataDeleter}); } portalDataTypes.add(portalDataType); } catch (Exception exception) { logger.error("Failed to register data deleter {}.", dataDeleter, exception); } } this.portalDataDeleters = Collections.unmodifiableMap(dataDeletersMap); this.deletePortalDataTypes = Collections.unmodifiableSet(portalDataTypes); } /** {@link IDataUpgrader} implementations to delegate upgrade operations to. 
*/ @Autowired(required = false) public void setDataUpgraders(Collection<IDataUpgrader> dataUpgraders) { final Map<PortalDataKey, IDataUpgrader> dataUpgraderMap = new LinkedHashMap<PortalDataKey, IDataUpgrader>(); for (final IDataUpgrader dataUpgrader : dataUpgraders) { try { final Set<PortalDataKey> upgradeDataKeys = dataUpgrader.getSourceDataTypes(); for (final PortalDataKey upgradeDataKey : upgradeDataKeys) { this.logger.debug( "Registering IDataUpgrader for '{}' - {}", upgradeDataKey, dataUpgrader); final IDataUpgrader existing = dataUpgraderMap.put(upgradeDataKey, dataUpgrader); if (existing != null) { this.logger.warn( "Duplicate IDataUpgrader PortalDataKey for {} Replacing {} with {}", new Object[] {upgradeDataKey, existing, dataUpgrader}); } } } catch (Exception exception) { logger.error("Failed to register data upgrader {}.", dataUpgrader, exception); } } this.portalDataUpgraders = Collections.unmodifiableMap(dataUpgraderMap); } @Override public void importDataArchive(Resource archive, BatchImportOptions options) { try { importDataArchive(archive, archive.getInputStream(), options); } catch (IOException e) { throw new RuntimeException("Could not load InputStream for resource: " + archive, e); } } protected void importDataArchive( Resource archive, InputStream resourceStream, BatchImportOptions options) { BufferedInputStream bufferedResourceStream = null; try { // Make sure the stream is buffered if (resourceStream instanceof BufferedInputStream) { bufferedResourceStream = (BufferedInputStream) resourceStream; } else { bufferedResourceStream = new BufferedInputStream(resourceStream); } // Buffer up to 100MB, bad things will happen if we bust this buffer. // TODO see if there is a buffered stream that will write to a file once the buffer fills up bufferedResourceStream.mark(100 * 1024 * 1024); final MediaType type = getMediaType(bufferedResourceStream, archive.getFilename()); if (MT_JAVA_ARCHIVE.equals(type)) { final ArchiveInputStream archiveStream = new JarArchiveInputStream(bufferedResourceStream); importDataArchive(archive, archiveStream, options); } else if (MediaType.APPLICATION_ZIP.equals(type)) { final ArchiveInputStream archiveStream = new ZipArchiveInputStream(bufferedResourceStream); importDataArchive(archive, archiveStream, options); } else if (MT_CPIO.equals(type)) { final ArchiveInputStream archiveStream = new CpioArchiveInputStream(bufferedResourceStream); importDataArchive(archive, archiveStream, options); } else if (MT_AR.equals(type)) { final ArchiveInputStream archiveStream = new ArArchiveInputStream(bufferedResourceStream); importDataArchive(archive, archiveStream, options); } else if (MT_TAR.equals(type)) { final ArchiveInputStream archiveStream = new TarArchiveInputStream(bufferedResourceStream); importDataArchive(archive, archiveStream, options); } else if (MT_BZIP2.equals(type)) { final CompressorInputStream compressedStream = new BZip2CompressorInputStream(bufferedResourceStream); importDataArchive(archive, compressedStream, options); } else if (MT_GZIP.equals(type)) { final CompressorInputStream compressedStream = new GzipCompressorInputStream(bufferedResourceStream); importDataArchive(archive, compressedStream, options); } else if (MT_PACK200.equals(type)) { final CompressorInputStream compressedStream = new Pack200CompressorInputStream(bufferedResourceStream); importDataArchive(archive, compressedStream, options); } else if (MT_XZ.equals(type)) { final CompressorInputStream compressedStream = new XZCompressorInputStream(bufferedResourceStream); 
importDataArchive(archive, compressedStream, options); } else { throw new RuntimeException("Unrecognized archive media type: " + type); } } catch (IOException e) { throw new RuntimeException("Could not load InputStream for resource: " + archive, e); } finally { IOUtils.closeQuietly(bufferedResourceStream); } } /** Extracts the archive resource and then runs the batch-import process on it. */ protected void importDataArchive( final Resource resource, final ArchiveInputStream resourceStream, BatchImportOptions options) { final File tempDir = Files.createTempDir(); try { ArchiveEntry archiveEntry; while ((archiveEntry = resourceStream.getNextEntry()) != null) { final File entryFile = new File(tempDir, archiveEntry.getName()); if (archiveEntry.isDirectory()) { entryFile.mkdirs(); } else { entryFile.getParentFile().mkdirs(); Files.copy( new InputSupplier<InputStream>() { @Override public InputStream getInput() throws IOException { return new CloseShieldInputStream(resourceStream); } }, entryFile); } } importDataDirectory(tempDir, null, options); } catch (IOException e) { throw new RuntimeException( "Failed to extract data from '" + resource + "' to '" + tempDir + "' for batch import.", e); } finally { FileUtils.deleteQuietly(tempDir); } } protected MediaType getMediaType(BufferedInputStream inputStream, String fileName) throws IOException { final TikaInputStream tikaInputStreamStream = TikaInputStream.get(new CloseShieldInputStream(inputStream)); try { final Detector detector = new DefaultDetector(); final Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); final MediaType type = detector.detect(tikaInputStreamStream, metadata); logger.debug("Determined '{}' for '{}'", type, fileName); return type; } catch (IOException e) { logger.warn("Failed to determine media type for '" + fileName + "' assuming XML", e); return null; } finally { IOUtils.closeQuietly(tikaInputStreamStream); // Reset the buffered stream to make up for anything read by the detector inputStream.reset(); } } @Override public void importDataDirectory( File directory, String pattern, final BatchImportOptions options) { if (!directory.exists()) { throw new IllegalArgumentException( "The specified directory '" + directory + "' does not exist"); } // Create the file filter to use when searching for files to import final FileFilter fileFilter; if (pattern != null) { fileFilter = new AntPatternFileFilter(true, false, pattern, this.dataFileExcludes); } else { fileFilter = new AntPatternFileFilter(true, false, this.dataFileIncludes, this.dataFileExcludes); } // Determine the parent directory to log to final File logDirectory = determineLogDirectory(options, "import"); // Setup reporting file final File importReport = new File(logDirectory, "data-import.txt"); final PrintWriter reportWriter; try { reportWriter = new PrintWriter(new PeriodicFlushingBufferedWriter(500, new FileWriter(importReport))); } catch (IOException e) { throw new RuntimeException("Failed to create FileWriter for: " + importReport, e); } // Convert directory to URI String to provide better logging output final URI directoryUri = directory.toURI(); final String directoryUriStr = directoryUri.toString(); IMPORT_BASE_DIR.set(directoryUriStr); try { // Scan the specified directory for files to import logger.info("Scanning for files to Import from: {}", directory); final PortalDataKeyFileProcessor fileProcessor = new PortalDataKeyFileProcessor(this.dataKeyTypes, options); this.directoryScanner.scanDirectoryNoResults(directory, fileFilter, 
fileProcessor); final long resourceCount = fileProcessor.getResourceCount(); logger.info("Found {} files to Import from: {}", resourceCount, directory); // See if the import should fail on error final boolean failOnError = options != null ? options.isFailOnError() : true; // Map of files to import, grouped by type final ConcurrentMap<PortalDataKey, Queue<Resource>> dataToImport = fileProcessor.getDataToImport(); // Import the data files for (final PortalDataKey portalDataKey : this.dataKeyImportOrder) { final Queue<Resource> files = dataToImport.remove(portalDataKey); if (files == null) { continue; } final Queue<ImportFuture<?>> importFutures = new LinkedList<ImportFuture<?>>(); final List<FutureHolder<?>> failedFutures = new LinkedList<FutureHolder<?>>(); final int fileCount = files.size(); logger.info("Importing {} files of type {}", fileCount, portalDataKey); reportWriter.println(portalDataKey + "," + fileCount); while (!files.isEmpty()) { final Resource file = files.poll(); // Check for completed futures on every iteration, needed to fail as fast as possible on // an import exception final List<FutureHolder<?>> newFailed = waitForFutures(importFutures, reportWriter, logDirectory, false); failedFutures.addAll(newFailed); final AtomicLong importTime = new AtomicLong(-1); // Create import task final Callable<Object> task = new CallableWithoutResult() { @Override protected void callWithoutResult() { IMPORT_BASE_DIR.set(directoryUriStr); importTime.set(System.nanoTime()); try { importData(file, portalDataKey); } finally { importTime.set(System.nanoTime() - importTime.get()); IMPORT_BASE_DIR.remove(); } } }; // Submit the import task final Future<?> importFuture = this.importExportThreadPool.submit(task); // Add the future for tracking importFutures.offer(new ImportFuture(importFuture, file, portalDataKey, importTime)); } // Wait for all of the imports on of this type to complete final List<FutureHolder<?>> newFailed = waitForFutures(importFutures, reportWriter, logDirectory, true); failedFutures.addAll(newFailed); if (failOnError && !failedFutures.isEmpty()) { throw new RuntimeException( failedFutures.size() + " " + portalDataKey + " entities failed to import.\n\n" + "\tPer entity exception logs and a full report can be found in " + logDirectory + "\n"); } reportWriter.flush(); } if (!dataToImport.isEmpty()) { throw new IllegalStateException( "The following PortalDataKeys are not listed in the dataTypeImportOrder List: " + dataToImport.keySet()); } logger.info("For a detailed report on the data import see " + importReport); } catch (InterruptedException e) { throw new RuntimeException("Interrupted while waiting for entities to import", e); } finally { IOUtils.closeQuietly(reportWriter); IMPORT_BASE_DIR.remove(); } } /** Determine directory to log import/export reports to */ private File determineLogDirectory(final BatchOptions options, String operation) { File logDirectoryParent = options != null ? 
options.getLogDirectoryParent() : null; if (logDirectoryParent == null) { logDirectoryParent = Files.createTempDir(); } File logDirectory = new File(logDirectoryParent, "data-" + operation + "-reports"); try { logDirectory = logDirectory.getCanonicalFile(); FileUtils.deleteDirectory(logDirectory); } catch (IOException e) { throw new RuntimeException( "Failed to clean data-" + operation + " log directory: " + logDirectory, e); } logDirectory.mkdirs(); return logDirectory; } @Override public void importData(final Resource resource) { this.importData(resource, null); } @Override public void importData(Source source) { this.importData(source, null); } @Override public final void importData(final Source source, PortalDataKey portalDataKey) { // Get a StAX reader for the source to determine info about the data to import final BufferedXMLEventReader bufferedXmlEventReader = createSourceXmlEventReader(source); // If no PortalDataKey was passed build it from the source if (portalDataKey == null) { final StartElement rootElement = StaxUtils.getRootElement(bufferedXmlEventReader); portalDataKey = new PortalDataKey(rootElement); bufferedXmlEventReader.reset(); } final String systemId = source.getSystemId(); // Post Process the PortalDataKey to see if more complex import operations are needed final IPortalDataType portalDataType = this.dataKeyTypes.get(portalDataKey); if (portalDataType == null) { throw new RuntimeException( "No IPortalDataType configured for " + portalDataKey + ", the resource will be ignored: " + getPartialSystemId(systemId)); } final Set<PortalDataKey> postProcessedPortalDataKeys = portalDataType.postProcessPortalDataKey(systemId, portalDataKey, bufferedXmlEventReader); bufferedXmlEventReader.reset(); // If only a single result from post processing import if (postProcessedPortalDataKeys.size() == 1) { this.importOrUpgradeData( systemId, DataAccessUtils.singleResult(postProcessedPortalDataKeys), bufferedXmlEventReader); } // If multiple results from post processing ordering is needed else { // Iterate over the data key order list to run the imports in the correct order for (final PortalDataKey orderedPortalDataKey : this.dataKeyImportOrder) { if (postProcessedPortalDataKeys.contains(orderedPortalDataKey)) { // Reset the to start of the XML document for each import/upgrade call bufferedXmlEventReader.reset(); this.importOrUpgradeData(systemId, orderedPortalDataKey, bufferedXmlEventReader); } } } } /** * @param portalDataKey Optional PortalDataKey to use, useful for batch imports where * post-processing of keys has already take place */ protected final void importData(final Resource resource, final PortalDataKey portalDataKey) { final InputStream resourceStream; try { resourceStream = resource.getInputStream(); } catch (IOException e) { throw new RuntimeException("Could not load InputStream for resource: " + resource, e); } try { final String resourceUri = ResourceUtils.getResourceUri(resource); this.importData(new StreamSource(resourceStream, resourceUri), portalDataKey); } finally { IOUtils.closeQuietly(resourceStream); } } protected String getPartialSystemId(String systemId) { final String directoryUriStr = IMPORT_BASE_DIR.get(); if (directoryUriStr == null) { return systemId; } if (systemId.startsWith(directoryUriStr)) { return systemId.substring(directoryUriStr.length()); } return systemId; } /** Run the import/update process on the data */ protected final void importOrUpgradeData( String systemId, PortalDataKey portalDataKey, XMLEventReader xmlEventReader) { // See if there is a 
registered importer for the data, if so import final IDataImporter<Object> dataImporterExporter = this.portalDataImporters.get(portalDataKey); if (dataImporterExporter != null) { this.logger.debug("Importing: {}", getPartialSystemId(systemId)); final Object data = unmarshallData(xmlEventReader, dataImporterExporter); dataImporterExporter.importData(data); this.logger.info("Imported : {}", getPartialSystemId(systemId)); return; } // No importer, see if there is an upgrader, if so upgrade final IDataUpgrader dataUpgrader = this.portalDataUpgraders.get(portalDataKey); if (dataUpgrader != null) { this.logger.debug("Upgrading: {}", getPartialSystemId(systemId)); // Convert the StAX stream to a DOM node, due to poor JDK support for StAX with XSLT final Node sourceNode; try { sourceNode = xmlUtilities.convertToDom(xmlEventReader); } catch (XMLStreamException e) { throw new RuntimeException("Failed to convert the original XML reader to a DOM node", e); } final DOMSource source = new DOMSource(sourceNode); final DOMResult result = new DOMResult(); final boolean doImport = dataUpgrader.upgradeData(source, result); if (doImport) { // If the upgrader didn't handle the import as well, wrap the result DOM in a new Source and // start the import process over again final org.w3c.dom.Node node = result.getNode(); final PortalDataKey upgradedPortalDataKey = new PortalDataKey(node); if (this.logger.isTraceEnabled()) { this.logger.trace( "Upgraded: " + getPartialSystemId(systemId) + " to " + upgradedPortalDataKey + "\n\nSource XML: \n" + XmlUtilitiesImpl.toString(source.getNode()) + "\n\nResult XML: \n" + XmlUtilitiesImpl.toString(node)); } else { this.logger.info( "Upgraded: {} to {}", getPartialSystemId(systemId), upgradedPortalDataKey); } final DOMSource upgradedSource = new DOMSource(node, systemId); this.importData(upgradedSource, upgradedPortalDataKey); } else { this.logger.info("Upgraded and Imported: {}", getPartialSystemId(systemId)); } return; } // No importer or upgrader found, fail throw new IllegalArgumentException( "Provided data " + portalDataKey + " has no registered importer or upgrader support: " + systemId); } protected Object unmarshallData( final XMLEventReader bufferedXmlEventReader, final IDataImporter<Object> dataImporterExporter) { final Unmarshaller unmarshaller = dataImporterExporter.getUnmarshaller(); try { final StAXSource source = new StAXSource(bufferedXmlEventReader); return unmarshaller.unmarshal(source); } catch (XmlMappingException e) { throw new RuntimeException("Failed to map provided XML to portal data", e); } catch (IOException e) { throw new RuntimeException("Failed to read the provided XML data", e); } catch (XMLStreamException e) { throw new RuntimeException("Failed to create StAX Source to read XML data", e); } } protected BufferedXMLEventReader createSourceXmlEventReader(final Source source) { // If it is a StAXSource, see if we can do better handling of it if (source instanceof StAXSource) { final StAXSource staxSource = (StAXSource) source; XMLEventReader xmlEventReader = staxSource.getXMLEventReader(); if (xmlEventReader != null) { if (xmlEventReader instanceof BufferedXMLEventReader) { final BufferedXMLEventReader bufferedXMLEventReader = (BufferedXMLEventReader) xmlEventReader; bufferedXMLEventReader.reset(); bufferedXMLEventReader.mark(-1); return bufferedXMLEventReader; } return new BufferedXMLEventReader(xmlEventReader, -1); } } final XMLInputFactory xmlInputFactory = this.xmlUtilities.getXmlInputFactory(); final XMLEventReader xmlEventReader; try { 
xmlEventReader = xmlInputFactory.createXMLEventReader(source); } catch (XMLStreamException e) { throw new RuntimeException("Failed to create XML Event Reader for data Source", e); } return new BufferedXMLEventReader(xmlEventReader, -1); } @Override public Iterable<IPortalDataType> getExportPortalDataTypes() { return this.exportPortalDataTypes; } @Override public Iterable<IPortalDataType> getDeletePortalDataTypes() { return this.deletePortalDataTypes; } @Override public Iterable<? extends IPortalData> getPortalData(String typeId) { final IDataExporter<Object> dataImporterExporter = getPortalDataExporter(typeId); return dataImporterExporter.getPortalData(); } @Override public String exportData(String typeId, String dataId, Result result) { final IDataExporter<Object> portalDataExporter = this.getPortalDataExporter(typeId); final Object data = portalDataExporter.exportData(dataId); if (data == null) { return null; } final Marshaller marshaller = portalDataExporter.getMarshaller(); try { marshaller.marshal(data, result); return portalDataExporter.getFileName(data); } catch (XmlMappingException e) { throw new RuntimeException("Failed to map provided portal data to XML", e); } catch (IOException e) { throw new RuntimeException("Failed to write the provided XML data", e); } } @Override public boolean exportData(String typeId, String dataId, File directory) { directory.mkdirs(); final File exportTempFile; try { exportTempFile = File.createTempFile( SafeFilenameUtils.makeSafeFilename(StringUtils.rightPad(dataId, 2, '-') + "-"), SafeFilenameUtils.makeSafeFilename("." + typeId), directory); } catch (IOException e) { throw new RuntimeException( "Could not create temp file to export " + typeId + " " + dataId, e); } try { final String fileName = this.exportData(typeId, dataId, new StreamResult(exportTempFile)); if (fileName == null) { logger.info("Skipped: type={} id={}", typeId, dataId); return false; } final File destFile = new File(directory, fileName + "." + typeId + ".xml"); if (destFile.exists()) { logger.warn( "Exporting " + typeId + " " + dataId + " but destination file already exists, it will be overwritten: " + destFile); destFile.delete(); } FileUtils.moveFile(exportTempFile, destFile); logger.info("Exported: {}", destFile); return true; } catch (Exception e) { if (e instanceof RuntimeException) { throw (RuntimeException) e; } throw new RuntimeException("Failed to export " + typeId + " " + dataId, e); } finally { FileUtils.deleteQuietly(exportTempFile); } } @Override public void exportAllDataOfType(Set<String> typeIds, File directory, BatchExportOptions options) { final Queue<ExportFuture<?>> exportFutures = new ConcurrentLinkedQueue<ExportFuture<?>>(); final boolean failOnError = options != null ? options.isFailOnError() : true; // Determine the parent directory to log to final File logDirectory = determineLogDirectory(options, "export"); // Setup reporting file final File exportReport = new File(logDirectory, "data-export.txt"); final PrintWriter reportWriter; try { reportWriter = new PrintWriter(new BufferedWriter(new FileWriter(exportReport))); } catch (IOException e) { throw new RuntimeException("Failed to create FileWriter for: " + exportReport, e); } try { for (final String typeId : typeIds) { final List<FutureHolder<?>> failedFutures = new LinkedList<FutureHolder<?>>(); final File typeDir = new File(directory, typeId); logger.info("Adding all data of type {} to export queue: {}", typeId, typeDir); reportWriter.println(typeId + "," + typeDir); final Iterable<? 
extends IPortalData> dataForType = this.getPortalData(typeId); for (final IPortalData data : dataForType) { final String dataId = data.getDataId(); // Check for completed futures on every iteration, needed to fail as fast as possible on // an export exception final List<FutureHolder<?>> newFailed = waitForFutures(exportFutures, reportWriter, logDirectory, false); failedFutures.addAll(newFailed); final AtomicLong exportTime = new AtomicLong(-1); // Create export task Callable<Object> task = new CallableWithoutResult() { @Override protected void callWithoutResult() { exportTime.set(System.nanoTime()); try { exportData(typeId, dataId, typeDir); } finally { exportTime.set(System.nanoTime() - exportTime.get()); } } }; // Submit the export task final Future<?> exportFuture = this.importExportThreadPool.submit(task); // Add the future for tracking final ExportFuture futureHolder = new ExportFuture(exportFuture, typeId, dataId, exportTime); exportFutures.offer(futureHolder); } final List<FutureHolder<?>> newFailed = waitForFutures(exportFutures, reportWriter, logDirectory, true); failedFutures.addAll(newFailed); reportWriter.flush(); if (failOnError && !failedFutures.isEmpty()) { throw new RuntimeException( failedFutures.size() + " " + typeId + " entities failed to export.\n" + "\tPer entity exception logs and a full report can be found in " + logDirectory); } } } catch (InterruptedException e) { throw new RuntimeException("Interrupted while waiting for entities to export", e); } finally { IOUtils.closeQuietly(reportWriter); } } @Override public void exportAllData(File directory, BatchExportOptions options) { final Set<IPortalDataType> portalDataTypes; if (this.exportAllPortalDataTypes != null) { portalDataTypes = this.exportAllPortalDataTypes; } else { portalDataTypes = this.exportPortalDataTypes; } final Set<String> typeIds = new LinkedHashSet<String>(); for (final IPortalDataType portalDataType : portalDataTypes) { typeIds.add(portalDataType.getTypeId()); } this.exportAllDataOfType(typeIds, directory, options); } protected IDataExporter<Object> getPortalDataExporter(String typeId) { final IDataExporter<Object> dataExporter = this.portalDataExporters.get(typeId); if (dataExporter == null) { throw new IllegalArgumentException("No IDataExporter exists for: " + typeId); } return dataExporter; } @Override public void deleteData(String typeId, String dataId) { final IDataDeleter<Object> dataDeleter = this.portalDataDeleters.get(typeId); if (dataDeleter == null) { throw new IllegalArgumentException("No IDataDeleter exists for: " + typeId); } final Object data = dataDeleter.deleteData(dataId); if (data != null) { logger.info("Deleted data " + dataId + " of type " + typeId); } else { logger.info("No data " + dataId + " of type " + typeId + " exists to delete"); } } /** * Used by batch import and export to wait for queued tasks to complete. Handles fail-fast * behavior if any of the tasks threw an exception by canceling all queued futures and logging a * summary of the failures. All completed futures are removed from the queue. * * @param futures Queued futures to check for completeness * @param reportWriter Writer that receives a one-line status entry for each completed future * @param reportDirectory Directory where a per-entity error report is written for each failure * @param wait If true it will wait for all futures to complete, if false only check for completed * futures * @return a list of futures that either threw exceptions or timed out */ protected List<FutureHolder<?>> waitForFutures( final Queue<? 
extends FutureHolder<?>> futures, final PrintWriter reportWriter, final File reportDirectory, final boolean wait) throws InterruptedException { final List<FutureHolder<?>> failedFutures = new LinkedList<FutureHolder<?>>(); for (Iterator<? extends FutureHolder<?>> futuresItr = futures.iterator(); futuresItr.hasNext(); ) { final FutureHolder<?> futureHolder = futuresItr.next(); // If waiting, or if not waiting but the future is already done, do the get final Future<?> future = futureHolder.getFuture(); if (wait || (!wait && future.isDone())) { futuresItr.remove(); try { // Don't bother doing a get() on canceled futures if (!future.isCancelled()) { if (this.maxWait > 0) { future.get(this.maxWait, this.maxWaitTimeUnit); } else { future.get(); } reportWriter.printf( REPORT_FORMAT, "SUCCESS", futureHolder.getDescription(), futureHolder.getExecutionTimeMillis()); } } catch (CancellationException e) { // Ignore cancellation exceptions } catch (ExecutionException e) { logger.error("Failed: " + futureHolder); futureHolder.setError(e); failedFutures.add(futureHolder); reportWriter.printf( REPORT_FORMAT, "FAIL", futureHolder.getDescription(), futureHolder.getExecutionTimeMillis()); try { final String dataReportName = SafeFilenameUtils.makeSafeFilename( futureHolder.getDataType() + "_" + futureHolder.getDataName() + ".txt"); final File dataReportFile = new File(reportDirectory, dataReportName); final PrintWriter dataReportWriter = new PrintWriter(new BufferedWriter(new FileWriter(dataReportFile))); try { dataReportWriter.println( "FAIL: " + futureHolder.getDataType() + " - " + futureHolder.getDataName()); dataReportWriter.println( "--------------------------------------------------------------------------------"); e.getCause().printStackTrace(dataReportWriter); } finally { IOUtils.closeQuietly(dataReportWriter); } } catch (Exception re) { logger.warn( "Failed to write error report for failed " + futureHolder + ", logging root failure here", e.getCause()); } } catch (TimeoutException e) { logger.warn("Failed: " + futureHolder); futureHolder.setError(e); failedFutures.add(futureHolder); future.cancel(true); reportWriter.printf( REPORT_FORMAT, "TIMEOUT", futureHolder.getDescription(), futureHolder.getExecutionTimeMillis()); } } } return failedFutures; } private abstract static class FutureHolder<T> { private final Future<T> future; private final AtomicLong time; private Exception error; public FutureHolder(Future<T> future, AtomicLong time) { this.future = future; this.time = time; } public Future<T> getFuture() { return this.future; } public double getExecutionTimeMillis() { final long t = time.get(); if (!future.isDone()) { // Still running: report the elapsed time so far, converted to milliseconds return (System.nanoTime() - t) / 1000000.0; } return t / 1000000.0; } public Exception getError() { return error; } public void setError(Exception error) { this.error = error; } public abstract String getDescription(); public abstract String getDataType(); public abstract String getDataName(); } private static class ImportFuture<T> extends FutureHolder<T> { private final Resource resource; private final PortalDataKey dataKey; public ImportFuture( Future<T> future, Resource resource, PortalDataKey dataKey, AtomicLong importTime) { super(future, importTime); this.resource = resource; this.dataKey = dataKey; } @Override public String getDescription() { return this.resource.getDescription(); } @Override public String getDataType() { return dataKey.getName().getLocalPart(); } @Override public String getDataName() { return this.resource.getFilename(); } @Override public String toString() { return 
"importing " + this.getDescription(); } } private static class ExportFuture<T> extends FutureHolder<T> { private final String typeId; private final String dataId; public ExportFuture(Future<T> future, String typeId, String dataId, AtomicLong exportTime) { super(future, exportTime); this.typeId = typeId; this.dataId = dataId; } @Override public String getDescription() { return "type=" + this.typeId + ", dataId=" + this.dataId; } @Override public String getDataType() { return this.typeId; } @Override public String getDataName() { return this.dataId; } @Override public String toString() { return "exporting " + this.getDescription(); } } }
public ParseResult getParse(Content content) { String mimeType = content.getContentType(); URL base; try { base = new URL(content.getBaseUrl()); } catch (MalformedURLException e) { return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf()); } // get the right parser using the mime type as a clue Parser parser = tikaConfig.getParser(MediaType.parse(mimeType)); byte[] raw = content.getContent(); if (parser == null) { String message = "Can't retrieve Tika parser for mime-type " + mimeType; LOG.error(message); return new ParseStatus(ParseStatus.FAILED, message) .getEmptyParseResult(content.getUrl(), getConf()); } LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType); Metadata tikamd = new Metadata(); HTMLDocumentImpl doc = new HTMLDocumentImpl(); doc.setErrorChecking(false); DocumentFragment root = doc.createDocumentFragment(); DOMBuilder domhandler = new DOMBuilder(doc, root); ParseContext context = new ParseContext(); try { parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context); } catch (Exception e) { LOG.error("Error parsing " + content.getUrl(), e); return new ParseStatus(ParseStatus.FAILED, e.getMessage()) .getEmptyParseResult(content.getUrl(), getConf()); } HTMLMetaTags metaTags = new HTMLMetaTags(); String text = ""; String title = ""; Outlink[] outlinks = new Outlink[0]; org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata(); // we have converted the sax events generated by Tika into a DOM object // so we can now use the usual HTML resources from Nutch // get meta directives HTMLMetaProcessor.getMetaTags(metaTags, root, base); if (LOG.isTraceEnabled()) { LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); } // check meta directives if (!metaTags.getNoIndex()) { // okay to index StringBuffer sb = new StringBuffer(); if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); } utils.getText(sb, root); // extract text text = sb.toString(); sb.setLength(0); if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); } utils.getTitle(sb, root); // extract title title = sb.toString().trim(); } if (!metaTags.getNoFollow()) { // okay to follow links ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks URL baseTag = utils.getBase(root); if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); } utils.getOutlinks(baseTag != null ? baseTag : base, l, root); outlinks = l.toArray(new Outlink[l.size()]); if (LOG.isTraceEnabled()) { LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl()); } } // populate Nutch metadata with Tika metadata String[] TikaMDNames = tikamd.names(); for (String tikaMDName : TikaMDNames) { if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) continue; // TODO what if multivalued? nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName)); } // no outlinks? 
try OutlinkExtractor; it works e.g. for mime types with no // explicit markup for anchors if (outlinks.length == 0) { outlinks = OutlinkExtractor.getOutlinks(text, getConf()); } ParseStatus status = new ParseStatus(ParseStatus.SUCCESS); if (metaTags.getRefresh()) { status.setMinorCode(ParseStatus.SUCCESS_REDIRECT); status.setArgs( new String[] { metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime()) }); } ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata); ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData)); // run filters on parse ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root); if (metaTags.getNoCache()) { // not okay to cache for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy); } return filteredParse; }
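The getParse method above routes Tika's SAX output into a DOM so Nutch's HTML utilities can extract title, text, and outlinks. When no DOM or outlink handling is needed, the same Tika parse call can feed a plain-text handler directly. Below is a minimal sketch assuming only the standard Tika API (AutoDetectParser, BodyContentHandler, Metadata); the SimpleTextExtractor class and its extractText method are hypothetical helpers, not part of the Nutch plugin above.

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

public class SimpleTextExtractor {

    /** Extracts plain text from raw fetched bytes, letting Tika auto-detect the format. */
    public static String extractText(byte[] raw, String resourceName) throws Exception {
        final AutoDetectParser parser = new AutoDetectParser();
        final BodyContentHandler handler = new BodyContentHandler(-1); // -1 disables the write limit
        final Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, resourceName); // name hint helps detection
        final InputStream stream = new ByteArrayInputStream(raw);
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }
        return handler.toString();
    }
}

Whatever metadata Tika discovers during parsing ends up on the Metadata object, much like the tikamd instance that getParse copies into Nutch's own metadata.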