@SuppressWarnings("serial") public class SiteMapTikaParser implements Parser { private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( new HashSet<MediaType>( Arrays.asList( MediaType.text("xml"), MediaType.application("xml"), MediaType.application("x-xml"), MediaType.application("atom+xml"), MediaType.application("rss+xml"), MediaType.text("plain"), MediaType.application("gzip"), MediaType.application("x-gzip"), MediaType.application("x-gunzip"), MediaType.application("gzipped"), MediaType.application("gzip-compressed"), MediaType.application("x-compress"), MediaType.application("octet-stream")))); // SiteMapParser _parser; public SiteMapTikaParser() { // _parser = new SiteMapParser(); } @Override public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } @Override public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { if (metadata.get(Metadata.CONTENT_TYPE) == null) { metadata.set(Metadata.CONTENT_TYPE, "application/xml"); } final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.startElement("p"); xhtml.endElement("p"); xhtml.endDocument(); } /** @deprecated This method will be removed in Apache Tika 1.0. */ public void parse(InputStream stream, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException { parse(stream, handler, metadata, new ParseContext()); } }
/** * HTML parser. Uses TagSoup to turn the input document to HTML SAX events, and post-processes the * events to produce XHTML and metadata expected by Tika clients. */ public class HtmlParser extends AbstractParser { /** Serial version UID */ private static final long serialVersionUID = 7895315240498733128L; private static final MediaType XHTML = MediaType.application("xhtml+xml"); private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml"); private static final MediaType X_ASP = MediaType.application("x-asp"); private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( new HashSet<MediaType>(Arrays.asList(MediaType.text("html"), XHTML, WAP_XHTML, X_ASP))); private static final ServiceLoader LOADER = new ServiceLoader(HtmlParser.class.getClassLoader()); /** HTML schema singleton used to amortise the heavy instantiation time. */ private static final Schema HTML_SCHEMA = new HTMLSchema(); public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // Automatically detect the character encoding AutoDetectReader reader = new AutoDetectReader( new CloseShieldInputStream(stream), metadata, context.get(ServiceLoader.class, LOADER)); try { Charset charset = reader.getCharset(); String previous = metadata.get(Metadata.CONTENT_TYPE); MediaType contentType = null; if (previous == null || previous.startsWith("text/html")) { contentType = new MediaType(MediaType.TEXT_HTML, charset); } else if (previous.startsWith("application/xhtml+xml")) { contentType = new MediaType(XHTML, charset); } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) { contentType = new MediaType(WAP_XHTML, charset); } else if (previous.startsWith("application/x-asp")) { contentType = new MediaType(X_ASP, charset); } if (contentType != null) { metadata.set(Metadata.CONTENT_TYPE, contentType.toString()); } // deprecated, see TIKA-431 metadata.set(Metadata.CONTENT_ENCODING, charset.name()); // Get the HTML mapper from the parse context HtmlMapper mapper = context.get(HtmlMapper.class, new HtmlParserMapper()); // Parse the HTML document org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser(); // Use schema from context or default Schema schema = context.get(Schema.class, HTML_SCHEMA); // TIKA-528: Reuse share schema to avoid heavy instantiation parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema); // TIKA-599: Shared schema is thread-safe only if bogons are ignored parser.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true); parser.setContentHandler( new XHTMLDowngradeHandler(new HtmlHandler(mapper, handler, metadata))); parser.parse(reader.asInputSource()); } finally { reader.close(); } } /** * Maps "safe" HTML element names to semantic XHTML equivalents. If the given element is unknown * or deemed unsafe for inclusion in the parse output, then this method returns <code>null</code> * and the element will be ignored but the content inside it is still processed. See the {@link * #isDiscardElement(String)} method for a way to discard the entire contents of an element. * * <p>Subclasses can override this method to customize the default mapping. * * @param name HTML element name (upper case) * @return XHTML element name (lower case), or <code>null</code> if the element is unsafe * @since Apache Tika 0.5 * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method * will be removed in Tika 1.0. */ protected String mapSafeElement(String name) { return DefaultHtmlMapper.INSTANCE.mapSafeElement(name); } /** * Checks whether all content within the given HTML element should be discarded instead of * including it in the parse output. Subclasses can override this method to customize the set of * discarded elements. * * @param name HTML element name (upper case) * @return <code>true</code> if content inside the named element should be ignored, <code>false * </code> otherwise * @since Apache Tika 0.5 * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method * will be removed in Tika 1.0. */ protected boolean isDiscardElement(String name) { return DefaultHtmlMapper.INSTANCE.isDiscardElement(name); } /** * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method * will be removed in Tika 1.0. */ public String mapSafeAttribute(String elementName, String attributeName) { return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName); } /** * Adapter class that maintains backwards compatibility with the protected HtmlParser methods. * Making HtmlParser implement HtmlMapper directly would require those methods to be public, which * would break backwards compatibility with subclasses. * * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This class will * be removed in Tika 1.0. */ private class HtmlParserMapper implements HtmlMapper { public String mapSafeElement(String name) { return HtmlParser.this.mapSafeElement(name); } public boolean isDiscardElement(String name) { return HtmlParser.this.isDiscardElement(name); } public String mapSafeAttribute(String elementName, String attributeName) { return HtmlParser.this.mapSafeAttribute(elementName, attributeName); } } }