Example #1
0
/**
 * {@link Parser} implementation that declares support for a fixed set of XML, feed, plain-text,
 * gzip and generic binary media types.
 *
 * <p>NOTE(review): the current {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}
 * implementation never reads the input stream; it only defaults the content type and emits a
 * document containing a single empty {@code p} element. A {@code SiteMapParser} delegate was
 * sketched in earlier commented-out code but is not wired in.
 */
@SuppressWarnings("serial")
public class SiteMapTikaParser implements Parser {
  /** Immutable set of media types returned by {@link #getSupportedTypes(ParseContext)}. */
  private static final Set<MediaType> SUPPORTED_TYPES =
      Collections.unmodifiableSet(
          new HashSet<MediaType>(
              Arrays.asList(
                  MediaType.text("xml"),
                  MediaType.application("xml"),
                  MediaType.application("x-xml"),
                  MediaType.application("atom+xml"),
                  MediaType.application("rss+xml"),
                  MediaType.text("plain"),
                  MediaType.application("gzip"),
                  MediaType.application("x-gzip"),
                  MediaType.application("x-gunzip"),
                  MediaType.application("gzipped"),
                  MediaType.application("gzip-compressed"),
                  MediaType.application("x-compress"),
                  MediaType.application("octet-stream"))));

  /** Creates a parser instance; no state is initialised. */
  public SiteMapTikaParser() {}

  /**
   * Returns the media types this parser declares support for.
   *
   * @param context parse context (not consulted)
   * @return the same unmodifiable set for every call
   */
  @Override
  public Set<MediaType> getSupportedTypes(ParseContext context) {
    return SUPPORTED_TYPES;
  }

  /**
   * Defaults the content type to {@code application/xml} when none is set, then emits a document
   * containing one empty {@code p} element to the given handler.
   *
   * @param stream input stream (currently not read)
   * @param handler receiver of the generated SAX events
   * @param metadata document metadata; its content type is set only if it was absent
   * @param context parse context (not consulted)
   * @throws IOException declared by the parser contract
   * @throws SAXException if the handler rejects an event
   * @throws TikaException declared by the parser contract
   */
  @Override
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    // Only fill in a content type when the caller has not provided one.
    if (metadata.get(Metadata.CONTENT_TYPE) == null) {
      metadata.set(Metadata.CONTENT_TYPE, "application/xml");
    }

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    // Placeholder output: an empty paragraph, no content extracted from the stream.
    xhtml.startElement("p");

    xhtml.endElement("p");
    xhtml.endDocument();
  }

  /**
   * Convenience overload that parses with a fresh, empty {@link ParseContext}.
   *
   * @param stream input stream, passed through unchanged
   * @param handler receiver of the generated SAX events
   * @param metadata document metadata, passed through unchanged
   * @throws IOException propagated from the four-argument overload
   * @throws SAXException propagated from the four-argument overload
   * @throws TikaException propagated from the four-argument overload
   * @deprecated This method will be removed in Apache Tika 1.0.
   */
  @Deprecated
  public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
      throws IOException, SAXException, TikaException {
    parse(stream, handler, metadata, new ParseContext());
  }
}
Example #2
0
/**
 * HTML parser. Uses TagSoup to turn the input document to HTML SAX events, and post-processes the
 * events to produce XHTML and metadata expected by Tika clients.
 */
public class HtmlParser extends AbstractParser {

  /** Serial version UID */
  private static final long serialVersionUID = 7895315240498733128L;

  private static final MediaType XHTML = MediaType.application("xhtml+xml");
  private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
  private static final MediaType X_ASP = MediaType.application("x-asp");

  /** Immutable set of media types returned by {@link #getSupportedTypes(ParseContext)}. */
  private static final Set<MediaType> SUPPORTED_TYPES =
      Collections.unmodifiableSet(
          new HashSet<MediaType>(Arrays.asList(MediaType.text("html"), XHTML, WAP_XHTML, X_ASP)));

  /** Default service loader, used when the parse context does not supply one. */
  private static final ServiceLoader LOADER = new ServiceLoader(HtmlParser.class.getClassLoader());

  /** HTML schema singleton used to amortise the heavy instantiation time. */
  private static final Schema HTML_SCHEMA = new HTMLSchema();

  /**
   * Returns the media types this parser declares support for.
   *
   * @param context parse context (not consulted)
   * @return the same unmodifiable set for every call
   */
  public Set<MediaType> getSupportedTypes(ParseContext context) {
    return SUPPORTED_TYPES;
  }

  /**
   * Parses an HTML document and forwards the resulting SAX events to the given handler.
   *
   * <p>The character encoding is detected with an {@link AutoDetectReader}. When the existing
   * content type is absent or starts with one of the supported type names, it is rewritten to
   * include the detected charset; any other existing content type is left untouched. The detected
   * charset name is always stored as the content encoding. The {@link HtmlMapper} and {@link
   * Schema} are taken from the parse context when present, otherwise defaults are used.
   *
   * @param stream input stream; it is wrapped in a {@code CloseShieldInputStream} before being
   *     handed to the reader, and the reader is closed when parsing finishes or fails
   * @param handler receiver of the generated SAX events
   * @param metadata document metadata; content type and content encoding may be updated
   * @param context parse context consulted for {@code ServiceLoader}, {@link HtmlMapper} and
   *     {@link Schema} overrides
   * @throws IOException if the document cannot be read
   * @throws SAXException if the SAX events cannot be processed
   * @throws TikaException if the document cannot be parsed
   */
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    // Automatically detect the character encoding
    AutoDetectReader reader =
        new AutoDetectReader(
            new CloseShieldInputStream(stream), metadata, context.get(ServiceLoader.class, LOADER));
    try {
      Charset charset = reader.getCharset();
      String previous = metadata.get(Metadata.CONTENT_TYPE);
      // Stays null for unrecognised content types, so those are not overwritten below.
      MediaType contentType = null;
      if (previous == null || previous.startsWith("text/html")) {
        contentType = new MediaType(MediaType.TEXT_HTML, charset);
      } else if (previous.startsWith("application/xhtml+xml")) {
        contentType = new MediaType(XHTML, charset);
      } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
        contentType = new MediaType(WAP_XHTML, charset);
      } else if (previous.startsWith("application/x-asp")) {
        contentType = new MediaType(X_ASP, charset);
      }
      if (contentType != null) {
        metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
      }
      // deprecated, see TIKA-431
      metadata.set(Metadata.CONTENT_ENCODING, charset.name());

      // Get the HTML mapper from the parse context
      HtmlMapper mapper = context.get(HtmlMapper.class, new HtmlParserMapper());

      // Parse the HTML document
      org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();

      // Use schema from context or default
      Schema schema = context.get(Schema.class, HTML_SCHEMA);

      // TIKA-528: Reuse shared schema to avoid heavy instantiation
      parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
      // TIKA-599: Shared schema is thread-safe only if bogons are ignored
      parser.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

      parser.setContentHandler(
          new XHTMLDowngradeHandler(new HtmlHandler(mapper, handler, metadata)));

      parser.parse(reader.asInputSource());
    } finally {
      reader.close();
    }
  }

  /**
   * Maps "safe" HTML element names to semantic XHTML equivalents. If the given element is unknown
   * or deemed unsafe for inclusion in the parse output, then this method returns <code>null</code>
   * and the element will be ignored but the content inside it is still processed. See the {@link
   * #isDiscardElement(String)} method for a way to discard the entire contents of an element.
   *
   * <p>Subclasses can override this method to customize the default mapping.
   *
   * @param name HTML element name (upper case)
   * @return XHTML element name (lower case), or <code>null</code> if the element is unsafe
   * @since Apache Tika 0.5
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method
   *     will be removed in Tika 1.0.
   */
  @Deprecated
  protected String mapSafeElement(String name) {
    return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
  }

  /**
   * Checks whether all content within the given HTML element should be discarded instead of
   * including it in the parse output. Subclasses can override this method to customize the set of
   * discarded elements.
   *
   * @param name HTML element name (upper case)
   * @return <code>true</code> if content inside the named element should be ignored, <code>false
   *     </code> otherwise
   * @since Apache Tika 0.5
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method
   *     will be removed in Tika 1.0.
   */
  @Deprecated
  protected boolean isDiscardElement(String name) {
    return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
  }

  /**
   * Maps an attribute of the given element by delegating to the default HTML mapper. Subclasses
   * can override this method to customize the default mapping.
   *
   * @param elementName name of the element that carries the attribute
   * @param attributeName name of the attribute to map
   * @return whatever {@code DefaultHtmlMapper.INSTANCE.mapSafeAttribute} returns for the pair
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method
   *     will be removed in Tika 1.0.
   */
  @Deprecated
  public String mapSafeAttribute(String elementName, String attributeName) {
    return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
  }

  /**
   * Adapter class that maintains backwards compatibility with the protected HtmlParser methods.
   * Making HtmlParser implement HtmlMapper directly would require those methods to be public, which
   * would break backwards compatibility with subclasses.
   *
   * <p>This is intentionally a non-static inner class: each call is forwarded to the enclosing
   * parser instance so that subclass overrides take effect.
   *
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This class will
   *     be removed in Tika 1.0.
   */
  @Deprecated
  private class HtmlParserMapper implements HtmlMapper {
    public String mapSafeElement(String name) {
      return HtmlParser.this.mapSafeElement(name);
    }

    public boolean isDiscardElement(String name) {
      return HtmlParser.this.isDiscardElement(name);
    }

    public String mapSafeAttribute(String elementName, String attributeName) {
      return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
    }
  }
}