Java MediaType.text Examples

Programming Language: Java

Namespace/Package Name: org.apache.tika.mime

Class/Type: MediaType

Method/Function: text

Examples at hotexamples.com: 2

Java MediaType.text - 2 examples found. These are the top-rated real-world Java examples of org.apache.tika.mime.MediaType.text, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

toString(14)

parse(10)

application(9)

getSubtype(4)

equals(3)

getType(3)

image(3)

getBaseType(2)

text(2)

getParameters(1)

set(1)

Example #1

Show file

File: SiteMapTikaParser.java Project: thaingo/ptd

/**
 * Tika {@link Parser} that declares support for a fixed list of XML, plain-text, gzip-style and
 * generic binary media types.
 *
 * <p>As written, parsing does not read the input stream: it only fills in a default content type
 * when none is present and emits a document containing a single empty {@code p} element.
 */
@SuppressWarnings("serial")
public class SiteMapTikaParser implements Parser {

  /** Read-only set of the media types reported by {@link #getSupportedTypes(ParseContext)}. */
  private static final Set<MediaType> SUPPORTED_TYPES = buildSupportedTypes();

  /** Creates a parser; there is no per-instance state to initialise. */
  public SiteMapTikaParser() {}

  /**
   * Assembles the unmodifiable set of supported media types.
   *
   * @return the supported media types wrapped in an unmodifiable view
   */
  private static Set<MediaType> buildSupportedTypes() {
    final Set<MediaType> types =
        new HashSet<MediaType>(
            Arrays.asList(
                MediaType.text("xml"),
                MediaType.application("xml"),
                MediaType.application("x-xml"),
                MediaType.application("atom+xml"),
                MediaType.application("rss+xml"),
                MediaType.text("plain"),
                MediaType.application("gzip"),
                MediaType.application("x-gzip"),
                MediaType.application("x-gunzip"),
                MediaType.application("gzipped"),
                MediaType.application("gzip-compressed"),
                MediaType.application("x-compress"),
                MediaType.application("octet-stream")));
    return Collections.unmodifiableSet(types);
  }

  /**
   * Returns the media types this parser declares support for.
   *
   * @param context parse context (not consulted)
   * @return the shared, unmodifiable set of supported types
   */
  @Override
  public Set<MediaType> getSupportedTypes(ParseContext context) {
    return SUPPORTED_TYPES;
  }

  /**
   * Sets the content type to {@code application/xml} when the metadata has none, then writes a
   * document consisting of one empty {@code p} element to the handler.
   *
   * @param stream input stream (not read by this implementation)
   * @param handler receiver of the generated XHTML events
   * @param metadata document metadata; its content type is filled in when absent
   * @param context parse context (not consulted)
   * @throws IOException declared by the parser contract
   * @throws SAXException if the handler rejects an event
   * @throws TikaException declared by the parser contract
   */
  @Override
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    final boolean contentTypeMissing = metadata.get(Metadata.CONTENT_TYPE) == null;
    if (contentTypeMissing) {
      metadata.set(Metadata.CONTENT_TYPE, "application/xml");
    }

    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.startElement("p");
    document.endElement("p");
    document.endDocument();
  }

  /**
   * Parses using a freshly created, empty {@link ParseContext}.
   *
   * @deprecated This method will be removed in Apache Tika 1.0.
   */
  public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
      throws IOException, SAXException, TikaException {
    final ParseContext emptyContext = new ParseContext();
    parse(stream, handler, metadata, emptyContext);
  }
}

Example #2

Show file

File: HtmlParser.java Project: asitang/tika_pdf_celgene

/**
 * Parser for HTML content. The input is fed through TagSoup, which reports it as SAX events;
 * those events are post-processed into the XHTML and metadata that Tika clients expect.
 */
public class HtmlParser extends AbstractParser {

  /** Serial version UID */
  private static final long serialVersionUID = 7895315240498733128L;

  private static final MediaType XHTML = MediaType.application("xhtml+xml");
  private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
  private static final MediaType X_ASP = MediaType.application("x-asp");

  /** Read-only set of the media types returned by {@link #getSupportedTypes(ParseContext)}. */
  private static final Set<MediaType> SUPPORTED_TYPES =
      Collections.unmodifiableSet(
          new HashSet<MediaType>(
              Arrays.asList(
                  MediaType.text("html"),
                  XHTML,
                  WAP_XHTML,
                  X_ASP)));

  /** Fallback service loader, used when the parse context does not supply one. */
  private static final ServiceLoader LOADER = new ServiceLoader(HtmlParser.class.getClassLoader());

  /** HTML schema singleton used to amortise the heavy instantiation time. */
  private static final Schema HTML_SCHEMA = new HTMLSchema();

  /**
   * Returns the media types handled by this parser.
   *
   * @param context parse context (not consulted)
   * @return the shared, unmodifiable set of supported types
   */
  public Set<MediaType> getSupportedTypes(ParseContext context) {
    return SUPPORTED_TYPES;
  }

  /**
   * Picks the base media type that matches a previously recorded content type string.
   *
   * @param declared the content type already present in the metadata, possibly {@code null}
   * @return {@code text/html} when nothing was declared or the value starts with {@code
   *     text/html}; the matching XHTML, WAP XHTML or ASP type for those prefixes; {@code null}
   *     when the declared value matches none of them
   */
  private static MediaType baseTypeFor(String declared) {
    if (declared == null || declared.startsWith("text/html")) {
      return MediaType.TEXT_HTML;
    }
    if (declared.startsWith("application/xhtml+xml")) {
      return XHTML;
    }
    if (declared.startsWith("application/vnd.wap.xhtml+xml")) {
      return WAP_XHTML;
    }
    if (declared.startsWith("application/x-asp")) {
      return X_ASP;
    }
    return null;
  }

  /**
   * Detects the character encoding of the stream, records it in the metadata, and parses the
   * document with TagSoup, forwarding the resulting events to the given handler.
   *
   * @param stream the HTML input
   * @param handler receiver of the generated SAX events
   * @param metadata document metadata; content type and content encoding are updated
   * @param context parse context; may supply a {@code ServiceLoader}, {@code HtmlMapper} and
   *     {@code Schema} that override the defaults
   * @throws IOException if the stream cannot be read
   * @throws SAXException if event processing fails
   * @throws TikaException if the document cannot be parsed
   */
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    // Automatically detect the character encoding
    final InputStream shielded = new CloseShieldInputStream(stream);
    final ServiceLoader loader = context.get(ServiceLoader.class, LOADER);
    final AutoDetectReader reader = new AutoDetectReader(shielded, metadata, loader);
    try {
      final Charset charset = reader.getCharset();

      // Re-emit a recognised content type with the detected charset attached; an unrecognised
      // one is left untouched.
      final MediaType baseType = baseTypeFor(metadata.get(Metadata.CONTENT_TYPE));
      if (baseType != null) {
        final MediaType withCharset = new MediaType(baseType, charset);
        metadata.set(Metadata.CONTENT_TYPE, withCharset.toString());
      }
      // deprecated, see TIKA-431
      metadata.set(Metadata.CONTENT_ENCODING, charset.name());

      // The HTML mapper comes from the parse context, falling back to the legacy adapter
      final HtmlMapper mapper = context.get(HtmlMapper.class, new HtmlParserMapper());

      final org.ccil.cowan.tagsoup.Parser tagSoup = new org.ccil.cowan.tagsoup.Parser();

      // Schema comes from the parse context, falling back to the shared default
      final Schema schema = context.get(Schema.class, HTML_SCHEMA);

      // TIKA-528: Reuse shared schema to avoid heavy instantiation
      tagSoup.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
      // TIKA-599: Shared schema is thread-safe only if bogons are ignored
      tagSoup.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

      final HtmlHandler htmlHandler = new HtmlHandler(mapper, handler, metadata);
      tagSoup.setContentHandler(new XHTMLDowngradeHandler(htmlHandler));

      tagSoup.parse(reader.asInputSource());
    } finally {
      reader.close();
    }
  }

  /**
   * Translates a "safe" HTML element name into its semantic XHTML counterpart. A <code>null</code>
   * result means the element is unknown or considered unsafe: the element itself is dropped from
   * the parse output, but its content is still processed. To drop an element together with all of
   * its content, see {@link #isDiscardElement(String)}.
   *
   * <p>Subclasses can override this method to customize the default mapping.
   *
   * @param name HTML element name (upper case)
   * @return XHTML element name (lower case), or <code>null</code> if the element is unsafe
   * @since Apache Tika 0.5
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method
   *     will be removed in Tika 1.0.
   */
  protected String mapSafeElement(String name) {
    return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
  }

  /**
   * Tells whether everything inside the given HTML element should be left out of the parse output.
   * Subclasses can override this method to customize the set of discarded elements.
   *
   * @param name HTML element name (upper case)
   * @return <code>true</code> if content inside the named element should be ignored, <code>false
   *     </code> otherwise
   * @since Apache Tika 0.5
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method
   *     will be removed in Tika 1.0.
   */
  protected boolean isDiscardElement(String name) {
    return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
  }

  /**
   * Delegates attribute mapping to the default HTML mapper.
   *
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method
   *     will be removed in Tika 1.0.
   */
  public String mapSafeAttribute(String elementName, String attributeName) {
    return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
  }

  /**
   * Adapter that keeps the protected HtmlParser mapping methods working. If HtmlParser implemented
   * HtmlMapper itself, those methods would have to become public, which would break backwards
   * compatibility with subclasses; this inner class forwards to them instead.
   *
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This class will
   *     be removed in Tika 1.0.
   */
  private class HtmlParserMapper implements HtmlMapper {
    public String mapSafeElement(String name) {
      return HtmlParser.this.mapSafeElement(name);
    }

    public boolean isDiscardElement(String name) {
      return HtmlParser.this.isDiscardElement(name);
    }

    public String mapSafeAttribute(String elementName, String attributeName) {
      return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
    }
  }
}