Пример #1
0
 /**
  * Hands the document over to the component parser selected by {@code getParser(metadata,
  * context)}.
  *
  * <p>Before delegating, the class name of the selected parser is appended to the {@code
  * X-Parsed-By} metadata field; for a {@link ParserDecorator} the class name of the wrapped
  * parser is recorded instead.
  *
  * <p>Every {@link RuntimeException} thrown by the component parser is wrapped into a {@link
  * TikaException}. {@link IOException}s and {@link SAXException}s that are unrelated to the given
  * input stream and content handler are wrapped the same way, to better honor the {@link Parser}
  * contract. The {@link TemporaryResources} instance created for this call is disposed in a
  * {@code finally} block, whether or not parsing succeeded.
  *
  * @param stream the document stream to parse
  * @param handler receiver of the SAX events; when {@code null}, {@code null} is passed on to the
  *     component parser
  * @param metadata document metadata; used for parser selection and extended with {@code
  *     X-Parsed-By}
  * @param context parse context; used for parser selection and passed to the component parser
  * @throws IOException if the tagged stream rethrows an I/O failure from the component parser
  * @throws SAXException if the tagged handler rethrows a SAX failure from the component parser
  * @throws TikaException for every other failure of the component parser
  */
 public void parse(
     InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
     throws IOException, SAXException, TikaException {
   final Parser delegate = getParser(metadata, context);
   final TemporaryResources resources = new TemporaryResources();
   try {
     final TikaInputStream input = TikaInputStream.get(stream, resources);

     // A null handler is tolerated: it is forwarded as null instead of being wrapped.
     final TaggedContentHandler output;
     if (handler == null) {
       output = null;
     } else {
       output = new TaggedContentHandler(handler);
     }

     // Report the parser doing the real work, looking through a decorator if there is one.
     final String parsedBy =
         delegate instanceof ParserDecorator
             ? ((ParserDecorator) delegate).getWrappedParser().getClass().getName()
             : delegate.getClass().getName();
     metadata.add("X-Parsed-By", parsedBy);

     try {
       delegate.parse(input, output, metadata, context);
     } catch (RuntimeException failure) {
       throw new TikaException("Unexpected RuntimeException from " + delegate, failure);
     } catch (IOException failure) {
       // Give the tagged stream the chance to rethrow the exception as-is; otherwise wrap it.
       input.throwIfCauseOf(failure);
       throw new TikaException("TIKA-198: Illegal IOException from " + delegate, failure);
     } catch (SAXException failure) {
       // Same for the tagged handler, which only exists when a handler was supplied.
       if (output != null) {
         output.throwIfCauseOf(failure);
       }
       throw new TikaException("TIKA-237: Illegal SAXException from " + delegate, failure);
     }
   } finally {
     resources.dispose();
   }
 }
Пример #2
0
 /**
  * Runs the SAX parser obtained from the parse context over the given stream.
  *
  * <p>The supplied handler is wrapped in a {@link TaggedContentHandler}, passed through {@link
  * #getContentHandler(ContentHandler, Metadata, ParseContext)} (which subclasses may override to
  * add further wrappers) and finally wrapped in an {@link
  * org.apache.tika.sax.OfflineContentHandler} so that no namespace lookups are made. If the
  * metadata carries no content type yet, it is set to {@code application/xml} before parsing.
  *
  * @param stream that should be parsed
  * @param handler that will receive the SAX events
  * @param metadata of current document stream
  * @param context of current parse
  * @throws IOException if the stream cannot be read
  * @throws SAXException if the tagged handler rethrows a SAX failure raised during parsing
  * @throws TikaException if the XML parsing fails for any other SAX-related reason
  */
 public void parse(
     InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
     throws IOException, SAXException, TikaException {
   final TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);

   // Only supply a default content type when none has been recorded yet.
   final boolean contentTypeMissing = metadata.get(HttpHeaders.CONTENT_TYPE) == null;
   if (contentTypeMissing) {
     metadata.set(HttpHeaders.CONTENT_TYPE, "application/xml");
   }

   try {
     // The stream is handed to the SAX parser behind a CloseShieldInputStream
     // (presumably so the parser cannot close the caller's stream -- confirm against its docs).
     context
         .getSAXParser()
         .parse(
             new CloseShieldInputStream(stream),
             new OfflineContentHandler(getContentHandler(taggedHandler, metadata, context)));
   } catch (SAXException parseFailure) {
     // Let the tagged handler rethrow the exception as-is if it chooses to; otherwise wrap it.
     taggedHandler.throwIfCauseOf(parseFailure);
     throw new TikaException("XML parse error", parseFailure);
   }
 }
Пример #3
0
  /**
   * Parses the stream with the SAX parser from the parse context and emits the result through an
   * {@link XHTMLContentHandler} built around the given handler.
   *
   * <p>The XHTML document is started before parsing and ended in a {@code finally} block, so
   * {@code endDocument()} is called whether or not parsing succeeded.
   *
   * @param stream that should be parsed
   * @param handler that will receive the SAX events
   * @param metadata of current document stream; passed to {@code setContentType} first
   * @param context of current parse; supplies the SAX parser
   * @throws IOException if the stream cannot be read
   * @throws SAXException if the tagged handler rethrows a SAX failure, or if starting or ending
   *     the XHTML document fails
   * @throws TikaException if the XML parsing fails for any other SAX-related reason
   */
  @Override
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    setContentType(metadata);

    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();

    // Tag the XHTML handler so SAX failures can be offered back to it in the catch below.
    final TaggedContentHandler taggedDocument = new TaggedContentHandler(document);
    try {
      context
          .getSAXParser()
          .parse(
              new CloseShieldInputStream(stream),
              new OfflineContentHandler(
                  new EmbeddedContentHandler(
                      getContentHandler(taggedDocument, metadata, context))));
    } catch (SAXException parseFailure) {
      // Let the tagged handler rethrow the exception as-is if it chooses to; otherwise wrap it.
      taggedDocument.throwIfCauseOf(parseFailure);
      throw new TikaException("XML parse error", parseFailure);
    } finally {
      // NOTE(review): an exception thrown here replaces any exception already in flight.
      document.endDocument();
    }
  }