/** * Builds a Tika-compatible SAX content handler, which will be used to generate+capture the XHTML */ private ContentHandler buildContentHandler(Writer output, RenderingContext context) { // Create the main transformer SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); TransformerHandler handler; try { handler = factory.newTransformerHandler(); } catch (TransformerConfigurationException e) { throw new RenditionServiceException("SAX Processing isn't available - " + e); } handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); handler.setResult(new StreamResult(output)); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); // Change the image links as they go past String dirName = null, imgPrefix = null; if (context.getParamWithDefault(PARAM_IMAGES_SAME_FOLDER, false)) { imgPrefix = getImagesPrefixName(context); } else { dirName = getImagesDirectoryName(context); } ContentHandler contentHandler = new TikaImageRewritingContentHandler(handler, dirName, imgPrefix); // If required, wrap it to only return the body boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false); if (bodyOnly) { contentHandler = new BodyContentHandler(contentHandler); } // All done return contentHandler; }
private TikaImageExtractingParser(RenderingContext renderingContext) { this.renderingContext = renderingContext; // Our expected types types = new HashSet<MediaType>(); types.add(MediaType.image("bmp")); types.add(MediaType.image("gif")); types.add(MediaType.image("jpg")); types.add(MediaType.image("jpeg")); types.add(MediaType.image("png")); types.add(MediaType.image("tiff")); // Are images going in the same place as the HTML? if (renderingContext.getParamWithDefault(PARAM_IMAGES_SAME_FOLDER, false)) { RenditionLocation location = resolveRenditionLocation( renderingContext.getSourceNode(), renderingContext.getDefinition(), renderingContext.getDestinationNode()); imgFolder = location.getParentRef(); if (logger.isDebugEnabled()) { logger.debug("Using imgFolder: " + imgFolder); } } }
/** What prefix should be applied to the name of images? */ private String getImagesPrefixName(RenderingContext context) { if (context.getParamWithDefault(PARAM_IMAGES_SAME_FOLDER, false)) { // Prefix with the name of the source node return getHtmlBaseName(context) + "_"; } else { // They have their own folder, so no prefix is needed return ""; } }
/** Asks Tika to translate the contents into HTML */ private void generateHTML(Parser p, RenderingContext context) { ContentReader contentReader = context.makeContentReader(); // Setup things to parse with StringWriter sw = new StringWriter(); ContentHandler handler = buildContentHandler(sw, context); // Tell Tika what we're dealing with Metadata metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, contentReader.getMimetype()); metadata.set( Metadata.RESOURCE_NAME_KEY, nodeService.getProperty(context.getSourceNode(), ContentModel.PROP_NAME).toString()); // Our parse context needs to extract images ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, new TikaImageExtractingParser(context)); // Parse try { p.parse(contentReader.getContentInputStream(), handler, metadata, parseContext); } catch (Exception e) { throw new RenditionServiceException("Tika HTML Conversion Failed", e); } // As a string String html = sw.toString(); // If we're doing body-only, remove all the html namespaces // that will otherwise clutter up the document boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false); if (bodyOnly) { html = html.replaceAll("<\\?xml.*?\\?>", ""); html = html.replaceAll("<p xmlns=\"http://www.w3.org/1999/xhtml\"", "<p"); html = html.replaceAll("<h(\\d) xmlns=\"http://www.w3.org/1999/xhtml\"", "<h\\1"); html = html.replaceAll("<div xmlns=\"http://www.w3.org/1999/xhtml\"", "<div"); html = html.replaceAll("<table xmlns=\"http://www.w3.org/1999/xhtml\"", "<table"); html = html.replaceAll(" ", ""); } // Save it ContentWriter contentWriter = context.makeContentWriter(); contentWriter.setMimetype("text/html"); contentWriter.putContent(html); }