@Override protected boolean doProcess(Record record, InputStream inputStream) { Parser parser = detectParser(record); if (parser == null) { return false; } ParseContext parseContext = new ParseContext(); parseContext.set(Locale.class, locale); Metadata metadata = new Metadata(); for (Entry<String, Object> entry : record.getFields().entries()) { metadata.add(entry.getKey(), entry.getValue().toString()); } SolrContentHandler handler = solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema); try { inputStream = TikaInputStream.get(inputStream); ContentHandler parsingHandler = handler; // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()"; if (xpathExpr != null) { Matcher matcher = PARSER.parse(xpathExpr); parsingHandler = new MatchingContentHandler(parsingHandler, matcher); } try { parser.parse(inputStream, parsingHandler, metadata, parseContext); } catch (IOException e) { throw new MorphlineRuntimeException("Cannot parse", e); } catch (SAXException e) { throw new MorphlineRuntimeException("Cannot parse", e); } catch (TikaException e) { throw new MorphlineRuntimeException("Cannot parse", e); } } finally { if (inputStream != null) { Closeables.closeQuietly(inputStream); } } SolrInputDocument doc = handler.newDocument(); LOG.debug("solr doc: {}", doc); Record outputRecord = toRecord(doc); return getChild().process(outputRecord); }
@Override public void load( SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception { Parser parser = null; String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null); if (streamType != null) { // Cache? Parsers are lightweight to construct and thread-safe, so I'm told MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT)); parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt); } else { parser = autoDetectParser; } if (parser != null) { Metadata metadata = new Metadata(); // If you specify the resource name (the filename, roughly) with this parameter, // then Tika can make use of it in guessing the appropriate MIME type: String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null); if (resourceName != null) { metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName); } // Provide stream's content type as hint for auto detection if (stream.getContentType() != null) { metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType()); } InputStream inputStream = null; try { inputStream = stream.getStream(); metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName()); metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo()); metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize())); metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType()); // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType()); if (charset != null) { metadata.add(HttpHeaders.CONTENT_ENCODING, charset); } String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema()); ContentHandler parsingHandler = handler; StringWriter writer = null; BaseMarkupSerializer serializer = null; if (extractOnly == true) { String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml"); writer = new StringWriter(); if (extractFormat.equals(TEXT_FORMAT)) { serializer = new TextSerializer(); serializer.setOutputCharStream(writer); serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true)); } else { serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true)); } if (xpathExpr != null) { Matcher matcher = PARSER.parse(xpathExpr); serializer .startDocument(); // The MatchingContentHandler does not invoke startDocument. See // http://tika.markmail.org/message/kknu3hw7argwiqin parsingHandler = new MatchingContentHandler(serializer, matcher); } else { parsingHandler = serializer; } } else if (xpathExpr != null) { Matcher matcher = PARSER.parse(xpathExpr); parsingHandler = new MatchingContentHandler(handler, matcher); } // else leave it as is try { // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler // for getting the document. ParseContext context = parseContextConfig.create(); context.set(Parser.class, parser); context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE); // Password handling RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider(); String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE); if (pwMapFile != null && pwMapFile.length() > 0) { InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile); if (is != null) { log.debug("Password file supplied: " + pwMapFile); epp.parse(is); } } context.set(PasswordProvider.class, epp); String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD); if (resourcePassword != null) { epp.setExplicitPassword(resourcePassword); log.debug("Literal password supplied for file " + resourceName); } parser.parse(inputStream, parsingHandler, metadata, context); } catch (TikaException e) { if (ignoreTikaException) log.warn( new StringBuilder("skip extracting text due to ") .append(e.getLocalizedMessage()) .append(". metadata=") .append(metadata.toString()) .toString()); else throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } if (extractOnly == false) { addDoc(handler); } else { // serializer is not null, so we need to call endDoc on it if using xpath if (xpathExpr != null) { serializer.endDocument(); } rsp.add(stream.getName(), writer.toString()); writer.close(); String[] names = metadata.names(); NamedList metadataNL = new NamedList(); for (int i = 0; i < names.length; i++) { String[] vals = metadata.getValues(names[i]); metadataNL.add(names[i], vals); } rsp.add(stream.getName() + "_metadata", metadataNL); } } catch (SAXException e) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } finally { IOUtils.closeQuietly(inputStream); } } else { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers. Please supply the " + ExtractingParams.STREAM_TYPE + " parameter."); } }