/**
 * Checks that a {@code ContentStreamBase.StringStream} built from a fixed string reports the
 * string's length as its size, and hands back the same text through both its byte stream (decoded
 * as UTF-8) and its reader.
 *
 * @throws IOException if reading from the content stream fails
 */
public void testStringStream() throws IOException {
  final String text = "aads ghaskdgasgldj asl sadg ajdsg &jag # @ hjsakg hsakdg hjkas s";
  final ContentStreamBase content = new ContentStreamBase.StringStream(text);

  // Size check first, exactly as many units as the source string has chars.
  final int reportedSize = content.getSize().intValue();
  assertEquals(text.length(), reportedSize);

  // Round-trip through the byte-oriented view.
  final String viaBytes = IOUtils.toString(content.getStream(), "UTF-8");
  assertEquals(text, viaBytes);

  // Round-trip through the character-oriented view.
  final String viaReader = IOUtils.toString(content.getReader());
  assertEquals(text, viaReader);
}
/**
 * Wraps a string in a single-element collection of {@link ContentStream}s.
 *
 * <p>The string becomes one {@code ContentStreamBase.StringStream} carrying the given content
 * type. (The original author noted this belongs in a helper class, since clients can use it too.)
 *
 * @param str the text to expose as a content stream
 * @param contentType the content type to set on the created stream
 * @return a new mutable collection holding exactly one stream
 */
public static Collection<ContentStream> toContentStreams(
    final String str, final String contentType) {
  final ContentStreamBase single = new ContentStreamBase.StringStream(str);
  single.setContentType(contentType);

  final Collection<ContentStream> result = new ArrayList<>();
  result.add(single);
  return result;
}
/**
 * Wraps a string in a single-element collection of {@link ContentStream}s.
 *
 * @param str the text to expose as a content stream; may be {@code null}
 * @param contentType the content type to set on the created stream
 * @return a new one-element collection, or {@code null} when {@code str} is {@code null}
 */
public static Collection<ContentStream> toContentStreams(
    final String str, final String contentType) {
  if (str == null) {
    return null;
  }

  final ContentStreamBase single = new ContentStreamBase.StringStream(str);
  single.setContentType(contentType);

  // Sized for the one element it will ever hold.
  final Collection<ContentStream> result = new ArrayList<ContentStream>(1);
  result.add(single);
  return result;
}
/**
 * Copies the {@code solrj/README} resource into a file under {@code TEMP_DIR} and checks that a
 * {@code ContentStreamBase.FileStream} over that file reports the file's length as its size and
 * yields the file's content through both its byte stream and its reader.
 *
 * <p>Every stream and reader opened here is closed in a {@code finally} block so a failing
 * assertion or I/O error does not leave file handles open.
 *
 * @throws IOException if the resource cannot be copied or a stream cannot be read
 */
public void testFileStream() throws IOException {
  File file = new File(TEMP_DIR, "README");

  // Step 1: materialise the resource on disk.
  InputStream is = new SolrResourceLoader(null, null).openResource("solrj/README");
  try {
    assertNotNull(is);
    FileOutputStream os = new FileOutputStream(file);
    try {
      IOUtils.copy(is, os);
      // Close explicitly on the success path so a failing flush/close is reported.
      os.close();
    } finally {
      // No-op after the successful close above; releases the handle if the copy failed.
      IOUtils.closeQuietly(os);
    }
  } finally {
    IOUtils.closeQuietly(is);
  }

  ContentStreamBase stream = new ContentStreamBase.FileStream(file);
  assertEquals(file.length(), stream.getSize().intValue());

  // Step 2: compare the byte view against the file itself.
  InputStream expectedBytes = new FileInputStream(file);
  InputStream actualBytes = null;
  try {
    actualBytes = stream.getStream();
    assertTrue(IOUtils.contentEquals(expectedBytes, actualBytes));
  } finally {
    IOUtils.closeQuietly(actualBytes);
    IOUtils.closeQuietly(expectedBytes);
  }

  // Step 3: compare the character view against the file itself.
  // NOTE(review): FileReader decodes with the platform default charset; confirm that matches
  // what the stream's reader uses if this file ever contains non-ASCII text.
  FileReader expectedChars = new FileReader(file);
  java.io.Reader actualChars = null;
  try {
    actualChars = stream.getReader();
    assertTrue(IOUtils.contentEquals(expectedChars, actualChars));
  } finally {
    IOUtils.closeQuietly(actualChars);
    IOUtils.closeQuietly(expectedChars);
  }
}
public void testURLStream() throws IOException { byte[] content = null; String contentType = null; URL url = new URL("http://svn.apache.org/repos/asf/lucene/dev/trunk/"); InputStream in = null; try { URLConnection conn = url.openConnection(); in = conn.getInputStream(); contentType = conn.getContentType(); content = IOUtils.toByteArray(in); assumeTrue("not enough content for test to be useful", content.length > 10); } catch (IOException ex) { assumeNoException("Unable to connect to " + url + " to run the test.", ex); } finally { if (in != null) { IOUtils.closeQuietly(in); } } ContentStreamBase stream = new ContentStreamBase.URLStream(url); assertEquals(content.length, stream.getSize().intValue()); // Test the stream in = stream.getStream(); try { assertTrue(IOUtils.contentEquals(new ByteArrayInputStream(content), in)); } finally { IOUtils.closeQuietly(in); } String charset = ContentStreamBase.getCharsetFromContentType(contentType); if (charset == null) charset = ContentStreamBase.DEFAULT_CHARSET; // Re-open the stream and this time use a reader stream = new ContentStreamBase.URLStream(url); assertTrue( IOUtils.contentEquals(new StringReader(new String(content, charset)), stream.getReader())); }
@Override public void load( SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception { Parser parser = null; String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null); if (streamType != null) { // Cache? Parsers are lightweight to construct and thread-safe, so I'm told MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT)); parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt); } else { parser = autoDetectParser; } if (parser != null) { Metadata metadata = new Metadata(); // If you specify the resource name (the filename, roughly) with this parameter, // then Tika can make use of it in guessing the appropriate MIME type: String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null); if (resourceName != null) { metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName); } // Provide stream's content type as hint for auto detection if (stream.getContentType() != null) { metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType()); } InputStream inputStream = null; try { inputStream = stream.getStream(); metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName()); metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo()); metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize())); metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType()); // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType()); if (charset != null) { metadata.add(HttpHeaders.CONTENT_ENCODING, charset); } String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema()); ContentHandler 
parsingHandler = handler; StringWriter writer = null; BaseMarkupSerializer serializer = null; if (extractOnly == true) { String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml"); writer = new StringWriter(); if (extractFormat.equals(TEXT_FORMAT)) { serializer = new TextSerializer(); serializer.setOutputCharStream(writer); serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true)); } else { serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true)); } if (xpathExpr != null) { Matcher matcher = PARSER.parse(xpathExpr); serializer .startDocument(); // The MatchingContentHandler does not invoke startDocument. See // http://tika.markmail.org/message/kknu3hw7argwiqin parsingHandler = new MatchingContentHandler(serializer, matcher); } else { parsingHandler = serializer; } } else if (xpathExpr != null) { Matcher matcher = PARSER.parse(xpathExpr); parsingHandler = new MatchingContentHandler(handler, matcher); } // else leave it as is try { // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler // for getting the document. 
ParseContext context = parseContextConfig.create(); context.set(Parser.class, parser); context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE); // Password handling RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider(); String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE); if (pwMapFile != null && pwMapFile.length() > 0) { InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile); if (is != null) { log.debug("Password file supplied: " + pwMapFile); epp.parse(is); } } context.set(PasswordProvider.class, epp); String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD); if (resourcePassword != null) { epp.setExplicitPassword(resourcePassword); log.debug("Literal password supplied for file " + resourceName); } parser.parse(inputStream, parsingHandler, metadata, context); } catch (TikaException e) { if (ignoreTikaException) log.warn( new StringBuilder("skip extracting text due to ") .append(e.getLocalizedMessage()) .append(". metadata=") .append(metadata.toString()) .toString()); else throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } if (extractOnly == false) { addDoc(handler); } else { // serializer is not null, so we need to call endDoc on it if using xpath if (xpathExpr != null) { serializer.endDocument(); } rsp.add(stream.getName(), writer.toString()); writer.close(); String[] names = metadata.names(); NamedList metadataNL = new NamedList(); for (int i = 0; i < names.length; i++) { String[] vals = metadata.getValues(names[i]); metadataNL.add(names[i], vals); } rsp.add(stream.getName() + "_metadata", metadataNL); } } catch (SAXException e) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } finally { IOUtils.closeQuietly(inputStream); } } else { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers. Please supply the " + ExtractingParams.STREAM_TYPE + " parameter."); } }
@Override public void load( SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception { final String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType()); InputStream is = null; XMLStreamReader parser = null; String tr = req.getParams().get(CommonParams.TR, null); if (tr != null) { final Transformer t = getTransformer(tr, req); final DOMResult result = new DOMResult(); // first step: read XML and build DOM using Transformer (this is no overhead, as XSL always // produces // an internal result DOM tree, we just access it directly as input for StAX): try { is = stream.getStream(); final InputSource isrc = new InputSource(is); isrc.setEncoding(charset); final XMLReader xmlr = saxFactory.newSAXParser().getXMLReader(); xmlr.setErrorHandler(xmllog); xmlr.setEntityResolver(EmptyEntityResolver.SAX_INSTANCE); final SAXSource source = new SAXSource(xmlr, isrc); t.transform(source, result); } catch (TransformerException te) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, te.getMessage(), te); } finally { IOUtils.closeQuietly(is); } // second step: feed the intermediate DOM tree into StAX parser: try { parser = inputFactory.createXMLStreamReader(new DOMSource(result.getNode())); this.processUpdate(req, processor, parser); } catch (XMLStreamException e) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e); } finally { if (parser != null) parser.close(); } } // Normal XML Loader else { try { is = stream.getStream(); if (log.isTraceEnabled()) { final byte[] body = IOUtils.toByteArray(is); // TODO: The charset may be wrong, as the real charset is later // determined by the XML parser, the content-type is only used as a hint! log.trace( "body", new String(body, (charset == null) ? ContentStreamBase.DEFAULT_CHARSET : charset)); IOUtils.closeQuietly(is); is = new ByteArrayInputStream(body); } parser = (charset == null) ? 
inputFactory.createXMLStreamReader(is) : inputFactory.createXMLStreamReader(is, charset); this.processUpdate(req, processor, parser); } catch (XMLStreamException e) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e); } finally { if (parser != null) parser.close(); IOUtils.closeQuietly(is); } } }