예제 #1
0
 /**
  * Checks that a {@code ContentStreamBase.StringStream} built from a fixed string reports that
  * string's length as its size and hands back the same text through both its byte stream
  * (decoded as UTF-8) and its reader.
  */
 public void testStringStream() throws IOException {
   final String expected = "aads ghaskdgasgldj asl sadg ajdsg &jag # @ hjsakg hsakdg hjkas s";
   final ContentStreamBase underTest = new ContentStreamBase.StringStream(expected);

   // The reported size is compared against the character count of the input.
   final int expectedSize = expected.length();
   final int reportedSize = underTest.getSize().intValue();
   assertEquals(expectedSize, reportedSize);

   // Byte view of the stream, decoded as UTF-8.
   final String textFromBytes = IOUtils.toString(underTest.getStream(), "UTF-8");
   assertEquals(expected, textFromBytes);

   // Character view of the stream.
   final String textFromReader = IOUtils.toString(underTest.getReader());
   assertEquals(expected, textFromReader);
 }
예제 #2
0
 /**
  * Wraps a single string in a one-element collection of {@link ContentStream}.
  *
  * <p>This should be moved to a helper class. (it is useful for the client too!)
  *
  * @param str text handed to the {@code ContentStreamBase.StringStream} constructor
  * @param contentType content type set on the created stream
  * @return a newly created list holding exactly the one stream built from {@code str}
  */
 public static Collection<ContentStream> toContentStreams(
     final String str, final String contentType) {
   // Build and configure the one stream first, then box it into the result list.
   final ContentStreamBase single = new ContentStreamBase.StringStream(str);
   single.setContentType(contentType);

   final ArrayList<ContentStream> result = new ArrayList<>();
   result.add(single);
   return result;
 }
예제 #3
0
  /**
   * Wraps a string in a single-element collection of {@link ContentStream}.
   *
   * @param str text handed to the {@code ContentStreamBase.StringStream} constructor; may be
   *     {@code null}
   * @param contentType content type set on the created stream
   * @return {@code null} when {@code str} is {@code null}; otherwise a newly created list holding
   *     the one stream built from {@code str}
   */
  public static Collection<ContentStream> toContentStreams(
      final String str, final String contentType) {
    // No input means no streams: callers get null back, not an empty collection.
    if (str == null) {
      return null;
    }

    final ContentStreamBase single = new ContentStreamBase.StringStream(str);
    single.setContentType(contentType);

    final ArrayList<ContentStream> result = new ArrayList<ContentStream>(1);
    result.add(single);
    return result;
  }
예제 #4
0
  /**
   * Checks that a {@code ContentStreamBase.FileStream} reports the file's length as its size and
   * yields the same bytes and characters as reading the file directly.
   *
   * <p>The "solrj/README" resource is first copied into {@code TEMP_DIR} so there is a real file on
   * disk to wrap. Every stream and reader opened here is closed in a {@code finally} block so a
   * failing assertion does not leave file handles open.
   *
   * @throws IOException if the resource cannot be opened or the temporary file cannot be written
   *     or read
   */
  public void testFileStream() throws IOException {
    InputStream is = new SolrResourceLoader(null, null).openResource("solrj/README");
    assertNotNull(is);
    File file = new File(TEMP_DIR, "README");
    FileOutputStream os = null;
    try {
      os = new FileOutputStream(file);
      IOUtils.copy(is, os);
      // Close explicitly on the success path so a failing close is reported, not swallowed.
      os.close();
    } finally {
      // Safety net for the failure path; closing an already-closed stream is harmless.
      IOUtils.closeQuietly(os);
      IOUtils.closeQuietly(is);
    }

    ContentStreamBase stream = new ContentStreamBase.FileStream(file);
    assertEquals(file.length(), stream.getSize().intValue());

    // Byte-level comparison.
    InputStream expectedBytes = new FileInputStream(file);
    InputStream actualBytes = null;
    try {
      actualBytes = stream.getStream();
      assertTrue(IOUtils.contentEquals(expectedBytes, actualBytes));
    } finally {
      IOUtils.closeQuietly(actualBytes);
      IOUtils.closeQuietly(expectedBytes);
    }

    // Character-level comparison.
    // NOTE(review): FileReader decodes with the platform default charset; this presumably only
    // matches stream.getReader() because the README is plain ASCII -- confirm.
    FileReader expectedChars = new FileReader(file);
    // Fully qualified so this method does not depend on an import that may not be present.
    java.io.Reader actualChars = null;
    try {
      actualChars = stream.getReader();
      assertTrue(IOUtils.contentEquals(expectedChars, actualChars));
    } finally {
      IOUtils.closeQuietly(actualChars);
      IOUtils.closeQuietly(expectedChars);
    }
  }
예제 #5
0
  /**
   * Checks that a {@code ContentStreamBase.URLStream} reports the same size and delivers the same
   * bytes and characters as fetching the URL directly.
   *
   * <p>The URL is first fetched with a plain {@link URLConnection} to obtain the reference content
   * and content type. The connection failure and the "too little content" case go through
   * {@code assumeNoException} / {@code assumeTrue} rather than assertions. The URL is then read
   * twice through {@code URLStream}: once as bytes and once as characters, decoding the reference
   * bytes with the charset from the content type or {@code ContentStreamBase.DEFAULT_CHARSET} when
   * none is given.
   *
   * @throws IOException if reading through the {@code URLStream} fails
   */
  public void testURLStream() throws IOException {
    byte[] content = null;
    String contentType = null;
    URL url = new URL("http://svn.apache.org/repos/asf/lucene/dev/trunk/");
    InputStream in = null;
    try {
      URLConnection conn = url.openConnection();
      in = conn.getInputStream();
      contentType = conn.getContentType();
      content = IOUtils.toByteArray(in);

      assumeTrue("not enough content for test to be useful", content.length > 10);

    } catch (IOException ex) {
      assumeNoException("Unable to connect to " + url + " to run the test.", ex);
    } finally {
      // closeQuietly tolerates null, so no explicit null check is needed.
      IOUtils.closeQuietly(in);
    }

    ContentStreamBase stream = new ContentStreamBase.URLStream(url);
    assertEquals(content.length, stream.getSize().intValue());

    // Test the stream
    in = stream.getStream();
    try {
      assertTrue(IOUtils.contentEquals(new ByteArrayInputStream(content), in));
    } finally {
      IOUtils.closeQuietly(in);
    }

    String charset = ContentStreamBase.getCharsetFromContentType(contentType);
    if (charset == null) charset = ContentStreamBase.DEFAULT_CHARSET;
    // Decode the reference bytes before re-opening the URL, so a bad charset fails early.
    StringReader expectedChars = new StringReader(new String(content, charset));

    // Re-open the stream and this time use a reader
    stream = new ContentStreamBase.URLStream(url);
    // Fully qualified so this method does not depend on an import that may not be present.
    java.io.Reader actualChars = null;
    try {
      actualChars = stream.getReader();
      assertTrue(IOUtils.contentEquals(expectedChars, actualChars));
    } finally {
      // The reader is backed by the URL stream; close it even when the assertion fails.
      IOUtils.closeQuietly(actualChars);
    }
  }
  /**
   * Runs {@code stream} through a Tika {@link Parser} and either hands the resulting handler to
   * {@code addDoc} or, in extract-only mode, writes the serialized output and the collected
   * metadata into {@code rsp}.
   *
   * <p>Parser selection: when the {@code ExtractingParams.STREAM_TYPE} request parameter is set, it
   * is trimmed, lower-cased and parsed as a {@code MediaType}, and the parser registered for that
   * type is looked up; otherwise {@code autoDetectParser} is used.
   *
   * <p>Before parsing, the metadata is filled with the optional resource name, the stream's content
   * type, name, source info and size, and the charset taken from the content type when there is
   * one.
   *
   * <p>NOTE(review): the parser selection and resource name are read from {@code req.getParams()},
   * while everything else is read from a {@code params} member that is not declared in this method
   * -- presumably the same request parameters; confirm.
   *
   * @param req request supplying the parameters, the schema and the core's resource loader
   * @param rsp response that receives the extracted content and metadata in extract-only mode
   * @param stream content to parse; its input stream is closed before this method returns
   * @param processor not referenced in this method body
   * @throws SolrException with {@code BAD_REQUEST} when no parser is found for the stream type;
   *     with {@code SERVER_ERROR} on a {@code SAXException}, or on a {@code TikaException} when
   *     {@code ignoreTikaException} is false
   * @throws Exception any other failure from the calls made here propagates unchanged
   */
  @Override
  public void load(
      SolrQueryRequest req,
      SolrQueryResponse rsp,
      ContentStream stream,
      UpdateRequestProcessor processor)
      throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
      // Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
      MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
      // The map lookup leaves parser null for an unregistered type; handled by the else at the end.
      parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
      parser = autoDetectParser;
    }
    if (parser != null) {
      Metadata metadata = new Metadata();

      // If you specify the resource name (the filename, roughly) with this parameter,
      // then Tika can make use of it in guessing the appropriate MIME type:
      String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
      if (resourceName != null) {
        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
      }
      // Provide stream's content type as hint for auto detection
      if (stream.getContentType() != null) {
        metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
      }

      InputStream inputStream = null;
      try {
        inputStream = stream.getStream();
        // Record where the content came from alongside whatever the parser extracts.
        metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
        metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
        metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
        metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
        // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
        String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
        if (charset != null) {
          metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
        }

        String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
        boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
        SolrContentHandler handler =
            factory.createSolrContentHandler(metadata, params, req.getSchema());
        // Default: parse straight into the Solr handler; replaced below for extract-only / XPath.
        ContentHandler parsingHandler = handler;

        // writer and serializer are only assigned in extract-only mode.
        StringWriter writer = null;
        BaseMarkupSerializer serializer = null;
        if (extractOnly == true) {
          String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
          writer = new StringWriter();
          // Any format other than TEXT_FORMAT falls through to XML serialization.
          if (extractFormat.equals(TEXT_FORMAT)) {
            serializer = new TextSerializer();
            serializer.setOutputCharStream(writer);
            serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
          } else {
            serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
          }
          if (xpathExpr != null) {
            Matcher matcher = PARSER.parse(xpathExpr);
            serializer
                .startDocument(); // The MatchingContentHandler does not invoke startDocument.  See
                                  // http://tika.markmail.org/message/kknu3hw7argwiqin
            parsingHandler = new MatchingContentHandler(serializer, matcher);
          } else {
            parsingHandler = serializer;
          }
        } else if (xpathExpr != null) {
          // Not extract-only: filter the events reaching the Solr handler through the XPath matcher.
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(handler, matcher);
        } // else leave it as is

        try {
          // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler
          // for getting the document.
          ParseContext context = parseContextConfig.create();

          context.set(Parser.class, parser);
          context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

          // Password handling
          RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
          String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
          if (pwMapFile != null && pwMapFile.length() > 0) {
            InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
            if (is != null) {
              log.debug("Password file supplied: " + pwMapFile);
              // NOTE(review): `is` is not closed in this method; presumably parse() closes it --
              // confirm.
              epp.parse(is);
            }
          }
          context.set(PasswordProvider.class, epp);
          String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
          if (resourcePassword != null) {
            epp.setExplicitPassword(resourcePassword);
            log.debug("Literal password supplied for file " + resourceName);
          }
          parser.parse(inputStream, parsingHandler, metadata, context);
        } catch (TikaException e) {
          // When ignored, execution continues below with whatever was collected before the failure.
          if (ignoreTikaException)
            log.warn(
                new StringBuilder("skip extracting text due to ")
                    .append(e.getLocalizedMessage())
                    .append(". metadata=")
                    .append(metadata.toString())
                    .toString());
          else throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
        if (extractOnly == false) {
          addDoc(handler);
        } else {
          // serializer is not null, so we need to call endDoc on it if using xpath
          if (xpathExpr != null) {
            serializer.endDocument();
          }
          // Extracted content goes under the stream name, metadata under "<name>_metadata".
          rsp.add(stream.getName(), writer.toString());
          writer.close();
          String[] names = metadata.names();
          NamedList metadataNL = new NamedList();
          for (int i = 0; i < names.length; i++) {
            String[] vals = metadata.getValues(names[i]);
            metadataNL.add(names[i], vals);
          }
          rsp.add(stream.getName() + "_metadata", metadataNL);
        }
      } catch (SAXException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      } finally {
        IOUtils.closeQuietly(inputStream);
      }
    } else {
      // Reached when an explicit stream type had no registered parser, or autoDetectParser is null.
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Stream type of "
              + streamType
              + " didn't match any known parsers.  Please supply the "
              + ExtractingParams.STREAM_TYPE
              + " parameter.");
    }
  }
예제 #7
0
  /**
   * Parses an XML update from {@code stream} and feeds it to {@code processUpdate}.
   *
   * <p>When the {@code CommonParams.TR} request parameter is set, the input is first run through
   * the transformer returned by {@code getTransformer}, and the resulting DOM tree is read with a
   * StAX parser. Otherwise the raw stream is read directly with a StAX parser, using the charset
   * from the stream's content type when one is present.
   *
   * <p>With trace logging enabled, the untransformed body is buffered fully in memory, logged, and
   * then parsed from the buffer.
   *
   * @param req request supplying the parameters
   * @param rsp not referenced in this method body
   * @param stream XML content; its input stream is closed before this method returns
   * @param processor passed through to {@code processUpdate}
   * @throws SolrException with {@code BAD_REQUEST} on a {@code TransformerException} or an
   *     {@code XMLStreamException} raised while reading or processing
   * @throws Exception any other failure from the calls made here propagates unchanged
   */
  @Override
  public void load(
      SolrQueryRequest req,
      SolrQueryResponse rsp,
      ContentStream stream,
      UpdateRequestProcessor processor)
      throws Exception {
    final String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());

    InputStream is = null;
    XMLStreamReader parser = null;

    String tr = req.getParams().get(CommonParams.TR, null);
    if (tr != null) {
      final Transformer t = getTransformer(tr, req);
      final DOMResult result = new DOMResult();

      // first step: read XML and build DOM using Transformer (this is no overhead, as XSL always
      // produces an internal result DOM tree, we just access it directly as input for StAX):
      try {
        is = stream.getStream();
        final InputSource isrc = new InputSource(is);
        isrc.setEncoding(charset);
        final XMLReader xmlr = saxFactory.newSAXParser().getXMLReader();
        xmlr.setErrorHandler(xmllog);
        xmlr.setEntityResolver(EmptyEntityResolver.SAX_INSTANCE);
        final SAXSource source = new SAXSource(xmlr, isrc);
        t.transform(source, result);
      } catch (TransformerException te) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, te.getMessage(), te);
      } finally {
        IOUtils.closeQuietly(is);
      }
      // second step: feed the intermediate DOM tree into StAX parser:
      try {
        parser = inputFactory.createXMLStreamReader(new DOMSource(result.getNode()));
        this.processUpdate(req, processor, parser);
      } catch (XMLStreamException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e);
      } finally {
        if (parser != null) parser.close();
      }
    } else {
      // Normal XML Loader
      try {
        is = stream.getStream();
        if (log.isTraceEnabled()) {
          final byte[] body = IOUtils.toByteArray(is);
          // TODO: The charset may be wrong, as the real charset is later
          // determined by the XML parser, the content-type is only used as a hint!
          // The "{}" placeholder is required: without it the argument is never rendered.
          log.trace(
              "body: {}",
              new String(body, (charset == null) ? ContentStreamBase.DEFAULT_CHARSET : charset));
          // The original stream is consumed; continue parsing from the in-memory copy.
          IOUtils.closeQuietly(is);
          is = new ByteArrayInputStream(body);
        }
        parser =
            (charset == null)
                ? inputFactory.createXMLStreamReader(is)
                : inputFactory.createXMLStreamReader(is, charset);
        this.processUpdate(req, processor, parser);
      } catch (XMLStreamException e) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e);
      } finally {
        // parser.close() can itself throw; make sure the input stream is released regardless.
        try {
          if (parser != null) parser.close();
        } finally {
          IOUtils.closeQuietly(is);
        }
      }
    }
  }