Esempio n. 1
0
    @Override
    protected boolean doProcess(Record record, InputStream inputStream) {
      Parser parser = detectParser(record);
      if (parser == null) {
        return false;
      }

      ParseContext parseContext = new ParseContext();
      parseContext.set(Locale.class, locale);

      Metadata metadata = new Metadata();
      for (Entry<String, Object> entry : record.getFields().entries()) {
        metadata.add(entry.getKey(), entry.getValue().toString());
      }

      SolrContentHandler handler =
          solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
      try {
        inputStream = TikaInputStream.get(inputStream);

        ContentHandler parsingHandler = handler;

        // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()";
        if (xpathExpr != null) {
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(parsingHandler, matcher);
        }

        try {
          parser.parse(inputStream, parsingHandler, metadata, parseContext);
        } catch (IOException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (SAXException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (TikaException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        }
      } finally {
        if (inputStream != null) {
          Closeables.closeQuietly(inputStream);
        }
      }

      SolrInputDocument doc = handler.newDocument();
      LOG.debug("solr doc: {}", doc);
      Record outputRecord = toRecord(doc);
      return getChild().process(outputRecord);
    }
  @Override
  public void load(
      SolrQueryRequest req,
      SolrQueryResponse rsp,
      ContentStream stream,
      UpdateRequestProcessor processor)
      throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
      // Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
      MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
      parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
      parser = autoDetectParser;
    }
    if (parser != null) {
      Metadata metadata = new Metadata();

      // If you specify the resource name (the filename, roughly) with this parameter,
      // then Tika can make use of it in guessing the appropriate MIME type:
      String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
      if (resourceName != null) {
        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
      }
      // Provide stream's content type as hint for auto detection
      if (stream.getContentType() != null) {
        metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
      }

      InputStream inputStream = null;
      try {
        inputStream = stream.getStream();
        metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
        metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
        metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
        metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
        // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
        String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
        if (charset != null) {
          metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
        }

        String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
        boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
        SolrContentHandler handler =
            factory.createSolrContentHandler(metadata, params, req.getSchema());
        ContentHandler parsingHandler = handler;

        StringWriter writer = null;
        BaseMarkupSerializer serializer = null;
        if (extractOnly == true) {
          String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
          writer = new StringWriter();
          if (extractFormat.equals(TEXT_FORMAT)) {
            serializer = new TextSerializer();
            serializer.setOutputCharStream(writer);
            serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
          } else {
            serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
          }
          if (xpathExpr != null) {
            Matcher matcher = PARSER.parse(xpathExpr);
            serializer
                .startDocument(); // The MatchingContentHandler does not invoke startDocument.  See
                                  // http://tika.markmail.org/message/kknu3hw7argwiqin
            parsingHandler = new MatchingContentHandler(serializer, matcher);
          } else {
            parsingHandler = serializer;
          }
        } else if (xpathExpr != null) {
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(handler, matcher);
        } // else leave it as is

        try {
          // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler
          // for getting the document.
          ParseContext context = parseContextConfig.create();

          context.set(Parser.class, parser);
          context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

          // Password handling
          RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
          String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
          if (pwMapFile != null && pwMapFile.length() > 0) {
            InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
            if (is != null) {
              log.debug("Password file supplied: " + pwMapFile);
              epp.parse(is);
            }
          }
          context.set(PasswordProvider.class, epp);
          String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
          if (resourcePassword != null) {
            epp.setExplicitPassword(resourcePassword);
            log.debug("Literal password supplied for file " + resourceName);
          }
          parser.parse(inputStream, parsingHandler, metadata, context);
        } catch (TikaException e) {
          if (ignoreTikaException)
            log.warn(
                new StringBuilder("skip extracting text due to ")
                    .append(e.getLocalizedMessage())
                    .append(". metadata=")
                    .append(metadata.toString())
                    .toString());
          else throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
        if (extractOnly == false) {
          addDoc(handler);
        } else {
          // serializer is not null, so we need to call endDoc on it if using xpath
          if (xpathExpr != null) {
            serializer.endDocument();
          }
          rsp.add(stream.getName(), writer.toString());
          writer.close();
          String[] names = metadata.names();
          NamedList metadataNL = new NamedList();
          for (int i = 0; i < names.length; i++) {
            String[] vals = metadata.getValues(names[i]);
            metadataNL.add(names[i], vals);
          }
          rsp.add(stream.getName() + "_metadata", metadataNL);
        }
      } catch (SAXException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      } finally {
        IOUtils.closeQuietly(inputStream);
      }
    } else {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Stream type of "
              + streamType
              + " didn't match any known parsers.  Please supply the "
              + ExtractingParams.STREAM_TYPE
              + " parameter.");
    }
  }