예제 #1
0
 private Record toRecord(SolrInputDocument doc) {
   Record record = new Record();
   for (Entry<String, SolrInputField> entry : doc.entrySet()) {
     record.getFields().putAll(entry.getKey(), entry.getValue().getValues());
   }
   return record;
 }
예제 #2
0
 private boolean hasAtLeastOneMimeType(Record record) {
   if (!record.getFields().containsKey(Fields.ATTACHMENT_MIME_TYPE)) {
     LOG.debug("Command failed because of missing MIME type for record: {}", record);
     return false;
   }
   return true;
 }
    @Override
    protected boolean doProcess(Record record) {
      Timer.Context timerContext = elapsedTime.time();

      try {
        XContentBuilder documentBuilder = jsonBuilder().startObject();
        Map<String, Collection<Object>> map = record.getFields().asMap();
        for (Map.Entry<String, Collection<Object>> entry : map.entrySet()) {
          String key = entry.getKey();
          Iterator<Object> iterator = entry.getValue().iterator();
          while (iterator.hasNext()) {
            documentBuilder.field(key, iterator.next());
          }
        }
        documentBuilder.endObject();
        loader.addDocument(documentBuilder.bytes(), indexName, indexType, ttl);
      } catch (Exception e) {
        throw new MorphlineRuntimeException(e);
      } finally {
        timerContext.stop();
      }

      // pass record to next command in chain:
      return super.doProcess(record);
    }
예제 #4
0
    @Override
    protected boolean doProcess(Record record) {
      if (preserveExisting
          && record
              .getFields()
              .containsKey(
                  Fields.ATTACHMENT_MIME_TYPE)) {; // we must preserve the existing MIME type
      } else {
        List attachments = record.get(Fields.ATTACHMENT_BODY);
        if (attachments.size() > 0) {
          Object attachment = attachments.get(0);
          Preconditions.checkNotNull(attachment);
          InputStream stream;
          if (attachment instanceof byte[]) {
            stream = new ByteArrayInputStream((byte[]) attachment);
          } else {
            stream = (InputStream) attachment;
          }

          Metadata metadata = new Metadata();

          // If you specify the resource name (the filename, roughly) with this
          // parameter, then Tika can use it in guessing the right MIME type
          String resourceName = (String) record.getFirstValue(Fields.ATTACHMENT_NAME);
          if (resourceName != null) {
            metadata.add(Metadata.RESOURCE_NAME_KEY, resourceName);
          }

          // Provide stream's charset as hint to Tika for better auto detection
          String charset = (String) record.getFirstValue(Fields.ATTACHMENT_CHARSET);
          if (charset != null) {
            metadata.add(Metadata.CONTENT_ENCODING, charset);
          }

          if (includeMetaData) {
            for (Entry<String, Object> entry : record.getFields().entries()) {
              metadata.add(entry.getKey(), entry.getValue().toString());
            }
          }

          String mimeType = getMediaType(stream, metadata, excludeParameters);
          record.replaceValues(Fields.ATTACHMENT_MIME_TYPE, mimeType);
        }
      }
      return super.doProcess(record);
    }
예제 #5
0
    @Override
    protected boolean doProcess(Record record, InputStream inputStream) {
      Parser parser = detectParser(record);
      if (parser == null) {
        return false;
      }

      ParseContext parseContext = new ParseContext();
      parseContext.set(Locale.class, locale);

      Metadata metadata = new Metadata();
      for (Entry<String, Object> entry : record.getFields().entries()) {
        metadata.add(entry.getKey(), entry.getValue().toString());
      }

      SolrContentHandler handler =
          solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
      try {
        inputStream = TikaInputStream.get(inputStream);

        ContentHandler parsingHandler = handler;

        // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()";
        if (xpathExpr != null) {
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(parsingHandler, matcher);
        }

        try {
          parser.parse(inputStream, parsingHandler, metadata, parseContext);
        } catch (IOException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (SAXException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (TikaException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        }
      } finally {
        if (inputStream != null) {
          Closeables.closeQuietly(inputStream);
        }
      }

      SolrInputDocument doc = handler.newDocument();
      LOG.debug("solr doc: {}", doc);
      Record outputRecord = toRecord(doc);
      return getChild().process(outputRecord);
    }