Esempio n. 1
0
 /**
  * Builds a new record from a Solr input document by copying, for each entry of the document,
  * all values of that field into the record under the same field name.
  *
  * @param doc the Solr document to convert
  * @return a freshly created record holding the document's field values
  */
 private Record toRecord(SolrInputDocument doc) {
   Record result = new Record();
   for (Entry<String, SolrInputField> solrEntry : doc.entrySet()) {
     String fieldName = solrEntry.getKey();
     SolrInputField solrField = solrEntry.getValue();
     result.getFields().putAll(fieldName, solrField.getValues());
   }
   return result;
 }
 /**
  * Writes {@code numRecords} Text/Text pairs ({@code "key" + i} / {@code "value" + i}) into an
  * uncompressed SequenceFile at {@code file} and returns a mapping of expected keys -> records.
  *
  * @param file destination of the generated SequenceFile
  * @param numRecords number of key/value pairs to append
  * @return map from each key string to a record carrying that pair under "key" and "value"
  * @throws IOException if the file cannot be opened or a pair cannot be appended
  */
 private HashMap<String, Record> createTextSequenceFile(File file, int numRecords)
     throws IOException {
   HashMap<String, Record> map = new HashMap<String, Record>();
   SequenceFile.Metadata metadata = new SequenceFile.Metadata(getMetadataForSequenceFile());
   FSDataOutputStream out = new FSDataOutputStream(new FileOutputStream(file), null);
   SequenceFile.Writer writer = null;
   try {
     writer =
         SequenceFile.createWriter(
             new Configuration(),
             out,
             Text.class,
             Text.class,
             SequenceFile.CompressionType.NONE,
             null,
             metadata);
     for (int i = 0; i < numRecords; ++i) {
       Text key = new Text("key" + i);
       Text value = new Text("value" + i);
       writer.append(key, value);
       Record record = new Record();
       record.put("key", key);
       record.put("value", value);
       map.put(key.toString(), record);
     }
   } finally {
     // Close the writer first so buffered data reaches the stream.
     Closeables.closeQuietly(writer);
     // The stream was opened here, so release it here as well instead of relying on the
     // writer to close it; this also covers the case where createWriter itself failed.
     Closeables.closeQuietly(out);
   }
   return map;
 }
 /**
  * Compares two records on a selected subset of fields.
  *
  * @param record1 first record
  * @param record2 second record
  * @param fieldsToCheck names of the fields whose values must match
  * @return {@code true} when, for every listed field, {@code record1.get(field)} equals
  *     {@code record2.get(field)}; {@code false} at the first mismatch
  */
 private boolean areRecordFieldsEqual(Record record1, Record record2, List<String> fieldsToCheck) {
   for (String name : fieldsToCheck) {
     Object left = record1.get(name);
     Object right = record2.get(name);
     boolean same = left.equals(right);
     if (!same) {
       return false;
     }
   }
   return true;
 }
  @Override
  public void process(String line, Emitter<T> emitter) {

    record.removeAll(Fields.ATTACHMENT_BODY);
    record.put(Fields.ATTACHMENT_BODY, new ByteArrayInputStream(line.toString().getBytes()));

    if (!morphline.process(record)) {
      LOGGER.error("Unable to process record: {}", line);
      return;
    }

    // the process command above parses the record
    // and stores it into the temporaryRecord
    emitter.emit(temporaryRecord);
  }
    @Override
    protected boolean doProcess(Record inputRecord) {
      Record outputRecord = inputRecord.copy();
      AbstractParser.removeAttachments(outputRecord);
      ByteArrayOutputStream bout = new ByteArrayOutputStream(1024);
      if (format == Format.container) {
        writeContainer(inputRecord, bout);
      } else {
        writeContainerless(inputRecord, bout);
      }
      outputRecord.put(Fields.ATTACHMENT_BODY, bout.toByteArray());

      // pass record to next command in chain:
      return super.doProcess(outputRecord);
    }
  private boolean areFieldsEqual(HashMap<String, Record> expected, List<Record> actual) {
    if (expected.size() != actual.size()) {
      return false;
    }
    for (Record current : actual) {
      String key = current.getFirstValue("key").toString();
      Record currentExpected = expected.get(key);
      if (!areRecordFieldsEqual(
          current, currentExpected, Arrays.asList(new String[] {"key", "value"}))) {
        return false;
      }
    }

    return true;
  }
  @Test
  public void testDiscardRecord() {
    final MorphlineContext context = new MorphlineContext.Builder().build();
    Collector collectorParent = new Collector();
    Collector collectorChild = new Collector();
    final Command command =
        new TimeFilterBuilder().build(config, collectorParent, collectorChild, context);

    Record record = new Record();
    record.put("createdAt", "21/01/2015");

    command.process(record);

    List<Record> records = collectorChild.getRecords();
    assertThat(records.size()).isEqualTo(0);
  }
  @Override
  public boolean process(Record record) {
    temporaryRecord = null;
    LOGGER.debug("Record received: {}", record);

    List fields = record.get(Fields.ATTACHMENT_BODY);
    if (fields.size() != 1) {
      LOGGER.error("Record [ {} ] had incorrect number of fields - [{}]", record, fields.size());
      return false;
    }

    try {
      byte[] byteArray = (byte[]) fields.get(0);
      SeekableByteArrayInput inputStream = new SeekableByteArrayInput(byteArray);
      DatumReader<T> userDatumReader = new SpecificDatumReader<T>(this.recordClass);
      DataFileReader<T> dataFileReader = new DataFileReader<T>(inputStream, userDatumReader);
      temporaryRecord = dataFileReader.next();

    } catch (Exception e) {
      LOGGER.error("Unable to process {}, exception: {}", record, e);
      return false;
    }

    return true;
  }
 /**
  * Writes every attachment body of {@code src} to {@code dst} through a single Avro encoder: a
  * JSON encoder when {@code format} is {@code Format.containerlessJSON}, a binary encoder
  * otherwise.
  *
  * <p>The encoder is created lazily from the first attachment's schema. A record without any
  * attachment body therefore writes nothing and returns normally.
  *
  * @param src record whose attachment bodies must all be {@code GenericContainer} instances
  * @param dst stream receiving the encoded data; it is flushed via the encoder but not closed
  * @throws MorphlineRuntimeException if encoding or flushing fails with an {@code IOException}
  */
 private void writeContainerless(Record src, OutputStream dst) {
   try {
     GenericDatumWriter datumWriter = new GenericDatumWriter();
     Encoder encoder = null;
     Schema schema = null;
     for (Object attachment : src.get(Fields.ATTACHMENT_BODY)) {
       Preconditions.checkNotNull(attachment);
       GenericContainer datum = (GenericContainer) attachment;
       // getSchema receives the schema chosen so far; presumably it validates or reuses it.
       schema = getSchema(datum, schema);
       assert schema != null;
       datumWriter.setSchema(schema);
       if (encoder == null) { // init
         if (format == Format.containerlessJSON) {
           encoder = EncoderFactory.get().jsonEncoder(schema, dst);
         } else {
           encoder = EncoderFactory.get().binaryEncoder(dst, null);
         }
         assert encoder != null;
       }
       datumWriter.write(datum, encoder);
     }
     // The encoder only exists if at least one attachment was written; without this guard an
     // attachment-less record would fail with a NullPointerException.
     if (encoder != null) {
       encoder.flush();
     }
   } catch (IOException e) {
     throw new MorphlineRuntimeException(e);
   }
 }
 /**
  * Writes every attachment body of {@code src} to {@code dst} through an Avro
  * {@code DataFileWriter}.
  *
  * <p>The writer is created lazily on the first attachment, configured with
  * {@code codecFactory} (when non-null) and all entries of {@code metadata}, and is flushed and
  * quietly closed before returning. With no attachments nothing is written.
  *
  * @param src record whose attachment bodies must all be {@code GenericContainer} instances
  * @param dst stream receiving the container data
  * @throws MorphlineRuntimeException if writing fails with an {@code IOException}
  */
 private void writeContainer(Record src, OutputStream dst) {
   DataFileWriter containerWriter = null;
   try {
     Schema currentSchema = null;
     for (Object body : src.get(Fields.ATTACHMENT_BODY)) {
       Preconditions.checkNotNull(body);
       GenericContainer datum = (GenericContainer) body;
       currentSchema = getSchema(datum, currentSchema);
       assert currentSchema != null;
       if (containerWriter == null) { // first datum: set up the container writer
         GenericDatumWriter datumWriter = new GenericDatumWriter(currentSchema);
         containerWriter = new DataFileWriter(datumWriter);
         if (codecFactory != null) {
           containerWriter.setCodec(codecFactory);
         }
         for (Map.Entry<String, String> meta : metadata.entrySet()) {
           containerWriter.setMeta(meta.getKey(), meta.getValue());
         }
         containerWriter.create(currentSchema, dst);
       }
       containerWriter.append(datum);
     }
     if (containerWriter != null) {
       containerWriter.flush();
     }
   } catch (IOException e) {
     throw new MorphlineRuntimeException(e);
   } finally {
     Closeables.closeQuietly(containerWriter);
   }
 }
Esempio n. 11
0
  /**
   * Test that Solr queries on a parsed SequenceFile document return the expected content and
   * fields. Don't pass in our own parser via the context.
   */
  @Test
  public void testSequenceFileContentSimple() throws Exception {
    morphline = createMorphline("test-morphlines/sequenceFileMorphlineSimple");
    String path = RESOURCES_DIR;
    File sequenceFile = new File(path, "testSequenceFileContentSimple.seq");
    int numRecords = 5;
    HashMap<String, Record> expected = createTextSequenceFile(sequenceFile, numRecords);
    InputStream in = new FileInputStream(sequenceFile.getAbsolutePath());
    Record record = new Record();
    record.put(Fields.ATTACHMENT_BODY, in);
    startSession();

    assertEquals(1, collector.getNumStartEvents());
    assertTrue(morphline.process(record));
    assertTrue(areFieldsEqual(expected, collector.getRecords()));
  }
    @Override
    protected boolean doProcess(Record record) {
      Timer.Context timerContext = elapsedTime.time();

      try {
        XContentBuilder documentBuilder = jsonBuilder().startObject();
        Map<String, Collection<Object>> map = record.getFields().asMap();
        for (Map.Entry<String, Collection<Object>> entry : map.entrySet()) {
          String key = entry.getKey();
          Iterator<Object> iterator = entry.getValue().iterator();
          while (iterator.hasNext()) {
            documentBuilder.field(key, iterator.next());
          }
        }
        documentBuilder.endObject();
        loader.addDocument(documentBuilder.bytes(), indexName, indexType, ttl);
      } catch (Exception e) {
        throw new MorphlineRuntimeException(e);
      } finally {
        timerContext.stop();
      }

      // pass record to next command in chain:
      return super.doProcess(record);
    }
Esempio n. 13
0
 /**
  * Tells whether the record has an entry for {@code Fields.ATTACHMENT_MIME_TYPE}; logs a debug
  * message when it does not.
  *
  * @param record record to inspect
  * @return {@code true} if the MIME type field is present, {@code false} otherwise
  */
 private boolean hasAtLeastOneMimeType(Record record) {
   boolean present = record.getFields().containsKey(Fields.ATTACHMENT_MIME_TYPE);
   if (!present) {
     LOG.debug("Command failed because of missing MIME type for record: {}", record);
   }
   return present;
 }
Esempio n. 14
0
    private Parser detectParser(Record record) {
      if (!hasAtLeastOneMimeType(record)) {
        return null;
      }
      String mediaTypeStr =
          (String)
              record.getFirstValue(Fields.ATTACHMENT_MIME_TYPE); // ExtractingParams.STREAM_TYPE);
      assert mediaTypeStr != null;

      MediaType mediaType = parseMediaType(mediaTypeStr).getBaseType();
      Parser parser = mediaTypeToParserMap.get(mediaType); // fast path
      if (parser != null) {
        return parser;
      }
      // wildcard matching
      for (Map.Entry<MediaType, Parser> entry : mediaTypeToParserMap.entrySet()) {
        if (isMediaTypeMatch(mediaType, entry.getKey())) {
          return entry.getValue();
        }
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug(
            "No supported MIME type parser found for "
                + Fields.ATTACHMENT_MIME_TYPE
                + "="
                + mediaTypeStr);
      }
      return null;
    }
Esempio n. 15
0
    @Override
    protected boolean doProcess(Record inputRecord, final InputStream in) throws IOException {
      SequenceFile.Metadata sequenceFileMetaData = null;
      SequenceFile.Reader reader = null;
      try {
        reader =
            new SequenceFile.Reader(
                conf,
                SequenceFile.Reader.stream(new FSDataInputStream(new ForwardOnlySeekable(in))));

        if (includeMetaData) {
          sequenceFileMetaData = reader.getMetadata();
        }
        Class keyClass = reader.getKeyClass();
        Class valueClass = reader.getValueClass();
        Record template = inputRecord.copy();
        removeAttachments(template);

        while (true) {
          Writable key = (Writable) ReflectionUtils.newInstance(keyClass, conf);
          Writable val = (Writable) ReflectionUtils.newInstance(valueClass, conf);
          try {
            if (!reader.next(key, val)) {
              break;
            }
          } catch (EOFException ex) {
            // SequenceFile.Reader will throw an EOFException after reading
            // all the data, if it doesn't know the length.  Since we are
            // passing in an InputStream, we hit this case;
            LOG.trace("Received expected EOFException", ex);
            break;
          }
          incrementNumRecords();
          Record outputRecord = template.copy();
          outputRecord.put(keyField, key);
          outputRecord.put(valueField, val);
          outputRecord.put(Fields.ATTACHMENT_MIME_TYPE, OUTPUT_MEDIA_TYPE);
          if (includeMetaData && sequenceFileMetaData != null) {
            outputRecord.put(SEQUENCE_FILE_META_DATA, sequenceFileMetaData);
          }

          // pass record to next command in chain:
          if (!getChild().process(outputRecord)) {
            return false;
          }
        }
      } finally {
        Closeables.closeQuietly(reader);
      }
      return true;
    }
  @Test
  public void testBasic() throws Exception {
    morphline = createMorphline("test-morphlines/startReportingMetricsToHTTP");

    Record record = new Record();
    String msg = "foo";
    record.put(Fields.MESSAGE, msg);
    Record expected = new Record();
    expected.put(Fields.MESSAGE, msg);
    processAndVerifySuccess(record, expected);

    if ("true".equals(System.getProperty("HttpMetricsMorphlineTest.isDemo"))) {
      // wait forever so user can browse to http://localhost:8080/ and interactively explore the
      // features
      Thread.sleep(Long.MAX_VALUE);
    }

    verifyServing(8080);
    verifyServing(8081);
    verifyShutdown(8080);
    verifyShutdown(8081);
  }
Esempio n. 17
0
    @Override
    protected boolean doProcess(Record record, InputStream inputStream) {
      Parser parser = detectParser(record);
      if (parser == null) {
        return false;
      }

      ParseContext parseContext = new ParseContext();
      parseContext.set(Locale.class, locale);

      Metadata metadata = new Metadata();
      for (Entry<String, Object> entry : record.getFields().entries()) {
        metadata.add(entry.getKey(), entry.getValue().toString());
      }

      SolrContentHandler handler =
          solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
      try {
        inputStream = TikaInputStream.get(inputStream);

        ContentHandler parsingHandler = handler;

        // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()";
        if (xpathExpr != null) {
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(parsingHandler, matcher);
        }

        try {
          parser.parse(inputStream, parsingHandler, metadata, parseContext);
        } catch (IOException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (SAXException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (TikaException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        }
      } finally {
        if (inputStream != null) {
          Closeables.closeQuietly(inputStream);
        }
      }

      SolrInputDocument doc = handler.newDocument();
      LOG.debug("solr doc: {}", doc);
      Record outputRecord = toRecord(doc);
      return getChild().process(outputRecord);
    }
    @Override
    protected boolean doProcess(Record record) {
      if (preserveExisting
          && record
              .getFields()
              .containsKey(
                  Fields.ATTACHMENT_MIME_TYPE)) {; // we must preserve the existing MIME type
      } else {
        List attachments = record.get(Fields.ATTACHMENT_BODY);
        if (attachments.size() > 0) {
          Object attachment = attachments.get(0);
          Preconditions.checkNotNull(attachment);
          InputStream stream;
          if (attachment instanceof byte[]) {
            stream = new ByteArrayInputStream((byte[]) attachment);
          } else {
            stream = (InputStream) attachment;
          }

          Metadata metadata = new Metadata();

          // If you specify the resource name (the filename, roughly) with this
          // parameter, then Tika can use it in guessing the right MIME type
          String resourceName = (String) record.getFirstValue(Fields.ATTACHMENT_NAME);
          if (resourceName != null) {
            metadata.add(Metadata.RESOURCE_NAME_KEY, resourceName);
          }

          // Provide stream's charset as hint to Tika for better auto detection
          String charset = (String) record.getFirstValue(Fields.ATTACHMENT_CHARSET);
          if (charset != null) {
            metadata.add(Metadata.CONTENT_ENCODING, charset);
          }

          if (includeMetaData) {
            for (Entry<String, Object> entry : record.getFields().entries()) {
              metadata.add(entry.getKey(), entry.getValue().toString());
            }
          }

          String mimeType = getMediaType(stream, metadata, excludeParameters);
          record.replaceValues(Fields.ATTACHMENT_MIME_TYPE, mimeType);
        }
      }
      return super.doProcess(record);
    }