/**
 * Converts a Solr input document into a morphline {@link Record} by copying every
 * field name together with all of its values.
 */
private Record toRecord(SolrInputDocument doc) {
  Record result = new Record();
  for (String fieldName : doc.keySet()) {
    // copy all values of this Solr field into the record's multimap
    result.getFields().putAll(fieldName, doc.getField(fieldName).getValues());
  }
  return result;
}
/**
 * Writes a SequenceFile containing {@code numRecords} Text key/value pairs
 * ("key0"/"value0", ...) and returns a mapping of expected keys -> records for
 * later verification.
 *
 * @param file destination file for the sequence data
 * @param numRecords number of key/value pairs to generate
 * @return map from key string to the Record holding that key/value pair
 * @throws IOException if writing the sequence file fails
 */
private HashMap<String, Record> createTextSequenceFile(File file, int numRecords) throws IOException {
  HashMap<String, Record> map = new HashMap<String, Record>();
  SequenceFile.Metadata metadata = new SequenceFile.Metadata(getMetadataForSequenceFile());
  FSDataOutputStream out = new FSDataOutputStream(new FileOutputStream(file), null);
  SequenceFile.Writer writer = null;
  try {
    writer = SequenceFile.createWriter(
        new Configuration(),
        out,
        Text.class,
        Text.class,
        SequenceFile.CompressionType.NONE,
        null,
        metadata);
    for (int i = 0; i < numRecords; ++i) {
      Text key = new Text("key" + i);
      Text value = new Text("value" + i);
      writer.append(key, value);
      Record record = new Record();
      record.put("key", key);
      record.put("value", value);
      map.put(key.toString(), record);
    }
  } finally {
    if (writer != null) {
      // closing the writer also closes the wrapped output stream
      Closeables.closeQuietly(writer);
    } else {
      // BUGFIX: if createWriter threw, the writer never owned the stream;
      // close it here to avoid leaking the file handle
      Closeables.closeQuietly(out);
    }
  }
  return map;
}
/**
 * Returns true iff the two records hold equal values for every field named in
 * {@code fieldsToCheck}; stops at the first mismatch.
 */
private boolean areRecordFieldsEqual(Record record1, Record record2, List<String> fieldsToCheck) {
  for (String fieldName : fieldsToCheck) {
    if (record1.get(fieldName).equals(record2.get(fieldName))) {
      continue; // this field matches; check the next one
    }
    return false;
  }
  return true;
}
@Override public void process(String line, Emitter<T> emitter) { record.removeAll(Fields.ATTACHMENT_BODY); record.put(Fields.ATTACHMENT_BODY, new ByteArrayInputStream(line.toString().getBytes())); if (!morphline.process(record)) { LOGGER.error("Unable to process record: {}", line); return; } // the process command above parses the record // and stores it into the temporaryRecord emitter.emit(temporaryRecord); }
@Override protected boolean doProcess(Record inputRecord) { Record outputRecord = inputRecord.copy(); AbstractParser.removeAttachments(outputRecord); ByteArrayOutputStream bout = new ByteArrayOutputStream(1024); if (format == Format.container) { writeContainer(inputRecord, bout); } else { writeContainerless(inputRecord, bout); } outputRecord.put(Fields.ATTACHMENT_BODY, bout.toByteArray()); // pass record to next command in chain: return super.doProcess(outputRecord); }
/**
 * Returns true iff every actual record matches the expected record that shares its
 * "key" field, comparing the "key" and "value" fields, and the sizes agree.
 *
 * @param expected mapping of key string -> expected record
 * @param actual records produced by the morphline under test
 */
private boolean areFieldsEqual(HashMap<String, Record> expected, List<Record> actual) {
  if (expected.size() != actual.size()) {
    return false;
  }
  for (Record current : actual) {
    String key = current.getFirstValue("key").toString();
    Record currentExpected = expected.get(key);
    // BUGFIX: an actual record with an unexpected key used to cause an NPE inside
    // areRecordFieldsEqual; treat it as a mismatch instead
    if (currentExpected == null) {
      return false;
    }
    if (!areRecordFieldsEqual(current, currentExpected, Arrays.asList("key", "value"))) {
      return false;
    }
  }
  return true;
}
/** Verifies that a record failing the time filter is discarded (never reaches the child). */
@Test
public void testDiscardRecord() {
  final MorphlineContext context = new MorphlineContext.Builder().build();
  Collector collectorParent = new Collector();
  Collector collectorChild = new Collector();
  final Command command =
      new TimeFilterBuilder().build(config, collectorParent, collectorChild, context);
  Record input = new Record();
  input.put("createdAt", "21/01/2015");
  command.process(input);
  // the filter must have dropped the record, so nothing reaches the child collector
  List<Record> collected = collectorChild.getRecords();
  assertThat(collected.size()).isEqualTo(0);
}
/**
 * Deserializes the record's single byte-array attachment as an Avro data file and
 * stores the first datum into {@code temporaryRecord} for a downstream command.
 *
 * @param record record whose ATTACHMENT_BODY holds exactly one byte[] of Avro data
 * @return true on success; false if the attachment count is wrong or parsing fails
 */
@Override
public boolean process(Record record) {
  temporaryRecord = null;
  LOGGER.debug("Record received: {}", record);
  List fields = record.get(Fields.ATTACHMENT_BODY);
  if (fields.size() != 1) {
    LOGGER.error("Record [ {} ] had incorrect number of fields - [{}]", record, fields.size());
    return false;
  }
  DataFileReader<T> dataFileReader = null;
  try {
    byte[] byteArray = (byte[]) fields.get(0);
    SeekableByteArrayInput inputStream = new SeekableByteArrayInput(byteArray);
    DatumReader<T> userDatumReader = new SpecificDatumReader<T>(this.recordClass);
    dataFileReader = new DataFileReader<T>(inputStream, userDatumReader);
    temporaryRecord = dataFileReader.next();
  } catch (Exception e) {
    // BUGFIX: passing the exception as the LAST argument with no matching placeholder
    // lets SLF4J log the full stack trace; the old "exception: {}" format consumed it
    // as a plain format argument and lost the trace.
    LOGGER.error("Unable to process record {}", record, e);
    return false;
  } finally {
    // BUGFIX: the reader (and its underlying input) was never closed — resource leak
    if (dataFileReader != null) {
      try {
        dataFileReader.close();
      } catch (Exception e) {
        LOGGER.warn("Unable to close Avro data file reader", e);
      }
    }
  }
  return true;
}
private void writeContainerless(Record src, OutputStream dst) { try { GenericDatumWriter datumWriter = new GenericDatumWriter(); Encoder encoder = null; Schema schema = null; for (Object attachment : src.get(Fields.ATTACHMENT_BODY)) { Preconditions.checkNotNull(attachment); GenericContainer datum = (GenericContainer) attachment; schema = getSchema(datum, schema); assert schema != null; datumWriter.setSchema(schema); if (encoder == null) { // init if (format == Format.containerlessJSON) { encoder = EncoderFactory.get().jsonEncoder(schema, dst); } else { encoder = EncoderFactory.get().binaryEncoder(dst, null); } assert encoder != null; } datumWriter.write(datum, encoder); } encoder.flush(); } catch (IOException e) { throw new MorphlineRuntimeException(e); } }
/**
 * Writes all Avro datum attachments of {@code src} to {@code dst} as an Avro
 * container file; the writer is created lazily from the first datum's schema,
 * carrying the configured codec and metadata entries.
 *
 * @param src record whose ATTACHMENT_BODY values are Avro GenericContainer datums
 * @param dst stream receiving the container file
 * @throws MorphlineRuntimeException wrapping any IOException from Avro
 */
private void writeContainer(Record src, OutputStream dst) {
  DataFileWriter dataFileWriter = null;
  try {
    try {
      Schema schema = null;
      for (Object attachment : src.get(Fields.ATTACHMENT_BODY)) {
        Preconditions.checkNotNull(attachment);
        GenericContainer datum = (GenericContainer) attachment;
        // getSchema presumably validates that all datums share one schema — see helper
        schema = getSchema(datum, schema);
        assert schema != null;
        if (dataFileWriter == null) { // init: lazily create the writer from the first schema
          GenericDatumWriter datumWriter = new GenericDatumWriter(schema);
          dataFileWriter = new DataFileWriter(datumWriter);
          if (codecFactory != null) {
            dataFileWriter.setCodec(codecFactory);
          }
          // propagate configured key/value metadata into the container header
          for (Map.Entry<String, String> entry : metadata.entrySet()) {
            dataFileWriter.setMeta(entry.getKey(), entry.getValue());
          }
          dataFileWriter.create(schema, dst);
        }
        dataFileWriter.append(datum);
      }
      // with zero attachments the writer was never created, so flush only if it exists
      if (dataFileWriter != null) {
        dataFileWriter.flush();
      }
    } catch (IOException e) {
      throw new MorphlineRuntimeException(e);
    }
  } finally {
    // NOTE(review): closeQuietly suppresses close() failures, which for a writer can
    // hide lost trailing data — deliberate best-effort here, kept as-is
    Closeables.closeQuietly(dataFileWriter);
  }
}
/**
 * Test that Solr queries on a parsed SequenceFile document return the expected content and
 * fields. Don't pass in our own parser via the context.
 */
@Test
public void testSequenceFileContentSimple() throws Exception {
  morphline = createMorphline("test-morphlines/sequenceFileMorphlineSimple");
  String path = RESOURCES_DIR;
  File sequenceFile = new File(path, "testSequenceFileContentSimple.seq");
  int numRecords = 5;
  HashMap<String, Record> expected = createTextSequenceFile(sequenceFile, numRecords);
  InputStream in = new FileInputStream(sequenceFile.getAbsolutePath());
  // BUGFIX: the input stream was never closed, leaking a file handle on every run
  try {
    Record record = new Record();
    record.put(Fields.ATTACHMENT_BODY, in);
    startSession();
    assertEquals(1, collector.getNumStartEvents());
    assertTrue(morphline.process(record));
    assertTrue(areFieldsEqual(expected, collector.getRecords()));
  } finally {
    in.close();
  }
}
@Override protected boolean doProcess(Record record) { Timer.Context timerContext = elapsedTime.time(); try { XContentBuilder documentBuilder = jsonBuilder().startObject(); Map<String, Collection<Object>> map = record.getFields().asMap(); for (Map.Entry<String, Collection<Object>> entry : map.entrySet()) { String key = entry.getKey(); Iterator<Object> iterator = entry.getValue().iterator(); while (iterator.hasNext()) { documentBuilder.field(key, iterator.next()); } } documentBuilder.endObject(); loader.addDocument(documentBuilder.bytes(), indexName, indexType, ttl); } catch (Exception e) { throw new MorphlineRuntimeException(e); } finally { timerContext.stop(); } // pass record to next command in chain: return super.doProcess(record); }
/**
 * Returns whether the record carries an ATTACHMENT_MIME_TYPE field, logging a
 * debug message when it does not.
 */
private boolean hasAtLeastOneMimeType(Record record) {
  boolean present = record.getFields().containsKey(Fields.ATTACHMENT_MIME_TYPE);
  if (!present) {
    LOG.debug("Command failed because of missing MIME type for record: {}", record);
  }
  return present;
}
private Parser detectParser(Record record) { if (!hasAtLeastOneMimeType(record)) { return null; } String mediaTypeStr = (String) record.getFirstValue(Fields.ATTACHMENT_MIME_TYPE); // ExtractingParams.STREAM_TYPE); assert mediaTypeStr != null; MediaType mediaType = parseMediaType(mediaTypeStr).getBaseType(); Parser parser = mediaTypeToParserMap.get(mediaType); // fast path if (parser != null) { return parser; } // wildcard matching for (Map.Entry<MediaType, Parser> entry : mediaTypeToParserMap.entrySet()) { if (isMediaTypeMatch(mediaType, entry.getKey())) { return entry.getValue(); } } if (LOG.isDebugEnabled()) { LOG.debug( "No supported MIME type parser found for " + Fields.ATTACHMENT_MIME_TYPE + "=" + mediaTypeStr); } return null; }
/**
 * Reads every key/value pair out of a SequenceFile streamed via {@code in} and
 * forwards one output record per pair to the next command in the chain.
 *
 * @param inputRecord template record whose non-attachment fields are copied onto
 *     every emitted record
 * @param in stream positioned at the start of a SequenceFile
 * @return false as soon as a downstream command rejects a record, true otherwise
 * @throws IOException on read failures other than the expected end-of-stream EOF
 */
@Override
protected boolean doProcess(Record inputRecord, final InputStream in) throws IOException {
  SequenceFile.Metadata sequenceFileMetaData = null;
  SequenceFile.Reader reader = null;
  try {
    // ForwardOnlySeekable adapts the plain InputStream to the seekable API the
    // SequenceFile reader requires (forward seeks only)
    reader =
        new SequenceFile.Reader(
            conf, SequenceFile.Reader.stream(new FSDataInputStream(new ForwardOnlySeekable(in))));
    if (includeMetaData) {
      sequenceFileMetaData = reader.getMetadata();
    }
    Class keyClass = reader.getKeyClass();
    Class valueClass = reader.getValueClass();
    // template carries the input record's fields minus its attachments
    Record template = inputRecord.copy();
    removeAttachments(template);
    while (true) {
      // fresh Writable instances per iteration so emitted records don't share state
      Writable key = (Writable) ReflectionUtils.newInstance(keyClass, conf);
      Writable val = (Writable) ReflectionUtils.newInstance(valueClass, conf);
      try {
        if (!reader.next(key, val)) {
          break;
        }
      } catch (EOFException ex) {
        // SequenceFile.Reader will throw an EOFException after reading
        // all the data, if it doesn't know the length. Since we are
        // passing in an InputStream, we hit this case;
        LOG.trace("Received expected EOFException", ex);
        break;
      }
      incrementNumRecords();
      Record outputRecord = template.copy();
      outputRecord.put(keyField, key);
      outputRecord.put(valueField, val);
      outputRecord.put(Fields.ATTACHMENT_MIME_TYPE, OUTPUT_MEDIA_TYPE);
      if (includeMetaData && sequenceFileMetaData != null) {
        outputRecord.put(SEQUENCE_FILE_META_DATA, sequenceFileMetaData);
      }
      // pass record to next command in chain:
      if (!getChild().process(outputRecord)) {
        return false;
      }
    }
  } finally {
    Closeables.closeQuietly(reader);
  }
  return true;
}
@Test public void testBasic() throws Exception { morphline = createMorphline("test-morphlines/startReportingMetricsToHTTP"); Record record = new Record(); String msg = "foo"; record.put(Fields.MESSAGE, msg); Record expected = new Record(); expected.put(Fields.MESSAGE, msg); processAndVerifySuccess(record, expected); if ("true".equals(System.getProperty("HttpMetricsMorphlineTest.isDemo"))) { // wait forever so user can browse to http://localhost:8080/ and interactively explore the // features Thread.sleep(Long.MAX_VALUE); } verifyServing(8080); verifyServing(8081); verifyShutdown(8080); verifyShutdown(8081); }
/**
 * Parses the attachment stream with the Tika parser matching the record's MIME
 * type, collecting output into a SolrContentHandler, and forwards the resulting
 * Solr document (converted back to a Record) to the next command.
 *
 * @param record record supplying the MIME type and metadata fields
 * @param inputStream attachment content to parse; closed before returning
 * @return false if no parser supports the MIME type, else the child's result
 * @throws MorphlineRuntimeException wrapping any parse failure
 */
@Override
protected boolean doProcess(Record record, InputStream inputStream) {
  Parser parser = detectParser(record);
  if (parser == null) {
    return false;
  }
  ParseContext parseContext = new ParseContext();
  parseContext.set(Locale.class, locale);
  Metadata metadata = new Metadata();
  // expose every record field to Tika as string metadata (hints for the parser)
  for (Entry<String, Object> entry : record.getFields().entries()) {
    metadata.add(entry.getKey(), entry.getValue().toString());
  }
  SolrContentHandler handler =
      solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
  try {
    // wrap so Tika can buffer/spool the stream as it needs
    inputStream = TikaInputStream.get(inputStream);
    ContentHandler parsingHandler = handler;
    // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()";
    // optionally restrict SAX events to the subtree selected by the XPath expression
    if (xpathExpr != null) {
      Matcher matcher = PARSER.parse(xpathExpr);
      parsingHandler = new MatchingContentHandler(parsingHandler, matcher);
    }
    try {
      parser.parse(inputStream, parsingHandler, metadata, parseContext);
    } catch (IOException e) {
      throw new MorphlineRuntimeException("Cannot parse", e);
    } catch (SAXException e) {
      throw new MorphlineRuntimeException("Cannot parse", e);
    } catch (TikaException e) {
      throw new MorphlineRuntimeException("Cannot parse", e);
    }
  } finally {
    if (inputStream != null) {
      Closeables.closeQuietly(inputStream);
    }
  }
  SolrInputDocument doc = handler.newDocument();
  LOG.debug("solr doc: {}", doc);
  Record outputRecord = toRecord(doc);
  return getChild().process(outputRecord);
}
@Override protected boolean doProcess(Record record) { if (preserveExisting && record .getFields() .containsKey( Fields.ATTACHMENT_MIME_TYPE)) {; // we must preserve the existing MIME type } else { List attachments = record.get(Fields.ATTACHMENT_BODY); if (attachments.size() > 0) { Object attachment = attachments.get(0); Preconditions.checkNotNull(attachment); InputStream stream; if (attachment instanceof byte[]) { stream = new ByteArrayInputStream((byte[]) attachment); } else { stream = (InputStream) attachment; } Metadata metadata = new Metadata(); // If you specify the resource name (the filename, roughly) with this // parameter, then Tika can use it in guessing the right MIME type String resourceName = (String) record.getFirstValue(Fields.ATTACHMENT_NAME); if (resourceName != null) { metadata.add(Metadata.RESOURCE_NAME_KEY, resourceName); } // Provide stream's charset as hint to Tika for better auto detection String charset = (String) record.getFirstValue(Fields.ATTACHMENT_CHARSET); if (charset != null) { metadata.add(Metadata.CONTENT_ENCODING, charset); } if (includeMetaData) { for (Entry<String, Object> entry : record.getFields().entries()) { metadata.add(entry.getKey(), entry.getValue().toString()); } } String mimeType = getMediaType(stream, metadata, excludeParameters); record.replaceValues(Fields.ATTACHMENT_MIME_TYPE, mimeType); } } return super.doProcess(record); }