private void writeContainer(Record src, OutputStream dst) {
  DataFileWriter dataFileWriter = null;
  try {
    try {
      Schema schema = null;
      for (Object attachment : src.get(Fields.ATTACHMENT_BODY)) {
        Preconditions.checkNotNull(attachment);
        GenericContainer datum = (GenericContainer) attachment;
        schema = getSchema(datum, schema);
        assert schema != null;
        if (dataFileWriter == null) { // init
          GenericDatumWriter datumWriter = new GenericDatumWriter(schema);
          dataFileWriter = new DataFileWriter(datumWriter);
          if (codecFactory != null) {
            dataFileWriter.setCodec(codecFactory);
          }
          for (Map.Entry<String, String> entry : metadata.entrySet()) {
            dataFileWriter.setMeta(entry.getKey(), entry.getValue());
          }
          dataFileWriter.create(schema, dst);
        }
        dataFileWriter.append(datum);
      }
      if (dataFileWriter != null) {
        dataFileWriter.flush();
      }
    } catch (IOException e) {
      throw new MorphlineRuntimeException(e);
    }
  } finally {
    Closeables.closeQuietly(dataFileWriter);
  }
}
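The getSchema(datum, schema) helper used above is not shown. A minimal sketch of what such a helper might do, assuming its only job is to enforce that every attachment carries the same Avro schema (an Avro container file can hold datums of a single schema only); the body below is an illustration, not the original implementation:

private Schema getSchema(GenericContainer datum, Schema lastSchema) {
  // Assumed behavior: reject records whose attachments disagree on the schema,
  // because DataFileWriter.create() fixes the schema for the whole container file.
  Schema schema = datum.getSchema();
  if (lastSchema != null && !lastSchema.equals(schema)) {
    throw new IllegalArgumentException(
        "Attachments must share the same Avro schema; found " + lastSchema + " and " + schema);
  }
  return schema;
}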
/**
 * Generates an input Avro file containing the given records in the temporary directory and
 * returns the full path of the file.
 */
private <T> String generateTestFile(
    String filename,
    List<T> elems,
    SyncBehavior syncBehavior,
    int syncInterval,
    AvroCoder<T> coder,
    String codec)
    throws IOException {
  Random random = new Random(0);
  File tmpFile = tmpFolder.newFile(filename);
  String path = tmpFile.toString();

  FileOutputStream os = new FileOutputStream(tmpFile);
  DatumWriter<T> datumWriter = coder.createDatumWriter();
  try (DataFileWriter<T> writer = new DataFileWriter<>(datumWriter)) {
    writer.setCodec(CodecFactory.fromString(codec));
    writer.create(coder.getSchema(), os);
    int recordIndex = 0;
    int syncIndex = syncBehavior == SyncBehavior.SYNC_RANDOM ? random.nextInt(syncInterval) : 0;
    for (T elem : elems) {
      writer.append(elem);
      recordIndex++;
      switch (syncBehavior) {
        case SYNC_REGULAR:
          if (recordIndex == syncInterval) {
            recordIndex = 0;
            writer.sync();
          }
          break;
        case SYNC_RANDOM:
          if (recordIndex == syncIndex) {
            recordIndex = 0;
            writer.sync();
            syncIndex = random.nextInt(syncInterval);
          }
          break;
        case SYNC_DEFAULT:
        default:
      }
    }
  }
  return path;
}
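The SyncBehavior enum referenced above is not included in the snippet. The sketch below infers it from the switch statement, together with a hypothetical call; the comments on the constants and the String-record usage are assumptions:

private enum SyncBehavior {
  SYNC_REGULAR, // writer.sync() after every syncInterval records
  SYNC_RANDOM,  // writer.sync() at random gaps drawn from [0, syncInterval)
  SYNC_DEFAULT  // rely on DataFileWriter's own block-size-based sync markers
}

// Hypothetical usage: the built-in "null" codec and a sync marker every 100 records.
String path = generateTestFile(
    "records.avro",
    Arrays.asList("a", "b", "c"),
    SyncBehavior.SYNC_REGULAR,
    100,
    AvroCoder.of(String.class),
    "null");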
@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
  FlowFile incomingAvro = session.get();
  if (incomingAvro == null) {
    return;
  }

  String inputSchemaProperty =
      context.getProperty(INPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
  final Schema inputSchema;
  try {
    inputSchema = getSchema(inputSchemaProperty, DefaultConfiguration.get());
  } catch (SchemaNotFoundException e) {
    getLogger().error("Cannot find schema: " + inputSchemaProperty);
    session.transfer(incomingAvro, FAILURE);
    return;
  }

  String outputSchemaProperty =
      context.getProperty(OUTPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
  final Schema outputSchema;
  try {
    outputSchema = getSchema(outputSchemaProperty, DefaultConfiguration.get());
  } catch (SchemaNotFoundException e) {
    getLogger().error("Cannot find schema: " + outputSchemaProperty);
    session.transfer(incomingAvro, FAILURE);
    return;
  }

  final Map<String, String> fieldMapping = new HashMap<>();
  for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) {
    if (entry.getKey().isDynamic()) {
      fieldMapping.put(entry.getKey().getName(), entry.getValue());
    }
  }

  final AvroRecordConverter converter =
      new AvroRecordConverter(inputSchema, outputSchema, fieldMapping);

  final DataFileWriter<Record> writer =
      new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
  writer.setCodec(CodecFactory.snappyCodec());

  final DataFileWriter<Record> failureWriter =
      new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
  failureWriter.setCodec(CodecFactory.snappyCodec());

  try {
    final LongHolder written = new LongHolder(0L);
    final FailureTracker failures = new FailureTracker();

    final List<Record> badRecords = Lists.newLinkedList();

    FlowFile incomingAvroCopy = session.clone(incomingAvro);
    FlowFile outgoingAvro =
        session.write(
            incomingAvro,
            new StreamCallback() {
              @Override
              public void process(InputStream in, OutputStream out) throws IOException {
                try (DataFileStream<Record> stream =
                    new DataFileStream<Record>(
                        in, new GenericDatumReader<Record>(converter.getInputSchema()))) {
                  try (DataFileWriter<Record> w = writer.create(outputSchema, out)) {
                    for (Record record : stream) {
                      try {
                        Record converted = converter.convert(record);
                        w.append(converted);
                        written.incrementAndGet();
                      } catch (AvroConversionException e) {
                        failures.add(e);
                        getLogger().error("Error converting data: " + e.getMessage());
                        badRecords.add(record);
                      }
                    }
                  }
                }
              }
            });

    FlowFile badOutput =
        session.write(
            incomingAvroCopy,
            new StreamCallback() {
              @Override
              public void process(InputStream in, OutputStream out) throws IOException {
                try (DataFileWriter<Record> w = failureWriter.create(inputSchema, out)) {
                  for (Record record : badRecords) {
                    w.append(record);
                  }
                }
              }
            });

    long errors = failures.count();

    // update only if file transfer is successful
    session.adjustCounter("Converted records", written.get(), false);
    // update only if file transfer is successful
    session.adjustCounter("Conversion errors", errors, false);

    if (written.get() > 0L) {
      session.transfer(outgoingAvro, SUCCESS);
    } else {
      session.remove(outgoingAvro);

      if (errors == 0L) {
        badOutput = session.putAttribute(badOutput, "errors", "No incoming records");
        session.transfer(badOutput, FAILURE);
      }
    }

    if (errors > 0L) {
      getLogger()
          .warn(
              "Failed to convert {}/{} records between Avro Schemas",
              new Object[] {errors, errors + written.get()});
      badOutput = session.putAttribute(badOutput, "errors", failures.summary());
      session.transfer(badOutput, FAILURE);
    } else {
      session.remove(badOutput);
    }
  } catch (ProcessException | DatasetIOException e) {
    getLogger().error("Failed reading or writing", e);
    session.transfer(incomingAvro, FAILURE);
  } catch (DatasetException e) {
    getLogger().error("Failed to read FlowFile", e);
    session.transfer(incomingAvro, FAILURE);
  }
}
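Not part of the processor above: a minimal sketch of how the Snappy-compressed container file it emits could be read back for verification, using only standard Avro classes. The helper name and the idea of reading from a local File are assumptions for illustration:

private long countConvertedRecords(File avroFile) throws IOException {
  long count = 0;
  try (DataFileStream<GenericRecord> stream =
      new DataFileStream<>(new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>())) {
    // The container header carries the writer schema and the codec set by the processor.
    Schema writtenSchema = stream.getSchema();          // should equal the configured output schema
    String codec = stream.getMetaString("avro.codec");  // "snappy", per CodecFactory.snappyCodec()
    for (GenericRecord record : stream) {
      count++;
    }
  }
  return count;
}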