@Test
public void testHadoopCodecFactoryGZip() {
  // The Hadoop GZip codec class name resolves to Avro's built-in deflate codec.
  CodecFactory hadoopGzipCodec =
      HadoopCodecFactory.fromHadoopString("org.apache.hadoop.io.compress.GZipCodec");
  CodecFactory avroDeflateCodec = CodecFactory.fromString("deflate");
  assertTrue(hadoopGzipCodec.getClass().equals(avroDeflateCodec.getClass()));
}
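The test only compares factory classes. The sketch below shows, under the same assumption that the Hadoop GZip class name maps onto Avro's "deflate" codec, how a factory obtained from CodecFactory.fromString is applied when writing a container file. The one-field schema, record value, and output file name are made up for illustration.

import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

import java.io.File;
import java.io.IOException;

public class DeflateCodecExample {
  public static void main(String[] args) throws IOException {
    // Hypothetical one-field schema used only for illustration.
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"Line\",\"fields\":[{\"name\":\"text\",\"type\":\"string\"}]}");

    GenericRecord record = new GenericData.Record(schema);
    record.put("text", "hello");

    try (DataFileWriter<GenericRecord> writer =
        new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
      // "deflate" is the Avro codec name the Hadoop GZip class resolves to above.
      // setCodec must be called before create().
      writer.setCodec(CodecFactory.fromString("deflate"));
      writer.create(schema, new File("deflate-example.avro"));
      writer.append(record);
    }
  }
}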
public WriteAvroToByteArray(
    CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) {
  super(builder, config, parent, child, context);
  this.format = new Validator<Format>()
      .validateEnum(
          config,
          getConfigs().getString(config, "format", Format.container.toString()),
          Format.class);

  String codec = getConfigs().getString(config, "codec", null);
  if (codec == null) {
    this.codecFactory = null;
  } else {
    this.codecFactory = CodecFactory.fromString(codec);
  }

  Config metadataConfig = getConfigs().getConfig(config, "metadata", ConfigFactory.empty());
  for (Map.Entry<String, Object> entry : new Configs().getEntrySet(metadataConfig)) {
    this.metadata.put(entry.getKey(), entry.getValue().toString());
  }

  validateArguments();
}
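For reference, a minimal sketch of the kind of configuration the constructor above consumes, built directly with Typesafe Config's ConfigFactory.parseString. The key names format, codec, and metadata come from the constructor; the snappy value and the owner metadata entry are illustrative assumptions, and the surrounding morphline command wiring is omitted.

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import org.apache.avro.file.CodecFactory;

public class WriteAvroConfigSketch {
  public static void main(String[] args) {
    // Keys mirror what the constructor reads; the values are only examples.
    Config config = ConfigFactory.parseString(
        "format : container\n"
      + "codec : snappy\n"                        // snappy needs snappy-java at runtime
      + "metadata : { owner : ingest-pipeline }"); // hypothetical metadata entry

    // Mirrors the constructor's handling: no codec configured means a null factory,
    // otherwise the codec is resolved by name.
    String codec = config.hasPath("codec") ? config.getString("codec") : null;
    CodecFactory codecFactory = (codec == null) ? null : CodecFactory.fromString(codec);
    System.out.println("codec factory: " + codecFactory);
  }
}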
/**
 * Ensure that currentWriter is populated and refers to the correct data file. This may roll
 * over the existing data file. Also ensures that writing one more span will not violate the
 * limits on Span storage.
 *
 * @throws IOException if the current file cannot be closed or a new one created
 */
private void assureCurrentWriter() throws IOException {
  boolean createNewFile = false;

  // Would one more span overshoot the retention policy? Drop the oldest files until it fits.
  while (this.spansSoFar >= maxSpans) {
    File oldest = null;
    // If spansSoFar is positive, there must be at least one file.
    synchronized (this.files) {
      oldest = this.files.remove(this.files.firstKey());
    }
    this.spansSoFar -= spansPerFile.get(oldest);
    spansPerFile.remove(oldest);
    oldest.delete();
  }

  if (files.size() == 0) {
    // Corner case: we just removed the current file, so reset the current-writer state.
    currentTimestamp = (long) 0;
    currentWriter = null;
  }

  long rightNow = System.currentTimeMillis() / 1000L;
  // Which file should we be writing to?
  long cutOff = floorSecond(rightNow);

  if (currentWriter == null) {
    createNewFile = true;
  }
  // Test for roll-over.
  else if (cutOff >= (currentTimestamp + secondsPerFile)) {
    currentWriter.close();
    createNewFile = true;
  }

  if (createNewFile) {
    File newFile =
        new File(traceFileDir + "/" + Thread.currentThread().getId() + "_" + cutOff + FILE_SUFFIX);
    synchronized (this.files) {
      this.files.put(cutOff, newFile);
    }
    this.spansPerFile.put(newFile, (long) 0);
    this.currentWriter = new DataFileWriter<Span>(SPAN_WRITER);
    this.currentWriter.setCodec(CodecFactory.deflateCodec(compressionLevel));
    this.currentWriter.create(Span.SCHEMA$, newFile);
    this.currentTimestamp = cutOff;
  }
}
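The roll-over decision above hinges on floorSecond() and secondsPerFile, neither of which is shown in this snippet. The sketch below illustrates the assumed arithmetic: a helper that rounds a timestamp down to the start of its file window, plus the same roll-over test the method uses. The helper is an assumption for illustration, not the original implementation.

public class RollOverSketch {
  // Assumed behaviour of floorSecond(): round a UNIX timestamp (in seconds)
  // down to the start of its file window.
  static long floorSecond(long epochSeconds, long secondsPerFile) {
    return epochSeconds - (epochSeconds % secondsPerFile);
  }

  public static void main(String[] args) {
    long secondsPerFile = 60;                        // hypothetical window length
    long now = 1_700_000_123L;                       // some point in time, in seconds
    long cutOff = floorSecond(now, secondsPerFile);
    long currentTimestamp = cutOff - secondsPerFile; // pretend the writer opened one window ago

    // Mirrors the roll-over test in assureCurrentWriter(): a full window has
    // elapsed, so a new file would be created.
    boolean rollOver = cutOff >= currentTimestamp + secondsPerFile;
    System.out.println("cutOff=" + cutOff + " rollOver=" + rollOver);
  }
}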
/**
 * Generates an input Avro file containing the given records in the temporary directory and
 * returns the full path of the file.
 */
private <T> String generateTestFile(
    String filename,
    List<T> elems,
    SyncBehavior syncBehavior,
    int syncInterval,
    AvroCoder<T> coder,
    String codec)
    throws IOException {
  Random random = new Random(0);
  File tmpFile = tmpFolder.newFile(filename);
  String path = tmpFile.toString();

  FileOutputStream os = new FileOutputStream(tmpFile);
  DatumWriter<T> datumWriter = coder.createDatumWriter();
  try (DataFileWriter<T> writer = new DataFileWriter<>(datumWriter)) {
    writer.setCodec(CodecFactory.fromString(codec));
    writer.create(coder.getSchema(), os);
    int recordIndex = 0;
    int syncIndex = syncBehavior == SyncBehavior.SYNC_RANDOM ? random.nextInt(syncInterval) : 0;

    for (T elem : elems) {
      writer.append(elem);
      recordIndex++;

      switch (syncBehavior) {
        case SYNC_REGULAR:
          if (recordIndex == syncInterval) {
            recordIndex = 0;
            writer.sync();
          }
          break;
        case SYNC_RANDOM:
          if (recordIndex == syncIndex) {
            recordIndex = 0;
            writer.sync();
            syncIndex = random.nextInt(syncInterval);
          }
          break;
        case SYNC_DEFAULT:
        default:
      }
    }
  }
  return path;
}
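The sync behaviours above rely on DataFileWriter.sync() ending the current block and returning a position that a reader can later seek back to. A self-contained sketch of that round trip follows; the one-field schema, file name, and record values are made up for illustration.

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

import java.io.File;
import java.io.IOException;

public class SyncMarkerSketch {
  public static void main(String[] args) throws IOException {
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"N\",\"fields\":[{\"name\":\"n\",\"type\":\"int\"}]}");
    File file = new File("sync-example.avro");

    long splitPoint;
    try (DataFileWriter<GenericRecord> writer =
        new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
      writer.create(schema, file);
      for (int i = 0; i < 5; i++) {
        GenericRecord r = new GenericData.Record(schema);
        r.put("n", i);
        writer.append(r);
      }
      // Force a block boundary and remember its position.
      splitPoint = writer.sync();
      for (int i = 5; i < 10; i++) {
        GenericRecord r = new GenericData.Record(schema);
        r.put("n", i);
        writer.append(r);
      }
    }

    // A reader can jump straight back to that block boundary.
    try (DataFileReader<GenericRecord> reader =
        new DataFileReader<>(file, new GenericDatumReader<GenericRecord>(schema))) {
      reader.seek(splitPoint);
      while (reader.hasNext()) {
        System.out.println(reader.next().get("n")); // prints 5..9
      }
    }
  }
}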
@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
  FlowFile incomingAvro = session.get();
  if (incomingAvro == null) {
    return;
  }

  String inputSchemaProperty =
      context.getProperty(INPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
  final Schema inputSchema;
  try {
    inputSchema = getSchema(inputSchemaProperty, DefaultConfiguration.get());
  } catch (SchemaNotFoundException e) {
    getLogger().error("Cannot find schema: " + inputSchemaProperty);
    session.transfer(incomingAvro, FAILURE);
    return;
  }

  String outputSchemaProperty =
      context.getProperty(OUTPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
  final Schema outputSchema;
  try {
    outputSchema = getSchema(outputSchemaProperty, DefaultConfiguration.get());
  } catch (SchemaNotFoundException e) {
    getLogger().error("Cannot find schema: " + outputSchemaProperty);
    session.transfer(incomingAvro, FAILURE);
    return;
  }

  final Map<String, String> fieldMapping = new HashMap<>();
  for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) {
    if (entry.getKey().isDynamic()) {
      fieldMapping.put(entry.getKey().getName(), entry.getValue());
    }
  }
  final AvroRecordConverter converter =
      new AvroRecordConverter(inputSchema, outputSchema, fieldMapping);

  final DataFileWriter<Record> writer =
      new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
  writer.setCodec(CodecFactory.snappyCodec());

  final DataFileWriter<Record> failureWriter =
      new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
  failureWriter.setCodec(CodecFactory.snappyCodec());

  try {
    final LongHolder written = new LongHolder(0L);
    final FailureTracker failures = new FailureTracker();

    final List<Record> badRecords = Lists.newLinkedList();
    FlowFile incomingAvroCopy = session.clone(incomingAvro);
    FlowFile outgoingAvro =
        session.write(
            incomingAvro,
            new StreamCallback() {
              @Override
              public void process(InputStream in, OutputStream out) throws IOException {
                try (DataFileStream<Record> stream =
                    new DataFileStream<Record>(
                        in, new GenericDatumReader<Record>(converter.getInputSchema()))) {
                  try (DataFileWriter<Record> w = writer.create(outputSchema, out)) {
                    for (Record record : stream) {
                      try {
                        Record converted = converter.convert(record);
                        w.append(converted);
                        written.incrementAndGet();
                      } catch (AvroConversionException e) {
                        failures.add(e);
                        getLogger().error("Error converting data: " + e.getMessage());
                        badRecords.add(record);
                      }
                    }
                  }
                }
              }
            });

    FlowFile badOutput =
        session.write(
            incomingAvroCopy,
            new StreamCallback() {
              @Override
              public void process(InputStream in, OutputStream out) throws IOException {
                try (DataFileWriter<Record> w = failureWriter.create(inputSchema, out)) {
                  for (Record record : badRecords) {
                    w.append(record);
                  }
                }
              }
            });

    long errors = failures.count();

    // Update only if the file transfer is successful.
    session.adjustCounter("Converted records", written.get(), false);
    // Update only if the file transfer is successful.
    session.adjustCounter("Conversion errors", errors, false);

    if (written.get() > 0L) {
      session.transfer(outgoingAvro, SUCCESS);
    } else {
      session.remove(outgoingAvro);
      if (errors == 0L) {
        badOutput = session.putAttribute(badOutput, "errors", "No incoming records");
        session.transfer(badOutput, FAILURE);
      }
    }

    if (errors > 0L) {
      getLogger()
          .warn(
              "Failed to convert {}/{} records between Avro Schemas",
              new Object[] {errors, errors + written.get()});
      badOutput = session.putAttribute(badOutput, "errors", failures.summary());
      session.transfer(badOutput, FAILURE);
    } else {
      session.remove(badOutput);
    }
  } catch (ProcessException | DatasetIOException e) {
    getLogger().error("Failed reading or writing", e);
    session.transfer(incomingAvro, FAILURE);
  } catch (DatasetException e) {
    getLogger().error("Failed to read FlowFile", e);
    session.transfer(incomingAvro, FAILURE);
  }
}
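On the reading side, the codec chosen above (snappy) is recorded in the Avro container header, so a consumer of the converted flowfile content does not need to configure it. A minimal sketch follows; countRecords is a hypothetical helper, not part of the processor.

import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

import java.io.IOException;
import java.io.InputStream;

public class ReadConvertedOutput {
  // Counts records in an Avro container stream such as the flowfile content
  // produced above. The codec is read from the container header, so no
  // snappy-specific configuration is needed here (snappy-java must still be
  // on the classpath at runtime).
  public static long countRecords(InputStream in) throws IOException {
    try (DataFileStream<GenericRecord> stream =
        new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
      long n = 0;
      while (stream.hasNext()) {
        stream.next();
        n++;
      }
      return n;
    }
  }
}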