@Test
public void test() throws IOException {
  File file = new File("target/AvroDocument.avro");
  Schema schema = AvroDocument._SCHEMA;
  {
    System.out.println("Writing to: " + file.getAbsolutePath());
    DatumWriter<AvroDocument> datumWriter = new SpecificDatumWriter<AvroDocument>(AvroDocument.class);
    FileOutputStream outputStream = new FileOutputStream(file);
    DataFileWriter<AvroDocument> dfw = new DataFileWriter<AvroDocument>(datumWriter);
    dfw.create(schema, outputStream);
    AvroDocument d = createTestDocument();
    dfw.append(d);
    dfw.flush();
    dfw.close();
  }
  {
    System.out.println("Reading from: " + file.getAbsolutePath());
    DatumReader<AvroDocument> datumReader = new SpecificDatumReader<AvroDocument>(AvroDocument.class);
    SeekableInput seekableInput = new SeekableFileInput(file);
    DataFileReader<AvroDocument> dfr = new DataFileReader<AvroDocument>(seekableInput, datumReader);
    AvroDocument d = new AvroDocument();
    dfr.next(d);
    AvroDocumentReader.dumpAvroDocument(d, System.out);
    dfr.close();
  }
}
private byte[] createAvroData(String name, int age, List<String> emails) throws IOException {
  String AVRO_SCHEMA =
      "{\n"
          + "\"type\": \"record\",\n"
          + "\"name\": \"Employee\",\n"
          + "\"fields\": [\n"
          + "    {\"name\": \"name\", \"type\": \"string\"},\n"
          + "    {\"name\": \"age\", \"type\": \"int\"},\n"
          + "    {\"name\": \"emails\", \"type\": {\"type\": \"array\", \"items\": \"string\"}},\n"
          + "    {\"name\": \"boss\", \"type\": [\"Employee\",\"null\"]}\n"
          + "]}";
  Schema schema = new Schema.Parser().parse(AVRO_SCHEMA);
  ByteArrayOutputStream out = new ByteArrayOutputStream();

  GenericRecord e1 = new GenericData.Record(schema);
  e1.put("name", name);
  e1.put("age", age);
  e1.put("emails", emails);
  e1.put("boss", null);

  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
  DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter);
  dataFileWriter.create(schema, out);
  dataFileWriter.append(e1);
  dataFileWriter.close();
  return out.toByteArray();
}
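For reference, a minimal read-back sketch for the byte array produced above; it uses only the standard Avro classes DataFileStream and GenericDatumReader, and the method and parameter names are illustrative, not from the original code.

// Minimal sketch: read the container bytes returned by createAvroData back as GenericRecords.
// Assumes org.apache.avro.file.DataFileStream and org.apache.avro.generic.* are available.
private void dumpAvroBytes(byte[] avroBytes) throws IOException {
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  try (DataFileStream<GenericRecord> stream =
      new DataFileStream<>(new ByteArrayInputStream(avroBytes), datumReader)) {
    // The writer schema travels in the container header, so no schema has to be supplied here.
    for (GenericRecord record : stream) {
      System.out.println(record);
    }
  }
}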
@Override
public void open(int taskNumber, int numTasks) throws IOException {
  super.open(taskNumber, numTasks);
  DatumWriter<E> datumWriter;
  Schema schema;
  if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
    datumWriter = new SpecificDatumWriter<E>(avroValueType);
    try {
      schema = ((org.apache.avro.specific.SpecificRecordBase) avroValueType.newInstance()).getSchema();
    } catch (InstantiationException e) {
      throw new RuntimeException(e.getMessage());
    } catch (IllegalAccessException e) {
      throw new RuntimeException(e.getMessage());
    }
  } else {
    datumWriter = new ReflectDatumWriter<E>(avroValueType);
    schema = ReflectData.get().getSchema(avroValueType);
  }
  dataFileWriter = new DataFileWriter<E>(datumWriter);
  if (userDefinedSchema == null) {
    dataFileWriter.create(schema, stream);
  } else {
    dataFileWriter.create(userDefinedSchema, stream);
  }
}
/**
 * Builds the target file path as <datasource directory>/<month>.avro. If the target file already
 * exists, it is opened for appending; otherwise it is created.
 */
private DataFileWriter<GenericRecord> openHDFSFile(DatasourceMonth datasourceMonth) throws IOException {
  DataFileWriter<GenericRecord> writer =
      new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(AVRO_SCHEMA));
  writer.setSyncInterval(FILEWRITER_SYNC_INTERVAL);
  // writer.setCodec(CodecFactory.snappyCodec()); // omit for now

  Path targetPath = buildTargetPath(datasourceMonth);
  // just for logging
  String fullTargetPath = this.hdfs.getWorkingDirectory() + "/" + targetPath;

  // Append to an existing file, or create a new file otherwise
  if (this.hdfs.exists(targetPath)) {
    // appending to an existing file
    // based on
    // http://technicaltidbit.blogspot.com.es/2013/02/avro-can-append-in-hdfs-after-all.html
    if (debugMode) {
      this.hdfs.setReplication(targetPath, (short) 1);
    }
    LOGGER.info("Appending to existing file {}", fullTargetPath);
    OutputStream outputStream = this.hdfs.append(targetPath);
    writer.appendTo(new FsInput(targetPath, this.hadoopConf), outputStream);
  } else {
    // creating a new file
    LOGGER.info(
        "Creating new file {} for datasource {} and month {}",
        fullTargetPath,
        datasourceMonth.datasource(),
        datasourceMonth.month());
    OutputStream outputStream = this.hdfs.create(targetPath);
    writer.create(AVRO_SCHEMA, outputStream);
  }
  return writer;
}
private static <T> File createFile(File file, Schema schema, T... records) throws IOException {
  DatumWriter<T> datumWriter = new GenericDatumWriter<T>(schema);
  DataFileWriter<T> fileWriter = new DataFileWriter<T>(datumWriter);
  fileWriter.create(schema, file);
  for (T record : records) {
    fileWriter.append(record);
  }
  fileWriter.close();
  return file;
}
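A possible caller of the createFile helper above, assuming a trivial key/value record schema; the schema, field names, and output path are illustrative, not taken from the original test.

// Hypothetical usage of createFile(...): the "KV" schema and target path are made up for illustration.
private static File writeSampleFile() throws IOException {
  Schema schema =
      new Schema.Parser()
          .parse(
              "{\"type\":\"record\",\"name\":\"KV\",\"fields\":["
                  + "{\"name\":\"key\",\"type\":\"string\"},{\"name\":\"value\",\"type\":\"int\"}]}");
  GenericRecord r1 = new GenericData.Record(schema);
  r1.put("key", "a");
  r1.put("value", 1);
  GenericRecord r2 = new GenericData.Record(schema);
  r2.put("key", "b");
  r2.put("value", 2);
  return createFile(new File("target/sample-kv.avro"), schema, r1, r2);
}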
public void execute(TridentTuple tuple, TridentCollector collector) {
  GenericRecord docEntry = new GenericData.Record(schema);
  docEntry.put("docid", tuple.getStringByField("documentId"));
  docEntry.put("time", Time.currentTimeMillis());
  docEntry.put("line", tuple.getStringByField("document"));
  try {
    dataFileWriter.append(docEntry);
    dataFileWriter.flush();
  } catch (IOException e) {
    LOG.error("Error writing to document record: " + e);
    throw new RuntimeException(e);
  }
}
@Override
public void prepare(Map conf, TridentOperationContext context) {
  try {
    String path = (String) conf.get("DOCUMENT_PATH");
    schema =
        new Schema.Parser().parse(PersistDocumentFunction.class.getResourceAsStream("/document.avsc"));
    File file = new File(path);
    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
    dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);
    if (file.exists()) {
      dataFileWriter.appendTo(file);
    } else {
      dataFileWriter.create(schema, file);
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
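The /document.avsc resource referenced above is not shown in the original; a plausible schema consistent with the fields written in execute() (docid, time, line) could look like the following, where the concrete field types are an assumption inferred from the calling code.

{
  "type": "record",
  "name": "Document",
  "fields": [
    {"name": "docid", "type": "string"},
    {"name": "time", "type": "long"},
    {"name": "line", "type": "string"}
  ]
}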
@Override
public void execute(Tuple inputTuple) {
  /* Processing tuples of the shape (DATASOURCE_ID, TIMESTAMP_FIELD, CONTENT_FIELD) */

  // get datasource
  String datasource = inputTuple.getStringByField(RestIngestionSpout.DATASOURCE_ID);
  // compute month
  long timestamp = inputTuple.getLongByField(RestIngestionSpout.TIMESTAMP_FIELD);
  // this computation is completely stateless
  String month = timestampToMonth(timestamp);
  // now get the DataFileWriter
  DataFileWriter<GenericRecord> writer = null;
  try {
    writer = this.writersCache.get(DatasourceMonth.create(datasource, month));
  } catch (ExecutionException ee) {
    LOGGER.error(
        "Error getting DataFileWriter for tuple for datasource "
            + datasource
            + " and timestamp "
            + timestamp
            + " : "
            + ee.getMessage());
    this.collector.fail(inputTuple);
    return;
  }
  // create and write a new record
  GenericRecord newDataRecord = new GenericData.Record(AVRO_SCHEMA);
  newDataRecord.put(AVRO_TIMESTAMP_FIELD, timestamp);
  newDataRecord.put(AVRO_CONTENT_FIELD, inputTuple.getStringByField(RestIngestionSpout.CONTENT_FIELD));
  try {
    writer.append(newDataRecord);
  } catch (IOException ioe) {
    LOGGER.error(
        "Error writing Avro record for datasource "
            + datasource
            + " and timestamp "
            + timestamp
            + " : "
            + ioe.getMessage());
    this.collector.fail(inputTuple);
    return;
  }
  // ACK this tuple as successfully processed
  this.collector.ack(inputTuple);
}
private void populateGenericFile(List<GenericRecord> genericRecords) throws IOException {
  FileOutputStream outputStream = new FileOutputStream(this.avroFile);
  GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<GenericRecord>(schema);
  DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(genericDatumWriter);
  dataFileWriter.create(schema, outputStream);
  for (GenericRecord record : genericRecords) {
    dataFileWriter.append(record);
  }
  dataFileWriter.close();
  outputStream.close();
}
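As a counterpart to the writer helper above, a minimal sketch that reads this.avroFile back as GenericRecords; it uses DataFileReader over a File (rather than the DataFileStream shown earlier) and lets the reader pick up the writer schema from the file header. The method name is illustrative.

// Minimal read-back sketch for the file written by populateGenericFile(...).
private List<GenericRecord> readGenericFile() throws IOException {
  List<GenericRecord> records = new ArrayList<>();
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  try (DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(this.avroFile, datumReader)) {
    for (GenericRecord record : dataFileReader) {
      records.add(record);
    }
  }
  return records;
}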
public void testWrite() throws IOException {
  URL url = this.getClass().getClassLoader().getResource("input/Company.avsc");
  assertNotNull(url);
  Schema schema = new Schema.Parser().parse(new File(url.getFile()));
  assertNotNull(schema);
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
  // Another way of loading a file
  File file = new File("src/test/resources/input/companies.avro");
  DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(file, datumReader);

  File fileOut = new File("target/companies2.avro");
  Schema schemaOut = new Schema.Parser().parse(new File("src/test/resources/input/Company2.avsc"));
  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schemaOut);
  DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);

  GenericRecord company = null;
  int count = 0;
  while (dataFileReader.hasNext()) {
    company = dataFileReader.next(company);
    if (company.get("name").toString().equals("aol")) {
      dataFileWriter.create(schemaOut, fileOut);
      GenericRecord recordOut = new GenericData.Record(schemaOut);
      recordOut.put("id", company.get("id"));
      recordOut.put("name", company.get("name"));
      assertTrue(recordOut.getSchema().getField("address") != null);
      assertTrue(recordOut.getSchema().getField("employeeCount") == null);
      // address is of complex type
      GenericRecord address =
          new GenericData.Record((GenericData.Record) company.get("address"), true);
      recordOut.put("address", address);
      dataFileWriter.append(recordOut);
      count++;
    }
  }
  assertTrue(count > 0);
  dataFileWriter.close();
  dataFileReader.close();
}
@Override
public void cleanup() {
  try {
    dataFileWriter.close();
  } catch (IOException e) {
    LOG.error("Error Closing file: " + e);
  }
}
public static void main(String[] args) throws IOException {
  User user1 = new User();
  user1.setName("Alyssa");
  user1.setFavoriteNumber(256);
  // Leave favorite color null

  // Alternate constructor
  User user2 = new User("Ben", 7, "red");

  // Construct via builder
  User user3 =
      User.newBuilder()
          .setName("Charlie")
          .setFavoriteColor("blue")
          .setFavoriteNumber(null)
          .build();

  // Serialize user1, user2 and user3 to disk
  File file = new File("users.avro");
  DatumWriter<User> userDatumWriter = new SpecificDatumWriter<User>(User.class);
  DataFileWriter<User> dataFileWriter = new DataFileWriter<User>(userDatumWriter);
  dataFileWriter.create(user1.getSchema(), file);
  dataFileWriter.append(user1);
  dataFileWriter.append(user2);
  dataFileWriter.append(user3);
  dataFileWriter.close();

  // Deserialize Users from disk
  DatumReader<User> userDatumReader = new SpecificDatumReader<User>(User.class);
  DataFileReader<User> dataFileReader = new DataFileReader<User>(file, userDatumReader);
  try {
    User user = null;
    while (dataFileReader.hasNext()) {
      // Reuse user object by passing it to next(). This saves us from
      // allocating and garbage collecting many objects for files with
      // many items.
      user = dataFileReader.next(user);
      System.out.println(user);
    }
  } finally {
    dataFileReader.close();
  }
}
public static void main(String[] args) throws IOException {
  DatumWriter<ArchivePlace> datumWriter = new SpecificDatumWriter<ArchivePlace>(ArchivePlace.class);
  DataFileWriter<ArchivePlace> falloutDatafileWriter = new DataFileWriter<ArchivePlace>(datumWriter);
  FileOutputStream falloutOutputStream =
      new FileOutputStream("src/test/resources/archive-places/input.avro", true);
  falloutDatafileWriter.create(ArchivePlace.SCHEMA$, falloutOutputStream);

  List<ArchivePlace> places =
      SerializationUtil.loadFromJsons(
          ArchivePlace.SCHEMA$, "src/test/resources/archive-places/input.json");
  for (ArchivePlace place : places) {
    falloutDatafileWriter.append(place);
    falloutDatafileWriter.flush();
  }
  falloutDatafileWriter.close();
  falloutOutputStream.close();
}
/** Writes an avro file of generic records with a 'key', 'blah', and 'value' field. */
private Path writeGenericRecordAvroFile() throws IOException {
  // Open a writer.
  final File file = new File(getLocalTempDir(), "generic-kv.avro");
  final Schema writerSchema = Schema.createRecord("record", null, null, false);
  writerSchema.setFields(
      Lists.newArrayList(
          new Schema.Field("key", Schema.create(Schema.Type.INT), null, null),
          new Schema.Field("blah", Schema.create(Schema.Type.STRING), null, null),
          new Schema.Field("value", Schema.create(Schema.Type.STRING), null, null)));

  final DataFileWriter<GenericRecord> fileWriter =
      new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(writerSchema))
          .create(writerSchema, file);
  try {
    // Write a record.
    GenericData.Record record = new GenericData.Record(writerSchema);
    record.put("key", 1);
    record.put("blah", "blah");
    record.put("value", "one");
    fileWriter.append(record);

    // Write another record.
    record = new GenericData.Record(writerSchema);
    record.put("key", 2);
    record.put("blah", "blah");
    record.put("value", "two");
    fileWriter.append(record);

    // Write a duplicate record with the same key field value.
    record = new GenericData.Record(writerSchema);
    record.put("key", 2);
    record.put("blah", "blah");
    record.put("value", "deux");
    fileWriter.append(record);

    // Close it and return the path.
  } finally {
    fileWriter.close();
  }
  return new Path(file.getPath());
}
private void writeContainer(Record src, OutputStream dst) {
  DataFileWriter dataFileWriter = null;
  try {
    try {
      Schema schema = null;
      for (Object attachment : src.get(Fields.ATTACHMENT_BODY)) {
        Preconditions.checkNotNull(attachment);
        GenericContainer datum = (GenericContainer) attachment;
        schema = getSchema(datum, schema);
        assert schema != null;
        if (dataFileWriter == null) {
          // init
          GenericDatumWriter datumWriter = new GenericDatumWriter(schema);
          dataFileWriter = new DataFileWriter(datumWriter);
          if (codecFactory != null) {
            dataFileWriter.setCodec(codecFactory);
          }
          for (Map.Entry<String, String> entry : metadata.entrySet()) {
            dataFileWriter.setMeta(entry.getKey(), entry.getValue());
          }
          dataFileWriter.create(schema, dst);
        }
        dataFileWriter.append(datum);
      }
      if (dataFileWriter != null) {
        dataFileWriter.flush();
      }
    } catch (IOException e) {
      throw new MorphlineRuntimeException(e);
    }
  } finally {
    Closeables.closeQuietly(dataFileWriter);
  }
}
private List<Map.Entry> previewAvroBatch(FileStatus fileStatus, int batchSize)
    throws IOException, InterruptedException {
  SeekableInput input = new FsInput(fileStatus.getPath(), hadoopConf);
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
  List<Map.Entry> batch = new ArrayList<>();
  int count = 0;
  while (fileReader.hasNext() && batch.size() < batchSize) {
    GenericRecord datum = fileReader.next();
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DataFileWriter<GenericRecord> dataFileWriter =
        new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(datum.getSchema()));
    dataFileWriter.create(datum.getSchema(), out);
    dataFileWriter.append(datum);
    dataFileWriter.close();
    out.close();
    batch.add(new Pair(fileStatus.getPath().toUri().getPath() + "::" + count, out.toByteArray()));
    count++;
  }
  fileReader.close();
  return batch;
}
public void serializeSpecific() throws IOException {
  // Create a datum to serialize.
  MyPair datum = new MyPair();
  datum.left = new Utf8("dog");
  datum.right = new Utf8("cat");

  File tmpFile = File.createTempFile("myPairAvroExample", ".avro");

  // Serialize it.
  DataFileWriter<MyPair> writer =
      new DataFileWriter<MyPair>(new SpecificDatumWriter<MyPair>(MyPair.class));
  writer.create(MyPair.SCHEMA$, tmpFile);
  writer.append(datum);
  writer.close();
  System.out.println("Serialization: " + tmpFile);

  // Deserialize it.
  FileReader<MyPair> reader =
      DataFileReader.openReader(tmpFile, new SpecificDatumReader<MyPair>(MyPair.class));
  while (reader.hasNext()) {
    MyPair result = reader.next();
    System.out.printf("Left: %s, Right: %s\n", result.left, result.right);
  }
  reader.close();
}
/**
 * Assures that currentWriter is populated and refers to the correct data file. This may
 * roll over the existing data file. Also assures that writing one more span will not violate
 * limits on span storage.
 *
 * @throws IOException
 */
private void assureCurrentWriter() throws IOException {
  boolean createNewFile = false;

  // Would one more span overshoot the policy? If so, drop the oldest files until it would not.
  while (this.spansSoFar >= maxSpans) {
    File oldest = null;
    // If spansSoFar is positive, there must be at least one file
    synchronized (this.files) {
      oldest = this.files.remove(this.files.firstKey());
    }
    this.spansSoFar -= spansPerFile.get(oldest);
    spansPerFile.remove(oldest);
    oldest.delete();
  }
  if (files.size() == 0) {
    // In the corner case where we have just removed the current file,
    // clear the current variables.
    currentTimestamp = (long) 0;
    currentWriter = null;
  }
  long rightNow = System.currentTimeMillis() / 1000L;
  // Which file should we be in?
  long cutOff = floorSecond(rightNow);
  if (currentWriter == null) {
    createNewFile = true;
  } else if (cutOff >= (currentTimestamp + secondsPerFile)) {
    // Roll over to a new file.
    currentWriter.close();
    createNewFile = true;
  }
  if (createNewFile) {
    File newFile =
        new File(traceFileDir + "/" + Thread.currentThread().getId() + "_" + cutOff + FILE_SUFFIX);
    synchronized (this.files) {
      this.files.put(cutOff, newFile);
    }
    this.spansPerFile.put(newFile, (long) 0);
    this.currentWriter = new DataFileWriter<Span>(SPAN_WRITER);
    this.currentWriter.setCodec(CodecFactory.deflateCodec(compressionLevel));
    this.currentWriter.create(Span.SCHEMA$, newFile);
    this.currentTimestamp = cutOff;
  }
}
/**
 * Generates an input Avro file containing the given records in the temporary directory and
 * returns the full path of the file.
 */
private <T> String generateTestFile(
    String filename,
    List<T> elems,
    SyncBehavior syncBehavior,
    int syncInterval,
    AvroCoder<T> coder,
    String codec)
    throws IOException {
  Random random = new Random(0);
  File tmpFile = tmpFolder.newFile(filename);
  String path = tmpFile.toString();

  FileOutputStream os = new FileOutputStream(tmpFile);
  DatumWriter<T> datumWriter = coder.createDatumWriter();
  try (DataFileWriter<T> writer = new DataFileWriter<>(datumWriter)) {
    writer.setCodec(CodecFactory.fromString(codec));
    writer.create(coder.getSchema(), os);
    int recordIndex = 0;
    int syncIndex = syncBehavior == SyncBehavior.SYNC_RANDOM ? random.nextInt(syncInterval) : 0;

    for (T elem : elems) {
      writer.append(elem);
      recordIndex++;

      switch (syncBehavior) {
        case SYNC_REGULAR:
          if (recordIndex == syncInterval) {
            recordIndex = 0;
            writer.sync();
          }
          break;
        case SYNC_RANDOM:
          if (recordIndex == syncIndex) {
            recordIndex = 0;
            writer.sync();
            syncIndex = random.nextInt(syncInterval);
          }
          break;
        case SYNC_DEFAULT:
        default:
      }
    }
  }
  return path;
}
@SuppressWarnings("deprecation") // uses internal test functionality. @Override protected void prepareWrite(WritableByteChannel channel) throws Exception { dataFileWriter = new DataFileWriter<>(coder.createDatumWriter()); dataFileWriter.create(coder.getSchema(), Channels.newOutputStream(channel)); }
@Override
public void write(T value) throws Exception {
  dataFileWriter.append(value);
}
@Override
public void writeRecord(E record) throws IOException {
  dataFileWriter.append(record);
}
@Override
public void close() throws IOException {
  dataFileWriter.flush();
  dataFileWriter.close();
  super.close();
}
@Override
protected void writeFooter() throws Exception {
  dataFileWriter.flush();
}
@Override
public void onTrigger(ProcessContext context, final ProcessSession session) throws ProcessException {
  FlowFile incomingAvro = session.get();
  if (incomingAvro == null) {
    return;
  }

  String inputSchemaProperty =
      context.getProperty(INPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
  final Schema inputSchema;
  try {
    inputSchema = getSchema(inputSchemaProperty, DefaultConfiguration.get());
  } catch (SchemaNotFoundException e) {
    getLogger().error("Cannot find schema: " + inputSchemaProperty);
    session.transfer(incomingAvro, FAILURE);
    return;
  }
  String outputSchemaProperty =
      context.getProperty(OUTPUT_SCHEMA).evaluateAttributeExpressions(incomingAvro).getValue();
  final Schema outputSchema;
  try {
    outputSchema = getSchema(outputSchemaProperty, DefaultConfiguration.get());
  } catch (SchemaNotFoundException e) {
    getLogger().error("Cannot find schema: " + outputSchemaProperty);
    session.transfer(incomingAvro, FAILURE);
    return;
  }

  final Map<String, String> fieldMapping = new HashMap<>();
  for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) {
    if (entry.getKey().isDynamic()) {
      fieldMapping.put(entry.getKey().getName(), entry.getValue());
    }
  }
  final AvroRecordConverter converter =
      new AvroRecordConverter(inputSchema, outputSchema, fieldMapping);

  final DataFileWriter<Record> writer =
      new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
  writer.setCodec(CodecFactory.snappyCodec());

  final DataFileWriter<Record> failureWriter =
      new DataFileWriter<>(AvroUtil.newDatumWriter(outputSchema, Record.class));
  failureWriter.setCodec(CodecFactory.snappyCodec());

  try {
    final LongHolder written = new LongHolder(0L);
    final FailureTracker failures = new FailureTracker();

    final List<Record> badRecords = Lists.newLinkedList();
    FlowFile incomingAvroCopy = session.clone(incomingAvro);
    FlowFile outgoingAvro =
        session.write(
            incomingAvro,
            new StreamCallback() {
              @Override
              public void process(InputStream in, OutputStream out) throws IOException {
                try (DataFileStream<Record> stream =
                    new DataFileStream<Record>(
                        in, new GenericDatumReader<Record>(converter.getInputSchema()))) {
                  try (DataFileWriter<Record> w = writer.create(outputSchema, out)) {
                    for (Record record : stream) {
                      try {
                        Record converted = converter.convert(record);
                        w.append(converted);
                        written.incrementAndGet();
                      } catch (AvroConversionException e) {
                        failures.add(e);
                        getLogger().error("Error converting data: " + e.getMessage());
                        badRecords.add(record);
                      }
                    }
                  }
                }
              }
            });

    FlowFile badOutput =
        session.write(
            incomingAvroCopy,
            new StreamCallback() {
              @Override
              public void process(InputStream in, OutputStream out) throws IOException {
                try (DataFileWriter<Record> w = failureWriter.create(inputSchema, out)) {
                  for (Record record : badRecords) {
                    w.append(record);
                  }
                }
              }
            });

    long errors = failures.count();

    // update only if file transfer is successful
    session.adjustCounter("Converted records", written.get(), false);
    // update only if file transfer is successful
    session.adjustCounter("Conversion errors", errors, false);

    if (written.get() > 0L) {
      session.transfer(outgoingAvro, SUCCESS);
    } else {
      session.remove(outgoingAvro);
      if (errors == 0L) {
        badOutput = session.putAttribute(badOutput, "errors", "No incoming records");
        session.transfer(badOutput, FAILURE);
      }
    }

    if (errors > 0L) {
      getLogger()
          .warn(
              "Failed to convert {}/{} records between Avro Schemas",
              new Object[] {errors, errors + written.get()});
      badOutput = session.putAttribute(badOutput, "errors", failures.summary());
      session.transfer(badOutput, FAILURE);
    } else {
      session.remove(badOutput);
    }
  } catch (ProcessException | DatasetIOException e) {
    getLogger().error("Failed reading or writing", e);
    session.transfer(incomingAvro, FAILURE);
  } catch (DatasetException e) {
    getLogger().error("Failed to read FlowFile", e);
    session.transfer(incomingAvro, FAILURE);
  }
}