private byte[] createAvroData(String name, int age, List<String> emails) throws IOException {
  String AVRO_SCHEMA = "{\n"
      + "\"type\": \"record\",\n"
      + "\"name\": \"Employee\",\n"
      + "\"fields\": [\n"
      + " {\"name\": \"name\", \"type\": \"string\"},\n"
      + " {\"name\": \"age\", \"type\": \"int\"},\n"
      + " {\"name\": \"emails\", \"type\": {\"type\": \"array\", \"items\": \"string\"}},\n"
      + " {\"name\": \"boss\", \"type\": [\"Employee\",\"null\"]}\n"
      + "]}";
  Schema schema = new Schema.Parser().parse(AVRO_SCHEMA);
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  GenericRecord e1 = new GenericData.Record(schema);
  e1.put("name", name);
  e1.put("age", age);
  e1.put("emails", emails);
  e1.put("boss", null);
  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
  DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter);
  dataFileWriter.create(schema, out);
  dataFileWriter.append(e1);
  dataFileWriter.close();
  return out.toByteArray();
}
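// Companion sketch (not from the original source): reading back the byte[] produced by
// createAvroData above, assuming the writer schema embedded in the container bytes is used.
// SeekableByteArrayInput, DataFileReader, and GenericDatumReader are standard Avro APIs;
// the method name readAvroData is hypothetical.
private void readAvroData(byte[] bytes) throws IOException {
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  try (DataFileReader<GenericRecord> reader =
      new DataFileReader<>(new SeekableByteArrayInput(bytes), datumReader)) {
    while (reader.hasNext()) {
      GenericRecord employee = reader.next();
      System.out.println(employee.get("name") + " is " + employee.get("age"));
    }
  }
}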
private void writeContainer(Record src, OutputStream dst) {
  DataFileWriter dataFileWriter = null;
  try {
    try {
      Schema schema = null;
      for (Object attachment : src.get(Fields.ATTACHMENT_BODY)) {
        Preconditions.checkNotNull(attachment);
        GenericContainer datum = (GenericContainer) attachment;
        schema = getSchema(datum, schema);
        assert schema != null;
        if (dataFileWriter == null) { // init
          GenericDatumWriter datumWriter = new GenericDatumWriter(schema);
          dataFileWriter = new DataFileWriter(datumWriter);
          if (codecFactory != null) {
            dataFileWriter.setCodec(codecFactory);
          }
          for (Map.Entry<String, String> entry : metadata.entrySet()) {
            dataFileWriter.setMeta(entry.getKey(), entry.getValue());
          }
          dataFileWriter.create(schema, dst);
        }
        dataFileWriter.append(datum);
      }
      if (dataFileWriter != null) {
        dataFileWriter.flush();
      }
    } catch (IOException e) {
      throw new MorphlineRuntimeException(e);
    }
  } finally {
    Closeables.closeQuietly(dataFileWriter);
  }
}
@Test
public void test() throws IOException {
  File file = new File("target/AvroDocument.avro");
  Schema schema = AvroDocument._SCHEMA;
  {
    System.out.println("Writing to: " + file.getAbsolutePath());
    DatumWriter<Object> datumWriter = new SpecificDatumWriter(AvroDocument.class);
    FileOutputStream outputStream = new FileOutputStream(file);
    // Construct the writer around the DatumWriter, then open the container file via create().
    DataFileWriter<Object> dfw = new DataFileWriter<Object>(datumWriter);
    dfw.create(schema, outputStream);
    AvroDocument d = createTestDocument();
    dfw.append(d);
    dfw.flush();
    dfw.close();
  }
  {
    System.out.println("Reading from: " + file.getAbsolutePath());
    DatumReader<Object> datumReader = new SpecificDatumReader(AvroDocument.class);
    SeekableInput seekableInput = new SeekableFileInput(file);
    DataFileReader<Object> dfr = new DataFileReader<Object>(seekableInput, datumReader);
    AvroDocument d = new AvroDocument();
    dfr.next(d);
    AvroDocumentReader.dumpAvroDocument(d, System.out);
  }
}
private static <T> File createFile(File file, Schema schema, T... records) throws IOException {
  DatumWriter<T> datumWriter = new GenericDatumWriter<T>(schema);
  DataFileWriter<T> fileWriter = new DataFileWriter<T>(datumWriter);
  fileWriter.create(schema, file);
  for (T record : records) {
    fileWriter.append(record);
  }
  fileWriter.close();
  return file;
}
public static void main(String[] args) throws IOException {
  User user1 = new User();
  user1.setName("Alyssa");
  user1.setFavoriteNumber(256);
  // Leave favorite color null

  // Alternate constructor
  User user2 = new User("Ben", 7, "red");

  // Construct via builder
  User user3 = User.newBuilder()
      .setName("Charlie")
      .setFavoriteColor("blue")
      .setFavoriteNumber(null)
      .build();

  // Serialize user1, user2 and user3 to disk
  File file = new File("users.avro");
  DatumWriter<User> userDatumWriter = new SpecificDatumWriter<User>(User.class);
  DataFileWriter<User> dataFileWriter = new DataFileWriter<User>(userDatumWriter);
  dataFileWriter.create(user1.getSchema(), file);
  dataFileWriter.append(user1);
  dataFileWriter.append(user2);
  dataFileWriter.append(user3);
  dataFileWriter.close();

  // Deserialize Users from disk
  DatumReader<User> userDatumReader = new SpecificDatumReader<User>(User.class);
  DataFileReader<User> dataFileReader = new DataFileReader<User>(file, userDatumReader);
  try {
    User user = null;
    while (dataFileReader.hasNext()) {
      // Reuse user object by passing it to next(). This saves us from
      // allocating and garbage collecting many objects for files with
      // many items.
      user = dataFileReader.next(user);
      System.out.println(user);
    }
  } finally {
    dataFileReader.close();
  }
}
/** Writes an avro file of generic records with a 'key', 'blah', and 'value' field. */
private Path writeGenericRecordAvroFile() throws IOException {
  // Open a writer.
  final File file = new File(getLocalTempDir(), "generic-kv.avro");
  final Schema writerSchema = Schema.createRecord("record", null, null, false);
  writerSchema.setFields(Lists.newArrayList(
      new Schema.Field("key", Schema.create(Schema.Type.INT), null, null),
      new Schema.Field("blah", Schema.create(Schema.Type.STRING), null, null),
      new Schema.Field("value", Schema.create(Schema.Type.STRING), null, null)));
  final DataFileWriter<GenericRecord> fileWriter =
      new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(writerSchema))
          .create(writerSchema, file);
  try {
    // Write a record.
    GenericData.Record record = new GenericData.Record(writerSchema);
    record.put("key", 1);
    record.put("blah", "blah");
    record.put("value", "one");
    fileWriter.append(record);

    // Write another record.
    record = new GenericData.Record(writerSchema);
    record.put("key", 2);
    record.put("blah", "blah");
    record.put("value", "two");
    fileWriter.append(record);

    // Write a duplicate record with the same key field value.
    record = new GenericData.Record(writerSchema);
    record.put("key", 2);
    record.put("blah", "blah");
    record.put("value", "deux");
    fileWriter.append(record);

    // Close it and return the path.
  } finally {
    fileWriter.close();
  }
  return new Path(file.getPath());
}
public void execute(TridentTuple tuple, TridentCollector collector) {
  GenericRecord docEntry = new GenericData.Record(schema);
  docEntry.put("docid", tuple.getStringByField("documentId"));
  docEntry.put("time", Time.currentTimeMillis());
  docEntry.put("line", tuple.getStringByField("document"));
  try {
    dataFileWriter.append(docEntry);
    dataFileWriter.flush();
  } catch (IOException e) {
    LOG.error("Error writing to document record: " + e);
    throw new RuntimeException(e);
  }
}
@Override
public void execute(Tuple inputTuple) {
  /* Processing tuples of the shape (DATASOURCE_ID, TIMESTAMP_FIELD, CONTENT_FIELD) */

  // get datasource
  String datasource = inputTuple.getStringByField(RestIngestionSpout.DATASOURCE_ID);
  // compute month
  long timestamp = inputTuple.getLongByField(RestIngestionSpout.TIMESTAMP_FIELD);
  // this computation is completely stateless
  String month = timestampToMonth(timestamp);

  // now get the DataFileWriter
  DataFileWriter<GenericRecord> writer = null;
  try {
    writer = this.writersCache.get(DatasourceMonth.create(datasource, month));
  } catch (ExecutionException ee) {
    LOGGER.error("Error getting DataFileWriter for tuple for datasource " + datasource
        + " and timestamp " + timestamp + " : " + ee.getMessage());
    this.collector.fail(inputTuple);
    return;
  }

  // create and write a new record
  GenericRecord newDataRecord = new GenericData.Record(AVRO_SCHEMA);
  newDataRecord.put(AVRO_TIMESTAMP_FIELD, Long.valueOf(timestamp));
  newDataRecord.put(AVRO_CONTENT_FIELD, inputTuple.getStringByField(RestIngestionSpout.CONTENT_FIELD));
  try {
    writer.append(newDataRecord);
  } catch (IOException ioe) {
    LOGGER.error("Error writing Avro record for datasource " + datasource
        + " and timestamp " + timestamp + " : " + ioe.getMessage());
    this.collector.fail(inputTuple);
    return;
  }

  // ack processing for this tuple as ok
  this.collector.ack(inputTuple);
}
private void populateGenericFile(List<GenericRecord> genericRecords) throws IOException {
  FileOutputStream outputStream = new FileOutputStream(this.avroFile);
  GenericDatumWriter<GenericRecord> genericDatumWriter =
      new GenericDatumWriter<GenericRecord>(schema);
  DataFileWriter<GenericRecord> dataFileWriter =
      new DataFileWriter<GenericRecord>(genericDatumWriter);
  dataFileWriter.create(schema, outputStream);
  for (GenericRecord record : genericRecords) {
    dataFileWriter.append(record);
  }
  dataFileWriter.close();
  outputStream.close();
}
/**
 * Generates an input Avro file containing the given records in the temporary directory and
 * returns the full path of the file.
 */
private <T> String generateTestFile(String filename, List<T> elems, SyncBehavior syncBehavior,
    int syncInterval, AvroCoder<T> coder, String codec) throws IOException {
  Random random = new Random(0);
  File tmpFile = tmpFolder.newFile(filename);
  String path = tmpFile.toString();

  FileOutputStream os = new FileOutputStream(tmpFile);
  DatumWriter<T> datumWriter = coder.createDatumWriter();
  try (DataFileWriter<T> writer = new DataFileWriter<>(datumWriter)) {
    writer.setCodec(CodecFactory.fromString(codec));
    writer.create(coder.getSchema(), os);
    int recordIndex = 0;
    int syncIndex = syncBehavior == SyncBehavior.SYNC_RANDOM ? random.nextInt(syncInterval) : 0;

    for (T elem : elems) {
      writer.append(elem);
      recordIndex++;

      switch (syncBehavior) {
        case SYNC_REGULAR:
          if (recordIndex == syncInterval) {
            recordIndex = 0;
            writer.sync();
          }
          break;
        case SYNC_RANDOM:
          if (recordIndex == syncIndex) {
            recordIndex = 0;
            writer.sync();
            syncIndex = random.nextInt(syncInterval);
          }
          break;
        case SYNC_DEFAULT:
        default:
      }
    }
  }
  return path;
}
public void testWrite() throws IOException {
  URL url = this.getClass().getClassLoader().getResource("input/Company.avsc");
  assertNotNull(url);
  Schema schema = new Schema.Parser().parse(new File(url.getFile()));
  assertNotNull(schema);
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);

  // Another way of loading a file
  File file = new File("src/test/resources/input/companies.avro");
  DataFileReader<GenericRecord> dataFileReader =
      new DataFileReader<GenericRecord>(file, datumReader);

  File fileOut = new File("target/companies2.avro");
  Schema schemaOut = new Schema.Parser().parse(new File("src/test/resources/input/Company2.avsc"));
  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schemaOut);
  DataFileWriter<GenericRecord> dataFileWriter =
      new DataFileWriter<GenericRecord>(datumWriter);

  GenericRecord company = null;
  int count = 0;
  while (dataFileReader.hasNext()) {
    company = dataFileReader.next(company);
    if (company.get("name").toString().equals("aol")) {
      dataFileWriter.create(schemaOut, fileOut);
      GenericRecord recordOut = new GenericData.Record(schemaOut);
      recordOut.put("id", company.get("id"));
      recordOut.put("name", company.get("name"));
      assertTrue(recordOut.getSchema().getField("address") != null);
      assertTrue(recordOut.getSchema().getField("employeeCount") == null);
      // address is of complex type
      GenericRecord address =
          new GenericData.Record((GenericData.Record) company.get("address"), true);
      recordOut.put("address", address);
      dataFileWriter.append(recordOut);
      count++;
    }
  }
  assertTrue(count > 0);
  dataFileWriter.close();
}
public static void main(String[] args) throws IOException {
  DatumWriter<ArchivePlace> datumWriter =
      new SpecificDatumWriter<ArchivePlace>(ArchivePlace.class);
  DataFileWriter<ArchivePlace> falloutDatafileWriter =
      new DataFileWriter<ArchivePlace>(datumWriter);
  FileOutputStream falloutOutputStream =
      new FileOutputStream("src/test/resources/archive-places/input.avro", true);
  falloutDatafileWriter.create(ArchivePlace.SCHEMA$, falloutOutputStream);

  List<ArchivePlace> places = SerializationUtil.loadFromJsons(
      ArchivePlace.SCHEMA$, "src/test/resources/archive-places/input.json");
  for (ArchivePlace place : places) {
    falloutDatafileWriter.append(place);
    falloutDatafileWriter.flush();
  }
  falloutDatafileWriter.close();
  falloutOutputStream.close();
}
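// Note (not from the original source): calling create() on a FileOutputStream opened in
// append mode writes a second container header into a file that already has one. When the
// goal is to add records to an existing Avro container file, DataFileWriter.appendTo() is
// the standard Avro API for that. A minimal sketch, assuming the same generated ArchivePlace
// class and an existing, non-empty input.avro; the method name appendPlaces is hypothetical.
public static void appendPlaces(File existingAvroFile, List<ArchivePlace> places) throws IOException {
  DatumWriter<ArchivePlace> datumWriter = new SpecificDatumWriter<ArchivePlace>(ArchivePlace.class);
  DataFileWriter<ArchivePlace> writer = new DataFileWriter<ArchivePlace>(datumWriter);
  // appendTo() reuses the schema, codec, and sync marker already stored in the file header.
  writer.appendTo(existingAvroFile);
  for (ArchivePlace place : places) {
    writer.append(place);
  }
  writer.close();
}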
private List<Map.Entry> previewAvroBatch(FileStatus fileStatus, int batchSize)
    throws IOException, InterruptedException {
  SeekableInput input = new FsInput(fileStatus.getPath(), hadoopConf);
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
  List<Map.Entry> batch = new ArrayList<>();
  int count = 0;
  while (fileReader.hasNext() && batch.size() < batchSize) {
    GenericRecord datum = fileReader.next();
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(
        new GenericDatumWriter<GenericRecord>(datum.getSchema()));
    dataFileWriter.create(datum.getSchema(), out);
    dataFileWriter.append(datum);
    dataFileWriter.close();
    out.close();
    batch.add(new Pair(fileStatus.getPath().toUri().getPath() + "::" + count, out.toByteArray()));
    count++;
  }
  return batch;
}
public void serializeSpecific() throws IOException {
  // Create a datum to serialize.
  MyPair datum = new MyPair();
  datum.left = new Utf8("dog");
  datum.right = new Utf8("cat");

  File tmpFile = File.createTempFile("myPairAvroExample", ".avro");

  // Serialize it.
  DataFileWriter<MyPair> writer =
      new DataFileWriter<MyPair>(new SpecificDatumWriter<MyPair>(MyPair.class));
  writer.create(MyPair.SCHEMA$, tmpFile);
  writer.append(datum);
  writer.close();
  System.out.println("Serialization: " + tmpFile);

  // Deserialize it.
  FileReader<MyPair> reader =
      DataFileReader.openReader(tmpFile, new SpecificDatumReader<MyPair>(MyPair.class));
  while (reader.hasNext()) {
    MyPair result = reader.next();
    System.out.printf("Left: %s, Right: %s\n", result.left, result.right);
  }
  reader.close();
}
@Override
public void writeRecord(E record) throws IOException {
  dataFileWriter.append(record);
}
@Override
public void write(T value) throws Exception {
  dataFileWriter.append(value);
}
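// Context sketch (not from either original class): the two overrides above only append, so the
// enclosing writer is expected to open the DataFileWriter once and close it when finished. A
// minimal, hypothetical wrapper illustrating that lifecycle; the class and field names are
// assumptions, not taken from the original code.
public class AvroRecordWriter<T> implements Closeable {
  private final DataFileWriter<T> dataFileWriter;

  public AvroRecordWriter(Schema schema, File file) throws IOException {
    // One create() call opens the container file; every later write only appends records.
    this.dataFileWriter = new DataFileWriter<T>(new GenericDatumWriter<T>(schema));
    this.dataFileWriter.create(schema, file);
  }

  public void write(T value) throws IOException {
    dataFileWriter.append(value);
  }

  @Override
  public void close() throws IOException {
    dataFileWriter.close();
  }
}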