@Override
public void open(int taskNumber, int numTasks) throws IOException {
  super.open(taskNumber, numTasks);

  DatumWriter<E> datumWriter;
  Schema schema;
  if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
    datumWriter = new SpecificDatumWriter<E>(avroValueType);
    try {
      // Specific records carry their schema; read it from a throwaway instance.
      schema = ((org.apache.avro.specific.SpecificRecordBase) avroValueType.newInstance()).getSchema();
    } catch (InstantiationException | IllegalAccessException e) {
      // Wrap the cause itself rather than just its message, so the stack trace survives.
      throw new RuntimeException(e);
    }
  } else {
    // Plain POJOs fall back to reflection.
    datumWriter = new ReflectDatumWriter<E>(avroValueType);
    schema = ReflectData.get().getSchema(avroValueType);
  }

  dataFileWriter = new DataFileWriter<E>(datumWriter);
  if (userDefinedSchema == null) {
    dataFileWriter.create(schema, stream);
  } else {
    dataFileWriter.create(userDefinedSchema, stream);
  }
}
/**
 * Builds the target file path as <datasource directory>/<month>.avro. If the target file
 * already exists, it is opened for appending; otherwise it is created.
 */
private DataFileWriter<GenericRecord> openHDFSFile(DatasourceMonth datasourceMonth) throws IOException {
  DataFileWriter<GenericRecord> writer =
      new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(AVRO_SCHEMA));
  writer.setSyncInterval(FILEWRITER_SYNC_INTERVAL);
  // writer.setCodec(CodecFactory.snappyCodec()); // omit for now

  Path targetPath = buildTargetPath(datasourceMonth);
  // just for logging
  String fullTargetPath = this.hdfs.getWorkingDirectory() + "/" + targetPath;

  // Append to an existing file, or create a new one otherwise.
  if (this.hdfs.exists(targetPath)) {
    // Appending to an existing file, based on
    // http://technicaltidbit.blogspot.com.es/2013/02/avro-can-append-in-hdfs-after-all.html
    if (debugMode) {
      this.hdfs.setReplication(targetPath, (short) 1);
    }
    LOGGER.info("Appending to existing file {}", fullTargetPath);
    OutputStream outputStream = this.hdfs.append(targetPath);
    writer.appendTo(new FsInput(targetPath, this.hadoopConf), outputStream);
  } else {
    // Creating a new file
    LOGGER.info("Creating new file {} for datasource {} and month {}",
        fullTargetPath, datasourceMonth.datasource(), datasourceMonth.month());
    OutputStream outputStream = this.hdfs.create(targetPath);
    writer.create(AVRO_SCHEMA, outputStream);
  }

  return writer;
}
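The two-argument appendTo above is needed because an HDFS append stream is write-only: the writer recovers the existing header (schema, codec, sync marker) through the seekable FsInput and emits new blocks on the OutputStream. For local files the same create-or-append decision collapses to the single-argument convenience form. A hedged sketch, not taken from the class above; schema, record, and target are whatever the caller supplies:

private void writeRecord(Schema schema, GenericRecord record, File target) throws IOException {
  DataFileWriter<GenericRecord> writer =
      new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(schema));
  if (target.exists()) {
    writer.appendTo(target); // single-argument convenience form for local files
  } else {
    writer.create(schema, target);
  }
  writer.append(record);
  writer.close();
}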
private void writeContainer(Record src, OutputStream dst) {
  DataFileWriter<GenericContainer> dataFileWriter = null;
  try {
    try {
      Schema schema = null;
      for (Object attachment : src.get(Fields.ATTACHMENT_BODY)) {
        Preconditions.checkNotNull(attachment);
        GenericContainer datum = (GenericContainer) attachment;
        schema = getSchema(datum, schema);
        assert schema != null;
        if (dataFileWriter == null) { // init
          GenericDatumWriter<GenericContainer> datumWriter = new GenericDatumWriter<>(schema);
          dataFileWriter = new DataFileWriter<>(datumWriter);
          if (codecFactory != null) {
            dataFileWriter.setCodec(codecFactory);
          }
          for (Map.Entry<String, String> entry : metadata.entrySet()) {
            dataFileWriter.setMeta(entry.getKey(), entry.getValue());
          }
          dataFileWriter.create(schema, dst);
        }
        dataFileWriter.append(datum);
      }
      if (dataFileWriter != null) {
        dataFileWriter.flush();
      }
    } catch (IOException e) {
      throw new MorphlineRuntimeException(e);
    }
  } finally {
    Closeables.closeQuietly(dataFileWriter);
  }
}
private byte[] createAvroData(String name, int age, List<String> emails) throws IOException {
  String AVRO_SCHEMA = "{\n"
      + "\"type\": \"record\",\n"
      + "\"name\": \"Employee\",\n"
      + "\"fields\": [\n"
      + " {\"name\": \"name\", \"type\": \"string\"},\n"
      + " {\"name\": \"age\", \"type\": \"int\"},\n"
      + " {\"name\": \"emails\", \"type\": {\"type\": \"array\", \"items\": \"string\"}},\n"
      + " {\"name\": \"boss\", \"type\": [\"Employee\",\"null\"]}\n"
      + "]}";
  Schema schema = new Schema.Parser().parse(AVRO_SCHEMA);
  ByteArrayOutputStream out = new ByteArrayOutputStream();

  GenericRecord e1 = new GenericData.Record(schema);
  e1.put("name", name);
  e1.put("age", age);
  e1.put("emails", emails);
  e1.put("boss", null);

  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
  DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter);
  dataFileWriter.create(schema, out);
  dataFileWriter.append(e1);
  dataFileWriter.close();
  return out.toByteArray();
}
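Because the helper returns a complete Avro container file in memory, the bytes are self-describing and can be read back without supplying the schema again. A hedged round-trip sketch (the arguments are illustrative; assumes the method above plus org.apache.avro.file.SeekableByteArrayInput):

byte[] bytes = createAvroData("Alice", 30, Arrays.asList("alice@example.com"));
try (DataFileReader<GenericRecord> reader = new DataFileReader<>(
    new SeekableByteArrayInput(bytes), new GenericDatumReader<GenericRecord>())) {
  // The reader takes the schema from the file header, so none is passed in.
  while (reader.hasNext()) {
    GenericRecord employee = reader.next();
    System.out.println(employee.get("name") + ": " + employee.get("emails"));
  }
}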
private static <T> File createFile(File file, Schema schema, T... records) throws IOException {
  DatumWriter<T> datumWriter = new GenericDatumWriter<T>(schema);
  DataFileWriter<T> fileWriter = new DataFileWriter<T>(datumWriter);
  fileWriter.create(schema, file);
  for (T record : records) {
    fileWriter.append(record);
  }
  fileWriter.close();
  return file;
}
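A hedged usage sketch for the varargs helper above; the schema and record are illustrative, not taken from the surrounding test:

// Build one generic record and write it through the helper in a single call.
Schema schema = new Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"KV\",\"fields\":["
    + "{\"name\":\"key\",\"type\":\"string\"},"
    + "{\"name\":\"value\",\"type\":\"int\"}]}");
GenericRecord a = new GenericData.Record(schema);
a.put("key", "a");
a.put("value", 1);
File avroFile = createFile(File.createTempFile("kv", ".avro"), schema, a);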
@Override
public void prepare(Map conf, TridentOperationContext context) {
  try {
    String path = (String) conf.get("DOCUMENT_PATH");
    // Schema.parse(InputStream) is deprecated; use Schema.Parser instead.
    schema = new Schema.Parser()
        .parse(PersistDocumentFunction.class.getResourceAsStream("/document.avsc"));
    File file = new File(path);
    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
    dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);
    if (file.exists()) {
      dataFileWriter.appendTo(file);
    } else {
      dataFileWriter.create(schema, file);
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
private void populateGenericFile(List<GenericRecord> genericRecords) throws IOException {
  FileOutputStream outputStream = new FileOutputStream(this.avroFile);
  GenericDatumWriter<GenericRecord> genericDatumWriter =
      new GenericDatumWriter<GenericRecord>(schema);
  DataFileWriter<GenericRecord> dataFileWriter =
      new DataFileWriter<GenericRecord>(genericDatumWriter);
  dataFileWriter.create(schema, outputStream);
  for (GenericRecord record : genericRecords) {
    dataFileWriter.append(record);
  }
  dataFileWriter.close();
  outputStream.close();
}
/**
 * Generates an input Avro file containing the given records in the temporary directory and
 * returns the full path of the file.
 */
private <T> String generateTestFile(
    String filename,
    List<T> elems,
    SyncBehavior syncBehavior,
    int syncInterval,
    AvroCoder<T> coder,
    String codec)
    throws IOException {
  Random random = new Random(0);
  File tmpFile = tmpFolder.newFile(filename);
  String path = tmpFile.toString();

  FileOutputStream os = new FileOutputStream(tmpFile);
  DatumWriter<T> datumWriter = coder.createDatumWriter();
  try (DataFileWriter<T> writer = new DataFileWriter<>(datumWriter)) {
    writer.setCodec(CodecFactory.fromString(codec));
    writer.create(coder.getSchema(), os);
    int recordIndex = 0;
    int syncIndex = syncBehavior == SyncBehavior.SYNC_RANDOM ? random.nextInt(syncInterval) : 0;
    for (T elem : elems) {
      writer.append(elem);
      recordIndex++;
      switch (syncBehavior) {
        case SYNC_REGULAR:
          if (recordIndex == syncInterval) {
            recordIndex = 0;
            writer.sync();
          }
          break;
        case SYNC_RANDOM:
          if (recordIndex == syncIndex) {
            recordIndex = 0;
            writer.sync();
            syncIndex = random.nextInt(syncInterval);
          }
          break;
        case SYNC_DEFAULT:
        default:
      }
    }
  }
  return path;
}
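The explicit writer.sync() calls above end the current block and write a sync marker; on the read side those markers are what make the file splittable. A hedged sketch of that read side (byte offsets are arbitrary; assumes the path returned by the generator above):

try (DataFileReader<GenericRecord> reader = new DataFileReader<>(
    new File(path), new GenericDatumReader<GenericRecord>())) {
  reader.sync(1024); // jump to the first sync marker after byte 1024
  while (reader.hasNext() && !reader.pastSync(4096)) {
    // Only records from blocks that start in the byte range [1024, 4096).
    GenericRecord record = reader.next();
  }
}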
public void testWrite() throws IOException {
  URL url = this.getClass().getClassLoader().getResource("input/Company.avsc");
  assertNotNull(url);
  Schema schema = new Schema.Parser().parse(new File(url.getFile()));
  assertNotNull(schema);
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);

  // Another way of loading a file
  File file = new File("src/test/resources/input/companies.avro");
  DataFileReader<GenericRecord> dataFileReader =
      new DataFileReader<GenericRecord>(file, datumReader);

  File fileOut = new File("target/companies2.avro");
  Schema schemaOut = new Schema.Parser().parse(new File("src/test/resources/input/Company2.avsc"));
  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schemaOut);
  DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);

  GenericRecord company = null;
  int count = 0;
  while (dataFileReader.hasNext()) {
    company = dataFileReader.next(company);
    if (company.get("name").toString().equals("aol")) {
      // create() may only be called once; the test data contains a single "aol" record.
      dataFileWriter.create(schemaOut, fileOut);
      GenericRecord recordOut = new GenericData.Record(schemaOut);
      recordOut.put("id", company.get("id"));
      recordOut.put("name", company.get("name"));
      assertTrue(recordOut.getSchema().getField("address") != null);
      assertTrue(recordOut.getSchema().getField("employeeCount") == null);
      // address is of complex type
      GenericRecord address =
          new GenericData.Record((GenericData.Record) company.get("address"), true);
      recordOut.put("address", address);
      dataFileWriter.append(recordOut);
      count++;
    }
  }
  assertTrue(count > 0);
  dataFileWriter.close();
  dataFileReader.close(); // close the reader as well to avoid leaking the file handle
}
public static void main(String[] args) throws IOException {
  User user1 = new User();
  user1.setName("Alyssa");
  user1.setFavoriteNumber(256);
  // Leave favorite color null

  // Alternate constructor
  User user2 = new User("Ben", 7, "red");

  // Construct via builder
  User user3 = User.newBuilder()
      .setName("Charlie")
      .setFavoriteColor("blue")
      .setFavoriteNumber(null)
      .build();

  // Serialize user1, user2 and user3 to disk
  File file = new File("users.avro");
  DatumWriter<User> userDatumWriter = new SpecificDatumWriter<User>(User.class);
  DataFileWriter<User> dataFileWriter = new DataFileWriter<User>(userDatumWriter);
  dataFileWriter.create(user1.getSchema(), file);
  dataFileWriter.append(user1);
  dataFileWriter.append(user2);
  dataFileWriter.append(user3);
  dataFileWriter.close();

  // Deserialize Users from disk
  DatumReader<User> userDatumReader = new SpecificDatumReader<User>(User.class);
  DataFileReader<User> dataFileReader = new DataFileReader<User>(file, userDatumReader);
  try {
    User user = null;
    while (dataFileReader.hasNext()) {
      // Reuse user object by passing it to next(). This saves us from
      // allocating and garbage collecting many objects for files with
      // many items.
      user = dataFileReader.next(user);
      System.out.println(user);
    }
  } finally {
    dataFileReader.close();
  }
}
public static void main(String[] args) throws IOException {
  DatumWriter<ArchivePlace> datumWriter = new SpecificDatumWriter<ArchivePlace>(ArchivePlace.class);
  DataFileWriter<ArchivePlace> falloutDatafileWriter = new DataFileWriter<ArchivePlace>(datumWriter);
  // Note: the stream is opened in append mode, but create() still writes a full
  // container header; this only yields a valid file if input.avro is empty or absent.
  FileOutputStream falloutOutputStream =
      new FileOutputStream("src/test/resources/archive-places/input.avro", true);
  falloutDatafileWriter.create(ArchivePlace.SCHEMA$, falloutOutputStream);

  List<ArchivePlace> places = SerializationUtil.loadFromJsons(
      ArchivePlace.SCHEMA$, "src/test/resources/archive-places/input.json");
  for (ArchivePlace place : places) {
    falloutDatafileWriter.append(place);
    falloutDatafileWriter.flush();
  }
  falloutDatafileWriter.close();
  falloutOutputStream.close();
}
private List<Map.Entry> previewAvroBatch(FileStatus fileStatus, int batchSize)
    throws IOException, InterruptedException {
  SeekableInput input = new FsInput(fileStatus.getPath(), hadoopConf);
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
  List<Map.Entry> batch = new ArrayList<>();
  int count = 0;
  while (fileReader.hasNext() && batch.size() < batchSize) {
    GenericRecord datum = fileReader.next();
    // Wrap each record in its own container file so the preview consumer
    // receives self-describing bytes and needs no external schema.
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(
        new GenericDatumWriter<GenericRecord>(datum.getSchema()));
    dataFileWriter.create(datum.getSchema(), out);
    dataFileWriter.append(datum);
    dataFileWriter.close();
    out.close();
    batch.add(new Pair(fileStatus.getPath().toUri().getPath() + "::" + count, out.toByteArray()));
    count++;
  }
  fileReader.close(); // release the underlying HDFS input
  return batch;
}
public void serializeSpecific() throws IOException {
  // Create a datum to serialize.
  MyPair datum = new MyPair();
  datum.left = new Utf8("dog");
  datum.right = new Utf8("cat");

  File tmpFile = File.createTempFile("myPairAvroExample", ".avro");

  // Serialize it.
  DataFileWriter<MyPair> writer =
      new DataFileWriter<MyPair>(new SpecificDatumWriter<MyPair>(MyPair.class));
  writer.create(MyPair.SCHEMA$, tmpFile);
  writer.append(datum);
  writer.close();
  System.out.println("Serialization: " + tmpFile);

  // Deserialize it.
  FileReader<MyPair> reader =
      DataFileReader.openReader(tmpFile, new SpecificDatumReader<MyPair>(MyPair.class));
  while (reader.hasNext()) {
    MyPair result = reader.next();
    System.out.printf("Left: %s, Right: %s\n", result.left, result.right);
  }
  reader.close();
}
@SuppressWarnings("deprecation") // uses internal test functionality. @Override protected void prepareWrite(WritableByteChannel channel) throws Exception { dataFileWriter = new DataFileWriter<>(coder.createDatumWriter()); dataFileWriter.create(coder.getSchema(), Channels.newOutputStream(channel)); }