示例#1
0
  /**
   * Opens this output for the given task and starts an Avro container file on {@code stream}.
   *
   * <p>If {@code avroValueType} derives from {@code SpecificRecordBase}, a {@code
   * SpecificDatumWriter} is used and the schema is read from a freshly instantiated record. Any
   * other type is handled through Avro reflection. When {@code userDefinedSchema} is set, it is the
   * schema passed to the file writer; the derived schema is only used as the fallback (it is still
   * computed in both cases).
   *
   * @param taskNumber forwarded unchanged to {@code super.open}
   * @param numTasks forwarded unchanged to {@code super.open}
   * @throws IOException if {@code super.open} or the creation of the Avro file fails
   * @throws RuntimeException if the specific record class cannot be instantiated; the reflective
   *     failure is attached as the cause
   */
  @Override
  public void open(int taskNumber, int numTasks) throws IOException {
    super.open(taskNumber, numTasks);

    DatumWriter<E> datumWriter;
    Schema schema;
    if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
      datumWriter = new SpecificDatumWriter<E>(avroValueType);
      try {
        schema =
            ((org.apache.avro.specific.SpecificRecordBase) avroValueType.newInstance()).getSchema();
      } catch (InstantiationException | IllegalAccessException e) {
        // Same message as before, but keep the original exception as the cause so the
        // stack trace of the reflective failure is not lost.
        throw new RuntimeException(e.getMessage(), e);
      }
    } else {
      datumWriter = new ReflectDatumWriter<E>(avroValueType);
      schema = ReflectData.get().getSchema(avroValueType);
    }
    dataFileWriter = new DataFileWriter<E>(datumWriter);
    if (userDefinedSchema == null) {
      dataFileWriter.create(schema, stream);
    } else {
      dataFileWriter.create(userDefinedSchema, stream);
    }
  }
示例#2
0
  /**
   * Builds the target file path as {@code <datasource directory>/<month>.avro} and opens an Avro
   * writer on it. If the target file already exists it is opened for appending, otherwise it is
   * created.
   *
   * <p>If the writer cannot be attached to the file, the HDFS output stream opened here is closed
   * before the exception is rethrown.
   *
   * @param datasourceMonth datasource and month that select the target file
   * @return an open writer positioned to append records; the caller must close it
   * @throws IOException if the file system or the Avro writer fails
   */
  private DataFileWriter<GenericRecord> openHDFSFile(DatasourceMonth datasourceMonth)
      throws IOException {
    DataFileWriter<GenericRecord> writer =
        new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(AVRO_SCHEMA));
    writer.setSyncInterval(FILEWRITER_SYNC_INTERVAL);
    // writer.setCodec(CodecFactory.snappyCodec()); // omit for now

    Path targetPath = buildTargetPath(datasourceMonth);
    // just for logging
    String fullTargetPath = this.hdfs.getWorkingDirectory() + "/" + targetPath;

    OutputStream outputStream = null;
    try {
      // Append to an existing file, or create a new file otherwise
      if (this.hdfs.exists(targetPath)) {
        // appending to an existing file
        // based on
        // http://technicaltidbit.blogspot.com.es/2013/02/avro-can-append-in-hdfs-after-all.html
        if (debugMode) {
          this.hdfs.setReplication(targetPath, (short) 1);
        }
        LOGGER.info("Appending to existing file {}", fullTargetPath);
        outputStream = this.hdfs.append(targetPath);
        // The input is only needed while appendTo reads the existing header, so release it
        // as soon as the writer is initialised.
        try (FsInput existingFile = new FsInput(targetPath, this.hadoopConf)) {
          writer.appendTo(existingFile, outputStream);
        }
      } else {
        // creating a new file
        // The path is passed as an argument instead of being concatenated into the format
        // string, so a "{}" inside the path cannot shift the placeholders.
        LOGGER.info(
            "Creating new file {} for datasource {} and month {}",
            fullTargetPath,
            datasourceMonth.datasource(),
            datasourceMonth.month());
        outputStream = this.hdfs.create(targetPath);
        writer.create(AVRO_SCHEMA, outputStream);
      }
    } catch (IOException | RuntimeException e) {
      // The writer never took ownership of the stream, so close it here to avoid leaking it.
      if (outputStream != null) {
        try {
          outputStream.close();
        } catch (IOException closeFailure) {
          e.addSuppressed(closeFailure);
        }
      }
      throw e;
    }

    return writer;
  }
 /**
  * Writes every attachment body of {@code src} into a single Avro container on {@code dst}.
  *
  * <p>The writer is created lazily on the first attachment, using the schema returned by {@code
  * getSchema} for that attachment, plus the configured codec (if any) and metadata entries. If
  * there are no attachments, nothing is written. The writer is always closed quietly at the end.
  *
  * @param src record whose {@code Fields.ATTACHMENT_BODY} values are cast to {@code
  *     GenericContainer}
  * @param dst stream that receives the Avro container
  * @throws MorphlineRuntimeException wrapping any {@code IOException} raised while writing
  */
 private void writeContainer(Record src, OutputStream dst) {
   DataFileWriter containerWriter = null;
   try {
     Schema currentSchema = null;
     for (Object body : src.get(Fields.ATTACHMENT_BODY)) {
       Preconditions.checkNotNull(body);
       GenericContainer container = (GenericContainer) body;
       currentSchema = getSchema(container, currentSchema);
       assert currentSchema != null;
       if (containerWriter == null) {
         // First attachment: configure and open the container before appending anything.
         containerWriter = new DataFileWriter(new GenericDatumWriter(currentSchema));
         if (codecFactory != null) {
           containerWriter.setCodec(codecFactory);
         }
         for (Map.Entry<String, String> meta : metadata.entrySet()) {
           containerWriter.setMeta(meta.getKey(), meta.getValue());
         }
         containerWriter.create(currentSchema, dst);
       }
       containerWriter.append(container);
     }
     if (containerWriter != null) {
       containerWriter.flush();
     }
   } catch (IOException e) {
     throw new MorphlineRuntimeException(e);
   } finally {
     Closeables.closeQuietly(containerWriter);
   }
 }
  /**
   * Serialises one {@code Employee} record into an in-memory Avro container file.
   *
   * <p>The schema is defined inline; the {@code boss} field is always written as {@code null}.
   *
   * @param name value of the {@code name} field
   * @param age value of the {@code age} field
   * @param emails value of the {@code emails} array field
   * @return the bytes of the Avro container holding the single record
   * @throws IOException if the Avro writer fails
   */
  private byte[] createAvroData(String name, int age, List<String> emails) throws IOException {
    String schemaJson =
        "{\n"
            + "\"type\": \"record\",\n"
            + "\"name\": \"Employee\",\n"
            + "\"fields\": [\n"
            + " {\"name\": \"name\", \"type\": \"string\"},\n"
            + " {\"name\": \"age\", \"type\": \"int\"},\n"
            + " {\"name\": \"emails\", \"type\": {\"type\": \"array\", \"items\": \"string\"}},\n"
            + " {\"name\": \"boss\", \"type\": [\"Employee\",\"null\"]}\n"
            + "]}";
    Schema schema = new Schema.Parser().parse(schemaJson);

    GenericRecord employee = new GenericData.Record(schema);
    employee.put("name", name);
    employee.put("age", age);
    employee.put("emails", emails);
    employee.put("boss", null);

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
    // try-with-resources closes the writer even when create/append throws; closing also
    // flushes the pending block into 'out' before the bytes are read below.
    try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
      dataFileWriter.create(schema, out);
      dataFileWriter.append(employee);
    }
    return out.toByteArray();
  }
示例#5
0
 /**
  * Writes the given records to {@code file} as an Avro container using {@code schema}.
  *
  * @param file destination file, passed to {@code DataFileWriter.create}
  * @param schema schema used both for the datum writer and the container header
  * @param records records to append, in order; may be empty
  * @return the same {@code file} instance, for call chaining
  * @throws IOException if the file cannot be created or a record cannot be written
  */
 @SafeVarargs // the varargs array is only iterated, never stored or exposed
 private static <T> File createFile(File file, Schema schema, T... records) throws IOException {
   DatumWriter<T> datumWriter = new GenericDatumWriter<T>(schema);
   // try-with-resources releases the file handle even when an append fails part-way.
   try (DataFileWriter<T> fileWriter = new DataFileWriter<T>(datumWriter)) {
     fileWriter.create(schema, file);
     for (T record : records) {
       fileWriter.append(record);
     }
   }
   return file;
 }
 /**
  * Loads the document schema from the classpath and opens the Avro writer on the file named by
  * the {@code DOCUMENT_PATH} configuration entry.
  *
  * <p>An existing file is opened for appending; otherwise a new container file is created.
  *
  * @param conf configuration map; the {@code "DOCUMENT_PATH"} value is cast to {@code String}
  * @param context not used by this method
  * @throws IllegalStateException if the {@code /document.avsc} resource is missing
  * @throws RuntimeException wrapping any {@code IOException} from reading the schema or opening
  *     the file
  */
 @Override
 public void prepare(Map conf, TridentOperationContext context) {
   String path = (String) conf.get("DOCUMENT_PATH");
   // try-with-resources releases the classpath stream once the schema has been parsed.
   try (java.io.InputStream schemaStream =
       PersistDocumentFunction.class.getResourceAsStream("/document.avsc")) {
     if (schemaStream == null) {
       // getResourceAsStream signals a missing resource with null; fail with a clear message.
       throw new IllegalStateException("Schema resource /document.avsc was not found");
     }
     // Schema.Parser replaces the deprecated static Schema.parse(InputStream).
     schema = new Schema.Parser().parse(schemaStream);
     File file = new File(path);
     DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
     dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);
     if (file.exists()) {
       dataFileWriter.appendTo(file);
     } else {
       dataFileWriter.create(schema, file);
     }
   } catch (IOException e) {
     throw new RuntimeException(e);
   }
 }
  /**
   * Writes the given records to {@code this.avroFile} as an Avro container using the {@code
   * schema} field. A {@code FileOutputStream} is opened on the file without the append flag, so
   * existing content is replaced.
   *
   * @param genericRecords records to append, in iteration order
   * @throws IOException if the file cannot be opened or a record cannot be written
   */
  private void populateGenericFile(List<GenericRecord> genericRecords) throws IOException {
    GenericDatumWriter<GenericRecord> genericDatumWriter =
        new GenericDatumWriter<GenericRecord>(schema);

    // Resources are closed in reverse order (writer first, then the stream), matching the
    // original close order, and are released even when an append throws.
    try (FileOutputStream outputStream = new FileOutputStream(this.avroFile);
        DataFileWriter<GenericRecord> dataFileWriter =
            new DataFileWriter<GenericRecord>(genericDatumWriter)) {
      dataFileWriter.create(schema, outputStream);

      for (GenericRecord record : genericRecords) {
        dataFileWriter.append(record);
      }
    }
  }
  /**
   * Generates an input Avro file containing the given records in the temporary directory and
   * returns the full path of the file.
   *
   * <p>Sync markers are placed according to {@code syncBehavior}:
   *
   * <ul>
   *   <li>{@code SYNC_REGULAR}: an explicit sync after every {@code syncInterval} records;
   *   <li>{@code SYNC_RANDOM}: an explicit sync after a pseudo-random number of records below
   *       {@code syncInterval}, redrawn after each sync (the generator is seeded with 0, so the
   *       layout is reproducible);
   *   <li>{@code SYNC_DEFAULT}: no explicit sync calls.
   * </ul>
   *
   * @param filename name of the file to create in {@code tmpFolder}
   * @param elems records to write, in order
   * @param syncBehavior how explicit sync markers are placed
   * @param syncInterval sync period, or exclusive upper bound of the random period
   * @param coder supplies the datum writer and the schema
   * @param codec codec name passed to {@code CodecFactory.fromString}
   * @return the path of the generated file
   * @throws IOException if the file cannot be created or written
   */
  private <T> String generateTestFile(
      String filename,
      List<T> elems,
      SyncBehavior syncBehavior,
      int syncInterval,
      AvroCoder<T> coder,
      String codec)
      throws IOException {
    Random random = new Random(0);
    File tmpFile = tmpFolder.newFile(filename);
    String path = tmpFile.toString();

    DatumWriter<T> datumWriter = coder.createDatumWriter();
    // The stream is part of the try-with-resources so it is released even when setCodec or
    // create fails before the writer has taken ownership of it.
    try (FileOutputStream os = new FileOutputStream(tmpFile);
        DataFileWriter<T> writer = new DataFileWriter<>(datumWriter)) {
      writer.setCodec(CodecFactory.fromString(codec));
      writer.create(coder.getSchema(), os);

      int recordIndex = 0;
      // NOTE(review): nextInt may return 0, while recordIndex is at least 1 when it is compared
      // below, so a draw of 0 means no further explicit sync is issued. Kept as-is so generated
      // files keep their layout.
      int syncIndex = syncBehavior == SyncBehavior.SYNC_RANDOM ? random.nextInt(syncInterval) : 0;

      for (T elem : elems) {
        writer.append(elem);
        recordIndex++;

        switch (syncBehavior) {
          case SYNC_REGULAR:
            if (recordIndex == syncInterval) {
              recordIndex = 0;
              writer.sync();
            }
            break;
          case SYNC_RANDOM:
            if (recordIndex == syncIndex) {
              recordIndex = 0;
              writer.sync();
              syncIndex = random.nextInt(syncInterval);
            }
            break;
          case SYNC_DEFAULT:
          default:
            // No explicit sync calls are made in this mode.
        }
      }
    }
    return path;
  }
  /**
   * Reads {@code companies.avro}, projects every company named {@code "aol"} onto the {@code
   * Company2} schema (id, name, address) and writes the result to {@code target/companies2.avro}.
   *
   * <p>The output file is created once, before the loop: calling {@code create} again on a writer
   * that is already open fails, which the original did for a second matching record.
   *
   * @throws IOException if a schema or data file cannot be read or the output cannot be written
   */
  public void testWrite() throws IOException {

    URL url = this.getClass().getClassLoader().getResource("input/Company.avsc");
    assertNotNull(url);
    Schema schema = new Schema.Parser().parse(new File(url.getFile()));
    assertNotNull(schema);

    DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
    // Another way of loading a file
    File file = new File("src/test/resources/input/companies.avro");

    File fileOut = new File("target/companies2.avro");
    Schema schemaOut =
        new Schema.Parser().parse(new File("src/test/resources/input/Company2.avsc"));
    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schemaOut);

    int count = 0;
    // Both reader and writer are closed even when an assertion inside the loop fails.
    try (DataFileReader<GenericRecord> dataFileReader =
            new DataFileReader<GenericRecord>(file, datumReader);
        DataFileWriter<GenericRecord> dataFileWriter =
            new DataFileWriter<GenericRecord>(datumWriter)) {
      dataFileWriter.create(schemaOut, fileOut);

      GenericRecord company = null;
      while (dataFileReader.hasNext()) {
        company = dataFileReader.next(company);
        Object name = company.get("name");
        // Avro strings are not java.lang.String by default, hence the toString() comparison.
        if (name != null && "aol".equals(name.toString())) {
          GenericRecord recordOut = new GenericData.Record(schemaOut);
          recordOut.put("id", company.get("id"));
          recordOut.put("name", name);
          assertTrue(recordOut.getSchema().getField("address") != null);
          assertTrue(recordOut.getSchema().getField("employeeCount") == null);

          // address is of complex type
          GenericRecord address =
              new GenericData.Record((GenericData.Record) company.get("address"), true);
          recordOut.put("address", address);

          dataFileWriter.append(recordOut);

          count++;
        }
      }
    }
    assertTrue(count > 0);
  }
示例#10
0
  /**
   * Builds three {@code User} records in three different ways, writes them to {@code users.avro}
   * and prints them back after reading the file.
   *
   * @param args unused
   * @throws IOException if the file cannot be written or read
   */
  public static void main(String[] args) throws IOException {
    User user1 = new User();
    user1.setName("Alyssa");
    user1.setFavoriteNumber(256);
    // Leave favorite color null

    // Alternate constructor
    User user2 = new User("Ben", 7, "red");

    // Construct via builder
    User user3 =
        User.newBuilder()
            .setName("Charlie")
            .setFavoriteColor("blue")
            .setFavoriteNumber(null)
            .build();

    // Serialize user1, user2 and user3 to disk
    File file = new File("users.avro");
    DatumWriter<User> userDatumWriter = new SpecificDatumWriter<User>(User.class);
    // try-with-resources closes the writer (and the file) even when an append fails.
    try (DataFileWriter<User> dataFileWriter = new DataFileWriter<User>(userDatumWriter)) {
      dataFileWriter.create(user1.getSchema(), file);
      dataFileWriter.append(user1);
      dataFileWriter.append(user2);
      dataFileWriter.append(user3);
    }

    // Deserialize Users from disk
    DatumReader<User> userDatumReader = new SpecificDatumReader<User>(User.class);
    try (DataFileReader<User> dataFileReader = new DataFileReader<User>(file, userDatumReader)) {
      User user = null;
      while (dataFileReader.hasNext()) {
        // Reuse user object by passing it to next(). This saves us from
        // allocating and garbage collecting many objects for files with
        // many items.
        user = dataFileReader.next(user);
        System.out.println(user);
      }
    }
  }
  /**
   * Regenerates {@code src/test/resources/archive-places/input.avro} from the JSON fixture {@code
   * input.json} in the same directory.
   *
   * <p>The output file is truncated rather than opened in append mode. {@code create} always
   * writes a fresh container header, so appending it to an existing file would leave a second
   * header in the middle of the file on every re-run.
   *
   * @param args unused
   * @throws IOException if the fixture cannot be loaded or the Avro file cannot be written
   */
  public static void main(String[] args) throws IOException {
    // Load the input first so that a failure here does not leave a truncated output file.
    List<ArchivePlace> places =
        SerializationUtil.loadFromJsons(
            ArchivePlace.SCHEMA$, "src/test/resources/archive-places/input.json");

    DatumWriter<ArchivePlace> datumWriter =
        new SpecificDatumWriter<ArchivePlace>(ArchivePlace.class);
    // Closed in reverse order (writer, then stream), also when an append fails.
    try (FileOutputStream falloutOutputStream =
            new FileOutputStream("src/test/resources/archive-places/input.avro");
        DataFileWriter<ArchivePlace> falloutDatafileWriter =
            new DataFileWriter<ArchivePlace>(datumWriter)) {
      falloutDatafileWriter.create(ArchivePlace.SCHEMA$, falloutOutputStream);

      for (ArchivePlace place : places) {
        falloutDatafileWriter.append(place);
        // Flushing after every record is kept from the original.
        // NOTE(review): presumably intended to end a block per record - confirm.
        falloutDatafileWriter.flush();
      }
    }
  }
 /**
  * Reads up to {@code batchSize} records from the Avro file described by {@code fileStatus} and
  * re-encodes each record as its own single-record Avro container.
  *
  * <p>Each entry is a {@code Pair} whose first element is {@code <file path>::<record index>}
  * (index starting at 0) and whose second element is the container bytes.
  *
  * @param fileStatus status of the file to preview; its path is opened via {@code FsInput}
  * @param batchSize maximum number of records to return
  * @return the preview entries, possibly empty
  * @throws IOException if the file cannot be read or a record cannot be re-encoded
  * @throws InterruptedException declared for callers; not thrown by the visible code
  */
 private List<Map.Entry> previewAvroBatch(FileStatus fileStatus, int batchSize)
     throws IOException, InterruptedException {
   List<Map.Entry> batch = new ArrayList<>();
   DatumReader<GenericRecord> reader = new GenericDatumReader<>();
   // Both the input and the reader are closed when the preview is done or fails; the input is
   // listed separately so it is released even if openReader itself throws.
   try (SeekableInput input = new FsInput(fileStatus.getPath(), hadoopConf);
       FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader)) {
     while (fileReader.hasNext() && batch.size() < batchSize) {
       GenericRecord datum = fileReader.next();
       ByteArrayOutputStream out = new ByteArrayOutputStream();
       // Closing the writer flushes the record into 'out' before the bytes are copied.
       try (DataFileWriter<GenericRecord> dataFileWriter =
           new DataFileWriter<GenericRecord>(
               new GenericDatumWriter<GenericRecord>(datum.getSchema()))) {
         dataFileWriter.create(datum.getSchema(), out);
         dataFileWriter.append(datum);
       }
       // batch.size() is the zero-based index of the record about to be added.
       batch.add(
           new Pair(
               fileStatus.getPath().toUri().getPath() + "::" + batch.size(), out.toByteArray()));
     }
   }
   return batch;
 }
示例#13
0
  /**
   * Round-trips one {@code MyPair} through a temporary Avro file: writes the datum, prints the
   * file location, then reads the file back and prints every pair found.
   *
   * @throws IOException if the temporary file cannot be created, written or read
   */
  public void serializeSpecific() throws IOException {
    // Create a datum to serialize.
    MyPair datum = new MyPair();
    datum.left = new Utf8("dog");
    datum.right = new Utf8("cat");
    File tmpFile = File.createTempFile("myPairAvroExample", ".avro");

    // Serialize it. try-with-resources closes the writer even when create/append fails.
    try (DataFileWriter<MyPair> writer =
        new DataFileWriter<MyPair>(new SpecificDatumWriter<MyPair>(MyPair.class))) {
      writer.create(MyPair.SCHEMA$, tmpFile);
      writer.append(datum);
    }

    System.out.println("Serialization: " + tmpFile);

    // Deserialize it.
    try (FileReader<MyPair> reader =
        DataFileReader.openReader(tmpFile, new SpecificDatumReader<MyPair>(MyPair.class))) {
      while (reader.hasNext()) {
        MyPair result = reader.next();
        System.out.printf("Left: %s, Right: %s\n", result.left, result.right);
      }
    }
  }
示例#14
0
 /**
  * Creates the Avro file writer for this sink and starts a container on the given channel, using
  * the datum writer and schema supplied by {@code coder}.
  *
  * @param channel destination channel, adapted to an output stream for the Avro writer
  * @throws Exception if the Avro container cannot be created
  */
 @SuppressWarnings("deprecation") // uses internal test functionality.
 @Override
 protected void prepareWrite(WritableByteChannel channel) throws Exception {
   dataFileWriter = new DataFileWriter<>(coder.createDatumWriter());
   // Adapt the channel to the stream API expected by DataFileWriter.create.
   java.io.OutputStream channelStream = Channels.newOutputStream(channel);
   dataFileWriter.create(coder.getSchema(), channelStream);
 }