Пример #1
0
  /**
   * Builds the target file path as <datasource directory>/<month>.avro. If the target file already
   * exists, then it is open for appending, otherwise it is created
   */
  private DataFileWriter<GenericRecord> openHDFSFile(DatasourceMonth datasourceMonth)
      throws IOException {
    DataFileWriter<GenericRecord> writer =
        new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(AVRO_SCHEMA));
    writer.setSyncInterval(FILEWRITER_SYNC_INTERVAL);
    // writer.setCodec(CodecFactory.snappyCodec()); // omit for now

    Path targetPath = buildTargetPath(datasourceMonth);
    // just for logging
    String fullTargetPath = this.hdfs.getWorkingDirectory() + "/" + targetPath;
    // Append to an existing file, or create a new file is file otherwise
    if (this.hdfs.exists(targetPath)) {
      // appending to an existing file
      // based on
      // http://technicaltidbit.blogspot.com.es/2013/02/avro-can-append-in-hdfs-after-all.html
      if (debugMode) {
        this.hdfs.setReplication(targetPath, (short) 1);
      }
      LOGGER.info("Appending to existing file {}", fullTargetPath);
      OutputStream outputStream = this.hdfs.append(targetPath);
      writer.appendTo(new FsInput(targetPath, this.hadoopConf), outputStream);
    } else {
      // creating a new file
      LOGGER.info(
          "Creating new file " + fullTargetPath + " for datasource {} and month {}",
          datasourceMonth.datasource(),
          datasourceMonth.month());
      OutputStream outputStream = this.hdfs.create(targetPath);
      writer.create(AVRO_SCHEMA, outputStream);
    }

    return writer;
  }
Пример #2
0
 private Path buildTargetPath(DatasourceMonth datasourceMonth) {
   return new Path(
       this.datasourcesDirectories.get(datasourceMonth.datasource())
           + "/"
           + datasourceMonth.month()
           + ".avro");
 }
Пример #3
0
  @Override
  public void execute(Tuple inputTuple) {
    /* Processing tuples of the shape
    (DATASOURCE_ID, TIMESTAMP_FIELD, CONTENT_FIELD) */

    // get datasource
    String datasource = inputTuple.getStringByField(RestIngestionSpout.DATASOURCE_ID);
    // compute month
    long timestamp = inputTuple.getLongByField(RestIngestionSpout.TIMESTAMP_FIELD);
    // this computation is completely stateless
    String month = timestampToMonth(timestamp);

    // now get the DataFileWriter
    DataFileWriter<GenericRecord> writer = null;
    try {
      writer = this.writersCache.get(DatasourceMonth.create(datasource, month));
    } catch (ExecutionException ee) {
      LOGGER.error(
          "Error getting DataFileWriter for tuple for datasource "
              + datasource
              + " and timestamp "
              + timestamp
              + " : "
              + ee.getMessage());
      this.collector.fail(inputTuple);
      return;
    }

    // create and write a new record
    GenericRecord newDataRecord = new GenericData.Record(AVRO_SCHEMA);
    newDataRecord.put(AVRO_TIMESTAMP_FIELD, new Long(timestamp));
    newDataRecord.put(
        AVRO_CONTENT_FIELD, inputTuple.getStringByField(RestIngestionSpout.CONTENT_FIELD));
    try {
      writer.append(newDataRecord);
    } catch (IOException ioe) {
      LOGGER.error(
          "Error writing Avro record for datasource "
              + datasource
              + " and timestamp "
              + timestamp
              + " : "
              + ioe.getMessage());
      this.collector.fail(inputTuple);
      return;
    }

    // ACK processing for this tupe as ok
    this.collector.ack(inputTuple);
  }