/** * Builds the target file path as <datasource directory>/<month>.avro. If the target file already * exists, then it is open for appending, otherwise it is created */ private DataFileWriter<GenericRecord> openHDFSFile(DatasourceMonth datasourceMonth) throws IOException { DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(AVRO_SCHEMA)); writer.setSyncInterval(FILEWRITER_SYNC_INTERVAL); // writer.setCodec(CodecFactory.snappyCodec()); // omit for now Path targetPath = buildTargetPath(datasourceMonth); // just for logging String fullTargetPath = this.hdfs.getWorkingDirectory() + "/" + targetPath; // Append to an existing file, or create a new file is file otherwise if (this.hdfs.exists(targetPath)) { // appending to an existing file // based on // http://technicaltidbit.blogspot.com.es/2013/02/avro-can-append-in-hdfs-after-all.html if (debugMode) { this.hdfs.setReplication(targetPath, (short) 1); } LOGGER.info("Appending to existing file {}", fullTargetPath); OutputStream outputStream = this.hdfs.append(targetPath); writer.appendTo(new FsInput(targetPath, this.hadoopConf), outputStream); } else { // creating a new file LOGGER.info( "Creating new file " + fullTargetPath + " for datasource {} and month {}", datasourceMonth.datasource(), datasourceMonth.month()); OutputStream outputStream = this.hdfs.create(targetPath); writer.create(AVRO_SCHEMA, outputStream); } return writer; }
/**
 * Returns the HDFS path {@code <datasource directory>/<month>.avro} for the given
 * datasource/month pair.
 */
private Path buildTargetPath(DatasourceMonth datasourceMonth) {
  String directory = this.datasourcesDirectories.get(datasourceMonth.datasource());
  String fileName = datasourceMonth.month() + ".avro";
  return new Path(directory + "/" + fileName);
}
@Override public void execute(Tuple inputTuple) { /* Processing tuples of the shape (DATASOURCE_ID, TIMESTAMP_FIELD, CONTENT_FIELD) */ // get datasource String datasource = inputTuple.getStringByField(RestIngestionSpout.DATASOURCE_ID); // compute month long timestamp = inputTuple.getLongByField(RestIngestionSpout.TIMESTAMP_FIELD); // this computation is completely stateless String month = timestampToMonth(timestamp); // now get the DataFileWriter DataFileWriter<GenericRecord> writer = null; try { writer = this.writersCache.get(DatasourceMonth.create(datasource, month)); } catch (ExecutionException ee) { LOGGER.error( "Error getting DataFileWriter for tuple for datasource " + datasource + " and timestamp " + timestamp + " : " + ee.getMessage()); this.collector.fail(inputTuple); return; } // create and write a new record GenericRecord newDataRecord = new GenericData.Record(AVRO_SCHEMA); newDataRecord.put(AVRO_TIMESTAMP_FIELD, new Long(timestamp)); newDataRecord.put( AVRO_CONTENT_FIELD, inputTuple.getStringByField(RestIngestionSpout.CONTENT_FIELD)); try { writer.append(newDataRecord); } catch (IOException ioe) { LOGGER.error( "Error writing Avro record for datasource " + datasource + " and timestamp " + timestamp + " : " + ioe.getMessage()); this.collector.fail(inputTuple); return; } // ACK processing for this tupe as ok this.collector.ack(inputTuple); }