/**
 * Runs the Crunch MapReduce pipeline that turns the CSV input into HBase {@code Put}s.
 *
 * <p>In order: calls {@code createTable()}, reads {@code 2004_2014.csv} located under
 * {@code Constants.HDFS_INPUT_PATH} as text lines, converts the lines with
 * {@code CrunchUtils.returnDates}, prints the resulting collection's {@code getSize()}
 * to standard output, and writes the collection to an {@code HBaseTarget} for
 * {@code Constants.STOCK_DATES_TABLE}.
 *
 * @param args command-line arguments; not read by this method
 * @return {@code 0} when the pipeline result reports success, {@code 1} otherwise
 * @throws Exception propagated from {@code createTable()} or from the pipeline calls
 */
@Override
public int run(final String[] args) throws Exception {
    createTable();

    final Pipeline crunch =
            new MRPipeline(CrunchStockDateInserter.class, "PipelineWithFilterFn", getConf());

    final String csvPath = Constants.HDFS_INPUT_PATH + "/2004_2014.csv";
    final PCollection<String> csvLines = crunch.readTextFile(csvPath);
    final PCollection<Put> puts = CrunchUtils.returnDates(csvLines);

    // Diagnostic output only; the value is whatever PCollection#getSize() reports.
    System.out.println("********** size ************ : " + puts.getSize());

    crunch.write(puts, new HBaseTarget(Constants.STOCK_DATES_TABLE));

    // done() triggers execution of the planned jobs and cleans the pipeline up.
    final PipelineResult outcome = crunch.done();
    if (outcome.succeeded()) {
        return 0;
    }
    return 1;
}
public int run(String[] args) throws Exception { Pipeline pipeline = new MRPipeline(SecondarySortingExample.class); // Read input PCollection<String> lines = pipeline.readTextFile(args[0]); // Split each line and count them PTable<String, Long> wordcount = lines.parallelDo(new Tokenizer(), Writables.strings()).count(); // Sort PCollection<Pair<String, Long>> sorted = Sort.sortPairs(wordcount, ColumnOrder.by(1, Sort.Order.DESCENDING)); // Write the output sorted.write(To.textFile(args[0])); // Kick off execution PipelineResult result = pipeline.done(); return result.succeeded() ? 0 : 1; }
/**
 * Draws a sample from the input records and writes it to {@code sampleFile}.
 *
 * <p>Exactly one sampling mode must be selected:
 * <ul>
 *   <li>{@code sampleSize > 0}: sample via {@code ReservoirSampling.sample};</li>
 *   <li>{@code 0.0 < samplingProbability < 1.0}: sample via {@code Sample.sample}.</li>
 * </ul>
 * The pipeline is created and the records are obtained before the options are
 * validated.
 *
 * @param conf configuration handed to {@code pipelineParams.create}
 * @return {@code 0} when the pipeline result reports success, {@code 1} otherwise
 * @throws IllegalArgumentException if both {@code --size} and {@code --prob} are set,
 *         or if neither yields a valid sampling mode
 * @throws Exception propagated from the pipeline calls
 */
@Override
public int execute(Configuration conf) throws Exception {
    final Pipeline pipeline = pipelineParams.create(SampleCommand.class, conf);
    final PCollection<Record> records = inputParams.getRecords(pipeline);

    final boolean bySize = sampleSize > 0;
    final boolean byProbability = samplingProbability > 0.0;

    // Guard: the two modes cannot be combined.
    if (bySize && byProbability) {
        throw new IllegalArgumentException("--size and --prob are mutually exclusive options.");
    }

    final PCollection<Record> sampled;
    if (bySize) {
        sampled = ReservoirSampling.sample(records, sampleSize);
    } else {
        // Without a size, the probability must lie strictly between 0 and 1.
        if (!(byProbability && samplingProbability < 1.0)) {
            throw new IllegalArgumentException(
                    String.format(
                            "Invalid input args: sample size = %d, sample prob = %.4f",
                            sampleSize, samplingProbability));
        }
        sampled = Sample.sample(records, samplingProbability);
    }

    outputParams.write(sampled, sampleFile);

    final PipelineResult outcome = pipeline.done();
    if (outcome.succeeded()) {
        return 0;
    }
    return 1;
}