@Override
public int run(final String[] args) throws Exception {
  createTable();
  final Configuration config = getConf();
  final Pipeline pipeline = new MRPipeline(CrunchStockDateInserter.class, "PipelineWithFilterFn", config);
  // Read the raw CSV and convert each line into an HBase Put
  PCollection<String> lines = pipeline.readTextFile(Constants.HDFS_INPUT_PATH + "/2004_2014.csv");
  PCollection<Put> resultPut = CrunchUtils.returnDates(lines);
  System.out.println("********** size ************ : " + resultPut.getSize());
  // Write the Puts to the HBase table and run the pipeline
  pipeline.write(resultPut, new HBaseTarget(Constants.STOCK_DATES_TABLE));
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
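The CrunchUtils.returnDates helper is not shown in the source. A minimal sketch of what it could look like, assuming the date is the first CSV column and the target table has a "d" column family (both assumptions):

// Hypothetical sketch of CrunchUtils.returnDates; the real implementation is not
// shown. Assumes the date is the first CSV column and a "d" column family.
public static PCollection<Put> returnDates(PCollection<String> lines) {
  return lines.parallelDo(new MapFn<String, Put>() {
    @Override
    public Put map(String line) {
      String date = line.split(",")[0];
      Put put = new Put(Bytes.toBytes(date));
      put.addColumn(Bytes.toBytes("d"), Bytes.toBytes("date"), Bytes.toBytes(date));
      return put;
    }
  }, HBaseTypes.puts()); // Put PType from the crunch-hbase module
}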
public int run(String[] args) throws Exception {
  Pipeline pipeline = new MRPipeline(SecondarySortingExample.class);
  // Read input
  PCollection<String> lines = pipeline.readTextFile(args[0]);
  // Split each line into words and count them
  PTable<String, Long> wordcount = lines.parallelDo(new Tokenizer(), Writables.strings()).count();
  // Sort by the first column, descending
  PCollection<Pair<String, Long>> sorted = Sort.sortPairs(wordcount, ColumnOrder.by(1, Sort.Order.DESCENDING));
  // Write the output
  sorted.write(To.textFile(args[1]));
  // Kick off execution
  PipelineResult result = pipeline.done();
  return result.succeeded() ? 0 : 1;
}
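The Tokenizer referenced above is not defined in the source. A minimal sketch, assuming a plain whitespace split:

// Hypothetical Tokenizer; the source does not show its implementation.
// Emits one word per whitespace-separated token of the input line.
public static class Tokenizer extends DoFn<String, String> {
  @Override
  public void process(String line, Emitter<String> emitter) {
    for (String word : line.split("\\s+")) {
      if (!word.isEmpty()) {
        emitter.emit(word);
      }
    }
  }
}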
@Override
public int execute(Configuration conf) throws IOException {
  Pipeline p = pipelineParams.create(KMeansSketchCommand.class, conf);
  // Load initial centers if a path was given; otherwise seed with the first input vector
  List<Vector> initial = null;
  if (initVectorsPath != null) {
    initial = getInitialVectors(p);
  }
  PCollection<Vector> input = inputParams.getVectors(p);
  if (initial == null || initial.isEmpty()) {
    initial = Lists.newArrayList();
    initial.add(input.materialize().iterator().next());
  }
  KMeansParallel kmp = new KMeansParallel(randomParams.getRandom(), indexBits, indexSamples);
  Crossfold cf = new Crossfold(crossFolds);
  // Run the parallel k-means initialization (sketch) phase and save the weighted centers
  List<List<Weighted<Vector>>> wv = kmp.initialization(input, numIterations, samplesPerIteration, initial, cf);
  AvroIO.write(toWeightedCenters(wv), new File(outputFile));
  p.done();
  return 0;
}
@Override
public int execute(Configuration conf) throws Exception {
  Pipeline p = pipelineParams.create(SampleCommand.class, conf);
  PCollection<Record> elements = inputParams.getRecords(p);
  if (sampleSize > 0 && samplingProbability > 0.0) {
    throw new IllegalArgumentException("--size and --prob are mutually exclusive options.");
  }
  PCollection<Record> sample;
  if (sampleSize > 0) {
    // Fixed-size sample via reservoir sampling
    sample = ReservoirSampling.sample(elements, sampleSize);
  } else if (samplingProbability > 0.0 && samplingProbability < 1.0) {
    // Bernoulli sample: keep each record independently with the given probability
    sample = Sample.sample(elements, samplingProbability);
  } else {
    throw new IllegalArgumentException(String.format(
        "Invalid input args: sample size = %d, sample prob = %.4f", sampleSize, samplingProbability));
  }
  outputParams.write(sample, sampleFile);
  PipelineResult pr = p.done();
  return pr.succeeded() ? 0 : 1;
}
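Note the difference between the two branches: reservoir sampling produces a sample of exactly sampleSize elements regardless of input size, while the probability-based sample keeps each record independently, so the output size is only approximately samplingProbability times the input size. That is why the two options are mutually exclusive.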
@Override
public int execute(Configuration conf) throws Exception {
  Pipeline p = pipelineParams.create(SummaryCommand.class, conf);
  PCollection<Record> records = inputParams.getRecords(p);
  // Optionally read column specs (symbolic and ignored columns) from a header file
  Spec spec = null;
  List<Integer> symbolicColumns = Lists.newArrayList();
  List<Integer> ignoredColumns = Lists.newArrayList();
  if (headerFile != null) {
    spec = Specs.readFromHeaderFile(headerFile, ignoredColumns, symbolicColumns);
  }
  Summarizer summarizer = new Summarizer()
      .spec(spec)
      .defaultToSymbolic(false)
      .exceptionColumns(symbolicColumns)
      .ignoreColumns(ignoredColumns);
  Summary summary = summarizer.build(records).getValue();
  summaryParams.save(summary, summaryFile);
  p.done();
  return 0;
}
@After
public void tearDown() {
  pipeline.done();
}
private void runCheckpointPipeline(View<Record> inputView, View<Record> outputView) {
  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(CrunchDatasets.asSource(inputView));
  // CHECKPOINT mode reuses the target's existing contents when they are
  // newer than the source, instead of recomputing them
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.CHECKPOINT);
  pipeline.done();
}
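For context, a sketch of how a caller might obtain the two Kite views passed in above; the dataset URIs are placeholders, not from the source:

// Hypothetical caller; the "dataset:hdfs:..." URIs below are illustrative only.
View<Record> inputView = Datasets.load("dataset:hdfs:/tmp/input", Record.class);
View<Record> outputView = Datasets.load("dataset:hdfs:/tmp/output", Record.class);
runCheckpointPipeline(inputView, outputView);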