/**
 * Runs a two-step map-side join over the customers and orders fixtures and
 * verifies the joined output.
 *
 * <p>Customers are inner-joined with orders and each joined pair is collapsed
 * to a single string by {@code ConcatValuesFn}. That table is then inner-joined
 * with the orders table after its values have been passed through
 * {@code CapOrdersFn} (upper-cased, judging by the expected output below).
 *
 * @param pipeline    the pipeline that reads the fixtures and executes the joins
 * @param inMemory    when {@code true}, the stage-count assertion is skipped
 * @param materialize passed to the {@code MapsideJoinStrategy} constructor; when
 *                    the stage count is checked, two stages are expected if this
 *                    is {@code true} and one otherwise
 */
private void runMapsideJoin(Pipeline pipeline, boolean inMemory, boolean materialize) {
  PTable<Integer, String> customers = readTable(pipeline, "customers.txt");
  PTable<Integer, String> orders = readTable(pipeline, "orders.txt");

  JoinStrategy<Integer, String, String> strategy =
      new MapsideJoinStrategy<Integer, String, String>(materialize);

  // First join: customer name with order, flattened into one string per row.
  PTable<Integer, Pair<String, String>> customerWithOrder =
      strategy.join(customers, orders, JoinType.INNER_JOIN);
  PTable<Integer, String> concatenated =
      customerWithOrder.mapValues("concat", new ConcatValuesFn(), Writables.strings());

  // Second join: the flattened rows against the transformed order values.
  PTable<Integer, String> cappedOrders =
      orders.mapValues(new CapOrdersFn(), orders.getValueType());
  PTable<Integer, Pair<String, String>> secondJoin =
      strategy.join(concatenated, cappedOrders, JoinType.INNER_JOIN);

  List<Pair<Integer, Pair<String, String>>> expected = Lists.newArrayList();
  expected.add(Pair.of(111, Pair.of("[John Doe,Corn flakes]", "CORN FLAKES")));
  expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PAPER")));
  expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PLUNGER")));
  expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PAPER")));
  expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PLUNGER")));
  expected.add(Pair.of(333, Pair.of("[Someone Else,Toilet brush]", "TOILET BRUSH")));

  // Request materialization before running so the output is part of the plan.
  Iterable<Pair<Integer, Pair<String, String>>> materialized = secondJoin.materialize();
  PipelineResult runResult = pipeline.run();

  if (!inMemory) {
    int expectedStages;
    if (materialize) {
      expectedStages = 2;
    } else {
      expectedStages = 1;
    }
    assertEquals(expectedStages, runResult.getStageResults().size());
  }

  // Sort the actual rows so the comparison does not depend on output order.
  List<Pair<Integer, Pair<String, String>>> actual = Lists.newArrayList(materialized);
  Collections.sort(actual);
  assertEquals(expected, actual);
}
/**
 * Loads the stock CSV into the stock-dates HBase table.
 *
 * <p>Calls {@code createTable()}, reads {@code /2004_2014.csv} under
 * {@code Constants.HDFS_INPUT_PATH} as text, converts the lines to {@code Put}s
 * via {@code CrunchUtils.returnDates}, prints the collection's reported size,
 * and writes the puts to an {@code HBaseTarget} for
 * {@code Constants.STOCK_DATES_TABLE}.
 *
 * @param args command-line arguments; not read by this method
 * @return {@code 0} if the pipeline succeeded, {@code 1} otherwise
 * @throws Exception if table creation or pipeline execution fails
 */
@Override
public int run(final String[] args) throws Exception {
  createTable();

  final Pipeline crunchPipeline =
      new MRPipeline(CrunchStockDateInserter.class, "PipelineWithFilterFn", getConf());

  final String inputFile = Constants.HDFS_INPUT_PATH + "/2004_2014.csv";
  final PCollection<String> csvLines = crunchPipeline.readTextFile(inputFile);
  final PCollection<Put> datePuts = CrunchUtils.returnDates(csvLines);

  System.out.println("********** size ************ : " + datePuts.getSize());

  crunchPipeline.write(datePuts, new HBaseTarget(Constants.STOCK_DATES_TABLE));

  // done() runs any outstanding work and reports the overall outcome.
  final PipelineResult outcome = crunchPipeline.done();
  if (outcome.succeeded()) {
    return 0;
  }
  return 1;
}
public int run(String[] args) throws Exception { Pipeline pipeline = new MRPipeline(SecondarySortingExample.class); // Read input PCollection<String> lines = pipeline.readTextFile(args[0]); // Split each line and count them PTable<String, Long> wordcount = lines.parallelDo(new Tokenizer(), Writables.strings()).count(); // Sort PCollection<Pair<String, Long>> sorted = Sort.sortPairs(wordcount, ColumnOrder.by(1, Sort.Order.DESCENDING)); // Write the output sorted.write(To.textFile(args[0])); // Kick off execution PipelineResult result = pipeline.done(); return result.succeeded() ? 0 : 1; }
/**
 * Samples the input records and writes the sample to {@code sampleFile}.
 *
 * <p>Exactly one sampling mode must be selected: a positive {@code sampleSize}
 * selects {@code ReservoirSampling.sample}; otherwise a
 * {@code samplingProbability} strictly between 0 and 1 selects
 * {@code Sample.sample}.
 *
 * @param conf configuration used to create the pipeline
 * @return {@code 0} if the pipeline succeeded, {@code 1} otherwise
 * @throws IllegalArgumentException if both a size and a probability are set, or
 *         if neither yields a valid sampling mode
 * @throws Exception if pipeline execution fails
 */
@Override
public int execute(Configuration conf) throws Exception {
  Pipeline pipeline = pipelineParams.create(SampleCommand.class, conf);
  PCollection<Record> records = inputParams.getRecords(pipeline);

  final boolean sizeRequested = sampleSize > 0;
  final boolean probabilityRequested = samplingProbability > 0.0;

  if (sizeRequested && probabilityRequested) {
    throw new IllegalArgumentException("--size and --prob are mutually exclusive options.");
  }

  final PCollection<Record> sampled;
  if (sizeRequested) {
    sampled = ReservoirSampling.sample(records, sampleSize);
  } else if (probabilityRequested && samplingProbability < 1.0) {
    sampled = Sample.sample(records, samplingProbability);
  } else {
    // Neither a positive size nor a probability in (0, 1) was supplied.
    String message = String.format(
        "Invalid input args: sample size = %d, sample prob = %.4f",
        sampleSize, samplingProbability);
    throw new IllegalArgumentException(message);
  }

  outputParams.write(sampled, sampleFile);

  PipelineResult outcome = pipeline.done();
  if (outcome.succeeded()) {
    return 0;
  }
  return 1;
}