@Test public void testPGroupedTableToMultipleOutputs() throws IOException { Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration()); PGroupedTable<String, String> groupedLineTable = pipeline .readTextFile(tmpDir.copyResourceFileName("set1.txt")) .by(IdentityFn.<String>getInstance(), Writables.strings()) .groupByKey(); PTable<String, String> ungroupedTableA = groupedLineTable.ungroup(); PTable<String, String> ungroupedTableB = groupedLineTable.ungroup(); File outputDirA = tmpDir.getFile("output_a"); File outputDirB = tmpDir.getFile("output_b"); pipeline.writeTextFile(ungroupedTableA, outputDirA.getAbsolutePath()); pipeline.writeTextFile(ungroupedTableB, outputDirB.getAbsolutePath()); PipelineResult result = pipeline.done(); for (StageResult stageResult : result.getStageResults()) { assertTrue(stageResult.getStageName().length() > 1); assertTrue(stageResult.getStageId().length() > 1); } // Verify that output from a single PGroupedTable can be sent to multiple collections assertTrue(new File(outputDirA, "part-r-00000").exists()); assertTrue(new File(outputDirB, "part-r-00000").exists()); }
/**
 * Joins customers with orders using a {@code MapsideJoinStrategy}, then joins that
 * result with a second table derived from the orders via {@code CapOrdersFn},
 * and verifies the materialized output against a fixed expected list.
 *
 * @param pipeline    pipeline used to read the inputs and run the joins
 * @param inMemory    when {@code true}, the stage-count assertion is skipped
 * @param materialize flag passed to the {@code MapsideJoinStrategy} constructor; also
 *                    selects the expected number of stages (2 when set, otherwise 1)
 */
private void runMapsideJoin(Pipeline pipeline, boolean inMemory, boolean materialize) {
  PTable<Integer, String> customers = readTable(pipeline, "customers.txt");
  PTable<Integer, String> orders = readTable(pipeline, "orders.txt");

  JoinStrategy<Integer, String, String> strategy =
      new MapsideJoinStrategy<Integer, String, String>(materialize);

  // First join: customer x order, with both values concatenated into one string.
  PTable<Integer, String> customerOrders =
      strategy
          .join(customers, orders, JoinType.INNER_JOIN)
          .mapValues("concat", new ConcatValuesFn(), Writables.strings());

  // Second input: the order values transformed by CapOrdersFn.
  PTable<Integer, String> cappedOrders =
      orders.mapValues(new CapOrdersFn(), orders.getValueType());

  PTable<Integer, Pair<String, String>> secondJoin =
      strategy.join(customerOrders, cappedOrders, JoinType.INNER_JOIN);

  Iterable<Pair<Integer, Pair<String, String>>> materialized = secondJoin.materialize();
  PipelineResult runResult = pipeline.run();

  if (!inMemory) {
    int expectedStages;
    if (materialize) {
      expectedStages = 2;
    } else {
      expectedStages = 1;
    }
    assertEquals(expectedStages, runResult.getStageResults().size());
  }

  List<Pair<Integer, Pair<String, String>>> actual = Lists.newArrayList(materialized);
  Collections.sort(actual);

  // Expected rows, listed in the order the sorted actual result must have.
  List<Pair<Integer, Pair<String, String>>> expected = Lists.newArrayList();
  expected.add(Pair.of(111, Pair.of("[John Doe,Corn flakes]", "CORN FLAKES")));
  expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PAPER")));
  expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PLUNGER")));
  expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PAPER")));
  expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PLUNGER")));
  expected.add(Pair.of(333, Pair.of("[Someone Else,Toilet brush]", "TOILET BRUSH")));

  assertEquals(expected, actual);
}
/**
 * Copies the named test resource into the temporary directory, reads it as text,
 * and converts each line into an (Integer, String) table entry via {@code LineSplitter}.
 *
 * @param pipeline pipeline to read the file with
 * @param filename name of the resource to copy and read
 * @return the table produced from the file's lines
 * @throws RuntimeException wrapping any {@link IOException} raised while preparing the input
 */
private PTable<Integer, String> readTable(Pipeline pipeline, String filename) {
  try {
    String copiedPath = tmpDir.copyResourceFileName(filename);
    return pipeline
        .readTextFile(copiedPath)
        .parallelDo(
            "asTable",
            new LineSplitter(),
            Writables.tableOf(Writables.ints(), Writables.strings()));
  } catch (IOException ioFailure) {
    // Callers are not declared to throw checked exceptions; rethrow unchecked with the cause.
    throw new RuntimeException(ioFailure);
  }
}
/**
 * Reads {@code input} as text, applies {@code CONFIG_FN} with {@code KEY} set to
 * {@code expected} in the per-DoFn configuration, and fails if any materialized
 * value differs from {@code expected}. The pipeline is finished with {@code done()}
 * once all values have been checked.
 *
 * @param p        pipeline to execute
 * @param input    path of the text input
 * @param expected value stored under {@code KEY} and expected for every output element
 * @throws Exception if reading or executing the pipeline fails
 */
private static void run(Pipeline p, String input, String expected) throws Exception {
  ParallelDoOptions options = ParallelDoOptions.builder().conf(KEY, expected).build();

  Iterable<String> values =
      p.read(From.textFile(input))
          .parallelDo("conf", CONFIG_FN, Writables.strings(), options)
          .materialize();

  for (String value : values) {
    if (expected.equals(value)) {
      continue;
    }
    Assert.fail("Unexpected value: " + value);
  }

  p.done();
}
public int run(String[] args) throws Exception { Pipeline pipeline = new MRPipeline(SecondarySortingExample.class); // Read input PCollection<String> lines = pipeline.readTextFile(args[0]); // Split each line and count them PTable<String, Long> wordcount = lines.parallelDo(new Tokenizer(), Writables.strings()).count(); // Sort PCollection<Pair<String, Long>> sorted = Sort.sortPairs(wordcount, ColumnOrder.by(1, Sort.Order.DESCENDING)); // Write the output sorted.write(To.textFile(args[0])); // Kick off execution PipelineResult result = pipeline.done(); return result.succeeded() ? 0 : 1; }
/**
 * Word-count entry point: reads text from {@code args[0]}, splits each line on
 * whitespace, counts the occurrences of each word, and writes the counts as text
 * to {@code args[1]}.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
 * @throws IllegalStateException if the pipeline reports that it did not succeed
 * @throws Exception if the pipeline cannot be constructed or executed
 */
public static void main(String[] args) throws Exception {
  Pipeline pipeline = new MRPipeline(WordCount.class);
  PCollection<String> lines = pipeline.readTextFile(args[0]);

  PCollection<String> words =
      lines.parallelDo(
          "my splitter",
          new DoFn<String, String>() {
            @Override
            public void process(String line, Emitter<String> emitter) {
              for (String word : line.split("\\s+")) {
                emitter.emit(word);
              }
            }
          },
          Writables.strings());

  PTable<String, Long> counts = Aggregate.count(words);
  pipeline.writeTextFile(counts, args[1]);

  // Finish with done() instead of run() so the pipeline is finalized after the last job
  // (see the Crunch Pipeline API docs), and surface a failed run rather than ignoring it.
  if (!pipeline.done().succeeded()) {
    throw new IllegalStateException("WordCount pipeline failed");
  }
}