/**
 * Invokes {@code run} with an {@link MRPipeline} built for {@code ConfigurationIT}, the path
 * returned by {@code tmpDir.copyResourceFileName("set1.txt")}, and the literal
 * {@code "testapalooza"}.
 *
 * @throws Exception if the pipeline setup or the {@code run} helper fails
 */
@Test
public void testRun() throws Exception {
  // The pipeline is constructed before the resource path is resolved, as in the call-site order.
  MRPipeline mrPipeline = new MRPipeline(ConfigurationIT.class, tmpDir.getDefaultConfiguration());
  String inputPath = tmpDir.copyResourceFileName("set1.txt");
  run(mrPipeline, inputPath, "testapalooza");
}
/**
 * Writes the fixture's {@code union} collection to several targets, runs the pipeline, and
 * hands every output path to {@code checkFileContents}.
 *
 * <p>When {@code typeFamily} is the Avro type family, two Avro-file targets are used. Otherwise
 * two text-file targets plus {@code pipeline.writeTextFile} are used.
 *
 * @throws IOException if writing, running, or checking the outputs fails
 */
@Test
public void unionWriteShouldNotThrowNPE() throws IOException {
  String firstOut = tmpDir.getFileName("output1");
  String secondOut = tmpDir.getFileName("output2");
  String thirdOut = tmpDir.getFileName("output3");

  boolean usesAvro = typeFamily == AvroTypeFamily.getInstance();
  if (!usesAvro) {
    // Text path: exercise all three ways of attaching a text output to the union.
    union.write(To.textFile(firstOut));
    pipeline.write(union, To.textFile(secondOut));
    pipeline.writeTextFile(union, thirdOut);
    pipeline.run();

    checkFileContents(firstOut);
    checkFileContents(secondOut);
    checkFileContents(thirdOut);
    return;
  }

  // Avro path: only the two target-based writes are exercised.
  union.write(To.avroFile(firstOut));
  pipeline.write(union, To.avroFile(secondOut));
  pipeline.run();

  checkFileContents(firstOut);
  checkFileContents(secondOut);
}
/**
 * Prepares the fixture before each test: selects the pipeline implementation, reads
 * {@code set1.txt} and {@code set2.txt} as string collections, logs their materialized
 * contents, and stores the union of the second collection with the first in {@code union}.
 *
 * @throws IOException if a resource file cannot be prepared
 */
@Before
@SuppressWarnings("unchecked")
public void setUp() throws IOException {
  String set1Path = tmpDir.copyResourceFileName("set1.txt");
  String set2Path = tmpDir.copyResourceFileName("set2.txt");

  // A null pipelineClass selects the in-memory pipeline; anything else selects MapReduce.
  if (pipelineClass != null) {
    pipeline = new MRPipeline(pipelineClass, tmpDir.getDefaultConfiguration());
  } else {
    pipeline = MemPipeline.getInstance();
  }

  PCollection<String> first = pipeline.read(At.textFile(set1Path, typeFamily.strings()));
  PCollection<String> second = pipeline.read(At.textFile(set2Path, typeFamily.strings()));

  String pipelineName = pipeline.getClass().getSimpleName();
  String typeFamilyName = typeFamily.getClass().getSimpleName();
  // Both collections are materialized here so their contents appear in the log line.
  LOG.info(
      "Test fixture: ["
          + pipelineName
          + " : "
          + typeFamilyName
          + "] First: "
          + Lists.newArrayList(first.materialize().iterator())
          + ", Second: "
          + Lists.newArrayList(second.materialize().iterator()));

  union = second.union(first);
}
@Test public void testPGroupedTableToMultipleOutputs() throws IOException { Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration()); PGroupedTable<String, String> groupedLineTable = pipeline .readTextFile(tmpDir.copyResourceFileName("set1.txt")) .by(IdentityFn.<String>getInstance(), Writables.strings()) .groupByKey(); PTable<String, String> ungroupedTableA = groupedLineTable.ungroup(); PTable<String, String> ungroupedTableB = groupedLineTable.ungroup(); File outputDirA = tmpDir.getFile("output_a"); File outputDirB = tmpDir.getFile("output_b"); pipeline.writeTextFile(ungroupedTableA, outputDirA.getAbsolutePath()); pipeline.writeTextFile(ungroupedTableB, outputDirB.getAbsolutePath()); PipelineResult result = pipeline.done(); for (StageResult stageResult : result.getStageResults()) { assertTrue(stageResult.getStageName().length() > 1); assertTrue(stageResult.getStageId().length() > 1); } // Verify that output from a single PGroupedTable can be sent to multiple collections assertTrue(new File(outputDirA, "part-r-00000").exists()); assertTrue(new File(outputDirB, "part-r-00000").exists()); }
/**
 * Runs a pipeline three times around a filtered collection: once after reading
 * {@code shakes.txt}, once after materializing an accept-all filter of it, and once after
 * writing that filtered collection to a text file and materializing the write result.
 *
 * <p>The test contains no explicit assertions; it passes when none of the steps throws.
 *
 * @throws Exception if any pipeline step fails
 */
@Test
public void materializedColShouldBeWritten() throws Exception {
  File shakesFile = tmpDir.copyResourceFile("shakes.txt");
  Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());

  PCollection<String> lines = pipeline.readTextFile(shakesFile.getAbsolutePath());
  pipeline.run();

  PCollection<String> accepted = lines.filter("Filtering data", FilterFns.<String>ACCEPT_ALL());
  accepted.materialize();
  pipeline.run();

  // Write the already-materialized collection, then materialize the written collection too.
  File outputFile = tmpDir.getFile("output.txt");
  accepted.write(To.textFile(outputFile.getAbsolutePath())).materialize();
  pipeline.run();
}
/**
 * Runs {@code runMapsideLeftOuterJoin} on an {@link MRPipeline} built for
 * {@code MapsideJoinStrategyIT}, passing {@code false} for both boolean arguments.
 *
 * @throws IOException if the helper fails with an I/O error
 */
@Test
public void testMapsideJoin_LeftOuterJoin() throws IOException {
  MRPipeline mrPipeline =
      new MRPipeline(MapsideJoinStrategyIT.class, tmpDir.getDefaultConfiguration());
  runMapsideLeftOuterJoin(mrPipeline, false, false);
}
/**
 * Reads the named resource as text through the given pipeline and converts it to an
 * {@code Integer}-to-{@code String} table using {@code LineSplitter}.
 *
 * @param pipeline the pipeline used to read the file
 * @param filename the resource name passed to {@code tmpDir.copyResourceFileName}
 * @return the table produced by the {@code "asTable"} step
 * @throws RuntimeException wrapping any {@link IOException} raised while building the table
 */
private PTable<Integer, String> readTable(Pipeline pipeline, String filename) {
  try {
    String inputPath = tmpDir.copyResourceFileName(filename);
    return pipeline
        .readTextFile(inputPath)
        .parallelDo(
            "asTable",
            new LineSplitter(),
            Writables.tableOf(Writables.ints(), Writables.strings()));
  } catch (IOException ioFailure) {
    // Callers are not expected to handle the checked exception; rethrow with the cause kept.
    throw new RuntimeException(ioFailure);
  }
}
/**
 * Inner-joins the customers table against an orders table filtered with
 * {@code REJECT_ALL}, using {@code MapsideJoinStrategy}, and asserts that the materialized
 * join result is empty.
 *
 * @throws IOException declared for I/O failures during the test
 */
@Test
public void testMapsideJoin_RightSideIsEmpty() throws IOException {
  MRPipeline pipeline =
      new MRPipeline(MapsideJoinStrategyIT.class, tmpDir.getDefaultConfiguration());

  PTable<Integer, String> customers = readTable(pipeline, "customers.txt");
  PTable<Integer, String> orders = readTable(pipeline, "orders.txt");

  // Filter every record out of the orders table before it is used as the right side.
  PTable<Integer, String> noOrders =
      orders.parallelDo(FilterFns.<Pair<Integer, String>>REJECT_ALL(), orders.getPTableType());

  JoinStrategy<Integer, String, String> strategy =
      new MapsideJoinStrategy<Integer, String, String>();
  PTable<Integer, Pair<String, String>> joinResult =
      strategy.join(customers, noOrders, JoinType.INNER_JOIN);

  List<Pair<Integer, Pair<String, String>>> joinedRows =
      Lists.newArrayList(joinResult.materialize());
  assertTrue(joinedRows.isEmpty());
}
/**
 * Reads {@code shakes.txt} as a text collection through the given pipeline.
 *
 * @param pipeline the pipeline used to read the file
 * @return the collection returned by {@code pipeline.readTextFile}
 * @throws IOException if the resource path cannot be prepared
 */
private PCollection<String> getPCollection(Pipeline pipeline) throws IOException {
  return pipeline.readTextFile(tmpDir.copyResourceFileName("shakes.txt"));
}
/**
 * Runs {@code runAsCollection} on an {@link MRPipeline} built for
 * {@code CollectionPObjectIT}.
 *
 * @throws IOException if the helper fails with an I/O error
 */
@Test
public void testAsCollectionMRPipeline() throws IOException {
  MRPipeline mrPipeline =
      new MRPipeline(CollectionPObjectIT.class, tmpDir.getDefaultConfiguration());
  runAsCollection(mrPipeline);
}