Example #1
0
 @Test
 public void testRun() throws Exception {
   run(
       new MRPipeline(ConfigurationIT.class, tmpDir.getDefaultConfiguration()),
       tmpDir.copyResourceFileName("set1.txt"),
       "testapalooza");
 }
Example #2
0
  @Test
  public void unionWriteShouldNotThrowNPE() throws IOException {
    String outputPath1 = tmpDir.getFileName("output1");
    String outputPath2 = tmpDir.getFileName("output2");
    String outputPath3 = tmpDir.getFileName("output3");

    if (typeFamily == AvroTypeFamily.getInstance()) {
      union.write(To.avroFile(outputPath1));
      pipeline.write(union, To.avroFile(outputPath2));

      pipeline.run();

      checkFileContents(outputPath1);
      checkFileContents(outputPath2);

    } else {

      union.write(To.textFile(outputPath1));
      pipeline.write(union, To.textFile(outputPath2));
      pipeline.writeTextFile(union, outputPath3);

      pipeline.run();

      checkFileContents(outputPath1);
      checkFileContents(outputPath2);
      checkFileContents(outputPath3);
    }
  }
Example #3
0
  @Before
  @SuppressWarnings("unchecked")
  public void setUp() throws IOException {
    String inputFile1 = tmpDir.copyResourceFileName("set1.txt");
    String inputFile2 = tmpDir.copyResourceFileName("set2.txt");
    if (pipelineClass == null) {
      pipeline = MemPipeline.getInstance();
    } else {
      pipeline = new MRPipeline(pipelineClass, tmpDir.getDefaultConfiguration());
    }
    PCollection<String> firstCollection =
        pipeline.read(At.textFile(inputFile1, typeFamily.strings()));
    PCollection<String> secondCollection =
        pipeline.read(At.textFile(inputFile2, typeFamily.strings()));

    LOG.info(
        "Test fixture: ["
            + pipeline.getClass().getSimpleName()
            + " : "
            + typeFamily.getClass().getSimpleName()
            + "]  First: "
            + Lists.newArrayList(firstCollection.materialize().iterator())
            + ", Second: "
            + Lists.newArrayList(secondCollection.materialize().iterator()));

    union = secondCollection.union(firstCollection);
  }
Example #4
0
  @Test
  public void testPGroupedTableToMultipleOutputs() throws IOException {
    Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
    PGroupedTable<String, String> groupedLineTable =
        pipeline
            .readTextFile(tmpDir.copyResourceFileName("set1.txt"))
            .by(IdentityFn.<String>getInstance(), Writables.strings())
            .groupByKey();

    PTable<String, String> ungroupedTableA = groupedLineTable.ungroup();
    PTable<String, String> ungroupedTableB = groupedLineTable.ungroup();

    File outputDirA = tmpDir.getFile("output_a");
    File outputDirB = tmpDir.getFile("output_b");

    pipeline.writeTextFile(ungroupedTableA, outputDirA.getAbsolutePath());
    pipeline.writeTextFile(ungroupedTableB, outputDirB.getAbsolutePath());
    PipelineResult result = pipeline.done();
    for (StageResult stageResult : result.getStageResults()) {
      assertTrue(stageResult.getStageName().length() > 1);
      assertTrue(stageResult.getStageId().length() > 1);
    }

    // Verify that output from a single PGroupedTable can be sent to multiple collections
    assertTrue(new File(outputDirA, "part-r-00000").exists());
    assertTrue(new File(outputDirB, "part-r-00000").exists());
  }
Example #5
0
 @Test
 public void materializedColShouldBeWritten() throws Exception {
   File textFile = tmpDir.copyResourceFile("shakes.txt");
   Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
   PCollection<String> genericCollection = pipeline.readTextFile(textFile.getAbsolutePath());
   pipeline.run();
   PCollection<String> filter =
       genericCollection.filter("Filtering data", FilterFns.<String>ACCEPT_ALL());
   filter.materialize();
   pipeline.run();
   File file = tmpDir.getFile("output.txt");
   Target outFile = To.textFile(file.getAbsolutePath());
   PCollection<String> write = filter.write(outFile);
   write.materialize();
   pipeline.run();
 }
Example #6
0
 @Test
 public void testMapsideJoin_LeftOuterJoin() throws IOException {
   runMapsideLeftOuterJoin(
       new MRPipeline(MapsideJoinStrategyIT.class, tmpDir.getDefaultConfiguration()),
       false,
       false);
 }
Example #7
0
 private PTable<Integer, String> readTable(Pipeline pipeline, String filename) {
   try {
     return pipeline
         .readTextFile(tmpDir.copyResourceFileName(filename))
         .parallelDo(
             "asTable",
             new LineSplitter(),
             Writables.tableOf(Writables.ints(), Writables.strings()));
   } catch (IOException e) {
     throw new RuntimeException(e);
   }
 }
Example #8
0
  @Test
  public void testMapsideJoin_RightSideIsEmpty() throws IOException {
    MRPipeline pipeline =
        new MRPipeline(MapsideJoinStrategyIT.class, tmpDir.getDefaultConfiguration());
    PTable<Integer, String> customerTable = readTable(pipeline, "customers.txt");
    PTable<Integer, String> orderTable = readTable(pipeline, "orders.txt");

    PTable<Integer, String> filteredOrderTable =
        orderTable.parallelDo(
            FilterFns.<Pair<Integer, String>>REJECT_ALL(), orderTable.getPTableType());

    JoinStrategy<Integer, String, String> mapsideJoin =
        new MapsideJoinStrategy<Integer, String, String>();
    PTable<Integer, Pair<String, String>> joined =
        mapsideJoin.join(customerTable, filteredOrderTable, JoinType.INNER_JOIN);

    List<Pair<Integer, Pair<String, String>>> materializedJoin =
        Lists.newArrayList(joined.materialize());

    assertTrue(materializedJoin.isEmpty());
  }
Example #9
0
 private PCollection<String> getPCollection(Pipeline pipeline) throws IOException {
   String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
   PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
   return shakespeare;
 }
Example #10
0
 @Test
 public void testAsCollectionMRPipeline() throws IOException {
   runAsCollection(new MRPipeline(CollectionPObjectIT.class, tmpDir.getDefaultConfiguration()));
 }