Ejemplo n.º 1
0
  /**
   * Checks that one {@code PGroupedTable} can be ungrouped twice and that each resulting table
   * can be written to its own text output directory within a single pipeline execution.
   *
   * <p>Also asserts that every stage result reported by the pipeline has a stage name and a stage
   * id longer than one character.
   */
  @Test
  public void testPGroupedTableToMultipleOutputs() throws IOException {
    Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
    String inputPath = tmpDir.copyResourceFileName("set1.txt");
    PGroupedTable<String, String> grouped =
        pipeline
            .readTextFile(inputPath)
            .by(IdentityFn.<String>getInstance(), Writables.strings())
            .groupByKey();

    // Two separate ungroupings of the same grouped table, one per output target.
    PTable<String, String> firstUngrouped = grouped.ungroup();
    PTable<String, String> secondUngrouped = grouped.ungroup();

    File firstOutputDir = tmpDir.getFile("output_a");
    File secondOutputDir = tmpDir.getFile("output_b");
    pipeline.writeTextFile(firstUngrouped, firstOutputDir.getAbsolutePath());
    pipeline.writeTextFile(secondUngrouped, secondOutputDir.getAbsolutePath());

    PipelineResult outcome = pipeline.done();
    for (StageResult stage : outcome.getStageResults()) {
      assertTrue(stage.getStageName().length() > 1);
      assertTrue(stage.getStageId().length() > 1);
    }

    // Both output directories must contain the first reducer part file.
    for (File outputDir : new File[] {firstOutputDir, secondOutputDir}) {
      assertTrue(new File(outputDir, "part-r-00000").exists());
    }
  }
Ejemplo n.º 2
0
  /**
   * Runs two chained inner joins with a {@code MapsideJoinStrategy} and compares the sorted,
   * materialized result against a fixed expected list.
   *
   * <p>The first join pairs {@code customers.txt} with {@code orders.txt} and maps the value pairs
   * through {@code ConcatValuesFn}; the second joins that result with the order values mapped
   * through {@code CapOrdersFn}.
   *
   * @param pipeline the pipeline used to read the inputs and run the joins
   * @param inMemory when {@code false}, the number of stage results is additionally asserted
   * @param materialize passed to the {@code MapsideJoinStrategy} constructor; also selects the
   *     expected stage count (2 when {@code true}, 1 otherwise)
   */
  private void runMapsideJoin(Pipeline pipeline, boolean inMemory, boolean materialize) {
    PTable<Integer, String> customers = readTable(pipeline, "customers.txt");
    PTable<Integer, String> orders = readTable(pipeline, "orders.txt");

    JoinStrategy<Integer, String, String> strategy =
        new MapsideJoinStrategy<Integer, String, String>(materialize);

    // First join: customers x orders, with each value pair concatenated into one string.
    PTable<Integer, Pair<String, String>> customerOrderPairs =
        strategy.join(customers, orders, JoinType.INNER_JOIN);
    PTable<Integer, String> concatenated =
        customerOrderPairs.mapValues("concat", new ConcatValuesFn(), Writables.strings());

    // Second join: the concatenated table against the transformed order values.
    PTable<Integer, String> cappedOrders =
        orders.mapValues(new CapOrdersFn(), orders.getValueType());
    PTable<Integer, Pair<String, String>> joined =
        strategy.join(concatenated, cappedOrders, JoinType.INNER_JOIN);

    Iterable<Pair<Integer, Pair<String, String>>> materialized = joined.materialize();

    PipelineResult runResult = pipeline.run();
    if (!inMemory) {
      int expectedStages;
      if (materialize) {
        expectedStages = 2;
      } else {
        expectedStages = 1;
      }
      assertEquals(expectedStages, runResult.getStageResults().size());
    }

    List<Pair<Integer, Pair<String, String>>> actual = Lists.newArrayList(materialized);
    Collections.sort(actual);

    List<Pair<Integer, Pair<String, String>>> expected = Lists.newArrayList();
    expected.add(Pair.of(111, Pair.of("[John Doe,Corn flakes]", "CORN FLAKES")));
    expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PAPER")));
    expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PLUNGER")));
    expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PAPER")));
    expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PLUNGER")));
    expected.add(Pair.of(333, Pair.of("[Someone Else,Toilet brush]", "TOILET BRUSH")));

    assertEquals(expected, actual);
  }
Ejemplo n.º 3
0
 /**
  * Reads the named resource file as text and converts it into an integer-to-string table by
  * running {@code LineSplitter} over its lines.
  *
  * @param pipeline the pipeline to read the file with
  * @param filename resource file name, copied via {@code tmpDir} before being read
  * @return the table produced by the {@code "asTable"} step
  * @throws RuntimeException wrapping any {@link IOException} raised while preparing the table
  */
 private PTable<Integer, String> readTable(Pipeline pipeline, String filename) {
   try {
     String localCopy = tmpDir.copyResourceFileName(filename);
     return pipeline
         .readTextFile(localCopy)
         .parallelDo(
             "asTable",
             new LineSplitter(),
             Writables.tableOf(Writables.ints(), Writables.strings()));
   } catch (IOException ioFailure) {
     // Callers are not expected to handle I/O failures; surface them unchecked.
     throw new RuntimeException(ioFailure);
   }
 }
Ejemplo n.º 4
0
 /**
  * Reads {@code input} as text, runs {@code CONFIG_FN} with {@code KEY} set to {@code expected}
  * in the step's options, and fails if any materialized value differs from {@code expected}.
  * The pipeline is finished with {@code done()} afterwards.
  *
  * @param p the pipeline to run the step on
  * @param input path of the text file to read
  * @param expected the configuration value to set and the value every output must equal
  */
 private static void run(Pipeline p, String input, String expected) throws Exception {
   ParallelDoOptions stepOptions = ParallelDoOptions.builder().conf(KEY, expected).build();
   Iterable<String> outputs =
       p.read(From.textFile(input))
           .parallelDo("conf", CONFIG_FN, Writables.strings(), stepOptions)
           .materialize();
   for (String value : outputs) {
     if (expected.equals(value)) {
       continue;
     }
     Assert.fail("Unexpected value: " + value);
   }
   p.done();
 }
  /**
   * Reads text from the input path, runs each line through {@code Tokenizer}, counts the emitted
   * values, sorts the counts by column 1 in descending order, and writes the sorted pairs as text
   * to the output path.
   *
   * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
   * @return 0 if the pipeline succeeded; 1 if it failed or if fewer than two arguments were given
   * @throws Exception if pipeline construction or execution fails
   */
  public int run(String[] args) throws Exception {
    // Both an input and a distinct output location are required.
    if (args == null || args.length < 2) {
      System.err.println("Usage: <input path> <output path>");
      return 1;
    }

    Pipeline pipeline = new MRPipeline(SecondarySortingExample.class);
    // Read input
    PCollection<String> lines = pipeline.readTextFile(args[0]);
    // Split each line and count them
    PTable<String, Long> wordcount = lines.parallelDo(new Tokenizer(), Writables.strings()).count();
    // Sort
    PCollection<Pair<String, Long>> sorted =
        Sort.sortPairs(wordcount, ColumnOrder.by(1, Sort.Order.DESCENDING));
    // Write the output to the output path (args[1]), never back onto the input path (args[0]).
    sorted.write(To.textFile(args[1]));
    // Kick off execution
    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;
  }
Ejemplo n.º 6
0
  /**
   * Entry point: reads text from {@code args[0]}, splits each line on runs of whitespace, counts
   * the resulting tokens, writes the counts as text to {@code args[1]}, and runs the pipeline.
   *
   * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
   */
  public static void main(String[] args) throws Exception {
    Pipeline pipeline = new MRPipeline(WordCount.class);
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Emits every whitespace-separated token of a line.
    DoFn<String, String> splitter =
        new DoFn<String, String>() {
          public void process(String line, Emitter<String> emitter) {
            String[] tokens = line.split("\\s+");
            for (int i = 0; i < tokens.length; i++) {
              emitter.emit(tokens[i]);
            }
          }
        };
    PCollection<String> words = lines.parallelDo("my splitter", splitter, Writables.strings());

    PTable<String, Long> counts = Aggregate.count(words);
    pipeline.writeTextFile(counts, args[1]);

    pipeline.run();
  }