Exemplo n.º 1
0
  /**
   * Runs a two-step map-side join over the customer and order fixtures and verifies the output.
   *
   * <p>Customers are first inner-joined with orders and each joined value pair is turned into a
   * single string by {@code ConcatValuesFn}. That table is then inner-joined with the orders table
   * after its values have passed through {@code CapOrdersFn} (presumably upper-casing them, judging
   * by the expected rows; confirm against that function). The materialized result is sorted and
   * compared with the expected rows.
   *
   * @param pipeline the pipeline used to read the fixtures and execute the joins
   * @param inMemory when {@code true}, the stage-count assertion is skipped
   * @param materialize forwarded to the {@code MapsideJoinStrategy} constructor; also selects the
   *     expected number of stage results (2 when {@code true}, 1 otherwise)
   */
  private void runMapsideJoin(Pipeline pipeline, boolean inMemory, boolean materialize) {
    // Rows the second join is expected to yield, listed in the order they have after sorting.
    List<Pair<Integer, Pair<String, String>>> expectedRows = Lists.newArrayList();
    expectedRows.add(Pair.of(111, Pair.of("[John Doe,Corn flakes]", "CORN FLAKES")));
    expectedRows.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PAPER")));
    expectedRows.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PLUNGER")));
    expectedRows.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PAPER")));
    expectedRows.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PLUNGER")));
    expectedRows.add(Pair.of(333, Pair.of("[Someone Else,Toilet brush]", "TOILET BRUSH")));

    PTable<Integer, String> customers = readTable(pipeline, "customers.txt");
    PTable<Integer, String> orders = readTable(pipeline, "orders.txt");

    JoinStrategy<Integer, String, String> strategy =
        new MapsideJoinStrategy<Integer, String, String>(materialize);

    // First join: customer x order, with each value pair concatenated into one string.
    PTable<Integer, Pair<String, String>> customerOrderPairs =
        strategy.join(customers, orders, JoinType.INNER_JOIN);
    PTable<Integer, String> concatenated =
        customerOrderPairs.mapValues("concat", new ConcatValuesFn(), Writables.strings());

    // Second join: the concatenated table against the transformed order values.
    PTable<Integer, String> cappedOrders =
        orders.mapValues(new CapOrdersFn(), orders.getValueType());
    PTable<Integer, Pair<String, String>> secondJoin =
        strategy.join(concatenated, cappedOrders, JoinType.INNER_JOIN);

    // Request materialization before the explicit run, as the original flow does.
    Iterable<Pair<Integer, Pair<String, String>>> materialized = secondJoin.materialize();

    PipelineResult result = pipeline.run();
    if (!inMemory) {
      int expectedStages = materialize ? 2 : 1;
      assertEquals(expectedStages, result.getStageResults().size());
    }

    // Sort the output so the comparison does not depend on the order rows were emitted in.
    List<Pair<Integer, Pair<String, String>>> actualRows = Lists.newArrayList(materialized);
    Collections.sort(actualRows);

    assertEquals(expectedRows, actualRows);
  }
Exemplo n.º 2
0
  /**
   * Verifies that an inner map-side join produces no rows when the right-hand table is empty.
   *
   * <p>The right side is built by running the orders table through {@code FilterFns.REJECT_ALL()};
   * the join against the customers table is then materialized and asserted to be empty.
   *
   * @throws IOException declared by the test signature; no call visible here is known to throw it
   */
  @Test
  public void testMapsideJoin_RightSideIsEmpty() throws IOException {
    MRPipeline mrPipeline =
        new MRPipeline(MapsideJoinStrategyIT.class, tmpDir.getDefaultConfiguration());

    PTable<Integer, String> customers = readTable(mrPipeline, "customers.txt");
    PTable<Integer, String> orders = readTable(mrPipeline, "orders.txt");

    // Filter out every order so the right side of the join has no rows.
    PTable<Integer, String> noOrders =
        orders.parallelDo(FilterFns.<Pair<Integer, String>>REJECT_ALL(), orders.getPTableType());

    JoinStrategy<Integer, String, String> strategy =
        new MapsideJoinStrategy<Integer, String, String>();
    PTable<Integer, Pair<String, String>> joinResult =
        strategy.join(customers, noOrders, JoinType.INNER_JOIN);

    Iterable<Pair<Integer, Pair<String, String>>> materialized = joinResult.materialize();
    List<Pair<Integer, Pair<String, String>>> rows = Lists.newArrayList(materialized);

    assertTrue(rows.isEmpty());
  }