@Test
  public void testCrossWithLarge() {
    // construct the plan
    FileDataSource source1 = new FileDataSource(new DummyInputFormat(), IN_FILE, "Source 1");
    FileDataSource source2 = new FileDataSource(new DummyInputFormat(), IN_FILE, "Source 2");

    CrossOperator cross =
        CrossWithLargeOperator.builder(new DummyCrossStub())
            .input1(source1)
            .input2(source2)
            .name("Cross")
            .build();

    FileDataSink sink = new FileDataSink(new DummyOutputFormat(), OUT_FILE, cross, "Sink");

    Plan plan = new Plan(sink);
    plan.setDefaultParallelism(DEFAULT_PARALLELISM);

    try {
      OptimizedPlan oPlan = compileNoStats(plan);
      OptimizerPlanNodeResolver resolver = new OptimizerPlanNodeResolver(oPlan);

      DualInputPlanNode crossPlanNode = resolver.getNode("Cross");
      Channel in1 = crossPlanNode.getInput1();
      Channel in2 = crossPlanNode.getInput2();

      // CrossWithLargeOperator hints that the second input is large, so the optimizer
      // should broadcast the (small) first input and forward the second one
      assertEquals(ShipStrategyType.BROADCAST, in1.getShipStrategy());
      assertEquals(ShipStrategyType.FORWARD, in2.getShipStrategy());
    } catch (CompilerException ce) {
      ce.printStackTrace();
      fail("The pact compiler is unable to compile this plan correctly.");
    }
  }
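For contrast, a mirrored check for the CrossWithSmallOperator hint could look like the sketch below. It is not part of the original test class; it reuses the same dummy formats, stub, and helpers assumed above and simply expects the two ship strategies to be swapped, because here the second input is the one hinted as small.

  @Test
  public void testCrossWithSmall() {
    // construct the same plan, but hint that the SECOND input is the small one
    FileDataSource source1 = new FileDataSource(new DummyInputFormat(), IN_FILE, "Source 1");
    FileDataSource source2 = new FileDataSource(new DummyInputFormat(), IN_FILE, "Source 2");

    CrossOperator cross =
        CrossWithSmallOperator.builder(new DummyCrossStub())
            .input1(source1)
            .input2(source2)
            .name("Cross")
            .build();

    FileDataSink sink = new FileDataSink(new DummyOutputFormat(), OUT_FILE, cross, "Sink");

    Plan plan = new Plan(sink);
    plan.setDefaultParallelism(DEFAULT_PARALLELISM);

    try {
      OptimizedPlan oPlan = compileNoStats(plan);
      OptimizerPlanNodeResolver resolver = new OptimizerPlanNodeResolver(oPlan);

      DualInputPlanNode crossPlanNode = resolver.getNode("Cross");

      // expected mirror image of the assertions above: forward the large first input,
      // broadcast the small second one
      assertEquals(ShipStrategyType.FORWARD, crossPlanNode.getInput1().getShipStrategy());
      assertEquals(ShipStrategyType.BROADCAST, crossPlanNode.getInput2().getShipStrategy());
    } catch (CompilerException ce) {
      ce.printStackTrace();
      fail("The pact compiler is unable to compile this plan correctly.");
    }
  }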
Example #2
  @Override
  public Plan getPlan(String... args) {
    // parse job parameters
    int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    String dataPointInput = (args.length > 1 ? args[1] : "");
    String clusterInput = (args.length > 2 ? args[2] : "");
    String output = (args.length > 3 ? args[3] : "");

    // create DataSourceContract for data point input
    @SuppressWarnings("unchecked")
    FileDataSource pointsSource =
        new FileDataSource(
            new CsvInputFormat(
                '|', IntValue.class, DoubleValue.class, DoubleValue.class, DoubleValue.class),
            dataPointInput,
            "Data Points");

    // create DataSourceContract for cluster center input
    @SuppressWarnings("unchecked")
    FileDataSource clustersSource =
        new FileDataSource(
            new CsvInputFormat(
                '|', IntValue.class, DoubleValue.class, DoubleValue.class, DoubleValue.class),
            clusterInput,
            "Centers");

    MapOperator dataPoints =
        MapOperator.builder(new PointBuilder())
            .name("Build data points")
            .input(pointsSource)
            .build();

    MapOperator clusterPoints =
        MapOperator.builder(new PointBuilder())
            .name("Build cluster points")
            .input(clustersSource)
            .build();

    // the mapper computes the distance of each data point to all cluster centers,
    // which it draws from a broadcast variable
    MapOperator findNearestClusterCenters =
        MapOperator.builder(new SelectNearestCenter())
            .setBroadcastVariable("centers", clusterPoints)
            .input(dataPoints)
            .name("Find Nearest Centers")
            .build();

    // the reducer recomputes the cluster centers as the average of all associated data points
    ReduceOperator recomputeClusterCenter =
        ReduceOperator.builder(new RecomputeClusterCenter(), IntValue.class, 0)
            .input(findNearestClusterCenters)
            .name("Recompute Center Positions")
            .build();

    // create DataSinkContract for writing the new cluster positions
    FileDataSink newClusterPoints =
        new FileDataSink(
            new PointOutFormat(), output, recomputeClusterCenter, "New Center Positions");

    // return the plan
    Plan plan = new Plan(newClusterPoints, "KMeans Iteration");
    plan.setDefaultParallelism(numSubTasks);
    return plan;
  }
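The user-defined functions referenced in this plan (PointBuilder, SelectNearestCenter, RecomputeClusterCenter) are defined elsewhere in the example. As a rough, hypothetical illustration only, a PointBuilder-style mapper under the old Record API might look like the sketch below; the Point value type and the exact field layout are assumptions, not code from the original source.

  // Hypothetical sketch, not the original PointBuilder. Assumes the old Record API
  // (org.apache.flink.api.java.record.functions.MapFunction) and a user-defined
  // Point value type with an (x, y, z) constructor.
  public static final class PointBuilder extends MapFunction {
    private static final long serialVersionUID = 1L;

    private final Record result = new Record(2);

    @Override
    public void map(Record record, Collector<Record> out) {
      // field layout as produced by the CsvInputFormat above: id | x | y | z
      IntValue id = record.getField(0, IntValue.class);
      double x = record.getField(1, DoubleValue.class).getValue();
      double y = record.getField(2, DoubleValue.class).getValue();
      double z = record.getField(3, DoubleValue.class).getValue();

      result.setField(0, id);
      result.setField(1, new Point(x, y, z)); // Point is an assumed user-defined Value type
      out.collect(result);
    }
  }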
Example #3
  @SuppressWarnings("unchecked")
  @Override
  public Plan getPlan(String... args) {
    // parse job parameters
    final int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    final String verticesInput = (args.length > 1 ? args[1] : "");
    final String edgeInput = (args.length > 2 ? args[2] : "");
    final String output = (args.length > 3 ? args[3] : "");
    final int maxIterations = (args.length > 4 ? Integer.parseInt(args[4]) : 1);

    // data source for initial vertices
    FileDataSource initialVertices =
        new FileDataSource(new CsvInputFormat(' ', LongValue.class), verticesInput, "Vertices");

    // map each vertex id to (id, id): initially, every vertex forms its own component
    MapOperator verticesWithId =
        MapOperator.builder(DuplicateLongMap.class)
            .input(initialVertices)
            .name("Assign Vertex Ids")
            .build();

    // delta iteration, keyed on field 0 (the vertex id) of the solution set
    DeltaIteration iteration = new DeltaIteration(0, "Connected Components Iteration");
    iteration.setInitialSolutionSet(verticesWithId);
    iteration.setInitialWorkset(verticesWithId);
    iteration.setMaximumNumberOfIterations(maxIterations);

    // create DataSourceContract for the edges
    FileDataSource edges =
        new FileDataSource(
            new CsvInputFormat(' ', LongValue.class, LongValue.class), edgeInput, "Edges");

    // join the workset (vertex id, component id) with the edges to send each vertex's
    // candidate component id to its neighbors
    JoinOperator joinWithNeighbors =
        JoinOperator.builder(new NeighborWithComponentIDJoin(), LongValue.class, 0, 0)
            .input1(iteration.getWorkset())
            .input2(edges)
            .name("Join Candidate Id With Neighbor")
            .build();

    // for each vertex, compare the smallest candidate component id with the current one
    CoGroupOperator minAndUpdate =
        CoGroupOperator.builder(new MinIdAndUpdate(), LongValue.class, 0, 0)
            .input1(joinWithNeighbors)
            .input2(iteration.getSolutionSet())
            .name("Min Id and Update")
            .build();

    iteration.setNextWorkset(minAndUpdate);
    iteration.setSolutionSetDelta(minAndUpdate);

    // create DataSinkContract for writing the final component assignments
    FileDataSink result = new FileDataSink(new CsvOutputFormat(), output, iteration, "Result");
    CsvOutputFormat.configureRecordFormat(result)
        .recordDelimiter('\n')
        .fieldDelimiter(' ')
        .field(LongValue.class, 0)
        .field(LongValue.class, 1);

    // return the PACT plan
    Plan plan = new Plan(result, "Workset Connected Components");
    plan.setDefaultParallelism(numSubTasks);
    return plan;
  }
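The user functions NeighborWithComponentIDJoin and MinIdAndUpdate are likewise defined elsewhere. The sketch below is a hypothetical illustration of the usual workset connected-components update rule under the old Record API CoGroupFunction; the field positions and the decision to emit only strictly smaller component ids are assumptions about a typical implementation, not code taken from the original example.

  // Hypothetical sketch, not the original MinIdAndUpdate. Assumes records of the form
  // (vertex id, component id) and the old Record API CoGroupFunction
  // (org.apache.flink.api.java.record.functions.CoGroupFunction).
  public static final class MinIdAndUpdate extends CoGroupFunction {
    private static final long serialVersionUID = 1L;

    @Override
    public void coGroup(Iterator<Record> candidates, Iterator<Record> current, Collector<Record> out) {
      if (!current.hasNext()) {
        throw new RuntimeException("Error: a vertex in the workset has no solution set entry.");
      }
      long currentComponent = current.next().getField(1, LongValue.class).getValue();

      // find the smallest component id proposed by any neighbor
      long minCandidate = Long.MAX_VALUE;
      Record candidate = null;
      while (candidates.hasNext()) {
        candidate = candidates.next();
        minCandidate = Math.min(minCandidate, candidate.getField(1, LongValue.class).getValue());
      }

      // emit an update (into both the solution-set delta and the next workset, as wired
      // in the plan above) only when the component id strictly decreases
      if (candidate != null && minCandidate < currentComponent) {
        candidate.setField(1, new LongValue(minCandidate));
        out.collect(candidate);
      }
    }
  }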