@Test
public void testCrossWithLarge() {
    // construct the plan
    FileDataSource source1 = new FileDataSource(new DummyInputFormat(), IN_FILE, "Source 1");
    FileDataSource source2 = new FileDataSource(new DummyInputFormat(), IN_FILE, "Source 2");

    CrossOperator cross = CrossWithLargeOperator.builder(new DummyCrossStub())
            .input1(source1)
            .input2(source2)
            .name("Cross")
            .build();

    FileDataSink sink = new FileDataSink(new DummyOutputFormat(), OUT_FILE, cross, "Sink");

    Plan plan = new Plan(sink);
    plan.setDefaultParallelism(DEFAULT_PARALLELISM);

    try {
        OptimizedPlan oPlan = compileNoStats(plan);
        OptimizerPlanNodeResolver resolver = new OptimizerPlanNodeResolver(oPlan);

        DualInputPlanNode crossPlanNode = resolver.getNode("Cross");
        Channel in1 = crossPlanNode.getInput1();
        Channel in2 = crossPlanNode.getInput2();

        assertEquals(ShipStrategyType.BROADCAST, in1.getShipStrategy());
        assertEquals(ShipStrategyType.FORWARD, in2.getShipStrategy());
    } catch (CompilerException ce) {
        ce.printStackTrace();
        fail("The pact compiler is unable to compile this plan correctly.");
    }
}
@Override
public Plan getPlan(String... args) {
    // parse job parameters
    int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    String dataPointInput = (args.length > 1 ? args[1] : "");
    String clusterInput = (args.length > 2 ? args[2] : "");
    String output = (args.length > 3 ? args[3] : "");

    // create DataSourceContract for data point input
    @SuppressWarnings("unchecked")
    FileDataSource pointsSource = new FileDataSource(new CsvInputFormat('|', IntValue.class,
            DoubleValue.class, DoubleValue.class, DoubleValue.class), dataPointInput, "Data Points");

    // create DataSourceContract for cluster center input
    @SuppressWarnings("unchecked")
    FileDataSource clustersSource = new FileDataSource(new CsvInputFormat('|', IntValue.class,
            DoubleValue.class, DoubleValue.class, DoubleValue.class), clusterInput, "Centers");

    MapOperator dataPoints = MapOperator.builder(new PointBuilder())
            .name("Build data points")
            .input(pointsSource)
            .build();

    MapOperator clusterPoints = MapOperator.builder(new PointBuilder())
            .name("Build cluster points")
            .input(clustersSource)
            .build();

    // the mapper computes the distance of each data point to all cluster centers,
    // which it reads from a broadcast variable
    MapOperator findNearestClusterCenters = MapOperator.builder(new SelectNearestCenter())
            .setBroadcastVariable("centers", clusterPoints)
            .input(dataPoints)
            .name("Find Nearest Centers")
            .build();

    // create reducer that recomputes the cluster centers as the average of all associated data points
    ReduceOperator recomputeClusterCenter = ReduceOperator.builder(new RecomputeClusterCenter(), IntValue.class, 0)
            .input(findNearestClusterCenters)
            .name("Recompute Center Positions")
            .build();

    // create DataSinkContract for writing the new cluster positions
    FileDataSink newClusterPoints = new FileDataSink(new PointOutFormat(), output,
            recomputeClusterCenter, "New Center Positions");

    // return the plan
    Plan plan = new Plan(newClusterPoints, "KMeans Iteration");
    plan.setDefaultParallelism(numSubTasks);
    return plan;
}
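/*
 * A minimal sketch (not part of the plan code above) of how the SelectNearestCenter
 * stub could consume the "centers" broadcast variable that the plan registers via
 * setBroadcastVariable("centers", clusterPoints). It assumes the record-API MapFunction
 * with RuntimeContext.getBroadcastVariable() and a hypothetical record layout of
 * (IntValue id, DoubleValue x, DoubleValue y, DoubleValue z); the actual stub may
 * use a dedicated Point value type instead.
 */
public static final class SelectNearestCenter extends MapFunction {

    private Collection<Record> centers;

    @Override
    public void open(Configuration parameters) throws Exception {
        // the name must match the one used in setBroadcastVariable("centers", ...)
        this.centers = getRuntimeContext().getBroadcastVariable("centers");
    }

    @Override
    public void map(Record point, Collector<Record> out) throws Exception {
        // assumed layout: field 0 = id, fields 1-3 = coordinates
        double px = point.getField(1, DoubleValue.class).getValue();
        double py = point.getField(2, DoubleValue.class).getValue();
        double pz = point.getField(3, DoubleValue.class).getValue();

        int nearestId = -1;
        double nearestDistance = Double.MAX_VALUE;

        // compare the point against every broadcast cluster center
        for (Record center : centers) {
            double dx = px - center.getField(1, DoubleValue.class).getValue();
            double dy = py - center.getField(2, DoubleValue.class).getValue();
            double dz = pz - center.getField(3, DoubleValue.class).getValue();
            double distance = dx * dx + dy * dy + dz * dz;
            if (distance < nearestDistance) {
                nearestDistance = distance;
                nearestId = center.getField(0, IntValue.class).getValue();
            }
        }

        // emit (nearest center id, point coordinates) for the recompute-center reducer
        point.setField(0, new IntValue(nearestId));
        out.collect(point);
    }
}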
@SuppressWarnings("unchecked") @Override public Plan getPlan(String... args) { // parse job parameters final int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1); final String verticesInput = (args.length > 1 ? args[1] : ""); final String edgeInput = (args.length > 2 ? args[2] : ""); final String output = (args.length > 3 ? args[3] : ""); final int maxIterations = (args.length > 4 ? Integer.parseInt(args[4]) : 1); // data source for initial vertices FileDataSource initialVertices = new FileDataSource(new CsvInputFormat(' ', LongValue.class), verticesInput, "Vertices"); MapOperator verticesWithId = MapOperator.builder(DuplicateLongMap.class) .input(initialVertices) .name("Assign Vertex Ids") .build(); DeltaIteration iteration = new DeltaIteration(0, "Connected Components Iteration"); iteration.setInitialSolutionSet(verticesWithId); iteration.setInitialWorkset(verticesWithId); iteration.setMaximumNumberOfIterations(maxIterations); // create DataSourceContract for the edges FileDataSource edges = new FileDataSource( new CsvInputFormat(' ', LongValue.class, LongValue.class), edgeInput, "Edges"); // create CrossOperator for distance computation JoinOperator joinWithNeighbors = JoinOperator.builder(new NeighborWithComponentIDJoin(), LongValue.class, 0, 0) .input1(iteration.getWorkset()) .input2(edges) .name("Join Candidate Id With Neighbor") .build(); CoGroupOperator minAndUpdate = CoGroupOperator.builder(new MinIdAndUpdate(), LongValue.class, 0, 0) .input1(joinWithNeighbors) .input2(iteration.getSolutionSet()) .name("Min Id and Update") .build(); iteration.setNextWorkset(minAndUpdate); iteration.setSolutionSetDelta(minAndUpdate); // create DataSinkContract for writing the new cluster positions FileDataSink result = new FileDataSink(new CsvOutputFormat(), output, iteration, "Result"); CsvOutputFormat.configureRecordFormat(result) .recordDelimiter('\n') .fieldDelimiter(' ') .field(LongValue.class, 0) .field(LongValue.class, 1); // return the PACT plan Plan plan = new Plan(result, "Workset Connected Components"); plan.setDefaultParallelism(numSubTasks); return plan; }