@SuppressWarnings("unchecked") public static Plan getPlan( int numSubTasks, String verticesInput, String edgeInput, String output, int maxIterations) { // data source for initial vertices FileDataSource initialVertices = new FileDataSource(new CsvInputFormat(' ', LongValue.class), verticesInput, "Vertices"); MapOperator verticesWithId = MapOperator.builder(DuplicateLongMap.class) .input(initialVertices) .name("Assign Vertex Ids") .build(); DeltaIteration iteration = new DeltaIteration(0, "Connected Components Iteration"); iteration.setInitialSolutionSet(verticesWithId); iteration.setInitialWorkset(verticesWithId); iteration.setMaximumNumberOfIterations(maxIterations); // create DataSourceContract for the edges FileDataSource edges = new FileDataSource( new CsvInputFormat(' ', LongValue.class, LongValue.class), edgeInput, "Edges"); // create CrossOperator for distance computation JoinOperator joinWithNeighbors = JoinOperator.builder(new NeighborWithComponentIDJoin(), LongValue.class, 0, 0) .input1(iteration.getWorkset()) .input2(edges) .name("Join Candidate Id With Neighbor") .build(); CoGroupOperator minAndUpdate = CoGroupOperator.builder(new MinIdAndUpdate(), LongValue.class, 0, 0) .input1(joinWithNeighbors) .input2(iteration.getSolutionSet()) .name("Min Id and Update") .build(); iteration.setNextWorkset(minAndUpdate); iteration.setSolutionSetDelta(minAndUpdate); // create DataSinkContract for writing the new cluster positions FileDataSink result = new FileDataSink(new CsvOutputFormat(), output, iteration, "Result"); CsvOutputFormat.configureRecordFormat(result) .recordDelimiter('\n') .fieldDelimiter(' ') .field(LongValue.class, 0) .field(LongValue.class, 1); // return the PACT plan Plan plan = new Plan(result, "Workset Connected Components"); plan.setDefaultParallelism(numSubTasks); return plan; }
/**
 * Statistics that push towards a repartition merge join. If the join blows up the data volume
 * significantly, re-exploiting the sorted order is cheaper.
 */
@Test
public void testQueryWithStatsForRepartitionMerge() {
    TPCHQuery3 query = new TPCHQuery3();
    Plan p = query.getPlan(DEFAULT_PARALLELISM_STRING, IN_FILE, IN_FILE, OUT_FILE);
    p.setExecutionConfig(defaultExecutionConfig);

    // set compiler hints
    OperatorResolver cr = getContractResolver(p);
    JoinOperator match = cr.getNode("JoinLiO");
    match.getCompilerHints().setFilterFactor(100f);

    testQueryGeneric(
            p,
            100L * 1024 * 1024 * 1024 * 1024,
            100L * 1024 * 1024 * 1024 * 1024,
            0.05f,
            100f,
            false,
            true,
            false,
            false,
            true);
}
private void testQueryGeneric(
        Plan p,
        long orderSize,
        long lineitemSize,
        float orderSelectivity,
        float joinSelectivity,
        boolean broadcastOkay,
        boolean partitionedOkay,
        boolean hashJoinFirstOkay,
        boolean hashJoinSecondOkay,
        boolean mergeJoinOkay) {
    try {
        // set statistics
        OperatorResolver cr = getContractResolver(p);
        FileDataSource ordersSource = cr.getNode(ORDERS);
        FileDataSource lineItemSource = cr.getNode(LINEITEM);
        MapOperator mapper = cr.getNode(MAPPER_NAME);
        JoinOperator joiner = cr.getNode(JOIN_NAME);
        setSourceStatistics(ordersSource, orderSize, 100f);
        setSourceStatistics(lineItemSource, lineitemSize, 140f);
        mapper.getCompilerHints().setAvgOutputRecordSize(16f);
        mapper.getCompilerHints().setFilterFactor(orderSelectivity);
        joiner.getCompilerHints().setFilterFactor(joinSelectivity);

        // compile
        final OptimizedPlan plan = compileWithStats(p);
        final OptimizerPlanNodeResolver or = getOptimizerPlanNodeResolver(plan);

        // get the nodes from the final plan
        final SinkPlanNode sink = or.getNode("Output");
        final SingleInputPlanNode reducer = or.getNode("AggLio");
        final SingleInputPlanNode combiner =
                reducer.getPredecessor() instanceof SingleInputPlanNode
                        ? (SingleInputPlanNode) reducer.getPredecessor()
                        : null;
        final DualInputPlanNode join = or.getNode("JoinLiO");
        final SingleInputPlanNode filteringMapper = or.getNode("FilterO");

        checkStandardStrategies(filteringMapper, join, combiner, reducer, sink);

        // check the possible variants and that the chosen variant is allowed in this specific setting
        if (checkBroadcastShipStrategies(join, reducer, combiner)) {
            Assert.assertTrue("Broadcast join incorrectly chosen.", broadcastOkay);

            if (checkHashJoinStrategies(join, reducer, true)) {
                Assert.assertTrue("Hash join (build orders) incorrectly chosen", hashJoinFirstOkay);
            } else if (checkHashJoinStrategies(join, reducer, false)) {
                Assert.assertTrue("Hash join (build lineitem) incorrectly chosen", hashJoinSecondOkay);
            } else if (checkBroadcastMergeJoin(join, reducer)) {
                Assert.assertTrue("Merge join incorrectly chosen", mergeJoinOkay);
            } else {
                Assert.fail("Plan has no correct hash join or merge join strategy.");
            }
        } else if (checkRepartitionShipStrategies(join, reducer, combiner)) {
            Assert.assertTrue("Partitioned join incorrectly chosen.", partitionedOkay);

            if (checkHashJoinStrategies(join, reducer, true)) {
                Assert.assertTrue("Hash join (build orders) incorrectly chosen", hashJoinFirstOkay);
            } else if (checkHashJoinStrategies(join, reducer, false)) {
                Assert.assertTrue("Hash join (build lineitem) incorrectly chosen", hashJoinSecondOkay);
            } else if (checkRepartitionMergeJoin(join, reducer)) {
                Assert.assertTrue("Merge join incorrectly chosen", mergeJoinOkay);
            } else {
                Assert.fail("Plan has no correct hash join or merge join strategy.");
            }
        } else {
            Assert.fail("Plan has neither a correct broadcast join nor a correct partitioned join configuration.");
        }
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}