/**
 * Builds a plan in which two {@code GenPhyOp.loadedGrpChain()} plans (each capped with a local
 * rearrange) and two {@code GenPhyOp.loadedFilter()} plans all feed a single global rearrange,
 * followed by a package and a store, then passes it to {@code run} with golden file MRC8.gld.
 */
@Test
public void testSim8() throws Exception {
  PhysicalPlan plan = new PhysicalPlan();

  // Two loaded group chains, each terminated by its own local rearrange.
  PhysicalPlan groupedLoadA = GenPhyOp.loadedGrpChain();
  PhysicalPlan groupedLoadB = GenPhyOp.loadedGrpChain();
  POLocalRearrange rearrangeA = GenPhyOp.topLocalRearrangeOp();
  POLocalRearrange rearrangeB = GenPhyOp.topLocalRearrangeOp();
  groupedLoadA.addAsLeaf(rearrangeA);
  groupedLoadB.addAsLeaf(rearrangeB);
  plan.merge(groupedLoadA);
  plan.merge(groupedLoadB);

  // The global rearrange becomes the common leaf of everything merged so far.
  POGlobalRearrange globalRearrange = GenPhyOp.topGlobalRearrangeOp();
  plan.addAsLeaf(globalRearrange);

  // Two loaded filters are wired into the same global rearrange.
  PhysicalPlan filteredLoadA = GenPhyOp.loadedFilter();
  PhysicalPlan filteredLoadB = GenPhyOp.loadedFilter();
  plan.merge(filteredLoadA);
  plan.connect(filteredLoadA.getLeaves().get(0), globalRearrange);
  plan.merge(filteredLoadB);
  plan.connect(filteredLoadB.getLeaves().get(0), globalRearrange);

  plan.addAsLeaf(GenPhyOp.topPackageOp());
  plan.addAsLeaf(GenPhyOp.topStoreOp());

  run(plan, "test/org/apache/pig/test/data/GoldenFiles/MRC8.gld");
}
// Tests Single input case for both blocking and non-blocking // with both map and reduce phases @Test public void testSim1() throws Exception { PhysicalPlan php = new PhysicalPlan(); POLoad ld = GenPhyOp.topLoadOp(); php.add(ld); PhysicalPlan grpChain1 = GenPhyOp.grpChain(); php.merge(grpChain1); php.connect(ld, grpChain1.getRoots().get(0)); PhysicalOperator leaf = php.getLeaves().get(0); PhysicalPlan grpChain2 = GenPhyOp.grpChain(); php.merge(grpChain2); php.connect(leaf, grpChain2.getRoots().get(0)); leaf = php.getLeaves().get(0); POFilter fl = GenPhyOp.topFilterOp(); php.add(fl); php.connect(leaf, fl); POStore st = GenPhyOp.topStoreOp(); php.add(st); php.connect(fl, st); run(php, "test/org/apache/pig/test/data/GoldenFiles/MRC1.gld"); }
/**
 * Builds a plan in which two {@code GenPhyOp.loadedGrpChain()} plans and two
 * {@code GenPhyOp.loadedFilter()} plans all feed one union that is then stored, and passes it
 * to {@code run} with golden file MRC4.gld.
 */
@Test
public void testSim4() throws Exception {
  PhysicalPlan plan = new PhysicalPlan();

  PhysicalPlan groupedLoadA = GenPhyOp.loadedGrpChain();
  PhysicalPlan groupedLoadB = GenPhyOp.loadedGrpChain();
  plan.merge(groupedLoadA);
  plan.merge(groupedLoadB);

  // The union is attached as the leaf of both merged group chains.
  POUnion union = GenPhyOp.topUnionOp();
  plan.addAsLeaf(union);

  // The filtered loads are connected to the same union explicitly.
  PhysicalPlan filteredLoadA = GenPhyOp.loadedFilter();
  PhysicalPlan filteredLoadB = GenPhyOp.loadedFilter();
  plan.merge(filteredLoadA);
  plan.connect(filteredLoadA.getLeaves().get(0), union);
  plan.merge(filteredLoadB);
  plan.connect(filteredLoadB.getLeaves().get(0), union);

  POStore store = GenPhyOp.topStoreOp();
  plan.add(store);
  plan.connect(union, store);

  run(plan, "test/org/apache/pig/test/data/GoldenFiles/MRC4.gld");
}
/** * Replace old POLocalRearrange with new pre-combine LR, add new map foreach, new * map-local-rearrange, and connect them * * @param mapPlan * @param preCombinerLR * @param mfe * @param mapAgg * @param mlr * @throws PlanException */ private static void patchUpMap( PhysicalPlan mapPlan, POPreCombinerLocalRearrange preCombinerLR, POForEach mfe, POPartialAgg mapAgg, POLocalRearrange mlr) throws PlanException { POLocalRearrange oldLR = (POLocalRearrange) mapPlan.getLeaves().get(0); mapPlan.replace(oldLR, preCombinerLR); mapPlan.add(mfe); mapPlan.connect(preCombinerLR, mfe); // the operator before local rearrange PhysicalOperator opBeforeLR = mfe; if (mapAgg != null) { mapPlan.add(mapAgg); mapPlan.connect(mfe, mapAgg); opBeforeLR = mapAgg; } mapPlan.add(mlr); mapPlan.connect(opBeforeLR, mlr); }
/** * add algebraic functions with appropriate projection to new foreach in combiner * * @param cfe - the new foreach in combiner * @param op2newpos - mapping of physical operator to position in input * @throws CloneNotSupportedException * @throws PlanException */ private static void addAlgebraicFuncToCombineFE( POForEach cfe, Map<PhysicalOperator, Integer> op2newpos) throws CloneNotSupportedException, PlanException { // an array that we will first populate with physical operators in order // of their position in input. Used while adding plans to combine // foreach just so that output of combine foreach same positions as // input. That means the same operator to position mapping can be used // by reduce as well PhysicalOperator[] opsInOrder = new PhysicalOperator[op2newpos.size() + 1]; for (Map.Entry<PhysicalOperator, Integer> op2pos : op2newpos.entrySet()) { opsInOrder[op2pos.getValue()] = op2pos.getKey(); } // first position is used by group column and a plan has been added for // it, so start with 1 for (int i = 1; i < opsInOrder.length; i++) { // create new inner plan for foreach add cloned copy of given // physical operator and a new project. Even if the udf in query // takes multiple input, only one project needs to be added because // input to this udf will be the INITIAL version of udf evaluated in // map. PhysicalPlan newPlan = new PhysicalPlan(); PhysicalOperator newOp = opsInOrder[i].clone(); newPlan.add(newOp); POProject proj = new POProject(createOperatorKey(cfe.getOperatorKey().getScope()), 1, i); proj.setResultType(DataType.BAG); newPlan.add(proj); newPlan.connect(proj, newOp); cfe.addInputPlan(newPlan, false); } }
/**
 * Builds a plan with a split before and after a grouping: load, split into two filters, a local
 * rearrange per filter, then global rearrange, package, a second split into two filters, a union
 * and a store. The plan is passed to {@code run} with golden file MRC13.gld.
 */
@Test
public void testSpl2() throws Exception {
  PhysicalPlan plan = new PhysicalPlan();

  POLoad source = GenPhyOp.topLoadOp();
  POSplit firstSplit = GenPhyOp.topSplitOp();
  plan.add(source);
  plan.add(firstSplit);
  plan.connect(source, firstSplit);

  // Two filter branches hang off the first split.
  POFilter leftFilter = GenPhyOp.topFilterOp();
  POFilter rightFilter = GenPhyOp.topFilterOp();
  plan.add(leftFilter);
  plan.add(rightFilter);
  plan.connect(firstSplit, leftFilter);
  plan.connect(firstSplit, rightFilter);

  // Each branch ends in its own local rearrange.
  POLocalRearrange leftRearrange = GenPhyOp.topLocalRearrangeOp();
  POLocalRearrange rightRearrange = GenPhyOp.topLocalRearrangeOp();
  plan.add(leftRearrange);
  plan.add(rightRearrange);
  plan.connect(leftFilter, leftRearrange);
  plan.connect(rightFilter, rightRearrange);

  plan.addAsLeaf(GenPhyOp.topGlobalRearrangeOp());
  plan.addAsLeaf(GenPhyOp.topPackageOp());

  // Second split, again fanning out into two filters.
  POSplit secondSplit = GenPhyOp.topSplitOp();
  plan.addAsLeaf(secondSplit);

  POFilter postFilterA = GenPhyOp.topFilterOp();
  POFilter postFilterB = GenPhyOp.topFilterOp();
  plan.add(postFilterA);
  plan.add(postFilterB);
  plan.connect(secondSplit, postFilterA);
  plan.connect(secondSplit, postFilterB);

  plan.addAsLeaf(GenPhyOp.topUnionOp());
  plan.addAsLeaf(GenPhyOp.topStoreOp());

  run(plan, "test/org/apache/pig/test/data/GoldenFiles/MRC13.gld");
}
/**
 * Recursively clones {@code op} and all of its predecessors in {@code pplan}, adds the clones
 * to {@code newplan}, and connects each cloned predecessor to the clone of its successor.
 *
 * @param op operator to start cloning from
 * @param pplan plan in which the predecessors of {@code op} are looked up
 * @param newplan plan that receives the cloned operators and their connections
 * @return the clone of {@code op} that was added to {@code newplan}
 * @throws CloneNotSupportedException if an operator cannot be cloned
 * @throws PlanException if two clones cannot be connected
 */
private static PhysicalOperator addPredecessorsToPlan(
    PhysicalOperator op, PhysicalPlan pplan, PhysicalPlan newplan)
    throws CloneNotSupportedException, PlanException {
  PhysicalOperator clonedOp = op.clone();
  newplan.add(clonedOp);

  List<PhysicalOperator> predecessors = pplan.getPredecessors(op);
  if (predecessors != null) {
    // An empty list simply skips the loop, which is the same as the no-predecessor case.
    for (PhysicalOperator predecessor : predecessors) {
      PhysicalOperator clonedPredecessor = addPredecessorsToPlan(predecessor, pplan, newplan);
      newplan.connect(clonedPredecessor, clonedOp);
    }
  }
  return clonedOp;
}
/**
 * Builds the plan load, limit, store and passes it to {@code run} with golden file MRC17.gld.
 *
 * <p>The POLimit is created with a key drawn from the shared random source {@code r}, and with
 * {@code -1} and {@code null} as its remaining constructor arguments.
 */
@Test
public void testLimit() throws Exception {
  PhysicalPlan php = new PhysicalPlan();

  POLoad lC = GenPhyOp.topLoadOp();
  php.add(lC);

  POLimit op = new POLimit(new OperatorKey("", r.nextLong()), -1, null);
  php.add(op);
  php.connect(lC, op);

  POStore st = GenPhyOp.topStoreOp();
  php.addAsLeaf(st);

  run(php, "test/org/apache/pig/test/data/GoldenFiles/MRC17.gld");
}
/** * @param op * @param index * @param plan * @throws PlanException */ private static void setProjectInput(PhysicalOperator op, PhysicalPlan plan, int index) throws PlanException { String scope = op.getOperatorKey().scope; POProject proj = new POProject( new OperatorKey(scope, NodeIdGenerator.getGenerator().getNextNodeId(scope)), op.getRequestedParallelism(), index); proj.setResultType(DataType.BAG); // Remove old connections and elements from the plan plan.trimAbove(op); plan.add(proj); plan.connect(proj, op); List<PhysicalOperator> inputs = Lists.newArrayList(); inputs.add(proj); op.setInputs(inputs); }
/**
 * Builds the plan loaded filter, distinct, group chain, distinct, store and passes it to
 * {@code run} with golden file MRC16.gld.
 *
 * <p>Both PODistinct operators take their keys from the shared random source {@code r}, so the
 * order in which they are created is significant.
 */
@Test
public void testDistinct1() throws Exception {
  PhysicalPlan php = new PhysicalPlan();

  PhysicalPlan ldFil1 = GenPhyOp.loadedFilter();
  php.merge(ldFil1);

  // First distinct, directly below the loaded filter.
  PODistinct op = new PODistinct(new OperatorKey("", r.nextLong()), -1, null);
  php.addAsLeaf(op);

  PhysicalPlan grpChain1 = GenPhyOp.grpChain();
  php.merge(grpChain1);
  php.connect(op, grpChain1.getRoots().get(0));

  // Second distinct, below the group chain.
  PODistinct op1 = new PODistinct(new OperatorKey("", r.nextLong()), -1, null);
  php.addAsLeaf(op1);

  POStore st = GenPhyOp.topStoreOp();
  php.addAsLeaf(st);

  run(php, "test/org/apache/pig/test/data/GoldenFiles/MRC16.gld");
}
/**
 * Builds a plan with a split before a grouping: load, split into two filters, a local rearrange
 * per filter, both feeding one global rearrange, then package and store. The plan is passed to
 * {@code run} with golden file MRC12.gld.
 */
@Test
public void testSpl1() throws Exception {
  PhysicalPlan php = new PhysicalPlan();

  POLoad lA = GenPhyOp.topLoadOp();
  POSplit spl = GenPhyOp.topSplitOp();
  php.add(lA);
  php.add(spl);
  php.connect(lA, spl);

  // Two filter branches hang off the split.
  POFilter fl1 = GenPhyOp.topFilterOp();
  POFilter fl2 = GenPhyOp.topFilterOp();
  php.add(fl1);
  php.add(fl2);
  php.connect(spl, fl1);
  php.connect(spl, fl2);

  // Each branch ends in its own local rearrange.
  POLocalRearrange lr1 = GenPhyOp.topLocalRearrangeOp();
  POLocalRearrange lr2 = GenPhyOp.topLocalRearrangeOp();
  php.add(lr1);
  php.add(lr2);
  php.connect(fl1, lr1);
  php.connect(fl2, lr2);

  // Both local rearranges are joined by a single global rearrange.
  POGlobalRearrange gr = GenPhyOp.topGlobalRearrangeOp();
  php.add(gr);
  php.connect(lr1, gr);
  php.connect(lr2, gr);

  POPackage pk = GenPhyOp.topPackageOp();
  php.add(pk);
  php.connect(gr, pk);

  POStore st = GenPhyOp.topStoreOp();
  php.add(st);
  php.connect(pk, st);

  run(php, "test/org/apache/pig/test/data/GoldenFiles/MRC12.gld");
}
/**
 * Builds a plan with a sort that uses a user-defined comparator, followed by foreach operators
 * with UDFs interleaved with group chains, and passes it to {@code run} with golden file
 * MRC15.gld.
 *
 * <p>Shape of the plan: loaded filter, sort (requested parallelism 20, comparator
 * {@code WeirdComparator}), foreach(COUNT, SUM), group chain, foreach(AVG), group chain,
 * foreach(GFCross('1')), store.
 *
 * <p>Operator keys are drawn from the shared random source {@code r}, so the order of the
 * statements below is significant.
 */
@Test
public void testSortUDF1() throws Exception {
  PhysicalPlan php = new PhysicalPlan();
  PhysicalPlan ldFil1 = GenPhyOp.loadedFilter();
  php.merge(ldFil1);

  // set up order by *
  String funcName = WeirdComparator.class.getName();
  POUserComparisonFunc comparator =
      new POUserComparisonFunc(
          new OperatorKey("", r.nextLong()), -1, null, new FuncSpec(funcName));
  POSort sort =
      new POSort(
          new OperatorKey("", r.nextLong()),
          -1,
          ldFil1.getLeaves(),
          null,
          new ArrayList<Boolean>(),
          comparator);
  sort.setRequestedParallelism(20);

  // nested sort plan: project column 1, then project star on top of it
  PhysicalPlan nesSortPlan = new PhysicalPlan();
  POProject topPrj = new POProject(new OperatorKey("", r.nextLong()));
  topPrj.setColumn(1);
  topPrj.setOverloaded(true);
  topPrj.setResultType(DataType.TUPLE);
  nesSortPlan.add(topPrj);
  POProject prjStar2 = new POProject(new OperatorKey("", r.nextLong()));
  prjStar2.setResultType(DataType.TUPLE);
  prjStar2.setStar(true);
  nesSortPlan.add(prjStar2);
  nesSortPlan.connect(topPrj, prjStar2);
  List<PhysicalPlan> nesSortPlanLst = new ArrayList<PhysicalPlan>();
  nesSortPlanLst.add(nesSortPlan);
  sort.setSortPlans(nesSortPlanLst);
  php.add(sort);
  php.connect(ldFil1.getLeaves().get(0), sort);

  // have a foreach which takes the sort output
  // and sends it to two udfs
  List<String> udfs = new ArrayList<String>();
  udfs.add(COUNT.class.getName());
  udfs.add(SUM.class.getName());
  POForEach fe3 = GenPhyOp.topForEachOPWithUDF(udfs);
  php.add(fe3);
  php.connect(sort, fe3);

  // add a group chain fed by the foreach
  PhysicalPlan grpChain1 = GenPhyOp.grpChain();
  php.merge(grpChain1);
  php.connect(fe3, grpChain1.getRoots().get(0));

  // the same udf list is reused: cleared and refilled for each following foreach
  udfs.clear();
  udfs.add(AVG.class.getName());
  POForEach fe4 = GenPhyOp.topForEachOPWithUDF(udfs);
  php.addAsLeaf(fe4);

  PhysicalPlan grpChain2 = GenPhyOp.grpChain();
  php.merge(grpChain2);
  php.connect(fe4, grpChain2.getRoots().get(0));

  udfs.clear();
  udfs.add(GFCross.class.getName() + "('1')");
  POForEach fe5 = GenPhyOp.topForEachOPWithUDF(udfs);
  php.addAsLeaf(fe5);

  POStore st = GenPhyOp.topStoreOp();
  php.addAsLeaf(st);
  run(php, "test/org/apache/pig/test/data/GoldenFiles/MRC15.gld");
}
/**
 * Builds a plan with nested splits before a grouping and another split after it, giving most
 * operators an explicit requested parallelism, and passes it to {@code run} with golden file
 * MRC14.gld.
 *
 * <p>Shape of the plan: load, split into two filters; each filter feeds a further split
 * ({@code sp11} with one filter branch, {@code sp21} with two); each of those three filters
 * feeds a local rearrange; then global rearrange, package, a split into two filters, a union
 * and a store.
 */
@Test
public void testSpl3() throws Exception {
  PhysicalPlan php = new PhysicalPlan();

  POLoad lA = GenPhyOp.topLoadOp();
  POSplit spl = GenPhyOp.topSplitOp();
  php.add(lA);
  php.add(spl);
  php.connect(lA, spl);

  // first level: two filters below the top split
  POFilter fl1 = GenPhyOp.topFilterOp();
  fl1.setRequestedParallelism(10);
  POFilter fl2 = GenPhyOp.topFilterOp();
  fl2.setRequestedParallelism(20);
  php.add(fl1);
  php.add(fl2);
  php.connect(spl, fl1);
  php.connect(spl, fl2);

  // second level: each filter feeds its own split
  POSplit sp11 = GenPhyOp.topSplitOp();
  POSplit sp21 = GenPhyOp.topSplitOp();
  php.add(sp11);
  php.add(sp21);
  php.connect(fl1, sp11);
  php.connect(fl2, sp21);

  // sp11 has a single filter branch, sp21 has two
  POFilter fl11 = GenPhyOp.topFilterOp();
  fl11.setRequestedParallelism(10);
  POFilter fl21 = GenPhyOp.topFilterOp();
  fl21.setRequestedParallelism(20);
  POFilter fl22 = GenPhyOp.topFilterOp();
  fl22.setRequestedParallelism(30);
  php.add(fl11);
  php.add(fl21);
  php.add(fl22);
  php.connect(sp11, fl11);
  php.connect(sp21, fl21);
  php.connect(sp21, fl22);

  // one local rearrange per second-level filter
  POLocalRearrange lr1 = GenPhyOp.topLocalRearrangeOp();
  lr1.setRequestedParallelism(40);
  POLocalRearrange lr21 = GenPhyOp.topLocalRearrangeOp();
  lr21.setRequestedParallelism(15);
  POLocalRearrange lr22 = GenPhyOp.topLocalRearrangeOp();
  lr22.setRequestedParallelism(35);
  php.add(lr1);
  php.add(lr21);
  php.add(lr22);
  php.connect(fl11, lr1);
  php.connect(fl21, lr21);
  php.connect(fl22, lr22);

  // the global rearrange is added as the leaf of the three local rearranges
  POGlobalRearrange gr = GenPhyOp.topGlobalRearrangeOp();
  php.addAsLeaf(gr);

  POPackage pk = GenPhyOp.topPackageOp();
  pk.setRequestedParallelism(25);
  php.addAsLeaf(pk);

  // split after the package, again into two filters
  POSplit sp2 = GenPhyOp.topSplitOp();
  php.addAsLeaf(sp2);

  POFilter fl3 = GenPhyOp.topFilterOp();
  fl3.setRequestedParallelism(100);
  POFilter fl4 = GenPhyOp.topFilterOp();
  fl4.setRequestedParallelism(80);
  php.add(fl3);
  php.add(fl4);
  php.connect(sp2, fl3);
  php.connect(sp2, fl4);

  POUnion un = GenPhyOp.topUnionOp();
  php.addAsLeaf(un);

  POStore st = GenPhyOp.topStoreOp();
  php.addAsLeaf(st);

  run(php, "test/org/apache/pig/test/data/GoldenFiles/MRC14.gld");
}
/**
 * Builds a plan from two separately constructed pieces that are merged and then joined by a
 * union, and passes it to {@code run} with golden file MRC11.gld.
 *
 * <p>{@code part1} holds two branches that meet at the global rearrange {@code grC}: load C,
 * filter and local rearrange on one side, and load D followed by its own local rearrange,
 * global rearrange and package on the other. {@code php} holds loads A (filtered) and B, each
 * with a local rearrange, meeting at {@code grAB}, followed by a package and a filter. After
 * the merge, every leaf of the combined plan is connected to the union, which is then stored.
 */
@Test
public void testRun2() throws Exception {
  PhysicalPlan php = new PhysicalPlan();

  // part1, branch C: load -> filter -> local rearrange -> global rearrange -> package
  PhysicalPlan part1 = new PhysicalPlan();
  POLoad lC = GenPhyOp.topLoadOp();
  POFilter fC = GenPhyOp.topFilterOp();
  POLocalRearrange lrC = GenPhyOp.topLocalRearrangeOp();
  POGlobalRearrange grC = GenPhyOp.topGlobalRearrangeOp();
  POPackage pkC = GenPhyOp.topPackageOp();
  part1.add(lC);
  part1.add(fC);
  part1.connect(lC, fC);
  part1.add(lrC);
  part1.connect(fC, lrC);
  part1.add(grC);
  part1.connect(lrC, grC);
  part1.add(pkC);
  part1.connect(grC, pkC);

  // part1, branch D: load -> local rearrange -> global rearrange -> package, whose output is
  // fed into branch C's global rearrange
  POPackage pkD = GenPhyOp.topPackageOp();
  POLocalRearrange lrD = GenPhyOp.topLocalRearrangeOp();
  POGlobalRearrange grD = GenPhyOp.topGlobalRearrangeOp();
  POLoad lD = GenPhyOp.topLoadOp();
  part1.add(lD);
  part1.add(lrD);
  part1.connect(lD, lrD);
  part1.add(grD);
  part1.connect(lrD, grD);
  part1.add(pkD);
  part1.connect(grD, pkD);
  part1.connect(pkD, grC);

  // main plan: operators are all created first, then added and connected
  POLoad lA = GenPhyOp.topLoadOp();
  POLoad lB = GenPhyOp.topLoadOp();
  POFilter fA = GenPhyOp.topFilterOp();
  POLocalRearrange lrA = GenPhyOp.topLocalRearrangeOp();
  POLocalRearrange lrB = GenPhyOp.topLocalRearrangeOp();
  POGlobalRearrange grAB = GenPhyOp.topGlobalRearrangeOp();
  POPackage pkAB = GenPhyOp.topPackageOp();
  POFilter fAB = GenPhyOp.topFilterOp();
  POUnion unABC = GenPhyOp.topUnionOp();
  php.add(lA);
  php.add(lB);
  php.add(fA);
  php.connect(lA, fA);
  php.add(lrA);
  php.add(lrB);
  php.connect(fA, lrA);
  php.connect(lB, lrB);
  php.add(grAB);
  php.connect(lrA, grAB);
  php.connect(lrB, grAB);
  php.add(pkAB);
  php.connect(grAB, pkAB);
  php.add(fAB);
  php.connect(pkAB, fAB);

  php.merge(part1);

  // Copy the current leaves before the union is added; presumably getLeaves() would otherwise
  // also report the still-unconnected union (TODO confirm against PhysicalPlan).
  List<PhysicalOperator> leaves = new ArrayList<PhysicalOperator>();
  for (PhysicalOperator phyOp : php.getLeaves()) {
    leaves.add(phyOp);
  }
  php.add(unABC);
  for (PhysicalOperator physicalOperator : leaves) {
    php.connect(physicalOperator, unABC);
  }

  POStore st = GenPhyOp.topStoreOp();
  php.add(st);
  php.connect(unABC, st);

  run(php, "test/org/apache/pig/test/data/GoldenFiles/MRC11.gld");
}
/**
 * Algebraic functions and distinct in nested plan of a foreach are partially computed in the map
 * and combine phase. A new foreach statement with initial and intermediate forms of algebraic
 * functions are added to map and combine plans respectively.
 *
 * <p>If bag portion of group-by result is projected or a non algebraic expression/udf has bag as
 * input, combiner will not be used. This is because the use of combiner in such case is likely to
 * degrade performance as there will not be much reduction in data size in combine stage to offset
 * the cost of the additional number of times (de)serialization is done.
 *
 * <p>Major areas for enhancement:
 *
 * <ol>
 *   <li>use of combiner in cogroup
 *   <li>queries with order-by, limit or sort in a nested foreach after group-by
 *   <li>case where group-by is followed by filter that has algebraic expression
 * </ol>
 *
 * <p>The method returns without changing any plan when the shape it looks for is not found: the
 * map plan must have exactly one leaf and it must be a POLocalRearrange; the reduce plan must
 * have exactly one root and it must be a POPackage with exactly one successor; that successor
 * (or, if it is a POLimit with a single POForEach successor, that foreach) must be a POForEach
 * for which {@code findAlgebraicOps} returns a non-empty list; and the combine plan must have no
 * roots. Some of these cases collect a warning in {@code messageCollector}, others return
 * silently.
 *
 * @param mapPlan map plan of the job; its leaf local rearrange is replaced and a map foreach
 *     (and optionally a partial aggregation) plus a new local rearrange are appended
 * @param reducePlan reduce plan of the job; the packager of its root POPackage is replaced and
 *     the algebraic functions in its foreach are switched to their FINAL form
 * @param combinePlan plan that receives the combiner package, combine foreach and local
 *     rearrange; expected to have no roots on entry
 * @param messageCollector collector for the warnings described above
 * @param doMapAgg if {@code true}, a POPartialAgg created from the combine foreach is inserted
 *     into the map plan between the map foreach and the new local rearrange
 * @throws VisitorException declared type; any exception raised while rewriting the plans is
 *     rethrown as an OptimizerException with error code 2018 wrapping the cause
 */
public static void addCombiner(
    PhysicalPlan mapPlan,
    PhysicalPlan reducePlan,
    PhysicalPlan combinePlan,
    CompilationMessageCollector messageCollector,
    boolean doMapAgg)
    throws VisitorException {

  // part one - check if this MR job represents a group-by + foreach. Find
  // the POLocalRearrange in the map. I'll need it later.
  List<PhysicalOperator> mapLeaves = mapPlan.getLeaves();
  if (mapLeaves == null || mapLeaves.size() != 1) {
    messageCollector.collect(
        "Expected map to have single leaf", MessageType.Warning, PigWarning.MULTI_LEAF_MAP);
    return;
  }
  PhysicalOperator mapLeaf = mapLeaves.get(0);
  if (!(mapLeaf instanceof POLocalRearrange)) {
    return;
  }
  POLocalRearrange rearrange = (POLocalRearrange) mapLeaf;

  List<PhysicalOperator> reduceRoots = reducePlan.getRoots();
  if (reduceRoots.size() != 1) {
    messageCollector.collect(
        "Expected reduce to have single root",
        MessageType.Warning,
        PigWarning.MULTI_ROOT_REDUCE);
    return;
  }

  // I expect that the first root should always be a POPackage. If not, I
  // don't know what's going on, so I'm out of here.
  PhysicalOperator root = reduceRoots.get(0);
  if (!(root instanceof POPackage)) {
    messageCollector.collect(
        "Expected reduce root to be a POPackage",
        MessageType.Warning,
        PigWarning.NON_PACKAGE_REDUCE_PLAN_ROOT);
    return;
  }
  POPackage pack = (POPackage) root;

  List<PhysicalOperator> packSuccessors = reducePlan.getSuccessors(root);
  if (packSuccessors == null || packSuccessors.size() != 1) {
    return;
  }
  PhysicalOperator successor = packSuccessors.get(0);

  if (successor instanceof POLimit) {
    // POLimit is acceptable, as long as it has a single foreach as
    // successor
    List<PhysicalOperator> limitSucs = reducePlan.getSuccessors(successor);
    if (limitSucs != null && limitSucs.size() == 1 && limitSucs.get(0) instanceof POForEach) {
      // the code below will now further examine the foreach
      successor = limitSucs.get(0);
    }
  }

  if (successor instanceof POForEach) {
    POForEach foreach = (POForEach) successor;
    List<PhysicalPlan> feInners = foreach.getInputPlans();

    // find algebraic operators and also check if the foreach statement
    // is suitable for combiner use
    List<Pair<PhysicalOperator, PhysicalPlan>> algebraicOps = findAlgebraicOps(feInners);
    if (algebraicOps == null || algebraicOps.size() == 0) {
      // the plan is not combinable or there is nothing to combine
      // we're done
      return;
    }
    // NOTE(review): this check lets a null combinePlan through, but combinePlan is
    // dereferenced unconditionally further down; a null would then surface as the generic
    // OptimizerException from the catch block below. Confirm callers never pass null.
    if (combinePlan != null && combinePlan.getRoots().size() != 0) {
      messageCollector.collect(
          "Wasn't expecting to find anything already " + "in the combiner!",
          MessageType.Warning,
          PigWarning.NON_EMPTY_COMBINE_PLAN);
      return;
    }

    LOG.info("Choosing to move algebraic foreach to combiner");
    try {
      // replace PODistinct->Project[*] with distinct udf (which is Algebraic)
      for (Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps) {
        if (!(op2plan.first instanceof PODistinct)) {
          continue;
        }
        DistinctPatcher distinctPatcher = new DistinctPatcher(op2plan.second);
        distinctPatcher.visit();
        if (distinctPatcher.getDistinct() == null) {
          int errCode = 2073;
          String msg = "Problem with replacing distinct operator with distinct built-in function.";
          throw new PlanException(msg, errCode, PigException.BUG);
        }
        // from here on the pair refers to the replacement operator, not the PODistinct
        op2plan.first = distinctPatcher.getDistinct();
      }

      // create new map foreach
      POForEach mfe = createForEachWithGrpProj(foreach, rearrange.getKeyType());
      Map<PhysicalOperator, Integer> op2newpos = Maps.newHashMap();
      // positions start at 1 because position 0 is the group key
      Integer pos = 1;
      // create plan for each algebraic udf and add as inner plan in map-foreach
      for (Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps) {
        PhysicalPlan udfPlan = createPlanWithPredecessors(op2plan.first, op2plan.second);
        mfe.addInputPlan(udfPlan, false);
        op2newpos.put(op2plan.first, pos++);
      }
      changeFunc(mfe, POUserFunc.INITIAL);

      // since we will only be creating SingleTupleBag as input to
      // the map foreach, we should flag the POProjects in the map
      // foreach inner plans to also use SingleTupleBag
      for (PhysicalPlan mpl : mfe.getInputPlans()) {
        try {
          new fixMapProjects(mpl).visit();
        } catch (VisitorException e) {
          int errCode = 2089;
          String msg = "Unable to flag project operator to use single tuple bag.";
          throw new PlanException(msg, errCode, PigException.BUG, e);
        }
      }

      // create new combine foreach
      POForEach cfe = createForEachWithGrpProj(foreach, rearrange.getKeyType());
      // add algebraic functions with appropriate projection
      addAlgebraicFuncToCombineFE(cfe, op2newpos);
      changeFunc(cfe, POUserFunc.INTERMEDIATE);

      // fix projection and function time for algebraic functions in reduce foreach
      for (Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps) {
        setProjectInput(op2plan.first, op2plan.second, op2newpos.get(op2plan.first));
        // the result type is saved and restored around the switch to the FINAL form
        byte resultType = op2plan.first.getResultType();
        ((POUserFunc) op2plan.first).setAlgebraicFunction(POUserFunc.FINAL);
        op2plan.first.setResultType(resultType);
      }

      // we have modified the foreach inner plans - so set them again
      // for the foreach so that foreach can do any re-initialization
      // around them.
      // FIXME - this is a necessary evil right now because the leaves
      // are explicitly stored in the POForeach as a list rather than
      // computed each time at run time from the plans for
      // optimization. Do we want to have the Foreach compute the
      // leaves each time and have Java optimize it (will Java
      // optimize?)?
      mfe.setInputPlans(mfe.getInputPlans());
      cfe.setInputPlans(cfe.getInputPlans());
      foreach.setInputPlans(foreach.getInputPlans());

      // tell POCombinerPackage which fields need to be projected and which
      // placed in bags. First field is a simple project, the rest need to go
      // into bags
      int numFields = algebraicOps.size() + 1; // algebraic funcs + group key
      boolean[] bags = new boolean[numFields];
      bags[0] = false;
      for (int i = 1; i < numFields; i++) {
        bags[i] = true;
      }

      // Use the POCombiner package in the combine plan
      // as it needs to act differently than the regular
      // package operator.
      CombinerPackager pkgr = new CombinerPackager(pack.getPkgr(), bags);
      POPackage combinePack = pack.clone();
      combinePack.setPkgr(pkgr);
      combinePack.setParentPlan(null);

      combinePlan.add(combinePack);
      combinePlan.add(cfe);
      combinePlan.connect(combinePack, cfe);

      // No need to connect projections in cfe to cp, because
      // PigCombiner directly attaches output from package to
      // root of remaining plan.

      POLocalRearrange mlr = getNewRearrange(rearrange);
      POPartialAgg mapAgg = null;
      if (doMapAgg) {
        mapAgg = createPartialAgg(cfe);
      }

      // A specialized local rearrange operator will replace
      // the normal local rearrange in the map plan. This behaves
      // like the regular local rearrange in the getNext()
      // as far as getting its input and constructing the
      // "key" out of the input. It then returns a tuple with
      // two fields - the key in the first position and the
      // "value" inside a bag in the second position. This output
      // format resembles the format out of a Package. This output
      // will feed to the map foreach which expects this format.
      // If the key field isn't in the project of the combiner or map foreach,
      // it is added to the end (This is required so that we can
      // set up the inner plan of the new Local Rearrange leaf in the map
      // and combine plan to contain just the project of the key).
      patchUpMap(mapPlan, getPreCombinerLR(rearrange), mfe, mapAgg, mlr);

      POLocalRearrange clr = getNewRearrange(rearrange);
      clr.setParentPlan(null);
      combinePlan.add(clr);
      combinePlan.connect(cfe, clr);

      // Change the package operator in the reduce plan to
      // be the POCombiner package, as it needs to act
      // differently than the regular package operator.
      pack.setPkgr(pkgr.clone());
    } catch (Exception e) {
      int errCode = 2018;
      String msg = "Internal error. Unable to introduce the combiner for optimization.";
      throw new OptimizerException(msg, errCode, PigException.BUG, e);
    }
  }
}