@Test public void testImplicitSplitInCoGroup2() throws Exception { // this query is similar to the one reported in JIRA - PIG-537 LogicalPlanTester planTester = new LogicalPlanTester(); planTester.buildPlan("a = load 'file1' using PigStorage(':') as (name:chararray, marks:int);"); planTester.buildPlan( "b = load 'file2' using PigStorage(':') as (name:chararray, rank:chararray);"); planTester.buildPlan("c = cogroup a by name, b by name;"); planTester.buildPlan("d = foreach c generate group, FLATTEN(a.marks) as newmarks;"); planTester.buildPlan("e = cogroup a by marks, d by newmarks;"); LogicalPlan plan = planTester.buildPlan("f = foreach e generate group, flatten(a), flatten(d);"); // Set the logical plan values correctly in all the operators PlanSetter ps = new PlanSetter(plan); ps.visit(); // run through validator CompilationMessageCollector collector = new CompilationMessageCollector(); TypeCheckingValidator typeValidator = new TypeCheckingValidator(); typeValidator.validate(plan, collector); printMessageCollector(collector); printTypeGraph(plan); if (collector.hasError()) { throw new Exception("Error during type checking"); } // this will run ImplicitSplitInserter TestLogicalOptimizer.optimizePlan(plan); // get Schema of leaf and compare: Schema expectedSchema = Util.getSchemaFromString( "grp: int,A::username: chararray,A::marks: int,AB::group: chararray,AB::newmarks: int"); assertTrue(Schema.equals(expectedSchema, plan.getLeaves().get(0).getSchema(), false, true)); }
/**
 * Associates a cast with the load function of the field it casts from.
 *
 * <p>If the input is a bytearray, the load function that the field comes from is looked up by
 * the field's uid and set on the cast; the load function's LoadCaster interface will help with
 * the conversion at runtime. If there is no corresponding load function (e.g. the input is the
 * output of a udf), nothing is set and a warning is collected instead - bytearray is then
 * assumed to be used as the equivalent of an 'unknown' type, to be identified and cast at
 * runtime.
 *
 * @param cast the cast expression being visited
 * @throws FrontendException if the type or field schema of the cast cannot be obtained
 * @see org.apache.pig.newplan.logical.expression.LogicalExpressionVisitor#visit(org.apache.pig.newplan.logical.expression.CastExpression)
 */
@Override
public void visit(CastExpression cast) throws FrontendException {
  byte sourceType = cast.getExpression().getType();
  byte targetType = cast.getType();

  // Only inputs whose schema passes the bytearray/empty check need a load function.
  if (!containsByteArrayOrEmtpyInSchema(cast.getExpression().getFieldSchema())) {
    return;
  }

  long sourceUid = cast.getExpression().getFieldSchema().uid;
  FuncSpec loadFunc = uid2LoadFuncMap.get(sourceUid);
  if (loadFunc != null) {
    cast.setFuncSpec(loadFunc);
    return;
  }

  // No load function is known for this uid: warn and leave the cast untouched.
  String warning =
      "Cannot resolve load function to use for casting from "
          + DataType.findTypeName(sourceType)
          + " to "
          + DataType.findTypeName(targetType)
          + ". ";
  msgCollector.collect(warning, MessageType.Warning);
}
/** * Algebraic functions and distinct in nested plan of a foreach are partially computed in the map * and combine phase. A new foreach statement with initial and intermediate forms of algebraic * functions are added to map and combine plans respectively. * * <p>If bag portion of group-by result is projected or a non algebraic expression/udf has bag as * input, combiner will not be used. This is because the use of combiner in such case is likely to * degrade performance as there will not be much reduction in data size in combine stage to offset * the cost of the additional number of times (de)serialization is done. * * <p>Major areas for enhancement: 1. use of combiner in cogroup 2. queries with order-by, limit * or sort in a nested foreach after group-by 3. case where group-by is followed by filter that * has algebraic expression */ public static void addCombiner( PhysicalPlan mapPlan, PhysicalPlan reducePlan, PhysicalPlan combinePlan, CompilationMessageCollector messageCollector, boolean doMapAgg) throws VisitorException { // part one - check if this MR job represents a group-by + foreach. Find // the POLocalRearrange in the map. I'll need it later. List<PhysicalOperator> mapLeaves = mapPlan.getLeaves(); if (mapLeaves == null || mapLeaves.size() != 1) { messageCollector.collect( "Expected map to have single leaf", MessageType.Warning, PigWarning.MULTI_LEAF_MAP); return; } PhysicalOperator mapLeaf = mapLeaves.get(0); if (!(mapLeaf instanceof POLocalRearrange)) { return; } POLocalRearrange rearrange = (POLocalRearrange) mapLeaf; List<PhysicalOperator> reduceRoots = reducePlan.getRoots(); if (reduceRoots.size() != 1) { messageCollector.collect( "Expected reduce to have single root", MessageType.Warning, PigWarning.MULTI_ROOT_REDUCE); return; } // I expect that the first root should always be a POPackage. If not, I // don't know what's going on, so I'm out of here. 
PhysicalOperator root = reduceRoots.get(0); if (!(root instanceof POPackage)) { messageCollector.collect( "Expected reduce root to be a POPackage", MessageType.Warning, PigWarning.NON_PACKAGE_REDUCE_PLAN_ROOT); return; } POPackage pack = (POPackage) root; List<PhysicalOperator> packSuccessors = reducePlan.getSuccessors(root); if (packSuccessors == null || packSuccessors.size() != 1) { return; } PhysicalOperator successor = packSuccessors.get(0); if (successor instanceof POLimit) { // POLimit is acceptable, as long has it has a single foreach as // successor List<PhysicalOperator> limitSucs = reducePlan.getSuccessors(successor); if (limitSucs != null && limitSucs.size() == 1 && limitSucs.get(0) instanceof POForEach) { // the code below will now further examine the foreach successor = limitSucs.get(0); } } if (successor instanceof POForEach) { POForEach foreach = (POForEach) successor; List<PhysicalPlan> feInners = foreach.getInputPlans(); // find algebraic operators and also check if the foreach statement // is suitable for combiner use List<Pair<PhysicalOperator, PhysicalPlan>> algebraicOps = findAlgebraicOps(feInners); if (algebraicOps == null || algebraicOps.size() == 0) { // the plan is not combinable or there is nothing to combine // we're done return; } if (combinePlan != null && combinePlan.getRoots().size() != 0) { messageCollector.collect( "Wasn't expecting to find anything already " + "in the combiner!", MessageType.Warning, PigWarning.NON_EMPTY_COMBINE_PLAN); return; } LOG.info("Choosing to move algebraic foreach to combiner"); try { // replace PODistinct->Project[*] with distinct udf (which is Algebraic) for (Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps) { if (!(op2plan.first instanceof PODistinct)) { continue; } DistinctPatcher distinctPatcher = new DistinctPatcher(op2plan.second); distinctPatcher.visit(); if (distinctPatcher.getDistinct() == null) { int errCode = 2073; String msg = "Problem with replacing distinct operator with distinct 
built-in function."; throw new PlanException(msg, errCode, PigException.BUG); } op2plan.first = distinctPatcher.getDistinct(); } // create new map foreach POForEach mfe = createForEachWithGrpProj(foreach, rearrange.getKeyType()); Map<PhysicalOperator, Integer> op2newpos = Maps.newHashMap(); Integer pos = 1; // create plan for each algebraic udf and add as inner plan in map-foreach for (Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps) { PhysicalPlan udfPlan = createPlanWithPredecessors(op2plan.first, op2plan.second); mfe.addInputPlan(udfPlan, false); op2newpos.put(op2plan.first, pos++); } changeFunc(mfe, POUserFunc.INITIAL); // since we will only be creating SingleTupleBag as input to // the map foreach, we should flag the POProjects in the map // foreach inner plans to also use SingleTupleBag for (PhysicalPlan mpl : mfe.getInputPlans()) { try { new fixMapProjects(mpl).visit(); } catch (VisitorException e) { int errCode = 2089; String msg = "Unable to flag project operator to use single tuple bag."; throw new PlanException(msg, errCode, PigException.BUG, e); } } // create new combine foreach POForEach cfe = createForEachWithGrpProj(foreach, rearrange.getKeyType()); // add algebraic functions with appropriate projection addAlgebraicFuncToCombineFE(cfe, op2newpos); changeFunc(cfe, POUserFunc.INTERMEDIATE); // fix projection and function time for algebraic functions in reduce foreach for (Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps) { setProjectInput(op2plan.first, op2plan.second, op2newpos.get(op2plan.first)); byte resultType = op2plan.first.getResultType(); ((POUserFunc) op2plan.first).setAlgebraicFunction(POUserFunc.FINAL); op2plan.first.setResultType(resultType); } // we have modified the foreach inner plans - so set them again // for the foreach so that foreach can do any re-initialization // around them. 
// FIXME - this is a necessary evil right now because the leaves // are explicitly stored in the POForeach as a list rather than // computed each time at run time from the plans for // optimization. Do we want to have the Foreach compute the // leaves each time and have Java optimize it (will Java // optimize?)? mfe.setInputPlans(mfe.getInputPlans()); cfe.setInputPlans(cfe.getInputPlans()); foreach.setInputPlans(foreach.getInputPlans()); // tell POCombinerPackage which fields need projected and which // placed in bags. First field is simple project rest need to go // into bags int numFields = algebraicOps.size() + 1; // algebraic funcs + group key boolean[] bags = new boolean[numFields]; bags[0] = false; for (int i = 1; i < numFields; i++) { bags[i] = true; } // Use the POCombiner package in the combine plan // as it needs to act differently than the regular // package operator. CombinerPackager pkgr = new CombinerPackager(pack.getPkgr(), bags); POPackage combinePack = pack.clone(); combinePack.setPkgr(pkgr); combinePack.setParentPlan(null); combinePlan.add(combinePack); combinePlan.add(cfe); combinePlan.connect(combinePack, cfe); // No need to connect projections in cfe to cp, because // PigCombiner directly attaches output from package to // root of remaining plan. POLocalRearrange mlr = getNewRearrange(rearrange); POPartialAgg mapAgg = null; if (doMapAgg) { mapAgg = createPartialAgg(cfe); } // A specialized local rearrange operator will replace // the normal local rearrange in the map plan. This behaves // like the regular local rearrange in the getNext() // as far as getting its input and constructing the // "key" out of the input. It then returns a tuple with // two fields - the key in the first position and the // "value" inside a bag in the second position. This output // format resembles the format out of a Package. This output // will feed to the map foreach which expects this format. 
// If the key field isn't in the project of the combiner or map foreach, // it is added to the end (This is required so that we can // set up the inner plan of the new Local Rearrange leaf in the map // and combine plan to contain just the project of the key). patchUpMap(mapPlan, getPreCombinerLR(rearrange), mfe, mapAgg, mlr); POLocalRearrange clr = getNewRearrange(rearrange); clr.setParentPlan(null); combinePlan.add(clr); combinePlan.connect(cfe, clr); // Change the package operator in the reduce plan to // be the POCombiner package, as it needs to act // differently than the regular package operator. pack.setPkgr(pkgr.clone()); } catch (Exception e) { int errCode = 2018; String msg = "Internal error. Unable to introduce the combiner for optimization."; throw new OptimizerException(msg, errCode, PigException.BUG, e); } } }