/**
 * Change the algebraic function type for algebraic functions in map and combine.
 * In map and combine the algebraic functions will be the leaves of the plans.
 *
 * @param fe the POForEach whose inner plans are updated
 * @param type the algebraic function type to set
 * @throws PlanException
 */
private static void changeFunc(POForEach fe, byte type) throws PlanException {
    for (PhysicalPlan plan : fe.getInputPlans()) {
        List<PhysicalOperator> leaves = plan.getLeaves();
        if (leaves == null || leaves.size() != 1) {
            int errCode = 2019;
            String msg = "Expected to find plan with single leaf. Found "
                    + (leaves == null ? 0 : leaves.size()) + " leaves.";
            throw new PlanException(msg, errCode, PigException.BUG);
        }

        PhysicalOperator leaf = leaves.get(0);
        if (leaf instanceof POProject) {
            continue;
        }
        if (!(leaf instanceof POUserFunc)) {
            int errCode = 2020;
            String msg = "Expected to find plan with UDF or project leaf. Found "
                    + leaf.getClass().getSimpleName();
            throw new PlanException(msg, errCode, PigException.BUG);
        }

        POUserFunc func = (POUserFunc) leaf;
        try {
            func.setAlgebraicFunction(type);
        } catch (ExecException e) {
            int errCode = 2075;
            String msg = "Could not set algebraic function type.";
            throw new PlanException(msg, errCode, PigException.BUG, e);
        }
    }
}
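// A minimal usage sketch, not part of the original source: it assumes map-side and
// combine-side POForEach operators ("mapForEach", "combineForEach") built elsewhere by
// the combiner optimizer, and the POUserFunc.INITIAL / POUserFunc.INTERMEDIATE
// algebraic-phase constants; the helper name and variables are illustrative only.
private static void markAlgebraicPhases(POForEach mapForEach, POForEach combineForEach)
        throws PlanException {
    // map-side foreach should invoke the Initial implementation of each algebraic UDF
    changeFunc(mapForEach, POUserFunc.INITIAL);
    // combine-side foreach should invoke the Intermediate implementation
    changeFunc(combineForEach, POUserFunc.INTERMEDIATE);
}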
@Override
public void visitProject(POProject proj) throws VisitorException {
    // check if this project is preceded by PODistinct and
    // has the return type bag
    List<PhysicalOperator> preds = mPlan.getPredecessors(proj);
    if (preds == null) return; // this is a leaf project and so not interesting for patching
    PhysicalOperator pred = preds.get(0);
    if (preds.size() == 1 && pred instanceof PODistinct) {
        if (distinct != null) {
            // we should not already have been patched since the
            // Project-Distinct pair should occur only once
            int errCode = 2076;
            String msg = "Unexpected Project-Distinct pair while trying to set up plans for use with combiner.";
            throw new OptimizerException(msg, errCode, PigException.BUG);
        }
        // we have to stick in the POUserFunc(org.apache.pig.builtin.Distinct)[DataBag]
        // in place of the Project-PODistinct pair
        PhysicalOperator distinctPredecessor = mPlan.getPredecessors(pred).get(0);

        POUserFunc func = null;
        try {
            String scope = proj.getOperatorKey().scope;
            List<PhysicalOperator> funcInput = new ArrayList<PhysicalOperator>();
            FuncSpec fSpec = new FuncSpec(DISTINCT_UDF_CLASSNAME);
            funcInput.add(distinctPredecessor);
            // explicitly set distinctPredecessor's result type to
            // be tuple - this is relevant when distinctPredecessor is
            // originally a POForeach with return type BAG - we need to
            // set it to tuple so we get a stream of tuples.
            distinctPredecessor.setResultType(DataType.TUPLE);
            func = new POUserFunc(
                    new OperatorKey(scope, NodeIdGenerator.getGenerator().getNextNodeId(scope)),
                    -1, funcInput, fSpec);
            func.setResultType(DataType.BAG);
            mPlan.replace(proj, func);
            mPlan.remove(pred);
            // connect the newly added "func" to
            // the predecessor of the earlier PODistinct
            mPlan.connect(distinctPredecessor, func);
        } catch (PlanException e) {
            int errCode = 2077;
            String msg = "Problem with reconfiguring plan to add distinct built-in function.";
            throw new OptimizerException(msg, errCode, PigException.BUG, e);
        }
        distinct = func;
    }
}
/**
 * The reduce function, which packages the key and List<Tuple> into key, Bag<Tuple>
 * after converting the Hadoop key type into the Pig type. The packaged result is collected
 * as-is if the reduce plan is empty, or after passing through the reduce plan otherwise.
 */
@Override
protected void reduce(PigNullableWritable key, Iterable<NullableTuple> tupIter, Context context)
        throws IOException, InterruptedException {
    if (!initialized) {
        initialized = true;

        // cache the collector for use in runPipeline()
        // which could additionally be called from close()
        this.outputCollector = context;
        pigReporter.setRep(context);
        PhysicalOperator.setReporter(pigReporter);

        boolean aggregateWarning =
                "true".equalsIgnoreCase(pigContext.getProperties().getProperty("aggregate.warning"));
        PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance();
        pigStatusReporter.setContext(new MRTaskContext(context));
        PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
        pigHadoopLogger.setReporter(pigStatusReporter);
        pigHadoopLogger.setAggregate(aggregateWarning);
        PhysicalOperator.setPigLogger(pigHadoopLogger);

        if (!inIllustrator) {
            for (POStore store : stores) {
                MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context);
                store.setStoreImpl(impl);
                store.setUp();
            }
        }
    }

    // In the case we optimize the join, we combine
    // POPackage and POForeach - so we could get many
    // tuples out of the getNext() call of POJoinPackage.
    // In this case, we process till we see EOP from
    // POJoinPackage.getNext()
    if (pack.getPkgr() instanceof JoinPackager) {
        pack.attachInput(key, tupIter.iterator());
        while (true) {
            if (processOnePackageOutput(context)) break;
        }
    } else {
        // join is not optimized, so package will
        // give only one tuple out for the key
        pack.attachInput(key, tupIter.iterator());
        processOnePackageOutput(context);
    }
}
/**
 * @param op the operator whose inputs are replaced by a single POProject
 * @param plan the plan in which op lives
 * @param index the column the new POProject should project
 * @throws PlanException
 */
private static void setProjectInput(PhysicalOperator op, PhysicalPlan plan, int index)
        throws PlanException {
    String scope = op.getOperatorKey().scope;
    POProject proj = new POProject(
            new OperatorKey(scope, NodeIdGenerator.getGenerator().getNextNodeId(scope)),
            op.getRequestedParallelism(), index);
    proj.setResultType(DataType.BAG);
    // Remove old connections and elements from the plan
    plan.trimAbove(op);
    plan.add(proj);
    plan.connect(proj, op);
    List<PhysicalOperator> inputs = Lists.newArrayList();
    inputs.add(proj);
    op.setInputs(inputs);
}
/**
 * Stolen from JobControlCompiler. TODO: refactor to share this code.
 *
 * @param physicalPlan
 * @param poLoad
 * @param jobConf
 * @return the JobConf with the loader configuration set
 * @throws java.io.IOException
 */
private static JobConf configureLoader(PhysicalPlan physicalPlan, POLoad poLoad, JobConf jobConf)
        throws IOException {
    // This part seems unused.
    Job job = new Job(jobConf);
    LoadFunc loadFunc = poLoad.getLoadFunc();
    loadFunc.setLocation(poLoad.getLFile().getFileName(), job);

    // stolen from JobControlCompiler
    ArrayList<FileSpec> pigInputs = new ArrayList<FileSpec>();
    // Store the input filespecs
    pigInputs.add(poLoad.getLFile());
    ArrayList<List<OperatorKey>> inpTargets = Lists.newArrayList();
    ArrayList<String> inpSignatures = Lists.newArrayList();
    ArrayList<Long> inpLimits = Lists.newArrayList();

    // Store the target operators for tuples read
    // from this input
    List<PhysicalOperator> loadSuccessors = physicalPlan.getSuccessors(poLoad);
    List<OperatorKey> loadSuccessorsKeys = Lists.newArrayList();
    if (loadSuccessors != null) {
        for (PhysicalOperator loadSuccessor : loadSuccessors) {
            loadSuccessorsKeys.add(loadSuccessor.getOperatorKey());
        }
    }
    inpTargets.add(loadSuccessorsKeys);
    inpSignatures.add(poLoad.getSignature());
    inpLimits.add(poLoad.getLimit());

    jobConf.set("pig.inputs", ObjectSerializer.serialize(pigInputs));
    jobConf.set("pig.inpTargets", ObjectSerializer.serialize(inpTargets));
    jobConf.set("pig.inpSignatures", ObjectSerializer.serialize(inpSignatures));
    jobConf.set("pig.inpLimits", ObjectSerializer.serialize(inpLimits));

    return jobConf;
}
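// A minimal usage sketch, not part of the original source: configure every POLoad found
// at the roots of the plan before submitting the job. The helper name, and the assumption
// that loads sit at the plan roots, are illustrative only.
private static JobConf configureAllLoaders(PhysicalPlan plan, JobConf conf) throws IOException {
    for (PhysicalOperator root : plan.getRoots()) {
        if (root instanceof POLoad) {
            // each load contributes its input spec, targets, signature and limit to the conf
            conf = configureLoader(plan, (POLoad) root, conf);
        }
    }
    return conf;
}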
/**
 * Recursively clone op and its predecessors from pplan and add them to newplan.
 *
 * @param op the operator to clone
 * @param pplan the plan to clone from
 * @param newplan the plan to add the clones to
 * @return the clone of op that was added to newplan
 * @throws CloneNotSupportedException
 * @throws PlanException
 */
private static PhysicalOperator addPredecessorsToPlan(
        PhysicalOperator op, PhysicalPlan pplan, PhysicalPlan newplan)
        throws CloneNotSupportedException, PlanException {
    PhysicalOperator newOp = op.clone();
    newplan.add(newOp);
    if (pplan.getPredecessors(op) == null || pplan.getPredecessors(op).size() == 0) {
        return newOp;
    }
    for (PhysicalOperator pred : pplan.getPredecessors(op)) {
        PhysicalOperator newPred = addPredecessorsToPlan(pred, pplan, newplan);
        newplan.connect(newPred, newOp);
    }
    return newOp;
}
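// A minimal usage sketch, not part of the original source: clone the single leaf of an
// existing plan together with all of its ancestors into a fresh plan. The helper and
// variable names are illustrative only.
private static PhysicalPlan cloneUpToLeaf(PhysicalPlan originalPlan)
        throws CloneNotSupportedException, PlanException {
    PhysicalPlan newPlan = new PhysicalPlan();
    PhysicalOperator leaf = originalPlan.getLeaves().get(0);
    // the returned operator is the clone of the leaf, wired to its cloned predecessors
    addPredecessorsToPlan(leaf, originalPlan, newPlan);
    return newPlan;
}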
@Override
protected void runPipeline(PhysicalOperator leaf) throws IOException, InterruptedException {
    while (true) {
        Result res = leaf.getNextTuple();

        if (res.returnStatus == POStatus.STATUS_OK) {
            // For POPartitionRearrange, the result is a bag.
            // This operator is used for skewed join
            if (res.result instanceof DataBag) {
                Iterator<Tuple> its = ((DataBag) res.result).iterator();
                while (its.hasNext()) {
                    collect(outputCollector, its.next());
                }
            } else {
                collect(outputCollector, (Tuple) res.result);
            }
            continue;
        }

        if (res.returnStatus == POStatus.STATUS_EOP) {
            return;
        }

        if (res.returnStatus == POStatus.STATUS_NULL) {
            continue;
        }

        if (res.returnStatus == POStatus.STATUS_ERR) {
            // remember that we had an issue so that in
            // close() we can do the right thing
            errorInMap = true;
            // if there is an errmessage use it
            String errMsg;
            if (res.result != null) {
                errMsg = "Received Error while processing the map plan: " + res.result;
            } else {
                errMsg = "Received Error while processing the map plan.";
            }
            int errCode = 2055;
            throw new ExecException(errMsg, errCode, PigException.BUG);
        }
    }
}
/**
 * Will be called once all the intermediate keys and values are processed, so this is the
 * right place to stop the reporter thread.
 */
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    super.cleanup(context);

    if (errorInReduce) {
        // there was an error in reduce - just return
        return;
    }

    if (PigMapReduce.sJobConfInternal.get().get("pig.stream.in.reduce", "false").equals("true")) {
        // If there is a stream in the pipeline we could
        // potentially have more to process - so let's
        // set the flag stating that all map input has been sent
        // already and then run the pipeline one more time.
        // This will result in nothing happening in the case
        // where there is no stream in the pipeline
        rp.endOfAllInput = true;
        runPipeline(leaf);
    }

    if (!inIllustrator) {
        for (POStore store : stores) {
            if (!initialized) {
                MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context);
                store.setStoreImpl(impl);
                store.setUp();
            }
            store.tearDown();
        }
    }

    // Calling EvalFunc.finish()
    UDFFinishVisitor finisher = new UDFFinishVisitor(rp,
            new DependencyOrderWalker<PhysicalOperator, PhysicalPlan>(rp));
    try {
        finisher.visit();
    } catch (VisitorException e) {
        throw new IOException("Error trying to finish UDFs", e);
    }

    PhysicalOperator.setReporter(null);
    initialized = false;
}
/**
 * @param leaf the leaf operator of the reduce plan to pump tuples from
 * @throws InterruptedException
 * @throws IOException
 */
protected void runPipeline(PhysicalOperator leaf) throws InterruptedException, IOException {
    while (true) {
        Result redRes = leaf.getNextTuple();

        if (redRes.returnStatus == POStatus.STATUS_OK) {
            try {
                outputCollector.write(null, (Tuple) redRes.result);
            } catch (Exception e) {
                throw new IOException(e);
            }
            continue;
        }

        if (redRes.returnStatus == POStatus.STATUS_EOP) {
            return;
        }

        if (redRes.returnStatus == POStatus.STATUS_NULL) {
            continue;
        }

        if (redRes.returnStatus == POStatus.STATUS_ERR) {
            // remember that we had an issue so that in
            // close() we can do the right thing
            errorInReduce = true;
            // if there is an errmessage use it
            String msg;
            if (redRes.result != null) {
                msg = "Received Error while processing the reduce plan: " + redRes.result;
            } else {
                msg = "Received Error while processing the reduce plan.";
            }
            int errCode = 2090;
            throw new ExecException(msg, errCode, PigException.BUG);
        }
    }
}
/**
 * The reduce function, which packages the key and List<Tuple> into key, Bag<Tuple>
 * after converting the Hadoop key type into the Pig type. The packaged result is collected
 * as-is if the reduce plan is empty, or after passing through the reduce plan otherwise.
 */
@Override
protected void reduce(PigNullableWritable key, Iterable<NullableTuple> tupIter, Context context)
        throws IOException, InterruptedException {
    if (!initialized) {
        initialized = true;

        // cache the collector for use in runPipeline()
        // which could additionally be called from close()
        this.outputCollector = context;
        pigReporter.setRep(context);
        PhysicalOperator.setReporter(pigReporter);

        boolean aggregateWarning =
                "true".equalsIgnoreCase(pigContext.getProperties().getProperty("aggregate.warning"));
        PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance();
        pigStatusReporter.setContext(new MRTaskContext(context));
        PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
        pigHadoopLogger.setReporter(pigStatusReporter);
        pigHadoopLogger.setAggregate(aggregateWarning);
        PhysicalOperator.setPigLogger(pigHadoopLogger);

        for (POStore store : stores) {
            MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context);
            store.setStoreImpl(impl);
            store.setUp();
        }
    }

    // If the keyType is not a tuple, the MapWithComparator.collect()
    // would have wrapped the key into a tuple so that the
    // comparison UDF used in the order by can process it.
    // We need to unwrap the key out of the tuple and hand it
    // to the POPackage for processing
    if (keyType != DataType.TUPLE) {
        Tuple t = (Tuple) (key.getValueAsPigType());
        try {
            key = HDataType.getWritableComparableTypes(t.get(0), keyType);
        } catch (ExecException e) {
            throw e;
        }
    }

    pack.attachInput(key, tupIter.iterator());

    Result res = pack.getNextTuple();
    if (res.returnStatus == POStatus.STATUS_OK) {
        Tuple packRes = (Tuple) res.result;
        if (rp.isEmpty()) {
            context.write(null, packRes);
            return;
        }
        rp.attachInput(packRes);
        List<PhysicalOperator> leaves = rp.getLeaves();
        PhysicalOperator leaf = leaves.get(0);
        runPipeline(leaf);
    }

    if (res.returnStatus == POStatus.STATUS_NULL) {
        return;
    }

    if (res.returnStatus == POStatus.STATUS_ERR) {
        int errCode = 2093;
        String msg = "Encountered error in package operator while processing group.";
        throw new ExecException(msg, errCode, PigException.BUG);
    }
}
@Override
protected void execute(LogicalExpression op) throws FrontendException {
    if (op instanceof UserFuncExpression) {
        UserFuncExpression udf = (UserFuncExpression) op;
        if (!udf.getEvalFunc().allowCompileTimeCalculation()) {
            return;
        }
    }

    boolean valSet = false;
    Object val = null;
    if (currentWalker.getPlan().getSuccessors(op) != null) {
        // If the expression has successors and all of them are constant,
        // calculate the constant value
        for (Operator succ : currentWalker.getPlan().getSuccessors(op)) {
            if (!(succ instanceof ConstantExpression)) {
                return;
            }
        }
        // All successors are constant, calculate the value
        OperatorPlan expLogicalPlan = new LogicalExpressionPlan();
        ((BaseOperatorPlan) currentWalker.getPlan())
                .moveTree(op, (BaseOperatorPlan) expLogicalPlan);
        PhysicalPlan expPhysicalPlan = new PhysicalPlan();
        Map<Operator, PhysicalOperator> logToPhyMap = new HashMap<Operator, PhysicalOperator>();
        PlanWalker childWalker = new ReverseDependencyOrderWalkerWOSeenChk(expLogicalPlan);

        // Save the old walker and use childWalker as current Walker
        pushWalker(childWalker);
        ExpToPhyTranslationVisitor expTranslationVisitor = new ExpToPhyTranslationVisitor(
                expLogicalPlan, childWalker, currentOp, expPhysicalPlan, logToPhyMap);
        expTranslationVisitor.visit();
        popWalker();

        PhysicalOperator root = expPhysicalPlan.getLeaves().get(0);
        try {
            UDFContext.getUDFContext()
                    .addJobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), true));
            PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
            PhysicalOperator.setPigLogger(pigHadoopLogger);
            setDefaultTimeZone();
            val = root.getNext(root.getResultType()).result;
            restoreDefaultTimeZone();
            UDFContext.getUDFContext().addJobConf(null);
        } catch (ExecException e) {
            throw new FrontendException(e);
        }
        valSet = true;
    } else if (op instanceof UserFuncExpression) {
        // If solo UDF, calculate UDF
        UserFuncExpression udf = (UserFuncExpression) op;
        try {
            UDFContext.getUDFContext()
                    .addJobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), true));
            setDefaultTimeZone();
            val = udf.getEvalFunc().exec(null);
            restoreDefaultTimeZone();
            UDFContext.getUDFContext().addJobConf(null);
        } catch (IOException e) {
            throw new FrontendException(e);
        }
        valSet = true;
    }

    if (valSet) {
        ConstantExpression constantExpr;
        constantExpr = new ConstantExpression(currentWalker.getPlan(), val);
        constantExpr.inheritSchema(op);
        currentWalker.getPlan().replace(op, constantExpr);
    }
}