Example #1
  /**
   * Change the algebraic function type for algebraic functions in map and combine. In map and
   * combine the algebraic functions will be the leaves of the plan.
   *
   * @param fe the foreach operator whose inner plans end in the algebraic functions
   * @param type the algebraic function type to set on each UDF leaf
   * @throws PlanException if an inner plan does not end in a single UDF or project leaf
   */
  private static void changeFunc(POForEach fe, byte type) throws PlanException {
    for (PhysicalPlan plan : fe.getInputPlans()) {
      List<PhysicalOperator> leaves = plan.getLeaves();
      if (leaves == null || leaves.size() != 1) {
        int errCode = 2019;
        String msg = "Expected to find plan with single leaf. Found " + leaves.size() + " leaves.";
        throw new PlanException(msg, errCode, PigException.BUG);
      }

      PhysicalOperator leaf = leaves.get(0);
      if (leaf instanceof POProject) {
        continue;
      }
      if (!(leaf instanceof POUserFunc)) {
        int errCode = 2020;
        String msg =
            "Expected to find plan with UDF or project leaf. Found "
                + leaf.getClass().getSimpleName();
        throw new PlanException(msg, errCode, PigException.BUG);
      }

      POUserFunc func = (POUserFunc) leaf;
      try {
        func.setAlgebraicFunction(type);
      } catch (ExecException e) {
        int errCode = 2075;
        String msg = "Could not set algebraic function type.";
        throw new PlanException(msg, errCode, PigException.BUG, e);
      }
    }
  }
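
A minimal usage sketch for the helper above, assuming mapFE and combineFE are hypothetical POForEach operators already placed in the map and combine plans, and assuming POUserFunc exposes byte constants such as INITIAL and INTERMEDIATE for the algebraic stages (neither name appears in this snippet):

  // Hypothetical call site in a combiner-style rewrite: map-side UDFs should
  // produce the initial partial results, combine-side UDFs should merge them.
  changeFunc(mapFE, POUserFunc.INITIAL);
  changeFunc(combineFE, POUserFunc.INTERMEDIATE);
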
Example #2
    @Override
    public void visitProject(POProject proj) throws VisitorException {
      // check if this project is preceded by PODistinct and
      // has the return type bag

      List<PhysicalOperator> preds = mPlan.getPredecessors(proj);
      if (preds == null) return; // this is a leaf project and so not interesting for patching
      PhysicalOperator pred = preds.get(0);
      if (preds.size() == 1 && pred instanceof PODistinct) {
        if (distinct != null) {
          // we should not already have been patched since the
          // Project-Distinct pair should occur only once
          int errCode = 2076;
          String msg =
              "Unexpected Project-Distinct pair while trying to set up plans for use with combiner.";
          throw new OptimizerException(msg, errCode, PigException.BUG);
        }
        // we have to stick in the POUserFunc(org.apache.pig.builtin.Distinct)[DataBag]
        // in place of the Project-PODistinct pair
        PhysicalOperator distinctPredecessor = mPlan.getPredecessors(pred).get(0);
        POUserFunc func = null;

        try {
          String scope = proj.getOperatorKey().scope;
          List<PhysicalOperator> funcInput = new ArrayList<PhysicalOperator>();
          FuncSpec fSpec = new FuncSpec(DISTINCT_UDF_CLASSNAME);
          funcInput.add(distinctPredecessor);
          // explicitly set distinctPredecessor's result type to
          // be tuple - this is relevant when distinctPredecessor is
          // originally a POForeach with return type BAG - we need to
          // set it to tuple so we get a stream of tuples.
          distinctPredecessor.setResultType(DataType.TUPLE);
          func =
              new POUserFunc(
                  new OperatorKey(scope, NodeIdGenerator.getGenerator().getNextNodeId(scope)),
                  -1,
                  funcInput,
                  fSpec);
          func.setResultType(DataType.BAG);
          mPlan.replace(proj, func);
          mPlan.remove(pred);
          // connect the newly added "func" to
          // the predecessor of the earlier PODistinct
          mPlan.connect(distinctPredecessor, func);
        } catch (PlanException e) {
          int errCode = 2077;
          String msg = "Problem with reconfiguring plan to add distinct built-in function.";
          throw new OptimizerException(msg, errCode, PigException.BUG, e);
        }
        distinct = func;
      }
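      // Schematic of the patch performed above (illustration only, not from the source):
      //   before:  ... -> distinctPredecessor -> PODistinct -> POProject (bag) -> ...
      //   after:   ... -> distinctPredecessor (result type TUPLE) -> POUserFunc(Distinct) [BAG] -> ...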
    }
    /**
     * The reduce function which packages the key and List&lt;Tuple&gt; into key, Bag&lt;Tuple&gt;
     * after converting the Hadoop key type into the Pig type. The package result is either
     * collected as is, if the reduce plan is empty, or after passing through the reduce plan.
     */
    @Override
    protected void reduce(PigNullableWritable key, Iterable<NullableTuple> tupIter, Context context)
        throws IOException, InterruptedException {

      if (!initialized) {
        initialized = true;

        // cache the collector for use in runPipeline()
        // which could additionally be called from close()
        this.outputCollector = context;
        pigReporter.setRep(context);
        PhysicalOperator.setReporter(pigReporter);

        boolean aggregateWarning =
            "true".equalsIgnoreCase(pigContext.getProperties().getProperty("aggregate.warning"));
        PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance();
        pigStatusReporter.setContext(new MRTaskContext(context));
        PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
        pigHadoopLogger.setReporter(pigStatusReporter);
        pigHadoopLogger.setAggregate(aggregateWarning);
        PhysicalOperator.setPigLogger(pigHadoopLogger);

        if (!inIllustrator)
          for (POStore store : stores) {
            MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context);
            store.setStoreImpl(impl);
            store.setUp();
          }
      }

      // In the case where we optimize the join, we combine
      // POPackage and POForeach - so we could get many
      // tuples out of the getNext() call of POJoinPackage.
      // In this case, we process till we see EOP from
      // POJoinPackage.getNext()
      if (pack.getPkgr() instanceof JoinPackager) {
        pack.attachInput(key, tupIter.iterator());
        while (true) {
          if (processOnePackageOutput(context)) break;
        }
      } else {
        // join is not optimized, so package will
        // give only one tuple out for the key
        pack.attachInput(key, tupIter.iterator());
        processOnePackageOutput(context);
      }
    }
Example #4
 /**
  * Replaces the inputs of op with a single POProject of the given column.
  *
  * @param op the operator whose inputs are replaced
  * @param plan the plan containing op
  * @param index the column the new POProject should project
  * @throws PlanException if the plan cannot be rewired
  */
 private static void setProjectInput(PhysicalOperator op, PhysicalPlan plan, int index)
     throws PlanException {
   String scope = op.getOperatorKey().scope;
   POProject proj =
       new POProject(
           new OperatorKey(scope, NodeIdGenerator.getGenerator().getNextNodeId(scope)),
           op.getRequestedParallelism(),
           index);
   proj.setResultType(DataType.BAG);
   // Remove old connections and elements from the plan
   plan.trimAbove(op);
   plan.add(proj);
   plan.connect(proj, op);
   List<PhysicalOperator> inputs = Lists.newArrayList();
   inputs.add(proj);
   op.setInputs(inputs);
 }
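
A hedged usage sketch, with udfLeaf, plan and pos as hypothetical stand-ins for an operator in a physical plan and the column it should read:

  // Make a POProject of column pos (result type BAG) the sole input of udfLeaf,
  // dropping whatever operators fed it before (trimAbove removes them from plan).
  setProjectInput(udfLeaf, plan, pos);
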
Example #5
  /**
   * Stolen from JobControlCompiler. TODO: refactor so this code can be shared.
   *
   * @param physicalPlan the plan containing poLoad
   * @param poLoad the load operator to configure
   * @param jobConf the job configuration to populate
   * @return the same jobConf with the serialized load metadata set
   * @throws java.io.IOException if the load metadata cannot be serialized
   */
  private static JobConf configureLoader(PhysicalPlan physicalPlan, POLoad poLoad, JobConf jobConf)
      throws IOException {

    // This part does not seem to be used
    Job job = new Job(jobConf);
    LoadFunc loadFunc = poLoad.getLoadFunc();
    loadFunc.setLocation(poLoad.getLFile().getFileName(), job);

    // stolen from JobControlCompiler
    ArrayList<FileSpec> pigInputs = new ArrayList<FileSpec>();
    // Store the inp filespecs
    pigInputs.add(poLoad.getLFile());

    ArrayList<List<OperatorKey>> inpTargets = Lists.newArrayList();
    ArrayList<String> inpSignatures = Lists.newArrayList();
    ArrayList<Long> inpLimits = Lists.newArrayList();

    // Store the target operators for tuples read
    // from this input
    List<PhysicalOperator> loadSuccessors = physicalPlan.getSuccessors(poLoad);
    List<OperatorKey> loadSuccessorsKeys = Lists.newArrayList();
    if (loadSuccessors != null) {
      for (PhysicalOperator loadSuccessor : loadSuccessors) {
        loadSuccessorsKeys.add(loadSuccessor.getOperatorKey());
      }
    }

    inpTargets.add(loadSuccessorsKeys);
    inpSignatures.add(poLoad.getSignature());
    inpLimits.add(poLoad.getLimit());

    jobConf.set("pig.inputs", ObjectSerializer.serialize(pigInputs));
    jobConf.set("pig.inpTargets", ObjectSerializer.serialize(inpTargets));
    jobConf.set("pig.inpSignatures", ObjectSerializer.serialize(inpSignatures));
    jobConf.set("pig.inpLimits", ObjectSerializer.serialize(inpLimits));

    return jobConf;
  }
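
A sketch of how the serialized entries could be read back on the task side with the same ObjectSerializer helper; whether the consuming code does exactly this is an assumption, but the keys and element types mirror what is written above:

    // Unchecked casts mirror the element types serialized in configureLoader().
    ArrayList<FileSpec> inputs =
        (ArrayList<FileSpec>) ObjectSerializer.deserialize(jobConf.get("pig.inputs"));
    ArrayList<List<OperatorKey>> targets =
        (ArrayList<List<OperatorKey>>) ObjectSerializer.deserialize(jobConf.get("pig.inpTargets"));
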
Example #6
 /**
  * Recursively clone op and its predecessors from pplan and add them to newplan.
  *
  * @param op the operator to clone
  * @param pplan the plan to clone from
  * @param newplan the plan to which the clones are added
  * @return the clone of op that was added to newplan
  * @throws CloneNotSupportedException if an operator cannot be cloned
  * @throws PlanException if the clones cannot be connected in newplan
  */
 private static PhysicalOperator addPredecessorsToPlan(
     PhysicalOperator op, PhysicalPlan pplan, PhysicalPlan newplan)
     throws CloneNotSupportedException, PlanException {
   PhysicalOperator newOp = op.clone();
   newplan.add(newOp);
   List<PhysicalOperator> preds = pplan.getPredecessors(op);
   if (preds == null || preds.isEmpty()) {
     return newOp;
   }
   for (PhysicalOperator pred : preds) {
     PhysicalOperator newPred = addPredecessorsToPlan(pred, pplan, newplan);
     newplan.connect(newPred, newOp);
   }
   return newOp;
 }
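
A small sketch of the intended use, with origPlan and leaf as hypothetical stand-ins for an existing PhysicalPlan and one of its operators:

   // Clone leaf and everything upstream of it into a fresh plan; the returned
   // operator is the clone of leaf and becomes the leaf of newPlan.
   PhysicalPlan newPlan = new PhysicalPlan();
   PhysicalOperator newLeaf = addPredecessorsToPlan(leaf, origPlan, newPlan);
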
    @Override
    protected void runPipeline(PhysicalOperator leaf) throws IOException, InterruptedException {

      while (true) {
        Result res = leaf.getNextTuple();

        if (res.returnStatus == POStatus.STATUS_OK) {
          // For POPartitionRearrange, the result is a bag.
          // This operator is used for skewed join
          if (res.result instanceof DataBag) {
            Iterator<Tuple> its = ((DataBag) res.result).iterator();
            while (its.hasNext()) {
              collect(outputCollector, its.next());
            }
          } else {
            collect(outputCollector, (Tuple) res.result);
          }
          continue;
        }

        if (res.returnStatus == POStatus.STATUS_EOP) {
          return;
        }

        if (res.returnStatus == POStatus.STATUS_NULL) {
          continue;
        }

        if (res.returnStatus == POStatus.STATUS_ERR) {
          // remember that we had an issue so that in
          // close() we can do the right thing
          errorInMap = true;
          // if there is an errmessage use it
          String errMsg;
          if (res.result != null) {
            errMsg = "Received Error while " + "processing the map plan: " + res.result;
          } else {
            errMsg = "Received Error while " + "processing the map plan.";
          }

          int errCode = 2055;
          throw new ExecException(errMsg, errCode, PigException.BUG);
        }
      }
    }
    /**
     * Will be called once all the intermediate keys and values are processed, so this is the
     * right place to stop the reporter thread.
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
      super.cleanup(context);

      if (errorInReduce) {
        // there was an error in reduce - just return
        return;
      }

      if (PigMapReduce.sJobConfInternal.get().get("pig.stream.in.reduce", "false").equals("true")) {
        // If there is a stream in the pipeline we could
        // potentially have more to process - so let's
        // set the flag stating that all map input has already
        // been sent and then run the pipeline one more time.
        // This will result in nothing happening in the case
        // where there is no stream in the pipeline
        rp.endOfAllInput = true;
        runPipeline(leaf);
      }

      if (!inIllustrator) {
        for (POStore store : stores) {
          if (!initialized) {
            MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context);
            store.setStoreImpl(impl);
            store.setUp();
          }
          store.tearDown();
        }
      }

      // Calling EvalFunc.finish()
      UDFFinishVisitor finisher =
          new UDFFinishVisitor(rp, new DependencyOrderWalker<PhysicalOperator, PhysicalPlan>(rp));
      try {
        finisher.visit();
      } catch (VisitorException e) {
        throw new IOException("Error trying to finish UDFs", e);
      }

      PhysicalOperator.setReporter(null);
      initialized = false;
    }
    /**
     * Runs the reduce plan by pulling tuples from its leaf and writing them to the output
     * collector.
     *
     * @param leaf the leaf operator of the reduce plan
     * @throws InterruptedException if writing to the output collector is interrupted
     * @throws IOException if the pipeline reports an error
     */
    protected void runPipeline(PhysicalOperator leaf) throws InterruptedException, IOException {

      while (true) {
        Result redRes = leaf.getNextTuple();
        if (redRes.returnStatus == POStatus.STATUS_OK) {
          try {
            outputCollector.write(null, (Tuple) redRes.result);
          } catch (Exception e) {
            throw new IOException(e);
          }
          continue;
        }

        if (redRes.returnStatus == POStatus.STATUS_EOP) {
          return;
        }

        if (redRes.returnStatus == POStatus.STATUS_NULL) {
          continue;
        }

        if (redRes.returnStatus == POStatus.STATUS_ERR) {
          // remember that we had an issue so that in
          // close() we can do the right thing
          errorInReduce = true;
          // if there is an errmessage use it
          String msg;
          if (redRes.result != null) {
            msg = "Received Error while " + "processing the reduce plan: " + redRes.result;
          } else {
            msg = "Received Error while " + "processing the reduce plan.";
          }
          int errCode = 2090;
          throw new ExecException(msg, errCode, PigException.BUG);
        }
      }
    }
Example #10
    /**
     * The reduce function which packages the key and List&lt;Tuple&gt; into key, Bag&lt;Tuple&gt;
     * after converting the Hadoop key type into the Pig type. The package result is either
     * collected as is, if the reduce plan is empty, or after passing through the reduce plan.
     */
    @Override
    protected void reduce(PigNullableWritable key, Iterable<NullableTuple> tupIter, Context context)
        throws IOException, InterruptedException {

      if (!initialized) {
        initialized = true;

        // cache the collector for use in runPipeline()
        // which could additionally be called from close()
        this.outputCollector = context;
        pigReporter.setRep(context);
        PhysicalOperator.setReporter(pigReporter);

        boolean aggregateWarning =
            "true".equalsIgnoreCase(pigContext.getProperties().getProperty("aggregate.warning"));
        PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance();
        pigStatusReporter.setContext(new MRTaskContext(context));
        PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
        pigHadoopLogger.setReporter(pigStatusReporter);
        pigHadoopLogger.setAggregate(aggregateWarning);
        PhysicalOperator.setPigLogger(pigHadoopLogger);

        for (POStore store : stores) {
          MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context);
          store.setStoreImpl(impl);
          store.setUp();
        }
      }

      // If the keyType is not a tuple, the MapWithComparator.collect()
      // would have wrapped the key into a tuple so that the
      // comparison UDF used in the order by can process it.
      // We need to unwrap the key out of the tuple and hand it
      // to the POPackage for processing
      if (keyType != DataType.TUPLE) {
        Tuple t = (Tuple) (key.getValueAsPigType());
        key = HDataType.getWritableComparableTypes(t.get(0), keyType);
      }

      pack.attachInput(key, tupIter.iterator());

      Result res = pack.getNextTuple();
      if (res.returnStatus == POStatus.STATUS_OK) {
        Tuple packRes = (Tuple) res.result;

        if (rp.isEmpty()) {
          context.write(null, packRes);
          return;
        }

        rp.attachInput(packRes);

        List<PhysicalOperator> leaves = rp.getLeaves();

        PhysicalOperator leaf = leaves.get(0);
        runPipeline(leaf);
      }

      if (res.returnStatus == POStatus.STATUS_NULL) {
        return;
      }

      if (res.returnStatus == POStatus.STATUS_ERR) {
        int errCode = 2093;
        String msg = "Encountered error in package operator while processing group.";
        throw new ExecException(msg, errCode, PigException.BUG);
      }
    }
      @Override
      protected void execute(LogicalExpression op) throws FrontendException {
        if (op instanceof UserFuncExpression) {
          UserFuncExpression udf = (UserFuncExpression) op;
          if (!udf.getEvalFunc().allowCompileTimeCalculation()) {
            return;
          }
        }
        boolean valSet = false;
        Object val = null;
        if (currentWalker.getPlan().getSuccessors(op) != null) {
          // If the op has successors and they are all constant, calculate the constant value
          for (Operator succ : currentWalker.getPlan().getSuccessors(op)) {
            if (!(succ instanceof ConstantExpression)) {
              return;
            }
          }
          // All successors are constant, calculate the value
          OperatorPlan expLogicalPlan = new LogicalExpressionPlan();
          ((BaseOperatorPlan) currentWalker.getPlan())
              .moveTree(op, (BaseOperatorPlan) expLogicalPlan);
          PhysicalPlan expPhysicalPlan = new PhysicalPlan();
          Map<Operator, PhysicalOperator> logToPhyMap = new HashMap<Operator, PhysicalOperator>();
          PlanWalker childWalker = new ReverseDependencyOrderWalkerWOSeenChk(expLogicalPlan);

          // Save the old walker and use childWalker as current Walker
          pushWalker(childWalker);
          ExpToPhyTranslationVisitor expTranslationVisitor =
              new ExpToPhyTranslationVisitor(
                  expLogicalPlan, childWalker, currentOp, expPhysicalPlan, logToPhyMap);
          expTranslationVisitor.visit();
          popWalker();
          PhysicalOperator root = expPhysicalPlan.getLeaves().get(0);
          try {
            UDFContext.getUDFContext()
                .addJobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), true));
            PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
            PhysicalOperator.setPigLogger(pigHadoopLogger);
            setDefaultTimeZone();
            val = root.getNext(root.getResultType()).result;
            restoreDefaultTimeZone();
            UDFContext.getUDFContext().addJobConf(null);
          } catch (ExecException e) {
            throw new FrontendException(e);
          }
          valSet = true;
        } else if (op instanceof UserFuncExpression) {
          // If solo UDF, calculate UDF
          UserFuncExpression udf = (UserFuncExpression) op;
          try {
            UDFContext.getUDFContext()
                .addJobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), true));
            setDefaultTimeZone();
            val = udf.getEvalFunc().exec(null);
            restoreDefaultTimeZone();
            UDFContext.getUDFContext().addJobConf(null);
          } catch (IOException e) {
            throw new FrontendException(e);
          }
          valSet = true;
        }
        if (valSet) {
          ConstantExpression constantExpr;
          constantExpr = new ConstantExpression(currentWalker.getPlan(), val);
          constantExpr.inheritSchema(op);
          currentWalker.getPlan().replace(op, constantExpr);
        }
      }