Beispiel #1
0
  // Tests Single input case for both blocking and non-blocking
  // with both map and reduce phases
  @Test
  public void testSim1() throws Exception {
    PhysicalPlan php = new PhysicalPlan();
    POLoad ld = GenPhyOp.topLoadOp();
    php.add(ld);
    PhysicalPlan grpChain1 = GenPhyOp.grpChain();
    php.merge(grpChain1);

    php.connect(ld, grpChain1.getRoots().get(0));

    PhysicalOperator leaf = php.getLeaves().get(0);

    PhysicalPlan grpChain2 = GenPhyOp.grpChain();
    php.merge(grpChain2);

    php.connect(leaf, grpChain2.getRoots().get(0));

    leaf = php.getLeaves().get(0);
    POFilter fl = GenPhyOp.topFilterOp();
    php.add(fl);

    php.connect(leaf, fl);

    POStore st = GenPhyOp.topStoreOp();
    php.add(st);

    php.connect(fl, st);
    run(php, "test/org/apache/pig/test/data/GoldenFiles/MRC1.gld");
  }
    /** Configures the Reduce plan, the POPackage operator and the reporter thread */
    @SuppressWarnings("unchecked")
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      super.setup(context);
      inIllustrator = inIllustrator(context);
      if (inIllustrator) pack = getPack(context);
      Configuration jConf = context.getConfiguration();
      SpillableMemoryManager.configure(ConfigurationUtil.toProperties(jConf));
      context
          .getConfiguration()
          .set(
              PigConstants.TASK_INDEX,
              Integer.toString(context.getTaskAttemptID().getTaskID().getId()));
      sJobContext = context;
      sJobConfInternal.set(context.getConfiguration());
      sJobConf = context.getConfiguration();
      try {
        PigContext.setPackageImportList(
            (ArrayList<String>) ObjectSerializer.deserialize(jConf.get("udf.import.list")));
        pigContext = (PigContext) ObjectSerializer.deserialize(jConf.get("pig.pigContext"));

        // This attempts to fetch all of the generated code from the distributed cache, and resolve
        // it
        SchemaTupleBackend.initialize(jConf, pigContext);

        if (rp == null)
          rp = (PhysicalPlan) ObjectSerializer.deserialize(jConf.get("pig.reducePlan"));
        stores = PlanHelper.getPhysicalOperators(rp, POStore.class);

        if (!inIllustrator)
          pack = (POPackage) ObjectSerializer.deserialize(jConf.get("pig.reduce.package"));
        // To be removed
        if (rp.isEmpty()) log.debug("Reduce Plan empty!");
        else {
          ByteArrayOutputStream baos = new ByteArrayOutputStream();
          rp.explain(baos);
          log.debug(baos.toString());
        }
        pigReporter = new ProgressableReporter();
        if (!(rp.isEmpty())) {
          roots = rp.getRoots().toArray(new PhysicalOperator[1]);
          leaf = rp.getLeaves().get(0);
        }

        // Get the UDF specific context
        MapRedUtil.setupUDFContext(jConf);

      } catch (IOException ioe) {
        String msg = "Problem while configuring reduce plan.";
        throw new RuntimeException(msg, ioe);
      }

      log.info(
          "Aliases being processed per job phase (AliasName[line,offset]): "
              + jConf.get("pig.alias.location"));

      Utils.setDefaultTimeZone(PigMapReduce.sJobConfInternal.get());
    }
Beispiel #3
0
  @Test(expected = MRCompilerException.class)
  public void testMRCompilerErr() throws Exception {
    String query = "a = load 'input';" + "b = filter a by $0 > 5;" + "store b into 'output';";

    PhysicalPlan pp = Util.buildPp(pigServer, query);
    pp.remove(pp.getRoots().get(0));
    try {
      Util.buildMRPlan(new PhysicalPlan(), pc);
    } catch (MRCompilerException mrce) {
      assertEquals(2053, mrce.getErrorCode());
      throw mrce;
    }
  }
Beispiel #4
0
  @Test
  public void testMergeJoin() throws Exception {
    String query =
        "a = load '/tmp/input1';"
            + "b = load '/tmp/input2';"
            + "c = join a by $0, b by $0 using 'merge';"
            + "store c into '/tmp/output1';";

    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MRCompiler comp = new MRCompiler(pp, pc);
    comp.compile();
    MROperPlan mrp = comp.getMRPlan();
    assertTrue(mrp.size() == 2);

    MapReduceOper mrOp0 = mrp.getRoots().get(0);
    assertTrue(mrOp0.mapPlan.size() == 2);
    PhysicalOperator load0 = mrOp0.mapPlan.getRoots().get(0);
    MergeJoinIndexer func =
        (MergeJoinIndexer)
            PigContext.instantiateFuncFromSpec(((POLoad) load0).getLFile().getFuncSpec());
    Field lrField = MergeJoinIndexer.class.getDeclaredField("lr");
    lrField.setAccessible(true);
    POLocalRearrange lr = (POLocalRearrange) lrField.get(func);
    List<PhysicalPlan> innerPlans = lr.getPlans();
    PhysicalOperator localrearrange0 = mrOp0.mapPlan.getSuccessors(load0).get(0);
    assertTrue(localrearrange0 instanceof POLocalRearrange);
    assertTrue(mrOp0.reducePlan.size() == 3);
    PhysicalOperator pack0 = mrOp0.reducePlan.getRoots().get(0);
    assertTrue(pack0 instanceof POPackage);
    PhysicalOperator foreach0 = mrOp0.reducePlan.getSuccessors(pack0).get(0);
    assertTrue(foreach0 instanceof POForEach);
    PhysicalOperator store0 = mrOp0.reducePlan.getSuccessors(foreach0).get(0);
    assertTrue(store0 instanceof POStore);

    assertTrue(innerPlans.size() == 1);
    PhysicalPlan innerPlan = innerPlans.get(0);
    assertTrue(innerPlan.size() == 1);
    PhysicalOperator project = innerPlan.getRoots().get(0);
    assertTrue(project instanceof POProject);
    assertTrue(((POProject) project).getColumn() == 0);

    MapReduceOper mrOp1 = mrp.getSuccessors(mrOp0).get(0);
    assertTrue(mrOp1.mapPlan.size() == 3);
    PhysicalOperator load1 = mrOp1.mapPlan.getRoots().get(0);
    assertTrue(load1 instanceof POLoad);
    PhysicalOperator mergejoin1 = mrOp1.mapPlan.getSuccessors(load1).get(0);
    assertTrue(mergejoin1 instanceof POMergeJoin);
    PhysicalOperator store1 = mrOp1.mapPlan.getSuccessors(mergejoin1).get(0);
    assertTrue(store1 instanceof POStore);
    assertTrue(mrOp1.reducePlan.isEmpty());
  }
  @Test
  public void testMRCompilerErr() throws Exception {
    planTester.buildPlan("a = load 'input';");
    LogicalPlan lp = planTester.buildPlan("b = filter a by $0 > 5;");

    PhysicalPlan pp = Util.buildPhysicalPlan(lp, pc);
    pp.remove(pp.getRoots().get(0));
    try {
      Util.buildMRPlan(new PhysicalPlan(), pc);
      fail("Expected failure.");
    } catch (MRCompilerException mrce) {
      assertTrue(mrce.getErrorCode() == 2053);
    }
  }
  public void testDistinct1() throws Exception {
    PhysicalPlan php = new PhysicalPlan();
    PhysicalPlan ldFil1 = GenPhyOp.loadedFilter();
    php.merge(ldFil1);

    PODistinct op = new PODistinct(new OperatorKey("", r.nextLong()), -1, null);

    php.addAsLeaf(op);

    PhysicalPlan grpChain1 = GenPhyOp.grpChain();
    php.merge(grpChain1);
    php.connect(op, grpChain1.getRoots().get(0));

    PODistinct op1 = new PODistinct(new OperatorKey("", r.nextLong()), -1, null);

    php.addAsLeaf(op1);
    POStore st = GenPhyOp.topStoreOp();
    php.addAsLeaf(st);
    run(php, "test/org/apache/pig/test/data/GoldenFiles/MRC16.gld");
  }
Beispiel #7
0
  @Test
  public void testSortUDF1() throws Exception {
    PhysicalPlan php = new PhysicalPlan();
    PhysicalPlan ldFil1 = GenPhyOp.loadedFilter();
    php.merge(ldFil1);

    // set up order by *
    String funcName = WeirdComparator.class.getName();
    POUserComparisonFunc comparator =
        new POUserComparisonFunc(
            new OperatorKey("", r.nextLong()), -1, null, new FuncSpec(funcName));
    POSort sort =
        new POSort(
            new OperatorKey("", r.nextLong()),
            -1,
            ldFil1.getLeaves(),
            null,
            new ArrayList<Boolean>(),
            comparator);
    sort.setRequestedParallelism(20);
    PhysicalPlan nesSortPlan = new PhysicalPlan();
    POProject topPrj = new POProject(new OperatorKey("", r.nextLong()));
    topPrj.setColumn(1);
    topPrj.setOverloaded(true);
    topPrj.setResultType(DataType.TUPLE);
    nesSortPlan.add(topPrj);

    POProject prjStar2 = new POProject(new OperatorKey("", r.nextLong()));
    prjStar2.setResultType(DataType.TUPLE);
    prjStar2.setStar(true);
    nesSortPlan.add(prjStar2);

    nesSortPlan.connect(topPrj, prjStar2);
    List<PhysicalPlan> nesSortPlanLst = new ArrayList<PhysicalPlan>();
    nesSortPlanLst.add(nesSortPlan);

    sort.setSortPlans(nesSortPlanLst);

    php.add(sort);
    php.connect(ldFil1.getLeaves().get(0), sort);
    // have a foreach which takes the sort output
    // and send it two two udfs
    List<String> udfs = new ArrayList<String>();
    udfs.add(COUNT.class.getName());
    udfs.add(SUM.class.getName());
    POForEach fe3 = GenPhyOp.topForEachOPWithUDF(udfs);
    php.add(fe3);
    php.connect(sort, fe3);

    // add a group above the foreach
    PhysicalPlan grpChain1 = GenPhyOp.grpChain();
    php.merge(grpChain1);
    php.connect(fe3, grpChain1.getRoots().get(0));

    udfs.clear();
    udfs.add(AVG.class.getName());
    POForEach fe4 = GenPhyOp.topForEachOPWithUDF(udfs);
    php.addAsLeaf(fe4);

    PhysicalPlan grpChain2 = GenPhyOp.grpChain();
    php.merge(grpChain2);
    php.connect(fe4, grpChain2.getRoots().get(0));

    udfs.clear();
    udfs.add(GFCross.class.getName() + "('1')");
    POForEach fe5 = GenPhyOp.topForEachOPWithUDF(udfs);
    php.addAsLeaf(fe5);

    POStore st = GenPhyOp.topStoreOp();
    php.addAsLeaf(st);
    run(php, "test/org/apache/pig/test/data/GoldenFiles/MRC15.gld");
  }
Beispiel #8
0
  /**
   * Algebraic functions and distinct in nested plan of a foreach are partially computed in the map
   * and combine phase. A new foreach statement with initial and intermediate forms of algebraic
   * functions are added to map and combine plans respectively.
   *
   * <p>If bag portion of group-by result is projected or a non algebraic expression/udf has bag as
   * input, combiner will not be used. This is because the use of combiner in such case is likely to
   * degrade performance as there will not be much reduction in data size in combine stage to offset
   * the cost of the additional number of times (de)serialization is done.
   *
   * <p>Major areas for enhancement: 1. use of combiner in cogroup 2. queries with order-by, limit
   * or sort in a nested foreach after group-by 3. case where group-by is followed by filter that
   * has algebraic expression
   */
  public static void addCombiner(
      PhysicalPlan mapPlan,
      PhysicalPlan reducePlan,
      PhysicalPlan combinePlan,
      CompilationMessageCollector messageCollector,
      boolean doMapAgg)
      throws VisitorException {

    // part one - check if this MR job represents a group-by + foreach. Find
    // the POLocalRearrange in the map. I'll need it later.
    List<PhysicalOperator> mapLeaves = mapPlan.getLeaves();
    if (mapLeaves == null || mapLeaves.size() != 1) {
      messageCollector.collect(
          "Expected map to have single leaf", MessageType.Warning, PigWarning.MULTI_LEAF_MAP);
      return;
    }
    PhysicalOperator mapLeaf = mapLeaves.get(0);
    if (!(mapLeaf instanceof POLocalRearrange)) {
      return;
    }
    POLocalRearrange rearrange = (POLocalRearrange) mapLeaf;

    List<PhysicalOperator> reduceRoots = reducePlan.getRoots();
    if (reduceRoots.size() != 1) {
      messageCollector.collect(
          "Expected reduce to have single root", MessageType.Warning, PigWarning.MULTI_ROOT_REDUCE);
      return;
    }

    // I expect that the first root should always be a POPackage. If not, I
    // don't know what's going on, so I'm out of here.
    PhysicalOperator root = reduceRoots.get(0);
    if (!(root instanceof POPackage)) {
      messageCollector.collect(
          "Expected reduce root to be a POPackage",
          MessageType.Warning,
          PigWarning.NON_PACKAGE_REDUCE_PLAN_ROOT);
      return;
    }
    POPackage pack = (POPackage) root;

    List<PhysicalOperator> packSuccessors = reducePlan.getSuccessors(root);
    if (packSuccessors == null || packSuccessors.size() != 1) {
      return;
    }
    PhysicalOperator successor = packSuccessors.get(0);

    if (successor instanceof POLimit) {
      // POLimit is acceptable, as long has it has a single foreach as
      // successor
      List<PhysicalOperator> limitSucs = reducePlan.getSuccessors(successor);
      if (limitSucs != null && limitSucs.size() == 1 && limitSucs.get(0) instanceof POForEach) {
        // the code below will now further examine the foreach
        successor = limitSucs.get(0);
      }
    }
    if (successor instanceof POForEach) {
      POForEach foreach = (POForEach) successor;
      List<PhysicalPlan> feInners = foreach.getInputPlans();

      // find algebraic operators and also check if the foreach statement
      // is suitable for combiner use
      List<Pair<PhysicalOperator, PhysicalPlan>> algebraicOps = findAlgebraicOps(feInners);
      if (algebraicOps == null || algebraicOps.size() == 0) {
        // the plan is not combinable or there is nothing to combine
        // we're done
        return;
      }
      if (combinePlan != null && combinePlan.getRoots().size() != 0) {
        messageCollector.collect(
            "Wasn't expecting to find anything already " + "in the combiner!",
            MessageType.Warning,
            PigWarning.NON_EMPTY_COMBINE_PLAN);
        return;
      }

      LOG.info("Choosing to move algebraic foreach to combiner");
      try {
        // replace PODistinct->Project[*] with distinct udf (which is Algebraic)
        for (Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps) {
          if (!(op2plan.first instanceof PODistinct)) {
            continue;
          }
          DistinctPatcher distinctPatcher = new DistinctPatcher(op2plan.second);
          distinctPatcher.visit();
          if (distinctPatcher.getDistinct() == null) {
            int errCode = 2073;
            String msg =
                "Problem with replacing distinct operator with distinct built-in function.";
            throw new PlanException(msg, errCode, PigException.BUG);
          }
          op2plan.first = distinctPatcher.getDistinct();
        }

        // create new map foreach
        POForEach mfe = createForEachWithGrpProj(foreach, rearrange.getKeyType());
        Map<PhysicalOperator, Integer> op2newpos = Maps.newHashMap();
        Integer pos = 1;
        // create plan for each algebraic udf and add as inner plan in map-foreach
        for (Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps) {
          PhysicalPlan udfPlan = createPlanWithPredecessors(op2plan.first, op2plan.second);
          mfe.addInputPlan(udfPlan, false);
          op2newpos.put(op2plan.first, pos++);
        }
        changeFunc(mfe, POUserFunc.INITIAL);

        // since we will only be creating SingleTupleBag as input to
        // the map foreach, we should flag the POProjects in the map
        // foreach inner plans to also use SingleTupleBag
        for (PhysicalPlan mpl : mfe.getInputPlans()) {
          try {
            new fixMapProjects(mpl).visit();
          } catch (VisitorException e) {
            int errCode = 2089;
            String msg = "Unable to flag project operator to use single tuple bag.";
            throw new PlanException(msg, errCode, PigException.BUG, e);
          }
        }

        // create new combine foreach
        POForEach cfe = createForEachWithGrpProj(foreach, rearrange.getKeyType());
        // add algebraic functions with appropriate projection
        addAlgebraicFuncToCombineFE(cfe, op2newpos);
        changeFunc(cfe, POUserFunc.INTERMEDIATE);

        // fix projection and function time for algebraic functions in reduce foreach
        for (Pair<PhysicalOperator, PhysicalPlan> op2plan : algebraicOps) {
          setProjectInput(op2plan.first, op2plan.second, op2newpos.get(op2plan.first));
          byte resultType = op2plan.first.getResultType();
          ((POUserFunc) op2plan.first).setAlgebraicFunction(POUserFunc.FINAL);
          op2plan.first.setResultType(resultType);
        }

        // we have modified the foreach inner plans - so set them again
        // for the foreach so that foreach can do any re-initialization
        // around them.
        // FIXME - this is a necessary evil right now because the leaves
        // are explicitly stored in the POForeach as a list rather than
        // computed each time at run time from the plans for
        // optimization. Do we want to have the Foreach compute the
        // leaves each time and have Java optimize it (will Java
        // optimize?)?
        mfe.setInputPlans(mfe.getInputPlans());
        cfe.setInputPlans(cfe.getInputPlans());
        foreach.setInputPlans(foreach.getInputPlans());

        // tell POCombinerPackage which fields need projected and which
        // placed in bags. First field is simple project rest need to go
        // into bags
        int numFields = algebraicOps.size() + 1; // algebraic funcs + group key
        boolean[] bags = new boolean[numFields];
        bags[0] = false;
        for (int i = 1; i < numFields; i++) {
          bags[i] = true;
        }

        // Use the POCombiner package in the combine plan
        // as it needs to act differently than the regular
        // package operator.
        CombinerPackager pkgr = new CombinerPackager(pack.getPkgr(), bags);
        POPackage combinePack = pack.clone();
        combinePack.setPkgr(pkgr);
        combinePack.setParentPlan(null);

        combinePlan.add(combinePack);
        combinePlan.add(cfe);
        combinePlan.connect(combinePack, cfe);

        // No need to connect projections in cfe to cp, because
        // PigCombiner directly attaches output from package to
        // root of remaining plan.

        POLocalRearrange mlr = getNewRearrange(rearrange);
        POPartialAgg mapAgg = null;
        if (doMapAgg) {
          mapAgg = createPartialAgg(cfe);
        }

        // A specialized local rearrange operator will replace
        // the normal local rearrange in the map plan. This behaves
        // like the regular local rearrange in the getNext()
        // as far as getting its input and constructing the
        // "key" out of the input. It then returns a tuple with
        // two fields - the key in the first position and the
        // "value" inside a bag in the second position. This output
        // format resembles the format out of a Package. This output
        // will feed to the map foreach which expects this format.
        // If the key field isn't in the project of the combiner or map foreach,
        // it is added to the end (This is required so that we can
        // set up the inner plan of the new Local Rearrange leaf in the map
        // and combine plan to contain just the project of the key).
        patchUpMap(mapPlan, getPreCombinerLR(rearrange), mfe, mapAgg, mlr);
        POLocalRearrange clr = getNewRearrange(rearrange);
        clr.setParentPlan(null);
        combinePlan.add(clr);
        combinePlan.connect(cfe, clr);

        // Change the package operator in the reduce plan to
        // be the POCombiner package, as it needs to act
        // differently than the regular package operator.
        pack.setPkgr(pkgr.clone());
      } catch (Exception e) {
        int errCode = 2018;
        String msg = "Internal error. Unable to introduce the combiner for optimization.";
        throw new OptimizerException(msg, errCode, PigException.BUG, e);
      }
    }
  }
Beispiel #9
0
  /**
   * find algebraic operators and also check if the foreach statement is suitable for combiner use
   *
   * @param feInners inner plans of foreach
   * @return null if plan is not combinable, otherwise list of combinable operators
   * @throws VisitorException
   */
  private static List<Pair<PhysicalOperator, PhysicalPlan>> findAlgebraicOps(
      List<PhysicalPlan> feInners) throws VisitorException {
    List<Pair<PhysicalOperator, PhysicalPlan>> algebraicOps = Lists.newArrayList();

    // check each foreach inner plan
    for (PhysicalPlan pplan : feInners) {
      // check for presence of non combinable operators
      AlgebraicPlanChecker algChecker = new AlgebraicPlanChecker(pplan);
      algChecker.visit();
      if (algChecker.sawNonAlgebraic) {
        return null;
      }

      // if we found a combinable distinct add that to list
      if (algChecker.sawDistinctAgg) {
        algebraicOps.add(new Pair<PhysicalOperator, PhysicalPlan>(algChecker.getDistinct(), pplan));
        continue;
      }

      List<PhysicalOperator> roots = pplan.getRoots();
      // combinable operators have to be attached to POProject root(s)
      // if root does not have a successor that is combinable, the project
      // has to be projecting the group column. Otherwise this MR job is
      // considered not combinable as we don't want to use combiner for
      // cases where this foreach statement is projecting bags (likely to
      // bad for performance because of additional (de)serialization
      // costs)

      for (PhysicalOperator root : roots) {
        if (root instanceof ConstantExpression) {
          continue;
        }
        if (!(root instanceof POProject)) {
          // how can this happen? - expect root of inner plan to be
          // constant or project. not combining it
          return null;
        }
        POProject proj = (POProject) root;
        POUserFunc combineUdf = getAlgebraicSuccessor(proj, pplan);
        if (combineUdf == null) {
          if (proj.isProjectToEnd()) {
            // project-star or project to end
            // not combinable
            return null;
          }
          // Check to see if this is a projection of the grouping column.
          // If so, it will be a projection of col 0
          List<Integer> cols = proj.getColumns();
          if (cols != null && cols.size() == 1 && cols.get(0) == 0) {
            // it is project of grouping column, so the plan is
            // still combinable
            continue;
          } else {
            // not combinable
            return null;
          }
        }

        // The algebraic udf can have more than one input. Add the udf only once
        boolean exist = false;
        for (Pair<PhysicalOperator, PhysicalPlan> pair : algebraicOps) {
          if (pair.first.equals(combineUdf)) {
            exist = true;
            break;
          }
        }
        if (!exist) algebraicOps.add(new Pair<PhysicalOperator, PhysicalPlan>(combineUdf, pplan));
      }
    }

    return algebraicOps;
  }