Пример #1
0
  /**
   * Verifies that a skewed join declared with {@code parallel 100} carries that value as the
   * requested parallelism of its MR operator, and that the query compiles into three MR
   * operators.
   */
  @Test
  public void testDefaultParallelInSkewJoin() throws Throwable {
    // default_parallel is considered only at runtime, so here we only test requested parallel
    // more thorough tests can be found in TestNumberOfReducers.java
    String query =
        "a = load 'input';"
            + "b = load 'input';"
            + "c = join a by $0, b by $0 using 'skewed' parallel 100;"
            + "store c into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);

    // Walk every MR operator; check the parallelism of each one flagged as a skewed join.
    int counter = 0;
    boolean sawSkewedJoin = false;
    for (MapReduceOper op : mrPlan.getKeys().values()) {
      counter++;
      if (op.isSkewedJoin()) {
        sawSkewedJoin = true;
        // assertEquals reports the actual value on failure, unlike assertTrue(a == b).
        assertEquals(100, op.getRequestedParallelism());
      }
    }
    assertEquals(3, counter);
    // Without this check the test would pass vacuously if no operator were flagged.
    assertTrue("Expected at least one skewed join MR operator in the plan", sawSkewedJoin);

    pc.defaultParallel = -1;
  }
Пример #2
0
  /**
   * Runs {@link LimitAdjuster} over an "order ... parallel 2" + "limit" plan and checks that the
   * resulting chain holds four MR operators and that the leaf operator lists the custom storer
   * among its UDFs.
   */
  @Test
  public void testLimitAdjusterFuncShipped() throws Exception {
    final String storerClass = PigStorageNoDefCtor.class.getName();
    final String script =
        "a = load 'input';"
            + "b = order a by $0 parallel 2;"
            + "c = limit b 7;"
            + "store c into 'output' using "
            + storerClass
            + "('\t');";

    PhysicalPlan physicalPlan = Util.buildPp(pigServerMR, script);
    MROperPlan mrPlan = Util.buildMRPlan(physicalPlan, pc);

    LimitAdjuster adjuster = new LimitAdjuster(mrPlan, pc);
    adjuster.visit();
    adjuster.adjust();

    // Follow the first successor from the root until the chain ends, counting operators.
    int jobCount = 1;
    for (MapReduceOper current = mrPlan.getRoots().get(0);
        mrPlan.getSuccessors(current) != null;
        current = mrPlan.getSuccessors(current).get(0)) {
      ++jobCount;
    }
    assertEquals(4, jobCount);

    // The expected entry is the FuncSpec's string form followed by the constructor argument.
    String expectedUdf = new FuncSpec(storerClass) + "('\t')";
    MapReduceOper leaf = mrPlan.getLeaves().get(0);
    assertTrue(leaf.UDFs.contains(expectedUdf));
  }
Пример #3
0
  /**
   * Compiles a merge join and checks the shape of the resulting two-operator MR plan.
   *
   * <p>The first operator is expected to have a two-operator map plan (load followed by a local
   * rearrange) and a three-operator reduce plan (package, foreach, store). The load's function
   * is instantiated as a {@link MergeJoinIndexer} whose private {@code lr} field is read
   * reflectively to check that it projects column 0. The second operator is expected to be
   * map-only: load, merge join, store.
   */
  @Test
  public void testMergeJoin() throws Exception {
    String query =
        "a = load '/tmp/input1';"
            + "b = load '/tmp/input2';"
            + "c = join a by $0, b by $0 using 'merge';"
            + "store c into '/tmp/output1';";

    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MRCompiler comp = new MRCompiler(pp, pc);
    comp.compile();
    MROperPlan mrp = comp.getMRPlan();
    // Size checks use assertEquals so that a failure reports the actual value.
    assertEquals(2, mrp.size());

    // First MR operator.
    MapReduceOper mrOp0 = mrp.getRoots().get(0);
    assertEquals(2, mrOp0.mapPlan.size());
    PhysicalOperator load0 = mrOp0.mapPlan.getRoots().get(0);
    MergeJoinIndexer func =
        (MergeJoinIndexer)
            PigContext.instantiateFuncFromSpec(((POLoad) load0).getLFile().getFuncSpec());
    // "lr" is not accessible directly, so it is read through reflection.
    Field lrField = MergeJoinIndexer.class.getDeclaredField("lr");
    lrField.setAccessible(true);
    POLocalRearrange lr = (POLocalRearrange) lrField.get(func);
    List<PhysicalPlan> innerPlans = lr.getPlans();
    PhysicalOperator localrearrange0 = mrOp0.mapPlan.getSuccessors(load0).get(0);
    assertTrue(localrearrange0 instanceof POLocalRearrange);
    assertEquals(3, mrOp0.reducePlan.size());
    PhysicalOperator pack0 = mrOp0.reducePlan.getRoots().get(0);
    assertTrue(pack0 instanceof POPackage);
    PhysicalOperator foreach0 = mrOp0.reducePlan.getSuccessors(pack0).get(0);
    assertTrue(foreach0 instanceof POForEach);
    PhysicalOperator store0 = mrOp0.reducePlan.getSuccessors(foreach0).get(0);
    assertTrue(store0 instanceof POStore);

    // The indexer's local rearrange must hold a single plan projecting column 0.
    assertEquals(1, innerPlans.size());
    PhysicalPlan innerPlan = innerPlans.get(0);
    assertEquals(1, innerPlan.size());
    PhysicalOperator project = innerPlan.getRoots().get(0);
    assertTrue(project instanceof POProject);
    assertEquals(0, ((POProject) project).getColumn());

    // Second MR operator: map-only load -> merge join -> store.
    MapReduceOper mrOp1 = mrp.getSuccessors(mrOp0).get(0);
    assertEquals(3, mrOp1.mapPlan.size());
    PhysicalOperator load1 = mrOp1.mapPlan.getRoots().get(0);
    assertTrue(load1 instanceof POLoad);
    PhysicalOperator mergejoin1 = mrOp1.mapPlan.getSuccessors(load1).get(0);
    assertTrue(mergejoin1 instanceof POMergeJoin);
    PhysicalOperator store1 = mrOp1.mapPlan.getSuccessors(mergejoin1).get(0);
    assertTrue(store1 instanceof POStore);
    assertTrue(mrOp1.reducePlan.isEmpty());
  }
Пример #4
0
  /**
   * Checks that a merge join whose right input is loaded with {@code DummyIndexableLoader}
   * compiles into a single MR operator.
   */
  @Test
  public void testMergeJoinWithIndexableLoadFunc() throws Exception {
    final String loaderClass = TestMergeJoin.DummyIndexableLoader.class.getName();
    final String script =
        "a = load 'input1';"
            + "b = load 'input2' using "
            + loaderClass
            + ";"
            + "c = join a by $0, b by $0 using 'merge';"
            + "store c into 'output';";

    PhysicalPlan physicalPlan = Util.buildPp(pigServer, script);
    MROperPlan mrPlan = Util.buildMRPlan(physicalPlan, pc);

    int jobCount = mrPlan.size();
    assertEquals("Checking number of MR Jobs for merge join with IndexableLoadFunc:", 1, jobCount);
  }
Пример #5
0
  /**
   * Top-k check: an order-by with no explicit parallel, followed by a limit, must always compile
   * to the expected number of map reduce jobs. The unit test runs locally, so reduce parallelism
   * is 1 and the extra MR job that would perform a final limit is NOT introduced.
   */
  @Test
  public void testNumReducersInLimit() throws Exception {
    final String script =
        "a = load 'input';" + "b = order a by $0;" + "c = limit b 10;" + "store c into 'output';";

    PhysicalPlan physicalPlan = Util.buildPp(pigServer, script);
    MROperPlan mrPlan = Util.buildMRPlan(physicalPlan, pc);

    // Follow the first successor from the root until the chain ends, counting operators.
    int jobCount = 1;
    for (MapReduceOper current = mrPlan.getRoots().get(0);
        mrPlan.getSuccessors(current) != null;
        current = mrPlan.getSuccessors(current).get(0)) {
      ++jobCount;
    }
    assertEquals(3, jobCount);
  }
Пример #6
0
  /**
   * Checks that the load function used on the right side of a merge join is listed among the
   * UDFs of the plan's root MR operator.
   */
  @Test
  public void testUDFInMergedJoin() throws Exception {
    final String loaderClass = TestIndexableLoadFunc.class.getName();
    final String script =
        "a = load 'input1';"
            + "b = load 'input2' using "
            + loaderClass
            + "();"
            + "c = join a by $0, b by $0 using 'merge';"
            + "store c into 'output';";

    PhysicalPlan physicalPlan = Util.buildPp(pigServer, script);
    MROperPlan plan = Util.buildMRPlan(physicalPlan, pc);

    MapReduceOper root = plan.getRoots().get(0);
    boolean loaderListed = root.UDFs.contains(loaderClass);
    assertTrue(loaderListed);
  }
Пример #7
0
 /**
  * Checks that the loader declared on the first statement is still listed among the UDFs of the
  * leaf MR operator after a group / flatten / order / arithmetic pipeline.
  */
 @Test
 public void testCastFuncShipped() throws Exception {
   final String loaderClass = PigStorageNoDefCtor.class.getName();
   final String script =
       "a = load 'input1' using "
           + loaderClass
           + "('\t') as (a0, a1, a2);"
           + "b = group a by a0;"
           + "c = foreach b generate flatten(a);"
           + "d = order c by a0;"
           + "e = foreach d generate a1+a2;"
           + "store e into 'output';";

   PhysicalPlan physicalPlan = Util.buildPp(pigServer, script);
   MROperPlan mrPlan = Util.buildMRPlan(physicalPlan, pc);

   // The expected entry is the FuncSpec's string form followed by the constructor argument.
   String expectedUdf = new FuncSpec(loaderClass) + "('\t')";
   MapReduceOper leaf = mrPlan.getLeaves().get(0);
   assertTrue(leaf.UDFs.contains(expectedUdf));
 }
Пример #8
0
  /**
   * Checks that the root MR operator of a regular join lists exactly two UDFs: the explicitly
   * named {@code BinStorage} loader and {@code org.apache.pig.builtin.PigStorage}.
   */
  @Test
  public void testUDFInJoin() throws Exception {
    String query =
        "a = load 'input1' using BinStorage();"
            + "b = load 'input2';"
            + "c = join a by $0, b by $0;"
            + "store c into 'output';";

    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);
    MapReduceOper mrOper = mrPlan.getRoots().get(0);

    // The size check used to be asserted twice in a row; once is enough.
    assertEquals(2, mrOper.UDFs.size());
    assertTrue(mrOper.UDFs.contains("BinStorage"));
    assertTrue(mrOper.UDFs.contains("org.apache.pig.builtin.PigStorage"));
  }
Пример #9
0
  /**
   * Test to ensure that the order by with parallel followed by a limit, i.e., top k always produces
   * the correct number of map reduce jobs. With {@code parallel 2} the chain from the root is
   * expected to contain four MR operators.
   */
  @Test
  public void testNumReducersInLimitWithParallel() throws Exception {
    planTester.buildPlan("a = load 'input';");
    planTester.buildPlan("b = order a by $0 parallel 2;");
    planTester.buildPlan("c = limit b 10;");
    LogicalPlan lp = planTester.buildPlan("store c into '/tmp';");

    PhysicalPlan pp = Util.buildPhysicalPlan(lp, pc);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);
    MapReduceOper mrOper = mrPlan.getRoots().get(0);
    int count = 1;

    // Follow the first successor from the root until the chain ends, counting operators.
    while (mrPlan.getSuccessors(mrOper) != null) {
      mrOper = mrPlan.getSuccessors(mrOper).get(0);
      ++count;
    }
    // assertEquals reports the actual count on failure, unlike assertTrue(count == 4).
    assertEquals(4, count);
  }
Пример #10
0
  // PIG-2146
  /**
   * Checks that, for a distinct followed by a limit, the POStore at the leaf of the last MR
   * operator's reduce plan (the one that stores the actual output) carries the schema declared
   * on the load.
   */
  @Test
  public void testSchemaInStoreForDistinctLimit() throws Exception {
    String query =
        "a = load 'input1' as (a : int,b :float ,c : int);"
            + "b  = distinct a;"
            + "c = limit b 10;"
            + "store c into 'output';";

    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);
    MapReduceOper secondMrOper = mrPlan.getLeaves().get(0);
    POStore store = (POStore) secondMrOper.reducePlan.getLeaves().get(0);
    // JUnit's argument order is (message, expected, actual): the parsed schema is the expected
    // value and the store's schema is the value under test.
    assertEquals(
        "compare load and store schema",
        Utils.getSchemaFromString("a : int,b :float ,c : int"),
        store.getSchema());
  }
Пример #11
0
 // See PIG-4538
 /**
  * Checks that calling {@code FetchOptimizer.isPlanFetchable} after the MR plan has been built
  * leaves the private {@code parentPlan} field unset on the project operator that roots the
  * first input plan of the foreach feeding the final store.
  */
 @Test
 public void testFetchOptimizerSideEffect() throws Exception {
   final String script =
       "in1 = LOAD 'data.txt' AS (ident:chararray);"
           + "in2 = LOAD 'data.txt' AS (ident:chararray);"
           + "in3 = LOAD 'data.txt';"
           + "joined = JOIN in1 BY ident LEFT OUTER, in2 BY ident;"
           + "store joined into 'output';";
   PhysicalPlan physicalPlan = Util.buildPp(pigServer, script);
   MROperPlan mrPlan = Util.buildMRPlan(physicalPlan, pc);

   // isPlanFetchable should not bring side effect:
   //   set parentPlan for operators
   FetchOptimizer.isPlanFetchable(pc, physicalPlan);

   // Navigate: leaf MR operator -> reduce-plan store -> preceding foreach -> first inner plan.
   MapReduceOper leaf = mrPlan.getLeaves().get(0);
   PhysicalOperator storeOp = leaf.reducePlan.getLeaves().get(0);
   POForEach forEachOp = (POForEach) leaf.reducePlan.getPredecessors(storeOp).get(0);
   PhysicalPlan firstInputPlan = forEachOp.getInputPlans().get(0);
   PhysicalOperator projectOp = firstInputPlan.getRoots().get(0);

   // parentPlan is not accessible directly, so it is read through reflection.
   Field parentPlanField = PhysicalOperator.class.getDeclaredField("parentPlan");
   parentPlanField.setAccessible(true);
   Object parentPlanValue = parentPlanField.get(projectOp);
   assertTrue(parentPlanValue == null);
 }
Пример #12
0
  // PIG-2146
  /**
   * Checks that, after {@link LimitAdjuster} has run on a load / limit / store plan, the POStore
   * at the leaf of the first MR operator's reduce plan uses {@code InterStorage} rather than the
   * storer named in the query.
   */
  @Test
  public void testStorerLimit() throws Exception {
    String query =
        "a = load 'input1';"
            + "b = limit a 10;"
            + "store b into 'output' using "
            + PigStorageNoDefCtor.class.getName()
            + "(',');";

    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);

    LimitAdjuster la = new LimitAdjuster(mrPlan, pc);
    la.visit();
    la.adjust();

    MapReduceOper firstMrOper = mrPlan.getRoots().get(0);
    POStore store = (POStore) firstMrOper.reducePlan.getLeaves().get(0);
    // JUnit's argument order is (expected, actual); the literal class name is the expectation.
    assertEquals("org.apache.pig.impl.io.InterStorage", store.getStoreFunc().getClass().getName());
  }
Пример #13
0
  /**
   * Exercises reducer-count estimation for ORDER BY across four scenarios, compiling the MR plan
   * into jobs with {@link JobControlCompiler} and inspecting the resulting job configurations:
   *
   * <ol>
   *   <li>plain order-by on a file input, where the estimate is expected to be used;
   *   <li>order-by with an explicit {@code PARALLEL 2}, which is expected to win;
   *   <li>order-by on an {@code hbase://} input, where no estimate is expected (-1);
   *   <li>order-by preceded by a projection, which yields three MR operators.
   * </ol>
   *
   * <p>The statement order matters: each {@code compile} / {@code updateMROpPlan} pair advances
   * the plan to the next job. Note that the two properties set on {@code pc} at the top are not
   * restored by this method.
   */
  @Test
  public void testReducerNumEstimationForOrderBy() throws Exception {
    // Skip the test for Tez. Tez use a different mechanism.
    // Equivalent test is in TestTezAutoParallelism
    Assume.assumeTrue("Skip this test for TEZ", Util.isMapredExecType(cluster.getExecType()));
    // use the estimation
    pc.getProperties().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getProperties().setProperty("pig.exec.reducers.max", "10");

    String query = "a = load '/passwd';" + "b = order a by $0;" + "store b into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);

    MROperPlan mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);
    JobControl jobControl = jcc.compile(mrPlan, query);

    assertEquals(2, mrPlan.size());

    // first job uses a single reducer for the sampling
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    // Simulate the first job having run so estimation kicks in.
    MapReduceOper sort = mrPlan.getLeaves().get(0);
    jcc.updateMROpPlan(jobControl.getReadyJobs());
    FileLocalizer.create(sort.getQuantFile(), pc);
    jobControl = jcc.compile(mrPlan, query);

    sort = mrPlan.getLeaves().get(0);
    // Expected estimate: local passwd file size divided by 100 (rounded up), capped at 10.
    // NOTE(review): 100 and 10 presumably mirror the two properties set above - confirm.
    long reducer =
        Math.min(
            (long) Math.ceil(new File("test/org/apache/pig/test/data/passwd").length() / 100.0),
            10);
    assertEquals(reducer, sort.getRequestedParallelism());

    // the second job estimates reducers
    Util.assertParallelValues(
        -1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());

    // use the PARALLEL keyword, it will override the estimated reducer number
    query = "a = load '/passwd';" + "b = order a by $0 PARALLEL 2;" + "store b into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);

    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);
    assertEquals(2, sort.getRequestedParallelism());

    // the estimation won't take effect when it applies to a non-dfs input or when the file
    // doesn't exist, such as hbase
    query =
        "a = load 'hbase://passwd' using org.apache.pig.backend.hadoop.hbase.HBaseStorage('c:f1 c:f2');"
            + "b = order a by $0 ;"
            + "store b into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);

    // the requested parallel will be -1 if users don't set any of default_parallel, parallel
    // and the estimation doesn't take effect. MR framework will finally set it to 1.
    assertEquals(-1, sort.getRequestedParallelism());

    // test order by with three jobs (after optimization)
    query =
        "a = load '/passwd';"
            + "b = foreach a generate $0, $1, $2;"
            + "c = order b by $0;"
            + "store c into 'output';";
    pp = Util.buildPp(ps, query);

    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(3, mrPlan.size());

    // Simulate the first 2 jobs having run so estimation kicks in.
    sort = mrPlan.getLeaves().get(0);
    FileLocalizer.create(sort.getQuantFile(), pc);

    jobControl = jcc.compile(mrPlan, query);
    // Copy the local passwd data to the file name that the sort job's first map-plan load reads.
    Util.copyFromLocalToCluster(
        cluster,
        "test/org/apache/pig/test/data/passwd",
        ((POLoad) sort.mapPlan.getRoots().get(0)).getLFile().getFileName());

    // First job is just foreach with projection, mapper-only job, so estimate gets ignored
    Util.assertParallelValues(-1, -1, -1, 0, jobControl.getWaitingJobs().get(0).getJobConf());

    jcc.updateMROpPlan(jobControl.getReadyJobs());
    jobControl = jcc.compile(mrPlan, query);
    jcc.updateMROpPlan(jobControl.getReadyJobs());

    // Second job is a sampler, which requests and gets 1 reducer
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    jobControl = jcc.compile(mrPlan, query);
    sort = mrPlan.getLeaves().get(0);
    assertEquals(reducer, sort.getRequestedParallelism());

    // Third job is the order, which uses the estimated number of reducers
    Util.assertParallelValues(
        -1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());
  }