@Test
public void testDefaultParallelInSkewJoin() throws Throwable {
    // default_parallel is considered only at runtime, so here we only test requested parallel;
    // more thorough tests can be found in TestNumberOfReducers.java
    String query = "a = load 'input';"
            + "b = load 'input';"
            + "c = join a by $0, b by $0 using 'skewed' parallel 100;"
            + "store c into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);

    // Get the skew join job
    Iterator<MapReduceOper> iter = mrPlan.getKeys().values().iterator();
    int counter = 0;
    while (iter.hasNext()) {
        MapReduceOper op = iter.next();
        counter++;
        if (op.isSkewedJoin()) {
            assertEquals(100, op.getRequestedParallelism());
        }
    }
    assertEquals(3, counter);

    pc.defaultParallel = -1;
}
@Test
public void testLimitAdjusterFuncShipped() throws Exception {
    String query = "a = load 'input';"
            + "b = order a by $0 parallel 2;"
            + "c = limit b 7;"
            + "store c into 'output' using " + PigStorageNoDefCtor.class.getName() + "('\t');";
    PhysicalPlan pp = Util.buildPp(pigServerMR, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);
    LimitAdjuster la = new LimitAdjuster(mrPlan, pc);
    la.visit();
    la.adjust();

    // Count the chain of MR jobs from the root; the order-by pipeline plus the
    // extra job added by the LimitAdjuster should give 4 jobs.
    MapReduceOper mrOper = mrPlan.getRoots().get(0);
    int count = 1;
    while (mrPlan.getSuccessors(mrOper) != null) {
        mrOper = mrPlan.getSuccessors(mrOper).get(0);
        ++count;
    }
    assertEquals(4, count);

    // The user's StoreFunc must be shipped with the final job.
    MapReduceOper op = mrPlan.getLeaves().get(0);
    assertTrue(op.UDFs.contains(new FuncSpec(PigStorageNoDefCtor.class.getName()) + "('\t')"));
}
@Test
public void testMergeJoin() throws Exception {
    String query = "a = load '/tmp/input1';"
            + "b = load '/tmp/input2';"
            + "c = join a by $0, b by $0 using 'merge';"
            + "store c into '/tmp/output1';";
    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MRCompiler comp = new MRCompiler(pp, pc);
    comp.compile();
    MROperPlan mrp = comp.getMRPlan();
    assertEquals(2, mrp.size());

    // First job builds the merge-join index over the right-hand input.
    MapReduceOper mrOp0 = mrp.getRoots().get(0);
    assertEquals(2, mrOp0.mapPlan.size());
    PhysicalOperator load0 = mrOp0.mapPlan.getRoots().get(0);
    MergeJoinIndexer func = (MergeJoinIndexer) PigContext
            .instantiateFuncFromSpec(((POLoad) load0).getLFile().getFuncSpec());
    // The indexer's local rearrange is a private field; use reflection to inspect it.
    Field lrField = MergeJoinIndexer.class.getDeclaredField("lr");
    lrField.setAccessible(true);
    POLocalRearrange lr = (POLocalRearrange) lrField.get(func);
    List<PhysicalPlan> innerPlans = lr.getPlans();
    PhysicalOperator localrearrange0 = mrOp0.mapPlan.getSuccessors(load0).get(0);
    assertTrue(localrearrange0 instanceof POLocalRearrange);
    assertEquals(3, mrOp0.reducePlan.size());
    PhysicalOperator pack0 = mrOp0.reducePlan.getRoots().get(0);
    assertTrue(pack0 instanceof POPackage);
    PhysicalOperator foreach0 = mrOp0.reducePlan.getSuccessors(pack0).get(0);
    assertTrue(foreach0 instanceof POForEach);
    PhysicalOperator store0 = mrOp0.reducePlan.getSuccessors(foreach0).get(0);
    assertTrue(store0 instanceof POStore);

    // The index key is a single projection of column 0.
    assertEquals(1, innerPlans.size());
    PhysicalPlan innerPlan = innerPlans.get(0);
    assertEquals(1, innerPlan.size());
    PhysicalOperator project = innerPlan.getRoots().get(0);
    assertTrue(project instanceof POProject);
    assertEquals(0, ((POProject) project).getColumn());

    // Second job performs the map-side merge join; no reduce phase is needed.
    MapReduceOper mrOp1 = mrp.getSuccessors(mrOp0).get(0);
    assertEquals(3, mrOp1.mapPlan.size());
    PhysicalOperator load1 = mrOp1.mapPlan.getRoots().get(0);
    assertTrue(load1 instanceof POLoad);
    PhysicalOperator mergejoin1 = mrOp1.mapPlan.getSuccessors(load1).get(0);
    assertTrue(mergejoin1 instanceof POMergeJoin);
    PhysicalOperator store1 = mrOp1.mapPlan.getSuccessors(mergejoin1).get(0);
    assertTrue(store1 instanceof POStore);
    assertTrue(mrOp1.reducePlan.isEmpty());
}
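// The operator-by-operator walk above is verbose. A small helper like the
// sketch below could assert a linear chain of operator types in one call;
// assertChain is a hypothetical utility for illustration, not part of the
// existing test helpers.
private static void assertChain(PhysicalPlan plan, Class<?>... expectedTypes) {
    PhysicalOperator current = plan.getRoots().get(0);
    for (int i = 0; i < expectedTypes.length; i++) {
        assertTrue("operator " + i + " should be " + expectedTypes[i].getSimpleName(),
                expectedTypes[i].isInstance(current));
        if (i < expectedTypes.length - 1) {
            current = plan.getSuccessors(current).get(0);
        }
    }
}
// Example usage: assertChain(mrOp1.mapPlan, POLoad.class, POMergeJoin.class, POStore.class);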
@Test
public void testMergeJoinWithIndexableLoadFunc() throws Exception {
    String query = "a = load 'input1';"
            + "b = load 'input2' using " + TestMergeJoin.DummyIndexableLoader.class.getName() + ";"
            + "c = join a by $0, b by $0 using 'merge';"
            + "store c into 'output';";
    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MROperPlan mp = Util.buildMRPlan(pp, pc);
    // With an IndexableLoadFunc on the right input, no separate index-building
    // job is needed, so the merge join compiles to a single MR job.
    assertEquals("Checking number of MR Jobs for merge join with IndexableLoadFunc:",
            1, mp.size());
}
/**
 * Test to ensure that an order by without parallel followed by a limit, i.e. top-k,
 * always produces the correct number of map reduce jobs. In the testcase below, since
 * we are running the unit test locally, we will get reduce parallelism of 1, so we
 * will NOT introduce the extra MR job to do a final limit.
 */
@Test
public void testNumReducersInLimit() throws Exception {
    String query = "a = load 'input';"
            + "b = order a by $0;"
            + "c = limit b 10;"
            + "store c into 'output';";
    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);
    MapReduceOper mrOper = mrPlan.getRoots().get(0);
    int count = 1;
    while (mrPlan.getSuccessors(mrOper) != null) {
        mrOper = mrPlan.getSuccessors(mrOper).get(0);
        ++count;
    }
    assertEquals(3, count);
}
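// The successor-walking loop above also appears in testLimitAdjusterFuncShipped
// and testNumReducersInLimitWithParallel. A helper like the sketch below could
// factor it out; countJobChain is a hypothetical name for illustration, not an
// existing utility. It counts the MR jobs along the first-successor chain from
// the plan root.
private static int countJobChain(MROperPlan plan) {
    MapReduceOper oper = plan.getRoots().get(0);
    int count = 1;
    while (plan.getSuccessors(oper) != null) {
        oper = plan.getSuccessors(oper).get(0);
        ++count;
    }
    return count;
}
// Usage: assertEquals(3, countJobChain(mrPlan));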
@Test
public void testUDFInMergedJoin() throws Exception {
    String query = "a = load 'input1';"
            + "b = load 'input2' using " + TestIndexableLoadFunc.class.getName() + "();"
            + "c = join a by $0, b by $0 using 'merge';"
            + "store c into 'output';";
    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);
    MapReduceOper mrOper = mrPlan.getRoots().get(0);
    // The IndexableLoadFunc must be registered as a UDF so it gets shipped with the job.
    assertTrue(mrOper.UDFs.contains(TestIndexableLoadFunc.class.getName()));
}
@Test
public void testCastFuncShipped() throws Exception {
    String query = "a = load 'input1' using " + PigStorageNoDefCtor.class.getName()
            + "('\t') as (a0, a1, a2);"
            + "b = group a by a0;"
            + "c = foreach b generate flatten(a);"
            + "d = order c by a0;"
            + "e = foreach d generate a1+a2;"
            + "store e into 'output';";
    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MROperPlan mp = Util.buildMRPlan(pp, pc);
    // The load function that supplies the casts must be shipped with the last job.
    MapReduceOper op = mp.getLeaves().get(0);
    assertTrue(op.UDFs.contains(new FuncSpec(PigStorageNoDefCtor.class.getName()) + "('\t')"));
}
@Test
public void testUDFInJoin() throws Exception {
    String query = "a = load 'input1' using BinStorage();"
            + "b = load 'input2';"
            + "c = join a by $0, b by $0;"
            + "store c into 'output';";
    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);
    MapReduceOper mrOper = mrPlan.getRoots().get(0);
    // Both load functions should be registered as UDFs on the join job.
    assertEquals(2, mrOper.UDFs.size());
    assertTrue(mrOper.UDFs.contains("BinStorage"));
    assertTrue(mrOper.UDFs.contains("org.apache.pig.builtin.PigStorage"));
}
/**
 * Test to ensure that an order by with parallel followed by a limit, i.e. top-k,
 * always produces the correct number of map reduce jobs.
 */
@Test
public void testNumReducersInLimitWithParallel() throws Exception {
    planTester.buildPlan("a = load 'input';");
    planTester.buildPlan("b = order a by $0 parallel 2;");
    planTester.buildPlan("c = limit b 10;");
    LogicalPlan lp = planTester.buildPlan("store c into '/tmp';");
    PhysicalPlan pp = Util.buildPhysicalPlan(lp, pc);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);
    MapReduceOper mrOper = mrPlan.getRoots().get(0);
    int count = 1;
    while (mrPlan.getSuccessors(mrOper) != null) {
        mrOper = mrPlan.getSuccessors(mrOper).get(0);
        ++count;
    }
    assertEquals(4, count);
}
// PIG-2146
@Test
public void testSchemaInStoreForDistinctLimit() throws Exception {
    // Test that the POStore in the 2nd MR plan (the one that stores the actual
    // output) has a schema.
    String query = "a = load 'input1' as (a : int, b : float, c : int);"
            + "b = distinct a;"
            + "c = limit b 10;"
            + "store c into 'output';";
    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);
    MapReduceOper secondMrOper = mrPlan.getLeaves().get(0);
    POStore store = (POStore) secondMrOper.reducePlan.getLeaves().get(0);
    assertEquals("compare load and store schema",
            Utils.getSchemaFromString("a : int, b : float, c : int"),
            store.getSchema());
}
// See PIG-4538
@Test
public void testFetchOptimizerSideEffect() throws Exception {
    String query = "in1 = LOAD 'data.txt' AS (ident:chararray);"
            + "in2 = LOAD 'data.txt' AS (ident:chararray);"
            + "in3 = LOAD 'data.txt';"
            + "joined = JOIN in1 BY ident LEFT OUTER, in2 BY ident;"
            + "store joined into 'output';";
    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MROperPlan mp = Util.buildMRPlan(pp, pc);
    // isPlanFetchable should not have the side effect of setting parentPlan
    // on the operators.
    FetchOptimizer.isPlanFetchable(pc, pp);
    MapReduceOper op = mp.getLeaves().get(0);
    PhysicalOperator store = op.reducePlan.getLeaves().get(0);
    POForEach foreach = (POForEach) op.reducePlan.getPredecessors(store).get(0);
    PhysicalOperator project = foreach.getInputPlans().get(0).getRoots().get(0);
    // parentPlan is private; use reflection to verify it was left unset.
    Field parentPlan = PhysicalOperator.class.getDeclaredField("parentPlan");
    parentPlan.setAccessible(true);
    assertNull(parentPlan.get(project));
}
// PIG-2146
@Test
public void testStorerLimit() throws Exception {
    // Test that the POStore in the 1st MR plan uses the right StoreFunc.
    String query = "a = load 'input1';"
            + "b = limit a 10;"
            + "store b into 'output' using " + PigStorageNoDefCtor.class.getName() + "(',');";
    PhysicalPlan pp = Util.buildPp(pigServer, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);
    LimitAdjuster la = new LimitAdjuster(mrPlan, pc);
    la.visit();
    la.adjust();
    // The intermediate store inserted by the LimitAdjuster should use
    // InterStorage, not the user's StoreFunc.
    MapReduceOper firstMrOper = mrPlan.getRoots().get(0);
    POStore store = (POStore) firstMrOper.reducePlan.getLeaves().get(0);
    assertEquals("org.apache.pig.impl.io.InterStorage",
            store.getStoreFunc().getClass().getName());
}
@Test
public void testReducerNumEstimationForOrderBy() throws Exception {
    // Skip the test for Tez, which uses a different mechanism. The equivalent
    // test is in TestTezAutoParallelism.
    Assume.assumeTrue("Skip this test for TEZ", Util.isMapredExecType(cluster.getExecType()));
    // Use the estimation.
    pc.getProperties().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getProperties().setProperty("pig.exec.reducers.max", "10");

    String query = "a = load '/passwd';"
            + "b = order a by $0;"
            + "store b into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);
    MROperPlan mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);

    Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);
    JobControl jobControl = jcc.compile(mrPlan, query);

    assertEquals(2, mrPlan.size());

    // First job uses a single reducer for the sampling.
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    // Simulate the first job having run so estimation kicks in.
    MapReduceOper sort = mrPlan.getLeaves().get(0);
    jcc.updateMROpPlan(jobControl.getReadyJobs());
    FileLocalizer.create(sort.getQuantFile(), pc);
    jobControl = jcc.compile(mrPlan, query);

    sort = mrPlan.getLeaves().get(0);
    long reducer = Math.min(
            (long) Math.ceil(new File("test/org/apache/pig/test/data/passwd").length() / 100.0),
            10);
    assertEquals(reducer, sort.getRequestedParallelism());

    // The second job estimates its reducer count.
    Util.assertParallelValues(-1, -1, reducer, reducer,
            jobControl.getWaitingJobs().get(0).getJobConf());

    // Use the PARALLEL keyword; it overrides the estimated reducer number.
    query = "a = load '/passwd';"
            + "b = order a by $0 PARALLEL 2;"
            + "store b into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);

    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);
    assertEquals(2, sort.getRequestedParallelism());

    // The estimation doesn't take effect for non-DFS sources or when the files
    // don't exist, e.g. HBase.
    query = "a = load 'hbase://passwd' using "
            + "org.apache.pig.backend.hadoop.hbase.HBaseStorage('c:f1 c:f2');"
            + "b = order a by $0 ;"
            + "store b into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);

    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);
    // The requested parallelism will be -1 if the user sets none of default_parallel
    // and parallel and the estimation doesn't take effect; the MR framework will
    // finally set it to 1.
    assertEquals(-1, sort.getRequestedParallelism());

    // Test order by with three jobs (after optimization).
    query = "a = load '/passwd';"
            + "b = foreach a generate $0, $1, $2;"
            + "c = order b by $0;"
            + "store c into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(3, mrPlan.size());

    // Simulate the first 2 jobs having run so estimation kicks in.
    sort = mrPlan.getLeaves().get(0);
    FileLocalizer.create(sort.getQuantFile(), pc);
    jobControl = jcc.compile(mrPlan, query);
    Util.copyFromLocalToCluster(cluster, "test/org/apache/pig/test/data/passwd",
            ((POLoad) sort.mapPlan.getRoots().get(0)).getLFile().getFileName());

    // First job is just a foreach with projection, a map-only job, so the
    // estimate gets ignored.
    Util.assertParallelValues(-1, -1, -1, 0, jobControl.getWaitingJobs().get(0).getJobConf());

    jcc.updateMROpPlan(jobControl.getReadyJobs());
    jobControl = jcc.compile(mrPlan, query);
    jcc.updateMROpPlan(jobControl.getReadyJobs());

    // Second job is a sampler, which requests and gets 1 reducer.
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    jobControl = jcc.compile(mrPlan, query);
    sort = mrPlan.getLeaves().get(0);
    assertEquals(reducer, sort.getRequestedParallelism());

    // Third job is the order by, which uses the estimated number of reducers.
    Util.assertParallelValues(-1, -1, reducer, reducer,
            jobControl.getWaitingJobs().get(0).getJobConf());
}
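// The expected reducer count above mirrors Pig's size-based estimation:
// ceil(inputBytes / bytesPerReducer), capped at pig.exec.reducers.max. Below is
// a minimal sketch of that formula; estimateReducers is an illustrative helper,
// not the actual estimator API used by Pig.
private static long estimateReducers(long inputBytes, long bytesPerReducer, long maxReducers) {
    // Round up so any remainder of input bytes still gets a reducer, then cap.
    long estimate = (long) Math.ceil((double) inputBytes / bytesPerReducer);
    return Math.min(estimate, maxReducers);
}
// With this test's settings, the 'reducer' value computed above is equivalent to
// estimateReducers(passwdFileLength, 100, 10).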