@Test public void testBzipStoreInMultiQuery2() throws Exception { String[] inputData = new String[] {"1\t2\r3\t4"}; String inputFileName = "input2.txt"; Util.createInputFile(cluster, inputFileName, inputData); PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); PigContext pigContext = pig.getPigContext(); pigContext.getProperties().setProperty("output.compression.enabled", "true"); pigContext .getProperties() .setProperty("output.compression.codec", "org.apache.hadoop.io.compress.BZip2Codec"); pig.setBatchOn(); pig.registerQuery("a = load '" + inputFileName + "';"); pig.registerQuery("store a into 'output2.bz2';"); pig.registerQuery("store a into 'output2';"); pig.executeBatch(); FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties())); FileStatus stat = fs.getFileStatus(new Path("output2/part-m-00000.bz2")); assertTrue(stat.getLen() > 0); stat = fs.getFileStatus(new Path("output2.bz2/part-m-00000.bz2")); assertTrue(stat.getLen() > 0); }
private TezJob getJob(TezPlanContainerNode tezPlanNode, TezPlanContainer planContainer)
        throws JobCreationException {
    try {
        Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
        localResources.putAll(planContainer.getLocalResources());
        TezOperPlan tezPlan = tezPlanNode.getTezOperPlan();
        localResources.putAll(tezPlan.getExtraResources());
        String shipFiles = pigContext.getProperties().getProperty("pig.streaming.ship.files");
        if (shipFiles != null) {
            for (String file : shipFiles.split(",")) {
                TezResourceManager.getInstance().addTezResource(new File(file.trim()).toURI());
            }
        }
        String cacheFiles = pigContext.getProperties().getProperty("pig.streaming.cache.files");
        if (cacheFiles != null) {
            addCacheResources(cacheFiles.split(","));
        }
        for (Map.Entry<String, LocalResource> entry : localResources.entrySet()) {
            log.info("Local resource: " + entry.getKey());
        }
        DAG tezDag = buildDAG(tezPlanNode, localResources);
        tezDag.setDAGInfo(createDagInfo(TezScriptState.get().getScript()));
        // set Tez caller context
        // Reflection for the following code since it is only available since tez 0.8.1:
        // CallerContext context = CallerContext.create(ATSService.CallerContext,
        //     ATSService.getPigAuditId(pigContext),
        //     ATSService.EntityType, "");
        // tezDag.setCallerContext(context);
        Class callerContextClass = null;
        try {
            callerContextClass = Class.forName("org.apache.tez.client.CallerContext");
        } catch (ClassNotFoundException e) {
            // If pre-Tez 0.8.1, skip setting CallerContext
        }
        if (callerContextClass != null) {
            Method builderBuildMethod = callerContextClass.getMethod(
                    "create", String.class, String.class, String.class, String.class);
            Object context = builderBuildMethod.invoke(
                    null,
                    PigATSClient.CALLER_CONTEXT,
                    PigATSClient.getPigAuditId(pigContext),
                    PigATSClient.ENTITY_TYPE,
                    "");
            Method dagSetCallerContext =
                    tezDag.getClass().getMethod("setCallerContext", context.getClass());
            dagSetCallerContext.invoke(tezDag, context);
        }
        log.info("Total estimated parallelism is " + tezPlan.getEstimatedTotalParallelism());
        return new TezJob(tezConf, tezDag, localResources, tezPlan);
    } catch (Exception e) {
        int errCode = 2017;
        String msg = "Internal error creating job configuration.";
        throw new JobCreationException(msg, errCode, PigException.BUG, e);
    }
}
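The method above only touches Tez's CallerContext through reflection because that API first appeared in Tez 0.8.1. Below is a minimal, self-contained sketch of the same optional-API pattern; the looked-up class name is taken from the code above, while the literal arguments and the class name OptionalApiSketch are illustrative only.

import java.lang.reflect.Method;

// Hypothetical demo class; reports whether the optional Tez API could be used.
public class OptionalApiSketch {
    public static void main(String[] args) throws Exception {
        Class<?> callerContextClass = null;
        try {
            // Resolve the class by name so compilation never depends on it.
            callerContextClass = Class.forName("org.apache.tez.client.CallerContext");
        } catch (ClassNotFoundException e) {
            // Older Tez (or no Tez) on the classpath: silently skip the optional call.
        }
        if (callerContextClass == null) {
            System.out.println("CallerContext not available; skipping.");
            return;
        }
        // Same 4-arg factory the Pig code invokes reflectively above.
        Method create = callerContextClass.getMethod(
                "create", String.class, String.class, String.class, String.class);
        Object context = create.invoke(null, "PIG", "example-audit-id", "PIG_SCRIPT_ID", "");
        System.out.println("Created caller context: " + context);
    }
}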
@Test public void testDefaultParallel() throws Throwable { pc.defaultParallel = 100; String query = "a = load '/passwd';" + "b = group a by $0;" + "store b into 'output';"; PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties()); PhysicalPlan pp = Util.buildPp(ps, query); checkDefaultParallelResult(pp, pc); pc.defaultParallel = -1; }
@Test
public void testParserWithEscapeCharacters() throws Exception {
    // All the needed variables
    PigContext pigContext = new PigContext(ExecType.LOCAL, new Properties());
    PigServer pigServer = new PigServer(pigContext);
    pigContext.connect();
    String tempFile = this.prepareTempFile();
    String query = String.format("A = LOAD '%s' ;", Util.encodeEscape(tempFile));

    // Start the real parsing job
    {
        // Initial statement
        Util.buildLp(pigServer, query);
    }
    {
        // Normal condition
        String q = query + "B = filter A by $0 eq 'This is a test string' ;";
        checkParsedConstContent(pigServer, pigContext, q, "This is a test string");
    }
    {
        // single-quote condition
        String q = query + "B = filter A by $0 eq 'This is a test \\'string' ;";
        checkParsedConstContent(pigServer, pigContext, q, "This is a test 'string");
    }
    {
        // escaping dot
        // the reason we have 4 backslashes below is we really want to put two backslashes but
        // since this is to be represented in a Java String, we escape each backslash with one more
        // backslash - hence 4. In a pig script in a file, this would be
        // \\.string
        String q = query + "B = filter A by $0 eq 'This is a test \\\\.string' ;";
        checkParsedConstContent(pigServer, pigContext, q, "This is a test \\.string");
    }
    {
        // newline condition
        String q = query + "B = filter A by $0 eq 'This is a test \\nstring' ;";
        checkParsedConstContent(pigServer, pigContext, q, "This is a test \nstring");
    }
    {
        // Unicode
        String q = query + "B = filter A by $0 eq 'This is a test \\uD30C\\uC774string' ;";
        checkParsedConstContent(pigServer, pigContext, q, "This is a test \uD30C\uC774string");
    }
}
private void setDefaultTimeZone() { String dtzStr = pc.getProperties().getProperty("pig.datetime.default.tz"); if (dtzStr != null && dtzStr.length() > 0) { currentDTZ = DateTimeZone.getDefault(); DateTimeZone.setDefault(DateTimeZone.forID(dtzStr)); } }
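setDefaultTimeZone() stashes the JVM-wide Joda-Time default before overriding it from pig.datetime.default.tz, so that a matching restore call can put it back. Below is a minimal sketch of that save/set/restore pattern, assuming Joda-Time on the classpath; the zone id and the class name are illustrative.

import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;

// Hypothetical demo of saving, overriding, and restoring the process-wide default zone.
public class DefaultTimeZoneSketch {
    public static void main(String[] args) {
        DateTimeZone saved = DateTimeZone.getDefault();              // what currentDTZ captures above
        try {
            // e.g. pig.datetime.default.tz=+08:00
            DateTimeZone.setDefault(DateTimeZone.forID("+08:00"));
            System.out.println(new DateTime());                      // rendered in the configured zone
        } finally {
            DateTimeZone.setDefault(saved);                          // what restoreDefaultTimeZone() is assumed to do
        }
    }
}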
@BeforeClass public static void setUpBeforeClass() throws Exception { cluster = MiniCluster.buildCluster(); pc = new PigContext(ExecType.LOCAL, new Properties()); pcMR = new PigContext(ExecType.MAPREDUCE, cluster.getProperties()); pc.connect(); }
private void checkParsedConstContent(
        PigServer pigServer, PigContext pigContext, String query, String expectedContent)
        throws Exception {
    pigContext.connect();
    LogicalPlan lp = Util.buildLp(pigServer, query + "store B into 'output';");
    // Digging down the tree
    Operator load = lp.getSources().get(0);
    Operator filter = lp.getSuccessors(load).get(0);
    LogicalExpressionPlan comparisonPlan = ((LOFilter) filter).getFilterPlan();
    List<Operator> comparisonPlanRoots = comparisonPlan.getSinks();
    Operator compRootOne = comparisonPlanRoots.get(0);
    Operator compRootTwo = comparisonPlanRoots.get(1);
    // Here is the actual check logic
    if (compRootOne instanceof ConstantExpression) {
        assertTrue(
                "Must be equal",
                ((String) ((ConstantExpression) compRootOne).getValue()).equals(expectedContent));
    }
    // If not left, it must be right.
    else {
        assertTrue(
                "Must be equal",
                ((String) ((ConstantExpression) compRootTwo).getValue()).equals(expectedContent));
    }
}
@Test
public void testDefaultParallelInSkewJoin() throws Throwable {
    // default_parallel is considered only at runtime, so here we only test requested parallel
    // more thorough tests can be found in TestNumberOfReducers.java
    String query =
        "a = load 'input';"
            + "b = load 'input';"
            + "c = join a by $0, b by $0 using 'skewed' parallel 100;"
            + "store c into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);

    // Get the skew join job
    Iterator<MapReduceOper> iter = mrPlan.getKeys().values().iterator();
    int counter = 0;
    while (iter.hasNext()) {
        MapReduceOper op = iter.next();
        counter++;
        if (op.isSkewedJoin()) {
            assertTrue(op.getRequestedParallelism() == 100);
        }
    }
    assertEquals(3, counter);

    pc.defaultParallel = -1;
}
@Override public StoreFuncInterface createStoreFunc(POStore store) throws IOException { Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties()); StoreFuncInterface storeFunc = store.getStoreFunc(); JobContext jc = HadoopShims.createJobContext(conf, new JobID()); OutputFormat<?, ?> outputFormat = storeFunc.getOutputFormat(); PigOutputFormat.setLocation(jc, store); context = HadoopShims.createTaskAttemptContext(conf, HadoopShims.getNewTaskAttemptID()); PigOutputFormat.setLocation(context, store); try { outputFormat.checkOutputSpecs(jc); } catch (InterruptedException e) { throw new IOException(e); } try { outputCommitter = outputFormat.getOutputCommitter(context); outputCommitter.setupJob(jc); outputCommitter.setupTask(context); writer = outputFormat.getRecordWriter(context); } catch (InterruptedException e) { throw new IOException(e); } storeFunc.prepareToWrite(writer); return storeFunc; }
public static LOLoad newLOLoad( FileSpec loader, LogicalSchema schema, LogicalPlan plan, Configuration conf) { LoadFunc loadFunc = null; if (loader != null) { loadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(loader.getFuncSpec()); } return new LOLoad(loader, schema, plan, conf, loadFunc, "alias_newOperatorKey"); }
{
    try {
        pigContext.connect();
    } catch (ExecException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
/** Configures the Reduce plan, the POPackage operator and the reporter thread */
@SuppressWarnings("unchecked")
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    inIllustrator = inIllustrator(context);
    if (inIllustrator) pack = getPack(context);
    Configuration jConf = context.getConfiguration();
    SpillableMemoryManager.configure(ConfigurationUtil.toProperties(jConf));
    context
        .getConfiguration()
        .set(
            PigConstants.TASK_INDEX,
            Integer.toString(context.getTaskAttemptID().getTaskID().getId()));
    sJobContext = context;
    sJobConfInternal.set(context.getConfiguration());
    sJobConf = context.getConfiguration();
    try {
        PigContext.setPackageImportList(
            (ArrayList<String>) ObjectSerializer.deserialize(jConf.get("udf.import.list")));
        pigContext = (PigContext) ObjectSerializer.deserialize(jConf.get("pig.pigContext"));

        // This attempts to fetch all of the generated code from the distributed cache, and resolve
        // it
        SchemaTupleBackend.initialize(jConf, pigContext);

        if (rp == null)
            rp = (PhysicalPlan) ObjectSerializer.deserialize(jConf.get("pig.reducePlan"));
        stores = PlanHelper.getPhysicalOperators(rp, POStore.class);

        if (!inIllustrator)
            pack = (POPackage) ObjectSerializer.deserialize(jConf.get("pig.reduce.package"));
        // To be removed
        if (rp.isEmpty()) log.debug("Reduce Plan empty!");
        else {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            rp.explain(baos);
            log.debug(baos.toString());
        }
        pigReporter = new ProgressableReporter();
        if (!(rp.isEmpty())) {
            roots = rp.getRoots().toArray(new PhysicalOperator[1]);
            leaf = rp.getLeaves().get(0);
        }

        // Get the UDF specific context
        MapRedUtil.setupUDFContext(jConf);
    } catch (IOException ioe) {
        String msg = "Problem while configuring reduce plan.";
        throw new RuntimeException(msg, ioe);
    }

    log.info(
        "Aliases being processed per job phase (AliasName[line,offset]): "
            + jConf.get("pig.alias.location"));

    Utils.setDefaultTimeZone(PigMapReduce.sJobConfInternal.get());
}
static {
    pc = new PigContext();
    try {
        pc.connect();
    } catch (ExecException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    r = new Random(SEED);
}
public static void oneTimeSetUp() throws Exception {
    cluster = MiniGenericCluster.buildCluster();
    pc = new PigContext(cluster.getExecType(), cluster.getProperties());
    try {
        pc.connect();
    } catch (ExecException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    GenPhyOp.setPc(pc);
    Util.copyFromLocalToCluster(cluster, "test/org/apache/pig/test/data/passwd", "/passwd");
}
@Test public void testMergeJoin() throws Exception { String query = "a = load '/tmp/input1';" + "b = load '/tmp/input2';" + "c = join a by $0, b by $0 using 'merge';" + "store c into '/tmp/output1';"; PhysicalPlan pp = Util.buildPp(pigServer, query); MRCompiler comp = new MRCompiler(pp, pc); comp.compile(); MROperPlan mrp = comp.getMRPlan(); assertTrue(mrp.size() == 2); MapReduceOper mrOp0 = mrp.getRoots().get(0); assertTrue(mrOp0.mapPlan.size() == 2); PhysicalOperator load0 = mrOp0.mapPlan.getRoots().get(0); MergeJoinIndexer func = (MergeJoinIndexer) PigContext.instantiateFuncFromSpec(((POLoad) load0).getLFile().getFuncSpec()); Field lrField = MergeJoinIndexer.class.getDeclaredField("lr"); lrField.setAccessible(true); POLocalRearrange lr = (POLocalRearrange) lrField.get(func); List<PhysicalPlan> innerPlans = lr.getPlans(); PhysicalOperator localrearrange0 = mrOp0.mapPlan.getSuccessors(load0).get(0); assertTrue(localrearrange0 instanceof POLocalRearrange); assertTrue(mrOp0.reducePlan.size() == 3); PhysicalOperator pack0 = mrOp0.reducePlan.getRoots().get(0); assertTrue(pack0 instanceof POPackage); PhysicalOperator foreach0 = mrOp0.reducePlan.getSuccessors(pack0).get(0); assertTrue(foreach0 instanceof POForEach); PhysicalOperator store0 = mrOp0.reducePlan.getSuccessors(foreach0).get(0); assertTrue(store0 instanceof POStore); assertTrue(innerPlans.size() == 1); PhysicalPlan innerPlan = innerPlans.get(0); assertTrue(innerPlan.size() == 1); PhysicalOperator project = innerPlan.getRoots().get(0); assertTrue(project instanceof POProject); assertTrue(((POProject) project).getColumn() == 0); MapReduceOper mrOp1 = mrp.getSuccessors(mrOp0).get(0); assertTrue(mrOp1.mapPlan.size() == 3); PhysicalOperator load1 = mrOp1.mapPlan.getRoots().get(0); assertTrue(load1 instanceof POLoad); PhysicalOperator mergejoin1 = mrOp1.mapPlan.getSuccessors(load1).get(0); assertTrue(mergejoin1 instanceof POMergeJoin); PhysicalOperator store1 = mrOp1.mapPlan.getSuccessors(mergejoin1).get(0); assertTrue(store1 instanceof POStore); assertTrue(mrOp1.reducePlan.isEmpty()); }
/**
 * The reduce function which packages the key and List<Tuple> into key, Bag<Tuple>
 * after converting Hadoop type key into Pig type. The package result is either collected as is,
 * if the reduce plan is empty, or after passing through the reduce plan.
 */
@Override
protected void reduce(PigNullableWritable key, Iterable<NullableTuple> tupIter, Context context)
        throws IOException, InterruptedException {
    if (!initialized) {
        initialized = true;

        // cache the collector for use in runPipeline()
        // which could additionally be called from close()
        this.outputCollector = context;
        pigReporter.setRep(context);
        PhysicalOperator.setReporter(pigReporter);

        boolean aggregateWarning =
            "true".equalsIgnoreCase(pigContext.getProperties().getProperty("aggregate.warning"));
        PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance();
        pigStatusReporter.setContext(new MRTaskContext(context));
        PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
        pigHadoopLogger.setReporter(pigStatusReporter);
        pigHadoopLogger.setAggregate(aggregateWarning);
        PhysicalOperator.setPigLogger(pigHadoopLogger);

        if (!inIllustrator)
            for (POStore store : stores) {
                MapReducePOStoreImpl impl = new MapReducePOStoreImpl(context);
                store.setStoreImpl(impl);
                store.setUp();
            }
    }

    // In the case we optimize the join, we combine
    // POPackage and POForeach - so we could get many
    // tuples out of the getnext() call of POJoinPackage.
    // In this case, we process till we see EOP from
    // POJoinPackage.getNext()
    if (pack.getPkgr() instanceof JoinPackager) {
        pack.attachInput(key, tupIter.iterator());
        while (true) {
            if (processOnePackageOutput(context)) break;
        }
    } else {
        // join is not optimized, so package will
        // give only one tuple out for the key
        pack.attachInput(key, tupIter.iterator());
        processOnePackageOutput(context);
    }
}
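For orientation, this is the plain Hadoop reducer shape that the Pig reducer above specializes: one call per key with an Iterable of values. Pig swaps the word-count style aggregation below for POPackage (which turns the key and value iterator into a key/bag pair) and the deserialized reduce plan; the class here is a generic sketch, not Pig code.

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Generic sketch of the reduce(key, Iterable, Context) contract the Pig reducer builds on.
public class PlainReducerSketch extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get(); // Pig instead attaches (key, iterator) to POPackage and runs the reduce plan
        }
        context.write(key, new IntWritable(sum));
    }
}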
@Override
public InputFormat getInputFormat() throws IOException {
    // if not manually set in options string
    if (inputFormatClassName == null) {
        if (loadLocation.endsWith(".bz2") || loadLocation.endsWith(".bz")) {
            inputFormatClassName = Bzip2TextInputFormat.class.getName();
        } else {
            inputFormatClassName = TextInputFormat.class.getName();
        }
    }
    try {
        return (FileInputFormat) PigContext.resolveClassName(inputFormatClassName).newInstance();
    } catch (InstantiationException e) {
        throw new IOException("Failed creating input format " + inputFormatClassName, e);
    } catch (IllegalAccessException e) {
        throw new IOException("Failed creating input format " + inputFormatClassName, e);
    }
}
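The decision above is purely suffix-driven: an explicit option wins, otherwise .bz/.bz2 paths get the bzip2-aware input format and everything else falls back to the text input format. Below is a small standalone sketch of just that choice; the helper class and the simple-name strings are illustrative stand-ins for whatever classes the loader actually imports.

// Hypothetical helper isolating the suffix-based choice; not part of the loader above.
public class InputFormatChooser {
    static String chooseInputFormat(String loadLocation, String explicitClassName) {
        if (explicitClassName != null) {
            return explicitClassName; // an options-string override always wins
        }
        if (loadLocation.endsWith(".bz2") || loadLocation.endsWith(".bz")) {
            return "Bzip2TextInputFormat"; // stand-in for Bzip2TextInputFormat.class.getName()
        }
        return "TextInputFormat"; // stand-in for TextInputFormat.class.getName()
    }

    public static void main(String[] args) {
        System.out.println(chooseInputFormat("/data/logs.bz2", null)); // Bzip2TextInputFormat
        System.out.println(chooseInputFormat("/data/logs.txt", null)); // TextInputFormat
        System.out.println(chooseInputFormat("/data/logs.txt", "MyCustomInputFormat"));
    }
}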
/** * This must be called when the code has been generated and the generated code needs to be shipped * to the cluster, so that it may be used by the mappers and reducers. * * @param pigContext * @param conf */ public static void copyAllGeneratedToDistributedCache(PigContext pigContext, Configuration conf) { if (stf == null) { LOG.debug("Nothing registered to generate."); return; } SchemaTupleFrontendGenHelper stfgh = new SchemaTupleFrontendGenHelper(pigContext, conf); stfgh.generateAll(stf.getSchemasToGenerate()); stfgh.internalCopyAllGeneratedToDistributedCache(); Properties prop = pigContext.getProperties(); String value = conf.get(GENERATED_CLASSES_KEY); if (value != null) { prop.setProperty(GENERATED_CLASSES_KEY, value); } else { prop.remove(GENERATED_CLASSES_KEY); } value = conf.get(LOCAL_CODE_DIR); if (value != null) { prop.setProperty(LOCAL_CODE_DIR, value); } else { prop.remove(LOCAL_CODE_DIR); } }
@Override
protected void execute(LogicalExpression op) throws FrontendException {
    if (op instanceof UserFuncExpression) {
        UserFuncExpression udf = (UserFuncExpression) op;
        if (!udf.getEvalFunc().allowCompileTimeCalculation()) {
            return;
        }
    }
    boolean valSet = false;
    Object val = null;
    if (currentWalker.getPlan().getSuccessors(op) != null) {
        // If has successors and all successors are constant, calculate the constant
        for (Operator succ : currentWalker.getPlan().getSuccessors(op)) {
            if (!(succ instanceof ConstantExpression)) {
                return;
            }
        }
        // All successors are constant, calculate the value
        OperatorPlan expLogicalPlan = new LogicalExpressionPlan();
        ((BaseOperatorPlan) currentWalker.getPlan())
            .moveTree(op, (BaseOperatorPlan) expLogicalPlan);
        PhysicalPlan expPhysicalPlan = new PhysicalPlan();
        Map<Operator, PhysicalOperator> logToPhyMap = new HashMap<Operator, PhysicalOperator>();
        PlanWalker childWalker = new ReverseDependencyOrderWalkerWOSeenChk(expLogicalPlan);

        // Save the old walker and use childWalker as current Walker
        pushWalker(childWalker);
        ExpToPhyTranslationVisitor expTranslationVisitor =
            new ExpToPhyTranslationVisitor(
                expLogicalPlan, childWalker, currentOp, expPhysicalPlan, logToPhyMap);
        expTranslationVisitor.visit();
        popWalker();
        PhysicalOperator root = expPhysicalPlan.getLeaves().get(0);
        try {
            UDFContext.getUDFContext()
                .addJobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), true));
            PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
            PhysicalOperator.setPigLogger(pigHadoopLogger);
            setDefaultTimeZone();
            val = root.getNext(root.getResultType()).result;
            restoreDefaultTimeZone();
            UDFContext.getUDFContext().addJobConf(null);
        } catch (ExecException e) {
            throw new FrontendException(e);
        }
        valSet = true;
    } else if (op instanceof UserFuncExpression) {
        // If solo UDF, calculate UDF
        UserFuncExpression udf = (UserFuncExpression) op;
        try {
            UDFContext.getUDFContext()
                .addJobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), true));
            setDefaultTimeZone();
            val = udf.getEvalFunc().exec(null);
            restoreDefaultTimeZone();
            UDFContext.getUDFContext().addJobConf(null);
        } catch (IOException e) {
            throw new FrontendException(e);
        }
        valSet = true;
    }
    if (valSet) {
        ConstantExpression constantExpr;
        constantExpr = new ConstantExpression(currentWalker.getPlan(), val);
        constantExpr.inheritSchema(op);
        currentWalker.getPlan().replace(op, constantExpr);
    }
}
@Test
public void testReducerNumEstimationForOrderBy() throws Exception {
    // Skip the test for Tez. Tez uses a different mechanism.
    // Equivalent test is in TestTezAutoParallelism
    Assume.assumeTrue("Skip this test for TEZ", Util.isMapredExecType(cluster.getExecType()));
    // use the estimation
    pc.getProperties().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getProperties().setProperty("pig.exec.reducers.max", "10");

    String query = "a = load '/passwd';" + "b = order a by $0;" + "store b into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);
    MROperPlan mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);

    Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);
    JobControl jobControl = jcc.compile(mrPlan, query);

    assertEquals(2, mrPlan.size());

    // first job uses a single reducer for the sampling
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    // Simulate the first job having run so estimation kicks in.
    MapReduceOper sort = mrPlan.getLeaves().get(0);
    jcc.updateMROpPlan(jobControl.getReadyJobs());
    FileLocalizer.create(sort.getQuantFile(), pc);
    jobControl = jcc.compile(mrPlan, query);

    sort = mrPlan.getLeaves().get(0);
    long reducer =
        Math.min(
            (long) Math.ceil(new File("test/org/apache/pig/test/data/passwd").length() / 100.0),
            10);
    assertEquals(reducer, sort.getRequestedParallelism());

    // the second job estimates reducers
    Util.assertParallelValues(
        -1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());

    // use the PARALLEL keyword, it will override the estimated reducer number
    query = "a = load '/passwd';" + "b = order a by $0 PARALLEL 2;" + "store b into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);

    assertEquals(2, mrPlan.size());
    sort = mrPlan.getLeaves().get(0);
    assertEquals(2, sort.getRequestedParallelism());

    // the estimation won't take effect when it applies to non-dfs input or the file doesn't
    // exist, such as hbase
    query =
        "a = load 'hbase://passwd' using org.apache.pig.backend.hadoop.hbase.HBaseStorage('c:f1 c:f2');"
            + "b = order a by $0 ;"
            + "store b into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);
    // the requested parallel will be -1 if users don't set any of default_parallel, parallel
    // and the estimation doesn't take effect. MR framework will finally set it to 1.
    assertEquals(-1, sort.getRequestedParallelism());

    // test order by with three jobs (after optimization)
    query =
        "a = load '/passwd';"
            + "b = foreach a generate $0, $1, $2;"
            + "c = order b by $0;"
            + "store c into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(3, mrPlan.size());

    // Simulate the first 2 jobs having run so estimation kicks in.
    sort = mrPlan.getLeaves().get(0);
    FileLocalizer.create(sort.getQuantFile(), pc);
    jobControl = jcc.compile(mrPlan, query);
    Util.copyFromLocalToCluster(
        cluster,
        "test/org/apache/pig/test/data/passwd",
        ((POLoad) sort.mapPlan.getRoots().get(0)).getLFile().getFileName());

    // First job is just foreach with projection, mapper-only job, so estimate gets ignored
    Util.assertParallelValues(-1, -1, -1, 0, jobControl.getWaitingJobs().get(0).getJobConf());

    jcc.updateMROpPlan(jobControl.getReadyJobs());
    jobControl = jcc.compile(mrPlan, query);
    jcc.updateMROpPlan(jobControl.getReadyJobs());

    // Second job is a sampler, which requests and gets 1 reducer
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    jobControl = jcc.compile(mrPlan, query);
    sort = mrPlan.getLeaves().get(0);
    assertEquals(reducer, sort.getRequestedParallelism());

    // Third job is the order, which uses the estimated number of reducers
    Util.assertParallelValues(
        -1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());
}
@Test
public void testReducerNumEstimation() throws Exception {
    // Skip the test for Tez. Tez uses a different mechanism.
    // Equivalent test is in TestTezAutoParallelism
    Assume.assumeTrue("Skip this test for TEZ", Util.isMapredExecType(cluster.getExecType()));
    // use the estimation
    Configuration conf = HBaseConfiguration.create(new Configuration());
    HBaseTestingUtility util = new HBaseTestingUtility(conf);
    int clientPort = util.startMiniZKCluster().getClientPort();
    util.startMiniHBaseCluster(1, 1);

    String query = "a = load '/passwd';" + "b = group a by $0;" + "store b into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);
    MROperPlan mrPlan = Util.buildMRPlan(pp, pc);

    pc.getConf().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getConf().setProperty("pig.exec.reducers.max", "10");
    pc.getConf().setProperty(HConstants.ZOOKEEPER_CLIENT_PORT, Integer.toString(clientPort));
    ConfigurationValidator.validatePigProperties(pc.getProperties());
    conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);
    JobControl jc = jcc.compile(mrPlan, "Test");
    Job job = jc.getWaitingJobs().get(0);
    long reducer =
        Math.min(
            (long) Math.ceil(new File("test/org/apache/pig/test/data/passwd").length() / 100.0),
            10);

    Util.assertParallelValues(-1, -1, reducer, reducer, job.getJobConf());

    // use the PARALLEL keyword, it will override the estimated reducer number
    query = "a = load '/passwd';" + "b = group a by $0 PARALLEL 2;" + "store b into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlan(pp, pc);

    pc.getConf().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getConf().setProperty("pig.exec.reducers.max", "10");
    ConfigurationValidator.validatePigProperties(pc.getProperties());
    conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    jcc = new JobControlCompiler(pc, conf);
    jc = jcc.compile(mrPlan, "Test");
    job = jc.getWaitingJobs().get(0);

    Util.assertParallelValues(-1, 2, -1, 2, job.getJobConf());

    final byte[] COLUMNFAMILY = Bytes.toBytes("pig");
    util.createTable(Bytes.toBytesBinary("test_table"), COLUMNFAMILY);

    // the estimation won't take effect when it applies to non-dfs input or the file doesn't
    // exist, such as hbase
    query =
        "a = load 'hbase://test_table' using org.apache.pig.backend.hadoop.hbase.HBaseStorage('c:f1 c:f2');"
            + "b = group a by $0 ;"
            + "store b into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlan(pp, pc);

    pc.getConf().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getConf().setProperty("pig.exec.reducers.max", "10");

    ConfigurationValidator.validatePigProperties(pc.getProperties());
    conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    jcc = new JobControlCompiler(pc, conf);
    jc = jcc.compile(mrPlan, "Test");
    job = jc.getWaitingJobs().get(0);

    Util.assertParallelValues(-1, -1, 1, 1, job.getJobConf());

    util.deleteTable(Bytes.toBytesBinary("test_table"));
    // In HBase 0.90.1 and above we can use util.shutdownMiniHBaseCluster()
    // here instead.
    MiniHBaseCluster hbc = util.getHBaseCluster();
    if (hbc != null) {
        hbc.shutdown();
        hbc.join();
    }
    util.shutdownMiniZKCluster();
}
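Both estimation tests assert the same arithmetic: the expected reducer count is the input size divided by pig.exec.reducers.bytes.per.reducer, rounded up, and capped at pig.exec.reducers.max. Below is a tiny sketch mirroring the expression the tests build; the class and method names are illustrative, not Pig's estimator API.

// Hypothetical helper mirroring the expected-value computation in the tests above.
public class ReducerEstimateSketch {
    static long expectedReducers(long inputBytes, long bytesPerReducer, long maxReducers) {
        return Math.min((long) Math.ceil(inputBytes / (double) bytesPerReducer), maxReducers);
    }

    public static void main(String[] args) {
        // With the test settings (100 bytes per reducer, max 10):
        System.out.println(expectedReducers(250, 100, 10));  // 3
        System.out.println(expectedReducers(2500, 100, 10)); // capped at 10
    }
}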
/**
 * This method copies all class files present in the local temp directory to the distributed
 * cache. All copied files will have a symlink of their name. No files will be copied if the
 * current job is being run in local mode.
 */
private void internalCopyAllGeneratedToDistributedCache() {
    LOG.info("Starting process to move generated code to distributed cache");
    if (pigContext.getExecType().isLocal()) {
        String codePath = codeDir.getAbsolutePath();
        LOG.info(
            "Distributed cache not supported or needed in local mode. Setting key ["
                + LOCAL_CODE_DIR
                + "] with code temp directory: "
                + codePath);
        conf.set(LOCAL_CODE_DIR, codePath);
        return;
    } else {
        // This lets us avoid NPE in some of the non-traditional pipelines
        String codePath = codeDir.getAbsolutePath();
        conf.set(LOCAL_CODE_DIR, codePath);
    }
    DistributedCache.createSymlink(conf); // we will read using symlinks
    StringBuilder serialized = new StringBuilder();
    boolean first = true;
    // We attempt to copy over every file in the generated code temp directory
    for (File f : codeDir.listFiles()) {
        if (first) {
            first = false;
        } else {
            serialized.append(",");
        }
        String symlink = f.getName(); // the class name will also be the symlink
        serialized.append(symlink);
        Path src = new Path(f.toURI());
        Path dst;
        try {
            dst = FileLocalizer.getTemporaryPath(pigContext);
        } catch (IOException e) {
            throw new RuntimeException("Error getting temporary path in HDFS", e);
        }
        FileSystem fs;
        try {
            fs = dst.getFileSystem(conf);
        } catch (IOException e) {
            throw new RuntimeException("Unable to get FileSystem", e);
        }
        try {
            fs.copyFromLocalFile(src, dst);
            fs.setReplication(dst, (short) conf.getInt(MRConfiguration.SUMIT_REPLICATION, 3));
        } catch (IOException e) {
            throw new RuntimeException(
                "Unable to copy from local filesystem to HDFS, src = " + src + ", dst = " + dst, e);
        }
        String destination = dst.toString() + "#" + symlink;
        try {
            DistributedCache.addCacheFile(new URI(destination), conf);
        } catch (URISyntaxException e) {
            throw new RuntimeException("Unable to add file to distributed cache: " + destination, e);
        }
        LOG.info("File successfully added to the distributed cache: " + symlink);
    }
    String toSer = serialized.toString();
    LOG.info(
        "Setting key [" + GENERATED_CLASSES_KEY + "] with classes to deserialize [" + toSer + "]");
    // we must set a key in the job conf so individual jobs know to resolve the shipped classes
    conf.set(GENERATED_CLASSES_KEY, toSer);
}
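The method above relies on the distributed-cache symlink convention: each uploaded file is registered with a "#name" fragment so it shows up under that name in every task's working directory. Below is a minimal sketch of that mechanism using Hadoop's (deprecated but still present) DistributedCache API; the HDFS path, file name, and the configuration key printed at the end are assumptions for illustration.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;

// Hypothetical demo of registering a cache file with a "#symlink" fragment.
public class SymlinkCacheSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        DistributedCache.createSymlink(conf); // tasks read the file through its symlink
        // The fragment after '#' is the name the file appears under in the task working directory.
        DistributedCache.addCacheFile(
                new URI("hdfs:///tmp/pig-codegen/GeneratedFoo.class#GeneratedFoo.class"), conf);
        // Key name assumed for Hadoop 2 ("mapred.cache.files" on older versions).
        System.out.println(conf.get("mapreduce.job.cache.files"));
    }
}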
public DefaultInputHandler(HandleSpec spec) { serializer = (PigToStream) PigContext.instantiateFuncFromSpec(spec.spec); }
/**
 * The Main-Class for the Pig Jar that will provide a shell and set up a classpath appropriate for
 * executing Jar files.
 *
 * @param args -jar can be used to add additional jar files (colon separated). - will start a
 *     shell. -e will execute the rest of the command line as if it was input to the shell.
 */
public static void main(String args[]) {
    int rc = 1;
    Properties properties = new Properties();
    PropertiesUtil.loadPropertiesFromFile(properties);

    boolean verbose = false;
    boolean gruntCalled = false;
    String logFileName = null;

    try {
        BufferedReader pin = null;
        boolean debug = false;
        boolean dryrun = false;
        ArrayList<String> params = new ArrayList<String>();
        ArrayList<String> paramFiles = new ArrayList<String>();
        HashSet<String> optimizerRules = new HashSet<String>();

        CmdLineParser opts = new CmdLineParser(args);
        opts.registerOpt('4', "log4jconf", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('b', "brief", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('c', "cluster", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('d', "debug", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('e', "execute", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('f', "file", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('h', "help", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('i', "version", CmdLineParser.ValueExpected.OPTIONAL);
        opts.registerOpt('j', "jar", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('l', "logfile", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('m', "param_file", CmdLineParser.ValueExpected.OPTIONAL);
        opts.registerOpt('o', "hod", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('p', "param", CmdLineParser.ValueExpected.OPTIONAL);
        opts.registerOpt('r', "dryrun", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('t', "optimizer_off", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('v', "verbose", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('w', "warning", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('x', "exectype", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('F', "stop_on_failure", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('M', "no_multiquery", CmdLineParser.ValueExpected.NOT_ACCEPTED);

        ExecMode mode = ExecMode.UNKNOWN;
        String file = null;
        ExecType execType = ExecType.MAPREDUCE;
        String execTypeString = properties.getProperty("exectype");
        if (execTypeString != null && execTypeString.length() > 0) {
            execType = PigServer.parseExecType(execTypeString);
        }
        String cluster = "local";
        String clusterConfigured = properties.getProperty("cluster");
        if (clusterConfigured != null && clusterConfigured.length() > 0) {
            cluster = clusterConfigured;
        }

        // by default warning aggregation is on
        properties.setProperty("aggregate.warning", "" + true);

        // by default multiquery optimization is on
        properties.setProperty("opt.multiquery", "" + true);

        // by default we keep going on error on the backend
        properties.setProperty("stop.on.failure", "" + false);

        char opt;
        while ((opt = opts.getNextOpt()) != CmdLineParser.EndOfOpts) {
            switch (opt) {
                case '4':
                    String log4jconf = opts.getValStr();
                    if (log4jconf != null) {
                        properties.setProperty(LOG4J_CONF, log4jconf);
                    }
                    break;
                case 'b':
                    properties.setProperty(BRIEF, "true");
                    break;
                case 'c':
                    // Needed a way to specify the cluster to run the MR job on
                    // Bug 831708 - fixed
                    String clusterParameter = opts.getValStr();
                    if (clusterParameter != null && clusterParameter.length() > 0) {
                        cluster = clusterParameter;
                    }
                    break;
                case 'd':
                    String logLevel = opts.getValStr();
                    if (logLevel != null) {
                        properties.setProperty(DEBUG, logLevel);
                    }
                    debug = true;
                    break;
                case 'e':
                    mode = ExecMode.STRING;
                    break;
                case 'f':
                    mode = ExecMode.FILE;
                    file = opts.getValStr();
                    break;
                case 'F':
                    properties.setProperty("stop.on.failure", "" + true);
                    break;
                case 'h':
                    usage();
                    return;
                case 'i':
                    System.out.println(getVersionString());
                    return;
                case 'j':
                    String jarsString = opts.getValStr();
                    if (jarsString != null) {
                        properties.setProperty(JAR, jarsString);
                    }
                    break;
                case 'l':
                    // call to method that validates the path to the log file
                    // and sets up the file to store the client side log file
                    String logFileParameter = opts.getValStr();
                    if (logFileParameter != null && logFileParameter.length() > 0) {
                        logFileName = validateLogFile(logFileParameter, null);
                    } else {
                        logFileName = validateLogFile(logFileName, null);
                    }
                    properties.setProperty("pig.logfile", logFileName);
                    break;
                case 'm':
                    paramFiles.add(opts.getValStr());
                    break;
                case 'M':
                    // turns off multiquery optimization
                    properties.setProperty("opt.multiquery", "" + false);
                    break;
                case 'o':
                    // TODO sgroschupf using system properties is always a very bad idea
                    String gateway = System.getProperty("ssh.gateway");
                    if (gateway == null || gateway.length() == 0) {
                        properties.setProperty("hod.server", "local");
                    } else {
                        properties.setProperty("hod.server", System.getProperty("ssh.gateway"));
                    }
                    break;
                case 'p':
                    params.add(opts.getValStr());
                    break;
                case 'r':
                    // currently only used for parameter substitution
                    // will be extended in the future
                    dryrun = true;
                    break;
                case 't':
                    optimizerRules.add(opts.getValStr());
                    break;
                case 'v':
                    properties.setProperty(VERBOSE, "" + true);
                    verbose = true;
                    break;
                case 'w':
                    properties.setProperty("aggregate.warning", "" + false);
                    break;
                case 'x':
                    try {
                        execType = PigServer.parseExecType(opts.getValStr());
                    } catch (IOException e) {
                        throw new RuntimeException("ERROR: Unrecognized exectype.", e);
                    }
                    break;
                default: {
                    Character cc = new Character(opt);
                    throw new AssertionError("Unhandled option " + cc.toString());
                }
            }
        }

        // configure logging
        configureLog4J(properties);
        // create the context with the parameter
        PigContext pigContext = new PigContext(execType, properties);

        if (logFileName == null) {
            logFileName = validateLogFile(null, null);
        }
        pigContext.getProperties().setProperty("pig.logfile", logFileName);

        if (optimizerRules.size() > 0) {
            pigContext
                .getProperties()
                .setProperty("pig.optimizer.rules", ObjectSerializer.serialize(optimizerRules));
        }

        LogicalPlanBuilder.classloader = pigContext.createCl(null);

        // construct the parameter substitution preprocessor
        Grunt grunt = null;
        BufferedReader in;
        String substFile = null;
        switch (mode) {
            case FILE: {
                // Run, using the provided file as a pig file
                in = new BufferedReader(new FileReader(file));

                // run parameter substitution preprocessor first
                substFile = file + ".substituted";
                pin = runParamPreprocessor(in, params, paramFiles, substFile, debug || dryrun);
                if (dryrun) {
                    log.info("Dry run completed. Substituted pig script is at " + substFile);
                    return;
                }

                logFileName = validateLogFile(logFileName, file);
                pigContext.getProperties().setProperty("pig.logfile", logFileName);

                // Set job name based on name of the script
                pigContext
                    .getProperties()
                    .setProperty(PigContext.JOB_NAME, "PigLatin:" + new File(file).getName());

                if (!debug) {
                    new File(substFile).deleteOnExit();
                }

                grunt = new Grunt(pin, pigContext);
                gruntCalled = true;
                int results[] = grunt.exec();
                rc = getReturnCodeForStats(results);
                return;
            }

            case STRING: {
                // Gather up all the remaining arguments into a string and pass them into
                // grunt.
                StringBuffer sb = new StringBuffer();
                String remainders[] = opts.getRemainingArgs();
                for (int i = 0; i < remainders.length; i++) {
                    if (i != 0) sb.append(' ');
                    sb.append(remainders[i]);
                }
                in = new BufferedReader(new StringReader(sb.toString()));
                grunt = new Grunt(in, pigContext);
                gruntCalled = true;
                int results[] = grunt.exec();
                rc = getReturnCodeForStats(results);
                return;
            }

            default:
                break;
        }

        // If we're here, we don't know yet what they want. They may have just
        // given us a jar to execute, they might have given us a pig script to
        // execute, or they might have given us a dash (or nothing) which means to
        // run grunt interactive.
        String remainders[] = opts.getRemainingArgs();
        if (remainders == null) {
            // Interactive
            mode = ExecMode.SHELL;
            ConsoleReader reader = new ConsoleReader(System.in, new OutputStreamWriter(System.out));
            reader.setDefaultPrompt("grunt> ");
            final String HISTORYFILE = ".pig_history";
            String historyFile = System.getProperty("user.home") + File.separator + HISTORYFILE;
            reader.setHistory(new History(new File(historyFile)));
            ConsoleReaderInputStream inputStream = new ConsoleReaderInputStream(reader);
            grunt = new Grunt(new BufferedReader(new InputStreamReader(inputStream)), pigContext);
            grunt.setConsoleReader(reader);
            gruntCalled = true;
            grunt.run();
            rc = 0;
            return;
        } else {
            // They have a pig script they want us to run.
            if (remainders.length > 1) {
                throw new RuntimeException(
                    "You can only run one pig script " + "at a time from the command line.");
            }
            mode = ExecMode.FILE;
            in = new BufferedReader(new FileReader(remainders[0]));

            // run parameter substitution preprocessor first
            substFile = remainders[0] + ".substituted";
            pin = runParamPreprocessor(in, params, paramFiles, substFile, debug || dryrun);
            if (dryrun) {
                log.info("Dry run completed. Substituted pig script is at " + substFile);
                return;
            }

            logFileName = validateLogFile(logFileName, remainders[0]);
            pigContext.getProperties().setProperty("pig.logfile", logFileName);

            if (!debug) {
                new File(substFile).deleteOnExit();
            }

            // Set job name based on name of the script
            pigContext
                .getProperties()
                .setProperty(PigContext.JOB_NAME, "PigLatin:" + new File(remainders[0]).getName());

            grunt = new Grunt(pin, pigContext);
            gruntCalled = true;
            int[] results = grunt.exec();
            rc = getReturnCodeForStats(results);
            return;
        }
        // Per Utkarsh and Chris, invocation of jar files via pig is deprecated.
    } catch (ParseException e) {
        usage();
        rc = 2;
    } catch (NumberFormatException e) {
        usage();
        rc = 2;
    } catch (PigException pe) {
        if (pe.retriable()) {
            rc = 1;
        } else {
            rc = 2;
        }
        if (!gruntCalled) {
            LogUtils.writeLog(pe, logFileName, log, verbose);
        }
    } catch (Throwable e) {
        rc = 2;
        if (!gruntCalled) {
            LogUtils.writeLog(e, logFileName, log, verbose);
        }
    } finally {
        // clear temp files
        FileLocalizer.deleteTempFiles();
        PerformanceTimerFactory.getPerfTimerFactory().dumpTimers();
        System.exit(rc);
    }
}