@Before
public void setUp() throws Exception {
    curDir = System.getProperty("user.dir");
    inpDir = curDir + File.separatorChar + "test/org/apache/pig/test/data/InputFiles/";
    golDir = curDir + File.separatorChar + "test/org/apache/pig/test/data/GoldenFiles/";
    if (Util.WINDOWS) {
        inpDir = "/" + FileLocalizer.parseCygPath(inpDir, FileLocalizer.STYLE_WINDOWS);
        golDir = "/" + FileLocalizer.parseCygPath(golDir, FileLocalizer.STYLE_WINDOWS);
    }
}
@Before
public void setUp() throws ExecException {
    GenPhyOp.setR(r);
    GenPhyOp.setPc(pc);
    // Set random seed to generate deterministic temporary paths
    FileLocalizer.setR(new Random(1331L));
    NodeIdGenerator.reset("");
    pigServer = new PigServer(pc);
    pigServerMR = new PigServer(pcMR);
}
/**
 * Given a path, which may represent a glob pattern, a directory, comma-separated files/glob
 * patterns or a file, this method finds the set of relevant metadata files on the storage
 * system. The algorithm for finding the metadata file is as follows:
 *
 * <p>For each object represented by the path (either directly, or via a glob): if the object is
 * a directory and path/metaname exists, use that as the metadata file; else if
 * parentPath/metaname exists, use that as the metadata file.
 *
 * <p>Resolving conflicts, merging the metadata, etc. is not handled by this method and should
 * be taken care of by downstream code.
 *
 * @param path Path, as passed in to a LoadFunc (may be a Hadoop glob)
 * @param metaname Metadata file designation, such as .pig_schema or .pig_stats
 * @param conf configuration object
 * @return Set of element descriptors for all metadata files associated with the files on the
 *     path.
 */
protected Set<ElementDescriptor> findMetaFile(String path, String metaname, Configuration conf)
        throws IOException {
    Set<ElementDescriptor> metaFileSet = new HashSet<ElementDescriptor>();
    String[] locations = LoadFunc.getPathStrings(path);
    for (String loc : locations) {
        DataStorage storage =
                new HDataStorage(new Path(loc).toUri(), ConfigurationUtil.toProperties(conf));
        String fullPath = FileLocalizer.fullPath(loc, storage);
        if (storage.isContainer(fullPath)) {
            ElementDescriptor metaFilePath = storage.asElement(fullPath, metaname);
            if (exists(metaFilePath)) {
                metaFileSet.add(metaFilePath);
            }
        } else {
            ElementDescriptor[] descriptors = storage.asCollection(loc);
            for (ElementDescriptor descriptor : descriptors) {
                ContainerDescriptor container = null;
                if (descriptor instanceof HFile) {
                    Path descriptorPath = ((HPath) descriptor).getPath();
                    Path parent = descriptorPath.getParent();
                    container = new HDirectory((HDataStorage) storage, parent);
                } else { // descriptor instanceof HDirectory
                    container = (HDirectory) descriptor;
                }
                // if no custom schema, try the parent directory
                ElementDescriptor metaFilePath = storage.asElement(container, metaname);
                if (exists(metaFilePath)) {
                    metaFileSet.add(metaFilePath);
                }
            }
        }
    }
    return metaFileSet;
}
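A hedged usage sketch (not from the original source): roughly how a schema-aware loader might call findMetaFile from a LoadMetadata.getSchema implementation to pick up a .pig_schema file next to its input. The surrounding class context, imports, and the helper schemaFromMetaFile are assumptions; conflict resolution is deliberately left to callers, as the Javadoc above notes.

// Sketch only: assumes this lives in a subclass with access to findMetaFile and the same imports
// (org.apache.hadoop.mapreduce.Job, org.apache.pig.ResourceSchema). schemaFromMetaFile(...) is a
// hypothetical helper that parses one metadata file into a ResourceSchema.
public ResourceSchema getSchema(String location, Job job) throws IOException {
    Set<ElementDescriptor> schemaFiles =
            findMetaFile(location, ".pig_schema", job.getConfiguration());
    for (ElementDescriptor metaFile : schemaFiles) {
        // Take the first schema found; merging/conflict resolution is left to downstream code.
        return schemaFromMetaFile(metaFile);
    }
    return null;
}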
@Test
public void testReducerNumEstimationForOrderBy() throws Exception {
    // Skip the test for Tez. Tez uses a different mechanism; the equivalent test is in
    // TestTezAutoParallelism.
    Assume.assumeTrue("Skip this test for TEZ", Util.isMapredExecType(cluster.getExecType()));
    // use the estimation
    pc.getProperties().setProperty("pig.exec.reducers.bytes.per.reducer", "100");
    pc.getProperties().setProperty("pig.exec.reducers.max", "10");

    String query = "a = load '/passwd';"
            + "b = order a by $0;"
            + "store b into 'output';";
    PigServer ps = new PigServer(cluster.getExecType(), cluster.getProperties());
    PhysicalPlan pp = Util.buildPp(ps, query);
    MROperPlan mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);

    Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    JobControlCompiler jcc = new JobControlCompiler(pc, conf);
    JobControl jobControl = jcc.compile(mrPlan, query);

    assertEquals(2, mrPlan.size());

    // first job uses a single reducer for the sampling
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    // Simulate the first job having run so estimation kicks in.
    MapReduceOper sort = mrPlan.getLeaves().get(0);
    jcc.updateMROpPlan(jobControl.getReadyJobs());
    FileLocalizer.create(sort.getQuantFile(), pc);
    jobControl = jcc.compile(mrPlan, query);

    sort = mrPlan.getLeaves().get(0);
    long reducer = Math.min(
            (long) Math.ceil(new File("test/org/apache/pig/test/data/passwd").length() / 100.0),
            10);
    assertEquals(reducer, sort.getRequestedParallelism());

    // the second job estimates reducers
    Util.assertParallelValues(
            -1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());

    // the PARALLEL keyword overrides the estimated reducer number
    query = "a = load '/passwd';"
            + "b = order a by $0 PARALLEL 2;"
            + "store b into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);

    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);
    assertEquals(2, sort.getRequestedParallelism());

    // the estimation doesn't take effect for non-dfs locations or files that don't exist,
    // such as hbase
    query = "a = load 'hbase://passwd' using org.apache.pig.backend.hadoop.hbase.HBaseStorage('c:f1 c:f2');"
            + "b = order a by $0 ;"
            + "store b into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(2, mrPlan.size());

    sort = mrPlan.getLeaves().get(0);
    // the requested parallelism will be -1 if the user sets neither default_parallel nor
    // PARALLEL and the estimation doesn't take effect; the MR framework will finally set it to 1
    assertEquals(-1, sort.getRequestedParallelism());

    // test order by with three jobs (after optimization)
    query = "a = load '/passwd';"
            + "b = foreach a generate $0, $1, $2;"
            + "c = order b by $0;"
            + "store c into 'output';";
    pp = Util.buildPp(ps, query);
    mrPlan = Util.buildMRPlanWithOptimizer(pp, pc);
    assertEquals(3, mrPlan.size());

    // Simulate the first 2 jobs having run so estimation kicks in.
    sort = mrPlan.getLeaves().get(0);
    FileLocalizer.create(sort.getQuantFile(), pc);
    jobControl = jcc.compile(mrPlan, query);
    Util.copyFromLocalToCluster(
            cluster,
            "test/org/apache/pig/test/data/passwd",
            ((POLoad) sort.mapPlan.getRoots().get(0)).getLFile().getFileName());

    // First job is just a foreach with projection, a map-only job, so the estimate gets ignored
    Util.assertParallelValues(-1, -1, -1, 0, jobControl.getWaitingJobs().get(0).getJobConf());

    jcc.updateMROpPlan(jobControl.getReadyJobs());
    jobControl = jcc.compile(mrPlan, query);
    jcc.updateMROpPlan(jobControl.getReadyJobs());

    // Second job is a sampler, which requests and gets 1 reducer
    Util.assertParallelValues(-1, 1, -1, 1, jobControl.getWaitingJobs().get(0).getJobConf());

    jobControl = jcc.compile(mrPlan, query);
    sort = mrPlan.getLeaves().get(0);
    assertEquals(reducer, sort.getRequestedParallelism());

    // Third job is the order, which uses the estimated number of reducers
    Util.assertParallelValues(
            -1, -1, reducer, reducer, jobControl.getWaitingJobs().get(0).getJobConf());
}
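The arithmetic the assertions above rely on can be summarized in a small helper. This is an illustrative sketch of the bytes-per-reducer rule the test exercises, not the actual estimator class, and the 1,000-byte figure in the comment is a hypothetical input size.

// Sketch of the estimation rule: total input bytes divided by
// pig.exec.reducers.bytes.per.reducer, rounded up, floored at 1 and capped at
// pig.exec.reducers.max.
static long estimateReducers(long totalInputBytes, long bytesPerReducer, long maxReducers) {
    long estimate = (long) Math.ceil((double) totalInputBytes / bytesPerReducer);
    return Math.min(Math.max(estimate, 1), maxReducers);
}

// With the test's settings (100 bytes per reducer, max 10), a hypothetical 1,000-byte input
// would be assigned min(max(ceil(1000 / 100), 1), 10) = 10 reducers.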
/**
 * The Main-Class for the Pig Jar that will provide a shell and set up a classpath appropriate
 * for executing Jar files.
 *
 * @param args -jar can be used to add additional jar files (colon separated), - will start a
 *     shell, and -e will execute the rest of the command line as if it were input to the shell.
 */
public static void main(String args[]) {
    int rc = 1;
    Properties properties = new Properties();
    PropertiesUtil.loadPropertiesFromFile(properties);

    boolean verbose = false;
    boolean gruntCalled = false;
    String logFileName = null;

    try {
        BufferedReader pin = null;
        boolean debug = false;
        boolean dryrun = false;
        ArrayList<String> params = new ArrayList<String>();
        ArrayList<String> paramFiles = new ArrayList<String>();
        HashSet<String> optimizerRules = new HashSet<String>();

        CmdLineParser opts = new CmdLineParser(args);
        opts.registerOpt('4', "log4jconf", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('b', "brief", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('c', "cluster", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('d', "debug", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('e', "execute", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('f', "file", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('h', "help", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('i', "version", CmdLineParser.ValueExpected.OPTIONAL);
        opts.registerOpt('j', "jar", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('l', "logfile", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('m', "param_file", CmdLineParser.ValueExpected.OPTIONAL);
        opts.registerOpt('o', "hod", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('p', "param", CmdLineParser.ValueExpected.OPTIONAL);
        opts.registerOpt('r', "dryrun", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('t', "optimizer_off", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('v', "verbose", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('w', "warning", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('x', "exectype", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('F', "stop_on_failure", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('M', "no_multiquery", CmdLineParser.ValueExpected.NOT_ACCEPTED);

        ExecMode mode = ExecMode.UNKNOWN;
        String file = null;
        ExecType execType = ExecType.MAPREDUCE;
        String execTypeString = properties.getProperty("exectype");
        if (execTypeString != null && execTypeString.length() > 0) {
            execType = PigServer.parseExecType(execTypeString);
        }
        String cluster = "local";
        String clusterConfigured = properties.getProperty("cluster");
        if (clusterConfigured != null && clusterConfigured.length() > 0) {
            cluster = clusterConfigured;
        }

        // by default warning aggregation is on
        properties.setProperty("aggregate.warning", "" + true);

        // by default multiquery optimization is on
        properties.setProperty("opt.multiquery", "" + true);

        // by default we keep going on error on the backend
        properties.setProperty("stop.on.failure", "" + false);

        char opt;
        while ((opt = opts.getNextOpt()) != CmdLineParser.EndOfOpts) {
            switch (opt) {
                case '4':
                    String log4jconf = opts.getValStr();
                    if (log4jconf != null) {
                        properties.setProperty(LOG4J_CONF, log4jconf);
                    }
                    break;

                case 'b':
                    properties.setProperty(BRIEF, "true");
                    break;

                case 'c':
                    // Needed a way to specify the cluster to run the MR job on
                    // Bug 831708 - fixed
                    String clusterParameter = opts.getValStr();
                    if (clusterParameter != null && clusterParameter.length() > 0) {
                        cluster = clusterParameter;
                    }
                    break;

                case 'd':
                    String logLevel = opts.getValStr();
                    if (logLevel != null) {
                        properties.setProperty(DEBUG, logLevel);
                    }
                    debug = true;
                    break;

                case 'e':
                    mode = ExecMode.STRING;
                    break;

                case 'f':
                    mode = ExecMode.FILE;
                    file = opts.getValStr();
                    break;

                case 'F':
                    properties.setProperty("stop.on.failure", "" + true);
                    break;

                case 'h':
                    usage();
                    return;

                case 'i':
                    System.out.println(getVersionString());
                    return;

                case 'j':
                    String jarsString = opts.getValStr();
                    if (jarsString != null) {
                        properties.setProperty(JAR, jarsString);
                    }
                    break;

                case 'l':
                    // call to method that validates the path to the log file
                    // and sets up the file to store the client side log file
                    String logFileParameter = opts.getValStr();
                    if (logFileParameter != null && logFileParameter.length() > 0) {
                        logFileName = validateLogFile(logFileParameter, null);
                    } else {
                        logFileName = validateLogFile(logFileName, null);
                    }
                    properties.setProperty("pig.logfile", logFileName);
                    break;

                case 'm':
                    paramFiles.add(opts.getValStr());
                    break;

                case 'M':
                    // turns off multiquery optimization
                    properties.setProperty("opt.multiquery", "" + false);
                    break;

                case 'o':
                    // TODO sgroschupf using system properties is always a very bad idea
                    String gateway = System.getProperty("ssh.gateway");
                    if (gateway == null || gateway.length() == 0) {
                        properties.setProperty("hod.server", "local");
                    } else {
                        properties.setProperty("hod.server", System.getProperty("ssh.gateway"));
                    }
                    break;

                case 'p':
                    params.add(opts.getValStr());
                    break;

                case 'r':
                    // currently only used for parameter substitution
                    // will be extended in the future
                    dryrun = true;
                    break;

                case 't':
                    optimizerRules.add(opts.getValStr());
                    break;

                case 'v':
                    properties.setProperty(VERBOSE, "" + true);
                    verbose = true;
                    break;

                case 'w':
                    properties.setProperty("aggregate.warning", "" + false);
                    break;

                case 'x':
                    try {
                        execType = PigServer.parseExecType(opts.getValStr());
                    } catch (IOException e) {
                        throw new RuntimeException("ERROR: Unrecognized exectype.", e);
                    }
                    break;

                default: {
                    Character cc = new Character(opt);
                    throw new AssertionError("Unhandled option " + cc.toString());
                }
            }
        }

        // configure logging
        configureLog4J(properties);

        // create the context with the parameter
        PigContext pigContext = new PigContext(execType, properties);

        if (logFileName == null) {
            logFileName = validateLogFile(null, null);
        }
        pigContext.getProperties().setProperty("pig.logfile", logFileName);

        if (optimizerRules.size() > 0) {
            pigContext
                    .getProperties()
                    .setProperty("pig.optimizer.rules", ObjectSerializer.serialize(optimizerRules));
        }

        LogicalPlanBuilder.classloader = pigContext.createCl(null);

        // construct the parameter substitution preprocessor
        Grunt grunt = null;
        BufferedReader in;
        String substFile = null;
        switch (mode) {
            case FILE: {
                // Run, using the provided file as a pig file
                in = new BufferedReader(new FileReader(file));

                // run parameter substitution preprocessor first
                substFile = file + ".substituted";
                pin = runParamPreprocessor(in, params, paramFiles, substFile, debug || dryrun);
                if (dryrun) {
                    log.info("Dry run completed. Substituted pig script is at " + substFile);
                    return;
                }

                logFileName = validateLogFile(logFileName, file);
                pigContext.getProperties().setProperty("pig.logfile", logFileName);

                // Set job name based on name of the script
                pigContext
                        .getProperties()
                        .setProperty(PigContext.JOB_NAME, "PigLatin:" + new File(file).getName());

                if (!debug) {
                    new File(substFile).deleteOnExit();
                }

                grunt = new Grunt(pin, pigContext);
                gruntCalled = true;
                int results[] = grunt.exec();
                rc = getReturnCodeForStats(results);
                return;
            }

            case STRING: {
                // Gather up all the remaining arguments into a string and pass them into grunt.
                StringBuffer sb = new StringBuffer();
                String remainders[] = opts.getRemainingArgs();
                for (int i = 0; i < remainders.length; i++) {
                    if (i != 0) sb.append(' ');
                    sb.append(remainders[i]);
                }
                in = new BufferedReader(new StringReader(sb.toString()));
                grunt = new Grunt(in, pigContext);
                gruntCalled = true;
                int results[] = grunt.exec();
                rc = getReturnCodeForStats(results);
                return;
            }

            default:
                break;
        }

        // If we're here, we don't know yet what they want. They may have just
        // given us a jar to execute, they might have given us a pig script to
        // execute, or they might have given us a dash (or nothing) which means to
        // run grunt interactive.
        String remainders[] = opts.getRemainingArgs();
        if (remainders == null) {
            // Interactive
            mode = ExecMode.SHELL;
            ConsoleReader reader = new ConsoleReader(System.in, new OutputStreamWriter(System.out));
            reader.setDefaultPrompt("grunt> ");
            final String HISTORYFILE = ".pig_history";
            String historyFile = System.getProperty("user.home") + File.separator + HISTORYFILE;
            reader.setHistory(new History(new File(historyFile)));
            ConsoleReaderInputStream inputStream = new ConsoleReaderInputStream(reader);
            grunt = new Grunt(new BufferedReader(new InputStreamReader(inputStream)), pigContext);
            grunt.setConsoleReader(reader);
            gruntCalled = true;
            grunt.run();
            rc = 0;
            return;
        } else {
            // They have a pig script they want us to run.
            if (remainders.length > 1) {
                throw new RuntimeException(
                        "You can only run one pig script at a time from the command line.");
            }
            mode = ExecMode.FILE;
            in = new BufferedReader(new FileReader(remainders[0]));

            // run parameter substitution preprocessor first
            substFile = remainders[0] + ".substituted";
            pin = runParamPreprocessor(in, params, paramFiles, substFile, debug || dryrun);
            if (dryrun) {
                log.info("Dry run completed. Substituted pig script is at " + substFile);
                return;
            }

            logFileName = validateLogFile(logFileName, remainders[0]);
            pigContext.getProperties().setProperty("pig.logfile", logFileName);

            if (!debug) {
                new File(substFile).deleteOnExit();
            }

            // Set job name based on name of the script
            pigContext
                    .getProperties()
                    .setProperty(PigContext.JOB_NAME, "PigLatin:" + new File(remainders[0]).getName());

            grunt = new Grunt(pin, pigContext);
            gruntCalled = true;
            int[] results = grunt.exec();
            rc = getReturnCodeForStats(results);
            return;
        }
        // Per Utkarsh and Chris, invocation of a jar file via pig is deprecated.
    } catch (ParseException e) {
        usage();
        rc = 2;
    } catch (NumberFormatException e) {
        usage();
        rc = 2;
    } catch (PigException pe) {
        if (pe.retriable()) {
            rc = 1;
        } else {
            rc = 2;
        }
        if (!gruntCalled) {
            LogUtils.writeLog(pe, logFileName, log, verbose);
        }
    } catch (Throwable e) {
        rc = 2;
        if (!gruntCalled) {
            LogUtils.writeLog(e, logFileName, log, verbose);
        }
    } finally {
        // clear temp files
        FileLocalizer.deleteTempFiles();
        PerformanceTimerFactory.getPerfTimerFactory().dumpTimers();
        System.exit(rc);
    }
}
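For readers who drive Pig from Java rather than the command line, the sketch below shows roughly how the defaults main() establishes (warning aggregation on, multiquery on, stop-on-failure off) could be reproduced through PigServer. This is an assumption-level example, not code from Main, and "myscript.pig" is a placeholder script name.

// Sketch only: programmatic equivalent of the CLI defaults set up above.
public static void runScriptProgrammatically() throws Exception {
    Properties props = new Properties();
    PropertiesUtil.loadPropertiesFromFile(props);
    props.setProperty("aggregate.warning", "" + true);   // -w turns this off
    props.setProperty("opt.multiquery", "" + true);      // -M turns this off
    props.setProperty("stop.on.failure", "" + false);    // -F turns this on
    PigServer server = new PigServer(new PigContext(ExecType.MAPREDUCE, props));
    server.registerScript("myscript.pig");               // analogous to `pig myscript.pig`
}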
/**
 * This method copies all class files present in the local temp directory to the distributed
 * cache. Each copied file will have a symlink of its name. No files will be copied if the
 * current job is being run from local mode.
 */
private void internalCopyAllGeneratedToDistributedCache() {
    LOG.info("Starting process to move generated code to distributed cache");
    if (pigContext.getExecType().isLocal()) {
        String codePath = codeDir.getAbsolutePath();
        LOG.info(
                "Distributed cache not supported or needed in local mode. Setting key ["
                        + LOCAL_CODE_DIR
                        + "] with code temp directory: "
                        + codePath);
        conf.set(LOCAL_CODE_DIR, codePath);
        return;
    } else {
        // This lets us avoid an NPE in some of the non-traditional pipelines
        String codePath = codeDir.getAbsolutePath();
        conf.set(LOCAL_CODE_DIR, codePath);
    }

    DistributedCache.createSymlink(conf); // we will read using symlinks
    StringBuilder serialized = new StringBuilder();
    boolean first = true;
    // We attempt to copy over every file in the generated code temp directory
    for (File f : codeDir.listFiles()) {
        if (first) {
            first = false;
        } else {
            serialized.append(",");
        }
        String symlink = f.getName(); // the class name will also be the symlink
        serialized.append(symlink);
        Path src = new Path(f.toURI());
        Path dst;
        try {
            dst = FileLocalizer.getTemporaryPath(pigContext);
        } catch (IOException e) {
            throw new RuntimeException("Error getting temporary path in HDFS", e);
        }
        FileSystem fs;
        try {
            fs = dst.getFileSystem(conf);
        } catch (IOException e) {
            throw new RuntimeException("Unable to get FileSystem", e);
        }
        try {
            fs.copyFromLocalFile(src, dst);
            fs.setReplication(dst, (short) conf.getInt(MRConfiguration.SUMIT_REPLICATION, 3));
        } catch (IOException e) {
            throw new RuntimeException(
                    "Unable to copy from local filesystem to HDFS, src = " + src + ", dst = " + dst,
                    e);
        }
        String destination = dst.toString() + "#" + symlink;
        try {
            DistributedCache.addCacheFile(new URI(destination), conf);
        } catch (URISyntaxException e) {
            throw new RuntimeException("Unable to add file to distributed cache: " + destination, e);
        }
        LOG.info("File successfully added to the distributed cache: " + symlink);
    }
    String toSer = serialized.toString();
    LOG.info(
            "Setting key [" + GENERATED_CLASSES_KEY + "] with classes to deserialize [" + toSer + "]");
    // we must set a key in the job conf so individual jobs know to resolve the shipped classes
    conf.set(GENERATED_CLASSES_KEY, toSer);
}
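As a rough illustration of the consumer side (an assumption, not Pig's actual task-side resolver), a task could read GENERATED_CLASSES_KEY back from the job conf and load each shipped class from the symlink that the distributed cache materializes in its working directory:

// Sketch only: assumes the shipped files are plain .class files reachable via their symlink
// names in the task working directory. Passing null to defineClass lets the JVM derive the
// binary class name from the bytecode itself.
private static final class GeneratedClassLoader extends ClassLoader {
    Class<?> loadFromSymlink(String symlink) throws IOException {
        byte[] bytes = java.nio.file.Files.readAllBytes(java.nio.file.Paths.get(symlink));
        return defineClass(null, bytes, 0, bytes.length);
    }
}

// Hypothetical usage inside a task, one entry per comma-separated value of the key:
// for (String symlink : conf.get(GENERATED_CLASSES_KEY).split(",")) {
//     new GeneratedClassLoader().loadFromSymlink(symlink);
// }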