/**
 * Configures the Reduce plan, the POPackage operator and the reporter thread.
 */
@SuppressWarnings("unchecked")
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    inIllustrator = inIllustrator(context);
    if (inIllustrator) {
        pack = getPack(context);
    }
    Configuration jConf = context.getConfiguration();
    SpillableMemoryManager.configure(ConfigurationUtil.toProperties(jConf));
    context.getConfiguration().set(PigConstants.TASK_INDEX,
            Integer.toString(context.getTaskAttemptID().getTaskID().getId()));
    sJobContext = context;
    sJobConfInternal.set(context.getConfiguration());
    sJobConf = context.getConfiguration();
    try {
        PigContext.setPackageImportList(
                (ArrayList<String>) ObjectSerializer.deserialize(jConf.get("udf.import.list")));
        pigContext = (PigContext) ObjectSerializer.deserialize(jConf.get("pig.pigContext"));

        // This attempts to fetch all of the generated code from the distributed
        // cache, and resolve it.
        SchemaTupleBackend.initialize(jConf, pigContext);

        if (rp == null) {
            rp = (PhysicalPlan) ObjectSerializer.deserialize(jConf.get("pig.reducePlan"));
        }
        stores = PlanHelper.getPhysicalOperators(rp, POStore.class);

        if (!inIllustrator) {
            pack = (POPackage) ObjectSerializer.deserialize(jConf.get("pig.reduce.package"));
        }

        // To be removed
        if (rp.isEmpty()) {
            log.debug("Reduce Plan empty!");
        } else {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            rp.explain(baos);
            log.debug(baos.toString());
        }

        pigReporter = new ProgressableReporter();
        if (!(rp.isEmpty())) {
            roots = rp.getRoots().toArray(new PhysicalOperator[1]);
            leaf = rp.getLeaves().get(0);
        }

        // Get the UDF-specific context.
        MapRedUtil.setupUDFContext(jConf);
    } catch (IOException ioe) {
        String msg = "Problem while configuring reduce plan.";
        throw new RuntimeException(msg, ioe);
    }

    log.info("Aliases being processed per job phase (AliasName[line,offset]): "
            + jConf.get("pig.alias.location"));

    Utils.setDefaultTimeZone(PigMapReduce.sJobConfInternal.get());
}
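// A minimal sketch of the producer side of the keys that setup() above
// deserializes. In Pig this population happens in JobControlCompiler; the
// helper below and its variable names are illustrative, not from the source.
public class ReducePlanSetupSketch {
    public static void configure(org.apache.hadoop.conf.Configuration conf,
                                 PhysicalPlan reducePlan,
                                 POPackage pack,
                                 PigContext pigContext) throws java.io.IOException {
        // Each object is serialized to a String and stored under the key that
        // setup() later reads back with ObjectSerializer.deserialize().
        conf.set("pig.reducePlan", ObjectSerializer.serialize(reducePlan));
        conf.set("pig.reduce.package", ObjectSerializer.serialize(pack));
        conf.set("pig.pigContext", ObjectSerializer.serialize(pigContext));
    }
}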
public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList)
        throws FrontendException {
    if (requiredFieldList == null) {
        return null;
    }
    if (requiredFieldList.getFields() != null) {
        // Size the bitmap by the highest requested column index.
        int lastColumn = -1;
        for (RequiredField rf : requiredFieldList.getFields()) {
            if (rf.getIndex() > lastColumn) {
                lastColumn = rf.getIndex();
            }
        }
        requiredFields = new boolean[lastColumn + 1];
        for (RequiredField rf : requiredFieldList.getFields()) {
            if (rf.getIndex() != -1) {
                requiredFields[rf.getIndex()] = true;
            }
        }
        Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
        try {
            p.setProperty(REQUIRED_FIELDS_SIGNATURE, ObjectSerializer.serialize(requiredFields));
        } catch (Exception e) {
            throw new RuntimeException("Cannot serialize requiredFields", e);
        }
    }
    return new RequiredFieldResponse(true);
}
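// The serialized boolean[] travels to the backend through UDFContext; the
// read side appears in a prepareToRead() implementation later in this section.
// A self-contained, hypothetical demonstration of the round trip (the key
// name and class are illustrative, not from the source):
import java.util.Properties;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.UDFContext;

public class RequiredFieldsRoundTrip {
    private static final String REQUIRED_FIELDS_SIGNATURE = "required.fields"; // assumed key

    public static void main(String[] args) throws Exception {
        Properties p = UDFContext.getUDFContext().getUDFProperties(RequiredFieldsRoundTrip.class);
        boolean[] required = {true, false, true}; // keep columns 0 and 2
        // What pushProjection() writes...
        p.setProperty(REQUIRED_FIELDS_SIGNATURE, ObjectSerializer.serialize(required));
        // ...is what prepareToRead() later deserializes.
        boolean[] restored =
                (boolean[]) ObjectSerializer.deserialize(p.getProperty(REQUIRED_FIELDS_SIGNATURE));
        System.out.println(restored.length); // prints 3
    }
}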
@SuppressWarnings("unchecked") @Override public void prepareToRead(RecordReader reader, PigSplit split) throws IOException { this.reader = reader; final String resourceSchemaAsStr = getValueFromUDFContext(this.contextSignature, RESOURCE_SCHEMA_SIGNATURE); if (resourceSchemaAsStr == null) { throw new IOException("Could not find schema in UDF context"); } schema = (ResourceSchema) ObjectSerializer.deserialize(resourceSchemaAsStr); }
@Override
public void prepareToRead(RecordReader recordReader, PigSplit pigSplit) throws IOException {
    reader = recordReader;
    if (!requiredFieldsInitialized) {
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p = udfc.getUDFProperties(this.getClass(), new String[] {udfContextSignature});
        requiredFields =
                (boolean[]) ObjectSerializer.deserialize(p.getProperty(REQUIRED_FIELDS_SIGNATURE));
        requiredFieldsInitialized = true;
    }
}
public void setConf(Configuration conf) {
    try {
        mAsc = (boolean[]) ObjectSerializer.deserialize(conf.get("pig.sortOrder"));
    } catch (IOException ioe) {
        mLog.error("Unable to deserialize pig.sortOrder " + ioe.getMessage());
        throw new RuntimeException(ioe);
    }
    if (mAsc == null) {
        mAsc = new boolean[1];
        mAsc[0] = true;
    }
}
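// The "pig.sortOrder" value consumed by setConf() above is produced by
// serializing a boolean[] of per-key ascending flags into the job
// configuration. A hypothetical sketch of that producer side:
import org.apache.hadoop.conf.Configuration;
import org.apache.pig.impl.util.ObjectSerializer;

public class SortOrderSetupSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Key 0 sorts ascending, key 1 descending; setConf() reads this back.
        conf.set("pig.sortOrder", ObjectSerializer.serialize(new boolean[] {true, false}));
    }
}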
/**
 * Stolen from JobControlCompiler. TODO: refactor so the two can share this code.
 *
 * @param physicalPlan
 * @param poLoad
 * @param jobConf
 * @return the configured JobConf
 * @throws java.io.IOException
 */
private static JobConf configureLoader(PhysicalPlan physicalPlan, POLoad poLoad, JobConf jobConf)
        throws IOException {
    // This part appears to be unused.
    Job job = new Job(jobConf);
    LoadFunc loadFunc = poLoad.getLoadFunc();
    loadFunc.setLocation(poLoad.getLFile().getFileName(), job);

    // Stolen from JobControlCompiler.
    ArrayList<FileSpec> pigInputs = new ArrayList<FileSpec>();
    // Store the input filespecs.
    pigInputs.add(poLoad.getLFile());

    ArrayList<List<OperatorKey>> inpTargets = Lists.newArrayList();
    ArrayList<String> inpSignatures = Lists.newArrayList();
    ArrayList<Long> inpLimits = Lists.newArrayList();

    // Store the target operators for tuples read from this input.
    List<PhysicalOperator> loadSuccessors = physicalPlan.getSuccessors(poLoad);
    List<OperatorKey> loadSuccessorsKeys = Lists.newArrayList();
    if (loadSuccessors != null) {
        for (PhysicalOperator loadSuccessor : loadSuccessors) {
            loadSuccessorsKeys.add(loadSuccessor.getOperatorKey());
        }
    }
    inpTargets.add(loadSuccessorsKeys);
    inpSignatures.add(poLoad.getSignature());
    inpLimits.add(poLoad.getLimit());

    jobConf.set("pig.inputs", ObjectSerializer.serialize(pigInputs));
    jobConf.set("pig.inpTargets", ObjectSerializer.serialize(inpTargets));
    jobConf.set("pig.inpSignatures", ObjectSerializer.serialize(inpSignatures));
    jobConf.set("pig.inpLimits", ObjectSerializer.serialize(inpLimits));
    return jobConf;
}
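// On the task side these keys are read back the same way they were written.
// A minimal, hypothetical sketch of the consumer, assuming the same job
// configuration is available (Pig does this in its map/reduce setup code):
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.util.ObjectSerializer;

public class PigInputsReader {
    @SuppressWarnings("unchecked")
    public static ArrayList<FileSpec> readPigInputs(Configuration conf) throws IOException {
        // Mirror of the serialize() call above; the key name must match exactly.
        return (ArrayList<FileSpec>) ObjectSerializer.deserialize(conf.get("pig.inputs"));
    }
}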
@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {
    if (schema != null) {
        return schema;
    }
    final Configuration configuration = job.getConfiguration();
    this.initializePhoenixPigConfiguration(location, configuration);
    this.schema = PhoenixPigSchemaUtil.getResourceSchema(this.config);
    if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("Resource Schema generated for location [%s] is [%s]",
                location, schema.toString()));
    }
    this.storeInUDFContext(this.contextSignature, RESOURCE_SCHEMA_SIGNATURE,
            ObjectSerializer.serialize(schema));
    return schema;
}
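// storeInUDFContext() above and getValueFromUDFContext() (used by the
// prepareToRead() snippet earlier) are not shown in this section. A plausible
// shape for them, labeled as an assumption since the real helpers live in the
// Phoenix loader class:
import java.util.Properties;
import org.apache.pig.impl.util.UDFContext;

public abstract class UDFContextHelpers {
    // Store a value under this loader's signature-scoped UDF properties.
    protected void storeInUDFContext(String signature, String key, String value) {
        Properties props = UDFContext.getUDFContext()
                .getUDFProperties(this.getClass(), new String[] {signature});
        props.setProperty(key, value);
    }

    // Retrieve a value previously stored under the same signature.
    protected String getValueFromUDFContext(String signature, String key) {
        Properties props = UDFContext.getUDFContext()
                .getUDFProperties(this.getClass(), new String[] {signature});
        return props.getProperty(key);
    }
}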
public void setConf(Configuration conf) {
    if (!(conf instanceof JobConf)) {
        mLog.warn("Expected jobconf in setConf, got " + conf.getClass().getName());
        return;
    }
    JobConf jconf = (JobConf) conf;
    try {
        mAsc = (boolean[]) ObjectSerializer.deserialize(jconf.get("pig.sortOrder"));
    } catch (IOException ioe) {
        mLog.error("Unable to deserialize pig.sortOrder " + ioe.getMessage());
        throw new RuntimeException(ioe);
    }
    if (mAsc == null) {
        mAsc = new boolean[1];
        mAsc[0] = true;
    }
    // If there's only one entry in mAsc, it applies to the whole tuple,
    // so we don't inspect the sort order column by column.
    mWholeTuple = (mAsc.length == 1);
}
public static Properties getJobConf(Configuration conf) {
    if (conf == null) {
        return null;
    }
    Properties jobConfProperties = null;
    try {
        jobConfProperties = new Properties();
        for (Map.Entry<String, String> entry : conf) {
            if (entry.getKey().equals("pig.mapPlan") || entry.getKey().equals("pig.reducePlan")) {
                jobConfProperties.setProperty(entry.getKey(),
                        ObjectSerializer.deserialize(entry.getValue()).toString());
            } else if (JOB_CONF_KEYS.contains(entry.getKey())) {
                jobConfProperties.setProperty(entry.getKey(), entry.getValue());
            }
        }
    } catch (IOException e) {
        logger.warn("Error while reading job conf: " + e.getMessage());
    }
    return jobConfProperties;
}
/* (non-Javadoc)
 * @see org.apache.pig.builtin.PigStorage#getNext()
 */
@Override
public Tuple getNext() throws IOException {
    // If SKIP_INPUT_HEADER is set and this is the first input split, skip the
    // header record. We store its value as a string, though, so we can compare
    // further records to it. If they are the same (this would happen if multiple
    // small files, each with a header, were combined into one split), we know to
    // skip the duplicate header record as well.
    if (loadingFirstRecord
            && headerTreatment == Headers.SKIP_INPUT_HEADER
            && (splitIndex == 0 || splitIndex == -1)) {
        try {
            if (!in.nextKeyValue()) {
                return null;
            }
            header = ((Text) in.getCurrentValue()).toString();
        } catch (InterruptedException e) {
            int errCode = 6018;
            String errMsg = "Error while reading input";
            throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
        }
    }
    loadingFirstRecord = false;

    mProtoTuple = new ArrayList<Object>();
    getNextInQuotedField = false;
    boolean evenQuotesSeen = true;
    boolean sawEmbeddedRecordDelimiter = false;
    byte[] buf = null;

    if (!mRequiredColumnsInitialized) {
        if (udfContextSignature != null) {
            Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
            mRequiredColumns =
                    (boolean[]) ObjectSerializer.deserialize(p.getProperty(udfContextSignature));
        }
        mRequiredColumnsInitialized = true;
    }

    // Note: we cannot factor out the check for nextKeyValue() being null,
    // because that call overwrites buf with the new line, which is bad if
    // we have a field with a newline in it.
    try {
        int recordLen = 0;
        getNextFieldID = 0;

        while (sawEmbeddedRecordDelimiter || getNextFieldID == 0) {
            Text value = null;
            if (sawEmbeddedRecordDelimiter) {
                // Deal with pulling more records from the input, because a
                // double-quoted embedded newline was encountered in a field.
                // Save the length of the record so far, plus one byte for the
                // record delimiter (usually newline) that's embedded in the
                // field we were working on before falling into this branch:
                int prevLineLen = recordLen + 1;

                // Save the previous line (the one with the field that has the
                // newline) in a new array. The last byte will be random; we'll
                // fill in the embedded record delimiter (usually newline) below:
                byte[] prevLineSaved = Arrays.copyOf(buf, prevLineLen);
                prevLineSaved[prevLineLen - 1] = RECORD_DEL;

                // Read the continuation of the record, unless EOF:
                if (!in.nextKeyValue()) {
                    return null;
                }
                value = (Text) in.getCurrentValue();
                recordLen = value.getLength();
                // Grab the continuation's bytes:
                buf = value.getBytes();

                // Combine the previous line and the continuation into a new array.
                // The following copyOf() does half the job: it allocates all the
                // space, and also copies the previous line into that space:
                byte[] prevLineAndContinuation = Arrays.copyOf(prevLineSaved, prevLineLen + recordLen);

                // Now append the continuation.
                // Parms: fromBuf, fromStartPos, toBuf, toStartPos, lengthToCopy:
                System.arraycopy(buf, 0, prevLineAndContinuation, prevLineLen, recordLen);

                // We'll work with the combination now:
                buf = prevLineAndContinuation;

                // Do the whole record over from the start:
                mProtoTuple.clear();
                getNextInQuotedField = false;
                evenQuotesSeen = true;
                getNextFieldID = 0;
                recordLen = prevLineAndContinuation.length;
            } else {
                // Previous record finished cleanly: start with the next record,
                // unless EOF:
                if (!in.nextKeyValue()) {
                    return null;
                }
                value = (Text) in.getCurrentValue();

                // If the line is a duplicate header and SKIP_INPUT_HEADER is set,
                // ignore it (this might happen if multiple files, each with a
                // header, are combined into a single split):
                if (headerTreatment == Headers.SKIP_INPUT_HEADER && value.toString().equals(header)) {
                    if (!in.nextKeyValue()) {
                        return null;
                    }
                    value = (Text) in.getCurrentValue();
                }

                buf = value.getBytes();
                getNextFieldID = 0;
                recordLen = value.getLength();
            }

            nextTupleSkipChar = false;
            ByteBuffer fieldBuffer = ByteBuffer.allocate(recordLen);

            sawEmbeddedRecordDelimiter =
                    processOneInRecord(evenQuotesSeen, buf, recordLen, fieldBuffer);

            // The last field is never delimited by a FIELD_DEL, but by the end
            // of the record. So we need to add that last field. The
            // '!sawEmbeddedRecordDelimiter' handles the case of embedded
            // newlines; there we are amidst a field, not at the final record:
            if (!sawEmbeddedRecordDelimiter) {
                readField(fieldBuffer, getNextFieldID++);
            }
        } // end while
    } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }

    Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
    return t;
}
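// The core of the continuation handling above is a two-step byte-array
// concatenation: copyOf() allocates the combined buffer and copies the first
// half, then arraycopy() appends the second half. A standalone illustration
// of that pattern (names and data are illustrative, not from the source):
import java.util.Arrays;

public class BufferJoinDemo {
    public static void main(String[] args) {
        byte[] prev = {'a', 'b', '\n'}; // saved line, delimiter already patched in
        byte[] cont = {'c', 'd'};       // continuation read from the next record

        // Allocate prev.length + cont.length bytes and copy prev into the front...
        byte[] joined = Arrays.copyOf(prev, prev.length + cont.length);
        // ...then append cont starting right after prev.
        System.arraycopy(cont, 0, joined, prev.length, cont.length);

        System.out.println(new String(joined)); // "ab\ncd"
    }
}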
/**
 * The Main-Class for the Pig jar. Provides a shell and sets up a classpath
 * appropriate for executing jar files.
 *
 * @param args -jar can be used to add additional jar files (colon separated);
 *     "-" will start a shell; -e will execute the rest of the command line as
 *     if it were input to the shell.
 * @throws IOException
 */
public static void main(String args[]) {
    int rc = 1;
    Properties properties = new Properties();
    PropertiesUtil.loadPropertiesFromFile(properties);

    boolean verbose = false;
    boolean gruntCalled = false;
    String logFileName = null;

    try {
        BufferedReader pin = null;
        boolean debug = false;
        boolean dryrun = false;
        ArrayList<String> params = new ArrayList<String>();
        ArrayList<String> paramFiles = new ArrayList<String>();
        HashSet<String> optimizerRules = new HashSet<String>();

        CmdLineParser opts = new CmdLineParser(args);
        opts.registerOpt('4', "log4jconf", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('b', "brief", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('c', "cluster", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('d', "debug", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('e', "execute", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('f', "file", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('h', "help", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('i', "version", CmdLineParser.ValueExpected.OPTIONAL);
        opts.registerOpt('j', "jar", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('l', "logfile", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('m', "param_file", CmdLineParser.ValueExpected.OPTIONAL);
        opts.registerOpt('o', "hod", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('p', "param", CmdLineParser.ValueExpected.OPTIONAL);
        opts.registerOpt('r', "dryrun", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('t', "optimizer_off", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('v', "verbose", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('w', "warning", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('x', "exectype", CmdLineParser.ValueExpected.REQUIRED);
        opts.registerOpt('F', "stop_on_failure", CmdLineParser.ValueExpected.NOT_ACCEPTED);
        opts.registerOpt('M', "no_multiquery", CmdLineParser.ValueExpected.NOT_ACCEPTED);

        ExecMode mode = ExecMode.UNKNOWN;
        String file = null;
        ExecType execType = ExecType.MAPREDUCE;
        String execTypeString = properties.getProperty("exectype");
        if (execTypeString != null && execTypeString.length() > 0) {
            execType = PigServer.parseExecType(execTypeString);
        }
        String cluster = "local";
        String clusterConfigured = properties.getProperty("cluster");
        if (clusterConfigured != null && clusterConfigured.length() > 0) {
            cluster = clusterConfigured;
        }

        // By default, warning aggregation is on.
        properties.setProperty("aggregate.warning", "" + true);
        // By default, multiquery optimization is on.
        properties.setProperty("opt.multiquery", "" + true);
        // By default, we keep going on error on the backend.
        properties.setProperty("stop.on.failure", "" + false);

        char opt;
        while ((opt = opts.getNextOpt()) != CmdLineParser.EndOfOpts) {
            switch (opt) {
            case '4':
                String log4jconf = opts.getValStr();
                if (log4jconf != null) {
                    properties.setProperty(LOG4J_CONF, log4jconf);
                }
                break;

            case 'b':
                properties.setProperty(BRIEF, "true");
                break;

            case 'c':
                // Needed a way to specify the cluster to run the MR job on.
                // Bug 831708 - fixed.
                String clusterParameter = opts.getValStr();
                if (clusterParameter != null && clusterParameter.length() > 0) {
                    cluster = clusterParameter;
                }
                break;

            case 'd':
                String logLevel = opts.getValStr();
                if (logLevel != null) {
                    properties.setProperty(DEBUG, logLevel);
                }
                debug = true;
                break;

            case 'e':
                mode = ExecMode.STRING;
                break;

            case 'f':
                mode = ExecMode.FILE;
                file = opts.getValStr();
                break;

            case 'F':
                properties.setProperty("stop.on.failure", "" + true);
                break;

            case 'h':
                usage();
                return;

            case 'i':
                System.out.println(getVersionString());
                return;

            case 'j':
                String jarsString = opts.getValStr();
                if (jarsString != null) {
                    properties.setProperty(JAR, jarsString);
                }
                break;

            case 'l':
                // Validate the path to the log file and set up the file that
                // stores the client-side log.
                String logFileParameter = opts.getValStr();
                if (logFileParameter != null && logFileParameter.length() > 0) {
                    logFileName = validateLogFile(logFileParameter, null);
                } else {
                    logFileName = validateLogFile(logFileName, null);
                }
                properties.setProperty("pig.logfile", logFileName);
                break;

            case 'm':
                paramFiles.add(opts.getValStr());
                break;

            case 'M':
                // Turns off multiquery optimization.
                properties.setProperty("opt.multiquery", "" + false);
                break;

            case 'o':
                // TODO sgroschupf: using system properties is always a very bad idea
                String gateway = System.getProperty("ssh.gateway");
                if (gateway == null || gateway.length() == 0) {
                    properties.setProperty("hod.server", "local");
                } else {
                    properties.setProperty("hod.server", System.getProperty("ssh.gateway"));
                }
                break;

            case 'p':
                params.add(opts.getValStr());
                break;

            case 'r':
                // Currently only used for parameter substitution;
                // will be extended in the future.
                dryrun = true;
                break;

            case 't':
                optimizerRules.add(opts.getValStr());
                break;

            case 'v':
                properties.setProperty(VERBOSE, "" + true);
                verbose = true;
                break;

            case 'w':
                properties.setProperty("aggregate.warning", "" + false);
                break;

            case 'x':
                try {
                    execType = PigServer.parseExecType(opts.getValStr());
                } catch (IOException e) {
                    throw new RuntimeException("ERROR: Unrecognized exectype.", e);
                }
                break;

            default:
                throw new AssertionError("Unhandled option " + opt);
            }
        }

        // Configure logging.
        configureLog4J(properties);
        // Create the context with the parameters.
        PigContext pigContext = new PigContext(execType, properties);

        if (logFileName == null) {
            logFileName = validateLogFile(null, null);
        }
        pigContext.getProperties().setProperty("pig.logfile", logFileName);

        if (optimizerRules.size() > 0) {
            pigContext.getProperties()
                    .setProperty("pig.optimizer.rules", ObjectSerializer.serialize(optimizerRules));
        }

        LogicalPlanBuilder.classloader = pigContext.createCl(null);

        // Construct the parameter substitution preprocessor.
        Grunt grunt = null;
        BufferedReader in;
        String substFile = null;
        switch (mode) {
        case FILE: {
            // Run, using the provided file as a pig file.
            in = new BufferedReader(new FileReader(file));

            // Run the parameter substitution preprocessor first.
            substFile = file + ".substituted";
            pin = runParamPreprocessor(in, params, paramFiles, substFile, debug || dryrun);
            if (dryrun) {
                log.info("Dry run completed. Substituted pig script is at " + substFile);
                return;
            }

            logFileName = validateLogFile(logFileName, file);
            pigContext.getProperties().setProperty("pig.logfile", logFileName);

            // Set the job name based on the name of the script.
            pigContext.getProperties()
                    .setProperty(PigContext.JOB_NAME, "PigLatin:" + new File(file).getName());

            if (!debug) {
                new File(substFile).deleteOnExit();
            }

            grunt = new Grunt(pin, pigContext);
            gruntCalled = true;
            int results[] = grunt.exec();
            rc = getReturnCodeForStats(results);
            return;
        }

        case STRING: {
            // Gather up all the remaining arguments into a string and pass
            // them into grunt.
            StringBuffer sb = new StringBuffer();
            String remainders[] = opts.getRemainingArgs();
            for (int i = 0; i < remainders.length; i++) {
                if (i != 0) {
                    sb.append(' ');
                }
                sb.append(remainders[i]);
            }
            in = new BufferedReader(new StringReader(sb.toString()));
            grunt = new Grunt(in, pigContext);
            gruntCalled = true;
            int results[] = grunt.exec();
            rc = getReturnCodeForStats(results);
            return;
        }

        default:
            break;
        }

        // If we're here, we don't know yet what they want. They may have just
        // given us a jar to execute, they might have given us a pig script to
        // execute, or they might have given us a dash (or nothing), which means
        // to run grunt interactively.
        String remainders[] = opts.getRemainingArgs();
        if (remainders == null) {
            // Interactive.
            mode = ExecMode.SHELL;
            ConsoleReader reader = new ConsoleReader(System.in, new OutputStreamWriter(System.out));
            reader.setDefaultPrompt("grunt> ");
            final String HISTORYFILE = ".pig_history";
            String historyFile = System.getProperty("user.home") + File.separator + HISTORYFILE;
            reader.setHistory(new History(new File(historyFile)));
            ConsoleReaderInputStream inputStream = new ConsoleReaderInputStream(reader);
            grunt = new Grunt(new BufferedReader(new InputStreamReader(inputStream)), pigContext);
            grunt.setConsoleReader(reader);
            gruntCalled = true;
            grunt.run();
            rc = 0;
            return;
        } else {
            // They have a pig script they want us to run.
            if (remainders.length > 1) {
                throw new RuntimeException(
                        "You can only run one pig script at a time from the command line.");
            }
            mode = ExecMode.FILE;
            in = new BufferedReader(new FileReader(remainders[0]));

            // Run the parameter substitution preprocessor first.
            substFile = remainders[0] + ".substituted";
            pin = runParamPreprocessor(in, params, paramFiles, substFile, debug || dryrun);
            if (dryrun) {
                log.info("Dry run completed. Substituted pig script is at " + substFile);
                return;
            }

            logFileName = validateLogFile(logFileName, remainders[0]);
            pigContext.getProperties().setProperty("pig.logfile", logFileName);

            if (!debug) {
                new File(substFile).deleteOnExit();
            }

            // Set the job name based on the name of the script.
            pigContext.getProperties()
                    .setProperty(PigContext.JOB_NAME, "PigLatin:" + new File(remainders[0]).getName());

            grunt = new Grunt(pin, pigContext);
            gruntCalled = true;
            int[] results = grunt.exec();
            rc = getReturnCodeForStats(results);
            return;
        }
        // Per Utkarsh and Chris, invocation of a jar file via pig is deprecated.
    } catch (ParseException e) {
        usage();
        rc = 2;
    } catch (NumberFormatException e) {
        usage();
        rc = 2;
    } catch (PigException pe) {
        if (pe.retriable()) {
            rc = 1;
        } else {
            rc = 2;
        }
        if (!gruntCalled) {
            LogUtils.writeLog(pe, logFileName, log, verbose);
        }
    } catch (Throwable e) {
        rc = 2;
        if (!gruntCalled) {
            LogUtils.writeLog(e, logFileName, log, verbose);
        }
    } finally {
        // Clear temp files.
        FileLocalizer.deleteTempFiles();
        PerformanceTimerFactory.getPerfTimerFactory().dumpTimers();
        System.exit(rc);
    }
}