/**
     * Configures the Reduce plan, the POPackage operator and the reporter thread.
     *
     * <p>The method publishes the task configuration through the static holders ({@code
     * sJobContext}, {@code sJobConfInternal}, {@code sJobConf}), records the task index in the
     * configuration, and then deserializes the UDF import list, the {@code PigContext}, the reduce
     * plan (only if {@code rp} is not already set) and, outside the illustrator, the package
     * operator from the job configuration. For a non-empty plan the roots and the first leaf are
     * cached.
     *
     * @param context the reducer context that supplies the job configuration and task attempt id
     * @throws IOException if raised by the superclass setup or by the calls made outside the guarded
     *     deserialization section
     * @throws InterruptedException if raised by the superclass setup
     * @throws RuntimeException wrapping any {@link IOException} raised while the plan pieces are
     *     deserialized or the UDF context is set up
     */
    @SuppressWarnings("unchecked")
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      super.setup(context);
      inIllustrator = inIllustrator(context);
      if (inIllustrator) {
        pack = getPack(context);
      }
      Configuration jConf = context.getConfiguration();
      SpillableMemoryManager.configure(ConfigurationUtil.toProperties(jConf));
      context
          .getConfiguration()
          .set(
              PigConstants.TASK_INDEX,
              Integer.toString(context.getTaskAttemptID().getTaskID().getId()));
      sJobContext = context;
      sJobConfInternal.set(context.getConfiguration());
      sJobConf = context.getConfiguration();
      try {
        PigContext.setPackageImportList(
            (ArrayList<String>) ObjectSerializer.deserialize(jConf.get("udf.import.list")));
        pigContext = (PigContext) ObjectSerializer.deserialize(jConf.get("pig.pigContext"));

        // This attempts to fetch all of the generated code from the distributed cache, and resolve
        // it
        SchemaTupleBackend.initialize(jConf, pigContext);

        // A plan that was already injected (rp != null) is kept as is.
        if (rp == null) {
          rp = (PhysicalPlan) ObjectSerializer.deserialize(jConf.get("pig.reducePlan"));
        }
        stores = PlanHelper.getPhysicalOperators(rp, POStore.class);

        // In the illustrator the package operator was already obtained from getPack(context) above.
        if (!inIllustrator) {
          pack = (POPackage) ObjectSerializer.deserialize(jConf.get("pig.reduce.package"));
        }

        // To be removed
        // Rendering the plan is only useful for the debug log, so skip the work entirely when
        // debug logging is disabled.
        if (log.isDebugEnabled()) {
          if (rp.isEmpty()) {
            log.debug("Reduce Plan empty!");
          } else {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            rp.explain(baos);
            log.debug(baos.toString());
          }
        }

        pigReporter = new ProgressableReporter();
        if (!rp.isEmpty()) {
          roots = rp.getRoots().toArray(new PhysicalOperator[1]);
          leaf = rp.getLeaves().get(0);
        }

        // Get the UDF specific context
        MapRedUtil.setupUDFContext(jConf);

      } catch (IOException ioe) {
        String msg = "Problem while configuring reduce plan.";
        throw new RuntimeException(msg, ioe);
      }

      log.info(
          "Aliases being processed per job phase (AliasName[line,offset]): "
              + jConf.get("pig.alias.location"));

      Utils.setDefaultTimeZone(PigMapReduce.sJobConfInternal.get());
    }
Esempio n. 2
0
 /**
  * Remembers the record reader and restores the resource schema that was stored in the UDF
  * context under {@code RESOURCE_SCHEMA_SIGNATURE} for this loader's context signature.
  *
  * @param reader the record reader to read from
  * @param split the split being read (not used here)
  * @throws IOException if no serialized schema is found in the UDF context, or if deserializing
  *     it fails
  */
 @SuppressWarnings("unchecked")
 @Override
 public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
   this.reader = reader;

   final String serializedSchema =
       getValueFromUDFContext(this.contextSignature, RESOURCE_SCHEMA_SIGNATURE);
   if (serializedSchema != null) {
     schema = (ResourceSchema) ObjectSerializer.deserialize(serializedSchema);
     return;
   }

   // Nothing was stored for this signature: the loader cannot proceed without a schema.
   throw new IOException("Could not find schema in UDF context");
 }
 /**
  * Remembers the record reader and, on the first call only, loads the required-fields mask from
  * the UDF properties registered for this class and {@code udfContextSignature}.
  *
  * @param recordReader the record reader to read from
  * @param pigSplit the split being read (not used here)
  * @throws IOException if deserializing the required-fields mask fails
  */
 @Override
 public void prepareToRead(RecordReader recordReader, PigSplit pigSplit) throws IOException {
   reader = recordReader;

   // The mask only needs to be restored once per instance.
   if (requiredFieldsInitialized) {
     return;
   }

   Properties udfProps =
       UDFContext.getUDFContext()
           .getUDFProperties(this.getClass(), new String[] {udfContextSignature});
   String serializedFields = udfProps.getProperty(REQUIRED_FIELDS_SIGNATURE);
   requiredFields = (boolean[]) ObjectSerializer.deserialize(serializedFields);
   requiredFieldsInitialized = true;
 }
 /**
  * Reads the serialized sort order from the {@code pig.sortOrder} configuration entry into
  * {@code mAsc}. When nothing is deserialized, a single ascending entry is used.
  *
  * @param conf the configuration to read the sort order from
  * @throws RuntimeException wrapping the {@link IOException} if deserialization fails
  */
 public void setConf(Configuration conf) {
   try {
     final String serializedOrder = conf.get("pig.sortOrder");
     mAsc = (boolean[]) ObjectSerializer.deserialize(serializedOrder);
   } catch (IOException ioe) {
     mLog.error("Unable to deserialize pig.sortOrder " + ioe.getMessage());
     throw new RuntimeException(ioe);
   }

   // No sort order available: default to one ascending entry.
   if (mAsc == null) {
     mAsc = new boolean[] {true};
   }
 }
 /**
  * Reads the serialized sort order from the {@code pig.sortOrder} entry of a {@code JobConf} into
  * {@code mAsc} and derives {@code mWholeTuple} from it. A configuration that is not a
  * {@code JobConf} is ignored with a warning.
  *
  * @param conf the configuration to read the sort order from; expected to be a {@code JobConf}
  * @throws RuntimeException wrapping the {@link IOException} if deserialization fails
  */
 public void setConf(Configuration conf) {
   final boolean isJobConf = conf instanceof JobConf;
   if (!isJobConf) {
     mLog.warn("Expected jobconf in setConf, got " + conf.getClass().getName());
     return;
   }

   try {
     final String serializedOrder = ((JobConf) conf).get("pig.sortOrder");
     mAsc = (boolean[]) ObjectSerializer.deserialize(serializedOrder);
   } catch (IOException ioe) {
     mLog.error("Unable to deserialize pig.sortOrder " + ioe.getMessage());
     throw new RuntimeException(ioe);
   }

   // No sort order available: default to one ascending entry.
   if (mAsc == null) {
     mAsc = new boolean[] {true};
   }

   // If there's only one entry in mAsc, it means it's for the whole
   // tuple. So we can't be looking for each column.
   mWholeTuple = mAsc.length == 1;
 }
  /**
   * Copies the job configuration entries of interest into a {@link Properties} object.
   *
   * <p>The {@code pig.mapPlan} and {@code pig.reducePlan} entries are deserialized and stored as
   * the {@code toString()} of the resulting object; a plan entry that deserializes to {@code null}
   * is skipped. Any other entry is copied verbatim when its key is listed in {@code
   * JOB_CONF_KEYS}.
   *
   * <p>If deserialization fails with an {@link IOException}, the failure is logged and the
   * properties collected up to that point are returned.
   *
   * @param conf the job configuration to read, may be {@code null}
   * @return the extracted properties, or {@code null} when {@code conf} is {@code null}
   */
  public static Properties getJobConf(Configuration conf) {
    if (conf == null) {
      return null;
    }

    Properties jobConfProperties = new Properties();
    try {
      for (Map.Entry<String, String> entry : conf) {
        String key = entry.getKey();
        if (key.equals("pig.mapPlan") || key.equals("pig.reducePlan")) {
          // The deserializer may yield null (e.g. for an empty value); Properties rejects null
          // values and calling toString() on null would throw, so skip such entries.
          Object plan = ObjectSerializer.deserialize(entry.getValue());
          if (plan != null) {
            jobConfProperties.setProperty(key, plan.toString());
          }
        } else if (JOB_CONF_KEYS.contains(key)) {
          jobConfProperties.setProperty(key, entry.getValue());
        }
      }
    } catch (IOException e) {
      // Pass the exception itself so the stack trace is not lost.
      logger.warn("Error while reading job conf: " + e.getMessage(), e);
    }
    return jobConfProperties;
  }
Esempio n. 7
0
  /**
   * Reads the next record from the underlying reader and returns it as a tuple.
   *
   * <p>On the first call, when header treatment is {@code SKIP_INPUT_HEADER} and the split index
   * is 0 or -1, one record is consumed and remembered as the header. A record is then read and
   * handed to {@code processOneInRecord}; as long as that call reports an embedded record
   * delimiter, the next input line is appended to the bytes read so far and the combined record is
   * processed again from the start.
   *
   * @return the tuple built from the fields collected in {@code mProtoTuple}, or {@code null} when
   *     the reader has no more input
   * @throws IOException if reading or deserializing fails; an {@link InterruptedException} from
   *     the reader is rethrown as an {@code ExecException} with error code 6018
   * @see org.apache.pig.builtin.PigStorage#getNext()
   */
  @Override
  public Tuple getNext() throws IOException {
    // If SKIP_INPUT_HEADER and this is the first input split, skip header record
    // We store its value as a string though, so we can compare
    // further records to it. If they are the same (this would
    // happen if multiple small files each with a header were combined
    // into one split), we know to skip the duplicate header record as well.
    if (loadingFirstRecord
        && headerTreatment == Headers.SKIP_INPUT_HEADER
        && (splitIndex == 0 || splitIndex == -1)) {
      try {
        if (!in.nextKeyValue()) return null;
        header = ((Text) in.getCurrentValue()).toString();
      } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
      }
    }
    loadingFirstRecord = false;

    // Fresh field list for the tuple produced by this call.
    mProtoTuple = new ArrayList<Object>();

    // Reset the per-record parsing state.
    getNextInQuotedField = false;
    boolean evenQuotesSeen = true;
    boolean sawEmbeddedRecordDelimiter = false;
    byte[] buf = null;

    // Lazily restore the required-columns mask from the UDF properties, once per instance.
    if (!mRequiredColumnsInitialized) {
      if (udfContextSignature != null) {
        Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
        mRequiredColumns =
            (boolean[]) ObjectSerializer.deserialize(p.getProperty(udfContextSignature));
      }
      mRequiredColumnsInitialized = true;
    }
    // Note: we cannot factor out the check for nextKeyValue() being null,
    // because that call overwrites buf with the new line, which is
    // bad if we have a field with a newline.

    try {
      int recordLen = 0;
      getNextFieldID = 0;

      // Runs at least once (getNextFieldID == 0 on entry) and repeats while the record processed
      // so far ended inside a field containing an embedded record delimiter.
      while (sawEmbeddedRecordDelimiter || getNextFieldID == 0) {
        Text value = null;
        if (sawEmbeddedRecordDelimiter) {

          // Deal with pulling more records from the input, because
          // a double quoted embedded newline was encountered in a field.
          // Save the length of the record so far, plus one byte for the
          // record delimiter (usually newline) that's embedded in the field
          // we were working on before falling into this branch:
          int prevLineLen = recordLen + 1;

          // Save previous line (the one with the field that has the newline) in a new array.
          // The last byte will be random; we'll fill in the embedded
          // record delimiter (usually newline) below:
          byte[] prevLineSaved = Arrays.copyOf(buf, prevLineLen);
          prevLineSaved[prevLineLen - 1] = RECORD_DEL;

          // Read the continuation of the record, unless EOF:
          // (at EOF the partial record gathered so far is dropped and null is returned)
          if (!in.nextKeyValue()) {
            return null;
          }
          value = (Text) in.getCurrentValue();
          recordLen = value.getLength();
          // Grab the continuation's bytes:
          buf = value.getBytes();

          // Combine the previous line and the continuation into a new array.
          // The following copyOf() does half the job: it allocates all the
          // space, and also copies the previous line into that space:
          byte[] prevLineAndContinuation = Arrays.copyOf(prevLineSaved, prevLineLen + recordLen);

          // Now append the continuation. Parms: fromBuf, fromStartPos, toBuf, toStartPos,
          // lengthToCopy:
          System.arraycopy(buf, 0, prevLineAndContinuation, prevLineLen, recordLen);

          // We'll work with the combination now:
          buf = prevLineAndContinuation;

          // Do the whole record over from the start:
          mProtoTuple.clear();
          getNextInQuotedField = false;
          evenQuotesSeen = true;
          getNextFieldID = 0;
          recordLen = prevLineAndContinuation.length;

        } else {
          // Previous record finished cleanly: start with the next record,
          // unless EOF:
          if (!in.nextKeyValue()) {
            return null;
          }
          value = (Text) in.getCurrentValue();

          // if the line is a duplicate header and 'SKIP_INPUT_HEADER' is set, ignore it
          // (this might happen if multiple files each with a header are combined into a single
          // split)
          // NOTE(review): this is an `if`, not a loop, so only one duplicate header line is
          // skipped per call; the line read after it is not compared against the header.
          if (headerTreatment == Headers.SKIP_INPUT_HEADER && value.toString().equals(header)) {
            if (!in.nextKeyValue()) return null;
            value = (Text) in.getCurrentValue();
          }

          buf = value.getBytes();
          getNextFieldID = 0;
          recordLen = value.getLength();
        }

        nextTupleSkipChar = false;

        // Scratch buffer sized to the whole record, passed to the field parser below.
        ByteBuffer fieldBuffer = ByteBuffer.allocate(recordLen);

        sawEmbeddedRecordDelimiter =
            processOneInRecord(evenQuotesSeen, buf, recordLen, fieldBuffer);

        // The last field is never delimited by a FIELD_DEL, but by
        // the end of the record. So we need to add that last field.
        // The '!sawEmbeddedRecordDelimiter' handles the case of
        // embedded newlines; we are amidst a field, not at
        // the final record:
        if (!sawEmbeddedRecordDelimiter) readField(fieldBuffer, getNextFieldID++);
      } // end while

    } catch (InterruptedException e) {
      int errCode = 6018;
      String errMsg = "Error while reading input";
      throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }

    // Wrap the collected field list in a tuple without copying it.
    Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
    return t;
  }