Exemplo n.º 1
0
    @Override
    public void nextBatch() throws IOException {
      // Drop any tuples left over from the previous batch.
      for (DataBag bag : bags) {
        bag.clear();
      }

      // Publish the key of the group currently being buffered.
      key = currKey;

      // Pull at most batchSize tuples from the backing iterator and route
      // each one into the bag that matches its input index.
      int pulled = 0;
      while (pulled < batchSize && iter.hasNext()) {
        NullableTuple ntup = iter.next();
        int index = ntup.getIndex();
        Tuple copy = getValueTuple(ntup, index);
        if (numInputs == 1) {
          // this is for multi-query merge where
          // the numInputs is always 1, but the index
          // (the position of the inner plan in the
          // enclosed operator) may not be 1.
          bags[0].add(copy);
        } else {
          bags[index].add(copy);
        }
        pulled++;
      }
    }
Exemplo n.º 2
0
  /**
   * Calls getNext to get next ForEach result. The input for POJoinPackage is a (key, NullableTuple)
   * pair. We will materialize n-1 inputs into bags, feed input#n one tuple a time to the delegated
   * ForEach operator, the input for ForEach is
   *
   * <p>(input#1, input#2, input#3....input#n[i]), i=(1..k), suppose input#n consists
   *
   * <p>of k tuples. For every ForEach input, pull all the results from ForEach. getNext will be
   * called multiple times for a particular input, it returns one output tuple from ForEach every
   * time we call getNext, so we need to maintain internal status to keep tracking of where we are.
   */
  @Override
  public Result getNext(Tuple t) throws ExecException {

    // One-time setup: read the configured bag implementation from the job conf.
    // NOTE(review): useDefaultBag is only ever set to true here; it is presumably
    // initialized to false elsewhere in this class — confirm.
    if (firstTime) {
      firstTime = false;
      if (PigMapReduce.sJobConf != null) {
        String bagType = PigMapReduce.sJobConf.get("pig.cachedbag.type");
        if (bagType != null && bagType.equalsIgnoreCase("default")) {
          useDefaultBag = true;
        }
      }
    }
    // if a previous call to foreach.getNext()
    // has still not returned all output, process it
    if (forEach.processingPlan) {
      forEachResult = forEach.getNext(t1);
      switch (forEachResult.returnStatus) {
        case POStatus.STATUS_OK:
        case POStatus.STATUS_NULL:
        case POStatus.STATUS_ERR:
          return forEachResult;
        case POStatus.STATUS_EOP:
          // ForEach has drained its current input; fall through to feed it more.
          break;
      }
    }

    NullableTuple it = null;

    // If we see a new NullableTupleIterator, materialize n-1 inputs, construct ForEach input
    // tuple res = (key, input#1, input#2....input#n), the only missing value is input#n,
    // we will get input#n one tuple a time, fill in res, feed to ForEach.
    // After this block, we have the first tuple of input#n in hand (kept in variable it)
    if (newKey) {
      lastInputTuple = false;
      // Put n-1 inputs into bags
      dbs = new DataBag[numInputs];
      for (int i = 0; i < numInputs - 1; i++) {
        dbs[i] =
            useDefaultBag
                ? BagFactory.getInstance().newDefaultBag()
                // In a very rare case if there is a POStream after this
                // POJoinPackage in the pipeline and is also blocking the pipeline;
                // constructor argument should be 2 * numInputs. But for one obscure
                // case we don't want to pay the penalty all the time.
                : new InternalCachedBag(numInputs - 1);
      }
      // For last bag, we always use NonSpillableBag.
      // NOTE(review): chunkSize bounds how many tuples of input#n are held in
      // memory at once — see the chunking loops below.
      dbs[lastBagIndex] = new NonSpillableDataBag((int) chunkSize);

      // For each Nullable tuple in the input, put it
      // into the corresponding bag based on the index,
      // except for the last input, which we will stream
      // The tuples will arrive in the order of the index,
      // starting from index 0 and such that all tuples for
      // a given index arrive before a tuple for the next
      // index does.
      while (tupIter.hasNext()) {
        it = tupIter.next();
        int itIndex = it.getIndex();
        if (itIndex != numInputs - 1) {
          dbs[itIndex].add(getValueTuple(it, itIndex));
        } else {
          // First tuple of the streamed input reached; stop materializing.
          lastInputTuple = true;
          break;
        }
        if (reporter != null) reporter.progress();
      }
      // If we don't have any tuple for input#n
      // we do not need any further process, return EOP
      if (!lastInputTuple) {
        // we will return at this point because we ought
        // to be having a flatten on this last input
        // and we have an empty bag which should result
        // in this key being taken out of the output
        newKey = true;
        return eopResult;
      }

      // Output layout: slot 0 is the key, slots 1..numInputs are the bags.
      res = mTupleFactory.newTuple(numInputs + 1);
      for (int i = 0; i < dbs.length; i++) res.set(i + 1, dbs[i]);

      res.set(0, key);
      // if we have an inner anywhere and the corresponding
      // bag is empty, we can just return
      for (int i = 0; i < dbs.length - 1; i++) {
        if (inner[i] && dbs[i].size() == 0) {
          detachInput();
          return eopResult;
        }
      }
      newKey = false;

      // set up the bag with last input to contain
      // a chunk of CHUNKSIZE values OR the entire bag if
      // it has less than CHUNKSIZE values - the idea is in most
      // cases the values are > CHUNKSIZE in number and in
      // those cases we will be sending the last bag
      // as a set of smaller chunked bags thus holding lesser
      // in memory

      // the first tuple can be directly retrieved from "it"
      dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
      for (int i = 0; i < chunkSize - 1 && tupIter.hasNext(); i++) {
        it = tupIter.next();
        dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
      }

      // Attach the input to forEach
      forEach.attachInput(res);

      // pull output tuple from ForEach
      Result forEachResult = forEach.getNext(t1);
      {
        switch (forEachResult.returnStatus) {
          case POStatus.STATUS_OK:
          case POStatus.STATUS_NULL:
          case POStatus.STATUS_ERR:
            return forEachResult;
          case POStatus.STATUS_EOP:
            // This chunk produced no output; continue to the next chunk below.
            break;
        }
      }
    }

    // Keep attaching input tuple to ForEach, until:
    // 1. We can initialize ForEach.getNext();
    // 2. There is no more input#n
    while (true) {
      if (tupIter.hasNext()) {
        // try setting up a bag of CHUNKSIZE OR
        // the remainder of the bag of last input
        // (if < CHUNKSIZE) to foreach
        dbs[lastBagIndex].clear(); // clear last chunk
        for (int i = 0; i < chunkSize && tupIter.hasNext(); i++) {
          it = tupIter.next();
          dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
        }
      } else
      // if we do not have any more tuples for input#n, return EOP
      {
        detachInput();
        newKey = true;
        return eopResult;
      }
      // Attach the input to forEach
      // NOTE(review): res still holds the (key, bags...) tuple built above;
      // only the last bag's contents change between chunks.
      forEach.attachInput(res);

      // pull output tuple from ForEach
      Result forEachResult = forEach.getNext(t1);
      {
        switch (forEachResult.returnStatus) {
          case POStatus.STATUS_OK:
          case POStatus.STATUS_NULL:
          case POStatus.STATUS_ERR:
            return forEachResult;
          case POStatus.STATUS_EOP:
            // Empty chunk result; loop to fetch the next chunk of input#n.
            break;
        }
      }
    }
  }
Exemplo n.º 3
0
  /**
   * From the inputs, constructs the output tuple for this co-group in the required format which is
   * (key, {bag of tuples from input 1}, {bag of tuples from input 2}, ...)
   */
  @Override
  public Result getNext(Tuple t) throws ExecException {
    Tuple res;

    // One-time setup: decide which bag implementation to use from the job conf.
    if (firstTime) {
      firstTime = false;
      if (PigMapReduce.sJobConf != null) {
        String bagType = PigMapReduce.sJobConf.get("pig.cachedbag.type");
        if (bagType != null && "default".equalsIgnoreCase(bagType)) {
          useDefaultBag = true;
        }
      }
    }

    if (distinct) {
      // DISTINCT case: the key already carries the whole tuple, so the
      // output is just (key) — no bags to build.
      res = mTupleFactory.newTuple(1);
      res.set(0, key);
    } else {
      // One bag per input.
      DataBag[] dbs = new DataBag[numInputs];

      if (isAccumulative()) {
        // Accumulative mode: wrap a shared buffer so tuples are pulled in
        // many batches; every bag references the same batch buffer.
        POPackageTupleBuffer buffer = new POPackageTupleBuffer();
        for (int idx = 0; idx < numInputs; idx++) {
          dbs[idx] = new AccumulativeBag(buffer, idx);
        }
      } else {
        // Eager mode: drain the iterator into concrete bags now.
        for (int idx = 0; idx < numInputs; idx++) {
          if (useDefaultBag) {
            dbs[idx] = BagFactory.getInstance().newDefaultBag();
          } else {
            // In a very rare case if there is a POStream after this
            // POPackage in the pipeline and is also blocking the pipeline;
            // constructor argument should be 2 * numInputs. But for one obscure
            // case we don't want to pay the penalty all the time.
            dbs[idx] = new InternalCachedBag(numInputs);
          }
        }
        // Route each indexed tuple into the bag matching its index.
        while (tupIter.hasNext()) {
          NullableTuple ntup = tupIter.next();
          int index = ntup.getIndex();
          Tuple copy = getValueTuple(ntup, index);

          // For multi-query merge, numInputs is always 1 but the index
          // (the inner plan's position in the enclosing operator) may not
          // be — everything funnels into the single bag.
          int target = (numInputs == 1) ? 0 : index;
          dbs[target].add(copy);

          if (reporter != null) {
            reporter.progress();
          }
        }
      }

      // Assemble the output tuple: (key, bag#0, bag#1, ...).
      res = mTupleFactory.newTuple(numInputs + 1);
      res.set(0, key);
      for (int idx = 0; idx < dbs.length; idx++) {
        // An empty bag on an "inner" input means this key contributes no
        // output at all (inner-join semantics) — bail out with NULL status.
        if (inner[idx] && !isAccumulative() && dbs[idx].size() == 0) {
          detachInput();
          Result empty = new Result();
          empty.returnStatus = POStatus.STATUS_NULL;
          return empty;
        }
        res.set(idx + 1, dbs[idx]);
      }
    }
    detachInput();
    Result out = new Result();
    out.result = res;
    out.returnStatus = POStatus.STATUS_OK;
    return out;
  }