@Override
public void nextBatch() throws IOException {
    for (int i = 0; i < bags.length; i++) {
        bags[i].clear();
    }

    key = currKey;
    for (int i = 0; i < batchSize; i++) {
        if (iter.hasNext()) {
            NullableTuple ntup = iter.next();
            int index = ntup.getIndex();
            Tuple copy = getValueTuple(ntup, index);
            if (numInputs == 1) {
                // This is for multi-query merge, where numInputs is
                // always 1 but the index (the position of the inner
                // plan in the enclosing operator) may be nonzero.
                bags[0].add(copy);
            } else {
                bags[index].add(copy);
            }
        }
    }
}
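// --- Illustrative sketch (not part of Pig) ---
// A minimal standalone model of the batching pattern above: clear every
// per-input bucket, then pull at most batchSize values from the iterator and
// route each one by its input index. The Indexed record, the BatchDemo class,
// and all literal values are assumptions for illustration only, not Pig APIs.
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class BatchDemo {
    // Stands in for NullableTuple: a value tagged with the input it came from.
    record Indexed(int index, String value) {}

    static void nextBatch(List<List<String>> bags, Iterator<Indexed> iter, int batchSize) {
        for (List<String> bag : bags) {
            bag.clear();                            // reset all per-input buckets
        }
        for (int i = 0; i < batchSize && iter.hasNext(); i++) {
            Indexed t = iter.next();
            bags.get(t.index()).add(t.value());     // route by input index
        }
    }

    public static void main(String[] args) {
        List<List<String>> bags = List.of(new ArrayList<>(), new ArrayList<>());
        Iterator<Indexed> iter = List.of(
                new Indexed(0, "a"), new Indexed(1, "b"), new Indexed(0, "c")).iterator();
        nextBatch(bags, iter, 2);
        System.out.println(bags);   // [[a], [b]]; "c" waits for the next batch
    }
}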
/**
 * Calls getNext to get the next ForEach result. The input for POJoinPackage is a
 * (key, NullableTuple) pair. We materialize n-1 inputs into bags and feed input#n
 * one tuple at a time to the delegated ForEach operator, so the input for ForEach is
 *
 * <p>(input#1, input#2, input#3, ..., input#n[i]), i = (1..k), assuming input#n
 * consists of k tuples.
 *
 * <p>For every ForEach input, pull all the results from ForEach. getNext will be
 * called multiple times for a particular input; it returns one output tuple from
 * ForEach on each call, so we need to maintain internal state to keep track of
 * where we are.
 */
@Override
public Result getNext(Tuple t) throws ExecException {
    if (firstTime) {
        firstTime = false;
        if (PigMapReduce.sJobConf != null) {
            String bagType = PigMapReduce.sJobConf.get("pig.cachedbag.type");
            if (bagType != null && bagType.equalsIgnoreCase("default")) {
                useDefaultBag = true;
            }
        }
    }

    // If a previous call to forEach.getNext()
    // has still not returned all of its output, process it first
    if (forEach.processingPlan) {
        forEachResult = forEach.getNext(t1);
        switch (forEachResult.returnStatus) {
            case POStatus.STATUS_OK:
            case POStatus.STATUS_NULL:
            case POStatus.STATUS_ERR:
                return forEachResult;
            case POStatus.STATUS_EOP:
                break;
        }
    }

    NullableTuple it = null;

    // If we see a new key, materialize n-1 inputs and construct the ForEach input
    // tuple res = (key, input#1, input#2, ..., input#n). The only missing value is
    // input#n; we will get input#n one tuple at a time, fill it into res, and feed
    // res to ForEach. After this block, we have the first tuple of input#n in hand
    // (kept in the variable it).
    if (newKey) {
        lastInputTuple = false;

        // Put the first n-1 inputs into bags
        dbs = new DataBag[numInputs];
        for (int i = 0; i < numInputs - 1; i++) {
            dbs[i] = useDefaultBag
                    ? BagFactory.getInstance().newDefaultBag()
                    // In a very rare case, if there is a POStream after this
                    // POJoinPackage in the pipeline and it is also blocking the
                    // pipeline, the constructor argument should be 2 * numInputs.
                    // But for that one obscure case we don't want to pay the
                    // penalty all the time.
                    : new InternalCachedBag(numInputs - 1);
        }
        // For the last bag, we always use a NonSpillableDataBag.
        dbs[lastBagIndex] = new NonSpillableDataBag((int) chunkSize);

        // For each NullableTuple in the input, put it into the corresponding bag
        // based on its index, except for the last input, which we will stream.
        // The tuples arrive ordered by index, starting from index 0, such that
        // all tuples for a given index arrive before any tuple for the next index.
        while (tupIter.hasNext()) {
            it = tupIter.next();
            int itIndex = it.getIndex();
            if (itIndex != numInputs - 1) {
                dbs[itIndex].add(getValueTuple(it, itIndex));
            } else {
                lastInputTuple = true;
                break;
            }
            if (reporter != null) {
                reporter.progress();
            }
        }

        // If we don't have any tuple for input#n,
        // no further processing is needed; return EOP
        if (!lastInputTuple) {
            // We return at this point because we ought to have a flatten
            // on this last input, and we have an empty bag, which should
            // result in this key being dropped from the output
            newKey = true;
            return eopResult;
        }

        res = mTupleFactory.newTuple(numInputs + 1);
        for (int i = 0; i < dbs.length; i++) {
            res.set(i + 1, dbs[i]);
        }
        res.set(0, key);

        // If any input is marked inner and its corresponding
        // bag is empty, we can just return
        for (int i = 0; i < dbs.length - 1; i++) {
            if (inner[i] && dbs[i].size() == 0) {
                detachInput();
                return eopResult;
            }
        }

        newKey = false;

        // Set up the last input's bag to contain a chunk of CHUNKSIZE values,
        // OR the entire input if it has fewer than CHUNKSIZE values. The idea
        // is that in most cases there are more than CHUNKSIZE values, and in
        // those cases we send the last bag as a series of smaller chunked
        // bags, thus holding less in memory.

        // The first tuple can be retrieved directly from "it"
        dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
        for (int i = 0; i < chunkSize - 1 && tupIter.hasNext(); i++) {
            it = tupIter.next();
            dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
        }

        // Attach the input to ForEach
        forEach.attachInput(res);

        // Pull an output tuple from ForEach
        Result forEachResult = forEach.getNext(t1);
        switch (forEachResult.returnStatus) {
            case POStatus.STATUS_OK:
            case POStatus.STATUS_NULL:
            case POStatus.STATUS_ERR:
                return forEachResult;
            case POStatus.STATUS_EOP:
                break;
        }
    }

    // Keep attaching input tuples to ForEach until either
    // 1. ForEach.getNext() produces a result, or
    // 2. there is no more input#n
    while (true) {
        if (tupIter.hasNext()) {
            // Try setting up a bag of CHUNKSIZE tuples, OR the remainder
            // of the last input (if < CHUNKSIZE), to feed to ForEach
            dbs[lastBagIndex].clear(); // clear the last chunk
            for (int i = 0; i < chunkSize && tupIter.hasNext(); i++) {
                it = tupIter.next();
                dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
            }
        } else {
            // If we do not have any more tuples for input#n, return EOP
            detachInput();
            newKey = true;
            return eopResult;
        }

        // Attach the input to ForEach
        forEach.attachInput(res);

        // Pull an output tuple from ForEach
        Result forEachResult = forEach.getNext(t1);
        switch (forEachResult.returnStatus) {
            case POStatus.STATUS_OK:
            case POStatus.STATUS_NULL:
            case POStatus.STATUS_ERR:
                return forEachResult;
            case POStatus.STATUS_EOP:
                break;
        }
    }
}
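// --- Illustrative sketch (not part of Pig) ---
// A simplified model of the chunking strategy above: the last join input is
// fed to a consumer in slices of at most CHUNK_SIZE tuples, so only one slice
// of the potentially huge last bag is in memory at a time. CHUNK_SIZE, the
// ChunkDemo class, and the Consumer callback are assumptions for illustration;
// in POJoinPackage the consumer role is played by forEach.attachInput(res)
// followed by forEach.getNext(t1).
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.function.Consumer;

public class ChunkDemo {
    static final int CHUNK_SIZE = 3;

    static void streamInChunks(Iterator<String> lastInput, Consumer<List<String>> consumer) {
        List<String> chunk = new ArrayList<>(CHUNK_SIZE);
        while (lastInput.hasNext()) {
            chunk.clear();                          // reuse one buffer per chunk
            for (int i = 0; i < CHUNK_SIZE && lastInput.hasNext(); i++) {
                chunk.add(lastInput.next());
            }
            consumer.accept(chunk);                 // hand one slice downstream
        }
    }

    public static void main(String[] args) {
        Iterator<String> in = List.of("t1", "t2", "t3", "t4", "t5").iterator();
        streamInChunks(in, chunk -> System.out.println(chunk));
        // prints [t1, t2, t3] then [t4, t5]
    }
}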
/**
 * From the inputs, constructs the output tuple for this co-group in the required
 * format, which is (key, {bag of tuples from input 1}, {bag of tuples from input 2}, ...).
 */
@Override
public Result getNext(Tuple t) throws ExecException {
    Tuple res;

    if (firstTime) {
        firstTime = false;
        if (PigMapReduce.sJobConf != null) {
            String bagType = PigMapReduce.sJobConf.get("pig.cachedbag.type");
            if (bagType != null && bagType.equalsIgnoreCase("default")) {
                useDefaultBag = true;
            }
        }
    }

    if (distinct) {
        // For DISTINCT, the key itself is the whole tuple,
        // so only the key needs to be set
        res = mTupleFactory.newTuple(1);
        res.set(0, key);
    } else {
        // Create numInputs bags
        DataBag[] dbs = new DataBag[numInputs];

        if (isAccumulative()) {
            // Create bag wrappers to pull tuples in many batches.
            // All bags hold a reference to the same tuple buffer,
            // which contains the tuples of one batch.
            POPackageTupleBuffer buffer = new POPackageTupleBuffer();
            for (int i = 0; i < numInputs; i++) {
                dbs[i] = new AccumulativeBag(buffer, i);
            }
        } else {
            // Create bags to pull all tuples out of the iterator
            for (int i = 0; i < numInputs; i++) {
                dbs[i] = useDefaultBag
                        ? BagFactory.getInstance().newDefaultBag()
                        // In a very rare case, if there is a POStream after this
                        // POPackage in the pipeline and it is also blocking the
                        // pipeline, the constructor argument should be 2 * numInputs.
                        // But for that one obscure case we don't want to pay the
                        // penalty all the time.
                        : new InternalCachedBag(numInputs);
            }

            // For each indexed tuple in the input, sort it into
            // the corresponding bag based on its index
            while (tupIter.hasNext()) {
                NullableTuple ntup = tupIter.next();
                int index = ntup.getIndex();
                Tuple copy = getValueTuple(ntup, index);
                if (numInputs == 1) {
                    // This is for multi-query merge, where numInputs is
                    // always 1 but the index (the position of the inner
                    // plan in the enclosing operator) may be nonzero.
                    dbs[0].add(copy);
                } else {
                    dbs[index].add(copy);
                }
                if (reporter != null) {
                    reporter.progress();
                }
            }
        }

        // Construct the output tuple by appending the key
        // and all the bags constructed above, and return it
        res = mTupleFactory.newTuple(numInputs + 1);
        res.set(0, key);
        int i = -1;
        for (DataBag bag : dbs) {
            i++;
            if (inner[i] && !isAccumulative()) {
                if (bag.size() == 0) {
                    detachInput();
                    Result r = new Result();
                    r.returnStatus = POStatus.STATUS_NULL;
                    return r;
                }
            }
            res.set(i + 1, bag);
        }
    }

    detachInput();
    Result r = new Result();
    r.result = res;
    r.returnStatus = POStatus.STATUS_OK;
    return r;
}
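// --- Illustrative sketch (not part of Pig's operator code) ---
// A hedged sketch of the output shape POPackage builds for a co-group over two
// inputs, assembled with Pig's public data APIs (org.apache.pig.data). The
// PackageShapeDemo class, the key "k1", and the field values are illustrative
// assumptions only.
import java.util.Arrays;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class PackageShapeDemo {
    public static void main(String[] args) throws ExecException {
        TupleFactory tf = TupleFactory.getInstance();
        BagFactory bf = BagFactory.getInstance();

        DataBag fromInput1 = bf.newDefaultBag();    // tuples for the key from input 1
        fromInput1.add(tf.newTuple(Arrays.asList("k1", 10)));

        DataBag fromInput2 = bf.newDefaultBag();    // tuples for the key from input 2
        fromInput2.add(tf.newTuple(Arrays.asList("k1", "x")));

        // (key, {bag of tuples from input 1}, {bag of tuples from input 2})
        Tuple res = tf.newTuple(3);
        res.set(0, "k1");
        res.set(1, fromInput1);
        res.set(2, fromInput2);
        System.out.println(res);    // e.g. (k1,{(k1,10)},{(k1,x)})
    }
}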