@Override
public void nextBatch() throws IOException {
    for (int i = 0; i < bags.length; i++) {
        bags[i].clear();
    }

    key = currKey;

    for (int i = 0; i < batchSize; i++) {
        if (iter.hasNext()) {
            NullableTuple ntup = iter.next();
            int index = ntup.getIndex();
            Tuple copy = getValueTuple(ntup, index);

            if (numInputs == 1) {
                // this is for multi-query merge where
                // the numInputs is always 1, but the index
                // (the position of the inner plan in the
                // enclosed operator) may not be 1.
                bags[0].add(copy);
            } else {
                bags[index].add(copy);
            }
        }
    }
}
@Override
public Result getNextTuple() throws ExecException {
    res = super.getNextTuple();
    if (writer == null) { // In the case of combiner
        return res;
    }
    try {
        switch (res.returnStatus) {
        case POStatus.STATUS_OK:
            if (illustrator == null) {
                Tuple result = (Tuple) res.result;
                Byte index = (Byte) result.get(0);
                PigNullableWritable key =
                        HDataType.getWritableComparableTypes(result.get(1), keyType);
                NullableTuple val = new NullableTuple((Tuple) result.get(2));

                // Both the key and the value need the index. The key needs it so
                // that it can be sorted on the index in addition to the key
                // value. The value needs it so that POPackage can properly
                // assign the tuple to its slot in the projection.
                key.setIndex(index);
                val.setIndex(index);

                if (isSkewedJoin) {
                    // Wrap into a NullablePartitionWritable to match the key
                    // of the right table from POPartitionRearrangeTez for the skewed join
                    NullablePartitionWritable wrappedKey = new NullablePartitionWritable(key);
                    wrappedKey.setPartition(-1);
                    key = wrappedKey;
                }
                writer.write(key, val);
            } else {
                illustratorMarkup(res.result, res.result, 0);
            }
            res = RESULT_EMPTY;
            break;
        case POStatus.STATUS_EOP:
        case POStatus.STATUS_ERR:
        case POStatus.STATUS_NULL:
        default:
            break;
        }
    } catch (IOException ioe) {
        int errCode = 2135;
        String msg = "Received error from POLocalRearrange function." + ioe.getMessage();
        throw new ExecException(msg, errCode, ioe);
    }
    return res;
}
@Override
public void collect(Context oc, Tuple tuple) throws InterruptedException, IOException {
    Byte index = (Byte) tuple.get(0);
    PigNullableWritable key =
            HDataType.getWritableComparableTypes(tuple.get(1), keyType);
    NullableTuple val = new NullableTuple((Tuple) tuple.get(2));

    // Both the key and the value need the index. The key needs it so
    // that it can be sorted on the index in addition to the key
    // value. The value needs it so that POPackage can properly
    // assign the tuple to its slot in the projection.
    key.setIndex(index);
    val.setIndex(index);

    oc.write(key, val);
}
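The comment above is the contract between the rearrange and package stages. As a hedged, standalone sketch (the class below is hypothetical and not part of the Pig sources), this is roughly how the index set here is later read back to route a value to the bag of the input it came from:

// Hypothetical illustration: a NullableTuple carries the index of its input,
// and the packaging side reads it back to pick the right bag.
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.io.NullableTuple;

public class IndexRoutingSketch {
    public static void main(String[] args) throws Exception {
        Tuple value = TupleFactory.getInstance().newTuple(1);
        value.set(0, "some field");

        NullableTuple val = new NullableTuple(value);
        val.setIndex((byte) 1); // the tuple came from the second input of the cogroup/join

        // a POPackage-style consumer would route on the index:
        int index = val.getIndex();
        System.out.println("route value to bag #" + index); // route value to bag #1
    }
}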
protected Tuple getValueTuple(NullableTuple ntup, int index) throws ExecException {
    // Need to make a copy of the value, as hadoop uses the same ntup
    // to represent each value.
    Tuple val = (Tuple) ntup.getValueAsPigType();

    Tuple copy = null;
    // The "value (val)" that we just got may not
    // be the complete "value". It may have some portions
    // in the "key" (look in POLocalRearrange for more comments)
    // If this is the case we need to stitch
    // the "value" together.
    Pair<Boolean, Map<Integer, Integer>> lrKeyInfo = keyInfo.get(index);
    boolean isProjectStar = lrKeyInfo.first;
    Map<Integer, Integer> keyLookup = lrKeyInfo.second;
    int keyLookupSize = keyLookup.size();

    if (keyLookupSize > 0) {
        // we have some fields of the "value" in the
        // "key".
        copy = mTupleFactory.newTuple();
        int finalValueSize = keyLookupSize + val.size();
        int valIndex = 0; // an index for accessing elements from
                          // the value (val) that we have currently
        for (int i = 0; i < finalValueSize; i++) {
            Integer keyIndex = keyLookup.get(i);
            if (keyIndex == null) {
                // the field for this index is not in the
                // key - so just take it from the "value"
                // we were handed
                copy.append(val.get(valIndex));
                valIndex++;
            } else {
                // the field for this index is in the key
                if (isKeyTuple) {
                    // the key is a tuple, extract the
                    // field out of the tuple
                    copy.append(keyAsTuple.get(keyIndex));
                } else {
                    copy.append(key);
                }
            }
        }
    } else if (isProjectStar) {
        // the whole "value" is present in the "key"
        copy = mTupleFactory.newTuple(keyAsTuple.getAll());
    } else {
        // there is no field of the "value" in the
        // "key" - so just make a copy of what we got
        // as the "value"
        copy = mTupleFactory.newTuple(val.getAll());
    }
    return copy;
}
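To make the stitching above concrete, here is a minimal standalone sketch (plain Java with invented data, not Pig code) of the same keyLookup idea: the map records which positions of the final value were moved into the key, and every other position is filled from the stripped value in order:

import java.util.HashMap;
import java.util.Map;

public class StitchSketch {
    public static void main(String[] args) {
        Object[] strippedValue = { "a", "c" }; // what arrives as the "value"
        Object[] keyAsTuple = { "b" };         // the piece that rode along in the "key"
        Map<Integer, Integer> keyLookup = new HashMap<>();
        keyLookup.put(1, 0); // final-value position 1 comes from key position 0

        Object[] full = new Object[strippedValue.length + keyLookup.size()];
        int valIndex = 0;
        for (int i = 0; i < full.length; i++) {
            Integer keyIndex = keyLookup.get(i);
            full[i] = (keyIndex == null) ? strippedValue[valIndex++] : keyAsTuple[keyIndex];
        }
        System.out.println(java.util.Arrays.toString(full)); // [a, b, c]
    }
}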
public int compare(Object o1, Object o2) {
    NullableTuple nt1 = (NullableTuple) o1;
    NullableTuple nt2 = (NullableTuple) o2;
    int rc = 0;

    // If either are null, handle differently.
    if (!nt1.isNull() && !nt2.isNull()) {
        rc = compareTuple((Tuple) nt1.getValueAsPigType(), (Tuple) nt2.getValueAsPigType());
    } else {
        // For sorting purposes two nulls are equal.
        if (nt1.isNull() && nt2.isNull()) {
            rc = 0;
        } else if (nt1.isNull()) {
            rc = -1;
        } else {
            rc = 1;
        }
        if (mWholeTuple && !mAsc[0]) {
            rc *= -1;
        }
    }
    return rc;
}
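A small self-contained sketch (not the Pig comparator itself) of the null-ordering contract implemented above: two nulls compare equal, a lone null sorts before any non-null value, and that result is negated when the whole tuple is sorted descending (the mWholeTuple && !mAsc[0] case):

import java.util.Arrays;
import java.util.Comparator;

public class NullOrderingSketch {
    public static void main(String[] args) {
        boolean descending = false; // stands in for mWholeTuple && !mAsc[0]

        Comparator<String> nullAware = (a, b) -> {
            int rc;
            if (a != null && b != null) {
                rc = a.compareTo(b);
            } else {
                // two nulls are equal; a lone null sorts first
                rc = (a == null && b == null) ? 0 : (a == null ? -1 : 1);
                if (descending) {
                    rc *= -1;
                }
            }
            return rc;
        };

        String[] vals = { "banana", null, "apple", null };
        Arrays.sort(vals, nullAware);
        System.out.println(Arrays.toString(vals)); // [null, null, apple, banana]
    }
}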
@Override
public void collect(Context oc, Tuple tuple) throws InterruptedException, IOException {
    Byte tupleKeyIdx = 2;
    Byte tupleValIdx = 3;

    Byte index = (Byte) tuple.get(0);
    Integer partitionIndex = -1;
    // for partitioning table, the partition index isn't present
    if (tuple.size() == 3) {
        // super.collect(oc, tuple);
        // return;
        tupleKeyIdx--;
        tupleValIdx--;
    } else {
        partitionIndex = (Integer) tuple.get(1);
    }

    PigNullableWritable key =
            HDataType.getWritableComparableTypes(tuple.get(tupleKeyIdx), keyType);
    NullablePartitionWritable wrappedKey = new NullablePartitionWritable(key);
    NullableTuple val = new NullableTuple((Tuple) tuple.get(tupleValIdx));

    // Both the key and the value need the index. The key needs it so
    // that it can be sorted on the index in addition to the key
    // value. The value needs it so that POPackage can properly
    // assign the tuple to its slot in the projection.
    wrappedKey.setIndex(index);

    // set the partition
    wrappedKey.setPartition(partitionIndex);
    val.setIndex(index);
    oc.write(wrappedKey, val);
}
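For reference, a hedged sketch (hypothetical helper, not Pig code) of the two rearranged-tuple shapes this collect() accepts and how the key/value slots and the partition index fall out of them: the streamed side carries (index, partitionIndex, key, value), the partition-table side only (index, key, value), and the partition defaults to -1 when absent:

public class SkewedShapeSketch {
    // returns {keyIdx, valIdx, partitionIndex} for a rearranged tuple of the given size
    static int[] slots(int tupleSize, Integer partitionField) {
        int keyIdx = 2, valIdx = 3;
        int partition = -1;   // -1 => no precomputed skewed-join partition
        if (tupleSize == 3) { // partition-table side: no partition field present
            keyIdx--;
            valIdx--;
        } else {              // streamed side carries the partition at position 1
            partition = partitionField;
        }
        return new int[] { keyIdx, valIdx, partition };
    }

    public static void main(String[] args) {
        System.out.println(java.util.Arrays.toString(slots(3, null))); // [1, 2, -1]
        System.out.println(java.util.Arrays.toString(slots(4, 7)));    // [2, 3, 7]
    }
}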
/**
 * Calls getNext to get the next ForEach result. The input for POJoinPackage is a
 * (key, NullableTuple) pair. We materialize n-1 inputs into bags and feed input#n
 * one tuple at a time to the delegated ForEach operator, so the input for ForEach is
 *
 * <p>(input#1, input#2, input#3, ..., input#n[i]), i = (1..k), assuming input#n consists
 * of k tuples.
 *
 * <p>For every ForEach input, pull all the results from ForEach. getNext will be called
 * multiple times for a particular input; it returns one output tuple from ForEach every
 * time we call getNext, so we need to maintain internal state to keep track of where we are.
 */
@Override
public Result getNext(Tuple t) throws ExecException {
    if (firstTime) {
        firstTime = false;
        if (PigMapReduce.sJobConf != null) {
            String bagType = PigMapReduce.sJobConf.get("pig.cachedbag.type");
            if (bagType != null && bagType.equalsIgnoreCase("default")) {
                useDefaultBag = true;
            }
        }
    }

    // if a previous call to foreach.getNext()
    // has still not returned all output, process it
    if (forEach.processingPlan) {
        forEachResult = forEach.getNext(t1);
        switch (forEachResult.returnStatus) {
        case POStatus.STATUS_OK:
        case POStatus.STATUS_NULL:
        case POStatus.STATUS_ERR:
            return forEachResult;
        case POStatus.STATUS_EOP:
            break;
        }
    }

    NullableTuple it = null;

    // If we see a new NullableTupleIterator (i.e. a new key), materialize n-1 inputs and
    // construct the ForEach input tuple res = (key, input#1, input#2, ..., input#n); the
    // only missing value is input#n, which we will get one tuple at a time, fill into res,
    // and feed to ForEach.
    // After this block, we have the first tuple of input#n in hand (kept in variable it)
    if (newKey) {
        lastInputTuple = false;
        // Put n-1 inputs into bags
        dbs = new DataBag[numInputs];
        for (int i = 0; i < numInputs - 1; i++) {
            dbs[i] = useDefaultBag
                    ? BagFactory.getInstance().newDefaultBag()
                    // In a very rare case if there is a POStream after this
                    // POJoinPackage in the pipeline and is also blocking the pipeline;
                    // constructor argument should be 2 * numInputs. But for one obscure
                    // case we don't want to pay the penalty all the time.
                    : new InternalCachedBag(numInputs - 1);
        }
        // For the last bag, we always use NonSpillableDataBag.
        dbs[lastBagIndex] = new NonSpillableDataBag((int) chunkSize);

        // For each NullableTuple in the input, put it
        // into the corresponding bag based on the index,
        // except for the last input, which we will stream.
        // The tuples will arrive in the order of the index,
        // starting from index 0 and such that all tuples for
        // a given index arrive before a tuple for the next
        // index does.
        while (tupIter.hasNext()) {
            it = tupIter.next();
            int itIndex = it.getIndex();
            if (itIndex != numInputs - 1) {
                dbs[itIndex].add(getValueTuple(it, itIndex));
            } else {
                lastInputTuple = true;
                break;
            }
            if (reporter != null) {
                reporter.progress();
            }
        }

        // If we don't have any tuple for input#n
        // we do not need any further processing; return EOP
        if (!lastInputTuple) {
            // we will return at this point because we ought
            // to be having a flatten on this last input
            // and we have an empty bag which should result
            // in this key being taken out of the output
            newKey = true;
            return eopResult;
        }

        res = mTupleFactory.newTuple(numInputs + 1);
        for (int i = 0; i < dbs.length; i++) {
            res.set(i + 1, dbs[i]);
        }
        res.set(0, key);

        // if we have an inner anywhere and the corresponding
        // bag is empty, we can just return
        for (int i = 0; i < dbs.length - 1; i++) {
            if (inner[i] && dbs[i].size() == 0) {
                detachInput();
                return eopResult;
            }
        }
        newKey = false;

        // set up the bag with the last input to contain
        // a chunk of CHUNKSIZE values OR the entire bag if
        // it has less than CHUNKSIZE values - the idea is in most
        // cases the values are > CHUNKSIZE in number and in
        // those cases we will be sending the last bag
        // as a set of smaller chunked bags, thus holding less
        // in memory

        // the first tuple can be directly retrieved from "it"
        dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
        for (int i = 0; i < chunkSize - 1 && tupIter.hasNext(); i++) {
            it = tupIter.next();
            dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
        }

        // Attach the input to forEach
        forEach.attachInput(res);

        // pull output tuple from ForEach
        Result forEachResult = forEach.getNext(t1);
        {
            switch (forEachResult.returnStatus) {
            case POStatus.STATUS_OK:
            case POStatus.STATUS_NULL:
            case POStatus.STATUS_ERR:
                return forEachResult;
            case POStatus.STATUS_EOP:
                break;
            }
        }
    }

    // Keep attaching input tuples to ForEach, until:
    // 1. We can initialize ForEach.getNext();
    // 2. There is no more input#n
    while (true) {
        if (tupIter.hasNext()) {
            // try setting up a bag of CHUNKSIZE OR
            // the remainder of the bag of the last input
            // (if < CHUNKSIZE) to feed to foreach
            dbs[lastBagIndex].clear(); // clear last chunk
            for (int i = 0; i < chunkSize && tupIter.hasNext(); i++) {
                it = tupIter.next();
                dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
            }
        } else {
            // if we do not have any more tuples for input#n, return EOP
            detachInput();
            newKey = true;
            return eopResult;
        }

        // Attach the input to forEach
        forEach.attachInput(res);

        // pull output tuple from ForEach
        Result forEachResult = forEach.getNext(t1);
        {
            switch (forEachResult.returnStatus) {
            case POStatus.STATUS_OK:
            case POStatus.STATUS_NULL:
            case POStatus.STATUS_ERR:
                return forEachResult;
            case POStatus.STATUS_EOP:
                break;
            }
        }
    }
}
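The chunking of the last (streamed) input described in the javadoc can be pictured with a tiny standalone sketch (made-up data, not Pig code): with chunkSize 2 and five tuples in input#n, the nested ForEach is fed three successively attached chunks rather than one large bag:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class ChunkedStreamSketch {
    public static void main(String[] args) {
        List<String> lastInput = Arrays.asList("t0", "t1", "t2", "t3", "t4");
        int chunkSize = 2; // stands in for POJoinPackage's chunkSize

        Iterator<String> it = lastInput.iterator();
        while (it.hasNext()) {
            List<String> chunk = new ArrayList<>(); // stands in for dbs[lastBagIndex]
            for (int i = 0; i < chunkSize && it.hasNext(); i++) {
                chunk.add(it.next());
            }
            System.out.println("feed ForEach with chunk " + chunk);
            // prints [t0, t1], then [t2, t3], then [t4]
        }
    }
}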
private void runTest(Object key, boolean[] inner, byte keyType) throws ExecException, IOException {
    Random r = new Random();
    DataBag db1 = GenRandomData.genRandSmallTupDataBag(r, 10, 100);
    DataBag db2 = GenRandomData.genRandSmallTupDataBag(r, 10, 100);
    List<NullableTuple> db = new ArrayList<NullableTuple>(200);

    Iterator<Tuple> db1Iter = db1.iterator();
    if (!inner[0]) {
        while (db1Iter.hasNext()) {
            NullableTuple it = new NullableTuple(db1Iter.next());
            it.setIndex((byte) 0);
            db.add(it);
        }
    }
    Iterator<Tuple> db2Iter = db2.iterator();
    while (db2Iter.hasNext()) {
        NullableTuple it = new NullableTuple(db2Iter.next());
        it.setIndex((byte) 1);
        db.add(it);
    }

    // ITIterator iti = new TestPackage.ITIterator(db.iterator());
    POPackage pop = new POPackage(new OperatorKey("", r.nextLong()));
    pop.setNumInps(2);
    pop.getPkgr().setInner(inner);
    PigNullableWritable k = HDataType.getWritableComparableTypes(key, keyType);
    pop.attachInput(k, db.iterator());

    if (keyType != DataType.BAG) {
        // test serialization
        NullablePartitionWritable wr;
        if (keyType == DataType.TUPLE) {
            BinSedesTuple tup =
                    (BinSedesTuple) binfactory.newTupleNoCopy(((Tuple) k.getValueAsPigType()).getAll());
            wr = new NullablePartitionWritable(new NullableTuple(tup));
        } else {
            wr = new NullablePartitionWritable(k);
        }
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(baos);
        wr.write(out);
        byte[] arr = baos.toByteArray();
        ByteArrayInputStream bais = new ByteArrayInputStream(arr);
        DataInputStream in = new DataInputStream(bais);
        NullablePartitionWritable re = new NullablePartitionWritable();
        re.readFields(in);
        assertEquals(re, wr);
    }

    // we are not doing any optimization to remove
    // parts of the "value" which are present in the "key" in this
    // unit test - so set up the "keyInfo" accordingly in
    // the POPackage
    Map<Integer, Pair<Boolean, Map<Integer, Integer>>> keyInfo =
            new HashMap<Integer, Pair<Boolean, Map<Integer, Integer>>>();
    Pair<Boolean, Map<Integer, Integer>> p =
            new Pair<Boolean, Map<Integer, Integer>>(false, new HashMap<Integer, Integer>());
    keyInfo.put(0, p);
    keyInfo.put(1, p);
    pop.getPkgr().setKeyInfo(keyInfo);

    Tuple t = null;
    Result res = null;
    res = pop.getNextTuple();
    if (res.returnStatus == POStatus.STATUS_NULL && inner[0]) {
        return;
    }
    assertEquals(POStatus.STATUS_OK, res.returnStatus);

    t = (Tuple) res.result;
    Object outKey = t.get(0);
    DataBag outDb1 = (DataBag) t.get(1);
    DataBag outDb2 = (DataBag) t.get(2);
    assertEquals(key, outKey);
    assertTrue(TestHelper.compareBags(db1, outDb1));
    assertTrue(TestHelper.compareBags(db2, outDb2));
}
/**
 * From the inputs, constructs the output tuple for this co-group in the required format which is
 * (key, {bag of tuples from input 1}, {bag of tuples from input 2}, ...)
 */
@Override
public Result getNext(Tuple t) throws ExecException {
    Tuple res;

    if (firstTime) {
        firstTime = false;
        if (PigMapReduce.sJobConf != null) {
            String bagType = PigMapReduce.sJobConf.get("pig.cachedbag.type");
            if (bagType != null && bagType.equalsIgnoreCase("default")) {
                useDefaultBag = true;
            }
        }
    }

    if (distinct) {
        // only set the key which has the whole
        // tuple
        res = mTupleFactory.newTuple(1);
        res.set(0, key);
    } else {
        // Create numInputs bags
        DataBag[] dbs = null;
        dbs = new DataBag[numInputs];

        if (isAccumulative()) {
            // create bag wrapper to pull tuples in many batches
            // all bags have reference to the sample tuples buffer
            // which contains tuples from one batch
            POPackageTupleBuffer buffer = new POPackageTupleBuffer();
            for (int i = 0; i < numInputs; i++) {
                dbs[i] = new AccumulativeBag(buffer, i);
            }
        } else {
            // create bag to pull all tuples out of iterator
            for (int i = 0; i < numInputs; i++) {
                dbs[i] = useDefaultBag
                        ? BagFactory.getInstance().newDefaultBag()
                        // In a very rare case if there is a POStream after this
                        // POPackage in the pipeline and is also blocking the pipeline;
                        // constructor argument should be 2 * numInputs. But for one obscure
                        // case we don't want to pay the penalty all the time.
                        : new InternalCachedBag(numInputs);
            }

            // For each indexed tup in the inp, sort them
            // into their corresponding bags based
            // on the index
            while (tupIter.hasNext()) {
                NullableTuple ntup = tupIter.next();
                int index = ntup.getIndex();
                Tuple copy = getValueTuple(ntup, index);

                if (numInputs == 1) {
                    // this is for multi-query merge where
                    // the numInputs is always 1, but the index
                    // (the position of the inner plan in the
                    // enclosed operator) may not be 1.
                    dbs[0].add(copy);
                } else {
                    dbs[index].add(copy);
                }
                if (reporter != null) {
                    reporter.progress();
                }
            }
        }

        // Construct the output tuple by appending
        // the key and all the above constructed bags
        // and return it.
        res = mTupleFactory.newTuple(numInputs + 1);
        res.set(0, key);
        int i = -1;
        for (DataBag bag : dbs) {
            i++;
            if (inner[i] && !isAccumulative()) {
                if (bag.size() == 0) {
                    detachInput();
                    Result r = new Result();
                    r.returnStatus = POStatus.STATUS_NULL;
                    return r;
                }
            }
            res.set(i + 1, bag);
        }
    }

    detachInput();
    Result r = new Result();
    r.result = res;
    r.returnStatus = POStatus.STATUS_OK;
    return r;
}
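For a two-input cogroup, the tuple built above has the shape (key, {bag from input 0}, {bag from input 1}). A short hedged example (invented data, assembled by hand with the public Tuple/DataBag factories rather than by POPackage) of that output shape:

import java.util.Arrays;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class PackageOutputSketch {
    public static void main(String[] args) throws Exception {
        TupleFactory tf = TupleFactory.getInstance();
        BagFactory bf = BagFactory.getInstance();

        DataBag fromInput0 = bf.newDefaultBag();
        fromInput0.add(tf.newTuple(Arrays.<Object>asList(7, "left")));
        DataBag fromInput1 = bf.newDefaultBag();
        fromInput1.add(tf.newTuple(Arrays.<Object>asList(7, "right")));

        Tuple res = tf.newTuple(3); // numInputs + 1 fields
        res.set(0, 7);              // the group key
        res.set(1, fromInput0);     // bag for input index 0
        res.set(2, fromInput1);     // bag for input index 1
        System.out.println(res);    // prints something like (7,{(7,left)},{(7,right)})
    }
}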