Example #1
    @Override
    public void nextBatch() throws IOException {
      for (int i = 0; i < bags.length; i++) {
        bags[i].clear();
      }

      key = currKey;
      for (int i = 0; i < batchSize && iter.hasNext(); i++) {
        NullableTuple ntup = iter.next();
        int index = ntup.getIndex();
        Tuple copy = getValueTuple(ntup, index);
        if (numInputs == 1) {

          // this is for multi-query merge where
          // the numInputs is always 1, but the index
          // (the position of the inner plan in the
          // enclosed operator) may not be 0.
          bags[0].add(copy);
        } else {
          bags[index].add(copy);
        }
      }
    }
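For reference, here is a minimal, self-contained sketch of the batch-pull pattern above: clear the bags, then refill them with up to batchSize items per call. The class and method names are illustrative stand-ins, not the actual Pig API.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class BatchBufferSketch {
  private final Iterator<String> iter;
  private final int batchSize;
  final List<String> bag = new ArrayList<>(); // stands in for bags[i]

  BatchBufferSketch(Iterator<String> iter, int batchSize) {
    this.iter = iter;
    this.batchSize = batchSize;
  }

  boolean hasNextBatch() {
    return iter.hasNext();
  }

  void nextBatch() {
    bag.clear(); // the previous batch is discarded before refilling
    for (int i = 0; i < batchSize && iter.hasNext(); i++) {
      bag.add(iter.next());
    }
  }

  public static void main(String[] args) {
    BatchBufferSketch b =
        new BatchBufferSketch(Arrays.asList("a", "b", "c", "d", "e").iterator(), 2);
    while (b.hasNextBatch()) {
      b.nextBatch();
      System.out.println(b.bag); // [a, b], then [c, d], then [e]
    }
  }
}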
Example #2
  @Override
  public Result getNextTuple() throws ExecException {
    res = super.getNextTuple();
    if (writer == null) { // In the case of combiner
      return res;
    }

    try {
      switch (res.returnStatus) {
        case POStatus.STATUS_OK:
          if (illustrator == null) {
            Tuple result = (Tuple) res.result;
            Byte index = (Byte) result.get(0);
            PigNullableWritable key = HDataType.getWritableComparableTypes(result.get(1), keyType);
            NullableTuple val = new NullableTuple((Tuple) result.get(2));

            // Both the key and the value need the index.  The key needs it so
            // that it can be sorted on the index in addition to the key
            // value.  The value needs it so that POPackage can properly
            // assign the tuple to its slot in the projection.
            key.setIndex(index);
            val.setIndex(index);
            if (isSkewedJoin) {
              // Wrap into a NullablePartitionWritable to match the key
              // of the right table from POPartitionRearrangeTez for the skewed join
              NullablePartitionWritable wrappedKey = new NullablePartitionWritable(key);
              wrappedKey.setPartition(-1);
              key = wrappedKey;
            }
            writer.write(key, val);
          } else {
            illustratorMarkup(res.result, res.result, 0);
          }
          res = RESULT_EMPTY;
          break;
        case POStatus.STATUS_EOP:
        case POStatus.STATUS_ERR:
        case POStatus.STATUS_NULL:
        default:
          break;
      }
    } catch (IOException ioe) {
      int errCode = 2135;
      String msg = "Received error from POLocalRearrage function." + ioe.getMessage();
      throw new ExecException(msg, errCode, ioe);
    }
    return res;
  }
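The (index, key) pairing the comments above describe is easier to see in isolation with a toy record type: sorting on the key first and the index second groups each key's tuples together while keeping the inputs in order. This is only an illustration of the ordering idea; Record is a hypothetical stand-in for PigNullableWritable.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class IndexedKeySketch {
  static final class Record {
    final String key;
    final byte index; // which input this tuple came from

    Record(String key, byte index) {
      this.key = key;
      this.index = index;
    }

    @Override
    public String toString() {
      return key + "#" + index;
    }
  }

  public static void main(String[] args) {
    List<Record> records = new ArrayList<>();
    records.add(new Record("k1", (byte) 1));
    records.add(new Record("k1", (byte) 0));
    records.add(new Record("k0", (byte) 0));
    // Sort on (key, index): all tuples for a key arrive together, and
    // within a key, tuples from input 0 precede tuples from input 1.
    records.sort(
        Comparator.comparing((Record r) -> r.key).thenComparingInt(r -> r.index));
    System.out.println(records); // [k0#0, k1#0, k1#1]
  }
}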
Example #3
    @Override
    public void collect(Context oc, Tuple tuple) throws InterruptedException, IOException {

      Byte index = (Byte) tuple.get(0);
      PigNullableWritable key = HDataType.getWritableComparableTypes(tuple.get(1), keyType);
      NullableTuple val = new NullableTuple((Tuple) tuple.get(2));

      // Both the key and the value need the index.  The key needs it so
      // that it can be sorted on the index in addition to the key
      // value.  The value needs it so that POPackage can properly
      // assign the tuple to its slot in the projection.
      key.setIndex(index);
      val.setIndex(index);

      oc.write(key, val);
    }
Example #4
  protected Tuple getValueTuple(NullableTuple ntup, int index) throws ExecException {
    // Need to make a copy of the value, as hadoop uses the same ntup
    // to represent each value.
    Tuple val = (Tuple) ntup.getValueAsPigType();

    Tuple copy = null;
    // The "value (val)" that we just got may not
    // be the complete "value". It may have some portions
    // in the "key" (look in POLocalRearrange for more comments)
    // If this is the case we need to stitch
    // the "value" together.
    Pair<Boolean, Map<Integer, Integer>> lrKeyInfo = keyInfo.get(index);
    boolean isProjectStar = lrKeyInfo.first;
    Map<Integer, Integer> keyLookup = lrKeyInfo.second;
    int keyLookupSize = keyLookup.size();

    if (keyLookupSize > 0) {

      // we have some fields of the "value" in the
      // "key".
      copy = mTupleFactory.newTuple();
      int finalValueSize = keyLookupSize + val.size();
      int valIndex = 0; // index for accessing elements from the value (val) we currently have
      for (int i = 0; i < finalValueSize; i++) {
        Integer keyIndex = keyLookup.get(i);
        if (keyIndex == null) {
          // the field for this index is not in the
          // key - so just take it from the "value"
          // we were handed
          copy.append(val.get(valIndex));
          valIndex++;
        } else {
          // the field for this index is in the key
          if (isKeyTuple) {
            // the key is a tuple, extract the
            // field out of the tuple
            copy.append(keyAsTuple.get(keyIndex));
          } else {
            copy.append(key);
          }
        }
      }

    } else if (isProjectStar) {

      // the whole "value" is present in the "key"
      copy = mTupleFactory.newTuple(keyAsTuple.getAll());

    } else {

      // there is no field of the "value" in the
      // "key" - so just make a copy of what we got
      // as the "value"
      copy = mTupleFactory.newTuple(val.getAll());
    }
    return copy;
  }
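The stitching above is easier to follow with concrete values. A minimal sketch, with plain lists standing in for tuples: suppose the original tuple was (a, b, c), field 1 ("b") traveled in the key, and only (a, c) traveled as the value, so keyLookup maps final position 1 to key position 0.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class StitchSketch {
  public static void main(String[] args) {
    List<String> val = List.of("a", "c"); // the value, with key fields removed
    List<String> key = List.of("b");      // the key fields, in key order
    Map<Integer, Integer> keyLookup = new HashMap<>();
    keyLookup.put(1, 0); // final position 1 comes from key position 0

    int finalValueSize = keyLookup.size() + val.size();
    List<String> copy = new ArrayList<>(finalValueSize);
    int valIndex = 0;
    for (int i = 0; i < finalValueSize; i++) {
      Integer keyIndex = keyLookup.get(i);
      // positions absent from keyLookup are filled from the value in
      // order; positions present in it are pulled out of the key
      copy.add(keyIndex == null ? val.get(valIndex++) : key.get(keyIndex));
    }
    System.out.println(copy); // [a, b, c]
  }
}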
Example #5
  public int compare(Object o1, Object o2) {
    NullableTuple nt1 = (NullableTuple) o1;
    NullableTuple nt2 = (NullableTuple) o2;
    int rc = 0;

    // If either are null, handle differently.
    if (!nt1.isNull() && !nt2.isNull()) {
      rc = compareTuple((Tuple) nt1.getValueAsPigType(), (Tuple) nt2.getValueAsPigType());
    } else {
      // For sorting purposes two nulls are equal.
      if (nt1.isNull() && nt2.isNull()) rc = 0;
      else if (nt1.isNull()) rc = -1;
      else rc = 1;
      if (mWholeTuple && !mAsc[0]) rc *= -1;
    }
    return rc;
  }
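The null handling in compare sorts nulls before non-nulls when ascending, treats two nulls as equal, and flips the result for a descending whole-tuple sort. A quick sketch of that ordering, with Integers standing in for tuples:

import java.util.ArrayList;
import java.util.List;

public class NullOrderSketch {
  static int compare(Integer a, Integer b, boolean asc) {
    if (a != null && b != null) {
      return a.compareTo(b); // the asc/desc flip for non-nulls lives in
    }                        // compareTuple and is not shown here
    if (a == null && b == null) {
      return 0; // for sorting purposes two nulls are equal
    }
    int rc = (a == null) ? -1 : 1; // nulls first when ascending
    return asc ? rc : -rc;         // nulls last when descending
  }

  public static void main(String[] args) {
    List<Integer> vals = new ArrayList<>(List.of(3, 1));
    vals.add(null);
    vals.sort((a, b) -> compare(a, b, true));
    System.out.println(vals); // [null, 1, 3]
  }
}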
Example #6
    @Override
    public void collect(Context oc, Tuple tuple) throws InterruptedException, IOException {

      int tupleKeyIdx = 2;
      int tupleValIdx = 3;

      Byte index = (Byte) tuple.get(0);
      Integer partitionIndex = -1;
      // for the partitioning table, the partition index isn't present,
      // so the key and value sit one position earlier in the tuple
      if (tuple.size() == 3) {
        tupleKeyIdx--;
        tupleValIdx--;
      } else {
        partitionIndex = (Integer) tuple.get(1);
      }

      PigNullableWritable key =
          HDataType.getWritableComparableTypes(tuple.get(tupleKeyIdx), keyType);

      NullablePartitionWritable wrappedKey = new NullablePartitionWritable(key);

      NullableTuple val = new NullableTuple((Tuple) tuple.get(tupleValIdx));

      // Both the key and the value need the index.  The key needs it so
      // that it can be sorted on the index in addition to the key
      // value.  The value needs it so that POPackage can properly
      // assign the tuple to its slot in the projection.
      wrappedKey.setIndex(index);

      // set the partition
      wrappedKey.setPartition(partitionIndex);
      val.setIndex(index);
      oc.write(wrappedKey, val);
    }
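Downstream, a partitioner can route on the precomputed partition carried by the wrapped key, falling back to hashing when it is -1. This is a hypothetical, simplified partitioner for illustration, not Pig's actual SkewedPartitioner:

public class PartitionSketch {
  static int partitionFor(int storedPartition, int keyHash, int numReducers) {
    if (storedPartition >= 0) {
      return storedPartition % numReducers; // honor the precomputed partition
    }
    return (keyHash & Integer.MAX_VALUE) % numReducers; // -1: fall back to hashing
  }

  public static void main(String[] args) {
    System.out.println(partitionFor(-1, "k1".hashCode(), 4)); // routed by hash
    System.out.println(partitionFor(2, "k1".hashCode(), 4));  // forced to partition 2
  }
}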
Example #7
  /**
   * Calls getNext to get the next ForEach result. The input for POJoinPackage is a (key,
   * NullableTuple) pair. We materialize the first n-1 inputs into bags and feed input#n to the
   * delegated ForEach operator one tuple at a time, so each ForEach input is
   *
   * <p>(input#1, input#2, input#3, ... input#n[i]), i = (1..k), where input#n consists of k
   * tuples.
   *
   * <p>For every ForEach input, we pull all the results from ForEach. getNext will be called
   * multiple times for a particular input; it returns one output tuple from ForEach per call, so
   * we need to maintain internal state to keep track of where we are.
   */
  @Override
  public Result getNext(Tuple t) throws ExecException {

    if (firstTime) {
      firstTime = false;
      if (PigMapReduce.sJobConf != null) {
        String bagType = PigMapReduce.sJobConf.get("pig.cachedbag.type");
        if (bagType != null && bagType.equalsIgnoreCase("default")) {
          useDefaultBag = true;
        }
      }
    }
    // if a previous call to foreach.getNext()
    // has still not returned all output, process it
    if (forEach.processingPlan) {
      forEachResult = forEach.getNext(t1);
      switch (forEachResult.returnStatus) {
        case POStatus.STATUS_OK:
        case POStatus.STATUS_NULL:
        case POStatus.STATUS_ERR:
          return forEachResult;
        case POStatus.STATUS_EOP:
          break;
      }
    }

    NullableTuple it = null;

    // If we see a new key, materialize n-1 inputs and construct the ForEach input
    // tuple res = (key, input#1, input#2, ... input#n); the only missing value is input#n,
    // which we will get one tuple at a time, fill into res, and feed to ForEach.
    // After this block, we have the first tuple of input#n in hand (kept in variable it)
    if (newKey) {
      lastInputTuple = false;
      // Put n-1 inputs into bags
      dbs = new DataBag[numInputs];
      for (int i = 0; i < numInputs - 1; i++) {
        dbs[i] =
            useDefaultBag
                ? BagFactory.getInstance().newDefaultBag()
                // In a very rare case if there is a POStream after this
                // POJoinPackage in the pipeline and is also blocking the pipeline;
                // constructor argument should be 2 * numInputs. But for one obscure
                // case we don't want to pay the penalty all the time.
                : new InternalCachedBag(numInputs - 1);
      }
      // For last bag, we always use NonSpillableBag.
      dbs[lastBagIndex] = new NonSpillableDataBag((int) chunkSize);

      // For each Nullable tuple in the input, put it
      // into the corresponding bag based on the index,
      // except for the last input, which we will stream
      // The tuples will arrive in the order of the index,
      // starting from index 0 and such that all tuples for
      // a given index arrive before a tuple for the next
      // index does.
      while (tupIter.hasNext()) {
        it = tupIter.next();
        int itIndex = it.getIndex();
        if (itIndex != numInputs - 1) {
          dbs[itIndex].add(getValueTuple(it, itIndex));
        } else {
          lastInputTuple = true;
          break;
        }
        if (reporter != null) reporter.progress();
      }
      // If we don't have any tuple for input#n,
      // no further processing is needed; return EOP
      if (!lastInputTuple) {
        // we will return at this point because we ought
        // to be having a flatten on this last input
        // and we have an empty bag which should result
        // in this key being taken out of the output
        newKey = true;
        return eopResult;
      }

      res = mTupleFactory.newTuple(numInputs + 1);
      for (int i = 0; i < dbs.length; i++) res.set(i + 1, dbs[i]);

      res.set(0, key);
      // if we have an inner anywhere and the corresponding
      // bag is empty, we can just return
      for (int i = 0; i < dbs.length - 1; i++) {
        if (inner[i] && dbs[i].size() == 0) {
          detachInput();
          return eopResult;
        }
      }
      newKey = false;

      // set up the bag with last input to contain
      // a chunk of CHUNKSIZE values OR the entire bag if
      // it has less than CHUNKSIZE values - the idea is in most
      // cases the values are > CHUNKSIZE in number and in
      // those cases we will be sending the last bag
      // as a set of smaller chunked bags, thus holding less
      // in memory

      // the first tuple can be directly retrieved from "it"
      dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
      for (int i = 0; i < chunkSize - 1 && tupIter.hasNext(); i++) {
        it = tupIter.next();
        dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
      }

      // Attach the input to forEach
      forEach.attachInput(res);

      // pull output tuple from ForEach
      Result forEachResult = forEach.getNext(t1);
      switch (forEachResult.returnStatus) {
        case POStatus.STATUS_OK:
        case POStatus.STATUS_NULL:
        case POStatus.STATUS_ERR:
          return forEachResult;
        case POStatus.STATUS_EOP:
          break;
      }
    }

    // Keep attaching input tuples to ForEach until either:
    // 1. ForEach.getNext() returns a result, or
    // 2. there is no more of input#n left
    while (true) {
      if (tupIter.hasNext()) {
        // try setting up a bag of CHUNKSIZE OR
        // the remainder of the bag of last input
        // (if < CHUNKSIZE) to foreach
        dbs[lastBagIndex].clear(); // clear last chunk
        for (int i = 0; i < chunkSize && tupIter.hasNext(); i++) {
          it = tupIter.next();
          dbs[lastBagIndex].add(getValueTuple(it, it.getIndex()));
        }
      } else {
        // if we do not have any more tuples for input#n, return EOP
        detachInput();
        newKey = true;
        return eopResult;
      }
      // Attach the input to forEach
      forEach.attachInput(res);

      // pull output tuple from ForEach
      Result forEachResult = forEach.getNext(t1);
      switch (forEachResult.returnStatus) {
        case POStatus.STATUS_OK:
        case POStatus.STATUS_NULL:
        case POStatus.STATUS_ERR:
          return forEachResult;
        case POStatus.STATUS_EOP:
          break;
      }
    }
  }
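Stripped of the operator plumbing, the chunking loop above reduces to: refill a bounded buffer from the iterator, process it, repeat. A self-contained sketch, with a plain List standing in for the chunk bag:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.stream.IntStream;

public class ChunkedStreamSketch {
  public static void main(String[] args) {
    Iterator<Integer> tupIter = IntStream.rangeClosed(1, 7).boxed().iterator();
    int chunkSize = 3;
    List<Integer> chunk = new ArrayList<>(chunkSize);
    while (tupIter.hasNext()) {
      chunk.clear(); // clear last chunk, as in the loop above
      for (int i = 0; i < chunkSize && tupIter.hasNext(); i++) {
        chunk.add(tupIter.next());
      }
      // the delegated operator would consume the chunk here
      System.out.println(chunk); // [1, 2, 3], then [4, 5, 6], then [7]
    }
  }
}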
Example #8
  private void runTest(Object key, boolean[] inner, byte keyType)
      throws ExecException, IOException {
    Random r = new Random();
    DataBag db1 = GenRandomData.genRandSmallTupDataBag(r, 10, 100);
    DataBag db2 = GenRandomData.genRandSmallTupDataBag(r, 10, 100);
    List<NullableTuple> db = new ArrayList<NullableTuple>(200);
    Iterator<Tuple> db1Iter = db1.iterator();
    if (!inner[0]) {
      while (db1Iter.hasNext()) {
        NullableTuple it = new NullableTuple(db1Iter.next());
        it.setIndex((byte) 0);
        db.add(it);
      }
    }
    Iterator<Tuple> db2Iter = db2.iterator();
    while (db2Iter.hasNext()) {
      NullableTuple it = new NullableTuple(db2Iter.next());
      it.setIndex((byte) 1);
      db.add(it);
    }
    POPackage pop = new POPackage(new OperatorKey("", r.nextLong()));
    pop.setNumInps(2);
    pop.getPkgr().setInner(inner);
    PigNullableWritable k = HDataType.getWritableComparableTypes(key, keyType);
    pop.attachInput(k, db.iterator());
    if (keyType != DataType.BAG) {
      // test serialization
      NullablePartitionWritable wr;
      if (keyType == DataType.TUPLE) {
        BinSedesTuple tup =
            (BinSedesTuple) binfactory.newTupleNoCopy(((Tuple) k.getValueAsPigType()).getAll());
        wr = new NullablePartitionWritable(new NullableTuple(tup));
      } else {
        wr = new NullablePartitionWritable(k);
      }
      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      DataOutputStream out = new DataOutputStream(baos);
      wr.write(out);
      byte[] arr = baos.toByteArray();
      ByteArrayInputStream bais = new ByteArrayInputStream(arr);
      DataInputStream in = new DataInputStream(bais);
      NullablePartitionWritable re = new NullablePartitionWritable();
      re.readFields(in);
      assertEquals(re, wr);
    }

    // we are not doing any optimization to remove
    // parts of the "value" which are present in the "key" in this
    // unit test - so set up the "keyInfo" accordingly in
    // the POPackage
    Map<Integer, Pair<Boolean, Map<Integer, Integer>>> keyInfo =
        new HashMap<Integer, Pair<Boolean, Map<Integer, Integer>>>();
    Pair<Boolean, Map<Integer, Integer>> p =
        new Pair<Boolean, Map<Integer, Integer>>(false, new HashMap<Integer, Integer>());
    keyInfo.put(0, p);
    keyInfo.put(1, p);
    pop.getPkgr().setKeyInfo(keyInfo);
    Result res = pop.getNextTuple();
    if (res.returnStatus == POStatus.STATUS_NULL && inner[0]) return;
    assertEquals(POStatus.STATUS_OK, res.returnStatus);

    Tuple t = (Tuple) res.result;
    Object outKey = t.get(0);
    DataBag outDb1 = (DataBag) t.get(1);
    DataBag outDb2 = (DataBag) t.get(2);

    assertEquals(key, outKey);
    assertTrue(TestHelper.compareBags(db1, outDb1));
    assertTrue(TestHelper.compareBags(db2, outDb2));
  }
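The serialization check in the middle of the test is the standard Writable round trip: write to a byte array, read back into a fresh instance, and compare. Factored into a reusable helper (assuming only Hadoop's Writable contract), it might look like:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class RoundTripSketch {
  // Serializes `in`, deserializes into `out`, and returns `out`; the two
  // should compare equal if write() and readFields() are symmetric.
  static <T extends Writable> T roundTrip(T in, T out) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    in.write(new DataOutputStream(baos));
    out.readFields(new DataInputStream(new ByteArrayInputStream(baos.toByteArray())));
    return out;
  }
}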
Example #9
  /**
   * From the inputs, constructs the output tuple for this co-group in the required format which is
   * (key, {bag of tuples from input 1}, {bag of tuples from input 2}, ...)
   */
  @Override
  public Result getNext(Tuple t) throws ExecException {
    Tuple res;

    if (firstTime) {
      firstTime = false;
      if (PigMapReduce.sJobConf != null) {
        String bagType = PigMapReduce.sJobConf.get("pig.cachedbag.type");
        if (bagType != null && bagType.equalsIgnoreCase("default")) {
          useDefaultBag = true;
        }
      }
    }

    if (distinct) {
      // for distinct, the key already holds the
      // whole tuple, so emit just the key
      res = mTupleFactory.newTuple(1);
      res.set(0, key);
    } else {
      // Create numInputs bags
      DataBag[] dbs = new DataBag[numInputs];

      if (isAccumulative()) {
        // create bag wrapper to pull tuples in many batches
        // all bags have reference to the sample tuples buffer
        // which contains tuples from one batch
        POPackageTupleBuffer buffer = new POPackageTupleBuffer();
        for (int i = 0; i < numInputs; i++) {
          dbs[i] = new AccumulativeBag(buffer, i);
        }

      } else {
        // create bag to pull all tuples out of iterator
        for (int i = 0; i < numInputs; i++) {
          dbs[i] =
              useDefaultBag
                  ? BagFactory.getInstance().newDefaultBag()
                  // In a very rare case if there is a POStream after this
                  // POPackage in the pipeline and is also blocking the pipeline;
                  // constructor argument should be 2 * numInputs. But for one obscure
                  // case we don't want to pay the penalty all the time.
                  : new InternalCachedBag(numInputs);
        }
        // For each indexed tuple in the input, sort it
        // into its corresponding bag based
        // on the index
        while (tupIter.hasNext()) {
          NullableTuple ntup = tupIter.next();
          int index = ntup.getIndex();
          Tuple copy = getValueTuple(ntup, index);

          if (numInputs == 1) {

            // this is for multi-query merge where
            // the numInputs is always 1, but the index
            // (the position of the inner plan in the
            // enclosed operator) may not be 0.
            dbs[0].add(copy);
          } else {
            dbs[index].add(copy);
          }
          if (reporter != null) reporter.progress();
        }
      }

      // Construct the output tuple by appending
      // the key and all the above constructed bags
      // and return it.
      res = mTupleFactory.newTuple(numInputs + 1);
      res.set(0, key);
      for (int i = 0; i < dbs.length; i++) {
        if (inner[i] && !isAccumulative() && dbs[i].size() == 0) {
          detachInput();
          Result r = new Result();
          r.returnStatus = POStatus.STATUS_NULL;
          return r;
        }

        res.set(i + 1, dbs[i]);
      }
    }
    detachInput();
    Result r = new Result();
    r.result = res;
    r.returnStatus = POStatus.STATUS_OK;
    return r;
  }
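In the accumulative branch above, every bag holds a reference to one shared buffer, so a single batch refill feeds all inputs without copying. A minimal sketch of that sharing; Buffer and BagView are hypothetical stand-ins for POPackageTupleBuffer and AccumulativeBag:

import java.util.ArrayList;
import java.util.List;

public class SharedBufferSketch {
  static final class Buffer {
    final List<List<String>> slots = new ArrayList<>(); // one slot per input index

    Buffer(int numInputs) {
      for (int i = 0; i < numInputs; i++) slots.add(new ArrayList<>());
    }
  }

  static final class BagView {
    private final Buffer buffer;
    private final int index;

    BagView(Buffer buffer, int index) {
      this.buffer = buffer;
      this.index = index;
    }

    List<String> contents() {
      return buffer.slots.get(index); // a live view of the shared batch
    }
  }

  public static void main(String[] args) {
    Buffer buffer = new Buffer(2);
    BagView left = new BagView(buffer, 0);
    BagView right = new BagView(buffer, 1);
    buffer.slots.get(0).add("t0"); // one batch arrives in the shared buffer
    buffer.slots.get(1).add("t1");
    System.out.println(left.contents() + " " + right.contents()); // [t0] [t1]
  }
}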