Пример #1
1
  /**
   * Returns the float adjacent to the argument in the direction of positive infinity, as computed
   * by {@link Math#nextUp(float)}.
   *
   * @param input tuple whose first field is expected to be a {@code Float}
   * @return the next-up value of the first field, or {@code null} when the tuple is null, empty, or
   *     its first field is null
   * @throws IOException if the first field cannot be read or is not a {@code Float}
   */
  public Float exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
      return null;
    }
    Float value;
    try {
      value = (Float) input.get(0);
    } catch (Exception e) {
      throw new IOException("Caught exception processing input row ", e);
    }

    // A null field would otherwise raise a NullPointerException on auto-unboxing below.
    if (value == null) {
      return null;
    }
    return Math.nextUp(value.floatValue());
  }
Пример #2
1
 /**
  * Returns the first floating-point argument with the sign of the second, as computed by
  * {@link Math#copySign(float, float)}.
  *
  * @param input tuple whose first two fields are expected to be {@code Float} values
  * @return the magnitude of the first field combined with the sign of the second, or {@code null}
  *     when the tuple is null, has fewer than two fields, or either field is null
  * @throws IOException if a field cannot be read or is not a {@code Float}
  */
 @Override
 public Float exec(Tuple input) throws IOException {
   if (input == null || input.size() < 2) {
     return null;
   }
   try {
     Float magnitude = (Float) input.get(0);
     Float sign = (Float) input.get(1);
     // Null fields yield a null result instead of failing on auto-unboxing.
     if (magnitude == null || sign == null) {
       return null;
     }
     return Math.copySign(magnitude.floatValue(), sign.floatValue());
   } catch (Exception e) {
     throw WrappedIOException.wrap("Caught exception processing input row ", e);
   }
 }
 /**
  * Compares two tuples, first by field count and then field by field using
  * {@code DataType.compare}.
  *
  * <p>A tuple with more fields sorts after one with fewer. For equally sized tuples the first
  * non-equal field decides the result; the sign is flipped when the governing entry of
  * {@code mAsc} is false ({@code mAsc[0]} when {@code mWholeTuple} is set, otherwise the entry for
  * that field). As a side effect, {@code mHasNullField} is set whenever a compared field is null.
  *
  * @param t1 left-hand tuple
  * @param t2 right-hand tuple
  * @return negative, zero, or positive according to the ordering described above
  * @throws RuntimeException if a field cannot be read from either tuple
  */
 private int compareTuple(Tuple t1, Tuple t2) {
   int leftSize = t1.size();
   int rightSize = t2.size();
   if (leftSize != rightSize) {
     return leftSize > rightSize ? 1 : -1;
   }

   for (int field = 0; field < leftSize; field++) {
     int cmp;
     try {
       Object left = t1.get(field);
       Object right = t2.get(field);
       if (left == null || right == null) {
         mHasNullField = true;
       }
       cmp = DataType.compare(left, right);
     } catch (ExecException e) {
       throw new RuntimeException("Unable to compare tuples", e);
     }
     if (cmp != 0) {
       // Pick the sort direction that applies to this comparison.
       boolean ascending = mWholeTuple ? mAsc[0] : mAsc[field];
       return ascending ? cmp : cmp * -1;
     }
   }
   return 0;
 }
Пример #4
0
  /**
   * Builds a map from a tuple laid out as alternating key/value fields (key at even positions,
   * value at the following odd position).
   *
   * <p>Keys are converted with {@code toString()}. Pairs whose key field is null are skipped, and
   * a trailing key with no matching value is ignored.
   *
   * @param input tuple of alternating keys and values; may be null or empty
   * @return the resulting map, empty when the input is null or has no fields
   * @throws IOException (as an {@code ExecException} with error code 2106) if reading the tuple
   *     fails
   */
  private HashMap<String, Object> createMap(Tuple input) throws IOException {

    try {

      HashMap<String, Object> map = new HashMap<String, Object>();

      if (input == null || input.size() == 0) {
        return map; // an empty map
      }

      final int fieldCount = input.size();
      // Only visit positions that have a value right after the key.
      for (int i = 0; i + 1 < fieldCount; i += 2) {
        Object rawKey = input.get(i);
        // Test for null BEFORE calling toString(); the previous code dereferenced the field
        // first, so its null check could never trigger and a null key aborted the whole map.
        if (rawKey == null) {
          continue;
        }
        String key = rawKey.toString();
        if (key != null) {
          map.put(key, input.get(i + 1));
        }
      }

      return map;

    } catch (Exception e) {

      int errCode = 2106;
      String msg = "Error while creating map with" + this.getClass().getSimpleName();
      throw new ExecException(msg, errCode, PigException.BUG, e);
    }
  }
Пример #5
0
  /**
   * Produces a flat tuple from {@code inTuple}, expanding the columns marked as "flatten" into
   * their nested fields and copying all other columns unchanged.
   *
   * <p>The output tuple is sized from {@code outSchema}. For a flatten column the number of output
   * slots comes from {@code inputColumnIndexToOutputTypes}; when the nested tuple is null or empty
   * those slots are filled with nulls.
   *
   * @param inTuple tuple to flatten
   * @return the flattened tuple
   * @throws IOException if a field cannot be read or written
   * @throws RuntimeException if fewer fields were written than the output schema declares
   */
  public Tuple tupleFlatten(Tuple inTuple) throws IOException {
    final int expectedFields = outSchema.getNumColumns();

    // TODO: can we reuse tuple?
    Tuple flattened = TupleFactory.getInstance().newTuple(expectedFields);

    // Next free slot in the output tuple.
    int writePos = 0;
    for (int colId = 0; colId < inTuple.size(); colId++) {
      Object field = inTuple.get(colId);

      if (isFlattenTuple(flattenPositions.get(colId))) {
        // Flatten column: spread the nested tuple over the declared number of slots.
        Tuple nested = (Tuple) field;
        int width = this.inputColumnIndexToOutputTypes.get(colId).size();
        boolean noValues = field == null || nested.size() == 0;
        for (int i = 0; i < width; i++) {
          flattened.set(writePos++, noValues ? null : nested.get(i));
        }
      } else {
        // Regular column: carry the object over as-is.
        flattened.set(writePos++, field);
      }
    }

    if (writePos < expectedFields) {
      throw new RuntimeException(
          String.format(
              "FlattenTuple: found fewer fields than expected=%d, found=%d",
              expectedFields,
              writePos));
    }
    return flattened;
  }
Пример #6
0
  /**
   * Checks whether two tuple bags hold the same tuples. Both bags are copied into sorted bags and
   * then walked in lockstep, comparing tuple sizes and each field pairwise.
   *
   * <p>Fields are compared null-safely: two null fields are considered equal, and a null field
   * never equals a non-null one.
   *
   * @param bag1 first bag to compare
   * @param bag2 second bag to compare
   * @return {@code true} if both bags contain the same number of tuples and, in sorted order, each
   *     pair of tuples has equal fields; {@code false} otherwise
   * @throws ExecException if a tuple field cannot be read
   */
  private boolean compareBags(DefaultDataBag bag1, DefaultDataBag bag2) throws ExecException {
    SortedDataBag sortedBag1 = new SortedDataBag(null);
    SortedDataBag sortedBag2 = new SortedDataBag(null);
    sortedBag1.addAll(bag1);
    sortedBag2.addAll(bag2);
    Iterator<Tuple> bag1Iter = sortedBag1.iterator();
    Iterator<Tuple> bag2Iter = sortedBag2.iterator();

    while (bag1Iter.hasNext()) {
      if (!bag2Iter.hasNext()) {
        return false;
      }
      Tuple t1 = bag1Iter.next();
      Tuple t2 = bag2Iter.next();
      if (t1.size() != t2.size()) {
        return false;
      }
      for (int i = 0; i < t1.size(); i++) {
        Object left = t1.get(i);
        Object right = t2.get(i);
        // Guard against null fields, which previously caused a NullPointerException.
        boolean same = (left == null) ? (right == null) : left.equals(right);
        if (!same) {
          return false;
        }
      }
    }
    // Any leftover tuple in the second bag means the bags differ in size.
    return !bag2Iter.hasNext();
  }
  /**
   * Feeds a serialized sketch containing "a" twice and "b" once through
   * {@code FrequentStringsSketchToEstimates} and checks the two returned four-field tuples, item
   * first and three equal counts after it.
   */
  @Test
  public void exact() throws Exception {
    EvalFunc<DataBag> udf = new FrequentStringsSketchToEstimates();
    ItemsSketch<String> itemsSketch = new ItemsSketch<String>(8);
    for (String item : new String[] {"a", "a", "b"}) {
      itemsSketch.update(item);
    }
    Tuple udfInput =
        PigUtil.objectsToTuple(
            new DataByteArray(itemsSketch.toByteArray(new ArrayOfStringsSerDe())));

    DataBag estimates = udf.exec(udfInput);
    Assert.assertNotNull(estimates);
    Assert.assertEquals(estimates.size(), 2);

    // Expected rows, in the order the bag yields them.
    String[] expectedItems = {"a", "b"};
    long[] expectedCounts = {2L, 1L};

    Iterator<Tuple> rows = estimates.iterator();
    for (int k = 0; k < expectedItems.length; k++) {
      Tuple row = rows.next();
      Assert.assertEquals(row.size(), 4);
      Assert.assertEquals((String) row.get(0), expectedItems[k]);
      Assert.assertEquals((long) row.get(1), expectedCounts[k]);
      Assert.assertEquals((long) row.get(2), expectedCounts[k]);
      Assert.assertEquals((long) row.get(3), expectedCounts[k]);
    }
  }
Пример #8
0
  /**
   * Emits the tuples of the first input bag that do not also appear in any of the other input
   * bags.
   *
   * <p>The bags are merged through a priority queue built by {@code loadBags}; each queue entry
   * exposes the tuple currently at the head of one bag together with the index of that bag.
   * NOTE(review): the merge presumably relies on every input bag already being sorted — confirm
   * against the callers of this function.
   *
   * @param input tuple of at least two fields, each either null or a {@code DataBag}; it is
   *     dereferenced without a null check
   * @return an empty bag when the first bag is null or empty; the first bag itself (not a copy)
   *     when there are exactly two inputs and the second is null or empty; otherwise a new bag
   *     with the surviving tuples of the first bag
   * @throws IOException declared by the interface and propagated from the helpers
   * @throws RuntimeException if fewer than two inputs are given or a non-null input is not a bag
   */
  @SuppressWarnings("unchecked")
  @Override
  public DataBag exec(Tuple input) throws IOException {
    if (input.size() < 2) {
      throw new RuntimeException("Expected at least two inputs, but found " + input.size());
    }

    // Every non-null field must be a bag; nulls are tolerated here.
    for (Object o : input) {
      if (o != null && !(o instanceof DataBag)) {
        throw new RuntimeException("Inputs must be bags");
      }
    }

    DataBag outputBag = bagFactory.newDefaultBag();

    DataBag bag1 = (DataBag) input.get(0);
    DataBag bag2 = (DataBag) input.get(1);

    // Nothing to subtract from: the result is empty.
    if (bag1 == null || bag1.size() == 0) {
      return outputBag;
    }
    // optimization: with a single, empty subtrahend the first bag is returned unchanged
    else if (input.size() == 2 && (bag2 == null || bag2.size() == 0)) {
      return bag1;
    }

    PriorityQueue<Pair> pq = loadBags(input);

    // Last tuple that was written to the output; null until the first emission.
    Tuple lastData = null;

    while (true) {
      // Inspect the head of the queue without removing it, so countMatches still sees it.
      Pair nextPair = pq.peek();

      // ignore data we've already encountered
      if (nextPair.data.compareTo(lastData) != 0) {
        // Only take data from the first bag, where there are no other
        // bags that have the same data.
        if (nextPair.index.equals(0) && countMatches(pq) == 0) {
          outputBag.add(nextPair.data);
          lastData = nextPair.data;
        }
      }

      Pair p = pq.poll();

      // only put the bag back into the queue if it still has data
      if (p.hasNext()) {
        p.next();
        pq.offer(p);
      } else if (p.index.equals(0)) {
        // stop when we exhaust all elements from the first bag
        break;
      }
    }

    return outputBag;
  }
Пример #9
0
  /**
   * Builds a string from fields 1..n of the tuple, passing each one to {@code cat} together with
   * the delimiter taken from field 0.
   *
   * @param input tuple whose first field is the delimiter and whose remaining fields are the
   *     values to combine
   * @return the combined string, or {@code null} when the tuple is null, has fewer than two
   *     fields, or the delimiter field is null
   * @throws IOException if a field cannot be read
   */
  public String exec(Tuple input) throws IOException {
    if (input == null || input.size() < 2) {
      return null;
    }

    Object delimField = input.get(0);
    // A null delimiter used to fail with a NullPointerException on toString().
    if (delimField == null) {
      return null;
    }
    String delim = delimField.toString();

    StringBuilder sb = new StringBuilder();
    for (int i = 1; i < input.size(); i++) {
      cat(sb, input.get(i), delim);
    }

    return sb.toString();
  }
Пример #10
0
 /**
  * Converts a tuple into the argument array for the target method, casting each field to the
  * boxed form of the corresponding declared parameter class.
  *
  * @param t tuple of arguments; may be null only when no parameter classes are declared
  * @return the argument array, or {@code null} when both the tuple and the declared parameter
  *     classes are null
  * @throws ExecException if the number of fields does not match the declared signature or a field
  *     cannot be read
  */
 private Object[] tupleToArgs(Tuple t) throws ExecException {
   if (t == null) {
     if (paramClasses_ != null) {
       throw new ExecException("unable to match function arguments to declared signature.");
     }
     return null;
   }

   // A missing signature is treated as "no parameters"; the previous code dereferenced
   // paramClasses_ unconditionally here and failed with a NullPointerException.
   int expected = (paramClasses_ == null) ? 0 : paramClasses_.length;
   if (t.size() != expected) {
     throw new ExecException("unable to match function arguments to declared signature.");
   }

   Object[] args = new Object[expected];
   for (int i = 0; i < expected; i++) {
     args[i] = unPrimitivize(paramClasses_[i]).cast(t.get(i));
   }
   return args;
 }
Пример #11
0
  /**
   * Wraps each non-empty bag of the input tuple in a {@link Pair} and collects the pairs in a
   * priority queue.
   *
   * <p>Null fields and bags without any tuple are skipped. Each {@link Pair} is created from the
   * bag's iterator and the position of the bag within the tuple; the queue order is whatever
   * {@link Pair}'s natural ordering defines.
   *
   * @param input tuple whose non-null fields are {@code DataBag}s
   * @return priority queue holding one entry per non-empty bag
   * @throws IOException if a field cannot be read
   */
  private PriorityQueue<Pair> loadBags(Tuple input) throws IOException {
    final int bagCount = input.size();
    PriorityQueue<Pair> queue = new PriorityQueue<Pair>(bagCount);

    for (int idx = 0; idx < bagCount; idx++) {
      Object field = input.get(idx);
      if (field == null) {
        continue;
      }
      Iterator<Tuple> tuples = ((DataBag) field).iterator();
      if (tuples.hasNext()) {
        queue.add(new Pair(tuples, idx));
      }
    }
    return queue;
  }
  /**
   * Builds the initial accumulator tuple for one row.
   *
   * <p>The row is the first tuple of the bag in field 0 when that field is a {@code DataBag},
   * otherwise the input tuple itself. For every column pair (i, j) with i &lt; j, six consecutive
   * slots of the accumulator are updated: count, x, y, x*x, y*y and x*y, where x and y are the
   * double values of columns i and j. The count is incremented even when one of the two values is
   * null; the remaining five slots are then left untouched.
   *
   * @param input input tuple, or a tuple wrapping a bag of rows in field 0
   * @return the accumulator tuple, or {@code null} when {@code input} is null
   * @throws ExecException if a field cannot be read or converted
   */
  private static Tuple buildInitialTupleForTheRow(Tuple input) throws ExecException {
    if (null == input) {
      return null;
    }

    Tuple row = input;
    if (input.get(0) instanceof DataBag) {
      DataBag values = (DataBag) input.get(0);
      Iterator<Tuple> it = values.iterator();
      row = it.next();
    }
    final int columnCount = row.size();

    Tuple vaTuple = initTuple(columnCount);

    // Running index of the (i, j) pair; each pair owns six consecutive accumulator slots.
    int pair = 0;
    for (int i = 0; i < columnCount; i++) {
      for (int j = i + 1; j < columnCount; j++) {
        final int base = 6 * pair;
        pair++;

        // Jeff: to fix pivotal41573093: although x or y is null, we can calculate the count.
        increaseTheValueOfElInTheTupleBy(vaTuple, base, 1);

        Object rawX = row.get(i);
        Object rawY = row.get(j);
        if (null == rawX || null == rawY) {
          continue;
        }

        Double x = DataType.toDouble(rawX);
        Double y = DataType.toDouble(rawY);

        increaseTheValueOfElInTheTupleBy(vaTuple, base + 1, x); // sum of x
        increaseTheValueOfElInTheTupleBy(vaTuple, base + 2, y); // sum of y
        increaseTheValueOfElInTheTupleBy(vaTuple, base + 3, x * x); // sum of x*x
        increaseTheValueOfElInTheTupleBy(vaTuple, base + 4, y * y); // sum of y*y
        increaseTheValueOfElInTheTupleBy(vaTuple, base + 5, x * y); // sum of x*y
      }
    }

    return vaTuple;
  }
  /**
   * Looks up the integer id registered for a document id.
   *
   * <p>The document index is loaded from the path in field 0 the first time it is needed, i.e.
   * while {@code documentIndex} is still null.
   *
   * @param input tuple of (document index path, document id)
   * @return the value stored in the document index for the given document id, or {@code null}
   *     when the tuple is null or empty
   * @throws IOException if the tuple does not have exactly two fields or a field cannot be read
   */
  @Override
  public Integer exec(Tuple input) throws IOException {
    final int arity = (input == null) ? 0 : input.size();
    if (arity == 0) {
      return null;
    }
    if (arity != 2) {
      throw new IOException("ConvertDocumentIDToID requires 2 parameters");
    }

    String indexPath = (String) input.get(0);
    if (documentIndex == null) {
      // Lazy initialisation on first use.
      loadDocumentIndex(indexPath);
    }
    String documentId = (String) input.get(1);
    return documentIndex.get(documentId);
  }
Пример #14
0
  /**
   * Wraps the input tuple in a bag.
   *
   * <p>When {@code fieldType} is {@code DataType.MAP}, the input is first converted with
   * {@code createMap} and the bag holds a single one-field tuple containing that map; otherwise
   * the input tuple itself is added to the bag.
   *
   * @param input tuple to wrap; may be null or empty
   * @return a bag with one tuple, or an empty bag when the input is null or empty
   * @throws IOException declared by the interface
   * @throws RuntimeException if anything fails while building the bag; the original exception is
   *     attached as the cause
   */
  @Override
  public DataBag exec(Tuple input) throws IOException {
    try {

      DataBag bag = DefaultBagFactory.getInstance().newDefaultBag();

      if (input == null || input.size() == 0) {
        return bag; // an empty bag
      }
      if (this.fieldType == DataType.MAP) {

        Tuple t = DefaultTupleFactory.getInstance().newTuple(1);
        t.set(0, createMap(input));

        bag.add(t);

      } else {
        bag.add(input);
      }

      return bag;

    } catch (Exception e) {
      // Keep the cause so the original failure is not lost from the stack trace.
      throw new RuntimeException(
          "Error while computing size in " + this.getClass().getSimpleName(), e);
    }
  }
Пример #15
0
  /**
   * Normalizes the string in field 0 and splits it into tokens, returned as a bag of one-field
   * tuples.
   *
   * <p>The text is trimmed and lower-cased, every match of {@code punctPattern} and then of
   * {@code spacePattern} is replaced by a single space, and the result is split on
   * {@code spacePattern}. Tokens longer than 30 characters are dropped.
   *
   * @param input tuple whose first field is the string to tokenize
   * @return bag of tokens, or {@code null} when the tuple is null or empty or the first field is
   *     null
   * @throws IOException if the field cannot be read or a tuple cannot be populated
   */
  public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
      return null;
    }

    String raw = (String) input.get(0);
    if (raw == null) {
      return null;
    }

    // Remove punctuation except when it's a version number
    String lowered = raw.trim().toLowerCase();
    String withoutPunct = punctPattern.matcher(lowered).replaceAll(" ");
    String collapsed = spacePattern.matcher(withoutPunct).replaceAll(" ");

    DataBag output = bagFactory.newDefaultBag();
    String[] tokens = spacePattern.split(collapsed.trim());
    for (String token : tokens) {
      if (token.length() > 30) {
        continue;
      }
      Tuple t = tupleFactory.newTuple(1);
      t.set(0, token);
      output.add(t);
    }

    return output;
  }
Пример #16
0
  /**
   * Converts a chararray field into a 14-digit long.
   *
   * <p>The characters {@code - + . ^ : ,} and spaces are stripped from the string. The remainder
   * is cut to its first 14 characters when longer, or right-padded with zeros when shorter, and
   * then parsed as a long.
   *
   * @param input tuple whose first field must be a chararray
   * @return the parsed value, or {@code null} when the tuple is null or empty
   * @throws IOException an {@code ExecException} raised while reading the tuple is rethrown
   *     as-is; any other failure (wrong field type, unparsable digits) is wrapped in an
   *     {@code ExecException} with error code 2107
   */
  @Override
  public Long exec(Tuple input) throws IOException {
    try {
      if (input == null || input.size() == 0) {
        return null;
      }
      if (input.getType(0) != DataType.CHARARRAY) {
        throw new RuntimeException(
            "Input type expected to be chararray but got: " + input.getType(0));
      }

      String digits = ((String) input.get(0)).replaceAll("[-+.^:, ]", "");

      String fourteen;
      if (digits.length() > 14) {
        fourteen = digits.substring(0, 14);
      } else if (digits.length() < 14) {
        // Left-justify in a 14-wide field, then turn the padding spaces into zeros.
        fourteen = String.format("%-14s", digits).replace(' ', '0');
      } else {
        fourteen = digits;
      }
      return Long.parseLong(fourteen);

    } catch (ExecException exp) {
      throw exp;
    } catch (Exception e) {
      int errCode = 2107;
      String msg = "Error while computing date_format in " + this.getClass().getSimpleName();
      throw new ExecException(msg, errCode, PigException.BUG, e);
    }
  }
Пример #17
0
  /**
   * Resolves the country name for the address in field 0 through {@code geo.getCountryName}.
   *
   * <p>The lookup is best-effort: any exception raised by it yields {@code null} rather than
   * failing the row.
   *
   * @param input tuple whose first field is the address string
   * @return the country name, or {@code null} when the input is null or empty, the first field is
   *     null, the lookup fails, or the lookup answers null, {@code "--"} or {@code "N/A"}
   * @throws IOException if the field cannot be read
   */
  @Override
  public String exec(Tuple input) throws IOException {
    boolean missing = input == null || input.size() == 0 || input.get(0) == null;
    if (missing) {
      return null;
    }

    String address = (String) input.get(0);

    try {
      String country = this.geo.getCountryName(address);
      // "--" and "N/A" are mapped to null, which is easier to handle in Pig.
      boolean unknown = country == null || "--".equals(country) || "N/A".equals(country);
      return unknown ? null : country;
    } catch (Exception e) {
      // Deliberately ignored: an unresolvable address maps to null.
      return null;
    }
  }
Пример #18
0
  /**
   * Creates a serialized event from Pig data.
   *
   * <p>A new instance of {@code eventClass} is created, then for each entry of {@code methods} the
   * associated method is invoked on it with the tuple field at the same position. A {@code null}
   * field means the method for that position is not called. The populated event is finally passed
   * to {@code serializer}.
   *
   * <p>Failures while instantiating the event or invoking a method (for example a type mismatch)
   * do not propagate: the stack trace is printed and {@code null} is returned.
   *
   * @param input tuple of values, one per entry of {@code methods} and in the same order
   * @return the serialized event, or {@code null} when the input is null, has fewer fields than
   *     there are methods, or the event could not be built
   * @throws IOException if a tuple field cannot be read
   */
  public DataByteArray exec(Tuple input) throws IOException {
    if (input == null || input.size() < methods.size()) return null;

    // create empty event object
    Object event;
    try {
      event = eventClass.newInstance();
    } catch (Exception e) {
      e.printStackTrace();
      return null;
    }

    // iterate through fields setting values
    for (int i = 0; i < methods.size(); i++) {
      Object value = input.get(i);
      if (value == null) {
        // null means "leave this property unset"
        continue;
      }

      MethodNamePair pair = methods.get(i);
      try {
        Method m = pair.method;
        m.invoke(event, value);
      } catch (Exception e) {
        e.printStackTrace();
        return null;
      }
    }

    // serialize event (the previously allocated, never-used output tuple has been removed)
    byte[] rawEvent = serializer.serialize(event);
    return new DataByteArray(rawEvent);
  }
Пример #19
0
  /**
   * Extracts one capture group of a regular expression from a string.
   *
   * <p>The compiled pattern is cached in {@code mPattern} and only recompiled when the expression
   * in field 1 differs from the cached {@code mExpression}. Depending on {@code mUseMatches} the
   * pattern must match the whole string ({@code matches}) or any part of it ({@code find}).
   *
   * @param input tuple of (string to search, regular expression, group index)
   * @return the requested group, or {@code null} when the string is null, the pattern does not
   *     match, or the pattern has fewer groups than the requested index (a warning is recorded in
   *     the latter two cases)
   * @throws IOException if the tuple does not have exactly three fields, the regular expression is
   *     null, or it cannot be compiled
   */
  @Override
  public String exec(Tuple input) throws IOException {
    if (input.size() != 3) {
      String msg = "RegexExtract : Only 3 parameters are allowed.";
      throw new IOException(msg);
    }
    if (input.get(0) == null) return null;

    Object regex = input.get(1);
    if (regex == null) {
      // Explicit check instead of catching NullPointerException for control flow.
      String msg = "RegexExtract : Regular expression is null";
      throw new IOException(msg);
    }
    if (!regex.equals(mExpression)) {
      try {
        String expression = (String) regex;
        // Compile first and update both cache fields only on success. Updating mExpression
        // before a failing compile would make the next call with the same bad expression skip
        // compilation and use a stale (or null) pattern.
        Pattern compiled = Pattern.compile(expression);
        mExpression = expression;
        mPattern = compiled;
      } catch (Exception e) {
        String msg = "RegexExtract : Mal-Formed Regular expression : " + regex;
        throw new IOException(msg, e);
      }
    }
    int mIndex = (Integer) input.get(2);

    Matcher m = mPattern.matcher((String) input.get(0));

    if (!mUseMatches && m.find() || mUseMatches && m.matches()) {
      if (m.groupCount() >= mIndex) {
        return m.group(mIndex);
      }
    }
    warn("RegexExtract : Cannot extract group for input " + input.get(0), PigWarning.UDF_WARNING_1);
    return null;
  }
Пример #20
0
  /**
   * Adds a period to a date-time.
   *
   * @param input tuple of ({@code DateTime}, period string accepted by the {@code Period}
   *     constructor)
   * @return the date-time shifted by the period, or {@code null} when the tuple is null, has fewer
   *     than two fields, or either field is null
   * @throws IOException if a field cannot be read
   */
  @Override
  public DateTime exec(Tuple input) throws IOException {
    if (input == null || input.size() < 2) {
      return null;
    }

    Object start = input.get(0);
    Object period = input.get(1);
    // Null operands produce a null result instead of a NullPointerException.
    if (start == null || period == null) {
      return null;
    }

    return ((DateTime) start).plus(new Period((String) period));
  }
  /**
   * Writes {@code MARKER} into the last field of the row when that field is null.
   *
   * <p>The row is the first tuple of the bag in field 0 when that field is a {@code DataBag},
   * otherwise the input tuple itself. A null input is ignored.
   *
   * @param input tuple to mark, or a tuple wrapping a bag of rows in field 0
   * @throws ExecException if a field cannot be read or written
   */
  private static void markTheTuple(Tuple input) throws ExecException {
    if (null == input) {
      return;
    }

    Tuple row = input;
    if (input.get(0) instanceof DataBag) {
      DataBag values = (DataBag) input.get(0);
      Iterator<Tuple> it = values.iterator();
      row = it.next();
    }

    if (null == row.get(row.size() - 1)) {
      row.set(row.size() - 1, MARKER);
    }
  }
Пример #22
0
  /**
   * Builds a fresh tuple holding the complete "value" for one input, re-inserting any fields that
   * are carried in the key rather than in the value.
   *
   * <p>Three cases are distinguished using the entry of {@code keyInfo} for {@code index}:
   *
   * <ul>
   *   <li>the key lookup map is non-empty: the result is assembled position by position, taking a
   *       field from the key when the position is mapped and from the value otherwise;
   *   <li>the map is empty and the "project star" flag is set: the result is a copy of all fields
   *       of {@code keyAsTuple};
   *   <li>otherwise: the result is a copy of all fields of the value.
   * </ul>
   *
   * @param ntup wrapper whose Pig value is cast to a {@code Tuple}
   * @param index position used to look up the key information in {@code keyInfo}
   * @return a new tuple with the stitched value
   * @throws ExecException if a field cannot be read
   */
  protected Tuple getValueTuple(NullableTuple ntup, int index) throws ExecException {
    // Need to make a copy of the value, as hadoop uses the same ntup
    // to represent each value.
    Tuple val = (Tuple) ntup.getValueAsPigType();

    Tuple copy = null;
    // The "value (val)" that we just got may not
    // be the complete "value". It may have some portions
    // in the "key" (look in POLocalRearrange for more comments)
    // If this is the case we need to stitch
    // the "value" together.
    Pair<Boolean, Map<Integer, Integer>> lrKeyInfo = keyInfo.get(index);
    boolean isProjectStar = lrKeyInfo.first;
    // Maps a position in the final value to the position of that field inside the key.
    Map<Integer, Integer> keyLookup = lrKeyInfo.second;
    int keyLookupSize = keyLookup.size();

    if (keyLookupSize > 0) {

      // we have some fields of the "value" in the
      // "key".
      copy = mTupleFactory.newTuple();
      // Total field count: fields living in the key plus fields still present in val.
      int finalValueSize = keyLookupSize + val.size();
      int valIndex = 0; // an index for accessing elements from
      // the value (val) that we have currently
      for (int i = 0; i < finalValueSize; i++) {
        Integer keyIndex = keyLookup.get(i);
        if (keyIndex == null) {
          // the field for this index is not in the
          // key - so just take it from the "value"
          // we were handed
          copy.append(val.get(valIndex));
          valIndex++;
        } else {
          // the field for this index is in the key
          if (isKeyTuple) {
            // the key is a tuple, extract the
            // field out of the tuple
            copy.append(keyAsTuple.get(keyIndex));
          } else {
            // the key is a single object, so it is the field itself
            copy.append(key);
          }
        }
      }

    } else if (isProjectStar) {

      // the whole "value" is present in the "key"
      copy = mTupleFactory.newTuple(keyAsTuple.getAll());

    } else {

      // there is no field of the "value" in the
      // "key" - so just make a copy of what we got
      // as the "value"
      copy = mTupleFactory.newTuple(val.getAll());
    }
    return copy;
  }
Пример #23
0
 /**
  * Converts the string in field 0 to upper case.
  *
  * @param input tuple whose first field is expected to be a {@code String}
  * @return the upper-cased string, or {@code null} when the tuple is null or empty or the first
  *     field is null
  * @throws IOException if the field cannot be read or is not a {@code String}
  */
 public String exec(Tuple input) throws IOException {
   if (input == null || input.size() == 0) {
     return null;
   }
   try {
     String str = (String) input.get(0);
     // A null field yields null rather than a wrapped NullPointerException.
     if (str == null) {
       return null;
     }
     return str.toUpperCase();
   } catch (Exception e) {
     throw WrappedIOException.wrap("Caught exception processing input row ", e);
   }
 }
  /**
   * Returns the unbiased exponent used in the representation of a float, as computed by
   * {@link Math#getExponent(float)}.
   *
   * @param input tuple whose first field is expected to be a {@code Float}
   * @return the unbiased exponent of the first field, or {@code null} when the tuple is null or
   *     empty or its first field is null
   * @throws IOException if the first field cannot be read or is not a {@code Float}
   */
  public Integer exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
      return null;
    }
    if (input.get(0) == null) {
      return null;
    }

    try {
      float value = (Float) input.get(0);
      return Math.getExponent(value);
    } catch (Exception e) {
      throw new IOException("Caught exception processing input row ", e);
    }
  }
  /**
   * Collects the classification code values of a serialized {@code DocumentMetadata} and returns
   * them only when there are more of them than the given limit.
   *
   * <p>Field 1 of the input holds the serialized document as a {@code DataByteArray}; field 2
   * holds the limit as an {@code Integer}. Field 0 is not read.
   *
   * @param input tuple with at least three fields
   * @return a tuple (document key, bag of code values, number of values) when the number of
   *     values exceeds the limit; {@code null} otherwise, and also when the tuple is null, has
   *     fewer than three fields, or the document or the limit is null
   * @throws IOException if a field cannot be read or the bytes cannot be parsed as
   *     {@code DocumentMetadata}
   * @throws ClassCastException if field 1 is not a {@code DataByteArray}
   */
  @Override
  public Tuple exec(Tuple input) throws IOException {
    // Fields 1 and 2 are read below, so fewer than three fields cannot be processed.
    if (input == null || input.size() < 3) {
      return null;
    }

    Object obj = null;
    Integer limnum = null;
    try {
      // Read without casting: the cast is done (and its failure logged) further down.
      obj = input.get(1);
    } catch (ExecException e) {
      logger.error("Error in reading field proto:", e);
      throw e;
    }

    try {
      limnum = (Integer) input.get(2);
    } catch (ExecException e) {
      logger.error("Error in reading baglimit:", e);
      throw e;
    }

    DataByteArray dba = null;
    try {
      dba = (DataByteArray) obj;
    } catch (ClassCastException e) {
      logger.error("Error in casting Object (" + input.getType(1) + ") to DataByteArray:", e);
      throw e;
    }

    // Without a document or a limit there is nothing to compare; avoid a NullPointerException.
    if (dba == null || limnum == null) {
      return null;
    }

    DocumentMetadata dm = null;
    try {
      dm = DocumentMetadata.parseFrom(dba.get());
    } catch (InvalidProtocolBufferException e) {
      logger.error("Error in reading ByteArray to DocumentMetadata:", e);
      throw e;
    }

    String key = dm.getKey();
    DataBag db = new DefaultDataBag();
    int bagsize = 0;
    for (ClassifCode code : dm.getBasicMetadata().getClassifCodeList()) {
      for (String co_str : code.getValueList()) {
        bagsize++;
        db.add(TupleFactory.getInstance().newTuple(co_str));
      }
    }
    if (bagsize > limnum) {
      Object[] to = new Object[] {key, db, bagsize};
      return TupleFactory.getInstance().newTuple(Arrays.asList(to));
    }
    return null;
  }
Пример #26
0
  /**
   * Counts the tuples of the bag in field 0 that are non-null, non-empty, and have a non-null
   * first field.
   *
   * @param input tuple whose first field is a {@code DataBag}
   * @return number of qualifying tuples in the bag
   * @throws ExecException if a field cannot be read
   */
  protected static long count(Tuple input) throws ExecException {
    DataBag values = (DataBag) input.get(0);
    long qualifying = 0;
    for (Tuple t : values) {
      if (t == null || t.size() == 0 || t.get(0) == null) {
        continue;
      }
      qualifying++;
    }
    return qualifying;
  }
Пример #27
0
 /**
  * Returns the number of fields of the input tuple.
  *
  * @param input tuple to measure
  * @return the field count as a {@code Long}, or {@code null} when the tuple is null
  * @throws IOException an {@code ExecException} with error code 2106 if the size cannot be
  *     determined
  */
 @Override
 public Long exec(Tuple input) throws IOException {
   try {
     return (input == null) ? null : Long.valueOf(input.size());
   } catch (Exception e) {
     int errCode = 2106;
     String msg = "Error while computing size in " + this.getClass().getSimpleName();
     throw new ExecException(msg, errCode, PigException.BUG, e);
   }
 }
  /**
   * Adds the values of {@code t} column by column into the accumulator {@code tupTmp}.
   *
   * <p>Columns of {@code t} that are null or hold a {@code String} are skipped; every other value
   * is converted to a double and added to the same position of the accumulator.
   *
   * @param tupTmp accumulator tuple, modified in place; its size determines how many columns are
   *     visited
   * @param t tuple with the values to add
   * @return {@code tupTmp}, for convenience
   * @throws ExecException if a field cannot be read or converted
   */
  private static Tuple mergeResultsIntoAggregation(Tuple tupTmp, Tuple t) throws ExecException {
    final int width = tupTmp.size();
    for (int col = 0; col < width; col++) {
      Object value = t.get(col);
      boolean addable = value != null && !(value instanceof String);
      if (addable) {
        increaseTheValueOfElInTheTupleBy(tupTmp, col, DataType.toDouble(value));
      }
    }
    return tupTmp;
  }
Пример #29
0
 /**
  * Applies {@code extractQuery} to the URL string in field 0.
  *
  * @param tuple tuple whose first field is expected to be a {@code String}
  * @return the result of {@code extractQuery}, or {@code null} when the tuple is null or empty
  * @throws IOException wrapping any {@code ExecException} raised while reading the field
  */
 @Override
 public String exec(Tuple tuple) throws IOException {
   boolean nothingToRead = tuple == null || tuple.size() < 1;
   if (nothingToRead) {
     return null;
   }
   try {
     return extractQuery((String) tuple.get(0));
   } catch (ExecException ee) {
     throw new IOException(ee);
   }
 }
Пример #30
0
  /**
   * Joins the bags named in {@code bagNames}, using the first one as the outer bag.
   *
   * <p>The join key names are read from the odd positions (1, 3, 5, ...) of the input tuple. The
   * first bag is grouped by its key to seed the collector; every further bag is grouped by its own
   * key, padded with null tuples through {@code insertNullTuples}, and joined in. The output bag
   * contains every tuple held by the collector afterwards.
   *
   * @param input tuple carrying the bags and, at odd positions, the join key names
   * @return bag with the joined tuples
   * @throws IOException if a field cannot be read or one of the additional bags is null
   */
  @Override
  public DataBag exec(Tuple input) throws IOException {
    retrieveContextValues();

    // Key names sit at every second position, starting at 1.
    ArrayList<String> keyNames = new ArrayList<String>();
    for (int pos = 1; pos < input.size(); pos += 2) {
      keyNames.add((String) input.get(pos));
    }

    JoinCollector collector = new JoinCollector();

    // the first bag is the outer bag
    String outerName = bagNames.get(0);
    DataBag outerBag = getBag(input, outerName);
    String outerKey =
        getPrefixedAliasName(bagNameToJoinKeyPrefix.get(outerName), keyNames.get(0));
    collector.setJoinData(collector.groupTuples(outerBag, outerKey));

    // now, for each additional bag, group up the tuples by the join key, then join them in
    for (int b = 1; b < bagNames.size(); b++) {
      String currentName = bagNames.get(b);
      DataBag currentBag = getBag(input, currentName);
      String currentKey =
          getPrefixedAliasName(bagNameToJoinKeyPrefix.get(currentName), keyNames.get(b));
      int tupleSize = bagNameToSize.get(currentName);
      if (currentBag == null) {
        throw new IOException(
            "Error in instance: "
                + getInstanceName()
                + " with properties: "
                + getInstanceProperties()
                + " and tuple: "
                + input.toDelimitedString(", ")
                + " -- Expected bag, got null");
      }

      HashMap<Object, List<Tuple>> grouped = collector.groupTuples(currentBag, currentKey);
      // outer join, so go back in and add nulls
      grouped = collector.insertNullTuples(grouped, tupleSize);
      for (Map.Entry<Object, List<Tuple>> group : grouped.entrySet()) {
        collector.joinTuples(group.getKey(), group.getValue());
      }
    }

    // assemble output bag
    DataBag joined = BagFactory.getInstance().newDefaultBag();
    for (List<Tuple> bucket : collector.getJoinData().values()) {
      for (Tuple row : bucket) {
        joined.add(row);
      }
    }

    return joined;
  }