@Override
    public DataBag exec(Tuple input) throws IOException {
      DataBag bag = (DataBag) input.get(0);
      long n = 0L;
      DataBag selected = bagFactory.newDefaultBag();
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());

      for (Tuple innerTuple : bag) {
        n += (Long) innerTuple.get(0);
        selected.addAll((DataBag) innerTuple.get(1));
        waiting.addAll((DataBag) innerTuple.get(2));
      }

      long sampleSize = (long) Math.ceil(_samplingProbability * n);
      long nNeeded = sampleSize - selected.size();

      for (Tuple scored : waiting) {
        if (nNeeded <= 0) {
          break;
        }
        selected.add(ScoredTuple.fromIntermediateTuple(scored).getTuple());
        nNeeded--;
      }

      return selected;
    }
  // pig 1048
  public void testSkewedJoinOneValue() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE3 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE3 + "' as (id,name);");
    // Filter key with a single value

    pigServer.registerQuery("C = FILTER A by id == 400;");
    pigServer.registerQuery("D = FILTER B by id == 400;");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
        dbrj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("E = join C by id, D by id using \"skewed\";");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      pigServer.registerQuery("E = join C by id, D by id;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbrj.add(iter.next());
      }
    }
    Assert.assertEquals(dbfrj.size(), dbrj.size());
    Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj));
  }
  public void testSkewedJoinWithGroup() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = GROUP A by id;");
    pigServer.registerQuery("D = GROUP B by id;");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
        dbshj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("E = join C by group, D by group using \"skewed\" parallel 5;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      pigServer.registerQuery("E = join C by group, D by group;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbshj.add(iter.next());
      }
    }
    Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
    Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj));
  }
  public void testSkewedJoinWithNoProperties() throws IOException {
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      DataBag dbshj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery(
            "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
      {
        pigServer.registerQuery("E = join A by(id, name), B by (id, name);");
        Iterator<Tuple> iter = pigServer.openIterator("E");

        while (iter.hasNext()) {
          dbshj.add(iter.next());
        }
      }
      Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
      Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj));

    } catch (Exception e) {
      fail(e.getMessage());
    }
  }
  public void testSkewedJoinManyReducers() throws IOException {
    pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "2");
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE6 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE7 + "' as (id,name);");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
        dbrj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 300;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      pigServer.registerQuery("E = join A by id, B by id;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbrj.add(iter.next());
      }
    }
    Assert.assertEquals(dbfrj.size(), dbrj.size());
    Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj));
  }
    @Override
    public Tuple exec(Tuple input) throws IOException {
      Tuple output = tupleFactory.newTuple();
      DataBag selected = bagFactory.newDefaultBag();
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());

      DataBag items = (DataBag) input.get(0);

      if (items != null) {
        long n = items.size();

        double q1 = getQ1(n, _samplingProbability);
        double q2 = getQ2(n, _samplingProbability);

        for (Tuple item : items) {
          double key = _rdg.nextUniform(0.0d, 1.0d);

          if (key < q1) {
            selected.add(item);
          } else if (key < q2) {
            waiting.add(new ScoredTuple(key, item).getIntermediateTuple(tupleFactory));
          }
        }

        output.append(n);
        output.append(selected);
        output.append(waiting);
      }

      return output;
    }
  public DataBag exec(Tuple input) throws IOException {
    try {
      if (!input.isNull()) {
        // Create the output like a databag {(res1,res2),(res3,res4)..}
        DataBag output_databag = mBagFactory.newDefaultBag();
        // Unpack tuple in order to get the bag {(1,2),(3,4),...}
        String input_time = (String) input.get(0);
        try {

          DateFormat formatter = new SimpleDateFormat("MM/dd/yyyy kk:mm:ss");
          Date date =
              formatter.parse(
                  String.format(
                      "%s/%s/%s %s:%s:%s",
                      input_time.substring(5, 7),
                      input_time.substring(8, 10),
                      input_time.substring(0, 4),
                      input_time.substring(11, 13),
                      input_time.substring(14, 16),
                      input_time.substring(17, 18)));
          Calendar calendar = Calendar.getInstance();
          calendar.setTime(date);
          int dayOfWeek = calendar.get(Calendar.DAY_OF_WEEK);
          int dayOfMonth = calendar.get(Calendar.DAY_OF_MONTH);
          int hour = calendar.get(Calendar.HOUR_OF_DAY);

          // Add items to output
          Tuple items = TupleFactory.getInstance().newTuple(1);
          items.set(0, String.format("%d:%d:%d", dayOfWeek, dayOfMonth, hour));
          output_databag.add(items);

        } catch (Exception e) {
          Tuple items = TupleFactory.getInstance().newTuple(1);
          items.set(0, "petting #1" + e.getMessage());
          output_databag.add(items);
          return output_databag;
        }
        return output_databag;
      } else {
        DataBag output_databag = mBagFactory.newDefaultBag();
        Tuple items = TupleFactory.getInstance().newTuple(1);
        items.set(0, "petting #2");
        output_databag.add(items);
        return output_databag;
      }
    } catch (Exception e) {
      System.err.println("Error with ?? ..");
      DataBag output_databag = mBagFactory.newDefaultBag();
      Tuple items = TupleFactory.getInstance().newTuple(1);
      items.set(0, "petting #3" + e.getMessage());
      output_databag.add(items);
      return output_databag;
    }
  }
Example #8
0
  @Override
  public DataBag exec(Tuple input) throws IOException {
    DataBag bag = (DataBag) input.get(0);
    HashMap<String, Double> clsCnt = new HashMap<String, Double>();
    Iterator<Tuple> it = bag.iterator();
    Double sum = new Double(0.0);
    while (it.hasNext()) {
      Tuple item = (Tuple) it.next();
      String cls = (String) item.get(3);
      if (cls != null && cls.length() > 0) {
        Double cur = clsCnt.get(cls);
        Double inc = (Double) item.get(2);
        if (cur != null) {
          clsCnt.put(cls, cur + inc);
        } else {
          clsCnt.put(cls, inc);
        }
        sum += inc;
      }
    }

    Set<Entry<String, Double>> clses = clsCnt.entrySet();
    Iterator<Entry<String, Double>> cit = clses.iterator();
    DataBag result = bagFactory.newDefaultBag();
    while (cit.hasNext()) {
      Entry<String, Double> cls = cit.next();
      Tuple tpl = tupleFactory.newTuple(2);
      tpl.set(0, cls.getKey());
      tpl.set(1, cls.getValue() / sum);
      result.add(tpl);
    }

    return result;
  }
 @SuppressWarnings("rawtypes")
 @Override
 public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
   this.reader = (WikipediaPageInputFormat.WikipediaRecordReader) reader;
   tupleFactory = TupleFactory.getInstance();
   bagFactory = BagFactory.getInstance();
 }
Example #10
0
/**
 * TokenizeText uses the Lucene libraries StandardAnalyzer class to tokenize a raw text input. A
 * list of the stopwords used is available {@link StopWords}. Output is a pig bag containing tokens.
 * <dt><b>Example:</b>
 * <dd><code>
 * register varaha.jar;<br/>
 * documents    = LOAD 'documents' AS (doc_id:chararray, text:chararray);<br/>
 * tokenized    = FOREACH documents GENERATE doc_id AS doc_id, FLATTEN(StanfordTokenize(text)) AS (token:chararray);
 * </code>
 * </dl>
 *
 * @see
 * @author Russell Jurney
 */
public class StanfordTokenize extends EvalFunc<DataBag> {

  private static TupleFactory tupleFactory = TupleFactory.getInstance();
  private static BagFactory bagFactory = BagFactory.getInstance();

  public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) return null;

    // Output bag
    DataBag bagOfTokens = bagFactory.newDefaultBag();

    StringReader textInput = new StringReader(input.get(0).toString());
    PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), "");

    for (CoreLabel label; ptbt.hasNext(); ) {
      label = (CoreLabel) ptbt.next();
      if (label.value().length() > 2) {
        System.err.println(label.toString());
        Tuple termText = tupleFactory.newTuple(label.word());
        bagOfTokens.add(termText);
      }
    }
    return bagOfTokens;
  }
}
Example #11
0
    @Override
    public Tuple exec(Tuple input) throws IOException {
      DataBag output = BagFactory.getInstance().newDefaultBag();

      DataBag samples = (DataBag) input.get(0);
      if (samples == null) {
        // do nothing
      } else if (samples.size() <= numSamples) {
        // no need to construct a reservoir, so just emit intermediate tuples
        for (Tuple sample : samples) {
          // add the score on to the intermediate tuple
          output.add(new ScoredTuple(Math.random(), sample).getIntermediateTuple(tupleFactory));
        }
      } else {
        for (Tuple sample : samples) {
          getReservoir().consider(new ScoredTuple(Math.random(), sample));
        }

        for (ScoredTuple scoredTuple : getReservoir()) {
          // add the score on to the intermediate tuple
          output.add(scoredTuple.getIntermediateTuple(tupleFactory));
        }
      }

      return tupleFactory.newTuple(output);
    }
Example #12
0
  public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
      return null;
    }

    String normStr = ((String) input.get(0));
    if (normStr == null) {
      return null;
    }

    // Remove punctuation except when it's a version number
    normStr = punctPattern.matcher(normStr.trim().toLowerCase()).replaceAll(" ");
    normStr = spacePattern.matcher(normStr).replaceAll(" ");

    DataBag output = bagFactory.newDefaultBag();
    for (String s : spacePattern.split(normStr.trim())) {
      if (s.length() <= 30) {
        Tuple t = tupleFactory.newTuple(1);
        t.set(0, s);
        output.add(t);
      }
    }

    return output;
  }
Example #13
0
public class UnigramExtractor extends EvalFunc<DataBag> {

  private static BagFactory bagFactory = BagFactory.getInstance();
  private static TupleFactory tupleFactory = TupleFactory.getInstance();

  private static final Pattern spacePattern = Pattern.compile("\\s+");
  private static final Pattern punctPattern = Pattern.compile("\\p{Punct}(?:(?<!\\d)(?!\\d))");

  public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
      return null;
    }

    String normStr = ((String) input.get(0));
    if (normStr == null) {
      return null;
    }

    // Remove punctuation except when it's a version number
    normStr = punctPattern.matcher(normStr.trim().toLowerCase()).replaceAll(" ");
    normStr = spacePattern.matcher(normStr).replaceAll(" ");

    DataBag output = bagFactory.newDefaultBag();
    for (String s : spacePattern.split(normStr.trim())) {
      if (s.length() <= 30) {
        Tuple t = tupleFactory.newTuple(1);
        t.set(0, s);
        output.add(t);
      }
    }

    return output;
  }
}
Example #14
0
 @Override
 public DataBag getValue() {
   DataBag output = BagFactory.getInstance().newDefaultBag();
   for (ScoredTuple sample : getReservoir()) {
     output.add(sample.getTuple());
   }
   return output;
 }
 /**
  * create bag having given number of tuples
  *
  * @param size
  * @return
  */
 private DataBag createBag(int size) {
   Tuple innerTuple = TupleFactory.getInstance().newTuple();
   innerTuple.append(Integer.valueOf(1));
   DataBag bag = BagFactory.getInstance().newDefaultBag();
   for (int i = 0; i < size; i++) {
     bag.add(innerTuple);
   }
   return bag;
 }
  @SuppressWarnings("unchecked")
  @Override
  public DataBag exec(Tuple input) throws IOException {
    if (input.size() < 2) {
      throw new RuntimeException("Expected at least two inputs, but found " + input.size());
    }

    for (Object o : input) {
      if (o != null && !(o instanceof DataBag)) {
        throw new RuntimeException("Inputs must be bags");
      }
    }

    DataBag outputBag = bagFactory.newDefaultBag();

    DataBag bag1 = (DataBag) input.get(0);
    DataBag bag2 = (DataBag) input.get(1);

    if (bag1 == null || bag1.size() == 0) {
      return outputBag;
    }
    // optimization
    else if (input.size() == 2 && (bag2 == null || bag2.size() == 0)) {
      return bag1;
    }

    PriorityQueue<Pair> pq = loadBags(input);

    Tuple lastData = null;

    while (true) {
      Pair nextPair = pq.peek();

      // ignore data we've already encountered
      if (nextPair.data.compareTo(lastData) != 0) {
        // Only take data from the first bag, where there are no other
        // bags that have the same data.
        if (nextPair.index.equals(0) && countMatches(pq) == 0) {
          outputBag.add(nextPair.data);
          lastData = nextPair.data;
        }
      }

      Pair p = pq.poll();

      // only put the bag back into the queue if it still has data
      if (p.hasNext()) {
        p.next();
        pq.offer(p);
      } else if (p.index.equals(0)) {
        // stop when we exhaust all elements from the first bag
        break;
      }
    }

    return outputBag;
  }
Example #17
0
  @Override
  public void setInput(Map<String, Block> input, JsonNode operatorJson, BlockProperties props)
      throws IOException, InterruptedException {
    inputBlock = input.values().iterator().next();

    init(operatorJson, inputBlock.getProperties().getSchema());

    nullBag = BagFactory.getInstance().newDefaultBag();

    nullBag.add(TupleFactory.getInstance().newTuple(0));
  }
Example #18
0
 @Override
 public DataBag exec(Tuple input) throws ExecException, FrontendException {
   if (input == null) {
     return null;
   }
   Iterable<Pair<Integer, Double>> vector = elgen.apply(input);
   List<Tuple> result = new ArrayList<Tuple>(N);
   for (Pair<Integer, Double> el : ordering.greatestOf(vector, N)) {
     result.add(tfac.newTuple(el.getFirst()));
   }
   return bfac.newDefaultBag(result);
 }
Example #19
0
  @Override
  public DataBag exec(Tuple input) throws IOException {
    retrieveContextValues();

    ArrayList<String> joinKeyNames = new ArrayList<String>();
    for (int i = 1; i < input.size(); i += 2) {
      joinKeyNames.add((String) input.get(i));
    }

    JoinCollector collector = new JoinCollector();
    // the first bag is the outer bag
    String leftBagName = bagNames.get(0);
    DataBag leftBag = getBag(input, leftBagName);
    String leftBagJoinKeyName =
        getPrefixedAliasName(bagNameToJoinKeyPrefix.get(leftBagName), joinKeyNames.get(0));
    collector.setJoinData(collector.groupTuples(leftBag, leftBagJoinKeyName));
    // now, for each additional bag, group up the tuples by the join key, then join them in
    if (bagNames.size() > 1) {
      for (int i = 1; i < bagNames.size(); i++) {
        String bagName = bagNames.get(i);
        DataBag bag = getBag(input, bagName);
        String joinKeyName =
            getPrefixedAliasName(bagNameToJoinKeyPrefix.get(bagName), joinKeyNames.get(i));
        int tupleSize = bagNameToSize.get(bagName);
        if (bag == null)
          throw new IOException(
              "Error in instance: "
                  + getInstanceName()
                  + " with properties: "
                  + getInstanceProperties()
                  + " and tuple: "
                  + input.toDelimitedString(", ")
                  + " -- Expected bag, got null");
        HashMap<Object, List<Tuple>> groupedData = collector.groupTuples(bag, joinKeyName);
        // outer join, so go back in and add nulls;
        groupedData = collector.insertNullTuples(groupedData, tupleSize);
        for (Map.Entry<Object, List<Tuple>> entry : groupedData.entrySet()) {
          collector.joinTuples(entry.getKey(), entry.getValue());
        }
      }
    }

    // assemble output bag
    DataBag outputBag = BagFactory.getInstance().newDefaultBag();
    for (List<Tuple> tuples : collector.getJoinData().values()) {
      for (Tuple tuple : tuples) {
        outputBag.add(tuple);
      }
    }

    return outputBag;
  }
  public void testSkewedJoinReducers() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery("C = join A by id, B by id using \"skewed\" parallel 1;");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
    } catch (Exception e) {
      fail("Should not throw exception, should continue execution");
    }
  }
Example #21
0
    @Override
    public DataBag exec(Tuple input) throws IOException {
      DataBag bagOfSamples = (DataBag) input.get(0);
      for (Tuple innerTuple : bagOfSamples) {
        DataBag samples = (DataBag) innerTuple.get(0);

        for (Tuple sample : samples) {
          // use the same score as previously generated
          getReservoir().consider(ScoredTuple.fromIntermediateTuple(sample));
        }
      }

      DataBag output = BagFactory.getInstance().newDefaultBag();
      for (ScoredTuple scoredTuple : getReservoir()) {
        // output the original tuple
        output.add(scoredTuple.getTuple());
      }

      return output;
    }
  public void testSkewedJoin3Way() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = LOAD '" + INPUT_FILE3 + "' as (id, name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery("D = join A by id, B by id, C by id using \"skewed\" parallel 5;");
        Iterator<Tuple> iter = pigServer.openIterator("D");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
    } catch (Exception e) {
      return;
    }

    fail("Should throw exception, do not support 3 way join");
  }
  public void testSkewedJoinNullKeys() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery("C = join A by id, B by id using \"skewed\";");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
    } catch (Exception e) {
      System.out.println(e.getMessage());
      e.printStackTrace();
      fail("Should support null keys in skewed join");
    }
    return;
  }
 @SuppressWarnings("unchecked")
 private void accumulateData() throws ExecException {
   int count = 0;
   int length = inputs.size() - 1;
   inputBags = new DataBag[length];
   its = new Iterator[length];
   for (int i = 0; i < length; ++i) {
     PhysicalOperator op = inputs.get(i);
     DataBag bag = BagFactory.getInstance().newDefaultBag();
     inputBags[count] = bag;
     for (Result res = op.getNextTuple();
         res.returnStatus != POStatus.STATUS_EOP;
         res = op.getNextTuple()) {
       if (res.returnStatus == POStatus.STATUS_NULL) continue;
       if (res.returnStatus == POStatus.STATUS_ERR)
         throw new ExecException("Error accumulating data in the local Cross operator");
       if (res.returnStatus == POStatus.STATUS_OK) bag.add((Tuple) res.result);
     }
     its[count++] = bag.iterator();
   }
 }
  public void testSkewedJoinMapKey() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE4 + "' as (m:[]);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE4 + "' as (n:[]);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery(
            "C = join A by (chararray)m#'a100', B by (chararray)n#'a100' using \"skewed\" parallel 20;");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
    } catch (Exception e) {
      System.out.println(e.getMessage());
      e.printStackTrace();
      fail("Should support maps and expression operators as keys");
    }

    return;
  }
  @Test
  public void testTupleWriteRead1() throws IOException {
    // create a tuple with columns of different type
    Tuple tuplein = TupleFactory.getInstance().newTuple(7);
    tuplein.set(0, 12);
    Map<String, String> map = new HashMap<String, String>();
    map.put("pig", "scalability");
    tuplein.set(1, map);
    tuplein.set(2, null);
    tuplein.set(3, 12L);
    tuplein.set(4, 1.2F);

    Tuple innerTuple = TupleFactory.getInstance().newTuple(1);
    innerTuple.set(0, "innerTuple");
    tuplein.set(5, innerTuple);
    DataBag bag = BagFactory.getInstance().newDefaultBag();
    bag.add(innerTuple);
    tuplein.set(6, bag);

    testTupleSedes(tuplein);

    assertEquals(
        "(12,[pig#scalability],,12,1.2,(innerTuple),{(innerTuple)})", TupleFormat.format(tuplein));
  }
  public DataBag exec(Tuple input) throws IOException {
    try {
      DataBag bag = BagFactory.getInstance().newDefaultBag();

      for (int i = 0; i < input.size(); i++) {
        final Object object = input.get(i);
        if (object instanceof Tuple) {
          for (int j = 0; j < ((Tuple) object).size(); j++) {
            Tuple tp2 = TupleFactory.getInstance().newTuple(1);
            tp2.set(0, ((Tuple) object).get(j));
            bag.add(tp2);
          }
        } else {
          Tuple tp2 = TupleFactory.getInstance().newTuple(1);
          tp2.set(0, object);
          bag.add(tp2);
        }
      }

      return bag;
    } catch (Exception ee) {
      throw new RuntimeException("Error while creating a bag", ee);
    }
  }
Example #28
0
public class CalcClassWeight extends EvalFunc<DataBag> {
  TupleFactory tupleFactory = TupleFactory.getInstance();
  BagFactory bagFactory = BagFactory.getInstance();

  @Override
  public DataBag exec(Tuple input) throws IOException {
    DataBag bag = (DataBag) input.get(0);
    HashMap<String, Double> clsCnt = new HashMap<String, Double>();
    Iterator<Tuple> it = bag.iterator();
    Double sum = new Double(0.0);
    while (it.hasNext()) {
      Tuple item = (Tuple) it.next();
      String cls = (String) item.get(3);
      if (cls != null && cls.length() > 0) {
        Double cur = clsCnt.get(cls);
        Double inc = (Double) item.get(2);
        if (cur != null) {
          clsCnt.put(cls, cur + inc);
        } else {
          clsCnt.put(cls, inc);
        }
        sum += inc;
      }
    }

    Set<Entry<String, Double>> clses = clsCnt.entrySet();
    Iterator<Entry<String, Double>> cit = clses.iterator();
    DataBag result = bagFactory.newDefaultBag();
    while (cit.hasNext()) {
      Entry<String, Double> cls = cit.next();
      Tuple tpl = tupleFactory.newTuple(2);
      tpl.set(0, cls.getKey());
      tpl.set(1, cls.getValue() / sum);
      result.add(tpl);
    }

    return result;
  }

  @Override
  public Schema outputSchema(Schema input) {
    try {
      if (input.getFields().size() != 1 || input.getField(0).type != DataType.BAG) {
        throw new RuntimeException("expect input {bag}");
      }
      Schema bag = input.getField(0).schema.getField(0).schema;
      if (bag.getFields().size() < 4
          || bag.getField(0).type != DataType.CHARARRAY
          || bag.getField(1).type != DataType.CHARARRAY
          || bag.getField(2).type != DataType.DOUBLE
          || bag.getField(3).type != DataType.CHARARRAY) {
        throw new RuntimeException(
            "expect input {userid:chararray, " + "md:chararray, weight:double, cls:chararray}");
      }

      Schema result = new Schema();
      result.add(new FieldSchema("cls", DataType.CHARARRAY));
      result.add(new FieldSchema("weight", DataType.DOUBLE));
      return result;
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
}
/**
 * Computes the set difference of two or more bags. Duplicates are eliminated. <b>The input bags
 * must be sorted.</b>
 *
 * <p>If bags A and B are provided, then this computes A-B, i.e. all elements in A that are not in
 * B. If bags A, B and C are provided, then this computes A-B-C, i.e. all elements in A that are not
 * in B or C.
 *
 * <p>Example:
 *
 * <pre>{@code
 * define SetDifference datafu.pig.sets.SetDifference();
 *
 * -- input:
 * -- ({(1),(2),(3),(4),(5),(6)},{(3),(4)})
 * input = LOAD 'input' AS (B1:bag{T:tuple(val:int)},B2:bag{T:tuple(val:int)});
 *
 * input = FOREACH input {
 *   B1 = ORDER B1 BY val ASC;
 *   B2 = ORDER B2 BY val ASC;
 *
 *   -- output:
 *   -- ({(1),(2),(5),(6)})
 *   GENERATE SetDifference(B1,B2);
 * }
 * }</pre>
 */
public class SetDifference extends SetOperationsBase {
  private static final BagFactory bagFactory = BagFactory.getInstance();

  /**
   * Loads the data bags from the input tuple and puts them in a priority queue, where ordering is
   * determined by the data from the iterator for each bag.
   *
   * <p>The bags are wrapped in a {@link Pair} object that is comparable on the data currently
   * available from the iterator. These objects are ordered first by the data, then by the index
   * within the tuple the bag came from.
   *
   * @param input
   * @return priority queue ordered
   * @throws IOException
   */
  private PriorityQueue<Pair> loadBags(Tuple input) throws IOException {
    PriorityQueue<Pair> pq = new PriorityQueue<Pair>(input.size());

    for (int i = 0; i < input.size(); i++) {
      if (input.get(i) != null) {
        Iterator<Tuple> inputIterator = ((DataBag) input.get(i)).iterator();
        if (inputIterator.hasNext()) {
          pq.add(new Pair(inputIterator, i));
        }
      }
    }
    return pq;
  }

  /**
   * Counts how many elements in the priority queue match the element at the front of the queue,
   * which should be from the first bag.
   *
   * @param pq priority queue
   * @return number of matches
   */
  public int countMatches(PriorityQueue<Pair> pq) {
    Pair nextPair = pq.peek();
    Tuple data = nextPair.data;

    // sanity check
    if (!nextPair.index.equals(0)) {
      throw new RuntimeException("Expected next bag to have index 0");
    }

    int matches = 0;
    for (Pair p : pq) {
      if (data.equals(p.data)) matches++;
    }
    // subtract 1 since element matches itself
    return matches - 1;
  }

  @SuppressWarnings("unchecked")
  @Override
  public DataBag exec(Tuple input) throws IOException {
    if (input.size() < 2) {
      throw new RuntimeException("Expected at least two inputs, but found " + input.size());
    }

    for (Object o : input) {
      if (o != null && !(o instanceof DataBag)) {
        throw new RuntimeException("Inputs must be bags");
      }
    }

    DataBag outputBag = bagFactory.newDefaultBag();

    DataBag bag1 = (DataBag) input.get(0);
    DataBag bag2 = (DataBag) input.get(1);

    if (bag1 == null || bag1.size() == 0) {
      return outputBag;
    }
    // optimization
    else if (input.size() == 2 && (bag2 == null || bag2.size() == 0)) {
      return bag1;
    }

    PriorityQueue<Pair> pq = loadBags(input);

    Tuple lastData = null;

    while (true) {
      Pair nextPair = pq.peek();

      // ignore data we've already encountered
      if (nextPair.data.compareTo(lastData) != 0) {
        // Only take data from the first bag, where there are no other
        // bags that have the same data.
        if (nextPair.index.equals(0) && countMatches(pq) == 0) {
          outputBag.add(nextPair.data);
          lastData = nextPair.data;
        }
      }

      Pair p = pq.poll();

      // only put the bag back into the queue if it still has data
      if (p.hasNext()) {
        p.next();
        pq.offer(p);
      } else if (p.index.equals(0)) {
        // stop when we exhaust all elements from the first bag
        break;
      }
    }

    return outputBag;
  }

  /**
   * A wrapper for the tuple iterator that implements comparable so it can be used in the priority
   * queue.
   *
   * <p>This is compared first on the data, then on the index the bag came from in the input tuple.
   */
  private static class Pair implements Comparable<Pair> {
    private final Iterator<Tuple> it;
    private final Integer index;
    private Tuple data;

    /**
     * Constructs the {@link Pair}.
     *
     * @param it tuple iterator
     * @param index index within the tuple that the bag came from
     */
    public Pair(Iterator<Tuple> it, int index) {
      this.index = index;
      this.it = it;
      this.data = it.next();
    }

    @SuppressWarnings("unchecked")
    @Override
    public int compareTo(Pair o) {
      int r = this.data.compareTo(o.data);
      if (r == 0) {
        return index.compareTo(o.index);
      } else {
        return r;
      }
    }

    public boolean hasNext() {
      return it.hasNext();
    }

    @SuppressWarnings("unchecked")
    public Tuple next() {
      Tuple nextData = it.next();
      // algorithm assumes data is in order
      if (data.compareTo(nextData) > 0) {
        throw new RuntimeException("Out of order!");
      }
      this.data = nextData;
      return this.data;
    }

    @Override
    public String toString() {
      return String.format("[%s within %d]", data, index);
    }
  }
}
Example #30
0
public class TestScalarAliasesLocal {
  private static final String BUILD_TEST_TMP = "build/test/tmp/";
  private PigServer pigServer;

  TupleFactory mTf = TupleFactory.getInstance();
  BagFactory mBf = BagFactory.getInstance();

  @Before
  public void setUp() throws Exception {
    pigServer = new PigServer(Util.getLocalTestMode());
  }

  public static void deleteDirectory(File file) {
    if (file.exists()) {
      Util.deleteDirectory(file);
    }
  }

  public static File createLocalInputFile(String filename, String[] inputData) throws IOException {
    new File(filename).getParentFile().mkdirs();
    return Util.createLocalInputFile(filename, inputData);
  }

  // See PIG-1434
  @Test
  public void testScalarAliasesBatchNobatch() throws Exception {
    String[] input = {"1\t5", "2\t10", "3\t20"};

    String output = BUILD_TEST_TMP + "table_testScalarAliasesDir";
    TestScalarAliases.deleteDirectory(new File(output));
    // Test the use of scalars in expressions
    String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesBatch";
    TestScalarAliases.createLocalInputFile(inputPath, input);
    // Test in script mode
    pigServer.setBatchOn();
    pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);");
    pigServer.registerQuery("B = group A all;");
    pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;");
    pigServer.registerQuery("Y = foreach A generate (a0 * C.count), (a1 / C.max);");
    pigServer.registerQuery("Store Y into '" + output + "';");
    pigServer.executeBatch();
    // Check output
    pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: int, a1: double);");

    Iterator<Tuple> iter;
    Tuple t;
    iter = pigServer.openIterator("Z");

    t = iter.next();
    assertTrue(t.toString().equals("(3,0.25)"));

    t = iter.next();
    assertTrue(t.toString().equals("(6,0.5)"));

    t = iter.next();
    assertTrue(t.toString().equals("(9,1.0)"));

    assertFalse(iter.hasNext());

    iter = pigServer.openIterator("Y");

    t = iter.next();
    assertTrue(t.toString().equals("(3,0.25)"));

    t = iter.next();
    assertTrue(t.toString().equals("(6,0.5)"));

    t = iter.next();
    assertTrue(t.toString().equals("(9,1.0)"));

    assertFalse(iter.hasNext());
  }

  // See PIG-1434
  @Test
  public void testUseScalarMultipleTimes() throws Exception {
    String[] input = {"1\t5", "2\t10", "3\t20"};

    String outputY = BUILD_TEST_TMP + "table_testUseScalarMultipleTimesOutY";
    TestScalarAliases.deleteDirectory(new File(outputY));
    String outputZ = BUILD_TEST_TMP + "table_testUseScalarMultipleTimesOutZ";
    TestScalarAliases.deleteDirectory(new File(outputZ));
    // Test the use of scalars in expressions
    String inputPath = BUILD_TEST_TMP + "table_testUseScalarMultipleTimes";
    TestScalarAliases.createLocalInputFile(inputPath, input);
    pigServer.setBatchOn();
    pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);");
    pigServer.registerQuery("B = group A all;");
    pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;");
    pigServer.registerQuery("Y = foreach A generate (a0 * C.count), (a1 / C.max);");
    pigServer.registerQuery("Store Y into '" + outputY + "';");
    pigServer.registerQuery("Z = foreach A generate (a1 + C.count), (a0 * C.max);");
    pigServer.registerQuery("Store Z into '" + outputZ + "';");
    // Test Multiquery store
    pigServer.executeBatch();

    // Check output
    pigServer.registerQuery("M = LOAD '" + outputY + "' as (a0: int, a1: double);");

    Iterator<Tuple> iter;
    Tuple t;
    iter = pigServer.openIterator("M");

    t = iter.next();
    assertTrue(t.toString().equals("(3,0.25)"));

    t = iter.next();
    assertTrue(t.toString().equals("(6,0.5)"));

    t = iter.next();
    assertTrue(t.toString().equals("(9,1.0)"));

    assertFalse(iter.hasNext());

    // Check output
    pigServer.registerQuery("N = LOAD '" + outputZ + "' as (a0: double, a1: double);");

    iter = pigServer.openIterator("N");

    t = iter.next();
    assertTrue(t.toString().equals("(8.0,20.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(13.0,40.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(23.0,60.0)"));

    assertFalse(iter.hasNext());

    // Non batch mode
    iter = pigServer.openIterator("Y");

    t = iter.next();
    assertTrue(t.toString().equals("(3,0.25)"));

    t = iter.next();
    assertTrue(t.toString().equals("(6,0.5)"));

    t = iter.next();
    assertTrue(t.toString().equals("(9,1.0)"));

    assertFalse(iter.hasNext());

    // Check in non-batch mode
    iter = pigServer.openIterator("Z");

    t = iter.next();
    assertTrue(t.toString().equals("(8.0,20.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(13.0,40.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(23.0,60.0)"));

    assertFalse(iter.hasNext());
  }

  // See PIG-1434
  @Test
  public void testScalarWithNoSchema() throws Exception {
    String[] scalarInput = {"1\t5"};
    String[] input = {"1\t5", "2\t10", "3\t20"};
    String inputPath = BUILD_TEST_TMP + "table_testScalarWithNoSchema";
    TestScalarAliases.createLocalInputFile(inputPath, input);
    String inputPathScalar = BUILD_TEST_TMP + "table_testScalarWithNoSchemaScalar";
    TestScalarAliases.createLocalInputFile(inputPathScalar, scalarInput);
    // Load A as a scalar
    pigServer.registerQuery("A = LOAD '" + inputPath + "';");
    pigServer.registerQuery("scalar = LOAD '" + inputPathScalar + "' as (count, total);");
    pigServer.registerQuery("B = foreach A generate 5 / scalar.total;");

    Iterator<Tuple> iter = pigServer.openIterator("B");

    Tuple t = iter.next();
    assertTrue(t.get(0).toString().equals("1"));

    t = iter.next();
    assertTrue(t.get(0).toString().equals("1"));

    t = iter.next();
    assertTrue(t.get(0).toString().equals("1"));

    assertFalse(iter.hasNext());
  }

  // See PIG-1434
  @Test
  public void testScalarWithTwoBranches() throws Exception {
    String[] inputA = {"1\t5", "2\t10", "3\t20"};

    String[] inputX = {"pig", "hadoop", "rocks"};

    String output = BUILD_TEST_TMP + "testScalarWithTwoBranchesDir";
    TestScalarAliases.deleteDirectory(new File(output));
    // Test the use of scalars in expressions
    String inputPathA = BUILD_TEST_TMP + "testScalarWithTwoBranchesA";
    TestScalarAliases.createLocalInputFile(inputPathA, inputA);
    String inputPathX = BUILD_TEST_TMP + "testScalarWithTwoBranchesX";
    TestScalarAliases.createLocalInputFile(inputPathX, inputX);
    // Test in script mode
    pigServer.setBatchOn();
    pigServer.registerQuery("A = LOAD '" + inputPathA + "' as (a0: long, a1: double);");
    pigServer.registerQuery("B = group A all;");
    pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;");
    pigServer.registerQuery("X = LOAD '" + inputPathX + "' as (names: chararray);");
    pigServer.registerQuery("Y = foreach X generate names, C.max;");
    pigServer.registerQuery("Store Y into '" + output + "';");
    pigServer.executeBatch();
    // Check output
    pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: chararray, a1: double);");

    Iterator<Tuple> iter = pigServer.openIterator("Z");

    Tuple t = iter.next();
    assertTrue(t.toString().equals("(pig,20.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(hadoop,20.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(rocks,20.0)"));

    assertFalse(iter.hasNext());

    // Check in non-batch mode
    iter = pigServer.openIterator("Y");

    t = iter.next();
    assertTrue(t.toString().equals("(pig,20.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(hadoop,20.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(rocks,20.0)"));

    assertFalse(iter.hasNext());

    pigServer.getPigContext().getProperties().remove("tez.am.inline.task.execution.max-tasks");
  }

  // See PIG-1434
  @Test
  public void testFilteredScalarDollarProj() throws Exception {
    String output = BUILD_TEST_TMP + "table_testFilteredScalarDollarProjDir";
    TestScalarAliases.deleteDirectory(new File(output));
    String[] input = {
      "1\t5\t[state#maine,city#portland]\t{(a),(b)}\t(a,b)", "2\t10\t\t\t", "3\t20\t\t\t"
    };

    // Test the use of scalars in expressions
    String inputPath = BUILD_TEST_TMP + "table_testFilteredScalarDollarProj";
    TestScalarAliases.createLocalInputFile(inputPath, input);
    // Test in script mode
    pigServer.setBatchOn();
    pigServer.registerQuery(
        "A = LOAD '"
            + inputPath
            + "'"
            + " as (a0: long, a1: double, a2 : bytearray, "
            + "a3: bag{ t : tuple(tc : chararray)}, "
            + "a4: tuple(c1 : chararray, c2 : chararray) );");
    pigServer.registerQuery("B = filter A by $1 < 8;");
    pigServer.registerQuery(
        "Y = foreach A generate (a0 * B.$0), (a1 / B.$1), B.$2, B.$2#'state', B.$3, B.a4;");
    pigServer.registerQuery("Store Y into '" + output + "';");
    pigServer.explain("Y", System.err);
    pigServer.executeBatch();
    // Check output
    pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: int, a1: double);");
    pigServer.explain("Z", System.err);

    Iterator<Tuple> iter = pigServer.openIterator("Z");

    Tuple t = iter.next();
    assertTrue(t.toString().equals("(1,1.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(2,2.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(3,4.0)"));

    assertFalse(iter.hasNext());

    // Check in non-batch mode
    iter = pigServer.openIterator("Y");

    t = iter.next();
    assertEquals(t.toString(), "(1,1.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))");

    t = iter.next();
    assertEquals(t.toString(), "(2,2.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))");

    t = iter.next();
    assertEquals(t.toString(), "(3,4.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))");

    assertFalse(iter.hasNext());
  }

  // See PIG-1434
  @Test
  public void testScalarWithNoSchemaDollarProj() throws Exception {
    String[] scalarInput = {"1\t5"};
    String[] input = {"1\t5", "2\t10", "3\t20"};
    String inputPath = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProj";
    TestScalarAliases.createLocalInputFile(inputPath, input);
    String inputPathScalar = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProjScalar";
    TestScalarAliases.createLocalInputFile(inputPathScalar, scalarInput);
    // Load A as a scalar
    pigServer.registerQuery("A = LOAD '" + inputPath + "';");
    pigServer.registerQuery("scalar = LOAD '" + inputPathScalar + "';");
    pigServer.registerQuery("B = foreach A generate 5 / scalar.$1;");

    Iterator<Tuple> iter = pigServer.openIterator("B");

    Tuple t = iter.next();
    assertTrue(t.get(0).toString().equals("1"));

    t = iter.next();
    assertTrue(t.get(0).toString().equals("1"));

    t = iter.next();
    assertTrue(t.get(0).toString().equals("1"));

    assertFalse(iter.hasNext());
  }

  // See PIG-1434
  @Test
  public void testScalarAliasesJoinClause() throws Exception {
    String[] inputA = {"1\t5", "2\t10", "3\t20"};
    String[] inputB = {"Total3\tthree", "Total2\ttwo", "Total1\tone"};

    // Test the use of scalars in expressions
    String inputPathA = BUILD_TEST_TMP + "table_testScalarAliasesJoinClauseA";
    TestScalarAliases.createLocalInputFile(inputPathA, inputA);
    String inputPathB = BUILD_TEST_TMP + "table_testScalarAliasesJoinClauseB";
    TestScalarAliases.createLocalInputFile(inputPathB, inputB);
    // Test in script mode
    pigServer.registerQuery("A = LOAD '" + inputPathA + "' as (a0, a1);");
    pigServer.registerQuery("G = group A all;");
    pigServer.registerQuery("C = foreach G generate COUNT(A) as count;");

    pigServer.registerQuery("B = LOAD '" + inputPathB + "' as (b0:chararray, b1:chararray);");
    pigServer.registerQuery("Y = join A by CONCAT('Total', (chararray)C.count), B by $0;");

    Iterator<Tuple> iter = pigServer.openIterator("Y");

    String[] expected =
        new String[] {"(1,5,Total3,three)", "(2,10,Total3,three)", "(3,20,Total3,three)"};

    Util.checkQueryOutputsAfterSortRecursive(
        iter,
        expected,
        org.apache.pig.newplan.logical.Util.translateSchema(pigServer.dumpSchema("Y")));
  }

  // See PIG-1434
  @Test
  public void testScalarAliasesFilterClause() throws Exception {
    String[] input = {"1\t5", "2\t10", "3\t20", "4\t12", "5\t8"};

    // Test the use of scalars in expressions
    String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesFilterClause";
    TestScalarAliases.createLocalInputFile(inputPath, input);
    // Test in script mode
    pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0, a1);");
    pigServer.registerQuery("G = group A all;");
    pigServer.registerQuery("C = foreach G generate AVG(A.$1) as average;");

    pigServer.registerQuery("Y = filter A by a1 > C.average;");

    Iterator<Tuple> iter = pigServer.openIterator("Y");

    // Average is 11
    Tuple t = iter.next();
    assertTrue(t.toString().equals("(3,20)"));

    t = iter.next();
    assertTrue(t.toString().equals("(4,12)"));

    assertFalse(iter.hasNext());
  }

  // See PIG-1434
  @Test
  public void testScalarAliasesGrammarNegative() throws Exception {
    String[] input = {"1\t5", "2\t10", "3\t20"};

    String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesGrammar";
    TestScalarAliases.createLocalInputFile(inputPath, input);

    try {
      pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);");
      pigServer.registerQuery("B = group A all;");
      pigServer.registerQuery("C = foreach B generate COUNT(A);");
      // Only projections of C are supported
      pigServer.registerQuery("Y = foreach A generate C;");
      pigServer.openIterator("Y");
      // Control should not reach here
      fail("Scalar projections are only supported");
    } catch (IOException pe) {
      assertTrue(pe.getMessage().contains("Invalid scalar projection: C"));
    }
  }

  // See PIG-1636
  @Test
  public void testScalarAliasesLimit() throws Exception {
    String[] input = {"a\t1", "b\t2", "c\t3", "a\t4", "c\t5"};

    // Test the use of scalars in expressions
    String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesLimit";
    TestScalarAliases.createLocalInputFile(inputPath, input);
    // Test in script mode
    pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0:chararray, a1: int);");
    pigServer.registerQuery("G = group A all;");
    pigServer.registerQuery("C = foreach G generate SUM(A.$1) as total;");
    pigServer.registerQuery("C1 = limit C 1;");
    pigServer.registerQuery("Y = foreach A generate a0, a1 * (double)C1.total;");

    Iterator<Tuple> iter = pigServer.openIterator("Y");

    // Average is 11
    Tuple t = iter.next();
    assertTrue(t.toString().equals("(a,15.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(b,30.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(c,45.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(a,60.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(c,75.0)"));

    assertFalse(iter.hasNext());
  }

  /**
   * Test that a specific string is included in the error message when an exception is thrown for
   * using a relation in a scalar context without projecting any columns out of it
   */
  // See PIG-1788
  @Test
  public void testScalarWithNoProjection() throws Exception {
    String query =
        "  A = load 'table_testScalarWithNoProjection' as (x, y);"
            + "  B = group A by x;"
            +
            // B is unintentionally being used as scalar,
            // the user intends it to be COUNT(A)
            "  C = foreach B generate COUNT(B);";

    Util.checkExceptionMessage(
        query,
        "C",
        "A column needs to be projected from a relation" + " for it to be used as a scalar");
  }

  @Test
  public void testScalarNullValue() throws Exception {
    Storage.Data data = Storage.resetData(pigServer);
    data.set("input", Storage.tuple("a", 1), Storage.tuple("b", 2));

    pigServer.setBatchOn();
    pigServer.registerQuery("A = load 'input' using mock.Storage() as (a:chararray, b:int);");
    pigServer.registerQuery("B = FILTER A by a == 'c';");
    pigServer.registerQuery("C = FOREACH A generate a, b + B.b;");
    pigServer.registerQuery("store C into 'output' using mock.Storage();");

    pigServer.executeBatch();

    List<Tuple> actualResults = data.get("output");
    List<Tuple> expectedResults =
        Util.getTuplesFromConstantTupleStrings(new String[] {"('a', null)", "('b', null)"});
    Util.checkQueryOutputsAfterSort(actualResults.iterator(), expectedResults);
  }
}