public void testSkewedJoinWithGroup() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);"); pigServer.registerQuery("C = GROUP A by id;"); pigServer.registerQuery("D = GROUP B by id;"); DataBag dbfrj = BagFactory.getInstance().newDefaultBag(), dbshj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("E = join C by group, D by group using \"skewed\" parallel 5;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join C by group, D by group;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbshj.add(iter.next()); } } Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj)); }
public void testSkewedJoinWithNoProperties() throws IOException { pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); DataBag dbshj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery( "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;"); Iterator<Tuple> iter = pigServer.openIterator("C"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join A by(id, name), B by (id, name);"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbshj.add(iter.next()); } } Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj)); } catch (Exception e) { fail(e.getMessage()); } }
public void testSkewedJoinManyReducers() throws IOException { pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "2"); pigServer.registerQuery("A = LOAD '" + INPUT_FILE6 + "' as (id,name);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE7 + "' as (id,name);"); DataBag dbfrj = BagFactory.getInstance().newDefaultBag(), dbrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 300;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join A by id, B by id;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbrj.add(iter.next()); } } Assert.assertEquals(dbfrj.size(), dbrj.size()); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj)); }
// pig 1048 public void testSkewedJoinOneValue() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE3 + "' as (id,name);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE3 + "' as (id,name);"); // Filter key with a single value pigServer.registerQuery("C = FILTER A by id == 400;"); pigServer.registerQuery("D = FILTER B by id == 400;"); DataBag dbfrj = BagFactory.getInstance().newDefaultBag(), dbrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("E = join C by id, D by id using \"skewed\";"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join C by id, D by id;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbrj.add(iter.next()); } } Assert.assertEquals(dbfrj.size(), dbrj.size()); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj)); }
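These skewed-join tests all share one verification pattern: run the same join once as a skewed join and once as a default join, drain both iterators into bags, and require the bags to match. A condensed sketch of that pattern, assuming the suite's pigServer and TestHelper, with hypothetical aliases J1/J2:

DataBag skewed = BagFactory.getInstance().newDefaultBag();
DataBag regular = BagFactory.getInstance().newDefaultBag();

pigServer.registerQuery("J1 = join A by id, B by id using \"skewed\";");
for (Iterator<Tuple> it = pigServer.openIterator("J1"); it.hasNext(); ) {
    skewed.add(it.next());
}

pigServer.registerQuery("J2 = join A by id, B by id;");
for (Iterator<Tuple> it = pigServer.openIterator("J2"); it.hasNext(); ) {
    regular.add(it.next());
}

// The skewed plan must be a pure optimization: same multiset of output tuples.
Assert.assertTrue(skewed.size() > 0 && regular.size() > 0);
Assert.assertTrue(TestHelper.compareBags(skewed, regular));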
@Override public Tuple exec(Tuple input) throws IOException { DataBag output = BagFactory.getInstance().newDefaultBag(); DataBag samples = (DataBag) input.get(0); if (samples == null) { // do nothing } else if (samples.size() <= numSamples) { // no need to construct a reservoir, so just emit intermediate tuples for (Tuple sample : samples) { // add the score on to the intermediate tuple output.add(new ScoredTuple(Math.random(), sample).getIntermediateTuple(tupleFactory)); } } else { for (Tuple sample : samples) { getReservoir().consider(new ScoredTuple(Math.random(), sample)); } for (ScoredTuple scoredTuple : getReservoir()) { // add the score on to the intermediate tuple output.add(scoredTuple.getIntermediateTuple(tupleFactory)); } } return tupleFactory.newTuple(output); }
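The sampler above attaches a uniform random score to each tuple and keeps only the highest-scored ones in a bounded reservoir. A minimal standalone sketch of that idea (hypothetical, not the datafu Reservoir/ScoredTuple classes):

import java.util.AbstractMap.SimpleEntry;
import java.util.PriorityQueue;
import java.util.Random;

// Score-based reservoir sampling: give every item a uniform random score and
// keep the k highest-scored items; each input item is equally likely to
// survive, so the survivors form a uniform random sample of size k.
public class ScoredReservoirDemo {
    public static void main(String[] args) {
        int k = 3;
        Random random = new Random();
        PriorityQueue<SimpleEntry<Double, String>> heap =
            new PriorityQueue<>((a, b) -> Double.compare(a.getKey(), b.getKey()));
        for (String item : new String[] {"a", "b", "c", "d", "e"}) {
            heap.add(new SimpleEntry<>(random.nextDouble(), item));
            if (heap.size() > k) {
                heap.poll(); // evict the lowest-scored item
            }
        }
        System.out.println(heap); // k sampled items, printed as score=item
    }
}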
public class UnigramExtractor extends EvalFunc<DataBag> { private static BagFactory bagFactory = BagFactory.getInstance(); private static TupleFactory tupleFactory = TupleFactory.getInstance(); private static final Pattern spacePattern = Pattern.compile("\\s+"); private static final Pattern punctPattern = Pattern.compile("\\p{Punct}(?:(?<!\\d)(?!\\d))"); public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() == 0) { return null; } String normStr = ((String) input.get(0)); if (normStr == null) { return null; } // Remove punctuation except when it's a version number normStr = punctPattern.matcher(normStr.trim().toLowerCase()).replaceAll(" "); normStr = spacePattern.matcher(normStr).replaceAll(" "); DataBag output = bagFactory.newDefaultBag(); for (String s : spacePattern.split(normStr.trim())) { if (s.length() <= 30) { Tuple t = tupleFactory.newTuple(1); t.set(0, s); output.add(t); } } return output; } }
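To make the normalization concrete, here is a hypothetical direct invocation (assuming the class above is on the classpath; expected output shown as a comment). The regex drops a punctuation character unless the character after it is a digit, which is what lets version strings like 0.9.2 survive; the text is also lower-cased, and tokens longer than 30 characters are skipped.

import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class UnigramExtractorDemo {
    public static void main(String[] args) throws Exception {
        Tuple in = TupleFactory.getInstance().newTuple(1);
        in.set(0, "Pig 0.9.2 rocks, really!");
        DataBag tokens = new UnigramExtractor().exec(in);
        System.out.println(tokens); // {(pig),(0.9.2),(rocks),(really)}
    }
}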
@SuppressWarnings("rawtypes") @Override public void prepareToRead(RecordReader reader, PigSplit split) throws IOException { this.reader = (WikipediaPageInputFormat.WikipediaRecordReader) reader; tupleFactory = TupleFactory.getInstance(); bagFactory = BagFactory.getInstance(); }
/** * StanfordTokenize uses the Stanford NLP PTBTokenizer to tokenize a raw text input. Output is a pig bag containing the tokens. * <dl> * <dt><b>Example:</b> * <dd><code> * register varaha.jar;<br/> * documents = LOAD 'documents' AS (doc_id:chararray, text:chararray);<br/> * tokenized = FOREACH documents GENERATE doc_id AS doc_id, FLATTEN(StanfordTokenize(text)) AS (token:chararray); * </code> * </dl> * * @author Russell Jurney */ public class StanfordTokenize extends EvalFunc<DataBag> { private static TupleFactory tupleFactory = TupleFactory.getInstance(); private static BagFactory bagFactory = BagFactory.getInstance(); public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; // Output bag DataBag bagOfTokens = bagFactory.newDefaultBag(); StringReader textInput = new StringReader(input.get(0).toString()); PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext(); ) { label = (CoreLabel) ptbt.next(); // Keep only tokens longer than two characters if (label.value().length() > 2) { System.err.println(label.toString()); Tuple termText = tupleFactory.newTuple(label.word()); bagOfTokens.add(termText); } } return bagOfTokens; } }
/** * WeekDayDetection.java - parses a string-formatted date (the field positions follow yyyy-MM-dd HH:mm:ss, e.g. "2014-01-05 12:34:56"; the separator characters are ignored) and returns a bag holding a single tuple of the form "dayOfWeek:dayOfMonth:hour". Note that dayOfWeek uses Java's Calendar.DAY_OF_WEEK numbering (1=Sunday, ..., 7=Saturday). * * @author Roberto Maestre * @version 1.0 */ public class WeekDayDetection extends EvalFunc<DataBag> { TupleFactory mTupleFactory = TupleFactory.getInstance(); BagFactory mBagFactory = BagFactory.getInstance(); public DataBag exec(Tuple input) throws IOException { try { if (!input.isNull()) { // Create the output databag {(res1,res2),(res3,res4)..} DataBag output_databag = mBagFactory.newDefaultBag(); // Unpack the tuple to get the date string String input_time = (String) input.get(0); try { DateFormat formatter = new SimpleDateFormat("MM/dd/yyyy kk:mm:ss"); Date date = formatter.parse( String.format( "%s/%s/%s %s:%s:%s", input_time.substring(5, 7), input_time.substring(8, 10), input_time.substring(0, 4), input_time.substring(11, 13), input_time.substring(14, 16), input_time.substring(17, 19))); Calendar calendar = Calendar.getInstance(); calendar.setTime(date); int dayOfWeek = calendar.get(Calendar.DAY_OF_WEEK); int dayOfMonth = calendar.get(Calendar.DAY_OF_MONTH); int hour = calendar.get(Calendar.HOUR_OF_DAY); // Add items to output Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, String.format("%d:%d:%d", dayOfWeek, dayOfMonth, hour)); output_databag.add(items); } catch (Exception e) { Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #1" + e.getMessage()); output_databag.add(items); return output_databag; } return output_databag; } else { DataBag output_databag = mBagFactory.newDefaultBag(); Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #2"); output_databag.add(items); return output_databag; } } catch (Exception e) { System.err.println("Error while detecting the week day"); DataBag output_databag = mBagFactory.newDefaultBag(); Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #3" + e.getMessage()); output_databag.add(items); return output_databag; } } }
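Because the day-of-week numbering is easy to get wrong, a quick standalone check (hypothetical date) of what Calendar actually reports:

import java.text.SimpleDateFormat;
import java.util.Calendar;

public class DayOfWeekDemo {
    public static void main(String[] args) throws Exception {
        Calendar c = Calendar.getInstance();
        // January 5, 2014 was a Sunday.
        c.setTime(new SimpleDateFormat("MM/dd/yyyy").parse("01/05/2014"));
        System.out.println(c.get(Calendar.DAY_OF_WEEK)); // 1 == Calendar.SUNDAY
    }
}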
@Override public DataBag getValue() { DataBag output = BagFactory.getInstance().newDefaultBag(); for (ScoredTuple sample : getReservoir()) { output.add(sample.getTuple()); } return output; }
/** * Creates a bag containing the given number of tuples. * * @param size number of tuples to put in the bag * @return bag with {@code size} copies of the same inner tuple */ private DataBag createBag(int size) { Tuple innerTuple = TupleFactory.getInstance().newTuple(); innerTuple.append(Integer.valueOf(1)); DataBag bag = BagFactory.getInstance().newDefaultBag(); // Note: the same tuple instance is added size times for (int i = 0; i < size; i++) { bag.add(innerTuple); } return bag; }
@Override public void setInput(Map<String, Block> input, JsonNode operatorJson, BlockProperties props) throws IOException, InterruptedException { inputBlock = input.values().iterator().next(); init(operatorJson, inputBlock.getProperties().getSchema()); nullBag = BagFactory.getInstance().newDefaultBag(); nullBag.add(TupleFactory.getInstance().newTuple(0)); }
@Override public DataBag exec(Tuple input) throws IOException { retrieveContextValues(); ArrayList<String> joinKeyNames = new ArrayList<String>(); for (int i = 1; i < input.size(); i += 2) { joinKeyNames.add((String) input.get(i)); } JoinCollector collector = new JoinCollector(); // the first bag is the outer bag String leftBagName = bagNames.get(0); DataBag leftBag = getBag(input, leftBagName); String leftBagJoinKeyName = getPrefixedAliasName(bagNameToJoinKeyPrefix.get(leftBagName), joinKeyNames.get(0)); collector.setJoinData(collector.groupTuples(leftBag, leftBagJoinKeyName)); // now, for each additional bag, group up the tuples by the join key, then join them in if (bagNames.size() > 1) { for (int i = 1; i < bagNames.size(); i++) { String bagName = bagNames.get(i); DataBag bag = getBag(input, bagName); String joinKeyName = getPrefixedAliasName(bagNameToJoinKeyPrefix.get(bagName), joinKeyNames.get(i)); int tupleSize = bagNameToSize.get(bagName); if (bag == null) throw new IOException( "Error in instance: " + getInstanceName() + " with properties: " + getInstanceProperties() + " and tuple: " + input.toDelimitedString(", ") + " -- Expected bag, got null"); HashMap<Object, List<Tuple>> groupedData = collector.groupTuples(bag, joinKeyName); // outer join, so go back in and add nulls; groupedData = collector.insertNullTuples(groupedData, tupleSize); for (Map.Entry<Object, List<Tuple>> entry : groupedData.entrySet()) { collector.joinTuples(entry.getKey(), entry.getValue()); } } } // assemble output bag DataBag outputBag = BagFactory.getInstance().newDefaultBag(); for (List<Tuple> tuples : collector.getJoinData().values()) { for (Tuple tuple : tuples) { outputBag.add(tuple); } } return outputBag; }
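The collector's grouping step amounts to hash-partitioning each bag by its join key so that equal keys can later be joined pairwise. A minimal standalone sketch of that idea (hypothetical plain-Java types, not the collector's actual API):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GroupByKeyDemo {
    // Bucket rows by the value in their key column.
    static Map<Object, List<Object[]>> groupRows(List<Object[]> rows, int keyIndex) {
        Map<Object, List<Object[]>> groups = new HashMap<>();
        for (Object[] row : rows) {
            groups.computeIfAbsent(row[keyIndex], k -> new ArrayList<>()).add(row);
        }
        return groups;
    }

    public static void main(String[] args) {
        List<Object[]> rows = List.of(
            new Object[] {1, "a"}, new Object[] {2, "b"}, new Object[] {1, "c"});
        System.out.println(groupRows(rows, 0).keySet()); // [1, 2] (order may vary)
    }
}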
public void testSkewedJoinReducers() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("C = join A by id, B by id using \"skewed\" parallel 1;"); Iterator<Tuple> iter = pigServer.openIterator("C"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } } catch (Exception e) { fail("Should not throw exception, should continue execution"); } }
public void testSkewedJoinNullKeys() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("C = join A by id, B by id using \"skewed\";"); Iterator<Tuple> iter = pigServer.openIterator("C"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } } catch (Exception e) { System.out.println(e.getMessage()); e.printStackTrace(); fail("Should support null keys in skewed join"); } return; }
public void testSkewedJoin3Way() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);"); pigServer.registerQuery("C = LOAD '" + INPUT_FILE3 + "' as (id, name);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("D = join A by id, B by id, C by id using \"skewed\" parallel 5;"); Iterator<Tuple> iter = pigServer.openIterator("D"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } } catch (Exception e) { return; } fail("Should have thrown an exception; 3-way skewed join is not supported"); }
@Override public DataBag exec(Tuple input) throws IOException { DataBag bagOfSamples = (DataBag) input.get(0); for (Tuple innerTuple : bagOfSamples) { DataBag samples = (DataBag) innerTuple.get(0); for (Tuple sample : samples) { // use the same score as previously generated getReservoir().consider(ScoredTuple.fromIntermediateTuple(sample)); } } DataBag output = BagFactory.getInstance().newDefaultBag(); for (ScoredTuple scoredTuple : getReservoir()) { // output the original tuple output.add(scoredTuple.getTuple()); } return output; }
@SuppressWarnings("unchecked") private void accumulateData() throws ExecException { int count = 0; int length = inputs.size() - 1; inputBags = new DataBag[length]; its = new Iterator[length]; for (int i = 0; i < length; ++i) { PhysicalOperator op = inputs.get(i); DataBag bag = BagFactory.getInstance().newDefaultBag(); inputBags[count] = bag; for (Result res = op.getNextTuple(); res.returnStatus != POStatus.STATUS_EOP; res = op.getNextTuple()) { if (res.returnStatus == POStatus.STATUS_NULL) continue; if (res.returnStatus == POStatus.STATUS_ERR) throw new ExecException("Error accumulating data in the local Cross operator"); if (res.returnStatus == POStatus.STATUS_OK) bag.add((Tuple) res.result); } its[count++] = bag.iterator(); } }
public void testSkewedJoinMapKey() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE4 + "' as (m:[]);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE4 + "' as (n:[]);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery( "C = join A by (chararray)m#'a100', B by (chararray)n#'a100' using \"skewed\" parallel 20;"); Iterator<Tuple> iter = pigServer.openIterator("C"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } } catch (Exception e) { System.out.println(e.getMessage()); e.printStackTrace(); fail("Should support maps and expression operators as keys"); } return; }
@Test public void testTupleWriteRead1() throws IOException { // create a tuple with columns of different type Tuple tuplein = TupleFactory.getInstance().newTuple(7); tuplein.set(0, 12); Map<String, String> map = new HashMap<String, String>(); map.put("pig", "scalability"); tuplein.set(1, map); tuplein.set(2, null); tuplein.set(3, 12L); tuplein.set(4, 1.2F); Tuple innerTuple = TupleFactory.getInstance().newTuple(1); innerTuple.set(0, "innerTuple"); tuplein.set(5, innerTuple); DataBag bag = BagFactory.getInstance().newDefaultBag(); bag.add(innerTuple); tuplein.set(6, bag); testTupleSedes(tuplein); assertEquals( "(12,[pig#scalability],,12,1.2,(innerTuple),{(innerTuple)})", TupleFormat.format(tuplein)); }
public DataBag exec(Tuple input) throws IOException { try { DataBag bag = BagFactory.getInstance().newDefaultBag(); for (int i = 0; i < input.size(); i++) { final Object object = input.get(i); if (object instanceof Tuple) { for (int j = 0; j < ((Tuple) object).size(); j++) { Tuple tp2 = TupleFactory.getInstance().newTuple(1); tp2.set(0, ((Tuple) object).get(j)); bag.add(tp2); } } else { Tuple tp2 = TupleFactory.getInstance().newTuple(1); tp2.set(0, object); bag.add(tp2); } } return bag; } catch (Exception ee) { throw new RuntimeException("Error while creating a bag", ee); } }
public class CalcClassWeight extends EvalFunc<DataBag> { TupleFactory tupleFactory = TupleFactory.getInstance(); BagFactory bagFactory = BagFactory.getInstance(); @Override public DataBag exec(Tuple input) throws IOException { DataBag bag = (DataBag) input.get(0); HashMap<String, Double> clsCnt = new HashMap<String, Double>(); Iterator<Tuple> it = bag.iterator(); Double sum = new Double(0.0); while (it.hasNext()) { Tuple item = (Tuple) it.next(); String cls = (String) item.get(3); if (cls != null && cls.length() > 0) { Double cur = clsCnt.get(cls); Double inc = (Double) item.get(2); if (cur != null) { clsCnt.put(cls, cur + inc); } else { clsCnt.put(cls, inc); } sum += inc; } } Set<Entry<String, Double>> clses = clsCnt.entrySet(); Iterator<Entry<String, Double>> cit = clses.iterator(); DataBag result = bagFactory.newDefaultBag(); while (cit.hasNext()) { Entry<String, Double> cls = cit.next(); Tuple tpl = tupleFactory.newTuple(2); tpl.set(0, cls.getKey()); tpl.set(1, cls.getValue() / sum); result.add(tpl); } return result; } @Override public Schema outputSchema(Schema input) { try { if (input.getFields().size() != 1 || input.getField(0).type != DataType.BAG) { throw new RuntimeException("expect input {bag}"); } Schema bag = input.getField(0).schema.getField(0).schema; if (bag.getFields().size() < 4 || bag.getField(0).type != DataType.CHARARRAY || bag.getField(1).type != DataType.CHARARRAY || bag.getField(2).type != DataType.DOUBLE || bag.getField(3).type != DataType.CHARARRAY) { throw new RuntimeException( "expect input {userid:chararray, " + "md:chararray, weight:double, cls:chararray}"); } Schema result = new Schema(); result.add(new FieldSchema("cls", DataType.CHARARRAY)); result.add(new FieldSchema("weight", DataType.DOUBLE)); return result; } catch (Exception e) { throw new RuntimeException(e); } } }
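The normalization in exec is simply weight-over-sum. With hypothetical accumulated weights {x: 2.0, y: 6.0} the sum is 8.0, so the UDF would emit (x,0.25) and (y,0.75):

import java.util.HashMap;
import java.util.Map;

public class ClassWeightDemo {
    public static void main(String[] args) {
        Map<String, Double> clsCnt = new HashMap<>();
        clsCnt.put("x", 2.0);
        clsCnt.put("y", 6.0);
        double sum = clsCnt.values().stream().mapToDouble(Double::doubleValue).sum();
        clsCnt.forEach((cls, w) -> System.out.println(cls + " -> " + (w / sum)));
        // x -> 0.25, y -> 0.75 (iteration order may vary)
    }
}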
/** * Computes the set difference of two or more bags. Duplicates are eliminated. <b>The input bags * must be sorted.</b> * * <p>If bags A and B are provided, then this computes A-B, i.e. all elements in A that are not in * B. If bags A, B and C are provided, then this computes A-B-C, i.e. all elements in A that are not * in B or C. * * <p>Example: * * <pre>{@code * define SetDifference datafu.pig.sets.SetDifference(); * * -- input: * -- ({(1),(2),(3),(4),(5),(6)},{(3),(4)}) * input = LOAD 'input' AS (B1:bag{T:tuple(val:int)},B2:bag{T:tuple(val:int)}); * * input = FOREACH input { * B1 = ORDER B1 BY val ASC; * B2 = ORDER B2 BY val ASC; * * -- output: * -- ({(1),(2),(5),(6)}) * GENERATE SetDifference(B1,B2); * } * }</pre> */ public class SetDifference extends SetOperationsBase { private static final BagFactory bagFactory = BagFactory.getInstance(); /** * Loads the data bags from the input tuple and puts them in a priority queue, where ordering is * determined by the data from the iterator for each bag. * * <p>The bags are wrapped in a {@link Pair} object that is comparable on the data currently * available from the iterator. These objects are ordered first by the data, then by the index * within the tuple the bag came from. * * @param input * @return priority queue ordered * @throws IOException */ private PriorityQueue<Pair> loadBags(Tuple input) throws IOException { PriorityQueue<Pair> pq = new PriorityQueue<Pair>(input.size()); for (int i = 0; i < input.size(); i++) { if (input.get(i) != null) { Iterator<Tuple> inputIterator = ((DataBag) input.get(i)).iterator(); if (inputIterator.hasNext()) { pq.add(new Pair(inputIterator, i)); } } } return pq; } /** * Counts how many elements in the priority queue match the element at the front of the queue, * which should be from the first bag. * * @param pq priority queue * @return number of matches */ public int countMatches(PriorityQueue<Pair> pq) { Pair nextPair = pq.peek(); Tuple data = nextPair.data; // sanity check if (!nextPair.index.equals(0)) { throw new RuntimeException("Expected next bag to have index 0"); } int matches = 0; for (Pair p : pq) { if (data.equals(p.data)) matches++; } // subtract 1 since element matches itself return matches - 1; } @SuppressWarnings("unchecked") @Override public DataBag exec(Tuple input) throws IOException { if (input.size() < 2) { throw new RuntimeException("Expected at least two inputs, but found " + input.size()); } for (Object o : input) { if (o != null && !(o instanceof DataBag)) { throw new RuntimeException("Inputs must be bags"); } } DataBag outputBag = bagFactory.newDefaultBag(); DataBag bag1 = (DataBag) input.get(0); DataBag bag2 = (DataBag) input.get(1); if (bag1 == null || bag1.size() == 0) { return outputBag; } // optimization else if (input.size() == 2 && (bag2 == null || bag2.size() == 0)) { return bag1; } PriorityQueue<Pair> pq = loadBags(input); Tuple lastData = null; while (true) { Pair nextPair = pq.peek(); // ignore data we've already encountered if (nextPair.data.compareTo(lastData) != 0) { // Only take data from the first bag, where there are no other // bags that have the same data. 
if (nextPair.index.equals(0) && countMatches(pq) == 0) { outputBag.add(nextPair.data); lastData = nextPair.data; } } Pair p = pq.poll(); // only put the bag back into the queue if it still has data if (p.hasNext()) { p.next(); pq.offer(p); } else if (p.index.equals(0)) { // stop when we exhaust all elements from the first bag break; } } return outputBag; } /** * A wrapper for the tuple iterator that implements comparable so it can be used in the priority * queue. * * <p>This is compared first on the data, then on the index the bag came from in the input tuple. */ private static class Pair implements Comparable<Pair> { private final Iterator<Tuple> it; private final Integer index; private Tuple data; /** * Constructs the {@link Pair}. * * @param it tuple iterator * @param index index within the tuple that the bag came from */ public Pair(Iterator<Tuple> it, int index) { this.index = index; this.it = it; this.data = it.next(); } @SuppressWarnings("unchecked") @Override public int compareTo(Pair o) { int r = this.data.compareTo(o.data); if (r == 0) { return index.compareTo(o.index); } else { return r; } } public boolean hasNext() { return it.hasNext(); } @SuppressWarnings("unchecked") public Tuple next() { Tuple nextData = it.next(); // algorithm assumes data is in order if (data.compareTo(nextData) > 0) { throw new RuntimeException("Out of order!"); } this.data = nextData; return this.data; } @Override public String toString() { return String.format("[%s within %d]", data, index); } } }
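A hypothetical direct invocation of the UDF outside Pig, mirroring the javadoc example (the bagOf helper is illustrative only, and the input bags must be sorted):

import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class SetDifferenceDemo {
    private static DataBag bagOf(int... vals) throws Exception {
        DataBag bag = BagFactory.getInstance().newDefaultBag();
        for (int v : vals) {
            Tuple t = TupleFactory.getInstance().newTuple(1);
            t.set(0, v);
            bag.add(t);
        }
        return bag;
    }

    public static void main(String[] args) throws Exception {
        Tuple input = TupleFactory.getInstance().newTuple(2);
        input.set(0, bagOf(1, 2, 3, 4, 5, 6));
        input.set(1, bagOf(3, 4));
        DataBag result = new SetDifference().exec(input);
        System.out.println(result); // {(1),(2),(5),(6)}
    }
}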
/** * The package operator that packages the globally rearranged tuples into the output format required * by co-group. This is the last stage of processing a co-group. This operator has a slightly different * format than other operators in that it takes two things as input: the key being worked on and * the iterator of bags that contain indexed tuples that just need to be packaged into their * appropriate output bags based on the index. */ public class POPackage extends PhysicalOperator { private static final long serialVersionUID = 1L; private static boolean[] SIMPLE_KEY_POSITION; static { SIMPLE_KEY_POSITION = new boolean[1]; SIMPLE_KEY_POSITION[0] = true; } // The iterator of indexed Tuples // that is typically provided by // Hadoop transient Iterator<NullableTuple> tupIter; // The key being worked on Object key; // marker to indicate if key is a tuple protected boolean isKeyTuple = false; // key as a Tuple object (if the key is a tuple) protected Tuple keyAsTuple; // key's type byte keyType; // The number of inputs to this // co-group. 0 indicates a distinct, which means there will only be a // key, no value. int numInputs; // Whether the attached map-reduce plan uses a secondary sort key boolean useSecondaryKey = false; // Denotes if inner is specified // on a particular input boolean[] inner; // flag to denote whether there is a distinct // leading to this package protected boolean distinct = false; // A mapping of input index to the key information obtained from POLocalRearrange // for that index. The key information is a pair of (boolean, Map). // The boolean indicates whether there is a lone project(*) in the // cogroup by. If not, the Map has a mapping of column numbers in the // "value" to column numbers in the "key" which contain the fields in // the "value" protected Map<Integer, Pair<Boolean, Map<Integer, Integer>>> keyInfo; private final transient Log log = LogFactory.getLog(getClass()); protected static final BagFactory mBagFactory = BagFactory.getInstance(); protected static final TupleFactory mTupleFactory = TupleFactory.getInstance(); private boolean firstTime = true; private boolean useDefaultBag = false; public POPackage(OperatorKey k) { this(k, -1, null); } public POPackage(OperatorKey k, int rp) { this(k, rp, null); } public POPackage(OperatorKey k, List<PhysicalOperator> inp) { this(k, -1, inp); } public POPackage(OperatorKey k, int rp, List<PhysicalOperator> inp) { super(k, rp, inp); numInputs = -1; keyInfo = new HashMap<Integer, Pair<Boolean, Map<Integer, Integer>>>(); } @Override public String name() { return "Package" + "[" + DataType.findTypeName(resultType) + "]" + "{" + DataType.findTypeName(keyType) + "}" + " - " + mKey.toString(); } @Override public boolean supportsMultipleInputs() { return false; } @Override public void visit(PhyPlanVisitor v) throws VisitorException { v.visitPackage(this); } @Override public boolean supportsMultipleOutputs() { return false; } /** * Attaches the required inputs * * @param k - the key being worked on * @param inp - iterator of indexed tuples typically obtained from Hadoop */ public void attachInput(PigNullableWritable k, Iterator<NullableTuple> inp) { tupIter = inp; key = k.getValueAsPigType(); if (useSecondaryKey) { try { key = ((Tuple) key).get(0); } catch (ExecException e) { // TODO Exception throw new RuntimeException(e); } } if (isKeyTuple) { // key is a tuple, cache the key as a // tuple for use in the getNext() keyAsTuple = (Tuple) key; } } /** attachInput's better half! 
*/ public void detachInput() { tupIter = null; key = null; } public int getNumInps() { return numInputs; } public void setNumInps(int numInps) { this.numInputs = numInps; } public boolean[] getInner() { return inner; } public void setInner(boolean[] inner) { this.inner = inner; } /** * From the inputs, constructs the output tuple for this co-group in the required format which is * (key, {bag of tuples from input 1}, {bag of tuples from input 2}, ...) */ @Override public Result getNext(Tuple t) throws ExecException { Tuple res; if (firstTime) { firstTime = false; if (PigMapReduce.sJobConf != null) { String bagType = PigMapReduce.sJobConf.get("pig.cachedbag.type"); if (bagType != null && bagType.equalsIgnoreCase("default")) { useDefaultBag = true; } } } if (distinct) { // only set the key which has the whole // tuple res = mTupleFactory.newTuple(1); res.set(0, key); } else { // Create numInputs bags DataBag[] dbs = null; dbs = new DataBag[numInputs]; if (isAccumulative()) { // create bag wrapper to pull tuples in many batches // all bags have reference to the sample tuples buffer // which contains tuples from one batch POPackageTupleBuffer buffer = new POPackageTupleBuffer(); for (int i = 0; i < numInputs; i++) { dbs[i] = new AccumulativeBag(buffer, i); } } else { // create bag to pull all tuples out of iterator for (int i = 0; i < numInputs; i++) { dbs[i] = useDefaultBag ? BagFactory.getInstance().newDefaultBag() // In a very rare case if there is a POStream after this // POPackage in the pipeline and is also blocking the pipeline; // constructor argument should be 2 * numInputs. But for one obscure // case we don't want to pay the penalty all the time. : new InternalCachedBag(numInputs); } // For each indexed tup in the inp, sort them // into their corresponding bags based // on the index while (tupIter.hasNext()) { NullableTuple ntup = tupIter.next(); int index = ntup.getIndex(); Tuple copy = getValueTuple(ntup, index); if (numInputs == 1) { // this is for multi-query merge where // the numInputs is always 1, but the index // (the position of the inner plan in the // enclosed operator) may not be 1. dbs[0].add(copy); } else { dbs[index].add(copy); } if (reporter != null) reporter.progress(); } } // Construct the output tuple by appending // the key and all the above constructed bags // and return it. res = mTupleFactory.newTuple(numInputs + 1); res.set(0, key); int i = -1; for (DataBag bag : dbs) { i++; if (inner[i] && !isAccumulative()) { if (bag.size() == 0) { detachInput(); Result r = new Result(); r.returnStatus = POStatus.STATUS_NULL; return r; } } res.set(i + 1, bag); } } detachInput(); Result r = new Result(); r.result = res; r.returnStatus = POStatus.STATUS_OK; return r; } protected Tuple getValueTuple(NullableTuple ntup, int index) throws ExecException { // Need to make a copy of the value, as hadoop uses the same ntup // to represent each value. Tuple val = (Tuple) ntup.getValueAsPigType(); Tuple copy = null; // The "value (val)" that we just got may not // be the complete "value". It may have some portions // in the "key" (look in POLocalRearrange for more comments) // If this is the case we need to stitch // the "value" together. Pair<Boolean, Map<Integer, Integer>> lrKeyInfo = keyInfo.get(index); boolean isProjectStar = lrKeyInfo.first; Map<Integer, Integer> keyLookup = lrKeyInfo.second; int keyLookupSize = keyLookup.size(); if (keyLookupSize > 0) { // we have some fields of the "value" in the // "key". 
copy = mTupleFactory.newTuple(); int finalValueSize = keyLookupSize + val.size(); int valIndex = 0; // an index for accessing elements from // the value (val) that we have currently for (int i = 0; i < finalValueSize; i++) { Integer keyIndex = keyLookup.get(i); if (keyIndex == null) { // the field for this index is not in the // key - so just take it from the "value" // we were handed copy.append(val.get(valIndex)); valIndex++; } else { // the field for this index is in the key if (isKeyTuple) { // the key is a tuple, extract the // field out of the tuple copy.append(keyAsTuple.get(keyIndex)); } else { copy.append(key); } } } } else if (isProjectStar) { // the whole "value" is present in the "key" copy = mTupleFactory.newTuple(keyAsTuple.getAll()); } else { // there is no field of the "value" in the // "key" - so just make a copy of what we got // as the "value" copy = mTupleFactory.newTuple(val.getAll()); } return copy; } public byte getKeyType() { return keyType; } public void setKeyType(byte keyType) { this.keyType = keyType; } /** * Get the field positions of key in the output tuples. For POPackage, the position is always 0. * The POCombinerPackage, however, can return different values. * * @return the field position of key in the output tuples. */ public boolean[] getKeyPositionsInTuple() { return SIMPLE_KEY_POSITION.clone(); } /** * Make a deep copy of this operator. * * @throws CloneNotSupportedException */ @Override public POPackage clone() throws CloneNotSupportedException { POPackage clone = (POPackage) super.clone(); clone.mKey = new OperatorKey(mKey.scope, NodeIdGenerator.getGenerator().getNextNodeId(mKey.scope)); clone.requestedParallelism = requestedParallelism; clone.resultType = resultType; clone.keyType = keyType; clone.numInputs = numInputs; if (inner != null) { clone.inner = new boolean[inner.length]; for (int i = 0; i < inner.length; i++) { clone.inner[i] = inner[i]; } } else clone.inner = null; return clone; } /** @param keyInfo the keyInfo to set */ public void setKeyInfo(Map<Integer, Pair<Boolean, Map<Integer, Integer>>> keyInfo) { this.keyInfo = keyInfo; } /** @param keyTuple the keyTuple to set */ public void setKeyTuple(boolean keyTuple) { this.isKeyTuple = keyTuple; } /** @return the keyInfo */ public Map<Integer, Pair<Boolean, Map<Integer, Integer>>> getKeyInfo() { return keyInfo; } /** @return the distinct */ public boolean isDistinct() { return distinct; } /** @param distinct the distinct to set */ public void setDistinct(boolean distinct) { this.distinct = distinct; } public void setUseSecondaryKey(boolean useSecondaryKey) { this.useSecondaryKey = useSecondaryKey; } private class POPackageTupleBuffer implements AccumulativeTupleBuffer { private List<Tuple>[] bags; private Iterator<NullableTuple> iter; private int batchSize; private Object currKey; @SuppressWarnings("unchecked") public POPackageTupleBuffer() { batchSize = 20000; if (PigMapReduce.sJobConf != null) { String size = PigMapReduce.sJobConf.get("pig.accumulative.batchsize"); if (size != null) { batchSize = Integer.parseInt(size); } } this.bags = new List[numInputs]; for (int i = 0; i < numInputs; i++) { this.bags[i] = new ArrayList<Tuple>(); } this.iter = tupIter; this.currKey = key; } @Override public boolean hasNextBatch() { return iter.hasNext(); } @Override public void nextBatch() throws IOException { for (int i = 0; i < bags.length; i++) { bags[i].clear(); } key = currKey; for (int i = 0; i < batchSize; i++) { if (iter.hasNext()) { NullableTuple ntup = iter.next(); int index = ntup.getIndex(); 
Tuple copy = getValueTuple(ntup, index); if (numInputs == 1) { // this is for multi-query merge where // the numInputs is always 1, but the index // (the position of the inner plan in the // enclosed operator) may not be 1. bags[0].add(copy); } else { bags[index].add(copy); } } } } public void clear() { for (int i = 0; i < bags.length; i++) { bags[i].clear(); } iter = null; } public Iterator<Tuple> getTuples(int index) { return bags[index].iterator(); } }; }
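To make the output format concrete: for a two-input co-group on key 7, getNext() assembles a tuple of the key followed by one bag per input. A hypothetical illustration (values invented, built by hand rather than by the operator):

import java.util.Arrays;

import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class PackageShapeDemo {
    public static void main(String[] args) throws Exception {
        TupleFactory tf = TupleFactory.getInstance();
        BagFactory bf = BagFactory.getInstance();

        DataBag fromInput1 = bf.newDefaultBag();
        fromInput1.add(tf.newTuple(Arrays.asList(7, "a")));
        fromInput1.add(tf.newTuple(Arrays.asList(7, "b")));

        DataBag fromInput2 = bf.newDefaultBag();
        fromInput2.add(tf.newTuple(Arrays.asList(7, 1.0)));

        Tuple out = tf.newTuple(3);
        out.set(0, 7);          // the group key
        out.set(1, fromInput1); // bag of tuples that arrived with index 0
        out.set(2, fromInput2); // bag of tuples that arrived with index 1
        System.out.println(out); // (7,{(7,a),(7,b)},{(7,1.0)})
    }
}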
public class TestScalarAliasesLocal { private static final String BUILD_TEST_TMP = "build/test/tmp/"; private PigServer pigServer; TupleFactory mTf = TupleFactory.getInstance(); BagFactory mBf = BagFactory.getInstance(); @Before public void setUp() throws Exception { pigServer = new PigServer(Util.getLocalTestMode()); } public static void deleteDirectory(File file) { if (file.exists()) { Util.deleteDirectory(file); } } public static File createLocalInputFile(String filename, String[] inputData) throws IOException { new File(filename).getParentFile().mkdirs(); return Util.createLocalInputFile(filename, inputData); } // See PIG-1434 @Test public void testScalarAliasesBatchNobatch() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20"}; String output = BUILD_TEST_TMP + "table_testScalarAliasesDir"; TestScalarAliases.deleteDirectory(new File(output)); // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesBatch"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.setBatchOn(); pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;"); pigServer.registerQuery("Y = foreach A generate (a0 * C.count), (a1 / C.max);"); pigServer.registerQuery("Store Y into '" + output + "';"); pigServer.executeBatch(); // Check output pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: int, a1: double);"); Iterator<Tuple> iter; Tuple t; iter = pigServer.openIterator("Z"); t = iter.next(); assertTrue(t.toString().equals("(3,0.25)")); t = iter.next(); assertTrue(t.toString().equals("(6,0.5)")); t = iter.next(); assertTrue(t.toString().equals("(9,1.0)")); assertFalse(iter.hasNext()); iter = pigServer.openIterator("Y"); t = iter.next(); assertTrue(t.toString().equals("(3,0.25)")); t = iter.next(); assertTrue(t.toString().equals("(6,0.5)")); t = iter.next(); assertTrue(t.toString().equals("(9,1.0)")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testUseScalarMultipleTimes() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20"}; String outputY = BUILD_TEST_TMP + "table_testUseScalarMultipleTimesOutY"; TestScalarAliases.deleteDirectory(new File(outputY)); String outputZ = BUILD_TEST_TMP + "table_testUseScalarMultipleTimesOutZ"; TestScalarAliases.deleteDirectory(new File(outputZ)); // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testUseScalarMultipleTimes"; TestScalarAliases.createLocalInputFile(inputPath, input); pigServer.setBatchOn(); pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;"); pigServer.registerQuery("Y = foreach A generate (a0 * C.count), (a1 / C.max);"); pigServer.registerQuery("Store Y into '" + outputY + "';"); pigServer.registerQuery("Z = foreach A generate (a1 + C.count), (a0 * C.max);"); pigServer.registerQuery("Store Z into '" + outputZ + "';"); // Test Multiquery store pigServer.executeBatch(); // Check output pigServer.registerQuery("M = LOAD '" + outputY + "' as (a0: int, a1: double);"); Iterator<Tuple> iter; Tuple t; iter = pigServer.openIterator("M"); t = iter.next(); assertTrue(t.toString().equals("(3,0.25)")); t = iter.next(); assertTrue(t.toString().equals("(6,0.5)")); t = iter.next(); 
assertTrue(t.toString().equals("(9,1.0)")); assertFalse(iter.hasNext()); // Check output pigServer.registerQuery("N = LOAD '" + outputZ + "' as (a0: double, a1: double);"); iter = pigServer.openIterator("N"); t = iter.next(); assertTrue(t.toString().equals("(8.0,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(13.0,40.0)")); t = iter.next(); assertTrue(t.toString().equals("(23.0,60.0)")); assertFalse(iter.hasNext()); // Non batch mode iter = pigServer.openIterator("Y"); t = iter.next(); assertTrue(t.toString().equals("(3,0.25)")); t = iter.next(); assertTrue(t.toString().equals("(6,0.5)")); t = iter.next(); assertTrue(t.toString().equals("(9,1.0)")); assertFalse(iter.hasNext()); // Check in non-batch mode iter = pigServer.openIterator("Z"); t = iter.next(); assertTrue(t.toString().equals("(8.0,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(13.0,40.0)")); t = iter.next(); assertTrue(t.toString().equals("(23.0,60.0)")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarWithNoSchema() throws Exception { String[] scalarInput = {"1\t5"}; String[] input = {"1\t5", "2\t10", "3\t20"}; String inputPath = BUILD_TEST_TMP + "table_testScalarWithNoSchema"; TestScalarAliases.createLocalInputFile(inputPath, input); String inputPathScalar = BUILD_TEST_TMP + "table_testScalarWithNoSchemaScalar"; TestScalarAliases.createLocalInputFile(inputPathScalar, scalarInput); // Load A as a scalar pigServer.registerQuery("A = LOAD '" + inputPath + "';"); pigServer.registerQuery("scalar = LOAD '" + inputPathScalar + "' as (count, total);"); pigServer.registerQuery("B = foreach A generate 5 / scalar.total;"); Iterator<Tuple> iter = pigServer.openIterator("B"); Tuple t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarWithTwoBranches() throws Exception { String[] inputA = {"1\t5", "2\t10", "3\t20"}; String[] inputX = {"pig", "hadoop", "rocks"}; String output = BUILD_TEST_TMP + "testScalarWithTwoBranchesDir"; TestScalarAliases.deleteDirectory(new File(output)); // Test the use of scalars in expressions String inputPathA = BUILD_TEST_TMP + "testScalarWithTwoBranchesA"; TestScalarAliases.createLocalInputFile(inputPathA, inputA); String inputPathX = BUILD_TEST_TMP + "testScalarWithTwoBranchesX"; TestScalarAliases.createLocalInputFile(inputPathX, inputX); // Test in script mode pigServer.setBatchOn(); pigServer.registerQuery("A = LOAD '" + inputPathA + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;"); pigServer.registerQuery("X = LOAD '" + inputPathX + "' as (names: chararray);"); pigServer.registerQuery("Y = foreach X generate names, C.max;"); pigServer.registerQuery("Store Y into '" + output + "';"); pigServer.executeBatch(); // Check output pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: chararray, a1: double);"); Iterator<Tuple> iter = pigServer.openIterator("Z"); Tuple t = iter.next(); assertTrue(t.toString().equals("(pig,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(hadoop,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(rocks,20.0)")); assertFalse(iter.hasNext()); // Check in non-batch mode iter = pigServer.openIterator("Y"); t = iter.next(); assertTrue(t.toString().equals("(pig,20.0)")); t = iter.next(); 
assertTrue(t.toString().equals("(hadoop,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(rocks,20.0)")); assertFalse(iter.hasNext()); pigServer.getPigContext().getProperties().remove("tez.am.inline.task.execution.max-tasks"); } // See PIG-1434 @Test public void testFilteredScalarDollarProj() throws Exception { String output = BUILD_TEST_TMP + "table_testFilteredScalarDollarProjDir"; TestScalarAliases.deleteDirectory(new File(output)); String[] input = { "1\t5\t[state#maine,city#portland]\t{(a),(b)}\t(a,b)", "2\t10\t\t\t", "3\t20\t\t\t" }; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testFilteredScalarDollarProj"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.setBatchOn(); pigServer.registerQuery( "A = LOAD '" + inputPath + "'" + " as (a0: long, a1: double, a2 : bytearray, " + "a3: bag{ t : tuple(tc : chararray)}, " + "a4: tuple(c1 : chararray, c2 : chararray) );"); pigServer.registerQuery("B = filter A by $1 < 8;"); pigServer.registerQuery( "Y = foreach A generate (a0 * B.$0), (a1 / B.$1), B.$2, B.$2#'state', B.$3, B.a4;"); pigServer.registerQuery("Store Y into '" + output + "';"); pigServer.explain("Y", System.err); pigServer.executeBatch(); // Check output pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: int, a1: double);"); pigServer.explain("Z", System.err); Iterator<Tuple> iter = pigServer.openIterator("Z"); Tuple t = iter.next(); assertTrue(t.toString().equals("(1,1.0)")); t = iter.next(); assertTrue(t.toString().equals("(2,2.0)")); t = iter.next(); assertTrue(t.toString().equals("(3,4.0)")); assertFalse(iter.hasNext()); // Check in non-batch mode iter = pigServer.openIterator("Y"); t = iter.next(); assertEquals(t.toString(), "(1,1.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))"); t = iter.next(); assertEquals(t.toString(), "(2,2.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))"); t = iter.next(); assertEquals(t.toString(), "(3,4.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))"); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarWithNoSchemaDollarProj() throws Exception { String[] scalarInput = {"1\t5"}; String[] input = {"1\t5", "2\t10", "3\t20"}; String inputPath = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProj"; TestScalarAliases.createLocalInputFile(inputPath, input); String inputPathScalar = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProjScalar"; TestScalarAliases.createLocalInputFile(inputPathScalar, scalarInput); // Load A as a scalar pigServer.registerQuery("A = LOAD '" + inputPath + "';"); pigServer.registerQuery("scalar = LOAD '" + inputPathScalar + "';"); pigServer.registerQuery("B = foreach A generate 5 / scalar.$1;"); Iterator<Tuple> iter = pigServer.openIterator("B"); Tuple t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarAliasesJoinClause() throws Exception { String[] inputA = {"1\t5", "2\t10", "3\t20"}; String[] inputB = {"Total3\tthree", "Total2\ttwo", "Total1\tone"}; // Test the use of scalars in expressions String inputPathA = BUILD_TEST_TMP + "table_testScalarAliasesJoinClauseA"; TestScalarAliases.createLocalInputFile(inputPathA, inputA); String inputPathB = BUILD_TEST_TMP + "table_testScalarAliasesJoinClauseB"; TestScalarAliases.createLocalInputFile(inputPathB, inputB); // Test in script mode 
pigServer.registerQuery("A = LOAD '" + inputPathA + "' as (a0, a1);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate COUNT(A) as count;"); pigServer.registerQuery("B = LOAD '" + inputPathB + "' as (b0:chararray, b1:chararray);"); pigServer.registerQuery("Y = join A by CONCAT('Total', (chararray)C.count), B by $0;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); String[] expected = new String[] {"(1,5,Total3,three)", "(2,10,Total3,three)", "(3,20,Total3,three)"}; Util.checkQueryOutputsAfterSortRecursive( iter, expected, org.apache.pig.newplan.logical.Util.translateSchema(pigServer.dumpSchema("Y"))); } // See PIG-1434 @Test public void testScalarAliasesFilterClause() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20", "4\t12", "5\t8"}; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesFilterClause"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0, a1);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate AVG(A.$1) as average;"); pigServer.registerQuery("Y = filter A by a1 > C.average;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); // Average is 11 Tuple t = iter.next(); assertTrue(t.toString().equals("(3,20)")); t = iter.next(); assertTrue(t.toString().equals("(4,12)")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarAliasesGrammarNegative() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20"}; String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesGrammar"; TestScalarAliases.createLocalInputFile(inputPath, input); try { pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A);"); // Only projections of C are supported pigServer.registerQuery("Y = foreach A generate C;"); pigServer.openIterator("Y"); // Control should not reach here fail("Scalar projections are only supported"); } catch (IOException pe) { assertTrue(pe.getMessage().contains("Invalid scalar projection: C")); } } // See PIG-1636 @Test public void testScalarAliasesLimit() throws Exception { String[] input = {"a\t1", "b\t2", "c\t3", "a\t4", "c\t5"}; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesLimit"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0:chararray, a1: int);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate SUM(A.$1) as total;"); pigServer.registerQuery("C1 = limit C 1;"); pigServer.registerQuery("Y = foreach A generate a0, a1 * (double)C1.total;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); // Average is 11 Tuple t = iter.next(); assertTrue(t.toString().equals("(a,15.0)")); t = iter.next(); assertTrue(t.toString().equals("(b,30.0)")); t = iter.next(); assertTrue(t.toString().equals("(c,45.0)")); t = iter.next(); assertTrue(t.toString().equals("(a,60.0)")); t = iter.next(); assertTrue(t.toString().equals("(c,75.0)")); assertFalse(iter.hasNext()); } /** * Test that a specific string is included in the error message when an exception is thrown for * using a relation in a scalar context without projecting any columns out of it */ // See PIG-1788 @Test public void testScalarWithNoProjection() 
throws Exception { String query = " A = load 'table_testScalarWithNoProjection' as (x, y);" + " B = group A by x;" + // B is unintentionally being used as scalar, // the user intends it to be COUNT(A) " C = foreach B generate COUNT(B);"; Util.checkExceptionMessage( query, "C", "A column needs to be projected from a relation" + " for it to be used as a scalar"); } @Test public void testScalarNullValue() throws Exception { Storage.Data data = Storage.resetData(pigServer); data.set("input", Storage.tuple("a", 1), Storage.tuple("b", 2)); pigServer.setBatchOn(); pigServer.registerQuery("A = load 'input' using mock.Storage() as (a:chararray, b:int);"); pigServer.registerQuery("B = FILTER A by a == 'c';"); pigServer.registerQuery("C = FOREACH A generate a, b + B.b;"); pigServer.registerQuery("store C into 'output' using mock.Storage();"); pigServer.executeBatch(); List<Tuple> actualResults = data.get("output"); List<Tuple> expectedResults = Util.getTuplesFromConstantTupleStrings(new String[] {"('a', null)", "('b', null)"}); Util.checkQueryOutputsAfterSort(actualResults.iterator(), expectedResults); } }
/** * Calls getNext to get the next ForEach result. The input for POJoinPackage is a (key, NullableTuple) * pair. We will materialize n-1 inputs into bags and feed input#n one tuple at a time to the delegated * ForEach operator; the input for ForEach is * * <p>(input#1, input#2, input#3....input#n[i]), i=(1..k), supposing input#n consists of k tuples. * For every ForEach input, pull all the results from ForEach. getNext will be * called multiple times for a particular input; it returns one output tuple from ForEach every * time we call getNext, so we need to maintain internal state to keep track of where we are. */ @Override public Result getNext(Tuple t) throws ExecException { if (firstTime) { firstTime = false; if (PigMapReduce.sJobConf != null) { String bagType = PigMapReduce.sJobConf.get("pig.cachedbag.type"); if (bagType != null && bagType.equalsIgnoreCase("default")) { useDefaultBag = true; } } } // if a previous call to foreach.getNext() // has still not returned all output, process it if (forEach.processingPlan) { forEachResult = forEach.getNext(t1); switch (forEachResult.returnStatus) { case POStatus.STATUS_OK: case POStatus.STATUS_NULL: case POStatus.STATUS_ERR: return forEachResult; case POStatus.STATUS_EOP: break; } } NullableTuple it = null; // If we see a new NullableTupleIterator, materialize n-1 inputs, construct ForEach input // tuple res = (key, input#1, input#2....input#n), the only missing value is input#n, // we will get input#n one tuple at a time, fill in res, feed to ForEach. // After this block, we have the first tuple of input#n in hand (kept in variable it) if (newKey) { lastInputTuple = false; // Put n-1 inputs into bags dbs = new DataBag[numInputs]; for (int i = 0; i < numInputs - 1; i++) { dbs[i] = useDefaultBag ? BagFactory.getInstance().newDefaultBag() // In a very rare case if there is a POStream after this // POJoinPackage in the pipeline and is also blocking the pipeline; // constructor argument should be 2 * numInputs. But for one obscure // case we don't want to pay the penalty all the time. : new InternalCachedBag(numInputs - 1); } // For last bag, we always use NonSpillableBag. dbs[lastBagIndex] = new NonSpillableDataBag((int) chunkSize); // For each Nullable tuple in the input, put it // into the corresponding bag based on the index, // except for the last input, which we will stream // The tuples will arrive in the order of the index, // starting from index 0 and such that all tuples for // a given index arrive before a tuple for the next // index does. 
while (tupIter.hasNext()) { it = tupIter.next(); int itIndex = it.getIndex(); if (itIndex != numInputs - 1) { dbs[itIndex].add(getValueTuple(it, itIndex)); } else { lastInputTuple = true; break; } if (reporter != null) reporter.progress(); } // If we don't have any tuple for input#n // we do not need any further process, return EOP if (!lastInputTuple) { // we will return at this point because we ought // to be having a flatten on this last input // and we have an empty bag which should result // in this key being taken out of the output newKey = true; return eopResult; } res = mTupleFactory.newTuple(numInputs + 1); for (int i = 0; i < dbs.length; i++) res.set(i + 1, dbs[i]); res.set(0, key); // if we have an inner anywhere and the corresponding // bag is empty, we can just return for (int i = 0; i < dbs.length - 1; i++) { if (inner[i] && dbs[i].size() == 0) { detachInput(); return eopResult; } } newKey = false; // set up the bag with last input to contain // a chunk of CHUNKSIZE values OR the entire bag if // it has less than CHUNKSIZE values - the idea is in most // cases the values are > CHUNKSIZE in number and in // those cases we will be sending the last bag // as a set of smaller chunked bags thus holding lesser // in memory // the first tuple can be directly retrieved from "it" dbs[lastBagIndex].add(getValueTuple(it, it.getIndex())); for (int i = 0; i < chunkSize - 1 && tupIter.hasNext(); i++) { it = tupIter.next(); dbs[lastBagIndex].add(getValueTuple(it, it.getIndex())); } // Attach the input to forEach forEach.attachInput(res); // pull output tuple from ForEach Result forEachResult = forEach.getNext(t1); { switch (forEachResult.returnStatus) { case POStatus.STATUS_OK: case POStatus.STATUS_NULL: case POStatus.STATUS_ERR: return forEachResult; case POStatus.STATUS_EOP: break; } } } // Keep attaching input tuple to ForEach, until: // 1. We can initialize ForEach.getNext(); // 2. There is no more input#n while (true) { if (tupIter.hasNext()) { // try setting up a bag of CHUNKSIZE OR // the remainder of the bag of last input // (if < CHUNKSIZE) to foreach dbs[lastBagIndex].clear(); // clear last chunk for (int i = 0; i < chunkSize && tupIter.hasNext(); i++) { it = tupIter.next(); dbs[lastBagIndex].add(getValueTuple(it, it.getIndex())); } } else // if we do not have any more tuples for input#n, return EOP { detachInput(); newKey = true; return eopResult; } // Attach the input to forEach forEach.attachInput(res); // pull output tuple from ForEach Result forEachResult = forEach.getNext(t1); { switch (forEachResult.returnStatus) { case POStatus.STATUS_OK: case POStatus.STATUS_NULL: case POStatus.STATUS_ERR: return forEachResult; case POStatus.STATUS_EOP: break; } } } }
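Stripped of the operator plumbing, the chunking of the last input reduces to pulling at most chunkSize tuples per batch from the iterator and re-attaching each batch. A minimal sketch (hypothetical, plain Java):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class ChunkDemo {
    // Pull at most chunkSize items per batch, mirroring how POJoinPackage
    // refills the last bag before re-attaching it to the ForEach.
    static <T> List<T> nextChunk(Iterator<T> it, int chunkSize) {
        List<T> chunk = new ArrayList<>();
        for (int i = 0; i < chunkSize && it.hasNext(); i++) {
            chunk.add(it.next());
        }
        return chunk;
    }

    public static void main(String[] args) {
        Iterator<Integer> it = Arrays.asList(1, 2, 3, 4, 5).iterator();
        List<Integer> chunk;
        while (!(chunk = nextChunk(it, 2)).isEmpty()) {
            System.out.println(chunk); // [1, 2] then [3, 4] then [5]
        }
    }
}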
/**
 * Returns a new bag containing the given tuple followed by all tuples of the input bag,
 * i.e. prepends t to inputBag. The input bag itself is left unmodified.
 */
public DataBag call(DataBag inputBag, Tuple t) throws IOException {
  DataBag outputBag = BagFactory.getInstance().newDefaultBag();
  outputBag.add(t);
  for (Tuple x : inputBag) outputBag.add(x);
  return outputBag;
}
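// A hedged usage sketch for the prepend helper above. The enclosing class is not shown in this
// excerpt, so "BagPrepend" is an assumed name. Note that every call copies the entire input
// bag, so repeatedly prepending to a large bag costs O(n) per call.
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class BagPrependDemo {
  public static void main(String[] args) throws Exception {
    TupleFactory tf = TupleFactory.getInstance();
    DataBag in = BagFactory.getInstance().newDefaultBag();
    in.add(tf.newTuple("a"));
    in.add(tf.newTuple("b"));
    Tuple head = tf.newTuple("first");
    DataBag out = new BagPrepend().call(in, head); // hypothetical enclosing class
    System.out.println(out); // {(first),(a),(b)}; the input bag is left unmodified
  }
}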
/**
 * This class provides standard conversions between utf8 encoded data and pig data types. It is
 * intended to be extended by load and store functions (such as {@link PigStorage}).
 */
public class Utf8StorageConverter implements LoadStoreCaster {
  protected BagFactory mBagFactory = BagFactory.getInstance();
  protected TupleFactory mTupleFactory = TupleFactory.getInstance();
  protected final Log mLog = LogFactory.getLog(getClass());

  private static final Integer mMaxInt = Integer.valueOf(Integer.MAX_VALUE);
  private static final Integer mMinInt = Integer.valueOf(Integer.MIN_VALUE);
  private static final Long mMaxLong = Long.valueOf(Long.MAX_VALUE);
  private static final Long mMinLong = Long.valueOf(Long.MIN_VALUE);
  private static final int BUFFER_SIZE = 1024;

  public Utf8StorageConverter() {}

  private char findStartChar(char start) throws IOException {
    switch (start) {
      case ')':
        return '(';
      case ']':
        return '[';
      case '}':
        return '{';
      default:
        throw new IOException("Unknown start character");
    }
  }

  private DataBag consumeBag(PushbackInputStream in, ResourceFieldSchema fieldSchema)
      throws IOException {
    if (fieldSchema == null) {
      throw new IOException("Schema is null");
    }
    ResourceFieldSchema[] fss = fieldSchema.getSchema().getFields();
    Tuple t;
    int buf;
    while ((buf = in.read()) != '{') {
      if (buf == -1) {
        throw new IOException("Unexpected end of bag");
      }
    }
    if (fss.length != 1) throw new IOException("Only tuple is allowed inside bag schema");
    ResourceFieldSchema fs = fss[0];
    DataBag db = DefaultBagFactory.getInstance().newDefaultBag();
    while (true) {
      t = consumeTuple(in, fs);
      if (t != null) db.add(t);
      while ((buf = in.read()) != '}' && buf != ',') {
        if (buf == -1) {
          throw new IOException("Unexpected end of bag");
        }
      }
      if (buf == '}') break;
    }
    return db;
  }

  private Tuple consumeTuple(PushbackInputStream in, ResourceFieldSchema fieldSchema)
      throws IOException {
    if (fieldSchema == null) {
      throw new IOException("Schema is null");
    }
    int buf;
    ByteArrayOutputStream mOut;
    // Skip ahead to the opening '('. A '}' means the enclosing bag has ended,
    // so push it back and return null.
    while ((buf = in.read()) != '(') {
      if (buf == -1) {
        throw new IOException("Unexpected end of tuple");
      }
      if (buf == '}') {
        in.unread(buf);
        return null;
      }
    }
    Tuple t = TupleFactory.getInstance().newTuple();
    if (fieldSchema.getSchema() != null && fieldSchema.getSchema().getFields().length != 0) {
      ResourceFieldSchema[] fss = fieldSchema.getSchema().getFields();
      // Interpret the items inside the tuple one by one based on the inner schema.
      for (int i = 0; i < fss.length; i++) {
        Object field;
        ResourceFieldSchema fs = fss[i];
        int delimit = ',';
        if (i == fss.length - 1) delimit = ')';
        if (DataType.isComplex(fs.getType())) {
          field = consumeComplexType(in, fs);
          while ((buf = in.read()) != delimit) {
            if (buf == -1) {
              throw new IOException("Unexpected end of tuple");
            }
          }
        } else {
          mOut = new ByteArrayOutputStream(BUFFER_SIZE);
          while ((buf = in.read()) != delimit) {
            if (buf == -1) {
              throw new IOException("Unexpected end of tuple");
            }
            mOut.write(buf);
          }
          field = parseSimpleType(mOut.toByteArray(), fs);
        }
        t.append(field);
      }
    } else {
      // No inner schema: treat everything inside the tuple as bytearray. The deque keeps
      // track of nested tuple/bag/map openers; we do not interpret nested items, we save
      // them as bytearray.
      Deque<Character> level = new LinkedList<Character>();
      mOut = new ByteArrayOutputStream(BUFFER_SIZE);
      while (true) {
        buf = in.read();
        if (buf == -1) {
          throw new IOException("Unexpected end of tuple");
        }
        if (buf == '[' || buf == '{' || buf == '(') {
          level.push((char) buf);
          mOut.write(buf);
        } else if (buf == ')' && level.isEmpty()) {
          // End of tuple
          DataByteArray value = new DataByteArray(mOut.toByteArray());
          t.append(value);
          break;
        } else if (buf == ',' && level.isEmpty()) {
          DataByteArray value = new DataByteArray(mOut.toByteArray());
          t.append(value);
          mOut.reset();
        } else if (buf == ']' || buf == '}' || buf == ')') {
          if (level.peek() == findStartChar((char) buf)) level.pop();
          else throw new IOException("Malformed tuple");
          mOut.write(buf);
        } else {
          mOut.write(buf);
        }
      }
    }
    return t;
  }

  private Map<String, Object> consumeMap(PushbackInputStream in, ResourceFieldSchema fieldSchema)
      throws IOException {
    int buf;
    while ((buf = in.read()) != '[') {
      if (buf == -1) {
        throw new IOException("Unexpected end of map");
      }
    }
    HashMap<String, Object> m = new HashMap<String, Object>();
    ByteArrayOutputStream mOut = new ByteArrayOutputStream(BUFFER_SIZE);
    while (true) {
      // Read the key; we assume a key cannot contain special characters
      // such as #, (, [, {, }, ], or ).
      while ((buf = in.read()) != '#') {
        if (buf == -1) {
          throw new IOException("Unexpected end of map");
        }
        mOut.write(buf);
      }
      String key = bytesToCharArray(mOut.toByteArray());
      if (key.length() == 0) throw new IOException("Map key can not be empty");
      // Read the value. The deque keeps track of nested tuple/bag/map openers;
      // we do not interpret nested items, we save them as bytearray.
      mOut.reset();
      Deque<Character> level = new LinkedList<Character>();
      while (true) {
        buf = in.read();
        if (buf == -1) {
          throw new IOException("Unexpected end of map");
        }
        if (buf == '[' || buf == '{' || buf == '(') {
          level.push((char) buf);
        } else if (buf == ']' && level.isEmpty()) {
          // End of map
          break;
        } else if (buf == ']' || buf == '}' || buf == ')') {
          if (level.isEmpty()) throw new IOException("Malformed map");
          if (level.peek() == findStartChar((char) buf)) level.pop();
        } else if (buf == ',' && level.isEmpty()) {
          // Current map item complete
          break;
        }
        mOut.write(buf);
      }
      Object value = null;
      if (fieldSchema != null && fieldSchema.getSchema() != null && mOut.size() > 0) {
        value = bytesToObject(mOut.toByteArray(), fieldSchema.getSchema().getFields()[0]);
      } else if (mOut.size() > 0) {
        // untyped map
        value = new DataByteArray(mOut.toByteArray());
      }
      m.put(key, value);
      mOut.reset();
      if (buf == ']') break;
    }
    return m;
  }

  private Object bytesToObject(byte[] b, ResourceFieldSchema fs) throws IOException {
    Object field;
    if (DataType.isComplex(fs.getType())) {
      ByteArrayInputStream bis = new ByteArrayInputStream(b);
      PushbackInputStream in = new PushbackInputStream(bis);
      field = consumeComplexType(in, fs);
    } else {
      field = parseSimpleType(b, fs);
    }
    return field;
  }

  private Object consumeComplexType(PushbackInputStream in, ResourceFieldSchema complexFieldSchema)
      throws IOException {
    Object field;
    switch (complexFieldSchema.getType()) {
      case DataType.BAG:
        field = consumeBag(in, complexFieldSchema);
        break;
      case DataType.TUPLE:
        field = consumeTuple(in, complexFieldSchema);
        break;
      case DataType.MAP:
        field = consumeMap(in, complexFieldSchema);
        break;
      default:
        throw new IOException("Unknown complex data type");
    }
    return field;
  }

  private Object parseSimpleType(byte[] b, ResourceFieldSchema simpleFieldSchema)
      throws IOException {
    Object field;
    switch (simpleFieldSchema.getType()) {
      case DataType.INTEGER:
        field = bytesToInteger(b);
        break;
      case DataType.LONG:
        field = bytesToLong(b);
        break;
      case DataType.FLOAT:
        field = bytesToFloat(b);
        break;
      case DataType.DOUBLE:
        field = bytesToDouble(b);
        break;
      case DataType.CHARARRAY:
        field = bytesToCharArray(b);
        break;
      case DataType.BYTEARRAY:
        field = new DataByteArray(b);
        break;
      case DataType.BOOLEAN:
        field = bytesToBoolean(b);
        break;
      case DataType.BIGINTEGER:
        field = bytesToBigInteger(b);
        break;
      case DataType.BIGDECIMAL:
        field = bytesToBigDecimal(b);
        break;
      case DataType.DATETIME:
        field = bytesToDateTime(b);
        break;
      default:
        throw new IOException("Unknown simple data type");
    }
    return field;
  }

  @Override
  public DataBag bytesToBag(byte[] b, ResourceFieldSchema schema) throws IOException {
    if (b == null) return null;
    DataBag db;
    try {
      ByteArrayInputStream bis = new ByteArrayInputStream(b);
      PushbackInputStream in = new PushbackInputStream(bis);
      db = consumeBag(in, schema);
    } catch (IOException e) {
      LogUtils.warn(
          this,
          "Unable to interpret value "
              + Arrays.toString(b)
              + " in field being "
              + "converted to type bag, caught IOException <"
              + e.getMessage()
              + "> field discarded",
          PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
          mLog);
      return null;
    }
    return db;
  }

  @Override
  public String bytesToCharArray(byte[] b) throws IOException {
    if (b == null) return null;
    return new String(b, "UTF-8");
  }

  @Override
  public Double bytesToDouble(byte[] b) {
    if (b == null || b.length == 0) {
      return null;
    }
    try {
      return Double.valueOf(new String(b));
    } catch (NumberFormatException nfe) {
      LogUtils.warn(
          this,
          "Unable to interpret value "
              + Arrays.toString(b)
              + " in field being "
              + "converted to double, caught NumberFormatException <"
              + nfe.getMessage()
              + "> field discarded",
          PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
          mLog);
      return null;
    }
  }

  @Override
  public Float bytesToFloat(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
      return null;
    }
    String s;
    if (b.length > 0 && (b[b.length - 1] == 'F' || b[b.length - 1] == 'f')) {
      // Strip the trailing F/f that Pig allows on float literals.
      s = new String(b, 0, b.length - 1);
    } else {
      s = new String(b);
    }
    try {
      return Float.valueOf(s);
    } catch (NumberFormatException nfe) {
      LogUtils.warn(
          this,
          "Unable to interpret value "
              + Arrays.toString(b)
              + " in field being "
              + "converted to float, caught NumberFormatException <"
              + nfe.getMessage()
              + "> field discarded",
          PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
          mLog);
      return null;
    }
  }

  @Override
  public Boolean bytesToBoolean(byte[] b) throws IOException {
    if (b == null) return null;
    String s = new String(b);
    if (s.equalsIgnoreCase("true")) {
      return Boolean.TRUE;
    } else if (s.equalsIgnoreCase("false")) {
      return Boolean.FALSE;
    } else {
      return null;
    }
  }

  /**
   * Sanity check of whether this number is a valid integer or long.
   *
   * @param number the number to check
   * @return true if it doesn't contain any invalid characters, i.e. only contains digits and '-'
   */
  private static boolean sanityCheckIntegerLong(String number) {
    for (int i = 0; i < number.length(); i++) {
      if (number.charAt(i) >= '0' && number.charAt(i) <= '9'
          || i == 0 && number.charAt(i) == '-') {
        // valid character
      } else {
        // contains invalid characters, cannot be an integer or long
        return false;
      }
    }
    return true;
  }

  @Override
  public Integer bytesToInteger(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
      return null;
    }
    String s = new String(b);
    s = s.trim();
    Integer ret = null;
    // See PIG-2835. Using exception handling to check whether the value is
    // a double is very expensive, so we run our own sanity check first.
    if (sanityCheckIntegerLong(s)) {
      try {
        ret = Integer.valueOf(s);
      } catch (NumberFormatException nfe) {
      }
    }
    if (ret == null) {
      // It's possible that this field can be interpreted as a double.
      // Unfortunately Java doesn't handle this in Integer.valueOf, so
      // we need to try to convert it to a double and, if that works,
      // truncate to an int.
      try {
        Double d = Double.valueOf(s);
        // Need to check for an overflow error
        if (Double.compare(d.doubleValue(), mMaxInt.doubleValue() + 1) >= 0
            || Double.compare(d.doubleValue(), mMinInt.doubleValue() - 1) <= 0) {
          LogUtils.warn(
              this, "Value " + d + " too large for integer", PigWarning.TOO_LARGE_FOR_INT, mLog);
          return null;
        }
        return Integer.valueOf(d.intValue());
      } catch (NumberFormatException nfe2) {
        LogUtils.warn(
            this,
            "Unable to interpret value "
                + Arrays.toString(b)
                + " in field being "
                + "converted to int, caught NumberFormatException <"
                + nfe2.getMessage()
                + "> field discarded",
            PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
            mLog);
        return null;
      }
    }
    return ret;
  }

  @Override
  public Long bytesToLong(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
      return null;
    }
    String s = new String(b).trim();
    if (s.endsWith("l") || s.endsWith("L")) {
      s = s.substring(0, s.length() - 1);
    }
    // See PIG-2835. Using exception handling to check whether the value is
    // a double is very expensive, so we run our own sanity check first.
    Long ret = null;
    if (sanityCheckIntegerLong(s)) {
      try {
        ret = Long.valueOf(s);
      } catch (NumberFormatException nfe) {
      }
    }
    if (ret == null) {
      // It's possible that this field can be interpreted as a double.
      // Unfortunately Java doesn't handle this in Long.valueOf, so
      // we need to try to convert it to a double and, if that works,
      // truncate to a long.
      try {
        Double d = Double.valueOf(s);
        // Need to check for an overflow error
        if (Double.compare(d.doubleValue(), mMaxLong.doubleValue() + 1) > 0
            || Double.compare(d.doubleValue(), mMinLong.doubleValue() - 1) < 0) {
          LogUtils.warn(
              this, "Value " + d + " too large for long", PigWarning.TOO_LARGE_FOR_INT, mLog);
          return null;
        }
        return Long.valueOf(d.longValue());
      } catch (NumberFormatException nfe2) {
        LogUtils.warn(
            this,
            "Unable to interpret value "
                + Arrays.toString(b)
                + " in field being "
                + "converted to long, caught NumberFormatException <"
                + nfe2.getMessage()
                + "> field discarded",
            PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
            mLog);
        return null;
      }
    }
    return ret;
  }

  @Override
  public DateTime bytesToDateTime(byte[] b) throws IOException {
    if (b == null) {
      return null;
    }
    try {
      String dtStr = new String(b);
      return ToDate.extractDateTime(dtStr);
    } catch (IllegalArgumentException e) {
      LogUtils.warn(
          this,
          "Unable to interpret value "
              + Arrays.toString(b)
              + " in field being "
              + "converted to datetime, caught IllegalArgumentException <"
              + e.getMessage()
              + "> field discarded",
          PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
          mLog);
      return null;
    }
  }

  @Override
  public Map<String, Object> bytesToMap(byte[] b, ResourceFieldSchema fieldSchema)
      throws IOException {
    if (b == null) return null;
    Map<String, Object> map;
    try {
      ByteArrayInputStream bis = new ByteArrayInputStream(b);
      PushbackInputStream in = new PushbackInputStream(bis);
      map = consumeMap(in, fieldSchema);
    } catch (IOException e) {
      LogUtils.warn(
          this,
          "Unable to interpret value "
              + Arrays.toString(b)
              + " in field being "
              + "converted to type map, caught IOException <"
              + e.getMessage()
              + "> field discarded",
          PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
          mLog);
      return null;
    }
    return map;
  }

  @Override
  public Map<String, Object> bytesToMap(byte[] b) throws IOException {
    return bytesToMap(b, null);
  }

  @Override
  public Tuple bytesToTuple(byte[] b, ResourceFieldSchema fieldSchema) throws IOException {
    if (b == null) return null;
    Tuple t;
    try {
      ByteArrayInputStream bis = new ByteArrayInputStream(b);
      PushbackInputStream in = new PushbackInputStream(bis);
      t = consumeTuple(in, fieldSchema);
    } catch (IOException e) {
      LogUtils.warn(
          this,
          "Unable to interpret value "
              + Arrays.toString(b)
              + " in field being "
              + "converted to type tuple, caught IOException <"
              + e.getMessage()
              + "> field discarded",
          PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
          mLog);
      return null;
    }
    return t;
  }

  @Override
  public BigInteger bytesToBigInteger(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
      return null;
    }
    return new BigInteger(new String(b));
  }

  @Override
  public BigDecimal bytesToBigDecimal(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
      return null;
    }
    return new BigDecimal(new String(b));
  }

  @Override
  public byte[] toBytes(DataBag bag) throws IOException {
    return bag.toString().getBytes();
  }

  @Override
  public byte[] toBytes(String s) throws IOException {
    return s.getBytes();
  }

  @Override
  public byte[] toBytes(Double d) throws IOException {
    return d.toString().getBytes();
  }

  @Override
  public byte[] toBytes(Float f) throws IOException {
    return f.toString().getBytes();
  }

  @Override
  public byte[] toBytes(Integer i) throws IOException {
    return i.toString().getBytes();
  }

  @Override
  public byte[] toBytes(Long l) throws IOException {
    return l.toString().getBytes();
  }

  @Override
  public byte[] toBytes(Boolean b) throws IOException {
    return b.toString().getBytes();
  }

  @Override
  public byte[] toBytes(DateTime dt) throws IOException {
    return dt.toString().getBytes();
  }

  @Override
  public byte[] toBytes(Map<String, Object> m) throws IOException {
    return DataType.mapToString(m).getBytes();
  }

  @Override
  public byte[] toBytes(Tuple t) throws IOException {
    return t.toString().getBytes();
  }

  @Override
  public byte[] toBytes(DataByteArray a) throws IOException {
    return a.get();
  }

  @Override
  public byte[] toBytes(BigInteger bi) throws IOException {
    return bi.toString().getBytes();
  }

  @Override
  public byte[] toBytes(BigDecimal bd) throws IOException {
    return bd.toString().getBytes();
  }
}
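// A hedged driver for the converter above; the class name is hypothetical and only methods
// defined in Utf8StorageConverter are used. It exercises the PIG-2835 digits-only fast path,
// its double fallback, and the schema-less map conversion, which yields DataByteArray values.
import java.nio.charset.StandardCharsets;
import java.util.Map;

public class Utf8ConverterDemo {
  public static void main(String[] args) throws Exception {
    Utf8StorageConverter conv = new Utf8StorageConverter();

    // Digits-only text passes sanityCheckIntegerLong and parses directly.
    System.out.println(conv.bytesToInteger("42".getBytes(StandardCharsets.UTF_8))); // 42

    // "3.9" fails the digits-only check, so the double fallback parses it
    // and the value is truncated toward zero.
    System.out.println(conv.bytesToInteger("3.9".getBytes(StandardCharsets.UTF_8))); // 3

    // A trailing L/l is stripped before a long is parsed.
    System.out.println(conv.bytesToLong("7L".getBytes(StandardCharsets.UTF_8))); // 7

    // Without a schema, map values come back as DataByteArray.
    Map<String, Object> m = conv.bytesToMap("[k1#v1,k2#v2]".getBytes(StandardCharsets.UTF_8));
    System.out.println(m); // e.g. {k1=v1, k2=v2} (HashMap iteration order is unspecified)
  }
}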
/**
 * Scalable simple random sampling.
 *
 * <p>This UDF implements the scalable simple random sampling algorithm described in
 *
 * <pre>
 * X. Meng, Scalable Simple Random Sampling and Stratified Sampling, ICML 2013.
 * </pre>
 *
 * It takes a sampling probability p as input and outputs a simple random sample of size exactly
 * ceil(p*n) with probability at least 99.99%, where n is the size of the population. This UDF is
 * very useful for stratified sampling. For example,
 *
 * <pre>
 * DEFINE SRS datafu.pig.sampling.SimpleRandomSample('0.01');
 * examples = LOAD ...
 * grouped = GROUP examples BY label;
 * sampled = FOREACH grouped GENERATE FLATTEN(SRS(examples));
 * STORE sampled ...
 * </pre>
 *
 * We note that, in a Java Hadoop job, pre-selected records could be output directly using
 * MultipleOutputs. However, that feature is not available in a Pig UDF, so pre-selected records
 * still go through the sort phase. As long as the sample size is not huge, this should not be a
 * big problem.
 *
 * @author ximeng
 */
public class SimpleRandomSample extends AlgebraicEvalFunc<DataBag> {
  private static final TupleFactory tupleFactory = TupleFactory.getInstance();
  private static final BagFactory bagFactory = BagFactory.getInstance();

  public SimpleRandomSample() {}

  public SimpleRandomSample(String samplingProbability) {
    double p = Double.parseDouble(samplingProbability);
    if (p < 0.0 || p > 1.0) {
      throw new IllegalArgumentException("Sampling probability must be inside [0, 1].");
    }
  }

  @Override
  public String getInitial() {
    return Initial.class.getName();
  }

  @Override
  public String getIntermed() {
    return Intermediate.class.getName();
  }

  @Override
  public String getFinal() {
    return Final.class.getName();
  }

  @Override
  public Schema outputSchema(Schema input) {
    try {
      Schema.FieldSchema inputFieldSchema = input.getField(0);
      if (inputFieldSchema.type != DataType.BAG) {
        throw new RuntimeException("Expected a BAG as input");
      }
      return new Schema(
          new Schema.FieldSchema(
              getSchemaName(this.getClass().getName().toLowerCase(), input),
              inputFieldSchema.schema,
              DataType.BAG));
    } catch (FrontendException e) {
      throw new RuntimeException(e);
    }
  }

  public static class Initial extends EvalFunc<Tuple> {
    private double _samplingProbability;
    private RandomDataImpl _rdg = new RandomDataImpl();

    public Initial() {}

    public Initial(String samplingProbability) {
      _samplingProbability = Double.parseDouble(samplingProbability);
    }

    @Override
    public Tuple exec(Tuple input) throws IOException {
      Tuple output = tupleFactory.newTuple();
      DataBag selected = bagFactory.newDefaultBag();
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());
      DataBag items = (DataBag) input.get(0);
      if (items != null) {
        long n = items.size();
        double q1 = getQ1(n, _samplingProbability);
        double q2 = getQ2(n, _samplingProbability);
        for (Tuple item : items) {
          double key = _rdg.nextUniform(0.0d, 1.0d);
          if (key < q1) {
            // Below q1: accept immediately.
            selected.add(item);
          } else if (key < q2) {
            // In [q1, q2): wait-list together with the score.
            waiting.add(new ScoredTuple(key, item).getIntermediateTuple(tupleFactory));
          }
          // At or above q2: reject immediately.
        }
        output.append(n);
        output.append(selected);
        output.append(waiting);
      }
      return output;
    }
  }

  public static class Intermediate extends EvalFunc<Tuple> {
    private double _samplingProbability;

    public Intermediate() {}

    public Intermediate(String samplingProbability) {
      _samplingProbability = Double.parseDouble(samplingProbability);
    }

    @Override
    public Tuple exec(Tuple input) throws IOException {
      DataBag bag = (DataBag) input.get(0);
      DataBag selected = bagFactory.newDefaultBag();
      DataBag aggWaiting = bagFactory.newSortedBag(new ScoredTupleComparator());
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());
      Tuple output = tupleFactory.newTuple();
      long n = 0L;
      for (Tuple innerTuple : bag) {
        n += (Long) innerTuple.get(0);
        selected.addAll((DataBag) innerTuple.get(1));
        double q1 = getQ1(n, _samplingProbability);
        double q2 = getQ2(n, _samplingProbability);
        for (Tuple t : (DataBag) innerTuple.get(2)) {
          ScoredTuple scored = ScoredTuple.fromIntermediateTuple(t);
          if (scored.getScore() < q1) {
            selected.add(scored.getTuple());
          } else if (scored.getScore() < q2) {
            aggWaiting.add(t);
          } else {
            break;
          }
        }
      }
      double q1 = getQ1(n, _samplingProbability);
      double q2 = getQ2(n, _samplingProbability);
      for (Tuple t : aggWaiting) {
        ScoredTuple scored = ScoredTuple.fromIntermediateTuple(t);
        if (scored.getScore() < q1) {
          selected.add(scored.getTuple());
        } else if (scored.getScore() < q2) {
          waiting.add(t);
        } else {
          break;
        }
      }
      output.append(n);
      output.append(selected);
      output.append(waiting);
      System.err.println(
          "Read " + n + " items, selected " + selected.size() + ", and wait-listed "
              + aggWaiting.size() + ".");
      return output;
    }
  }

  public static class Final extends EvalFunc<DataBag> {
    private double _samplingProbability;

    public Final() {}

    public Final(String samplingProbability) {
      _samplingProbability = Double.parseDouble(samplingProbability);
    }

    @Override
    public DataBag exec(Tuple input) throws IOException {
      DataBag bag = (DataBag) input.get(0);
      long n = 0L;
      DataBag selected = bagFactory.newDefaultBag();
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());
      for (Tuple innerTuple : bag) {
        n += (Long) innerTuple.get(0);
        selected.addAll((DataBag) innerTuple.get(1));
        waiting.addAll((DataBag) innerTuple.get(2));
      }
      long sampleSize = (long) Math.ceil(_samplingProbability * n);
      long nNeeded = sampleSize - selected.size();
      // Promote the lowest-scored wait-listed tuples until the exact sample size is reached.
      for (Tuple scored : waiting) {
        if (nNeeded <= 0) {
          break;
        }
        selected.add(ScoredTuple.fromIntermediateTuple(scored).getTuple());
        nNeeded--;
      }
      return selected;
    }
  }

  private static class ScoredTupleComparator implements Comparator<Tuple> {
    @Override
    public int compare(Tuple o1, Tuple o2) {
      try {
        ScoredTuple t1 = ScoredTuple.fromIntermediateTuple(o1);
        ScoredTuple t2 = ScoredTuple.fromIntermediateTuple(o2);
        return t1.getScore().compareTo(t2.getScore());
      } catch (Throwable e) {
        throw new RuntimeException("Cannot compare " + o1 + " and " + o2 + ".", e);
      }
    }
  }

  // Acceptance threshold: scores below q1 are selected immediately.
  private static double getQ1(long n, double p) {
    double t1 = 20.0 / (3.0 * n);
    double q1 = p + t1 - Math.sqrt(t1 * t1 + 3.0 * t1 * p);
    return q1;
  }

  // Rejection threshold: scores at or above q2 are rejected immediately;
  // scores in [q1, q2) are wait-listed.
  private static double getQ2(long n, double p) {
    double t2 = 10.0 / n;
    double q2 = p + t2 + Math.sqrt(t2 * t2 + 2.0 * t2 * p);
    return q2;
  }
}
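// A hedged numeric sketch (hypothetical class name) that evaluates the same q1/q2 formulas as
// the private getQ1/getQ2 above. It shows that the two thresholds bracket p and that the
// wait-list band q2 - q1 shrinks as the population n grows, which is why only a small fraction
// of records survives to the sort phase.
public class ThresholdDemo {
  public static void main(String[] args) {
    double p = 0.01;
    for (long n : new long[] {1000L, 100000L, 10000000L}) {
      double t1 = 20.0 / (3.0 * n);
      double q1 = p + t1 - Math.sqrt(t1 * t1 + 3.0 * t1 * p); // accept scores below q1
      double t2 = 10.0 / n;
      double q2 = p + t2 + Math.sqrt(t2 * t2 + 2.0 * t2 * p); // reject scores at or above q2
      // For n = 1000 this prints roughly q1 = 0.001 and q2 = 0.037; both converge to p = 0.01.
      System.out.printf("n=%d  q1=%.6f  q2=%.6f  wait-list band=%.6f%n", n, q1, q2, q2 - q1);
    }
  }
}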