public void testSkewedJoinWithGroup() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);"); pigServer.registerQuery("C = GROUP A by id;"); pigServer.registerQuery("D = GROUP B by id;"); DataBag dbfrj = BagFactory.getInstance().newDefaultBag(), dbshj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("E = join C by group, D by group using \"skewed\" parallel 5;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join C by group, D by group;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbshj.add(iter.next()); } } Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj)); }
public void testSkewedJoinWithNoProperties() throws IOException { pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); DataBag dbshj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery( "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;"); Iterator<Tuple> iter = pigServer.openIterator("C"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join A by(id, name), B by (id, name);"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbshj.add(iter.next()); } } Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj)); } catch (Exception e) { fail(e.getMessage()); } }
public void testSkewedJoinManyReducers() throws IOException { pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "2"); pigServer.registerQuery("A = LOAD '" + INPUT_FILE6 + "' as (id,name);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE7 + "' as (id,name);"); DataBag dbfrj = BagFactory.getInstance().newDefaultBag(), dbrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 300;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join A by id, B by id;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbrj.add(iter.next()); } } Assert.assertEquals(dbfrj.size(), dbrj.size()); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj)); }
// pig 1048 public void testSkewedJoinOneValue() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE3 + "' as (id,name);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE3 + "' as (id,name);"); // Filter key with a single value pigServer.registerQuery("C = FILTER A by id == 400;"); pigServer.registerQuery("D = FILTER B by id == 400;"); DataBag dbfrj = BagFactory.getInstance().newDefaultBag(), dbrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("E = join C by id, D by id using \"skewed\";"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join C by id, D by id;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbrj.add(iter.next()); } } Assert.assertEquals(dbfrj.size(), dbrj.size()); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj)); }
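These skewed-join tests all share one verification pattern: run the same join once as a skewed join and once as a default join, drain both iterators into bags, and require the bags to match. A condensed sketch of that pattern, assuming the suite's pigServer and TestHelper, with hypothetical aliases J1/J2:

DataBag skewed = BagFactory.getInstance().newDefaultBag();
DataBag regular = BagFactory.getInstance().newDefaultBag();

pigServer.registerQuery("J1 = join A by id, B by id using \"skewed\";");
for (Iterator<Tuple> it = pigServer.openIterator("J1"); it.hasNext(); ) {
    skewed.add(it.next());
}

pigServer.registerQuery("J2 = join A by id, B by id;");
for (Iterator<Tuple> it = pigServer.openIterator("J2"); it.hasNext(); ) {
    regular.add(it.next());
}

// The skewed plan must be a pure optimization: same multiset of output tuples.
Assert.assertTrue(skewed.size() > 0 && regular.size() > 0);
Assert.assertTrue(TestHelper.compareBags(skewed, regular));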
@Override public Tuple exec(Tuple input) throws IOException { DataBag output = BagFactory.getInstance().newDefaultBag(); DataBag samples = (DataBag) input.get(0); if (samples == null) { // do nothing } else if (samples.size() <= numSamples) { // no need to construct a reservoir, so just emit intermediate tuples for (Tuple sample : samples) { // add the score on to the intermediate tuple output.add(new ScoredTuple(Math.random(), sample).getIntermediateTuple(tupleFactory)); } } else { for (Tuple sample : samples) { getReservoir().consider(new ScoredTuple(Math.random(), sample)); } for (ScoredTuple scoredTuple : getReservoir()) { // add the score on to the intermediate tuple output.add(scoredTuple.getIntermediateTuple(tupleFactory)); } } return tupleFactory.newTuple(output); }
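The sampler above attaches a uniform random score to each tuple and keeps only the highest-scored ones in a bounded reservoir. A minimal standalone sketch of that idea (hypothetical, not the datafu Reservoir/ScoredTuple classes):

import java.util.AbstractMap.SimpleEntry;
import java.util.PriorityQueue;
import java.util.Random;

// Score-based reservoir sampling: give every item a uniform random score and
// keep the k highest-scored items; each input item is equally likely to
// survive, so the survivors form a uniform random sample of size k.
public class ScoredReservoirDemo {
    public static void main(String[] args) {
        int k = 3;
        Random random = new Random();
        PriorityQueue<SimpleEntry<Double, String>> heap =
            new PriorityQueue<>((a, b) -> Double.compare(a.getKey(), b.getKey()));
        for (String item : new String[] {"a", "b", "c", "d", "e"}) {
            heap.add(new SimpleEntry<>(random.nextDouble(), item));
            if (heap.size() > k) {
                heap.poll(); // evict the lowest-scored item
            }
        }
        System.out.println(heap); // k sampled items, printed as score=item
    }
}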
public class UnigramExtractor extends EvalFunc<DataBag> { private static BagFactory bagFactory = BagFactory.getInstance(); private static TupleFactory tupleFactory = TupleFactory.getInstance(); private static final Pattern spacePattern = Pattern.compile("\\s+"); private static final Pattern punctPattern = Pattern.compile("\\p{Punct}(?:(?<!\\d)(?!\\d))"); public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() == 0) { return null; } String normStr = ((String) input.get(0)); if (normStr == null) { return null; } // Remove punctuation except when it's a version number normStr = punctPattern.matcher(normStr.trim().toLowerCase()).replaceAll(" "); normStr = spacePattern.matcher(normStr).replaceAll(" "); DataBag output = bagFactory.newDefaultBag(); for (String s : spacePattern.split(normStr.trim())) { if (s.length() <= 30) { Tuple t = tupleFactory.newTuple(1); t.set(0, s); output.add(t); } } return output; } }
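To make the normalization concrete, here is a hypothetical direct invocation (assuming the class above is on the classpath; expected output shown as a comment). The regex drops a punctuation character unless the character after it is a digit, which is what lets version strings like 0.9.2 survive; the text is also lower-cased, and tokens longer than 30 characters are skipped.

import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class UnigramExtractorDemo {
    public static void main(String[] args) throws Exception {
        Tuple in = TupleFactory.getInstance().newTuple(1);
        in.set(0, "Pig 0.9.2 rocks, really!");
        DataBag tokens = new UnigramExtractor().exec(in);
        System.out.println(tokens); // {(pig),(0.9.2),(rocks),(really)}
    }
}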
@SuppressWarnings("rawtypes") @Override public void prepareToRead(RecordReader reader, PigSplit split) throws IOException { this.reader = (WikipediaPageInputFormat.WikipediaRecordReader) reader; tupleFactory = TupleFactory.getInstance(); bagFactory = BagFactory.getInstance(); }
/** * StanfordTokenize uses the Stanford NLP PTBTokenizer to tokenize a raw text input. Output is a pig bag containing the tokens. * <dl> * <dt><b>Example:</b> * <dd><code> * register varaha.jar;<br/> * documents = LOAD 'documents' AS (doc_id:chararray, text:chararray);<br/> * tokenized = FOREACH documents GENERATE doc_id AS doc_id, FLATTEN(StanfordTokenize(text)) AS (token:chararray); * </code> * </dl> * * @author Russell Jurney */ public class StanfordTokenize extends EvalFunc<DataBag> { private static TupleFactory tupleFactory = TupleFactory.getInstance(); private static BagFactory bagFactory = BagFactory.getInstance(); public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; // Output bag DataBag bagOfTokens = bagFactory.newDefaultBag(); StringReader textInput = new StringReader(input.get(0).toString()); PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext(); ) { label = (CoreLabel) ptbt.next(); // Keep only tokens longer than two characters if (label.value().length() > 2) { System.err.println(label.toString()); Tuple termText = tupleFactory.newTuple(label.word()); bagOfTokens.add(termText); } } return bagOfTokens; } }
/** * WeekDayDetection.java - parses a string-formatted date (the field positions follow yyyy-MM-dd HH:mm:ss, e.g. "2014-01-05 12:34:56"; the separator characters are ignored) and returns a bag holding a single tuple of the form "dayOfWeek:dayOfMonth:hour". Note that dayOfWeek uses Java's Calendar.DAY_OF_WEEK numbering (1=Sunday, ..., 7=Saturday). * * @author Roberto Maestre * @version 1.0 */ public class WeekDayDetection extends EvalFunc<DataBag> { TupleFactory mTupleFactory = TupleFactory.getInstance(); BagFactory mBagFactory = BagFactory.getInstance(); public DataBag exec(Tuple input) throws IOException { try { if (!input.isNull()) { // Create the output databag {(res1,res2),(res3,res4)..} DataBag output_databag = mBagFactory.newDefaultBag(); // Unpack the tuple to get the date string String input_time = (String) input.get(0); try { DateFormat formatter = new SimpleDateFormat("MM/dd/yyyy kk:mm:ss"); Date date = formatter.parse( String.format( "%s/%s/%s %s:%s:%s", input_time.substring(5, 7), input_time.substring(8, 10), input_time.substring(0, 4), input_time.substring(11, 13), input_time.substring(14, 16), input_time.substring(17, 19))); Calendar calendar = Calendar.getInstance(); calendar.setTime(date); int dayOfWeek = calendar.get(Calendar.DAY_OF_WEEK); int dayOfMonth = calendar.get(Calendar.DAY_OF_MONTH); int hour = calendar.get(Calendar.HOUR_OF_DAY); // Add items to output Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, String.format("%d:%d:%d", dayOfWeek, dayOfMonth, hour)); output_databag.add(items); } catch (Exception e) { Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #1" + e.getMessage()); output_databag.add(items); return output_databag; } return output_databag; } else { DataBag output_databag = mBagFactory.newDefaultBag(); Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #2"); output_databag.add(items); return output_databag; } } catch (Exception e) { System.err.println("Error while detecting the week day"); DataBag output_databag = mBagFactory.newDefaultBag(); Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #3" + e.getMessage()); output_databag.add(items); return output_databag; } } }
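Because the day-of-week numbering is easy to get wrong, a quick standalone check (hypothetical date) of what Calendar actually reports:

import java.text.SimpleDateFormat;
import java.util.Calendar;

public class DayOfWeekDemo {
    public static void main(String[] args) throws Exception {
        Calendar c = Calendar.getInstance();
        // January 5, 2014 was a Sunday.
        c.setTime(new SimpleDateFormat("MM/dd/yyyy").parse("01/05/2014"));
        System.out.println(c.get(Calendar.DAY_OF_WEEK)); // 1 == Calendar.SUNDAY
    }
}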
@Override public DataBag getValue() { DataBag output = BagFactory.getInstance().newDefaultBag(); for (ScoredTuple sample : getReservoir()) { output.add(sample.getTuple()); } return output; }
/** * Creates a bag containing the given number of tuples. * * @param size number of tuples to put in the bag * @return bag with {@code size} copies of the same inner tuple */ private DataBag createBag(int size) { Tuple innerTuple = TupleFactory.getInstance().newTuple(); innerTuple.append(Integer.valueOf(1)); DataBag bag = BagFactory.getInstance().newDefaultBag(); // Note: the same tuple instance is added size times for (int i = 0; i < size; i++) { bag.add(innerTuple); } return bag; }
@Override public void setInput(Map<String, Block> input, JsonNode operatorJson, BlockProperties props) throws IOException, InterruptedException { inputBlock = input.values().iterator().next(); init(operatorJson, inputBlock.getProperties().getSchema()); nullBag = BagFactory.getInstance().newDefaultBag(); nullBag.add(TupleFactory.getInstance().newTuple(0)); }
@Override public DataBag exec(Tuple input) throws IOException { retrieveContextValues(); ArrayList<String> joinKeyNames = new ArrayList<String>(); for (int i = 1; i < input.size(); i += 2) { joinKeyNames.add((String) input.get(i)); } JoinCollector collector = new JoinCollector(); // the first bag is the outer bag String leftBagName = bagNames.get(0); DataBag leftBag = getBag(input, leftBagName); String leftBagJoinKeyName = getPrefixedAliasName(bagNameToJoinKeyPrefix.get(leftBagName), joinKeyNames.get(0)); collector.setJoinData(collector.groupTuples(leftBag, leftBagJoinKeyName)); // now, for each additional bag, group up the tuples by the join key, then join them in if (bagNames.size() > 1) { for (int i = 1; i < bagNames.size(); i++) { String bagName = bagNames.get(i); DataBag bag = getBag(input, bagName); String joinKeyName = getPrefixedAliasName(bagNameToJoinKeyPrefix.get(bagName), joinKeyNames.get(i)); int tupleSize = bagNameToSize.get(bagName); if (bag == null) throw new IOException( "Error in instance: " + getInstanceName() + " with properties: " + getInstanceProperties() + " and tuple: " + input.toDelimitedString(", ") + " -- Expected bag, got null"); HashMap<Object, List<Tuple>> groupedData = collector.groupTuples(bag, joinKeyName); // outer join, so go back in and add nulls; groupedData = collector.insertNullTuples(groupedData, tupleSize); for (Map.Entry<Object, List<Tuple>> entry : groupedData.entrySet()) { collector.joinTuples(entry.getKey(), entry.getValue()); } } } // assemble output bag DataBag outputBag = BagFactory.getInstance().newDefaultBag(); for (List<Tuple> tuples : collector.getJoinData().values()) { for (Tuple tuple : tuples) { outputBag.add(tuple); } } return outputBag; }
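The collector's grouping step amounts to hash-partitioning each bag by its join key so that equal keys can later be joined pairwise. A minimal standalone sketch of that idea (hypothetical plain-Java types, not the collector's actual API):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GroupByKeyDemo {
    // Bucket rows by the value in their key column.
    static Map<Object, List<Object[]>> groupRows(List<Object[]> rows, int keyIndex) {
        Map<Object, List<Object[]>> groups = new HashMap<>();
        for (Object[] row : rows) {
            groups.computeIfAbsent(row[keyIndex], k -> new ArrayList<>()).add(row);
        }
        return groups;
    }

    public static void main(String[] args) {
        List<Object[]> rows = List.of(
            new Object[] {1, "a"}, new Object[] {2, "b"}, new Object[] {1, "c"});
        System.out.println(groupRows(rows, 0).keySet()); // [1, 2] (order may vary)
    }
}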
public void testSkewedJoinReducers() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("C = join A by id, B by id using \"skewed\" parallel 1;"); Iterator<Tuple> iter = pigServer.openIterator("C"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } } catch (Exception e) { fail("Should not throw exception, should continue execution"); } }
public void testSkewedJoinNullKeys() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("C = join A by id, B by id using \"skewed\";"); Iterator<Tuple> iter = pigServer.openIterator("C"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } } catch (Exception e) { System.out.println(e.getMessage()); e.printStackTrace(); fail("Should support null keys in skewed join"); } return; }
public void testSkewedJoin3Way() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);"); pigServer.registerQuery("C = LOAD '" + INPUT_FILE3 + "' as (id, name);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("D = join A by id, B by id, C by id using \"skewed\" parallel 5;"); Iterator<Tuple> iter = pigServer.openIterator("D"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } } catch (Exception e) { return; } fail("Should have thrown an exception; 3-way skewed join is not supported"); }
@Override public DataBag exec(Tuple input) throws IOException { DataBag bagOfSamples = (DataBag) input.get(0); for (Tuple innerTuple : bagOfSamples) { DataBag samples = (DataBag) innerTuple.get(0); for (Tuple sample : samples) { // use the same score as previously generated getReservoir().consider(ScoredTuple.fromIntermediateTuple(sample)); } } DataBag output = BagFactory.getInstance().newDefaultBag(); for (ScoredTuple scoredTuple : getReservoir()) { // output the original tuple output.add(scoredTuple.getTuple()); } return output; }
@SuppressWarnings("unchecked") private void accumulateData() throws ExecException { int count = 0; int length = inputs.size() - 1; inputBags = new DataBag[length]; its = new Iterator[length]; for (int i = 0; i < length; ++i) { PhysicalOperator op = inputs.get(i); DataBag bag = BagFactory.getInstance().newDefaultBag(); inputBags[count] = bag; for (Result res = op.getNextTuple(); res.returnStatus != POStatus.STATUS_EOP; res = op.getNextTuple()) { if (res.returnStatus == POStatus.STATUS_NULL) continue; if (res.returnStatus == POStatus.STATUS_ERR) throw new ExecException("Error accumulating data in the local Cross operator"); if (res.returnStatus == POStatus.STATUS_OK) bag.add((Tuple) res.result); } its[count++] = bag.iterator(); } }
public void testSkewedJoinMapKey() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE4 + "' as (m:[]);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE4 + "' as (n:[]);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery( "C = join A by (chararray)m#'a100', B by (chararray)n#'a100' using \"skewed\" parallel 20;"); Iterator<Tuple> iter = pigServer.openIterator("C"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } } catch (Exception e) { System.out.println(e.getMessage()); e.printStackTrace(); fail("Should support maps and expression operators as keys"); } return; }
@Test public void testTupleWriteRead1() throws IOException { // create a tuple with columns of different type Tuple tuplein = TupleFactory.getInstance().newTuple(7); tuplein.set(0, 12); Map<String, String> map = new HashMap<String, String>(); map.put("pig", "scalability"); tuplein.set(1, map); tuplein.set(2, null); tuplein.set(3, 12L); tuplein.set(4, 1.2F); Tuple innerTuple = TupleFactory.getInstance().newTuple(1); innerTuple.set(0, "innerTuple"); tuplein.set(5, innerTuple); DataBag bag = BagFactory.getInstance().newDefaultBag(); bag.add(innerTuple); tuplein.set(6, bag); testTupleSedes(tuplein); assertEquals( "(12,[pig#scalability],,12,1.2,(innerTuple),{(innerTuple)})", TupleFormat.format(tuplein)); }
public DataBag exec(Tuple input) throws IOException { try { DataBag bag = BagFactory.getInstance().newDefaultBag(); for (int i = 0; i < input.size(); i++) { final Object object = input.get(i); if (object instanceof Tuple) { for (int j = 0; j < ((Tuple) object).size(); j++) { Tuple tp2 = TupleFactory.getInstance().newTuple(1); tp2.set(0, ((Tuple) object).get(j)); bag.add(tp2); } } else { Tuple tp2 = TupleFactory.getInstance().newTuple(1); tp2.set(0, object); bag.add(tp2); } } return bag; } catch (Exception ee) { throw new RuntimeException("Error while creating a bag", ee); } }
public class CalcClassWeight extends EvalFunc<DataBag> { TupleFactory tupleFactory = TupleFactory.getInstance(); BagFactory bagFactory = BagFactory.getInstance(); @Override public DataBag exec(Tuple input) throws IOException { DataBag bag = (DataBag) input.get(0); HashMap<String, Double> clsCnt = new HashMap<String, Double>(); Iterator<Tuple> it = bag.iterator(); Double sum = new Double(0.0); while (it.hasNext()) { Tuple item = (Tuple) it.next(); String cls = (String) item.get(3); if (cls != null && cls.length() > 0) { Double cur = clsCnt.get(cls); Double inc = (Double) item.get(2); if (cur != null) { clsCnt.put(cls, cur + inc); } else { clsCnt.put(cls, inc); } sum += inc; } } Set<Entry<String, Double>> clses = clsCnt.entrySet(); Iterator<Entry<String, Double>> cit = clses.iterator(); DataBag result = bagFactory.newDefaultBag(); while (cit.hasNext()) { Entry<String, Double> cls = cit.next(); Tuple tpl = tupleFactory.newTuple(2); tpl.set(0, cls.getKey()); tpl.set(1, cls.getValue() / sum); result.add(tpl); } return result; } @Override public Schema outputSchema(Schema input) { try { if (input.getFields().size() != 1 || input.getField(0).type != DataType.BAG) { throw new RuntimeException("expect input {bag}"); } Schema bag = input.getField(0).schema.getField(0).schema; if (bag.getFields().size() < 4 || bag.getField(0).type != DataType.CHARARRAY || bag.getField(1).type != DataType.CHARARRAY || bag.getField(2).type != DataType.DOUBLE || bag.getField(3).type != DataType.CHARARRAY) { throw new RuntimeException( "expect input {userid:chararray, " + "md:chararray, weight:double, cls:chararray}"); } Schema result = new Schema(); result.add(new FieldSchema("cls", DataType.CHARARRAY)); result.add(new FieldSchema("weight", DataType.DOUBLE)); return result; } catch (Exception e) { throw new RuntimeException(e); } } }
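The normalization in exec is simply weight-over-sum. With hypothetical accumulated weights {x: 2.0, y: 6.0} the sum is 8.0, so the UDF would emit (x,0.25) and (y,0.75):

import java.util.HashMap;
import java.util.Map;

public class ClassWeightDemo {
    public static void main(String[] args) {
        Map<String, Double> clsCnt = new HashMap<>();
        clsCnt.put("x", 2.0);
        clsCnt.put("y", 6.0);
        double sum = clsCnt.values().stream().mapToDouble(Double::doubleValue).sum();
        clsCnt.forEach((cls, w) -> System.out.println(cls + " -> " + (w / sum)));
        // x -> 0.25, y -> 0.75 (iteration order may vary)
    }
}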
/** * Computes the set difference of two or more bags. Duplicates are eliminated. <b>The input bags * must be sorted.</b> * * <p>If bags A and B are provided, then this computes A-B, i.e. all elements in A that are not in * B. If bags A, B and C are provided, then this computes A-B-C, i.e. all elements in A that are not * in B or C. * * <p>Example: * * <pre>{@code * define SetDifference datafu.pig.sets.SetDifference(); * * -- input: * -- ({(1),(2),(3),(4),(5),(6)},{(3),(4)}) * input = LOAD 'input' AS (B1:bag{T:tuple(val:int)},B2:bag{T:tuple(val:int)}); * * input = FOREACH input { * B1 = ORDER B1 BY val ASC; * B2 = ORDER B2 BY val ASC; * * -- output: * -- ({(1),(2),(5),(6)}) * GENERATE SetDifference(B1,B2); * } * }</pre> */ public class SetDifference extends SetOperationsBase { private static final BagFactory bagFactory = BagFactory.getInstance(); /** * Loads the data bags from the input tuple and puts them in a priority queue, where ordering is * determined by the data from the iterator for each bag. * * <p>The bags are wrapped in a {@link Pair} object that is comparable on the data currently * available from the iterator. These objects are ordered first by the data, then by the index * within the tuple the bag came from. * * @param input * @return priority queue ordered * @throws IOException */ private PriorityQueue<Pair> loadBags(Tuple input) throws IOException { PriorityQueue<Pair> pq = new PriorityQueue<Pair>(input.size()); for (int i = 0; i < input.size(); i++) { if (input.get(i) != null) { Iterator<Tuple> inputIterator = ((DataBag) input.get(i)).iterator(); if (inputIterator.hasNext()) { pq.add(new Pair(inputIterator, i)); } } } return pq; } /** * Counts how many elements in the priority queue match the element at the front of the queue, * which should be from the first bag. * * @param pq priority queue * @return number of matches */ public int countMatches(PriorityQueue<Pair> pq) { Pair nextPair = pq.peek(); Tuple data = nextPair.data; // sanity check if (!nextPair.index.equals(0)) { throw new RuntimeException("Expected next bag to have index 0"); } int matches = 0; for (Pair p : pq) { if (data.equals(p.data)) matches++; } // subtract 1 since element matches itself return matches - 1; } @SuppressWarnings("unchecked") @Override public DataBag exec(Tuple input) throws IOException { if (input.size() < 2) { throw new RuntimeException("Expected at least two inputs, but found " + input.size()); } for (Object o : input) { if (o != null && !(o instanceof DataBag)) { throw new RuntimeException("Inputs must be bags"); } } DataBag outputBag = bagFactory.newDefaultBag(); DataBag bag1 = (DataBag) input.get(0); DataBag bag2 = (DataBag) input.get(1); if (bag1 == null || bag1.size() == 0) { return outputBag; } // optimization else if (input.size() == 2 && (bag2 == null || bag2.size() == 0)) { return bag1; } PriorityQueue<Pair> pq = loadBags(input); Tuple lastData = null; while (true) { Pair nextPair = pq.peek(); // ignore data we've already encountered if (nextPair.data.compareTo(lastData) != 0) { // Only take data from the first bag, where there are no other // bags that have the same data. 
if (nextPair.index.equals(0) && countMatches(pq) == 0) { outputBag.add(nextPair.data); lastData = nextPair.data; } } Pair p = pq.poll(); // only put the bag back into the queue if it still has data if (p.hasNext()) { p.next(); pq.offer(p); } else if (p.index.equals(0)) { // stop when we exhaust all elements from the first bag break; } } return outputBag; } /** * A wrapper for the tuple iterator that implements comparable so it can be used in the priority * queue. * * <p>This is compared first on the data, then on the index the bag came from in the input tuple. */ private static class Pair implements Comparable<Pair> { private final Iterator<Tuple> it; private final Integer index; private Tuple data; /** * Constructs the {@link Pair}. * * @param it tuple iterator * @param index index within the tuple that the bag came from */ public Pair(Iterator<Tuple> it, int index) { this.index = index; this.it = it; this.data = it.next(); } @SuppressWarnings("unchecked") @Override public int compareTo(Pair o) { int r = this.data.compareTo(o.data); if (r == 0) { return index.compareTo(o.index); } else { return r; } } public boolean hasNext() { return it.hasNext(); } @SuppressWarnings("unchecked") public Tuple next() { Tuple nextData = it.next(); // algorithm assumes data is in order if (data.compareTo(nextData) > 0) { throw new RuntimeException("Out of order!"); } this.data = nextData; return this.data; } @Override public String toString() { return String.format("[%s within %d]", data, index); } } }
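A hypothetical direct invocation of the UDF outside Pig, mirroring the javadoc example (the bagOf helper is illustrative only, and the input bags must be sorted):

import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class SetDifferenceDemo {
    private static DataBag bagOf(int... vals) throws Exception {
        DataBag bag = BagFactory.getInstance().newDefaultBag();
        for (int v : vals) {
            Tuple t = TupleFactory.getInstance().newTuple(1);
            t.set(0, v);
            bag.add(t);
        }
        return bag;
    }

    public static void main(String[] args) throws Exception {
        Tuple input = TupleFactory.getInstance().newTuple(2);
        input.set(0, bagOf(1, 2, 3, 4, 5, 6));
        input.set(1, bagOf(3, 4));
        DataBag result = new SetDifference().exec(input);
        System.out.println(result); // {(1),(2),(5),(6)}
    }
}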
/** * The package operator that packages the globally rearranged tuples into the output format required * by co-group. This is the last stage of processing a co-group. This operator has a slightly different * format than other operators in that it takes two things as input: the key being worked on and * the iterator of bags that contain indexed tuples that just need to be packaged into their * appropriate output bags based on the index. */ public class POPackage extends PhysicalOperator { private static final long serialVersionUID = 1L; private static boolean[] SIMPLE_KEY_POSITION; static { SIMPLE_KEY_POSITION = new boolean[1]; SIMPLE_KEY_POSITION[0] = true; } // The iterator of indexed Tuples // that is typically provided by // Hadoop transient Iterator<NullableTuple> tupIter; // The key being worked on Object key; // marker to indicate if key is a tuple protected boolean isKeyTuple = false; // key as a Tuple object (if the key is a tuple) protected Tuple keyAsTuple; // key's type byte keyType; // The number of inputs to this // co-group. 0 indicates a distinct, which means there will only be a // key, no value. int numInputs; // Whether the attached map-reduce plan uses a secondary sort key boolean useSecondaryKey = false; // Denotes if inner is specified // on a particular input boolean[] inner; // flag to denote whether there is a distinct // leading to this package protected boolean distinct = false; // A mapping of input index to the key information obtained from POLocalRearrange // for that index. The key information is a pair of (boolean, Map). // The boolean indicates whether there is a lone project(*) in the // cogroup by. If not, the Map has a mapping of column numbers in the // "value" to column numbers in the "key" which contain the fields in // the "value" protected Map<Integer, Pair<Boolean, Map<Integer, Integer>>> keyInfo; private final transient Log log = LogFactory.getLog(getClass()); protected static final BagFactory mBagFactory = BagFactory.getInstance(); protected static final TupleFactory mTupleFactory = TupleFactory.getInstance(); private boolean firstTime = true; private boolean useDefaultBag = false; public POPackage(OperatorKey k) { this(k, -1, null); } public POPackage(OperatorKey k, int rp) { this(k, rp, null); } public POPackage(OperatorKey k, List<PhysicalOperator> inp) { this(k, -1, inp); } public POPackage(OperatorKey k, int rp, List<PhysicalOperator> inp) { super(k, rp, inp); numInputs = -1; keyInfo = new HashMap<Integer, Pair<Boolean, Map<Integer, Integer>>>(); } @Override public String name() { return "Package" + "[" + DataType.findTypeName(resultType) + "]" + "{" + DataType.findTypeName(keyType) + "}" + " - " + mKey.toString(); } @Override public boolean supportsMultipleInputs() { return false; } @Override public void visit(PhyPlanVisitor v) throws VisitorException { v.visitPackage(this); } @Override public boolean supportsMultipleOutputs() { return false; } /** * Attaches the required inputs * * @param k - the key being worked on * @param inp - iterator of indexed tuples typically obtained from Hadoop */ public void attachInput(PigNullableWritable k, Iterator<NullableTuple> inp) { tupIter = inp; key = k.getValueAsPigType(); if (useSecondaryKey) { try { key = ((Tuple) key).get(0); } catch (ExecException e) { // TODO Exception throw new RuntimeException(e); } } if (isKeyTuple) { // key is a tuple, cache the key as a // tuple for use in the getNext() keyAsTuple = (Tuple) key; } } /** attachInput's better half! 
*/ public void detachInput() { tupIter = null; key = null; } public int getNumInps() { return numInputs; } public void setNumInps(int numInps) { this.numInputs = numInps; } public boolean[] getInner() { return inner; } public void setInner(boolean[] inner) { this.inner = inner; } /** * From the inputs, constructs the output tuple for this co-group in the required format which is * (key, {bag of tuples from input 1}, {bag of tuples from input 2}, ...) */ @Override public Result getNext(Tuple t) throws ExecException { Tuple res; if (firstTime) { firstTime = false; if (PigMapReduce.sJobConf != null) { String bagType = PigMapReduce.sJobConf.get("pig.cachedbag.type"); if (bagType != null && bagType.equalsIgnoreCase("default")) { useDefaultBag = true; } } } if (distinct) { // only set the key which has the whole // tuple res = mTupleFactory.newTuple(1); res.set(0, key); } else { // Create numInputs bags DataBag[] dbs = null; dbs = new DataBag[numInputs]; if (isAccumulative()) { // create bag wrapper to pull tuples in many batches // all bags have reference to the sample tuples buffer // which contains tuples from one batch POPackageTupleBuffer buffer = new POPackageTupleBuffer(); for (int i = 0; i < numInputs; i++) { dbs[i] = new AccumulativeBag(buffer, i); } } else { // create bag to pull all tuples out of iterator for (int i = 0; i < numInputs; i++) { dbs[i] = useDefaultBag ? BagFactory.getInstance().newDefaultBag() // In a very rare case if there is a POStream after this // POPackage in the pipeline and is also blocking the pipeline; // constructor argument should be 2 * numInputs. But for one obscure // case we don't want to pay the penalty all the time. : new InternalCachedBag(numInputs); } // For each indexed tup in the inp, sort them // into their corresponding bags based // on the index while (tupIter.hasNext()) { NullableTuple ntup = tupIter.next(); int index = ntup.getIndex(); Tuple copy = getValueTuple(ntup, index); if (numInputs == 1) { // this is for multi-query merge where // the numInputs is always 1, but the index // (the position of the inner plan in the // enclosed operator) may not be 1. dbs[0].add(copy); } else { dbs[index].add(copy); } if (reporter != null) reporter.progress(); } } // Construct the output tuple by appending // the key and all the above constructed bags // and return it. res = mTupleFactory.newTuple(numInputs + 1); res.set(0, key); int i = -1; for (DataBag bag : dbs) { i++; if (inner[i] && !isAccumulative()) { if (bag.size() == 0) { detachInput(); Result r = new Result(); r.returnStatus = POStatus.STATUS_NULL; return r; } } res.set(i + 1, bag); } } detachInput(); Result r = new Result(); r.result = res; r.returnStatus = POStatus.STATUS_OK; return r; } protected Tuple getValueTuple(NullableTuple ntup, int index) throws ExecException { // Need to make a copy of the value, as hadoop uses the same ntup // to represent each value. Tuple val = (Tuple) ntup.getValueAsPigType(); Tuple copy = null; // The "value (val)" that we just got may not // be the complete "value". It may have some portions // in the "key" (look in POLocalRearrange for more comments) // If this is the case we need to stitch // the "value" together. Pair<Boolean, Map<Integer, Integer>> lrKeyInfo = keyInfo.get(index); boolean isProjectStar = lrKeyInfo.first; Map<Integer, Integer> keyLookup = lrKeyInfo.second; int keyLookupSize = keyLookup.size(); if (keyLookupSize > 0) { // we have some fields of the "value" in the // "key". 
copy = mTupleFactory.newTuple(); int finalValueSize = keyLookupSize + val.size(); int valIndex = 0; // an index for accessing elements from // the value (val) that we have currently for (int i = 0; i < finalValueSize; i++) { Integer keyIndex = keyLookup.get(i); if (keyIndex == null) { // the field for this index is not in the // key - so just take it from the "value" // we were handed copy.append(val.get(valIndex)); valIndex++; } else { // the field for this index is in the key if (isKeyTuple) { // the key is a tuple, extract the // field out of the tuple copy.append(keyAsTuple.get(keyIndex)); } else { copy.append(key); } } } } else if (isProjectStar) { // the whole "value" is present in the "key" copy = mTupleFactory.newTuple(keyAsTuple.getAll()); } else { // there is no field of the "value" in the // "key" - so just make a copy of what we got // as the "value" copy = mTupleFactory.newTuple(val.getAll()); } return copy; } public byte getKeyType() { return keyType; } public void setKeyType(byte keyType) { this.keyType = keyType; } /** * Get the field positions of key in the output tuples. For POPackage, the position is always 0. * The POCombinerPackage, however, can return different values. * * @return the field position of key in the output tuples. */ public boolean[] getKeyPositionsInTuple() { return SIMPLE_KEY_POSITION.clone(); } /** * Make a deep copy of this operator. * * @throws CloneNotSupportedException */ @Override public POPackage clone() throws CloneNotSupportedException { POPackage clone = (POPackage) super.clone(); clone.mKey = new OperatorKey(mKey.scope, NodeIdGenerator.getGenerator().getNextNodeId(mKey.scope)); clone.requestedParallelism = requestedParallelism; clone.resultType = resultType; clone.keyType = keyType; clone.numInputs = numInputs; if (inner != null) { clone.inner = new boolean[inner.length]; for (int i = 0; i < inner.length; i++) { clone.inner[i] = inner[i]; } } else clone.inner = null; return clone; } /** @param keyInfo the keyInfo to set */ public void setKeyInfo(Map<Integer, Pair<Boolean, Map<Integer, Integer>>> keyInfo) { this.keyInfo = keyInfo; } /** @param keyTuple the keyTuple to set */ public void setKeyTuple(boolean keyTuple) { this.isKeyTuple = keyTuple; } /** @return the keyInfo */ public Map<Integer, Pair<Boolean, Map<Integer, Integer>>> getKeyInfo() { return keyInfo; } /** @return the distinct */ public boolean isDistinct() { return distinct; } /** @param distinct the distinct to set */ public void setDistinct(boolean distinct) { this.distinct = distinct; } public void setUseSecondaryKey(boolean useSecondaryKey) { this.useSecondaryKey = useSecondaryKey; } private class POPackageTupleBuffer implements AccumulativeTupleBuffer { private List<Tuple>[] bags; private Iterator<NullableTuple> iter; private int batchSize; private Object currKey; @SuppressWarnings("unchecked") public POPackageTupleBuffer() { batchSize = 20000; if (PigMapReduce.sJobConf != null) { String size = PigMapReduce.sJobConf.get("pig.accumulative.batchsize"); if (size != null) { batchSize = Integer.parseInt(size); } } this.bags = new List[numInputs]; for (int i = 0; i < numInputs; i++) { this.bags[i] = new ArrayList<Tuple>(); } this.iter = tupIter; this.currKey = key; } @Override public boolean hasNextBatch() { return iter.hasNext(); } @Override public void nextBatch() throws IOException { for (int i = 0; i < bags.length; i++) { bags[i].clear(); } key = currKey; for (int i = 0; i < batchSize; i++) { if (iter.hasNext()) { NullableTuple ntup = iter.next(); int index = ntup.getIndex(); 
Tuple copy = getValueTuple(ntup, index); if (numInputs == 1) { // this is for multi-query merge where // the numInputs is always 1, but the index // (the position of the inner plan in the // enclosed operator) may not be 1. bags[0].add(copy); } else { bags[index].add(copy); } } } } public void clear() { for (int i = 0; i < bags.length; i++) { bags[i].clear(); } iter = null; } public Iterator<Tuple> getTuples(int index) { return bags[index].iterator(); } }; }
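To make the output format concrete: for a two-input co-group on key 7, getNext() assembles a tuple of the key followed by one bag per input. A hypothetical illustration (values invented, built by hand rather than by the operator):

import java.util.Arrays;

import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class PackageShapeDemo {
    public static void main(String[] args) throws Exception {
        TupleFactory tf = TupleFactory.getInstance();
        BagFactory bf = BagFactory.getInstance();

        DataBag fromInput1 = bf.newDefaultBag();
        fromInput1.add(tf.newTuple(Arrays.asList(7, "a")));
        fromInput1.add(tf.newTuple(Arrays.asList(7, "b")));

        DataBag fromInput2 = bf.newDefaultBag();
        fromInput2.add(tf.newTuple(Arrays.asList(7, 1.0)));

        Tuple out = tf.newTuple(3);
        out.set(0, 7);          // the group key
        out.set(1, fromInput1); // bag of tuples that arrived with index 0
        out.set(2, fromInput2); // bag of tuples that arrived with index 1
        System.out.println(out); // (7,{(7,a),(7,b)},{(7,1.0)})
    }
}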
public class TestScalarAliasesLocal { private static final String BUILD_TEST_TMP = "build/test/tmp/"; private PigServer pigServer; TupleFactory mTf = TupleFactory.getInstance(); BagFactory mBf = BagFactory.getInstance(); @Before public void setUp() throws Exception { pigServer = new PigServer(Util.getLocalTestMode()); } public static void deleteDirectory(File file) { if (file.exists()) { Util.deleteDirectory(file); } } public static File createLocalInputFile(String filename, String[] inputData) throws IOException { new File(filename).getParentFile().mkdirs(); return Util.createLocalInputFile(filename, inputData); } // See PIG-1434 @Test public void testScalarAliasesBatchNobatch() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20"}; String output = BUILD_TEST_TMP + "table_testScalarAliasesDir"; TestScalarAliases.deleteDirectory(new File(output)); // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesBatch"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.setBatchOn(); pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;"); pigServer.registerQuery("Y = foreach A generate (a0 * C.count), (a1 / C.max);"); pigServer.registerQuery("Store Y into '" + output + "';"); pigServer.executeBatch(); // Check output pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: int, a1: double);"); Iterator<Tuple> iter; Tuple t; iter = pigServer.openIterator("Z"); t = iter.next(); assertTrue(t.toString().equals("(3,0.25)")); t = iter.next(); assertTrue(t.toString().equals("(6,0.5)")); t = iter.next(); assertTrue(t.toString().equals("(9,1.0)")); assertFalse(iter.hasNext()); iter = pigServer.openIterator("Y"); t = iter.next(); assertTrue(t.toString().equals("(3,0.25)")); t = iter.next(); assertTrue(t.toString().equals("(6,0.5)")); t = iter.next(); assertTrue(t.toString().equals("(9,1.0)")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testUseScalarMultipleTimes() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20"}; String outputY = BUILD_TEST_TMP + "table_testUseScalarMultipleTimesOutY"; TestScalarAliases.deleteDirectory(new File(outputY)); String outputZ = BUILD_TEST_TMP + "table_testUseScalarMultipleTimesOutZ"; TestScalarAliases.deleteDirectory(new File(outputZ)); // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testUseScalarMultipleTimes"; TestScalarAliases.createLocalInputFile(inputPath, input); pigServer.setBatchOn(); pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;"); pigServer.registerQuery("Y = foreach A generate (a0 * C.count), (a1 / C.max);"); pigServer.registerQuery("Store Y into '" + outputY + "';"); pigServer.registerQuery("Z = foreach A generate (a1 + C.count), (a0 * C.max);"); pigServer.registerQuery("Store Z into '" + outputZ + "';"); // Test Multiquery store pigServer.executeBatch(); // Check output pigServer.registerQuery("M = LOAD '" + outputY + "' as (a0: int, a1: double);"); Iterator<Tuple> iter; Tuple t; iter = pigServer.openIterator("M"); t = iter.next(); assertTrue(t.toString().equals("(3,0.25)")); t = iter.next(); assertTrue(t.toString().equals("(6,0.5)")); t = iter.next(); 
assertTrue(t.toString().equals("(9,1.0)")); assertFalse(iter.hasNext()); // Check output pigServer.registerQuery("N = LOAD '" + outputZ + "' as (a0: double, a1: double);"); iter = pigServer.openIterator("N"); t = iter.next(); assertTrue(t.toString().equals("(8.0,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(13.0,40.0)")); t = iter.next(); assertTrue(t.toString().equals("(23.0,60.0)")); assertFalse(iter.hasNext()); // Non batch mode iter = pigServer.openIterator("Y"); t = iter.next(); assertTrue(t.toString().equals("(3,0.25)")); t = iter.next(); assertTrue(t.toString().equals("(6,0.5)")); t = iter.next(); assertTrue(t.toString().equals("(9,1.0)")); assertFalse(iter.hasNext()); // Check in non-batch mode iter = pigServer.openIterator("Z"); t = iter.next(); assertTrue(t.toString().equals("(8.0,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(13.0,40.0)")); t = iter.next(); assertTrue(t.toString().equals("(23.0,60.0)")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarWithNoSchema() throws Exception { String[] scalarInput = {"1\t5"}; String[] input = {"1\t5", "2\t10", "3\t20"}; String inputPath = BUILD_TEST_TMP + "table_testScalarWithNoSchema"; TestScalarAliases.createLocalInputFile(inputPath, input); String inputPathScalar = BUILD_TEST_TMP + "table_testScalarWithNoSchemaScalar"; TestScalarAliases.createLocalInputFile(inputPathScalar, scalarInput); // Load A as a scalar pigServer.registerQuery("A = LOAD '" + inputPath + "';"); pigServer.registerQuery("scalar = LOAD '" + inputPathScalar + "' as (count, total);"); pigServer.registerQuery("B = foreach A generate 5 / scalar.total;"); Iterator<Tuple> iter = pigServer.openIterator("B"); Tuple t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarWithTwoBranches() throws Exception { String[] inputA = {"1\t5", "2\t10", "3\t20"}; String[] inputX = {"pig", "hadoop", "rocks"}; String output = BUILD_TEST_TMP + "testScalarWithTwoBranchesDir"; TestScalarAliases.deleteDirectory(new File(output)); // Test the use of scalars in expressions String inputPathA = BUILD_TEST_TMP + "testScalarWithTwoBranchesA"; TestScalarAliases.createLocalInputFile(inputPathA, inputA); String inputPathX = BUILD_TEST_TMP + "testScalarWithTwoBranchesX"; TestScalarAliases.createLocalInputFile(inputPathX, inputX); // Test in script mode pigServer.setBatchOn(); pigServer.registerQuery("A = LOAD '" + inputPathA + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;"); pigServer.registerQuery("X = LOAD '" + inputPathX + "' as (names: chararray);"); pigServer.registerQuery("Y = foreach X generate names, C.max;"); pigServer.registerQuery("Store Y into '" + output + "';"); pigServer.executeBatch(); // Check output pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: chararray, a1: double);"); Iterator<Tuple> iter = pigServer.openIterator("Z"); Tuple t = iter.next(); assertTrue(t.toString().equals("(pig,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(hadoop,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(rocks,20.0)")); assertFalse(iter.hasNext()); // Check in non-batch mode iter = pigServer.openIterator("Y"); t = iter.next(); assertTrue(t.toString().equals("(pig,20.0)")); t = iter.next(); 
assertTrue(t.toString().equals("(hadoop,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(rocks,20.0)")); assertFalse(iter.hasNext()); pigServer.getPigContext().getProperties().remove("tez.am.inline.task.execution.max-tasks"); } // See PIG-1434 @Test public void testFilteredScalarDollarProj() throws Exception { String output = BUILD_TEST_TMP + "table_testFilteredScalarDollarProjDir"; TestScalarAliases.deleteDirectory(new File(output)); String[] input = { "1\t5\t[state#maine,city#portland]\t{(a),(b)}\t(a,b)", "2\t10\t\t\t", "3\t20\t\t\t" }; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testFilteredScalarDollarProj"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.setBatchOn(); pigServer.registerQuery( "A = LOAD '" + inputPath + "'" + " as (a0: long, a1: double, a2 : bytearray, " + "a3: bag{ t : tuple(tc : chararray)}, " + "a4: tuple(c1 : chararray, c2 : chararray) );"); pigServer.registerQuery("B = filter A by $1 < 8;"); pigServer.registerQuery( "Y = foreach A generate (a0 * B.$0), (a1 / B.$1), B.$2, B.$2#'state', B.$3, B.a4;"); pigServer.registerQuery("Store Y into '" + output + "';"); pigServer.explain("Y", System.err); pigServer.executeBatch(); // Check output pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: int, a1: double);"); pigServer.explain("Z", System.err); Iterator<Tuple> iter = pigServer.openIterator("Z"); Tuple t = iter.next(); assertTrue(t.toString().equals("(1,1.0)")); t = iter.next(); assertTrue(t.toString().equals("(2,2.0)")); t = iter.next(); assertTrue(t.toString().equals("(3,4.0)")); assertFalse(iter.hasNext()); // Check in non-batch mode iter = pigServer.openIterator("Y"); t = iter.next(); assertEquals(t.toString(), "(1,1.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))"); t = iter.next(); assertEquals(t.toString(), "(2,2.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))"); t = iter.next(); assertEquals(t.toString(), "(3,4.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))"); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarWithNoSchemaDollarProj() throws Exception { String[] scalarInput = {"1\t5"}; String[] input = {"1\t5", "2\t10", "3\t20"}; String inputPath = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProj"; TestScalarAliases.createLocalInputFile(inputPath, input); String inputPathScalar = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProjScalar"; TestScalarAliases.createLocalInputFile(inputPathScalar, scalarInput); // Load A as a scalar pigServer.registerQuery("A = LOAD '" + inputPath + "';"); pigServer.registerQuery("scalar = LOAD '" + inputPathScalar + "';"); pigServer.registerQuery("B = foreach A generate 5 / scalar.$1;"); Iterator<Tuple> iter = pigServer.openIterator("B"); Tuple t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarAliasesJoinClause() throws Exception { String[] inputA = {"1\t5", "2\t10", "3\t20"}; String[] inputB = {"Total3\tthree", "Total2\ttwo", "Total1\tone"}; // Test the use of scalars in expressions String inputPathA = BUILD_TEST_TMP + "table_testScalarAliasesJoinClauseA"; TestScalarAliases.createLocalInputFile(inputPathA, inputA); String inputPathB = BUILD_TEST_TMP + "table_testScalarAliasesJoinClauseB"; TestScalarAliases.createLocalInputFile(inputPathB, inputB); // Test in script mode 
pigServer.registerQuery("A = LOAD '" + inputPathA + "' as (a0, a1);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate COUNT(A) as count;"); pigServer.registerQuery("B = LOAD '" + inputPathB + "' as (b0:chararray, b1:chararray);"); pigServer.registerQuery("Y = join A by CONCAT('Total', (chararray)C.count), B by $0;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); String[] expected = new String[] {"(1,5,Total3,three)", "(2,10,Total3,three)", "(3,20,Total3,three)"}; Util.checkQueryOutputsAfterSortRecursive( iter, expected, org.apache.pig.newplan.logical.Util.translateSchema(pigServer.dumpSchema("Y"))); } // See PIG-1434 @Test public void testScalarAliasesFilterClause() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20", "4\t12", "5\t8"}; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesFilterClause"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0, a1);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate AVG(A.$1) as average;"); pigServer.registerQuery("Y = filter A by a1 > C.average;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); // Average is 11 Tuple t = iter.next(); assertTrue(t.toString().equals("(3,20)")); t = iter.next(); assertTrue(t.toString().equals("(4,12)")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarAliasesGrammarNegative() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20"}; String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesGrammar"; TestScalarAliases.createLocalInputFile(inputPath, input); try { pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A);"); // Only projections of C are supported pigServer.registerQuery("Y = foreach A generate C;"); pigServer.openIterator("Y"); // Control should not reach here fail("Scalar projections are only supported"); } catch (IOException pe) { assertTrue(pe.getMessage().contains("Invalid scalar projection: C")); } } // See PIG-1636 @Test public void testScalarAliasesLimit() throws Exception { String[] input = {"a\t1", "b\t2", "c\t3", "a\t4", "c\t5"}; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesLimit"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0:chararray, a1: int);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate SUM(A.$1) as total;"); pigServer.registerQuery("C1 = limit C 1;"); pigServer.registerQuery("Y = foreach A generate a0, a1 * (double)C1.total;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); // Average is 11 Tuple t = iter.next(); assertTrue(t.toString().equals("(a,15.0)")); t = iter.next(); assertTrue(t.toString().equals("(b,30.0)")); t = iter.next(); assertTrue(t.toString().equals("(c,45.0)")); t = iter.next(); assertTrue(t.toString().equals("(a,60.0)")); t = iter.next(); assertTrue(t.toString().equals("(c,75.0)")); assertFalse(iter.hasNext()); } /** * Test that a specific string is included in the error message when an exception is thrown for * using a relation in a scalar context without projecting any columns out of it */ // See PIG-1788 @Test public void testScalarWithNoProjection() 
throws Exception { String query = " A = load 'table_testScalarWithNoProjection' as (x, y);" + " B = group A by x;" + // B is unintentionally being used as scalar, // the user intends it to be COUNT(A) " C = foreach B generate COUNT(B);"; Util.checkExceptionMessage( query, "C", "A column needs to be projected from a relation" + " for it to be used as a scalar"); } @Test public void testScalarNullValue() throws Exception { Storage.Data data = Storage.resetData(pigServer); data.set("input", Storage.tuple("a", 1), Storage.tuple("b", 2)); pigServer.setBatchOn(); pigServer.registerQuery("A = load 'input' using mock.Storage() as (a:chararray, b:int);"); pigServer.registerQuery("B = FILTER A by a == 'c';"); pigServer.registerQuery("C = FOREACH A generate a, b + B.b;"); pigServer.registerQuery("store C into 'output' using mock.Storage();"); pigServer.executeBatch(); List<Tuple> actualResults = data.get("output"); List<Tuple> expectedResults = Util.getTuplesFromConstantTupleStrings(new String[] {"('a', null)", "('b', null)"}); Util.checkQueryOutputsAfterSort(actualResults.iterator(), expectedResults); } }
/** * Calls getNext to get the next ForEach result. The input for POJoinPackage is a (key, NullableTuple) * pair. We will materialize n-1 inputs into bags and feed input#n one tuple at a time to the delegated * ForEach operator; the input for ForEach is * * <p>(input#1, input#2, input#3....input#n[i]), i=(1..k), supposing input#n consists of k tuples. * For every ForEach input, pull all the results from ForEach. getNext will be * called multiple times for a particular input; it returns one output tuple from ForEach every * time we call getNext, so we need to maintain internal state to keep track of where we are. */ @Override public Result getNext(Tuple t) throws ExecException { if (firstTime) { firstTime = false; if (PigMapReduce.sJobConf != null) { String bagType = PigMapReduce.sJobConf.get("pig.cachedbag.type"); if (bagType != null && bagType.equalsIgnoreCase("default")) { useDefaultBag = true; } } } // if a previous call to foreach.getNext() // has still not returned all output, process it if (forEach.processingPlan) { forEachResult = forEach.getNext(t1); switch (forEachResult.returnStatus) { case POStatus.STATUS_OK: case POStatus.STATUS_NULL: case POStatus.STATUS_ERR: return forEachResult; case POStatus.STATUS_EOP: break; } } NullableTuple it = null; // If we see a new NullableTupleIterator, materialize n-1 inputs, construct ForEach input // tuple res = (key, input#1, input#2....input#n), the only missing value is input#n, // we will get input#n one tuple at a time, fill in res, feed to ForEach. // After this block, we have the first tuple of input#n in hand (kept in variable it) if (newKey) { lastInputTuple = false; // Put n-1 inputs into bags dbs = new DataBag[numInputs]; for (int i = 0; i < numInputs - 1; i++) { dbs[i] = useDefaultBag ? BagFactory.getInstance().newDefaultBag() // In a very rare case if there is a POStream after this // POJoinPackage in the pipeline and is also blocking the pipeline; // constructor argument should be 2 * numInputs. But for one obscure // case we don't want to pay the penalty all the time. : new InternalCachedBag(numInputs - 1); } // For last bag, we always use NonSpillableBag. dbs[lastBagIndex] = new NonSpillableDataBag((int) chunkSize); // For each Nullable tuple in the input, put it // into the corresponding bag based on the index, // except for the last input, which we will stream // The tuples will arrive in the order of the index, // starting from index 0 and such that all tuples for // a given index arrive before a tuple for the next // index does. 
while (tupIter.hasNext()) { it = tupIter.next(); int itIndex = it.getIndex(); if (itIndex != numInputs - 1) { dbs[itIndex].add(getValueTuple(it, itIndex)); } else { lastInputTuple = true; break; } if (reporter != null) reporter.progress(); } // If we don't have any tuple for input#n // we do not need any further process, return EOP if (!lastInputTuple) { // we will return at this point because we ought // to be having a flatten on this last input // and we have an empty bag which should result // in this key being taken out of the output newKey = true; return eopResult; } res = mTupleFactory.newTuple(numInputs + 1); for (int i = 0; i < dbs.length; i++) res.set(i + 1, dbs[i]); res.set(0, key); // if we have an inner anywhere and the corresponding // bag is empty, we can just return for (int i = 0; i < dbs.length - 1; i++) { if (inner[i] && dbs[i].size() == 0) { detachInput(); return eopResult; } } newKey = false; // set up the bag with last input to contain // a chunk of CHUNKSIZE values OR the entire bag if // it has less than CHUNKSIZE values - the idea is in most // cases the values are > CHUNKSIZE in number and in // those cases we will be sending the last bag // as a set of smaller chunked bags thus holding lesser // in memory // the first tuple can be directly retrieved from "it" dbs[lastBagIndex].add(getValueTuple(it, it.getIndex())); for (int i = 0; i < chunkSize - 1 && tupIter.hasNext(); i++) { it = tupIter.next(); dbs[lastBagIndex].add(getValueTuple(it, it.getIndex())); } // Attach the input to forEach forEach.attachInput(res); // pull output tuple from ForEach Result forEachResult = forEach.getNext(t1); { switch (forEachResult.returnStatus) { case POStatus.STATUS_OK: case POStatus.STATUS_NULL: case POStatus.STATUS_ERR: return forEachResult; case POStatus.STATUS_EOP: break; } } } // Keep attaching input tuple to ForEach, until: // 1. We can initialize ForEach.getNext(); // 2. There is no more input#n while (true) { if (tupIter.hasNext()) { // try setting up a bag of CHUNKSIZE OR // the remainder of the bag of last input // (if < CHUNKSIZE) to foreach dbs[lastBagIndex].clear(); // clear last chunk for (int i = 0; i < chunkSize && tupIter.hasNext(); i++) { it = tupIter.next(); dbs[lastBagIndex].add(getValueTuple(it, it.getIndex())); } } else // if we do not have any more tuples for input#n, return EOP { detachInput(); newKey = true; return eopResult; } // Attach the input to forEach forEach.attachInput(res); // pull output tuple from ForEach Result forEachResult = forEach.getNext(t1); { switch (forEachResult.returnStatus) { case POStatus.STATUS_OK: case POStatus.STATUS_NULL: case POStatus.STATUS_ERR: return forEachResult; case POStatus.STATUS_EOP: break; } } } }
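Stripped of the operator plumbing, the chunking of the last input reduces to pulling at most chunkSize tuples per batch from the iterator and re-attaching each batch. A minimal sketch (hypothetical, plain Java):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class ChunkDemo {
    // Pull at most chunkSize items per batch, mirroring how POJoinPackage
    // refills the last bag before re-attaching it to the ForEach.
    static <T> List<T> nextChunk(Iterator<T> it, int chunkSize) {
        List<T> chunk = new ArrayList<>();
        for (int i = 0; i < chunkSize && it.hasNext(); i++) {
            chunk.add(it.next());
        }
        return chunk;
    }

    public static void main(String[] args) {
        Iterator<Integer> it = Arrays.asList(1, 2, 3, 4, 5).iterator();
        List<Integer> chunk;
        while (!(chunk = nextChunk(it, 2)).isEmpty()) {
            System.out.println(chunk); // [1, 2] then [3, 4] then [5]
        }
    }
}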
/**
 * Returns a new bag containing the given tuple followed by all tuples of the input bag,
 * i.e. prepends t to inputBag. The input bag itself is left unmodified.
 */
public DataBag call(DataBag inputBag, Tuple t) throws IOException {
  DataBag outputBag = BagFactory.getInstance().newDefaultBag();
  outputBag.add(t);
  for (Tuple x : inputBag) outputBag.add(x);
  return outputBag;
}
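// A hedged usage sketch for the prepend helper above. The enclosing class is not shown in this
// excerpt, so "BagPrepend" is an assumed name. Note that every call copies the entire input
// bag, so repeatedly prepending to a large bag costs O(n) per call.
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class BagPrependDemo {
  public static void main(String[] args) throws Exception {
    TupleFactory tf = TupleFactory.getInstance();
    DataBag in = BagFactory.getInstance().newDefaultBag();
    in.add(tf.newTuple("a"));
    in.add(tf.newTuple("b"));
    Tuple head = tf.newTuple("first");
    DataBag out = new BagPrepend().call(in, head); // hypothetical enclosing class
    System.out.println(out); // {(first),(a),(b)}; the input bag is left unmodified
  }
}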
/**
 * This class provides standard conversions between utf8 encoded data and pig data types. It is
 * intended to be extended by load and store functions (such as {@link PigStorage}).
 */
public class Utf8StorageConverter implements LoadStoreCaster {
  protected BagFactory mBagFactory = BagFactory.getInstance();
  protected TupleFactory mTupleFactory = TupleFactory.getInstance();
  protected final Log mLog = LogFactory.getLog(getClass());

  private static final Integer mMaxInt = Integer.valueOf(Integer.MAX_VALUE);
  private static final Integer mMinInt = Integer.valueOf(Integer.MIN_VALUE);
  private static final Long mMaxLong = Long.valueOf(Long.MAX_VALUE);
  private static final Long mMinLong = Long.valueOf(Long.MIN_VALUE);
  private static final int BUFFER_SIZE = 1024;

  public Utf8StorageConverter() {}

  private char findStartChar(char start) throws IOException {
    switch (start) {
      case ')':
        return '(';
      case ']':
        return '[';
      case '}':
        return '{';
      default:
        throw new IOException("Unknown start character");
    }
  }

  private DataBag consumeBag(PushbackInputStream in, ResourceFieldSchema fieldSchema)
      throws IOException {
    if (fieldSchema == null) {
      throw new IOException("Schema is null");
    }
    ResourceFieldSchema[] fss = fieldSchema.getSchema().getFields();
    Tuple t;
    int buf;
    while ((buf = in.read()) != '{') {
      if (buf == -1) {
        throw new IOException("Unexpected end of bag");
      }
    }
    if (fss.length != 1) throw new IOException("Only tuple is allowed inside bag schema");
    ResourceFieldSchema fs = fss[0];
    DataBag db = DefaultBagFactory.getInstance().newDefaultBag();
    while (true) {
      t = consumeTuple(in, fs);
      if (t != null) db.add(t);
      while ((buf = in.read()) != '}' && buf != ',') {
        if (buf == -1) {
          throw new IOException("Unexpected end of bag");
        }
      }
      if (buf == '}') break;
    }
    return db;
  }

  private Tuple consumeTuple(PushbackInputStream in, ResourceFieldSchema fieldSchema)
      throws IOException {
    if (fieldSchema == null) {
      throw new IOException("Schema is null");
    }
    int buf;
    ByteArrayOutputStream mOut;
    // Skip ahead to the opening '('. A '}' means the enclosing bag has ended,
    // so push it back and return null.
    while ((buf = in.read()) != '(') {
      if (buf == -1) {
        throw new IOException("Unexpected end of tuple");
      }
      if (buf == '}') {
        in.unread(buf);
        return null;
      }
    }
    Tuple t = TupleFactory.getInstance().newTuple();
    if (fieldSchema.getSchema() != null && fieldSchema.getSchema().getFields().length != 0) {
      ResourceFieldSchema[] fss = fieldSchema.getSchema().getFields();
      // Interpret the items inside the tuple one by one based on the inner schema.
      for (int i = 0; i < fss.length; i++) {
        Object field;
        ResourceFieldSchema fs = fss[i];
        int delimit = ',';
        if (i == fss.length - 1) delimit = ')';
        if (DataType.isComplex(fs.getType())) {
          field = consumeComplexType(in, fs);
          while ((buf = in.read()) != delimit) {
            if (buf == -1) {
              throw new IOException("Unexpected end of tuple");
            }
          }
        } else {
          mOut = new ByteArrayOutputStream(BUFFER_SIZE);
          while ((buf = in.read()) != delimit) {
            if (buf == -1) {
              throw new IOException("Unexpected end of tuple");
            }
            mOut.write(buf);
          }
          field = parseSimpleType(mOut.toByteArray(), fs);
        }
        t.append(field);
      }
    } else {
      // No inner schema: treat everything inside the tuple as bytearray. The deque keeps
      // track of nested tuple/bag/map openers; we do not interpret nested items, we save
      // them as bytearray.
      Deque<Character> level = new LinkedList<Character>();
      mOut = new ByteArrayOutputStream(BUFFER_SIZE);
      while (true) {
        buf = in.read();
        if (buf == -1) {
          throw new IOException("Unexpected end of tuple");
        }
        if (buf == '[' || buf == '{' || buf == '(') {
          level.push((char) buf);
          mOut.write(buf);
        } else if (buf == ')' && level.isEmpty()) {
          // End of tuple
          DataByteArray value = new DataByteArray(mOut.toByteArray());
          t.append(value);
          break;
        } else if (buf == ',' && level.isEmpty()) {
          DataByteArray value = new DataByteArray(mOut.toByteArray());
          t.append(value);
          mOut.reset();
        } else if (buf == ']' || buf == '}' || buf == ')') {
          if (level.peek() == findStartChar((char) buf)) level.pop();
          else throw new IOException("Malformed tuple");
          mOut.write(buf);
        } else {
          mOut.write(buf);
        }
      }
    }
    return t;
  }

  private Map<String, Object> consumeMap(PushbackInputStream in, ResourceFieldSchema fieldSchema)
      throws IOException {
    int buf;
    while ((buf = in.read()) != '[') {
      if (buf == -1) {
        throw new IOException("Unexpected end of map");
      }
    }
    HashMap<String, Object> m = new HashMap<String, Object>();
    ByteArrayOutputStream mOut = new ByteArrayOutputStream(BUFFER_SIZE);
    while (true) {
      // Read the key; we assume a key cannot contain special characters
      // such as #, (, [, {, }, ], or ).
      while ((buf = in.read()) != '#') {
        if (buf == -1) {
          throw new IOException("Unexpected end of map");
        }
        mOut.write(buf);
      }
      String key = bytesToCharArray(mOut.toByteArray());
      if (key.length() == 0) throw new IOException("Map key can not be empty");
      // Read the value. The deque keeps track of nested tuple/bag/map openers;
      // we do not interpret nested items, we save them as bytearray.
      mOut.reset();
      Deque<Character> level = new LinkedList<Character>();
      while (true) {
        buf = in.read();
        if (buf == -1) {
          throw new IOException("Unexpected end of map");
        }
        if (buf == '[' || buf == '{' || buf == '(') {
          level.push((char) buf);
        } else if (buf == ']' && level.isEmpty()) {
          // End of map
          break;
        } else if (buf == ']' || buf == '}' || buf == ')') {
          if (level.isEmpty()) throw new IOException("Malformed map");
          if (level.peek() == findStartChar((char) buf)) level.pop();
        } else if (buf == ',' && level.isEmpty()) {
          // Current map item complete
          break;
        }
        mOut.write(buf);
      }
      Object value = null;
      if (fieldSchema != null && fieldSchema.getSchema() != null && mOut.size() > 0) {
        value = bytesToObject(mOut.toByteArray(), fieldSchema.getSchema().getFields()[0]);
      } else if (mOut.size() > 0) {
        // untyped map
        value = new DataByteArray(mOut.toByteArray());
      }
      m.put(key, value);
      mOut.reset();
      if (buf == ']') break;
    }
    return m;
  }

  private Object bytesToObject(byte[] b, ResourceFieldSchema fs) throws IOException {
    Object field;
    if (DataType.isComplex(fs.getType())) {
      ByteArrayInputStream bis = new ByteArrayInputStream(b);
      PushbackInputStream in = new PushbackInputStream(bis);
      field = consumeComplexType(in, fs);
    } else {
      field = parseSimpleType(b, fs);
    }
    return field;
  }

  private Object consumeComplexType(PushbackInputStream in, ResourceFieldSchema complexFieldSchema)
      throws IOException {
    Object field;
    switch (complexFieldSchema.getType()) {
      case DataType.BAG:
        field = consumeBag(in, complexFieldSchema);
        break;
      case DataType.TUPLE:
        field = consumeTuple(in, complexFieldSchema);
        break;
      case DataType.MAP:
        field = consumeMap(in, complexFieldSchema);
        break;
      default:
        throw new IOException("Unknown complex data type");
    }
    return field;
  }

  private Object parseSimpleType(byte[] b, ResourceFieldSchema simpleFieldSchema)
      throws IOException {
    Object field;
    switch (simpleFieldSchema.getType()) {
      case DataType.INTEGER:
        field = bytesToInteger(b);
        break;
      case DataType.LONG:
        field = bytesToLong(b);
        break;
      case DataType.FLOAT:
        field = bytesToFloat(b);
        break;
      case DataType.DOUBLE:
        field = bytesToDouble(b);
        break;
      case DataType.CHARARRAY:
        field = bytesToCharArray(b);
        break;
      case DataType.BYTEARRAY:
        field = new DataByteArray(b);
        break;
      case DataType.BOOLEAN:
        field = bytesToBoolean(b);
        break;
      case DataType.BIGINTEGER:
        field = bytesToBigInteger(b);
        break;
      case DataType.BIGDECIMAL:
        field = bytesToBigDecimal(b);
        break;
      case DataType.DATETIME:
        field = bytesToDateTime(b);
        break;
      default:
        throw new IOException("Unknown simple data type");
    }
    return field;
  }

  @Override
  public DataBag bytesToBag(byte[] b, ResourceFieldSchema schema) throws IOException {
    if (b == null) return null;
    DataBag db;
    try {
      ByteArrayInputStream bis = new ByteArrayInputStream(b);
      PushbackInputStream in = new PushbackInputStream(bis);
      db = consumeBag(in, schema);
    } catch (IOException e) {
      LogUtils.warn(
          this,
          "Unable to interpret value "
              + Arrays.toString(b)
              + " in field being "
              + "converted to type bag, caught IOException <"
              + e.getMessage()
              + "> field discarded",
          PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
          mLog);
      return null;
    }
    return db;
  }

  @Override
  public String bytesToCharArray(byte[] b) throws IOException {
    if (b == null) return null;
    return new String(b, "UTF-8");
  }

  @Override
  public Double bytesToDouble(byte[] b) {
    if (b == null || b.length == 0) {
      return null;
    }
    try {
      return Double.valueOf(new String(b));
    } catch (NumberFormatException nfe) {
      LogUtils.warn(
          this,
          "Unable to interpret value "
              + Arrays.toString(b)
              + " in field being "
              + "converted to double, caught NumberFormatException <"
              + nfe.getMessage()
              + "> field discarded",
          PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
          mLog);
      return null;
    }
  }

  @Override
  public Float bytesToFloat(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
      return null;
    }
    String s;
    if (b.length > 0 && (b[b.length - 1] == 'F' || b[b.length - 1] == 'f')) {
      // Strip the trailing F/f that Pig allows on float literals.
      s = new String(b, 0, b.length - 1);
    } else {
      s = new String(b);
    }
    try {
      return Float.valueOf(s);
    } catch (NumberFormatException nfe) {
      LogUtils.warn(
          this,
          "Unable to interpret value "
              + Arrays.toString(b)
              + " in field being "
              + "converted to float, caught NumberFormatException <"
              + nfe.getMessage()
              + "> field discarded",
          PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
          mLog);
      return null;
    }
  }

  @Override
  public Boolean bytesToBoolean(byte[] b) throws IOException {
    if (b == null) return null;
    String s = new String(b);
    if (s.equalsIgnoreCase("true")) {
      return Boolean.TRUE;
    } else if (s.equalsIgnoreCase("false")) {
      return Boolean.FALSE;
    } else {
      return null;
    }
  }

  /**
   * Sanity check of whether this number is a valid integer or long.
   *
   * @param number the number to check
   * @return true if it doesn't contain any invalid characters, i.e. only contains digits and '-'
   */
  private static boolean sanityCheckIntegerLong(String number) {
    for (int i = 0; i < number.length(); i++) {
      if (number.charAt(i) >= '0' && number.charAt(i) <= '9'
          || i == 0 && number.charAt(i) == '-') {
        // valid character
      } else {
        // contains invalid characters, cannot be an integer or long
        return false;
      }
    }
    return true;
  }

  @Override
  public Integer bytesToInteger(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
      return null;
    }
    String s = new String(b);
    s = s.trim();
    Integer ret = null;
    // See PIG-2835. Using exception handling to check whether the value is
    // a double is very expensive, so we run our own sanity check first.
    if (sanityCheckIntegerLong(s)) {
      try {
        ret = Integer.valueOf(s);
      } catch (NumberFormatException nfe) {
      }
    }
    if (ret == null) {
      // It's possible that this field can be interpreted as a double.
      // Unfortunately Java doesn't handle this in Integer.valueOf, so
      // we need to try to convert it to a double and, if that works,
      // truncate to an int.
      try {
        Double d = Double.valueOf(s);
        // Need to check for an overflow error
        if (Double.compare(d.doubleValue(), mMaxInt.doubleValue() + 1) >= 0
            || Double.compare(d.doubleValue(), mMinInt.doubleValue() - 1) <= 0) {
          LogUtils.warn(
              this, "Value " + d + " too large for integer", PigWarning.TOO_LARGE_FOR_INT, mLog);
          return null;
        }
        return Integer.valueOf(d.intValue());
      } catch (NumberFormatException nfe2) {
        LogUtils.warn(
            this,
            "Unable to interpret value "
                + Arrays.toString(b)
                + " in field being "
                + "converted to int, caught NumberFormatException <"
                + nfe2.getMessage()
                + "> field discarded",
            PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
            mLog);
        return null;
      }
    }
    return ret;
  }

  @Override
  public Long bytesToLong(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
      return null;
    }
    String s = new String(b).trim();
    if (s.endsWith("l") || s.endsWith("L")) {
      s = s.substring(0, s.length() - 1);
    }
    // See PIG-2835. Using exception handling to check whether the value is
    // a double is very expensive, so we run our own sanity check first.
    Long ret = null;
    if (sanityCheckIntegerLong(s)) {
      try {
        ret = Long.valueOf(s);
      } catch (NumberFormatException nfe) {
      }
    }
    if (ret == null) {
      // It's possible that this field can be interpreted as a double.
      // Unfortunately Java doesn't handle this in Long.valueOf, so
      // we need to try to convert it to a double and, if that works,
      // truncate to a long.
      try {
        Double d = Double.valueOf(s);
        // Need to check for an overflow error
        if (Double.compare(d.doubleValue(), mMaxLong.doubleValue() + 1) > 0
            || Double.compare(d.doubleValue(), mMinLong.doubleValue() - 1) < 0) {
          LogUtils.warn(
              this, "Value " + d + " too large for long", PigWarning.TOO_LARGE_FOR_INT, mLog);
          return null;
        }
        return Long.valueOf(d.longValue());
      } catch (NumberFormatException nfe2) {
        LogUtils.warn(
            this,
            "Unable to interpret value "
                + Arrays.toString(b)
                + " in field being "
                + "converted to long, caught NumberFormatException <"
                + nfe2.getMessage()
                + "> field discarded",
            PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
            mLog);
        return null;
      }
    }
    return ret;
  }

  @Override
  public DateTime bytesToDateTime(byte[] b) throws IOException {
    if (b == null) {
      return null;
    }
    try {
      String dtStr = new String(b);
      return ToDate.extractDateTime(dtStr);
    } catch (IllegalArgumentException e) {
      LogUtils.warn(
          this,
          "Unable to interpret value "
              + Arrays.toString(b)
              + " in field being "
              + "converted to datetime, caught IllegalArgumentException <"
              + e.getMessage()
              + "> field discarded",
          PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
          mLog);
      return null;
    }
  }

  @Override
  public Map<String, Object> bytesToMap(byte[] b, ResourceFieldSchema fieldSchema)
      throws IOException {
    if (b == null) return null;
    Map<String, Object> map;
    try {
      ByteArrayInputStream bis = new ByteArrayInputStream(b);
      PushbackInputStream in = new PushbackInputStream(bis);
      map = consumeMap(in, fieldSchema);
    } catch (IOException e) {
      LogUtils.warn(
          this,
          "Unable to interpret value "
              + Arrays.toString(b)
              + " in field being "
              + "converted to type map, caught IOException <"
              + e.getMessage()
              + "> field discarded",
          PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
          mLog);
      return null;
    }
    return map;
  }

  @Override
  public Map<String, Object> bytesToMap(byte[] b) throws IOException {
    return bytesToMap(b, null);
  }

  @Override
  public Tuple bytesToTuple(byte[] b, ResourceFieldSchema fieldSchema) throws IOException {
    if (b == null) return null;
    Tuple t;
    try {
      ByteArrayInputStream bis = new ByteArrayInputStream(b);
      PushbackInputStream in = new PushbackInputStream(bis);
      t = consumeTuple(in, fieldSchema);
    } catch (IOException e) {
      LogUtils.warn(
          this,
          "Unable to interpret value "
              + Arrays.toString(b)
              + " in field being "
              + "converted to type tuple, caught IOException <"
              + e.getMessage()
              + "> field discarded",
          PigWarning.FIELD_DISCARDED_TYPE_CONVERSION_FAILED,
          mLog);
      return null;
    }
    return t;
  }

  @Override
  public BigInteger bytesToBigInteger(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
      return null;
    }
    return new BigInteger(new String(b));
  }

  @Override
  public BigDecimal bytesToBigDecimal(byte[] b) throws IOException {
    if (b == null || b.length == 0) {
      return null;
    }
    return new BigDecimal(new String(b));
  }

  @Override
  public byte[] toBytes(DataBag bag) throws IOException {
    return bag.toString().getBytes();
  }

  @Override
  public byte[] toBytes(String s) throws IOException {
    return s.getBytes();
  }

  @Override
  public byte[] toBytes(Double d) throws IOException {
    return d.toString().getBytes();
  }

  @Override
  public byte[] toBytes(Float f) throws IOException {
    return f.toString().getBytes();
  }

  @Override
  public byte[] toBytes(Integer i) throws IOException {
    return i.toString().getBytes();
  }

  @Override
  public byte[] toBytes(Long l) throws IOException {
    return l.toString().getBytes();
  }

  @Override
  public byte[] toBytes(Boolean b) throws IOException {
    return b.toString().getBytes();
  }

  @Override
  public byte[] toBytes(DateTime dt) throws IOException {
    return dt.toString().getBytes();
  }

  @Override
  public byte[] toBytes(Map<String, Object> m) throws IOException {
    return DataType.mapToString(m).getBytes();
  }

  @Override
  public byte[] toBytes(Tuple t) throws IOException {
    return t.toString().getBytes();
  }

  @Override
  public byte[] toBytes(DataByteArray a) throws IOException {
    return a.get();
  }

  @Override
  public byte[] toBytes(BigInteger bi) throws IOException {
    return bi.toString().getBytes();
  }

  @Override
  public byte[] toBytes(BigDecimal bd) throws IOException {
    return bd.toString().getBytes();
  }
}
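// A hedged driver for the converter above; the class name is hypothetical and only methods
// defined in Utf8StorageConverter are used. It exercises the PIG-2835 digits-only fast path,
// its double fallback, and the schema-less map conversion, which yields DataByteArray values.
import java.nio.charset.StandardCharsets;
import java.util.Map;

public class Utf8ConverterDemo {
  public static void main(String[] args) throws Exception {
    Utf8StorageConverter conv = new Utf8StorageConverter();

    // Digits-only text passes sanityCheckIntegerLong and parses directly.
    System.out.println(conv.bytesToInteger("42".getBytes(StandardCharsets.UTF_8))); // 42

    // "3.9" fails the digits-only check, so the double fallback parses it
    // and the value is truncated toward zero.
    System.out.println(conv.bytesToInteger("3.9".getBytes(StandardCharsets.UTF_8))); // 3

    // A trailing L/l is stripped before a long is parsed.
    System.out.println(conv.bytesToLong("7L".getBytes(StandardCharsets.UTF_8))); // 7

    // Without a schema, map values come back as DataByteArray.
    Map<String, Object> m = conv.bytesToMap("[k1#v1,k2#v2]".getBytes(StandardCharsets.UTF_8));
    System.out.println(m); // e.g. {k1=v1, k2=v2} (HashMap iteration order is unspecified)
  }
}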
/**
 * Scalable simple random sampling.
 *
 * <p>This UDF implements the scalable simple random sampling algorithm described in
 *
 * <pre>
 * X. Meng, Scalable Simple Random Sampling and Stratified Sampling, ICML 2013.
 * </pre>
 *
 * It takes a sampling probability p as input and outputs a simple random sample of size exactly
 * ceil(p*n) with probability at least 99.99%, where n is the size of the population. This UDF is
 * very useful for stratified sampling. For example,
 *
 * <pre>
 * DEFINE SRS datafu.pig.sampling.SimpleRandomSample('0.01');
 * examples = LOAD ...
 * grouped = GROUP examples BY label;
 * sampled = FOREACH grouped GENERATE FLATTEN(SRS(examples));
 * STORE sampled ...
 * </pre>
 *
 * We note that, in a Java Hadoop job, pre-selected records could be output directly using
 * MultipleOutputs. However, that feature is not available in a Pig UDF, so pre-selected records
 * still go through the sort phase. As long as the sample size is not huge, this should not be a
 * big problem.
 *
 * @author ximeng
 */
public class SimpleRandomSample extends AlgebraicEvalFunc<DataBag> {
  private static final TupleFactory tupleFactory = TupleFactory.getInstance();
  private static final BagFactory bagFactory = BagFactory.getInstance();

  public SimpleRandomSample() {}

  public SimpleRandomSample(String samplingProbability) {
    double p = Double.parseDouble(samplingProbability);
    if (p < 0.0 || p > 1.0) {
      throw new IllegalArgumentException("Sampling probability must be inside [0, 1].");
    }
  }

  @Override
  public String getInitial() {
    return Initial.class.getName();
  }

  @Override
  public String getIntermed() {
    return Intermediate.class.getName();
  }

  @Override
  public String getFinal() {
    return Final.class.getName();
  }

  @Override
  public Schema outputSchema(Schema input) {
    try {
      Schema.FieldSchema inputFieldSchema = input.getField(0);
      if (inputFieldSchema.type != DataType.BAG) {
        throw new RuntimeException("Expected a BAG as input");
      }
      return new Schema(
          new Schema.FieldSchema(
              getSchemaName(this.getClass().getName().toLowerCase(), input),
              inputFieldSchema.schema,
              DataType.BAG));
    } catch (FrontendException e) {
      throw new RuntimeException(e);
    }
  }

  public static class Initial extends EvalFunc<Tuple> {
    private double _samplingProbability;
    private RandomDataImpl _rdg = new RandomDataImpl();

    public Initial() {}

    public Initial(String samplingProbability) {
      _samplingProbability = Double.parseDouble(samplingProbability);
    }

    @Override
    public Tuple exec(Tuple input) throws IOException {
      Tuple output = tupleFactory.newTuple();
      DataBag selected = bagFactory.newDefaultBag();
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());
      DataBag items = (DataBag) input.get(0);
      if (items != null) {
        long n = items.size();
        double q1 = getQ1(n, _samplingProbability);
        double q2 = getQ2(n, _samplingProbability);
        for (Tuple item : items) {
          double key = _rdg.nextUniform(0.0d, 1.0d);
          if (key < q1) {
            // Below q1: accept immediately.
            selected.add(item);
          } else if (key < q2) {
            // In [q1, q2): wait-list together with the score.
            waiting.add(new ScoredTuple(key, item).getIntermediateTuple(tupleFactory));
          }
          // At or above q2: reject immediately.
        }
        output.append(n);
        output.append(selected);
        output.append(waiting);
      }
      return output;
    }
  }

  public static class Intermediate extends EvalFunc<Tuple> {
    private double _samplingProbability;

    public Intermediate() {}

    public Intermediate(String samplingProbability) {
      _samplingProbability = Double.parseDouble(samplingProbability);
    }

    @Override
    public Tuple exec(Tuple input) throws IOException {
      DataBag bag = (DataBag) input.get(0);
      DataBag selected = bagFactory.newDefaultBag();
      DataBag aggWaiting = bagFactory.newSortedBag(new ScoredTupleComparator());
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());
      Tuple output = tupleFactory.newTuple();
      long n = 0L;
      for (Tuple innerTuple : bag) {
        n += (Long) innerTuple.get(0);
        selected.addAll((DataBag) innerTuple.get(1));
        double q1 = getQ1(n, _samplingProbability);
        double q2 = getQ2(n, _samplingProbability);
        for (Tuple t : (DataBag) innerTuple.get(2)) {
          ScoredTuple scored = ScoredTuple.fromIntermediateTuple(t);
          if (scored.getScore() < q1) {
            selected.add(scored.getTuple());
          } else if (scored.getScore() < q2) {
            aggWaiting.add(t);
          } else {
            break;
          }
        }
      }
      double q1 = getQ1(n, _samplingProbability);
      double q2 = getQ2(n, _samplingProbability);
      for (Tuple t : aggWaiting) {
        ScoredTuple scored = ScoredTuple.fromIntermediateTuple(t);
        if (scored.getScore() < q1) {
          selected.add(scored.getTuple());
        } else if (scored.getScore() < q2) {
          waiting.add(t);
        } else {
          break;
        }
      }
      output.append(n);
      output.append(selected);
      output.append(waiting);
      System.err.println(
          "Read " + n + " items, selected " + selected.size() + ", and wait-listed "
              + aggWaiting.size() + ".");
      return output;
    }
  }

  public static class Final extends EvalFunc<DataBag> {
    private double _samplingProbability;

    public Final() {}

    public Final(String samplingProbability) {
      _samplingProbability = Double.parseDouble(samplingProbability);
    }

    @Override
    public DataBag exec(Tuple input) throws IOException {
      DataBag bag = (DataBag) input.get(0);
      long n = 0L;
      DataBag selected = bagFactory.newDefaultBag();
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());
      for (Tuple innerTuple : bag) {
        n += (Long) innerTuple.get(0);
        selected.addAll((DataBag) innerTuple.get(1));
        waiting.addAll((DataBag) innerTuple.get(2));
      }
      long sampleSize = (long) Math.ceil(_samplingProbability * n);
      long nNeeded = sampleSize - selected.size();
      // Promote the lowest-scored wait-listed tuples until the exact sample size is reached.
      for (Tuple scored : waiting) {
        if (nNeeded <= 0) {
          break;
        }
        selected.add(ScoredTuple.fromIntermediateTuple(scored).getTuple());
        nNeeded--;
      }
      return selected;
    }
  }

  private static class ScoredTupleComparator implements Comparator<Tuple> {
    @Override
    public int compare(Tuple o1, Tuple o2) {
      try {
        ScoredTuple t1 = ScoredTuple.fromIntermediateTuple(o1);
        ScoredTuple t2 = ScoredTuple.fromIntermediateTuple(o2);
        return t1.getScore().compareTo(t2.getScore());
      } catch (Throwable e) {
        throw new RuntimeException("Cannot compare " + o1 + " and " + o2 + ".", e);
      }
    }
  }

  // Acceptance threshold: scores below q1 are selected immediately.
  private static double getQ1(long n, double p) {
    double t1 = 20.0 / (3.0 * n);
    double q1 = p + t1 - Math.sqrt(t1 * t1 + 3.0 * t1 * p);
    return q1;
  }

  // Rejection threshold: scores at or above q2 are rejected immediately;
  // scores in [q1, q2) are wait-listed.
  private static double getQ2(long n, double p) {
    double t2 = 10.0 / n;
    double q2 = p + t2 + Math.sqrt(t2 * t2 + 2.0 * t2 * p);
    return q2;
  }
}
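// A hedged numeric sketch (hypothetical class name) that evaluates the same q1/q2 formulas as
// the private getQ1/getQ2 above. It shows that the two thresholds bracket p and that the
// wait-list band q2 - q1 shrinks as the population n grows, which is why only a small fraction
// of records survives to the sort phase.
public class ThresholdDemo {
  public static void main(String[] args) {
    double p = 0.01;
    for (long n : new long[] {1000L, 100000L, 10000000L}) {
      double t1 = 20.0 / (3.0 * n);
      double q1 = p + t1 - Math.sqrt(t1 * t1 + 3.0 * t1 * p); // accept scores below q1
      double t2 = 10.0 / n;
      double q2 = p + t2 + Math.sqrt(t2 * t2 + 2.0 * t2 * p); // reject scores at or above q2
      // For n = 1000 this prints roughly q1 = 0.001 and q2 = 0.037; both converge to p = 0.01.
      System.out.printf("n=%d  q1=%.6f  q2=%.6f  wait-list band=%.6f%n", n, q1, q2, q2 - q1);
    }
  }
}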