@Override public DataBag exec(Tuple input) throws IOException { DataBag bag = (DataBag) input.get(0); long n = 0L; DataBag selected = bagFactory.newDefaultBag(); DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator()); for (Tuple innerTuple : bag) { n += (Long) innerTuple.get(0); selected.addAll((DataBag) innerTuple.get(1)); waiting.addAll((DataBag) innerTuple.get(2)); } long sampleSize = (long) Math.ceil(_samplingProbability * n); long nNeeded = sampleSize - selected.size(); for (Tuple scored : waiting) { if (nNeeded <= 0) { break; } selected.add(ScoredTuple.fromIntermediateTuple(scored).getTuple()); nNeeded--; } return selected; }
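A small worked example of the top-up arithmetic in the final pass above, with illustrative numbers that are not taken from any snippet in this section: the target sample size is ceil(p * n), tuples already accepted outright are counted first, and only the shortfall is drawn from the front of the sorted waiting bag of scored candidates.

public class SampleSizeSketch {
  public static void main(String[] args) {
    // Illustrative values only.
    long n = 100000L;                    // total tuples counted across all partitions
    double samplingProbability = 0.001;  // p
    long alreadySelected = 92L;          // tuples accepted outright in the first pass
    long sampleSize = (long) Math.ceil(samplingProbability * n); // 100
    long nNeeded = sampleSize - alreadySelected;                 // 8 more come from the waiting bag
    System.out.println(sampleSize + " target, " + nNeeded + " still needed");
  }
}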
// pig 1048 public void testSkewedJoinOneValue() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE3 + "' as (id,name);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE3 + "' as (id,name);"); // Filter key with a single value pigServer.registerQuery("C = FILTER A by id == 400;"); pigServer.registerQuery("D = FILTER B by id == 400;"); DataBag dbfrj = BagFactory.getInstance().newDefaultBag(), dbrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("E = join C by id, D by id using \"skewed\";"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join C by id, D by id;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbrj.add(iter.next()); } } Assert.assertEquals(dbfrj.size(), dbrj.size()); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj)); }
public void testSkewedJoinWithGroup() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);"); pigServer.registerQuery("C = GROUP A by id;"); pigServer.registerQuery("D = GROUP B by id;"); DataBag dbfrj = BagFactory.getInstance().newDefaultBag(), dbshj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("E = join C by group, D by group using \"skewed\" parallel 5;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join C by group, D by group;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbshj.add(iter.next()); } } Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj)); }
public void testSkewedJoinWithNoProperties() throws IOException { pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); DataBag dbshj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery( "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;"); Iterator<Tuple> iter = pigServer.openIterator("C"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join A by(id, name), B by (id, name);"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbshj.add(iter.next()); } } Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj)); } catch (Exception e) { fail(e.getMessage()); } }
public void testSkewedJoinManyReducers() throws IOException { pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "2"); pigServer.registerQuery("A = LOAD '" + INPUT_FILE6 + "' as (id,name);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE7 + "' as (id,name);"); DataBag dbfrj = BagFactory.getInstance().newDefaultBag(), dbrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 300;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join A by id, B by id;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbrj.add(iter.next()); } } Assert.assertEquals(dbfrj.size(), dbrj.size()); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj)); }
@Override public Tuple exec(Tuple input) throws IOException { Tuple output = tupleFactory.newTuple(); DataBag selected = bagFactory.newDefaultBag(); DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator()); DataBag items = (DataBag) input.get(0); if (items != null) { long n = items.size(); double q1 = getQ1(n, _samplingProbability); double q2 = getQ2(n, _samplingProbability); for (Tuple item : items) { double key = _rdg.nextUniform(0.0d, 1.0d); if (key < q1) { selected.add(item); } else if (key < q2) { waiting.add(new ScoredTuple(key, item).getIntermediateTuple(tupleFactory)); } } output.append(n); output.append(selected); output.append(waiting); } return output; }
public DataBag exec(Tuple input) throws IOException { try { if (!input.isNull()) { // Create the output like a databag {(res1,res2),(res3,res4)..} DataBag output_databag = mBagFactory.newDefaultBag(); // Unpack tuple in order to get the bag {(1,2),(3,4),...} String input_time = (String) input.get(0); try { DateFormat formatter = new SimpleDateFormat("MM/dd/yyyy kk:mm:ss"); Date date = formatter.parse( String.format( "%s/%s/%s %s:%s:%s", input_time.substring(5, 7), input_time.substring(8, 10), input_time.substring(0, 4), input_time.substring(11, 13), input_time.substring(14, 16), input_time.substring(17, 18))); Calendar calendar = Calendar.getInstance(); calendar.setTime(date); int dayOfWeek = calendar.get(Calendar.DAY_OF_WEEK); int dayOfMonth = calendar.get(Calendar.DAY_OF_MONTH); int hour = calendar.get(Calendar.HOUR_OF_DAY); // Add items to output Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, String.format("%d:%d:%d", dayOfWeek, dayOfMonth, hour)); output_databag.add(items); } catch (Exception e) { Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #1" + e.getMessage()); output_databag.add(items); return output_databag; } return output_databag; } else { DataBag output_databag = mBagFactory.newDefaultBag(); Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #2"); output_databag.add(items); return output_databag; } } catch (Exception e) { System.err.println("Error with ?? .."); DataBag output_databag = mBagFactory.newDefaultBag(); Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #3" + e.getMessage()); output_databag.add(items); return output_databag; } }
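The substring offsets in the UDF above imply an ISO-style input such as "2013-06-01 18:30:45"; under that assumption, a minimal sketch (not the UDF itself) can parse the timestamp in one step instead of rearranging substrings into MM/dd/yyyy order.

import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

public class DayHourSketch {
  public static void main(String[] args) throws Exception {
    // Assumed input format: "yyyy-MM-dd HH:mm:ss" (illustrative value below).
    Date date = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse("2013-06-01 18:30:45");
    Calendar calendar = Calendar.getInstance();
    calendar.setTime(date);
    int dayOfWeek = calendar.get(Calendar.DAY_OF_WEEK);    // 1 = Sunday ... 7 = Saturday
    int dayOfMonth = calendar.get(Calendar.DAY_OF_MONTH);  // 1
    int hour = calendar.get(Calendar.HOUR_OF_DAY);         // 18
    System.out.println(String.format("%d:%d:%d", dayOfWeek, dayOfMonth, hour));
  }
}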
@SuppressWarnings("rawtypes") @Override public void prepareToRead(RecordReader reader, PigSplit split) throws IOException { this.reader = (WikipediaPageInputFormat.WikipediaRecordReader) reader; tupleFactory = TupleFactory.getInstance(); bagFactory = BagFactory.getInstance(); }
/** * StanfordTokenize uses the Stanford CoreNLP PTBTokenizer to tokenize a raw text input. Output is a pig bag containing tokens. * <dl> * <dt><b>Example:</b></dt> * <dd><code> * register varaha.jar;<br/> * documents = LOAD 'documents' AS (doc_id:chararray, text:chararray);<br/> * tokenized = FOREACH documents GENERATE doc_id AS doc_id, FLATTEN(StanfordTokenize(text)) AS (token:chararray); * </code></dd> * </dl> * * @author Russell Jurney */ public class StanfordTokenize extends EvalFunc<DataBag> { private static TupleFactory tupleFactory = TupleFactory.getInstance(); private static BagFactory bagFactory = BagFactory.getInstance(); public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; // Output bag DataBag bagOfTokens = bagFactory.newDefaultBag(); StringReader textInput = new StringReader(input.get(0).toString()); PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext(); ) { label = (CoreLabel) ptbt.next(); if (label.value().length() > 2) { System.err.println(label.toString()); Tuple termText = tupleFactory.newTuple(label.word()); bagOfTokens.add(termText); } } return bagOfTokens; } }
@Override public Tuple exec(Tuple input) throws IOException { DataBag output = BagFactory.getInstance().newDefaultBag(); DataBag samples = (DataBag) input.get(0); if (samples == null) { // do nothing } else if (samples.size() <= numSamples) { // no need to construct a reservoir, so just emit intermediate tuples for (Tuple sample : samples) { // add the score on to the intermediate tuple output.add(new ScoredTuple(Math.random(), sample).getIntermediateTuple(tupleFactory)); } } else { for (Tuple sample : samples) { getReservoir().consider(new ScoredTuple(Math.random(), sample)); } for (ScoredTuple scoredTuple : getReservoir()) { // add the score on to the intermediate tuple output.add(scoredTuple.getIntermediateTuple(tupleFactory)); } } return tupleFactory.newTuple(output); }
public class UnigramExtractor extends EvalFunc<DataBag> { private static BagFactory bagFactory = BagFactory.getInstance(); private static TupleFactory tupleFactory = TupleFactory.getInstance(); private static final Pattern spacePattern = Pattern.compile("\\s+"); private static final Pattern punctPattern = Pattern.compile("\\p{Punct}(?:(?<!\\d)(?!\\d))"); public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() == 0) { return null; } String normStr = ((String) input.get(0)); if (normStr == null) { return null; } // Remove punctuation except when it's a version number normStr = punctPattern.matcher(normStr.trim().toLowerCase()).replaceAll(" "); normStr = spacePattern.matcher(normStr).replaceAll(" "); DataBag output = bagFactory.newDefaultBag(); for (String s : spacePattern.split(normStr.trim())) { if (s.length() <= 30) { Tuple t = tupleFactory.newTuple(1); t.set(0, s); output.add(t); } } return output; } }
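A minimal sketch of exercising UnigramExtractor directly, with an illustrative input string: punctuation is stripped unless it is immediately followed by a digit, so version-like numbers survive as single tokens.

import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class UnigramExtractorSketch {
  public static void main(String[] args) throws Exception {
    Tuple in = TupleFactory.getInstance().newTuple(1);
    in.set(0, "Hello, World! Pig 0.12.0 rocks.");
    DataBag tokens = new UnigramExtractor().exec(in);
    // tokens now holds {(hello),(world),(pig),(0.12.0),(rocks)}
    for (Tuple t : tokens) {
      System.out.println(t.get(0));
    }
  }
}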
@Override public DataBag getValue() { DataBag output = BagFactory.getInstance().newDefaultBag(); for (ScoredTuple sample : getReservoir()) { output.add(sample.getTuple()); } return output; }
/** * create bag having given number of tuples * * @param size * @return */ private DataBag createBag(int size) { Tuple innerTuple = TupleFactory.getInstance().newTuple(); innerTuple.append(Integer.valueOf(1)); DataBag bag = BagFactory.getInstance().newDefaultBag(); for (int i = 0; i < size; i++) { bag.add(innerTuple); } return bag; }
@SuppressWarnings("unchecked") @Override public DataBag exec(Tuple input) throws IOException { if (input.size() < 2) { throw new RuntimeException("Expected at least two inputs, but found " + input.size()); } for (Object o : input) { if (o != null && !(o instanceof DataBag)) { throw new RuntimeException("Inputs must be bags"); } } DataBag outputBag = bagFactory.newDefaultBag(); DataBag bag1 = (DataBag) input.get(0); DataBag bag2 = (DataBag) input.get(1); if (bag1 == null || bag1.size() == 0) { return outputBag; } // optimization else if (input.size() == 2 && (bag2 == null || bag2.size() == 0)) { return bag1; } PriorityQueue<Pair> pq = loadBags(input); Tuple lastData = null; while (true) { Pair nextPair = pq.peek(); // ignore data we've already encountered if (nextPair.data.compareTo(lastData) != 0) { // Only take data from the first bag, where there are no other // bags that have the same data. if (nextPair.index.equals(0) && countMatches(pq) == 0) { outputBag.add(nextPair.data); lastData = nextPair.data; } } Pair p = pq.poll(); // only put the bag back into the queue if it still has data if (p.hasNext()) { p.next(); pq.offer(p); } else if (p.index.equals(0)) { // stop when we exhaust all elements from the first bag break; } } return outputBag; }
@Override public void setInput(Map<String, Block> input, JsonNode operatorJson, BlockProperties props) throws IOException, InterruptedException { inputBlock = input.values().iterator().next(); init(operatorJson, inputBlock.getProperties().getSchema()); nullBag = BagFactory.getInstance().newDefaultBag(); nullBag.add(TupleFactory.getInstance().newTuple(0)); }
@Override public DataBag exec(Tuple input) throws ExecException, FrontendException { if (input == null) { return null; } Iterable<Pair<Integer, Double>> vector = elgen.apply(input); List<Tuple> result = new ArrayList<Tuple>(N); for (Pair<Integer, Double> el : ordering.greatestOf(vector, N)) { result.add(tfac.newTuple(el.getFirst())); } return bfac.newDefaultBag(result); }
@Override public DataBag exec(Tuple input) throws IOException { retrieveContextValues(); ArrayList<String> joinKeyNames = new ArrayList<String>(); for (int i = 1; i < input.size(); i += 2) { joinKeyNames.add((String) input.get(i)); } JoinCollector collector = new JoinCollector(); // the first bag is the outer bag String leftBagName = bagNames.get(0); DataBag leftBag = getBag(input, leftBagName); String leftBagJoinKeyName = getPrefixedAliasName(bagNameToJoinKeyPrefix.get(leftBagName), joinKeyNames.get(0)); collector.setJoinData(collector.groupTuples(leftBag, leftBagJoinKeyName)); // now, for each additional bag, group up the tuples by the join key, then join them in if (bagNames.size() > 1) { for (int i = 1; i < bagNames.size(); i++) { String bagName = bagNames.get(i); DataBag bag = getBag(input, bagName); String joinKeyName = getPrefixedAliasName(bagNameToJoinKeyPrefix.get(bagName), joinKeyNames.get(i)); int tupleSize = bagNameToSize.get(bagName); if (bag == null) throw new IOException( "Error in instance: " + getInstanceName() + " with properties: " + getInstanceProperties() + " and tuple: " + input.toDelimitedString(", ") + " -- Expected bag, got null"); HashMap<Object, List<Tuple>> groupedData = collector.groupTuples(bag, joinKeyName); // outer join, so go back in and add nulls; groupedData = collector.insertNullTuples(groupedData, tupleSize); for (Map.Entry<Object, List<Tuple>> entry : groupedData.entrySet()) { collector.joinTuples(entry.getKey(), entry.getValue()); } } } // assemble output bag DataBag outputBag = BagFactory.getInstance().newDefaultBag(); for (List<Tuple> tuples : collector.getJoinData().values()) { for (Tuple tuple : tuples) { outputBag.add(tuple); } } return outputBag; }
public void testSkewedJoinReducers() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("C = join A by id, B by id using \"skewed\" parallel 1;"); Iterator<Tuple> iter = pigServer.openIterator("C"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } } catch (Exception e) { fail("Should not throw exception, should continue execution"); } }
@Override public DataBag exec(Tuple input) throws IOException { DataBag bagOfSamples = (DataBag) input.get(0); for (Tuple innerTuple : bagOfSamples) { DataBag samples = (DataBag) innerTuple.get(0); for (Tuple sample : samples) { // use the same score as previously generated getReservoir().consider(ScoredTuple.fromIntermediateTuple(sample)); } } DataBag output = BagFactory.getInstance().newDefaultBag(); for (ScoredTuple scoredTuple : getReservoir()) { // output the original tuple output.add(scoredTuple.getTuple()); } return output; }
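The sampling stages above all rely on attaching a uniform random score to each tuple and keeping the best-scored ones; a self-contained sketch of that idea, with an assumed sample size and plain arrays instead of the actual reservoir class, is below.

import java.util.Comparator;
import java.util.PriorityQueue;
import java.util.Random;

public class ScoredReservoirSketch {
  public static void main(String[] args) {
    int k = 3; // assumed sample size, not taken from the classes above
    Random rng = new Random();
    // min-heap keyed on score, so the smallest surviving score is always on top
    PriorityQueue<double[]> reservoir =
        new PriorityQueue<double[]>(k, new Comparator<double[]>() {
          public int compare(double[] a, double[] b) { return Double.compare(a[0], b[0]); }
        });
    for (int item = 0; item < 100; item++) {
      double score = rng.nextDouble();
      if (reservoir.size() < k) {
        reservoir.add(new double[] {score, item});
      } else if (score > reservoir.peek()[0]) {
        reservoir.poll();                         // evict the lowest-scored survivor
        reservoir.add(new double[] {score, item});
      }
    }
    for (double[] kept : reservoir) {
      System.out.println((int) kept[1]); // three items drawn uniformly from 0..99
    }
  }
}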
public void testSkewedJoin3Way() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);"); pigServer.registerQuery("C = LOAD '" + INPUT_FILE3 + "' as (id, name);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("D = join A by id, B by id, C by id using \"skewed\" parallel 5;"); Iterator<Tuple> iter = pigServer.openIterator("D"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } } catch (Exception e) { return; } fail("Should throw exception, do not support 3 way join"); }
public void testSkewedJoinNullKeys() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("C = join A by id, B by id using \"skewed\";"); Iterator<Tuple> iter = pigServer.openIterator("C"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } } catch (Exception e) { System.out.println(e.getMessage()); e.printStackTrace(); fail("Should support null keys in skewed join"); } return; }
@SuppressWarnings("unchecked") private void accumulateData() throws ExecException { int count = 0; int length = inputs.size() - 1; inputBags = new DataBag[length]; its = new Iterator[length]; for (int i = 0; i < length; ++i) { PhysicalOperator op = inputs.get(i); DataBag bag = BagFactory.getInstance().newDefaultBag(); inputBags[count] = bag; for (Result res = op.getNextTuple(); res.returnStatus != POStatus.STATUS_EOP; res = op.getNextTuple()) { if (res.returnStatus == POStatus.STATUS_NULL) continue; if (res.returnStatus == POStatus.STATUS_ERR) throw new ExecException("Error accumulating data in the local Cross operator"); if (res.returnStatus == POStatus.STATUS_OK) bag.add((Tuple) res.result); } its[count++] = bag.iterator(); } }
public void testSkewedJoinMapKey() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE4 + "' as (m:[]);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE4 + "' as (n:[]);"); try { DataBag dbfrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery( "C = join A by (chararray)m#'a100', B by (chararray)n#'a100' using \"skewed\" parallel 20;"); Iterator<Tuple> iter = pigServer.openIterator("C"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } } catch (Exception e) { System.out.println(e.getMessage()); e.printStackTrace(); fail("Should support maps and expression operators as keys"); } return; }
@Test public void testTupleWriteRead1() throws IOException { // create a tuple with columns of different type Tuple tuplein = TupleFactory.getInstance().newTuple(7); tuplein.set(0, 12); Map<String, String> map = new HashMap<String, String>(); map.put("pig", "scalability"); tuplein.set(1, map); tuplein.set(2, null); tuplein.set(3, 12L); tuplein.set(4, 1.2F); Tuple innerTuple = TupleFactory.getInstance().newTuple(1); innerTuple.set(0, "innerTuple"); tuplein.set(5, innerTuple); DataBag bag = BagFactory.getInstance().newDefaultBag(); bag.add(innerTuple); tuplein.set(6, bag); testTupleSedes(tuplein); assertEquals( "(12,[pig#scalability],,12,1.2,(innerTuple),{(innerTuple)})", TupleFormat.format(tuplein)); }
public DataBag exec(Tuple input) throws IOException { try { DataBag bag = BagFactory.getInstance().newDefaultBag(); for (int i = 0; i < input.size(); i++) { final Object object = input.get(i); if (object instanceof Tuple) { for (int j = 0; j < ((Tuple) object).size(); j++) { Tuple tp2 = TupleFactory.getInstance().newTuple(1); tp2.set(0, ((Tuple) object).get(j)); bag.add(tp2); } } else { Tuple tp2 = TupleFactory.getInstance().newTuple(1); tp2.set(0, object); bag.add(tp2); } } return bag; } catch (Exception ee) { throw new RuntimeException("Error while creating a bag", ee); } }
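A minimal sketch of what the method above produces, with illustrative data; the snippet is shown without its class name, so FieldsToBag below is a hypothetical stand-in. Every field of the input, with nested tuples flattened one level, becomes a single-field tuple in the output bag.

import java.util.ArrayList;
import java.util.List;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class FieldsToBagSketch {
  public static void main(String[] args) throws Exception {
    TupleFactory tf = TupleFactory.getInstance();
    List<Object> innerFields = new ArrayList<Object>();
    innerFields.add(3);
    innerFields.add(4);
    Tuple inner = tf.newTupleNoCopy(innerFields);
    List<Object> fields = new ArrayList<Object>();
    fields.add(1);
    fields.add(2);
    fields.add(inner);
    Tuple input = tf.newTupleNoCopy(fields);
    // FieldsToBag is a hypothetical name for the UDF whose exec method is shown above.
    DataBag bag = new FieldsToBag().exec(input);
    System.out.println(bag); // {(1),(2),(3),(4)}
  }
}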
public class CalcClassWeight extends EvalFunc<DataBag> { TupleFactory tupleFactory = TupleFactory.getInstance(); BagFactory bagFactory = BagFactory.getInstance(); @Override public DataBag exec(Tuple input) throws IOException { DataBag bag = (DataBag) input.get(0); HashMap<String, Double> clsCnt = new HashMap<String, Double>(); Iterator<Tuple> it = bag.iterator(); Double sum = new Double(0.0); while (it.hasNext()) { Tuple item = (Tuple) it.next(); String cls = (String) item.get(3); if (cls != null && cls.length() > 0) { Double cur = clsCnt.get(cls); Double inc = (Double) item.get(2); if (cur != null) { clsCnt.put(cls, cur + inc); } else { clsCnt.put(cls, inc); } sum += inc; } } Set<Entry<String, Double>> clses = clsCnt.entrySet(); Iterator<Entry<String, Double>> cit = clses.iterator(); DataBag result = bagFactory.newDefaultBag(); while (cit.hasNext()) { Entry<String, Double> cls = cit.next(); Tuple tpl = tupleFactory.newTuple(2); tpl.set(0, cls.getKey()); tpl.set(1, cls.getValue() / sum); result.add(tpl); } return result; } @Override public Schema outputSchema(Schema input) { try { if (input.getFields().size() != 1 || input.getField(0).type != DataType.BAG) { throw new RuntimeException("expect input {bag}"); } Schema bag = input.getField(0).schema.getField(0).schema; if (bag.getFields().size() < 4 || bag.getField(0).type != DataType.CHARARRAY || bag.getField(1).type != DataType.CHARARRAY || bag.getField(2).type != DataType.DOUBLE || bag.getField(3).type != DataType.CHARARRAY) { throw new RuntimeException( "expect input {userid:chararray, " + "md:chararray, weight:double, cls:chararray}"); } Schema result = new Schema(); result.add(new FieldSchema("cls", DataType.CHARARRAY)); result.add(new FieldSchema("weight", DataType.DOUBLE)); return result; } catch (Exception e) { throw new RuntimeException(e); } } }
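A minimal sketch of feeding CalcClassWeight a bag whose inner tuples match the schema its outputSchema method checks for, (userid:chararray, md:chararray, weight:double, cls:chararray); the values are illustrative.

import java.util.ArrayList;
import java.util.List;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class CalcClassWeightSketch {
  private static Tuple row(String userid, String md, double weight, String cls) {
    List<Object> fields = new ArrayList<Object>();
    fields.add(userid);
    fields.add(md);
    fields.add(weight);
    fields.add(cls);
    return TupleFactory.getInstance().newTupleNoCopy(fields);
  }

  public static void main(String[] args) throws Exception {
    DataBag bag = BagFactory.getInstance().newDefaultBag();
    bag.add(row("u1", "m1", 2.0, "news"));
    bag.add(row("u1", "m2", 1.0, "sports"));
    bag.add(row("u1", "m3", 1.0, "news"));
    Tuple input = TupleFactory.getInstance().newTuple(bag);
    DataBag weights = new CalcClassWeight().exec(input);
    // Each class's summed weight divided by the overall sum, e.g. {(news,0.75),(sports,0.25)}.
    System.out.println(weights);
  }
}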
/** * Computes the set difference of two or more bags. Duplicates are eliminated. <b>The input bags * must be sorted.</b> * * <p>If bags A and B are provided, then this computes A-B, i.e. all elements in A that are not in * B. If bags A, B and C are provided, then this computes A-B-C, i.e. all elements in A that are not * in B or C. * * <p>Example: * * <pre>{@code * define SetDifference datafu.pig.sets.SetDifference(); * * -- input: * -- ({(1),(2),(3),(4),(5),(6)},{(3),(4)}) * input = LOAD 'input' AS (B1:bag{T:tuple(val:int)},B2:bag{T:tuple(val:int)}); * * input = FOREACH input { * B1 = ORDER B1 BY val ASC; * B2 = ORDER B2 BY val ASC; * * -- output: * -- ({(1),(2),(5),(6)}) * GENERATE SetDifference(B1,B2); * } * }</pre> */ public class SetDifference extends SetOperationsBase { private static final BagFactory bagFactory = BagFactory.getInstance(); /** * Loads the data bags from the input tuple and puts them in a priority queue, where ordering is * determined by the data from the iterator for each bag. * * <p>The bags are wrapped in a {@link Pair} object that is comparable on the data currently * available from the iterator. These objects are ordered first by the data, then by the index * within the tuple the bag came from. * * @param input * @return priority queue ordered * @throws IOException */ private PriorityQueue<Pair> loadBags(Tuple input) throws IOException { PriorityQueue<Pair> pq = new PriorityQueue<Pair>(input.size()); for (int i = 0; i < input.size(); i++) { if (input.get(i) != null) { Iterator<Tuple> inputIterator = ((DataBag) input.get(i)).iterator(); if (inputIterator.hasNext()) { pq.add(new Pair(inputIterator, i)); } } } return pq; } /** * Counts how many elements in the priority queue match the element at the front of the queue, * which should be from the first bag. * * @param pq priority queue * @return number of matches */ public int countMatches(PriorityQueue<Pair> pq) { Pair nextPair = pq.peek(); Tuple data = nextPair.data; // sanity check if (!nextPair.index.equals(0)) { throw new RuntimeException("Expected next bag to have index 0"); } int matches = 0; for (Pair p : pq) { if (data.equals(p.data)) matches++; } // subtract 1 since element matches itself return matches - 1; } @SuppressWarnings("unchecked") @Override public DataBag exec(Tuple input) throws IOException { if (input.size() < 2) { throw new RuntimeException("Expected at least two inputs, but found " + input.size()); } for (Object o : input) { if (o != null && !(o instanceof DataBag)) { throw new RuntimeException("Inputs must be bags"); } } DataBag outputBag = bagFactory.newDefaultBag(); DataBag bag1 = (DataBag) input.get(0); DataBag bag2 = (DataBag) input.get(1); if (bag1 == null || bag1.size() == 0) { return outputBag; } // optimization else if (input.size() == 2 && (bag2 == null || bag2.size() == 0)) { return bag1; } PriorityQueue<Pair> pq = loadBags(input); Tuple lastData = null; while (true) { Pair nextPair = pq.peek(); // ignore data we've already encountered if (nextPair.data.compareTo(lastData) != 0) { // Only take data from the first bag, where there are no other // bags that have the same data. 
if (nextPair.index.equals(0) && countMatches(pq) == 0) { outputBag.add(nextPair.data); lastData = nextPair.data; } } Pair p = pq.poll(); // only put the bag back into the queue if it still has data if (p.hasNext()) { p.next(); pq.offer(p); } else if (p.index.equals(0)) { // stop when we exhaust all elements from the first bag break; } } return outputBag; } /** * A wrapper for the tuple iterator that implements comparable so it can be used in the priority * queue. * * <p>This is compared first on the data, then on the index the bag came from in the input tuple. */ private static class Pair implements Comparable<Pair> { private final Iterator<Tuple> it; private final Integer index; private Tuple data; /** * Constructs the {@link Pair}. * * @param it tuple iterator * @param index index within the tuple that the bag came from */ public Pair(Iterator<Tuple> it, int index) { this.index = index; this.it = it; this.data = it.next(); } @SuppressWarnings("unchecked") @Override public int compareTo(Pair o) { int r = this.data.compareTo(o.data); if (r == 0) { return index.compareTo(o.index); } else { return r; } } public boolean hasNext() { return it.hasNext(); } @SuppressWarnings("unchecked") public Tuple next() { Tuple nextData = it.next(); // algorithm assumes data is in order if (data.compareTo(nextData) > 0) { throw new RuntimeException("Out of order!"); } this.data = nextData; return this.data; } @Override public String toString() { return String.format("[%s within %d]", data, index); } } }
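A minimal sketch of calling the SetDifference UDF above directly, with illustrative values; both bags are built in ascending order because, as the javadoc states, the inputs must already be sorted.

import java.util.ArrayList;
import java.util.List;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class SetDifferenceSketch {
  private static DataBag bagOf(int... values) {
    DataBag bag = BagFactory.getInstance().newDefaultBag();
    for (int v : values) {
      bag.add(TupleFactory.getInstance().newTuple((Object) Integer.valueOf(v)));
    }
    return bag;
  }

  public static void main(String[] args) throws Exception {
    DataBag b1 = bagOf(1, 2, 3, 4, 5, 6);
    DataBag b2 = bagOf(3, 4);
    List<Object> fields = new ArrayList<Object>();
    fields.add(b1);
    fields.add(b2);
    Tuple input = TupleFactory.getInstance().newTupleNoCopy(fields);
    DataBag difference = new SetDifference().exec(input);
    System.out.println(difference); // {(1),(2),(5),(6)}, matching the Pig example in the javadoc
  }
}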
public class TestScalarAliasesLocal { private static final String BUILD_TEST_TMP = "build/test/tmp/"; private PigServer pigServer; TupleFactory mTf = TupleFactory.getInstance(); BagFactory mBf = BagFactory.getInstance(); @Before public void setUp() throws Exception { pigServer = new PigServer(Util.getLocalTestMode()); } public static void deleteDirectory(File file) { if (file.exists()) { Util.deleteDirectory(file); } } public static File createLocalInputFile(String filename, String[] inputData) throws IOException { new File(filename).getParentFile().mkdirs(); return Util.createLocalInputFile(filename, inputData); } // See PIG-1434 @Test public void testScalarAliasesBatchNobatch() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20"}; String output = BUILD_TEST_TMP + "table_testScalarAliasesDir"; TestScalarAliases.deleteDirectory(new File(output)); // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesBatch"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.setBatchOn(); pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;"); pigServer.registerQuery("Y = foreach A generate (a0 * C.count), (a1 / C.max);"); pigServer.registerQuery("Store Y into '" + output + "';"); pigServer.executeBatch(); // Check output pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: int, a1: double);"); Iterator<Tuple> iter; Tuple t; iter = pigServer.openIterator("Z"); t = iter.next(); assertTrue(t.toString().equals("(3,0.25)")); t = iter.next(); assertTrue(t.toString().equals("(6,0.5)")); t = iter.next(); assertTrue(t.toString().equals("(9,1.0)")); assertFalse(iter.hasNext()); iter = pigServer.openIterator("Y"); t = iter.next(); assertTrue(t.toString().equals("(3,0.25)")); t = iter.next(); assertTrue(t.toString().equals("(6,0.5)")); t = iter.next(); assertTrue(t.toString().equals("(9,1.0)")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testUseScalarMultipleTimes() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20"}; String outputY = BUILD_TEST_TMP + "table_testUseScalarMultipleTimesOutY"; TestScalarAliases.deleteDirectory(new File(outputY)); String outputZ = BUILD_TEST_TMP + "table_testUseScalarMultipleTimesOutZ"; TestScalarAliases.deleteDirectory(new File(outputZ)); // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testUseScalarMultipleTimes"; TestScalarAliases.createLocalInputFile(inputPath, input); pigServer.setBatchOn(); pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;"); pigServer.registerQuery("Y = foreach A generate (a0 * C.count), (a1 / C.max);"); pigServer.registerQuery("Store Y into '" + outputY + "';"); pigServer.registerQuery("Z = foreach A generate (a1 + C.count), (a0 * C.max);"); pigServer.registerQuery("Store Z into '" + outputZ + "';"); // Test Multiquery store pigServer.executeBatch(); // Check output pigServer.registerQuery("M = LOAD '" + outputY + "' as (a0: int, a1: double);"); Iterator<Tuple> iter; Tuple t; iter = pigServer.openIterator("M"); t = iter.next(); assertTrue(t.toString().equals("(3,0.25)")); t = iter.next(); assertTrue(t.toString().equals("(6,0.5)")); t = iter.next(); 
assertTrue(t.toString().equals("(9,1.0)")); assertFalse(iter.hasNext()); // Check output pigServer.registerQuery("N = LOAD '" + outputZ + "' as (a0: double, a1: double);"); iter = pigServer.openIterator("N"); t = iter.next(); assertTrue(t.toString().equals("(8.0,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(13.0,40.0)")); t = iter.next(); assertTrue(t.toString().equals("(23.0,60.0)")); assertFalse(iter.hasNext()); // Non batch mode iter = pigServer.openIterator("Y"); t = iter.next(); assertTrue(t.toString().equals("(3,0.25)")); t = iter.next(); assertTrue(t.toString().equals("(6,0.5)")); t = iter.next(); assertTrue(t.toString().equals("(9,1.0)")); assertFalse(iter.hasNext()); // Check in non-batch mode iter = pigServer.openIterator("Z"); t = iter.next(); assertTrue(t.toString().equals("(8.0,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(13.0,40.0)")); t = iter.next(); assertTrue(t.toString().equals("(23.0,60.0)")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarWithNoSchema() throws Exception { String[] scalarInput = {"1\t5"}; String[] input = {"1\t5", "2\t10", "3\t20"}; String inputPath = BUILD_TEST_TMP + "table_testScalarWithNoSchema"; TestScalarAliases.createLocalInputFile(inputPath, input); String inputPathScalar = BUILD_TEST_TMP + "table_testScalarWithNoSchemaScalar"; TestScalarAliases.createLocalInputFile(inputPathScalar, scalarInput); // Load A as a scalar pigServer.registerQuery("A = LOAD '" + inputPath + "';"); pigServer.registerQuery("scalar = LOAD '" + inputPathScalar + "' as (count, total);"); pigServer.registerQuery("B = foreach A generate 5 / scalar.total;"); Iterator<Tuple> iter = pigServer.openIterator("B"); Tuple t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarWithTwoBranches() throws Exception { String[] inputA = {"1\t5", "2\t10", "3\t20"}; String[] inputX = {"pig", "hadoop", "rocks"}; String output = BUILD_TEST_TMP + "testScalarWithTwoBranchesDir"; TestScalarAliases.deleteDirectory(new File(output)); // Test the use of scalars in expressions String inputPathA = BUILD_TEST_TMP + "testScalarWithTwoBranchesA"; TestScalarAliases.createLocalInputFile(inputPathA, inputA); String inputPathX = BUILD_TEST_TMP + "testScalarWithTwoBranchesX"; TestScalarAliases.createLocalInputFile(inputPathX, inputX); // Test in script mode pigServer.setBatchOn(); pigServer.registerQuery("A = LOAD '" + inputPathA + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;"); pigServer.registerQuery("X = LOAD '" + inputPathX + "' as (names: chararray);"); pigServer.registerQuery("Y = foreach X generate names, C.max;"); pigServer.registerQuery("Store Y into '" + output + "';"); pigServer.executeBatch(); // Check output pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: chararray, a1: double);"); Iterator<Tuple> iter = pigServer.openIterator("Z"); Tuple t = iter.next(); assertTrue(t.toString().equals("(pig,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(hadoop,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(rocks,20.0)")); assertFalse(iter.hasNext()); // Check in non-batch mode iter = pigServer.openIterator("Y"); t = iter.next(); assertTrue(t.toString().equals("(pig,20.0)")); t = iter.next(); 
assertTrue(t.toString().equals("(hadoop,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(rocks,20.0)")); assertFalse(iter.hasNext()); pigServer.getPigContext().getProperties().remove("tez.am.inline.task.execution.max-tasks"); } // See PIG-1434 @Test public void testFilteredScalarDollarProj() throws Exception { String output = BUILD_TEST_TMP + "table_testFilteredScalarDollarProjDir"; TestScalarAliases.deleteDirectory(new File(output)); String[] input = { "1\t5\t[state#maine,city#portland]\t{(a),(b)}\t(a,b)", "2\t10\t\t\t", "3\t20\t\t\t" }; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testFilteredScalarDollarProj"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.setBatchOn(); pigServer.registerQuery( "A = LOAD '" + inputPath + "'" + " as (a0: long, a1: double, a2 : bytearray, " + "a3: bag{ t : tuple(tc : chararray)}, " + "a4: tuple(c1 : chararray, c2 : chararray) );"); pigServer.registerQuery("B = filter A by $1 < 8;"); pigServer.registerQuery( "Y = foreach A generate (a0 * B.$0), (a1 / B.$1), B.$2, B.$2#'state', B.$3, B.a4;"); pigServer.registerQuery("Store Y into '" + output + "';"); pigServer.explain("Y", System.err); pigServer.executeBatch(); // Check output pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: int, a1: double);"); pigServer.explain("Z", System.err); Iterator<Tuple> iter = pigServer.openIterator("Z"); Tuple t = iter.next(); assertTrue(t.toString().equals("(1,1.0)")); t = iter.next(); assertTrue(t.toString().equals("(2,2.0)")); t = iter.next(); assertTrue(t.toString().equals("(3,4.0)")); assertFalse(iter.hasNext()); // Check in non-batch mode iter = pigServer.openIterator("Y"); t = iter.next(); assertEquals(t.toString(), "(1,1.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))"); t = iter.next(); assertEquals(t.toString(), "(2,2.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))"); t = iter.next(); assertEquals(t.toString(), "(3,4.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))"); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarWithNoSchemaDollarProj() throws Exception { String[] scalarInput = {"1\t5"}; String[] input = {"1\t5", "2\t10", "3\t20"}; String inputPath = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProj"; TestScalarAliases.createLocalInputFile(inputPath, input); String inputPathScalar = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProjScalar"; TestScalarAliases.createLocalInputFile(inputPathScalar, scalarInput); // Load A as a scalar pigServer.registerQuery("A = LOAD '" + inputPath + "';"); pigServer.registerQuery("scalar = LOAD '" + inputPathScalar + "';"); pigServer.registerQuery("B = foreach A generate 5 / scalar.$1;"); Iterator<Tuple> iter = pigServer.openIterator("B"); Tuple t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarAliasesJoinClause() throws Exception { String[] inputA = {"1\t5", "2\t10", "3\t20"}; String[] inputB = {"Total3\tthree", "Total2\ttwo", "Total1\tone"}; // Test the use of scalars in expressions String inputPathA = BUILD_TEST_TMP + "table_testScalarAliasesJoinClauseA"; TestScalarAliases.createLocalInputFile(inputPathA, inputA); String inputPathB = BUILD_TEST_TMP + "table_testScalarAliasesJoinClauseB"; TestScalarAliases.createLocalInputFile(inputPathB, inputB); // Test in script mode 
pigServer.registerQuery("A = LOAD '" + inputPathA + "' as (a0, a1);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate COUNT(A) as count;"); pigServer.registerQuery("B = LOAD '" + inputPathB + "' as (b0:chararray, b1:chararray);"); pigServer.registerQuery("Y = join A by CONCAT('Total', (chararray)C.count), B by $0;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); String[] expected = new String[] {"(1,5,Total3,three)", "(2,10,Total3,three)", "(3,20,Total3,three)"}; Util.checkQueryOutputsAfterSortRecursive( iter, expected, org.apache.pig.newplan.logical.Util.translateSchema(pigServer.dumpSchema("Y"))); } // See PIG-1434 @Test public void testScalarAliasesFilterClause() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20", "4\t12", "5\t8"}; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesFilterClause"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0, a1);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate AVG(A.$1) as average;"); pigServer.registerQuery("Y = filter A by a1 > C.average;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); // Average is 11 Tuple t = iter.next(); assertTrue(t.toString().equals("(3,20)")); t = iter.next(); assertTrue(t.toString().equals("(4,12)")); assertFalse(iter.hasNext()); } // See PIG-1434 @Test public void testScalarAliasesGrammarNegative() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20"}; String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesGrammar"; TestScalarAliases.createLocalInputFile(inputPath, input); try { pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A);"); // Only projections of C are supported pigServer.registerQuery("Y = foreach A generate C;"); pigServer.openIterator("Y"); // Control should not reach here fail("Scalar projections are only supported"); } catch (IOException pe) { assertTrue(pe.getMessage().contains("Invalid scalar projection: C")); } } // See PIG-1636 @Test public void testScalarAliasesLimit() throws Exception { String[] input = {"a\t1", "b\t2", "c\t3", "a\t4", "c\t5"}; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesLimit"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0:chararray, a1: int);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate SUM(A.$1) as total;"); pigServer.registerQuery("C1 = limit C 1;"); pigServer.registerQuery("Y = foreach A generate a0, a1 * (double)C1.total;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); // Average is 11 Tuple t = iter.next(); assertTrue(t.toString().equals("(a,15.0)")); t = iter.next(); assertTrue(t.toString().equals("(b,30.0)")); t = iter.next(); assertTrue(t.toString().equals("(c,45.0)")); t = iter.next(); assertTrue(t.toString().equals("(a,60.0)")); t = iter.next(); assertTrue(t.toString().equals("(c,75.0)")); assertFalse(iter.hasNext()); } /** * Test that a specific string is included in the error message when an exception is thrown for * using a relation in a scalar context without projecting any columns out of it */ // See PIG-1788 @Test public void testScalarWithNoProjection() 
throws Exception { String query = " A = load 'table_testScalarWithNoProjection' as (x, y);" + " B = group A by x;" + // B is unintentionally being used as scalar, // the user intends it to be COUNT(A) " C = foreach B generate COUNT(B);"; Util.checkExceptionMessage( query, "C", "A column needs to be projected from a relation" + " for it to be used as a scalar"); } @Test public void testScalarNullValue() throws Exception { Storage.Data data = Storage.resetData(pigServer); data.set("input", Storage.tuple("a", 1), Storage.tuple("b", 2)); pigServer.setBatchOn(); pigServer.registerQuery("A = load 'input' using mock.Storage() as (a:chararray, b:int);"); pigServer.registerQuery("B = FILTER A by a == 'c';"); pigServer.registerQuery("C = FOREACH A generate a, b + B.b;"); pigServer.registerQuery("store C into 'output' using mock.Storage();"); pigServer.executeBatch(); List<Tuple> actualResults = data.get("output"); List<Tuple> expectedResults = Util.getTuplesFromConstantTupleStrings(new String[] {"('a', null)", "('b', null)"}); Util.checkQueryOutputsAfterSort(actualResults.iterator(), expectedResults); } }