/**
 * See PIG-2936. Checks that tuples of arity 0 through 9 are serialized in the specific way
 * that we expect.
 *
 * <p>For each arity the expected byte stream is assembled by hand: the matching
 * {@code BinInterSedes.TUPLE_n} flag byte followed by every field written through
 * {@code bis.writeDatum}. The tuple and those bytes are then passed to {@code testSerTuple}.
 */
@Test
public void testTupleSerializationSpecific() throws Exception {
  byte[] flags = {
    BinInterSedes.TUPLE_0,
    BinInterSedes.TUPLE_1,
    BinInterSedes.TUPLE_2,
    BinInterSedes.TUPLE_3,
    BinInterSedes.TUPLE_4,
    BinInterSedes.TUPLE_5,
    BinInterSedes.TUPLE_6,
    BinInterSedes.TUPLE_7,
    BinInterSedes.TUPLE_8,
    BinInterSedes.TUPLE_9,
  };

  // The array index doubles as the tuple arity under test.
  for (int arity = 0; arity < flags.length; arity++) {
    Tuple tuple = mTupleFactory.newTuple(arity);
    ByteArrayOutputStream expectedBytes = new ByteArrayOutputStream();
    DataOutput expected = new DataOutputStream(expectedBytes);

    expected.writeByte(flags[arity]);
    int field = 0;
    while (field < arity) {
      Integer value = Integer.valueOf(random.nextInt());
      bis.writeDatum(expected, value);
      tuple.set(field, value);
      field++;
    }

    testSerTuple(tuple, expectedBytes.toByteArray());
  }
}
/**
 * Computes an average from the tuple produced by {@code combine} over the input bag.
 *
 * <p>Field 0 of the combined tuple is read as the sum and field 1 as the count.
 *
 * @param input tuple whose first field is the bag handed to {@code combine}
 * @return sum divided by count, or {@code null} when the sum is null or the count is not
 *     greater than zero
 * @throws IOException an {@code ExecException} is rethrown as-is; any other failure is wrapped
 *     in an {@code ExecException} with error code 2106
 */
@Override
public Double exec(Tuple input) throws IOException {
  try {
    DataBag b = (DataBag) input.get(0);
    Tuple combined = combine(b);

    Long sum = (Long) combined.get(0);
    if (sum == null) {
      return null;
    }

    // Widening to double here makes the division below floating-point.
    double count = (Long) combined.get(1);
    if (count > 0) {
      // Double.valueOf replaces the deprecated new Double(...) boxing constructor.
      return Double.valueOf(sum / count);
    }
    return null;
  } catch (ExecException ee) {
    throw ee;
  } catch (Exception e) {
    int errCode = 2106;
    String msg = "Error while computing average in " + this.getClass().getSimpleName();
    throw new ExecException(msg, errCode, PigException.BUG, e);
  }
}
/**
 * Converts a chararray timestamp into a 14-digit number.
 *
 * <p>The characters {@code - + . ^ : ,} and space are stripped from the first field. The
 * remainder is truncated to its first 14 characters or right-padded with {@code '0'} up to 14
 * characters, then parsed as a long.
 *
 * @param input tuple whose first field must be of type chararray
 * @return the parsed value, or {@code null} when the tuple is null or empty
 * @throws IOException an {@code ExecException} is rethrown as-is; any other failure (including
 *     the wrong-type error raised below and number parsing errors) is wrapped in an
 *     {@code ExecException} with error code 2107
 */
@Override
public Long exec(Tuple input) throws IOException {
  try {
    if (input == null || input.size() == 0) {
      return null;
    }
    if (input.getType(0) != DataType.CHARARRAY) {
      // Caught by the generic handler below and re-thrown as an ExecException.
      throw new RuntimeException(
          "Input type expected to be chararray but got: " + input.getType(0));
    }

    String stripped = ((String) input.get(0)).replaceAll("[-+.^:, ]", "");
    int length = stripped.length();

    String fourteenChars;
    if (length > 14) {
      fourteenChars = stripped.substring(0, 14);
    } else if (length < 14) {
      // Left-justify in a 14-wide field, then turn the padding spaces into zeros.
      fourteenChars = String.format("%-14s", stripped).replace(' ', '0');
    } else {
      fourteenChars = stripped;
    }
    return Long.parseLong(fourteenChars);
  } catch (ExecException exp) {
    throw exp;
  } catch (Exception e) {
    int errCode = 2107;
    String msg = "Error while computing date_format in " + this.getClass().getSimpleName();
    throw new ExecException(msg, errCode, PigException.BUG, e);
  }
}
/**
 * Runs the nested tasks with the tuple stored in field {@code fieldNumber} of the current tuple
 * installed as the current tuple context.
 *
 * <p>The tuple context is reset in a {@code finally} block once the nested tasks finish or
 * fail.
 *
 * @throws BuildException if there is no current tuple, if the selected field is not a tuple, or
 *     if inspecting the field raises an {@code ExecException}
 */
@Override
protected void doHadoopWork() throws BuildException {
  final Tuple current = ContextManager.getCurrentTuple();
  if (current == null) {
    throw new BuildException(
        this.getTaskName()
            + " should be put inside task container which provides tuple to execution context");
  }

  try {
    // Both the declared field type and the runtime value must say "tuple".
    boolean holdsTuple =
        current.getType(fieldNumber) == DataType.TUPLE
            && current.get(fieldNumber) instanceof Tuple;
    if (!holdsTuple) {
      throw new BuildException("Tuple field " + fieldNumber + " doesn't represent a Tuple");
    }

    Tuple nested = (Tuple) current.get(fieldNumber);
    ContextManager.setCurrentTupleContext(nested);
    try {
      for (Task child : tasks) {
        child.perform();
      }
    } finally {
      ContextManager.resetCurrentTupleContext();
    }
  } catch (ExecException e) {
    throw new BuildException("Failed to check type of tuple field " + fieldNumber, e);
  }
}
/**
 * Builds a new tuple made of the fields of {@code t} at the given positions, in list order.
 *
 * @param t tuple to read from
 * @param keyFields positions of the fields to copy
 * @return a fresh {@code DefaultTuple} with one field per entry of {@code keyFields}
 * @throws ExecException if reading a field from {@code t} fails
 */
private static Tuple extractKeys(Tuple t, List<Integer> keyFields) throws ExecException {
  final Tuple projected = new DefaultTuple();
  for (Integer position : keyFields) {
    Object keyValue = t.get(position.intValue());
    projected.append(keyValue);
  }
  return projected;
}
@Override public String exec(Tuple input) throws IOException { // validate input if (input == null || input.size() == 0 || input.get(0) == null) { return null; } // get the value of input String strAddress = (String) input.get(0); // Get geoip information try { String result = this.geo.getCountryName(strAddress); // replace "--" and "N/A" to null, better for pig if (result == null || result.equals("--") || result.equals("N/A")) { return null; } else { return result; } } catch (Exception e) { // e.printStackTrace(); return null; } }
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() == 0) { return null; } String normStr = ((String) input.get(0)); if (normStr == null) { return null; } // Remove punctuation except when it's a version number normStr = punctPattern.matcher(normStr.trim().toLowerCase()).replaceAll(" "); normStr = spacePattern.matcher(normStr).replaceAll(" "); DataBag output = bagFactory.newDefaultBag(); for (String s : spacePattern.split(normStr.trim())) { if (s.length() <= 30) { Tuple t = tupleFactory.newTuple(1); t.set(0, s); output.add(t); } } return output; }
protected static Long sum(Tuple input) throws ExecException, IOException { DataBag values = (DataBag) input.get(0); // if we were handed an empty bag, return NULL if (values.size() == 0) { return null; } long sum = 0; boolean sawNonNull = false; for (Iterator<Tuple> it = values.iterator(); it.hasNext(); ) { Tuple t = it.next(); try { Long l = (Long) (t.get(0)); if (l == null) continue; sawNonNull = true; sum += l; } catch (RuntimeException exp) { int errCode = 2103; String msg = "Problem while computing sum of longs."; throw new ExecException(msg, errCode, PigException.BUG, exp); } } if (sawNonNull) { return Long.valueOf(sum); } else { return null; } }
/**
 * Splits the items of the input bag into a "selected" bag and a score-ordered "waiting" bag.
 *
 * <p>Each item draws one uniform key in [0, 1) from {@code _rdg}. Items whose key is below
 * {@code q1} go to the selected bag; items whose key is below {@code q2} are stored in the
 * waiting bag together with their key; all others are dropped.
 *
 * @param input tuple whose first field is the bag of items
 * @return a tuple of (item count, selected bag, waiting bag), or an empty tuple when the bag is
 *     null
 * @throws IOException if reading the input fails
 */
@Override
public Tuple exec(Tuple input) throws IOException {
  Tuple output = tupleFactory.newTuple();
  DataBag selected = bagFactory.newDefaultBag();
  DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());

  DataBag items = (DataBag) input.get(0);
  if (items == null) {
    return output;
  }

  long n = items.size();
  double q1 = getQ1(n, _samplingProbability);
  double q2 = getQ2(n, _samplingProbability);

  for (Tuple item : items) {
    double score = _rdg.nextUniform(0.0d, 1.0d);
    if (score < q1) {
      selected.add(item);
      continue;
    }
    if (score < q2) {
      ScoredTuple scored = new ScoredTuple(score, item);
      waiting.add(scored.getIntermediateTuple(tupleFactory));
    }
  }

  output.append(n);
  output.append(selected);
  output.append(waiting);
  return output;
}
protected static Long sumLongs(Tuple input) throws ExecException { // Can't just call sum, because the intermediate results are // now Longs insteads of Integers. DataBag values = (DataBag) input.get(0); // if we were handed an empty bag, return NULL // this is in compliance with SQL standard if (values.size() == 0) { return null; } long sum = 0; boolean sawNonNull = false; for (Iterator<Tuple> it = values.iterator(); it.hasNext(); ) { Tuple t = it.next(); try { Long l = (Long) (t.get(0)); if (l == null) continue; sawNonNull = true; sum += l; } catch (RuntimeException exp) { int errCode = 2103; String msg = "Problem while computing sum of longs."; throw new ExecException(msg, errCode, PigException.BUG, exp); } } if (sawNonNull) { return Long.valueOf(sum); } else { return null; } }
/**
 * Runs the {@code coalesceCastIntToDatetimeLazyTest} script over two input rows and checks the
 * datetime produced for each in alias {@code data3}: row 1 carries an epoch-millisecond value,
 * row 2 has an empty second column.
 */
@Test
public void coalesceCastIntToDatetimeLazyTest() throws Exception {
  PigTest test = createPigTestFromString(coalesceCastIntToDatetimeLazyTest);
  this.writeLinesToFile("input", "1,1375826183000", "2,");
  test.runScript();

  List<Tuple> rows = this.getLinesForAlias(test, "data3");
  Assert.assertEquals(2, rows.size());

  for (Tuple row : rows) {
    Integer testcase = (Integer) row.get(0);
    Assert.assertNotNull(testcase);

    int id = testcase.intValue();
    if (id == 1) {
      DateTime actual = ((DateTime) row.get(1)).toDateTime(DateTimeZone.UTC);
      Assert.assertEquals("2013-08-06T21:56:23.000Z", actual.toString());
    } else if (id == 2) {
      Assert.assertEquals("1970-01-01T00:00:00.000Z", row.get(1).toString());
    } else {
      Assert.fail("Did not expect: " + row.get(1));
    }
  }
}
/**
 * Orders two tuples: first by size (the shorter one sorts first), then field by field.
 *
 * <p>The first non-zero field comparison decides the result. Its sign is flipped when the
 * applicable ascending flag is false: {@code mAsc[0]} when {@code mWholeTuple} is set,
 * otherwise {@code mAsc[i]} for the field in question. As a side effect
 * {@code mHasNullField} is set to true whenever a compared field is null on either side.
 *
 * @return a negative, zero or positive value as {@code t1} sorts before, with or after
 *     {@code t2}
 * @throws RuntimeException wrapping an {@code ExecException} raised while reading a field
 */
private int compareTuple(Tuple t1, Tuple t2) {
  final int size1 = t1.size();
  final int size2 = t2.size();
  if (size1 != size2) {
    return size1 > size2 ? 1 : -1;
  }

  for (int i = 0; i < size1; i++) {
    try {
      Object left = t1.get(i);
      Object right = t2.get(i);
      if (left == null || right == null) {
        mHasNullField = true;
      }

      int c = DataType.compare(left, right);
      if (c == 0) {
        continue;
      }

      boolean ascending = mWholeTuple ? mAsc[0] : mAsc[i];
      if (!ascending) {
        c *= -1;
      }
      return c;
    } catch (ExecException e) {
      throw new RuntimeException("Unable to compare tuples", e);
    }
  }
  return 0;
}
private HashMap<String, Object> createMap(Tuple input) throws IOException { try { HashMap<String, Object> map = new HashMap<String, Object>(); if (input == null || input.size() == 0) { return map; // an empty map } for (int i = 0; i < input.size(); i = i + 2) { String key = input.get(i).toString(); if (null != key && (i + 1 < input.size())) { map.put(key, input.get(i + 1)); } } return map; } catch (Exception e) { int errCode = 2106; String msg = "Error while creating map with" + this.getClass().getSimpleName(); throw new ExecException(msg, errCode, PigException.BUG, e); } }
// See PIG-1434 @Test public void testScalarWithNoSchemaDollarProj() throws Exception { String[] scalarInput = {"1\t5"}; String[] input = {"1\t5", "2\t10", "3\t20"}; String inputPath = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProj"; TestScalarAliases.createLocalInputFile(inputPath, input); String inputPathScalar = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProjScalar"; TestScalarAliases.createLocalInputFile(inputPathScalar, scalarInput); // Load A as a scalar pigServer.registerQuery("A = LOAD '" + inputPath + "';"); pigServer.registerQuery("scalar = LOAD '" + inputPathScalar + "';"); pigServer.registerQuery("B = foreach A generate 5 / scalar.$1;"); Iterator<Tuple> iter = pigServer.openIterator("B"); Tuple t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); assertFalse(iter.hasNext()); }
// See PIG-1434 @Test public void testScalarAliasesFilterClause() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20", "4\t12", "5\t8"}; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesFilterClause"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0, a1);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate AVG(A.$1) as average;"); pigServer.registerQuery("Y = filter A by a1 > C.average;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); // Average is 11 Tuple t = iter.next(); assertTrue(t.toString().equals("(3,20)")); t = iter.next(); assertTrue(t.toString().equals("(4,12)")); assertFalse(iter.hasNext()); }
/** * Creates a serialized S4 event given Pig data. * * <p>All field names in the input tuple must match the name of a setter method in the event. For * example, an input field named "value" will invoke the <tt>setValue</tt> method when creating an * event. Setters are always called in the order specified in the constructor. A <tt>null</tt> * value means the setter for that field is not called. * * <p>Type mismatches will produce an exception. Differences in case are ignored. * * @param input Tuple of values for each field, in the order provided to the constructor. * @return Serialized version of the event. */ public DataByteArray exec(Tuple input) throws IOException { if (input == null || input.size() < methods.size()) return null; // create empty event object Object event; try { event = eventClass.newInstance(); } catch (Exception e) { e.printStackTrace(); return null; } // iterate through fields setting values for (int i = 0; i < methods.size(); i++) { if (input.get(i) != null) { MethodNamePair pair = methods.get(i); try { Method m = pair.method; m.invoke(event, input.get(i)); } catch (Exception e) { e.printStackTrace(); return null; } } } // serialize event Tuple outputTuple = tupleFactory.newTuple(2); byte[] rawEvent = serializer.serialize(event); DataByteArray serializedEvent = new DataByteArray(rawEvent); return serializedEvent; }
/**
 * Feeds a sketch holding "a" twice and "b" once to {@code FrequentStringsSketchToEstimates} and
 * expects two four-field rows, "a" then "b", whose three numeric fields all equal the item's
 * true count.
 */
@Test
public void exact() throws Exception {
  EvalFunc<DataBag> func = new FrequentStringsSketchToEstimates();

  ItemsSketch<String> sketch = new ItemsSketch<String>(8);
  sketch.update("a");
  sketch.update("a");
  sketch.update("b");
  byte[] serialized = sketch.toByteArray(new ArrayOfStringsSerDe());
  Tuple inputTuple = PigUtil.objectsToTuple(new DataByteArray(serialized));

  DataBag bag = func.exec(inputTuple);
  Assert.assertNotNull(bag);
  Assert.assertEquals(bag.size(), 2);

  String[] expectedItems = {"a", "b"};
  long[] expectedCounts = {2L, 1L};

  Iterator<Tuple> it = bag.iterator();
  for (int row = 0; row < expectedItems.length; row++) {
    Tuple tuple = it.next();
    Assert.assertEquals(tuple.size(), 4);
    Assert.assertEquals((String) tuple.get(0), expectedItems[row]);
    // Fields 1..3 are all expected to carry the same count for this item.
    for (int col = 1; col <= 3; col++) {
      Assert.assertEquals((long) tuple.get(col), expectedCounts[row]);
    }
  }
}
@Override public DataBag exec(Tuple input) throws IOException { try { DataBag bag = DefaultBagFactory.getInstance().newDefaultBag(); if (input == null || input.size() == 0) { return bag; // an empty bag } if (this.fieldType == DataType.MAP) { Tuple t = DefaultTupleFactory.getInstance().newTuple(1); t.set(0, createMap(input)); bag.add(t); } else { bag.add(input); } return bag; } catch (Exception e) { throw new RuntimeException( "Error while computing size in " + this.getClass().getSimpleName()); } }
/**
 * Appends the trimmed string form of {@code input} to {@code sb}, each followed by
 * {@code delim}, descending recursively into tuples and bags.
 *
 * <p>Null inputs and values that are empty after trimming contribute nothing.
 *
 * @param sb buffer to append to
 * @param input a tuple, a bag, or any other value (rendered via {@code toString()})
 * @param delim text appended after every non-empty value
 * @throws IOException declared for callers; recursion propagates it
 */
public void cat(StringBuilder sb, Object input, String delim) throws IOException {
  if (input == null) {
    return;
  }

  if (input instanceof Tuple) {
    for (Object field : ((Tuple) input).getAll()) {
      cat(sb, field, delim);
    }
    return;
  }

  if (input instanceof DataBag) {
    for (Tuple member : (DataBag) input) {
      for (Object field : member.getAll()) {
        cat(sb, field, delim);
      }
    }
    return;
  }

  // Leaf value: emit it only when something is left after trimming.
  String text = input.toString().trim();
  if (text.length() > 0) {
    sb.append(text).append(delim);
  }
}
/**
 * Merges the (count, selected, waiting) tuples of the input bag into the final sample.
 *
 * <p>Counts are added up and all selected and waiting bags are pooled. The target sample size
 * is {@code ceil(_samplingProbability * n)}; if the pooled selection is short of it, items are
 * taken from the waiting bag in its sorted order until the target is reached or the waiting bag
 * runs out.
 *
 * @param input tuple whose first field is the bag of intermediate tuples
 * @return the bag of sampled tuples
 * @throws IOException if reading a field fails
 */
@Override
public DataBag exec(Tuple input) throws IOException {
  DataBag bag = (DataBag) input.get(0);

  long n = 0L;
  DataBag selected = bagFactory.newDefaultBag();
  DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());
  for (Tuple partial : bag) {
    n += (Long) partial.get(0);
    selected.addAll((DataBag) partial.get(1));
    waiting.addAll((DataBag) partial.get(2));
  }

  long sampleSize = (long) Math.ceil(_samplingProbability * n);
  // How many items are still missing; zero or negative means the selection is already full.
  long shortfall = sampleSize - selected.size();

  long promoted = 0;
  for (Tuple scored : waiting) {
    if (promoted >= shortfall) {
      break;
    }
    selected.add(ScoredTuple.fromIntermediateTuple(scored).getTuple());
    promoted++;
  }
  return selected;
}
@Override public Tuple exec(Tuple input) throws IOException { // Initial is called in the map - for SUM // we just send the tuple down try { // input is a bag with one tuple containing // the column we are trying to sum DataBag bg = (DataBag) input.get(0); Integer i = null; if (bg.iterator().hasNext()) { Tuple tp = bg.iterator().next(); i = (Integer) tp.get(0); } return tfact.newTuple(i != null ? Long.valueOf(i) : null); } catch (NumberFormatException nfe) { // treat this particular input as null Tuple t = tfact.newTuple(1); t.set(0, null); return t; } catch (ExecException e) { throw e; } catch (Exception e) { int errCode = 2106; String msg = "Error while computing sum in " + this.getClass().getSimpleName(); throw new ExecException(msg, errCode, PigException.BUG, e); } }
/**
 * Adds a period to a datetime.
 *
 * @param input tuple of (datetime, period string); the string is handed to the {@code Period}
 *     constructor
 * @return the datetime shifted by the period, or {@code null} when the tuple is null, has fewer
 *     than two fields, or either field is null
 * @throws IOException if reading a field fails
 */
@Override
public DateTime exec(Tuple input) throws IOException {
  if (input == null || input.size() < 2) {
    return null;
  }

  Object date = input.get(0);
  Object duration = input.get(1);
  // A null operand yields a null result instead of a NullPointerException.
  if (date == null || duration == null) {
    return null;
  }

  return ((DateTime) date).plus(new Period((String) duration));
}
/**
 * Prints every key of the map to standard output, followed by its tuples, one per line,
 * tab-indented and rendered with {@code ", "} as the field delimiter.
 *
 * @param data tuples grouped by key
 * @throws ExecException if rendering a tuple fails
 */
public void printData(HashMap<Object, List<Tuple>> data) throws ExecException {
  for (Object key : data.keySet()) {
    System.out.println(key);
    List<Tuple> rows = data.get(key);
    for (Tuple row : rows) {
      String rendered = row.toDelimitedString(", ");
      System.out.println("\t" + rendered);
    }
  }
}
/**
 * Creates a tuple with exactly {@code size} columns, each holding the Integer 1.
 *
 * @param size number of columns wanted
 * @return the populated tuple
 */
private Tuple createTupleWithManyCols(int size) {
  // Start from an empty tuple: newTuple(size) would already contain `size` fields, and
  // appending on top of those would produce 2 * size columns with the first half null.
  Tuple t = TupleFactory.getInstance().newTuple();
  Integer col = Integer.valueOf(1);
  for (int i = 0; i < size; i++) {
    t.append(col);
  }
  return t;
}
@Override public Block next() throws IOException, InterruptedException { Tuple metaDataTuple = matchingMetaBlock.next(); if (metaDataTuple == null) return null; // Done System.out.println("Collate Vector: metadata tuple = " + metaDataTuple.toString()); return generateVectorBlock(metaDataTuple); }
protected Tuple getValueTuple(NullableTuple ntup, int index) throws ExecException { // Need to make a copy of the value, as hadoop uses the same ntup // to represent each value. Tuple val = (Tuple) ntup.getValueAsPigType(); Tuple copy = null; // The "value (val)" that we just got may not // be the complete "value". It may have some portions // in the "key" (look in POLocalRearrange for more comments) // If this is the case we need to stitch // the "value" together. Pair<Boolean, Map<Integer, Integer>> lrKeyInfo = keyInfo.get(index); boolean isProjectStar = lrKeyInfo.first; Map<Integer, Integer> keyLookup = lrKeyInfo.second; int keyLookupSize = keyLookup.size(); if (keyLookupSize > 0) { // we have some fields of the "value" in the // "key". copy = mTupleFactory.newTuple(); int finalValueSize = keyLookupSize + val.size(); int valIndex = 0; // an index for accessing elements from // the value (val) that we have currently for (int i = 0; i < finalValueSize; i++) { Integer keyIndex = keyLookup.get(i); if (keyIndex == null) { // the field for this index is not in the // key - so just take it from the "value" // we were handed copy.append(val.get(valIndex)); valIndex++; } else { // the field for this index is in the key if (isKeyTuple) { // the key is a tuple, extract the // field out of the tuple copy.append(keyAsTuple.get(keyIndex)); } else { copy.append(key); } } } } else if (isProjectStar) { // the whole "value" is present in the "key" copy = mTupleFactory.newTuple(keyAsTuple.getAll()); } else { // there is no field of the "value" in the // "key" - so just make a copy of what we got // as the "value" copy = mTupleFactory.newTuple(val.getAll()); } return copy; }
/**
 * Creates a default bag holding {@code size} entries, all of them the same one-field tuple
 * containing the Integer 1.
 *
 * @param size number of tuples to add
 * @return the populated bag
 */
private DataBag createBag(int size) {
  Tuple innerTuple = TupleFactory.getInstance().newTuple();
  innerTuple.append(Integer.valueOf(1));

  DataBag bag = BagFactory.getInstance().newDefaultBag();
  int remaining = size;
  while (remaining > 0) {
    // The same tuple instance is added every time.
    bag.add(innerTuple);
    remaining--;
  }
  return bag;
}
/**
 * Reads the next tuple from {@code block} and copies its leading fields into the shared
 * {@code outputTuple}: source field {@code i} is written to position {@code columnCopyMap[i]}.
 *
 * @return {@code outputTuple} (the same instance on every call), or {@code null} when the
 *     underlying block is exhausted
 */
@Override
public Tuple next() throws IOException, InterruptedException {
  final Tuple source = block.next();
  if (source != null) {
    int sourceColumn = 0;
    for (int targetColumn : columnCopyMap) {
      outputTuple.set(targetColumn, source.get(sourceColumn));
      sourceColumn++;
    }
    return outputTuple;
  }
  return null;
}
@SuppressWarnings("unchecked") @Override public DataBag exec(Tuple input) throws IOException { if (input.size() < 2) { throw new RuntimeException("Expected at least two inputs, but found " + input.size()); } for (Object o : input) { if (o != null && !(o instanceof DataBag)) { throw new RuntimeException("Inputs must be bags"); } } DataBag outputBag = bagFactory.newDefaultBag(); DataBag bag1 = (DataBag) input.get(0); DataBag bag2 = (DataBag) input.get(1); if (bag1 == null || bag1.size() == 0) { return outputBag; } // optimization else if (input.size() == 2 && (bag2 == null || bag2.size() == 0)) { return bag1; } PriorityQueue<Pair> pq = loadBags(input); Tuple lastData = null; while (true) { Pair nextPair = pq.peek(); // ignore data we've already encountered if (nextPair.data.compareTo(lastData) != 0) { // Only take data from the first bag, where there are no other // bags that have the same data. if (nextPair.index.equals(0) && countMatches(pq) == 0) { outputBag.add(nextPair.data); lastData = nextPair.data; } } Pair p = pq.poll(); // only put the bag back into the queue if it still has data if (p.hasNext()) { p.next(); pq.offer(p); } else if (p.index.equals(0)) { // stop when we exhaust all elements from the first bag break; } } return outputBag; }
/**
 * Upper-cases the string in the first field.
 *
 * @param input tuple whose first field is the string to convert
 * @return the upper-cased string, or {@code null} when the tuple is null or empty or the first
 *     field is null
 * @throws IOException wrapping any failure while reading or converting the field
 */
public String exec(Tuple input) throws IOException {
  if (input == null || input.size() == 0) {
    return null;
  }
  try {
    String str = (String) input.get(0);
    if (str == null) {
      // A null value maps to a null result instead of failing the row with an NPE.
      return null;
    }
    return str.toUpperCase();
  } catch (Exception e) {
    throw WrappedIOException.wrap("Caught exception processing input row ", e);
  }
}