/**
 * Returns the float adjacent to the argument in the direction of positive infinity, as computed
 * by {@link Math#nextUp(float)}.
 *
 * @param input tuple whose first field is expected to be a {@code Float}
 * @return the next-up value of the first field, or {@code null} when the tuple is null or empty
 *     or when its first field is null
 * @throws IOException if the first field cannot be read or is not a {@code Float}
 */
public Float exec(Tuple input) throws IOException {
  if (input == null || input.size() == 0) {
    return null;
  }
  Float d;
  try {
    d = (Float) input.get(0);
  } catch (Exception e) {
    throw new IOException("Caught exception processing input row ", e);
  }
  // A null field would otherwise fail with a NullPointerException while unboxing below.
  if (d == null) {
    return null;
  }
  return Math.nextUp(d);
}
/**
 * Combines the magnitude of one float with the sign of another, as computed by
 * {@link Math#copySign(float, float)}.
 *
 * @param input tuple whose first two fields are expected to be {@code Float} values: the
 *     magnitude source followed by the sign source
 * @return the first value carrying the sign of the second, or {@code null} when the tuple is null
 *     or has fewer than two fields
 * @throws IOException wrapping any exception raised while reading or unboxing the two fields
 */
@Override
public Float exec(Tuple input) throws IOException {
  if (input == null || input.size() < 2) {
    return null;
  }
  float magnitude;
  float sign;
  try {
    magnitude = (Float) input.get(0);
    sign = (Float) input.get(1);
  } catch (Exception e) {
    throw WrappedIOException.wrap("Caught exception processing input row ", e);
  }
  return Math.copySign(magnitude, sign);
}
/**
 * Compares two tuples, first by field count and then field by field.
 *
 * <p>A tuple with fewer fields sorts before one with more. For equal sizes the first pair of
 * fields for which {@code DataType.compare} is non-zero decides the result; that result is
 * negated when the applicable entry of {@code mAsc} is false ({@code mAsc[0]} when
 * {@code mWholeTuple} is set, otherwise the entry at the field's position). As a side effect,
 * {@code mHasNullField} is set when either field of an examined pair is null.
 *
 * @param t1 left-hand tuple
 * @param t2 right-hand tuple
 * @return negative, zero or positive following the usual comparator convention
 * @throws RuntimeException wrapping an {@code ExecException} raised while reading a field
 */
private int compareTuple(Tuple t1, Tuple t2) {
  int leftSize = t1.size();
  int rightSize = t2.size();
  if (leftSize != rightSize) {
    return rightSize < leftSize ? 1 : -1;
  }

  for (int pos = 0; pos < leftSize; pos++) {
    try {
      Object left = t1.get(pos);
      Object right = t2.get(pos);
      if (left == null || right == null) {
        mHasNullField = true;
      }

      int result = DataType.compare(left, right);
      if (result == 0) {
        continue;
      }

      // Whole-tuple sorts carry a single direction flag; per-column sorts carry one per field.
      boolean ascending = mWholeTuple ? mAsc[0] : mAsc[pos];
      return ascending ? result : result * -1;
    } catch (ExecException e) {
      throw new RuntimeException("Unable to compare tuples", e);
    }
  }
  return 0;
}
private HashMap<String, Object> createMap(Tuple input) throws IOException { try { HashMap<String, Object> map = new HashMap<String, Object>(); if (input == null || input.size() == 0) { return map; // an empty map } for (int i = 0; i < input.size(); i = i + 2) { String key = input.get(i).toString(); if (null != key && (i + 1 < input.size())) { map.put(key, input.get(i + 1)); } } return map; } catch (Exception e) { int errCode = 2106; String msg = "Error while creating map with" + this.getClass().getSimpleName(); throw new ExecException(msg, errCode, PigException.BUG, e); } }
public Tuple tupleFlatten(Tuple inTuple) throws IOException { final int count = outSchema.getNumColumns(); // TODO: can we reuse tuple? Tuple tuple = TupleFactory.getInstance().newTuple(count); // for each position, retrieve either column value or flattened value int outidx = 0; for (int colId = 0; colId < inTuple.size(); colId++) { Object obj = inTuple.get(colId); if (!isFlattenTuple(flattenPositions.get(colId))) { // Not a "flatten" column. Preserve object. tuple.set(outidx++, obj); continue; } // Object is a tuple. Flatten it. Tuple preFlattening = (Tuple) obj; int nColumnFields = this.inputColumnIndexToOutputTypes.get(colId).size(); if (obj == null || preFlattening.size() == 0) { for (int i = 0; i < nColumnFields; i++) tuple.set(outidx++, null); } else { for (int i = 0; i < nColumnFields; i++) tuple.set(outidx++, preFlattening.get(i)); } } if (outidx < count) throw new RuntimeException( String.format( "FlattenTuple: found fewer fields than expected=%d, found=%d", count, outidx)); return tuple; }
/** * Given two tuple bags as returned by the NGramGenerator function, return true if the two bags * contain the same number of tuples, which, pairwise, have the same contents. Strategy, sort the * bags, and then compare tuple by tuple. * * @param bag1 * @param bag2 * @return * @throws ExecException */ private boolean compareBags(DefaultDataBag bag1, DefaultDataBag bag2) throws ExecException { SortedDataBag sortedBag1 = new SortedDataBag(null); SortedDataBag sortedBag2 = new SortedDataBag(null); sortedBag1.addAll(bag1); sortedBag2.addAll(bag2); Iterator<Tuple> bag1Iter = sortedBag1.iterator(); Iterator<Tuple> bag2Iter = sortedBag2.iterator(); while (bag1Iter.hasNext()) { if (!bag2Iter.hasNext()) { return false; } Tuple t1 = bag1Iter.next(); Tuple t2 = bag2Iter.next(); // ************ // int t1Size = t1.size(); // int t2Size = t2.size(); // ************ if (t1.size() != t2.size()) return false; for (int i = 0; i < t1.size(); i++) { if (!t1.get(i).equals(t2.get(i))) return false; } } if (bag2Iter.hasNext()) return false; return true; }
/**
 * Feeds a small sketch ("a" twice, "b" once) through {@code FrequentStringsSketchToEstimates} and
 * checks that the resulting bag lists both items, in that order, as four-field tuples whose three
 * numeric fields all equal the exact item count.
 */
@Test
public void exact() throws Exception {
  ItemsSketch<String> sketch = new ItemsSketch<String>(8);
  sketch.update("a");
  sketch.update("a");
  sketch.update("b");

  EvalFunc<DataBag> func = new FrequentStringsSketchToEstimates();
  byte[] serialized = sketch.toByteArray(new ArrayOfStringsSerDe());
  Tuple inputTuple = PigUtil.objectsToTuple(new DataByteArray(serialized));

  DataBag bag = func.exec(inputTuple);
  Assert.assertNotNull(bag);
  Assert.assertEquals(bag.size(), 2);

  // Expected rows in iteration order: item followed by three numeric fields equal to its count.
  String[] expectedItems = {"a", "b"};
  long[] expectedCounts = {2L, 1L};

  Iterator<Tuple> rows = bag.iterator();
  for (int r = 0; r < expectedItems.length; r++) {
    Tuple row = rows.next();
    Assert.assertEquals(row.size(), 4);
    Assert.assertEquals((String) row.get(0), expectedItems[r]);
    for (int col = 1; col <= 3; col++) {
      Assert.assertEquals((long) row.get(col), expectedCounts[r]);
    }
  }
}
@SuppressWarnings("unchecked") @Override public DataBag exec(Tuple input) throws IOException { if (input.size() < 2) { throw new RuntimeException("Expected at least two inputs, but found " + input.size()); } for (Object o : input) { if (o != null && !(o instanceof DataBag)) { throw new RuntimeException("Inputs must be bags"); } } DataBag outputBag = bagFactory.newDefaultBag(); DataBag bag1 = (DataBag) input.get(0); DataBag bag2 = (DataBag) input.get(1); if (bag1 == null || bag1.size() == 0) { return outputBag; } // optimization else if (input.size() == 2 && (bag2 == null || bag2.size() == 0)) { return bag1; } PriorityQueue<Pair> pq = loadBags(input); Tuple lastData = null; while (true) { Pair nextPair = pq.peek(); // ignore data we've already encountered if (nextPair.data.compareTo(lastData) != 0) { // Only take data from the first bag, where there are no other // bags that have the same data. if (nextPair.index.equals(0) && countMatches(pq) == 0) { outputBag.add(nextPair.data); lastData = nextPair.data; } } Pair p = pq.poll(); // only put the bag back into the queue if it still has data if (p.hasNext()) { p.next(); pq.offer(p); } else if (p.index.equals(0)) { // stop when we exhaust all elements from the first bag break; } } return outputBag; }
/**
 * Builds a single string from the fields of the tuple, using the first field as the delimiter.
 *
 * <p>Fields from index 1 onward are handed to {@code cat} one at a time together with the
 * delimiter's string form. NOTE(review): how {@code cat} places the delimiter and renders null
 * fields is defined elsewhere; confirm there.
 *
 * @param input tuple whose first field is the delimiter and whose remaining fields are the
 *     values to combine
 * @return the combined string, or {@code null} when the tuple is null or has fewer than two
 *     fields
 * @throws IOException if a field cannot be read
 */
public String exec(Tuple input) throws IOException {
  if (input == null || input.size() < 2) {
    return null;
  }

  final String separator = input.get(0).toString();
  final StringBuilder joined = new StringBuilder();

  int fieldIndex = 1;
  while (fieldIndex < input.size()) {
    cat(joined, input.get(fieldIndex), separator);
    fieldIndex++;
  }
  return joined.toString();
}
/**
 * Converts a tuple into the argument array for the target method, casting each field to the
 * boxed form of the corresponding declared parameter class.
 *
 * <p>A null {@code paramClasses_} is treated as "no declared parameters": it matches a null
 * tuple (yielding {@code null}) or an empty tuple (yielding an empty array).
 *
 * @param t tuple holding one field per declared parameter; may be null
 * @return the argument array, or {@code null} when {@code t} is null and no parameter classes
 *     are declared
 * @throws ExecException if the tuple's field count does not match the declared parameter count,
 *     or a field cannot be read
 */
private Object[] tupleToArgs(Tuple t) throws ExecException {
  if (t == null) {
    if (paramClasses_ != null) {
      throw new ExecException("unable to match function arguments to declared signature.");
    }
    return null;
  }

  // Guard against a null paramClasses_; dereferencing it used to throw a
  // NullPointerException instead of reporting the signature mismatch.
  int expected = (paramClasses_ == null) ? 0 : paramClasses_.length;
  if (t.size() != expected) {
    throw new ExecException("unable to match function arguments to declared signature.");
  }

  Object[] args = new Object[expected];
  for (int i = 0; i < expected; i++) {
    args[i] = unPrimitivize(paramClasses_[i]).cast(t.get(i));
  }
  return args;
}
/**
 * Opens an iterator on every non-null, non-empty bag of the input tuple and collects them in a
 * priority queue.
 *
 * <p>Each iterator is wrapped in a {@link Pair} together with the position of its bag within the
 * tuple; the queue order is whatever {@code Pair}'s natural ordering defines. Null fields and
 * bags without elements contribute nothing to the queue.
 *
 * @param input tuple whose fields are {@code DataBag} instances or null
 * @return a priority queue holding one {@code Pair} per bag that has at least one tuple
 * @throws IOException if a field of the tuple cannot be read
 */
private PriorityQueue<Pair> loadBags(Tuple input) throws IOException {
  int bagCount = input.size();
  PriorityQueue<Pair> queue = new PriorityQueue<Pair>(bagCount);

  for (int position = 0; position < bagCount; position++) {
    Object field = input.get(position);
    if (field == null) {
      continue;
    }
    Iterator<Tuple> cursor = ((DataBag) field).iterator();
    if (cursor.hasNext()) {
      queue.add(new Pair(cursor, position));
    }
  }
  return queue;
}
private static Tuple buildInitialTupleForTheRow(Tuple input) throws ExecException { int numberOfTheColumns = 0; Tuple row = null; if (null == input) { return null; } else if (input.get(0) instanceof DataBag) { DataBag values = (DataBag) input.get(0); Iterator<Tuple> it = values.iterator(); row = it.next(); numberOfTheColumns = row.size(); } else { numberOfTheColumns = input.size(); row = input; } Tuple vaTuple = initTuple(numberOfTheColumns); // 0 1 2 3 4 5 // 2*3/2+2*2=7 // x0,x1->sumx0,sumx1,sum(x0*x0),sum(x0x1),sum(x1*x1) int i6 = -6; for (int i = 0; i < numberOfTheColumns; i++) { for (int j = i + 1; j < numberOfTheColumns; j++) { i6 += 6; // Jeff: to fix pivotal41573093:Although x or y is null,we can calculate the count. // count increaseTheValueOfElInTheTupleBy(vaTuple, i6, 1); if (null == row.get(i) || null == row.get(j)) { continue; } Double x = DataType.toDouble(row.get(i)); Double y = DataType.toDouble(row.get(j)); // value x increaseTheValueOfElInTheTupleBy(vaTuple, i6 + 1, x); // value y increaseTheValueOfElInTheTupleBy(vaTuple, i6 + 2, y); // value xx increaseTheValueOfElInTheTupleBy(vaTuple, i6 + 3, x * x); // value yy increaseTheValueOfElInTheTupleBy(vaTuple, i6 + 4, y * y); // value xy increaseTheValueOfElInTheTupleBy(vaTuple, i6 + 5, x * y); } } return vaTuple; }
/**
 * Looks up the integer id stored for a document identifier.
 *
 * <p>The document index is loaded through {@code loadDocumentIndex} from the path in the first
 * field the first time it is needed (that is, while {@code documentIndex} is still null).
 *
 * @param input two-field tuple: the document index path followed by the document identifier
 * @return the value {@code documentIndex} holds for the identifier, or {@code null} when the
 *     tuple is null or empty
 * @throws IOException if the tuple does not have exactly two fields or a field cannot be read
 */
@Override
public Integer exec(Tuple input) throws IOException {
  int fieldCount = (input == null) ? 0 : input.size();
  if (fieldCount == 0) {
    return null;
  }
  if (fieldCount != 2) {
    throw new IOException("ConvertDocumentIDToID requires 2 parameters");
  }

  String indexPath = (String) input.get(0);
  if (documentIndex == null) {
    loadDocumentIndex(indexPath);
  }

  String documentId = (String) input.get(1);
  return documentIndex.get(documentId);
}
@Override public DataBag exec(Tuple input) throws IOException { try { DataBag bag = DefaultBagFactory.getInstance().newDefaultBag(); if (input == null || input.size() == 0) { return bag; // an empty bag } if (this.fieldType == DataType.MAP) { Tuple t = DefaultTupleFactory.getInstance().newTuple(1); t.set(0, createMap(input)); bag.add(t); } else { bag.add(input); } return bag; } catch (Exception e) { throw new RuntimeException( "Error while computing size in " + this.getClass().getSimpleName()); } }
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() == 0) { return null; } String normStr = ((String) input.get(0)); if (normStr == null) { return null; } // Remove punctuation except when it's a version number normStr = punctPattern.matcher(normStr.trim().toLowerCase()).replaceAll(" "); normStr = spacePattern.matcher(normStr).replaceAll(" "); DataBag output = bagFactory.newDefaultBag(); for (String s : spacePattern.split(normStr.trim())) { if (s.length() <= 30) { Tuple t = tupleFactory.newTuple(1); t.set(0, s); output.add(t); } } return output; }
/**
 * Converts a chararray into a 14-digit number.
 *
 * <p>The characters {@code - + . ^ : ,} and spaces are removed from the text; the remainder is
 * cut to its first 14 characters when longer, right-padded with zeros when shorter, and parsed
 * as a {@code long}.
 *
 * @param input tuple whose first field must be of type {@code DataType.CHARARRAY}
 * @return the parsed value, or {@code null} when the tuple is null or empty
 * @throws IOException an {@code ExecException}; failures other than {@code ExecException}
 *     itself (wrong field type, unparsable text) are wrapped with error code 2107
 */
@Override
public Long exec(Tuple input) throws IOException {
  try {
    if (input == null || input.size() == 0) {
      return null;
    }
    if (input.getType(0) != DataType.CHARARRAY) {
      throw new RuntimeException(
          "Input type expected to be chararray but got: " + input.getType(0));
    }

    String stripped = ((String) input.get(0)).replaceAll("[-+.^:, ]", "");
    int length = stripped.length();

    String fourteen;
    if (length > 14) {
      fourteen = stripped.substring(0, 14);
    } else if (length < 14) {
      // Left-justify in a 14-wide field, then turn the padding spaces into zeros.
      fourteen = String.format("%-14s", stripped).replace(' ', '0');
    } else {
      fourteen = stripped;
    }
    return Long.parseLong(fourteen);
  } catch (ExecException exp) {
    throw exp;
  } catch (Exception e) {
    int errCode = 2107;
    String msg = "Error while computing date_format in " + this.getClass().getSimpleName();
    throw new ExecException(msg, errCode, PigException.BUG, e);
  }
}
@Override public String exec(Tuple input) throws IOException { // validate input if (input == null || input.size() == 0 || input.get(0) == null) { return null; } // get the value of input String strAddress = (String) input.get(0); // Get geoip information try { String result = this.geo.getCountryName(strAddress); // replace "--" and "N/A" to null, better for pig if (result == null || result.equals("--") || result.equals("N/A")) { return null; } else { return result; } } catch (Exception e) { // e.printStackTrace(); return null; } }
/** * Creates a serialized S4 event given Pig data. * * <p>All field names in the input tuple must match the name of a setter method in the event. For * example, an input field named "value" will invoke the <tt>setValue</tt> method when creating an * event. Setters are always called in the order specified in the constructor. A <tt>null</tt> * value means the setter for that field is not called. * * <p>Type mismatches will produce an exception. Differences in case are ignored. * * @param input Tuple of values for each field, in the order provided to the constructor. * @return Serialized version of the event. */ public DataByteArray exec(Tuple input) throws IOException { if (input == null || input.size() < methods.size()) return null; // create empty event object Object event; try { event = eventClass.newInstance(); } catch (Exception e) { e.printStackTrace(); return null; } // iterate through fields setting values for (int i = 0; i < methods.size(); i++) { if (input.get(i) != null) { MethodNamePair pair = methods.get(i); try { Method m = pair.method; m.invoke(event, input.get(i)); } catch (Exception e) { e.printStackTrace(); return null; } } } // serialize event Tuple outputTuple = tupleFactory.newTuple(2); byte[] rawEvent = serializer.serialize(event); DataByteArray serializedEvent = new DataByteArray(rawEvent); return serializedEvent; }
/**
 * Extracts one capture group from a string using a regular expression.
 *
 * <p>The compiled pattern is cached in {@code mPattern} and only recompiled when the expression
 * in the second field differs from {@code mExpression}. The cache is updated only after a
 * successful compile, so a malformed expression can never leave a stale pattern paired with the
 * new expression text. Depending on {@code mUseMatches} the whole input must match
 * ({@code Matcher.matches}) or a match is searched for ({@code Matcher.find}).
 *
 * @param input three-field tuple: the string to search, the regular expression, and the
 *     {@code Integer} index of the group to return
 * @return the requested group, or {@code null} when the first field is null, nothing matched, or
 *     the pattern has fewer groups than the requested index (a warning is issued in the latter
 *     two cases)
 * @throws IOException if the tuple does not have exactly three fields, the expression is null,
 *     or the expression cannot be compiled
 */
@Override
public String exec(Tuple input) throws IOException {
  if (input.size() != 3) {
    String msg = "RegexExtract : Only 3 parameters are allowed.";
    throw new IOException(msg);
  }
  if (input.get(0) == null) {
    return null;
  }

  Object regex = input.get(1);
  if (regex == null) {
    // Explicit check instead of catching NullPointerException as control flow.
    throw new IOException("RegexExtract : Regular expression is null");
  }
  if (!regex.equals(mExpression)) {
    try {
      String expression = (String) regex;
      Pattern compiled = Pattern.compile(expression);
      // Publish both cache fields together, and only once compilation has succeeded.
      mExpression = expression;
      mPattern = compiled;
    } catch (Exception e) {
      String msg = "RegexExtract : Mal-Formed Regular expression : " + regex;
      throw new IOException(msg, e);
    }
  }

  int mIndex = (Integer) input.get(2);
  Matcher m = mPattern.matcher((String) input.get(0));
  if (!mUseMatches && m.find() || mUseMatches && m.matches()) {
    if (m.groupCount() >= mIndex) {
      return m.group(mIndex);
    }
  }
  warn("RegexExtract : Cannot extract group for input " + input.get(0), PigWarning.UDF_WARNING_1);
  return null;
}
/**
 * Adds a period to a date-time.
 *
 * @param input tuple whose first field is a {@code DateTime} and whose second field is a
 *     {@code String} from which a {@code Period} is constructed
 * @return the date-time shifted by the period, or {@code null} when the tuple is null, has fewer
 *     than two fields, or either of the two fields is null
 * @throws IOException if a field cannot be read
 */
@Override
public DateTime exec(Tuple input) throws IOException {
  if (input == null || input.size() < 2) {
    return null;
  }
  Object start = input.get(0);
  Object duration = input.get(1);
  // A null operand yields a null result; a null date-time used to throw NullPointerException.
  if (start == null || duration == null) {
    return null;
  }
  return ((DateTime) start).plus(new Period((String) duration));
}
/**
 * Replaces a null last field of the row with {@code MARKER}.
 *
 * <p>The row is the first tuple of the bag in field 0 when that field is a {@code DataBag},
 * otherwise the input tuple itself. Nothing happens when {@code input} is null or when the last
 * field is already non-null.
 *
 * @param input a row tuple, or a tuple whose first field is a bag containing the row
 * @throws ExecException if a field cannot be read or written
 */
private static void markTheTuple(Tuple input) throws ExecException {
  if (null == input) {
    return;
  }

  Tuple row = input;
  if (input.get(0) instanceof DataBag) {
    DataBag values = (DataBag) input.get(0);
    row = values.iterator().next();
  }

  int lastIndex = row.size() - 1;
  if (null == row.get(lastIndex)) {
    row.set(lastIndex, MARKER);
  }
}
protected Tuple getValueTuple(NullableTuple ntup, int index) throws ExecException { // Need to make a copy of the value, as hadoop uses the same ntup // to represent each value. Tuple val = (Tuple) ntup.getValueAsPigType(); Tuple copy = null; // The "value (val)" that we just got may not // be the complete "value". It may have some portions // in the "key" (look in POLocalRearrange for more comments) // If this is the case we need to stitch // the "value" together. Pair<Boolean, Map<Integer, Integer>> lrKeyInfo = keyInfo.get(index); boolean isProjectStar = lrKeyInfo.first; Map<Integer, Integer> keyLookup = lrKeyInfo.second; int keyLookupSize = keyLookup.size(); if (keyLookupSize > 0) { // we have some fields of the "value" in the // "key". copy = mTupleFactory.newTuple(); int finalValueSize = keyLookupSize + val.size(); int valIndex = 0; // an index for accessing elements from // the value (val) that we have currently for (int i = 0; i < finalValueSize; i++) { Integer keyIndex = keyLookup.get(i); if (keyIndex == null) { // the field for this index is not in the // key - so just take it from the "value" // we were handed copy.append(val.get(valIndex)); valIndex++; } else { // the field for this index is in the key if (isKeyTuple) { // the key is a tuple, extract the // field out of the tuple copy.append(keyAsTuple.get(keyIndex)); } else { copy.append(key); } } } } else if (isProjectStar) { // the whole "value" is present in the "key" copy = mTupleFactory.newTuple(keyAsTuple.getAll()); } else { // there is no field of the "value" in the // "key" - so just make a copy of what we got // as the "value" copy = mTupleFactory.newTuple(val.getAll()); } return copy; }
/**
 * Converts the first field of the tuple to upper case.
 *
 * @param input tuple whose first field is expected to be a {@code String}
 * @return the upper-cased string, or {@code null} when the tuple is null or empty or its first
 *     field is null
 * @throws IOException wrapping any exception raised while reading or casting the field
 */
public String exec(Tuple input) throws IOException {
  if (input == null || input.size() == 0) {
    return null;
  }
  try {
    String str = (String) input.get(0);
    // A null field yields null instead of a NullPointerException wrapped in an IOException.
    if (str == null) {
      return null;
    }
    return str.toUpperCase();
  } catch (Exception e) {
    throw WrappedIOException.wrap("Caught exception processing input row ", e);
  }
}
/**
 * Returns the unbiased exponent used in the representation of a float, as computed by
 * {@link Math#getExponent(float)}.
 *
 * @param input tuple whose first field is expected to be a {@code Float}
 * @return the unbiased exponent of the first field, or {@code null} when the tuple is null or
 *     empty or its first field is null
 * @throws IOException if the first field cannot be read or is not a {@code Float}
 */
public Integer exec(Tuple input) throws IOException {
  boolean nothingToProcess = input == null || input.size() == 0 || input.get(0) == null;
  if (nothingToProcess) {
    return null;
  }
  try {
    Float value = (Float) input.get(0);
    return Math.getExponent(value);
  } catch (Exception e) {
    throw new IOException("Caught exception processing input row ", e);
  }
}
/**
 * Collects the classification code values of a serialized {@code DocumentMetadata} and returns
 * them only when there are more than a given limit.
 *
 * <p>Field 1 of the input holds the serialized metadata as a {@code DataByteArray}; field 2
 * holds the {@code Integer} limit. Field 0 is not read here.
 *
 * @param input tuple with at least three fields
 * @return a tuple {@code (key, bag of code values, number of values)} when the number of values
 *     exceeds the limit; {@code null} otherwise, and also when the tuple is null, has fewer than
 *     three fields, or the metadata or limit field is null
 * @throws IOException if a field cannot be read or the metadata cannot be parsed
 * @throws ClassCastException if field 1 is not a {@code DataByteArray} (logged before rethrow)
 */
@Override
public Tuple exec(Tuple input) throws IOException {
  // Fields 1 and 2 are read below, so anything shorter than three fields cannot be processed.
  if (input == null || input.size() < 3) {
    return null;
  }

  Object obj = null;
  Integer limnum = null;
  try {
    // Read without casting so that a wrong type reaches the logged cast further down.
    obj = input.get(1);
  } catch (ExecException e) {
    logger.error("Error in reading field proto:", e);
    throw e;
  }
  try {
    limnum = (Integer) input.get(2);
  } catch (ExecException e) {
    logger.error("Error in reading baglimit:", e);
    throw e;
  }

  // Null operands yield a null result instead of a NullPointerException later on.
  if (obj == null || limnum == null) {
    return null;
  }

  DataByteArray dba = null;
  try {
    dba = (DataByteArray) obj;
  } catch (ClassCastException e) {
    logger.error("Error in casting Object (" + input.getType(1) + ") to DataByteArray:", e);
    throw e;
  }

  DocumentMetadata dm = null;
  try {
    dm = DocumentMetadata.parseFrom(dba.get());
  } catch (InvalidProtocolBufferException e) {
    logger.error("Error in reading ByteArray to DocumentMetadata:", e);
    throw e;
  }

  String key = dm.getKey();
  DataBag db = new DefaultDataBag();
  int bagsize = 0;
  for (ClassifCode code : dm.getBasicMetadata().getClassifCodeList()) {
    for (String co_str : code.getValueList()) {
      bagsize++;
      db.add(TupleFactory.getInstance().newTuple(co_str));
    }
  }

  if (bagsize > limnum) {
    Object[] to = new Object[] {key, db, bagsize};
    return TupleFactory.getInstance().newTuple(Arrays.asList(to));
  }
  return null;
}
/**
 * Counts the tuples in the bag held by the first field whose own first field is non-null.
 *
 * <p>Null tuples, empty tuples and tuples whose first field is null are not counted.
 *
 * @param input tuple whose first field is a {@code DataBag}
 * @return the number of counted tuples
 * @throws ExecException if a field cannot be read
 */
protected static long count(Tuple input) throws ExecException {
  DataBag values = (DataBag) input.get(0);
  long counted = 0;
  for (Iterator<Tuple> cursor = values.iterator(); cursor.hasNext(); ) {
    Tuple candidate = cursor.next();
    boolean hasValue = candidate != null && candidate.size() > 0 && candidate.get(0) != null;
    if (hasValue) {
      counted++;
    }
  }
  return counted;
}
/**
 * Returns the number of fields in the input tuple.
 *
 * @param input the tuple to measure; may be null
 * @return the field count as a {@code Long}, or {@code null} when {@code input} is null
 * @throws IOException an {@code ExecException} (error code 2106) wrapping any failure
 */
@Override
public Long exec(Tuple input) throws IOException {
  try {
    return (input == null) ? null : Long.valueOf(input.size());
  } catch (Exception e) {
    int errCode = 2106;
    String msg = "Error while computing size in " + this.getClass().getSimpleName();
    throw new ExecException(msg, errCode, PigException.BUG, e);
  }
}
/**
 * Adds the fields of {@code t} onto the matching positions of the aggregation tuple.
 *
 * <p>For each position of {@code tupTmp}, the field of {@code t} at that position is converted
 * with {@code DataType.toDouble} and passed to {@code increaseTheValueOfElInTheTupleBy}. Fields
 * that are null or {@code String} instances are skipped.
 *
 * @param tupTmp the aggregation tuple, updated in place
 * @param t the tuple whose fields are merged in
 * @return {@code tupTmp}, for convenience
 * @throws ExecException if a field cannot be read or converted
 */
private static Tuple mergeResultsIntoAggregation(Tuple tupTmp, Tuple t) throws ExecException {
  final int width = tupTmp.size();
  for (int col = 0; col < width; col++) {
    Object value = t.get(col);
    boolean mergeable = value != null && !(value instanceof String);
    if (mergeable) {
      increaseTheValueOfElInTheTupleBy(tupTmp, col, DataType.toDouble(value));
    }
  }
  return tupTmp;
}
/**
 * Passes the first field of the tuple, cast to {@code String}, to {@code extractQuery} and
 * returns its result.
 *
 * @param tuple tuple whose first field is expected to be a URL string
 * @return whatever {@code extractQuery} returns for the first field, or {@code null} when the
 *     tuple is null or empty
 * @throws IOException wrapping an {@code ExecException} raised while processing the field
 */
@Override
public String exec(Tuple tuple) throws IOException {
  boolean empty = tuple == null || tuple.size() < 1;
  if (empty) {
    return null;
  }
  try {
    return extractQuery((String) tuple.get(0));
  } catch (ExecException ee) {
    throw new IOException(ee);
  }
}
@Override public DataBag exec(Tuple input) throws IOException { retrieveContextValues(); ArrayList<String> joinKeyNames = new ArrayList<String>(); for (int i = 1; i < input.size(); i += 2) { joinKeyNames.add((String) input.get(i)); } JoinCollector collector = new JoinCollector(); // the first bag is the outer bag String leftBagName = bagNames.get(0); DataBag leftBag = getBag(input, leftBagName); String leftBagJoinKeyName = getPrefixedAliasName(bagNameToJoinKeyPrefix.get(leftBagName), joinKeyNames.get(0)); collector.setJoinData(collector.groupTuples(leftBag, leftBagJoinKeyName)); // now, for each additional bag, group up the tuples by the join key, then join them in if (bagNames.size() > 1) { for (int i = 1; i < bagNames.size(); i++) { String bagName = bagNames.get(i); DataBag bag = getBag(input, bagName); String joinKeyName = getPrefixedAliasName(bagNameToJoinKeyPrefix.get(bagName), joinKeyNames.get(i)); int tupleSize = bagNameToSize.get(bagName); if (bag == null) throw new IOException( "Error in instance: " + getInstanceName() + " with properties: " + getInstanceProperties() + " and tuple: " + input.toDelimitedString(", ") + " -- Expected bag, got null"); HashMap<Object, List<Tuple>> groupedData = collector.groupTuples(bag, joinKeyName); // outer join, so go back in and add nulls; groupedData = collector.insertNullTuples(groupedData, tupleSize); for (Map.Entry<Object, List<Tuple>> entry : groupedData.entrySet()) { collector.joinTuples(entry.getKey(), entry.getValue()); } } } // assemble output bag DataBag outputBag = BagFactory.getInstance().newDefaultBag(); for (List<Tuple> tuples : collector.getJoinData().values()) { for (Tuple tuple : tuples) { outputBag.add(tuple); } } return outputBag; }