/**
 * Collects the word tokens found in a document's titles.
 *
 * <p>Each title whose language matches {@code lang} is passed through a
 * {@link DiacriticsRemover}, split on runs of non-word characters, and every
 * non-empty piece is handed to {@code normalizeExtracted}. Pieces for which
 * that call returns {@code null} are dropped; the rest are wrapped in
 * single-field tuples.
 *
 * @param o the document; cast to {@link DocumentMetadata}
 * @param lang language to keep (compared case-insensitively), or {@code null} to keep
 *     titles of every language
 * @return a bag with one tuple per accepted token (possibly empty, never {@code null})
 */
@Override
public DataBag extract(Object o, String lang) {
  DocumentMetadata metadata = (DocumentMetadata) o;
  DataBag tokens = new DefaultDataBag();
  DiacriticsRemover diacriticsRemover = new DiacriticsRemover();

  for (TextWithLanguage title : metadata.getBasicMetadata().getTitleList()) {
    boolean languageAccepted = lang == null || lang.equalsIgnoreCase(title.getLanguage());
    if (!languageAccepted) {
      continue;
    }

    String cleanedTitle = (String) diacriticsRemover.normalize(title.getText());
    if (cleanedTitle != null) {
      for (String word : cleanedTitle.split("[\\W]+")) {
        if (!word.isEmpty()) {
          Object token = normalizeExtracted(word);
          if (token != null) {
            tokens.add(TupleFactory.getInstance().newTuple(token));
          }
        }
      }
    }
  }
  return tokens;
}
protected static Long sumLongs(Tuple input) throws ExecException { // Can't just call sum, because the intermediate results are // now Longs insteads of Integers. DataBag values = (DataBag) input.get(0); // if we were handed an empty bag, return NULL // this is in compliance with SQL standard if (values.size() == 0) { return null; } long sum = 0; boolean sawNonNull = false; for (Iterator<Tuple> it = values.iterator(); it.hasNext(); ) { Tuple t = it.next(); try { Long l = (Long) (t.get(0)); if (l == null) continue; sawNonNull = true; sum += l; } catch (RuntimeException exp) { int errCode = 2103; String msg = "Problem while computing sum of longs."; throw new ExecException(msg, errCode, PigException.BUG, exp); } } if (sawNonNull) { return Long.valueOf(sum); } else { return null; } }
@Override public Tuple exec(Tuple input) throws IOException { // Initial is called in the map - for SUM // we just send the tuple down try { // input is a bag with one tuple containing // the column we are trying to sum DataBag bg = (DataBag) input.get(0); Integer i = null; if (bg.iterator().hasNext()) { Tuple tp = bg.iterator().next(); i = (Integer) tp.get(0); } return tfact.newTuple(i != null ? Long.valueOf(i) : null); } catch (NumberFormatException nfe) { // treat this particular input as null Tuple t = tfact.newTuple(1); t.set(0, null); return t; } catch (ExecException e) { throw e; } catch (Exception e) { int errCode = 2106; String msg = "Error while computing sum in " + this.getClass().getSimpleName(); throw new ExecException(msg, errCode, PigException.BUG, e); } }
@Test public void testCase1() throws IOException { Tuple input = TupleFactory.getInstance().newTuple(2); Tuple groupInfo = TupleFactory.getInstance().newTuple(2); groupInfo.set(0, "column_3"); groupInfo.set(1, Integer.valueOf(1)); DataBag dataBag = new DefaultDataBag(); // {(PSIColumn: int, columnId: int, value: chararray, tag: boolean , rand: int)} for (int i = 0; i < 10; i++) { Tuple t = TupleFactory.getInstance().newTuple(4); t.set(0, Integer.valueOf(1)); t.set(1, Integer.valueOf(1)); t.set(2, array[i]); dataBag.add(t); } input.set(0, groupInfo); input.set(1, dataBag); Tuple output = inst.exec(input); Assert.assertEquals(output.get(0), 1); String[] outputArray = output.get(1).toString().split(String.valueOf(CalculateStatsUDF.CATEGORY_VAL_SEPARATOR)); Assert.assertEquals(outputArray[0], "1"); Assert.assertEquals(outputArray[1], "2"); Assert.assertEquals(outputArray[2], "0"); }
/**
 * Reads one bag from the stream: skips ahead to the opening '{', then reads
 * tuples via {@code consumeTuple} until the matching '}' is reached, with ','
 * separating consecutive tuples.
 *
 * @param in stream positioned at or before the bag's opening brace
 * @param fieldSchema schema of the bag; its inner schema must contain exactly one field
 * @return the bag of all non-null tuples that were read
 * @throws IOException if the schema is null, the stream ends prematurely, or the
 *     bag schema does not contain exactly one field
 */
private DataBag consumeBag(PushbackInputStream in, ResourceFieldSchema fieldSchema)
    throws IOException {
  if (fieldSchema == null) {
    throw new IOException("Schema is null");
  }
  ResourceFieldSchema[] innerFields = fieldSchema.getSchema().getFields();

  // Discard everything up to and including the opening brace.
  int ch = in.read();
  while (ch != '{') {
    if (ch == -1) {
      throw new IOException("Unexpect end of bag");
    }
    ch = in.read();
  }

  if (innerFields.length != 1) {
    throw new IOException("Only tuple is allowed inside bag schema");
  }
  ResourceFieldSchema tupleSchema = innerFields[0];

  DataBag bag = DefaultBagFactory.getInstance().newDefaultBag();
  do {
    Tuple element = consumeTuple(in, tupleSchema);
    if (element != null) {
      bag.add(element);
    }
    // Advance to the next delimiter: ',' means another tuple follows, '}' ends the bag.
    ch = in.read();
    while (ch != '}' && ch != ',') {
      if (ch == -1) {
        throw new IOException("Unexpect end of bag");
      }
      ch = in.read();
    }
  } while (ch != '}');
  return bag;
}
@Override public DataBag exec(Tuple input) throws IOException { try { DataBag bag = DefaultBagFactory.getInstance().newDefaultBag(); if (input == null || input.size() == 0) { return bag; // an empty bag } if (this.fieldType == DataType.MAP) { Tuple t = DefaultTupleFactory.getInstance().newTuple(1); t.set(0, createMap(input)); bag.add(t); } else { bag.add(input); } return bag; } catch (Exception e) { throw new RuntimeException( "Error while computing size in " + this.getClass().getSimpleName()); } }
/**
 * Computes each class label's share of the total weight in a bag.
 *
 * <p>For every tuple of the bag in field 0 of {@code input}, field 3 is read as
 * the class label and field 2 as a {@code Double} weight. Tuples with a null or
 * empty label are ignored. The output has one {@code (label, weight / total)}
 * tuple per distinct label.
 *
 * <p>A null weight on a labelled tuple raises {@link NullPointerException}, as
 * before. If all weights sum to zero the division yields a non-finite value.
 *
 * @param input tuple whose first field is the bag to summarise
 * @return bag of {@code (label, fraction)} tuples; empty when no tuple has a label
 */
@Override
public DataBag exec(Tuple input) throws IOException {
  DataBag bag = (DataBag) input.get(0);
  HashMap<String, Double> clsCnt = new HashMap<String, Double>();
  // Primitive accumulator: avoids the deprecated Double constructor and the
  // box/unbox round trip on every addition.
  double sum = 0.0;

  for (Tuple item : bag) {
    String cls = (String) item.get(3);
    if (cls == null || cls.length() == 0) {
      continue;
    }
    Double cur = clsCnt.get(cls);
    double inc = (Double) item.get(2);
    clsCnt.put(cls, cur != null ? cur + inc : inc);
    sum += inc;
  }

  DataBag result = bagFactory.newDefaultBag();
  for (Entry<String, Double> cls : clsCnt.entrySet()) {
    Tuple tpl = tupleFactory.newTuple(2);
    tpl.set(0, cls.getKey());
    tpl.set(1, cls.getValue() / sum);
    result.add(tpl);
  }
  return result;
}
protected static Long sum(Tuple input) throws ExecException, IOException { DataBag values = (DataBag) input.get(0); // if we were handed an empty bag, return NULL if (values.size() == 0) { return null; } long sum = 0; boolean sawNonNull = false; for (Iterator<Tuple> it = values.iterator(); it.hasNext(); ) { Tuple t = it.next(); try { Long l = (Long) (t.get(0)); if (l == null) continue; sawNonNull = true; sum += l; } catch (RuntimeException exp) { int errCode = 2103; String msg = "Problem while computing sum of longs."; throw new ExecException(msg, errCode, PigException.BUG, exp); } } if (sawNonNull) { return Long.valueOf(sum); } else { return null; } }
@Override public Tuple exec(Tuple input) throws IOException { DataBag output = BagFactory.getInstance().newDefaultBag(); DataBag samples = (DataBag) input.get(0); if (samples == null) { // do nothing } else if (samples.size() <= numSamples) { // no need to construct a reservoir, so just emit intermediate tuples for (Tuple sample : samples) { // add the score on to the intermediate tuple output.add(new ScoredTuple(Math.random(), sample).getIntermediateTuple(tupleFactory)); } } else { for (Tuple sample : samples) { getReservoir().consider(new ScoredTuple(Math.random(), sample)); } for (ScoredTuple scoredTuple : getReservoir()) { // add the score on to the intermediate tuple output.add(scoredTuple.getIntermediateTuple(tupleFactory)); } } return tupleFactory.newTuple(output); }
/**
 * Updates a sketch with "a" twice and "b" once, serialises it, and checks that
 * the UDF returns two four-field rows: ("a", 2, 2, 2) followed by
 * ("b", 1, 1, 1).
 */
@Test
public void exact() throws Exception {
  ItemsSketch<String> sketch = new ItemsSketch<String>(8);
  sketch.update("a");
  sketch.update("a");
  sketch.update("b");

  EvalFunc<DataBag> func = new FrequentStringsSketchToEstimates();
  Tuple inputTuple =
      PigUtil.objectsToTuple(new DataByteArray(sketch.toByteArray(new ArrayOfStringsSerDe())));
  DataBag bag = func.exec(inputTuple);

  Assert.assertNotNull(bag);
  Assert.assertEquals(bag.size(), 2);

  // Expected rows in output order; fields 1..3 all equal the count.
  String[] expectedItems = {"a", "b"};
  long[] expectedCounts = {2L, 1L};
  Iterator<Tuple> rows = bag.iterator();
  for (int i = 0; i < expectedItems.length; i++) {
    Tuple row = rows.next();
    Assert.assertEquals(row.size(), 4);
    Assert.assertEquals((String) row.get(0), expectedItems[i]);
    for (int field = 1; field <= 3; field++) {
      Assert.assertEquals((long) row.get(field), expectedCounts[i]);
    }
  }
}
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() == 0) { return null; } String normStr = ((String) input.get(0)); if (normStr == null) { return null; } // Remove punctuation except when it's a version number normStr = punctPattern.matcher(normStr.trim().toLowerCase()).replaceAll(" "); normStr = spacePattern.matcher(normStr).replaceAll(" "); DataBag output = bagFactory.newDefaultBag(); for (String s : spacePattern.split(normStr.trim())) { if (s.length() <= 30) { Tuple t = tupleFactory.newTuple(1); t.set(0, s); output.add(t); } } return output; }
/**
 * Returns the tuples currently held in the reservoir, without their scores.
 *
 * @return a new bag containing the underlying tuple of every reservoir entry
 */
@Override
public DataBag getValue() {
  DataBag sampled = BagFactory.getInstance().newDefaultBag();
  for (ScoredTuple kept : getReservoir()) {
    Tuple original = kept.getTuple();
    sampled.add(original);
  }
  return sampled;
}
/**
 * Builds a bag containing the given number of entries.
 *
 * <p>Every entry is the same tuple instance, which has a single field holding
 * the Integer 1.
 *
 * @param size number of entries to add; zero or negative gives an empty bag
 * @return the populated bag
 */
private DataBag createBag(int size) {
  Tuple sharedTuple = TupleFactory.getInstance().newTuple();
  sharedTuple.append(Integer.valueOf(1));

  DataBag result = BagFactory.getInstance().newDefaultBag();
  for (int remaining = size; remaining > 0; remaining--) {
    result.add(sharedTuple);
  }
  return result;
}
/**
 * Returns the input bag unchanged when it holds at most {@code numSamples}
 * entries; otherwise delegates to the superclass implementation.
 *
 * @param input tuple whose first field is the bag of samples
 * @return the original bag, or the superclass result for larger bags
 */
@Override
public DataBag exec(Tuple input) throws IOException {
  DataBag samples = (DataBag) input.get(0);
  if (samples.size() > numSamples) {
    return super.exec(input);
  }
  // Small enough already: nothing to sample.
  return samples;
}
/**
 * Serialises a sketch that received no updates and checks that the UDF returns
 * a non-null, empty bag.
 */
@Test
public void emptySketch() throws Exception {
  ItemsSketch<String> sketch = new ItemsSketch<String>(8);
  DataByteArray serialized = new DataByteArray(sketch.toByteArray(new ArrayOfStringsSerDe()));

  EvalFunc<DataBag> func = new FrequentStringsSketchToEstimates();
  DataBag bag = func.exec(PigUtil.objectsToTuple(serialized));

  Assert.assertNotNull(bag);
  Assert.assertEquals(bag.size(), 0);
}
/**
 * Counts the tuples in the bag held in field 0 of {@code input} whose first
 * field is non-null.
 *
 * <p>Null tuples and tuples with no fields are not counted.
 *
 * @param input tuple whose first field is the bag to count
 * @return number of tuples with a non-null first field
 * @throws ExecException if a field cannot be read
 */
protected static long count(Tuple input) throws ExecException {
  DataBag values = (DataBag) input.get(0);
  long cnt = 0;
  // Typed iteration replaces the raw Iterator and its cast.
  for (Tuple t : values) {
    if (t != null && t.size() > 0 && t.get(0) != null) {
      cnt++;
    }
  }
  return cnt;
}
/**
 * Emits a document's classification codes when there are more of them than a
 * given limit.
 *
 * <p>Field 1 of {@code input} is a serialised {@link DocumentMetadata}
 * ({@link DataByteArray}); field 2 is the limit as an {@code Integer}. All
 * values of all classification codes are collected into a bag. If their number
 * exceeds the limit, the result is {@code (key, bag, count)}; otherwise
 * {@code null}.
 *
 * <p>Field 1 is now read as a plain {@code Object} first. The original cast it
 * while reading, which made the dedicated {@code ClassCastException} handler
 * unreachable, so a wrong field type was never logged.
 *
 * @param input tuple carrying the serialised document and the limit
 * @return {@code (key, codes, count)} when the count exceeds the limit, else {@code null};
 *     also {@code null} for a null or empty input tuple
 * @throws IOException if a field cannot be read or the bytes cannot be parsed
 * @throws ClassCastException if field 1 is not a {@code DataByteArray}
 */
@Override
public Tuple exec(Tuple input) throws IOException {
  if (input == null || input.size() == 0) {
    return null;
  }

  Object obj;
  try {
    obj = input.get(1);
  } catch (ExecException e) {
    logger.error("Error in reading field proto:", e);
    throw e;
  }

  // NOTE(review): a null limit makes the comparison below throw
  // NullPointerException on unboxing — confirm callers always supply it.
  Integer limnum;
  try {
    limnum = (Integer) input.get(2);
  } catch (ExecException e) {
    logger.error("Error in reading baglimit:", e);
    throw e;
  }

  DataByteArray dba;
  try {
    dba = (DataByteArray) obj;
  } catch (ClassCastException e) {
    logger.error(
        "Error in casting Object (" + input.getType(1) + ") to DataByteArray:", e);
    throw e;
  }

  DocumentMetadata dm;
  try {
    dm = DocumentMetadata.parseFrom(dba.get());
  } catch (InvalidProtocolBufferException e) {
    logger.error("Error in reading ByteArray to DocumentMetadata:", e);
    throw e;
  }

  String key = dm.getKey();
  TupleFactory tupleFactory = TupleFactory.getInstance();
  DataBag db = new DefaultDataBag();
  int bagsize = 0;
  for (ClassifCode code : dm.getBasicMetadata().getClassifCodeList()) {
    for (String co_str : code.getValueList()) {
      bagsize++;
      db.add(tupleFactory.newTuple(co_str));
    }
  }

  if (bagsize > limnum) {
    Object[] to = new Object[] {key, db, bagsize};
    return tupleFactory.newTuple(Arrays.asList(to));
  }
  return null;
}
@Override public DataBag exec(Tuple input) throws IOException { retrieveContextValues(); ArrayList<String> joinKeyNames = new ArrayList<String>(); for (int i = 1; i < input.size(); i += 2) { joinKeyNames.add((String) input.get(i)); } JoinCollector collector = new JoinCollector(); // the first bag is the outer bag String leftBagName = bagNames.get(0); DataBag leftBag = getBag(input, leftBagName); String leftBagJoinKeyName = getPrefixedAliasName(bagNameToJoinKeyPrefix.get(leftBagName), joinKeyNames.get(0)); collector.setJoinData(collector.groupTuples(leftBag, leftBagJoinKeyName)); // now, for each additional bag, group up the tuples by the join key, then join them in if (bagNames.size() > 1) { for (int i = 1; i < bagNames.size(); i++) { String bagName = bagNames.get(i); DataBag bag = getBag(input, bagName); String joinKeyName = getPrefixedAliasName(bagNameToJoinKeyPrefix.get(bagName), joinKeyNames.get(i)); int tupleSize = bagNameToSize.get(bagName); if (bag == null) throw new IOException( "Error in instance: " + getInstanceName() + " with properties: " + getInstanceProperties() + " and tuple: " + input.toDelimitedString(", ") + " -- Expected bag, got null"); HashMap<Object, List<Tuple>> groupedData = collector.groupTuples(bag, joinKeyName); // outer join, so go back in and add nulls; groupedData = collector.insertNullTuples(groupedData, tupleSize); for (Map.Entry<Object, List<Tuple>> entry : groupedData.entrySet()) { collector.joinTuples(entry.getKey(), entry.getValue()); } } } // assemble output bag DataBag outputBag = BagFactory.getInstance().newDefaultBag(); for (List<Tuple> tuples : collector.getJoinData().values()) { for (Tuple tuple : tuples) { outputBag.add(tuple); } } return outputBag; }
private static Tuple buildInitialTupleForTheRow(Tuple input) throws ExecException { int numberOfTheColumns = 0; Tuple row = null; if (null == input) { return null; } else if (input.get(0) instanceof DataBag) { DataBag values = (DataBag) input.get(0); Iterator<Tuple> it = values.iterator(); row = it.next(); numberOfTheColumns = row.size(); } else { numberOfTheColumns = input.size(); row = input; } Tuple vaTuple = initTuple(numberOfTheColumns); // 0 1 2 3 4 5 // 2*3/2+2*2=7 // x0,x1->sumx0,sumx1,sum(x0*x0),sum(x0x1),sum(x1*x1) int i6 = -6; for (int i = 0; i < numberOfTheColumns; i++) { for (int j = i + 1; j < numberOfTheColumns; j++) { i6 += 6; // Jeff: to fix pivotal41573093:Although x or y is null,we can calculate the count. // count increaseTheValueOfElInTheTupleBy(vaTuple, i6, 1); if (null == row.get(i) || null == row.get(j)) { continue; } Double x = DataType.toDouble(row.get(i)); Double y = DataType.toDouble(row.get(j)); // value x increaseTheValueOfElInTheTupleBy(vaTuple, i6 + 1, x); // value y increaseTheValueOfElInTheTupleBy(vaTuple, i6 + 2, y); // value xx increaseTheValueOfElInTheTupleBy(vaTuple, i6 + 3, x * x); // value yy increaseTheValueOfElInTheTupleBy(vaTuple, i6 + 4, y * y); // value xy increaseTheValueOfElInTheTupleBy(vaTuple, i6 + 5, x * y); } } return vaTuple; }
@Override public Tuple exec(Tuple input) throws IOException { // Since Initial is guaranteed to be called // only in the map, it will be called with an // input of a bag with a single tuple - the // count should always be 1 if bag is non empty DataBag bag = (DataBag) input.get(0); Iterator<Tuple> it = bag.iterator(); Tuple t = null; if (it.hasNext()) { t = (Tuple) it.next(); } return mTupleFactory.newTuple((Object) MurmurHash.hash64(t)); }
protected static Tuple combine(DataBag values) throws ExecException { long sum = 0; long count = 0; // combine is called from Intermediate and Final // In either case, Initial would have been called // before and would have sent in valid tuples // Hence we don't need to check if incoming bag // is empty Tuple output = mTupleFactory.newTuple(2); boolean sawNonNull = false; for (Iterator<Tuple> it = values.iterator(); it.hasNext(); ) { Tuple t = it.next(); Long l = (Long) t.get(0); // we count nulls in avg as contributing 0 // a departure from SQL for performance of // COUNT() which implemented by just inspecting // size of the bag if (l == null) { l = 0L; } else { sawNonNull = true; } sum += l; count += (Long) t.get(1); } if (sawNonNull) { output.set(0, Long.valueOf(sum)); } else { output.set(0, null); } output.set(1, Long.valueOf(count)); return output; }
/**
 * Writes {@code MARKER} into the last field of a row when that field is null.
 *
 * <p>The row is the first tuple of the bag in field 0 of {@code input} when
 * that field is a bag, otherwise {@code input} itself. A null {@code input} is
 * ignored.
 *
 * @param input the row, or a tuple wrapping a bag that contains the row
 * @throws ExecException if a field cannot be read or written
 */
private static void markTheTuple(Tuple input) throws ExecException {
  if (input == null) {
    return;
  }

  Tuple row;
  if (input.get(0) instanceof DataBag) {
    DataBag wrapped = (DataBag) input.get(0);
    row = wrapped.iterator().next();
  } else {
    row = input;
  }

  int lastIndex = row.size() - 1;
  if (row.get(lastIndex) == null) {
    row.set(lastIndex, MARKER);
  }
}
/**
 * Runs a skewed join with {@code parallel 1} and drains its result; the test
 * fails if any exception is thrown while doing so.
 */
public void testSkewedJoinReducers() throws IOException {
  pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
  pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
  try {
    DataBag joined = BagFactory.getInstance().newDefaultBag();
    pigServer.registerQuery("C = join A by id, B by id using \"skewed\" parallel 1;");
    for (Iterator<Tuple> rows = pigServer.openIterator("C"); rows.hasNext(); ) {
      joined.add(rows.next());
    }
  } catch (Exception e) {
    fail("Should not throw exception, should continue execution");
  }
}
/**
 * Produces one tuple per author of a serialised document.
 *
 * <p>Field 0 of {@code input} is a {@link DataByteArray} parsed as a
 * {@link DocumentWrapper}. For the author at position {@code i} the output
 * contains {@code (surname, serialised document metadata, i)}.
 *
 * @param input tuple whose first field holds the serialised document
 * @return bag of author tuples, or {@code null} for a null or empty input tuple
 * @throws IOException wrapping any failure; the message contains the extracted
 *     stack trace as before, and the original exception is now attached as the cause
 */
@Override
public DataBag exec(Tuple input) throws IOException {
  if (input == null || input.size() == 0) {
    return null;
  }
  try {
    DataByteArray dba = null;
    try {
      dba = (DataByteArray) input.get(0);
    } catch (ExecException e) {
      logger.error("Error in reading field:", e);
      throw e;
    }

    DocumentWrapper dm = null;
    try {
      dm = DocumentWrapper.parseFrom(dba.get());
    } catch (Exception e) {
      logger.error("Error in reading ByteArray to DocumentMetadata:", e);
      throw e;
    }

    DataBag ret = new DefaultDataBag();
    // The same serialised metadata instance is shared by every author tuple.
    DataByteArray metadata = new DataByteArray(dm.getDocumentMetadata().toByteArray());
    List<Author> authors = dm.getDocumentMetadata().getBasicMetadata().getAuthorList();
    for (int i = 0; i < authors.size(); i++) {
      String sname = authors.get(i).getSurname();
      Object[] to = new Object[] {sname, metadata, i};
      Tuple t = TupleFactory.getInstance().newTuple(Arrays.asList(to));
      ret.add(t);
    }
    return ret;
  } catch (Exception e) {
    logger.error("Error in processing input row:", e);
    // Keep the cause so callers can inspect the real exception, not just its text.
    throw new IOException(
        "Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e), e);
  }
}
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; // Output bag DataBag bagOfTokens = bagFactory.newDefaultBag(); StringReader textInput = new StringReader(input.get(0).toString()); PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext(); ) { label = (CoreLabel) ptbt.next(); if (label.value().length() > 2) { System.err.println(label.toString()); Tuple termText = tupleFactory.newTuple(label.word()); bagOfTokens.add(termText); } } return bagOfTokens; }
/**
 * Feeds every non-null, non-empty tuple of the bag in field 0 of {@code b} to
 * {@code accumulate(Tuple, this)}.
 *
 * @param b tuple whose first field is the bag of values to accumulate
 * @throws IOException an {@link ExecException}: rethrown unchanged when raised while
 *     processing, otherwise created with code 2106 to wrap any other failure
 */
@Override
public void accumulate(Tuple b) throws IOException {
  try {
    DataBag bag = (DataBag) b.get(0);
    for (Tuple row : bag) {
      if (row == null || row.size() <= 0) {
        continue;
      }
      accumulate(row, this);
    }
  } catch (ExecException ee) {
    throw ee;
  } catch (Exception e) {
    String msg = "Error while computing min in " + this.getClass().getSimpleName();
    throw new ExecException(msg, 2106, PigException.BUG, e);
  }
}
@Override public DataBag exec(Tuple input) throws IOException { DataBag bagOfSamples = (DataBag) input.get(0); for (Tuple innerTuple : bagOfSamples) { DataBag samples = (DataBag) innerTuple.get(0); for (Tuple sample : samples) { // use the same score as previously generated getReservoir().consider(ScoredTuple.fromIntermediateTuple(sample)); } } DataBag output = BagFactory.getInstance().newDefaultBag(); for (ScoredTuple scoredTuple : getReservoir()) { // output the original tuple output.add(scoredTuple.getTuple()); } return output; }
/**
 * Binds the operator to its input: takes the first block of {@code input},
 * calls {@code init} with the operator JSON and that block's schema, and
 * prepares {@code nullBag} as a bag containing one zero-field tuple.
 *
 * @param input named input blocks; only the first value is used
 * @param operatorJson operator configuration passed on to {@code init}
 * @param props block properties (not used by this method)
 */
@Override
public void setInput(Map<String, Block> input, JsonNode operatorJson, BlockProperties props)
    throws IOException, InterruptedException {
  Block firstBlock = input.values().iterator().next();
  inputBlock = firstBlock;
  init(operatorJson, firstBlock.getProperties().getSchema());

  nullBag = BagFactory.getInstance().newDefaultBag();
  Tuple emptyTuple = TupleFactory.getInstance().newTuple(0);
  nullBag.add(emptyTuple);
}
/**
 * Attempts a three-way skewed join and expects it to be rejected: the test
 * passes only if registering or iterating the join throws an exception.
 */
public void testSkewedJoin3Way() throws IOException {
  pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
  pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
  pigServer.registerQuery("C = LOAD '" + INPUT_FILE3 + "' as (id, name);");

  boolean rejected = false;
  try {
    DataBag joined = BagFactory.getInstance().newDefaultBag();
    pigServer.registerQuery(
        "D = join A by id, B by id, C by id using \"skewed\" parallel 5;");
    for (Iterator<Tuple> rows = pigServer.openIterator("D"); rows.hasNext(); ) {
      joined.add(rows.next());
    }
  } catch (Exception e) {
    // Expected: three-way skewed joins are not supported.
    rejected = true;
  }

  if (!rejected) {
    fail("Should throw exception, do not support 3 way join");
  }
}
/**
 * Self-joins {@code INPUT_FILE5} with a skewed join and drains the result; any
 * exception is printed and fails the test.
 */
public void testSkewedJoinNullKeys() throws IOException {
  pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);");
  pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);");
  try {
    DataBag joined = BagFactory.getInstance().newDefaultBag();
    pigServer.registerQuery("C = join A by id, B by id using \"skewed\";");
    for (Iterator<Tuple> rows = pigServer.openIterator("C"); rows.hasNext(); ) {
      joined.add(rows.next());
    }
  } catch (Exception e) {
    System.out.println(e.getMessage());
    e.printStackTrace();
    fail("Should support null keys in skewed join");
  }
}