/**
 * Initial sampling pass: scores each item with a uniform random draw; items scoring
 * below q1 are selected outright, items between q1 and q2 are wait-listed for the
 * combine stage.
 *
 * @param input tuple whose first field is a bag of candidate items (may be null)
 * @return tuple of (item count, selected bag, wait-listed bag); empty tuple if the bag is null
 * @throws IOException on tuple access errors
 */
@Override
public Tuple exec(Tuple input) throws IOException {
  Tuple result = tupleFactory.newTuple();
  DataBag chosen = bagFactory.newDefaultBag();
  DataBag waitList = bagFactory.newSortedBag(new ScoredTupleComparator());
  DataBag items = (DataBag) input.get(0);
  if (items != null) {
    long count = items.size();
    double lowThreshold = getQ1(count, _samplingProbability);
    double highThreshold = getQ2(count, _samplingProbability);
    for (Tuple item : items) {
      double score = _rdg.nextUniform(0.0d, 1.0d);
      if (score < lowThreshold) {
        chosen.add(item);
      } else if (score < highThreshold) {
        waitList.add(new ScoredTuple(score, item).getIntermediateTuple(tupleFactory));
      }
    }
    result.append(count);
    result.append(chosen);
    result.append(waitList);
  }
  return result;
}
/**
 * Reconstructs the full "value" tuple for the given input index, re-attaching any value
 * fields that POLocalRearrange moved into the shuffle key.
 *
 * <p>Hadoop reuses the same {@code NullableTuple} instance for every value, so the
 * result is always a freshly built copy.
 *
 * @param ntup the (possibly partial) value as received from the shuffle
 * @param index input index used to look up this input's key-field mapping in {@code keyInfo}
 * @return a new tuple containing the stitched-together value
 * @throws ExecException on tuple access errors
 */
protected Tuple getValueTuple(NullableTuple ntup, int index) throws ExecException {
  // Need to make a copy of the value, as hadoop uses the same ntup
  // to represent each value.
  Tuple val = (Tuple) ntup.getValueAsPigType();
  Tuple copy = null;
  // The "value (val)" that we just got may not
  // be the complete "value". It may have some portions
  // in the "key" (look in POLocalRearrange for more comments)
  // If this is the case we need to stitch
  // the "value" together.
  Pair<Boolean, Map<Integer, Integer>> lrKeyInfo = keyInfo.get(index);
  boolean isProjectStar = lrKeyInfo.first;
  Map<Integer, Integer> keyLookup = lrKeyInfo.second;
  int keyLookupSize = keyLookup.size();
  if (keyLookupSize > 0) {
    // we have some fields of the "value" in the
    // "key".
    copy = mTupleFactory.newTuple();
    // Final width = fields held in the key + fields present in the partial value.
    int finalValueSize = keyLookupSize + val.size();
    int valIndex = 0; // an index for accessing elements from
    // the value (val) that we have currently
    for (int i = 0; i < finalValueSize; i++) {
      // keyLookup maps final-position -> position within the key; null means
      // this position comes from the partial value instead.
      Integer keyIndex = keyLookup.get(i);
      if (keyIndex == null) {
        // the field for this index is not in the
        // key - so just take it from the "value"
        // we were handed
        copy.append(val.get(valIndex));
        valIndex++;
      } else {
        // the field for this index is in the key
        if (isKeyTuple) {
          // the key is a tuple, extract the
          // field out of the tuple
          copy.append(keyAsTuple.get(keyIndex));
        } else {
          // scalar key: the key object itself is the field
          copy.append(key);
        }
      }
    }
  } else if (isProjectStar) {
    // the whole "value" is present in the "key"
    copy = mTupleFactory.newTuple(keyAsTuple.getAll());
  } else {
    // there is no field of the "value" in the
    // "key" - so just make a copy of what we got
    // as the "value"
    copy = mTupleFactory.newTuple(val.getAll());
  }
  return copy;
}
@Before public void setUp() throws Exception { // New args and tuple to fuse for each test: arg = new DefaultTuple(); arg.append(null); arg.append(null); arg.append(null); toFuse = new DefaultTuple(); }
@Override public void map( LongWritable key, Text value, OutputCollector<BytesWritable, Tuple> output, Reporter reporter) throws IOException { // value should contain "word count" String[] wdct = value.toString().split(" "); if (wdct.length != 2) { // LOG the error return; } byte[] word = wdct[0].getBytes(); bytesKey.set(word, 0, word.length); System.out.println("word: " + new String(word)); tupleRow.set(0, new String(word)); tupleRow.set(1, Integer.parseInt(wdct[1])); System.out.println("count: " + Integer.parseInt(wdct[1])); // This key has to be created by user /* * Tuple userKey = new DefaultTuple(); userKey.append(new String(word)); * userKey.append(Integer.parseInt(wdct[1])); */ System.out.println("in map, sortkey: " + sortKey); Tuple userKey = new ZebraTuple(); if (sortKey.equalsIgnoreCase("word,count")) { userKey.append(new String(word)); userKey.append(Integer.parseInt(wdct[1])); } if (sortKey.equalsIgnoreCase("count")) { userKey.append(Integer.parseInt(wdct[1])); } if (sortKey.equalsIgnoreCase("word")) { userKey.append(new String(word)); } try { /* New M/R Interface */ /* Converts user key to zebra BytesWritable key */ /* using sort key expr tree */ /* Returns a java base object */ /* Done for each user key */ bytesKey = BasicTableOutputFormat.getSortKey(javaObj, userKey); } catch (Exception e) { } output.collect(bytesKey, tupleRow); }
/**
 * Broadcasts a cross-task taint message (the keys tuple followed by each tag) to every
 * cross-task downstream neighbor, asynchronously.
 */
private void setCrossTaskDownstreamTaint(Tuple keys, Set<String> tags) {
  for (String neighbor : crossTaskDownstreamNeighbors) {
    // Body layout: "cross", the keys tuple, then one field per tag.
    Tuple payload = new DefaultTuple();
    payload.append("cross");
    payload.append(keys);
    for (String tag : tags) {
      payload.append(tag);
    }
    Message msg =
        new Message(Message.Type.TAINT, location, new LogicalLocation(neighbor), payload);
    senderReceiver.sendAsync(msg);
  }
}
/** * test sedes of int of diff sizes * * @throws IOException */ @Test public void testTupleWriteReadIntDiffSizes() throws IOException { // create a tuple with integer columns of different sizes Tuple tuple = TupleFactory.getInstance().newTuple(); tuple.append(new Integer(0)); // boolean rep tuple.append(new Integer(1)); // boolean rep tuple.append(new Integer(125)); // fits into byte tuple.append(new Integer(1024)); // fits into short tuple.append(new Integer(1024 * 1024 * 1024)); // fits into int (=~ 2 ^30) testTupleSedes(tuple); }
@Test public void testGoodArgs() throws IOException { toFuse.append("notIncluded"); toFuse.append("foo"); toFuse.append("bar"); // Fuse cols 1 to end of tuple: arg.set(ConcatColumns.SLICE_SPEC_POS, "1:4"); arg.set(ConcatColumns.TUPLE_TO_FUSE_POS, toFuse); String fusedStr = new ConcatColumns().exec(arg); assertEquals("foobar", fusedStr); arg.set(ConcatColumns.SLICE_SPEC_POS, "1:1"); fusedStr = new ConcatColumns().exec(arg); assertEquals("foo", fusedStr); arg.set(ConcatColumns.SLICE_SPEC_POS, ":1"); fusedStr = new ConcatColumns().exec(arg); assertEquals("notIncluded", fusedStr); arg.set(ConcatColumns.SLICE_SPEC_POS, "0:"); fusedStr = new ConcatColumns().exec(arg); assertEquals("notIncludedfoobar", fusedStr); arg.set(ConcatColumns.SLICE_SPEC_POS, ":"); fusedStr = new ConcatColumns().exec(arg); assertEquals("notIncludedfoobar", fusedStr); arg.set(ConcatColumns.SLICE_SPEC_POS, "1:-1"); fusedStr = new ConcatColumns().exec(arg); assertEquals("foo", fusedStr); arg.set(ConcatColumns.SLICE_SPEC_POS, ":-1"); fusedStr = new ConcatColumns().exec(arg); assertEquals("notIncludedfoo", fusedStr); arg.set(ConcatColumns.SLICE_SPEC_POS, ":-2"); fusedStr = new ConcatColumns().exec(arg); assertEquals("notIncluded", fusedStr); arg.set(ConcatColumns.SLICE_SPEC_POS, ":-3"); fusedStr = new ConcatColumns().exec(arg); assertEquals("notIncluded", fusedStr); arg.set(ConcatColumns.SLICE_SPEC_POS, "1:2"); fusedStr = new ConcatColumns().exec(arg); assertEquals("foo", fusedStr); arg.set(ConcatColumns.SLICE_SPEC_POS, "1:3"); arg.set(ConcatColumns.CONCAT_SEPARATOR_POS, "|"); fusedStr = new ConcatColumns().exec(arg); assertEquals("foo|bar", fusedStr); }
/** * test sedes with maps of diff sizes * * @throws IOException */ @Test public void testTupleWriteReadMapDiffSizes() throws IOException { // tuple with ByteArray and strings of different sizes Tuple tuple = TupleFactory.getInstance().newTuple(); Map<String, Object> tinyMap = createMap(10); Map<String, Object> smallMap = createMap(1000); Map<String, Object> largeMap = createMap(100 * 1024); tuple.append(tinyMap); tuple.append(smallMap); tuple.append(largeMap); testTupleSedes(tuple); }
/** * test sedes with bags of diff sizes * * @throws IOException */ @Test public void testTupleWriteReadBagDiffSizes() throws IOException { // tuple with ByteArray and strings of different sizes Tuple tuple = TupleFactory.getInstance().newTuple(); DataBag tinyBag = createBag(10); DataBag smallBag = createBag(1000); DataBag largeBag = createBag(100 * 1024); tuple.append(tinyBag); tuple.append(smallBag); tuple.append(largeBag); testTupleSedes(tuple); }
/**
 * If the tag set differs from the currently recorded taint, sends a "within"-scoped taint
 * message to each within-task downstream neighbor and records the new set.
 */
private void setWithinTaskDownstreamTaint(Set<String> tags) {
  if (withinTaskDownstreamTaint.equals(tags)) {
    return; // unchanged - nothing to propagate
  }
  // Body layout: "within" followed by one field per tag.
  Tuple payload = new DefaultTuple();
  payload.append("within");
  for (String tag : tags) {
    payload.append(tag);
  }
  for (String neighbor : withinTaskDownstreamNeighbors) {
    sendWithinTaskMessage(
        neighbor,
        new Message(Message.Type.TAINT, location, new LogicalLocation(neighbor), payload));
  }
  // NOTE(review): stores the caller's set by reference (no defensive copy); if callers
  // mutate `tags` afterwards the recorded taint changes too — confirm this is intended.
  withinTaskDownstreamTaint = tags;
}
/**
 * Compares two sketch tuples and emits their similarity estimate:
 * (2 * number of equal pairs) / (2 * |A|) — i.e. matches per element of the first sketch.
 *
 * <p>Fixes: removed two unused {@code List<Integer>} locals, and widened the input casts
 * from {@code DefaultTuple} to the {@code Tuple} interface (any Tuple implementation works
 * here; the narrow cast could throw ClassCastException for other implementations).
 *
 * @param input tuple whose first two fields are the sketch tuples to compare
 * @return a single-field tuple containing the similarity as a double
 * @throws IOException on tuple access errors
 */
@Override
public Tuple exec(Tuple input) throws IOException {
  TupleFactory tFactory = TupleFactory.getInstance();
  Tuple oTuple = tFactory.newTuple();
  Tuple sketchTupleA = (Tuple) input.get(0);
  Tuple sketchTupleB = (Tuple) input.get(1);
  List<Object> fieldsA = sketchTupleA.getAll();
  List<Object> fieldsB = sketchTupleB.getAll();
  int count = fieldsA.size() * 2;
  int match = 0;
  // O(|A| * |B|) pairwise comparison; each equal pair contributes 2 to the numerator.
  for (int i = 0; i < fieldsA.size(); i++) {
    int a = (Integer) fieldsA.get(i);
    for (int j = 0; j < fieldsB.size(); j++) {
      int b = (Integer) fieldsB.get(j);
      if (a == b) {
        match += 2;
      }
    }
  }
  double sim = (double) match / (double) count;
  oTuple.append(sim);
  return oTuple;
}
/** Projects the given key fields out of {@code t} into a new key tuple, in order. */
private static Tuple extractKeys(Tuple t, List<Integer> keyFields) throws ExecException {
  Tuple keyTuple = new DefaultTuple();
  for (int fieldIndex : keyFields) {
    keyTuple.append(t.get(fieldIndex));
  }
  return keyTuple;
}
/**
 * Builds a tuple with exactly {@code size} columns, each holding Integer 1.
 *
 * <p>Fix: the previous version pre-sized the tuple with {@code newTuple(size)} (which
 * fills it with {@code size} null fields) and then APPENDED {@code size} more values,
 * producing a 2*size-column tuple whose first half was null. The pre-sized slots are
 * now filled in place with {@code set}.
 */
private Tuple createTupleWithManyCols(int size) {
  Tuple t = TupleFactory.getInstance().newTuple(size);
  Integer col = Integer.valueOf(1);
  for (int i = 0; i < size; i++) {
    try {
      t.set(i, col);
    } catch (Exception e) {
      // set() declares a checked exception the original signature cannot propagate;
      // an out-of-range index here is a programming error, so fail fast.
      throw new RuntimeException(e);
    }
  }
  return t;
}
// Construct a tuple that represents this json: // {"stacks":[[[4,3],[2,1]], [[1,2],[3,4]]]} public Tuple getTestTuple() { TupleFactory tupleFactory = TupleFactory.getInstance(); Tuple tAll = tupleFactory.newTuple(); Tuple t1 = tupleFactory.newTuple(); Tuple t1a = tupleFactory.newTuple(); t1a.append(4); t1a.append(3); Tuple t1b = tupleFactory.newTuple(); t1b.append(2); t1b.append(1); t1.append(t1a); t1.append(t1b); Tuple t2 = tupleFactory.newTuple(); Tuple t2a = tupleFactory.newTuple(); t2a.append(1); t2a.append(2); Tuple t2b = tupleFactory.newTuple(); t2b.append(3); t2b.append(4); t2.append(t2a); t2.append(t2b); tAll.append(t1); tAll.append(t2); return tAll; }
/**
 * create bag having given number of tuples
 *
 * @param size number of entries the bag should contain
 * @return bag with {@code size} entries
 */
private DataBag createBag(int size) {
  // All entries reference the same single-column tuple instance,
  // matching the original behavior.
  Tuple shared = TupleFactory.getInstance().newTuple();
  shared.append(Integer.valueOf(1));
  DataBag bag = BagFactory.getInstance().newDefaultBag();
  int remaining = size;
  while (remaining-- > 0) {
    bag.add(shared);
  }
  return bag;
}
@Test(expected = IOException.class) public void testBadSliceDefs() throws IOException { toFuse.append("foo"); arg.set(ConcatColumns.SLICE_SPEC_POS, "3:1"); arg.set(ConcatColumns.CONCAT_SEPARATOR_POS, ""); arg.set(ConcatColumns.TUPLE_TO_FUSE_POS, toFuse); // Start > end: new ConcatColumns().exec(arg); }
/**
 * Cross-joins the tuples already stored for {@code key} with the newly arrived ones,
 * replacing the stored list with the concatenated results. A key with no prior tuples
 * is left untouched; a null {@code tuples} list yields an empty result for the key.
 */
public void joinTuples(Object key, List<Tuple> tuples) throws ExecException {
  List<Tuple> existing = joinData.get(key);
  if (existing == null) {
    return; // key unseen so far - nothing to join against
  }
  List<Tuple> joined = new LinkedList<Tuple>();
  if (tuples != null) {
    for (Tuple left : existing) {
      for (Tuple right : tuples) {
        // Concatenate left's fields followed by right's into a fresh tuple.
        Tuple combined = TupleFactory.getInstance().newTuple();
        for (Object field : left.getAll()) {
          combined.append(field);
        }
        for (Object field : right.getAll()) {
          combined.append(field);
        }
        joined.add(combined);
      }
    }
  }
  joinData.put(key, joined);
}
/** * test sedes of bytearray, string of diff sizes * * @throws IOException */ @Test public void testTupleWriteReadByteArrStringDiffSizes() throws IOException { // tuple with ByteArray and strings of different sizes Tuple tuple = TupleFactory.getInstance().newTuple(); byte[] tinyBA = new byte[10]; byte[] smallBA = new byte[1000]; byte[] largeBytearray = new byte[80000]; // init large bytearray with non 0 values, its going to be used as // string as well for (int i = 0; i < largeBytearray.length; i++) { largeBytearray[i] = '1'; } tuple.append(new DataByteArray(tinyBA)); tuple.append(new DataByteArray(smallBA)); tuple.append(new DataByteArray(largeBytearray)); testTupleSedes(tuple); // add strings of different sizes tuple = TupleFactory.getInstance().newTuple(); tuple.append(new String("")); tuple.append(new String("x")); // string larger than 32k tuple.append(new String(largeBytearray)); testTupleSedes(tuple); }
/**
 * Flattens an array of tuples into a single tuple holding all their fields in order,
 * then runs it through illustratorMarkup before returning.
 */
private Tuple createTuple(Tuple[] data) throws ExecException {
  Tuple flattened = TupleFactory.getInstance().newTuple();
  for (Tuple source : data) {
    int fieldCount = source.size();
    for (int f = 0; f < fieldCount; f++) {
      flattened.append(source.get(f));
    }
  }
  return illustratorMarkup(flattened, flattened, 0);
}
@Test public void test() throws IOException, URISyntaxException { // Prepare Resource File URL metricRes = EndpointGroupsTest.class.getResource("/avro/poem_sync_v2.avro"); File metricAvro = new File(metricRes.toURI()); UnwindServiceMetrics uw = new UnwindServiceMetrics("", "test"); uw.mpsMgr.loadAvro(metricAvro); TupleFactory tf = TupleFactory.getInstance(); Tuple inTuple = tf.newTuple(); inTuple.append("SRMv2"); inTuple.append("se01.afroditi.hellasgrid.gr"); String jsonStr = IOUtils.toString(this.getClass().getResourceAsStream("/ar/missing_endpoint.json"), "UTF-8"); Tuple expTuple = JsonToPig.jsonToTuple(jsonStr); Tuple outTuple = uw.exec(inTuple); assertTrue(expTuple.toString().equals(outTuple.toString())); }
/*
 * test sedes of long of diff sizes
 *
 * Uses Long.valueOf instead of the deprecated Long constructor. The fixed boundary
 * values exercise each compact long encoding; the seeded Random keeps the bulk
 * section deterministic.
 *
 * @throws IOException
 */
@Test
public void testTupleWriteReadLongDiffSizes() throws IOException {
  Random r = new Random(100L);
  Tuple tuple = TupleFactory.getInstance().newTuple();
  long[] boundaries = {
    0L, 1L, -1L, 300L, 600L, 10000L, -10000L, 5000000000000000000L, -5000000000000000000L
  };
  for (long v : boundaries) {
    tuple.append(Long.valueOf(v));
  }
  for (int i = 0; i < 100000; i++) {
    tuple.append(Long.valueOf(r.nextLong()));
  }
  testTupleSedes(tuple);
}
/** exec must decode the nested-array JSON into tuples identical to getTestTuple(). */
@Test
public void testExecNestedTuple() throws IOException {
  Tuple input = tupleFactory.newTuple();
  input.append("{\"stacks\":[[[4,3],[2,1]], [[1,2],[3,4]]]}");
  Map<String, Object> myMap = jsonMap.exec(input);
  Tuple stacks = (Tuple) myMap.get("stacks");
  System.out.println(stacks);

  Tuple reference = getTestTuple();
  assertEquals(reference.toString(), stacks.toString());
  assertEquals(reference.size(), stacks.size());
  // Compare element-by-element for precise failure locations.
  for (int outer = 0; outer < reference.size(); outer++) {
    Tuple expected = (Tuple) reference.get(outer);
    Tuple actual = (Tuple) stacks.get(outer);
    assertEquals(expected.size(), actual.size());
    for (int inner = 0; inner < expected.size(); inner++) {
      System.out.println("Checking if " + expected.get(inner) + " == " + actual.get(inner));
      assertEquals(expected.get(inner), actual.get(inner));
    }
  }
}
/**
 * Extracts (doi, year, normalized title) from a serialized DocumentWrapper stored as a
 * DataByteArray in the input tuple's first field.
 *
 * <p>Each extraction stage is best-effort: failures increment a named counter under
 * "extraction problems" and make the UDF return null (the finally blocks below perform
 * the null/blank checks regardless of whether the stage threw).
 *
 * @param input tuple whose first field is the serialized document; may be null/empty
 * @return tuple of (doi, year, title), or null when any required field is unavailable
 * @throws IOException only for unexpected failures outside the guarded stages
 */
@Override
public Tuple exec(Tuple input) throws IOException {
  myreporter = PigStatusReporter.getInstance();
  if (input == null || input.size() == 0) {
    return null;
  }
  try {
    DataByteArray dba = null;
    DocumentMetadata dm = null;
    String title = null;
    String doi = null;
    String year = null;
    try {
      dba = (DataByteArray) input.get(0);
    } catch (Exception e) {
      myreporter.getCounter("extraction problems", "DataByteArray from tuple");
      return null;
    }
    try {
      dm = DocumentWrapper.parseFrom(dba.get()).getDocumentMetadata();
    } catch (Exception e) {
      myreporter.getCounter("extraction problems", "document metadata");
      return null;
    }
    try {
      // Prefer an English title; otherwise fall back to the first title on record.
      for (TextWithLanguage twl : dm.getBasicMetadata().getTitleList()) {
        if (twl.getLanguage().toLowerCase().startsWith("en")) {
          title = twl.getText();
          break;
        }
      }
      if (title == null) {
        title = dm.getBasicMetadata().getTitle(0).getText();
      }
      if (title != null && !title.trim().isEmpty()) {
        // Normalize: strip diacritics, keep only [A-Za-z0-9-_], collapse whitespace.
        title = DiacriticsRemover.removeDiacritics(title);
        title = title.replaceAll("[^A-Za-z0-9\\-_]", " ").replaceAll("\\s++", " ").trim();
      }
    } catch (Exception e) {
      // deliberately ignored: the finally block handles a missing/blank title
    } finally {
      if (title == null || title.trim().isEmpty()) {
        myreporter.getCounter("extraction problems", "title extraction");
        return null;
      }
    }
    try {
      doi = dm.getBasicMetadata().getDoi().replaceAll("\\s++", " ").trim();
    } catch (Exception e) {
      // deliberately ignored: the finally block handles a missing/blank doi
    } finally {
      if (doi == null || doi.trim().isEmpty()) {
        myreporter.getCounter("extraction problems", "doi extraction");
        return null;
      }
    }
    try {
      year = dm.getBasicMetadata().getYear().replaceAll("\\s++", " ").trim();
    } catch (Exception e) {
      // deliberately ignored: the finally block handles a missing/blank year
    } finally {
      if (year == null || year.trim().isEmpty()) {
        myreporter.getCounter("extraction problems", "year extraction");
        return null;
      }
    }
    // All three fields present: emit (doi, year, title).
    Tuple t = TupleFactory.getInstance().newTuple();
    t.append(doi);
    t.append(year);
    t.append(title);
    return t;
  } catch (Exception e) {
    logger.debug(StackTraceExtractor.getStackTrace(e));
    throw new IOException(e);
  }
}
private Tuple consumeTuple(PushbackInputStream in, ResourceFieldSchema fieldSchema) throws IOException { if (fieldSchema == null) { throw new IOException("Schema is null"); } int buf; ByteArrayOutputStream mOut; while ((buf = in.read()) != '(' || buf == '}') { if (buf == -1) { throw new IOException("Unexpect end of tuple"); } if (buf == '}') { in.unread(buf); return null; } } Tuple t = TupleFactory.getInstance().newTuple(); if (fieldSchema.getSchema() != null && fieldSchema.getSchema().getFields().length != 0) { ResourceFieldSchema[] fss = fieldSchema.getSchema().getFields(); // Interpret item inside tuple one by one based on the inner schema for (int i = 0; i < fss.length; i++) { Object field; ResourceFieldSchema fs = fss[i]; int delimit = ','; if (i == fss.length - 1) delimit = ')'; if (DataType.isComplex(fs.getType())) { field = consumeComplexType(in, fs); while ((buf = in.read()) != delimit) { if (buf == -1) { throw new IOException("Unexpect end of tuple"); } } } else { mOut = new ByteArrayOutputStream(BUFFER_SIZE); while ((buf = in.read()) != delimit) { if (buf == -1) { throw new IOException("Unexpect end of tuple"); } if (buf == delimit) break; mOut.write(buf); } field = parseSimpleType(mOut.toByteArray(), fs); } t.append(field); } } else { // No inner schema, treat everything inside tuple as bytearray Deque<Character> level = new LinkedList< Character>(); // keep track of nested tuple/bag/map. 
We do not interpret, save them as // bytearray mOut = new ByteArrayOutputStream(BUFFER_SIZE); while (true) { buf = in.read(); if (buf == -1) { throw new IOException("Unexpect end of tuple"); } if (buf == '[' || buf == '{' || buf == '(') { level.push((char) buf); mOut.write(buf); } else if (buf == ')' && level.isEmpty()) // End of tuple { DataByteArray value = new DataByteArray(mOut.toByteArray()); t.append(value); break; } else if (buf == ',' && level.isEmpty()) { DataByteArray value = new DataByteArray(mOut.toByteArray()); t.append(value); mOut.reset(); } else if (buf == ']' || buf == '}' || buf == ')') { if (level.peek() == findStartChar((char) buf)) level.pop(); else throw new IOException("Malformed tuple"); mOut.write(buf); } else mOut.write(buf); } } return t; }
/**
 * Combine pass of the sampler: merges partial (count, selected, wait-list) results,
 * promoting wait-listed items whose score falls below the thresholds implied by the
 * growing total count.
 *
 * <p>Both wait-list bags are sorted by score (ScoredTupleComparator), which is what
 * makes the early {@code break} on {@code score >= q2} correct.
 *
 * @param input tuple whose first field is a bag of partial result tuples
 *     (count, selected bag, wait-listed bag)
 * @return tuple of (total count, selected bag, wait-listed bag)
 * @throws IOException on tuple access errors
 */
@Override
public Tuple exec(Tuple input) throws IOException {
  DataBag bag = (DataBag) input.get(0);
  DataBag selected = bagFactory.newDefaultBag();
  DataBag aggWaiting = bagFactory.newSortedBag(new ScoredTupleComparator());
  DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());
  Tuple output = tupleFactory.newTuple();
  long n = 0L;
  for (Tuple innerTuple : bag) {
    n += (Long) innerTuple.get(0);
    selected.addAll((DataBag) innerTuple.get(1));
    // Thresholds are recomputed as n grows with each merged partial result.
    double q1 = getQ1(n, _samplingProbability);
    double q2 = getQ2(n, _samplingProbability);
    for (Tuple t : (DataBag) innerTuple.get(2)) {
      ScoredTuple scored = ScoredTuple.fromIntermediateTuple(t);
      if (scored.getScore() < q1) {
        // Below the select threshold: promote to selected.
        selected.add(scored.getTuple());
      } else if (scored.getScore() < q2) {
        // Still a candidate: keep on the aggregated wait list.
        aggWaiting.add(t);
      } else {
        // Sorted bag: all remaining scores are >= q2, so stop early.
        break;
      }
    }
  }
  // Re-filter the merged wait list with the final thresholds based on the total count.
  double q1 = getQ1(n, _samplingProbability);
  double q2 = getQ2(n, _samplingProbability);
  for (Tuple t : aggWaiting) {
    ScoredTuple scored = ScoredTuple.fromIntermediateTuple(t);
    if (scored.getScore() < q1) {
      selected.add(scored.getTuple());
    } else if (scored.getScore() < q2) {
      waiting.add(t);
    } else {
      // Sorted bag: nothing further can pass the q2 threshold.
      break;
    }
  }
  output.append(n);
  output.append(selected);
  output.append(waiting);
  System.err.println(
      "Read "
          + n
          + " items, selected "
          + selected.size()
          + ", and wait-listed "
          + aggWaiting.size()
          + ".");
  return output;
}
// Delegates appending straight to the wrapped tuple t.
@Override
public void append(Object val) {
  t.append(val);
}