public DataBag exec(Tuple input) throws IOException { try { if (!input.isNull()) { // Create the output like a databag {(res1,res2),(res3,res4)..} DataBag output_databag = mBagFactory.newDefaultBag(); // Unpack tuple in order to get the bag {(1,2),(3,4),...} String input_time = (String) input.get(0); try { DateFormat formatter = new SimpleDateFormat("MM/dd/yyyy kk:mm:ss"); Date date = formatter.parse( String.format( "%s/%s/%s %s:%s:%s", input_time.substring(5, 7), input_time.substring(8, 10), input_time.substring(0, 4), input_time.substring(11, 13), input_time.substring(14, 16), input_time.substring(17, 18))); Calendar calendar = Calendar.getInstance(); calendar.setTime(date); int dayOfWeek = calendar.get(Calendar.DAY_OF_WEEK); int dayOfMonth = calendar.get(Calendar.DAY_OF_MONTH); int hour = calendar.get(Calendar.HOUR_OF_DAY); // Add items to output Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, String.format("%d:%d:%d", dayOfWeek, dayOfMonth, hour)); output_databag.add(items); } catch (Exception e) { Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #1" + e.getMessage()); output_databag.add(items); return output_databag; } return output_databag; } else { DataBag output_databag = mBagFactory.newDefaultBag(); Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #2"); output_databag.add(items); return output_databag; } } catch (Exception e) { System.err.println("Error with ?? .."); DataBag output_databag = mBagFactory.newDefaultBag(); Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #3" + e.getMessage()); output_databag.add(items); return output_databag; } }
/**
 * Computes each class label's share of the total weight in the input bag.
 *
 * <p>For every tuple in the bag held by the first field of {@code input}, field 3 is read as
 * the class label ({@code String}) and field 2 as the weight ({@code Double}). Tuples whose
 * label is null or empty are ignored. The result holds one {@code (label, weight / total)}
 * tuple per distinct label, in the iteration order of the backing {@link HashMap}.
 *
 * <p>A null weight on a labelled tuple is not tolerated and fails with a
 * {@link NullPointerException} on unboxing. If the counted weights sum to zero, the division
 * produces {@code NaN} or an infinity.
 *
 * @param input tuple whose first field is the bag to summarise
 * @return bag of {@code (label, fraction)} tuples, or {@code null} when {@code input} is null,
 *     empty, or its first field is null
 * @throws IOException if a field cannot be read or written
 */
@Override
public DataBag exec(Tuple input) throws IOException {
  if (input == null || input.size() == 0) {
    return null;
  }
  DataBag bag = (DataBag) input.get(0);
  if (bag == null) {
    return null;
  }

  HashMap<String, Double> weightByClass = new HashMap<String, Double>();
  // Primitive accumulator: avoids re-boxing on every addition.
  double total = 0.0;
  for (Tuple item : bag) {
    String cls = (String) item.get(3);
    if (cls == null || cls.length() == 0) {
      continue;
    }
    double weight = (Double) item.get(2);
    Double soFar = weightByClass.get(cls);
    weightByClass.put(cls, soFar == null ? weight : soFar + weight);
    total += weight;
  }

  DataBag result = bagFactory.newDefaultBag();
  for (Entry<String, Double> entry : weightByClass.entrySet()) {
    Tuple tpl = tupleFactory.newTuple(2);
    tpl.set(0, entry.getKey());
    tpl.set(1, entry.getValue() / total);
    result.add(tpl);
  }
  return result;
}
/**
 * Merges partial sampling results and tops the sample up from the wait list.
 *
 * <p>Each tuple of the bag in {@code input}'s first field is read as
 * {@code (count: Long, selected: DataBag, waiting: DataBag)}. Counts are summed, selected
 * bags are concatenated, and waiting bags are merged into a bag sorted by
 * {@code ScoredTupleComparator}. The target size is
 * {@code ceil(_samplingProbability * totalCount)}; while the sample is short of it, tuples
 * are taken from the merged wait list in that bag's iteration order.
 *
 * @param input tuple whose first field is the bag of partial results
 * @return the final sample
 * @throws IOException if a field cannot be read
 */
@Override
public DataBag exec(Tuple input) throws IOException {
  DataBag partials = (DataBag) input.get(0);

  long totalCount = 0L;
  DataBag sample = bagFactory.newDefaultBag();
  DataBag candidates = bagFactory.newSortedBag(new ScoredTupleComparator());

  for (Tuple partial : partials) {
    totalCount += (Long) partial.get(0);
    sample.addAll((DataBag) partial.get(1));
    candidates.addAll((DataBag) partial.get(2));
  }

  long targetSize = (long) Math.ceil(_samplingProbability * totalCount);
  long shortfall = targetSize - sample.size();

  for (Tuple candidate : candidates) {
    if (shortfall > 0) {
      // Wait-listed entries are stored in intermediate form; unwrap before adding.
      sample.add(ScoredTuple.fromIntermediateTuple(candidate).getTuple());
      shortfall--;
    } else {
      break;
    }
  }

  return sample;
}
/**
 * Performs the first sampling pass over one bag of items.
 *
 * <p>Each item draws a uniform key in [0, 1) from {@code _rdg}. Items with a key below
 * {@code q1} are selected outright; items with a key in {@code [q1, q2)} are wait-listed
 * together with their key (as an intermediate {@code ScoredTuple}); the rest are dropped.
 * Both thresholds come from {@code getQ1}/{@code getQ2} for the bag size and
 * {@code _samplingProbability}.
 *
 * @param input tuple whose first field is the bag of items
 * @return {@code (itemCount, selectedBag, waitingBag)}, or an empty tuple when the bag is null
 * @throws IOException if a field cannot be read
 */
@Override
public Tuple exec(Tuple input) throws IOException {
  Tuple result = tupleFactory.newTuple();
  DataBag accepted = bagFactory.newDefaultBag();
  DataBag waitList = bagFactory.newSortedBag(new ScoredTupleComparator());

  DataBag items = (DataBag) input.get(0);
  if (items == null) {
    // Nothing to sample: hand back the empty tuple untouched.
    return result;
  }

  long itemCount = items.size();
  double acceptBelow = getQ1(itemCount, _samplingProbability);
  double waitBelow = getQ2(itemCount, _samplingProbability);

  for (Tuple item : items) {
    double key = _rdg.nextUniform(0.0d, 1.0d);
    if (key < acceptBelow) {
      accepted.add(item);
    } else if (key < waitBelow) {
      ScoredTuple scored = new ScoredTuple(key, item);
      waitList.add(scored.getIntermediateTuple(tupleFactory));
    }
  }

  result.append(itemCount);
  result.append(accepted);
  result.append(waitList);
  return result;
}
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() == 0) { return null; } String normStr = ((String) input.get(0)); if (normStr == null) { return null; } // Remove punctuation except when it's a version number normStr = punctPattern.matcher(normStr.trim().toLowerCase()).replaceAll(" "); normStr = spacePattern.matcher(normStr).replaceAll(" "); DataBag output = bagFactory.newDefaultBag(); for (String s : spacePattern.split(normStr.trim())) { if (s.length() <= 30) { Tuple t = tupleFactory.newTuple(1); t.set(0, s); output.add(t); } } return output; }
/**
 * Returns the tuples of the first bag that do not also appear in any of the other bags.
 *
 * <p>All fields of {@code input} must be bags (or null) and there must be at least two.
 * The bags are walked together through a priority queue built by {@code loadBags}; each
 * queue entry exposes the tuple it currently points at ({@code data}) and the position of
 * its source bag ({@code index}).
 * NOTE(review): the merge presumably relies on every input bag being sorted and on the
 * queue ordering entries by their current tuple; confirm against {@code loadBags} and
 * {@code Pair}, which are not shown here.
 *
 * @param input tuple of two or more bags; the first is the one to subtract from
 * @return a new bag with the surviving tuples; an empty bag when the first bag is null or
 *     empty; the first bag itself when there are exactly two inputs and the second is null
 *     or empty
 * @throws RuntimeException if fewer than two inputs are given or a non-null input is not a bag
 * @throws IOException declared by the signature; the visible code does not throw it directly
 */
@SuppressWarnings("unchecked")
@Override
public DataBag exec(Tuple input) throws IOException {
  if (input.size() < 2) {
    throw new RuntimeException("Expected at least two inputs, but found " + input.size());
  }
  for (Object o : input) {
    if (o != null && !(o instanceof DataBag)) {
      throw new RuntimeException("Inputs must be bags");
    }
  }
  DataBag outputBag = bagFactory.newDefaultBag();
  DataBag bag1 = (DataBag) input.get(0);
  DataBag bag2 = (DataBag) input.get(1);
  // Nothing to subtract from: the difference is empty.
  if (bag1 == null || bag1.size() == 0) {
    return outputBag;
  }
  // optimization: with a single empty subtrahend the first bag is returned as-is
  else if (input.size() == 2 && (bag2 == null || bag2.size() == 0)) {
    return bag1;
  }
  PriorityQueue<Pair> pq = loadBags(input);
  // Last tuple written to the output; null until the first one is emitted.
  Tuple lastData = null;
  while (true) {
    Pair nextPair = pq.peek();
    // ignore data we've already encountered
    if (nextPair.data.compareTo(lastData) != 0) {
      // Only take data from the first bag, where there are no other
      // bags that have the same data.
      // countMatches is only evaluated when the head entry belongs to the first bag.
      if (nextPair.index.equals(0) && countMatches(pq) == 0) {
        outputBag.add(nextPair.data);
        lastData = nextPair.data;
      }
    }
    Pair p = pq.poll();
    // only put the bag back into the queue if it still has data
    if (p.hasNext()) {
      p.next();
      pq.offer(p);
    } else if (p.index.equals(0)) {
      // stop when we exhaust all elements from the first bag
      break;
    }
  }
  return outputBag;
}
/**
 * Returns the indices of the {@code N} greatest elements of the input vector.
 *
 * <p>{@code elgen} turns {@code input} into {@code (index, value)} pairs; {@code ordering}
 * picks the {@code N} greatest of them, and the index of each is wrapped in a single-field
 * tuple, in the order {@code greatestOf} returns them.
 * NOTE(review): {@code ordering} is presumably a Guava {@code Ordering} comparing pairs by
 * value; its definition is not shown here.
 *
 * @param input tuple describing the vector; may be null
 * @return bag of one-field index tuples, or {@code null} when {@code input} is null
 */
@Override
public DataBag exec(Tuple input) throws ExecException, FrontendException {
  if (input == null) {
    return null;
  }

  Iterable<Pair<Integer, Double>> elements = elgen.apply(input);
  List<Tuple> indexTuples = new ArrayList<Tuple>(N);

  for (Pair<Integer, Double> top : ordering.greatestOf(elements, N)) {
    // The argument stays a boxed Integer so the single-value newTuple(Object) overload is
    // chosen rather than newTuple(int size).
    Tuple indexTuple = tfac.newTuple(top.getFirst());
    indexTuples.add(indexTuple);
  }

  DataBag topIndices = bfac.newDefaultBag(indexTuples);
  return topIndices;
}
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; // Output bag DataBag bagOfTokens = bagFactory.newDefaultBag(); StringReader textInput = new StringReader(input.get(0).toString()); PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext(); ) { label = (CoreLabel) ptbt.next(); if (label.value().length() > 2) { System.err.println(label.toString()); Tuple termText = tupleFactory.newTuple(label.word()); bagOfTokens.add(termText); } } return bagOfTokens; }
/**
 * Combines partial sampling results into one partial result of the same shape.
 *
 * <p>Each tuple of the bag in {@code input}'s first field is read as
 * {@code (count: Long, selected: DataBag, waiting: DataBag)}. While merging, the thresholds
 * {@code q1}/{@code q2} are recomputed from the running total, and each wait-listed tuple is
 * promoted to selected (score below {@code q1}), kept in an aggregate wait list (score below
 * {@code q2}) or dropped. Once the full total is known, the aggregate wait list is filtered
 * again with the final thresholds.
 *
 * <p>Both filtering loops stop at the first tuple scoring {@code q2} or more.
 * NOTE(review): this presumably relies on the wait-list bags iterating in ascending score
 * order via {@code ScoredTupleComparator}; confirm against that class.
 *
 * <p>A summary line is written to standard error. It reports the size of the wait list that
 * is actually returned; previously it reported the aggregate list before the final
 * filtering, which double-counted tuples already promoted to selected.
 *
 * @param input tuple whose first field is the bag of partial results
 * @return {@code (totalCount, selectedBag, waitingBag)}
 * @throws IOException if a field cannot be read
 */
@Override
public Tuple exec(Tuple input) throws IOException {
  DataBag bag = (DataBag) input.get(0);
  DataBag selected = bagFactory.newDefaultBag();
  DataBag aggWaiting = bagFactory.newSortedBag(new ScoredTupleComparator());
  DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());
  Tuple output = tupleFactory.newTuple();

  long n = 0L;
  for (Tuple innerTuple : bag) {
    n += (Long) innerTuple.get(0);
    selected.addAll((DataBag) innerTuple.get(1));

    // Thresholds for the items seen so far.
    double q1 = getQ1(n, _samplingProbability);
    double q2 = getQ2(n, _samplingProbability);
    for (Tuple t : (DataBag) innerTuple.get(2)) {
      ScoredTuple scored = ScoredTuple.fromIntermediateTuple(t);
      if (scored.getScore() < q1) {
        selected.add(scored.getTuple());
      } else if (scored.getScore() < q2) {
        aggWaiting.add(t);
      } else {
        break;
      }
    }
  }

  // Second pass with the thresholds for the complete count.
  double q1 = getQ1(n, _samplingProbability);
  double q2 = getQ2(n, _samplingProbability);
  for (Tuple t : aggWaiting) {
    ScoredTuple scored = ScoredTuple.fromIntermediateTuple(t);
    if (scored.getScore() < q1) {
      selected.add(scored.getTuple());
    } else if (scored.getScore() < q2) {
      waiting.add(t);
    } else {
      break;
    }
  }

  output.append(n);
  output.append(selected);
  output.append(waiting);

  System.err.println(
      "Read "
          + n
          + " items, selected "
          + selected.size()
          + ", and wait-listed "
          + waiting.size()
          + ".");
  return output;
}