/**
 * Merges the partial results carried in the first field of {@code input} and
 * returns the final sample.
 *
 * <p>Each inner tuple is read as: field 0 a {@code Long} item count, field 1 a
 * bag of already-selected tuples, field 2 a bag of wait-listed tuples in their
 * intermediate (scored) form. All selected tuples are kept. If fewer than
 * {@code ceil(_samplingProbability * n)} were selected, the shortfall is filled
 * from the merged wait list in the iteration order of the sorted bag.
 *
 * @param input tuple whose first field is the bag of partial results
 * @return the bag of sampled tuples
 * @throws IOException if reading a field or unpacking a scored tuple fails
 */
@Override
public DataBag exec(Tuple input) throws IOException {
  DataBag partials = (DataBag) input.get(0);
  DataBag accepted = bagFactory.newDefaultBag();
  // Sorted with ScoredTupleComparator so the wait list is consumed in that order
  // (presumably ascending score; confirm against the comparator).
  DataBag waitList = bagFactory.newSortedBag(new ScoredTupleComparator());

  long totalCount = 0L;
  for (Tuple partial : partials) {
    totalCount += (Long) partial.get(0);
    accepted.addAll((DataBag) partial.get(1));
    waitList.addAll((DataBag) partial.get(2));
  }

  long targetSize = (long) Math.ceil(_samplingProbability * totalCount);
  long shortfall = targetSize - accepted.size();

  // Only dip into the wait list when the directly selected tuples fall short.
  if (shortfall > 0) {
    for (Tuple candidate : waitList) {
      accepted.add(ScoredTuple.fromIntermediateTuple(candidate).getTuple());
      shortfall--;
      if (shortfall <= 0) {
        break;
      }
    }
  }

  return accepted;
}
/**
 * Combines the partial results carried in the first field of {@code input}
 * into a single partial result.
 *
 * <p>Each inner tuple is read as: field 0 a {@code Long} item count, field 1 a
 * bag of already-selected tuples, field 2 a bag of wait-listed tuples in their
 * intermediate (scored) form. Wait-listed tuples are re-classified twice: first
 * against thresholds computed from the running count while merging, then
 * against thresholds computed from the total count.
 *
 * @param input tuple whose first field is the bag of partial results
 * @return a tuple of (total count, selected bag, wait-list bag)
 * @throws IOException if reading a field or unpacking a scored tuple fails
 */
@Override
public Tuple exec(Tuple input) throws IOException {
  DataBag bag = (DataBag) input.get(0);
  DataBag selected = bagFactory.newDefaultBag();
  DataBag aggWaiting = bagFactory.newSortedBag(new ScoredTupleComparator());
  DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());
  Tuple output = tupleFactory.newTuple();

  long n = 0L;
  for (Tuple innerTuple : bag) {
    n += (Long) innerTuple.get(0);
    selected.addAll((DataBag) innerTuple.get(1));

    // Thresholds from the running count seen so far; the surviving wait list
    // is re-checked below once the total is known.
    double runningQ1 = getQ1(n, _samplingProbability);
    double runningQ2 = getQ2(n, _samplingProbability);
    classifyByScore((DataBag) innerTuple.get(2), runningQ1, runningQ2, selected, aggWaiting);
  }

  // Second pass: re-classify the aggregated wait list against the thresholds
  // for the total count.
  double q1 = getQ1(n, _samplingProbability);
  double q2 = getQ2(n, _samplingProbability);
  classifyByScore(aggWaiting, q1, q2, selected, waiting);

  output.append(n);
  output.append(selected);
  output.append(waiting);

  // Report the size of the wait list that is actually emitted (waiting), not
  // the pre-filter aggregate (aggWaiting).
  System.err.println(
      "Read " + n + " items, selected " + selected.size()
          + ", and wait-listed " + waiting.size() + ".");

  return output;
}

/**
 * Routes each scored tuple in {@code candidates} by its score: below
 * {@code q1} the unwrapped tuple goes to {@code selected}; below {@code q2}
 * the intermediate tuple goes to {@code waitList}; at the first score at or
 * above {@code q2} iteration stops.
 *
 * <p>NOTE(review): the early stop assumes {@code candidates} iterates in
 * ascending score order; confirm for bags received from upstream.
 */
private void classifyByScore(
    DataBag candidates, double q1, double q2, DataBag selected, DataBag waitList)
    throws IOException {
  for (Tuple t : candidates) {
    ScoredTuple scored = ScoredTuple.fromIntermediateTuple(t);
    if (scored.getScore() < q1) {
      selected.add(scored.getTuple());
    } else if (scored.getScore() < q2) {
      waitList.add(t);
    } else {
      break;
    }
  }
}