Example #1
0
    /**
     * Attaches a random score to the tuples of the incoming bag and returns them
     * as intermediate tuples inside a single-field result tuple.
     *
     * <p>A null bag yields an empty output bag. When the bag holds no more than
     * {@code numSamples} tuples every tuple is scored and emitted directly;
     * otherwise the scored tuples are offered to the reservoir and the entries
     * the reservoir then iterates over are emitted.
     *
     * @param input tuple whose first field is the bag of tuples to sample from
     * @return a tuple wrapping the bag of scored intermediate tuples
     * @throws IOException propagated from the underlying tuple operations
     */
    @Override
    public Tuple exec(Tuple input) throws IOException {
      DataBag scoredOut = BagFactory.getInstance().newDefaultBag();
      DataBag candidates = (DataBag) input.get(0);

      if (candidates != null) {
        boolean fitsWithoutReservoir = candidates.size() <= numSamples;

        if (fitsWithoutReservoir) {
          // few enough candidates: score each one and pass it straight through
          for (Tuple candidate : candidates) {
            ScoredTuple scored = new ScoredTuple(Math.random(), candidate);
            scoredOut.add(scored.getIntermediateTuple(tupleFactory));
          }
        } else {
          // too many candidates: offer each scored tuple to the reservoir
          for (Tuple candidate : candidates) {
            getReservoir().consider(new ScoredTuple(Math.random(), candidate));
          }

          // emit whatever the reservoir holds, score included
          for (ScoredTuple kept : getReservoir()) {
            scoredOut.add(kept.getIntermediateTuple(tupleFactory));
          }
        }
      }

      return tupleFactory.newTuple(scoredOut);
    }
Example #2
0
 /**
  * Copies the tuple of every entry currently in the reservoir into a new bag.
  *
  * @return a newly created default bag holding the reservoir's tuples
  */
 @Override
 public DataBag getValue() {
   DataBag sampled = BagFactory.getInstance().newDefaultBag();

   // strip the score: only the wrapped tuple goes into the result
   for (ScoredTuple entry : getReservoir()) {
     sampled.add(entry.getTuple());
   }

   return sampled;
 }
    /**
     * Merges partial sampling results into the final sample.
     *
     * <p>Each tuple of the input bag is read as (item count, selected tuples,
     * scored wait-listed tuples). The counts are summed and the selected bags
     * concatenated. If the selection is smaller than
     * {@code ceil(_samplingProbability * total)}, it is topped up from the merged
     * wait-list, taken in the iteration order of a sorted bag built with
     * {@code ScoredTupleComparator}.
     *
     * @param input tuple whose first field is the bag of partial results
     * @return the bag of selected tuples
     * @throws IOException propagated from the underlying tuple operations
     */
    @Override
    public DataBag exec(Tuple input) throws IOException {
      DataBag partials = (DataBag) input.get(0);
      long total = 0L;
      DataBag selected = bagFactory.newDefaultBag();
      DataBag waitList = bagFactory.newSortedBag(new ScoredTupleComparator());

      for (Tuple partial : partials) {
        long partialCount = (Long) partial.get(0);
        total += partialCount;

        DataBag partialSelected = (DataBag) partial.get(1);
        selected.addAll(partialSelected);

        DataBag partialWaiting = (DataBag) partial.get(2);
        waitList.addAll(partialWaiting);
      }

      long target = (long) Math.ceil(_samplingProbability * total);
      long shortfall = target - selected.size();

      // promote wait-listed tuples until the target size is reached
      for (Tuple scored : waitList) {
        if (shortfall < 1) {
          break;
        }
        ScoredTuple promoted = ScoredTuple.fromIntermediateTuple(scored);
        selected.add(promoted.getTuple());
        shortfall -= 1;
      }

      return selected;
    }
Example #4
0
    /**
     * Produces the final sample from bags of already-scored intermediate tuples.
     *
     * <p>Every intermediate tuple found in the first field of each input-bag
     * tuple is decoded and offered to the reservoir. The tuples of the entries
     * the reservoir then iterates over are returned.
     *
     * @param input tuple whose first field is a bag of tuples, each wrapping a
     *     bag of intermediate (scored) tuples
     * @return a new bag with the tuples retained by the reservoir
     * @throws IOException propagated from the underlying tuple operations
     */
    @Override
    public DataBag exec(Tuple input) throws IOException {
      DataBag partialBags = (DataBag) input.get(0);

      for (Tuple partial : partialBags) {
        for (Tuple intermediate : (DataBag) partial.get(0)) {
          // the score is decoded from the intermediate tuple, not drawn again
          ScoredTuple scored = ScoredTuple.fromIntermediateTuple(intermediate);
          getReservoir().consider(scored);
        }
      }

      DataBag finalSample = BagFactory.getInstance().newDefaultBag();
      for (ScoredTuple survivor : getReservoir()) {
        // drop the score and hand back the wrapped tuple
        finalSample.add(survivor.getTuple());
      }

      return finalSample;
    }
 /**
  * Orders two intermediate tuples by the score each one carries.
  *
  * @param o1 first intermediate tuple
  * @param o2 second intermediate tuple
  * @return the result of comparing the score decoded from {@code o1} with the
  *     score decoded from {@code o2}
  * @throws RuntimeException if either tuple cannot be decoded or the scores
  *     cannot be compared; the original exception is kept as the cause
  */
 @Override
 public int compare(Tuple o1, Tuple o2) {
   try {
     ScoredTuple t1 = ScoredTuple.fromIntermediateTuple(o1);
     ScoredTuple t2 = ScoredTuple.fromIntermediateTuple(o2);
     return t1.getScore().compareTo(t2.getScore());
   } catch (Exception e) {
     // Deliberately not Throwable: JVM Errors (OutOfMemoryError,
     // StackOverflowError, ...) must propagate untouched rather than be
     // disguised as a comparison failure, and building the message below
     // could itself fail in such a state.
     throw new RuntimeException("Cannot compare " + o1 + " and " + o2 + ".", e);
   }
 }
    /**
     * Combines several partial sampling results into a single partial result.
     *
     * <p>Each tuple of the input bag is read as (item count, selected tuples,
     * scored wait-listed tuples). Counts are summed and selected tuples are
     * concatenated. Wait-listed tuples are re-examined against the thresholds
     * returned by {@code getQ1} / {@code getQ2}: first with the running count
     * while the partial results are read, then once more with the final count.
     * Tuples scoring below q1 are selected, tuples scoring below q2 stay
     * wait-listed, and the rest are dropped.
     *
     * <p>A one-line summary is written to {@code System.err}.
     *
     * @param input tuple whose first field is the bag of partial results
     * @return a tuple of (total count, selected bag, wait-listed bag)
     * @throws IOException propagated from the underlying tuple operations
     */
    @Override
    public Tuple exec(Tuple input) throws IOException {
      DataBag bag = (DataBag) input.get(0);
      DataBag selected = bagFactory.newDefaultBag();
      DataBag aggWaiting = bagFactory.newSortedBag(new ScoredTupleComparator());
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());
      Tuple output = tupleFactory.newTuple();

      long n = 0L;

      for (Tuple innerTuple : bag) {
        n += (Long) innerTuple.get(0);

        selected.addAll((DataBag) innerTuple.get(1));

        // first pass: thresholds derived from the count seen so far
        double q1 = getQ1(n, _samplingProbability);
        double q2 = getQ2(n, _samplingProbability);

        partitionByScore((DataBag) innerTuple.get(2), q1, q2, selected, aggWaiting);
      }

      // second pass: re-check the aggregated wait-list against the final count
      double q1 = getQ1(n, _samplingProbability);
      double q2 = getQ2(n, _samplingProbability);

      partitionByScore(aggWaiting, q1, q2, selected, waiting);

      output.append(n);
      output.append(selected);
      output.append(waiting);

      // report the size of the wait-list that is actually emitted
      System.err.println(
          "Read "
              + n
              + " items, selected "
              + selected.size()
              + ", and wait-listed "
              + waiting.size()
              + ".");

      return output;
    }

    /**
     * Routes scored tuples from {@code candidates}: scores below {@code q1} go to
     * {@code selected} (as plain tuples), scores below {@code q2} go to
     * {@code waitList} (still in intermediate form).
     *
     * <p>Iteration stops at the first score that reaches {@code q2}, so later
     * candidates are not examined.
     * NOTE(review): this early exit presumably relies on {@code candidates}
     * being iterated in ascending score order - confirm for incoming wait-lists.
     */
    private void partitionByScore(
        DataBag candidates, double q1, double q2, DataBag selected, DataBag waitList)
        throws IOException {
      for (Tuple candidate : candidates) {
        ScoredTuple scored = ScoredTuple.fromIntermediateTuple(candidate);
        double score = scored.getScore();

        if (score < q1) {
          selected.add(scored.getTuple());
        } else if (score < q2) {
          waitList.add(candidate);
        } else {
          break;
        }
      }
    }