public DataBag exec(Tuple input) throws IOException {
    try {
      if (!input.isNull()) {
        // Create the output like a databag {(res1,res2),(res3,res4)..}
        DataBag output_databag = mBagFactory.newDefaultBag();
        // Unpack tuple in order to get the bag {(1,2),(3,4),...}
        String input_time = (String) input.get(0);
        try {

          DateFormat formatter = new SimpleDateFormat("MM/dd/yyyy kk:mm:ss");
          Date date =
              formatter.parse(
                  String.format(
                      "%s/%s/%s %s:%s:%s",
                      input_time.substring(5, 7),
                      input_time.substring(8, 10),
                      input_time.substring(0, 4),
                      input_time.substring(11, 13),
                      input_time.substring(14, 16),
                      input_time.substring(17, 18)));
          Calendar calendar = Calendar.getInstance();
          calendar.setTime(date);
          int dayOfWeek = calendar.get(Calendar.DAY_OF_WEEK);
          int dayOfMonth = calendar.get(Calendar.DAY_OF_MONTH);
          int hour = calendar.get(Calendar.HOUR_OF_DAY);

          // Add items to output
          Tuple items = TupleFactory.getInstance().newTuple(1);
          items.set(0, String.format("%d:%d:%d", dayOfWeek, dayOfMonth, hour));
          output_databag.add(items);

        } catch (Exception e) {
          Tuple items = TupleFactory.getInstance().newTuple(1);
          items.set(0, "petting #1" + e.getMessage());
          output_databag.add(items);
          return output_databag;
        }
        return output_databag;
      } else {
        DataBag output_databag = mBagFactory.newDefaultBag();
        Tuple items = TupleFactory.getInstance().newTuple(1);
        items.set(0, "petting #2");
        output_databag.add(items);
        return output_databag;
      }
    } catch (Exception e) {
      System.err.println("Error with ?? ..");
      DataBag output_databag = mBagFactory.newDefaultBag();
      Tuple items = TupleFactory.getInstance().newTuple(1);
      items.set(0, "petting #3" + e.getMessage());
      output_databag.add(items);
      return output_databag;
    }
  }
Beispiel #2
0
  @Override
  public DataBag exec(Tuple input) throws IOException {
    DataBag bag = (DataBag) input.get(0);
    HashMap<String, Double> clsCnt = new HashMap<String, Double>();
    Iterator<Tuple> it = bag.iterator();
    Double sum = new Double(0.0);
    while (it.hasNext()) {
      Tuple item = (Tuple) it.next();
      String cls = (String) item.get(3);
      if (cls != null && cls.length() > 0) {
        Double cur = clsCnt.get(cls);
        Double inc = (Double) item.get(2);
        if (cur != null) {
          clsCnt.put(cls, cur + inc);
        } else {
          clsCnt.put(cls, inc);
        }
        sum += inc;
      }
    }

    Set<Entry<String, Double>> clses = clsCnt.entrySet();
    Iterator<Entry<String, Double>> cit = clses.iterator();
    DataBag result = bagFactory.newDefaultBag();
    while (cit.hasNext()) {
      Entry<String, Double> cls = cit.next();
      Tuple tpl = tupleFactory.newTuple(2);
      tpl.set(0, cls.getKey());
      tpl.set(1, cls.getValue() / sum);
      result.add(tpl);
    }

    return result;
  }
    @Override
    public DataBag exec(Tuple input) throws IOException {
      DataBag bag = (DataBag) input.get(0);
      long n = 0L;
      DataBag selected = bagFactory.newDefaultBag();
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());

      for (Tuple innerTuple : bag) {
        n += (Long) innerTuple.get(0);
        selected.addAll((DataBag) innerTuple.get(1));
        waiting.addAll((DataBag) innerTuple.get(2));
      }

      long sampleSize = (long) Math.ceil(_samplingProbability * n);
      long nNeeded = sampleSize - selected.size();

      for (Tuple scored : waiting) {
        if (nNeeded <= 0) {
          break;
        }
        selected.add(ScoredTuple.fromIntermediateTuple(scored).getTuple());
        nNeeded--;
      }

      return selected;
    }
    @Override
    public Tuple exec(Tuple input) throws IOException {
      Tuple output = tupleFactory.newTuple();
      DataBag selected = bagFactory.newDefaultBag();
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());

      DataBag items = (DataBag) input.get(0);

      if (items != null) {
        long n = items.size();

        double q1 = getQ1(n, _samplingProbability);
        double q2 = getQ2(n, _samplingProbability);

        for (Tuple item : items) {
          double key = _rdg.nextUniform(0.0d, 1.0d);

          if (key < q1) {
            selected.add(item);
          } else if (key < q2) {
            waiting.add(new ScoredTuple(key, item).getIntermediateTuple(tupleFactory));
          }
        }

        output.append(n);
        output.append(selected);
        output.append(waiting);
      }

      return output;
    }
Beispiel #5
0
  public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
      return null;
    }

    String normStr = ((String) input.get(0));
    if (normStr == null) {
      return null;
    }

    // Remove punctuation except when it's a version number
    normStr = punctPattern.matcher(normStr.trim().toLowerCase()).replaceAll(" ");
    normStr = spacePattern.matcher(normStr).replaceAll(" ");

    DataBag output = bagFactory.newDefaultBag();
    for (String s : spacePattern.split(normStr.trim())) {
      if (s.length() <= 30) {
        Tuple t = tupleFactory.newTuple(1);
        t.set(0, s);
        output.add(t);
      }
    }

    return output;
  }
  @SuppressWarnings("unchecked")
  @Override
  public DataBag exec(Tuple input) throws IOException {
    if (input.size() < 2) {
      throw new RuntimeException("Expected at least two inputs, but found " + input.size());
    }

    for (Object o : input) {
      if (o != null && !(o instanceof DataBag)) {
        throw new RuntimeException("Inputs must be bags");
      }
    }

    DataBag outputBag = bagFactory.newDefaultBag();

    DataBag bag1 = (DataBag) input.get(0);
    DataBag bag2 = (DataBag) input.get(1);

    if (bag1 == null || bag1.size() == 0) {
      return outputBag;
    }
    // optimization
    else if (input.size() == 2 && (bag2 == null || bag2.size() == 0)) {
      return bag1;
    }

    PriorityQueue<Pair> pq = loadBags(input);

    Tuple lastData = null;

    while (true) {
      Pair nextPair = pq.peek();

      // ignore data we've already encountered
      if (nextPair.data.compareTo(lastData) != 0) {
        // Only take data from the first bag, where there are no other
        // bags that have the same data.
        if (nextPair.index.equals(0) && countMatches(pq) == 0) {
          outputBag.add(nextPair.data);
          lastData = nextPair.data;
        }
      }

      Pair p = pq.poll();

      // only put the bag back into the queue if it still has data
      if (p.hasNext()) {
        p.next();
        pq.offer(p);
      } else if (p.index.equals(0)) {
        // stop when we exhaust all elements from the first bag
        break;
      }
    }

    return outputBag;
  }
Beispiel #7
0
 @Override
 public DataBag exec(Tuple input) throws ExecException, FrontendException {
   if (input == null) {
     return null;
   }
   Iterable<Pair<Integer, Double>> vector = elgen.apply(input);
   List<Tuple> result = new ArrayList<Tuple>(N);
   for (Pair<Integer, Double> el : ordering.greatestOf(vector, N)) {
     result.add(tfac.newTuple(el.getFirst()));
   }
   return bfac.newDefaultBag(result);
 }
Beispiel #8
0
  public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) return null;

    // Output bag
    DataBag bagOfTokens = bagFactory.newDefaultBag();

    StringReader textInput = new StringReader(input.get(0).toString());
    PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), "");

    for (CoreLabel label; ptbt.hasNext(); ) {
      label = (CoreLabel) ptbt.next();
      if (label.value().length() > 2) {
        System.err.println(label.toString());
        Tuple termText = tupleFactory.newTuple(label.word());
        bagOfTokens.add(termText);
      }
    }
    return bagOfTokens;
  }
    @Override
    public Tuple exec(Tuple input) throws IOException {
      DataBag bag = (DataBag) input.get(0);
      DataBag selected = bagFactory.newDefaultBag();
      DataBag aggWaiting = bagFactory.newSortedBag(new ScoredTupleComparator());
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());
      Tuple output = tupleFactory.newTuple();

      long n = 0L;

      for (Tuple innerTuple : bag) {
        n += (Long) innerTuple.get(0);

        selected.addAll((DataBag) innerTuple.get(1));

        double q1 = getQ1(n, _samplingProbability);
        double q2 = getQ2(n, _samplingProbability);

        for (Tuple t : (DataBag) innerTuple.get(2)) {
          ScoredTuple scored = ScoredTuple.fromIntermediateTuple(t);

          if (scored.getScore() < q1) {
            selected.add(scored.getTuple());
          } else if (scored.getScore() < q2) {
            aggWaiting.add(t);
          } else {
            break;
          }
        }
      }

      double q1 = getQ1(n, _samplingProbability);
      double q2 = getQ2(n, _samplingProbability);

      for (Tuple t : aggWaiting) {
        ScoredTuple scored = ScoredTuple.fromIntermediateTuple(t);

        if (scored.getScore() < q1) {
          selected.add(scored.getTuple());
        } else if (scored.getScore() < q2) {
          waiting.add(t);
        } else {
          break;
        }
      }

      output.append(n);
      output.append(selected);
      output.append(waiting);

      System.err.println(
          "Read "
              + n
              + " items, selected "
              + selected.size()
              + ", and wait-listed "
              + aggWaiting.size()
              + ".");

      return output;
    }