/**
     * Passes through only those rank records whose rank exceeds {@code RANKFILTER}.
     *
     * <p>The rank is read from field 1 as a {@code PactInteger}. Records that pass are forwarded
     * unchanged; all others are dropped.
     *
     * <p>Output Format: 0: URL 1: RANK 2: AVG_DURATION
     *
     * @param record the incoming record of the rank relation
     * @param out collector that receives every record passing the filter
     */
    @Override
    public void map(PactRecord record, Collector<PactRecord> out) throws Exception {
      final int rank = record.getField(1, PactInteger.class).getValue();
      final boolean aboveThreshold = rank > RANKFILTER;

      if (aboveThreshold) {
        out.collect(record);
      }
    }
  /**
   * Finds the minimum count within one group and emits one record per entry that attains it.
   *
   * <p>Each input record is read as: field 0 a {@code PactString} (the timestamp), field 1 a
   * {@code PactInteger} (the hashtag id) and field 2 a {@code PactInteger} (the count). All
   * timestamps whose count equals the group minimum are gathered in {@code timestamps} and then
   * emitted through the reusable record {@code pr2}.
   *
   * <p>Output Format: 0: hashtagID 1: timestamp 2: minimum count
   *
   * @param matches the records of one group
   * @param records collector that receives one record per timestamp with the minimum count
   */
  @Override
  public void reduce(Iterator<PactRecord> matches, Collector<PactRecord> records) throws Exception {
    PactInteger hashtagID = null;
    int minValue = 0;
    // Explicit flag instead of a -1 sentinel, so that a genuine count of -1 is handled like any
    // other value and cannot be overwritten by a later, larger count.
    boolean seenFirst = false;
    timestamps.clear();

    while (matches.hasNext()) {
      PactRecord pr = matches.next();
      int count = pr.getField(2, PactInteger.class).getValue();
      if (!seenFirst || count < minValue) {
        // New strict minimum: restart the list of timestamps that attain it.
        seenFirst = true;
        minValue = count;
        // NOTE(review): the field objects are kept by reference; if the runtime reuses record or
        // field instances between iterator steps they would have to be copied here - confirm.
        hashtagID = pr.getField(1, PactInteger.class);
        timestamps.clear();
        timestamps.add(pr.getField(0, PactString.class));
      } else if (count == minValue) {
        // Tie with the current minimum: remember this timestamp as well.
        timestamps.add(pr.getField(0, PactString.class));
      }
    }

    if (hashtagID != null) {
      lowsCount.setValue(minValue);
      for (PactString timestamp : timestamps) {
        pr2.setField(0, hashtagID);
        pr2.setField(1, timestamp);
        pr2.setField(2, lowsCount);
        records.collect(pr2);
      }
    }
  }
Example #3
0
 /**
  * Forwards every record of the group to the collector, but fails on purpose once the invocation
  * counter {@code cnt} has reached 5.
  *
  * @param records the records of one group
  * @param out collector that receives all records of the group unchanged
  * @throws RuntimeException when the incremented counter is 5 or greater
  */
 @Override
 public void reduce(Iterator<PactRecord> records, Collector<PactRecord> out) throws Exception {
   // Count this invocation first, then decide whether to fail.
   this.cnt++;
   if (this.cnt >= 5) {
     throw new RuntimeException("Expected Test Exception");
   }

   while (records.hasNext()) {
     PactRecord current = records.next();
     out.collect(current);
   }
 }
  /**
   * Selects, among all candidate records of one data point, the one with the smallest distance
   * and emits the point together with the id of that center.
   *
   * <p>Each input record is read as: field 1 the point vector (copied into {@code position}),
   * field 2 a {@code PactInteger} center id and field 3 a {@code PactDouble} distance.
   *
   * <p>Output Format: 0: centerID 1: pointVector 2: constant(1) (to enable combinable average
   * computation in the following reducer)
   *
   * @param pointsWithDistance the candidate records of one data point
   * @param out collector that receives the single result record
   */
  @Override
  public void reduce(Iterator<PactRecord> pointsWithDistance, Collector<PactRecord> out) {
    int bestCenterId = 0;
    double bestDistance = Double.MAX_VALUE;

    // Scan all candidates, remembering the closest one seen so far.
    while (pointsWithDistance.hasNext()) {
      final PactRecord candidate = pointsWithDistance.next();
      final double candidateDistance = candidate.getField(3, PactDouble.class).getValue();

      if (candidateDistance < bestDistance) {
        // Strictly closer than anything before: take over distance, center id and point.
        bestDistance = candidateDistance;
        bestCenterId = candidate.getField(2, PactInteger.class).getValue();
        candidate.getFieldInto(1, this.position);
      }
    }

    // Assemble the reusable result record; the trailing constant one lets the next reducer
    // compute an average with a combiner.
    this.centerId.setValue(bestCenterId);
    this.result.setField(0, this.centerId);
    this.result.setField(1, this.position);
    this.result.setField(2, this.one);
    out.collect(this.result);
  }
Example #5
0
    /**
     * Emits the first record of each matched pair, but fails on purpose once the invocation
     * counter {@code cnt} has reached 10.
     *
     * @param record1 the record that is forwarded
     * @param record2 the matching record from the other input; not used
     * @param out collector that receives {@code record1}
     * @throws ExpectedTestException when the incremented counter is 10 or greater
     */
    @Override
    public void match(PactRecord record1, PactRecord record2, Collector<PactRecord> out) {
      // Count this invocation first, then decide whether to fail.
      this.cnt++;
      if (this.cnt >= 10) {
        throw new ExpectedTestException();
      }
      out.collect(record1);
    }
 /**
  * Emits all rank records of a group only when the group has no visit record at all; if at least
  * one visit exists, nothing is emitted.
  *
  * <p>Output Format: 0: URL 1: RANK 2: AVG_DURATION
  *
  * @param ranks the rank records of the group
  * @param visits the visit records of the group
  * @param out collector that receives the rank records
  */
 @Override
 public void coGroup(
     Iterator<PactRecord> ranks, Iterator<PactRecord> visits, Collector<PactRecord> out) {
   // Any entry in the visits relation suppresses the whole group.
   if (visits.hasNext()) {
     return;
   }

   // No visits: pass every rank record through unchanged.
   while (ranks.hasNext()) {
     PactRecord rank = ranks.next();
     out.collect(rank);
   }
 }
    /**
     * Keeps only those visit records whose visit year equals {@code YEARFILTER}.
     *
     * <p>The date is read from field 1 as a {@code PactString}; its first four characters are
     * parsed as the year. For records that pass, field 1 is set to null before the record is
     * emitted.
     *
     * <p>Output Format: 0: URL
     *
     * @param record the incoming record of the visits relation
     * @param out collector that receives every record passing the filter
     */
    @Override
    public void map(PactRecord record, Collector<PactRecord> out) throws Exception {
      // The date string has the format YYYY-MM-DD, so the year is its four-character prefix.
      final String visitDate = record.getField(1, PactString.class).getValue();
      final String yearDigits = visitDate.substring(0, 4);
      final int year = Integer.parseInt(yearDigits);

      if (year != YEARFILTER) {
        return;
      }

      // Project the record by clearing the date field.
      record.setNull(1);
      out.collect(record);
    }
    /**
     * Keeps only those documents whose text contains every entry of {@code KEYWORDS}.
     *
     * <p>The text is read from field 1 as a {@code PactString}. For documents that pass, field 1
     * is set to null before the record is emitted.
     *
     * <p>Output Format: 0: URL
     *
     * @param record the incoming document record
     * @param out collector that receives every record passing the filter
     */
    @Override
    public void map(PactRecord record, Collector<PactRecord> out) throws Exception {
      final String text = record.getField(1, PactString.class).toString();

      // FILTER: drop the document as soon as one keyword is missing.
      for (String keyword : KEYWORDS) {
        if (!text.contains(keyword)) {
          return;
        }
      }

      // All keywords were found: project the record by clearing the text field.
      record.setNull(1);
      out.collect(record);
    }
  /**
   * Pre-aggregates the candidates of one data point by keeping only the record with the smallest
   * distance.
   *
   * <p>The distance is read from field 3 as a {@code PactDouble}. The closest record seen is
   * copied into {@code nearest}, which is emitted once at the end.
   *
   * @param pointsWithDistance the candidate records of one data point
   * @param out collector that receives the closest record
   */
  @Override
  public void combine(Iterator<PactRecord> pointsWithDistance, Collector<PactRecord> out) {
    double bestDistance = Double.MAX_VALUE;

    // Scan all candidates, keeping a copy of the closest one seen so far.
    while (pointsWithDistance.hasNext()) {
      final PactRecord candidate = pointsWithDistance.next();
      final double candidateDistance = candidate.getField(3, PactDouble.class).getValue();

      if (candidateDistance < bestDistance) {
        bestDistance = candidateDistance;
        candidate.copyTo(this.nearest);
      }
    }

    // Emit the closest record.
    out.collect(this.nearest);
  }
 /**
  * Joins the documents and ranks relations on their URL by emitting the rank side of every
  * matched pair.
  *
  * <p>Output Format: 0: URL 1: RANK 2: AVG_DURATION
  *
  * @param document the matched document record; not used for the output
  * @param rank the matched rank record, emitted unchanged
  * @param out collector that receives the rank record
  */
 @Override
 public void match(PactRecord document, PactRecord rank, Collector<PactRecord> out)
     throws Exception {
   // The rank record already has the output layout, so it is forwarded as is.
   final PactRecord joined = rank;
   out.collect(joined);
 }
Example #11
0
 /**
  * Emits the first record of every matched pair unchanged.
  *
  * @param record1 the record that is forwarded
  * @param record2 the matching record from the other input; not used
  * @param out collector that receives {@code record1}
  */
 @Override
 public void match(PactRecord record1, PactRecord record2, Collector<PactRecord> out)
     throws Exception {
   final PactRecord forwarded = record1;
   out.collect(forwarded);
 }