/**
 * Forwards only those records of the rank relation whose rank is strictly greater than
 * {@code RANKFILTER}; all other records are dropped.
 *
 * <p>Output Format: 0: URL 1: RANK 2: AVG_DURATION
 *
 * @param record the rank record to test; field 1 is read as a {@code PactInteger} rank
 * @param out collector that receives the unmodified record when it passes the filter
 */
@Override
public void map(PactRecord record, Collector<PactRecord> out) throws Exception {
    final int rank = record.getField(1, PactInteger.class).getValue();
    final boolean aboveThreshold = rank > RANKFILTER;

    if (aboveThreshold) {
        out.collect(record);
    }
}
/**
 * Finds the smallest count within one group of records and emits one output record for
 * every input record that carries that smallest count.
 *
 * <p>Each input record is read as: field 0 a {@code PactString} (treated as a timestamp),
 * field 1 a {@code PactInteger} (the hashtag id), field 2 a {@code PactInteger} (the count).
 * The hashtag id is taken from the record that most recently lowered the minimum.
 *
 * <p>Output Format: 0: hashtag id 1: timestamp 2: minimum count
 *
 * @param matches the records of the current group
 * @param records collector receiving one record per timestamp that has the minimum count
 */
@Override
public void reduce(Iterator<PactRecord> matches, Collector<PactRecord> records) throws Exception {
    PactInteger hashtagID = null;
    int minValue = 0;
    // An explicit flag marks "no minimum seen yet". Using -1 as an in-band sentinel would
    // misread a genuine count of -1 as "unset" and let the next record overwrite it.
    boolean minimumSeen = false;
    timestamps.clear();

    // NOTE(review): the values stored below are exactly what getField returns. If the runtime
    // reuses record or field instances between iterations, these references may alias each
    // other - confirm against the PactRecord contract.
    while (matches.hasNext()) {
        final PactRecord pr = matches.next();
        final int count = pr.getField(2, PactInteger.class).getValue();

        if (!minimumSeen || count < minValue) {
            // New strict minimum: restart the list of timestamps that share it.
            minimumSeen = true;
            minValue = count;
            hashtagID = pr.getField(1, PactInteger.class);
            timestamps.clear();
            timestamps.add(pr.getField(0, PactString.class));
        } else if (count == minValue) {
            // Tie with the current minimum: remember this timestamp as well.
            timestamps.add(pr.getField(0, PactString.class));
        }
    }

    // Nothing is emitted when the group was empty or the hashtag id field was null.
    if (hashtagID != null) {
        lowsCount.setValue(minValue);
        for (PactString timestamp : timestamps) {
            pr2.setField(0, hashtagID);
            pr2.setField(1, timestamp);
            pr2.setField(2, lowsCount);
            records.collect(pr2);
        }
    }
}
/**
 * Test reducer that forwards every record of the group unchanged, but fails on purpose
 * once it has been invoked often enough.
 *
 * <p>The invocation counter {@code cnt} is incremented on every call; from the call that
 * brings it to 5 onwards, the method throws instead of emitting anything.
 *
 * @param records the records of the current group
 * @param out collector receiving each record as-is
 * @throws RuntimeException with the message "Expected Test Exception" once the counter
 *     reaches 5
 */
@Override
public void reduce(Iterator<PactRecord> records, Collector<PactRecord> out) throws Exception {
    this.cnt++;
    if (this.cnt >= 5) {
        throw new RuntimeException("Expected Test Exception");
    }

    while (records.hasNext()) {
        final PactRecord next = records.next();
        out.collect(next);
    }
}
/** * Computes a minimum aggregation on the distance of a data point to cluster centers. * * <p>Output Format: 0: centerID 1: pointVector 2: constant(1) (to enable combinable average * computation in the following reducer) */ @Override public void reduce(Iterator<PactRecord> pointsWithDistance, Collector<PactRecord> out) { double nearestDistance = Double.MAX_VALUE; int nearestClusterId = 0; // check all cluster centers while (pointsWithDistance.hasNext()) { PactRecord res = pointsWithDistance.next(); double distance = res.getField(3, PactDouble.class).getValue(); // compare distances if (distance < nearestDistance) { // if distance is smaller than smallest till now, update nearest cluster nearestDistance = distance; nearestClusterId = res.getField(2, PactInteger.class).getValue(); res.getFieldInto(1, this.position); } } // emit a new record with the center id and the data point. add a one to ease the // implementation of the average function with a combiner this.centerId.setValue(nearestClusterId); this.result.setField(0, this.centerId); this.result.setField(1, this.position); this.result.setField(2, this.one); out.collect(this.result); }
/**
 * Test matcher that emits the first record of each matched pair, but fails on purpose
 * once it has been invoked often enough.
 *
 * <p>The invocation counter {@code cnt} is incremented on every call; from the call that
 * brings it to 10 onwards, an {@code ExpectedTestException} is thrown instead.
 *
 * @param record1 the record that is forwarded
 * @param record2 the matching record from the other input; not used
 * @param out collector receiving {@code record1}
 */
@Override
public void match(PactRecord record1, PactRecord record2, Collector<PactRecord> out) {
    this.cnt++;
    if (this.cnt >= 10) {
        throw new ExpectedTestException();
    }

    out.collect(record1);
}
/** * If the visit iterator is empty, all pairs of the rank iterator are emitted. Otherwise, no * pair is emitted. * * <p>Output Format: 0: URL 1: RANK 2: AVG_DURATION */ @Override public void coGroup( Iterator<PactRecord> ranks, Iterator<PactRecord> visits, Collector<PactRecord> out) { // Check if there is a entry in the visits relation if (!visits.hasNext()) { while (ranks.hasNext()) { // Emit all rank pairs out.collect(ranks.next()); } } }
/** * Filters for records of the visits relation where the year of visit is equal to a specified * value. The URL of all visit records passing the filter is emitted. * * <p>Output Format: 0: URL */ @Override public void map(PactRecord record, Collector<PactRecord> out) throws Exception { // Parse date string with the format YYYY-MM-DD and extract the year String dateString = record.getField(1, PactString.class).getValue(); int year = Integer.parseInt(dateString.substring(0, 4)); if (year == YEARFILTER) { record.setNull(1); out.collect(record); } }
/** * Filters for documents that contain all of the given keywords and projects the records on the * URL field. * * <p>Output Format: 0: URL */ @Override public void map(PactRecord record, Collector<PactRecord> out) throws Exception { // FILTER // Only collect the document if all keywords are contained String docText = record.getField(1, PactString.class).toString(); boolean allContained = true; for (String kw : KEYWORDS) { if (!docText.contains(kw)) { allContained = false; break; } } if (allContained) { record.setNull(1); out.collect(record); } }
/** Computes a minimum aggregation on the distance of a data point to cluster centers. */ @Override public void combine(Iterator<PactRecord> pointsWithDistance, Collector<PactRecord> out) { double nearestDistance = Double.MAX_VALUE; // check all cluster centers while (pointsWithDistance.hasNext()) { PactRecord res = pointsWithDistance.next(); double distance = res.getField(3, PactDouble.class).getValue(); // compare distances if (distance < nearestDistance) { nearestDistance = distance; res.copyTo(this.nearest); } } // emit nearest one out.collect(this.nearest); }
/**
 * Join function for the documents and ranks relations, invoked per matched pair of
 * records. Only the rank record is emitted; the document record is not used.
 *
 * <p>Output Format: 0: URL 1: RANK 2: AVG_DURATION
 *
 * @param document the record from the documents relation; ignored
 * @param rank the record from the ranks relation; emitted unchanged
 * @param out collector receiving the rank record
 */
@Override
public void match(PactRecord document, PactRecord rank, Collector<PactRecord> out)
        throws Exception {
    // The rank record already has the output layout, so it is passed through as-is.
    out.collect(rank);
}
/**
 * Emits the first record of each matched pair unchanged; the second record is not used.
 *
 * @param record1 the record that is forwarded
 * @param record2 the matching record from the other input; ignored
 * @param out collector receiving {@code record1}
 */
@Override
public void match(PactRecord record1, PactRecord record2, Collector<PactRecord> out)
        throws Exception {
    // Pass the first input through as the join result.
    out.collect(record1);
}