/** * Computes a minimum aggregation on the distance of a data point to cluster centers. * * <p>Output Format: 0: centerID 1: pointVector 2: constant(1) (to enable combinable average * computation in the following reducer) */ @Override public void reduce(Iterator<PactRecord> pointsWithDistance, Collector<PactRecord> out) { double nearestDistance = Double.MAX_VALUE; int nearestClusterId = 0; // check all cluster centers while (pointsWithDistance.hasNext()) { PactRecord res = pointsWithDistance.next(); double distance = res.getField(3, PactDouble.class).getValue(); // compare distances if (distance < nearestDistance) { // if distance is smaller than smallest till now, update nearest cluster nearestDistance = distance; nearestClusterId = res.getField(2, PactInteger.class).getValue(); res.getFieldInto(1, this.position); } } // emit a new record with the center id and the data point. add a one to ease the // implementation of the average function with a combiner this.centerId.setValue(nearestClusterId); this.result.setField(0, this.centerId); this.result.setField(1, this.position); this.result.setField(2, this.one); out.collect(this.result); }
/**
 * Finds the minimum count among the incoming records and emits one record per timestamp that
 * carries this minimum.
 *
 * <p>Each input record is read as: field 0 = timestamp ({@code PactString}), field 1 = hashtag
 * id ({@code PactInteger}), field 2 = count ({@code PactInteger}). The hashtag id is taken from
 * the record that most recently lowered the minimum.
 *
 * <p>Whether a minimum has been seen is tracked with an explicit flag rather than a {@code -1}
 * sentinel, so a genuine count of {@code -1} (or any negative count) is handled like every
 * other value.
 *
 * <p>Output Format: 0: hashtag id 1: timestamp 2: minimum count
 *
 * @param matches the records of one group
 * @param records collector that receives one record per timestamp with the minimum count
 * @throws Exception declared by the overridden method
 */
@Override
public void reduce(Iterator<PactRecord> matches, Collector<PactRecord> records) throws Exception {
    PactInteger hashtagID = null;
    boolean minKnown = false;
    int minValue = 0;

    timestamps.clear();

    while (matches.hasNext()) {
        final PactRecord pr = matches.next();
        final int count = pr.getField(2, PactInteger.class).getValue();

        if (!minKnown || count < minValue) {
            // new minimum: restart the list of timestamps that share it
            minKnown = true;
            minValue = count;
            hashtagID = pr.getField(1, PactInteger.class);
            timestamps.clear();
            timestamps.add(pr.getField(0, PactString.class));
        } else if (count == minValue) {
            // ties with the current minimum are all reported
            timestamps.add(pr.getField(0, PactString.class));
        }
    }

    if (hashtagID != null) {
        lowsCount.setValue(minValue);
        for (PactString timestamp : timestamps) {
            pr2.setField(0, hashtagID);
            pr2.setField(1, timestamp);
            pr2.setField(2, lowsCount);
            records.collect(pr2);
        }
    }
}
/**
 * Passes through only those records of the rank relation whose rank (field 1, read as a
 * {@code PactInteger}) is strictly greater than {@code RANKFILTER}. Matching records are
 * forwarded unchanged.
 *
 * <p>Output Format: 0: URL 1: RANK 2: AVG_DURATION
 *
 * @param record the rank record to test
 * @param out collector that receives the record if it passes the filter
 * @throws Exception declared by the overridden method
 */
@Override
public void map(PactRecord record, Collector<PactRecord> out) throws Exception {
    final int rank = record.getField(1, PactInteger.class).getValue();
    final boolean aboveThreshold = rank > RANKFILTER;

    if (aboveThreshold) {
        out.collect(record);
    }
}
/*
 * (non-Javadoc)
 * @see eu.stratosphere.pact.runtime.plugable.TypeAccessors#equalToReference(java.lang.Object)
 *
 * Compares the key fields of the candidate record, one by one, against the key holders stored
 * in this comparator. Returns false at the first key that differs and true if all keys are
 * equal. Throws a NullKeyFieldException if a key field of the candidate is null.
 */
@Override
public boolean equalToReference(PactRecord candidate) {
    for (int pos = 0; pos < this.keyFields.length; pos++) {
        final Key candidateKey =
                candidate.getField(this.keyFields[pos], this.transientKeyHolders[pos]);

        if (candidateKey == null) {
            throw new NullKeyFieldException(this.keyFields[pos]);
        }
        if (!candidateKey.equals(this.keyHolders[pos])) {
            return false;
        }
    }
    return true;
}
/** * Filters for records of the visits relation where the year of visit is equal to a specified * value. The URL of all visit records passing the filter is emitted. * * <p>Output Format: 0: URL */ @Override public void map(PactRecord record, Collector<PactRecord> out) throws Exception { // Parse date string with the format YYYY-MM-DD and extract the year String dateString = record.getField(1, PactString.class).getValue(); int year = Integer.parseInt(dateString.substring(0, 4)); if (year == YEARFILTER) { record.setNull(1); out.collect(record); } }
/*
 * (non-Javadoc)
 * @see eu.stratosphere.pact.generic.types.TypeComparator#putNormalizedKey(java.lang.Object, byte[], int, int)
 *
 * Copies the normalized keys of the leading normalizable key fields into the target segment,
 * starting at the given offset. Each key contributes at most its configured normalized length,
 * and copying stops once the byte budget (numBytes) is used up. A NullPointerException raised
 * while processing a key is reported as a NullKeyFieldException for that key field.
 */
@Override
public void putNormalizedKey(PactRecord record, MemorySegment target, int offset, int numBytes) {
    // declared outside the try block so the catch clause knows which key failed
    int keyIdx = 0;
    try {
        while (keyIdx < this.numLeadingNormalizableKeys && numBytes > 0) {
            // never write more bytes than are left in the budget
            final int len = Math.min(this.normalizedKeyLengths[keyIdx], numBytes);

            final NormalizableKey key =
                    (NormalizableKey)
                            record.getField(
                                    this.keyFields[keyIdx], this.transientKeyHolders[keyIdx]);
            key.copyNormalizedKey(target, offset, len);

            numBytes -= len;
            offset += len;
            keyIdx++;
        }
    } catch (NullPointerException npex) {
        throw new NullKeyFieldException(this.keyFields[keyIdx]);
    }
}
/* (non-Javadoc) * @see eu.stratosphere.pact.runtime.plugable.TypeAccessors#hash(java.lang.Object) */ @Override public int hash(PactRecord object) { int i = 0; try { int code = 0; for (; i < this.keyFields.length; i++) { code ^= object.getField(this.keyFields[i], this.transientKeyHolders[i]).hashCode(); code *= HASH_SALT[i & 0x1F]; // salt code with (i % HASH_SALT.length)-th salt component } return code; } catch (NullPointerException npex) { throw new NullKeyFieldException(this.keyFields[i]); } catch (IndexOutOfBoundsException iobex) { throw new KeyFieldOutOfBoundsException(this.keyFields[i]); } }
/** * Filters for documents that contain all of the given keywords and projects the records on the * URL field. * * <p>Output Format: 0: URL */ @Override public void map(PactRecord record, Collector<PactRecord> out) throws Exception { // FILTER // Only collect the document if all keywords are contained String docText = record.getField(1, PactString.class).toString(); boolean allContained = true; for (String kw : KEYWORDS) { if (!docText.contains(kw)) { allContained = false; break; } } if (allContained) { record.setNull(1); out.collect(record); } }
/** Computes a minimum aggregation on the distance of a data point to cluster centers. */ @Override public void combine(Iterator<PactRecord> pointsWithDistance, Collector<PactRecord> out) { double nearestDistance = Double.MAX_VALUE; // check all cluster centers while (pointsWithDistance.hasNext()) { PactRecord res = pointsWithDistance.next(); double distance = res.getField(3, PactDouble.class).getValue(); // compare distances if (distance < nearestDistance) { nearestDistance = distance; res.copyTo(this.nearest); } } // emit nearest one out.collect(this.nearest); }
/* (non-Javadoc) * @see eu.stratosphere.pact.common.recordio.OutputFormat#writeRecord(eu.stratosphere.pact.common.type.PactRecord) */ @Override public void writeRecord(PactRecord record) throws IOException { int numRecFields = record.getNumFields(); int readPos; for (int i = 0; i < this.numFields; i++) { readPos = this.recordPositions[i]; if (readPos < numRecFields) { Value v = record.getField(this.recordPositions[i], this.classes[i]); if (v != null) { if (i != 0) this.wrt.write(this.fieldDelimiter); this.wrt.write(v.toString()); } else { if (this.lenient) { if (i != 0) this.wrt.write(this.fieldDelimiter); } else { throw new RuntimeException( "Cannot serialize record with <null> value at position: " + readPos); } } } else { if (this.lenient) { if (i != 0) this.wrt.write(this.fieldDelimiter); } else { throw new RuntimeException( "Cannot serialize record with out field at position: " + readPos); } } } // add the record delimiter this.wrt.write(this.recordDelimiter); }