/** * Computes a minimum aggregation on the distance of a data point to cluster centers. * * <p>Output Format: 0: centerID 1: pointVector 2: constant(1) (to enable combinable average * computation in the following reducer) */ @Override public void reduce(Iterator<PactRecord> pointsWithDistance, Collector<PactRecord> out) { double nearestDistance = Double.MAX_VALUE; int nearestClusterId = 0; // check all cluster centers while (pointsWithDistance.hasNext()) { PactRecord res = pointsWithDistance.next(); double distance = res.getField(3, PactDouble.class).getValue(); // compare distances if (distance < nearestDistance) { // if distance is smaller than smallest till now, update nearest cluster nearestDistance = distance; nearestClusterId = res.getField(2, PactInteger.class).getValue(); res.getFieldInto(1, this.position); } } // emit a new record with the center id and the data point. add a one to ease the // implementation of the average function with a combiner this.centerId.setValue(nearestClusterId); this.result.setField(0, this.centerId); this.result.setField(1, this.position); this.result.setField(2, this.one); out.collect(this.result); }
/** * Filters for records of the visits relation where the year of visit is equal to a specified * value. The URL of all visit records passing the filter is emitted. * * <p>Output Format: 0: URL */ @Override public void map(PactRecord record, Collector<PactRecord> out) throws Exception { // Parse date string with the format YYYY-MM-DD and extract the year String dateString = record.getField(1, PactString.class).getValue(); int year = Integer.parseInt(dateString.substring(0, 4)); if (year == YEARFILTER) { record.setNull(1); out.collect(record); } }
/**
 * Passes on only those rank records whose rank (field 1) is strictly greater than
 * {@code RANKFILTER}. Matching records are forwarded unchanged.
 *
 * <p>Output Format: 0: URL 1: RANK 2: AVG_DURATION
 *
 * @param record the rank record to test
 * @param out collector receiving the record if it passes the filter
 */
@Override
public void map(PactRecord record, Collector<PactRecord> out) throws Exception {
    final int rank = record.getField(1, PactInteger.class).getValue();
    final boolean aboveThreshold = rank > RANKFILTER;

    if (aboveThreshold) {
        out.collect(record);
    }
}
@Override public PactRecord peek() { if (readNext) { int bytesRead = currentReadRecord.deserialize(segments, CURRENT_READ_SEGMENT_INDEX, currentReadOffset); while (bytesRead > 0) { if (currentReadSegment.size() - currentReadOffset > bytesRead) { currentReadOffset += bytesRead; bytesRead = 0; } else { bytesRead -= (currentReadSegment.size() - currentReadOffset); // Remove old read segment from list & release in memory manager MemorySegment unused = segments.remove(CURRENT_READ_SEGMENT_INDEX); memoryManager.release(unused); // Update reference to new read segment currentReadSegment = segments.get(CURRENT_READ_SEGMENT_INDEX); currentReadOffset = 0; } } readNext = false; } return currentReadRecord; }
/*
 * (non-Javadoc)
 * @see eu.stratosphere.pact.runtime.plugable.TypeAccessors#setReferenceForEquality(java.lang.Object)
 *
 * Reads every key field of the given record into the corresponding key holder. Throws a
 * NullKeyFieldException naming the field position as soon as one key field cannot be read.
 */
@Override
public void setReference(PactRecord toCompare) {
    final int numKeys = this.keyFields.length;

    int i = 0;
    while (i < numKeys) {
        final int fieldPos = this.keyFields[i];
        final boolean fieldPresent = toCompare.getFieldInto(fieldPos, this.keyHolders[i]);
        if (!fieldPresent) {
            throw new NullKeyFieldException(fieldPos);
        }
        i++;
    }
}
/**
 * Determines the smallest count (field 2) among all records of the group and emits one record
 * for every input record that carries this smallest count.
 *
 * <p>Output Format: 0: field 1 of the record that established the minimum (the hashtag id) 1:
 * field 0 of a record holding the minimum (the timestamp) 2: the minimum count
 *
 * @param matches the records of one group
 * @param records collector receiving the result records
 */
@Override
public void reduce(Iterator<PactRecord> matches, Collector<PactRecord> records)
        throws Exception {
    PactInteger lowestTag = null;
    int lowest = -1; // -1 marks "no minimum found yet"
    timestamps.clear();

    while (matches.hasNext()) {
        final PactRecord current = matches.next();
        final int currentCount = current.getField(2, PactInteger.class).getValue();

        if (lowest == -1 || currentCount < lowest) {
            // new minimum: forget the timestamps collected for the previous one
            lowest = currentCount;
            lowestTag = current.getField(1, PactInteger.class);
            timestamps.clear();
        } else if (currentCount != lowest) {
            // larger than the current minimum: not of interest
            continue;
        }

        // record either set or ties the minimum: remember its timestamp
        timestamps.add(current.getField(0, PactString.class));
    }

    if (lowestTag == null) {
        // no input record was seen
        return;
    }

    lowsCount.setValue(lowest);
    for (PactString ts : timestamps) {
        pr2.setField(0, lowestTag);
        pr2.setField(1, ts);
        pr2.setField(2, lowsCount);
        records.collect(pr2);
    }
}
/*
 * (non-Javadoc)
 * @see eu.stratosphere.pact.runtime.plugable.TypeAccessors#equalToReference(java.lang.Object)
 *
 * Compares the key fields of the candidate, one after the other, against the key holders.
 * Returns false at the first key that differs and true if all keys are equal. Throws a
 * NullKeyFieldException naming the field position if a key field of the candidate is null.
 */
@Override
public boolean equalToReference(PactRecord candidate) {
    final int numKeys = this.keyFields.length;

    for (int i = 0; i < numKeys; i++) {
        final int fieldPos = this.keyFields[i];
        final Key candidateKey = candidate.getField(fieldPos, this.transientKeyHolders[i]);

        if (candidateKey == null) {
            throw new NullKeyFieldException(fieldPos);
        }
        if (!candidateKey.equals(this.keyHolders[i])) {
            return false;
        }
    }
    return true;
}
/** * Filters for documents that contain all of the given keywords and projects the records on the * URL field. * * <p>Output Format: 0: URL */ @Override public void map(PactRecord record, Collector<PactRecord> out) throws Exception { // FILTER // Only collect the document if all keywords are contained String docText = record.getField(1, PactString.class).toString(); boolean allContained = true; for (String kw : KEYWORDS) { if (!docText.contains(kw)) { allContained = false; break; } } if (allContained) { record.setNull(1); out.collect(record); } }
/** Computes a minimum aggregation on the distance of a data point to cluster centers. */ @Override public void combine(Iterator<PactRecord> pointsWithDistance, Collector<PactRecord> out) { double nearestDistance = Double.MAX_VALUE; // check all cluster centers while (pointsWithDistance.hasNext()) { PactRecord res = pointsWithDistance.next(); double distance = res.getField(3, PactDouble.class).getValue(); // compare distances if (distance < nearestDistance) { nearestDistance = distance; res.copyTo(this.nearest); } } // emit nearest one out.collect(this.nearest); }
/**
 * Counts how many times the iteration iterator delivers an element and emits that number as
 * field 0 of the result record.
 *
 * @param iterationIter the iterator to drain; each element is read into {@code tc}
 */
@Override
public void runIteration(IterationIterator iterationIter) throws Exception {
    int numElements = 0;

    // pull elements until the iterator reports that there are no more
    for (boolean more = iterationIter.next(tc); more; more = iterationIter.next(tc)) {
        numElements++;
    }

    count.setValue(numElements);
    result.setField(0, count);
    output.collect(result);
}
/* (non-Javadoc) * @see eu.stratosphere.pact.common.recordio.OutputFormat#writeRecord(eu.stratosphere.pact.common.type.PactRecord) */ @Override public void writeRecord(PactRecord record) throws IOException { int numRecFields = record.getNumFields(); int readPos; for (int i = 0; i < this.numFields; i++) { readPos = this.recordPositions[i]; if (readPos < numRecFields) { Value v = record.getField(this.recordPositions[i], this.classes[i]); if (v != null) { if (i != 0) this.wrt.write(this.fieldDelimiter); this.wrt.write(v.toString()); } else { if (this.lenient) { if (i != 0) this.wrt.write(this.fieldDelimiter); } else { throw new RuntimeException( "Cannot serialize record with <null> value at position: " + readPos); } } } else { if (this.lenient) { if (i != 0) this.wrt.write(this.fieldDelimiter); } else { throw new RuntimeException( "Cannot serialize record with out field at position: " + readPos); } } } // add the record delimiter this.wrt.write(this.recordDelimiter); }
public final Key[] getKeysAsCopy(PactRecord record) { try { final Key[] keys = new Key[this.keyFields.length]; for (int i = 0; i < keys.length; i++) { keys[i] = this.keyHolders[i].getClass().newInstance(); } record.getFieldsInto(this.keyFields, keys); return keys; } catch (Exception ex) { // this should never happen, because the classes have been instantiated before. Report for // debugging. throw new RuntimeException( "Could not instantiate key classes when duplicating PactRecordComparator.", ex); } }
/*
 * (non-Javadoc)
 * @see eu.stratosphere.pact.generic.types.TypeComparator#putNormalizedKey(java.lang.Object, byte[], int, int)
 *
 * Writes the normalized keys of the leading normalizable key fields one after the other into
 * the target, starting at the given offset, until either all of those keys are written or
 * numBytes bytes are used up. The last key written is cut to the bytes still available.
 * A NullPointerException raised while processing a key is reported as NullKeyFieldException
 * for that key's field position.
 */
@Override
public void putNormalizedKey(
        PactRecord record, MemorySegment target, int offset, int numBytes) {
    // declared outside the try block so the handler can name the key being processed
    int i = 0;
    try {
        while (i < this.numLeadingNormalizableKeys && numBytes > 0) {
            // never write more than the remaining byte budget
            final int len = Math.min(this.normalizedKeyLengths[i], numBytes);

            final NormalizableKey key =
                    (NormalizableKey)
                            record.getField(this.keyFields[i], this.transientKeyHolders[i]);
            key.copyNormalizedKey(target, offset, len);

            numBytes -= len;
            offset += len;
            i++;
        }
    } catch (NullPointerException npex) {
        throw new NullKeyFieldException(this.keyFields[i]);
    }
}
/* (non-Javadoc) * @see eu.stratosphere.pact.runtime.plugable.TypeAccessors#hash(java.lang.Object) */ @Override public int hash(PactRecord object) { int i = 0; try { int code = 0; for (; i < this.keyFields.length; i++) { code ^= object.getField(this.keyFields[i], this.transientKeyHolders[i]).hashCode(); code *= HASH_SALT[i & 0x1F]; // salt code with (i % HASH_SALT.length)-th salt component } return code; } catch (NullPointerException npex) { throw new NullKeyFieldException(this.keyFields[i]); } catch (IndexOutOfBoundsException iobex) { throw new KeyFieldOutOfBoundsException(this.keyFields[i]); } }
/**
 * Serializes the given record through the current write segment's output view and counts it.
 *
 * <p>If the serialization call put segments into {@code newSegments}, they are appended to
 * the segment list, the last of them becomes the current write segment, and
 * {@code newSegments} is emptied again.
 *
 * @param rec the record to add
 * @return always {@code true}
 * @throws RuntimeException if the serialization fails; the original exception is the cause
 */
@Override
public boolean offer(PactRecord rec) {
    try {
        rec.serialize(null, currentWriteSegment.outputView, allocatingIterator, newSegments);
    } catch (Exception ex) {
        throw new RuntimeException("Bad error during serialization", ex);
    }

    final boolean gotNewSegments = !newSegments.isEmpty();
    if (gotNewSegments) {
        // take over the additional segments and continue writing in the last one
        segments.addAll(newSegments);
        final int lastIndex = segments.size() - 1;
        currentWriteSegment = segments.get(lastIndex);
        newSegments.clear();
    }

    count++;
    return true;
}