public static KSketchIndex createInitialIndex( ClusterSettings settings, PCollection<Pair<Integer, RealVector>> input) { RealVector[] init = new RealVector[settings.getCrossFolds()]; for (Pair<Integer, RealVector> rv : input.materialize()) { if (init[rv.first()] == null) { init[rv.first()] = rv.second(); } boolean done = true; for (RealVector vec : init) { if (vec == null) { done = false; break; } } if (done) { break; } } KSketchIndex index = new KSketchIndex( settings.getCrossFolds(), init[0].getDimension(), settings.getIndexBits(), settings.getIndexSamples(), 1729L); // TODO: something smarter, or figure out that I don't need this b/c I compute // the projections up front for (int i = 0; i < init.length; i++) { index.add(init[i], i); } return index; }
/** {@inheritDoc} */ @Override public void join( K key, int id, Iterable<Pair<U, V>> pairs, Emitter<Pair<K, Pair<U, V>>> emitter) { if (!key.equals(lastKey)) { // Make sure that left side gets emitted. if (0 == lastId && 0 == id) { for (U u : leftValues) { emitter.emit(Pair.of(lastKey, Pair.of(u, (V) null))); } } lastKey = key; leftValues.clear(); } if (id == 0) { for (Pair<U, V> pair : pairs) { if (pair.first() != null) leftValues.add(leftValueType.getDetachedValue(pair.first())); } } else { for (Pair<U, V> pair : pairs) { // Make sure that right side gets emitted. if (leftValues.isEmpty()) { leftValues.add(null); } for (U u : leftValues) { emitter.emit(Pair.of(lastKey, Pair.of(u, pair.second()))); } } } lastId = id; }
@Override public void process( Pair<Pair<Long, Integer>, Iterable<NumericIDValue>> input, Emitter<Pair<Long, NumericIDValue>> emitter) { Pair<Long, Integer> key = input.first(); long currentUserID = key.first(); if (key.second() == BEFORE) { // Last old data had no match, just output it if (previousUserPrefs != null) { Preconditions.checkNotNull(previousUserID); output(previousUserID, previousUserPrefs, null, null, emitter); previousUserPrefs = null; previousUserID = null; } LongFloatMap oldPrefs = new LongFloatMap(); for (NumericIDValue itemPref : input.second()) { float oldPrefValue = itemPref.getValue(); Preconditions.checkState(!Float.isNaN(oldPrefValue), "No prior pref value?"); // Apply decay factor here, if applicable: oldPrefs.increment(itemPref.getID(), doDecay ? oldPrefValue * decayFactor : oldPrefValue); } previousUserPrefs = oldPrefs; previousUserID = currentUserID; } else { // Last old data had no match, just output it if (previousUserPrefs != null && currentUserID != previousUserID) { Preconditions.checkNotNull(previousUserID); output(previousUserID, previousUserPrefs, null, null, emitter); previousUserPrefs = null; previousUserID = null; } LongFloatMap newPrefs = new LongFloatMap(); LongSet removedItemIDs = new LongSet(); for (NumericIDValue itemPref : input.second()) { long itemID = itemPref.getID(); float newPrefValue = itemPref.getValue(); if (Float.isNaN(newPrefValue)) { removedItemIDs.add(itemID); } else { newPrefs.increment(itemID, newPrefValue); } } output(currentUserID, previousUserPrefs, newPrefs, removedItemIDs, emitter); previousUserPrefs = null; previousUserID = null; } }
/**
 * Renders an (ID, map) pair as one tab-separated line: the ID's string form, a tab, then the
 * result of {@code setToString} applied to the map.
 */
@Override
public String map(Pair<Long, LongFloatMap> input) {
  String idColumn = input.first().toString();
  String valuesColumn = setToString(input.second());
  return idColumn + '\t' + valuesColumn;
}
/**
 * Produces a detached copy of a table entry by detaching its key and its value separately,
 * each through the corresponding component type of the table type.
 *
 * @param tableType the table type whose key type and value type perform the detaching
 * @param value the (key, value) pair to detach; must not be {@code null}
 * @return a new pair made of the detached key and the detached value
 * @see PType#getDetachedValue(Object)
 */
public static <K, V> Pair<K, V> getDetachedValue(PTableType<K, V> tableType, Pair<K, V> value) {
  K keyCopy = tableType.getKeyType().getDetachedValue(value.first());
  V valueCopy = tableType.getValueType().getDetachedValue(value.second());
  return Pair.of(keyCopy, valueCopy);
}