public static KSketchIndex createInitialIndex( ClusterSettings settings, PCollection<Pair<Integer, RealVector>> input) { RealVector[] init = new RealVector[settings.getCrossFolds()]; for (Pair<Integer, RealVector> rv : input.materialize()) { if (init[rv.first()] == null) { init[rv.first()] = rv.second(); } boolean done = true; for (RealVector vec : init) { if (vec == null) { done = false; break; } } if (done) { break; } } KSketchIndex index = new KSketchIndex( settings.getCrossFolds(), init[0].getDimension(), settings.getIndexBits(), settings.getIndexSamples(), 1729L); // TODO: something smarter, or figure out that I don't need this b/c I compute // the projections up front for (int i = 0; i < init.length; i++) { index.add(init[i], i); } return index; }
/**
 * Checks the 0.9 and 0.99 quantiles over ten evenly spaced values, comparing the
 * distributed and in-memory implementations against the same expected result.
 */
@Test
public void testQuantilesNines() {
  PTable<String, Integer> testTable =
      MemPipeline.typedTableOf(
          tableOf(strings(), ints()),
          "a", 10, "a", 20, "a", 30, "a", 40, "a", 50,
          "a", 60, "a", 70, "a", 80, "a", 90, "a", 100);

  Map<String, Result<Integer>> distributedResult =
      Quantiles.distributed(testTable, 0.9, 0.99).materializeToMap();
  Map<String, Result<Integer>> inMemoryResult =
      Quantiles.inMemory(testTable, 0.9, 0.99).materializeToMap();

  Map<String, Result<Integer>> expected =
      ImmutableMap.of("a", result(10, Pair.of(0.9, 90), Pair.of(0.99, 100)));
  assertEquals(expected, distributedResult);
  assertEquals(expected, inMemoryResult);
}
/**
 * {@inheritDoc}
 *
 * <p>Flushes left-side values still buffered for the final key: if the last record
 * processed came from the left relation (id 0), those values never met a matching
 * right-side record, so each is emitted paired with a null right value.
 */
@Override
public void cleanup(Emitter<Pair<K, Pair<U, V>>> emitter) {
  if (lastId == 0) {
    for (U leftValue : leftValues) {
      emitter.emit(Pair.of(lastKey, Pair.of(leftValue, (V) null)));
    }
  }
}
/**
 * {@inheritDoc}
 *
 * <p>Streams over records grouped by key with the left relation (id 0) arriving
 * before the right (id 1) for each key. Left values are buffered (detached) in
 * {@code leftValues}; each right value is emitted against every buffered left
 * value. State ({@code lastKey}, {@code lastId}, {@code leftValues}) persists
 * across calls, so ordering of calls is load-bearing here.
 */
/** {@inheritDoc} */ @Override public void join( K key, int id, Iterable<Pair<U, V>> pairs, Emitter<Pair<K, Pair<U, V>>> emitter) {
  if (!key.equals(lastKey)) {
    // Key changed: if the previous key ended on the left side (lastId == 0), its
    // buffered left values never saw a right-side match — emit them with a null
    // right value.
    // NOTE(review): the extra `0 == id` guard means those buffered values are
    // dropped when the new key starts with a right-side record (id == 1); verify
    // this is intentional for the join semantics implemented here.
    if (0 == lastId && 0 == id) {
      for (U u : leftValues) {
        emitter.emit(Pair.of(lastKey, Pair.of(u, (V) null)));
      }
    }
    lastKey = key;
    leftValues.clear();
  }
  if (id == 0) {
    // Left side: buffer detached copies (the runtime may reuse `pair` objects).
    for (Pair<U, V> pair : pairs) {
      if (pair.first() != null) leftValues.add(leftValueType.getDetachedValue(pair.first()));
    }
  } else {
    for (Pair<U, V> pair : pairs) {
      // Right side with no buffered left values: seed the buffer with a single
      // null so the right value still gets emitted (as (null, right)).
      if (leftValues.isEmpty()) {
        leftValues.add(null);
      }
      // Cross-product: every buffered left value joins with this right value.
      for (U u : leftValues) {
        emitter.emit(Pair.of(lastKey, Pair.of(u, pair.second())));
      }
    }
  }
  lastId = id;
}
/**
 * Merges a user's previous-generation ("old", marker BEFORE) preference values with
 * the current generation's ("new") values. Keys are (userID, marker) pairs and old
 * data for a user arrives before new data; state ({@code previousUserID},
 * {@code previousUserPrefs}) is buffered across calls so that consecutive old/new
 * groups for the same user can be combined by {@code output(...)}.
 */
@Override public void process( Pair<Pair<Long, Integer>, Iterable<NumericIDValue>> input, Emitter<Pair<Long, NumericIDValue>> emitter) {
  Pair<Long, Integer> key = input.first();
  long currentUserID = key.first();
  if (key.second() == BEFORE) {
    // Starting a new user's old data: first flush any previously buffered user,
    // whose old prefs evidently had no matching new data.
    if (previousUserPrefs != null) {
      Preconditions.checkNotNull(previousUserID);
      output(previousUserID, previousUserPrefs, null, null, emitter);
      previousUserPrefs = null;
      previousUserID = null;
    }
    LongFloatMap oldPrefs = new LongFloatMap();
    for (NumericIDValue itemPref : input.second()) {
      float oldPrefValue = itemPref.getValue();
      // Old data should never carry NaN (NaN marks removals in new data only).
      Preconditions.checkState(!Float.isNaN(oldPrefValue), "No prior pref value?");
      // Apply decay factor here, if applicable:
      oldPrefs.increment(itemPref.getID(), doDecay ? oldPrefValue * decayFactor : oldPrefValue);
    }
    // Buffer; matching new-generation data (if any) arrives in a later call.
    previousUserPrefs = oldPrefs;
    previousUserID = currentUserID;
  } else {
    // New-generation data. If the buffered old prefs belong to a different user,
    // that user had no new data — flush them unchanged.
    if (previousUserPrefs != null && currentUserID != previousUserID) {
      Preconditions.checkNotNull(previousUserID);
      output(previousUserID, previousUserPrefs, null, null, emitter);
      previousUserPrefs = null;
      previousUserID = null;
    }
    LongFloatMap newPrefs = new LongFloatMap();
    LongSet removedItemIDs = new LongSet();
    for (NumericIDValue itemPref : input.second()) {
      long itemID = itemPref.getID();
      float newPrefValue = itemPref.getValue();
      if (Float.isNaN(newPrefValue)) {
        // NaN value marks an explicit removal of this item's preference.
        removedItemIDs.add(itemID);
      } else {
        newPrefs.increment(itemID, newPrefValue);
      }
    }
    // Combine buffered old prefs (may be null for a brand-new user) with new data.
    output(currentUserID, previousUserPrefs, newPrefs, removedItemIDs, emitter);
    previousUserPrefs = null;
    previousUserID = null;
  }
}
private void output( long userID, LongFloatMap oldPrefs, LongFloatMap newPrefs, LongSet removedItemIDs, Emitter<Pair<Long, NumericIDValue>> emitter) { // Old prefs may be null when there is no previous generation, for example, or the user is new. // First, write out existing prefs, possibly updated by new values if (oldPrefs != null && !oldPrefs.isEmpty()) { for (LongFloatMap.MapEntry entry : oldPrefs.entrySet()) { long itemID = entry.getKey(); float oldPrefValue = entry.getValue(); Preconditions.checkState(!Float.isNaN(oldPrefValue), "No prior pref value?"); // May be NaN if no new data at all, or new data has no update: float sum = oldPrefValue; if (newPrefs != null) { float newPrefValue = newPrefs.get(itemID); if (!Float.isNaN(newPrefValue)) { sum += newPrefValue; } } boolean remove = false; if (removedItemIDs != null && removedItemIDs.contains(itemID)) { remove = true; } else if (FastMath.abs(sum) <= zeroThreshold) { remove = true; } if (!remove) { emitter.emit(Pair.of(userID, new NumericIDValue(itemID, sum))); } } } // Now output new data, that didn't exist in old prefs if (newPrefs != null && !newPrefs.isEmpty()) { for (LongFloatMap.MapEntry entry : newPrefs.entrySet()) { long itemID = entry.getKey(); if (oldPrefs == null || !oldPrefs.containsKey(itemID)) { // It wasn't already written. If it exists in newPrefs, it's also not removed float newPrefValue = entry.getValue(); if (FastMath.abs(newPrefValue) > zeroThreshold) { emitter.emit(Pair.of(userID, new NumericIDValue(itemID, newPrefValue))); } } } } }
/**
 * Checks quantiles that land exactly on elements (min, median, max) for both the
 * distributed and in-memory implementations.
 */
@Test
public void testQuantilesExact() {
  PTable<String, Integer> testTable =
      MemPipeline.typedTableOf(
          tableOf(strings(), ints()),
          "a", 5, "a", 2, "a", 3, "a", 4, "a", 1);

  Map<String, Result<Integer>> distributedResult =
      Quantiles.distributed(testTable, 0, 0.5, 1.0).materializeToMap();
  Map<String, Result<Integer>> inMemoryResult =
      Quantiles.inMemory(testTable, 0, 0.5, 1.0).materializeToMap();

  Map<String, Result<Integer>> expected =
      ImmutableMap.of("a", result(5, Pair.of(0.0, 1), Pair.of(0.5, 3), Pair.of(1.0, 5)));
  assertEquals(expected, distributedResult);
  assertEquals(expected, inMemoryResult);
}
/**
 * Creates a detached value for a {@link PGroupedTable} value by detaching the key
 * and each element of the grouped iterable.
 *
 * @param groupedTableType The grouped table type
 * @param value The value from which a detached value is to be created
 * @return The detached value
 * @see PType#getDetachedValue(Object)
 */
public static <K, V> Pair<K, Iterable<V>> getGroupedDetachedValue(
    PGroupedTableType<K, V> groupedTableType, Pair<K, Iterable<V>> value) {
  PTableType<K, V> tableType = groupedTableType.getTableType();
  PType<V> valueType = tableType.getValueType();
  // Detach every element into a fresh list so the result does not alias runtime-reused objects.
  List<V> detachedValues = Lists.newArrayList();
  for (V element : value.second()) {
    detachedValues.add(valueType.getDetachedValue(element));
  }
  K detachedKey = tableType.getKeyType().getDetachedValue(value.first());
  return Pair.of(detachedKey, (Iterable<V>) detachedValues);
}
/**
 * Selects the top {@code limit} pairs by value and emits them best-first under
 * the single partition key 0.
 *
 * <p>A bounded priority queue keeps the running top candidates; the queue is then
 * sorted ascending and emitted in reverse so the best value comes out first.
 */
@Override
public void process(
    Pair<Integer, Iterable<Pair<K, V>>> input, Emitter<Pair<Integer, Pair<K, V>>> emitter) {
  Comparator<Pair<K, V>> valueOrder = new PairValueComparator<K, V>(maximize);
  PriorityQueue<Pair<K, V>> top = new PriorityQueue<Pair<K, V>>(limit, valueOrder);
  for (Pair<K, V> candidate : input.second()) {
    top.add(candidate);
    if (top.size() > limit) {
      top.poll(); // evict the current worst so the queue never exceeds `limit`
    }
  }
  List<Pair<K, V>> ordered = Lists.newArrayList(top);
  Collections.sort(ordered, valueOrder);
  for (int i = ordered.size() - 1; i >= 0; i--) {
    emitter.emit(Pair.of(0, ordered.get(i)));
  }
}
@Test public void testQuantilesBetween() { PTable<String, Integer> testTable = MemPipeline.typedTableOf( tableOf(strings(), ints()), "a", 5, "a", 2, // We expect the 0.5 to correspond to this element, according to the "nearest rank" // %ile definition. "a", 4, "a", 1); Map<String, Result<Integer>> actualS = Quantiles.distributed(testTable, 0.5).materializeToMap(); Map<String, Result<Integer>> actualM = Quantiles.inMemory(testTable, 0.5).materializeToMap(); Map<String, Result<Integer>> expected = ImmutableMap.of("a", result(4, Pair.of(0.5, 2))); assertEquals(expected, actualS); assertEquals(expected, actualM); }
/**
 * Ungroups a grouped entry: emits one (key, value) pair per element of the
 * grouped iterable.
 */
@Override
public void process(Pair<K, Iterable<V>> input, Emitter<Pair<K, V>> emitter) {
  K key = input.first();
  for (V value : input.second()) {
    emitter.emit(Pair.of(key, value));
  }
}
/**
 * Exercises a chained mapside LEFT_OUTER_JOIN: customers joined to orders, then the
 * concatenated result joined to an upper-cased copy of the orders table. Customer 444
 * has no orders and must survive both joins with null right sides.
 *
 * @param pipeline the pipeline implementation under test
 * @param inMemory true when running on MemPipeline (stage counts are not asserted)
 * @param materialize whether the join strategy materializes its mapside input
 *     (adds one extra stage when it does)
 */
private void runMapsideLeftOuterJoin(Pipeline pipeline, boolean inMemory, boolean materialize) {
  PTable<Integer, String> customerTable = readTable(pipeline, "customers.txt");
  PTable<Integer, String> orderTable = readTable(pipeline, "orders.txt");
  JoinStrategy<Integer, String, String> mapsideJoin =
      new MapsideJoinStrategy<Integer, String, String>(materialize);
  PTable<Integer, String> custOrders =
      mapsideJoin
          .join(customerTable, orderTable, JoinType.LEFT_OUTER_JOIN)
          .mapValues("concat", new ConcatValuesFn(), Writables.strings());
  // Renamed from `ORDER_TABLE`: UPPER_SNAKE_CASE is reserved for constants, not locals.
  PTable<Integer, String> capitalizedOrders =
      orderTable.mapValues(new CapOrdersFn(), orderTable.getValueType());
  PTable<Integer, Pair<String, String>> joined =
      mapsideJoin.join(custOrders, capitalizedOrders, JoinType.LEFT_OUTER_JOIN);
  List<Pair<Integer, Pair<String, String>>> expectedJoinResult = Lists.newArrayList();
  expectedJoinResult.add(Pair.of(111, Pair.of("[John Doe,Corn flakes]", "CORN FLAKES")));
  expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PAPER")));
  expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PLUNGER")));
  expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PAPER")));
  expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PLUNGER")));
  expectedJoinResult.add(Pair.of(333, Pair.of("[Someone Else,Toilet brush]", "TOILET BRUSH")));
  // Left-outer semantics: the order-less customer appears with a null right side.
  expectedJoinResult.add(Pair.of(444, Pair.<String, String>of("[Has No Orders,null]", null)));
  Iterable<Pair<Integer, Pair<String, String>>> iter = joined.materialize();
  PipelineResult res = pipeline.run();
  if (!inMemory) {
    // Materializing the mapside input costs one additional stage.
    assertEquals(materialize ? 2 : 1, res.getStageResults().size());
  }
  List<Pair<Integer, Pair<String, String>>> joinedResultList = Lists.newArrayList(iter);
  Collections.sort(joinedResultList);
  assertEquals(expectedJoinResult, joinedResultList);
}
/**
 * Parses one pipe-delimited record line into an (id, name) pair.
 * Expects at least two fields: a numeric id followed by a string value.
 */
@Override
public Pair<Integer, String> map(String input) {
  String[] fields = input.split("\\|");
  Integer id = Integer.parseInt(fields[0]);
  return Pair.of(id, fields[1]);
}
/**
 * Compares two pairs by their second element's natural ordering, reversed when
 * {@code ascending} is false.
 *
 * <p>Fix: the previous implementation negated the comparison result ({@code -cmp})
 * for the descending case, which overflows (and yields the wrong sign) when
 * {@code compareTo} returns {@code Integer.MIN_VALUE}. Swapping the operands
 * reverses the order safely for any contract-abiding {@link Comparable}.
 */
@Override
public int compare(Pair<K, V> left, Pair<K, V> right) {
  if (ascending) {
    return ((Comparable<V>) left.second()).compareTo(right.second());
  }
  return ((Comparable<V>) right.second()).compareTo(left.second());
}
/** Flushes every buffered pair, each emitted under the single partition key 0. */
public void cleanup(Emitter<Pair<Integer, Pair<K, V>>> emitter) {
  for (Pair<K, V> buffered : values) {
    emitter.emit(Pair.of(0, buffered));
  }
}
/**
 * Creates a detached value for a table {@link Pair}, detaching the key and the
 * value through their respective {@link PType}s.
 *
 * @param tableType The table type
 * @param value The value from which a detached value is to be created
 * @return The detached value
 * @see PType#getDetachedValue(Object)
 */
public static <K, V> Pair<K, V> getDetachedValue(PTableType<K, V> tableType, Pair<K, V> value) {
  K detachedKey = tableType.getKeyType().getDetachedValue(value.first());
  V detachedValue = tableType.getValueType().getDetachedValue(value.second());
  return Pair.of(detachedKey, detachedValue);
}
/** Renders the pair via its {@code toString()} representation. */
@Override
public String map(Pair<String, String> v) {
  String rendered = v.toString();
  return rendered;
}
/** Wraps each entity in a (entity, null) pair so it can flow through keyed stages. */
@Override
public void process(E entity, Emitter<Pair<E, Void>> emitter) {
  Pair<E, Void> keyed = Pair.of(entity, (Void) null);
  emitter.emit(keyed);
}
/** Formats one record as tab-separated text: "&lt;id&gt;\t&lt;serialized map&gt;". */
@Override
public String map(Pair<Long, LongFloatMap> input) {
  StringBuilder line = new StringBuilder();
  line.append(input.first().toString()).append('\t').append(setToString(input.second()));
  return line.toString();
}