Пример #1
0
  /**
   * Builds the initial {@link KSketchIndex} from the first vector that the input assigns to each
   * cross-validation fold.
   *
   * <p>The input is scanned only until every fold has received a vector; later vectors for a fold
   * that is already filled are ignored.
   *
   * @param settings supplies the number of cross folds and the index bits/samples
   * @param input pairs of (fold id, vector); the fold id is used directly as an array index
   * @return an index to which one vector per fold has been added under the fold's id
   * @throws IllegalStateException if the input contains no vector for at least one fold
   */
  public static KSketchIndex createInitialIndex(
      ClusterSettings settings, PCollection<Pair<Integer, RealVector>> input) {
    RealVector[] init = new RealVector[settings.getCrossFolds()];
    // Track how many folds are still empty rather than rescanning the array after every record.
    int missing = init.length;
    for (Pair<Integer, RealVector> rv : input.materialize()) {
      int fold = rv.first();
      if (init[fold] == null && rv.second() != null) {
        init[fold] = rv.second();
        missing--;
      }
      if (missing == 0) {
        break;
      }
    }

    // Fail with a meaningful message instead of a bare NullPointerException further down.
    if (missing > 0) {
      throw new IllegalStateException(
          "No initial vector found for " + missing + " of " + init.length + " cross folds");
    }

    KSketchIndex index =
        new KSketchIndex(
            settings.getCrossFolds(),
            init[0].getDimension(),
            settings.getIndexBits(),
            settings.getIndexSamples(),
            1729L); // TODO: something smarter, or figure out that I don't need this b/c I compute
                    // the projections up front
    for (int i = 0; i < init.length; i++) {
      index.add(init[i], i);
    }
    return index;
  }
Пример #2
0
  /**
   * Requests the 0.9 and 0.99 quantiles over the values 10, 20, ..., 100 stored under key "a" and
   * checks that the distributed and in-memory implementations both produce
   * {@code result(10, (0.9, 90), (0.99, 100))}.
   */
  @Test
  public void testQuantilesNines() {
    Map<String, Result<Integer>> expected =
        ImmutableMap.of("a", result(10, Pair.of(0.9, 90), Pair.of(0.99, 100)));

    // Ten entries, all under the same key.
    PTable<String, Integer> testTable =
        MemPipeline.typedTableOf(
            tableOf(strings(), ints()),
            "a", 10,
            "a", 20,
            "a", 30,
            "a", 40,
            "a", 50,
            "a", 60,
            "a", 70,
            "a", 80,
            "a", 90,
            "a", 100);

    Map<String, Result<Integer>> distributedResult =
        Quantiles.distributed(testTable, 0.9, 0.99).materializeToMap();
    Map<String, Result<Integer>> inMemoryResult =
        Quantiles.inMemory(testTable, 0.9, 0.99).materializeToMap();

    assertEquals(expected, distributedResult);
    assertEquals(expected, inMemoryResult);
  }
Пример #3
0
 /**
  * {@inheritDoc}
  *
  * <p>When the last recorded id is 0, every buffered left value is emitted under the last key,
  * paired with a null right value.
  */
 @Override
 public void cleanup(Emitter<Pair<K, Pair<U, V>>> emitter) {
   // Nothing to flush unless the last recorded id is 0.
   if (lastId != 0) {
     return;
   }
   for (U bufferedLeft : leftValues) {
     emitter.emit(Pair.of(lastKey, Pair.of(bufferedLeft, (V) null)));
   }
 }
Пример #4
0
  /**
   * {@inheritDoc}
   *
   * <p>Left-side values (id 0) are buffered per key. Every right-side value is emitted once per
   * buffered left value, or once with a null left value when nothing is buffered. When the key
   * changes, left values of the previous key that were never paired with a right side are emitted
   * with a null right value before the buffer is cleared.
   *
   * @param key the join key of this group
   * @param id 0 for a left-side group; any other value is treated as a right-side group
   * @param pairs the values of the group; only {@code first()} is read for the left side and only
   *     {@code second()} for the right side
   * @param emitter receives the joined output
   */
  @Override
  public void join(
      K key, int id, Iterable<Pair<U, V>> pairs, Emitter<Pair<K, Pair<U, V>>> emitter) {
    if (!key.equals(lastKey)) {
      // lastId == 0 means the previous key ended on its left-side group, so its buffered values
      // were never paired. They have to be flushed regardless of which side the new key starts
      // with; otherwise they are silently lost when the buffer is cleared below.
      if (0 == lastId) {
        for (U u : leftValues) {
          emitter.emit(Pair.of(lastKey, Pair.of(u, (V) null)));
        }
      }
      lastKey = key;
      leftValues.clear();
    }
    if (id == 0) {
      for (Pair<U, V> pair : pairs) {
        if (pair.first() != null) {
          leftValues.add(leftValueType.getDetachedValue(pair.first()));
        }
      }
    } else {
      for (Pair<U, V> pair : pairs) {
        // Make sure that right side gets emitted even when there is no left value for this key.
        if (leftValues.isEmpty()) {
          leftValues.add(null);
        }
        for (U u : leftValues) {
          emitter.emit(Pair.of(lastKey, Pair.of(u, pair.second())));
        }
      }
    }

    lastId = id;
  }
Пример #5
0
  /**
   * Combines a user's old preference values with that user's new preference data.
   *
   * <p>A record whose key's second component equals {@code BEFORE} carries old values: they are
   * buffered (scaled by {@code decayFactor} when {@code doDecay} is set) until the next record
   * arrives. Any other record carries new values, where a NaN value marks the item as removed;
   * it is written out together with whatever old values are still buffered at that point.
   * Buffered old values that turn out to have no matching new record are written out unchanged.
   *
   * @param input ((user ID, ordering marker), item values for that user)
   * @param emitter receives the merged (user ID, item value) pairs
   */
  @Override
  public void process(
      Pair<Pair<Long, Integer>, Iterable<NumericIDValue>> input,
      Emitter<Pair<Long, NumericIDValue>> emitter) {
    Pair<Long, Integer> key = input.first();
    long currentUserID = key.first();

    if (key.second() == BEFORE) {
      // Old values still buffered from an earlier record never met new data; emit them as they are.
      if (previousUserPrefs != null) {
        Preconditions.checkNotNull(previousUserID);
        output(previousUserID, previousUserPrefs, null, null, emitter);
        previousUserPrefs = null;
        previousUserID = null;
      }

      LongFloatMap bufferedOldPrefs = new LongFloatMap();
      for (NumericIDValue oldPref : input.second()) {
        float oldValue = oldPref.getValue();
        Preconditions.checkState(!Float.isNaN(oldValue), "No prior pref value?");
        // The decay factor, when enabled, is applied while buffering.
        bufferedOldPrefs.increment(oldPref.getID(), doDecay ? oldValue * decayFactor : oldValue);
      }

      previousUserPrefs = bufferedOldPrefs;
      previousUserID = currentUserID;
      return;
    }

    // New data: buffered old values belonging to a different user have no match, so emit them.
    boolean bufferedForOtherUser = previousUserPrefs != null && currentUserID != previousUserID;
    if (bufferedForOtherUser) {
      Preconditions.checkNotNull(previousUserID);
      output(previousUserID, previousUserPrefs, null, null, emitter);
      previousUserPrefs = null;
      previousUserID = null;
    }

    LongFloatMap addedPrefs = new LongFloatMap();
    LongSet removedItemIDs = new LongSet();
    for (NumericIDValue newPref : input.second()) {
      long itemID = newPref.getID();
      float newValue = newPref.getValue();
      if (!Float.isNaN(newValue)) {
        addedPrefs.increment(itemID, newValue);
      } else {
        // NaN marks the item as removed.
        removedItemIDs.add(itemID);
      }
    }

    output(currentUserID, previousUserPrefs, addedPrefs, removedItemIDs, emitter);

    previousUserPrefs = null;
    previousUserID = null;
  }
Пример #6
0
  /**
   * Emits the merged preferences of one user.
   *
   * <p>First every old item is written with its old value plus the new value for the same item
   * (if any), unless the item is listed in {@code removedItemIDs} or the sum's magnitude does not
   * exceed {@code zeroThreshold}. Then every new item that has no old entry is written, provided
   * its magnitude exceeds {@code zeroThreshold}.
   *
   * @param userID the user the values belong to
   * @param oldPrefs previous values by item ID; may be null
   * @param newPrefs new values by item ID; may be null
   * @param removedItemIDs items to drop from the old values; may be null
   * @param emitter receives the (user ID, item value) pairs
   */
  private void output(
      long userID,
      LongFloatMap oldPrefs,
      LongFloatMap newPrefs,
      LongSet removedItemIDs,
      Emitter<Pair<Long, NumericIDValue>> emitter) {
    // Pass 1: existing items, possibly adjusted by new values.
    if (oldPrefs != null && !oldPrefs.isEmpty()) {
      for (LongFloatMap.MapEntry oldEntry : oldPrefs.entrySet()) {
        long itemID = oldEntry.getKey();
        float oldValue = oldEntry.getValue();
        Preconditions.checkState(!Float.isNaN(oldValue), "No prior pref value?");

        float merged = oldValue;
        if (newPrefs != null) {
          // The lookup is treated as "no update" when it yields NaN.
          float update = newPrefs.get(itemID);
          if (!Float.isNaN(update)) {
            merged += update;
          }
        }

        boolean explicitlyRemoved = removedItemIDs != null && removedItemIDs.contains(itemID);
        if (explicitlyRemoved || FastMath.abs(merged) <= zeroThreshold) {
          continue;
        }
        emitter.emit(Pair.of(userID, new NumericIDValue(itemID, merged)));
      }
    }

    // Pass 2: items that appear only in the new data.
    if (newPrefs == null || newPrefs.isEmpty()) {
      return;
    }
    for (LongFloatMap.MapEntry newEntry : newPrefs.entrySet()) {
      long itemID = newEntry.getKey();
      if (oldPrefs != null && oldPrefs.containsKey(itemID)) {
        // Already handled in pass 1.
        continue;
      }
      float newValue = newEntry.getValue();
      if (FastMath.abs(newValue) > zeroThreshold) {
        emitter.emit(Pair.of(userID, new NumericIDValue(itemID, newValue)));
      }
    }
  }
Пример #7
0
  /**
   * Requests the 0, 0.5 and 1.0 quantiles over the values 5, 2, 3, 4, 1 stored under key "a" and
   * checks that the distributed and in-memory implementations both produce
   * {@code result(5, (0.0, 1), (0.5, 3), (1.0, 5))}.
   */
  @Test
  public void testQuantilesExact() {
    Map<String, Result<Integer>> expected =
        ImmutableMap.of("a", result(5, Pair.of(0.0, 1), Pair.of(0.5, 3), Pair.of(1.0, 5)));

    PTable<String, Integer> testTable =
        MemPipeline.typedTableOf(
            tableOf(strings(), ints()),
            "a", 5,
            "a", 2,
            "a", 3,
            "a", 4,
            "a", 1);

    Map<String, Result<Integer>> distributedResult =
        Quantiles.distributed(testTable, 0, 0.5, 1.0).materializeToMap();
    Map<String, Result<Integer>> inMemoryResult =
        Quantiles.inMemory(testTable, 0, 0.5, 1.0).materializeToMap();

    assertEquals(expected, distributedResult);
    assertEquals(expected, inMemoryResult);
  }
Пример #8
0
  /**
   * Creates a detached value for a {@link PGroupedTable} value.
   *
   * <p>Each grouped value is detached through the table's value type and collected into a list;
   * the key is detached through the table's key type.
   *
   * @param groupedTableType The grouped table type
   * @param value The (key, values) pair from which a detached value is to be created
   * @return The detached key paired with the list of detached values
   * @see PType#getDetachedValue(Object)
   */
  public static <K, V> Pair<K, Iterable<V>> getGroupedDetachedValue(
      PGroupedTableType<K, V> groupedTableType, Pair<K, Iterable<V>> value) {

    PTableType<K, V> tableType = groupedTableType.getTableType();
    PType<V> valueType = tableType.getValueType();

    List<V> detachedValues = Lists.newArrayList();
    for (V element : value.second()) {
      detachedValues.add(valueType.getDetachedValue(element));
    }

    K detachedKey = tableType.getKeyType().getDetachedValue(value.first());
    // Widen to Iterable so the pair is inferred with the declared return type.
    Iterable<V> detachedIterable = detachedValues;
    return Pair.of(detachedKey, detachedIterable);
  }
Пример #9
0
    /**
     * Retains at most {@code limit} pairs of the grouped input and emits them under key 0.
     *
     * <p>Pairs are pushed through a priority queue ordered by a {@code PairValueComparator}; once
     * the queue grows past {@code limit} its head is dropped. The survivors are sorted with the
     * same comparator and emitted from the last sorted element to the first.
     *
     * @param input the grouped pairs to select from; the group key itself is not read
     * @param emitter receives the retained pairs, each keyed by 0
     */
    @Override
    public void process(
        Pair<Integer, Iterable<Pair<K, V>>> input, Emitter<Pair<Integer, Pair<K, V>>> emitter) {
      Comparator<Pair<K, V>> ordering = new PairValueComparator<K, V>(maximize);
      PriorityQueue<Pair<K, V>> retained = new PriorityQueue<Pair<K, V>>(limit, ordering);
      for (Pair<K, V> candidate : input.second()) {
        retained.add(candidate);
        // Over capacity: drop the queue head, i.e. the least element under the ordering.
        if (retained.size() > limit) {
          retained.poll();
        }
      }

      List<Pair<K, V>> sorted = Lists.newArrayList(retained);
      Collections.sort(sorted, ordering);
      // Walk the sorted list backwards.
      int position = sorted.size();
      while (position > 0) {
        position--;
        emitter.emit(Pair.of(0, sorted.get(position)));
      }
    }
Пример #10
0
  /**
   * Requests the 0.5 quantile over the values 5, 2, 4, 1 stored under key "a". With an even
   * number of elements the test expects the value 2, following the "nearest rank" percentile
   * definition, from both the distributed and the in-memory implementation.
   */
  @Test
  public void testQuantilesBetween() {
    Map<String, Result<Integer>> expected = ImmutableMap.of("a", result(4, Pair.of(0.5, 2)));

    PTable<String, Integer> testTable =
        MemPipeline.typedTableOf(
            tableOf(strings(), ints()),
            "a", 5,
            "a", 2, // the element the 0.5 quantile is expected to map to
            "a", 4,
            "a", 1);

    Map<String, Result<Integer>> distributedResult =
        Quantiles.distributed(testTable, 0.5).materializeToMap();
    Map<String, Result<Integer>> inMemoryResult =
        Quantiles.inMemory(testTable, 0.5).materializeToMap();

    assertEquals(expected, distributedResult);
    assertEquals(expected, inMemoryResult);
  }
Пример #11
0
 /**
  * Flattens a grouped entry by emitting one (key, value) pair for each value in the group.
  *
  * @param input the key together with its grouped values
  * @param emitter receives the individual (key, value) pairs
  */
 @Override
 public void process(Pair<K, Iterable<V>> input, Emitter<Pair<K, V>> emitter) {
   K groupKey = input.first();
   for (V groupedValue : input.second()) {
     emitter.emit(Pair.of(groupKey, groupedValue));
   }
 }
Пример #12
0
  /**
   * Runs two chained map-side left outer joins over the customer and order tables and compares
   * the sorted output with a fixed expectation.
   *
   * <p>The first join pairs customers with orders and renders each pair through
   * {@code ConcatValuesFn}; the second joins that result with the orders mapped through
   * {@code CapOrdersFn}.
   *
   * @param pipeline the pipeline the tables are read from and that is run
   * @param inMemory when {@code false}, the number of stage results is asserted as well
   * @param materialize passed to the {@link MapsideJoinStrategy}; 2 stages are expected when
   *     {@code true}, 1 otherwise
   */
  private void runMapsideLeftOuterJoin(Pipeline pipeline, boolean inMemory, boolean materialize) {
    PTable<Integer, String> customers = readTable(pipeline, "customers.txt");
    PTable<Integer, String> orders = readTable(pipeline, "orders.txt");

    JoinStrategy<Integer, String, String> strategy =
        new MapsideJoinStrategy<Integer, String, String>(materialize);

    // First join: customer with order, flattened to a single string per row.
    PTable<Integer, Pair<String, String>> customersWithOrders =
        strategy.join(customers, orders, JoinType.LEFT_OUTER_JOIN);
    PTable<Integer, String> concatenated =
        customersWithOrders.mapValues("concat", new ConcatValuesFn(), Writables.strings());

    // Second join: the flattened rows against the transformed orders.
    PTable<Integer, String> cappedOrders =
        orders.mapValues(new CapOrdersFn(), orders.getValueType());
    PTable<Integer, Pair<String, String>> joined =
        strategy.join(concatenated, cappedOrders, JoinType.LEFT_OUTER_JOIN);

    // Request materialization before the pipeline is run.
    Iterable<Pair<Integer, Pair<String, String>>> materialized = joined.materialize();

    PipelineResult runResult = pipeline.run();
    if (!inMemory) {
      int expectedStages = materialize ? 2 : 1;
      assertEquals(expectedStages, runResult.getStageResults().size());
    }

    List<Pair<Integer, Pair<String, String>>> actual = Lists.newArrayList(materialized);
    Collections.sort(actual);

    List<Pair<Integer, Pair<String, String>>> expected = Lists.newArrayList();
    expected.add(Pair.of(111, Pair.of("[John Doe,Corn flakes]", "CORN FLAKES")));
    expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PAPER")));
    expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PLUNGER")));
    expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PAPER")));
    expected.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PLUNGER")));
    expected.add(Pair.of(333, Pair.of("[Someone Else,Toilet brush]", "TOILET BRUSH")));
    expected.add(Pair.of(444, Pair.<String, String>of("[Has No Orders,null]", null)));

    assertEquals(expected, actual);
  }
Пример #13
0
 /**
  * Splits a pipe-delimited line and returns its first field parsed as an integer together with
  * its second field.
  *
  * @param input a line of the form {@code <integer>|<text>...}
  * @return (parsed first field, second field)
  */
 @Override
 public Pair<Integer, String> map(String input) {
   String[] columns = input.split("\\|");
   int id = Integer.parseInt(columns[0]);
   return Pair.of(id, columns[1]);
 }
Пример #14
0
 /**
  * Orders two pairs by their second elements, which must implement {@link Comparable}.
  *
  * @param left the first pair to compare
  * @param right the second pair to compare
  * @return the result of comparing the left value to the right value when {@code ascending} is
  *     set; otherwise a value of the opposite sign
  * @throws ClassCastException if the left value does not implement {@link Comparable}
  */
 @Override
 public int compare(Pair<K, V> left, Pair<K, V> right) {
   @SuppressWarnings("unchecked") // V is not bounded by Comparable; checked at runtime by the cast
   Comparable<V> leftValue = (Comparable<V>) left.second();
   int cmp = leftValue.compareTo(right.second());
   if (ascending) {
     return cmp;
   }
   // Reverse the sign explicitly rather than negating: -Integer.MIN_VALUE overflows back to
   // Integer.MIN_VALUE, which would leave the sign unchanged.
   return cmp < 0 ? 1 : (cmp > 0 ? -1 : 0);
 }
Пример #15
0
 /**
  * Emits every pair currently held in {@code values}, each under the constant key 0.
  *
  * @param emitter receives the (0, pair) outputs
  */
 public void cleanup(Emitter<Pair<Integer, Pair<K, V>>> emitter) {
   for (Pair<K, V> held : values) {
     Pair<Integer, Pair<K, V>> keyed = Pair.of(0, held);
     emitter.emit(keyed);
   }
 }
Пример #16
0
 /**
  * Creates a detached value for a table {@link Pair}.
  *
  * <p>The key is detached through the table's key type and the value through its value type.
  *
  * @param tableType The table type
  * @param value The pair from which a detached value is to be created
  * @return A new pair holding the detached key and the detached value
  * @see PType#getDetachedValue(Object)
  */
 public static <K, V> Pair<K, V> getDetachedValue(PTableType<K, V> tableType, Pair<K, V> value) {
   K detachedKey = tableType.getKeyType().getDetachedValue(value.first());
   V detachedValue = tableType.getValueType().getDetachedValue(value.second());
   return Pair.of(detachedKey, detachedValue);
 }
Пример #17
0
 /**
  * Renders the pair through its own {@code toString()}.
  *
  * @param v the pair to render
  * @return the pair's string form
  */
 @Override
 public String map(Pair<String, String> v) {
   String rendered = v.toString();
   return rendered;
 }
Пример #18
0
 /**
  * Emits the entity as the first element of a pair whose second element is always null.
  *
  * @param entity the entity to wrap
  * @param emitter receives the (entity, null) pair
  */
 @Override
 public void process(E entity, Emitter<Pair<E, Void>> emitter) {
   Void noValue = null;
   emitter.emit(Pair.of(entity, noValue));
 }
Пример #19
0
 /**
  * Formats the pair as one line: the first element's string form, a tab character, and the
  * second element rendered by {@code setToString}.
  *
  * @param input the (ID, map) pair to format
  * @return the tab-separated line
  */
 @Override
 public String map(Pair<Long, LongFloatMap> input) {
   String idText = input.first().toString();
   return idText + '\t' + setToString(input.second());
 }