Exemplo n.º 1
0
 private static <E> PCollection<E> partition(PCollection<E> collection, int numReducers) {
   PType<E> type = collection.getPType();
   PTableType<E, Void> tableType = Avros.tableOf(type, Avros.nulls());
   PTable<E, Void> table = collection.parallelDo(new AsKeyTable<E>(), tableType);
   PGroupedTable<E, Void> grouped =
       numReducers > 0 ? table.groupByKey(numReducers) : table.groupByKey();
   return grouped.ungroup().keys();
 }
Exemplo n.º 2
0
  @Override
  protected MRPipeline createPipeline() throws IOException {
    JobStepConfig stepConfig = getConfig();
    ClusterSettings settings = ClusterSettings.create(ConfigUtils.getDefaultConfig());

    String instanceDir = stepConfig.getInstanceDir();
    int generationID = stepConfig.getGenerationID();
    int iteration = stepConfig.getIteration();
    String prefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID);
    String outputKey = prefix + String.format("sketch/%d/", iteration);
    if (!validOutputPath(outputKey)) {
      return null;
    }

    // get normalized vectors
    String inputKey = prefix + "normalized/";
    MRPipeline p = createBasicPipeline(DistanceToClosestFn.class);
    AvroType<Pair<Integer, RealVector>> inputType = Avros.pairs(Avros.ints(), MLAvros.vector());
    PCollection<Pair<Integer, RealVector>> in = p.read(avroInput(inputKey, inputType));

    // either create or load the set of currently chosen k-sketch vectors
    // they are stored in a KSketchIndex object
    DistanceToClosestFn<RealVector> distanceToClosestFn;
    UpdateIndexFn updateIndexFn;
    if (iteration
        == 1) { // Iteration 1 is the first real iteration; iteration 0 contains initial state
      KSketchIndex index = createInitialIndex(settings, in);
      distanceToClosestFn = new DistanceToClosestFn<>(index);
      updateIndexFn = new UpdateIndexFn(index);
    } else {
      // Get the index location from the previous iteration
      String previousIndexKey = prefix + String.format("sketch/%d/", iteration - 1);
      distanceToClosestFn = new DistanceToClosestFn<>(previousIndexKey);
      updateIndexFn = new UpdateIndexFn(previousIndexKey);
    }

    // compute distance of each vector in dataset to closest vector in k-sketch
    PTable<Integer, Pair<RealVector, Double>> weighted =
        in.parallelDo(
            "computeDistances",
            distanceToClosestFn,
            Avros.tableOf(Avros.ints(), Avros.pairs(MLAvros.vector(), Avros.doubles())));

    // run weighted reservoir sampling on the vector to select another group of
    // settings.getSketchPoints()
    // to add to the k-sketch
    PTable<Integer, RealVector> kSketchSample =
        ReservoirSampling.groupedWeightedSample(
            weighted, settings.getSketchPoints(), RandomManager.getRandom());

    // update the KSketchIndex with the newly-chosen vectors
    kSketchSample
        .parallelDo("updateIndex", updateIndexFn, Serializables.avro(KSketchIndex.class))
        .write(avroOutput(outputKey));

    return p;
  }