Example #1
0
  @Override
  protected MRPipeline createPipeline() throws IOException {
    JobStepConfig stepConfig = getConfig();
    ClusterSettings settings = ClusterSettings.create(ConfigUtils.getDefaultConfig());

    String instanceDir = stepConfig.getInstanceDir();
    int generationID = stepConfig.getGenerationID();
    int iteration = stepConfig.getIteration();
    String prefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID);
    String outputKey = prefix + String.format("sketch/%d/", iteration);
    if (!validOutputPath(outputKey)) {
      return null;
    }

    // get normalized vectors
    String inputKey = prefix + "normalized/";
    MRPipeline p = createBasicPipeline(DistanceToClosestFn.class);
    AvroType<Pair<Integer, RealVector>> inputType = Avros.pairs(Avros.ints(), MLAvros.vector());
    PCollection<Pair<Integer, RealVector>> in = p.read(avroInput(inputKey, inputType));

    // either create or load the set of currently chosen k-sketch vectors
    // they are stored in a KSketchIndex object
    DistanceToClosestFn<RealVector> distanceToClosestFn;
    UpdateIndexFn updateIndexFn;
    if (iteration
        == 1) { // Iteration 1 is the first real iteration; iteration 0 contains initial state
      KSketchIndex index = createInitialIndex(settings, in);
      distanceToClosestFn = new DistanceToClosestFn<>(index);
      updateIndexFn = new UpdateIndexFn(index);
    } else {
      // Get the index location from the previous iteration
      String previousIndexKey = prefix + String.format("sketch/%d/", iteration - 1);
      distanceToClosestFn = new DistanceToClosestFn<>(previousIndexKey);
      updateIndexFn = new UpdateIndexFn(previousIndexKey);
    }

    // compute distance of each vector in dataset to closest vector in k-sketch
    PTable<Integer, Pair<RealVector, Double>> weighted =
        in.parallelDo(
            "computeDistances",
            distanceToClosestFn,
            Avros.tableOf(Avros.ints(), Avros.pairs(MLAvros.vector(), Avros.doubles())));

    // run weighted reservoir sampling on the vector to select another group of
    // settings.getSketchPoints()
    // to add to the k-sketch
    PTable<Integer, RealVector> kSketchSample =
        ReservoirSampling.groupedWeightedSample(
            weighted, settings.getSketchPoints(), RandomManager.getRandom());

    // update the KSketchIndex with the newly-chosen vectors
    kSketchSample
        .parallelDo("updateIndex", updateIndexFn, Serializables.avro(KSketchIndex.class))
        .write(avroOutput(outputKey));

    return p;
  }
  @Override
  protected MRPipeline createPipeline() throws IOException {
    JobStepConfig stepConfig = getConfig();
    Config config = ConfigUtils.getDefaultConfig();
    ClusterSettings clusterSettings = ClusterSettings.create(config);

    String instanceDir = stepConfig.getInstanceDir();
    long generationID = stepConfig.getGenerationID();
    String prefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID);
    String outputKey = prefix + "weighted/";
    if (!validOutputPath(outputKey)) {
      return null;
    }

    String indexKey = prefix + "sketch/" + clusterSettings.getSketchIterations();
    String inputKey = prefix + "normalized/";
    MRPipeline p = createBasicPipeline(ClosestSketchVectorFn.class);

    // first I compute the weight of each k-sketch vector, i.e., Voronoi partition
    // I aggregate all together and persist on disk
    // PCollection<ClosestSketchVectorData> weights = inputPairs(p, inputKey, MLAvros.vector())
    PCollection<ClosestSketchVectorData> weights =
        PTables.asPTable(
                inputPairs(p, inputKey, MLAvros.vector())
                    .parallelDo(
                        "computingSketchVectorWeights",
                        new ClosestSketchVectorFn<RealVector>(indexKey, clusterSettings),
                        Avros.pairs(Avros.ints(), Avros.reflects(ClosestSketchVectorData.class))))
            .groupByKey(1)
            .combineValues(new ClosestSketchVectorAggregator(clusterSettings))
            .values()
            .write(avroOutput(outputKey + "kSketchVectorWeights/"));

    // this "pipeline" takes a single ClosestSketchVectorData and returns weighted vectors
    // could be done outside MapReduce, but that would require me to materialize the
    // ClosestSketchVectorData
    weights
        .parallelDo(
            "generatingWeightedSketchVectors",
            new WeightVectorsFn(indexKey),
            KMeansTypes.FOLD_WEIGHTED_VECTOR)
        .write(avroOutput(outputKey + "weightedKSketchVectors/"));

    return p;
  }