@Override protected MRPipeline createPipeline() throws IOException { JobStepConfig stepConfig = getConfig(); ClusterSettings settings = ClusterSettings.create(ConfigUtils.getDefaultConfig()); String instanceDir = stepConfig.getInstanceDir(); int generationID = stepConfig.getGenerationID(); int iteration = stepConfig.getIteration(); String prefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID); String outputKey = prefix + String.format("sketch/%d/", iteration); if (!validOutputPath(outputKey)) { return null; } // get normalized vectors String inputKey = prefix + "normalized/"; MRPipeline p = createBasicPipeline(DistanceToClosestFn.class); AvroType<Pair<Integer, RealVector>> inputType = Avros.pairs(Avros.ints(), MLAvros.vector()); PCollection<Pair<Integer, RealVector>> in = p.read(avroInput(inputKey, inputType)); // either create or load the set of currently chosen k-sketch vectors // they are stored in a KSketchIndex object DistanceToClosestFn<RealVector> distanceToClosestFn; UpdateIndexFn updateIndexFn; if (iteration == 1) { // Iteration 1 is the first real iteration; iteration 0 contains initial state KSketchIndex index = createInitialIndex(settings, in); distanceToClosestFn = new DistanceToClosestFn<>(index); updateIndexFn = new UpdateIndexFn(index); } else { // Get the index location from the previous iteration String previousIndexKey = prefix + String.format("sketch/%d/", iteration - 1); distanceToClosestFn = new DistanceToClosestFn<>(previousIndexKey); updateIndexFn = new UpdateIndexFn(previousIndexKey); } // compute distance of each vector in dataset to closest vector in k-sketch PTable<Integer, Pair<RealVector, Double>> weighted = in.parallelDo( "computeDistances", distanceToClosestFn, Avros.tableOf(Avros.ints(), Avros.pairs(MLAvros.vector(), Avros.doubles()))); // run weighted reservoir sampling on the vector to select another group of // settings.getSketchPoints() // to add to the k-sketch PTable<Integer, RealVector> kSketchSample = ReservoirSampling.groupedWeightedSample( weighted, settings.getSketchPoints(), RandomManager.getRandom()); // update the KSketchIndex with the newly-chosen vectors kSketchSample .parallelDo("updateIndex", updateIndexFn, Serializables.avro(KSketchIndex.class)) .write(avroOutput(outputKey)); return p; }
@Override protected MRPipeline createPipeline() throws IOException { JobStepConfig stepConfig = getConfig(); Config config = ConfigUtils.getDefaultConfig(); ClusterSettings clusterSettings = ClusterSettings.create(config); String instanceDir = stepConfig.getInstanceDir(); long generationID = stepConfig.getGenerationID(); String prefix = Namespaces.getInstanceGenerationPrefix(instanceDir, generationID); String outputKey = prefix + "weighted/"; if (!validOutputPath(outputKey)) { return null; } String indexKey = prefix + "sketch/" + clusterSettings.getSketchIterations(); String inputKey = prefix + "normalized/"; MRPipeline p = createBasicPipeline(ClosestSketchVectorFn.class); // first I compute the weight of each k-sketch vector, i.e., Voronoi partition // I aggregate all together and persist on disk // PCollection<ClosestSketchVectorData> weights = inputPairs(p, inputKey, MLAvros.vector()) PCollection<ClosestSketchVectorData> weights = PTables.asPTable( inputPairs(p, inputKey, MLAvros.vector()) .parallelDo( "computingSketchVectorWeights", new ClosestSketchVectorFn<RealVector>(indexKey, clusterSettings), Avros.pairs(Avros.ints(), Avros.reflects(ClosestSketchVectorData.class)))) .groupByKey(1) .combineValues(new ClosestSketchVectorAggregator(clusterSettings)) .values() .write(avroOutput(outputKey + "kSketchVectorWeights/")); // this "pipeline" takes a single ClosestSketchVectorData and returns weighted vectors // could be done outside MapReduce, but that would require me to materialize the // ClosestSketchVectorData weights .parallelDo( "generatingWeightedSketchVectors", new WeightVectorsFn(indexKey), KMeansTypes.FOLD_WEIGHTED_VECTOR) .write(avroOutput(outputKey + "weightedKSketchVectors/")); return p; }