public static void main(String[] args) { if (args.length == 0) { System.err.println("Usage: JavaTC <host> [<slices>]"); System.exit(1); } JavaSparkContext sc = new JavaSparkContext( args[0], "JavaTC", System.getenv("SPARK_HOME"), System.getenv("SPARK_EXAMPLES_JAR")); Integer slices = (args.length > 1) ? Integer.parseInt(args[1]) : 2; JavaPairRDD<Integer, Integer> tc = sc.parallelizePairs(generateGraph(), slices).cache(); // Linear transitive closure: each round grows paths by one edge, // by joining the graph's edges with the already-discovered paths. // e.g. join the path (y, z) from the TC with the edge (x, y) from // the graph to obtain the path (x, z). // Because join() joins on keys, the edges are stored in reversed order. JavaPairRDD<Integer, Integer> edges = tc.map( new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() { public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) { return new Tuple2<Integer, Integer>(e._2(), e._1()); } }); long oldCount = 0; long nextCount = tc.count(); do { oldCount = nextCount; // Perform the join, obtaining an RDD of (y, (z, x)) pairs, // then project the result to obtain the new (x, z) paths. tc = tc.union(tc.join(edges).map(ProjectFn.INSTANCE)).distinct().cache(); nextCount = tc.count(); } while (nextCount != oldCount); System.out.println("TC has " + tc.count() + " edges."); System.exit(0); }
public static void main(String[] args) throws Exception { if (args.length < 4) { System.err.println("Usage: JavaKMeans <master> <file> <k> <convergeDist>"); System.exit(1); } JavaSparkContext sc = new JavaSparkContext( args[0], "JavaKMeans", System.getenv("SPARK_HOME"), System.getenv("SPARK_EXAMPLES_JAR")); String path = args[1]; int K = Integer.parseInt(args[2]); double convergeDist = Double.parseDouble(args[3]); JavaRDD<Vector> data = sc.textFile(path) .map( new Function<String, Vector>() { @Override public Vector call(String line) throws Exception { return parseVector(line); } }) .cache(); final List<Vector> centroids = data.takeSample(false, K, 42); double tempDist; do { // allocate each vector to closest centroid JavaPairRDD<Integer, Vector> closest = data.map( new PairFunction<Vector, Integer, Vector>() { @Override public Tuple2<Integer, Vector> call(Vector vector) throws Exception { return new Tuple2<Integer, Vector>(closestPoint(vector, centroids), vector); } }); // group by cluster id and average the vectors within each cluster to compute centroids JavaPairRDD<Integer, List<Vector>> pointsGroup = closest.groupByKey(); Map<Integer, Vector> newCentroids = pointsGroup .mapValues( new Function<List<Vector>, Vector>() { public Vector call(List<Vector> ps) throws Exception { return average(ps); } }) .collectAsMap(); tempDist = 0.0; for (int i = 0; i < K; i++) { tempDist += centroids.get(i).squaredDist(newCentroids.get(i)); } for (Map.Entry<Integer, Vector> t : newCentroids.entrySet()) { centroids.set(t.getKey(), t.getValue()); } System.out.println("Finished iteration (delta = " + tempDist + ")"); } while (tempDist > convergeDist); System.out.println("Final centers:"); for (Vector c : centroids) System.out.println(c); System.exit(0); }