Example #1
File: JavaTC.java  Project: patelh/spark
  public static void main(String[] args) {
    if (args.length == 0) {
      System.err.println("Usage: JavaTC <host> [<slices>]");
      System.exit(1);
    }

    JavaSparkContext sc =
        new JavaSparkContext(
            args[0], "JavaTC", System.getenv("SPARK_HOME"), System.getenv("SPARK_EXAMPLES_JAR"));
    Integer slices = (args.length > 1) ? Integer.parseInt(args[1]) : 2;
    // Seed the closure with the raw edge set; cached because it is both
    // counted and re-read when deriving the reversed 'edges' RDD below.
    JavaPairRDD<Integer, Integer> tc = sc.parallelizePairs(generateGraph(), slices).cache();

    // Linear transitive closure: each round grows paths by one edge,
    // by joining the graph's edges with the already-discovered paths.
    // e.g. join the path (y, z) from the TC with the edge (x, y) from
    // the graph to obtain the path (x, z).

    // Because join() joins on keys, the edges are stored in reversed order.
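    // Concrete trace (hypothetical input, not from the original): with paths
    // {(1,2), (2,3)}, 'edges' stores the reversed pairs {(2,1), (3,2)}; joining
    // the path (2,3) with the reversed edge (2,1) on key 2 gives (2,(3,1)),
    // which projects to the new path (1,3).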
    JavaPairRDD<Integer, Integer> edges =
        tc.map(
            new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
              public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
                return new Tuple2<Integer, Integer>(e._2(), e._1());
              }
            });

    // Iterate to a fixed point: stop once a round discovers no new paths.
    long oldCount = 0;
    long nextCount = tc.count();
    do {
      oldCount = nextCount;
      // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
      // then project the result to obtain the new (x, z) paths.
      tc = tc.union(tc.join(edges).map(ProjectFn.INSTANCE)).distinct().cache();
      nextCount = tc.count();
    } while (nextCount != oldCount);

    System.out.println("TC has " + tc.count() + " edges.");
    System.exit(0);
  }
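
The snippet calls generateGraph() and ProjectFn.INSTANCE without showing them. Below is a minimal sketch of what they could look like, assuming PairFunction is the abstract class it was in pre-1.0 Spark (as the anonymous classes above suggest) and that java.util.ArrayList/LinkedHashSet/List/Random/Set and scala.Tuple2 are imported. The graph sizes (numVertices, numEdges) and the random seed are hypothetical, not taken from the original project.

  // Hypothetical: build a fixed number of distinct random edges.
  static List<Tuple2<Integer, Integer>> generateGraph() {
    int numVertices = 100;  // assumed sizes, not from the original
    int numEdges = 200;
    Random rand = new Random(42);
    Set<Tuple2<Integer, Integer>> edges = new LinkedHashSet<Tuple2<Integer, Integer>>();
    while (edges.size() < numEdges) {
      int from = rand.nextInt(numVertices);
      int to = rand.nextInt(numVertices);
      if (from != to) {  // skip self-loops
        edges.add(new Tuple2<Integer, Integer>(from, to));
      }
    }
    return new ArrayList<Tuple2<Integer, Integer>>(edges);
  }

  // Turns a join result (y, (z, x)) into the new path (x, z).
  static class ProjectFn extends PairFunction<Tuple2<Integer, Tuple2<Integer, Integer>>,
      Integer, Integer> {
    static final ProjectFn INSTANCE = new ProjectFn();

    public Tuple2<Integer, Integer> call(Tuple2<Integer, Tuple2<Integer, Integer>> triple) {
      return new Tuple2<Integer, Integer>(triple._2()._2(), triple._2()._1());
    }
  }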
Example #2
  public static void main(String[] args) throws Exception {
    if (args.length < 4) {
      System.err.println("Usage: JavaKMeans <master> <file> <k> <convergeDist>");
      System.exit(1);
    }
    JavaSparkContext sc =
        new JavaSparkContext(
            args[0],
            "JavaKMeans",
            System.getenv("SPARK_HOME"),
            System.getenv("SPARK_EXAMPLES_JAR"));
    String path = args[1];
    int K = Integer.parseInt(args[2]);
    double convergeDist = Double.parseDouble(args[3]);

    JavaRDD<Vector> data =
        sc.textFile(path)
            .map(
                new Function<String, Vector>() {
                  @Override
                  public Vector call(String line) throws Exception {
                    return parseVector(line);
                  }
                })
            .cache();

    // Pick K initial centroids by sampling without replacement (seed 42 for reproducibility).
    final List<Vector> centroids = data.takeSample(false, K, 42);

    double tempDist;
    do {
      // allocate each vector to closest centroid
      JavaPairRDD<Integer, Vector> closest =
          data.map(
              new PairFunction<Vector, Integer, Vector>() {
                @Override
                public Tuple2<Integer, Vector> call(Vector vector) throws Exception {
                  return new Tuple2<Integer, Vector>(closestPoint(vector, centroids), vector);
                }
              });

      // group by cluster id and average the vectors within each cluster to compute centroids
      JavaPairRDD<Integer, List<Vector>> pointsGroup = closest.groupByKey();
      Map<Integer, Vector> newCentroids =
          pointsGroup
              .mapValues(
                  new Function<List<Vector>, Vector>() {
                    public Vector call(List<Vector> ps) throws Exception {
                      return average(ps);
                    }
                  })
              .collectAsMap();
      // Measure how far the centroids moved in this iteration.
      tempDist = 0.0;
      for (int i = 0; i < K; i++) {
        // Assumes every cluster kept at least one point; an empty cluster
        // would leave newCentroids.get(i) null here.
        tempDist += centroids.get(i).squaredDist(newCentroids.get(i));
      }
      // Install the new centroids for the next iteration.
      for (Map.Entry<Integer, Vector> t : newCentroids.entrySet()) {
        centroids.set(t.getKey(), t.getValue());
      }
      System.out.println("Finished iteration (delta = " + tempDist + ")");
    } while (tempDist > convergeDist);

    System.out.println("Final centers:");
    for (Vector c : centroids) {
      System.out.println(c);
    }

    System.exit(0);
  }
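
This example likewise relies on helpers that are not shown: parseVector, closestPoint, and average, operating on the old spark.util.Vector class. A minimal sketch of the three, assuming Vector exposes plus and divide in addition to the squaredDist method already used above (only squaredDist appears in the snippet; plus and divide are assumptions):

  // Parse one space-separated line of doubles into a Vector.
  static Vector parseVector(String line) {
    String[] parts = line.split(" ");
    double[] data = new double[parts.length];
    for (int i = 0; i < parts.length; i++) {
      data[i] = Double.parseDouble(parts[i]);
    }
    return new Vector(data);
  }

  // Index of the centroid closest (by squared distance) to p.
  static int closestPoint(Vector p, List<Vector> centers) {
    int bestIndex = 0;
    double closest = Double.POSITIVE_INFINITY;
    for (int i = 0; i < centers.size(); i++) {
      double d = p.squaredDist(centers.get(i));  // same Vector method the main loop uses
      if (d < closest) {
        closest = d;
        bestIndex = i;
      }
    }
    return bestIndex;
  }

  // Component-wise mean of a non-empty list of vectors.
  static Vector average(List<Vector> ps) {
    Vector sum = ps.get(0);
    for (int i = 1; i < ps.size(); i++) {
      sum = sum.plus(ps.get(i));   // assumed: plus returns a new Vector
    }
    return sum.divide(ps.size());  // assumed: divide scales by 1/n
  }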