Beispiel #1
0
  /**
   * Clusters the records of a text file with k-means and saves the grouped titles.
   *
   * <p>Every input line is parsed twice: once into a feature vector ({@code ParsePoint})
   * and once into a title ({@code ParseTitle}). A model is trained on the vectors, then each
   * (title, vector) pair is mapped through {@code ClusterCars}, grouped by its integer key and
   * written out as text. The three helper classes are defined outside this method.
   *
   * @param args {@code args[0]} is the input file, {@code args[1]} is the output path; the
   *     process exits with status 1 when fewer than two arguments are supplied
   */
  public static void main(String[] args) {
    if (args.length < 2) {
      System.err.println("Usage: KMeansMP <input_file> <results>");
      System.exit(1);
    }
    String inputFile = args[0];
    String resultsPath = args[1];

    // Fixed training parameters.
    int k = 4;
    int iterations = 100;
    int runs = 1;
    long seed = 0;

    SparkConf sparkConf = new SparkConf().setAppName("KMeans MP");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    try {
      JavaRDD<String> lines = sc.textFile(inputFile);

      // The vectors are consumed by the iterative trainer and again by zip() below,
      // so keep them cached instead of re-reading and re-parsing the input each pass.
      JavaRDD<Vector> points = lines.map(new ParsePoint()).cache();
      JavaRDD<String> titles = lines.map(new ParseTitle());

      // Pass the declared seed rather than a stray literal so the two cannot drift apart.
      final KMeansModel model =
          KMeans.train(points.rdd(), k, iterations, runs, KMeans.RANDOM(), seed);

      // titles and points are both plain map()s of the same RDD, so they line up for zip().
      JavaPairRDD<Integer, Iterable<String>> results =
          titles.zip(points).mapToPair(new ClusterCars(model)).groupByKey();

      results.saveAsTextFile(resultsPath);
    } finally {
      // Release the Spark context even when reading, training or saving fails.
      sc.stop();
    }
  }
Beispiel #2
0
 /**
  * Trains a k-means clustering model by delegating to
  * {@code org.apache.spark.mllib.clustering.KMeans.train}.
  *
  * @param data Training data as a JavaRDD of Vectors; its underlying RDD is handed to MLlib
  * @param noOfClusters Number of clusters
  * @param noOfIterations Number of iterations to run
  * @param noOfRuns Number of runs of the algorithm to execute in parallel
  * @param initializationMode Initialization algorithm: random or k-means||
  * @return the {@code KMeansModel} produced by the MLlib training call
  */
 public KMeansModel train(
     JavaRDD<Vector> data,
     int noOfClusters,
     int noOfIterations,
     int noOfRuns,
     String initializationMode) {
   // MLlib's trainer works on the Scala RDD, so unwrap the Java RDD at the call site.
   final KMeansModel trainedModel =
       org.apache.spark.mllib.clustering.KMeans.train(
           data.rdd(),
           noOfClusters,
           noOfIterations,
           noOfRuns,
           initializationMode);
   return trainedModel;
 }