public static void main(String[] args) {
    if (args.length < 2) {
        System.err.println("Usage: KMeansMP <input_file> <results>");
        System.exit(1);
    }
    String inputFile = args[0];
    String resultsPath = args[1];

    // Clustering parameters.
    int k = 4;              // number of clusters
    int iterations = 100;   // maximum number of iterations
    int runs = 1;           // number of parallel runs
    long seed = 0;          // seed for the random initialization

    SparkConf sparkConf = new SparkConf().setAppName("KMeans MP");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // Each input line describes one car; ParsePoint extracts its feature
    // vector and ParseTitle its name.
    JavaRDD<String> lines = sc.textFile(inputFile);
    JavaRDD<Vector> points = lines.map(new ParsePoint());
    JavaRDD<String> titles = lines.map(new ParseTitle());

    // Train the k-means model with random initialization.
    final KMeansModel model =
            KMeans.train(points.rdd(), k, iterations, runs, KMeans.RANDOM(), seed);

    // Assign each car to its cluster and group the titles by cluster id.
    JavaPairRDD<Integer, Iterable<String>> results =
            titles.zip(points).mapToPair(new ClusterCars(model)).groupByKey();
    results.saveAsTextFile(resultsPath);

    sc.stop();
}
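// The main method above references ParsePoint, ParseTitle, and ClusterCars, which are
// not shown in this excerpt. The sketch below is one possible implementation, assuming
// a comma-separated input file whose first field is the car title and whose remaining
// fields are numeric features; adjust the parsing to match the actual input format.
// (In the real source file the imports listed here belong at the top.)

import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import scala.Tuple2;

// Parses the numeric fields of a line into an MLlib feature Vector.
class ParsePoint implements Function<String, Vector> {
    @Override
    public Vector call(String line) {
        String[] fields = line.split(",");
        double[] values = new double[fields.length - 1];
        for (int i = 1; i < fields.length; i++) {
            values[i - 1] = Double.parseDouble(fields[i]);
        }
        return Vectors.dense(values);
    }
}

// Extracts the car title (assumed to be the first field) from a line.
class ParseTitle implements Function<String, String> {
    @Override
    public String call(String line) {
        return line.split(",")[0];
    }
}

// Maps a (title, features) pair to (clusterId, title) using the trained model,
// so that groupByKey() collects all car titles belonging to the same cluster.
class ClusterCars implements PairFunction<Tuple2<String, Vector>, Integer, String> {
    private final KMeansModel model;

    ClusterCars(KMeansModel model) {
        this.model = model;
    }

    @Override
    public Tuple2<Integer, String> call(Tuple2<String, Vector> pair) {
        return new Tuple2<>(model.predict(pair._2()), pair._1());
    }
}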
/**
 * Trains a k-means clustering model.
 *
 * @param data               Training data as a JavaRDD of Vectors
 * @param noOfClusters       Number of clusters (k)
 * @param noOfIterations     Maximum number of iterations to run
 * @param noOfRuns           Number of runs of the algorithm to execute in parallel
 * @param initializationMode Initialization algorithm: "random" or "k-means||"
 * @return the trained KMeansModel
 */
public KMeansModel train(
        JavaRDD<Vector> data,
        int noOfClusters,
        int noOfIterations,
        int noOfRuns,
        String initializationMode) {
    return org.apache.spark.mllib.clustering.KMeans.train(
            data.rdd(),
            noOfClusters,
            noOfIterations,
            noOfRuns,
            initializationMode);
}
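// A minimal usage sketch of the wrapper above, assuming it is an instance method of the
// KMeansMP class (the name from the usage string) and that the class has a no-argument
// constructor; `points` is the feature RDD built in main. This is equivalent to the direct
// KMeans.train(...) call in main, except that no explicit seed is passed.
KMeansModel wrapperModel = new KMeansMP().train(points, 4, 100, 1, KMeans.RANDOM());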