public void map( LongWritable key, Text value, OutputCollector<IntWritable, ClusterWritable> output, Reporter reporter) throws IOException { String movieIdStr = new String(); String reviewStr = new String(); String userIdStr = new String(); String reviews = new String(); String line = new String(); String tok = new String(""); long movieId; int review, userId, p, q, r, rater, rating, movieIndex; int clusterId = 0; int[] n = new int[maxClusters]; float[] sq_a = new float[maxClusters]; float[] sq_b = new float[maxClusters]; float[] numer = new float[maxClusters]; float[] denom = new float[maxClusters]; float max_similarity = 0.0f; float similarity = 0.0f; Cluster movie = new Cluster(); ClusterWritable movies_arrl = new ClusterWritable(); StringBuffer sb = new StringBuffer(); line = ((Text) value).toString(); movieIndex = line.indexOf(":"); for (r = 0; r < maxClusters; r++) { numer[r] = 0.0f; denom[r] = 0.0f; sq_a[r] = 0.0f; sq_b[r] = 0.0f; n[r] = 0; } if (movieIndex > 0) { movieIdStr = line.substring(0, movieIndex); sb.append(movieIdStr).append(":"); movieId = Long.parseLong(movieIdStr); movie.movie_id = movieId; reviews = line.substring(movieIndex + 1); StringTokenizer token = new StringTokenizer(reviews, ","); int attrCnt = 0; // while (token.hasMoreTokens()) { Leo while (token.hasMoreTokens() && attrCnt < attrNum) { tok = token.nextToken(); int reviewIndex = tok.indexOf("_"); // userIdStr = tok.substring(0, reviewIndex); //Leo userIdStr = String.valueOf(attrCnt); reviewStr = tok.substring(reviewIndex + 1); if (attrCnt > 0) { sb.append(","); } sb.append(String.valueOf(attrCnt)).append("_").append(reviewStr); userId = Integer.parseInt(userIdStr); review = Integer.parseInt(reviewStr); for (r = 0; r < totalClusters; r++) { /*for (q = 0; q < centroids_ref[r].total; q++) { rater = centroids_ref[r].reviews.get(q).rater_id; rating = (int) centroids_ref[r].reviews.get(q).rating; if (userId == rater) { numer[r] += (float) (review * rating); sq_a[r] += (float) (review * review); sq_b[r] += (float) (rating * rating); n[r]++; // counter break; // to avoid multiple ratings by the same reviewer } }*/ // Leo rating = (int) centroids_ref[r].reviews.get(attrCnt).rating; numer[r] += (float) ((review - rating) * (review - rating)); n[r]++; // counter } attrCnt++; } for (p = 0; p < totalClusters; p++) { /*denom[p] = (float) ((Math.sqrt((double) sq_a[p])) * (Math .sqrt((double) sq_b[p]))); if (denom[p] > 0) { similarity = numer[p] / denom[p]; if (similarity > max_similarity) { max_similarity = similarity; clusterId = p; } }*/ // Leo similarity = 250 - numer[p]; if (similarity > max_similarity) { max_similarity = similarity; clusterId = p; } } // movies_arrl.movies.add(line);//Leo movies_arrl.movies.add(sb.toString()); movies_arrl.similarities.add(max_similarity); movies_arrl.similarity = max_similarity; output.collect(new IntWritable(clusterId), movies_arrl); reporter.incrCounter(Counter.WORDS, 1); } }
public static void main(String[] args) { // TEST MEASURE // Point p1 = new Point(-1d, -1d); // Point p2 = new Point(2d, 3d); // System.out.println(measure.d(p1, p2)); // System.out.println(measure.s(p1, p2)); // return; Double[][] data = FileHandler.readFile(fileName); // cannot display points if dimension is > 2 if (data[0].length != 2) canDisplay = false; // build graphic points from coords' array buildPointsFromData(data); Config.computeBoundingRect(points); // init display if (canDisplay) { disp = new Display(); disp.setVisible(true); for (Point p : points) { disp.addObject(p); } } testResults = new double[nbTests]; for (int t = 0; t < nbTests; ++t) { // define K clusters and K temporary centres clusters = new ArrayList<Cluster>(); for (int i = 0; i < K; ++i) { clusters.add(new Cluster()); } setRandomCenters(); for (Cluster c : clusters) { System.out.println("center for cluster " + c + ": " + c.getCenter()); } if (canDisplay) pause(1000); // variables used in for loops double minDist, currDist, diff; Double[] prevCoords, newCoords; Cluster alloc; Point newCenter; for (int i = 0; i < maxIter; ++i) { if (canDisplay) { disp.setLabel("[ iteration #" + (i + 1) + " ]"); } else { System.out.println("------> iteration #" + (i + 1)); } // allocate points to group which center is closest for (Point p : points) { minDist = Config.MAX; alloc = clusters.get(0); // default initialization for (Cluster c : clusters) { currDist = measure.d(p, c.getCenter()); if (currDist < minDist) { minDist = currDist; alloc = c; } } alloc.addPoint(p); } // recenter: calculate gravity centers for formed groups diff = 0; prevCoords = null; for (Cluster c : clusters) { // delete previous center if it not a Point of the Cluster if (canDisplay && !c.getPoints().contains(c.getCenter())) { disp.removeObject(c.getCenter()); } if (stopOnConverge) { prevCoords = c.getCenter().getCoords(); } newCenter = c.makeGravityCenter(); if (stopOnConverge) { newCoords = c.getCenter().getCoords(); for (int k = 0; k < prevCoords.length; ++k) { diff += Math.abs(prevCoords[k] - newCoords[k]); } } if (canDisplay) { disp.addObject(newCenter); } else { // System.out.println("\tcenter for " + c + ": " + c.getCenter()); System.out.println(c.getCenter()); } } // loop over clusters if (canDisplay) { disp.repaint(); } // if Clusters' centers don't change anymore, then stop (algorithm converged) if (diff == 0 && stopOnConverge) { testResults[t] = (double) i; if (canDisplay) { disp.setLabel("[ Converged at iteration #" + (i) + " ]"); disp.repaint(); } else { System.out.println("[ Converged at iteration #" + (i) + " ]"); } break; } pause(100); } // loop over iterations if (testResults[t] == 0) { System.out.println("!!!!!!!!!! Test #" + t + " did not converge."); if (stopOnConverge) return; } // reset display if (canDisplay && t + 1 < nbTests) { for (Point p : points) p.setCluster(null); for (Cluster c : clusters) disp.removeObject(c.getCenter()); } } // loop over tests // display test results and compute means DescriptiveStatistics stats = new DescriptiveStatistics(testResults); System.out.println("=========> Results:"); for (int t = 0; t < nbTests; ++t) { System.out.println("--> [ " + testResults[t] + " ]"); } System.out.println("=========> Means: " + stats.getMean()); System.out.println("=========> Std dev: " + stats.getStandardDeviation()); }