Exemplo n.º 1
0
    /**
     * Splits one input line into columns and emits the first column as the key, with a value that
     * lists every column as {@code index=value} followed by a tab.
     *
     * <p>The line is split with {@code StringUtils.splitPreserveAllTokens} on {@code splitChar}
     * (a member declared outside this method), so empty columns between adjacent separators are
     * kept. An empty input line yields no columns and is skipped; there is no first column to use
     * as the key.
     *
     * @param key unused by this method
     * @param value the input line to split
     * @param output receives one (first column, indexed column list) pair per non-empty line
     * @param reporter unused by this method
     * @throws IOException if {@code output.collect} fails
     */
    public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
      String[] columns = StringUtils.splitPreserveAllTokens(value.toString(), splitChar);

      // splitPreserveAllTokens returns an empty array for "" (and null for a null string), so
      // indexing columns[0] without this guard would throw on an empty line.
      if (columns == null || columns.length == 0) {
        return;
      }

      // Method-local buffer: the unsynchronized StringBuilder is sufficient.
      StringBuilder indexedColumns = new StringBuilder();
      for (int i = 0; i < columns.length; i++) {
        indexedColumns.append(i).append('=').append(columns[i]).append("\t");
      }
      output.collect(new Text(columns[0]), new Text(indexedColumns.toString()));
    }
    /**
     * Assigns one input record to its closest centroid and emits the record keyed by that
     * centroid's index.
     *
     * <p>The line is expected to look like {@code <id>:<tok>,<tok>,...}. Lines whose first
     * {@code ':'} is missing or at position 0 are skipped without emitting anything. At most
     * {@code attrNum} tokens are consumed. For each token, the text after its first {@code '_'}
     * (or the whole token if there is none) is parsed as an integer rating; the text before the
     * {@code '_'} is discarded and replaced by the token's zero-based position in the re-emitted
     * record.
     *
     * <p>For each of the {@code totalClusters} centroids in {@code centroids_ref}, the method
     * accumulates the sum of squared differences between the record's rating and the centroid's
     * rating (truncated to {@code int}) at the same position. Similarity is {@code 250 - distance};
     * the centroid with the largest similarity wins, ties going to the lowest index.
     * NOTE(review): the meaning of the constant 250 is not visible here -- confirm with the author.
     *
     * @param key unused by this method
     * @param value the input line
     * @param output receives one (cluster index, {@code ClusterWritable}) pair per accepted line
     * @param reporter used to increment {@code Counter.WORDS} once per accepted line
     * @throws IOException if {@code output.collect} fails
     * @throws NumberFormatException if the id before {@code ':'} or any consumed rating is not a
     *     valid integer
     */
    public void map(
        LongWritable key,
        Text value,
        OutputCollector<IntWritable, ClusterWritable> output,
        Reporter reporter)
        throws IOException {

      String line = value.toString();
      int movieIndex = line.indexOf(":");
      if (movieIndex <= 0) {
        // No usable "<id>:" prefix: nothing is emitted for this line.
        return;
      }

      String movieIdStr = line.substring(0, movieIndex);
      // The parsed value is not needed below, but parsing still rejects non-numeric ids with a
      // NumberFormatException.
      Long.parseLong(movieIdStr);

      // Sized by maxClusters but indexed up to totalClusters.
      // NOTE(review): this assumes totalClusters <= maxClusters -- confirm where they are set.
      // Java zero-initializes the array, so no explicit reset loop is needed.
      float[] squaredDistance = new float[maxClusters];

      // Rebuilt record: "<id>:0_<rating>,1_<rating>,..." with positional indices.
      StringBuilder normalized = new StringBuilder();
      normalized.append(movieIdStr).append(":");

      StringTokenizer tokens = new StringTokenizer(line.substring(movieIndex + 1), ",");
      int attrCnt = 0;
      while (tokens.hasMoreTokens() && attrCnt < attrNum) {
        String tok = tokens.nextToken();
        // indexOf returns -1 when there is no '_', which makes substring(0) keep the whole token.
        String reviewStr = tok.substring(tok.indexOf("_") + 1);
        if (attrCnt > 0) {
          normalized.append(",");
        }
        normalized.append(attrCnt).append("_").append(reviewStr);

        int review = Integer.parseInt(reviewStr);
        for (int c = 0; c < totalClusters; c++) {
          // The centroid's rating at this position is truncated to int before comparing.
          int rating = (int) centroids_ref[c].reviews.get(attrCnt).rating;
          int diff = review - rating;
          squaredDistance[c] += (float) (diff * diff);
        }
        attrCnt++;
      }

      // Pick the centroid with the highest similarity. The running best is seeded from the first
      // centroid rather than from 0, so that when every similarity is <= 0 (all squared distances
      // >= 250) the nearest centroid is still chosen instead of always falling back to cluster 0.
      // With no centroids at all, the defaults (cluster 0, similarity 0) are emitted.
      int clusterId = 0;
      float maxSimilarity = 0.0f;
      for (int c = 0; c < totalClusters; c++) {
        float similarity = 250 - squaredDistance[c];
        if (c == 0 || similarity > maxSimilarity) {
          maxSimilarity = similarity;
          clusterId = c;
        }
      }

      ClusterWritable assignment = new ClusterWritable();
      assignment.movies.add(normalized.toString());
      assignment.similarities.add(maxSimilarity);
      assignment.similarity = maxSimilarity;
      output.collect(new IntWritable(clusterId), assignment);
      reporter.incrCounter(Counter.WORDS, 1);
    }