Exemplo n.º 1
0
    /**
     * Splits the input line on {@code splitChar} and emits it keyed by its first column.
     *
     * <p>The emitted value is every column rendered as {@code <index>=<column>} and followed by a
     * tab, in column order (so the value ends with a trailing tab).
     *
     * <p>A line that splits into zero columns is skipped. Assuming {@code StringUtils} is Apache
     * Commons Lang, that is what an empty line produces; without the guard the lookup of the first
     * column would throw {@link ArrayIndexOutOfBoundsException} and fail the task.
     *
     * @param key input key; not used
     * @param value the raw input line
     * @param output receives one {@code (firstColumn, indexedColumns)} pair per non-empty line
     * @param reporter not used
     * @throws IOException if {@code output.collect} fails
     */
    public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
      String[] columns = StringUtils.splitPreserveAllTokens(value.toString(), splitChar);
      if (columns == null || columns.length == 0) {
        // No first column to use as the key; nothing meaningful to emit.
        return;
      }

      // Local, single-threaded buffer: StringBuilder avoids StringBuffer's synchronization.
      StringBuilder indexed = new StringBuilder();
      for (int i = 0; i < columns.length; i++) {
        indexed.append(i).append('=').append(columns[i]).append('\t');
      }
      output.collect(new Text(columns[0]), new Text(indexed.toString()));
    }
    /**
     * Assigns one record to a centroid and emits the record keyed by that centroid's index.
     *
     * <p>The input line is expected to look like {@code <movieId>:<tok>,<tok>,...}. Lines with no
     * {@code ':'}, or with {@code ':'} as the first character, are ignored. For each of the first
     * {@code attrNum} tokens, the text after the token's first {@code '_'} (or the whole token when
     * it has none) is parsed as an integer rating. The record is re-serialised as
     * {@code <movieId>:0_<rating>,1_<rating>,...}, i.e. the prefix before {@code '_'} is replaced by
     * the token's position.
     *
     * <p>For every centroid {@code r < totalClusters} the sum of squared differences between the
     * record's ratings and {@code centroids_ref[r].reviews.get(position).rating} (truncated to
     * {@code int}) is accumulated. The score of a centroid is {@code 250 - thatSum}; the centroid
     * with the highest score strictly greater than zero wins.
     *
     * <p>NOTE(review): because the best score starts at {@code 0}, a record whose distance to every
     * centroid is {@code >= 250} is assigned to centroid 0 with similarity 0 rather than to its
     * nearest centroid. Presumably 250 is meant as an upper bound on the distance - confirm.
     *
     * @param key input key; not used
     * @param value the raw input line
     * @param output receives one {@code (centroidIndex, ClusterWritable)} pair per accepted line
     * @param reporter {@code Counter.WORDS} is incremented once per emitted record
     * @throws IOException if {@code output.collect} fails
     * @throws NumberFormatException if the movie id or any rating is not a valid integer
     */
    public void map(
        LongWritable key,
        Text value,
        OutputCollector<IntWritable, ClusterWritable> output,
        Reporter reporter)
        throws IOException {

      String line = value.toString();
      int movieIndex = line.indexOf(":");
      if (movieIndex <= 0) {
        return;
      }

      String movieIdStr = line.substring(0, movieIndex);
      // The numeric id is not used below; parsing is kept so a non-numeric id still fails loudly.
      Long.parseLong(movieIdStr);

      // Re-serialised record. Local and single-threaded, so StringBuilder is sufficient.
      StringBuilder record = new StringBuilder();
      record.append(movieIdStr).append(":");

      // Squared distance to each centroid; Java zero-initialises the array.
      // Sized by maxClusters but indexed up to totalClusters - assumes
      // totalClusters <= maxClusters (TODO confirm).
      float[] distance = new float[maxClusters];

      StringTokenizer tokens = new StringTokenizer(line.substring(movieIndex + 1), ",");
      int attrCnt = 0;
      while (tokens.hasMoreTokens() && attrCnt < attrNum) {
        String tok = tokens.nextToken();
        // indexOf returns -1 when there is no '_', which makes this the whole token.
        String reviewStr = tok.substring(tok.indexOf("_") + 1);

        if (attrCnt > 0) {
          record.append(",");
        }
        record.append(attrCnt).append("_").append(reviewStr);

        int review = Integer.parseInt(reviewStr);
        for (int r = 0; r < totalClusters; r++) {
          int rating = (int) centroids_ref[r].reviews.get(attrCnt).rating;
          int diff = review - rating;
          distance[r] += (float) (diff * diff);
        }
        attrCnt++;
      }

      int clusterId = 0;
      float maxSimilarity = 0.0f;
      for (int p = 0; p < totalClusters; p++) {
        float similarity = 250 - distance[p];
        if (similarity > maxSimilarity) {
          maxSimilarity = similarity;
          clusterId = p;
        }
      }

      ClusterWritable assigned = new ClusterWritable();
      assigned.movies.add(record.toString());
      assigned.similarities.add(maxSimilarity);
      assigned.similarity = maxSimilarity;
      output.collect(new IntWritable(clusterId), assigned);
      reporter.incrCounter(Counter.WORDS, 1);
    }
  /**
   * Summarises the rows of one key as {@code (count, distinct IPs)}.
   *
   * <p>Each value is split on {@code TAB}; column 2 is treated as the "mid" and column 3 as the IP.
   * Reading stops once 100002 rows have been buffered (the check {@code count++ > 100000} runs
   * after the row is stored).
   *
   * <ul>
   *   <li>If more than 10000 rows were read, the emitted count is the number of distinct IPs.
   *   <li>Otherwise rows are grouped: a row joins the first existing group that already contains
   *       its mid or its IP; if there is none it starts a new group, which immediately absorbs
   *       every later row sharing a mid or IP with the (growing) group. The emitted count is the
   *       number of groups.
   * </ul>
   *
   * <p>In both cases the second part of the output is all distinct IPs joined with {@code '|'}.
   * IPs are collected from every buffered row up front, so rows absorbed into a group still
   * contribute their IP. Group membership is tested with exact set lookups, so empty fields or
   * fields containing {@code '|'} cannot match a group they do not belong to.
   *
   * <p>NOTE(review): the read cap (100000) and the fast-path threshold (10000) are different
   * constants - confirm this is intentional. A row that shares values with two existing groups
   * joins only the first; groups are never merged.
   *
   * @param key the grouping key, passed through unchanged
   * @param values tab-separated rows for this key
   * @param output receives exactly one pair for the key
   * @param reporter not used
   * @throws IOException if {@code output.collect} fails
   * @throws ArrayIndexOutOfBoundsException if a row has fewer than four columns
   */
  public void reduce(
      Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
      throws IOException {
    List<String[]> rows = new ArrayList<String[]>();
    int count = 0;
    while (values.hasNext()) {
      rows.add(StringUtils.splitPreserveAllTokens(values.next().toString(), TAB));
      if (count++ > 100000) {
        break;
      }
    }

    // Distinct IPs over every buffered row, independent of how the rows are grouped later.
    Set<String> ipSet = new HashSet<String>();
    for (String[] row : rows) {
      ipSet.add(row[3]);
    }

    if (count > 10000) {
      // Too many rows to group pairwise: report the distinct-IP count instead.
      output.collect(
          key, Utils.mergeKey(String.valueOf(ipSet.size()), StringUtils.join(ipSet, '|')));
      return;
    }

    // Grouping algorithm: for each remaining row, join the first group that already knows its mid
    // or IP; otherwise start a new group and absorb (null out) every later row connected to it.
    List<Set<String>> groups = new ArrayList<Set<String>>();
    for (int i = 0; i < rows.size(); i++) {
      String[] row = rows.get(i);
      if (row == null) {
        continue; // already absorbed by an earlier group's forward scan
      }
      String mid = row[2];
      String ip = row[3];

      boolean joined = false;
      for (Set<String> group : groups) {
        if (group.contains(mid) || group.contains(ip)) {
          group.add(mid);
          group.add(ip);
          joined = true;
          break;
        }
      }
      if (joined) {
        continue;
      }

      Set<String> group = new HashSet<String>();
      group.add(mid);
      group.add(ip);
      for (int k = i + 1; k < rows.size(); k++) {
        String[] later = rows.get(k);
        if (later == null) {
          continue;
        }
        if (group.contains(later[2]) || group.contains(later[3])) {
          group.add(later[2]);
          group.add(later[3]);
          rows.set(k, null);
        }
      }
      groups.add(group);
    }

    output.collect(
        key, Utils.mergeKey(String.valueOf(groups.size()), StringUtils.join(ipSet, '|')));
  }