/**
 * Re-emits an input line as position-tagged columns, keyed by the first column.
 *
 * <p>The line is split on {@code splitChar} with empty columns preserved. Every column is written
 * as {@code <index>=<value>} followed by a tab (so the emitted value ends with a trailing tab),
 * and the record is collected under the text of column 0.
 *
 * <p>A line that yields no columns at all is skipped: there is no column 0 to key on, and
 * indexing into the empty array would otherwise abort the task.
 *
 * @param key input key; not read
 * @param value the raw input line
 * @param output receives {@code (column 0, tagged columns)}
 * @param reporter not used
 * @throws IOException propagated from {@code output.collect}
 */
public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
    throws IOException {
  String[] columns = StringUtils.splitPreserveAllTokens(value.toString(), splitChar);

  // Assuming Apache Commons Lang StringUtils: an empty line splits to a zero-length array
  // (and a null input to null), so guard before touching columns[0].
  if (columns == null || columns.length == 0) {
    return;
  }

  // Local, single-threaded buffer: StringBuilder avoids StringBuffer's needless locking.
  StringBuilder tagged = new StringBuilder();
  for (int i = 0; i < columns.length; i++) {
    tagged.append(i).append('=').append(columns[i]).append("\t");
  }

  output.collect(new Text(columns[0]), new Text(tagged.toString()));
}
public void map( LongWritable key, Text value, OutputCollector<IntWritable, ClusterWritable> output, Reporter reporter) throws IOException { String movieIdStr = new String(); String reviewStr = new String(); String userIdStr = new String(); String reviews = new String(); String line = new String(); String tok = new String(""); long movieId; int review, userId, p, q, r, rater, rating, movieIndex; int clusterId = 0; int[] n = new int[maxClusters]; float[] sq_a = new float[maxClusters]; float[] sq_b = new float[maxClusters]; float[] numer = new float[maxClusters]; float[] denom = new float[maxClusters]; float max_similarity = 0.0f; float similarity = 0.0f; Cluster movie = new Cluster(); ClusterWritable movies_arrl = new ClusterWritable(); StringBuffer sb = new StringBuffer(); line = ((Text) value).toString(); movieIndex = line.indexOf(":"); for (r = 0; r < maxClusters; r++) { numer[r] = 0.0f; denom[r] = 0.0f; sq_a[r] = 0.0f; sq_b[r] = 0.0f; n[r] = 0; } if (movieIndex > 0) { movieIdStr = line.substring(0, movieIndex); sb.append(movieIdStr).append(":"); movieId = Long.parseLong(movieIdStr); movie.movie_id = movieId; reviews = line.substring(movieIndex + 1); StringTokenizer token = new StringTokenizer(reviews, ","); int attrCnt = 0; // while (token.hasMoreTokens()) { Leo while (token.hasMoreTokens() && attrCnt < attrNum) { tok = token.nextToken(); int reviewIndex = tok.indexOf("_"); // userIdStr = tok.substring(0, reviewIndex); //Leo userIdStr = String.valueOf(attrCnt); reviewStr = tok.substring(reviewIndex + 1); if (attrCnt > 0) { sb.append(","); } sb.append(String.valueOf(attrCnt)).append("_").append(reviewStr); userId = Integer.parseInt(userIdStr); review = Integer.parseInt(reviewStr); for (r = 0; r < totalClusters; r++) { /*for (q = 0; q < centroids_ref[r].total; q++) { rater = centroids_ref[r].reviews.get(q).rater_id; rating = (int) centroids_ref[r].reviews.get(q).rating; if (userId == rater) { numer[r] += (float) (review * rating); sq_a[r] += (float) (review * 
review); sq_b[r] += (float) (rating * rating); n[r]++; // counter break; // to avoid multiple ratings by the same reviewer } }*/ // Leo rating = (int) centroids_ref[r].reviews.get(attrCnt).rating; numer[r] += (float) ((review - rating) * (review - rating)); n[r]++; // counter } attrCnt++; } for (p = 0; p < totalClusters; p++) { /*denom[p] = (float) ((Math.sqrt((double) sq_a[p])) * (Math .sqrt((double) sq_b[p]))); if (denom[p] > 0) { similarity = numer[p] / denom[p]; if (similarity > max_similarity) { max_similarity = similarity; clusterId = p; } }*/ // Leo similarity = 250 - numer[p]; if (similarity > max_similarity) { max_similarity = similarity; clusterId = p; } } // movies_arrl.movies.add(line);//Leo movies_arrl.movies.add(sb.toString()); movies_arrl.similarities.add(max_similarity); movies_arrl.similarity = max_similarity; output.collect(new IntWritable(clusterId), movies_arrl); reporter.incrCounter(Counter.WORDS, 1); } }
public void reduce( Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { List<String[]> _temp = new ArrayList<String[]>(); int count = 0; while (values.hasNext()) { Text _out = values.next(); String[] tokens = StringUtils.splitPreserveAllTokens(_out.toString(), TAB); _temp.add(tokens); if (count++ > 100000) break; } if (count > 10000) { Set<String> ipSet = new HashSet<String>(); for (int posI = 0; posI < _temp.size(); posI++) { String[] array = _temp.get(posI); if (array == null) continue; String mid = array[2]; String ip = array[3]; ipSet.add(ip); } output.collect( key, Utils.mergeKey(String.valueOf(ipSet.size()), StringUtils.join(ipSet, '|'))); return; } /** * ·Ö×éËã·¨ FOREACH ALL_DATA IF IN INDEX THEN UPDATE INDEX AND INSERT DATA ELSE FOREACH SUB_DATA * MAKE INDEX AND SET FIND'S DATA AS NULL */ // List<List<String[]>> dataList = new ArrayList<List<String[]>>(); List<StringBuffer> indexList = new ArrayList<StringBuffer>(); Set<String> ipSet = new HashSet<String>(); boolean muliHost = false; for (int posI = 0; posI < _temp.size(); posI++) { String[] array = _temp.get(posI); if (array == null) continue; String mid = array[2]; String ip = array[3]; ipSet.add(ip); boolean hasIndex = false; for (int i = 0; i < indexList.size(); i++) { StringBuffer index = indexList.get(i); if (index.indexOf("|" + mid + "|") >= 0 || index.indexOf("|" + ip + "|") >= 0) { if (index.indexOf("|" + mid + "|") < 0) { index.append('|').append(mid).append('|'); } if (index.indexOf("|" + ip + "|") < 0) { index.append('|').append(ip).append('|'); } // dataList.get(i).add(array); hasIndex = true; break; } } if (!hasIndex) { StringBuffer index = new StringBuffer("|" + mid + "|" + ip + "|"); // List<String[]> _tmp = new ArrayList<String[]>(); // _tmp.add(array); for (int k = posI + 1; k < _temp.size(); k++) { String[] _newArray = _temp.get(k); if (_newArray == null) { continue; } String _mid = _newArray[2]; String _ip = _newArray[3]; if 
(index.indexOf("|" + _mid + "|") >= 0 || index.indexOf("|" + _ip + "|") >= 0) { if (index.indexOf("|" + _mid + "|") < 0) { index.append('|').append(_mid).append('|'); } if (index.indexOf("|" + _ip + "|") < 0) { index.append('|').append(_ip).append('|'); } // _tmp.add(_newArray); _temp.set(k, null); } } indexList.add(index); // dataList.add(_tmp); } } // for(String[] _array : _temp){ // output.collect(key,Utils.mergeKey(_array[1],_array[2],_array[3],_array[4])); // } StringBuffer allIndex = new StringBuffer(); for (StringBuffer index : indexList) { allIndex.append(index).append(';'); } if (allIndex.length() > 0) { allIndex.deleteCharAt(allIndex.length() - 1); } output.collect( key, Utils.mergeKey(String.valueOf(indexList.size()), StringUtils.join(ipSet, '|'))); }