// specify input and out keys public void map( LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { String line = value.toString(); // define new variable to be string ArrayList<Integer> range = new ArrayList<Integer>(); for (int i = 2000; i <= 2010; i++) { range.add(i); } // String[] inputs = line.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)"); String[] inputs = line.split(","); try { int year = Integer.parseInt(inputs[165]); if (range.contains(year)) { String dur = inputs[3]; String artist_name = inputs[2]; String song_title = inputs[1]; String final_input = artist_name + ',' + dur + ',' + song_title; Final_Value.set(final_input); output.collect(Final_Value, dummy); } } catch (NumberFormatException e) { // do nothing } }
/** * Produce splits such that each is no greater than the quotient of the total size and the * number of splits requested. * * @param job The handle to the JobConf object * @param numSplits Number of splits requested */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { int cnfiles = job.getInt(SRC_COUNT_LABEL, -1); long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1); String srcfilelist = job.get(SRC_LIST_LABEL, ""); if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) { throw new RuntimeException( "Invalid metadata: #files(" + cnfiles + ") total_size(" + cbsize + ") listuri(" + srcfilelist + ")"); } Path src = new Path(srcfilelist); FileSystem fs = src.getFileSystem(job); FileStatus srcst = fs.getFileStatus(src); ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits); LongWritable key = new LongWritable(); FilePair value = new FilePair(); final long targetsize = cbsize / numSplits; long pos = 0L; long last = 0L; long acc = 0L; long cbrem = srcst.getLen(); SequenceFile.Reader sl = null; try { sl = new SequenceFile.Reader(fs, src, job); for (; sl.next(key, value); last = sl.getPosition()) { // if adding this split would put this split past the target size, // cut the last split and put this next file in the next split. if (acc + key.get() > targetsize && acc != 0) { long splitsize = last - pos; splits.add(new FileSplit(src, pos, splitsize, (String[]) null)); cbrem -= splitsize; pos = last; acc = 0L; } acc += key.get(); } } finally { checkAndClose(sl); } if (cbrem != 0) { splits.add(new FileSplit(src, pos, cbrem, (String[]) null)); } return splits.toArray(new FileSplit[splits.size()]); }
/**
 * Returns the arithmetic mean of the given values.
 *
 * @param l values to average; must not be {@code null} or contain {@code null}
 * @return the sum of the values divided by their count, or {@code NaN} for an empty list
 */
private float mean(ArrayList<Integer> l) {
  // Accumulate in a primitive long: an int total can silently wrap around for large inputs,
  // and a boxed Integer accumulator allocates on every addition.
  long total = 0L;
  for (int v : l) {
    total += v;
  }
  return ((float) total) / l.size();
}
/**
 * Computes the sample standard deviation (divisor {@code n - 1}) of the given values.
 *
 * <p>The squared deviations from {@link #mean} are summed in list order using {@code float}
 * arithmetic. A single-element list yields {@code NaN} because the divisor is zero.
 *
 * @param l values to analyse; must not be {@code null} or contain {@code null}
 * @return the square root of the summed squared deviations divided by {@code n - 1}
 */
private float standard_deviation(ArrayList<Integer> l) {
  final int n = l.size();
  final float average = this.mean(l);

  float squaredDeviations = 0;
  for (int idx = 0; idx < n; idx++) {
    float delta = l.get(idx) - average;
    squaredDeviations += delta * delta;
  }
  return (float) Math.sqrt(squaredDeviations / (n - 1));
}
/**
 * Counts how often each distinct value occurs for the key and emits summary statistics of those
 * counts.
 *
 * <p>The emitted text holds six space-separated fields: number of distinct values, minimum
 * count, median count, maximum count, mean count and standard deviation of the counts.
 *
 * @param key group key; passed through unchanged to the output
 * @param values the values grouped under {@code key}; compared by their string form
 * @param output collector receiving the single summary record
 * @param reporter progress reporter; not used by this method
 * @throws IOException if the collector fails
 */
public void reduce(
    IntWritable key,
    Iterator<Text> values,
    OutputCollector<IntWritable, Text> output,
    Reporter reporter)
    throws IOException {
  HashMap<String, Integer> occurrences = new HashMap<String, Integer>();
  while (values.hasNext()) {
    String label = values.next().toString();
    Integer seen = occurrences.get(label);
    occurrences.put(label, seen == null ? 1 : seen + 1);
  }

  ArrayList<Integer> tallies = new ArrayList<Integer>(occurrences.values());

  // Keep this exact call order: median() sorts the list in place, so the helpers invoked after
  // it see the sorted list.
  StringBuilder summary = new StringBuilder();
  summary.append(occurrences.size()).append(" ");
  summary.append(Collections.min(tallies)).append(" ");
  summary.append(median(tallies)).append(" ");
  summary.append(Collections.max(tallies)).append(" ");
  summary.append(mean(tallies)).append(" ");
  summary.append(standard_deviation(tallies));

  output.collect(key, new Text(summary.toString()));
}
/**
 * Returns the middle element of the list after sorting it in ascending order.
 *
 * <p>The list passed in is sorted in place. For an even number of elements the upper of the two
 * middle elements is returned; no averaging is done.
 *
 * @param l values to inspect; sorted as a side effect and must not be empty
 * @return the element at index {@code size / 2} of the sorted list
 */
private int median(ArrayList<Integer> l) {
  Collections.sort(l);
  final int middle = l.size() / 2;
  return l.get(middle);
}
@Override public void reduce( IntWritable key, Iterator<ClusterWritable> values, OutputCollector<IntWritable, Text> output, Reporter reporter) throws IOException { float sumSimilarity = 0.0f; int numMovies = 0; float avgSimilarity = 0.0f; float similarity = 0.0f; int s = 0; int count; float diff = 0.0f; float minDiff = 1.0f; int candidate = 0; String data = new String(""); String shortline = new String(""); ArrayList<String> arrl = new ArrayList<String>(); ArrayList<Float> simArrl = new ArrayList<Float>(); String oneElm = new String(); int indexShort, index2; Text val = new Text(); while (values.hasNext()) { ClusterWritable cr = (ClusterWritable) values.next(); similarity = cr.similarity; simArrl.addAll(cr.similarities); for (int i = 0; i < cr.movies.size(); i++) { oneElm = cr.movies.get(i); indexShort = oneElm.indexOf( ",", 1000); // to avoid memory error caused by long arrays; it will results less // accurate if (indexShort == -1) { shortline = new String(oneElm); } else { shortline = new String(oneElm.substring(0, indexShort)); } arrl.add(shortline); output.collect(key, new Text(oneElm)); } numMovies += cr.movies.size(); sumSimilarity += similarity; } if (numMovies > 0) { avgSimilarity = sumSimilarity / (float) numMovies; } diff = 0.0f; minDiff = 1.0f; for (s = 0; s < numMovies; s++) { diff = (float) Math.abs(avgSimilarity - simArrl.get(s)); if (diff < minDiff) { minDiff = diff; candidate = s; } } data = arrl.get(candidate); index2 = data.indexOf(":"); String movieStr = data.substring(0, index2); String reviews = data.substring(index2 + 1); StringTokenizer token = new StringTokenizer(reviews, ","); count = 0; while (token.hasMoreTokens()) { token.nextToken(); count++; } System.out.println( "The key = " + key.toString() + " has members = " + numMovies + " simil = " + simArrl.get(candidate)); val = new Text(simArrl.get(candidate) + " " + movieStr + " " + count + " " + reviews); output.collect(key, val); reporter.incrCounter(Counter.VALUES, 1); }