// Mapper: filter songs released between 2000 and 2010 and emit
// (artist,duration,title) records.
public void map(LongWritable key, Text value,
    OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
  String line = value.toString();

  // accepted release years: 2000..2010 inclusive
  ArrayList<Integer> range = new ArrayList<Integer>();
  for (int i = 2000; i <= 2010; i++) {
    range.add(i);
  }

  // quote-aware alternative, kept for reference:
  // String[] inputs = line.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");
  String[] inputs = line.split(",");

  try {
    int year = Integer.parseInt(inputs[165]);
    if (range.contains(year)) {
      String dur = inputs[3];
      String artist_name = inputs[2];
      String song_title = inputs[1];
      String final_input = artist_name + ',' + dur + ',' + song_title;
      Final_Value.set(final_input);
      output.collect(Final_Value, dummy);
    }
  } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
    // skip malformed rows: short records or non-numeric year fields
  }
}
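// NOTE: a minimal sketch of the enclosing old-API Mapper class, since the
// map() above references the fields Final_Value and dummy without declaring
// them. The class name and the empty dummy value are assumptions.
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class SongFilterMapper extends MapReduceBase
    implements Mapper<LongWritable, Text, Text, Text> {
  private final Text Final_Value = new Text();
  private final Text dummy = new Text(); // placeholder value; only keys carry data

  // ... map(...) exactly as above ...
}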
/**
 * Produce splits such that each is no greater than the quotient of the
 * total size and the number of splits requested.
 *
 * @param job The handle to the JobConf object
 * @param numSplits Number of splits requested
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
  long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
  String srcfilelist = job.get(SRC_LIST_LABEL, "");
  if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) {
    throw new RuntimeException("Invalid metadata: #files(" + cnfiles
        + ") total_size(" + cbsize + ") listuri(" + srcfilelist + ")");
  }
  Path src = new Path(srcfilelist);
  FileSystem fs = src.getFileSystem(job);
  FileStatus srcst = fs.getFileStatus(src);

  ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
  LongWritable key = new LongWritable();
  FilePair value = new FilePair();
  final long targetsize = cbsize / numSplits;
  long pos = 0L;
  long last = 0L;
  long acc = 0L;
  long cbrem = srcst.getLen();
  SequenceFile.Reader sl = null;
  try {
    sl = new SequenceFile.Reader(fs, src, job);
    for (; sl.next(key, value); last = sl.getPosition()) {
      // If adding this file would push the current split past the target
      // size, close the split here and start the next file in a new split.
      if (acc + key.get() > targetsize && acc != 0) {
        long splitsize = last - pos;
        splits.add(new FileSplit(src, pos, splitsize, (String[]) null));
        cbrem -= splitsize;
        pos = last;
        acc = 0L;
      }
      acc += key.get();
    }
  } finally {
    checkAndClose(sl);
  }
  if (cbrem != 0) {
    splits.add(new FileSplit(src, pos, cbrem, (String[]) null));
  }
  return splits.toArray(new FileSplit[splits.size()]);
}
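// NOTE: a hedged sketch of the driver-side bookkeeping getSplits() expects.
// SRC_COUNT_LABEL, TOTAL_SIZE_LABEL, and SRC_LIST_LABEL are the constants
// read above; fileCount, totalBytes, and srcListPath are illustrative.
JobConf job = new JobConf();
job.setInt(SRC_COUNT_LABEL, fileCount);          // number of source files
job.setLong(TOTAL_SIZE_LABEL, totalBytes);       // sum of their sizes in bytes
job.set(SRC_LIST_LABEL, srcListPath.toString()); // SequenceFile of (size, FilePair)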
private float mean(ArrayList<Integer> l) {
  int t = l.size();
  int sum = 0;
  for (Integer i : l) {
    sum += i;
  }
  return ((float) sum) / t;
}
private float standard_deviation(ArrayList<Integer> l) {
  int t = l.size();
  float ans = 0, mn = this.mean(l);
  for (Integer i : l) {
    ans += (i - mn) * (i - mn);
  }
  // sample standard deviation: divide by t - 1, not t
  return (float) Math.sqrt(ans / (t - 1));
}
public void reduce(IntWritable sameNum, Iterator<Text> data,
    OutputCollector<Text, jBLASArrayWritable> output, Reporter reporter)
    throws IOException {
  int totalBatchCount = exampleCount / batchSize;

  DoubleMatrix weights = DoubleMatrix.randn(hiddenNodes, visibleNodes);
  DoubleMatrix hbias = DoubleMatrix.zeros(hiddenNodes);
  DoubleMatrix vbias = DoubleMatrix.zeros(visibleNodes);
  DoubleMatrix label = DoubleMatrix.zeros(1);
  DoubleMatrix hidden_chain = null;
  DoubleMatrix vdata = DoubleMatrix.zeros(batchSize, visibleNodes);

  ArrayList<DoubleMatrix> outputMatrices = new ArrayList<DoubleMatrix>();
  outputMatrices.add(weights);
  outputMatrices.add(hbias);
  outputMatrices.add(vbias);
  outputMatrices.add(label);
  outputMatrices.add(vdata);
  outputMatrices.add(hidden_chain); // placeholder; filled in by later phases

  for (int i = 0; i < totalBatchCount; i++) {
    int j = 0;
    while (data.hasNext() && j < batchSize) {
      StringTokenizer tk = new StringTokenizer(data.next().toString());
      label.put(0, Double.parseDouble(tk.nextToken()));
      String image = tk.nextToken();
      for (int k = 0; k < image.length(); k++) {
        // row j of vdata holds the j-th example of this batch
        vdata.put(j, k, (double) image.charAt(k));
      }
      j++; // advance after writing, so rows run 0..batchSize-1
      dataArray = new jBLASArrayWritable(outputMatrices);
      batchID.set("1\t" + i);
      output.collect(batchID, dataArray);
    }
  }
}
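// NOTE: assumed reducer state for the mini-batch builder above (apparently
// an RBM trainer, given the weights/hbias/vbias/hidden_chain matrices).
// Names are taken from the method body; the declarations are a sketch.
private int exampleCount;   // total training examples seen by this reducer
private int batchSize;      // examples per mini-batch
private int hiddenNodes;    // hidden-layer size
private int visibleNodes;   // visible-layer size, e.g. pixels per image
private final Text batchID = new Text();
private jBLASArrayWritable dataArray;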
@Override
protected void reduce(Centroid key, Iterable<Point> values, Context context)
    throws IOException, InterruptedException {
  ArrayList<Point> points = new ArrayList<Point>();
  int clusterId = key.id;
  Point newCenter = null;
  for (Point p : values) {
    Point copy = p.deepCopy();
    points.add(copy);
    if (newCenter == null) {
      newCenter = new Point(copy.deepCopy());
    } else {
      newCenter = newCenter.add(copy);
    }
  }
  Centroid center = new Centroid(clusterId, newCenter);
  for (Point p : points) {
    context.write(center, p);
  }
}
public void reduce(IntWritable key, Iterator<Text> values,
    OutputCollector<IntWritable, Text> output, Reporter reporter) throws IOException {
  HashMap<String, Integer> countries_map = new HashMap<String, Integer>();
  ArrayList<Integer> counts = new ArrayList<>();

  // count how many times each country appears for this key
  while (values.hasNext()) {
    String cp = values.next().toString();
    if (countries_map.containsKey(cp)) {
      countries_map.put(cp, countries_map.get(cp) + 1);
    } else {
      countries_map.put(cp, 1);
    }
  }

  for (java.util.Map.Entry<String, Integer> entry : countries_map.entrySet()) {
    counts.add(entry.getValue());
  }

  // emit: #countries min median max mean stddev
  output.collect(key, new Text("" + countries_map.size() + " "
      + Collections.min(counts) + " " + median(counts) + " "
      + Collections.max(counts) + " " + mean(counts) + " "
      + standard_deviation(counts)));
}
@SuppressWarnings("unchecked")
public void writeToDisk(Configuration conf, boolean writeToDistributedCache)
    throws IOException {
  String bucketCachePath = PathUtils.getCachePath(conf) + BUCKET_CACHE_FOLDER;
  FileSystem fs = FileSystem.get(conf);
  MapFile.Writer writer = null;
  try {
    writer = new MapFile.Writer(conf, new Path(bucketCachePath),
        MapFile.Writer.keyClass(IntWritable.class),
        MapFile.Writer.valueClass(Bucket.class));
    // MapFile requires keys to be appended in sorted order
    ArrayList<IntWritable> keyList = new ArrayList<IntWritable>();
    for (IntWritable i : bucketCache.keySet()) {
      keyList.add(i);
    }
    Collections.sort(keyList);
    for (IntWritable i : keyList) {
      writer.append(i, bucketCache.get(i));
    }
  } finally {
    if (writer != null) {
      IOUtils.closeStream(writer);
    }
  }
  if (writeToDistributedCache) {
    for (FileStatus status : fs.listStatus(new Path(bucketCachePath))) {
      if (!status.isDirectory()) {
        DistributedCache.addCacheFile(status.getPath().toUri(), conf);
      }
    }
  }
}
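// NOTE: a sketch of the matching read path, assuming consumers open the
// MapFile straight from the cache directory; error handling is elided.
MapFile.Reader reader = null;
try {
  reader = new MapFile.Reader(fs, bucketCachePath, conf);
  IntWritable k = new IntWritable(42); // illustrative key
  Bucket bucket = new Bucket();
  if (reader.get(k, bucket) != null) {
    // bucket now holds the cached value for key 42
  }
} finally {
  if (reader != null) {
    IOUtils.closeStream(reader);
  }
}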
// v is all nodes within this block B
// u is all nodes pointing to this set of v
// some u are inside the block as well; those are in BE
// some u are outside the block; those are in BC
// BE = the edges from nodes in block B
// BC = the boundary conditions
// NPR[v] = next PageRank value of node v
protected double IterateBlockOnce() {
  // used to iterate through the BE list of edges
  ArrayList<String> uList = new ArrayList<String>();
  // npr = next PageRank value of node v
  double npr = 0.0;
  // r = sum of PR[u]/deg[u] for boundary nodes pointing to v
  double r = 0.0;
  // resErr = the average residual error for this iteration
  double resErr = 0.0;

  HashMap<String, Double> tempmap = new HashMap<String, Double>();
  for (String v : vList) {
    npr = 0.0;
    double prevPR = newPR.get(v);

    // accumulate PR contributions from in-block (BE) nodes pointing to v
    if (BE.containsKey(v)) {
      uList = BE.get(v);
      for (String u : uList) {
        // npr += PR[u] / deg(u);
        NodeData uNode = nodeDataMap.get(u);
        npr += (newPR.get(u) / uNode.getDegrees());
      }
    }

    // add any PR contribution from nodes outside the block (BC)
    if (BC.containsKey(v)) {
      r = BC.get(v);
      npr += r;
    }

    // NPR[v] = d * NPR[v] + (1 - d) / N;
    npr = (dampingFactor * npr) + randomJumpFactor;

    // stage the updated value; newPR is swapped in after the full pass
    tempmap.put(v, npr);

    // track the sum of the residual errors
    resErr += Math.abs(prevPR - npr) / npr;
  }

  // swap in the new PageRank values and return the average residual error
  newPR = tempmap;
  resErr = resErr / vList.size();
  return resErr;
}
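// NOTE: IterateBlockOnce() and the blocked-PageRank reduce() below share
// per-reducer state that neither snippet declares; a sketch of the assumed
// fields (the damping value 0.85 is illustrative):
private ArrayList<String> vList = new ArrayList<String>();             // nodes in this block
private HashMap<String, Double> newPR = new HashMap<String, Double>(); // current PR per node
private HashMap<String, ArrayList<String>> BE =
    new HashMap<String, ArrayList<String>>();                          // in-block edges u -> v
private HashMap<String, Double> BC = new HashMap<String, Double>();    // boundary PR mass per v
private HashMap<String, NodeData> nodeDataMap = new HashMap<String, NodeData>();
private double threshold;            // residual-error stopping criterion
private double dampingFactor = 0.85; // d
private double randomJumpFactor;     // (1 - d) / N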
private int median(ArrayList<Integer> l) {
  Collections.sort(l);
  int t = l.size();
  // for an even-sized list this returns the upper of the two middle values
  return l.get(t / 2);
}
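// NOTE: mean(), standard_deviation(), and median() feed the summary line
// emitted by the country reducer above. A quick standalone check, assuming
// it runs inside the same class (expected values follow from the formulas):
ArrayList<Integer> sample = new ArrayList<Integer>(Arrays.asList(2, 4, 4, 4, 5, 5, 7, 9));
System.out.println(mean(sample));               // 40 / 8 = 5.0
System.out.println(median(sample));             // sorted[8 / 2] = sorted[4] = 5
System.out.println(standard_deviation(sample)); // sqrt(32 / 7) ~= 2.138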
@Override
public void map(WritableComparable docID, Text docContents, Context context)
    throws IOException, InterruptedException {
  Matcher matcher = WORD_PATTERN.matcher(docContents.toString());
  Func func = funcFromNum(funcNum);

  ArrayList<String> docWords = new ArrayList<String>();     // all words in the document
  ArrayList<Double> targetGramPos = new ArrayList<Double>(); // positions of the target word
  DoublePair values = new DoublePair();                      // (distance, co-occurrence count)
  values.setDouble2(1.0); // each emitted pair represents one co-occurrence
  Text output = new Text();

  // collect every word in the document, then record where the target word occurs
  while (matcher.find()) {
    docWords.add(matcher.group().toLowerCase());
  }
  for (int i = 0; i < docWords.size(); i++) {
    if (docWords.get(i).equals(targetGram)) {
      targetGramPos.add((double) i);
    }
  }

  // For each word, find the distance to the nearest occurrence of the target
  // word. If the target word never appears, the distance is infinite.
  int indexTw = 0;       // index into targetGramPos of the nearest occurrence so far
  double distance = 0.0; // distance between the current word and the target word
  for (int i = 0; i < docWords.size(); i++) {
    if (targetGramPos.isEmpty()) {
      distance = Double.POSITIVE_INFINITY;
    } else {
      // skip occurrences of the target word itself
      if (docWords.get(i).equals(targetGram)) {
        continue;
      }
      if (targetGramPos.size() == 1) {
        distance = Math.abs(i - targetGramPos.get(indexTw));
      } else {
        // advance indexTw while the next occurrence is closer to position i
        if (indexTw < targetGramPos.size() - 1
            && Math.abs(i - targetGramPos.get(indexTw))
                > Math.abs(i - targetGramPos.get(indexTw + 1))) {
          indexTw++;
        }
        distance = Math.abs(i - targetGramPos.get(indexTw));
      }
    }
    values.setDouble1(func.f(distance)); // evaluate f(d) on the distance
    output.set(docWords.get(i));
    // key: each word, value: (f(distance), co-occurrence count)
    context.write(output, values);
  }
}
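// NOTE: DoublePair is not defined in the snippet; a minimal sketch of the
// Writable it would need to be. The field layout is an assumption:
// double1 carries f(distance), double2 carries the co-occurrence count.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class DoublePair implements Writable {
  private double double1;
  private double double2;

  public void setDouble1(double d) { this.double1 = d; }
  public void setDouble2(double d) { this.double2 = d; }
  public double getDouble1() { return double1; }
  public double getDouble2() { return double2; }

  public void write(DataOutput out) throws IOException {
    out.writeDouble(double1);
    out.writeDouble(double2);
  }

  public void readFields(DataInput in) throws IOException {
    double1 = in.readDouble();
    double2 = in.readDouble();
  }
}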
protected void reduce(Text key, Iterable<Text> values, Context context)
    throws IOException, InterruptedException {
  Iterator<Text> itr = values.iterator();
  Text input = new Text();
  String[] inputTokens = null;

  // initialize/reset all per-block state
  Double pageRankOld = 0.0;
  Double residualError = 0.0;
  String output = "";
  Integer maxNode = 0;
  ArrayList<String> temp = new ArrayList<String>();
  Double tempBC = 0.0;
  vList.clear();
  newPR.clear();
  BE.clear();
  BC.clear();
  nodeDataMap.clear();

  while (itr.hasNext()) {
    input = itr.next();
    inputTokens = input.toString().split(" ");

    // "PR": node ID, previous PageRank, and outgoing edge list for this node
    if (inputTokens[0].equals("PR")) {
      String nodeID = inputTokens[1];
      pageRankOld = Double.parseDouble(inputTokens[2]);
      newPR.put(nodeID, pageRankOld);
      NodeData node = new NodeData();
      node.setNodeID(nodeID);
      node.setPageRank(pageRankOld);
      if (inputTokens.length == 4) {
        node.setEdgeList(inputTokens[3]);
        node.setDegrees(inputTokens[3].split(",").length);
      }
      vList.add(nodeID);
      nodeDataMap.put(nodeID, node);
      // keep track of the max nodeID for this block
      if (Integer.parseInt(nodeID) > maxNode) {
        maxNode = Integer.parseInt(nodeID);
      }

    // "BE": an in-block edge u -> v
    } else if (inputTokens[0].equals("BE")) {
      if (BE.containsKey(inputTokens[2])) {
        temp = BE.get(inputTokens[2]);
      } else {
        temp = new ArrayList<String>(); // initialize BE for this v
      }
      temp.add(inputTokens[1]);
      BE.put(inputTokens[2], temp);

    // "BC": an incoming edge from a node outside the block
    } else if (inputTokens[0].equals("BC")) {
      if (BC.containsKey(inputTokens[2])) {
        tempBC = BC.get(inputTokens[2]);
      } else {
        tempBC = 0.0; // initialize BC for this v
      }
      tempBC += Double.parseDouble(inputTokens[3]);
      BC.put(inputTokens[2], tempBC);
    }
  }

  // iterate the block until the residual error drops below the threshold
  int i = 0;
  do {
    i++;
    residualError = IterateBlockOnce();
  } while (residualError > threshold);

  // compute the final residual error for each node in this block
  residualError = 0.0;
  for (String v : vList) {
    NodeData node = nodeDataMap.get(v);
    residualError += Math.abs(node.getPageRank() - newPR.get(v)) / newPR.get(v);
  }
  residualError = residualError / vList.size();

  // add the residual error to the counter tracking the overall sum
  // (counters only hold long values, so scale before truncating)
  long residualAsLong = (long) Math.floor(residualError * PageRankBlock.precision);
  long numberOfIterations = (long) i;
  context.getCounter(PageRankBlock.ProjectCounters.RESIDUAL_ERROR).increment(residualAsLong);
  context.getCounter(PageRankBlock.ProjectCounters.AVERAGE_ITERATIONS).increment(numberOfIterations);

  // output per node:
  //   key: nodeID
  //   value: <pageRankNew> <degrees> <comma-separated outgoing edge list>
  for (String v : vList) {
    NodeData node = nodeDataMap.get(v);
    output = newPR.get(v) + " " + node.getDegrees() + " " + node.getEdgeList();
    Text outputText = new Text(output);
    Text outputKey = new Text(v);
    context.write(outputKey, outputText);
    if (v.equals(maxNode.toString())) {
      System.out.println("Block:" + key + " | node:" + v + " | pageRank:" + newPR.get(v));
    }
  }

  cleanup(context);
}
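// NOTE: a hedged sketch of how a driver might read back the two counters
// incremented above to decide whether to chain another pass; job setup is
// elided, and blockCount / convergenceThreshold are illustrative.
job.waitForCompletion(true);
long residualSum = job.getCounters()
    .findCounter(PageRankBlock.ProjectCounters.RESIDUAL_ERROR).getValue();
double avgResidual = (double) residualSum / PageRankBlock.precision / blockCount;
if (avgResidual > convergenceThreshold) {
  // run another blocked-PageRank pass with this job's output as input
}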