/**
 * Computes the Euclidean distance between two points given as coordinate lists.
 *
 * <p>The number of dimensions is taken from {@code p1}: coordinates {@code 0 .. p1.size() - 1}
 * of both lists are compared. Any additional trailing elements of {@code p2} are ignored, and two
 * empty points are at distance {@code 0.0}.
 *
 * @param p1 first point; its size determines how many coordinates are compared
 * @param p2 second point; must contain at least {@code p1.size()} elements
 * @return the square root of the sum of squared coordinate differences
 * @throws IndexOutOfBoundsException if {@code p2} has fewer elements than {@code p1}
 * @throws NullPointerException if either list is null or a compared coordinate is null
 */
protected static double Distance(List<Double> p1, List<Double> p2) {
  double sumOfSquaredDifferences = 0.0;
  final int dimensions = p1.size();
  for (int i = 0; i < dimensions; i++) {
    // Plain multiplication is exact to the last bit and avoids a general-purpose pow() call.
    final double difference = p1.get(i) - p2.get(i);
    sumOfSquaredDifferences += difference * difference;
  }
  // Math.sqrt is correctly rounded, unlike Math.pow(x, 0.5) which is only accurate to 1 ulp.
  return Math.sqrt(sumOfSquaredDifferences);
}
/** * Generate the list of files and make them into FileSplits. This needs to be copied to insert a * filter on acceptable data */ @Override public List<InputSplit> getSplits(JobContext job) throws IOException { long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); long maxSize = getMaxSplitSize(job); long desiredMappers = job.getConfiguration().getLong("org.systemsbiology.jxtandem.DesiredXMLInputMappers", 0); // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); List<FileStatus> fileStatuses = listStatus(job); boolean forceNumberMappers = fileStatuses.size() == 1; for (FileStatus file : fileStatuses) { Path path = file.getPath(); if (!isPathAcceptable(path)) // filter acceptable data continue; FileSystem fs = path.getFileSystem(job.getConfiguration()); long length = file.getLen(); BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); if ((length != 0) && isSplitable(job, path)) { long blockSize = file.getBlockSize(); // use desired mappers to force more splits if (forceNumberMappers && desiredMappers > 0) maxSize = Math.min(maxSize, (length / desiredMappers)); long splitSize = computeSplitSize(blockSize, minSize, maxSize); long bytesRemaining = length; while (withinSlop(splitSize, bytesRemaining)) { int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining); splits.add( new FileSplit( path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts())); bytesRemaining -= splitSize; } if (bytesRemaining != 0) { splits.add( new FileSplit( path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())); } } else if (length != 0) { splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts())); } else { // Create empty hosts array for zero length files splits.add(new FileSplit(path, 0, length, new String[0])); } } System.out.println("Total # of splits: " + splits.size()); // LOG.debug("Total # of splits: " + splits.size()); return splits; }
/** return progress based on the amount of data processed so far. */ public float getProgress() throws IOException, InterruptedException { long subprogress = 0; // bytes processed in current split if (null != curReader) { // idx is always one past the current subsplit's true index. subprogress = (long) (curReader.getProgress() * split.getLength(idx - 1)); } return Math.min(1.0f, (progress + subprogress) / (float) (split.getLength())); }
private List<InputSplit> getSplits( Configuration configuration, int numSplits, long totalSizeBytes) throws IOException { List<InputSplit> splits = new ArrayList<InputSplit>(numSplits); long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits); CopyListingFileStatus srcFileStatus = new CopyListingFileStatus(); Text srcRelPath = new Text(); long currentSplitSize = 0; long lastSplitStart = 0; long lastPosition = 0; final Path listingFilePath = getListingFilePath(configuration); if (LOG.isDebugEnabled()) { LOG.debug( "Average bytes per map: " + nBytesPerSplit + ", Number of maps: " + numSplits + ", total size: " + totalSizeBytes); } SequenceFile.Reader reader = null; try { reader = getListingFileReader(configuration); while (reader.next(srcRelPath, srcFileStatus)) { // If adding the current file would cause the bytes per map to exceed // limit. Add the current file to new split if (currentSplitSize + srcFileStatus.getLen() > nBytesPerSplit && lastPosition != 0) { FileSplit split = new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null); if (LOG.isDebugEnabled()) { LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize); } splits.add(split); lastSplitStart = lastPosition; currentSplitSize = 0; } currentSplitSize += srcFileStatus.getLen(); lastPosition = reader.getPosition(); } if (lastPosition > lastSplitStart) { FileSplit split = new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null); if (LOG.isDebugEnabled()) { LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize); } splits.add(split); } } finally { IOUtils.closeStream(reader); } return splits; }
public void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException { LongWritable curNodeId = key; double previousPRValue = 1; double nextPRValue = 0; double localResidual = 0; String edgeListOfCurNode = ""; long localResidualTransformed = 0; for (Text value : values) { String[] inputInfo = value.toString().split("\\s+"); // incoming pagerank value if (inputInfo.length == 1) { nextPRValue += Double.parseDouble(inputInfo[0]); } // current node info else if (inputInfo.length == 3) { edgeListOfCurNode = inputInfo[2]; previousPRValue = Double.parseDouble(inputInfo[1]); } else if (inputInfo.length == 2) { previousPRValue = Double.parseDouble(inputInfo[1]); } else { System.out.println("ERROR: received unexpected TEXT in length"); } } if (previousPRValue == 1) System.out.println("No node info has been received by a reducer"); // calculate the pagerank value according to the given formula nextPRValue = pagerankFormula(nextPRValue); // should also iterate sink nodes list, add the evenly splitted value // reducer should store the updated node info(NPR) to output directory context.write(null, new Text(curNodeId + " " + nextPRValue + " " + edgeListOfCurNode)); // then compare PPR with NPR try { localResidual = Math.abs(previousPRValue - nextPRValue) / nextPRValue; localResidualTransformed = (long) (localResidual * 10000); // System.out.println("Make sure you got the right transformed residual : // "+localResidualTransformed); } catch (ArithmeticException e) { System.out.println("PPR is zero. Check where you get the value!"); } // assume there is a global counter called residualCounter; context.getCounter(myCounter.ResidualCounter.RESIDUAL_SUM).increment(localResidualTransformed); }
// v is all nodes within this block B // u is all nodes pointing to this set of v // some u are inside the block as well, those are in BE // some u are outside the block, those are in BC // BE = the Edges from Nodes in Block B // BC = the Boundary Conditions // NPR[v] = Next PageRank value of Node v protected double IterateBlockOnce() { // used to iterate through the BE list of edges ArrayList<String> uList = new ArrayList<String>(); // npr = current PageRank value of Node v double npr = 0.0; // r = sum of PR[u]/deg[u] for boundary nodes pointing to v double r = 0.0; // resErr = the avg residual error for this iteration double resErr = 0.0; HashMap<String, Double> tempmap = new HashMap<String, Double>(); for (String v : vList) { npr = 0.0f; double prevPR = newPR.get(v); // calculate newPR using PR data from any BE nodes for this node if (BE.containsKey(v)) { uList = BE.get(v); for (String u : uList) { // npr += PR[u] / deg(u); NodeData uNode = nodeDataMap.get(u); npr += (newPR.get(u) / uNode.getDegrees()); } } // add on any PR from nodes outside the block (BC) if (BC.containsKey(v)) { r = BC.get(v); npr += r; } // NPR[v] = d*NPR[v] + (1-d)/N; npr = (dampingFactor * npr) + randomJumpFactor; // update the global newPR map tempmap.put(v, npr); // newPR.put(v, npr); // track the sum of the residual errors resErr += Math.abs(prevPR - npr) / npr; } // calculate the average residual error and return it newPR = tempmap; resErr = resErr / vList.size(); return resErr; }
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { Iterator<Text> itr = values.iterator(); Text input = new Text(); String[] inputTokens = null; // initialize/reset all variables Double pageRankOld = 0.0; Double residualError = 0.0; String output = ""; Integer maxNode = 0; ArrayList<String> temp = new ArrayList<String>(); Double tempBC = 0.0; vList.clear(); newPR.clear(); BE.clear(); BC.clear(); nodeDataMap.clear(); while (itr.hasNext()) { input = itr.next(); inputTokens = input.toString().split(" "); // if first element is PR, it is the node ID, previous pagerank and outgoing edgelist for this // node if (inputTokens[0].equals("PR")) { String nodeID = inputTokens[1]; pageRankOld = Double.parseDouble(inputTokens[2]); newPR.put(nodeID, pageRankOld); NodeData node = new NodeData(); node.setNodeID(nodeID); node.setPageRank(pageRankOld); if (inputTokens.length == 4) { node.setEdgeList(inputTokens[3]); node.setDegrees(inputTokens[3].split(",").length); } vList.add(nodeID); nodeDataMap.put(nodeID, node); // keep track of the max nodeID for this block if (Integer.parseInt(nodeID) > maxNode) { maxNode = Integer.parseInt(nodeID); } // if BE, it is an in-block edge } else if (inputTokens[0].equals("BE")) { if (BE.containsKey(inputTokens[2])) { // Initialize BC for this v temp = BE.get(inputTokens[2]); } else { temp = new ArrayList<String>(); } temp.add(inputTokens[1]); BE.put(inputTokens[2], temp); // if BC, it is an incoming node from outside of the block } else if (inputTokens[0].equals("BC")) { if (BC.containsKey(inputTokens[2])) { // Initialize BC for this v tempBC = BC.get(inputTokens[2]); } else { tempBC = 0.0; } tempBC += Double.parseDouble(inputTokens[3]); BC.put(inputTokens[2], tempBC); } } int i = 0; do { i++; residualError = IterateBlockOnce(); // System.out.println("Block " + key + " pass " + i + " resError:" + residualError); } while (residualError > threshold); // i < maxIterations && // compute 
the ultimate residual error for each node in this block residualError = 0.0; for (String v : vList) { NodeData node = nodeDataMap.get(v); residualError += Math.abs(node.getPageRank() - newPR.get(v)) / newPR.get(v); } residualError = residualError / vList.size(); // System.out.println("Block " + key + " overall resError for iteration: " + residualError); // add the residual error to the counter that is tracking the overall sum (must be expressed as // a long value) long residualAsLong = (long) Math.floor(residualError * PageRankBlock.precision); long numberOfIterations = (long) (i); context.getCounter(PageRankBlock.ProjectCounters.RESIDUAL_ERROR).increment(residualAsLong); context .getCounter(PageRankBlock.ProjectCounters.AVERAGE_ITERATIONS) .increment(numberOfIterations); // output should be // key:nodeID (for this node) // value:<pageRankNew> <degrees> <comma-separated outgoing edgeList> for (String v : vList) { NodeData node = nodeDataMap.get(v); output = newPR.get(v) + " " + node.getDegrees() + " " + node.getEdgeList(); Text outputText = new Text(output); Text outputKey = new Text(v); context.write(outputKey, outputText); if (v.equals(maxNode.toString())) { System.out.println("Block:" + key + " | node:" + v + " | pageRank:" + newPR.get(v)); } } cleanup(context); }