/**
 * Constructs CounterGroups from job runtime statistics.
 *
 * @param counterNameToValue map of "groupName::counterName" keys to counter values
 * @return a map of group names to their CounterGroup representations
 */
public static Map<String, CounterGroup> counterGroupInfoMap(
    Map<String, Double> counterNameToValue) {
  Counters counters = new Counters();
  for (Map.Entry<String, ? extends Number> entry : counterNameToValue.entrySet()) {
    String[] cNames = entry.getKey().split("::");
    String groupName = cNames[0];
    String counterName = cNames[1];
    Counter counter = counters.findCounter(groupName, counterName);
    counter.setValue(entry.getValue().longValue());
  }
  return CounterGroup.counterGroupInfoMap(counters);
}
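/**
 * Usage sketch (not from the original source; counter names and values are illustrative): keys
 * passed to counterGroupInfoMap must follow the "groupName::counterName" convention that the
 * split("::") above relies on.
 */
public static Map<String, CounterGroup> counterGroupInfoMapExample() {
  Map<String, Double> runtimeStats = new HashMap<String, Double>();
  runtimeStats.put("org.apache.hadoop.mapred.Task$Counter::MAP_INPUT_RECORDS", 42.0);
  runtimeStats.put("FileSystemCounters::HDFS_BYTES_READ", 1048576.0);
  return counterGroupInfoMap(runtimeStats);
}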
public static Object countersToJson(Counters counters) {
  Map<String, Object> jsonObj = new HashMap<String, Object>();
  Collection<String> counterGroups = counters.getGroupNames();
  for (String groupName : counterGroups) {
    Map<String, String> counterStats = new HashMap<String, String>();
    Group group = counters.getGroup(groupName);
    Iterator<Counters.Counter> it = group.iterator();
    while (it.hasNext()) {
      Counter counter = it.next();
      counterStats.put(counter.getDisplayName(), String.valueOf(counter.getCounter()));
    }
    jsonObj.put(groupName, counterStats);
  }
  return jsonObj;
}
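/**
 * Usage sketch (not from the original source): converts the counters of a finished job into the
 * nested group-to-{displayName: value} map produced by countersToJson above, e.g. for JSON
 * serialization. The RunningJob handle comes from JobClient.runJob, as elsewhere in this file.
 */
public static Object jobCountersToJsonExample(RunningJob runningJob) throws IOException {
  Counters counters = runningJob.getCounters();
  return countersToJson(counters);
}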
/**
 * Helper method for counter group map retrieval.
 *
 * @param stats the Hadoop step statistics to read counters from
 * @return a map of counter name to counter value
 */
private Map<String, Long> counterGroupInfoMapHelper(HadoopStepStats stats) {
  Counters counters = new Counters();
  Map<String, Long> counterNameToValue = new HashMap<String, Long>();
  for (String groupName : stats.getCounterGroups()) { // retrieving groups
    for (String counterName : stats.getCountersFor(groupName)) {
      // retrieving counters in that group
      Long counterValue = stats.getCounterValue(groupName, counterName);
      counterNameToValue.put(groupName + "::" + counterName, counterValue);
      // creating counter
      Counter counter = counters.findCounter(groupName, counterName);
      counter.setValue(counterValue);
    }
  }
  setCounterGroupMap(CounterGroup.counterGroupInfoMap(counters));
  return counterNameToValue;
}
/**
 * Map function: reads the two input partitions referenced by the split, applies the binary
 * operation, and writes or stores the resulting chunks.
 *
 * @param key the key for the given value being passed in
 * @param value the input value carrying the TransBinaryInputSplit to process
 * @param context the Context object for the currently executing job
 */
public void map(Object key, TransBinaryMapInputValue value, Context context)
    throws IOException, InterruptedException {
  TransBinaryInputSplit split = value.getSplit();
  ZoneClient zclient = null;
  OptimusZone zone1 = null;
  OptimusZone zone2 = null;
  OptimusZone zone3 = null;
  cl = context.getCounter("TRANS_READ", "MAPPER_LOCAL_READ");
  cr = context.getCounter("TRANS_READ", "MAPPER_REMOTE_READ");
  co = context.getCounter("TRANS_WRITE", "MAPPER_WRITE");
  OptimusCatalogProtocol ci = null;
  boolean earlybird = split.isEarlybird();
  PartitionReader reader = null;
  try {
    OptimusConfiguration conf = new OptimusConfiguration(split.getConfDir());
    zclient = new ZoneClient(conf);
    ci = zclient.getCi();
    zone1 = ci.openZone(new ZoneID(split.getZid1()));
    zone2 = ci.openZone(new ZoneID(split.getZid2()));
    zone3 = ci.openZone(new ZoneID(split.getZid3()));
    reader = new PartitionReader(conf);
  } catch (WrongArgumentException e) {
    throw new IOException("Create Client Failure", e);
  } catch (JDOMException e) {
    throw new IOException("Create Client Failure", e);
  }
  String[] host = split.getLocations();
  Host h = UTILS.RandomHost(host, split.getHosts().getHosts());
  Object[] data2 = null;
  Object[] data1 = null;
  DataChunk chunk = new DataChunk(zone3.getSize().getShape(), zone3.getPstep().getShape());
  try {
    if (split.getZid1() == split.getZid2()
        && Arrays.equals(split.getStart1(), split.getStart2())) {
      // both operands live in the same zone and start at the same offset: one co-located read
      data1 = this.colocatedRead(split, h, zone1, zone2, zone3);
      cl.increment(data1.length);
    } else {
      OptimusDataProtocol dp = h.getDataProtocol();
      int[] start1 = split.getStart1();
      int[] cstart1 = split.getCstart();
      int[] rstart1 = new int[start1.length];
      for (int i = 0; i < start1.length; i++) {
        rstart1[i] = start1[i] - cstart1[i];
      }
      data1 = (Double[]) dp.readData(
              new ArrayID(split.getAid1()),
              new PID(split.getPnum1()),
              new OptimusShape(split.getPs1()),
              new OptimusShape(rstart1),
              new OptimusShape(split.getRoff()))
          .getData();
      data2 = (Double[]) reader.readData(zone2, split.getAid2(), split.getStart2(), split.getOff());
      // data1 = reader.readData(zone1, split.getAid1(), split.getStart1(), split.getOff());
      cl.increment(data1.length);
      cr.increment(data2.length);
      data1 = split.getCal().calcArray(data1, data2);
    }
  } catch (WrongArgumentException e) {
    throw new IOException("Reading from the second array failed", e);
  } catch (Exception e) {
    throw new IOException("Reading input arrays failed", e);
  }
  int[] off = split.getRoff();
  int[] rstart = split.getRstart();
  TRANSDataIterator itr1 = new TRANSDataIterator(
      new TransDataType(data1[0].getClass()), data1, rstart, split.getRoff());
  ZoneID zid = zone3.getId();
  ArrayID aid = new ArrayID(split.getAid3());
  RID rid = new RID(zone3.getStrategy().getShapes().size() - 2);
  Set<DataChunk> chunks = chunk.getAdjacentChunks(split.getRstart(), split.getOff());
  for (DataChunk c : chunks) {
    int[] cstart = c.getStart();
    int[] coff = c.getChunkSize();
    int[] nstart = new int[cstart.length];
    int[] noff = new int[cstart.length];
    int len = 1;
    for (int i = 0; i < cstart.length; i++) {
      // intersect the chunk extent [cstart, cstart + coff) with the result extent
      // [rstart, rstart + off) in each dimension
      nstart[i] = Math.max(cstart[i], rstart[i]);
      noff[i] = Math.min(cstart[i] + coff[i], rstart[i] + off[i]) - nstart[i];
      len *= noff[i];
    }
    Object[] tmp = new Object[len];
    TRANSDataIterator itr =
        new TRANSDataIterator(new TransDataType(data1[0].getClass()), tmp, nstart, noff);
    if (!itr.init(itr1.getStart(), itr1.getShape())) {
      continue;
    }
    itr1.init(itr.getStart(), itr.getShape());
    while (itr.next() && itr1.next()) {
      itr.set(itr1.get());
    }
    if (earlybird) {
      Partition p = new Partition(zid, aid, new PID(c.getChunkNum()), rid);
      Host ho = null;
      try {
        ci.CreatePartition(p);
        ho = ci.getReplicateHost(p, rid);
      } catch (WrongArgumentException e) {
        throw new IOException("Get Replicate host failure", e);
      }
      OptimusDataProtocol dp = ho.getDataProtocol();
      dp.putPartitionData(p, itr);
      co.increment(itr.getSize());
    } else {
      context.write(new IntWritable(c.getChunkNum()), itr);
    }
  }
}
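/**
 * Illustration only (not part of the original mapper): the per-dimension arithmetic used in the
 * chunk loop above. The overlap of the chunk extent [cstart, cstart + coff) with the result
 * extent [rstart, rstart + off) starts at max(cstart, rstart) and spans
 * min(cstart + coff, rstart + off) - max(cstart, rstart) elements; the product over all
 * dimensions gives the length of the temporary buffer. Assumes the chunk actually overlaps the
 * result region, as getAdjacentChunks above is expected to guarantee.
 */
static int overlapLength(int[] cstart, int[] coff, int[] rstart, int[] off) {
  int len = 1;
  for (int i = 0; i < cstart.length; i++) {
    int start = Math.max(cstart[i], rstart[i]);
    int span = Math.min(cstart[i] + coff[i], rstart[i] + off[i]) - start;
    len *= span;
  }
  return len;
}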
@Override
public int run(String[] args) throws IOException {
  OptionParser p = new OptionParser();
  OptionSpec<String> maxwiOpt =
      p.accepts(maxwiOptName, "location of maxWi map file (HDFS) REQUIRED")
          .withRequiredArg()
          .ofType(String.class);
  OptionSpec<Float> thresholdOpt =
      p.accepts(thresholdOptName, "similarity threshold")
          .withRequiredArg()
          .ofType(Float.class)
          .defaultsTo(DEFAULT_THRESHOLD);
  OptionSpec<Integer> stripesOpt =
      p.accepts(stripesOptName, "number of stripes to divide the similarity matrix")
          .withRequiredArg()
          .ofType(Integer.class)
          .defaultsTo(1);
  OptionSpec<Integer> spreadOpt =
      p.accepts(spreadOptName, "number of reducers per stripe")
          .withRequiredArg()
          .ofType(Integer.class)
          .defaultsTo(DEFAULT_SPREAD);
  OptionSpec<Integer> factorOpt =
      p.accepts(factorOptName, "number of mappers per reducer")
          .withRequiredArg()
          .ofType(Integer.class)
          .defaultsTo(DEFAULT_FACTOR);
  OptionSpec<Integer> maxVectorIDOpt =
      p.accepts(maxVectorIDOptName, "maximum vector ID").withRequiredArg().ofType(Integer.class);
  p.acceptsAll(Arrays.asList("h", "?"), "show help");
  OptionSet options = parseOptions(p, args);

  // to distinguish indexes built in successive runs
  DateFormat df = new SimpleDateFormat("yyyyMMdd-HHmmss");
  Date date = new Date();

  float threshold = options.valueOf(thresholdOpt); // threshold
  if (threshold < 0 || threshold >= 1) {
    System.err.println(thresholdOptName + " should be between 0 and 1");
    System.exit(1);
  }
  int numStripes = options.valueOf(stripesOpt); // number of stripes
  if (numStripes < 1) {
    System.err.println(stripesOptName + " should be > 0");
    System.exit(1);
  }
  // MapReduce parameters
  int spread = options.valueOf(spreadOpt); // how many reducers per stripe
  if (spread < 1) {
    System.err.println(spreadOptName + " should be > 0");
    System.exit(1);
  }
  int factor = options.valueOf(factorOpt); // how many mappers per reducer
  if (factor < 1) {
    System.err.println(factorOptName + " should be > 0");
    System.exit(1);
  }
  int maxKey = 0;
  if (options.has(maxVectorIDOpt)) {
    maxKey = options.valueOf(maxVectorIDOpt); // maximum value of the vector ID
    if (maxKey < 1) {
      System.err.println(maxVectorIDOptName + " should be > 0");
      System.exit(1);
    }
  }
  int numReducers = GenericKey.StripePartitioner.numReducers(numStripes, spread);
  int numMappers = numReducers * factor;
  int numBuckets = numMappers;
  // pick the file with max weights from command line
  String maxWiDir = options.valueOf(maxwiOpt);
  List<String> nonOptArgs = options.nonOptionArguments();

  LOG.info("Threshold set to " + threshold);
  LOG.info(
      String.format(
          "Buckets: %1$-10s Factor: %2$-10s Stripes: %3$-10s Spread: %4$-10s Reducers: %5$-10s",
          numBuckets, factor, numStripes, spread, numReducers));

  // start building the jobs
  JobConf conf1 = new JobConf(getConf(), Similarity.class);
  conf1.setFloat(PARAM_APS_THRESHOLD, threshold);
  conf1.setInt(PARAM_APS_STRIPES, numStripes);
  DistributedCache.addCacheFile(URI.create(maxWiDir), conf1);

  Path inputPath = new Path(nonOptArgs.get(0));
  Path indexPath =
      new Path(
          nonOptArgs.get(0) + "-index-" + threshold + "-s" + numStripes + "_" + df.format(date));
  // index filtering pruned nested directory
  Path indexOnlyPath = new Path(indexPath, "part*");
  Path outputPath = new Path(nonOptArgs.get(1) + "-" + threshold + "-s" + numStripes);

  FileInputFormat.setInputPaths(conf1, inputPath);
  FileOutputFormat.setOutputPath(conf1, indexPath);
  conf1.setInputFormat(SequenceFileInputFormat.class);
  conf1.setOutputFormat(SequenceFileOutputFormat.class);
  conf1.setMapOutputKeyClass(LongWritable.class);
  conf1.setMapOutputValueClass(IndexItem.class);
  conf1.setOutputKeyClass(LongWritable.class);
  conf1.setOutputValueClass(IndexItemArrayWritable.class);
  conf1.setMapperClass(IndexerMapper.class);
  conf1.setReducerClass(IndexerReducer.class);
  // assuming input is sorted according to the key (vectorID) so that the
  // part files are locally sorted
  MultipleOutputs.addNamedOutput(
      conf1,
      PRUNED,
      SequenceFileOutputFormat.class,
      IntWritable.class,
      VectorComponentArrayWritable.class);

  // remove the stuff we added from the job name
  conf1.set(
      "mapred.job.name",
      "APS-" + indexPath.getName().substring(0, indexPath.getName().length() - 16));
  conf1.setNumTasksToExecutePerJvm(-1); // JVM reuse
  conf1.setSpeculativeExecution(false);
  conf1.setCompressMapOutput(true);
  // hash the posting lists in different buckets to distribute the load
  conf1.setNumReduceTasks(numBuckets);

  RunningJob job1 = JobClient.runJob(conf1);

  // part 2
  JobConf conf2 = new JobConf(getConf(), Similarity.class);
  if (numStripes > 0) FileUtils.mergeRestFile(conf2, indexPath, PRUNED, INDEX_INTERVAL);

  MultipleInputs.addInputPath(
      conf2, indexOnlyPath, SequenceFileInputFormat.class, SimilarityMapperIndex.class);
  MultipleInputs.addInputPath(
      conf2, inputPath, SequenceFileInputFormat.class, SimilarityMapperInput.class);
  FileOutputFormat.setOutputPath(conf2, outputPath);

  conf2.setCombinerClass(SimilarityCombiner.class);
  conf2.setReducerClass(SimilarityReducer.class);
  conf2.setPartitionerClass(GenericKey.StripePartitioner.class);
  conf2.setOutputKeyComparatorClass(GenericKey.Comparator.class);
  conf2.setOutputValueGroupingComparator(GenericKey.PrimaryComparator.class);
  conf2.setMapOutputKeyClass(GenericKey.class);
  conf2.setMapOutputValueClass(GenericValue.class);
  conf2.setOutputKeyClass(VectorPair.class);
  conf2.setOutputValueClass(NullWritable.class);

  Counter numDocs =
      job1.getCounters()
          .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS");
  maxKey = maxKey > 0 ? maxKey : (int) numDocs.getValue();
  LOG.info("Setting max key value in input to " + maxKey);
  conf2.setInt(PARAM_APS_MAXKEY, maxKey);
  conf2.setInt(PARAM_APS_STRIPES, numStripes);
  conf2.setFloat(PARAM_APS_THRESHOLD, threshold);
  conf2.setInt(PARAM_APS_REDUCER_PER_STRIPE, spread);
  conf2.set("mapred.job.name", "APS-" + outputPath.getName());
  conf2.setNumTasksToExecutePerJvm(-1); // JVM reuse
  conf2.setSpeculativeExecution(false);
  conf2.setCompressMapOutput(true);
  conf2.setNumReduceTasks(numReducers);

  JobClient.runJob(conf2);
  return 0;
}
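/**
 * Minimal driver sketch (assumption: this class extends Configured and implements Tool, as the
 * run(String[]) override and the getConf() calls above suggest); it hands the command line to
 * the two-job pipeline defined in run().
 */
public static void main(String[] args) throws Exception {
  int exitCode = ToolRunner.run(new Configuration(), new Similarity(), args);
  System.exit(exitCode);
}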
/**
 * Performs a range query using MapReduce.
 *
 * @param fs the file system holding the input file
 * @param inputFile the path of the file to query
 * @param userOutputPath the output path, or null to use an automatically deleted temporary path
 * @param queryShape the query range
 * @param shape an instance of the shape type stored in the input file
 * @param overwrite whether to overwrite an existing output path
 * @param background whether to submit the job without waiting for completion
 * @param query the query specification (field and type)
 * @return the number of matching records, or -1 if the job runs in the background
 * @throws IOException
 */
public static long rangeQueryMapReduce(
    FileSystem fs,
    Path inputFile,
    Path userOutputPath,
    Shape queryShape,
    Shape shape,
    boolean overwrite,
    boolean background,
    QueryInput query)
    throws IOException {
  JobConf job = new JobConf(FileMBR.class);

  FileSystem outFs = inputFile.getFileSystem(job);
  Path outputPath = userOutputPath;
  if (outputPath == null) {
    do {
      outputPath =
          new Path(inputFile.toUri().getPath() + ".rangequery_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));
  } else {
    if (outFs.exists(outputPath)) {
      if (overwrite) {
        outFs.delete(outputPath, true);
      } else {
        throw new RuntimeException("Output path already exists and -overwrite flag is not set");
      }
    }
  }

  job.setJobName("RangeQuery");
  job.setClass(SpatialSite.FilterClass, RangeFilter.class, BlockFilter.class);
  RangeFilter.setQueryRange(job, queryShape); // set query range for the filter

  ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
  job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
  job.setNumReduceTasks(3);

  // Decide which map function to use depending on how blocks are indexed,
  // and also which input format to use
  if (SpatialSite.isRTree(fs, inputFile)) {
    // RTree indexed file
    LOG.info("Searching an RTree indexed file");
    job.setInputFormat(RTreeInputFormat.class);
  } else {
    // A file with no local index
    LOG.info("Searching a non local-indexed file");
    job.setInputFormat(ShapeInputFormat.class);
  }

  GlobalIndex<Partition> gIndex = SpatialSite.getGlobalIndex(fs, inputFile);
  // if (gIndex != null && gIndex.isReplicated()){
  // job.setMapperClass(RangeQueryMap.class);

  // Choose the map output key class based on the type of the queried field
  Class<?> OutputKey = NullWritable.class;
  try {
    Class<?> c = shape.getClass();
    Field f = c.getDeclaredField(query.field);
    f.setAccessible(true);
    if (f.getType().equals(Integer.TYPE)) {
      OutputKey = IntWritable.class;
    } else if (f.getType().equals(Double.TYPE)) {
      OutputKey = DoubleWritable.class;
    } else if (f.getType().equals(Long.TYPE)) {
      OutputKey = LongWritable.class;
    }
  } catch (SecurityException e) {
    e.printStackTrace();
  } catch (NoSuchFieldException e) {
    e.printStackTrace();
  }
  job.setMapOutputKeyClass(OutputKey);

  switch (query.type) {
    case Distinct:
      job.setMapperClass(DistinctQueryMap.class);
      job.setReducerClass(DistinctQueryReduce.class);
      job.setMapOutputValueClass(NullWritable.class);
      break;
    case Distribution:
      job.setMapperClass(DistributionQueryMap.class);
      job.setReducerClass(DistributionQueryReduce.class);
      job.setMapOutputValueClass(IntWritable.class);
      break;
    default:
      break;
  }
  // }
  // else
  // job.setMapperClass(RangeQueryMapNoDupAvoidance.class);

  // Set query range for the map function
  job.set(QUERY_SHAPE_CLASS, queryShape.getClass().getName());
  job.set(QUERY_SHAPE, queryShape.toText(new Text()).toString());
  job.set(QUERY_FIELD, query.field);

  // Set shape class for the SpatialInputFormat
  SpatialSite.setShapeClass(job, shape.getClass());

  job.setOutputFormat(TextOutputFormat.class);

  ShapeInputFormat.setInputPaths(job, inputFile);
  TextOutputFormat.setOutputPath(job, outputPath);

  // Submit the job
  if (!background) {
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    // If outputPath was not set by the user, delete it automatically
    if (userOutputPath == null) outFs.delete(outputPath, true);

    return resultCount;
  } else {
    JobClient jc = new JobClient(job);
    lastRunningJob = jc.submitJob(job);
    return -1;
  }
}
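/**
 * Usage sketch (not from the original source; the path and query window are illustrative, the
 * Rectangle and Point shapes are assumed to be the usual SpatialHadoop implementations, and the
 * QueryInput argument is taken as given since its definition is not shown here): runs a blocking
 * range query over an indexed file and returns the number of matching records.
 */
public static long rangeQueryExample(QueryInput query) throws IOException {
  Configuration conf = new Configuration();
  Path inputFile = new Path("hdfs:///data/points"); // illustrative input path
  FileSystem fs = inputFile.getFileSystem(conf);
  Shape queryRange = new Rectangle(0, 0, 1000, 1000); // the query window (assumed constructor)
  Shape stored = new Point(); // shape type stored in the input file
  // query.field names the shape attribute to key the output on; query.type selects the
  // Distinct or Distribution branch in rangeQueryMapReduce above.
  return rangeQueryMapReduce(fs, inputFile, null, queryRange, stored, true, false, query);
}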