/**
 * Builds the in-memory bucket cache by reading every bucket MapFile found under the configured
 * cache paths.
 *
 * <p>For each path returned by {@code PathUtils.getCachePaths(conf)}, the MapFile at
 * {@code cachePath + BUCKET_CACHE_FOLDER} is read from start to end and each entry is copied
 * into the cache. When the same key appears under more than one cache path, the entry read last
 * replaces the earlier one. Every loaded bucket is then printed to standard output.
 *
 * @param conf configuration used to resolve the cache paths and to open the MapFile readers
 * @throws IOException if a MapFile cannot be opened or an entry cannot be read
 */
public BucketCache(Configuration conf) throws IOException {
  bucketCache = new HashMap<IntWritable, Bucket>();

  for (String cachePath : PathUtils.getCachePaths(conf)) {
    String bucketCachePath = cachePath + BUCKET_CACHE_FOLDER;
    MapFile.Reader reader = new MapFile.Reader(new Path(bucketCachePath), conf);
    try {
      // The same key/value instances are handed to next() on every iteration, so a fresh copy
      // of each is stored rather than the shared instance.
      IntWritable key = new IntWritable();
      Bucket value = new Bucket();
      while (reader.next(key, value)) {
        bucketCache.put(new IntWritable(key.get()), new Bucket(value));
      }
    } finally {
      // Always release the reader, including when reading fails part-way through.
      IOUtils.closeStream(reader);
    }
  }

  for (IntWritable i : bucketCache.keySet()) {
    System.out.println("Loaded bucket from cache:" + i.get() + ":" + bucketCache.get(i));
  }
}
/**
 * Writes the in-memory bucket cache to a MapFile and, on request, registers the resulting files
 * with the distributed cache.
 *
 * <p>The MapFile is created at {@code PathUtils.getCachePath(conf) + BUCKET_CACHE_FOLDER}.
 * Entries are appended in the natural ordering of their keys.
 *
 * @param conf configuration used to resolve the cache path, the file system and the writer
 * @param writeToDistributedCache when {@code true}, every non-directory entry found under the
 *     MapFile path is added to the distributed cache of {@code conf}
 * @throws IOException if the file system cannot be obtained, or writing or listing fails
 */
@SuppressWarnings("unchecked")
public void writeToDisk(Configuration conf, boolean writeToDistributedCache) throws IOException {
  final String targetDir = PathUtils.getCachePath(conf) + BUCKET_CACHE_FOLDER;
  final FileSystem fileSystem = FileSystem.get(conf);

  MapFile.Writer mapWriter = null;
  try {
    mapWriter =
        new MapFile.Writer(
            conf,
            new Path(targetDir),
            MapFile.Writer.keyClass(IntWritable.class),
            MapFile.Writer.valueClass(Bucket.class));

    // Sort a snapshot of the keys so the entries are appended in key order.
    ArrayList<IntWritable> orderedKeys = new ArrayList<IntWritable>(bucketCache.keySet());
    Collections.sort(orderedKeys);
    for (IntWritable bucketKey : orderedKeys) {
      mapWriter.append(bucketKey, bucketCache.get(bucketKey));
    }
  } finally {
    if (mapWriter != null) {
      IOUtils.closeStream(mapWriter);
    }
  }

  if (!writeToDistributedCache) {
    return;
  }

  // Only plain files are registered; sub-directories are skipped.
  for (FileStatus entry : fileSystem.listStatus(new Path(targetDir))) {
    if (entry.isDirectory()) {
      continue;
    }
    DistributedCache.addCacheFile(entry.getPath().toUri(), conf);
  }
}
private boolean runSecondPhaseEnrichmentJob( AminoEnrichmentJob aej, Configuration conf, int jobType) throws Exception { System.out.println("Running Amino Job"); final Job job = new Job(conf, aej.getJobName() + " phase 2"); job.setJarByClass(aej.getClass()); AminoDriverUtils.setAminoJob(job.getConfiguration(), aej.getClass()); if (jobType == JOB_TYPE_ENRICHMENT) { job.getConfiguration().set(AminoDriverUtils.ENRICHMENT_OUTPUT, this.enrichmentOutput); } else if (jobType == JOB_TYPE_REUSE_ENRICHMENT) { String root = conf.get(AminoDriverUtils.ENRICHMENT_ROOT_OUTPUT); String front = ""; if (!root.endsWith("/")) { front = "/"; } root += front; final Iterable<String> inputs = ((AminoReuseEnrichmentJob) aej) .getSecondPhaseEnrichmentInputDirectories(job.getConfiguration()); String inputStr = ""; System.out.println("Using enrichment input paths:"); for (String input : inputs) { if (inputStr.length() > 0) { inputStr += "," + PathUtils.getJobDataPath(root + input); } else { inputStr += PathUtils.getJobDataPath(root + input); } System.out.println(PathUtils.getJobDataPath(root + input)); } job.getConfiguration().set(AminoDriverUtils.ENRICHMENT_OUTPUT, inputStr); // Need to do this because the first phase data loader is sitting in this slot currently AminoInputFormat.setDataLoader( job.getConfiguration(), aej.getDataLoaderClass().newInstance()); } int numReducers = job.getConfiguration() .getInt( AMINO_NUM_REDUCERS_ENRICH_PHASE2, job.getConfiguration().getInt(AMINO_NUM_REDUCERS, DEFAULT_NUM_REDUCERS)); job.setNumReduceTasks(numReducers); job.setMapperClass(FrameworkMapper.class); job.setReducerClass(FrameworkReducer.class); job.setMapOutputKeyClass(BucketStripped.class); job.setMapOutputValueClass(MapWritable.class); job.setOutputKeyClass(BucketStripped.class); job.setOutputValueClass(AminoWritable.class); job.setInputFormatClass(AminoMultiInputFormat.class); AminoMultiInputFormat.setDataLoader( job.getConfiguration(), aej.getDataLoaderClass().newInstance()); // Call job 
configuration for special properties jobConfiguration(job); @SuppressWarnings("serial") ArrayList<Class<? extends DataLoader>> joinSource = new ArrayList<Class<? extends DataLoader>>() { { add(EnrichmentDataLoader.class); } }; AminoMultiInputFormat.setJoinDataLoaders(job.getConfiguration(), joinSource); job.setOutputFormatClass(AminoOutputFormat.class); AminoOutputFormat.setAminoConfigPath( job, job.getConfiguration().get(AminoConfiguration.DEFAULT_CONFIGURATION_PATH_KEY)); String output = job.getConfiguration().get("amino.output"); System.out.println("Output will be written to: " + PathUtils.getJobDataPath(output)); AminoOutputFormat.setOutputPath(job, new Path(PathUtils.getJobDataPath(output))); JobUtilities.deleteDirectory(job.getConfiguration(), output); CacheBuilder.buildCaches( AminoDataUtils.getDataLoader(job.getConfiguration()), aej, output, job.getConfiguration()); return job.waitForCompletion(true); }
private int setJobParameters(Job job, AminoJob aj) throws Exception { final Configuration conf = job.getConfiguration(); final Class<? extends DataLoader> dataLoaderClass = aj.getDataLoaderClass(); AminoInputFormat.setDataLoader(job.getConfiguration(), dataLoaderClass.newInstance()); if (aj instanceof AminoEnrichmentJob) { String output = ""; int returnType = JOB_TYPE_ENRICHMENT; if (aj instanceof AminoReuseEnrichmentJob) { System.out.println("Running REUSE Enrichment Join Job"); AminoReuseEnrichmentJob reuseJob = (AminoReuseEnrichmentJob) aj; AminoInputFormat.setDataLoader( job.getConfiguration(), reuseJob.getFirstPhaseDataLoaderClass().newInstance()); String root = conf.get(AminoDriverUtils.ENRICHMENT_ROOT_OUTPUT); String front = ""; if (!root.endsWith("/")) front = "/"; root += front; String dir = reuseJob.getOutputSubDirectory(conf); output += root + dir; returnType = JOB_TYPE_REUSE_ENRICHMENT; } else { System.out.println("Running Enrichment Join Job"); } int numReducers = conf.getInt( AMINO_NUM_REDUCERS_ENRICH_PHASE1, conf.getInt(AMINO_NUM_REDUCERS, DEFAULT_NUM_REDUCERS)); job.setNumReduceTasks(numReducers); // Our Framework mapper and reducer job.setMapperClass(FrameworkEnrichmentJoinMapper.class); job.setCombinerClass(FrameworkEnrichmentJoinCombiner.class); job.setReducerClass(FrameworkEnrichmentJoinReducer.class); job.setMapOutputKeyClass(EnrichmentJoinKey.class); // Different job.setMapOutputValueClass(MapWritable.class); job.setOutputKeyClass(BucketStripped.class); job.setOutputValueClass(MapWritable.class); // Different job.setPartitionerClass(NaturalKeyPartitioner.class); job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class); job.setSortComparatorClass(CompositeKeyComparator.class); job.setInputFormatClass(AminoMultiInputFormat.class); AminoEnrichmentJob aej = (AminoEnrichmentJob) aj; // AminoMultiInputFormat.setJoinDataLoader(conf, aej.getEnrichmentDataLoader().newInstance()); AminoMultiInputFormat.setJoinDataLoaders(conf, 
aej.getEnrichmentDataLoaders()); AminoMultiInputFormat.setEnrichWorker(conf, aej.getEnrichWorker().newInstance()); job.setOutputFormatClass(SequenceFileOutputFormat.class); // TODO If it already exists, and its age is less than job running frequency, just reuse it // instead of doing the above job... if (output.length() == 0) { output = getEnrichmentOutputPath(aej, conf); } System.out.println("Output will be written to: " + PathUtils.getJobDataPath(output)); SequenceFileOutputFormat.setOutputPath(job, new Path(PathUtils.getJobDataPath(output))); JobUtilities.deleteDirectory(conf, output); CacheBuilder.buildCaches(AminoDataUtils.getDataLoader(conf), aj, output, conf); return returnType; } else { System.out.println("\n==================== Running Amino Job =================\n"); // Our Framework mapper and reducer job.setMapperClass(FrameworkMapper.class); job.setReducerClass(FrameworkReducer.class); job.setMapOutputKeyClass(BucketStripped.class); job.setMapOutputValueClass(MapWritable.class); job.setOutputKeyClass(BucketStripped.class); job.setOutputValueClass(AminoWritable.class); job.setInputFormatClass(AminoInputFormat.class); job.setOutputFormatClass(AminoOutputFormat.class); job.setNumReduceTasks(conf.getInt(AMINO_NUM_REDUCERS, DEFAULT_NUM_REDUCERS)); AminoOutputFormat.setAminoConfigPath( job, job.getConfiguration().get(AminoConfiguration.DEFAULT_CONFIGURATION_PATH_KEY)); String output = conf.get("amino.output"); System.out.println("Output will be written to: " + PathUtils.getJobDataPath(output)); AminoOutputFormat.setOutputPath(job, new Path(PathUtils.getJobDataPath(output))); JobUtilities.deleteDirectory(conf, output); CacheBuilder.buildCaches(AminoDataUtils.getDataLoader(conf), aj, output, conf); return JOB_TYPE_NORMAL; } }