@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    // scatter each edge to its lower-degree vertex and build open triads
    Job scatter = prepareJob(getInputPath(), getTempPath(TMP_OPEN_TRIADS),
            SequenceFileInputFormat.class, ScatterEdgesToLowerDegreeVertexMapper.class,
            Vertex.class, Vertex.class, BuildOpenTriadsReducer.class,
            JoinableUndirectedEdge.class, VertexOrMarker.class, SequenceFileOutputFormat.class);
    scatter.waitForCompletion(true);

    // necessary as long as we don't have access to an undeprecated MultipleInputs
    Job prepareInput = prepareJob(getInputPath(), getTempPath(TMP_CLOSING_EDGES),
            SequenceFileInputFormat.class, PrepareInputMapper.class,
            JoinableUndirectedEdge.class, VertexOrMarker.class, Reducer.class,
            JoinableUndirectedEdge.class, VertexOrMarker.class, SequenceFileOutputFormat.class);
    prepareInput.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class);
    prepareInput.waitForCompletion(true);

    // join open triads and closing edges pairwise to find all triangles
    Job joinTriads = prepareJob(getCombinedTempPath(TMP_OPEN_TRIADS, TMP_CLOSING_EDGES),
            getOutputPath(), SequenceFileInputFormat.class, Mapper.class,
            JoinableUndirectedEdge.class, VertexOrMarker.class, JoinTrianglesReducer.class,
            Triangle.class, NullWritable.class, SequenceFileOutputFormat.class);
    joinTriads.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class);
    joinTriads.waitForCompletion(true);

    return 0;
}
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf(),
            "Import vessel locations from files in " + args[0]
                    + " into table cdb_vessel:vessel_location");
    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setJarByClass(ImportVTLocationFromFileWithReducer.class);
    job.setJobName("Vessel_location_injection");
    job.setInputFormatClass(VTVesselLocationFileInputFormat.class);
    job.setMapOutputKeyClass(Key_IMOAndRecordTime.class);
    job.setMapOutputValueClass(TextArrayWritable.class);
    // partition and group by IMO so each reducer sees one vessel's records together
    job.setPartitionerClass(Partitioner_IMO.class);
    job.setGroupingComparatorClass(GroupComparator_IMO.class);
    job.setReducerClass(ImportReducer.class);
    job.setNumReduceTasks(Integer.parseInt(args[1]));
    job.setOutputFormatClass(NullOutputFormat.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
@Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); String jobName = "Rating predictor MR"; job.setJobName(jobName); job.setJarByClass(UtilityPredictor.class); FileInputFormat.addInputPaths(job, args[0]); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(UtilityPredictor.PredictionMapper.class); job.setReducerClass(UtilityPredictor.PredictorReducer.class); job.setMapOutputKeyClass(TextInt.class); job.setMapOutputValueClass(Tuple.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setGroupingComparatorClass(ItemIdGroupComprator.class); job.setPartitionerClass(ItemIdPartitioner.class); Utility.setConfiguration(job.getConfiguration()); int numReducer = job.getConfiguration().getInt("utp.num.reducer", -1); numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer; job.setNumReduceTasks(numReducer); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
@Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); String jobName = "Implicit rating estimator MR"; job.setJobName(jobName); job.setJarByClass(ImplicitRatingEstimator.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(ImplicitRatingEstimator.RatingEstimatorMapper.class); job.setReducerClass(ImplicitRatingEstimator.RatingEstimatorReducer.class); job.setMapOutputKeyClass(Tuple.class); job.setMapOutputValueClass(Tuple.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class); job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class); Utility.setConfiguration(job.getConfiguration()); job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1)); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: topreviews <in> [<in>...] <out>");
        return 2;
    }
    Job job = Job.getInstance(conf, "Top Five Reviews");
    job.setJarByClass(TopFive.class);
    // secondary sort: partition and group on the natural key, sort on the composite key
    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);
    job.setMapperClass(TopFiveMapper.class);
    job.setReducerClass(TopFiveReducer.class);
    job.setMapOutputKeyClass(TextPair.class);
    job.setMapOutputValueClass(TextPair.class);
    job.setOutputKeyClass(TextPair.class);
    job.setOutputValueClass(TextPair.class);
    // every argument but the last is an input path; the last is the output path
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    // return the status instead of calling System.exit() so ToolRunner can report it
    return job.waitForCompletion(true) ? 0 : 1;
}
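// NOTE: a hypothetical sketch, not part of the original sources. Drivers like the one above
// wire a partitioner, grouping comparator, and sort comparator for secondary sort without
// showing those classes. Assuming a TextPair composite key with getFirst()/getSecond()
// accessors returning Text, the three helpers typically look like this:

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;

// routes each record by its natural (first) key so every value for that key
// reaches the same reducer, regardless of the secondary field
public class NaturalKeyPartitioner extends Partitioner<TextPair, TextPair> {
    @Override
    public int getPartition(TextPair key, TextPair value, int numPartitions) {
        return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

// groups reducer input on the natural key only, so one reduce() call sees all values
// for that key while they stay ordered by the full composite key
public class NaturalKeyGroupingComparator extends WritableComparator {
    protected NaturalKeyGroupingComparator() {
        super(TextPair.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return ((TextPair) a).getFirst().compareTo(((TextPair) b).getFirst());
    }
}

// orders keys by the natural key first, then by the secondary field within each group
public class CompositeKeyComparator extends WritableComparator {
    protected CompositeKeyComparator() {
        super(TextPair.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        TextPair p1 = (TextPair) a;
        TextPair p2 = (TextPair) b;
        int cmp = p1.getFirst().compareTo(p2.getFirst());
        return cmp != 0 ? cmp : p1.getSecond().compareTo(p2.getSecond());
    }
}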
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    // if (otherArgs.length != 3) {
    //     System.err.println("Usage: <tradeTableDir> <payTableDir> <output>");
    //     System.exit(2);
    // }
    // String tradeTableDir = args[0];
    // String payTableDir = args[1];
    // String joinTableDir = args[2];
    Job job = new Job(conf, "Join");
    job.setJarByClass(JoinMain.class);
    job.setMapperClass(PreMapper.class);
    job.setMapOutputKeyClass(TextPair.class);
    // partition and group on the first field of the pair so matching records
    // from both inputs arrive at the same reduce call
    job.setPartitionerClass(KeyPartition.class);
    job.setGroupingComparatorClass(FirstComparator.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setReducerClass(CommonReduce.class);
    FileInputFormat.addInputPath(job, new Path("/user/hadoop/input/load3/action.txt"));
    FileInputFormat.addInputPath(job, new Path("/user/hadoop/input/load3/alipay.txt"));
    FileOutputFormat.setOutputPath(job, new Path("/user/hadoop/output3/"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
@Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); String jobName = "Top n matches MR"; job.setJobName(jobName); job.setJarByClass(TopMatches.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(TopMatches.TopMatchesMapper.class); job.setReducerClass(TopMatches.TopMatchesReducer.class); job.setCombinerClass(TopMatches.TopMatchesCombiner.class); job.setMapOutputKeyClass(Tuple.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class); job.setPartitionerClass(SecondarySort.TupleTextPartitioner.class); Utility.setConfiguration(job.getConfiguration()); int numReducer = job.getConfiguration().getInt("tom.num.reducer", -1); numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer; job.setNumReduceTasks(numReducer); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); conf.set("mapred.textoutputformat.separator", ","); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); // Logger log = Logger.getLogger("sds"); Job job = new Job(conf, "Max "); job.setMapOutputKeyClass(CompositeKey.class); job.setPartitionerClass(ActualKeyPartitioner.class); job.setGroupingComparatorClass(ActualKeyGroupingComparator.class); job.setSortComparatorClass(CompositeKeyComparator.class); job.setJarByClass(map_reduce.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.setNumReduceTasks(27); job.setMapOutputKeyClass(CompositeKey.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
@Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); String jobName = "Dirfferent type entity similarity MR"; job.setJobName(jobName); job.setJarByClass(DiffTypeSimilarity.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(DiffTypeSimilarity.SimilarityMapper.class); job.setReducerClass(DiffTypeSimilarity.SimilarityReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setGroupingComparatorClass(IdPairGroupComprator.class); job.setPartitionerClass(IdPairPartitioner.class); Utility.setConfiguration(job.getConfiguration()); int numReducer = job.getConfiguration().getInt("dts.num.reducer", -1); numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer; job.setNumReduceTasks(numReducer); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
public static Job startJob(String[] args) throws IOException {
    // args[0] = HBase table name
    // args[1] = ZooKeeper quorum
    Configuration hConf = HBaseConfiguration.create(new Configuration());
    hConf.set("hbase.zookeeper.quorum", args[1]);
    hConf.set("scan.table", args[0]);
    hConf.set("hbase.zookeeper.property.clientPort", "2181");

    Scan scan = new Scan();
    // scan.setFilter(rowColBloomFilter());
    // change caching to speed up the scan
    scan.setCaching(500);
    scan.setMaxVersions(200);
    scan.setCacheBlocks(false);

    Job job = new Job(hConf);
    job.setJobName("BSBM-Q11-RepartitionJoin");
    job.setJarByClass(RepartitionJoinQ11.class);

    // Mapper settings
    TableMapReduceUtil.initTableMapperJob(
            args[0],                     // input HBase table name
            scan,                        // Scan instance to control CF and attribute selection
            RepartitionMapper.class,     // mapper
            CompositeKeyWritable.class,  // mapper output key
            KeyValueArrayWritable.class, // mapper output value
            job);

    // Repartition settings
    job.setPartitionerClass(CompositePartitioner.class);
    job.setSortComparatorClass(CompositeSortComparator.class);
    job.setGroupingComparatorClass(CompositeGroupingComparator.class);

    // Reducer settings
    job.setReducerClass(SharedServices.RepartitionJoin_Reducer.class);
    job.setNumReduceTasks(1); // at least one, adjust as required
    FileOutputFormat.setOutputPath(job, new Path("output/BSBMQ11"));

    // wait for completion and return the job rather than exiting the JVM,
    // so the caller can inspect the result
    try {
        job.waitForCompletion(true);
    } catch (ClassNotFoundException | InterruptedException e) {
        e.printStackTrace();
    }
    return job;
}
public Job call() throws IOException, InterruptedException, ClassNotFoundException {
    job.setMapperClass(GridmixMapper.class);
    job.setReducerClass(GridmixReducer.class);
    job.setNumReduceTasks(jobdesc.getNumberReduces());
    job.setMapOutputKeyClass(GridmixKey.class);
    job.setMapOutputValueClass(GridmixRecord.class);
    job.setSortComparatorClass(GridmixKey.Comparator.class);
    job.setGroupingComparatorClass(SpecGroupingComparator.class);
    job.setInputFormatClass(GridmixInputFormat.class);
    job.setOutputFormatClass(RawBytesOutputFormat.class);
    job.setPartitionerClass(DraftPartitioner.class);
    job.setJarByClass(GridmixJob.class);
    job.getConfiguration().setInt("gridmix.job.seq", seq);
    job.getConfiguration().set(ORIGNAME,
            null == jobdesc.getJobID() ? "<unknown>" : jobdesc.getJobID().toString());
    job.getConfiguration().setBoolean("mapred.used.genericoptionsparser", true);
    FileInputFormat.addInputPath(job, new Path("ignored"));
    FileOutputFormat.setOutputPath(job, outdir);
    job.submit();
    return job;
}
@SuppressWarnings("rawtypes") public void afterPropertiesSet() throws Exception { final Configuration cfg = ConfigurationUtils.createFrom(configuration, properties); buildGenericOptions(cfg); if (StringUtils.hasText(user)) { UserGroupInformation ugi = UserGroupInformation.createProxyUser(user, UserGroupInformation.getLoginUser()); ugi.doAs( new PrivilegedExceptionAction<Void>() { @Override public Void run() throws Exception { job = new Job(cfg); return null; } }); } else { job = new Job(cfg); } ClassLoader loader = (beanClassLoader != null ? beanClassLoader : org.springframework.util.ClassUtils.getDefaultClassLoader()); if (jar != null) { JobConf conf = (JobConf) job.getConfiguration(); conf.setJar(jar.getURI().toString()); loader = ExecutionUtils.createParentLastClassLoader(jar, beanClassLoader, cfg); conf.setClassLoader(loader); } // set first to enable auto-detection of K/V to skip the key/value types to be specified if (mapper != null) { Class<? extends Mapper> mapperClass = resolveClass(mapper, loader, Mapper.class); job.setMapperClass(mapperClass); configureMapperTypesIfPossible(job, mapperClass); } if (reducer != null) { Class<? extends Reducer> reducerClass = resolveClass(reducer, loader, Reducer.class); job.setReducerClass(reducerClass); configureReducerTypesIfPossible(job, reducerClass); } if (StringUtils.hasText(name)) { job.setJobName(name); } if (combiner != null) { job.setCombinerClass(resolveClass(combiner, loader, Reducer.class)); } if (groupingComparator != null) { job.setGroupingComparatorClass(resolveClass(groupingComparator, loader, RawComparator.class)); } if (inputFormat != null) { job.setInputFormatClass(resolveClass(inputFormat, loader, InputFormat.class)); } if (mapKey != null) { job.setMapOutputKeyClass(resolveClass(mapKey, loader, Object.class)); } if (mapValue != null) { job.setMapOutputValueClass(resolveClass(mapValue, loader, Object.class)); } if (numReduceTasks != null) { job.setNumReduceTasks(numReduceTasks); } if (key != null) { job.setOutputKeyClass(resolveClass(key, loader, Object.class)); } if (value != null) { job.setOutputValueClass(resolveClass(value, loader, Object.class)); } if (outputFormat != null) { job.setOutputFormatClass(resolveClass(outputFormat, loader, OutputFormat.class)); } if (partitioner != null) { job.setPartitionerClass(resolveClass(partitioner, loader, Partitioner.class)); } if (sortComparator != null) { job.setSortComparatorClass(resolveClass(sortComparator, loader, RawComparator.class)); } if (StringUtils.hasText(workingDir)) { job.setWorkingDirectory(new Path(workingDir)); } if (jarClass != null) { job.setJarByClass(jarClass); } if (!CollectionUtils.isEmpty(inputPaths)) { for (String path : inputPaths) { FileInputFormat.addInputPath(job, new Path(path)); } } if (StringUtils.hasText(outputPath)) { FileOutputFormat.setOutputPath(job, new Path(outputPath)); } if (compressOutput != null) { FileOutputFormat.setCompressOutput(job, compressOutput); } if (codecClass != null) { FileOutputFormat.setOutputCompressorClass( job, resolveClass(codecClass, loader, CompressionCodec.class)); } processJob(job); }
/** The main entry point if this class is called as a {@link Tool}. */
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    // retrieve our paths from the configuration
    Path inputPath = new Path(conf.get(Util.CONF_LOGDATA_PATH));
    Path outputPath = new Path(conf.get(Util.CONF_CACHING_SIMULATOR_PATH));

    final int numCores = conf.getInt(Util.CONF_NUM_CORES, Util.DEFAULT_NUM_CORES);
    final int numNodes = conf.getInt(Util.CONF_NUM_NODES, Util.DEFAULT_NUM_NODES);
    NUM_OF_REDUCE_TASKS = numCores * numNodes;

    // set the job name
    String jobName = Util.JOB_NAME + " [" + CachingTool.ACTION + "] {logdata="
            + inputPath.getName() + ", session=" + conf.get(Util.CONF_SESSION_DURATION) + "}";
    Util.showStatus("Running " + jobName);

    conf.set("hadoop.job.ugi", Util.HADOOP_USER);
    conf.set("mapred.child.java.opts", "-Xmx1500M -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode");
    conf.set("mapred.task.timeout", "1800000");
    conf.set("mapred.map.tasks.speculative.execution", "false");
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    FileSystem fs = FileSystem.get(conf);
    Job job = new Job(conf, jobName);

    // set the number of reduce tasks
    job.setNumReduceTasks(NUM_OF_REDUCE_TASKS);

    // set mapper, reducer, partitioner and grouping comparator
    job.setJarByClass(CachingTool.class);
    job.setMapperClass(CachingMapper.class);
    job.setReducerClass(CachingReducer.class);
    // grouping comparator used for secondary sort
    job.setGroupingComparatorClass(TextPair.FirstComparator.class);
    job.setPartitionerClass(TextPair.FirstPartitioner.class);
    job.setOutputKeyClass(TextPair.class);
    job.setOutputValueClass(Text.class);

    // set input and output format
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setMaxInputSplitSize(job, Util.DATASET_MB_SPLIT * 25);
    FileInputFormat.setMinInputSplitSize(job, Util.DATASET_MB_SPLIT * 25);

    // add input path subdirectories if there are any
    ArrayList<Path> inputPaths = Util.getInputDirectories(fs, inputPath);
    int pathsAdded = 0;
    if (inputPaths.size() > 0) {
        for (Path p : inputPaths) {
            if (!p.getName().contains(".") && !p.getName().contains("_")) {
                Util.showStatus("Adding input paths " + p);
                FileInputFormat.addInputPath(job, p);
                pathsAdded++;
            }
        }
    }
    if (pathsAdded == 0) {
        Util.showStatus("Adding input path " + inputPath);
        FileInputFormat.addInputPath(job, inputPath);
    }

    // clear the output dir
    fs.delete(outputPath.suffix("/" + conf.get(Util.CONF_SESSION_DURATION)), true);
    FileOutputFormat.setOutputPath(job,
            outputPath.suffix("/" + conf.get(Util.CONF_SESSION_DURATION)));

    // run the job and wait for it to be completed
    boolean b = job.waitForCompletion(true);
    // NOTE: the counters are only available after completion

    // retrieve the counters
    Counter numNewInCache = job.getCounters().findCounter(CachingReducer.CacheCounter.NEW_TO_CACHE);
    Counter numRenewCache = job.getCounters().findCounter(CachingReducer.CacheCounter.RENEW_CACHE);
    Counter numUsedFromCache =
            job.getCounters().findCounter(CachingReducer.CacheCounter.USED_FROM_CACHE);

    // write the counters to the metadata file
    Path headerPath = new Path(conf.get(Util.CONF_CACHING_SIMULATOR_PATH));
    FSDataOutputStream out =
            fs.create(headerPath.suffix("/" + DataSetHeader.SIMULATE_CACHING_METADATA_FILE));
    PrintWriter w = new PrintWriter(out);
    // the sum of all counters equals the number of queries in the log file
    w.println("hostnametypeAddedToCache=" + numNewInCache.getValue());
    w.println("queriesAddedAgainToCache=" + numRenewCache.getValue());
    w.println("queriesAnsweredFromCache=" + numUsedFromCache.getValue());
    w.close();
    out.close();

    // delete all empty output files
    Util.deleteEmptyFiles(fs, outputPath.suffix("/" + conf.get(Util.CONF_SESSION_DURATION)));

    // exit code 0 on success, 1 on failure
    return b ? 0 : 1;
}
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerItem", "m",
            "try to cap the number of similar items per item to this number "
                    + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption("maxCooccurrencesPerItem", "mo",
            "try to cap the number of cooccurrences per item to this number "
                    + "(default: " + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    String similarityClassName = parsedArgs.get("--similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class,
                ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class,
                ItemIDIndexReducer.class, VarIntWritable.class, VarLongWritable.class,
                SequenceFileOutputFormat.class);
        itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
        itemIDIndex.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job countUsers = prepareJob(inputPath, countUsersPath, TextInputFormat.class,
                CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
                CountUsersReducer.class, VarIntWritable.class, NullWritable.class,
                TextOutputFormat.class);
        countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
        countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
        countUsers.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class,
                ToItemPrefsMapper.class, VarLongWritable.class,
                booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
        toUserVector.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job maybePruneAndTranspose = prepareJob(userVectorPath, itemUserMatrixPath,
                SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
                DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class,
                IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
        maybePruneAndTranspose.getConfiguration()
                .setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES, maxCooccurrencesPerItem);
        maybePruneAndTranspose.waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    /* Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to
     * something like new DistributedRowMatrix(...).rowSimilarity(...) */
    ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] {
            "-Dmapred.input.dir=" + itemUserMatrixPath.toString(),
            "-Dmapred.output.dir=" + similarityMatrixPath.toString(),
            "--numberOfColumns", String.valueOf(numberOfUsers),
            "--similarityClassname", similarityClassName,
            "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem + 1),
            "--tempDir", tempDirPath.toString()});

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job mostSimilarItems = prepareJob(similarityMatrixPath, outputPath,
                SequenceFileInputFormat.class, MostSimilarItemPairsMapper.class,
                EntityEntityWritable.class, DoubleWritable.class, MostSimilarItemPairsReducer.class,
                EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class);
        Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
        mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
        mostSimilarItems.waitForCompletion(true);
    }

    return 0;
}
private int setJobParameters(Job job, AminoJob aj) throws Exception {
    final Configuration conf = job.getConfiguration();
    final Class<? extends DataLoader> dataLoaderClass = aj.getDataLoaderClass();
    AminoInputFormat.setDataLoader(job.getConfiguration(), dataLoaderClass.newInstance());

    if (aj instanceof AminoEnrichmentJob) {
        String output = "";
        int returnType = JOB_TYPE_ENRICHMENT;

        if (aj instanceof AminoReuseEnrichmentJob) {
            System.out.println("Running REUSE Enrichment Join Job");
            AminoReuseEnrichmentJob reuseJob = (AminoReuseEnrichmentJob) aj;
            AminoInputFormat.setDataLoader(job.getConfiguration(),
                    reuseJob.getFirstPhaseDataLoaderClass().newInstance());

            String root = conf.get(AminoDriverUtils.ENRICHMENT_ROOT_OUTPUT);
            if (!root.endsWith("/")) {
                root += "/";
            }
            String dir = reuseJob.getOutputSubDirectory(conf);
            output += root + dir;

            returnType = JOB_TYPE_REUSE_ENRICHMENT;
        } else {
            System.out.println("Running Enrichment Join Job");
        }

        int numReducers = conf.getInt(AMINO_NUM_REDUCERS_ENRICH_PHASE1,
                conf.getInt(AMINO_NUM_REDUCERS, DEFAULT_NUM_REDUCERS));
        job.setNumReduceTasks(numReducers);

        // our framework mapper and reducer
        job.setMapperClass(FrameworkEnrichmentJoinMapper.class);
        job.setCombinerClass(FrameworkEnrichmentJoinCombiner.class);
        job.setReducerClass(FrameworkEnrichmentJoinReducer.class);

        job.setMapOutputKeyClass(EnrichmentJoinKey.class); // different from the plain Amino job
        job.setMapOutputValueClass(MapWritable.class);

        job.setOutputKeyClass(BucketStripped.class);
        job.setOutputValueClass(MapWritable.class); // different from the plain Amino job

        // secondary sort: partition and group on the natural key, sort on the composite key
        job.setPartitionerClass(NaturalKeyPartitioner.class);
        job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
        job.setSortComparatorClass(CompositeKeyComparator.class);

        job.setInputFormatClass(AminoMultiInputFormat.class);

        AminoEnrichmentJob aej = (AminoEnrichmentJob) aj;
        AminoMultiInputFormat.setJoinDataLoaders(conf, aej.getEnrichmentDataLoaders());
        AminoMultiInputFormat.setEnrichWorker(conf, aej.getEnrichWorker().newInstance());

        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // TODO If it already exists, and its age is less than the job running frequency,
        // just reuse it instead of doing the above job...
        if (output.length() == 0) {
            output = getEnrichmentOutputPath(aej, conf);
        }
        System.out.println("Output will be written to: " + PathUtils.getJobDataPath(output));

        SequenceFileOutputFormat.setOutputPath(job, new Path(PathUtils.getJobDataPath(output)));
        JobUtilities.deleteDirectory(conf, output);
        CacheBuilder.buildCaches(AminoDataUtils.getDataLoader(conf), aj, output, conf);
        return returnType;
    } else {
        System.out.println("\n==================== Running Amino Job =================\n");

        // our framework mapper and reducer
        job.setMapperClass(FrameworkMapper.class);
        job.setReducerClass(FrameworkReducer.class);

        job.setMapOutputKeyClass(BucketStripped.class);
        job.setMapOutputValueClass(MapWritable.class);
        job.setOutputKeyClass(BucketStripped.class);
        job.setOutputValueClass(AminoWritable.class);

        job.setInputFormatClass(AminoInputFormat.class);
        job.setOutputFormatClass(AminoOutputFormat.class);
        job.setNumReduceTasks(conf.getInt(AMINO_NUM_REDUCERS, DEFAULT_NUM_REDUCERS));

        AminoOutputFormat.setAminoConfigPath(job,
                job.getConfiguration().get(AminoConfiguration.DEFAULT_CONFIGURATION_PATH_KEY));

        String output = conf.get("amino.output");
        System.out.println("Output will be written to: " + PathUtils.getJobDataPath(output));

        AminoOutputFormat.setOutputPath(job, new Path(PathUtils.getJobDataPath(output)));
        JobUtilities.deleteDirectory(conf, output);
        CacheBuilder.buildCaches(AminoDataUtils.getDataLoader(conf), aj, output, conf);
        return JOB_TYPE_NORMAL;
    }
}