public int run(String[] args) throws Exception {
  if (args.length != 5) {
    printUsage();
    return -1;
  }

  String inputPath = args[0];
  String outputPath = args[1];
  int mapTasks = Integer.parseInt(args[2]);
  int reduceTasks = Integer.parseInt(args[3]);
  String stoplistPath = args[4];

  sLogger.info("Tool: AFormatter");
  sLogger.info(" - input path: " + inputPath);
  sLogger.info(" - output path: " + outputPath);
  sLogger.info(" - number of mappers: " + mapTasks);
  sLogger.info(" - number of reducers: " + reduceTasks);
  sLogger.info(" - stoplist path: " + stoplistPath);

  JobConf conf = new JobConf(AFormatterWG.class);
  conf.setJobName("Authority Formatter -- Web Graph");

  conf.setNumMapTasks(mapTasks);
  conf.setNumReduceTasks(reduceTasks);

  FileInputFormat.setInputPaths(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  FileOutputFormat.setCompressOutput(conf, false);
  // conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(HITSNode.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  conf.setCompressMapOutput(true);
  conf.setSpeculativeExecution(false);
  // InputSampler.Sampler<IntWritable, Text> sampler =
  //     new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10, 10);
  // InputSampler.writePartitionFile(conf, sampler);
  // conf.setPartitionerClass(TotalOrderPartitioner.class);
  conf.setMapperClass(AFormatMapperIMC.class);
  conf.setCombinerClass(AFormatReducer.class);
  conf.setReducerClass(AFormatReducer.class);

  // Delete the output directory if it exists already.
  Path outputDir = new Path(outputPath);
  FileSystem.get(conf).delete(outputDir, true);

  Path stopList = new Path(stoplistPath);

  long startTime = System.currentTimeMillis();
  sLogger.info("Starting job");
  DistributedCache.addCacheFile(stopList.toUri(), conf);
  JobClient.runJob(conf);
  sLogger.info("Job Finished in "
      + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
  return 0;
}
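// A minimal driver sketch for running the tool above. It assumes the enclosing class is the
// AFormatterWG referenced in the JobConf and that it implements org.apache.hadoop.util.Tool;
// neither is confirmed by the snippet, so treat the names as assumptions.
public static void main(String[] args) throws Exception {
  // Expected arguments: [input-path] [output-path] [num-mappers] [num-reducers] [stoplist-path]
  int exitCode = ToolRunner.run(new AFormatterWG(), args);
  System.exit(exitCode);
}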
/** Test compressible {@link GridmixRecord}. */
@Test
public void testCompressibleGridmixRecord() throws IOException {
  JobConf conf = new JobConf();
  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);

  FileSystem lfs = FileSystem.getLocal(conf);
  int dataSize = 1024 * 1024 * 10; // 10 MB
  float ratio = 0.357F;

  // define the test's root temp directory
  Path rootTempDir =
      new Path(System.getProperty("test.build.data", "/tmp"))
          .makeQualified(lfs.getUri(), lfs.getWorkingDirectory());

  Path tempDir = new Path(rootTempDir, "TestPossiblyCompressibleGridmixRecord");
  lfs.delete(tempDir, true);

  // define a compressible GridmixRecord
  GridmixRecord record = new GridmixRecord(dataSize, 0);
  record.setCompressibility(true, ratio);

  // enable compression
  conf.setClass(FileOutputFormat.COMPRESS_CODEC, GzipCodec.class, CompressionCodec.class);
  org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput(conf, true);

  // write the record to a file
  Path recordFile = new Path(tempDir, "record");
  OutputStream outStream =
      CompressionEmulationUtil.getPossiblyCompressedOutputStream(recordFile, conf);
  DataOutputStream out = new DataOutputStream(outStream);
  record.write(out);
  out.close();
  outStream.close();

  // open the compressed stream for reading
  Path actualRecordFile = recordFile.suffix(".gz");
  InputStream in =
      CompressionEmulationUtil.getPossiblyDecompressedInputStream(actualRecordFile, conf, 0);

  // get the compressed file size
  long compressedFileSize = lfs.listStatus(actualRecordFile)[0].getLen();

  GridmixRecord recordRead = new GridmixRecord();
  recordRead.readFields(new DataInputStream(in));

  assertEquals("Record size mismatch in a compressible GridmixRecord",
      dataSize, recordRead.getSize());
  assertTrue("Failed to generate a compressible GridmixRecord",
      recordRead.getSize() > compressedFileSize);

  // check if the record can generate data with the desired compression ratio
  float seenRatio = ((float) compressedFileSize) / dataSize;
  assertEquals(CompressionEmulationUtil.standardizeCompressionRatio(ratio),
      CompressionEmulationUtil.standardizeCompressionRatio(seenRatio), 1.0D);
}
private void createPageRankLinksDirectly() throws IOException, URISyntaxException {
  log.info("Creating PageRank links...");

  JobConf job = new JobConf(PagerankData.class);
  String jobname = "Create pagerank links";

  Path fout = new Path(options.getResultPath(), EDGES_DIR_NAME);

  job.setJobName(jobname);
  setPageRankLinksOptions(job);

  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);
  // job.setMapOutputKeyClass(LongWritable.class);
  // job.setMapOutputValueClass(Text.class);

  job.setNumReduceTasks(0);

  FileInputFormat.setInputPaths(job, dummy.getPath());
  job.setInputFormat(NLineInputFormat.class);
  job.setMapperClass(DummyToPageRankLinksMapper.class);

  if (options.isSequenceOut()) {
    job.setOutputFormat(SequenceFileOutputFormat.class);
  } else {
    job.setOutputFormat(TextOutputFormat.class);
  }

  if (null != options.getCodecClass()) {
    // Set both the old and the new property names for compatibility.
    job.set("mapred.output.compression.type", "BLOCK");
    job.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass());
  }

  FileOutputFormat.setOutputPath(job, fout);

  log.info("Running Job: " + jobname);
  log.info("Dummy file " + dummy.getPath() + " as input");
  log.info("Edges file " + fout + " as output");
  JobClient.runJob(job);
  log.info("Finished Running Job: " + jobname);
}
/** Runs this tool. */
public int run(String[] args) throws Exception {
  if (args.length != 3) {
    printUsage();
    return -1;
  }

  String inputPath = args[0];
  String outputPath = args[1];
  String mappingFile = args[2];

  LOG.info("Tool: " + DemoCountTrecDocuments.class.getCanonicalName());
  LOG.info(" - input: " + inputPath);
  LOG.info(" - output dir: " + outputPath);
  LOG.info(" - docno mapping file: " + mappingFile);

  JobConf conf = new JobConf(getConf(), DemoCountTrecDocuments.class);
  conf.setJobName(DemoCountTrecDocuments.class.getSimpleName());
  conf.setNumReduceTasks(0);

  // Pass in the class name as a String; this makes the mapper general enough to load any
  // collection of Indexable objects that has a docid/docno mapping specified by a
  // DocnoMapping object.
  conf.set("DocnoMappingClass", TrecDocnoMapping.class.getCanonicalName());

  // Put the mapping file in the distributed cache so each map worker will have it.
  DistributedCache.addCacheFile(new URI(mappingFile), conf);

  FileInputFormat.setInputPaths(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  FileOutputFormat.setCompressOutput(conf, false);

  conf.setInputFormat(TrecDocumentInputFormat.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);
  conf.setMapperClass(MyMapper.class);

  // Delete the output directory if it exists already.
  FileSystem.get(conf).delete(new Path(outputPath), true);

  JobClient.runJob(conf);

  return 0;
}
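// A sketch of how the mapper side might pick up what run() configured above: instantiate the
// DocnoMapping subclass named by "DocnoMappingClass" via reflection, then load the mapping
// file shipped through the distributed cache. MyMapper's body is not shown in this snippet,
// so the value type (TrecDocument), the loadMapping signature, and the map logic below are
// assumptions for illustration only.
public static class MyMapper extends MapReduceBase
    implements Mapper<LongWritable, TrecDocument, Text, IntWritable> {
  private DocnoMapping docMapping;

  @Override
  public void configure(JobConf job) {
    try {
      // Instantiate the mapping class named in the job configuration.
      docMapping = (DocnoMapping) Class.forName(job.get("DocnoMappingClass")).newInstance();
      // The mapping file was placed in the distributed cache; load the local copy.
      Path[] localFiles = DistributedCache.getLocalCacheFiles(job);
      docMapping.loadMapping(localFiles[0], FileSystem.getLocal(job));
    } catch (Exception e) {
      throw new RuntimeException("Error initializing DocnoMapping!", e);
    }
  }

  @Override
  public void map(LongWritable key, TrecDocument doc,
      OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
    // Illustrative only: emit the docid paired with its mapped docno.
    output.collect(new Text(doc.getDocid()), new IntWritable(docMapping.getDocno(doc.getDocid())));
  }
}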
/**
 * Test that {@link FileQueue} can identify a compressed file and provide readers that extract
 * uncompressed data, but only if input-compression emulation is enabled.
 */
@Test
public void testFileQueueDecompression() throws IOException {
  JobConf conf = new JobConf();
  FileSystem lfs = FileSystem.getLocal(conf);
  String inputLine = "Hi Hello!";

  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);
  org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput(conf, true);
  org.apache.hadoop.mapred.FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);

  // define the test's root temp directory
  Path rootTempDir =
      new Path(System.getProperty("test.build.data", "/tmp"))
          .makeQualified(lfs.getUri(), lfs.getWorkingDirectory());

  Path tempDir = new Path(rootTempDir, "TestFileQueueDecompression");
  lfs.delete(tempDir, true);

  // create a compressed file
  Path compressedFile = new Path(tempDir, "test");
  OutputStream out =
      CompressionEmulationUtil.getPossiblyCompressedOutputStream(compressedFile, conf);
  BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out));
  writer.write(inputLine);
  writer.close();

  // now read back the data from the compressed stream using FileQueue
  compressedFile = compressedFile.suffix(".gz");
  long fileSize = lfs.listStatus(compressedFile)[0].getLen();
  CombineFileSplit split =
      new CombineFileSplit(new Path[] {compressedFile}, new long[] {fileSize});
  FileQueue queue = new FileQueue(split, conf);
  byte[] bytes = new byte[inputLine.getBytes().length];
  queue.read(bytes);
  queue.close();
  String readLine = new String(bytes);
  assertEquals("Compression/Decompression error", inputLine, readLine);
}
private void createPageRankNodesDirectly() throws IOException {
  log.info("Creating PageRank nodes...");

  Path fout = new Path(options.getResultPath(), VERTICALS_DIR_NAME);

  JobConf job = new JobConf(PagerankData.class);
  String jobname = "Create pagerank nodes";

  job.setJobName(jobname);
  setPageRankNodesOptions(job);

  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(job, dummy.getPath());
  job.setInputFormat(NLineInputFormat.class);

  if (balance) {
    // Balance the output order of nodes to keep the PageRank benchmark from running into
    // potential data skew.
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setMapperClass(BalancedLinkNodesMapper.class);
    job.setReducerClass(BalancedLinkNodesReducer.class);
    // job.setPartitionerClass(ModulusPartitioner.class);

    if (options.getNumReds() > 0) {
      job.setNumReduceTasks(options.getNumReds());
    } else {
      job.setNumReduceTasks(Utils.getMaxNumReds());
    }
  } else {
    job.setMapOutputKeyClass(Text.class);
    job.setMapperClass(DummyToNodesMapper.class);
    job.setNumReduceTasks(0);
  }

  if (options.isSequenceOut()) {
    job.setOutputFormat(SequenceFileOutputFormat.class);
  } else {
    job.setOutputFormat(TextOutputFormat.class);
  }

  if (null != options.getCodecClass()) {
    // Set both the old and the new property names for compatibility.
    job.set("mapred.output.compression.type", "BLOCK");
    job.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass());
  }

  FileOutputFormat.setOutputPath(job, fout);

  log.info("Running Job: " + jobname);
  log.info("Dummy file " + dummy.getPath() + " as input");
  log.info("Vertices file " + fout + " as output");
  JobClient.runJob(job);
  log.info("Finished Running Job: " + jobname);
}
/** Runs this tool. */
public int run(String[] args) throws Exception {
  long startWholeProgram = System.currentTimeMillis();
  boolean hasConverged = false;
  int counter = 0;
  Reader sequenceFilereader;

  // Must have four arguments
  if (args.length != 4) {
    MapReduce.printUsage();
    return -1;
  }

  // Set input and output file paths
  String inputPathToAdjacencyTextFile = args[0];
  String outputPathToNodeSequenceFileFormat = args[1];

  // Configure Job 1:
  int mapTasks = Integer.parseInt(args[2]);
  int reduceTasks = Integer.parseInt(args[3]);
  int reduceTasksSetup = 1;

  // Configure Job Setup
  JobConf conf1 = new JobConf(MapReduce.class);
  conf1.setInt("numberOfNodes", numberNodes);
  conf1.setJobName("Setup Job");
  conf1.setNumMapTasks(mapTasks);
  conf1.setNumReduceTasks(reduceTasksSetup);

  FileInputFormat.setInputPaths(conf1, new Path(inputPathToAdjacencyTextFile));
  FileOutputFormat.setOutputPath(conf1, new Path(outputPathToNodeSequenceFileFormat));
  FileOutputFormat.setCompressOutput(conf1, false);

  conf1.setOutputKeyClass(Text.class);
  conf1.setOutputValueClass(MapReduceNode.class);

  conf1.setOutputFormat(SequenceFileOutputFormat.class);
  conf1.setInputFormat(TextInputFormat.class);

  conf1.setMapperClass(ConfigureMapper.class);
  conf1.setReducerClass(ConfigureReducer.class);

  // Delete the output directory if it exists already
  Path tempDir = new Path(outputPathToNodeSequenceFileFormat);
  FileSystem.get(tempDir.toUri(), conf1).delete(tempDir, true);

  long startTime = System.currentTimeMillis();

  // Run Configure Job
  RunningJob job = JobClient.runJob(conf1);

  sLogger.info("Config Job Finished in "
      + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  String inputPath = args[1];
  String outputPath = null;
  String outputPathForNormalisedPagerank = null;
  String outputPathforFinalSortedPagerank = null;

  // Page Rank Calculation Job 2 (iterative until convergence has been reached)
  while (!hasConverged) {
    counter++;
    sLogger.info("***** ITERATION " + counter + ", number of nodes: " + numberNodes);

    outputPath = args[1] + counter;

    // Pure Page Rank Calculation Job Setup
    JobConf pageRankJob = new JobConf(getConf(), MapReduce.class);
    pageRankJob.setInt("numberOfNodes", numberNodes);

    FileInputFormat.setInputPaths(pageRankJob, new Path(inputPath));
    FileOutputFormat.setOutputPath(pageRankJob, new Path(outputPath));
    FileOutputFormat.setCompressOutput(pageRankJob, false);

    pageRankJob.setJobName("PP Iteration " + counter);
    pageRankJob.setNumMapTasks(mapTasks);
    pageRankJob.setNumReduceTasks(reduceTasks);

    pageRankJob.setOutputKeyClass(Text.class);
    pageRankJob.setOutputValueClass(MapReduceNode.class);

    pageRankJob.setOutputFormat(SequenceFileOutputFormat.class);
    pageRankJob.setInputFormat(SequenceFileInputFormat.class);

    pageRankJob.setMapperClass(MapReduce.PageRankCalcMapper.class);
    pageRankJob.setReducerClass(MapReduce.PageRankCalcReducer.class);

    // Delete the output directory if it exists already
    Path tempPageRankDir = new Path(outputPath);
    FileSystem.get(tempPageRankDir.toUri(), conf1).delete(tempPageRankDir, true);

    startTime = System.currentTimeMillis();

    // Run Pure Page Rank Calculation Job
    RunningJob runningJob = JobClient.runJob(pageRankJob);

    sLogger.info("PP Job " + counter + " Finished in "
        + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Delete the input directory if it exists already
    Path tempInputPageRankDir = new Path(inputPath);
    FileSystem.get(tempInputPageRankDir.toUri(), conf1).delete(tempInputPageRankDir, true);

    // Set the output path of this iteration to be the input path for the next iteration
    inputPath = outputPath;

    // Check for convergence after every five iterations
    if (counter % 5 == 0) {
      Configuration conf = getConf();
      if (outputPath != null) {
        sLogger.info("Attempting to open file: " + outputPath + File.separator + "part-00000");
      } else {
        sLogger.info("OUTPUT PATH IS NULL");
      }

      Path cFile = new Path(outputPath + File.separator + "part-00000");
      FileSystem fs = FileSystem.get(cFile.toUri(), conf);
      sequenceFilereader = new Reader(fs, cFile, conf);

      // Sample the first five nodes; declare convergence only if all of them have converged.
      for (int i = 0; i < 5; i++) {
        MapReduceNode readValue = new MapReduceNode();
        Text readKey = new Text();
        sequenceFilereader.next(readKey, readValue);
        if (!readValue.hasConverged()) {
          break;
        }
        if (i == 4) {
          hasConverged = true;
        }
      }
      sequenceFilereader.close();
    }

    if (counter == 75) {
      sLogger.info("****************** Exiting (purposefully) after 75th iteration");
      hasConverged = true;
    }
  }

  // Normalised Page Rank Calculation Job 3
  outputPathForNormalisedPagerank = args[1] + "normalizedPageRank";

  // Normalised Page Rank Calculation Job Setup
  JobConf normalizationJob = new JobConf(getConf(), MapReduce.class);

  FileInputFormat.setInputPaths(normalizationJob, new Path(inputPath));
  FileOutputFormat.setOutputPath(normalizationJob, new Path(outputPathForNormalisedPagerank));
  FileOutputFormat.setCompressOutput(normalizationJob, false);

  normalizationJob.setJobName("Normalised Pagerank Output");
  normalizationJob.setNumMapTasks(mapTasks);
  normalizationJob.setNumReduceTasks(1);

  normalizationJob.setOutputKeyClass(Text.class);
  normalizationJob.setOutputValueClass(DoubleWritable.class);

  normalizationJob.setInputFormat(SequenceFileInputFormat.class);
  normalizationJob.setOutputFormat(SequenceFileOutputFormat.class);

  normalizationJob.setMapperClass(NormalisationMapper.class);
  normalizationJob.setReducerClass(NormalisationReducer.class);

  // Delete the output directory if it exists already
  Path tempUpdatedPageRankDir = new Path(outputPathForNormalisedPagerank);
  FileSystem.get(tempUpdatedPageRankDir.toUri(), conf1).delete(tempUpdatedPageRankDir, true);

  startTime = System.currentTimeMillis();

  // Run Normalised Page Rank Calculation Job
  RunningJob runningUpdateJob = JobClient.runJob(normalizationJob);

  sLogger.info("Normalisation Job Finished in "
      + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  // Sorting and Output Job 4

  // Delete the intermediary files created
  Path tempNormalizationInputPath = new Path(inputPath);
  FileSystem.get(tempNormalizationInputPath.toUri(), conf1).delete(tempNormalizationInputPath, true);

  inputPath = outputPathForNormalisedPagerank;
  outputPathforFinalSortedPagerank = args[1] + "FinalSortedPageRank";

  // Sorting and Output Job Setup
  JobConf outputJob = new JobConf(getConf(), MapReduce.class);

  FileInputFormat.setInputPaths(outputJob, new Path(inputPath));
  FileOutputFormat.setOutputPath(outputJob, new Path(outputPathforFinalSortedPagerank));
  FileOutputFormat.setCompressOutput(outputJob, false);

  outputJob.setJobName("Final Pagerank Output");
  sLogger.info("Starting final sorting job -> this will output a single file");
  outputJob.setNumMapTasks(1);
  outputJob.setNumReduceTasks(1);

  outputJob.setOutputKeyClass(DoubleWritable.class);
  outputJob.setOutputValueClass(Text.class);

  outputJob.setInputFormat(SequenceFileInputFormat.class);
  outputJob.setOutputFormat(TextOutputFormat.class);

  outputJob.setMapperClass(OutputMapper.class);
  outputJob.setReducerClass(OutputReducer.class);

  outputJob.setOutputKeyComparatorClass(ReverseComparator.class);

  startTime = System.currentTimeMillis();

  // Run Sorting and Output Job
  RunningJob runningSortingJob = JobClient.runJob(outputJob);

  // Delete the intermediary files created
  Path tempFinalSortedInputPath = new Path(inputPath);
  FileSystem.get(tempFinalSortedInputPath.toUri(), conf1).delete(tempFinalSortedInputPath, true);

  sLogger.info("Final Sorting Job Finished in "
      + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

  sLogger.info("The program lasted "
      + (System.currentTimeMillis() - startWholeProgram) / 1000.0 + "s ("
      + (System.currentTimeMillis() - startWholeProgram) / 60000.0 + " mins)");

  return 0;
}
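// The final job above sorts on DoubleWritable keys with ReverseComparator so the highest
// PageRank scores come first in the single output file. The actual ReverseComparator is not
// shown in this snippet; the following is a minimal sketch of one way to write it (the class
// name matches the reference above, but its body is an assumption).
public static class ReverseComparator extends WritableComparator {
  public ReverseComparator() {
    // Pass true so WritableComparator creates key instances for the object-based compare().
    super(DoubleWritable.class, true);
  }

  @Override
  @SuppressWarnings("rawtypes")
  public int compare(WritableComparable a, WritableComparable b) {
    // Negate the natural DoubleWritable ordering to sort in descending order.
    return -super.compare(a, b);
  }
}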
@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(
      OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
  options.addOption(
      OptionBuilder.withArgName("path").hasArg().withDescription("tmp output directory")
          .create(OUTPUT_OPTION));
  options.addOption(
      OptionBuilder.withArgName("path").hasArg().withDescription("index file")
          .create(INDEX_FILE_OPTION));

  CommandLine cmdline;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    return -1;
  }

  if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
      || !cmdline.hasOption(INDEX_FILE_OPTION)) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(this.getClass().getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
  String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
  String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);

  if (!inputPath.isAbsolute()) {
    System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
    return -1;
  }

  JobConf conf = new JobConf(getConf(), BuildWikipediaForwardIndex.class);
  FileSystem fs = FileSystem.get(conf);

  LOG.info("Tool name: " + this.getClass().getName());
  LOG.info(" - input path: " + inputPath);
  LOG.info(" - output path: " + outputPath);
  LOG.info(" - index file: " + indexFile);
  LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

  conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s]",
      INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile));

  conf.setNumReduceTasks(1);

  FileInputFormat.setInputPaths(conf, inputPath);
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  FileOutputFormat.setCompressOutput(conf, false);

  conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapRunnerClass(MyMapRunner.class);
  conf.setReducerClass(IdentityReducer.class);

  // delete the output directory if it exists already
  fs.delete(new Path(outputPath), true);

  RunningJob job = JobClient.runJob(conf);

  Counters counters = job.getCounters();
  int blocks = (int) counters.findCounter(Blocks.Total).getCounter();
  LOG.info("number of blocks: " + blocks);

  LOG.info("Writing index file...");
  LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
  FSDataOutputStream out = fs.create(new Path(indexFile), true);
  out.writeUTF("edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex");
  out.writeUTF(inputPath.toString());
  out.writeInt(blocks);

  int cnt = 0;
  Text line = new Text();
  while (reader.readLine(line) > 0) {
    String[] arr = line.toString().split("\\s+");
    int docno = Integer.parseInt(arr[0]);
    int offset = Integer.parseInt(arr[1]);
    short fileno = Short.parseShort(arr[2]);
    out.writeInt(docno);
    out.writeInt(offset);
    out.writeShort(fileno);

    cnt++;
    if (cnt % 100000 == 0) {
      LOG.info(cnt + " blocks written");
    }
  }
  reader.close();
  out.close();

  if (cnt != blocks) {
    throw new RuntimeException("Error: mismatch in block count!");
  }

  return 0;
}
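// For reference, a minimal sketch of reading back the index file written above; it simply
// mirrors the writeUTF/writeInt/writeShort sequence. The method name and where it would live
// are assumptions for illustration.
private static void readIndexFile(FileSystem fs, Path indexFile) throws IOException {
  FSDataInputStream in = fs.open(indexFile);
  String indexClass = in.readUTF();      // index implementation class name
  String collectionPath = in.readUTF();  // path of the indexed collection
  int blocks = in.readInt();             // number of (docno, offset, fileno) entries
  for (int i = 0; i < blocks; i++) {
    int docno = in.readInt();
    int offset = in.readInt();
    short fileno = in.readShort();
    // ... store or use the (docno, offset, fileno) triple ...
  }
  in.close();
}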