/**
 * Reads back the evaluations.
 *
 * @param fs File System
 * @param conf Job configuration
 * @param outpath output <code>Path</code>
 * @param evaluations <code>List&lt;CDFitness&gt;</code> that is filled with the evaluated fitness of
 *     each candidate from the input population, in the same order as the candidates
 * @throws IOException
 */
private static void importEvaluations(
    FileSystem fs, JobConf conf, Path outpath, List<CDFitness> evaluations) throws IOException {
  Sorter sorter = new Sorter(fs, LongWritable.class, CDFitness.class, conf);

  // merge and sort the outputs
  Path[] outfiles = OutputUtils.listOutputFiles(fs, outpath);
  Path output = new Path(outpath, "output.sorted");
  sorter.merge(outfiles, output);

  // import the evaluations
  LongWritable key = new LongWritable();
  CDFitness value = new CDFitness();
  Reader reader = new Reader(fs, output, conf);

  while (reader.next(key, value)) {
    evaluations.add(new CDFitness(value));
  }

  reader.close();
}
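/*
 * A minimal, self-contained sketch of the merge-then-read pattern used above, with the same
 * org.apache.hadoop.io.SequenceFile.Sorter and SequenceFile.Reader APIs. The paths, key/value
 * types, and part-file names here are illustrative assumptions, not taken from the original job.
 */
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class MergeAndReadSketch {

  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Merge the per-reducer output files into a single sorted sequence file.
    SequenceFile.Sorter sorter =
        new SequenceFile.Sorter(fs, LongWritable.class, Text.class, conf);
    Path[] parts = {new Path("out/part-00000"), new Path("out/part-00001")};
    Path merged = new Path("out/output.sorted");
    sorter.merge(parts, merged);

    // Read the merged file back; records arrive in key order.
    List<String> values = new ArrayList<String>();
    LongWritable key = new LongWritable();
    Text value = new Text();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, merged, conf);
    try {
      while (reader.next(key, value)) {
        values.add(value.toString());
      }
    } finally {
      reader.close();
    }
  }
}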
private void cloneOutput() throws IOException {
  List<FileStatus> listStatus = getOutputMappings();

  /*
   * Initialize to empty list, in which case swap() will be a no-op. The reference is then replaced with a real list, which is
   * used in the subsequent iterations.
   */
  List<Path> crushInput = emptyList();

  Text srcFile = new Text();
  Text crushOut = new Text();
  Text prevCrushOut = new Text();

  for (FileStatus partFile : listStatus) {
    Path path = partFile.getPath();

    Reader reader = new Reader(fs, path, fs.getConf());
    try {
      while (reader.next(srcFile, crushOut)) {
        if (!crushOut.equals(prevCrushOut)) {
          swap(crushInput, prevCrushOut.toString());

          prevCrushOut.set(crushOut);
          crushInput = new LinkedList<Path>();
        }

        crushInput.add(new Path(srcFile.toString()));
      }
    } finally {
      try {
        reader.close();
      } catch (IOException e) {
        LOG.warn("Trapped exception when closing " + path, e);
      }
    }

    swap(crushInput, prevCrushOut.toString());
  }
}
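/*
 * The loop above relies on the mapping files being grouped by crush output file: a run of
 * consecutive records that share the same value is batched into a single swap() call. A small,
 * self-contained sketch of that grouping invariant, using plain collections and hypothetical
 * String[] {srcFile, crushOut} records rather than the tool's own types:
 */
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class GroupConsecutiveSketch {

  public static Map<String, List<String>> groupConsecutive(List<String[]> records) {
    Map<String, List<String>> batches = new LinkedHashMap<String, List<String>>();
    String prev = null;
    List<String> batch = new ArrayList<String>();

    for (String[] record : records) {
      String src = record[0];
      String crushOut = record[1];

      // A new crush output file starts a new batch, mirroring the prevCrushOut check above.
      if (!crushOut.equals(prev)) {
        if (prev != null) {
          batches.put(prev, batch);
        }
        prev = crushOut;
        batch = new ArrayList<String>();
      }
      batch.add(src);
    }

    // Flush the final batch, mirroring the trailing swap() call.
    if (prev != null) {
      batches.put(prev, batch);
    }
    return batches;
  }
}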
@Test
public void testSequenceFile() throws Exception {
  populateFile();

  Pipeline p = Pipeline.create(pipelineOptions.getOptions());

  @SuppressWarnings("unchecked")
  Class<? extends FileInputFormat<IntWritable, Text>> inputFormatClass =
      (Class<? extends FileInputFormat<IntWritable, Text>>)
          (Class<?>) SequenceFileInputFormat.class;
  HadoopIO.Read.Bound<IntWritable, Text> read =
      HadoopIO.Read.from(
          inputFile.getAbsolutePath(), inputFormatClass, IntWritable.class, Text.class);
  PCollection<KV<IntWritable, Text>> input =
      p.apply(read)
          .setCoder(
              KvCoder.of(WritableCoder.of(IntWritable.class), WritableCoder.of(Text.class)));

  @SuppressWarnings("unchecked")
  Class<? extends FileOutputFormat<IntWritable, Text>> outputFormatClass =
      (Class<? extends FileOutputFormat<IntWritable, Text>>)
          (Class<?>) TemplatedSequenceFileOutputFormat.class;
  @SuppressWarnings("unchecked")
  HadoopIO.Write.Bound<IntWritable, Text> write =
      HadoopIO.Write.to(
          outputFile.getAbsolutePath(), outputFormatClass, IntWritable.class, Text.class);
  input.apply(write.withoutSharding());

  p.run();

  IntWritable key = new IntWritable();
  Text value = new Text();
  try (Reader reader =
      new Reader(new Configuration(), Reader.file(new Path(outputFile.toURI())))) {
    int i = 0;
    while (reader.next(key, value)) {
      assertEquals(i, key.get());
      assertEquals("value-" + i, value.toString());
      i++;
    }
  }
}
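/*
 * populateFile() is not shown in this excerpt. Judging from the assertions above, it presumably
 * writes sequential IntWritable keys with "value-<i>" Text values to inputFile. A minimal sketch
 * under that assumption (the record count of 5 is a guess, not taken from the test):
 */
// Requires: org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.Path,
// org.apache.hadoop.io.IntWritable, org.apache.hadoop.io.SequenceFile, org.apache.hadoop.io.Text
private void populateFile() throws IOException {
  Configuration conf = new Configuration();
  Path path = new Path(inputFile.toURI());
  try (SequenceFile.Writer writer =
      SequenceFile.createWriter(
          conf,
          SequenceFile.Writer.file(path),
          SequenceFile.Writer.keyClass(IntWritable.class),
          SequenceFile.Writer.valueClass(Text.class))) {
    for (int i = 0; i < 5; i++) {
      writer.append(new IntWritable(i), new Text("value-" + i));
    }
  }
}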
/**
 * Moves the skipped files to the output directory. Called when operating in normal (non-clone)
 * mode.
 */
private void moveOutput() throws IOException {
  List<FileStatus> listStatus = getOutputMappings();

  Text srcFile = new Text();
  Text crushOut = new Text();

  Set<String> crushOutputFiles = new HashSet<String>(nBuckets);

  for (FileStatus partFile : listStatus) {
    Path path = partFile.getPath();

    Reader reader = new Reader(fs, path, fs.getConf());
    try {
      while (reader.next(srcFile, crushOut)) {
        crushOutputFiles.add(new Path(crushOut.toString()).toUri().getPath());
      }
    } finally {
      try {
        reader.close();
      } catch (IOException e) {
        LOG.warn("Trapped exception when closing " + path, e);
      }
    }
  }

  assert crushOutputFiles.size() == nBuckets;

  /*
   * The crush output files will appear in a subdirectory of the output directory. The subdirectory will be the full path of
   * the input directory that was crushed. E.g.
   *
   * Crush input:
   * /user/me/input/dir1/file1
   * /user/me/input/dir1/file2
   * /user/me/input/dir2/file3
   * /user/me/input/dir2/file4
   * /user/me/input/dir3/dir4/file5
   * /user/me/input/dir3/dir4/file6
   *
   * Crush output:
   * /user/me/output/user/me/input/dir1/crushed_file ...
   * /user/me/output/user/me/input/dir2/crushed_file ...
   * /user/me/output/user/me/input/dir3/dir4/crushed_file ...
   *
   * We need to collapse this down to:
   * /user/me/output/dir1/crushed_file ...
   * /user/me/output/dir2/crushed_file ...
   * /user/me/output/dir3/dir4/crushed_file ...
   */
  String srcDirName = fs.makeQualified(srcDir).toUri().getPath();
  String destName = fs.makeQualified(dest).toUri().getPath();
  String partToReplace = fs.makeQualified(outDir).toUri().getPath() + "/crush" + srcDirName;

  print(Verbosity.INFO, "\n\nCopying crush files to " + destName);

  for (String crushOutputFile : crushOutputFiles) {
    Path srcPath = new Path(crushOutputFile);
    Path destPath =
        new Path(destName + crushOutputFile.substring(partToReplace.length())).getParent();

    rename(srcPath, destPath, null);
  }

  print(Verbosity.INFO, "\n\nMoving skipped files to " + destName);

  /*
   * Don't forget to move the files that were not crushed to the output dir so that the output dir has all the data that was
   * in the input dir, the difference being that there are fewer files in the output dir.
   */
  for (String name : skippedFiles) {
    Path srcPath = new Path(name);
    Path destPath = new Path(destName + name.substring(srcDirName.length())).getParent();

    rename(srcPath, destPath, null);
  }
}
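/*
 * A small self-contained check of the path-collapsing string surgery above, using the example
 * paths from the comment. All values are illustrative; "/tmp/crush-job" stands in for the job's
 * outDir, which is not necessarily the same directory as dest.
 */
public class CollapsePathSketch {

  public static void main(String[] args) {
    String srcDirName = "/user/me/input";
    String destName = "/user/me/output";
    String outDirName = "/tmp/crush-job"; // hypothetical job output dir

    // The crush job writes its files under <outDir>/crush<srcDirName>.
    String partToReplace = outDirName + "/crush" + srcDirName;
    String crushOutputFile = partToReplace + "/dir3/dir4/crushed_file";

    // Swap the prefix: keep everything after partToReplace and graft it onto destName.
    String collapsed = destName + crushOutputFile.substring(partToReplace.length());

    // Prints /user/me/output/dir3/dir4/crushed_file
    System.out.println(collapsed);
  }
}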
/** Runs this tool. */
public int run(String[] args) throws Exception {
  long startWholeProgram = System.currentTimeMillis();
  boolean hasConverged = false;
  int counter = 0;
  Reader sequenceFilereader;

  // Must have four arguments
  if (args.length != 4) {
    MapReduce.printUsage();
    return -1;
  }

  // Set input and output file paths
  String inputPathToAdjacencyTextFile = args[0];
  String outputPathToNodeSequenceFileFormat = args[1];

  // Configure Job 1:
  int mapTasks = Integer.parseInt(args[2]);
  int reduceTasks = Integer.parseInt(args[3]);
  int reduceTasksSetup = 1;

  // Configure Job Setup
  JobConf conf1 = new JobConf(MapReduce.class);
  conf1.setInt("numberOfNodes", numberNodes);
  conf1.setJobName("Setup Job");
  conf1.setNumMapTasks(mapTasks);
  conf1.setNumReduceTasks(reduceTasksSetup);

  FileInputFormat.setInputPaths(conf1, new Path(inputPathToAdjacencyTextFile));
  FileOutputFormat.setOutputPath(conf1, new Path(outputPathToNodeSequenceFileFormat));
  FileOutputFormat.setCompressOutput(conf1, false);

  conf1.setOutputKeyClass(Text.class);
  conf1.setOutputValueClass(MapReduceNode.class);

  conf1.setOutputFormat(SequenceFileOutputFormat.class);
  conf1.setInputFormat(TextInputFormat.class);

  conf1.setMapperClass(ConfigureMapper.class);
  conf1.setReducerClass(ConfigureReducer.class);

  // Delete the output directory if it exists already
  Path tempDir = new Path(outputPathToNodeSequenceFileFormat);
  FileSystem.get(tempDir.toUri(), conf1).delete(tempDir, true);

  long startTime = System.currentTimeMillis();

  // Run Configure Job
  RunningJob job = JobClient.runJob(conf1);

  sLogger.info(
      "Config Job Finished in "
          + (System.currentTimeMillis() - startTime) / 1000.0
          + " seconds");

  String inputPath = args[1];
  String outputPath = null;
  String outputPathForNormalisedPagerank = null;
  String outputPathforFinalSortedPagerank = null;

  // Page Rank Calculation Job 2 (iterative until convergence is reached)
  while (!hasConverged) {
    System.out.println("*** ITERATION " + counter + ", number of nodes: " + numberNodes);
    counter++;
    sLogger.info("***** ITERATION " + counter);

    outputPath = args[1] + counter;

    // Pure Page Rank Calculation Job Setup
    JobConf pageRankJob = new JobConf(getConf(), MapReduce.class);
    pageRankJob.setInt("numberOfNodes", numberNodes);

    FileInputFormat.setInputPaths(pageRankJob, new Path(inputPath));
    FileOutputFormat.setOutputPath(pageRankJob, new Path(outputPath));
    FileOutputFormat.setCompressOutput(pageRankJob, false);

    pageRankJob.setJobName("PP Iteration " + counter);
    pageRankJob.setNumMapTasks(mapTasks);
    pageRankJob.setNumReduceTasks(reduceTasks);

    pageRankJob.setOutputKeyClass(Text.class);
    pageRankJob.setOutputValueClass(MapReduceNode.class);

    pageRankJob.setOutputFormat(SequenceFileOutputFormat.class);
    pageRankJob.setInputFormat(SequenceFileInputFormat.class);

    pageRankJob.setMapperClass(MapReduce.PageRankCalcMapper.class);
    pageRankJob.setReducerClass(MapReduce.PageRankCalcReducer.class);

    // Delete the output directory if it exists already
    Path tempPageRankDir = new Path(outputPath);
    FileSystem.get(tempPageRankDir.toUri(), conf1).delete(tempPageRankDir, true);

    startTime = System.currentTimeMillis();

    // Run Pure Page Rank Calculation Job
    RunningJob runningJob = JobClient.runJob(pageRankJob);

    sLogger.info(
        "PP Job " + counter + " finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0
            + " seconds");

    // Delete this iteration's input directory; it is no longer needed
    Path tempInputPageRankDir = new Path(inputPath);
    FileSystem.get(tempInputPageRankDir.toUri(), conf1).delete(tempInputPageRankDir, true);

    // Set the output path of this iteration to be the input path for the next iteration
    inputPath = outputPath;

    // Check for convergence after every five iterations
    if (counter % 5 == 0) {
      Configuration conf = getConf();
      if (outputPath != null) {
        sLogger.info("Attempting to open file: " + outputPath + File.separator + "part-00000");
        System.out.println(
            "Attempting to open file: " + outputPath + File.separator + "part-00000");
      } else {
        sLogger.info("OUTPUT PATH IS NULL");
        System.out.println("OUTPUT PATH IS NULL");
      }

      Path cFile = new Path(outputPath + File.separator + "part-00000");
      FileSystem fs = FileSystem.get(cFile.toUri(), conf);

      // Sample the first five nodes; converged only if all five have converged
      sequenceFilereader = new Reader(fs, cFile, conf);
      try {
        for (int i = 0; i < 5; i++) {
          MapReduceNode readValue = new MapReduceNode();
          Text readKey = new Text();
          sequenceFilereader.next(readKey, readValue);
          if (!readValue.hasConverged()) {
            break;
          }
          if (i == 4) {
            hasConverged = true;
          }
        }
      } finally {
        // Close exactly once; the original closed the reader twice on convergence
        sequenceFilereader.close();
      }
    }

    if (counter == 75) {
      sLogger.info("****************** Exiting (purposefully) after 75th iteration");
      hasConverged = true;
    }
  }

  // Normalised Page Rank Calculation Job 3
  outputPathForNormalisedPagerank = args[1] + "normalizedPageRank";

  // Normalised Page Rank Calculation Job Setup
  JobConf normalizationJob = new JobConf(getConf(), MapReduce.class);

  FileInputFormat.setInputPaths(normalizationJob, new Path(inputPath));
  FileOutputFormat.setOutputPath(normalizationJob, new Path(outputPathForNormalisedPagerank));
  FileOutputFormat.setCompressOutput(normalizationJob, false);

  normalizationJob.setJobName("Normalised Pagerank Output");
  normalizationJob.setNumMapTasks(mapTasks);
  normalizationJob.setNumReduceTasks(1);

  normalizationJob.setOutputKeyClass(Text.class);
  normalizationJob.setOutputValueClass(DoubleWritable.class);

  normalizationJob.setInputFormat(SequenceFileInputFormat.class);
  normalizationJob.setOutputFormat(SequenceFileOutputFormat.class);

  normalizationJob.setMapperClass(NormalisationMapper.class);
  normalizationJob.setReducerClass(NormalisationReducer.class);

  // Delete the output directory if it exists already
  Path tempUpdatedPageRankDir = new Path(outputPathForNormalisedPagerank);
  FileSystem.get(tempUpdatedPageRankDir.toUri(), conf1).delete(tempUpdatedPageRankDir, true);

  startTime = System.currentTimeMillis();

  // Run Normalised Page Rank Calculation Job
  RunningJob runningUpdateJob = JobClient.runJob(normalizationJob);

  sLogger.info(
      "Normalisation Job Finished in "
          + (System.currentTimeMillis() - startTime) / 1000.0
          + " seconds");

  // Sorting and Output Job 4

  // Delete the intermediary files created
  Path tempNormalizationInputPath = new Path(inputPath);
  FileSystem.get(tempNormalizationInputPath.toUri(), conf1)
      .delete(tempNormalizationInputPath, true);

  inputPath = outputPathForNormalisedPagerank;
  outputPathforFinalSortedPagerank = args[1] + "FinalSortedPageRank";

  // Sorting and Output Job Setup
  JobConf outputJob = new JobConf(getConf(), MapReduce.class);

  FileInputFormat.setInputPaths(outputJob, new Path(inputPath));
  FileOutputFormat.setOutputPath(outputJob, new Path(outputPathforFinalSortedPagerank));
  FileOutputFormat.setCompressOutput(outputJob, false);

  outputJob.setJobName("Final Pagerank Output");
  sLogger.info("Starting final sorting job -> this will output a single file");
  outputJob.setNumMapTasks(1);
  outputJob.setNumReduceTasks(1);

  outputJob.setOutputKeyClass(DoubleWritable.class);
  outputJob.setOutputValueClass(Text.class);

  outputJob.setInputFormat(SequenceFileInputFormat.class);
  outputJob.setOutputFormat(TextOutputFormat.class);

  outputJob.setMapperClass(OutputMapper.class);
  outputJob.setReducerClass(OutputReducer.class);

  // Reverse the usual key sort order for the final output
  outputJob.setOutputKeyComparatorClass(ReverseComparator.class);

  startTime = System.currentTimeMillis();

  // Run Sorting and Output Job
  RunningJob runningSortingJob = JobClient.runJob(outputJob);

  // Delete the intermediary files created
  Path tempFinalSortedInputPath = new Path(inputPath);
  FileSystem.get(tempFinalSortedInputPath.toUri(), conf1).delete(tempFinalSortedInputPath, true);

  sLogger.info(
      "Final Sorting Job Finished in "
          + (System.currentTimeMillis() - startTime) / 1000.0
          + " seconds");

  sLogger.info(
      "The program lasted "
          + (System.currentTimeMillis() - startWholeProgram) / 1000.0
          + "s ("
          + (System.currentTimeMillis() - startWholeProgram) / 60000.0
          + " mins)");

  return 0;
}
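/*
 * ReverseComparator is referenced by the output job above but not shown in this excerpt. Since
 * the job's output keys are DoubleWritable pageranks and the goal is a descending "final sorted"
 * listing, a plausible sketch is a WritableComparator that negates the natural DoubleWritable
 * order. This is an assumption about the class, not its actual implementation.
 */
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class ReverseComparator extends WritableComparator {

  protected ReverseComparator() {
    // Pass true so key instances are created and the object-based compare() below is usable.
    super(DoubleWritable.class, true);
  }

  @Override
  @SuppressWarnings("rawtypes")
  public int compare(WritableComparable a, WritableComparable b) {
    // Invert the natural ascending order so the highest pagerank sorts first.
    return -super.compare(a, b);
  }
}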