/**
   * Reads back the evaluations.
   *
   * @param fs File System
   * @param conf Job configuration
   * @param outpath output <code>Path</code>
   * @param evaluations <code>List&lt;CDFitness&gt;</code> that contains the evaluated fitness for
   *     each candidate from the input population, sorted in the same order as the candidates.
   * @throws IOException if the job output cannot be merged or read
   */
  private static void importEvaluations(
      FileSystem fs, JobConf conf, Path outpath, List<CDFitness> evaluations) throws IOException {
    Sorter sorter = new Sorter(fs, LongWritable.class, CDFitness.class, conf);

    // merge and sort the outputs
    Path[] outfiles = OutputUtils.listOutputFiles(fs, outpath);
    Path output = new Path(outpath, "output.sorted");
    sorter.merge(outfiles, output);

    // import the evaluations, closing the reader even if reading fails
    LongWritable key = new LongWritable();
    CDFitness value = new CDFitness();
    Reader reader = new Reader(fs, output, conf);

    try {
      while (reader.next(key, value)) {
        evaluations.add(new CDFitness(value));
      }
    } finally {
      reader.close();
    }
  }
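A minimal, self-contained sketch of the same merge-then-read pattern, assuming the classic org.apache.hadoop.io.SequenceFile API used above (the class and method names below are illustrative, not part of the original):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

class SortedSequenceFileRead {

  /** Merges the given sequence files into one sorted file, then reads the values back in key order. */
  static List<String> readSorted(Configuration conf, Path[] inputs, Path workDir) throws IOException {
    FileSystem fs = FileSystem.get(conf);

    // Merge and sort the unsorted per-task outputs into a single file.
    SequenceFile.Sorter sorter =
        new SequenceFile.Sorter(fs, LongWritable.class, Text.class, conf);
    Path merged = new Path(workDir, "merged.sorted");
    sorter.merge(inputs, merged);

    // Stream the merged file back; records now arrive in ascending key order.
    List<String> values = new ArrayList<String>();
    LongWritable key = new LongWritable();
    Text value = new Text();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, merged, conf);
    try {
      while (reader.next(key, value)) {
        values.add(value.toString());
      }
    } finally {
      reader.close();
    }
    return values;
  }
}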
Example #2
  private void cloneOutput() throws IOException {

    List<FileStatus> listStatus = getOutputMappings();

    /*
     * Initialize to an empty list, in which case the first swap() will be a no-op. The reference is then replaced with a real
     * list, which is used in the subsequent iterations.
     */
    List<Path> crushInput = emptyList();

    Text srcFile = new Text();
    Text crushOut = new Text();
    Text prevCrushOut = new Text();

    for (FileStatus partFile : listStatus) {
      Path path = partFile.getPath();

      Reader reader = new Reader(fs, path, fs.getConf());

      try {
        while (reader.next(srcFile, crushOut)) {
          if (!crushOut.equals(prevCrushOut)) {
            swap(crushInput, prevCrushOut.toString());

            prevCrushOut.set(crushOut);
            crushInput = new LinkedList<Path>();
          }

          crushInput.add(new Path(srcFile.toString()));
        }
      } finally {
        try {
          reader.close();
        } catch (IOException e) {
          LOG.warn("Trapped exception when closing " + path, e);
        }
      }

      swap(crushInput, prevCrushOut.toString());
    }
  }
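The loop above relies on the mapping records being grouped by crush output file: consecutive records with the same value are batched into one input list before swap() is invoked, with a final flush for the last batch. A simplified analogue of that grouping pattern over plain strings (the class and method names are illustrative, not from the original):

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

class ConsecutiveGrouping {

  /** Groups [src, out] pairs by out, relying on pairs with equal out being adjacent. */
  static Map<String, List<String>> group(List<String[]> pairs) {
    Map<String, List<String>> grouped = new LinkedHashMap<String, List<String>>();

    String prevOut = "";
    List<String> batch = new ArrayList<String>();

    for (String[] pair : pairs) {
      String src = pair[0];
      String out = pair[1];

      if (!out.equals(prevOut)) {
        if (!batch.isEmpty()) {
          grouped.put(prevOut, batch); // analogous to swap(crushInput, prevCrushOut.toString())
        }
        prevOut = out;
        batch = new ArrayList<String>();
      }

      batch.add(src);
    }

    if (!batch.isEmpty()) {
      grouped.put(prevOut, batch); // final flush, like the trailing swap() above
    }

    return grouped;
  }
}

Example #3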
  @Test
  public void testSequenceFile() throws Exception {
    populateFile();

    Pipeline p = Pipeline.create(pipelineOptions.getOptions());
    @SuppressWarnings("unchecked")
    Class<? extends FileInputFormat<IntWritable, Text>> inputFormatClass =
        (Class<? extends FileInputFormat<IntWritable, Text>>)
            (Class<?>) SequenceFileInputFormat.class;
    HadoopIO.Read.Bound<IntWritable, Text> read =
        HadoopIO.Read.from(
            inputFile.getAbsolutePath(), inputFormatClass, IntWritable.class, Text.class);
    PCollection<KV<IntWritable, Text>> input =
        p.apply(read)
            .setCoder(
                KvCoder.of(WritableCoder.of(IntWritable.class), WritableCoder.of(Text.class)));
    @SuppressWarnings("unchecked")
    Class<? extends FileOutputFormat<IntWritable, Text>> outputFormatClass =
        (Class<? extends FileOutputFormat<IntWritable, Text>>)
            (Class<?>) TemplatedSequenceFileOutputFormat.class;
    @SuppressWarnings("unchecked")
    HadoopIO.Write.Bound<IntWritable, Text> write =
        HadoopIO.Write.to(
            outputFile.getAbsolutePath(), outputFormatClass, IntWritable.class, Text.class);
    input.apply(write.withoutSharding());
    p.run();

    IntWritable key = new IntWritable();
    Text value = new Text();
    try (Reader reader =
        new Reader(new Configuration(), Reader.file(new Path(outputFile.toURI())))) {
      int i = 0;
      while (reader.next(key, value)) {
        assertEquals(i, key.get());
        assertEquals("value-" + i, value.toString());
        i++;
      }
    }
  }
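populateFile() is not shown here; a plausible implementation (an assumption, not part of the original test) writes the key/value pairs the assertions expect, using SequenceFile.createWriter:

  // Requires org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.Path,
  // and org.apache.hadoop.io.{IntWritable, SequenceFile, Text}.
  private void populateFile() throws IOException {
    try (SequenceFile.Writer writer =
        SequenceFile.createWriter(
            new Configuration(),
            SequenceFile.Writer.file(new Path(inputFile.toURI())),
            SequenceFile.Writer.keyClass(IntWritable.class),
            SequenceFile.Writer.valueClass(Text.class))) {
      // Keys 0..N-1 paired with "value-0".."value-N-1"; the record count here is arbitrary.
      for (int i = 0; i < 10; i++) {
        writer.append(new IntWritable(i), new Text("value-" + i));
      }
    }
  }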
Example #4
  /**
   * Moves the skipped files to the output directory. Called when operating in normal (non-clone)
   * mode.
   */
  private void moveOutput() throws IOException {

    List<FileStatus> listStatus = getOutputMappings();

    Text srcFile = new Text();
    Text crushOut = new Text();

    Set<String> crushOutputFiles = new HashSet<String>(nBuckets);

    for (FileStatus partFile : listStatus) {
      Path path = partFile.getPath();

      Reader reader = new Reader(fs, path, fs.getConf());

      try {
        while (reader.next(srcFile, crushOut)) {
          crushOutputFiles.add(new Path(crushOut.toString()).toUri().getPath());
        }
      } finally {
        try {
          reader.close();
        } catch (IOException e) {
          LOG.warn("Trapped exception when closing " + path, e);
        }
      }
    }

    assert crushOutputFiles.size() == nBuckets;

    /*
     * The crush output files will appear in a subdirectory of the output directory. The subdirectory will be the full path of
     * the input directory that was crushed. E.g.
     *
     * Crush input:
     * /user/me/input/dir1/file1
     * /user/me/input/dir1/file2
     * /user/me/input/dir2/file3
     * /user/me/input/dir2/file4
     * /user/me/input/dir3/dir4/file5
     * /user/me/input/dir3/dir4/file6
     *
     * Crush output:
     * /user/me/output/user/me/input/dir1/crushed_file ...
     * /user/me/output/user/me/input/dir2/crushed_file ...
     * /user/me/output/user/me/input/dir3/dir4/crushed_file ...
     *
     * We need to collapse this down to:
     * /user/me/output/dir1/crushed_file ...
     * /user/me/output/dir2/crushed_file ...
     * /user/me/output/dir3/dir4/crushed_file ...
     */
    String srcDirName = fs.makeQualified(srcDir).toUri().getPath();

    String destName = fs.makeQualified(dest).toUri().getPath();
    String partToReplace = fs.makeQualified(outDir).toUri().getPath() + "/crush" + srcDirName;

    print(Verbosity.INFO, "\n\nCopying crush files to " + destName);

    for (String crushOutputFile : crushOutputFiles) {
      Path srcPath = new Path(crushOutputFile);
      Path destPath =
          new Path(destName + crushOutputFile.substring(partToReplace.length())).getParent();

      rename(srcPath, destPath, null);
    }

    print(Verbosity.INFO, "\n\nMoving skipped files to " + destName);

    /*
     * Also move the files that were not crushed into the output dir, so that it contains all the data that was in the input
     * dir; the only difference is that the output dir holds fewer files.
     */
    for (String name : skippedFiles) {
      Path srcPath = new Path(name);
      Path destPath = new Path(destName + name.substring(srcDirName.length())).getParent();

      rename(srcPath, destPath, null);
    }
  }
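The rename loop above reduces the collapsing to a prefix substitution on the qualified path. A tiny illustration with made-up values (all paths here are examples, not from an actual run):

  // partToReplace is outDir + "/crush" + srcDirName, per the code above.
  String partToReplace = "/user/me/output/crush/user/me/input";
  String destName = "/user/me/output";

  String crushOutputFile = "/user/me/output/crush/user/me/input/dir1/crushed_file";

  // Strip the temp prefix, then re-root the remainder under the destination:
  String collapsed = destName + crushOutputFile.substring(partToReplace.length());
  // collapsed == "/user/me/output/dir1/crushed_file"

Example #5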
  /** Runs this tool. */
  public int run(String[] args) throws Exception {

    long startWholeProgram = System.currentTimeMillis();
    boolean hasConverged = false;
    int counter = 0;
    Reader sequenceFileReader;

    // Must have four arguments
    if (args.length != 4) {
      MapReduce.printUsage();
      return -1;
    }

    // Set input and output file paths
    String inputPathToAdjacencyTextFile = args[0];
    String outputPathToNodeSequenceFileFormat = args[1];

    // Configure Job 1:
    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);
    int reduceTasksSetup = 1;

    // Configure Job Setup
    JobConf conf1 = new JobConf(MapReduce.class);
    conf1.setInt("numberOfNodes", numberNodes);
    conf1.setJobName("Setup Job");
    conf1.setNumMapTasks(mapTasks);
    conf1.setNumReduceTasks(reduceTasksSetup);

    FileInputFormat.setInputPaths(conf1, new Path(inputPathToAdjacencyTextFile));
    FileOutputFormat.setOutputPath(conf1, new Path(outputPathToNodeSequenceFileFormat));
    FileOutputFormat.setCompressOutput(conf1, false);

    conf1.setOutputKeyClass(Text.class);
    conf1.setOutputValueClass(MapReduceNode.class);

    conf1.setOutputFormat(SequenceFileOutputFormat.class);
    conf1.setInputFormat(TextInputFormat.class);

    conf1.setMapperClass(ConfigureMapper.class);
    conf1.setReducerClass(ConfigureReducer.class);

    // Delete the output directory if it exists already
    Path tempDir = new Path(outputPathToNodeSequenceFileFormat);
    FileSystem.get(tempDir.toUri(), conf1).delete(tempDir, true);

    long startTime = System.currentTimeMillis();

    // Run Configure Job
    RunningJob job = JobClient.runJob(conf1);

    sLogger.info(
        "Config Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    String inputPath = args[1];
    String outputPath = null;
    String outputPathForNormalisedPagerank = null;
    String outputPathforFinalSortedPagerank = null;

    // Page Rank Calculation Job 2 (Iterative until convergence has been reached)

    while (!hasConverged) {
      System.out.println("*** ITERATION " + counter + ", number of nodes: " + numberNodes);
      counter++;

      sLogger.info("***** ITERATION " + counter);

      outputPath = args[1] + counter;

      // Pure Page Rank Calculation Job Setup
      JobConf pageRankJob = new JobConf(getConf(), MapReduce.class);
      pageRankJob.setInt("numberOfNodes", numberNodes);

      FileInputFormat.setInputPaths(pageRankJob, new Path(inputPath));
      FileOutputFormat.setOutputPath(pageRankJob, new Path(outputPath));
      FileOutputFormat.setCompressOutput(pageRankJob, false);

      pageRankJob.setJobName("PP Iteration " + counter);
      pageRankJob.setNumMapTasks(mapTasks);
      pageRankJob.setNumReduceTasks(reduceTasks);

      pageRankJob.setOutputKeyClass(Text.class);
      pageRankJob.setOutputValueClass(MapReduceNode.class);

      pageRankJob.setOutputFormat(SequenceFileOutputFormat.class);
      pageRankJob.setInputFormat(SequenceFileInputFormat.class);

      pageRankJob.setMapperClass(MapReduce.PageRankCalcMapper.class);
      pageRankJob.setReducerClass(MapReduce.PageRankCalcReducer.class);

      // Delete the output directory if it exists already
      Path tempPageRankDir = new Path(outputPath);
      FileSystem.get(tempPageRankDir.toUri(), conf1).delete(tempPageRankDir, true);

      startTime = System.currentTimeMillis();

      // Run Pure Page Rank Calculation Job
      RunningJob runningJob = JobClient.runJob(pageRankJob);

      sLogger.info(
          "PP Job "
              + counter
              + " finished in "
              + (System.currentTimeMillis() - startTime) / 1000.0
              + " seconds");

      // Delete the input directory if it exists already
      Path tempInputPageRankDir = new Path(inputPath);
      FileSystem.get(tempInputPageRankDir.toUri(), conf1).delete(tempInputPageRankDir, true);

      // Set the output path of this iteration as the input path of the next iteration
      inputPath = outputPath;

      // Check for convergence after every five iterations
      if (counter % 5 == 0) {

        Configuration conf = getConf();

        Path cFile = new Path(outputPath, "part-00000");
        sLogger.info("Attempting to open file: " + cFile);
        System.out.println("Attempting to open file: " + cFile);

        FileSystem fs = FileSystem.get(cFile.toUri(), conf);

        sequenceFileReader = new Reader(fs, cFile, conf);

        try {
          // Sample the first five nodes; declare convergence only if every one of them has converged.
          hasConverged = true;

          for (int i = 0; i < 5; i++) {
            MapReduceNode readValue = new MapReduceNode();
            Text readKey = new Text();

            if (!sequenceFileReader.next(readKey, readValue) || !readValue.hasConverged()) {
              hasConverged = false;
              break;
            }
          }
        } finally {
          sequenceFileReader.close();
        }
      }
      if (counter == 75) {
        sLogger.info("****************** Exiting (purposefully) after 75th iteration");
        hasConverged = true;
      }
    }

    // Normalised Page Rank Calculation Job 3
    outputPathForNormalisedPagerank = args[1] + "normalizedPageRank";

    // Normalised Page Rank Calculation Job Setup
    JobConf normalizationJob = new JobConf(getConf(), MapReduce.class);

    FileInputFormat.setInputPaths(normalizationJob, new Path(inputPath));
    FileOutputFormat.setOutputPath(normalizationJob, new Path(outputPathForNormalisedPagerank));
    FileOutputFormat.setCompressOutput(normalizationJob, false);

    normalizationJob.setJobName("Normalised Pagerank Output");
    normalizationJob.setNumMapTasks(mapTasks);
    normalizationJob.setNumReduceTasks(1);

    normalizationJob.setOutputKeyClass(Text.class);
    normalizationJob.setOutputValueClass(DoubleWritable.class);

    normalizationJob.setInputFormat(SequenceFileInputFormat.class);
    normalizationJob.setOutputFormat(SequenceFileOutputFormat.class);

    normalizationJob.setMapperClass(NormalisationMapper.class);
    normalizationJob.setReducerClass(NormalisationReducer.class);

    // Delete the output directory if it exists already
    Path tempUpdatedPageRankDir = new Path(outputPathForNormalisedPagerank);
    FileSystem.get(tempUpdatedPageRankDir.toUri(), conf1).delete(tempUpdatedPageRankDir, true);

    startTime = System.currentTimeMillis();

    // Run Normalised Page Rank Calculation Job
    RunningJob runningUpdateJob = JobClient.runJob(normalizationJob);

    sLogger.info(
        "Normalisation Job Finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0
            + " seconds");

    // Sorting and Output Job 4

    // Delete the intermediary files created
    Path tempNormalizationInputPath = new Path(inputPath);
    FileSystem.get(tempNormalizationInputPath.toUri(), conf1).delete(tempNormalizationInputPath, true);

    inputPath = outputPathForNormalisedPagerank;
    outputPathforFinalSortedPagerank = args[1] + "FinalSortedPageRank";

    // Sorting and Output Job Setup
    JobConf outputJob = new JobConf(getConf(), MapReduce.class);

    FileInputFormat.setInputPaths(outputJob, new Path(inputPath));
    FileOutputFormat.setOutputPath(outputJob, new Path(outputPathforFinalSortedPagerank));
    FileOutputFormat.setCompressOutput(outputJob, false);

    outputJob.setJobName("Final Pagerank Output");
    sLogger.info("Starting final sotirng job -> this will output a single file");
    outputJob.setNumMapTasks(1);
    outputJob.setNumReduceTasks(1);

    outputJob.setOutputKeyClass(DoubleWritable.class);
    outputJob.setOutputValueClass(Text.class);

    outputJob.setInputFormat(SequenceFileInputFormat.class);
    outputJob.setOutputFormat(TextOutputFormat.class);

    outputJob.setMapperClass(OutputMapper.class);
    outputJob.setReducerClass(OutputReducer.class);

    outputJob.setOutputKeyComparatorClass(ReverseComparator.class);

    startTime = System.currentTimeMillis();

    // Run Sorting and Output Job
    RunningJob runningSortingJob = JobClient.runJob(outputJob);

    // Delete the intermediary files created
    Path tempFinalSortedInputPath = new Path(inputPath);
    FileSystem.get(tempFinalSortedInputPath.toUri(), conf1).delete(tempFinalSortedInputPath, true);

    sLogger.info(
        "Final Sorting Job Finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0
            + " seconds");

    sLogger.info(
        "The program lasted "
            + (System.currentTimeMillis() - startWholeProgram) / 1000.0
            + "s ("
            + (System.currentTimeMillis() - startWholeProgram) / 60000.0
            + " mins)");

    return 0;
  }
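A hypothetical driver for this tool, assuming the enclosing MapReduce class implements org.apache.hadoop.util.Tool and has a no-arg constructor (both suggested by run(String[]) and getConf(), but not shown above):

  // Requires org.apache.hadoop.conf.Configuration and org.apache.hadoop.util.ToolRunner.
  public static void main(String[] args) throws Exception {
    // Exits with run()'s return code: 0 on success, -1 on bad usage.
    int exitCode = ToolRunner.run(new Configuration(), new MapReduce(), args);
    System.exit(exitCode);
  }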