Example No. 1
  public int run(String[] args) throws Exception {

    if (args.length != 5) {
      printUsage();
      return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];

    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    String stoplistPath = args[4];

    sLogger.info("Tool: AFormatter");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf conf = new JobConf(AFormatterWG.class);
    conf.setJobName("Authority Formatter -- Web Graph");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    // conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HITSNode.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setCompressMapOutput(true);
    conf.setSpeculativeExecution(false);
    // InputSampler.Sampler<IntWritable, Text> sampler = new
    // InputSampler.RandomSampler<IntWritable, Text>(0.1, 10, 10);
    // InputSampler.writePartitionFile(conf, sampler);
    // conf.setPartitionerClass(TotalOrderPartitioner.class);
    conf.setMapperClass(AFormatMapperIMC.class);
    conf.setCombinerClass(AFormatReducer.class);
    conf.setReducerClass(AFormatReducer.class);

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    Path stopList = new Path(stoplistPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    sLogger.info("Starting job");
    DistributedCache.addCacheFile(stopList.toUri(), conf);
    JobClient.runJob(conf);
    sLogger.info(
        "Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
  }
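A minimal driver for the formatter tool above might look like the sketch below. It assumes the enclosing class (called AFormatter here, after the log message; the snippet itself only names AFormatterWG) extends Configured and implements Tool; the main method is not part of the original code.

  public static void main(String[] args) throws Exception {
    // ToolRunner parses generic Hadoop options and then calls run(String[]) on the Tool instance
    int exitCode = ToolRunner.run(new Configuration(), new AFormatter(), args);
    System.exit(exitCode);
  }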
  /** Test compressible {@link GridmixRecord}. */
  @Test
  public void testCompressibleGridmixRecord() throws IOException {
    JobConf conf = new JobConf();
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
    CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);

    FileSystem lfs = FileSystem.getLocal(conf);
    int dataSize = 1024 * 1024 * 10; // 10 MB
    float ratio = 0.357F;

    // define the test's root temp directory
    Path rootTempDir =
        new Path(System.getProperty("test.build.data", "/tmp"))
            .makeQualified(lfs.getUri(), lfs.getWorkingDirectory());

    Path tempDir = new Path(rootTempDir, "TestPossiblyCompressibleGridmixRecord");
    lfs.delete(tempDir, true);

    // define a compressible GridmixRecord
    GridmixRecord record = new GridmixRecord(dataSize, 0);
    record.setCompressibility(true, ratio); // enable compression

    conf.setClass(FileOutputFormat.COMPRESS_CODEC, GzipCodec.class, CompressionCodec.class);
    org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput(conf, true);

    // write the record to a file
    Path recordFile = new Path(tempDir, "record");
    OutputStream outStream =
        CompressionEmulationUtil.getPossiblyCompressedOutputStream(recordFile, conf);
    DataOutputStream out = new DataOutputStream(outStream);
    record.write(out);
    out.close();
    outStream.close();

    // open the compressed stream for reading
    Path actualRecordFile = recordFile.suffix(".gz");
    InputStream in =
        CompressionEmulationUtil.getPossiblyDecompressedInputStream(actualRecordFile, conf, 0);

    // get the compressed file size
    long compressedFileSize = lfs.listStatus(actualRecordFile)[0].getLen();

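    // read the record back through the (possibly) decompressing stream and check its size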
    GridmixRecord recordRead = new GridmixRecord();
    recordRead.readFields(new DataInputStream(in));

    assertEquals(
        "Record size mismatch in a compressible GridmixRecord", dataSize, recordRead.getSize());
    assertTrue(
        "Failed to generate a compressible GridmixRecord",
        recordRead.getSize() > compressedFileSize);

    // check if the record can generate data with the desired compression ratio
    float seenRatio = ((float) compressedFileSize) / dataSize;
    assertEquals(
        CompressionEmulationUtil.standardizeCompressionRatio(ratio),
        CompressionEmulationUtil.standardizeCompressionRatio(seenRatio),
        1.0D);
  }
Example No. 3
  private void createPageRankLinksDirectly() throws IOException, URISyntaxException {

    log.info("Creating PageRank links", null);

    JobConf job = new JobConf(PagerankData.class);
    String jobname = "Create pagerank links";

    Path fout = new Path(options.getResultPath(), EDGES_DIR_NAME);

    job.setJobName(jobname);
    setPageRankLinksOptions(job);

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    //		job.setMapOutputKeyClass(LongWritable.class);
    //		job.setMapOutputValueClass(Text.class);

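    // Map-only job: the mapper writes the edge list directly, so no reducers are needed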
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, dummy.getPath());
    job.setInputFormat(NLineInputFormat.class);

    job.setMapperClass(DummyToPageRankLinksMapper.class);

    if (options.isSequenceOut()) {
      job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
      job.setOutputFormat(TextOutputFormat.class);
    }

    if (null != options.getCodecClass()) {
      job.set("mapred.output.compression.type", "BLOCK");
      job.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
      FileOutputFormat.setCompressOutput(job, true);
      FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass());
    }

    FileOutputFormat.setOutputPath(job, fout);

    log.info("Running Job: " + jobname);
    log.info("Dummy file " + dummy.getPath() + " as input");
    log.info("Edges file " + fout + " as output");
    JobClient.runJob(job);
    log.info("Finished Running Job: " + jobname);
  }
Example No. 4
  /** Runs this tool. */
  public int run(String[] args) throws Exception {
    if (args.length != 3) {
      printUsage();
      return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String mappingFile = args[2];

    LOG.info("Tool: " + DemoCountTrecDocuments.class.getCanonicalName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output dir: " + outputPath);
    LOG.info(" - docno mapping file: " + mappingFile);

    JobConf conf = new JobConf(getConf(), DemoCountTrecDocuments.class);
    conf.setJobName(DemoCountTrecDocuments.class.getSimpleName());

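    // Map-only job: no reducers are needed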
    conf.setNumReduceTasks(0);

    // Pass in the class name as a String; this makes the mapper general enough to load
    // any collection of Indexable objects whose docid/docno mapping is specified by a
    // DocnoMapping object.
    conf.set("DocnoMappingClass", TrecDocnoMapping.class.getCanonicalName());

    // Put the mapping file in the distributed cache so each map worker will have it.
    DistributedCache.addCacheFile(new URI(mappingFile), conf);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(TrecDocumentInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
  }
  /**
   * Test that {@link FileQueue} can identify a compressed file and provide readers that extract
   * uncompressed data only if input compression is enabled.
   */
  @Test
  public void testFileQueueDecompression() throws IOException {
    JobConf conf = new JobConf();
    FileSystem lfs = FileSystem.getLocal(conf);
    String inputLine = "Hi Hello!";

    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
    CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);
    org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput(conf, true);
    org.apache.hadoop.mapred.FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);

    // define the test's root temp directory
    Path rootTempDir =
        new Path(System.getProperty("test.build.data", "/tmp"))
            .makeQualified(lfs.getUri(), lfs.getWorkingDirectory());

    Path tempDir = new Path(rootTempDir, "TestFileQueueDecompression");
    lfs.delete(tempDir, true);

    // create a compressed file
    Path compressedFile = new Path(tempDir, "test");
    OutputStream out =
        CompressionEmulationUtil.getPossiblyCompressedOutputStream(compressedFile, conf);
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out));
    writer.write(inputLine);
    writer.close();

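    // the compressed file is written with the codec's ".gz" extension appended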
    compressedFile = compressedFile.suffix(".gz");
    // now read back the data from the compressed stream using FileQueue
    long fileSize = lfs.listStatus(compressedFile)[0].getLen();
    CombineFileSplit split =
        new CombineFileSplit(new Path[] {compressedFile}, new long[] {fileSize});
    FileQueue queue = new FileQueue(split, conf);
    byte[] bytes = new byte[inputLine.getBytes().length];
    queue.read(bytes);
    queue.close();
    String readLine = new String(bytes);
    assertEquals("Compression/Decompression error", inputLine, readLine);
  }
Example No. 6
  private void createPageRankNodesDirectly() throws IOException {

    log.info("Creating PageRank nodes...", null);

    Path fout = new Path(options.getResultPath(), VERTICALS_DIR_NAME);

    JobConf job = new JobConf(PagerankData.class);
    String jobname = "Create pagerank nodes";

    job.setJobName(jobname);
    setPageRankNodesOptions(job);

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, dummy.getPath());
    job.setInputFormat(NLineInputFormat.class);

    if (balance) {
      /*
       * Balance the output order of nodes so the PageRank benchmark run does not
       * suffer from potential data skew.
       */
      job.setMapOutputKeyClass(LongWritable.class);
      job.setMapOutputValueClass(NullWritable.class);

      job.setMapperClass(BalancedLinkNodesMapper.class);
      job.setReducerClass(BalancedLinkNodesReducer.class);
      //			job.setPartitionerClass(ModulusPartitioner.class);

      if (options.getNumReds() > 0) {
        job.setNumReduceTasks(options.getNumReds());
      } else {
        job.setNumReduceTasks(Utils.getMaxNumReds());
      }
    } else {
      job.setMapOutputKeyClass(Text.class);
      job.setMapperClass(DummyToNodesMapper.class);
      job.setNumReduceTasks(0);
    }

    if (options.isSequenceOut()) {
      job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
      job.setOutputFormat(TextOutputFormat.class);
    }

    if (null != options.getCodecClass()) {
      job.set("mapred.output.compression.type", "BLOCK");
      job.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
      FileOutputFormat.setCompressOutput(job, true);
      FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass());
    }

    FileOutputFormat.setOutputPath(job, fout);

    log.info("Running Job: " + jobname);
    log.info("Dummy file " + dummy.getPath() + " as input");
    log.info("Vertices file " + fout + " as output");
    JobClient.runJob(job);
    log.info("Finished Running Job: " + jobname);
  }
  /** Runs this tool. */
  public int run(String[] args) throws Exception {

    long startWholeProgram = System.currentTimeMillis();
    boolean hasConverged = false;
    int counter = 0;
    Reader sequenceFilereader;

    // Must have four arguments
    if (args.length != 4) {
      MapReduce.printUsage();
      return -1;
    }

    // Set input and output file paths
    String inputPathToAdjacencyTextFile = args[0];
    String outputPathToNodeSequenceFileFormat = args[1];

    // Configure Job 1:
    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);
    int reduceTasksSetup = 1;

    // Configure Job Setup
    JobConf conf1 = new JobConf(MapReduce.class);
    conf1.setInt("numberOfNodes", numberNodes);
    conf1.setJobName("Setup Job");
    conf1.setNumMapTasks(mapTasks);
    conf1.setNumReduceTasks(reduceTasksSetup);

    FileInputFormat.setInputPaths(conf1, new Path(inputPathToAdjacencyTextFile));
    FileOutputFormat.setOutputPath(conf1, new Path(outputPathToNodeSequenceFileFormat));
    FileOutputFormat.setCompressOutput(conf1, false);

    conf1.setOutputKeyClass(Text.class);
    conf1.setOutputValueClass(MapReduceNode.class);

    conf1.setOutputFormat(SequenceFileOutputFormat.class);
    conf1.setInputFormat(TextInputFormat.class);

    conf1.setMapperClass(ConfigureMapper.class);
    conf1.setReducerClass(ConfigureReducer.class);

    // Delete the output directory if it exists already
    Path tempDir = new Path(outputPathToNodeSequenceFileFormat);
    FileSystem.get(tempDir.toUri(), conf1).delete(tempDir, true);

    long startTime = System.currentTimeMillis();

    // Run Configure Job
    RunningJob job = JobClient.runJob(conf1);

    sLogger.info(
        "Config Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    String inputPath = args[1];
    String outputPath = null;
    String outputPathForNormalisedPagerank = null;
    String outputPathforFinalSortedPagerank = null;

    // Page Rank Calculation Job 2 (Iterative until convergence has been reached)

    while (!hasConverged) {
      System.out.println("*** ITERATION " + counter + ", number of nodes: " + numberNodes);
      counter++;

      sLogger.info("***** ITERATION " + counter);

      outputPath = args[1] + counter;

      // Pure Page Rank Calculation Job Setup
      JobConf pageRankJob = new JobConf(getConf(), MapReduce.class);
      pageRankJob.setInt("numberOfNodes", numberNodes);

      FileInputFormat.setInputPaths(pageRankJob, new Path(inputPath));
      FileOutputFormat.setOutputPath(pageRankJob, new Path(outputPath));
      FileOutputFormat.setCompressOutput(pageRankJob, false);

      pageRankJob.setJobName("PP Iteration " + counter);
      pageRankJob.setNumMapTasks(mapTasks);
      pageRankJob.setNumReduceTasks(reduceTasks);

      pageRankJob.setOutputKeyClass(Text.class);
      pageRankJob.setOutputValueClass(MapReduceNode.class);

      pageRankJob.setOutputFormat(SequenceFileOutputFormat.class);
      pageRankJob.setInputFormat(SequenceFileInputFormat.class);

      pageRankJob.setMapperClass(MapReduce.PageRankCalcMapper.class);
      pageRankJob.setReducerClass(MapReduce.PageRankCalcReducer.class);

      // Delete the output directory if it exists already
      Path tempPageRankDir = new Path(outputPath);
      FileSystem.get(tempDir.toUri(), conf1).delete(tempPageRankDir, true);

      startTime = System.currentTimeMillis();

      // Run Pure Page Rank Calculation Job
      RunningJob runningJob = JobClient.runJob(pageRankJob);

      sLogger.info(
          "PP Job "
              + counter
              + " Finished in "
              + (System.currentTimeMillis() - startTime) / 1000.0
              + " seconds");

      // Delete the input directory if it exists already
      Path tempInputPageRankDir = new Path(inputPath);
      FileSystem.get(tempDir.toUri(), conf1).delete(tempInputPageRankDir, true);

      // Set the output path of this iteration to be the input path for the next iteration
      inputPath = outputPath;

      // Check for convergence after every five iterations
      if (counter % 5 == 0) {

        Configuration conf = getConf();
        if (outputPath != null) {
          sLogger.info("Attempting to open file: " + outputPath + File.separator + "part-00000");
          System.out.println(
              "Attempting to open file: " + outputPath + File.separator + "part-00000");
        } else {
          sLogger.info("OUTPUT PATH IS NULL");
          System.out.println("OUTPUT PATH IS NULL");
        }
        Path cFile = new Path(outputPath + File.separator + "part-00000");
        FileSystem fs = FileSystem.get(cFile.toUri(), conf);

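        // Sample the first five records of part-00000 and check their convergence flags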
        sequenceFilereader = new Reader(fs, cFile, conf);

        for (int i = 0; i < 5; i++) {
          MapReduceNode readValue = new MapReduceNode();
          Text readKey = new Text();

          sequenceFilereader.next(readKey, readValue);
          if (!(readValue.hasConverged())) {
            break;
          }

          if (i == 4) {
            hasConverged = true;
          }
        }
        sequenceFilereader.close();
      }
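      // Hard cap: stop after 75 iterations even if convergence has not been reached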
      if (counter == 75) {
        sLogger.info("****************** Exiting (purposefully) after 75th iteration");
        hasConverged = true;
      }
    }

    // Normalised Page Rank Calculation Job 3
    outputPathForNormalisedPagerank = args[1] + "normalizedPageRank";

    // Normalised Page Rank Calculation Job Setup
    JobConf normalizationJob = new JobConf(getConf(), MapReduce.class);

    FileInputFormat.setInputPaths(normalizationJob, new Path(inputPath));
    FileOutputFormat.setOutputPath(normalizationJob, new Path(outputPathForNormalisedPagerank));
    FileOutputFormat.setCompressOutput(normalizationJob, false);

    normalizationJob.setJobName("Normalised Pagerank Output");
    normalizationJob.setNumMapTasks(mapTasks);
    normalizationJob.setNumReduceTasks(1);

    normalizationJob.setOutputKeyClass(Text.class);
    normalizationJob.setOutputValueClass(DoubleWritable.class);

    normalizationJob.setInputFormat(SequenceFileInputFormat.class);
    normalizationJob.setOutputFormat(SequenceFileOutputFormat.class);

    normalizationJob.setMapperClass(NormalisationMapper.class);
    normalizationJob.setReducerClass(NormalisationReducer.class);

    // Delete the output directory if it exists already
    Path tempUpdatedPageRankDir = new Path(outputPathForNormalisedPagerank);
    FileSystem.get(tempDir.toUri(), conf1).delete(tempUpdatedPageRankDir, true);

    startTime = System.currentTimeMillis();

    // Run Normalised Page Rank Calculation Job
    RunningJob runningUpdateJob = JobClient.runJob(normalizationJob);

    sLogger.info(
        "Normalisation Job Finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0
            + " seconds");

    // Sorting and Output Job 4

    // Delete the intermediary files created
    Path tempNormalizationInputPath = new Path(inputPath);
    FileSystem.get(tempDir.toUri(), conf1).delete(tempNormalizationInputPath, true);

    inputPath = outputPathForNormalisedPagerank;
    outputPathforFinalSortedPagerank = args[1] + "FinalSortedPageRank";

    // Sorting and Output Job Setup
    JobConf outputJob = new JobConf(getConf(), MapReduce.class);

    FileInputFormat.setInputPaths(outputJob, new Path(inputPath));
    FileOutputFormat.setOutputPath(outputJob, new Path(outputPathforFinalSortedPagerank));
    FileOutputFormat.setCompressOutput(outputJob, false);

    outputJob.setJobName("Final Pagerank Output");
    sLogger.info("Starting final sotirng job -> this will output a single file");
    outputJob.setNumMapTasks(1);
    outputJob.setNumReduceTasks(1);

    outputJob.setOutputKeyClass(DoubleWritable.class);
    outputJob.setOutputValueClass(Text.class);

    outputJob.setInputFormat(SequenceFileInputFormat.class);
    outputJob.setOutputFormat(TextOutputFormat.class);

    outputJob.setMapperClass(OutputMapper.class);
    outputJob.setReducerClass(OutputReducer.class);

    outputJob.setOutputKeyComparatorClass(ReverseComparator.class);

    startTime = System.currentTimeMillis();

    // Run Sorting and Output Job
    RunningJob runningSortingJob = JobClient.runJob(outputJob);

    // Delete the intermediary files created
    Path tempFinalSortedInputPath = new Path(inputPath);
    FileSystem.get(tempDir.toUri(), conf1).delete(tempFinalSortedInputPath, true);

    sLogger.info(
        "Final Sorting Job Finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0
            + " seconds");

    sLogger.info(
        "The program lasted "
            + (System.currentTimeMillis() - startWholeProgram) / 1000.0
            + "s ("
            + (System.currentTimeMillis() - startWholeProgram) / 60000.0
            + " mins)");

    return 0;
  }
Example No. 8
  @SuppressWarnings("static-access")
  @Override
  public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
        OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("tmp output directory")
            .create(OUTPUT_OPTION));
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("index file")
            .create(INDEX_FILE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)
        || !cmdline.hasOption(OUTPUT_OPTION)
        || !cmdline.hasOption(INDEX_FILE_OPTION)) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(this.getClass().getName(), options);
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);

    if (!inputPath.isAbsolute()) {
      System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
      return -1;
    }

    JobConf conf = new JobConf(getConf(), BuildWikipediaForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

    conf.setJobName(
        String.format(
            "BuildWikipediaForwardIndex[%s: %s, %s: %s]",
            INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile));

    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);

    // delete the output directory if it exists already
    fs.delete(new Path(outputPath), true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int blocks = (int) counters.findCounter(Blocks.Total).getCounter();

    LOG.info("number of blocks: " + blocks);

    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);

    out.writeUTF("edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex");
    out.writeUTF(inputPath.toString());
    out.writeInt(blocks);

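    // Convert each (docno, offset, fileno) line of part-00000 into its binary index entry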
    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
      String[] arr = line.toString().split("\\s+");

      int docno = Integer.parseInt(arr[0]);
      int offset = Integer.parseInt(arr[1]);
      short fileno = Short.parseShort(arr[2]);

      out.writeInt(docno);
      out.writeInt(offset);
      out.writeShort(fileno);

      cnt++;

      if (cnt % 100000 == 0) {
        LOG.info(cnt + " blocks written");
      }
    }

    reader.close();
    out.close();

    if (cnt != blocks) {
      throw new RuntimeException("Error: mismatch in block count!");
    }

    return 0;
  }
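For reference, the index file written above can be read back in the same order it was written. The following is an illustrative sketch only; the variable names are hypothetical and the snippet is not part of the original tool.

  FSDataInputStream in = fs.open(new Path(indexFile));
  String indexClass = in.readUTF();      // "edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex"
  String collectionPath = in.readUTF();  // path of the block-compressed SequenceFiles
  int numBlocks = in.readInt();
  for (int i = 0; i < numBlocks; i++) {
    int docno = in.readInt();
    int offset = in.readInt();
    short fileno = in.readShort();
    // ... keep (docno, offset, fileno) for forward-index lookups
  }
  in.close();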