  // Total-order sort driver: a sampled partition file drives TotalOrderPartitioner
  // so that output keys are globally ordered across reducers.
  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (conf == null) {
      return -1;
    }

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

    conf.setPartitionerClass(TotalOrderPartitioner.class);

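    // Sample keys with probability 0.1 (at most 10,000 samples, drawn from at
    // most 10 splits) to choose the partition boundaries.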
    InputSampler.Sampler<IntWritable, Text> sampler =
        new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10);

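    // Write the partition boundaries to a "_partitions" file alongside the
    // input and tell TotalOrderPartitioner where to find it.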
    Path input = FileInputFormat.getInputPaths(conf)[0];
    input = input.makeQualified(input.getFileSystem(conf));

    Path partitionFile = new Path(input, "_partitions");
    TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
    InputSampler.writePartitionFile(conf, sampler);

    // Add to DistributedCache
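    // (the "#_partitions" URI fragment plus createSymlink makes the file
    // available as "_partitions" in each task's working directory)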
    URI partitionUri = new URI(partitionFile.toString() + "#_partitions");
    DistributedCache.addCacheFile(partitionUri, conf);
    DistributedCache.createSymlink(conf);

    JobClient.runJob(conf);
    return 0;
  }
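 /** Fills <code>stats</code> with generated, fetched and parsed counts for the given segment. */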
 public void getStats(Path segment, final SegmentReaderStats stats) throws Exception {
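   // Count generated entries.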
   SequenceFile.Reader[] readers =
       SequenceFileOutputFormat.getReaders(
           getConf(), new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
   long cnt = 0L;
   Text key = new Text();
   for (int i = 0; i < readers.length; i++) {
     while (readers[i].next(key)) cnt++;
     readers[i].close();
   }
   stats.generated = cnt;
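   // Count fetched entries and track the earliest and latest fetch times.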
   Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
   if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
     cnt = 0L;
     long start = Long.MAX_VALUE;
     long end = Long.MIN_VALUE;
     CrawlDatum value = new CrawlDatum();
     MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, getConf());
     for (int i = 0; i < mreaders.length; i++) {
       while (mreaders[i].next(key, value)) {
         cnt++;
         if (value.getFetchTime() < start) start = value.getFetchTime();
         if (value.getFetchTime() > end) end = value.getFetchTime();
       }
       mreaders[i].close();
     }
     stats.start = start;
     stats.end = end;
     stats.fetched = cnt;
   }
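   // Count parsed entries and parse errors.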
   Path parseDir = new Path(segment, ParseData.DIR_NAME);
   if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
     cnt = 0L;
     long errors = 0L;
     ParseData value = new ParseData();
     MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
     for (int i = 0; i < mreaders.length; i++) {
       while (mreaders[i].next(key, value)) {
         cnt++;
         if (!value.getStatus().isSuccess()) errors++;
       }
       mreaders[i].close();
     }
     stats.parsed = cnt;
     stats.parseErrors = errors;
   }
 }
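 /** Returns every value stored under <code>key</code> across all SequenceFile parts in <code>dir</code>. */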
 private List<Writable> getSeqRecords(Path dir, Text key) throws Exception {
   SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(), dir);
   ArrayList<Writable> res = new ArrayList<Writable>();
    Class<?> keyClass = readers[0].getKeyClass();
    Class<?> valueClass = readers[0].getValueClass();
   if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
     throw new IOException("Incompatible key (" + keyClass.getName() + ")");
   Writable aKey = (Writable) keyClass.newInstance();
   Writable value = (Writable) valueClass.newInstance();
   for (int i = 0; i < readers.length; i++) {
     while (readers[i].next(aKey, value)) {
       if (aKey.equals(key)) res.add(value);
     }
     readers[i].close();
   }
   return res;
 }
  /** Runs this tool. */
  public int run(String[] args) throws Exception {
    if (args.length != 3) {
      printUsage();
      return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int n = Integer.parseInt(args[2]);

    sLogger.info("Tool name: BuildPageRankRecords");
    sLogger.info(" - inputDir: " + inputPath);
    sLogger.info(" - outputDir: " + outputPath);
    sLogger.info(" - numNodes: " + n);

    JobConf conf = new JobConf(BuildPageRankRecords.class);
    conf.setJobName("BuildPageRankRecords");

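    // Map-only pass: no reducers, and the 1 GB minimum split size keeps the
    // input in as few map tasks as possible (a single one is requested).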
    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(0);

    conf.setInt("NodeCnt", n);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    TextInputFormat.addInputPath(conf, new Path(inputPath));
    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(PageRankNode.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
  }
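 /**
  * Instantiates the table's HiveOutputFormat and returns a record writer for
  * <code>outPath</code>, applying the codec and compression type from the
  * FileSinkDesc when compression is enabled.
  */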
 public static RecordWriter getHiveRecordWriter(
     JobConf jc,
     TableDesc tableInfo,
     Class<? extends Writable> outputClass,
     FileSinkDesc conf,
     Path outPath)
     throws HiveException {
   try {
     HiveOutputFormat<?, ?> hiveOutputFormat = tableInfo.getOutputFileFormatClass().newInstance();
     boolean isCompressed = conf.getCompressed();
     JobConf jc_output = jc;
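      // Apply compression settings to a copy of the JobConf so the caller's
      // conf is left untouched.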
     if (isCompressed) {
       jc_output = new JobConf(jc);
       String codecStr = conf.getCompressCodec();
       if (codecStr != null && !codecStr.trim().equals("")) {
         Class<? extends CompressionCodec> codec =
             (Class<? extends CompressionCodec>) Class.forName(codecStr);
         FileOutputFormat.setOutputCompressorClass(jc_output, codec);
       }
       String type = conf.getCompressType();
       if (type != null && !type.trim().equals("")) {
         CompressionType style = CompressionType.valueOf(type);
          SequenceFileOutputFormat.setOutputCompressionType(jc_output, style);
       }
     }
     return getRecordWriter(
         jc_output,
         hiveOutputFormat,
         outputClass,
         isCompressed,
         tableInfo.getProperties(),
         outPath);
   } catch (Exception e) {
     throw new HiveException(e);
   }
 }