@Override public int run(String[] args) throws Exception { JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args); if (conf == null) { return -1; } conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(conf, true); SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class); SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); conf.setPartitionerClass(TotalOrderPartitioner.class); InputSampler.Sampler<IntWritable, Text> sampler = new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10); Path input = FileInputFormat.getInputPaths(conf)[0]; input = input.makeQualified(input.getFileSystem(conf)); Path partitionFile = new Path(input, "_partitions"); TotalOrderPartitioner.setPartitionFile(conf, partitionFile); InputSampler.writePartitionFile(conf, sampler); // Add to DistributedCache URI partitionUri = new URI(partitionFile.toString() + "#_partitions"); DistributedCache.addCacheFile(partitionUri, conf); DistributedCache.createSymlink(conf); JobClient.runJob(conf); return 0; }
/**
 * Populates {@code stats} with counts read from a crawl segment: the number of
 * generated URLs, the fetched count plus the earliest/latest fetch times, and
 * the parsed count with its error tally.
 *
 * <p>Fetch and parse stats are filled in only when the corresponding
 * subdirectory exists; the generated count is always computed.
 *
 * @param segment segment directory to inspect
 * @param stats   output holder mutated with the collected numbers
 * @throws Exception on read failure
 */
public void getStats(Path segment, final SegmentReaderStats stats) throws Exception {
  SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(
      getConf(), new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
  long cnt = 0L;
  Text key = new Text();
  for (int i = 0; i < readers.length; i++) {
    while (readers[i].next(key)) cnt++;
    readers[i].close();
  }
  stats.generated = cnt;

  Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
  if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
    cnt = 0L;
    long start = Long.MAX_VALUE;
    long end = Long.MIN_VALUE;
    CrawlDatum value = new CrawlDatum();
    MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, getConf());
    for (int i = 0; i < mreaders.length; i++) {
      while (mreaders[i].next(key, value)) {
        cnt++;
        // Track the min/max fetch time across all part files.
        if (value.getFetchTime() < start) start = value.getFetchTime();
        if (value.getFetchTime() > end) end = value.getFetchTime();
      }
      mreaders[i].close();
    }
    stats.start = start;
    stats.end = end;
    stats.fetched = cnt;
  }

  Path parseDir = new Path(segment, ParseData.DIR_NAME);
  // Bug fix: this guard previously tested fetchDir (copy-paste error), so parse
  // stats were attempted whenever the fetch dir existed even if the parse dir
  // was missing — and skipped when only the parse dir existed.
  if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
    cnt = 0L;
    long errors = 0L;
    ParseData value = new ParseData();
    MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, getConf());
    for (int i = 0; i < mreaders.length; i++) {
      while (mreaders[i].next(key, value)) {
        cnt++;
        if (!value.getStatus().isSuccess()) errors++;
      }
      mreaders[i].close();
    }
    stats.parsed = cnt;
    stats.parseErrors = errors;
  }
}
/**
 * Scans every sequence file under {@code dir} and collects all values whose
 * key equals {@code key}.
 *
 * @param dir directory containing the sequence files to scan
 * @param key the Text key to match on
 * @return all matching values, possibly empty; never null
 * @throws IOException if the files are not keyed by {@code org.apache.hadoop.io.Text}
 * @throws Exception   on read or reflection failure
 */
private List<Writable> getSeqRecords(Path dir, Text key) throws Exception {
  SequenceFile.Reader[] readers =
      SequenceFileOutputFormat.getReaders(getConf(), dir);
  ArrayList<Writable> res = new ArrayList<Writable>();
  // Robustness fix: an empty directory yields zero readers; the original
  // dereferenced readers[0] unconditionally and threw ArrayIndexOutOfBounds.
  if (readers.length == 0) return res;
  Class keyClass = readers[0].getKeyClass();
  Class valueClass = readers[0].getValueClass();
  if (!keyClass.getName().equals("org.apache.hadoop.io.Text")) {
    // Leak fix: close the readers before bailing out on an incompatible key
    // type; the original left them all open.
    for (int i = 0; i < readers.length; i++) {
      readers[i].close();
    }
    throw new IOException("Incompatible key (" + keyClass.getName() + ")");
  }
  Writable aKey = (Writable) keyClass.newInstance();
  Writable value = (Writable) valueClass.newInstance();
  for (int i = 0; i < readers.length; i++) {
    while (readers[i].next(aKey, value)) {
      if (aKey.equals(key)) res.add(value);
    }
    readers[i].close();
  }
  return res;
}
/** Runs this tool. */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; int n = Integer.parseInt(args[2]); sLogger.info("Tool name: BuildPageRankRecords"); sLogger.info(" - inputDir: " + inputPath); sLogger.info(" - outputDir: " + outputPath); sLogger.info(" - numNodes: " + n); JobConf conf = new JobConf(BuildPageRankRecords.class); conf.setJobName("PackageLinkGraph"); conf.setNumMapTasks(1); conf.setNumReduceTasks(0); conf.setInt("NodeCnt", n); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); TextInputFormat.addInputPath(conf, new Path(inputPath)); SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(PageRankNode.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(PageRankNode.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(IdentityReducer.class); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); JobClient.runJob(conf); return 0; }
/**
 * Creates a Hive record writer for the given table/output description.
 *
 * <p>When compression is requested, the caller's {@code jc} is cloned into
 * {@code jc_output} so that compression settings do not leak into the shared
 * JobConf.
 *
 * @param jc          job configuration (never mutated by this method)
 * @param tableInfo   table descriptor providing the output format class and properties
 * @param outputClass the Writable class being written
 * @param conf        file sink descriptor carrying compression settings
 * @param outPath     destination path for the writer
 * @return a RecordWriter bound to {@code outPath}
 * @throws HiveException wrapping any reflection, class-loading, or I/O failure
 */
public static RecordWriter getHiveRecordWriter(
    JobConf jc,
    TableDesc tableInfo,
    Class<? extends Writable> outputClass,
    FileSinkDesc conf,
    Path outPath) throws HiveException {
  try {
    HiveOutputFormat<?, ?> hiveOutputFormat =
        tableInfo.getOutputFileFormatClass().newInstance();
    boolean isCompressed = conf.getCompressed();
    JobConf jc_output = jc;
    if (isCompressed) {
      // Work on a copy so compression settings never mutate the caller's conf.
      jc_output = new JobConf(jc);
      String codecStr = conf.getCompressCodec();
      if (codecStr != null && !codecStr.trim().equals("")) {
        Class<? extends CompressionCodec> codec =
            (Class<? extends CompressionCodec>) Class.forName(codecStr);
        FileOutputFormat.setOutputCompressorClass(jc_output, codec);
      }
      String type = conf.getCompressType();
      if (type != null && !type.trim().equals("")) {
        CompressionType style = CompressionType.valueOf(type);
        // Bug fix: this previously passed the original jc, mutating the shared
        // JobConf and defeating the defensive copy made above.
        SequenceFileOutputFormat.setOutputCompressionType(jc_output, style);
      }
    }
    return getRecordWriter(
        jc_output, hiveOutputFormat, outputClass, isCompressed,
        tableInfo.getProperties(), outPath);
  } catch (Exception e) {
    throw new HiveException(e);
  }
}