public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { Configuration conf = ContextUtil.getConfiguration(context); this.maxLineLength = conf.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); FileSplit split = (FileSplit) genericSplit; start = (split.getStart()) << 16; end = (start + split.getLength()) << 16; final Path file = split.getPath(); FileSystem fs = file.getFileSystem(conf); bin = new BlockCompressedInputStream( new WrapSeekable<FSDataInputStream>( fs.open(file), fs.getFileStatus(file).getLen(), file)); in = new LineReader(bin, conf); if (start != 0) { bin.seek(start); // Skip first line in.readLine(new Text()); start = bin.getFilePointer(); } this.pos = start; }
public RecordReader<Text, SequencedFragment> createRecordReader( InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException { context.setStatus(genericSplit.toString()); return new QseqRecordReader( ContextUtil.getConfiguration(context), (FileSplit) genericSplit); // cast as per example in TextInputFormat }
/**
 * Emits each alignment record keyed by its read name, after adjusting it
 * for merging (see Utils.correctSAMRecordForMerging), so that records
 * sharing a read name meet in the same reduce call.
 */
@Override protected void map(
        LongWritable ignored, SAMRecordWritable wrec,
        Mapper<LongWritable, SAMRecordWritable, Text, SAMRecordWritable>.Context ctx)
        throws InterruptedException, IOException {
    final Configuration conf = ContextUtil.getConfiguration(ctx);
    Utils.correctSAMRecordForMerging(wrec.get(), conf);

    final Text nameKey = new Text(wrec.get().getReadName());
    ctx.write(nameKey, wrec);
}
@Override protected void reduce( Text key, Iterable<SAMRecordWritable> records, Reducer<Text, SAMRecordWritable, Text, SAMRecordWritable>.Context ctx) throws IOException, InterruptedException { // Non-primary records are simply written out, but as long as we can find // two primaries, pair them up. final SAMFileHeader header = Utils.getSAMHeaderMerger(ContextUtil.getConfiguration(ctx)).getMergedHeader(); final Iterator<SAMRecordWritable> it = records.iterator(); while (it.hasNext()) { SAMRecordWritable a = it.next(); if (a.get().getNotPrimaryAlignmentFlag()) { ctx.write(key, a); continue; } // Cache the record since the iterator does its own caching, meaning // that after another it.next() we would have a == b. wrec.set(a.get()); a = wrec; SAMRecordWritable b = null; while (it.hasNext()) { b = it.next(); if (!b.get().getNotPrimaryAlignmentFlag()) break; ctx.write(key, b); } if (b == null) { // No more primaries, so just write the unpaired one as-is. ctx.write(key, a); break; } a.get().setHeader(header); b.get().setHeader(header); SamPairUtil.setMateInfo(a.get(), b.get(), header); ctx.write(key, a); ctx.write(key, b); } }
@Override public RecordWriter<NullWritable, Text> getRecordWriter(TaskAttemptContext ctx) throws IOException { Path path = getDefaultWorkFile(ctx, ""); FileSystem fs = path.getFileSystem(ContextUtil.getConfiguration(ctx)); final OutputStream file = fs.create(path); return new TextOutputFormat.LineRecordWriter<NullWritable, Text>( new DataOutputStream( new FilterOutputStream(new BlockCompressedOutputStream(file, null)) { @Override public void close() throws IOException { // Don't close the BlockCompressedOutputStream, so we don't // get an end-of-file sentinel. this.out.flush(); // Instead, close the file stream directly. file.close(); } })); }
/**
 * A file is splittable only when no registered compression codec claims its
 * path, since a generic codec stream cannot be entered at an arbitrary
 * split boundary.
 */
@Override public boolean isSplitable(JobContext context, Path path) {
    final CompressionCodecFactory codecs =
        new CompressionCodecFactory(ContextUtil.getConfiguration(context));
    return codecs.getCodec(path) == null;
}