@Override public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException { // This method should only be called once (see Hadoop API). However, // there seems to be disagreement between implementations that call // initialize() and Hadoop-BAM's own code that relies on // {@link BAMInputFormat} to call initialize() when the reader is // created. Therefore we add this check for the time being. if (isInitialized) close(); isInitialized = true; final Configuration conf = ctx.getConfiguration(); final FileVirtualSplit split = (FileVirtualSplit) spl; final Path file = split.getPath(); final FileSystem fs = file.getFileSystem(conf); this.stringency = SAMHeaderReader.getValidationStringency(conf); final FSDataInputStream in = fs.open(file); final SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf); codec = new BAMRecordCodec(header); in.seek(0); bci = new BlockCompressedInputStream( new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file)); virtualStart = split.getStartVirtualOffset(); fileStart = virtualStart >>> 16; virtualEnd = split.getEndVirtualOffset(); bci.seek(virtualStart); codec.setInputStream(bci); if (BAMInputFormat.DEBUG_BAM_SPLITTER) { final long recordStart = virtualStart & 0xffff; System.err.println( "XXX inizialized BAMRecordReader byte offset: " + fileStart + " record offset: " + recordStart); } keepReadPairsTogether = SortOrder.queryname.equals(header.getSortOrder()) && conf.getBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, false); readPair = false; lastOfPair = false; intervals = BAMInputFormat.getIntervals(conf); if (intervals != null) { overlapDetector = new OverlapDetector<>(0, 0); overlapDetector.addAll(intervals, intervals); } }
@Override public boolean nextKeyValue() { long virtPos; while ((virtPos = bci.getFilePointer()) < virtualEnd || (keepReadPairsTogether && readPair && !lastOfPair)) { final SAMRecord r = codec.decode(); if (r == null) return false; // Since we're reading from a BAMRecordCodec directly we have to set the // validation stringency ourselves. if (this.stringency != null) r.setValidationStringency(this.stringency); readPair = r.getReadPairedFlag(); if (readPair) { boolean first = r.getFirstOfPairFlag(), second = r.getSecondOfPairFlag(); // According to the SAM spec (section 1.4) it is possible for pairs to have // multiple segments (i.e. more than two), in which case both `first` and // `second` will be true. boolean firstOfPair = first && !second; lastOfPair = !first && second; // ignore any template that is not first in a pair right at the start of a split // since it will have been returned in the previous split if (virtPos == virtualStart && keepReadPairsTogether && !firstOfPair) { continue; } } if (!overlaps(r)) { continue; } key.set(getKey(r)); record.set(r); return true; } return false; }
/** * Finds a virtual BAM record position in the physical position range [beg,end). Returns end if no * BAM record was found. */ public long guessNextBAMRecordStart(long beg, long end) throws IOException { // Buffer what we need to go through. byte[] arr = new byte[MAX_BYTES_READ]; this.inFile.seek(beg); int totalRead = 0; for (int left = Math.min((int) (end - beg), arr.length); left > 0; ) { final int r = inFile.read(arr, totalRead, left); if (r < 0) break; totalRead += r; left -= r; } arr = Arrays.copyOf(arr, totalRead); this.in = new SeekableArrayStream(arr); this.bgzf = new BlockCompressedInputStream(this.in); this.bgzf.setCheckCrcs(true); this.bamCodec.setInputStream(bgzf); final int firstBGZFEnd = Math.min((int) (end - beg), 0xffff); // cp: Compressed Position, indexes the entire BGZF input. for (int cp = 0; ; ++cp) { final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd); if (psz == null) return end; final int cp0 = cp = psz.pos; final long cp0Virt = (long) cp0 << 16; try { bgzf.seek(cp0Virt); // This has to catch Throwable, because it's possible to get an // OutOfMemoryError due to an overly large size. } catch (Throwable e) { // Guessed BGZF position incorrectly: try the next guess. continue; } // up: Uncompressed Position, indexes the data inside the BGZF block. for (int up = 0; ; ++up) { final int up0 = up = guessNextBAMPos(cp0Virt, up, psz.size); if (up0 < 0) { // No BAM records found in the BGZF block: try the next BGZF // block. break; } // Verify that we can actually decode BLOCKS_NEEDED_FOR_GUESS worth // of records starting at (cp0,up0). bgzf.seek(cp0Virt | up0); boolean decodedAny = false; try { byte b = 0; int prevCP = cp0; while (b < BLOCKS_NEEDED_FOR_GUESS) { SAMRecord record = bamCodec.decode(); if (record == null) { break; } record.getCigar(); // force decoding of CIGAR decodedAny = true; final int cp2 = (int) (bgzf.getFilePointer() >>> 16); if (cp2 != prevCP) { // The compressed position changed so we must be in a new // block. assert cp2 > prevCP; prevCP = cp2; ++b; } } // Running out of records to verify is fine as long as we // verified at least something. It should only happen if we // couldn't fill the array. if (b < BLOCKS_NEEDED_FOR_GUESS) { assert arr.length < MAX_BYTES_READ; if (!decodedAny) continue; } } catch (SAMFormatException e) { continue; } catch (FileTruncatedException e) { continue; } catch (OutOfMemoryError e) { continue; } catch (IllegalArgumentException e) { continue; } catch (RuntimeIOException e) { continue; } catch (RuntimeEOFException e) { // This can happen legitimately if the [beg,end) range is too // small to accommodate BLOCKS_NEEDED_FOR_GUESS and we get cut // off in the middle of a record. In that case, our stream // should have hit EOF as well. If we've then verified at least // something, go ahead with it and hope for the best. if (!decodedAny && this.in.eof()) continue; } return beg + cp0 << 16 | up0; } } }