@Override public boolean nextKeyValue() { long virtPos; while ((virtPos = bci.getFilePointer()) < virtualEnd || (keepReadPairsTogether && readPair && !lastOfPair)) { final SAMRecord r = codec.decode(); if (r == null) return false; // Since we're reading from a BAMRecordCodec directly we have to set the // validation stringency ourselves. if (this.stringency != null) r.setValidationStringency(this.stringency); readPair = r.getReadPairedFlag(); if (readPair) { boolean first = r.getFirstOfPairFlag(), second = r.getSecondOfPairFlag(); // According to the SAM spec (section 1.4) it is possible for pairs to have // multiple segments (i.e. more than two), in which case both `first` and // `second` will be true. boolean firstOfPair = first && !second; lastOfPair = !first && second; // ignore any template that is not first in a pair right at the start of a split // since it will have been returned in the previous split if (virtPos == virtualStart && keepReadPairsTogether && !firstOfPair) { continue; } } if (!overlaps(r)) { continue; } key.set(getKey(r)); record.set(r); return true; } return false; }
/** * Finds a virtual BAM record position in the physical position range [beg,end). Returns end if no * BAM record was found. */ public long guessNextBAMRecordStart(long beg, long end) throws IOException { // Buffer what we need to go through. byte[] arr = new byte[MAX_BYTES_READ]; this.inFile.seek(beg); int totalRead = 0; for (int left = Math.min((int) (end - beg), arr.length); left > 0; ) { final int r = inFile.read(arr, totalRead, left); if (r < 0) break; totalRead += r; left -= r; } arr = Arrays.copyOf(arr, totalRead); this.in = new SeekableArrayStream(arr); this.bgzf = new BlockCompressedInputStream(this.in); this.bgzf.setCheckCrcs(true); this.bamCodec.setInputStream(bgzf); final int firstBGZFEnd = Math.min((int) (end - beg), 0xffff); // cp: Compressed Position, indexes the entire BGZF input. for (int cp = 0; ; ++cp) { final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd); if (psz == null) return end; final int cp0 = cp = psz.pos; final long cp0Virt = (long) cp0 << 16; try { bgzf.seek(cp0Virt); // This has to catch Throwable, because it's possible to get an // OutOfMemoryError due to an overly large size. } catch (Throwable e) { // Guessed BGZF position incorrectly: try the next guess. continue; } // up: Uncompressed Position, indexes the data inside the BGZF block. for (int up = 0; ; ++up) { final int up0 = up = guessNextBAMPos(cp0Virt, up, psz.size); if (up0 < 0) { // No BAM records found in the BGZF block: try the next BGZF // block. break; } // Verify that we can actually decode BLOCKS_NEEDED_FOR_GUESS worth // of records starting at (cp0,up0). bgzf.seek(cp0Virt | up0); boolean decodedAny = false; try { byte b = 0; int prevCP = cp0; while (b < BLOCKS_NEEDED_FOR_GUESS) { SAMRecord record = bamCodec.decode(); if (record == null) { break; } record.getCigar(); // force decoding of CIGAR decodedAny = true; final int cp2 = (int) (bgzf.getFilePointer() >>> 16); if (cp2 != prevCP) { // The compressed position changed so we must be in a new // block. assert cp2 > prevCP; prevCP = cp2; ++b; } } // Running out of records to verify is fine as long as we // verified at least something. It should only happen if we // couldn't fill the array. if (b < BLOCKS_NEEDED_FOR_GUESS) { assert arr.length < MAX_BYTES_READ; if (!decodedAny) continue; } } catch (SAMFormatException e) { continue; } catch (FileTruncatedException e) { continue; } catch (OutOfMemoryError e) { continue; } catch (IllegalArgumentException e) { continue; } catch (RuntimeIOException e) { continue; } catch (RuntimeEOFException e) { // This can happen legitimately if the [beg,end) range is too // small to accommodate BLOCKS_NEEDED_FOR_GUESS and we get cut // off in the middle of a record. In that case, our stream // should have hit EOF as well. If we've then verified at least // something, go ahead with it and hope for the best. if (!decodedAny && this.in.eof()) continue; } return beg + cp0 << 16 | up0; } } }