/** The stream must point to a valid BAM file, because the header is read from it. */ public BAMSplitGuesser(SeekableStream ss, Configuration conf) throws IOException { this(ss, ss, conf); // Secondary check that the header points to a BAM file: Picard can get // things wrong due to its autodetection. ss.seek(0); if (ss.read(buf.array(), 0, 4) != 4 || buf.getInt(0) != BGZF_MAGIC) throw new SAMFormatException("Does not seem like a BAM file"); }
public static void main(String[] args) throws IOException { final GenericOptionsParser parser; try { parser = new GenericOptionsParser(args); // This should be IOException but Hadoop 0.20.2 doesn't throw it... } catch (Exception e) { System.err.printf("Error in Hadoop arguments: %s\n", e.getMessage()); System.exit(1); // Hooray for javac return; } args = parser.getRemainingArgs(); // final Configuration conf = ContextUtil.getConfiguration(parser); final Configuration conf = parser.getConfiguration(); long beg = 0; if (args.length < 2 || args.length > 3) { System.err.println("Usage: BAMSplitGuesser path-or-uri header-path-or-uri [beg]"); System.exit(2); } try { if (args.length > 2) beg = Long.decode(args[2]); } catch (NumberFormatException e) { System.err.println("Invalid beg offset."); if (e.getMessage() != null) System.err.println(e.getMessage()); System.exit(2); } SeekableStream ss = WrapSeekable.openPath(conf, new Path(args[0])); SeekableStream hs = WrapSeekable.openPath(conf, new Path(args[1])); final long end = beg + MAX_BYTES_READ; System.out.printf( "Will look for a BGZF block within: [%1$#x,%2$#x) = [%1$d,%2$d)\n" + "Will then verify BAM data within: [%1$#x,%3$#x) = [%1$d,%3$d)\n", beg, beg + 0xffff, end); final long g = new BAMSplitGuesser(ss, hs, conf).guessNextBAMRecordStart(beg, end); ss.close(); if (g == end) { System.out.println("Didn't find any acceptable BAM record in any BGZF block."); System.exit(1); } System.out.printf( "Accepted BGZF block at offset %1$#x (%1$d).\n" + "Accepted BAM record at offset %2$#x (%2$d) therein.\n", g >> 16, g & 0xffff); }
// Its too expensive to examine the remote file to determine type. // Rely on file extension. public static boolean sourceLikeBam(final SeekableStream strm) { String source = strm.getSource(); if (source == null) { // assume any stream with a null source is a BAM file // (https://github.com/samtools/htsjdk/issues/619) return true; } // Source will typically be a file path or URL // If it's a URL we require one of the query parameters to be a cram file try { final URL sourceURL = new URL(source); final String urlPath = sourceURL.getPath().toLowerCase(); String queryParams = sourceURL.getQuery(); if (queryParams != null) { queryParams = queryParams.toLowerCase(); } return urlPath.endsWith(".bam") || (queryParams != null && (queryParams.endsWith(".bam") || queryParams.contains(".bam?") || queryParams.contains(".bam&") || queryParams.contains(".bam%26"))); } catch (MalformedURLException e) { source = source.toLowerCase(); return source.endsWith(".bam") || source.contains(".bam?") || source.contains(".bam&") || source.contains(".bam%26"); } }
/**
 * Returns the current position of the underlying stream, narrowed to an {@code int}.
 *
 * @throws RuntimeIOException wrapping any {@link IOException} raised by the stream
 */
@Override
public int position() {
  final long streamPosition;
  try {
    streamPosition = in.position();
  } catch (final IOException ioe) {
    throw new RuntimeIOException(ioe);
  }
  return (int) streamPosition;
}
/**
 * Moves the underlying stream to the given position.
 *
 * @param position the position to seek to
 * @throws RuntimeIOException wrapping any {@link IOException} raised by the stream
 */
@Override
public void seek(final int position) {
  try {
    in.seek(position);
  } catch (final IOException ioe) {
    // Re-thrown unchecked; this method declares no checked exceptions.
    throw new RuntimeIOException(ioe);
  }
}
/**
 * Closes the underlying stream.
 *
 * @throws RuntimeIOException wrapping any {@link IOException} raised while closing
 */
@Override
public void close() {
  try {
    in.close();
  } catch (final IOException ioe) {
    // Re-thrown unchecked; this method declares no checked exceptions.
    throw new RuntimeIOException(ioe);
  }
}
/**
 * Skips forward by {@code count} bytes, calling the stream's {@code skip} repeatedly until the
 * full amount has been skipped. A non-positive {@code count} is a no-op.
 *
 * @param count the number of bytes to skip
 * @throws RuntimeIOException if a skip call makes no progress, or wrapping any {@link
 *     IOException} raised by the stream
 */
@Override
public void skipBytes(final int count) {
  int remaining = count;
  try {
    while (remaining > 0) {
      final int advanced = (int) in.skip(remaining);
      if (advanced <= 0) {
        throw new RuntimeIOException("Failed to skip " + remaining);
      }
      remaining -= advanced;
    }
  } catch (final IOException ioe) {
    throw new RuntimeIOException(ioe);
  }
}
/**
 * Reads the container header located at {@code position} in the file at {@code path}, then hands
 * the stream to {@code readBlocks} to read the container's blocks.
 *
 * <p>The stream opened here is closed before returning, whether or not reading succeeds.
 * NOTE(review): this assumes {@code readBlocks} finishes with the stream before it returns —
 * confirm against its implementation.
 *
 * @param position byte offset of the container header
 * @throws IOException if opening, seeking, reading or closing the stream fails
 */
public void readContainerHeader(long position) throws IOException {
  SeekableStream ss = IGVSeekableStreamFactory.getInstance().getStreamFor(path);
  try {
    ss.seek(position);
    BufferedInputStream bis = new BufferedInputStream(ss);

    // Every field is read in order so that the stream ends up at the first block;
    // only nBlocks is used afterwards.
    int length = CramInt.int32(bis);
    int refSeqId = ITF8.readUnsignedITF8(bis);
    int startPos = ITF8.readUnsignedITF8(bis);
    int alignmentSpan = ITF8.readUnsignedITF8(bis);
    int nRecords = ITF8.readUnsignedITF8(bis);
    int recordCounter = ITF8.readUnsignedITF8(bis);
    int bases = ITF8.readUnsignedITF8(bis);
    int nBlocks = ITF8.readUnsignedITF8(bis);
    int[] landmarks = CramArray.array(bis);
    if (major >= 3) {
      // A 32-bit checksum is only consumed when the major version is 3 or later.
      int checksum = CramInt.int32(bis);
    }

    readBlocks(bis, nBlocks);
  } finally {
    ss.close();
  }
}
/**
 * Reads from the provided {@link SeekableStream} into {@code buffer}, starting at {@code offset},
 * until exactly {@code length} bytes have been read.
 *
 * @param in the stream to read from
 * @param buffer the destination array
 * @param offset index in {@code buffer} at which the first byte read is stored
 * @param length the number of bytes that must be read
 * @throws RuntimeIOException if the stream ends before {@code length} bytes were read, or
 *     wrapping any {@link IOException} raised by the stream
 */
private static void readFully(
    final SeekableStream in, final byte[] buffer, final int offset, final int length) {
  int read = 0;
  while (read < length) {
    final int readThisLoop;
    try {
      // Write after the bytes already read, relative to the caller's offset.
      readThisLoop = in.read(buffer, offset + read, length - read);
    } catch (final IOException e) {
      throw new RuntimeIOException(e);
    }
    if (readThisLoop == -1) break;
    read += readThisLoop;
  }
  if (read != length)
    throw new RuntimeIOException(
        "Expected to read " + length + " bytes, but expired stream after " + read + ".");
}
/** * Finds a virtual BAM record position in the physical position range [beg,end). Returns end if no * BAM record was found. */ public long guessNextBAMRecordStart(long beg, long end) throws IOException { // Buffer what we need to go through. byte[] arr = new byte[MAX_BYTES_READ]; this.inFile.seek(beg); int totalRead = 0; for (int left = Math.min((int) (end - beg), arr.length); left > 0; ) { final int r = inFile.read(arr, totalRead, left); if (r < 0) break; totalRead += r; left -= r; } arr = Arrays.copyOf(arr, totalRead); this.in = new SeekableArrayStream(arr); this.bgzf = new BlockCompressedInputStream(this.in); this.bgzf.setCheckCrcs(true); this.bamCodec.setInputStream(bgzf); final int firstBGZFEnd = Math.min((int) (end - beg), 0xffff); // cp: Compressed Position, indexes the entire BGZF input. for (int cp = 0; ; ++cp) { final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd); if (psz == null) return end; final int cp0 = cp = psz.pos; final long cp0Virt = (long) cp0 << 16; try { bgzf.seek(cp0Virt); // This has to catch Throwable, because it's possible to get an // OutOfMemoryError due to an overly large size. } catch (Throwable e) { // Guessed BGZF position incorrectly: try the next guess. continue; } // up: Uncompressed Position, indexes the data inside the BGZF block. for (int up = 0; ; ++up) { final int up0 = up = guessNextBAMPos(cp0Virt, up, psz.size); if (up0 < 0) { // No BAM records found in the BGZF block: try the next BGZF // block. break; } // Verify that we can actually decode BLOCKS_NEEDED_FOR_GUESS worth // of records starting at (cp0,up0). bgzf.seek(cp0Virt | up0); boolean decodedAny = false; try { byte b = 0; int prevCP = cp0; while (b < BLOCKS_NEEDED_FOR_GUESS) { SAMRecord record = bamCodec.decode(); if (record == null) { break; } record.getCigar(); // force decoding of CIGAR decodedAny = true; final int cp2 = (int) (bgzf.getFilePointer() >>> 16); if (cp2 != prevCP) { // The compressed position changed so we must be in a new // block. 
assert cp2 > prevCP; prevCP = cp2; ++b; } } // Running out of records to verify is fine as long as we // verified at least something. It should only happen if we // couldn't fill the array. if (b < BLOCKS_NEEDED_FOR_GUESS) { assert arr.length < MAX_BYTES_READ; if (!decodedAny) continue; } } catch (SAMFormatException e) { continue; } catch (FileTruncatedException e) { continue; } catch (OutOfMemoryError e) { continue; } catch (IllegalArgumentException e) { continue; } catch (RuntimeIOException e) { continue; } catch (RuntimeEOFException e) { // This can happen legitimately if the [beg,end) range is too // small to accommodate BLOCKS_NEEDED_FOR_GUESS and we get cut // off in the middle of a record. In that case, our stream // should have hit EOF as well. If we've then verified at least // something, go ahead with it and hope for the best. if (!decodedAny && this.in.eof()) continue; } return beg + cp0 << 16 | up0; } } }
// Gives the compressed size on the side. Returns null if it doesn't find // anything. private PosSize guessNextBGZFPos(int p, int end) { try { for (; ; ) { for (; ; ) { in.seek(p); in.read(buf.array(), 0, 4); int n = buf.getInt(0); if (n == BGZF_MAGIC) break; // Skip ahead a bit more than 1 byte if you can. if (n >>> 8 == BGZF_MAGIC << 8 >>> 8) ++p; else if (n >>> 16 == BGZF_MAGIC << 16 >>> 16) p += 2; else p += 3; if (p >= end) return null; } // Found what looks like a gzip block header: now get XLEN and // search for the BGZF subfield. final int p0 = p; p += 10; in.seek(p); in.read(buf.array(), 0, 2); p += 2; final int xlen = getUShort(0); final int subEnd = p + xlen; while (p < subEnd) { in.read(buf.array(), 0, 4); if (buf.getInt(0) != BGZF_MAGIC_SUB) { p += 4 + getUShort(2); in.seek(p); continue; } // Found it: this is close enough to a BGZF block, make it // our guess. // But find out the size before returning. First, grab bsize: // we'll need it later. in.read(buf.array(), 0, 2); int bsize = getUShort(0); // Then skip the rest of the subfields. p += BGZF_SUB_SIZE; while (p < subEnd) { in.seek(p); in.read(buf.array(), 0, 4); p += 4 + getUShort(2); } if (p != subEnd) { // Cancel our guess because the xlen field didn't match the // data. break; } // Now skip past the compressed data and the CRC-32. p += bsize - xlen - 19 + 4; in.seek(p); in.read(buf.array(), 0, 4); return new PosSize(p0, buf.getInt(0)); } // No luck: look for the next gzip block header. Start right after // where we last saw the identifiers, although we could probably // safely skip further ahead. (If we find the correct one right // now, the previous block contained 0x1f8b0804 bytes of data: that // seems... unlikely.) p = p0 + 4; } } catch (IOException e) { return null; } }