Exemplo n.º 1
0
  /** The stream must point to a valid BAM file, because the header is read from it. */
  public BAMSplitGuesser(SeekableStream ss, Configuration conf) throws IOException {
    this(ss, ss, conf);

    // Secondary check that the header points to a BAM file: Picard can get
    // things wrong due to its autodetection.
    ss.seek(0);
    if (ss.read(buf.array(), 0, 4) != 4 || buf.getInt(0) != BGZF_MAGIC)
      throw new SAMFormatException("Does not seem like a BAM file");
  }
Exemplo n.º 2
0
  /**
   * Command-line entry point: guesses the first BAM record start at or after a given offset.
   *
   * <p>Usage: {@code BAMSplitGuesser path-or-uri header-path-or-uri [beg]}. Hadoop generic options
   * are consumed first; {@code beg} defaults to 0 and is parsed with {@link Long#decode}, so
   * decimal, hexadecimal and octal notations are accepted. The search window is {@code [beg, beg
   * + MAX_BYTES_READ)}.
   *
   * <p>Exit status: 1 if the Hadoop arguments cannot be parsed or no acceptable record is found, 2
   * on a usage error or an invalid {@code beg}.
   *
   * @param args command-line arguments, including any Hadoop generic options
   * @throws IOException if opening, reading or closing either stream fails
   */
  public static void main(String[] args) throws IOException {
    final GenericOptionsParser parser;
    try {
      parser = new GenericOptionsParser(args);

      // This should be IOException but Hadoop 0.20.2 doesn't throw it...
    } catch (Exception e) {
      System.err.printf("Error in Hadoop arguments: %s\n", e.getMessage());
      System.exit(1);

      // Never reached, but javac cannot know that System.exit does not
      // return and would otherwise consider parser possibly unassigned.
      return;
    }

    args = parser.getRemainingArgs();
    final Configuration conf = parser.getConfiguration();

    long beg = 0;

    if (args.length < 2 || args.length > 3) {
      System.err.println("Usage: BAMSplitGuesser path-or-uri header-path-or-uri [beg]");
      System.exit(2);
    }

    try {
      if (args.length > 2) beg = Long.decode(args[2]);
    } catch (NumberFormatException e) {
      System.err.println("Invalid beg offset.");
      if (e.getMessage() != null) System.err.println(e.getMessage());
      System.exit(2);
    }

    final long end = beg + MAX_BYTES_READ;
    final long g;

    SeekableStream ss = null;
    SeekableStream hs = null;
    try {
      ss = WrapSeekable.openPath(conf, new Path(args[0]));
      hs = WrapSeekable.openPath(conf, new Path(args[1]));

      System.out.printf(
          "Will look for a BGZF block within: [%1$#x,%2$#x) = [%1$d,%2$d)\n"
              + "Will then verify BAM data within:  [%1$#x,%3$#x) = [%1$d,%3$d)\n",
          beg, beg + 0xffff, end);

      g = new BAMSplitGuesser(ss, hs, conf).guessNextBAMRecordStart(beg, end);
    } finally {
      // Release both streams even if opening the second one or the guessing
      // itself failed; the nested finally makes sure hs is closed even when
      // closing ss throws.
      try {
        if (ss != null) ss.close();
      } finally {
        if (hs != null) hs.close();
      }
    }

    if (g == end) {
      System.out.println("Didn't find any acceptable BAM record in any BGZF block.");
      System.exit(1);
    }

    // g is a virtual offset: the upper bits hold the BGZF block position and
    // the low 16 bits the record offset inside the uncompressed block.
    System.out.printf(
        "Accepted BGZF block at offset %1$#x (%1$d).\n"
            + "Accepted BAM record at offset %2$#x (%2$d) therein.\n",
        g >> 16, g & 0xffff);
  }
Exemplo n.º 3
0
  /**
   * Decides from a stream's source string alone whether it looks like a BAM file. It's too
   * expensive to examine a remote file to determine its type, so only the ".bam" extension is
   * relied upon.
   *
   * @param strm the stream whose source string is examined
   * @return {@code true} if the source is {@code null}, or if a ".bam" extension is found in it
   *     (case-insensitively); {@code false} otherwise
   */
  public static boolean sourceLikeBam(final SeekableStream strm) {
    final String rawSource = strm.getSource();
    if (rawSource == null) {
      // assume any stream with a null source is a BAM file
      // (https://github.com/samtools/htsjdk/issues/619)
      return true;
    }

    // The source will typically be a file path or a URL.
    final URL parsed;
    try {
      parsed = new URL(rawSource);
    } catch (MalformedURLException notAUrl) {
      // Not parseable as a URL: look for a ".bam" extension in the raw string,
      // either at the very end or directly followed by a '?' / '&' delimiter
      // (the latter possibly percent-encoded as %26).
      final String lowered = rawSource.toLowerCase();
      return lowered.endsWith(".bam")
          || lowered.contains(".bam?")
          || lowered.contains(".bam&")
          || lowered.contains(".bam%26");
    }

    // For a URL, either the path must end in ".bam" ...
    if (parsed.getPath().toLowerCase().endsWith(".bam")) {
      return true;
    }

    // ... or one of the query parameters must name a BAM file.
    final String rawQuery = parsed.getQuery();
    if (rawQuery == null) {
      return false;
    }
    final String query = rawQuery.toLowerCase();
    return query.endsWith(".bam")
        || query.contains(".bam?")
        || query.contains(".bam&")
        || query.contains(".bam%26");
  }
Exemplo n.º 4
0
 /**
  * Reports the current position of the wrapped stream, narrowed to an {@code int}.
  *
  * @return the position reported by the wrapped stream, cast to {@code int}
  * @throws RuntimeIOException if the wrapped stream throws an {@link IOException}
  */
 @Override
 public int position() {
   final long current;
   try {
     current = in.position();
   } catch (final IOException ioe) {
     throw new RuntimeIOException(ioe);
   }
   return (int) current;
 }
Exemplo n.º 5
0
 /**
  * Moves the wrapped stream to the given position.
  *
  * @param position the position handed to the wrapped stream's {@code seek}
  * @throws RuntimeIOException if the wrapped stream throws an {@link IOException}
  */
 @Override
 public void seek(final int position) {
   try {
     in.seek(position);
   } catch (final IOException ioe) {
     // Surface the checked failure as the unchecked type this method uses.
     throw new RuntimeIOException(ioe);
   }
 }
Exemplo n.º 6
0
 /**
  * Closes the wrapped stream.
  *
  * @throws RuntimeIOException if the wrapped stream throws an {@link IOException} while closing
  */
 @Override
 public void close() {
   try {
     in.close();
   } catch (final IOException ioe) {
     // Surface the checked failure as the unchecked type this method uses.
     throw new RuntimeIOException(ioe);
   }
 }
Exemplo n.º 7
0
 /**
  * Skips exactly {@code count} bytes of the wrapped stream, calling its {@code skip} repeatedly
  * until the whole amount has been consumed.
  *
  * @param count the number of bytes to skip; nothing happens if it is not positive
  * @throws RuntimeIOException if a skip call makes no progress, or if the wrapped stream throws
  *     an {@link IOException}
  */
 @Override
 public void skipBytes(final int count) {
   int remaining = count;
   try {
     while (remaining > 0) {
       final int advanced = (int) in.skip(remaining);
       if (advanced <= 0) {
         throw new RuntimeIOException("Failed to skip " + remaining);
       }
       remaining -= advanced;
     }
   } catch (final IOException ioe) {
     throw new RuntimeIOException(ioe);
   }
 }
Exemplo n.º 8
0
  /**
   * Opens the file at {@code path}, seeks to {@code position}, reads the container header fields
   * found there and then hands the stream to {@code readBlocks} for the number of blocks the
   * header announced.
   *
   * <p>Only the block count is used; the other header fields are read solely to advance the
   * stream past them. The stream is closed before returning, whether or not reading succeeded.
   *
   * @param position the offset in the file at which the container header starts
   * @throws IOException if opening, seeking, reading or closing the stream fails
   */
  public void readContainerHeader(long position) throws IOException {

    final SeekableStream ss = IGVSeekableStreamFactory.getInstance().getStreamFor(path);
    try {
      ss.seek(position);

      final BufferedInputStream bis = new BufferedInputStream(ss);

      // The values below are not needed here, but each read must still happen
      // (and in this order) so that the stream ends up at the first block.
      CramInt.int32(bis); // length

      ITF8.readUnsignedITF8(bis); // refSeqId
      ITF8.readUnsignedITF8(bis); // startPos
      ITF8.readUnsignedITF8(bis); // alignmentSpan
      ITF8.readUnsignedITF8(bis); // nRecords
      ITF8.readUnsignedITF8(bis); // recordCounter
      ITF8.readUnsignedITF8(bis); // bases
      final int nBlocks = ITF8.readUnsignedITF8(bis);
      CramArray.array(bis); // landmarks
      if (major >= 3) {
        // From major version 3 on, a 32-bit checksum follows the landmarks.
        CramInt.int32(bis); // checksum
      }
      readBlocks(bis, nBlocks);
    } finally {
      // Closing the underlying stream is enough: the BufferedInputStream
      // wrapper only holds an in-memory buffer on top of it.
      ss.close();
    }
  }
Exemplo n.º 9
0
 /**
  * Continually reads from the provided {@link SeekableStream} into the buffer until the
  * specified number of bytes are read, or until the stream is exhausted, throwing a {@link
  * RuntimeIOException}.
  *
  * @param in the stream to read from
  * @param buffer the destination array
  * @param offset the index in {@code buffer} at which the first byte read is stored
  * @param length the exact number of bytes to read
  * @throws RuntimeIOException if the stream ends before {@code length} bytes were read, or if the
  *     stream throws an {@link IOException}
  */
 private static void readFully(
     final SeekableStream in, final byte[] buffer, final int offset, final int length) {
   int read = 0;
   while (read < length) {
     final int readThisLoop;
     try {
       // Write after the bytes already stored, starting from the caller's
       // offset rather than from the beginning of the buffer.
       readThisLoop = in.read(buffer, offset + read, length - read);
     } catch (final IOException e) {
       throw new RuntimeIOException(e);
     }
     if (readThisLoop == -1) break;
     read += readThisLoop;
   }
   if (read != length)
     throw new RuntimeIOException(
         "Expected to read " + length + " bytes, but expired stream after " + read + ".");
 }
Exemplo n.º 10
0
  /**
   * Finds a virtual BAM record position in the physical position range [beg,end). Returns end if no
   * BAM record was found.
   *
   * <p>At most {@code MAX_BYTES_READ} bytes starting at {@code beg} are copied from the input file
   * into memory and searched. As a side effect this replaces the {@code in} and {@code bgzf}
   * fields and points {@code bamCodec} at the new BGZF stream.
   *
   * @param beg physical offset in the input file at which to start looking
   * @param end physical offset (exclusive) bounding the search
   * @return the virtual offset {@code ((beg + blockPos) << 16) | offsetInBlock} of the accepted
   *     record, or {@code end} if none was found
   * @throws IOException if seeking or reading the input file fails
   */
  public long guessNextBAMRecordStart(long beg, long end) throws IOException {
    // Buffer what we need to go through.

    byte[] arr = new byte[MAX_BYTES_READ];

    // Clamp in long arithmetic before narrowing: casting (end - beg) to int
    // first would overflow for ranges larger than Integer.MAX_VALUE.
    final long span = end - beg;

    this.inFile.seek(beg);
    int totalRead = 0;
    for (int left = (int) Math.min(span, (long) arr.length); left > 0; ) {
      final int r = inFile.read(arr, totalRead, left);
      if (r < 0) break;
      totalRead += r;
      left -= r;
    }
    arr = Arrays.copyOf(arr, totalRead);

    this.in = new SeekableArrayStream(arr);

    this.bgzf = new BlockCompressedInputStream(this.in);
    this.bgzf.setCheckCrcs(true);

    this.bamCodec.setInputStream(bgzf);

    final int firstBGZFEnd = (int) Math.min(span, 0xffffL);

    // cp: Compressed Position, indexes the entire BGZF input.
    for (int cp = 0; ; ++cp) {
      final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd);
      if (psz == null) return end;

      final int cp0 = cp = psz.pos;
      final long cp0Virt = (long) cp0 << 16;
      try {
        bgzf.seek(cp0Virt);

        // This has to catch Throwable, because it's possible to get an
        // OutOfMemoryError due to an overly large size.
      } catch (Throwable e) {
        // Guessed BGZF position incorrectly: try the next guess.
        continue;
      }

      // up: Uncompressed Position, indexes the data inside the BGZF block.
      for (int up = 0; ; ++up) {
        final int up0 = up = guessNextBAMPos(cp0Virt, up, psz.size);

        if (up0 < 0) {
          // No BAM records found in the BGZF block: try the next BGZF
          // block.
          break;
        }

        // Verify that we can actually decode BLOCKS_NEEDED_FOR_GUESS worth
        // of records starting at (cp0,up0).
        bgzf.seek(cp0Virt | up0);
        boolean decodedAny = false;
        try {
          byte b = 0;
          int prevCP = cp0;
          while (b < BLOCKS_NEEDED_FOR_GUESS) {
            SAMRecord record = bamCodec.decode();
            if (record == null) {
              break;
            }
            record.getCigar(); // force decoding of CIGAR
            decodedAny = true;

            final int cp2 = (int) (bgzf.getFilePointer() >>> 16);
            if (cp2 != prevCP) {
              // The compressed position changed so we must be in a new
              // block.
              assert cp2 > prevCP;
              prevCP = cp2;
              ++b;
            }
          }

          // Running out of records to verify is fine as long as we
          // verified at least something. It should only happen if we
          // couldn't fill the array.
          if (b < BLOCKS_NEEDED_FOR_GUESS) {
            assert arr.length < MAX_BYTES_READ;
            if (!decodedAny) continue;
          }
        } catch (SAMFormatException e) {
          continue;
        } catch (FileTruncatedException e) {
          continue;
        } catch (OutOfMemoryError e) {
          continue;
        } catch (IllegalArgumentException e) {
          continue;
        } catch (RuntimeIOException e) {
          continue;
        } catch (RuntimeEOFException e) {
          // This can happen legitimately if the [beg,end) range is too
          // small to accommodate BLOCKS_NEEDED_FOR_GUESS and we get cut
          // off in the middle of a record. In that case, our stream
          // should have hit EOF as well. If we've then verified at least
          // something, go ahead with it and hope for the best.
          // NOTE(review): as written, the guess is rejected only when nothing
          // was decoded AND the stream is at EOF; confirm that accepting a
          // guess with nothing decoded and no EOF is really intended.
          if (!decodedAny && this.in.eof()) continue;
        }

        // Virtual offset: physical block position in the upper bits, offset
        // within the uncompressed block in the low 16 bits.
        return ((beg + cp0) << 16) | up0;
      }
    }
  }
Exemplo n.º 11
0
  // Searches the buffered input `in`, starting at offset p, for something
  // that looks like a BGZF block header: the 4-byte BGZF_MAGIC followed by an
  // extra-field area that contains a BGZF_MAGIC_SUB subfield and whose
  // subfield lengths add up exactly to XLEN.
  //
  // Gives the compressed size on the side. Returns null if it doesn't find
  // anything.
  // NOTE(review): the value returned as the size is the int read right after
  // the skipped compressed data and CRC-32. In the gzip/BGZF layout that
  // trailer field is the uncompressed size (ISIZE) - confirm which of the two
  // is meant.
  //
  // p:   offset in `in` at which the search starts.
  // end: bound for the magic-number scan. It is only checked after a failed
  //      probe has advanced p, so the very first probe is made regardless.
  //
  // Returns a PosSize holding the offset of the matched header and the size
  // value described above, or null if the scan reaches `end` without a match
  // or an IOException occurs.
  //
  // The return values of in.read() are not checked anywhere in this method,
  // so after a short read `buf` may still hold bytes from an earlier read.
  private PosSize guessNextBGZFPos(int p, int end) {
    try {
      for (; ; ) {
        // Scan forward until the four bytes at p equal BGZF_MAGIC.
        for (; ; ) {
          in.seek(p);
          in.read(buf.array(), 0, 4);
          int n = buf.getInt(0);

          if (n == BGZF_MAGIC) break;

          // Skip ahead a bit more than 1 byte if you can.
          // The comparisons check whether the upper three (resp. two) bytes of
          // n equal the lower three (resp. two) bytes of BGZF_MAGIC; p is
          // advanced by 1, 2 or 3 accordingly.
          if (n >>> 8 == BGZF_MAGIC << 8 >>> 8) ++p;
          else if (n >>> 16 == BGZF_MAGIC << 16 >>> 16) p += 2;
          else p += 3;

          if (p >= end) return null;
        }
        // Found what looks like a gzip block header: now get XLEN and
        // search for the BGZF subfield.
        final int p0 = p; // remember where the candidate header starts
        p += 10; // XLEN is read from offset 10 of the header
        in.seek(p);
        in.read(buf.array(), 0, 2);
        p += 2;
        final int xlen = getUShort(0);
        final int subEnd = p + xlen; // first offset past the extra-field area

        while (p < subEnd) {
          // No seek here: this read relies on the stream already being
          // positioned at p (right after the XLEN read, or after the seek at
          // the bottom of the previous iteration).
          in.read(buf.array(), 0, 4);

          if (buf.getInt(0) != BGZF_MAGIC_SUB) {
            // Some other subfield: skip its 4-byte header plus the payload
            // length stored at bytes 2..3 of that header.
            p += 4 + getUShort(2);
            in.seek(p);
            continue;
          }

          // Found it: this is close enough to a BGZF block, make it
          // our guess.

          // But find out the size before returning. First, grab bsize:
          // we'll need it later.
          in.read(buf.array(), 0, 2);
          int bsize = getUShort(0);

          // Then skip the rest of the subfields.
          p += BGZF_SUB_SIZE;
          while (p < subEnd) {
            in.seek(p);
            in.read(buf.array(), 0, 4);
            p += 4 + getUShort(2);
          }
          if (p != subEnd) {
            // Cancel our guess because the xlen field didn't match the
            // data.
            break;
          }

          // Now skip past the compressed data and the CRC-32.
          // p equals subEnd here; (bsize - xlen - 19) accounts for the
          // compressed data and the extra 4 for the CRC-32.
          p += bsize - xlen - 19 + 4;
          in.seek(p);
          in.read(buf.array(), 0, 4);
          return new PosSize(p0, buf.getInt(0));
        }
        // No luck: look for the next gzip block header. Start right after
        // where we last saw the identifiers, although we could probably
        // safely skip further ahead. (If we find the correct one right
        // now, the previous block contained 0x1f8b0804 bytes of data: that
        // seems... unlikely.)
        p = p0 + 4;
      }
    } catch (IOException e) {
      // Any I/O failure while probing is treated the same as finding nothing.
      return null;
    }
  }