Ejemplo n.º 1
0
  @Override
  public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    // This method should only be called once (see Hadoop API). However,
    // there seems to be disagreement between implementations that call
    // initialize() and Hadoop-BAM's own code that relies on
    // {@link BAMInputFormat} to call initialize() when the reader is
    // created. Therefore we add this check for the time being.
    if (isInitialized) close();
    isInitialized = true;

    final Configuration conf = ctx.getConfiguration();

    final FileVirtualSplit split = (FileVirtualSplit) spl;
    final Path file = split.getPath();
    final FileSystem fs = file.getFileSystem(conf);

    this.stringency = SAMHeaderReader.getValidationStringency(conf);

    final FSDataInputStream in = fs.open(file);

    final SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
    codec = new BAMRecordCodec(header);

    in.seek(0);
    bci =
        new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file));

    virtualStart = split.getStartVirtualOffset();

    fileStart = virtualStart >>> 16;
    virtualEnd = split.getEndVirtualOffset();

    bci.seek(virtualStart);
    codec.setInputStream(bci);

    if (BAMInputFormat.DEBUG_BAM_SPLITTER) {
      final long recordStart = virtualStart & 0xffff;
      System.err.println(
          "XXX inizialized BAMRecordReader byte offset: "
              + fileStart
              + " record offset: "
              + recordStart);
    }

    keepReadPairsTogether =
        SortOrder.queryname.equals(header.getSortOrder())
            && conf.getBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, false);
    readPair = false;
    lastOfPair = false;
    intervals = BAMInputFormat.getIntervals(conf);
    if (intervals != null) {
      overlapDetector = new OverlapDetector<>(0, 0);
      overlapDetector.addAll(intervals, intervals);
    }
  }
Ejemplo n.º 2
0
  @Override
  public boolean nextKeyValue() {
    long virtPos;
    while ((virtPos = bci.getFilePointer()) < virtualEnd
        || (keepReadPairsTogether && readPair && !lastOfPair)) {

      final SAMRecord r = codec.decode();
      if (r == null) return false;

      // Since we're reading from a BAMRecordCodec directly we have to set the
      // validation stringency ourselves.
      if (this.stringency != null) r.setValidationStringency(this.stringency);

      readPair = r.getReadPairedFlag();
      if (readPair) {
        boolean first = r.getFirstOfPairFlag(), second = r.getSecondOfPairFlag();
        // According to the SAM spec (section 1.4) it is possible for pairs to have
        // multiple segments (i.e. more than two), in which case both `first` and
        // `second` will be true.
        boolean firstOfPair = first && !second;
        lastOfPair = !first && second;
        // ignore any template that is not first in a pair right at the start of a split
        // since it will have been returned in the previous split
        if (virtPos == virtualStart && keepReadPairsTogether && !firstOfPair) {
          continue;
        }
      }

      if (!overlaps(r)) {
        continue;
      }

      key.set(getKey(r));
      record.set(r);
      return true;
    }
    return false;
  }
Ejemplo n.º 3
0
  /**
   * Finds a virtual BAM record position in the physical position range [beg,end). Returns end if no
   * BAM record was found.
   */
  public long guessNextBAMRecordStart(long beg, long end) throws IOException {
    // Buffer what we need to go through.

    byte[] arr = new byte[MAX_BYTES_READ];

    this.inFile.seek(beg);
    int totalRead = 0;
    for (int left = Math.min((int) (end - beg), arr.length); left > 0; ) {
      final int r = inFile.read(arr, totalRead, left);
      if (r < 0) break;
      totalRead += r;
      left -= r;
    }
    arr = Arrays.copyOf(arr, totalRead);

    this.in = new SeekableArrayStream(arr);

    this.bgzf = new BlockCompressedInputStream(this.in);
    this.bgzf.setCheckCrcs(true);

    this.bamCodec.setInputStream(bgzf);

    final int firstBGZFEnd = Math.min((int) (end - beg), 0xffff);

    // cp: Compressed Position, indexes the entire BGZF input.
    for (int cp = 0; ; ++cp) {
      final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd);
      if (psz == null) return end;

      final int cp0 = cp = psz.pos;
      final long cp0Virt = (long) cp0 << 16;
      try {
        bgzf.seek(cp0Virt);

        // This has to catch Throwable, because it's possible to get an
        // OutOfMemoryError due to an overly large size.
      } catch (Throwable e) {
        // Guessed BGZF position incorrectly: try the next guess.
        continue;
      }

      // up: Uncompressed Position, indexes the data inside the BGZF block.
      for (int up = 0; ; ++up) {
        final int up0 = up = guessNextBAMPos(cp0Virt, up, psz.size);

        if (up0 < 0) {
          // No BAM records found in the BGZF block: try the next BGZF
          // block.
          break;
        }

        // Verify that we can actually decode BLOCKS_NEEDED_FOR_GUESS worth
        // of records starting at (cp0,up0).
        bgzf.seek(cp0Virt | up0);
        boolean decodedAny = false;
        try {
          byte b = 0;
          int prevCP = cp0;
          while (b < BLOCKS_NEEDED_FOR_GUESS) {
            SAMRecord record = bamCodec.decode();
            if (record == null) {
              break;
            }
            record.getCigar(); // force decoding of CIGAR
            decodedAny = true;

            final int cp2 = (int) (bgzf.getFilePointer() >>> 16);
            if (cp2 != prevCP) {
              // The compressed position changed so we must be in a new
              // block.
              assert cp2 > prevCP;
              prevCP = cp2;
              ++b;
            }
          }

          // Running out of records to verify is fine as long as we
          // verified at least something. It should only happen if we
          // couldn't fill the array.
          if (b < BLOCKS_NEEDED_FOR_GUESS) {
            assert arr.length < MAX_BYTES_READ;
            if (!decodedAny) continue;
          }
        } catch (SAMFormatException e) {
          continue;
        } catch (FileTruncatedException e) {
          continue;
        } catch (OutOfMemoryError e) {
          continue;
        } catch (IllegalArgumentException e) {
          continue;
        } catch (RuntimeIOException e) {
          continue;
        } catch (RuntimeEOFException e) {
          // This can happen legitimately if the [beg,end) range is too
          // small to accommodate BLOCKS_NEEDED_FOR_GUESS and we get cut
          // off in the middle of a record. In that case, our stream
          // should have hit EOF as well. If we've then verified at least
          // something, go ahead with it and hope for the best.
          if (!decodedAny && this.in.eof()) continue;
        }

        return beg + cp0 << 16 | up0;
      }
    }
  }