예제 #1
0
  /**
   * Opens the BGZF file behind {@code genericSplit} and positions the line reader at the first
   * line that belongs to this split.
   *
   * <p>{@code start}, {@code end} and {@code pos} are kept as BGZF virtual offsets (physical
   * byte offset shifted left by 16 bits), which is the unit used by {@code bin.seek()} and
   * {@code bin.getFilePointer()}.
   *
   * @param genericSplit the split to read; must be a {@link FileSplit}
   * @param context task context supplying the job configuration
   * @throws IOException if the file cannot be opened, sought or read
   */
  public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    Configuration conf = ContextUtil.getConfiguration(context);
    this.maxLineLength = conf.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);

    FileSplit split = (FileSplit) genericSplit;
    // Both boundaries must be derived from the *physical* start. Computing the end from the
    // already-shifted start would shift the start twice and push the end far past the split.
    final long physicalStart = split.getStart();
    start = physicalStart << 16;
    end = (physicalStart + split.getLength()) << 16;

    final Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);

    bin =
        new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(
                fs.open(file), fs.getFileStatus(file).getLen(), file));

    in = new LineReader(bin, conf);

    if (start != 0) {
      bin.seek(start);

      // Skip first line: it is discarded here and the split's effective start becomes the
      // position right after it.
      in.readLine(new Text());
      start = bin.getFilePointer();
    }
    this.pos = start;
  }
예제 #2
0
  /**
   * Prepares this reader for the given virtual split: reads the SAM header, opens a BGZF stream
   * over the file, seeks to the split's starting virtual offset and resets pairing and interval
   * state.
   *
   * @param spl the split to read; must be a {@code FileVirtualSplit}
   * @param ctx task context supplying the configuration
   * @throws IOException if the file cannot be opened, read or sought
   */
  @Override
  public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    // The Hadoop API says initialize() is called only once, but some callers invoke it
    // themselves while Hadoop-BAM's own code relies on {@link BAMInputFormat} doing so when the
    // reader is created. Until that is settled, a repeated call first closes the previous state.
    if (isInitialized) {
      close();
    }
    isInitialized = true;

    final Configuration conf = ctx.getConfiguration();
    final FileVirtualSplit virtualSplit = (FileVirtualSplit) spl;
    final Path bamPath = virtualSplit.getPath();
    final FileSystem fileSystem = bamPath.getFileSystem(conf);

    this.stringency = SAMHeaderReader.getValidationStringency(conf);

    // The header is read from the raw stream first; the stream is then rewound so the BGZF
    // wrapper starts from the beginning of the file.
    final FSDataInputStream rawStream = fileSystem.open(bamPath);
    final SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(rawStream, conf);
    codec = new BAMRecordCodec(header);

    rawStream.seek(0);
    final long fileLength = fileSystem.getFileStatus(bamPath).getLen();
    bci =
        new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(rawStream, fileLength, bamPath));

    virtualStart = virtualSplit.getStartVirtualOffset();
    virtualEnd = virtualSplit.getEndVirtualOffset();
    // Upper bits of the virtual offset, used as the file position by getProgress().
    fileStart = virtualStart >>> 16;

    bci.seek(virtualStart);
    codec.setInputStream(bci);

    if (BAMInputFormat.DEBUG_BAM_SPLITTER) {
      System.err.println(
          "XXX inizialized BAMRecordReader byte offset: "
              + fileStart
              + " record offset: "
              + (virtualStart & 0xffff));
    }

    // Pairs are only kept together for queryname-sorted input, and only when requested.
    final boolean sortedByQueryName = SortOrder.queryname.equals(header.getSortOrder());
    keepReadPairsTogether =
        sortedByQueryName
            && conf.getBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, false);
    readPair = false;
    lastOfPair = false;

    intervals = BAMInputFormat.getIntervals(conf);
    if (intervals != null) {
      overlapDetector = new OverlapDetector<>(0, 0);
      overlapDetector.addAll(intervals, intervals);
    }
  }
예제 #3
0
  /**
   * Reads the next line of the split into {@code value}.
   *
   * <p>A read that consumes {@code maxLineLength} bytes or more is treated as an over-long line
   * and skipped; reading continues with the following line.
   *
   * @return {@code true} if a line was read, {@code false} once the position has passed
   *     {@code end} or the reader reports that zero bytes were consumed
   * @throws IOException if the underlying stream fails
   */
  public boolean nextKeyValue() throws IOException {
    for (; ; ) {
      if (pos > end) {
        return false;
      }

      final int bytesConsumed = in.readLine(value, maxLineLength);
      if (bytesConsumed == 0) {
        return false;
      }

      pos = bin.getFilePointer();
      if (bytesConsumed < maxLineLength) {
        return true;
      }
      // Otherwise the line was too long: drop it and try the next one.
    }
  }
예제 #4
0
 /**
  * Estimates how far through the split the reader is. Until the end is reached, the estimate is
  * based solely on the file-position part of the virtual offset (its upper bits); the offset
  * within the current block is ignored.
  *
  * @return 1 once the virtual position has reached {@code virtualEnd}, otherwise a fraction
  *     strictly below 1
  */
 @Override
 public float getProgress() {
   final long currentVirtual = bci.getFilePointer();
   if (currentVirtual >= virtualEnd) {
     return 1;
   }

   final long consumed = (currentVirtual >>> 16) - fileStart;
   // The extra 1 keeps the ratio below 1 even when the current file position already equals
   // the end file position.
   final long span = (virtualEnd >>> 16) - fileStart + 1;
   return (float) consumed / span;
 }
예제 #5
0
  /**
   * Copies a block-compressed VCF file line by line to a new block-compressed file, replacing
   * every {@code '|'} with {@code '/'}. Each line is trimmed and lines that are empty after
   * trimming are dropped.
   *
   * <p>Input and output paths are hard-coded. Any failure is reported via a stack trace.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    final String inFile =
        "/psychipc01/disk2/references/1000Genome/release/20130502_v5a/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz";
    final String outFile =
        "/psychipc01/disk2/references/1000Genome/release/20130502_v5a/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes1.vcf.gz";

    // try-with-resources closes both streams even when reading or writing fails, so the output
    // stream is always closed. Resources close in reverse order: writer first, then reader,
    // as before.
    try (BlockCompressedInputStream br = new BlockCompressedInputStream(new File(inFile));
        BlockCompressedOutputStream bw = new BlockCompressedOutputStream(new File(outFile))) {
      // Platform default charset, unchanged from the previous behavior.
      final byte[] newline = "\n".getBytes();

      String line;
      while ((line = br.readLine()) != null) {
        line = line.trim();
        if (line.isEmpty()) {
          continue;
        }

        bw.write(line.replace('|', '/').getBytes());
        bw.write(newline);
      }
    } catch (Exception ex) {
      ex.printStackTrace();
    }
  }
예제 #6
0
 /**
  * Checks whether a stream looks like a BAM file: it must pass the block-compressed validity
  * check and its first four decompressed bytes must equal the BAM magic number. The stream is
  * marked before the probe read and reset afterwards.
  *
  * @param stream stream.markSupported() must be true
  * @return true if this looks like a BAM file.
  * @throws IOException if reading from or resetting the stream fails
  */
 public static boolean isBAMFile(final InputStream stream) throws IOException {
   if (!BlockCompressedInputStream.isValidFile(stream)) {
     return false;
   }

   // Buffer up to one maximum-size compressed block, then rewind the caller's stream.
   final int probeSize = BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE;
   stream.mark(probeSize);
   final byte[] firstBlock = new byte[probeSize];
   readBytes(stream, firstBlock, 0, probeSize);
   stream.reset();

   // Decompress from the in-memory copy and compare the leading bytes with the magic number.
   try (final BlockCompressedInputStream decompressed =
       new BlockCompressedInputStream(new ByteArrayInputStream(firstBlock))) {
     final byte[] magic = new byte[4];
     if (readBytes(decompressed, magic, 0, 4) != BAMFileConstants.BAM_MAGIC.length) {
       return false;
     }
     return Arrays.equals(BAMFileConstants.BAM_MAGIC, magic);
   }
 }
예제 #7
0
  /**
   * Advances to the next record that passes the {@code overlaps} check, storing it in
   * {@code record} and its key in {@code key}.
   *
   * <p>Reading normally stops once the stream position reaches {@code virtualEnd}. When read
   * pairs are being kept together, it continues past that point until the last read of the
   * current pair has been seen.
   *
   * @return {@code true} if a record was stored, {@code false} when the split is exhausted or
   *     the codec returns no further record
   */
  @Override
  public boolean nextKeyValue() {
    for (; ; ) {
      // Position *before* decoding: used both for the end-of-split test and to detect the
      // very first record of the split.
      final long virtPos = bci.getFilePointer();
      final boolean insideSplit = virtPos < virtualEnd;
      final boolean pairStillOpen = keepReadPairsTogether && readPair && !lastOfPair;
      if (!insideSplit && !pairStillOpen) {
        return false;
      }

      final SAMRecord rec = codec.decode();
      if (rec == null) {
        return false;
      }

      // Records come straight from a BAMRecordCodec, so the validation stringency has to be
      // applied here.
      if (this.stringency != null) {
        rec.setValidationStringency(this.stringency);
      }

      readPair = rec.getReadPairedFlag();
      if (readPair) {
        final boolean isFirst = rec.getFirstOfPairFlag();
        final boolean isSecond = rec.getSecondOfPairFlag();
        // Per the SAM spec (section 1.4) a template may have more than two segments, in which
        // case both flags are set; such reads count as neither first nor last of a pair.
        lastOfPair = isSecond && !isFirst;
        final boolean startsPair = isFirst && !isSecond;

        // A read at the very start of the split that does not open a pair was already
        // returned by the previous split, so it is skipped.
        final boolean atSplitStart = virtPos == virtualStart;
        if (atSplitStart && keepReadPairsTogether && !startsPair) {
          continue;
        }
      }

      if (!overlaps(rec)) {
        continue;
      }

      key.set(getKey(rec));
      record.set(rec);
      return true;
    }
  }
    /**
     * Parses one interval of lines starting at {@code myStartSite}, caches the result under
     * {@code myProcessBlock}, and then parses and caches up to
     * {@code NUM_LOOK_AHEAD_BLOCKS - 1} following blocks. Look-ahead stops early when a block
     * is already cached or already registered in {@code myCurrentlyProcessingBlocks}.
     *
     * <p>Futures found in {@code myFutureQueue} for the parsed sites are completed with the
     * parsed bytes. The reader obtained from {@code getReader()} is always added back to
     * {@code myReaders} when the method exits.
     *
     * <p>NOTE(review): on an exception the error is only logged. Futures still waiting in
     * {@code myFutureQueue} are not completed here, and the block id is not removed from
     * {@code myCurrentlyProcessingBlocks} -- confirm that callers cope with this.
     */
    @Override
    public void run() {

      // Nothing to do when the starting site is beyond the last site.
      if (myStartSite >= mySiteCount) {
        return;
      }

      BlockCompressedInputStream reader = getReader();
      try {

        // Jump to the virtual offset recorded in the index for this task's seek index.
        reader.seek(myIndex.virtualOffset(mySeekIndex));

        // The final interval may hold fewer lines than a full interval.
        int numSites = Math.min(myNumLinesPerInterval, mySiteCount - myStartSite);
        byte[][] result = new byte[numSites][];
        for (int i = 0; i < numSites; i++) {
          result[i] = parseLine(reader.readLine(), myTaxaCount, myStartSite + i, myIsOneLetter);
          // Complete a waiting future for this site as soon as its line is parsed.
          CompletableFuture<byte[]> future = myFutureQueue.remove(myStartSite + i);
          if (future != null) {
            future.complete(result[i]);
          }
        }
        myGenoCache.put(myProcessBlock, result);
        // Read the entry back immediately to prevent early eviction from the cache.
        myGenoCache.getIfPresent(myProcessBlock);
        myCurrentlyProcessingBlocks.remove(myProcessBlock);
        // Second pass over the same sites: presumably picks up futures that were queued while
        // the block was still being parsed -- TODO confirm.
        for (int i = 0; i < numSites; i++) {
          CompletableFuture<byte[]> future = myFutureQueue.remove(myStartSite + i);
          if (future != null) {
            future.complete(result[i]);
          }
        }
        myStartSite += myNumLinesPerInterval;
        if (myStartSite >= mySiteCount) {
          return;
        }

        // Look-ahead: keep reading sequentially from the current reader position.
        for (int b = 1; b < NUM_LOOK_AHEAD_BLOCKS; b++) {

          // Stop if the next block is already cached ...
          if (myGenoCache.getIfPresent(myProcessBlock + b) != null) {
            return;
          }
          // ... or is already registered as being processed (add() returned false).
          if (!myCurrentlyProcessingBlocks.add(myProcessBlock + b)) {
            return;
          }

          numSites = Math.min(myNumLinesPerInterval, mySiteCount - myStartSite);
          result = new byte[numSites][];
          for (int i = 0; i < numSites; i++) {
            result[i] = parseLine(reader.readLine(), myTaxaCount, myStartSite + i, myIsOneLetter);
          }
          myGenoCache.put(myProcessBlock + b, result);
          // Read the entry back immediately to prevent early eviction from the cache.
          myGenoCache.getIfPresent(myProcessBlock + b);
          myCurrentlyProcessingBlocks.remove(myProcessBlock + b);
          // Complete any futures waiting on sites of the look-ahead block.
          for (int i = 0; i < numSites; i++) {
            CompletableFuture<byte[]> future = myFutureQueue.remove(myStartSite + i);
            if (future != null) {
              future.complete(result[i]);
            }
          }
          myStartSite += myNumLinesPerInterval;
          if (myStartSite >= mySiteCount) {
            return;
          }
        }

      } catch (Exception e) {
        myLogger.error(e.getMessage(), e);
      } finally {
        // Hand the reader back on every exit path, including the early returns above.
        myReaders.add(reader);
      }
    }
예제 #9
0
 /**
  * Closes the underlying block-compressed stream, if one has been opened.
  *
  * <p>The stream may still be {@code null} when this is called before initialization has
  * assigned it; in that case the call is a no-op instead of throwing a
  * {@link NullPointerException}.
  *
  * @throws IOException if closing the stream fails
  */
 @Override
 public void close() throws IOException {
   if (bci != null) {
     bci.close();
   }
 }
예제 #10
0
  /**
   * Finds a virtual BAM record position in the physical position range [beg,end). Returns end if no
   * BAM record was found.
   *
   * <p>At most {@code MAX_BYTES_READ} bytes starting at {@code beg} are buffered in memory.
   * Candidate BGZF block starts are guessed within that buffer, candidate record starts are
   * guessed within each block, and a candidate is accepted once records decode cleanly from it
   * across {@code BLOCKS_NEEDED_FOR_GUESS} block boundaries (or until the buffered data runs
   * out, provided at least one record decoded).
   *
   * @param beg physical offset at which to start searching (inclusive)
   * @param end physical offset at which to stop searching (exclusive)
   * @return the virtual offset of the guessed record start, or {@code end} if none was found
   * @throws IOException if seeking or reading the underlying file fails
   */
  public long guessNextBAMRecordStart(long beg, long end) throws IOException {
    // Buffer what we need to go through.

    byte[] arr = new byte[MAX_BYTES_READ];

    this.inFile.seek(beg);
    int totalRead = 0;
    // Take the minimum in long arithmetic before narrowing: casting (end - beg) to int first
    // would wrap for ranges larger than Integer.MAX_VALUE.
    for (int left = (int) Math.min(end - beg, (long) arr.length); left > 0; ) {
      final int r = inFile.read(arr, totalRead, left);
      if (r < 0) break;
      totalRead += r;
      left -= r;
    }
    arr = Arrays.copyOf(arr, totalRead);

    this.in = new SeekableArrayStream(arr);

    this.bgzf = new BlockCompressedInputStream(this.in);
    this.bgzf.setCheckCrcs(true);

    this.bamCodec.setInputStream(bgzf);

    // Same overflow-safe narrowing as above.
    final int firstBGZFEnd = (int) Math.min(end - beg, 0xffffL);

    // cp: Compressed Position, indexes the entire BGZF input.
    for (int cp = 0; ; ++cp) {
      final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd);
      if (psz == null) return end;

      final int cp0 = cp = psz.pos;
      final long cp0Virt = (long) cp0 << 16;
      try {
        bgzf.seek(cp0Virt);

        // This has to catch Throwable, because it's possible to get an
        // OutOfMemoryError due to an overly large size.
      } catch (Throwable e) {
        // Guessed BGZF position incorrectly: try the next guess.
        continue;
      }

      // up: Uncompressed Position, indexes the data inside the BGZF block.
      for (int up = 0; ; ++up) {
        final int up0 = up = guessNextBAMPos(cp0Virt, up, psz.size);

        if (up0 < 0) {
          // No BAM records found in the BGZF block: try the next BGZF
          // block.
          break;
        }

        // Verify that we can actually decode BLOCKS_NEEDED_FOR_GUESS worth
        // of records starting at (cp0,up0).
        bgzf.seek(cp0Virt | up0);
        boolean decodedAny = false;
        try {
          byte b = 0;
          int prevCP = cp0;
          while (b < BLOCKS_NEEDED_FOR_GUESS) {
            SAMRecord record = bamCodec.decode();
            if (record == null) {
              break;
            }
            record.getCigar(); // force decoding of CIGAR
            decodedAny = true;

            final int cp2 = (int) (bgzf.getFilePointer() >>> 16);
            if (cp2 != prevCP) {
              // The compressed position changed so we must be in a new
              // block.
              assert cp2 > prevCP;
              prevCP = cp2;
              ++b;
            }
          }

          // Running out of records to verify is fine as long as we
          // verified at least something. It should only happen if we
          // couldn't fill the array.
          if (b < BLOCKS_NEEDED_FOR_GUESS) {
            assert arr.length < MAX_BYTES_READ;
            if (!decodedAny) continue;
          }
        } catch (SAMFormatException e) {
          continue;
        } catch (FileTruncatedException e) {
          continue;
        } catch (OutOfMemoryError e) {
          continue;
        } catch (IllegalArgumentException e) {
          continue;
        } catch (RuntimeIOException e) {
          continue;
        } catch (RuntimeEOFException e) {
          // This can happen legitimately if the [beg,end) range is too
          // small to accommodate BLOCKS_NEEDED_FOR_GUESS and we get cut
          // off in the middle of a record. In that case, our stream
          // should have hit EOF as well. If we've then verified at least
          // something, go ahead with it and hope for the best.
          if (!decodedAny && this.in.eof()) continue;
        }

        // Virtual offset: physical block address in the upper bits, offset within the
        // uncompressed block in the low 16 bits.
        return ((beg + cp0) << 16) | up0;
      }
    }
  }
예제 #11
0
  /**
   * Scans the BGZF block at {@code cpVirt} for an offset that plausibly starts a BAM record.
   *
   * <p>The scan probes the field at record offset [4], so the first probe is at {@code up + 4}
   * and any hit is a record start at {@code up} or later. A candidate is accepted when all of
   * the following hold:
   *
   * <ul>
   *   <li>[4]/[8] and [24]/[28] look like a reference id and position (id within
   *       {@code [-1, referenceSequenceCount]}, position at least -1);
   *   <li>the read name length ([12] &amp; 0xff) is at least one and the byte at the end of the
   *       name, {@code [36 + nameLength - 1]}, lies inside the block and is zero;
   *   <li>[0] is at least {@code 4*8 + nameLength + 4*([16]&0xffff) + [20] + ([20]+1)/2}.
   * </ul>
   *
   * @param cpVirt virtual offset of the start of the BGZF block
   * @param up uncompressed offset within the block at which to start looking
   * @param cSize size bound for offsets within the block
   * @return the uncompressed offset of the guessed record start, or -1 if none was found or an
   *     {@link IOException} occurred while reading
   */
  private int guessNextBAMPos(long cpVirt, int up, int cSize) {
    try {
      // Every rejected candidate simply moves the probe forward by one byte.
      for (int field4 = up + 4; field4 + SHORTEST_POSSIBLE_BAM_RECORD - 4 < cSize; ++field4) {
        // Reference id and 0-based leftmost coordinate of this read, assuming the probe sits
        // on [4] (it could also be [24], the mate's id, at the start of a split).
        bgzf.seek(cpVirt | field4);
        bgzf.read(buf.array(), 0, 8);
        final int refId = buf.getInt(0);
        final int refPos = buf.getInt(4);
        final boolean readFieldsOk =
            refId >= -1 && refId <= referenceSequenceCount && refPos >= -1;
        if (!readFieldsOk) {
          continue;
        }

        // Assuming [4] was right, the mate's id and position at [24]/[28] must be valid too.
        bgzf.seek(cpVirt | field4 + 20);
        bgzf.read(buf.array(), 0, 8);
        final int mateRefId = buf.getInt(0);
        final int mateRefPos = buf.getInt(4);
        final boolean mateFieldsOk =
            mateRefId >= -1 && mateRefId <= referenceSequenceCount && mateRefPos >= -1;
        if (!mateFieldsOk) {
          continue;
        }

        // From here on work relative to [0] to keep the offsets easy to follow.
        final int recordStart = field4 - 4;

        bgzf.seek(cpVirt | recordStart + 12);
        bgzf.read(buf.array(), 0, 4);
        final int nameLength = buf.getInt(0) & 0xff;
        if (nameLength < 1) {
          // Names are null-terminated so the length must be at least one.
          continue;
        }

        final int nameEnd = recordStart + 36 + nameLength - 1;
        if (nameEnd >= cSize) {
          // This record cannot fit here, but another one might fit in the remaining space.
          continue;
        }

        // The read name must be null terminated.
        bgzf.seek(cpVirt | nameEnd);
        bgzf.read(buf.array(), 0, 1);
        if (buf.get(0) != 0) {
          continue;
        }

        // Finally [0], the length of the *remainder* of the record (hence 4*8 rather than
        // 4*9), must be large enough for the fixed fields, the name, the CIGAR operations
        // and the sequence plus qualities.
        bgzf.seek(cpVirt | recordStart + 16);
        bgzf.read(buf.array(), 0, 8);
        final int cigarOpCount = buf.getInt(0) & 0xffff;
        final int seqLength = buf.getInt(4);
        final int minRemainder =
            4 * 8 + nameLength + cigarOpCount * 4 + seqLength + (seqLength + 1) / 2;

        bgzf.seek(cpVirt | recordStart);
        bgzf.read(buf.array(), 0, 4);
        if (buf.getInt(0) < minRemainder) {
          continue;
        }

        return recordStart;
      }
    } catch (IOException e) {
      // A read failure is treated the same as finding no record: fall through to -1.
    }
    return -1;
  }