@Override
  public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    // Prepares this reader for one split: reads the SAM header from the split's
    // file, wraps the file in a BlockCompressedInputStream positioned at the
    // split's start virtual offset, and resets the pairing / interval state.
    //
    // This method should only be called once (see Hadoop API). However,
    // there seems to be disagreement between implementations that call
    // initialize() and Hadoop-BAM's own code that relies on
    // {@link BAMInputFormat} to call initialize() when the reader is
    // created. Therefore we add this check for the time being.
    if (isInitialized) close();
    isInitialized = true;

    final Configuration conf = ctx.getConfiguration();

    // The split is expected to be a FileVirtualSplit; any other type fails the cast.
    final FileVirtualSplit split = (FileVirtualSplit) spl;
    final Path file = split.getPath();
    final FileSystem fs = file.getFileSystem(conf);

    this.stringency = SAMHeaderReader.getValidationStringency(conf);

    final FSDataInputStream in = fs.open(file);
    final SAMFileHeader header;
    try {
      header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
      codec = new BAMRecordCodec(header);

      // Reading the header advanced the raw stream; rewind before wrapping it.
      in.seek(0);
      bci =
          new BlockCompressedInputStream(
              new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file));

      virtualStart = split.getStartVirtualOffset();

      // The bits above the low 16 of the virtual offset are the byte offset in the file.
      fileStart = virtualStart >>> 16;
      virtualEnd = split.getEndVirtualOffset();

      bci.seek(virtualStart);
      codec.setInputStream(bci);
    } catch (IOException | RuntimeException e) {
      // Do not leak the open file handle when any setup step fails; the
      // original failure stays the primary exception.
      try {
        in.close();
      } catch (IOException closeFailure) {
        e.addSuppressed(closeFailure);
      }
      throw e;
    }

    if (BAMInputFormat.DEBUG_BAM_SPLITTER) {
      final long recordStart = virtualStart & 0xffff;
      System.err.println(
          "XXX inizialized BAMRecordReader byte offset: "
              + fileStart
              + " record offset: "
              + recordStart);
    }

    // Pairs are only kept together when the header says the file is
    // queryname-sorted and the configuration asks for it.
    keepReadPairsTogether =
        SortOrder.queryname.equals(header.getSortOrder())
            && conf.getBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, false);
    readPair = false;
    lastOfPair = false;
    intervals = BAMInputFormat.getIntervals(conf);
    if (intervals != null) {
      // The same list is passed as both arguments, so each interval is registered under itself.
      overlapDetector = new OverlapDetector<>(0, 0);
      overlapDetector.addAll(intervals, intervals);
    }
  }
Beispiel #2
0
  /**
   * Builds a SAMFileWriter configured the way GATK tools commonly need it.
   *
   * <p>Index creation is only honoured when the header declares coordinate sort order; for any
   * other sort order a warning is logged and the writer is created without an index. The writer
   * itself is obtained from {@code ReadUtils.createCommonSAMWriterFromFactory}.
   *
   * @param outputFile - destination file; must not be null. A .cram extension means a reference
   *     is required.
   * @param referenceFile - reference source to use; must not be null when the output file has a
   *     .cram extension.
   * @param header - header for the output writer; must not be null
   * @param preSorted - if true, the records must already be sorted to match the header sort order
   * @param createOutputBamIndex - if true, request an index for .BAM and .CRAM files (ignored
   *     with a warning unless the header is coordinate sorted)
   * @param createMD5 - if true, an MD5 file will be created
   * @return SAMFileWriter
   */
  public static SAMFileWriter createCommonSAMWriter(
      final File outputFile,
      final File referenceFile,
      final SAMFileHeader header,
      final boolean preSorted,
      boolean createOutputBamIndex,
      final boolean createMD5) {
    Utils.nonNull(outputFile);
    Utils.nonNull(header);

    // An index was requested but the sort order makes it impossible to build one.
    final boolean indexRequestRejected =
        createOutputBamIndex && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate;
    if (indexRequestRejected) {
      final String outputPath = outputFile.getAbsolutePath();
      logger.warn(
          "Skipping index file creation for: "
              + outputPath
              + ". Index file creation requires reads in coordinate sorted order.");
    }
    final boolean writeIndex = createOutputBamIndex && !indexRequestRejected;

    final SAMFileWriterFactory writerFactory =
        new SAMFileWriterFactory()
            .setCreateIndex(writeIndex)
            .setCreateMd5File(createMD5);

    return ReadUtils.createCommonSAMWriterFromFactory(
        writerFactory, outputFile, referenceFile, header, preSorted);
  }