Ejemplo n.º 1
0
/**
 * Command line program that prints statistics from a BAM index (.bai) file. Statistics include the
 * count of aligned and unaligned reads for each reference sequence and a count of all records with
 * no start coordinate. Similar to the 'samtools idxstats' command.
 *
 * @author Martha Borkan
 */
@CommandLineProgramProperties(
    usage =
        "Generates BAM index statistics, including the number of aligned and unaligned SAMRecords for each reference sequence, "
            + "and the number of SAMRecords with no coordinate. "
            + "Input BAM file must have a corresponding index file.\n",
    usageShort = "Generates index statistics from a BAM file",
    programGroup = SamOrBam.class)
public class BamIndexStats extends CommandLineProgram {

  private static final Log log = Log.getInstance(BamIndexStats.class);

  /** The BAM file whose index statistics are printed (the BAM itself, not the .bai file). */
  @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "A BAM file to process.")
  public File INPUT;

  /**
   * Stock main method for a command line program.
   *
   * @param argv command line arguments, handed to {@code instanceMain}
   */
  public static void main(final String[] argv) {
    System.exit(new BamIndexStats().instanceMain(argv));
  }

  /**
   * Main method for the program. Warns when the input looks like an index file, asserts that the
   * input file is readable, then prints the index statistics for it.
   *
   * @return always 0; failures are expected to surface as exceptions from the helpers called here
   */
  protected int doWork() {
    // Only a warning: processing continues even when the name carries the index suffix.
    if (INPUT.getName().endsWith(BAMIndex.BAMIndexSuffix)) {
      log.warn("INPUT should be BAM file not index file");
    }
    IOUtil.assertFileIsReadable(INPUT);
    BAMIndexMetaData.printIndexStats(INPUT);

    return 0;
  }
}
Ejemplo n.º 2
0
/**
 * Reads CRAM containers from an input stream and hands every slice of every container to a {@link
 * CRAMIndexer}, recording for each slice the stream offset of its container and its position
 * within that container.
 */
class BaiIndexer {
  private static Log log = Log.getInstance(BaiIndexer.class);

  /** Counting wrapper around the input; its byte count supplies the container offsets. */
  public CountingInputStream is;

  /** Header handed to the indexer; either supplied by the caller or read from the stream. */
  public SAMFileHeader samFileHeader;

  /** Receives each slice and is finished once the stream is exhausted. */
  public CRAMIndexer indexer;

  /**
   * Creates an indexer for a stream whose header is supplied by the caller.
   *
   * @param is stream to read containers from
   * @param samFileHeader header passed to the underlying indexer
   * @param output file passed to the underlying indexer
   */
  public BaiIndexer(InputStream is, SAMFileHeader samFileHeader, File output) {
    this.is = new CountingInputStream(is);
    this.samFileHeader = samFileHeader;
    this.indexer = new CRAMIndexer(output, this.samFileHeader);
  }

  /**
   * Creates an indexer for a stream that starts with a CRAM header; the header is read from the
   * counting stream, so the bytes it occupies are included in subsequent offsets.
   *
   * @param is stream positioned at the CRAM header
   * @param output file passed to the underlying indexer
   * @throws IOException if the header cannot be read
   */
  public BaiIndexer(InputStream is, File output) throws IOException {
    this.is = new CountingInputStream(is);
    final CramHeader header = CramIO.readCramHeader(this.is);
    this.samFileHeader = header.getSamFileHeader();
    this.indexer = new CRAMIndexer(output, this.samFileHeader);
  }

  /**
   * Reads one container and feeds its slices to the indexer.
   *
   * @return {@code false} when no container could be read, {@code true} otherwise
   * @throws IOException if reading the container fails
   */
  private boolean nextContainer() throws IOException {
    // Capture the position before reading: it is the offset of the container about to be read.
    final long containerStart = is.getCount();
    final Container container = CramIO.readContainer(is);
    if (container == null) {
      return false;
    }
    container.offset = containerStart;

    int sliceIndex = 0;
    for (final Slice slice : container.slices) {
      slice.containerOffset = containerStart;
      slice.index = sliceIndex;
      sliceIndex++;
      indexer.processAlignment(slice);
    }

    log.info("INDEXED: " + container.toString());
    return true;
  }

  /** Consumes containers until none is left. */
  private void index() throws IOException {
    boolean more = nextContainer();
    while (more) {
      more = nextContainer();
    }
  }

  /**
   * Indexes the whole stream and then finishes the underlying indexer.
   *
   * @throws IOException if reading from the stream fails
   */
  public void run() throws IOException {
    index();
    indexer.finish();
  }
}
Ejemplo n.º 3
0
/** Holds version-dependent rules and policies for CRAM data. */
public class CramVersionPolicies {
  private static final Log log = Log.getInstance(CramVersionPolicies.class);

  /**
   * Defines what happens when the EOF marker is missing. For versions compatible with CRAM v3 an
   * error is logged and an exception is thrown; for versions compatible with CRAM v2.1 only a
   * warning is logged; for any other version nothing happens.
   *
   * @param version CRAM version to assume
   * @throws RuntimeException if {@code version} is compatible with CRAM v3
   */
  public static void eofNotFound(final Version version) {
    final boolean eofMandatory = version.compatibleWith(CramVersions.CRAM_v3);
    if (eofMandatory) {
      log.error("Incomplete data: EOF marker not found.");
      throw new RuntimeException("EOF not found.");
    } else if (version.compatibleWith(CramVersions.CRAM_v2_1)) {
      log.warn("EOF marker not found, possibly incomplete file/stream.");
    }
  }
}
Ejemplo n.º 4
0
/**
 * Command line program that prints statistics from a BAM index (.bai) file. Statistics include the
 * count of aligned and unaligned reads for each reference sequence and a count of all records with
 * no start coordinate. Similar to the 'samtools idxstats' command.
 *
 * @author Martha Borkan
 */
@CommandLineProgramProperties(
    usage = BamIndexStats.USAGE_SUMMARY + BamIndexStats.USAGE_DETAILS,
    usageShort = BamIndexStats.USAGE_SUMMARY,
    programGroup = SamOrBam.class)
public class BamIndexStats extends CommandLineProgram {
  /** One-line summary; also used as the first part of the full usage text. */
  static final String USAGE_SUMMARY = "Generate index statistics from a BAM file";

  /**
   * HTML-formatted details appended to the summary. The usage example lists only the input
   * argument because this class declares no output option; the statistics are printed rather than
   * written to a file.
   */
  static final String USAGE_DETAILS =
      "This tool calculates statistics from a BAM index (.bai) file, emulating the behavior of the "
          + "\"samtools idxstats\" command. The statistics collected include counts of aligned and unaligned reads as well as all "
          + "records with no start coordinate. The input to the tool is the BAM file name but it must be accompanied by a corresponding "
          + "index file.<br />"
          + "<h4>Usage example:</h4>"
          + "<pre>"
          + "java -jar picard.jar BamIndexStats \\<br />"
          + "      I=input.bam"
          + "</pre>"
          + "<hr />";
  private static final Log log = Log.getInstance(BamIndexStats.class);

  /** The BAM file whose index statistics are printed (the BAM itself, not the .bai file). */
  @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "A BAM file to process.")
  public File INPUT;

  /**
   * Stock main method for a command line program.
   *
   * @param argv command line arguments, handed to {@code instanceMain}
   */
  public static void main(final String[] argv) {
    System.exit(new BamIndexStats().instanceMain(argv));
  }

  /**
   * Main method for the program. Warns when the input looks like an index file, asserts that the
   * input file is readable, then prints the index statistics for it.
   *
   * @return always 0; failures are expected to surface as exceptions from the helpers called here
   */
  protected int doWork() {
    // Only a warning: processing continues even when the name carries the index suffix.
    if (INPUT.getName().endsWith(BAMIndex.BAMIndexSuffix)) {
      log.warn("INPUT should be the BAM file name, not its index file");
    }
    IOUtil.assertFileIsReadable(INPUT);
    BAMIndexMetaData.printIndexStats(INPUT);

    return 0;
  }
}
Ejemplo n.º 5
0
/**
 * Tests for {@code SamReaderFactory}: opening the various input formats, interval queries, custom
 * record factories, all combinations of data/index input resources, and custom reader factories.
 *
 * <p>Readers and iterators are always closed in {@code finally} blocks so that a failing assertion
 * or read error does not leave them open.
 */
public class SamReaderFactoryTest {
  private static final File TEST_DATA_DIR = new File("testdata/htsjdk/samtools");

  private static final Log LOG = Log.getInstance(SamReaderFactoryTest.class);

  /** Opens the given test file and iterates over every record; passes if nothing throws. */
  @Test(dataProvider = "variousFormatReaderTestCases")
  public void variousFormatReaderTest(final String inputFile) throws IOException {
    final File input = new File(TEST_DATA_DIR, inputFile);
    final SamReader reader = SamReaderFactory.makeDefault().open(input);
    try {
      for (final SAMRecord ignored : reader) {}
    } finally {
      reader.close();
    }
  }

  /**
   * Counts the records returned by an overlapping query for a single interval.
   *
   * @param reader reader to query
   * @param query the interval to query
   * @return number of records produced by the query iterator
   */
  private int countRecordsInQueryInterval(final SamReader reader, final QueryInterval query) {
    final SAMRecordIterator iter = reader.queryOverlapping(new QueryInterval[] {query});
    try {
      int count = 0;
      while (iter.hasNext()) {
        iter.next();
        count++;
      }
      return count;
    } finally {
      iter.close();
    }
  }

  // See https://github.com/samtools/htsjdk/issues/76
  @Test(dataProvider = "queryIntervalIssue76TestCases")
  public void queryIntervalIssue76(
      final String sequenceName, final int start, final int end, final int expectedCount)
      throws IOException {
    final File input = new File(TEST_DATA_DIR, "issue76.bam");
    final SamReader reader = SamReaderFactory.makeDefault().open(input);
    try {
      final QueryInterval interval =
          new QueryInterval(
              reader.getFileHeader().getSequence(sequenceName).getSequenceIndex(), start, end);
      Assert.assertEquals(countRecordsInQueryInterval(reader, interval), expectedCount);
    } finally {
      reader.close();
    }
  }

  /** Rows are: sequence name, query start, query end, expected record count. */
  @DataProvider(name = "queryIntervalIssue76TestCases")
  public Object[][] queryIntervalIssue76TestCases() {
    return new Object[][] {
      {"1", 11966, 11966, 2},
      {"1", 11966, 11967, 2},
      {"1", 11967, 11967, 1}
    };
  }

  /** File names, relative to the test data directory, one per supported input format. */
  @DataProvider(name = "variousFormatReaderTestCases")
  public Object[][] variousFormatReaderTestCases() {
    return new Object[][] {
      {"block_compressed.sam.gz"}, {"uncompressed.sam"}, {"compressed.sam.gz"}, {"compressed.bam"},
    };
  }

  // Tests for the SAMRecordFactory usage

  /** Record factory that counts how many SAM and BAM records it was asked to create. */
  class SAMRecordFactoryTester extends DefaultSAMRecordFactory {
    int samRecordsCreated;
    int bamRecordsCreated;

    public SAMRecord createSAMRecord(final SAMFileHeader header) {
      ++samRecordsCreated;
      return super.createSAMRecord(header);
    }

    public BAMRecord createBAMRecord(
        final SAMFileHeader header,
        final int referenceSequenceIndex,
        final int alignmentStart,
        final short readNameLength,
        final short mappingQuality,
        final int indexingBin,
        final int cigarLen,
        final int flags,
        final int readLen,
        final int mateReferenceSequenceIndex,
        final int mateAlignmentStart,
        final int insertSize,
        final byte[] variableLengthBlock) {
      ++bamRecordsCreated;
      return super.createBAMRecord(
          header,
          referenceSequenceIndex,
          alignmentStart,
          readNameLength,
          mappingQuality,
          indexingBin,
          cigarLen,
          flags,
          readLen,
          mateReferenceSequenceIndex,
          mateAlignmentStart,
          insertSize,
          variableLengthBlock);
    }
  }

  /**
   * Reads every record through a counting record factory and checks that the factory method
   * matching the file type was invoked once per record read.
   */
  @Test(dataProvider = "variousFormatReaderTestCases")
  public void samRecordFactoryTest(final String inputFile) throws IOException {
    final File input = new File(TEST_DATA_DIR, inputFile);

    final SAMRecordFactoryTester recordFactory = new SAMRecordFactoryTester();
    final SamReaderFactory readerFactory =
        SamReaderFactory.makeDefault().samRecordFactory(recordFactory);
    final SamReader reader = readerFactory.open(input);

    int i = 0;
    try {
      for (final SAMRecord ignored : reader) {
        ++i;
      }
    } finally {
      reader.close();
    }

    Assert.assertTrue(i > 0);
    if (inputFile.endsWith(".sam") || inputFile.endsWith(".sam.gz")) {
      Assert.assertEquals(recordFactory.samRecordsCreated, i);
    } else if (inputFile.endsWith(".bam")) {
      Assert.assertEquals(recordFactory.bamRecordsCreated, i);
    }
  }

  /** Creating a BAM record with a null header is expected to fail with IllegalStateException. */
  @Test(expectedExceptions = IllegalStateException.class)
  public void samRecordFactoryNullHeaderBAMTest() {
    final SAMRecordFactory recordFactory = new DefaultSAMRecordFactory();
    recordFactory.createBAMRecord(
        null, // null header
        0, 0, (short) 0, (short) 0, 0, 0, 0, 0, 0, 0, 0, null);
  }

  /*
   * Unit tests for asserting all permutations of data and index sources read the same records and
   * header. The fields below are the local and remote copies of the same BAM and its index.
   */
  final File localBam = new File("testdata/htsjdk/samtools/BAMFileIndexTest/index_test.bam");

  final File localBamIndex =
      new File("testdata/htsjdk/samtools/BAMFileIndexTest/index_test.bam.bai");

  final URL bamUrl, bamIndexUrl;

  {
    try {
      bamUrl = new URL("http://www.broadinstitute.org/~picard/testdata/index_test.bam");
      bamIndexUrl = new URL("http://www.broadinstitute.org/~picard/testdata/index_test.bam.bai");
    } catch (final MalformedURLException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Builds one test case per data resource type on its own, plus one per (data type, index type)
   * pair. SRA accessions are excluded from both roles.
   *
   * @return one single-element row per composed {@code SamInputResource}
   */
  @DataProvider
  public Object[][] composeAllPermutationsOfSamInputResource() {
    final List<SamInputResource> sources = new ArrayList<SamInputResource>();
    for (final InputResource.Type dataType : InputResource.Type.values()) {
      if (dataType.equals(InputResource.Type.SRA_ACCESSION)) continue;

      sources.add(new SamInputResource(composeInputResourceForType(dataType, false)));
      for (final InputResource.Type indexType : InputResource.Type.values()) {
        if (indexType.equals(InputResource.Type.SRA_ACCESSION)) continue;

        sources.add(
            new SamInputResource(
                composeInputResourceForType(dataType, false),
                composeInputResourceForType(indexType, true)));
      }
    }
    // Fill by position rather than via indexOf(): that lookup is linear per element and, should
    // two resources ever compare equal, would leave a null row in the returned array.
    final Object[][] data = new Object[sources.size()][];
    for (int i = 0; i < sources.size(); i++) {
      data[i] = new Object[] {sources.get(i)};
    }

    return data;
  }

  /**
   * Creates an input resource of the requested type that points at either the test BAM or its
   * index.
   *
   * @param type kind of resource to create
   * @param forIndex {@code true} to target the index, {@code false} to target the BAM
   * @return the composed resource
   * @throws IllegalStateException for a type this helper does not handle
   */
  private InputResource composeInputResourceForType(
      final InputResource.Type type, final boolean forIndex) {
    final File f = forIndex ? localBamIndex : localBam;
    final URL url = forIndex ? bamIndexUrl : bamUrl;
    switch (type) {
      case FILE:
        return new FileInputResource(f);
      case URL:
        return new UrlInputResource(url);
      case SEEKABLE_STREAM:
        return new SeekableStreamInputResource(new SeekableHTTPStream(url));
      case INPUT_STREAM:
        try {
          // The stream is handed to the resource; it is not closed here.
          return new InputStreamInputResource(new FileInputStream(f));
        } catch (final FileNotFoundException e) {
          throw new RuntimeIOException(e);
        }
      default:
        throw new IllegalStateException();
    }
  }

  // Shared across data-provider invocations: every invocation must contribute an equal element,
  // so each set is expected to stay at size one.
  final Set<SAMFileHeader> observedHeaders = new HashSet<SAMFileHeader>();
  final Set<List<SAMRecord>> observedRecordOrdering = new HashSet<List<SAMRecord>>();

  /**
   * Reads everything from the given resource and checks that the header and the ordered record
   * list equal those observed by every other invocation of this test.
   */
  @Test(dataProvider = "composeAllPermutationsOfSamInputResource")
  public void exhaustInputResourcePermutation(final SamInputResource resource) throws IOException {
    final SamReader reader = SamReaderFactory.makeDefault().open(resource);
    final List<SAMRecord> slurped;
    final SAMFileHeader fileHeader;
    try {
      LOG.info(String.format("Reading from %s ...", resource));
      slurped = Iterables.slurp(reader);
      fileHeader = reader.getFileHeader();
      // Results are ignored: these calls only need to complete without throwing.
      reader.hasIndex();
      reader.indexing().hasBrowseableIndex();
    } finally {
      reader.close();
    }

    /* Ensure all tests have read the same records in the same order or, if this is the first test, set it as the template. */
    observedHeaders.add(fileHeader);
    observedRecordOrdering.add(slurped);
    Assert.assertEquals(observedHeaders.size(), 1, "read different headers than other testcases");
    Assert.assertEquals(
        observedRecordOrdering.size(), 1, "read different records than other testcases");
  }

  final Set<List<SAMRecord>> observedRecordOrdering1 = new HashSet<List<SAMRecord>>();
  final Set<List<SAMRecord>> observedRecordOrdering3 = new HashSet<List<SAMRecord>>();
  final Set<List<SAMRecord>> observedRecordOrdering20 = new HashSet<List<SAMRecord>>();

  /**
   * Runs three fixed region queries against the given resource when it is indexed and checks that
   * each query returns the same ordered records as in every other invocation of this test.
   */
  @Test(dataProvider = "composeAllPermutationsOfSamInputResource")
  public void queryInputResourcePermutation(final SamInputResource resource) throws IOException {
    final SamReader reader = SamReaderFactory.makeDefault().open(resource);
    try {
      LOG.info(String.format("Query from %s ...", resource));
      if (reader.hasIndex()) {
        final StopWatch stopWatch = new StopWatch();
        stopWatch.start();
        final SAMRecordIterator q1 = reader.query("chr1", 500000, 100000000, true);
        observedRecordOrdering1.add(Iterables.slurp(q1));
        q1.close();
        final SAMRecordIterator q20 = reader.query("chr20", 1, 1000000, true);
        observedRecordOrdering20.add(Iterables.slurp(q20));
        q20.close();
        final SAMRecordIterator q3 = reader.query("chr3", 1, 10000000, true);
        observedRecordOrdering3.add(Iterables.slurp(q3));
        q3.close();
        stopWatch.stop();
        LOG.info(String.format("Finished queries in %sms", stopWatch.getElapsedTime()));

        Assert.assertEquals(
            observedRecordOrdering1.size(), 1, "read different records for chromosome 1");
        Assert.assertEquals(
            observedRecordOrdering20.size(), 1, "read different records for chromosome 20");
        Assert.assertEquals(
            observedRecordOrdering3.size(), 1, "read different records for chromosome 3");
      } else if (resource.indexMaybe() != null) {
        LOG.warn("Resource has an index source, but is not indexed: " + resource);
      } else {
        LOG.info("Skipping query operation: no index.");
      }
    } finally {
      reader.close();
    }
  }

  /**
   * Installs a custom reader factory for a URL prefix, opens a URL under that prefix, and checks
   * that at least one record is read. The default factory is always restored afterwards.
   */
  @Test
  public void customReaderFactoryTest() throws IOException {
    try {
      CustomReaderFactory.setInstance(
          new CustomReaderFactory(
              "https://www.googleapis.com/genomics/v1beta/reads/,"
                  + "htsjdk.samtools.SamReaderFactoryTest$TestReaderFactory"));
      final SamReader reader =
          SamReaderFactory.makeDefault()
              .open(
                  SamInputResource.of(
                      "https://www.googleapis.com/genomics/v1beta/reads/?uncompressed.sam"));
      int i = 0;
      try {
        for (@SuppressWarnings("unused") final SAMRecord ignored : reader) {
          ++i;
        }
      } finally {
        reader.close();
      }

      Assert.assertTrue(i > 0);
    } finally {
      CustomReaderFactory.resetToDefaultInstance();
    }
  }

  /**
   * Custom reader factory used by {@code customReaderFactoryTest}: treats the URL's query string
   * as a file name inside the test data directory and opens that file.
   */
  public static class TestReaderFactory implements CustomReaderFactory.ICustomReaderFactory {
    @Override
    public SamReader open(URL url) {
      final File file = new File(TEST_DATA_DIR, url.getQuery());
      LOG.info("Opening customr reader for " + file.toString());
      return SamReaderFactory.makeDefault().open(file);
    }
  }

  /** Checks which resource type {@code SamInputResource.of(String)} picks for URLs and paths. */
  @Test
  public void inputResourceFromStringTest() throws IOException {
    Assert.assertEquals(
        SamInputResource.of("http://test.url").data().type(), InputResource.Type.URL);
    Assert.assertEquals(
        SamInputResource.of("https://test.url").data().type(), InputResource.Type.URL);
    Assert.assertEquals(
        SamInputResource.of("ftp://test.url").data().type(), InputResource.Type.URL);
    Assert.assertEquals(SamInputResource.of("/a/b/c").data().type(), InputResource.Type.FILE);
  }
}
Ejemplo n.º 6
0
/**
 * Reads gene annotations from a refFlat file and collects them in an {@code
 * OverlapDetector<Gene>}. Rows on sequences missing from the sequence dictionary are skipped, and
 * genes whose rows are not internally consistent (e.g. transcripts on different chromosomes or
 * strands) are discarded.
 */
public class RefFlatReader {
  private static final Log LOG = Log.getInstance(RefFlatReader.class);

  /** Column identifiers, declared in the order in which the columns appear in refFlat format. */
  public enum RefFlatColumns {
    GENE_NAME,
    TRANSCRIPT_NAME,
    CHROMOSOME,
    STRAND,
    TX_START,
    TX_END,
    CDS_START,
    CDS_END,
    EXON_COUNT,
    EXON_STARTS,
    EXON_ENDS
  }

  /** Names of the enum constants above, used as the column labels handed to the parser. */
  private static final String[] RefFlatColumnLabels = new String[RefFlatColumns.values().length];

  static {
    final RefFlatColumns[] columns = RefFlatColumns.values();
    for (int idx = 0; idx < RefFlatColumnLabels.length; ++idx) {
      RefFlatColumnLabels[idx] = columns[idx].name();
    }
  }

  private final File refFlatFile;
  private final SAMSequenceDictionary sequenceDictionary;

  RefFlatReader(final File refFlatFile, final SAMSequenceDictionary sequenceDictionary) {
    this.refFlatFile = refFlatFile;
    this.sequenceDictionary = sequenceDictionary;
  }

  /**
   * Convenience entry point: creates a reader for the given file and dictionary and loads it.
   *
   * @param refFlatFile the refFlat file to read
   * @param sequenceDictionary dictionary used to decide which sequences are recognized
   * @return detector holding every gene that could be built
   */
  static OverlapDetector<Gene> load(
      final File refFlatFile, final SAMSequenceDictionary sequenceDictionary) {
    final RefFlatReader reader = new RefFlatReader(refFlatFile, sequenceDictionary);
    return reader.load();
  }

  /**
   * Parses the file, groups its rows by gene name, builds one gene per group and registers it with
   * the detector. Groups that fail validation are logged at debug level and skipped.
   *
   * @return detector holding every gene that could be built
   * @throws AnnotationException if a row does not have the expected number of fields
   */
  OverlapDetector<Gene> load() {
    final OverlapDetector<Gene> detector = new OverlapDetector<Gene>(0, 0);

    final int expectedColumns = RefFlatColumns.values().length;
    final TabbedTextFileWithHeaderParser parser =
        new TabbedTextFileWithHeaderParser(refFlatFile, RefFlatColumnLabels);
    final Map<String, List<TabbedTextFileWithHeaderParser.Row>> rowsByGene =
        new HashMap<String, List<TabbedTextFileWithHeaderParser.Row>>();

    // Pass 1: validate the column count and bucket the rows by gene name.
    for (final TabbedTextFileWithHeaderParser.Row row : parser) {
      // getCurrentLineNumber returns the number of the next line
      final int lineNumber = parser.getCurrentLineNumber();
      if (row.getFields().length != expectedColumns) {
        throw new AnnotationException(
            "Wrong number of fields in refFlat file " + refFlatFile + " at line " + lineNumber);
      }
      final String geneName = row.getField(RefFlatColumns.GENE_NAME.name());
      final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
      final String transcriptDescription = geneName + ":" + transcriptName;
      final String chromosome = row.getField(RefFlatColumns.CHROMOSOME.name());
      if (!isSequenceRecognized(chromosome)) {
        LOG.debug(
            "Skipping " + transcriptDescription + " due to unrecognized sequence " + chromosome);
        continue;
      }

      List<TabbedTextFileWithHeaderParser.Row> rowsForGene = rowsByGene.get(geneName);
      if (rowsForGene == null) {
        rowsForGene = new ArrayList<TabbedTextFileWithHeaderParser.Row>();
        rowsByGene.put(geneName, rowsForGene);
      }
      rowsForGene.add(row);
    }

    int longestInterval = 0;
    int numIntervalsOver1MB = 0;

    // Pass 2: turn each bucket into a gene; inconsistent buckets are dropped.
    for (final List<TabbedTextFileWithHeaderParser.Row> rowsForGene : rowsByGene.values()) {
      try {
        final Gene gene = makeGeneFromRefFlatLines(rowsForGene);
        detector.addLhs(gene, gene);
        longestInterval = Math.max(longestInterval, gene.length());
        if (gene.length() > 1000000) {
          ++numIntervalsOver1MB;
        }
      } catch (AnnotationException e) {
        LOG.debug(e.getMessage() + " -- skipping");
      }
    }
    LOG.debug(
        "Longest gene: " + longestInterval + "; number of genes > 1MB: " + numIntervalsOver1MB);
    return detector;
  }

  /** Tells whether the sequence dictionary contains a sequence with the given name. */
  private boolean isSequenceRecognized(final String sequence) {
    return sequenceDictionary.getSequence(sequence) != null;
  }

  /**
   * Builds a gene from all rows sharing one gene name. Name, strand and chromosome come from the
   * first row; the gene spans from the smallest transcript start to the largest transcript end.
   *
   * @param transcriptLines the rows of one gene, one per transcript
   * @return the gene with one transcript added per row
   * @throws AnnotationException if any row disagrees with the first row on strand or chromosome
   */
  private Gene makeGeneFromRefFlatLines(
      final List<TabbedTextFileWithHeaderParser.Row> transcriptLines) {
    final TabbedTextFileWithHeaderParser.Row firstRow = transcriptLines.get(0);
    final String geneName = firstRow.getField(RefFlatColumns.GENE_NAME.name());
    final String strandStr = firstRow.getField(RefFlatColumns.STRAND.name());
    final boolean negative = strandStr.equals("-");
    final String chromosome = firstRow.getField(RefFlatColumns.CHROMOSOME.name());

    // Figure out the extent of the gene
    int geneStart = Integer.MAX_VALUE;
    int geneEnd = Integer.MIN_VALUE;
    for (final TabbedTextFileWithHeaderParser.Row row : transcriptLines) {
      final int txStart = row.getIntegerField(RefFlatColumns.TX_START.name()) + 1;
      geneStart = Math.min(geneStart, txStart);
      final int txEnd = row.getIntegerField(RefFlatColumns.TX_END.name());
      geneEnd = Math.max(geneEnd, txEnd);
    }

    final Gene gene = new Gene(chromosome, geneStart, geneEnd, negative, geneName);

    for (final TabbedTextFileWithHeaderParser.Row row : transcriptLines) {
      final boolean sameStrand = strandStr.equals(row.getField(RefFlatColumns.STRAND.name()));
      if (!sameStrand) {
        throw new AnnotationException("Strand disagreement in refFlat file for gene " + geneName);
      }
      final boolean sameChromosome =
          chromosome.equals(row.getField(RefFlatColumns.CHROMOSOME.name()));
      if (!sameChromosome) {
        throw new AnnotationException(
            "Chromosome disagreement("
                + chromosome
                + " != "
                + row.getField(RefFlatColumns.CHROMOSOME.name())
                + ") in refFlat file for gene "
                + geneName);
      }

      // Called for its effect on the gene: the transcript is attached there, so the return value
      // is not needed here.
      makeTranscriptFromRefFlatLine(gene, row);
    }

    return gene;
  }

  /**
   * Adds the transcript described by one row, with all of its exons, to the gene. Conversion from
   * 0-based half-open to 1-based inclusive intervals is done here.
   *
   * @param gene the gene that receives the transcript
   * @param row one refFlat row
   * @return the transcript that was added
   * @throws AnnotationException if the exon count does not match the exon start/end lists, if an
   *     exon starts after it ends, or if consecutive exons overlap
   */
  private Gene.Transcript makeTranscriptFromRefFlatLine(
      final Gene gene, final TabbedTextFileWithHeaderParser.Row row) {
    final String geneName = row.getField(RefFlatColumns.GENE_NAME.name());
    final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
    final String transcriptDescription = geneName + ":" + transcriptName;
    final int exonCount = Integer.parseInt(row.getField(RefFlatColumns.EXON_COUNT.name()));
    final String[] exonStarts = row.getField(RefFlatColumns.EXON_STARTS.name()).split(",");
    final String[] exonEnds = row.getField(RefFlatColumns.EXON_ENDS.name()).split(",");

    if (exonStarts.length != exonCount) {
      throw new AnnotationException(
          "Number of exon starts does not agree with number of exons for " + transcriptDescription);
    }
    if (exonEnds.length != exonCount) {
      throw new AnnotationException(
          "Number of exon ends does not agree with number of exons for " + transcriptDescription);
    }

    // Starts get +1 and ends stay as they are: 0-based half-open to 1-based inclusive.
    final int transcriptionStart = row.getIntegerField(RefFlatColumns.TX_START.name()) + 1;
    final int transcriptionEnd = row.getIntegerField(RefFlatColumns.TX_END.name());
    final int codingStart = row.getIntegerField(RefFlatColumns.CDS_START.name()) + 1;
    final int codingEnd = row.getIntegerField(RefFlatColumns.CDS_END.name());

    final Transcript tx =
        gene.addTranscript(
            transcriptName,
            transcriptionStart,
            transcriptionEnd,
            codingStart,
            codingEnd,
            exonCount);

    for (int i = 0; i < exonCount; ++i) {
      final int exonStart = Integer.parseInt(exonStarts[i]) + 1;
      final int exonEnd = Integer.parseInt(exonEnds[i]);
      final Exon e = tx.addExon(exonStart, exonEnd);

      if (e.start > e.end) {
        throw new AnnotationException("Exon has 0 or negative extent for " + transcriptDescription);
      }
      final boolean overlapsPrevious = i > 0 && tx.exons[i - 1].end >= tx.exons[i].start;
      if (overlapsPrevious) {
        throw new AnnotationException("Exons overlap for " + transcriptDescription);
      }
    }

    return tx;
  }
}
Ejemplo n.º 7
0
/**
 * Computes a number of metrics that are useful for evaluating coverage and performance of whole
 * genome sequencing experiments.
 *
 * @author tfennell
 */
@CommandLineProgramProperties(
    usage =
        "Computes a number of metrics that are useful for evaluating coverage and performance of "
            + "whole genome sequencing experiments.",
    usageShort = "Writes whole genome sequencing-related metrics for a SAM or BAM file",
    programGroup = Metrics.class)
public class CollectWgsMetrics extends CommandLineProgram {

  @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input SAM or BAM file.")
  public File INPUT;

  @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output metrics file.")
  public File OUTPUT;

  @Option(
      shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME,
      doc = "The reference sequence fasta aligned to.")
  public File REFERENCE_SEQUENCE;

  @Option(
      shortName = "MQ",
      doc = "Minimum mapping quality for a read to contribute coverage.",
      overridable = true)
  public int MINIMUM_MAPPING_QUALITY = 20;

  @Option(
      shortName = "Q",
      doc = "Minimum base quality for a base to contribute coverage.",
      overridable = true)
  public int MINIMUM_BASE_QUALITY = 20;

  @Option(
      shortName = "CAP",
      doc = "Treat bases with coverage exceeding this value as if they had coverage at this value.",
      overridable = true)
  public int COVERAGE_CAP = 250;

  @Option(doc = "For debugging purposes, stop after processing this many genomic bases.")
  public long STOP_AFTER = -1;

  @Option(doc = "Determines whether to include the base quality histogram in the metrics file.")
  public boolean INCLUDE_BQ_HISTOGRAM = false;

  @Option(doc = "If true, count unpaired reads, and paired reads with one end unmapped")
  public boolean COUNT_UNPAIRED = false;

  private final Log log = Log.getInstance(CollectWgsMetrics.class);

  /** Metrics for evaluating the performance of whole genome sequencing experiments. */
  public static class WgsMetrics extends MetricBase {
    /** The number of non-N bases in the genome reference over which coverage will be evaluated. */
    public long GENOME_TERRITORY;
    /** The mean coverage in bases of the genome territory, after all filters are applied. */
    public double MEAN_COVERAGE;
    /** The standard deviation of coverage of the genome after all filters are applied. */
    public double SD_COVERAGE;
    /** The median coverage in bases of the genome territory, after all filters are applied. */
    public double MEDIAN_COVERAGE;
    /** The median absolute deviation of coverage of the genome after all filters are applied. */
    public double MAD_COVERAGE;

    /**
     * The fraction of aligned bases that were filtered out because they were in reads with low
     * mapping quality (default is < 20).
     */
    public double PCT_EXC_MAPQ;
    /**
     * The fraction of aligned bases that were filtered out because they were in reads marked as
     * duplicates.
     */
    public double PCT_EXC_DUPE;
    /**
     * The fraction of aligned bases that were filtered out because they were in reads without a
     * mapped mate pair.
     */
    public double PCT_EXC_UNPAIRED;
    /**
     * The fraction of aligned bases that were filtered out because they were of low base quality
     * (default is < 20).
     */
    public double PCT_EXC_BASEQ;
    /**
     * The fraction of aligned bases that were filtered out because they were the second observation
     * from an insert with overlapping reads.
     */
    public double PCT_EXC_OVERLAP;
    /**
     * The fraction of aligned bases that were filtered out because they would have raised coverage
     * above the capped value (default cap = 250x).
     */
    public double PCT_EXC_CAPPED;
    /** The total fraction of aligned bases excluded due to all filters. */
    public double PCT_EXC_TOTAL;

    /**
     * The fraction of bases that attained at least 1X sequence coverage in post-filtering bases.
     */
    public double PCT_1X;
    /**
     * The fraction of bases that attained at least 5X sequence coverage in post-filtering bases.
     */
    public double PCT_5X;
    /**
     * The fraction of bases that attained at least 10X sequence coverage in post-filtering bases.
     */
    public double PCT_10X;
    /**
     * The fraction of bases that attained at least 15X sequence coverage in post-filtering bases.
     */
    public double PCT_15X;
    /**
     * The fraction of bases that attained at least 20X sequence coverage in post-filtering bases.
     */
    public double PCT_20X;
    /**
     * The fraction of bases that attained at least 25X sequence coverage in post-filtering bases.
     */
    public double PCT_25X;
    /**
     * The fraction of bases that attained at least 30X sequence coverage in post-filtering bases.
     */
    public double PCT_30X;
    /**
     * The fraction of bases that attained at least 40X sequence coverage in post-filtering bases.
     */
    public double PCT_40X;
    /**
     * The fraction of bases that attained at least 50X sequence coverage in post-filtering bases.
     */
    public double PCT_50X;
    /**
     * The fraction of bases that attained at least 60X sequence coverage in post-filtering bases.
     */
    public double PCT_60X;
    /**
     * The fraction of bases that attained at least 70X sequence coverage in post-filtering bases.
     */
    public double PCT_70X;
    /**
     * The fraction of bases that attained at least 80X sequence coverage in post-filtering bases.
     */
    public double PCT_80X;
    /**
     * The fraction of bases that attained at least 90X sequence coverage in post-filtering bases.
     */
    public double PCT_90X;
    /**
     * The fraction of bases that attained at least 100X sequence coverage in post-filtering bases.
     */
    public double PCT_100X;
  }

  public static void main(final String[] args) {
    new CollectWgsMetrics().instanceMainWithExit(args);
  }

  /**
   * Walks every locus produced by the locus iterator, builds a capped depth-of-coverage histogram
   * (and a base-quality histogram), derives the summary metrics from them and writes everything to
   * OUTPUT.
   *
   * <p>Bases can be excluded either by the record filters installed on the iterator (duplicates,
   * mapping quality, pairing) or inside the loop (low base quality, overlapping read names,
   * coverage above COVERAGE_CAP). Each kind of exclusion is tallied separately so it can be
   * reported as a fraction of all bases seen.
   *
   * @return 0 once the metrics file has been written
   */
  @Override
  protected int doWork() {
    // Validate the inputs/outputs up front.
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);

    // Setup all the inputs
    final ProgressLogger progress = new ProgressLogger(log, 10000000, "Processed", "loci");
    final ReferenceSequenceFileWalker refWalker =
        new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final SamReader in =
        SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
    // Obtained through an overridable hook so subclasses can supply a different iterator.
    final SamLocusIterator iterator = getLocusIterator(in);

    // Record-level filters. The counting filters are kept in local variables because their
    // tallies are read back after the loop to compute the PCT_EXC_* metrics.
    final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
    final CountingFilter dupeFilter = new CountingDuplicateFilter();
    final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY);
    final CountingPairedFilter pairFilter = new CountingPairedFilter();
    filters.add(mapqFilter);
    filters.add(dupeFilter);
    // The pairing filter is only installed when unpaired reads should not be counted; when it is
    // not installed its tally is still read below (it simply never sees any record).
    if (!COUNT_UNPAIRED) {
      filters.add(pairFilter);
    }
    filters.add(
        new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count
                                         // reads twice
    iterator.setSamFilters(filters);
    // Uncovered loci must be emitted so that zero-depth positions land in histogram bin 0.
    iterator.setEmitUncoveredLoci(true);
    iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases
    iterator.setQualityScoreCutoff(0); // Handled separately because we want to count bases
    iterator.setIncludeNonPfReads(false);

    // HistogramArray[d] = number of loci with (capped) depth d, for d in 0..COVERAGE_CAP.
    final int max = COVERAGE_CAP;
    final long[] HistogramArray = new long[max + 1];
    // Indexed by base quality value; holds Byte.MAX_VALUE slots.
    final long[] baseQHistogramArray = new long[Byte.MAX_VALUE];
    // STOP_AFTER <= 0 disables early stopping; otherwise the loop ends once `counter` (the number
    // of non-N loci processed) exceeds STOP_AFTER - 1.
    final boolean usingStopAfter = STOP_AFTER > 0;
    final long stopAfter = STOP_AFTER - 1;
    long counter = 0;

    // Exclusions that are decided per base inside the loop (as opposed to by the record filters).
    long basesExcludedByBaseq = 0;
    long basesExcludedByOverlap = 0;
    long basesExcludedByCapping = 0;

    // Loop through all the loci
    while (iterator.hasNext()) {
      final SamLocusIterator.LocusInfo info = iterator.next();

      // Check that the reference is not N
      // (positions are 1-based, hence the -1 when indexing the reference bases; N loci are
      // skipped entirely and count neither towards the histogram nor towards STOP_AFTER)
      final ReferenceSequence ref = refWalker.get(info.getSequenceIndex());
      final byte base = ref.getBases()[info.getPosition() - 1];
      if (base == 'N') continue;

      // Figure out the coverage while not counting overlapping reads twice, and excluding various
      // things
      final HashSet<String> readNames = new HashSet<String>(info.getRecordAndPositions().size());
      int pileupSize = 0;
      for (final SamLocusIterator.RecordAndOffset recs : info.getRecordAndPositions()) {

        // Low-quality bases are dropped before the overlap check, so they never register the
        // read name.
        if (recs.getBaseQuality() < MINIMUM_BASE_QUALITY) {
          ++basesExcludedByBaseq;
          continue;
        }
        // add() returns false when this read name was already seen at this locus, i.e. a second
        // base carrying the same read name covers the same position.
        if (!readNames.add(recs.getRecord().getReadName())) {
          ++basesExcludedByOverlap;
          continue;
        }
        pileupSize++;
        // Only the first `max` accepted bases at a locus contribute to the base-quality histogram.
        if (pileupSize <= max) {
          baseQHistogramArray[recs.getRecord().getBaseQualities()[recs.getOffset()]]++;
        }
      }

      // Depth is the number of distinct accepted read names, capped at `max`; anything above the
      // cap is tallied as excluded by capping.
      final int depth = Math.min(readNames.size(), max);
      if (depth < readNames.size()) basesExcludedByCapping += readNames.size() - max;
      HistogramArray[depth]++;

      // Record progress and perhaps stop
      progress.record(info.getSequenceName(), info.getPosition());
      if (usingStopAfter && ++counter > stopAfter) break;
    }

    // Construct the coverage histogram (depth -> number of loci)
    final Histogram<Integer> histo = new Histogram<Integer>("coverage", "count");
    for (int i = 0; i < HistogramArray.length; ++i) {
      histo.increment(i, HistogramArray[i]);
    }

    // Construct the base-quality histogram (quality value -> number of bases)
    final Histogram<Integer> baseQHisto = new Histogram<Integer>("value", "baseq_count");
    for (int i = 0; i < baseQHistogramArray.length; ++i) {
      baseQHisto.increment(i, baseQHistogramArray[i]);
    }

    // Summary statistics are taken straight from the coverage histogram. The metrics object comes
    // from an overridable hook so subclasses can supply their own metrics type.
    final WgsMetrics metrics = generateWgsMetrics();
    metrics.GENOME_TERRITORY = (long) histo.getSumOfValues();
    metrics.MEAN_COVERAGE = histo.getMean();
    metrics.SD_COVERAGE = histo.getStandardDeviation();
    metrics.MEDIAN_COVERAGE = histo.getMedian();
    metrics.MAD_COVERAGE = histo.getMedianAbsoluteDeviation();

    // Exclusion fractions: the denominator is the retained total plus every excluded tally.
    // NOTE(review): `total` is presumably the number of retained bases (depth x loci) -- confirm
    // against Histogram.getSum().
    final long basesExcludedByDupes = getBasesExcludedBy(dupeFilter);
    final long basesExcludedByMapq = getBasesExcludedBy(mapqFilter);
    final long basesExcludedByPairing = getBasesExcludedBy(pairFilter);
    final double total = histo.getSum();
    final double totalWithExcludes =
        total
            + basesExcludedByDupes
            + basesExcludedByMapq
            + basesExcludedByPairing
            + basesExcludedByBaseq
            + basesExcludedByOverlap
            + basesExcludedByCapping;
    metrics.PCT_EXC_DUPE = basesExcludedByDupes / totalWithExcludes;
    metrics.PCT_EXC_MAPQ = basesExcludedByMapq / totalWithExcludes;
    metrics.PCT_EXC_UNPAIRED = basesExcludedByPairing / totalWithExcludes;
    metrics.PCT_EXC_BASEQ = basesExcludedByBaseq / totalWithExcludes;
    metrics.PCT_EXC_OVERLAP = basesExcludedByOverlap / totalWithExcludes;
    metrics.PCT_EXC_CAPPED = basesExcludedByCapping / totalWithExcludes;
    metrics.PCT_EXC_TOTAL = (totalWithExcludes - total) / totalWithExcludes;

    // PCT_nX = (number of loci with capped depth >= n) / GENOME_TERRITORY. Each numerator sums the
    // histogram bins from index n up to the end of the array.
    metrics.PCT_1X =
        MathUtil.sum(HistogramArray, 1, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_5X =
        MathUtil.sum(HistogramArray, 5, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_10X =
        MathUtil.sum(HistogramArray, 10, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_15X =
        MathUtil.sum(HistogramArray, 15, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_20X =
        MathUtil.sum(HistogramArray, 20, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_25X =
        MathUtil.sum(HistogramArray, 25, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_30X =
        MathUtil.sum(HistogramArray, 30, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_40X =
        MathUtil.sum(HistogramArray, 40, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_50X =
        MathUtil.sum(HistogramArray, 50, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_60X =
        MathUtil.sum(HistogramArray, 60, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_70X =
        MathUtil.sum(HistogramArray, 70, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_80X =
        MathUtil.sum(HistogramArray, 80, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_90X =
        MathUtil.sum(HistogramArray, 90, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_100X =
        MathUtil.sum(HistogramArray, 100, HistogramArray.length)
            / (double) metrics.GENOME_TERRITORY;

    // Write the metrics and the coverage histogram; the base-quality histogram is optional.
    final MetricsFile<WgsMetrics, Integer> out = getMetricsFile();
    out.addMetric(metrics);
    out.addHistogram(histo);
    if (INCLUDE_BQ_HISTOGRAM) {
      out.addHistogram(baseQHisto);
    }
    out.write(OUTPUT);

    return 0;
  }

  /**
   * Factory hook for the metrics object that {@code doWork} fills in; subclasses may override it
   * to supply their own {@code WgsMetrics} subtype.
   *
   * @return a newly created, empty metrics instance
   */
  protected WgsMetrics generateWgsMetrics() {
    final WgsMetrics freshMetrics = new WgsMetrics();
    return freshMetrics;
  }

  /**
   * Reads back how many bases the given counting filter rejected.
   *
   * @param filter the counting filter to query
   * @return the filter's own tally of filtered bases
   */
  protected long getBasesExcludedBy(final CountingFilter filter) {
    final long excludedBases = filter.getFilteredBases();
    return excludedBases;
  }

  /**
   * Creates the locus iterator that {@code doWork} walks; overridable so subclasses can provide a
   * differently configured iterator.
   *
   * @param in the reader whose records are to be piled up per locus
   * @return a new locus iterator over {@code in}
   */
  protected SamLocusIterator getLocusIterator(final SamReader in) {
    final SamLocusIterator locusIterator = new SamLocusIterator(in);
    return locusIterator;
  }
}
/**
 * IlluminaDataProviderFactory accepts options for parsing Illumina data files for a lane and
 * creates an IlluminaDataProvider, an iterator over the ClusterData for that lane, which utilizes
 * these options.
 *
 * <p>Note: Since we tend to use IlluminaDataProviderFactory in multithreaded environments (e.g. we
 * call makeDataProvider in a different thread per tile in IlluminaBasecallsToSam). I've made it
 * essentially immutable. makeDataProvider/getTiles are now idempotent (well as far as
 * IlluminaDataProviderFactory is concerned, many file handles and other things are opened when
 * makeDataProvider is called). We may in the future want dataTypes to be provided to the
 * makeDataProvider factory methods so configuration is not done multiple times for the same
 * basecallDirectory in client code.
 *
 * <p>The one piece of mutable state is the EAMSS flag, see {@link #setApplyEamssFiltering}.
 *
 * @author [email protected]
 */
public class IlluminaDataProviderFactory {
  private static final Log log = Log.getInstance(IlluminaDataProviderFactory.class);

  /**
   * A map of data types to a list of file formats in the order in which we prefer those file types
   * (E.g. we would rather parse Bcls before QSeqs, Locs files before Clocs files ...) We try to
   * prefer data types that will be the fastest to parse/smallest in memory NOTE: In the code below,
   * if Qseq is chosen to provide for ANY data type then it is used for ALL its data types (since
   * we'll have to parse the entire line for each Qseq anyways)
   */
  private static final Map<IlluminaDataType, List<SupportedIlluminaFormat>>
      DATA_TYPE_TO_PREFERRED_FORMATS =
          new HashMap<IlluminaDataType, List<SupportedIlluminaFormat>>();

  static {
    // For types found in Qseq, we prefer the NON-Qseq file formats first. However, if we end up
    // using Qseqs then we use Qseqs for EVERY type it provides, see determineFormats
    DATA_TYPE_TO_PREFERRED_FORMATS.put(
        IlluminaDataType.BaseCalls,
        makeList(SupportedIlluminaFormat.MultiTileBcl, SupportedIlluminaFormat.Bcl));
    DATA_TYPE_TO_PREFERRED_FORMATS.put(
        IlluminaDataType.QualityScores,
        makeList(SupportedIlluminaFormat.MultiTileBcl, SupportedIlluminaFormat.Bcl));
    DATA_TYPE_TO_PREFERRED_FORMATS.put(
        IlluminaDataType.PF,
        makeList(SupportedIlluminaFormat.MultiTileFilter, SupportedIlluminaFormat.Filter));
    DATA_TYPE_TO_PREFERRED_FORMATS.put(
        IlluminaDataType.Position,
        makeList(
            SupportedIlluminaFormat.MultiTileLocs,
            SupportedIlluminaFormat.Locs,
            SupportedIlluminaFormat.Clocs,
            SupportedIlluminaFormat.Pos));

    DATA_TYPE_TO_PREFERRED_FORMATS.put(
        IlluminaDataType.Barcodes, makeList(SupportedIlluminaFormat.Barcode));
  }

  // The following properties must be specified by caller.
  /** basecallDirectory holds QSeqs or bcls * */
  private final File basecallDirectory;

  /** Directory holding extracted barcode files; may be null (see the constructors). */
  private final File barcodesDirectory;

  private final int lane;

  /** Whether or not to apply EAMSS filtering if parsing BCLs for the bases and quality scores. */
  private boolean applyEamssFiltering = true;

  /** A Map of file formats to the dataTypes they will provide for this run. */
  protected final Map<SupportedIlluminaFormat, Set<IlluminaDataType>> formatToDataTypes;

  /** Basecall Directory/lane parameterized util for finding IlluminaFiles */
  private final IlluminaFileUtil fileUtil;

  /** Tiles for which files of every selected format exist, as reported by the file util. */
  private final List<Integer> availableTiles;

  private final OutputMapping outputMapping;
  private final BclQualityEvaluationStrategy bclQualityEvaluationStrategy;

  /**
   * Create a factory with the specified options and no separate barcodes directory. For every
   * requested data type the most preferred file format that actually has files available is
   * selected.
   *
   * @param basecallDirectory The baseCalls directory of a complete Illumina directory. Files are
   *     found by searching relative to this folder (some of them higher up in the directory tree).
   * @param lane Which lane to iterate over.
   * @param readStructure The read structure to which output clusters will conform. When not using
   *     QSeqs, EAMSS masking(see BclParser) is run on individual reads as found in the
   *     readStructure, if the readStructure specified does not match the readStructure implied by
   *     the sequencer's output then the quality scores output may differ from what would be found
   *     in a run's QSeq files
   * @param bclQualityEvaluationStrategy Strategy handed to the BCL parsers created by this factory
   * @param dataTypesArg Which data types to read
   */
  public IlluminaDataProviderFactory(
      final File basecallDirectory,
      final int lane,
      final ReadStructure readStructure,
      final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
      final IlluminaDataType... dataTypesArg) {
    this(basecallDirectory, null, lane, readStructure, bclQualityEvaluationStrategy, dataTypesArg);
  }

  /**
   * Create a factory with the specified options. For every requested data type the most preferred
   * file format that actually has files available is selected.
   *
   * @param basecallDirectory The baseCalls directory of a complete Illumina directory. Files are
   *     found by searching relative to this folder (some of them higher up in the directory tree).
   * @param barcodesDirectory The barcodesDirectory with barcode files extracted by
   *     'ExtractIlluminaBarcodes' (optional, use basecallDirectory if not specified)
   * @param lane Which lane to iterate over.
   * @param readStructure The read structure to which output clusters will conform. When not using
   *     QSeqs, EAMSS masking(see BclParser) is run on individual reads as found in the
   *     readStructure, if the readStructure specified does not match the readStructure implied by
   *     the sequencer's output then the quality scores output may differ from what would be found
   *     in a run's QSeq files
   * @param bclQualityEvaluationStrategy Strategy handed to the BCL parsers created by this factory
   * @param dataTypesArg Which data types to read
   * @throws PicardException if no data types are given, if some requested data type has no format
   *     with available files, or if no tiles are available for the lane
   */
  public IlluminaDataProviderFactory(
      final File basecallDirectory,
      File barcodesDirectory,
      final int lane,
      final ReadStructure readStructure,
      final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
      final IlluminaDataType... dataTypesArg) {
    this.basecallDirectory = basecallDirectory;
    this.barcodesDirectory = barcodesDirectory;
    this.bclQualityEvaluationStrategy = bclQualityEvaluationStrategy;

    this.lane = lane;
    /* The types of data that will be returned by any IlluminaDataProviders created by this factory.

    Note: In previous version, data of types not specified might be returned if a data type was specified
    for data residing in QSeqs (since QSeqs span multiple data types).  This is no longer the case, you
    MUST specify all data types that should be returned.*/
    final Set<IlluminaDataType> dataTypes =
        Collections.unmodifiableSet(new HashSet<IlluminaDataType>(Arrays.asList(dataTypesArg)));

    if (dataTypes.isEmpty()) {
      throw new PicardException(
          "No data types have been specified for basecall output "
              + basecallDirectory
              + ", lane "
              + lane);
    }

    this.fileUtil = new IlluminaFileUtil(basecallDirectory, barcodesDirectory, lane);

    // find what request IlluminaDataTypes we have files for and select the most preferred file
    // format available for that type
    formatToDataTypes = determineFormats(dataTypes, fileUtil);

    // find if we have any IlluminaDataType with NO available file formats and, if any exist, throw
    // an exception
    final Set<IlluminaDataType> unmatchedDataTypes =
        findUnmatchedTypes(dataTypes, formatToDataTypes);
    if (unmatchedDataTypes.size() > 0) {
      throw new PicardException(
          "Could not find a format with available files for the following data types: "
              + StringUtil.join(", ", new ArrayList<IlluminaDataType>(unmatchedDataTypes)));
    }

    // The separator and the elements must be separate arguments; concatenating the set into the
    // separator leaves nothing to join and the message would list no formats at all.
    log.debug(
        "The following file formats will be used by IlluminaDataProvider: "
            + StringUtil.join(
                ",", new ArrayList<SupportedIlluminaFormat>(formatToDataTypes.keySet())));

    availableTiles =
        fileUtil.getActualTiles(new ArrayList<SupportedIlluminaFormat>(formatToDataTypes.keySet()));
    if (availableTiles.isEmpty()) {
      throw new PicardException(
          "No available tiles were found, make sure that "
              + basecallDirectory.getAbsolutePath()
              + " has a lane "
              + lane);
    }

    outputMapping = new OutputMapping(readStructure);
  }

  /**
   * Sometimes (in the case of skipped reads) the logical read structure of the output cluster data
   * is different from the input readStructure
   *
   * @return The ReadStructure describing the output cluster data
   */
  public ReadStructure getOutputReadStructure() {
    return outputMapping.getOutputReadStructure();
  }

  /**
   * Return the list of tiles available for this flowcell and lane. These are in ascending numerical
   * order.
   *
   * @return List of all tiles available for this flowcell and lane.
   */
  public List<Integer> getAvailableTiles() {
    return availableTiles;
  }

  /**
   * Sets whether or not EAMSS filtering will be applied if parsing BCL files for bases and quality
   * scores.
   */
  public void setApplyEamssFiltering(final boolean applyEamssFiltering) {
    this.applyEamssFiltering = applyEamssFiltering;
  }

  /**
   * Call this method to create a ClusterData iterator over all clusters for all tiles in ascending
   * numeric order.
   *
   * @return An iterator for reading the Illumina basecall output for the lane specified in the
   *     ctor.
   */
  public IlluminaDataProvider makeDataProvider() {
    return makeDataProvider(null);
  }

  /**
   * Call this method to create a ClusterData iterator over the specified tiles.
   *
   * @param requestedTiles The tiles to iterate over, or null to use all available tiles
   * @return An iterator for reading the Illumina basecall output for the lane specified in the
   *     constructor.
   * @throws PicardException if requestedTiles is non-null but empty
   */
  public IlluminaDataProvider makeDataProvider(List<Integer> requestedTiles) {
    if (requestedTiles == null) {
      requestedTiles = availableTiles;
    } else {
      if (requestedTiles.size() == 0) {
        throw new PicardException(
            "Zero length tile list supplied to makeDataProvider, you must specify at least 1 tile OR pass NULL to use all available tiles");
      }
    }

    // One parser per selected file format, each responsible for that format's data types.
    final Map<IlluminaParser, Set<IlluminaDataType>> parsersToDataType =
        new HashMap<IlluminaParser, Set<IlluminaDataType>>();
    for (final Map.Entry<SupportedIlluminaFormat, Set<IlluminaDataType>> fmToDt :
        formatToDataTypes.entrySet()) {
      parsersToDataType.put(makeParser(fmToDt.getKey(), requestedTiles), fmToDt.getValue());
    }

    // Separator and elements are passed separately so the parsers are actually listed.
    log.debug(
        "The following parsers will be used by IlluminaDataProvider: "
            + StringUtil.join(",", new ArrayList<IlluminaParser>(parsersToDataType.keySet())));

    return new IlluminaDataProvider(outputMapping, parsersToDataType, basecallDirectory, lane);
  }

  /**
   * Given a set of formats to data types they provide, find any requested data types that do not
   * have a format associated with them and return them
   *
   * @param requestedDataTypes Data types that need to be provided
   * @param formatToMatchedTypes A map of file formats to data types that will support them
   * @return The data types that go unsupported by the formats found in formatToMatchedTypes
   */
  public static Set<IlluminaDataType> findUnmatchedTypes(
      final Set<IlluminaDataType> requestedDataTypes,
      final Map<SupportedIlluminaFormat, Set<IlluminaDataType>> formatToMatchedTypes) {
    // Work on a copy so the caller's set is left untouched.
    final Set<IlluminaDataType> copiedTypes = new HashSet<IlluminaDataType>(requestedDataTypes);
    for (final Set<IlluminaDataType> matchedTypes : formatToMatchedTypes.values()) {
      copiedTypes.removeAll(matchedTypes);
    }

    return copiedTypes;
  }

  /**
   * For all requestedDataTypes return a map of file format to set of provided data types that
   * covers as many requestedDataTypes as possible and chooses the most preferred available formats
   * possible
   *
   * @param requestedDataTypes Data types to be provided
   * @param fileUtil A file util for the lane/directory we wish to provide data for
   * @return A Map<Supported file format, Set of data types file format provides>
   */
  public static Map<SupportedIlluminaFormat, Set<IlluminaDataType>> determineFormats(
      final Set<IlluminaDataType> requestedDataTypes, final IlluminaFileUtil fileUtil) {
    // For predictable ordering and uniqueness only, put the requestedDataTypes into a treeSet
    final SortedSet<IlluminaDataType> toSupport = new TreeSet<IlluminaDataType>(requestedDataTypes);
    final Map<SupportedIlluminaFormat, Set<IlluminaDataType>> fileTypeToDataTypes =
        new HashMap<SupportedIlluminaFormat, Set<IlluminaDataType>>();
    final Map<IlluminaDataType, SupportedIlluminaFormat> dataTypeToFormat =
        new HashMap<IlluminaDataType, SupportedIlluminaFormat>();

    // First pass: pick the best available format for each data type (types without one are left
    // out and later reported by findUnmatchedTypes).
    for (final IlluminaDataType ts : toSupport) {
      final SupportedIlluminaFormat preferredFormat = findPreferredAvailableFormat(ts, fileUtil);
      if (preferredFormat != null) {
        dataTypeToFormat.put(ts, preferredFormat);
      }
    }

    // Second pass: invert the mapping, grouping data types under the format that provides them.
    for (final IlluminaDataType dt : toSupport) {
      final SupportedIlluminaFormat format = dataTypeToFormat.get(dt);

      if (format != null) {
        if (fileTypeToDataTypes.containsKey(format)) {
          fileTypeToDataTypes.get(format).add(dt);
        } else {
          fileTypeToDataTypes.put(format, makeSet(dt));
        }
      }
    }

    return fileTypeToDataTypes;
  }

  /**
   * Given a data type find the most preferred file format that also has files available
   *
   * @param dt Type of desired data
   * @param fileUtil Util for the lane/directory in which we will find data
   * @return The file format that is "most preferred" (i.e. fastest to parse/smallest in memory),
   *     or null if no format for the data type has files available
   */
  private static SupportedIlluminaFormat findPreferredAvailableFormat(
      final IlluminaDataType dt, final IlluminaFileUtil fileUtil) {
    return findPreferredFormat(dt, fileUtil, true);
  }

  /**
   * Given a data type find the most preferred file format even if files are not available
   *
   * @param dt Type of desired data
   * @param fileUtil Util for the lane/directory in which we will find data
   * @return The file format that is "most preferred" (i.e. fastest to parse/smallest in memory),
   *     or null if no format is registered for the data type
   */
  public static SupportedIlluminaFormat findPreferredFormat(
      final IlluminaDataType dt, final IlluminaFileUtil fileUtil) {
    return findPreferredFormat(dt, fileUtil, false);
  }

  /**
   * Walks the preference list for a data type and returns the first acceptable format.
   *
   * @param dt Type of desired data
   * @param fileUtil Util for the lane/directory in which we will find data
   * @param checkAvailable When true a format is only acceptable if its files are available; when
   *     false the first listed format is returned without consulting fileUtil
   * @return The first acceptable format in preference order, or null if there is none (including
   *     when no preference list is registered for the data type)
   */
  private static SupportedIlluminaFormat findPreferredFormat(
      final IlluminaDataType dt, final IlluminaFileUtil fileUtil, final boolean checkAvailable) {
    final List<SupportedIlluminaFormat> preferredFormats = DATA_TYPE_TO_PREFERRED_FORMATS.get(dt);
    if (preferredFormats == null) {
      // No preference list registered for this data type: report "no format" instead of failing
      // with a NullPointerException, so callers can raise a meaningful error.
      return null;
    }

    for (final SupportedIlluminaFormat candidate : preferredFormats) {
      if (!checkAvailable || fileUtil.getUtil(candidate).filesAvailable()) {
        return candidate;
      }
    }

    return null;
  }

  /**
   * There are multiple parsers for the same IlluminaDataType (e.g. BCLParser and QSeqParser).
   * Instantiate an instance of the preferred parser for the given data type with the information
   * available and return it.
   *
   * @param format The type of data we want to parse
   * @param requestedTiles The requestedTiles over which we will be parsing data
   * @return A parser that will parse dataType data over the given requestedTiles and cycles and
   *     output it in groupings of the sizes specified in outputLengths
   * @throws PicardException if the format is not one this factory knows how to parse
   */
  private IlluminaParser makeParser(
      final SupportedIlluminaFormat format, final List<Integer> requestedTiles) {
    final IlluminaParser parser;
    switch (format) {
      case Barcode:
        parser =
            new BarcodeParser(
                ((PerTileFileUtil) fileUtil.getUtil(SupportedIlluminaFormat.Barcode))
                    .getFiles(requestedTiles));
        break;

      case Bcl:
        {
          final CycleIlluminaFileMap bclFileMap =
              ((PerTilePerCycleFileUtil) fileUtil.getUtil(SupportedIlluminaFormat.Bcl))
                  .getFiles(requestedTiles, outputMapping.getOutputCycles());
          // Validate the file map against the requested tiles/cycles before building the parser.
          bclFileMap.assertValid(requestedTiles, outputMapping.getOutputCycles());
          parser =
              new BclParser(
                  basecallDirectory,
                  lane,
                  bclFileMap,
                  outputMapping,
                  this.applyEamssFiltering,
                  bclQualityEvaluationStrategy);
          break;
        }

      case Filter:
        final IlluminaFileMap filterFileMap =
            ((PerTileFileUtil) fileUtil.getUtil(SupportedIlluminaFormat.Filter))
                .getFiles(requestedTiles);
        parser = new FilterParser(filterFileMap);
        break;

      // All three position formats are handled by the same parser, which is told the format.
      case Locs:
      case Clocs:
      case Pos:
        final PerTileFileUtil fu = (PerTileFileUtil) fileUtil.getUtil(format);
        parser = new PosParser(fu.getFiles(requestedTiles), format);
        break;

      case MultiTileFilter:
        parser =
            ((MultiTileFilterFileUtil) fileUtil.getUtil(SupportedIlluminaFormat.MultiTileFilter))
                .makeParser(requestedTiles);
        break;

      case MultiTileLocs:
        parser =
            ((MultiTileLocsFileUtil) fileUtil.getUtil(SupportedIlluminaFormat.MultiTileLocs))
                .makeParser(requestedTiles);
        break;

      case MultiTileBcl:
        {
          final MultiTileBclFileUtil util =
              (MultiTileBclFileUtil) fileUtil.getUtil(SupportedIlluminaFormat.MultiTileBcl);
          final CycleIlluminaFileMap bclFileMap =
              util.getFiles(requestedTiles, outputMapping.getOutputCycles());
          // Validate the file map against the requested tiles/cycles before building the parser.
          bclFileMap.assertValid(requestedTiles, outputMapping.getOutputCycles());
          parser =
              new MultiTileBclParser(
                  basecallDirectory,
                  lane,
                  bclFileMap,
                  outputMapping,
                  this.applyEamssFiltering,
                  bclQualityEvaluationStrategy,
                  util.tileIndex);
          break;
        }

      default:
        throw new PicardException(
            "Unrecognized data type(" + format + ") found by IlluminaDataProviderFactory!");
    }

    return parser;
  }
}
Ejemplo n.º 9
0
/**
 * VCF filter that annotates the variants of an input VCF with the ID and selected INFO fields of
 * matching records found in a second, tabix-indexed VCF.
 */
public class VcfVcf extends AbstractVCFFilter {
  private static final Log LOG = Log.getInstance(VcfVcf.class);

  @Usage(programVersion = "1.0")
  public String USAGE =
      getStandardUsagePreamble() + "Get the INFO from a VCF and use it for another VCF. ";

  @Option(shortName = "TBX", doc = "The VCF file indexed with TABIX. Source of the annotations")
  public String TABIX;

  @Option(shortName = "INFO", doc = "The INFO keys to grab.", minElements = 0)
  public Set<String> INFO_IDS = new LinkedHashSet<String>();

  @Option(shortName = "RIF", doc = "Replace the INFO field if it exists.", minElements = 0)
  public boolean REPLACE_INFO_FIELD = true;

  @Option(shortName = "RID", doc = "Replace the ID field if it exists.", optional = true)
  public boolean REPLACE_ID = true;

  @Option(shortName = "RAM", doc = "REF allele matters.", optional = true)
  public boolean REF_ALLELE_MATTERS = true;

  @Option(shortName = "AAM", doc = "ALT alleles matters.", optional = true)
  public boolean ALT_ALLELES_MATTERS = false;

  @Option(shortName = "ACF", doc = "Flag to set if alternate alleles conflict.", optional = true)
  public String ALT_CONFLICT_FLAG = null;

  /**
   * Copies every variant from {@code r} to {@code w}, enriched with the ID and the requested INFO
   * attributes of the records of the tabix-indexed VCF that share the same start and end.
   *
   * @param r iterator over the VCF to annotate
   * @param w writer receiving the annotated variants
   * @throws IOException if the tabix file cannot be opened or read
   */
  @Override
  protected void doWork(VcfIterator r, VariantContextWriter w) throws IOException {
    AbstractVCFCodec codeIn3 = VCFUtils.createDefaultVCFCodec();
    String line;

    StringWriter sw = new StringWriter();
    LOG.info("opening tabix file: " + this.TABIX);
    TabixReader tabix = new TabixReader(this.TABIX);

    // The reader is closed in the finally block so that it is released even when decoding or
    // writing fails part-way through.
    try {
      // Collect the header lines of the annotation VCF so its INFO definitions can be parsed.
      while ((line = tabix.readLine()) != null) {
        if (!line.startsWith(VCFHeader.HEADER_INDICATOR)) {
          break;
        }
        sw.append(line).append("\n");
      }
      VCFHeader header3 =
          (VCFHeader)
              codeIn3.readActualHeader(
                  new LineIteratorImpl(
                      LineReaderUtil.fromBufferedStream(
                          new ByteArrayInputStream(sw.toString().getBytes()))));
      VCFHeader header1 = r.getHeader();

      // Output header = input header + definitions of the INFO fields that will be copied over.
      VCFHeader h2 =
          new VCFHeader(header1.getMetaDataInInputOrder(), header1.getSampleNamesInOrder());
      for (String infoId : this.INFO_IDS) {
        VCFInfoHeaderLine vihl = header3.getInfoHeaderLine(infoId);
        if (vihl == null) {
          LOG.warn("Not INFO=" + infoId + " in " + TABIX);
          continue;
        }
        if (h2.getInfoHeaderLine(infoId) != null) {
          LOG.warn("Input already contains INFO=" + vihl);
        }
        h2.addMetaDataLine(vihl);
      }

      if (ALT_CONFLICT_FLAG != null) {
        // A Flag INFO field carries no value, so its declared count must be 0.
        h2.addMetaDataLine(
            new VCFInfoHeaderLine(
                ALT_CONFLICT_FLAG,
                0,
                VCFHeaderLineType.Flag,
                "conflict ALT allele with " + this.TABIX));
      }

      w.writeHeader(h2);
      while (r.hasNext()) {
        VariantContext ctx1 = r.next();

        VariantContextBuilder vcb = new VariantContextBuilder(ctx1);
        String line2;
        String BEST_ID = null;
        boolean best_id_match_alt = false;

        List<VariantContext> variantsList = new ArrayList<VariantContext>();

        int[] array =
            tabix.parseReg(ctx1.getChr() + ":" + (ctx1.getStart()) + "-" + (ctx1.getEnd()));
        TabixReader.Iterator iter = null;

        if (array != null
            && array.length == 3
            && array[0] != -1
            && array[1] >= 0
            && array[2] >= 0) {
          iter = tabix.query(array[0], array[1], array[2]);
        } else {
          LOG.info(
              "Cannot get " + ctx1.getChr() + ":" + (ctx1.getStart()) + "-" + (ctx1.getEnd()));
        }

        // Gather the annotation records with identical start/end. An exact REF+ALT match wins
        // outright: everything collected so far is discarded and the search stops.
        while (iter != null && (line2 = iter.next()) != null) {
          VariantContext ctx3 = codeIn3.decode(line2);
          if (ctx3.getStart() != ctx1.getStart()) continue;
          if (ctx3.getEnd() != ctx1.getEnd()) continue;

          if (ctx1.getReference().equals(ctx3.getReference())
              && ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) {
            variantsList.clear();
            variantsList.add(ctx3);
            break;
          } else {
            variantsList.add(ctx3);
          }
        }

        for (VariantContext ctx3 : variantsList) {

          if (this.REF_ALLELE_MATTERS && !ctx1.getReference().equals(ctx3.getReference())) {
            continue;
          }
          if (this.ALT_ALLELES_MATTERS
              && !ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) {
            continue;
          }

          // Only a real identifier may replace the ID: hasID() is false for the "." placeholder,
          // which would otherwise wipe an existing ID of the input variant.
          if (ctx3.hasID() && this.REPLACE_ID) {
            if (BEST_ID != null && best_id_match_alt) {
              // an ID from a record with matching ALT alleles was already chosen: keep it
            } else {
              BEST_ID = ctx3.getID();
              best_id_match_alt = ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles());
            }
          }

          for (String id : this.INFO_IDS) {
            Object info3 = ctx3.getAttribute(id);
            if (info3 == null) {
              continue;
            }
            // Leave an existing value alone unless replacement was requested.
            Object info1 = ctx1.getAttribute(id);
            if (info1 != null && !this.REPLACE_INFO_FIELD) {
              continue;
            }

            vcb.attribute(id, info3);
          }

          if (ALT_CONFLICT_FLAG != null
              && !ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) {
            vcb.attribute(ALT_CONFLICT_FLAG, true);
          }
        }
        if (BEST_ID != null) {
          vcb.id(BEST_ID);
        }
        w.add(vcb.make());
      }
    } finally {
      tabix.close();
    }
  }

  /** Command-line entry point. */
  public static void main(String[] args) throws IOException {
    new VcfVcf().instanceMainWithExit(args);
  }
}
Ejemplo n.º 10
0
/**
 * Command line program to generate a BAM index (.bai) file from a BAM (.bam) file
 *
 * @author Martha Borkan
 */
public class BuildBamIndex extends CommandLineProgram {

  private static final Log log = Log.getInstance(BuildBamIndex.class);

  @Usage public String USAGE = getStandardUsagePreamble() + "Generates a BAM index (.bai) file.";

  @Option(
      shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
      doc = "A BAM file or URL to process. Must be sorted in coordinate order.")
  public String INPUT;

  URL inputUrl = null; // INPUT as URL
  File inputFile = null; // INPUT as File, if it can't be interpreted as a valid URL

  @Option(
      shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
      doc =
          "The BAM index file. Defaults to x.bai if INPUT is x.bam, otherwise INPUT.bai.\n"
              + "If INPUT is a URL and OUTPUT is unspecified, defaults to a file in the current directory.",
      optional = true)
  public File OUTPUT;

  /** Stock main method for a command line program. */
  public static void main(final String[] argv) {
    System.exit(new BuildBamIndex().instanceMain(argv));
  }

  /**
   * Main method for the program. Checks that all input files are present and readable and that the
   * output file can be written to. Then iterates through all the records generating a BAM Index,
   * then writes the bai file.
   */
  protected int doWork() {

    try {
      inputUrl = new URL(INPUT);
    } catch (java.net.MalformedURLException e) {
      inputFile = new File(INPUT);
    }

    // set default output file - input-file.bai
    if (OUTPUT == null) {

      final String baseFileName;
      if (inputUrl != null) {
        String path = inputUrl.getPath();
        int lastSlash = path.lastIndexOf("/");
        baseFileName = path.substring(lastSlash + 1, path.length());
      } else {
        baseFileName = inputFile.getAbsolutePath();
      }

      if (baseFileName.endsWith(BamFileIoUtils.BAM_FILE_EXTENSION)) {

        final int index = baseFileName.lastIndexOf(".");
        OUTPUT = new File(baseFileName.substring(0, index) + BAMIndex.BAMIndexSuffix);

      } else {
        OUTPUT = new File(baseFileName + BAMIndex.BAMIndexSuffix);
      }
    }

    IOUtil.assertFileIsWritable(OUTPUT);
    final SAMFileReader bam;

    if (inputUrl != null) {
      // remote input
      bam = new SAMFileReader(inputUrl, null, false);
    } else {
      // input from a normal file
      IOUtil.assertFileIsReadable(inputFile);
      bam = new SAMFileReader(inputFile);
    }

    if (!bam.isBinary()) {
      throw new SAMException("Input file must be bam file, not sam file.");
    }

    if (!bam.getFileHeader().getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
      throw new SAMException("Input bam file must be sorted by coordinates");
    }

    BAMIndexer.createIndex(bam, OUTPUT);

    log.info("Successfully wrote bam index file " + OUTPUT);
    CloserUtil.close(bam);
    return 0;
  }
}
Ejemplo n.º 11
0
/**
 * Reads a SAM or BAM file and combines the output to one file
 *
 * @author Tim Fennell
 */
@CommandLineProgramProperties(
    usage = "Merges multiple SAM/BAM files into one file.",
    usageShort = "Merges multiple SAM or BAM files into one file",
    programGroup = SamOrBam.class)
public class MergeSamFiles extends CommandLineProgram {
  private static final Log log = Log.getInstance(MergeSamFiles.class);

  @Option(shortName = "I", doc = "SAM or BAM input file", minElements = 1)
  public List<File> INPUT = new ArrayList<File>();

  @Option(shortName = "O", doc = "SAM or BAM file to write merged result to")
  public File OUTPUT;

  @Option(
      shortName = StandardOptionDefinitions.SORT_ORDER_SHORT_NAME,
      doc = "Sort order of output file",
      optional = true)
  public SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;

  @Option(
      doc =
          "If true, assume that the input files are in the same sort order as the requested output sort order, even if their headers say otherwise.",
      shortName = StandardOptionDefinitions.ASSUME_SORTED_SHORT_NAME)
  public boolean ASSUME_SORTED = false;

  @Option(shortName = "MSD", doc = "Merge the sequence dictionaries", optional = true)
  public boolean MERGE_SEQUENCE_DICTIONARIES = false;

  @Option(
      doc =
          "Option to create a background thread to encode, "
              + "compress and write to disk the output file. The threaded version uses about 20% more CPU and decreases "
              + "runtime by ~20% when writing out a compressed BAM file.")
  public boolean USE_THREADING = false;

  @Option(
      doc = "Comment(s) to include in the merged output file's header.",
      optional = true,
      shortName = "CO")
  public List<String> COMMENT = new ArrayList<String>();

  @Option(
      shortName = "RGN",
      doc =
          "An interval list file that contains the locations of the positions to merge. "
              + "Assume bam are sorted and indexed. "
              + "The resulting file will contain alignments that may overlap with genomic regions outside the requested region. "
              + "Unmapped reads are discarded.",
      optional = true)
  public File INTERVALS = null;

  private static final int PROGRESS_INTERVAL = 1000000;

  /** Required main method implementation. */
  public static void main(final String[] argv) {
    System.exit(new MergeSamFiles().instanceMain(argv));
  }

  /** Combines multiple SAM/BAM files into one. */
  @Override
  protected int doWork() {
    boolean matchedSortOrders = true;

    // read interval list if it is defined
    final List<Interval> intervalList =
        (INTERVALS == null ? null : IntervalList.fromFile(INTERVALS).uniqued().getIntervals());
    // map reader->iterator used if INTERVALS is defined
    final Map<SamReader, CloseableIterator<SAMRecord>> samReaderToIterator =
        new HashMap<SamReader, CloseableIterator<SAMRecord>>(INPUT.size());

    // Open the files for reading and writing
    final List<SamReader> readers = new ArrayList<SamReader>();
    final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();
    {
      SAMSequenceDictionary dict = null; // Used to try and reduce redundant SDs in memory

      for (final File inFile : INPUT) {
        IOUtil.assertFileIsReadable(inFile);
        final SamReader in =
            SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(inFile);
        if (INTERVALS != null) {
          if (!in.hasIndex())
            throw new PicardException(
                "Merging with interval but Bam file is not indexed " + inFile);
          final CloseableIterator<SAMRecord> samIterator =
              new SamRecordIntervalIteratorFactory()
                  .makeSamRecordIntervalIterator(in, intervalList, true);
          samReaderToIterator.put(in, samIterator);
        }

        readers.add(in);
        headers.add(in.getFileHeader());

        // A slightly hackish attempt to keep memory consumption down when merging multiple files
        // with
        // large sequence dictionaries (10,000s of sequences). If the dictionaries are identical,
        // then
        // replace the duplicate copies with a single dictionary to reduce the memory footprint.
        if (dict == null) {
          dict = in.getFileHeader().getSequenceDictionary();
        } else if (dict.equals(in.getFileHeader().getSequenceDictionary())) {
          in.getFileHeader().setSequenceDictionary(dict);
        }

        matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER;
      }
    }

    // If all the input sort orders match the output sort order then just merge them and
    // write on the fly, otherwise setup to merge and sort before writing out the final file
    IOUtil.assertFileIsWritable(OUTPUT);
    final boolean presorted;
    final SAMFileHeader.SortOrder headerMergerSortOrder;
    final boolean mergingSamRecordIteratorAssumeSorted;

    if (matchedSortOrders
        || SORT_ORDER == SAMFileHeader.SortOrder.unsorted
        || ASSUME_SORTED
        || INTERVALS != null) {
      log.info(
          "Input files are in same order as output so sorting to temp directory is not needed.");
      headerMergerSortOrder = SORT_ORDER;
      mergingSamRecordIteratorAssumeSorted = ASSUME_SORTED;
      presorted = true;
    } else {
      log.info("Sorting input files using temp directory " + TMP_DIR);
      headerMergerSortOrder = SAMFileHeader.SortOrder.unsorted;
      mergingSamRecordIteratorAssumeSorted = false;
      presorted = false;
    }
    final SamFileHeaderMerger headerMerger =
        new SamFileHeaderMerger(headerMergerSortOrder, headers, MERGE_SEQUENCE_DICTIONARIES);
    final MergingSamRecordIterator iterator;
    // no interval defined, get an iterator for the whole bam
    if (intervalList == null) {
      iterator =
          new MergingSamRecordIterator(headerMerger, readers, mergingSamRecordIteratorAssumeSorted);
    } else {
      // show warning related to https://github.com/broadinstitute/picard/pull/314/files
      log.info(
          "Warning: merged bams from different interval lists may contain the same read in both files");
      iterator = new MergingSamRecordIterator(headerMerger, samReaderToIterator, true);
    }
    final SAMFileHeader header = headerMerger.getMergedHeader();
    for (final String comment : COMMENT) {
      header.addComment(comment);
    }
    header.setSortOrder(SORT_ORDER);
    final SAMFileWriterFactory samFileWriterFactory = new SAMFileWriterFactory();
    if (USE_THREADING) {
      samFileWriterFactory.setUseAsyncIo(true);
    }
    final SAMFileWriter out = samFileWriterFactory.makeSAMOrBAMWriter(header, presorted, OUTPUT);

    // Lastly loop through and write out the records
    final ProgressLogger progress = new ProgressLogger(log, PROGRESS_INTERVAL);
    while (iterator.hasNext()) {
      final SAMRecord record = iterator.next();
      out.addAlignment(record);
      progress.record(record);
    }

    log.info("Finished reading inputs.");
    for (final CloseableIterator<SAMRecord> iter : samReaderToIterator.values())
      CloserUtil.close(iter);
    CloserUtil.close(readers);
    out.close();
    return 0;
  }

  @Override
  protected String[] customCommandLineValidation() {
    if (CREATE_INDEX && SORT_ORDER != SAMFileHeader.SortOrder.coordinate) {
      return new String[] {"Can't CREATE_INDEX unless SORT_ORDER is coordinate"};
    }
    return null;
  }
}
Ejemplo n.º 12
0
/**
 * A command-line tool to merge BAM/SAM alignment info from a third-party aligner with the data in
 * an unmapped BAM file, producing a third BAM file that has alignment data and all the additional
 * data from the unmapped BAM
 *
 * @author [email protected]
 */
@CommandLineProgramProperties(
    usage = MergeBamAlignment.USAGE_SUMMARY + MergeBamAlignment.USAGE_DETAILS,
    usageShort = MergeBamAlignment.USAGE_SUMMARY,
    programGroup = SamOrBam.class)
public class MergeBamAlignment extends CommandLineProgram {
  static final String USAGE_SUMMARY =
      "Merge alignment data from a SAM or BAM with data in an unmapped BAM file.  ";
  static final String USAGE_DETAILS =
      "This tool produces a new SAM or BAM file that includes all aligned and unaligned reads and also carries "
          + "forward additional read attributes from the unmapped BAM (attributes that are otherwise lost in the process of alignment)."
          + "  The purpose of this tool is to use information from the unmapped BAM to fix up aligner output.  The resulting file will be valid "
          + "for use by other Picard tools.  For simple BAM file merges, use MergeSamFiles.  Note that MergeBamAlignment expects to "
          + "find a sequence dictionary in the same directory as REFERENCE_SEQUENCE and expects it "
          + "to have the same base name as the reference FASTA except with the extension \".dict\". "
          + "<h4>Usage example:</h4>"
          + "<pre>"
          + "java -jar picard.jar MergeBamAlignment \\<br /> "
          + "      ALIGNED=aligned.bam \\ <br /> "
          + "      UNMAPPED=unmapped.bam \\ <br /> "
          + "      O=merge_alignments.bam \\<br /> "
          + "      R=reference_sequence.fasta"
          + "</pre> "
          + "<hr />";

  @Option(
      shortName = "UNMAPPED",
      doc = "Original SAM or BAM file of unmapped reads, which must be in queryname order.")
  public File UNMAPPED_BAM;

  @Option(
      shortName = "ALIGNED",
      doc = "SAM or BAM file(s) with alignment data.",
      mutex = {"READ1_ALIGNED_BAM", "READ2_ALIGNED_BAM"},
      optional = true)
  public List<File> ALIGNED_BAM;

  @Option(
      shortName = "R1_ALIGNED",
      doc = "SAM or BAM file(s) with alignment data from the first read of a pair.",
      mutex = {"ALIGNED_BAM"},
      optional = true)
  public List<File> READ1_ALIGNED_BAM;

  @Option(
      shortName = "R2_ALIGNED",
      doc = "SAM or BAM file(s) with alignment data from the second read of a pair.",
      mutex = {"ALIGNED_BAM"},
      optional = true)
  public List<File> READ2_ALIGNED_BAM;

  @Option(
      shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
      doc = "Merged SAM or BAM file to write to.")
  public File OUTPUT;

  @Option(
      shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME,
      doc = "Path to the fasta file for the reference sequence.")
  public File REFERENCE_SEQUENCE;

  @Option(
      shortName = StandardOptionDefinitions.PROGRAM_RECORD_ID_SHORT_NAME,
      doc = "The program group ID of the aligner (if not supplied by the aligned file).",
      optional = true)
  public String PROGRAM_RECORD_ID;

  @Option(
      shortName = "PG_VERSION",
      doc = "The version of the program group (if not supplied by the aligned file).",
      optional = true)
  public String PROGRAM_GROUP_VERSION;

  @Option(
      shortName = "PG_COMMAND",
      doc = "The command line of the program group (if not supplied by the aligned file).",
      optional = true)
  public String PROGRAM_GROUP_COMMAND_LINE;

  @Option(
      shortName = "PG_NAME",
      doc = "The name of the program group (if not supplied by the aligned file).",
      optional = true)
  public String PROGRAM_GROUP_NAME;

  @Deprecated
  @Option(doc = "This argument is ignored and will be removed.", shortName = "PE", optional = true)
  public Boolean PAIRED_RUN = true;

  @Option(
      doc =
          "The expected jump size (required if this is a jumping library). Deprecated. Use EXPECTED_ORIENTATIONS instead",
      shortName = "JUMP",
      mutex = "EXPECTED_ORIENTATIONS",
      optional = true)
  public Integer JUMP_SIZE;

  @Option(doc = "Whether to clip adapters where identified.")
  public boolean CLIP_ADAPTERS = true;

  @Option(doc = "Whether the lane is bisulfite sequence (used when caculating the NM tag).")
  public boolean IS_BISULFITE_SEQUENCE = false;

  @Option(doc = "Whether to output only aligned reads.  ")
  public boolean ALIGNED_READS_ONLY = false;

  @Option(
      doc =
          "The maximum number of insertions or deletions permitted for an alignment to be "
              + "included. Alignments with more than this many insertions or deletions will be ignored. "
              + "Set to -1 to allow any number of insertions or deletions.",
      shortName = "MAX_GAPS")
  public int MAX_INSERTIONS_OR_DELETIONS = 1;

  @Option(
      doc =
          "Reserved alignment attributes (tags starting with X, Y, or Z) that should be "
              + "brought over from the alignment data when merging.")
  public List<String> ATTRIBUTES_TO_RETAIN = new ArrayList<String>();

  @Option(
      doc =
          "Attributes from the alignment record that should be removed when merging."
              + "  This overrides ATTRIBUTES_TO_RETAIN if they share common tags.")
  public List<String> ATTRIBUTES_TO_REMOVE = new ArrayList<String>();

  @Option(
      shortName = "R1_TRIM",
      doc = "The number of bases trimmed from the beginning of read 1 prior to alignment")
  public int READ1_TRIM = 0;

  @Option(
      shortName = "R2_TRIM",
      doc = "The number of bases trimmed from the beginning of read 2 prior to alignment")
  public int READ2_TRIM = 0;

  @Option(
      shortName = "ORIENTATIONS",
      doc = "The expected orientation of proper read pairs. Replaces JUMP_SIZE",
      mutex = "JUMP_SIZE",
      optional = true)
  public List<SamPairUtil.PairOrientation> EXPECTED_ORIENTATIONS;

  @Option(
      doc =
          "Use the aligner's idea of what a proper pair is rather than computing in this program.")
  public boolean ALIGNER_PROPER_PAIR_FLAGS = false;

  @Option(
      shortName = StandardOptionDefinitions.SORT_ORDER_SHORT_NAME,
      doc = "The order in which the merged reads should be output.")
  public SortOrder SORT_ORDER = SortOrder.coordinate;

  @Option(
      doc =
          "Strategy for selecting primary alignment when the aligner has provided more than one alignment "
              + "for a pair or fragment, and none are marked as primary, more than one is marked as primary, or the primary "
              + "alignment is filtered out for some reason. "
              + "BestMapq expects that multiple alignments will be correlated with HI tag, and prefers the pair of "
              + "alignments with the largest MAPQ, in the absence of a primary selected by the aligner. "
              + "EarliestFragment prefers the alignment which maps the earliest base in the read. Note that EarliestFragment "
              + "may not be used for paired reads. "
              + "BestEndMapq is appropriate for cases in which the aligner is not pair-aware, and does not output the HI tag. "
              + "It simply picks the alignment for each end with the highest MAPQ, and makes those alignments primary, regardless "
              + "of whether the two alignments make sense together."
              + "MostDistant is also for a non-pair-aware aligner, and picks the alignment pair with the largest insert size. "
              + "If all alignments would be chimeric, it picks the alignments for each end with the best MAPQ.  For all algorithms, "
              + "ties are resolved arbitrarily.")
  public PrimaryAlignmentStrategy PRIMARY_ALIGNMENT_STRATEGY = PrimaryAlignmentStrategy.BestMapq;

  @Option(
      doc =
          "For paired reads, soft clip the 3' end of each read if necessary so that it does not extend past the 5' end of its mate.")
  public boolean CLIP_OVERLAPPING_READS = true;

  @Option(doc = "If false, do not write secondary alignments to output.")
  public boolean INCLUDE_SECONDARY_ALIGNMENTS = true;

  @Option(
      shortName = "MC",
      optional = true,
      doc = "Adds the mate CIGAR tag (MC) if true, does not if false.")
  public Boolean ADD_MATE_CIGAR = true;

  @Option(
      shortName = "UNMAP_CONTAM",
      optional = true,
      doc =
          "Detect reads originating from foreign organisms (e.g. bacterial DNA in a non-bacterial sample),"
              + "and unmap + label those reads accordingly.")
  public boolean UNMAP_CONTAMINANT_READS = false;

  @Option(
      doc =
          "If UNMAP_CONTAMINANT_READS is set, require this many unclipped bases or else the read will be marked as contaminant.")
  public int MIN_UNCLIPPED_BASES = 32;

  private static final Log log = Log.getInstance(MergeBamAlignment.class);

  /**
   * Mechanism to bridge between command line option and PrimaryAlignmentSelectionStrategy
   * implementation.
   */
  enum PrimaryAlignmentStrategy {
    BestMapq(BestMapqPrimaryAlignmentSelectionStrategy.class),
    EarliestFragment(EarliestFragmentPrimaryAlignmentSelectionStrategy.class),
    BestEndMapq(BestEndMapqPrimaryAlignmentStrategy.class),
    MostDistant(MostDistantPrimaryAlignmentSelectionStrategy.class);

    private final Class<PrimaryAlignmentSelectionStrategy> clazz;

    PrimaryAlignmentStrategy(final Class<?> clazz) {
      this.clazz = (Class<PrimaryAlignmentSelectionStrategy>) clazz;
    }

    /**
     * Creates a fresh strategy object through the implementation class's no-argument constructor.
     *
     * @throws PicardException wrapping any failure raised while instantiating the class
     */
    PrimaryAlignmentSelectionStrategy newInstance() {
      try {
        return clazz.newInstance();
      } catch (Exception e) {
        throw new PicardException("Trouble instantiating " + clazz.getName(), e);
      }
    }
  }

  /** Required main method implementation. */
  public static void main(final String[] argv) {
    System.exit(new MergeBamAlignment().instanceMain(argv));
  }

  /**
   * Builds the optional aligner program record, settles the expected pair orientations, and runs a
   * SamAlignmentMerger configured from the command-line options.
   *
   * @return 0 on success
   */
  @Override
  protected int doWork() {
    // Describe the aligner's program group only when an ID was supplied on the command line.
    SAMProgramRecord prod = null;
    if (PROGRAM_RECORD_ID != null) {
      prod = new SAMProgramRecord(PROGRAM_RECORD_ID);
      prod.setProgramVersion(PROGRAM_GROUP_VERSION);
      prod.setCommandLine(PROGRAM_GROUP_COMMAND_LINE);
      prod.setProgramName(PROGRAM_GROUP_NAME);
    }
    // TEMPORARY FIX until internal programs all specify EXPECTED_ORIENTATIONS
    // A jump size selects RF; otherwise FR is the fallback when no orientation was given.
    if (JUMP_SIZE != null) {
      EXPECTED_ORIENTATIONS = Arrays.asList(SamPairUtil.PairOrientation.RF);
    } else if (EXPECTED_ORIENTATIONS == null || EXPECTED_ORIENTATIONS.isEmpty()) {
      EXPECTED_ORIENTATIONS = Arrays.asList(SamPairUtil.PairOrientation.FR);
    }

    final SamAlignmentMerger merger =
        new SamAlignmentMerger(
            UNMAPPED_BAM,
            OUTPUT,
            REFERENCE_SEQUENCE,
            prod,
            CLIP_ADAPTERS,
            IS_BISULFITE_SEQUENCE,
            ALIGNED_READS_ONLY,
            ALIGNED_BAM,
            MAX_INSERTIONS_OR_DELETIONS,
            ATTRIBUTES_TO_RETAIN,
            ATTRIBUTES_TO_REMOVE,
            READ1_TRIM,
            READ2_TRIM,
            READ1_ALIGNED_BAM,
            READ2_ALIGNED_BAM,
            EXPECTED_ORIENTATIONS,
            SORT_ORDER,
            PRIMARY_ALIGNMENT_STRATEGY.newInstance(),
            ADD_MATE_CIGAR,
            UNMAP_CONTAMINANT_READS,
            MIN_UNCLIPPED_BASES);
    merger.setClipOverlappingReads(CLIP_OVERLAPPING_READS);
    merger.setMaxRecordsInRam(MAX_RECORDS_IN_RAM);
    merger.setKeepAlignerProperPairFlags(ALIGNER_PROPER_PAIR_FLAGS);
    merger.setIncludeSecondaryAlignments(INCLUDE_SECONDARY_ALIGNMENTS);
    merger.mergeAlignment(REFERENCE_SEQUENCE);
    merger.close();

    return 0;
  }

  /**
   * Put any custom command-line validation in an override of this method. clp is initialized at
   * this point and can be used to print usage and access argv. Any options set by command-line
   * parser can be validated.
   *
   * <p>Checks performed here, in order: the program-group ID, version and command line must be
   * given together or not at all; READ1_ALIGNED_BAM and READ2_ALIGNED_BAM must be given together
   * or not at all; and at least one source of alignments (ALIGNED_BAM, or the READ1/READ2 pair)
   * must be present.
   *
   * @return null if command line is valid. If command line is invalid, returns an array of error
   *     messages to be written to the appropriate place.
   */
  protected String[] customCommandLineValidation() {

    if ((PROGRAM_RECORD_ID != null
            || PROGRAM_GROUP_VERSION != null
            || PROGRAM_GROUP_COMMAND_LINE != null)
        && (PROGRAM_RECORD_ID == null
            || PROGRAM_GROUP_VERSION == null
            || PROGRAM_GROUP_COMMAND_LINE == null)) {

      return new String[] {
        "PROGRAM_RECORD_ID, PROGRAM_GROUP_VERSION, and "
            + "PROGRAM_GROUP_COMMAND_LINE must all be supplied or none should "
            + "be included."
      };
    }

    final boolean r1sExist = READ1_ALIGNED_BAM != null && !READ1_ALIGNED_BAM.isEmpty();
    final boolean r2sExist = READ2_ALIGNED_BAM != null && !READ2_ALIGNED_BAM.isEmpty();
    if (r1sExist != r2sExist) {
      return new String[] {
        "READ1_ALIGNED_BAM and READ2_ALIGNED_BAM "
            + "must both be supplied or neither should be included.  For "
            + "single-end read use ALIGNED_BAM."
      };
    }

    // The null/empty test must be grouped explicitly: && binds tighter than ||, so without the
    // parentheses a null ALIGNED_BAM would be rejected even when both per-read lists are present.
    final boolean alignedExist = ALIGNED_BAM != null && !ALIGNED_BAM.isEmpty();
    if (!alignedExist && !(r1sExist && r2sExist)) {
      return new String[] {
        "Either ALIGNED_BAM or the combination of "
            + "READ1_ALIGNED_BAM and READ2_ALIGNED_BAM must be supplied."
      };
    }

    return null;
  }
}
Ejemplo n.º 13
0
@CommandLineProgramProperties(
    usage =
        "Generate fastq file(s) from data in an Illumina basecalls output directory.\n"
            + "Separate fastq file(s) are created for each template read, and for each barcode read, in the basecalls.\n"
            + "Template fastqs have extensions like .<number>.fastq, where <number> is the number of the template read,\n"
            + "starting with 1.  Barcode fastqs have extensions like .barcode_<number>.fastq, where <number> is the number\n"
            + "of the barcode read, starting with 1.",
    usageShort = "Generate fastq file(s) from data in an Illumina basecalls output directory",
    programGroup = Illumina.class)
public class IlluminaBasecallsToFastq extends CommandLineProgram {
  // The following attributes define the command-line arguments

  @Option(doc = "The basecalls directory. ", shortName = "B")
  public File BASECALLS_DIR;

  @Option(
      doc =
          "The barcodes directory with _barcode.txt files (generated by ExtractIlluminaBarcodes). If not set, use BASECALLS_DIR. ",
      shortName = "BCD",
      optional = true)
  public File BARCODES_DIR;

  @Option(doc = "Lane number. ", shortName = StandardOptionDefinitions.LANE_SHORT_NAME)
  public Integer LANE;

  // Mutually exclusive with MULTIPLEX_PARAMS. When this is set, initialize() registers a single
  // writer under the null key and turns demultiplexing off.
  @Option(
      doc =
          "The prefix for output fastqs.  Extensions as described above are appended.  Use this option for "
              + "a non-barcoded run, or for a barcoded run in which it is not desired to demultiplex reads into separate "
              + "files by barcode.",
      shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
      mutex = {"MULTIPLEX_PARAMS"})
  public File OUTPUT_PREFIX;

  @Option(doc = "The barcode of the run.  Prefixed to read names.", optional = false)
  public String RUN_BARCODE;

  // Declared optional, but customCommandLineValidation() rejects a null value when
  // READ_NAME_FORMAT is CASAVA_1_8.
  @Option(
      doc =
          "The name of the machine on which the run was sequenced; required if emitting Casava1.8-style read name headers",
      optional = true)
  public String MACHINE_NAME;

  // Declared optional, but customCommandLineValidation() rejects a null value when
  // READ_NAME_FORMAT is CASAVA_1_8.
  @Option(
      doc =
          "The barcode of the flowcell that was sequenced; required if emitting Casava1.8-style read name headers",
      optional = true)
  public String FLOWCELL_BARCODE;

  @Option(doc = ReadStructure.PARAMETER_DOC, shortName = "RS")
  public String READ_STRUCTURE;

  // Mutually exclusive with OUTPUT_PREFIX. Parsed by populateWritersFromMultiplexParams() when
  // OUTPUT_PREFIX is not set.
  @Option(
      doc =
          "Tab-separated file for creating all output fastqs demultiplexed by barcode for a lane with single "
              + "IlluminaBasecallsToFastq invocation.  The columns are OUTPUT_PREFIX, and BARCODE_1, BARCODE_2 ... BARCODE_X "
              + "where X = number of barcodes per cluster (optional).  Row with BARCODE_1 set to 'N' is used to specify "
              + "an output_prefix for no barcode match.",
      mutex = {"OUTPUT_PREFIX"})
  public File MULTIPLEX_PARAMS;

  @Option(doc = "Which adapters to look for in the read.")
  public List<IlluminaUtil.IlluminaAdapterPair> ADAPTERS_TO_CHECK =
      new ArrayList<IlluminaUtil.IlluminaAdapterPair>(
          Arrays.asList(
              IlluminaUtil.IlluminaAdapterPair.INDEXED,
              IlluminaUtil.IlluminaAdapterPair.DUAL_INDEXED,
              IlluminaUtil.IlluminaAdapterPair.NEXTERA_V2,
              IlluminaUtil.IlluminaAdapterPair.FLUIDIGM));

  @Option(
      doc =
          "The number of threads to run in parallel. If NUM_PROCESSORS = 0, number of cores is automatically set to "
              + "the number of cores available on the machine. If NUM_PROCESSORS < 0, then the number of cores used will"
              + " be the number available on the machine less NUM_PROCESSORS.")
  public Integer NUM_PROCESSORS = 0;

  @Option(
      doc =
          "If set, this is the first tile to be processed (used for debugging).  Note that tiles are not processed"
              + " in numerical order.",
      optional = true)
  public Integer FIRST_TILE;

  @Option(
      doc = "If set, process no more than this many tiles (used for debugging).",
      optional = true)
  public Integer TILE_LIMIT;

  @Option(
      doc =
          "Apply EAMSS filtering to identify inappropriately quality scored bases towards the ends of reads"
              + " and convert their quality scores to Q2.")
  public boolean APPLY_EAMSS_FILTER = true;

  @Option(
      doc =
          "If true, call System.gc() periodically.  This is useful in cases in which the -Xmx value passed "
              + "is larger than the available memory.")
  public Boolean FORCE_GC = true;

  // initialize() divides this value by the number of template plus sample-barcode reads before
  // handing it to the converter.
  @Option(
      doc =
          "Configure SortingCollections to store this many records before spilling to disk. For an indexed"
              + " run, each SortingCollection gets this value/number of indices.")
  public int MAX_READS_IN_RAM_PER_TILE = 1200000;

  @Option(
      doc =
          "The minimum quality (after transforming 0s to 1s) expected from reads.  If qualities are lower than this value, an error is thrown."
              + "The default of 2 is what the Illumina's spec describes as the minimum, but in practice the value has been observed lower.")
  public int MINIMUM_QUALITY = BclQualityEvaluationStrategy.ILLUMINA_ALLEGED_MINIMUM_QUALITY;

  @Option(doc = "Whether to include non-PF reads", shortName = "NONPF", optional = true)
  public boolean INCLUDE_NON_PF_READS = true;

  // NOTE(review): the short name below is spelled "INGORE_UNEXPECTED" (sic). It is part of the
  // command-line interface, so correcting the spelling would break existing invocations.
  @Option(
      doc =
          "Whether to ignore reads whose barcodes are not found in MULTIPLEX_PARAMS.  Useful when outputting "
              + "fastqs for only a subset of the barcodes in a lane.",
      shortName = "INGORE_UNEXPECTED")
  public boolean IGNORE_UNEXPECTED_BARCODES = false;

  @Option(
      doc =
          "The read name header formatting to emit.  Casava1.8 formatting has additional information beyond Illumina, including: "
              + "the passing-filter flag value for the read, the flowcell name, and the sequencer name.",
      optional = false)
  public ReadNameFormat READ_NAME_FORMAT = ReadNameFormat.CASAVA_1_8;

  @Option(
      shortName = "GZIP",
      doc = "Compress output FASTQ files using gzip and append a .gz extension to the file names.")
  public boolean COMPRESS_OUTPUTS = false;

  /**
   * Simple switch to control the read name format to emit. initialize() uses the selected value to
   * decide which read name encoder to construct.
   */
  public enum ReadNameFormat {
    // Encoded by Casava18ReadNameEncoder from MACHINE_NAME, RUN_BARCODE and FLOWCELL_BARCODE.
    CASAVA_1_8,
    // Encoded by IlluminaReadNameEncoder from RUN_BARCODE only.
    ILLUMINA
  }

  /** Output writers keyed by concatenated sample barcode; the null key is the non-barcoded sink. */
  private final Map<String, FastqRecordsWriter> sampleBarcodeFastqWriterMap =
      new HashMap<String, FastqRecordsWriter>();

  /** Parsed form of READ_STRUCTURE, assigned in initialize(). */
  private ReadStructure readStructure;

  /** Converter that performs the tile processing, built in initialize(). */
  IlluminaBasecallsConverter<FastqRecordsForCluster> basecallsConverter;

  private static final Log log = Log.getInstance(IlluminaBasecallsToFastq.class);
  private final FastqWriterFactory fastqWriterFactory = new FastqWriterFactory();

  /** Encoder chosen in initialize() according to READ_NAME_FORMAT. */
  private ReadNameEncoder readNameEncoder;

  /** Orders clusters by the read header of their first template record. */
  private static final Comparator<FastqRecordsForCluster> queryNameComparator =
      new Comparator<FastqRecordsForCluster>() {
        @Override
        public int compare(final FastqRecordsForCluster left, final FastqRecordsForCluster right) {
          final String leftHeader = left.templateRecords[0].getReadHeader();
          final String rightHeader = right.templateRecords[0].getReadHeader();
          return SAMRecordQueryNameComparator.compareReadNames(leftHeader, rightHeader);
        }
      };

  /**
   * Runs the conversion: calls initialize() to set up the encoder, writers and converter, then
   * runs tile processing on basecallsConverter.
   *
   * @return 0 once tile processing has returned
   */
  @Override
  protected int doWork() {
    initialize();

    basecallsConverter.doTileProcessing();

    return 0;
  }

  /**
   * Verifies that the extra options needed for Casava1.8-style read names were supplied.
   *
   * @return null when nothing is missing, otherwise one error message per missing option
   */
  @Override
  protected String[] customCommandLineValidation() {
    final LinkedList<String> problems = new LinkedList<String>();

    // Both options are only mandatory for the Casava 1.8 header format.
    if (READ_NAME_FORMAT == ReadNameFormat.CASAVA_1_8) {
      if (MACHINE_NAME == null) {
        problems.add("MACHINE_NAME is required when using Casava1.8-style read name headers.");
      }
      if (FLOWCELL_BARCODE == null) {
        problems.add("FLOWCELL_BARCODE is required when using Casava1.8-style read name headers.");
      }
    }

    return problems.isEmpty() ? null : problems.toArray(new String[problems.size()]);
  }

  /**
   * Prepares everything doWork() needs before tile processing: configures the fastq writer
   * factory, picks the read name encoder for READ_NAME_FORMAT, parses READ_STRUCTURE, registers
   * the output writer(s), and constructs the basecalls converter together with its
   * cluster-to-fastq converter.
   */
  private void initialize() {
    fastqWriterFactory.setCreateMd5(CREATE_MD5_FILE);

    // Choose the encoder matching the requested read name header format.
    switch (READ_NAME_FORMAT) {
      case ILLUMINA:
        readNameEncoder = new IlluminaReadNameEncoder(RUN_BARCODE);
        break;
      case CASAVA_1_8:
        readNameEncoder = new Casava18ReadNameEncoder(MACHINE_NAME, RUN_BARCODE, FLOWCELL_BARCODE);
        break;
    }

    final BclQualityEvaluationStrategy qualityStrategy =
        new BclQualityEvaluationStrategy(MINIMUM_QUALITY);
    readStructure = new ReadStructure(READ_STRUCTURE);

    if (MULTIPLEX_PARAMS != null) {
      IOUtil.assertFileIsReadable(MULTIPLEX_PARAMS);
    }

    // Without an output prefix the writers come from MULTIPLEX_PARAMS and reads are
    // demultiplexed; with one, a single writer is registered under the null key.
    final boolean demultiplex = OUTPUT_PREFIX == null;
    if (demultiplex) {
      populateWritersFromMultiplexParams();
    } else {
      sampleBarcodeFastqWriterMap.put(null, buildWriter(OUTPUT_PREFIX));
    }

    final int templateCount = readStructure.templates.length();
    final int sampleBarcodeCount = readStructure.sampleBarcodes.length();
    final int readsPerCluster = templateCount + sampleBarcodeCount;

    // The in-RAM budget is expressed per tile, so share it out across the reads of a cluster.
    final int maxClustersInRam = MAX_READS_IN_RAM_PER_TILE / readsPerCluster;
    final FastqRecordsForClusterCodec clusterCodec =
        new FastqRecordsForClusterCodec(
            templateCount, sampleBarcodeCount, readStructure.molecularBarcode.length());

    basecallsConverter =
        new IlluminaBasecallsConverter<FastqRecordsForCluster>(
            BASECALLS_DIR,
            BARCODES_DIR,
            LANE,
            readStructure,
            sampleBarcodeFastqWriterMap,
            demultiplex,
            maxClustersInRam,
            TMP_DIR,
            NUM_PROCESSORS,
            FORCE_GC,
            FIRST_TILE,
            TILE_LIMIT,
            queryNameComparator,
            clusterCodec,
            FastqRecordsForCluster.class,
            qualityStrategy,
            this.APPLY_EAMSS_FILTER,
            INCLUDE_NON_PF_READS,
            IGNORE_UNEXPECTED_BARCODES);

    log.info("READ STRUCTURE IS " + readStructure.toString());

    final ClusterToFastqRecordsForClusterConverter clusterConverter =
        new ClusterToFastqRecordsForClusterConverter(
            basecallsConverter.getFactory().getOutputReadStructure());
    basecallsConverter.setConverter(clusterConverter);
  }

  /**
   * Fails when any required column is absent from the MULTIPLEX_PARAMS header.
   *
   * @param actualCols The columns present in the MULTIPLEX_PARAMS file
   * @param expectedCols The columns that are REQUIRED
   * @throws PicardException naming the file and every required column that is missing
   */
  private void assertExpectedColumns(final Set<String> actualCols, final Set<String> expectedCols) {
    // Work on a copy so the caller's set of expected columns is left untouched.
    final Set<String> absent = new HashSet<String>(expectedCols);
    absent.removeAll(actualCols);

    if (absent.isEmpty()) {
      return;
    }

    final String message =
        String.format(
            "MULTIPLEX_PARAMS file %s is missing the following columns: %s.",
            MULTIPLEX_PARAMS.getAbsolutePath(), StringUtil.join(", ", absent));
    throw new PicardException(message);
  }

  /**
   * For each line in the MULTIPLEX_PARAMS file create a FastqRecordsWriter and put it in the
   * sampleBarcodeFastqWriterMap map, where the key to the map is the concatenation of all
   * sampleBarcodes in order for the given line. The key is null when the read structure has no
   * sample barcodes or when any barcode value on the line is "N".
   *
   * <p>The parser is closed on every exit path, including the failures below.
   *
   * @throws PicardException if a required column is missing, if two rows map to the same key, or
   *     if the file contains no data rows
   */
  private void populateWritersFromMultiplexParams() {
    final TabbedTextFileWithHeaderParser libraryParamsParser =
        new TabbedTextFileWithHeaderParser(MULTIPLEX_PARAMS);

    try {
      final Set<String> expectedColumnLabels = CollectionUtil.makeSet("OUTPUT_PREFIX");
      final List<String> sampleBarcodeColumnLabels = new ArrayList<String>();
      for (int i = 1; i <= readStructure.sampleBarcodes.length(); i++) {
        sampleBarcodeColumnLabels.add("BARCODE_" + i);
      }

      expectedColumnLabels.addAll(sampleBarcodeColumnLabels);
      assertExpectedColumns(libraryParamsParser.columnLabels(), expectedColumnLabels);

      for (final TabbedTextFileWithHeaderParser.Row row : libraryParamsParser) {
        List<String> sampleBarcodeValues = null;

        if (!sampleBarcodeColumnLabels.isEmpty()) {
          sampleBarcodeValues = new ArrayList<String>();
          for (final String sampleBarcodeLabel : sampleBarcodeColumnLabels) {
            sampleBarcodeValues.add(row.getField(sampleBarcodeLabel));
          }
        }

        final String key =
            (sampleBarcodeValues == null || sampleBarcodeValues.contains("N"))
                ? null
                : StringUtil.join("", sampleBarcodeValues);

        // This also catches the case of having more than 1 line in a non-barcoded
        // MULTIPLEX_PARAMS file, because every such line maps to the null key.
        if (sampleBarcodeFastqWriterMap.containsKey(key)) {
          throw new PicardException(
              "Row for barcode "
                  + key
                  + " appears more than once in MULTIPLEX_PARAMS file "
                  + MULTIPLEX_PARAMS);
        }

        final FastqRecordsWriter writer = buildWriter(new File(row.getField("OUTPUT_PREFIX")));
        sampleBarcodeFastqWriterMap.put(key, writer);
      }

      if (sampleBarcodeFastqWriterMap.isEmpty()) {
        throw new PicardException(
            "MULTIPLEX_PARAMS file " + MULTIPLEX_PARAMS + " does not have any data rows.");
      }
    } finally {
      // Release the underlying file even when validation above throws.
      libraryParamsParser.close();
    }
  }

  /**
   * Opens the FASTQ outputs for one output prefix: one file per template read, one per sample
   * barcode read and one per molecular barcode read, all placed in the prefix's parent directory.
   *
   * @param outputPrefix path whose parent is the output directory and whose name starts every
   *     output filename
   * @return FastqRecordsWriter that contains one or more FastqWriters (amount depends on read
   *     structure), all using outputPrefix to determine the filename(s).
   */
  private FastqRecordsWriter buildWriter(final File outputPrefix) {
    final File targetDirectory = outputPrefix.getAbsoluteFile().getParentFile();
    IOUtil.assertDirectoryIsWritable(targetDirectory);

    final String baseName = outputPrefix.getName();
    final String extension;
    if (COMPRESS_OUTPUTS) {
      extension = "fastq.gz";
    } else {
      extension = "fastq";
    }

    final FastqWriter[] forTemplates = new FastqWriter[readStructure.templates.length()];
    final FastqWriter[] forSampleBarcodes = new FastqWriter[readStructure.sampleBarcodes.length()];
    final FastqWriter[] forMolecularBarcodes =
        new FastqWriter[readStructure.molecularBarcode.length()];

    // Read numbers in filenames are 1-based; array slots are 0-based.
    for (int readNumber = 1; readNumber <= forTemplates.length; readNumber++) {
      final File target =
          new File(targetDirectory, String.format("%s.%d.%s", baseName, readNumber, extension));
      forTemplates[readNumber - 1] = fastqWriterFactory.newWriter(target);
    }

    for (int readNumber = 1; readNumber <= forSampleBarcodes.length; readNumber++) {
      final File target =
          new File(
              targetDirectory, String.format("%s.barcode_%d.%s", baseName, readNumber, extension));
      forSampleBarcodes[readNumber - 1] = fastqWriterFactory.newWriter(target);
    }

    for (int readNumber = 1; readNumber <= forMolecularBarcodes.length; readNumber++) {
      final File target =
          new File(
              targetDirectory, String.format("%s.index_%d.%s", baseName, readNumber, extension));
      forMolecularBarcodes[readNumber - 1] = fastqWriterFactory.newWriter(target);
    }

    return new FastqRecordsWriter(forTemplates, forSampleBarcodes, forMolecularBarcodes);
  }

  /** Command line entry point: hands the arguments to a new program instance. */
  public static void main(final String[] args) {
    final IlluminaBasecallsToFastq program = new IlluminaBasecallsToFastq();
    program.instanceMainWithExit(args);
  }

  /**
   * Bundles the FastqWriters for one output: a writer per template read, per sample barcode read
   * and per molecular barcode read. Records are routed to the writer at the same array position.
   */
  private static class FastqRecordsWriter
      implements IlluminaBasecallsConverter.ConvertedClusterDataWriter<FastqRecordsForCluster> {
    final FastqWriter[] templateWriters;
    final FastqWriter[] sampleBarcodeWriters;
    final FastqWriter[] molecularBarcodeWriters;

    /**
     * @param templateWriters Writers for template reads in order, e.g. element 0 is for template
     *     read 1.
     * @param sampleBarcodeWriters Writers for sample barcode reads in order, e.g. element 0 is for
     *     sample barcode read 1.
     * @param molecularBarcodeWriters Writers for molecular barcode reads in order, e.g. element 0
     *     is for molecular barcode read 1.
     */
    private FastqRecordsWriter(
        final FastqWriter[] templateWriters,
        final FastqWriter[] sampleBarcodeWriters,
        final FastqWriter[] molecularBarcodeWriters) {
      this.templateWriters = templateWriters;
      this.sampleBarcodeWriters = sampleBarcodeWriters;
      this.molecularBarcodeWriters = molecularBarcodeWriters;
    }

    /** Writes each record of the cluster to the writer at the matching position. */
    @Override
    public void write(final FastqRecordsForCluster records) {
      write(templateWriters, records.templateRecords);
      write(sampleBarcodeWriters, records.sampleBarcodeRecords);
      write(molecularBarcodeWriters, records.molecularBarcodeRecords);
    }

    /** Pairs writers with records by index; the number of writers drives the iteration. */
    private void write(final FastqWriter[] writers, final FastqRecord[] records) {
      int slot = 0;
      while (slot < writers.length) {
        writers[slot].write(records[slot]);
        slot++;
      }
    }

    /** Closes template writers first, then sample barcode writers, then molecular ones. */
    @Override
    public void close() {
      closeAll(templateWriters);
      closeAll(sampleBarcodeWriters);
      closeAll(molecularBarcodeWriters);
    }

    /** Closes every writer of the given group in array order. */
    private void closeAll(final FastqWriter[] group) {
      for (final FastqWriter each : group) {
        each.close();
      }
    }
  }

  /**
   * Holder for the FASTQ record(s) produced from a single cluster, grouped by read kind. The
   * arrays are allocated here with the requested sizes and start out with null elements.
   */
  static class FastqRecordsForCluster {
    // Fields are read and written directly by the converter and writer; no accessors on purpose.
    final FastqRecord[] templateRecords;
    final FastqRecord[] sampleBarcodeRecords;
    final FastqRecord[] molecularBarcodeRecords;

    /**
     * @param numTemplates number of template record slots
     * @param numSampleBarcodes number of sample barcode record slots
     * @param numMolecularBarcodes number of molecular barcode record slots
     */
    FastqRecordsForCluster(
        final int numTemplates, final int numSampleBarcodes, final int numMolecularBarcodes) {
      this.templateRecords = new FastqRecord[numTemplates];
      this.sampleBarcodeRecords = new FastqRecord[numSampleBarcodes];
      this.molecularBarcodeRecords = new FastqRecord[numMolecularBarcodes];
    }
  }

  /**
   * Converter handed to the IlluminaBasecallsConverter: turns one ClusterData into the FASTQ
   * records to be written for it. Relies on the enclosing instance's readStructure and
   * readNameEncoder.
   */
  class ClusterToFastqRecordsForClusterConverter
      implements IlluminaBasecallsConverter.ClusterDataConverter<FastqRecordsForCluster> {

    private final int[] templateIndices;
    private final int[] sampleBarcodeIndicies;
    private final int[] molecularBarcodeIndicies;

    /**
     * @param outputReadStructure read structure whose template, sample barcode and molecular
     *     barcode indices select the reads pulled from each cluster
     */
    ClusterToFastqRecordsForClusterConverter(final ReadStructure outputReadStructure) {
      templateIndices = outputReadStructure.templates.getIndices();
      sampleBarcodeIndicies = outputReadStructure.sampleBarcodes.getIndices();
      molecularBarcodeIndicies = outputReadStructure.molecularBarcode.getIndices();
    }

    /** Builds the template, sample barcode and molecular barcode records for one cluster. */
    @Override
    public FastqRecordsForCluster convertClusterToOutputRecord(final ClusterData cluster) {
      final FastqRecordsForCluster converted =
          new FastqRecordsForCluster(
              readStructure.templates.length(),
              readStructure.sampleBarcodes.length(),
              readStructure.molecularBarcode.length());

      // A read-number suffix is only requested when a group holds more than one read;
      // sample barcode reads never get one.
      final boolean numberTemplates = converted.templateRecords.length > 1;
      final boolean numberMolecularBarcodes = converted.molecularBarcodeRecords.length > 1;

      makeFastqRecords(converted.templateRecords, templateIndices, cluster, numberTemplates);
      makeFastqRecords(converted.sampleBarcodeRecords, sampleBarcodeIndicies, cluster, false);
      makeFastqRecords(
          converted.molecularBarcodeRecords,
          molecularBarcodeIndicies,
          cluster,
          numberMolecularBarcodes);

      return converted;
    }

    /**
     * Fills recs, position by position, with one FastqRecord per entry of indices.
     *
     * @param recs destination array
     * @param indices positions of the reads to fetch from the cluster
     * @param cluster source cluster
     * @param appendReadNumberSuffix when true the 1-based position is passed to the read name
     *     encoder, otherwise null is passed
     */
    private void makeFastqRecords(
        final FastqRecord[] recs,
        final int[] indices,
        final ClusterData cluster,
        final boolean appendReadNumberSuffix) {
      int slot = 0;
      for (final int readIndex : indices) {
        final ReadData readData = cluster.getRead(readIndex);
        // '.' base calls are written out as 'N'.
        final String readBases = StringUtil.bytesToString(readData.getBases()).replace('.', 'N');
        final Integer readNumber = appendReadNumberSuffix ? Integer.valueOf(slot + 1) : null;
        final String readName = readNameEncoder.generateReadName(cluster, readNumber);
        final String qualities = SAMUtils.phredToFastq(readData.getQualities());
        recs[slot] = new FastqRecord(readName, readBases, null, qualities);
        slot++;
      }
    }
  }

  /**
   * Codec passed to IlluminaBasecallsConverter for use in SortingCollections of output records.
   * Each cluster is serialized as consecutive FASTQ records: templates first, then sample
   * barcodes, then molecular barcodes. {@link #decode()} reads them back in that same order and
   * with the same counts, so {@link #encode} must write all three groups.
   */
  static class FastqRecordsForClusterCodec
      implements SortingCollection.Codec<FastqRecordsForCluster> {
    private final int numTemplates;
    private final int numSampleBarcodes;
    private final int numMolecularBarcodes;

    private BasicFastqWriter writer = null;
    private FastqReader reader = null;

    /**
     * @param numTemplates number of template records per cluster
     * @param numSampleBarcodes number of sample barcode records per cluster
     * @param numMolecularBarcodes number of molecular barcode records per cluster
     */
    FastqRecordsForClusterCodec(
        final int numTemplates, final int numSampleBarcodes, final int numMolecularBarcodes) {
      this.numTemplates = numTemplates;
      this.numSampleBarcodes = numSampleBarcodes;
      this.numMolecularBarcodes = numMolecularBarcodes;
    }

    @Override
    public void setOutputStream(final OutputStream os) {
      writer = new BasicFastqWriter(new PrintStream(os));
    }

    @Override
    public void setInputStream(final InputStream is) {
      reader = new FastqReader(new BufferedReader(new InputStreamReader(is)));
    }

    // TODO: add tests to encode and decode
    /**
     * Writes every record of the cluster to the current output stream and flushes.
     *
     * @throws IllegalStateException if any record group does not have the size this codec was
     *     configured with
     */
    @Override
    public void encode(final FastqRecordsForCluster val) {
      assertCount("template", numTemplates, val.templateRecords.length);
      assertCount("sample barcode", numSampleBarcodes, val.sampleBarcodeRecords.length);
      assertCount("molecular barcode", numMolecularBarcodes, val.molecularBarcodeRecords.length);
      encodeArray(val.templateRecords);
      encodeArray(val.sampleBarcodeRecords);
      // Molecular barcodes must be written too: decode() unconditionally reads
      // numMolecularBarcodes records, so omitting them would shift every later cluster.
      encodeArray(val.molecularBarcodeRecords);
      writer.flush();
    }

    /** Fails fast when a record group has a different size than the codec expects. */
    private static void assertCount(final String what, final int expected, final int actual) {
      if (expected != actual) {
        throw new IllegalStateException(
            "Expected " + expected + " " + what + " record(s) but got " + actual);
      }
    }

    private void encodeArray(final FastqRecord[] recs) {
      for (final FastqRecord rec : recs) {
        writer.write(rec);
      }
    }

    /**
     * Reads the records of the next cluster from the current input stream.
     *
     * @return the next cluster's records, or null when the reader has no more records
     */
    @Override
    public FastqRecordsForCluster decode() {
      if (!reader.hasNext()) return null;
      final FastqRecordsForCluster ret =
          new FastqRecordsForCluster(numTemplates, numSampleBarcodes, numMolecularBarcodes);
      decodeArray(ret.templateRecords);
      decodeArray(ret.sampleBarcodeRecords);
      decodeArray(ret.molecularBarcodeRecords);
      return ret;
    }

    private void decodeArray(final FastqRecord[] recs) {
      for (int i = 0; i < recs.length; ++i) {
        recs[i] = reader.next();
      }
    }

    /** Returns a fresh codec with the same record counts and no streams attached. */
    @Override
    public SortingCollection.Codec<FastqRecordsForCluster> clone() {
      return new FastqRecordsForClusterCodec(numTemplates, numSampleBarcodes, numMolecularBarcodes);
    }
  }
}
Ejemplo n.º 14
0
public class Biostar84786 extends AbstractCommandLineProgram {
  @Usage(programVersion = "1.0")
  public String USAGE =
      getStandardUsagePreamble()
          + " Matrix transposition ( see  http://www.biostars.org/p/84786/ ).";

  @Option(
      shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
      doc = "input stream default: stdin. ",
      optional = true)
  public File IN = null;

  @Option(shortName = "D", doc = "delimiter. Default is tabulation. ", optional = true)
  public String DELIM = "\t";

  private Log LOG = Log.getInstance(Biostar84786.class);

  /** One matrix entry: the row and column it was read at, plus its text. */
  private static class Cell {
    long row;
    long col;
    String content;

    /** Creates a blank cell; the fields are assigned afterwards (used by the codec). */
    Cell() {}

    /** Creates a cell at the given position holding a snapshot of the builder's current text. */
    Cell(long row, long col, StringBuilder b) {
      this.content = b.toString();
      this.col = col;
      this.row = row;
    }

    /** Renders the cell as {@code (row,col:content)}. */
    @Override
    public String toString() {
      final StringBuilder text = new StringBuilder("(");
      text.append(row).append(",").append(col).append(":").append(content).append(")");
      return text.toString();
    }
  }

  /** Binary codec for {@link Cell}: row (long), column (long), then the content as UTF. */
  private static class CellCodec extends AbstractDataCodec<Cell> {
    /** Reads one cell; any failure on the first field is rethrown wrapped in an IOException. */
    @Override
    public Cell decode(DataInputStream dis) throws IOException {
      final long firstField;
      try {
        firstField = dis.readLong();
      } catch (Exception e) { // EOF reached
        throw new IOException(e);
      }
      final Cell decoded = new Cell();
      decoded.row = firstField;
      decoded.col = dis.readLong();
      decoded.content = dis.readUTF();
      return decoded;
    }

    /** Writes the cell's fields in the order {@link #decode} reads them. */
    @Override
    public void encode(DataOutputStream dos, Cell c) throws IOException {
      dos.writeLong(c.row);
      dos.writeLong(c.col);
      dos.writeUTF(c.content);
    }

    /** Returns a new, independent codec instance. */
    @Override
    public AbstractDataCodec<Cell> clone() {
      return new CellCodec();
    }
  }

  /**
   * Transposes a delimited text matrix read from IN (or stdin when IN is null) and prints the
   * result to stdout. Every cell is tagged with its row and column, sorted by (column, row), and
   * printed so that each input column becomes one output line.
   *
   * @return 0 on success, -1 when DELIM is not a single character or any exception occurs
   */
  @Override
  protected int doWork() {
    if (DELIM.length() != 1) {
      LOG.error("DELIM must have length==1 . Got " + DELIM.length());
      return -1;
    }
    InputStream in = System.in;
    SortingCollection<Cell> sorter = null;
    CloseableIterator<Cell> iter = null;
    // Order by column first, then row, so iteration yields the transposed matrix line by line.
    final Comparator<Cell> comparator =
        new Comparator<Biostar84786.Cell>() {
          @Override
          public int compare(final Cell o1, final Cell o2) {
            int i;
            i = (o1.col < o2.col ? -1 : o1.col > o2.col ? 1 : 0);
            if (i != 0) return i;
            i = (o1.row < o2.row ? -1 : o1.row > o2.row ? 1 : 0);
            if (i != 0) return i;
            return o1.content.compareTo(o2.content);
          }
        };
    try {
      final char delimiter = DELIM.charAt(0);
      sorter =
          SortingCollection.newInstance(
              Cell.class, new CellCodec(), comparator, super.MAX_RECORDS_IN_RAM);
      sorter.setDestructiveIteration(true);
      if (IN != null) {
        LOG.info("opening " + IN);
        in = IOUtils.openFileForReading(IN);
      }
      long row = 0L;
      long col = 0L;
      StringBuilder b = new StringBuilder();
      for (; ; ) {
        int c = in.read();
        if (c == -1) {
          // Only flush a pending cell when the last line was not newline-terminated.
          // Otherwise a trailing '\n' would add a phantom empty cell at column 0 and the
          // first transposed line would end with a stray delimiter.
          if (col > 0L || b.length() > 0) {
            sorter.add(new Cell(row, col, b));
          }
          break;
        }
        if (c == '\n') {
          sorter.add(new Cell(row, col, b));
          row++;
          col = 0;
          b.setLength(0);
          if (row % 10000 == 0) LOG.info("row:" + row);
        } else if (c == delimiter) {
          sorter.add(new Cell(row, col, b));
          b.setLength(0);
          col++;
        } else {
          b.append((char) c);
        }
      }
      sorter.doneAdding();
      if (IN != null) in.close();
      in = null;
      iter = sorter.iterator();
      long curr_col = -1L;
      long x = 0L; // number of cells already printed on the current output line
      while (iter.hasNext()) {
        Cell c = iter.next();
        if (c.col != curr_col) {
          if (curr_col != -1L) System.out.println();
          x = 0L;
          curr_col = c.col;
        }
        if (x > 0L) System.out.print(DELIM);
        System.out.print(c.content);
        x++;
      }
      // Terminate the last output line.
      System.out.println();
      LOG.info("Done.");
    } catch (Exception e) {
      // The logger receives the throwable itself, so no separate printStackTrace() is needed.
      LOG.error(e, "BOUM");
      return -1;
    } finally {
      if (iter != null) iter.close();
      if (sorter != null) sorter.cleanup();
      // Never close stdin; only a stream this method opened itself.
      if (in != null && in != System.in) CloserUtil.close(in);
    }
    return 0;
  }

  /**
   * Command line entry point.
   *
   * @param args command line arguments handed to the program instance
   */
  public static void main(String[] args) {
    final Biostar84786 program = new Biostar84786();
    program.instanceMainWithExit(args);
  }
}
/**
 * Program to check that all read groups within the set of BAM files appear to come from the same
 * individual.
 *
 * @author Tim Fennell
 */
@CommandLineProgramProperties(
    usage =
        "Checks if all read groups within a set of BAM files appear to come from the same individual",
    usageShort = "Checks if all read groups appear to come from the same individual",
    programGroup =
        Alpha.class // TODO -- when mature please move to a to-be-created Fingerprinting.class
    )
public class CrosscheckReadGroupFingerprints extends CommandLineProgram {

  @Option(
      shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
      doc = "One or more input BAM files (or lists of BAM files) to compare fingerprints for.")
  public List<File> INPUT;

  @Option(
      shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
      optional = true,
      doc = "Optional output file to write metrics to. Default is to write to stdout.")
  public File OUTPUT;

  @Option(shortName = "H", doc = "The file of haplotype data to use to pick SNPs to fingerprint")
  public File HAPLOTYPE_MAP;

  @Option(
      shortName = "LOD",
      doc =
          "If any two read groups match with a LOD score lower than the threshold the program will exit "
              + "with a non-zero code to indicate error. 0 means equal probability the read groups match vs. "
              + "come from different individuals, negative numbers mean N logs more likely that the read groups "
              + "are from different individuals and positive numbers mean N logs more likely that the read groups "
              + "are from the sample individual.")
  public double LOD_THRESHOLD = 0;

  @Option(
      doc =
          "Instead of producing the normal comparison of read-groups, roll fingerprints up to the sample level "
              + "and print out a sample x sample matrix with LOD scores.")
  public boolean CROSSCHECK_SAMPLES = false;

  @Option(
      doc =
          "Instead of producing the normal comparison of read-groups, roll fingerprints up to the library level "
              + "and print out a library x library matrix with LOD scores.")
  public boolean CROSSCHECK_LIBRARIES = false;

  @Option(doc = "The number of threads to use to process BAM files and generate Fingerprints.")
  public int NUM_THREADS = 1;

  @Option(
      doc =
          "Allow the use of duplicate reads in performing the comparison. Can be useful when duplicate "
              + "marking has been overly aggressive and coverage is low.")
  public boolean ALLOW_DUPLICATE_READS = false;

  @Option(
      doc =
          "Assumed genotyping error rate that provides a floor on the probability that a genotype comes from"
              + " the expected sample.")
  public double GENOTYPING_ERROR_RATE = 0.01;

  @Option(
      doc =
          "If true then only read groups that do not relate to each other as expected will have their LODs reported.")
  public boolean OUTPUT_ERRORS_ONLY = false;

  @Option(
      doc = "The rate at which a het in a normal sample turns into a hom in the tumor.",
      optional = true)
  public double LOSS_OF_HET_RATE = 0.5;

  @Option(
      doc =
          "Expect all read groups' fingerprints to match, irrespective of their sample names.  By default (with this value set to "
              + "false), read groups with different sample names are expected to mismatch, and those with the same sample name are expected "
              + "to match.")
  public boolean EXPECT_ALL_READ_GROUPS_TO_MATCH = false;

  @Option(
      doc =
          "When one or more mismatches between read groups are detected, exit with this value instead of 0.")
  public int EXIT_CODE_WHEN_MISMATCH = 1;

  private final Log log = Log.getInstance(CrosscheckReadGroupFingerprints.class);

  private final FormatUtil formatUtil = new FormatUtil();

  // These are public so that other programs can parse status from the crosscheck file
  public static final String EXPECTED_MATCH = "EXPECTED MATCH";
  public static final String EXPECTED_MISMATCH = "EXPECTED MISMATCH";
  public static final String UNEXPECTED_MATCH = "UNEXPECTED MATCH";
  public static final String UNEXPECTED_MISMATCH = "UNEXPECTED MISMATCH";

  /** Stock main method: hands the command line arguments to a new program instance. */
  public static void main(final String[] args) {
    final CrosscheckReadGroupFingerprints program = new CrosscheckReadGroupFingerprints();
    program.instanceMainWithExit(args);
  }

  /**
   * Validates the inputs, fingerprints every read group in the input files and runs the
   * comparison selected by CROSSCHECK_SAMPLES / CROSSCHECK_LIBRARIES (read-group level when
   * neither is set). Results go to OUTPUT when given, otherwise to stdout.
   *
   * @return 0 for the sample and library matrices; for the read-group comparison, the value
   *     returned by crossCheckReadGroups
   */
  @Override
  protected int doWork() {
    // Check inputs
    for (final File f : INPUT) IOUtil.assertFileIsReadable(f);
    IOUtil.assertFileIsReadable(HAPLOTYPE_MAP);
    if (OUTPUT != null) IOUtil.assertFileIsWritable(OUTPUT);

    final HaplotypeMap map = new HaplotypeMap(HAPLOTYPE_MAP);
    final FingerprintChecker checker = new FingerprintChecker(map);

    checker.setAllowDuplicateReads(ALLOW_DUPLICATE_READS);

    log.info("Done checking input files, moving onto fingerprinting files.");

    final List<File> unrolledFiles =
        IOUtil.unrollFiles(INPUT, BamFileIoUtils.BAM_FILE_EXTENSION, IOUtil.SAM_FILE_EXTENSION);
    final Map<SAMReadGroupRecord, Fingerprint> fpMap =
        checker.fingerprintSamFiles(unrolledFiles, NUM_THREADS, 1, TimeUnit.DAYS);
    final List<Fingerprint> fingerprints = new ArrayList<>(fpMap.values());

    log.info("Finished generating fingerprints from BAM files, moving on to cross-checking.");

    // Setup the output
    final boolean writingToFile = OUTPUT != null;
    final PrintStream out =
        writingToFile ? new PrintStream(IOUtil.openFileForWriting(OUTPUT), true) : System.out;

    try {
      if (this.CROSSCHECK_SAMPLES) {
        crossCheckSamples(fingerprints, out);
        return 0;
      } else if (this.CROSSCHECK_LIBRARIES) {
        crossCheckLibraries(fpMap, out);
        return 0;
      } else {
        return crossCheckReadGroups(fpMap, out);
      }
    } finally {
      // Release the file handle we opened; stdout is left open for the rest of the JVM.
      if (writingToFile) {
        out.close();
      }
    }
  }

  /**
   * Method that combines the fingerprint evidence across all the read groups for the same sample
   * and then produces a matrix of LOD scores for comparing every sample with every other sample.
   * Rows and columns follow the key order of the merged SortedMap.
   *
   * @param fingerprints fingerprints to merge by sample
   * @param out stream the tab-delimited matrix is printed to
   */
  private void crossCheckSamples(final List<Fingerprint> fingerprints, final PrintStream out) {
    final SortedMap<String, Fingerprint> sampleFps =
        FingerprintChecker.mergeFingerprintsBySample(fingerprints);

    // Iterating the SortedMap directly already yields ascending key order, so there is no need
    // to downcast keySet() to SortedSet (a cast the SortedMap interface does not guarantee).

    // Print header row
    out.print("\t");
    for (final Map.Entry<String, Fingerprint> column : sampleFps.entrySet()) {
      out.print(column.getKey());
      out.print("\t");
    }
    out.println();

    // Print results rows
    for (final Map.Entry<String, Fingerprint> row : sampleFps.entrySet()) {
      out.print(row.getKey());
      final Fingerprint fp = row.getValue();

      for (final Map.Entry<String, Fingerprint> column : sampleFps.entrySet()) {
        final MatchResults results =
            FingerprintChecker.calculateMatchResults(
                fp, column.getValue(), GENOTYPING_ERROR_RATE, LOSS_OF_HET_RATE);
        out.print("\t");
        out.print(formatUtil.format(results.getLOD()));
      }

      out.println();
    }
  }

  /**
   * Produces a library x library LOD matrix. Each read group's fingerprint is copied under the
   * name "sample::library", and the copies are then handed to crossCheckSamples, which merges
   * fingerprints sharing a name.
   *
   * @param fingerprints fingerprint of each read group
   * @param out stream the matrix is printed to
   */
  private void crossCheckLibraries(
      final Map<SAMReadGroupRecord, Fingerprint> fingerprints, final PrintStream out) {
    final List<Fingerprint> renamed = new ArrayList<>();

    for (final Map.Entry<SAMReadGroupRecord, Fingerprint> entry : fingerprints.entrySet()) {
      final SAMReadGroupRecord readGroup = entry.getKey();
      final Fingerprint original = entry.getValue();

      final String libraryName = readGroup.getSample() + "::" + readGroup.getLibrary();
      final Fingerprint copy =
          new Fingerprint(libraryName, original.getSource(), original.getInfo());
      copy.putAll(original);
      renamed.add(copy);
    }

    crossCheckSamples(renamed, out);
  }

  /**
   * Compares every unordered pair of read groups and reports a LOD score for the two read groups
   * coming from the same sample. A pair is expected to match when EXPECT_ALL_READ_GROUPS_TO_MATCH
   * is set or both read groups carry the same sample name.
   *
   * @param fingerprints fingerprint of each read group
   * @param out stream the tab-delimited report is printed to
   * @return EXIT_CODE_WHEN_MISMATCH when at least one pair did not relate as expected, else 0
   */
  private int crossCheckReadGroups(
      final Map<SAMReadGroupRecord, Fingerprint> fingerprints, final PrintStream out) {
    int mismatches = 0;
    int unexpectedMatches = 0;

    final List<SAMReadGroupRecord> readGroupRecords = new ArrayList<>(fingerprints.keySet());
    final List<String> reportLines = new ArrayList<>();
    final int total = readGroupRecords.size();

    for (int left = 0; left < total; left++) {
      final SAMReadGroupRecord lhsRg = readGroupRecords.get(left);
      for (int right = left + 1; right < total; right++) {
        final SAMReadGroupRecord rhsRg = readGroupRecords.get(right);
        final boolean expectedToMatch =
            EXPECT_ALL_READ_GROUPS_TO_MATCH || lhsRg.getSample().equals(rhsRg.getSample());

        final MatchResults results =
            FingerprintChecker.calculateMatchResults(
                fingerprints.get(lhsRg),
                fingerprints.get(rhsRg),
                GENOTYPING_ERROR_RATE,
                LOSS_OF_HET_RATE);
        final double lod = results.getLOD();

        // Expected matches fail below the threshold; expected mismatches fail above its negation.
        final boolean unexpected;
        final String status;
        if (expectedToMatch) {
          unexpected = lod < LOD_THRESHOLD;
          status = unexpected ? UNEXPECTED_MISMATCH : EXPECTED_MATCH;
          if (unexpected) mismatches++;
        } else {
          unexpected = lod > -LOD_THRESHOLD;
          status = unexpected ? UNEXPECTED_MATCH : EXPECTED_MISMATCH;
          if (unexpected) unexpectedMatches++;
        }

        // Unexpected results are always reported; expected ones only when not suppressed.
        if (unexpected || !OUTPUT_ERRORS_ONLY) {
          reportLines.add(getMatchDetails(status, results, lhsRg, rhsRg));
        }
      }
    }

    if (!reportLines.isEmpty()) {
      out.println(
          "RESULT\tLOD_SCORE\tLOD_SCORE_TUMOR_NORMAL\tLOD_SCORE_NORMAL_TUMOR\tLEFT_RUN_BARCODE\tLEFT_LANE\tLEFT_MOLECULAR_BARCODE_SEQUENCE\tLEFT_LIBRARY\tLEFT_SAMPLE\t"
              + "RIGHT_RUN_BARCODE\tRIGHT_LANE\tRIGHT_MOLECULAR_BARCODE_SEQUENCE\tRIGHT_LIBRARY\tRIGHT_SAMPLE");
      out.println(String.join("\n", reportLines));
    }

    if (mismatches + unexpectedMatches > 0) {
      log.info("WARNING: At least two read groups did not relate as expected.");
      return EXIT_CODE_WHEN_MISMATCH;
    }

    log.info("All read groups related as expected.");
    return 0;
  }

  /**
   * Builds one tab-delimited report line for a pair of read groups: the match status, the three
   * formatted LOD scores, then the details of the left and of the right read group.
   *
   * @param matchResult String describing the match type.
   * @param results MatchResults object
   * @param left left hand side SAMReadGroupRecord
   * @param right right hand side SAMReadGroupRecord
   * @return tab delimited string containing details about a possible match
   */
  private String getMatchDetails(
      final String matchResult,
      final MatchResults results,
      final SAMReadGroupRecord left,
      final SAMReadGroupRecord right) {
    return String.join(
        "\t",
        matchResult,
        formatUtil.format(results.getLOD()),
        formatUtil.format(results.getLodTN()),
        formatUtil.format(results.getLodNT()),
        getReadGroupDetails(left),
        getReadGroupDetails(right));
  }

  /**
   * Generates tab delimited string containing details about the passed SAMReadGroupRecord: run
   * barcode, lane and molecular barcode (all parsed from the PU attribute), then library and
   * sample. When the PU attribute is absent or not in the expected form, the three PU-derived
   * fields are reported as "?" and an error is logged.
   *
   * @param readGroupRecord record
   * @return tab delimited string containing details about the SAMReadGroupRecord
   */
  private String getReadGroupDetails(final SAMReadGroupRecord readGroupRecord) {
    final List<String> elements = new ArrayList<>(5);

    // A read group may carry no PU attribute at all; treat that like an unparseable value
    // instead of failing with a NullPointerException.
    final String platformUnit = readGroupRecord.getPlatformUnit();
    final String[] tmp =
        (platformUnit == null)
            ? new String[0]
            : platformUnit.split("\\."); // Expect to look like: D047KACXX110901.1.ACCAACTG
    String runBarcode = "?";
    String lane = "?";
    String molBarcode = "?";
    if ((tmp.length == 3) || (tmp.length == 2)) {
      runBarcode = tmp[0];
      lane = tmp[1];
      molBarcode =
          (tmp.length == 3)
              ? tmp[2]
              : ""; // In older BAMS there may be no molecular barcode sequence
    } else {
      log.error("Unexpected format " + platformUnit + " for PU attribute");
    }
    elements.add(runBarcode);
    elements.add(lane);
    elements.add(molBarcode);
    elements.add(readGroupRecord.getLibrary());
    elements.add(readGroupRecord.getSample());
    return String.join("\t", elements);
  }
}
  /**
   * Utility for collating Tile records from the Illumina TileMetrics file into lane-level and
   * phasing-level metrics.
   */
  public static class IlluminaLaneMetricsCollector {

    private static final Log LOG = Log.getInstance(IlluminaLaneMetricsCollector.class);

    /**
     * Parses the tile metrics found under the given run directory and groups the resulting Tile
     * objects by lane number.
     *
     * @param illuminaRunDirectory directory the tile metrics file is located from
     * @param readStructure read structure passed through to the tile metrics parser
     * @return map from lane number to the tiles of that lane
     * @throws PicardException if the tile metrics file cannot be found
     */
    public static Map<Integer, ? extends Collection<Tile>> readLaneTiles(
        final File illuminaRunDirectory, final ReadStructure readStructure) {
      try {
        final Collection<Tile> parsedTiles =
            TileMetricsUtil.parseTileMetrics(
                TileMetricsUtil.renderTileMetricsFileFromBasecallingDirectory(illuminaRunDirectory),
                readStructure);
        return parsedTiles.stream().collect(Collectors.groupingBy(Tile::getLaneNumber));
      } catch (final FileNotFoundException e) {
        throw new PicardException("Unable to open laneMetrics file.", e);
      }
    }

    /**
     * Reads the tiles of the run once, then writes the lane metrics file followed by the phasing
     * metrics file, both into outputDirectory using outputPrefix.
     *
     * @param runDirectory run directory the tile data is read from
     * @param outputDirectory directory both metrics files are written to
     * @param outputPrefix filename prefix of both metrics files
     * @param laneMetricsFile metrics file that receives the lane metrics
     * @param phasingMetricsFile metrics file that receives the phasing metrics
     * @param readStructure read structure used when parsing the tile data
     */
    public static void collectLaneMetrics(
        final File runDirectory,
        final File outputDirectory,
        final String outputPrefix,
        final MetricsFile<MetricBase, Comparable<?>> laneMetricsFile,
        final MetricsFile<MetricBase, Comparable<?>> phasingMetricsFile,
        final ReadStructure readStructure) {
      final Map<Integer, ? extends Collection<Tile>> tilesByLane =
          readLaneTiles(runDirectory, readStructure);

      // Lane metrics first, phasing metrics second.
      writeLaneMetrics(tilesByLane, outputDirectory, outputPrefix, laneMetricsFile);
      writePhasingMetrics(tilesByLane, outputDirectory, outputPrefix, phasingMetricsFile);
    }

    /**
     * Adds the phasing metrics of every lane to phasingMetricsFile and writes that file out.
     *
     * @param laneTiles tiles grouped by lane number
     * @param outputDirectory directory the metrics file is written to
     * @param outputPrefix filename prefix of the metrics file
     * @param phasingMetricsFile metrics file that receives the phasing metrics
     * @return the file that was written
     */
    public static File writePhasingMetrics(
        final Map<Integer, ? extends Collection<Tile>> laneTiles,
        final File outputDirectory,
        final String outputPrefix,
        final MetricsFile<MetricBase, Comparable<?>> phasingMetricsFile) {
      for (final Map.Entry<Integer, ? extends Collection<Tile>> lane : laneTiles.entrySet()) {
        final long laneNumber = lane.getKey().longValue();
        IlluminaPhasingMetrics.getPhasingMetricsForTiles(laneNumber, lane.getValue())
            .forEach(phasingMetricsFile::addMetric);
      }

      final String extension = IlluminaPhasingMetrics.getExtension();
      return writeMetrics(phasingMetricsFile, outputDirectory, outputPrefix, extension);
    }

    /**
     * Adds one IlluminaLaneMetrics entry per lane (lane number and cluster density computed from
     * the lane's tiles) to laneMetricsFile and writes that file out.
     *
     * @param laneTiles tiles grouped by lane number
     * @param outputDirectory directory the metrics file is written to
     * @param outputPrefix filename prefix of the metrics file
     * @param laneMetricsFile metrics file that receives the lane metrics
     * @return the file that was written
     */
    public static File writeLaneMetrics(
        final Map<Integer, ? extends Collection<Tile>> laneTiles,
        final File outputDirectory,
        final String outputPrefix,
        final MetricsFile<MetricBase, Comparable<?>> laneMetricsFile) {
      for (final Map.Entry<Integer, ? extends Collection<Tile>> lane : laneTiles.entrySet()) {
        final IlluminaLaneMetrics metric = new IlluminaLaneMetrics();
        metric.LANE = lane.getKey().longValue();
        metric.CLUSTER_DENSITY = calculateLaneDensityFromTiles(lane.getValue());
        laneMetricsFile.addMetric(metric);
      }

      final String extension = IlluminaLaneMetrics.getExtension();
      return writeMetrics(laneMetricsFile, outputDirectory, outputPrefix, extension);
    }

    /**
     * Writes metricsFile to "outputPrefix.outputExtension" inside outputDirectory, logging the
     * number of metrics and the destination beforehand.
     *
     * @return the file that was written
     */
    private static File writeMetrics(
        final MetricsFile<MetricBase, Comparable<?>> metricsFile,
        final File outputDirectory,
        final String outputPrefix,
        final String outputExtension) {
      final String fileName = String.format("%s.%s", outputPrefix, outputExtension);
      final File destination = new File(outputDirectory, fileName);

      final int metricCount = metricsFile.getMetrics().size();
      LOG.info(String.format("Writing %s lane metrics to %s ...", metricCount, destination));

      metricsFile.write(destination);
      return destination;
    }

    /**
     * Computes the cluster density of a lane as total clusters divided by total area, where each
     * tile's area is derived as its cluster count divided by its cluster density.
     *
     * <p>Tiles reporting a non-positive density contribute their clusters but no area, because
     * their area cannot be derived without dividing by zero.
     *
     * @param tiles tiles of one lane
     * @return the lane density, or 0.0 when no area could be derived (for example when there are
     *     no tiles) instead of NaN or Infinity
     */
    private static double calculateLaneDensityFromTiles(final Collection<Tile> tiles) {
      double area = 0;
      double clusters = 0;
      for (final Tile tile : tiles) {
        if (tile.getClusterDensity() > 0) {
          area += (tile.getClusterCount() / tile.getClusterDensity());
        }
        clusters += tile.getClusterCount();
      }
      return (area > 0) ? clusters / area : 0.0;
    }
  }
Ejemplo n.º 17
0
public class SamReaderFactoryTest {
  private static final File TEST_DATA_DIR = new File("src/test/resources/htsjdk/samtools");

  private static final Log LOG = Log.getInstance(SamReaderFactoryTest.class);

  /**
   * Opens {@code inputFile} from the test data directory with the default factory and iterates
   * over every record; the test passes if no exception is thrown.
   */
  @Test(dataProvider = "variousFormatReaderTestCases")
  public void variousFormatReaderTest(final String inputFile) throws IOException {
    final File input = new File(TEST_DATA_DIR, inputFile);
    // try-with-resources: the reader is released even when iteration throws.
    try (final SamReader reader = SamReaderFactory.makeDefault().open(input)) {
      for (final SAMRecord ignored : reader) {
        // intentionally empty: only the read path is being exercised
      }
    }
  }

  /**
   * Counts the records returned by {@code reader.queryOverlapping} for a single interval.
   *
   * @param reader reader to query
   * @param query the interval to look up
   * @return number of records produced by the query iterator
   */
  private int countRecordsInQueryInterval(final SamReader reader, final QueryInterval query) {
    int count = 0;
    // Close the iterator even if iteration fails, matching countRecords().
    try (final SAMRecordIterator iter = reader.queryOverlapping(new QueryInterval[] {query})) {
      while (iter.hasNext()) {
        iter.next();
        count++;
      }
    }
    return count;
  }

  /**
   * Walks the reader's iterator to the end and reports how many records it produced. The iterator
   * is closed before returning.
   */
  private int countRecords(final SamReader reader) {
    try (final SAMRecordIterator records = reader.iterator()) {
      int total = 0;
      for (; records.hasNext(); total++) {
        records.next();
      }
      return total;
    }
  }

  /**
   * Returns a new in-memory channel whose content is a fixed SAM header followed by all bytes read
   * from {@code input}.
   *
   * @param input channel to prefix; it is repositioned to offset 0 and read to the end
   * @return a channel over the header plus the original content
   * @throws RuntimeException wrapping any {@link IOException} raised while reading {@code input}
   */
  private static SeekableByteChannel addHeader(SeekableByteChannel input) {
    try {
      int total = (int) input.size();
      final String comment =
          "@HD\tVN:1.0  SO:unsorted\n"
              + "@SQ\tSN:chr1\tLN:101\n"
              + "@SQ\tSN:chr2\tLN:101\n"
              + "@SQ\tSN:chr3\tLN:101\n"
              + "@RG\tID:0\tSM:JP was here\n";

      // Explicit charset: the no-arg getBytes() depends on the platform default encoding.
      byte[] commentBuf = comment.getBytes(java.nio.charset.StandardCharsets.UTF_8);
      ByteBuffer buf = ByteBuffer.allocate(total + commentBuf.length);
      buf.put(commentBuf);
      input.position(0);
      while (input.read(buf) > 0) {
        // read until EOF (or until the buffer, sized to the input, is full)
      }
      buf.flip();
      return new SeekableByteChannelFromBuffer(buf);
    } catch (IOException x) {
      throw new RuntimeException(x);
    }
  }

  /**
   * Opens a header-less SAM file through a path wrapper that prepends a header (see {@code
   * addHeader}) and checks that 10 records are read.
   */
  @Test
  public void testWrap() throws IOException {
    final Path input = Paths.get(TEST_DATA_DIR.getPath(), "noheader.sam");
    // try-with-resources: the original never closed the wrapped reader.
    try (final SamReader wrappedReader =
        SamReaderFactory.makeDefault()
            .setPathWrapper(SamReaderFactoryTest::addHeader)
            .open(input)) {
      final int records = countRecords(wrappedReader);
      // TestNG order is (actual, expected); the arguments were previously swapped.
      Assert.assertEquals(records, 10);
    }
  }

  // See https://github.com/samtools/htsjdk/issues/76
  /**
   * Queries {@code issue76.bam} for records overlapping {@code [start, end]} on {@code
   * sequenceName} and checks the number of records returned.
   */
  @Test(dataProvider = "queryIntervalIssue76TestCases")
  public void queryIntervalIssue76(
      final String sequenceName, final int start, final int end, final int expectedCount)
      throws IOException {
    final File input = new File(TEST_DATA_DIR, "issue76.bam");
    // try-with-resources: previously the reader leaked whenever the assertion failed.
    try (final SamReader reader = SamReaderFactory.makeDefault().open(input)) {
      final QueryInterval interval =
          new QueryInterval(
              reader.getFileHeader().getSequence(sequenceName).getSequenceIndex(), start, end);
      Assert.assertEquals(countRecordsInQueryInterval(reader, interval), expectedCount);
    }
  }

  /**
   * Rows for {@code queryIntervalIssue76}: sequence name, interval start, interval end, expected
   * record count.
   */
  @DataProvider(name = "queryIntervalIssue76TestCases")
  public Object[][] queryIntervalIssue76TestCases() {
    final String sequence = "1";
    final Object[][] cases = {
      {sequence, 11966, 11966, 2},
      {sequence, 11966, 11967, 2},
      {sequence, 11967, 11967, 1}
    };
    return cases;
  }

  /** Single-column rows, each naming one input file under the test data directory. */
  @DataProvider(name = "variousFormatReaderTestCases")
  public Object[][] variousFormatReaderTestCases() {
    final String[] fileNames = {
      "block_compressed.sam.gz",
      "uncompressed.sam",
      "compressed.sam.gz",
      "compressed.bam",
      "unsorted.sam"
    };
    final Object[][] cases = new Object[fileNames.length][];
    for (int row = 0; row < fileNames.length; row++) {
      cases[row] = new Object[] {fileNames[row]};
    }
    return cases;
  }

  // Tests for the SAMRecordFactory usage
  class SAMRecordFactoryTester extends DefaultSAMRecordFactory {
    int samRecordsCreated;
    int bamRecordsCreated;

    public SAMRecord createSAMRecord(final SAMFileHeader header) {
      ++samRecordsCreated;
      return super.createSAMRecord(header);
    }

    public BAMRecord createBAMRecord(
        final SAMFileHeader header,
        final int referenceSequenceIndex,
        final int alignmentStart,
        final short readNameLength,
        final short mappingQuality,
        final int indexingBin,
        final int cigarLen,
        final int flags,
        final int readLen,
        final int mateReferenceSequenceIndex,
        final int mateAlignmentStart,
        final int insertSize,
        final byte[] variableLengthBlock) {
      ++bamRecordsCreated;
      return super.createBAMRecord(
          header,
          referenceSequenceIndex,
          alignmentStart,
          readNameLength,
          mappingQuality,
          indexingBin,
          cigarLen,
          flags,
          readLen,
          mateReferenceSequenceIndex,
          mateAlignmentStart,
          insertSize,
          variableLengthBlock);
    }
  }

  /**
   * Reads {@code inputFile} through a factory configured with a counting record factory and checks
   * that the number of records created matches the number of records iterated: the SAM counter for
   * {@code .sam}/{@code .sam.gz} inputs, the BAM counter for {@code .bam} inputs.
   */
  @Test(dataProvider = "variousFormatReaderTestCases")
  public void samRecordFactoryTest(final String inputFile) throws IOException {
    final File input = new File(TEST_DATA_DIR, inputFile);

    final SAMRecordFactoryTester recordFactory = new SAMRecordFactoryTester();
    final SamReaderFactory readerFactory =
        SamReaderFactory.makeDefault().samRecordFactory(recordFactory);

    int i = 0;
    // try-with-resources: the reader is closed even when iteration throws.
    try (final SamReader reader = readerFactory.open(input)) {
      for (final SAMRecord ignored : reader) {
        ++i;
      }
    }

    Assert.assertTrue(i > 0);
    if (inputFile.endsWith(".sam") || inputFile.endsWith(".sam.gz")) {
      Assert.assertEquals(recordFactory.samRecordsCreated, i);
    } else if (inputFile.endsWith(".bam")) {
      Assert.assertEquals(recordFactory.bamRecordsCreated, i);
    }
  }

  /**
   * Creating a BAM record with a null header is expected to raise {@link IllegalStateException}.
   */
  @Test(expectedExceptions = IllegalStateException.class)
  public void samRecordFactoryNullHeaderBAMTest() {
    final SAMRecordFactory factory = new DefaultSAMRecordFactory();
    final short zero = 0;
    factory.createBAMRecord(
        null, // the header under test: deliberately absent
        0,
        0,
        zero,
        zero,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        null);
  }

  /*
   * Fixtures for the unit tests asserting that all permutations of data and index sources read the
   * same records and header.
   */

  /** Local BAM file used as the data source. */
  final File localBam =
      new File("src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam");

  /** Local index file that accompanies {@link #localBam}. */
  final File localBamIndex =
      new File("src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam.bai");

  /** HTTP location of the BAM, built from {@code TestUtil.BASE_URL_FOR_HTTP_TESTS}. */
  final URL bamUrl;

  /** HTTP location of the BAM index, built from {@code TestUtil.BASE_URL_FOR_HTTP_TESTS}. */
  final URL bamIndexUrl;

  // Instance initializer: URL construction declares a checked exception, so it cannot be inlined.
  {
    try {
      bamUrl = new URL(TestUtil.BASE_URL_FOR_HTTP_TESTS + "index_test.bam");
      bamIndexUrl = new URL(TestUtil.BASE_URL_FOR_HTTP_TESTS + "index_test.bam.bai");
    } catch (final MalformedURLException malformed) {
      throw new RuntimeException(malformed);
    }
  }

  /**
   * Builds one data-provider row per {@link SamInputResource}: for every data source type, one
   * resource without an index plus one resource per index source type. {@code SRA_ACCESSION} is
   * skipped for both data and index.
   */
  @DataProvider
  public Object[][] composeAllPermutationsOfSamInputResource() {
    final List<SamInputResource> sources = new ArrayList<>();
    for (final InputResource.Type dataType : InputResource.Type.values()) {
      if (dataType == InputResource.Type.SRA_ACCESSION) {
        continue;
      }

      sources.add(new SamInputResource(composeInputResourceForType(dataType, false)));
      for (final InputResource.Type indexType : InputResource.Type.values()) {
        if (indexType == InputResource.Type.SRA_ACCESSION) {
          continue;
        }

        sources.add(
            new SamInputResource(
                composeInputResourceForType(dataType, false),
                composeInputResourceForType(indexType, true)));
      }
    }

    // Fill rows by position. The previous indexOf() lookup was quadratic and would have left
    // null rows if two resources ever compared equal.
    final Object[][] data = new Object[sources.size()][];
    for (int i = 0; i < data.length; i++) {
      data[i] = new Object[] {sources.get(i)};
    }

    return data;
  }

  /**
   * Creates an {@link InputResource} of the requested kind backed by the test BAM or its index.
   *
   * @param type the kind of resource to create
   * @param forIndex {@code true} to point at the index, {@code false} to point at the BAM
   * @return a resource of the requested type
   * @throws RuntimeIOException if the local file cannot be opened for the {@code INPUT_STREAM} case
   * @throws IllegalStateException if {@code type} is not one of the handled kinds
   */
  private InputResource composeInputResourceForType(
      final InputResource.Type type, final boolean forIndex) {
    final File f = forIndex ? localBamIndex : localBam;
    final URL url = forIndex ? bamIndexUrl : bamUrl;
    switch (type) {
      case FILE:
        return new FileInputResource(f);
      case PATH:
        return new PathInputResource(f.toPath(), Function.identity());
      case URL:
        return new UrlInputResource(url);
      case SEEKABLE_STREAM:
        return new SeekableStreamInputResource(new SeekableHTTPStream(url));
      case INPUT_STREAM:
        try {
          // The stream is handed to the resource and is not closed here.
          return new InputStreamInputResource(new FileInputStream(f));
        } catch (final FileNotFoundException e) {
          throw new RuntimeIOException(e);
        }
      default:
        // Name the offending type so a newly added enum constant is easy to diagnose.
        throw new IllegalStateException("Unsupported input resource type: " + type);
    }
  }

  /** Distinct headers seen across all invocations of {@code exhaustInputResourcePermutation}. */
  final Set<SAMFileHeader> observedHeaders = new HashSet<>();

  /** Distinct record lists seen across all invocations of the same test. */
  final Set<List<SAMRecord>> observedRecordOrdering = new HashSet<>();

  /**
   * Reads every record and the header from {@code resource} and checks that they equal what every
   * earlier invocation read: the accumulating sets must each keep exactly one distinct element.
   */
  @Test(dataProvider = "composeAllPermutationsOfSamInputResource")
  public void exhaustInputResourcePermutation(final SamInputResource resource) throws IOException {
    final List<SAMRecord> slurped;
    final SAMFileHeader fileHeader;
    // try-with-resources: the reader is closed even when reading throws.
    try (final SamReader reader = SamReaderFactory.makeDefault().open(resource)) {
      LOG.info(String.format("Reading from %s ...", resource));
      slurped = Iterables.slurp(reader);
      fileHeader = reader.getFileHeader();
      // Return values are ignored; these calls only need to complete without throwing.
      reader.hasIndex();
      reader.indexing().hasBrowseableIndex();
    }

    /* Ensure all tests have read the same records in the same order or, if this is the first test, set it as the template. */
    observedHeaders.add(fileHeader);
    observedRecordOrdering.add(slurped);
    Assert.assertEquals(observedHeaders.size(), 1, "read different headers than other testcases");
    Assert.assertEquals(
        observedRecordOrdering.size(), 1, "read different records than other testcases");
  }

  /**
   * Reads the local BAM once through a {@link Path} and once through a {@link File} and checks
   * that both yield equal records and an equal header.
   */
  @Test
  public void openPath() throws IOException {
    final Path path = localBam.toPath();
    final List<SAMRecord> records;
    final SAMFileHeader fileHeader;
    // try-with-resources closes the reader; the explicit close() that used to sit inside this
    // block closed it a second time and has been removed.
    try (final SamReader reader = SamReaderFactory.makeDefault().open(path)) {
      LOG.info(String.format("Reading from %s ...", path));
      records = Iterables.slurp(reader);
      fileHeader = reader.getFileHeader();
    }

    try (final SamReader fileReader = SamReaderFactory.makeDefault().open(localBam)) {
      final List<SAMRecord> expectedRecords = Iterables.slurp(fileReader);
      final SAMFileHeader expectedFileHeader = fileReader.getFileHeader();
      Assert.assertEquals(records, expectedRecords);
      Assert.assertEquals(fileHeader, expectedFileHeader);
    }
  }

  /** Distinct result lists seen for the chr1 query across invocations. */
  final Set<List<SAMRecord>> observedRecordOrdering1 = new HashSet<>();

  /** Distinct result lists seen for the chr3 query across invocations. */
  final Set<List<SAMRecord>> observedRecordOrdering3 = new HashSet<>();

  /** Distinct result lists seen for the chr20 query across invocations. */
  final Set<List<SAMRecord>> observedRecordOrdering20 = new HashSet<>();

  /**
   * When the reader opened from {@code resource} reports an index, runs three fixed region queries
   * and checks that each returns the same records as in every earlier invocation. Resources
   * without a usable index are only logged.
   */
  @Test(dataProvider = "composeAllPermutationsOfSamInputResource")
  public void queryInputResourcePermutation(final SamInputResource resource) throws IOException {
    // try-with-resources on the reader and on each iterator: nothing is leaked when a query or
    // an assertion fails. Each iterator is closed before the next query is issued, as before.
    try (final SamReader reader = SamReaderFactory.makeDefault().open(resource)) {
      LOG.info(String.format("Query from %s ...", resource));
      if (reader.hasIndex()) {
        final StopWatch stopWatch = new StopWatch();
        stopWatch.start();
        try (final SAMRecordIterator q1 = reader.query("chr1", 500000, 100000000, true)) {
          observedRecordOrdering1.add(Iterables.slurp(q1));
        }
        try (final SAMRecordIterator q20 = reader.query("chr20", 1, 1000000, true)) {
          observedRecordOrdering20.add(Iterables.slurp(q20));
        }
        try (final SAMRecordIterator q3 = reader.query("chr3", 1, 10000000, true)) {
          observedRecordOrdering3.add(Iterables.slurp(q3));
        }
        stopWatch.stop();
        LOG.info(String.format("Finished queries in %sms", stopWatch.getElapsedTime()));

        Assert.assertEquals(
            observedRecordOrdering1.size(), 1, "read different records for chromosome 1");
        Assert.assertEquals(
            observedRecordOrdering20.size(), 1, "read different records for chromosome 20");
        Assert.assertEquals(
            observedRecordOrdering3.size(), 1, "read different records for chromosome 3");
      } else if (resource.indexMaybe() != null) {
        LOG.warn("Resource has an index source, but is not indexed: " + resource);
      } else {
        LOG.info("Skipping query operation: no index.");
      }
    }
  }

  /**
   * A {@link PathInputResource} whose {@link #asFile()} always answers {@code null}, so that it
   * looks like a path with no backing file. Lets tests reach the branches meant for non-file
   * paths (cloud URLs, for example) while still reading from local test data.
   */
  private static class NeverFilePathInputResource extends PathInputResource {
    public NeverFilePathInputResource(Path path) {
      super(path);
    }

    /** Always {@code null}: this resource never admits to being a file. */
    @Override
    public File asFile() {
      return null;
    }
  }

  /**
   * A BAM supplied as a non-file path together with a file-based index must open as an indexed
   * reader.
   */
  @Test
  public void checkHasIndexForStreamingPathBamWithFileIndex() throws IOException {
    final SamInputResource streamingBamWithIndex =
        new SamInputResource(
            new NeverFilePathInputResource(localBam.toPath()),
            new FileInputResource(localBamIndex));

    // ensure that the index is being used, not checked in queryInputResourcePermutation
    try (final SamReader indexedReader =
        SamReaderFactory.makeDefault().open(streamingBamWithIndex)) {
      final boolean indexed = indexedReader.hasIndex();
      Assert.assertTrue(indexed);
    }
  }

  /**
   * Runs the shared query checks of {@code queryInputResourcePermutation} against a BAM supplied
   * as a non-file path with a file-based index.
   */
  @Test
  public void queryStreamingPathBamWithFileIndex() throws IOException {
    InputResource bam = new NeverFilePathInputResource(localBam.toPath());
    InputResource index = new FileInputResource(localBamIndex);

    // Pass the resource built here; the local used to be dead while a second copy was created.
    final SamInputResource resource = new SamInputResource(bam, index);
    queryInputResourcePermutation(resource);
  }

  /**
   * Installs a custom reader factory mapping for a URL prefix, opens a matching URL, and checks
   * that at least one record is read. The default factory instance is restored afterwards.
   */
  @Test
  public void customReaderFactoryTest() throws IOException {
    try {
      CustomReaderFactory.setInstance(
          new CustomReaderFactory(
              "https://www.googleapis.com/genomics/v1beta/reads/,"
                  + "htsjdk.samtools.SamReaderFactoryTest$TestReaderFactory"));
      int i = 0;
      // try-with-resources: the reader is closed even when iteration throws.
      try (final SamReader reader =
          SamReaderFactory.makeDefault()
              .open(
                  SamInputResource.of(
                      "https://www.googleapis.com/genomics/v1beta/reads/?uncompressed.sam"))) {
        for (@SuppressWarnings("unused") final SAMRecord ignored : reader) {
          ++i;
        }
      }

      Assert.assertTrue(i > 0);
    } finally {
      CustomReaderFactory.resetToDefaultInstance();
    }
  }

  /**
   * Custom reader factory used by {@code customReaderFactoryTest}: treats the URL's query string
   * as a file name inside the test data directory and opens that file with the default factory.
   */
  public static class TestReaderFactory implements CustomReaderFactory.ICustomReaderFactory {
    @Override
    public SamReader open(URL url) {
      final String fileName = url.getQuery();
      final File target = new File(TEST_DATA_DIR, fileName);
      LOG.info("Opening customr reader for " + target);
      return SamReaderFactory.makeDefault().open(target);
    }
  }

  /**
   * {@code SamInputResource.of(String)} must classify http, https and ftp strings as URL resources
   * and a plain absolute path as a FILE resource.
   */
  @Test
  public void inputResourceFromStringTest() throws IOException {
    final String[] remoteLocations = {"http://test.url", "https://test.url", "ftp://test.url"};
    for (final String location : remoteLocations) {
      Assert.assertEquals(SamInputResource.of(location).data().type(), InputResource.Type.URL);
    }

    final String localLocation = "/a/b/c";
    Assert.assertEquals(
        SamInputResource.of(localLocation).data().type(), InputResource.Type.FILE);
  }

  /** CRAM and index both supplied as URLs; the shared helper expects 3 records from its query. */
  @Test
  public void testCRAMReaderFromURL() throws IOException {
    // get a CRAM reader with an index from a URL-backed resource
    final BiFunction<URL, URL, SamInputResource> urlBacked =
        (cram, index) -> SamInputResource.of(cram).index(index);
    getCRAMReaderFromInputResource(urlBacked, true, 3);
  }

  /**
   * CRAM and index both supplied as seekable streams opened from URLs; the shared helper expects 3
   * records from its query.
   */
  @Test
  public void testCRAMReaderFromURLStream() throws IOException {
    // get a CRAM reader with an index from a stream-backed resource created from a URL
    final BiFunction<URL, URL, SamInputResource> streamBacked =
        (cram, index) -> {
          try {
            final ISeekableStreamFactory streams = SeekableStreamFactory.getInstance();
            return SamInputResource.of(streams.getStreamFor(cram))
                .index(streams.getStreamFor(index));
          } catch (IOException failure) {
            throw new RuntimeIOException(failure);
          }
        };
    getCRAMReaderFromInputResource(streamBacked, true, 3);
  }

  /** CRAM supplied as a URL with no index; the shared helper expects 11 records in total. */
  @Test
  public void testCRAMReaderFromURLNoIndexFile() throws IOException {
    // get just a CRAM reader (no index) from an URL-backed resource
    final BiFunction<URL, URL, SamInputResource> withoutIndex =
        (cram, unusedIndex) -> SamInputResource.of(cram);
    getCRAMReaderFromInputResource(withoutIndex, false, 11);
  }

  /** Pairing the CRAM URL with a missing index file must raise {@link RuntimeIOException}. */
  @Test(expectedExceptions = RuntimeIOException.class)
  public void testCRAMReaderFromURLBadIndexFile() throws IOException {
    // deliberately specify a bad index file to ensure we get an IOException
    final BiFunction<URL, URL, SamInputResource> withMissingIndex =
        (cram, unusedIndex) -> SamInputResource.of(cram).index(new File("nonexistent.bai"));
    getCRAMReaderFromInputResource(withMissingIndex, true, 3);
  }

  /**
   * Opens the test CRAM through a caller-supplied resource and checks how many records it yields.
   *
   * @param getInputResource builds the resource from the CRAM URL and the index URL (both {@code
   *     file://} URLs into the test data directory)
   * @param hasIndex when {@code true}, records are counted through an interval query on {@code
   *     QueryInterval(1, 10, 1000)}; otherwise every record is counted
   * @param expectedCount number of records the chosen counting strategy must return
   * @throws IOException if the URLs cannot be built or the reader fails to close
   */
  private void getCRAMReaderFromInputResource(
      final BiFunction<URL, URL, SamInputResource> getInputResource,
      final boolean hasIndex,
      final int expectedCount)
      throws IOException {
    final String cramFilePath =
        new File(TEST_DATA_DIR, "cram_with_bai_index.cram").getAbsolutePath();
    final String cramIndexPath =
        new File(TEST_DATA_DIR, "cram_with_bai_index.cram.bai").getAbsolutePath();
    final URL cramURL = new URL("file://" + cramFilePath);
    final URL indexURL = new URL("file://" + cramIndexPath);

    final SamReaderFactory factory =
        SamReaderFactory.makeDefault()
            .referenceSource(new ReferenceSource(new File(TEST_DATA_DIR, "hg19mini.fasta")))
            .validationStringency(ValidationStringency.SILENT);

    // try-with-resources: the original never closed this reader.
    try (final SamReader reader = factory.open(getInputResource.apply(cramURL, indexURL))) {
      final int count =
          hasIndex
              ? countRecordsInQueryInterval(reader, new QueryInterval(1, 10, 1000))
              : countRecords(reader);
      Assert.assertEquals(count, expectedCount);
    }
  }

  /** Opens {@code unsorted.sam} through a seekable file stream and expects 10 records. */
  @Test
  public void testSamReaderFromSeekableStream() throws IOException {
    // even though a SAM isn't indexable, make sure we can open one
    // using a seekable stream
    final File samFile = new File(TEST_DATA_DIR, "unsorted.sam");
    final SamReaderFactory factory =
        SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT);
    // try-with-resources: the original never closed this reader.
    try (final SamReader reader =
        factory.open(SamInputResource.of(new SeekableFileStream(samFile)))) {
      Assert.assertEquals(countRecords(reader), 10);
    }
  }

  /** Opens {@code unsorted.sam} through a {@code file://} URL and expects 10 records. */
  @Test
  public void testSamReaderFromURL() throws IOException {
    final String samFilePath = new File(TEST_DATA_DIR, "unsorted.sam").getAbsolutePath();
    final URL samURL = new URL("file://" + samFilePath);
    final SamReaderFactory factory =
        SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT);
    // try-with-resources: the original never closed this reader.
    try (final SamReader reader = factory.open(SamInputResource.of(samURL))) {
      Assert.assertEquals(countRecords(reader), 10);
    }
  }

  /**
   * Feeds a {@code .bai} file to the factory as if it were alignment data and expects a {@link
   * SAMFormatException}.
   */
  @Test(expectedExceptions = SAMFormatException.class)
  public void testSamReaderFromMalformedSeekableStream() throws IOException {
    // use a bogus (.bai file) to force SamReaderFactory to fall through to the
    // fallback code that assumes a SAM File when it can't determine the
    // format of the input, to ensure that it results in a SAMFormatException
    final File samFile = new File(TEST_DATA_DIR, "cram_with_bai_index.cram.bai");
    final SamReaderFactory factory =
        SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT);
    // try-with-resources: if a reader is obtained it is closed even though reading then fails.
    try (final SamReader reader =
        factory.open(SamInputResource.of(new SeekableFileStream(samFile)))) {
      countRecords(reader);
    }
  }
}