Example #1
  @Test(dataProvider = "composeAllPermutationsOfSamInputResource")
  public void queryInputResourcePermutation(final SamInputResource resource) throws IOException {
    final SamReader reader = SamReaderFactory.makeDefault().open(resource);
    LOG.info(String.format("Query from %s ...", resource));
    if (reader.hasIndex()) {
      final StopWatch stopWatch = new StopWatch();
      stopWatch.start();
      final SAMRecordIterator q1 = reader.query("chr1", 500000, 100000000, true);
      observedRecordOrdering1.add(Iterables.slurp(q1));
      q1.close();
      final SAMRecordIterator q20 = reader.query("chr20", 1, 1000000, true);
      observedRecordOrdering20.add(Iterables.slurp(q20));
      q20.close();
      final SAMRecordIterator q3 = reader.query("chr3", 1, 10000000, true);
      observedRecordOrdering3.add(Iterables.slurp(q3));
      q3.close();
      stopWatch.stop();
      LOG.info(String.format("Finished queries in %sms", stopWatch.getElapsedTime()));

      Assert.assertEquals(
          observedRecordOrdering1.size(), 1, "read different records for chromosome 1");
      Assert.assertEquals(
          observedRecordOrdering20.size(), 1, "read different records for chromosome 20");
      Assert.assertEquals(
          observedRecordOrdering3.size(), 1, "read different records for chromosome 3");
    } else if (resource.indexMaybe() != null) {
      LOG.warn("Resource has an index source, but is not indexed: " + resource);
    } else {
      LOG.info("Skipping query operation: no index.");
    }
    reader.close();
  }
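Stripped of the cross-permutation bookkeeping, the indexed-query pattern this test exercises reduces to the sketch below; the file name and query coordinates are placeholders rather than test data.

  // Minimal sketch of an indexed query, assuming a coordinate-sorted, indexed BAM.
  final SamReader reader = SamReaderFactory.makeDefault().open(new File("input.bam"));
  if (reader.hasIndex()) {
    // contained=true restricts results to records lying entirely within the interval
    final SAMRecordIterator it = reader.query("chr1", 500000, 100000000, true);
    while (it.hasNext()) {
      final SAMRecord rec = it.next();
      // ... process rec ...
    }
    it.close();
  }
  reader.close();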
Example #2
  /**
   * Defines the behaviour when the EOF marker is not found. Depending on the CRAM version, the
   * condition is ignored, a warning is issued, or an exception is thrown.
   *
   * @param version CRAM version to assume
   */
  public static void eofNotFound(final Version version) {
    if (version.compatibleWith(CramVersions.CRAM_v3)) {
      log.error("Incomplete data: EOF marker not found.");
      throw new RuntimeException("EOF not found.");
    }
    if (version.compatibleWith(CramVersions.CRAM_v2_1))
      log.warn("EOF marker not found, possibly incomplete file/stream.");
  }
Example #3
  /**
   * Checks every pair of read groups and reports a LOD score for the hypothesis that the two read
   * groups come from the same sample.
   */
  private int crossCheckReadGroups(
      final Map<SAMReadGroupRecord, Fingerprint> fingerprints, final PrintStream out) {
    int mismatches = 0;
    int unexpectedMatches = 0;

    final List<SAMReadGroupRecord> readGroupRecords = new ArrayList<>(fingerprints.keySet());
    final List<String> output = new ArrayList<>();

    for (int i = 0; i < readGroupRecords.size(); i++) {
      final SAMReadGroupRecord lhsRg = readGroupRecords.get(i);
      for (int j = i + 1; j < readGroupRecords.size(); j++) {
        final SAMReadGroupRecord rhsRg = readGroupRecords.get(j);
        final boolean expectedToMatch =
            EXPECT_ALL_READ_GROUPS_TO_MATCH || lhsRg.getSample().equals(rhsRg.getSample());

        final MatchResults results =
            FingerprintChecker.calculateMatchResults(
                fingerprints.get(lhsRg),
                fingerprints.get(rhsRg),
                GENOTYPING_ERROR_RATE,
                LOSS_OF_HET_RATE);
        if (expectedToMatch) {
          if (results.getLOD() < LOD_THRESHOLD) {
            mismatches++;
            output.add(getMatchDetails(UNEXPECTED_MISMATCH, results, lhsRg, rhsRg));
          } else {
            if (!OUTPUT_ERRORS_ONLY) {
              output.add(getMatchDetails(EXPECTED_MATCH, results, lhsRg, rhsRg));
            }
          }
        } else {
          if (results.getLOD() > -LOD_THRESHOLD) {
            unexpectedMatches++;
            output.add(getMatchDetails(UNEXPECTED_MATCH, results, lhsRg, rhsRg));
          } else {
            if (!OUTPUT_ERRORS_ONLY) {
              output.add(getMatchDetails(EXPECTED_MISMATCH, results, lhsRg, rhsRg));
            }
          }
        }
      }
    }

    if (!output.isEmpty()) {
      out.println(
          "RESULT\tLOD_SCORE\tLOD_SCORE_TUMOR_NORMAL\tLOD_SCORE_NORMAL_TUMOR\tLEFT_RUN_BARCODE\tLEFT_LANE\tLEFT_MOLECULAR_BARCODE_SEQUENCE\tLEFT_LIBRARY\tLEFT_SAMPLE\t"
              + "RIGHT_RUN_BARCODE\tRIGHT_LANE\tRIGHT_MOLECULAR_BARCODE_SEQUENCE\tRIGHT_LIBRARY\tRIGHT_SAMPLE");
      out.println(String.join("\n", output));
    }

    if (mismatches + unexpectedMatches > 0) {
      log.info("WARNING: At least two read groups did not relate as expected.");
      return EXIT_CODE_WHEN_MISMATCH;
    } else {
      log.info("All read groups related as expected.");
      return 0;
    }
  }
Example #4
  OverlapDetector<Gene> load() {
    final OverlapDetector<Gene> overlapDetector = new OverlapDetector<Gene>(0, 0);

    final int expectedColumns = RefFlatColumns.values().length;
    final TabbedTextFileWithHeaderParser parser =
        new TabbedTextFileWithHeaderParser(refFlatFile, RefFlatColumnLabels);
    final Map<String, List<TabbedTextFileWithHeaderParser.Row>> refFlatLinesByGene =
        new HashMap<String, List<TabbedTextFileWithHeaderParser.Row>>();

    for (final TabbedTextFileWithHeaderParser.Row row : parser) {
      final int lineNumber =
          parser.getCurrentLineNumber(); // getCurrentLineNumber returns the number of the next line
      if (row.getFields().length != expectedColumns) {
        throw new AnnotationException(
            "Wrong number of fields in refFlat file " + refFlatFile + " at line " + lineNumber);
      }
      final String geneName = row.getField(RefFlatColumns.GENE_NAME.name());
      final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
      final String transcriptDescription = geneName + ":" + transcriptName;
      final String chromosome = row.getField(RefFlatColumns.CHROMOSOME.name());
      if (!isSequenceRecognized(chromosome)) {
        LOG.debug(
            "Skipping " + transcriptDescription + " due to unrecognized sequence " + chromosome);
      } else {
        List<TabbedTextFileWithHeaderParser.Row> transcriptLines = refFlatLinesByGene.get(geneName);
        if (transcriptLines == null) {
          transcriptLines = new ArrayList<TabbedTextFileWithHeaderParser.Row>();
          refFlatLinesByGene.put(geneName, transcriptLines);
        }
        transcriptLines.add(row);
      }
    }

    int longestInterval = 0;
    int numIntervalsOver1MB = 0;

    for (final List<TabbedTextFileWithHeaderParser.Row> transcriptLines :
        refFlatLinesByGene.values()) {
      try {
        final Gene gene = makeGeneFromRefFlatLines(transcriptLines);
        overlapDetector.addLhs(gene, gene);
        if (gene.length() > longestInterval) longestInterval = gene.length();
        if (gene.length() > 1000000) ++numIntervalsOver1MB;
      } catch (AnnotationException e) {
        LOG.debug(e.getMessage() + " -- skipping");
      }
    }
    LOG.debug(
        "Longest gene: " + longestInterval + "; number of genes > 1MB: " + numIntervalsOver1MB);
    return overlapDetector;
  }
Example #5
  /**
   * Generates a tab-delimited string containing details about the passed SAMReadGroupRecord.
   *
   * @param readGroupRecord record
   * @return tab-delimited string containing details about the SAMReadGroupRecord
   */
  private String getReadGroupDetails(final SAMReadGroupRecord readGroupRecord) {
    final List<String> elements = new ArrayList<>(5);

    final String[] tmp =
        readGroupRecord
            .getPlatformUnit()
            .split("\\."); // Expect to look like: D047KACXX110901.1.ACCAACTG
    String runBarcode = "?";
    String lane = "?";
    String molBarcode = "?";
    if ((tmp.length == 3) || (tmp.length == 2)) {
      runBarcode = tmp[0];
      lane = tmp[1];
      molBarcode =
          (tmp.length == 3)
              ? tmp[2]
              : ""; // In older BAMS there may be no molecular barcode sequence
    } else {
      log.error("Unexpected format " + readGroupRecord.getPlatformUnit() + " for PU attribute");
    }
    elements.add(runBarcode);
    elements.add(lane);
    elements.add(molBarcode);
    elements.add(readGroupRecord.getLibrary());
    elements.add(readGroupRecord.getSample());
    return String.join("\t", elements);
  }
Example #6
  /**
   * Creates a factory with the specified options, one that favors using QSeqs over all other files.
   *
   * @param basecallDirectory The baseCalls directory of a complete Illumina directory. Files are
   *     found by searching relative to this folder (some of them higher up in the directory tree).
   * @param barcodesDirectory The barcodesDirectory with barcode files extracted by
   *     'ExtractIlluminaBarcodes' (optional, use basecallDirectory if not specified)
   * @param lane Which lane to iterate over.
   * @param readStructure The read structure to which output clusters will conform. When not using
   *     QSeqs, EAMSS masking (see BclParser) is run on individual reads as found in the
   *     readStructure; if the specified readStructure does not match the readStructure implied by
   *     the sequencer's output, then the quality scores output may differ from what would be found
   *     in a run's QSeq files.
   * @param dataTypesArg Which data types to read
   */
  public IlluminaDataProviderFactory(
      final File basecallDirectory,
      File barcodesDirectory,
      final int lane,
      final ReadStructure readStructure,
      final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
      final IlluminaDataType... dataTypesArg) {
    this.basecallDirectory = basecallDirectory;
    this.barcodesDirectory = barcodesDirectory;
    this.bclQualityEvaluationStrategy = bclQualityEvaluationStrategy;

    this.lane = lane;
    /* The types of data that will be returned by any IlluminaDataProviders created by this factory.

    Note: In previous versions, data of types not specified might be returned if a data type was specified
    for data residing in QSeqs (since QSeqs span multiple data types). This is no longer the case; you
    MUST specify all data types that should be returned.*/
    final Set<IlluminaDataType> dataTypes =
        Collections.unmodifiableSet(new HashSet<IlluminaDataType>(Arrays.asList(dataTypesArg)));

    if (dataTypes.isEmpty()) {
      throw new PicardException(
          "No data types have been specified for basecall output "
              + basecallDirectory
              + ", lane "
              + lane);
    }

    this.fileUtil = new IlluminaFileUtil(basecallDirectory, barcodesDirectory, lane);

    // find which requested IlluminaDataTypes we have files for and select the most preferred
    // available file format for each type
    formatToDataTypes = determineFormats(dataTypes, fileUtil);

    // find if we have any IlluminaDataType with NO available file formats and, if any exist, throw
    // an exception
    final Set<IlluminaDataType> unmatchedDataTypes =
        findUnmatchedTypes(dataTypes, formatToDataTypes);
    if (!unmatchedDataTypes.isEmpty()) {
      throw new PicardException(
          "Could not find a format with available files for the following data types: "
              + StringUtil.join(", ", new ArrayList<IlluminaDataType>(unmatchedDataTypes)));
    }

    log.debug(
        "The following file formats will be used by IlluminaDataProvider: "
            + StringUtil.join(", ", formatToDataTypes.keySet()));

    availableTiles =
        fileUtil.getActualTiles(new ArrayList<SupportedIlluminaFormat>(formatToDataTypes.keySet()));
    if (availableTiles.isEmpty()) {
      throw new PicardException(
          "No available tiles were found, make sure that "
              + basecallDirectory.getAbsolutePath()
              + " has a lane "
              + lane);
    }

    outputMapping = new OutputMapping(readStructure);
  }
Example #7
/**
 * Command line program to print statistics from a BAM index (.bai) file. Statistics include the
 * count of aligned and unaligned reads for each reference sequence and a count of all records with
 * no start coordinate. Similar to the 'samtools idxstats' command.
 *
 * @author Martha Borkan
 */
@CommandLineProgramProperties(
    usage =
        "Generates BAM index statistics, including the number of aligned and unaligned SAMRecords for each reference sequence, "
            + "and the number of SAMRecords with no coordinate."
            + "Input BAM file must have a corresponding index file.\n",
    usageShort = "Generates index statistics from a BAM file",
    programGroup = SamOrBam.class)
public class BamIndexStats extends CommandLineProgram {

  private static final Log log = Log.getInstance(BamIndexStats.class);

  @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "A BAM file to process.")
  public File INPUT;

  /** Stock main method for a command line program. */
  public static void main(final String[] argv) {
    System.exit(new BamIndexStats().instanceMain(argv));
  }

  /**
   * Main method for the program. Checks that the input file is present and readable, then iterates
   * through the index, printing metadata to stdout.
   */
  protected int doWork() {

    if (INPUT.getName().endsWith(BAMIndex.BAMIndexSuffix))
      log.warn("INPUT should be the BAM file name, not its index file");
    IOUtil.assertFileIsReadable(INPUT);
    BAMIndexMetaData.printIndexStats(INPUT);

    return 0;
  }
}
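For reference, the program can also be driven in-process through the same entry point that main() uses; a hedged sketch, with the option value as a placeholder:

  // Hedged sketch of an in-process invocation; "I=input.bam" is a placeholder path.
  // Equivalent to: java -jar picard.jar BamIndexStats I=input.bam
  final int exitCode = new BamIndexStats().instanceMain(new String[] {"I=input.bam"});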
Example #8
  /**
   * Prepares loggers, initiates the garbage collection thread, parses arguments, and initializes
   * variables appropriately.
   */
  private void initialize() {
    fastqWriterFactory.setCreateMd5(CREATE_MD5_FILE);
    switch (READ_NAME_FORMAT) {
      case CASAVA_1_8:
        readNameEncoder = new Casava18ReadNameEncoder(MACHINE_NAME, RUN_BARCODE, FLOWCELL_BARCODE);
        break;
      case ILLUMINA:
        readNameEncoder = new IlluminaReadNameEncoder(RUN_BARCODE);
        break;
    }

    final BclQualityEvaluationStrategy bclQualityEvaluationStrategy =
        new BclQualityEvaluationStrategy(MINIMUM_QUALITY);
    readStructure = new ReadStructure(READ_STRUCTURE);
    if (MULTIPLEX_PARAMS != null) {
      IOUtil.assertFileIsReadable(MULTIPLEX_PARAMS);
    }
    final boolean demultiplex;
    if (OUTPUT_PREFIX != null) {
      sampleBarcodeFastqWriterMap.put(null, buildWriter(OUTPUT_PREFIX));
      demultiplex = false;
    } else {
      populateWritersFromMultiplexParams();
      demultiplex = true;
    }
    final int readsPerCluster =
        readStructure.templates.length() + readStructure.sampleBarcodes.length();
    basecallsConverter =
        new IlluminaBasecallsConverter<FastqRecordsForCluster>(
            BASECALLS_DIR,
            BARCODES_DIR,
            LANE,
            readStructure,
            sampleBarcodeFastqWriterMap,
            demultiplex,
            MAX_READS_IN_RAM_PER_TILE / readsPerCluster,
            TMP_DIR,
            NUM_PROCESSORS,
            FORCE_GC,
            FIRST_TILE,
            TILE_LIMIT,
            queryNameComparator,
            new FastqRecordsForClusterCodec(
                readStructure.templates.length(),
                readStructure.sampleBarcodes.length(),
                readStructure.molecularBarcode.length()),
            FastqRecordsForCluster.class,
            bclQualityEvaluationStrategy,
            this.APPLY_EAMSS_FILTER,
            INCLUDE_NON_PF_READS,
            IGNORE_UNEXPECTED_BARCODES);

    log.info("READ STRUCTURE IS " + readStructure.toString());

    basecallsConverter.setConverter(
        new ClusterToFastqRecordsForClusterConverter(
            basecallsConverter.getFactory().getOutputReadStructure()));
  }
Example #9
  /**
   * Main method for the program. Checks that the input file is present and readable, then iterates
   * through the index, printing metadata to stdout.
   */
  protected int doWork() {

    if (INPUT.getName().endsWith(BAMIndex.BAMIndexSuffix))
      log.warn("INPUT should be the BAM file name, not its index file");
    IOUtil.assertFileIsReadable(INPUT);
    BAMIndexMetaData.printIndexStats(INPUT);

    return 0;
  }
Example #10
  /**
   * Main method for the program. Checks that all input files are present and readable and that the
   * output file can be written to, then iterates through all the records generating a BAM index,
   * and finally writes the bai file.
   */
  protected int doWork() {

    try {
      inputUrl = new URL(INPUT);
    } catch (java.net.MalformedURLException e) {
      inputFile = new File(INPUT);
    }

    // set default output file - input-file.bai
    if (OUTPUT == null) {

      final String baseFileName;
      if (inputUrl != null) {
        final String path = inputUrl.getPath();
        final int lastSlash = path.lastIndexOf('/');
        baseFileName = path.substring(lastSlash + 1);
      } else {
        baseFileName = inputFile.getAbsolutePath();
      }

      if (baseFileName.endsWith(BamFileIoUtils.BAM_FILE_EXTENSION)) {

        final int index = baseFileName.lastIndexOf(".");
        OUTPUT = new File(baseFileName.substring(0, index) + BAMIndex.BAMIndexSuffix);

      } else {
        OUTPUT = new File(baseFileName + BAMIndex.BAMIndexSuffix);
      }
    }

    IOUtil.assertFileIsWritable(OUTPUT);
    final SAMFileReader bam;

    if (inputUrl != null) {
      // remote input
      bam = new SAMFileReader(inputUrl, null, false);
    } else {
      // input from a normal file
      IOUtil.assertFileIsReadable(inputFile);
      bam = new SAMFileReader(inputFile);
    }

    if (!bam.isBinary()) {
      throw new SAMException("Input file must be bam file, not sam file.");
    }

    if (!bam.getFileHeader().getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
      throw new SAMException("Input bam file must be sorted by coordinates");
    }

    BAMIndexer.createIndex(bam, OUTPUT);

    log.info("Successfully wrote bam index file " + OUTPUT);
    CloserUtil.close(bam);
    return 0;
  }
Example #11
  @Override
  protected int doWork() {
    // Check inputs
    for (final File f : INPUT) IOUtil.assertFileIsReadable(f);
    IOUtil.assertFileIsReadable(HAPLOTYPE_MAP);
    if (OUTPUT != null) IOUtil.assertFileIsWritable(OUTPUT);

    final HaplotypeMap map = new HaplotypeMap(HAPLOTYPE_MAP);
    final FingerprintChecker checker = new FingerprintChecker(map);

    checker.setAllowDuplicateReads(ALLOW_DUPLICATE_READS);

    log.info("Done checking input files, moving onto fingerprinting files.");

    List<File> unrolledFiles =
        IOUtil.unrollFiles(INPUT, BamFileIoUtils.BAM_FILE_EXTENSION, IOUtil.SAM_FILE_EXTENSION);
    final Map<SAMReadGroupRecord, Fingerprint> fpMap =
        checker.fingerprintSamFiles(unrolledFiles, NUM_THREADS, 1, TimeUnit.DAYS);
    final List<Fingerprint> fingerprints = new ArrayList<>(fpMap.values());

    log.info("Finished generating fingerprints from BAM files, moving on to cross-checking.");

    // Setup the output
    final PrintStream out;
    if (OUTPUT != null) {
      out = new PrintStream(IOUtil.openFileForWriting(OUTPUT), true);
    } else {
      out = System.out;
    }

    if (this.CROSSCHECK_SAMPLES) {
      crossCheckSamples(fingerprints, out);
      return 0;
    } else if (this.CROSSCHECK_LIBRARIES) {
      crossCheckLibraries(fpMap, out);
      return 0;
    } else {
      return crossCheckReadGroups(fpMap, out);
    }
  }
Example #12
  private static File writeMetrics(
      final MetricsFile<MetricBase, Comparable<?>> metricsFile,
      final File outputDirectory,
      final String outputPrefix,
      final String outputExtension) {
    final File outputFile =
        new File(outputDirectory, String.format("%s.%s", outputPrefix, outputExtension));
    LOG.info(
        String.format(
            "Writing %s lane metrics to %s ...", metricsFile.getMetrics().size(), outputFile));
    metricsFile.write(outputFile);
    return outputFile;
  }
Example #13
class BaiIndexer {
  private static Log log = Log.getInstance(BaiIndexer.class);

  public CountingInputStream is;
  public SAMFileHeader samFileHeader;
  public CRAMIndexer indexer;

  public BaiIndexer(InputStream is, SAMFileHeader samFileHeader, File output) {
    this.is = new CountingInputStream(is);
    this.samFileHeader = samFileHeader;

    indexer = new CRAMIndexer(output, samFileHeader);
  }

  public BaiIndexer(InputStream is, File output) throws IOException {
    this.is = new CountingInputStream(is);
    CramHeader cramHeader = CramIO.readCramHeader(this.is);
    samFileHeader = cramHeader.getSamFileHeader();

    indexer = new CRAMIndexer(output, samFileHeader);
  }

  private boolean nextContainer() throws IOException {
    long offset = is.getCount();
    Container c = CramIO.readContainer(is);
    if (c == null) return false;
    c.offset = offset;

    int i = 0;
    for (Slice slice : c.slices) {
      slice.containerOffset = offset;
      slice.index = i++;
      indexer.processAlignment(slice);
    }

    log.info("INDEXED: " + c.toString());
    return true;
  }

  private void index() throws IOException {
    while (nextContainer()) {
      // keep reading containers until end of stream
    }
  }

  public void run() throws IOException {
    index();
    indexer.finish();
  }
}
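A hedged usage sketch for this indexer, assuming a local CRAM file; both file names are placeholders:

  // The (InputStream, File) constructor reads the CRAM header from the stream itself.
  final InputStream in = new FileInputStream(new File("input.cram"));
  final BaiIndexer baiIndexer = new BaiIndexer(in, new File("input.cram.bai"));
  baiIndexer.run(); // indexes every container, then finishes the .bai file
  in.close();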
Example #14
  /**
   * Generates a BAM index file from an input BAM file.
   *
   * @param reader SAMFileReader for input BAM file
   * @param output File for output index file
   * @param log optional Log used to report progress; may be null
   */
  public static void createIndex(SAMFileReader reader, File output, Log log) {

    BAMIndexer indexer = new BAMIndexer(output, reader.getFileHeader());

    reader.enableFileSource(true);
    int totalRecords = 0;

    // create and write the content
    for (SAMRecord rec : reader) {
      if (++totalRecords % 1000000 == 0) {
        if (null != log) log.info(totalRecords + " reads processed ...");
      }
      indexer.processAlignment(rec);
    }
    indexer.finish();
  }
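A minimal sketch of calling this helper on a local BAM; the file names are placeholders, and the log argument may be null to suppress progress messages (per the null check above):

  final SAMFileReader reader = new SAMFileReader(new File("input.bam"));
  createIndex(reader, new File("input.bam.bai"), null); // null log: no progress output
  reader.close();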
Example #15
  private boolean nextContainer() throws IOException {
    long offset = is.getCount();
    Container c = CramIO.readContainer(is);
    if (c == null) return false;
    c.offset = offset;

    int i = 0;
    for (Slice slice : c.slices) {
      slice.containerOffset = offset;
      slice.index = i++;
      indexer.processAlignment(slice);
    }

    log.info("INDEXED: " + c.toString());
    return true;
  }
Example #16
  @Test(dataProvider = "composeAllPermutationsOfSamInputResource")
  public void exhaustInputResourcePermutation(final SamInputResource resource) throws IOException {
    final SamReader reader = SamReaderFactory.makeDefault().open(resource);
    LOG.info(String.format("Reading from %s ...", resource));
    final List<SAMRecord> slurped = Iterables.slurp(reader);
    final SAMFileHeader fileHeader = reader.getFileHeader();
    reader.hasIndex();
    reader.indexing().hasBrowseableIndex();
    reader.close();

    /* Ensure all tests have read the same records in the same order or, if this is the first test, set it as the template. */
    observedHeaders.add(fileHeader);
    observedRecordOrdering.add(slurped);
    Assert.assertEquals(observedHeaders.size(), 1, "read different headers than other testcases");
    Assert.assertEquals(
        observedRecordOrdering.size(), 1, "read different records than other testcases");
  }
Example #17
/** The class provides version-dependent rules and policies for CRAM data. */
public class CramVersionPolicies {
  private static final Log log = Log.getInstance(CramVersionPolicies.class);

  /**
   * Defines the behaviour when the EOF marker is not found. Depending on the CRAM version, the
   * condition is ignored, a warning is issued, or an exception is thrown.
   *
   * @param version CRAM version to assume
   */
  public static void eofNotFound(final Version version) {
    if (version.compatibleWith(CramVersions.CRAM_v3)) {
      log.error("Incomplete data: EOF marker not found.");
      throw new RuntimeException("EOF not found.");
    }
    if (version.compatibleWith(CramVersions.CRAM_v2_1))
      log.warn("EOF marker not found, possibly incomplete file/stream.");
  }
}
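A hedged sketch of a call site: a CRAM reader that reaches end-of-stream without having seen the EOF container would delegate to this policy. The cramHeader and eofMarkerSeen names are assumptions about the surrounding reader, and getVersion() is assumed to expose the version parsed from the header.

  final Version version = cramHeader.getVersion(); // assumed accessor on CramHeader
  if (!eofMarkerSeen) { // assumed flag set while scanning containers
    CramVersionPolicies.eofNotFound(version); // throws for CRAM v3, warns for v2.1
  }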
Example #18
  @Test
  public void openPath() throws IOException {
    final Path path = localBam.toPath();
    final List<SAMRecord> records;
    final SAMFileHeader fileHeader;
    try (final SamReader reader = SamReaderFactory.makeDefault().open(path)) {
      LOG.info(String.format("Reading from %s ...", path));
      records = Iterables.slurp(reader);
      fileHeader = reader.getFileHeader();
      reader.close();
    }

    try (final SamReader fileReader = SamReaderFactory.makeDefault().open(localBam)) {
      final List<SAMRecord> expectedRecords = Iterables.slurp(fileReader);
      final SAMFileHeader expectedFileHeader = fileReader.getFileHeader();
      Assert.assertEquals(records, expectedRecords);
      Assert.assertEquals(fileHeader, expectedFileHeader);
    }
  }
Example #19
/**
 * Command line program to print statistics from a BAM index (.bai) file. Statistics include the
 * count of aligned and unaligned reads for each reference sequence and a count of all records with
 * no start coordinate. Similar to the 'samtools idxstats' command.
 *
 * @author Martha Borkan
 */
@CommandLineProgramProperties(
    usage = BamIndexStats.USAGE_SUMMARY + BamIndexStats.USAGE_DETAILS,
    usageShort = BamIndexStats.USAGE_SUMMARY,
    programGroup = SamOrBam.class)
public class BamIndexStats extends CommandLineProgram {
  static final String USAGE_SUMMARY = "Generate index statistics from a BAM file";
  static final String USAGE_DETAILS =
      "This tool calculates statistics from a BAM index (.bai) file, emulating the behavior of the "
          + "\"samtools idxstats\" command. The statistics collected include counts of aligned and unaligned reads as well as all "
          + "records with no start coordinate. The input to the tool is the BAM file name but it must be accompanied by a corresponding "
          + "index file.<br />"
          + "<h4>Usage example:</h4>"
          + "<pre>"
          + "java -jar picard.jar BamIndexStats \\<br />"
          + "      I=input.bam \\<br />"
          + "      O=output"
          + "</pre>"
          + "<hr />";
  private static final Log log = Log.getInstance(BamIndexStats.class);

  @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "A BAM file to process.")
  public File INPUT;

  /** Stock main method for a command line program. */
  public static void main(final String[] argv) {
    System.exit(new BamIndexStats().instanceMain(argv));
  }

  /**
   * Main method for the program. Checks that the input file is present and readable, then iterates
   * through the index, printing metadata to stdout.
   */
  protected int doWork() {

    if (INPUT.getName().endsWith(BAMIndex.BAMIndexSuffix))
      log.warn("INPUT should be the BAM file name, not its index file");
    IOUtil.assertFileIsReadable(INPUT);
    BAMIndexMetaData.printIndexStats(INPUT);

    return 0;
  }
}
Example #20
  /**
   * Call this method to create a ClusterData iterator over the specified tiles.
   *
   * @param requestedTiles The tiles to iterate over, or null to use all available tiles.
   * @return An iterator for reading the Illumina basecall output for the lane specified in the
   *     constructor.
   */
  public IlluminaDataProvider makeDataProvider(List<Integer> requestedTiles) {
    if (requestedTiles == null) {
      requestedTiles = availableTiles;
    } else if (requestedTiles.isEmpty()) {
      throw new PicardException(
          "Zero length tile list supplied to makeDataProvider, you must specify at least 1 tile OR pass NULL to use all available tiles");
    }

    final Map<IlluminaParser, Set<IlluminaDataType>> parsersToDataType =
        new HashMap<IlluminaParser, Set<IlluminaDataType>>();
    for (final Map.Entry<SupportedIlluminaFormat, Set<IlluminaDataType>> fmToDt :
        formatToDataTypes.entrySet()) {
      parsersToDataType.put(makeParser(fmToDt.getKey(), requestedTiles), fmToDt.getValue());
    }

    log.debug(
        "The following parsers will be used by IlluminaDataProvider: "
            + StringUtil.join(", ", parsersToDataType.keySet()));

    return new IlluminaDataProvider(outputMapping, parsersToDataType, basecallDirectory, lane);
  }
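Putting the constructor (Example #6) and makeDataProvider together, a hedged end-to-end sketch; the directory, lane, read structure, and data-type constants are placeholders, and IlluminaDataProvider is assumed to iterate ClusterData as the javadoc above describes:

  final IlluminaDataProviderFactory factory =
      new IlluminaDataProviderFactory(
          new File("Data/Intensities/BaseCalls"), // placeholder basecall directory
          null, // barcodesDirectory: optional, falls back to the basecall directory
          1, // lane
          new ReadStructure("101T8B101T"), // placeholder read structure
          new BclQualityEvaluationStrategy(2), // assumed minimum-quality threshold
          IlluminaDataType.BaseCalls,
          IlluminaDataType.QualityScores);
  final IlluminaDataProvider provider = factory.makeDataProvider(null); // null => all available tiles
  while (provider.hasNext()) {
    final ClusterData cluster = provider.next();
    // ... consume cluster ...
  }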
Example #21
  @Override
  public SamReader open(URL url) {
    final File file = new File(TEST_DATA_DIR, url.getQuery());
    LOG.info("Opening custom reader for " + file);
    return SamReaderFactory.makeDefault().open(file);
  }
Example #22
/**
 * Command line program to generate a BAM index (.bai) file from a BAM (.bam) file
 *
 * @author Martha Borkan
 */
public class BuildBamIndex extends CommandLineProgram {

  private static final Log log = Log.getInstance(BuildBamIndex.class);

  @Usage public String USAGE = getStandardUsagePreamble() + "Generates a BAM index (.bai) file.";

  @Option(
      shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
      doc = "A BAM file or URL to process. Must be sorted in coordinate order.")
  public String INPUT;

  URL inputUrl = null; // INPUT as URL
  File inputFile = null; // INPUT as File, if it can't be interpreted as a valid URL

  @Option(
      shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
      doc =
          "The BAM index file. Defaults to x.bai if INPUT is x.bam, otherwise INPUT.bai.\n"
              + "If INPUT is a URL and OUTPUT is unspecified, defaults to a file in the current directory.",
      optional = true)
  public File OUTPUT;

  /** Stock main method for a command line program. */
  public static void main(final String[] argv) {
    System.exit(new BuildBamIndex().instanceMain(argv));
  }

  /**
   * Main method for the program. Checks that all input files are present and readable and that the
   * output file can be written to, then iterates through all the records generating a BAM index,
   * and finally writes the bai file.
   */
  protected int doWork() {

    try {
      inputUrl = new URL(INPUT);
    } catch (java.net.MalformedURLException e) {
      inputFile = new File(INPUT);
    }

    // set default output file - input-file.bai
    if (OUTPUT == null) {

      final String baseFileName;
      if (inputUrl != null) {
        final String path = inputUrl.getPath();
        final int lastSlash = path.lastIndexOf('/');
        baseFileName = path.substring(lastSlash + 1);
      } else {
        baseFileName = inputFile.getAbsolutePath();
      }

      if (baseFileName.endsWith(BamFileIoUtils.BAM_FILE_EXTENSION)) {

        final int index = baseFileName.lastIndexOf(".");
        OUTPUT = new File(baseFileName.substring(0, index) + BAMIndex.BAMIndexSuffix);

      } else {
        OUTPUT = new File(baseFileName + BAMIndex.BAMIndexSuffix);
      }
    }

    IOUtil.assertFileIsWritable(OUTPUT);
    final SAMFileReader bam;

    if (inputUrl != null) {
      // remote input
      bam = new SAMFileReader(inputUrl, null, false);
    } else {
      // input from a normal file
      IOUtil.assertFileIsReadable(inputFile);
      bam = new SAMFileReader(inputFile);
    }

    if (!bam.isBinary()) {
      throw new SAMException("Input file must be bam file, not sam file.");
    }

    if (!bam.getFileHeader().getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
      throw new SAMException("Input bam file must be sorted by coordinates");
    }

    BAMIndexer.createIndex(bam, OUTPUT);

    log.info("Successfully wrote bam index file " + OUTPUT);
    CloserUtil.close(bam);
    return 0;
  }
}
Example #23
public class SamReaderFactoryTest {
  private static final File TEST_DATA_DIR = new File("testdata/htsjdk/samtools");

  private static final Log LOG = Log.getInstance(SamReaderFactoryTest.class);

  @Test(dataProvider = "variousFormatReaderTestCases")
  public void variousFormatReaderTest(final String inputFile) throws IOException {
    final File input = new File(TEST_DATA_DIR, inputFile);
    final SamReader reader = SamReaderFactory.makeDefault().open(input);
    for (final SAMRecord ignored : reader) {}
    reader.close();
  }

  private int countRecordsInQueryInterval(final SamReader reader, final QueryInterval query) {
    final SAMRecordIterator iter = reader.queryOverlapping(new QueryInterval[] {query});
    int count = 0;
    while (iter.hasNext()) {
      iter.next();
      count++;
    }
    iter.close();
    return count;
  }

  // See https://github.com/samtools/htsjdk/issues/76
  @Test(dataProvider = "queryIntervalIssue76TestCases")
  public void queryIntervalIssue76(
      final String sequenceName, final int start, final int end, final int expectedCount)
      throws IOException {
    final File input = new File(TEST_DATA_DIR, "issue76.bam");
    final SamReader reader = SamReaderFactory.makeDefault().open(input);
    final QueryInterval interval =
        new QueryInterval(
            reader.getFileHeader().getSequence(sequenceName).getSequenceIndex(), start, end);
    Assert.assertEquals(countRecordsInQueryInterval(reader, interval), expectedCount);
    reader.close();
  }

  @DataProvider(name = "queryIntervalIssue76TestCases")
  public Object[][] queryIntervalIssue76TestCases() {
    return new Object[][] {
      {"1", 11966, 11966, 2},
      {"1", 11966, 11967, 2},
      {"1", 11967, 11967, 1}
    };
  }

  @DataProvider(name = "variousFormatReaderTestCases")
  public Object[][] variousFormatReaderTestCases() {
    return new Object[][] {
      {"block_compressed.sam.gz"}, {"uncompressed.sam"}, {"compressed.sam.gz"}, {"compressed.bam"},
    };
  }

  // Tests for the SAMRecordFactory usage
  class SAMRecordFactoryTester extends DefaultSAMRecordFactory {
    int samRecordsCreated;
    int bamRecordsCreated;

    public SAMRecord createSAMRecord(final SAMFileHeader header) {
      ++samRecordsCreated;
      return super.createSAMRecord(header);
    }

    public BAMRecord createBAMRecord(
        final SAMFileHeader header,
        final int referenceSequenceIndex,
        final int alignmentStart,
        final short readNameLength,
        final short mappingQuality,
        final int indexingBin,
        final int cigarLen,
        final int flags,
        final int readLen,
        final int mateReferenceSequenceIndex,
        final int mateAlignmentStart,
        final int insertSize,
        final byte[] variableLengthBlock) {
      ++bamRecordsCreated;
      return super.createBAMRecord(
          header,
          referenceSequenceIndex,
          alignmentStart,
          readNameLength,
          mappingQuality,
          indexingBin,
          cigarLen,
          flags,
          readLen,
          mateReferenceSequenceIndex,
          mateAlignmentStart,
          insertSize,
          variableLengthBlock);
    }
  }

  @Test(dataProvider = "variousFormatReaderTestCases")
  public void samRecordFactoryTest(final String inputFile) throws IOException {
    final File input = new File(TEST_DATA_DIR, inputFile);

    final SAMRecordFactoryTester recordFactory = new SAMRecordFactoryTester();
    final SamReaderFactory readerFactory =
        SamReaderFactory.makeDefault().samRecordFactory(recordFactory);
    final SamReader reader = readerFactory.open(input);

    int i = 0;
    for (final SAMRecord ignored : reader) {
      ++i;
    }
    reader.close();

    Assert.assertTrue(i > 0);
    if (inputFile.endsWith(".sam") || inputFile.endsWith(".sam.gz"))
      Assert.assertEquals(recordFactory.samRecordsCreated, i);
    else if (inputFile.endsWith(".bam")) Assert.assertEquals(recordFactory.bamRecordsCreated, i);
  }

  @Test(expectedExceptions = IllegalStateException.class)
  public void samRecordFactoryNullHeaderBAMTest() {
    final SAMRecordFactory recordFactory = new DefaultSAMRecordFactory();
    recordFactory.createBAMRecord(
        null, // null header
        0, 0, (short) 0, (short) 0, 0, 0, 0, 0, 0, 0, 0, null);
  }

  /**
   * Unit tests for asserting all permutations of data and index sources read the same records and
   * header.
   */
  final File localBam = new File("testdata/htsjdk/samtools/BAMFileIndexTest/index_test.bam");

  final File localBamIndex =
      new File("testdata/htsjdk/samtools/BAMFileIndexTest/index_test.bam.bai");

  final URL bamUrl, bamIndexUrl;

  {
    try {
      bamUrl = new URL("http://www.broadinstitute.org/~picard/testdata/index_test.bam");
      bamIndexUrl = new URL("http://www.broadinstitute.org/~picard/testdata/index_test.bam.bai");
    } catch (final MalformedURLException e) {
      throw new RuntimeException(e);
    }
  }

  @DataProvider
  public Object[][] composeAllPermutationsOfSamInputResource() {
    final List<SamInputResource> sources = new ArrayList<SamInputResource>();
    for (final InputResource.Type dataType : InputResource.Type.values()) {
      if (dataType.equals(InputResource.Type.SRA_ACCESSION)) continue;

      sources.add(new SamInputResource(composeInputResourceForType(dataType, false)));
      for (final InputResource.Type indexType : InputResource.Type.values()) {
        if (indexType.equals(InputResource.Type.SRA_ACCESSION)) continue;

        sources.add(
            new SamInputResource(
                composeInputResourceForType(dataType, false),
                composeInputResourceForType(indexType, true)));
      }
    }
    final Object[][] data = new Object[sources.size()][];
    for (final SamInputResource source : sources) {
      data[sources.indexOf(source)] = new Object[] {source};
    }

    return data;
  }

  private InputResource composeInputResourceForType(
      final InputResource.Type type, final boolean forIndex) {
    final File f = forIndex ? localBamIndex : localBam;
    final URL url = forIndex ? bamIndexUrl : bamUrl;
    switch (type) {
      case FILE:
        return new FileInputResource(f);
      case URL:
        return new UrlInputResource(url);
      case SEEKABLE_STREAM:
        return new SeekableStreamInputResource(new SeekableHTTPStream(url));
      case INPUT_STREAM:
        try {
          return new InputStreamInputResource(new FileInputStream(f));
        } catch (final FileNotFoundException e) {
          throw new RuntimeIOException(e);
        }
      default:
        throw new IllegalStateException();
    }
  }

  final Set<SAMFileHeader> observedHeaders = new HashSet<SAMFileHeader>();
  final Set<List<SAMRecord>> observedRecordOrdering = new HashSet<List<SAMRecord>>();

  @Test(dataProvider = "composeAllPermutationsOfSamInputResource")
  public void exhaustInputResourcePermutation(final SamInputResource resource) throws IOException {
    final SamReader reader = SamReaderFactory.makeDefault().open(resource);
    LOG.info(String.format("Reading from %s ...", resource));
    final List<SAMRecord> slurped = Iterables.slurp(reader);
    final SAMFileHeader fileHeader = reader.getFileHeader();
    reader.hasIndex();
    reader.indexing().hasBrowseableIndex();
    reader.close();

    /* Ensure all tests have read the same records in the same order or, if this is the first test, set it as the template. */
    observedHeaders.add(fileHeader);
    observedRecordOrdering.add(slurped);
    Assert.assertEquals(observedHeaders.size(), 1, "read different headers than other testcases");
    Assert.assertEquals(
        observedRecordOrdering.size(), 1, "read different records than other testcases");
  }

  final Set<List<SAMRecord>> observedRecordOrdering1 = new HashSet<List<SAMRecord>>();
  final Set<List<SAMRecord>> observedRecordOrdering3 = new HashSet<List<SAMRecord>>();
  final Set<List<SAMRecord>> observedRecordOrdering20 = new HashSet<List<SAMRecord>>();

  @Test(dataProvider = "composeAllPermutationsOfSamInputResource")
  public void queryInputResourcePermutation(final SamInputResource resource) throws IOException {
    final SamReader reader = SamReaderFactory.makeDefault().open(resource);
    LOG.info(String.format("Query from %s ...", resource));
    if (reader.hasIndex()) {
      final StopWatch stopWatch = new StopWatch();
      stopWatch.start();
      final SAMRecordIterator q1 = reader.query("chr1", 500000, 100000000, true);
      observedRecordOrdering1.add(Iterables.slurp(q1));
      q1.close();
      final SAMRecordIterator q20 = reader.query("chr20", 1, 1000000, true);
      observedRecordOrdering20.add(Iterables.slurp(q20));
      q20.close();
      final SAMRecordIterator q3 = reader.query("chr3", 1, 10000000, true);
      observedRecordOrdering3.add(Iterables.slurp(q3));
      q3.close();
      stopWatch.stop();
      LOG.info(String.format("Finished queries in %sms", stopWatch.getElapsedTime()));

      Assert.assertEquals(
          observedRecordOrdering1.size(), 1, "read different records for chromosome 1");
      Assert.assertEquals(
          observedRecordOrdering20.size(), 1, "read different records for chromosome 20");
      Assert.assertEquals(
          observedRecordOrdering3.size(), 1, "read different records for chromosome 3");
    } else if (resource.indexMaybe() != null) {
      LOG.warn("Resource has an index source, but is not indexed: " + resource);
    } else {
      LOG.info("Skipping query operation: no index.");
    }
    reader.close();
  }

  @Test
  public void customReaderFactoryTest() throws IOException {
    try {
      CustomReaderFactory.setInstance(
          new CustomReaderFactory(
              "https://www.googleapis.com/genomics/v1beta/reads/,"
                  + "htsjdk.samtools.SamReaderFactoryTest$TestReaderFactory"));
      final SamReader reader =
          SamReaderFactory.makeDefault()
              .open(
                  SamInputResource.of(
                      "https://www.googleapis.com/genomics/v1beta/reads/?uncompressed.sam"));
      int i = 0;
      for (@SuppressWarnings("unused") final SAMRecord ignored : reader) {
        ++i;
      }
      reader.close();

      Assert.assertTrue(i > 0);
    } finally {
      CustomReaderFactory.resetToDefaultInstance();
    }
  }

  public static class TestReaderFactory implements CustomReaderFactory.ICustomReaderFactory {
    @Override
    public SamReader open(URL url) {
      final File file = new File(TEST_DATA_DIR, url.getQuery());
      LOG.info("Opening customr reader for " + file.toString());
      return SamReaderFactory.makeDefault().open(file);
    }
  }

  @Test
  public void inputResourceFromStringTest() throws IOException {
    Assert.assertEquals(
        SamInputResource.of("http://test.url").data().type(), InputResource.Type.URL);
    Assert.assertEquals(
        SamInputResource.of("https://test.url").data().type(), InputResource.Type.URL);
    Assert.assertEquals(
        SamInputResource.of("ftp://test.url").data().type(), InputResource.Type.URL);
    Assert.assertEquals(SamInputResource.of("/a/b/c").data().type(), InputResource.Type.FILE);
  }
}
Example #24
/**
 * Loads gene annotations from a refFlat file into an OverlapDetector<Gene>. Discards annotations
 * that are not internally consistent, e.g. transcripts on different chromosomes or different
 * strands.
 */
public class RefFlatReader {
  private static final Log LOG = Log.getInstance(RefFlatReader.class);
  // These are in the order that columns appear in refFlat format.
  public enum RefFlatColumns {
    GENE_NAME,
    TRANSCRIPT_NAME,
    CHROMOSOME,
    STRAND,
    TX_START,
    TX_END,
    CDS_START,
    CDS_END,
    EXON_COUNT,
    EXON_STARTS,
    EXON_ENDS
  }

  private static final String[] RefFlatColumnLabels = new String[RefFlatColumns.values().length];

  static {
    for (int i = 0; i < RefFlatColumnLabels.length; ++i) {
      RefFlatColumnLabels[i] = RefFlatColumns.values()[i].name();
    }
  }

  private final File refFlatFile;
  private final SAMSequenceDictionary sequenceDictionary;

  RefFlatReader(final File refFlatFile, final SAMSequenceDictionary sequenceDictionary) {
    this.refFlatFile = refFlatFile;
    this.sequenceDictionary = sequenceDictionary;
  }

  static OverlapDetector<Gene> load(
      final File refFlatFile, final SAMSequenceDictionary sequenceDictionary) {
    return new RefFlatReader(refFlatFile, sequenceDictionary).load();
  }

  OverlapDetector<Gene> load() {
    final OverlapDetector<Gene> overlapDetector = new OverlapDetector<Gene>(0, 0);

    final int expectedColumns = RefFlatColumns.values().length;
    final TabbedTextFileWithHeaderParser parser =
        new TabbedTextFileWithHeaderParser(refFlatFile, RefFlatColumnLabels);
    final Map<String, List<TabbedTextFileWithHeaderParser.Row>> refFlatLinesByGene =
        new HashMap<String, List<TabbedTextFileWithHeaderParser.Row>>();

    for (final TabbedTextFileWithHeaderParser.Row row : parser) {
      final int lineNumber =
          parser.getCurrentLineNumber(); // getCurrentLineNumber returns the number of the next line
      if (row.getFields().length != expectedColumns) {
        throw new AnnotationException(
            "Wrong number of fields in refFlat file " + refFlatFile + " at line " + lineNumber);
      }
      final String geneName = row.getField(RefFlatColumns.GENE_NAME.name());
      final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
      final String transcriptDescription = geneName + ":" + transcriptName;
      final String chromosome = row.getField(RefFlatColumns.CHROMOSOME.name());
      if (!isSequenceRecognized(chromosome)) {
        LOG.debug(
            "Skipping " + transcriptDescription + " due to unrecognized sequence " + chromosome);
      } else {
        List<TabbedTextFileWithHeaderParser.Row> transcriptLines = refFlatLinesByGene.get(geneName);
        if (transcriptLines == null) {
          transcriptLines = new ArrayList<TabbedTextFileWithHeaderParser.Row>();
          refFlatLinesByGene.put(geneName, transcriptLines);
        }
        transcriptLines.add(row);
      }
    }

    int longestInterval = 0;
    int numIntervalsOver1MB = 0;

    for (final List<TabbedTextFileWithHeaderParser.Row> transcriptLines :
        refFlatLinesByGene.values()) {
      try {
        final Gene gene = makeGeneFromRefFlatLines(transcriptLines);
        overlapDetector.addLhs(gene, gene);
        if (gene.length() > longestInterval) longestInterval = gene.length();
        if (gene.length() > 1000000) ++numIntervalsOver1MB;
      } catch (AnnotationException e) {
        LOG.debug(e.getMessage() + " -- skipping");
      }
    }
    LOG.debug(
        "Longest gene: " + longestInterval + "; number of genes > 1MB: " + numIntervalsOver1MB);
    return overlapDetector;
  }

  private boolean isSequenceRecognized(final String sequence) {
    return (sequenceDictionary.getSequence(sequence) != null);
  }

  private Gene makeGeneFromRefFlatLines(
      final List<TabbedTextFileWithHeaderParser.Row> transcriptLines) {
    final String geneName = transcriptLines.get(0).getField(RefFlatColumns.GENE_NAME.name());
    final String strandStr = transcriptLines.get(0).getField(RefFlatColumns.STRAND.name());
    final boolean negative = strandStr.equals("-");
    final String chromosome = transcriptLines.get(0).getField(RefFlatColumns.CHROMOSOME.name());

    // Figure out the extent of the gene
    int start = Integer.MAX_VALUE;
    int end = Integer.MIN_VALUE;
    for (final TabbedTextFileWithHeaderParser.Row row : transcriptLines) {
      start = Math.min(start, row.getIntegerField(RefFlatColumns.TX_START.name()) + 1);
      end = Math.max(end, row.getIntegerField(RefFlatColumns.TX_END.name()));
    }

    final Gene gene = new Gene(chromosome, start, end, negative, geneName);

    for (final TabbedTextFileWithHeaderParser.Row row : transcriptLines) {
      if (!strandStr.equals(row.getField(RefFlatColumns.STRAND.name()))) {
        throw new AnnotationException("Strand disagreement in refFlat file for gene " + geneName);
      }
      if (!chromosome.equals(row.getField(RefFlatColumns.CHROMOSOME.name()))) {
        throw new AnnotationException(
            "Chromosome disagreement("
                + chromosome
                + " != "
                + row.getField(RefFlatColumns.CHROMOSOME.name())
                + ") in refFlat file for gene "
                + geneName);
      }

      // This also adds the transcript to the Gene
      makeTranscriptFromRefFlatLine(gene, row);
    }

    return gene;
  }

  /** Conversion from 0-based half-open to 1-based inclusive intervals is done here. */
  private Gene.Transcript makeTranscriptFromRefFlatLine(
      final Gene gene, final TabbedTextFileWithHeaderParser.Row row) {
    final String geneName = row.getField(RefFlatColumns.GENE_NAME.name());
    final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
    final String transcriptDescription = geneName + ":" + transcriptName;
    final int exonCount = Integer.parseInt(row.getField(RefFlatColumns.EXON_COUNT.name()));
    final String[] exonStarts = row.getField(RefFlatColumns.EXON_STARTS.name()).split(",");
    final String[] exonEnds = row.getField(RefFlatColumns.EXON_ENDS.name()).split(",");

    if (exonCount != exonStarts.length) {
      throw new AnnotationException(
          "Number of exon starts does not agree with number of exons for " + transcriptDescription);
    }
    if (exonCount != exonEnds.length) {
      throw new AnnotationException(
          "Number of exon ends does not agree with number of exons for " + transcriptDescription);
    }

    final int transcriptionStart = row.getIntegerField(RefFlatColumns.TX_START.name()) + 1;
    final int transcriptionEnd = row.getIntegerField(RefFlatColumns.TX_END.name());
    final int codingStart = row.getIntegerField(RefFlatColumns.CDS_START.name()) + 1;
    final int codingEnd = row.getIntegerField(RefFlatColumns.CDS_END.name());

    final Transcript tx =
        gene.addTranscript(
            transcriptName,
            transcriptionStart,
            transcriptionEnd,
            codingStart,
            codingEnd,
            exonCount);

    for (int i = 0; i < exonCount; ++i) {
      final Exon e = tx.addExon(Integer.parseInt(exonStarts[i]) + 1, Integer.parseInt(exonEnds[i]));

      if (e.start > e.end) {
        throw new AnnotationException("Exon has 0 or negative extent for " + transcriptDescription);
      }
      if (i > 0 && tx.exons[i - 1].end >= tx.exons[i].start) {
        throw new AnnotationException("Exons overlap for " + transcriptDescription);
      }
    }

    return tx;
  }
}
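A brief hedged sketch of the static loader and a query against the resulting OverlapDetector; the file names and interval are placeholders, and getOverlaps(Interval) is assumed to be available on this version of OverlapDetector:

  final SamReader reader = SamReaderFactory.makeDefault().open(new File("input.bam"));
  final SAMSequenceDictionary dict = reader.getFileHeader().getSequenceDictionary();
  final OverlapDetector<Gene> genes = RefFlatReader.load(new File("refFlat.txt"), dict);
  final Interval query = new Interval("chr1", 1000000, 1010000); // placeholder interval
  for (final Gene gene : genes.getOverlaps(query)) {
    // ... each gene overlapping the query interval ...
  }
  reader.close();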
Example #25
/**
 * Computes a number of metrics that are useful for evaluating coverage and performance of whole
 * genome sequencing experiments.
 *
 * @author tfennell
 */
@CommandLineProgramProperties(
    usage =
        "Computes a number of metrics that are useful for evaluating coverage and performance of "
            + "whole genome sequencing experiments.",
    usageShort = "Writes whole genome sequencing-related metrics for a SAM or BAM file",
    programGroup = Metrics.class)
public class CollectWgsMetrics extends CommandLineProgram {

  @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input SAM or BAM file.")
  public File INPUT;

  @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output metrics file.")
  public File OUTPUT;

  @Option(
      shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME,
      doc = "The reference sequence fasta aligned to.")
  public File REFERENCE_SEQUENCE;

  @Option(
      shortName = "MQ",
      doc = "Minimum mapping quality for a read to contribute coverage.",
      overridable = true)
  public int MINIMUM_MAPPING_QUALITY = 20;

  @Option(
      shortName = "Q",
      doc = "Minimum base quality for a base to contribute coverage.",
      overridable = true)
  public int MINIMUM_BASE_QUALITY = 20;

  @Option(
      shortName = "CAP",
      doc = "Treat bases with coverage exceeding this value as if they had coverage at this value.",
      overridable = true)
  public int COVERAGE_CAP = 250;

  @Option(doc = "For debugging purposes, stop after processing this many genomic bases.")
  public long STOP_AFTER = -1;

  @Option(doc = "Determines whether to include the base quality histogram in the metrics file.")
  public boolean INCLUDE_BQ_HISTOGRAM = false;

  @Option(doc = "If true, count unpaired reads, and paired reads with one end unmapped")
  public boolean COUNT_UNPAIRED = false;

  private final Log log = Log.getInstance(CollectWgsMetrics.class);

  /** Metrics for evaluating the performance of whole genome sequencing experiments. */
  public static class WgsMetrics extends MetricBase {
    /** The number of non-N bases in the genome reference over which coverage will be evaluated. */
    public long GENOME_TERRITORY;
    /** The mean coverage in bases of the genome territory, after all filters are applied. */
    public double MEAN_COVERAGE;
    /** The standard deviation of coverage of the genome after all filters are applied. */
    public double SD_COVERAGE;
    /** The median coverage in bases of the genome territory, after all filters are applied. */
    public double MEDIAN_COVERAGE;
    /** The median absolute deviation of coverage of the genome after all filters are applied. */
    public double MAD_COVERAGE;

    /**
     * The fraction of aligned bases that were filtered out because they were in reads with low
     * mapping quality (default is < 20).
     */
    public double PCT_EXC_MAPQ;
    /**
     * The fraction of aligned bases that were filtered out because they were in reads marked as
     * duplicates.
     */
    public double PCT_EXC_DUPE;
    /**
     * The fraction of aligned bases that were filtered out because they were in reads without a
     * mapped mate pair.
     */
    public double PCT_EXC_UNPAIRED;
    /**
     * The fraction of aligned bases that were filtered out because they were of low base quality
     * (default is < 20).
     */
    public double PCT_EXC_BASEQ;
    /**
     * The fraction of aligned bases that were filtered out because they were the second observation
     * from an insert with overlapping reads.
     */
    public double PCT_EXC_OVERLAP;
    /**
     * The fraction of aligned bases that were filtered out because they would have raised coverage
     * above the capped value (default cap = 250x).
     */
    public double PCT_EXC_CAPPED;
    /** The total fraction of aligned bases excluded due to all filters. */
    public double PCT_EXC_TOTAL;

    /**
     * The fraction of bases that attained at least 1X sequence coverage in post-filtering bases.
     */
    public double PCT_1X;
    /**
     * The fraction of bases that attained at least 5X sequence coverage in post-filtering bases.
     */
    public double PCT_5X;
    /**
     * The fraction of bases that attained at least 10X sequence coverage in post-filtering bases.
     */
    public double PCT_10X;
    /**
     * The fraction of bases that attained at least 15X sequence coverage in post-filtering bases.
     */
    public double PCT_15X;
    /**
     * The fraction of bases that attained at least 20X sequence coverage in post-filtering bases.
     */
    public double PCT_20X;
    /**
     * The fraction of bases that attained at least 25X sequence coverage in post-filtering bases.
     */
    public double PCT_25X;
    /**
     * The fraction of bases that attained at least 30X sequence coverage in post-filtering bases.
     */
    public double PCT_30X;
    /**
     * The fraction of bases that attained at least 40X sequence coverage in post-filtering bases.
     */
    public double PCT_40X;
    /**
     * The fraction of bases that attained at least 50X sequence coverage in post-filtering bases.
     */
    public double PCT_50X;
    /**
     * The fraction of bases that attained at least 60X sequence coverage in post-filtering bases.
     */
    public double PCT_60X;
    /**
     * The fraction of bases that attained at least 70X sequence coverage in post-filtering bases.
     */
    public double PCT_70X;
    /**
     * The fraction of bases that attained at least 80X sequence coverage in post-filtering bases.
     */
    public double PCT_80X;
    /**
     * The fraction of bases that attained at least 90X sequence coverage in post-filtering bases.
     */
    public double PCT_90X;
    /**
     * The fraction of bases that attained at least 100X sequence coverage in post-filtering bases.
     */
    public double PCT_100X;
  }

  public static void main(final String[] args) {
    new CollectWgsMetrics().instanceMainWithExit(args);
  }

  @Override
  protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);

    // Setup all the inputs
    final ProgressLogger progress = new ProgressLogger(log, 10000000, "Processed", "loci");
    final ReferenceSequenceFileWalker refWalker =
        new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final SamReader in =
        SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
    final SamLocusIterator iterator = getLocusIterator(in);

    final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
    final CountingFilter dupeFilter = new CountingDuplicateFilter();
    final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY);
    final CountingPairedFilter pairFilter = new CountingPairedFilter();
    filters.add(mapqFilter);
    filters.add(dupeFilter);
    if (!COUNT_UNPAIRED) {
      filters.add(pairFilter);
    }
    filters.add(
        new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count
                                         // reads twice
    iterator.setSamFilters(filters);
    iterator.setEmitUncoveredLoci(true);
    iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases
    iterator.setQualityScoreCutoff(0); // Handled separately because we want to count bases
    iterator.setIncludeNonPfReads(false);

    final int max = COVERAGE_CAP;
    final long[] HistogramArray = new long[max + 1];
    final long[] baseQHistogramArray = new long[Byte.MAX_VALUE];
    final boolean usingStopAfter = STOP_AFTER > 0;
    final long stopAfter = STOP_AFTER - 1;
    long counter = 0;

    long basesExcludedByBaseq = 0;
    long basesExcludedByOverlap = 0;
    long basesExcludedByCapping = 0;

    // Loop through all the loci
    while (iterator.hasNext()) {
      final SamLocusIterator.LocusInfo info = iterator.next();

      // Check that the reference is not N
      final ReferenceSequence ref = refWalker.get(info.getSequenceIndex());
      final byte base = ref.getBases()[info.getPosition() - 1];
      if (base == 'N') continue;

      // Figure out the coverage while not counting overlapping reads twice, and excluding various
      // things
      final HashSet<String> readNames = new HashSet<String>(info.getRecordAndPositions().size());
      int pileupSize = 0;
      for (final SamLocusIterator.RecordAndOffset recs : info.getRecordAndPositions()) {

        if (recs.getBaseQuality() < MINIMUM_BASE_QUALITY) {
          ++basesExcludedByBaseq;
          continue;
        }
        if (!readNames.add(recs.getRecord().getReadName())) {
          ++basesExcludedByOverlap;
          continue;
        }
        pileupSize++;
        if (pileupSize <= max) {
          baseQHistogramArray[recs.getRecord().getBaseQualities()[recs.getOffset()]]++;
        }
      }

      final int depth = Math.min(readNames.size(), max);
      if (depth < readNames.size()) basesExcludedByCapping += readNames.size() - max;
      HistogramArray[depth]++;

      // Record progress and perhaps stop
      progress.record(info.getSequenceName(), info.getPosition());
      if (usingStopAfter && ++counter > stopAfter) break;
    }

    // Construct the coverage histogram
    final Histogram<Integer> histo = new Histogram<Integer>("coverage", "count");
    for (int i = 0; i < HistogramArray.length; ++i) {
      histo.increment(i, HistogramArray[i]);
    }

    // Construct the base quality histogram
    final Histogram<Integer> baseQHisto = new Histogram<Integer>("value", "baseq_count");
    for (int i = 0; i < baseQHistogramArray.length; ++i) {
      baseQHisto.increment(i, baseQHistogramArray[i]);
    }

    final WgsMetrics metrics = generateWgsMetrics();
    metrics.GENOME_TERRITORY = (long) histo.getSumOfValues();
    metrics.MEAN_COVERAGE = histo.getMean();
    metrics.SD_COVERAGE = histo.getStandardDeviation();
    metrics.MEDIAN_COVERAGE = histo.getMedian();
    metrics.MAD_COVERAGE = histo.getMedianAbsoluteDeviation();

    final long basesExcludedByDupes = getBasesExcludedBy(dupeFilter);
    final long basesExcludedByMapq = getBasesExcludedBy(mapqFilter);
    final long basesExcludedByPairing = getBasesExcludedBy(pairFilter);
    final double total = histo.getSum();
    final double totalWithExcludes =
        total
            + basesExcludedByDupes
            + basesExcludedByMapq
            + basesExcludedByPairing
            + basesExcludedByBaseq
            + basesExcludedByOverlap
            + basesExcludedByCapping;
    metrics.PCT_EXC_DUPE = basesExcludedByDupes / totalWithExcludes;
    metrics.PCT_EXC_MAPQ = basesExcludedByMapq / totalWithExcludes;
    metrics.PCT_EXC_UNPAIRED = basesExcludedByPairing / totalWithExcludes;
    metrics.PCT_EXC_BASEQ = basesExcludedByBaseq / totalWithExcludes;
    metrics.PCT_EXC_OVERLAP = basesExcludedByOverlap / totalWithExcludes;
    metrics.PCT_EXC_CAPPED = basesExcludedByCapping / totalWithExcludes;
    metrics.PCT_EXC_TOTAL = (totalWithExcludes - total) / totalWithExcludes;

    metrics.PCT_1X =
        MathUtil.sum(HistogramArray, 1, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_5X =
        MathUtil.sum(HistogramArray, 5, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_10X =
        MathUtil.sum(HistogramArray, 10, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_15X =
        MathUtil.sum(HistogramArray, 15, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_20X =
        MathUtil.sum(HistogramArray, 20, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_25X =
        MathUtil.sum(HistogramArray, 25, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_30X =
        MathUtil.sum(HistogramArray, 30, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_40X =
        MathUtil.sum(HistogramArray, 40, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_50X =
        MathUtil.sum(HistogramArray, 50, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_60X =
        MathUtil.sum(HistogramArray, 60, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_70X =
        MathUtil.sum(HistogramArray, 70, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_80X =
        MathUtil.sum(HistogramArray, 80, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_90X =
        MathUtil.sum(HistogramArray, 90, HistogramArray.length) / (double) metrics.GENOME_TERRITORY;
    metrics.PCT_100X =
        MathUtil.sum(HistogramArray, 100, HistogramArray.length)
            / (double) metrics.GENOME_TERRITORY;

    final MetricsFile<WgsMetrics, Integer> out = getMetricsFile();
    out.addMetric(metrics);
    out.addHistogram(histo);
    if (INCLUDE_BQ_HISTOGRAM) {
      out.addHistogram(baseQHisto);
    }
    out.write(OUTPUT);

    return 0;
  }

  protected WgsMetrics generateWgsMetrics() {
    return new WgsMetrics();
  }

  protected long getBasesExcludedBy(final CountingFilter filter) {
    return filter.getFilteredBases();
  }

  protected SamLocusIterator getLocusIterator(final SamReader in) {
    return new SamLocusIterator(in);
  }
}
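
The PCT_nX assignments in doWork above are mechanical; a compact equivalent, shown only as a
sketch (the helper name pctAtLeast is illustrative), computes each fraction from the coverage
histogram and the genome territory:

  // Sketch: fraction of loci whose capped depth is at least minDepth.
  static double pctAtLeast(final long[] histogram, final int minDepth, final long territory) {
    return MathUtil.sum(histogram, minDepth, histogram.length) / (double) territory;
  }

  // e.g. metrics.PCT_10X = pctAtLeast(HistogramArray, 10, metrics.GENOME_TERRITORY);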
/**
 * IlluminaDataProviderFactory accepts options for parsing Illumina data files for a lane and
 * creates an IlluminaDataProvider, an iterator over the ClusterData for that lane, which utilizes
 * these options.
 *
 * <p>Note: Since IlluminaDataProviderFactory tends to be used in multithreaded environments
 * (e.g. makeDataProvider is called in a different thread per tile in IlluminaBasecallsToSam), it
 * has been made essentially immutable. makeDataProvider/getTiles are now idempotent (as far as
 * IlluminaDataProviderFactory is concerned; many file handles and other resources are opened when
 * makeDataProvider is called). In the future we may want dataTypes to be provided to the
 * makeDataProvider factory methods so that configuration is not done multiple times for the same
 * basecallDirectory in client code.
 *
 * @author [email protected]
 */
public class IlluminaDataProviderFactory {
  private static final Log log = Log.getInstance(IlluminaDataProviderFactory.class);

  /**
   * A map of data types to a list of file formats in the order in which we prefer those file
   * types (e.g. we would rather parse Bcls before QSeqs, Locs files before Clocs files, ...). We
   * prefer formats that are the fastest to parse/smallest in memory. NOTE: In the code below, if
   * Qseq is chosen to provide ANY data type then it is used for ALL of its data types (since
   * we'll have to parse the entire line for each Qseq anyway).
   */
  private static final Map<IlluminaDataType, List<SupportedIlluminaFormat>>
      DATA_TYPE_TO_PREFERRED_FORMATS =
          new HashMap<IlluminaDataType, List<SupportedIlluminaFormat>>();

  static {
    /*
     * For types found in Qseq, we prefer the non-Qseq file formats first. However, if we end up
     * using Qseqs then we use Qseqs for EVERY type they provide; see determineFormats.
     */
    DATA_TYPE_TO_PREFERRED_FORMATS.put(
        IlluminaDataType.BaseCalls,
        makeList(SupportedIlluminaFormat.MultiTileBcl, SupportedIlluminaFormat.Bcl));
    DATA_TYPE_TO_PREFERRED_FORMATS.put(
        IlluminaDataType.QualityScores,
        makeList(SupportedIlluminaFormat.MultiTileBcl, SupportedIlluminaFormat.Bcl));
    DATA_TYPE_TO_PREFERRED_FORMATS.put(
        IlluminaDataType.PF,
        makeList(SupportedIlluminaFormat.MultiTileFilter, SupportedIlluminaFormat.Filter));
    DATA_TYPE_TO_PREFERRED_FORMATS.put(
        IlluminaDataType.Position,
        makeList(
            SupportedIlluminaFormat.MultiTileLocs,
            SupportedIlluminaFormat.Locs,
            SupportedIlluminaFormat.Clocs,
            SupportedIlluminaFormat.Pos));

    DATA_TYPE_TO_PREFERRED_FORMATS.put(
        IlluminaDataType.Barcodes, makeList(SupportedIlluminaFormat.Barcode));
  }

  // The following properties must be specified by caller.
  /** basecallDirectory holds QSeqs or bcls. */
  private final File basecallDirectory;

  private final File barcodesDirectory;
  private final int lane;

  /** Whether or not to apply EAMSS filtering if parsing BCLs for the bases and quality scores. */
  private boolean applyEamssFiltering = true;

  /** A Map of file formats to the dataTypes they will provide for this run. */
  protected final Map<SupportedIlluminaFormat, Set<IlluminaDataType>> formatToDataTypes;

  /** Basecall Directory/lane parameterized util for finding IlluminaFiles */
  private final IlluminaFileUtil fileUtil;

  private final List<Integer> availableTiles;

  private final OutputMapping outputMapping;
  private final BclQualityEvaluationStrategy bclQualityEvaluationStrategy;

  /**
   * Create a factory with the specified options.
   *
   * @param basecallDirectory The baseCalls directory of a complete Illumina directory. Files are
   *     found by searching relative to this folder (some of them higher up in the directory tree).
   * @param lane Which lane to iterate over.
   * @param readStructure The read structure to which output clusters will conform. When not using
   *     QSeqs, EAMSS masking (see BclParser) is run on individual reads as found in the
   *     readStructure; if the readStructure specified does not match the readStructure implied by
   *     the sequencer's output then the quality scores output may differ from what would be found
   *     in a run's QSeq files.
   * @param dataTypesArg Which data types to read
   */
  public IlluminaDataProviderFactory(
      final File basecallDirectory,
      final int lane,
      final ReadStructure readStructure,
      final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
      final IlluminaDataType... dataTypesArg) {
    this(basecallDirectory, null, lane, readStructure, bclQualityEvaluationStrategy, dataTypesArg);
  }

  /**
   * Create a factory with the specified options.
   *
   * @param basecallDirectory The baseCalls directory of a complete Illumina directory. Files are
   *     found by searching relative to this folder (some of them higher up in the directory tree).
   * @param barcodesDirectory The barcodesDirectory with barcode files extracted by
   *     'ExtractIlluminaBarcodes' (optional; basecallDirectory is used if not specified)
   * @param lane Which lane to iterate over.
   * @param readStructure The read structure to which output clusters will conform. When not using
   *     QSeqs, EAMSS masking (see BclParser) is run on individual reads as found in the
   *     readStructure; if the readStructure specified does not match the readStructure implied by
   *     the sequencer's output then the quality scores output may differ from what would be found
   *     in a run's QSeq files.
   * @param dataTypesArg Which data types to read
   */
   */
  public IlluminaDataProviderFactory(
      final File basecallDirectory,
      File barcodesDirectory,
      final int lane,
      final ReadStructure readStructure,
      final BclQualityEvaluationStrategy bclQualityEvaluationStrategy,
      final IlluminaDataType... dataTypesArg) {
    this.basecallDirectory = basecallDirectory;
    this.barcodesDirectory = barcodesDirectory;
    this.bclQualityEvaluationStrategy = bclQualityEvaluationStrategy;

    this.lane = lane;
    /* The types of data that will be returned by any IlluminaDataProviders created by this factory.

    Note: In previous versions, data of types not specified might be returned if a data type was
    specified for data residing in QSeqs (since QSeqs span multiple data types). This is no longer
    the case; you MUST specify all data types that should be returned.*/
    final Set<IlluminaDataType> dataTypes =
        Collections.unmodifiableSet(new HashSet<IlluminaDataType>(Arrays.asList(dataTypesArg)));

    if (dataTypes.isEmpty()) {
      throw new PicardException(
          "No data types have been specified for basecall output "
              + basecallDirectory
              + ", lane "
              + lane);
    }

    this.fileUtil = new IlluminaFileUtil(basecallDirectory, barcodesDirectory, lane);

    // find which requested IlluminaDataTypes we have files for and select the most preferred
    // available file format for each type
    formatToDataTypes = determineFormats(dataTypes, fileUtil);

    // find if we have any IlluminaDataType with NO available file formats and, if any exist, throw
    // an exception
    final Set<IlluminaDataType> unmatchedDataTypes =
        findUnmatchedTypes(dataTypes, formatToDataTypes);
    if (!unmatchedDataTypes.isEmpty()) {
      throw new PicardException(
          "Could not find a format with available files for the following data types: "
              + StringUtil.join(", ", new ArrayList<IlluminaDataType>(unmatchedDataTypes)));
    }

    log.debug(
        "The following file formats will be used by IlluminaDataProvider: "
            + StringUtil.join(",", formatToDataTypes.keySet()));

    availableTiles =
        fileUtil.getActualTiles(new ArrayList<SupportedIlluminaFormat>(formatToDataTypes.keySet()));
    if (availableTiles.isEmpty()) {
      throw new PicardException(
          "No available tiles were found, make sure that "
              + basecallDirectory.getAbsolutePath()
              + " has a lane "
              + lane);
    }

    outputMapping = new OutputMapping(readStructure);
  }

  /**
   * Sometimes (in the case of skipped reads) the logical read structure of the output cluster data
   * is different from the input readStructure
   *
   * @return The ReadStructure describing the output cluster data
   */
  public ReadStructure getOutputReadStructure() {
    return outputMapping.getOutputReadStructure();
  }

  /**
   * Return the list of tiles available for this flowcell and lane. These are in ascending numerical
   * order.
   *
   * @return List of all tiles available for this flowcell and lane.
   */
  public List<Integer> getAvailableTiles() {
    return availableTiles;
  }

  /**
   * Sets whether or not EAMSS filtering will be applied if parsing BCL files for bases and quality
   * scores.
   */
  public void setApplyEamssFiltering(final boolean applyEamssFiltering) {
    this.applyEamssFiltering = applyEamssFiltering;
  }

  /**
   * Call this method to create a ClusterData iterator over all clusters for all tiles in ascending
   * numeric order.
   *
   * @return An iterator for reading the Illumina basecall output for the lane specified in the
   *     ctor.
   */
  public IlluminaDataProvider makeDataProvider() {
    return makeDataProvider(null);
  }

  /**
   * Call this method to create a ClusterData iterator over the specified tiles.
   *
   * @return An iterator for reading the Illumina basecall output for the lane specified in the
   *     constructor.
   */
  public IlluminaDataProvider makeDataProvider(List<Integer> requestedTiles) {
    if (requestedTiles == null) {
      requestedTiles = availableTiles;
    } else if (requestedTiles.isEmpty()) {
      throw new PicardException(
          "Zero length tile list supplied to makeDataProvider; you must specify at least 1 tile OR pass null to use all available tiles");
    }

    final Map<IlluminaParser, Set<IlluminaDataType>> parsersToDataType =
        new HashMap<IlluminaParser, Set<IlluminaDataType>>();
    for (final Map.Entry<SupportedIlluminaFormat, Set<IlluminaDataType>> fmToDt :
        formatToDataTypes.entrySet()) {
      parsersToDataType.put(makeParser(fmToDt.getKey(), requestedTiles), fmToDt.getValue());
    }

    log.debug(
        "The following parsers will be used by IlluminaDataProvider: "
            + StringUtil.join(",", parsersToDataType.keySet()));

    return new IlluminaDataProvider(outputMapping, parsersToDataType, basecallDirectory, lane);
  }

  /**
   * Given a set of formats to data types they provide, find any requested data types that do not
   * have a format associated with them and return them
   *
   * @param requestedDataTypes Data types that need to be provided
   * @param formatToMatchedTypes A map of file formats to data types that will support them
   * @return The data types that go unsupported by the formats found in formatToMatchedTypes
   */
  public static Set<IlluminaDataType> findUnmatchedTypes(
      final Set<IlluminaDataType> requestedDataTypes,
      final Map<SupportedIlluminaFormat, Set<IlluminaDataType>> formatToMatchedTypes) {
    final Set<IlluminaDataType> copiedTypes = new HashSet<IlluminaDataType>(requestedDataTypes);
    for (final Set<IlluminaDataType> matchedTypes : formatToMatchedTypes.values()) {
      copiedTypes.removeAll(matchedTypes);
    }

    return copiedTypes;
  }
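
  // Worked example (added for illustration): if BaseCalls and Position are requested but files
  // are only available for the Bcl format (which provides BaseCalls and QualityScores), then
  // Position comes back unmatched and the constructor above throws a PicardException.
  private static Set<IlluminaDataType> findUnmatchedTypesExample() {
    final Map<SupportedIlluminaFormat, Set<IlluminaDataType>> matched =
        new HashMap<SupportedIlluminaFormat, Set<IlluminaDataType>>();
    matched.put(
        SupportedIlluminaFormat.Bcl,
        new HashSet<IlluminaDataType>(
            Arrays.asList(IlluminaDataType.BaseCalls, IlluminaDataType.QualityScores)));
    final Set<IlluminaDataType> requested =
        new HashSet<IlluminaDataType>(
            Arrays.asList(IlluminaDataType.BaseCalls, IlluminaDataType.Position));
    return findUnmatchedTypes(requested, matched); // yields {Position}
  }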

  /**
   * For all requestedDataTypes return a map of file format to set of provided data types that
   * covers as many requestedDataTypes as possible and chooses the most preferred available formats
   * possible
   *
   * @param requestedDataTypes Data types to be provided
   * @param fileUtil A file util for the lane/directory we wish to provide data for
   * @return A Map<Supported file format, Set of data types file format provides>
   */
  public static Map<SupportedIlluminaFormat, Set<IlluminaDataType>> determineFormats(
      final Set<IlluminaDataType> requestedDataTypes, final IlluminaFileUtil fileUtil) {
    // For predictable ordering and uniqueness only, put the requestedDataTypes into a treeSet
    final SortedSet<IlluminaDataType> toSupport = new TreeSet<IlluminaDataType>(requestedDataTypes);
    final Map<SupportedIlluminaFormat, Set<IlluminaDataType>> fileTypeToDataTypes =
        new HashMap<SupportedIlluminaFormat, Set<IlluminaDataType>>();
    final Map<IlluminaDataType, SupportedIlluminaFormat> dataTypeToFormat =
        new HashMap<IlluminaDataType, SupportedIlluminaFormat>();

    for (final IlluminaDataType ts : toSupport) {
      final SupportedIlluminaFormat preferredFormat = findPreferredAvailableFormat(ts, fileUtil);
      if (preferredFormat != null) {
        dataTypeToFormat.put(ts, preferredFormat);
      }
    }

    for (final IlluminaDataType dt : toSupport) {
      final SupportedIlluminaFormat format = dataTypeToFormat.get(dt);

      if (format != null) {
        if (fileTypeToDataTypes.containsKey(format)) {
          fileTypeToDataTypes.get(format).add(dt);
        } else {
          fileTypeToDataTypes.put(format, makeSet(dt));
        }
      }
    }

    return fileTypeToDataTypes;
  }

  /**
   * Given a data type find the most preferred file format that also has files available
   *
   * @param dt Type of desired data
   * @param fileUtil Util for the lane/directory in which we will find data
   * @return The file format that is "most preferred" (i.e. fastest to parse/smallest in memory)
   */
  private static SupportedIlluminaFormat findPreferredAvailableFormat(
      final IlluminaDataType dt, final IlluminaFileUtil fileUtil) {
    return findPreferredFormat(dt, fileUtil, true);
  }

  /**
   * Given a data type find the most preferred file format even if files are not available
   *
   * @param dt Type of desired data
   * @param fileUtil Util for the lane/directory in which we will find data
   * @return The file format that is "most preferred" (i.e. fastest to parse/smallest in memory)
   */
  public static SupportedIlluminaFormat findPreferredFormat(
      final IlluminaDataType dt, final IlluminaFileUtil fileUtil) {
    return findPreferredFormat(dt, fileUtil, false);
  }

  private static SupportedIlluminaFormat findPreferredFormat(
      final IlluminaDataType dt, final IlluminaFileUtil fileUtil, final boolean checkAvailable) {
    final List<SupportedIlluminaFormat> preferredFormats = DATA_TYPE_TO_PREFERRED_FORMATS.get(dt);
    SupportedIlluminaFormat format = null;
    for (int i = 0; i < preferredFormats.size() && format == null; i++) {
      if (checkAvailable && fileUtil.getUtil(preferredFormats.get(i)).filesAvailable()) {
        format = preferredFormats.get(i);
      } else if (!checkAvailable) {
        format = preferredFormats.get(i);
      }
    }

    return format;
  }

  /**
   * There are multiple parsers for the same IlluminaDataType (e.g. BclParser and QSeqParser).
   * Instantiate the preferred parser for the given file format with the information available and
   * return it.
   *
   * @param format The file format to parse
   * @param requestedTiles The requestedTiles over which we will be parsing data
   * @return A parser that will parse dataType data over the given requestedTiles and cycles and
   *     output it in groupings of the sizes specified in outputLengths
   */
  private IlluminaParser makeParser(
      final SupportedIlluminaFormat format, final List<Integer> requestedTiles) {
    final IlluminaParser parser;
    switch (format) {
      case Barcode:
        parser =
            new BarcodeParser(
                ((PerTileFileUtil) fileUtil.getUtil(SupportedIlluminaFormat.Barcode))
                    .getFiles(requestedTiles));
        break;

      case Bcl:
        {
          final CycleIlluminaFileMap bclFileMap =
              ((PerTilePerCycleFileUtil) fileUtil.getUtil(SupportedIlluminaFormat.Bcl))
                  .getFiles(requestedTiles, outputMapping.getOutputCycles());
          bclFileMap.assertValid(requestedTiles, outputMapping.getOutputCycles());
          parser =
              new BclParser(
                  basecallDirectory,
                  lane,
                  bclFileMap,
                  outputMapping,
                  this.applyEamssFiltering,
                  bclQualityEvaluationStrategy);
          break;
        }

      case Filter:
        final IlluminaFileMap filterFileMap =
            ((PerTileFileUtil) fileUtil.getUtil(SupportedIlluminaFormat.Filter))
                .getFiles(requestedTiles);
        parser = new FilterParser(filterFileMap);
        break;

      case Locs:
      case Clocs:
      case Pos:
        final PerTileFileUtil fu = (PerTileFileUtil) fileUtil.getUtil(format);
        parser = new PosParser(fu.getFiles(requestedTiles), format);
        break;

      case MultiTileFilter:
        parser =
            ((MultiTileFilterFileUtil) fileUtil.getUtil(SupportedIlluminaFormat.MultiTileFilter))
                .makeParser(requestedTiles);
        break;

      case MultiTileLocs:
        parser =
            ((MultiTileLocsFileUtil) fileUtil.getUtil(SupportedIlluminaFormat.MultiTileLocs))
                .makeParser(requestedTiles);
        break;

      case MultiTileBcl:
        {
          final MultiTileBclFileUtil util =
              (MultiTileBclFileUtil) fileUtil.getUtil(SupportedIlluminaFormat.MultiTileBcl);
          final CycleIlluminaFileMap bclFileMap =
              util.getFiles(requestedTiles, outputMapping.getOutputCycles());
          bclFileMap.assertValid(requestedTiles, outputMapping.getOutputCycles());
          parser =
              new MultiTileBclParser(
                  basecallDirectory,
                  lane,
                  bclFileMap,
                  outputMapping,
                  this.applyEamssFiltering,
                  bclQualityEvaluationStrategy,
                  util.tileIndex);
          break;
        }

      default:
        throw new PicardException(
            "Unrecognized file format (" + format + ") found by IlluminaDataProviderFactory!");
    }

    return parser;
  }
}
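
A minimal usage sketch of the factory. The run directory, lane, and read structure are
illustrative; iterating ClusterData via hasNext/next is assumed from the class javadoc above, and
BclQualityEvaluationStrategy's minimum-quality constructor and ILLUMINA_ALLEGED_MINIMUM_QUALITY
constant are assumptions from Picard's API:

  final IlluminaDataProviderFactory factory =
      new IlluminaDataProviderFactory(
          new File("/runs/RUN1/Data/Intensities/BaseCalls"), // hypothetical basecalls directory
          1, // lane
          new ReadStructure("101T8B101T"), // hypothetical read structure
          new BclQualityEvaluationStrategy(
              BclQualityEvaluationStrategy.ILLUMINA_ALLEGED_MINIMUM_QUALITY),
          IlluminaDataType.BaseCalls,
          IlluminaDataType.QualityScores,
          IlluminaDataType.PF,
          IlluminaDataType.Position);
  final IlluminaDataProvider provider = factory.makeDataProvider();
  while (provider.hasNext()) {
    final ClusterData cluster = provider.next(); // one cluster at a time, tiles in ascending order
  }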
Example #27
0
public class VcfVcf extends AbstractVCFFilter {
  private static final Log LOG = Log.getInstance(VcfVcf.class);

  @Usage(programVersion = "1.0")
  public String USAGE =
      getStandardUsagePreamble() + "Get the INFO from a VCF and use it for another VCF. ";

  @Option(shortName = "TBX", doc = "The VCF file indexed with TABIX. Source of the annotations")
  public String TABIX;

  @Option(shortName = "INFO", doc = "The INFO keys to grab.", minElements = 0)
  public Set<String> INFO_IDS = new LinkedHashSet<String>();

  @Option(shortName = "RIF", doc = "Replace the INFO field if it exists.", minElements = 0)
  public boolean REPLACE_INFO_FIELD = true;

  @Option(shortName = "RID", doc = "Replace the ID field if it exists.", optional = true)
  public boolean REPLACE_ID = true;

  @Option(shortName = "RAM", doc = "REF allele matters.", optional = true)
  public boolean REF_ALLELE_MATTERS = true;

  @Option(shortName = "AAM", doc = "ALT alleles matters.", optional = true)
  public boolean ALT_ALLELES_MATTERS = false;

  @Option(shortName = "ACF", doc = "Flag to set if alternate alleles conflict.", optional = true)
  public String ALT_CONFLICT_FLAG = null;

  @Override
  protected void doWork(VcfIterator r, VariantContextWriter w) throws IOException {
    AbstractVCFCodec codeIn3 = VCFUtils.createDefaultVCFCodec();
    String line;

    StringWriter sw = new StringWriter();
    LOG.info("opening tabix file: " + this.TABIX);
    TabixReader tabix = new TabixReader(this.TABIX);

    while ((line = tabix.readLine()) != null) {
      if (!line.startsWith(VCFHeader.HEADER_INDICATOR)) {
        break;
      }
      sw.append(line).append("\n");
    }
    VCFHeader header3 =
        (VCFHeader)
            codeIn3.readActualHeader(
                new LineIteratorImpl(
                    LineReaderUtil.fromBufferedStream(
                        new ByteArrayInputStream(sw.toString().getBytes()))));
    VCFHeader header1 = r.getHeader();

    VCFHeader h2 =
        new VCFHeader(header1.getMetaDataInInputOrder(), header1.getSampleNamesInOrder());
    for (String infoId : this.INFO_IDS) {
      VCFInfoHeaderLine vihl = header3.getInfoHeaderLine(infoId);
      if (vihl == null) {
        LOG.warn("Not INFO=" + infoId + " in " + TABIX);
        continue;
      }
      if (h2.getInfoHeaderLine(infoId) != null) {
        LOG.warn("Input already contains INFO=" + vihl);
      }
      h2.addMetaDataLine(vihl);
    }

    if (ALT_CONFLICT_FLAG != null) {
      h2.addMetaDataLine(
          new VCFInfoHeaderLine(
              ALT_CONFLICT_FLAG,
              1,
              VCFHeaderLineType.Flag,
              "conflict ALT allele with " + this.TABIX));
    }

    w.writeHeader(h2);
    while (r.hasNext()) {
      VariantContext ctx1 = r.next();

      VariantContextBuilder vcb = new VariantContextBuilder(ctx1);
      String line2;
      String BEST_ID = null;
      boolean best_id_match_alt = false;

      List<VariantContext> variantsList = new ArrayList<VariantContext>();

      int[] array = tabix.parseReg(ctx1.getChr() + ":" + (ctx1.getStart()) + "-" + (ctx1.getEnd()));
      TabixReader.Iterator iter = null;

      if (array != null && array.length == 3 && array[0] != -1 && array[1] >= 0 && array[2] >= 0) {
        iter = tabix.query(array[0], array[1], array[2]);
      } else {
        LOG.info("Cannot get " + ctx1.getChr() + ":" + (ctx1.getStart()) + "-" + (ctx1.getEnd()));
      }

      while (iter != null && (line2 = iter.next()) != null) {
        VariantContext ctx3 = codeIn3.decode(line2);
        if (ctx3.getStart() != ctx1.getStart()) continue;
        if (ctx3.getEnd() != ctx1.getEnd()) continue;

        if (ctx1.getReference().equals(ctx3.getReference())
            && ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) {
          variantsList.clear();
          variantsList.add(ctx3);
          break;
        } else {
          variantsList.add(ctx3);
        }
      }

      for (VariantContext ctx3 : variantsList) {

        if (this.REF_ALLELE_MATTERS && !ctx1.getReference().equals(ctx3.getReference())) {
          continue;
        }
        if (this.ALT_ALLELES_MATTERS
            && !ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) {
          continue;
        }

        if (ctx3.getID() != null && this.REPLACE_ID) {
          if (BEST_ID != null && best_id_match_alt) {
            // nothing
          } else {
            BEST_ID = ctx3.getID();
            best_id_match_alt = ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles());
          }
        }

        for (String id : this.INFO_IDS) {
          Object info3 = ctx3.getAttribute(id);
          if (info3 == null) {
            continue;
          }
          Object info1 = ctx1.getAttribute(id);
          if (info1 != null && !this.REPLACE_INFO_FIELD) {
            continue;
          }

          vcb.attribute(id, info3);
        }

        if (ALT_CONFLICT_FLAG != null
            && !ctx1.getAlternateAlleles().equals(ctx3.getAlternateAlleles())) {
          vcb.attribute(ALT_CONFLICT_FLAG, true);
        }
      }
      if (BEST_ID != null) {
        vcb.id(BEST_ID);
      }
      w.add(vcb.make());
    }
    tabix.close();
  }

  public static void main(String[] args) throws IOException {
    new VcfVcf().instanceMainWithExit(args);
  }
}
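
A programmatic invocation sketch for VcfVcf. The file names are illustrative; the NAME=value
argument syntax follows the Picard-style @Option annotations above, instanceMain is assumed to be
available alongside the instanceMainWithExit used in main, and input/output plumbing is handled by
AbstractVCFFilter:

  public static void annotateSketch() {
    new VcfVcf()
        .instanceMain(
            new String[] {
              "TBX=annotations.vcf.gz", // tabix-indexed VCF supplying the annotations
              "INFO=AF", // INFO keys to copy; repeat for each key
              "INFO=DP"
            });
  }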
  /**
   * Utility for collating Tile records from the Illumina TileMetrics file into lane-level and
   * phasing-level metrics.
   */
  public static class IlluminaLaneMetricsCollector {

    private static final Log LOG = Log.getInstance(IlluminaLaneMetricsCollector.class);

    /**
     * Returns a partitioned collection of lane number to Tile objects from the provided basecall
     * directory.
     */
    public static Map<Integer, ? extends Collection<Tile>> readLaneTiles(
        final File illuminaRunDirectory, final ReadStructure readStructure) {
      final Collection<Tile> tiles;
      try {
        tiles =
            TileMetricsUtil.parseTileMetrics(
                TileMetricsUtil.renderTileMetricsFileFromBasecallingDirectory(illuminaRunDirectory),
                readStructure);
      } catch (final FileNotFoundException e) {
        throw new PicardException("Unable to open laneMetrics file.", e);
      }

      return tiles.stream().collect(Collectors.groupingBy(Tile::getLaneNumber));
    }

    /**
     * Parses the tile data from the basecall directory and writes to both the lane and phasing
     * metrics files
     */
    public static void collectLaneMetrics(
        final File runDirectory,
        final File outputDirectory,
        final String outputPrefix,
        final MetricsFile<MetricBase, Comparable<?>> laneMetricsFile,
        final MetricsFile<MetricBase, Comparable<?>> phasingMetricsFile,
        final ReadStructure readStructure) {
      final Map<Integer, ? extends Collection<Tile>> laneTiles =
          readLaneTiles(runDirectory, readStructure);
      writeLaneMetrics(laneTiles, outputDirectory, outputPrefix, laneMetricsFile);
      writePhasingMetrics(laneTiles, outputDirectory, outputPrefix, phasingMetricsFile);
    }

    public static File writePhasingMetrics(
        final Map<Integer, ? extends Collection<Tile>> laneTiles,
        final File outputDirectory,
        final String outputPrefix,
        final MetricsFile<MetricBase, Comparable<?>> phasingMetricsFile) {
      laneTiles
          .entrySet()
          .stream()
          .forEach(
              entry ->
                  IlluminaPhasingMetrics.getPhasingMetricsForTiles(
                          entry.getKey().longValue(), entry.getValue())
                      .forEach(phasingMetricsFile::addMetric));

      return writeMetrics(
          phasingMetricsFile, outputDirectory, outputPrefix, IlluminaPhasingMetrics.getExtension());
    }

    public static File writeLaneMetrics(
        final Map<Integer, ? extends Collection<Tile>> laneTiles,
        final File outputDirectory,
        final String outputPrefix,
        final MetricsFile<MetricBase, Comparable<?>> laneMetricsFile) {
      laneTiles
          .entrySet()
          .stream()
          .forEach(
              entry -> {
                final IlluminaLaneMetrics laneMetric = new IlluminaLaneMetrics();
                laneMetric.LANE = entry.getKey().longValue();
                laneMetric.CLUSTER_DENSITY = calculateLaneDensityFromTiles(entry.getValue());
                laneMetricsFile.addMetric(laneMetric);
              });

      return writeMetrics(
          laneMetricsFile, outputDirectory, outputPrefix, IlluminaLaneMetrics.getExtension());
    }

    private static File writeMetrics(
        final MetricsFile<MetricBase, Comparable<?>> metricsFile,
        final File outputDirectory,
        final String outputPrefix,
        final String outputExtension) {
      final File outputFile =
          new File(outputDirectory, String.format("%s.%s", outputPrefix, outputExtension));
      LOG.info(
          String.format(
              "Writing %d metrics to %s ...", metricsFile.getMetrics().size(), outputFile));
      metricsFile.write(outputFile);
      return outputFile;
    }

    private static double calculateLaneDensityFromTiles(final Collection<Tile> tiles) {
      double area = 0;
      double clusters = 0;
      for (final Tile tile : tiles) {
        area += (tile.getClusterCount() / tile.getClusterDensity());
        clusters += tile.getClusterCount();
      }
      return clusters / area;
    }
  }
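
For reference, calculateLaneDensityFromTiles computes a cluster-weighted lane density: each tile
contributes area = clusterCount / clusterDensity, and the lane density is totalClusters /
totalArea. A usage sketch for the collector (paths, prefix, and read structure are illustrative):

  final MetricsFile<MetricBase, Comparable<?>> laneMetricsFile = new MetricsFile<>();
  final MetricsFile<MetricBase, Comparable<?>> phasingMetricsFile = new MetricsFile<>();
  IlluminaLaneMetricsCollector.collectLaneMetrics(
      new File("/runs/RUN1"), // hypothetical run directory
      new File("/tmp/metrics"), // output directory
      "RUN1", // output prefix
      laneMetricsFile,
      phasingMetricsFile,
      new ReadStructure("101T8B101T")); // hypothetical read structure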
Example #30
0
public class SamReaderFactoryTest {
  private static final File TEST_DATA_DIR = new File("src/test/resources/htsjdk/samtools");

  private static final Log LOG = Log.getInstance(SamReaderFactoryTest.class);

  @Test(dataProvider = "variousFormatReaderTestCases")
  public void variousFormatReaderTest(final String inputFile) throws IOException {
    final File input = new File(TEST_DATA_DIR, inputFile);
    final SamReader reader = SamReaderFactory.makeDefault().open(input);
    for (final SAMRecord ignored : reader) {}
    reader.close();
  }

  private int countRecordsInQueryInterval(final SamReader reader, final QueryInterval query) {
    final SAMRecordIterator iter = reader.queryOverlapping(new QueryInterval[] {query});
    int count = 0;
    while (iter.hasNext()) {
      iter.next();
      count++;
    }
    iter.close();
    return count;
  }

  private int countRecords(final SamReader reader) {
    int count = 0;
    try (final SAMRecordIterator iter = reader.iterator()) {
      while (iter.hasNext()) {
        iter.next();
        count++;
      }
    }
    return count;
  }

  private static SeekableByteChannel addHeader(SeekableByteChannel input) {
    try {
      int total = (int) input.size();
      final String comment =
          "@HD\tVN:1.0  SO:unsorted\n"
              + "@SQ\tSN:chr1\tLN:101\n"
              + "@SQ\tSN:chr2\tLN:101\n"
              + "@SQ\tSN:chr3\tLN:101\n"
              + "@RG\tID:0\tSM:JP was here\n";

      byte[] commentBuf = comment.getBytes();
      ByteBuffer buf = ByteBuffer.allocate(total + commentBuf.length);
      buf.put(commentBuf);
      input.position(0);
      while (input.read(buf) > 0) {
        // read until EOF
      }
      buf.flip();
      return new SeekableByteChannelFromBuffer(buf);
    } catch (IOException x) {
      throw new RuntimeException(x);
    }
  }

  @Test
  public void testWrap() throws IOException {
    final Path input = Paths.get(TEST_DATA_DIR.getPath(), "noheader.sam");
    final SamReader wrappedReader =
        SamReaderFactory.makeDefault().setPathWrapper(SamReaderFactoryTest::addHeader).open(input);
    int records = countRecords(wrappedReader);
    Assert.assertEquals(records, 10);
  }

  // See https://github.com/samtools/htsjdk/issues/76
  @Test(dataProvider = "queryIntervalIssue76TestCases")
  public void queryIntervalIssue76(
      final String sequenceName, final int start, final int end, final int expectedCount)
      throws IOException {
    final File input = new File(TEST_DATA_DIR, "issue76.bam");
    final SamReader reader = SamReaderFactory.makeDefault().open(input);
    final QueryInterval interval =
        new QueryInterval(
            reader.getFileHeader().getSequence(sequenceName).getSequenceIndex(), start, end);
    Assert.assertEquals(countRecordsInQueryInterval(reader, interval), expectedCount);
    reader.close();
  }

  @DataProvider(name = "queryIntervalIssue76TestCases")
  public Object[][] queryIntervalIssue76TestCases() {
    return new Object[][] {
      {"1", 11966, 11966, 2},
      {"1", 11966, 11967, 2},
      {"1", 11967, 11967, 1}
    };
  }

  @DataProvider(name = "variousFormatReaderTestCases")
  public Object[][] variousFormatReaderTestCases() {
    return new Object[][] {
      {"block_compressed.sam.gz"},
      {"uncompressed.sam"},
      {"compressed.sam.gz"},
      {"compressed.bam"},
      {"unsorted.sam"}
    };
  }

  // Tests for the SAMRecordFactory usage
  class SAMRecordFactoryTester extends DefaultSAMRecordFactory {
    int samRecordsCreated;
    int bamRecordsCreated;

    public SAMRecord createSAMRecord(final SAMFileHeader header) {
      ++samRecordsCreated;
      return super.createSAMRecord(header);
    }

    public BAMRecord createBAMRecord(
        final SAMFileHeader header,
        final int referenceSequenceIndex,
        final int alignmentStart,
        final short readNameLength,
        final short mappingQuality,
        final int indexingBin,
        final int cigarLen,
        final int flags,
        final int readLen,
        final int mateReferenceSequenceIndex,
        final int mateAlignmentStart,
        final int insertSize,
        final byte[] variableLengthBlock) {
      ++bamRecordsCreated;
      return super.createBAMRecord(
          header,
          referenceSequenceIndex,
          alignmentStart,
          readNameLength,
          mappingQuality,
          indexingBin,
          cigarLen,
          flags,
          readLen,
          mateReferenceSequenceIndex,
          mateAlignmentStart,
          insertSize,
          variableLengthBlock);
    }
  }

  @Test(dataProvider = "variousFormatReaderTestCases")
  public void samRecordFactoryTest(final String inputFile) throws IOException {
    final File input = new File(TEST_DATA_DIR, inputFile);

    final SAMRecordFactoryTester recordFactory = new SAMRecordFactoryTester();
    final SamReaderFactory readerFactory =
        SamReaderFactory.makeDefault().samRecordFactory(recordFactory);
    final SamReader reader = readerFactory.open(input);

    int i = 0;
    for (final SAMRecord ignored : reader) {
      ++i;
    }
    reader.close();

    Assert.assertTrue(i > 0);
    if (inputFile.endsWith(".sam") || inputFile.endsWith(".sam.gz"))
      Assert.assertEquals(recordFactory.samRecordsCreated, i);
    else if (inputFile.endsWith(".bam")) Assert.assertEquals(recordFactory.bamRecordsCreated, i);
  }

  @Test(expectedExceptions = IllegalStateException.class)
  public void samRecordFactoryNullHeaderBAMTest() {
    final SAMRecordFactory recordFactory = new DefaultSAMRecordFactory();
    recordFactory.createBAMRecord(
        null, // null header
        0, 0, (short) 0, (short) 0, 0, 0, 0, 0, 0, 0, 0, null);
  }

  /**
   * Unit tests for asserting all permutations of data and index sources read the same records and
   * header.
   */
  final File localBam =
      new File("src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam");

  final File localBamIndex =
      new File("src/test/resources/htsjdk/samtools/BAMFileIndexTest/index_test.bam.bai");

  final URL bamUrl, bamIndexUrl;

  {
    try {
      bamUrl = new URL(TestUtil.BASE_URL_FOR_HTTP_TESTS + "index_test.bam");
      bamIndexUrl = new URL(TestUtil.BASE_URL_FOR_HTTP_TESTS + "index_test.bam.bai");
    } catch (final MalformedURLException e) {
      throw new RuntimeException(e);
    }
  }

  @DataProvider
  public Object[][] composeAllPermutationsOfSamInputResource() {
    final List<SamInputResource> sources = new ArrayList<SamInputResource>();
    for (final InputResource.Type dataType : InputResource.Type.values()) {
      if (dataType.equals(InputResource.Type.SRA_ACCESSION)) continue;

      sources.add(new SamInputResource(composeInputResourceForType(dataType, false)));
      for (final InputResource.Type indexType : InputResource.Type.values()) {
        if (indexType.equals(InputResource.Type.SRA_ACCESSION)) continue;

        sources.add(
            new SamInputResource(
                composeInputResourceForType(dataType, false),
                composeInputResourceForType(indexType, true)));
      }
    }
    final Object[][] data = new Object[sources.size()][];
    for (int i = 0; i < sources.size(); i++) {
      data[i] = new Object[] {sources.get(i)};
    }

    return data;
  }

  private InputResource composeInputResourceForType(
      final InputResource.Type type, final boolean forIndex) {
    final File f = forIndex ? localBamIndex : localBam;
    final URL url = forIndex ? bamIndexUrl : bamUrl;
    switch (type) {
      case FILE:
        return new FileInputResource(f);
      case PATH:
        return new PathInputResource(f.toPath(), Function.identity());
      case URL:
        return new UrlInputResource(url);
      case SEEKABLE_STREAM:
        return new SeekableStreamInputResource(new SeekableHTTPStream(url));
      case INPUT_STREAM:
        try {
          return new InputStreamInputResource(new FileInputStream(f));
        } catch (final FileNotFoundException e) {
          throw new RuntimeIOException(e);
        }
      default:
        throw new IllegalStateException();
    }
  }

  final Set<SAMFileHeader> observedHeaders = new HashSet<SAMFileHeader>();
  final Set<List<SAMRecord>> observedRecordOrdering = new HashSet<List<SAMRecord>>();

  @Test(dataProvider = "composeAllPermutationsOfSamInputResource")
  public void exhaustInputResourcePermutation(final SamInputResource resource) throws IOException {
    final SamReader reader = SamReaderFactory.makeDefault().open(resource);
    LOG.info(String.format("Reading from %s ...", resource));
    final List<SAMRecord> slurped = Iterables.slurp(reader);
    final SAMFileHeader fileHeader = reader.getFileHeader();
    reader.hasIndex();
    reader.indexing().hasBrowseableIndex();
    reader.close();

    /* Ensure all tests have read the same records in the same order or, if this is the first test, set it as the template. */
    observedHeaders.add(fileHeader);
    observedRecordOrdering.add(slurped);
    Assert.assertEquals(observedHeaders.size(), 1, "read different headers than other testcases");
    Assert.assertEquals(
        observedRecordOrdering.size(), 1, "read different records than other testcases");
  }

  @Test
  public void openPath() throws IOException {
    final Path path = localBam.toPath();
    final List<SAMRecord> records;
    final SAMFileHeader fileHeader;
    try (final SamReader reader = SamReaderFactory.makeDefault().open(path)) {
      LOG.info(String.format("Reading from %s ...", path));
      records = Iterables.slurp(reader);
      fileHeader = reader.getFileHeader();
    }

    try (final SamReader fileReader = SamReaderFactory.makeDefault().open(localBam)) {
      final List<SAMRecord> expectedRecords = Iterables.slurp(fileReader);
      final SAMFileHeader expectedFileHeader = fileReader.getFileHeader();
      Assert.assertEquals(records, expectedRecords);
      Assert.assertEquals(fileHeader, expectedFileHeader);
    }
  }

  final Set<List<SAMRecord>> observedRecordOrdering1 = new HashSet<List<SAMRecord>>();
  final Set<List<SAMRecord>> observedRecordOrdering3 = new HashSet<List<SAMRecord>>();
  final Set<List<SAMRecord>> observedRecordOrdering20 = new HashSet<List<SAMRecord>>();

  @Test(dataProvider = "composeAllPermutationsOfSamInputResource")
  public void queryInputResourcePermutation(final SamInputResource resource) throws IOException {
    final SamReader reader = SamReaderFactory.makeDefault().open(resource);
    LOG.info(String.format("Query from %s ...", resource));
    if (reader.hasIndex()) {
      final StopWatch stopWatch = new StopWatch();
      stopWatch.start();
      final SAMRecordIterator q1 = reader.query("chr1", 500000, 100000000, true);
      observedRecordOrdering1.add(Iterables.slurp(q1));
      q1.close();
      final SAMRecordIterator q20 = reader.query("chr20", 1, 1000000, true);
      observedRecordOrdering20.add(Iterables.slurp(q20));
      q20.close();
      final SAMRecordIterator q3 = reader.query("chr3", 1, 10000000, true);
      observedRecordOrdering3.add(Iterables.slurp(q3));
      q3.close();
      stopWatch.stop();
      LOG.info(String.format("Finished queries in %sms", stopWatch.getElapsedTime()));

      Assert.assertEquals(
          observedRecordOrdering1.size(), 1, "read different records for chromosome 1");
      Assert.assertEquals(
          observedRecordOrdering20.size(), 1, "read different records for chromosome 20");
      Assert.assertEquals(
          observedRecordOrdering3.size(), 1, "read different records for chromosome 3");
    } else if (resource.indexMaybe() != null) {
      LOG.warn("Resource has an index source, but is not indexed: " + resource);
    } else {
      LOG.info("Skipping query operation: no index.");
    }
    reader.close();
  }

  /**
   * A path that pretends it's not based upon a file. This helps in cases where we want to test
   * branches that apply to non-file based paths without actually having to use non-file based
   * resources (like cloud urls)
   */
  private static class NeverFilePathInputResource extends PathInputResource {
    public NeverFilePathInputResource(Path pathResource) {
      super(pathResource);
    }

    @Override
    public File asFile() {
      return null;
    }
  }

  @Test
  public void checkHasIndexForStreamingPathBamWithFileIndex() throws IOException {
    InputResource bam = new NeverFilePathInputResource(localBam.toPath());
    InputResource index = new FileInputResource(localBamIndex);

    // ensure that the index is being used, not checked in queryInputResourcePermutation
    try (final SamReader reader =
        SamReaderFactory.makeDefault().open(new SamInputResource(bam, index))) {
      Assert.assertTrue(reader.hasIndex());
    }
  }

  @Test
  public void queryStreamingPathBamWithFileIndex() throws IOException {
    InputResource bam = new NeverFilePathInputResource(localBam.toPath());
    InputResource index = new FileInputResource(localBamIndex);

    final SamInputResource resource = new SamInputResource(bam, index);
    queryInputResourcePermutation(resource);
  }

  @Test
  public void customReaderFactoryTest() throws IOException {
    try {
      CustomReaderFactory.setInstance(
          new CustomReaderFactory(
              "https://www.googleapis.com/genomics/v1beta/reads/,"
                  + "htsjdk.samtools.SamReaderFactoryTest$TestReaderFactory"));
      final SamReader reader =
          SamReaderFactory.makeDefault()
              .open(
                  SamInputResource.of(
                      "https://www.googleapis.com/genomics/v1beta/reads/?uncompressed.sam"));
      int i = 0;
      for (@SuppressWarnings("unused") final SAMRecord ignored : reader) {
        ++i;
      }
      reader.close();

      Assert.assertTrue(i > 0);
    } finally {
      CustomReaderFactory.resetToDefaultInstance();
    }
  }

  public static class TestReaderFactory implements CustomReaderFactory.ICustomReaderFactory {
    @Override
    public SamReader open(URL url) {
      final File file = new File(TEST_DATA_DIR, url.getQuery());
      LOG.info("Opening customr reader for " + file.toString());
      return SamReaderFactory.makeDefault().open(file);
    }
  }

  @Test
  public void inputResourceFromStringTest() throws IOException {
    Assert.assertEquals(
        SamInputResource.of("http://test.url").data().type(), InputResource.Type.URL);
    Assert.assertEquals(
        SamInputResource.of("https://test.url").data().type(), InputResource.Type.URL);
    Assert.assertEquals(
        SamInputResource.of("ftp://test.url").data().type(), InputResource.Type.URL);
    Assert.assertEquals(SamInputResource.of("/a/b/c").data().type(), InputResource.Type.FILE);
  }

  @Test
  public void testCRAMReaderFromURL() throws IOException {
    // get a CRAM reader with an index from a URL-backed resource
    getCRAMReaderFromInputResource(
        (cramURL, indexURL) -> {
          return SamInputResource.of(cramURL).index(indexURL);
        },
        true,
        3);
  }

  @Test
  public void testCRAMReaderFromURLStream() throws IOException {
    // get a CRAM reader with an index from a stream-backed resource created from a URL
    getCRAMReaderFromInputResource(
        (cramURL, indexURL) -> {
          try {
            ISeekableStreamFactory streamFactory = SeekableStreamFactory.getInstance();
            return SamInputResource.of(streamFactory.getStreamFor(cramURL))
                .index(streamFactory.getStreamFor(indexURL));
          } catch (IOException e) {
            throw new RuntimeIOException(e);
          }
        },
        true,
        3);
  }

  @Test
  public void testCRAMReaderFromURLNoIndexFile() throws IOException {
    // get just a CRAM reader (no index) from an URL-backed resource
    getCRAMReaderFromInputResource(
        (cramURL, indexURL) -> {
          return SamInputResource.of(cramURL);
        },
        false,
        11);
  }

  @Test(expectedExceptions = RuntimeIOException.class)
  public void testCRAMReaderFromURLBadIndexFile() throws IOException {
    // deliberately specify a bad index file to ensure we get an IOException
    getCRAMReaderFromInputResource(
        (cramURL, indexURL) -> {
          return SamInputResource.of(cramURL).index(new File("nonexistent.bai"));
        },
        true,
        3);
  }

  private void getCRAMReaderFromInputResource(
      final BiFunction<URL, URL, SamInputResource> getInputResource,
      final boolean hasIndex,
      final int expectedCount)
      throws IOException {
    final String cramFilePath =
        new File(TEST_DATA_DIR, "cram_with_bai_index.cram").getAbsolutePath();
    final String cramIndexPath =
        new File(TEST_DATA_DIR, "cram_with_bai_index.cram.bai").getAbsolutePath();
    final URL cramURL = new URL("file://" + cramFilePath);
    final URL indexURL = new URL("file://" + cramIndexPath);

    final SamReaderFactory factory =
        SamReaderFactory.makeDefault()
            .referenceSource(new ReferenceSource(new File(TEST_DATA_DIR, "hg19mini.fasta")))
            .validationStringency(ValidationStringency.SILENT);
    try (final SamReader reader = factory.open(getInputResource.apply(cramURL, indexURL))) {
      final int count =
          hasIndex
              ? countRecordsInQueryInterval(reader, new QueryInterval(1, 10, 1000))
              : countRecords(reader);
      Assert.assertEquals(count, expectedCount);
    }
  }

  @Test
  public void testSamReaderFromSeekableStream() throws IOException {
    // even though a SAM isn't indexable, make sure we can open one
    // using a seekable stream
    final File samFile = new File(TEST_DATA_DIR, "unsorted.sam");
    final SamReaderFactory factory =
        SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT);
    try (final SamReader reader =
        factory.open(SamInputResource.of(new SeekableFileStream(samFile)))) {
      Assert.assertEquals(countRecords(reader), 10);
    }
  }

  @Test
  public void testSamReaderFromURL() throws IOException {
    final String samFilePath = new File(TEST_DATA_DIR, "unsorted.sam").getAbsolutePath();
    final URL samURL = new URL("file://" + samFilePath);
    final SamReaderFactory factory =
        SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT);
    try (final SamReader reader = factory.open(SamInputResource.of(samURL))) {
      Assert.assertEquals(countRecords(reader), 10);
    }
  }

  @Test(expectedExceptions = SAMFormatException.class)
  public void testSamReaderFromMalformedSeekableStream() throws IOException {
    // use a bogus file (a .bai index) to force SamReaderFactory to fall through to the
    // fallback code that assumes a SAM file when it can't determine the
    // format of the input, and ensure that this results in a SAMFormatException
    final File samFile = new File(TEST_DATA_DIR, "cram_with_bai_index.cram.bai");
    final SamReaderFactory factory =
        SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT);
    final SamReader reader = factory.open(SamInputResource.of(new SeekableFileStream(samFile)));
    countRecords(reader);
  }
}